/*	$NetBSD: sys_futex.c,v 1.18 2022/04/21 12:05:13 riastradh Exp $	*/

/*-
 * Copyright (c) 2018, 2019, 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Taylor R. Campbell and Jason R. Thorpe.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_futex.c,v 1.18 2022/04/21 12:05:13 riastradh Exp $");

/*
 * Futexes
 *
 *	The futex system call coordinates notifying threads waiting for
 *	changes on a 32-bit word of memory.  The word can be managed by
 *	CPU atomic operations in userland, without system calls, as long
 *	as there is no contention.
 *
 *	The simplest use case demonstrating the utility is:
 *
 *		// 32-bit word of memory shared among threads or
 *		// processes in userland.  lock & 1 means owned;
 *		// lock & 2 means there are waiters waiting.
 *		volatile int lock = 0;
 *
 *		int v;
 *
 *		// Acquire a lock.
 *		do {
 *			v = lock;
 *			if (v & 1) {
 *				// Lock is held.  Set a bit to say that
 *				// there are waiters, and wait for lock
 *				// to change to anything other than v;
 *				// then retry.
 *				if (atomic_cas_uint(&lock, v, v | 2) != v)
 *					continue;
 *				futex(FUTEX_WAIT, &lock, v | 2, NULL, NULL, 0);
 *				continue;
 *			}
 *		} while (atomic_cas_uint(&lock, v, v | 1) != v);
 *		membar_acquire();
 *
 *		...
 *
 *		// Release the lock.  Optimistically assume there are
 *		// no waiters first until demonstrated otherwise.
 *		membar_release();
 *		if (atomic_cas_uint(&lock, 1, 0) != 1) {
 *			// There may be waiters.
 *			v = atomic_swap_uint(&lock, 0);
 *			// If there are still waiters, wake one.
 *			if (v & 2)
 *				futex(FUTEX_WAKE, &lock, 1, NULL, NULL, 0);
 *		}
 *
 *	The goal is to avoid the futex system call unless there is
 *	contention; then if there is contention, to guarantee no missed
 *	wakeups.
 *
 *	For a simple implementation, futex(FUTEX_WAIT) could queue
 *	itself to be woken, double-check the lock word, and then sleep;
 *	spurious wakeups are generally a fact of life, so any
 *	FUTEX_WAKE could just wake every FUTEX_WAIT in the system.
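 *
 *	Roughly, that naive scheme would be (illustration only; the
 *	helpers named here are hypothetical, not the code below):
 *
 *		// FUTEX_WAIT(uaddr, val)
 *		enqueue(curlwp, &all_waiters);
 *		if (fetch(uaddr) != val) {	// double-check the word
 *			dequeue(curlwp, &all_waiters);
 *			return EAGAIN;
 *		}
 *		sleep_until_woken();
 *
 *		// FUTEX_WAKE(uaddr, n)
 *		wake_all(&all_waiters);	// spurious wakeups are tolerated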
 *
 *	If this were all there is to it, we could then increase
 *	parallelism by refining the approximation: partition the
 *	waiters into buckets by hashing the lock addresses to reduce
 *	the incidence of spurious wakeups.  But this is not all.
 *
 *	The futex(FUTEX_CMP_REQUEUE, &lock, n, &lock2, m, val)
 *	operation not only wakes n waiters on lock if lock == val, but
 *	also _transfers_ m additional waiters to lock2.  Unless wakeups
 *	on lock2 also trigger wakeups on lock, we cannot move waiters
 *	to lock2 if they merely share the same hash as waiters on lock.
 *	Thus, we can't approximately distribute waiters into queues by
 *	a hash function; we must distinguish futex queues exactly by
 *	lock address.
 *
 *	For now, we use a global red/black tree to index futexes.  This
 *	should be replaced by a lockless radix tree with a thread to
 *	free entries no longer in use once all lookups on all CPUs have
 *	completed.
 *
 *	Specifically, we maintain two maps:
 *
 *	futex_tab.va[vmspace, va] for private futexes
 *	futex_tab.oa[uvm_voaddr] for shared futexes
 *
 *	This implementation does not support priority inheritance.
 */

#include <sys/param.h>
#include <sys/types.h>
#include <sys/atomic.h>
#include <sys/condvar.h>
#include <sys/futex.h>
#include <sys/kmem.h>
#include <sys/mutex.h>
#include <sys/rbtree.h>
#include <sys/queue.h>

#include <sys/syscall.h>
#include <sys/syscallargs.h>
#include <sys/syscallvar.h>

#include <uvm/uvm_extern.h>

/*
 * Lock order:
 *
 *	futex_tab.lock
 *	futex::fx_qlock			ordered by kva of struct futex
 *	-> futex_wait::fw_lock		only one at a time
 *	futex_wait::fw_lock		only one at a time
 *	-> futex::fx_abortlock		only one at a time
 */

/*
 * union futex_key
 *
 *	A futex is addressed either by a vmspace+va (private) or by
 *	a uvm_voaddr (shared).
 */
union futex_key {
	struct {
		struct vmspace		*vmspace;
		vaddr_t			va;
	}			fk_private;
	struct uvm_voaddr	fk_shared;
};

/*
 * struct futex
 *
 *	Kernel state for a futex located at a particular address in a
 *	particular virtual address space.
 *
 *	N.B. fx_refcnt is an unsigned long because we need to be able
 *	to operate on it atomically on all systems while at the same
 *	time rendering practically impossible the chance of it reaching
 *	its max value.  In practice, we're limited by the number of LWPs
 *	that can be present on the system at any given time, and the
 *	assumption is that limit will be good enough on a 32-bit platform.
 *	See futex_wake() for why overflow needs to be avoided.
 */
struct futex {
	union futex_key		fx_key;
	unsigned long		fx_refcnt;
	bool			fx_shared;
	bool			fx_on_tree;
	struct rb_node		fx_node;

	kmutex_t			fx_qlock;
	TAILQ_HEAD(, futex_wait)	fx_queue;

	kmutex_t			fx_abortlock;
	LIST_HEAD(, futex_wait)		fx_abortlist;
	kcondvar_t			fx_abortcv;
};

/*
 * struct futex_wait
 *
 *	State for a thread to wait on a futex.  Threads wait on fw_cv
 *	for fw_bitset to be set to zero.  The thread may transition to
 *	a different futex queue at any time under the futex's lock.
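 *
 *	A condensed sketch of the normal life cycle, as driven by
 *	futex_func_wait() below (locking shown, error handling omitted):
 *
 *		struct futex_wait fw;
 *
 *		futex_wait_init(&fw, bitset);
 *		futex_queue_lock(f);
 *		// re-check the userland word under the queue lock
 *		mutex_enter(&fw.fw_lock);
 *		futex_wait_enqueue(&fw, f);
 *		mutex_exit(&fw.fw_lock);
 *		futex_queue_unlock(f);
 *		error = futex_wait(&fw, deadline, clkid);
 *		futex_wait_fini(&fw);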
191 */ 192 struct futex_wait { 193 kmutex_t fw_lock; 194 kcondvar_t fw_cv; 195 struct futex *fw_futex; 196 TAILQ_ENTRY(futex_wait) fw_entry; /* queue lock */ 197 LIST_ENTRY(futex_wait) fw_abort; /* queue abortlock */ 198 int fw_bitset; 199 bool fw_aborting; /* fw_lock */ 200 }; 201 202 /* 203 * futex_tab 204 * 205 * Global trees of futexes by vmspace/va and VM object address. 206 * 207 * XXX This obviously doesn't scale in parallel. We could use a 208 * pserialize-safe data structure, but there may be a high cost to 209 * frequent deletion since we don't cache futexes after we're done 210 * with them. We could use hashed locks. But for now, just make 211 * sure userland can't DoS the serial performance, by using a 212 * balanced binary tree for lookup. 213 * 214 * XXX We could use a per-process tree for the table indexed by 215 * virtual address to reduce contention between processes. 216 */ 217 static struct { 218 kmutex_t lock; 219 struct rb_tree va; 220 struct rb_tree oa; 221 } futex_tab __cacheline_aligned; 222 223 static int 224 compare_futex_key(void *cookie, const void *n, const void *k) 225 { 226 const struct futex *fa = n; 227 const union futex_key *fka = &fa->fx_key; 228 const union futex_key *fkb = k; 229 230 if ((uintptr_t)fka->fk_private.vmspace < 231 (uintptr_t)fkb->fk_private.vmspace) 232 return -1; 233 if ((uintptr_t)fka->fk_private.vmspace > 234 (uintptr_t)fkb->fk_private.vmspace) 235 return +1; 236 if (fka->fk_private.va < fkb->fk_private.va) 237 return -1; 238 if (fka->fk_private.va > fkb->fk_private.va) 239 return +1; 240 return 0; 241 } 242 243 static int 244 compare_futex(void *cookie, const void *na, const void *nb) 245 { 246 const struct futex *fa = na; 247 const struct futex *fb = nb; 248 249 return compare_futex_key(cookie, fa, &fb->fx_key); 250 } 251 252 static const rb_tree_ops_t futex_rb_ops = { 253 .rbto_compare_nodes = compare_futex, 254 .rbto_compare_key = compare_futex_key, 255 .rbto_node_offset = offsetof(struct futex, fx_node), 256 }; 257 258 static int 259 compare_futex_shared_key(void *cookie, const void *n, const void *k) 260 { 261 const struct futex *fa = n; 262 const union futex_key *fka = &fa->fx_key; 263 const union futex_key *fkb = k; 264 265 return uvm_voaddr_compare(&fka->fk_shared, &fkb->fk_shared); 266 } 267 268 static int 269 compare_futex_shared(void *cookie, const void *na, const void *nb) 270 { 271 const struct futex *fa = na; 272 const struct futex *fb = nb; 273 274 return compare_futex_shared_key(cookie, fa, &fb->fx_key); 275 } 276 277 static const rb_tree_ops_t futex_shared_rb_ops = { 278 .rbto_compare_nodes = compare_futex_shared, 279 .rbto_compare_key = compare_futex_shared_key, 280 .rbto_node_offset = offsetof(struct futex, fx_node), 281 }; 282 283 static void futex_wait_dequeue(struct futex_wait *, struct futex *); 284 285 /* 286 * futex_load(uaddr, kaddr) 287 * 288 * Perform a single atomic load to read *uaddr, and return the 289 * result in *kaddr. Return 0 on success, EFAULT if uaddr is not 290 * mapped. 291 */ 292 static inline int 293 futex_load(int *uaddr, int *kaddr) 294 { 295 return ufetch_int((u_int *)uaddr, (u_int *)kaddr); 296 } 297 298 /* 299 * futex_test(uaddr, expected) 300 * 301 * True if *uaddr == expected. False if *uaddr != expected, or if 302 * uaddr is not mapped. 
303 */ 304 static bool 305 futex_test(int *uaddr, int expected) 306 { 307 int val; 308 int error; 309 310 error = futex_load(uaddr, &val); 311 if (error) 312 return false; 313 return val == expected; 314 } 315 316 /* 317 * futex_sys_init() 318 * 319 * Initialize the futex subsystem. 320 */ 321 void 322 futex_sys_init(void) 323 { 324 325 mutex_init(&futex_tab.lock, MUTEX_DEFAULT, IPL_NONE); 326 rb_tree_init(&futex_tab.va, &futex_rb_ops); 327 rb_tree_init(&futex_tab.oa, &futex_shared_rb_ops); 328 } 329 330 /* 331 * futex_sys_fini() 332 * 333 * Finalize the futex subsystem. 334 */ 335 void 336 futex_sys_fini(void) 337 { 338 339 KASSERT(RB_TREE_MIN(&futex_tab.oa) == NULL); 340 KASSERT(RB_TREE_MIN(&futex_tab.va) == NULL); 341 mutex_destroy(&futex_tab.lock); 342 } 343 344 /* 345 * futex_queue_init(f) 346 * 347 * Initialize the futex queue. Caller must call futex_queue_fini 348 * when done. 349 * 350 * Never sleeps. 351 */ 352 static void 353 futex_queue_init(struct futex *f) 354 { 355 356 mutex_init(&f->fx_qlock, MUTEX_DEFAULT, IPL_NONE); 357 mutex_init(&f->fx_abortlock, MUTEX_DEFAULT, IPL_NONE); 358 cv_init(&f->fx_abortcv, "fqabort"); 359 LIST_INIT(&f->fx_abortlist); 360 TAILQ_INIT(&f->fx_queue); 361 } 362 363 /* 364 * futex_queue_drain(f) 365 * 366 * Wait for any aborting waiters in f; then empty the queue of 367 * any stragglers and wake them. Caller must guarantee no new 368 * references to f. 369 * 370 * May sleep. 371 */ 372 static void 373 futex_queue_drain(struct futex *f) 374 { 375 struct futex_wait *fw, *fw_next; 376 377 mutex_enter(&f->fx_abortlock); 378 while (!LIST_EMPTY(&f->fx_abortlist)) 379 cv_wait(&f->fx_abortcv, &f->fx_abortlock); 380 mutex_exit(&f->fx_abortlock); 381 382 mutex_enter(&f->fx_qlock); 383 TAILQ_FOREACH_SAFE(fw, &f->fx_queue, fw_entry, fw_next) { 384 mutex_enter(&fw->fw_lock); 385 futex_wait_dequeue(fw, f); 386 cv_broadcast(&fw->fw_cv); 387 mutex_exit(&fw->fw_lock); 388 } 389 mutex_exit(&f->fx_qlock); 390 } 391 392 /* 393 * futex_queue_fini(fq) 394 * 395 * Finalize the futex queue initialized by futex_queue_init. Queue 396 * must be empty. Caller must not use f again until a subsequent 397 * futex_queue_init. 398 */ 399 static void 400 futex_queue_fini(struct futex *f) 401 { 402 403 KASSERT(TAILQ_EMPTY(&f->fx_queue)); 404 KASSERT(LIST_EMPTY(&f->fx_abortlist)); 405 mutex_destroy(&f->fx_qlock); 406 mutex_destroy(&f->fx_abortlock); 407 cv_destroy(&f->fx_abortcv); 408 } 409 410 /* 411 * futex_key_init(key, vm, va, shared) 412 * 413 * Initialize a futex key for lookup, etc. 414 */ 415 static int 416 futex_key_init(union futex_key *fk, struct vmspace *vm, vaddr_t va, bool shared) 417 { 418 int error = 0; 419 420 if (__predict_false(shared)) { 421 if (!uvm_voaddr_acquire(&vm->vm_map, va, &fk->fk_shared)) 422 error = EFAULT; 423 } else { 424 fk->fk_private.vmspace = vm; 425 fk->fk_private.va = va; 426 } 427 428 return error; 429 } 430 431 /* 432 * futex_key_fini(key, shared) 433 * 434 * Release a futex key. 435 */ 436 static void 437 futex_key_fini(union futex_key *fk, bool shared) 438 { 439 if (__predict_false(shared)) 440 uvm_voaddr_release(&fk->fk_shared); 441 memset(fk, 0, sizeof(*fk)); 442 } 443 444 /* 445 * futex_create(fk, shared) 446 * 447 * Create a futex. Initial reference count is 1, representing the 448 * caller. Returns NULL on failure. Always takes ownership of the 449 * key, either transferring it to the newly-created futex, or releasing 450 * the key if creation fails. 451 * 452 * Never sleeps for memory, but may sleep to acquire a lock. 
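 *
 *	Typical use, as in futex_lookup_create() below -- create a
 *	futex, try to insert it, and drop ours if another thread beat
 *	us to it (sketch only, error handling abbreviated):
 *
 *		f = futex_create(&fk, shared);	// consumes fk either way
 *		if (f == NULL)
 *			return ENOMEM;
 *		error = futex_insert(f, &f0);	// f0: ours or existing one
 *		if (error == 0 && f0 != f)
 *			futex_rele(f);		// lost the race; use f0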
453 */ 454 static struct futex * 455 futex_create(union futex_key *fk, bool shared) 456 { 457 struct futex *f; 458 459 f = kmem_alloc(sizeof(*f), KM_NOSLEEP); 460 if (f == NULL) { 461 futex_key_fini(fk, shared); 462 return NULL; 463 } 464 f->fx_key = *fk; 465 f->fx_refcnt = 1; 466 f->fx_shared = shared; 467 f->fx_on_tree = false; 468 futex_queue_init(f); 469 470 return f; 471 } 472 473 /* 474 * futex_destroy(f) 475 * 476 * Destroy a futex created with futex_create. Reference count 477 * must be zero. 478 * 479 * May sleep. 480 */ 481 static void 482 futex_destroy(struct futex *f) 483 { 484 485 ASSERT_SLEEPABLE(); 486 487 KASSERT(atomic_load_relaxed(&f->fx_refcnt) == 0); 488 KASSERT(!f->fx_on_tree); 489 490 /* Drain and destroy the private queue. */ 491 futex_queue_drain(f); 492 futex_queue_fini(f); 493 494 futex_key_fini(&f->fx_key, f->fx_shared); 495 496 kmem_free(f, sizeof(*f)); 497 } 498 499 /* 500 * futex_hold(f) 501 * 502 * Attempt to acquire a reference to f. Return 0 on success, 503 * ENFILE on too many references. 504 * 505 * Never sleeps. 506 */ 507 static int 508 futex_hold(struct futex *f) 509 { 510 unsigned long refcnt; 511 512 do { 513 refcnt = atomic_load_relaxed(&f->fx_refcnt); 514 if (refcnt == ULONG_MAX) 515 return ENFILE; 516 } while (atomic_cas_ulong(&f->fx_refcnt, refcnt, refcnt + 1) != refcnt); 517 518 return 0; 519 } 520 521 /* 522 * futex_rele(f) 523 * 524 * Release a reference to f acquired with futex_create or 525 * futex_hold. 526 * 527 * May sleep to free f. 528 */ 529 static void 530 futex_rele(struct futex *f) 531 { 532 unsigned long refcnt; 533 534 ASSERT_SLEEPABLE(); 535 536 do { 537 refcnt = atomic_load_relaxed(&f->fx_refcnt); 538 if (refcnt == 1) 539 goto trylast; 540 #ifndef __HAVE_ATOMIC_AS_MEMBAR 541 membar_release(); 542 #endif 543 } while (atomic_cas_ulong(&f->fx_refcnt, refcnt, refcnt - 1) != refcnt); 544 return; 545 546 trylast: 547 mutex_enter(&futex_tab.lock); 548 if (atomic_dec_ulong_nv(&f->fx_refcnt) == 0) { 549 #ifndef __HAVE_ATOMIC_AS_MEMBAR 550 membar_acquire(); 551 #endif 552 if (f->fx_on_tree) { 553 if (__predict_false(f->fx_shared)) 554 rb_tree_remove_node(&futex_tab.oa, f); 555 else 556 rb_tree_remove_node(&futex_tab.va, f); 557 f->fx_on_tree = false; 558 } 559 } else { 560 /* References remain -- don't destroy it. */ 561 f = NULL; 562 } 563 mutex_exit(&futex_tab.lock); 564 if (f != NULL) 565 futex_destroy(f); 566 } 567 568 /* 569 * futex_rele_not_last(f) 570 * 571 * Release a reference to f acquired with futex_create or 572 * futex_hold. 573 * 574 * This version asserts that we are not dropping the last 575 * reference to f. 576 */ 577 static void 578 futex_rele_not_last(struct futex *f) 579 { 580 unsigned long refcnt; 581 582 do { 583 refcnt = atomic_load_relaxed(&f->fx_refcnt); 584 KASSERT(refcnt > 1); 585 } while (atomic_cas_ulong(&f->fx_refcnt, refcnt, refcnt - 1) != refcnt); 586 } 587 588 /* 589 * futex_lookup_by_key(key, shared, &f) 590 * 591 * Try to find an existing futex va reference in the specified key 592 * On success, return 0, set f to found futex or to NULL if not found, 593 * and increment f's reference count if found. 594 * 595 * Return ENFILE if reference count too high. 596 * 597 * Internal lookup routine shared by futex_lookup() and 598 * futex_lookup_create(). 
599 */ 600 static int 601 futex_lookup_by_key(union futex_key *fk, bool shared, struct futex **fp) 602 { 603 struct futex *f; 604 int error = 0; 605 606 mutex_enter(&futex_tab.lock); 607 if (__predict_false(shared)) { 608 f = rb_tree_find_node(&futex_tab.oa, fk); 609 } else { 610 f = rb_tree_find_node(&futex_tab.va, fk); 611 } 612 if (f) { 613 error = futex_hold(f); 614 if (error) 615 f = NULL; 616 } 617 *fp = f; 618 mutex_exit(&futex_tab.lock); 619 620 return error; 621 } 622 623 /* 624 * futex_insert(f, fp) 625 * 626 * Try to insert the futex f into the tree by va. If there 627 * already is a futex for its va, acquire a reference to it, and 628 * store it in *fp; otherwise store f in *fp. 629 * 630 * Return 0 on success, ENFILE if there already is a futex but its 631 * reference count is too high. 632 */ 633 static int 634 futex_insert(struct futex *f, struct futex **fp) 635 { 636 struct futex *f0; 637 int error; 638 639 KASSERT(atomic_load_relaxed(&f->fx_refcnt) != 0); 640 KASSERT(!f->fx_on_tree); 641 642 mutex_enter(&futex_tab.lock); 643 if (__predict_false(f->fx_shared)) 644 f0 = rb_tree_insert_node(&futex_tab.oa, f); 645 else 646 f0 = rb_tree_insert_node(&futex_tab.va, f); 647 if (f0 == f) { 648 f->fx_on_tree = true; 649 error = 0; 650 } else { 651 KASSERT(atomic_load_relaxed(&f0->fx_refcnt) != 0); 652 KASSERT(f0->fx_on_tree); 653 error = futex_hold(f0); 654 if (error) 655 goto out; 656 } 657 *fp = f0; 658 out: mutex_exit(&futex_tab.lock); 659 660 return error; 661 } 662 663 /* 664 * futex_lookup(uaddr, shared, &f) 665 * 666 * Find a futex at the userland pointer uaddr in the current 667 * process's VM space. On success, return the futex in f and 668 * increment its reference count. 669 * 670 * Caller must call futex_rele when done. 671 */ 672 static int 673 futex_lookup(int *uaddr, bool shared, struct futex **fp) 674 { 675 union futex_key fk; 676 struct vmspace *vm = curproc->p_vmspace; 677 vaddr_t va = (vaddr_t)uaddr; 678 int error; 679 680 /* 681 * Reject unaligned user pointers so we don't cross page 682 * boundaries and so atomics will work. 683 */ 684 if ((va & 3) != 0) 685 return EINVAL; 686 687 /* Look it up. */ 688 error = futex_key_init(&fk, vm, va, shared); 689 if (error) 690 return error; 691 692 error = futex_lookup_by_key(&fk, shared, fp); 693 futex_key_fini(&fk, shared); 694 if (error) 695 return error; 696 697 KASSERT(*fp == NULL || (*fp)->fx_shared == shared); 698 KASSERT(*fp == NULL || atomic_load_relaxed(&(*fp)->fx_refcnt) != 0); 699 700 /* 701 * Success! (Caller must still check whether we found 702 * anything, but nothing went _wrong_ like trying to use 703 * unmapped memory.) 704 */ 705 KASSERT(error == 0); 706 707 return error; 708 } 709 710 /* 711 * futex_lookup_create(uaddr, shared, &f) 712 * 713 * Find or create a futex at the userland pointer uaddr in the 714 * current process's VM space. On success, return the futex in f 715 * and increment its reference count. 716 * 717 * Caller must call futex_rele when done. 718 */ 719 static int 720 futex_lookup_create(int *uaddr, bool shared, struct futex **fp) 721 { 722 union futex_key fk; 723 struct vmspace *vm = curproc->p_vmspace; 724 struct futex *f = NULL; 725 vaddr_t va = (vaddr_t)uaddr; 726 int error; 727 728 /* 729 * Reject unaligned user pointers so we don't cross page 730 * boundaries and so atomics will work. 
731 */ 732 if ((va & 3) != 0) 733 return EINVAL; 734 735 error = futex_key_init(&fk, vm, va, shared); 736 if (error) 737 return error; 738 739 /* 740 * Optimistically assume there already is one, and try to find 741 * it. 742 */ 743 error = futex_lookup_by_key(&fk, shared, fp); 744 if (error || *fp != NULL) { 745 /* 746 * We either found one, or there was an error. 747 * In either case, we are done with the key. 748 */ 749 futex_key_fini(&fk, shared); 750 goto out; 751 } 752 753 /* 754 * Create a futex record. This transfers ownership of the key 755 * in all cases. 756 */ 757 f = futex_create(&fk, shared); 758 if (f == NULL) { 759 error = ENOMEM; 760 goto out; 761 } 762 763 /* 764 * Insert our new futex, or use existing if someone else beat 765 * us to it. 766 */ 767 error = futex_insert(f, fp); 768 if (error) 769 goto out; 770 if (*fp == f) 771 f = NULL; /* don't release on exit */ 772 773 /* Success! */ 774 KASSERT(error == 0); 775 776 out: if (f != NULL) 777 futex_rele(f); 778 KASSERT(error || *fp != NULL); 779 KASSERT(error || atomic_load_relaxed(&(*fp)->fx_refcnt) != 0); 780 return error; 781 } 782 783 /* 784 * futex_wait_init(fw, bitset) 785 * 786 * Initialize a record for a thread to wait on a futex matching 787 * the specified bit set. Should be passed to futex_wait_enqueue 788 * before futex_wait, and should be passed to futex_wait_fini when 789 * done. 790 */ 791 static void 792 futex_wait_init(struct futex_wait *fw, int bitset) 793 { 794 795 KASSERT(bitset); 796 797 mutex_init(&fw->fw_lock, MUTEX_DEFAULT, IPL_NONE); 798 cv_init(&fw->fw_cv, "futex"); 799 fw->fw_futex = NULL; 800 fw->fw_bitset = bitset; 801 fw->fw_aborting = false; 802 } 803 804 /* 805 * futex_wait_fini(fw) 806 * 807 * Finalize a record for a futex waiter. Must not be on any 808 * futex's queue. 809 */ 810 static void 811 futex_wait_fini(struct futex_wait *fw) 812 { 813 814 KASSERT(fw->fw_futex == NULL); 815 816 cv_destroy(&fw->fw_cv); 817 mutex_destroy(&fw->fw_lock); 818 } 819 820 /* 821 * futex_wait_enqueue(fw, f) 822 * 823 * Put fw on the futex queue. Must be done before futex_wait. 824 * Caller must hold fw's lock and f's lock, and fw must not be on 825 * any existing futex's waiter list. 826 */ 827 static void 828 futex_wait_enqueue(struct futex_wait *fw, struct futex *f) 829 { 830 831 KASSERT(mutex_owned(&f->fx_qlock)); 832 KASSERT(mutex_owned(&fw->fw_lock)); 833 KASSERT(fw->fw_futex == NULL); 834 KASSERT(!fw->fw_aborting); 835 836 fw->fw_futex = f; 837 TAILQ_INSERT_TAIL(&f->fx_queue, fw, fw_entry); 838 } 839 840 /* 841 * futex_wait_dequeue(fw, f) 842 * 843 * Remove fw from the futex queue. Precludes subsequent 844 * futex_wait until a futex_wait_enqueue. Caller must hold fw's 845 * lock and f's lock, and fw must be on f. 846 */ 847 static void 848 futex_wait_dequeue(struct futex_wait *fw, struct futex *f) 849 { 850 851 KASSERT(mutex_owned(&f->fx_qlock)); 852 KASSERT(mutex_owned(&fw->fw_lock)); 853 KASSERT(fw->fw_futex == f); 854 855 TAILQ_REMOVE(&f->fx_queue, fw, fw_entry); 856 fw->fw_futex = NULL; 857 } 858 859 /* 860 * futex_wait_abort(fw) 861 * 862 * Caller is no longer waiting for fw. Remove it from any queue 863 * if it was on one. Caller must hold fw->fw_lock. 864 */ 865 static void 866 futex_wait_abort(struct futex_wait *fw) 867 { 868 struct futex *f; 869 870 KASSERT(mutex_owned(&fw->fw_lock)); 871 872 /* 873 * Grab the futex queue. It can't go away as long as we hold 874 * fw_lock. However, we can't take the queue lock because 875 * that's a lock order reversal. 
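	 * (See the lock order table near the top of this file:
	 * fx_qlock is taken before fw_lock, never the other way
	 * around, so with fw_lock held we must not take fx_qlock.)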
876 */ 877 f = fw->fw_futex; 878 879 /* Put us on the abort list so that fq won't go away. */ 880 mutex_enter(&f->fx_abortlock); 881 LIST_INSERT_HEAD(&f->fx_abortlist, fw, fw_abort); 882 mutex_exit(&f->fx_abortlock); 883 884 /* 885 * Mark fw as aborting so it won't lose wakeups and won't be 886 * transferred to any other queue. 887 */ 888 fw->fw_aborting = true; 889 890 /* f is now stable, so we can release fw_lock. */ 891 mutex_exit(&fw->fw_lock); 892 893 /* Now we can remove fw under the queue lock. */ 894 mutex_enter(&f->fx_qlock); 895 mutex_enter(&fw->fw_lock); 896 futex_wait_dequeue(fw, f); 897 mutex_exit(&fw->fw_lock); 898 mutex_exit(&f->fx_qlock); 899 900 /* 901 * Finally, remove us from the abort list and notify anyone 902 * waiting for the abort to complete if we were the last to go. 903 */ 904 mutex_enter(&f->fx_abortlock); 905 LIST_REMOVE(fw, fw_abort); 906 if (LIST_EMPTY(&f->fx_abortlist)) 907 cv_broadcast(&f->fx_abortcv); 908 mutex_exit(&f->fx_abortlock); 909 910 /* 911 * Release our reference to the futex now that we are not 912 * waiting for it. 913 */ 914 futex_rele(f); 915 916 /* 917 * Reacquire the fw lock as caller expects. Verify that we're 918 * aborting and no longer associated with a futex. 919 */ 920 mutex_enter(&fw->fw_lock); 921 KASSERT(fw->fw_aborting); 922 KASSERT(fw->fw_futex == NULL); 923 } 924 925 /* 926 * futex_wait(fw, deadline, clkid) 927 * 928 * fw must be a waiter on a futex's queue. Wait until deadline on 929 * the clock clkid, or forever if deadline is NULL, for a futex 930 * wakeup. Return 0 on explicit wakeup or destruction of futex, 931 * ETIMEDOUT on timeout, EINTR/ERESTART on signal. Either way, fw 932 * will no longer be on a futex queue on return. 933 */ 934 static int 935 futex_wait(struct futex_wait *fw, const struct timespec *deadline, 936 clockid_t clkid) 937 { 938 int error = 0; 939 940 /* Test and wait under the wait lock. */ 941 mutex_enter(&fw->fw_lock); 942 943 for (;;) { 944 /* If we're done yet, stop and report success. */ 945 if (fw->fw_bitset == 0 || fw->fw_futex == NULL) { 946 error = 0; 947 break; 948 } 949 950 /* If anything went wrong in the last iteration, stop. */ 951 if (error) 952 break; 953 954 /* Not done yet. Wait. */ 955 if (deadline) { 956 struct timespec ts; 957 958 /* Check our watch. */ 959 error = clock_gettime1(clkid, &ts); 960 if (error) 961 break; 962 963 /* If we're past the deadline, ETIMEDOUT. */ 964 if (timespeccmp(deadline, &ts, <=)) { 965 error = ETIMEDOUT; 966 break; 967 } 968 969 /* Count how much time is left. */ 970 timespecsub(deadline, &ts, &ts); 971 972 /* Wait for that much time, allowing signals. */ 973 error = cv_timedwait_sig(&fw->fw_cv, &fw->fw_lock, 974 tstohz(&ts)); 975 } else { 976 /* Wait indefinitely, allowing signals. */ 977 error = cv_wait_sig(&fw->fw_cv, &fw->fw_lock); 978 } 979 } 980 981 /* 982 * If we were woken up, the waker will have removed fw from the 983 * queue. But if anything went wrong, we must remove fw from 984 * the queue ourselves. While here, convert EWOULDBLOCK to 985 * ETIMEDOUT. 986 */ 987 if (error) { 988 futex_wait_abort(fw); 989 if (error == EWOULDBLOCK) 990 error = ETIMEDOUT; 991 } 992 993 mutex_exit(&fw->fw_lock); 994 995 return error; 996 } 997 998 /* 999 * futex_wake(f, nwake, f2, nrequeue, bitset) 1000 * 1001 * Wake up to nwake waiters on f matching bitset; then, if f2 is 1002 * provided, move up to nrequeue remaining waiters on f matching 1003 * bitset to f2. Return the number of waiters actually woken. 
1004 * Caller must hold the locks of f and f2, if provided. 1005 */ 1006 static unsigned 1007 futex_wake(struct futex *f, unsigned nwake, struct futex *f2, 1008 unsigned nrequeue, int bitset) 1009 { 1010 struct futex_wait *fw, *fw_next; 1011 unsigned nwoken = 0; 1012 int hold_error __diagused; 1013 1014 KASSERT(mutex_owned(&f->fx_qlock)); 1015 KASSERT(f2 == NULL || mutex_owned(&f2->fx_qlock)); 1016 1017 /* Wake up to nwake waiters, and count the number woken. */ 1018 TAILQ_FOREACH_SAFE(fw, &f->fx_queue, fw_entry, fw_next) { 1019 if ((fw->fw_bitset & bitset) == 0) 1020 continue; 1021 if (nwake > 0) { 1022 mutex_enter(&fw->fw_lock); 1023 if (__predict_false(fw->fw_aborting)) { 1024 mutex_exit(&fw->fw_lock); 1025 continue; 1026 } 1027 futex_wait_dequeue(fw, f); 1028 fw->fw_bitset = 0; 1029 cv_broadcast(&fw->fw_cv); 1030 mutex_exit(&fw->fw_lock); 1031 nwake--; 1032 nwoken++; 1033 /* 1034 * Drop the futex reference on behalf of the 1035 * waiter. We assert this is not the last 1036 * reference on the futex (our caller should 1037 * also have one). 1038 */ 1039 futex_rele_not_last(f); 1040 } else { 1041 break; 1042 } 1043 } 1044 1045 if (f2) { 1046 /* Move up to nrequeue waiters from f's queue to f2's queue. */ 1047 TAILQ_FOREACH_SAFE(fw, &f->fx_queue, fw_entry, fw_next) { 1048 if ((fw->fw_bitset & bitset) == 0) 1049 continue; 1050 if (nrequeue > 0) { 1051 mutex_enter(&fw->fw_lock); 1052 if (__predict_false(fw->fw_aborting)) { 1053 mutex_exit(&fw->fw_lock); 1054 continue; 1055 } 1056 futex_wait_dequeue(fw, f); 1057 futex_wait_enqueue(fw, f2); 1058 mutex_exit(&fw->fw_lock); 1059 nrequeue--; 1060 /* 1061 * Transfer the reference from f to f2. 1062 * As above, we assert that we are not 1063 * dropping the last reference to f here. 1064 * 1065 * XXX futex_hold() could theoretically 1066 * XXX fail here. 1067 */ 1068 futex_rele_not_last(f); 1069 hold_error = futex_hold(f2); 1070 KASSERT(hold_error == 0); 1071 } else { 1072 break; 1073 } 1074 } 1075 } else { 1076 KASSERT(nrequeue == 0); 1077 } 1078 1079 /* Return the number of waiters woken. */ 1080 return nwoken; 1081 } 1082 1083 /* 1084 * futex_queue_lock(f) 1085 * 1086 * Acquire the queue lock of f. Pair with futex_queue_unlock. Do 1087 * not use if caller needs to acquire two locks; use 1088 * futex_queue_lock2 instead. 1089 */ 1090 static void 1091 futex_queue_lock(struct futex *f) 1092 { 1093 mutex_enter(&f->fx_qlock); 1094 } 1095 1096 /* 1097 * futex_queue_unlock(f) 1098 * 1099 * Release the queue lock of f. 1100 */ 1101 static void 1102 futex_queue_unlock(struct futex *f) 1103 { 1104 mutex_exit(&f->fx_qlock); 1105 } 1106 1107 /* 1108 * futex_queue_lock2(f, f2) 1109 * 1110 * Acquire the queue locks of both f and f2, which may be null, or 1111 * which may have the same underlying queue. If they are 1112 * distinct, an arbitrary total order is chosen on the locks. 1113 * 1114 * Callers should only ever acquire multiple queue locks 1115 * simultaneously using futex_queue_lock2. 1116 */ 1117 static void 1118 futex_queue_lock2(struct futex *f, struct futex *f2) 1119 { 1120 1121 /* 1122 * If both are null, do nothing; if one is null and the other 1123 * is not, lock the other and be done with it. 1124 */ 1125 if (f == NULL && f2 == NULL) { 1126 return; 1127 } else if (f == NULL) { 1128 mutex_enter(&f2->fx_qlock); 1129 return; 1130 } else if (f2 == NULL) { 1131 mutex_enter(&f->fx_qlock); 1132 return; 1133 } 1134 1135 /* If both futexes are the same, acquire only one. 
*/ 1136 if (f == f2) { 1137 mutex_enter(&f->fx_qlock); 1138 return; 1139 } 1140 1141 /* Otherwise, use the ordering on the kva of the futex pointer. */ 1142 if ((uintptr_t)f < (uintptr_t)f2) { 1143 mutex_enter(&f->fx_qlock); 1144 mutex_enter(&f2->fx_qlock); 1145 } else { 1146 mutex_enter(&f2->fx_qlock); 1147 mutex_enter(&f->fx_qlock); 1148 } 1149 } 1150 1151 /* 1152 * futex_queue_unlock2(f, f2) 1153 * 1154 * Release the queue locks of both f and f2, which may be null, or 1155 * which may have the same underlying queue. 1156 */ 1157 static void 1158 futex_queue_unlock2(struct futex *f, struct futex *f2) 1159 { 1160 1161 /* 1162 * If both are null, do nothing; if one is null and the other 1163 * is not, unlock the other and be done with it. 1164 */ 1165 if (f == NULL && f2 == NULL) { 1166 return; 1167 } else if (f == NULL) { 1168 mutex_exit(&f2->fx_qlock); 1169 return; 1170 } else if (f2 == NULL) { 1171 mutex_exit(&f->fx_qlock); 1172 return; 1173 } 1174 1175 /* If both futexes are the same, release only one. */ 1176 if (f == f2) { 1177 mutex_exit(&f->fx_qlock); 1178 return; 1179 } 1180 1181 /* Otherwise, use the ordering on the kva of the futex pointer. */ 1182 if ((uintptr_t)f < (uintptr_t)f2) { 1183 mutex_exit(&f2->fx_qlock); 1184 mutex_exit(&f->fx_qlock); 1185 } else { 1186 mutex_exit(&f->fx_qlock); 1187 mutex_exit(&f2->fx_qlock); 1188 } 1189 } 1190 1191 /* 1192 * futex_func_wait(uaddr, val, val3, timeout, clkid, clkflags, retval) 1193 * 1194 * Implement futex(FUTEX_WAIT). 1195 */ 1196 static int 1197 futex_func_wait(bool shared, int *uaddr, int val, int val3, 1198 const struct timespec *timeout, clockid_t clkid, int clkflags, 1199 register_t *retval) 1200 { 1201 struct futex *f; 1202 struct futex_wait wait, *fw = &wait; 1203 struct timespec ts; 1204 const struct timespec *deadline; 1205 int error; 1206 1207 /* 1208 * If there's nothing to wait for, and nobody will ever wake 1209 * us, then don't set anything up to wait -- just stop here. 1210 */ 1211 if (val3 == 0) 1212 return EINVAL; 1213 1214 /* Optimistically test before anything else. */ 1215 if (!futex_test(uaddr, val)) 1216 return EAGAIN; 1217 1218 /* Determine a deadline on the specified clock. */ 1219 if (timeout == NULL || (clkflags & TIMER_ABSTIME) == TIMER_ABSTIME) { 1220 deadline = timeout; 1221 } else { 1222 error = clock_gettime1(clkid, &ts); 1223 if (error) 1224 return error; 1225 timespecadd(&ts, timeout, &ts); 1226 deadline = &ts; 1227 } 1228 1229 /* Get the futex, creating it if necessary. */ 1230 error = futex_lookup_create(uaddr, shared, &f); 1231 if (error) 1232 return error; 1233 KASSERT(f); 1234 1235 /* Get ready to wait. */ 1236 futex_wait_init(fw, val3); 1237 1238 /* 1239 * Under the queue lock, check the value again: if it has 1240 * already changed, EAGAIN; otherwise enqueue the waiter. 1241 * Since FUTEX_WAKE will use the same lock and be done after 1242 * modifying the value, the order in which we check and enqueue 1243 * is immaterial. 1244 */ 1245 futex_queue_lock(f); 1246 if (!futex_test(uaddr, val)) { 1247 futex_queue_unlock(f); 1248 error = EAGAIN; 1249 goto out; 1250 } 1251 mutex_enter(&fw->fw_lock); 1252 futex_wait_enqueue(fw, f); 1253 mutex_exit(&fw->fw_lock); 1254 futex_queue_unlock(f); 1255 1256 /* 1257 * We cannot drop our reference to the futex here, because 1258 * we might be enqueued on a different one when we are awakened. 1259 * The references will be managed on our behalf in the requeue 1260 * and wake cases. 1261 */ 1262 f = NULL; 1263 1264 /* Wait. 
*/ 1265 error = futex_wait(fw, deadline, clkid); 1266 if (error) 1267 goto out; 1268 1269 /* Return 0 on success, error on failure. */ 1270 *retval = 0; 1271 1272 out: if (f != NULL) 1273 futex_rele(f); 1274 futex_wait_fini(fw); 1275 return error; 1276 } 1277 1278 /* 1279 * futex_func_wake(uaddr, val, val3, retval) 1280 * 1281 * Implement futex(FUTEX_WAKE) and futex(FUTEX_WAKE_BITSET). 1282 */ 1283 static int 1284 futex_func_wake(bool shared, int *uaddr, int val, int val3, register_t *retval) 1285 { 1286 struct futex *f; 1287 unsigned int nwoken = 0; 1288 int error = 0; 1289 1290 /* Reject negative number of wakeups. */ 1291 if (val < 0) { 1292 error = EINVAL; 1293 goto out; 1294 } 1295 1296 /* Look up the futex, if any. */ 1297 error = futex_lookup(uaddr, shared, &f); 1298 if (error) 1299 goto out; 1300 1301 /* If there's no futex, there are no waiters to wake. */ 1302 if (f == NULL) 1303 goto out; 1304 1305 /* 1306 * Under f's queue lock, wake the waiters and remember the 1307 * number woken. 1308 */ 1309 futex_queue_lock(f); 1310 nwoken = futex_wake(f, val, NULL, 0, val3); 1311 futex_queue_unlock(f); 1312 1313 /* Release the futex. */ 1314 futex_rele(f); 1315 1316 out: 1317 /* Return the number of waiters woken. */ 1318 *retval = nwoken; 1319 1320 /* Success! */ 1321 return error; 1322 } 1323 1324 /* 1325 * futex_func_requeue(op, uaddr, val, uaddr2, val2, val3, retval) 1326 * 1327 * Implement futex(FUTEX_REQUEUE) and futex(FUTEX_CMP_REQUEUE). 1328 */ 1329 static int 1330 futex_func_requeue(bool shared, int op, int *uaddr, int val, int *uaddr2, 1331 int val2, int val3, register_t *retval) 1332 { 1333 struct futex *f = NULL, *f2 = NULL; 1334 unsigned nwoken = 0; /* default to zero woken on early return */ 1335 int error; 1336 1337 /* Reject negative number of wakeups or requeues. */ 1338 if (val < 0 || val2 < 0) { 1339 error = EINVAL; 1340 goto out; 1341 } 1342 1343 /* Look up the source futex, if any. */ 1344 error = futex_lookup(uaddr, shared, &f); 1345 if (error) 1346 goto out; 1347 1348 /* If there is none, nothing to do. */ 1349 if (f == NULL) 1350 goto out; 1351 1352 /* 1353 * We may need to create the destination futex because it's 1354 * entirely possible it does not currently have any waiters. 1355 */ 1356 error = futex_lookup_create(uaddr2, shared, &f2); 1357 if (error) 1358 goto out; 1359 1360 /* 1361 * Under the futexes' queue locks, check the value; if 1362 * unchanged from val3, wake the waiters. 1363 */ 1364 futex_queue_lock2(f, f2); 1365 if (op == FUTEX_CMP_REQUEUE && !futex_test(uaddr, val3)) { 1366 error = EAGAIN; 1367 } else { 1368 error = 0; 1369 nwoken = futex_wake(f, val, f2, val2, FUTEX_BITSET_MATCH_ANY); 1370 } 1371 futex_queue_unlock2(f, f2); 1372 1373 out: 1374 /* Return the number of waiters woken. */ 1375 *retval = nwoken; 1376 1377 /* Release the futexes if we got them. */ 1378 if (f2) 1379 futex_rele(f2); 1380 if (f) 1381 futex_rele(f); 1382 return error; 1383 } 1384 1385 /* 1386 * futex_validate_op_cmp(val3) 1387 * 1388 * Validate an op/cmp argument for FUTEX_WAKE_OP. 
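 *
 *	val3 packs an op, oparg, cmp, and cmparg field, as defined by
 *	the FUTEX_OP_*_MASK constants in <sys/futex.h> (the same layout
 *	Linux uses).  A hypothetical encoding helper, shown only to
 *	illustrate the decoding done below:
 *
 *		static inline int
 *		futex_op_encode(int op, int oparg, int cmp, int cmparg)
 *		{
 *			return __SHIFTIN(op, FUTEX_OP_OP_MASK) |
 *			    __SHIFTIN(cmp, FUTEX_OP_CMP_MASK) |
 *			    __SHIFTIN(oparg, FUTEX_OP_OPARG_MASK) |
 *			    __SHIFTIN(cmparg, FUTEX_OP_CMPARG_MASK);
 *		}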
 */
static int
futex_validate_op_cmp(int val3)
{
	int op = __SHIFTOUT(val3, FUTEX_OP_OP_MASK);
	int cmp = __SHIFTOUT(val3, FUTEX_OP_CMP_MASK);

	if (op & FUTEX_OP_OPARG_SHIFT) {
		int oparg = __SHIFTOUT(val3, FUTEX_OP_OPARG_MASK);
		if (oparg < 0)
			return EINVAL;
		if (oparg >= 32)
			return EINVAL;
		op &= ~FUTEX_OP_OPARG_SHIFT;
	}

	switch (op) {
	case FUTEX_OP_SET:
	case FUTEX_OP_ADD:
	case FUTEX_OP_OR:
	case FUTEX_OP_ANDN:
	case FUTEX_OP_XOR:
		break;
	default:
		return EINVAL;
	}

	switch (cmp) {
	case FUTEX_OP_CMP_EQ:
	case FUTEX_OP_CMP_NE:
	case FUTEX_OP_CMP_LT:
	case FUTEX_OP_CMP_LE:
	case FUTEX_OP_CMP_GT:
	case FUTEX_OP_CMP_GE:
		break;
	default:
		return EINVAL;
	}

	return 0;
}

/*
 * futex_compute_op(oldval, val3)
 *
 *	Apply a FUTEX_WAKE_OP operation to oldval.
 */
static int
futex_compute_op(int oldval, int val3)
{
	int op = __SHIFTOUT(val3, FUTEX_OP_OP_MASK);
	int oparg = __SHIFTOUT(val3, FUTEX_OP_OPARG_MASK);

	if (op & FUTEX_OP_OPARG_SHIFT) {
		KASSERT(oparg >= 0);
		KASSERT(oparg < 32);
		oparg = 1u << oparg;
		op &= ~FUTEX_OP_OPARG_SHIFT;
	}

	switch (op) {
	case FUTEX_OP_SET:
		return oparg;

	case FUTEX_OP_ADD:
		/*
		 * Avoid signed arithmetic overflow by doing
		 * arithmetic unsigned and converting back to signed
		 * at the end.
		 */
		return (int)((unsigned)oldval + (unsigned)oparg);

	case FUTEX_OP_OR:
		return oldval | oparg;

	case FUTEX_OP_ANDN:
		return oldval & ~oparg;

	case FUTEX_OP_XOR:
		return oldval ^ oparg;

	default:
		panic("invalid futex op");
	}
}

/*
 * futex_compute_cmp(oldval, val3)
 *
 *	Apply a FUTEX_WAKE_OP comparison to oldval.
 */
static bool
futex_compute_cmp(int oldval, int val3)
{
	int cmp = __SHIFTOUT(val3, FUTEX_OP_CMP_MASK);
	int cmparg = __SHIFTOUT(val3, FUTEX_OP_CMPARG_MASK);

	switch (cmp) {
	case FUTEX_OP_CMP_EQ:
		return (oldval == cmparg);

	case FUTEX_OP_CMP_NE:
		return (oldval != cmparg);

	case FUTEX_OP_CMP_LT:
		return (oldval < cmparg);

	case FUTEX_OP_CMP_LE:
		return (oldval <= cmparg);

	case FUTEX_OP_CMP_GT:
		return (oldval > cmparg);

	case FUTEX_OP_CMP_GE:
		return (oldval >= cmparg);

	default:
		panic("invalid futex cmp operation");
	}
}

/*
 * futex_func_wake_op(uaddr, val, uaddr2, val2, val3, retval)
 *
 *	Implement futex(FUTEX_WAKE_OP).
 */
static int
futex_func_wake_op(bool shared, int *uaddr, int val, int *uaddr2, int val2,
    int val3, register_t *retval)
{
	struct futex *f = NULL, *f2 = NULL;
	int oldval, newval, actual;
	unsigned nwoken = 0;
	int error;

	/* Reject negative number of wakeups. */
	if (val < 0 || val2 < 0) {
		error = EINVAL;
		goto out;
	}

	/* Reject invalid operations before we start doing things. */
	if ((error = futex_validate_op_cmp(val3)) != 0)
		goto out;

	/* Look up the first futex, if any. */
	error = futex_lookup(uaddr, shared, &f);
	if (error)
		goto out;

	/* Look up the second futex, if any.
*/ 1540 error = futex_lookup(uaddr2, shared, &f2); 1541 if (error) 1542 goto out; 1543 1544 /* 1545 * Under the queue locks: 1546 * 1547 * 1. Read/modify/write: *uaddr2 op= oparg. 1548 * 2. Unconditionally wake uaddr. 1549 * 3. Conditionally wake uaddr2, if it previously matched val2. 1550 */ 1551 futex_queue_lock2(f, f2); 1552 do { 1553 error = futex_load(uaddr2, &oldval); 1554 if (error) 1555 goto out_unlock; 1556 newval = futex_compute_op(oldval, val3); 1557 error = ucas_int(uaddr2, oldval, newval, &actual); 1558 if (error) 1559 goto out_unlock; 1560 } while (actual != oldval); 1561 nwoken = (f ? futex_wake(f, val, NULL, 0, FUTEX_BITSET_MATCH_ANY) : 0); 1562 if (f2 && futex_compute_cmp(oldval, val3)) 1563 nwoken += futex_wake(f2, val2, NULL, 0, 1564 FUTEX_BITSET_MATCH_ANY); 1565 1566 /* Success! */ 1567 error = 0; 1568 out_unlock: 1569 futex_queue_unlock2(f, f2); 1570 1571 out: 1572 /* Return the number of waiters woken. */ 1573 *retval = nwoken; 1574 1575 /* Release the futexes, if we got them. */ 1576 if (f2) 1577 futex_rele(f2); 1578 if (f) 1579 futex_rele(f); 1580 return error; 1581 } 1582 1583 /* 1584 * do_futex(uaddr, op, val, timeout, uaddr2, val2, val3) 1585 * 1586 * Implement the futex system call with all the parameters 1587 * parsed out. 1588 */ 1589 int 1590 do_futex(int *uaddr, int op, int val, const struct timespec *timeout, 1591 int *uaddr2, int val2, int val3, register_t *retval) 1592 { 1593 const bool shared = (op & FUTEX_PRIVATE_FLAG) ? false : true; 1594 const clockid_t clkid = (op & FUTEX_CLOCK_REALTIME) ? CLOCK_REALTIME 1595 : CLOCK_MONOTONIC; 1596 1597 op &= FUTEX_CMD_MASK; 1598 1599 switch (op) { 1600 case FUTEX_WAIT: 1601 return futex_func_wait(shared, uaddr, val, 1602 FUTEX_BITSET_MATCH_ANY, timeout, clkid, TIMER_RELTIME, 1603 retval); 1604 1605 case FUTEX_WAKE: 1606 val3 = FUTEX_BITSET_MATCH_ANY; 1607 /* FALLTHROUGH */ 1608 case FUTEX_WAKE_BITSET: 1609 return futex_func_wake(shared, uaddr, val, val3, retval); 1610 1611 case FUTEX_REQUEUE: 1612 case FUTEX_CMP_REQUEUE: 1613 return futex_func_requeue(shared, op, uaddr, val, uaddr2, 1614 val2, val3, retval); 1615 1616 case FUTEX_WAIT_BITSET: 1617 return futex_func_wait(shared, uaddr, val, val3, timeout, 1618 clkid, TIMER_ABSTIME, retval); 1619 1620 case FUTEX_WAKE_OP: 1621 return futex_func_wake_op(shared, uaddr, val, uaddr2, val2, 1622 val3, retval); 1623 1624 case FUTEX_FD: 1625 default: 1626 return ENOSYS; 1627 } 1628 } 1629 1630 /* 1631 * sys___futex(l, uap, retval) 1632 * 1633 * __futex(2) system call: generic futex operations. 1634 */ 1635 int 1636 sys___futex(struct lwp *l, const struct sys___futex_args *uap, 1637 register_t *retval) 1638 { 1639 /* { 1640 syscallarg(int *) uaddr; 1641 syscallarg(int) op; 1642 syscallarg(int) val; 1643 syscallarg(const struct timespec *) timeout; 1644 syscallarg(int *) uaddr2; 1645 syscallarg(int) val2; 1646 syscallarg(int) val3; 1647 } */ 1648 struct timespec ts, *tsp; 1649 int error; 1650 1651 /* 1652 * Copy in the timeout argument, if specified. 1653 */ 1654 if (SCARG(uap, timeout)) { 1655 error = copyin(SCARG(uap, timeout), &ts, sizeof(ts)); 1656 if (error) 1657 return error; 1658 tsp = &ts; 1659 } else { 1660 tsp = NULL; 1661 } 1662 1663 return do_futex(SCARG(uap, uaddr), SCARG(uap, op), SCARG(uap, val), 1664 tsp, SCARG(uap, uaddr2), SCARG(uap, val2), SCARG(uap, val3), 1665 retval); 1666 } 1667 1668 /* 1669 * sys___futex_set_robust_list(l, uap, retval) 1670 * 1671 * __futex_set_robust_list(2) system call for robust futexes. 
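 *
 *	The head registered here is _FUTEX_ROBUST_HEAD_SIZE bytes: the
 *	three machine words (list, offset, pending) that
 *	futex_release_all_lwp() below reads back at LWP exit.  Rendered
 *	as a hypothetical C structure, for illustration only:
 *
 *		struct robust_head {
 *			uintptr_t	list;	 // ring of held futexes
 *			long		offset;	 // entry -> futex word
 *			uintptr_t	pending; // entry being acquired
 *						 // or released right now
 *		};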
 */
int
sys___futex_set_robust_list(struct lwp *l,
    const struct sys___futex_set_robust_list_args *uap, register_t *retval)
{
	/* {
		syscallarg(void *) head;
		syscallarg(size_t) len;
	} */
	void *head = SCARG(uap, head);

	if (SCARG(uap, len) != _FUTEX_ROBUST_HEAD_SIZE)
		return EINVAL;
	if ((uintptr_t)head % sizeof(u_long))
		return EINVAL;

	l->l_robust_head = (uintptr_t)head;

	return 0;
}

/*
 * sys___futex_get_robust_list(l, uap, retval)
 *
 *	__futex_get_robust_list(2) system call for robust futexes.
 */
int
sys___futex_get_robust_list(struct lwp *l,
    const struct sys___futex_get_robust_list_args *uap, register_t *retval)
{
	/* {
		syscallarg(lwpid_t) lwpid;
		syscallarg(void **) headp;
		syscallarg(size_t *) lenp;
	} */
	void *head;
	const size_t len = _FUTEX_ROBUST_HEAD_SIZE;
	int error;

	error = futex_robust_head_lookup(l, SCARG(uap, lwpid), &head);
	if (error)
		return error;

	/* Copy out the head pointer and the head structure length. */
	error = copyout(&head, SCARG(uap, headp), sizeof(head));
	if (__predict_true(error == 0)) {
		error = copyout(&len, SCARG(uap, lenp), sizeof(len));
	}

	return error;
}

/*
 * release_futex(uva, tid)
 *
 *	Try to release the robust futex at uva in the current process
 *	on lwp exit.  If anything goes wrong, silently fail.  It is the
 *	userland program's obligation to arrange correct behaviour.
 */
static void
release_futex(uintptr_t const uptr, lwpid_t const tid, bool const is_pi,
    bool const is_pending)
{
	int *uaddr;
	struct futex *f;
	int oldval, newval, actual;
	int error;

	/* If it's misaligned, tough. */
	if (__predict_false(uptr & 3))
		return;
	uaddr = (int *)uptr;

	error = futex_load(uaddr, &oldval);
	if (__predict_false(error))
		return;

	/*
	 * There are two race conditions we need to handle here:
	 *
	 * 1. User space cleared the futex word but died before
	 *    being able to issue the wakeup.  No wakeups will
	 *    ever be issued, oops!
	 *
	 * 2. Awakened waiter died before being able to acquire
	 *    the futex in user space.  Any other waiters are
	 *    now stuck, oops!
	 *
	 * In both of these cases, the futex word will be 0 (because
	 * it's updated before the wake is issued).  The best we can
	 * do is detect this situation if it's the pending futex and
	 * issue a wake without modifying the futex word.
	 *
	 * XXX eventual PI handling?
	 */
	if (__predict_false(is_pending && (oldval & ~FUTEX_WAITERS) == 0)) {
		register_t retval;
		(void) futex_func_wake(/*shared*/true, uaddr, 1,
		    FUTEX_BITSET_MATCH_ANY, &retval);
		return;
	}

	/* Optimistically test whether we need to do anything at all. */
	if ((oldval & FUTEX_TID_MASK) != tid)
		return;

	/*
	 * We need to handle the case where this thread owned the futex,
	 * but it was uncontended.  In this case, there won't be any
	 * kernel state to look up.  All we can do is mark the futex
	 * as a zombie to be mopped up the next time another thread
	 * attempts to acquire it.
	 *
	 * N.B. It's important to ensure that FUTEX_OWNER_DIED gets set
	 * in this loop, even if waiters appear while we're doing so.
	 * This is because FUTEX_WAITERS is set by user space before
	 * calling __futex() to wait, and the futex needs to be marked
	 * as a zombie when the new waiter gets into the kernel.
	 */
	if ((oldval & FUTEX_WAITERS) == 0) {
		do {
			error = futex_load(uaddr, &oldval);
			if (error)
				return;
			if ((oldval & FUTEX_TID_MASK) != tid)
				return;
			newval = oldval | FUTEX_OWNER_DIED;
			error = ucas_int(uaddr, oldval, newval, &actual);
			if (error)
				return;
		} while (actual != oldval);

		/*
		 * If there is still no indication of waiters, then there
		 * is no more work for us to do.
		 */
		if ((oldval & FUTEX_WAITERS) == 0)
			return;
	}

	/*
	 * Look for a shared futex since we have no positive indication
	 * it is private.  If we can't, tough.
	 */
	error = futex_lookup(uaddr, /*shared*/true, &f);
	if (error)
		return;

	/*
	 * If there's no kernel state for this futex, there's nothing to
	 * release.
	 */
	if (f == NULL)
		return;

	/* Work under the futex queue lock. */
	futex_queue_lock(f);

	/*
	 * Fetch the word: if the tid doesn't match ours, skip;
	 * otherwise, set the owner-died bit, atomically.
	 */
	do {
		error = futex_load(uaddr, &oldval);
		if (error)
			goto out;
		if ((oldval & FUTEX_TID_MASK) != tid)
			goto out;
		newval = oldval | FUTEX_OWNER_DIED;
		error = ucas_int(uaddr, oldval, newval, &actual);
		if (error)
			goto out;
	} while (actual != oldval);

	/*
	 * If there may be waiters, try to wake one.  If anything goes
	 * wrong, tough.
	 *
	 * XXX eventual PI handling?
	 */
	if (oldval & FUTEX_WAITERS)
		(void)futex_wake(f, 1, NULL, 0, FUTEX_BITSET_MATCH_ANY);

	/* Unlock the queue and release the futex. */
out:	futex_queue_unlock(f);
	futex_rele(f);
}

/*
 * futex_robust_head_lookup(l, lwpid)
 *
 *	Helper function to look up a robust head by LWP ID.
 */
int
futex_robust_head_lookup(struct lwp *l, lwpid_t lwpid, void **headp)
{
	struct proc *p = l->l_proc;

	/* Find the other lwp, if requested; otherwise use our robust head. */
	if (lwpid) {
		mutex_enter(p->p_lock);
		l = lwp_find(p, lwpid);
		if (l == NULL) {
			mutex_exit(p->p_lock);
			return ESRCH;
		}
		*headp = (void *)l->l_robust_head;
		mutex_exit(p->p_lock);
	} else {
		*headp = (void *)l->l_robust_head;
	}
	return 0;
}

/*
 * futex_fetch_robust_head(uaddr)
 *
 *	Helper routine to fetch the futex robust list head that
 *	handles 32-bit binaries running on 64-bit kernels.
 */
static int
futex_fetch_robust_head(uintptr_t uaddr, u_long *rhead)
{
#ifdef _LP64
	if (curproc->p_flag & PK_32) {
		uint32_t rhead32[_FUTEX_ROBUST_HEAD_NWORDS];
		int error;

		error = copyin((void *)uaddr, rhead32, sizeof(rhead32));
		if (__predict_true(error == 0)) {
			for (int i = 0; i < _FUTEX_ROBUST_HEAD_NWORDS; i++) {
				if (i == _FUTEX_ROBUST_HEAD_OFFSET) {
					/*
					 * Make sure the offset is sign-
					 * extended.
					 */
					rhead[i] = (int32_t)rhead32[i];
				} else {
					rhead[i] = rhead32[i];
				}
			}
		}
		return error;
	}
#endif /* _LP64 */

	return copyin((void *)uaddr, rhead,
	    sizeof(*rhead) * _FUTEX_ROBUST_HEAD_NWORDS);
}

/*
 * futex_decode_robust_word(word)
 *
 *	Decode a robust futex list word into the entry and entry
 *	properties.
 */
static inline void
futex_decode_robust_word(uintptr_t const word, uintptr_t * const entry,
    bool * const is_pi)
{
	*is_pi = (word & _FUTEX_ROBUST_ENTRY_PI) ? true : false;
	*entry = word & ~_FUTEX_ROBUST_ENTRY_PI;
}

/*
 * futex_fetch_robust_entry(uaddr)
 *
 *	Helper routine to fetch and decode a robust futex entry
 *	that handles 32-bit binaries running on 64-bit kernels.
 */
static int
futex_fetch_robust_entry(uintptr_t const uaddr, uintptr_t * const valp,
    bool * const is_pi)
{
	uintptr_t val = 0;
	int error = 0;

#ifdef _LP64
	if (curproc->p_flag & PK_32) {
		uint32_t val32;

		error = ufetch_32((uint32_t *)uaddr, &val32);
		if (__predict_true(error == 0))
			val = val32;
	} else
#endif /* _LP64 */
		error = ufetch_long((u_long *)uaddr, (u_long *)&val);
	if (__predict_false(error))
		return error;

	futex_decode_robust_word(val, valp, is_pi);
	return 0;
}

/*
 * futex_release_all_lwp(l)
 *
 *	Release all of l's robust futexes.  If anything looks funny in
 *	the process, give up -- it's userland's responsibility to dot
 *	the i's and cross the t's.
 */
void
futex_release_all_lwp(struct lwp * const l)
{
	u_long rhead[_FUTEX_ROBUST_HEAD_NWORDS];
	int limit = 1000000;
	int error;

	/* If there's no robust list there's nothing to do. */
	if (l->l_robust_head == 0)
		return;

	KASSERT((l->l_lid & FUTEX_TID_MASK) == l->l_lid);

	/* Read the final snapshot of the robust list head. */
	error = futex_fetch_robust_head(l->l_robust_head, rhead);
	if (error) {
		printf("WARNING: pid %jd (%s) lwp %jd:"
		    " unmapped robust futex list head\n",
		    (uintmax_t)l->l_proc->p_pid, l->l_proc->p_comm,
		    (uintmax_t)l->l_lid);
		return;
	}

	const long offset = (long)rhead[_FUTEX_ROBUST_HEAD_OFFSET];

	uintptr_t next, pending;
	bool is_pi, pending_is_pi;

	futex_decode_robust_word(rhead[_FUTEX_ROBUST_HEAD_LIST],
	    &next, &is_pi);
	futex_decode_robust_word(rhead[_FUTEX_ROBUST_HEAD_PENDING],
	    &pending, &pending_is_pi);

	/*
	 * Walk down the list of locked futexes and release them, up
	 * to one million of them before we give up.
	 */

	while (next != l->l_robust_head && limit-- > 0) {
		/* pending handled below. */
		if (next != pending)
			release_futex(next + offset, l->l_lid, is_pi, false);
		error = futex_fetch_robust_entry(next, &next, &is_pi);
		if (error)
			break;
		preempt_point();
	}
	if (limit <= 0) {
		printf("WARNING: pid %jd (%s) lwp %jd:"
		    " exhausted robust futex limit\n",
		    (uintmax_t)l->l_proc->p_pid, l->l_proc->p_comm,
		    (uintmax_t)l->l_lid);
	}

	/* If there's a pending futex, it may need to be released too. */
	if (pending != 0) {
		release_futex(pending + offset, l->l_lid, pending_is_pi, true);
	}
}