1 /*	$NetBSD: sys_futex.c,v 1.11 2020/05/05 15:25:18 riastradh Exp $	*/
2
3 /*-
4  * Copyright (c) 2018, 2019, 2020 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Taylor R. Campbell and Jason R. Thorpe.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31
32 #include <sys/cdefs.h>
33 __KERNEL_RCSID(0, "$NetBSD: sys_futex.c,v 1.11 2020/05/05 15:25:18 riastradh Exp $");
34
35 /*
36  * Futexes
37  *
38  *	The futex system call coordinates notifying threads waiting for
39  *	changes on a 32-bit word of memory.  The word can be managed by
40  *	CPU atomic operations in userland, without system calls, as long
41  *	as there is no contention.
42  *
43  *	The simplest use case demonstrating the utility is:
44  *
45  *		// 32-bit word of memory shared among threads or
46  *		// processes in userland.  lock & 1 means owned;
47  *		// lock & 2 means there are waiters waiting.
48  *		volatile int lock = 0;
49  *
50  *		int v;
51  *
52  *		// Acquire a lock.
53  *		do {
54  *			v = lock;
55  *			if (v & 1) {
56  *				// Lock is held.  Set a bit to say that
57  *				// there are waiters, and wait for lock
58  *				// to change to anything other than v;
59  *				// then retry.
60  *				if (atomic_cas_uint(&lock, v, v | 2) != v)
61  *					continue;
62  *				futex(FUTEX_WAIT, &lock, v | 2, NULL, NULL, 0);
63  *				continue;
64  *			}
65  *		} while (atomic_cas_uint(&lock, v, v | 1) != v);
66  *		membar_enter();
67  *
68  *		...
69  *
70  *		// Release the lock.  Optimistically assume there are
71  *		// no waiters first until demonstrated otherwise.
72  *		membar_exit();
73  *		if (atomic_cas_uint(&lock, 1, 0) != 1) {
74  *			// There may be waiters.
75  *			v = atomic_swap_uint(&lock, 0);
76  *			// If there are still waiters, wake one.
77  *			if (v & 2)
78  *				futex(FUTEX_WAKE, &lock, 1, NULL, NULL, 0);
79  *		}
80  *
81  *	The goal is to avoid the futex system call unless there is
82  *	contention; then if there is contention, to guarantee no missed
83  *	wakeups.
84  *
85  *	For a simple implementation, futex(FUTEX_WAIT) could queue
86  *	itself to be woken, double-check the lock word, and then sleep;
87  *	spurious wakeups are generally a fact of life, so any
88  *	FUTEX_WAKE could just wake every FUTEX_WAIT in the system.
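 *
 *	As a rough sketch (illustrative pseudocode only, not the code in
 *	this file; global_queue, enqueue, dequeue, fetch, sleep, and
 *	wake_all are made-up names), that naive scheme amounts to:
 *
 *		FUTEX_WAIT(uaddr, val):
 *			enqueue(curlwp, &global_queue);
 *			if (fetch(uaddr) == val)
 *				sleep();
 *			dequeue(curlwp, &global_queue);
 *
 *		FUTEX_WAKE(uaddr, n):
 *			wake_all(&global_queue);	// spurious wakeups OK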
89  *
90  *	If this were all there is to it, we could then increase
91  *	parallelism by refining the approximation: partition the
92  *	waiters into buckets by hashing the lock addresses to reduce
93  *	the incidence of spurious wakeups.  But this is not all.
94  *
95  *	The futex(FUTEX_CMP_REQUEUE, &lock, n, &lock2, m, val)
96  *	operation not only wakes n waiters on lock if lock == val, but
97  *	also _transfers_ m additional waiters to lock2.  Unless wakeups
98  *	on lock2 also trigger wakeups on lock, we cannot move waiters
99  *	to lock2 if they merely share the same hash as waiters on lock.
100  *	Thus, we can't approximately distribute waiters into queues by
101  *	a hash function; we must distinguish futex queues exactly by
102  *	lock address.
103  *
104  *	For now, we use a global red/black tree to index futexes.  This
105  *	should be replaced by a lockless radix tree with a thread to
106  *	free entries no longer in use once all lookups on all CPUs have
107  *	completed.
108  *
109  *	Specifically, we maintain two maps:
110  *
111  *		futex_tab.va[vmspace, va] for private futexes
112  *		futex_tab.oa[uvm_voaddr] for shared futexes
113  *
114  *	This implementation does not support priority inheritance.
115  */
116
117 #include <sys/types.h>
118 #include <sys/atomic.h>
119 #include <sys/condvar.h>
120 #include <sys/futex.h>
#include <sys/kmem.h>
#include <sys/lwp.h>
121 #include <sys/mutex.h>
122 #include <sys/rbtree.h>
123 #include <sys/queue.h>
124
125 #include <sys/syscall.h>
126 #include <sys/syscallargs.h>
127 #include <sys/syscallvar.h>
128
129 #include <uvm/uvm_extern.h>
130
131 /*
132  * Lock order:
133  *
134  *	futex_tab.lock
135  *	futex::fx_qlock			ordered by kva of struct futex
136  *	-> futex_wait::fw_lock		only one at a time
137  *	futex_wait::fw_lock		only one at a time
138  *	-> futex::fx_abortlock		only one at a time
139  */
140
141 /*
142  * union futex_key
143  *
144  *	A futex is addressed either by a vmspace+va (private) or by
145  *	a uvm_voaddr (shared).
146  */
147 union futex_key {
148 	struct {
149 		struct vmspace			*vmspace;
150 		vaddr_t				va;
151 	} fk_private;
152 	struct uvm_voaddr			fk_shared;
153 };
154
155 /*
156  * struct futex
157  *
158  *	Kernel state for a futex located at a particular address in a
159  *	particular virtual address space.
160  *
161  *	N.B. fx_refcnt is an unsigned long because we need to be able
162  *	to operate on it atomically on all systems while at the same
163  *	time rendering practically impossible the chance of it reaching
164  *	its max value.  In practice, we're limited by the number of LWPs
165  *	that can be present on the system at any given time, and the
166  *	assumption is that limit will be good enough on a 32-bit platform.
167  *	See futex_wake() for why overflow needs to be avoided.
168  */
169 struct futex {
170 	union futex_key		fx_key;
171 	unsigned long		fx_refcnt;
172 	bool			fx_shared;
173 	bool			fx_on_tree;
174 	struct rb_node		fx_node;
175
176 	kmutex_t			fx_qlock;
177 	TAILQ_HEAD(, futex_wait)	fx_queue;
178
179 	kmutex_t			fx_abortlock;
180 	LIST_HEAD(, futex_wait)		fx_abortlist;
181 	kcondvar_t			fx_abortcv;
182 };
183
184 /*
185  * struct futex_wait
186  *
187  *	State for a thread to wait on a futex.  Threads wait on fw_cv
188  *	for fw_bitset to be set to zero.  The thread may transition to
189  *	a different futex queue at any time under the futex's lock.
190  */
191 struct futex_wait {
192 	kmutex_t		fw_lock;
193 	kcondvar_t		fw_cv;
194 	struct futex		*fw_futex;
195 	TAILQ_ENTRY(futex_wait)	fw_entry;	/* queue lock */
196 	LIST_ENTRY(futex_wait)	fw_abort;	/* queue abortlock */
197 	int			fw_bitset;
198 	bool			fw_aborting;	/* fw_lock */
199 };
200
201 /*
202  * futex_tab
203  *
204  *	Global trees of futexes by vmspace/va and VM object address.
205  *
206  *	XXX This obviously doesn't scale in parallel.  We could use a
207  *	pserialize-safe data structure, but there may be a high cost to
208  *	frequent deletion since we don't cache futexes after we're done
209  *	with them.  We could use hashed locks.  But for now, just make
210  *	sure userland can't DoS the serial performance, by using a
211  *	balanced binary tree for lookup.
212  *
213  *	XXX We could use a per-process tree for the table indexed by
214  *	virtual address to reduce contention between processes.
215  */
216 static struct {
217 	kmutex_t	lock;
218 	struct rb_tree	va;
219 	struct rb_tree	oa;
220 } futex_tab __cacheline_aligned;
221
222 static int
223 compare_futex_key(void *cookie, const void *n, const void *k)
224 {
225 	const struct futex *fa = n;
226 	const union futex_key *fka = &fa->fx_key;
227 	const union futex_key *fkb = k;
228
229 	if ((uintptr_t)fka->fk_private.vmspace <
230 	    (uintptr_t)fkb->fk_private.vmspace)
231 		return -1;
232 	if ((uintptr_t)fka->fk_private.vmspace >
233 	    (uintptr_t)fkb->fk_private.vmspace)
234 		return +1;
235 	if (fka->fk_private.va < fkb->fk_private.va)
236 		return -1;
237 	if (fka->fk_private.va > fkb->fk_private.va)
238 		return +1;
239 	return 0;
240 }
241
242 static int
243 compare_futex(void *cookie, const void *na, const void *nb)
244 {
245 	const struct futex *fa = na;
246 	const struct futex *fb = nb;
247
248 	return compare_futex_key(cookie, fa, &fb->fx_key);
249 }
250
251 static const rb_tree_ops_t futex_rb_ops = {
252 	.rbto_compare_nodes = compare_futex,
253 	.rbto_compare_key = compare_futex_key,
254 	.rbto_node_offset = offsetof(struct futex, fx_node),
255 };
256
257 static int
258 compare_futex_shared_key(void *cookie, const void *n, const void *k)
259 {
260 	const struct futex *fa = n;
261 	const union futex_key *fka = &fa->fx_key;
262 	const union futex_key *fkb = k;
263
264 	return uvm_voaddr_compare(&fka->fk_shared, &fkb->fk_shared);
265 }
266
267 static int
268 compare_futex_shared(void *cookie, const void *na, const void *nb)
269 {
270 	const struct futex *fa = na;
271 	const struct futex *fb = nb;
272
273 	return compare_futex_shared_key(cookie, fa, &fb->fx_key);
274 }
275
276 static const rb_tree_ops_t futex_shared_rb_ops = {
277 	.rbto_compare_nodes = compare_futex_shared,
278 	.rbto_compare_key = compare_futex_shared_key,
279 	.rbto_node_offset = offsetof(struct futex, fx_node),
280 };
281
282 static void	futex_wait_dequeue(struct futex_wait *, struct futex *);
283
284 /*
285  * futex_load(uaddr, kaddr)
286  *
287  *	Perform a single atomic load to read *uaddr, and return the
288  *	result in *kaddr.  Return 0 on success, EFAULT if uaddr is not
289  *	mapped.
290  */
291 static inline int
292 futex_load(int *uaddr, int *kaddr)
293 {
294 	return ufetch_int((u_int *)uaddr, (u_int *)kaddr);
295 }
296
297 /*
298  * futex_test(uaddr, expected)
299  *
300  *	True if *uaddr == expected.  False if *uaddr != expected, or if
301  *	uaddr is not mapped.
302 */ 303 static bool 304 futex_test(int *uaddr, int expected) 305 { 306 int val; 307 int error; 308 309 error = futex_load(uaddr, &val); 310 if (error) 311 return false; 312 return val == expected; 313 } 314 315 /* 316 * futex_sys_init() 317 * 318 * Initialize the futex subsystem. 319 */ 320 void 321 futex_sys_init(void) 322 { 323 324 mutex_init(&futex_tab.lock, MUTEX_DEFAULT, IPL_NONE); 325 rb_tree_init(&futex_tab.va, &futex_rb_ops); 326 rb_tree_init(&futex_tab.oa, &futex_shared_rb_ops); 327 } 328 329 /* 330 * futex_sys_fini() 331 * 332 * Finalize the futex subsystem. 333 */ 334 void 335 futex_sys_fini(void) 336 { 337 338 KASSERT(RB_TREE_MIN(&futex_tab.oa) == NULL); 339 KASSERT(RB_TREE_MIN(&futex_tab.va) == NULL); 340 mutex_destroy(&futex_tab.lock); 341 } 342 343 /* 344 * futex_queue_init(f) 345 * 346 * Initialize the futex queue. Caller must call futex_queue_fini 347 * when done. 348 * 349 * Never sleeps. 350 */ 351 static void 352 futex_queue_init(struct futex *f) 353 { 354 355 mutex_init(&f->fx_qlock, MUTEX_DEFAULT, IPL_NONE); 356 mutex_init(&f->fx_abortlock, MUTEX_DEFAULT, IPL_NONE); 357 cv_init(&f->fx_abortcv, "fqabort"); 358 LIST_INIT(&f->fx_abortlist); 359 TAILQ_INIT(&f->fx_queue); 360 } 361 362 /* 363 * futex_queue_drain(f) 364 * 365 * Wait for any aborting waiters in f; then empty the queue of 366 * any stragglers and wake them. Caller must guarantee no new 367 * references to f. 368 * 369 * May sleep. 370 */ 371 static void 372 futex_queue_drain(struct futex *f) 373 { 374 struct futex_wait *fw, *fw_next; 375 376 mutex_enter(&f->fx_abortlock); 377 while (!LIST_EMPTY(&f->fx_abortlist)) 378 cv_wait(&f->fx_abortcv, &f->fx_abortlock); 379 mutex_exit(&f->fx_abortlock); 380 381 mutex_enter(&f->fx_qlock); 382 TAILQ_FOREACH_SAFE(fw, &f->fx_queue, fw_entry, fw_next) { 383 mutex_enter(&fw->fw_lock); 384 futex_wait_dequeue(fw, f); 385 cv_broadcast(&fw->fw_cv); 386 mutex_exit(&fw->fw_lock); 387 } 388 mutex_exit(&f->fx_qlock); 389 } 390 391 /* 392 * futex_queue_fini(fq) 393 * 394 * Finalize the futex queue initialized by futex_queue_init. Queue 395 * must be empty. Caller must not use f again until a subsequent 396 * futex_queue_init. 397 */ 398 static void 399 futex_queue_fini(struct futex *f) 400 { 401 402 KASSERT(TAILQ_EMPTY(&f->fx_queue)); 403 KASSERT(LIST_EMPTY(&f->fx_abortlist)); 404 mutex_destroy(&f->fx_qlock); 405 mutex_destroy(&f->fx_abortlock); 406 cv_destroy(&f->fx_abortcv); 407 } 408 409 /* 410 * futex_key_init(key, vm, va, shared) 411 * 412 * Initialize a futex key for lookup, etc. 413 */ 414 static int 415 futex_key_init(union futex_key *fk, struct vmspace *vm, vaddr_t va, bool shared) 416 { 417 int error = 0; 418 419 if (__predict_false(shared)) { 420 if (!uvm_voaddr_acquire(&vm->vm_map, va, &fk->fk_shared)) 421 error = EFAULT; 422 } else { 423 fk->fk_private.vmspace = vm; 424 fk->fk_private.va = va; 425 } 426 427 return error; 428 } 429 430 /* 431 * futex_key_fini(key, shared) 432 * 433 * Release a futex key. 434 */ 435 static void 436 futex_key_fini(union futex_key *fk, bool shared) 437 { 438 if (__predict_false(shared)) 439 uvm_voaddr_release(&fk->fk_shared); 440 memset(fk, 0, sizeof(*fk)); 441 } 442 443 /* 444 * futex_create(fk, shared) 445 * 446 * Create a futex. Initial reference count is 1, representing the 447 * caller. Returns NULL on failure. Always takes ownership of the 448 * key, either transferring it to the newly-created futex, or releasing 449 * the key if creation fails. 450 * 451 * Never sleeps for memory, but may sleep to acquire a lock. 
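 *
 *	(Allocation is done with KM_NOSLEEP, so a caller such as
 *	futex_lookup_create() maps a NULL return to ENOMEM.)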
452 */ 453 static struct futex * 454 futex_create(union futex_key *fk, bool shared) 455 { 456 struct futex *f; 457 458 f = kmem_alloc(sizeof(*f), KM_NOSLEEP); 459 if (f == NULL) { 460 futex_key_fini(fk, shared); 461 return NULL; 462 } 463 f->fx_key = *fk; 464 f->fx_refcnt = 1; 465 f->fx_shared = shared; 466 f->fx_on_tree = false; 467 futex_queue_init(f); 468 469 return f; 470 } 471 472 /* 473 * futex_destroy(f) 474 * 475 * Destroy a futex created with futex_create. Reference count 476 * must be zero. 477 * 478 * May sleep. 479 */ 480 static void 481 futex_destroy(struct futex *f) 482 { 483 484 ASSERT_SLEEPABLE(); 485 486 KASSERT(atomic_load_relaxed(&f->fx_refcnt) == 0); 487 KASSERT(!f->fx_on_tree); 488 489 /* Drain and destroy the private queue. */ 490 futex_queue_drain(f); 491 futex_queue_fini(f); 492 493 futex_key_fini(&f->fx_key, f->fx_shared); 494 495 kmem_free(f, sizeof(*f)); 496 } 497 498 /* 499 * futex_hold(f) 500 * 501 * Attempt to acquire a reference to f. Return 0 on success, 502 * ENFILE on too many references. 503 * 504 * Never sleeps. 505 */ 506 static int 507 futex_hold(struct futex *f) 508 { 509 unsigned long refcnt; 510 511 do { 512 refcnt = atomic_load_relaxed(&f->fx_refcnt); 513 if (refcnt == ULONG_MAX) 514 return ENFILE; 515 } while (atomic_cas_ulong(&f->fx_refcnt, refcnt, refcnt + 1) != refcnt); 516 517 return 0; 518 } 519 520 /* 521 * futex_rele(f) 522 * 523 * Release a reference to f acquired with futex_create or 524 * futex_hold. 525 * 526 * May sleep to free f. 527 */ 528 static void 529 futex_rele(struct futex *f) 530 { 531 unsigned long refcnt; 532 533 ASSERT_SLEEPABLE(); 534 535 do { 536 refcnt = atomic_load_relaxed(&f->fx_refcnt); 537 if (refcnt == 1) 538 goto trylast; 539 } while (atomic_cas_ulong(&f->fx_refcnt, refcnt, refcnt - 1) != refcnt); 540 return; 541 542 trylast: 543 mutex_enter(&futex_tab.lock); 544 if (atomic_dec_ulong_nv(&f->fx_refcnt) == 0) { 545 if (f->fx_on_tree) { 546 if (__predict_false(f->fx_shared)) 547 rb_tree_remove_node(&futex_tab.oa, f); 548 else 549 rb_tree_remove_node(&futex_tab.va, f); 550 f->fx_on_tree = false; 551 } 552 } else { 553 /* References remain -- don't destroy it. */ 554 f = NULL; 555 } 556 mutex_exit(&futex_tab.lock); 557 if (f != NULL) 558 futex_destroy(f); 559 } 560 561 /* 562 * futex_rele_not_last(f) 563 * 564 * Release a reference to f acquired with futex_create or 565 * futex_hold. 566 * 567 * This version asserts that we are not dropping the last 568 * reference to f. 569 */ 570 static void 571 futex_rele_not_last(struct futex *f) 572 { 573 unsigned long refcnt; 574 575 do { 576 refcnt = atomic_load_relaxed(&f->fx_refcnt); 577 KASSERT(refcnt > 1); 578 } while (atomic_cas_ulong(&f->fx_refcnt, refcnt, refcnt - 1) != refcnt); 579 } 580 581 /* 582 * futex_lookup_by_key(key, shared, &f) 583 * 584 * Try to find an existing futex va reference in the specified key 585 * On success, return 0, set f to found futex or to NULL if not found, 586 * and increment f's reference count if found. 587 * 588 * Return ENFILE if reference count too high. 589 * 590 * Internal lookup routine shared by futex_lookup() and 591 * futex_lookup_create(). 
592 */ 593 static int 594 futex_lookup_by_key(union futex_key *fk, bool shared, struct futex **fp) 595 { 596 struct futex *f; 597 int error = 0; 598 599 mutex_enter(&futex_tab.lock); 600 if (__predict_false(shared)) { 601 f = rb_tree_find_node(&futex_tab.oa, fk); 602 } else { 603 f = rb_tree_find_node(&futex_tab.va, fk); 604 } 605 if (f) { 606 error = futex_hold(f); 607 if (error) 608 f = NULL; 609 } 610 *fp = f; 611 mutex_exit(&futex_tab.lock); 612 613 return error; 614 } 615 616 /* 617 * futex_insert(f, fp) 618 * 619 * Try to insert the futex f into the tree by va. If there 620 * already is a futex for its va, acquire a reference to it, and 621 * store it in *fp; otherwise store f in *fp. 622 * 623 * Return 0 on success, ENFILE if there already is a futex but its 624 * reference count is too high. 625 */ 626 static int 627 futex_insert(struct futex *f, struct futex **fp) 628 { 629 struct futex *f0; 630 int error; 631 632 KASSERT(atomic_load_relaxed(&f->fx_refcnt) != 0); 633 KASSERT(!f->fx_on_tree); 634 635 mutex_enter(&futex_tab.lock); 636 if (__predict_false(f->fx_shared)) 637 f0 = rb_tree_insert_node(&futex_tab.oa, f); 638 else 639 f0 = rb_tree_insert_node(&futex_tab.va, f); 640 if (f0 == f) { 641 f->fx_on_tree = true; 642 error = 0; 643 } else { 644 KASSERT(atomic_load_relaxed(&f0->fx_refcnt) != 0); 645 KASSERT(f0->fx_on_tree); 646 error = futex_hold(f0); 647 if (error) 648 goto out; 649 } 650 *fp = f0; 651 out: mutex_exit(&futex_tab.lock); 652 653 return error; 654 } 655 656 /* 657 * futex_lookup(uaddr, shared, &f) 658 * 659 * Find a futex at the userland pointer uaddr in the current 660 * process's VM space. On success, return the futex in f and 661 * increment its reference count. 662 * 663 * Caller must call futex_rele when done. 664 */ 665 static int 666 futex_lookup(int *uaddr, bool shared, struct futex **fp) 667 { 668 union futex_key fk; 669 struct vmspace *vm = curproc->p_vmspace; 670 vaddr_t va = (vaddr_t)uaddr; 671 int error; 672 673 /* 674 * Reject unaligned user pointers so we don't cross page 675 * boundaries and so atomics will work. 676 */ 677 if ((va & 3) != 0) 678 return EINVAL; 679 680 /* Look it up. */ 681 error = futex_key_init(&fk, vm, va, shared); 682 if (error) 683 return error; 684 685 error = futex_lookup_by_key(&fk, shared, fp); 686 futex_key_fini(&fk, shared); 687 if (error) 688 return error; 689 690 KASSERT(*fp == NULL || (*fp)->fx_shared == shared); 691 KASSERT(*fp == NULL || atomic_load_relaxed(&(*fp)->fx_refcnt) != 0); 692 693 /* 694 * Success! (Caller must still check whether we found 695 * anything, but nothing went _wrong_ like trying to use 696 * unmapped memory.) 697 */ 698 KASSERT(error == 0); 699 700 return error; 701 } 702 703 /* 704 * futex_lookup_create(uaddr, shared, &f) 705 * 706 * Find or create a futex at the userland pointer uaddr in the 707 * current process's VM space. On success, return the futex in f 708 * and increment its reference count. 709 * 710 * Caller must call futex_rele when done. 711 */ 712 static int 713 futex_lookup_create(int *uaddr, bool shared, struct futex **fp) 714 { 715 union futex_key fk; 716 struct vmspace *vm = curproc->p_vmspace; 717 struct futex *f = NULL; 718 vaddr_t va = (vaddr_t)uaddr; 719 int error; 720 721 /* 722 * Reject unaligned user pointers so we don't cross page 723 * boundaries and so atomics will work. 
724  */
725 	if ((va & 3) != 0)
726 		return EINVAL;
727
728 	error = futex_key_init(&fk, vm, va, shared);
729 	if (error)
730 		return error;
731
732 	/*
733 	 * Optimistically assume there already is one, and try to find
734 	 * it.
735 	 */
736 	error = futex_lookup_by_key(&fk, shared, fp);
737 	if (error || *fp != NULL) {
738 		/*
739 		 * We either found one, or there was an error.
740 		 * In either case, we are done with the key.
741 		 */
742 		futex_key_fini(&fk, shared);
743 		goto out;
744 	}
745
746 	/*
747 	 * Create a futex record.  This transfers ownership of the key
748 	 * in all cases.
749 	 */
750 	f = futex_create(&fk, shared);
751 	if (f == NULL) {
752 		error = ENOMEM;
753 		goto out;
754 	}
755
756 	/*
757 	 * Insert our new futex, or use existing if someone else beat
758 	 * us to it.
759 	 */
760 	error = futex_insert(f, fp);
761 	if (error)
762 		goto out;
763 	if (*fp == f)
764 		f = NULL;	/* don't release on exit */
765
766 	/* Success! */
767 	KASSERT(error == 0);
768
769 out:	if (f != NULL)
770 		futex_rele(f);
771 	KASSERT(error || *fp != NULL);
772 	KASSERT(error || atomic_load_relaxed(&(*fp)->fx_refcnt) != 0);
773 	return error;
774 }
775
776 /*
777  * futex_wait_init(fw, bitset)
778  *
779  *	Initialize a record for a thread to wait on a futex matching
780  *	the specified bit set.  Should be passed to futex_wait_enqueue
781  *	before futex_wait, and should be passed to futex_wait_fini when
782  *	done.
783  */
784 static void
785 futex_wait_init(struct futex_wait *fw, int bitset)
786 {
787
788 	KASSERT(bitset);
789
790 	mutex_init(&fw->fw_lock, MUTEX_DEFAULT, IPL_NONE);
791 	cv_init(&fw->fw_cv, "futex");
792 	fw->fw_futex = NULL;
793 	fw->fw_bitset = bitset;
794 	fw->fw_aborting = false;
795 }
796
797 /*
798  * futex_wait_fini(fw)
799  *
800  *	Finalize a record for a futex waiter.  Must not be on any
801  *	futex's queue.
802  */
803 static void
804 futex_wait_fini(struct futex_wait *fw)
805 {
806
807 	KASSERT(fw->fw_futex == NULL);
808
809 	cv_destroy(&fw->fw_cv);
810 	mutex_destroy(&fw->fw_lock);
811 }
812
813 /*
814  * futex_wait_enqueue(fw, f)
815  *
816  *	Put fw on the futex queue.  Must be done before futex_wait.
817  *	Caller must hold fw's lock and f's lock, and fw must not be on
818  *	any existing futex's waiter list.
819  */
820 static void
821 futex_wait_enqueue(struct futex_wait *fw, struct futex *f)
822 {
823
824 	KASSERT(mutex_owned(&f->fx_qlock));
825 	KASSERT(mutex_owned(&fw->fw_lock));
826 	KASSERT(fw->fw_futex == NULL);
827 	KASSERT(!fw->fw_aborting);
828
829 	fw->fw_futex = f;
830 	TAILQ_INSERT_TAIL(&f->fx_queue, fw, fw_entry);
831 }
832
833 /*
834  * futex_wait_dequeue(fw, f)
835  *
836  *	Remove fw from the futex queue.  Precludes subsequent
837  *	futex_wait until a futex_wait_enqueue.  Caller must hold fw's
838  *	lock and f's lock, and fw must be on f.
839  */
840 static void
841 futex_wait_dequeue(struct futex_wait *fw, struct futex *f)
842 {
843
844 	KASSERT(mutex_owned(&f->fx_qlock));
845 	KASSERT(mutex_owned(&fw->fw_lock));
846 	KASSERT(fw->fw_futex == f);
847
848 	TAILQ_REMOVE(&f->fx_queue, fw, fw_entry);
849 	fw->fw_futex = NULL;
850 }
851
852 /*
853  * futex_wait_abort(fw)
854  *
855  *	Caller is no longer waiting for fw.  Remove it from any queue
856  *	if it was on one.  Caller must hold fw->fw_lock.
857  */
858 static void
859 futex_wait_abort(struct futex_wait *fw)
860 {
861 	struct futex *f;
862
863 	KASSERT(mutex_owned(&fw->fw_lock));
864
865 	/*
866 	 * Grab the futex queue.  It can't go away as long as we hold
867 	 * fw_lock.  However, we can't take the queue lock because
868 	 * that's a lock order reversal.
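	 * (The documented lock order is futex::fx_qlock ->
	 * futex_wait::fw_lock, and we currently hold fw_lock.)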
869 */ 870 f = fw->fw_futex; 871 872 /* Put us on the abort list so that fq won't go away. */ 873 mutex_enter(&f->fx_abortlock); 874 LIST_INSERT_HEAD(&f->fx_abortlist, fw, fw_abort); 875 mutex_exit(&f->fx_abortlock); 876 877 /* 878 * Mark fw as aborting so it won't lose wakeups and won't be 879 * transferred to any other queue. 880 */ 881 fw->fw_aborting = true; 882 883 /* f is now stable, so we can release fw_lock. */ 884 mutex_exit(&fw->fw_lock); 885 886 /* Now we can remove fw under the queue lock. */ 887 mutex_enter(&f->fx_qlock); 888 mutex_enter(&fw->fw_lock); 889 futex_wait_dequeue(fw, f); 890 mutex_exit(&fw->fw_lock); 891 mutex_exit(&f->fx_qlock); 892 893 /* 894 * Finally, remove us from the abort list and notify anyone 895 * waiting for the abort to complete if we were the last to go. 896 */ 897 mutex_enter(&f->fx_abortlock); 898 LIST_REMOVE(fw, fw_abort); 899 if (LIST_EMPTY(&f->fx_abortlist)) 900 cv_broadcast(&f->fx_abortcv); 901 mutex_exit(&f->fx_abortlock); 902 903 /* 904 * Release our reference to the futex now that we are not 905 * waiting for it. 906 */ 907 futex_rele(f); 908 909 /* 910 * Reacquire the fw lock as caller expects. Verify that we're 911 * aborting and no longer associated with a futex. 912 */ 913 mutex_enter(&fw->fw_lock); 914 KASSERT(fw->fw_aborting); 915 KASSERT(fw->fw_futex == NULL); 916 } 917 918 /* 919 * futex_wait(fw, deadline, clkid) 920 * 921 * fw must be a waiter on a futex's queue. Wait until deadline on 922 * the clock clkid, or forever if deadline is NULL, for a futex 923 * wakeup. Return 0 on explicit wakeup or destruction of futex, 924 * ETIMEDOUT on timeout, EINTR/ERESTART on signal. Either way, fw 925 * will no longer be on a futex queue on return. 926 */ 927 static int 928 futex_wait(struct futex_wait *fw, const struct timespec *deadline, 929 clockid_t clkid) 930 { 931 int error = 0; 932 933 /* Test and wait under the wait lock. */ 934 mutex_enter(&fw->fw_lock); 935 936 for (;;) { 937 /* If we're done yet, stop and report success. */ 938 if (fw->fw_bitset == 0 || fw->fw_futex == NULL) { 939 error = 0; 940 break; 941 } 942 943 /* If anything went wrong in the last iteration, stop. */ 944 if (error) 945 break; 946 947 /* Not done yet. Wait. */ 948 if (deadline) { 949 struct timespec ts; 950 951 /* Check our watch. */ 952 error = clock_gettime1(clkid, &ts); 953 if (error) 954 break; 955 956 /* If we're past the deadline, ETIMEDOUT. */ 957 if (timespeccmp(deadline, &ts, <=)) { 958 error = ETIMEDOUT; 959 break; 960 } 961 962 /* Count how much time is left. */ 963 timespecsub(deadline, &ts, &ts); 964 965 /* Wait for that much time, allowing signals. */ 966 error = cv_timedwait_sig(&fw->fw_cv, &fw->fw_lock, 967 tstohz(&ts)); 968 } else { 969 /* Wait indefinitely, allowing signals. */ 970 error = cv_wait_sig(&fw->fw_cv, &fw->fw_lock); 971 } 972 } 973 974 /* 975 * If we were woken up, the waker will have removed fw from the 976 * queue. But if anything went wrong, we must remove fw from 977 * the queue ourselves. While here, convert EWOULDBLOCK to 978 * ETIMEDOUT. 979 */ 980 if (error) { 981 futex_wait_abort(fw); 982 if (error == EWOULDBLOCK) 983 error = ETIMEDOUT; 984 } 985 986 mutex_exit(&fw->fw_lock); 987 988 return error; 989 } 990 991 /* 992 * futex_wake(f, nwake, f2, nrequeue, bitset) 993 * 994 * Wake up to nwake waiters on f matching bitset; then, if f2 is 995 * provided, move up to nrequeue remaining waiters on f matching 996 * bitset to f2. Return the number of waiters actually woken. 997 * Caller must hold the locks of f and f2, if provided. 
998 */ 999 static unsigned 1000 futex_wake(struct futex *f, unsigned nwake, struct futex *f2, 1001 unsigned nrequeue, int bitset) 1002 { 1003 struct futex_wait *fw, *fw_next; 1004 unsigned nwoken = 0; 1005 int hold_error __diagused; 1006 1007 KASSERT(mutex_owned(&f->fx_qlock)); 1008 KASSERT(f2 == NULL || mutex_owned(&f2->fx_qlock)); 1009 1010 /* Wake up to nwake waiters, and count the number woken. */ 1011 TAILQ_FOREACH_SAFE(fw, &f->fx_queue, fw_entry, fw_next) { 1012 if ((fw->fw_bitset & bitset) == 0) 1013 continue; 1014 if (nwake > 0) { 1015 mutex_enter(&fw->fw_lock); 1016 if (__predict_false(fw->fw_aborting)) { 1017 mutex_exit(&fw->fw_lock); 1018 continue; 1019 } 1020 futex_wait_dequeue(fw, f); 1021 fw->fw_bitset = 0; 1022 cv_broadcast(&fw->fw_cv); 1023 mutex_exit(&fw->fw_lock); 1024 nwake--; 1025 nwoken++; 1026 /* 1027 * Drop the futex reference on behalf of the 1028 * waiter. We assert this is not the last 1029 * reference on the futex (our caller should 1030 * also have one). 1031 */ 1032 futex_rele_not_last(f); 1033 } else { 1034 break; 1035 } 1036 } 1037 1038 if (f2) { 1039 /* Move up to nrequeue waiters from f's queue to f2's queue. */ 1040 TAILQ_FOREACH_SAFE(fw, &f->fx_queue, fw_entry, fw_next) { 1041 if ((fw->fw_bitset & bitset) == 0) 1042 continue; 1043 if (nrequeue > 0) { 1044 mutex_enter(&fw->fw_lock); 1045 if (__predict_false(fw->fw_aborting)) { 1046 mutex_exit(&fw->fw_lock); 1047 continue; 1048 } 1049 futex_wait_dequeue(fw, f); 1050 futex_wait_enqueue(fw, f2); 1051 mutex_exit(&fw->fw_lock); 1052 nrequeue--; 1053 /* 1054 * Transfer the reference from f to f2. 1055 * As above, we assert that we are not 1056 * dropping the last reference to f here. 1057 * 1058 * XXX futex_hold() could theoretically 1059 * XXX fail here. 1060 */ 1061 futex_rele_not_last(f); 1062 hold_error = futex_hold(f2); 1063 KASSERT(hold_error == 0); 1064 } else { 1065 break; 1066 } 1067 } 1068 } else { 1069 KASSERT(nrequeue == 0); 1070 } 1071 1072 /* Return the number of waiters woken. */ 1073 return nwoken; 1074 } 1075 1076 /* 1077 * futex_queue_lock(f) 1078 * 1079 * Acquire the queue lock of f. Pair with futex_queue_unlock. Do 1080 * not use if caller needs to acquire two locks; use 1081 * futex_queue_lock2 instead. 1082 */ 1083 static void 1084 futex_queue_lock(struct futex *f) 1085 { 1086 mutex_enter(&f->fx_qlock); 1087 } 1088 1089 /* 1090 * futex_queue_unlock(f) 1091 * 1092 * Release the queue lock of f. 1093 */ 1094 static void 1095 futex_queue_unlock(struct futex *f) 1096 { 1097 mutex_exit(&f->fx_qlock); 1098 } 1099 1100 /* 1101 * futex_queue_lock2(f, f2) 1102 * 1103 * Acquire the queue locks of both f and f2, which may be null, or 1104 * which may have the same underlying queue. If they are 1105 * distinct, an arbitrary total order is chosen on the locks. 1106 * 1107 * Callers should only ever acquire multiple queue locks 1108 * simultaneously using futex_queue_lock2. 1109 */ 1110 static void 1111 futex_queue_lock2(struct futex *f, struct futex *f2) 1112 { 1113 1114 /* 1115 * If both are null, do nothing; if one is null and the other 1116 * is not, lock the other and be done with it. 1117 */ 1118 if (f == NULL && f2 == NULL) { 1119 return; 1120 } else if (f == NULL) { 1121 mutex_enter(&f2->fx_qlock); 1122 return; 1123 } else if (f2 == NULL) { 1124 mutex_enter(&f->fx_qlock); 1125 return; 1126 } 1127 1128 /* If both futexes are the same, acquire only one. 
*/ 1129 if (f == f2) { 1130 mutex_enter(&f->fx_qlock); 1131 return; 1132 } 1133 1134 /* Otherwise, use the ordering on the kva of the futex pointer. */ 1135 if ((uintptr_t)f < (uintptr_t)f2) { 1136 mutex_enter(&f->fx_qlock); 1137 mutex_enter(&f2->fx_qlock); 1138 } else { 1139 mutex_enter(&f2->fx_qlock); 1140 mutex_enter(&f->fx_qlock); 1141 } 1142 } 1143 1144 /* 1145 * futex_queue_unlock2(f, f2) 1146 * 1147 * Release the queue locks of both f and f2, which may be null, or 1148 * which may have the same underlying queue. 1149 */ 1150 static void 1151 futex_queue_unlock2(struct futex *f, struct futex *f2) 1152 { 1153 1154 /* 1155 * If both are null, do nothing; if one is null and the other 1156 * is not, unlock the other and be done with it. 1157 */ 1158 if (f == NULL && f2 == NULL) { 1159 return; 1160 } else if (f == NULL) { 1161 mutex_exit(&f2->fx_qlock); 1162 return; 1163 } else if (f2 == NULL) { 1164 mutex_exit(&f->fx_qlock); 1165 return; 1166 } 1167 1168 /* If both futexes are the same, release only one. */ 1169 if (f == f2) { 1170 mutex_exit(&f->fx_qlock); 1171 return; 1172 } 1173 1174 /* Otherwise, use the ordering on the kva of the futex pointer. */ 1175 if ((uintptr_t)f < (uintptr_t)f2) { 1176 mutex_exit(&f2->fx_qlock); 1177 mutex_exit(&f->fx_qlock); 1178 } else { 1179 mutex_exit(&f->fx_qlock); 1180 mutex_exit(&f2->fx_qlock); 1181 } 1182 } 1183 1184 /* 1185 * futex_func_wait(uaddr, val, val3, timeout, clkid, clkflags, retval) 1186 * 1187 * Implement futex(FUTEX_WAIT). 1188 */ 1189 static int 1190 futex_func_wait(bool shared, int *uaddr, int val, int val3, 1191 const struct timespec *timeout, clockid_t clkid, int clkflags, 1192 register_t *retval) 1193 { 1194 struct futex *f; 1195 struct futex_wait wait, *fw = &wait; 1196 struct timespec ts; 1197 const struct timespec *deadline; 1198 int error; 1199 1200 /* 1201 * If there's nothing to wait for, and nobody will ever wake 1202 * us, then don't set anything up to wait -- just stop here. 1203 */ 1204 if (val3 == 0) 1205 return EINVAL; 1206 1207 /* Optimistically test before anything else. */ 1208 if (!futex_test(uaddr, val)) 1209 return EAGAIN; 1210 1211 /* Determine a deadline on the specified clock. */ 1212 if (timeout == NULL || (clkflags & TIMER_ABSTIME) == TIMER_ABSTIME) { 1213 deadline = timeout; 1214 } else { 1215 error = clock_gettime1(clkid, &ts); 1216 if (error) 1217 return error; 1218 timespecadd(&ts, timeout, &ts); 1219 deadline = &ts; 1220 } 1221 1222 /* Get the futex, creating it if necessary. */ 1223 error = futex_lookup_create(uaddr, shared, &f); 1224 if (error) 1225 return error; 1226 KASSERT(f); 1227 1228 /* Get ready to wait. */ 1229 futex_wait_init(fw, val3); 1230 1231 /* 1232 * Under the queue lock, check the value again: if it has 1233 * already changed, EAGAIN; otherwise enqueue the waiter. 1234 * Since FUTEX_WAKE will use the same lock and be done after 1235 * modifying the value, the order in which we check and enqueue 1236 * is immaterial. 1237 */ 1238 futex_queue_lock(f); 1239 if (!futex_test(uaddr, val)) { 1240 futex_queue_unlock(f); 1241 error = EAGAIN; 1242 goto out; 1243 } 1244 mutex_enter(&fw->fw_lock); 1245 futex_wait_enqueue(fw, f); 1246 mutex_exit(&fw->fw_lock); 1247 futex_queue_unlock(f); 1248 1249 /* 1250 * We cannot drop our reference to the futex here, because 1251 * we might be enqueued on a different one when we are awakened. 1252 * The references will be managed on our behalf in the requeue 1253 * and wake cases. 1254 */ 1255 f = NULL; 1256 1257 /* Wait. 
*/ 1258 error = futex_wait(fw, deadline, clkid); 1259 if (error) 1260 goto out; 1261 1262 /* Return 0 on success, error on failure. */ 1263 *retval = 0; 1264 1265 out: if (f != NULL) 1266 futex_rele(f); 1267 futex_wait_fini(fw); 1268 return error; 1269 } 1270 1271 /* 1272 * futex_func_wake(uaddr, val, val3, retval) 1273 * 1274 * Implement futex(FUTEX_WAKE) and futex(FUTEX_WAKE_BITSET). 1275 */ 1276 static int 1277 futex_func_wake(bool shared, int *uaddr, int val, int val3, register_t *retval) 1278 { 1279 struct futex *f; 1280 unsigned int nwoken = 0; 1281 int error = 0; 1282 1283 /* Reject negative number of wakeups. */ 1284 if (val < 0) { 1285 error = EINVAL; 1286 goto out; 1287 } 1288 1289 /* Look up the futex, if any. */ 1290 error = futex_lookup(uaddr, shared, &f); 1291 if (error) 1292 goto out; 1293 1294 /* If there's no futex, there are no waiters to wake. */ 1295 if (f == NULL) 1296 goto out; 1297 1298 /* 1299 * Under f's queue lock, wake the waiters and remember the 1300 * number woken. 1301 */ 1302 futex_queue_lock(f); 1303 nwoken = futex_wake(f, val, NULL, 0, val3); 1304 futex_queue_unlock(f); 1305 1306 /* Release the futex. */ 1307 futex_rele(f); 1308 1309 out: 1310 /* Return the number of waiters woken. */ 1311 *retval = nwoken; 1312 1313 /* Success! */ 1314 return error; 1315 } 1316 1317 /* 1318 * futex_func_requeue(op, uaddr, val, uaddr2, val2, val3, retval) 1319 * 1320 * Implement futex(FUTEX_REQUEUE) and futex(FUTEX_CMP_REQUEUE). 1321 */ 1322 static int 1323 futex_func_requeue(bool shared, int op, int *uaddr, int val, int *uaddr2, 1324 int val2, int val3, register_t *retval) 1325 { 1326 struct futex *f = NULL, *f2 = NULL; 1327 unsigned nwoken = 0; /* default to zero woken on early return */ 1328 int error; 1329 1330 /* Reject negative number of wakeups or requeues. */ 1331 if (val < 0 || val2 < 0) { 1332 error = EINVAL; 1333 goto out; 1334 } 1335 1336 /* Look up the source futex, if any. */ 1337 error = futex_lookup(uaddr, shared, &f); 1338 if (error) 1339 goto out; 1340 1341 /* If there is none, nothing to do. */ 1342 if (f == NULL) 1343 goto out; 1344 1345 /* 1346 * We may need to create the destination futex because it's 1347 * entirely possible it does not currently have any waiters. 1348 */ 1349 error = futex_lookup_create(uaddr2, shared, &f2); 1350 if (error) 1351 goto out; 1352 1353 /* 1354 * Under the futexes' queue locks, check the value; if 1355 * unchanged from val3, wake the waiters. 1356 */ 1357 futex_queue_lock2(f, f2); 1358 if (op == FUTEX_CMP_REQUEUE && !futex_test(uaddr, val3)) { 1359 error = EAGAIN; 1360 } else { 1361 error = 0; 1362 nwoken = futex_wake(f, val, f2, val2, FUTEX_BITSET_MATCH_ANY); 1363 } 1364 futex_queue_unlock2(f, f2); 1365 1366 out: 1367 /* Return the number of waiters woken. */ 1368 *retval = nwoken; 1369 1370 /* Release the futexes if we got them. */ 1371 if (f2) 1372 futex_rele(f2); 1373 if (f) 1374 futex_rele(f); 1375 return error; 1376 } 1377 1378 /* 1379 * futex_validate_op_cmp(val3) 1380 * 1381 * Validate an op/cmp argument for FUTEX_WAKE_OP. 
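 *
 *	For reference: val3 packs four fields that the helpers below pull
 *	out with __SHIFTOUT() -- an operation (FUTEX_OP_OP_MASK) and its
 *	operand (FUTEX_OP_OPARG_MASK), applied read/modify/write to the
 *	second futex word, plus a comparison (FUTEX_OP_CMP_MASK) and its
 *	operand (FUTEX_OP_CMPARG_MASK), applied to the old value to decide
 *	whether the second futex is also woken.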
1382  */
1383 static int
1384 futex_validate_op_cmp(int val3)
1385 {
1386 	int op = __SHIFTOUT(val3, FUTEX_OP_OP_MASK);
1387 	int cmp = __SHIFTOUT(val3, FUTEX_OP_CMP_MASK);
1388
1389 	if (op & FUTEX_OP_OPARG_SHIFT) {
1390 		int oparg = __SHIFTOUT(val3, FUTEX_OP_OPARG_MASK);
1391 		if (oparg < 0)
1392 			return EINVAL;
1393 		if (oparg >= 32)
1394 			return EINVAL;
1395 		op &= ~FUTEX_OP_OPARG_SHIFT;
1396 	}
1397
1398 	switch (op) {
1399 	case FUTEX_OP_SET:
1400 	case FUTEX_OP_ADD:
1401 	case FUTEX_OP_OR:
1402 	case FUTEX_OP_ANDN:
1403 	case FUTEX_OP_XOR:
1404 		break;
1405 	default:
1406 		return EINVAL;
1407 	}
1408
1409 	switch (cmp) {
1410 	case FUTEX_OP_CMP_EQ:
1411 	case FUTEX_OP_CMP_NE:
1412 	case FUTEX_OP_CMP_LT:
1413 	case FUTEX_OP_CMP_LE:
1414 	case FUTEX_OP_CMP_GT:
1415 	case FUTEX_OP_CMP_GE:
1416 		break;
1417 	default:
1418 		return EINVAL;
1419 	}
1420
1421 	return 0;
1422 }
1423
1424 /*
1425  * futex_compute_op(oldval, val3)
1426  *
1427  *	Apply a FUTEX_WAKE_OP operation to oldval.
1428  */
1429 static int
1430 futex_compute_op(int oldval, int val3)
1431 {
1432 	int op = __SHIFTOUT(val3, FUTEX_OP_OP_MASK);
1433 	int oparg = __SHIFTOUT(val3, FUTEX_OP_OPARG_MASK);
1434
1435 	if (op & FUTEX_OP_OPARG_SHIFT) {
1436 		KASSERT(oparg >= 0);
1437 		KASSERT(oparg < 32);
1438 		oparg = 1u << oparg;
1439 		op &= ~FUTEX_OP_OPARG_SHIFT;
1440 	}
1441
1442 	switch (op) {
1443 	case FUTEX_OP_SET:
1444 		return oparg;
1445
1446 	case FUTEX_OP_ADD:
1447 		/*
1448 		 * Avoid signed arithmetic overflow by doing
1449 		 * arithmetic unsigned and converting back to signed
1450 		 * at the end.
1451 		 */
1452 		return (int)((unsigned)oldval + (unsigned)oparg);
1453
1454 	case FUTEX_OP_OR:
1455 		return oldval | oparg;
1456
1457 	case FUTEX_OP_ANDN:
1458 		return oldval & ~oparg;
1459
1460 	case FUTEX_OP_XOR:
1461 		return oldval ^ oparg;
1462
1463 	default:
1464 		panic("invalid futex op");
1465 	}
1466 }
1467
1468 /*
1469  * futex_compute_cmp(oldval, val3)
1470  *
1471  *	Apply a FUTEX_WAKE_OP comparison to oldval.
1472  */
1473 static bool
1474 futex_compute_cmp(int oldval, int val3)
1475 {
1476 	int cmp = __SHIFTOUT(val3, FUTEX_OP_CMP_MASK);
1477 	int cmparg = __SHIFTOUT(val3, FUTEX_OP_CMPARG_MASK);
1478
1479 	switch (cmp) {
1480 	case FUTEX_OP_CMP_EQ:
1481 		return (oldval == cmparg);
1482
1483 	case FUTEX_OP_CMP_NE:
1484 		return (oldval != cmparg);
1485
1486 	case FUTEX_OP_CMP_LT:
1487 		return (oldval < cmparg);
1488
1489 	case FUTEX_OP_CMP_LE:
1490 		return (oldval <= cmparg);
1491
1492 	case FUTEX_OP_CMP_GT:
1493 		return (oldval > cmparg);
1494
1495 	case FUTEX_OP_CMP_GE:
1496 		return (oldval >= cmparg);
1497
1498 	default:
1499 		panic("invalid futex cmp operation");
1500 	}
1501 }
1502
1503 /*
1504  * futex_func_wake_op(uaddr, val, uaddr2, val2, val3, retval)
1505  *
1506  *	Implement futex(FUTEX_WAKE_OP).
1507  */
1508 static int
1509 futex_func_wake_op(bool shared, int *uaddr, int val, int *uaddr2, int val2,
1510     int val3, register_t *retval)
1511 {
1512 	struct futex *f = NULL, *f2 = NULL;
1513 	int oldval, newval, actual;
1514 	unsigned nwoken = 0;
1515 	int error;
1516
1517 	/* Reject negative number of wakeups. */
1518 	if (val < 0 || val2 < 0) {
1519 		error = EINVAL;
1520 		goto out;
1521 	}
1522
1523 	/* Reject invalid operations before we start doing things. */
1524 	if ((error = futex_validate_op_cmp(val3)) != 0)
1525 		goto out;
1526
1527 	/* Look up the first futex, if any. */
1528 	error = futex_lookup(uaddr, shared, &f);
1529 	if (error)
1530 		goto out;
1531
1532 	/* Look up the second futex, if any.
*/ 1533 error = futex_lookup(uaddr2, shared, &f2); 1534 if (error) 1535 goto out; 1536 1537 /* 1538 * Under the queue locks: 1539 * 1540 * 1. Read/modify/write: *uaddr2 op= oparg. 1541 * 2. Unconditionally wake uaddr. 1542 * 3. Conditionally wake uaddr2, if it previously matched val2. 1543 */ 1544 futex_queue_lock2(f, f2); 1545 do { 1546 error = futex_load(uaddr2, &oldval); 1547 if (error) 1548 goto out_unlock; 1549 newval = futex_compute_op(oldval, val3); 1550 error = ucas_int(uaddr2, oldval, newval, &actual); 1551 if (error) 1552 goto out_unlock; 1553 } while (actual != oldval); 1554 nwoken = (f ? futex_wake(f, val, NULL, 0, FUTEX_BITSET_MATCH_ANY) : 0); 1555 if (f2 && futex_compute_cmp(oldval, val3)) 1556 nwoken += futex_wake(f2, val2, NULL, 0, 1557 FUTEX_BITSET_MATCH_ANY); 1558 1559 /* Success! */ 1560 error = 0; 1561 out_unlock: 1562 futex_queue_unlock2(f, f2); 1563 1564 out: 1565 /* Return the number of waiters woken. */ 1566 *retval = nwoken; 1567 1568 /* Release the futexes, if we got them. */ 1569 if (f2) 1570 futex_rele(f2); 1571 if (f) 1572 futex_rele(f); 1573 return error; 1574 } 1575 1576 /* 1577 * do_futex(uaddr, op, val, timeout, uaddr2, val2, val3) 1578 * 1579 * Implement the futex system call with all the parameters 1580 * parsed out. 1581 */ 1582 int 1583 do_futex(int *uaddr, int op, int val, const struct timespec *timeout, 1584 int *uaddr2, int val2, int val3, register_t *retval) 1585 { 1586 const bool shared = (op & FUTEX_PRIVATE_FLAG) ? false : true; 1587 const clockid_t clkid = (op & FUTEX_CLOCK_REALTIME) ? CLOCK_REALTIME 1588 : CLOCK_MONOTONIC; 1589 1590 op &= FUTEX_CMD_MASK; 1591 1592 switch (op) { 1593 case FUTEX_WAIT: 1594 return futex_func_wait(shared, uaddr, val, 1595 FUTEX_BITSET_MATCH_ANY, timeout, clkid, TIMER_RELTIME, 1596 retval); 1597 1598 case FUTEX_WAKE: 1599 val3 = FUTEX_BITSET_MATCH_ANY; 1600 /* FALLTHROUGH */ 1601 case FUTEX_WAKE_BITSET: 1602 return futex_func_wake(shared, uaddr, val, val3, retval); 1603 1604 case FUTEX_REQUEUE: 1605 case FUTEX_CMP_REQUEUE: 1606 return futex_func_requeue(shared, op, uaddr, val, uaddr2, 1607 val2, val3, retval); 1608 1609 case FUTEX_WAIT_BITSET: 1610 return futex_func_wait(shared, uaddr, val, val3, timeout, 1611 clkid, TIMER_ABSTIME, retval); 1612 1613 case FUTEX_WAKE_OP: 1614 return futex_func_wake_op(shared, uaddr, val, uaddr2, val2, 1615 val3, retval); 1616 1617 case FUTEX_FD: 1618 default: 1619 return ENOSYS; 1620 } 1621 } 1622 1623 /* 1624 * sys___futex(l, uap, retval) 1625 * 1626 * __futex(2) system call: generic futex operations. 1627 */ 1628 int 1629 sys___futex(struct lwp *l, const struct sys___futex_args *uap, 1630 register_t *retval) 1631 { 1632 /* { 1633 syscallarg(int *) uaddr; 1634 syscallarg(int) op; 1635 syscallarg(int) val; 1636 syscallarg(const struct timespec *) timeout; 1637 syscallarg(int *) uaddr2; 1638 syscallarg(int) val2; 1639 syscallarg(int) val3; 1640 } */ 1641 struct timespec ts, *tsp; 1642 int error; 1643 1644 /* 1645 * Copy in the timeout argument, if specified. 1646 */ 1647 if (SCARG(uap, timeout)) { 1648 error = copyin(SCARG(uap, timeout), &ts, sizeof(ts)); 1649 if (error) 1650 return error; 1651 tsp = &ts; 1652 } else { 1653 tsp = NULL; 1654 } 1655 1656 return do_futex(SCARG(uap, uaddr), SCARG(uap, op), SCARG(uap, val), 1657 tsp, SCARG(uap, uaddr2), SCARG(uap, val2), SCARG(uap, val3), 1658 retval); 1659 } 1660 1661 /* 1662 * sys___futex_set_robust_list(l, uap, retval) 1663 * 1664 * __futex_set_robust_list(2) system call for robust futexes. 
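 *
 *	The head registered here is not interpreted until LWP exit: see
 *	futex_release_all_lwp(), which re-reads it as
 *	_FUTEX_ROBUST_HEAD_NWORDS words (a list pointer, an offset from
 *	each list entry to its futex word, and a "pending" entry).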
1665  */
1666 int
1667 sys___futex_set_robust_list(struct lwp *l,
1668     const struct sys___futex_set_robust_list_args *uap, register_t *retval)
1669 {
1670 	/* {
1671 		syscallarg(void *) head;
1672 		syscallarg(size_t) len;
1673 	} */
1674 	void *head = SCARG(uap, head);
1675
1676 	if (SCARG(uap, len) != _FUTEX_ROBUST_HEAD_SIZE)
1677 		return EINVAL;
1678 	if ((uintptr_t)head % sizeof(u_long))
1679 		return EINVAL;
1680
1681 	l->l_robust_head = (uintptr_t)head;
1682
1683 	return 0;
1684 }
1685
1686 /*
1687  * sys___futex_get_robust_list(l, uap, retval)
1688  *
1689  *	__futex_get_robust_list(2) system call for robust futexes.
1690  */
1691 int
1692 sys___futex_get_robust_list(struct lwp *l,
1693     const struct sys___futex_get_robust_list_args *uap, register_t *retval)
1694 {
1695 	/* {
1696 		syscallarg(lwpid_t) lwpid;
1697 		syscallarg(void **) headp;
1698 		syscallarg(size_t *) lenp;
1699 	} */
1700 	void *head;
1701 	const size_t len = _FUTEX_ROBUST_HEAD_SIZE;
1702 	int error;
1703
1704 	error = futex_robust_head_lookup(l, SCARG(uap, lwpid), &head);
1705 	if (error)
1706 		return error;
1707
1708 	/* Copy out the head pointer and the head structure length. */
1709 	error = copyout(&head, SCARG(uap, headp), sizeof(head));
1710 	if (__predict_true(error == 0)) {
1711 		error = copyout(&len, SCARG(uap, lenp), sizeof(len));
1712 	}
1713
1714 	return error;
1715 }
1716
1717 /*
1718  * release_futex(uva, tid)
1719  *
1720  *	Try to release the robust futex at uva in the current process
1721  *	on lwp exit.  If anything goes wrong, silently fail.  It is the
1722  *	userland program's obligation to arrange correct behaviour.
1723  */
1724 static void
1725 release_futex(uintptr_t const uptr, lwpid_t const tid, bool const is_pi,
1726     bool const is_pending)
1727 {
1728 	int *uaddr;
1729 	struct futex *f;
1730 	int oldval, newval, actual;
1731 	int error;
1732
1733 	/* If it's misaligned, tough. */
1734 	if (__predict_false(uptr & 3))
1735 		return;
1736 	uaddr = (int *)uptr;
1737
1738 	error = futex_load(uaddr, &oldval);
1739 	if (__predict_false(error))
1740 		return;
1741
1742 	/*
1743 	 * There are two race conditions we need to handle here:
1744 	 *
1745 	 * 1. User space cleared the futex word but died before
1746 	 *    being able to issue the wakeup.  No wakeups will
1747 	 *    ever be issued, oops!
1748 	 *
1749 	 * 2. Awakened waiter died before being able to acquire
1750 	 *    the futex in user space.  Any other waiters are
1751 	 *    now stuck, oops!
1752 	 *
1753 	 * In both of these cases, the futex word will be 0 (because
1754 	 * it's updated before the wake is issued).  The best we can
1755 	 * do is detect this situation if it's the pending futex and
1756 	 * issue a wake without modifying the futex word.
1757 	 *
1758 	 * XXX eventual PI handling?
1759 	 */
1760 	if (__predict_false(is_pending && (oldval & ~FUTEX_WAITERS) == 0)) {
1761 		register_t retval;
1762 		(void) futex_func_wake(/*shared*/true, uaddr, 1,
1763 		    FUTEX_BITSET_MATCH_ANY, &retval);
1764 		return;
1765 	}
1766
1767 	/* Optimistically test whether we need to do anything at all. */
1768 	if ((oldval & FUTEX_TID_MASK) != tid)
1769 		return;
1770
1771 	/*
1772 	 * We need to handle the case where this thread owned the futex,
1773 	 * but it was uncontended.  In this case, there won't be any
1774 	 * kernel state to look up.  All we can do is mark the futex
1775 	 * as a zombie to be mopped up the next time another thread
1776 	 * attempts to acquire it.
1777 	 *
1778 	 * N.B. It's important to ensure we set FUTEX_OWNER_DIED in
1779 	 * this loop, even if waiters appear while we're doing
1780 	 * so.  This is because FUTEX_WAITERS is set by user space
1781 	 * before calling __futex() to wait, and the futex needs
1782 	 * to be marked as a zombie when the new waiter gets into
1783 	 * the kernel.
1784 	 */
1785 	if ((oldval & FUTEX_WAITERS) == 0) {
1786 		do {
1787 			error = futex_load(uaddr, &oldval);
1788 			if (error)
1789 				return;
1790 			if ((oldval & FUTEX_TID_MASK) != tid)
1791 				return;
1792 			newval = oldval | FUTEX_OWNER_DIED;
1793 			error = ucas_int(uaddr, oldval, newval, &actual);
1794 			if (error)
1795 				return;
1796 		} while (actual != oldval);
1797
1798 		/*
1799 		 * If there is still no indication of waiters, then there is
1800 		 * no more work for us to do.
1801 		 */
1802 		if ((oldval & FUTEX_WAITERS) == 0)
1803 			return;
1804 	}
1805
1806 	/*
1807 	 * Look for a shared futex since we have no positive indication
1808 	 * it is private.  If we can't, tough.
1809 	 */
1810 	error = futex_lookup(uaddr, /*shared*/true, &f);
1811 	if (error)
1812 		return;
1813
1814 	/*
1815 	 * If there's no kernel state for this futex, there's nothing to
1816 	 * release.
1817 	 */
1818 	if (f == NULL)
1819 		return;
1820
1821 	/* Work under the futex queue lock. */
1822 	futex_queue_lock(f);
1823
1824 	/*
1825 	 * Fetch the word: if the tid doesn't match ours, skip;
1826 	 * otherwise, set the owner-died bit, atomically.
1827 	 */
1828 	do {
1829 		error = futex_load(uaddr, &oldval);
1830 		if (error)
1831 			goto out;
1832 		if ((oldval & FUTEX_TID_MASK) != tid)
1833 			goto out;
1834 		newval = oldval | FUTEX_OWNER_DIED;
1835 		error = ucas_int(uaddr, oldval, newval, &actual);
1836 		if (error)
1837 			goto out;
1838 	} while (actual != oldval);
1839
1840 	/*
1841 	 * If there may be waiters, try to wake one.  If anything goes
1842 	 * wrong, tough.
1843 	 *
1844 	 * XXX eventual PI handling?
1845 	 */
1846 	if (oldval & FUTEX_WAITERS)
1847 		(void)futex_wake(f, 1, NULL, 0, FUTEX_BITSET_MATCH_ANY);
1848
1849 	/* Unlock the queue and release the futex. */
1850 out:	futex_queue_unlock(f);
1851 	futex_rele(f);
1852 }
1853
1854 /*
1855  * futex_robust_head_lookup(l, lwpid)
1856  *
1857  *	Helper function to look up a robust head by LWP ID.
1858  */
1859 int
1860 futex_robust_head_lookup(struct lwp *l, lwpid_t lwpid, void **headp)
1861 {
1862 	struct proc *p = l->l_proc;
1863
1864 	/* Find the other lwp, if requested; otherwise use our robust head. */
1865 	if (lwpid) {
1866 		mutex_enter(p->p_lock);
1867 		l = lwp_find(p, lwpid);
1868 		if (l == NULL) {
1869 			mutex_exit(p->p_lock);
1870 			return ESRCH;
1871 		}
1872 		*headp = (void *)l->l_robust_head;
1873 		mutex_exit(p->p_lock);
1874 	} else {
1875 		*headp = (void *)l->l_robust_head;
1876 	}
1877 	return 0;
1878 }
1879
1880 /*
1881  * futex_fetch_robust_head(uaddr)
1882  *
1883  *	Helper routine to fetch the futex robust list head that
1884  *	handles 32-bit binaries running on 64-bit kernels.
1885  */
1886 static int
1887 futex_fetch_robust_head(uintptr_t uaddr, u_long *rhead)
1888 {
1889 #ifdef _LP64
1890 	if (curproc->p_flag & PK_32) {
1891 		uint32_t rhead32[_FUTEX_ROBUST_HEAD_NWORDS];
1892 		int error;
1893
1894 		error = copyin((void *)uaddr, rhead32, sizeof(rhead32));
1895 		if (__predict_true(error == 0)) {
1896 			for (int i = 0; i < _FUTEX_ROBUST_HEAD_NWORDS; i++) {
1897 				if (i == _FUTEX_ROBUST_HEAD_OFFSET) {
1898 					/*
1899 					 * Make sure the offset is sign-
1900 					 * extended.
1901 					 */
1902 					rhead[i] = (int32_t)rhead32[i];
1903 				} else {
1904 					rhead[i] = rhead32[i];
1905 				}
1906 			}
1907 		}
1908 		return error;
1909 	}
1910 #endif /* _LP64 */
1911
1912 	return copyin((void *)uaddr, rhead,
1913 	    sizeof(*rhead) * _FUTEX_ROBUST_HEAD_NWORDS);
1914 }
1915
1916 /*
1917  * futex_decode_robust_word(word)
1918  *
1919  *	Decode a robust futex list word into the entry and entry
1920  *	properties.
1921  */
1922 static inline void
1923 futex_decode_robust_word(uintptr_t const word, uintptr_t * const entry,
1924     bool * const is_pi)
1925 {
1926 	*is_pi = (word & _FUTEX_ROBUST_ENTRY_PI) ? true : false;
1927 	*entry = word & ~_FUTEX_ROBUST_ENTRY_PI;
1928 }
1929
1930 /*
1931  * futex_fetch_robust_entry(uaddr)
1932  *
1933  *	Helper routine to fetch and decode a robust futex entry
1934  *	that handles 32-bit binaries running on 64-bit kernels.
1935  */
1936 static int
1937 futex_fetch_robust_entry(uintptr_t const uaddr, uintptr_t * const valp,
1938     bool * const is_pi)
1939 {
1940 	uintptr_t val = 0;
1941 	int error = 0;
1942
1943 #ifdef _LP64
1944 	if (curproc->p_flag & PK_32) {
1945 		uint32_t val32;
1946
1947 		error = ufetch_32((uint32_t *)uaddr, &val32);
1948 		if (__predict_true(error == 0))
1949 			val = val32;
1950 	} else
1951 #endif /* _LP64 */
1952 	error = ufetch_long((u_long *)uaddr, (u_long *)&val);
1953 	if (__predict_false(error))
1954 		return error;
1955
1956 	futex_decode_robust_word(val, valp, is_pi);
1957 	return 0;
1958 }
1959
1960 /*
1961  * futex_release_all_lwp(l, tid)
1962  *
1963  *	Release all l's robust futexes.  If anything looks funny in
1964  *	the process, give up -- it's userland's responsibility to dot
1965  *	the i's and cross the t's.
1966  */
1967 void
1968 futex_release_all_lwp(struct lwp * const l, lwpid_t const tid)
1969 {
1970 	u_long rhead[_FUTEX_ROBUST_HEAD_NWORDS];
1971 	int limit = 1000000;
1972 	int error;
1973
1974 	/* If there's no robust list there's nothing to do. */
1975 	if (l->l_robust_head == 0)
1976 		return;
1977
1978 	/* Read the final snapshot of the robust list head. */
1979 	error = futex_fetch_robust_head(l->l_robust_head, rhead);
1980 	if (error) {
1981 		printf("WARNING: pid %jd (%s) lwp %jd tid %jd:"
1982 		    " unmapped robust futex list head\n",
1983 		    (uintmax_t)l->l_proc->p_pid, l->l_proc->p_comm,
1984 		    (uintmax_t)l->l_lid, (uintmax_t)tid);
1985 		return;
1986 	}
1987
1988 	const long offset = (long)rhead[_FUTEX_ROBUST_HEAD_OFFSET];
1989
1990 	uintptr_t next, pending;
1991 	bool is_pi, pending_is_pi;
1992
1993 	futex_decode_robust_word(rhead[_FUTEX_ROBUST_HEAD_LIST],
1994 	    &next, &is_pi);
1995 	futex_decode_robust_word(rhead[_FUTEX_ROBUST_HEAD_PENDING],
1996 	    &pending, &pending_is_pi);
1997
1998 	/*
1999 	 * Walk down the list of locked futexes and release them, up
2000 	 * to one million of them before we give up.
2001 	 */
2002
2003 	while (next != l->l_robust_head && limit-- > 0) {
2004 		/* pending handled below. */
2005 		if (next != pending)
2006 			release_futex(next + offset, tid, is_pi, false);
2007 		error = futex_fetch_robust_entry(next, &next, &is_pi);
2008 		if (error)
2009 			break;
2010 		preempt_point();
2011 	}
2012 	if (limit <= 0) {
2013 		printf("WARNING: pid %jd (%s) lwp %jd tid %jd:"
2014 		    " exhausted robust futex limit\n",
2015 		    (uintmax_t)l->l_proc->p_pid, l->l_proc->p_comm,
2016 		    (uintmax_t)l->l_lid, (uintmax_t)tid);
2017 	}
2018
2019 	/* If there's a pending futex, it may need to be released too. */
2020 	if (pending != 0) {
2021 		release_futex(pending + offset, tid, pending_is_pi, true);
2022 	}
2023 }
2024