1/* $NetBSD: sys_futex.c,v 1.20 2024/04/11 13:51:36 riastradh Exp $ */ 2 3/*- 4 * Copyright (c) 2018, 2019, 2020 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Taylor R. Campbell and Jason R. Thorpe. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32#include <sys/cdefs.h> 33__KERNEL_RCSID(0, "$NetBSD: sys_futex.c,v 1.20 2024/04/11 13:51:36 riastradh Exp $"); 34 35/* 36 * Futexes 37 * 38 * The futex system call coordinates notifying threads waiting for 39 * changes on a 32-bit word of memory. The word can be managed by 40 * CPU atomic operations in userland, without system calls, as long 41 * as there is no contention. 42 * 43 * The simplest use case demonstrating the utility is: 44 * 45 * // 32-bit word of memory shared among threads or 46 * // processes in userland. lock & 1 means owned; 47 * // lock & 2 means there are waiters waiting. 48 * volatile int lock = 0; 49 * 50 * int v; 51 * 52 * // Acquire a lock. 53 * do { 54 * v = lock; 55 * if (v & 1) { 56 * // Lock is held. Set a bit to say that 57 * // there are waiters, and wait for lock 58 * // to change to anything other than v; 59 * // then retry. 60 * if (atomic_cas_uint(&lock, v, v | 2) != v) 61 * continue; 62 * futex(FUTEX_WAIT, &lock, v | 2, NULL, NULL, 0); 63 * continue; 64 * } 65 * } while (atomic_cas_uint(&lock, v, v | 1) != v); 66 * membar_acquire(); 67 * 68 * ... 69 * 70 * // Release the lock. Optimistically assume there are 71 * // no waiters first until demonstrated otherwise. 72 * membar_release(); 73 * if (atomic_cas_uint(&lock, 1, 0) != 1) { 74 * // There may be waiters. 75 * v = atomic_swap_uint(&lock, 0); 76 * // If there are still waiters, wake one. 77 * if (v & 2) 78 * futex(FUTEX_WAKE, &lock, 1, NULL, NULL, 0); 79 * } 80 * 81 * The goal is to avoid the futex system call unless there is 82 * contention; then if there is contention, to guarantee no missed 83 * wakeups. 84 * 85 * For a simple implementation, futex(FUTEX_WAIT) could queue 86 * itself to be woken, double-check the lock word, and then sleep; 87 * spurious wakeups are generally a fact of life, so any 88 * FUTEX_WAKE could just wake every FUTEX_WAIT in the system. 89 * 90 * If this were all there is to it, we could then increase 91 * parallelism by refining the approximation: partition the 92 * waiters into buckets by hashing the lock addresses to reduce 93 * the incidence of spurious wakeups. But this is not all. 94 * 95 * The futex(FUTEX_CMP_REQUEUE, &lock, n, &lock2, m, val) 96 * operation not only wakes n waiters on lock if lock == val, but 97 * also _transfers_ m additional waiters to lock2. Unless wakeups 98 * on lock2 also trigger wakeups on lock, we cannot move waiters 99 * to lock2 if they merely share the same hash as waiters on lock. 100 * Thus, we can't approximately distribute waiters into queues by 101 * a hash function; we must distinguish futex queues exactly by 102 * lock address. 103 * 104 * For now, we use a global red/black tree to index futexes. This 105 * should be replaced by a lockless radix tree with a thread to 106 * free entries no longer in use once all lookups on all CPUs have 107 * completed. 108 * 109 * Specifically, we maintain two maps: 110 * 111 * futex_tab.va[vmspace, va] for private futexes 112 * futex_tab.oa[uvm_voaddr] for shared futexes 113 * 114 * This implementation does not support priority inheritance. 115 */ 116 117#include <sys/param.h> 118#include <sys/types.h> 119#include <sys/atomic.h> 120#include <sys/condvar.h> 121#include <sys/futex.h> 122#include <sys/mutex.h> 123#include <sys/rbtree.h> 124#include <sys/queue.h> 125 126#include <sys/syscall.h> 127#include <sys/syscallargs.h> 128#include <sys/syscallvar.h> 129 130#include <uvm/uvm_extern.h> 131 132/* 133 * Lock order: 134 * 135 * futex_tab.lock 136 * futex::fx_qlock ordered by kva of struct futex 137 * -> futex_wait::fw_lock only one at a time 138 * futex_wait::fw_lock only one at a time 139 * -> futex::fx_abortlock only one at a time 140 */ 141 142/* 143 * union futex_key 144 * 145 * A futex is addressed either by a vmspace+va (private) or by 146 * a uvm_voaddr (shared). 147 */ 148union futex_key { 149 struct { 150 struct vmspace *vmspace; 151 vaddr_t va; 152 } fk_private; 153 struct uvm_voaddr fk_shared; 154}; 155 156/* 157 * struct futex 158 * 159 * Kernel state for a futex located at a particular address in a 160 * particular virtual address space. 161 * 162 * N.B. fx_refcnt is an unsigned long because we need to be able 163 * to operate on it atomically on all systems while at the same 164 * time rendering practically impossible the chance of it reaching 165 * its max value. In practice, we're limited by the number of LWPs 166 * that can be present on the system at any given time, and the 167 * assumption is that limit will be good enough on a 32-bit platform. 168 * See futex_wake() for why overflow needs to be avoided. 169 */ 170struct futex { 171 union futex_key fx_key; 172 unsigned long fx_refcnt; 173 bool fx_shared; 174 bool fx_on_tree; 175 struct rb_node fx_node; 176 177 kmutex_t fx_qlock; 178 TAILQ_HEAD(, futex_wait) fx_queue; 179 180 kmutex_t fx_abortlock; 181 LIST_HEAD(, futex_wait) fx_abortlist; 182 kcondvar_t fx_abortcv; 183}; 184 185/* 186 * struct futex_wait 187 * 188 * State for a thread to wait on a futex. Threads wait on fw_cv 189 * for fw_bitset to be set to zero. The thread may transition to 190 * a different futex queue at any time under the futex's lock. 191 */ 192struct futex_wait { 193 kmutex_t fw_lock; 194 kcondvar_t fw_cv; 195 struct futex *fw_futex; 196 TAILQ_ENTRY(futex_wait) fw_entry; /* queue lock */ 197 LIST_ENTRY(futex_wait) fw_abort; /* queue abortlock */ 198 int fw_bitset; 199 bool fw_aborting; /* fw_lock */ 200}; 201 202/* 203 * futex_tab 204 * 205 * Global trees of futexes by vmspace/va and VM object address. 206 * 207 * XXX This obviously doesn't scale in parallel. We could use a 208 * pserialize-safe data structure, but there may be a high cost to 209 * frequent deletion since we don't cache futexes after we're done 210 * with them. We could use hashed locks. But for now, just make 211 * sure userland can't DoS the serial performance, by using a 212 * balanced binary tree for lookup. 213 * 214 * XXX We could use a per-process tree for the table indexed by 215 * virtual address to reduce contention between processes. 216 */ 217static struct { 218 kmutex_t lock; 219 struct rb_tree va; 220 struct rb_tree oa; 221} futex_tab __cacheline_aligned; 222 223static int 224compare_futex_key(void *cookie, const void *n, const void *k) 225{ 226 const struct futex *fa = n; 227 const union futex_key *fka = &fa->fx_key; 228 const union futex_key *fkb = k; 229 230 if ((uintptr_t)fka->fk_private.vmspace < 231 (uintptr_t)fkb->fk_private.vmspace) 232 return -1; 233 if ((uintptr_t)fka->fk_private.vmspace > 234 (uintptr_t)fkb->fk_private.vmspace) 235 return +1; 236 if (fka->fk_private.va < fkb->fk_private.va) 237 return -1; 238 if (fka->fk_private.va > fkb->fk_private.va) 239 return +1; 240 return 0; 241} 242 243static int 244compare_futex(void *cookie, const void *na, const void *nb) 245{ 246 const struct futex *fa = na; 247 const struct futex *fb = nb; 248 249 return compare_futex_key(cookie, fa, &fb->fx_key); 250} 251 252static const rb_tree_ops_t futex_rb_ops = { 253 .rbto_compare_nodes = compare_futex, 254 .rbto_compare_key = compare_futex_key, 255 .rbto_node_offset = offsetof(struct futex, fx_node), 256}; 257 258static int 259compare_futex_shared_key(void *cookie, const void *n, const void *k) 260{ 261 const struct futex *fa = n; 262 const union futex_key *fka = &fa->fx_key; 263 const union futex_key *fkb = k; 264 265 return uvm_voaddr_compare(&fka->fk_shared, &fkb->fk_shared); 266} 267 268static int 269compare_futex_shared(void *cookie, const void *na, const void *nb) 270{ 271 const struct futex *fa = na; 272 const struct futex *fb = nb; 273 274 return compare_futex_shared_key(cookie, fa, &fb->fx_key); 275} 276 277static const rb_tree_ops_t futex_shared_rb_ops = { 278 .rbto_compare_nodes = compare_futex_shared, 279 .rbto_compare_key = compare_futex_shared_key, 280 .rbto_node_offset = offsetof(struct futex, fx_node), 281}; 282 283static void futex_wait_dequeue(struct futex_wait *, struct futex *); 284 285/* 286 * futex_load(uaddr, kaddr) 287 * 288 * Perform a single atomic load to read *uaddr, and return the 289 * result in *kaddr. Return 0 on success, EFAULT if uaddr is not 290 * mapped. 291 */ 292static inline int 293futex_load(int *uaddr, int *kaddr) 294{ 295 return ufetch_int((u_int *)uaddr, (u_int *)kaddr); 296} 297 298/* 299 * futex_test(uaddr, expected) 300 * 301 * True if *uaddr == expected. False if *uaddr != expected, or if 302 * uaddr is not mapped. 303 */ 304static bool 305futex_test(int *uaddr, int expected) 306{ 307 int val; 308 int error; 309 310 error = futex_load(uaddr, &val); 311 if (error) 312 return false; 313 return val == expected; 314} 315 316/* 317 * futex_sys_init() 318 * 319 * Initialize the futex subsystem. 320 */ 321void 322futex_sys_init(void) 323{ 324 325 mutex_init(&futex_tab.lock, MUTEX_DEFAULT, IPL_NONE); 326 rb_tree_init(&futex_tab.va, &futex_rb_ops); 327 rb_tree_init(&futex_tab.oa, &futex_shared_rb_ops); 328} 329 330/* 331 * futex_sys_fini() 332 * 333 * Finalize the futex subsystem. 334 */ 335void 336futex_sys_fini(void) 337{ 338 339 KASSERT(RB_TREE_MIN(&futex_tab.oa) == NULL); 340 KASSERT(RB_TREE_MIN(&futex_tab.va) == NULL); 341 mutex_destroy(&futex_tab.lock); 342} 343 344/* 345 * futex_queue_init(f) 346 * 347 * Initialize the futex queue. Caller must call futex_queue_fini 348 * when done. 349 * 350 * Never sleeps. 351 */ 352static void 353futex_queue_init(struct futex *f) 354{ 355 356 mutex_init(&f->fx_qlock, MUTEX_DEFAULT, IPL_NONE); 357 mutex_init(&f->fx_abortlock, MUTEX_DEFAULT, IPL_NONE); 358 cv_init(&f->fx_abortcv, "fqabort"); 359 LIST_INIT(&f->fx_abortlist); 360 TAILQ_INIT(&f->fx_queue); 361} 362 363/* 364 * futex_queue_drain(f) 365 * 366 * Wait for any aborting waiters in f; then empty the queue of 367 * any stragglers and wake them. Caller must guarantee no new 368 * references to f. 369 * 370 * May sleep. 371 */ 372static void 373futex_queue_drain(struct futex *f) 374{ 375 struct futex_wait *fw, *fw_next; 376 377 mutex_enter(&f->fx_abortlock); 378 while (!LIST_EMPTY(&f->fx_abortlist)) 379 cv_wait(&f->fx_abortcv, &f->fx_abortlock); 380 mutex_exit(&f->fx_abortlock); 381 382 mutex_enter(&f->fx_qlock); 383 TAILQ_FOREACH_SAFE(fw, &f->fx_queue, fw_entry, fw_next) { 384 mutex_enter(&fw->fw_lock); 385 futex_wait_dequeue(fw, f); 386 cv_broadcast(&fw->fw_cv); 387 mutex_exit(&fw->fw_lock); 388 } 389 mutex_exit(&f->fx_qlock); 390} 391 392/* 393 * futex_queue_fini(fq) 394 * 395 * Finalize the futex queue initialized by futex_queue_init. Queue 396 * must be empty. Caller must not use f again until a subsequent 397 * futex_queue_init. 398 */ 399static void 400futex_queue_fini(struct futex *f) 401{ 402 403 KASSERT(TAILQ_EMPTY(&f->fx_queue)); 404 KASSERT(LIST_EMPTY(&f->fx_abortlist)); 405 mutex_destroy(&f->fx_qlock); 406 mutex_destroy(&f->fx_abortlock); 407 cv_destroy(&f->fx_abortcv); 408} 409 410/* 411 * futex_key_init(key, vm, va, shared) 412 * 413 * Initialize a futex key for lookup, etc. 414 */ 415static int 416futex_key_init(union futex_key *fk, struct vmspace *vm, vaddr_t va, bool shared) 417{ 418 int error = 0; 419 420 if (__predict_false(shared)) { 421 if (!uvm_voaddr_acquire(&vm->vm_map, va, &fk->fk_shared)) 422 error = EFAULT; 423 } else { 424 fk->fk_private.vmspace = vm; 425 fk->fk_private.va = va; 426 } 427 428 return error; 429} 430 431/* 432 * futex_key_fini(key, shared) 433 * 434 * Release a futex key. 435 */ 436static void 437futex_key_fini(union futex_key *fk, bool shared) 438{ 439 if (__predict_false(shared)) 440 uvm_voaddr_release(&fk->fk_shared); 441 memset(fk, 0, sizeof(*fk)); 442} 443 444/* 445 * futex_create(fk, shared) 446 * 447 * Create a futex. Initial reference count is 1, representing the 448 * caller. Returns NULL on failure. Always takes ownership of the 449 * key, either transferring it to the newly-created futex, or releasing 450 * the key if creation fails. 451 * 452 * Never sleeps for memory, but may sleep to acquire a lock. 453 */ 454static struct futex * 455futex_create(union futex_key *fk, bool shared) 456{ 457 struct futex *f; 458 459 f = kmem_alloc(sizeof(*f), KM_NOSLEEP); 460 if (f == NULL) { 461 futex_key_fini(fk, shared); 462 return NULL; 463 } 464 f->fx_key = *fk; 465 f->fx_refcnt = 1; 466 f->fx_shared = shared; 467 f->fx_on_tree = false; 468 futex_queue_init(f); 469 470 return f; 471} 472 473/* 474 * futex_destroy(f) 475 * 476 * Destroy a futex created with futex_create. Reference count 477 * must be zero. 478 * 479 * May sleep. 480 */ 481static void 482futex_destroy(struct futex *f) 483{ 484 485 ASSERT_SLEEPABLE(); 486 487 KASSERT(atomic_load_relaxed(&f->fx_refcnt) == 0); 488 KASSERT(!f->fx_on_tree); 489 490 /* Drain and destroy the private queue. */ 491 futex_queue_drain(f); 492 futex_queue_fini(f); 493 494 futex_key_fini(&f->fx_key, f->fx_shared); 495 496 kmem_free(f, sizeof(*f)); 497} 498 499/* 500 * futex_hold(f) 501 * 502 * Attempt to acquire a reference to f. Return 0 on success, 503 * ENFILE on too many references. 504 * 505 * Never sleeps. 506 */ 507static int 508futex_hold(struct futex *f) 509{ 510 unsigned long refcnt; 511 512 do { 513 refcnt = atomic_load_relaxed(&f->fx_refcnt); 514 if (refcnt == ULONG_MAX) 515 return ENFILE; 516 } while (atomic_cas_ulong(&f->fx_refcnt, refcnt, refcnt + 1) != refcnt); 517 518 return 0; 519} 520 521/* 522 * futex_rele(f) 523 * 524 * Release a reference to f acquired with futex_create or 525 * futex_hold. 526 * 527 * May sleep to free f. 528 */ 529static void 530futex_rele(struct futex *f) 531{ 532 unsigned long refcnt; 533 534 ASSERT_SLEEPABLE(); 535 536 do { 537 refcnt = atomic_load_relaxed(&f->fx_refcnt); 538 if (refcnt == 1) 539 goto trylast; 540 membar_release(); 541 } while (atomic_cas_ulong(&f->fx_refcnt, refcnt, refcnt - 1) != refcnt); 542 return; 543 544trylast: 545 mutex_enter(&futex_tab.lock); 546 if (atomic_dec_ulong_nv(&f->fx_refcnt) == 0) { 547 membar_acquire(); 548 if (f->fx_on_tree) { 549 if (__predict_false(f->fx_shared)) 550 rb_tree_remove_node(&futex_tab.oa, f); 551 else 552 rb_tree_remove_node(&futex_tab.va, f); 553 f->fx_on_tree = false; 554 } 555 } else { 556 /* References remain -- don't destroy it. */ 557 f = NULL; 558 } 559 mutex_exit(&futex_tab.lock); 560 if (f != NULL) 561 futex_destroy(f); 562} 563 564/* 565 * futex_rele_not_last(f) 566 * 567 * Release a reference to f acquired with futex_create or 568 * futex_hold. 569 * 570 * This version asserts that we are not dropping the last 571 * reference to f. 572 */ 573static void 574futex_rele_not_last(struct futex *f) 575{ 576 unsigned long refcnt; 577 578 do { 579 refcnt = atomic_load_relaxed(&f->fx_refcnt); 580 KASSERT(refcnt > 1); 581 } while (atomic_cas_ulong(&f->fx_refcnt, refcnt, refcnt - 1) != refcnt); 582} 583 584/* 585 * futex_lookup_by_key(key, shared, &f) 586 * 587 * Try to find an existing futex va reference in the specified key 588 * On success, return 0, set f to found futex or to NULL if not found, 589 * and increment f's reference count if found. 590 * 591 * Return ENFILE if reference count too high. 592 * 593 * Internal lookup routine shared by futex_lookup() and 594 * futex_lookup_create(). 595 */ 596static int 597futex_lookup_by_key(union futex_key *fk, bool shared, struct futex **fp) 598{ 599 struct futex *f; 600 int error = 0; 601 602 mutex_enter(&futex_tab.lock); 603 if (__predict_false(shared)) { 604 f = rb_tree_find_node(&futex_tab.oa, fk); 605 } else { 606 f = rb_tree_find_node(&futex_tab.va, fk); 607 } 608 if (f) { 609 error = futex_hold(f); 610 if (error) 611 f = NULL; 612 } 613 *fp = f; 614 mutex_exit(&futex_tab.lock); 615 616 return error; 617} 618 619/* 620 * futex_insert(f, fp) 621 * 622 * Try to insert the futex f into the tree by va. If there 623 * already is a futex for its va, acquire a reference to it, and 624 * store it in *fp; otherwise store f in *fp. 625 * 626 * Return 0 on success, ENFILE if there already is a futex but its 627 * reference count is too high. 628 */ 629static int 630futex_insert(struct futex *f, struct futex **fp) 631{ 632 struct futex *f0; 633 int error; 634 635 KASSERT(atomic_load_relaxed(&f->fx_refcnt) != 0); 636 KASSERT(!f->fx_on_tree); 637 638 mutex_enter(&futex_tab.lock); 639 if (__predict_false(f->fx_shared)) 640 f0 = rb_tree_insert_node(&futex_tab.oa, f); 641 else 642 f0 = rb_tree_insert_node(&futex_tab.va, f); 643 if (f0 == f) { 644 f->fx_on_tree = true; 645 error = 0; 646 } else { 647 KASSERT(atomic_load_relaxed(&f0->fx_refcnt) != 0); 648 KASSERT(f0->fx_on_tree); 649 error = futex_hold(f0); 650 if (error) 651 goto out; 652 } 653 *fp = f0; 654out: mutex_exit(&futex_tab.lock); 655 656 return error; 657} 658 659/* 660 * futex_lookup(uaddr, shared, &f) 661 * 662 * Find a futex at the userland pointer uaddr in the current 663 * process's VM space. On success, return the futex in f and 664 * increment its reference count. 665 * 666 * Caller must call futex_rele when done. 667 */ 668static int 669futex_lookup(int *uaddr, bool shared, struct futex **fp) 670{ 671 union futex_key fk; 672 struct vmspace *vm = curproc->p_vmspace; 673 vaddr_t va = (vaddr_t)uaddr; 674 int error; 675 676 /* 677 * Reject unaligned user pointers so we don't cross page 678 * boundaries and so atomics will work. 679 */ 680 if ((va & 3) != 0) 681 return EINVAL; 682 683 /* Look it up. */ 684 error = futex_key_init(&fk, vm, va, shared); 685 if (error) 686 return error; 687 688 error = futex_lookup_by_key(&fk, shared, fp); 689 futex_key_fini(&fk, shared); 690 if (error) 691 return error; 692 693 KASSERT(*fp == NULL || (*fp)->fx_shared == shared); 694 KASSERT(*fp == NULL || atomic_load_relaxed(&(*fp)->fx_refcnt) != 0); 695 696 /* 697 * Success! (Caller must still check whether we found 698 * anything, but nothing went _wrong_ like trying to use 699 * unmapped memory.) 700 */ 701 KASSERT(error == 0); 702 703 return error; 704} 705 706/* 707 * futex_lookup_create(uaddr, shared, &f) 708 * 709 * Find or create a futex at the userland pointer uaddr in the 710 * current process's VM space. On success, return the futex in f 711 * and increment its reference count. 712 * 713 * Caller must call futex_rele when done. 714 */ 715static int 716futex_lookup_create(int *uaddr, bool shared, struct futex **fp) 717{ 718 union futex_key fk; 719 struct vmspace *vm = curproc->p_vmspace; 720 struct futex *f = NULL; 721 vaddr_t va = (vaddr_t)uaddr; 722 int error; 723 724 /* 725 * Reject unaligned user pointers so we don't cross page 726 * boundaries and so atomics will work. 727 */ 728 if ((va & 3) != 0) 729 return EINVAL; 730 731 error = futex_key_init(&fk, vm, va, shared); 732 if (error) 733 return error; 734 735 /* 736 * Optimistically assume there already is one, and try to find 737 * it. 738 */ 739 error = futex_lookup_by_key(&fk, shared, fp); 740 if (error || *fp != NULL) { 741 /* 742 * We either found one, or there was an error. 743 * In either case, we are done with the key. 744 */ 745 futex_key_fini(&fk, shared); 746 goto out; 747 } 748 749 /* 750 * Create a futex record. This transfers ownership of the key 751 * in all cases. 752 */ 753 f = futex_create(&fk, shared); 754 if (f == NULL) { 755 error = ENOMEM; 756 goto out; 757 } 758 759 /* 760 * Insert our new futex, or use existing if someone else beat 761 * us to it. 762 */ 763 error = futex_insert(f, fp); 764 if (error) 765 goto out; 766 if (*fp == f) 767 f = NULL; /* don't release on exit */ 768 769 /* Success! */ 770 KASSERT(error == 0); 771 772out: if (f != NULL) 773 futex_rele(f); 774 KASSERT(error || *fp != NULL); 775 KASSERT(error || atomic_load_relaxed(&(*fp)->fx_refcnt) != 0); 776 return error; 777} 778 779/* 780 * futex_wait_init(fw, bitset) 781 * 782 * Initialize a record for a thread to wait on a futex matching 783 * the specified bit set. Should be passed to futex_wait_enqueue 784 * before futex_wait, and should be passed to futex_wait_fini when 785 * done. 786 */ 787static void 788futex_wait_init(struct futex_wait *fw, int bitset) 789{ 790 791 KASSERT(bitset); 792 793 mutex_init(&fw->fw_lock, MUTEX_DEFAULT, IPL_NONE); 794 cv_init(&fw->fw_cv, "futex"); 795 fw->fw_futex = NULL; 796 fw->fw_bitset = bitset; 797 fw->fw_aborting = false; 798} 799 800/* 801 * futex_wait_fini(fw) 802 * 803 * Finalize a record for a futex waiter. Must not be on any 804 * futex's queue. 805 */ 806static void 807futex_wait_fini(struct futex_wait *fw) 808{ 809 810 KASSERT(fw->fw_futex == NULL); 811 812 cv_destroy(&fw->fw_cv); 813 mutex_destroy(&fw->fw_lock); 814} 815 816/* 817 * futex_wait_enqueue(fw, f) 818 * 819 * Put fw on the futex queue. Must be done before futex_wait. 820 * Caller must hold fw's lock and f's lock, and fw must not be on 821 * any existing futex's waiter list. 822 */ 823static void 824futex_wait_enqueue(struct futex_wait *fw, struct futex *f) 825{ 826 827 KASSERT(mutex_owned(&f->fx_qlock)); 828 KASSERT(mutex_owned(&fw->fw_lock)); 829 KASSERT(fw->fw_futex == NULL); 830 KASSERT(!fw->fw_aborting); 831 832 fw->fw_futex = f; 833 TAILQ_INSERT_TAIL(&f->fx_queue, fw, fw_entry); 834} 835 836/* 837 * futex_wait_dequeue(fw, f) 838 * 839 * Remove fw from the futex queue. Precludes subsequent 840 * futex_wait until a futex_wait_enqueue. Caller must hold fw's 841 * lock and f's lock, and fw must be on f. 842 */ 843static void 844futex_wait_dequeue(struct futex_wait *fw, struct futex *f) 845{ 846 847 KASSERT(mutex_owned(&f->fx_qlock)); 848 KASSERT(mutex_owned(&fw->fw_lock)); 849 KASSERT(fw->fw_futex == f); 850 851 TAILQ_REMOVE(&f->fx_queue, fw, fw_entry); 852 fw->fw_futex = NULL; 853} 854 855/* 856 * futex_wait_abort(fw) 857 * 858 * Caller is no longer waiting for fw. Remove it from any queue 859 * if it was on one. Caller must hold fw->fw_lock. 860 */ 861static void 862futex_wait_abort(struct futex_wait *fw) 863{ 864 struct futex *f; 865 866 KASSERT(mutex_owned(&fw->fw_lock)); 867 868 /* 869 * Grab the futex queue. It can't go away as long as we hold 870 * fw_lock. However, we can't take the queue lock because 871 * that's a lock order reversal. 872 */ 873 f = fw->fw_futex; 874 875 /* Put us on the abort list so that fq won't go away. */ 876 mutex_enter(&f->fx_abortlock); 877 LIST_INSERT_HEAD(&f->fx_abortlist, fw, fw_abort); 878 mutex_exit(&f->fx_abortlock); 879 880 /* 881 * Mark fw as aborting so it won't lose wakeups and won't be 882 * transferred to any other queue. 883 */ 884 fw->fw_aborting = true; 885 886 /* f is now stable, so we can release fw_lock. */ 887 mutex_exit(&fw->fw_lock); 888 889 /* Now we can remove fw under the queue lock. */ 890 mutex_enter(&f->fx_qlock); 891 mutex_enter(&fw->fw_lock); 892 futex_wait_dequeue(fw, f); 893 mutex_exit(&fw->fw_lock); 894 mutex_exit(&f->fx_qlock); 895 896 /* 897 * Finally, remove us from the abort list and notify anyone 898 * waiting for the abort to complete if we were the last to go. 899 */ 900 mutex_enter(&f->fx_abortlock); 901 LIST_REMOVE(fw, fw_abort); 902 if (LIST_EMPTY(&f->fx_abortlist)) 903 cv_broadcast(&f->fx_abortcv); 904 mutex_exit(&f->fx_abortlock); 905 906 /* 907 * Release our reference to the futex now that we are not 908 * waiting for it. 909 */ 910 futex_rele(f); 911 912 /* 913 * Reacquire the fw lock as caller expects. Verify that we're 914 * aborting and no longer associated with a futex. 915 */ 916 mutex_enter(&fw->fw_lock); 917 KASSERT(fw->fw_aborting); 918 KASSERT(fw->fw_futex == NULL); 919} 920 921/* 922 * futex_wait(fw, deadline, clkid) 923 * 924 * fw must be a waiter on a futex's queue. Wait until deadline on 925 * the clock clkid, or forever if deadline is NULL, for a futex 926 * wakeup. Return 0 on explicit wakeup or destruction of futex, 927 * ETIMEDOUT on timeout, EINTR/ERESTART on signal. Either way, fw 928 * will no longer be on a futex queue on return. 929 */ 930static int 931futex_wait(struct futex_wait *fw, const struct timespec *deadline, 932 clockid_t clkid) 933{ 934 int error = 0; 935 936 /* Test and wait under the wait lock. */ 937 mutex_enter(&fw->fw_lock); 938 939 for (;;) { 940 /* If we're done yet, stop and report success. */ 941 if (fw->fw_bitset == 0 || fw->fw_futex == NULL) { 942 error = 0; 943 break; 944 } 945 946 /* If anything went wrong in the last iteration, stop. */ 947 if (error) 948 break; 949 950 /* Not done yet. Wait. */ 951 if (deadline) { 952 struct timespec ts; 953 954 /* Check our watch. */ 955 error = clock_gettime1(clkid, &ts); 956 if (error) 957 break; 958 959 /* If we're past the deadline, ETIMEDOUT. */ 960 if (timespeccmp(deadline, &ts, <=)) { 961 error = ETIMEDOUT; 962 break; 963 } 964 965 /* Count how much time is left. */ 966 timespecsub(deadline, &ts, &ts); 967 968 /* Wait for that much time, allowing signals. */ 969 error = cv_timedwait_sig(&fw->fw_cv, &fw->fw_lock, 970 tstohz(&ts)); 971 } else { 972 /* Wait indefinitely, allowing signals. */ 973 error = cv_wait_sig(&fw->fw_cv, &fw->fw_lock); 974 } 975 } 976 977 /* 978 * If we were woken up, the waker will have removed fw from the 979 * queue. But if anything went wrong, we must remove fw from 980 * the queue ourselves. While here, convert EWOULDBLOCK to 981 * ETIMEDOUT. 982 */ 983 if (error) { 984 futex_wait_abort(fw); 985 if (error == EWOULDBLOCK) 986 error = ETIMEDOUT; 987 } 988 989 mutex_exit(&fw->fw_lock); 990 991 return error; 992} 993 994/* 995 * futex_wake(f, nwake, f2, nrequeue, bitset) 996 * 997 * Wake up to nwake waiters on f matching bitset; then, if f2 is 998 * provided, move up to nrequeue remaining waiters on f matching 999 * bitset to f2. Return the number of waiters actually woken. 1000 * Caller must hold the locks of f and f2, if provided. 1001 */ 1002static unsigned 1003futex_wake(struct futex *f, unsigned nwake, struct futex *f2, 1004 unsigned nrequeue, int bitset) 1005{ 1006 struct futex_wait *fw, *fw_next; 1007 unsigned nwoken = 0; 1008 int hold_error __diagused; 1009 1010 KASSERT(mutex_owned(&f->fx_qlock)); 1011 KASSERT(f2 == NULL || mutex_owned(&f2->fx_qlock)); 1012 1013 /* Wake up to nwake waiters, and count the number woken. */ 1014 TAILQ_FOREACH_SAFE(fw, &f->fx_queue, fw_entry, fw_next) { 1015 if ((fw->fw_bitset & bitset) == 0) 1016 continue; 1017 if (nwake > 0) { 1018 mutex_enter(&fw->fw_lock); 1019 if (__predict_false(fw->fw_aborting)) { 1020 mutex_exit(&fw->fw_lock); 1021 continue; 1022 } 1023 futex_wait_dequeue(fw, f); 1024 fw->fw_bitset = 0; 1025 cv_broadcast(&fw->fw_cv); 1026 mutex_exit(&fw->fw_lock); 1027 nwake--; 1028 nwoken++; 1029 /* 1030 * Drop the futex reference on behalf of the 1031 * waiter. We assert this is not the last 1032 * reference on the futex (our caller should 1033 * also have one). 1034 */ 1035 futex_rele_not_last(f); 1036 } else { 1037 break; 1038 } 1039 } 1040 1041 if (f2) { 1042 /* Move up to nrequeue waiters from f's queue to f2's queue. */ 1043 TAILQ_FOREACH_SAFE(fw, &f->fx_queue, fw_entry, fw_next) { 1044 if ((fw->fw_bitset & bitset) == 0) 1045 continue; 1046 if (nrequeue > 0) { 1047 mutex_enter(&fw->fw_lock); 1048 if (__predict_false(fw->fw_aborting)) { 1049 mutex_exit(&fw->fw_lock); 1050 continue; 1051 } 1052 futex_wait_dequeue(fw, f); 1053 futex_wait_enqueue(fw, f2); 1054 mutex_exit(&fw->fw_lock); 1055 nrequeue--; 1056 /* 1057 * Transfer the reference from f to f2. 1058 * As above, we assert that we are not 1059 * dropping the last reference to f here. 1060 * 1061 * XXX futex_hold() could theoretically 1062 * XXX fail here. 1063 */ 1064 futex_rele_not_last(f); 1065 hold_error = futex_hold(f2); 1066 KASSERT(hold_error == 0); 1067 } else { 1068 break; 1069 } 1070 } 1071 } else { 1072 KASSERT(nrequeue == 0); 1073 } 1074 1075 /* Return the number of waiters woken. */ 1076 return nwoken; 1077} 1078 1079/* 1080 * futex_queue_lock(f) 1081 * 1082 * Acquire the queue lock of f. Pair with futex_queue_unlock. Do 1083 * not use if caller needs to acquire two locks; use 1084 * futex_queue_lock2 instead. 1085 */ 1086static void 1087futex_queue_lock(struct futex *f) 1088{ 1089 mutex_enter(&f->fx_qlock); 1090} 1091 1092/* 1093 * futex_queue_unlock(f) 1094 * 1095 * Release the queue lock of f. 1096 */ 1097static void 1098futex_queue_unlock(struct futex *f) 1099{ 1100 mutex_exit(&f->fx_qlock); 1101} 1102 1103/* 1104 * futex_queue_lock2(f, f2) 1105 * 1106 * Acquire the queue locks of both f and f2, which may be null, or 1107 * which may have the same underlying queue. If they are 1108 * distinct, an arbitrary total order is chosen on the locks. 1109 * 1110 * Callers should only ever acquire multiple queue locks 1111 * simultaneously using futex_queue_lock2. 1112 */ 1113static void 1114futex_queue_lock2(struct futex *f, struct futex *f2) 1115{ 1116 1117 /* 1118 * If both are null, do nothing; if one is null and the other 1119 * is not, lock the other and be done with it. 1120 */ 1121 if (f == NULL && f2 == NULL) { 1122 return; 1123 } else if (f == NULL) { 1124 mutex_enter(&f2->fx_qlock); 1125 return; 1126 } else if (f2 == NULL) { 1127 mutex_enter(&f->fx_qlock); 1128 return; 1129 } 1130 1131 /* If both futexes are the same, acquire only one. */ 1132 if (f == f2) { 1133 mutex_enter(&f->fx_qlock); 1134 return; 1135 } 1136 1137 /* Otherwise, use the ordering on the kva of the futex pointer. */ 1138 if ((uintptr_t)f < (uintptr_t)f2) { 1139 mutex_enter(&f->fx_qlock); 1140 mutex_enter(&f2->fx_qlock); 1141 } else { 1142 mutex_enter(&f2->fx_qlock); 1143 mutex_enter(&f->fx_qlock); 1144 } 1145} 1146 1147/* 1148 * futex_queue_unlock2(f, f2) 1149 * 1150 * Release the queue locks of both f and f2, which may be null, or 1151 * which may have the same underlying queue. 1152 */ 1153static void 1154futex_queue_unlock2(struct futex *f, struct futex *f2) 1155{ 1156 1157 /* 1158 * If both are null, do nothing; if one is null and the other 1159 * is not, unlock the other and be done with it. 1160 */ 1161 if (f == NULL && f2 == NULL) { 1162 return; 1163 } else if (f == NULL) { 1164 mutex_exit(&f2->fx_qlock); 1165 return; 1166 } else if (f2 == NULL) { 1167 mutex_exit(&f->fx_qlock); 1168 return; 1169 } 1170 1171 /* If both futexes are the same, release only one. */ 1172 if (f == f2) { 1173 mutex_exit(&f->fx_qlock); 1174 return; 1175 } 1176 1177 /* Otherwise, use the ordering on the kva of the futex pointer. */ 1178 if ((uintptr_t)f < (uintptr_t)f2) { 1179 mutex_exit(&f2->fx_qlock); 1180 mutex_exit(&f->fx_qlock); 1181 } else { 1182 mutex_exit(&f->fx_qlock); 1183 mutex_exit(&f2->fx_qlock); 1184 } 1185} 1186 1187/* 1188 * futex_func_wait(uaddr, val, val3, timeout, clkid, clkflags, retval) 1189 * 1190 * Implement futex(FUTEX_WAIT). 1191 */ 1192static int 1193futex_func_wait(bool shared, int *uaddr, int val, int val3, 1194 const struct timespec *timeout, clockid_t clkid, int clkflags, 1195 register_t *retval) 1196{ 1197 struct futex *f; 1198 struct futex_wait wait, *fw = &wait; 1199 struct timespec ts; 1200 const struct timespec *deadline; 1201 int error; 1202 1203 /* 1204 * If there's nothing to wait for, and nobody will ever wake 1205 * us, then don't set anything up to wait -- just stop here. 1206 */ 1207 if (val3 == 0) 1208 return EINVAL; 1209 1210 /* Optimistically test before anything else. */ 1211 if (!futex_test(uaddr, val)) 1212 return EAGAIN; 1213 1214 /* Determine a deadline on the specified clock. */ 1215 if (timeout == NULL || (clkflags & TIMER_ABSTIME) == TIMER_ABSTIME) { 1216 deadline = timeout; 1217 } else { 1218 error = clock_gettime1(clkid, &ts); 1219 if (error) 1220 return error; 1221 timespecadd(&ts, timeout, &ts); 1222 deadline = &ts; 1223 } 1224 1225 /* Get the futex, creating it if necessary. */ 1226 error = futex_lookup_create(uaddr, shared, &f); 1227 if (error) 1228 return error; 1229 KASSERT(f); 1230 1231 /* Get ready to wait. */ 1232 futex_wait_init(fw, val3); 1233 1234 /* 1235 * Under the queue lock, check the value again: if it has 1236 * already changed, EAGAIN; otherwise enqueue the waiter. 1237 * Since FUTEX_WAKE will use the same lock and be done after 1238 * modifying the value, the order in which we check and enqueue 1239 * is immaterial. 1240 */ 1241 futex_queue_lock(f); 1242 if (!futex_test(uaddr, val)) { 1243 futex_queue_unlock(f); 1244 error = EAGAIN; 1245 goto out; 1246 } 1247 mutex_enter(&fw->fw_lock); 1248 futex_wait_enqueue(fw, f); 1249 mutex_exit(&fw->fw_lock); 1250 futex_queue_unlock(f); 1251 1252 /* 1253 * We cannot drop our reference to the futex here, because 1254 * we might be enqueued on a different one when we are awakened. 1255 * The references will be managed on our behalf in the requeue 1256 * and wake cases. 1257 */ 1258 f = NULL; 1259 1260 /* Wait. */ 1261 error = futex_wait(fw, deadline, clkid); 1262 if (error) 1263 goto out; 1264 1265 /* Return 0 on success, error on failure. */ 1266 *retval = 0; 1267 1268out: if (f != NULL) 1269 futex_rele(f); 1270 futex_wait_fini(fw); 1271 return error; 1272} 1273 1274/* 1275 * futex_func_wake(uaddr, val, val3, retval) 1276 * 1277 * Implement futex(FUTEX_WAKE) and futex(FUTEX_WAKE_BITSET). 1278 */ 1279static int 1280futex_func_wake(bool shared, int *uaddr, int val, int val3, register_t *retval) 1281{ 1282 struct futex *f; 1283 unsigned int nwoken = 0; 1284 int error = 0; 1285 1286 /* Reject negative number of wakeups. */ 1287 if (val < 0) { 1288 error = EINVAL; 1289 goto out; 1290 } 1291 1292 /* Look up the futex, if any. */ 1293 error = futex_lookup(uaddr, shared, &f); 1294 if (error) 1295 goto out; 1296 1297 /* If there's no futex, there are no waiters to wake. */ 1298 if (f == NULL) 1299 goto out; 1300 1301 /* 1302 * Under f's queue lock, wake the waiters and remember the 1303 * number woken. 1304 */ 1305 futex_queue_lock(f); 1306 nwoken = futex_wake(f, val, NULL, 0, val3); 1307 futex_queue_unlock(f); 1308 1309 /* Release the futex. */ 1310 futex_rele(f); 1311 1312out: 1313 /* Return the number of waiters woken. */ 1314 *retval = nwoken; 1315 1316 /* Success! */ 1317 return error; 1318} 1319 1320/* 1321 * futex_func_requeue(op, uaddr, val, uaddr2, val2, val3, retval) 1322 * 1323 * Implement futex(FUTEX_REQUEUE) and futex(FUTEX_CMP_REQUEUE). 1324 */ 1325static int 1326futex_func_requeue(bool shared, int op, int *uaddr, int val, int *uaddr2, 1327 int val2, int val3, register_t *retval) 1328{ 1329 struct futex *f = NULL, *f2 = NULL; 1330 unsigned nwoken = 0; /* default to zero woken on early return */ 1331 int error; 1332 1333 /* Reject negative number of wakeups or requeues. */ 1334 if (val < 0 || val2 < 0) { 1335 error = EINVAL; 1336 goto out; 1337 } 1338 1339 /* Look up the source futex, if any. */ 1340 error = futex_lookup(uaddr, shared, &f); 1341 if (error) 1342 goto out; 1343 1344 /* If there is none, nothing to do. */ 1345 if (f == NULL) 1346 goto out; 1347 1348 /* 1349 * We may need to create the destination futex because it's 1350 * entirely possible it does not currently have any waiters. 1351 */ 1352 error = futex_lookup_create(uaddr2, shared, &f2); 1353 if (error) 1354 goto out; 1355 1356 /* 1357 * Under the futexes' queue locks, check the value; if 1358 * unchanged from val3, wake the waiters. 1359 */ 1360 futex_queue_lock2(f, f2); 1361 if (op == FUTEX_CMP_REQUEUE && !futex_test(uaddr, val3)) { 1362 error = EAGAIN; 1363 } else { 1364 error = 0; 1365 nwoken = futex_wake(f, val, f2, val2, FUTEX_BITSET_MATCH_ANY); 1366 } 1367 futex_queue_unlock2(f, f2); 1368 1369out: 1370 /* Return the number of waiters woken. */ 1371 *retval = nwoken; 1372 1373 /* Release the futexes if we got them. */ 1374 if (f2) 1375 futex_rele(f2); 1376 if (f) 1377 futex_rele(f); 1378 return error; 1379} 1380 1381/* 1382 * futex_validate_op_cmp(val3) 1383 * 1384 * Validate an op/cmp argument for FUTEX_WAKE_OP. 1385 */ 1386static int 1387futex_validate_op_cmp(int val3) 1388{ 1389 int op = __SHIFTOUT(val3, FUTEX_OP_OP_MASK); 1390 int cmp = __SHIFTOUT(val3, FUTEX_OP_CMP_MASK); 1391 1392 if (op & FUTEX_OP_OPARG_SHIFT) { 1393 int oparg = __SHIFTOUT(val3, FUTEX_OP_OPARG_MASK); 1394 if (oparg < 0) 1395 return EINVAL; 1396 if (oparg >= 32) 1397 return EINVAL; 1398 op &= ~FUTEX_OP_OPARG_SHIFT; 1399 } 1400 1401 switch (op) { 1402 case FUTEX_OP_SET: 1403 case FUTEX_OP_ADD: 1404 case FUTEX_OP_OR: 1405 case FUTEX_OP_ANDN: 1406 case FUTEX_OP_XOR: 1407 break; 1408 default: 1409 return EINVAL; 1410 } 1411 1412 switch (cmp) { 1413 case FUTEX_OP_CMP_EQ: 1414 case FUTEX_OP_CMP_NE: 1415 case FUTEX_OP_CMP_LT: 1416 case FUTEX_OP_CMP_LE: 1417 case FUTEX_OP_CMP_GT: 1418 case FUTEX_OP_CMP_GE: 1419 break; 1420 default: 1421 return EINVAL; 1422 } 1423 1424 return 0; 1425} 1426 1427/* 1428 * futex_compute_op(oldval, val3) 1429 * 1430 * Apply a FUTEX_WAIT_OP operation to oldval. 1431 */ 1432static int 1433futex_compute_op(int oldval, int val3) 1434{ 1435 int op = __SHIFTOUT(val3, FUTEX_OP_OP_MASK); 1436 int oparg = __SHIFTOUT(val3, FUTEX_OP_OPARG_MASK); 1437 1438 if (op & FUTEX_OP_OPARG_SHIFT) { 1439 KASSERT(oparg >= 0); 1440 KASSERT(oparg < 32); 1441 oparg = 1u << oparg; 1442 op &= ~FUTEX_OP_OPARG_SHIFT; 1443 } 1444 1445 switch (op) { 1446 case FUTEX_OP_SET: 1447 return oparg; 1448 1449 case FUTEX_OP_ADD: 1450 /* 1451 * Avoid signed arithmetic overflow by doing 1452 * arithmetic unsigned and converting back to signed 1453 * at the end. 1454 */ 1455 return (int)((unsigned)oldval + (unsigned)oparg); 1456 1457 case FUTEX_OP_OR: 1458 return oldval | oparg; 1459 1460 case FUTEX_OP_ANDN: 1461 return oldval & ~oparg; 1462 1463 case FUTEX_OP_XOR: 1464 return oldval ^ oparg; 1465 1466 default: 1467 panic("invalid futex op"); 1468 } 1469} 1470 1471/* 1472 * futex_compute_cmp(oldval, val3) 1473 * 1474 * Apply a FUTEX_WAIT_OP comparison to oldval. 1475 */ 1476static bool 1477futex_compute_cmp(int oldval, int val3) 1478{ 1479 int cmp = __SHIFTOUT(val3, FUTEX_OP_CMP_MASK); 1480 int cmparg = __SHIFTOUT(val3, FUTEX_OP_CMPARG_MASK); 1481 1482 switch (cmp) { 1483 case FUTEX_OP_CMP_EQ: 1484 return (oldval == cmparg); 1485 1486 case FUTEX_OP_CMP_NE: 1487 return (oldval != cmparg); 1488 1489 case FUTEX_OP_CMP_LT: 1490 return (oldval < cmparg); 1491 1492 case FUTEX_OP_CMP_LE: 1493 return (oldval <= cmparg); 1494 1495 case FUTEX_OP_CMP_GT: 1496 return (oldval > cmparg); 1497 1498 case FUTEX_OP_CMP_GE: 1499 return (oldval >= cmparg); 1500 1501 default: 1502 panic("invalid futex cmp operation"); 1503 } 1504} 1505 1506/* 1507 * futex_func_wake_op(uaddr, val, uaddr2, val2, val3, retval) 1508 * 1509 * Implement futex(FUTEX_WAKE_OP). 1510 */ 1511static int 1512futex_func_wake_op(bool shared, int *uaddr, int val, int *uaddr2, int val2, 1513 int val3, register_t *retval) 1514{ 1515 struct futex *f = NULL, *f2 = NULL; 1516 int oldval, newval, actual; 1517 unsigned nwoken = 0; 1518 int error; 1519 1520 /* Reject negative number of wakeups. */ 1521 if (val < 0 || val2 < 0) { 1522 error = EINVAL; 1523 goto out; 1524 } 1525 1526 /* Reject invalid operations before we start doing things. */ 1527 if ((error = futex_validate_op_cmp(val3)) != 0) 1528 goto out; 1529 1530 /* Look up the first futex, if any. */ 1531 error = futex_lookup(uaddr, shared, &f); 1532 if (error) 1533 goto out; 1534 1535 /* Look up the second futex, if any. */ 1536 error = futex_lookup(uaddr2, shared, &f2); 1537 if (error) 1538 goto out; 1539 1540 /* 1541 * Under the queue locks: 1542 * 1543 * 1. Read/modify/write: *uaddr2 op= oparg. 1544 * 2. Unconditionally wake uaddr. 1545 * 3. Conditionally wake uaddr2, if it previously matched val2. 1546 */ 1547 futex_queue_lock2(f, f2); 1548 do { 1549 error = futex_load(uaddr2, &oldval); 1550 if (error) 1551 goto out_unlock; 1552 newval = futex_compute_op(oldval, val3); 1553 error = ucas_int(uaddr2, oldval, newval, &actual); 1554 if (error) 1555 goto out_unlock; 1556 } while (actual != oldval); 1557 nwoken = (f ? futex_wake(f, val, NULL, 0, FUTEX_BITSET_MATCH_ANY) : 0); 1558 if (f2 && futex_compute_cmp(oldval, val3)) 1559 nwoken += futex_wake(f2, val2, NULL, 0, 1560 FUTEX_BITSET_MATCH_ANY); 1561 1562 /* Success! */ 1563 error = 0; 1564out_unlock: 1565 futex_queue_unlock2(f, f2); 1566 1567out: 1568 /* Return the number of waiters woken. */ 1569 *retval = nwoken; 1570 1571 /* Release the futexes, if we got them. */ 1572 if (f2) 1573 futex_rele(f2); 1574 if (f) 1575 futex_rele(f); 1576 return error; 1577} 1578 1579/* 1580 * do_futex(uaddr, op, val, timeout, uaddr2, val2, val3) 1581 * 1582 * Implement the futex system call with all the parameters 1583 * parsed out. 1584 */ 1585int 1586do_futex(int *uaddr, int op, int val, const struct timespec *timeout, 1587 int *uaddr2, int val2, int val3, register_t *retval) 1588{ 1589 const bool shared = (op & FUTEX_PRIVATE_FLAG) ? false : true; 1590 const clockid_t clkid = (op & FUTEX_CLOCK_REALTIME) ? CLOCK_REALTIME 1591 : CLOCK_MONOTONIC; 1592 1593 op &= FUTEX_CMD_MASK; 1594 1595 switch (op) { 1596 case FUTEX_WAIT: 1597 return futex_func_wait(shared, uaddr, val, 1598 FUTEX_BITSET_MATCH_ANY, timeout, clkid, TIMER_RELTIME, 1599 retval); 1600 1601 case FUTEX_WAKE: 1602 val3 = FUTEX_BITSET_MATCH_ANY; 1603 /* FALLTHROUGH */ 1604 case FUTEX_WAKE_BITSET: 1605 return futex_func_wake(shared, uaddr, val, val3, retval); 1606 1607 case FUTEX_REQUEUE: 1608 case FUTEX_CMP_REQUEUE: 1609 return futex_func_requeue(shared, op, uaddr, val, uaddr2, 1610 val2, val3, retval); 1611 1612 case FUTEX_WAIT_BITSET: 1613 return futex_func_wait(shared, uaddr, val, val3, timeout, 1614 clkid, TIMER_ABSTIME, retval); 1615 1616 case FUTEX_WAKE_OP: 1617 return futex_func_wake_op(shared, uaddr, val, uaddr2, val2, 1618 val3, retval); 1619 1620 case FUTEX_FD: 1621 default: 1622 return ENOSYS; 1623 } 1624} 1625 1626/* 1627 * sys___futex(l, uap, retval) 1628 * 1629 * __futex(2) system call: generic futex operations. 1630 */ 1631int 1632sys___futex(struct lwp *l, const struct sys___futex_args *uap, 1633 register_t *retval) 1634{ 1635 /* { 1636 syscallarg(int *) uaddr; 1637 syscallarg(int) op; 1638 syscallarg(int) val; 1639 syscallarg(const struct timespec *) timeout; 1640 syscallarg(int *) uaddr2; 1641 syscallarg(int) val2; 1642 syscallarg(int) val3; 1643 } */ 1644 struct timespec ts, *tsp; 1645 int error; 1646 1647 /* 1648 * Copy in the timeout argument, if specified. 1649 */ 1650 if (SCARG(uap, timeout)) { 1651 error = copyin(SCARG(uap, timeout), &ts, sizeof(ts)); 1652 if (error) 1653 return error; 1654 tsp = &ts; 1655 } else { 1656 tsp = NULL; 1657 } 1658 1659 return do_futex(SCARG(uap, uaddr), SCARG(uap, op), SCARG(uap, val), 1660 tsp, SCARG(uap, uaddr2), SCARG(uap, val2), SCARG(uap, val3), 1661 retval); 1662} 1663 1664/* 1665 * sys___futex_set_robust_list(l, uap, retval) 1666 * 1667 * __futex_set_robust_list(2) system call for robust futexes. 1668 */ 1669int 1670sys___futex_set_robust_list(struct lwp *l, 1671 const struct sys___futex_set_robust_list_args *uap, register_t *retval) 1672{ 1673 /* { 1674 syscallarg(void *) head; 1675 syscallarg(size_t) len; 1676 } */ 1677 void *head = SCARG(uap, head); 1678 1679 if (SCARG(uap, len) != _FUTEX_ROBUST_HEAD_SIZE) 1680 return EINVAL; 1681 if ((uintptr_t)head % sizeof(u_long)) 1682 return EINVAL; 1683 1684 l->l_robust_head = (uintptr_t)head; 1685 1686 return 0; 1687} 1688 1689/* 1690 * sys___futex_get_robust_list(l, uap, retval) 1691 * 1692 * __futex_get_robust_list(2) system call for robust futexes. 1693 */ 1694int 1695sys___futex_get_robust_list(struct lwp *l, 1696 const struct sys___futex_get_robust_list_args *uap, register_t *retval) 1697{ 1698 /* { 1699 syscallarg(lwpid_t) lwpid; 1700 syscallarg(void **) headp; 1701 syscallarg(size_t *) lenp; 1702 } */ 1703 void *head; 1704 const size_t len = _FUTEX_ROBUST_HEAD_SIZE; 1705 int error; 1706 1707 error = futex_robust_head_lookup(l, SCARG(uap, lwpid), &head); 1708 if (error) 1709 return error; 1710 1711 /* Copy out the head pointer and the head structure length. */ 1712 error = copyout(&head, SCARG(uap, headp), sizeof(head)); 1713 if (__predict_true(error == 0)) { 1714 error = copyout(&len, SCARG(uap, lenp), sizeof(len)); 1715 } 1716 1717 return error; 1718} 1719 1720/* 1721 * release_futex(uva, tid) 1722 * 1723 * Try to release the robust futex at uva in the current process 1724 * on lwp exit. If anything goes wrong, silently fail. It is the 1725 * userland program's obligation to arrange correct behaviour. 1726 */ 1727static void 1728release_futex(uintptr_t const uptr, lwpid_t const tid, bool const is_pi, 1729 bool const is_pending) 1730{ 1731 int *uaddr; 1732 struct futex *f; 1733 int oldval, newval, actual; 1734 int error; 1735 1736 /* If it's misaligned, tough. */ 1737 if (__predict_false(uptr & 3)) 1738 return; 1739 uaddr = (int *)uptr; 1740 1741 error = futex_load(uaddr, &oldval); 1742 if (__predict_false(error)) 1743 return; 1744 1745 /* 1746 * There are two race conditions we need to handle here: 1747 * 1748 * 1. User space cleared the futex word but died before 1749 * being able to issue the wakeup. No wakeups will 1750 * ever be issued, oops! 1751 * 1752 * 2. Awakened waiter died before being able to acquire 1753 * the futex in user space. Any other waiters are 1754 * now stuck, oops! 1755 * 1756 * In both of these cases, the futex word will be 0 (because 1757 * it's updated before the wake is issued). The best we can 1758 * do is detect this situation if it's the pending futex and 1759 * issue a wake without modifying the futex word. 1760 * 1761 * XXX eventual PI handling? 1762 */ 1763 if (__predict_false(is_pending && (oldval & ~FUTEX_WAITERS) == 0)) { 1764 register_t retval; 1765 (void) futex_func_wake(/*shared*/true, uaddr, 1, 1766 FUTEX_BITSET_MATCH_ANY, &retval); 1767 return; 1768 } 1769 1770 /* Optimistically test whether we need to do anything at all. */ 1771 if ((oldval & FUTEX_TID_MASK) != tid) 1772 return; 1773 1774 /* 1775 * We need to handle the case where this thread owned the futex, 1776 * but it was uncontended. In this case, there won't be any 1777 * kernel state to look up. All we can do is mark the futex 1778 * as a zombie to be mopped up the next time another thread 1779 * attempts to acquire it. 1780 * 1781 * N.B. It's important to ensure to set FUTEX_OWNER_DIED in 1782 * this loop, even if waiters appear while we're are doing 1783 * so. This is beause FUTEX_WAITERS is set by user space 1784 * before calling __futex() to wait, and the futex needs 1785 * to be marked as a zombie when the new waiter gets into 1786 * the kernel. 1787 */ 1788 if ((oldval & FUTEX_WAITERS) == 0) { 1789 do { 1790 error = futex_load(uaddr, &oldval); 1791 if (error) 1792 return; 1793 if ((oldval & FUTEX_TID_MASK) != tid) 1794 return; 1795 newval = oldval | FUTEX_OWNER_DIED; 1796 error = ucas_int(uaddr, oldval, newval, &actual); 1797 if (error) 1798 return; 1799 } while (actual != oldval); 1800 1801 /* 1802 * If where is still no indication of waiters, then there is 1803 * no more work for us to do. 1804 */ 1805 if ((oldval & FUTEX_WAITERS) == 0) 1806 return; 1807 } 1808 1809 /* 1810 * Look for a shared futex since we have no positive indication 1811 * it is private. If we can't, tough. 1812 */ 1813 error = futex_lookup(uaddr, /*shared*/true, &f); 1814 if (error) 1815 return; 1816 1817 /* 1818 * If there's no kernel state for this futex, there's nothing to 1819 * release. 1820 */ 1821 if (f == NULL) 1822 return; 1823 1824 /* Work under the futex queue lock. */ 1825 futex_queue_lock(f); 1826 1827 /* 1828 * Fetch the word: if the tid doesn't match ours, skip; 1829 * otherwise, set the owner-died bit, atomically. 1830 */ 1831 do { 1832 error = futex_load(uaddr, &oldval); 1833 if (error) 1834 goto out; 1835 if ((oldval & FUTEX_TID_MASK) != tid) 1836 goto out; 1837 newval = oldval | FUTEX_OWNER_DIED; 1838 error = ucas_int(uaddr, oldval, newval, &actual); 1839 if (error) 1840 goto out; 1841 } while (actual != oldval); 1842 1843 /* 1844 * If there may be waiters, try to wake one. If anything goes 1845 * wrong, tough. 1846 * 1847 * XXX eventual PI handling? 1848 */ 1849 if (oldval & FUTEX_WAITERS) 1850 (void)futex_wake(f, 1, NULL, 0, FUTEX_BITSET_MATCH_ANY); 1851 1852 /* Unlock the queue and release the futex. */ 1853out: futex_queue_unlock(f); 1854 futex_rele(f); 1855} 1856 1857/* 1858 * futex_robust_head_lookup(l, lwpid) 1859 * 1860 * Helper function to look up a robust head by LWP ID. 1861 */ 1862int 1863futex_robust_head_lookup(struct lwp *l, lwpid_t lwpid, void **headp) 1864{ 1865 struct proc *p = l->l_proc; 1866 1867 /* Find the other lwp, if requested; otherwise use our robust head. */ 1868 if (lwpid) { 1869 mutex_enter(p->p_lock); 1870 l = lwp_find(p, lwpid); 1871 if (l == NULL) { 1872 mutex_exit(p->p_lock); 1873 return ESRCH; 1874 } 1875 *headp = (void *)l->l_robust_head; 1876 mutex_exit(p->p_lock); 1877 } else { 1878 *headp = (void *)l->l_robust_head; 1879 } 1880 return 0; 1881} 1882 1883/* 1884 * futex_fetch_robust_head(uaddr) 1885 * 1886 * Helper routine to fetch the futex robust list head that 1887 * handles 32-bit binaries running on 64-bit kernels. 1888 */ 1889static int 1890futex_fetch_robust_head(uintptr_t uaddr, u_long *rhead) 1891{ 1892#ifdef _LP64 1893 if (curproc->p_flag & PK_32) { 1894 uint32_t rhead32[_FUTEX_ROBUST_HEAD_NWORDS]; 1895 int error; 1896 1897 error = copyin((void *)uaddr, rhead32, sizeof(rhead32)); 1898 if (__predict_true(error == 0)) { 1899 for (int i = 0; i < _FUTEX_ROBUST_HEAD_NWORDS; i++) { 1900 if (i == _FUTEX_ROBUST_HEAD_OFFSET) { 1901 /* 1902 * Make sure the offset is sign- 1903 * extended. 1904 */ 1905 rhead[i] = (int32_t)rhead32[i]; 1906 } else { 1907 rhead[i] = rhead32[i]; 1908 } 1909 } 1910 } 1911 return error; 1912 } 1913#endif /* _L64 */ 1914 1915 return copyin((void *)uaddr, rhead, 1916 sizeof(*rhead) * _FUTEX_ROBUST_HEAD_NWORDS); 1917} 1918 1919/* 1920 * futex_decode_robust_word(word) 1921 * 1922 * Decode a robust futex list word into the entry and entry 1923 * properties. 1924 */ 1925static inline void 1926futex_decode_robust_word(uintptr_t const word, uintptr_t * const entry, 1927 bool * const is_pi) 1928{ 1929 *is_pi = (word & _FUTEX_ROBUST_ENTRY_PI) ? true : false; 1930 *entry = word & ~_FUTEX_ROBUST_ENTRY_PI; 1931} 1932 1933/* 1934 * futex_fetch_robust_entry(uaddr) 1935 * 1936 * Helper routine to fetch and decode a robust futex entry 1937 * that handles 32-bit binaries running on 64-bit kernels. 1938 */ 1939static int 1940futex_fetch_robust_entry(uintptr_t const uaddr, uintptr_t * const valp, 1941 bool * const is_pi) 1942{ 1943 uintptr_t val = 0; 1944 int error = 0; 1945 1946#ifdef _LP64 1947 if (curproc->p_flag & PK_32) { 1948 uint32_t val32; 1949 1950 error = ufetch_32((uint32_t *)uaddr, &val32); 1951 if (__predict_true(error == 0)) 1952 val = val32; 1953 } else 1954#endif /* _LP64 */ 1955 error = ufetch_long((u_long *)uaddr, (u_long *)&val); 1956 if (__predict_false(error)) 1957 return error; 1958 1959 futex_decode_robust_word(val, valp, is_pi); 1960 return 0; 1961} 1962 1963/* 1964 * futex_release_all_lwp(l, tid) 1965 * 1966 * Release all l's robust futexes. If anything looks funny in 1967 * the process, give up -- it's userland's responsibility to dot 1968 * the i's and cross the t's. 1969 */ 1970void 1971futex_release_all_lwp(struct lwp * const l) 1972{ 1973 u_long rhead[_FUTEX_ROBUST_HEAD_NWORDS]; 1974 int limit = 1000000; 1975 int error; 1976 1977 /* If there's no robust list there's nothing to do. */ 1978 if (l->l_robust_head == 0) 1979 return; 1980 1981 KASSERT((l->l_lid & FUTEX_TID_MASK) == l->l_lid); 1982 1983 /* Read the final snapshot of the robust list head. */ 1984 error = futex_fetch_robust_head(l->l_robust_head, rhead); 1985 if (error) { 1986 printf("WARNING: pid %jd (%s) lwp %jd:" 1987 " unmapped robust futex list head\n", 1988 (uintmax_t)l->l_proc->p_pid, l->l_proc->p_comm, 1989 (uintmax_t)l->l_lid); 1990 return; 1991 } 1992 1993 const long offset = (long)rhead[_FUTEX_ROBUST_HEAD_OFFSET]; 1994 1995 uintptr_t next, pending; 1996 bool is_pi, pending_is_pi; 1997 1998 futex_decode_robust_word(rhead[_FUTEX_ROBUST_HEAD_LIST], 1999 &next, &is_pi); 2000 futex_decode_robust_word(rhead[_FUTEX_ROBUST_HEAD_PENDING], 2001 &pending, &pending_is_pi); 2002 2003 /* 2004 * Walk down the list of locked futexes and release them, up 2005 * to one million of them before we give up. 2006 */ 2007 2008 while (next != l->l_robust_head && limit-- > 0) { 2009 /* pending handled below. */ 2010 if (next != pending) 2011 release_futex(next + offset, l->l_lid, is_pi, false); 2012 error = futex_fetch_robust_entry(next, &next, &is_pi); 2013 if (error) 2014 break; 2015 preempt_point(); 2016 } 2017 if (limit <= 0) { 2018 printf("WARNING: pid %jd (%s) lwp %jd:" 2019 " exhausted robust futex limit\n", 2020 (uintmax_t)l->l_proc->p_pid, l->l_proc->p_comm, 2021 (uintmax_t)l->l_lid); 2022 } 2023 2024 /* If there's a pending futex, it may need to be released too. */ 2025 if (pending != 0) { 2026 release_futex(pending + offset, l->l_lid, pending_is_pi, true); 2027 } 2028} 2029