/* cloudabi_futex.c revision 285908 */
1/*- 2 * Copyright (c) 2015 Nuxi, https://nuxi.nl/ 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * 2. Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * 13 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 16 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 23 * SUCH DAMAGE. 
24 */ 25 26#include <sys/cdefs.h> 27__FBSDID("$FreeBSD: head/sys/compat/cloudabi/cloudabi_futex.c 285908 2015-07-27 10:07:29Z ed $"); 28 29#include <sys/param.h> 30#include <sys/kernel.h> 31#include <sys/limits.h> 32#include <sys/lock.h> 33#include <sys/malloc.h> 34#include <sys/mutex.h> 35#include <sys/proc.h> 36#include <sys/sx.h> 37#include <sys/systm.h> 38 39#include <vm/vm.h> 40#include <vm/vm_param.h> 41#include <vm/pmap.h> 42#include <vm/vm_extern.h> 43#include <vm/vm_map.h> 44#include <vm/vm_object.h> 45 46#include <compat/cloudabi/cloudabi_proto.h> 47#include <compat/cloudabi/cloudabi_syscalldefs.h> 48#include <compat/cloudabi/cloudabi_util.h> 49 50/* 51 * Futexes for CloudABI. 52 * 53 * On most systems, futexes are implemented as objects of a single type 54 * on which a set of operations can be performed. CloudABI makes a clear 55 * distinction between locks and condition variables. A lock may have 56 * zero or more associated condition variables. A condition variable is 57 * always associated with exactly one lock. There is a strict topology. 58 * This approach has two advantages: 59 * 60 * - This topology is guaranteed to be acyclic. Requeueing of threads 61 * only happens in one direction (from condition variables to locks). 62 * This eases locking. 63 * - It means that a futex object for a lock exists when it is unlocked, 64 * but has threads waiting on associated condition variables. Threads 65 * can be requeued to a lock even if the thread performing the wakeup 66 * does not have the lock mapped in its address space. 67 * 68 * This futex implementation only implements a single lock type, namely 69 * a read-write lock. A regular mutex type would not be necessary, as 70 * the read-write lock is as efficient as a mutex if used as such. 71 * Userspace futex locks are 32 bits in size: 72 * 73 * - 1 bit: has threads waiting in kernel-space. 74 * - 1 bit: is write-locked. 75 * - 30 bits: 76 * - if write-locked: thread ID of owner. 
 * - if not write-locked: number of read locks held.
 *
 * Condition variables are also 32 bits in size. Its value is modified
 * by kernel-space exclusively. Zero indicates that it has no waiting
 * threads. Non-zero indicates the opposite.
 *
 * This implementation is optimal, in the sense that it only wakes up
 * threads if they can actually continue execution. It does not suffer
 * from the thundering herd problem. If multiple threads waiting on a
 * condition variable need to be woken up, only a single thread is
 * scheduled. All other threads are 'donated' to this thread. After the
 * thread manages to reacquire the lock, it requeues its donated threads
 * to the lock.
 *
 * TODO(ed): Integrate this functionality into kern_umtx.c instead.
 * TODO(ed): Store futex objects in a hash table.
 * TODO(ed): Add actual priority inheritance.
 * TODO(ed): Let futex_queue also take priorities into account.
 * TODO(ed): Make locking fine-grained.
 * TODO(ed): Perform sleeps until an actual absolute point in time,
 *           instead of converting the timestamp to a relative value.
 */

/* Forward declarations, so the structures below may reference each other. */
struct futex_address;
struct futex_condvar;
struct futex_lock;
struct futex_queue;
struct futex_waiter;

/* Identifier of a location in memory. */
struct futex_address {
	/* For process-private objects: address space of the process. */
	struct vmspace *fa_vmspace;
	/* For process-shared objects: VM object containing the object. */
	struct vm_object *fa_vmobject;

	/* Memory address within address space or offset within VM object. */
	uintptr_t fa_offset;
};

/* A set of waiting threads: FIFO list plus a cached element count. */
struct futex_queue {
	STAILQ_HEAD(, futex_waiter) fq_list;
	/* Number of waiters currently on fq_list. */
	unsigned int fq_count;
};

/* Condition variables. */
struct futex_condvar {
	/* Address of the condition variable. */
	struct futex_address fc_address;

	/* The lock the waiters should be moved to when signalled. */
	struct futex_lock *fc_lock;

	/* Threads waiting on the condition variable. */
	struct futex_queue fc_waiters;
	/*
	 * Number of threads blocked on this condition variable, or
	 * being blocked on the lock after being requeued.
	 */
	unsigned int fc_waitcount;

	/* Global list pointers. */
	LIST_ENTRY(futex_condvar) fc_next;
};

/* Read-write locks. */
struct futex_lock {
	/* Address of the lock. */
	struct futex_address fl_address;

	/*
	 * Current owner of the lock. LOCK_UNMANAGED if the lock is
	 * currently not owned by the kernel. LOCK_OWNER_UNKNOWN in case
	 * the owner is not known (e.g., when the lock is read-locked).
	 */
	cloudabi_tid_t fl_owner;
#define LOCK_UNMANAGED 0x0
#define LOCK_OWNER_UNKNOWN 0x1

	/* Writers blocked on the lock. */
	struct futex_queue fl_writers;
	/* Readers blocked on the lock. */
	struct futex_queue fl_readers;
	/* Number of threads blocked on this lock + condition variables. */
	unsigned int fl_waitcount;

	/* Global list pointers. */
	LIST_ENTRY(futex_lock) fl_next;
};

/* Information associated with a thread blocked on an object. */
struct futex_waiter {
	/* Thread ID. */
	cloudabi_tid_t fw_tid;
	/* Condition variable used for waiting. */
	struct cv fw_wait;

	/* Queue this waiter is currently placed in. */
	struct futex_queue *fw_queue;
	/* List pointers of fw_queue. */
	STAILQ_ENTRY(futex_waiter) fw_next;

	/* Lock has been acquired. */
	bool fw_locked;
	/* If not locked, threads that should block after acquiring. */
	struct futex_queue fw_donated;
};

/* Global data structures. */
static MALLOC_DEFINE(M_FUTEX, "futex", "CloudABI futex");

/* Serializes all accesses to the lock and condition variable lists below. */
static struct sx futex_global_lock;
SX_SYSINIT(futex_global_lock, &futex_global_lock, "CloudABI futex global lock");

static LIST_HEAD(, futex_lock) futex_lock_list =
    LIST_HEAD_INITIALIZER(&futex_lock_list);
static LIST_HEAD(, futex_condvar) futex_condvar_list =
    LIST_HEAD_INITIALIZER(&futex_condvar_list);

/* Utility functions. */
static void futex_lock_assert(const struct futex_lock *);
static struct futex_lock *futex_lock_lookup_locked(struct futex_address *);
static void futex_lock_release(struct futex_lock *);
static int futex_lock_tryrdlock(struct futex_lock *, cloudabi_lock_t *);
static int futex_lock_unmanage(struct futex_lock *, cloudabi_lock_t *);
static int futex_lock_update_owner(struct futex_lock *, cloudabi_lock_t *);
static int futex_lock_wake_up_next(struct futex_lock *, cloudabi_lock_t *);
static unsigned int futex_queue_count(const struct futex_queue *);
static void futex_queue_init(struct futex_queue *);
static void futex_queue_requeue(struct futex_queue *, struct futex_queue *,
    unsigned int);
static int futex_queue_sleep(struct futex_queue *, struct futex_lock *,
    struct futex_waiter *, struct thread *, cloudabi_clockid_t,
    cloudabi_timestamp_t, cloudabi_timestamp_t);
static cloudabi_tid_t futex_queue_tid_best(const struct futex_queue *);
static void futex_queue_wake_up_all(struct futex_queue *);
static void futex_queue_wake_up_best(struct futex_queue *);
static void futex_queue_wake_up_donate(struct futex_queue *, unsigned int);
static int futex_user_load(uint32_t *, uint32_t *);
static int futex_user_store(uint32_t *, uint32_t);
static int futex_user_cmpxchg(uint32_t *, uint32_t, uint32_t *, uint32_t);

/*
 * futex_address operations.
 */

/*
 * Computes a stable identity (futex_address) for a userspace object
 * located at `object`, using the userspace-supplied scope hint.
 * Shared mappings are identified by (VM object, offset); private
 * mappings by (vmspace, virtual address).
 */
static int
futex_address_create(struct futex_address *fa, struct thread *td,
    const void *object, cloudabi_mflags_t scope)
{
	struct vmspace *vs;
	struct vm_object *vo;
	vm_map_t map;
	vm_map_entry_t entry;
	vm_pindex_t pindex;
	vm_prot_t prot;
	boolean_t wired;

	/*
	 * Most of the time objects are stored in privately mapped
	 * anonymous memory. For these objects we wouldn't need to look
	 * up the corresponding VM object. The scope hint provided by
	 * userspace allows us to skip the VM map lookup for the common
	 * case.
	 *
	 * POSIX does permit enabling PTHREAD_PROCESS_SHARED on a lock
	 * stored in a private mapping, at the cost of additional
	 * performance overhead. Fall back to identifying the object by
	 * virtual memory address if the mapping isn't shared.
	 */
	vs = td->td_proc->p_vmspace;
	switch (scope) {
	case CLOUDABI_MAP_SHARED:
		map = &vs->vm_map;
		if (vm_map_lookup(&map, (vm_offset_t)object,
		    VM_PROT_COPY | VM_PROT_WRITE, &entry, &vo, &pindex, &prot,
		    &wired) != KERN_SUCCESS)
			return (EFAULT);

		if (entry->inheritance == VM_INHERIT_SHARE) {
			/*
			 * Address corresponds to a shared mapping.
			 * Identify the address by its VM object.
			 * The reference taken here is dropped by
			 * futex_address_free().
			 */
			fa->fa_vmspace = NULL;
			fa->fa_vmobject = vo;
			vm_object_reference(vo);
			fa->fa_offset = entry->offset - entry->start +
			    (vm_offset_t)object;
			vm_map_lookup_done(map, entry);
			return (0);
		}
		vm_map_lookup_done(map, entry);
		/* FALLTHROUGH */
	case CLOUDABI_MAP_PRIVATE:
		/*
		 * Address corresponds to a private mapping. Never
		 * identify the address by its VM object, as shadow
		 * objects may get inserted if another thread forks.
		 * Simply use the VM space instead.
278 */ 279 fa->fa_vmspace = vs; 280 fa->fa_vmobject = NULL; 281 fa->fa_offset = (uintptr_t)object; 282 return (0); 283 default: 284 return (EINVAL); 285 } 286} 287 288static void 289futex_address_free(struct futex_address *fa) 290{ 291 292 if (fa->fa_vmobject != NULL) 293 vm_object_deallocate(fa->fa_vmobject); 294} 295 296static bool 297futex_address_match(const struct futex_address *fa1, 298 const struct futex_address *fa2) 299{ 300 301 /* Either fa_vmspace or fa_vmobject is NULL. */ 302 return (fa1->fa_vmspace == fa2->fa_vmspace && 303 fa1->fa_vmobject == fa2->fa_vmobject && 304 fa1->fa_offset == fa2->fa_offset); 305} 306 307/* 308 * futex_condvar operations. 309 */ 310 311static void 312futex_condvar_assert(const struct futex_condvar *fc) 313{ 314 315 KASSERT(fc->fc_waitcount >= futex_queue_count(&fc->fc_waiters), 316 ("Total number of waiters cannot be smaller than the wait queue")); 317 futex_lock_assert(fc->fc_lock); 318} 319 320static int 321futex_condvar_lookup(struct thread *td, const cloudabi_condvar_t *address, 322 cloudabi_mflags_t scope, struct futex_condvar **fcret) 323{ 324 struct futex_address fa_condvar; 325 struct futex_condvar *fc; 326 int error; 327 328 error = futex_address_create(&fa_condvar, td, address, scope); 329 if (error != 0) 330 return (error); 331 332 sx_xlock(&futex_global_lock); 333 LIST_FOREACH(fc, &futex_condvar_list, fc_next) { 334 if (futex_address_match(&fc->fc_address, &fa_condvar)) { 335 /* Found matching lock object. 
			 */
			futex_address_free(&fa_condvar);
			futex_condvar_assert(fc);
			*fcret = fc;
			return (0);
		}
	}
	sx_xunlock(&futex_global_lock);
	futex_address_free(&fa_condvar);
	return (ENOENT);
}

/*
 * Looks up a condition variable object, creating it if it does not
 * exist yet. Fails with EINVAL if the condition variable is already
 * bound to a different lock. On success, returns with the global futex
 * lock held.
 */
static int
futex_condvar_lookup_or_create(struct thread *td,
    const cloudabi_condvar_t *condvar, cloudabi_mflags_t condvar_scope,
    const cloudabi_lock_t *lock, cloudabi_mflags_t lock_scope,
    struct futex_condvar **fcret)
{
	struct futex_address fa_condvar, fa_lock;
	struct futex_condvar *fc;
	struct futex_lock *fl;
	int error;

	error = futex_address_create(&fa_condvar, td, condvar, condvar_scope);
	if (error != 0)
		return (error);
	error = futex_address_create(&fa_lock, td, lock, lock_scope);
	if (error != 0) {
		futex_address_free(&fa_condvar);
		return (error);
	}

	sx_xlock(&futex_global_lock);
	LIST_FOREACH(fc, &futex_condvar_list, fc_next) {
		if (!futex_address_match(&fc->fc_address, &fa_condvar))
			continue;
		fl = fc->fc_lock;
		if (!futex_address_match(&fl->fl_address, &fa_lock)) {
			/* Condition variable is owned by a different lock. */
			futex_address_free(&fa_condvar);
			futex_address_free(&fa_lock);
			sx_xunlock(&futex_global_lock);
			return (EINVAL);
		}

		/* Found fully matching condition variable. */
		futex_address_free(&fa_condvar);
		futex_address_free(&fa_lock);
		futex_condvar_assert(fc);
		*fcret = fc;
		return (0);
	}

	/* None found. Create new condition variable object.
	 */
	fc = malloc(sizeof(*fc), M_FUTEX, M_WAITOK);
	fc->fc_address = fa_condvar;
	/* Takes ownership of fa_lock (or frees it on a list hit). */
	fc->fc_lock = futex_lock_lookup_locked(&fa_lock);
	futex_queue_init(&fc->fc_waiters);
	fc->fc_waitcount = 0;
	LIST_INSERT_HEAD(&futex_condvar_list, fc, fc_next);
	*fcret = fc;
	return (0);
}

/*
 * Drops a reference on a condition variable, deallocating it when no
 * threads reference it anymore. Also releases the global futex lock
 * (through futex_lock_release()).
 */
static void
futex_condvar_release(struct futex_condvar *fc)
{
	struct futex_lock *fl;

	futex_condvar_assert(fc);
	fl = fc->fc_lock;
	if (fc->fc_waitcount == 0) {
		/* Condition variable has no waiters. Deallocate it. */
		futex_address_free(&fc->fc_address);
		LIST_REMOVE(fc, fc_next);
		free(fc, M_FUTEX);
	}
	futex_lock_release(fl);
}

/*
 * Clears the userspace condition variable word once the kernel no
 * longer tracks any waiters for it, so userspace can skip calling into
 * the kernel on future signals.
 */
static int
futex_condvar_unmanage(struct futex_condvar *fc,
    cloudabi_condvar_t *condvar)
{

	if (futex_queue_count(&fc->fc_waiters) != 0)
		return (0);
	return (futex_user_store(condvar, CLOUDABI_CONDVAR_HAS_NO_WAITERS));
}

/*
 * futex_lock operations.
 */

/* Validates the invariants of a lock object. */
static void
futex_lock_assert(const struct futex_lock *fl)
{

	/*
	 * A futex lock can only be kernel-managed if it has waiters.
	 * Vice versa: if a futex lock has waiters, it must be
	 * kernel-managed.
437 */ 438 KASSERT((fl->fl_owner == LOCK_UNMANAGED) == 439 (futex_queue_count(&fl->fl_readers) == 0 && 440 futex_queue_count(&fl->fl_writers) == 0), 441 ("Managed locks must have waiting threads")); 442 KASSERT(fl->fl_waitcount != 0 || fl->fl_owner == LOCK_UNMANAGED, 443 ("Lock with no waiters must be unmanaged")); 444} 445 446static int 447futex_lock_lookup(struct thread *td, const cloudabi_lock_t *address, 448 cloudabi_mflags_t scope, struct futex_lock **flret) 449{ 450 struct futex_address fa; 451 int error; 452 453 error = futex_address_create(&fa, td, address, scope); 454 if (error != 0) 455 return (error); 456 457 sx_xlock(&futex_global_lock); 458 *flret = futex_lock_lookup_locked(&fa); 459 return (0); 460} 461 462static struct futex_lock * 463futex_lock_lookup_locked(struct futex_address *fa) 464{ 465 struct futex_lock *fl; 466 467 LIST_FOREACH(fl, &futex_lock_list, fl_next) { 468 if (futex_address_match(&fl->fl_address, fa)) { 469 /* Found matching lock object. */ 470 futex_address_free(fa); 471 futex_lock_assert(fl); 472 return (fl); 473 } 474 } 475 476 /* None found. Create new lock object. */ 477 fl = malloc(sizeof(*fl), M_FUTEX, M_WAITOK); 478 fl->fl_address = *fa; 479 fl->fl_owner = LOCK_UNMANAGED; 480 futex_queue_init(&fl->fl_readers); 481 futex_queue_init(&fl->fl_writers); 482 fl->fl_waitcount = 0; 483 LIST_INSERT_HEAD(&futex_lock_list, fl, fl_next); 484 return (fl); 485} 486 487static int 488futex_lock_rdlock(struct futex_lock *fl, struct thread *td, 489 cloudabi_lock_t *lock, cloudabi_clockid_t clock_id, 490 cloudabi_timestamp_t timeout, cloudabi_timestamp_t precision) 491{ 492 struct futex_waiter fw; 493 int error; 494 495 error = futex_lock_tryrdlock(fl, lock); 496 if (error == EBUSY) { 497 /* Suspend execution. 
		 */
		KASSERT(fl->fl_owner != LOCK_UNMANAGED,
		    ("Attempted to sleep on an unmanaged lock"));
		error = futex_queue_sleep(&fl->fl_readers, fl, &fw, td,
		    clock_id, timeout, precision);
		KASSERT((error == 0) == fw.fw_locked,
		    ("Should have locked write lock on success"));
		KASSERT(futex_queue_count(&fw.fw_donated) == 0,
		    ("Lock functions cannot receive threads"));
	}
	if (error != 0)
		futex_lock_unmanage(fl, lock);
	return (error);
}

/*
 * Drops a reference on a lock object, deallocating it when it has no
 * waiters left, and releases the global futex lock.
 */
static void
futex_lock_release(struct futex_lock *fl)
{

	futex_lock_assert(fl);
	if (fl->fl_waitcount == 0) {
		/* Lock object is unreferenced. Deallocate it. */
		KASSERT(fl->fl_owner == LOCK_UNMANAGED,
		    ("Attempted to free a managed lock"));
		futex_address_free(&fl->fl_address);
		LIST_REMOVE(fl, fl_next);
		free(fl, M_FUTEX);
	}
	sx_xunlock(&futex_global_lock);
}

/*
 * Clears the kernel-managed bit in the userspace lock word once no
 * threads are blocked on the lock anymore.
 */
static int
futex_lock_unmanage(struct futex_lock *fl, cloudabi_lock_t *lock)
{
	cloudabi_lock_t cmp, old;
	int error;

	if (futex_queue_count(&fl->fl_readers) == 0 &&
	    futex_queue_count(&fl->fl_writers) == 0) {
		/* Lock should be unmanaged. */
		fl->fl_owner = LOCK_UNMANAGED;

		/* Clear kernel-managed bit, retrying on races. */
		error = futex_user_load(lock, &old);
		if (error != 0)
			return (error);
		for (;;) {
			cmp = old;
			error = futex_user_cmpxchg(lock, cmp, &old,
			    cmp & ~CLOUDABI_LOCK_KERNEL_MANAGED);
			if (error != 0)
				return (error);
			if (old == cmp)
				break;
		}
	}
	return (0);
}

/* Sets an owner of a lock, based on a userspace lock value. */
static void
futex_lock_set_owner(struct futex_lock *fl, cloudabi_lock_t lock)
{

	/* Lock has no explicit owner. */
	if ((lock & ~CLOUDABI_LOCK_WRLOCKED) == 0) {
		fl->fl_owner = LOCK_OWNER_UNKNOWN;
		return;
	}
	lock &= ~(CLOUDABI_LOCK_WRLOCKED | CLOUDABI_LOCK_KERNEL_MANAGED);

	/* Don't allow userspace to silently unlock.
	 */
	if (lock == LOCK_UNMANAGED) {
		fl->fl_owner = LOCK_OWNER_UNKNOWN;
		return;
	}
	fl->fl_owner = lock;
}

/*
 * Unlocks a lock on behalf of the calling thread, waking up or
 * requeueing any blocked threads. Fails with EPERM if the calling
 * thread does not own the (write-locked) lock.
 */
static int
futex_lock_unlock(struct futex_lock *fl, struct thread *td,
    cloudabi_lock_t *lock)
{
	int error;

	/* Validate that this thread is allowed to unlock. */
	error = futex_lock_update_owner(fl, lock);
	if (error != 0)
		return (error);
	if (fl->fl_owner != LOCK_UNMANAGED && fl->fl_owner != td->td_tid)
		return (EPERM);
	return (futex_lock_wake_up_next(fl, lock));
}

/* Syncs in the owner of the lock from userspace if needed. */
static int
futex_lock_update_owner(struct futex_lock *fl, cloudabi_lock_t *address)
{
	cloudabi_lock_t lock;
	int error;

	if (fl->fl_owner == LOCK_OWNER_UNKNOWN) {
		error = futex_user_load(address, &lock);
		if (error != 0)
			return (error);
		futex_lock_set_owner(fl, lock);
	}
	return (0);
}

/*
 * Attempts to acquire a read lock directly on the userspace lock word.
 * Returns 0 on success, or EBUSY after marking the lock kernel-managed
 * so the caller can block on it.
 */
static int
futex_lock_tryrdlock(struct futex_lock *fl, cloudabi_lock_t *address)
{
	cloudabi_lock_t old, cmp;
	int error;

	if (fl->fl_owner != LOCK_UNMANAGED) {
		/* Lock is already acquired. */
		return (EBUSY);
	}

	old = CLOUDABI_LOCK_UNLOCKED;
	for (;;) {
		if ((old & CLOUDABI_LOCK_KERNEL_MANAGED) != 0) {
			/*
			 * Userspace lock is kernel-managed, even though
			 * the kernel disagrees.
			 */
			return (EINVAL);
		}

		if ((old & CLOUDABI_LOCK_WRLOCKED) == 0) {
			/*
			 * Lock is not write-locked. Attempt to acquire
			 * it by increasing the read count.
			 */
			cmp = old;
			error = futex_user_cmpxchg(address, cmp, &old, cmp + 1);
			if (error != 0)
				return (error);
			if (old == cmp) {
				/* Success. */
				return (0);
			}
		} else {
			/* Lock is write-locked. Make it kernel-managed.
			 */
			cmp = old;
			error = futex_user_cmpxchg(address, cmp, &old,
			    cmp | CLOUDABI_LOCK_KERNEL_MANAGED);
			if (error != 0)
				return (error);
			if (old == cmp) {
				/* Success. */
				futex_lock_set_owner(fl, cmp);
				return (EBUSY);
			}
		}
	}
}

/*
 * Attempts to acquire a write lock directly on the userspace lock
 * word. Returns 0 on success, EBUSY after marking the lock
 * kernel-managed when contended, or EDEADLK on recursive acquisition.
 */
static int
futex_lock_trywrlock(struct futex_lock *fl, cloudabi_lock_t *address,
    cloudabi_tid_t tid, bool force_kernel_managed)
{
	cloudabi_lock_t old, new, cmp;
	int error;

	if (fl->fl_owner == tid) {
		/* Attempted to acquire lock recursively. */
		return (EDEADLK);
	}
	if (fl->fl_owner != LOCK_UNMANAGED) {
		/* Lock is already acquired. */
		return (EBUSY);
	}

	old = CLOUDABI_LOCK_UNLOCKED;
	for (;;) {
		if ((old & CLOUDABI_LOCK_KERNEL_MANAGED) != 0) {
			/*
			 * Userspace lock is kernel-managed, even though
			 * the kernel disagrees.
			 */
			return (EINVAL);
		}
		if (old == (tid | CLOUDABI_LOCK_WRLOCKED)) {
			/* Attempted to acquire lock recursively. */
			return (EDEADLK);
		}

		if (old == CLOUDABI_LOCK_UNLOCKED) {
			/* Lock is unlocked. Attempt to acquire it. */
			new = tid | CLOUDABI_LOCK_WRLOCKED;
			if (force_kernel_managed)
				new |= CLOUDABI_LOCK_KERNEL_MANAGED;
			error = futex_user_cmpxchg(address,
			    CLOUDABI_LOCK_UNLOCKED, &old, new);
			if (error != 0)
				return (error);
			if (old == CLOUDABI_LOCK_UNLOCKED) {
				/* Success. */
				if (force_kernel_managed)
					fl->fl_owner = tid;
				return (0);
			}
		} else {
			/* Lock is still locked. Make it kernel-managed. */
			cmp = old;
			error = futex_user_cmpxchg(address, cmp, &old,
			    cmp | CLOUDABI_LOCK_KERNEL_MANAGED);
			if (error != 0)
				return (error);
			if (old == cmp) {
				/* Success.
				 */
				futex_lock_set_owner(fl, cmp);
				return (EBUSY);
			}
		}
	}
}

/*
 * Hands the lock over to the next waiter(s), updating the userspace
 * lock word accordingly. Writers are preferred over readers to prevent
 * write starvation.
 */
static int
futex_lock_wake_up_next(struct futex_lock *fl, cloudabi_lock_t *lock)
{
	cloudabi_tid_t tid;
	int error;

	/*
	 * Determine which thread(s) to wake up. Prefer waking up
	 * writers over readers to prevent write starvation.
	 */
	if (futex_queue_count(&fl->fl_writers) > 0) {
		/* Transfer ownership to a single write-locker. */
		if (futex_queue_count(&fl->fl_writers) > 1 ||
		    futex_queue_count(&fl->fl_readers) > 0) {
			/* Lock should remain managed afterwards. */
			tid = futex_queue_tid_best(&fl->fl_writers);
			error = futex_user_store(lock,
			    tid | CLOUDABI_LOCK_WRLOCKED |
			    CLOUDABI_LOCK_KERNEL_MANAGED);
			if (error != 0)
				return (error);

			futex_queue_wake_up_best(&fl->fl_writers);
			fl->fl_owner = tid;
		} else {
			/* Lock can become unmanaged afterwards. */
			error = futex_user_store(lock,
			    futex_queue_tid_best(&fl->fl_writers) |
			    CLOUDABI_LOCK_WRLOCKED);
			if (error != 0)
				return (error);

			futex_queue_wake_up_best(&fl->fl_writers);
			fl->fl_owner = LOCK_UNMANAGED;
		}
	} else {
		/* Transfer ownership to all read-lockers (if any). */
		error = futex_user_store(lock,
		    futex_queue_count(&fl->fl_readers));
		if (error != 0)
			return (error);

		/* Wake up all threads. */
		futex_queue_wake_up_all(&fl->fl_readers);
		fl->fl_owner = LOCK_UNMANAGED;
	}
	return (0);
}

/*
 * Acquires a write lock on a userspace lock, sleeping on the writer
 * queue when it is contended. Threads in `donated` are transferred to
 * this lock's writer queue (or handed off to another waiter on error).
 */
static int
futex_lock_wrlock(struct futex_lock *fl, struct thread *td,
    cloudabi_lock_t *lock, cloudabi_clockid_t clock_id,
    cloudabi_timestamp_t timeout, cloudabi_timestamp_t precision,
    struct futex_queue *donated)
{
	struct futex_waiter fw;
	int error;

	error = futex_lock_trywrlock(fl, lock, td->td_tid,
	    futex_queue_count(donated) > 0);

	if (error == 0 || error == EBUSY) {
		/* Put donated threads in queue before suspending.
		 */
		KASSERT(futex_queue_count(donated) == 0 ||
		    fl->fl_owner != LOCK_UNMANAGED,
		    ("Lock should be managed if we are going to donate"));
		futex_queue_requeue(donated, &fl->fl_writers, UINT_MAX);
	} else {
		/*
		 * This thread cannot deal with the donated threads.
		 * Wake up the next thread and let it try it by itself.
		 */
		futex_queue_wake_up_donate(donated, UINT_MAX);
	}

	if (error == EBUSY) {
		/* Suspend execution if the lock was busy. */
		KASSERT(fl->fl_owner != LOCK_UNMANAGED,
		    ("Attempted to sleep on an unmanaged lock"));
		error = futex_queue_sleep(&fl->fl_writers, fl, &fw, td,
		    clock_id, timeout, precision);
		KASSERT((error == 0) == fw.fw_locked,
		    ("Should have locked write lock on success"));
		KASSERT(futex_queue_count(&fw.fw_donated) == 0,
		    ("Lock functions cannot receive threads"));
	}
	if (error != 0)
		futex_lock_unmanage(fl, lock);
	return (error);
}

/*
 * futex_queue operations.
 */

/* Returns the thread ID of the first waiter in a queue. */
static cloudabi_tid_t
futex_queue_tid_best(const struct futex_queue *fq)
{

	return (STAILQ_FIRST(&fq->fq_list)->fw_tid);
}

/* Returns the number of waiters in a queue. */
static unsigned int
futex_queue_count(const struct futex_queue *fq)
{

	return (fq->fq_count);
}

/* Initializes a queue to the empty state. */
static void
futex_queue_init(struct futex_queue *fq)
{

	STAILQ_INIT(&fq->fq_list);
	fq->fq_count = 0;
}

/* Converts a relative timestamp to an sbintime. */
static sbintime_t
futex_queue_convert_timestamp_relative(cloudabi_timestamp_t ts)
{
	cloudabi_timestamp_t s, ns;

	s = ts / 1000000000;
	ns = ts % 1000000000;
	/* Saturate instead of overflowing the 32.32 fixed-point format. */
	if (s > INT32_MAX)
		return (INT64_MAX);
	return ((s << 32) + (ns << 32) / 1000000000);
}

/* Converts an absolute timestamp and precision to a pair of sbintime values. */
static int
futex_queue_convert_timestamp(struct thread *td, cloudabi_clockid_t clock_id,
    cloudabi_timestamp_t timeout, cloudabi_timestamp_t precision,
    sbintime_t *sbttimeout, sbintime_t *sbtprecision)
{
	cloudabi_timestamp_t now;
	int error;

	/* Make the time relative. */
	error = cloudabi_clock_time_get(td, clock_id, &now);
	if (error != 0)
		return (error);
	timeout = timeout < now ? 0 : timeout - now;

	*sbttimeout = futex_queue_convert_timestamp_relative(timeout);
	*sbtprecision = futex_queue_convert_timestamp_relative(precision);
	return (0);
}

/*
 * Blocks the calling thread on a queue until it gets woken up or the
 * absolute timeout expires. Called with the global futex lock held;
 * the lock is dropped while sleeping. A timeout of UINT64_MAX means
 * wait forever.
 */
static int
futex_queue_sleep(struct futex_queue *fq, struct futex_lock *fl,
    struct futex_waiter *fw, struct thread *td, cloudabi_clockid_t clock_id,
    cloudabi_timestamp_t timeout, cloudabi_timestamp_t precision)
{
	sbintime_t sbttimeout, sbtprecision;
	int error;

	/* Initialize futex_waiter object. */
	fw->fw_tid = td->td_tid;
	fw->fw_locked = false;
	futex_queue_init(&fw->fw_donated);

	if (timeout != UINT64_MAX) {
		/* Convert timeout duration. */
		error = futex_queue_convert_timestamp(td, clock_id, timeout,
		    precision, &sbttimeout, &sbtprecision);
		if (error != 0)
			return (error);
	}

	/* Place object in the queue. */
	fw->fw_queue = fq;
	STAILQ_INSERT_TAIL(&fq->fq_list, fw, fw_next);
	++fq->fq_count;

	cv_init(&fw->fw_wait, "futex");
	++fl->fl_waitcount;

	futex_lock_assert(fl);
	if (timeout == UINT64_MAX) {
		/* Wait without a timeout. */
		error = cv_wait_sig(&fw->fw_wait, &futex_global_lock);
	} else {
		/* Wait respecting the timeout.
		 */
		error = cv_timedwait_sig_sbt(&fw->fw_wait, &futex_global_lock,
		    sbttimeout, sbtprecision, 0);
		futex_lock_assert(fl);
		if (error == EWOULDBLOCK &&
		    fw->fw_queue != NULL && fw->fw_queue != fq) {
			/*
			 * We got signalled on a condition variable, but
			 * observed a timeout while waiting to reacquire
			 * the lock. In other words, we didn't actually
			 * time out. Go back to sleep and wait for the
			 * lock to be reacquired.
			 */
			error = cv_wait_sig(&fw->fw_wait, &futex_global_lock);
		}
	}
	futex_lock_assert(fl);

	--fl->fl_waitcount;
	cv_destroy(&fw->fw_wait);

	fq = fw->fw_queue;
	if (fq == NULL) {
		/* Thread got dequeued, so we've slept successfully. */
		return (0);
	}

	/* Thread is still enqueued. Remove it. */
	KASSERT(error != 0, ("Woken up thread is still enqueued"));
	STAILQ_REMOVE(&fq->fq_list, fw, futex_waiter, fw_next);
	--fq->fq_count;
	/* Map the sleep primitive's timeout error to the syscall's. */
	return (error == EWOULDBLOCK ? ETIMEDOUT : error);
}

/* Moves up to nwaiters waiters from one queue to another. */
static void
futex_queue_requeue(struct futex_queue *fqfrom, struct futex_queue *fqto,
    unsigned int nwaiters)
{
	struct futex_waiter *fw;

	/* Move waiters to the target queue. */
	while (nwaiters-- > 0 && !STAILQ_EMPTY(&fqfrom->fq_list)) {
		fw = STAILQ_FIRST(&fqfrom->fq_list);
		STAILQ_REMOVE_HEAD(&fqfrom->fq_list, fw_next);
		--fqfrom->fq_count;

		fw->fw_queue = fqto;
		STAILQ_INSERT_TAIL(&fqto->fq_list, fw, fw_next);
		++fqto->fq_count;
	}
}

/* Wakes up all waiters in a queue.
*/ 956static void 957futex_queue_wake_up_all(struct futex_queue *fq) 958{ 959 struct futex_waiter *fw; 960 961 STAILQ_FOREACH(fw, &fq->fq_list, fw_next) { 962 fw->fw_locked = true; 963 fw->fw_queue = NULL; 964 cv_signal(&fw->fw_wait); 965 } 966 967 STAILQ_INIT(&fq->fq_list); 968 fq->fq_count = 0; 969} 970 971/* 972 * Wakes up the best waiter (i.e., the waiter having the highest 973 * priority) in a queue. 974 */ 975static void 976futex_queue_wake_up_best(struct futex_queue *fq) 977{ 978 struct futex_waiter *fw; 979 980 fw = STAILQ_FIRST(&fq->fq_list); 981 fw->fw_locked = true; 982 fw->fw_queue = NULL; 983 cv_signal(&fw->fw_wait); 984 985 STAILQ_REMOVE_HEAD(&fq->fq_list, fw_next); 986 --fq->fq_count; 987} 988 989static void 990futex_queue_wake_up_donate(struct futex_queue *fq, unsigned int nwaiters) 991{ 992 struct futex_waiter *fw; 993 994 fw = STAILQ_FIRST(&fq->fq_list); 995 if (fw == NULL) 996 return; 997 fw->fw_locked = false; 998 fw->fw_queue = NULL; 999 cv_signal(&fw->fw_wait); 1000 1001 STAILQ_REMOVE_HEAD(&fq->fq_list, fw_next); 1002 --fq->fq_count; 1003 futex_queue_requeue(fq, &fw->fw_donated, nwaiters); 1004} 1005 1006/* 1007 * futex_user operations. Used to adjust values in userspace. 1008 */ 1009 1010static int 1011futex_user_load(uint32_t *obj, uint32_t *val) 1012{ 1013 1014 return (fueword32(obj, val) != 0 ? EFAULT : 0); 1015} 1016 1017static int 1018futex_user_store(uint32_t *obj, uint32_t val) 1019{ 1020 1021 return (suword32(obj, val) != 0 ? EFAULT : 0); 1022} 1023 1024static int 1025futex_user_cmpxchg(uint32_t *obj, uint32_t cmp, uint32_t *old, uint32_t new) 1026{ 1027 1028 return (casueword32(obj, cmp, old, new) != 0 ? EFAULT : 0); 1029} 1030 1031/* 1032 * Blocking calls: acquiring locks, waiting on condition variables. 
 */

/*
 * Atomically unlocks the userspace lock and waits on the condition
 * variable until signalled or until the (absolute) timeout expires,
 * reacquiring the lock before returning.
 */
int
cloudabi_futex_condvar_wait(struct thread *td, cloudabi_condvar_t *condvar,
    cloudabi_mflags_t condvar_scope, cloudabi_lock_t *lock,
    cloudabi_mflags_t lock_scope, cloudabi_clockid_t clock_id,
    cloudabi_timestamp_t timeout, cloudabi_timestamp_t precision)
{
	struct futex_condvar *fc;
	struct futex_lock *fl;
	struct futex_waiter fw;
	int error, error2;

	/* Lookup condition variable object. */
	error = futex_condvar_lookup_or_create(td, condvar, condvar_scope, lock,
	    lock_scope, &fc);
	if (error != 0)
		return (error);
	fl = fc->fc_lock;

	/*
	 * Set the condition variable to something other than
	 * CLOUDABI_CONDVAR_HAS_NO_WAITERS to make userspace threads
	 * call into the kernel to perform wakeups.
	 */
	error = futex_user_store(condvar, ~CLOUDABI_CONDVAR_HAS_NO_WAITERS);
	if (error != 0) {
		futex_condvar_release(fc);
		return (error);
	}

	/* Drop the lock. */
	error = futex_lock_unlock(fl, td, lock);
	if (error != 0) {
		futex_condvar_unmanage(fc, condvar);
		futex_condvar_release(fc);
		return (error);
	}

	/* Go to sleep. */
	++fc->fc_waitcount;
	error = futex_queue_sleep(&fc->fc_waiters, fc->fc_lock, &fw, td,
	    clock_id, timeout, precision);
	if (fw.fw_locked) {
		/* Waited and got the lock assigned to us. */
		KASSERT(futex_queue_count(&fw.fw_donated) == 0,
		    ("Received threads while being locked"));
	} else if (error == 0 || error == ETIMEDOUT) {
		if (error != 0)
			futex_condvar_unmanage(fc, condvar);
		/*
		 * Got woken up without having the lock assigned to us.
		 * This can happen in two cases:
		 *
		 * 1. We observed a timeout on a condition variable.
		 * 2. We got signalled on a condition variable while the
		 *    associated lock is unlocked. We are the first
		 *    thread that gets woken up.
This thread is 1091 * responsible for reacquiring the userspace lock. 1092 */ 1093 error2 = futex_lock_wrlock(fl, td, lock, 1094 CLOUDABI_CLOCK_MONOTONIC, UINT64_MAX, 0, &fw.fw_donated); 1095 if (error2 != 0) 1096 error = error2; 1097 } else { 1098 KASSERT(futex_queue_count(&fw.fw_donated) == 0, 1099 ("Received threads on error")); 1100 futex_condvar_unmanage(fc, condvar); 1101 futex_lock_unmanage(fl, lock); 1102 } 1103 --fc->fc_waitcount; 1104 futex_condvar_release(fc); 1105 return (error); 1106} 1107 1108int 1109cloudabi_futex_lock_rdlock(struct thread *td, cloudabi_lock_t *lock, 1110 cloudabi_mflags_t scope, cloudabi_clockid_t clock_id, 1111 cloudabi_timestamp_t timeout, cloudabi_timestamp_t precision) 1112{ 1113 struct futex_lock *fl; 1114 int error; 1115 1116 /* Look up lock object. */ 1117 error = futex_lock_lookup(td, lock, scope, &fl); 1118 if (error != 0) 1119 return (error); 1120 1121 error = futex_lock_rdlock(fl, td, lock, clock_id, timeout, 1122 precision); 1123 futex_lock_release(fl); 1124 return (error); 1125} 1126 1127int 1128cloudabi_futex_lock_wrlock(struct thread *td, cloudabi_lock_t *lock, 1129 cloudabi_mflags_t scope, cloudabi_clockid_t clock_id, 1130 cloudabi_timestamp_t timeout, cloudabi_timestamp_t precision) 1131{ 1132 struct futex_lock *fl; 1133 struct futex_queue fq; 1134 int error; 1135 1136 /* Look up lock object. */ 1137 error = futex_lock_lookup(td, lock, scope, &fl); 1138 if (error != 0) 1139 return (error); 1140 1141 futex_queue_init(&fq); 1142 error = futex_lock_wrlock(fl, td, lock, clock_id, timeout, 1143 precision, &fq); 1144 futex_lock_release(fl); 1145 return (error); 1146} 1147 1148/* 1149 * Non-blocking calls: releasing locks, signalling condition variables. 
 */

/*
 * Wakes up a number of threads sleeping on a condition variable,
 * requeueing them onto the associated lock where possible.
 */
int
cloudabi_sys_condvar_signal(struct thread *td,
    struct cloudabi_sys_condvar_signal_args *uap)
{
	struct futex_condvar *fc;
	struct futex_lock *fl;
	cloudabi_nthreads_t nwaiters;
	int error;

	nwaiters = uap->nwaiters;
	if (nwaiters == 0) {
		/* No threads to wake up. */
		return (0);
	}

	/* Look up futex object. */
	error = futex_condvar_lookup(td, uap->condvar, uap->scope, &fc);
	if (error != 0) {
		/* Race condition: condition variable with no waiters. */
		return (error == ENOENT ? 0 : error);
	}
	fl = fc->fc_lock;

	if (fl->fl_owner == LOCK_UNMANAGED) {
		/*
		 * The lock is currently not managed by the kernel,
		 * meaning we must attempt to acquire the userspace lock
		 * first. We cannot requeue threads to an unmanaged lock,
		 * as these threads will then never be scheduled.
		 *
		 * Unfortunately, the memory address of the lock is
		 * unknown from this context, meaning that we cannot
		 * acquire the lock on behalf of the first thread to be
		 * scheduled. The lock may even not be mapped within the
		 * address space of the current thread.
		 *
		 * To solve this, wake up a single waiter that will
		 * attempt to acquire the lock. Donate all of the other
		 * waiters that need to be woken up to this waiter, so
		 * it can requeue them after acquiring the lock.
		 */
		/* One thread is woken directly; nwaiters - 1 donated. */
		futex_queue_wake_up_donate(&fc->fc_waiters, nwaiters - 1);
	} else {
		/*
		 * Lock is already managed by the kernel. This makes it
		 * easy, as we can requeue the threads from the
		 * condition variable directly to the associated lock.
		 */
		futex_queue_requeue(&fc->fc_waiters, &fl->fl_writers, nwaiters);
	}

	/* Clear userspace condition variable if all waiters are gone. */
	error = futex_condvar_unmanage(fc, uap->condvar);
	futex_condvar_release(fc);
	return (error);
}

/*
 * Releases a userspace lock: looks up the kernel lock object and
 * delegates the unlock operation to futex_lock_unlock().
 */
int
cloudabi_sys_lock_unlock(struct thread *td,
    struct cloudabi_sys_lock_unlock_args *uap)
{
	struct futex_lock *fl;
	int error;

	error = futex_lock_lookup(td, uap->lock, uap->scope, &fl);
	if (error != 0)
		return (error);
	error = futex_lock_unlock(fl, td, uap->lock);
	futex_lock_release(fl);
	return (error);
}