/*-
 * Copyright (c) 2015 Nuxi, https://nuxi.nl/
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/11/sys/compat/cloudabi/cloudabi_futex.c 328127 2018-01-18 13:43:09Z ed $");

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sx.h>
#include <sys/systm.h>
#include <sys/umtx.h>

#include <contrib/cloudabi/cloudabi_types_common.h>

#include <compat/cloudabi/cloudabi_proto.h>
#include <compat/cloudabi/cloudabi_util.h>

/*
 * Futexes for CloudABI.
 *
 * On most systems, futexes are implemented as objects of a single type
 * on which a set of operations can be performed. CloudABI makes a clear
 * distinction between locks and condition variables. A lock may have
 * zero or more associated condition variables. A condition variable is
 * always associated with exactly one lock. There is a strict topology.
 * This approach has two advantages:
 *
 * - This topology is guaranteed to be acyclic. Requeueing of threads
 *   only happens in one direction (from condition variables to locks).
 *   This eases locking.
 * - A futex object for a lock can exist even while the lock itself is
 *   unlocked, as long as threads are waiting on associated condition
 *   variables. Threads can therefore be requeued to a lock even if the
 *   thread performing the wakeup does not have the lock mapped in its
 *   address space.
 *
 * This futex implementation provides only a single lock type, namely
 * a read-write lock. A separate mutex type is not necessary, as the
 * read-write lock is just as efficient as a mutex when used as one.
 * Userspace futex locks are 32 bits in size:
 *
 * - 1 bit: has threads waiting in kernel-space.
 * - 1 bit: is write-locked.
 * - 30 bits:
 *   - if write-locked: thread ID of owner.
 *   - if not write-locked: number of read locks held.
 *
 * Condition variables are also 32 bits in size. Their values are
 * modified exclusively by the kernel. A value of zero indicates that
 * the condition variable has no waiting threads; non-zero indicates
 * the opposite.
 *
 * This implementation is optimal, in the sense that it only wakes up
 * threads if they can actually continue execution. It does not suffer
 * from the thundering herd problem. If multiple threads waiting on a
 * condition variable need to be woken up, only a single thread is
 * scheduled. All other threads are 'donated' to this thread. After the
 * thread manages to reacquire the lock, it requeues its donated threads
 * to the lock.
 *
 * TODO(ed): Integrate this functionality into kern_umtx.c instead.
 * TODO(ed): Store futex objects in a hash table.
 * TODO(ed): Add actual priority inheritance.
 * TODO(ed): Let futex_queue also take priorities into account.
 * TODO(ed): Make locking fine-grained.
 * TODO(ed): Perform sleeps until an actual absolute point in time,
 *           instead of converting the timestamp to a relative value.
 */
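
/*
 * Illustrative sketch of how the 32-bit lock word described above can
 * be decoded. These helpers are for exposition only and are not part
 * of the original interface; they merely apply the CLOUDABI_LOCK_*
 * masks that the code below uses directly.
 */
static inline bool
futex_lock_word_wrlocked(uint32_t v)
{

	/* The write-locked bit is set. */
	return ((v & CLOUDABI_LOCK_WRLOCKED) != 0);
}

static inline uint32_t
futex_lock_word_payload(uint32_t v)
{

	/*
	 * The remaining 30 bits: the owning thread ID if write-locked,
	 * otherwise the number of read locks held.
	 */
	return (v & ~(CLOUDABI_LOCK_WRLOCKED | CLOUDABI_LOCK_KERNEL_MANAGED));
}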

struct futex_address;
struct futex_condvar;
struct futex_lock;
struct futex_queue;
struct futex_waiter;

/* Identifier of a location in memory. */
struct futex_address {
	struct umtx_key			fa_key;
};

/* A set of waiting threads. */
struct futex_queue {
	STAILQ_HEAD(, futex_waiter)	fq_list;
	unsigned int			fq_count;
};

/* Condition variables. */
struct futex_condvar {
	/* Address of the condition variable. */
	struct futex_address		fc_address;

	/* The lock the waiters should be moved to when signalled. */
	struct futex_lock *		fc_lock;

	/* Threads waiting on the condition variable. */
	struct futex_queue		fc_waiters;
	/*
	 * Number of threads blocked on this condition variable, or
	 * being blocked on the lock after being requeued.
	 */
	unsigned int			fc_waitcount;

	/* Global list pointers. */
	LIST_ENTRY(futex_condvar)	fc_next;
};

/* Read-write locks. */
struct futex_lock {
	/* Address of the lock. */
	struct futex_address		fl_address;

	/*
	 * Current owner of the lock. LOCK_UNMANAGED if the lock is
	 * currently not owned by the kernel. LOCK_OWNER_UNKNOWN in case
	 * the owner is not known (e.g., when the lock is read-locked).
	 */
	cloudabi_tid_t			fl_owner;
#define LOCK_UNMANAGED 0x0
#define LOCK_OWNER_UNKNOWN 0x1

	/* Writers blocked on the lock. */
	struct futex_queue		fl_writers;
	/* Readers blocked on the lock. */
	struct futex_queue		fl_readers;
	/* Number of threads blocked on this lock + condition variables. */
	unsigned int			fl_waitcount;

	/* Global list pointers. */
	LIST_ENTRY(futex_lock)		fl_next;
};

/* Information associated with a thread blocked on an object. */
struct futex_waiter {
	/* Thread ID. */
	cloudabi_tid_t			fw_tid;
	/* Condition variable used for waiting. */
	struct cv			fw_wait;

	/* Queue this waiter is currently placed in. */
	struct futex_queue *		fw_queue;
	/* List pointers of fw_queue. */
	STAILQ_ENTRY(futex_waiter)	fw_next;

	/* Lock has been acquired. */
	bool				fw_locked;
	/* If not locked, threads that should block after acquiring. */
	struct futex_queue		fw_donated;
};

/* Global data structures. */
static MALLOC_DEFINE(M_FUTEX, "futex", "CloudABI futex");

static struct sx futex_global_lock;
SX_SYSINIT(futex_global_lock, &futex_global_lock, "CloudABI futex global lock");

static LIST_HEAD(, futex_lock) futex_lock_list =
    LIST_HEAD_INITIALIZER(&futex_lock_list);
static LIST_HEAD(, futex_condvar) futex_condvar_list =
    LIST_HEAD_INITIALIZER(&futex_condvar_list);

/* Utility functions. */
static void futex_lock_assert(const struct futex_lock *);
static struct futex_lock *futex_lock_lookup_locked(struct futex_address *);
static void futex_lock_release(struct futex_lock *);
static int futex_lock_tryrdlock(struct futex_lock *, cloudabi_lock_t *);
static int futex_lock_unmanage(struct futex_lock *, cloudabi_lock_t *);
static int futex_lock_update_owner(struct futex_lock *, cloudabi_lock_t *);
static int futex_lock_wake_up_next(struct futex_lock *, cloudabi_lock_t *);
static unsigned int futex_queue_count(const struct futex_queue *);
static void futex_queue_init(struct futex_queue *);
static void futex_queue_requeue(struct futex_queue *, struct futex_queue *,
    unsigned int);
static int futex_queue_sleep(struct futex_queue *, struct futex_lock *,
    struct futex_waiter *, struct thread *, cloudabi_clockid_t,
    cloudabi_timestamp_t, cloudabi_timestamp_t, bool);
static cloudabi_tid_t futex_queue_tid_best(const struct futex_queue *);
static void futex_queue_wake_up_all(struct futex_queue *);
static void futex_queue_wake_up_best(struct futex_queue *);
static void futex_queue_wake_up_donate(struct futex_queue *, unsigned int);
static int futex_user_load(uint32_t *, uint32_t *);
static int futex_user_store(uint32_t *, uint32_t);
static int futex_user_cmpxchg(uint32_t *, uint32_t, uint32_t *, uint32_t);

/*
 * futex_address operations.
 */

static int
futex_address_create(struct futex_address *fa, struct thread *td,
    const void *object, cloudabi_scope_t scope)
{

	KASSERT(td == curthread,
	    ("Can only create umtx keys for the current thread"));
	switch (scope) {
	case CLOUDABI_SCOPE_PRIVATE:
		return (umtx_key_get(object, TYPE_FUTEX, THREAD_SHARE,
		    &fa->fa_key));
	case CLOUDABI_SCOPE_SHARED:
		return (umtx_key_get(object, TYPE_FUTEX, AUTO_SHARE,
		    &fa->fa_key));
	default:
		return (EINVAL);
	}
}

static void
futex_address_free(struct futex_address *fa)
{

	umtx_key_release(&fa->fa_key);
}

static bool
futex_address_match(const struct futex_address *fa1,
    const struct futex_address *fa2)
{

	return (umtx_key_match(&fa1->fa_key, &fa2->fa_key));
}

/*
 * futex_condvar operations.
 */

static void
futex_condvar_assert(const struct futex_condvar *fc)
{

	KASSERT(fc->fc_waitcount >= futex_queue_count(&fc->fc_waiters),
	    ("Total number of waiters cannot be smaller than the wait queue"));
	futex_lock_assert(fc->fc_lock);
}

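/*
 * Looks up an existing condition variable object. On success, the
 * global futex lock is left held; futex_condvar_release() drops it.
 */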
static int
futex_condvar_lookup(struct thread *td, const cloudabi_condvar_t *address,
    cloudabi_scope_t scope, struct futex_condvar **fcret)
{
	struct futex_address fa_condvar;
	struct futex_condvar *fc;
	int error;

	error = futex_address_create(&fa_condvar, td, address, scope);
	if (error != 0)
		return (error);

	sx_xlock(&futex_global_lock);
	LIST_FOREACH(fc, &futex_condvar_list, fc_next) {
		if (futex_address_match(&fc->fc_address, &fa_condvar)) {
			/* Found matching condition variable object. */
			futex_address_free(&fa_condvar);
			futex_condvar_assert(fc);
			*fcret = fc;
			return (0);
		}
	}
	sx_xunlock(&futex_global_lock);
	futex_address_free(&fa_condvar);
	return (ENOENT);
}

static int
futex_condvar_lookup_or_create(struct thread *td,
    const cloudabi_condvar_t *condvar, cloudabi_scope_t condvar_scope,
    const cloudabi_lock_t *lock, cloudabi_scope_t lock_scope,
    struct futex_condvar **fcret)
{
	struct futex_address fa_condvar, fa_lock;
	struct futex_condvar *fc;
	struct futex_lock *fl;
	int error;

	error = futex_address_create(&fa_condvar, td, condvar, condvar_scope);
	if (error != 0)
		return (error);
	error = futex_address_create(&fa_lock, td, lock, lock_scope);
	if (error != 0) {
		futex_address_free(&fa_condvar);
		return (error);
	}

	sx_xlock(&futex_global_lock);
	LIST_FOREACH(fc, &futex_condvar_list, fc_next) {
		if (!futex_address_match(&fc->fc_address, &fa_condvar))
			continue;
		fl = fc->fc_lock;
		if (!futex_address_match(&fl->fl_address, &fa_lock)) {
			/* Condition variable is owned by a different lock. */
			futex_address_free(&fa_condvar);
			futex_address_free(&fa_lock);
			sx_xunlock(&futex_global_lock);
			return (EINVAL);
		}

		/* Found fully matching condition variable. */
		futex_address_free(&fa_condvar);
		futex_address_free(&fa_lock);
		futex_condvar_assert(fc);
		*fcret = fc;
		return (0);
	}

	/* None found. Create new condition variable object. */
	fc = malloc(sizeof(*fc), M_FUTEX, M_WAITOK);
	fc->fc_address = fa_condvar;
	fc->fc_lock = futex_lock_lookup_locked(&fa_lock);
	futex_queue_init(&fc->fc_waiters);
	fc->fc_waitcount = 0;
	LIST_INSERT_HEAD(&futex_condvar_list, fc, fc_next);
	*fcret = fc;
	return (0);
}

static void
futex_condvar_release(struct futex_condvar *fc)
{
	struct futex_lock *fl;

	futex_condvar_assert(fc);
	fl = fc->fc_lock;
	if (fc->fc_waitcount == 0) {
		/* Condition variable has no waiters. Deallocate it. */
		futex_address_free(&fc->fc_address);
		LIST_REMOVE(fc, fc_next);
		free(fc, M_FUTEX);
	}
	futex_lock_release(fl);
}

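/*
 * Clears the userspace condition variable word once the kernel no
 * longer has any threads queued on the condition variable itself.
 */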
static int
futex_condvar_unmanage(struct futex_condvar *fc,
    cloudabi_condvar_t *condvar)
{

	if (futex_queue_count(&fc->fc_waiters) != 0)
		return (0);
	return (futex_user_store(condvar, CLOUDABI_CONDVAR_HAS_NO_WAITERS));
}

/*
 * futex_lock operations.
 */

static void
futex_lock_assert(const struct futex_lock *fl)
{

	/*
	 * A futex lock can only be kernel-managed if it has waiters.
	 * Vice versa: if a futex lock has waiters, it must be
	 * kernel-managed.
	 */
	KASSERT((fl->fl_owner == LOCK_UNMANAGED) ==
	    (futex_queue_count(&fl->fl_readers) == 0 &&
	    futex_queue_count(&fl->fl_writers) == 0),
	    ("Managed locks must have waiting threads"));
	KASSERT(fl->fl_waitcount != 0 || fl->fl_owner == LOCK_UNMANAGED,
	    ("Lock with no waiters must be unmanaged"));
}

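/*
 * Looks up the lock object belonging to a userspace address, creating
 * it if none exists. The global futex lock is left held on success;
 * futex_lock_release() drops it.
 */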
static int
futex_lock_lookup(struct thread *td, const cloudabi_lock_t *address,
    cloudabi_scope_t scope, struct futex_lock **flret)
{
	struct futex_address fa;
	int error;

	error = futex_address_create(&fa, td, address, scope);
	if (error != 0)
		return (error);

	sx_xlock(&futex_global_lock);
	*flret = futex_lock_lookup_locked(&fa);
	return (0);
}

static struct futex_lock *
futex_lock_lookup_locked(struct futex_address *fa)
{
	struct futex_lock *fl;

	LIST_FOREACH(fl, &futex_lock_list, fl_next) {
		if (futex_address_match(&fl->fl_address, fa)) {
			/* Found matching lock object. */
			futex_address_free(fa);
			futex_lock_assert(fl);
			return (fl);
		}
	}

	/* None found. Create new lock object. */
	fl = malloc(sizeof(*fl), M_FUTEX, M_WAITOK);
	fl->fl_address = *fa;
	fl->fl_owner = LOCK_UNMANAGED;
	futex_queue_init(&fl->fl_readers);
	futex_queue_init(&fl->fl_writers);
	fl->fl_waitcount = 0;
	LIST_INSERT_HEAD(&futex_lock_list, fl, fl_next);
	return (fl);
}

static int
futex_lock_rdlock(struct futex_lock *fl, struct thread *td,
    cloudabi_lock_t *lock, cloudabi_clockid_t clock_id,
    cloudabi_timestamp_t timeout, cloudabi_timestamp_t precision, bool abstime)
{
	struct futex_waiter fw;
	int error;

	error = futex_lock_tryrdlock(fl, lock);
	if (error == EBUSY) {
		/* Suspend execution. */
		KASSERT(fl->fl_owner != LOCK_UNMANAGED,
		    ("Attempted to sleep on an unmanaged lock"));
		error = futex_queue_sleep(&fl->fl_readers, fl, &fw, td,
		    clock_id, timeout, precision, abstime);
		KASSERT((error == 0) == fw.fw_locked,
		    ("Should have locked read lock on success"));
		KASSERT(futex_queue_count(&fw.fw_donated) == 0,
		    ("Lock functions cannot receive threads"));
	}
	if (error != 0)
		futex_lock_unmanage(fl, lock);
	return (error);
}

static void
futex_lock_release(struct futex_lock *fl)
{

	futex_lock_assert(fl);
	if (fl->fl_waitcount == 0) {
		/* Lock object is unreferenced. Deallocate it. */
		KASSERT(fl->fl_owner == LOCK_UNMANAGED,
		    ("Attempted to free a managed lock"));
		futex_address_free(&fl->fl_address);
		LIST_REMOVE(fl, fl_next);
		free(fl, M_FUTEX);
	}
	sx_xunlock(&futex_global_lock);
}

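/*
 * Returns the lock to the unmanaged state when no threads are blocked
 * on it anymore, clearing the kernel-managed bit in userspace.
 */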
static int
futex_lock_unmanage(struct futex_lock *fl, cloudabi_lock_t *lock)
{
	cloudabi_lock_t cmp, old;
	int error;

	if (futex_queue_count(&fl->fl_readers) == 0 &&
	    futex_queue_count(&fl->fl_writers) == 0) {
		/* Lock should be unmanaged. */
		fl->fl_owner = LOCK_UNMANAGED;

		/* Clear kernel-managed bit. */
		error = futex_user_load(lock, &old);
		if (error != 0)
			return (error);
		for (;;) {
			cmp = old;
			error = futex_user_cmpxchg(lock, cmp, &old,
			    cmp & ~CLOUDABI_LOCK_KERNEL_MANAGED);
			if (error != 0)
				return (error);
			if (old == cmp)
				break;
		}
	}
	return (0);
}

/* Sets the owner of a lock, based on a userspace lock value. */
static void
futex_lock_set_owner(struct futex_lock *fl, cloudabi_lock_t lock)
{

	/* Lock has no explicit owner. */
	if ((lock & ~CLOUDABI_LOCK_WRLOCKED) == 0) {
		fl->fl_owner = LOCK_OWNER_UNKNOWN;
		return;
	}
	lock &= ~(CLOUDABI_LOCK_WRLOCKED | CLOUDABI_LOCK_KERNEL_MANAGED);

	/* Don't allow userspace to silently unlock. */
	if (lock == LOCK_UNMANAGED) {
		fl->fl_owner = LOCK_OWNER_UNKNOWN;
		return;
	}
	fl->fl_owner = lock;
}

static int
futex_lock_unlock(struct futex_lock *fl, struct thread *td,
    cloudabi_lock_t *lock)
{
	int error;

	/* Validate that this thread is allowed to unlock. */
	error = futex_lock_update_owner(fl, lock);
	if (error != 0)
		return (error);
	if (fl->fl_owner != LOCK_UNMANAGED && fl->fl_owner != td->td_tid)
		return (EPERM);
	return (futex_lock_wake_up_next(fl, lock));
}

/* Synchronizes the owner of the lock from userspace if needed. */
static int
futex_lock_update_owner(struct futex_lock *fl, cloudabi_lock_t *address)
{
	cloudabi_lock_t lock;
	int error;

	if (fl->fl_owner == LOCK_OWNER_UNKNOWN) {
		error = futex_user_load(address, &lock);
		if (error != 0)
			return (error);
		futex_lock_set_owner(fl, lock);
	}
	return (0);
}

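/*
 * Attempts to acquire a read lock in userspace, returning EBUSY if the
 * lock is, or has to become, kernel-managed.
 */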
static int
futex_lock_tryrdlock(struct futex_lock *fl, cloudabi_lock_t *address)
{
	cloudabi_lock_t old, cmp;
	int error;

	if (fl->fl_owner != LOCK_UNMANAGED) {
		/* Lock is already acquired. */
		return (EBUSY);
	}

	old = CLOUDABI_LOCK_UNLOCKED;
	for (;;) {
		if ((old & CLOUDABI_LOCK_KERNEL_MANAGED) != 0) {
			/*
			 * Userspace lock is kernel-managed, even though
			 * the kernel disagrees.
			 */
			return (EINVAL);
		}

		if ((old & CLOUDABI_LOCK_WRLOCKED) == 0) {
			/*
			 * Lock is not write-locked. Attempt to acquire
			 * it by increasing the read count.
			 */
			cmp = old;
			error = futex_user_cmpxchg(address, cmp, &old, cmp + 1);
			if (error != 0)
				return (error);
			if (old == cmp) {
				/* Success. */
				return (0);
			}
		} else {
			/* Lock is write-locked. Make it kernel-managed. */
			cmp = old;
			error = futex_user_cmpxchg(address, cmp, &old,
			    cmp | CLOUDABI_LOCK_KERNEL_MANAGED);
			if (error != 0)
				return (error);
			if (old == cmp) {
				/* Success. */
				futex_lock_set_owner(fl, cmp);
				return (EBUSY);
			}
		}
	}
}

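/*
 * Attempts to acquire a write lock in userspace on behalf of a thread,
 * returning EBUSY if the lock is, or has to become, kernel-managed and
 * EDEADLK if the thread already owns the lock.
 */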
static int
futex_lock_trywrlock(struct futex_lock *fl, cloudabi_lock_t *address,
    cloudabi_tid_t tid, bool force_kernel_managed)
{
	cloudabi_lock_t old, new, cmp;
	int error;

	if (fl->fl_owner == tid) {
		/* Attempted to acquire lock recursively. */
		return (EDEADLK);
	}
	if (fl->fl_owner != LOCK_UNMANAGED) {
		/* Lock is already acquired. */
		return (EBUSY);
	}

	old = CLOUDABI_LOCK_UNLOCKED;
	for (;;) {
		if ((old & CLOUDABI_LOCK_KERNEL_MANAGED) != 0) {
			/*
			 * Userspace lock is kernel-managed, even though
			 * the kernel disagrees.
			 */
			return (EINVAL);
		}
		if (old == (tid | CLOUDABI_LOCK_WRLOCKED)) {
			/* Attempted to acquire lock recursively. */
			return (EDEADLK);
		}

		if (old == CLOUDABI_LOCK_UNLOCKED) {
			/* Lock is unlocked. Attempt to acquire it. */
			new = tid | CLOUDABI_LOCK_WRLOCKED;
			if (force_kernel_managed)
				new |= CLOUDABI_LOCK_KERNEL_MANAGED;
			error = futex_user_cmpxchg(address,
			    CLOUDABI_LOCK_UNLOCKED, &old, new);
			if (error != 0)
				return (error);
			if (old == CLOUDABI_LOCK_UNLOCKED) {
				/* Success. */
				if (force_kernel_managed)
					fl->fl_owner = tid;
				return (0);
			}
		} else {
			/* Lock is still locked. Make it kernel-managed. */
			cmp = old;
			error = futex_user_cmpxchg(address, cmp, &old,
			    cmp | CLOUDABI_LOCK_KERNEL_MANAGED);
			if (error != 0)
				return (error);
			if (old == cmp) {
				/* Success. */
				futex_lock_set_owner(fl, cmp);
				return (EBUSY);
			}
		}
	}
}

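/*
 * Hands the lock over to the next thread(s) to be scheduled, writing
 * the new lock value to userspace.
 */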
static int
futex_lock_wake_up_next(struct futex_lock *fl, cloudabi_lock_t *lock)
{
	cloudabi_tid_t tid;
	int error;

	/*
	 * Determine which thread(s) to wake up. Prefer waking up
	 * writers over readers to prevent write starvation.
	 */
	if (futex_queue_count(&fl->fl_writers) > 0) {
		/* Transfer ownership to a single write-locker. */
		if (futex_queue_count(&fl->fl_writers) > 1 ||
		    futex_queue_count(&fl->fl_readers) > 0) {
			/* Lock should remain managed afterwards. */
			tid = futex_queue_tid_best(&fl->fl_writers);
			error = futex_user_store(lock,
			    tid | CLOUDABI_LOCK_WRLOCKED |
			    CLOUDABI_LOCK_KERNEL_MANAGED);
			if (error != 0)
				return (error);

			futex_queue_wake_up_best(&fl->fl_writers);
			fl->fl_owner = tid;
		} else {
			/* Lock can become unmanaged afterwards. */
			error = futex_user_store(lock,
			    futex_queue_tid_best(&fl->fl_writers) |
			    CLOUDABI_LOCK_WRLOCKED);
			if (error != 0)
				return (error);

			futex_queue_wake_up_best(&fl->fl_writers);
			fl->fl_owner = LOCK_UNMANAGED;
		}
	} else {
		/* Transfer ownership to all read-lockers (if any). */
		error = futex_user_store(lock,
		    futex_queue_count(&fl->fl_readers));
		if (error != 0)
			return (error);

		/* Wake up all threads. */
		futex_queue_wake_up_all(&fl->fl_readers);
		fl->fl_owner = LOCK_UNMANAGED;
	}
	return (0);
}

static int
futex_lock_wrlock(struct futex_lock *fl, struct thread *td,
    cloudabi_lock_t *lock, cloudabi_clockid_t clock_id,
    cloudabi_timestamp_t timeout, cloudabi_timestamp_t precision, bool abstime,
    struct futex_queue *donated)
{
	struct futex_waiter fw;
	int error;

	error = futex_lock_trywrlock(fl, lock, td->td_tid,
	    futex_queue_count(donated) > 0);

	if (error == 0 || error == EBUSY) {
		/* Put donated threads in queue before suspending. */
		KASSERT(futex_queue_count(donated) == 0 ||
		    fl->fl_owner != LOCK_UNMANAGED,
		    ("Lock should be managed if we are going to donate"));
		futex_queue_requeue(donated, &fl->fl_writers, UINT_MAX);
	} else {
		/*
		 * This thread cannot deal with the donated threads.
		 * Wake up the next thread and let it retry by itself.
		 */
		futex_queue_wake_up_donate(donated, UINT_MAX);
	}

	if (error == EBUSY) {
		/* Suspend execution if the lock was busy. */
		KASSERT(fl->fl_owner != LOCK_UNMANAGED,
		    ("Attempted to sleep on an unmanaged lock"));
		error = futex_queue_sleep(&fl->fl_writers, fl, &fw, td,
		    clock_id, timeout, precision, abstime);
		KASSERT((error == 0) == fw.fw_locked,
		    ("Should have locked write lock on success"));
		KASSERT(futex_queue_count(&fw.fw_donated) == 0,
		    ("Lock functions cannot receive threads"));
	}
	if (error != 0)
		futex_lock_unmanage(fl, lock);
	return (error);
}

/*
 * futex_queue operations.
 */

static cloudabi_tid_t
futex_queue_tid_best(const struct futex_queue *fq)
{

	return (STAILQ_FIRST(&fq->fq_list)->fw_tid);
}

static unsigned int
futex_queue_count(const struct futex_queue *fq)
{

	return (fq->fq_count);
}

static void
futex_queue_init(struct futex_queue *fq)
{

	STAILQ_INIT(&fq->fq_list);
	fq->fq_count = 0;
}

/* Converts a relative timestamp to an sbintime. */
static sbintime_t
futex_queue_convert_timestamp_relative(cloudabi_timestamp_t ts)
{
	cloudabi_timestamp_t s, ns;

	s = ts / 1000000000;
	ns = ts % 1000000000;
	if (s > INT32_MAX)
		return (INT64_MAX);
	return ((s << 32) + (ns << 32) / 1000000000);
}
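
/*
 * For example, ts = 1500000000 (1.5 s) yields s = 1 and ns = 500000000,
 * making the result (1 << 32) + ((500000000 << 32) / 1000000000), which
 * is 1.5 in the 32.32 fixed-point format used by sbintime_t.
 */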

/*
 * Converts a timeout and precision to a pair of sbintime values. An
 * absolute timeout is first made relative to the current time.
 */
static int
futex_queue_convert_timestamp(struct thread *td, cloudabi_clockid_t clock_id,
    cloudabi_timestamp_t timeout, cloudabi_timestamp_t precision,
    sbintime_t *sbttimeout, sbintime_t *sbtprecision, bool abstime)
{
	cloudabi_timestamp_t now;
	int error;

	if (abstime) {
		/* Make the time relative. */
		error = cloudabi_clock_time_get(td, clock_id, &now);
		if (error != 0)
			return (error);
		timeout = timeout < now ? 0 : timeout - now;
	}

	*sbttimeout = futex_queue_convert_timestamp_relative(timeout);
	*sbtprecision = futex_queue_convert_timestamp_relative(precision);
	return (0);
}

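/*
 * Blocks the calling thread on a queue until it is woken up or the
 * timeout expires. Returns zero if the thread was dequeued by a
 * wakeup and an error code otherwise.
 */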
static int
futex_queue_sleep(struct futex_queue *fq, struct futex_lock *fl,
    struct futex_waiter *fw, struct thread *td, cloudabi_clockid_t clock_id,
    cloudabi_timestamp_t timeout, cloudabi_timestamp_t precision, bool abstime)
{
	sbintime_t sbttimeout, sbtprecision;
	int error;

	/* Initialize futex_waiter object. */
	fw->fw_tid = td->td_tid;
	fw->fw_locked = false;
	futex_queue_init(&fw->fw_donated);

	if (timeout != UINT64_MAX) {
		/* Convert timeout duration. */
		error = futex_queue_convert_timestamp(td, clock_id, timeout,
		    precision, &sbttimeout, &sbtprecision, abstime);
		if (error != 0)
			return (error);
	}

	/* Place object in the queue. */
	fw->fw_queue = fq;
	STAILQ_INSERT_TAIL(&fq->fq_list, fw, fw_next);
	++fq->fq_count;

	cv_init(&fw->fw_wait, "futex");
	++fl->fl_waitcount;

	futex_lock_assert(fl);
	if (timeout == UINT64_MAX) {
		/* Wait without a timeout. */
		error = cv_wait_sig(&fw->fw_wait, &futex_global_lock);
	} else {
		/* Wait respecting the timeout. */
		error = cv_timedwait_sig_sbt(&fw->fw_wait, &futex_global_lock,
		    sbttimeout, sbtprecision, 0);
		futex_lock_assert(fl);
		if (error == EWOULDBLOCK &&
		    fw->fw_queue != NULL && fw->fw_queue != fq) {
			/*
			 * We got signalled on a condition variable, but
			 * observed a timeout while waiting to reacquire
			 * the lock. In other words, we didn't actually
			 * time out. Go back to sleep and wait for the
			 * lock to be reacquired.
			 */
			error = cv_wait_sig(&fw->fw_wait, &futex_global_lock);
		}
	}
	futex_lock_assert(fl);

	--fl->fl_waitcount;
	cv_destroy(&fw->fw_wait);

	fq = fw->fw_queue;
	if (fq == NULL) {
		/* Thread got dequeued, so we've slept successfully. */
		return (0);
	}

	/* Thread is still enqueued. Remove it. */
	KASSERT(error != 0, ("Woken up thread is still enqueued"));
	STAILQ_REMOVE(&fq->fq_list, fw, futex_waiter, fw_next);
	--fq->fq_count;
	return (error == EWOULDBLOCK ? ETIMEDOUT : error);
}

/* Moves up to nwaiters waiters from one queue to another. */
static void
futex_queue_requeue(struct futex_queue *fqfrom, struct futex_queue *fqto,
    unsigned int nwaiters)
{
	struct futex_waiter *fw;

	/* Move waiters to the target queue. */
	while (nwaiters-- > 0 && !STAILQ_EMPTY(&fqfrom->fq_list)) {
		fw = STAILQ_FIRST(&fqfrom->fq_list);
		STAILQ_REMOVE_HEAD(&fqfrom->fq_list, fw_next);
		--fqfrom->fq_count;

		fw->fw_queue = fqto;
		STAILQ_INSERT_TAIL(&fqto->fq_list, fw, fw_next);
		++fqto->fq_count;
	}
}

/* Wakes up all waiters in a queue. */
static void
futex_queue_wake_up_all(struct futex_queue *fq)
{
	struct futex_waiter *fw;

	STAILQ_FOREACH(fw, &fq->fq_list, fw_next) {
		fw->fw_locked = true;
		fw->fw_queue = NULL;
		cv_signal(&fw->fw_wait);
	}

	STAILQ_INIT(&fq->fq_list);
	fq->fq_count = 0;
}

/*
 * Wakes up the best waiter (i.e., the waiter having the highest
 * priority) in a queue.
 */
static void
futex_queue_wake_up_best(struct futex_queue *fq)
{
	struct futex_waiter *fw;

	fw = STAILQ_FIRST(&fq->fq_list);
	fw->fw_locked = true;
	fw->fw_queue = NULL;
	cv_signal(&fw->fw_wait);

	STAILQ_REMOVE_HEAD(&fq->fq_list, fw_next);
	--fq->fq_count;
}

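/*
 * Wakes up the first waiter in a queue and donates up to nwaiters of
 * the remaining waiters to it, so that it can requeue them after it
 * has acquired the lock.
 */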
static void
futex_queue_wake_up_donate(struct futex_queue *fq, unsigned int nwaiters)
{
	struct futex_waiter *fw;

	fw = STAILQ_FIRST(&fq->fq_list);
	if (fw == NULL)
		return;
	fw->fw_locked = false;
	fw->fw_queue = NULL;
	cv_signal(&fw->fw_wait);

	STAILQ_REMOVE_HEAD(&fq->fq_list, fw_next);
	--fq->fq_count;
	futex_queue_requeue(fq, &fw->fw_donated, nwaiters);
}

/*
 * futex_user operations. Used to adjust values in userspace.
 */

static int
futex_user_load(uint32_t *obj, uint32_t *val)
{

	return (fueword32(obj, val) != 0 ? EFAULT : 0);
}

static int
futex_user_store(uint32_t *obj, uint32_t val)
{

	return (suword32(obj, val) != 0 ? EFAULT : 0);
}

static int
futex_user_cmpxchg(uint32_t *obj, uint32_t cmp, uint32_t *old, uint32_t new)
{

	return (casueword32(obj, cmp, old, new) != 0 ? EFAULT : 0);
}

/*
 * Blocking calls: acquiring locks, waiting on condition variables.
 */

int
cloudabi_futex_condvar_wait(struct thread *td, cloudabi_condvar_t *condvar,
    cloudabi_scope_t condvar_scope, cloudabi_lock_t *lock,
    cloudabi_scope_t lock_scope, cloudabi_clockid_t clock_id,
    cloudabi_timestamp_t timeout, cloudabi_timestamp_t precision, bool abstime)
{
	struct futex_condvar *fc;
	struct futex_lock *fl;
	struct futex_waiter fw;
	int error, error2;

	/* Look up condition variable object. */
	error = futex_condvar_lookup_or_create(td, condvar, condvar_scope, lock,
	    lock_scope, &fc);
	if (error != 0)
		return (error);
	fl = fc->fc_lock;

	/*
	 * Set the condition variable to something other than
	 * CLOUDABI_CONDVAR_HAS_NO_WAITERS to make userspace threads
	 * call into the kernel to perform wakeups.
	 */
	error = futex_user_store(condvar, ~CLOUDABI_CONDVAR_HAS_NO_WAITERS);
	if (error != 0) {
		futex_condvar_release(fc);
		return (error);
	}

	/* Drop the lock. */
	error = futex_lock_unlock(fl, td, lock);
	if (error != 0) {
		futex_condvar_unmanage(fc, condvar);
		futex_condvar_release(fc);
		return (error);
	}

	/* Go to sleep. */
	++fc->fc_waitcount;
	error = futex_queue_sleep(&fc->fc_waiters, fc->fc_lock, &fw, td,
	    clock_id, timeout, precision, abstime);
	if (fw.fw_locked) {
		/* Waited and got the lock assigned to us. */
		KASSERT(futex_queue_count(&fw.fw_donated) == 0,
		    ("Received threads while being locked"));
	} else if (error == 0 || error == ETIMEDOUT) {
		if (error != 0)
			futex_condvar_unmanage(fc, condvar);
		/*
		 * Got woken up without having the lock assigned to us.
		 * This can happen in two cases:
		 *
		 * 1. We observed a timeout on a condition variable.
		 * 2. We got signalled on a condition variable while the
		 *    associated lock is unlocked. We are the first
		 *    thread that gets woken up. This thread is
		 *    responsible for reacquiring the userspace lock.
		 */
		error2 = futex_lock_wrlock(fl, td, lock,
		    CLOUDABI_CLOCK_MONOTONIC, UINT64_MAX, 0, abstime,
		    &fw.fw_donated);
		if (error2 != 0)
			error = error2;
	} else {
		KASSERT(futex_queue_count(&fw.fw_donated) == 0,
		    ("Received threads on error"));
		futex_condvar_unmanage(fc, condvar);
		futex_lock_unmanage(fl, lock);
	}
	--fc->fc_waitcount;
	futex_condvar_release(fc);
	return (error);
}

int
cloudabi_futex_lock_rdlock(struct thread *td, cloudabi_lock_t *lock,
    cloudabi_scope_t scope, cloudabi_clockid_t clock_id,
    cloudabi_timestamp_t timeout, cloudabi_timestamp_t precision, bool abstime)
{
	struct futex_lock *fl;
	int error;

	/* Look up lock object. */
	error = futex_lock_lookup(td, lock, scope, &fl);
	if (error != 0)
		return (error);

	error = futex_lock_rdlock(fl, td, lock, clock_id, timeout,
	    precision, abstime);
	futex_lock_release(fl);
	return (error);
}

int
cloudabi_futex_lock_wrlock(struct thread *td, cloudabi_lock_t *lock,
    cloudabi_scope_t scope, cloudabi_clockid_t clock_id,
    cloudabi_timestamp_t timeout, cloudabi_timestamp_t precision, bool abstime)
{
	struct futex_lock *fl;
	struct futex_queue fq;
	int error;

	/* Look up lock object. */
	error = futex_lock_lookup(td, lock, scope, &fl);
	if (error != 0)
		return (error);

	futex_queue_init(&fq);
	error = futex_lock_wrlock(fl, td, lock, clock_id, timeout,
	    precision, abstime, &fq);
	futex_lock_release(fl);
	return (error);
}

/*
 * Non-blocking calls: releasing locks, signalling condition variables.
 */

int
cloudabi_sys_condvar_signal(struct thread *td,
    struct cloudabi_sys_condvar_signal_args *uap)
{
	struct futex_condvar *fc;
	struct futex_lock *fl;
	cloudabi_nthreads_t nwaiters;
	int error;

	nwaiters = uap->nwaiters;
	if (nwaiters == 0) {
		/* No threads to wake up. */
		return (0);
	}

	/* Look up futex object. */
	error = futex_condvar_lookup(td, uap->condvar, uap->scope, &fc);
	if (error != 0) {
		/* Race condition: condition variable with no waiters. */
		return (error == ENOENT ? 0 : error);
	}
	fl = fc->fc_lock;

	if (fl->fl_owner == LOCK_UNMANAGED) {
		/*
		 * The lock is currently not managed by the kernel,
		 * meaning we must attempt to acquire the userspace lock
		 * first. We cannot requeue threads to an unmanaged lock,
		 * as these threads will then never be scheduled.
		 *
		 * Unfortunately, the memory address of the lock is
		 * unknown from this context, meaning that we cannot
		 * acquire the lock on behalf of the first thread to be
		 * scheduled. The lock may not even be mapped within the
		 * address space of the current thread.
		 *
		 * To solve this, wake up a single waiter that will
		 * attempt to acquire the lock. Donate all of the other
		 * waiters that need to be woken up to this waiter, so
		 * it can requeue them after acquiring the lock.
		 */
		futex_queue_wake_up_donate(&fc->fc_waiters, nwaiters - 1);
	} else {
		/*
		 * Lock is already managed by the kernel. This makes it
		 * easy, as we can requeue the threads from the
		 * condition variable directly to the associated lock.
		 */
		futex_queue_requeue(&fc->fc_waiters, &fl->fl_writers, nwaiters);
	}

	/* Clear userspace condition variable if all waiters are gone. */
	error = futex_condvar_unmanage(fc, uap->condvar);
	futex_condvar_release(fc);
	return (error);
}

int
cloudabi_sys_lock_unlock(struct thread *td,
    struct cloudabi_sys_lock_unlock_args *uap)
{
	struct futex_lock *fl;
	int error;

	error = futex_lock_lookup(td, uap->lock, uap->scope, &fl);
	if (error != 0)
		return (error);
	error = futex_lock_unlock(fl, td, uap->lock);
	futex_lock_release(fl);
	return (error);
}