/*-
 * Copyright (c) 2015 Nuxi, https://nuxi.nl/
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/compat/cloudabi/cloudabi_futex.c 285908 2015-07-27 10:07:29Z ed $");

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sx.h>
#include <sys/systm.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_extern.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>

#include <compat/cloudabi/cloudabi_proto.h>
#include <compat/cloudabi/cloudabi_syscalldefs.h>
#include <compat/cloudabi/cloudabi_util.h>

/*
 * Futexes for CloudABI.
 *
 * On most systems, futexes are implemented as objects of a single type
 * on which a set of operations can be performed. CloudABI makes a clear
 * distinction between locks and condition variables. A lock may have
 * zero or more associated condition variables. A condition variable is
 * always associated with exactly one lock. There is a strict topology.
 * This approach has two advantages:
 *
 * - This topology is guaranteed to be acyclic. Requeueing of threads
 *   only happens in one direction (from condition variables to locks).
 *   This eases locking.
 * - It means that a futex object for a lock exists when it is unlocked,
 *   but has threads waiting on associated condition variables. Threads
 *   can be requeued to a lock even if the thread performing the wakeup
 *   does not have the lock mapped in its address space.
 *
 * This futex implementation only implements a single lock type, namely
 * a read-write lock. A regular mutex type would not be necessary, as
 * the read-write lock is as efficient as a mutex if used as such.
 * Userspace futex locks are 32 bits in size:
 *
 * - 1 bit: has threads waiting in kernel-space.
 * - 1 bit: is write-locked.
 * - 30 bits:
 *   - if write-locked: thread ID of owner.
 *   - if not write-locked: number of read locks held.
 *
 * Condition variables are also 32 bits in size. Their value is
 * modified by kernel-space exclusively. Zero indicates that the
 * condition variable has no waiting threads; non-zero indicates the
 * opposite.
 *
 * This implementation is optimal, in the sense that it only wakes up
 * threads if they can actually continue execution. It does not suffer
 * from the thundering herd problem. If multiple threads waiting on a
 * condition variable need to be woken up, only a single thread is
 * scheduled. All other threads are 'donated' to this thread. After the
 * thread manages to reacquire the lock, it requeues its donated threads
 * to the lock.
 *
 * TODO(ed): Integrate this functionality into kern_umtx.c instead.
 * TODO(ed): Store futex objects in a hash table.
 * TODO(ed): Add actual priority inheritance.
 * TODO(ed): Let futex_queue also take priorities into account.
 * TODO(ed): Make locking fine-grained.
 * TODO(ed): Perform sleeps until an actual absolute point in time,
 *           instead of converting the timestamp to a relative value.
 */
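
/*
 * Illustrative sketch of how the 32-bit lock word described above
 * could be decoded. The CLOUDABI_LOCK_* masks come from
 * cloudabi_syscalldefs.h; the example_* helper names are hypothetical
 * and only serve to document the bit layout.
 */
#if 0
/* True if at least one thread is blocked on the lock in kernel-space. */
static inline bool
example_lock_kernel_managed(cloudabi_lock_t v)
{

	return ((v & CLOUDABI_LOCK_KERNEL_MANAGED) != 0);
}

/* True if the lock is held by a writer. */
static inline bool
example_lock_wrlocked(cloudabi_lock_t v)
{

	return ((v & CLOUDABI_LOCK_WRLOCKED) != 0);
}

/*
 * Remaining 30 bits: the thread ID of the owner if write-locked,
 * otherwise the number of read locks currently held.
 */
static inline uint32_t
example_lock_payload(cloudabi_lock_t v)
{

	return (v & ~(CLOUDABI_LOCK_KERNEL_MANAGED | CLOUDABI_LOCK_WRLOCKED));
}
#endif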

struct futex_address;
struct futex_condvar;
struct futex_lock;
struct futex_queue;
struct futex_waiter;

/* Identifier of a location in memory. */
struct futex_address {
	/* For process-private objects: address space of the process. */
	struct vmspace *		fa_vmspace;
	/* For process-shared objects: VM object containing the object. */
	struct vm_object *		fa_vmobject;

	/* Memory address within address space or offset within VM object. */
	uintptr_t			fa_offset;
};

/* A set of waiting threads. */
struct futex_queue {
	STAILQ_HEAD(, futex_waiter)	fq_list;
	unsigned int			fq_count;
};

/* Condition variables. */
struct futex_condvar {
	/* Address of the condition variable. */
	struct futex_address		fc_address;

	/* The lock the waiters should be moved to when signalled. */
	struct futex_lock *		fc_lock;

	/* Threads waiting on the condition variable. */
	struct futex_queue		fc_waiters;
	/*
	 * Number of threads blocked on this condition variable, or
	 * being blocked on the lock after being requeued.
	 */
	unsigned int			fc_waitcount;

	/* Global list pointers. */
	LIST_ENTRY(futex_condvar)	fc_next;
};

/* Read-write locks. */
struct futex_lock {
	/* Address of the lock. */
	struct futex_address		fl_address;

	/*
	 * Current owner of the lock. LOCK_UNMANAGED if the lock is
	 * currently not owned by the kernel. LOCK_OWNER_UNKNOWN in case
	 * the owner is not known (e.g., when the lock is read-locked).
	 */
	cloudabi_tid_t			fl_owner;
#define LOCK_UNMANAGED 0x0
#define LOCK_OWNER_UNKNOWN 0x1

	/* Writers blocked on the lock. */
	struct futex_queue		fl_writers;
	/* Readers blocked on the lock. */
	struct futex_queue		fl_readers;
	/*
	 * Number of threads blocked on this lock, plus the number
	 * blocked on its associated condition variables.
	 */
	unsigned int			fl_waitcount;

	/* Global list pointers. */
	LIST_ENTRY(futex_lock)		fl_next;
};

/* Information associated with a thread blocked on an object. */
struct futex_waiter {
	/* Thread ID. */
	cloudabi_tid_t			fw_tid;
	/* Condition variable used for waiting. */
	struct cv			fw_wait;

	/* Queue this waiter is currently placed in. */
	struct futex_queue *		fw_queue;
	/* List pointers of fw_queue. */
	STAILQ_ENTRY(futex_waiter)	fw_next;

	/* Lock has been acquired. */
	bool				fw_locked;
	/* If not locked, threads that should block after acquiring. */
	struct futex_queue		fw_donated;
};
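
/*
 * How these structures relate: a futex_lock owns two queues of blocked
 * threads (fl_readers and fl_writers), while a futex_condvar is tied
 * to exactly one futex_lock and owns a single queue of waiters. Each
 * futex_waiter lives on the stack of a blocked thread and migrates
 * between these queues as threads are requeued from condition
 * variables to their associated lock.
 */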

/* Global data structures. */
static MALLOC_DEFINE(M_FUTEX, "futex", "CloudABI futex");

static struct sx futex_global_lock;
SX_SYSINIT(futex_global_lock, &futex_global_lock, "CloudABI futex global lock");

static LIST_HEAD(, futex_lock) futex_lock_list =
    LIST_HEAD_INITIALIZER(&futex_lock_list);
static LIST_HEAD(, futex_condvar) futex_condvar_list =
    LIST_HEAD_INITIALIZER(&futex_condvar_list);

/* Utility functions. */
static void futex_lock_assert(const struct futex_lock *);
static struct futex_lock *futex_lock_lookup_locked(struct futex_address *);
static void futex_lock_release(struct futex_lock *);
static int futex_lock_tryrdlock(struct futex_lock *, cloudabi_lock_t *);
static int futex_lock_unmanage(struct futex_lock *, cloudabi_lock_t *);
static int futex_lock_update_owner(struct futex_lock *, cloudabi_lock_t *);
static int futex_lock_wake_up_next(struct futex_lock *, cloudabi_lock_t *);
static unsigned int futex_queue_count(const struct futex_queue *);
static void futex_queue_init(struct futex_queue *);
static void futex_queue_requeue(struct futex_queue *, struct futex_queue *,
    unsigned int);
static int futex_queue_sleep(struct futex_queue *, struct futex_lock *,
    struct futex_waiter *, struct thread *, cloudabi_clockid_t,
    cloudabi_timestamp_t, cloudabi_timestamp_t);
static cloudabi_tid_t futex_queue_tid_best(const struct futex_queue *);
static void futex_queue_wake_up_all(struct futex_queue *);
static void futex_queue_wake_up_best(struct futex_queue *);
static void futex_queue_wake_up_donate(struct futex_queue *, unsigned int);
static int futex_user_load(uint32_t *, uint32_t *);
static int futex_user_store(uint32_t *, uint32_t);
static int futex_user_cmpxchg(uint32_t *, uint32_t, uint32_t *, uint32_t);

/*
 * futex_address operations.
 */

static int
futex_address_create(struct futex_address *fa, struct thread *td,
    const void *object, cloudabi_mflags_t scope)
{
	struct vmspace *vs;
	struct vm_object *vo;
	vm_map_t map;
	vm_map_entry_t entry;
	vm_pindex_t pindex;
	vm_prot_t prot;
	boolean_t wired;

	/*
	 * Most of the time objects are stored in privately mapped
	 * anonymous memory. For these objects we wouldn't need to look
	 * up the corresponding VM object. The scope hint provided by
	 * userspace allows us to skip the VM map lookup for the common
	 * case.
	 *
	 * POSIX does permit enabling PTHREAD_PROCESS_SHARED on a lock
	 * stored in a private mapping, at the cost of additional
	 * performance overhead. Fall back to identifying the object by
	 * virtual memory address if the mapping isn't shared.
	 */
	vs = td->td_proc->p_vmspace;
	switch (scope) {
	case CLOUDABI_MAP_SHARED:
		map = &vs->vm_map;
		if (vm_map_lookup(&map, (vm_offset_t)object,
		    VM_PROT_COPY | VM_PROT_WRITE, &entry, &vo, &pindex, &prot,
		    &wired) != KERN_SUCCESS)
			return (EFAULT);

		if (entry->inheritance == VM_INHERIT_SHARE) {
			/*
			 * Address corresponds to a shared mapping.
			 * Identify the address by its VM object.
			 */
			fa->fa_vmspace = NULL;
			fa->fa_vmobject = vo;
			vm_object_reference(vo);
			fa->fa_offset = entry->offset - entry->start +
			    (vm_offset_t)object;
			vm_map_lookup_done(map, entry);
			return (0);
		}
		vm_map_lookup_done(map, entry);
		/* FALLTHROUGH */
	case CLOUDABI_MAP_PRIVATE:
		/*
		 * Address corresponds to a private mapping. Never
		 * identify the address by its VM object, as shadow
		 * objects may get inserted if another thread forks.
		 * Simply use the VM space instead.
		 */
		fa->fa_vmspace = vs;
		fa->fa_vmobject = NULL;
		fa->fa_offset = (uintptr_t)object;
		return (0);
	default:
		return (EINVAL);
	}
}

static void
futex_address_free(struct futex_address *fa)
{

	if (fa->fa_vmobject != NULL)
		vm_object_deallocate(fa->fa_vmobject);
}

static bool
futex_address_match(const struct futex_address *fa1,
    const struct futex_address *fa2)
{

	/* Either fa_vmspace or fa_vmobject is NULL. */
	return (fa1->fa_vmspace == fa2->fa_vmspace &&
	    fa1->fa_vmobject == fa2->fa_vmobject &&
	    fa1->fa_offset == fa2->fa_offset);
}

/*
 * futex_condvar operations.
 */

static void
futex_condvar_assert(const struct futex_condvar *fc)
{

	KASSERT(fc->fc_waitcount >= futex_queue_count(&fc->fc_waiters),
	    ("Total number of waiters cannot be smaller than the wait queue"));
	futex_lock_assert(fc->fc_lock);
}

static int
futex_condvar_lookup(struct thread *td, const cloudabi_condvar_t *address,
    cloudabi_mflags_t scope, struct futex_condvar **fcret)
{
	struct futex_address fa_condvar;
	struct futex_condvar *fc;
	int error;

	error = futex_address_create(&fa_condvar, td, address, scope);
	if (error != 0)
		return (error);

	sx_xlock(&futex_global_lock);
	LIST_FOREACH(fc, &futex_condvar_list, fc_next) {
		if (futex_address_match(&fc->fc_address, &fa_condvar)) {
			/* Found matching condition variable object. */
			futex_address_free(&fa_condvar);
			futex_condvar_assert(fc);
			*fcret = fc;
			return (0);
		}
	}
	sx_xunlock(&futex_global_lock);
	futex_address_free(&fa_condvar);
	return (ENOENT);
}

static int
futex_condvar_lookup_or_create(struct thread *td,
    const cloudabi_condvar_t *condvar, cloudabi_mflags_t condvar_scope,
    const cloudabi_lock_t *lock, cloudabi_mflags_t lock_scope,
    struct futex_condvar **fcret)
{
	struct futex_address fa_condvar, fa_lock;
	struct futex_condvar *fc;
	struct futex_lock *fl;
	int error;

	error = futex_address_create(&fa_condvar, td, condvar, condvar_scope);
	if (error != 0)
		return (error);
	error = futex_address_create(&fa_lock, td, lock, lock_scope);
	if (error != 0) {
		futex_address_free(&fa_condvar);
		return (error);
	}

	sx_xlock(&futex_global_lock);
	LIST_FOREACH(fc, &futex_condvar_list, fc_next) {
		if (!futex_address_match(&fc->fc_address, &fa_condvar))
			continue;
		fl = fc->fc_lock;
		if (!futex_address_match(&fl->fl_address, &fa_lock)) {
			/* Condition variable is owned by a different lock. */
			futex_address_free(&fa_condvar);
			futex_address_free(&fa_lock);
			sx_xunlock(&futex_global_lock);
			return (EINVAL);
		}

		/* Found fully matching condition variable. */
		futex_address_free(&fa_condvar);
		futex_address_free(&fa_lock);
		futex_condvar_assert(fc);
		*fcret = fc;
		return (0);
	}

	/* None found. Create new condition variable object. */
	fc = malloc(sizeof(*fc), M_FUTEX, M_WAITOK);
	fc->fc_address = fa_condvar;
	fc->fc_lock = futex_lock_lookup_locked(&fa_lock);
	futex_queue_init(&fc->fc_waiters);
	fc->fc_waitcount = 0;
	LIST_INSERT_HEAD(&futex_condvar_list, fc, fc_next);
	*fcret = fc;
	return (0);
}

static void
futex_condvar_release(struct futex_condvar *fc)
{
	struct futex_lock *fl;

	futex_condvar_assert(fc);
	fl = fc->fc_lock;
	if (fc->fc_waitcount == 0) {
		/* Condition variable has no waiters. Deallocate it. */
		futex_address_free(&fc->fc_address);
		LIST_REMOVE(fc, fc_next);
		free(fc, M_FUTEX);
	}
	futex_lock_release(fl);
}

static int
futex_condvar_unmanage(struct futex_condvar *fc,
    cloudabi_condvar_t *condvar)
{

	if (futex_queue_count(&fc->fc_waiters) != 0)
		return (0);
	return (futex_user_store(condvar, CLOUDABI_CONDVAR_HAS_NO_WAITERS));
}

/*
 * futex_lock operations.
 */

static void
futex_lock_assert(const struct futex_lock *fl)
{

	/*
	 * A futex lock can only be kernel-managed if it has waiters.
	 * Vice versa: if a futex lock has waiters, it must be
	 * kernel-managed.
	 */
	KASSERT((fl->fl_owner == LOCK_UNMANAGED) ==
	    (futex_queue_count(&fl->fl_readers) == 0 &&
	    futex_queue_count(&fl->fl_writers) == 0),
	    ("Managed locks must have waiting threads"));
	KASSERT(fl->fl_waitcount != 0 || fl->fl_owner == LOCK_UNMANAGED,
	    ("Lock with no waiters must be unmanaged"));
}

static int
futex_lock_lookup(struct thread *td, const cloudabi_lock_t *address,
    cloudabi_mflags_t scope, struct futex_lock **flret)
{
	struct futex_address fa;
	int error;

	error = futex_address_create(&fa, td, address, scope);
	if (error != 0)
		return (error);

	sx_xlock(&futex_global_lock);
	*flret = futex_lock_lookup_locked(&fa);
	return (0);
}

static struct futex_lock *
futex_lock_lookup_locked(struct futex_address *fa)
{
	struct futex_lock *fl;

	LIST_FOREACH(fl, &futex_lock_list, fl_next) {
		if (futex_address_match(&fl->fl_address, fa)) {
			/* Found matching lock object. */
			futex_address_free(fa);
			futex_lock_assert(fl);
			return (fl);
		}
	}

	/* None found. Create new lock object. */
	fl = malloc(sizeof(*fl), M_FUTEX, M_WAITOK);
	fl->fl_address = *fa;
	fl->fl_owner = LOCK_UNMANAGED;
	futex_queue_init(&fl->fl_readers);
	futex_queue_init(&fl->fl_writers);
	fl->fl_waitcount = 0;
	LIST_INSERT_HEAD(&futex_lock_list, fl, fl_next);
	return (fl);
}

static int
futex_lock_rdlock(struct futex_lock *fl, struct thread *td,
    cloudabi_lock_t *lock, cloudabi_clockid_t clock_id,
    cloudabi_timestamp_t timeout, cloudabi_timestamp_t precision)
{
	struct futex_waiter fw;
	int error;

	error = futex_lock_tryrdlock(fl, lock);
	if (error == EBUSY) {
		/* Suspend execution. */
		KASSERT(fl->fl_owner != LOCK_UNMANAGED,
		    ("Attempted to sleep on an unmanaged lock"));
		error = futex_queue_sleep(&fl->fl_readers, fl, &fw, td,
		    clock_id, timeout, precision);
		KASSERT((error == 0) == fw.fw_locked,
		    ("Should have locked read lock on success"));
		KASSERT(futex_queue_count(&fw.fw_donated) == 0,
		    ("Lock functions cannot receive threads"));
	}
	if (error != 0)
		futex_lock_unmanage(fl, lock);
	return (error);
}

static void
futex_lock_release(struct futex_lock *fl)
{

	futex_lock_assert(fl);
	if (fl->fl_waitcount == 0) {
		/* Lock object is unreferenced. Deallocate it. */
		KASSERT(fl->fl_owner == LOCK_UNMANAGED,
		    ("Attempted to free a managed lock"));
		futex_address_free(&fl->fl_address);
		LIST_REMOVE(fl, fl_next);
		free(fl, M_FUTEX);
	}
	sx_xunlock(&futex_global_lock);
}

static int
futex_lock_unmanage(struct futex_lock *fl, cloudabi_lock_t *lock)
{
	cloudabi_lock_t cmp, old;
	int error;

	if (futex_queue_count(&fl->fl_readers) == 0 &&
	    futex_queue_count(&fl->fl_writers) == 0) {
		/* Lock should be unmanaged. */
		fl->fl_owner = LOCK_UNMANAGED;

		/* Clear kernel-managed bit. */
		error = futex_user_load(lock, &old);
		if (error != 0)
			return (error);
		for (;;) {
			cmp = old;
			error = futex_user_cmpxchg(lock, cmp, &old,
			    cmp & ~CLOUDABI_LOCK_KERNEL_MANAGED);
			if (error != 0)
				return (error);
			if (old == cmp)
				break;
		}
	}
	return (0);
}

/* Sets an owner of a lock, based on a userspace lock value. */
static void
futex_lock_set_owner(struct futex_lock *fl, cloudabi_lock_t lock)
{

	/* Lock has no explicit owner. */
	if ((lock & ~CLOUDABI_LOCK_WRLOCKED) == 0) {
		fl->fl_owner = LOCK_OWNER_UNKNOWN;
		return;
	}
	lock &= ~(CLOUDABI_LOCK_WRLOCKED | CLOUDABI_LOCK_KERNEL_MANAGED);

	/* Don't allow userspace to silently unlock. */
	if (lock == LOCK_UNMANAGED) {
		fl->fl_owner = LOCK_OWNER_UNKNOWN;
		return;
	}
	fl->fl_owner = lock;
}

static int
futex_lock_unlock(struct futex_lock *fl, struct thread *td,
    cloudabi_lock_t *lock)
{
	int error;

	/* Validate that this thread is allowed to unlock. */
	error = futex_lock_update_owner(fl, lock);
	if (error != 0)
		return (error);
	if (fl->fl_owner != LOCK_UNMANAGED && fl->fl_owner != td->td_tid)
		return (EPERM);
	return (futex_lock_wake_up_next(fl, lock));
}

/* Syncs in the owner of the lock from userspace if needed. */
static int
futex_lock_update_owner(struct futex_lock *fl, cloudabi_lock_t *address)
{
	cloudabi_lock_t lock;
	int error;

	if (fl->fl_owner == LOCK_OWNER_UNKNOWN) {
		error = futex_user_load(address, &lock);
		if (error != 0)
			return (error);
		futex_lock_set_owner(fl, lock);
	}
	return (0);
}

static int
futex_lock_tryrdlock(struct futex_lock *fl, cloudabi_lock_t *address)
{
	cloudabi_lock_t old, cmp;
	int error;

	if (fl->fl_owner != LOCK_UNMANAGED) {
		/* Lock is already acquired. */
		return (EBUSY);
	}

	old = CLOUDABI_LOCK_UNLOCKED;
	for (;;) {
		if ((old & CLOUDABI_LOCK_KERNEL_MANAGED) != 0) {
			/*
			 * Userspace claims the lock is kernel-managed,
			 * even though the kernel disagrees.
			 */
			return (EINVAL);
		}

		if ((old & CLOUDABI_LOCK_WRLOCKED) == 0) {
			/*
			 * Lock is not write-locked. Attempt to acquire
			 * it by increasing the read count.
			 */
			cmp = old;
			error = futex_user_cmpxchg(address, cmp, &old, cmp + 1);
			if (error != 0)
				return (error);
			if (old == cmp) {
				/* Success. */
				return (0);
			}
		} else {
			/* Lock is write-locked. Make it kernel-managed. */
			cmp = old;
			error = futex_user_cmpxchg(address, cmp, &old,
			    cmp | CLOUDABI_LOCK_KERNEL_MANAGED);
			if (error != 0)
				return (error);
			if (old == cmp) {
				/* Success. */
				futex_lock_set_owner(fl, cmp);
				return (EBUSY);
			}
		}
	}
}

static int
futex_lock_trywrlock(struct futex_lock *fl, cloudabi_lock_t *address,
    cloudabi_tid_t tid, bool force_kernel_managed)
{
	cloudabi_lock_t old, new, cmp;
	int error;

	if (fl->fl_owner == tid) {
		/* Attempted to acquire lock recursively. */
		return (EDEADLK);
	}
	if (fl->fl_owner != LOCK_UNMANAGED) {
		/* Lock is already acquired. */
		return (EBUSY);
	}

	old = CLOUDABI_LOCK_UNLOCKED;
	for (;;) {
		if ((old & CLOUDABI_LOCK_KERNEL_MANAGED) != 0) {
			/*
			 * Userspace claims the lock is kernel-managed,
			 * even though the kernel disagrees.
			 */
			return (EINVAL);
		}
		if (old == (tid | CLOUDABI_LOCK_WRLOCKED)) {
			/* Attempted to acquire lock recursively. */
			return (EDEADLK);
		}

		if (old == CLOUDABI_LOCK_UNLOCKED) {
			/* Lock is unlocked. Attempt to acquire it. */
			new = tid | CLOUDABI_LOCK_WRLOCKED;
			if (force_kernel_managed)
				new |= CLOUDABI_LOCK_KERNEL_MANAGED;
			error = futex_user_cmpxchg(address,
			    CLOUDABI_LOCK_UNLOCKED, &old, new);
			if (error != 0)
				return (error);
			if (old == CLOUDABI_LOCK_UNLOCKED) {
				/* Success. */
				if (force_kernel_managed)
					fl->fl_owner = tid;
				return (0);
			}
		} else {
			/* Lock is still locked. Make it kernel-managed. */
			cmp = old;
			error = futex_user_cmpxchg(address, cmp, &old,
			    cmp | CLOUDABI_LOCK_KERNEL_MANAGED);
			if (error != 0)
				return (error);
			if (old == cmp) {
				/* Success. */
				futex_lock_set_owner(fl, cmp);
				return (EBUSY);
			}
		}
	}
}

static int
futex_lock_wake_up_next(struct futex_lock *fl, cloudabi_lock_t *lock)
{
	cloudabi_tid_t tid;
	int error;

	/*
	 * Determine which thread(s) to wake up. Prefer waking up
	 * writers over readers to prevent write starvation.
	 */
	if (futex_queue_count(&fl->fl_writers) > 0) {
		/* Transfer ownership to a single write-locker. */
		if (futex_queue_count(&fl->fl_writers) > 1 ||
		    futex_queue_count(&fl->fl_readers) > 0) {
			/* Lock should remain managed afterwards. */
			tid = futex_queue_tid_best(&fl->fl_writers);
			error = futex_user_store(lock,
			    tid | CLOUDABI_LOCK_WRLOCKED |
			    CLOUDABI_LOCK_KERNEL_MANAGED);
			if (error != 0)
				return (error);

			futex_queue_wake_up_best(&fl->fl_writers);
			fl->fl_owner = tid;
		} else {
			/* Lock can become unmanaged afterwards. */
			error = futex_user_store(lock,
			    futex_queue_tid_best(&fl->fl_writers) |
			    CLOUDABI_LOCK_WRLOCKED);
			if (error != 0)
				return (error);

			futex_queue_wake_up_best(&fl->fl_writers);
			fl->fl_owner = LOCK_UNMANAGED;
		}
	} else {
		/* Transfer ownership to all read-lockers (if any). */
		error = futex_user_store(lock,
		    futex_queue_count(&fl->fl_readers));
		if (error != 0)
			return (error);

		/* Wake up all threads. */
		futex_queue_wake_up_all(&fl->fl_readers);
		fl->fl_owner = LOCK_UNMANAGED;
	}
	return (0);
}

static int
futex_lock_wrlock(struct futex_lock *fl, struct thread *td,
    cloudabi_lock_t *lock, cloudabi_clockid_t clock_id,
    cloudabi_timestamp_t timeout, cloudabi_timestamp_t precision,
    struct futex_queue *donated)
{
	struct futex_waiter fw;
	int error;

	error = futex_lock_trywrlock(fl, lock, td->td_tid,
	    futex_queue_count(donated) > 0);

	if (error == 0 || error == EBUSY) {
		/* Put donated threads in queue before suspending. */
		KASSERT(futex_queue_count(donated) == 0 ||
		    fl->fl_owner != LOCK_UNMANAGED,
		    ("Lock should be managed if we are going to donate"));
		futex_queue_requeue(donated, &fl->fl_writers, UINT_MAX);
	} else {
		/*
		 * This thread cannot deal with the donated threads.
		 * Wake up the next thread and let it handle them instead.
		 */
		futex_queue_wake_up_donate(donated, UINT_MAX);
	}

	if (error == EBUSY) {
		/* Suspend execution if the lock was busy. */
		KASSERT(fl->fl_owner != LOCK_UNMANAGED,
		    ("Attempted to sleep on an unmanaged lock"));
		error = futex_queue_sleep(&fl->fl_writers, fl, &fw, td,
		    clock_id, timeout, precision);
		KASSERT((error == 0) == fw.fw_locked,
		    ("Should have locked write lock on success"));
		KASSERT(futex_queue_count(&fw.fw_donated) == 0,
		    ("Lock functions cannot receive threads"));
	}
	if (error != 0)
		futex_lock_unmanage(fl, lock);
	return (error);
}

/*
 * futex_queue operations.
 */

static cloudabi_tid_t
futex_queue_tid_best(const struct futex_queue *fq)
{

	return (STAILQ_FIRST(&fq->fq_list)->fw_tid);
}

static unsigned int
futex_queue_count(const struct futex_queue *fq)
{

	return (fq->fq_count);
}

static void
futex_queue_init(struct futex_queue *fq)
{

	STAILQ_INIT(&fq->fq_list);
	fq->fq_count = 0;
}

/* Converts a relative timestamp to an sbintime. */
static sbintime_t
futex_queue_convert_timestamp_relative(cloudabi_timestamp_t ts)
{
	cloudabi_timestamp_t s, ns;

	s = ts / 1000000000;
	ns = ts % 1000000000;
	if (s > INT32_MAX)
		return (INT64_MAX);
	return ((s << 32) + (ns << 32) / 1000000000);
}
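
/*
 * Worked example of the conversion above: ts = 1500000000 (1.5 s)
 * splits into s = 1 and ns = 500000000, so the result is
 * (1 << 32) + (500000000 << 32) / 1000000000 = 0x180000000, which is
 * 1.5 in the 32.32 fixed-point representation used by sbintime_t.
 */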

/* Converts an absolute timestamp and precision to a pair of sbintime values. */
static int
futex_queue_convert_timestamp(struct thread *td, cloudabi_clockid_t clock_id,
    cloudabi_timestamp_t timeout, cloudabi_timestamp_t precision,
    sbintime_t *sbttimeout, sbintime_t *sbtprecision)
{
	cloudabi_timestamp_t now;
	int error;

	/* Make the time relative. */
	error = cloudabi_clock_time_get(td, clock_id, &now);
	if (error != 0)
		return (error);
	timeout = timeout < now ? 0 : timeout - now;

	*sbttimeout = futex_queue_convert_timestamp_relative(timeout);
	*sbtprecision = futex_queue_convert_timestamp_relative(precision);
	return (0);
}

static int
futex_queue_sleep(struct futex_queue *fq, struct futex_lock *fl,
    struct futex_waiter *fw, struct thread *td, cloudabi_clockid_t clock_id,
    cloudabi_timestamp_t timeout, cloudabi_timestamp_t precision)
{
	sbintime_t sbttimeout, sbtprecision;
	int error;

	/* Initialize futex_waiter object. */
	fw->fw_tid = td->td_tid;
	fw->fw_locked = false;
	futex_queue_init(&fw->fw_donated);

	if (timeout != UINT64_MAX) {
		/* Convert timeout duration. */
		error = futex_queue_convert_timestamp(td, clock_id, timeout,
		    precision, &sbttimeout, &sbtprecision);
		if (error != 0)
			return (error);
	}

	/* Place object in the queue. */
	fw->fw_queue = fq;
	STAILQ_INSERT_TAIL(&fq->fq_list, fw, fw_next);
	++fq->fq_count;

	cv_init(&fw->fw_wait, "futex");
	++fl->fl_waitcount;

	futex_lock_assert(fl);
	if (timeout == UINT64_MAX) {
		/* Wait without a timeout. */
		error = cv_wait_sig(&fw->fw_wait, &futex_global_lock);
	} else {
		/* Wait respecting the timeout. */
		error = cv_timedwait_sig_sbt(&fw->fw_wait, &futex_global_lock,
		    sbttimeout, sbtprecision, 0);
		futex_lock_assert(fl);
		if (error == EWOULDBLOCK &&
		    fw->fw_queue != NULL && fw->fw_queue != fq) {
			/*
			 * We got signalled on a condition variable, but
			 * observed a timeout while waiting to reacquire
			 * the lock. In other words, we didn't actually
			 * time out. Go back to sleep and wait for the
			 * lock to be reacquired.
			 */
			error = cv_wait_sig(&fw->fw_wait, &futex_global_lock);
		}
	}
	futex_lock_assert(fl);

	--fl->fl_waitcount;
	cv_destroy(&fw->fw_wait);

	fq = fw->fw_queue;
	if (fq == NULL) {
		/* Thread got dequeued, so we've slept successfully. */
		return (0);
	}

	/* Thread is still enqueued. Remove it. */
	KASSERT(error != 0, ("Woken up thread is still enqueued"));
	STAILQ_REMOVE(&fq->fq_list, fw, futex_waiter, fw_next);
	--fq->fq_count;
	return (error == EWOULDBLOCK ? ETIMEDOUT : error);
}

/* Moves up to nwaiters waiters from one queue to another. */
static void
futex_queue_requeue(struct futex_queue *fqfrom, struct futex_queue *fqto,
    unsigned int nwaiters)
{
	struct futex_waiter *fw;

	/* Move waiters to the target queue. */
	while (nwaiters-- > 0 && !STAILQ_EMPTY(&fqfrom->fq_list)) {
		fw = STAILQ_FIRST(&fqfrom->fq_list);
		STAILQ_REMOVE_HEAD(&fqfrom->fq_list, fw_next);
		--fqfrom->fq_count;

		fw->fw_queue = fqto;
		STAILQ_INSERT_TAIL(&fqto->fq_list, fw, fw_next);
		++fqto->fq_count;
	}
}

/* Wakes up all waiters in a queue. */
static void
futex_queue_wake_up_all(struct futex_queue *fq)
{
	struct futex_waiter *fw;

	STAILQ_FOREACH(fw, &fq->fq_list, fw_next) {
		fw->fw_locked = true;
		fw->fw_queue = NULL;
		cv_signal(&fw->fw_wait);
	}

	STAILQ_INIT(&fq->fq_list);
	fq->fq_count = 0;
}

/*
 * Wakes up the best waiter (i.e., the waiter having the highest
 * priority) in a queue.
 */
static void
futex_queue_wake_up_best(struct futex_queue *fq)
{
	struct futex_waiter *fw;

	fw = STAILQ_FIRST(&fq->fq_list);
	fw->fw_locked = true;
	fw->fw_queue = NULL;
	cv_signal(&fw->fw_wait);

	STAILQ_REMOVE_HEAD(&fq->fq_list, fw_next);
	--fq->fq_count;
}

static void
futex_queue_wake_up_donate(struct futex_queue *fq, unsigned int nwaiters)
{
	struct futex_waiter *fw;

	fw = STAILQ_FIRST(&fq->fq_list);
	if (fw == NULL)
		return;
	fw->fw_locked = false;
	fw->fw_queue = NULL;
	cv_signal(&fw->fw_wait);

	STAILQ_REMOVE_HEAD(&fq->fq_list, fw_next);
	--fq->fq_count;
	futex_queue_requeue(fq, &fw->fw_donated, nwaiters);
}
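
/*
 * Example of the donation protocol described at the top of this file:
 * if three threads block on a condition variable whose lock is
 * unmanaged, futex_queue_wake_up_donate() wakes only the first waiter
 * (with fw_locked set to false) and moves the other two onto its
 * fw_donated queue. Once the woken thread reacquires the lock through
 * futex_lock_wrlock(), the donated waiters are requeued onto the
 * lock's writer queue, avoiding a thundering herd.
 */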

/*
 * futex_user operations. Used to adjust values in userspace.
 */

static int
futex_user_load(uint32_t *obj, uint32_t *val)
{

	return (fueword32(obj, val) != 0 ? EFAULT : 0);
}

static int
futex_user_store(uint32_t *obj, uint32_t val)
{

	return (suword32(obj, val) != 0 ? EFAULT : 0);
}

static int
futex_user_cmpxchg(uint32_t *obj, uint32_t cmp, uint32_t *old, uint32_t new)
{

	return (casueword32(obj, cmp, old, new) != 0 ? EFAULT : 0);
}
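
/*
 * Illustrative sketch of the canonical pattern in which these
 * primitives are combined throughout this file: load the current
 * value, then retry a compare-and-exchange until it either succeeds
 * or faults. The function name is hypothetical; compare
 * futex_lock_unmanage() above for a real instance.
 */
#if 0
static int
example_user_clear_bits(uint32_t *obj, uint32_t bits)
{
	uint32_t cmp, old;
	int error;

	error = futex_user_load(obj, &old);
	if (error != 0)
		return (error);
	for (;;) {
		cmp = old;
		error = futex_user_cmpxchg(obj, cmp, &old, cmp & ~bits);
		if (error != 0)
			return (error);
		if (old == cmp)
			return (0);
	}
}
#endif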

/*
 * Blocking calls: acquiring locks, waiting on condition variables.
 */

int
cloudabi_futex_condvar_wait(struct thread *td, cloudabi_condvar_t *condvar,
    cloudabi_mflags_t condvar_scope, cloudabi_lock_t *lock,
    cloudabi_mflags_t lock_scope, cloudabi_clockid_t clock_id,
    cloudabi_timestamp_t timeout, cloudabi_timestamp_t precision)
{
	struct futex_condvar *fc;
	struct futex_lock *fl;
	struct futex_waiter fw;
	int error, error2;

	/* Look up condition variable object. */
	error = futex_condvar_lookup_or_create(td, condvar, condvar_scope, lock,
	    lock_scope, &fc);
	if (error != 0)
		return (error);
	fl = fc->fc_lock;

	/*
	 * Set the condition variable to something other than
	 * CLOUDABI_CONDVAR_HAS_NO_WAITERS to make userspace threads
	 * call into the kernel to perform wakeups.
	 */
	error = futex_user_store(condvar, ~CLOUDABI_CONDVAR_HAS_NO_WAITERS);
	if (error != 0) {
		futex_condvar_release(fc);
		return (error);
	}

	/* Drop the lock. */
	error = futex_lock_unlock(fl, td, lock);
	if (error != 0) {
		futex_condvar_unmanage(fc, condvar);
		futex_condvar_release(fc);
		return (error);
	}

	/* Go to sleep. */
	++fc->fc_waitcount;
	error = futex_queue_sleep(&fc->fc_waiters, fc->fc_lock, &fw, td,
	    clock_id, timeout, precision);
	if (fw.fw_locked) {
		/* Waited and got the lock assigned to us. */
		KASSERT(futex_queue_count(&fw.fw_donated) == 0,
		    ("Received threads while being locked"));
	} else if (error == 0 || error == ETIMEDOUT) {
		if (error != 0)
			futex_condvar_unmanage(fc, condvar);
		/*
		 * Got woken up without having the lock assigned to us.
		 * This can happen in two cases:
		 *
		 * 1. We observed a timeout on a condition variable.
		 * 2. We got signalled on a condition variable while the
		 *    associated lock is unlocked. We are the first
		 *    thread that gets woken up. This thread is
		 *    responsible for reacquiring the userspace lock.
		 */
		error2 = futex_lock_wrlock(fl, td, lock,
		    CLOUDABI_CLOCK_MONOTONIC, UINT64_MAX, 0, &fw.fw_donated);
		if (error2 != 0)
			error = error2;
	} else {
		KASSERT(futex_queue_count(&fw.fw_donated) == 0,
		    ("Received threads on error"));
		futex_condvar_unmanage(fc, condvar);
		futex_lock_unmanage(fl, lock);
	}
	--fc->fc_waitcount;
	futex_condvar_release(fc);
	return (error);
}

int
cloudabi_futex_lock_rdlock(struct thread *td, cloudabi_lock_t *lock,
    cloudabi_mflags_t scope, cloudabi_clockid_t clock_id,
    cloudabi_timestamp_t timeout, cloudabi_timestamp_t precision)
{
	struct futex_lock *fl;
	int error;

	/* Look up lock object. */
	error = futex_lock_lookup(td, lock, scope, &fl);
	if (error != 0)
		return (error);

	error = futex_lock_rdlock(fl, td, lock, clock_id, timeout,
	    precision);
	futex_lock_release(fl);
	return (error);
}

int
cloudabi_futex_lock_wrlock(struct thread *td, cloudabi_lock_t *lock,
    cloudabi_mflags_t scope, cloudabi_clockid_t clock_id,
    cloudabi_timestamp_t timeout, cloudabi_timestamp_t precision)
{
	struct futex_lock *fl;
	struct futex_queue fq;
	int error;

	/* Look up lock object. */
	error = futex_lock_lookup(td, lock, scope, &fl);
	if (error != 0)
		return (error);

	futex_queue_init(&fq);
	error = futex_lock_wrlock(fl, td, lock, clock_id, timeout,
	    precision, &fq);
	futex_lock_release(fl);
	return (error);
}

/*
 * Non-blocking calls: releasing locks, signalling condition variables.
 */

int
cloudabi_sys_condvar_signal(struct thread *td,
    struct cloudabi_sys_condvar_signal_args *uap)
{
	struct futex_condvar *fc;
	struct futex_lock *fl;
	cloudabi_nthreads_t nwaiters;
	int error;

	nwaiters = uap->nwaiters;
	if (nwaiters == 0) {
		/* No threads to wake up. */
		return (0);
	}

	/* Look up futex object. */
	error = futex_condvar_lookup(td, uap->condvar, uap->scope, &fc);
	if (error != 0) {
		/* Race condition: condition variable with no waiters. */
		return (error == ENOENT ? 0 : error);
	}
	fl = fc->fc_lock;

	if (fl->fl_owner == LOCK_UNMANAGED) {
		/*
		 * The lock is currently not managed by the kernel,
		 * meaning we must attempt to acquire the userspace lock
		 * first. We cannot requeue threads to an unmanaged lock,
		 * as these threads will then never be scheduled.
		 *
		 * Unfortunately, the memory address of the lock is
		 * unknown from this context, meaning that we cannot
		 * acquire the lock on behalf of the first thread to be
		 * scheduled. The lock may not even be mapped within the
		 * address space of the current thread.
		 *
		 * To solve this, wake up a single waiter that will
		 * attempt to acquire the lock. Donate all of the other
		 * waiters that need to be woken up to this waiter, so
		 * it can requeue them after acquiring the lock.
		 */
		futex_queue_wake_up_donate(&fc->fc_waiters, nwaiters - 1);
	} else {
		/*
		 * Lock is already managed by the kernel. This makes it
		 * easy, as we can requeue the threads from the
		 * condition variable directly to the associated lock.
		 */
		futex_queue_requeue(&fc->fc_waiters, &fl->fl_writers, nwaiters);
	}

	/* Clear userspace condition variable if all waiters are gone. */
	error = futex_condvar_unmanage(fc, uap->condvar);
	futex_condvar_release(fc);
	return (error);
}

int
cloudabi_sys_lock_unlock(struct thread *td,
    struct cloudabi_sys_lock_unlock_args *uap)
{
	struct futex_lock *fl;
	int error;

	error = futex_lock_lookup(td, uap->lock, uap->scope, &fl);
	if (error != 0)
		return (error);
	error = futex_lock_unlock(fl, td, uap->lock);
	futex_lock_release(fl);
	return (error);
}