kern_umtx.c revision 179970
/*-
 * Copyright (c) 2004, David Xu <davidxu@freebsd.org>
 * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/kern_umtx.c 179970 2008-06-24 07:32:12Z davidxu $");

#include "opt_compat.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/eventhandler.h>
#include <sys/umtx.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>

#include <machine/cpu.h>

#ifdef COMPAT_IA32
#include <compat/freebsd32/freebsd32_proto.h>
#endif

#define TYPE_SIMPLE_WAIT	0
#define TYPE_CV			1
#define TYPE_SIMPLE_LOCK	2
#define TYPE_NORMAL_UMUTEX	3
#define TYPE_PI_UMUTEX		4
#define TYPE_PP_UMUTEX		5
#define TYPE_RWLOCK		6

#define _UMUTEX_TRY		1
#define _UMUTEX_WAIT		2

/* Key to represent a unique userland synchronization object */
struct umtx_key {
	int	hash;
	int	type;
	int	shared;
	union {
		struct {
			vm_object_t	object;
			uintptr_t	offset;
		} shared;
		struct {
			struct vmspace	*vs;
			uintptr_t	addr;
		} private;
		struct {
			void		*a;
			uintptr_t	b;
		} both;
	} info;
};

/* Priority inheritance mutex info. */
struct umtx_pi {
	/* Owner thread */
	struct thread		*pi_owner;

	/* Reference count */
	int			pi_refcount;

	/* List entry to link PI mutexes held by a thread */
	TAILQ_ENTRY(umtx_pi)	pi_link;

	/* List entry in hash */
	TAILQ_ENTRY(umtx_pi)	pi_hashlink;

	/* List for waiters */
	TAILQ_HEAD(,umtx_q)	pi_blocked;

	/* Identify a userland lock object */
	struct umtx_key		pi_key;
};

/* A userland synchronization object user. */
struct umtx_q {
	/* Linked list for the hash. */
	TAILQ_ENTRY(umtx_q)	uq_link;

	/* Umtx key. */
	struct umtx_key		uq_key;

	/* Umtx flags. */
	int			uq_flags;
#define UQF_UMTXQ	0x0001

	/* The waiting thread. */
	struct thread		*uq_thread;

	/*
	 * Blocked on PI mutex.  Reads can hold either the chain lock
	 * or umtx_lock; writes must hold both the chain lock and
	 * umtx_lock.
	 */
	struct umtx_pi		*uq_pi_blocked;

	/* On blocked list */
	TAILQ_ENTRY(umtx_q)	uq_lockq;

	/* PI mutexes we own that other threads contend for */
	TAILQ_HEAD(,umtx_pi)	uq_pi_contested;

	/* Inherited priority from PP mutex */
	u_char			uq_inherited_pri;
};

TAILQ_HEAD(umtxq_head, umtx_q);

/* Userland lock object's wait-queue chain */
struct umtxq_chain {
	/* Lock for this chain. */
	struct mtx		uc_lock;

	/* List of sleep queues. */
	struct umtxq_head	uc_queue[2];
#define UMTX_SHARED_QUEUE	0
#define UMTX_EXCLUSIVE_QUEUE	1

	/* Busy flag */
	char			uc_busy;

	/* Chain lock waiters */
	int			uc_waiters;

	/* All PIs in the list */
	TAILQ_HEAD(,umtx_pi)	uc_pi_list;
};

#define	UMTXQ_LOCKED_ASSERT(uc)		mtx_assert(&(uc)->uc_lock, MA_OWNED)

/*
 * Don't propagate time-sharing priority; there is a security reason:
 * a user could simply create a PI mutex, let thread A lock it, and
 * let another thread B block on it.  Because B is sleeping, its
 * priority would be boosted, and priority propagation would boost
 * A's priority as well, which would then never be lowered even if A
 * were using 100% CPU.  This is unfair to other processes.
 */

#define UPRI(td)	(((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\
			  (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\
			 PRI_MAX_TIMESHARE : (td)->td_user_pri)
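
/*
 * For example (illustrative, not normative): a thread whose
 * td_user_pri falls in the time-sharing range is treated by UPRI()
 * as if it had the worst time-sharing priority, PRI_MAX_TIMESHARE,
 * so its blocking lends nothing; a real-time thread's priority
 * passes through unchanged and can be lent to a lock owner:
 *
 *	pri = UPRI(td);			// clamped user priority
 *	if (pri < UPRI(owner))		// smaller value = higher priority
 *		sched_lend_user_prio(owner, pri);
 */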

#define	GOLDEN_RATIO_PRIME	2654404609U
#define	UMTX_CHAINS		128
#define	UMTX_SHIFTS		(__WORD_BIT - 7)

#define THREAD_SHARE		0
#define PROCESS_SHARE		1
#define AUTO_SHARE		2

#define	GET_SHARE(flags)	\
    (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)
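
/*
 * A note on the share modes (descriptive): THREAD_SHARE keys an
 * object by (vmspace, address) and so is private to one process;
 * PROCESS_SHARE keys it by (vm_object, offset) so two processes
 * mapping the same object resolve to the same key; AUTO_SHARE lets
 * umtx_key_get() pick between them from the VM map entry's
 * inheritance flag.  For umutexes the choice is driven by the
 * userland flag, e.g.:
 *
 *	share = GET_SHARE(fuword32(&m->m_flags));
 */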

#define BUSY_SPINS		200

static uma_zone_t		umtx_pi_zone;
static struct umtxq_chain	umtxq_chains[2][UMTX_CHAINS];
static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
static int			umtx_pi_allocated;

SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
    &umtx_pi_allocated, 0, "Allocated umtx_pi");

static void umtxq_sysinit(void *);
static void umtxq_hash(struct umtx_key *key);
static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
static void umtxq_lock(struct umtx_key *key);
static void umtxq_unlock(struct umtx_key *key);
static void umtxq_busy(struct umtx_key *key);
static void umtxq_unbusy(struct umtx_key *key);
static void umtxq_insert_queue(struct umtx_q *uq, int q);
static void umtxq_remove_queue(struct umtx_q *uq, int q);
static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo);
static int umtxq_count(struct umtx_key *key);
static int umtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2);
static int umtx_key_get(void *addr, int type, int share,
	struct umtx_key *key);
static void umtx_key_release(struct umtx_key *key);
static struct umtx_pi *umtx_pi_alloc(int);
static void umtx_pi_free(struct umtx_pi *pi);
static void umtx_pi_adjust_locked(struct thread *td, u_char oldpri);
static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags);
static void umtx_thread_cleanup(struct thread *td);
static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
	struct image_params *imgp __unused);
SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);

#define umtxq_signal(key, nwake)	umtxq_signal_queue((key), (nwake), UMTX_SHARED_QUEUE)
#define umtxq_insert(uq)	umtxq_insert_queue((uq), UMTX_SHARED_QUEUE)
#define umtxq_remove(uq)	umtxq_remove_queue((uq), UMTX_SHARED_QUEUE)

static struct mtx umtx_lock;

static void
umtxq_sysinit(void *arg __unused)
{
	int i, j;

	umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	for (i = 0; i < 2; ++i) {
		for (j = 0; j < UMTX_CHAINS; ++j) {
			mtx_init(&umtxq_chains[i][j].uc_lock, "umtxql", NULL,
				 MTX_DEF | MTX_DUPOK);
			TAILQ_INIT(&umtxq_chains[i][j].uc_queue[0]);
			TAILQ_INIT(&umtxq_chains[i][j].uc_queue[1]);
			TAILQ_INIT(&umtxq_chains[i][j].uc_pi_list);
			umtxq_chains[i][j].uc_busy = 0;
			umtxq_chains[i][j].uc_waiters = 0;
		}
	}
	mtx_init(&umtx_lock, "umtx lock", NULL, MTX_SPIN);
	EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
	    EVENTHANDLER_PRI_ANY);
}

struct umtx_q *
umtxq_alloc(void)
{
	struct umtx_q *uq;

	uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
	TAILQ_INIT(&uq->uq_pi_contested);
	uq->uq_inherited_pri = PRI_MAX;
	return (uq);
}

void
umtxq_free(struct umtx_q *uq)
{
	free(uq, M_UMTX);
}

static inline void
umtxq_hash(struct umtx_key *key)
{
	unsigned n = (uintptr_t)key->info.both.a + key->info.both.b;
	key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
}
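
/*
 * A note on the hash (descriptive): this is Fibonacci-style
 * multiplicative hashing.  Multiplying by GOLDEN_RATIO_PRIME scrambles
 * the key bits, and UMTX_SHIFTS (__WORD_BIT - 7) keeps the top seven
 * bits of the 32-bit product, which the modulo folds into the
 * UMTX_CHAINS (128) buckets, so nearby addresses spread across chains.
 */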

static inline int
umtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2)
{
	return (k1->type == k2->type &&
		k1->info.both.a == k2->info.both.a &&
		k1->info.both.b == k2->info.both.b);
}

static inline struct umtxq_chain *
umtxq_getchain(struct umtx_key *key)
{
	if (key->type <= TYPE_CV)
		return (&umtxq_chains[1][key->hash]);
	return (&umtxq_chains[0][key->hash]);
}
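
/*
 * A note on the two chain banks (descriptive): wait/wake objects
 * (TYPE_SIMPLE_WAIT, TYPE_CV) hash into umtxq_chains[1] while lock
 * objects hash into umtxq_chains[0], so a condition variable and its
 * associated mutex never contend for the same chain lock; MTX_DUPOK
 * above allows two such same-class chain locks to be held at once.
 */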

/*
 * Lock a chain.
 */
static inline void
umtxq_lock(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_lock(&uc->uc_lock);
}

/*
 * Unlock a chain.
 */
static inline void
umtxq_unlock(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_unlock(&uc->uc_lock);
}

/*
 * Set the chain to the busy state when a following operation
 * may block (a kernel mutex can not be held across it).
 */
static inline void
umtxq_busy(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_assert(&uc->uc_lock, MA_OWNED);
	if (uc->uc_busy) {
#ifdef SMP
		if (smp_cpus > 1) {
			int count = BUSY_SPINS;
			if (count > 0) {
				umtxq_unlock(key);
				while (uc->uc_busy && --count > 0)
					cpu_spinwait();
				umtxq_lock(key);
			}
		}
#endif
		while (uc->uc_busy) {
			uc->uc_waiters++;
			msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
			uc->uc_waiters--;
		}
	}
	uc->uc_busy = 1;
}
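
/*
 * Typical usage of the busy flag (a sketch of the pattern used
 * throughout this file, not a new API): the chain is marked busy, the
 * chain mutex is dropped across an operation that may fault or sleep
 * (e.g. a casuword32() on user memory), and the chain is unbusied
 * afterwards:
 *
 *	umtxq_lock(&key);
 *	umtxq_busy(&key);		// serialize against other wakers
 *	umtxq_unlock(&key);
 *	old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);
 *	umtxq_lock(&key);
 *	umtxq_unbusy(&key);
 *	umtxq_unlock(&key);
 */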

/*
 * Unbusy a chain.
 */
static inline void
umtxq_unbusy(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_assert(&uc->uc_lock, MA_OWNED);
	KASSERT(uc->uc_busy != 0, ("not busy"));
	uc->uc_busy = 0;
	if (uc->uc_waiters)
		wakeup_one(uc);
}

static inline void
umtxq_insert_queue(struct umtx_q *uq, int q)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_INSERT_TAIL(&uc->uc_queue[q], uq, uq_link);
	uq->uq_flags |= UQF_UMTXQ;
}

static inline void
umtxq_remove_queue(struct umtx_q *uq, int q)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	if (uq->uq_flags & UQF_UMTXQ) {
		TAILQ_REMOVE(&uc->uc_queue[q], uq, uq_link);
		uq->uq_flags &= ~UQF_UMTXQ;
	}
}

/*
 * Check if there are multiple waiters
 */
static int
umtxq_count(struct umtx_key *key)
{
	struct umtxq_chain *uc;
	struct umtx_q *uq;
	int count = 0;

	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_FOREACH(uq, &uc->uc_queue[UMTX_SHARED_QUEUE], uq_link) {
		if (umtx_key_match(&uq->uq_key, key)) {
			if (++count > 1)
				break;
		}
	}
	return (count);
}

/*
 * Check if there are multiple PI waiters and return the first
 * waiter.
 */
static int
umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
{
	struct umtxq_chain *uc;
	struct umtx_q *uq;
	int count = 0;

	*first = NULL;
	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_FOREACH(uq, &uc->uc_queue[UMTX_SHARED_QUEUE], uq_link) {
		if (umtx_key_match(&uq->uq_key, key)) {
			if (++count > 1)
				break;
			*first = uq;
		}
	}
	return (count);
}

/*
 * Wake up threads waiting on a userland object.
 */

static int
umtxq_signal_queue(struct umtx_key *key, int n_wake, int q)
{
	struct umtxq_chain *uc;
	struct umtx_q *uq, *next;
	int ret;

	ret = 0;
	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_FOREACH_SAFE(uq, &uc->uc_queue[q], uq_link, next) {
		if (umtx_key_match(&uq->uq_key, key)) {
			umtxq_remove_queue(uq, q);
			wakeup(uq);
			if (++ret >= n_wake)
				break;
		}
	}
	return (ret);
}

/*
 * Wake up the specified thread.
 */
static inline void
umtxq_signal_thread(struct umtx_q *uq)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	umtxq_remove(uq);
	wakeup(uq);
}

/*
 * Put the thread into a sleeping state; before sleeping, check
 * whether the thread was removed from the umtx queue.
 */
static inline int
umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo)
{
	struct umtxq_chain *uc;
	int error;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	if (!(uq->uq_flags & UQF_UMTXQ))
		return (0);
	error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
	if (error == EWOULDBLOCK)
		error = ETIMEDOUT;
	return (error);
}

/*
 * Convert a userspace address into a unique logical address.
 */
static int
umtx_key_get(void *addr, int type, int share, struct umtx_key *key)
{
	struct thread *td = curthread;
	vm_map_t map;
	vm_map_entry_t entry;
	vm_pindex_t pindex;
	vm_prot_t prot;
	boolean_t wired;

	key->type = type;
	if (share == THREAD_SHARE) {
		key->shared = 0;
		key->info.private.vs = td->td_proc->p_vmspace;
		key->info.private.addr = (uintptr_t)addr;
	} else {
		MPASS(share == PROCESS_SHARE || share == AUTO_SHARE);
		map = &td->td_proc->p_vmspace->vm_map;
		if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
		    &entry, &key->info.shared.object, &pindex, &prot,
		    &wired) != KERN_SUCCESS) {
			return (EFAULT);
		}

		if ((share == PROCESS_SHARE) ||
		    (share == AUTO_SHARE &&
		     VM_INHERIT_SHARE == entry->inheritance)) {
			key->shared = 1;
			key->info.shared.offset = entry->offset + entry->start -
				(vm_offset_t)addr;
			vm_object_reference(key->info.shared.object);
		} else {
			key->shared = 0;
			key->info.private.vs = td->td_proc->p_vmspace;
			key->info.private.addr = (uintptr_t)addr;
		}
		vm_map_lookup_done(map, entry);
	}

	umtxq_hash(key);
	return (0);
}
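
/*
 * For instance (descriptive, not new behavior): with THREAD_SHARE the
 * key is the pair (current vmspace, virtual address), so two processes
 * can never match; with PROCESS_SHARE, or AUTO_SHARE on an inheritable
 * mapping, the key is derived from the backing vm_object and an
 * object-relative offset, so any process mapping the same object
 * resolves to the same key.  The shared case takes a reference on the
 * object, which umtx_key_release() drops.
 */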

/*
 * Release a key.
 */
static inline void
umtx_key_release(struct umtx_key *key)
{
	if (key->shared)
		vm_object_deallocate(key->info.shared.object);
}

/*
 * Lock a umtx object.
 */
static int
_do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id, int timo)
{
	struct umtx_q *uq;
	u_long owner;
	u_long old;
	int error = 0;

	uq = td->td_umtxq;

	/*
	 * Care must be exercised when dealing with the umtx structure.
	 * Any access can fault.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword(&umtx->u_owner, UMTX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMTX_UNOWNED)
			return (0);

		/* The address was invalid. */
		if (owner == -1)
			return (EFAULT);

		/* If no one owns it but it is contested, try to acquire it. */
		if (owner == UMTX_CONTESTED) {
			owner = casuword(&umtx->u_owner,
			    UMTX_CONTESTED, id | UMTX_CONTESTED);

			if (owner == UMTX_CONTESTED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			/* If this failed the lock has changed, restart. */
			continue;
		}

		/*
		 * If we caught a signal, we have already retried and now
		 * exit immediately.
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK,
			AUTO_SHARE, &uq->uq_key)) != 0)
			return (error);

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword(&umtx->u_owner, owner, owner | UMTX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race to the
		 * thread unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtx", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);
	}

	return (0);
}
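
/*
 * The userland counterpart (a sketch of the protocol above, not code
 * from libthr): the fast path is a single compare-and-swap with the
 * caller's thread id, and only contention enters the kernel:
 *
 *	if (atomic_cmpset_acq_long(&umtx->u_owner, UMTX_UNOWNED, tid))
 *		return (0);		// uncontested acquire
 *	return (_umtx_lock(umtx));	// slow path: the loop above
 */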

/*
 * Lock a umtx object, with an optional timeout.
 */
static int
do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id,
	struct timespec *timeout)
{
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	int error;

	if (timeout == NULL) {
		error = _do_lock_umtx(td, umtx, id, 0);
		/* Mutex locking is restarted if it is interrupted. */
		if (error == EINTR)
			error = ERESTART;
	} else {
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
		for (;;) {
			error = _do_lock_umtx(td, umtx, id, tvtohz(&tv));
			if (error != ETIMEDOUT)
				break;
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				break;
			}
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
		}
		/* Timed-locking is not restarted. */
		if (error == ERESTART)
			error = EINTR;
	}
	return (error);
}

/*
 * Unlock a umtx object.
 */
static int
do_unlock_umtx(struct thread *td, struct umtx *umtx, u_long id)
{
	struct umtx_key key;
	u_long owner;
	u_long old;
	int error;
	int count;

	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword(__DEVOLATILE(u_long *, &umtx->u_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMTX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMTX_CONTESTED) == 0) {
		old = casuword(&umtx->u_owner, owner, UMTX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK, AUTO_SHARE,
		&key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * zero or one thread is waiting for it.  Otherwise, it must
	 * be marked as contested.
	 */
	old = casuword(&umtx->u_owner, owner,
		count <= 1 ? UMTX_UNOWNED : UMTX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}

#ifdef COMPAT_IA32

/*
 * Lock a 32-bit umtx object.
 */
static int
_do_lock_umtx32(struct thread *td, uint32_t *m, uint32_t id, int timo)
{
	struct umtx_q *uq;
	uint32_t owner;
	uint32_t old;
	int error = 0;

	uq = td->td_umtxq;

	/*
	 * Care must be exercised when dealing with the umtx structure.
	 * Any access can fault.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword32(m, UMUTEX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMUTEX_UNOWNED)
			return (0);

		/* The address was invalid. */
		if (owner == -1)
			return (EFAULT);

		/* If no one owns it but it is contested, try to acquire it. */
		if (owner == UMUTEX_CONTESTED) {
			owner = casuword32(m,
			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
			if (owner == UMUTEX_CONTESTED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			/* If this failed the lock has changed, restart. */
			continue;
		}

		/*
		 * If we caught a signal, we have already retried and now
		 * exit immediately.
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK,
			AUTO_SHARE, &uq->uq_key)) != 0)
			return (error);

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword32(m, owner, owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race to the
		 * thread unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtx", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);
	}

	return (0);
}

/*
 * Lock a 32-bit umtx object, with an optional timeout.
 */
static int
do_lock_umtx32(struct thread *td, void *m, uint32_t id,
	struct timespec *timeout)
{
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	int error;

	if (timeout == NULL) {
		error = _do_lock_umtx32(td, m, id, 0);
		/* Mutex locking is restarted if it is interrupted. */
		if (error == EINTR)
			error = ERESTART;
	} else {
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
		for (;;) {
			error = _do_lock_umtx32(td, m, id, tvtohz(&tv));
			if (error != ETIMEDOUT)
				break;
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				break;
			}
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
		}
		/* Timed-locking is not restarted. */
		if (error == ERESTART)
			error = EINTR;
	}
	return (error);
}

/*
 * Unlock a 32-bit umtx object.
 */
static int
do_unlock_umtx32(struct thread *td, uint32_t *m, uint32_t id)
{
	struct umtx_key key;
	uint32_t owner;
	uint32_t old;
	int error;
	int count;

	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(m);
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMUTEX_CONTESTED) == 0) {
		old = casuword32(m, owner, UMUTEX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK, AUTO_SHARE,
		&key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * zero or one thread is waiting for it.  Otherwise, it must
	 * be marked as contested.
	 */
	old = casuword32(m, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}
#endif

/*
 * Fetch and compare a value; sleep on the address if the value is
 * unchanged.
 */
static int
do_wait(struct thread *td, void *addr, u_long id,
	struct timespec *timeout, int compat32, int is_private)
{
	struct umtx_q *uq;
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	u_long tmp;
	int error = 0;

	uq = td->td_umtxq;
	if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT,
		is_private ? THREAD_SHARE : AUTO_SHARE, &uq->uq_key)) != 0)
		return (error);

	umtxq_lock(&uq->uq_key);
	umtxq_insert(uq);
	umtxq_unlock(&uq->uq_key);
	if (compat32 == 0)
		tmp = fuword(addr);
	else
		tmp = fuword32(addr);
	if (tmp != id) {
		umtxq_lock(&uq->uq_key);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	} else if (timeout == NULL) {
		umtxq_lock(&uq->uq_key);
		error = umtxq_sleep(uq, "uwait", 0);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	} else {
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
		umtxq_lock(&uq->uq_key);
		for (;;) {
			error = umtxq_sleep(uq, "uwait", tvtohz(&tv));
			if (!(uq->uq_flags & UQF_UMTXQ))
				break;
			if (error != ETIMEDOUT)
				break;
			umtxq_unlock(&uq->uq_key);
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				umtxq_lock(&uq->uq_key);
				break;
			}
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
			umtxq_lock(&uq->uq_key);
		}
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	}
	umtx_key_release(&uq->uq_key);
	if (error == ERESTART)
		error = EINTR;
	return (error);
}
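
/*
 * This is the futex-style primitive: the queue insert happens before
 * the user value is read, so a waker that changes the value and then
 * calls kern_umtx_wake() cannot slip in between the check and the
 * sleep.  A userland caller might use it like this (a sketch, not
 * libthr source; BLOCKED and gate are hypothetical):
 *
 *	while (atomic_load_acq_int(&gate) == BLOCKED)
 *		_umtx_op(&gate, UMTX_OP_WAIT_UINT, BLOCKED, NULL, NULL);
 */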

/*
 * Wake up threads sleeping on the specified address.
 */
int
kern_umtx_wake(struct thread *td, void *uaddr, int n_wake, int is_private)
{
	struct umtx_key key;
	int ret;

	if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT,
		is_private ? THREAD_SHARE : AUTO_SHARE, &key)) != 0)
		return (ret);
	umtxq_lock(&key);
	ret = umtxq_signal(&key, n_wake);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	return (0);
}

/*
 * Lock a PTHREAD_PRIO_NONE protocol POSIX mutex.
 */
static int
_do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags, int timo,
	int mode)
{
	struct umtx_q *uq;
	uint32_t owner, old, id;
	int error = 0;

	id = td->td_tid;
	uq = td->td_umtxq;

	/*
	 * Care must be exercised when dealing with the umtx structure.
	 * Any access can fault.
	 */
	for (;;) {
		owner = fuword32(__DEVOLATILE(void *, &m->m_owner));
		if (mode == _UMUTEX_WAIT) {
			if (owner == UMUTEX_UNOWNED || owner == UMUTEX_CONTESTED)
				return (0);
		} else {
			/*
			 * Try the uncontested case.  This should be done
			 * in userland.
			 */
			owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);

			/* The acquire succeeded. */
			if (owner == UMUTEX_UNOWNED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			/* If no one owns it but it is contested, try to acquire it. */
			if (owner == UMUTEX_CONTESTED) {
				owner = casuword32(&m->m_owner,
				    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

				if (owner == UMUTEX_CONTESTED)
					return (0);

				/* The address was invalid. */
				if (owner == -1)
					return (EFAULT);

				/* If this failed the lock has changed, restart. */
				continue;
			}
		}

		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
		    (owner & ~UMUTEX_CONTESTED) == id)
			return (EDEADLK);

		if (mode == _UMUTEX_TRY)
			return (EBUSY);

		/*
		 * If we caught a signal, we have already retried and now
		 * exit immediately.
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
		    GET_SHARE(flags), &uq->uq_key)) != 0)
			return (error);

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unbusy(&uq->uq_key);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race to the
		 * thread unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		umtxq_unbusy(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtxn", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);
	}

	return (0);
}

/*
 * Unlock a PTHREAD_PRIO_NONE protocol POSIX mutex.
 */
static int
do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags)
{
	struct umtx_key key;
	uint32_t owner, old, id;
	int error;
	int count;

	id = td->td_tid;
	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	if ((owner & UMUTEX_CONTESTED) == 0) {
		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * zero or one thread is waiting for it.  Otherwise, it must
	 * be marked as contested.
	 */
	old = casuword32(&m->m_owner, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}

/*
 * Check if the mutex is available and wake up a waiter;
 * this applies only to simple mutexes.
 */
static int
do_wake_umutex(struct thread *td, struct umutex *m)
{
	struct umtx_key key;
	uint32_t owner;
	uint32_t flags;
	int error;
	int count;

	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != 0)
		return (0);

	flags = fuword32(&m->m_flags);

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	if (count <= 1)
		owner = casuword32(&m->m_owner, UMUTEX_CONTESTED, UMUTEX_UNOWNED);

	umtxq_lock(&key);
	if (count != 0 && (owner & ~UMUTEX_CONTESTED) == 0)
		umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	return (0);
}

static inline struct umtx_pi *
umtx_pi_alloc(int flags)
{
	struct umtx_pi *pi;

	pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags);
	TAILQ_INIT(&pi->pi_blocked);
	atomic_add_int(&umtx_pi_allocated, 1);
	return (pi);
}

static inline void
umtx_pi_free(struct umtx_pi *pi)
{
	uma_zfree(umtx_pi_zone, pi);
	atomic_add_int(&umtx_pi_allocated, -1);
}

/*
 * Adjust the thread's position on the PI mutex's blocked list after
 * its priority has been changed.
 */
static int
umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
{
	struct umtx_q *uq, *uq1, *uq2;
	struct thread *td1;

	mtx_assert(&umtx_lock, MA_OWNED);
	if (pi == NULL)
		return (0);

	uq = td->td_umtxq;

	/*
	 * Check if the thread needs to be moved in the blocked chain.
	 * It needs to be moved if its priority is either better than
	 * the previous thread's or worse than the next thread's.
	 */
	uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
	uq2 = TAILQ_NEXT(uq, uq_lockq);
	if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
	    (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
		/*
		 * Remove thread from blocked chain and determine where
		 * it should be moved to.
		 */
		TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
		TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
			td1 = uq1->uq_thread;
			MPASS(td1->td_proc->p_magic == P_MAGIC);
			if (UPRI(td1) > UPRI(td))
				break;
		}

		if (uq1 == NULL)
			TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
		else
			TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
	}
	return (1);
}

/*
 * Propagate priority when a thread is blocked on a POSIX
 * PI mutex.
 */
static void
umtx_propagate_priority(struct thread *td)
{
	struct umtx_q *uq;
	struct umtx_pi *pi;
	int pri;

	mtx_assert(&umtx_lock, MA_OWNED);
	pri = UPRI(td);
	uq = td->td_umtxq;
	pi = uq->uq_pi_blocked;
	if (pi == NULL)
		return;

	for (;;) {
		td = pi->pi_owner;
		if (td == NULL)
			return;

		MPASS(td->td_proc != NULL);
		MPASS(td->td_proc->p_magic == P_MAGIC);

		if (UPRI(td) <= pri)
			return;

		thread_lock(td);
		sched_lend_user_prio(td, pri);
		thread_unlock(td);

		/*
		 * Pick up the lock that td is blocked on.
		 */
		uq = td->td_umtxq;
		pi = uq->uq_pi_blocked;
		/* Resort td on the list if needed. */
		if (!umtx_pi_adjust_thread(pi, td))
			break;
	}
}
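
/*
 * An example of the walk above (illustrative): if thread A blocks on
 * PI mutex M1 owned by B, and B is itself blocked on M2 owned by C,
 * the loop lends A's priority to B, follows B's uq_pi_blocked to M2,
 * and lends the priority to C as well.  It stops at the first owner
 * whose priority is already at least as good, or at an owner that is
 * not blocked on any PI mutex (uq_pi_blocked == NULL, so
 * umtx_pi_adjust_thread() returns 0).
 */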

/*
 * Unpropagate priority for a PI mutex when a thread blocked on
 * it is interrupted by a signal or resumed by others.
 */
static void
umtx_unpropagate_priority(struct umtx_pi *pi)
{
	struct umtx_q *uq, *uq_owner;
	struct umtx_pi *pi2;
	int pri, oldpri;

	mtx_assert(&umtx_lock, MA_OWNED);

	while (pi != NULL && pi->pi_owner != NULL) {
		pri = PRI_MAX;
		uq_owner = pi->pi_owner->td_umtxq;

		TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
			uq = TAILQ_FIRST(&pi2->pi_blocked);
			if (uq != NULL) {
				if (pri > UPRI(uq->uq_thread))
					pri = UPRI(uq->uq_thread);
			}
		}

		if (pri > uq_owner->uq_inherited_pri)
			pri = uq_owner->uq_inherited_pri;
		thread_lock(pi->pi_owner);
		oldpri = pi->pi_owner->td_user_pri;
		sched_unlend_user_prio(pi->pi_owner, pri);
		thread_unlock(pi->pi_owner);
		umtx_pi_adjust_locked(pi->pi_owner, oldpri);
		pi = uq_owner->uq_pi_blocked;
	}
}

/*
 * Insert a PI mutex into the owned list.
 */
static void
umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
{
	struct umtx_q *uq_owner;

	uq_owner = owner->td_umtxq;
	mtx_assert(&umtx_lock, MA_OWNED);
	if (pi->pi_owner != NULL)
		panic("pi_owner != NULL");
	pi->pi_owner = owner;
	TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
}

/*
 * Claim ownership of a PI mutex.
 */
static int
umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
{
	struct umtx_q *uq, *uq_owner;

	uq_owner = owner->td_umtxq;
	mtx_lock_spin(&umtx_lock);
	if (pi->pi_owner == owner) {
		mtx_unlock_spin(&umtx_lock);
		return (0);
	}

	if (pi->pi_owner != NULL) {
		/*
		 * userland may have already messed up the mutex, sigh.
		 */
		mtx_unlock_spin(&umtx_lock);
		return (EPERM);
	}
	umtx_pi_setowner(pi, owner);
	uq = TAILQ_FIRST(&pi->pi_blocked);
	if (uq != NULL) {
		int pri;

		pri = UPRI(uq->uq_thread);
		thread_lock(owner);
		if (pri < UPRI(owner))
			sched_lend_user_prio(owner, pri);
		thread_unlock(owner);
	}
	mtx_unlock_spin(&umtx_lock);
	return (0);
}

static void
umtx_pi_adjust_locked(struct thread *td, u_char oldpri)
{
	struct umtx_q *uq;
	struct umtx_pi *pi;

	uq = td->td_umtxq;
	/*
	 * Pick up the lock that td is blocked on.
	 */
	pi = uq->uq_pi_blocked;
	MPASS(pi != NULL);

	/* Resort the turnstile on the list. */
	if (!umtx_pi_adjust_thread(pi, td))
		return;

	/*
	 * If our priority was lowered and we are at the head of the
	 * turnstile, then propagate our new priority up the chain.
	 */
	if (uq == TAILQ_FIRST(&pi->pi_blocked) && UPRI(td) < oldpri)
		umtx_propagate_priority(td);
}

/*
 * Adjust a thread's position in the PI mutex it is blocked on;
 * this may start a new round of priority propagation.
 */
void
umtx_pi_adjust(struct thread *td, u_char oldpri)
{
	struct umtx_q *uq;
	struct umtx_pi *pi;

	uq = td->td_umtxq;
	mtx_lock_spin(&umtx_lock);
	/*
	 * Pick up the lock that td is blocked on.
	 */
	pi = uq->uq_pi_blocked;
	if (pi != NULL)
		umtx_pi_adjust_locked(td, oldpri);
	mtx_unlock_spin(&umtx_lock);
}

/*
 * Sleep on a PI mutex.
 */
static int
umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi,
	uint32_t owner, const char *wmesg, int timo)
{
	struct umtxq_chain *uc;
	struct thread *td, *td1;
	struct umtx_q *uq1;
	int pri;
	int error = 0;

	td = uq->uq_thread;
	KASSERT(td == curthread, ("inconsistent uq_thread"));
	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	umtxq_insert(uq);
	if (pi->pi_owner == NULL) {
		/* XXX
		 * Currently, we only support process-private PI mutexes;
		 * non-contended PI mutexes are locked in userland.
		 * Process-shared PI mutexes should always be initialized
		 * by the kernel and registered with the kernel, and
		 * locking should always be done by the kernel, to avoid
		 * security problems.  For a process-private PI mutex, we
		 * can find the owner thread and boost its priority safely.
		 */
		PROC_LOCK(curproc);
		td1 = thread_find(curproc, owner);
		mtx_lock_spin(&umtx_lock);
		if (td1 != NULL && pi->pi_owner == NULL) {
			uq1 = td1->td_umtxq;
			umtx_pi_setowner(pi, td1);
		}
		PROC_UNLOCK(curproc);
	} else {
		mtx_lock_spin(&umtx_lock);
	}

	TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
		pri = UPRI(uq1->uq_thread);
		if (pri > UPRI(td))
			break;
	}

	if (uq1 != NULL)
		TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
	else
		TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);

	uq->uq_pi_blocked = pi;
	thread_lock(td);
	td->td_flags |= TDF_UPIBLOCKED;
	thread_unlock(td);
	mtx_unlock_spin(&umtx_lock);
	umtxq_unlock(&uq->uq_key);

	mtx_lock_spin(&umtx_lock);
	umtx_propagate_priority(td);
	mtx_unlock_spin(&umtx_lock);

	umtxq_lock(&uq->uq_key);
	if (uq->uq_flags & UQF_UMTXQ) {
		error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
		if (error == EWOULDBLOCK)
			error = ETIMEDOUT;
		if (uq->uq_flags & UQF_UMTXQ) {
			umtxq_busy(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unbusy(&uq->uq_key);
		}
	}
	umtxq_unlock(&uq->uq_key);

	mtx_lock_spin(&umtx_lock);
	uq->uq_pi_blocked = NULL;
	thread_lock(td);
	td->td_flags &= ~TDF_UPIBLOCKED;
	thread_unlock(td);
	TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
	umtx_unpropagate_priority(pi);
	mtx_unlock_spin(&umtx_lock);

	umtxq_lock(&uq->uq_key);

	return (error);
}

/*
 * Add a reference to a PI mutex.
 */
static void
umtx_pi_ref(struct umtx_pi *pi)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&pi->pi_key);
	UMTXQ_LOCKED_ASSERT(uc);
	pi->pi_refcount++;
}

/*
 * Decrease the reference count of a PI mutex; if the counter
 * drops to zero, its memory is freed.
 */
static void
umtx_pi_unref(struct umtx_pi *pi)
{
	struct umtxq_chain *uc;
	int free = 0;

	uc = umtxq_getchain(&pi->pi_key);
	UMTXQ_LOCKED_ASSERT(uc);
	KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
	if (--pi->pi_refcount == 0) {
		mtx_lock_spin(&umtx_lock);
		if (pi->pi_owner != NULL) {
			TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested,
				pi, pi_link);
			pi->pi_owner = NULL;
		}
		KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
			("blocked queue not empty"));
		mtx_unlock_spin(&umtx_lock);
		TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
		free = 1;
	}
	if (free)
		umtx_pi_free(pi);
}

/*
 * Find a PI mutex in the hash table.
 */
static struct umtx_pi *
umtx_pi_lookup(struct umtx_key *key)
{
	struct umtxq_chain *uc;
	struct umtx_pi *pi;

	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);

	TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
		if (umtx_key_match(&pi->pi_key, key)) {
			return (pi);
		}
	}
	return (NULL);
}

/*
 * Insert a PI mutex into the hash table.
 */
static inline void
umtx_pi_insert(struct umtx_pi *pi)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&pi->pi_key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
}

/*
 * Lock a PI mutex.
 */
static int
_do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags, int timo,
	int try)
{
	struct umtx_q *uq;
	struct umtx_pi *pi, *new_pi;
	uint32_t id, owner, old;
	int error;

	id = td->td_tid;
	uq = td->td_umtxq;

	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
	    &uq->uq_key)) != 0)
		return (error);
	umtxq_lock(&uq->uq_key);
	pi = umtx_pi_lookup(&uq->uq_key);
	if (pi == NULL) {
		new_pi = umtx_pi_alloc(M_NOWAIT);
		if (new_pi == NULL) {
			umtxq_unlock(&uq->uq_key);
			new_pi = umtx_pi_alloc(M_WAITOK);
			new_pi->pi_key = uq->uq_key;
			umtxq_lock(&uq->uq_key);
			pi = umtx_pi_lookup(&uq->uq_key);
			if (pi != NULL) {
				umtx_pi_free(new_pi);
				new_pi = NULL;
			}
		}
		if (new_pi != NULL) {
			new_pi->pi_key = uq->uq_key;
			umtx_pi_insert(new_pi);
			pi = new_pi;
		}
	}
	umtx_pi_ref(pi);
	umtxq_unlock(&uq->uq_key);

	/*
	 * Care must be exercised when dealing with the umtx structure.
	 * Any access can fault.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMUTEX_UNOWNED) {
			error = 0;
			break;
		}

		/* The address was invalid. */
		if (owner == -1) {
			error = EFAULT;
			break;
		}

		/* If no one owns it but it is contested, try to acquire it. */
		if (owner == UMUTEX_CONTESTED) {
			owner = casuword32(&m->m_owner,
			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

			if (owner == UMUTEX_CONTESTED) {
				umtxq_lock(&uq->uq_key);
				error = umtx_pi_claim(pi, td);
				umtxq_unlock(&uq->uq_key);
				break;
			}

			/* The address was invalid. */
			if (owner == -1) {
				error = EFAULT;
				break;
			}

			/* If this failed the lock has changed, restart. */
			continue;
		}

		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
		    (owner & ~UMUTEX_CONTESTED) == id) {
			error = EDEADLK;
			break;
		}

		if (try != 0) {
			error = EBUSY;
			break;
		}

		/*
		 * If we caught a signal, we have already retried and now
		 * exit immediately.
		 */
		if (error != 0)
			break;

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_unbusy(&uq->uq_key);
			umtxq_unlock(&uq->uq_key);
			error = EFAULT;
			break;
		}

		umtxq_lock(&uq->uq_key);
		umtxq_unbusy(&uq->uq_key);
		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race to the
		 * thread unlocking the umtx.
		 */
		if (old == owner)
			error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED,
				 "umtxpi", timo);
		umtxq_unlock(&uq->uq_key);
	}

	umtxq_lock(&uq->uq_key);
	umtx_pi_unref(pi);
	umtxq_unlock(&uq->uq_key);

	umtx_key_release(&uq->uq_key);
	return (error);
}

/*
 * Unlock a PI mutex.
 */
static int
do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags)
{
	struct umtx_key key;
	struct umtx_q *uq_first, *uq_first2, *uq_me;
	struct umtx_pi *pi, *pi2;
	uint32_t owner, old, id;
	int error;
	int count;
	int pri;

	id = td->td_tid;
	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMUTEX_CONTESTED) == 0) {
		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count_pi(&key, &uq_first);
	if (uq_first != NULL) {
		pi = uq_first->uq_pi_blocked;
		if (pi->pi_owner != curthread) {
			umtxq_unbusy(&key);
			umtxq_unlock(&key);
			/* userland messed up the mutex */
			return (EPERM);
		}
		uq_me = curthread->td_umtxq;
		mtx_lock_spin(&umtx_lock);
		pi->pi_owner = NULL;
		TAILQ_REMOVE(&uq_me->uq_pi_contested, pi, pi_link);
		uq_first = TAILQ_FIRST(&pi->pi_blocked);
		pri = PRI_MAX;
		TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
			uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
			if (uq_first2 != NULL) {
				if (pri > UPRI(uq_first2->uq_thread))
					pri = UPRI(uq_first2->uq_thread);
			}
		}
		thread_lock(curthread);
		sched_unlend_user_prio(curthread, pri);
		thread_unlock(curthread);
		mtx_unlock_spin(&umtx_lock);
	}
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * zero or one thread is waiting for it.  Otherwise, it must
	 * be marked as contested.
	 */
	old = casuword32(&m->m_owner, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);

	umtxq_lock(&key);
	if (uq_first != NULL)
		umtxq_signal_thread(uq_first);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}

/*
 * Lock a PP mutex.
 */
static int
_do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags, int timo,
	int try)
{
	struct umtx_q *uq, *uq2;
	struct umtx_pi *pi;
	uint32_t ceiling;
	uint32_t owner, id;
	int error, pri, old_inherited_pri, su;

	id = td->td_tid;
	uq = td->td_umtxq;
	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
	    &uq->uq_key)) != 0)
		return (error);
	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
	for (;;) {
		old_inherited_pri = uq->uq_inherited_pri;
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		ceiling = RTP_PRIO_MAX - fuword32(&m->m_ceilings[0]);
		if (ceiling > RTP_PRIO_MAX) {
			error = EINVAL;
			goto out;
		}

		mtx_lock_spin(&umtx_lock);
		if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
			mtx_unlock_spin(&umtx_lock);
			error = EINVAL;
			goto out;
		}
		if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
			uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
			thread_lock(td);
			if (uq->uq_inherited_pri < UPRI(td))
				sched_lend_user_prio(td, uq->uq_inherited_pri);
			thread_unlock(td);
		}
		mtx_unlock_spin(&umtx_lock);

		owner = casuword32(&m->m_owner,
		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

		if (owner == UMUTEX_CONTESTED) {
			error = 0;
			break;
		}

		/* The address was invalid. */
		if (owner == -1) {
			error = EFAULT;
			break;
		}

		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
		    (owner & ~UMUTEX_CONTESTED) == id) {
			error = EDEADLK;
			break;
		}

		if (try != 0) {
			error = EBUSY;
			break;
		}

		/*
		 * If we caught a signal, we have already retried and now
		 * exit immediately.
		 */
		if (error != 0)
			break;

		umtxq_lock(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		error = umtxq_sleep(uq, "umtxpp", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);

		mtx_lock_spin(&umtx_lock);
		uq->uq_inherited_pri = old_inherited_pri;
		pri = PRI_MAX;
		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
			uq2 = TAILQ_FIRST(&pi->pi_blocked);
			if (uq2 != NULL) {
				if (pri > UPRI(uq2->uq_thread))
					pri = UPRI(uq2->uq_thread);
			}
		}
		if (pri > uq->uq_inherited_pri)
			pri = uq->uq_inherited_pri;
		thread_lock(td);
		sched_unlend_user_prio(td, pri);
		thread_unlock(td);
		mtx_unlock_spin(&umtx_lock);
	}

	if (error != 0) {
		mtx_lock_spin(&umtx_lock);
		uq->uq_inherited_pri = old_inherited_pri;
		pri = PRI_MAX;
		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
			uq2 = TAILQ_FIRST(&pi->pi_blocked);
			if (uq2 != NULL) {
				if (pri > UPRI(uq2->uq_thread))
					pri = UPRI(uq2->uq_thread);
			}
		}
		if (pri > uq->uq_inherited_pri)
			pri = uq->uq_inherited_pri;
		thread_lock(td);
		sched_unlend_user_prio(td, pri);
		thread_unlock(td);
		mtx_unlock_spin(&umtx_lock);
	}

out:
	umtxq_lock(&uq->uq_key);
	umtxq_unbusy(&uq->uq_key);
	umtxq_unlock(&uq->uq_key);
	umtx_key_release(&uq->uq_key);
	return (error);
}

/*
 * Unlock a PP mutex.
 */
static int
do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags)
{
	struct umtx_key key;
	struct umtx_q *uq, *uq2;
	struct umtx_pi *pi;
	uint32_t owner, id;
	uint32_t rceiling;
	int error, pri, new_inherited_pri, su;

	id = td->td_tid;
	uq = td->td_umtxq;
	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);

	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
	if (error != 0)
		return (error);

	if (rceiling == -1)
		new_inherited_pri = PRI_MAX;
	else {
		rceiling = RTP_PRIO_MAX - rceiling;
		if (rceiling > RTP_PRIO_MAX)
			return (EINVAL);
		new_inherited_pri = PRI_MIN_REALTIME + rceiling;
	}

	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);
	umtxq_lock(&key);
	umtxq_busy(&key);
	umtxq_unlock(&key);
	/*
	 * For a priority-protected mutex, always set the unlocked state
	 * to UMUTEX_CONTESTED so that userland always enters the kernel
	 * to lock the mutex; this is necessary because thread priority
	 * has to be adjusted for such mutexes.
	 */
	error = suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
		UMUTEX_CONTESTED);

	umtxq_lock(&key);
	if (error == 0)
		umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);

	if (error == -1)
		error = EFAULT;
	else {
		mtx_lock_spin(&umtx_lock);
		if (su != 0)
			uq->uq_inherited_pri = new_inherited_pri;
		pri = PRI_MAX;
		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
			uq2 = TAILQ_FIRST(&pi->pi_blocked);
			if (uq2 != NULL) {
				if (pri > UPRI(uq2->uq_thread))
					pri = UPRI(uq2->uq_thread);
			}
		}
		if (pri > uq->uq_inherited_pri)
			pri = uq->uq_inherited_pri;
		thread_lock(td);
		sched_unlend_user_prio(td, pri);
		thread_unlock(td);
		mtx_unlock_spin(&umtx_lock);
	}
	umtx_key_release(&key);
	return (error);
}
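
/*
 * A worked example of the ceiling arithmetic (symbolic, since the
 * priority ranges are machine-defined): a userland ceiling c in
 * [0, RTP_PRIO_MAX] maps to the kernel priority
 * PRI_MIN_REALTIME + (RTP_PRIO_MAX - c), so a larger ceiling yields a
 * numerically smaller, i.e. stronger, real-time priority; the special
 * value -1 in m_ceilings[1] restores PRI_MAX, meaning no inherited
 * priority at all.
 */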
2117
2118static int
2119do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
2120	uint32_t *old_ceiling)
2121{
2122	struct umtx_q *uq;
2123	uint32_t save_ceiling;
2124	uint32_t owner, id;
2125	uint32_t flags;
2126	int error;
2127
2128	flags = fuword32(&m->m_flags);
2129	if ((flags & UMUTEX_PRIO_PROTECT) == 0)
2130		return (EINVAL);
2131	if (ceiling > RTP_PRIO_MAX)
2132		return (EINVAL);
2133	id = td->td_tid;
2134	uq = td->td_umtxq;
2135	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2136	   &uq->uq_key)) != 0)
2137		return (error);
2138	for (;;) {
2139		umtxq_lock(&uq->uq_key);
2140		umtxq_busy(&uq->uq_key);
2141		umtxq_unlock(&uq->uq_key);
2142
2143		save_ceiling = fuword32(&m->m_ceilings[0]);
2144
2145		owner = casuword32(&m->m_owner,
2146		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
2147
2148		if (owner == UMUTEX_CONTESTED) {
2149			suword32(&m->m_ceilings[0], ceiling);
2150			suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
2151				UMUTEX_CONTESTED);
2152			error = 0;
2153			break;
2154		}
2155
2156		/* The address was invalid. */
2157		if (owner == -1) {
2158			error = EFAULT;
2159			break;
2160		}
2161
2162		if ((owner & ~UMUTEX_CONTESTED) == id) {
2163			suword32(&m->m_ceilings[0], ceiling);
2164			error = 0;
2165			break;
2166		}
2167
2168		/*
2169		 * If we caught a signal, we have retried and now
2170		 * exit immediately.
2171		 */
2172		if (error != 0)
2173			break;
2174
2175		/*
2176		 * We set the contested bit and must sleep.  Otherwise the
2177		 * lock word changed and we need to retry, or we lost a race
2178		 * with the thread unlocking the umtx.
2179		 */
2180		umtxq_lock(&uq->uq_key);
2181		umtxq_insert(uq);
2182		umtxq_unbusy(&uq->uq_key);
2183		error = umtxq_sleep(uq, "umtxpp", 0);
2184		umtxq_remove(uq);
2185		umtxq_unlock(&uq->uq_key);
2186	}
2187	umtxq_lock(&uq->uq_key);
2188	if (error == 0)
2189		umtxq_signal(&uq->uq_key, INT_MAX);
2190	umtxq_unbusy(&uq->uq_key);
2191	umtxq_unlock(&uq->uq_key);
2192	umtx_key_release(&uq->uq_key);
2193	if (error == 0 && old_ceiling != NULL)
2194		suword32(old_ceiling, save_ceiling);
2195	return (error);
2196}
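
/*
 * A hedged usage sketch, assuming a pthread_mutex_setprioceiling()-style
 * wrapper in userland (the wrapper is hypothetical; the op and argument
 * mapping follow __umtx_op_set_ceiling() below): the new ceiling travels
 * in "val" and the old ceiling, if wanted, is written through "uaddr1".
 *
 *	uint32_t old_ceiling;
 *
 *	if (_umtx_op(m, UMTX_OP_SET_CEILING, new_ceiling,
 *	    &old_ceiling, NULL) == -1)
 *		return (errno);
 *	return (0);
 */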
2197
2198static int
2199_do_lock_umutex(struct thread *td, struct umutex *m, int flags, int timo,
2200	int mode)
2201{
2202	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2203	case 0:
2204		return (_do_lock_normal(td, m, flags, timo, mode));
2205	case UMUTEX_PRIO_INHERIT:
2206		return (_do_lock_pi(td, m, flags, timo, mode));
2207	case UMUTEX_PRIO_PROTECT:
2208		return (_do_lock_pp(td, m, flags, timo, mode));
2209	}
2210	return (EINVAL);
2211}
2212
2213/*
2214 * Lock a userland POSIX mutex.
2215 */
2216static int
2217do_lock_umutex(struct thread *td, struct umutex *m,
2218	struct timespec *timeout, int mode)
2219{
2220	struct timespec ts, ts2, ts3;
2221	struct timeval tv;
2222	uint32_t flags;
2223	int error;
2224
2225	flags = fuword32(&m->m_flags);
2226	if (flags == -1)
2227		return (EFAULT);
2228
2229	if (timeout == NULL) {
2230		error = _do_lock_umutex(td, m, flags, 0, mode);
2231		/* Mutex locking is restarted if it is interrupted. */
2232		if (error == EINTR && mode != _UMUTEX_WAIT)
2233			error = ERESTART;
2234	} else {
2235		getnanouptime(&ts);
2236		timespecadd(&ts, timeout);
2237		TIMESPEC_TO_TIMEVAL(&tv, timeout);
2238		for (;;) {
2239			error = _do_lock_umutex(td, m, flags, tvtohz(&tv), mode);
2240			if (error != ETIMEDOUT)
2241				break;
2242			getnanouptime(&ts2);
2243			if (timespeccmp(&ts2, &ts, >=)) {
2244				error = ETIMEDOUT;
2245				break;
2246			}
2247			ts3 = ts;
2248			timespecsub(&ts3, &ts2);
2249			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
2250		}
2251		/* Timed-locking is not restarted. */
2252		if (error == ERESTART)
2253			error = EINTR;
2254	}
2255	return (error);
2256}
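
/*
 * The timeout passed in here is relative; do_lock_umutex() converts it to
 * an uptime-based deadline and recomputes the remaining time after every
 * premature wakeup.  A hedged sketch of how a pthread_mutex_timedlock()-
 * style caller might derive the relative value from an absolute deadline
 * ("abstime" and the wrapper itself are assumptions):
 *
 *	struct timespec now, rel;
 *
 *	clock_gettime(CLOCK_REALTIME, &now);
 *	rel.tv_sec = abstime->tv_sec - now.tv_sec;
 *	rel.tv_nsec = abstime->tv_nsec - now.tv_nsec;
 *	if (rel.tv_nsec < 0) {
 *		rel.tv_sec--;
 *		rel.tv_nsec += 1000000000;
 *	}
 *	if (rel.tv_sec < 0)
 *		return (ETIMEDOUT);
 *	if (_umtx_op(m, UMTX_OP_MUTEX_LOCK, 0, NULL, &rel) == -1)
 *		return (errno);
 *	return (0);
 */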
2257
2258/*
2259 * Unlock a userland POSIX mutex.
2260 */
2261static int
2262do_unlock_umutex(struct thread *td, struct umutex *m)
2263{
2264	uint32_t flags;
2265
2266	flags = fuword32(&m->m_flags);
2267	if (flags == -1)
2268		return (EFAULT);
2269
2270	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2271	case 0:
2272		return (do_unlock_normal(td, m, flags));
2273	case UMUTEX_PRIO_INHERIT:
2274		return (do_unlock_pi(td, m, flags));
2275	case UMUTEX_PRIO_PROTECT:
2276		return (do_unlock_pp(td, m, flags));
2277	}
2278
2279	return (EINVAL);
2280}
2281
2282static int
2283do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m,
2284	struct timespec *timeout, u_long wflags)
2285{
2286	struct umtx_q *uq;
2287	struct timeval tv;
2288	struct timespec cts, ets, tts;
2289	uint32_t flags;
2290	int error;
2291
2292	uq = td->td_umtxq;
2293	flags = fuword32(&cv->c_flags);
2294	error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key);
2295	if (error != 0)
2296		return (error);
2297	umtxq_lock(&uq->uq_key);
2298	umtxq_busy(&uq->uq_key);
2299	umtxq_insert(uq);
2300	umtxq_unlock(&uq->uq_key);
2301
2302	/*
2303	 * c_has_waiters must be set to 1 before the user mutex is
2304	 * released, so that a signalling thread knows to enter the kernel.
2305	 */
2306	suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 1);
2307
2308	umtxq_lock(&uq->uq_key);
2309	umtxq_unbusy(&uq->uq_key);
2310	umtxq_unlock(&uq->uq_key);
2311
2312	error = do_unlock_umutex(td, m);
2313
2314	umtxq_lock(&uq->uq_key);
2315	if (error == 0) {
2316		if ((wflags & UMTX_CHECK_UNPARKING) &&
2317		    (td->td_pflags & TDP_WAKEUP)) {
2318			td->td_pflags &= ~TDP_WAKEUP;
2319			error = EINTR;
2320		} else if (timeout == NULL) {
2321			error = umtxq_sleep(uq, "ucond", 0);
2322		} else {
2323			getnanouptime(&ets);
2324			timespecadd(&ets, timeout);
2325			TIMESPEC_TO_TIMEVAL(&tv, timeout);
2326			for (;;) {
2327				error = umtxq_sleep(uq, "ucond", tvtohz(&tv));
2328				if (error != ETIMEDOUT)
2329					break;
2330				getnanouptime(&cts);
2331				if (timespeccmp(&cts, &ets, >=)) {
2332					error = ETIMEDOUT;
2333					break;
2334				}
2335				tts = ets;
2336				timespecsub(&tts, &cts);
2337				TIMESPEC_TO_TIMEVAL(&tv, &tts);
2338			}
2339		}
2340	}
2341
2342	if (error != 0) {
2343		if ((uq->uq_flags & UQF_UMTXQ) == 0) {
2344			/*
2345			 * If we were concurrently woken by do_cv_signal()
2346			 * but also got an error, a UNIX signal, or a timeout,
2347			 * perform another umtxq_signal() so that the wakeup is
2348			 * not consumed and lost.  This may cause a spurious
2349			 * wakeup for another thread which was just queued,
2350			 * but SUSv3 explicitly allows spurious wakeups to
2351			 * occur, and indeed a kernel-based implementation
2352			 * cannot avoid them.
2353			 */
2354			if (!umtxq_signal(&uq->uq_key, 1))
2355				error = 0;
2356		}
2357		if (error == ERESTART)
2358			error = EINTR;
2359	}
2360	umtxq_remove(uq);
2361	umtxq_unlock(&uq->uq_key);
2362	umtx_key_release(&uq->uq_key);
2363	return (error);
2364}
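
/*
 * A hedged sketch of the wait protocol do_cv_wait() implements, seen from
 * the caller's side: the kernel releases the mutex and queues the thread
 * in one step, and the caller must re-acquire the mutex itself after
 * waking.  mutex_lock()/mutex_unlock() are hypothetical stand-ins for the
 * userland mutex operations:
 *
 *	mutex_lock(m);
 *	while (!predicate) {
 *		_umtx_op(cv, UMTX_OP_CV_WAIT, 0, m, NULL);
 *		mutex_lock(m);
 *	}
 *	...
 *	mutex_unlock(m);
 *
 * Re-checking the predicate in a loop also absorbs the spurious wakeups
 * that the comment above notes are unavoidable.
 */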
2365
2366/*
2367 * Signal a userland condition variable.
2368 */
2369static int
2370do_cv_signal(struct thread *td, struct ucond *cv)
2371{
2372	struct umtx_key key;
2373	int error, cnt, nwake;
2374	uint32_t flags;
2375
2376	flags = fuword32(&cv->c_flags);
2377	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2378		return (error);
2379	umtxq_lock(&key);
2380	umtxq_busy(&key);
2381	cnt = umtxq_count(&key);
2382	nwake = umtxq_signal(&key, 1);
2383	if (cnt <= nwake) {
2384		umtxq_unlock(&key);
2385		error = suword32(
2386		    __DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
2387		umtxq_lock(&key);
2388	}
2389	umtxq_unbusy(&key);
2390	umtxq_unlock(&key);
2391	umtx_key_release(&key);
2392	return (error);
2393}
2394
2395static int
2396do_cv_broadcast(struct thread *td, struct ucond *cv)
2397{
2398	struct umtx_key key;
2399	int error;
2400	uint32_t flags;
2401
2402	flags = fuword32(&cv->c_flags);
2403	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2404		return (error);
2405
2406	umtxq_lock(&key);
2407	umtxq_busy(&key);
2408	umtxq_signal(&key, INT_MAX);
2409	umtxq_unlock(&key);
2410
2411	error = suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
2412
2413	umtxq_lock(&key);
2414	umtxq_unbusy(&key);
2415	umtxq_unlock(&key);
2416
2417	umtx_key_release(&key);
2418	return (error);
2419}
2420
2421static int
2422do_rw_rdlock(struct thread *td, struct urwlock *rwlock, long fflag, int timo)
2423{
2424	struct umtx_q *uq;
2425	uint32_t flags, wrflags;
2426	int32_t state, oldstate;
2427	int32_t blocked_readers;
2428	int error;
2429
2430	uq = td->td_umtxq;
2431	flags = fuword32(&rwlock->rw_flags);
2432	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2433	if (error != 0)
2434		return (error);
2435
2436	wrflags = URWLOCK_WRITE_OWNER;
2437	if (!(fflag & URWLOCK_PREFER_READER) && !(flags & URWLOCK_PREFER_READER))
2438		wrflags |= URWLOCK_WRITE_WAITERS;
2439
2440	for (;;) {
2441		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2442		/* try to lock it */
2443		while (!(state & wrflags)) {
2444			if (__predict_false(URWLOCK_READER_COUNT(state) == URWLOCK_MAX_READERS)) {
2445				umtx_key_release(&uq->uq_key);
2446				return (EAGAIN);
2447			}
2448			oldstate = casuword32(&rwlock->rw_state, state, state + 1);
2449			if (oldstate == state) {
2450				umtx_key_release(&uq->uq_key);
2451				return (0);
2452			}
2453			state = oldstate;
2454		}
2455
2456		if (error)
2457			break;
2458
2459		/* grab monitor lock */
2460		umtxq_lock(&uq->uq_key);
2461		umtxq_busy(&uq->uq_key);
2462		umtxq_unlock(&uq->uq_key);
2463
2464		/* set read contention bit */
2465		while ((state & wrflags) && !(state & URWLOCK_READ_WAITERS)) {
2466			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_READ_WAITERS);
2467			if (oldstate == state)
2468				goto sleep;
2469			state = oldstate;
2470		}
2471
2472		/* The state changed while we were setting the flags; restart. */
2473		if (!(state & wrflags)) {
2474			umtxq_lock(&uq->uq_key);
2475			umtxq_unbusy(&uq->uq_key);
2476			umtxq_unlock(&uq->uq_key);
2477			continue;
2478		}
2479
2480sleep:
2481		/* The contention bit is set; increase the read-waiter count before sleeping. */
2482		blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2483		suword32(&rwlock->rw_blocked_readers, blocked_readers+1);
2484
2485		while (state & wrflags) {
2486			umtxq_lock(&uq->uq_key);
2487			umtxq_insert(uq);
2488			umtxq_unbusy(&uq->uq_key);
2489
2490			error = umtxq_sleep(uq, "urdlck", timo);
2491
2492			umtxq_busy(&uq->uq_key);
2493			umtxq_remove(uq);
2494			umtxq_unlock(&uq->uq_key);
2495			if (error)
2496				break;
2497			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2498		}
2499
2500		/* Decrease the read-waiter count; the last waiter clears the contention bit. */
2501		blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2502		suword32(&rwlock->rw_blocked_readers, blocked_readers-1);
2503		if (blocked_readers == 1) {
2504			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2505			for (;;) {
2506				oldstate = casuword32(&rwlock->rw_state, state,
2507					 state & ~URWLOCK_READ_WAITERS);
2508				if (oldstate == state)
2509					break;
2510				state = oldstate;
2511			}
2512		}
2513
2514		umtxq_lock(&uq->uq_key);
2515		umtxq_unbusy(&uq->uq_key);
2516		umtxq_unlock(&uq->uq_key);
2517	}
2518	umtx_key_release(&uq->uq_key);
2519	return (error);
2520}
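
/*
 * A minimal sketch of the userland read-lock fast path this slow path
 * backs up (assumed, libthr-style; not part of this file): rw_state packs
 * the reader count, URWLOCK_READER_COUNT(), in its low bits together with
 * the URWLOCK_WRITE_OWNER/URWLOCK_WRITE_WAITERS/URWLOCK_READ_WAITERS flag
 * bits, so taking a read lock is one CAS that increments the count while
 * no blocking flag is set:
 *
 *	int32_t state;
 *
 *	for (state = rw->rw_state;
 *	    !(state & wrflags) &&
 *	    URWLOCK_READER_COUNT(state) != URWLOCK_MAX_READERS;
 *	    state = rw->rw_state) {
 *		if (atomic_cmpset_acq_32(&rw->rw_state, state, state + 1))
 *			return (0);
 *	}
 *	return (_umtx_op(rw, UMTX_OP_RW_RDLOCK, fflag, NULL, NULL));
 */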
2521
2522static int
2523do_rw_rdlock2(struct thread *td, void *obj, long val, struct timespec *timeout)
2524{
2525	struct timespec ts, ts2, ts3;
2526	struct timeval tv;
2527	int error;
2528
2529	getnanouptime(&ts);
2530	timespecadd(&ts, timeout);
2531	TIMESPEC_TO_TIMEVAL(&tv, timeout);
2532	for (;;) {
2533		error = do_rw_rdlock(td, obj, val, tvtohz(&tv));
2534		if (error != ETIMEDOUT)
2535			break;
2536		getnanouptime(&ts2);
2537		if (timespeccmp(&ts2, &ts, >=)) {
2538			error = ETIMEDOUT;
2539			break;
2540		}
2541		ts3 = ts;
2542		timespecsub(&ts3, &ts2);
2543		TIMESPEC_TO_TIMEVAL(&tv, &ts3);
2544	}
2545	if (error == ERESTART)
2546		error = EINTR;
2547	return (error);
2548}
2549
2550static int
2551do_rw_wrlock(struct thread *td, struct urwlock *rwlock, int timo)
2552{
2553	struct umtx_q *uq;
2554	uint32_t flags;
2555	int32_t state, oldstate;
2556	int32_t blocked_writers;
2557	int error;
2558
2559	uq = td->td_umtxq;
2560	flags = fuword32(&rwlock->rw_flags);
2561	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2562	if (error != 0)
2563		return (error);
2564
2565	for (;;) {
2566		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2567		while (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2568			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_OWNER);
2569			if (oldstate == state) {
2570				umtx_key_release(&uq->uq_key);
2571				return (0);
2572			}
2573			state = oldstate;
2574		}
2575
2576		if (error)
2577			break;
2578
2579		/* grab monitor lock */
2580		umtxq_lock(&uq->uq_key);
2581		umtxq_busy(&uq->uq_key);
2582		umtxq_unlock(&uq->uq_key);
2583
2584		while (((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) &&
2585		       (state & URWLOCK_WRITE_WAITERS) == 0) {
2586			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_WAITERS);
2587			if (oldstate == state)
2588				goto sleep;
2589			state = oldstate;
2590		}
2591
2592		if (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2593			umtxq_lock(&uq->uq_key);
2594			umtxq_unbusy(&uq->uq_key);
2595			umtxq_unlock(&uq->uq_key);
2596			continue;
2597		}
2598sleep:
2599		blocked_writers = fuword32(&rwlock->rw_blocked_writers);
2600		suword32(&rwlock->rw_blocked_writers, blocked_writers+1);
2601
2602		while ((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) {
2603			umtxq_lock(&uq->uq_key);
2604			umtxq_insert_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2605			umtxq_unbusy(&uq->uq_key);
2606
2607			error = umtxq_sleep(uq, "uwrlck", timo);
2608
2609			umtxq_busy(&uq->uq_key);
2610			umtxq_remove_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2611			umtxq_unlock(&uq->uq_key);
2612			if (error)
2613				break;
2614			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2615		}
2616
2617		blocked_writers = fuword32(&rwlock->rw_blocked_writers);
2618		suword32(&rwlock->rw_blocked_writers, blocked_writers-1);
2619		if (blocked_writers == 1) {
2620			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2621			for (;;) {
2622				oldstate = casuword32(&rwlock->rw_state, state,
2623					 state & ~URWLOCK_WRITE_WAITERS);
2624				if (oldstate == state)
2625					break;
2626				state = oldstate;
2627			}
2628		}
2629
2630		umtxq_lock(&uq->uq_key);
2631		umtxq_unbusy(&uq->uq_key);
2632		umtxq_unlock(&uq->uq_key);
2633	}
2634
2635	umtx_key_release(&uq->uq_key);
2636	return (error);
2637}
2638
2639static int
2640do_rw_wrlock2(struct thread *td, void *obj, struct timespec *timeout)
2641{
2642	struct timespec ts, ts2, ts3;
2643	struct timeval tv;
2644	int error;
2645
2646	getnanouptime(&ts);
2647	timespecadd(&ts, timeout);
2648	TIMESPEC_TO_TIMEVAL(&tv, timeout);
2649	for (;;) {
2650		error = do_rw_wrlock(td, obj, tvtohz(&tv));
2651		if (error != ETIMEDOUT)
2652			break;
2653		getnanouptime(&ts2);
2654		if (timespeccmp(&ts2, &ts, >=)) {
2655			error = ETIMEDOUT;
2656			break;
2657		}
2658		ts3 = ts;
2659		timespecsub(&ts3, &ts2);
2660		TIMESPEC_TO_TIMEVAL(&tv, &ts3);
2661	}
2662	if (error == ERESTART)
2663		error = EINTR;
2664	return (error);
2665}
2666
2667static int
2668do_rw_unlock(struct thread *td, struct urwlock *rwlock)
2669{
2670	struct umtx_q *uq;
2671	uint32_t flags;
2672	int32_t state, oldstate;
2673	int error, q, count;
2674
2675	uq = td->td_umtxq;
2676	flags = fuword32(&rwlock->rw_flags);
2677	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2678	if (error != 0)
2679		return (error);
2680
2681	state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2682	if (state & URWLOCK_WRITE_OWNER) {
2683		for (;;) {
2684			oldstate = casuword32(&rwlock->rw_state, state,
2685				state & ~URWLOCK_WRITE_OWNER);
2686			if (oldstate != state) {
2687				state = oldstate;
2688				if (!(oldstate & URWLOCK_WRITE_OWNER)) {
2689					error = EPERM;
2690					goto out;
2691				}
2692			} else
2693				break;
2694		}
2695	} else if (URWLOCK_READER_COUNT(state) != 0) {
2696		for (;;) {
2697			oldstate = casuword32(&rwlock->rw_state, state,
2698				state - 1);
2699			if (oldstate != state) {
2700				state = oldstate;
2701				if (URWLOCK_READER_COUNT(oldstate) == 0) {
2702					error = EPERM;
2703					goto out;
2704				}
2705			}
2706			else
2707				break;
2708		}
2709	} else {
2710		error = EPERM;
2711		goto out;
2712	}
2713
2714	count = 0;
2715
2716	if (!(flags & URWLOCK_PREFER_READER)) {
2717		if (state & URWLOCK_WRITE_WAITERS) {
2718			count = 1;
2719			q = UMTX_EXCLUSIVE_QUEUE;
2720		} else if (state & URWLOCK_READ_WAITERS) {
2721			count = INT_MAX;
2722			q = UMTX_SHARED_QUEUE;
2723		}
2724	} else {
2725		if (state & URWLOCK_READ_WAITERS) {
2726			count = INT_MAX;
2727			q = UMTX_SHARED_QUEUE;
2728		} else if (state & URWLOCK_WRITE_WAITERS) {
2729			count = 1;
2730			q = UMTX_EXCLUSIVE_QUEUE;
2731		}
2732	}
2733
2734	if (count) {
2735		umtxq_lock(&uq->uq_key);
2736		umtxq_busy(&uq->uq_key);
2737		umtxq_signal_queue(&uq->uq_key, count, q);
2738		umtxq_unbusy(&uq->uq_key);
2739		umtxq_unlock(&uq->uq_key);
2740	}
2741out:
2742	umtx_key_release(&uq->uq_key);
2743	return (error);
2744}
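
/*
 * The matching userland unlock fast path would enter the kernel only when
 * a waiter bit is set, i.e. exactly when the queue selection above has
 * work to do.  A hedged write-unlock sketch (assumed, not this file's
 * code):
 *
 *	if (atomic_cmpset_rel_32(&rw->rw_state, URWLOCK_WRITE_OWNER, 0))
 *		return (0);
 *	return (_umtx_op(rw, UMTX_OP_RW_UNLOCK, 0, NULL, NULL));
 *
 * If URWLOCK_WRITE_WAITERS or URWLOCK_READ_WAITERS is set the CAS fails,
 * and do_rw_unlock() both clears URWLOCK_WRITE_OWNER and wakes the
 * preferred queue.
 */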
2745
2746int
2747_umtx_lock(struct thread *td, struct _umtx_lock_args *uap)
2748    /* struct umtx *umtx */
2749{
2750	return _do_lock_umtx(td, uap->umtx, td->td_tid, 0);
2751}
2752
2753int
2754_umtx_unlock(struct thread *td, struct _umtx_unlock_args *uap)
2755    /* struct umtx *umtx */
2756{
2757	return do_unlock_umtx(td, uap->umtx, td->td_tid);
2758}
2759
2760static int
2761__umtx_op_lock_umtx(struct thread *td, struct _umtx_op_args *uap)
2762{
2763	struct timespec *ts, timeout;
2764	int error;
2765
2766	/* Allow a null timespec (wait forever). */
2767	if (uap->uaddr2 == NULL)
2768		ts = NULL;
2769	else {
2770		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2771		if (error != 0)
2772			return (error);
2773		if (timeout.tv_nsec >= 1000000000 ||
2774		    timeout.tv_nsec < 0) {
2775			return (EINVAL);
2776		}
2777		ts = &timeout;
2778	}
2779	return (do_lock_umtx(td, uap->obj, uap->val, ts));
2780}
2781
2782static int
2783__umtx_op_unlock_umtx(struct thread *td, struct _umtx_op_args *uap)
2784{
2785	return (do_unlock_umtx(td, uap->obj, uap->val));
2786}
2787
2788static int
2789__umtx_op_wait(struct thread *td, struct _umtx_op_args *uap)
2790{
2791	struct timespec *ts, timeout;
2792	int error;
2793
2794	if (uap->uaddr2 == NULL)
2795		ts = NULL;
2796	else {
2797		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2798		if (error != 0)
2799			return (error);
2800		if (timeout.tv_nsec >= 1000000000 ||
2801		    timeout.tv_nsec < 0)
2802			return (EINVAL);
2803		ts = &timeout;
2804	}
2805	return do_wait(td, uap->obj, uap->val, ts, 0, 0);
2806}
2807
2808static int
2809__umtx_op_wait_uint(struct thread *td, struct _umtx_op_args *uap)
2810{
2811	struct timespec *ts, timeout;
2812	int error;
2813
2814	if (uap->uaddr2 == NULL)
2815		ts = NULL;
2816	else {
2817		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2818		if (error != 0)
2819			return (error);
2820		if (timeout.tv_nsec >= 1000000000 ||
2821		    timeout.tv_nsec < 0)
2822			return (EINVAL);
2823		ts = &timeout;
2824	}
2825	return do_wait(td, uap->obj, uap->val, ts, 1, 0);
2826}
2827
2828static int
2829__umtx_op_wait_uint_private(struct thread *td, struct _umtx_op_args *uap)
2830{
2831	struct timespec *ts, timeout;
2832	int error;
2833
2834	if (uap->uaddr2 == NULL)
2835		ts = NULL;
2836	else {
2837		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2838		if (error != 0)
2839			return (error);
2840		if (timeout.tv_nsec >= 1000000000 ||
2841		    timeout.tv_nsec < 0)
2842			return (EINVAL);
2843		ts = &timeout;
2844	}
2845	return do_wait(td, uap->obj, uap->val, ts, 1, 1);
2846}
2847
2848static int
2849__umtx_op_wake(struct thread *td, struct _umtx_op_args *uap)
2850{
2851	return (kern_umtx_wake(td, uap->obj, uap->val, 0));
2852}
2853
2854static int
2855__umtx_op_wake_private(struct thread *td, struct _umtx_op_args *uap)
2856{
2857	return (kern_umtx_wake(td, uap->obj, uap->val, 1));
2858}
2859
2860static int
2861__umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap)
2862{
2863	struct timespec *ts, timeout;
2864	int error;
2865
2866	/* Allow a null timespec (wait forever). */
2867	if (uap->uaddr2 == NULL)
2868		ts = NULL;
2869	else {
2870		error = copyin(uap->uaddr2, &timeout,
2871		    sizeof(timeout));
2872		if (error != 0)
2873			return (error);
2874		if (timeout.tv_nsec >= 1000000000 ||
2875		    timeout.tv_nsec < 0) {
2876			return (EINVAL);
2877		}
2878		ts = &timeout;
2879	}
2880	return do_lock_umutex(td, uap->obj, ts, 0);
2881}
2882
2883static int
2884__umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap)
2885{
2886	return do_lock_umutex(td, uap->obj, NULL, _UMUTEX_TRY);
2887}
2888
2889static int
2890__umtx_op_wait_umutex(struct thread *td, struct _umtx_op_args *uap)
2891{
2892	struct timespec *ts, timeout;
2893	int error;
2894
2895	/* Allow a null timespec (wait forever). */
2896	if (uap->uaddr2 == NULL)
2897		ts = NULL;
2898	else {
2899		error = copyin(uap->uaddr2, &timeout,
2900		    sizeof(timeout));
2901		if (error != 0)
2902			return (error);
2903		if (timeout.tv_nsec >= 1000000000 ||
2904		    timeout.tv_nsec < 0) {
2905			return (EINVAL);
2906		}
2907		ts = &timeout;
2908	}
2909	return do_lock_umutex(td, uap->obj, ts, _UMUTEX_WAIT);
2910}
2911
2912static int
2913__umtx_op_wake_umutex(struct thread *td, struct _umtx_op_args *uap)
2914{
2915	return do_wake_umutex(td, uap->obj);
2916}
2917
2918static int
2919__umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap)
2920{
2921	return do_unlock_umutex(td, uap->obj);
2922}
2923
2924static int
2925__umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap)
2926{
2927	return do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1);
2928}
2929
2930static int
2931__umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap)
2932{
2933	struct timespec *ts, timeout;
2934	int error;
2935
2936	/* Allow a null timespec (wait forever). */
2937	if (uap->uaddr2 == NULL)
2938		ts = NULL;
2939	else {
2940		error = copyin(uap->uaddr2, &timeout,
2941		    sizeof(timeout));
2942		if (error != 0)
2943			return (error);
2944		if (timeout.tv_nsec >= 1000000000 ||
2945		    timeout.tv_nsec < 0) {
2946			return (EINVAL);
2947		}
2948		ts = &timeout;
2949	}
2950	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
2951}
2952
2953static int
2954__umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap)
2955{
2956	return do_cv_signal(td, uap->obj);
2957}
2958
2959static int
2960__umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap)
2961{
2962	return do_cv_broadcast(td, uap->obj);
2963}
2964
2965static int
2966__umtx_op_rw_rdlock(struct thread *td, struct _umtx_op_args *uap)
2967{
2968	struct timespec timeout;
2969	int error;
2970
2971	/* Allow a null timespec (wait forever). */
2972	if (uap->uaddr2 == NULL) {
2973		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
2974	} else {
2975		error = copyin(uap->uaddr2, &timeout,
2976		    sizeof(timeout));
2977		if (error != 0)
2978			return (error);
2979		if (timeout.tv_nsec >= 1000000000 ||
2980		    timeout.tv_nsec < 0) {
2981			return (EINVAL);
2982		}
2983		error = do_rw_rdlock2(td, uap->obj, uap->val, &timeout);
2984	}
2985	return (error);
2986}
2987
2988static int
2989__umtx_op_rw_wrlock(struct thread *td, struct _umtx_op_args *uap)
2990{
2991	struct timespec timeout;
2992	int error;
2993
2994	/* Allow a null timespec (wait forever). */
2995	if (uap->uaddr2 == NULL) {
2996		error = do_rw_wrlock(td, uap->obj, 0);
2997	} else {
2998		error = copyin(uap->uaddr2, &timeout,
2999		    sizeof(timeout));
3000		if (error != 0)
3001			return (error);
3002		if (timeout.tv_nsec >= 1000000000 ||
3003		    timeout.tv_nsec < 0) {
3004			return (EINVAL);
3005		}
3006
3007		error = do_rw_wrlock2(td, uap->obj, &timeout);
3008	}
3009	return (error);
3010}
3011
3012static int
3013__umtx_op_rw_unlock(struct thread *td, struct _umtx_op_args *uap)
3014{
3015	return do_rw_unlock(td, uap->obj);
3016}
3017
3018typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap);
3019
3020static _umtx_op_func op_table[] = {
3021	__umtx_op_lock_umtx,		/* UMTX_OP_LOCK */
3022	__umtx_op_unlock_umtx,		/* UMTX_OP_UNLOCK */
3023	__umtx_op_wait,			/* UMTX_OP_WAIT */
3024	__umtx_op_wake,			/* UMTX_OP_WAKE */
3025	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
3026	__umtx_op_lock_umutex,		/* UMTX_OP_MUTEX_LOCK */
3027	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
3028	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
3029	__umtx_op_cv_wait,		/* UMTX_OP_CV_WAIT */
3030	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
3031	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
3032	__umtx_op_wait_uint,		/* UMTX_OP_WAIT_UINT */
3033	__umtx_op_rw_rdlock,		/* UMTX_OP_RW_RDLOCK */
3034	__umtx_op_rw_wrlock,		/* UMTX_OP_RW_WRLOCK */
3035	__umtx_op_rw_unlock,		/* UMTX_OP_RW_UNLOCK */
3036	__umtx_op_wait_uint_private,	/* UMTX_OP_WAIT_UINT_PRIVATE */
3037	__umtx_op_wake_private,		/* UMTX_OP_WAKE_PRIVATE */
3038	__umtx_op_wait_umutex,		/* UMTX_OP_UMUTEX_WAIT */
3039	__umtx_op_wake_umutex		/* UMTX_OP_UMUTEX_WAKE */
3040};
3041
3042int
3043_umtx_op(struct thread *td, struct _umtx_op_args *uap)
3044{
3045	if ((unsigned)uap->op < UMTX_OP_MAX)
3046		return (*op_table[uap->op])(td, uap);
3047	return (EINVAL);
3048}
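
/*
 * A hedged usage sketch: with the table above, _umtx_op(2) behaves as a
 * futex-like wait/wake pair on a plain integer.  The waiter sleeps only
 * while *addr still holds the expected value, so a wakeup between the
 * load and the syscall is not lost; BUSY and IDLE are illustrative
 * values, not kernel constants:
 *
 *	while (atomic_load_acq_int(addr) == BUSY)
 *		_umtx_op(addr, UMTX_OP_WAIT_UINT, BUSY, NULL, NULL);
 *
 *	atomic_store_rel_int(addr, IDLE);
 *	_umtx_op(addr, UMTX_OP_WAKE, 1, NULL, NULL);
 */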
3049
3050#ifdef COMPAT_IA32
3051int
3052freebsd32_umtx_lock(struct thread *td, struct freebsd32_umtx_lock_args *uap)
3053    /* struct umtx *umtx */
3054{
3055	return (do_lock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid, NULL));
3056}
3057
3058int
3059freebsd32_umtx_unlock(struct thread *td, struct freebsd32_umtx_unlock_args *uap)
3060    /* struct umtx *umtx */
3061{
3062	return (do_unlock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid));
3063}
3064
3065struct timespec32 {
3066	u_int32_t tv_sec;
3067	u_int32_t tv_nsec;
3068};
3069
3070static inline int
3071copyin_timeout32(void *addr, struct timespec *tsp)
3072{
3073	struct timespec32 ts32;
3074	int error;
3075
3076	error = copyin(addr, &ts32, sizeof(struct timespec32));
3077	if (error == 0) {
3078		tsp->tv_sec = ts32.tv_sec;
3079		tsp->tv_nsec = ts32.tv_nsec;
3080	}
3081	return (error);
3082}
3083
3084static int
3085__umtx_op_lock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
3086{
3087	struct timespec *ts, timeout;
3088	int error;
3089
3090	/* Allow a null timespec (wait forever). */
3091	if (uap->uaddr2 == NULL)
3092		ts = NULL;
3093	else {
3094		error = copyin_timeout32(uap->uaddr2, &timeout);
3095		if (error != 0)
3096			return (error);
3097		if (timeout.tv_nsec >= 1000000000 ||
3098		    timeout.tv_nsec < 0) {
3099			return (EINVAL);
3100		}
3101		ts = &timeout;
3102	}
3103	return (do_lock_umtx32(td, uap->obj, uap->val, ts));
3104}
3105
3106static int
3107__umtx_op_unlock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
3108{
3109	return (do_unlock_umtx32(td, uap->obj, (uint32_t)uap->val));
3110}
3111
3112static int
3113__umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3114{
3115	struct timespec *ts, timeout;
3116	int error;
3117
3118	if (uap->uaddr2 == NULL)
3119		ts = NULL;
3120	else {
3121		error = copyin_timeout32(uap->uaddr2, &timeout);
3122		if (error != 0)
3123			return (error);
3124		if (timeout.tv_nsec >= 1000000000 ||
3125		    timeout.tv_nsec < 0)
3126			return (EINVAL);
3127		ts = &timeout;
3128	}
3129	return do_wait(td, uap->obj, uap->val, ts, 1, 0);
3130}
3131
3132static int
3133__umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3134{
3135	struct timespec *ts, timeout;
3136	int error;
3137
3138	/* Allow a null timespec (wait forever). */
3139	if (uap->uaddr2 == NULL)
3140		ts = NULL;
3141	else {
3142		error = copyin_timeout32(uap->uaddr2, &timeout);
3143		if (error != 0)
3144			return (error);
3145		if (timeout.tv_nsec >= 1000000000 ||
3146		    timeout.tv_nsec < 0)
3147			return (EINVAL);
3148		ts = &timeout;
3149	}
3150	return do_lock_umutex(td, uap->obj, ts, 0);
3151}
3152
3153static int
3154__umtx_op_wait_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3155{
3156	struct timespec *ts, timeout;
3157	int error;
3158
3159	/* Allow a null timespec (wait forever). */
3160	if (uap->uaddr2 == NULL)
3161		ts = NULL;
3162	else {
3163		error = copyin_timeout32(uap->uaddr2, &timeout);
3164		if (error != 0)
3165			return (error);
3166		if (timeout.tv_nsec >= 1000000000 ||
3167		    timeout.tv_nsec < 0)
3168			return (EINVAL);
3169		ts = &timeout;
3170	}
3171	return do_lock_umutex(td, uap->obj, ts, _UMUTEX_WAIT);
3172}
3173
3174static int
3175__umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3176{
3177	struct timespec *ts, timeout;
3178	int error;
3179
3180	/* Allow a null timespec (wait forever). */
3181	if (uap->uaddr2 == NULL)
3182		ts = NULL;
3183	else {
3184		error = copyin_timeout32(uap->uaddr2, &timeout);
3185		if (error != 0)
3186			return (error);
3187		if (timeout.tv_nsec >= 1000000000 ||
3188		    timeout.tv_nsec < 0)
3189			return (EINVAL);
3190		ts = &timeout;
3191	}
3192	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3193}
3194
3195static int
3196__umtx_op_rw_rdlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3197{
3198	struct timespec timeout;
3199	int error;
3200
3201	/* Allow a null timespec (wait forever). */
3202	if (uap->uaddr2 == NULL) {
3203		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3204	} else {
3205		error = copyin_timeout32(uap->uaddr2,
3206		    &timeout);
3207		if (error != 0)
3208			return (error);
3209		if (timeout.tv_nsec >= 1000000000 ||
3210		    timeout.tv_nsec < 0) {
3211			return (EINVAL);
3212		}
3213		error = do_rw_rdlock2(td, uap->obj, uap->val, &timeout);
3214	}
3215	return (error);
3216}
3217
3218static int
3219__umtx_op_rw_wrlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3220{
3221	struct timespec timeout;
3222	int error;
3223
3224	/* Allow a null timespec (wait forever). */
3225	if (uap->uaddr2 == NULL) {
3226		error = do_rw_wrlock(td, uap->obj, 0);
3227	} else {
3228		error = copyin_timeout32(uap->uaddr2, &timeout);
3229		if (error != 0)
3230			return (error);
3231		if (timeout.tv_nsec >= 1000000000 ||
3232		    timeout.tv_nsec < 0) {
3233			return (EINVAL);
3234		}
3235
3236		error = do_rw_wrlock2(td, uap->obj, &timeout);
3237	}
3238	return (error);
3239}
3240
3241static int
3242__umtx_op_wait_uint_private_compat32(struct thread *td, struct _umtx_op_args *uap)
3243{
3244	struct timespec *ts, timeout;
3245	int error;
3246
3247	if (uap->uaddr2 == NULL)
3248		ts = NULL;
3249	else {
3250		error = copyin_timeout32(uap->uaddr2, &timeout);
3251		if (error != 0)
3252			return (error);
3253		if (timeout.tv_nsec >= 1000000000 ||
3254		    timeout.tv_nsec < 0)
3255			return (EINVAL);
3256		ts = &timeout;
3257	}
3258	return do_wait(td, uap->obj, uap->val, ts, 1, 1);
3259}
3260
3261static _umtx_op_func op_table_compat32[] = {
3262	__umtx_op_lock_umtx_compat32,	/* UMTX_OP_LOCK */
3263	__umtx_op_unlock_umtx_compat32,	/* UMTX_OP_UNLOCK */
3264	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT */
3265	__umtx_op_wake,			/* UMTX_OP_WAKE */
3266	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
3267	__umtx_op_lock_umutex_compat32,	/* UMTX_OP_MUTEX_LOCK */
3268	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
3269	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
3270	__umtx_op_cv_wait_compat32,	/* UMTX_OP_CV_WAIT */
3271	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
3272	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
3273	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT_UINT */
3274	__umtx_op_rw_rdlock_compat32,	/* UMTX_OP_RW_RDLOCK */
3275	__umtx_op_rw_wrlock_compat32,	/* UMTX_OP_RW_WRLOCK */
3276	__umtx_op_rw_unlock,		/* UMTX_OP_RW_UNLOCK */
3277	__umtx_op_wait_uint_private_compat32,	/* UMTX_OP_WAIT_UINT_PRIVATE */
3278	__umtx_op_wake_private,		/* UMTX_OP_WAKE_PRIVATE */
3279	__umtx_op_wait_umutex_compat32, /* UMTX_OP_UMUTEX_WAIT */
3280	__umtx_op_wake_umutex		/* UMTX_OP_UMUTEX_WAKE */
3281};
3282
3283int
3284freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap)
3285{
3286	if ((unsigned)uap->op < UMTX_OP_MAX)
3287		return (*op_table_compat32[uap->op])(td,
3288			(struct _umtx_op_args *)uap);
3289	return (EINVAL);
3290}
3291#endif
3292
3293void
3294umtx_thread_init(struct thread *td)
3295{
3296	td->td_umtxq = umtxq_alloc();
3297	td->td_umtxq->uq_thread = td;
3298}
3299
3300void
3301umtx_thread_fini(struct thread *td)
3302{
3303	umtxq_free(td->td_umtxq);
3304}
3305
3306/*
3307 * Called when a new thread is created, e.g. during fork().
3308 */
3309void
3310umtx_thread_alloc(struct thread *td)
3311{
3312	struct umtx_q *uq;
3313
3314	uq = td->td_umtxq;
3315	uq->uq_inherited_pri = PRI_MAX;
3316
3317	KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
3318	KASSERT(uq->uq_thread == td, ("uq_thread != td"));
3319	KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
3320	KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
3321}
3322
3323/*
3324 * exec() hook.
3325 */
3326static void
3327umtx_exec_hook(void *arg __unused, struct proc *p __unused,
3328	struct image_params *imgp __unused)
3329{
3330	umtx_thread_cleanup(curthread);
3331}
3332
3333/*
3334 * thread_exit() hook.
3335 */
3336void
3337umtx_thread_exit(struct thread *td)
3338{
3339	umtx_thread_cleanup(td);
3340}
3341
3342/*
3343 * Clean up the thread's umtx data.
3344 */
3345static void
3346umtx_thread_cleanup(struct thread *td)
3347{
3348	struct umtx_q *uq;
3349	struct umtx_pi *pi;
3350
3351	if ((uq = td->td_umtxq) == NULL)
3352		return;
3353
3354	mtx_lock_spin(&umtx_lock);
3355	uq->uq_inherited_pri = PRI_MAX;
3356	while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
3357		pi->pi_owner = NULL;
3358		TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
3359	}
3360	thread_lock(td);
3361	td->td_flags &= ~TDF_UBORROWING;
3362	thread_unlock(td);
3363	mtx_unlock_spin(&umtx_lock);
3364}
3365