kern_umtx.c revision 216313
/*-
 * Copyright (c) 2004, David Xu <davidxu@freebsd.org>
 * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/kern_umtx.c 216313 2010-12-09 02:42:02Z davidxu $");

#include "opt_compat.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/eventhandler.h>
#include <sys/umtx.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>

#include <machine/cpu.h>

#ifdef COMPAT_FREEBSD32
#include <compat/freebsd32/freebsd32_proto.h>
#endif

enum {
	TYPE_SIMPLE_WAIT,
	TYPE_CV,
	TYPE_SEM,
	TYPE_SIMPLE_LOCK,
	TYPE_NORMAL_UMUTEX,
	TYPE_PI_UMUTEX,
	TYPE_PP_UMUTEX,
	TYPE_RWLOCK
};

#define _UMUTEX_TRY		1
#define _UMUTEX_WAIT		2

/* Key to represent a unique userland synchronization object */
struct umtx_key {
	int	hash;
	int	type;
	int	shared;
	union {
		struct {
			vm_object_t	object;
			uintptr_t	offset;
		} shared;
		struct {
			struct vmspace	*vs;
			uintptr_t	addr;
		} private;
		struct {
			void		*a;
			uintptr_t	b;
		} both;
	} info;
};

/* Priority inheritance mutex info. */
struct umtx_pi {
	/* Owner thread */
	struct thread		*pi_owner;

	/* Reference count */
	int			pi_refcount;

	/* List entry to link umtxes held by a thread */
	TAILQ_ENTRY(umtx_pi)	pi_link;

	/* List entry in hash */
	TAILQ_ENTRY(umtx_pi)	pi_hashlink;

	/* List for waiters */
	TAILQ_HEAD(,umtx_q)	pi_blocked;

	/* Identify a userland lock object */
	struct umtx_key		pi_key;
};

116
117/* A userland synchronous object user. */
118struct umtx_q {
119	/* Linked list for the hash. */
120	TAILQ_ENTRY(umtx_q)	uq_link;
121
122	/* Umtx key. */
123	struct umtx_key		uq_key;
124
125	/* Umtx flags. */
126	int			uq_flags;
127#define UQF_UMTXQ	0x0001
128
129	/* The thread waits on. */
130	struct thread		*uq_thread;
131
132	/*
133	 * Blocked on PI mutex. read can use chain lock
134	 * or umtx_lock, write must have both chain lock and
135	 * umtx_lock being hold.
136	 */
137	struct umtx_pi		*uq_pi_blocked;
138
139	/* On blocked list */
140	TAILQ_ENTRY(umtx_q)	uq_lockq;
141
142	/* Thread contending with us */
143	TAILQ_HEAD(,umtx_pi)	uq_pi_contested;
144
145	/* Inherited priority from PP mutex */
146	u_char			uq_inherited_pri;
147
148	/* Spare queue ready to be reused */
149	struct umtxq_queue	*uq_spare_queue;
150
151	/* The queue we on */
152	struct umtxq_queue	*uq_cur_queue;
153};
154
155TAILQ_HEAD(umtxq_head, umtx_q);
156
157/* Per-key wait-queue */
158struct umtxq_queue {
159	struct umtxq_head	head;
160	struct umtx_key		key;
161	LIST_ENTRY(umtxq_queue)	link;
162	int			length;
163};
164
165LIST_HEAD(umtxq_list, umtxq_queue);
166
167/* Userland lock object's wait-queue chain */
168struct umtxq_chain {
169	/* Lock for this chain. */
170	struct mtx		uc_lock;
171
172	/* List of sleep queues. */
173	struct umtxq_list	uc_queue[2];
174#define UMTX_SHARED_QUEUE	0
175#define UMTX_EXCLUSIVE_QUEUE	1
176
177	LIST_HEAD(, umtxq_queue) uc_spare_queue;
178
179	/* Busy flag */
180	char			uc_busy;
181
182	/* Chain lock waiters */
183	int			uc_waiters;
184
185	/* All PI in the list */
186	TAILQ_HEAD(,umtx_pi)	uc_pi_list;
187
188};
189
#define	UMTXQ_LOCKED_ASSERT(uc)		mtx_assert(&(uc)->uc_lock, MA_OWNED)
#define	UMTXQ_BUSY_ASSERT(uc)	KASSERT((uc)->uc_busy != 0, ("umtx chain is not busy"))

/*
 * Don't propagate time-sharing priority.  There is a security
 * concern: a user could create a PI mutex, let thread A lock it,
 * and have another thread B block on it.  Because B is sleeping,
 * its priority would be boosted, which in turn would boost A's
 * priority via propagation, and A's priority would never be
 * lowered even if it is using 100% CPU; that would be unfair to
 * other processes.
 */

#define UPRI(td)	(((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\
			  (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\
			 PRI_MAX_TIMESHARE : (td)->td_user_pri)

#define	GOLDEN_RATIO_PRIME	2654404609U
#define	UMTX_CHAINS		128
#define	UMTX_SHIFTS		(__WORD_BIT - 7)

#define THREAD_SHARE		0
#define PROCESS_SHARE		1
#define AUTO_SHARE		2

#define	GET_SHARE(flags)	\
    (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)

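/*
 * Number of times to spin on a busy chain before sleeping on it
 * (only attempted on SMP systems).
 */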
#define BUSY_SPINS		200

static uma_zone_t		umtx_pi_zone;
static struct umtxq_chain	umtxq_chains[2][UMTX_CHAINS];
static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
static int			umtx_pi_allocated;

SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
    &umtx_pi_allocated, 0, "Allocated umtx_pi");

static void umtxq_sysinit(void *);
static void umtxq_hash(struct umtx_key *key);
static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
static void umtxq_lock(struct umtx_key *key);
static void umtxq_unlock(struct umtx_key *key);
static void umtxq_busy(struct umtx_key *key);
static void umtxq_unbusy(struct umtx_key *key);
static void umtxq_insert_queue(struct umtx_q *uq, int q);
static void umtxq_remove_queue(struct umtx_q *uq, int q);
static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo);
static int umtxq_count(struct umtx_key *key);
static int umtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2);
static int umtx_key_get(void *addr, int type, int share,
	struct umtx_key *key);
static void umtx_key_release(struct umtx_key *key);
static struct umtx_pi *umtx_pi_alloc(int);
static void umtx_pi_free(struct umtx_pi *pi);
static void umtx_pi_adjust_locked(struct thread *td, u_char oldpri);
static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags);
static void umtx_thread_cleanup(struct thread *td);
static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
	struct image_params *imgp __unused);
SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);

#define umtxq_signal(key, nwake)	umtxq_signal_queue((key), (nwake), UMTX_SHARED_QUEUE)
#define umtxq_insert(uq)	umtxq_insert_queue((uq), UMTX_SHARED_QUEUE)
#define umtxq_remove(uq)	umtxq_remove_queue((uq), UMTX_SHARED_QUEUE)

static struct mtx umtx_lock;

static void
umtxq_sysinit(void *arg __unused)
{
	int i, j;

	umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	for (i = 0; i < 2; ++i) {
		for (j = 0; j < UMTX_CHAINS; ++j) {
			mtx_init(&umtxq_chains[i][j].uc_lock, "umtxql", NULL,
				 MTX_DEF | MTX_DUPOK);
			LIST_INIT(&umtxq_chains[i][j].uc_queue[0]);
			LIST_INIT(&umtxq_chains[i][j].uc_queue[1]);
			LIST_INIT(&umtxq_chains[i][j].uc_spare_queue);
			TAILQ_INIT(&umtxq_chains[i][j].uc_pi_list);
			umtxq_chains[i][j].uc_busy = 0;
			umtxq_chains[i][j].uc_waiters = 0;
		}
	}
	mtx_init(&umtx_lock, "umtx lock", NULL, MTX_SPIN);
	EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
	    EVENTHANDLER_PRI_ANY);
}

struct umtx_q *
umtxq_alloc(void)
{
	struct umtx_q *uq;

	uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
	uq->uq_spare_queue = malloc(sizeof(struct umtxq_queue), M_UMTX, M_WAITOK | M_ZERO);
	TAILQ_INIT(&uq->uq_spare_queue->head);
	TAILQ_INIT(&uq->uq_pi_contested);
	uq->uq_inherited_pri = PRI_MAX;
	return (uq);
}

void
umtxq_free(struct umtx_q *uq)
{
	MPASS(uq->uq_spare_queue != NULL);
	free(uq->uq_spare_queue, M_UMTX);
	free(uq, M_UMTX);
}

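/*
 * Hash a key to a chain index with multiplicative (golden ratio)
 * hashing over the two words of the key.
 */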
static inline void
umtxq_hash(struct umtx_key *key)
{
	unsigned n = (uintptr_t)key->info.both.a + key->info.both.b;
	key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
}

static inline int
umtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2)
{
	return (k1->type == k2->type &&
		k1->info.both.a == k2->info.both.a &&
		k1->info.both.b == k2->info.both.b);
}

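/*
 * Wait, condvar and semaphore objects (type <= TYPE_SEM) hash into
 * the second chain array; lock objects use the first.
 */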
static inline struct umtxq_chain *
umtxq_getchain(struct umtx_key *key)
{
	if (key->type <= TYPE_SEM)
		return (&umtxq_chains[1][key->hash]);
	return (&umtxq_chains[0][key->hash]);
}

/*
 * Lock a chain.
 */
static inline void
umtxq_lock(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_lock(&uc->uc_lock);
}

/*
 * Unlock a chain.
 */
static inline void
umtxq_unlock(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_unlock(&uc->uc_lock);
}

/*
 * Mark a chain busy before an operation that may block;
 * the chain's kernel mutex cannot be held across such an operation.
 */
static inline void
umtxq_busy(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_assert(&uc->uc_lock, MA_OWNED);
	if (uc->uc_busy) {
#ifdef SMP
		if (smp_cpus > 1) {
			int count = BUSY_SPINS;
			if (count > 0) {
				umtxq_unlock(key);
				while (uc->uc_busy && --count > 0)
					cpu_spinwait();
				umtxq_lock(key);
			}
		}
#endif
		while (uc->uc_busy) {
			uc->uc_waiters++;
			msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
			uc->uc_waiters--;
		}
	}
	uc->uc_busy = 1;
}
381
382/*
383 * Unbusy a chain.
384 */
385static inline void
386umtxq_unbusy(struct umtx_key *key)
387{
388	struct umtxq_chain *uc;
389
390	uc = umtxq_getchain(key);
391	mtx_assert(&uc->uc_lock, MA_OWNED);
392	KASSERT(uc->uc_busy != 0, ("not busy"));
393	uc->uc_busy = 0;
394	if (uc->uc_waiters)
395		wakeup_one(uc);
396}
397
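/*
 * Look up the per-key wait queue for a key on its chain, if any.
 */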
static struct umtxq_queue *
umtxq_queue_lookup(struct umtx_key *key, int q)
{
	struct umtxq_queue *uh;
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	LIST_FOREACH(uh, &uc->uc_queue[q], link) {
		if (umtx_key_match(&uh->key, key))
			return (uh);
	}

	return (NULL);
}

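/*
 * Enqueue a thread on the per-key queue, creating the queue from
 * the thread's spare entry if none exists yet; otherwise park the
 * spare entry on the chain for later reuse.
 */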
static inline void
umtxq_insert_queue(struct umtx_q *uq, int q)
{
	struct umtxq_queue *uh;
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	KASSERT((uq->uq_flags & UQF_UMTXQ) == 0, ("umtx_q is already on queue"));
	uh = umtxq_queue_lookup(&uq->uq_key, q);
	if (uh != NULL) {
		LIST_INSERT_HEAD(&uc->uc_spare_queue, uq->uq_spare_queue, link);
	} else {
		uh = uq->uq_spare_queue;
		uh->key = uq->uq_key;
		LIST_INSERT_HEAD(&uc->uc_queue[q], uh, link);
	}
	uq->uq_spare_queue = NULL;

	TAILQ_INSERT_TAIL(&uh->head, uq, uq_link);
	uh->length++;
	uq->uq_flags |= UQF_UMTXQ;
	uq->uq_cur_queue = uh;
	return;
}

static inline void
umtxq_remove_queue(struct umtx_q *uq, int q)
{
	struct umtxq_chain *uc;
	struct umtxq_queue *uh;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	if (uq->uq_flags & UQF_UMTXQ) {
		uh = uq->uq_cur_queue;
		TAILQ_REMOVE(&uh->head, uq, uq_link);
		uh->length--;
		uq->uq_flags &= ~UQF_UMTXQ;
		if (TAILQ_EMPTY(&uh->head)) {
			KASSERT(uh->length == 0,
			    ("inconsistent umtxq_queue length"));
			LIST_REMOVE(uh, link);
		} else {
			uh = LIST_FIRST(&uc->uc_spare_queue);
			KASSERT(uh != NULL, ("uc_spare_queue is empty"));
			LIST_REMOVE(uh, link);
		}
		uq->uq_spare_queue = uh;
		uq->uq_cur_queue = NULL;
	}
}

/*
 * Return the number of threads waiting on a key.
 */
static int
umtxq_count(struct umtx_key *key)
{
	struct umtxq_chain *uc;
	struct umtxq_queue *uh;

	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
	if (uh != NULL)
		return (uh->length);
	return (0);
}

/*
 * Return the number of PI waiters on a key and, via *first, the
 * first waiter.
 */
static int
umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
{
	struct umtxq_chain *uc;
	struct umtxq_queue *uh;

	*first = NULL;
	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
	if (uh != NULL) {
		*first = TAILQ_FIRST(&uh->head);
		return (uh->length);
	}
	return (0);
}

/*
 * Wake up threads waiting on a userland object.
 */
static int
umtxq_signal_queue(struct umtx_key *key, int n_wake, int q)
{
	struct umtxq_chain *uc;
	struct umtxq_queue *uh;
	struct umtx_q *uq;
	int ret;

	ret = 0;
	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	uh = umtxq_queue_lookup(key, q);
	if (uh != NULL) {
		while ((uq = TAILQ_FIRST(&uh->head)) != NULL) {
			umtxq_remove_queue(uq, q);
			wakeup(uq);
			if (++ret >= n_wake)
				return (ret);
		}
	}
	return (ret);
}

/*
 * Wake up the specified thread.
 */
static inline void
umtxq_signal_thread(struct umtx_q *uq)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	umtxq_remove(uq);
	wakeup(uq);
}

/*
 * Put a thread to sleep.  Before sleeping, check whether the
 * thread was already removed from its umtx queue (i.e. woken up).
 */
static inline int
umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo)
{
	struct umtxq_chain *uc;
	int error;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	if (!(uq->uq_flags & UQF_UMTXQ))
		return (0);
	error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
	if (error == EWOULDBLOCK)
		error = ETIMEDOUT;
	return (error);
}

/*
 * Convert a userspace address into a unique logical address.
 */
static int
umtx_key_get(void *addr, int type, int share, struct umtx_key *key)
{
	struct thread *td = curthread;
	vm_map_t map;
	vm_map_entry_t entry;
	vm_pindex_t pindex;
	vm_prot_t prot;
	boolean_t wired;

	key->type = type;
	if (share == THREAD_SHARE) {
		key->shared = 0;
		key->info.private.vs = td->td_proc->p_vmspace;
		key->info.private.addr = (uintptr_t)addr;
	} else {
		MPASS(share == PROCESS_SHARE || share == AUTO_SHARE);
		map = &td->td_proc->p_vmspace->vm_map;
		if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
		    &entry, &key->info.shared.object, &pindex, &prot,
		    &wired) != KERN_SUCCESS) {
			return (EFAULT);
		}

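		/*
		 * Use an object-based (shared) key for PROCESS_SHARE, or
		 * for AUTO_SHARE when the mapping is inherited shared;
		 * otherwise fall back to a private vmspace/address key.
		 */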
		if ((share == PROCESS_SHARE) ||
		    (share == AUTO_SHARE &&
		     VM_INHERIT_SHARE == entry->inheritance)) {
			key->shared = 1;
			key->info.shared.offset = entry->offset + entry->start -
				(vm_offset_t)addr;
			vm_object_reference(key->info.shared.object);
		} else {
			key->shared = 0;
			key->info.private.vs = td->td_proc->p_vmspace;
			key->info.private.addr = (uintptr_t)addr;
		}
		vm_map_lookup_done(map, entry);
	}

	umtxq_hash(key);
	return (0);
}

/*
 * Release key.
 */
static inline void
umtx_key_release(struct umtx_key *key)
{
	if (key->shared)
		vm_object_deallocate(key->info.shared.object);
}

/*
 * Lock a umtx object.
 */
static int
_do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id, int timo)
{
	struct umtx_q *uq;
	u_long owner;
	u_long old;
	int error = 0;

	uq = td->td_umtxq;

	/*
	 * Care must be exercised when dealing with umtx structure. It
	 * can fault on any access.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword(&umtx->u_owner, UMTX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMTX_UNOWNED)
			return (0);

		/* The address was invalid. */
		if (owner == -1)
			return (EFAULT);

		/* If no one owns it but it is contested try to acquire it. */
		if (owner == UMTX_CONTESTED) {
			owner = casuword(&umtx->u_owner,
			    UMTX_CONTESTED, id | UMTX_CONTESTED);

			if (owner == UMTX_CONTESTED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			/* If this failed the lock has changed, restart. */
			continue;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK,
			AUTO_SHARE, &uq->uq_key)) != 0)
			return (error);

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword(&umtx->u_owner, owner, owner | UMTX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race to the
		 * thread unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtx", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);
	}

	return (0);
}

/*
 * Lock a umtx object.
 */
static int
do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id,
	struct timespec *timeout)
{
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	int error;

	if (timeout == NULL) {
		error = _do_lock_umtx(td, umtx, id, 0);
		/* Mutex locking is restarted if it is interrupted. */
		if (error == EINTR)
			error = ERESTART;
	} else {
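		/*
		 * Convert the relative timeout to an absolute uptime
		 * deadline, then re-sleep on the remaining time each
		 * time the sleep times out early.
		 */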
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
		for (;;) {
			error = _do_lock_umtx(td, umtx, id, tvtohz(&tv));
			if (error != ETIMEDOUT)
				break;
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				break;
			}
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
		}
		/* Timed-locking is not restarted. */
		if (error == ERESTART)
			error = EINTR;
	}
	return (error);
}

/*
 * Unlock a umtx object.
 */
static int
do_unlock_umtx(struct thread *td, struct umtx *umtx, u_long id)
{
	struct umtx_key key;
	u_long owner;
	u_long old;
	int error;
	int count;

	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword(__DEVOLATILE(u_long *, &umtx->u_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMTX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMTX_CONTESTED) == 0) {
		old = casuword(&umtx->u_owner, owner, UMTX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK, AUTO_SHARE,
		&key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * zero or one thread is waiting for it; otherwise it must
	 * be marked as contested.
	 */
	old = casuword(&umtx->u_owner, owner,
		count <= 1 ? UMTX_UNOWNED : UMTX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}

822
823/*
824 * Lock a umtx object.
825 */
826static int
827_do_lock_umtx32(struct thread *td, uint32_t *m, uint32_t id, int timo)
828{
829	struct umtx_q *uq;
830	uint32_t owner;
831	uint32_t old;
832	int error = 0;
833
834	uq = td->td_umtxq;
835
836	/*
837	 * Care must be exercised when dealing with umtx structure. It
838	 * can fault on any access.
839	 */
840	for (;;) {
841		/*
842		 * Try the uncontested case.  This should be done in userland.
843		 */
844		owner = casuword32(m, UMUTEX_UNOWNED, id);
845
846		/* The acquire succeeded. */
847		if (owner == UMUTEX_UNOWNED)
848			return (0);
849
850		/* The address was invalid. */
851		if (owner == -1)
852			return (EFAULT);
853
854		/* If no one owns it but it is contested try to acquire it. */
855		if (owner == UMUTEX_CONTESTED) {
856			owner = casuword32(m,
857			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
858			if (owner == UMUTEX_CONTESTED)
859				return (0);
860
861			/* The address was invalid. */
862			if (owner == -1)
863				return (EFAULT);
864
865			/* If this failed the lock has changed, restart. */
866			continue;
867		}
868
869		/*
870		 * If we caught a signal, we have retried and now
871		 * exit immediately.
872		 */
873		if (error != 0)
874			return (error);
875
876		if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK,
877			AUTO_SHARE, &uq->uq_key)) != 0)
878			return (error);
879
880		umtxq_lock(&uq->uq_key);
881		umtxq_busy(&uq->uq_key);
882		umtxq_insert(uq);
883		umtxq_unbusy(&uq->uq_key);
884		umtxq_unlock(&uq->uq_key);
885
886		/*
887		 * Set the contested bit so that a release in user space
888		 * knows to use the system call for unlock.  If this fails
889		 * either some one else has acquired the lock or it has been
890		 * released.
891		 */
892		old = casuword32(m, owner, owner | UMUTEX_CONTESTED);
893
894		/* The address was invalid. */
895		if (old == -1) {
896			umtxq_lock(&uq->uq_key);
897			umtxq_remove(uq);
898			umtxq_unlock(&uq->uq_key);
899			umtx_key_release(&uq->uq_key);
900			return (EFAULT);
901		}
902
903		/*
904		 * We set the contested bit, sleep. Otherwise the lock changed
905		 * and we need to retry or we lost a race to the thread
906		 * unlocking the umtx.
907		 */
908		umtxq_lock(&uq->uq_key);
909		if (old == owner)
910			error = umtxq_sleep(uq, "umtx", timo);
911		umtxq_remove(uq);
912		umtxq_unlock(&uq->uq_key);
913		umtx_key_release(&uq->uq_key);
914	}
915
916	return (0);
917}
918
/*
 * Lock a umtx object.
 */
static int
do_lock_umtx32(struct thread *td, void *m, uint32_t id,
	struct timespec *timeout)
{
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	int error;

	if (timeout == NULL) {
		error = _do_lock_umtx32(td, m, id, 0);
		/* Mutex locking is restarted if it is interrupted. */
		if (error == EINTR)
			error = ERESTART;
	} else {
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
		for (;;) {
			error = _do_lock_umtx32(td, m, id, tvtohz(&tv));
			if (error != ETIMEDOUT)
				break;
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				break;
			}
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
		}
		/* Timed-locking is not restarted. */
		if (error == ERESTART)
			error = EINTR;
	}
	return (error);
}

/*
 * Unlock a umtx object.
 */
static int
do_unlock_umtx32(struct thread *td, uint32_t *m, uint32_t id)
{
	struct umtx_key key;
	uint32_t owner;
	uint32_t old;
	int error;
	int count;

	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(m);
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMUTEX_CONTESTED) == 0) {
		old = casuword32(m, owner, UMUTEX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK, AUTO_SHARE,
		&key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * zero or one thread is waiting for it; otherwise it must
	 * be marked as contested.
	 */
	old = casuword32(m, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}
#endif

/*
 * Fetch and compare a value; sleep on the address if the value is unchanged.
 */
static int
do_wait(struct thread *td, void *addr, u_long id,
	struct timespec *timeout, int compat32, int is_private)
{
	struct umtx_q *uq;
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	u_long tmp;
	int error = 0;

	uq = td->td_umtxq;
	if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT,
		is_private ? THREAD_SHARE : AUTO_SHARE, &uq->uq_key)) != 0)
		return (error);

	umtxq_lock(&uq->uq_key);
	umtxq_insert(uq);
	umtxq_unlock(&uq->uq_key);
	if (compat32 == 0)
		tmp = fuword(addr);
	else
		tmp = (unsigned int)fuword32(addr);
	if (tmp != id) {
		umtxq_lock(&uq->uq_key);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	} else if (timeout == NULL) {
		umtxq_lock(&uq->uq_key);
		error = umtxq_sleep(uq, "uwait", 0);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	} else {
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
		umtxq_lock(&uq->uq_key);
		for (;;) {
			error = umtxq_sleep(uq, "uwait", tvtohz(&tv));
			if (!(uq->uq_flags & UQF_UMTXQ)) {
				error = 0;
				break;
			}
			if (error != ETIMEDOUT)
				break;
			umtxq_unlock(&uq->uq_key);
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				umtxq_lock(&uq->uq_key);
				break;
			}
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
			umtxq_lock(&uq->uq_key);
		}
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	}
	umtx_key_release(&uq->uq_key);
	if (error == ERESTART)
		error = EINTR;
	return (error);
}

/*
 * Wake up threads sleeping on the specified address.
 */
int
kern_umtx_wake(struct thread *td, void *uaddr, int n_wake, int is_private)
{
	struct umtx_key key;
	int ret;

	if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT,
		is_private ? THREAD_SHARE : AUTO_SHARE, &key)) != 0)
		return (ret);
	umtxq_lock(&key);
	ret = umtxq_signal(&key, n_wake);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	return (0);
}

/*
 * Lock a PTHREAD_PRIO_NONE protocol POSIX mutex.
 */
static int
_do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags, int timo,
	int mode)
{
	struct umtx_q *uq;
	uint32_t owner, old, id;
	int error = 0;

	id = td->td_tid;
	uq = td->td_umtxq;

	/*
	 * Care must be exercised when dealing with umtx structure. It
	 * can fault on any access.
	 */
	for (;;) {
		owner = fuword32(__DEVOLATILE(void *, &m->m_owner));
		if (mode == _UMUTEX_WAIT) {
			if (owner == UMUTEX_UNOWNED || owner == UMUTEX_CONTESTED)
				return (0);
		} else {
			/*
			 * Try the uncontested case.  This should be done in userland.
			 */
			owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);

			/* The acquire succeeded. */
			if (owner == UMUTEX_UNOWNED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			/* If no one owns it but it is contested try to acquire it. */
			if (owner == UMUTEX_CONTESTED) {
				owner = casuword32(&m->m_owner,
				    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

				if (owner == UMUTEX_CONTESTED)
					return (0);

				/* The address was invalid. */
				if (owner == -1)
					return (EFAULT);

				/* If this failed the lock has changed, restart. */
				continue;
			}
		}

		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
		    (owner & ~UMUTEX_CONTESTED) == id)
			return (EDEADLK);

		if (mode == _UMUTEX_TRY)
			return (EBUSY);

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
		    GET_SHARE(flags), &uq->uq_key)) != 0)
			return (error);

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unbusy(&uq->uq_key);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race to the
		 * thread unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		umtxq_unbusy(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtxn", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);
	}

	return (0);
}

/*
 * Unlock a PTHREAD_PRIO_NONE protocol POSIX mutex.
 */
static int
do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags)
{
	struct umtx_key key;
	uint32_t owner, old, id;
	int error;
	int count;

	id = td->td_tid;
	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	if ((owner & UMUTEX_CONTESTED) == 0) {
		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * zero or one thread is waiting for it; otherwise it must
	 * be marked as contested.
	 */
	old = casuword32(&m->m_owner, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}

/*
 * Check if the mutex is available and wake up a waiter;
 * this applies only to simple mutexes.
 */
static int
do_wake_umutex(struct thread *td, struct umutex *m)
{
	struct umtx_key key;
	uint32_t owner;
	uint32_t flags;
	int error;
	int count;

	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != 0)
		return (0);

	flags = fuword32(&m->m_flags);

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	if (count <= 1)
		owner = casuword32(&m->m_owner, UMUTEX_CONTESTED, UMUTEX_UNOWNED);

	umtxq_lock(&key);
	if (count != 0 && (owner & ~UMUTEX_CONTESTED) == 0)
		umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	return (0);
}

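/*
 * Allocate and initialize a PI mutex record; the count of live
 * records is exported by the debug.umtx.umtx_pi_allocated sysctl.
 */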
static inline struct umtx_pi *
umtx_pi_alloc(int flags)
{
	struct umtx_pi *pi;

	pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags);
	TAILQ_INIT(&pi->pi_blocked);
	atomic_add_int(&umtx_pi_allocated, 1);
	return (pi);
}

static inline void
umtx_pi_free(struct umtx_pi *pi)
{
	uma_zfree(umtx_pi_zone, pi);
	atomic_add_int(&umtx_pi_allocated, -1);
}

/*
 * Adjust the thread's position on a pi_state after its priority has been
 * changed.
 */
static int
umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
{
	struct umtx_q *uq, *uq1, *uq2;
	struct thread *td1;

	mtx_assert(&umtx_lock, MA_OWNED);
	if (pi == NULL)
		return (0);

	uq = td->td_umtxq;

	/*
	 * Check if the thread needs to be moved on the blocked chain.
	 * It needs to be moved if either its priority is lower than
	 * the previous thread or higher than the next thread.
	 */
	uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
	uq2 = TAILQ_NEXT(uq, uq_lockq);
	if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
	    (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
		/*
		 * Remove thread from blocked chain and determine where
		 * it should be moved to.
		 */
		TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
		TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
			td1 = uq1->uq_thread;
			MPASS(td1->td_proc->p_magic == P_MAGIC);
			if (UPRI(td1) > UPRI(td))
				break;
		}

		if (uq1 == NULL)
			TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
		else
			TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
	}
	return (1);
}

/*
 * Propagate priority when a thread is blocked on a POSIX
 * PI mutex.
 */
static void
umtx_propagate_priority(struct thread *td)
{
	struct umtx_q *uq;
	struct umtx_pi *pi;
	int pri;

	mtx_assert(&umtx_lock, MA_OWNED);
	pri = UPRI(td);
	uq = td->td_umtxq;
	pi = uq->uq_pi_blocked;
	if (pi == NULL)
		return;

	for (;;) {
		td = pi->pi_owner;
		if (td == NULL || td == curthread)
			return;

		MPASS(td->td_proc != NULL);
		MPASS(td->td_proc->p_magic == P_MAGIC);

		thread_lock(td);
		if (td->td_lend_user_pri > pri)
			sched_lend_user_prio(td, pri);
		else {
			thread_unlock(td);
			break;
		}
		thread_unlock(td);

		/*
		 * Pick up the lock that td is blocked on.
		 */
		uq = td->td_umtxq;
		pi = uq->uq_pi_blocked;
		/* Resort td on the list if needed. */
		if (!umtx_pi_adjust_thread(pi, td))
			break;
	}
}

/*
 * Unpropagate priority for a PI mutex when a thread blocked on
 * it is interrupted by a signal or resumed by others.
 */
static void
umtx_unpropagate_priority(struct umtx_pi *pi)
{
	struct umtx_q *uq, *uq_owner;
	struct umtx_pi *pi2;
	int pri, oldpri;

	mtx_assert(&umtx_lock, MA_OWNED);

	while (pi != NULL && pi->pi_owner != NULL) {
		pri = PRI_MAX;
		uq_owner = pi->pi_owner->td_umtxq;

		TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
			uq = TAILQ_FIRST(&pi2->pi_blocked);
			if (uq != NULL) {
				if (pri > UPRI(uq->uq_thread))
					pri = UPRI(uq->uq_thread);
			}
		}

		if (pri > uq_owner->uq_inherited_pri)
			pri = uq_owner->uq_inherited_pri;
		thread_lock(pi->pi_owner);
		oldpri = pi->pi_owner->td_user_pri;
		sched_unlend_user_prio(pi->pi_owner, pri);
		thread_unlock(pi->pi_owner);
		if (uq_owner->uq_pi_blocked != NULL)
			umtx_pi_adjust_locked(pi->pi_owner, oldpri);
		pi = uq_owner->uq_pi_blocked;
	}
}

/*
 * Insert a PI mutex into the owner's list.
 */
static void
umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
{
	struct umtx_q *uq_owner;

	uq_owner = owner->td_umtxq;
	mtx_assert(&umtx_lock, MA_OWNED);
	if (pi->pi_owner != NULL)
		panic("pi_owner != NULL");
	pi->pi_owner = owner;
	TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
}

/*
 * Claim ownership of a PI mutex.
 */
static int
umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
{
	struct umtx_q *uq, *uq_owner;

	uq_owner = owner->td_umtxq;
	mtx_lock_spin(&umtx_lock);
	if (pi->pi_owner == owner) {
		mtx_unlock_spin(&umtx_lock);
		return (0);
	}

	if (pi->pi_owner != NULL) {
		/*
		 * Userland may already have messed up the mutex, sigh.
		 */
		mtx_unlock_spin(&umtx_lock);
		return (EPERM);
	}
	umtx_pi_setowner(pi, owner);
	uq = TAILQ_FIRST(&pi->pi_blocked);
	if (uq != NULL) {
		int pri;

		pri = UPRI(uq->uq_thread);
		thread_lock(owner);
		if (pri < UPRI(owner))
			sched_lend_user_prio(owner, pri);
		thread_unlock(owner);
	}
	mtx_unlock_spin(&umtx_lock);
	return (0);
}

static void
umtx_pi_adjust_locked(struct thread *td, u_char oldpri)
{
	struct umtx_q *uq;
	struct umtx_pi *pi;

	uq = td->td_umtxq;
	/*
	 * Pick up the lock that td is blocked on.
	 */
	pi = uq->uq_pi_blocked;
	MPASS(pi != NULL);

	/* Resort the turnstile on the list. */
	if (!umtx_pi_adjust_thread(pi, td))
		return;

	/*
	 * If our priority was lowered and we are at the head of the
	 * turnstile, then propagate our new priority up the chain.
	 */
	if (uq == TAILQ_FIRST(&pi->pi_blocked) && UPRI(td) < oldpri)
		umtx_propagate_priority(td);
}

/*
 * Adjust a thread's position on the PI mutex it is blocked on;
 * this may trigger a new round of priority propagation.
 */
void
umtx_pi_adjust(struct thread *td, u_char oldpri)
{
	struct umtx_q *uq;
	struct umtx_pi *pi;

	uq = td->td_umtxq;
	mtx_lock_spin(&umtx_lock);
	/*
	 * Pick up the lock that td is blocked on.
	 */
	pi = uq->uq_pi_blocked;
	if (pi != NULL)
		umtx_pi_adjust_locked(td, oldpri);
	mtx_unlock_spin(&umtx_lock);
}

/*
 * Sleep on a PI mutex.
 */
static int
umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi,
	uint32_t owner, const char *wmesg, int timo)
{
	struct umtxq_chain *uc;
	struct thread *td, *td1;
	struct umtx_q *uq1;
	int pri;
	int error = 0;

	td = uq->uq_thread;
	KASSERT(td == curthread, ("inconsistent uq_thread"));
	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	UMTXQ_BUSY_ASSERT(uc);
	umtxq_insert(uq);
	mtx_lock_spin(&umtx_lock);
	if (pi->pi_owner == NULL) {
		mtx_unlock_spin(&umtx_lock);
		/* XXX Only look up thread in current process. */
		td1 = tdfind(owner, curproc->p_pid);
		mtx_lock_spin(&umtx_lock);
		if (td1 != NULL) {
			if (pi->pi_owner == NULL)
				umtx_pi_setowner(pi, td1);
			PROC_UNLOCK(td1->td_proc);
		}
	}

	TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
		pri = UPRI(uq1->uq_thread);
		if (pri > UPRI(td))
			break;
	}

	if (uq1 != NULL)
		TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
	else
		TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);

	uq->uq_pi_blocked = pi;
	thread_lock(td);
	td->td_flags |= TDF_UPIBLOCKED;
	thread_unlock(td);
	umtx_propagate_priority(td);
	mtx_unlock_spin(&umtx_lock);
	umtxq_unbusy(&uq->uq_key);

	if (uq->uq_flags & UQF_UMTXQ) {
		error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
		if (error == EWOULDBLOCK)
			error = ETIMEDOUT;
		if (uq->uq_flags & UQF_UMTXQ) {
			umtxq_remove(uq);
		}
	}
	mtx_lock_spin(&umtx_lock);
	uq->uq_pi_blocked = NULL;
	thread_lock(td);
	td->td_flags &= ~TDF_UPIBLOCKED;
	thread_unlock(td);
	TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
	umtx_unpropagate_priority(pi);
	mtx_unlock_spin(&umtx_lock);
	umtxq_unlock(&uq->uq_key);

	return (error);
}

/*
 * Add a reference to a PI mutex.
 */
static void
umtx_pi_ref(struct umtx_pi *pi)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&pi->pi_key);
	UMTXQ_LOCKED_ASSERT(uc);
	pi->pi_refcount++;
}

/*
 * Drop a reference on a PI mutex; when the count reaches zero,
 * its memory is freed.
 */
static void
umtx_pi_unref(struct umtx_pi *pi)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&pi->pi_key);
	UMTXQ_LOCKED_ASSERT(uc);
	KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
	if (--pi->pi_refcount == 0) {
		mtx_lock_spin(&umtx_lock);
		if (pi->pi_owner != NULL) {
			TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested,
				pi, pi_link);
			pi->pi_owner = NULL;
		}
		KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
			("blocked queue not empty"));
		mtx_unlock_spin(&umtx_lock);
		TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
		umtx_pi_free(pi);
	}
}

/*
 * Find a PI mutex in the hash table.
 */
static struct umtx_pi *
umtx_pi_lookup(struct umtx_key *key)
{
	struct umtxq_chain *uc;
	struct umtx_pi *pi;

	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);

	TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
		if (umtx_key_match(&pi->pi_key, key)) {
			return (pi);
		}
	}
	return (NULL);
}

/*
 * Insert a PI mutex into the hash table.
 */
static inline void
umtx_pi_insert(struct umtx_pi *pi)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&pi->pi_key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
}

/*
 * Lock a PI mutex.
 */
static int
_do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags, int timo,
	int try)
{
	struct umtx_q *uq;
	struct umtx_pi *pi, *new_pi;
	uint32_t id, owner, old;
	int error;

	id = td->td_tid;
	uq = td->td_umtxq;

	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
	    &uq->uq_key)) != 0)
		return (error);
	umtxq_lock(&uq->uq_key);
	pi = umtx_pi_lookup(&uq->uq_key);
	if (pi == NULL) {
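		/*
		 * No PI record yet.  Try a non-sleeping allocation first;
		 * if that fails, drop the chain lock, allocate sleeping,
		 * and then recheck for a racing insertion.
		 */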
		new_pi = umtx_pi_alloc(M_NOWAIT);
		if (new_pi == NULL) {
			umtxq_unlock(&uq->uq_key);
			new_pi = umtx_pi_alloc(M_WAITOK);
			umtxq_lock(&uq->uq_key);
			pi = umtx_pi_lookup(&uq->uq_key);
			if (pi != NULL) {
				umtx_pi_free(new_pi);
				new_pi = NULL;
			}
		}
		if (new_pi != NULL) {
			new_pi->pi_key = uq->uq_key;
			umtx_pi_insert(new_pi);
			pi = new_pi;
		}
	}
	umtx_pi_ref(pi);
	umtxq_unlock(&uq->uq_key);

	/*
	 * Care must be exercised when dealing with umtx structure.  It
	 * can fault on any access.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMUTEX_UNOWNED) {
			error = 0;
			break;
		}

		/* The address was invalid. */
		if (owner == -1) {
			error = EFAULT;
			break;
		}

		/* If no one owns it but it is contested try to acquire it. */
		if (owner == UMUTEX_CONTESTED) {
			owner = casuword32(&m->m_owner,
			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

			if (owner == UMUTEX_CONTESTED) {
				umtxq_lock(&uq->uq_key);
				umtxq_busy(&uq->uq_key);
				error = umtx_pi_claim(pi, td);
				umtxq_unbusy(&uq->uq_key);
				umtxq_unlock(&uq->uq_key);
				break;
			}

			/* The address was invalid. */
			if (owner == -1) {
				error = EFAULT;
				break;
			}

			/* If this failed the lock has changed, restart. */
			continue;
		}

		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
		    (owner & ~UMUTEX_CONTESTED) == id) {
			error = EDEADLK;
			break;
		}

		if (try != 0) {
			error = EBUSY;
			break;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			break;

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_unbusy(&uq->uq_key);
			umtxq_unlock(&uq->uq_key);
			error = EFAULT;
			break;
		}

		umtxq_lock(&uq->uq_key);
		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race to the
		 * thread unlocking the umtx.
		 */
		if (old == owner)
			error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED,
				 "umtxpi", timo);
		else {
			umtxq_unbusy(&uq->uq_key);
			umtxq_unlock(&uq->uq_key);
		}
	}

	umtxq_lock(&uq->uq_key);
	umtx_pi_unref(pi);
	umtxq_unlock(&uq->uq_key);

	umtx_key_release(&uq->uq_key);
	return (error);
}

/*
 * Unlock a PI mutex.
 */
static int
do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags)
{
	struct umtx_key key;
	struct umtx_q *uq_first, *uq_first2, *uq_me;
	struct umtx_pi *pi, *pi2;
	uint32_t owner, old, id;
	int error;
	int count;
	int pri;

	id = td->td_tid;
	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMUTEX_CONTESTED) == 0) {
		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count_pi(&key, &uq_first);
	if (uq_first != NULL) {
		mtx_lock_spin(&umtx_lock);
		pi = uq_first->uq_pi_blocked;
		KASSERT(pi != NULL, ("pi == NULL?"));
		if (pi->pi_owner != curthread) {
			mtx_unlock_spin(&umtx_lock);
			umtxq_unbusy(&key);
			umtxq_unlock(&key);
			umtx_key_release(&key);
			/* userland messed up the mutex */
			return (EPERM);
		}
		uq_me = curthread->td_umtxq;
		pi->pi_owner = NULL;
		TAILQ_REMOVE(&uq_me->uq_pi_contested, pi, pi_link);
		/* Get the highest-priority thread that is still sleeping. */
		uq_first = TAILQ_FIRST(&pi->pi_blocked);
		while (uq_first != NULL &&
		       (uq_first->uq_flags & UQF_UMTXQ) == 0) {
			uq_first = TAILQ_NEXT(uq_first, uq_lockq);
		}
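		/*
		 * Recompute our priority from the PI mutexes we still
		 * own that remain contested.
		 */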
		pri = PRI_MAX;
		TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
			uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
			if (uq_first2 != NULL) {
				if (pri > UPRI(uq_first2->uq_thread))
					pri = UPRI(uq_first2->uq_thread);
			}
		}
		thread_lock(curthread);
		sched_unlend_user_prio(curthread, pri);
		thread_unlock(curthread);
		mtx_unlock_spin(&umtx_lock);
		if (uq_first)
			umtxq_signal_thread(uq_first);
	}
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * zero or one thread is waiting for it; otherwise it must
	 * be marked as contested.
	 */
	old = casuword32(&m->m_owner, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);

	umtxq_lock(&key);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}

/*
 * Lock a PP mutex.
 */
static int
_do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags, int timo,
	int try)
{
	struct umtx_q *uq, *uq2;
	struct umtx_pi *pi;
	uint32_t ceiling;
	uint32_t owner, id;
	int error, pri, old_inherited_pri, su;

	id = td->td_tid;
	uq = td->td_umtxq;
	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
	    &uq->uq_key)) != 0)
		return (error);
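	/*
	 * "su" is true when the thread may use real-time priorities;
	 * only then is the priority ceiling applied to the thread's
	 * inherited priority below.
	 */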
	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
	for (;;) {
		old_inherited_pri = uq->uq_inherited_pri;
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		ceiling = RTP_PRIO_MAX - fuword32(&m->m_ceilings[0]);
		if (ceiling > RTP_PRIO_MAX) {
			error = EINVAL;
			goto out;
		}

		mtx_lock_spin(&umtx_lock);
		if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
			mtx_unlock_spin(&umtx_lock);
			error = EINVAL;
			goto out;
		}
		if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
			uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
			thread_lock(td);
			if (uq->uq_inherited_pri < UPRI(td))
				sched_lend_user_prio(td, uq->uq_inherited_pri);
			thread_unlock(td);
		}
		mtx_unlock_spin(&umtx_lock);

		owner = casuword32(&m->m_owner,
		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

		if (owner == UMUTEX_CONTESTED) {
			error = 0;
			break;
		}

		/* The address was invalid. */
		if (owner == -1) {
			error = EFAULT;
			break;
		}

		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
		    (owner & ~UMUTEX_CONTESTED) == id) {
			error = EDEADLK;
			break;
		}

		if (try != 0) {
			error = EBUSY;
			break;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			break;

		umtxq_lock(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		error = umtxq_sleep(uq, "umtxpp", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);

		mtx_lock_spin(&umtx_lock);
		uq->uq_inherited_pri = old_inherited_pri;
		pri = PRI_MAX;
		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
			uq2 = TAILQ_FIRST(&pi->pi_blocked);
			if (uq2 != NULL) {
				if (pri > UPRI(uq2->uq_thread))
					pri = UPRI(uq2->uq_thread);
			}
		}
		if (pri > uq->uq_inherited_pri)
			pri = uq->uq_inherited_pri;
		thread_lock(td);
		sched_unlend_user_prio(td, pri);
		thread_unlock(td);
		mtx_unlock_spin(&umtx_lock);
	}

	if (error != 0) {
		mtx_lock_spin(&umtx_lock);
		uq->uq_inherited_pri = old_inherited_pri;
		pri = PRI_MAX;
		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
			uq2 = TAILQ_FIRST(&pi->pi_blocked);
			if (uq2 != NULL) {
				if (pri > UPRI(uq2->uq_thread))
					pri = UPRI(uq2->uq_thread);
			}
		}
		if (pri > uq->uq_inherited_pri)
			pri = uq->uq_inherited_pri;
		thread_lock(td);
		sched_unlend_user_prio(td, pri);
		thread_unlock(td);
		mtx_unlock_spin(&umtx_lock);
	}

out:
	umtxq_lock(&uq->uq_key);
	umtxq_unbusy(&uq->uq_key);
	umtxq_unlock(&uq->uq_key);
	umtx_key_release(&uq->uq_key);
	return (error);
}

2096/*
2097 * Unlock a PP mutex.
2098 */
2099static int
2100do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags)
2101{
2102	struct umtx_key key;
2103	struct umtx_q *uq, *uq2;
2104	struct umtx_pi *pi;
2105	uint32_t owner, id;
2106	uint32_t rceiling;
2107	int error, pri, new_inherited_pri, su;
2108
2109	id = td->td_tid;
2110	uq = td->td_umtxq;
2111	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
2112
2113	/*
2114	 * Make sure we own this mtx.
2115	 */
2116	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
2117	if (owner == -1)
2118		return (EFAULT);
2119
2120	if ((owner & ~UMUTEX_CONTESTED) != id)
2121		return (EPERM);
2122
2123	error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
2124	if (error != 0)
2125		return (error);
2126
2127	if (rceiling == -1)
2128		new_inherited_pri = PRI_MAX;
2129	else {
2130		rceiling = RTP_PRIO_MAX - rceiling;
2131		if (rceiling > RTP_PRIO_MAX)
2132			return (EINVAL);
2133		new_inherited_pri = PRI_MIN_REALTIME + rceiling;
2134	}
2135
2136	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2137	    &key)) != 0)
2138		return (error);
2139	umtxq_lock(&key);
2140	umtxq_busy(&key);
2141	umtxq_unlock(&key);
2142	/*
2143	 * For priority protected mutex, always set unlocked state
2144	 * to UMUTEX_CONTESTED, so that userland always enters kernel
2145	 * to lock the mutex, it is necessary because thread priority
2146	 * has to be adjusted for such mutex.
2147	 */
2148	error = suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
2149		UMUTEX_CONTESTED);
2150
2151	umtxq_lock(&key);
2152	if (error == 0)
2153		umtxq_signal(&key, 1);
2154	umtxq_unbusy(&key);
2155	umtxq_unlock(&key);
2156
2157	if (error == -1)
2158		error = EFAULT;
2159	else {
2160		mtx_lock_spin(&umtx_lock);
2161		if (su != 0)
2162			uq->uq_inherited_pri = new_inherited_pri;
2163		pri = PRI_MAX;
2164		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2165			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2166			if (uq2 != NULL) {
2167				if (pri > UPRI(uq2->uq_thread))
2168					pri = UPRI(uq2->uq_thread);
2169			}
2170		}
2171		if (pri > uq->uq_inherited_pri)
2172			pri = uq->uq_inherited_pri;
2173		thread_lock(td);
2174		sched_unlend_user_prio(td, pri);
2175		thread_unlock(td);
2176		mtx_unlock_spin(&umtx_lock);
2177	}
2178	umtx_key_release(&key);
2179	return (error);
2180}
2181
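/*
 * Change the priority ceiling of a PP mutex.  On success the
 * previous ceiling is copied out to userland via old_ceiling.
 */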
2182static int
2183do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
2184	uint32_t *old_ceiling)
2185{
2186	struct umtx_q *uq;
2187	uint32_t save_ceiling;
2188	uint32_t owner, id;
2189	uint32_t flags;
2190	int error;
2191
2192	flags = fuword32(&m->m_flags);
2193	if ((flags & UMUTEX_PRIO_PROTECT) == 0)
2194		return (EINVAL);
2195	if (ceiling > RTP_PRIO_MAX)
2196		return (EINVAL);
2197	id = td->td_tid;
2198	uq = td->td_umtxq;
2199	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2200	   &uq->uq_key)) != 0)
2201		return (error);
2202	for (;;) {
2203		umtxq_lock(&uq->uq_key);
2204		umtxq_busy(&uq->uq_key);
2205		umtxq_unlock(&uq->uq_key);
2206
2207		save_ceiling = fuword32(&m->m_ceilings[0]);
2208
2209		owner = casuword32(&m->m_owner,
2210		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
2211
2212		if (owner == UMUTEX_CONTESTED) {
2213			suword32(&m->m_ceilings[0], ceiling);
2214			suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
2215				UMUTEX_CONTESTED);
2216			error = 0;
2217			break;
2218		}
2219
2220		/* The address was invalid. */
2221		if (owner == -1) {
2222			error = EFAULT;
2223			break;
2224		}
2225
2226		if ((owner & ~UMUTEX_CONTESTED) == id) {
2227			suword32(&m->m_ceilings[0], ceiling);
2228			error = 0;
2229			break;
2230		}
2231
2232		/*
2233		 * If we caught a signal, we have already retried and
2234		 * now exit immediately.
2235		 */
2236		if (error != 0)
2237			break;
2238
2239		/*
2240		 * If we set the contested bit, sleep.  Otherwise the lock
2241		 * changed and we need to retry, or we lost a race to the
2242		 * thread unlocking the umtx.
2243		 */
2244		umtxq_lock(&uq->uq_key);
2245		umtxq_insert(uq);
2246		umtxq_unbusy(&uq->uq_key);
2247		error = umtxq_sleep(uq, "umtxpp", 0);
2248		umtxq_remove(uq);
2249		umtxq_unlock(&uq->uq_key);
2250	}
2251	umtxq_lock(&uq->uq_key);
2252	if (error == 0)
2253		umtxq_signal(&uq->uq_key, INT_MAX);
2254	umtxq_unbusy(&uq->uq_key);
2255	umtxq_unlock(&uq->uq_key);
2256	umtx_key_release(&uq->uq_key);
2257	if (error == 0 && old_ceiling != NULL)
2258		suword32(old_ceiling, save_ceiling);
2259	return (error);
2260}
2261
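/*
 * Dispatch a umutex lock request to the protocol-specific
 * implementation selected by the PI/PP bits of the mutex flags.
 */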
2262static int
2263_do_lock_umutex(struct thread *td, struct umutex *m, int flags, int timo,
2264	int mode)
2265{
2266	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2267	case 0:
2268		return (_do_lock_normal(td, m, flags, timo, mode));
2269	case UMUTEX_PRIO_INHERIT:
2270		return (_do_lock_pi(td, m, flags, timo, mode));
2271	case UMUTEX_PRIO_PROTECT:
2272		return (_do_lock_pp(td, m, flags, timo, mode));
2273	}
2274	return (EINVAL);
2275}
2276
2277/*
2278 * Lock a userland POSIX mutex.
2279 */
2280static int
2281do_lock_umutex(struct thread *td, struct umutex *m,
2282	struct timespec *timeout, int mode)
2283{
2284	struct timespec ts, ts2, ts3;
2285	struct timeval tv;
2286	uint32_t flags;
2287	int error;
2288
2289	flags = fuword32(&m->m_flags);
2290	if (flags == -1)
2291		return (EFAULT);
2292
2293	if (timeout == NULL) {
2294		error = _do_lock_umutex(td, m, flags, 0, mode);
2295		/* Mutex locking is restarted if it is interrupted. */
2296		if (error == EINTR && mode != _UMUTEX_WAIT)
2297			error = ERESTART;
2298	} else {
2299		getnanouptime(&ts);
2300		timespecadd(&ts, timeout);
2301		TIMESPEC_TO_TIMEVAL(&tv, timeout);
2302		for (;;) {
2303			error = _do_lock_umutex(td, m, flags, tvtohz(&tv), mode);
2304			if (error != ETIMEDOUT)
2305				break;
2306			getnanouptime(&ts2);
2307			if (timespeccmp(&ts2, &ts, >=)) {
2308				error = ETIMEDOUT;
2309				break;
2310			}
2311			ts3 = ts;
2312			timespecsub(&ts3, &ts2);
2313			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
2314		}
2315		/* Timed-locking is not restarted. */
2316		if (error == ERESTART)
2317			error = EINTR;
2318	}
2319	return (error);
2320}
2321
2322/*
2323 * Unlock a userland POSIX mutex.
2324 */
2325static int
2326do_unlock_umutex(struct thread *td, struct umutex *m)
2327{
2328	uint32_t flags;
2329
2330	flags = fuword32(&m->m_flags);
2331	if (flags == -1)
2332		return (EFAULT);
2333
2334	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2335	case 0:
2336		return (do_unlock_normal(td, m, flags));
2337	case UMUTEX_PRIO_INHERIT:
2338		return (do_unlock_pi(td, m, flags));
2339	case UMUTEX_PRIO_PROTECT:
2340		return (do_unlock_pp(td, m, flags));
2341	}
2342
2343	return (EINVAL);
2344}
2345
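/*
 * Wait on a userland condition variable.  The waiter is queued
 * before the user mutex is released, so a concurrent signal
 * cannot be lost.
 */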
2346static int
2347do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m,
2348	struct timespec *timeout, u_long wflags)
2349{
2350	struct umtx_q *uq;
2351	struct timeval tv;
2352	struct timespec cts, ets, tts;
2353	uint32_t flags;
2354	int error;
2355
2356	uq = td->td_umtxq;
2357	flags = fuword32(&cv->c_flags);
2358	error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key);
2359	if (error != 0)
2360		return (error);
2361	umtxq_lock(&uq->uq_key);
2362	umtxq_busy(&uq->uq_key);
2363	umtxq_insert(uq);
2364	umtxq_unlock(&uq->uq_key);
2365
2366	/*
2367	 * The crucial point is that c_has_waiters must be set to 1
2368	 * before the user mutex is released, so a signaller cannot miss us.
2369	 */
2370	suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 1);
2371
2372	umtxq_lock(&uq->uq_key);
2373	umtxq_unbusy(&uq->uq_key);
2374	umtxq_unlock(&uq->uq_key);
2375
2376	error = do_unlock_umutex(td, m);
2377
2378	umtxq_lock(&uq->uq_key);
2379	if (error == 0) {
2380		if ((wflags & UMTX_CHECK_UNPARKING) &&
2381		    (td->td_pflags & TDP_WAKEUP)) {
2382			td->td_pflags &= ~TDP_WAKEUP;
2383			error = EINTR;
2384		} else if (timeout == NULL) {
2385			error = umtxq_sleep(uq, "ucond", 0);
2386		} else {
2387			getnanouptime(&ets);
2388			timespecadd(&ets, timeout);
2389			TIMESPEC_TO_TIMEVAL(&tv, timeout);
2390			for (;;) {
2391				error = umtxq_sleep(uq, "ucond", tvtohz(&tv));
2392				if (error != ETIMEDOUT)
2393					break;
2394				getnanouptime(&cts);
2395				if (timespeccmp(&cts, &ets, >=)) {
2396					error = ETIMEDOUT;
2397					break;
2398				}
2399				tts = ets;
2400				timespecsub(&tts, &cts);
2401				TIMESPEC_TO_TIMEVAL(&tv, &tts);
2402			}
2403		}
2404	}
2405
2406	if ((uq->uq_flags & UQF_UMTXQ) == 0)
2407		error = 0;
2408	else {
2409		umtxq_remove(uq);
2410		if (error == ERESTART)
2411			error = EINTR;
2412	}
2413
2414	umtxq_unlock(&uq->uq_key);
2415	umtx_key_release(&uq->uq_key);
2416	return (error);
2417}
2418
2419/*
2420 * Signal a userland condition variable.
2421 */
2422static int
2423do_cv_signal(struct thread *td, struct ucond *cv)
2424{
2425	struct umtx_key key;
2426	int error, cnt, nwake;
2427	uint32_t flags;
2428
2429	flags = fuword32(&cv->c_flags);
2430	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2431		return (error);
2432	umtxq_lock(&key);
2433	umtxq_busy(&key);
2434	cnt = umtxq_count(&key);
2435	nwake = umtxq_signal(&key, 1);
2436	if (cnt <= nwake) {
2437		umtxq_unlock(&key);
2438		error = suword32(
2439		    __DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
2440		umtxq_lock(&key);
2441	}
2442	umtxq_unbusy(&key);
2443	umtxq_unlock(&key);
2444	umtx_key_release(&key);
2445	return (error);
2446}
2447
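/*
 * Broadcast a userland condition variable: wake all waiters and
 * clear the c_has_waiters flag.
 */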
2448static int
2449do_cv_broadcast(struct thread *td, struct ucond *cv)
2450{
2451	struct umtx_key key;
2452	int error;
2453	uint32_t flags;
2454
2455	flags = fuword32(&cv->c_flags);
2456	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2457		return (error);
2458
2459	umtxq_lock(&key);
2460	umtxq_busy(&key);
2461	umtxq_signal(&key, INT_MAX);
2462	umtxq_unlock(&key);
2463
2464	error = suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
2465
2466	umtxq_lock(&key);
2467	umtxq_unbusy(&key);
2468	umtxq_unlock(&key);
2469
2470	umtx_key_release(&key);
2471	return (error);
2472}
2473
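/*
 * Acquire a read lock on a userland rwlock.  The lock state lives
 * in userland memory and is updated with compare-and-swap; the
 * kernel sleep queue is used only when the try-lock fails.
 */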
2474static int
2475do_rw_rdlock(struct thread *td, struct urwlock *rwlock, long fflag, int timo)
2476{
2477	struct umtx_q *uq;
2478	uint32_t flags, wrflags;
2479	int32_t state, oldstate;
2480	int32_t blocked_readers;
2481	int error;
2482
2483	uq = td->td_umtxq;
2484	flags = fuword32(&rwlock->rw_flags);
2485	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2486	if (error != 0)
2487		return (error);
2488
2489	wrflags = URWLOCK_WRITE_OWNER;
2490	if (!(fflag & URWLOCK_PREFER_READER) && !(flags & URWLOCK_PREFER_READER))
2491		wrflags |= URWLOCK_WRITE_WAITERS;
2492
2493	for (;;) {
2494		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2495		/* try to lock it */
2496		while (!(state & wrflags)) {
2497			if (__predict_false(URWLOCK_READER_COUNT(state) == URWLOCK_MAX_READERS)) {
2498				umtx_key_release(&uq->uq_key);
2499				return (EAGAIN);
2500			}
2501			oldstate = casuword32(&rwlock->rw_state, state, state + 1);
2502			if (oldstate == state) {
2503				umtx_key_release(&uq->uq_key);
2504				return (0);
2505			}
2506			state = oldstate;
2507		}
2508
2509		if (error)
2510			break;
2511
2512		/* grab monitor lock */
2513		umtxq_lock(&uq->uq_key);
2514		umtxq_busy(&uq->uq_key);
2515		umtxq_unlock(&uq->uq_key);
2516
2517		/*
2518		 * re-read the state, in case it changed between the try-lock above
2519		 * and the check below
2520		 */
2521		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2522
2523		/* set read contention bit */
2524		while ((state & wrflags) && !(state & URWLOCK_READ_WAITERS)) {
2525			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_READ_WAITERS);
2526			if (oldstate == state)
2527				goto sleep;
2528			state = oldstate;
2529		}
2530
2531		/* the state changed while we were setting the flag; restart */
2532		if (!(state & wrflags)) {
2533			umtxq_lock(&uq->uq_key);
2534			umtxq_unbusy(&uq->uq_key);
2535			umtxq_unlock(&uq->uq_key);
2536			continue;
2537		}
2538
2539sleep:
2540		/* the contention bit is set; increase the read-waiter count before sleeping */
2541		blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2542		suword32(&rwlock->rw_blocked_readers, blocked_readers+1);
2543
2544		while (state & wrflags) {
2545			umtxq_lock(&uq->uq_key);
2546			umtxq_insert(uq);
2547			umtxq_unbusy(&uq->uq_key);
2548
2549			error = umtxq_sleep(uq, "urdlck", timo);
2550
2551			umtxq_busy(&uq->uq_key);
2552			umtxq_remove(uq);
2553			umtxq_unlock(&uq->uq_key);
2554			if (error)
2555				break;
2556			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2557		}
2558
2559		/* decrease the read-waiter count and possibly clear the read-contention bit */
2560		blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2561		suword32(&rwlock->rw_blocked_readers, blocked_readers-1);
2562		if (blocked_readers == 1) {
2563			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2564			for (;;) {
2565				oldstate = casuword32(&rwlock->rw_state, state,
2566					 state & ~URWLOCK_READ_WAITERS);
2567				if (oldstate == state)
2568					break;
2569				state = oldstate;
2570			}
2571		}
2572
2573		umtxq_lock(&uq->uq_key);
2574		umtxq_unbusy(&uq->uq_key);
2575		umtxq_unlock(&uq->uq_key);
2576	}
2577	umtx_key_release(&uq->uq_key);
2578	return (error);
2579}
2580
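/*
 * Timed variant of do_rw_rdlock(): retry until the absolute
 * deadline derived from the relative timeout has passed.
 */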
2581static int
2582do_rw_rdlock2(struct thread *td, void *obj, long val, struct timespec *timeout)
2583{
2584	struct timespec ts, ts2, ts3;
2585	struct timeval tv;
2586	int error;
2587
2588	getnanouptime(&ts);
2589	timespecadd(&ts, timeout);
2590	TIMESPEC_TO_TIMEVAL(&tv, timeout);
2591	for (;;) {
2592		error = do_rw_rdlock(td, obj, val, tvtohz(&tv));
2593		if (error != ETIMEDOUT)
2594			break;
2595		getnanouptime(&ts2);
2596		if (timespeccmp(&ts2, &ts, >=)) {
2597			error = ETIMEDOUT;
2598			break;
2599		}
2600		ts3 = ts;
2601		timespecsub(&ts3, &ts2);
2602		TIMESPEC_TO_TIMEVAL(&tv, &ts3);
2603	}
2604	if (error == ERESTART)
2605		error = EINTR;
2606	return (error);
2607}
2608
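/*
 * Acquire a write lock on a userland rwlock, setting
 * URWLOCK_WRITE_WAITERS and sleeping while readers or another
 * writer hold the lock.
 */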
2609static int
2610do_rw_wrlock(struct thread *td, struct urwlock *rwlock, int timo)
2611{
2612	struct umtx_q *uq;
2613	uint32_t flags;
2614	int32_t state, oldstate;
2615	int32_t blocked_writers;
2616	int32_t blocked_readers;
2617	int error;
2618
2619	uq = td->td_umtxq;
2620	flags = fuword32(&rwlock->rw_flags);
2621	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2622	if (error != 0)
2623		return (error);
2624
2625	blocked_readers = 0;
2626	for (;;) {
2627		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2628		while (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2629			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_OWNER);
2630			if (oldstate == state) {
2631				umtx_key_release(&uq->uq_key);
2632				return (0);
2633			}
2634			state = oldstate;
2635		}
2636
2637		if (error) {
2638			if (!(state & (URWLOCK_WRITE_OWNER|URWLOCK_WRITE_WAITERS)) &&
2639			    blocked_readers != 0) {
2640				umtxq_lock(&uq->uq_key);
2641				umtxq_busy(&uq->uq_key);
2642				umtxq_signal_queue(&uq->uq_key, INT_MAX, UMTX_SHARED_QUEUE);
2643				umtxq_unbusy(&uq->uq_key);
2644				umtxq_unlock(&uq->uq_key);
2645			}
2646
2647			break;
2648		}
2649
2650		/* grab monitor lock */
2651		umtxq_lock(&uq->uq_key);
2652		umtxq_busy(&uq->uq_key);
2653		umtxq_unlock(&uq->uq_key);
2654
2655		/*
2656		 * re-read the state, in case it changed between the try-lock above
2657		 * and the check below
2658		 */
2659		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2660
2661		while (((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) &&
2662		       (state & URWLOCK_WRITE_WAITERS) == 0) {
2663			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_WAITERS);
2664			if (oldstate == state)
2665				goto sleep;
2666			state = oldstate;
2667		}
2668
2669		if (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2670			umtxq_lock(&uq->uq_key);
2671			umtxq_unbusy(&uq->uq_key);
2672			umtxq_unlock(&uq->uq_key);
2673			continue;
2674		}
2675sleep:
2676		blocked_writers = fuword32(&rwlock->rw_blocked_writers);
2677		suword32(&rwlock->rw_blocked_writers, blocked_writers+1);
2678
2679		while ((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) {
2680			umtxq_lock(&uq->uq_key);
2681			umtxq_insert_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2682			umtxq_unbusy(&uq->uq_key);
2683
2684			error = umtxq_sleep(uq, "uwrlck", timo);
2685
2686			umtxq_busy(&uq->uq_key);
2687			umtxq_remove_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2688			umtxq_unlock(&uq->uq_key);
2689			if (error)
2690				break;
2691			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2692		}
2693
2694		blocked_writers = fuword32(&rwlock->rw_blocked_writers);
2695		suword32(&rwlock->rw_blocked_writers, blocked_writers-1);
2696		if (blocked_writers == 1) {
2697			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2698			for (;;) {
2699				oldstate = casuword32(&rwlock->rw_state, state,
2700					 state & ~URWLOCK_WRITE_WAITERS);
2701				if (oldstate == state)
2702					break;
2703				state = oldstate;
2704			}
2705			blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2706		} else
2707			blocked_readers = 0;
2708
2709		umtxq_lock(&uq->uq_key);
2710		umtxq_unbusy(&uq->uq_key);
2711		umtxq_unlock(&uq->uq_key);
2712	}
2713
2714	umtx_key_release(&uq->uq_key);
2715	return (error);
2716}
2717
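/*
 * Timed variant of do_rw_wrlock().
 */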
2718static int
2719do_rw_wrlock2(struct thread *td, void *obj, struct timespec *timeout)
2720{
2721	struct timespec ts, ts2, ts3;
2722	struct timeval tv;
2723	int error;
2724
2725	getnanouptime(&ts);
2726	timespecadd(&ts, timeout);
2727	TIMESPEC_TO_TIMEVAL(&tv, timeout);
2728	for (;;) {
2729		error = do_rw_wrlock(td, obj, tvtohz(&tv));
2730		if (error != ETIMEDOUT)
2731			break;
2732		getnanouptime(&ts2);
2733		if (timespeccmp(&ts2, &ts, >=)) {
2734			error = ETIMEDOUT;
2735			break;
2736		}
2737		ts3 = ts;
2738		timespecsub(&ts3, &ts2);
2739		TIMESPEC_TO_TIMEVAL(&tv, &ts3);
2740	}
2741	if (error == ERESTART)
2742		error = EINTR;
2743	return (error);
2744}
2745
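/*
 * Release a userland rwlock held for reading or writing and wake
 * the appropriate queue, honoring URWLOCK_PREFER_READER.
 */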
2746static int
2747do_rw_unlock(struct thread *td, struct urwlock *rwlock)
2748{
2749	struct umtx_q *uq;
2750	uint32_t flags;
2751	int32_t state, oldstate;
2752	int error, q, count;
2753
2754	uq = td->td_umtxq;
2755	flags = fuword32(&rwlock->rw_flags);
2756	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2757	if (error != 0)
2758		return (error);
2759
2760	state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2761	if (state & URWLOCK_WRITE_OWNER) {
2762		for (;;) {
2763			oldstate = casuword32(&rwlock->rw_state, state,
2764				state & ~URWLOCK_WRITE_OWNER);
2765			if (oldstate != state) {
2766				state = oldstate;
2767				if (!(oldstate & URWLOCK_WRITE_OWNER)) {
2768					error = EPERM;
2769					goto out;
2770				}
2771			} else
2772				break;
2773		}
2774	} else if (URWLOCK_READER_COUNT(state) != 0) {
2775		for (;;) {
2776			oldstate = casuword32(&rwlock->rw_state, state,
2777				state - 1);
2778			if (oldstate != state) {
2779				state = oldstate;
2780				if (URWLOCK_READER_COUNT(oldstate) == 0) {
2781					error = EPERM;
2782					goto out;
2783				}
2784			}
2785			else
2786				break;
2787		}
2788	} else {
2789		error = EPERM;
2790		goto out;
2791	}
2792
2793	count = 0;
2794
2795	if (!(flags & URWLOCK_PREFER_READER)) {
2796		if (state & URWLOCK_WRITE_WAITERS) {
2797			count = 1;
2798			q = UMTX_EXCLUSIVE_QUEUE;
2799		} else if (state & URWLOCK_READ_WAITERS) {
2800			count = INT_MAX;
2801			q = UMTX_SHARED_QUEUE;
2802		}
2803	} else {
2804		if (state & URWLOCK_READ_WAITERS) {
2805			count = INT_MAX;
2806			q = UMTX_SHARED_QUEUE;
2807		} else if (state & URWLOCK_WRITE_WAITERS) {
2808			count = 1;
2809			q = UMTX_EXCLUSIVE_QUEUE;
2810		}
2811	}
2812
2813	if (count) {
2814		umtxq_lock(&uq->uq_key);
2815		umtxq_busy(&uq->uq_key);
2816		umtxq_signal_queue(&uq->uq_key, count, q);
2817		umtxq_unbusy(&uq->uq_key);
2818		umtxq_unlock(&uq->uq_key);
2819	}
2820out:
2821	umtx_key_release(&uq->uq_key);
2822	return (error);
2823}
2824
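/*
 * Wait on a userland semaphore: set _has_waiters, re-check the
 * count to avoid a lost wakeup, then sleep until woken or the
 * timeout expires.
 */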
2825static int
2826do_sem_wait(struct thread *td, struct _usem *sem, struct timespec *timeout)
2827{
2828	struct umtx_q *uq;
2829	struct timeval tv;
2830	struct timespec cts, ets, tts;
2831	uint32_t flags, count;
2832	int error;
2833
2834	uq = td->td_umtxq;
2835	flags = fuword32(&sem->_flags);
2836	error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key);
2837	if (error != 0)
2838		return (error);
2839	umtxq_lock(&uq->uq_key);
2840	umtxq_busy(&uq->uq_key);
2841	umtxq_insert(uq);
2842	umtxq_unlock(&uq->uq_key);
2843
2844	if (fuword32(__DEVOLATILE(uint32_t *, &sem->_has_waiters)) == 0)
2845		casuword32(__DEVOLATILE(uint32_t *, &sem->_has_waiters), 0, 1);
2846
2847	count = fuword32(__DEVOLATILE(uint32_t *, &sem->_count));
2848	if (count != 0) {
2849		umtxq_lock(&uq->uq_key);
2850		umtxq_unbusy(&uq->uq_key);
2851		umtxq_remove(uq);
2852		umtxq_unlock(&uq->uq_key);
2853		umtx_key_release(&uq->uq_key);
2854		return (0);
2855	}
2856
2857	umtxq_lock(&uq->uq_key);
2858	umtxq_unbusy(&uq->uq_key);
2859	umtxq_unlock(&uq->uq_key);
2860
2861	umtxq_lock(&uq->uq_key);
2862	if (timeout == NULL) {
2863		error = umtxq_sleep(uq, "usem", 0);
2864	} else {
2865		getnanouptime(&ets);
2866		timespecadd(&ets, timeout);
2867		TIMESPEC_TO_TIMEVAL(&tv, timeout);
2868		for (;;) {
2869			error = umtxq_sleep(uq, "usem", tvtohz(&tv));
2870			if (error != ETIMEDOUT)
2871				break;
2872			getnanouptime(&cts);
2873			if (timespeccmp(&cts, &ets, >=)) {
2874				error = ETIMEDOUT;
2875				break;
2876			}
2877			tts = ets;
2878			timespecsub(&tts, &cts);
2879			TIMESPEC_TO_TIMEVAL(&tv, &tts);
2880		}
2881	}
2882
2883	if ((uq->uq_flags & UQF_UMTXQ) == 0)
2884		error = 0;
2885	else {
2886		umtxq_remove(uq);
2887		if (error == ERESTART)
2888			error = EINTR;
2889	}
2890	umtxq_unlock(&uq->uq_key);
2891	umtx_key_release(&uq->uq_key);
2892	return (error);
2893}
2894
2895/*
2896 * Wake up a waiter on a userland semaphore.
2897 */
2898static int
2899do_sem_wake(struct thread *td, struct _usem *sem)
2900{
2901	struct umtx_key key;
2902	int error, cnt, nwake;
2903	uint32_t flags;
2904
2905	flags = fuword32(&sem->_flags);
2906	if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0)
2907		return (error);
2908	umtxq_lock(&key);
2909	umtxq_busy(&key);
2910	cnt = umtxq_count(&key);
2911	nwake = umtxq_signal(&key, 1);
2912	if (cnt <= nwake) {
2913		umtxq_unlock(&key);
2914		error = suword32(
2915		    __DEVOLATILE(uint32_t *, &sem->_has_waiters), 0);
2916		umtxq_lock(&key);
2917	}
2918	umtxq_unbusy(&key);
2919	umtxq_unlock(&key);
2920	umtx_key_release(&key);
2921	return (error);
2922}
2923
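/*
 * System call entry points for the old per-word umtx interface.
 */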
2924int
2925_umtx_lock(struct thread *td, struct _umtx_lock_args *uap)
2926    /* struct umtx *umtx */
2927{
2928	return (_do_lock_umtx(td, uap->umtx, td->td_tid, 0));
2929}
2930
2931int
2932_umtx_unlock(struct thread *td, struct _umtx_unlock_args *uap)
2933    /* struct umtx *umtx */
2934{
2935	return (do_unlock_umtx(td, uap->umtx, td->td_tid));
2936}
2937
2938static int
2939__umtx_op_lock_umtx(struct thread *td, struct _umtx_op_args *uap)
2940{
2941	struct timespec *ts, timeout;
2942	int error;
2943
2944	/* Allow a null timespec (wait forever). */
2945	if (uap->uaddr2 == NULL)
2946		ts = NULL;
2947	else {
2948		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2949		if (error != 0)
2950			return (error);
2951		if (timeout.tv_nsec >= 1000000000 ||
2952		    timeout.tv_nsec < 0) {
2953			return (EINVAL);
2954		}
2955		ts = &timeout;
2956	}
2957	return (do_lock_umtx(td, uap->obj, uap->val, ts));
2958}
2959
2960static int
2961__umtx_op_unlock_umtx(struct thread *td, struct _umtx_op_args *uap)
2962{
2963	return (do_unlock_umtx(td, uap->obj, uap->val));
2964}
2965
2966static int
2967__umtx_op_wait(struct thread *td, struct _umtx_op_args *uap)
2968{
2969	struct timespec *ts, timeout;
2970	int error;
2971
2972	if (uap->uaddr2 == NULL)
2973		ts = NULL;
2974	else {
2975		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2976		if (error != 0)
2977			return (error);
2978		if (timeout.tv_nsec >= 1000000000 ||
2979		    timeout.tv_nsec < 0)
2980			return (EINVAL);
2981		ts = &timeout;
2982	}
2983	return (do_wait(td, uap->obj, uap->val, ts, 0, 0));
2984}
2985
2986static int
2987__umtx_op_wait_uint(struct thread *td, struct _umtx_op_args *uap)
2988{
2989	struct timespec *ts, timeout;
2990	int error;
2991
2992	if (uap->uaddr2 == NULL)
2993		ts = NULL;
2994	else {
2995		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2996		if (error != 0)
2997			return (error);
2998		if (timeout.tv_nsec >= 1000000000 ||
2999		    timeout.tv_nsec < 0)
3000			return (EINVAL);
3001		ts = &timeout;
3002	}
3003	return (do_wait(td, uap->obj, uap->val, ts, 1, 0));
3004}
3005
3006static int
3007__umtx_op_wait_uint_private(struct thread *td, struct _umtx_op_args *uap)
3008{
3009	struct timespec *ts, timeout;
3010	int error;
3011
3012	if (uap->uaddr2 == NULL)
3013		ts = NULL;
3014	else {
3015		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
3016		if (error != 0)
3017			return (error);
3018		if (timeout.tv_nsec >= 1000000000 ||
3019		    timeout.tv_nsec < 0)
3020			return (EINVAL);
3021		ts = &timeout;
3022	}
3023	return (do_wait(td, uap->obj, uap->val, ts, 1, 1));
3024}
3025
3026static int
3027__umtx_op_wake(struct thread *td, struct _umtx_op_args *uap)
3028{
3029	return (kern_umtx_wake(td, uap->obj, uap->val, 0));
3030}
3031
3032static int
3033__umtx_op_wake_private(struct thread *td, struct _umtx_op_args *uap)
3034{
3035	return (kern_umtx_wake(td, uap->obj, uap->val, 1));
3036}
3037
3038static int
3039__umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap)
3040{
3041	struct timespec *ts, timeout;
3042	int error;
3043
3044	/* Allow a null timespec (wait forever). */
3045	if (uap->uaddr2 == NULL)
3046		ts = NULL;
3047	else {
3048		error = copyin(uap->uaddr2, &timeout,
3049		    sizeof(timeout));
3050		if (error != 0)
3051			return (error);
3052		if (timeout.tv_nsec >= 1000000000 ||
3053		    timeout.tv_nsec < 0) {
3054			return (EINVAL);
3055		}
3056		ts = &timeout;
3057	}
3058	return (do_lock_umutex(td, uap->obj, ts, 0));
3059}
3060
3061static int
3062__umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap)
3063{
3064	return (do_lock_umutex(td, uap->obj, NULL, _UMUTEX_TRY));
3065}
3066
3067static int
3068__umtx_op_wait_umutex(struct thread *td, struct _umtx_op_args *uap)
3069{
3070	struct timespec *ts, timeout;
3071	int error;
3072
3073	/* Allow a null timespec (wait forever). */
3074	if (uap->uaddr2 == NULL)
3075		ts = NULL;
3076	else {
3077		error = copyin(uap->uaddr2, &timeout,
3078		    sizeof(timeout));
3079		if (error != 0)
3080			return (error);
3081		if (timeout.tv_nsec >= 1000000000 ||
3082		    timeout.tv_nsec < 0) {
3083			return (EINVAL);
3084		}
3085		ts = &timeout;
3086	}
3087	return (do_lock_umutex(td, uap->obj, ts, _UMUTEX_WAIT));
3088}
3089
3090static int
3091__umtx_op_wake_umutex(struct thread *td, struct _umtx_op_args *uap)
3092{
3093	return (do_wake_umutex(td, uap->obj));
3094}
3095
3096static int
3097__umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap)
3098{
3099	return (do_unlock_umutex(td, uap->obj));
3100}
3101
3102static int
3103__umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap)
3104{
3105	return (do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1));
3106}
3107
3108static int
3109__umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap)
3110{
3111	struct timespec *ts, timeout;
3112	int error;
3113
3114	/* Allow a null timespec (wait forever). */
3115	if (uap->uaddr2 == NULL)
3116		ts = NULL;
3117	else {
3118		error = copyin(uap->uaddr2, &timeout,
3119		    sizeof(timeout));
3120		if (error != 0)
3121			return (error);
3122		if (timeout.tv_nsec >= 1000000000 ||
3123		    timeout.tv_nsec < 0) {
3124			return (EINVAL);
3125		}
3126		ts = &timeout;
3127	}
3128	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3129}
3130
3131static int
3132__umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap)
3133{
3134	return (do_cv_signal(td, uap->obj));
3135}
3136
3137static int
3138__umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap)
3139{
3140	return (do_cv_broadcast(td, uap->obj));
3141}
3142
3143static int
3144__umtx_op_rw_rdlock(struct thread *td, struct _umtx_op_args *uap)
3145{
3146	struct timespec timeout;
3147	int error;
3148
3149	/* Allow a null timespec (wait forever). */
3150	if (uap->uaddr2 == NULL) {
3151		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3152	} else {
3153		error = copyin(uap->uaddr2, &timeout,
3154		    sizeof(timeout));
3155		if (error != 0)
3156			return (error);
3157		if (timeout.tv_nsec >= 1000000000 ||
3158		    timeout.tv_nsec < 0) {
3159			return (EINVAL);
3160		}
3161		error = do_rw_rdlock2(td, uap->obj, uap->val, &timeout);
3162	}
3163	return (error);
3164}
3165
3166static int
3167__umtx_op_rw_wrlock(struct thread *td, struct _umtx_op_args *uap)
3168{
3169	struct timespec timeout;
3170	int error;
3171
3172	/* Allow a null timespec (wait forever). */
3173	if (uap->uaddr2 == NULL) {
3174		error = do_rw_wrlock(td, uap->obj, 0);
3175	} else {
3176		error = copyin(uap->uaddr2, &timeout,
3177		    sizeof(timeout));
3178		if (error != 0)
3179			return (error);
3180		if (timeout.tv_nsec >= 1000000000 ||
3181		    timeout.tv_nsec < 0) {
3182			return (EINVAL);
3183		}
3184
3185		error = do_rw_wrlock2(td, uap->obj, &timeout);
3186	}
3187	return (error);
3188}
3189
3190static int
3191__umtx_op_rw_unlock(struct thread *td, struct _umtx_op_args *uap)
3192{
3193	return (do_rw_unlock(td, uap->obj));
3194}
3195
3196static int
3197__umtx_op_sem_wait(struct thread *td, struct _umtx_op_args *uap)
3198{
3199	struct timespec *ts, timeout;
3200	int error;
3201
3202	/* Allow a null timespec (wait forever). */
3203	if (uap->uaddr2 == NULL)
3204		ts = NULL;
3205	else {
3206		error = copyin(uap->uaddr2, &timeout,
3207		    sizeof(timeout));
3208		if (error != 0)
3209			return (error);
3210		if (timeout.tv_nsec >= 1000000000 ||
3211		    timeout.tv_nsec < 0) {
3212			return (EINVAL);
3213		}
3214		ts = &timeout;
3215	}
3216	return (do_sem_wait(td, uap->obj, ts));
3217}
3218
3219static int
3220__umtx_op_sem_wake(struct thread *td, struct _umtx_op_args *uap)
3221{
3222	return (do_sem_wake(td, uap->obj));
3223}
3224
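/*
 * Operation vector for _umtx_op(): each UMTX_OP_* code indexes
 * its handler, so the order here must match sys/umtx.h.
 */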
3225typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap);
3226
3227static _umtx_op_func op_table[] = {
3228	__umtx_op_lock_umtx,		/* UMTX_OP_LOCK */
3229	__umtx_op_unlock_umtx,		/* UMTX_OP_UNLOCK */
3230	__umtx_op_wait,			/* UMTX_OP_WAIT */
3231	__umtx_op_wake,			/* UMTX_OP_WAKE */
3232	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
3233	__umtx_op_lock_umutex,		/* UMTX_OP_MUTEX_LOCK */
3234	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
3235	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
3236	__umtx_op_cv_wait,		/* UMTX_OP_CV_WAIT */
3237	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
3238	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
3239	__umtx_op_wait_uint,		/* UMTX_OP_WAIT_UINT */
3240	__umtx_op_rw_rdlock,		/* UMTX_OP_RW_RDLOCK */
3241	__umtx_op_rw_wrlock,		/* UMTX_OP_RW_WRLOCK */
3242	__umtx_op_rw_unlock,		/* UMTX_OP_RW_UNLOCK */
3243	__umtx_op_wait_uint_private,	/* UMTX_OP_WAIT_UINT_PRIVATE */
3244	__umtx_op_wake_private,		/* UMTX_OP_WAKE_PRIVATE */
3245	__umtx_op_wait_umutex,		/* UMTX_OP_UMUTEX_WAIT */
3246	__umtx_op_wake_umutex,		/* UMTX_OP_UMUTEX_WAKE */
3247	__umtx_op_sem_wait,		/* UMTX_OP_SEM_WAIT */
3248	__umtx_op_sem_wake		/* UMTX_OP_SEM_WAKE */
3249};
3250
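/*
 * The syscall entry point, reached from userland via _umtx_op(2).
 * As a rough sketch (with a hypothetical address "addr"), waking a
 * single thread blocked in UMTX_OP_WAIT on that word would be:
 *
 *	_umtx_op(addr, UMTX_OP_WAKE, 1, NULL, NULL);
 *
 * where the val argument (here 1) bounds the number of waiters woken.
 */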
3251int
3252_umtx_op(struct thread *td, struct _umtx_op_args *uap)
3253{
3254	if ((unsigned)uap->op < UMTX_OP_MAX)
3255		return (*op_table[uap->op])(td, uap);
3256	return (EINVAL);
3257}
3258
3259#ifdef COMPAT_FREEBSD32
3260int
3261freebsd32_umtx_lock(struct thread *td, struct freebsd32_umtx_lock_args *uap)
3262    /* struct umtx *umtx */
3263{
3264	return (do_lock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid, NULL));
3265}
3266
3267int
3268freebsd32_umtx_unlock(struct thread *td, struct freebsd32_umtx_unlock_args *uap)
3269    /* struct umtx *umtx */
3270{
3271	return (do_unlock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid));
3272}
3273
3274struct timespec32 {
3275	uint32_t tv_sec;
3276	uint32_t tv_nsec;
3277};
3278
3279static inline int
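/*
 * Copy a 32-bit timespec in from a compat32 process and widen it
 * to the native struct timespec.
 */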
3280copyin_timeout32(void *addr, struct timespec *tsp)
3281{
3282	struct timespec32 ts32;
3283	int error;
3284
3285	error = copyin(addr, &ts32, sizeof(struct timespec32));
3286	if (error == 0) {
3287		tsp->tv_sec = ts32.tv_sec;
3288		tsp->tv_nsec = ts32.tv_nsec;
3289	}
3290	return (error);
3291}
3292
3293static int
3294__umtx_op_lock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
3295{
3296	struct timespec *ts, timeout;
3297	int error;
3298
3299	/* Allow a null timespec (wait forever). */
3300	if (uap->uaddr2 == NULL)
3301		ts = NULL;
3302	else {
3303		error = copyin_timeout32(uap->uaddr2, &timeout);
3304		if (error != 0)
3305			return (error);
3306		if (timeout.tv_nsec >= 1000000000 ||
3307		    timeout.tv_nsec < 0) {
3308			return (EINVAL);
3309		}
3310		ts = &timeout;
3311	}
3312	return (do_lock_umtx32(td, uap->obj, uap->val, ts));
3313}
3314
3315static int
3316__umtx_op_unlock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
3317{
3318	return (do_unlock_umtx32(td, uap->obj, (uint32_t)uap->val));
3319}
3320
3321static int
3322__umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3323{
3324	struct timespec *ts, timeout;
3325	int error;
3326
3327	if (uap->uaddr2 == NULL)
3328		ts = NULL;
3329	else {
3330		error = copyin_timeout32(uap->uaddr2, &timeout);
3331		if (error != 0)
3332			return (error);
3333		if (timeout.tv_nsec >= 1000000000 ||
3334		    timeout.tv_nsec < 0)
3335			return (EINVAL);
3336		ts = &timeout;
3337	}
3338	return (do_wait(td, uap->obj, uap->val, ts, 1, 0));
3339}
3340
3341static int
3342__umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3343{
3344	struct timespec *ts, timeout;
3345	int error;
3346
3347	/* Allow a null timespec (wait forever). */
3348	if (uap->uaddr2 == NULL)
3349		ts = NULL;
3350	else {
3351		error = copyin_timeout32(uap->uaddr2, &timeout);
3352		if (error != 0)
3353			return (error);
3354		if (timeout.tv_nsec >= 1000000000 ||
3355		    timeout.tv_nsec < 0)
3356			return (EINVAL);
3357		ts = &timeout;
3358	}
3359	return (do_lock_umutex(td, uap->obj, ts, 0));
3360}
3361
3362static int
3363__umtx_op_wait_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3364{
3365	struct timespec *ts, timeout;
3366	int error;
3367
3368	/* Allow a null timespec (wait forever). */
3369	if (uap->uaddr2 == NULL)
3370		ts = NULL;
3371	else {
3372		error = copyin_timeout32(uap->uaddr2, &timeout);
3373		if (error != 0)
3374			return (error);
3375		if (timeout.tv_nsec >= 1000000000 ||
3376		    timeout.tv_nsec < 0)
3377			return (EINVAL);
3378		ts = &timeout;
3379	}
3380	return (do_lock_umutex(td, uap->obj, ts, _UMUTEX_WAIT));
3381}
3382
3383static int
3384__umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3385{
3386	struct timespec *ts, timeout;
3387	int error;
3388
3389	/* Allow a null timespec (wait forever). */
3390	if (uap->uaddr2 == NULL)
3391		ts = NULL;
3392	else {
3393		error = copyin_timeout32(uap->uaddr2, &timeout);
3394		if (error != 0)
3395			return (error);
3396		if (timeout.tv_nsec >= 1000000000 ||
3397		    timeout.tv_nsec < 0)
3398			return (EINVAL);
3399		ts = &timeout;
3400	}
3401	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3402}
3403
3404static int
3405__umtx_op_rw_rdlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3406{
3407	struct timespec timeout;
3408	int error;
3409
3410	/* Allow a null timespec (wait forever). */
3411	if (uap->uaddr2 == NULL) {
3412		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3413	} else {
3414		error = copyin_timeout32(uap->uaddr2, &timeout);
3416		if (error != 0)
3417			return (error);
3418		if (timeout.tv_nsec >= 1000000000 ||
3419		    timeout.tv_nsec < 0) {
3420			return (EINVAL);
3421		}
3422		error = do_rw_rdlock2(td, uap->obj, uap->val, &timeout);
3423	}
3424	return (error);
3425}
3426
3427static int
3428__umtx_op_rw_wrlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3429{
3430	struct timespec timeout;
3431	int error;
3432
3433	/* Allow a null timespec (wait forever). */
3434	if (uap->uaddr2 == NULL) {
3435		error = do_rw_wrlock(td, uap->obj, 0);
3436	} else {
3437		error = copyin_timeout32(uap->uaddr2, &timeout);
3438		if (error != 0)
3439			return (error);
3440		if (timeout.tv_nsec >= 1000000000 ||
3441		    timeout.tv_nsec < 0) {
3442			return (EINVAL);
3443		}
3444
3445		error = do_rw_wrlock2(td, uap->obj, &timeout);
3446	}
3447	return (error);
3448}
3449
3450static int
3451__umtx_op_wait_uint_private_compat32(struct thread *td, struct _umtx_op_args *uap)
3452{
3453	struct timespec *ts, timeout;
3454	int error;
3455
3456	if (uap->uaddr2 == NULL)
3457		ts = NULL;
3458	else {
3459		error = copyin_timeout32(uap->uaddr2, &timeout);
3460		if (error != 0)
3461			return (error);
3462		if (timeout.tv_nsec >= 1000000000 ||
3463		    timeout.tv_nsec < 0)
3464			return (EINVAL);
3465		ts = &timeout;
3466	}
3467	return (do_wait(td, uap->obj, uap->val, ts, 1, 1));
3468}
3469
3470static int
3471__umtx_op_sem_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3472{
3473	struct timespec *ts, timeout;
3474	int error;
3475
3476	/* Allow a null timespec (wait forever). */
3477	if (uap->uaddr2 == NULL)
3478		ts = NULL;
3479	else {
3480		error = copyin_timeout32(uap->uaddr2, &timeout);
3481		if (error != 0)
3482			return (error);
3483		if (timeout.tv_nsec >= 1000000000 ||
3484		    timeout.tv_nsec < 0)
3485			return (EINVAL);
3486		ts = &timeout;
3487	}
3488	return (do_sem_wait(td, uap->obj, ts));
3489}
3490
3491static _umtx_op_func op_table_compat32[] = {
3492	__umtx_op_lock_umtx_compat32,	/* UMTX_OP_LOCK */
3493	__umtx_op_unlock_umtx_compat32,	/* UMTX_OP_UNLOCK */
3494	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT */
3495	__umtx_op_wake,			/* UMTX_OP_WAKE */
3496	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
3497	__umtx_op_lock_umutex_compat32,	/* UMTX_OP_MUTEX_LOCK */
3498	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
3499	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
3500	__umtx_op_cv_wait_compat32,	/* UMTX_OP_CV_WAIT */
3501	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
3502	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
3503	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT_UINT */
3504	__umtx_op_rw_rdlock_compat32,	/* UMTX_OP_RW_RDLOCK */
3505	__umtx_op_rw_wrlock_compat32,	/* UMTX_OP_RW_WRLOCK */
3506	__umtx_op_rw_unlock,		/* UMTX_OP_RW_UNLOCK */
3507	__umtx_op_wait_uint_private_compat32,	/* UMTX_OP_WAIT_UINT_PRIVATE */
3508	__umtx_op_wake_private,		/* UMTX_OP_WAKE_PRIVATE */
3509	__umtx_op_wait_umutex_compat32, /* UMTX_OP_UMUTEX_WAIT */
3510	__umtx_op_wake_umutex,		/* UMTX_OP_UMUTEX_WAKE */
3511	__umtx_op_sem_wait_compat32,	/* UMTX_OP_SEM_WAIT */
3512	__umtx_op_sem_wake		/* UMTX_OP_SEM_WAKE */
3513};
3514
3515int
3516freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap)
3517{
3518	if ((unsigned)uap->op < UMTX_OP_MAX)
3519		return (*op_table_compat32[uap->op])(td,
3520			(struct _umtx_op_args *)uap);
3521	return (EINVAL);
3522}
3523#endif
3524
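/*
 * Allocate the per-thread umtx queue when a thread object is
 * constructed; umtx_thread_fini() releases it.
 */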
3525void
3526umtx_thread_init(struct thread *td)
3527{
3528	td->td_umtxq = umtxq_alloc();
3529	td->td_umtxq->uq_thread = td;
3530}
3531
3532void
3533umtx_thread_fini(struct thread *td)
3534{
3535	umtxq_free(td->td_umtxq);
3536}
3537
3538/*
3539 * Called when a new thread is created, e.g. by fork().
3540 */
3541void
3542umtx_thread_alloc(struct thread *td)
3543{
3544	struct umtx_q *uq;
3545
3546	uq = td->td_umtxq;
3547	uq->uq_inherited_pri = PRI_MAX;
3548
3549	KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
3550	KASSERT(uq->uq_thread == td, ("uq_thread != td"));
3551	KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
3552	KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
3553}
3554
3555/*
3556 * exec() hook.
3557 */
3558static void
3559umtx_exec_hook(void *arg __unused, struct proc *p __unused,
3560	struct image_params *imgp __unused)
3561{
3562	umtx_thread_cleanup(curthread);
3563}
3564
3565/*
3566 * thread_exit() hook.
3567 */
3568void
3569umtx_thread_exit(struct thread *td)
3570{
3571	umtx_thread_cleanup(td);
3572}
3573
3574/*
3575 * Clean up the thread's umtx data.
3576 */
3577static void
3578umtx_thread_cleanup(struct thread *td)
3579{
3580	struct umtx_q *uq;
3581	struct umtx_pi *pi;
3582
3583	if ((uq = td->td_umtxq) == NULL)
3584		return;
3585
3586	mtx_lock_spin(&umtx_lock);
3587	uq->uq_inherited_pri = PRI_MAX;
3588	while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
3589		pi->pi_owner = NULL;
3590		TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
3591	}
3592	mtx_unlock_spin(&umtx_lock);
3593	thread_lock(td);
3594	sched_unlend_user_prio(td, PRI_MAX);
3595	thread_unlock(td);
3596}
3597