/*-
 * Copyright (c) 2004, David Xu <davidxu@freebsd.org>
 * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/kern_umtx.c 216641 2010-12-22 05:01:52Z davidxu $");

#include "opt_compat.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/syscallsubr.h>
#include <sys/eventhandler.h>
#include <sys/umtx.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>

#include <machine/cpu.h>

#ifdef COMPAT_FREEBSD32
#include <compat/freebsd32/freebsd32_proto.h>
#endif

enum {
	TYPE_SIMPLE_WAIT,
	TYPE_CV,
	TYPE_SEM,
	TYPE_SIMPLE_LOCK,
	TYPE_NORMAL_UMUTEX,
	TYPE_PI_UMUTEX,
	TYPE_PP_UMUTEX,
	TYPE_RWLOCK
};

#define _UMUTEX_TRY		1
#define _UMUTEX_WAIT		2

/* Key to represent a unique userland synchronization object */
struct umtx_key {
	int	hash;
	int	type;
	int	shared;
	union {
		struct {
			vm_object_t	object;
			uintptr_t	offset;
		} shared;
		struct {
			struct vmspace	*vs;
			uintptr_t	addr;
		} private;
		struct {
			void		*a;
			uintptr_t	b;
		} both;
	} info;
};

/* Priority inheritance mutex info. */
struct umtx_pi {
	/* Owner thread */
	struct thread		*pi_owner;

	/* Reference count */
	int			pi_refcount;

	/* List entry to link contested PI mutexes held by a thread */
	TAILQ_ENTRY(umtx_pi)	pi_link;

	/* List entry in hash */
	TAILQ_ENTRY(umtx_pi)	pi_hashlink;

	/* List of waiters */
	TAILQ_HEAD(,umtx_q)	pi_blocked;

	/* Identifies the userland lock object */
	struct umtx_key		pi_key;
};

/* A waiter on a userland synchronization object. */
struct umtx_q {
	/* Linked list for the hash. */
	TAILQ_ENTRY(umtx_q)	uq_link;

	/* Umtx key. */
	struct umtx_key		uq_key;

	/* Umtx flags. */
	int			uq_flags;
#define UQF_UMTXQ	0x0001

	/* The thread that is waiting. */
	struct thread		*uq_thread;

	/*
	 * The PI mutex this thread is blocked on.  Readers may hold
	 * either the chain lock or umtx_lock; writers must hold both.
	 */
	struct umtx_pi		*uq_pi_blocked;

	/* Entry on a PI mutex's blocked list */
	TAILQ_ENTRY(umtx_q)	uq_lockq;

	/* Contested PI mutexes owned by this thread */
	TAILQ_HEAD(,umtx_pi)	uq_pi_contested;

	/* Inherited priority from PP mutex */
	u_char			uq_inherited_pri;

	/* Spare queue ready to be reused */
	struct umtxq_queue	*uq_spare_queue;

	/* The queue we are on */
	struct umtxq_queue	*uq_cur_queue;
};

TAILQ_HEAD(umtxq_head, umtx_q);

/* Per-key wait-queue */
struct umtxq_queue {
	struct umtxq_head	head;
	struct umtx_key		key;
	LIST_ENTRY(umtxq_queue)	link;
	int			length;
};

LIST_HEAD(umtxq_list, umtxq_queue);

/* Userland lock object's wait-queue chain */
struct umtxq_chain {
	/* Lock for this chain. */
	struct mtx		uc_lock;

	/* List of sleep queues. */
	struct umtxq_list	uc_queue[2];
#define UMTX_SHARED_QUEUE	0
#define UMTX_EXCLUSIVE_QUEUE	1

	LIST_HEAD(, umtxq_queue) uc_spare_queue;

	/* Busy flag */
	char			uc_busy;

	/* Chain lock waiters */
	int			uc_waiters;

	/* All PI mutexes in this chain */
	TAILQ_HEAD(,umtx_pi)	uc_pi_list;
};

#define	UMTXQ_LOCKED_ASSERT(uc)		mtx_assert(&(uc)->uc_lock, MA_OWNED)
#define	UMTXQ_BUSY_ASSERT(uc)	KASSERT((uc)->uc_busy != 0, ("umtx chain is not busy"))

/*
 * Don't propagate time-sharing priority; there is a security concern:
 * a user can simply create a PI mutex, let thread A lock it, and let
 * another thread B block on it.  Because B is sleeping, its priority
 * is boosted, which in turn boosts A's priority via priority
 * propagation, and A's priority would never be lowered even if it
 * used 100% CPU.  This is unfair to other processes.
 */

#define UPRI(td)	(((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\
			  (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\
			 PRI_MAX_TIMESHARE : (td)->td_user_pri)
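
/*
 * Worked example of UPRI() (editor's illustration; exact priority
 * values come from <sys/priority.h>, where numerically lower means
 * stronger): any thread whose td_user_pri falls inside the
 * time-sharing band [PRI_MIN_TIMESHARE, PRI_MAX_TIMESHARE] is treated
 * as PRI_MAX_TIMESHARE, the weakest value, so it contributes no
 * boost; a real-time thread at PRI_MIN_REALTIME + 5 passes through
 * unchanged and can be lent to a lock owner.
 */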

#define	GOLDEN_RATIO_PRIME	2654404609U
#define	UMTX_CHAINS		128
#define	UMTX_SHIFTS		(__WORD_BIT - 7)

#define THREAD_SHARE		0
#define PROCESS_SHARE		1
#define AUTO_SHARE		2

#define	GET_SHARE(flags)	\
    (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)

#define BUSY_SPINS		200

static uma_zone_t		umtx_pi_zone;
static struct umtxq_chain	umtxq_chains[2][UMTX_CHAINS];
static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
static int			umtx_pi_allocated;

SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
    &umtx_pi_allocated, 0, "Allocated umtx_pi");

static void umtxq_sysinit(void *);
static void umtxq_hash(struct umtx_key *key);
static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
static void umtxq_lock(struct umtx_key *key);
static void umtxq_unlock(struct umtx_key *key);
static void umtxq_busy(struct umtx_key *key);
static void umtxq_unbusy(struct umtx_key *key);
static void umtxq_insert_queue(struct umtx_q *uq, int q);
static void umtxq_remove_queue(struct umtx_q *uq, int q);
static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo);
static int umtxq_count(struct umtx_key *key);
static int umtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2);
static int umtx_key_get(void *addr, int type, int share,
	struct umtx_key *key);
static void umtx_key_release(struct umtx_key *key);
static struct umtx_pi *umtx_pi_alloc(int);
static void umtx_pi_free(struct umtx_pi *pi);
static void umtx_pi_adjust_locked(struct thread *td, u_char oldpri);
static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags);
static void umtx_thread_cleanup(struct thread *td);
static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
	struct image_params *imgp __unused);
SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);

#define umtxq_signal(key, nwake)	umtxq_signal_queue((key), (nwake), UMTX_SHARED_QUEUE)
#define umtxq_insert(uq)	umtxq_insert_queue((uq), UMTX_SHARED_QUEUE)
#define umtxq_remove(uq)	umtxq_remove_queue((uq), UMTX_SHARED_QUEUE)

static struct mtx umtx_lock;

static void
umtxq_sysinit(void *arg __unused)
{
	int i, j;

	umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	for (i = 0; i < 2; ++i) {
		for (j = 0; j < UMTX_CHAINS; ++j) {
			mtx_init(&umtxq_chains[i][j].uc_lock, "umtxql", NULL,
				 MTX_DEF | MTX_DUPOK);
			LIST_INIT(&umtxq_chains[i][j].uc_queue[0]);
			LIST_INIT(&umtxq_chains[i][j].uc_queue[1]);
			LIST_INIT(&umtxq_chains[i][j].uc_spare_queue);
			TAILQ_INIT(&umtxq_chains[i][j].uc_pi_list);
			umtxq_chains[i][j].uc_busy = 0;
			umtxq_chains[i][j].uc_waiters = 0;
		}
	}
	mtx_init(&umtx_lock, "umtx lock", NULL, MTX_SPIN);
	EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
	    EVENTHANDLER_PRI_ANY);
}

struct umtx_q *
umtxq_alloc(void)
{
	struct umtx_q *uq;

	uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
	uq->uq_spare_queue = malloc(sizeof(struct umtxq_queue), M_UMTX,
	    M_WAITOK | M_ZERO);
	TAILQ_INIT(&uq->uq_spare_queue->head);
	TAILQ_INIT(&uq->uq_pi_contested);
	uq->uq_inherited_pri = PRI_MAX;
	return (uq);
}

void
umtxq_free(struct umtx_q *uq)
{
	MPASS(uq->uq_spare_queue != NULL);
	free(uq->uq_spare_queue, M_UMTX);
	free(uq, M_UMTX);
}

static inline void
umtxq_hash(struct umtx_key *key)
{
	unsigned n = (uintptr_t)key->info.both.a + key->info.both.b;

	key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
}
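
/*
 * Illustrative sketch (not part of the build): for a process-private
 * key the hash mixes the vmspace pointer and the lock address, e.g.
 *
 *	n    = (uintptr_t)p_vmspace + (uintptr_t)addr;
 *	hash = ((n * 2654404609U) >> (__WORD_BIT - 7)) % 128;
 *
 * a multiplicative (golden-ratio) hash whose high-order bits pick one
 * of the UMTX_CHAINS (128) buckets.
 */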

static inline int
umtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2)
{
	return (k1->type == k2->type &&
		k1->info.both.a == k2->info.both.a &&
		k1->info.both.b == k2->info.both.b);
}

static inline struct umtxq_chain *
umtxq_getchain(struct umtx_key *key)
{
	if (key->type <= TYPE_SEM)
		return (&umtxq_chains[1][key->hash]);
	return (&umtxq_chains[0][key->hash]);
}
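
/*
 * Note (editor's): there are two chain tables.  Wait, condvar and
 * semaphore keys (type <= TYPE_SEM) hash into umtxq_chains[1], while
 * mutex and rwlock keys hash into umtxq_chains[0]; presumably this
 * keeps a heavily signalled condition variable from contending on the
 * same chain lock as the mutex associated with it.
 */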

/*
 * Lock a chain.
 */
static inline void
umtxq_lock(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_lock(&uc->uc_lock);
}

/*
 * Unlock a chain.
 */
static inline void
umtxq_unlock(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_unlock(&uc->uc_lock);
}

/*
 * Set the chain to the busy state when a following operation
 * may block (the chain's kernel mutex can not be held across it).
 */
static inline void
umtxq_busy(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_assert(&uc->uc_lock, MA_OWNED);
	if (uc->uc_busy) {
#ifdef SMP
		if (smp_cpus > 1) {
			int count = BUSY_SPINS;
			if (count > 0) {
				umtxq_unlock(key);
				while (uc->uc_busy && --count > 0)
					cpu_spinwait();
				umtxq_lock(key);
			}
		}
#endif
		while (uc->uc_busy) {
			uc->uc_waiters++;
			msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
			uc->uc_waiters--;
		}
	}
	uc->uc_busy = 1;
}
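
/*
 * Typical busy/unbusy pattern (illustrative sketch; see e.g.
 * do_unlock_pp() below for a real instance).  A chain is marked busy
 * around an operation that may fault or sleep, because the chain
 * mutex itself cannot be held across such an operation:
 *
 *	umtxq_lock(&key);
 *	umtxq_busy(&key);
 *	umtxq_unlock(&key);
 *	(void)casuword32(...);		// may fault on user memory
 *	umtxq_lock(&key);
 *	umtxq_unbusy(&key);
 *	umtxq_unlock(&key);
 */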

/*
 * Unbusy a chain.
 */
static inline void
umtxq_unbusy(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_assert(&uc->uc_lock, MA_OWNED);
	KASSERT(uc->uc_busy != 0, ("not busy"));
	uc->uc_busy = 0;
	if (uc->uc_waiters)
		wakeup_one(uc);
}

static struct umtxq_queue *
umtxq_queue_lookup(struct umtx_key *key, int q)
{
	struct umtxq_queue *uh;
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	LIST_FOREACH(uh, &uc->uc_queue[q], link) {
		if (umtx_key_match(&uh->key, key))
			return (uh);
	}

	return (NULL);
}

static inline void
umtxq_insert_queue(struct umtx_q *uq, int q)
{
	struct umtxq_queue *uh;
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	KASSERT((uq->uq_flags & UQF_UMTXQ) == 0, ("umtx_q is already on queue"));
	uh = umtxq_queue_lookup(&uq->uq_key, q);
	if (uh != NULL) {
		LIST_INSERT_HEAD(&uc->uc_spare_queue, uq->uq_spare_queue, link);
	} else {
		uh = uq->uq_spare_queue;
		uh->key = uq->uq_key;
		LIST_INSERT_HEAD(&uc->uc_queue[q], uh, link);
	}
	uq->uq_spare_queue = NULL;

	TAILQ_INSERT_TAIL(&uh->head, uq, uq_link);
	uh->length++;
	uq->uq_flags |= UQF_UMTXQ;
	uq->uq_cur_queue = uh;
}

static inline void
umtxq_remove_queue(struct umtx_q *uq, int q)
{
	struct umtxq_chain *uc;
	struct umtxq_queue *uh;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	if (uq->uq_flags & UQF_UMTXQ) {
		uh = uq->uq_cur_queue;
		TAILQ_REMOVE(&uh->head, uq, uq_link);
		uh->length--;
		uq->uq_flags &= ~UQF_UMTXQ;
		if (TAILQ_EMPTY(&uh->head)) {
			KASSERT(uh->length == 0,
			    ("inconsistent umtxq_queue length"));
			LIST_REMOVE(uh, link);
		} else {
			uh = LIST_FIRST(&uc->uc_spare_queue);
			KASSERT(uh != NULL, ("uc_spare_queue is empty"));
			LIST_REMOVE(uh, link);
		}
		uq->uq_spare_queue = uh;
		uq->uq_cur_queue = NULL;
	}
}

/*
 * Return the number of waiters on the shared queue for a key.
 */
static int
umtxq_count(struct umtx_key *key)
{
	struct umtxq_chain *uc;
	struct umtxq_queue *uh;

	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
	if (uh != NULL)
		return (uh->length);
	return (0);
}

/*
 * Return the number of PI waiters and, via *first, the first
 * waiter on the queue.
 */
static int
umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
{
	struct umtxq_chain *uc;
	struct umtxq_queue *uh;

	*first = NULL;
	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
	if (uh != NULL) {
		*first = TAILQ_FIRST(&uh->head);
		return (uh->length);
	}
	return (0);
}

/*
 * Wake up threads waiting on a userland object.
 */
static int
umtxq_signal_queue(struct umtx_key *key, int n_wake, int q)
{
	struct umtxq_chain *uc;
	struct umtxq_queue *uh;
	struct umtx_q *uq;
	int ret;

	ret = 0;
	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	uh = umtxq_queue_lookup(key, q);
	if (uh != NULL) {
		while ((uq = TAILQ_FIRST(&uh->head)) != NULL) {
			umtxq_remove_queue(uq, q);
			wakeup(uq);
			if (++ret >= n_wake)
				return (ret);
		}
	}
	return (ret);
}

/*
 * Wake up the specified thread.
 */
static inline void
umtxq_signal_thread(struct umtx_q *uq)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	umtxq_remove(uq);
	wakeup(uq);
}

/*
 * Put the thread into a sleep state.  Before sleeping, check whether
 * the thread was already removed from the umtx queue.
 */
static inline int
umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo)
{
	struct umtxq_chain *uc;
	int error;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	if (!(uq->uq_flags & UQF_UMTXQ))
		return (0);
	error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
	if (error == EWOULDBLOCK)
		error = ETIMEDOUT;
	return (error);
}

/*
 * Convert a userspace address into a unique logical key.
 */
static int
umtx_key_get(void *addr, int type, int share, struct umtx_key *key)
{
	struct thread *td = curthread;
	vm_map_t map;
	vm_map_entry_t entry;
	vm_pindex_t pindex;
	vm_prot_t prot;
	boolean_t wired;

	key->type = type;
	if (share == THREAD_SHARE) {
		key->shared = 0;
		key->info.private.vs = td->td_proc->p_vmspace;
		key->info.private.addr = (uintptr_t)addr;
	} else {
		MPASS(share == PROCESS_SHARE || share == AUTO_SHARE);
		map = &td->td_proc->p_vmspace->vm_map;
		if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
		    &entry, &key->info.shared.object, &pindex, &prot,
		    &wired) != KERN_SUCCESS) {
			return (EFAULT);
		}

		if ((share == PROCESS_SHARE) ||
		    (share == AUTO_SHARE &&
		     VM_INHERIT_SHARE == entry->inheritance)) {
			key->shared = 1;
			key->info.shared.offset = entry->offset + entry->start -
				(vm_offset_t)addr;
			vm_object_reference(key->info.shared.object);
		} else {
			key->shared = 0;
			key->info.private.vs = td->td_proc->p_vmspace;
			key->info.private.addr = (uintptr_t)addr;
		}
		vm_map_lookup_done(map, entry);
	}

	umtxq_hash(key);
	return (0);
}
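
/*
 * Example (editor's illustration): a THREAD_SHARE key is the pair
 * (p_vmspace, address), so two processes mapping the same physical
 * page still get distinct keys; a PROCESS_SHARE key is the pair
 * (backing VM object, offset within that object), so the same lock
 * word hashes identically no matter where each process mapped it.
 * AUTO_SHARE chooses between the two from the mapping's inheritance
 * flag: VM_INHERIT_SHARE means process-shared.
 */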

/*
 * Release key.
 */
static inline void
umtx_key_release(struct umtx_key *key)
{
	if (key->shared)
		vm_object_deallocate(key->info.shared.object);
}

/*
 * Lock a umtx object.
 */
static int
_do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id, int timo)
{
	struct umtx_q *uq;
	u_long owner;
	u_long old;
	int error = 0;

	uq = td->td_umtxq;

	/*
	 * Care must be exercised when dealing with the umtx structure.
	 * It can fault on any access.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword(&umtx->u_owner, UMTX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMTX_UNOWNED)
			return (0);

		/* The address was invalid. */
		if (owner == -1)
			return (EFAULT);

		/* If no one owns it but it is contested try to acquire it. */
		if (owner == UMTX_CONTESTED) {
			owner = casuword(&umtx->u_owner,
			    UMTX_CONTESTED, id | UMTX_CONTESTED);

			if (owner == UMTX_CONTESTED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			/* If this failed the lock has changed, restart. */
			continue;
		}

		/*
		 * If we caught a signal, we have already retried, so
		 * exit immediately.
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK,
			AUTO_SHARE, &uq->uq_key)) != 0)
			return (error);

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword(&umtx->u_owner, owner, owner | UMTX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race to the
		 * thread unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtx", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);
	}

	return (0);
}
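
/*
 * The uncontested path above ("This should be done in userland") is
 * indeed taken in userland first.  A minimal sketch of the fast path,
 * assuming <machine/atomic.h> and the caller's thread id in `id`
 * (illustrative only, not the libthr source):
 *
 *	if (atomic_cmpset_acq_long(&umtx->u_owner, UMTX_UNOWNED, id))
 *		return (0);		// acquired without a syscall
 *	return (_umtx_lock(umtx));	// contested: sleep in the kernel
 */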

/*
 * Lock a umtx object.
 */
static int
do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id,
	struct timespec *timeout)
{
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	int error;

	if (timeout == NULL) {
		error = _do_lock_umtx(td, umtx, id, 0);
		/* Mutex locking is restarted if it is interrupted. */
		if (error == EINTR)
			error = ERESTART;
	} else {
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
		for (;;) {
			error = _do_lock_umtx(td, umtx, id, tvtohz(&tv));
			if (error != ETIMEDOUT)
				break;
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				break;
			}
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
		}
		/* Timed-locking is not restarted. */
		if (error == ERESTART)
			error = EINTR;
	}
	return (error);
}

/*
 * Unlock a umtx object.
 */
static int
do_unlock_umtx(struct thread *td, struct umtx *umtx, u_long id)
{
	struct umtx_key key;
	u_long owner;
	u_long old;
	int error;
	int count;

	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword(__DEVOLATILE(u_long *, &umtx->u_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMTX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMTX_CONTESTED) == 0) {
		old = casuword(&umtx->u_owner, owner, UMTX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK, AUTO_SHARE,
		&key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * at most one thread is waiting for it.  Otherwise, it must
	 * be marked as contested.
	 */
	old = casuword(&umtx->u_owner, owner,
		count <= 1 ? UMTX_UNOWNED : UMTX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}
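
/*
 * Matching userland unlock sketch (illustrative only): release with a
 * CAS and enter the kernel only when the contested bit indicates that
 * sleepers may need waking:
 *
 *	if (atomic_cmpset_rel_long(&umtx->u_owner, id, UMTX_UNOWNED))
 *		return (0);		// no waiters recorded
 *	return (_umtx_unlock(umtx));	// kernel wakes a waiter
 */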

#ifdef COMPAT_FREEBSD32

/*
 * Lock a umtx object.
 */
static int
_do_lock_umtx32(struct thread *td, uint32_t *m, uint32_t id, int timo)
{
	struct umtx_q *uq;
	uint32_t owner;
	uint32_t old;
	int error = 0;

	uq = td->td_umtxq;

	/*
	 * Care must be exercised when dealing with the umtx structure.
	 * It can fault on any access.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword32(m, UMUTEX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMUTEX_UNOWNED)
			return (0);

		/* The address was invalid. */
		if (owner == -1)
			return (EFAULT);

		/* If no one owns it but it is contested try to acquire it. */
		if (owner == UMUTEX_CONTESTED) {
			owner = casuword32(m,
			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
			if (owner == UMUTEX_CONTESTED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			/* If this failed the lock has changed, restart. */
			continue;
		}

		/*
		 * If we caught a signal, we have already retried, so
		 * exit immediately.
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK,
			AUTO_SHARE, &uq->uq_key)) != 0)
			return (error);

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword32(m, owner, owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race to the
		 * thread unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtx", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);
	}

	return (0);
}

/*
 * Lock a umtx object.
 */
static int
do_lock_umtx32(struct thread *td, void *m, uint32_t id,
	struct timespec *timeout)
{
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	int error;

	if (timeout == NULL) {
		error = _do_lock_umtx32(td, m, id, 0);
		/* Mutex locking is restarted if it is interrupted. */
		if (error == EINTR)
			error = ERESTART;
	} else {
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
		for (;;) {
			error = _do_lock_umtx32(td, m, id, tvtohz(&tv));
			if (error != ETIMEDOUT)
				break;
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				break;
			}
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
		}
		/* Timed-locking is not restarted. */
		if (error == ERESTART)
			error = EINTR;
	}
	return (error);
}

/*
 * Unlock a umtx object.
 */
static int
do_unlock_umtx32(struct thread *td, uint32_t *m, uint32_t id)
{
	struct umtx_key key;
	uint32_t owner;
	uint32_t old;
	int error;
	int count;

	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(m);
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMUTEX_CONTESTED) == 0) {
		old = casuword32(m, owner, UMUTEX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK, AUTO_SHARE,
		&key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * at most one thread is waiting for it.  Otherwise, it must
	 * be marked as contested.
	 */
	old = casuword32(m, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}
#endif

/*
 * Fetch and compare value; sleep on the address if the value has
 * not changed.
 */
static int
do_wait(struct thread *td, void *addr, u_long id,
	struct timespec *timeout, int compat32, int is_private)
{
	struct umtx_q *uq;
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	u_long tmp;
	int error = 0;

	uq = td->td_umtxq;
	if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT,
		is_private ? THREAD_SHARE : AUTO_SHARE, &uq->uq_key)) != 0)
		return (error);

	umtxq_lock(&uq->uq_key);
	umtxq_insert(uq);
	umtxq_unlock(&uq->uq_key);
	if (compat32 == 0)
		tmp = fuword(addr);
	else
		tmp = (unsigned int)fuword32(addr);
	if (tmp != id) {
		umtxq_lock(&uq->uq_key);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	} else if (timeout == NULL) {
		umtxq_lock(&uq->uq_key);
		error = umtxq_sleep(uq, "uwait", 0);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	} else {
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
		umtxq_lock(&uq->uq_key);
		for (;;) {
			error = umtxq_sleep(uq, "uwait", tvtohz(&tv));
			if (!(uq->uq_flags & UQF_UMTXQ)) {
				error = 0;
				break;
			}
			if (error != ETIMEDOUT)
				break;
			umtxq_unlock(&uq->uq_key);
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				umtxq_lock(&uq->uq_key);
				break;
			}
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
			umtxq_lock(&uq->uq_key);
		}
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	}
	umtx_key_release(&uq->uq_key);
	if (error == ERESTART)
		error = EINTR;
	return (error);
}

/*
 * Wake up threads sleeping on the specified address.
 */
int
kern_umtx_wake(struct thread *td, void *uaddr, int n_wake, int is_private)
{
	struct umtx_key key;
	int ret;

	if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT,
		is_private ? THREAD_SHARE : AUTO_SHARE, &key)) != 0)
		return (ret);
	umtxq_lock(&key);
	ret = umtxq_signal(&key, n_wake);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	return (0);
}
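
/*
 * Userland view (illustrative sketch): this wait/wake pair behaves
 * like a futex and is reached through _umtx_op(2), e.g.
 *
 *	// sleep while *addr still holds `val`
 *	_umtx_op(addr, UMTX_OP_WAIT_UINT_PRIVATE, val, NULL, NULL);
 *	// wake up to INT_MAX threads sleeping on addr
 *	_umtx_op(addr, UMTX_OP_WAKE_PRIVATE, INT_MAX, NULL, NULL);
 *
 * The *_PRIVATE operations correspond to is_private != 0 here and use
 * the cheaper THREAD_SHARE key.
 */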

/*
 * Lock PTHREAD_PRIO_NONE protocol POSIX mutex.
 */
static int
_do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags, int timo,
	int mode)
{
	struct umtx_q *uq;
	uint32_t owner, old, id;
	int error = 0;

	id = td->td_tid;
	uq = td->td_umtxq;

	/*
	 * Care must be exercised when dealing with the umtx structure.
	 * It can fault on any access.
	 */
	for (;;) {
		owner = fuword32(__DEVOLATILE(void *, &m->m_owner));
		if (mode == _UMUTEX_WAIT) {
			if (owner == UMUTEX_UNOWNED || owner == UMUTEX_CONTESTED)
				return (0);
		} else {
			/*
			 * Try the uncontested case.  This should be done
			 * in userland.
			 */
			owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);

			/* The acquire succeeded. */
			if (owner == UMUTEX_UNOWNED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			/*
			 * If no one owns it but it is contested try to
			 * acquire it.
			 */
			if (owner == UMUTEX_CONTESTED) {
				owner = casuword32(&m->m_owner,
				    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

				if (owner == UMUTEX_CONTESTED)
					return (0);

				/* The address was invalid. */
				if (owner == -1)
					return (EFAULT);

				/*
				 * If this failed the lock has changed,
				 * restart.
				 */
				continue;
			}
		}

		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
		    (owner & ~UMUTEX_CONTESTED) == id)
			return (EDEADLK);

		if (mode == _UMUTEX_TRY)
			return (EBUSY);

		/*
		 * If we caught a signal, we have already retried, so
		 * exit immediately.
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
		    GET_SHARE(flags), &uq->uq_key)) != 0)
			return (error);

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unbusy(&uq->uq_key);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race to the
		 * thread unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		umtxq_unbusy(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtxn", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);
	}

	return (0);
}

/*
 * Unlock PTHREAD_PRIO_NONE protocol POSIX mutex.
 */
static int
do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags)
{
	struct umtx_key key;
	uint32_t owner, old, id;
	int error;
	int count;

	id = td->td_tid;
	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	if ((owner & UMUTEX_CONTESTED) == 0) {
		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * at most one thread is waiting for it.  Otherwise, it must
	 * be marked as contested.
	 */
	old = casuword32(&m->m_owner, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}

/*
 * Check if the mutex is available and wake up a waiter;
 * this applies only to a simple mutex.
 */
static int
do_wake_umutex(struct thread *td, struct umutex *m)
{
	struct umtx_key key;
	uint32_t owner;
	uint32_t flags;
	int error;
	int count;

	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != 0)
		return (0);

	flags = fuword32(&m->m_flags);

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	if (count <= 1)
		owner = casuword32(&m->m_owner, UMUTEX_CONTESTED, UMUTEX_UNOWNED);

	umtxq_lock(&key);
	if (count != 0 && (owner & ~UMUTEX_CONTESTED) == 0)
		umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	return (0);
}

static inline struct umtx_pi *
umtx_pi_alloc(int flags)
{
	struct umtx_pi *pi;

	pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags);
	TAILQ_INIT(&pi->pi_blocked);
	atomic_add_int(&umtx_pi_allocated, 1);
	return (pi);
}

static inline void
umtx_pi_free(struct umtx_pi *pi)
{
	uma_zfree(umtx_pi_zone, pi);
	atomic_add_int(&umtx_pi_allocated, -1);
}

/*
 * Adjust the thread's position on the PI mutex's blocked list after
 * its priority has been changed.
 */
static int
umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
{
	struct umtx_q *uq, *uq1, *uq2;
	struct thread *td1;

	mtx_assert(&umtx_lock, MA_OWNED);
	if (pi == NULL)
		return (0);

	uq = td->td_umtxq;

	/*
	 * Check if the thread needs to be moved on the blocked chain.
	 * It needs to be moved if either its priority is lower than
	 * the previous thread's or higher than the next thread's.
	 */
	uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
	uq2 = TAILQ_NEXT(uq, uq_lockq);
	if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
	    (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
		/*
		 * Remove thread from blocked chain and determine where
		 * it should be moved to.
		 */
		TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
		TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
			td1 = uq1->uq_thread;
			MPASS(td1->td_proc->p_magic == P_MAGIC);
			if (UPRI(td1) > UPRI(td))
				break;
		}

		if (uq1 == NULL)
			TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
		else
			TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
	}
	return (1);
}

/*
 * Propagate priority when a thread is blocked on a POSIX
 * PI mutex.
 */
static void
umtx_propagate_priority(struct thread *td)
{
	struct umtx_q *uq;
	struct umtx_pi *pi;
	int pri;

	mtx_assert(&umtx_lock, MA_OWNED);
	pri = UPRI(td);
	uq = td->td_umtxq;
	pi = uq->uq_pi_blocked;
	if (pi == NULL)
		return;

	for (;;) {
		td = pi->pi_owner;
		if (td == NULL || td == curthread)
			return;

		MPASS(td->td_proc != NULL);
		MPASS(td->td_proc->p_magic == P_MAGIC);

		thread_lock(td);
		if (td->td_lend_user_pri > pri)
			sched_lend_user_prio(td, pri);
		else {
			thread_unlock(td);
			break;
		}
		thread_unlock(td);

		/*
		 * Pick up the lock that td is blocked on.
		 */
		uq = td->td_umtxq;
		pi = uq->uq_pi_blocked;
		/* Resort td on the list if needed. */
		if (!umtx_pi_adjust_thread(pi, td))
			break;
	}
}
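
/*
 * Worked example (editor's): thread A (UPRI 140) blocks on PI mutex
 * M1 owned by B (160), and B is itself blocked on PI mutex M2 owned
 * by C (180).  With pri = 140, the loop first lends 140 to B, then
 * follows B's uq_pi_blocked to M2 and lends 140 to C, stopping once
 * an owner already runs at an equal or numerically lower (stronger)
 * priority.
 */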

/*
 * Unpropagate priority for a PI mutex when a thread blocked on
 * it is interrupted by a signal or resumed by another thread.
 */
static void
umtx_unpropagate_priority(struct umtx_pi *pi)
{
	struct umtx_q *uq, *uq_owner;
	struct umtx_pi *pi2;
	int pri, oldpri;

	mtx_assert(&umtx_lock, MA_OWNED);

	while (pi != NULL && pi->pi_owner != NULL) {
		pri = PRI_MAX;
		uq_owner = pi->pi_owner->td_umtxq;

		TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
			uq = TAILQ_FIRST(&pi2->pi_blocked);
			if (uq != NULL) {
				if (pri > UPRI(uq->uq_thread))
					pri = UPRI(uq->uq_thread);
			}
		}

		if (pri > uq_owner->uq_inherited_pri)
			pri = uq_owner->uq_inherited_pri;
		thread_lock(pi->pi_owner);
		oldpri = pi->pi_owner->td_user_pri;
		sched_unlend_user_prio(pi->pi_owner, pri);
		thread_unlock(pi->pi_owner);
		if (uq_owner->uq_pi_blocked != NULL)
			umtx_pi_adjust_locked(pi->pi_owner, oldpri);
		pi = uq_owner->uq_pi_blocked;
	}
}

/*
 * Insert a PI mutex into the owned list.
 */
static void
umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
{
	struct umtx_q *uq_owner;

	uq_owner = owner->td_umtxq;
	mtx_assert(&umtx_lock, MA_OWNED);
	if (pi->pi_owner != NULL)
		panic("pi_owner != NULL");
	pi->pi_owner = owner;
	TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
}

/*
 * Claim ownership of a PI mutex.
 */
static int
umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
{
	struct umtx_q *uq, *uq_owner;

	uq_owner = owner->td_umtxq;
	mtx_lock_spin(&umtx_lock);
	if (pi->pi_owner == owner) {
		mtx_unlock_spin(&umtx_lock);
		return (0);
	}

	if (pi->pi_owner != NULL) {
		/*
		 * Userland may have already messed up the mutex, sigh.
		 */
		mtx_unlock_spin(&umtx_lock);
		return (EPERM);
	}
	umtx_pi_setowner(pi, owner);
	uq = TAILQ_FIRST(&pi->pi_blocked);
	if (uq != NULL) {
		int pri;

		pri = UPRI(uq->uq_thread);
		thread_lock(owner);
		if (pri < UPRI(owner))
			sched_lend_user_prio(owner, pri);
		thread_unlock(owner);
	}
	mtx_unlock_spin(&umtx_lock);
	return (0);
}

static void
umtx_pi_adjust_locked(struct thread *td, u_char oldpri)
{
	struct umtx_q *uq;
	struct umtx_pi *pi;

	uq = td->td_umtxq;
	/*
	 * Pick up the lock that td is blocked on.
	 */
	pi = uq->uq_pi_blocked;
	MPASS(pi != NULL);

	/* Resort the turnstile on the list. */
	if (!umtx_pi_adjust_thread(pi, td))
		return;

	/*
	 * If our priority was lowered and we are at the head of the
	 * turnstile, then propagate our new priority up the chain.
	 */
	if (uq == TAILQ_FIRST(&pi->pi_blocked) && UPRI(td) < oldpri)
		umtx_propagate_priority(td);
}

/*
 * Adjust a thread's position on the blocked list of the PI mutex it
 * is blocked on; this may trigger a new round of priority propagation.
 */
void
umtx_pi_adjust(struct thread *td, u_char oldpri)
{
	struct umtx_q *uq;
	struct umtx_pi *pi;

	uq = td->td_umtxq;
	mtx_lock_spin(&umtx_lock);
	/*
	 * Pick up the lock that td is blocked on.
	 */
	pi = uq->uq_pi_blocked;
	if (pi != NULL)
		umtx_pi_adjust_locked(td, oldpri);
	mtx_unlock_spin(&umtx_lock);
}

/*
 * Sleep on a PI mutex.
 */
static int
umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi,
	uint32_t owner, const char *wmesg, int timo)
{
	struct umtxq_chain *uc;
	struct thread *td, *td1;
	struct umtx_q *uq1;
	int pri;
	int error = 0;

	td = uq->uq_thread;
	KASSERT(td == curthread, ("inconsistent uq_thread"));
	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	UMTXQ_BUSY_ASSERT(uc);
	umtxq_insert(uq);
	mtx_lock_spin(&umtx_lock);
	if (pi->pi_owner == NULL) {
		mtx_unlock_spin(&umtx_lock);
		/* XXX Only look up thread in current process. */
		td1 = tdfind(owner, curproc->p_pid);
		mtx_lock_spin(&umtx_lock);
		if (td1 != NULL) {
			if (pi->pi_owner == NULL)
				umtx_pi_setowner(pi, td1);
			PROC_UNLOCK(td1->td_proc);
		}
	}

	TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
		pri = UPRI(uq1->uq_thread);
		if (pri > UPRI(td))
			break;
	}

	if (uq1 != NULL)
		TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
	else
		TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);

	uq->uq_pi_blocked = pi;
	thread_lock(td);
	td->td_flags |= TDF_UPIBLOCKED;
	thread_unlock(td);
	umtx_propagate_priority(td);
	mtx_unlock_spin(&umtx_lock);
	umtxq_unbusy(&uq->uq_key);

	if (uq->uq_flags & UQF_UMTXQ) {
		error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
		if (error == EWOULDBLOCK)
			error = ETIMEDOUT;
		if (uq->uq_flags & UQF_UMTXQ) {
			umtxq_remove(uq);
		}
	}
	mtx_lock_spin(&umtx_lock);
	uq->uq_pi_blocked = NULL;
	thread_lock(td);
	td->td_flags &= ~TDF_UPIBLOCKED;
	thread_unlock(td);
	TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
	umtx_unpropagate_priority(pi);
	mtx_unlock_spin(&umtx_lock);
	umtxq_unlock(&uq->uq_key);

	return (error);
}

/*
 * Increment the reference count of a PI mutex.
 */
static void
umtx_pi_ref(struct umtx_pi *pi)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&pi->pi_key);
	UMTXQ_LOCKED_ASSERT(uc);
	pi->pi_refcount++;
}

/*
 * Decrement the reference count of a PI mutex; when the counter
 * drops to zero, its memory is freed.
 */
static void
umtx_pi_unref(struct umtx_pi *pi)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&pi->pi_key);
	UMTXQ_LOCKED_ASSERT(uc);
	KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
	if (--pi->pi_refcount == 0) {
		mtx_lock_spin(&umtx_lock);
		if (pi->pi_owner != NULL) {
			TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested,
				pi, pi_link);
			pi->pi_owner = NULL;
		}
		KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
			("blocked queue not empty"));
		mtx_unlock_spin(&umtx_lock);
		TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
		umtx_pi_free(pi);
	}
}

/*
 * Find a PI mutex in the hash table.
 */
static struct umtx_pi *
umtx_pi_lookup(struct umtx_key *key)
{
	struct umtxq_chain *uc;
	struct umtx_pi *pi;

	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);

	TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
		if (umtx_key_match(&pi->pi_key, key)) {
			return (pi);
		}
	}
	return (NULL);
}

/*
 * Insert a PI mutex into the hash table.
 */
static inline void
umtx_pi_insert(struct umtx_pi *pi)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&pi->pi_key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
}

/*
 * Lock a PI mutex.
 */
static int
_do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags, int timo,
	int try)
{
	struct umtx_q *uq;
	struct umtx_pi *pi, *new_pi;
	uint32_t id, owner, old;
	int error;

	id = td->td_tid;
	uq = td->td_umtxq;

	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
	    &uq->uq_key)) != 0)
		return (error);
	umtxq_lock(&uq->uq_key);
	pi = umtx_pi_lookup(&uq->uq_key);
	if (pi == NULL) {
		new_pi = umtx_pi_alloc(M_NOWAIT);
		if (new_pi == NULL) {
			umtxq_unlock(&uq->uq_key);
			new_pi = umtx_pi_alloc(M_WAITOK);
			umtxq_lock(&uq->uq_key);
			pi = umtx_pi_lookup(&uq->uq_key);
			if (pi != NULL) {
				umtx_pi_free(new_pi);
				new_pi = NULL;
			}
		}
		if (new_pi != NULL) {
			new_pi->pi_key = uq->uq_key;
			umtx_pi_insert(new_pi);
			pi = new_pi;
		}
	}
	umtx_pi_ref(pi);
	umtxq_unlock(&uq->uq_key);

	/*
	 * Care must be exercised when dealing with the umtx structure.
	 * It can fault on any access.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMUTEX_UNOWNED) {
			error = 0;
			break;
		}

		/* The address was invalid. */
		if (owner == -1) {
			error = EFAULT;
			break;
		}

		/* If no one owns it but it is contested try to acquire it. */
		if (owner == UMUTEX_CONTESTED) {
			owner = casuword32(&m->m_owner,
			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

			if (owner == UMUTEX_CONTESTED) {
				umtxq_lock(&uq->uq_key);
				umtxq_busy(&uq->uq_key);
				error = umtx_pi_claim(pi, td);
				umtxq_unbusy(&uq->uq_key);
				umtxq_unlock(&uq->uq_key);
				break;
			}

			/* The address was invalid. */
			if (owner == -1) {
				error = EFAULT;
				break;
			}

			/* If this failed the lock has changed, restart. */
			continue;
		}

		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
		    (owner & ~UMUTEX_CONTESTED) == id) {
			error = EDEADLK;
			break;
		}

		if (try != 0) {
			error = EBUSY;
			break;
		}

		/*
		 * If we caught a signal, we have already retried, so
		 * exit immediately.
		 */
		if (error != 0)
			break;

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_unbusy(&uq->uq_key);
			umtxq_unlock(&uq->uq_key);
			error = EFAULT;
			break;
		}

		umtxq_lock(&uq->uq_key);
		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race to the
		 * thread unlocking the umtx.
		 */
		if (old == owner)
			error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED,
				 "umtxpi", timo);
		else {
			umtxq_unbusy(&uq->uq_key);
			umtxq_unlock(&uq->uq_key);
		}
	}

	umtxq_lock(&uq->uq_key);
	umtx_pi_unref(pi);
	umtxq_unlock(&uq->uq_key);

	umtx_key_release(&uq->uq_key);
	return (error);
}

/*
 * Unlock a PI mutex.
 */
static int
do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags)
{
	struct umtx_key key;
	struct umtx_q *uq_first, *uq_first2, *uq_me;
	struct umtx_pi *pi, *pi2;
	uint32_t owner, old, id;
	int error;
	int count;
	int pri;

	id = td->td_tid;
	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMUTEX_CONTESTED) == 0) {
		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count_pi(&key, &uq_first);
	if (uq_first != NULL) {
		mtx_lock_spin(&umtx_lock);
		pi = uq_first->uq_pi_blocked;
		KASSERT(pi != NULL, ("pi == NULL?"));
		if (pi->pi_owner != curthread) {
			mtx_unlock_spin(&umtx_lock);
			umtxq_unbusy(&key);
			umtxq_unlock(&key);
			umtx_key_release(&key);
			/* userland messed up the mutex */
			return (EPERM);
		}
		uq_me = curthread->td_umtxq;
		pi->pi_owner = NULL;
		TAILQ_REMOVE(&uq_me->uq_pi_contested, pi, pi_link);
		/* Find the highest-priority thread that is still sleeping. */
		uq_first = TAILQ_FIRST(&pi->pi_blocked);
		while (uq_first != NULL &&
		       (uq_first->uq_flags & UQF_UMTXQ) == 0) {
			uq_first = TAILQ_NEXT(uq_first, uq_lockq);
		}
		pri = PRI_MAX;
		TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
			uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
			if (uq_first2 != NULL) {
				if (pri > UPRI(uq_first2->uq_thread))
					pri = UPRI(uq_first2->uq_thread);
			}
		}
		thread_lock(curthread);
		sched_unlend_user_prio(curthread, pri);
		thread_unlock(curthread);
		mtx_unlock_spin(&umtx_lock);
		if (uq_first)
			umtxq_signal_thread(uq_first);
	}
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * at most one thread is waiting for it.  Otherwise, it must
	 * be marked as contested.
	 */
	old = casuword32(&m->m_owner, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);

	umtxq_lock(&key);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}

/*
 * Lock a PP mutex.
 */
static int
_do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags, int timo,
	int try)
{
	struct umtx_q *uq, *uq2;
	struct umtx_pi *pi;
	uint32_t ceiling;
	uint32_t owner, id;
	int error, pri, old_inherited_pri, su;

	id = td->td_tid;
	uq = td->td_umtxq;
	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
	    &uq->uq_key)) != 0)
		return (error);
	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
	for (;;) {
		old_inherited_pri = uq->uq_inherited_pri;
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		ceiling = RTP_PRIO_MAX - fuword32(&m->m_ceilings[0]);
		if (ceiling > RTP_PRIO_MAX) {
			error = EINVAL;
			goto out;
		}

		mtx_lock_spin(&umtx_lock);
		if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
			mtx_unlock_spin(&umtx_lock);
			error = EINVAL;
			goto out;
		}
		if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
			uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
			thread_lock(td);
			if (uq->uq_inherited_pri < UPRI(td))
				sched_lend_user_prio(td, uq->uq_inherited_pri);
			thread_unlock(td);
		}
		mtx_unlock_spin(&umtx_lock);

		owner = casuword32(&m->m_owner,
		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

		if (owner == UMUTEX_CONTESTED) {
			error = 0;
			break;
		}

		/* The address was invalid. */
		if (owner == -1) {
			error = EFAULT;
			break;
		}

		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
		    (owner & ~UMUTEX_CONTESTED) == id) {
			error = EDEADLK;
			break;
		}

		if (try != 0) {
			error = EBUSY;
			break;
		}

		/*
		 * If we caught a signal, we have already retried, so
		 * exit immediately.
		 */
		if (error != 0)
			break;

		umtxq_lock(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		error = umtxq_sleep(uq, "umtxpp", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);

		mtx_lock_spin(&umtx_lock);
		uq->uq_inherited_pri = old_inherited_pri;
		pri = PRI_MAX;
		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
			uq2 = TAILQ_FIRST(&pi->pi_blocked);
			if (uq2 != NULL) {
				if (pri > UPRI(uq2->uq_thread))
					pri = UPRI(uq2->uq_thread);
			}
		}
		if (pri > uq->uq_inherited_pri)
			pri = uq->uq_inherited_pri;
		thread_lock(td);
		sched_unlend_user_prio(td, pri);
		thread_unlock(td);
		mtx_unlock_spin(&umtx_lock);
	}

	if (error != 0) {
		mtx_lock_spin(&umtx_lock);
		uq->uq_inherited_pri = old_inherited_pri;
		pri = PRI_MAX;
		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
			uq2 = TAILQ_FIRST(&pi->pi_blocked);
			if (uq2 != NULL) {
				if (pri > UPRI(uq2->uq_thread))
					pri = UPRI(uq2->uq_thread);
			}
		}
		if (pri > uq->uq_inherited_pri)
			pri = uq->uq_inherited_pri;
		thread_lock(td);
		sched_unlend_user_prio(td, pri);
		thread_unlock(td);
		mtx_unlock_spin(&umtx_lock);
	}

out:
	umtxq_lock(&uq->uq_key);
	umtxq_unbusy(&uq->uq_key);
	umtxq_unlock(&uq->uq_key);
	umtx_key_release(&uq->uq_key);
	return (error);
}
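
/*
 * Ceiling arithmetic, worked example (editor's): userland stores
 * POSIX-style ceilings in m_ceilings[0], where larger means more
 * important, while kernel priorities grow downward.  With
 * m_ceilings[0] == RTP_PRIO_MAX - 10:
 *
 *	ceiling          = RTP_PRIO_MAX - m_ceilings[0] = 10
 *	uq_inherited_pri = PRI_MIN_REALTIME + 10
 *
 * and an owner holding PRIV_SCHED_RTPRIO privilege runs at that
 * real-time priority for as long as it holds the mutex.
 */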

/*
 * Unlock a PP mutex.
 */
static int
do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags)
{
	struct umtx_key key;
	struct umtx_q *uq, *uq2;
	struct umtx_pi *pi;
	uint32_t owner, id;
	uint32_t rceiling;
	int error, pri, new_inherited_pri, su;

	id = td->td_tid;
	uq = td->td_umtxq;
	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);

	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
	if (error != 0)
		return (error);

	if (rceiling == -1)
		new_inherited_pri = PRI_MAX;
	else {
		rceiling = RTP_PRIO_MAX - rceiling;
		if (rceiling > RTP_PRIO_MAX)
			return (EINVAL);
		new_inherited_pri = PRI_MIN_REALTIME + rceiling;
	}

	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);
	umtxq_lock(&key);
	umtxq_busy(&key);
	umtxq_unlock(&key);
	/*
	 * For a priority-protected mutex, always set the unlocked state
	 * to UMUTEX_CONTESTED so that userland always enters the kernel
	 * to lock the mutex.  This is necessary because thread priority
	 * has to be adjusted for such a mutex.
	 */
	error = suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
		UMUTEX_CONTESTED);

	umtxq_lock(&key);
	if (error == 0)
		umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);

	if (error == -1)
		error = EFAULT;
	else {
		mtx_lock_spin(&umtx_lock);
		if (su != 0)
			uq->uq_inherited_pri = new_inherited_pri;
		pri = PRI_MAX;
		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
			uq2 = TAILQ_FIRST(&pi->pi_blocked);
			if (uq2 != NULL) {
				if (pri > UPRI(uq2->uq_thread))
					pri = UPRI(uq2->uq_thread);
			}
		}
		if (pri > uq->uq_inherited_pri)
			pri = uq->uq_inherited_pri;
		thread_lock(td);
		sched_unlend_user_prio(td, pri);
		thread_unlock(td);
		mtx_unlock_spin(&umtx_lock);
	}
	umtx_key_release(&key);
	return (error);
}
2182
2183static int
2184do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
2185	uint32_t *old_ceiling)
2186{
2187	struct umtx_q *uq;
2188	uint32_t save_ceiling;
2189	uint32_t owner, id;
2190	uint32_t flags;
2191	int error;
2192
2193	flags = fuword32(&m->m_flags);
2194	if ((flags & UMUTEX_PRIO_PROTECT) == 0)
2195		return (EINVAL);
2196	if (ceiling > RTP_PRIO_MAX)
2197		return (EINVAL);
2198	id = td->td_tid;
2199	uq = td->td_umtxq;
2200	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2201	   &uq->uq_key)) != 0)
2202		return (error);
2203	for (;;) {
2204		umtxq_lock(&uq->uq_key);
2205		umtxq_busy(&uq->uq_key);
2206		umtxq_unlock(&uq->uq_key);
2207
2208		save_ceiling = fuword32(&m->m_ceilings[0]);
2209
2210		owner = casuword32(&m->m_owner,
2211		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
2212
2213		if (owner == UMUTEX_CONTESTED) {
2214			suword32(&m->m_ceilings[0], ceiling);
2215			suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
2216				UMUTEX_CONTESTED);
2217			error = 0;
2218			break;
2219		}
2220
2221		/* The address was invalid. */
2222		if (owner == -1) {
2223			error = EFAULT;
2224			break;
2225		}
2226
2227		if ((owner & ~UMUTEX_CONTESTED) == id) {
2228			suword32(&m->m_ceilings[0], ceiling);
2229			error = 0;
2230			break;
2231		}
2232
2233		/*
2234		 * If we caught a signal, we have already retried;
2235		 * exit immediately now.
2236		 */
2237		if (error != 0)
2238			break;
2239
2240		/*
2241		 * We set the contested bit, so sleep. Otherwise the lock
2242		 * changed and we need to retry, or we lost a race to the
2243		 * thread unlocking the umtx.
2244		 */
2245		umtxq_lock(&uq->uq_key);
2246		umtxq_insert(uq);
2247		umtxq_unbusy(&uq->uq_key);
2248		error = umtxq_sleep(uq, "umtxpp", 0);
2249		umtxq_remove(uq);
2250		umtxq_unlock(&uq->uq_key);
2251	}
2252	umtxq_lock(&uq->uq_key);
2253	if (error == 0)
2254		umtxq_signal(&uq->uq_key, INT_MAX);
2255	umtxq_unbusy(&uq->uq_key);
2256	umtxq_unlock(&uq->uq_key);
2257	umtx_key_release(&uq->uq_key);
2258	if (error == 0 && old_ceiling != NULL)
2259		suword32(old_ceiling, save_ceiling);
2260	return (error);
2261}
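
/*
 * A hypothetical userland sketch of reaching do_set_ceiling() through
 * the _umtx_op(2) wrapper (dispatched via op_table below); "m" is a
 * priority-protected umutex already set up by the caller, and the
 * previous ceiling is returned through "old":
 *
 *	uint32_t old;
 *	if (_umtx_op(&m, UMTX_OP_SET_CEILING, 16, &old, NULL) == -1)
 *		err(1, "_umtx_op");
 */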
2262
2263static int
2264_do_lock_umutex(struct thread *td, struct umutex *m, int flags, int timo,
2265	int mode)
2266{
2267	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2268	case 0:
2269		return (_do_lock_normal(td, m, flags, timo, mode));
2270	case UMUTEX_PRIO_INHERIT:
2271		return (_do_lock_pi(td, m, flags, timo, mode));
2272	case UMUTEX_PRIO_PROTECT:
2273		return (_do_lock_pp(td, m, flags, timo, mode));
2274	}
2275	return (EINVAL);
2276}
2277
2278/*
2279 * Lock a userland POSIX mutex.
2280 */
2281static int
2282do_lock_umutex(struct thread *td, struct umutex *m,
2283	struct timespec *timeout, int mode)
2284{
2285	struct timespec ts, ts2, ts3;
2286	struct timeval tv;
2287	uint32_t flags;
2288	int error;
2289
2290	flags = fuword32(&m->m_flags);
2291	if (flags == -1)
2292		return (EFAULT);
2293
2294	if (timeout == NULL) {
2295		error = _do_lock_umutex(td, m, flags, 0, mode);
2296		/* Mutex locking is restarted if it is interrupted. */
2297		if (error == EINTR && mode != _UMUTEX_WAIT)
2298			error = ERESTART;
2299	} else {
2300		getnanouptime(&ts);
2301		timespecadd(&ts, timeout);
2302		TIMESPEC_TO_TIMEVAL(&tv, timeout);
2303		for (;;) {
2304			error = _do_lock_umutex(td, m, flags, tvtohz(&tv), mode);
2305			if (error != ETIMEDOUT)
2306				break;
2307			getnanouptime(&ts2);
2308			if (timespeccmp(&ts2, &ts, >=)) {
2309				error = ETIMEDOUT;
2310				break;
2311			}
2312			ts3 = ts;
2313			timespecsub(&ts3, &ts2);
2314			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
2315		}
2316		/* Timed-locking is not restarted. */
2317		if (error == ERESTART)
2318			error = EINTR;
2319	}
2320	return (error);
2321}
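
/*
 * A hypothetical userland sketch of the timed path above, assuming the
 * DEFAULT_UMUTEX initializer from <sys/umtx.h> and the _umtx_op(2)
 * wrapper from libc: lock "m" with a 1.5 second relative timeout.
 *
 *	struct umutex m = DEFAULT_UMUTEX;
 *	struct timespec ts = { 1, 500000000 };
 *	int error = _umtx_op(&m, UMTX_OP_MUTEX_LOCK, 0, NULL, &ts);
 */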
2322
2323/*
2324 * Unlock a userland POSIX mutex.
2325 */
2326static int
2327do_unlock_umutex(struct thread *td, struct umutex *m)
2328{
2329	uint32_t flags;
2330
2331	flags = fuword32(&m->m_flags);
2332	if (flags == -1)
2333		return (EFAULT);
2334
2335	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2336	case 0:
2337		return (do_unlock_normal(td, m, flags));
2338	case UMUTEX_PRIO_INHERIT:
2339		return (do_unlock_pi(td, m, flags));
2340	case UMUTEX_PRIO_PROTECT:
2341		return (do_unlock_pp(td, m, flags));
2342	}
2343
2344	return (EINVAL);
2345}
2346
2347static int
2348do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m,
2349	struct timespec *timeout, u_long wflags)
2350{
2351	struct umtx_q *uq;
2352	struct timeval tv;
2353	struct timespec cts, ets, tts;
2354	uint32_t flags;
2355	uint32_t clockid;
2356	int error;
2357
2358	uq = td->td_umtxq;
2359	flags = fuword32(&cv->c_flags);
2360	error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key);
2361	if (error != 0)
2362		return (error);
2363
2364	if ((wflags & CVWAIT_CLOCKID) != 0) {
2365		clockid = fuword32(&cv->c_clockid);
2366		if (clockid < CLOCK_REALTIME ||
2367		    clockid >= CLOCK_THREAD_CPUTIME_ID) {
2368			/* Only the predefined hardware clock ids will work. */
2369			return (EINVAL);
2370		}
2371	} else {
2372		clockid = CLOCK_REALTIME;
2373	}
2374
2375	umtxq_lock(&uq->uq_key);
2376	umtxq_busy(&uq->uq_key);
2377	umtxq_insert(uq);
2378	umtxq_unlock(&uq->uq_key);
2379
2380	/*
2381	 * Set c_has_waiters to 1 before releasing the user mutex, but
2382	 * avoid dirtying the cache line when it is already set.
2383	 */
2384	if (fuword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters)) == 0)
2385		suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 1);
2386
2387	umtxq_lock(&uq->uq_key);
2388	umtxq_unbusy(&uq->uq_key);
2389	umtxq_unlock(&uq->uq_key);
2390
2391	error = do_unlock_umutex(td, m);
2392
2393	umtxq_lock(&uq->uq_key);
2394	if (error == 0) {
2395		if (timeout == NULL) {
2396			error = umtxq_sleep(uq, "ucond", 0);
2397		} else {
2398			if ((wflags & CVWAIT_ABSTIME) == 0) {
2399				kern_clock_gettime(td, clockid, &ets);
2400				timespecadd(&ets, timeout);
2401				tts = *timeout;
2402			} else { /* absolute time */
2403				ets = *timeout;
2404				tts = *timeout;
2405				kern_clock_gettime(td, clockid, &cts);
2406				timespecsub(&tts, &cts);
2407			}
2408			TIMESPEC_TO_TIMEVAL(&tv, &tts);
2409			for (;;) {
2410				error = umtxq_sleep(uq, "ucond", tvtohz(&tv));
2411				if (error != ETIMEDOUT)
2412					break;
2413				kern_clock_gettime(td, clockid, &cts);
2414				if (timespeccmp(&cts, &ets, >=)) {
2415					error = ETIMEDOUT;
2416					break;
2417				}
2418				tts = ets;
2419				timespecsub(&tts, &cts);
2420				TIMESPEC_TO_TIMEVAL(&tv, &tts);
2421			}
2422		}
2423	}
2424
2425	if ((uq->uq_flags & UQF_UMTXQ) == 0)
2426		error = 0;
2427	else {
2428		/*
2429		 * This must be a timeout, an interruption by a signal,
2430		 * or a spurious wakeup; clear the c_has_waiters flag
2431		 * when necessary.
2432		 */
2433		umtxq_busy(&uq->uq_key);
2434		if ((uq->uq_flags & UQF_UMTXQ) != 0) {
2435			int oldlen = uq->uq_cur_queue->length;
2436			umtxq_remove(uq);
2437			if (oldlen == 1) {
2438				umtxq_unlock(&uq->uq_key);
2439				suword32(
2440				    __DEVOLATILE(uint32_t *,
2441					 &cv->c_has_waiters), 0);
2442				umtxq_lock(&uq->uq_key);
2443			}
2444		}
2445		umtxq_unbusy(&uq->uq_key);
2446		if (error == ERESTART)
2447			error = EINTR;
2448	}
2449
2450	umtxq_unlock(&uq->uq_key);
2451	umtx_key_release(&uq->uq_key);
2452	return (error);
2453}
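
/*
 * A hypothetical userland sketch of the protocol above: the kernel
 * unlocks "m" and queues the caller on "cv" while the chain is kept
 * busy, so a signal sent between the unlock and the sleep is not lost.
 * "cv" is a zero-initialized struct ucond and "m" a locked umutex.
 *
 *	_umtx_op(&cv, UMTX_OP_CV_WAIT, 0, &m, NULL);	(wait forever)
 */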
2454
2455/*
2456 * Signal a userland condition variable.
2457 */
2458static int
2459do_cv_signal(struct thread *td, struct ucond *cv)
2460{
2461	struct umtx_key key;
2462	int error, cnt, nwake;
2463	uint32_t flags;
2464
2465	flags = fuword32(&cv->c_flags);
2466	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2467		return (error);
2468	umtxq_lock(&key);
2469	umtxq_busy(&key);
2470	cnt = umtxq_count(&key);
2471	nwake = umtxq_signal(&key, 1);
2472	if (cnt <= nwake) {
2473		umtxq_unlock(&key);
2474		error = suword32(
2475		    __DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
2476		umtxq_lock(&key);
2477	}
2478	umtxq_unbusy(&key);
2479	umtxq_unlock(&key);
2480	umtx_key_release(&key);
2481	return (error);
2482}
2483
2484static int
2485do_cv_broadcast(struct thread *td, struct ucond *cv)
2486{
2487	struct umtx_key key;
2488	int error;
2489	uint32_t flags;
2490
2491	flags = fuword32(&cv->c_flags);
2492	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2493		return (error);
2494
2495	umtxq_lock(&key);
2496	umtxq_busy(&key);
2497	umtxq_signal(&key, INT_MAX);
2498	umtxq_unlock(&key);
2499
2500	error = suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
2501
2502	umtxq_lock(&key);
2503	umtxq_unbusy(&key);
2504	umtxq_unlock(&key);
2505
2506	umtx_key_release(&key);
2507	return (error);
2508}
2509
2510static int
2511do_rw_rdlock(struct thread *td, struct urwlock *rwlock, long fflag, int timo)
2512{
2513	struct umtx_q *uq;
2514	uint32_t flags, wrflags;
2515	int32_t state, oldstate;
2516	int32_t blocked_readers;
2517	int error;
2518
2519	uq = td->td_umtxq;
2520	flags = fuword32(&rwlock->rw_flags);
2521	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2522	if (error != 0)
2523		return (error);
2524
2525	wrflags = URWLOCK_WRITE_OWNER;
2526	if (!(fflag & URWLOCK_PREFER_READER) && !(flags & URWLOCK_PREFER_READER))
2527		wrflags |= URWLOCK_WRITE_WAITERS;
2528
2529	for (;;) {
2530		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2531		/* try to lock it */
2532		while (!(state & wrflags)) {
2533			if (__predict_false(URWLOCK_READER_COUNT(state) == URWLOCK_MAX_READERS)) {
2534				umtx_key_release(&uq->uq_key);
2535				return (EAGAIN);
2536			}
2537			oldstate = casuword32(&rwlock->rw_state, state, state + 1);
2538			if (oldstate == state) {
2539				umtx_key_release(&uq->uq_key);
2540				return (0);
2541			}
2542			state = oldstate;
2543		}
2544
2545		if (error)
2546			break;
2547
2548		/* grab monitor lock */
2549		umtxq_lock(&uq->uq_key);
2550		umtxq_busy(&uq->uq_key);
2551		umtxq_unlock(&uq->uq_key);
2552
2553		/*
2554		 * re-read the state, in case it changed between the try-lock above
2555		 * and the check below
2556		 */
2557		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2558
2559		/* set read contention bit */
2560		while ((state & wrflags) && !(state & URWLOCK_READ_WAITERS)) {
2561			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_READ_WAITERS);
2562			if (oldstate == state)
2563				goto sleep;
2564			state = oldstate;
2565		}
2566
2567		/* The state changed while we were setting the flag; restart. */
2568		if (!(state & wrflags)) {
2569			umtxq_lock(&uq->uq_key);
2570			umtxq_unbusy(&uq->uq_key);
2571			umtxq_unlock(&uq->uq_key);
2572			continue;
2573		}
2574
2575sleep:
2576		/* The contention bit is set; bump the read waiter count before sleeping. */
2577		blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2578		suword32(&rwlock->rw_blocked_readers, blocked_readers+1);
2579
2580		while (state & wrflags) {
2581			umtxq_lock(&uq->uq_key);
2582			umtxq_insert(uq);
2583			umtxq_unbusy(&uq->uq_key);
2584
2585			error = umtxq_sleep(uq, "urdlck", timo);
2586
2587			umtxq_busy(&uq->uq_key);
2588			umtxq_remove(uq);
2589			umtxq_unlock(&uq->uq_key);
2590			if (error)
2591				break;
2592			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2593		}
2594
2595		/* Decrease the read waiter count and possibly clear the read contention bit. */
2596		blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2597		suword32(&rwlock->rw_blocked_readers, blocked_readers-1);
2598		if (blocked_readers == 1) {
2599			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2600			for (;;) {
2601				oldstate = casuword32(&rwlock->rw_state, state,
2602					 state & ~URWLOCK_READ_WAITERS);
2603				if (oldstate == state)
2604					break;
2605				state = oldstate;
2606			}
2607		}
2608
2609		umtxq_lock(&uq->uq_key);
2610		umtxq_unbusy(&uq->uq_key);
2611		umtxq_unlock(&uq->uq_key);
2612	}
2613	umtx_key_release(&uq->uq_key);
2614	return (error);
2615}
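
/*
 * The rw_state word packs the reader count into its low bits
 * (URWLOCK_READER_COUNT()) alongside the URWLOCK_WRITE_OWNER and
 * waiter flags, so a single casuword32() both claims a read slot and
 * preserves the waiter bookkeeping. A sketch of the paths above:
 *
 *	state == 0:			cas(0 -> 1), first reader in
 *	state == 1:			cas(1 -> 2), second reader in
 *	state & URWLOCK_WRITE_OWNER:	set URWLOCK_READ_WAITERS, sleep
 */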
2616
2617static int
2618do_rw_rdlock2(struct thread *td, void *obj, long val, struct timespec *timeout)
2619{
2620	struct timespec ts, ts2, ts3;
2621	struct timeval tv;
2622	int error;
2623
2624	getnanouptime(&ts);
2625	timespecadd(&ts, timeout);
2626	TIMESPEC_TO_TIMEVAL(&tv, timeout);
2627	for (;;) {
2628		error = do_rw_rdlock(td, obj, val, tvtohz(&tv));
2629		if (error != ETIMEDOUT)
2630			break;
2631		getnanouptime(&ts2);
2632		if (timespeccmp(&ts2, &ts, >=)) {
2633			error = ETIMEDOUT;
2634			break;
2635		}
2636		ts3 = ts;
2637		timespecsub(&ts3, &ts2);
2638		TIMESPEC_TO_TIMEVAL(&tv, &ts3);
2639	}
2640	if (error == ERESTART)
2641		error = EINTR;
2642	return (error);
2643}
2644
2645static int
2646do_rw_wrlock(struct thread *td, struct urwlock *rwlock, int timo)
2647{
2648	struct umtx_q *uq;
2649	uint32_t flags;
2650	int32_t state, oldstate;
2651	int32_t blocked_writers;
2652	int32_t blocked_readers;
2653	int error;
2654
2655	uq = td->td_umtxq;
2656	flags = fuword32(&rwlock->rw_flags);
2657	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2658	if (error != 0)
2659		return (error);
2660
2661	blocked_readers = 0;
2662	for (;;) {
2663		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2664		while (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2665			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_OWNER);
2666			if (oldstate == state) {
2667				umtx_key_release(&uq->uq_key);
2668				return (0);
2669			}
2670			state = oldstate;
2671		}
2672
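		/*
		 * If we are giving up and no writer or write waiter
		 * remains, wake all readers we knew were blocked: they
		 * may have been held off only by our write-waiter
		 * preference and nobody else will wake them.
		 */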
2673		if (error) {
2674			if (!(state & (URWLOCK_WRITE_OWNER|URWLOCK_WRITE_WAITERS)) &&
2675			    blocked_readers != 0) {
2676				umtxq_lock(&uq->uq_key);
2677				umtxq_busy(&uq->uq_key);
2678				umtxq_signal_queue(&uq->uq_key, INT_MAX, UMTX_SHARED_QUEUE);
2679				umtxq_unbusy(&uq->uq_key);
2680				umtxq_unlock(&uq->uq_key);
2681			}
2682
2683			break;
2684		}
2685
2686		/* grab monitor lock */
2687		umtxq_lock(&uq->uq_key);
2688		umtxq_busy(&uq->uq_key);
2689		umtxq_unlock(&uq->uq_key);
2690
2691		/*
2692		 * re-read the state, in case it changed between the try-lock above
2693		 * and the check below
2694		 */
2695		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2696
2697		while (((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) &&
2698		       (state & URWLOCK_WRITE_WAITERS) == 0) {
2699			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_WAITERS);
2700			if (oldstate == state)
2701				goto sleep;
2702			state = oldstate;
2703		}
2704
2705		if (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2706			umtxq_lock(&uq->uq_key);
2707			umtxq_unbusy(&uq->uq_key);
2708			umtxq_unlock(&uq->uq_key);
2709			continue;
2710		}
2711sleep:
2712		blocked_writers = fuword32(&rwlock->rw_blocked_writers);
2713		suword32(&rwlock->rw_blocked_writers, blocked_writers+1);
2714
2715		while ((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) {
2716			umtxq_lock(&uq->uq_key);
2717			umtxq_insert_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2718			umtxq_unbusy(&uq->uq_key);
2719
2720			error = umtxq_sleep(uq, "uwrlck", timo);
2721
2722			umtxq_busy(&uq->uq_key);
2723			umtxq_remove_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2724			umtxq_unlock(&uq->uq_key);
2725			if (error)
2726				break;
2727			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2728		}
2729
2730		blocked_writers = fuword32(&rwlock->rw_blocked_writers);
2731		suword32(&rwlock->rw_blocked_writers, blocked_writers-1);
2732		if (blocked_writers == 1) {
2733			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2734			for (;;) {
2735				oldstate = casuword32(&rwlock->rw_state, state,
2736					 state & ~URWLOCK_WRITE_WAITERS);
2737				if (oldstate == state)
2738					break;
2739				state = oldstate;
2740			}
2741			blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2742		} else
2743			blocked_readers = 0;
2744
2745		umtxq_lock(&uq->uq_key);
2746		umtxq_unbusy(&uq->uq_key);
2747		umtxq_unlock(&uq->uq_key);
2748	}
2749
2750	umtx_key_release(&uq->uq_key);
2751	return (error);
2752}
2753
2754static int
2755do_rw_wrlock2(struct thread *td, void *obj, struct timespec *timeout)
2756{
2757	struct timespec ts, ts2, ts3;
2758	struct timeval tv;
2759	int error;
2760
2761	getnanouptime(&ts);
2762	timespecadd(&ts, timeout);
2763	TIMESPEC_TO_TIMEVAL(&tv, timeout);
2764	for (;;) {
2765		error = do_rw_wrlock(td, obj, tvtohz(&tv));
2766		if (error != ETIMEDOUT)
2767			break;
2768		getnanouptime(&ts2);
2769		if (timespeccmp(&ts2, &ts, >=)) {
2770			error = ETIMEDOUT;
2771			break;
2772		}
2773		ts3 = ts;
2774		timespecsub(&ts3, &ts2);
2775		TIMESPEC_TO_TIMEVAL(&tv, &ts3);
2776	}
2777	if (error == ERESTART)
2778		error = EINTR;
2779	return (error);
2780}
2781
2782static int
2783do_rw_unlock(struct thread *td, struct urwlock *rwlock)
2784{
2785	struct umtx_q *uq;
2786	uint32_t flags;
2787	int32_t state, oldstate;
2788	int error, q, count;
2789
2790	uq = td->td_umtxq;
2791	flags = fuword32(&rwlock->rw_flags);
2792	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2793	if (error != 0)
2794		return (error);
2795
2796	state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2797	if (state & URWLOCK_WRITE_OWNER) {
2798		for (;;) {
2799			oldstate = casuword32(&rwlock->rw_state, state,
2800				state & ~URWLOCK_WRITE_OWNER);
2801			if (oldstate != state) {
2802				state = oldstate;
2803				if (!(oldstate & URWLOCK_WRITE_OWNER)) {
2804					error = EPERM;
2805					goto out;
2806				}
2807			} else
2808				break;
2809		}
2810	} else if (URWLOCK_READER_COUNT(state) != 0) {
2811		for (;;) {
2812			oldstate = casuword32(&rwlock->rw_state, state,
2813				state - 1);
2814			if (oldstate != state) {
2815				state = oldstate;
2816				if (URWLOCK_READER_COUNT(oldstate) == 0) {
2817					error = EPERM;
2818					goto out;
2819				}
2820			}
2821			else
2822				break;
2823		}
2824	} else {
2825		error = EPERM;
2826		goto out;
2827	}
2828
2829	count = 0;
2830
2831	if (!(flags & URWLOCK_PREFER_READER)) {
2832		if (state & URWLOCK_WRITE_WAITERS) {
2833			count = 1;
2834			q = UMTX_EXCLUSIVE_QUEUE;
2835		} else if (state & URWLOCK_READ_WAITERS) {
2836			count = INT_MAX;
2837			q = UMTX_SHARED_QUEUE;
2838		}
2839	} else {
2840		if (state & URWLOCK_READ_WAITERS) {
2841			count = INT_MAX;
2842			q = UMTX_SHARED_QUEUE;
2843		} else if (state & URWLOCK_WRITE_WAITERS) {
2844			count = 1;
2845			q = UMTX_EXCLUSIVE_QUEUE;
2846		}
2847	}
2848
2849	if (count) {
2850		umtxq_lock(&uq->uq_key);
2851		umtxq_busy(&uq->uq_key);
2852		umtxq_signal_queue(&uq->uq_key, count, q);
2853		umtxq_unbusy(&uq->uq_key);
2854		umtxq_unlock(&uq->uq_key);
2855	}
2856out:
2857	umtx_key_release(&uq->uq_key);
2858	return (error);
2859}
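
/*
 * The wakeup policy above, in brief:
 *
 *	default:		a pending writer wins; wake one writer,
 *				otherwise wake all blocked readers
 *	URWLOCK_PREFER_READER:	wake all blocked readers first,
 *				otherwise wake one writer
 */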
2860
2861static int
2862do_sem_wait(struct thread *td, struct _usem *sem, struct timespec *timeout)
2863{
2864	struct umtx_q *uq;
2865	struct timeval tv;
2866	struct timespec cts, ets, tts;
2867	uint32_t flags, count;
2868	int error;
2869
2870	uq = td->td_umtxq;
2871	flags = fuword32(&sem->_flags);
2872	error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key);
2873	if (error != 0)
2874		return (error);
2875	umtxq_lock(&uq->uq_key);
2876	umtxq_busy(&uq->uq_key);
2877	umtxq_insert(uq);
2878	umtxq_unlock(&uq->uq_key);
2879
2880	if (fuword32(__DEVOLATILE(uint32_t *, &sem->_has_waiters)) == 0)
2881		casuword32(__DEVOLATILE(uint32_t *, &sem->_has_waiters), 0, 1);
2882
2883	count = fuword32(__DEVOLATILE(uint32_t *, &sem->_count));
2884	if (count != 0) {
2885		umtxq_lock(&uq->uq_key);
2886		umtxq_unbusy(&uq->uq_key);
2887		umtxq_remove(uq);
2888		umtxq_unlock(&uq->uq_key);
2889		umtx_key_release(&uq->uq_key);
2890		return (0);
2891	}
2892
2893	umtxq_lock(&uq->uq_key);
2894	umtxq_unbusy(&uq->uq_key);
2895	umtxq_unlock(&uq->uq_key);
2896
2897	umtxq_lock(&uq->uq_key);
2898	if (timeout == NULL) {
2899		error = umtxq_sleep(uq, "usem", 0);
2900	} else {
2901		getnanouptime(&ets);
2902		timespecadd(&ets, timeout);
2903		TIMESPEC_TO_TIMEVAL(&tv, timeout);
2904		for (;;) {
2905			error = umtxq_sleep(uq, "usem", tvtohz(&tv));
2906			if (error != ETIMEDOUT)
2907				break;
2908			getnanouptime(&cts);
2909			if (timespeccmp(&cts, &ets, >=)) {
2910				error = ETIMEDOUT;
2911				break;
2912			}
2913			tts = ets;
2914			timespecsub(&tts, &cts);
2915			TIMESPEC_TO_TIMEVAL(&tv, &tts);
2916		}
2917	}
2918
2919	if ((uq->uq_flags & UQF_UMTXQ) == 0)
2920		error = 0;
2921	else {
2922		umtxq_remove(uq);
2923		if (error == ERESTART)
2924			error = EINTR;
2925	}
2926	umtxq_unlock(&uq->uq_key);
2927	umtx_key_release(&uq->uq_key);
2928	return (error);
2929}
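
/*
 * A hypothetical userland sketch of the post side pairing with
 * do_sem_wait()/do_sem_wake(); the field names follow struct _usem and
 * the atomic is assumed from <machine/atomic.h>. The count lives in
 * userland and the kernel is entered only when a waiter may be asleep:
 *
 *	atomic_add_rel_32(&sem->_count, 1);
 *	if (sem->_has_waiters)
 *		_umtx_op(sem, UMTX_OP_SEM_WAKE, 0, NULL, NULL);
 */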
2930
2931/*
2932 * Wake up a userland semaphore waiter.
2933 */
2934static int
2935do_sem_wake(struct thread *td, struct _usem *sem)
2936{
2937	struct umtx_key key;
2938	int error, cnt, nwake;
2939	uint32_t flags;
2940
2941	flags = fuword32(&sem->_flags);
2942	if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0)
2943		return (error);
2944	umtxq_lock(&key);
2945	umtxq_busy(&key);
2946	cnt = umtxq_count(&key);
2947	nwake = umtxq_signal(&key, 1);
2948	if (cnt <= nwake) {
2949		umtxq_unlock(&key);
2950		error = suword32(
2951		    __DEVOLATILE(uint32_t *, &sem->_has_waiters), 0);
2952		umtxq_lock(&key);
2953	}
2954	umtxq_unbusy(&key);
2955	umtxq_unlock(&key);
2956	umtx_key_release(&key);
2957	return (error);
2958}
2959
2960int
2961_umtx_lock(struct thread *td, struct _umtx_lock_args *uap)
2962    /* struct umtx *umtx */
2963{
2964	return _do_lock_umtx(td, uap->umtx, td->td_tid, 0);
2965}
2966
2967int
2968_umtx_unlock(struct thread *td, struct _umtx_unlock_args *uap)
2969    /* struct umtx *umtx */
2970{
2971	return do_unlock_umtx(td, uap->umtx, td->td_tid);
2972}
2973
2974static int
2975__umtx_op_lock_umtx(struct thread *td, struct _umtx_op_args *uap)
2976{
2977	struct timespec *ts, timeout;
2978	int error;
2979
2980	/* Allow a null timespec (wait forever). */
2981	if (uap->uaddr2 == NULL)
2982		ts = NULL;
2983	else {
2984		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2985		if (error != 0)
2986			return (error);
2987		if (timeout.tv_nsec >= 1000000000 ||
2988		    timeout.tv_nsec < 0) {
2989			return (EINVAL);
2990		}
2991		ts = &timeout;
2992	}
2993	return (do_lock_umtx(td, uap->obj, uap->val, ts));
2994}
2995
2996static int
2997__umtx_op_unlock_umtx(struct thread *td, struct _umtx_op_args *uap)
2998{
2999	return (do_unlock_umtx(td, uap->obj, uap->val));
3000}
3001
3002static int
3003__umtx_op_wait(struct thread *td, struct _umtx_op_args *uap)
3004{
3005	struct timespec *ts, timeout;
3006	int error;
3007
3008	if (uap->uaddr2 == NULL)
3009		ts = NULL;
3010	else {
3011		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
3012		if (error != 0)
3013			return (error);
3014		if (timeout.tv_nsec >= 1000000000 ||
3015		    timeout.tv_nsec < 0)
3016			return (EINVAL);
3017		ts = &timeout;
3018	}
3019	return do_wait(td, uap->obj, uap->val, ts, 0, 0);
3020}
3021
3022static int
3023__umtx_op_wait_uint(struct thread *td, struct _umtx_op_args *uap)
3024{
3025	struct timespec *ts, timeout;
3026	int error;
3027
3028	if (uap->uaddr2 == NULL)
3029		ts = NULL;
3030	else {
3031		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
3032		if (error != 0)
3033			return (error);
3034		if (timeout.tv_nsec >= 1000000000 ||
3035		    timeout.tv_nsec < 0)
3036			return (EINVAL);
3037		ts = &timeout;
3038	}
3039	return do_wait(td, uap->obj, uap->val, ts, 1, 0);
3040}
3041
3042static int
3043__umtx_op_wait_uint_private(struct thread *td, struct _umtx_op_args *uap)
3044{
3045	struct timespec *ts, timeout;
3046	int error;
3047
3048	if (uap->uaddr2 == NULL)
3049		ts = NULL;
3050	else {
3051		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
3052		if (error != 0)
3053			return (error);
3054		if (timeout.tv_nsec >= 1000000000 ||
3055		    timeout.tv_nsec < 0)
3056			return (EINVAL);
3057		ts = &timeout;
3058	}
3059	return do_wait(td, uap->obj, uap->val, ts, 1, 1);
3060}
3061
3062static int
3063__umtx_op_wake(struct thread *td, struct _umtx_op_args *uap)
3064{
3065	return (kern_umtx_wake(td, uap->obj, uap->val, 0));
3066}
3067
3068#define BATCH_SIZE	128
3069static int
3070__umtx_op_nwake_private(struct thread *td, struct _umtx_op_args *uap)
3071{
3072	int count = uap->val;
3073	void *uaddrs[BATCH_SIZE];
3074	char **upp = (char **)uap->obj;
3075	int tocopy;
3076	int error = 0;
3077	int i, pos = 0;
3078
3079	while (count > 0) {
3080		tocopy = count;
3081		if (tocopy > BATCH_SIZE)
3082			tocopy = BATCH_SIZE;
3083		error = copyin(upp+pos, uaddrs, tocopy * sizeof(char *));
3084		if (error != 0)
3085			break;
3086		for (i = 0; i < tocopy; ++i)
3087			kern_umtx_wake(td, uaddrs[i], INT_MAX, 1);
3088		count -= tocopy;
3089		pos += tocopy;
3090	}
3091	return (error);
3092}
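
/*
 * A hypothetical userland sketch of the batched wakeup above: "obj" is
 * an array of private wait addresses and "val" its length, so a single
 * syscall wakes every address, copied in BATCH_SIZE chunks:
 *
 *	void *addrs[3] = { &w1, &w2, &w3 };
 *	_umtx_op(addrs, UMTX_OP_NWAKE_PRIVATE, 3, NULL, NULL);
 */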
3093
3094static int
3095__umtx_op_wake_private(struct thread *td, struct _umtx_op_args *uap)
3096{
3097	return (kern_umtx_wake(td, uap->obj, uap->val, 1));
3098}
3099
3100static int
3101__umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap)
3102{
3103	struct timespec *ts, timeout;
3104	int error;
3105
3106	/* Allow a null timespec (wait forever). */
3107	if (uap->uaddr2 == NULL)
3108		ts = NULL;
3109	else {
3110		error = copyin(uap->uaddr2, &timeout,
3111		    sizeof(timeout));
3112		if (error != 0)
3113			return (error);
3114		if (timeout.tv_nsec >= 1000000000 ||
3115		    timeout.tv_nsec < 0) {
3116			return (EINVAL);
3117		}
3118		ts = &timeout;
3119	}
3120	return do_lock_umutex(td, uap->obj, ts, 0);
3121}
3122
3123static int
3124__umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap)
3125{
3126	return do_lock_umutex(td, uap->obj, NULL, _UMUTEX_TRY);
3127}
3128
3129static int
3130__umtx_op_wait_umutex(struct thread *td, struct _umtx_op_args *uap)
3131{
3132	struct timespec *ts, timeout;
3133	int error;
3134
3135	/* Allow a null timespec (wait forever). */
3136	if (uap->uaddr2 == NULL)
3137		ts = NULL;
3138	else {
3139		error = copyin(uap->uaddr2, &timeout,
3140		    sizeof(timeout));
3141		if (error != 0)
3142			return (error);
3143		if (timeout.tv_nsec >= 1000000000 ||
3144		    timeout.tv_nsec < 0) {
3145			return (EINVAL);
3146		}
3147		ts = &timeout;
3148	}
3149	return do_lock_umutex(td, uap->obj, ts, _UMUTEX_WAIT);
3150}
3151
3152static int
3153__umtx_op_wake_umutex(struct thread *td, struct _umtx_op_args *uap)
3154{
3155	return do_wake_umutex(td, uap->obj);
3156}
3157
3158static int
3159__umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap)
3160{
3161	return do_unlock_umutex(td, uap->obj);
3162}
3163
3164static int
3165__umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap)
3166{
3167	return do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1);
3168}
3169
3170static int
3171__umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap)
3172{
3173	struct timespec *ts, timeout;
3174	int error;
3175
3176	/* Allow a null timespec (wait forever). */
3177	if (uap->uaddr2 == NULL)
3178		ts = NULL;
3179	else {
3180		error = copyin(uap->uaddr2, &timeout,
3181		    sizeof(timeout));
3182		if (error != 0)
3183			return (error);
3184		if (timeout.tv_nsec >= 1000000000 ||
3185		    timeout.tv_nsec < 0) {
3186			return (EINVAL);
3187		}
3188		ts = &timeout;
3189	}
3190	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3191}
3192
3193static int
3194__umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap)
3195{
3196	return do_cv_signal(td, uap->obj);
3197}
3198
3199static int
3200__umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap)
3201{
3202	return do_cv_broadcast(td, uap->obj);
3203}
3204
3205static int
3206__umtx_op_rw_rdlock(struct thread *td, struct _umtx_op_args *uap)
3207{
3208	struct timespec timeout;
3209	int error;
3210
3211	/* Allow a null timespec (wait forever). */
3212	if (uap->uaddr2 == NULL) {
3213		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3214	} else {
3215		error = copyin(uap->uaddr2, &timeout,
3216		    sizeof(timeout));
3217		if (error != 0)
3218			return (error);
3219		if (timeout.tv_nsec >= 1000000000 ||
3220		    timeout.tv_nsec < 0) {
3221			return (EINVAL);
3222		}
3223		error = do_rw_rdlock2(td, uap->obj, uap->val, &timeout);
3224	}
3225	return (error);
3226}
3227
3228static int
3229__umtx_op_rw_wrlock(struct thread *td, struct _umtx_op_args *uap)
3230{
3231	struct timespec timeout;
3232	int error;
3233
3234	/* Allow a null timespec (wait forever). */
3235	if (uap->uaddr2 == NULL) {
3236		error = do_rw_wrlock(td, uap->obj, 0);
3237	} else {
3238		error = copyin(uap->uaddr2, &timeout,
3239		    sizeof(timeout));
3240		if (error != 0)
3241			return (error);
3242		if (timeout.tv_nsec >= 1000000000 ||
3243		    timeout.tv_nsec < 0) {
3244			return (EINVAL);
3245		}
3246
3247		error = do_rw_wrlock2(td, uap->obj, &timeout);
3248	}
3249	return (error);
3250}
3251
3252static int
3253__umtx_op_rw_unlock(struct thread *td, struct _umtx_op_args *uap)
3254{
3255	return do_rw_unlock(td, uap->obj);
3256}
3257
3258static int
3259__umtx_op_sem_wait(struct thread *td, struct _umtx_op_args *uap)
3260{
3261	struct timespec *ts, timeout;
3262	int error;
3263
3264	/* Allow a null timespec (wait forever). */
3265	if (uap->uaddr2 == NULL)
3266		ts = NULL;
3267	else {
3268		error = copyin(uap->uaddr2, &timeout,
3269		    sizeof(timeout));
3270		if (error != 0)
3271			return (error);
3272		if (timeout.tv_nsec >= 1000000000 ||
3273		    timeout.tv_nsec < 0) {
3274			return (EINVAL);
3275		}
3276		ts = &timeout;
3277	}
3278	return (do_sem_wait(td, uap->obj, ts));
3279}
3280
3281static int
3282__umtx_op_sem_wake(struct thread *td, struct _umtx_op_args *uap)
3283{
3284	return do_sem_wake(td, uap->obj);
3285}
3286
3287typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap);
3288
3289static _umtx_op_func op_table[] = {
3290	__umtx_op_lock_umtx,		/* UMTX_OP_LOCK */
3291	__umtx_op_unlock_umtx,		/* UMTX_OP_UNLOCK */
3292	__umtx_op_wait,			/* UMTX_OP_WAIT */
3293	__umtx_op_wake,			/* UMTX_OP_WAKE */
3294	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
3295	__umtx_op_lock_umutex,		/* UMTX_OP_MUTEX_LOCK */
3296	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
3297	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
3298	__umtx_op_cv_wait,		/* UMTX_OP_CV_WAIT */
3299	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
3300	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
3301	__umtx_op_wait_uint,		/* UMTX_OP_WAIT_UINT */
3302	__umtx_op_rw_rdlock,		/* UMTX_OP_RW_RDLOCK */
3303	__umtx_op_rw_wrlock,		/* UMTX_OP_RW_WRLOCK */
3304	__umtx_op_rw_unlock,		/* UMTX_OP_RW_UNLOCK */
3305	__umtx_op_wait_uint_private,	/* UMTX_OP_WAIT_UINT_PRIVATE */
3306	__umtx_op_wake_private,		/* UMTX_OP_WAKE_PRIVATE */
3307	__umtx_op_wait_umutex,		/* UMTX_OP_UMUTEX_WAIT */
3308	__umtx_op_wake_umutex,		/* UMTX_OP_UMUTEX_WAKE */
3309	__umtx_op_sem_wait,		/* UMTX_OP_SEM_WAIT */
3310	__umtx_op_sem_wake,		/* UMTX_OP_SEM_WAKE */
3311	__umtx_op_nwake_private		/* UMTX_OP_NWAKE_PRIVATE */
3312};
3313
3314int
3315_umtx_op(struct thread *td, struct _umtx_op_args *uap)
3316{
3317	if ((unsigned)uap->op < UMTX_OP_MAX)
3318		return (*op_table[uap->op])(td, uap);
3319	return (EINVAL);
3320}
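
/*
 * A hypothetical userland sketch of the plain wait/wake pair as
 * dispatched through op_table, assuming the atomics from
 * <machine/atomic.h>; do_wait() revalidates the word after queueing,
 * so the wakeup below cannot be lost:
 *
 *	while (atomic_load_acq_int(&flag) == 0)
 *		_umtx_op(&flag, UMTX_OP_WAIT_UINT_PRIVATE, 0, NULL, NULL);
 *
 *	atomic_store_rel_int(&flag, 1);
 *	_umtx_op(&flag, UMTX_OP_WAKE_PRIVATE, INT_MAX, NULL, NULL);
 */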
3321
3322#ifdef COMPAT_FREEBSD32
3323int
3324freebsd32_umtx_lock(struct thread *td, struct freebsd32_umtx_lock_args *uap)
3325    /* struct umtx *umtx */
3326{
3327	return (do_lock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid, NULL));
3328}
3329
3330int
3331freebsd32_umtx_unlock(struct thread *td, struct freebsd32_umtx_unlock_args *uap)
3332    /* struct umtx *umtx */
3333{
3334	return (do_unlock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid));
3335}
3336
3337struct timespec32 {
3338	uint32_t tv_sec;
3339	uint32_t tv_nsec;
3340};
3341
3342static inline int
3343copyin_timeout32(void *addr, struct timespec *tsp)
3344{
3345	struct timespec32 ts32;
3346	int error;
3347
3348	error = copyin(addr, &ts32, sizeof(struct timespec32));
3349	if (error == 0) {
3350		tsp->tv_sec = ts32.tv_sec;
3351		tsp->tv_nsec = ts32.tv_nsec;
3352	}
3353	return (error);
3354}
3355
3356static int
3357__umtx_op_lock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
3358{
3359	struct timespec *ts, timeout;
3360	int error;
3361
3362	/* Allow a null timespec (wait forever). */
3363	if (uap->uaddr2 == NULL)
3364		ts = NULL;
3365	else {
3366		error = copyin_timeout32(uap->uaddr2, &timeout);
3367		if (error != 0)
3368			return (error);
3369		if (timeout.tv_nsec >= 1000000000 ||
3370		    timeout.tv_nsec < 0) {
3371			return (EINVAL);
3372		}
3373		ts = &timeout;
3374	}
3375	return (do_lock_umtx32(td, uap->obj, uap->val, ts));
3376}
3377
3378static int
3379__umtx_op_unlock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
3380{
3381	return (do_unlock_umtx32(td, uap->obj, (uint32_t)uap->val));
3382}
3383
3384static int
3385__umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3386{
3387	struct timespec *ts, timeout;
3388	int error;
3389
3390	if (uap->uaddr2 == NULL)
3391		ts = NULL;
3392	else {
3393		error = copyin_timeout32(uap->uaddr2, &timeout);
3394		if (error != 0)
3395			return (error);
3396		if (timeout.tv_nsec >= 1000000000 ||
3397		    timeout.tv_nsec < 0)
3398			return (EINVAL);
3399		ts = &timeout;
3400	}
3401	return do_wait(td, uap->obj, uap->val, ts, 1, 0);
3402}
3403
3404static int
3405__umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3406{
3407	struct timespec *ts, timeout;
3408	int error;
3409
3410	/* Allow a null timespec (wait forever). */
3411	if (uap->uaddr2 == NULL)
3412		ts = NULL;
3413	else {
3414		error = copyin_timeout32(uap->uaddr2, &timeout);
3415		if (error != 0)
3416			return (error);
3417		if (timeout.tv_nsec >= 1000000000 ||
3418		    timeout.tv_nsec < 0)
3419			return (EINVAL);
3420		ts = &timeout;
3421	}
3422	return do_lock_umutex(td, uap->obj, ts, 0);
3423}
3424
3425static int
3426__umtx_op_wait_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3427{
3428	struct timespec *ts, timeout;
3429	int error;
3430
3431	/* Allow a null timespec (wait forever). */
3432	if (uap->uaddr2 == NULL)
3433		ts = NULL;
3434	else {
3435		error = copyin_timeout32(uap->uaddr2, &timeout);
3436		if (error != 0)
3437			return (error);
3438		if (timeout.tv_nsec >= 1000000000 ||
3439		    timeout.tv_nsec < 0)
3440			return (EINVAL);
3441		ts = &timeout;
3442	}
3443	return do_lock_umutex(td, uap->obj, ts, _UMUTEX_WAIT);
3444}
3445
3446static int
3447__umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3448{
3449	struct timespec *ts, timeout;
3450	int error;
3451
3452	/* Allow a null timespec (wait forever). */
3453	if (uap->uaddr2 == NULL)
3454		ts = NULL;
3455	else {
3456		error = copyin_timeout32(uap->uaddr2, &timeout);
3457		if (error != 0)
3458			return (error);
3459		if (timeout.tv_nsec >= 1000000000 ||
3460		    timeout.tv_nsec < 0)
3461			return (EINVAL);
3462		ts = &timeout;
3463	}
3464	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3465}
3466
3467static int
3468__umtx_op_rw_rdlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3469{
3470	struct timespec timeout;
3471	int error;
3472
3473	/* Allow a null timespec (wait forever). */
3474	if (uap->uaddr2 == NULL) {
3475		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3476	} else {
3477		error = copyin_timeout32(uap->uaddr2, &timeout);
3478		if (error != 0)
3479			return (error);
3480		if (timeout.tv_nsec >= 1000000000 ||
3481		    timeout.tv_nsec < 0) {
3482			return (EINVAL);
3483		}
3484		error = do_rw_rdlock2(td, uap->obj, uap->val, &timeout);
3485	}
3486	return (error);
3487}
3488
3489static int
3490__umtx_op_rw_wrlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3491{
3492	struct timespec timeout;
3493	int error;
3494
3495	/* Allow a null timespec (wait forever). */
3496	if (uap->uaddr2 == NULL) {
3497		error = do_rw_wrlock(td, uap->obj, 0);
3498	} else {
3499		error = copyin_timeout32(uap->uaddr2, &timeout);
3500		if (error != 0)
3501			return (error);
3502		if (timeout.tv_nsec >= 1000000000 ||
3503		    timeout.tv_nsec < 0) {
3504			return (EINVAL);
3505		}
3506
3507		error = do_rw_wrlock2(td, uap->obj, &timeout);
3508	}
3509	return (error);
3510}
3511
3512static int
3513__umtx_op_wait_uint_private_compat32(struct thread *td, struct _umtx_op_args *uap)
3514{
3515	struct timespec *ts, timeout;
3516	int error;
3517
3518	if (uap->uaddr2 == NULL)
3519		ts = NULL;
3520	else {
3521		error = copyin_timeout32(uap->uaddr2, &timeout);
3522		if (error != 0)
3523			return (error);
3524		if (timeout.tv_nsec >= 1000000000 ||
3525		    timeout.tv_nsec < 0)
3526			return (EINVAL);
3527		ts = &timeout;
3528	}
3529	return do_wait(td, uap->obj, uap->val, ts, 1, 1);
3530}
3531
3532static int
3533__umtx_op_sem_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3534{
3535	struct timespec *ts, timeout;
3536	int error;
3537
3538	/* Allow a null timespec (wait forever). */
3539	if (uap->uaddr2 == NULL)
3540		ts = NULL;
3541	else {
3542		error = copyin_timeout32(uap->uaddr2, &timeout);
3543		if (error != 0)
3544			return (error);
3545		if (timeout.tv_nsec >= 1000000000 ||
3546		    timeout.tv_nsec < 0)
3547			return (EINVAL);
3548		ts = &timeout;
3549	}
3550	return (do_sem_wait(td, uap->obj, ts));
3551}
3552
3553static int
3554__umtx_op_nwake_private32(struct thread *td, struct _umtx_op_args *uap)
3555{
3556	int count = uap->val;
3557	uint32_t uaddrs[BATCH_SIZE];
3558	uint32_t **upp = (uint32_t **)uap->obj;
3559	int tocopy;
3560	int error = 0;
3561	int i, pos = 0;
3562
3563	while (count > 0) {
3564		tocopy = count;
3565		if (tocopy > BATCH_SIZE)
3566			tocopy = BATCH_SIZE;
3567		error = copyin(upp+pos, uaddrs, tocopy * sizeof(uint32_t));
3568		if (error != 0)
3569			break;
3570		for (i = 0; i < tocopy; ++i)
3571			kern_umtx_wake(td, (void *)(intptr_t)uaddrs[i],
3572				INT_MAX, 1);
3573		count -= tocopy;
3574		pos += tocopy;
3575	}
3576	return (error);
3577}
3578
3579static _umtx_op_func op_table_compat32[] = {
3580	__umtx_op_lock_umtx_compat32,	/* UMTX_OP_LOCK */
3581	__umtx_op_unlock_umtx_compat32,	/* UMTX_OP_UNLOCK */
3582	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT */
3583	__umtx_op_wake,			/* UMTX_OP_WAKE */
3584	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
3585	__umtx_op_lock_umutex_compat32,	/* UMTX_OP_MUTEX_LOCK */
3586	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK	*/
3587	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
3588	__umtx_op_cv_wait_compat32,	/* UMTX_OP_CV_WAIT */
3589	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
3590	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
3591	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT_UINT */
3592	__umtx_op_rw_rdlock_compat32,	/* UMTX_OP_RW_RDLOCK */
3593	__umtx_op_rw_wrlock_compat32,	/* UMTX_OP_RW_WRLOCK */
3594	__umtx_op_rw_unlock,		/* UMTX_OP_RW_UNLOCK */
3595	__umtx_op_wait_uint_private_compat32,	/* UMTX_OP_WAIT_UINT_PRIVATE */
3596	__umtx_op_wake_private,		/* UMTX_OP_WAKE_PRIVATE */
3597	__umtx_op_wait_umutex_compat32, /* UMTX_OP_UMUTEX_WAIT */
3598	__umtx_op_wake_umutex,		/* UMTX_OP_UMUTEX_WAKE */
3599	__umtx_op_sem_wait_compat32,	/* UMTX_OP_SEM_WAIT */
3600	__umtx_op_sem_wake,		/* UMTX_OP_SEM_WAKE */
3601	__umtx_op_nwake_private32	/* UMTX_OP_NWAKE_PRIVATE */
3602};
3603
3604int
3605freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap)
3606{
3607	if ((unsigned)uap->op < UMTX_OP_MAX)
3608		return (*op_table_compat32[uap->op])(td,
3609			(struct _umtx_op_args *)uap);
3610	return (EINVAL);
3611}
3612#endif
3613
3614void
3615umtx_thread_init(struct thread *td)
3616{
3617	td->td_umtxq = umtxq_alloc();
3618	td->td_umtxq->uq_thread = td;
3619}
3620
3621void
3622umtx_thread_fini(struct thread *td)
3623{
3624	umtxq_free(td->td_umtxq);
3625}
3626
3627/*
3628 * Called when a new thread is created, e.g. by fork().
3629 */
3630void
3631umtx_thread_alloc(struct thread *td)
3632{
3633	struct umtx_q *uq;
3634
3635	uq = td->td_umtxq;
3636	uq->uq_inherited_pri = PRI_MAX;
3637
3638	KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
3639	KASSERT(uq->uq_thread == td, ("uq_thread != td"));
3640	KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
3641	KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
3642}
3643
3644/*
3645 * exec() hook.
3646 */
3647static void
3648umtx_exec_hook(void *arg __unused, struct proc *p __unused,
3649	struct image_params *imgp __unused)
3650{
3651	umtx_thread_cleanup(curthread);
3652}
3653
3654/*
3655 * thread_exit() hook.
3656 */
3657void
3658umtx_thread_exit(struct thread *td)
3659{
3660	umtx_thread_cleanup(td);
3661}
3662
3663/*
3664 * Clean up umtx data.
3665 */
3666static void
3667umtx_thread_cleanup(struct thread *td)
3668{
3669	struct umtx_q *uq;
3670	struct umtx_pi *pi;
3671
3672	if ((uq = td->td_umtxq) == NULL)
3673		return;
3674
3675	mtx_lock_spin(&umtx_lock);
3676	uq->uq_inherited_pri = PRI_MAX;
3677	while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
3678		pi->pi_owner = NULL;
3679		TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
3680	}
3681	mtx_unlock_spin(&umtx_lock);
3682	thread_lock(td);
3683	sched_unlend_user_prio(td, PRI_MAX);
3684	thread_unlock(td);
3685}
3686