kern_umtx.c revision 234302
/*-
 * Copyright (c) 2004, David Xu <davidxu@freebsd.org>
 * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/kern_umtx.c 234302 2012-04-14 23:53:31Z davide $");

#include "opt_compat.h"
#include "opt_umtx_profiling.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/syscallsubr.h>
#include <sys/eventhandler.h>
#include <sys/umtx.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>

#include <machine/cpu.h>

#ifdef COMPAT_FREEBSD32
#include <compat/freebsd32/freebsd32_proto.h>
#endif

#define _UMUTEX_TRY		1
#define _UMUTEX_WAIT		2

/* Priority inheritance mutex info. */
struct umtx_pi {
	/* Owner thread */
	struct thread		*pi_owner;

	/* Reference count */
	int			pi_refcount;

	/* List entry to link PI mutexes held by a thread */
	TAILQ_ENTRY(umtx_pi)	pi_link;

	/* List entry in hash */
	TAILQ_ENTRY(umtx_pi)	pi_hashlink;

	/* List for waiters */
	TAILQ_HEAD(,umtx_q)	pi_blocked;

	/* Identify a userland lock object */
	struct umtx_key		pi_key;
};

/* A waiter on a userland synchronization object. */
struct umtx_q {
	/* Linked list for the hash. */
	TAILQ_ENTRY(umtx_q)	uq_link;

	/* Umtx key. */
	struct umtx_key		uq_key;

	/* Umtx flags. */
	int			uq_flags;
#define UQF_UMTXQ	0x0001

	/* The waiting thread. */
	struct thread		*uq_thread;

	/*
	 * Blocked on a PI mutex.  Reads may hold either the chain lock
	 * or umtx_lock; writes must hold both.
	 */
	struct umtx_pi		*uq_pi_blocked;

	/* On blocked list */
	TAILQ_ENTRY(umtx_q)	uq_lockq;

	/* List of PI mutexes owned by us and contested by other threads */
	TAILQ_HEAD(,umtx_pi)	uq_pi_contested;

	/* Inherited priority from PP mutex */
	u_char			uq_inherited_pri;

	/* Spare queue ready to be reused */
	struct umtxq_queue	*uq_spare_queue;

	/* The queue we are on */
	struct umtxq_queue	*uq_cur_queue;
};

TAILQ_HEAD(umtxq_head, umtx_q);

/* Per-key wait-queue */
struct umtxq_queue {
	struct umtxq_head	head;
	struct umtx_key		key;
	LIST_ENTRY(umtxq_queue)	link;
	int			length;
};

LIST_HEAD(umtxq_list, umtxq_queue);

/* Userland lock object's wait-queue chain */
struct umtxq_chain {
	/* Lock for this chain. */
	struct mtx		uc_lock;

	/* List of sleep queues. */
	struct umtxq_list	uc_queue[2];
#define UMTX_SHARED_QUEUE	0
#define UMTX_EXCLUSIVE_QUEUE	1

	LIST_HEAD(, umtxq_queue) uc_spare_queue;

	/* Busy flag */
	char			uc_busy;

	/* Chain lock waiters */
	int			uc_waiters;

	/* All PI mutexes hashed to this chain */
	TAILQ_HEAD(,umtx_pi)	uc_pi_list;

#ifdef UMTX_PROFILING
	int			length;
	int			max_length;
#endif
};

#define	UMTXQ_LOCKED_ASSERT(uc)		mtx_assert(&(uc)->uc_lock, MA_OWNED)
#define	UMTXQ_BUSY_ASSERT(uc)	KASSERT((uc)->uc_busy != 0, ("umtx chain is not busy"))

/*
 * Don't propagate time-sharing priority; there is a security reason:
 * a user could simply create a PI mutex, let thread A lock it, and
 * let another thread B block on it.  Because B is sleeping, its
 * priority would be boosted, and through priority propagation A's
 * priority would be boosted too and never lowered, even while A is
 * using 100% CPU, which is unfair to other processes.
 */

#define UPRI(td)	(((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\
			  (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\
			 PRI_MAX_TIMESHARE : (td)->td_user_pri)
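
/*
 * Illustration (exposition only, not used by the kernel): UPRI()
 * clamps any priority inside the time-sharing band to the weakest
 * value in that band, assuming smaller numbers mean stronger
 * priority:
 *
 *	// td_user_pri below PRI_MIN_TIMESHARE (e.g. real-time):
 *	//	UPRI(td) == td->td_user_pri	(passed through)
 *	// td_user_pri inside the time-sharing band:
 *	//	UPRI(td) == PRI_MAX_TIMESHARE	(clamped)
 *
 * So a time-sharing waiter can never lend a priority stronger than
 * PRI_MAX_TIMESHARE, which defeats the unbounded-boost attack
 * described above.
 */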

#define	GOLDEN_RATIO_PRIME	2654404609U
#define	UMTX_CHAINS		512
#define	UMTX_SHIFTS		(__WORD_BIT - 9)

#define	GET_SHARE(flags)	\
    (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)

#define BUSY_SPINS		200

struct abs_timeout {
	int clockid;
	struct timespec cur;
	struct timespec end;
};

static uma_zone_t		umtx_pi_zone;
static struct umtxq_chain	umtxq_chains[2][UMTX_CHAINS];
static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
static int			umtx_pi_allocated;

static SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
    &umtx_pi_allocated, 0, "Allocated umtx_pi");

#ifdef UMTX_PROFILING
static long max_length;
SYSCTL_LONG(_debug_umtx, OID_AUTO, max_length, CTLFLAG_RD, &max_length, 0, "max_length");
static SYSCTL_NODE(_debug_umtx, OID_AUTO, chains, CTLFLAG_RD, 0, "umtx chain stats");
#endif

static void umtxq_sysinit(void *);
static void umtxq_hash(struct umtx_key *key);
static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
static void umtxq_lock(struct umtx_key *key);
static void umtxq_unlock(struct umtx_key *key);
static void umtxq_busy(struct umtx_key *key);
static void umtxq_unbusy(struct umtx_key *key);
static void umtxq_insert_queue(struct umtx_q *uq, int q);
static void umtxq_remove_queue(struct umtx_q *uq, int q);
static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *);
static int umtxq_count(struct umtx_key *key);
static struct umtx_pi *umtx_pi_alloc(int);
static void umtx_pi_free(struct umtx_pi *pi);
static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags);
static void umtx_thread_cleanup(struct thread *td);
static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
	struct image_params *imgp __unused);
SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);

#define umtxq_signal(key, nwake)	umtxq_signal_queue((key), (nwake), UMTX_SHARED_QUEUE)
#define umtxq_insert(uq)	umtxq_insert_queue((uq), UMTX_SHARED_QUEUE)
#define umtxq_remove(uq)	umtxq_remove_queue((uq), UMTX_SHARED_QUEUE)

static struct mtx umtx_lock;

#ifdef UMTX_PROFILING
static void
umtx_init_profiling(void)
{
	struct sysctl_oid *chain_oid;
	char chain_name[10];
	int i;

	for (i = 0; i < UMTX_CHAINS; ++i) {
		snprintf(chain_name, sizeof(chain_name), "%d", i);
		chain_oid = SYSCTL_ADD_NODE(NULL,
		    SYSCTL_STATIC_CHILDREN(_debug_umtx_chains), OID_AUTO,
		    chain_name, CTLFLAG_RD, NULL, "umtx hash stats");
		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
		    "max_length0", CTLFLAG_RD, &umtxq_chains[0][i].max_length, 0, NULL);
		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
		    "max_length1", CTLFLAG_RD, &umtxq_chains[1][i].max_length, 0, NULL);
	}
}
#endif

static void
umtxq_sysinit(void *arg __unused)
{
	int i, j;

	umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	for (i = 0; i < 2; ++i) {
		for (j = 0; j < UMTX_CHAINS; ++j) {
			mtx_init(&umtxq_chains[i][j].uc_lock, "umtxql", NULL,
				 MTX_DEF | MTX_DUPOK);
			LIST_INIT(&umtxq_chains[i][j].uc_queue[0]);
			LIST_INIT(&umtxq_chains[i][j].uc_queue[1]);
			LIST_INIT(&umtxq_chains[i][j].uc_spare_queue);
			TAILQ_INIT(&umtxq_chains[i][j].uc_pi_list);
			umtxq_chains[i][j].uc_busy = 0;
			umtxq_chains[i][j].uc_waiters = 0;
#ifdef UMTX_PROFILING
			umtxq_chains[i][j].length = 0;
			umtxq_chains[i][j].max_length = 0;
#endif
		}
	}
#ifdef UMTX_PROFILING
	umtx_init_profiling();
#endif
	mtx_init(&umtx_lock, "umtx lock", NULL, MTX_SPIN);
	EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
	    EVENTHANDLER_PRI_ANY);
}

struct umtx_q *
umtxq_alloc(void)
{
	struct umtx_q *uq;

	uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
	uq->uq_spare_queue = malloc(sizeof(struct umtxq_queue), M_UMTX, M_WAITOK | M_ZERO);
	TAILQ_INIT(&uq->uq_spare_queue->head);
	TAILQ_INIT(&uq->uq_pi_contested);
	uq->uq_inherited_pri = PRI_MAX;
	return (uq);
}

void
umtxq_free(struct umtx_q *uq)
{
	MPASS(uq->uq_spare_queue != NULL);
	free(uq->uq_spare_queue, M_UMTX);
	free(uq, M_UMTX);
}

static inline void
umtxq_hash(struct umtx_key *key)
{
	unsigned n = (uintptr_t)key->info.both.a + key->info.both.b;
	key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
}
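
/*
 * Illustration (exposition only): the multiplicative hash above
 * spreads keys across the 512 chains.  A hypothetical sketch of the
 * same computation, assuming __WORD_BIT is 32:
 *
 *	unsigned n = addr_a + addr_b;		// both halves of the key
 *	unsigned h = (n * 2654404609U) >> 23;	// __WORD_BIT - 9 == 23
 *	unsigned chain = h % 512;		// UMTX_CHAINS
 *
 * Multiplying by a constant near 2^32/phi scrambles the low,
 * mostly-aligned address bits into the high bits, which the shift
 * then selects.
 */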

static inline struct umtxq_chain *
umtxq_getchain(struct umtx_key *key)
{
	if (key->type <= TYPE_SEM)
		return (&umtxq_chains[1][key->hash]);
	return (&umtxq_chains[0][key->hash]);
}

/*
 * Lock a chain.
 */
static inline void
umtxq_lock(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_lock(&uc->uc_lock);
}

/*
 * Unlock a chain.
 */
static inline void
umtxq_unlock(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_unlock(&uc->uc_lock);
}

/*
 * Set the chain to the busy state when a following operation
 * may block (a kernel mutex cannot be used).
 */
static inline void
umtxq_busy(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_assert(&uc->uc_lock, MA_OWNED);
	if (uc->uc_busy) {
#ifdef SMP
		if (smp_cpus > 1) {
			int count = BUSY_SPINS;
			if (count > 0) {
				umtxq_unlock(key);
				while (uc->uc_busy && --count > 0)
					cpu_spinwait();
				umtxq_lock(key);
			}
		}
#endif
		while (uc->uc_busy) {
			uc->uc_waiters++;
			msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
			uc->uc_waiters--;
		}
	}
	uc->uc_busy = 1;
}

/*
 * Unbusy a chain.
 */
static inline void
umtxq_unbusy(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_assert(&uc->uc_lock, MA_OWNED);
	KASSERT(uc->uc_busy != 0, ("not busy"));
	uc->uc_busy = 0;
	if (uc->uc_waiters)
		wakeup_one(uc);
}
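
/*
 * Usage sketch (exposition only): the busy bit serializes slow paths
 * that must drop the chain mutex to touch pageable userland memory.
 * The canonical calling pattern throughout this file is:
 *
 *	umtxq_lock(&key);
 *	umtxq_busy(&key);	// may sleep until the chain is unbusied
 *	umtxq_unlock(&key);
 *	// ... fault-prone work on the user word (casuword32 etc.) ...
 *	umtxq_lock(&key);
 *	umtxq_unbusy(&key);	// wakes one sleeper in umtxq_busy()
 *	umtxq_unlock(&key);
 */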

static struct umtxq_queue *
umtxq_queue_lookup(struct umtx_key *key, int q)
{
	struct umtxq_queue *uh;
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	LIST_FOREACH(uh, &uc->uc_queue[q], link) {
		if (umtx_key_match(&uh->key, key))
			return (uh);
	}

	return (NULL);
}

static inline void
umtxq_insert_queue(struct umtx_q *uq, int q)
{
	struct umtxq_queue *uh;
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	KASSERT((uq->uq_flags & UQF_UMTXQ) == 0, ("umtx_q is already on queue"));
	uh = umtxq_queue_lookup(&uq->uq_key, q);
	if (uh != NULL) {
		LIST_INSERT_HEAD(&uc->uc_spare_queue, uq->uq_spare_queue, link);
	} else {
		uh = uq->uq_spare_queue;
		uh->key = uq->uq_key;
		LIST_INSERT_HEAD(&uc->uc_queue[q], uh, link);
	}
	uq->uq_spare_queue = NULL;

	TAILQ_INSERT_TAIL(&uh->head, uq, uq_link);
	uh->length++;
#ifdef UMTX_PROFILING
	uc->length++;
	if (uc->length > uc->max_length) {
		uc->max_length = uc->length;
		if (uc->max_length > max_length)
			max_length = uc->max_length;
	}
#endif
	uq->uq_flags |= UQF_UMTXQ;
	uq->uq_cur_queue = uh;
}

static inline void
umtxq_remove_queue(struct umtx_q *uq, int q)
{
	struct umtxq_chain *uc;
	struct umtxq_queue *uh;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	if (uq->uq_flags & UQF_UMTXQ) {
		uh = uq->uq_cur_queue;
		TAILQ_REMOVE(&uh->head, uq, uq_link);
		uh->length--;
#ifdef UMTX_PROFILING
		uc->length--;
#endif
		uq->uq_flags &= ~UQF_UMTXQ;
		if (TAILQ_EMPTY(&uh->head)) {
			KASSERT(uh->length == 0,
			    ("inconsistent umtxq_queue length"));
			LIST_REMOVE(uh, link);
		} else {
			uh = LIST_FIRST(&uc->uc_spare_queue);
			KASSERT(uh != NULL, ("uc_spare_queue is empty"));
			LIST_REMOVE(uh, link);
		}
		uq->uq_spare_queue = uh;
		uq->uq_cur_queue = NULL;
	}
}

/*
 * Return the number of waiters on the shared queue.
 */
static int
umtxq_count(struct umtx_key *key)
{
	struct umtxq_chain *uc;
	struct umtxq_queue *uh;

	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
	if (uh != NULL)
		return (uh->length);
	return (0);
}

/*
 * Return the number of PI waiters and, through *first, the
 * first waiter.
 */
498umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
499{
500	struct umtxq_chain *uc;
501	struct umtxq_queue *uh;
502
503	*first = NULL;
504	uc = umtxq_getchain(key);
505	UMTXQ_LOCKED_ASSERT(uc);
506	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
507	if (uh != NULL) {
508		*first = TAILQ_FIRST(&uh->head);
509		return (uh->length);
510	}
511	return (0);
512}
513

/*
 * Wake up threads waiting on a userland object.
 */
519umtxq_signal_queue(struct umtx_key *key, int n_wake, int q)
520{
521	struct umtxq_chain *uc;
522	struct umtxq_queue *uh;
523	struct umtx_q *uq;
524	int ret;
525
526	ret = 0;
527	uc = umtxq_getchain(key);
528	UMTXQ_LOCKED_ASSERT(uc);
529	uh = umtxq_queue_lookup(key, q);
530	if (uh != NULL) {
531		while ((uq = TAILQ_FIRST(&uh->head)) != NULL) {
532			umtxq_remove_queue(uq, q);
533			wakeup(uq);
534			if (++ret >= n_wake)
535				return (ret);
536		}
537	}
538	return (ret);
539}
540
541
/*
 * Wake up the specified thread.
 */
static inline void
umtxq_signal_thread(struct umtx_q *uq)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	umtxq_remove(uq);
	wakeup(uq);
}

static inline int
tstohz(const struct timespec *tsp)
{
	struct timeval tv;

	TIMESPEC_TO_TIMEVAL(&tv, tsp);
	return (tvtohz(&tv));
}

static void
abs_timeout_init(struct abs_timeout *timo, int clockid, int absolute,
	const struct timespec *timeout)
{

	timo->clockid = clockid;
	if (!absolute) {
		kern_clock_gettime(curthread, clockid, &timo->end);
		timo->cur = timo->end;
		timespecadd(&timo->end, timeout);
	} else {
		timo->end = *timeout;
		kern_clock_gettime(curthread, clockid, &timo->cur);
	}
}

static void
abs_timeout_init2(struct abs_timeout *timo, const struct _umtx_time *umtxtime)
{

	abs_timeout_init(timo, umtxtime->_clockid,
		(umtxtime->_flags & UMTX_ABSTIME) != 0,
		&umtxtime->_timeout);
}

static int
abs_timeout_update(struct abs_timeout *timo)
{
	kern_clock_gettime(curthread, timo->clockid, &timo->cur);
	return (timespeccmp(&timo->cur, &timo->end, >=));
}

static int
abs_timeout_gethz(struct abs_timeout *timo)
{
	struct timespec tts;

	tts = timo->end;
	timespecsub(&tts, &timo->cur);
	return (tstohz(&tts));
}
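
/*
 * Usage sketch (exposition only): callers normalize both relative and
 * absolute user timeouts into an end time on the caller's clock, then
 * refresh the current time on every spurious wakeup.  A hypothetical
 * caller:
 *
 *	struct abs_timeout timo;
 *
 *	abs_timeout_init(&timo, CLOCK_MONOTONIC, 0, &ts);  // relative
 *	for (;;) {
 *		// ... msleep(..., abs_timeout_gethz(&timo)) ...
 *		if (abs_timeout_update(&timo))
 *			return (ETIMEDOUT);	// now >= end
 *	}
 *
 * abs_timeout_gethz() converts whatever remains (end - cur) into
 * scheduler ticks for msleep(9).
 */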

/*
 * Put the thread into a sleep state; before sleeping, check whether
 * the thread was removed from the umtx queue.
 */
static inline int
umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *timo)
{
	struct umtxq_chain *uc;
	int error;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	for (;;) {
		if (!(uq->uq_flags & UQF_UMTXQ))
			return (0);
		error = msleep(uq, &uc->uc_lock, PCATCH, wmesg,
		    timo == NULL ? 0 : abs_timeout_gethz(timo));
		if (error != EWOULDBLOCK)
			break;
		umtxq_unlock(&uq->uq_key);
		if (abs_timeout_update(timo)) {
			error = ETIMEDOUT;
			umtxq_lock(&uq->uq_key);
			break;
		}
		umtxq_lock(&uq->uq_key);
	}
	return (error);
}

/*
 * Convert a userspace address into a unique logical address.
 */
int
umtx_key_get(void *addr, int type, int share, struct umtx_key *key)
{
	struct thread *td = curthread;
	vm_map_t map;
	vm_map_entry_t entry;
	vm_pindex_t pindex;
	vm_prot_t prot;
	boolean_t wired;

	key->type = type;
	if (share == THREAD_SHARE) {
		key->shared = 0;
		key->info.private.vs = td->td_proc->p_vmspace;
		key->info.private.addr = (uintptr_t)addr;
	} else {
		MPASS(share == PROCESS_SHARE || share == AUTO_SHARE);
		map = &td->td_proc->p_vmspace->vm_map;
		if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
		    &entry, &key->info.shared.object, &pindex, &prot,
		    &wired) != KERN_SUCCESS) {
			return (EFAULT);
		}

		if ((share == PROCESS_SHARE) ||
		    (share == AUTO_SHARE &&
		     entry->inheritance == VM_INHERIT_SHARE)) {
			key->shared = 1;
			key->info.shared.offset = entry->offset + entry->start -
				(vm_offset_t)addr;
			vm_object_reference(key->info.shared.object);
		} else {
			key->shared = 0;
			key->info.private.vs = td->td_proc->p_vmspace;
			key->info.private.addr = (uintptr_t)addr;
		}
		vm_map_lookup_done(map, entry);
	}

	umtxq_hash(key);
	return (0);
}
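
/*
 * Usage sketch (exposition only): every kernel-side operation first
 * folds the user address into a key, works under the hashed chain,
 * and then releases the key (dropping the VM object reference for
 * shared keys).  A hypothetical caller:
 *
 *	struct umtx_key key;
 *	int error;
 *
 *	error = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT, AUTO_SHARE, &key);
 *	if (error != 0)
 *		return (error);
 *	umtxq_lock(&key);
 *	// ... queue work keyed by (object, offset) or (vmspace, addr),
 *	// so a shared mapping wakes waiters in every process ...
 *	umtxq_unlock(&key);
 *	umtx_key_release(&key);
 */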

/*
 * Release key.
 */
void
umtx_key_release(struct umtx_key *key)
{
	if (key->shared)
		vm_object_deallocate(key->info.shared.object);
}

/*
 * Lock a umtx object.
 */
static int
do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id,
	const struct timespec *timeout)
{
	struct abs_timeout timo;
	struct umtx_q *uq;
	u_long owner;
	u_long old;
	int error = 0;

	uq = td->td_umtxq;
	if (timeout != NULL)
		abs_timeout_init(&timo, CLOCK_REALTIME, 0, timeout);

	/*
	 * Care must be exercised when dealing with the umtx structure.
	 * It can fault on any access.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword(&umtx->u_owner, UMTX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMTX_UNOWNED)
			return (0);

		/* The address was invalid. */
		if (owner == -1)
			return (EFAULT);

		/* If no one owns it but it is contested, try to acquire it. */
		if (owner == UMTX_CONTESTED) {
			owner = casuword(&umtx->u_owner,
			    UMTX_CONTESTED, id | UMTX_CONTESTED);

			if (owner == UMTX_CONTESTED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			/* If this failed the lock has changed, restart. */
			continue;
		}

		/*
		 * If we caught a signal, we have already retried, so
		 * exit immediately.
		 */
		if (error != 0)
			break;

		if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK,
			AUTO_SHARE, &uq->uq_key)) != 0)
			return (error);

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword(&umtx->u_owner, owner, owner | UMTX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race with the
		 * thread unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtx", timeout == NULL ? NULL :
			    &timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);
	}

	if (timeout == NULL) {
		/* Mutex locking is restarted if it is interrupted. */
		if (error == EINTR)
			error = ERESTART;
	} else {
		/* Timed locking is not restarted. */
		if (error == ERESTART)
			error = EINTR;
	}
	return (error);
}
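
/*
 * Userland counterpart (a hedged sketch, not from this file): the
 * "uncontested case" above is meant to be handled entirely in user
 * space; only contention enters the kernel.  Assuming the historical
 * _umtx_lock(2) interface and <machine/atomic.h>:
 *
 *	#include <sys/types.h>
 *	#include <sys/umtx.h>
 *	#include <machine/atomic.h>
 *
 *	void
 *	my_umtx_lock(struct umtx *mtx, u_long tid)
 *	{
 *		// Fast path: UMTX_UNOWNED -> tid, no syscall.
 *		if (atomic_cmpset_acq_long(&mtx->u_owner, UMTX_UNOWNED, tid))
 *			return;
 *		_umtx_lock(mtx);	// contested: sleep in the kernel
 *	}
 *
 * _umtx_lock() ends up in do_lock_umtx() above, which retries the CAS,
 * sets UMTX_CONTESTED, and sleeps on the hashed wait queue.
 */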

/*
 * Unlock a umtx object.
 */
static int
do_unlock_umtx(struct thread *td, struct umtx *umtx, u_long id)
{
	struct umtx_key key;
	u_long owner;
	u_long old;
	int error;
	int count;

	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword(__DEVOLATILE(u_long *, &umtx->u_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMTX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMTX_CONTESTED) == 0) {
		old = casuword(&umtx->u_owner, owner, UMTX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK, AUTO_SHARE,
		&key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * at most one thread is waiting for it.
	 * Otherwise, it must be marked as contested.
	 */
	old = casuword(&umtx->u_owner, owner,
		count <= 1 ? UMTX_UNOWNED : UMTX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}

#ifdef COMPAT_FREEBSD32

/*
 * Lock a umtx object.
 */
static int
do_lock_umtx32(struct thread *td, uint32_t *m, uint32_t id,
	const struct timespec *timeout)
{
	struct abs_timeout timo;
	struct umtx_q *uq;
	uint32_t owner;
	uint32_t old;
	int error = 0;

	uq = td->td_umtxq;

	if (timeout != NULL)
		abs_timeout_init(&timo, CLOCK_REALTIME, 0, timeout);

	/*
	 * Care must be exercised when dealing with the umtx structure.
	 * It can fault on any access.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword32(m, UMUTEX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMUTEX_UNOWNED)
			return (0);

		/* The address was invalid. */
		if (owner == -1)
			return (EFAULT);

		/* If no one owns it but it is contested, try to acquire it. */
		if (owner == UMUTEX_CONTESTED) {
			owner = casuword32(m,
			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
			if (owner == UMUTEX_CONTESTED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			/* If this failed the lock has changed, restart. */
			continue;
		}

		/*
		 * If we caught a signal, we have already retried, so
		 * exit immediately.
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK,
			AUTO_SHARE, &uq->uq_key)) != 0)
			return (error);

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword32(m, owner, owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race with the
		 * thread unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtx", timeout == NULL ?
			    NULL : &timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);
	}

	if (timeout == NULL) {
		/* Mutex locking is restarted if it is interrupted. */
		if (error == EINTR)
			error = ERESTART;
	} else {
		/* Timed locking is not restarted. */
		if (error == ERESTART)
			error = EINTR;
	}
	return (error);
}

/*
 * Unlock a umtx object.
 */
static int
do_unlock_umtx32(struct thread *td, uint32_t *m, uint32_t id)
{
	struct umtx_key key;
	uint32_t owner;
	uint32_t old;
	int error;
	int count;

	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(m);
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMUTEX_CONTESTED) == 0) {
		old = casuword32(m, owner, UMUTEX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK, AUTO_SHARE,
		&key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * at most one thread is waiting for it.
	 * Otherwise, it must be marked as contested.
	 */
	old = casuword32(m, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}
#endif

/*
 * Fetch and compare a value; sleep on the address if the value
 * has not changed.
 */
static int
do_wait(struct thread *td, void *addr, u_long id,
	struct _umtx_time *timeout, int compat32, int is_private)
{
	struct abs_timeout timo;
	struct umtx_q *uq;
	u_long tmp;
	int error = 0;

	uq = td->td_umtxq;
	if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT,
		is_private ? THREAD_SHARE : AUTO_SHARE, &uq->uq_key)) != 0)
		return (error);

	if (timeout != NULL)
		abs_timeout_init2(&timo, timeout);

	umtxq_lock(&uq->uq_key);
	umtxq_insert(uq);
	umtxq_unlock(&uq->uq_key);
	if (compat32 == 0)
		tmp = fuword(addr);
	else
		tmp = (unsigned int)fuword32(addr);
	umtxq_lock(&uq->uq_key);
	if (tmp == id)
		error = umtxq_sleep(uq, "uwait", timeout == NULL ?
		    NULL : &timo);
	if ((uq->uq_flags & UQF_UMTXQ) == 0)
		error = 0;
	else
		umtxq_remove(uq);
	umtxq_unlock(&uq->uq_key);
	umtx_key_release(&uq->uq_key);
	if (error == ERESTART)
		error = EINTR;
	return (error);
}

/*
 * Wake up threads sleeping on the specified address.
 */
int
kern_umtx_wake(struct thread *td, void *uaddr, int n_wake, int is_private)
{
	struct umtx_key key;
	int ret;

	if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT,
		is_private ? THREAD_SHARE : AUTO_SHARE, &key)) != 0)
		return (ret);
	umtxq_lock(&key);
	ret = umtxq_signal(&key, n_wake);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	return (0);
}
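
/*
 * Userland view (a hedged sketch, not from this file): do_wait() and
 * kern_umtx_wake() form a futex-style compare-and-sleep / wake pair,
 * reached through _umtx_op(2) with UMTX_OP_WAIT_UINT and UMTX_OP_WAKE.
 * A hypothetical one-shot event built on them:
 *
 *	#include <sys/types.h>
 *	#include <sys/umtx.h>
 *	#include <limits.h>
 *
 *	volatile u_int event = 0;
 *
 *	void
 *	event_wait(void)
 *	{
 *		while (event == 0)	// kernel sleeps only if *addr == 0
 *			_umtx_op(__DEVOLATILE(u_int *, &event),
 *			    UMTX_OP_WAIT_UINT, 0, NULL, NULL);
 *	}
 *
 *	void
 *	event_fire(void)
 *	{
 *		event = 1;
 *		_umtx_op(__DEVOLATILE(u_int *, &event),
 *		    UMTX_OP_WAKE, INT_MAX, NULL, NULL);
 *	}
 *
 * The re-check loop matters: the kernel compares the word against the
 * passed value before sleeping, so a wake between the check and the
 * sleep is never lost.
 */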

/*
 * Lock a PTHREAD_PRIO_NONE protocol POSIX mutex.
 */
static int
do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags,
	struct _umtx_time *timeout, int mode)
{
	struct abs_timeout timo;
	struct umtx_q *uq;
	uint32_t owner, old, id;
	int error = 0;

	id = td->td_tid;
	uq = td->td_umtxq;

	if (timeout != NULL)
		abs_timeout_init2(&timo, timeout);

	/*
	 * Care must be exercised when dealing with the umtx structure.
	 * It can fault on any access.
	 */
	for (;;) {
		owner = fuword32(__DEVOLATILE(void *, &m->m_owner));
		if (mode == _UMUTEX_WAIT) {
			if (owner == UMUTEX_UNOWNED || owner == UMUTEX_CONTESTED)
				return (0);
		} else {
			/*
			 * Try the uncontested case.  This should be done
			 * in userland.
			 */
			owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);

			/* The acquire succeeded. */
			if (owner == UMUTEX_UNOWNED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			/*
			 * If no one owns it but it is contested, try to
			 * acquire it.
			 */
			if (owner == UMUTEX_CONTESTED) {
				owner = casuword32(&m->m_owner,
				    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

				if (owner == UMUTEX_CONTESTED)
					return (0);

				/* The address was invalid. */
				if (owner == -1)
					return (EFAULT);

				/*
				 * If this failed the lock has changed,
				 * restart.
				 */
				continue;
			}
		}

		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
		    (owner & ~UMUTEX_CONTESTED) == id)
			return (EDEADLK);

		if (mode == _UMUTEX_TRY)
			return (EBUSY);

		/*
		 * If we caught a signal, we have already retried, so
		 * exit immediately.
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
		    GET_SHARE(flags), &uq->uq_key)) != 0)
			return (error);

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unbusy(&uq->uq_key);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race with the
		 * thread unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		umtxq_unbusy(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtxn", timeout == NULL ?
			    NULL : &timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);
	}

	return (0);
}

/*
 * Unlock a PTHREAD_PRIO_NONE protocol POSIX mutex.
 */
static int
do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags)
{
	struct umtx_key key;
	uint32_t owner, old, id;
	int error;
	int count;

	id = td->td_tid;
	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	if ((owner & UMUTEX_CONTESTED) == 0) {
		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * at most one thread is waiting for it.
	 * Otherwise, it must be marked as contested.
	 */
	old = casuword32(&m->m_owner, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}

/*
 * Check whether the mutex is available and wake up a waiter;
 * for simple mutexes only.
 */
static int
do_wake_umutex(struct thread *td, struct umutex *m)
{
	struct umtx_key key;
	uint32_t owner;
	uint32_t flags;
	int error;
	int count;

	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != 0)
		return (0);

	flags = fuword32(&m->m_flags);

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	if (count <= 1)
		owner = casuword32(&m->m_owner, UMUTEX_CONTESTED, UMUTEX_UNOWNED);

	umtxq_lock(&key);
	if (count != 0 && (owner & ~UMUTEX_CONTESTED) == 0)
		umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	return (0);
}

/*
 * Check whether the mutex has waiters and try to repair the
 * contested bit.
 */
static int
do_wake2_umutex(struct thread *td, struct umutex *m, uint32_t flags)
{
	struct umtx_key key;
	uint32_t owner, old;
	int type;
	int error;
	int count;

	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
	case 0:
		type = TYPE_NORMAL_UMUTEX;
		break;
	case UMUTEX_PRIO_INHERIT:
		type = TYPE_PI_UMUTEX;
		break;
	case UMUTEX_PRIO_PROTECT:
		type = TYPE_PP_UMUTEX;
		break;
	default:
		return (EINVAL);
	}
	if ((error = umtx_key_get(m, type, GET_SHARE(flags),
	    &key)) != 0)
		return (error);

	owner = 0;
	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);
	/*
	 * Only repair the contested bit if there is a waiter; this means
	 * the mutex is still being referenced by userland code.  Otherwise
	 * don't update any memory.
	 */
	if (count > 1) {
		owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
		while ((owner & UMUTEX_CONTESTED) == 0) {
			old = casuword32(&m->m_owner, owner,
			    owner | UMUTEX_CONTESTED);
			if (old == owner)
				break;
			owner = old;
		}
	} else if (count == 1) {
		owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
		while ((owner & ~UMUTEX_CONTESTED) != 0 &&
		       (owner & UMUTEX_CONTESTED) == 0) {
			old = casuword32(&m->m_owner, owner,
			    owner | UMUTEX_CONTESTED);
			if (old == owner)
				break;
			owner = old;
		}
	}
	umtxq_lock(&key);
	if (owner == -1) {
		error = EFAULT;
		umtxq_signal(&key, INT_MAX);
	} else if (count != 0 && (owner & ~UMUTEX_CONTESTED) == 0)
		umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	return (error);
}

static inline struct umtx_pi *
umtx_pi_alloc(int flags)
{
	struct umtx_pi *pi;

	pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags);
	TAILQ_INIT(&pi->pi_blocked);
	atomic_add_int(&umtx_pi_allocated, 1);
	return (pi);
}

static inline void
umtx_pi_free(struct umtx_pi *pi)
{
	uma_zfree(umtx_pi_zone, pi);
	atomic_add_int(&umtx_pi_allocated, -1);
}

/*
 * Adjust the thread's position on the PI mutex's blocked queue after
 * its priority has been changed.
 */
static int
umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
{
	struct umtx_q *uq, *uq1, *uq2;
	struct thread *td1;

	mtx_assert(&umtx_lock, MA_OWNED);
	if (pi == NULL)
		return (0);

	uq = td->td_umtxq;

	/*
	 * Check if the thread needs to be moved in the blocked chain.
	 * It needs to be moved if either its priority is lower than
	 * the previous thread's or higher than the next thread's.
	 */
	uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
	uq2 = TAILQ_NEXT(uq, uq_lockq);
	if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
	    (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
		/*
		 * Remove thread from blocked chain and determine where
		 * it should be moved to.
		 */
		TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
		TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
			td1 = uq1->uq_thread;
			MPASS(td1->td_proc->p_magic == P_MAGIC);
			if (UPRI(td1) > UPRI(td))
				break;
		}

		if (uq1 == NULL)
			TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
		else
			TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
	}
	return (1);
}

/*
 * Propagate priority when a thread is blocked on a POSIX
 * PI mutex.
 */
static void
umtx_propagate_priority(struct thread *td)
{
	struct umtx_q *uq;
	struct umtx_pi *pi;
	int pri;

	mtx_assert(&umtx_lock, MA_OWNED);
	pri = UPRI(td);
	uq = td->td_umtxq;
	pi = uq->uq_pi_blocked;
	if (pi == NULL)
		return;

	for (;;) {
		td = pi->pi_owner;
		if (td == NULL || td == curthread)
			return;

		MPASS(td->td_proc != NULL);
		MPASS(td->td_proc->p_magic == P_MAGIC);

		thread_lock(td);
		if (td->td_lend_user_pri > pri)
			sched_lend_user_prio(td, pri);
		else {
			thread_unlock(td);
			break;
		}
		thread_unlock(td);

		/*
		 * Pick up the lock that td is blocked on.
		 */
		uq = td->td_umtxq;
		pi = uq->uq_pi_blocked;
		if (pi == NULL)
			break;
		/* Resort td on the list if needed. */
		umtx_pi_adjust_thread(pi, td);
	}
}
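
/*
 * Worked example (exposition only; priorities are illustrative):
 * suppose C (UPRI 140) owns PI mutex M2 and is itself blocked on M1,
 * which A (UPRI 180) owns, and B (UPRI 120) now blocks on M2:
 *
 *	B (120) --blocks--> M2 --owned by--> C (140)
 *	C       --blocks--> M1 --owned by--> A (180)
 *
 * umtx_propagate_priority(B) walks the chain: C's lent priority drops
 * to 120 (numerically lower is stronger), then via C's uq_pi_blocked
 * it reaches M1 and lends 120 to A as well.  The walk stops when an
 * owner already runs at least that strongly, or is not blocked on a
 * PI mutex.
 */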

/*
 * Unpropagate priority for a PI mutex when a thread blocked on
 * it is interrupted by a signal or resumed by others.
 */
static void
umtx_repropagate_priority(struct umtx_pi *pi)
{
	struct umtx_q *uq, *uq_owner;
	struct umtx_pi *pi2;
	int pri;

	mtx_assert(&umtx_lock, MA_OWNED);

	while (pi != NULL && pi->pi_owner != NULL) {
		pri = PRI_MAX;
		uq_owner = pi->pi_owner->td_umtxq;

		TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
			uq = TAILQ_FIRST(&pi2->pi_blocked);
			if (uq != NULL) {
				if (pri > UPRI(uq->uq_thread))
					pri = UPRI(uq->uq_thread);
			}
		}

		if (pri > uq_owner->uq_inherited_pri)
			pri = uq_owner->uq_inherited_pri;
		thread_lock(pi->pi_owner);
		sched_lend_user_prio(pi->pi_owner, pri);
		thread_unlock(pi->pi_owner);
		if ((pi = uq_owner->uq_pi_blocked) != NULL)
			umtx_pi_adjust_thread(pi, uq_owner->uq_thread);
	}
}

/*
 * Insert a PI mutex into the owner thread's owned list.
 */
static void
umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
{
	struct umtx_q *uq_owner;

	uq_owner = owner->td_umtxq;
	mtx_assert(&umtx_lock, MA_OWNED);
	if (pi->pi_owner != NULL)
		panic("pi_owner != NULL");
	pi->pi_owner = owner;
	TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
}

/*
 * Claim ownership of a PI mutex.
 */
static int
umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
{
	struct umtx_q *uq, *uq_owner;

	uq_owner = owner->td_umtxq;
	mtx_lock_spin(&umtx_lock);
	if (pi->pi_owner == owner) {
		mtx_unlock_spin(&umtx_lock);
		return (0);
	}

	if (pi->pi_owner != NULL) {
		/*
		 * userland may have already messed up the mutex, sigh.
		 */
		mtx_unlock_spin(&umtx_lock);
		return (EPERM);
	}
	umtx_pi_setowner(pi, owner);
	uq = TAILQ_FIRST(&pi->pi_blocked);
	if (uq != NULL) {
		int pri;

		pri = UPRI(uq->uq_thread);
		thread_lock(owner);
		if (pri < UPRI(owner))
			sched_lend_user_prio(owner, pri);
		thread_unlock(owner);
	}
	mtx_unlock_spin(&umtx_lock);
	return (0);
}

/*
 * Adjust a thread's position in the blocked queue of the PI mutex
 * it is blocked on; this may start a new round of priority
 * propagation.
 */
void
umtx_pi_adjust(struct thread *td, u_char oldpri)
{
	struct umtx_q *uq;
	struct umtx_pi *pi;

	uq = td->td_umtxq;
	mtx_lock_spin(&umtx_lock);
	/*
	 * Pick up the lock that td is blocked on.
	 */
	pi = uq->uq_pi_blocked;
	if (pi != NULL) {
		umtx_pi_adjust_thread(pi, td);
		umtx_repropagate_priority(pi);
	}
	mtx_unlock_spin(&umtx_lock);
}

/*
 * Sleep on a PI mutex.
 */
static int
umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi,
	uint32_t owner, const char *wmesg, struct abs_timeout *timo)
{
	struct umtxq_chain *uc;
	struct thread *td, *td1;
	struct umtx_q *uq1;
	int pri;
	int error = 0;

	td = uq->uq_thread;
	KASSERT(td == curthread, ("inconsistent uq_thread"));
	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	UMTXQ_BUSY_ASSERT(uc);
	umtxq_insert(uq);
	mtx_lock_spin(&umtx_lock);
	if (pi->pi_owner == NULL) {
		mtx_unlock_spin(&umtx_lock);
		/* XXX Only look up threads in the current process. */
		td1 = tdfind(owner, curproc->p_pid);
		mtx_lock_spin(&umtx_lock);
		if (td1 != NULL) {
			if (pi->pi_owner == NULL)
				umtx_pi_setowner(pi, td1);
			PROC_UNLOCK(td1->td_proc);
		}
	}

	TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
		pri = UPRI(uq1->uq_thread);
		if (pri > UPRI(td))
			break;
	}

	if (uq1 != NULL)
		TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
	else
		TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);

	uq->uq_pi_blocked = pi;
	thread_lock(td);
	td->td_flags |= TDF_UPIBLOCKED;
	thread_unlock(td);
	umtx_propagate_priority(td);
	mtx_unlock_spin(&umtx_lock);
	umtxq_unbusy(&uq->uq_key);

	error = umtxq_sleep(uq, wmesg, timo);
	umtxq_remove(uq);

	mtx_lock_spin(&umtx_lock);
	uq->uq_pi_blocked = NULL;
	thread_lock(td);
	td->td_flags &= ~TDF_UPIBLOCKED;
	thread_unlock(td);
	TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
	umtx_repropagate_priority(pi);
	mtx_unlock_spin(&umtx_lock);
	umtxq_unlock(&uq->uq_key);

	return (error);
}

/*
 * Increase the reference count of a PI mutex.
 */
static void
umtx_pi_ref(struct umtx_pi *pi)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&pi->pi_key);
	UMTXQ_LOCKED_ASSERT(uc);
	pi->pi_refcount++;
}

/*
 * Decrease the reference count of a PI mutex; if the count drops
 * to zero, its memory is freed.
 */
static void
umtx_pi_unref(struct umtx_pi *pi)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&pi->pi_key);
	UMTXQ_LOCKED_ASSERT(uc);
	KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
	if (--pi->pi_refcount == 0) {
		mtx_lock_spin(&umtx_lock);
		if (pi->pi_owner != NULL) {
			TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested,
				pi, pi_link);
			pi->pi_owner = NULL;
		}
		KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
			("blocked queue not empty"));
		mtx_unlock_spin(&umtx_lock);
		TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
		umtx_pi_free(pi);
	}
}

/*
 * Find a PI mutex in the hash table.
 */
static struct umtx_pi *
umtx_pi_lookup(struct umtx_key *key)
{
	struct umtxq_chain *uc;
	struct umtx_pi *pi;

	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);

	TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
		if (umtx_key_match(&pi->pi_key, key)) {
			return (pi);
		}
	}
	return (NULL);
}

/*
 * Insert a PI mutex into the hash table.
 */
static inline void
umtx_pi_insert(struct umtx_pi *pi)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&pi->pi_key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
}

/*
 * Lock a PI mutex.
 */
static int
do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags,
    struct _umtx_time *timeout, int try)
{
	struct abs_timeout timo;
	struct umtx_q *uq;
	struct umtx_pi *pi, *new_pi;
	uint32_t id, owner, old;
	int error;

	id = td->td_tid;
	uq = td->td_umtxq;

	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
	    &uq->uq_key)) != 0)
		return (error);

	if (timeout != NULL)
		abs_timeout_init2(&timo, timeout);

	umtxq_lock(&uq->uq_key);
	pi = umtx_pi_lookup(&uq->uq_key);
	if (pi == NULL) {
		new_pi = umtx_pi_alloc(M_NOWAIT);
		if (new_pi == NULL) {
			umtxq_unlock(&uq->uq_key);
			new_pi = umtx_pi_alloc(M_WAITOK);
			umtxq_lock(&uq->uq_key);
			pi = umtx_pi_lookup(&uq->uq_key);
			if (pi != NULL) {
				umtx_pi_free(new_pi);
				new_pi = NULL;
			}
		}
		if (new_pi != NULL) {
			new_pi->pi_key = uq->uq_key;
			umtx_pi_insert(new_pi);
			pi = new_pi;
		}
	}
	umtx_pi_ref(pi);
	umtxq_unlock(&uq->uq_key);

	/*
	 * Care must be exercised when dealing with the umtx structure.
	 * It can fault on any access.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMUTEX_UNOWNED) {
			error = 0;
			break;
		}

		/* The address was invalid. */
		if (owner == -1) {
			error = EFAULT;
			break;
		}

		/* If no one owns it but it is contested, try to acquire it. */
		if (owner == UMUTEX_CONTESTED) {
			owner = casuword32(&m->m_owner,
			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

			if (owner == UMUTEX_CONTESTED) {
				umtxq_lock(&uq->uq_key);
				umtxq_busy(&uq->uq_key);
				error = umtx_pi_claim(pi, td);
				umtxq_unbusy(&uq->uq_key);
				umtxq_unlock(&uq->uq_key);
				break;
			}

			/* The address was invalid. */
			if (owner == -1) {
				error = EFAULT;
				break;
			}

			/* If this failed the lock has changed, restart. */
			continue;
		}

		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
		    (owner & ~UMUTEX_CONTESTED) == id) {
			error = EDEADLK;
			break;
		}

		if (try != 0) {
			error = EBUSY;
			break;
		}

		/*
		 * If we caught a signal, we have already retried, so
		 * exit immediately.
		 */
		if (error != 0)
			break;

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_unbusy(&uq->uq_key);
			umtxq_unlock(&uq->uq_key);
			error = EFAULT;
			break;
		}

		umtxq_lock(&uq->uq_key);
		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race with the
		 * thread unlocking the umtx.
		 */
		if (old == owner)
			error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED,
			    "umtxpi", timeout == NULL ? NULL : &timo);
		else {
			umtxq_unbusy(&uq->uq_key);
			umtxq_unlock(&uq->uq_key);
		}
	}

	umtxq_lock(&uq->uq_key);
	umtx_pi_unref(pi);
	umtxq_unlock(&uq->uq_key);

	umtx_key_release(&uq->uq_key);
	return (error);
}

/*
 * Unlock a PI mutex.
 */
static int
do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags)
{
	struct umtx_key key;
	struct umtx_q *uq_first, *uq_first2, *uq_me;
	struct umtx_pi *pi, *pi2;
	uint32_t owner, old, id;
	int error;
	int count;
	int pri;

	id = td->td_tid;
	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMUTEX_CONTESTED) == 0) {
		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count_pi(&key, &uq_first);
	if (uq_first != NULL) {
		mtx_lock_spin(&umtx_lock);
		pi = uq_first->uq_pi_blocked;
		KASSERT(pi != NULL, ("pi == NULL?"));
		if (pi->pi_owner != curthread) {
			mtx_unlock_spin(&umtx_lock);
			umtxq_unbusy(&key);
			umtxq_unlock(&key);
			umtx_key_release(&key);
			/* userland messed up the mutex */
			return (EPERM);
		}
		uq_me = curthread->td_umtxq;
		pi->pi_owner = NULL;
		TAILQ_REMOVE(&uq_me->uq_pi_contested, pi, pi_link);
		/* Get the highest-priority thread that is still sleeping. */
		uq_first = TAILQ_FIRST(&pi->pi_blocked);
		while (uq_first != NULL &&
		       (uq_first->uq_flags & UQF_UMTXQ) == 0) {
			uq_first = TAILQ_NEXT(uq_first, uq_lockq);
		}
		pri = PRI_MAX;
		TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
			uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
			if (uq_first2 != NULL) {
				if (pri > UPRI(uq_first2->uq_thread))
					pri = UPRI(uq_first2->uq_thread);
			}
		}
		thread_lock(curthread);
		sched_lend_user_prio(curthread, pri);
		thread_unlock(curthread);
		mtx_unlock_spin(&umtx_lock);
		if (uq_first)
			umtxq_signal_thread(uq_first);
	}
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * at most one thread is waiting for it.
	 * Otherwise, it must be marked as contested.
	 */
	old = casuword32(&m->m_owner, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);

	umtxq_lock(&key);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}

/*
 * Lock a PP mutex.
 */
static int
do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags,
    struct _umtx_time *timeout, int try)
{
	struct abs_timeout timo;
	struct umtx_q *uq, *uq2;
	struct umtx_pi *pi;
	uint32_t ceiling;
	uint32_t owner, id;
	int error, pri, old_inherited_pri, su;

	id = td->td_tid;
	uq = td->td_umtxq;
	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
	    &uq->uq_key)) != 0)
		return (error);

	if (timeout != NULL)
		abs_timeout_init2(&timo, timeout);

	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
	for (;;) {
		old_inherited_pri = uq->uq_inherited_pri;
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		ceiling = RTP_PRIO_MAX - fuword32(&m->m_ceilings[0]);
		if (ceiling > RTP_PRIO_MAX) {
			error = EINVAL;
			goto out;
		}

		mtx_lock_spin(&umtx_lock);
		if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
			mtx_unlock_spin(&umtx_lock);
			error = EINVAL;
			goto out;
		}
		if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
			uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
			thread_lock(td);
			if (uq->uq_inherited_pri < UPRI(td))
				sched_lend_user_prio(td, uq->uq_inherited_pri);
			thread_unlock(td);
		}
		mtx_unlock_spin(&umtx_lock);

		owner = casuword32(&m->m_owner,
		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

		if (owner == UMUTEX_CONTESTED) {
			error = 0;
			break;
		}

		/* The address was invalid. */
		if (owner == -1) {
			error = EFAULT;
			break;
		}

		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
		    (owner & ~UMUTEX_CONTESTED) == id) {
			error = EDEADLK;
			break;
		}

		if (try != 0) {
			error = EBUSY;
			break;
		}

		/*
		 * If we caught a signal, we have already retried, so
		 * exit immediately.
		 */
		if (error != 0)
			break;

		umtxq_lock(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		error = umtxq_sleep(uq, "umtxpp", timeout == NULL ?
		    NULL : &timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);

		mtx_lock_spin(&umtx_lock);
		uq->uq_inherited_pri = old_inherited_pri;
		pri = PRI_MAX;
		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
			uq2 = TAILQ_FIRST(&pi->pi_blocked);
			if (uq2 != NULL) {
				if (pri > UPRI(uq2->uq_thread))
					pri = UPRI(uq2->uq_thread);
			}
		}
		if (pri > uq->uq_inherited_pri)
			pri = uq->uq_inherited_pri;
		thread_lock(td);
		sched_lend_user_prio(td, pri);
		thread_unlock(td);
		mtx_unlock_spin(&umtx_lock);
	}

	if (error != 0) {
		mtx_lock_spin(&umtx_lock);
		uq->uq_inherited_pri = old_inherited_pri;
		pri = PRI_MAX;
		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
			uq2 = TAILQ_FIRST(&pi->pi_blocked);
			if (uq2 != NULL) {
				if (pri > UPRI(uq2->uq_thread))
					pri = UPRI(uq2->uq_thread);
			}
		}
		if (pri > uq->uq_inherited_pri)
			pri = uq->uq_inherited_pri;
		thread_lock(td);
		sched_lend_user_prio(td, pri);
		thread_unlock(td);
		mtx_unlock_spin(&umtx_lock);
	}

out:
	umtxq_lock(&uq->uq_key);
	umtxq_unbusy(&uq->uq_key);
	umtxq_unlock(&uq->uq_key);
	umtx_key_release(&uq->uq_key);
	return (error);
}
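
/*
 * Worked example (exposition only): a PP mutex stores its ceiling c
 * in m_ceilings[0], with larger values meaning stronger (POSIX
 * style).  The kernel maps it into its own priority space, where
 * numerically smaller is stronger, via
 *
 *	kernel_pri = PRI_MIN_REALTIME + (RTP_PRIO_MAX - c)
 *
 * so c == RTP_PRIO_MAX yields PRI_MIN_REALTIME, the strongest
 * real-time priority, and c == 0 yields the weakest.  The EINVAL
 * check above (UPRI(td) < PRI_MIN_REALTIME + ceiling) rejects a
 * locker that already runs at a priority stronger than the ceiling.
 */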
2144
2145/*
2146 * Unlock a PP mutex.
2147 */
2148static int
2149do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags)
2150{
2151	struct umtx_key key;
2152	struct umtx_q *uq, *uq2;
2153	struct umtx_pi *pi;
2154	uint32_t owner, id;
2155	uint32_t rceiling;
2156	int error, pri, new_inherited_pri, su;
2157
2158	id = td->td_tid;
2159	uq = td->td_umtxq;
2160	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
2161
2162	/*
2163	 * Make sure we own this mtx.
2164	 */
2165	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
2166	if (owner == -1)
2167		return (EFAULT);
2168
2169	if ((owner & ~UMUTEX_CONTESTED) != id)
2170		return (EPERM);
2171
2172	error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
2173	if (error != 0)
2174		return (error);
2175
2176	if (rceiling == -1)
2177		new_inherited_pri = PRI_MAX;
2178	else {
2179		rceiling = RTP_PRIO_MAX - rceiling;
2180		if (rceiling > RTP_PRIO_MAX)
2181			return (EINVAL);
2182		new_inherited_pri = PRI_MIN_REALTIME + rceiling;
2183	}
2184
2185	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2186	    &key)) != 0)
2187		return (error);
2188	umtxq_lock(&key);
2189	umtxq_busy(&key);
2190	umtxq_unlock(&key);
2191	/*
2192	 * For a priority-protected mutex, always set the unlocked state
2193	 * to UMUTEX_CONTESTED so that userland always enters the kernel
2194	 * to lock the mutex; this is necessary because thread priority
2195	 * has to be adjusted for such a mutex.
2196	 */
2197	error = suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
2198		UMUTEX_CONTESTED);
2199
2200	umtxq_lock(&key);
2201	if (error == 0)
2202		umtxq_signal(&key, 1);
2203	umtxq_unbusy(&key);
2204	umtxq_unlock(&key);
2205
2206	if (error == -1)
2207		error = EFAULT;
2208	else {
2209		mtx_lock_spin(&umtx_lock);
2210		if (su != 0)
2211			uq->uq_inherited_pri = new_inherited_pri;
2212		pri = PRI_MAX;
2213		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2214			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2215			if (uq2 != NULL) {
2216				if (pri > UPRI(uq2->uq_thread))
2217					pri = UPRI(uq2->uq_thread);
2218			}
2219		}
2220		if (pri > uq->uq_inherited_pri)
2221			pri = uq->uq_inherited_pri;
2222		thread_lock(td);
2223		sched_lend_user_prio(td, pri);
2224		thread_unlock(td);
2225		mtx_unlock_spin(&umtx_lock);
2226	}
2227	umtx_key_release(&key);
2228	return (error);
2229}
2230
2231static int
2232do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
2233	uint32_t *old_ceiling)
2234{
2235	struct umtx_q *uq;
2236	uint32_t save_ceiling;
2237	uint32_t owner, id;
2238	uint32_t flags;
2239	int error;
2240
2241	flags = fuword32(&m->m_flags);
2242	if ((flags & UMUTEX_PRIO_PROTECT) == 0)
2243		return (EINVAL);
2244	if (ceiling > RTP_PRIO_MAX)
2245		return (EINVAL);
2246	id = td->td_tid;
2247	uq = td->td_umtxq;
2248	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2249	   &uq->uq_key)) != 0)
2250		return (error);
2251	for (;;) {
2252		umtxq_lock(&uq->uq_key);
2253		umtxq_busy(&uq->uq_key);
2254		umtxq_unlock(&uq->uq_key);
2255
2256		save_ceiling = fuword32(&m->m_ceilings[0]);
2257
2258		owner = casuword32(&m->m_owner,
2259		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
2260
2261		if (owner == UMUTEX_CONTESTED) {
2262			suword32(&m->m_ceilings[0], ceiling);
2263			suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
2264				UMUTEX_CONTESTED);
2265			error = 0;
2266			break;
2267		}
2268
2269		/* The address was invalid. */
2270		if (owner == -1) {
2271			error = EFAULT;
2272			break;
2273		}
2274
2275		if ((owner & ~UMUTEX_CONTESTED) == id) {
2276			suword32(&m->m_ceilings[0], ceiling);
2277			error = 0;
2278			break;
2279		}
2280
2281		/*
2282		 * If we caught a signal while sleeping, the operation has
2283		 * already been retried once above; exit immediately.
2284		 */
2285		if (error != 0)
2286			break;
2287
2288		/*
2289		 * The mutex is owned by another thread; sleep until it is
2290		 * released and then retry, since we lost the race with the
2291		 * thread unlocking the umtx.
2292		 */
2293		umtxq_lock(&uq->uq_key);
2294		umtxq_insert(uq);
2295		umtxq_unbusy(&uq->uq_key);
2296		error = umtxq_sleep(uq, "umtxpp", NULL);
2297		umtxq_remove(uq);
2298		umtxq_unlock(&uq->uq_key);
2299	}
2300	umtxq_lock(&uq->uq_key);
2301	if (error == 0)
2302		umtxq_signal(&uq->uq_key, INT_MAX);
2303	umtxq_unbusy(&uq->uq_key);
2304	umtxq_unlock(&uq->uq_key);
2305	umtx_key_release(&uq->uq_key);
2306	if (error == 0 && old_ceiling != NULL)
2307		suword32(old_ceiling, save_ceiling);
2308	return (error);
2309}
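
/*
 * A minimal userland sketch of driving do_set_ceiling() through
 * _umtx_op(2) for a UMUTEX_PRIO_PROTECT mutex m; the previous ceiling
 * is written back through uaddr1 when it is non-NULL:
 *
 *	uint32_t old_ceiling;
 *	error = _umtx_op(m, UMTX_OP_SET_CEILING, new_ceiling,
 *	    &old_ceiling, NULL);
 */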
2310
2311/*
2312 * Lock a userland POSIX mutex.
2313 */
2314static int
2315do_lock_umutex(struct thread *td, struct umutex *m,
2316    struct _umtx_time *timeout, int mode)
2317{
2318	uint32_t flags;
2319	int error;
2320
2321	flags = fuword32(&m->m_flags);
2322	if (flags == -1)
2323		return (EFAULT);
2324
2325	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2326	case 0:
2327		error = do_lock_normal(td, m, flags, timeout, mode);
2328		break;
2329	case UMUTEX_PRIO_INHERIT:
2330		error = do_lock_pi(td, m, flags, timeout, mode);
2331		break;
2332	case UMUTEX_PRIO_PROTECT:
2333		error = do_lock_pp(td, m, flags, timeout, mode);
2334		break;
2335	default:
2336		return (EINVAL);
2337	}
2338	if (timeout == NULL) {
2339		if (error == EINTR && mode != _UMUTEX_WAIT)
2340			error = ERESTART;
2341	} else {
2342		/* Timed-locking is not restarted. */
2343		if (error == ERESTART)
2344			error = EINTR;
2345	}
2346	return (error);
2347}
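
/*
 * A sketch of a timed lock request from userland, assuming the
 * argument convention of the __umtx_op_* handlers below: uaddr1
 * carries the size of the timeout structure and uaddr2 points at it
 * (see the layout sketch after umtx_copyin_umtx_time()).  An expired
 * wait fails with ETIMEDOUT:
 *
 *	struct _umtx_time to;
 *	...
 *	error = _umtx_op(m, UMTX_OP_MUTEX_LOCK, 0,
 *	    (void *)sizeof(to), &to);
 */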
2348
2349/*
2350 * Unlock a userland POSIX mutex.
2351 */
2352static int
2353do_unlock_umutex(struct thread *td, struct umutex *m)
2354{
2355	uint32_t flags;
2356
2357	flags = fuword32(&m->m_flags);
2358	if (flags == -1)
2359		return (EFAULT);
2360
2361	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2362	case 0:
2363		return (do_unlock_normal(td, m, flags));
2364	case UMUTEX_PRIO_INHERIT:
2365		return (do_unlock_pi(td, m, flags));
2366	case UMUTEX_PRIO_PROTECT:
2367		return (do_unlock_pp(td, m, flags));
2368	}
2369
2370	return (EINVAL);
2371}
2372
2373static int
2374do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m,
2375	struct timespec *timeout, u_long wflags)
2376{
2377	struct abs_timeout timo;
2378	struct umtx_q *uq;
2379	uint32_t flags;
2380	uint32_t clockid;
2381	int error;
2382
2383	uq = td->td_umtxq;
2384	flags = fuword32(&cv->c_flags);
2385	error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key);
2386	if (error != 0)
2387		return (error);
2388
2389	if ((wflags & CVWAIT_CLOCKID) != 0) {
2390		clockid = fuword32(&cv->c_clockid);
2391		if (clockid < CLOCK_REALTIME ||
2392		    clockid >= CLOCK_THREAD_CPUTIME_ID) {
2393			/* Only predefined clock ids may be used. */
2394			umtx_key_release(&uq->uq_key);
			return (EINVAL);
2395		}
2396	} else {
2397		clockid = CLOCK_REALTIME;
2398	}
2399
2400	umtxq_lock(&uq->uq_key);
2401	umtxq_busy(&uq->uq_key);
2402	umtxq_insert(uq);
2403	umtxq_unlock(&uq->uq_key);
2404
2405	/*
2406	 * Set c_has_waiters to 1 before releasing the user mutex, but
2407	 * avoid dirtying the cache line when the flag is already set.
2408	 */
2409	if (fuword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters)) == 0)
2410		suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 1);
2411
2412	umtxq_lock(&uq->uq_key);
2413	umtxq_unbusy(&uq->uq_key);
2414	umtxq_unlock(&uq->uq_key);
2415
2416	error = do_unlock_umutex(td, m);
2417
2418	if (timeout != NULL)
2419		abs_timeout_init(&timo, clockid, ((wflags & CVWAIT_ABSTIME) != 0),
2420			timeout);
2421
2422	umtxq_lock(&uq->uq_key);
2423	if (error == 0) {
2424		error = umtxq_sleep(uq, "ucond", timeout == NULL ?
2425		    NULL : &timo);
2426	}
2427
2428	if ((uq->uq_flags & UQF_UMTXQ) == 0)
2429		error = 0;
2430	else {
2431		/*
2432		 * This must be a timeout, an interruption by a signal, or
2433		 * a spurious wakeup; clear the c_has_waiters flag when
2434		 * necessary.
2435		 */
2436		umtxq_busy(&uq->uq_key);
2437		if ((uq->uq_flags & UQF_UMTXQ) != 0) {
2438			int oldlen = uq->uq_cur_queue->length;
2439			umtxq_remove(uq);
2440			if (oldlen == 1) {
2441				umtxq_unlock(&uq->uq_key);
2442				suword32(
2443				    __DEVOLATILE(uint32_t *,
2444					 &cv->c_has_waiters), 0);
2445				umtxq_lock(&uq->uq_key);
2446			}
2447		}
2448		umtxq_unbusy(&uq->uq_key);
2449		if (error == ERESTART)
2450			error = EINTR;
2451	}
2452
2453	umtxq_unlock(&uq->uq_key);
2454	umtx_key_release(&uq->uq_key);
2455	return (error);
2456}
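
/*
 * The matching userland call, sketched under the same conventions for
 * a ucond cv and a umutex mtx owned by the caller (obj is the ucond,
 * uaddr1 the umutex, uaddr2 an optional struct timespec):
 *
 *	error = _umtx_op(&cv, UMTX_OP_CV_WAIT, 0, &mtx, NULL);
 *
 * As in pthread_cond_wait(), the kernel releases the mutex (via
 * do_unlock_umutex() above) before the thread sleeps.
 */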
2457
2458/*
2459 * Signal a userland condition variable.
2460 */
2461static int
2462do_cv_signal(struct thread *td, struct ucond *cv)
2463{
2464	struct umtx_key key;
2465	int error, cnt, nwake;
2466	uint32_t flags;
2467
2468	flags = fuword32(&cv->c_flags);
2469	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2470		return (error);
2471	umtxq_lock(&key);
2472	umtxq_busy(&key);
2473	cnt = umtxq_count(&key);
2474	nwake = umtxq_signal(&key, 1);
2475	if (cnt <= nwake) {
2476		umtxq_unlock(&key);
2477		error = suword32(
2478		    __DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
2479		umtxq_lock(&key);
2480	}
2481	umtxq_unbusy(&key);
2482	umtxq_unlock(&key);
2483	umtx_key_release(&key);
2484	return (error);
2485}
2486
2487static int
2488do_cv_broadcast(struct thread *td, struct ucond *cv)
2489{
2490	struct umtx_key key;
2491	int error;
2492	uint32_t flags;
2493
2494	flags = fuword32(&cv->c_flags);
2495	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2496		return (error);
2497
2498	umtxq_lock(&key);
2499	umtxq_busy(&key);
2500	umtxq_signal(&key, INT_MAX);
2501	umtxq_unlock(&key);
2502
2503	error = suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
2504
2505	umtxq_lock(&key);
2506	umtxq_unbusy(&key);
2507	umtxq_unlock(&key);
2508
2509	umtx_key_release(&key);
2510	return (error);
2511}
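
/*
 * The wake-side calls are one-liners from userland:
 *
 *	_umtx_op(&cv, UMTX_OP_CV_SIGNAL, 0, NULL, NULL);
 *	_umtx_op(&cv, UMTX_OP_CV_BROADCAST, 0, NULL, NULL);
 *
 * do_cv_signal() clears c_has_waiters when it wakes the last sleeper;
 * do_cv_broadcast() clears it unconditionally, which lets a userland
 * fast path skip the syscall whenever the flag reads zero.
 */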
2512
2513static int
2514do_rw_rdlock(struct thread *td, struct urwlock *rwlock, long fflag, struct _umtx_time *timeout)
2515{
2516	struct abs_timeout timo;
2517	struct umtx_q *uq;
2518	uint32_t flags, wrflags;
2519	int32_t state, oldstate;
2520	int32_t blocked_readers;
2521	int error;
2522
2523	uq = td->td_umtxq;
2524	flags = fuword32(&rwlock->rw_flags);
2525	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2526	if (error != 0)
2527		return (error);
2528
2529	if (timeout != NULL)
2530		abs_timeout_init2(&timo, timeout);
2531
2532	wrflags = URWLOCK_WRITE_OWNER;
2533	if (!(fflag & URWLOCK_PREFER_READER) && !(flags & URWLOCK_PREFER_READER))
2534		wrflags |= URWLOCK_WRITE_WAITERS;
2535
2536	for (;;) {
2537		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2538		/* try to lock it */
2539		while (!(state & wrflags)) {
2540			if (__predict_false(URWLOCK_READER_COUNT(state) == URWLOCK_MAX_READERS)) {
2541				umtx_key_release(&uq->uq_key);
2542				return (EAGAIN);
2543			}
2544			oldstate = casuword32(&rwlock->rw_state, state, state + 1);
2545			if (oldstate == state) {
2546				umtx_key_release(&uq->uq_key);
2547				return (0);
2548			}
2549			state = oldstate;
2550		}
2551
2552		if (error)
2553			break;
2554
2555		/* grab monitor lock */
2556		umtxq_lock(&uq->uq_key);
2557		umtxq_busy(&uq->uq_key);
2558		umtxq_unlock(&uq->uq_key);
2559
2560		/*
2561		 * re-read the state, in case it changed between the try-lock above
2562		 * and the check below
2563		 */
2564		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2565
2566		/* set read contention bit */
2567		while ((state & wrflags) && !(state & URWLOCK_READ_WAITERS)) {
2568			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_READ_WAITERS);
2569			if (oldstate == state)
2570				goto sleep;
2571			state = oldstate;
2572		}
2573
2574		/* The state changed while setting the flags; restart. */
2575		if (!(state & wrflags)) {
2576			umtxq_lock(&uq->uq_key);
2577			umtxq_unbusy(&uq->uq_key);
2578			umtxq_unlock(&uq->uq_key);
2579			continue;
2580		}
2581
2582sleep:
2583		/* The contention bit is set; increase the read waiter count before sleeping. */
2584		blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2585		suword32(&rwlock->rw_blocked_readers, blocked_readers+1);
2586
2587		while (state & wrflags) {
2588			umtxq_lock(&uq->uq_key);
2589			umtxq_insert(uq);
2590			umtxq_unbusy(&uq->uq_key);
2591
2592			error = umtxq_sleep(uq, "urdlck", timeout == NULL ?
2593			    NULL : &timo);
2594
2595			umtxq_busy(&uq->uq_key);
2596			umtxq_remove(uq);
2597			umtxq_unlock(&uq->uq_key);
2598			if (error)
2599				break;
2600			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2601		}
2602
2603		/* Decrease the read waiter count; the last waiter clears the contention bit. */
2604		blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2605		suword32(&rwlock->rw_blocked_readers, blocked_readers-1);
2606		if (blocked_readers == 1) {
2607			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2608			for (;;) {
2609				oldstate = casuword32(&rwlock->rw_state, state,
2610					 state & ~URWLOCK_READ_WAITERS);
2611				if (oldstate == state)
2612					break;
2613				state = oldstate;
2614			}
2615		}
2616
2617		umtxq_lock(&uq->uq_key);
2618		umtxq_unbusy(&uq->uq_key);
2619		umtxq_unlock(&uq->uq_key);
2620	}
2621	umtx_key_release(&uq->uq_key);
2622	if (error == ERESTART)
2623		error = EINTR;
2624	return (error);
2625}
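
/*
 * For reference, the rw_state word tested above packs (per the bit
 * definitions in <sys/umtx.h>):
 *
 *	URWLOCK_WRITE_OWNER	0x80000000	write locked
 *	URWLOCK_WRITE_WAITERS	0x40000000	writers sleeping
 *	URWLOCK_READ_WAITERS	0x20000000	readers sleeping
 *	URWLOCK_MAX_READERS	0x1fffffff	reader count mask
 *
 * so the "state + 1" CAS in the try-lock path is a plain reader-count
 * increment in the low bits.
 */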
2626
2627static int
2628do_rw_wrlock(struct thread *td, struct urwlock *rwlock, struct _umtx_time *timeout)
2629{
2630	struct abs_timeout timo;
2631	struct umtx_q *uq;
2632	uint32_t flags;
2633	int32_t state, oldstate;
2634	int32_t blocked_writers;
2635	int32_t blocked_readers;
2636	int error;
2637
2638	uq = td->td_umtxq;
2639	flags = fuword32(&rwlock->rw_flags);
2640	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2641	if (error != 0)
2642		return (error);
2643
2644	if (timeout != NULL)
2645		abs_timeout_init2(&timo, timeout);
2646
2647	blocked_readers = 0;
2648	for (;;) {
2649		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2650		while (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2651			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_OWNER);
2652			if (oldstate == state) {
2653				umtx_key_release(&uq->uq_key);
2654				return (0);
2655			}
2656			state = oldstate;
2657		}
2658
2659		if (error) {
2660			if (!(state & (URWLOCK_WRITE_OWNER|URWLOCK_WRITE_WAITERS)) &&
2661			    blocked_readers != 0) {
2662				umtxq_lock(&uq->uq_key);
2663				umtxq_busy(&uq->uq_key);
2664				umtxq_signal_queue(&uq->uq_key, INT_MAX, UMTX_SHARED_QUEUE);
2665				umtxq_unbusy(&uq->uq_key);
2666				umtxq_unlock(&uq->uq_key);
2667			}
2668
2669			break;
2670		}
2671
2672		/* grab monitor lock */
2673		umtxq_lock(&uq->uq_key);
2674		umtxq_busy(&uq->uq_key);
2675		umtxq_unlock(&uq->uq_key);
2676
2677		/*
2678		 * re-read the state, in case it changed between the try-lock above
2679		 * and the check below
2680		 */
2681		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2682
2683		while (((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) &&
2684		       (state & URWLOCK_WRITE_WAITERS) == 0) {
2685			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_WAITERS);
2686			if (oldstate == state)
2687				goto sleep;
2688			state = oldstate;
2689		}
2690
2691		if (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2692			umtxq_lock(&uq->uq_key);
2693			umtxq_unbusy(&uq->uq_key);
2694			umtxq_unlock(&uq->uq_key);
2695			continue;
2696		}
2697sleep:
2698		blocked_writers = fuword32(&rwlock->rw_blocked_writers);
2699		suword32(&rwlock->rw_blocked_writers, blocked_writers+1);
2700
2701		while ((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) {
2702			umtxq_lock(&uq->uq_key);
2703			umtxq_insert_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2704			umtxq_unbusy(&uq->uq_key);
2705
2706			error = umtxq_sleep(uq, "uwrlck", timeout == NULL ?
2707			    NULL : &timo);
2708
2709			umtxq_busy(&uq->uq_key);
2710			umtxq_remove_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2711			umtxq_unlock(&uq->uq_key);
2712			if (error)
2713				break;
2714			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2715		}
2716
2717		blocked_writers = fuword32(&rwlock->rw_blocked_writers);
2718		suword32(&rwlock->rw_blocked_writers, blocked_writers-1);
2719		if (blocked_writers == 1) {
2720			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2721			for (;;) {
2722				oldstate = casuword32(&rwlock->rw_state, state,
2723					 state & ~URWLOCK_WRITE_WAITERS);
2724				if (oldstate == state)
2725					break;
2726				state = oldstate;
2727			}
2728			blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2729		} else
2730			blocked_readers = 0;
2731
2732		umtxq_lock(&uq->uq_key);
2733		umtxq_unbusy(&uq->uq_key);
2734		umtxq_unlock(&uq->uq_key);
2735	}
2736
2737	umtx_key_release(&uq->uq_key);
2738	if (error == ERESTART)
2739		error = EINTR;
2740	return (error);
2741}
2742
2743static int
2744do_rw_unlock(struct thread *td, struct urwlock *rwlock)
2745{
2746	struct umtx_q *uq;
2747	uint32_t flags;
2748	int32_t state, oldstate;
2749	int error, q, count;
2750
2751	uq = td->td_umtxq;
2752	flags = fuword32(&rwlock->rw_flags);
2753	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2754	if (error != 0)
2755		return (error);
2756
2757	state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2758	if (state & URWLOCK_WRITE_OWNER) {
2759		for (;;) {
2760			oldstate = casuword32(&rwlock->rw_state, state,
2761				state & ~URWLOCK_WRITE_OWNER);
2762			if (oldstate != state) {
2763				state = oldstate;
2764				if (!(oldstate & URWLOCK_WRITE_OWNER)) {
2765					error = EPERM;
2766					goto out;
2767				}
2768			} else
2769				break;
2770		}
2771	} else if (URWLOCK_READER_COUNT(state) != 0) {
2772		for (;;) {
2773			oldstate = casuword32(&rwlock->rw_state, state,
2774				state - 1);
2775			if (oldstate != state) {
2776				state = oldstate;
2777				if (URWLOCK_READER_COUNT(oldstate) == 0) {
2778					error = EPERM;
2779					goto out;
2780				}
2781			}
2782			else
2783				break;
2784		}
2785	} else {
2786		error = EPERM;
2787		goto out;
2788	}
2789
2790	count = 0;
2791
2792	if (!(flags & URWLOCK_PREFER_READER)) {
2793		if (state & URWLOCK_WRITE_WAITERS) {
2794			count = 1;
2795			q = UMTX_EXCLUSIVE_QUEUE;
2796		} else if (state & URWLOCK_READ_WAITERS) {
2797			count = INT_MAX;
2798			q = UMTX_SHARED_QUEUE;
2799		}
2800	} else {
2801		if (state & URWLOCK_READ_WAITERS) {
2802			count = INT_MAX;
2803			q = UMTX_SHARED_QUEUE;
2804		} else if (state & URWLOCK_WRITE_WAITERS) {
2805			count = 1;
2806			q = UMTX_EXCLUSIVE_QUEUE;
2807		}
2808	}
2809
2810	if (count) {
2811		umtxq_lock(&uq->uq_key);
2812		umtxq_busy(&uq->uq_key);
2813		umtxq_signal_queue(&uq->uq_key, count, q);
2814		umtxq_unbusy(&uq->uq_key);
2815		umtxq_unlock(&uq->uq_key);
2816	}
2817out:
2818	umtx_key_release(&uq->uq_key);
2819	return (error);
2820}
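
/*
 * Userland entry points for the three rwlock paths, sketched with a
 * NULL timeout (block forever) and no URWLOCK_PREFER_READER flag:
 *
 *	_umtx_op(rw, UMTX_OP_RW_RDLOCK, 0, NULL, NULL);
 *	_umtx_op(rw, UMTX_OP_RW_WRLOCK, 0, NULL, NULL);
 *	_umtx_op(rw, UMTX_OP_RW_UNLOCK, 0, NULL, NULL);
 */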
2821
2822static int
2823do_sem_wait(struct thread *td, struct _usem *sem, struct _umtx_time *timeout)
2824{
2825	struct abs_timeout timo;
2826	struct umtx_q *uq;
2827	uint32_t flags, count;
2828	int error;
2829
2830	uq = td->td_umtxq;
2831	flags = fuword32(&sem->_flags);
2832	error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key);
2833	if (error != 0)
2834		return (error);
2835
2836	if (timeout != NULL)
2837		abs_timeout_init2(&timo, timeout);
2838
2839	umtxq_lock(&uq->uq_key);
2840	umtxq_busy(&uq->uq_key);
2841	umtxq_insert(uq);
2842	umtxq_unlock(&uq->uq_key);
2843	casuword32(__DEVOLATILE(uint32_t *, &sem->_has_waiters), 0, 1);
2844	count = fuword32(__DEVOLATILE(uint32_t *, &sem->_count));
2845	if (count != 0) {
2846		umtxq_lock(&uq->uq_key);
2847		umtxq_unbusy(&uq->uq_key);
2848		umtxq_remove(uq);
2849		umtxq_unlock(&uq->uq_key);
2850		umtx_key_release(&uq->uq_key);
2851		return (0);
2852	}
2853	umtxq_lock(&uq->uq_key);
2854	umtxq_unbusy(&uq->uq_key);
2855
2856	error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo);
2857
2858	if ((uq->uq_flags & UQF_UMTXQ) == 0)
2859		error = 0;
2860	else {
2861		umtxq_remove(uq);
2862		if (error == ERESTART)
2863			error = EINTR;
2864	}
2865	umtxq_unlock(&uq->uq_key);
2866	umtx_key_release(&uq->uq_key);
2867	return (error);
2868}
2869
2870/*
2871 * Wake up a waiter on a userland semaphore.
2872 */
2873static int
2874do_sem_wake(struct thread *td, struct _usem *sem)
2875{
2876	struct umtx_key key;
2877	int error, cnt;
2878	uint32_t flags;
2879
2880	flags = fuword32(&sem->_flags);
2881	if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0)
2882		return (error);
2883	umtxq_lock(&key);
2884	umtxq_busy(&key);
2885	cnt = umtxq_count(&key);
2886	if (cnt > 0) {
2887		umtxq_signal(&key, 1);
2888		/*
2889		 * A non-zero waiter count means the memory is still
2890		 * referenced by user code, so the _has_waiters flag can
2891		 * be safely cleared once the last waiter is woken.
2892		 */
2893		if (cnt == 1) {
2894			umtxq_unlock(&key);
2895			error = suword32(
2896			    __DEVOLATILE(uint32_t *, &sem->_has_waiters), 0);
2897			umtxq_lock(&key);
2898		}
2899	}
2900	umtxq_unbusy(&key);
2901	umtxq_unlock(&key);
2902	umtx_key_release(&key);
2903	return (error);
2904}
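
/*
 * A loose sketch of the intended userland protocol (the real libthr
 * code is more careful); try_decrement() is a hypothetical helper
 * that atomically claims a unit from _count when one is available:
 *
 *	if (!try_decrement(&sem->_count))
 *		_umtx_op(sem, UMTX_OP_SEM_WAIT, 0, NULL, NULL);
 *	...
 *	atomic_add_rel_32(&sem->_count, 1);
 *	if (sem->_has_waiters)
 *		_umtx_op(sem, UMTX_OP_SEM_WAKE, 0, NULL, NULL);
 */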
2905
2906int
2907sys__umtx_lock(struct thread *td, struct _umtx_lock_args *uap)
2908    /* struct umtx *umtx */
2909{
2910	return do_lock_umtx(td, uap->umtx, td->td_tid, 0);
2911}
2912
2913int
2914sys__umtx_unlock(struct thread *td, struct _umtx_unlock_args *uap)
2915    /* struct umtx *umtx */
2916{
2917	return do_unlock_umtx(td, uap->umtx, td->td_tid);
2918}
2919
2920inline int
2921umtx_copyin_timeout(const void *addr, struct timespec *tsp)
2922{
2923	int error;
2924
2925	error = copyin(addr, tsp, sizeof(struct timespec));
2926	if (error == 0) {
2927		if (tsp->tv_sec < 0 ||
2928		    tsp->tv_nsec >= 1000000000 ||
2929		    tsp->tv_nsec < 0)
2930			error = EINVAL;
2931	}
2932	return (error);
2933}
2934
2935static inline int
2936umtx_copyin_umtx_time(const void *addr, size_t size, struct _umtx_time *tp)
2937{
2938	int error;
2939
2940	if (size <= sizeof(struct timespec)) {
2941		tp->_clockid = CLOCK_REALTIME;
2942		tp->_flags = 0;
2943		error = copyin(addr, &tp->_timeout, sizeof(struct timespec));
2944	} else
2945		error = copyin(addr, tp, sizeof(struct _umtx_time));
2946	if (error != 0)
2947		return (error);
2948	if (tp->_timeout.tv_sec < 0 ||
2949	    tp->_timeout.tv_nsec >= 1000000000 || tp->_timeout.tv_nsec < 0)
2950		return (EINVAL);
2951	return (0);
2952}
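
/*
 * Callers may thus pass either a bare struct timespec (interpreted as
 * a relative CLOCK_REALTIME timeout) or a full struct _umtx_time; the
 * size passed through uaddr1 disambiguates.  A sketch of the two call
 * shapes, assuming the UMTX_ABSTIME flag from <sys/umtx.h>:
 *
 *	struct timespec ts = { .tv_sec = 1 };
 *	_umtx_op(obj, op, val, (void *)sizeof(ts), &ts);
 *
 *	struct _umtx_time ut = {
 *		._timeout = { .tv_sec = 1 },
 *		._flags = UMTX_ABSTIME,
 *		._clockid = CLOCK_MONOTONIC,
 *	};
 *	_umtx_op(obj, op, val, (void *)sizeof(ut), &ut);
 */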
2953
2954static int
2955__umtx_op_lock_umtx(struct thread *td, struct _umtx_op_args *uap)
2956{
2957	struct timespec *ts, timeout;
2958	int error;
2959
2960	/* Allow a null timespec (wait forever). */
2961	if (uap->uaddr2 == NULL)
2962		ts = NULL;
2963	else {
2964		error = umtx_copyin_timeout(uap->uaddr2, &timeout);
2965		if (error != 0)
2966			return (error);
2967		ts = &timeout;
2968	}
2969	return (do_lock_umtx(td, uap->obj, uap->val, ts));
2970}
2971
2972static int
2973__umtx_op_unlock_umtx(struct thread *td, struct _umtx_op_args *uap)
2974{
2975	return (do_unlock_umtx(td, uap->obj, uap->val));
2976}
2977
2978static int
2979__umtx_op_wait(struct thread *td, struct _umtx_op_args *uap)
2980{
2981	struct _umtx_time timeout, *tm_p;
2982	int error;
2983
2984	if (uap->uaddr2 == NULL)
2985		tm_p = NULL;
2986	else {
2987		error = umtx_copyin_umtx_time(
2988		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
2989		if (error != 0)
2990			return (error);
2991		tm_p = &timeout;
2992	}
2993	return do_wait(td, uap->obj, uap->val, tm_p, 0, 0);
2994}
2995
2996static int
2997__umtx_op_wait_uint(struct thread *td, struct _umtx_op_args *uap)
2998{
2999	struct _umtx_time timeout, *tm_p;
3000	int error;
3001
3002	if (uap->uaddr2 == NULL)
3003		tm_p = NULL;
3004	else {
3005		error = umtx_copyin_umtx_time(
3006		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3007		if (error != 0)
3008			return (error);
3009		tm_p = &timeout;
3010	}
3011	return do_wait(td, uap->obj, uap->val, tm_p, 1, 0);
3012}
3013
3014static int
3015__umtx_op_wait_uint_private(struct thread *td, struct _umtx_op_args *uap)
3016{
3017	struct _umtx_time *tm_p, timeout;
3018	int error;
3019
3020	if (uap->uaddr2 == NULL)
3021		tm_p = NULL;
3022	else {
3023		error = umtx_copyin_umtx_time(
3024		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3025		if (error != 0)
3026			return (error);
3027		tm_p = &timeout;
3028	}
3029	return do_wait(td, uap->obj, uap->val, tm_p, 1, 1);
3030}
3031
3032static int
3033__umtx_op_wake(struct thread *td, struct _umtx_op_args *uap)
3034{
3035	return (kern_umtx_wake(td, uap->obj, uap->val, 0));
3036}
3037
3038#define BATCH_SIZE	128
3039static int
3040__umtx_op_nwake_private(struct thread *td, struct _umtx_op_args *uap)
3041{
3042	int count = uap->val;
3043	void *uaddrs[BATCH_SIZE];
3044	char **upp = (char **)uap->obj;
3045	int tocopy;
3046	int error = 0;
3047	int i, pos = 0;
3048
3049	while (count > 0) {
3050		tocopy = count;
3051		if (tocopy > BATCH_SIZE)
3052			tocopy = BATCH_SIZE;
3053		error = copyin(upp+pos, uaddrs, tocopy * sizeof(char *));
3054		if (error != 0)
3055			break;
3056		for (i = 0; i < tocopy; ++i)
3057			kern_umtx_wake(td, uaddrs[i], INT_MAX, 1);
3058		count -= tocopy;
3059		pos += tocopy;
3060	}
3061	return (error);
3062}
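
/*
 * A sketch of the corresponding userland call: obj points at an array
 * of addresses and val counts them, letting a thread library wake
 * many private words with one syscall:
 *
 *	void *addrs[3] = { &a, &b, &c };
 *	_umtx_op(addrs, UMTX_OP_NWAKE_PRIVATE, 3, NULL, NULL);
 */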
3063
3064static int
3065__umtx_op_wake_private(struct thread *td, struct _umtx_op_args *uap)
3066{
3067	return (kern_umtx_wake(td, uap->obj, uap->val, 1));
3068}
3069
3070static int
3071__umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap)
3072{
3073	struct _umtx_time *tm_p, timeout;
3074	int error;
3075
3076	/* Allow a null timespec (wait forever). */
3077	if (uap->uaddr2 == NULL)
3078		tm_p = NULL;
3079	else {
3080		error = umtx_copyin_umtx_time(
3081		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3082		if (error != 0)
3083			return (error);
3084		tm_p = &timeout;
3085	}
3086	return do_lock_umutex(td, uap->obj, tm_p, 0);
3087}
3088
3089static int
3090__umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap)
3091{
3092	return do_lock_umutex(td, uap->obj, NULL, _UMUTEX_TRY);
3093}
3094
3095static int
3096__umtx_op_wait_umutex(struct thread *td, struct _umtx_op_args *uap)
3097{
3098	struct _umtx_time *tm_p, timeout;
3099	int error;
3100
3101	/* Allow a null timespec (wait forever). */
3102	if (uap->uaddr2 == NULL)
3103		tm_p = NULL;
3104	else {
3105		error = umtx_copyin_umtx_time(
3106		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3107		if (error != 0)
3108			return (error);
3109		tm_p = &timeout;
3110	}
3111	return do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT);
3112}
3113
3114static int
3115__umtx_op_wake_umutex(struct thread *td, struct _umtx_op_args *uap)
3116{
3117	return do_wake_umutex(td, uap->obj);
3118}
3119
3120static int
3121__umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap)
3122{
3123	return do_unlock_umutex(td, uap->obj);
3124}
3125
3126static int
3127__umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap)
3128{
3129	return do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1);
3130}
3131
3132static int
3133__umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap)
3134{
3135	struct timespec *ts, timeout;
3136	int error;
3137
3138	/* Allow a null timespec (wait forever). */
3139	if (uap->uaddr2 == NULL)
3140		ts = NULL;
3141	else {
3142		error = umtx_copyin_timeout(uap->uaddr2, &timeout);
3143		if (error != 0)
3144			return (error);
3145		ts = &timeout;
3146	}
3147	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3148}
3149
3150static int
3151__umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap)
3152{
3153	return do_cv_signal(td, uap->obj);
3154}
3155
3156static int
3157__umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap)
3158{
3159	return do_cv_broadcast(td, uap->obj);
3160}
3161
3162static int
3163__umtx_op_rw_rdlock(struct thread *td, struct _umtx_op_args *uap)
3164{
3165	struct _umtx_time timeout;
3166	int error;
3167
3168	/* Allow a null timespec (wait forever). */
3169	if (uap->uaddr2 == NULL) {
3170		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3171	} else {
3172		error = umtx_copyin_umtx_time(uap->uaddr2,
3173		   (size_t)uap->uaddr1, &timeout);
3174		if (error != 0)
3175			return (error);
3176		error = do_rw_rdlock(td, uap->obj, uap->val, &timeout);
3177	}
3178	return (error);
3179}
3180
3181static int
3182__umtx_op_rw_wrlock(struct thread *td, struct _umtx_op_args *uap)
3183{
3184	struct _umtx_time timeout;
3185	int error;
3186
3187	/* Allow a null timespec (wait forever). */
3188	if (uap->uaddr2 == NULL) {
3189		error = do_rw_wrlock(td, uap->obj, 0);
3190	} else {
3191		error = umtx_copyin_umtx_time(uap->uaddr2,
3192		   (size_t)uap->uaddr1, &timeout);
3193		if (error != 0)
3194			return (error);
3195
3196		error = do_rw_wrlock(td, uap->obj, &timeout);
3197	}
3198	return (error);
3199}
3200
3201static int
3202__umtx_op_rw_unlock(struct thread *td, struct _umtx_op_args *uap)
3203{
3204	return do_rw_unlock(td, uap->obj);
3205}
3206
3207static int
3208__umtx_op_sem_wait(struct thread *td, struct _umtx_op_args *uap)
3209{
3210	struct _umtx_time *tm_p, timeout;
3211	int error;
3212
3213	/* Allow a null timespec (wait forever). */
3214	if (uap->uaddr2 == NULL)
3215		tm_p = NULL;
3216	else {
3217		error = umtx_copyin_umtx_time(
3218		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3219		if (error != 0)
3220			return (error);
3221		tm_p = &timeout;
3222	}
3223	return (do_sem_wait(td, uap->obj, tm_p));
3224}
3225
3226static int
3227__umtx_op_sem_wake(struct thread *td, struct _umtx_op_args *uap)
3228{
3229	return do_sem_wake(td, uap->obj);
3230}
3231
3232static int
3233__umtx_op_wake2_umutex(struct thread *td, struct _umtx_op_args *uap)
3234{
3235	return do_wake2_umutex(td, uap->obj, uap->val);
3236}
3237
3238typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap);
3239
3240static _umtx_op_func op_table[] = {
3241	__umtx_op_lock_umtx,		/* UMTX_OP_LOCK */
3242	__umtx_op_unlock_umtx,		/* UMTX_OP_UNLOCK */
3243	__umtx_op_wait,			/* UMTX_OP_WAIT */
3244	__umtx_op_wake,			/* UMTX_OP_WAKE */
3245	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
3246	__umtx_op_lock_umutex,		/* UMTX_OP_MUTEX_LOCK */
3247	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
3248	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
3249	__umtx_op_cv_wait,		/* UMTX_OP_CV_WAIT*/
3250	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
3251	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
3252	__umtx_op_wait_uint,		/* UMTX_OP_WAIT_UINT */
3253	__umtx_op_rw_rdlock,		/* UMTX_OP_RW_RDLOCK */
3254	__umtx_op_rw_wrlock,		/* UMTX_OP_RW_WRLOCK */
3255	__umtx_op_rw_unlock,		/* UMTX_OP_RW_UNLOCK */
3256	__umtx_op_wait_uint_private,	/* UMTX_OP_WAIT_UINT_PRIVATE */
3257	__umtx_op_wake_private,		/* UMTX_OP_WAKE_PRIVATE */
3258	__umtx_op_wait_umutex,		/* UMTX_OP_UMUTEX_WAIT */
3259	__umtx_op_wake_umutex,		/* UMTX_OP_UMUTEX_WAKE */
3260	__umtx_op_sem_wait,		/* UMTX_OP_SEM_WAIT */
3261	__umtx_op_sem_wake,		/* UMTX_OP_SEM_WAKE */
3262	__umtx_op_nwake_private,	/* UMTX_OP_NWAKE_PRIVATE */
3263	__umtx_op_wake2_umutex		/* UMTX_OP_UMUTEX_WAKE2 */
3264};
3265
3266int
3267sys__umtx_op(struct thread *td, struct _umtx_op_args *uap)
3268{
3269	if ((unsigned)uap->op < UMTX_OP_MAX)
3270		return (*op_table[uap->op])(td, uap);
3271	return (EINVAL);
3272}
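
/*
 * Every operation above funnels through this syscall; e.g. a bare
 * futex-style wait/wake on a private 32-bit word (a sketch, error
 * handling omitted).  The wait sleeps while the word still equals
 * val; the wake releases up to val sleepers:
 *
 *	uint32_t word = 0;
 *	_umtx_op(&word, UMTX_OP_WAIT_UINT_PRIVATE, 0, NULL, NULL);
 *
 *	_umtx_op(&word, UMTX_OP_WAKE_PRIVATE, 1, NULL, NULL);
 */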
3273
3274#ifdef COMPAT_FREEBSD32
3275int
3276freebsd32_umtx_lock(struct thread *td, struct freebsd32_umtx_lock_args *uap)
3277    /* struct umtx *umtx */
3278{
3279	return (do_lock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid, NULL));
3280}
3281
3282int
3283freebsd32_umtx_unlock(struct thread *td, struct freebsd32_umtx_unlock_args *uap)
3284    /* struct umtx *umtx */
3285{
3286	return (do_unlock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid));
3287}
3288
3289struct timespec32 {
3290	int32_t tv_sec;
3291	int32_t tv_nsec;
3292};
3293
3294struct umtx_time32 {
3295	struct	timespec32	timeout;
3296	uint32_t		flags;
3297	uint32_t		clockid;
3298};
3299
3300static inline int
3301umtx_copyin_timeout32(void *addr, struct timespec *tsp)
3302{
3303	struct timespec32 ts32;
3304	int error;
3305
3306	error = copyin(addr, &ts32, sizeof(struct timespec32));
3307	if (error == 0) {
3308		if (ts32.tv_sec < 0 ||
3309		    ts32.tv_nsec >= 1000000000 ||
3310		    ts32.tv_nsec < 0)
3311			error = EINVAL;
3312		else {
3313			tsp->tv_sec = ts32.tv_sec;
3314			tsp->tv_nsec = ts32.tv_nsec;
3315		}
3316	}
3317	return (error);
3318}
3319
3320static inline int
3321umtx_copyin_umtx_time32(const void *addr, size_t size, struct _umtx_time *tp)
3322{
3323	struct umtx_time32 t32;
3324	int error;
3325
3326	t32.clockid = CLOCK_REALTIME;
3327	t32.flags   = 0;
3328	if (size <= sizeof(struct timespec32))
3329		error = copyin(addr, &t32.timeout, sizeof(struct timespec32));
3330	else
3331		error = copyin(addr, &t32, sizeof(struct umtx_time32));
3332	if (error != 0)
3333		return (error);
3334	if (t32.timeout.tv_sec < 0 ||
3335	    t32.timeout.tv_nsec >= 1000000000 || t32.timeout.tv_nsec < 0)
3336		return (EINVAL);
3337	tp->_timeout.tv_sec = t32.timeout.tv_sec;
3338	tp->_timeout.tv_nsec = t32.timeout.tv_nsec;
3339	tp->_flags = t32.flags;
3340	tp->_clockid = t32.clockid;
3341	return (0);
3342}
3343
3344static int
3345__umtx_op_lock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
3346{
3347	struct timespec *ts, timeout;
3348	int error;
3349
3350	/* Allow a null timespec (wait forever). */
3351	if (uap->uaddr2 == NULL)
3352		ts = NULL;
3353	else {
3354		error = umtx_copyin_timeout32(uap->uaddr2, &timeout);
3355		if (error != 0)
3356			return (error);
3357		ts = &timeout;
3358	}
3359	return (do_lock_umtx32(td, uap->obj, uap->val, ts));
3360}
3361
3362static int
3363__umtx_op_unlock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
3364{
3365	return (do_unlock_umtx32(td, uap->obj, (uint32_t)uap->val));
3366}
3367
3368static int
3369__umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3370{
3371	struct _umtx_time *tm_p, timeout;
3372	int error;
3373
3374	if (uap->uaddr2 == NULL)
3375		tm_p = NULL;
3376	else {
3377		error = umtx_copyin_umtx_time32(uap->uaddr2,
3378			(size_t)uap->uaddr1, &timeout);
3379		if (error != 0)
3380			return (error);
3381		tm_p = &timeout;
3382	}
3383	return do_wait(td, uap->obj, uap->val, tm_p, 1, 0);
3384}
3385
3386static int
3387__umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3388{
3389	struct _umtx_time *tm_p, timeout;
3390	int error;
3391
3392	/* Allow a null timespec (wait forever). */
3393	if (uap->uaddr2 == NULL)
3394		tm_p = NULL;
3395	else {
3396		error = umtx_copyin_umtx_time32(uap->uaddr2,
3397		    (size_t)uap->uaddr1, &timeout);
3398		if (error != 0)
3399			return (error);
3400		tm_p = &timeout;
3401	}
3402	return do_lock_umutex(td, uap->obj, tm_p, 0);
3403}
3404
3405static int
3406__umtx_op_wait_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3407{
3408	struct _umtx_time *tm_p, timeout;
3409	int error;
3410
3411	/* Allow a null timespec (wait forever). */
3412	if (uap->uaddr2 == NULL)
3413		tm_p = NULL;
3414	else {
3415		error = umtx_copyin_umtx_time32(uap->uaddr2,
3416		    (size_t)uap->uaddr1, &timeout);
3417		if (error != 0)
3418			return (error);
3419		tm_p = &timeout;
3420	}
3421	return do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT);
3422}
3423
3424static int
3425__umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3426{
3427	struct timespec *ts, timeout;
3428	int error;
3429
3430	/* Allow a null timespec (wait forever). */
3431	if (uap->uaddr2 == NULL)
3432		ts = NULL;
3433	else {
3434		error = umtx_copyin_timeout32(uap->uaddr2, &timeout);
3435		if (error != 0)
3436			return (error);
3437		ts = &timeout;
3438	}
3439	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3440}
3441
3442static int
3443__umtx_op_rw_rdlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3444{
3445	struct _umtx_time timeout;
3446	int error;
3447
3448	/* Allow a null timespec (wait forever). */
3449	if (uap->uaddr2 == NULL) {
3450		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3451	} else {
3452		error = umtx_copyin_umtx_time32(uap->uaddr2,
3453		    (size_t)uap->uaddr1, &timeout);
3454		if (error != 0)
3455			return (error);
3456		error = do_rw_rdlock(td, uap->obj, uap->val, &timeout);
3457	}
3458	return (error);
3459}
3460
3461static int
3462__umtx_op_rw_wrlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3463{
3464	struct _umtx_time timeout;
3465	int error;
3466
3467	/* Allow a null timespec (wait forever). */
3468	if (uap->uaddr2 == NULL) {
3469		error = do_rw_wrlock(td, uap->obj, 0);
3470	} else {
3471		error = umtx_copyin_umtx_time32(uap->uaddr2,
3472		    (size_t)uap->uaddr1, &timeout);
3473		if (error != 0)
3474			return (error);
3475		error = do_rw_wrlock(td, uap->obj, &timeout);
3476	}
3477	return (error);
3478}
3479
3480static int
3481__umtx_op_wait_uint_private_compat32(struct thread *td, struct _umtx_op_args *uap)
3482{
3483	struct _umtx_time *tm_p, timeout;
3484	int error;
3485
3486	if (uap->uaddr2 == NULL)
3487		tm_p = NULL;
3488	else {
3489		error = umtx_copyin_umtx_time32(
3490		    uap->uaddr2, (size_t)uap->uaddr1,&timeout);
3491		if (error != 0)
3492			return (error);
3493		tm_p = &timeout;
3494	}
3495	return do_wait(td, uap->obj, uap->val, tm_p, 1, 1);
3496}
3497
3498static int
3499__umtx_op_sem_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3500{
3501	struct _umtx_time *tm_p, timeout;
3502	int error;
3503
3504	/* Allow a null timespec (wait forever). */
3505	if (uap->uaddr2 == NULL)
3506		tm_p = NULL;
3507	else {
3508		error = umtx_copyin_umtx_time32(uap->uaddr2,
3509		    (size_t)uap->uaddr1, &timeout);
3510		if (error != 0)
3511			return (error);
3512		tm_p = &timeout;
3513	}
3514	return (do_sem_wait(td, uap->obj, tm_p));
3515}
3516
3517static int
3518__umtx_op_nwake_private32(struct thread *td, struct _umtx_op_args *uap)
3519{
3520	int count = uap->val;
3521	uint32_t uaddrs[BATCH_SIZE];
3522	uint32_t **upp = (uint32_t **)uap->obj;
3523	int tocopy;
3524	int error = 0;
3525	int i, pos = 0;
3526
3527	while (count > 0) {
3528		tocopy = count;
3529		if (tocopy > BATCH_SIZE)
3530			tocopy = BATCH_SIZE;
3531		error = copyin(upp+pos, uaddrs, tocopy * sizeof(uint32_t));
3532		if (error != 0)
3533			break;
3534		for (i = 0; i < tocopy; ++i)
3535			kern_umtx_wake(td, (void *)(intptr_t)uaddrs[i],
3536				INT_MAX, 1);
3537		count -= tocopy;
3538		pos += tocopy;
3539	}
3540	return (error);
3541}
3542
3543static _umtx_op_func op_table_compat32[] = {
3544	__umtx_op_lock_umtx_compat32,	/* UMTX_OP_LOCK */
3545	__umtx_op_unlock_umtx_compat32,	/* UMTX_OP_UNLOCK */
3546	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT */
3547	__umtx_op_wake,			/* UMTX_OP_WAKE */
3548	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
3549	__umtx_op_lock_umutex_compat32,	/* UMTX_OP_MUTEX_LOCK */
3550	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK	*/
3551	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
3552	__umtx_op_cv_wait_compat32,	/* UMTX_OP_CV_WAIT*/
3553	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
3554	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
3555	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT_UINT */
3556	__umtx_op_rw_rdlock_compat32,	/* UMTX_OP_RW_RDLOCK */
3557	__umtx_op_rw_wrlock_compat32,	/* UMTX_OP_RW_WRLOCK */
3558	__umtx_op_rw_unlock,		/* UMTX_OP_RW_UNLOCK */
3559	__umtx_op_wait_uint_private_compat32,	/* UMTX_OP_WAIT_UINT_PRIVATE */
3560	__umtx_op_wake_private,		/* UMTX_OP_WAKE_PRIVATE */
3561	__umtx_op_wait_umutex_compat32, /* UMTX_OP_UMUTEX_WAIT */
3562	__umtx_op_wake_umutex,		/* UMTX_OP_UMUTEX_WAKE */
3563	__umtx_op_sem_wait_compat32,	/* UMTX_OP_SEM_WAIT */
3564	__umtx_op_sem_wake,		/* UMTX_OP_SEM_WAKE */
3565	__umtx_op_nwake_private32,	/* UMTX_OP_NWAKE_PRIVATE */
3566	__umtx_op_wake2_umutex		/* UMTX_OP_UMUTEX_WAKE2 */
3567};
3568
3569int
3570freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap)
3571{
3572	if ((unsigned)uap->op < UMTX_OP_MAX)
3573		return (*op_table_compat32[uap->op])(td,
3574			(struct _umtx_op_args *)uap);
3575	return (EINVAL);
3576}
3577#endif
3578
3579void
3580umtx_thread_init(struct thread *td)
3581{
3582	td->td_umtxq = umtxq_alloc();
3583	td->td_umtxq->uq_thread = td;
3584}
3585
3586void
3587umtx_thread_fini(struct thread *td)
3588{
3589	umtxq_free(td->td_umtxq);
3590}
3591
3592/*
3593 * Called when a new thread is created, e.g. by fork().
3594 */
3595void
3596umtx_thread_alloc(struct thread *td)
3597{
3598	struct umtx_q *uq;
3599
3600	uq = td->td_umtxq;
3601	uq->uq_inherited_pri = PRI_MAX;
3602
3603	KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
3604	KASSERT(uq->uq_thread == td, ("uq_thread != td"));
3605	KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
3606	KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
3607}
3608
3609/*
3610 * exec() hook.
3611 */
3612static void
3613umtx_exec_hook(void *arg __unused, struct proc *p __unused,
3614	struct image_params *imgp __unused)
3615{
3616	umtx_thread_cleanup(curthread);
3617}
3618
3619/*
3620 * thread_exit() hook.
3621 */
3622void
3623umtx_thread_exit(struct thread *td)
3624{
3625	umtx_thread_cleanup(td);
3626}
3627
3628/*
3629 * Clean up the per-thread umtx state.
3630 */
3631static void
3632umtx_thread_cleanup(struct thread *td)
3633{
3634	struct umtx_q *uq;
3635	struct umtx_pi *pi;
3636
3637	if ((uq = td->td_umtxq) == NULL)
3638		return;
3639
3640	mtx_lock_spin(&umtx_lock);
3641	uq->uq_inherited_pri = PRI_MAX;
3642	while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
3643		pi->pi_owner = NULL;
3644		TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
3645	}
3646	mtx_unlock_spin(&umtx_lock);
3647	thread_lock(td);
3648	sched_lend_user_prio(td, PRI_MAX);
3649	thread_unlock(td);
3650}
3651