kern_umtx.c revision 227309
/*-
 * Copyright (c) 2004, David Xu <davidxu@freebsd.org>
 * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/kern_umtx.c 227309 2011-11-07 15:43:11Z ed $");

#include "opt_compat.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/syscallsubr.h>
#include <sys/eventhandler.h>
#include <sys/umtx.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>

#include <machine/cpu.h>

#ifdef COMPAT_FREEBSD32
#include <compat/freebsd32/freebsd32_proto.h>
#endif

#define _UMUTEX_TRY		1
#define _UMUTEX_WAIT		2

/* Priority inheritance mutex info. */
struct umtx_pi {
	/* Owner thread */
	struct thread		*pi_owner;

	/* Reference count */
	int			pi_refcount;

	/* List entry to link umtx held by thread */
	TAILQ_ENTRY(umtx_pi)	pi_link;

	/* List entry in hash */
	TAILQ_ENTRY(umtx_pi)	pi_hashlink;

	/* List for waiters */
	TAILQ_HEAD(,umtx_q)	pi_blocked;

	/* Identify a userland lock object */
	struct umtx_key		pi_key;
};

/* A userland synchronization object user. */
struct umtx_q {
	/* Linked list for the hash. */
	TAILQ_ENTRY(umtx_q)	uq_link;

	/* Umtx key. */
	struct umtx_key		uq_key;

	/* Umtx flags. */
	int			uq_flags;
#define UQF_UMTXQ	0x0001

	/* The thread that is waiting. */
	struct thread		*uq_thread;

	/*
	 * The PI mutex this thread is blocked on.  Readers may hold
	 * either the chain lock or umtx_lock; writers must hold both.
	 */
	struct umtx_pi		*uq_pi_blocked;

	/* On blocked list */
	TAILQ_ENTRY(umtx_q)	uq_lockq;

	/* Contested PI mutexes owned by this thread */
	TAILQ_HEAD(,umtx_pi)	uq_pi_contested;

	/* Inherited priority from PP mutex */
	u_char			uq_inherited_pri;

	/* Spare queue ready to be reused */
	struct umtxq_queue	*uq_spare_queue;

	/* The queue we are on */
	struct umtxq_queue	*uq_cur_queue;
};

TAILQ_HEAD(umtxq_head, umtx_q);

/* Per-key wait-queue */
struct umtxq_queue {
	struct umtxq_head	head;
	struct umtx_key		key;
	LIST_ENTRY(umtxq_queue)	link;
	int			length;
};

LIST_HEAD(umtxq_list, umtxq_queue);

/* Userland lock object's wait-queue chain */
struct umtxq_chain {
	/* Lock for this chain. */
	struct mtx		uc_lock;

	/* List of sleep queues. */
	struct umtxq_list	uc_queue[2];
#define UMTX_SHARED_QUEUE	0
#define UMTX_EXCLUSIVE_QUEUE	1

	LIST_HEAD(, umtxq_queue) uc_spare_queue;

	/* Busy flag */
	char			uc_busy;

	/* Chain lock waiters */
	int			uc_waiters;

	/* All PI in the list */
	TAILQ_HEAD(,umtx_pi)	uc_pi_list;
};

#define	UMTXQ_LOCKED_ASSERT(uc)		mtx_assert(&(uc)->uc_lock, MA_OWNED)
#define	UMTXQ_BUSY_ASSERT(uc)	KASSERT((uc)->uc_busy, ("umtx chain is not busy"))

/*
 * Don't propagate time-sharing priority; there is a security reason:
 * a user can simply create a PI mutex, let thread A lock it, and let
 * another thread B block on it.  Because B is sleeping, its priority
 * is boosted, and A's priority is boosted along with it via priority
 * propagation.  A's priority would then never be lowered even if it
 * were using 100% CPU, which is unfair to other processes.
 */

#define UPRI(td)	(((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\
			  (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\
			 PRI_MAX_TIMESHARE : (td)->td_user_pri)

#define	GOLDEN_RATIO_PRIME	2654404609U
#define	UMTX_CHAINS		512
#define	UMTX_SHIFTS		(__WORD_BIT - 9)

#define	GET_SHARE(flags)	\
    (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)

#define BUSY_SPINS		200

static uma_zone_t		umtx_pi_zone;
static struct umtxq_chain	umtxq_chains[2][UMTX_CHAINS];
static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
static int			umtx_pi_allocated;

static SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
    &umtx_pi_allocated, 0, "Allocated umtx_pi");

static void umtxq_sysinit(void *);
static void umtxq_hash(struct umtx_key *key);
static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
static void umtxq_lock(struct umtx_key *key);
static void umtxq_unlock(struct umtx_key *key);
static void umtxq_busy(struct umtx_key *key);
static void umtxq_unbusy(struct umtx_key *key);
static void umtxq_insert_queue(struct umtx_q *uq, int q);
static void umtxq_remove_queue(struct umtx_q *uq, int q);
static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo);
static int umtxq_count(struct umtx_key *key);
static struct umtx_pi *umtx_pi_alloc(int);
static void umtx_pi_free(struct umtx_pi *pi);
static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags);
static void umtx_thread_cleanup(struct thread *td);
static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
	struct image_params *imgp __unused);
SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);

#define umtxq_signal(key, nwake)	umtxq_signal_queue((key), (nwake), UMTX_SHARED_QUEUE)
#define umtxq_insert(uq)	umtxq_insert_queue((uq), UMTX_SHARED_QUEUE)
#define umtxq_remove(uq)	umtxq_remove_queue((uq), UMTX_SHARED_QUEUE)

static struct mtx umtx_lock;

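/*
 * umtxq_sysinit() initializes the global state used throughout this
 * file: the UMA zone for umtx_pi records, the two rows of UMTX_CHAINS
 * hash chains (row selection happens in umtxq_getchain()), the spin
 * lock umtx_lock that guards priority-inheritance state, and an exec
 * event handler so a process's umtx state is cleaned up on exec.
 */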
static void
umtxq_sysinit(void *arg __unused)
{
	int i, j;

	umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	for (i = 0; i < 2; ++i) {
		for (j = 0; j < UMTX_CHAINS; ++j) {
			mtx_init(&umtxq_chains[i][j].uc_lock, "umtxql", NULL,
				 MTX_DEF | MTX_DUPOK);
			LIST_INIT(&umtxq_chains[i][j].uc_queue[0]);
			LIST_INIT(&umtxq_chains[i][j].uc_queue[1]);
			LIST_INIT(&umtxq_chains[i][j].uc_spare_queue);
			TAILQ_INIT(&umtxq_chains[i][j].uc_pi_list);
			umtxq_chains[i][j].uc_busy = 0;
			umtxq_chains[i][j].uc_waiters = 0;
		}
	}
	mtx_init(&umtx_lock, "umtx lock", NULL, MTX_SPIN);
	EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
	    EVENTHANDLER_PRI_ANY);
}

struct umtx_q *
umtxq_alloc(void)
{
	struct umtx_q *uq;

	uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
	uq->uq_spare_queue = malloc(sizeof(struct umtxq_queue), M_UMTX, M_WAITOK | M_ZERO);
	TAILQ_INIT(&uq->uq_spare_queue->head);
	TAILQ_INIT(&uq->uq_pi_contested);
	uq->uq_inherited_pri = PRI_MAX;
	return (uq);
}

void
umtxq_free(struct umtx_q *uq)
{
	MPASS(uq->uq_spare_queue != NULL);
	free(uq->uq_spare_queue, M_UMTX);
	free(uq, M_UMTX);
}

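/*
 * Map a umtx key to one of the UMTX_CHAINS hash chains.  This is a
 * multiplicative hash: the folded key value "n" is multiplied by
 * GOLDEN_RATIO_PRIME, the high-order bits are taken via UMTX_SHIFTS,
 * and the final modulo keeps the result within the 512-entry array.
 */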
static inline void
umtxq_hash(struct umtx_key *key)
{
	unsigned n = (uintptr_t)key->info.both.a + key->info.both.b;
	key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
}

static inline struct umtxq_chain *
umtxq_getchain(struct umtx_key *key)
{
	if (key->type <= TYPE_SEM)
		return (&umtxq_chains[1][key->hash]);
	return (&umtxq_chains[0][key->hash]);
}

/*
 * Lock a chain.
 */
static inline void
umtxq_lock(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_lock(&uc->uc_lock);
}

/*
 * Unlock a chain.
 */
static inline void
umtxq_unlock(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_unlock(&uc->uc_lock);
}

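/*
 * The "busy" protocol: a chain is marked busy before an operation that
 * may block (e.g. faulting on userland memory), during which the chain
 * mutex itself cannot be held.  On SMP, umtxq_busy() first spins a
 * bounded number of times (BUSY_SPINS) with the chain unlocked, hoping
 * the current owner finishes quickly, before falling back to msleep().
 */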
/*
 * Set the chain to the busy state when a following operation
 * may block (a kernel mutex cannot be used).
 */
static inline void
umtxq_busy(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_assert(&uc->uc_lock, MA_OWNED);
	if (uc->uc_busy) {
#ifdef SMP
		if (smp_cpus > 1) {
			int count = BUSY_SPINS;
			if (count > 0) {
				umtxq_unlock(key);
				while (uc->uc_busy && --count > 0)
					cpu_spinwait();
				umtxq_lock(key);
			}
		}
#endif
		while (uc->uc_busy) {
			uc->uc_waiters++;
			msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
			uc->uc_waiters--;
		}
	}
	uc->uc_busy = 1;
}

/*
 * Unbusy a chain.
 */
static inline void
umtxq_unbusy(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_assert(&uc->uc_lock, MA_OWNED);
	KASSERT(uc->uc_busy != 0, ("not busy"));
	uc->uc_busy = 0;
	if (uc->uc_waiters)
		wakeup_one(uc);
}

static struct umtxq_queue *
umtxq_queue_lookup(struct umtx_key *key, int q)
{
	struct umtxq_queue *uh;
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	LIST_FOREACH(uh, &uc->uc_queue[q], link) {
		if (umtx_key_match(&uh->key, key))
			return (uh);
	}

	return (NULL);
}

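/*
 * Wait queues are kept per key: all threads blocked on the same key
 * share a single umtxq_queue hung off the chain.  Every umtx_q carries
 * one pre-allocated spare umtxq_queue (see umtxq_alloc()); on insert,
 * the spare is either donated as the new per-key queue or parked on
 * the chain's uc_spare_queue list, and on removal a queue is taken
 * back, so no allocation is needed while the chain lock is held.
 */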
static inline void
umtxq_insert_queue(struct umtx_q *uq, int q)
{
	struct umtxq_queue *uh;
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	KASSERT((uq->uq_flags & UQF_UMTXQ) == 0, ("umtx_q is already on queue"));
	uh = umtxq_queue_lookup(&uq->uq_key, q);
	if (uh != NULL) {
		LIST_INSERT_HEAD(&uc->uc_spare_queue, uq->uq_spare_queue, link);
	} else {
		uh = uq->uq_spare_queue;
		uh->key = uq->uq_key;
		LIST_INSERT_HEAD(&uc->uc_queue[q], uh, link);
	}
	uq->uq_spare_queue = NULL;

	TAILQ_INSERT_TAIL(&uh->head, uq, uq_link);
	uh->length++;
	uq->uq_flags |= UQF_UMTXQ;
	uq->uq_cur_queue = uh;
}

static inline void
umtxq_remove_queue(struct umtx_q *uq, int q)
{
	struct umtxq_chain *uc;
	struct umtxq_queue *uh;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	if (uq->uq_flags & UQF_UMTXQ) {
		uh = uq->uq_cur_queue;
		TAILQ_REMOVE(&uh->head, uq, uq_link);
		uh->length--;
		uq->uq_flags &= ~UQF_UMTXQ;
		if (TAILQ_EMPTY(&uh->head)) {
			KASSERT(uh->length == 0,
			    ("inconsistent umtxq_queue length"));
			LIST_REMOVE(uh, link);
		} else {
			uh = LIST_FIRST(&uc->uc_spare_queue);
			KASSERT(uh != NULL, ("uc_spare_queue is empty"));
			LIST_REMOVE(uh, link);
		}
		uq->uq_spare_queue = uh;
		uq->uq_cur_queue = NULL;
	}
}

/*
 * Return the number of threads waiting on the key.
 */
static int
umtxq_count(struct umtx_key *key)
{
	struct umtxq_chain *uc;
	struct umtxq_queue *uh;

	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
	if (uh != NULL)
		return (uh->length);
	return (0);
}

/*
 * Return the number of PI waiters and, via *first, the first
 * waiter.
 */
static int
umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
{
	struct umtxq_chain *uc;
	struct umtxq_queue *uh;

	*first = NULL;
	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
	if (uh != NULL) {
		*first = TAILQ_FIRST(&uh->head);
		return (uh->length);
	}
	return (0);
}

/*
 * Wake up threads waiting on a userland object.
 */

static int
umtxq_signal_queue(struct umtx_key *key, int n_wake, int q)
{
	struct umtxq_chain *uc;
	struct umtxq_queue *uh;
	struct umtx_q *uq;
	int ret;

	ret = 0;
	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	uh = umtxq_queue_lookup(key, q);
	if (uh != NULL) {
		while ((uq = TAILQ_FIRST(&uh->head)) != NULL) {
			umtxq_remove_queue(uq, q);
			wakeup(uq);
			if (++ret >= n_wake)
				return (ret);
		}
	}
	return (ret);
}

/*
 * Wake up specified thread.
 */
static inline void
umtxq_signal_thread(struct umtx_q *uq)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	umtxq_remove(uq);
	wakeup(uq);
}

/*
 * Put the thread into a sleep state; before sleeping, check if
 * the thread was removed from the umtx queue.
 */
static inline int
umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo)
{
	struct umtxq_chain *uc;
	int error;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	if (!(uq->uq_flags & UQF_UMTXQ))
		return (0);
	error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
	if (error == EWOULDBLOCK)
		error = ETIMEDOUT;
	return (error);
}

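/*
 * Key sharing: a THREAD_SHARE (process-private) key is identified by
 * the process's vmspace plus the virtual address, while a shared key
 * is identified by the backing VM object plus offset, and holds a
 * reference on that object until umtx_key_release().  AUTO_SHARE
 * inspects the map entry's inheritance to choose between the two.
 */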
/*
 * Convert a userspace address into a unique logical key.
 */
int
umtx_key_get(void *addr, int type, int share, struct umtx_key *key)
{
	struct thread *td = curthread;
	vm_map_t map;
	vm_map_entry_t entry;
	vm_pindex_t pindex;
	vm_prot_t prot;
	boolean_t wired;

	key->type = type;
	if (share == THREAD_SHARE) {
		key->shared = 0;
		key->info.private.vs = td->td_proc->p_vmspace;
		key->info.private.addr = (uintptr_t)addr;
	} else {
		MPASS(share == PROCESS_SHARE || share == AUTO_SHARE);
		map = &td->td_proc->p_vmspace->vm_map;
		if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
		    &entry, &key->info.shared.object, &pindex, &prot,
		    &wired) != KERN_SUCCESS) {
			return (EFAULT);
		}

		if ((share == PROCESS_SHARE) ||
		    (share == AUTO_SHARE &&
		     VM_INHERIT_SHARE == entry->inheritance)) {
			key->shared = 1;
			key->info.shared.offset = entry->offset + entry->start -
				(vm_offset_t)addr;
			vm_object_reference(key->info.shared.object);
		} else {
			key->shared = 0;
			key->info.private.vs = td->td_proc->p_vmspace;
			key->info.private.addr = (uintptr_t)addr;
		}
		vm_map_lookup_done(map, entry);
	}

	umtxq_hash(key);
	return (0);
}

/*
 * Release a key.
 */
void
umtx_key_release(struct umtx_key *key)
{
	if (key->shared)
		vm_object_deallocate(key->info.shared.object);
}

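/*
 * The lock word protocol for a plain umtx: UMTX_UNOWNED means free,
 * an owner's id means locked, and the UMTX_CONTESTED bit tells a
 * userland unlocker that waiters may exist, so it must enter the
 * kernel to release.  The loop below implements acquire with
 * compare-and-set on userland memory (casuword), which can fault at
 * any access and therefore reports EFAULT via the -1 return.
 */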
/*
 * Lock a umtx object.
 */
static int
_do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id, int timo)
{
	struct umtx_q *uq;
	u_long owner;
	u_long old;
	int error = 0;

	uq = td->td_umtxq;

	/*
	 * Care must be exercised when dealing with umtx structure. It
	 * can fault on any access.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword(&umtx->u_owner, UMTX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMTX_UNOWNED)
			return (0);

		/* The address was invalid. */
		if (owner == -1)
			return (EFAULT);

		/* If no one owns it but it is contested try to acquire it. */
		if (owner == UMTX_CONTESTED) {
			owner = casuword(&umtx->u_owner,
			    UMTX_CONTESTED, id | UMTX_CONTESTED);

			if (owner == UMTX_CONTESTED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			/* If this failed the lock has changed, restart. */
			continue;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK,
			AUTO_SHARE, &uq->uq_key)) != 0)
			return (error);

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword(&umtx->u_owner, owner, owner | UMTX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race to the
		 * thread unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtx", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);
	}

	return (0);
}

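/*
 * The timed-lock wrapper converts the timeout into hz ticks for each
 * sleep and, on each ETIMEDOUT wakeup, recomputes the remaining time
 * against the absolute deadline captured with getnanouptime().  Note
 * the signal semantics: an untimed lock is restartable (EINTR becomes
 * ERESTART), while a timed lock is not (ERESTART becomes EINTR).
 */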
/*
 * Lock a umtx object.
 */
static int
do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id,
	struct timespec *timeout)
{
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	int error;

	if (timeout == NULL) {
		error = _do_lock_umtx(td, umtx, id, 0);
		/* Mutex locking is restarted if it is interrupted. */
		if (error == EINTR)
			error = ERESTART;
	} else {
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
		for (;;) {
			error = _do_lock_umtx(td, umtx, id, tvtohz(&tv));
			if (error != ETIMEDOUT)
				break;
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				break;
			}
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
		}
		/* Timed-locking is not restarted. */
		if (error == ERESTART)
			error = EINTR;
	}
	return (error);
}

/*
 * Unlock a umtx object.
 */
static int
do_unlock_umtx(struct thread *td, struct umtx *umtx, u_long id)
{
	struct umtx_key key;
	u_long owner;
	u_long old;
	int error;
	int count;

	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword(__DEVOLATILE(u_long *, &umtx->u_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMTX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMTX_CONTESTED) == 0) {
		old = casuword(&umtx->u_owner, owner, UMTX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK, AUTO_SHARE,
		&key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * no more than one thread is waiting for it.
	 * Otherwise, it must be marked as contested.
	 */
	old = casuword(&umtx->u_owner, owner,
		count <= 1 ? UMTX_UNOWNED : UMTX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}

#ifdef COMPAT_FREEBSD32

/*
 * Lock a umtx object.
 */
static int
_do_lock_umtx32(struct thread *td, uint32_t *m, uint32_t id, int timo)
{
	struct umtx_q *uq;
	uint32_t owner;
	uint32_t old;
	int error = 0;

	uq = td->td_umtxq;

	/*
	 * Care must be exercised when dealing with umtx structure. It
	 * can fault on any access.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword32(m, UMUTEX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMUTEX_UNOWNED)
			return (0);

		/* The address was invalid. */
		if (owner == -1)
			return (EFAULT);

		/* If no one owns it but it is contested try to acquire it. */
		if (owner == UMUTEX_CONTESTED) {
			owner = casuword32(m,
			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
			if (owner == UMUTEX_CONTESTED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			/* If this failed the lock has changed, restart. */
			continue;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK,
			AUTO_SHARE, &uq->uq_key)) != 0)
			return (error);

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword32(m, owner, owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race to the
		 * thread unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtx", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);
	}

	return (0);
}

/*
 * Lock a umtx object.
 */
static int
do_lock_umtx32(struct thread *td, void *m, uint32_t id,
	struct timespec *timeout)
{
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	int error;

	if (timeout == NULL) {
		error = _do_lock_umtx32(td, m, id, 0);
		/* Mutex locking is restarted if it is interrupted. */
		if (error == EINTR)
			error = ERESTART;
	} else {
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
		for (;;) {
			error = _do_lock_umtx32(td, m, id, tvtohz(&tv));
			if (error != ETIMEDOUT)
				break;
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				break;
			}
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
		}
		/* Timed-locking is not restarted. */
		if (error == ERESTART)
			error = EINTR;
	}
	return (error);
}

/*
 * Unlock a umtx object.
 */
static int
do_unlock_umtx32(struct thread *td, uint32_t *m, uint32_t id)
{
	struct umtx_key key;
	uint32_t owner;
	uint32_t old;
	int error;
	int count;

	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(m);
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMUTEX_CONTESTED) == 0) {
		old = casuword32(m, owner, UMUTEX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK, AUTO_SHARE,
		&key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * no more than one thread is waiting for it.
	 * Otherwise, it must be marked as contested.
	 */
	old = casuword32(m, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}
#endif

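/*
 * do_wait() provides futex-style wait semantics: the thread is put on
 * the wait queue *before* the userland value is fetched and compared,
 * so a waker that changes the value and then calls kern_umtx_wake()
 * cannot slip in between the check and the sleep; if the value no
 * longer matches, the thread simply removes itself and returns.
 */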
/*
 * Fetch and compare a value; sleep on the address if the value
 * has not changed.
 */
static int
do_wait(struct thread *td, void *addr, u_long id,
	struct timespec *timeout, int compat32, int is_private)
{
	struct umtx_q *uq;
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	u_long tmp;
	int error = 0;

	uq = td->td_umtxq;
	if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT,
		is_private ? THREAD_SHARE : AUTO_SHARE, &uq->uq_key)) != 0)
		return (error);

	umtxq_lock(&uq->uq_key);
	umtxq_insert(uq);
	umtxq_unlock(&uq->uq_key);
	if (compat32 == 0)
		tmp = fuword(addr);
	else
		tmp = (unsigned int)fuword32(addr);
	if (tmp != id) {
		umtxq_lock(&uq->uq_key);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	} else if (timeout == NULL) {
		umtxq_lock(&uq->uq_key);
		error = umtxq_sleep(uq, "uwait", 0);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	} else {
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
		umtxq_lock(&uq->uq_key);
		for (;;) {
			error = umtxq_sleep(uq, "uwait", tvtohz(&tv));
			if (!(uq->uq_flags & UQF_UMTXQ)) {
				error = 0;
				break;
			}
			if (error != ETIMEDOUT)
				break;
			umtxq_unlock(&uq->uq_key);
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				umtxq_lock(&uq->uq_key);
				break;
			}
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
			umtxq_lock(&uq->uq_key);
		}
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	}
	umtx_key_release(&uq->uq_key);
	if (error == ERESTART)
		error = EINTR;
	return (error);
}

/*
 * Wake up threads sleeping on the specified address.
 */
int
kern_umtx_wake(struct thread *td, void *uaddr, int n_wake, int is_private)
{
	struct umtx_key key;
	int ret;

	if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT,
		is_private ? THREAD_SHARE : AUTO_SHARE, &key)) != 0)
		return (ret);
	umtxq_lock(&key);
	ret = umtxq_signal(&key, n_wake);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	return (0);
}

/*
 * Lock a PTHREAD_PRIO_NONE protocol POSIX mutex.
 */
static int
_do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags, int timo,
	int mode)
{
	struct umtx_q *uq;
	uint32_t owner, old, id;
	int error = 0;

	id = td->td_tid;
	uq = td->td_umtxq;

	/*
	 * Care must be exercised when dealing with umtx structure. It
	 * can fault on any access.
	 */
	for (;;) {
		owner = fuword32(__DEVOLATILE(void *, &m->m_owner));
		if (mode == _UMUTEX_WAIT) {
			if (owner == UMUTEX_UNOWNED || owner == UMUTEX_CONTESTED)
				return (0);
		} else {
			/*
			 * Try the uncontested case.  This should be done in userland.
			 */
			owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);

			/* The acquire succeeded. */
			if (owner == UMUTEX_UNOWNED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			/* If no one owns it but it is contested try to acquire it. */
			if (owner == UMUTEX_CONTESTED) {
				owner = casuword32(&m->m_owner,
				    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

				if (owner == UMUTEX_CONTESTED)
					return (0);

				/* The address was invalid. */
				if (owner == -1)
					return (EFAULT);

				/* If this failed the lock has changed, restart. */
				continue;
			}
		}

		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
		    (owner & ~UMUTEX_CONTESTED) == id)
			return (EDEADLK);

		if (mode == _UMUTEX_TRY)
			return (EBUSY);

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
		    GET_SHARE(flags), &uq->uq_key)) != 0)
			return (error);

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unbusy(&uq->uq_key);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race to the
		 * thread unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		umtxq_unbusy(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtxn", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);
	}

	return (0);
}

/*
 * Unlock a PTHREAD_PRIO_NONE protocol POSIX mutex.
 */
static int
do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags)
{
	struct umtx_key key;
	uint32_t owner, old, id;
	int error;
	int count;

	id = td->td_tid;
	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	if ((owner & UMUTEX_CONTESTED) == 0) {
		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * no more than one thread is waiting for it.
	 * Otherwise, it must be marked as contested.
	 */
	old = casuword32(&m->m_owner, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}

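/*
 * do_wake_umutex() is the recovery path for a mutex that looks free
 * (the owner field holds no tid) but still carries waiters: if at
 * most one waiter remains, the word is reset from UMUTEX_CONTESTED
 * to UMUTEX_UNOWNED, and a waiter is signalled when appropriate.
 */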
/*
 * Check if the mutex is available and wake up a waiter;
 * this applies only to a simple mutex.
 */
static int
do_wake_umutex(struct thread *td, struct umutex *m)
{
	struct umtx_key key;
	uint32_t owner;
	uint32_t flags;
	int error;
	int count;

	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != 0)
		return (0);

	flags = fuword32(&m->m_flags);

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	if (count <= 1)
		owner = casuword32(&m->m_owner, UMUTEX_CONTESTED, UMUTEX_UNOWNED);

	umtxq_lock(&key);
	if (count != 0 && (owner & ~UMUTEX_CONTESTED) == 0)
		umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	return (0);
}

static inline struct umtx_pi *
umtx_pi_alloc(int flags)
{
	struct umtx_pi *pi;

	pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags);
	TAILQ_INIT(&pi->pi_blocked);
	atomic_add_int(&umtx_pi_allocated, 1);
	return (pi);
}

static inline void
umtx_pi_free(struct umtx_pi *pi)
{
	uma_zfree(umtx_pi_zone, pi);
	atomic_add_int(&umtx_pi_allocated, -1);
}

/*
 * Adjust the thread's position on a pi_state after its priority has been
 * changed.
 */
static int
umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
{
	struct umtx_q *uq, *uq1, *uq2;
	struct thread *td1;

	mtx_assert(&umtx_lock, MA_OWNED);
	if (pi == NULL)
		return (0);

	uq = td->td_umtxq;

	/*
	 * Check if the thread needs to be moved on the blocked chain.
	 * It needs to be moved if either its priority is lower than
	 * the previous thread or higher than the next thread.
	 */
	uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
	uq2 = TAILQ_NEXT(uq, uq_lockq);
	if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
	    (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
		/*
		 * Remove thread from blocked chain and determine where
		 * it should be moved to.
		 */
		TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
		TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
			td1 = uq1->uq_thread;
			MPASS(td1->td_proc->p_magic == P_MAGIC);
			if (UPRI(td1) > UPRI(td))
				break;
		}

		if (uq1 == NULL)
			TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
		else
			TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
	}
	return (1);
}

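/*
 * Priority propagation walks the ownership chain: starting from the
 * PI mutex the current thread blocks on, it lends the (clamped) user
 * priority to each successive owner via sched_lend_user_prio(), and
 * re-sorts each owner on the blocked list it may itself sleep on.
 * The walk stops at a lock with no recorded owner or at curthread.
 */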
/*
 * Propagate priority when a thread is blocked on a POSIX
 * PI mutex.
 */
static void
umtx_propagate_priority(struct thread *td)
{
	struct umtx_q *uq;
	struct umtx_pi *pi;
	int pri;

	mtx_assert(&umtx_lock, MA_OWNED);
	pri = UPRI(td);
	uq = td->td_umtxq;
	pi = uq->uq_pi_blocked;
	if (pi == NULL)
		return;

	for (;;) {
		td = pi->pi_owner;
		if (td == NULL || td == curthread)
			return;

		MPASS(td->td_proc != NULL);
		MPASS(td->td_proc->p_magic == P_MAGIC);

		thread_lock(td);
		if (td->td_lend_user_pri > pri)
			sched_lend_user_prio(td, pri);
		else {
			thread_unlock(td);
			break;
		}
		thread_unlock(td);

		/*
		 * Pick up the lock that td is blocked on.
		 */
		uq = td->td_umtxq;
		pi = uq->uq_pi_blocked;
		if (pi == NULL)
			break;
		/* Resort td on the list if needed. */
		umtx_pi_adjust_thread(pi, td);
	}
}

/*
 * Unpropagate priority for a PI mutex when a thread blocked on
 * it is interrupted by a signal or resumed by others.
 */
static void
umtx_repropagate_priority(struct umtx_pi *pi)
{
	struct umtx_q *uq, *uq_owner;
	struct umtx_pi *pi2;
	int pri;

	mtx_assert(&umtx_lock, MA_OWNED);

	while (pi != NULL && pi->pi_owner != NULL) {
		pri = PRI_MAX;
		uq_owner = pi->pi_owner->td_umtxq;

		TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
			uq = TAILQ_FIRST(&pi2->pi_blocked);
			if (uq != NULL) {
				if (pri > UPRI(uq->uq_thread))
					pri = UPRI(uq->uq_thread);
			}
		}

		if (pri > uq_owner->uq_inherited_pri)
			pri = uq_owner->uq_inherited_pri;
		thread_lock(pi->pi_owner);
		sched_lend_user_prio(pi->pi_owner, pri);
		thread_unlock(pi->pi_owner);
		if ((pi = uq_owner->uq_pi_blocked) != NULL)
			umtx_pi_adjust_thread(pi, uq_owner->uq_thread);
	}
}

/*
 * Insert a PI mutex into the owned list.
 */
static void
umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
{
	struct umtx_q *uq_owner;

	uq_owner = owner->td_umtxq;
	mtx_assert(&umtx_lock, MA_OWNED);
	if (pi->pi_owner != NULL)
		panic("pi_owner != NULL");
	pi->pi_owner = owner;
	TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
}

/*
 * Claim ownership of a PI mutex.
 */
static int
umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
{
	struct umtx_q *uq, *uq_owner;

	uq_owner = owner->td_umtxq;
	mtx_lock_spin(&umtx_lock);
	if (pi->pi_owner == owner) {
		mtx_unlock_spin(&umtx_lock);
		return (0);
	}

	if (pi->pi_owner != NULL) {
		/*
		 * userland may have already messed the mutex, sigh.
		 */
		mtx_unlock_spin(&umtx_lock);
		return (EPERM);
	}
	umtx_pi_setowner(pi, owner);
	uq = TAILQ_FIRST(&pi->pi_blocked);
	if (uq != NULL) {
		int pri;

		pri = UPRI(uq->uq_thread);
		thread_lock(owner);
		if (pri < UPRI(owner))
			sched_lend_user_prio(owner, pri);
		thread_unlock(owner);
	}
	mtx_unlock_spin(&umtx_lock);
	return (0);
}

/*
 * Adjust a thread's position in the blocked list of its PI mutex;
 * this may start a new round of priority propagation.
 */
void
umtx_pi_adjust(struct thread *td, u_char oldpri)
{
	struct umtx_q *uq;
	struct umtx_pi *pi;

	uq = td->td_umtxq;
	mtx_lock_spin(&umtx_lock);
	/*
	 * Pick up the lock that td is blocked on.
	 */
	pi = uq->uq_pi_blocked;
	if (pi != NULL) {
		umtx_pi_adjust_thread(pi, td);
		umtx_repropagate_priority(pi);
	}
	mtx_unlock_spin(&umtx_lock);
}

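/*
 * umtxq_sleep_pi() enqueues the thread on the pi_blocked list in user
 * priority order, marks it with TDF_UPIBLOCKED, propagates priority to
 * the owner, and sleeps.  If the owner is not yet recorded, it is
 * looked up by tid (only within the current process, per the XXX note
 * below).  After wakeup the thread is unlinked and the owner's lent
 * priority is recomputed via umtx_repropagate_priority().
 */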
/*
 * Sleep on a PI mutex.
 */
static int
umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi,
	uint32_t owner, const char *wmesg, int timo)
{
	struct umtxq_chain *uc;
	struct thread *td, *td1;
	struct umtx_q *uq1;
	int pri;
	int error = 0;

	td = uq->uq_thread;
	KASSERT(td == curthread, ("inconsistent uq_thread"));
	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	UMTXQ_BUSY_ASSERT(uc);
	umtxq_insert(uq);
	mtx_lock_spin(&umtx_lock);
	if (pi->pi_owner == NULL) {
		mtx_unlock_spin(&umtx_lock);
		/* XXX Only look up thread in current process. */
		td1 = tdfind(owner, curproc->p_pid);
		mtx_lock_spin(&umtx_lock);
		if (td1 != NULL) {
			if (pi->pi_owner == NULL)
				umtx_pi_setowner(pi, td1);
			PROC_UNLOCK(td1->td_proc);
		}
	}

	TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
		pri = UPRI(uq1->uq_thread);
		if (pri > UPRI(td))
			break;
	}

	if (uq1 != NULL)
		TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
	else
		TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);

	uq->uq_pi_blocked = pi;
	thread_lock(td);
	td->td_flags |= TDF_UPIBLOCKED;
	thread_unlock(td);
	umtx_propagate_priority(td);
	mtx_unlock_spin(&umtx_lock);
	umtxq_unbusy(&uq->uq_key);

	if (uq->uq_flags & UQF_UMTXQ) {
		error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
		if (error == EWOULDBLOCK)
			error = ETIMEDOUT;
		if (uq->uq_flags & UQF_UMTXQ) {
			umtxq_remove(uq);
		}
	}
	mtx_lock_spin(&umtx_lock);
	uq->uq_pi_blocked = NULL;
	thread_lock(td);
	td->td_flags &= ~TDF_UPIBLOCKED;
	thread_unlock(td);
	TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
	umtx_repropagate_priority(pi);
	mtx_unlock_spin(&umtx_lock);
	umtxq_unlock(&uq->uq_key);

	return (error);
}

/*
 * Add reference count for a PI mutex.
 */
static void
umtx_pi_ref(struct umtx_pi *pi)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&pi->pi_key);
	UMTXQ_LOCKED_ASSERT(uc);
	pi->pi_refcount++;
}

/*
 * Decrease the reference count for a PI mutex; if the counter
 * drops to zero, its memory is freed.
 */
static void
umtx_pi_unref(struct umtx_pi *pi)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&pi->pi_key);
	UMTXQ_LOCKED_ASSERT(uc);
	KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
	if (--pi->pi_refcount == 0) {
		mtx_lock_spin(&umtx_lock);
		if (pi->pi_owner != NULL) {
			TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested,
				pi, pi_link);
			pi->pi_owner = NULL;
		}
		KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
			("blocked queue not empty"));
		mtx_unlock_spin(&umtx_lock);
		TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
		umtx_pi_free(pi);
	}
}

/*
 * Find a PI mutex in hash table.
 */
static struct umtx_pi *
umtx_pi_lookup(struct umtx_key *key)
{
	struct umtxq_chain *uc;
	struct umtx_pi *pi;

	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);

	TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
		if (umtx_key_match(&pi->pi_key, key)) {
			return (pi);
		}
	}
	return (NULL);
}

/*
 * Insert a PI mutex into hash table.
 */
static inline void
umtx_pi_insert(struct umtx_pi *pi)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&pi->pi_key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
}

/*
 * Lock a PI mutex.
 */
static int
_do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags, int timo,
	int try)
{
	struct umtx_q *uq;
	struct umtx_pi *pi, *new_pi;
	uint32_t id, owner, old;
	int error;

	id = td->td_tid;
	uq = td->td_umtxq;

	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
	    &uq->uq_key)) != 0)
		return (error);
	umtxq_lock(&uq->uq_key);
	pi = umtx_pi_lookup(&uq->uq_key);
	if (pi == NULL) {
		new_pi = umtx_pi_alloc(M_NOWAIT);
		if (new_pi == NULL) {
			umtxq_unlock(&uq->uq_key);
			new_pi = umtx_pi_alloc(M_WAITOK);
			umtxq_lock(&uq->uq_key);
			pi = umtx_pi_lookup(&uq->uq_key);
			if (pi != NULL) {
				umtx_pi_free(new_pi);
				new_pi = NULL;
			}
		}
		if (new_pi != NULL) {
			new_pi->pi_key = uq->uq_key;
			umtx_pi_insert(new_pi);
			pi = new_pi;
		}
	}
	umtx_pi_ref(pi);
	umtxq_unlock(&uq->uq_key);

	/*
	 * Care must be exercised when dealing with umtx structure.  It
	 * can fault on any access.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMUTEX_UNOWNED) {
			error = 0;
			break;
		}

		/* The address was invalid. */
		if (owner == -1) {
			error = EFAULT;
			break;
		}

		/* If no one owns it but it is contested try to acquire it. */
		if (owner == UMUTEX_CONTESTED) {
			owner = casuword32(&m->m_owner,
			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

			if (owner == UMUTEX_CONTESTED) {
				umtxq_lock(&uq->uq_key);
				umtxq_busy(&uq->uq_key);
				error = umtx_pi_claim(pi, td);
				umtxq_unbusy(&uq->uq_key);
				umtxq_unlock(&uq->uq_key);
				break;
			}

			/* The address was invalid. */
			if (owner == -1) {
				error = EFAULT;
				break;
			}

			/* If this failed the lock has changed, restart. */
			continue;
		}

		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
		    (owner & ~UMUTEX_CONTESTED) == id) {
			error = EDEADLK;
			break;
		}

		if (try != 0) {
			error = EBUSY;
			break;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			break;

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_unbusy(&uq->uq_key);
			umtxq_unlock(&uq->uq_key);
			error = EFAULT;
			break;
		}

		umtxq_lock(&uq->uq_key);
		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race to the
		 * thread unlocking the umtx.
		 */
		if (old == owner)
			error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED,
				 "umtxpi", timo);
		else {
			umtxq_unbusy(&uq->uq_key);
			umtxq_unlock(&uq->uq_key);
		}
	}

	umtxq_lock(&uq->uq_key);
	umtx_pi_unref(pi);
	umtxq_unlock(&uq->uq_key);

	umtx_key_release(&uq->uq_key);
	return (error);
}

/*
 * Unlock a PI mutex.
 */
static int
do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags)
{
	struct umtx_key key;
	struct umtx_q *uq_first, *uq_first2, *uq_me;
	struct umtx_pi *pi, *pi2;
	uint32_t owner, old, id;
	int error;
	int count;
	int pri;

	id = td->td_tid;
	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMUTEX_CONTESTED) == 0) {
		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count_pi(&key, &uq_first);
	if (uq_first != NULL) {
		mtx_lock_spin(&umtx_lock);
		pi = uq_first->uq_pi_blocked;
		KASSERT(pi != NULL, ("pi == NULL?"));
		if (pi->pi_owner != curthread) {
			mtx_unlock_spin(&umtx_lock);
			umtxq_unbusy(&key);
			umtxq_unlock(&key);
			umtx_key_release(&key);
			/* userland messed the mutex */
			return (EPERM);
		}
		uq_me = curthread->td_umtxq;
		pi->pi_owner = NULL;
		TAILQ_REMOVE(&uq_me->uq_pi_contested, pi, pi_link);
		/* Get the highest-priority thread that is still sleeping. */
		uq_first = TAILQ_FIRST(&pi->pi_blocked);
		while (uq_first != NULL &&
		       (uq_first->uq_flags & UQF_UMTXQ) == 0) {
			uq_first = TAILQ_NEXT(uq_first, uq_lockq);
		}
		pri = PRI_MAX;
		TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
			uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
			if (uq_first2 != NULL) {
				if (pri > UPRI(uq_first2->uq_thread))
					pri = UPRI(uq_first2->uq_thread);
			}
		}
		thread_lock(curthread);
		sched_lend_user_prio(curthread, pri);
		thread_unlock(curthread);
		mtx_unlock_spin(&umtx_lock);
		if (uq_first)
			umtxq_signal_thread(uq_first);
	}
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * no more than one thread is waiting for it.
	 * Otherwise, it must be marked as contested.
	 */
	old = casuword32(&m->m_owner, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);

	umtxq_lock(&key);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}

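/*
 * Priority-protect (PP) mutexes store their ceiling in m_ceilings[0];
 * the kernel maps it to a realtime priority as PRI_MIN_REALTIME +
 * (RTP_PRIO_MAX - ceiling).  The lock word of an unlocked PP mutex is
 * kept at UMUTEX_CONTESTED rather than UMUTEX_UNOWNED so that every
 * lock attempt enters the kernel, which must adjust the owner's
 * priority to the ceiling; see also do_unlock_pp() below.
 */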
/*
 * Lock a PP mutex.
 */
static int
_do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags, int timo,
	int try)
{
	struct umtx_q *uq, *uq2;
	struct umtx_pi *pi;
	uint32_t ceiling;
	uint32_t owner, id;
	int error, pri, old_inherited_pri, su;

	id = td->td_tid;
	uq = td->td_umtxq;
	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
	    &uq->uq_key)) != 0)
		return (error);
	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
	for (;;) {
		old_inherited_pri = uq->uq_inherited_pri;
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		ceiling = RTP_PRIO_MAX - fuword32(&m->m_ceilings[0]);
		if (ceiling > RTP_PRIO_MAX) {
			error = EINVAL;
			goto out;
		}

		mtx_lock_spin(&umtx_lock);
		if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
			mtx_unlock_spin(&umtx_lock);
			error = EINVAL;
			goto out;
		}
		if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
			uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
			thread_lock(td);
			if (uq->uq_inherited_pri < UPRI(td))
				sched_lend_user_prio(td, uq->uq_inherited_pri);
			thread_unlock(td);
		}
		mtx_unlock_spin(&umtx_lock);

		owner = casuword32(&m->m_owner,
		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

		if (owner == UMUTEX_CONTESTED) {
			error = 0;
			break;
		}

		/* The address was invalid. */
		if (owner == -1) {
			error = EFAULT;
			break;
		}

		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
		    (owner & ~UMUTEX_CONTESTED) == id) {
			error = EDEADLK;
			break;
		}

		if (try != 0) {
			error = EBUSY;
			break;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			break;

		umtxq_lock(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		error = umtxq_sleep(uq, "umtxpp", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);

		mtx_lock_spin(&umtx_lock);
		uq->uq_inherited_pri = old_inherited_pri;
		pri = PRI_MAX;
		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
			uq2 = TAILQ_FIRST(&pi->pi_blocked);
			if (uq2 != NULL) {
				if (pri > UPRI(uq2->uq_thread))
					pri = UPRI(uq2->uq_thread);
			}
		}
		if (pri > uq->uq_inherited_pri)
			pri = uq->uq_inherited_pri;
		thread_lock(td);
		sched_lend_user_prio(td, pri);
		thread_unlock(td);
		mtx_unlock_spin(&umtx_lock);
	}

	if (error != 0) {
		mtx_lock_spin(&umtx_lock);
		uq->uq_inherited_pri = old_inherited_pri;
		pri = PRI_MAX;
		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
			uq2 = TAILQ_FIRST(&pi->pi_blocked);
			if (uq2 != NULL) {
				if (pri > UPRI(uq2->uq_thread))
					pri = UPRI(uq2->uq_thread);
			}
		}
		if (pri > uq->uq_inherited_pri)
			pri = uq->uq_inherited_pri;
		thread_lock(td);
		sched_lend_user_prio(td, pri);
		thread_unlock(td);
		mtx_unlock_spin(&umtx_lock);
	}

out:
	umtxq_lock(&uq->uq_key);
	umtxq_unbusy(&uq->uq_key);
	umtxq_unlock(&uq->uq_key);
	umtx_key_release(&uq->uq_key);
	return (error);
}

/*
 * Unlock a PP mutex.
 */
static int
do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags)
{
	struct umtx_key key;
	struct umtx_q *uq, *uq2;
	struct umtx_pi *pi;
	uint32_t owner, id;
	uint32_t rceiling;
	int error, pri, new_inherited_pri, su;

	id = td->td_tid;
	uq = td->td_umtxq;
	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);

	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
	if (error != 0)
		return (error);

	if (rceiling == -1)
		new_inherited_pri = PRI_MAX;
	else {
		rceiling = RTP_PRIO_MAX - rceiling;
		if (rceiling > RTP_PRIO_MAX)
			return (EINVAL);
		new_inherited_pri = PRI_MIN_REALTIME + rceiling;
	}

	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);
	umtxq_lock(&key);
	umtxq_busy(&key);
	umtxq_unlock(&key);
	/*
	 * For a priority protected mutex, always set the unlocked state
	 * to UMUTEX_CONTESTED so that userland always enters the kernel
	 * to lock it.  This is necessary because the thread priority
	 * has to be adjusted for such a mutex.
	 */
	error = suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
		UMUTEX_CONTESTED);

	umtxq_lock(&key);
	if (error == 0)
		umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);

	if (error == -1)
		error = EFAULT;
	else {
		mtx_lock_spin(&umtx_lock);
		if (su != 0)
			uq->uq_inherited_pri = new_inherited_pri;
		pri = PRI_MAX;
		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
			uq2 = TAILQ_FIRST(&pi->pi_blocked);
			if (uq2 != NULL) {
				if (pri > UPRI(uq2->uq_thread))
					pri = UPRI(uq2->uq_thread);
			}
		}
		if (pri > uq->uq_inherited_pri)
			pri = uq->uq_inherited_pri;
		thread_lock(td);
		sched_lend_user_prio(td, pri);
		thread_unlock(td);
		mtx_unlock_spin(&umtx_lock);
	}
	umtx_key_release(&key);
	return (error);
}

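/*
 * do_set_ceiling() presumably backs pthread_mutex_setprioceiling():
 * it first acquires the mutex (either by the CONTESTED transition or
 * because the caller already owns it), stores the new ceiling,
 * optionally copies the old ceiling back to userland, and wakes all
 * sleepers so they can re-evaluate against the new ceiling.
 */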
2110static int
2111do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
2112	uint32_t *old_ceiling)
2113{
2114	struct umtx_q *uq;
2115	uint32_t save_ceiling;
2116	uint32_t owner, id;
2117	uint32_t flags;
2118	int error;
2119
2120	flags = fuword32(&m->m_flags);
2121	if ((flags & UMUTEX_PRIO_PROTECT) == 0)
2122		return (EINVAL);
2123	if (ceiling > RTP_PRIO_MAX)
2124		return (EINVAL);
2125	id = td->td_tid;
2126	uq = td->td_umtxq;
2127	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2128	   &uq->uq_key)) != 0)
2129		return (error);
2130	for (;;) {
2131		umtxq_lock(&uq->uq_key);
2132		umtxq_busy(&uq->uq_key);
2133		umtxq_unlock(&uq->uq_key);
2134
2135		save_ceiling = fuword32(&m->m_ceilings[0]);
2136
2137		owner = casuword32(&m->m_owner,
2138		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
2139
2140		if (owner == UMUTEX_CONTESTED) {
2141			suword32(&m->m_ceilings[0], ceiling);
2142			suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
2143				UMUTEX_CONTESTED);
2144			error = 0;
2145			break;
2146		}
2147
2148		/* The address was invalid. */
2149		if (owner == -1) {
2150			error = EFAULT;
2151			break;
2152		}
2153
2154		if ((owner & ~UMUTEX_CONTESTED) == id) {
2155			suword32(&m->m_ceilings[0], ceiling);
2156			error = 0;
2157			break;
2158		}
2159
2160		/*
2161		 * If we caught a signal, we have retried and now
2162		 * exit immediately.
2163		 */
2164		if (error != 0)
2165			break;
2166
2167		/*
2168		 * The mutex is held by another thread.  Queue ourselves
2169		 * and sleep until the holder unlocks it, then retry the
2170		 * ceiling update from the top of the loop.
2171		 */
2172		umtxq_lock(&uq->uq_key);
2173		umtxq_insert(uq);
2174		umtxq_unbusy(&uq->uq_key);
2175		error = umtxq_sleep(uq, "umtxpp", 0);
2176		umtxq_remove(uq);
2177		umtxq_unlock(&uq->uq_key);
2178	}
2179	umtxq_lock(&uq->uq_key);
2180	if (error == 0)
2181		umtxq_signal(&uq->uq_key, INT_MAX);
2182	umtxq_unbusy(&uq->uq_key);
2183	umtxq_unlock(&uq->uq_key);
2184	umtx_key_release(&uq->uq_key);
2185	if (error == 0 && old_ceiling != NULL)
2186		suword32(old_ceiling, save_ceiling);
2187	return (error);
2188}
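
/*
 * Userland reaches do_set_ceiling() through _umtx_op(2) with
 * UMTX_OP_SET_CEILING.  A minimal sketch (the wrapper name is
 * illustrative, not part of any ABI):
 *
 *	#include <sys/types.h>
 *	#include <sys/umtx.h>
 *
 *	static int
 *	set_ceiling(struct umutex *m, uint32_t ceiling, uint32_t *oldp)
 *	{
 *		return (_umtx_op(m, UMTX_OP_SET_CEILING, ceiling,
 *		    oldp, NULL));
 *	}
 *
 * On success the previous ceiling is stored through oldp, matching
 * the suword32(old_ceiling, save_ceiling) call above.
 */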
2189
2190static int
2191_do_lock_umutex(struct thread *td, struct umutex *m, int flags, int timo,
2192	int mode)
2193{
2194	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2195	case 0:
2196		return (_do_lock_normal(td, m, flags, timo, mode));
2197	case UMUTEX_PRIO_INHERIT:
2198		return (_do_lock_pi(td, m, flags, timo, mode));
2199	case UMUTEX_PRIO_PROTECT:
2200		return (_do_lock_pp(td, m, flags, timo, mode));
2201	}
2202	return (EINVAL);
2203}
2204
2205/*
2206 * Lock a userland POSIX mutex.
2207 */
2208static int
2209do_lock_umutex(struct thread *td, struct umutex *m,
2210	struct timespec *timeout, int mode)
2211{
2212	struct timespec ts, ts2, ts3;
2213	struct timeval tv;
2214	uint32_t flags;
2215	int error;
2216
2217	flags = fuword32(&m->m_flags);
2218	if (flags == -1)
2219		return (EFAULT);
2220
2221	if (timeout == NULL) {
2222		error = _do_lock_umutex(td, m, flags, 0, mode);
2223		/* Mutex locking is restarted if it is interrupted. */
2224		if (error == EINTR && mode != _UMUTEX_WAIT)
2225			error = ERESTART;
2226	} else {
2227		getnanouptime(&ts);
2228		timespecadd(&ts, timeout);
2229		TIMESPEC_TO_TIMEVAL(&tv, timeout);
2230		for (;;) {
2231			error = _do_lock_umutex(td, m, flags, tvtohz(&tv), mode);
2232			if (error != ETIMEDOUT)
2233				break;
2234			getnanouptime(&ts2);
2235			if (timespeccmp(&ts2, &ts, >=)) {
2236				error = ETIMEDOUT;
2237				break;
2238			}
2239			ts3 = ts;
2240			timespecsub(&ts3, &ts2);
2241			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
2242		}
2243		/* Timed-locking is not restarted. */
2244		if (error == ERESTART)
2245			error = EINTR;
2246	}
2247	return (error);
2248}
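
/*
 * The timeout handling above is a pattern repeated throughout this
 * file (see do_rw_rdlock2(), do_rw_wrlock2() and do_sem_wait()): the
 * absolute deadline is computed once against the uptime clock, each
 * sleep gets the remaining time converted to ticks by tvtohz(), and
 * on ETIMEDOUT the deadline is rechecked and the sleep restarted if
 * time remains, since the tick conversion is granular and saturates
 * for very long intervals.
 */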
2249
2250/*
2251 * Unlock a userland POSIX mutex.
2252 */
2253static int
2254do_unlock_umutex(struct thread *td, struct umutex *m)
2255{
2256	uint32_t flags;
2257
2258	flags = fuword32(&m->m_flags);
2259	if (flags == -1)
2260		return (EFAULT);
2261
2262	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2263	case 0:
2264		return (do_unlock_normal(td, m, flags));
2265	case UMUTEX_PRIO_INHERIT:
2266		return (do_unlock_pi(td, m, flags));
2267	case UMUTEX_PRIO_PROTECT:
2268		return (do_unlock_pp(td, m, flags));
2269	}
2270
2271	return (EINVAL);
2272}
2273
2274static int
2275do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m,
2276	struct timespec *timeout, u_long wflags)
2277{
2278	struct umtx_q *uq;
2279	struct timeval tv;
2280	struct timespec cts, ets, tts;
2281	uint32_t flags;
2282	uint32_t clockid;
2283	int error;
2284
2285	uq = td->td_umtxq;
2286	flags = fuword32(&cv->c_flags);
2287	error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key);
2288	if (error != 0)
2289		return (error);
2290
2291	if ((wflags & CVWAIT_CLOCKID) != 0) {
2292		clockid = fuword32(&cv->c_clockid);
2293		if (clockid < CLOCK_REALTIME ||
2294		    clockid >= CLOCK_THREAD_CPUTIME_ID) {
2295			/* Only the predefined, fixed clock ids are valid. */
			umtx_key_release(&uq->uq_key);
2296			return (EINVAL);
2297		}
2298	} else {
2299		clockid = CLOCK_REALTIME;
2300	}
2301
2302	umtxq_lock(&uq->uq_key);
2303	umtxq_busy(&uq->uq_key);
2304	umtxq_insert(uq);
2305	umtxq_unlock(&uq->uq_key);
2306
2307	/*
2308	 * Set c_has_waiters to 1 before releasing the user mutex;
2309	 * avoid dirtying the cache line when it is already set.
2310	 */
2311	if (fuword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters)) == 0)
2312		suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 1);
2313
2314	umtxq_lock(&uq->uq_key);
2315	umtxq_unbusy(&uq->uq_key);
2316	umtxq_unlock(&uq->uq_key);
2317
2318	error = do_unlock_umutex(td, m);
2319
2320	umtxq_lock(&uq->uq_key);
2321	if (error == 0) {
2322		if (timeout == NULL) {
2323			error = umtxq_sleep(uq, "ucond", 0);
2324		} else {
2325			if ((wflags & CVWAIT_ABSTIME) == 0) {
2326				kern_clock_gettime(td, clockid, &ets);
2327				timespecadd(&ets, timeout);
2328				tts = *timeout;
2329			} else { /* absolute time */
2330				ets = *timeout;
2331				tts = *timeout;
2332				kern_clock_gettime(td, clockid, &cts);
2333				timespecsub(&tts, &cts);
2334			}
2335			TIMESPEC_TO_TIMEVAL(&tv, &tts);
2336			for (;;) {
2337				error = umtxq_sleep(uq, "ucond", tvtohz(&tv));
2338				if (error != ETIMEDOUT)
2339					break;
2340				kern_clock_gettime(td, clockid, &cts);
2341				if (timespeccmp(&cts, &ets, >=)) {
2342					error = ETIMEDOUT;
2343					break;
2344				}
2345				tts = ets;
2346				timespecsub(&tts, &cts);
2347				TIMESPEC_TO_TIMEVAL(&tv, &tts);
2348			}
2349		}
2350	}
2351
2352	if ((uq->uq_flags & UQF_UMTXQ) == 0)
2353		error = 0;
2354	else {
2355		/*
2356		 * This must be a timeout, an interruption by a signal,
2357		 * or a spurious wakeup; clear the c_has_waiters flag
2358		 * when necessary.
2359		 */
2360		umtxq_busy(&uq->uq_key);
2361		if ((uq->uq_flags & UQF_UMTXQ) != 0) {
2362			int oldlen = uq->uq_cur_queue->length;
2363			umtxq_remove(uq);
2364			if (oldlen == 1) {
2365				umtxq_unlock(&uq->uq_key);
2366				suword32(
2367				    __DEVOLATILE(uint32_t *,
2368					 &cv->c_has_waiters), 0);
2369				umtxq_lock(&uq->uq_key);
2370			}
2371		}
2372		umtxq_unbusy(&uq->uq_key);
2373		if (error == ERESTART)
2374			error = EINTR;
2375	}
2376
2377	umtxq_unlock(&uq->uq_key);
2378	umtx_key_release(&uq->uq_key);
2379	return (error);
2380}
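
/*
 * A sketch of the userland side of do_cv_wait() (illustrative; the
 * real consumer is the threading library): the condition word, the
 * mutex and an optional timeout travel in a single syscall so that
 * unlocking the mutex and going to sleep are atomic with respect to
 * do_cv_signal():
 *
 *	struct ucond *cv;
 *	struct umutex *m;
 *	struct timespec ts;	(optional; pass NULL to wait forever)
 *
 *	error = _umtx_op(cv, UMTX_OP_CV_WAIT, CVWAIT_CLOCKID, m, &ts);
 *
 * where the val argument carries the CVWAIT_* flags decoded above.
 */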
2381
2382/*
2383 * Signal a userland condition variable.
2384 */
2385static int
2386do_cv_signal(struct thread *td, struct ucond *cv)
2387{
2388	struct umtx_key key;
2389	int error, cnt, nwake;
2390	uint32_t flags;
2391
2392	flags = fuword32(&cv->c_flags);
2393	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2394		return (error);
2395	umtxq_lock(&key);
2396	umtxq_busy(&key);
2397	cnt = umtxq_count(&key);
2398	nwake = umtxq_signal(&key, 1);
2399	if (cnt <= nwake) {
2400		umtxq_unlock(&key);
2401		error = suword32(
2402		    __DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
2403		umtxq_lock(&key);
2404	}
2405	umtxq_unbusy(&key);
2406	umtxq_unlock(&key);
2407	umtx_key_release(&key);
2408	return (error);
2409}
2410
2411static int
2412do_cv_broadcast(struct thread *td, struct ucond *cv)
2413{
2414	struct umtx_key key;
2415	int error;
2416	uint32_t flags;
2417
2418	flags = fuword32(&cv->c_flags);
2419	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2420		return (error);
2421
2422	umtxq_lock(&key);
2423	umtxq_busy(&key);
2424	umtxq_signal(&key, INT_MAX);
2425	umtxq_unlock(&key);
2426
2427	error = suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
2428
2429	umtxq_lock(&key);
2430	umtxq_unbusy(&key);
2431	umtxq_unlock(&key);
2432
2433	umtx_key_release(&key);
2434	return (error);
2435}
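
/*
 * In both do_cv_signal() and do_cv_broadcast() the c_has_waiters word
 * is cleared while the chain is still marked busy: umtxq_busy() keeps
 * a concurrent do_cv_wait() from inserting itself and setting the
 * flag, so the clear cannot race with a fresh sleeper.
 */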
2436
2437static int
2438do_rw_rdlock(struct thread *td, struct urwlock *rwlock, long fflag, int timo)
2439{
2440	struct umtx_q *uq;
2441	uint32_t flags, wrflags;
2442	int32_t state, oldstate;
2443	int32_t blocked_readers;
2444	int error;
2445
2446	uq = td->td_umtxq;
2447	flags = fuword32(&rwlock->rw_flags);
2448	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2449	if (error != 0)
2450		return (error);
2451
2452	wrflags = URWLOCK_WRITE_OWNER;
2453	if (!(fflag & URWLOCK_PREFER_READER) && !(flags & URWLOCK_PREFER_READER))
2454		wrflags |= URWLOCK_WRITE_WAITERS;
2455
2456	for (;;) {
2457		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2458		/* try to lock it */
2459		while (!(state & wrflags)) {
2460			if (__predict_false(URWLOCK_READER_COUNT(state) == URWLOCK_MAX_READERS)) {
2461				umtx_key_release(&uq->uq_key);
2462				return (EAGAIN);
2463			}
2464			oldstate = casuword32(&rwlock->rw_state, state, state + 1);
2465			if (oldstate == state) {
2466				umtx_key_release(&uq->uq_key);
2467				return (0);
2468			}
2469			state = oldstate;
2470		}
2471
2472		if (error)
2473			break;
2474
2475		/* grab monitor lock */
2476		umtxq_lock(&uq->uq_key);
2477		umtxq_busy(&uq->uq_key);
2478		umtxq_unlock(&uq->uq_key);
2479
2480		/*
2481		 * re-read the state, in case it changed between the try-lock above
2482		 * and the check below
2483		 */
2484		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2485
2486		/* set read contention bit */
2487		while ((state & wrflags) && !(state & URWLOCK_READ_WAITERS)) {
2488			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_READ_WAITERS);
2489			if (oldstate == state)
2490				goto sleep;
2491			state = oldstate;
2492		}
2493
2494		/* The state changed while we were setting the flag; restart. */
2495		if (!(state & wrflags)) {
2496			umtxq_lock(&uq->uq_key);
2497			umtxq_unbusy(&uq->uq_key);
2498			umtxq_unlock(&uq->uq_key);
2499			continue;
2500		}
2501
2502sleep:
2503		/* The contention bit is set; bump the read-waiter count before sleeping. */
2504		blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2505		suword32(&rwlock->rw_blocked_readers, blocked_readers+1);
2506
2507		while (state & wrflags) {
2508			umtxq_lock(&uq->uq_key);
2509			umtxq_insert(uq);
2510			umtxq_unbusy(&uq->uq_key);
2511
2512			error = umtxq_sleep(uq, "urdlck", timo);
2513
2514			umtxq_busy(&uq->uq_key);
2515			umtxq_remove(uq);
2516			umtxq_unlock(&uq->uq_key);
2517			if (error)
2518				break;
2519			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2520		}
2521
2522		/* Decrease the read-waiter count; the last waiter clears the contention bit. */
2523		blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2524		suword32(&rwlock->rw_blocked_readers, blocked_readers-1);
2525		if (blocked_readers == 1) {
2526			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2527			for (;;) {
2528				oldstate = casuword32(&rwlock->rw_state, state,
2529					 state & ~URWLOCK_READ_WAITERS);
2530				if (oldstate == state)
2531					break;
2532				state = oldstate;
2533			}
2534		}
2535
2536		umtxq_lock(&uq->uq_key);
2537		umtxq_unbusy(&uq->uq_key);
2538		umtxq_unlock(&uq->uq_key);
2539	}
2540	umtx_key_release(&uq->uq_key);
2541	return (error);
2542}
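
/*
 * For reference, rw_state is a single 32-bit word (see sys/umtx.h)
 * combining the reader count with three flag bits, roughly:
 *
 *	URWLOCK_WRITE_OWNER	- a writer owns the lock
 *	URWLOCK_WRITE_WAITERS	- one or more writers are blocked
 *	URWLOCK_READ_WAITERS	- one or more readers are blocked
 *	URWLOCK_READER_COUNT(s)	- low bits, the number of read owners
 *
 * which is why a read lock is acquired by the "state + 1" cas above
 * and released by "state - 1" in do_rw_unlock().
 */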
2543
2544static int
2545do_rw_rdlock2(struct thread *td, void *obj, long val, struct timespec *timeout)
2546{
2547	struct timespec ts, ts2, ts3;
2548	struct timeval tv;
2549	int error;
2550
2551	getnanouptime(&ts);
2552	timespecadd(&ts, timeout);
2553	TIMESPEC_TO_TIMEVAL(&tv, timeout);
2554	for (;;) {
2555		error = do_rw_rdlock(td, obj, val, tvtohz(&tv));
2556		if (error != ETIMEDOUT)
2557			break;
2558		getnanouptime(&ts2);
2559		if (timespeccmp(&ts2, &ts, >=)) {
2560			error = ETIMEDOUT;
2561			break;
2562		}
2563		ts3 = ts;
2564		timespecsub(&ts3, &ts2);
2565		TIMESPEC_TO_TIMEVAL(&tv, &ts3);
2566	}
2567	if (error == ERESTART)
2568		error = EINTR;
2569	return (error);
2570}
2571
2572static int
2573do_rw_wrlock(struct thread *td, struct urwlock *rwlock, int timo)
2574{
2575	struct umtx_q *uq;
2576	uint32_t flags;
2577	int32_t state, oldstate;
2578	int32_t blocked_writers;
2579	int32_t blocked_readers;
2580	int error;
2581
2582	uq = td->td_umtxq;
2583	flags = fuword32(&rwlock->rw_flags);
2584	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2585	if (error != 0)
2586		return (error);
2587
2588	blocked_readers = 0;
2589	for (;;) {
2590		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2591		while (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2592			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_OWNER);
2593			if (oldstate == state) {
2594				umtx_key_release(&uq->uq_key);
2595				return (0);
2596			}
2597			state = oldstate;
2598		}
2599
2600		if (error) {
2601			if (!(state & (URWLOCK_WRITE_OWNER|URWLOCK_WRITE_WAITERS)) &&
2602			    blocked_readers != 0) {
2603				umtxq_lock(&uq->uq_key);
2604				umtxq_busy(&uq->uq_key);
2605				umtxq_signal_queue(&uq->uq_key, INT_MAX, UMTX_SHARED_QUEUE);
2606				umtxq_unbusy(&uq->uq_key);
2607				umtxq_unlock(&uq->uq_key);
2608			}
2609
2610			break;
2611		}
2612
2613		/* grab monitor lock */
2614		umtxq_lock(&uq->uq_key);
2615		umtxq_busy(&uq->uq_key);
2616		umtxq_unlock(&uq->uq_key);
2617
2618		/*
2619		 * re-read the state, in case it changed between the try-lock above
2620		 * and the check below
2621		 */
2622		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2623
2624		while (((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) &&
2625		       (state & URWLOCK_WRITE_WAITERS) == 0) {
2626			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_WAITERS);
2627			if (oldstate == state)
2628				goto sleep;
2629			state = oldstate;
2630		}
2631
2632		if (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2633			umtxq_lock(&uq->uq_key);
2634			umtxq_unbusy(&uq->uq_key);
2635			umtxq_unlock(&uq->uq_key);
2636			continue;
2637		}
2638sleep:
2639		blocked_writers = fuword32(&rwlock->rw_blocked_writers);
2640		suword32(&rwlock->rw_blocked_writers, blocked_writers+1);
2641
2642		while ((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) {
2643			umtxq_lock(&uq->uq_key);
2644			umtxq_insert_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2645			umtxq_unbusy(&uq->uq_key);
2646
2647			error = umtxq_sleep(uq, "uwrlck", timo);
2648
2649			umtxq_busy(&uq->uq_key);
2650			umtxq_remove_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2651			umtxq_unlock(&uq->uq_key);
2652			if (error)
2653				break;
2654			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2655		}
2656
2657		blocked_writers = fuword32(&rwlock->rw_blocked_writers);
2658		suword32(&rwlock->rw_blocked_writers, blocked_writers-1);
2659		if (blocked_writers == 1) {
2660			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2661			for (;;) {
2662				oldstate = casuword32(&rwlock->rw_state, state,
2663					 state & ~URWLOCK_WRITE_WAITERS);
2664				if (oldstate == state)
2665					break;
2666				state = oldstate;
2667			}
2668			blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2669		} else
2670			blocked_readers = 0;
2671
2672		umtxq_lock(&uq->uq_key);
2673		umtxq_unbusy(&uq->uq_key);
2674		umtxq_unlock(&uq->uq_key);
2675	}
2676
2677	umtx_key_release(&uq->uq_key);
2678	return (error);
2679}
2680
2681static int
2682do_rw_wrlock2(struct thread *td, void *obj, struct timespec *timeout)
2683{
2684	struct timespec ts, ts2, ts3;
2685	struct timeval tv;
2686	int error;
2687
2688	getnanouptime(&ts);
2689	timespecadd(&ts, timeout);
2690	TIMESPEC_TO_TIMEVAL(&tv, timeout);
2691	for (;;) {
2692		error = do_rw_wrlock(td, obj, tvtohz(&tv));
2693		if (error != ETIMEDOUT)
2694			break;
2695		getnanouptime(&ts2);
2696		if (timespeccmp(&ts2, &ts, >=)) {
2697			error = ETIMEDOUT;
2698			break;
2699		}
2700		ts3 = ts;
2701		timespecsub(&ts3, &ts2);
2702		TIMESPEC_TO_TIMEVAL(&tv, &ts3);
2703	}
2704	if (error == ERESTART)
2705		error = EINTR;
2706	return (error);
2707}
2708
2709static int
2710do_rw_unlock(struct thread *td, struct urwlock *rwlock)
2711{
2712	struct umtx_q *uq;
2713	uint32_t flags;
2714	int32_t state, oldstate;
2715	int error, q, count;
2716
2717	uq = td->td_umtxq;
2718	flags = fuword32(&rwlock->rw_flags);
2719	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2720	if (error != 0)
2721		return (error);
2722
2723	state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2724	if (state & URWLOCK_WRITE_OWNER) {
2725		for (;;) {
2726			oldstate = casuword32(&rwlock->rw_state, state,
2727				state & ~URWLOCK_WRITE_OWNER);
2728			if (oldstate != state) {
2729				state = oldstate;
2730				if (!(oldstate & URWLOCK_WRITE_OWNER)) {
2731					error = EPERM;
2732					goto out;
2733				}
2734			} else
2735				break;
2736		}
2737	} else if (URWLOCK_READER_COUNT(state) != 0) {
2738		for (;;) {
2739			oldstate = casuword32(&rwlock->rw_state, state,
2740				state - 1);
2741			if (oldstate != state) {
2742				state = oldstate;
2743				if (URWLOCK_READER_COUNT(oldstate) == 0) {
2744					error = EPERM;
2745					goto out;
2746				}
2747			}
2748			else
2749				break;
2750		}
2751	} else {
2752		error = EPERM;
2753		goto out;
2754	}
2755
2756	count = 0;
2757
2758	if (!(flags & URWLOCK_PREFER_READER)) {
2759		if (state & URWLOCK_WRITE_WAITERS) {
2760			count = 1;
2761			q = UMTX_EXCLUSIVE_QUEUE;
2762		} else if (state & URWLOCK_READ_WAITERS) {
2763			count = INT_MAX;
2764			q = UMTX_SHARED_QUEUE;
2765		}
2766	} else {
2767		if (state & URWLOCK_READ_WAITERS) {
2768			count = INT_MAX;
2769			q = UMTX_SHARED_QUEUE;
2770		} else if (state & URWLOCK_WRITE_WAITERS) {
2771			count = 1;
2772			q = UMTX_EXCLUSIVE_QUEUE;
2773		}
2774	}
2775
2776	if (count) {
2777		umtxq_lock(&uq->uq_key);
2778		umtxq_busy(&uq->uq_key);
2779		umtxq_signal_queue(&uq->uq_key, count, q);
2780		umtxq_unbusy(&uq->uq_key);
2781		umtxq_unlock(&uq->uq_key);
2782	}
2783out:
2784	umtx_key_release(&uq->uq_key);
2785	return (error);
2786}
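
/*
 * The wake-up policy above mirrors the acquisition policy: by default
 * one blocked writer (the exclusive queue) is preferred, and all
 * blocked readers are released only when no writer is waiting;
 * URWLOCK_PREFER_READER inverts that order.
 */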
2787
2788static int
2789do_sem_wait(struct thread *td, struct _usem *sem, struct timespec *timeout)
2790{
2791	struct umtx_q *uq;
2792	struct timeval tv;
2793	struct timespec cts, ets, tts;
2794	uint32_t flags, count;
2795	int error;
2796
2797	uq = td->td_umtxq;
2798	flags = fuword32(&sem->_flags);
2799	error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key);
2800	if (error != 0)
2801		return (error);
2802	umtxq_lock(&uq->uq_key);
2803	umtxq_busy(&uq->uq_key);
2804	umtxq_insert(uq);
2805	umtxq_unlock(&uq->uq_key);
2806
2807	if (fuword32(__DEVOLATILE(uint32_t *, &sem->_has_waiters)) == 0)
2808		casuword32(__DEVOLATILE(uint32_t *, &sem->_has_waiters), 0, 1);
2809
2810	count = fuword32(__DEVOLATILE(uint32_t *, &sem->_count));
2811	if (count != 0) {
2812		umtxq_lock(&uq->uq_key);
2813		umtxq_unbusy(&uq->uq_key);
2814		umtxq_remove(uq);
2815		umtxq_unlock(&uq->uq_key);
2816		umtx_key_release(&uq->uq_key);
2817		return (0);
2818	}
2819
2820	umtxq_lock(&uq->uq_key);
2821	umtxq_unbusy(&uq->uq_key);
2822	umtxq_unlock(&uq->uq_key);
2823
2824	umtxq_lock(&uq->uq_key);
2825	if (timeout == NULL) {
2826		error = umtxq_sleep(uq, "usem", 0);
2827	} else {
2828		getnanouptime(&ets);
2829		timespecadd(&ets, timeout);
2830		TIMESPEC_TO_TIMEVAL(&tv, timeout);
2831		for (;;) {
2832			error = umtxq_sleep(uq, "usem", tvtohz(&tv));
2833			if (error != ETIMEDOUT)
2834				break;
2835			getnanouptime(&cts);
2836			if (timespeccmp(&cts, &ets, >=)) {
2837				error = ETIMEDOUT;
2838				break;
2839			}
2840			tts = ets;
2841			timespecsub(&tts, &cts);
2842			TIMESPEC_TO_TIMEVAL(&tv, &tts);
2843		}
2844	}
2845
2846	if ((uq->uq_flags & UQF_UMTXQ) == 0)
2847		error = 0;
2848	else {
2849		umtxq_remove(uq);
2850		if (error == ERESTART)
2851			error = EINTR;
2852	}
2853	umtxq_unlock(&uq->uq_key);
2854	umtx_key_release(&uq->uq_key);
2855	return (error);
2856}
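
/*
 * A sketch of the assumed userland protocol for struct _usem (this is
 * how a sem_post() style fast path is expected to use these hooks, not
 * something enforced here): the post side increments _count atomically
 * and only enters the kernel when waiters are advertised, e.g.:
 *
 *	atomic_add_rel_int(&sem->_count, 1);
 *	if (sem->_has_waiters)
 *		_umtx_op(sem, UMTX_OP_SEM_WAKE, 0, NULL, NULL);
 *
 * do_sem_wait() re-checks _count after setting _has_waiters above,
 * which closes the race with a post happening in between.
 */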
2857
2858	/*
2859	 * Wake up waiters on a userland semaphore.
2860	 */
2861static int
2862do_sem_wake(struct thread *td, struct _usem *sem)
2863{
2864	struct umtx_key key;
2865	int error, cnt, nwake;
2866	uint32_t flags;
2867
2868	flags = fuword32(&sem->_flags);
2869	if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0)
2870		return (error);
2871	umtxq_lock(&key);
2872	umtxq_busy(&key);
2873	cnt = umtxq_count(&key);
2874	nwake = umtxq_signal(&key, 1);
2875	if (cnt <= nwake) {
2876		umtxq_unlock(&key);
2877		error = suword32(
2878		    __DEVOLATILE(uint32_t *, &sem->_has_waiters), 0);
2879		umtxq_lock(&key);
2880	}
2881	umtxq_unbusy(&key);
2882	umtxq_unlock(&key);
2883	umtx_key_release(&key);
2884	return (error);
2885}
2886
2887int
2888sys__umtx_lock(struct thread *td, struct _umtx_lock_args *uap)
2889    /* struct umtx *umtx */
2890{
2891	return (_do_lock_umtx(td, uap->umtx, td->td_tid, 0));
2892}
2893
2894int
2895sys__umtx_unlock(struct thread *td, struct _umtx_unlock_args *uap)
2896    /* struct umtx *umtx */
2897{
2898	return (do_unlock_umtx(td, uap->umtx, td->td_tid));
2899}
2900
2901static int
2902__umtx_op_lock_umtx(struct thread *td, struct _umtx_op_args *uap)
2903{
2904	struct timespec *ts, timeout;
2905	int error;
2906
2907	/* Allow a null timespec (wait forever). */
2908	if (uap->uaddr2 == NULL)
2909		ts = NULL;
2910	else {
2911		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2912		if (error != 0)
2913			return (error);
2914		if (timeout.tv_nsec >= 1000000000 ||
2915		    timeout.tv_nsec < 0) {
2916			return (EINVAL);
2917		}
2918		ts = &timeout;
2919	}
2920	return (do_lock_umtx(td, uap->obj, uap->val, ts));
2921}
2922
2923static int
2924__umtx_op_unlock_umtx(struct thread *td, struct _umtx_op_args *uap)
2925{
2926	return (do_unlock_umtx(td, uap->obj, uap->val));
2927}
2928
2929static int
2930__umtx_op_wait(struct thread *td, struct _umtx_op_args *uap)
2931{
2932	struct timespec *ts, timeout;
2933	int error;
2934
2935	if (uap->uaddr2 == NULL)
2936		ts = NULL;
2937	else {
2938		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2939		if (error != 0)
2940			return (error);
2941		if (timeout.tv_nsec >= 1000000000 ||
2942		    timeout.tv_nsec < 0)
2943			return (EINVAL);
2944		ts = &timeout;
2945	}
2946	return (do_wait(td, uap->obj, uap->val, ts, 0, 0));
2947}
2948
2949static int
2950__umtx_op_wait_uint(struct thread *td, struct _umtx_op_args *uap)
2951{
2952	struct timespec *ts, timeout;
2953	int error;
2954
2955	if (uap->uaddr2 == NULL)
2956		ts = NULL;
2957	else {
2958		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2959		if (error != 0)
2960			return (error);
2961		if (timeout.tv_nsec >= 1000000000 ||
2962		    timeout.tv_nsec < 0)
2963			return (EINVAL);
2964		ts = &timeout;
2965	}
2966	return (do_wait(td, uap->obj, uap->val, ts, 1, 0));
2967}
2968
2969static int
2970__umtx_op_wait_uint_private(struct thread *td, struct _umtx_op_args *uap)
2971{
2972	struct timespec *ts, timeout;
2973	int error;
2974
2975	if (uap->uaddr2 == NULL)
2976		ts = NULL;
2977	else {
2978		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2979		if (error != 0)
2980			return (error);
2981		if (timeout.tv_nsec >= 1000000000 ||
2982		    timeout.tv_nsec < 0)
2983			return (EINVAL);
2984		ts = &timeout;
2985	}
2986	return (do_wait(td, uap->obj, uap->val, ts, 1, 1));
2987}
2988
2989static int
2990__umtx_op_wake(struct thread *td, struct _umtx_op_args *uap)
2991{
2992	return (kern_umtx_wake(td, uap->obj, uap->val, 0));
2993}
2994
2995#define BATCH_SIZE	128
2996static int
2997__umtx_op_nwake_private(struct thread *td, struct _umtx_op_args *uap)
2998{
2999	int count = uap->val;
3000	void *uaddrs[BATCH_SIZE];
3001	char **upp = (char **)uap->obj;
3002	int tocopy;
3003	int error = 0;
3004	int i, pos = 0;
3005
3006	while (count > 0) {
3007		tocopy = count;
3008		if (tocopy > BATCH_SIZE)
3009			tocopy = BATCH_SIZE;
3010		error = copyin(upp+pos, uaddrs, tocopy * sizeof(char *));
3011		if (error != 0)
3012			break;
3013		for (i = 0; i < tocopy; ++i)
3014			kern_umtx_wake(td, uaddrs[i], INT_MAX, 1);
3015		count -= tocopy;
3016		pos += tocopy;
3017	}
3018	return (error);
3019}
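
/*
 * UMTX_OP_NWAKE_PRIVATE batches wake-ups: uap->obj points at an array
 * of uap->val userspace addresses, each of which receives a private
 * INT_MAX wake.  A hypothetical caller (names illustrative):
 *
 *	void *addrs[3] = { &w1, &w2, &w3 };
 *	_umtx_op(addrs, UMTX_OP_NWAKE_PRIVATE, 3, NULL, NULL);
 *
 * turning N wake syscalls into one, e.g. when a thread defers wake-ups
 * until after it has dropped the locks it holds.
 */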
3020
3021static int
3022__umtx_op_wake_private(struct thread *td, struct _umtx_op_args *uap)
3023{
3024	return (kern_umtx_wake(td, uap->obj, uap->val, 1));
3025}
3026
3027static int
3028__umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap)
3029{
3030	struct timespec *ts, timeout;
3031	int error;
3032
3033	/* Allow a null timespec (wait forever). */
3034	if (uap->uaddr2 == NULL)
3035		ts = NULL;
3036	else {
3037		error = copyin(uap->uaddr2, &timeout,
3038		    sizeof(timeout));
3039		if (error != 0)
3040			return (error);
3041		if (timeout.tv_nsec >= 1000000000 ||
3042		    timeout.tv_nsec < 0) {
3043			return (EINVAL);
3044		}
3045		ts = &timeout;
3046	}
3047	return (do_lock_umutex(td, uap->obj, ts, 0));
3048}
3049
3050static int
3051__umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap)
3052{
3053	return (do_lock_umutex(td, uap->obj, NULL, _UMUTEX_TRY));
3054}
3055
3056static int
3057__umtx_op_wait_umutex(struct thread *td, struct _umtx_op_args *uap)
3058{
3059	struct timespec *ts, timeout;
3060	int error;
3061
3062	/* Allow a null timespec (wait forever). */
3063	if (uap->uaddr2 == NULL)
3064		ts = NULL;
3065	else {
3066		error = copyin(uap->uaddr2, &timeout,
3067		    sizeof(timeout));
3068		if (error != 0)
3069			return (error);
3070		if (timeout.tv_nsec >= 1000000000 ||
3071		    timeout.tv_nsec < 0) {
3072			return (EINVAL);
3073		}
3074		ts = &timeout;
3075	}
3076	return (do_lock_umutex(td, uap->obj, ts, _UMUTEX_WAIT));
3077}
3078
3079static int
3080__umtx_op_wake_umutex(struct thread *td, struct _umtx_op_args *uap)
3081{
3082	return (do_wake_umutex(td, uap->obj));
3083}
3084
3085static int
3086__umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap)
3087{
3088	return (do_unlock_umutex(td, uap->obj));
3089}
3090
3091static int
3092__umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap)
3093{
3094	return (do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1));
3095}
3096
3097static int
3098__umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap)
3099{
3100	struct timespec *ts, timeout;
3101	int error;
3102
3103	/* Allow a null timespec (wait forever). */
3104	if (uap->uaddr2 == NULL)
3105		ts = NULL;
3106	else {
3107		error = copyin(uap->uaddr2, &timeout,
3108		    sizeof(timeout));
3109		if (error != 0)
3110			return (error);
3111		if (timeout.tv_nsec >= 1000000000 ||
3112		    timeout.tv_nsec < 0) {
3113			return (EINVAL);
3114		}
3115		ts = &timeout;
3116	}
3117	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3118}
3119
3120static int
3121__umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap)
3122{
3123	return (do_cv_signal(td, uap->obj));
3124}
3125
3126static int
3127__umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap)
3128{
3129	return (do_cv_broadcast(td, uap->obj));
3130}
3131
3132static int
3133__umtx_op_rw_rdlock(struct thread *td, struct _umtx_op_args *uap)
3134{
3135	struct timespec timeout;
3136	int error;
3137
3138	/* Allow a null timespec (wait forever). */
3139	if (uap->uaddr2 == NULL) {
3140		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3141	} else {
3142		error = copyin(uap->uaddr2, &timeout,
3143		    sizeof(timeout));
3144		if (error != 0)
3145			return (error);
3146		if (timeout.tv_nsec >= 1000000000 ||
3147		    timeout.tv_nsec < 0) {
3148			return (EINVAL);
3149		}
3150		error = do_rw_rdlock2(td, uap->obj, uap->val, &timeout);
3151	}
3152	return (error);
3153}
3154
3155static int
3156__umtx_op_rw_wrlock(struct thread *td, struct _umtx_op_args *uap)
3157{
3158	struct timespec timeout;
3159	int error;
3160
3161	/* Allow a null timespec (wait forever). */
3162	if (uap->uaddr2 == NULL) {
3163		error = do_rw_wrlock(td, uap->obj, 0);
3164	} else {
3165		error = copyin(uap->uaddr2, &timeout,
3166		    sizeof(timeout));
3167		if (error != 0)
3168			return (error);
3169		if (timeout.tv_nsec >= 1000000000 ||
3170		    timeout.tv_nsec < 0) {
3171			return (EINVAL);
3172		}
3173
3174		error = do_rw_wrlock2(td, uap->obj, &timeout);
3175	}
3176	return (error);
3177}
3178
3179static int
3180__umtx_op_rw_unlock(struct thread *td, struct _umtx_op_args *uap)
3181{
3182	return (do_rw_unlock(td, uap->obj));
3183}
3184
3185static int
3186__umtx_op_sem_wait(struct thread *td, struct _umtx_op_args *uap)
3187{
3188	struct timespec *ts, timeout;
3189	int error;
3190
3191	/* Allow a null timespec (wait forever). */
3192	if (uap->uaddr2 == NULL)
3193		ts = NULL;
3194	else {
3195		error = copyin(uap->uaddr2, &timeout,
3196		    sizeof(timeout));
3197		if (error != 0)
3198			return (error);
3199		if (timeout.tv_nsec >= 1000000000 ||
3200		    timeout.tv_nsec < 0) {
3201			return (EINVAL);
3202		}
3203		ts = &timeout;
3204	}
3205	return (do_sem_wait(td, uap->obj, ts));
3206}
3207
3208static int
3209__umtx_op_sem_wake(struct thread *td, struct _umtx_op_args *uap)
3210{
3211	return (do_sem_wake(td, uap->obj));
3212}
3213
3214typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap);
3215
3216static _umtx_op_func op_table[] = {
3217	__umtx_op_lock_umtx,		/* UMTX_OP_LOCK */
3218	__umtx_op_unlock_umtx,		/* UMTX_OP_UNLOCK */
3219	__umtx_op_wait,			/* UMTX_OP_WAIT */
3220	__umtx_op_wake,			/* UMTX_OP_WAKE */
3221	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
3222	__umtx_op_lock_umutex,		/* UMTX_OP_MUTEX_LOCK */
3223	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
3224	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
3225	__umtx_op_cv_wait,		/* UMTX_OP_CV_WAIT */
3226	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
3227	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
3228	__umtx_op_wait_uint,		/* UMTX_OP_WAIT_UINT */
3229	__umtx_op_rw_rdlock,		/* UMTX_OP_RW_RDLOCK */
3230	__umtx_op_rw_wrlock,		/* UMTX_OP_RW_WRLOCK */
3231	__umtx_op_rw_unlock,		/* UMTX_OP_RW_UNLOCK */
3232	__umtx_op_wait_uint_private,	/* UMTX_OP_WAIT_UINT_PRIVATE */
3233	__umtx_op_wake_private,		/* UMTX_OP_WAKE_PRIVATE */
3234	__umtx_op_wait_umutex,		/* UMTX_OP_UMUTEX_WAIT */
3235	__umtx_op_wake_umutex,		/* UMTX_OP_UMUTEX_WAKE */
3236	__umtx_op_sem_wait,		/* UMTX_OP_SEM_WAIT */
3237	__umtx_op_sem_wake,		/* UMTX_OP_SEM_WAKE */
3238	__umtx_op_nwake_private		/* UMTX_OP_NWAKE_PRIVATE */
3239};
3240
3241int
3242sys__umtx_op(struct thread *td, struct _umtx_op_args *uap)
3243{
3244	if ((unsigned)uap->op < UMTX_OP_MAX)
3245		return (*op_table[uap->op])(td, uap);
3246	return (EINVAL);
3247}
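
/*
 * Userland enters this dispatcher through the _umtx_op(2) syscall:
 *
 *	int _umtx_op(void *obj, int op, u_long val,
 *	    void *uaddr, void *uaddr2);
 *
 * For example, a minimal wait/wake pair on a plain 32-bit word (a
 * sketch only; error handling omitted):
 *
 *	_umtx_op(&word, UMTX_OP_WAIT_UINT, expected, NULL, NULL);
 *	_umtx_op(&word, UMTX_OP_WAKE, 1, NULL, NULL);
 *
 * The first call sleeps if word still equals "expected"; the second
 * wakes at most one waiter.
 */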
3248
3249#ifdef COMPAT_FREEBSD32
3250int
3251freebsd32_umtx_lock(struct thread *td, struct freebsd32_umtx_lock_args *uap)
3252    /* struct umtx *umtx */
3253{
3254	return (do_lock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid, NULL));
3255}
3256
3257int
3258freebsd32_umtx_unlock(struct thread *td, struct freebsd32_umtx_unlock_args *uap)
3259    /* struct umtx *umtx */
3260{
3261	return (do_unlock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid));
3262}
3263
3264struct timespec32 {
3265	uint32_t tv_sec;
3266	uint32_t tv_nsec;
3267};
3268
3269static inline int
3270copyin_timeout32(void *addr, struct timespec *tsp)
3271{
3272	struct timespec32 ts32;
3273	int error;
3274
3275	error = copyin(addr, &ts32, sizeof(struct timespec32));
3276	if (error == 0) {
3277		tsp->tv_sec = ts32.tv_sec;
3278		tsp->tv_nsec = ts32.tv_nsec;
3279	}
3280	return (error);
3281}
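
/*
 * The unsigned 32-bit fields widen into the native struct timespec
 * here; in effect a negative tv_nsec from a 32-bit process shows up
 * as a huge positive value and is rejected by the tv_nsec range
 * checks in the callers below.
 */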
3282
3283static int
3284__umtx_op_lock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
3285{
3286	struct timespec *ts, timeout;
3287	int error;
3288
3289	/* Allow a null timespec (wait forever). */
3290	if (uap->uaddr2 == NULL)
3291		ts = NULL;
3292	else {
3293		error = copyin_timeout32(uap->uaddr2, &timeout);
3294		if (error != 0)
3295			return (error);
3296		if (timeout.tv_nsec >= 1000000000 ||
3297		    timeout.tv_nsec < 0) {
3298			return (EINVAL);
3299		}
3300		ts = &timeout;
3301	}
3302	return (do_lock_umtx32(td, uap->obj, uap->val, ts));
3303}
3304
3305static int
3306__umtx_op_unlock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
3307{
3308	return (do_unlock_umtx32(td, uap->obj, (uint32_t)uap->val));
3309}
3310
3311static int
3312__umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3313{
3314	struct timespec *ts, timeout;
3315	int error;
3316
3317	if (uap->uaddr2 == NULL)
3318		ts = NULL;
3319	else {
3320		error = copyin_timeout32(uap->uaddr2, &timeout);
3321		if (error != 0)
3322			return (error);
3323		if (timeout.tv_nsec >= 1000000000 ||
3324		    timeout.tv_nsec < 0)
3325			return (EINVAL);
3326		ts = &timeout;
3327	}
3328	return (do_wait(td, uap->obj, uap->val, ts, 1, 0));
3329}
3330
3331static int
3332__umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3333{
3334	struct timespec *ts, timeout;
3335	int error;
3336
3337	/* Allow a null timespec (wait forever). */
3338	if (uap->uaddr2 == NULL)
3339		ts = NULL;
3340	else {
3341		error = copyin_timeout32(uap->uaddr2, &timeout);
3342		if (error != 0)
3343			return (error);
3344		if (timeout.tv_nsec >= 1000000000 ||
3345		    timeout.tv_nsec < 0)
3346			return (EINVAL);
3347		ts = &timeout;
3348	}
3349	return (do_lock_umutex(td, uap->obj, ts, 0));
3350}
3351
3352static int
3353__umtx_op_wait_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3354{
3355	struct timespec *ts, timeout;
3356	int error;
3357
3358	/* Allow a null timespec (wait forever). */
3359	if (uap->uaddr2 == NULL)
3360		ts = NULL;
3361	else {
3362		error = copyin_timeout32(uap->uaddr2, &timeout);
3363		if (error != 0)
3364			return (error);
3365		if (timeout.tv_nsec >= 1000000000 ||
3366		    timeout.tv_nsec < 0)
3367			return (EINVAL);
3368		ts = &timeout;
3369	}
3370	return (do_lock_umutex(td, uap->obj, ts, _UMUTEX_WAIT));
3371}
3372
3373static int
3374__umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3375{
3376	struct timespec *ts, timeout;
3377	int error;
3378
3379	/* Allow a null timespec (wait forever). */
3380	if (uap->uaddr2 == NULL)
3381		ts = NULL;
3382	else {
3383		error = copyin_timeout32(uap->uaddr2, &timeout);
3384		if (error != 0)
3385			return (error);
3386		if (timeout.tv_nsec >= 1000000000 ||
3387		    timeout.tv_nsec < 0)
3388			return (EINVAL);
3389		ts = &timeout;
3390	}
3391	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3392}
3393
3394static int
3395__umtx_op_rw_rdlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3396{
3397	struct timespec timeout;
3398	int error;
3399
3400	/* Allow a null timespec (wait forever). */
3401	if (uap->uaddr2 == NULL) {
3402		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3403	} else {
3404		error = copyin_timeout32(uap->uaddr2, &timeout);
3405		if (error != 0)
3406			return (error);
3407		if (timeout.tv_nsec >= 1000000000 ||
3408		    timeout.tv_nsec < 0) {
3409			return (EINVAL);
3410		}
3411		error = do_rw_rdlock2(td, uap->obj, uap->val, &timeout);
3412	}
3413	return (error);
3414}
3415
3416static int
3417__umtx_op_rw_wrlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3418{
3419	struct timespec timeout;
3420	int error;
3421
3422	/* Allow a null timespec (wait forever). */
3423	if (uap->uaddr2 == NULL) {
3424		error = do_rw_wrlock(td, uap->obj, 0);
3425	} else {
3426		error = copyin_timeout32(uap->uaddr2, &timeout);
3427		if (error != 0)
3428			return (error);
3429		if (timeout.tv_nsec >= 1000000000 ||
3430		    timeout.tv_nsec < 0) {
3431			return (EINVAL);
3432		}
3433
3434		error = do_rw_wrlock2(td, uap->obj, &timeout);
3435	}
3436	return (error);
3437}
3438
3439static int
3440__umtx_op_wait_uint_private_compat32(struct thread *td, struct _umtx_op_args *uap)
3441{
3442	struct timespec *ts, timeout;
3443	int error;
3444
3445	if (uap->uaddr2 == NULL)
3446		ts = NULL;
3447	else {
3448		error = copyin_timeout32(uap->uaddr2, &timeout);
3449		if (error != 0)
3450			return (error);
3451		if (timeout.tv_nsec >= 1000000000 ||
3452		    timeout.tv_nsec < 0)
3453			return (EINVAL);
3454		ts = &timeout;
3455	}
3456	return (do_wait(td, uap->obj, uap->val, ts, 1, 1));
3457}
3458
3459static int
3460__umtx_op_sem_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3461{
3462	struct timespec *ts, timeout;
3463	int error;
3464
3465	/* Allow a null timespec (wait forever). */
3466	if (uap->uaddr2 == NULL)
3467		ts = NULL;
3468	else {
3469		error = copyin_timeout32(uap->uaddr2, &timeout);
3470		if (error != 0)
3471			return (error);
3472		if (timeout.tv_nsec >= 1000000000 ||
3473		    timeout.tv_nsec < 0)
3474			return (EINVAL);
3475		ts = &timeout;
3476	}
3477	return (do_sem_wait(td, uap->obj, ts));
3478}
3479
3480static int
3481__umtx_op_nwake_private32(struct thread *td, struct _umtx_op_args *uap)
3482{
3483	int count = uap->val;
3484	uint32_t uaddrs[BATCH_SIZE];
3485	uint32_t *upp = (uint32_t *)uap->obj;
3486	int tocopy;
3487	int error = 0;
3488	int i, pos = 0;
3489
3490	while (count > 0) {
3491		tocopy = count;
3492		if (tocopy > BATCH_SIZE)
3493			tocopy = BATCH_SIZE;
3494		error = copyin(upp+pos, uaddrs, tocopy * sizeof(uint32_t));
3495		if (error != 0)
3496			break;
3497		for (i = 0; i < tocopy; ++i)
3498			kern_umtx_wake(td, (void *)(intptr_t)uaddrs[i],
3499				INT_MAX, 1);
3500		count -= tocopy;
3501		pos += tocopy;
3502	}
3503	return (error);
3504}
3505
3506static _umtx_op_func op_table_compat32[] = {
3507	__umtx_op_lock_umtx_compat32,	/* UMTX_OP_LOCK */
3508	__umtx_op_unlock_umtx_compat32,	/* UMTX_OP_UNLOCK */
3509	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT */
3510	__umtx_op_wake,			/* UMTX_OP_WAKE */
3511	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
3512	__umtx_op_lock_umutex_compat32,	/* UMTX_OP_MUTEX_LOCK */
3513	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
3514	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
3515	__umtx_op_cv_wait_compat32,	/* UMTX_OP_CV_WAIT */
3516	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
3517	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
3518	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT_UINT */
3519	__umtx_op_rw_rdlock_compat32,	/* UMTX_OP_RW_RDLOCK */
3520	__umtx_op_rw_wrlock_compat32,	/* UMTX_OP_RW_WRLOCK */
3521	__umtx_op_rw_unlock,		/* UMTX_OP_RW_UNLOCK */
3522	__umtx_op_wait_uint_private_compat32,	/* UMTX_OP_WAIT_UINT_PRIVATE */
3523	__umtx_op_wake_private,		/* UMTX_OP_WAKE_PRIVATE */
3524	__umtx_op_wait_umutex_compat32, /* UMTX_OP_UMUTEX_WAIT */
3525	__umtx_op_wake_umutex,		/* UMTX_OP_UMUTEX_WAKE */
3526	__umtx_op_sem_wait_compat32,	/* UMTX_OP_SEM_WAIT */
3527	__umtx_op_sem_wake,		/* UMTX_OP_SEM_WAKE */
3528	__umtx_op_nwake_private32	/* UMTX_OP_NWAKE_PRIVATE */
3529};
3530
3531int
3532freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap)
3533{
3534	if ((unsigned)uap->op < UMTX_OP_MAX)
3535		return (*op_table_compat32[uap->op])(td,
3536			(struct _umtx_op_args *)uap);
3537	return (EINVAL);
3538}
3539#endif
3540
3541void
3542umtx_thread_init(struct thread *td)
3543{
3544	td->td_umtxq = umtxq_alloc();
3545	td->td_umtxq->uq_thread = td;
3546}
3547
3548void
3549umtx_thread_fini(struct thread *td)
3550{
3551	umtxq_free(td->td_umtxq);
3552}
3553
3554	/*
3555	 * Called when a new thread is created, e.g. by fork().
3556	 */
3557void
3558umtx_thread_alloc(struct thread *td)
3559{
3560	struct umtx_q *uq;
3561
3562	uq = td->td_umtxq;
3563	uq->uq_inherited_pri = PRI_MAX;
3564
3565	KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
3566	KASSERT(uq->uq_thread == td, ("uq_thread != td"));
3567	KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
3568	KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
3569}
3570
3571/*
3572 * exec() hook.
3573 */
3574static void
3575umtx_exec_hook(void *arg __unused, struct proc *p __unused,
3576	struct image_params *imgp __unused)
3577{
3578	umtx_thread_cleanup(curthread);
3579}
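
/*
 * This hook is expected to be registered during umtx initialization,
 * presumably along the lines of:
 *
 *	EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
 *	    EVENTHANDLER_PRI_ANY);
 *
 * so that the thread surviving exec() drops any umtx state it still
 * holds.
 */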
3580
3581/*
3582 * thread_exit() hook.
3583 */
3584void
3585umtx_thread_exit(struct thread *td)
3586{
3587	umtx_thread_cleanup(td);
3588}
3589
3590	/*
3591	 * Clean up umtx data.
3592	 */
3593static void
3594umtx_thread_cleanup(struct thread *td)
3595{
3596	struct umtx_q *uq;
3597	struct umtx_pi *pi;
3598
3599	if ((uq = td->td_umtxq) == NULL)
3600		return;
3601
3602	mtx_lock_spin(&umtx_lock);
3603	uq->uq_inherited_pri = PRI_MAX;
3604	while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
3605		pi->pi_owner = NULL;
3606		TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
3607	}
3608	mtx_unlock_spin(&umtx_lock);
3609	thread_lock(td);
3610	sched_lend_user_prio(td, PRI_MAX);
3611	thread_unlock(td);
3612}
3613