kern_umtx.c revision 177849
/*-
 * Copyright (c) 2004, David Xu <davidxu@freebsd.org>
 * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/kern_umtx.c 177849 2008-04-02 04:26:59Z davidxu $");

#include "opt_compat.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/eventhandler.h>
#include <sys/umtx.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>

#include <machine/cpu.h>

#ifdef COMPAT_IA32
#include <compat/freebsd32/freebsd32_proto.h>
#endif

#define TYPE_SIMPLE_LOCK	0
#define TYPE_SIMPLE_WAIT	1
#define TYPE_NORMAL_UMUTEX	2
#define TYPE_PI_UMUTEX		3
#define TYPE_PP_UMUTEX		4
#define TYPE_CV			5
#define TYPE_RWLOCK		6

/* Key to represent a unique userland synchronization object */
struct umtx_key {
	int	hash;
	int	type;
	int	shared;
	union {
		struct {
			vm_object_t	object;
			uintptr_t	offset;
		} shared;
		struct {
			struct vmspace	*vs;
			uintptr_t	addr;
		} private;
		struct {
			void		*a;
			uintptr_t	b;
		} both;
	} info;
};
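
/*
 * Key identity: for process-shared objects the key is the backing
 * (vm_object, offset) pair, so that different mappings of the same
 * page compare equal; for process-private objects it is the
 * (vmspace, virtual address) pair.  The "both" view aliases whichever
 * pair is in use, for hashing and comparison (see umtxq_hash() and
 * umtx_key_match()).
 */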

/* Priority inheritance mutex info. */
struct umtx_pi {
	/* Owner thread */
	struct thread		*pi_owner;

	/* Reference count */
	int			pi_refcount;

	/* List entry for the list of mutexes held by a thread */
	TAILQ_ENTRY(umtx_pi)	pi_link;

	/* List entry in hash */
	TAILQ_ENTRY(umtx_pi)	pi_hashlink;

	/* List for waiters */
	TAILQ_HEAD(,umtx_q)	pi_blocked;

	/* Identify a userland lock object */
	struct umtx_key		pi_key;
};

/* A userland synchronization object waiter. */
struct umtx_q {
	/* Linked list for the hash. */
	TAILQ_ENTRY(umtx_q)	uq_link;

	/* Umtx key. */
	struct umtx_key		uq_key;

	/* Umtx flags. */
	int			uq_flags;
#define UQF_UMTXQ	0x0001

	/* The waiting thread. */
	struct thread		*uq_thread;

	/*
	 * The PI mutex this thread is blocked on.  Reads may use either
	 * the chain lock or umtx_lock; writes must hold both the chain
	 * lock and umtx_lock.
	 */
	struct umtx_pi		*uq_pi_blocked;

	/* On blocked list */
	TAILQ_ENTRY(umtx_q)	uq_lockq;

	/* PI mutexes we own that other threads are contending for */
	TAILQ_HEAD(,umtx_pi)	uq_pi_contested;

	/* Inherited priority from PP mutex */
	u_char			uq_inherited_pri;
};

TAILQ_HEAD(umtxq_head, umtx_q);

/* Userland lock object's wait-queue chain */
struct umtxq_chain {
	/* Lock for this chain. */
	struct mtx		uc_lock;

	/* List of sleep queues. */
	struct umtxq_head	uc_queue[2];
#define UMTX_SHARED_QUEUE	0
#define UMTX_EXCLUSIVE_QUEUE	1

	/* Busy flag */
	char			uc_busy;

	/* Chain lock waiters */
	int			uc_waiters;

	/* All PI mutexes in this chain */
	TAILQ_HEAD(,umtx_pi)	uc_pi_list;
};

#define	UMTXQ_LOCKED_ASSERT(uc)		mtx_assert(&(uc)->uc_lock, MA_OWNED)

/*
 * Don't propagate time-sharing priority; there is a security reason.
 * A user could simply create a PI mutex, let thread A lock it, and
 * let another thread B block on it.  Because B is sleeping, its
 * priority would be boosted, and A's priority would be boosted in
 * turn through priority propagation; A's priority would then never
 * be lowered even if it used 100% of the CPU, which is unfair to
 * other processes.
 */

#define UPRI(td)	(((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\
			  (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\
			 PRI_MAX_TIMESHARE : (td)->td_user_pri)

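/*
 * Multiplicative (Fibonacci-style) hashing: a key is multiplied by a
 * prime near 2^32 divided by the golden ratio, and the high-order bits
 * of the product, which are the best mixed, are kept.  UMTX_SHIFTS
 * drops all but the top 7 bits of a word, matching
 * UMTX_CHAINS == 128 == 2^7; see umtxq_hash().
 */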
#define	GOLDEN_RATIO_PRIME	2654404609U
#define	UMTX_CHAINS		128
#define	UMTX_SHIFTS		(__WORD_BIT - 7)

#define THREAD_SHARE		0
#define PROCESS_SHARE		1
#define AUTO_SHARE		2

#define	GET_SHARE(flags)	\
    (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)

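/*
 * Number of times to spin, waiting for a busy chain to be released,
 * before going to sleep; see umtxq_busy().
 */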
#define BUSY_SPINS		200

static uma_zone_t		umtx_pi_zone;
static struct umtxq_chain	umtxq_chains[UMTX_CHAINS];
static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
static int			umtx_pi_allocated;

SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
    &umtx_pi_allocated, 0, "Allocated umtx_pi");

static void umtxq_sysinit(void *);
static void umtxq_hash(struct umtx_key *key);
static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
static void umtxq_lock(struct umtx_key *key);
static void umtxq_unlock(struct umtx_key *key);
static void umtxq_busy(struct umtx_key *key);
static void umtxq_unbusy(struct umtx_key *key);
static void umtxq_insert_queue(struct umtx_q *uq, int q);
static void umtxq_remove_queue(struct umtx_q *uq, int q);
static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo);
static int umtxq_count(struct umtx_key *key);
static int umtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2);
static int umtx_key_get(void *addr, int type, int share,
	struct umtx_key *key);
static void umtx_key_release(struct umtx_key *key);
static struct umtx_pi *umtx_pi_alloc(int);
static void umtx_pi_free(struct umtx_pi *pi);
static void umtx_pi_adjust_locked(struct thread *td, u_char oldpri);
static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags);
static void umtx_thread_cleanup(struct thread *td);
static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
	struct image_params *imgp __unused);
SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);

#define umtxq_signal(key, nwake)	umtxq_signal_queue((key), (nwake), UMTX_SHARED_QUEUE)
#define umtxq_insert(uq)	umtxq_insert_queue((uq), UMTX_SHARED_QUEUE)
#define umtxq_remove(uq)	umtxq_remove_queue((uq), UMTX_SHARED_QUEUE)

static struct mtx umtx_lock;

static void
umtxq_sysinit(void *arg __unused)
{
	int i;

	umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	for (i = 0; i < UMTX_CHAINS; ++i) {
		mtx_init(&umtxq_chains[i].uc_lock, "umtxql", NULL,
			 MTX_DEF | MTX_DUPOK);
		TAILQ_INIT(&umtxq_chains[i].uc_queue[0]);
		TAILQ_INIT(&umtxq_chains[i].uc_queue[1]);
		TAILQ_INIT(&umtxq_chains[i].uc_pi_list);
		umtxq_chains[i].uc_busy = 0;
		umtxq_chains[i].uc_waiters = 0;
	}
	mtx_init(&umtx_lock, "umtx lock", NULL, MTX_SPIN);
	EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
	    EVENTHANDLER_PRI_ANY);
}

struct umtx_q *
umtxq_alloc(void)
{
	struct umtx_q *uq;

	uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
	TAILQ_INIT(&uq->uq_pi_contested);
	uq->uq_inherited_pri = PRI_MAX;
	return (uq);
}

void
umtxq_free(struct umtx_q *uq)
{
	free(uq, M_UMTX);
}

static inline void
umtxq_hash(struct umtx_key *key)
{
	unsigned n = (uintptr_t)key->info.both.a + key->info.both.b;
	key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
}

static inline int
umtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2)
{
	return (k1->type == k2->type &&
		k1->info.both.a == k2->info.both.a &&
		k1->info.both.b == k2->info.both.b);
}

static inline struct umtxq_chain *
umtxq_getchain(struct umtx_key *key)
{
	return (&umtxq_chains[key->hash]);
}

/*
 * Lock a chain.
 */
static inline void
umtxq_lock(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_lock(&uc->uc_lock);
}

/*
 * Unlock a chain.
 */
static inline void
umtxq_unlock(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_unlock(&uc->uc_lock);
}

/*
 * Set the chain to the busy state when subsequent operations may
 * block (a kernel mutex cannot be held across them).
 */
static inline void
umtxq_busy(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_assert(&uc->uc_lock, MA_OWNED);
	if (uc->uc_busy) {
		int count = BUSY_SPINS;
		if (count > 0) {
			umtxq_unlock(key);
			while (uc->uc_busy && --count > 0)
				cpu_spinwait();
			umtxq_lock(key);
		}
	}
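		/*
		 * Spinning did not help (or another thread re-took the
		 * chain); sleep until the current holder calls
		 * umtxq_unbusy() and wakes us.
		 */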
		while (uc->uc_busy != 0) {
			uc->uc_waiters++;
			msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
			uc->uc_waiters--;
		}
	}
	uc->uc_busy = 1;
}

/*
 * Unbusy a chain.
 */
static inline void
umtxq_unbusy(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_assert(&uc->uc_lock, MA_OWNED);
	KASSERT(uc->uc_busy != 0, ("not busy"));
	uc->uc_busy = 0;
	if (uc->uc_waiters)
		wakeup_one(uc);
}

static inline void
umtxq_insert_queue(struct umtx_q *uq, int q)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_INSERT_TAIL(&uc->uc_queue[q], uq, uq_link);
	uq->uq_flags |= UQF_UMTXQ;
}

static inline void
umtxq_remove_queue(struct umtx_q *uq, int q)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	if (uq->uq_flags & UQF_UMTXQ) {
		TAILQ_REMOVE(&uc->uc_queue[q], uq, uq_link);
		uq->uq_flags &= ~UQF_UMTXQ;
	}
}

/*
 * Check if there are multiple waiters
 */
static int
umtxq_count(struct umtx_key *key)
{
	struct umtxq_chain *uc;
	struct umtx_q *uq;
	int count = 0;

	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_FOREACH(uq, &uc->uc_queue[UMTX_SHARED_QUEUE], uq_link) {
		if (umtx_key_match(&uq->uq_key, key)) {
			if (++count > 1)
				break;
		}
	}
	return (count);
}

/*
 * Check if there are multiple PI waiters and return the first
 * waiter.
 */
static int
umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
{
	struct umtxq_chain *uc;
	struct umtx_q *uq;
	int count = 0;

	*first = NULL;
	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_FOREACH(uq, &uc->uc_queue[UMTX_SHARED_QUEUE], uq_link) {
		if (umtx_key_match(&uq->uq_key, key)) {
			if (++count > 1)
				break;
			*first = uq;
		}
	}
	return (count);
}

/*
 * Wake up threads waiting on a userland object.
 */
static int
umtxq_signal_queue(struct umtx_key *key, int n_wake, int q)
{
	struct umtxq_chain *uc;
	struct umtx_q *uq, *next;
	int ret;

	ret = 0;
	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_FOREACH_SAFE(uq, &uc->uc_queue[q], uq_link, next) {
		if (umtx_key_match(&uq->uq_key, key)) {
			umtxq_remove_queue(uq, q);
			wakeup(uq);
			if (++ret >= n_wake)
				break;
		}
	}
	return (ret);
}

/*
 * Wake up the specified thread.
 */
static inline void
umtxq_signal_thread(struct umtx_q *uq)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	umtxq_remove(uq);
	wakeup(uq);
}

/*
 * Put the thread into a sleep state; before sleeping, check if
 * the thread was already removed from the umtx queue.
 */
static inline int
umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo)
{
	struct umtxq_chain *uc;
	int error;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	if (!(uq->uq_flags & UQF_UMTXQ))
		return (0);
	error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
	if (error == EWOULDBLOCK)
		error = ETIMEDOUT;
	return (error);
}

/*
 * Convert a userspace address into a unique logical address.
 */
static int
umtx_key_get(void *addr, int type, int share, struct umtx_key *key)
{
	struct thread *td = curthread;
	vm_map_t map;
	vm_map_entry_t entry;
	vm_pindex_t pindex;
	vm_prot_t prot;
	boolean_t wired;

	key->type = type;
	if (share == THREAD_SHARE) {
		key->shared = 0;
		key->info.private.vs = td->td_proc->p_vmspace;
		key->info.private.addr = (uintptr_t)addr;
	} else {
		MPASS(share == PROCESS_SHARE || share == AUTO_SHARE);
		map = &td->td_proc->p_vmspace->vm_map;
		if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
		    &entry, &key->info.shared.object, &pindex, &prot,
		    &wired) != KERN_SUCCESS) {
			return EFAULT;
		}

		if ((share == PROCESS_SHARE) ||
		    (share == AUTO_SHARE &&
		     VM_INHERIT_SHARE == entry->inheritance)) {
			key->shared = 1;
			key->info.shared.offset = entry->offset + entry->start -
				(vm_offset_t)addr;
			vm_object_reference(key->info.shared.object);
		} else {
			key->shared = 0;
			key->info.private.vs = td->td_proc->p_vmspace;
			key->info.private.addr = (uintptr_t)addr;
		}
		vm_map_lookup_done(map, entry);
	}

	umtxq_hash(key);
	return (0);
}

/*
 * Release a key.
 */
static inline void
umtx_key_release(struct umtx_key *key)
{
	if (key->shared)
		vm_object_deallocate(key->info.shared.object);
}

/*
 * Lock a umtx object.
 */
static int
_do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id, int timo)
{
	struct umtx_q *uq;
	u_long owner;
	u_long old;
	int error = 0;

	uq = td->td_umtxq;

	/*
	 * Care must be exercised when dealing with the umtx structure:
	 * any access to it can fault.
	 */
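	/*
	 * Lock protocol, as implemented by the CAS loop below: u_owner
	 * is UMTX_UNOWNED when free, the owner's id when held, and has
	 * UMTX_CONTESTED or'ed in when some thread is (or may be)
	 * sleeping on it, which tells the owner to unlock through the
	 * kernel.
	 */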
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword(&umtx->u_owner, UMTX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMTX_UNOWNED)
			return (0);

		/* The address was invalid. */
		if (owner == -1)
			return (EFAULT);

		/* If no one owns it but it is contested try to acquire it. */
		if (owner == UMTX_CONTESTED) {
			owner = casuword(&umtx->u_owner,
			    UMTX_CONTESTED, id | UMTX_CONTESTED);

			if (owner == UMTX_CONTESTED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			/* If this failed the lock has changed, restart. */
			continue;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK,
			AUTO_SHARE, &uq->uq_key)) != 0)
			return (error);

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword(&umtx->u_owner, owner, owner | UMTX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race to the
		 * thread unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtx", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);
	}

	return (0);
}

/*
 * Lock a umtx object.
 */
static int
do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id,
	struct timespec *timeout)
{
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	int error;

	if (timeout == NULL) {
		error = _do_lock_umtx(td, umtx, id, 0);
		/* Mutex locking is restarted if it is interrupted. */
		if (error == EINTR)
			error = ERESTART;
	} else {
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
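		/*
		 * ts now holds the absolute deadline on the uptime
		 * clock; each premature ETIMEDOUT below recomputes the
		 * remaining interval into tv and retries.
		 */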
		for (;;) {
			error = _do_lock_umtx(td, umtx, id, tvtohz(&tv));
			if (error != ETIMEDOUT)
				break;
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				break;
			}
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
		}
		/* Timed-locking is not restarted. */
		if (error == ERESTART)
			error = EINTR;
	}
	return (error);
}

/*
 * Unlock a umtx object.
 */
static int
do_unlock_umtx(struct thread *td, struct umtx *umtx, u_long id)
{
	struct umtx_key key;
	u_long owner;
	u_long old;
	int error;
	int count;

	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword(__DEVOLATILE(u_long *, &umtx->u_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMTX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMTX_CONTESTED) == 0) {
		old = casuword(&umtx->u_owner, owner, UMTX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK, AUTO_SHARE,
		&key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * only zero or one thread is waiting for it.  Otherwise, it
	 * must be marked as contested.
	 */
	old = casuword(&umtx->u_owner, owner,
		count <= 1 ? UMTX_UNOWNED : UMTX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}

#ifdef COMPAT_IA32

/*
 * Lock a umtx object.
 */
static int
_do_lock_umtx32(struct thread *td, uint32_t *m, uint32_t id, int timo)
{
	struct umtx_q *uq;
	uint32_t owner;
	uint32_t old;
	int error = 0;

	uq = td->td_umtxq;

	/*
	 * Care must be exercised when dealing with the umtx structure:
	 * any access to it can fault.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword32(m, UMUTEX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMUTEX_UNOWNED)
			return (0);

		/* The address was invalid. */
		if (owner == -1)
			return (EFAULT);

		/* If no one owns it but it is contested try to acquire it. */
		if (owner == UMUTEX_CONTESTED) {
			owner = casuword32(m,
			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
			if (owner == UMUTEX_CONTESTED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			/* If this failed the lock has changed, restart. */
			continue;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK,
			AUTO_SHARE, &uq->uq_key)) != 0)
			return (error);

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword32(m, owner, owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race to the
		 * thread unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtx", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);
	}

	return (0);
}

/*
 * Lock a umtx object.
 */
static int
do_lock_umtx32(struct thread *td, void *m, uint32_t id,
	struct timespec *timeout)
{
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	int error;

	if (timeout == NULL) {
		error = _do_lock_umtx32(td, m, id, 0);
		/* Mutex locking is restarted if it is interrupted. */
		if (error == EINTR)
			error = ERESTART;
	} else {
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
		for (;;) {
			error = _do_lock_umtx32(td, m, id, tvtohz(&tv));
			if (error != ETIMEDOUT)
				break;
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				break;
			}
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
		}
		/* Timed-locking is not restarted. */
		if (error == ERESTART)
			error = EINTR;
	}
	return (error);
}

/*
 * Unlock a umtx object.
 */
static int
do_unlock_umtx32(struct thread *td, uint32_t *m, uint32_t id)
{
	struct umtx_key key;
	uint32_t owner;
	uint32_t old;
	int error;
	int count;

	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(m);
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMUTEX_CONTESTED) == 0) {
		old = casuword32(m, owner, UMUTEX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK, AUTO_SHARE,
		&key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * only zero or one thread is waiting for it.  Otherwise, it
	 * must be marked as contested.
	 */
	old = casuword32(m, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}
#endif

/*
 * Fetch and compare a value; sleep on the address if the value
 * has not changed.
 */
static int
do_wait(struct thread *td, void *addr, u_long id,
	struct timespec *timeout, int compat32)
{
	struct umtx_q *uq;
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	u_long tmp;
	int error = 0;

	uq = td->td_umtxq;
	if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT, AUTO_SHARE,
	    &uq->uq_key)) != 0)
		return (error);

	umtxq_lock(&uq->uq_key);
	umtxq_insert(uq);
	umtxq_unlock(&uq->uq_key);
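	/*
	 * Queue ourselves before reading the user value: a waker must
	 * take the chain lock to signal us, so a wakeup racing with
	 * this read cannot be lost; it is observed either as
	 * tmp != id or as an early return from umtxq_sleep().
	 */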
	if (compat32 == 0)
		tmp = fuword(addr);
	else
		tmp = fuword32(addr);
	if (tmp != id) {
		umtxq_lock(&uq->uq_key);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	} else if (timeout == NULL) {
		umtxq_lock(&uq->uq_key);
		error = umtxq_sleep(uq, "uwait", 0);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	} else {
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
		umtxq_lock(&uq->uq_key);
		for (;;) {
			error = umtxq_sleep(uq, "uwait", tvtohz(&tv));
			if (!(uq->uq_flags & UQF_UMTXQ))
				break;
			if (error != ETIMEDOUT)
				break;
			umtxq_unlock(&uq->uq_key);
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				umtxq_lock(&uq->uq_key);
				break;
			}
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
			umtxq_lock(&uq->uq_key);
		}
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	}
	umtx_key_release(&uq->uq_key);
	if (error == ERESTART)
		error = EINTR;
	return (error);
}

/*
 * Wake up threads sleeping on the specified address.
 */
int
kern_umtx_wake(struct thread *td, void *uaddr, int n_wake)
{
	struct umtx_key key;
	int ret;

	if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT, AUTO_SHARE,
	   &key)) != 0)
		return (ret);
	umtxq_lock(&key);
	ret = umtxq_signal(&key, n_wake);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	return (0);
}

/*
 * Lock a PTHREAD_PRIO_NONE protocol POSIX mutex.
 */
static int
_do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags, int timo,
	int try)
{
	struct umtx_q *uq;
	uint32_t owner, old, id;
	int error = 0;

	id = td->td_tid;
	uq = td->td_umtxq;

	/*
	 * Care must be exercised when dealing with the umtx structure:
	 * any access to it can fault.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMUTEX_UNOWNED)
			return (0);

		/* The address was invalid. */
		if (owner == -1)
			return (EFAULT);

		/* If no one owns it but it is contested try to acquire it. */
		if (owner == UMUTEX_CONTESTED) {
			owner = casuword32(&m->m_owner,
			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

			if (owner == UMUTEX_CONTESTED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			/* If this failed the lock has changed, restart. */
			continue;
		}

		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
		    (owner & ~UMUTEX_CONTESTED) == id)
			return (EDEADLK);

		if (try != 0)
			return (EBUSY);

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
		    GET_SHARE(flags), &uq->uq_key)) != 0)
			return (error);

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race to the
		 * thread unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtxn", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);
	}

	return (0);
}

/*
 * Unlock a PTHREAD_PRIO_NONE protocol POSIX mutex.
 */
static int
do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags)
{
	struct umtx_key key;
	uint32_t owner, old, id;
	int error;
	int count;

	id = td->td_tid;
	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMUTEX_CONTESTED) == 0) {
		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * only zero or one thread is waiting for it.  Otherwise, it
	 * must be marked as contested.
	 */
	old = casuword32(&m->m_owner, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}

static inline struct umtx_pi *
umtx_pi_alloc(int flags)
{
	struct umtx_pi *pi;

	pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags);
	TAILQ_INIT(&pi->pi_blocked);
	atomic_add_int(&umtx_pi_allocated, 1);
	return (pi);
}

static inline void
umtx_pi_free(struct umtx_pi *pi)
{
	uma_zfree(umtx_pi_zone, pi);
	atomic_add_int(&umtx_pi_allocated, -1);
}

/*
 * Adjust the thread's position on a pi_state after its priority has been
 * changed.
 */
static int
umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
{
	struct umtx_q *uq, *uq1, *uq2;
	struct thread *td1;

	mtx_assert(&umtx_lock, MA_OWNED);
	if (pi == NULL)
		return (0);

	uq = td->td_umtxq;

	/*
	 * Check if the thread needs to be moved on the blocked chain.
	 * It needs to be moved if either its priority is lower than
	 * the previous thread's or higher than the next thread's.
	 */
	uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
	uq2 = TAILQ_NEXT(uq, uq_lockq);
	if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
	    (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
		/*
		 * Remove thread from blocked chain and determine where
		 * it should be moved to.
		 */
		TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
		TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
			td1 = uq1->uq_thread;
			MPASS(td1->td_proc->p_magic == P_MAGIC);
			if (UPRI(td1) > UPRI(td))
				break;
		}

		if (uq1 == NULL)
			TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
		else
			TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
	}
	return (1);
}

/*
 * Propagate priority when a thread is blocked on a POSIX
 * PI mutex.
 */
static void
umtx_propagate_priority(struct thread *td)
{
	struct umtx_q *uq;
	struct umtx_pi *pi;
	int pri;

	mtx_assert(&umtx_lock, MA_OWNED);
	pri = UPRI(td);
	uq = td->td_umtxq;
	pi = uq->uq_pi_blocked;
	if (pi == NULL)
		return;

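	/*
	 * Walk the chain of blocked-on locks: lend our priority to the
	 * owner of each mutex in turn, then follow that owner to the
	 * mutex it is itself blocked on, until an owner is not
	 * sleeping or no longer needs a boost.
	 */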
	for (;;) {
		td = pi->pi_owner;
		if (td == NULL)
			return;

		MPASS(td->td_proc != NULL);
		MPASS(td->td_proc->p_magic == P_MAGIC);

		if (UPRI(td) <= pri)
			return;

		thread_lock(td);
		sched_lend_user_prio(td, pri);
		thread_unlock(td);

		/*
		 * Pick up the lock that td is blocked on.
		 */
		uq = td->td_umtxq;
		pi = uq->uq_pi_blocked;
		/* Resort td on the list if needed. */
		if (!umtx_pi_adjust_thread(pi, td))
			break;
	}
}

/*
 * Unpropagate priority for a PI mutex when a thread blocked on
 * it is interrupted by a signal or resumed by another thread.
 */
static void
umtx_unpropagate_priority(struct umtx_pi *pi)
{
	struct umtx_q *uq, *uq_owner;
	struct umtx_pi *pi2;
	int pri, oldpri;

	mtx_assert(&umtx_lock, MA_OWNED);

	while (pi != NULL && pi->pi_owner != NULL) {
		pri = PRI_MAX;
		uq_owner = pi->pi_owner->td_umtxq;

		TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
			uq = TAILQ_FIRST(&pi2->pi_blocked);
			if (uq != NULL) {
				if (pri > UPRI(uq->uq_thread))
					pri = UPRI(uq->uq_thread);
			}
		}

		if (pri > uq_owner->uq_inherited_pri)
			pri = uq_owner->uq_inherited_pri;
		thread_lock(pi->pi_owner);
		oldpri = pi->pi_owner->td_user_pri;
		sched_unlend_user_prio(pi->pi_owner, pri);
		thread_unlock(pi->pi_owner);
		umtx_pi_adjust_locked(pi->pi_owner, oldpri);
		pi = uq_owner->uq_pi_blocked;
	}
}

/*
 * Insert a PI mutex into the owned list.
 */
static void
umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
{
	struct umtx_q *uq_owner;

	uq_owner = owner->td_umtxq;
	mtx_assert(&umtx_lock, MA_OWNED);
	if (pi->pi_owner != NULL)
		panic("pi_owner != NULL");
	pi->pi_owner = owner;
	TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
}

/*
 * Claim ownership of a PI mutex.
 */
static int
umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
{
	struct umtx_q *uq, *uq_owner;

	uq_owner = owner->td_umtxq;
	mtx_lock_spin(&umtx_lock);
	if (pi->pi_owner == owner) {
		mtx_unlock_spin(&umtx_lock);
		return (0);
	}

	if (pi->pi_owner != NULL) {
		/*
		 * Userland may have already messed with the mutex, sigh.
		 */
		mtx_unlock_spin(&umtx_lock);
		return (EPERM);
	}
	umtx_pi_setowner(pi, owner);
	uq = TAILQ_FIRST(&pi->pi_blocked);
	if (uq != NULL) {
		int pri;

		pri = UPRI(uq->uq_thread);
		thread_lock(owner);
		if (pri < UPRI(owner))
			sched_lend_user_prio(owner, pri);
		thread_unlock(owner);
	}
	mtx_unlock_spin(&umtx_lock);
	return (0);
}

static void
umtx_pi_adjust_locked(struct thread *td, u_char oldpri)
{
	struct umtx_q *uq;
	struct umtx_pi *pi;

	uq = td->td_umtxq;
	/*
	 * Pick up the lock that td is blocked on.
	 */
	pi = uq->uq_pi_blocked;
	MPASS(pi != NULL);

	/* Resort the turnstile on the list. */
	if (!umtx_pi_adjust_thread(pi, td))
		return;

	/*
	 * If our priority was lowered and we are at the head of the
	 * turnstile, then propagate our new priority up the chain.
	 */
	if (uq == TAILQ_FIRST(&pi->pi_blocked) && UPRI(td) < oldpri)
		umtx_propagate_priority(td);
}

/*
 * Adjust a thread's position in the queue of the PI mutex it is
 * blocked on; this may trigger a new round of priority propagation.
 */
void
umtx_pi_adjust(struct thread *td, u_char oldpri)
{
	struct umtx_q *uq;
	struct umtx_pi *pi;

	uq = td->td_umtxq;
	mtx_lock_spin(&umtx_lock);
	/*
	 * Pick up the lock that td is blocked on.
	 */
	pi = uq->uq_pi_blocked;
	if (pi != NULL)
		umtx_pi_adjust_locked(td, oldpri);
	mtx_unlock_spin(&umtx_lock);
}

/*
 * Sleep on a PI mutex.
 */
static int
umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi,
	uint32_t owner, const char *wmesg, int timo)
{
	struct umtxq_chain *uc;
	struct thread *td, *td1;
	struct umtx_q *uq1;
	int pri;
	int error = 0;

	td = uq->uq_thread;
	KASSERT(td == curthread, ("inconsistent uq_thread"));
	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	umtxq_insert(uq);
	if (pi->pi_owner == NULL) {
		/* XXX
		 * Currently, we only support process-private PI mutexes;
		 * non-contended PI mutexes are locked in userland.
		 * Process-shared PI mutexes should always be initialized
		 * by the kernel and registered with it, and locking should
		 * always be done by the kernel to avoid security problems.
		 * For a process-private PI mutex, we can find the owner
		 * thread and boost its priority safely.
		 */
		PROC_LOCK(curproc);
		td1 = thread_find(curproc, owner);
		mtx_lock_spin(&umtx_lock);
		if (td1 != NULL && pi->pi_owner == NULL) {
			uq1 = td1->td_umtxq;
			umtx_pi_setowner(pi, td1);
		}
		PROC_UNLOCK(curproc);
	} else {
		mtx_lock_spin(&umtx_lock);
	}

	TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
		pri = UPRI(uq1->uq_thread);
		if (pri > UPRI(td))
			break;
	}

	if (uq1 != NULL)
		TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
	else
		TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);

	uq->uq_pi_blocked = pi;
	thread_lock(td);
	td->td_flags |= TDF_UPIBLOCKED;
	thread_unlock(td);
	mtx_unlock_spin(&umtx_lock);
	umtxq_unlock(&uq->uq_key);

	mtx_lock_spin(&umtx_lock);
	umtx_propagate_priority(td);
	mtx_unlock_spin(&umtx_lock);

	umtxq_lock(&uq->uq_key);
	if (uq->uq_flags & UQF_UMTXQ) {
		error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
		if (error == EWOULDBLOCK)
			error = ETIMEDOUT;
		if (uq->uq_flags & UQF_UMTXQ) {
			umtxq_busy(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unbusy(&uq->uq_key);
		}
	}
	umtxq_unlock(&uq->uq_key);

	mtx_lock_spin(&umtx_lock);
	uq->uq_pi_blocked = NULL;
	thread_lock(td);
	td->td_flags &= ~TDF_UPIBLOCKED;
	thread_unlock(td);
	TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
	umtx_unpropagate_priority(pi);
	mtx_unlock_spin(&umtx_lock);

	umtxq_lock(&uq->uq_key);

	return (error);
}

/*
 * Increment the reference count of a PI mutex.
 */
static void
umtx_pi_ref(struct umtx_pi *pi)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&pi->pi_key);
	UMTXQ_LOCKED_ASSERT(uc);
	pi->pi_refcount++;
}

/*
 * Decrement the reference count of a PI mutex; when the counter
 * drops to zero, its memory is freed.
 */
static void
umtx_pi_unref(struct umtx_pi *pi)
{
	struct umtxq_chain *uc;
	int free = 0;

	uc = umtxq_getchain(&pi->pi_key);
	UMTXQ_LOCKED_ASSERT(uc);
	KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
	if (--pi->pi_refcount == 0) {
		mtx_lock_spin(&umtx_lock);
		if (pi->pi_owner != NULL) {
			TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested,
				pi, pi_link);
			pi->pi_owner = NULL;
		}
		KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
			("blocked queue not empty"));
		mtx_unlock_spin(&umtx_lock);
		TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
		free = 1;
	}
	if (free)
		umtx_pi_free(pi);
}

/*
 * Find a PI mutex in hash table.
 */
static struct umtx_pi *
umtx_pi_lookup(struct umtx_key *key)
{
	struct umtxq_chain *uc;
	struct umtx_pi *pi;

	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);

	TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
		if (umtx_key_match(&pi->pi_key, key)) {
			return (pi);
		}
	}
	return (NULL);
}

/*
 * Insert a PI mutex into hash table.
 */
static inline void
umtx_pi_insert(struct umtx_pi *pi)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&pi->pi_key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
}

/*
 * Lock a PI mutex.
 */
static int
_do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags, int timo,
	int try)
{
	struct umtx_q *uq;
	struct umtx_pi *pi, *new_pi;
	uint32_t id, owner, old;
	int error;

	id = td->td_tid;
	uq = td->td_umtxq;

	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
	    &uq->uq_key)) != 0)
		return (error);
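
	/*
	 * Find the umtx_pi for this key, or allocate one.  The first
	 * allocation attempt is M_NOWAIT under the chain lock; if it
	 * fails, the lock is dropped for an M_WAITOK allocation and
	 * the lookup is redone, since another thread may have inserted
	 * a umtx_pi for the key while the lock was dropped.
	 */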
	umtxq_lock(&uq->uq_key);
	pi = umtx_pi_lookup(&uq->uq_key);
	if (pi == NULL) {
		new_pi = umtx_pi_alloc(M_NOWAIT);
		if (new_pi == NULL) {
			umtxq_unlock(&uq->uq_key);
			new_pi = umtx_pi_alloc(M_WAITOK);
			new_pi->pi_key = uq->uq_key;
			umtxq_lock(&uq->uq_key);
			pi = umtx_pi_lookup(&uq->uq_key);
			if (pi != NULL) {
				umtx_pi_free(new_pi);
				new_pi = NULL;
			}
		}
		if (new_pi != NULL) {
			new_pi->pi_key = uq->uq_key;
			umtx_pi_insert(new_pi);
			pi = new_pi;
		}
	}
	umtx_pi_ref(pi);
	umtxq_unlock(&uq->uq_key);

	/*
	 * Care must be exercised when dealing with the umtx structure:
	 * any access to it can fault.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMUTEX_UNOWNED) {
			error = 0;
			break;
		}

		/* The address was invalid. */
		if (owner == -1) {
			error = EFAULT;
			break;
		}

		/* If no one owns it but it is contested try to acquire it. */
		if (owner == UMUTEX_CONTESTED) {
			owner = casuword32(&m->m_owner,
			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

			if (owner == UMUTEX_CONTESTED) {
				umtxq_lock(&uq->uq_key);
				error = umtx_pi_claim(pi, td);
				umtxq_unlock(&uq->uq_key);
				break;
			}

			/* The address was invalid. */
			if (owner == -1) {
				error = EFAULT;
				break;
			}

			/* If this failed the lock has changed, restart. */
			continue;
		}

		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
		    (owner & ~UMUTEX_CONTESTED) == id) {
			error = EDEADLK;
			break;
		}

		if (try != 0) {
			error = EBUSY;
			break;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			break;

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_unbusy(&uq->uq_key);
			umtxq_unlock(&uq->uq_key);
			error = EFAULT;
			break;
		}

		umtxq_lock(&uq->uq_key);
		umtxq_unbusy(&uq->uq_key);
		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race to the
		 * thread unlocking the umtx.
		 */
		if (old == owner)
			error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED,
				 "umtxpi", timo);
		umtxq_unlock(&uq->uq_key);
	}

	umtxq_lock(&uq->uq_key);
	umtx_pi_unref(pi);
	umtxq_unlock(&uq->uq_key);

	umtx_key_release(&uq->uq_key);
	return (error);
}

/*
 * Unlock a PI mutex.
 */
static int
do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags)
{
	struct umtx_key key;
	struct umtx_q *uq_first, *uq_first2, *uq_me;
	struct umtx_pi *pi, *pi2;
	uint32_t owner, old, id;
	int error;
	int count;
	int pri;

	id = td->td_tid;
	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMUTEX_CONTESTED) == 0) {
		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count_pi(&key, &uq_first);
	if (uq_first != NULL) {
		pi = uq_first->uq_pi_blocked;
		if (pi->pi_owner != curthread) {
			umtxq_unbusy(&key);
			umtxq_unlock(&key);
			/* Userland messed with the mutex. */
			return (EPERM);
		}
		uq_me = curthread->td_umtxq;
		mtx_lock_spin(&umtx_lock);
		pi->pi_owner = NULL;
		TAILQ_REMOVE(&uq_me->uq_pi_contested, pi, pi_link);
		uq_first = TAILQ_FIRST(&pi->pi_blocked);
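		/*
		 * Recompute our lent priority from the highest-priority
		 * waiter at the head of each PI mutex we still hold,
		 * now that this mutex no longer contributes a boost.
		 */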
		pri = PRI_MAX;
		TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
			uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
			if (uq_first2 != NULL) {
				if (pri > UPRI(uq_first2->uq_thread))
					pri = UPRI(uq_first2->uq_thread);
			}
		}
		thread_lock(curthread);
		sched_unlend_user_prio(curthread, pri);
		thread_unlock(curthread);
		mtx_unlock_spin(&umtx_lock);
	}
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * only zero or one thread is waiting for it.  Otherwise, it
	 * must be marked as contested.
	 */
	old = casuword32(&m->m_owner, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);

	umtxq_lock(&key);
	if (uq_first != NULL)
		umtxq_signal_thread(uq_first);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}

/*
 * Lock a PP mutex.
 */
static int
_do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags, int timo,
	int try)
{
	struct umtx_q *uq, *uq2;
	struct umtx_pi *pi;
	uint32_t ceiling;
	uint32_t owner, id;
	int error, pri, old_inherited_pri, su;

	id = td->td_tid;
	uq = td->td_umtxq;
	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
	    &uq->uq_key)) != 0)
		return (error);
	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
	for (;;) {
		old_inherited_pri = uq->uq_inherited_pri;
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

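		/*
		 * Map the user-supplied rtprio-style ceiling into kernel
		 * priority units: kernel priorities are numerically
		 * inverted (a lower value means a higher priority),
		 * hence RTP_PRIO_MAX - ceiling.  The range check also
		 * catches a fuword32() fault, which returns -1.
		 */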
		ceiling = RTP_PRIO_MAX - fuword32(&m->m_ceilings[0]);
		if (ceiling > RTP_PRIO_MAX) {
			error = EINVAL;
			goto out;
		}

		mtx_lock_spin(&umtx_lock);
		if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
			mtx_unlock_spin(&umtx_lock);
			error = EINVAL;
			goto out;
		}
		if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
			uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
			thread_lock(td);
			if (uq->uq_inherited_pri < UPRI(td))
				sched_lend_user_prio(td, uq->uq_inherited_pri);
			thread_unlock(td);
		}
		mtx_unlock_spin(&umtx_lock);

		owner = casuword32(&m->m_owner,
		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

		if (owner == UMUTEX_CONTESTED) {
			error = 0;
			break;
		}

		/* The address was invalid. */
		if (owner == -1) {
			error = EFAULT;
			break;
		}

		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
		    (owner & ~UMUTEX_CONTESTED) == id) {
			error = EDEADLK;
			break;
		}

		if (try != 0) {
			error = EBUSY;
			break;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			break;

		umtxq_lock(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		error = umtxq_sleep(uq, "umtxpp", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);

		mtx_lock_spin(&umtx_lock);
		uq->uq_inherited_pri = old_inherited_pri;
		pri = PRI_MAX;
		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
			uq2 = TAILQ_FIRST(&pi->pi_blocked);
			if (uq2 != NULL) {
				if (pri > UPRI(uq2->uq_thread))
					pri = UPRI(uq2->uq_thread);
			}
		}
		if (pri > uq->uq_inherited_pri)
			pri = uq->uq_inherited_pri;
		thread_lock(td);
		sched_unlend_user_prio(td, pri);
		thread_unlock(td);
		mtx_unlock_spin(&umtx_lock);
	}

	if (error != 0) {
		mtx_lock_spin(&umtx_lock);
		uq->uq_inherited_pri = old_inherited_pri;
		pri = PRI_MAX;
		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
			uq2 = TAILQ_FIRST(&pi->pi_blocked);
			if (uq2 != NULL) {
				if (pri > UPRI(uq2->uq_thread))
					pri = UPRI(uq2->uq_thread);
			}
		}
		if (pri > uq->uq_inherited_pri)
			pri = uq->uq_inherited_pri;
		thread_lock(td);
		sched_unlend_user_prio(td, pri);
		thread_unlock(td);
		mtx_unlock_spin(&umtx_lock);
	}

out:
	umtxq_lock(&uq->uq_key);
	umtxq_unbusy(&uq->uq_key);
	umtxq_unlock(&uq->uq_key);
	umtx_key_release(&uq->uq_key);
	return (error);
}

/*
 * Unlock a PP mutex.
 */
static int
do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags)
{
	struct umtx_key key;
	struct umtx_q *uq, *uq2;
	struct umtx_pi *pi;
	uint32_t owner, id;
	uint32_t rceiling;
	int error, pri, new_inherited_pri, su;

	id = td->td_tid;
	uq = td->td_umtxq;
	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);

	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
	if (error != 0)
		return (error);

	if (rceiling == -1)
		new_inherited_pri = PRI_MAX;
	else {
		rceiling = RTP_PRIO_MAX - rceiling;
		if (rceiling > RTP_PRIO_MAX)
			return (EINVAL);
		new_inherited_pri = PRI_MIN_REALTIME + rceiling;
	}

	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);
	umtxq_lock(&key);
	umtxq_busy(&key);
	umtxq_unlock(&key);
	/*
	 * For a priority-protected mutex, the unlocked state is always
	 * set to UMUTEX_CONTESTED so that userland always enters the
	 * kernel to lock it.  This is necessary because the thread's
	 * priority has to be adjusted for such a mutex.
	 */
	error = suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
		UMUTEX_CONTESTED);

	umtxq_lock(&key);
	if (error == 0)
		umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);

	if (error == -1)
		error = EFAULT;
	else {
		mtx_lock_spin(&umtx_lock);
		if (su != 0)
			uq->uq_inherited_pri = new_inherited_pri;
		pri = PRI_MAX;
		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
			uq2 = TAILQ_FIRST(&pi->pi_blocked);
			if (uq2 != NULL) {
				if (pri > UPRI(uq2->uq_thread))
					pri = UPRI(uq2->uq_thread);
			}
		}
		if (pri > uq->uq_inherited_pri)
			pri = uq->uq_inherited_pri;
		thread_lock(td);
		sched_unlend_user_prio(td, pri);
		thread_unlock(td);
		mtx_unlock_spin(&umtx_lock);
	}
	umtx_key_release(&key);
	return (error);
}

static int
do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
	uint32_t *old_ceiling)
{
	struct umtx_q *uq;
	uint32_t save_ceiling;
	uint32_t owner, id;
	uint32_t flags;
	int error;

	flags = fuword32(&m->m_flags);
	if ((flags & UMUTEX_PRIO_PROTECT) == 0)
		return (EINVAL);
	if (ceiling > RTP_PRIO_MAX)
		return (EINVAL);
	id = td->td_tid;
	uq = td->td_umtxq;
	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
	   &uq->uq_key)) != 0)
		return (error);
	for (;;) {
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		save_ceiling = fuword32(&m->m_ceilings[0]);

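		/*
		 * Try to take the mutex ourselves (for a PP mutex the
		 * unlocked state is m_owner == UMUTEX_CONTESTED) so that
		 * the ceiling can be updated without racing a lock
		 * holder.
		 */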
		owner = casuword32(&m->m_owner,
		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

		if (owner == UMUTEX_CONTESTED) {
			suword32(&m->m_ceilings[0], ceiling);
			suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
				UMUTEX_CONTESTED);
			error = 0;
			break;
		}

		/* The address was invalid. */
		if (owner == -1) {
			error = EFAULT;
			break;
		}

		if ((owner & ~UMUTEX_CONTESTED) == id) {
			suword32(&m->m_ceilings[0], ceiling);
			error = 0;
			break;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			break;

		/*
		 * The mutex is held by another thread; queue ourselves
		 * and sleep until it is unlocked, then retry.
		 */
		umtxq_lock(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		error = umtxq_sleep(uq, "umtxpp", 0);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	}
	umtxq_lock(&uq->uq_key);
	if (error == 0)
		umtxq_signal(&uq->uq_key, INT_MAX);
	umtxq_unbusy(&uq->uq_key);
	umtxq_unlock(&uq->uq_key);
	umtx_key_release(&uq->uq_key);
	if (error == 0 && old_ceiling != NULL)
		suword32(old_ceiling, save_ceiling);
	return (error);
}
2136
2137static int
2138_do_lock_umutex(struct thread *td, struct umutex *m, int flags, int timo,
2139	int try)
2140{
2141	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2142	case 0:
2143		return (_do_lock_normal(td, m, flags, timo, try));
2144	case UMUTEX_PRIO_INHERIT:
2145		return (_do_lock_pi(td, m, flags, timo, try));
2146	case UMUTEX_PRIO_PROTECT:
2147		return (_do_lock_pp(td, m, flags, timo, try));
2148	}
2149	return (EINVAL);
2150}
2151
2152/*
2153 * Lock a userland POSIX mutex.
2154 */
2155static int
2156do_lock_umutex(struct thread *td, struct umutex *m,
2157	struct timespec *timeout, int try)
2158{
2159	struct timespec ts, ts2, ts3;
2160	struct timeval tv;
2161	uint32_t flags;
2162	int error;
2163
2164	flags = fuword32(&m->m_flags);
2165	if (flags == -1)
2166		return (EFAULT);
2167
2168	if (timeout == NULL) {
2169		error = _do_lock_umutex(td, m, flags, 0, try);
2170		/* Mutex locking is restarted if it is interrupted. */
2171		if (error == EINTR)
2172			error = ERESTART;
2173	} else {
2174		getnanouptime(&ts);
2175		timespecadd(&ts, timeout);
2176		TIMESPEC_TO_TIMEVAL(&tv, timeout);
2177		for (;;) {
2178			error = _do_lock_umutex(td, m, flags, tvtohz(&tv), try);
2179			if (error != ETIMEDOUT)
2180				break;
2181			getnanouptime(&ts2);
2182			if (timespeccmp(&ts2, &ts, >=)) {
2183				error = ETIMEDOUT;
2184				break;
2185			}
2186			ts3 = ts;
2187			timespecsub(&ts3, &ts2);
2188			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
2189		}
2190		/* Timed-locking is not restarted. */
2191		if (error == ERESTART)
2192			error = EINTR;
2193	}
2194	return (error);
2195}
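
/*
 * The timeout handling above is the usual absolute-deadline pattern:
 * the relative timeout is turned into a deadline once, and after
 * every premature ETIMEDOUT the remaining slice is recomputed and
 * the sleep retried.  As a sketch, with try_with_timeout() standing
 * in for _do_lock_umutex():
 *
 *	getnanouptime(&ts);
 *	timespecadd(&ts, timeout);		deadline = now + timeout
 *	for (;;) {
 *		error = try_with_timeout(tvtohz(&tv));
 *		if (error != ETIMEDOUT)
 *			break;			success or a real error
 *		getnanouptime(&ts2);
 *		if (timespeccmp(&ts2, &ts, >=))
 *			break;			deadline really passed
 *		ts3 = ts;
 *		timespecsub(&ts3, &ts2);	remaining = deadline - now
 *		TIMESPEC_TO_TIMEVAL(&tv, &ts3);
 *	}
 *
 * The same loop reappears in do_rw_rdlock2() and do_rw_wrlock2()
 * below.
 */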
2196
2197/*
2198 * Unlock a userland POSIX mutex.
2199 */
2200static int
2201do_unlock_umutex(struct thread *td, struct umutex *m)
2202{
2203	uint32_t flags;
2204
2205	flags = fuword32(&m->m_flags);
2206	if (flags == -1)
2207		return (EFAULT);
2208
2209	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2210	case 0:
2211		return (do_unlock_normal(td, m, flags));
2212	case UMUTEX_PRIO_INHERIT:
2213		return (do_unlock_pi(td, m, flags));
2214	case UMUTEX_PRIO_PROTECT:
2215		return (do_unlock_pp(td, m, flags));
2216	}
2217
2218	return (EINVAL);
2219}
2220
2221static int
2222do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m,
2223	struct timespec *timeout, u_long wflags)
2224{
2225	struct umtx_q *uq;
2226	struct timeval tv;
2227	struct timespec cts, ets, tts;
2228	uint32_t flags;
2229	int error;
2230
2231	uq = td->td_umtxq;
2232	flags = fuword32(&cv->c_flags);
2233	error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key);
2234	if (error != 0)
2235		return (error);
2236	umtxq_lock(&uq->uq_key);
2237	umtxq_busy(&uq->uq_key);
2238	umtxq_insert(uq);
2239	umtxq_unlock(&uq->uq_key);
2240
2241	/*
2242	 * Set c_has_waiters to 1 before releasing the user mutex, so
2243	 * a signaller testing the flag after the unlock sees us.
2244	 */
2245	suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 1);
2246
2247	umtxq_lock(&uq->uq_key);
2248	umtxq_unbusy(&uq->uq_key);
2249	umtxq_unlock(&uq->uq_key);
2250
2251	error = do_unlock_umutex(td, m);
2252
2253	umtxq_lock(&uq->uq_key);
2254	if (error == 0) {
2255		if ((wflags & UMTX_CHECK_UNPARKING) &&
2256		    (td->td_pflags & TDP_WAKEUP)) {
2257			td->td_pflags &= ~TDP_WAKEUP;
2258			error = EINTR;
2259		} else if (timeout == NULL) {
2260			error = umtxq_sleep(uq, "ucond", 0);
2261		} else {
2262			getnanouptime(&ets);
2263			timespecadd(&ets, timeout);
2264			TIMESPEC_TO_TIMEVAL(&tv, timeout);
2265			for (;;) {
2266				error = umtxq_sleep(uq, "ucond", tvtohz(&tv));
2267				if (error != ETIMEDOUT)
2268					break;
2269				getnanouptime(&cts);
2270				if (timespeccmp(&cts, &ets, >=)) {
2271					error = ETIMEDOUT;
2272					break;
2273				}
2274				tts = ets;
2275				timespecsub(&tts, &cts);
2276				TIMESPEC_TO_TIMEVAL(&tv, &tts);
2277			}
2278		}
2279	}
2280
2281	if (error != 0) {
2282		if ((uq->uq_flags & UQF_UMTXQ) == 0) {
2283			/*
2284			 * If we were concurrently woken by do_cv_signal()
2285			 * but are nevertheless returning an error (a UNIX
2286			 * signal or a timeout), perform another umtxq_signal
2287			 * to avoid consuming the wakeup. This may cause a
2288			 * spurious wakeup for another thread which was just
2289			 * queued, but SUSv3 explicitly allows spurious
2290			 * wakeups, and indeed a kernel-based implementation
2291			 * cannot avoid them.
2292			 */
2293			if (!umtxq_signal(&uq->uq_key, 1))
2294				error = 0;
2295		}
2296		if (error == ERESTART)
2297			error = EINTR;
2298	}
2299	umtxq_remove(uq);
2300	umtxq_unlock(&uq->uq_key);
2301	umtx_key_release(&uq->uq_key);
2302	return (error);
2303}
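
/*
 * A rough sketch of the assumed userland call: a
 * pthread_cond_wait()-style wrapper funnels into do_cv_wait() with
 * the condition variable as obj, the mutex to release as uaddr1,
 * and the wait flags in val, e.g.
 *
 *	_umtx_op(cv, UMTX_OP_CV_WAIT, UMTX_CHECK_UNPARKING, m, ts);
 *
 * where cv is a struct ucond *, m a struct umutex *, and ts either
 * NULL (wait forever) or a relative struct timespec *.
 */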
2304
2305/*
2306 * Signal a userland condition variable.
2307 */
2308static int
2309do_cv_signal(struct thread *td, struct ucond *cv)
2310{
2311	struct umtx_key key;
2312	int error, cnt, nwake;
2313	uint32_t flags;
2314
2315	flags = fuword32(&cv->c_flags);
2316	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2317		return (error);
2318	umtxq_lock(&key);
2319	umtxq_busy(&key);
2320	cnt = umtxq_count(&key);
2321	nwake = umtxq_signal(&key, 1);
2322	if (cnt <= nwake) {
2323		umtxq_unlock(&key);
2324		error = suword32(
2325		    __DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
2326		umtxq_lock(&key);
2327	}
2328	umtxq_unbusy(&key);
2329	umtxq_unlock(&key);
2330	umtx_key_release(&key);
2331	return (error);
2332}
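
/*
 * A sketch of the userland fast path that c_has_waiters is assumed
 * to enable: do_cv_wait() sets the flag and do_cv_signal() clears it
 * once the last waiter is woken, so a signaller can skip the system
 * call entirely while nobody sleeps on the ucond:
 *
 *	if (cv->c_has_waiters)
 *		_umtx_op(cv, UMTX_OP_CV_SIGNAL, 0, NULL, NULL);
 */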
2333
2334static int
2335do_cv_broadcast(struct thread *td, struct ucond *cv)
2336{
2337	struct umtx_key key;
2338	int error;
2339	uint32_t flags;
2340
2341	flags = fuword32(&cv->c_flags);
2342	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2343		return (error);
2344
2345	umtxq_lock(&key);
2346	umtxq_busy(&key);
2347	umtxq_signal(&key, INT_MAX);
2348	umtxq_unlock(&key);
2349
2350	error = suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
2351
2352	umtxq_lock(&key);
2353	umtxq_unbusy(&key);
2354	umtxq_unlock(&key);
2355
2356	umtx_key_release(&key);
2357	return (error);
2358}
2359
2360static int
2361do_rw_rdlock(struct thread *td, struct urwlock *rwlock, long fflag, int timo)
2362{
2363	struct umtx_q *uq;
2364	uint32_t flags, wrflags;
2365	int32_t state, oldstate;
2366	int32_t blocked_readers;
2367	int error;
2368
2369	uq = td->td_umtxq;
2370	flags = fuword32(&rwlock->rw_flags);
2371	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2372	if (error != 0)
2373		return (error);
2374
2375	wrflags = URWLOCK_WRITE_OWNER;
2376	if (!(fflag & URWLOCK_PREFER_READER) && !(flags & URWLOCK_PREFER_READER))
2377		wrflags |= URWLOCK_WRITE_WAITERS;
2378
2379	for (;;) {
2380		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2381		/* try to lock it */
2382		while (!(state & wrflags)) {
2383			if (__predict_false(URWLOCK_READER_COUNT(state) == URWLOCK_MAX_READERS)) {
2384				umtx_key_release(&uq->uq_key);
2385				return (EAGAIN);
2386			}
2387			oldstate = casuword32(&rwlock->rw_state, state, state + 1);
2388			if (oldstate == state) {
2389				umtx_key_release(&uq->uq_key);
2390				return (0);
2391			}
2392			state = oldstate;
2393		}
2394
2395		if (error)
2396			break;
2397
2398		/* grab monitor lock */
2399		umtxq_lock(&uq->uq_key);
2400		umtxq_busy(&uq->uq_key);
2401		umtxq_unlock(&uq->uq_key);
2402
2403		/* set read contention bit */
2404		while ((state & wrflags) && !(state & URWLOCK_READ_WAITERS)) {
2405			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_READ_WAITERS);
2406			if (oldstate == state)
2407				goto sleep;
2408			state = oldstate;
2409		}
2410
2411		/* The state changed while we were setting the flag; restart. */
2412		if (!(state & wrflags)) {
2413			umtxq_lock(&uq->uq_key);
2414			umtxq_unbusy(&uq->uq_key);
2415			umtxq_unlock(&uq->uq_key);
2416			continue;
2417		}
2418
2419sleep:
2420		/* The contention bit is set; bump the read waiter count before sleeping. */
2421		blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2422		suword32(&rwlock->rw_blocked_readers, blocked_readers+1);
2423
2424		while (state & wrflags) {
2425			umtxq_lock(&uq->uq_key);
2426			umtxq_insert(uq);
2427			umtxq_unbusy(&uq->uq_key);
2428
2429			error = umtxq_sleep(uq, "urdlck", timo);
2430
2431			umtxq_busy(&uq->uq_key);
2432			umtxq_remove(uq);
2433			umtxq_unlock(&uq->uq_key);
2434			if (error)
2435				break;
2436			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2437		}
2438
2439		/* Decrease the read waiter count; the last blocked reader clears the contention bit. */
2440		blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2441		suword32(&rwlock->rw_blocked_readers, blocked_readers-1);
2442		if (blocked_readers == 1) {
2443			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2444			for (;;) {
2445				oldstate = casuword32(&rwlock->rw_state, state,
2446					 state & ~URWLOCK_READ_WAITERS);
2447				if (oldstate == state)
2448					break;
2449				state = oldstate;
2450			}
2451		}
2452
2453		umtxq_lock(&uq->uq_key);
2454		umtxq_unbusy(&uq->uq_key);
2455		umtxq_unlock(&uq->uq_key);
2456	}
2457	umtx_key_release(&uq->uq_key);
2458	return (error);
2459}
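
/*
 * rw_state keeps the reader count in its low bits, below the
 * URWLOCK_WRITE_OWNER and waiter flag bits, so the uncontended
 * userland read-lock is assumed to be a single CAS mirroring the
 * casuword32() increment above.  A sketch, casts elided:
 *
 *	int32_t state = rw->rw_state;
 *	while (!(state & URWLOCK_WRITE_OWNER) &&
 *	    URWLOCK_READER_COUNT(state) < URWLOCK_MAX_READERS) {
 *		if (atomic_cmpset_acq_32(&rw->rw_state, state, state + 1))
 *			return (0);
 *		state = rw->rw_state;
 *	}
 *	return (_umtx_op(rw, UMTX_OP_RW_RDLOCK, fflag, NULL, tsp));
 *
 * Only when the CAS loop fails does a thread enter the kernel and
 * reach the queueing and sleeping machinery of this function.
 */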
2460
2461static int
2462do_rw_rdlock2(struct thread *td, void *obj, long val, struct timespec *timeout)
2463{
2464	struct timespec ts, ts2, ts3;
2465	struct timeval tv;
2466	int error;
2467
2468	getnanouptime(&ts);
2469	timespecadd(&ts, timeout);
2470	TIMESPEC_TO_TIMEVAL(&tv, timeout);
2471	for (;;) {
2472		error = do_rw_rdlock(td, obj, val, tvtohz(&tv));
2473		if (error != ETIMEDOUT)
2474			break;
2475		getnanouptime(&ts2);
2476		if (timespeccmp(&ts2, &ts, >=)) {
2477			error = ETIMEDOUT;
2478			break;
2479		}
2480		ts3 = ts;
2481		timespecsub(&ts3, &ts2);
2482		TIMESPEC_TO_TIMEVAL(&tv, &ts3);
2483	}
2484	if (error == ERESTART)
2485		error = EINTR;
2486	return (error);
2487}
2488
2489static int
2490do_rw_wrlock(struct thread *td, struct urwlock *rwlock, int timo)
2491{
2492	struct umtx_q *uq;
2493	uint32_t flags;
2494	int32_t state, oldstate;
2495	int32_t blocked_writers;
2496	int error;
2497
2498	uq = td->td_umtxq;
2499	flags = fuword32(&rwlock->rw_flags);
2500	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2501	if (error != 0)
2502		return (error);
2503
2504	for (;;) {
2505		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2506		while (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2507			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_OWNER);
2508			if (oldstate == state) {
2509				umtx_key_release(&uq->uq_key);
2510				return (0);
2511			}
2512			state = oldstate;
2513		}
2514
2515		if (error)
2516			break;
2517
2518		/* grab monitor lock */
2519		umtxq_lock(&uq->uq_key);
2520		umtxq_busy(&uq->uq_key);
2521		umtxq_unlock(&uq->uq_key);
2522
2523		while (((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) &&
2524		       (state & URWLOCK_WRITE_WAITERS) == 0) {
2525			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_WAITERS);
2526			if (oldstate == state)
2527				goto sleep;
2528			state = oldstate;
2529		}
2530
2531		if (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2532			umtxq_lock(&uq->uq_key);
2533			umtxq_unbusy(&uq->uq_key);
2534			umtxq_unlock(&uq->uq_key);
2535			continue;
2536		}
2537sleep:
2538		blocked_writers = fuword32(&rwlock->rw_blocked_writers);
2539		suword32(&rwlock->rw_blocked_writers, blocked_writers+1);
2540
2541		while ((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) {
2542			umtxq_lock(&uq->uq_key);
2543			umtxq_insert_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2544			umtxq_unbusy(&uq->uq_key);
2545
2546			error = umtxq_sleep(uq, "uwrlck", timo);
2547
2548			umtxq_busy(&uq->uq_key);
2549			umtxq_remove_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2550			umtxq_unlock(&uq->uq_key);
2551			if (error)
2552				break;
2553			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2554		}
2555
2556		blocked_writers = fuword32(&rwlock->rw_blocked_writers);
2557		suword32(&rwlock->rw_blocked_writers, blocked_writers-1);
2558		if (blocked_writers == 1) {
2559			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2560			for (;;) {
2561				oldstate = casuword32(&rwlock->rw_state, state,
2562					 state & ~URWLOCK_WRITE_WAITERS);
2563				if (oldstate == state)
2564					break;
2565				state = oldstate;
2566			}
2567		}
2568
2569		umtxq_lock(&uq->uq_key);
2570		umtxq_unbusy(&uq->uq_key);
2571		umtxq_unlock(&uq->uq_key);
2572	}
2573
2574	umtx_key_release(&uq->uq_key);
2575	return (error);
2576}
2577
2578static int
2579do_rw_wrlock2(struct thread *td, void *obj, struct timespec *timeout)
2580{
2581	struct timespec ts, ts2, ts3;
2582	struct timeval tv;
2583	int error;
2584
2585	getnanouptime(&ts);
2586	timespecadd(&ts, timeout);
2587	TIMESPEC_TO_TIMEVAL(&tv, timeout);
2588	for (;;) {
2589		error = do_rw_wrlock(td, obj, tvtohz(&tv));
2590		if (error != ETIMEDOUT)
2591			break;
2592		getnanouptime(&ts2);
2593		if (timespeccmp(&ts2, &ts, >=)) {
2594			error = ETIMEDOUT;
2595			break;
2596		}
2597		ts3 = ts;
2598		timespecsub(&ts3, &ts2);
2599		TIMESPEC_TO_TIMEVAL(&tv, &ts3);
2600	}
2601	if (error == ERESTART)
2602		error = EINTR;
2603	return (error);
2604}
2605
2606static int
2607do_rwlock_unlock(struct thread *td, struct urwlock *rwlock)
2608{
2609	struct umtx_q *uq;
2610	uint32_t flags;
2611	int32_t state, oldstate;
2612	int error, q, count;
2613
2614	uq = td->td_umtxq;
2615	flags = fuword32(&rwlock->rw_flags);
2616	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2617	if (error != 0)
2618		return (error);
2619
2620	state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2621	if (state & URWLOCK_WRITE_OWNER) {
2622		for (;;) {
2623			oldstate = casuword32(&rwlock->rw_state, state,
2624				state & ~URWLOCK_WRITE_OWNER);
2625			if (oldstate != state) {
2626				state = oldstate;
2627				if (!(oldstate & URWLOCK_WRITE_OWNER)) {
2628					error = EPERM;
2629					goto out;
2630				}
2631			} else
2632				break;
2633		}
2634	} else if (URWLOCK_READER_COUNT(state) != 0) {
2635		for (;;) {
2636			oldstate = casuword32(&rwlock->rw_state, state,
2637				state - 1);
2638			if (oldstate != state) {
2639				state = oldstate;
2640				if (URWLOCK_READER_COUNT(oldstate) == 0) {
2641					error = EPERM;
2642					goto out;
2643				}
2644			}
2645			else
2646				break;
2647		}
2648	} else {
2649		error = EPERM;
2650		goto out;
2651	}
2652
2653	count = 0;
2654
2655	if (!(flags & URWLOCK_PREFER_READER)) {
2656		if (state & URWLOCK_WRITE_WAITERS) {
2657			count = 1;
2658			q = UMTX_EXCLUSIVE_QUEUE;
2659		} else if (state & URWLOCK_READ_WAITERS) {
2660			count = INT_MAX;
2661			q = UMTX_SHARED_QUEUE;
2662		}
2663	} else {
2664		if (state & URWLOCK_READ_WAITERS) {
2665			count = INT_MAX;
2666			q = UMTX_SHARED_QUEUE;
2667		} else if (state & URWLOCK_WRITE_WAITERS) {
2668			count = 1;
2669			q = UMTX_EXCLUSIVE_QUEUE;
2670		}
2671	}
2672
2673	if (count) {
2674		umtxq_lock(&uq->uq_key);
2675		umtxq_busy(&uq->uq_key);
2676		umtxq_signal_queue(&uq->uq_key, count, q);
2677		umtxq_unbusy(&uq->uq_key);
2678		umtxq_unlock(&uq->uq_key);
2679	}
2680out:
2681	umtx_key_release(&uq->uq_key);
2682	return (error);
2683}
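
/*
 * The count/queue selection above is the wake-up policy: by default
 * one blocked writer is preferred over the readers, and only with
 * URWLOCK_PREFER_READER are all blocked readers (count = INT_MAX)
 * woken first.  From userland the slow-path unlock would simply be:
 *
 *	_umtx_op(rw, UMTX_OP_RW_UNLOCK, 0, NULL, NULL);
 */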
2684
2685int
2686_umtx_lock(struct thread *td, struct _umtx_lock_args *uap)
2687    /* struct umtx *umtx */
2688{
2689	return _do_lock_umtx(td, uap->umtx, td->td_tid, 0);
2690}
2691
2692int
2693_umtx_unlock(struct thread *td, struct _umtx_unlock_args *uap)
2694    /* struct umtx *umtx */
2695{
2696	return do_unlock_umtx(td, uap->umtx, td->td_tid);
2697}
2698
2699static int
2700__umtx_op_lock_umtx(struct thread *td, struct _umtx_op_args *uap)
2701{
2702	struct timespec *ts, timeout;
2703	int error;
2704
2705	/* Allow a null timespec (wait forever). */
2706	if (uap->uaddr2 == NULL)
2707		ts = NULL;
2708	else {
2709		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2710		if (error != 0)
2711			return (error);
2712		if (timeout.tv_nsec >= 1000000000 ||
2713		    timeout.tv_nsec < 0) {
2714			return (EINVAL);
2715		}
2716		ts = &timeout;
2717	}
2718	return (do_lock_umtx(td, uap->obj, uap->val, ts));
2719}
2720
2721static int
2722__umtx_op_unlock_umtx(struct thread *td, struct _umtx_op_args *uap)
2723{
2724	return (do_unlock_umtx(td, uap->obj, uap->val));
2725}
2726
2727static int
2728__umtx_op_wait(struct thread *td, struct _umtx_op_args *uap)
2729{
2730	struct timespec *ts, timeout;
2731	int error;
2732
2733	if (uap->uaddr2 == NULL)
2734		ts = NULL;
2735	else {
2736		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2737		if (error != 0)
2738			return (error);
2739		if (timeout.tv_nsec >= 1000000000 ||
2740		    timeout.tv_nsec < 0)
2741			return (EINVAL);
2742		ts = &timeout;
2743	}
2744	return do_wait(td, uap->obj, uap->val, ts, 0);
2745}
2746
2747static int
2748__umtx_op_wait_uint(struct thread *td, struct _umtx_op_args *uap)
2749{
2750	struct timespec *ts, timeout;
2751	int error;
2752
2753	if (uap->uaddr2 == NULL)
2754		ts = NULL;
2755	else {
2756		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2757		if (error != 0)
2758			return (error);
2759		if (timeout.tv_nsec >= 1000000000 ||
2760		    timeout.tv_nsec < 0)
2761			return (EINVAL);
2762		ts = &timeout;
2763	}
2764	return do_wait(td, uap->obj, uap->val, ts, 1);
2765}
2766
2767static int
2768__umtx_op_wake(struct thread *td, struct _umtx_op_args *uap)
2769{
2770	return (kern_umtx_wake(td, uap->obj, uap->val));
2771}
2772
2773static int
2774__umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap)
2775{
2776	struct timespec *ts, timeout;
2777	int error;
2778
2779	/* Allow a null timespec (wait forever). */
2780	if (uap->uaddr2 == NULL)
2781		ts = NULL;
2782	else {
2783		error = copyin(uap->uaddr2, &timeout,
2784		    sizeof(timeout));
2785		if (error != 0)
2786			return (error);
2787		if (timeout.tv_nsec >= 1000000000 ||
2788		    timeout.tv_nsec < 0) {
2789			return (EINVAL);
2790		}
2791		ts = &timeout;
2792	}
2793	return do_lock_umutex(td, uap->obj, ts, 0);
2794}
2795
2796static int
2797__umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap)
2798{
2799	return do_lock_umutex(td, uap->obj, NULL, 1);
2800}
2801
2802static int
2803__umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap)
2804{
2805	return do_unlock_umutex(td, uap->obj);
2806}
2807
2808static int
2809__umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap)
2810{
2811	return do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1);
2812}
2813
2814static int
2815__umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap)
2816{
2817	struct timespec *ts, timeout;
2818	int error;
2819
2820	/* Allow a null timespec (wait forever). */
2821	if (uap->uaddr2 == NULL)
2822		ts = NULL;
2823	else {
2824		error = copyin(uap->uaddr2, &timeout,
2825		    sizeof(timeout));
2826		if (error != 0)
2827			return (error);
2828		if (timeout.tv_nsec >= 1000000000 ||
2829		    timeout.tv_nsec < 0) {
2830			return (EINVAL);
2831		}
2832		ts = &timeout;
2833	}
2834	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
2835}
2836
2837static int
2838__umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap)
2839{
2840	return do_cv_signal(td, uap->obj);
2841}
2842
2843static int
2844__umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap)
2845{
2846	return do_cv_broadcast(td, uap->obj);
2847}
2848
2849static int
2850__umtx_op_rw_rdlock(struct thread *td, struct _umtx_op_args *uap)
2851{
2852	struct timespec timeout;
2853	int error;
2854
2855	/* Allow a null timespec (wait forever). */
2856	if (uap->uaddr2 == NULL) {
2857		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
2858	} else {
2859		error = copyin(uap->uaddr2, &timeout,
2860		    sizeof(timeout));
2861		if (error != 0)
2862			return (error);
2863		if (timeout.tv_nsec >= 1000000000 ||
2864		    timeout.tv_nsec < 0) {
2865			return (EINVAL);
2866		}
2867		error = do_rw_rdlock2(td, uap->obj, uap->val, &timeout);
2868	}
2869	return (error);
2870}
2871
2872static int
2873__umtx_op_rw_wrlock(struct thread *td, struct _umtx_op_args *uap)
2874{
2875	struct timespec timeout;
2876	int error;
2877
2878	/* Allow a null timespec (wait forever). */
2879	if (uap->uaddr2 == NULL) {
2880		error = do_rw_wrlock(td, uap->obj, 0);
2881	} else {
2882		error = copyin(uap->uaddr2, &timeout,
2883		    sizeof(timeout));
2884		if (error != 0)
2885			return (error);
2886		if (timeout.tv_nsec >= 1000000000 ||
2887		    timeout.tv_nsec < 0) {
2888			return (EINVAL);
2889		}
2890
2891		error = do_rw_wrlock2(td, uap->obj, &timeout);
2892	}
2893	return (error);
2894}
2895
2896static int
2897__umtx_op_rw_unlock(struct thread *td, struct _umtx_op_args *uap)
2898{
2899	return do_rwlock_unlock(td, uap->obj);
2900}
2901
2902typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap);
2903
2904static _umtx_op_func op_table[] = {
2905	__umtx_op_lock_umtx,		/* UMTX_OP_LOCK */
2906	__umtx_op_unlock_umtx,		/* UMTX_OP_UNLOCK */
2907	__umtx_op_wait,			/* UMTX_OP_WAIT */
2908	__umtx_op_wake,			/* UMTX_OP_WAKE */
2909	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
2910	__umtx_op_lock_umutex,		/* UMTX_OP_MUTEX_LOCK */
2911	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
2912	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
2913	__umtx_op_cv_wait,		/* UMTX_OP_CV_WAIT */
2914	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
2915	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
2916	__umtx_op_wait_uint,		/* UMTX_OP_WAIT_UINT */
2917	__umtx_op_rw_rdlock,		/* UMTX_OP_RW_RDLOCK */
2918	__umtx_op_rw_wrlock,		/* UMTX_OP_RW_WRLOCK */
2919	__umtx_op_rw_unlock		/* UMTX_OP_RW_UNLOCK */
2920};
2921
2922int
2923_umtx_op(struct thread *td, struct _umtx_op_args *uap)
2924{
2925	if ((unsigned)uap->op < UMTX_OP_MAX)
2926		return (*op_table[uap->op])(td, uap);
2927	return (EINVAL);
2928}
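
/*
 * The WAIT/WAKE entries of op_table give userland a futex-like
 * primitive.  A sketch of a minimal one-shot event built on them:
 *
 *	volatile u_int flag = 0;
 *
 *	waiter:
 *		while (flag == 0)
 *			_umtx_op((void *)&flag, UMTX_OP_WAIT_UINT, 0,
 *			    NULL, NULL);
 *
 *	waker:
 *		flag = 1;
 *		_umtx_op((void *)&flag, UMTX_OP_WAKE, INT_MAX, NULL, NULL);
 *
 * UMTX_OP_WAIT_UINT sleeps only while the word still equals val (0
 * here), so a wake-up between the test and the sleep cannot be lost;
 * UMTX_OP_WAKE's val bounds the number of waiters woken.
 */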
2929
2930#ifdef COMPAT_IA32
2931int
2932freebsd32_umtx_lock(struct thread *td, struct freebsd32_umtx_lock_args *uap)
2933    /* struct umtx *umtx */
2934{
2935	return (do_lock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid, NULL));
2936}
2937
2938int
2939freebsd32_umtx_unlock(struct thread *td, struct freebsd32_umtx_unlock_args *uap)
2940    /* struct umtx *umtx */
2941{
2942	return (do_unlock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid));
2943}
2944
2945struct timespec32 {
2946	u_int32_t tv_sec;
2947	u_int32_t tv_nsec;
2948};
2949
2950static inline int
2951copyin_timeout32(void *addr, struct timespec *tsp)
2952{
2953	struct timespec32 ts32;
2954	int error;
2955
2956	error = copyin(addr, &ts32, sizeof(struct timespec32));
2957	if (error == 0) {
2958		tsp->tv_sec = ts32.tv_sec;
2959		tsp->tv_nsec = ts32.tv_nsec;
2960	}
2961	return (error);
2962}
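
/*
 * A 32-bit process lays struct timespec out as two 32-bit words, so
 * the native copyin() used by the 64-bit handlers would read the
 * wrong layout (and too many bytes) from such a process; the timed
 * compat32 handlers below therefore widen the fields by hand through
 * this helper.
 */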
2963
2964static int
2965__umtx_op_lock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
2966{
2967	struct timespec *ts, timeout;
2968	int error;
2969
2970	/* Allow a null timespec (wait forever). */
2971	if (uap->uaddr2 == NULL)
2972		ts = NULL;
2973	else {
2974		error = copyin_timeout32(uap->uaddr2, &timeout);
2975		if (error != 0)
2976			return (error);
2977		if (timeout.tv_nsec >= 1000000000 ||
2978		    timeout.tv_nsec < 0) {
2979			return (EINVAL);
2980		}
2981		ts = &timeout;
2982	}
2983	return (do_lock_umtx32(td, uap->obj, uap->val, ts));
2984}
2985
2986static int
2987__umtx_op_unlock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
2988{
2989	return (do_unlock_umtx32(td, uap->obj, (uint32_t)uap->val));
2990}
2991
2992static int
2993__umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
2994{
2995	struct timespec *ts, timeout;
2996	int error;
2997
2998	if (uap->uaddr2 == NULL)
2999		ts = NULL;
3000	else {
3001		error = copyin_timeout32(uap->uaddr2, &timeout);
3002		if (error != 0)
3003			return (error);
3004		if (timeout.tv_nsec >= 1000000000 ||
3005		    timeout.tv_nsec < 0)
3006			return (EINVAL);
3007		ts = &timeout;
3008	}
3009	return do_wait(td, uap->obj, uap->val, ts, 1);
3010}
3011
3012static int
3013__umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3014{
3015	struct timespec *ts, timeout;
3016	int error;
3017
3018	/* Allow a null timespec (wait forever). */
3019	if (uap->uaddr2 == NULL)
3020		ts = NULL;
3021	else {
3022		error = copyin_timeout32(uap->uaddr2, &timeout);
3023		if (error != 0)
3024			return (error);
3025		if (timeout.tv_nsec >= 1000000000 ||
3026		    timeout.tv_nsec < 0)
3027			return (EINVAL);
3028		ts = &timeout;
3029	}
3030	return do_lock_umutex(td, uap->obj, ts, 0);
3031}
3032
3033static int
3034__umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3035{
3036	struct timespec *ts, timeout;
3037	int error;
3038
3039	/* Allow a null timespec (wait forever). */
3040	if (uap->uaddr2 == NULL)
3041		ts = NULL;
3042	else {
3043		error = copyin_timeout32(uap->uaddr2, &timeout);
3044		if (error != 0)
3045			return (error);
3046		if (timeout.tv_nsec >= 1000000000 ||
3047		    timeout.tv_nsec < 0)
3048			return (EINVAL);
3049		ts = &timeout;
3050	}
3051	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3052}
3053
3054static int
3055__umtx_op_rw_rdlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3056{
3057	struct timespec timeout;
3058	int error;
3059
3060	/* Allow a null timespec (wait forever). */
3061	if (uap->uaddr2 == NULL) {
3062		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3063	} else {
3064		error = copyin_timeout32(uap->uaddr2, &timeout);
3066		if (error != 0)
3067			return (error);
3068		if (timeout.tv_nsec >= 1000000000 ||
3069		    timeout.tv_nsec < 0) {
3070			return (EINVAL);
3071		}
3072		error = do_rw_rdlock2(td, uap->obj, uap->val, &timeout);
3073	}
3074	return (error);
3075}
3076
3077static int
3078__umtx_op_rw_wrlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3079{
3080	struct timespec timeout;
3081	int error;
3082
3083	/* Allow a null timespec (wait forever). */
3084	if (uap->uaddr2 == NULL) {
3085		error = do_rw_wrlock(td, uap->obj, 0);
3086	} else {
3087		error = copyin_timeout32(uap->uaddr2, &timeout);
3088		if (error != 0)
3089			return (error);
3090		if (timeout.tv_nsec >= 1000000000 ||
3091		    timeout.tv_nsec < 0) {
3092			return (EINVAL);
3093		}
3094
3095		error = do_rw_wrlock2(td, uap->obj, &timeout);
3096	}
3097	return (error);
3098}
3099
3100static _umtx_op_func op_table_compat32[] = {
3101	__umtx_op_lock_umtx_compat32,	/* UMTX_OP_LOCK */
3102	__umtx_op_unlock_umtx_compat32,	/* UMTX_OP_UNLOCK */
3103	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT */
3104	__umtx_op_wake,			/* UMTX_OP_WAKE */
3105	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
3106	__umtx_op_lock_umutex_compat32,	/* UMTX_OP_MUTEX_LOCK */
3107	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
3108	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
3109	__umtx_op_cv_wait_compat32,	/* UMTX_OP_CV_WAIT */
3110	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
3111	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
3112	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT_UINT */
3113	__umtx_op_rw_rdlock_compat32,	/* UMTX_OP_RW_RDLOCK */
3114	__umtx_op_rw_wrlock_compat32,	/* UMTX_OP_RW_WRLOCK */
3115	__umtx_op_rw_unlock		/* UMTX_OP_RW_UNLOCK */
3116};
3117
3118int
3119freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap)
3120{
3121	if ((unsigned)uap->op < UMTX_OP_MAX)
3122		return (*op_table_compat32[uap->op])(td,
3123			(struct _umtx_op_args *)uap);
3124	return (EINVAL);
3125}
3126#endif
3127
3128void
3129umtx_thread_init(struct thread *td)
3130{
3131	td->td_umtxq = umtxq_alloc();
3132	td->td_umtxq->uq_thread = td;
3133}
3134
3135void
3136umtx_thread_fini(struct thread *td)
3137{
3138	umtxq_free(td->td_umtxq);
3139}
3140
3141/*
3142 * Called when a new thread is created, e.g. by fork().
3143 */
3144void
3145umtx_thread_alloc(struct thread *td)
3146{
3147	struct umtx_q *uq;
3148
3149	uq = td->td_umtxq;
3150	uq->uq_inherited_pri = PRI_MAX;
3151
3152	KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
3153	KASSERT(uq->uq_thread == td, ("uq_thread != td"));
3154	KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
3155	KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
3156}
3157
3158/*
3159 * exec() hook.
3160 */
3161static void
3162umtx_exec_hook(void *arg __unused, struct proc *p __unused,
3163	struct image_params *imgp __unused)
3164{
3165	umtx_thread_cleanup(curthread);
3166}
3167
3168/*
3169 * thread_exit() hook.
3170 */
3171void
3172umtx_thread_exit(struct thread *td)
3173{
3174	umtx_thread_cleanup(td);
3175}
3176
3177/*
3178 * Clean up the thread's umtx data.
3179 */
3180static void
3181umtx_thread_cleanup(struct thread *td)
3182{
3183	struct umtx_q *uq;
3184	struct umtx_pi *pi;
3185
3186	if ((uq = td->td_umtxq) == NULL)
3187		return;
3188
3189	mtx_lock_spin(&umtx_lock);
3190	uq->uq_inherited_pri = PRI_MAX;
3191	while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
3192		pi->pi_owner = NULL;
3193		TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
3194	}
3195	thread_lock(td);
3196	td->td_flags &= ~TDF_UBORROWING;
3197	thread_unlock(td);
3198	mtx_unlock_spin(&umtx_lock);
3199}
3200