kern_umtx.c revision 177880
/*-
 * Copyright (c) 2004, David Xu <davidxu@freebsd.org>
 * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/kern_umtx.c 177880 2008-04-03 11:49:20Z davidxu $");

#include "opt_compat.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/eventhandler.h>
#include <sys/umtx.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>

#include <machine/cpu.h>

#ifdef COMPAT_IA32
#include <compat/freebsd32/freebsd32_proto.h>
#endif

#define TYPE_SIMPLE_LOCK	0
#define TYPE_SIMPLE_WAIT	1
#define TYPE_NORMAL_UMUTEX	2
#define TYPE_PI_UMUTEX		3
#define TYPE_PP_UMUTEX		4
#define TYPE_CV			5
#define TYPE_RWLOCK		6

/* Key to represent a unique userland synchronization object */
struct umtx_key {
	int	hash;
	int	type;
	int	shared;
	union {
		struct {
			vm_object_t	object;
			uintptr_t	offset;
		} shared;
		struct {
			struct vmspace	*vs;
			uintptr_t	addr;
		} private;
		struct {
			void		*a;
			uintptr_t	b;
		} both;
	} info;
};

/* Priority inheritance mutex info. */
struct umtx_pi {
	/* Owner thread */
	struct thread		*pi_owner;

	/* Reference count */
	int			pi_refcount;

	/* List entry to link contested PI mutexes held by a thread */
	TAILQ_ENTRY(umtx_pi)	pi_link;

	/* List entry in hash */
	TAILQ_ENTRY(umtx_pi)	pi_hashlink;

	/* List for waiters */
	TAILQ_HEAD(,umtx_q)	pi_blocked;

	/* Identify a userland lock object */
	struct umtx_key		pi_key;
};

/* A userland synchronization object user. */
struct umtx_q {
	/* Linked list for the hash. */
	TAILQ_ENTRY(umtx_q)	uq_link;

	/* Umtx key. */
	struct umtx_key		uq_key;

	/* Umtx flags. */
	int			uq_flags;
#define UQF_UMTXQ	0x0001

	/* The waiting thread. */
	struct thread		*uq_thread;

	/*
	 * The PI mutex this thread is blocked on.  Readers may hold
	 * either the chain lock or umtx_lock; writers must hold both.
	 */
	struct umtx_pi		*uq_pi_blocked;

	/* On blocked list */
	TAILQ_ENTRY(umtx_q)	uq_lockq;

	/* Contested PI mutexes owned by this thread */
	TAILQ_HEAD(,umtx_pi)	uq_pi_contested;

	/* Inherited priority from PP mutex */
	u_char			uq_inherited_pri;
};

TAILQ_HEAD(umtxq_head, umtx_q);

/* Userland lock object's wait-queue chain */
struct umtxq_chain {
	/* Lock for this chain. */
	struct mtx		uc_lock;

	/* List of sleep queues. */
	struct umtxq_head	uc_queue[2];
#define UMTX_SHARED_QUEUE	0
#define UMTX_EXCLUSIVE_QUEUE	1

	/* Busy flag */
	char			uc_busy;

	/* Chain lock waiters */
	int			uc_waiters;

	/* All PI mutexes in this chain */
	TAILQ_HEAD(,umtx_pi)	uc_pi_list;
};

#define	UMTXQ_LOCKED_ASSERT(uc)		mtx_assert(&(uc)->uc_lock, MA_OWNED)

/*
 * Don't propagate time-sharing priority; there is a security reason.
 * A user can simply create a PI-mutex, let thread A lock it, and let
 * another thread B block on it.  Because B is sleeping, its priority
 * would be boosted, and priority propagation would boost A's priority
 * as well.  A's priority would then never be lowered, even if A were
 * using 100% CPU, which is unfair to other processes.
 */

#define UPRI(td)	(((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\
			  (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\
			 PRI_MAX_TIMESHARE : (td)->td_user_pri)
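
/*
 * Illustrative sketch (not compiled): UPRI() clamps every time-sharing
 * thread to the worst time-sharing priority before it is considered
 * for propagation, implementing the policy described above.  Values
 * here are hypothetical:
 *
 *	td->td_user_pri = PRI_MIN_TIMESHARE + 10;	// time-sharing
 *	pri = UPRI(td);		// clamped to PRI_MAX_TIMESHARE
 *
 *	td->td_user_pri = PRI_MIN_REALTIME;		// real-time
 *	pri = UPRI(td);		// unchanged, may be propagated
 */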

#define	GOLDEN_RATIO_PRIME	2654404609U
#define	UMTX_CHAINS		128
#define	UMTX_SHIFTS		(__WORD_BIT - 7)

#define THREAD_SHARE		0
#define PROCESS_SHARE		1
#define AUTO_SHARE		2

#define	GET_SHARE(flags)	\
    (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)

#define BUSY_SPINS		200

static uma_zone_t		umtx_pi_zone;
static struct umtxq_chain	umtxq_chains[UMTX_CHAINS];
static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
static int			umtx_pi_allocated;

SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
    &umtx_pi_allocated, 0, "Allocated umtx_pi");

static void umtxq_sysinit(void *);
static void umtxq_hash(struct umtx_key *key);
static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
static void umtxq_lock(struct umtx_key *key);
static void umtxq_unlock(struct umtx_key *key);
static void umtxq_busy(struct umtx_key *key);
static void umtxq_unbusy(struct umtx_key *key);
static void umtxq_insert_queue(struct umtx_q *uq, int q);
static void umtxq_remove_queue(struct umtx_q *uq, int q);
static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo);
static int umtxq_count(struct umtx_key *key);
static int umtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2);
static int umtx_key_get(void *addr, int type, int share,
	struct umtx_key *key);
static void umtx_key_release(struct umtx_key *key);
static struct umtx_pi *umtx_pi_alloc(int);
static void umtx_pi_free(struct umtx_pi *pi);
static void umtx_pi_adjust_locked(struct thread *td, u_char oldpri);
static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags);
static void umtx_thread_cleanup(struct thread *td);
static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
	struct image_params *imgp __unused);
SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);

#define umtxq_signal(key, nwake)	umtxq_signal_queue((key), (nwake), UMTX_SHARED_QUEUE)
#define umtxq_insert(uq)	umtxq_insert_queue((uq), UMTX_SHARED_QUEUE)
#define umtxq_remove(uq)	umtxq_remove_queue((uq), UMTX_SHARED_QUEUE)

static struct mtx umtx_lock;

static void
umtxq_sysinit(void *arg __unused)
{
	int i;

	umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	for (i = 0; i < UMTX_CHAINS; ++i) {
		mtx_init(&umtxq_chains[i].uc_lock, "umtxql", NULL,
			 MTX_DEF | MTX_DUPOK);
		TAILQ_INIT(&umtxq_chains[i].uc_queue[0]);
		TAILQ_INIT(&umtxq_chains[i].uc_queue[1]);
		TAILQ_INIT(&umtxq_chains[i].uc_pi_list);
		umtxq_chains[i].uc_busy = 0;
		umtxq_chains[i].uc_waiters = 0;
	}
	mtx_init(&umtx_lock, "umtx lock", NULL, MTX_SPIN);
	EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
	    EVENTHANDLER_PRI_ANY);
}

struct umtx_q *
umtxq_alloc(void)
{
	struct umtx_q *uq;

	uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
	TAILQ_INIT(&uq->uq_pi_contested);
	uq->uq_inherited_pri = PRI_MAX;
	return (uq);
}

void
umtxq_free(struct umtx_q *uq)
{
	free(uq, M_UMTX);
}

static inline void
umtxq_hash(struct umtx_key *key)
{
	unsigned n = (uintptr_t)key->info.both.a + key->info.both.b;
	key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
}
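
/*
 * Illustrative sketch (not compiled): how a key picks its chain via
 * the multiplicative hash above.  For a process-private key built
 * from a hypothetical userland address "addr":
 *
 *	key.info.private.vs   = td->td_proc->p_vmspace;	// info.both.a
 *	key.info.private.addr = (uintptr_t)addr;	// info.both.b
 *	umtxq_hash(&key);
 *	chain = &umtxq_chains[key.hash];   // one of UMTX_CHAINS buckets
 *
 * Multiplying by GOLDEN_RATIO_PRIME and keeping only the high bits
 * spreads nearby addresses across different chains.
 */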

static inline int
umtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2)
{
	return (k1->type == k2->type &&
		k1->info.both.a == k2->info.both.a &&
		k1->info.both.b == k2->info.both.b);
}

static inline struct umtxq_chain *
umtxq_getchain(struct umtx_key *key)
{
	return (&umtxq_chains[key->hash]);
}

/*
 * Lock a chain.
 */
static inline void
umtxq_lock(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_lock(&uc->uc_lock);
}

/*
 * Unlock a chain.
 */
static inline void
umtxq_unlock(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_unlock(&uc->uc_lock);
}

/*
 * Set the chain to the busy state when the following operations
 * may block (a kernel mutex cannot be held across them).
 */
static inline void
umtxq_busy(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_assert(&uc->uc_lock, MA_OWNED);
	if (uc->uc_busy) {
#ifdef SMP
		if (smp_cpus > 1) {
			int count = BUSY_SPINS;
			if (count > 0) {
				umtxq_unlock(key);
				while (uc->uc_busy && --count > 0)
					cpu_spinwait();
				umtxq_lock(key);
			}
		}
#endif
		while (uc->uc_busy) {
			uc->uc_waiters++;
			msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
			uc->uc_waiters--;
		}
	}
	uc->uc_busy = 1;
}

/*
 * Unbusy a chain.
 */
static inline void
umtxq_unbusy(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_assert(&uc->uc_lock, MA_OWNED);
	KASSERT(uc->uc_busy != 0, ("not busy"));
	uc->uc_busy = 0;
	if (uc->uc_waiters)
		wakeup_one(uc);
}

static inline void
umtxq_insert_queue(struct umtx_q *uq, int q)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_INSERT_TAIL(&uc->uc_queue[q], uq, uq_link);
	uq->uq_flags |= UQF_UMTXQ;
}

static inline void
umtxq_remove_queue(struct umtx_q *uq, int q)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	if (uq->uq_flags & UQF_UMTXQ) {
		TAILQ_REMOVE(&uc->uc_queue[q], uq, uq_link);
		uq->uq_flags &= ~UQF_UMTXQ;
	}
}

/*
 * Check if there are multiple waiters
 */
static int
umtxq_count(struct umtx_key *key)
{
	struct umtxq_chain *uc;
	struct umtx_q *uq;
	int count = 0;

	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_FOREACH(uq, &uc->uc_queue[UMTX_SHARED_QUEUE], uq_link) {
		if (umtx_key_match(&uq->uq_key, key)) {
			if (++count > 1)
				break;
		}
	}
	return (count);
}

/*
 * Check if there are multiple PI waiters and return the first waiter.
 */
static int
umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
{
	struct umtxq_chain *uc;
	struct umtx_q *uq;
	int count = 0;

	*first = NULL;
	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_FOREACH(uq, &uc->uc_queue[UMTX_SHARED_QUEUE], uq_link) {
		if (umtx_key_match(&uq->uq_key, key)) {
			if (++count > 1)
				break;
			*first = uq;
		}
	}
	return (count);
}

/*
 * Wake up threads waiting on a userland object.
 */
static int
umtxq_signal_queue(struct umtx_key *key, int n_wake, int q)
{
	struct umtxq_chain *uc;
	struct umtx_q *uq, *next;
	int ret;

	ret = 0;
	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_FOREACH_SAFE(uq, &uc->uc_queue[q], uq_link, next) {
		if (umtx_key_match(&uq->uq_key, key)) {
			umtxq_remove_queue(uq, q);
			wakeup(uq);
			if (++ret >= n_wake)
				break;
		}
	}
	return (ret);
}

/*
 * Wake up the specified thread.
 */
static inline void
umtxq_signal_thread(struct umtx_q *uq)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	umtxq_remove(uq);
	wakeup(uq);
}

/*
 * Put the thread into a sleep state; before sleeping, check whether
 * the thread was removed from the umtx queue.
 */
static inline int
umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo)
{
	struct umtxq_chain *uc;
	int error;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	if (!(uq->uq_flags & UQF_UMTXQ))
		return (0);
	error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
	if (error == EWOULDBLOCK)
		error = ETIMEDOUT;
	return (error);
}

/*
 * Convert a userspace address into a unique logical address.
 */
static int
umtx_key_get(void *addr, int type, int share, struct umtx_key *key)
{
	struct thread *td = curthread;
	vm_map_t map;
	vm_map_entry_t entry;
	vm_pindex_t pindex;
	vm_prot_t prot;
	boolean_t wired;

	key->type = type;
	if (share == THREAD_SHARE) {
		key->shared = 0;
		key->info.private.vs = td->td_proc->p_vmspace;
		key->info.private.addr = (uintptr_t)addr;
	} else {
		MPASS(share == PROCESS_SHARE || share == AUTO_SHARE);
		map = &td->td_proc->p_vmspace->vm_map;
		if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
		    &entry, &key->info.shared.object, &pindex, &prot,
		    &wired) != KERN_SUCCESS) {
			return (EFAULT);
		}

		if ((share == PROCESS_SHARE) ||
		    (share == AUTO_SHARE &&
		     VM_INHERIT_SHARE == entry->inheritance)) {
			key->shared = 1;
			key->info.shared.offset = entry->offset + entry->start -
				(vm_offset_t)addr;
			vm_object_reference(key->info.shared.object);
		} else {
			key->shared = 0;
			key->info.private.vs = td->td_proc->p_vmspace;
			key->info.private.addr = (uintptr_t)addr;
		}
		vm_map_lookup_done(map, entry);
	}

	umtxq_hash(key);
	return (0);
}

/*
 * Release key.
 */
static inline void
umtx_key_release(struct umtx_key *key)
{
	if (key->shared)
		vm_object_deallocate(key->info.shared.object);
}

/*
 * Lock a umtx object.
 */
static int
_do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id, int timo)
{
	struct umtx_q *uq;
	u_long owner;
	u_long old;
	int error = 0;

	uq = td->td_umtxq;

	/*
	 * Care must be exercised when dealing with the umtx structure:
	 * it can fault on any access.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword(&umtx->u_owner, UMTX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMTX_UNOWNED)
			return (0);

		/* The address was invalid. */
		if (owner == -1)
			return (EFAULT);

		/* If no one owns it but it is contested try to acquire it. */
		if (owner == UMTX_CONTESTED) {
			owner = casuword(&umtx->u_owner,
			    UMTX_CONTESTED, id | UMTX_CONTESTED);

			if (owner == UMTX_CONTESTED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			/* If this failed the lock has changed, restart. */
			continue;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK,
			AUTO_SHARE, &uq->uq_key)) != 0)
			return (error);

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword(&umtx->u_owner, owner, owner | UMTX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race with the
		 * thread unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtx", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);
	}

	return (0);
}
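
/*
 * Illustrative sketch (not compiled): the userland fast path that the
 * "uncontested case" comments above refer to.  The wrapper below is
 * hypothetical; only on CAS failure does a thread enter the kernel
 * and reach _do_lock_umtx().
 *
 *	void
 *	user_lock(struct umtx *mtx, u_long my_id)
 *	{
 *		// Fast path: CAS UMTX_UNOWNED -> my_id in userland.
 *		if (atomic_cmpset_acq_long(&mtx->u_owner,
 *		    UMTX_UNOWNED, my_id))
 *			return;
 *		// Slow path: the kernel queues us and sets
 *		// UMTX_CONTESTED on the owner word.
 *		umtx_lock_syscall(mtx);	// hypothetical syscall wrapper
 *	}
 */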

/*
 * Lock a umtx object.
 */
static int
do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id,
	struct timespec *timeout)
{
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	int error;

	if (timeout == NULL) {
		error = _do_lock_umtx(td, umtx, id, 0);
		/* Mutex locking is restarted if it is interrupted. */
		if (error == EINTR)
			error = ERESTART;
	} else {
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
		for (;;) {
			error = _do_lock_umtx(td, umtx, id, tvtohz(&tv));
			if (error != ETIMEDOUT)
				break;
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				break;
			}
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
		}
		/* Timed-locking is not restarted. */
		if (error == ERESTART)
			error = EINTR;
	}
	return (error);
}
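
/*
 * Illustrative sketch (not compiled): the retry loop above converts
 * an absolute deadline into successive relative sleeps.  With
 * hypothetical values, assuming a 1 s timeout and a wakeup 0.4 s in:
 *
 *	ts  = now + 1s;		// absolute deadline, computed once
 *	sleep(tvtohz(1s));	// times out or is woken early
 *	ts2 = now;		// 0.4 s elapsed
 *	ts3 = ts - ts2;		// 0.6 s left, sleep again
 *
 * The same pattern is repeated by do_lock_umtx32(), do_wait(),
 * do_lock_umutex() and do_cv_wait() below.
 */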

/*
 * Unlock a umtx object.
 */
static int
do_unlock_umtx(struct thread *td, struct umtx *umtx, u_long id)
{
	struct umtx_key key;
	u_long owner;
	u_long old;
	int error;
	int count;

	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword(__DEVOLATILE(u_long *, &umtx->u_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMTX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMTX_CONTESTED) == 0) {
		old = casuword(&umtx->u_owner, owner, UMTX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK, AUTO_SHARE,
		&key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * there are zero or one threads waiting for it.
	 * Otherwise, it must be marked as contested.
	 */
	old = casuword(&umtx->u_owner, owner,
		count <= 1 ? UMTX_UNOWNED : UMTX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}

#ifdef COMPAT_IA32

/*
 * Lock a umtx object.
 */
static int
_do_lock_umtx32(struct thread *td, uint32_t *m, uint32_t id, int timo)
{
	struct umtx_q *uq;
	uint32_t owner;
	uint32_t old;
	int error = 0;

	uq = td->td_umtxq;

	/*
	 * Care must be exercised when dealing with the umtx structure:
	 * it can fault on any access.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword32(m, UMUTEX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMUTEX_UNOWNED)
			return (0);

		/* The address was invalid. */
		if (owner == -1)
			return (EFAULT);

		/* If no one owns it but it is contested try to acquire it. */
		if (owner == UMUTEX_CONTESTED) {
			owner = casuword32(m,
			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
			if (owner == UMUTEX_CONTESTED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			/* If this failed the lock has changed, restart. */
			continue;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK,
			AUTO_SHARE, &uq->uq_key)) != 0)
			return (error);

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword32(m, owner, owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race with the
		 * thread unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtx", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);
	}

	return (0);
}

/*
 * Lock a umtx object.
 */
static int
do_lock_umtx32(struct thread *td, void *m, uint32_t id,
	struct timespec *timeout)
{
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	int error;

	if (timeout == NULL) {
		error = _do_lock_umtx32(td, m, id, 0);
		/* Mutex locking is restarted if it is interrupted. */
		if (error == EINTR)
			error = ERESTART;
	} else {
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
		for (;;) {
			error = _do_lock_umtx32(td, m, id, tvtohz(&tv));
			if (error != ETIMEDOUT)
				break;
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				break;
			}
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
		}
		/* Timed-locking is not restarted. */
		if (error == ERESTART)
			error = EINTR;
	}
	return (error);
}

/*
 * Unlock a umtx object.
 */
static int
do_unlock_umtx32(struct thread *td, uint32_t *m, uint32_t id)
{
	struct umtx_key key;
	uint32_t owner;
	uint32_t old;
	int error;
	int count;

	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(m);
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMUTEX_CONTESTED) == 0) {
		old = casuword32(m, owner, UMUTEX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK, AUTO_SHARE,
		&key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * there are zero or one threads waiting for it.
	 * Otherwise, it must be marked as contested.
	 */
	old = casuword32(m, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}
#endif

/*
 * Fetch and compare a value; sleep on the address if the value is
 * unchanged.
 */
static int
do_wait(struct thread *td, void *addr, u_long id,
	struct timespec *timeout, int compat32)
{
	struct umtx_q *uq;
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	u_long tmp;
	int error = 0;

	uq = td->td_umtxq;
	if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT, AUTO_SHARE,
	    &uq->uq_key)) != 0)
		return (error);

	umtxq_lock(&uq->uq_key);
	umtxq_insert(uq);
	umtxq_unlock(&uq->uq_key);
	if (compat32 == 0)
		tmp = fuword(addr);
	else
		tmp = fuword32(addr);
	if (tmp != id) {
		umtxq_lock(&uq->uq_key);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	} else if (timeout == NULL) {
		umtxq_lock(&uq->uq_key);
		error = umtxq_sleep(uq, "uwait", 0);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	} else {
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
		umtxq_lock(&uq->uq_key);
		for (;;) {
			error = umtxq_sleep(uq, "uwait", tvtohz(&tv));
			if (!(uq->uq_flags & UQF_UMTXQ))
				break;
			if (error != ETIMEDOUT)
				break;
			umtxq_unlock(&uq->uq_key);
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				umtxq_lock(&uq->uq_key);
				break;
			}
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
			umtxq_lock(&uq->uq_key);
		}
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	}
	umtx_key_release(&uq->uq_key);
	if (error == ERESTART)
		error = EINTR;
	return (error);
}

/*
 * Wake up threads sleeping on the specified address.
 */
int
kern_umtx_wake(struct thread *td, void *uaddr, int n_wake)
{
	struct umtx_key key;
	int ret;

	if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT, AUTO_SHARE,
	   &key)) != 0)
		return (ret);
	umtxq_lock(&key);
	ret = umtxq_signal(&key, n_wake);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	return (0);
}
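
/*
 * Illustrative sketch (not compiled): do_wait() and kern_umtx_wake()
 * together give futex-style semantics.  The userland wrappers below
 * are assumptions, not part of this file:
 *
 *	while (atomic_load_acq_long(&state) == BUSY)
 *		umtx_wait(&state, BUSY);  // sleeps only if still BUSY
 *
 *	// producer side:
 *	atomic_store_rel_long(&state, READY);
 *	umtx_wake(&state, 1);		  // wake one waiter
 *
 * do_wait() re-reads the word after queueing itself, so a wakeup that
 * races with the state change is never lost.
 */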

/*
 * Lock a PTHREAD_PRIO_NONE protocol POSIX mutex.
 */
static int
_do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags, int timo,
	int try)
{
	struct umtx_q *uq;
	uint32_t owner, old, id;
	int error = 0;

	id = td->td_tid;
	uq = td->td_umtxq;

	/*
	 * Care must be exercised when dealing with the umtx structure:
	 * it can fault on any access.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMUTEX_UNOWNED)
			return (0);

		/* The address was invalid. */
		if (owner == -1)
			return (EFAULT);

		/* If no one owns it but it is contested try to acquire it. */
		if (owner == UMUTEX_CONTESTED) {
			owner = casuword32(&m->m_owner,
			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

			if (owner == UMUTEX_CONTESTED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			/* If this failed the lock has changed, restart. */
			continue;
		}

		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
		    (owner & ~UMUTEX_CONTESTED) == id)
			return (EDEADLK);

		if (try != 0)
			return (EBUSY);

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
		    GET_SHARE(flags), &uq->uq_key)) != 0)
			return (error);

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race with the
		 * thread unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtxn", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);
	}

	return (0);
}
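
/*
 * Illustrative sketch (not compiled): the m_owner word of a normal
 * umutex is a small state machine driven by the CAS calls above.
 * With a hypothetical thread id "tid":
 *
 *	UMUTEX_UNOWNED   --lock-->    tid			(fast path)
 *	tid              --contend--> tid | UMUTEX_CONTESTED	(waiter CAS)
 *	UMUTEX_CONTESTED --lock-->    tid | UMUTEX_CONTESTED
 *	tid | CONTESTED  --unlock-->  UMUTEX_UNOWNED or
 *	                              UMUTEX_CONTESTED	(see do_unlock_normal)
 *
 * The contested bit is the only signal telling an unlocking thread
 * that it must enter the kernel to wake a waiter.
 */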

/*
 * Unlock a PTHREAD_PRIO_NONE protocol POSIX mutex.
 */
static int
do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags)
{
	struct umtx_key key;
	uint32_t owner, old, id;
	int error;
	int count;

	id = td->td_tid;
	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMUTEX_CONTESTED) == 0) {
		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * there are zero or one threads waiting for it.
	 * Otherwise, it must be marked as contested.
	 */
	old = casuword32(&m->m_owner, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}

static inline struct umtx_pi *
umtx_pi_alloc(int flags)
{
	struct umtx_pi *pi;

	pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags);
	TAILQ_INIT(&pi->pi_blocked);
	atomic_add_int(&umtx_pi_allocated, 1);
	return (pi);
}

static inline void
umtx_pi_free(struct umtx_pi *pi)
{
	uma_zfree(umtx_pi_zone, pi);
	atomic_add_int(&umtx_pi_allocated, -1);
}

/*
 * Adjust the thread's position on a PI mutex's blocked list after
 * its priority has been changed.
 */
static int
umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
{
	struct umtx_q *uq, *uq1, *uq2;
	struct thread *td1;

	mtx_assert(&umtx_lock, MA_OWNED);
	if (pi == NULL)
		return (0);

	uq = td->td_umtxq;

	/*
	 * Check if the thread needs to be moved on the blocked chain.
	 * It needs to be moved if either its priority is lower than
	 * the previous thread or higher than the next thread.
	 */
	uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
	uq2 = TAILQ_NEXT(uq, uq_lockq);
	if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
	    (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
		/*
		 * Remove thread from blocked chain and determine where
		 * it should be moved to.
		 */
		TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
		TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
			td1 = uq1->uq_thread;
			MPASS(td1->td_proc->p_magic == P_MAGIC);
			if (UPRI(td1) > UPRI(td))
				break;
		}

		if (uq1 == NULL)
			TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
		else
			TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
	}
	return (1);
}

/*
 * Propagate priority when a thread is blocked on a POSIX
 * PI mutex.
 */
static void
umtx_propagate_priority(struct thread *td)
{
	struct umtx_q *uq;
	struct umtx_pi *pi;
	int pri;

	mtx_assert(&umtx_lock, MA_OWNED);
	pri = UPRI(td);
	uq = td->td_umtxq;
	pi = uq->uq_pi_blocked;
	if (pi == NULL)
		return;

	for (;;) {
		td = pi->pi_owner;
		if (td == NULL)
			return;

		MPASS(td->td_proc != NULL);
		MPASS(td->td_proc->p_magic == P_MAGIC);

		if (UPRI(td) <= pri)
			return;

		thread_lock(td);
		sched_lend_user_prio(td, pri);
		thread_unlock(td);

		/*
		 * Pick up the lock that td is blocked on.
		 */
		uq = td->td_umtxq;
		pi = uq->uq_pi_blocked;
		/* Resort td on the list if needed. */
		if (!umtx_pi_adjust_thread(pi, td))
			break;
	}
}

/*
 * Unpropagate priority for a PI mutex when a thread blocked on
 * it is interrupted by a signal or resumed by others.
 */
static void
umtx_unpropagate_priority(struct umtx_pi *pi)
{
	struct umtx_q *uq, *uq_owner;
	struct umtx_pi *pi2;
	int pri, oldpri;

	mtx_assert(&umtx_lock, MA_OWNED);

	while (pi != NULL && pi->pi_owner != NULL) {
		pri = PRI_MAX;
		uq_owner = pi->pi_owner->td_umtxq;

		TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
			uq = TAILQ_FIRST(&pi2->pi_blocked);
			if (uq != NULL) {
				if (pri > UPRI(uq->uq_thread))
					pri = UPRI(uq->uq_thread);
			}
		}

		if (pri > uq_owner->uq_inherited_pri)
			pri = uq_owner->uq_inherited_pri;
		thread_lock(pi->pi_owner);
		oldpri = pi->pi_owner->td_user_pri;
		sched_unlend_user_prio(pi->pi_owner, pri);
		thread_unlock(pi->pi_owner);
		umtx_pi_adjust_locked(pi->pi_owner, oldpri);
		pi = uq_owner->uq_pi_blocked;
	}
}
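
/*
 * Illustrative sketch (not compiled): how the recomputation above
 * settles on a priority.  Numbers are hypothetical; smaller is
 * better.  Suppose owner O (base priority 180) holds two contested
 * PI mutexes, M1 and M2:
 *
 *	head of M1->pi_blocked:   waiter at priority 140
 *	head of M2->pi_blocked:   waiter at priority 120
 *	O's uq_inherited_pri:     PRI_MAX (no PP mutex held)
 *
 * The loop takes the minimum over the list heads, so O keeps a lent
 * priority of 120.  If the 120 waiter gives up, the next pass lends
 * 140; once both lists are empty, sched_unlend_user_prio() restores
 * O's own priority.
 */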

/*
 * Insert a PI mutex into the owner's contested list.
 */
static void
umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
{
	struct umtx_q *uq_owner;

	uq_owner = owner->td_umtxq;
	mtx_assert(&umtx_lock, MA_OWNED);
	if (pi->pi_owner != NULL)
		panic("pi_owner != NULL");
	pi->pi_owner = owner;
	TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
}

/*
 * Claim ownership of a PI mutex.
 */
static int
umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
{
	struct umtx_q *uq, *uq_owner;

	uq_owner = owner->td_umtxq;
	mtx_lock_spin(&umtx_lock);
	if (pi->pi_owner == owner) {
		mtx_unlock_spin(&umtx_lock);
		return (0);
	}

	if (pi->pi_owner != NULL) {
		/*
		 * userland may have already messed up the mutex, sigh.
		 */
		mtx_unlock_spin(&umtx_lock);
		return (EPERM);
	}
	umtx_pi_setowner(pi, owner);
	uq = TAILQ_FIRST(&pi->pi_blocked);
	if (uq != NULL) {
		int pri;

		pri = UPRI(uq->uq_thread);
		thread_lock(owner);
		if (pri < UPRI(owner))
			sched_lend_user_prio(owner, pri);
		thread_unlock(owner);
	}
	mtx_unlock_spin(&umtx_lock);
	return (0);
}

static void
umtx_pi_adjust_locked(struct thread *td, u_char oldpri)
{
	struct umtx_q *uq;
	struct umtx_pi *pi;

	uq = td->td_umtxq;
	/*
	 * Pick up the lock that td is blocked on.
	 */
	pi = uq->uq_pi_blocked;
	MPASS(pi != NULL);

	/* Resort the turnstile on the list. */
	if (!umtx_pi_adjust_thread(pi, td))
		return;

	/*
	 * If our priority was lowered and we are at the head of the
	 * turnstile, then propagate our new priority up the chain.
	 */
	if (uq == TAILQ_FIRST(&pi->pi_blocked) && UPRI(td) < oldpri)
		umtx_propagate_priority(td);
}

/*
 * Adjust a thread's position in the blocked list of its PI mutex;
 * this may start a new round of priority propagation.
 */
void
umtx_pi_adjust(struct thread *td, u_char oldpri)
{
	struct umtx_q *uq;
	struct umtx_pi *pi;

	uq = td->td_umtxq;
	mtx_lock_spin(&umtx_lock);
	/*
	 * Pick up the lock that td is blocked on.
	 */
	pi = uq->uq_pi_blocked;
	if (pi != NULL)
		umtx_pi_adjust_locked(td, oldpri);
	mtx_unlock_spin(&umtx_lock);
}

/*
 * Sleep on a PI mutex.
 */
static int
umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi,
	uint32_t owner, const char *wmesg, int timo)
{
	struct umtxq_chain *uc;
	struct thread *td, *td1;
	struct umtx_q *uq1;
	int pri;
	int error = 0;

	td = uq->uq_thread;
	KASSERT(td == curthread, ("inconsistent uq_thread"));
	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	umtxq_insert(uq);
	if (pi->pi_owner == NULL) {
		/* XXX
		 * Currently, we only support process-private PI-mutexes;
		 * non-contended PI-mutexes are locked in userland.
		 * Process-shared PI-mutexes should always be initialized
		 * and registered by the kernel, and locking should always
		 * be done by the kernel to avoid security problems.
		 * For a process-private PI-mutex, we can find the owner
		 * thread and boost its priority safely.
		 */
		PROC_LOCK(curproc);
		td1 = thread_find(curproc, owner);
		mtx_lock_spin(&umtx_lock);
		if (td1 != NULL && pi->pi_owner == NULL) {
			uq1 = td1->td_umtxq;
			umtx_pi_setowner(pi, td1);
		}
		PROC_UNLOCK(curproc);
	} else {
		mtx_lock_spin(&umtx_lock);
	}

	TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
		pri = UPRI(uq1->uq_thread);
		if (pri > UPRI(td))
			break;
	}

	if (uq1 != NULL)
		TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
	else
		TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);

	uq->uq_pi_blocked = pi;
	thread_lock(td);
	td->td_flags |= TDF_UPIBLOCKED;
	thread_unlock(td);
	mtx_unlock_spin(&umtx_lock);
	umtxq_unlock(&uq->uq_key);

	mtx_lock_spin(&umtx_lock);
	umtx_propagate_priority(td);
	mtx_unlock_spin(&umtx_lock);

	umtxq_lock(&uq->uq_key);
	if (uq->uq_flags & UQF_UMTXQ) {
		error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
		if (error == EWOULDBLOCK)
			error = ETIMEDOUT;
		if (uq->uq_flags & UQF_UMTXQ) {
			umtxq_busy(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unbusy(&uq->uq_key);
		}
	}
	umtxq_unlock(&uq->uq_key);

	mtx_lock_spin(&umtx_lock);
	uq->uq_pi_blocked = NULL;
	thread_lock(td);
	td->td_flags &= ~TDF_UPIBLOCKED;
	thread_unlock(td);
	TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
	umtx_unpropagate_priority(pi);
	mtx_unlock_spin(&umtx_lock);

	umtxq_lock(&uq->uq_key);

	return (error);
}

/*
 * Increment the reference count of a PI mutex.
 */
static void
umtx_pi_ref(struct umtx_pi *pi)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&pi->pi_key);
	UMTXQ_LOCKED_ASSERT(uc);
	pi->pi_refcount++;
}

/*
 * Decrement the reference count of a PI mutex; if the count drops
 * to zero, its memory is freed.
 */
static void
umtx_pi_unref(struct umtx_pi *pi)
{
	struct umtxq_chain *uc;
	int free = 0;

	uc = umtxq_getchain(&pi->pi_key);
	UMTXQ_LOCKED_ASSERT(uc);
	KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
	if (--pi->pi_refcount == 0) {
		mtx_lock_spin(&umtx_lock);
		if (pi->pi_owner != NULL) {
			TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested,
				pi, pi_link);
			pi->pi_owner = NULL;
		}
		KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
			("blocked queue not empty"));
		mtx_unlock_spin(&umtx_lock);
		TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
		free = 1;
	}
	if (free)
		umtx_pi_free(pi);
}

/*
 * Find a PI mutex in the hash table.
 */
static struct umtx_pi *
umtx_pi_lookup(struct umtx_key *key)
{
	struct umtxq_chain *uc;
	struct umtx_pi *pi;

	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);

	TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
		if (umtx_key_match(&pi->pi_key, key)) {
			return (pi);
		}
	}
	return (NULL);
}

/*
 * Insert a PI mutex into the hash table.
 */
static inline void
umtx_pi_insert(struct umtx_pi *pi)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&pi->pi_key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
}

/*
 * Lock a PI mutex.
 */
static int
_do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags, int timo,
	int try)
{
	struct umtx_q *uq;
	struct umtx_pi *pi, *new_pi;
	uint32_t id, owner, old;
	int error;

	id = td->td_tid;
	uq = td->td_umtxq;

	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
	    &uq->uq_key)) != 0)
		return (error);
	umtxq_lock(&uq->uq_key);
	pi = umtx_pi_lookup(&uq->uq_key);
	if (pi == NULL) {
		new_pi = umtx_pi_alloc(M_NOWAIT);
		if (new_pi == NULL) {
			umtxq_unlock(&uq->uq_key);
			new_pi = umtx_pi_alloc(M_WAITOK);
			new_pi->pi_key = uq->uq_key;
			umtxq_lock(&uq->uq_key);
			pi = umtx_pi_lookup(&uq->uq_key);
			if (pi != NULL) {
				umtx_pi_free(new_pi);
				new_pi = NULL;
			}
		}
		if (new_pi != NULL) {
			new_pi->pi_key = uq->uq_key;
			umtx_pi_insert(new_pi);
			pi = new_pi;
		}
	}
	umtx_pi_ref(pi);
	umtxq_unlock(&uq->uq_key);

	/*
	 * Care must be exercised when dealing with the umtx structure:
	 * it can fault on any access.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMUTEX_UNOWNED) {
			error = 0;
			break;
		}

		/* The address was invalid. */
		if (owner == -1) {
			error = EFAULT;
			break;
		}

		/* If no one owns it but it is contested try to acquire it. */
		if (owner == UMUTEX_CONTESTED) {
			owner = casuword32(&m->m_owner,
			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

			if (owner == UMUTEX_CONTESTED) {
				umtxq_lock(&uq->uq_key);
				error = umtx_pi_claim(pi, td);
				umtxq_unlock(&uq->uq_key);
				break;
			}

			/* The address was invalid. */
			if (owner == -1) {
				error = EFAULT;
				break;
			}

			/* If this failed the lock has changed, restart. */
			continue;
		}

		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
		    (owner & ~UMUTEX_CONTESTED) == id) {
			error = EDEADLK;
			break;
		}

		if (try != 0) {
			error = EBUSY;
			break;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			break;

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_unbusy(&uq->uq_key);
			umtxq_unlock(&uq->uq_key);
			error = EFAULT;
			break;
		}

		umtxq_lock(&uq->uq_key);
		umtxq_unbusy(&uq->uq_key);
		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race with the
		 * thread unlocking the umtx.
		 */
		if (old == owner)
			error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED,
				 "umtxpi", timo);
		umtxq_unlock(&uq->uq_key);
	}

	umtxq_lock(&uq->uq_key);
	umtx_pi_unref(pi);
	umtxq_unlock(&uq->uq_key);

	umtx_key_release(&uq->uq_key);
	return (error);
}

/*
 * Unlock a PI mutex.
 */
static int
do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags)
{
	struct umtx_key key;
	struct umtx_q *uq_first, *uq_first2, *uq_me;
	struct umtx_pi *pi, *pi2;
	uint32_t owner, old, id;
	int error;
	int count;
	int pri;

	id = td->td_tid;
	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMUTEX_CONTESTED) == 0) {
		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count_pi(&key, &uq_first);
	if (uq_first != NULL) {
		pi = uq_first->uq_pi_blocked;
		if (pi->pi_owner != curthread) {
			umtxq_unbusy(&key);
			umtxq_unlock(&key);
			/* userland messed up the mutex */
			return (EPERM);
		}
		uq_me = curthread->td_umtxq;
		mtx_lock_spin(&umtx_lock);
		pi->pi_owner = NULL;
		TAILQ_REMOVE(&uq_me->uq_pi_contested, pi, pi_link);
		uq_first = TAILQ_FIRST(&pi->pi_blocked);
		pri = PRI_MAX;
		TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
			uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
			if (uq_first2 != NULL) {
				if (pri > UPRI(uq_first2->uq_thread))
					pri = UPRI(uq_first2->uq_thread);
			}
		}
		thread_lock(curthread);
		sched_unlend_user_prio(curthread, pri);
		thread_unlock(curthread);
		mtx_unlock_spin(&umtx_lock);
	}
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * there are zero or one threads waiting for it.
	 * Otherwise, it must be marked as contested.
	 */
	old = casuword32(&m->m_owner, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);

	umtxq_lock(&key);
	if (uq_first != NULL)
		umtxq_signal_thread(uq_first);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}

/*
 * Lock a PP mutex.
 */
static int
_do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags, int timo,
	int try)
{
	struct umtx_q *uq, *uq2;
	struct umtx_pi *pi;
	uint32_t ceiling;
	uint32_t owner, id;
	int error, pri, old_inherited_pri, su;

	id = td->td_tid;
	uq = td->td_umtxq;
	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
	    &uq->uq_key)) != 0)
		return (error);
	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
	for (;;) {
		old_inherited_pri = uq->uq_inherited_pri;
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		ceiling = RTP_PRIO_MAX - fuword32(&m->m_ceilings[0]);
		if (ceiling > RTP_PRIO_MAX) {
			error = EINVAL;
			goto out;
		}

		mtx_lock_spin(&umtx_lock);
		if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
			mtx_unlock_spin(&umtx_lock);
			error = EINVAL;
			goto out;
		}
		if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
			uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
			thread_lock(td);
			if (uq->uq_inherited_pri < UPRI(td))
				sched_lend_user_prio(td, uq->uq_inherited_pri);
			thread_unlock(td);
		}
		mtx_unlock_spin(&umtx_lock);

		owner = casuword32(&m->m_owner,
		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

		if (owner == UMUTEX_CONTESTED) {
			error = 0;
			break;
		}

		/* The address was invalid. */
		if (owner == -1) {
			error = EFAULT;
			break;
		}

		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
		    (owner & ~UMUTEX_CONTESTED) == id) {
			error = EDEADLK;
			break;
		}

		if (try != 0) {
			error = EBUSY;
			break;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			break;

		umtxq_lock(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		error = umtxq_sleep(uq, "umtxpp", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);

		mtx_lock_spin(&umtx_lock);
		uq->uq_inherited_pri = old_inherited_pri;
		pri = PRI_MAX;
		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
			uq2 = TAILQ_FIRST(&pi->pi_blocked);
			if (uq2 != NULL) {
				if (pri > UPRI(uq2->uq_thread))
					pri = UPRI(uq2->uq_thread);
			}
		}
		if (pri > uq->uq_inherited_pri)
			pri = uq->uq_inherited_pri;
		thread_lock(td);
		sched_unlend_user_prio(td, pri);
		thread_unlock(td);
		mtx_unlock_spin(&umtx_lock);
	}

	if (error != 0) {
		mtx_lock_spin(&umtx_lock);
		uq->uq_inherited_pri = old_inherited_pri;
		pri = PRI_MAX;
		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
			uq2 = TAILQ_FIRST(&pi->pi_blocked);
			if (uq2 != NULL) {
				if (pri > UPRI(uq2->uq_thread))
					pri = UPRI(uq2->uq_thread);
			}
		}
		if (pri > uq->uq_inherited_pri)
			pri = uq->uq_inherited_pri;
		thread_lock(td);
		sched_unlend_user_prio(td, pri);
		thread_unlock(td);
		mtx_unlock_spin(&umtx_lock);
	}

out:
	umtxq_lock(&uq->uq_key);
	umtxq_unbusy(&uq->uq_key);
	umtxq_unlock(&uq->uq_key);
	umtx_key_release(&uq->uq_key);
	return (error);
}

/*
 * Unlock a PP mutex.
 */
static int
do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags)
{
	struct umtx_key key;
	struct umtx_q *uq, *uq2;
	struct umtx_pi *pi;
	uint32_t owner, id;
	uint32_t rceiling;
	int error, pri, new_inherited_pri, su;

	id = td->td_tid;
	uq = td->td_umtxq;
	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);

	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
	if (error != 0)
		return (error);

	if (rceiling == -1)
		new_inherited_pri = PRI_MAX;
	else {
		rceiling = RTP_PRIO_MAX - rceiling;
		if (rceiling > RTP_PRIO_MAX)
			return (EINVAL);
		new_inherited_pri = PRI_MIN_REALTIME + rceiling;
	}

	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);
	umtxq_lock(&key);
	umtxq_busy(&key);
	umtxq_unlock(&key);
	/*
	 * For a priority-protected mutex, always set the unlocked state
	 * to UMUTEX_CONTESTED so that userland always enters the kernel
	 * to lock the mutex.  This is necessary because thread priority
	 * has to be adjusted for such mutexes.
	 */
	error = suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
		UMUTEX_CONTESTED);

	umtxq_lock(&key);
	if (error == 0)
		umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);

	if (error == -1)
		error = EFAULT;
	else {
		mtx_lock_spin(&umtx_lock);
		if (su != 0)
			uq->uq_inherited_pri = new_inherited_pri;
		pri = PRI_MAX;
		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
			uq2 = TAILQ_FIRST(&pi->pi_blocked);
			if (uq2 != NULL) {
				if (pri > UPRI(uq2->uq_thread))
					pri = UPRI(uq2->uq_thread);
			}
		}
		if (pri > uq->uq_inherited_pri)
			pri = uq->uq_inherited_pri;
		thread_lock(td);
		sched_unlend_user_prio(td, pri);
		thread_unlock(td);
		mtx_unlock_spin(&umtx_lock);
	}
	umtx_key_release(&key);
	return (error);
}

static int
do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
	uint32_t *old_ceiling)
{
	struct umtx_q *uq;
	uint32_t save_ceiling;
	uint32_t owner, id;
	uint32_t flags;
	int error;

	flags = fuword32(&m->m_flags);
	if ((flags & UMUTEX_PRIO_PROTECT) == 0)
		return (EINVAL);
	if (ceiling > RTP_PRIO_MAX)
		return (EINVAL);
	id = td->td_tid;
	uq = td->td_umtxq;
	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
	   &uq->uq_key)) != 0)
		return (error);
	for (;;) {
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		save_ceiling = fuword32(&m->m_ceilings[0]);

		owner = casuword32(&m->m_owner,
		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

		if (owner == UMUTEX_CONTESTED) {
			suword32(&m->m_ceilings[0], ceiling);
			suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
				UMUTEX_CONTESTED);
			error = 0;
			break;
		}

		/* The address was invalid. */
		if (owner == -1) {
			error = EFAULT;
			break;
		}

		if ((owner & ~UMUTEX_CONTESTED) == id) {
			suword32(&m->m_ceilings[0], ceiling);
			error = 0;
			break;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			break;

		/*
		 * The mutex is held by someone else; sleep and then
		 * retry the whole operation.
		 */
		umtxq_lock(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		error = umtxq_sleep(uq, "umtxpp", 0);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	}
	umtxq_lock(&uq->uq_key);
	if (error == 0)
		umtxq_signal(&uq->uq_key, INT_MAX);
	umtxq_unbusy(&uq->uq_key);
	umtxq_unlock(&uq->uq_key);
	umtx_key_release(&uq->uq_key);
	if (error == 0 && old_ceiling != NULL)
		suword32(old_ceiling, save_ceiling);
	return (error);
}
2140
2141static int
2142_do_lock_umutex(struct thread *td, struct umutex *m, int flags, int timo,
2143	int try)
2144{
2145	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2146	case 0:
2147		return (_do_lock_normal(td, m, flags, timo, try));
2148	case UMUTEX_PRIO_INHERIT:
2149		return (_do_lock_pi(td, m, flags, timo, try));
2150	case UMUTEX_PRIO_PROTECT:
2151		return (_do_lock_pp(td, m, flags, timo, try));
2152	}
2153	return (EINVAL);
2154}
2155
2156/*
2157 * Lock a userland POSIX mutex.
2158 */
2159static int
2160do_lock_umutex(struct thread *td, struct umutex *m,
2161	struct timespec *timeout, int try)
2162{
2163	struct timespec ts, ts2, ts3;
2164	struct timeval tv;
2165	uint32_t flags;
2166	int error;
2167
2168	flags = fuword32(&m->m_flags);
2169	if (flags == -1)
2170		return (EFAULT);
2171
2172	if (timeout == NULL) {
2173		error = _do_lock_umutex(td, m, flags, 0, try);
2174		/* Mutex locking is restarted if it is interrupted. */
2175		if (error == EINTR)
2176			error = ERESTART;
2177	} else {
2178		getnanouptime(&ts);
2179		timespecadd(&ts, timeout);
2180		TIMESPEC_TO_TIMEVAL(&tv, timeout);
2181		for (;;) {
2182			error = _do_lock_umutex(td, m, flags, tvtohz(&tv), try);
2183			if (error != ETIMEDOUT)
2184				break;
2185			getnanouptime(&ts2);
2186			if (timespeccmp(&ts2, &ts, >=)) {
2187				error = ETIMEDOUT;
2188				break;
2189			}
2190			ts3 = ts;
2191			timespecsub(&ts3, &ts2);
2192			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
2193		}
2194		/* Timed-locking is not restarted. */
2195		if (error == ERESTART)
2196			error = EINTR;
2197	}
2198	return (error);
2199}
2200
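/*
 * Timeout sketch (pseudo-C, added for clarity): the loop above turns
 * the relative timeout into an absolute deadline on the uptime clock
 * and re-arms the sleep with the remaining time after every ETIMEDOUT:
 *
 *	deadline = getnanouptime() + timeout;
 *	for (;;) {
 *		error = lock attempt bounded by tvtohz(remaining);
 *		if (error != ETIMEDOUT)
 *			break;
 *		if (now >= deadline)
 *			return (ETIMEDOUT);
 *		remaining = deadline - now;
 *	}
 */
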
2201/*
2202 * Unlock a userland POSIX mutex.
2203 */
2204static int
2205do_unlock_umutex(struct thread *td, struct umutex *m)
2206{
2207	uint32_t flags;
2208
2209	flags = fuword32(&m->m_flags);
2210	if (flags == -1)
2211		return (EFAULT);
2212
2213	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2214	case 0:
2215		return (do_unlock_normal(td, m, flags));
2216	case UMUTEX_PRIO_INHERIT:
2217		return (do_unlock_pi(td, m, flags));
2218	case UMUTEX_PRIO_PROTECT:
2219		return (do_unlock_pp(td, m, flags));
2220	}
2221
2222	return (EINVAL);
2223}
2224
2225static int
2226do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m,
2227	struct timespec *timeout, u_long wflags)
2228{
2229	struct umtx_q *uq;
2230	struct timeval tv;
2231	struct timespec cts, ets, tts;
2232	uint32_t flags;
2233	int error;
2234
2235	uq = td->td_umtxq;
2236	flags = fuword32(&cv->c_flags);
2237	error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key);
2238	if (error != 0)
2239		return (error);
2240	umtxq_lock(&uq->uq_key);
2241	umtxq_busy(&uq->uq_key);
2242	umtxq_insert(uq);
2243	umtxq_unlock(&uq->uq_key);
2244
2245	/*
2246	 * Crucially, c_has_waiters must be set to 1 before the user
2247	 * mutex is released, so a signalling thread cannot miss this waiter.
2248	 */
2249	suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 1);
2250
2251	umtxq_lock(&uq->uq_key);
2252	umtxq_unbusy(&uq->uq_key);
2253	umtxq_unlock(&uq->uq_key);
2254
2255	error = do_unlock_umutex(td, m);
2256
2257	umtxq_lock(&uq->uq_key);
2258	if (error == 0) {
2259		if ((wflags & UMTX_CHECK_UNPARKING) &&
2260		    (td->td_pflags & TDP_WAKEUP)) {
2261			td->td_pflags &= ~TDP_WAKEUP;
2262			error = EINTR;
2263		} else if (timeout == NULL) {
2264			error = umtxq_sleep(uq, "ucond", 0);
2265		} else {
2266			getnanouptime(&ets);
2267			timespecadd(&ets, timeout);
2268			TIMESPEC_TO_TIMEVAL(&tv, timeout);
2269			for (;;) {
2270				error = umtxq_sleep(uq, "ucond", tvtohz(&tv));
2271				if (error != ETIMEDOUT)
2272					break;
2273				getnanouptime(&cts);
2274				if (timespeccmp(&cts, &ets, >=)) {
2275					error = ETIMEDOUT;
2276					break;
2277				}
2278				tts = ets;
2279				timespecsub(&tts, &cts);
2280				TIMESPEC_TO_TIMEVAL(&tv, &tts);
2281			}
2282		}
2283	}
2284
2285	if (error != 0) {
2286		if ((uq->uq_flags & UQF_UMTXQ) == 0) {
2287			/*
2288			 * If we were concurrently woken by do_cv_signal()
2289			 * but are returning due to an error, a UNIX signal,
2290			 * or a timeout, perform another umtxq_signal() so
2291			 * that the wakeup is not consumed. This may cause
2292			 * a spurious wakeup for another thread which was
2293			 * just queued, but SUSv3 explicitly allows spurious
2294			 * wakeups to occur, and indeed a kernel-based
2295			 * implementation cannot avoid them.
2296			 */
2297			if (!umtxq_signal(&uq->uq_key, 1))
2298				error = 0;
2299		}
2300		if (error == ERESTART)
2301			error = EINTR;
2302	}
2303	umtxq_remove(uq);
2304	umtxq_unlock(&uq->uq_key);
2305	umtx_key_release(&uq->uq_key);
2306	return (error);
2307}
2308
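/*
 * Usage sketch (hypothetical userland call, arguments as unpacked by
 * __umtx_op_cv_wait() below): the caller must hold "m"; do_cv_wait()
 * queues the thread and only then unlocks the mutex, so a signal sent
 * after the unlock cannot be lost.
 *
 *	// obj = cv, val = wflags, uaddr1 = m, uaddr2 = timeout or NULL
 *	_umtx_op(cv, UMTX_OP_CV_WAIT, 0, m, NULL);
 */
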
2309/*
2310 * Signal a userland condition variable.
2311 */
2312static int
2313do_cv_signal(struct thread *td, struct ucond *cv)
2314{
2315	struct umtx_key key;
2316	int error, cnt, nwake;
2317	uint32_t flags;
2318
2319	flags = fuword32(&cv->c_flags);
2320	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2321		return (error);
2322	umtxq_lock(&key);
2323	umtxq_busy(&key);
2324	cnt = umtxq_count(&key);
2325	nwake = umtxq_signal(&key, 1);
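	/*
	 * Added note: if this single wakeup emptied the queue
	 * (cnt <= nwake), no waiters remain, so c_has_waiters is
	 * cleared again for the benefit of the userland fast path.
	 */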
2326	if (cnt <= nwake) {
2327		umtxq_unlock(&key);
2328		error = suword32(
2329		    __DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
2330		umtxq_lock(&key);
2331	}
2332	umtxq_unbusy(&key);
2333	umtxq_unlock(&key);
2334	umtx_key_release(&key);
2335	return (error);
2336}
2337
2338static int
2339do_cv_broadcast(struct thread *td, struct ucond *cv)
2340{
2341	struct umtx_key key;
2342	int error;
2343	uint32_t flags;
2344
2345	flags = fuword32(&cv->c_flags);
2346	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2347		return (error);
2348
2349	umtxq_lock(&key);
2350	umtxq_busy(&key);
2351	umtxq_signal(&key, INT_MAX);
2352	umtxq_unlock(&key);
2353
2354	error = suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
2355
2356	umtxq_lock(&key);
2357	umtxq_unbusy(&key);
2358	umtxq_unlock(&key);
2359
2360	umtx_key_release(&key);
2361	return (error);
2362}
2363
2364static int
2365do_rw_rdlock(struct thread *td, struct urwlock *rwlock, long fflag, int timo)
2366{
2367	struct umtx_q *uq;
2368	uint32_t flags, wrflags;
2369	int32_t state, oldstate;
2370	int32_t blocked_readers;
2371	int error;
2372
2373	uq = td->td_umtxq;
2374	flags = fuword32(&rwlock->rw_flags);
2375	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2376	if (error != 0)
2377		return (error);
2378
2379	wrflags = URWLOCK_WRITE_OWNER;
2380	if (!(fflag & URWLOCK_PREFER_READER) && !(flags & URWLOCK_PREFER_READER))
2381		wrflags |= URWLOCK_WRITE_WAITERS;
2382
2383	for (;;) {
2384		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2385		/* try to lock it */
2386		while (!(state & wrflags)) {
2387			if (__predict_false(URWLOCK_READER_COUNT(state) == URWLOCK_MAX_READERS)) {
2388				umtx_key_release(&uq->uq_key);
2389				return (EAGAIN);
2390			}
2391			oldstate = casuword32(&rwlock->rw_state, state, state + 1);
2392			if (oldstate == state) {
2393				umtx_key_release(&uq->uq_key);
2394				return (0);
2395			}
2396			state = oldstate;
2397		}
2398
2399		if (error)
2400			break;
2401
2402		/* grab monitor lock */
2403		umtxq_lock(&uq->uq_key);
2404		umtxq_busy(&uq->uq_key);
2405		umtxq_unlock(&uq->uq_key);
2406
2407		/* set read contention bit */
2408		while ((state & wrflags) && !(state & URWLOCK_READ_WAITERS)) {
2409			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_READ_WAITERS);
2410			if (oldstate == state)
2411				goto sleep;
2412			state = oldstate;
2413		}
2414
2415		/* The state changed while setting the waiters flag; restart. */
2416		if (!(state & wrflags)) {
2417			umtxq_lock(&uq->uq_key);
2418			umtxq_unbusy(&uq->uq_key);
2419			umtxq_unlock(&uq->uq_key);
2420			continue;
2421		}
2422
2423sleep:
2424		/* The contention bit is set; increase the read-waiter count before sleeping. */
2425		blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2426		suword32(&rwlock->rw_blocked_readers, blocked_readers+1);
2427
2428		while (state & wrflags) {
2429			umtxq_lock(&uq->uq_key);
2430			umtxq_insert(uq);
2431			umtxq_unbusy(&uq->uq_key);
2432
2433			error = umtxq_sleep(uq, "urdlck", timo);
2434
2435			umtxq_busy(&uq->uq_key);
2436			umtxq_remove(uq);
2437			umtxq_unlock(&uq->uq_key);
2438			if (error)
2439				break;
2440			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2441		}
2442
2443		/* Decrease the read-waiter count; the last waiter clears the contention bit. */
2444		blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2445		suword32(&rwlock->rw_blocked_readers, blocked_readers-1);
2446		if (blocked_readers == 1) {
2447			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2448			for (;;) {
2449				oldstate = casuword32(&rwlock->rw_state, state,
2450					 state & ~URWLOCK_READ_WAITERS);
2451				if (oldstate == state)
2452					break;
2453				state = oldstate;
2454			}
2455		}
2456
2457		umtxq_lock(&uq->uq_key);
2458		umtxq_unbusy(&uq->uq_key);
2459		umtxq_unlock(&uq->uq_key);
2460	}
2461	umtx_key_release(&uq->uq_key);
2462	return (error);
2463}
2464
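/*
 * State-word sketch (added): rw_state packs the reader count in its
 * low bits (hence "state + 1" adds a reader, bounded by
 * URWLOCK_MAX_READERS) alongside the URWLOCK_WRITE_OWNER,
 * URWLOCK_WRITE_WAITERS and URWLOCK_READ_WAITERS flag bits.  The
 * uncontended read-lock fast path in do_rw_rdlock() above is a single
 * CAS; roughly, with "cas" standing for casuword32() and the overflow
 * check omitted:
 *
 *	while ((state & wrflags) == 0) {
 *		oldstate = cas(&rw_state, state, state + 1);
 *		if (oldstate == state)
 *			return (0);		// read lock acquired
 *		state = oldstate;		// lost the race; retry
 *	}
 */
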
2465static int
2466do_rw_rdlock2(struct thread *td, void *obj, long val, struct timespec *timeout)
2467{
2468	struct timespec ts, ts2, ts3;
2469	struct timeval tv;
2470	int error;
2471
2472	getnanouptime(&ts);
2473	timespecadd(&ts, timeout);
2474	TIMESPEC_TO_TIMEVAL(&tv, timeout);
2475	for (;;) {
2476		error = do_rw_rdlock(td, obj, val, tvtohz(&tv));
2477		if (error != ETIMEDOUT)
2478			break;
2479		getnanouptime(&ts2);
2480		if (timespeccmp(&ts2, &ts, >=)) {
2481			error = ETIMEDOUT;
2482			break;
2483		}
2484		ts3 = ts;
2485		timespecsub(&ts3, &ts2);
2486		TIMESPEC_TO_TIMEVAL(&tv, &ts3);
2487	}
2488	if (error == ERESTART)
2489		error = EINTR;
2490	return (error);
2491}
2492
2493static int
2494do_rw_wrlock(struct thread *td, struct urwlock *rwlock, int timo)
2495{
2496	struct umtx_q *uq;
2497	uint32_t flags;
2498	int32_t state, oldstate;
2499	int32_t blocked_writers;
2500	int error;
2501
2502	uq = td->td_umtxq;
2503	flags = fuword32(&rwlock->rw_flags);
2504	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2505	if (error != 0)
2506		return (error);
2507
2508	for (;;) {
2509		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2510		while (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2511			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_OWNER);
2512			if (oldstate == state) {
2513				umtx_key_release(&uq->uq_key);
2514				return (0);
2515			}
2516			state = oldstate;
2517		}
2518
2519		if (error)
2520			break;
2521
2522		/* grab monitor lock */
2523		umtxq_lock(&uq->uq_key);
2524		umtxq_busy(&uq->uq_key);
2525		umtxq_unlock(&uq->uq_key);
2526
2527		while (((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) &&
2528		       (state & URWLOCK_WRITE_WAITERS) == 0) {
2529			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_WAITERS);
2530			if (oldstate == state)
2531				goto sleep;
2532			state = oldstate;
2533		}
2534
2535		if (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2536			umtxq_lock(&uq->uq_key);
2537			umtxq_unbusy(&uq->uq_key);
2538			umtxq_unlock(&uq->uq_key);
2539			continue;
2540		}
2541sleep:
2542		blocked_writers = fuword32(&rwlock->rw_blocked_writers);
2543		suword32(&rwlock->rw_blocked_writers, blocked_writers+1);
2544
2545		while ((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) {
2546			umtxq_lock(&uq->uq_key);
2547			umtxq_insert_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2548			umtxq_unbusy(&uq->uq_key);
2549
2550			error = umtxq_sleep(uq, "uwrlck", timo);
2551
2552			umtxq_busy(&uq->uq_key);
2553			umtxq_remove_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2554			umtxq_unlock(&uq->uq_key);
2555			if (error)
2556				break;
2557			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2558		}
2559
2560		blocked_writers = fuword32(&rwlock->rw_blocked_writers);
2561		suword32(&rwlock->rw_blocked_writers, blocked_writers-1);
2562		if (blocked_writers == 1) {
2563			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2564			for (;;) {
2565				oldstate = casuword32(&rwlock->rw_state, state,
2566					 state & ~URWLOCK_WRITE_WAITERS);
2567				if (oldstate == state)
2568					break;
2569				state = oldstate;
2570			}
2571		}
2572
2573		umtxq_lock(&uq->uq_key);
2574		umtxq_unbusy(&uq->uq_key);
2575		umtxq_unlock(&uq->uq_key);
2576	}
2577
2578	umtx_key_release(&uq->uq_key);
2579	return (error);
2580}
2581
2582static int
2583do_rw_wrlock2(struct thread *td, void *obj, struct timespec *timeout)
2584{
2585	struct timespec ts, ts2, ts3;
2586	struct timeval tv;
2587	int error;
2588
2589	getnanouptime(&ts);
2590	timespecadd(&ts, timeout);
2591	TIMESPEC_TO_TIMEVAL(&tv, timeout);
2592	for (;;) {
2593		error = do_rw_wrlock(td, obj, tvtohz(&tv));
2594		if (error != ETIMEDOUT)
2595			break;
2596		getnanouptime(&ts2);
2597		if (timespeccmp(&ts2, &ts, >=)) {
2598			error = ETIMEDOUT;
2599			break;
2600		}
2601		ts3 = ts;
2602		timespecsub(&ts3, &ts2);
2603		TIMESPEC_TO_TIMEVAL(&tv, &ts3);
2604	}
2605	if (error == ERESTART)
2606		error = EINTR;
2607	return (error);
2608}
2609
2610static int
2611do_rw_unlock(struct thread *td, struct urwlock *rwlock)
2612{
2613	struct umtx_q *uq;
2614	uint32_t flags;
2615	int32_t state, oldstate;
2616	int error, q, count;
2617
2618	uq = td->td_umtxq;
2619	flags = fuword32(&rwlock->rw_flags);
2620	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2621	if (error != 0)
2622		return (error);
2623
2624	state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2625	if (state & URWLOCK_WRITE_OWNER) {
2626		for (;;) {
2627			oldstate = casuword32(&rwlock->rw_state, state,
2628				state & ~URWLOCK_WRITE_OWNER);
2629			if (oldstate != state) {
2630				state = oldstate;
2631				if (!(oldstate & URWLOCK_WRITE_OWNER)) {
2632					error = EPERM;
2633					goto out;
2634				}
2635			} else
2636				break;
2637		}
2638	} else if (URWLOCK_READER_COUNT(state) != 0) {
2639		for (;;) {
2640			oldstate = casuword32(&rwlock->rw_state, state,
2641				state - 1);
2642			if (oldstate != state) {
2643				state = oldstate;
2644				if (URWLOCK_READER_COUNT(oldstate) == 0) {
2645					error = EPERM;
2646					goto out;
2647				}
2648			}
2649			else
2650				break;
2651		}
2652	} else {
2653		error = EPERM;
2654		goto out;
2655	}
2656
2657	count = 0;
2658
2659	if (!(flags & URWLOCK_PREFER_READER)) {
2660		if (state & URWLOCK_WRITE_WAITERS) {
2661			count = 1;
2662			q = UMTX_EXCLUSIVE_QUEUE;
2663		} else if (state & URWLOCK_READ_WAITERS) {
2664			count = INT_MAX;
2665			q = UMTX_SHARED_QUEUE;
2666		}
2667	} else {
2668		if (state & URWLOCK_READ_WAITERS) {
2669			count = INT_MAX;
2670			q = UMTX_SHARED_QUEUE;
2671		} else if (state & URWLOCK_WRITE_WAITERS) {
2672			count = 1;
2673			q = UMTX_EXCLUSIVE_QUEUE;
2674		}
2675	}
2676
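	/*
	 * Added note: by default a queued writer is preferred (wake one
	 * thread from the exclusive queue); with URWLOCK_PREFER_READER
	 * all queued readers are woken first instead.
	 */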
2677	if (count) {
2678		umtxq_lock(&uq->uq_key);
2679		umtxq_busy(&uq->uq_key);
2680		umtxq_signal_queue(&uq->uq_key, count, q);
2681		umtxq_unbusy(&uq->uq_key);
2682		umtxq_unlock(&uq->uq_key);
2683	}
2684out:
2685	umtx_key_release(&uq->uq_key);
2686	return (error);
2687}
2688
2689int
2690_umtx_lock(struct thread *td, struct _umtx_lock_args *uap)
2691    /* struct umtx *umtx */
2692{
2693	return _do_lock_umtx(td, uap->umtx, td->td_tid, 0);
2694}
2695
2696int
2697_umtx_unlock(struct thread *td, struct _umtx_unlock_args *uap)
2698    /* struct umtx *umtx */
2699{
2700	return do_unlock_umtx(td, uap->umtx, td->td_tid);
2701}
2702
2703static int
2704__umtx_op_lock_umtx(struct thread *td, struct _umtx_op_args *uap)
2705{
2706	struct timespec *ts, timeout;
2707	int error;
2708
2709	/* Allow a null timespec (wait forever). */
2710	if (uap->uaddr2 == NULL)
2711		ts = NULL;
2712	else {
2713		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2714		if (error != 0)
2715			return (error);
2716		if (timeout.tv_nsec >= 1000000000 ||
2717		    timeout.tv_nsec < 0) {
2718			return (EINVAL);
2719		}
2720		ts = &timeout;
2721	}
2722	return (do_lock_umtx(td, uap->obj, uap->val, ts));
2723}
2724
2725static int
2726__umtx_op_unlock_umtx(struct thread *td, struct _umtx_op_args *uap)
2727{
2728	return (do_unlock_umtx(td, uap->obj, uap->val));
2729}
2730
2731static int
2732__umtx_op_wait(struct thread *td, struct _umtx_op_args *uap)
2733{
2734	struct timespec *ts, timeout;
2735	int error;
2736
2737	if (uap->uaddr2 == NULL)
2738		ts = NULL;
2739	else {
2740		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2741		if (error != 0)
2742			return (error);
2743		if (timeout.tv_nsec >= 1000000000 ||
2744		    timeout.tv_nsec < 0)
2745			return (EINVAL);
2746		ts = &timeout;
2747	}
2748	return do_wait(td, uap->obj, uap->val, ts, 0);
2749}
2750
2751static int
2752__umtx_op_wait_uint(struct thread *td, struct _umtx_op_args *uap)
2753{
2754	struct timespec *ts, timeout;
2755	int error;
2756
2757	if (uap->uaddr2 == NULL)
2758		ts = NULL;
2759	else {
2760		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2761		if (error != 0)
2762			return (error);
2763		if (timeout.tv_nsec >= 1000000000 ||
2764		    timeout.tv_nsec < 0)
2765			return (EINVAL);
2766		ts = &timeout;
2767	}
2768	return do_wait(td, uap->obj, uap->val, ts, 1);
2769}
2770
2771static int
2772__umtx_op_wake(struct thread *td, struct _umtx_op_args *uap)
2773{
2774	return (kern_umtx_wake(td, uap->obj, uap->val));
2775}
2776
2777static int
2778__umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap)
2779{
2780	struct timespec *ts, timeout;
2781	int error;
2782
2783	/* Allow a null timespec (wait forever). */
2784	if (uap->uaddr2 == NULL)
2785		ts = NULL;
2786	else {
2787		error = copyin(uap->uaddr2, &timeout,
2788		    sizeof(timeout));
2789		if (error != 0)
2790			return (error);
2791		if (timeout.tv_nsec >= 1000000000 ||
2792		    timeout.tv_nsec < 0) {
2793			return (EINVAL);
2794		}
2795		ts = &timeout;
2796	}
2797	return do_lock_umutex(td, uap->obj, ts, 0);
2798}
2799
2800static int
2801__umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap)
2802{
2803	return do_lock_umutex(td, uap->obj, NULL, 1);
2804}
2805
2806static int
2807__umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap)
2808{
2809	return do_unlock_umutex(td, uap->obj);
2810}
2811
2812static int
2813__umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap)
2814{
2815	return do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1);
2816}
2817
2818static int
2819__umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap)
2820{
2821	struct timespec *ts, timeout;
2822	int error;
2823
2824	/* Allow a null timespec (wait forever). */
2825	if (uap->uaddr2 == NULL)
2826		ts = NULL;
2827	else {
2828		error = copyin(uap->uaddr2, &timeout,
2829		    sizeof(timeout));
2830		if (error != 0)
2831			return (error);
2832		if (timeout.tv_nsec >= 1000000000 ||
2833		    timeout.tv_nsec < 0) {
2834			return (EINVAL);
2835		}
2836		ts = &timeout;
2837	}
2838	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
2839}
2840
2841static int
2842__umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap)
2843{
2844	return do_cv_signal(td, uap->obj);
2845}
2846
2847static int
2848__umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap)
2849{
2850	return do_cv_broadcast(td, uap->obj);
2851}
2852
2853static int
2854__umtx_op_rw_rdlock(struct thread *td, struct _umtx_op_args *uap)
2855{
2856	struct timespec timeout;
2857	int error;
2858
2859	/* Allow a null timespec (wait forever). */
2860	if (uap->uaddr2 == NULL) {
2861		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
2862	} else {
2863		error = copyin(uap->uaddr2, &timeout,
2864		    sizeof(timeout));
2865		if (error != 0)
2866			return (error);
2867		if (timeout.tv_nsec >= 1000000000 ||
2868		    timeout.tv_nsec < 0) {
2869			return (EINVAL);
2870		}
2871		error = do_rw_rdlock2(td, uap->obj, uap->val, &timeout);
2872	}
2873	return (error);
2874}
2875
2876static int
2877__umtx_op_rw_wrlock(struct thread *td, struct _umtx_op_args *uap)
2878{
2879	struct timespec timeout;
2880	int error;
2881
2882	/* Allow a null timespec (wait forever). */
2883	if (uap->uaddr2 == NULL) {
2884		error = do_rw_wrlock(td, uap->obj, 0);
2885	} else {
2886		error = copyin(uap->uaddr2, &timeout,
2887		    sizeof(timeout));
2888		if (error != 0)
2889			return (error);
2890		if (timeout.tv_nsec >= 1000000000 ||
2891		    timeout.tv_nsec < 0) {
2892			return (EINVAL);
2893		}
2894
2895		error = do_rw_wrlock2(td, uap->obj, &timeout);
2896	}
2897	return (error);
2898}
2899
2900static int
2901__umtx_op_rw_unlock(struct thread *td, struct _umtx_op_args *uap)
2902{
2903	return do_rw_unlock(td, uap->obj);
2904}
2905
2906typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap);
2907
2908static _umtx_op_func op_table[] = {
2909	__umtx_op_lock_umtx,		/* UMTX_OP_LOCK */
2910	__umtx_op_unlock_umtx,		/* UMTX_OP_UNLOCK */
2911	__umtx_op_wait,			/* UMTX_OP_WAIT */
2912	__umtx_op_wake,			/* UMTX_OP_WAKE */
2913	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
2914	__umtx_op_lock_umutex,		/* UMTX_OP_MUTEX_LOCK */
2915	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
2916	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
2917	__umtx_op_cv_wait,		/* UMTX_OP_CV_WAIT */
2918	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
2919	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
2920	__umtx_op_wait_uint,		/* UMTX_OP_WAIT_UINT */
2921	__umtx_op_rw_rdlock,		/* UMTX_OP_RW_RDLOCK */
2922	__umtx_op_rw_wrlock,		/* UMTX_OP_RW_WRLOCK */
2923	__umtx_op_rw_unlock		/* UMTX_OP_RW_UNLOCK */
2924};
2925
2926int
2927_umtx_op(struct thread *td, struct _umtx_op_args *uap)
2928{
2929	if ((unsigned)uap->op < UMTX_OP_MAX)
2930		return (*op_table[uap->op])(td, uap);
2931	return (EINVAL);
2932}
2933
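/*
 * Dispatch sketch (illustrative): every handler in op_table[] is
 * reached through the single _umtx_op(2) system call shown above.
 * Assuming the userland prototype declared in <sys/umtx.h>, waking one
 * thread blocked in __umtx_op_wait() on a hypothetical "word" would
 * look like:
 *
 *	#include <sys/umtx.h>
 *	_umtx_op(&word, UMTX_OP_WAKE, 1, NULL, NULL);	// val = count
 */
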
2934#ifdef COMPAT_IA32
2935int
2936freebsd32_umtx_lock(struct thread *td, struct freebsd32_umtx_lock_args *uap)
2937    /* struct umtx *umtx */
2938{
2939	return (do_lock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid, NULL));
2940}
2941
2942int
2943freebsd32_umtx_unlock(struct thread *td, struct freebsd32_umtx_unlock_args *uap)
2944    /* struct umtx *umtx */
2945{
2946	return (do_unlock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid));
2947}
2948
2949struct timespec32 {
2950	u_int32_t tv_sec;
2951	u_int32_t tv_nsec;
2952};
2953
2954static inline int
2955copyin_timeout32(void *addr, struct timespec *tsp)
2956{
2957	struct timespec32 ts32;
2958	int error;
2959
2960	error = copyin(addr, &ts32, sizeof(struct timespec32));
2961	if (error == 0) {
2962		tsp->tv_sec = ts32.tv_sec;
2963		tsp->tv_nsec = ts32.tv_nsec;
2964	}
2965	return (error);
2966}
2967
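/*
 * Added note: a 32-bit process supplies a struct timespec32 (two
 * 32-bit fields), which copyin_timeout32() widens into the native
 * struct timespec so the timeout loops above can be shared unchanged
 * by the compat32 entry points below.
 */
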
2968static int
2969__umtx_op_lock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
2970{
2971	struct timespec *ts, timeout;
2972	int error;
2973
2974	/* Allow a null timespec (wait forever). */
2975	if (uap->uaddr2 == NULL)
2976		ts = NULL;
2977	else {
2978		error = copyin_timeout32(uap->uaddr2, &timeout);
2979		if (error != 0)
2980			return (error);
2981		if (timeout.tv_nsec >= 1000000000 ||
2982		    timeout.tv_nsec < 0) {
2983			return (EINVAL);
2984		}
2985		ts = &timeout;
2986	}
2987	return (do_lock_umtx32(td, uap->obj, uap->val, ts));
2988}
2989
2990static int
2991__umtx_op_unlock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
2992{
2993	return (do_unlock_umtx32(td, uap->obj, (uint32_t)uap->val));
2994}
2995
2996static int
2997__umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
2998{
2999	struct timespec *ts, timeout;
3000	int error;
3001
3002	if (uap->uaddr2 == NULL)
3003		ts = NULL;
3004	else {
3005		error = copyin_timeout32(uap->uaddr2, &timeout);
3006		if (error != 0)
3007			return (error);
3008		if (timeout.tv_nsec >= 1000000000 ||
3009		    timeout.tv_nsec < 0)
3010			return (EINVAL);
3011		ts = &timeout;
3012	}
3013	return do_wait(td, uap->obj, uap->val, ts, 1);
3014}
3015
3016static int
3017__umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3018{
3019	struct timespec *ts, timeout;
3020	int error;
3021
3022	/* Allow a null timespec (wait forever). */
3023	if (uap->uaddr2 == NULL)
3024		ts = NULL;
3025	else {
3026		error = copyin_timeout32(uap->uaddr2, &timeout);
3027		if (error != 0)
3028			return (error);
3029		if (timeout.tv_nsec >= 1000000000 ||
3030		    timeout.tv_nsec < 0)
3031			return (EINVAL);
3032		ts = &timeout;
3033	}
3034	return do_lock_umutex(td, uap->obj, ts, 0);
3035}
3036
3037static int
3038__umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3039{
3040	struct timespec *ts, timeout;
3041	int error;
3042
3043	/* Allow a null timespec (wait forever). */
3044	if (uap->uaddr2 == NULL)
3045		ts = NULL;
3046	else {
3047		error = copyin_timeout32(uap->uaddr2, &timeout);
3048		if (error != 0)
3049			return (error);
3050		if (timeout.tv_nsec >= 1000000000 ||
3051		    timeout.tv_nsec < 0)
3052			return (EINVAL);
3053		ts = &timeout;
3054	}
3055	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3056}
3057
3058static int
3059__umtx_op_rw_rdlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3060{
3061	struct timespec timeout;
3062	int error;
3063
3064	/* Allow a null timespec (wait forever). */
3065	if (uap->uaddr2 == NULL) {
3066		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3067	} else {
3068		/* A 32-bit timespec must be widened first. */
3069		error = copyin_timeout32(uap->uaddr2, &timeout);
3070		if (error != 0)
3071			return (error);
3072		if (timeout.tv_nsec >= 1000000000 ||
3073		    timeout.tv_nsec < 0) {
3074			return (EINVAL);
3075		}
3076		error = do_rw_rdlock2(td, uap->obj, uap->val, &timeout);
3077	}
3078	return (error);
3079}
3080
3081static int
3082__umtx_op_rw_wrlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3083{
3084	struct timespec timeout;
3085	int error;
3086
3087	/* Allow a null timespec (wait forever). */
3088	if (uap->uaddr2 == NULL) {
3089		error = do_rw_wrlock(td, uap->obj, 0);
3090	} else {
3091		error = copyin_timeout32(uap->uaddr2, &timeout);
3092		if (error != 0)
3093			return (error);
3094		if (timeout.tv_nsec >= 1000000000 ||
3095		    timeout.tv_nsec < 0) {
3096			return (EINVAL);
3097		}
3098
3099		error = do_rw_wrlock2(td, uap->obj, &timeout);
3100	}
3101	return (error);
3102}
3103
3104static _umtx_op_func op_table_compat32[] = {
3105	__umtx_op_lock_umtx_compat32,	/* UMTX_OP_LOCK */
3106	__umtx_op_unlock_umtx_compat32,	/* UMTX_OP_UNLOCK */
3107	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT */
3108	__umtx_op_wake,			/* UMTX_OP_WAKE */
3109	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
3110	__umtx_op_lock_umutex_compat32,	/* UMTX_OP_MUTEX_LOCK */
3111	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
3112	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
3113	__umtx_op_cv_wait_compat32,	/* UMTX_OP_CV_WAIT */
3114	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
3115	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
3116	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT_UINT */
3117	__umtx_op_rw_rdlock_compat32,	/* UMTX_OP_RW_RDLOCK */
3118	__umtx_op_rw_wrlock_compat32,	/* UMTX_OP_RW_WRLOCK */
3119	__umtx_op_rw_unlock		/* UMTX_OP_RW_UNLOCK */
3120};
3121
3122int
3123freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap)
3124{
3125	if ((unsigned)uap->op < UMTX_OP_MAX)
3126		return (*op_table_compat32[uap->op])(td,
3127			(struct _umtx_op_args *)uap);
3128	return (EINVAL);
3129}
3130#endif
3131
3132void
3133umtx_thread_init(struct thread *td)
3134{
3135	td->td_umtxq = umtxq_alloc();
3136	td->td_umtxq->uq_thread = td;
3137}
3138
3139void
3140umtx_thread_fini(struct thread *td)
3141{
3142	umtxq_free(td->td_umtxq);
3143}
3144
3145/*
3146 * Called when a new thread is created, e.g. by fork().
3147 */
3148void
3149umtx_thread_alloc(struct thread *td)
3150{
3151	struct umtx_q *uq;
3152
3153	uq = td->td_umtxq;
3154	uq->uq_inherited_pri = PRI_MAX;
3155
3156	KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
3157	KASSERT(uq->uq_thread == td, ("uq_thread != td"));
3158	KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
3159	KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
3160}
3161
3162/*
3163 * exec() hook.
3164 */
3165static void
3166umtx_exec_hook(void *arg __unused, struct proc *p __unused,
3167	struct image_params *imgp __unused)
3168{
3169	umtx_thread_cleanup(curthread);
3170}
3171
3172/*
3173 * thread_exit() hook.
3174 */
3175void
3176umtx_thread_exit(struct thread *td)
3177{
3178	umtx_thread_cleanup(td);
3179}
3180
3181/*
3182 * Clean up umtx data; called on thread exit and from the exec() hook.
3183 */
3184static void
3185umtx_thread_cleanup(struct thread *td)
3186{
3187	struct umtx_q *uq;
3188	struct umtx_pi *pi;
3189
3190	if ((uq = td->td_umtxq) == NULL)
3191		return;
3192
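	/*
	 * Added note: the exiting (or exec()ing) thread disowns every
	 * PI mutex it still holds contested and drops any priority
	 * that was lent to it through those mutexes.
	 */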
3193	mtx_lock_spin(&umtx_lock);
3194	uq->uq_inherited_pri = PRI_MAX;
3195	while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
3196		pi->pi_owner = NULL;
3197		TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
3198	}
3199	thread_lock(td);
3200	td->td_flags &= ~TDF_UBORROWING;
3201	thread_unlock(td);
3202	mtx_unlock_spin(&umtx_lock);
3203}
3204