kern_umtx.c revision 163678
1/*-
2 * Copyright (c) 2004, David Xu <davidxu@freebsd.org>
3 * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice unmodified, this list of conditions, and the following
11 *    disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28#include <sys/cdefs.h>
29__FBSDID("$FreeBSD: head/sys/kern/kern_umtx.c 163678 2006-10-25 06:38:46Z davidxu $");
30
31#include "opt_compat.h"
32#include <sys/param.h>
33#include <sys/kernel.h>
34#include <sys/limits.h>
35#include <sys/lock.h>
36#include <sys/malloc.h>
37#include <sys/mutex.h>
38#include <sys/proc.h>
39#include <sys/sched.h>
40#include <sys/sysctl.h>
41#include <sys/sysent.h>
42#include <sys/systm.h>
43#include <sys/sysproto.h>
44#include <sys/eventhandler.h>
45#include <sys/umtx.h>
46
47#include <vm/vm.h>
48#include <vm/vm_param.h>
49#include <vm/pmap.h>
50#include <vm/vm_map.h>
51#include <vm/vm_object.h>
52
53#ifdef COMPAT_IA32
54#include <compat/freebsd32/freebsd32_proto.h>
55#endif
56
/*
 * Kinds of userland synchronization objects a umtx_key can name.
 * The type takes part in key matching (umtx_key_match()), so objects
 * of different kinds at the same address never alias.
 */
#define TYPE_SIMPLE_LOCK	0	/* struct umtx simple lock */
#define TYPE_SIMPLE_WAIT	1	/* wait/wake channel (do_wait()) */
#define TYPE_NORMAL_UMUTEX	2	/* PRIO_NONE struct umutex */
#define TYPE_PI_UMUTEX		3	/* priority-inheritance umutex */
#define TYPE_PP_UMUTEX		4	/* priority-protect umutex */
#define TYPE_CV			5	/* condition variable */

/* Key to represent a unique userland synchronous object */
struct umtx_key {
	int	hash;		/* chain index computed by umtxq_hash() */
	int	type;		/* one of the TYPE_* values above */
	int	shared;		/* non-zero: 'shared' arm of the union is live */
	union {
		/* Process-shared object: named by VM object + offset. */
		struct {
			vm_object_t	object;
			uintptr_t	offset;
		} shared;
		/* Process-private object: named by vmspace + address. */
		struct {
			struct vmspace	*vs;
			uintptr_t	addr;
		} private;
		/*
		 * Punned view of either arm above, used uniformly by
		 * umtxq_hash() and umtx_key_match().
		 */
		struct {
			void		*a;
			uintptr_t	b;
		} both;
	} info;
};
84
/* Priority inheritance mutex info. */
struct umtx_pi {
	/* Owner thread; NULL while ownerless. */
	struct thread		*pi_owner;

	/* Reference count */
	int			pi_refcount;

	/* Entry on the owning thread's uq_pi_contested list. */
	TAILQ_ENTRY(umtx_pi)	pi_link;

	/* List entry in hash (umtxq_chain uc_pi_list). */
	TAILQ_ENTRY(umtx_pi)	pi_hashlink;

	/* Waiters blocked on this mutex, kept sorted by priority. */
	TAILQ_HEAD(,umtx_q)	pi_blocked;

	/* Identify a userland lock object */
	struct umtx_key		pi_key;
};
105
/* A userland synchronous object user (one per thread, td_umtxq). */
struct umtx_q {
	/* Linked list for the hash (umtxq_chain uc_queue). */
	TAILQ_ENTRY(umtx_q)	uq_link;

	/* Key of the object this entry is currently queued on. */
	struct umtx_key		uq_key;

	/* Umtx flags. */
	int			uq_flags;
#define UQF_UMTXQ	0x0001		/* entry is on a wait queue */

	/* Thread this queue entry belongs to. */
	struct thread		*uq_thread;

	/*
	 * PI mutex this thread is blocked on, or NULL.  Readers may
	 * hold either the chain lock or sched_lock; writers must hold
	 * both.
	 */
	struct umtx_pi		*uq_pi_blocked;

	/* Entry on a umtx_pi's pi_blocked list. */
	TAILQ_ENTRY(umtx_q)	uq_lockq;

	/* PI mutexes owned by this thread that still have waiters. */
	TAILQ_HEAD(,umtx_pi)	uq_pi_contested;

	/* Priority inherited from PP mutexes (PRI_MAX when none). */
	u_char			uq_inherited_pri;
};
137
TAILQ_HEAD(umtxq_head, umtx_q);

/* Userland lock object's wait-queue chain (one hash bucket). */
struct umtxq_chain {
	/* Lock for this chain. */
	struct mtx		uc_lock;

	/* List of sleep queues. */
	struct umtxq_head	uc_queue;

	/*
	 * Busy flag: set by umtxq_busy() while an operation that may
	 * block is in progress; excludes other busy-ers, not lockers.
	 */
	char			uc_busy;

	/* Number of threads sleeping for the busy flag to clear. */
	int			uc_waiters;

	/* All PI mutexes hashed to this chain. */
	TAILQ_HEAD(,umtx_pi)	uc_pi_list;
};
157
#define	UMTXQ_LOCKED_ASSERT(uc)		mtx_assert(&(uc)->uc_lock, MA_OWNED)

/*
 * Don't propagate time-sharing priority, there is a security reason,
 * a user can simply introduce PI-mutex, let thread A lock the mutex,
 * and let another thread B block on the mutex, because B is
 * sleeping, its priority will be boosted, this causes A's priority to
 * be boosted via priority propagating too and will never be lowered even
 * if it is using 100%CPU, this is unfair to other processes.
 */

/*
 * Effective user priority for PI/PP computations: any time-sharing
 * priority is clamped to PRI_MAX_TIMESHARE (see rationale above).
 */
#define UPRI(td)	(((td)->td_ksegrp->kg_user_pri >= PRI_MIN_TIMESHARE &&\
			  (td)->td_ksegrp->kg_user_pri <= PRI_MAX_TIMESHARE) ?\
			 PRI_MAX_TIMESHARE : (td)->td_ksegrp->kg_user_pri)

/* Multiplicative hash parameters used by umtxq_hash(). */
#define	GOLDEN_RATIO_PRIME	2654404609U
#define	UMTX_CHAINS		128
#define	UMTX_SHIFTS		(__WORD_BIT - 7)

/* Sharing modes accepted by umtx_key_get(). */
#define THREAD_SHARE		0	/* process-private */
#define PROCESS_SHARE		1	/* shared across processes */
#define AUTO_SHARE		2	/* decide from the VM map entry */

/* Map umutex flag bits to a THREAD_SHARE/PROCESS_SHARE mode. */
#define	GET_SHARE(flags)	\
    (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)
183
/* Zone backing umtx_pi_alloc()/umtx_pi_free(). */
static uma_zone_t		umtx_pi_zone;
/* Wait-queue chain hash table, indexed by umtx_key.hash. */
static struct umtxq_chain	umtxq_chains[UMTX_CHAINS];
static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
/* Live umtx_pi count, exported read-only below for debugging. */
static int			umtx_pi_allocated;

SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
    &umtx_pi_allocated, 0, "Allocated umtx_pi");

/* Forward declarations. */
static void umtxq_sysinit(void *);
static void umtxq_hash(struct umtx_key *key);
static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
static void umtxq_lock(struct umtx_key *key);
static void umtxq_unlock(struct umtx_key *key);
static void umtxq_busy(struct umtx_key *key);
static void umtxq_unbusy(struct umtx_key *key);
static void umtxq_insert(struct umtx_q *uq);
static void umtxq_remove(struct umtx_q *uq);
static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo);
static int umtxq_count(struct umtx_key *key);
static int umtxq_signal(struct umtx_key *key, int nr_wakeup);
static int umtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2);
static int umtx_key_get(void *addr, int type, int share,
	struct umtx_key *key);
static void umtx_key_release(struct umtx_key *key);
static struct umtx_pi *umtx_pi_alloc(void);
static void umtx_pi_free(struct umtx_pi *pi);
static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags);
static void umtx_thread_cleanup(struct thread *td);
static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
	struct image_params *imgp __unused);
/* Run after eventhandler init so the exec hook can be registered. */
SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);
216
217static void
218umtxq_sysinit(void *arg __unused)
219{
220	int i;
221
222	umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
223		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
224	for (i = 0; i < UMTX_CHAINS; ++i) {
225		mtx_init(&umtxq_chains[i].uc_lock, "umtxql", NULL,
226			 MTX_DEF | MTX_DUPOK);
227		TAILQ_INIT(&umtxq_chains[i].uc_queue);
228		TAILQ_INIT(&umtxq_chains[i].uc_pi_list);
229		umtxq_chains[i].uc_busy = 0;
230		umtxq_chains[i].uc_waiters = 0;
231	}
232	EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
233	    EVENTHANDLER_PRI_ANY);
234}
235
236struct umtx_q *
237umtxq_alloc(void)
238{
239	struct umtx_q *uq;
240
241	uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
242	TAILQ_INIT(&uq->uq_pi_contested);
243	uq->uq_inherited_pri = PRI_MAX;
244	return (uq);
245}
246
247void
248umtxq_free(struct umtx_q *uq)
249{
250	free(uq, M_UMTX);
251}
252
253static inline void
254umtxq_hash(struct umtx_key *key)
255{
256	unsigned n = (uintptr_t)key->info.both.a + key->info.both.b;
257	key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
258}
259
260static inline int
261umtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2)
262{
263	return (k1->type == k2->type &&
264		k1->info.both.a == k2->info.both.a &&
265	        k1->info.both.b == k2->info.both.b);
266}
267
268static inline struct umtxq_chain *
269umtxq_getchain(struct umtx_key *key)
270{
271	return (&umtxq_chains[key->hash]);
272}
273
274/*
275 * Set chain to busy state when following operation
276 * may be blocked (kernel mutex can not be used).
277 */
278static inline void
279umtxq_busy(struct umtx_key *key)
280{
281	struct umtxq_chain *uc;
282
283	uc = umtxq_getchain(key);
284	mtx_assert(&uc->uc_lock, MA_OWNED);
285	while (uc->uc_busy != 0) {
286		uc->uc_waiters++;
287		msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
288		uc->uc_waiters--;
289	}
290	uc->uc_busy = 1;
291}
292
293/*
294 * Unbusy a chain.
295 */
296static inline void
297umtxq_unbusy(struct umtx_key *key)
298{
299	struct umtxq_chain *uc;
300
301	uc = umtxq_getchain(key);
302	mtx_assert(&uc->uc_lock, MA_OWNED);
303	KASSERT(uc->uc_busy != 0, ("not busy"));
304	uc->uc_busy = 0;
305	if (uc->uc_waiters)
306		wakeup_one(uc);
307}
308
309/*
310 * Lock a chain.
311 */
312static inline void
313umtxq_lock(struct umtx_key *key)
314{
315	struct umtxq_chain *uc;
316
317	uc = umtxq_getchain(key);
318	mtx_lock(&uc->uc_lock);
319}
320
321/*
322 * Unlock a chain.
323 */
324static inline void
325umtxq_unlock(struct umtx_key *key)
326{
327	struct umtxq_chain *uc;
328
329	uc = umtxq_getchain(key);
330	mtx_unlock(&uc->uc_lock);
331}
332
333/*
334 * Insert a thread onto the umtx queue.
335 */
336static inline void
337umtxq_insert(struct umtx_q *uq)
338{
339	struct umtxq_chain *uc;
340
341	uc = umtxq_getchain(&uq->uq_key);
342	UMTXQ_LOCKED_ASSERT(uc);
343	TAILQ_INSERT_TAIL(&uc->uc_queue, uq, uq_link);
344	uq->uq_flags |= UQF_UMTXQ;
345}
346
347/*
348 * Remove thread from the umtx queue.
349 */
350static inline void
351umtxq_remove(struct umtx_q *uq)
352{
353	struct umtxq_chain *uc;
354
355	uc = umtxq_getchain(&uq->uq_key);
356	UMTXQ_LOCKED_ASSERT(uc);
357	if (uq->uq_flags & UQF_UMTXQ) {
358		TAILQ_REMOVE(&uc->uc_queue, uq, uq_link);
359		uq->uq_flags &= ~UQF_UMTXQ;
360	}
361}
362
363/*
364 * Check if there are multiple waiters
365 */
366static int
367umtxq_count(struct umtx_key *key)
368{
369	struct umtxq_chain *uc;
370	struct umtx_q *uq;
371	int count = 0;
372
373	uc = umtxq_getchain(key);
374	UMTXQ_LOCKED_ASSERT(uc);
375	TAILQ_FOREACH(uq, &uc->uc_queue, uq_link) {
376		if (umtx_key_match(&uq->uq_key, key)) {
377			if (++count > 1)
378				break;
379		}
380	}
381	return (count);
382}
383
384/*
385 * Check if there are multiple PI waiters and returns first
386 * waiter.
387 */
388static int
389umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
390{
391	struct umtxq_chain *uc;
392	struct umtx_q *uq;
393	int count = 0;
394
395	*first = NULL;
396	uc = umtxq_getchain(key);
397	UMTXQ_LOCKED_ASSERT(uc);
398	TAILQ_FOREACH(uq, &uc->uc_queue, uq_link) {
399		if (umtx_key_match(&uq->uq_key, key)) {
400			if (++count > 1)
401				break;
402			*first = uq;
403		}
404	}
405	return (count);
406}
407
408/*
409 * Wake up threads waiting on an userland object.
410 */
411static int
412umtxq_signal(struct umtx_key *key, int n_wake)
413{
414	struct umtxq_chain *uc;
415	struct umtx_q *uq, *next;
416	int ret;
417
418	ret = 0;
419	uc = umtxq_getchain(key);
420	UMTXQ_LOCKED_ASSERT(uc);
421	TAILQ_FOREACH_SAFE(uq, &uc->uc_queue, uq_link, next) {
422		if (umtx_key_match(&uq->uq_key, key)) {
423			umtxq_remove(uq);
424			wakeup(uq);
425			if (++ret >= n_wake)
426				break;
427		}
428	}
429	return (ret);
430}
431
432/*
433 * Wake up specified thread.
434 */
435static inline void
436umtxq_signal_thread(struct umtx_q *uq)
437{
438	struct umtxq_chain *uc;
439
440	uc = umtxq_getchain(&uq->uq_key);
441	UMTXQ_LOCKED_ASSERT(uc);
442	umtxq_remove(uq);
443	wakeup(uq);
444}
445
446/*
447 * Put thread into sleep state, before sleeping, check if
448 * thread was removed from umtx queue.
449 */
450static inline int
451umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo)
452{
453	struct umtxq_chain *uc;
454	int error;
455
456	uc = umtxq_getchain(&uq->uq_key);
457	UMTXQ_LOCKED_ASSERT(uc);
458	if (!(uq->uq_flags & UQF_UMTXQ))
459		return (0);
460	error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
461	if (error == EWOULDBLOCK)
462		error = ETIMEDOUT;
463	return (error);
464}
465
466/*
467 * Convert userspace address into unique logical address.
468 */
469static int
470umtx_key_get(void *addr, int type, int share, struct umtx_key *key)
471{
472	struct thread *td = curthread;
473	vm_map_t map;
474	vm_map_entry_t entry;
475	vm_pindex_t pindex;
476	vm_prot_t prot;
477	boolean_t wired;
478
479	key->type = type;
480	if (share == THREAD_SHARE) {
481		key->shared = 0;
482		key->info.private.vs = td->td_proc->p_vmspace;
483		key->info.private.addr = (uintptr_t)addr;
484	} else {
485		MPASS(share == PROCESS_SHARE || share == AUTO_SHARE);
486		map = &td->td_proc->p_vmspace->vm_map;
487		if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
488		    &entry, &key->info.shared.object, &pindex, &prot,
489		    &wired) != KERN_SUCCESS) {
490			return EFAULT;
491		}
492
493		if ((share == PROCESS_SHARE) ||
494		    (share == AUTO_SHARE &&
495		     VM_INHERIT_SHARE == entry->inheritance)) {
496			key->shared = 1;
497			key->info.shared.offset = entry->offset + entry->start -
498				(vm_offset_t)addr;
499			vm_object_reference(key->info.shared.object);
500		} else {
501			key->shared = 0;
502			key->info.private.vs = td->td_proc->p_vmspace;
503			key->info.private.addr = (uintptr_t)addr;
504		}
505		vm_map_lookup_done(map, entry);
506	}
507
508	umtxq_hash(key);
509	return (0);
510}
511
512/*
513 * Release key.
514 */
515static inline void
516umtx_key_release(struct umtx_key *key)
517{
518	if (key->shared)
519		vm_object_deallocate(key->info.shared.object);
520}
521
522/*
523 * Lock a umtx object.
524 */
525static int
526_do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id, int timo)
527{
528	struct umtx_q *uq;
529	u_long owner;
530	u_long old;
531	int error = 0;
532
533	uq = td->td_umtxq;
534
535	/*
536	 * Care must be exercised when dealing with umtx structure. It
537	 * can fault on any access.
538	 */
539	for (;;) {
540		/*
541		 * Try the uncontested case.  This should be done in userland.
542		 */
543		owner = casuword(&umtx->u_owner, UMTX_UNOWNED, id);
544
545		/* The acquire succeeded. */
546		if (owner == UMTX_UNOWNED)
547			return (0);
548
549		/* The address was invalid. */
550		if (owner == -1)
551			return (EFAULT);
552
553		/* If no one owns it but it is contested try to acquire it. */
554		if (owner == UMTX_CONTESTED) {
555			owner = casuword(&umtx->u_owner,
556			    UMTX_CONTESTED, id | UMTX_CONTESTED);
557
558			if (owner == UMTX_CONTESTED)
559				return (0);
560
561			/* The address was invalid. */
562			if (owner == -1)
563				return (EFAULT);
564
565			/* If this failed the lock has changed, restart. */
566			continue;
567		}
568
569		/*
570		 * If we caught a signal, we have retried and now
571		 * exit immediately.
572		 */
573		if (error != 0)
574			return (error);
575
576		if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK,
577			AUTO_SHARE, &uq->uq_key)) != 0)
578			return (error);
579
580		umtxq_lock(&uq->uq_key);
581		umtxq_busy(&uq->uq_key);
582		umtxq_insert(uq);
583		umtxq_unbusy(&uq->uq_key);
584		umtxq_unlock(&uq->uq_key);
585
586		/*
587		 * Set the contested bit so that a release in user space
588		 * knows to use the system call for unlock.  If this fails
589		 * either some one else has acquired the lock or it has been
590		 * released.
591		 */
592		old = casuword(&umtx->u_owner, owner, owner | UMTX_CONTESTED);
593
594		/* The address was invalid. */
595		if (old == -1) {
596			umtxq_lock(&uq->uq_key);
597			umtxq_remove(uq);
598			umtxq_unlock(&uq->uq_key);
599			umtx_key_release(&uq->uq_key);
600			return (EFAULT);
601		}
602
603		/*
604		 * We set the contested bit, sleep. Otherwise the lock changed
605		 * and we need to retry or we lost a race to the thread
606		 * unlocking the umtx.
607		 */
608		umtxq_lock(&uq->uq_key);
609		if (old == owner)
610			error = umtxq_sleep(uq, "umtx", timo);
611		umtxq_remove(uq);
612		umtxq_unlock(&uq->uq_key);
613		umtx_key_release(&uq->uq_key);
614	}
615
616	return (0);
617}

/*
 * Lock a umtx object.
 *
 * Timed wrapper around _do_lock_umtx(): a NULL timeout means block
 * indefinitely (and an interrupted lock is restarted via ERESTART);
 * otherwise 'timeout' is a relative interval converted to an absolute
 * uptime deadline, and the inner lock is retried with a shrinking
 * tick budget until success or the deadline passes.  Timed locking
 * is never restarted: ERESTART is mapped back to EINTR.
 */
static int
do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id,
	struct timespec *timeout)
{
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	int error;

	if (timeout == NULL) {
		error = _do_lock_umtx(td, umtx, id, 0);
		/* Mutex locking is restarted if it is interrupted. */
		if (error == EINTR)
			error = ERESTART;
	} else {
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
		for (;;) {
			error = _do_lock_umtx(td, umtx, id, tvtohz(&tv));
			if (error != ETIMEDOUT)
				break;
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				break;
			}
			/* Deadline not reached yet; sleep for the remainder. */
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
		}
		/* Timed-locking is not restarted. */
		if (error == ERESTART)
			error = EINTR;
	}
	return (error);
}
658
659/*
660 * Unlock a umtx object.
661 */
662static int
663do_unlock_umtx(struct thread *td, struct umtx *umtx, u_long id)
664{
665	struct umtx_key key;
666	u_long owner;
667	u_long old;
668	int error;
669	int count;
670
671	/*
672	 * Make sure we own this mtx.
673	 */
674	owner = fuword(__DEVOLATILE(u_long *, &umtx->u_owner));
675	if (owner == -1)
676		return (EFAULT);
677
678	if ((owner & ~UMTX_CONTESTED) != id)
679		return (EPERM);
680
681	/* This should be done in userland */
682	if ((owner & UMTX_CONTESTED) == 0) {
683		old = casuword(&umtx->u_owner, owner, UMTX_UNOWNED);
684		if (old == -1)
685			return (EFAULT);
686		if (old == owner)
687			return (0);
688		owner = old;
689	}
690
691	/* We should only ever be in here for contested locks */
692	if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK, AUTO_SHARE,
693		&key)) != 0)
694		return (error);
695
696	umtxq_lock(&key);
697	umtxq_busy(&key);
698	count = umtxq_count(&key);
699	umtxq_unlock(&key);
700
701	/*
702	 * When unlocking the umtx, it must be marked as unowned if
703	 * there is zero or one thread only waiting for it.
704	 * Otherwise, it must be marked as contested.
705	 */
706	old = casuword(&umtx->u_owner, owner,
707		count <= 1 ? UMTX_UNOWNED : UMTX_CONTESTED);
708	umtxq_lock(&key);
709	umtxq_signal(&key,1);
710	umtxq_unbusy(&key);
711	umtxq_unlock(&key);
712	umtx_key_release(&key);
713	if (old == -1)
714		return (EFAULT);
715	if (old != owner)
716		return (EINVAL);
717	return (0);
718}
719
720#ifdef COMPAT_IA32

/*
 * Lock a umtx object.
 *
 * 32-bit (COMPAT_IA32) variant of _do_lock_umtx(): identical
 * algorithm, but operates on a 32-bit word with casuword32() and the
 * UMUTEX_* constants (numerically the 32-bit UMTX values).
 */
static int
_do_lock_umtx32(struct thread *td, uint32_t *m, uint32_t id, int timo)
{
	struct umtx_q *uq;
	uint32_t owner;
	uint32_t old;
	int error = 0;

	uq = td->td_umtxq;

	/*
	 * Care must be exercised when dealing with umtx structure. It
	 * can fault on any access.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword32(m, UMUTEX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMUTEX_UNOWNED)
			return (0);

		/* The address was invalid. */
		if (owner == -1)
			return (EFAULT);

		/* If no one owns it but it is contested try to acquire it. */
		if (owner == UMUTEX_CONTESTED) {
			owner = casuword32(m,
			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
			if (owner == UMUTEX_CONTESTED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			/* If this failed the lock has changed, restart. */
			continue;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK,
			AUTO_SHARE, &uq->uq_key)) != 0)
			return (error);

		/* Busy the chain while inserting so unlockers serialize. */
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either some one else has acquired the lock or it has been
		 * released.
		 */
		old = casuword32(m, owner, owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * We set the contested bit, sleep. Otherwise the lock changed
		 * and we need to retry or we lost a race to the thread
		 * unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtx", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);
	}

	return (0);
}
817
818/*
819 * Lock a umtx object.
820 */
821static int
822do_lock_umtx32(struct thread *td, void *m, uint32_t id,
823	struct timespec *timeout)
824{
825	struct timespec ts, ts2, ts3;
826	struct timeval tv;
827	int error;
828
829	if (timeout == NULL) {
830		error = _do_lock_umtx32(td, m, id, 0);
831		/* Mutex locking is restarted if it is interrupted. */
832		if (error == EINTR)
833			error = ERESTART;
834	} else {
835		getnanouptime(&ts);
836		timespecadd(&ts, timeout);
837		TIMESPEC_TO_TIMEVAL(&tv, timeout);
838		for (;;) {
839			error = _do_lock_umtx32(td, m, id, tvtohz(&tv));
840			if (error != ETIMEDOUT)
841				break;
842			getnanouptime(&ts2);
843			if (timespeccmp(&ts2, &ts, >=)) {
844				error = ETIMEDOUT;
845				break;
846			}
847			ts3 = ts;
848			timespecsub(&ts3, &ts2);
849			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
850		}
851		/* Timed-locking is not restarted. */
852		if (error == ERESTART)
853			error = EINTR;
854	}
855	return (error);
856}
857
858/*
859 * Unlock a umtx object.
860 */
861static int
862do_unlock_umtx32(struct thread *td, uint32_t *m, uint32_t id)
863{
864	struct umtx_key key;
865	uint32_t owner;
866	uint32_t old;
867	int error;
868	int count;
869
870	/*
871	 * Make sure we own this mtx.
872	 */
873	owner = fuword32(m);
874	if (owner == -1)
875		return (EFAULT);
876
877	if ((owner & ~UMUTEX_CONTESTED) != id)
878		return (EPERM);
879
880	/* This should be done in userland */
881	if ((owner & UMUTEX_CONTESTED) == 0) {
882		old = casuword32(m, owner, UMUTEX_UNOWNED);
883		if (old == -1)
884			return (EFAULT);
885		if (old == owner)
886			return (0);
887		owner = old;
888	}
889
890	/* We should only ever be in here for contested locks */
891	if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK, AUTO_SHARE,
892		&key)) != 0)
893		return (error);
894
895	umtxq_lock(&key);
896	umtxq_busy(&key);
897	count = umtxq_count(&key);
898	umtxq_unlock(&key);
899
900	/*
901	 * When unlocking the umtx, it must be marked as unowned if
902	 * there is zero or one thread only waiting for it.
903	 * Otherwise, it must be marked as contested.
904	 */
905	old = casuword32(m, owner,
906		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
907	umtxq_lock(&key);
908	umtxq_signal(&key,1);
909	umtxq_unbusy(&key);
910	umtxq_unlock(&key);
911	umtx_key_release(&key);
912	if (old == -1)
913		return (EFAULT);
914	if (old != owner)
915		return (EINVAL);
916	return (0);
917}
918#endif
919
920/*
921 * Fetch and compare value, sleep on the address if value is not changed.
922 */
923static int
924do_wait(struct thread *td, void *addr, u_long id,
925	struct timespec *timeout, int compat32)
926{
927	struct umtx_q *uq;
928	struct timespec ts, ts2, ts3;
929	struct timeval tv;
930	u_long tmp;
931	int error = 0;
932
933	uq = td->td_umtxq;
934	if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT, AUTO_SHARE,
935	    &uq->uq_key)) != 0)
936		return (error);
937
938	umtxq_lock(&uq->uq_key);
939	umtxq_insert(uq);
940	umtxq_unlock(&uq->uq_key);
941	if (compat32 == 0)
942		tmp = fuword(addr);
943        else
944		tmp = fuword32(addr);
945	if (tmp != id) {
946		umtxq_lock(&uq->uq_key);
947		umtxq_remove(uq);
948		umtxq_unlock(&uq->uq_key);
949	} else if (timeout == NULL) {
950		umtxq_lock(&uq->uq_key);
951		error = umtxq_sleep(uq, "ucond", 0);
952		umtxq_remove(uq);
953		umtxq_unlock(&uq->uq_key);
954	} else {
955		getnanouptime(&ts);
956		timespecadd(&ts, timeout);
957		TIMESPEC_TO_TIMEVAL(&tv, timeout);
958		umtxq_lock(&uq->uq_key);
959		for (;;) {
960			error = umtxq_sleep(uq, "ucond", tvtohz(&tv));
961			if (!(uq->uq_flags & UQF_UMTXQ))
962				break;
963			if (error != ETIMEDOUT)
964				break;
965			umtxq_unlock(&uq->uq_key);
966			getnanouptime(&ts2);
967			if (timespeccmp(&ts2, &ts, >=)) {
968				error = ETIMEDOUT;
969				umtxq_lock(&uq->uq_key);
970				break;
971			}
972			ts3 = ts;
973			timespecsub(&ts3, &ts2);
974			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
975			umtxq_lock(&uq->uq_key);
976		}
977		umtxq_remove(uq);
978		umtxq_unlock(&uq->uq_key);
979	}
980	umtx_key_release(&uq->uq_key);
981	if (error == ERESTART)
982		error = EINTR;
983	return (error);
984}
985
986/*
987 * Wake up threads sleeping on the specified address.
988 */
989int
990kern_umtx_wake(struct thread *td, void *uaddr, int n_wake)
991{
992	struct umtx_key key;
993	int ret;
994
995	if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT, AUTO_SHARE,
996	   &key)) != 0)
997		return (ret);
998	umtxq_lock(&key);
999	ret = umtxq_signal(&key, n_wake);
1000	umtxq_unlock(&key);
1001	umtx_key_release(&key);
1002	return (0);
1003}
1004
1005/*
1006 * Lock PTHREAD_PRIO_NONE protocol POSIX mutex.
1007 */
1008static int
1009_do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags, int timo,
1010	int try)
1011{
1012	struct umtx_q *uq;
1013	uint32_t owner, old, id;
1014	int error = 0;
1015
1016	id = td->td_tid;
1017	uq = td->td_umtxq;
1018
1019	/*
1020	 * Care must be exercised when dealing with umtx structure. It
1021	 * can fault on any access.
1022	 */
1023	for (;;) {
1024		/*
1025		 * Try the uncontested case.  This should be done in userland.
1026		 */
1027		owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);
1028
1029		/* The acquire succeeded. */
1030		if (owner == UMUTEX_UNOWNED)
1031			return (0);
1032
1033		/* The address was invalid. */
1034		if (owner == -1)
1035			return (EFAULT);
1036
1037		/* If no one owns it but it is contested try to acquire it. */
1038		if (owner == UMUTEX_CONTESTED) {
1039			owner = casuword32(&m->m_owner,
1040			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
1041
1042			if (owner == UMUTEX_CONTESTED)
1043				return (0);
1044
1045			/* The address was invalid. */
1046			if (owner == -1)
1047				return (EFAULT);
1048
1049			/* If this failed the lock has changed, restart. */
1050			continue;
1051		}
1052
1053		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
1054		    (owner & ~UMUTEX_CONTESTED) == id)
1055			return (EDEADLK);
1056
1057		if (try != 0)
1058			return (EBUSY);
1059
1060		/*
1061		 * If we caught a signal, we have retried and now
1062		 * exit immediately.
1063		 */
1064		if (error != 0)
1065			return (error);
1066
1067		if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
1068		    GET_SHARE(flags), &uq->uq_key)) != 0)
1069			return (error);
1070
1071		umtxq_lock(&uq->uq_key);
1072		umtxq_busy(&uq->uq_key);
1073		umtxq_insert(uq);
1074		umtxq_unbusy(&uq->uq_key);
1075		umtxq_unlock(&uq->uq_key);
1076
1077		/*
1078		 * Set the contested bit so that a release in user space
1079		 * knows to use the system call for unlock.  If this fails
1080		 * either some one else has acquired the lock or it has been
1081		 * released.
1082		 */
1083		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);
1084
1085		/* The address was invalid. */
1086		if (old == -1) {
1087			umtxq_lock(&uq->uq_key);
1088			umtxq_remove(uq);
1089			umtxq_unlock(&uq->uq_key);
1090			umtx_key_release(&uq->uq_key);
1091			return (EFAULT);
1092		}
1093
1094		/*
1095		 * We set the contested bit, sleep. Otherwise the lock changed
1096		 * and we need to retry or we lost a race to the thread
1097		 * unlocking the umtx.
1098		 */
1099		umtxq_lock(&uq->uq_key);
1100		if (old == owner)
1101			error = umtxq_sleep(uq, "umtxn", timo);
1102		umtxq_remove(uq);
1103		umtxq_unlock(&uq->uq_key);
1104		umtx_key_release(&uq->uq_key);
1105	}
1106
1107	return (0);
1108}
1109
1110/*
1111 * Lock PTHREAD_PRIO_NONE protocol POSIX mutex.
1112 */
1113/*
1114 * Unlock PTHREAD_PRIO_NONE protocol POSIX mutex.
1115 */
/*
 * Unlock a PTHREAD_PRIO_NONE umutex owned by the calling thread.
 * Mirrors do_unlock_umtx(): ownership check against td_tid (EPERM),
 * uncontested fast path, then the waiter count decides whether the
 * word becomes UNOWNED (<= 1 waiter) or stays CONTESTED, and one
 * waiter is woken under the busied chain.
 */
static int
do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags)
{
	struct umtx_key key;
	uint32_t owner, old, id;
	int error;
	int count;

	id = td->td_tid;
	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMUTEX_CONTESTED) == 0) {
		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		/* Someone set the contested bit meanwhile; fall through. */
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * there is zero or one thread only waiting for it.
	 * Otherwise, it must be marked as contested.
	 */
	old = casuword32(&m->m_owner, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key,1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}
1173
1174static inline struct umtx_pi *
1175umtx_pi_alloc(void)
1176{
1177	struct umtx_pi *pi;
1178
1179	pi = uma_zalloc(umtx_pi_zone, M_ZERO | M_WAITOK);
1180	TAILQ_INIT(&pi->pi_blocked);
1181	atomic_add_int(&umtx_pi_allocated, 1);
1182	return (pi);
1183}
1184
1185static inline void
1186umtx_pi_free(struct umtx_pi *pi)
1187{
1188	uma_zfree(umtx_pi_zone, pi);
1189	atomic_add_int(&umtx_pi_allocated, -1);
1190}
1191
1192/*
1193 * Adjust the thread's position on a pi_state after its priority has been
1194 * changed.
1195 */
1196static int
1197umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
1198{
1199	struct umtx_q *uq, *uq1, *uq2;
1200	struct thread *td1;
1201
1202	mtx_assert(&sched_lock, MA_OWNED);
1203	if (pi == NULL)
1204		return (0);
1205
1206	uq = td->td_umtxq;
1207
1208	/*
1209	 * Check if the thread needs to be moved on the blocked chain.
1210	 * It needs to be moved if either its priority is lower than
1211	 * the previous thread or higher than the next thread.
1212	 */
1213	uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
1214	uq2 = TAILQ_NEXT(uq, uq_lockq);
1215	if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
1216	    (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
1217		/*
1218		 * Remove thread from blocked chain and determine where
1219		 * it should be moved to.
1220		 */
1221		TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1222		TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1223			td1 = uq1->uq_thread;
1224			MPASS(td1->td_proc->p_magic == P_MAGIC);
1225			if (UPRI(td1) > UPRI(td))
1226				break;
1227		}
1228
1229		if (uq1 == NULL)
1230			TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1231		else
1232			TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1233	}
1234	return (1);
1235}
1236
1237/*
1238 * Propagate priority when a thread is blocked on POSIX
1239 * PI mutex.
1240 */
1241static void
1242umtx_propagate_priority(struct thread *td)
1243{
1244	struct umtx_q *uq;
1245	struct umtx_pi *pi;
1246	int pri;
1247
1248	mtx_assert(&sched_lock, MA_OWNED);
1249	pri = UPRI(td);
1250	uq = td->td_umtxq;
1251	pi = uq->uq_pi_blocked;
1252	if (pi == NULL)
1253		return;
1254
1255	for (;;) {
1256		td = pi->pi_owner;
1257		if (td == NULL)
1258			return;
1259
1260		MPASS(td->td_proc != NULL);
1261		MPASS(td->td_proc->p_magic == P_MAGIC);
1262
1263		if (UPRI(td) <= pri)
1264			return;
1265
1266		sched_lend_user_prio(td, pri);
1267
1268		/*
1269		 * Pick up the lock that td is blocked on.
1270		 */
1271		uq = td->td_umtxq;
1272		pi = uq->uq_pi_blocked;
1273		/* Resort td on the list if needed. */
1274		if (!umtx_pi_adjust_thread(pi, td))
1275			break;
1276	}
1277}
1278
1279/*
1280 * Unpropagate priority for a PI mutex when a thread blocked on
1281 * it is interrupted by signal or resumed by others.
1282 */
1283static void
1284umtx_unpropagate_priority(struct umtx_pi *pi)
1285{
1286	struct umtx_q *uq, *uq_owner;
1287	struct umtx_pi *pi2;
1288	int pri;
1289
1290	mtx_assert(&sched_lock, MA_OWNED);
1291
1292	while (pi != NULL && pi->pi_owner != NULL) {
1293		pri = PRI_MAX;
1294		uq_owner = pi->pi_owner->td_umtxq;
1295
1296		TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
1297			uq = TAILQ_FIRST(&pi2->pi_blocked);
1298			if (uq != NULL) {
1299				if (pri > UPRI(uq->uq_thread))
1300					pri = UPRI(uq->uq_thread);
1301			}
1302		}
1303
1304		if (pri > uq_owner->uq_inherited_pri)
1305			pri = uq_owner->uq_inherited_pri;
1306		sched_unlend_user_prio(pi->pi_owner, pri);
1307		pi = uq_owner->uq_pi_blocked;
1308	}
1309}
1310
1311/*
1312 * Insert a PI mutex into owned list.
1313 */
1314static void
1315umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
1316{
1317	struct umtx_q *uq_owner;
1318
1319	uq_owner = owner->td_umtxq;
1320	mtx_assert(&sched_lock, MA_OWNED);
1321	if (pi->pi_owner != NULL)
1322		panic("pi_ower != NULL");
1323	pi->pi_owner = owner;
1324	TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
1325}
1326
1327/*
1328 * Claim ownership of a PI mutex.
1329 */
1330static int
1331umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
1332{
1333	struct umtx_q *uq, *uq_owner;
1334
1335	uq_owner = owner->td_umtxq;
1336	mtx_lock_spin(&sched_lock);
1337	if (pi->pi_owner == owner) {
1338		mtx_unlock_spin(&sched_lock);
1339		return (0);
1340	}
1341
1342	if (pi->pi_owner != NULL) {
1343		/*
1344		 * userland may have already messed the mutex, sigh.
1345		 */
1346		mtx_unlock_spin(&sched_lock);
1347		return (EPERM);
1348	}
1349	umtx_pi_setowner(pi, owner);
1350	uq = TAILQ_FIRST(&pi->pi_blocked);
1351	if (uq != NULL) {
1352		int pri;
1353
1354		pri = UPRI(uq->uq_thread);
1355		if (pri < UPRI(owner))
1356			sched_lend_user_prio(owner, pri);
1357	}
1358	mtx_unlock_spin(&sched_lock);
1359	return (0);
1360}
1361
1362/*
1363 * Adjust a thread's order position in its blocked PI mutex,
1364 * this may result new priority propagating process.
1365 */
1366void
1367umtx_pi_adjust(struct thread *td, u_char oldpri)
1368{
1369	struct umtx_q *uq;
1370	struct umtx_pi *pi;
1371
1372	uq = td->td_umtxq;
1373
1374	mtx_assert(&sched_lock, MA_OWNED);
1375	MPASS(TD_ON_UPILOCK(td));
1376
1377	/*
1378	 * Pick up the lock that td is blocked on.
1379	 */
1380	pi = uq->uq_pi_blocked;
1381	MPASS(pi != NULL);
1382
1383	/* Resort the turnstile on the list. */
1384	if (!umtx_pi_adjust_thread(pi, td))
1385		return;
1386
1387	/*
1388	 * If our priority was lowered and we are at the head of the
1389	 * turnstile, then propagate our new priority up the chain.
1390	 */
1391	if (uq == TAILQ_FIRST(&pi->pi_blocked) && UPRI(td) < oldpri)
1392		umtx_propagate_priority(td);
1393}
1394
1395/*
1396 * Sleep on a PI mutex.
1397 */
1398static int
1399umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi,
1400	uint32_t owner, const char *wmesg, int timo)
1401{
1402	struct umtxq_chain *uc;
1403	struct thread *td, *td1;
1404	struct umtx_q *uq1;
1405	int pri;
1406	int error = 0;
1407
1408	td = uq->uq_thread;
1409	KASSERT(td == curthread, ("inconsistent uq_thread"));
1410	uc = umtxq_getchain(&uq->uq_key);
1411	UMTXQ_LOCKED_ASSERT(uc);
1412	umtxq_insert(uq);
1413	if (pi->pi_owner == NULL) {
1414		/* XXX
1415		 * Current, We only support process private PI-mutex,
1416		 * non-contended PI-mutexes are locked in userland.
1417		 * Process shared PI-mutex should always be initialized
1418		 * by kernel and be registered in kernel, locking should
1419		 * always be done by kernel to avoid security problems.
1420		 * For process private PI-mutex, we can find owner
1421		 * thread and boost its priority safely.
1422		 */
1423		PROC_LOCK(curproc);
1424		td1 = thread_find(curproc, owner);
1425		mtx_lock_spin(&sched_lock);
1426		if (td1 != NULL && pi->pi_owner == NULL) {
1427			uq1 = td1->td_umtxq;
1428			umtx_pi_setowner(pi, td1);
1429		}
1430		PROC_UNLOCK(curproc);
1431	} else {
1432		mtx_lock_spin(&sched_lock);
1433	}
1434
1435	TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1436		pri = UPRI(uq1->uq_thread);
1437		if (pri > UPRI(td))
1438			break;
1439	}
1440
1441	if (uq1 != NULL)
1442		TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1443	else
1444		TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1445
1446	uq->uq_pi_blocked = pi;
1447	td->td_flags |= TDF_UPIBLOCKED;
1448	mtx_unlock_spin(&sched_lock);
1449	umtxq_unlock(&uq->uq_key);
1450
1451	mtx_lock_spin(&sched_lock);
1452	umtx_propagate_priority(td);
1453	mtx_unlock_spin(&sched_lock);
1454
1455	umtxq_lock(&uq->uq_key);
1456	if (uq->uq_flags & UQF_UMTXQ) {
1457		error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
1458		if (error == EWOULDBLOCK)
1459			error = ETIMEDOUT;
1460		if (uq->uq_flags & UQF_UMTXQ) {
1461			umtxq_busy(&uq->uq_key);
1462			umtxq_remove(uq);
1463			umtxq_unbusy(&uq->uq_key);
1464		}
1465	}
1466	umtxq_unlock(&uq->uq_key);
1467
1468	mtx_lock_spin(&sched_lock);
1469	uq->uq_pi_blocked = NULL;
1470	td->td_flags &= ~TDF_UPIBLOCKED;
1471	TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1472	umtx_unpropagate_priority(pi);
1473	mtx_unlock_spin(&sched_lock);
1474
1475	umtxq_lock(&uq->uq_key);
1476
1477	return (error);
1478}
1479
1480/*
1481 * Add reference count for a PI mutex.
1482 */
1483static void
1484umtx_pi_ref(struct umtx_pi *pi)
1485{
1486	struct umtxq_chain *uc;
1487
1488	uc = umtxq_getchain(&pi->pi_key);
1489	UMTXQ_LOCKED_ASSERT(uc);
1490	pi->pi_refcount++;
1491}
1492
1493/*
1494 * Decrease reference count for a PI mutex, if the counter
1495 * is decreased to zero, its memory space is freed.
1496 */
1497static void
1498umtx_pi_unref(struct umtx_pi *pi)
1499{
1500	struct umtxq_chain *uc;
1501	int free = 0;
1502
1503	uc = umtxq_getchain(&pi->pi_key);
1504	UMTXQ_LOCKED_ASSERT(uc);
1505	KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
1506	if (--pi->pi_refcount == 0) {
1507		mtx_lock_spin(&sched_lock);
1508		if (pi->pi_owner != NULL) {
1509			TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested,
1510				pi, pi_link);
1511			pi->pi_owner = NULL;
1512		}
1513		KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
1514			("blocked queue not empty"));
1515		mtx_unlock_spin(&sched_lock);
1516		TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
1517		free = 1;
1518	}
1519	if (free)
1520		umtx_pi_free(pi);
1521}
1522
1523/*
1524 * Find a PI mutex in hash table.
1525 */
1526static struct umtx_pi *
1527umtx_pi_lookup(struct umtx_key *key)
1528{
1529	struct umtxq_chain *uc;
1530	struct umtx_pi *pi;
1531
1532	uc = umtxq_getchain(key);
1533	UMTXQ_LOCKED_ASSERT(uc);
1534
1535	TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
1536		if (umtx_key_match(&pi->pi_key, key)) {
1537			return (pi);
1538		}
1539	}
1540	return (NULL);
1541}
1542
1543/*
1544 * Insert a PI mutex into hash table.
1545 */
1546static inline void
1547umtx_pi_insert(struct umtx_pi *pi)
1548{
1549	struct umtxq_chain *uc;
1550
1551	uc = umtxq_getchain(&pi->pi_key);
1552	UMTXQ_LOCKED_ASSERT(uc);
1553	TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
1554}
1555
1556/*
1557 * Lock a PI mutex.
1558 */
1559static int
1560_do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags, int timo,
1561	int try)
1562{
1563	struct umtx_q *uq;
1564	struct umtx_pi *pi, *new_pi;
1565	uint32_t id, owner, old;
1566	int error;
1567
1568	id = td->td_tid;
1569	uq = td->td_umtxq;
1570
1571	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
1572	    &uq->uq_key)) != 0)
1573		return (error);
1574	for (;;) {
1575		pi = NULL;
1576		umtxq_lock(&uq->uq_key);
1577		pi = umtx_pi_lookup(&uq->uq_key);
1578		if (pi == NULL) {
1579			umtxq_unlock(&uq->uq_key);
1580			new_pi = umtx_pi_alloc();
1581			new_pi->pi_key = uq->uq_key;
1582			umtxq_lock(&uq->uq_key);
1583			pi = umtx_pi_lookup(&uq->uq_key);
1584			if (pi != NULL)
1585				umtx_pi_free(new_pi);
1586			else {
1587				umtx_pi_insert(new_pi);
1588				pi = new_pi;
1589			}
1590		}
1591
1592		umtx_pi_ref(pi);
1593		umtxq_unlock(&uq->uq_key);
1594
1595		/*
1596		 * Care must be exercised when dealing with umtx structure.  It
1597		 * can fault on any access.
1598		 */
1599
1600		/*
1601		 * Try the uncontested case.  This should be done in userland.
1602		 */
1603		owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);
1604
1605		/* The acquire succeeded. */
1606		if (owner == UMUTEX_UNOWNED) {
1607			error = 0;
1608			break;
1609		}
1610
1611		/* The address was invalid. */
1612		if (owner == -1) {
1613			error = EFAULT;
1614			break;
1615		}
1616
1617		/* If no one owns it but it is contested try to acquire it. */
1618		if (owner == UMUTEX_CONTESTED) {
1619			owner = casuword32(&m->m_owner,
1620			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
1621
1622			if (owner == UMUTEX_CONTESTED) {
1623				umtxq_lock(&uq->uq_key);
1624				error = umtx_pi_claim(pi, td);
1625				umtxq_unlock(&uq->uq_key);
1626				break;
1627			}
1628
1629			/* The address was invalid. */
1630			if (owner == -1) {
1631				error = EFAULT;
1632				break;
1633			}
1634
1635			/* If this failed the lock has changed, restart. */
1636			umtxq_lock(&uq->uq_key);
1637			umtx_pi_unref(pi);
1638			umtxq_unlock(&uq->uq_key);
1639			pi = NULL;
1640			continue;
1641		}
1642
1643		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
1644		    (owner & ~UMUTEX_CONTESTED) == id) {
1645			error = EDEADLK;
1646			break;
1647		}
1648
1649		if (try != 0) {
1650			error = EBUSY;
1651			break;
1652		}
1653
1654		/*
1655		 * If we caught a signal, we have retried and now
1656		 * exit immediately.
1657		 */
1658		if (error != 0)
1659			break;
1660
1661		umtxq_lock(&uq->uq_key);
1662		umtxq_busy(&uq->uq_key);
1663		umtxq_unlock(&uq->uq_key);
1664
1665		/*
1666		 * Set the contested bit so that a release in user space
1667		 * knows to use the system call for unlock.  If this fails
1668		 * either some one else has acquired the lock or it has been
1669		 * released.
1670		 */
1671		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);
1672
1673		/* The address was invalid. */
1674		if (old == -1) {
1675			umtxq_lock(&uq->uq_key);
1676			umtxq_unbusy(&uq->uq_key);
1677			umtxq_unlock(&uq->uq_key);
1678			error = EFAULT;
1679			break;
1680		}
1681
1682		umtxq_lock(&uq->uq_key);
1683		umtxq_unbusy(&uq->uq_key);
1684		/*
1685		 * We set the contested bit, sleep. Otherwise the lock changed
1686		 * and we need to retry or we lost a race to the thread
1687		 * unlocking the umtx.
1688		 */
1689		if (old == owner)
1690			error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED,
1691				 "umtxpi", timo);
1692		umtx_pi_unref(pi);
1693		umtxq_unlock(&uq->uq_key);
1694		pi = NULL;
1695	}
1696
1697	if (pi != NULL) {
1698		umtxq_lock(&uq->uq_key);
1699		umtx_pi_unref(pi);
1700		umtxq_unlock(&uq->uq_key);
1701	}
1702
1703	umtx_key_release(&uq->uq_key);
1704	return (error);
1705}
1706
1707/*
1708 * Unlock a PI mutex.
1709 */
1710static int
1711do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags)
1712{
1713	struct umtx_key key;
1714	struct umtx_q *uq_first, *uq_first2, *uq_me;
1715	struct umtx_pi *pi, *pi2;
1716	uint32_t owner, old, id;
1717	int error;
1718	int count;
1719	int pri;
1720
1721	id = td->td_tid;
1722	/*
1723	 * Make sure we own this mtx.
1724	 */
1725	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1726	if (owner == -1)
1727		return (EFAULT);
1728
1729	if ((owner & ~UMUTEX_CONTESTED) != id)
1730		return (EPERM);
1731
1732	/* This should be done in userland */
1733	if ((owner & UMUTEX_CONTESTED) == 0) {
1734		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
1735		if (old == -1)
1736			return (EFAULT);
1737		if (old == owner)
1738			return (0);
1739		owner = old;
1740	}
1741
1742	/* We should only ever be in here for contested locks */
1743	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
1744	    &key)) != 0)
1745		return (error);
1746
1747	umtxq_lock(&key);
1748	umtxq_busy(&key);
1749	count = umtxq_count_pi(&key, &uq_first);
1750	if (uq_first != NULL) {
1751		pi = uq_first->uq_pi_blocked;
1752		if (pi->pi_owner != curthread) {
1753			umtxq_unbusy(&key);
1754			umtxq_unlock(&key);
1755			/* userland messed the mutex */
1756			return (EPERM);
1757		}
1758		uq_me = curthread->td_umtxq;
1759		mtx_lock_spin(&sched_lock);
1760		pi->pi_owner = NULL;
1761		TAILQ_REMOVE(&uq_me->uq_pi_contested, pi, pi_link);
1762		uq_first = TAILQ_FIRST(&pi->pi_blocked);
1763		pri = PRI_MAX;
1764		TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
1765			uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
1766			if (uq_first2 != NULL) {
1767				if (pri > UPRI(uq_first2->uq_thread))
1768					pri = UPRI(uq_first2->uq_thread);
1769			}
1770		}
1771		sched_unlend_user_prio(curthread, pri);
1772		mtx_unlock_spin(&sched_lock);
1773	}
1774	umtxq_unlock(&key);
1775
1776	/*
1777	 * When unlocking the umtx, it must be marked as unowned if
1778	 * there is zero or one thread only waiting for it.
1779	 * Otherwise, it must be marked as contested.
1780	 */
1781	old = casuword32(&m->m_owner, owner,
1782		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
1783
1784	umtxq_lock(&key);
1785	if (uq_first != NULL)
1786		umtxq_signal_thread(uq_first);
1787	umtxq_unbusy(&key);
1788	umtxq_unlock(&key);
1789	umtx_key_release(&key);
1790	if (old == -1)
1791		return (EFAULT);
1792	if (old != owner)
1793		return (EINVAL);
1794	return (0);
1795}
1796
1797/*
1798 * Lock a PP mutex.
1799 */
1800static int
1801_do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags, int timo,
1802	int try)
1803{
1804	struct umtx_q *uq, *uq2;
1805	struct umtx_pi *pi;
1806	uint32_t ceiling;
1807	uint32_t owner, id;
1808	int error, pri, old_inherited_pri, su;
1809
1810	id = td->td_tid;
1811	uq = td->td_umtxq;
1812	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
1813	    &uq->uq_key)) != 0)
1814		return (error);
1815	su = (suser(td) == 0);
1816	for (;;) {
1817		old_inherited_pri = uq->uq_inherited_pri;
1818		umtxq_lock(&uq->uq_key);
1819		umtxq_busy(&uq->uq_key);
1820		umtxq_unlock(&uq->uq_key);
1821
1822		ceiling = RTP_PRIO_MAX - fuword32(&m->m_ceilings[0]);
1823		if (ceiling > RTP_PRIO_MAX) {
1824			error = EINVAL;
1825			goto out;
1826		}
1827
1828		mtx_lock_spin(&sched_lock);
1829		if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
1830			mtx_unlock_spin(&sched_lock);
1831			error = EINVAL;
1832			goto out;
1833		}
1834		if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
1835			uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
1836			if (uq->uq_inherited_pri < UPRI(td))
1837				sched_lend_user_prio(td, uq->uq_inherited_pri);
1838		}
1839		mtx_unlock_spin(&sched_lock);
1840
1841		owner = casuword32(&m->m_owner,
1842		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
1843
1844		if (owner == UMUTEX_CONTESTED) {
1845			error = 0;
1846			break;
1847		}
1848
1849		/* The address was invalid. */
1850		if (owner == -1) {
1851			error = EFAULT;
1852			break;
1853		}
1854
1855		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
1856		    (owner & ~UMUTEX_CONTESTED) == id) {
1857			error = EDEADLK;
1858			break;
1859		}
1860
1861		if (try != 0) {
1862			error = EBUSY;
1863			break;
1864		}
1865
1866		/*
1867		 * If we caught a signal, we have retried and now
1868		 * exit immediately.
1869		 */
1870		if (error != 0)
1871			break;
1872
1873		umtxq_lock(&uq->uq_key);
1874		umtxq_insert(uq);
1875		umtxq_unbusy(&uq->uq_key);
1876		error = umtxq_sleep(uq, "umtxpp", timo);
1877		umtxq_remove(uq);
1878		umtxq_unlock(&uq->uq_key);
1879
1880		mtx_lock_spin(&sched_lock);
1881		uq->uq_inherited_pri = old_inherited_pri;
1882		pri = PRI_MAX;
1883		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
1884			uq2 = TAILQ_FIRST(&pi->pi_blocked);
1885			if (uq2 != NULL) {
1886				if (pri > UPRI(uq2->uq_thread))
1887					pri = UPRI(uq2->uq_thread);
1888			}
1889		}
1890		if (pri > uq->uq_inherited_pri)
1891			pri = uq->uq_inherited_pri;
1892		sched_unlend_user_prio(td, pri);
1893		mtx_unlock_spin(&sched_lock);
1894	}
1895
1896	if (error != 0) {
1897		mtx_lock_spin(&sched_lock);
1898		uq->uq_inherited_pri = old_inherited_pri;
1899		pri = PRI_MAX;
1900		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
1901			uq2 = TAILQ_FIRST(&pi->pi_blocked);
1902			if (uq2 != NULL) {
1903				if (pri > UPRI(uq2->uq_thread))
1904					pri = UPRI(uq2->uq_thread);
1905			}
1906		}
1907		if (pri > uq->uq_inherited_pri)
1908			pri = uq->uq_inherited_pri;
1909		sched_unlend_user_prio(td, pri);
1910		mtx_unlock_spin(&sched_lock);
1911	}
1912
1913out:
1914	umtxq_lock(&uq->uq_key);
1915	umtxq_unbusy(&uq->uq_key);
1916	umtxq_unlock(&uq->uq_key);
1917	umtx_key_release(&uq->uq_key);
1918	return (error);
1919}
1920
1921/*
1922 * Unlock a PP mutex.
1923 */
1924static int
1925do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags)
1926{
1927	struct umtx_key key;
1928	struct umtx_q *uq, *uq2;
1929	struct umtx_pi *pi;
1930	uint32_t owner, id;
1931	uint32_t rceiling;
1932	int error, pri, new_inherited_pri, su;
1933
1934	id = td->td_tid;
1935	uq = td->td_umtxq;
1936	su = (suser(td) == 0);
1937
1938	/*
1939	 * Make sure we own this mtx.
1940	 */
1941	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1942	if (owner == -1)
1943		return (EFAULT);
1944
1945	if ((owner & ~UMUTEX_CONTESTED) != id)
1946		return (EPERM);
1947
1948	error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
1949	if (error != 0)
1950		return (error);
1951
1952	if (rceiling == -1)
1953		new_inherited_pri = PRI_MAX;
1954	else {
1955		rceiling = RTP_PRIO_MAX - rceiling;
1956		if (rceiling > RTP_PRIO_MAX)
1957			return (EINVAL);
1958		new_inherited_pri = PRI_MIN_REALTIME + rceiling;
1959	}
1960
1961	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
1962	    &key)) != 0)
1963		return (error);
1964	umtxq_lock(&key);
1965	umtxq_busy(&key);
1966	umtxq_unlock(&key);
1967	/*
1968	 * For priority protected mutex, always set unlocked state
1969	 * to UMUTEX_CONTESTED, so that userland always enters kernel
1970	 * to lock the mutex, it is necessary because thread priority
1971	 * has to be adjusted for such mutex.
1972	 */
1973	error = suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
1974		UMUTEX_CONTESTED);
1975
1976	umtxq_lock(&key);
1977	if (error == 0)
1978		umtxq_signal(&key, 1);
1979	umtxq_unbusy(&key);
1980	umtxq_unlock(&key);
1981
1982	if (error == -1)
1983		error = EFAULT;
1984	else {
1985		mtx_lock_spin(&sched_lock);
1986		if (su != 0)
1987			uq->uq_inherited_pri = new_inherited_pri;
1988		pri = PRI_MAX;
1989		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
1990			uq2 = TAILQ_FIRST(&pi->pi_blocked);
1991			if (uq2 != NULL) {
1992				if (pri > UPRI(uq2->uq_thread))
1993					pri = UPRI(uq2->uq_thread);
1994			}
1995		}
1996		if (pri > uq->uq_inherited_pri)
1997			pri = uq->uq_inherited_pri;
1998		sched_unlend_user_prio(td, pri);
1999		mtx_unlock_spin(&sched_lock);
2000	}
2001	umtx_key_release(&key);
2002	return (error);
2003}
2004
/*
 * Change the priority ceiling of a PP mutex.
 *
 * The ceiling may only be modified while holding the mutex, so the
 * caller acquires it (or waits until it can), updates
 * m_ceilings[0] and releases it again; the previous ceiling is
 * written back through 'old_ceiling' on success.
 */
static int
do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
	uint32_t *old_ceiling)
{
	struct umtx_q *uq;
	uint32_t save_ceiling;
	uint32_t owner, id;
	uint32_t flags;
	int error;

	flags = fuword32(&m->m_flags);
	if ((flags & UMUTEX_PRIO_PROTECT) == 0)
		return (EINVAL);
	if (ceiling > RTP_PRIO_MAX)
		return (EINVAL);
	id = td->td_tid;
	uq = td->td_umtxq;
	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
	   &uq->uq_key)) != 0)
		return (error);
	for (;;) {
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		save_ceiling = fuword32(&m->m_ceilings[0]);

		owner = casuword32(&m->m_owner,
		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

		if (owner == UMUTEX_CONTESTED) {
			/* Took the mutex: update ceiling, release it. */
			suword32(&m->m_ceilings[0], ceiling);
			suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
				UMUTEX_CONTESTED);
			error = 0;
			break;
		}

		/* The address was invalid. */
		if (owner == -1) {
			error = EFAULT;
			break;
		}

		/* Already the owner: just update the ceiling. */
		if ((owner & ~UMUTEX_CONTESTED) == id) {
			suword32(&m->m_ceilings[0], ceiling);
			error = 0;
			break;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			break;

		/*
		 * We set the contested bit, sleep. Otherwise the lock changed
		 * and we need to retry or we lost a race to the thread
		 * unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		error = umtxq_sleep(uq, "umtxpp", 0);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	}
	umtxq_lock(&uq->uq_key);
	if (error == 0)
		umtxq_signal(&uq->uq_key, INT_MAX);
	umtxq_unbusy(&uq->uq_key);
	umtxq_unlock(&uq->uq_key);
	umtx_key_release(&uq->uq_key);
	if (error == 0 && old_ceiling != NULL)
		suword32(old_ceiling, save_ceiling);
	return (error);
}
2084
2085static int
2086_do_lock_umutex(struct thread *td, struct umutex *m, int flags, int timo,
2087	int try)
2088{
2089	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2090	case 0:
2091		return (_do_lock_normal(td, m, flags, timo, try));
2092	case UMUTEX_PRIO_INHERIT:
2093		return (_do_lock_pi(td, m, flags, timo, try));
2094	case UMUTEX_PRIO_PROTECT:
2095		return (_do_lock_pp(td, m, flags, timo, try));
2096	}
2097	return (EINVAL);
2098}
2099
2100/*
2101 * Lock a userland POSIX mutex.
2102 */
2103static int
2104do_lock_umutex(struct thread *td, struct umutex *m,
2105	struct timespec *timeout, int try)
2106{
2107	struct timespec ts, ts2, ts3;
2108	struct timeval tv;
2109	uint32_t flags;
2110	int error;
2111
2112	flags = fuword32(&m->m_flags);
2113	if (flags == -1)
2114		return (EFAULT);
2115
2116	if (timeout == NULL) {
2117		error = _do_lock_umutex(td, m, flags, 0, try);
2118		/* Mutex locking is restarted if it is interrupted. */
2119		if (error == EINTR)
2120			error = ERESTART;
2121	} else {
2122		getnanouptime(&ts);
2123		timespecadd(&ts, timeout);
2124		TIMESPEC_TO_TIMEVAL(&tv, timeout);
2125		for (;;) {
2126			error = _do_lock_umutex(td, m, flags, tvtohz(&tv), try);
2127			if (error != ETIMEDOUT)
2128				break;
2129			getnanouptime(&ts2);
2130			if (timespeccmp(&ts2, &ts, >=)) {
2131				error = ETIMEDOUT;
2132				break;
2133			}
2134			ts3 = ts;
2135			timespecsub(&ts3, &ts2);
2136			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
2137		}
2138		/* Timed-locking is not restarted. */
2139		if (error == ERESTART)
2140			error = EINTR;
2141	}
2142	return (error);
2143}
2144
2145/*
2146 * Unlock a userland POSIX mutex.
2147 */
2148static int
2149do_unlock_umutex(struct thread *td, struct umutex *m)
2150{
2151	uint32_t flags;
2152
2153	flags = fuword32(&m->m_flags);
2154	if (flags == -1)
2155		return (EFAULT);
2156
2157	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2158	case 0:
2159		return (do_unlock_normal(td, m, flags));
2160	case UMUTEX_PRIO_INHERIT:
2161		return (do_unlock_pi(td, m, flags));
2162	case UMUTEX_PRIO_PROTECT:
2163		return (do_unlock_pp(td, m, flags));
2164	}
2165
2166	return (EINVAL);
2167}
2168
2169int
2170_umtx_lock(struct thread *td, struct _umtx_lock_args *uap)
2171    /* struct umtx *umtx */
2172{
2173	return _do_lock_umtx(td, uap->umtx, td->td_tid, 0);
2174}
2175
2176int
2177_umtx_unlock(struct thread *td, struct _umtx_unlock_args *uap)
2178    /* struct umtx *umtx */
2179{
2180	return do_unlock_umtx(td, uap->umtx, td->td_tid);
2181}
2182
2183static int
2184__umtx_op_lock_umtx(struct thread *td, struct _umtx_op_args *uap)
2185{
2186	struct timespec *ts, timeout;
2187	int error;
2188
2189	/* Allow a null timespec (wait forever). */
2190	if (uap->uaddr2 == NULL)
2191		ts = NULL;
2192	else {
2193		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2194		if (error != 0)
2195			return (error);
2196		if (timeout.tv_nsec >= 1000000000 ||
2197		    timeout.tv_nsec < 0) {
2198			return (EINVAL);
2199		}
2200		ts = &timeout;
2201	}
2202	return (do_lock_umtx(td, uap->obj, uap->val, ts));
2203}
2204
2205static int
2206__umtx_op_unlock_umtx(struct thread *td, struct _umtx_op_args *uap)
2207{
2208	return (do_unlock_umtx(td, uap->obj, uap->val));
2209}
2210
2211static int
2212__umtx_op_wait(struct thread *td, struct _umtx_op_args *uap)
2213{
2214	struct timespec *ts, timeout;
2215	int error;
2216
2217	if (uap->uaddr2 == NULL)
2218		ts = NULL;
2219	else {
2220		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2221		if (error != 0)
2222			return (error);
2223		if (timeout.tv_nsec >= 1000000000 ||
2224		    timeout.tv_nsec < 0)
2225			return (EINVAL);
2226		ts = &timeout;
2227	}
2228	return do_wait(td, uap->obj, uap->val, ts, 0);
2229}
2230
2231static int
2232__umtx_op_wake(struct thread *td, struct _umtx_op_args *uap)
2233{
2234	return (kern_umtx_wake(td, uap->obj, uap->val));
2235}
2236
2237static int
2238__umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap)
2239{
2240	struct timespec *ts, timeout;
2241	int error;
2242
2243	/* Allow a null timespec (wait forever). */
2244	if (uap->uaddr2 == NULL)
2245		ts = NULL;
2246	else {
2247		error = copyin(uap->uaddr2, &timeout,
2248		    sizeof(timeout));
2249		if (error != 0)
2250			return (error);
2251		if (timeout.tv_nsec >= 1000000000 ||
2252		    timeout.tv_nsec < 0) {
2253			return (EINVAL);
2254		}
2255		ts = &timeout;
2256	}
2257	return do_lock_umutex(td, uap->obj, ts, 0);
2258}
2259
2260static int
2261__umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap)
2262{
2263	return do_lock_umutex(td, uap->obj, NULL, 1);
2264}
2265
2266static int
2267__umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap)
2268{
2269	return do_unlock_umutex(td, uap->obj);
2270}
2271
2272static int
2273__umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap)
2274{
2275	return do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1);
2276}
2277
typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap);

/*
 * Dispatch table for _umtx_op(); indexed by the UMTX_OP_* code,
 * so the entry order must match the UMTX_OP_* definitions.
 */
static _umtx_op_func op_table[] = {
	__umtx_op_lock_umtx,		/* UMTX_OP_LOCK */
	__umtx_op_unlock_umtx,		/* UMTX_OP_UNLOCK */
	__umtx_op_wait,			/* UMTX_OP_WAIT */
	__umtx_op_wake,			/* UMTX_OP_WAKE */
	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
	__umtx_op_lock_umutex,		/* UMTX_OP_MUTEX_LOCK */
	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
	__umtx_op_set_ceiling		/* UMTX_OP_SET_CEILING */
};
2290
2291int
2292_umtx_op(struct thread *td, struct _umtx_op_args *uap)
2293{
2294	if ((unsigned)uap->op < UMTX_OP_MAX)
2295		return (*op_table[uap->op])(td, uap);
2296	return (EINVAL);
2297}
2298
2299#ifdef COMPAT_IA32
2300
2301int
2302freebsd32_umtx_lock(struct thread *td, struct freebsd32_umtx_lock_args *uap)
2303    /* struct umtx *umtx */
2304{
2305	return (do_lock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid, NULL));
2306}
2307
2308int
2309freebsd32_umtx_unlock(struct thread *td, struct freebsd32_umtx_unlock_args *uap)
2310    /* struct umtx *umtx */
2311{
2312	return (do_unlock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid));
2313}
2314
/*
 * 32-bit layout of struct timespec used by COMPAT_IA32 requests.
 * NOTE(review): both fields are unsigned, so negative 32-bit values
 * from userland arrive as large positive numbers; the callers'
 * tv_nsec range checks still reject them, but a negative tv_sec
 * cannot be represented here -- confirm this matches the intended
 * 32-bit ABI.
 */
struct timespec32 {
	u_int32_t tv_sec;
	u_int32_t tv_nsec;
};
2319
2320static inline int
2321copyin_timeout32(void *addr, struct timespec *tsp)
2322{
2323	struct timespec32 ts32;
2324	int error;
2325
2326	error = copyin(addr, &ts32, sizeof(struct timespec32));
2327	if (error == 0) {
2328		tsp->tv_sec = ts32.tv_sec;
2329		tsp->tv_nsec = ts32.tv_nsec;
2330	}
2331	return (error);
2332}
2333
2334static int
2335__umtx_op_lock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
2336{
2337	struct timespec *ts, timeout;
2338	int error;
2339
2340	/* Allow a null timespec (wait forever). */
2341	if (uap->uaddr2 == NULL)
2342		ts = NULL;
2343	else {
2344		error = copyin_timeout32(uap->uaddr2, &timeout);
2345		if (error != 0)
2346			return (error);
2347		if (timeout.tv_nsec >= 1000000000 ||
2348		    timeout.tv_nsec < 0) {
2349			return (EINVAL);
2350		}
2351		ts = &timeout;
2352	}
2353	return (do_lock_umtx32(td, uap->obj, uap->val, ts));
2354}
2355
2356static int
2357__umtx_op_unlock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
2358{
2359	return (do_unlock_umtx32(td, uap->obj, (uint32_t)uap->val));
2360}
2361
2362static int
2363__umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
2364{
2365	struct timespec *ts, timeout;
2366	int error;
2367
2368	if (uap->uaddr2 == NULL)
2369		ts = NULL;
2370	else {
2371		error = copyin_timeout32(uap->uaddr2, &timeout);
2372		if (error != 0)
2373			return (error);
2374		if (timeout.tv_nsec >= 1000000000 ||
2375		    timeout.tv_nsec < 0)
2376			return (EINVAL);
2377		ts = &timeout;
2378	}
2379	return do_wait(td, uap->obj, uap->val, ts, 1);
2380}
2381
2382static int
2383__umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
2384{
2385	struct timespec *ts, timeout;
2386	int error;
2387
2388	/* Allow a null timespec (wait forever). */
2389	if (uap->uaddr2 == NULL)
2390		ts = NULL;
2391	else {
2392		error = copyin_timeout32(uap->uaddr2, &timeout);
2393		if (error != 0)
2394			return (error);
2395		if (timeout.tv_nsec >= 1000000000 ||
2396		    timeout.tv_nsec < 0)
2397			return (EINVAL);
2398		ts = &timeout;
2399	}
2400	return do_lock_umutex(td, uap->obj, ts, 0);
2401}
2402
/*
 * Dispatch table for freebsd32_umtx_op(); indexed by UMTX_OP_*.
 * The function order mirrors op_table (trylock at
 * UMTX_OP_MUTEX_TRYLOCK, lock at UMTX_OP_MUTEX_LOCK); the two
 * entry comments were previously swapped.
 */
static _umtx_op_func op_table_compat32[] = {
	__umtx_op_lock_umtx_compat32,	/* UMTX_OP_LOCK */
	__umtx_op_unlock_umtx_compat32,	/* UMTX_OP_UNLOCK */
	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT */
	__umtx_op_wake,			/* UMTX_OP_WAKE */
	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
	__umtx_op_lock_umutex_compat32,	/* UMTX_OP_MUTEX_LOCK */
	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK	*/
	__umtx_op_set_ceiling		/* UMTX_OP_SET_CEILING */
};
2413
2414int
2415freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap)
2416{
2417	if ((unsigned)uap->op < UMTX_OP_MAX)
2418		return (*op_table_compat32[uap->op])(td,
2419			(struct _umtx_op_args *)uap);
2420	return (EINVAL);
2421}
2422#endif
2423
2424void
2425umtx_thread_init(struct thread *td)
2426{
2427	td->td_umtxq = umtxq_alloc();
2428	td->td_umtxq->uq_thread = td;
2429}
2430
2431void
2432umtx_thread_fini(struct thread *td)
2433{
2434	umtxq_free(td->td_umtxq);
2435}
2436
2437/*
2438 * It will be called when new thread is created, e.g fork().
2439 */
2440void
2441umtx_thread_alloc(struct thread *td)
2442{
2443	struct umtx_q *uq;
2444
2445	uq = td->td_umtxq;
2446	uq->uq_inherited_pri = PRI_MAX;
2447
2448	KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
2449	KASSERT(uq->uq_thread == td, ("uq_thread != td"));
2450	KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
2451	KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
2452}
2453
2454/*
2455 * exec() hook.
2456 */
2457static void
2458umtx_exec_hook(void *arg __unused, struct proc *p __unused,
2459	struct image_params *imgp __unused)
2460{
2461	umtx_thread_cleanup(curthread);
2462}
2463
2464/*
2465 * thread_exit() hook.
2466 */
2467void
2468umtx_thread_exit(struct thread *td)
2469{
2470	umtx_thread_cleanup(td);
2471}
2472
2473/*
2474 * clean up umtx data.
2475 */
2476static void
2477umtx_thread_cleanup(struct thread *td)
2478{
2479	struct umtx_q *uq;
2480	struct umtx_pi *pi;
2481
2482	if ((uq = td->td_umtxq) == NULL)
2483		return;
2484
2485	mtx_lock_spin(&sched_lock);
2486	uq->uq_inherited_pri = PRI_MAX;
2487	while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
2488		pi->pi_owner = NULL;
2489		TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
2490	}
2491	td->td_flags &= ~TDF_UBORROWING;
2492	mtx_unlock_spin(&sched_lock);
2493}
2494