kern_umtx.c revision 162030
1/*-
2 * Copyright (c) 2004, David Xu <davidxu@freebsd.org>
3 * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice unmodified, this list of conditions, and the following
11 *    disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28#include <sys/cdefs.h>
29__FBSDID("$FreeBSD: head/sys/kern/kern_umtx.c 162030 2006-09-05 12:01:09Z davidxu $");
30
31#include <sys/param.h>
32#include <sys/kernel.h>
33#include <sys/limits.h>
34#include <sys/lock.h>
35#include <sys/malloc.h>
36#include <sys/mutex.h>
37#include <sys/proc.h>
38#include <sys/sched.h>
39#include <sys/sysctl.h>
40#include <sys/sysent.h>
41#include <sys/systm.h>
42#include <sys/sysproto.h>
43#include <sys/eventhandler.h>
44#include <sys/umtx.h>
45
46#include <vm/vm.h>
47#include <vm/vm_param.h>
48#include <vm/pmap.h>
49#include <vm/vm_map.h>
50#include <vm/vm_object.h>
51
52#define TYPE_SIMPLE_LOCK	0
53#define TYPE_SIMPLE_WAIT	1
54#define TYPE_NORMAL_UMUTEX	2
55#define TYPE_PI_UMUTEX		3
56#define TYPE_PP_UMUTEX		4
57#define TYPE_CV			5
58
/*
 * Key to represent a unique userland synchronous object.
 * A key identifies the object independently of which process maps it,
 * so two processes sharing a mutex hash to the same wait-queue chain.
 */
struct umtx_key {
	int	hash;		/* Chain index, filled in by umtxq_hash(). */
	int	type;		/* One of the TYPE_* object classes. */
	int	shared;		/* Non-zero: info.shared holds a VM object ref. */
	union {
		/* Shared mapping: identified by backing object + offset. */
		struct {
			vm_object_t	object;
			uintptr_t	offset;
		} shared;
		/* Private mapping: identified by vmspace + user address. */
		struct {
			struct vmspace	*vs;
			uintptr_t	addr;
		} private;
		/* Type-punned view used by umtxq_hash()/umtx_key_match(). */
		struct {
			void		*a;
			uintptr_t	b;
		} both;
	} info;
};
79
/* Priority inheritance mutex info. */
struct umtx_pi {
	/* Owner thread; NULL while ownership is unknown. */
	struct thread		*pi_owner;

	/* Reference count, protected by the chain lock. */
	int			pi_refcount;

 	/* List entry to link umtx held by a thread (uq_pi_contested). */
	TAILQ_ENTRY(umtx_pi)	pi_link;

	/* List entry in the chain's uc_pi_list hash. */
	TAILQ_ENTRY(umtx_pi)	pi_hashlink;

	/* Waiters blocked on this mutex, kept sorted by priority. */
	TAILQ_HEAD(,umtx_q)	pi_blocked;

	/* Identifies the userland lock object this record shadows. */
	struct umtx_key		pi_key;
};
100
/* A userland synchronous object user; one per thread (td_umtxq). */
struct umtx_q {
	/* Linked list entry for the chain's wait queue. */
	TAILQ_ENTRY(umtx_q)	uq_link;

	/* Key of the object the thread is currently queued on. */
	struct umtx_key		uq_key;

	/* Umtx flags. */
	int			uq_flags;
#define UQF_UMTXQ	0x0001	/* Thread is linked on a chain queue. */

	/* Back pointer to the owning thread. */
	struct thread		*uq_thread;

	/*
	 * PI mutex the thread is blocked on, if any.  Readers may hold
	 * either the chain lock or sched_lock; writers must hold both.
	 */
	struct umtx_pi		*uq_pi_blocked;

	/* Entry on the PI mutex's pi_blocked list. */
	TAILQ_ENTRY(umtx_q)	uq_lockq;

	/* PI mutexes this thread owns that have waiters. */
	TAILQ_HEAD(,umtx_pi)	uq_pi_contested;

	/* Priority inherited from PP mutex ceilings (PRI_MAX = none). */
	u_char			uq_inherited_pri;
};
132
133TAILQ_HEAD(umtxq_head, umtx_q);
134
/* Userland lock object's wait-queue chain (one hash bucket). */
struct umtxq_chain {
	/* Lock for this chain; protects every field below. */
	struct mtx		uc_lock;

	/* List of sleeping waiters for all keys hashing here. */
	struct umtxq_head	uc_queue;

	/* Busy flag: set while an operation may block (see umtxq_busy). */
	char			uc_busy;

	/* Number of threads sleeping for the busy flag to clear. */
	int			uc_waiters;

	/* All PI mutex records hashing to this chain. */
	TAILQ_HEAD(,umtx_pi)	uc_pi_list;
};
152
153#define	UMTXQ_LOCKED_ASSERT(uc)		mtx_assert(&(uc)->uc_lock, MA_OWNED)
154
/*
 * Do not propagate time-sharing priority.  There is a security reason:
 * a user could create a PI-mutex, have thread A lock it, and have
 * thread B block on it.  Because B is sleeping, its priority rises,
 * and that boost would propagate to A and never be lowered — even if
 * A spins at 100% CPU — which is unfair to other processes.
 */
163
164#define UPRI(td)	(((td)->td_ksegrp->kg_user_pri >= PRI_MIN_TIMESHARE &&\
165			  (td)->td_ksegrp->kg_user_pri <= PRI_MAX_TIMESHARE) ?\
166			 PRI_MAX_TIMESHARE : (td)->td_ksegrp->kg_user_pri)
167
168#define	GOLDEN_RATIO_PRIME	2654404609U
169#define	UMTX_CHAINS		128
170#define	UMTX_SHIFTS		(__WORD_BIT - 7)
171
172#define THREAD_SHARE		0
173#define PROCESS_SHARE		1
174#define AUTO_SHARE		2
175
176#define	GET_SHARE(flags)	\
177    (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)
178
179static uma_zone_t		umtx_pi_zone;
180static struct umtxq_chain	umtxq_chains[UMTX_CHAINS];
181static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
182static int			umtx_pi_allocated;
183
184SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
185SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
186    &umtx_pi_allocated, 0, "Allocated umtx_pi");
187
188static void umtxq_sysinit(void *);
189static void umtxq_hash(struct umtx_key *key);
190static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
191static void umtxq_lock(struct umtx_key *key);
192static void umtxq_unlock(struct umtx_key *key);
193static void umtxq_busy(struct umtx_key *key);
194static void umtxq_unbusy(struct umtx_key *key);
195static void umtxq_insert(struct umtx_q *uq);
196static void umtxq_remove(struct umtx_q *uq);
197static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo);
198static int umtxq_count(struct umtx_key *key);
199static int umtxq_signal(struct umtx_key *key, int nr_wakeup);
200static int umtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2);
201static int umtx_key_get(void *addr, int type, int share,
202	struct umtx_key *key);
203static void umtx_key_release(struct umtx_key *key);
204static struct umtx_pi *umtx_pi_alloc(void);
205static void umtx_pi_free(struct umtx_pi *pi);
206static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags);
207static void umtx_thread_cleanup(struct thread *td);
208static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
209	struct image_params *imgp __unused);
210SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);
211
static void
umtxq_sysinit(void *arg __unused)
{
	int i;

	/* UMA zone backing struct umtx_pi allocations. */
	umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	/*
	 * NOTE(review): MTX_DUPOK implies more than one chain lock can
	 * be held at a time somewhere — confirm which paths do that.
	 */
	for (i = 0; i < UMTX_CHAINS; ++i) {
		mtx_init(&umtxq_chains[i].uc_lock, "umtxql", NULL,
			 MTX_DEF | MTX_DUPOK);
		TAILQ_INIT(&umtxq_chains[i].uc_queue);
		TAILQ_INIT(&umtxq_chains[i].uc_pi_list);
		umtxq_chains[i].uc_busy = 0;
		umtxq_chains[i].uc_waiters = 0;
	}
	/* Run umtx_exec_hook whenever a process execs. */
	EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
	    EVENTHANDLER_PRI_ANY);
}
230
231struct umtx_q *
232umtxq_alloc(void)
233{
234	struct umtx_q *uq;
235
236	uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
237	TAILQ_INIT(&uq->uq_pi_contested);
238	uq->uq_inherited_pri = PRI_MAX;
239	return (uq);
240}
241
242void
243umtxq_free(struct umtx_q *uq)
244{
245	free(uq, M_UMTX);
246}
247
248static inline void
249umtxq_hash(struct umtx_key *key)
250{
251	unsigned n = (uintptr_t)key->info.both.a + key->info.both.b;
252	key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
253}
254
255static inline int
256umtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2)
257{
258	return (k1->type == k2->type &&
259		k1->info.both.a == k2->info.both.a &&
260	        k1->info.both.b == k2->info.both.b);
261}
262
263static inline struct umtxq_chain *
264umtxq_getchain(struct umtx_key *key)
265{
266	return (&umtxq_chains[key->hash]);
267}
268
269/*
270 * Set chain to busy state when following operation
271 * may be blocked (kernel mutex can not be used).
272 */
273static inline void
274umtxq_busy(struct umtx_key *key)
275{
276	struct umtxq_chain *uc;
277
278	uc = umtxq_getchain(key);
279	mtx_assert(&uc->uc_lock, MA_OWNED);
280	while (uc->uc_busy != 0) {
281		uc->uc_waiters++;
282		msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
283		uc->uc_waiters--;
284	}
285	uc->uc_busy = 1;
286}
287
288/*
289 * Unbusy a chain.
290 */
291static inline void
292umtxq_unbusy(struct umtx_key *key)
293{
294	struct umtxq_chain *uc;
295
296	uc = umtxq_getchain(key);
297	mtx_assert(&uc->uc_lock, MA_OWNED);
298	KASSERT(uc->uc_busy != 0, ("not busy"));
299	uc->uc_busy = 0;
300	if (uc->uc_waiters)
301		wakeup_one(uc);
302}
303
304/*
305 * Lock a chain.
306 */
307static inline void
308umtxq_lock(struct umtx_key *key)
309{
310	struct umtxq_chain *uc;
311
312	uc = umtxq_getchain(key);
313	mtx_lock(&uc->uc_lock);
314}
315
316/*
317 * Unlock a chain.
318 */
319static inline void
320umtxq_unlock(struct umtx_key *key)
321{
322	struct umtxq_chain *uc;
323
324	uc = umtxq_getchain(key);
325	mtx_unlock(&uc->uc_lock);
326}
327
328/*
329 * Insert a thread onto the umtx queue.
330 */
331static inline void
332umtxq_insert(struct umtx_q *uq)
333{
334	struct umtxq_chain *uc;
335
336	uc = umtxq_getchain(&uq->uq_key);
337	UMTXQ_LOCKED_ASSERT(uc);
338	TAILQ_INSERT_TAIL(&uc->uc_queue, uq, uq_link);
339	uq->uq_flags |= UQF_UMTXQ;
340}
341
342/*
343 * Remove thread from the umtx queue.
344 */
345static inline void
346umtxq_remove(struct umtx_q *uq)
347{
348	struct umtxq_chain *uc;
349
350	uc = umtxq_getchain(&uq->uq_key);
351	UMTXQ_LOCKED_ASSERT(uc);
352	if (uq->uq_flags & UQF_UMTXQ) {
353		TAILQ_REMOVE(&uc->uc_queue, uq, uq_link);
354		uq->uq_flags &= ~UQF_UMTXQ;
355	}
356}
357
358/*
359 * Check if there are multiple waiters
360 */
361static int
362umtxq_count(struct umtx_key *key)
363{
364	struct umtxq_chain *uc;
365	struct umtx_q *uq;
366	int count = 0;
367
368	uc = umtxq_getchain(key);
369	UMTXQ_LOCKED_ASSERT(uc);
370	TAILQ_FOREACH(uq, &uc->uc_queue, uq_link) {
371		if (umtx_key_match(&uq->uq_key, key)) {
372			if (++count > 1)
373				break;
374		}
375	}
376	return (count);
377}
378
379/*
380 * Check if there are multiple PI waiters and returns first
381 * waiter.
382 */
383static int
384umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
385{
386	struct umtxq_chain *uc;
387	struct umtx_q *uq;
388	int count = 0;
389
390	*first = NULL;
391	uc = umtxq_getchain(key);
392	UMTXQ_LOCKED_ASSERT(uc);
393	TAILQ_FOREACH(uq, &uc->uc_queue, uq_link) {
394		if (umtx_key_match(&uq->uq_key, key)) {
395			if (++count > 1)
396				break;
397			*first = uq;
398		}
399	}
400	return (count);
401}
402
403/*
404 * Wake up threads waiting on an userland object.
405 */
406static int
407umtxq_signal(struct umtx_key *key, int n_wake)
408{
409	struct umtxq_chain *uc;
410	struct umtx_q *uq, *next;
411	int ret;
412
413	ret = 0;
414	uc = umtxq_getchain(key);
415	UMTXQ_LOCKED_ASSERT(uc);
416	TAILQ_FOREACH_SAFE(uq, &uc->uc_queue, uq_link, next) {
417		if (umtx_key_match(&uq->uq_key, key)) {
418			umtxq_remove(uq);
419			wakeup(uq);
420			if (++ret >= n_wake)
421				break;
422		}
423	}
424	return (ret);
425}
426
427/*
428 * Wake up specified thread.
429 */
430static inline void
431umtxq_signal_thread(struct umtx_q *uq)
432{
433	struct umtxq_chain *uc;
434
435	uc = umtxq_getchain(&uq->uq_key);
436	UMTXQ_LOCKED_ASSERT(uc);
437	umtxq_remove(uq);
438	wakeup(uq);
439}
440
/*
 * Put the thread to sleep on its queue entry.  Before sleeping, check
 * whether the thread was already removed from the umtx queue (woken by
 * a signaller between queueing and this call) and skip the sleep.
 * Returns 0 on wakeup, ETIMEDOUT on timeout, or a signal error.
 */
static inline int
umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo)
{
	struct umtxq_chain *uc;
	int error;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	/* Already dequeued: the wakeup beat us; no need to sleep. */
	if (!(uq->uq_flags & UQF_UMTXQ))
		return (0);
	/* PCATCH: signals interrupt the sleep (EINTR/ERESTART). */
	error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
	/* Normalize msleep's timeout code to the umtx convention. */
	if (error == EWOULDBLOCK)
		error = ETIMEDOUT;
	return (error);
}
460
/*
 * Convert a userspace address into a unique logical key.
 *
 * THREAD_SHARE keys are (vmspace, address) pairs.  PROCESS_SHARE and
 * AUTO_SHARE consult the VM map: a shared mapping yields an
 * (object, offset) key with a reference held on the object (released
 * by umtx_key_release()); AUTO_SHARE falls back to a private key for
 * non-shared mappings.  Returns 0 or EFAULT if the address is unmapped.
 */
static int
umtx_key_get(void *addr, int type, int share, struct umtx_key *key)
{
	struct thread *td = curthread;
	vm_map_t map;
	vm_map_entry_t entry;
	vm_pindex_t pindex;
	vm_prot_t prot;
	boolean_t wired;

	key->type = type;
	if (share == THREAD_SHARE) {
		key->shared = 0;
		key->info.private.vs = td->td_proc->p_vmspace;
		key->info.private.addr = (uintptr_t)addr;
	} else if (share == PROCESS_SHARE || share == AUTO_SHARE) {
		map = &td->td_proc->p_vmspace->vm_map;
		/* VM_PROT_WRITE also validates that the page is writable. */
		if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
		    &entry, &key->info.shared.object, &pindex, &prot,
		    &wired) != KERN_SUCCESS) {
			return EFAULT;
		}

		if ((share == PROCESS_SHARE) ||
		    (share == AUTO_SHARE &&
		     VM_INHERIT_SHARE == entry->inheritance)) {
			key->shared = 1;
			/*
			 * NOTE(review): offset is entry->offset +
			 * entry->start - addr; verify the sign convention
			 * against other users of info.shared.offset.
			 */
			key->info.shared.offset = entry->offset + entry->start -
				(vm_offset_t)addr;
			/* Keep the object alive while the key is in use. */
			vm_object_reference(key->info.shared.object);
		} else {
			key->shared = 0;
			key->info.private.vs = td->td_proc->p_vmspace;
			key->info.private.addr = (uintptr_t)addr;
		}
		vm_map_lookup_done(map, entry);
	}

	umtxq_hash(key);
	return (0);
}
505
506/*
507 * Release key.
508 */
509static inline void
510umtx_key_release(struct umtx_key *key)
511{
512	if (key->shared)
513		vm_object_deallocate(key->info.shared.object);
514}
515
/*
 * Lock a umtx object, sleeping up to "timo" ticks per attempt.
 * "id" is the caller's ownership token stored into u_owner.
 * Returns 0 on success, EFAULT on a bad address, or a sleep error.
 */
static int
_do_lock(struct thread *td, struct umtx *umtx, uintptr_t id, int timo)
{
	struct umtx_q *uq;
	intptr_t owner;
	intptr_t old;
	int error = 0;

	uq = td->td_umtxq;

	/*
	 * Care must be exercised when dealing with umtx structure. It
	 * can fault on any access.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuptr((intptr_t *)&umtx->u_owner, UMTX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMTX_UNOWNED)
			return (0);

		/* The address was invalid. */
		if (owner == -1)
			return (EFAULT);

		/* If no one owns it but it is contested try to acquire it. */
		if (owner == UMTX_CONTESTED) {
			owner = casuptr((intptr_t *)&umtx->u_owner,
			    UMTX_CONTESTED, id | UMTX_CONTESTED);

			if (owner == UMTX_CONTESTED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			/* If this failed the lock has changed, restart. */
			continue;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK,
			AUTO_SHARE, &uq->uq_key)) != 0)
			return (error);

		/* Queue first so a concurrent unlock cannot miss us. */
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either some one else has acquired the lock or it has been
		 * released.
		 */
		old = casuptr((intptr_t *)&umtx->u_owner, owner,
		    owner | UMTX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * We set the contested bit, sleep. Otherwise the lock changed
		 * and we need to retry or we lost a race to the thread
		 * unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtx", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);
	}

	/* Not reached: the loop only exits via return. */
	return (0);
}
613
/*
 * Lock a umtx object, with an optional relative timeout.
 * Untimed locks are restartable after signals; timed locks convert
 * ERESTART back to EINTR so the remaining time is not re-armed.
 */
static int
do_lock(struct thread *td, struct umtx *umtx, uintptr_t id,
	struct timespec *timeout)
{
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	int error;

	if (timeout == NULL) {
		error = _do_lock(td, umtx, id, 0);
		/* Mutex locking is restarted if it is interrupted. */
		if (error == EINTR)
			error = ERESTART;
	} else {
		/* ts becomes the absolute (uptime-based) deadline. */
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
		for (;;) {
			error = _do_lock(td, umtx, id, tvtohz(&tv));
			if (error != ETIMEDOUT)
				break;
			/* Timed out this round; recompute the remainder. */
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				break;
			}
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
		}
		/* Timed-locking is not restarted. */
		if (error == ERESTART)
			error = EINTR;
	}
	return (error);
}
653
/*
 * Unlock a umtx object owned by the caller (token "id").
 * Returns EPERM if the caller does not own it, EFAULT on a bad
 * address, EINVAL if the word changed underneath us.
 */
static int
do_unlock(struct thread *td, struct umtx *umtx, uintptr_t id)
{
	struct umtx_key key;
	intptr_t owner;
	intptr_t old;
	int error;
	int count;

	/*
	 * Make sure we own this mtx.
	 *
	 * XXX Need a {fu,su}ptr this is not correct on arch where
	 * sizeof(intptr_t) != sizeof(long).
	 */
	owner = fuword(&umtx->u_owner);
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMTX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMTX_CONTESTED) == 0) {
		old = casuptr((intptr_t *)&umtx->u_owner, owner,
			UMTX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		/* A waiter set the contested bit; fall through. */
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK, AUTO_SHARE,
		&key)) != 0)
		return (error);

	/* Busy the chain so the waiter count stays stable. */
	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * there is zero or one thread only waiting for it.
	 * Otherwise, it must be marked as contested.
	 */
	old = casuptr((intptr_t *)&umtx->u_owner, owner,
			count <= 1 ? UMTX_UNOWNED : UMTX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key,1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}
718
/*
 * Fetch and compare value, sleep on the address if value is not changed.
 * Queue BEFORE reading u_owner so a wake between the read and the sleep
 * cannot be lost.  Timeout handling mirrors do_lock().
 */
static int
do_wait(struct thread *td, struct umtx *umtx, uintptr_t id, struct timespec *timeout)
{
	struct umtx_q *uq;
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	uintptr_t tmp;
	int error = 0;

	uq = td->td_umtxq;
	if ((error = umtx_key_get(umtx, TYPE_SIMPLE_WAIT, AUTO_SHARE,
	    &uq->uq_key)) != 0)
		return (error);

	umtxq_lock(&uq->uq_key);
	umtxq_insert(uq);
	umtxq_unlock(&uq->uq_key);
	tmp = fuword(&umtx->u_owner);
	if (tmp != id) {
		/* Value already changed: do not sleep at all. */
		umtxq_lock(&uq->uq_key);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	} else if (timeout == NULL) {
		umtxq_lock(&uq->uq_key);
		error = umtxq_sleep(uq, "ucond", 0);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	} else {
		/* ts is the absolute (uptime-based) deadline. */
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
		umtxq_lock(&uq->uq_key);
		for (;;) {
			error = umtxq_sleep(uq, "ucond", tvtohz(&tv));
			/* Dequeued means a real wakeup arrived. */
			if (!(uq->uq_flags & UQF_UMTXQ))
				break;
			if (error != ETIMEDOUT)
				break;
			/* Drop the chain lock to recompute remaining time. */
			umtxq_unlock(&uq->uq_key);
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				umtxq_lock(&uq->uq_key);
				break;
			}
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
			umtxq_lock(&uq->uq_key);
		}
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	}
	umtx_key_release(&uq->uq_key);
	/* Waits are not restarted after signals. */
	if (error == ERESTART)
		error = EINTR;
	return (error);
}
780
/*
 * Wake up to n_wake threads sleeping on the specified address.
 * Returns 0 on success or an error from umtx_key_get().
 * NOTE(review): the count from umtxq_signal() is assigned to "ret"
 * but discarded by the final return (0) — confirm callers do not
 * expect the woken count here.
 */
int
kern_umtx_wake(struct thread *td, void *uaddr, int n_wake)
{
	struct umtx_key key;
	int ret;

	if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT, AUTO_SHARE,
	   &key)) != 0)
		return (ret);
	umtxq_lock(&key);
	ret = umtxq_signal(&key, n_wake);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	return (0);
}
799
800/*
801 * Lock PTHREAD_PRIO_NONE protocol POSIX mutex.
802 */
803static int
804_do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags, int timo,
805	int try)
806{
807	struct umtx_q *uq;
808	uint32_t owner, old, id;
809	int error = 0;
810
811	id = td->td_tid;
812	uq = td->td_umtxq;
813
814	/*
815	 * Care must be exercised when dealing with umtx structure. It
816	 * can fault on any access.
817	 */
818	for (;;) {
819		/*
820		 * Try the uncontested case.  This should be done in userland.
821		 */
822		owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);
823
824		/* The acquire succeeded. */
825		if (owner == UMUTEX_UNOWNED)
826			return (0);
827
828		/* The address was invalid. */
829		if (owner == -1)
830			return (EFAULT);
831
832		/* If no one owns it but it is contested try to acquire it. */
833		if (owner == UMUTEX_CONTESTED) {
834			owner = casuword32(&m->m_owner,
835			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
836
837			if (owner == UMUTEX_CONTESTED)
838				return (0);
839
840			/* The address was invalid. */
841			if (owner == -1)
842				return (EFAULT);
843
844			/* If this failed the lock has changed, restart. */
845			continue;
846		}
847
848		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
849		    (owner & ~UMUTEX_CONTESTED) == id)
850			return (EDEADLK);
851
852		if (try != 0)
853			return (EBUSY);
854
855		/*
856		 * If we caught a signal, we have retried and now
857		 * exit immediately.
858		 */
859		if (error != 0)
860			return (error);
861
862		if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
863		    GET_SHARE(flags), &uq->uq_key)) != 0)
864			return (error);
865
866		umtxq_lock(&uq->uq_key);
867		umtxq_busy(&uq->uq_key);
868		umtxq_insert(uq);
869		umtxq_unbusy(&uq->uq_key);
870		umtxq_unlock(&uq->uq_key);
871
872		/*
873		 * Set the contested bit so that a release in user space
874		 * knows to use the system call for unlock.  If this fails
875		 * either some one else has acquired the lock or it has been
876		 * released.
877		 */
878		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);
879
880		/* The address was invalid. */
881		if (old == -1) {
882			umtxq_lock(&uq->uq_key);
883			umtxq_remove(uq);
884			umtxq_unlock(&uq->uq_key);
885			umtx_key_release(&uq->uq_key);
886			return (EFAULT);
887		}
888
889		/*
890		 * We set the contested bit, sleep. Otherwise the lock changed
891		 * and we need to retry or we lost a race to the thread
892		 * unlocking the umtx.
893		 */
894		umtxq_lock(&uq->uq_key);
895		if (old == owner)
896			error = umtxq_sleep(uq, "umtxn", timo);
897		umtxq_remove(uq);
898		umtxq_unlock(&uq->uq_key);
899		umtx_key_release(&uq->uq_key);
900	}
901
902	return (0);
903}
904
/*
 * Unlock a PTHREAD_PRIO_NONE protocol POSIX mutex.
 */
static int
do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags)
{
	struct umtx_key key;
	uint32_t owner, old, id;
	int error;
	int count;

	id = td->td_tid;
	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(&m->m_owner);
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMUTEX_CONTESTED) == 0) {
		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		/* A waiter set the contested bit; fall through. */
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);

	/* Busy the chain so the waiter count stays stable. */
	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * there is zero or one thread only waiting for it.
	 * Otherwise, it must be marked as contested.
	 */
	old = casuword32(&m->m_owner, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key,1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}
968
969static inline struct umtx_pi *
970umtx_pi_alloc(void)
971{
972	struct umtx_pi *pi;
973
974	pi = uma_zalloc(umtx_pi_zone, M_ZERO | M_WAITOK);
975	TAILQ_INIT(&pi->pi_blocked);
976	atomic_add_int(&umtx_pi_allocated, 1);
977	return (pi);
978}
979
980static inline void
981umtx_pi_free(struct umtx_pi *pi)
982{
983	uma_zfree(umtx_pi_zone, pi);
984	atomic_add_int(&umtx_pi_allocated, -1);
985}
986
/*
 * Adjust the thread's position on a PI mutex's blocked list after its
 * priority has been changed, keeping the list sorted by priority.
 * Returns 1 when the thread was (or may have been) moved, 0 when
 * pi is NULL.  Caller holds sched_lock.
 */
static int
umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
{
	struct umtx_q *uq, *uq1, *uq2;
	struct thread *td1;

	mtx_assert(&sched_lock, MA_OWNED);
	if (pi == NULL)
		return (0);

	uq = td->td_umtxq;

	/*
	 * Check if the thread needs to be moved on the blocked chain.
	 * It needs to be moved if either its priority is lower than
	 * the previous thread or higher than the next thread.
	 */
	uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
	uq2 = TAILQ_NEXT(uq, uq_lockq);
	if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
	    (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
		/*
		 * Remove thread from blocked chain and determine where
		 * it should be moved to: before the first waiter with a
		 * strictly worse (numerically larger) priority.
		 */
		TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
		TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
			td1 = uq1->uq_thread;
			MPASS(td1->td_proc->p_magic == P_MAGIC);
			if (UPRI(td1) > UPRI(td))
				break;
		}

		if (uq1 == NULL)
			TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
		else
			TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
	}
	return (1);
}
1031
/*
 * Propagate priority when a thread is blocked on a POSIX PI mutex:
 * walk the owner chain (owner may itself be blocked on another PI
 * mutex) lending "pri" to each owner whose priority is worse.
 * Caller holds sched_lock.
 */
static void
umtx_propagate_priority(struct thread *td)
{
	struct umtx_q *uq;
	struct umtx_pi *pi;
	int pri;

	mtx_assert(&sched_lock, MA_OWNED);
	pri = UPRI(td);
	uq = td->td_umtxq;
	pi = uq->uq_pi_blocked;
	if (pi == NULL)
		return;

	for (;;) {
		td = pi->pi_owner;
		if (td == NULL)
			return;

		MPASS(td->td_proc != NULL);
		MPASS(td->td_proc->p_magic == P_MAGIC);

		/* Owner already runs at least this well; stop here. */
		if (UPRI(td) <= pri)
			return;

		sched_lend_user_prio(td, pri);

		/*
		 * Pick up the lock that td is blocked on.
		 */
		uq = td->td_umtxq;
		pi = uq->uq_pi_blocked;
		/* Resort td on the list if needed. */
		if (!umtx_pi_adjust_thread(pi, td))
			break;
	}
}
1073
/*
 * Unpropagate priority for a PI mutex when a thread blocked on it is
 * interrupted by a signal or resumed by others: recompute each owner's
 * lent priority from the best waiter across all mutexes it still
 * holds, then continue up the ownership chain.  Caller holds sched_lock.
 */
static void
umtx_unpropagate_priority(struct umtx_pi *pi)
{
	struct umtx_q *uq, *uq_owner;
	struct umtx_pi *pi2;
	int pri;

	mtx_assert(&sched_lock, MA_OWNED);

	while (pi != NULL && pi->pi_owner != NULL) {
		pri = PRI_MAX;
		uq_owner = pi->pi_owner->td_umtxq;

		/* Best waiter over every contested mutex the owner holds. */
		TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
			uq = TAILQ_FIRST(&pi2->pi_blocked);
			if (uq != NULL) {
				if (pri > UPRI(uq->uq_thread))
					pri = UPRI(uq->uq_thread);
			}
		}

		/* Never drop below the PP-ceiling inherited priority. */
		if (pri > uq_owner->uq_inherited_pri)
			pri = uq_owner->uq_inherited_pri;
		sched_unlend_user_prio(pi->pi_owner, pri);
		pi = uq_owner->uq_pi_blocked;
	}
}
1105
1106/*
1107 * Insert a PI mutex into owned list.
1108 */
1109static void
1110umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
1111{
1112	struct umtx_q *uq_owner;
1113
1114	uq_owner = owner->td_umtxq;
1115	mtx_assert(&sched_lock, MA_OWNED);
1116	if (pi->pi_owner != NULL)
1117		panic("pi_ower != NULL");
1118	pi->pi_owner = owner;
1119	TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
1120}
1121
/*
 * Claim ownership of a PI mutex.  Returns 0 on success (including the
 * caller already owning it), EPERM when another thread is recorded as
 * owner — userland may have corrupted the mutex.
 */
static int
umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
{
	struct umtx_q *uq, *uq_owner;

	uq_owner = owner->td_umtxq;
	mtx_lock_spin(&sched_lock);
	if (pi->pi_owner == owner) {
		mtx_unlock_spin(&sched_lock);
		return (0);
	}

	if (pi->pi_owner != NULL) {
		/*
		 * userland may have already messed the mutex, sigh.
		 */
		mtx_unlock_spin(&sched_lock);
		return (EPERM);
	}
	umtx_pi_setowner(pi, owner);
	/* Inherit the best waiter's priority immediately. */
	uq = TAILQ_FIRST(&pi->pi_blocked);
	if (uq != NULL) {
		int pri;

		pri = UPRI(uq->uq_thread);
		if (pri < UPRI(owner))
			sched_lend_user_prio(owner, pri);
	}
	mtx_unlock_spin(&sched_lock);
	return (0);
}
1156
/*
 * Adjust a thread's order position in its blocked PI mutex;
 * this may start a new priority-propagation pass.  Called from the
 * scheduler when the thread's priority changes while TD_ON_UPILOCK.
 */
void
umtx_pi_adjust(struct thread *td, u_char oldpri)
{
	struct umtx_q *uq;
	struct umtx_pi *pi;

	uq = td->td_umtxq;

	mtx_assert(&sched_lock, MA_OWNED);
	MPASS(TD_ON_UPILOCK(td));

	/*
	 * Pick up the lock that td is blocked on.
	 */
	pi = uq->uq_pi_blocked;
	MPASS(pi != NULL);

	/* Resort the turnstile on the list. */
	if (!umtx_pi_adjust_thread(pi, td))
		return;

	/*
	 * If our priority was lowered and we are at the head of the
	 * turnstile, then propagate our new priority up the chain.
	 */
	if (uq == TAILQ_FIRST(&pi->pi_blocked) && UPRI(td) < oldpri)
		umtx_propagate_priority(td);
}
1189
/*
 * Sleep on a PI mutex: queue the thread, establish the owner (boosting
 * its priority), sleep, then undo the propagation on wakeup.
 * Called with the chain lock held; returns with it held.
 * "owner" is the tid read from the userland mutex word.
 */
static int
umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi,
	uint32_t owner, const char *wmesg, int timo)
{
	struct umtxq_chain *uc;
	struct thread *td, *td1;
	struct umtx_q *uq1;
	int pri;
	int error = 0;

	td = uq->uq_thread;
	KASSERT(td == curthread, ("inconsistent uq_thread"));
	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	umtxq_insert(uq);
	if (pi->pi_owner == NULL) {
		/* XXX
		 * Current, We only support process private PI-mutex,
		 * non-contended PI-mutexes are locked in userland.
		 * Process shared PI-mutex should always be initialized
		 * by kernel and be registered in kernel, locking should
		 * always be done by kernel to avoid security problems.
		 * For process private PI-mutex, we can find owner
		 * thread and boost its priority safely.
		 */
		PROC_LOCK(curproc);
		td1 = thread_find(curproc, owner);
		mtx_lock_spin(&sched_lock);
		if (td1 != NULL && pi->pi_owner == NULL) {
			uq1 = td1->td_umtxq;
			umtx_pi_setowner(pi, td1);
		}
		PROC_UNLOCK(curproc);
	} else {
		mtx_lock_spin(&sched_lock);
	}

	/* Insert in priority order among the mutex's waiters. */
	TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
		pri = UPRI(uq1->uq_thread);
		if (pri > UPRI(td))
			break;
	}

	if (uq1 != NULL)
		TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
	else
		TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);

	uq->uq_pi_blocked = pi;
	td->td_flags |= TDF_UPIBLOCKED;
	mtx_unlock_spin(&sched_lock);
	umtxq_unlock(&uq->uq_key);

	/* Lend our priority up the ownership chain before sleeping. */
	mtx_lock_spin(&sched_lock);
	umtx_propagate_priority(td);
	mtx_unlock_spin(&sched_lock);

	umtxq_lock(&uq->uq_key);
	if (uq->uq_flags & UQF_UMTXQ) {
		error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
		if (error == EWOULDBLOCK)
			error = ETIMEDOUT;
		/* Still queued: timeout/signal, not a handoff wakeup. */
		if (uq->uq_flags & UQF_UMTXQ) {
			umtxq_busy(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unbusy(&uq->uq_key);
		}
	}
	umtxq_unlock(&uq->uq_key);

	/* Undo the blocked state and revoke any lent priority. */
	mtx_lock_spin(&sched_lock);
	uq->uq_pi_blocked = NULL;
	td->td_flags &= ~TDF_UPIBLOCKED;
	TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
	umtx_unpropagate_priority(pi);
	mtx_unlock_spin(&sched_lock);

	umtxq_lock(&uq->uq_key);

	return (error);
}
1274
1275/*
1276 * Add reference count for a PI mutex.
1277 */
1278static void
1279umtx_pi_ref(struct umtx_pi *pi)
1280{
1281	struct umtxq_chain *uc;
1282
1283	uc = umtxq_getchain(&pi->pi_key);
1284	UMTXQ_LOCKED_ASSERT(uc);
1285	pi->pi_refcount++;
1286}
1287
1288/*
1289 * Decrease reference count for a PI mutex, if the counter
1290 * is decreased to zero, its memory space is freed.
1291 */
1292static void
1293umtx_pi_unref(struct umtx_pi *pi)
1294{
1295	struct umtxq_chain *uc;
1296	int free = 0;
1297
1298	uc = umtxq_getchain(&pi->pi_key);
1299	UMTXQ_LOCKED_ASSERT(uc);
1300	KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
1301	if (--pi->pi_refcount == 0) {
1302		mtx_lock_spin(&sched_lock);
1303		if (pi->pi_owner != NULL) {
1304			TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested,
1305				pi, pi_link);
1306			pi->pi_owner = NULL;
1307		}
1308		KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
1309			("blocked queue not empty"));
1310		mtx_unlock_spin(&sched_lock);
1311		TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
1312		free = 1;
1313	}
1314	if (free)
1315		umtx_pi_free(pi);
1316}
1317
1318/*
1319 * Find a PI mutex in hash table.
1320 */
1321static struct umtx_pi *
1322umtx_pi_lookup(struct umtx_key *key)
1323{
1324	struct umtxq_chain *uc;
1325	struct umtx_pi *pi;
1326
1327	uc = umtxq_getchain(key);
1328	UMTXQ_LOCKED_ASSERT(uc);
1329
1330	TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
1331		if (umtx_key_match(&pi->pi_key, key)) {
1332			return (pi);
1333		}
1334	}
1335	return (NULL);
1336}
1337
1338/*
1339 * Insert a PI mutex into hash table.
1340 */
1341static inline void
1342umtx_pi_insert(struct umtx_pi *pi)
1343{
1344	struct umtxq_chain *uc;
1345
1346	uc = umtxq_getchain(&pi->pi_key);
1347	UMTXQ_LOCKED_ASSERT(uc);
1348	TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
1349}
1350
1351/*
1352 * Lock a PI mutex.
1353 */
1354static int
1355_do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags, int timo,
1356	int try)
1357{
1358	struct umtx_q *uq;
1359	struct umtx_pi *pi, *new_pi;
1360	uint32_t id, owner, old;
1361	int error;
1362
1363	id = td->td_tid;
1364	uq = td->td_umtxq;
1365
1366	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
1367	    &uq->uq_key)) != 0)
1368		return (error);
1369	for (;;) {
1370		pi = NULL;
1371		umtxq_lock(&uq->uq_key);
1372		pi = umtx_pi_lookup(&uq->uq_key);
1373		if (pi == NULL) {
1374			umtxq_unlock(&uq->uq_key);
1375			new_pi = umtx_pi_alloc();
1376			new_pi->pi_key = uq->uq_key;
1377			umtxq_lock(&uq->uq_key);
1378			pi = umtx_pi_lookup(&uq->uq_key);
1379			if (pi != NULL)
1380				umtx_pi_free(new_pi);
1381			else {
1382				umtx_pi_insert(new_pi);
1383				pi = new_pi;
1384			}
1385		}
1386
1387		umtx_pi_ref(pi);
1388		umtxq_unlock(&uq->uq_key);
1389
1390		/*
1391		 * Care must be exercised when dealing with umtx structure.  It
1392		 * can fault on any access.
1393		 */
1394
1395		/*
1396		 * Try the uncontested case.  This should be done in userland.
1397		 */
1398		owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);
1399
1400		/* The acquire succeeded. */
1401		if (owner == UMUTEX_UNOWNED) {
1402			error = 0;
1403			break;
1404		}
1405
1406		/* The address was invalid. */
1407		if (owner == -1) {
1408			error = EFAULT;
1409			break;
1410		}
1411
1412		/* If no one owns it but it is contested try to acquire it. */
1413		if (owner == UMUTEX_CONTESTED) {
1414			owner = casuword32(&m->m_owner,
1415			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
1416
1417			if (owner == UMUTEX_CONTESTED) {
1418				umtxq_lock(&uq->uq_key);
1419				error = umtx_pi_claim(pi, td);
1420				umtxq_unlock(&uq->uq_key);
1421				break;
1422			}
1423
1424			/* The address was invalid. */
1425			if (owner == -1) {
1426				error = EFAULT;
1427				break;
1428			}
1429
1430			/* If this failed the lock has changed, restart. */
1431			umtxq_lock(&uq->uq_key);
1432			umtx_pi_unref(pi);
1433			umtxq_unlock(&uq->uq_key);
1434			pi = NULL;
1435			continue;
1436		}
1437
1438		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
1439		    (owner & ~UMUTEX_CONTESTED) == id) {
1440			error = EDEADLK;
1441			break;
1442		}
1443
1444		if (try != 0) {
1445			error = EBUSY;
1446			break;
1447		}
1448
1449		/*
1450		 * If we caught a signal, we have retried and now
1451		 * exit immediately.
1452		 */
1453		if (error != 0)
1454			break;
1455
1456		umtxq_lock(&uq->uq_key);
1457		umtxq_busy(&uq->uq_key);
1458		umtxq_unlock(&uq->uq_key);
1459
1460		/*
1461		 * Set the contested bit so that a release in user space
1462		 * knows to use the system call for unlock.  If this fails
1463		 * either some one else has acquired the lock or it has been
1464		 * released.
1465		 */
1466		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);
1467
1468		/* The address was invalid. */
1469		if (old == -1) {
1470			umtxq_lock(&uq->uq_key);
1471			umtxq_unbusy(&uq->uq_key);
1472			umtxq_unlock(&uq->uq_key);
1473			error = EFAULT;
1474			break;
1475		}
1476
1477		umtxq_lock(&uq->uq_key);
1478		umtxq_unbusy(&uq->uq_key);
1479		/*
1480		 * We set the contested bit, sleep. Otherwise the lock changed
1481		 * and we need to retry or we lost a race to the thread
1482		 * unlocking the umtx.
1483		 */
1484		if (old == owner)
1485			error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED,
1486				 "umtxpi", timo);
1487		umtx_pi_unref(pi);
1488		umtxq_unlock(&uq->uq_key);
1489		pi = NULL;
1490	}
1491
1492	if (pi != NULL) {
1493		umtxq_lock(&uq->uq_key);
1494		umtx_pi_unref(pi);
1495		umtxq_unlock(&uq->uq_key);
1496	}
1497
1498	umtx_key_release(&uq->uq_key);
1499	return (error);
1500}
1501
1502/*
1503 * Unlock a PI mutex.
1504 */
1505static int
1506do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags)
1507{
1508	struct umtx_key key;
1509	struct umtx_q *uq_first, *uq_first2, *uq_me;
1510	struct umtx_pi *pi, *pi2;
1511	uint32_t owner, old, id;
1512	int error;
1513	int count;
1514	int pri;
1515
1516	id = td->td_tid;
1517	/*
1518	 * Make sure we own this mtx.
1519	 */
1520	owner = fuword32(&m->m_owner);
1521	if (owner == -1)
1522		return (EFAULT);
1523
1524	if ((owner & ~UMUTEX_CONTESTED) != id)
1525		return (EPERM);
1526
1527	/* This should be done in userland */
1528	if ((owner & UMUTEX_CONTESTED) == 0) {
1529		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
1530		if (old == -1)
1531			return (EFAULT);
1532		if (old == owner)
1533			return (0);
1534		owner = old;
1535	}
1536
1537	/* We should only ever be in here for contested locks */
1538	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
1539	    &key)) != 0)
1540		return (error);
1541
1542	umtxq_lock(&key);
1543	umtxq_busy(&key);
1544	count = umtxq_count_pi(&key, &uq_first);
1545	if (uq_first != NULL) {
1546		pi = uq_first->uq_pi_blocked;
1547		if (pi->pi_owner != curthread) {
1548			umtxq_unbusy(&key);
1549			umtxq_unlock(&key);
1550			/* userland messed the mutex */
1551			return (EPERM);
1552		}
1553		uq_me = curthread->td_umtxq;
1554		mtx_lock_spin(&sched_lock);
1555		pi->pi_owner = NULL;
1556		TAILQ_REMOVE(&uq_me->uq_pi_contested, pi, pi_link);
1557		uq_first = TAILQ_FIRST(&pi->pi_blocked);
1558		pri = PRI_MAX;
1559		TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
1560			uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
1561			if (uq_first2 != NULL) {
1562				if (pri > UPRI(uq_first2->uq_thread))
1563					pri = UPRI(uq_first2->uq_thread);
1564			}
1565		}
1566		sched_unlend_user_prio(curthread, pri);
1567		mtx_unlock_spin(&sched_lock);
1568	}
1569	umtxq_unlock(&key);
1570
1571	/*
1572	 * When unlocking the umtx, it must be marked as unowned if
1573	 * there is zero or one thread only waiting for it.
1574	 * Otherwise, it must be marked as contested.
1575	 */
1576	old = casuword32(&m->m_owner, owner,
1577		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
1578
1579	umtxq_lock(&key);
1580	if (uq_first != NULL)
1581		umtxq_signal_thread(uq_first);
1582	umtxq_unbusy(&key);
1583	umtxq_unlock(&key);
1584	umtx_key_release(&key);
1585	if (old == -1)
1586		return (EFAULT);
1587	if (old != owner)
1588		return (EINVAL);
1589	return (0);
1590}
1591
1592/*
1593 * Lock a PP mutex.
1594 */
1595static int
1596_do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags, int timo,
1597	int try)
1598{
1599	struct umtx_q *uq, *uq2;
1600	struct umtx_pi *pi;
1601	uint32_t ceiling;
1602	uint32_t owner, id;
1603	int error, pri, old_inherited_pri, su;
1604
1605	id = td->td_tid;
1606	uq = td->td_umtxq;
1607	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
1608	    &uq->uq_key)) != 0)
1609		return (error);
1610	su = (suser(td) == 0);
1611	for (;;) {
1612		old_inherited_pri = uq->uq_inherited_pri;
1613		umtxq_lock(&uq->uq_key);
1614		umtxq_busy(&uq->uq_key);
1615		umtxq_unlock(&uq->uq_key);
1616
1617		ceiling = RTP_PRIO_MAX - fuword32(&m->m_ceilings[0]);
1618		if (ceiling > RTP_PRIO_MAX) {
1619			error = EINVAL;
1620			goto out;
1621		}
1622
1623		mtx_lock_spin(&sched_lock);
1624		if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
1625			mtx_unlock_spin(&sched_lock);
1626			error = EINVAL;
1627			goto out;
1628		}
1629		if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
1630			uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
1631			if (uq->uq_inherited_pri < UPRI(td))
1632				sched_lend_user_prio(td, uq->uq_inherited_pri);
1633		}
1634		mtx_unlock_spin(&sched_lock);
1635
1636		owner = casuword32(&m->m_owner,
1637		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
1638
1639		if (owner == UMUTEX_CONTESTED) {
1640			error = 0;
1641			break;
1642		}
1643
1644		/* The address was invalid. */
1645		if (owner == -1) {
1646			error = EFAULT;
1647			break;
1648		}
1649
1650		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
1651		    (owner & ~UMUTEX_CONTESTED) == id) {
1652			error = EDEADLK;
1653			break;
1654		}
1655
1656		if (try != 0) {
1657			error = EBUSY;
1658			break;
1659		}
1660
1661		/*
1662		 * If we caught a signal, we have retried and now
1663		 * exit immediately.
1664		 */
1665		if (error != 0)
1666			break;
1667
1668		umtxq_lock(&uq->uq_key);
1669		umtxq_insert(uq);
1670		umtxq_unbusy(&uq->uq_key);
1671		error = umtxq_sleep(uq, "umtxpp", timo);
1672		umtxq_remove(uq);
1673		umtxq_unlock(&uq->uq_key);
1674
1675		mtx_lock_spin(&sched_lock);
1676		uq->uq_inherited_pri = old_inherited_pri;
1677		pri = PRI_MAX;
1678		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
1679			uq2 = TAILQ_FIRST(&pi->pi_blocked);
1680			if (uq2 != NULL) {
1681				if (pri > UPRI(uq2->uq_thread))
1682					pri = UPRI(uq2->uq_thread);
1683			}
1684		}
1685		if (pri > uq->uq_inherited_pri)
1686			pri = uq->uq_inherited_pri;
1687		sched_unlend_user_prio(td, pri);
1688		mtx_unlock_spin(&sched_lock);
1689	}
1690
1691	if (error != 0) {
1692		mtx_lock_spin(&sched_lock);
1693		uq->uq_inherited_pri = old_inherited_pri;
1694		pri = PRI_MAX;
1695		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
1696			uq2 = TAILQ_FIRST(&pi->pi_blocked);
1697			if (uq2 != NULL) {
1698				if (pri > UPRI(uq2->uq_thread))
1699					pri = UPRI(uq2->uq_thread);
1700			}
1701		}
1702		if (pri > uq->uq_inherited_pri)
1703			pri = uq->uq_inherited_pri;
1704		sched_unlend_user_prio(td, pri);
1705		mtx_unlock_spin(&sched_lock);
1706	}
1707
1708out:
1709	umtxq_lock(&uq->uq_key);
1710	umtxq_unbusy(&uq->uq_key);
1711	umtxq_unlock(&uq->uq_key);
1712	umtx_key_release(&uq->uq_key);
1713	return (error);
1714}
1715
1716/*
1717 * Unlock a PP mutex.
1718 */
1719static int
1720do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags)
1721{
1722	struct umtx_key key;
1723	struct umtx_q *uq, *uq2;
1724	struct umtx_pi *pi;
1725	uint32_t owner, id;
1726	uint32_t rceiling;
1727	int error, pri, new_inherited_pri, su;
1728
1729	id = td->td_tid;
1730	uq = td->td_umtxq;
1731	su = (suser(td) == 0);
1732
1733	/*
1734	 * Make sure we own this mtx.
1735	 */
1736	owner = fuword32(&m->m_owner);
1737	if (owner == -1)
1738		return (EFAULT);
1739
1740	if ((owner & ~UMUTEX_CONTESTED) != id)
1741		return (EPERM);
1742
1743	error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
1744	if (error != 0)
1745		return (error);
1746
1747	if (rceiling == -1)
1748		new_inherited_pri = PRI_MAX;
1749	else {
1750		rceiling = RTP_PRIO_MAX - rceiling;
1751		if (rceiling > RTP_PRIO_MAX)
1752			return (EINVAL);
1753		new_inherited_pri = PRI_MIN_REALTIME + rceiling;
1754	}
1755
1756	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
1757	    &key)) != 0)
1758		return (error);
1759	umtxq_lock(&key);
1760	umtxq_busy(&key);
1761	umtxq_unlock(&key);
1762	/*
1763	 * For priority protected mutex, always set unlocked state
1764	 * to UMUTEX_CONTESTED, so that userland always enters kernel
1765	 * to lock the mutex, it is necessary because thread priority
1766	 * has to be adjusted for such mutex.
1767	 */
1768	error = suword32(&m->m_owner, UMUTEX_CONTESTED);
1769
1770	umtxq_lock(&key);
1771	if (error == 0)
1772		umtxq_signal(&key, 1);
1773	umtxq_unbusy(&key);
1774	umtxq_unlock(&key);
1775
1776	if (error == -1)
1777		error = EFAULT;
1778	else {
1779		mtx_lock_spin(&sched_lock);
1780		if (su != 0)
1781			uq->uq_inherited_pri = new_inherited_pri;
1782		pri = PRI_MAX;
1783		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
1784			uq2 = TAILQ_FIRST(&pi->pi_blocked);
1785			if (uq2 != NULL) {
1786				if (pri > UPRI(uq2->uq_thread))
1787					pri = UPRI(uq2->uq_thread);
1788			}
1789		}
1790		if (pri > uq->uq_inherited_pri)
1791			pri = uq->uq_inherited_pri;
1792		sched_unlend_user_prio(td, pri);
1793		mtx_unlock_spin(&sched_lock);
1794	}
1795	umtx_key_release(&key);
1796	return (error);
1797}
1798
/*
 * Change the priority ceiling (m_ceilings[0]) of a PP mutex.
 *
 * The mutex must be briefly acquired (or already be owned by td)
 * while the ceiling is replaced; the previous ceiling is copied out
 * through 'old_ceiling' when it is non-NULL.  Returns EINVAL for a
 * non-PP mutex or an out-of-range ceiling.
 */
static int
do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
	uint32_t *old_ceiling)
{
	struct umtx_q *uq;
	uint32_t save_ceiling;
	uint32_t owner, id;
	uint32_t flags;
	int error;

	flags = fuword32(&m->m_flags);
	if ((flags & UMUTEX_PRIO_PROTECT) == 0)
		return (EINVAL);
	if (ceiling > RTP_PRIO_MAX)
		return (EINVAL);
	id = td->td_tid;
	uq = td->td_umtxq;
	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
	   &uq->uq_key)) != 0)
		return (error);
	for (;;) {
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		save_ceiling = fuword32(&m->m_ceilings[0]);

		/* Try to acquire the (unowned, contested) mutex briefly. */
		owner = casuword32(&m->m_owner,
		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

		if (owner == UMUTEX_CONTESTED) {
			/* Got it: store the new ceiling and release. */
			suword32(&m->m_ceilings[0], ceiling);
			suword32(&m->m_owner, UMUTEX_CONTESTED);
			error = 0;
			break;
		}

		/* The address was invalid. */
		if (owner == -1) {
			error = EFAULT;
			break;
		}

		/* We already own it; just replace the ceiling. */
		if ((owner & ~UMUTEX_CONTESTED) == id) {
			suword32(&m->m_ceilings[0], ceiling);
			error = 0;
			break;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			break;

		/*
		 * We set the contested bit, sleep. Otherwise the lock changed
		 * and we need to retry or we lost a race to the thread
		 * unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		error = umtxq_sleep(uq, "umtxpp", 0);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	}
	umtxq_lock(&uq->uq_key);
	if (error == 0)
		umtxq_signal(&uq->uq_key, INT_MAX);
	umtxq_unbusy(&uq->uq_key);
	umtxq_unlock(&uq->uq_key);
	umtx_key_release(&uq->uq_key);
	if (error == 0 && old_ceiling != NULL)
		suword32(old_ceiling, save_ceiling);
	return (error);
}
1877
1878static int
1879_do_lock_umutex(struct thread *td, struct umutex *m, int flags, int timo,
1880	int try)
1881{
1882	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
1883	case 0:
1884		return (_do_lock_normal(td, m, flags, timo, try));
1885	case UMUTEX_PRIO_INHERIT:
1886		return (_do_lock_pi(td, m, flags, timo, try));
1887	case UMUTEX_PRIO_PROTECT:
1888		return (_do_lock_pp(td, m, flags, timo, try));
1889	}
1890	return (EINVAL);
1891}
1892
1893/*
1894 * Lock a userland POSIX mutex.
1895 */
1896static int
1897do_lock_umutex(struct thread *td, struct umutex *m,
1898	struct timespec *timeout, int try)
1899{
1900	struct timespec ts, ts2, ts3;
1901	struct timeval tv;
1902	uint32_t flags;
1903	int error;
1904
1905	flags = fuword32(&m->m_flags);
1906	if (flags == -1)
1907		return (EFAULT);
1908
1909	if (timeout == NULL) {
1910		error = _do_lock_umutex(td, m, flags, 0, try);
1911		/* Mutex locking is restarted if it is interrupted. */
1912		if (error == EINTR)
1913			error = ERESTART;
1914	} else {
1915		getnanouptime(&ts);
1916		timespecadd(&ts, timeout);
1917		TIMESPEC_TO_TIMEVAL(&tv, timeout);
1918		for (;;) {
1919			error = _do_lock_umutex(td, m, flags, tvtohz(&tv), try);
1920			if (error != ETIMEDOUT)
1921				break;
1922			getnanouptime(&ts2);
1923			if (timespeccmp(&ts2, &ts, >=)) {
1924				error = ETIMEDOUT;
1925				break;
1926			}
1927			ts3 = ts;
1928			timespecsub(&ts3, &ts2);
1929			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
1930		}
1931		/* Timed-locking is not restarted. */
1932		if (error == ERESTART)
1933			error = EINTR;
1934	}
1935	return (error);
1936}
1937
1938/*
1939 * Unlock a userland POSIX mutex.
1940 */
1941static int
1942do_unlock_umutex(struct thread *td, struct umutex *m)
1943{
1944	uint32_t flags;
1945
1946	flags = fuword32(&m->m_flags);
1947	if (flags == -1)
1948		return (EFAULT);
1949
1950	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
1951	case 0:
1952		return (do_unlock_normal(td, m, flags));
1953	case UMUTEX_PRIO_INHERIT:
1954		return (do_unlock_pi(td, m, flags));
1955	case UMUTEX_PRIO_PROTECT:
1956		return (do_unlock_pp(td, m, flags));
1957	}
1958
1959	return (EINVAL);
1960}
1961
1962int
1963_umtx_lock(struct thread *td, struct _umtx_lock_args *uap)
1964    /* struct umtx *umtx */
1965{
1966	return _do_lock(td, uap->umtx, td->td_tid, 0);
1967}
1968
1969int
1970_umtx_unlock(struct thread *td, struct _umtx_unlock_args *uap)
1971    /* struct umtx *umtx */
1972{
1973	return do_unlock(td, uap->umtx, td->td_tid);
1974}
1975
1976int
1977_umtx_op(struct thread *td, struct _umtx_op_args *uap)
1978{
1979	struct timespec timeout;
1980	struct timespec *ts;
1981	int error;
1982
1983	switch(uap->op) {
1984	case UMTX_OP_MUTEX_LOCK:
1985		/* Allow a null timespec (wait forever). */
1986		if (uap->uaddr2 == NULL)
1987			ts = NULL;
1988		else {
1989			error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
1990			if (error != 0)
1991				break;
1992			if (timeout.tv_nsec >= 1000000000 ||
1993			    timeout.tv_nsec < 0) {
1994				error = EINVAL;
1995				break;
1996			}
1997			ts = &timeout;
1998		}
1999		error = do_lock_umutex(td, uap->obj, ts, 0);
2000		break;
2001	case UMTX_OP_MUTEX_UNLOCK:
2002		error = do_unlock_umutex(td, uap->obj);
2003		break;
2004	case UMTX_OP_LOCK:
2005		/* Allow a null timespec (wait forever). */
2006		if (uap->uaddr2 == NULL)
2007			ts = NULL;
2008		else {
2009			error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2010			if (error != 0)
2011				break;
2012			if (timeout.tv_nsec >= 1000000000 ||
2013			    timeout.tv_nsec < 0) {
2014				error = EINVAL;
2015				break;
2016			}
2017			ts = &timeout;
2018		}
2019		error = do_lock(td, uap->obj, uap->val, ts);
2020		break;
2021	case UMTX_OP_UNLOCK:
2022		error = do_unlock(td, uap->obj, uap->val);
2023		break;
2024	case UMTX_OP_WAIT:
2025		/* Allow a null timespec (wait forever). */
2026		if (uap->uaddr2 == NULL)
2027			ts = NULL;
2028		else {
2029			error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2030			if (error != 0)
2031				break;
2032			if (timeout.tv_nsec >= 1000000000 ||
2033			    timeout.tv_nsec < 0) {
2034				error = EINVAL;
2035				break;
2036			}
2037			ts = &timeout;
2038		}
2039		error = do_wait(td, uap->obj, uap->val, ts);
2040		break;
2041	case UMTX_OP_WAKE:
2042		error = kern_umtx_wake(td, uap->obj, uap->val);
2043		break;
2044	case UMTX_OP_MUTEX_TRYLOCK:
2045		error = do_lock_umutex(td, uap->obj, NULL, 1);
2046		break;
2047	case UMTX_OP_SET_CEILING:
2048		error = do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1);
2049		break;
2050	default:
2051		error = EINVAL;
2052		break;
2053	}
2054	return (error);
2055}
2056
2057void
2058umtx_thread_init(struct thread *td)
2059{
2060	td->td_umtxq = umtxq_alloc();
2061	td->td_umtxq->uq_thread = td;
2062}
2063
/*
 * Release a thread's umtx queue structure when the thread is
 * destroyed.
 */
void
umtx_thread_fini(struct thread *td)
{
	umtxq_free(td->td_umtxq);
}
2069
2070/*
2071 * It will be called when new thread is created, e.g fork().
2072 */
2073void
2074umtx_thread_alloc(struct thread *td)
2075{
2076	struct umtx_q *uq;
2077
2078	uq = td->td_umtxq;
2079	uq->uq_inherited_pri = PRI_MAX;
2080
2081	KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
2082	KASSERT(uq->uq_thread == td, ("uq_thread != td"));
2083	KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
2084	KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
2085}
2086
2087/*
2088 * exec() hook.
2089 */
2090static void
2091umtx_exec_hook(void *arg __unused, struct proc *p __unused,
2092	struct image_params *imgp __unused)
2093{
2094	umtx_thread_cleanup(curthread);
2095}
2096
2097/*
2098 * thread_exit() hook.
2099 */
2100void
2101umtx_thread_exit(struct thread *td)
2102{
2103	umtx_thread_cleanup(td);
2104}
2105
2106/*
2107 * clean up umtx data.
2108 */
2109static void
2110umtx_thread_cleanup(struct thread *td)
2111{
2112	struct umtx_q *uq;
2113	struct umtx_pi *pi;
2114
2115	if ((uq = td->td_umtxq) == NULL)
2116		return;
2117
2118	mtx_lock_spin(&sched_lock);
2119	uq->uq_inherited_pri = PRI_MAX;
2120	while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
2121		pi->pi_owner = NULL;
2122		TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
2123	}
2124	td->td_flags &= ~TDF_UBORROWING;
2125	mtx_unlock_spin(&sched_lock);
2126}
2127