kern_umtx.c revision 164033
/*-
 * Copyright (c) 2004, David Xu <davidxu@freebsd.org>
 * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/kern_umtx.c 164033 2006-11-06 13:42:10Z rwatson $");

#include "opt_compat.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/eventhandler.h>
#include <sys/umtx.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>

#ifdef COMPAT_IA32
#include <compat/freebsd32/freebsd32_proto.h>
#endif

#define TYPE_SIMPLE_LOCK	0
#define TYPE_SIMPLE_WAIT	1
#define TYPE_NORMAL_UMUTEX	2
#define TYPE_PI_UMUTEX		3
#define TYPE_PP_UMUTEX		4
#define TYPE_CV			5

/* Key to represent a unique userland synchronization object */
struct umtx_key {
	int	hash;
	int	type;
	int	shared;
	union {
		struct {
			vm_object_t	object;
			uintptr_t	offset;
		} shared;
		struct {
			struct vmspace	*vs;
			uintptr_t	addr;
		} private;
		struct {
			void		*a;
			uintptr_t	b;
		} both;
	} info;
};

/* Priority inheritance mutex info. */
struct umtx_pi {
	/* Owner thread */
	struct thread		*pi_owner;

	/* Reference count */
	int			pi_refcount;

	/* List entry on the list of PI mutexes held by a thread */
	TAILQ_ENTRY(umtx_pi)	pi_link;

	/* List entry in hash */
	TAILQ_ENTRY(umtx_pi)	pi_hashlink;

	/* List for waiters */
	TAILQ_HEAD(,umtx_q)	pi_blocked;

	/* Identify a userland lock object */
	struct umtx_key		pi_key;
};

/* A waiter on a userland synchronization object. */
struct umtx_q {
	/* Linked list for the hash. */
	TAILQ_ENTRY(umtx_q)	uq_link;

	/* Umtx key. */
	struct umtx_key		uq_key;

	/* Umtx flags. */
	int			uq_flags;
#define UQF_UMTXQ	0x0001

	/* The waiting thread. */
	struct thread		*uq_thread;

	/*
	 * Blocked on PI mutex.  Reads may be done under either the
	 * chain lock or sched_lock; writes must hold both the chain
	 * lock and sched_lock.
	 */
	struct umtx_pi		*uq_pi_blocked;

	/* On blocked list */
	TAILQ_ENTRY(umtx_q)	uq_lockq;

	/* PI mutexes owned by this thread that other threads contend for */
	TAILQ_HEAD(,umtx_pi)	uq_pi_contested;

	/* Inherited priority from PP mutex */
	u_char			uq_inherited_pri;
};

TAILQ_HEAD(umtxq_head, umtx_q);

/* Userland lock object's wait-queue chain */
struct umtxq_chain {
	/* Lock for this chain. */
	struct mtx		uc_lock;

	/* List of sleep queues. */
	struct umtxq_head	uc_queue;

	/* Busy flag */
	char			uc_busy;

	/* Chain lock waiters */
	int			uc_waiters;

	/* All PI mutexes hashed to this chain */
	TAILQ_HEAD(,umtx_pi)	uc_pi_list;
};

#define	UMTXQ_LOCKED_ASSERT(uc)		mtx_assert(&(uc)->uc_lock, MA_OWNED)

/*
 * Don't propagate time-sharing priority; there is a security reason.
 * A user could simply create a PI mutex, let thread A lock it, and
 * let another thread B block on it.  Because B is sleeping, its
 * priority would be boosted, and A's priority would be boosted in
 * turn via priority propagation, never to be lowered again even if A
 * then burns 100% CPU.  That would be unfair to other processes, so
 * the UPRI() macro below clamps time-sharing priorities to
 * PRI_MAX_TIMESHARE.
 */

#ifdef KSE
#define UPRI(td)	(((td)->td_ksegrp->kg_user_pri >= PRI_MIN_TIMESHARE &&\
			  (td)->td_ksegrp->kg_user_pri <= PRI_MAX_TIMESHARE) ?\
			 PRI_MAX_TIMESHARE : (td)->td_ksegrp->kg_user_pri)
#else
#define UPRI(td)	(((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\
			  (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\
			 PRI_MAX_TIMESHARE : (td)->td_user_pri)
#endif

#define	GOLDEN_RATIO_PRIME	2654404609U
#define	UMTX_CHAINS		128
#define	UMTX_SHIFTS		(__WORD_BIT - 7)

#define THREAD_SHARE		0
#define PROCESS_SHARE		1
#define AUTO_SHARE		2

#define	GET_SHARE(flags)	\
    (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)
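
/*
 * THREAD_SHARE keys an object by (vmspace, address) and is only valid
 * within one process.  PROCESS_SHARE keys it by (vm_object, offset) so
 * any process mapping the same memory hashes to the same chain.
 * AUTO_SHARE lets umtx_key_get() decide: it inspects the VM map entry
 * and treats the address as shared only if the mapping is inherited
 * shared (VM_INHERIT_SHARE).
 */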

static uma_zone_t		umtx_pi_zone;
static struct umtxq_chain	umtxq_chains[UMTX_CHAINS];
static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
static int			umtx_pi_allocated;

SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
    &umtx_pi_allocated, 0, "Allocated umtx_pi");

static void umtxq_sysinit(void *);
static void umtxq_hash(struct umtx_key *key);
static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
static void umtxq_lock(struct umtx_key *key);
static void umtxq_unlock(struct umtx_key *key);
static void umtxq_busy(struct umtx_key *key);
static void umtxq_unbusy(struct umtx_key *key);
static void umtxq_insert(struct umtx_q *uq);
static void umtxq_remove(struct umtx_q *uq);
static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo);
static int umtxq_count(struct umtx_key *key);
static int umtxq_signal(struct umtx_key *key, int nr_wakeup);
static int umtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2);
static int umtx_key_get(void *addr, int type, int share,
	struct umtx_key *key);
static void umtx_key_release(struct umtx_key *key);
static struct umtx_pi *umtx_pi_alloc(int);
static void umtx_pi_free(struct umtx_pi *pi);
static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags);
static void umtx_thread_cleanup(struct thread *td);
static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
	struct image_params *imgp __unused);
SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);

static void
umtxq_sysinit(void *arg __unused)
{
	int i;

	umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	for (i = 0; i < UMTX_CHAINS; ++i) {
		mtx_init(&umtxq_chains[i].uc_lock, "umtxql", NULL,
			 MTX_DEF | MTX_DUPOK);
		TAILQ_INIT(&umtxq_chains[i].uc_queue);
		TAILQ_INIT(&umtxq_chains[i].uc_pi_list);
		umtxq_chains[i].uc_busy = 0;
		umtxq_chains[i].uc_waiters = 0;
	}
	EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
	    EVENTHANDLER_PRI_ANY);
}

struct umtx_q *
umtxq_alloc(void)
{
	struct umtx_q *uq;

	uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
	TAILQ_INIT(&uq->uq_pi_contested);
	uq->uq_inherited_pri = PRI_MAX;
	return (uq);
}

void
umtxq_free(struct umtx_q *uq)
{
	free(uq, M_UMTX);
}

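/*
 * Hash a key into a chain index.  This is a multiplicative hash: the
 * sum of the two key words is multiplied by a prime near 2^32/phi
 * (GOLDEN_RATIO_PRIME), the high-order bits are taken via UMTX_SHIFTS,
 * and the result is reduced modulo UMTX_CHAINS.
 */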
static inline void
umtxq_hash(struct umtx_key *key)
{
	unsigned n = (uintptr_t)key->info.both.a + key->info.both.b;
	key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
}

static inline int
umtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2)
{
	return (k1->type == k2->type &&
		k1->info.both.a == k2->info.both.a &&
		k1->info.both.b == k2->info.both.b);
}

static inline struct umtxq_chain *
umtxq_getchain(struct umtx_key *key)
{
	return (&umtxq_chains[key->hash]);
}

/*
 * Mark the chain busy before performing an operation that may block
 * (a kernel mutex cannot be held across such an operation).
 */
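/*
 * The canonical pattern throughout this file is:
 *
 *	umtxq_lock(&key);
 *	umtxq_busy(&key);
 *	umtxq_unlock(&key);
 *	... touch pageable userland memory (fuword/casuword) ...
 *	umtxq_lock(&key);
 *	umtxq_unbusy(&key);
 *	umtxq_unlock(&key);
 *
 * so the chain stays marked busy while its mutex is dropped.
 */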
static inline void
umtxq_busy(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_assert(&uc->uc_lock, MA_OWNED);
	while (uc->uc_busy != 0) {
		uc->uc_waiters++;
		msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
		uc->uc_waiters--;
	}
	uc->uc_busy = 1;
}

/*
 * Unbusy a chain.
 */
static inline void
umtxq_unbusy(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_assert(&uc->uc_lock, MA_OWNED);
	KASSERT(uc->uc_busy != 0, ("not busy"));
	uc->uc_busy = 0;
	if (uc->uc_waiters)
		wakeup_one(uc);
}

/*
 * Lock a chain.
 */
static inline void
umtxq_lock(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_lock(&uc->uc_lock);
}

/*
 * Unlock a chain.
 */
static inline void
umtxq_unlock(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_unlock(&uc->uc_lock);
}

/*
 * Insert a thread onto the umtx queue.
 */
static inline void
umtxq_insert(struct umtx_q *uq)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_INSERT_TAIL(&uc->uc_queue, uq, uq_link);
	uq->uq_flags |= UQF_UMTXQ;
}

/*
 * Remove thread from the umtx queue.
 */
static inline void
umtxq_remove(struct umtx_q *uq)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	if (uq->uq_flags & UQF_UMTXQ) {
		TAILQ_REMOVE(&uc->uc_queue, uq, uq_link);
		uq->uq_flags &= ~UQF_UMTXQ;
	}
}

/*
 * Count the waiters on a key.  The count is capped at two, which is
 * enough to distinguish "none", "one", and "more than one".
 */
static int
umtxq_count(struct umtx_key *key)
{
	struct umtxq_chain *uc;
	struct umtx_q *uq;
	int count = 0;

	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_FOREACH(uq, &uc->uc_queue, uq_link) {
		if (umtx_key_match(&uq->uq_key, key)) {
			if (++count > 1)
				break;
		}
	}
	return (count);
}

390
391/*
392 * Check if there are multiple PI waiters and returns first
393 * waiter.
394 */
395static int
396umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
397{
398	struct umtxq_chain *uc;
399	struct umtx_q *uq;
400	int count = 0;
401
402	*first = NULL;
403	uc = umtxq_getchain(key);
404	UMTXQ_LOCKED_ASSERT(uc);
405	TAILQ_FOREACH(uq, &uc->uc_queue, uq_link) {
406		if (umtx_key_match(&uq->uq_key, key)) {
407			if (++count > 1)
408				break;
409			*first = uq;
410		}
411	}
412	return (count);
413}
414
/*
 * Wake up threads waiting on a userland object.
 */
static int
umtxq_signal(struct umtx_key *key, int n_wake)
{
	struct umtxq_chain *uc;
	struct umtx_q *uq, *next;
	int ret;

	ret = 0;
	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_FOREACH_SAFE(uq, &uc->uc_queue, uq_link, next) {
		if (umtx_key_match(&uq->uq_key, key)) {
			umtxq_remove(uq);
			wakeup(uq);
			if (++ret >= n_wake)
				break;
		}
	}
	return (ret);
}

/*
 * Wake up specified thread.
 */
static inline void
umtxq_signal_thread(struct umtx_q *uq)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	umtxq_remove(uq);
	wakeup(uq);
}

/*
 * Put the thread into a sleep state; before sleeping, check whether
 * the thread was already removed from the umtx queue.
 */
static inline int
umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo)
{
	struct umtxq_chain *uc;
	int error;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	if (!(uq->uq_flags & UQF_UMTXQ))
		return (0);
	error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
	if (error == EWOULDBLOCK)
		error = ETIMEDOUT;
	return (error);
}

/*
 * Convert a userspace address into a unique logical key: for shared
 * objects the key is (vm_object, offset), for private ones it is
 * (vmspace, address).
 */
static int
umtx_key_get(void *addr, int type, int share, struct umtx_key *key)
{
	struct thread *td = curthread;
	vm_map_t map;
	vm_map_entry_t entry;
	vm_pindex_t pindex;
	vm_prot_t prot;
	boolean_t wired;

	key->type = type;
	if (share == THREAD_SHARE) {
		key->shared = 0;
		key->info.private.vs = td->td_proc->p_vmspace;
		key->info.private.addr = (uintptr_t)addr;
	} else {
		MPASS(share == PROCESS_SHARE || share == AUTO_SHARE);
		map = &td->td_proc->p_vmspace->vm_map;
		if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
		    &entry, &key->info.shared.object, &pindex, &prot,
		    &wired) != KERN_SUCCESS) {
			return (EFAULT);
		}

		if ((share == PROCESS_SHARE) ||
		    (share == AUTO_SHARE &&
		     VM_INHERIT_SHARE == entry->inheritance)) {
			key->shared = 1;
			key->info.shared.offset = entry->offset + entry->start -
				(vm_offset_t)addr;
			vm_object_reference(key->info.shared.object);
		} else {
			key->shared = 0;
			key->info.private.vs = td->td_proc->p_vmspace;
			key->info.private.addr = (uintptr_t)addr;
		}
		vm_map_lookup_done(map, entry);
	}

	umtxq_hash(key);
	return (0);
}

/*
 * Release key.
 */
static inline void
umtx_key_release(struct umtx_key *key)
{
	if (key->shared)
		vm_object_deallocate(key->info.shared.object);
}

/*
 * Lock a umtx object.
 */
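/*
 * The kernel only sees the contested cases.  For reference, the
 * uncontested userland fast path is expected to be a single atomic
 * compare-and-set of u_owner from UMTX_UNOWNED to the thread id --
 * roughly (a sketch, not the actual libthr code):
 *
 *	if (atomic_cmpset_acq_long(&umtx->u_owner, UMTX_UNOWNED, id))
 *		return (0);	// got the lock without a syscall
 *	_umtx_lock(umtx);	// otherwise fall back to the kernel
 */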
static int
_do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id, int timo)
{
	struct umtx_q *uq;
	u_long owner;
	u_long old;
	int error = 0;

	uq = td->td_umtxq;

	/*
	 * Care must be exercised when dealing with the umtx structure.
	 * It can fault on any access.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword(&umtx->u_owner, UMTX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMTX_UNOWNED)
			return (0);

		/* The address was invalid. */
		if (owner == -1)
			return (EFAULT);

		/* If no one owns it but it is contested try to acquire it. */
		if (owner == UMTX_CONTESTED) {
			owner = casuword(&umtx->u_owner,
			    UMTX_CONTESTED, id | UMTX_CONTESTED);

			if (owner == UMTX_CONTESTED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			/* If this failed the lock has changed, restart. */
			continue;
		}

		/*
		 * If we caught a signal, we have already retried, so
		 * exit immediately.
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK,
			AUTO_SHARE, &uq->uq_key)) != 0)
			return (error);

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword(&umtx->u_owner, owner, owner | UMTX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race to the
		 * thread unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtx", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);
	}

	return (0);
}

/*
 * Lock a umtx object, retrying until an absolute deadline if a
 * timeout is given.
 */
static int
do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id,
	struct timespec *timeout)
{
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	int error;

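	/*
	 * The deadline is computed against the monotonic uptime clock:
	 * ts = now + timeout.  On each ETIMEDOUT wakeup the remaining
	 * time ts3 = ts - ts2 is recomputed and converted to ticks with
	 * tvtohz(), so the total wait never exceeds the caller's timeout.
	 */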
	if (timeout == NULL) {
		error = _do_lock_umtx(td, umtx, id, 0);
		/* Mutex locking is restarted if it is interrupted. */
		if (error == EINTR)
			error = ERESTART;
	} else {
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
		for (;;) {
			error = _do_lock_umtx(td, umtx, id, tvtohz(&tv));
			if (error != ETIMEDOUT)
				break;
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				break;
			}
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
		}
		/* Timed-locking is not restarted. */
		if (error == ERESTART)
			error = EINTR;
	}
	return (error);
}

/*
 * Unlock a umtx object.
 */
static int
do_unlock_umtx(struct thread *td, struct umtx *umtx, u_long id)
{
	struct umtx_key key;
	u_long owner;
	u_long old;
	int error;
	int count;

	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword(__DEVOLATILE(u_long *, &umtx->u_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMTX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMTX_CONTESTED) == 0) {
		old = casuword(&umtx->u_owner, owner, UMTX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK, AUTO_SHARE,
		&key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * zero or one thread is waiting for it.  Otherwise, it must
	 * be marked as contested.
	 */
	old = casuword(&umtx->u_owner, owner,
		count <= 1 ? UMTX_UNOWNED : UMTX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}

#ifdef COMPAT_IA32
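
/*
 * The 32-bit compat versions below mirror the native umtx lock/unlock
 * paths above, but operate on 32-bit words with fuword32()/casuword32()
 * so 32-bit processes on a 64-bit kernel see consistent semantics.
 */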

/*
 * Lock a umtx object.
 */
static int
_do_lock_umtx32(struct thread *td, uint32_t *m, uint32_t id, int timo)
{
	struct umtx_q *uq;
	uint32_t owner;
	uint32_t old;
	int error = 0;

	uq = td->td_umtxq;

	/*
	 * Care must be exercised when dealing with the umtx structure.
	 * It can fault on any access.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword32(m, UMUTEX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMUTEX_UNOWNED)
			return (0);

		/* The address was invalid. */
		if (owner == -1)
			return (EFAULT);

		/* If no one owns it but it is contested try to acquire it. */
		if (owner == UMUTEX_CONTESTED) {
			owner = casuword32(m,
			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
			if (owner == UMUTEX_CONTESTED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			/* If this failed the lock has changed, restart. */
			continue;
		}

		/*
		 * If we caught a signal, we have already retried, so
		 * exit immediately.
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK,
			AUTO_SHARE, &uq->uq_key)) != 0)
			return (error);

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword32(m, owner, owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race to the
		 * thread unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtx", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);
	}

	return (0);
}

/*
 * Lock a umtx object, retrying until an absolute deadline if a
 * timeout is given.
 */
static int
do_lock_umtx32(struct thread *td, void *m, uint32_t id,
	struct timespec *timeout)
{
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	int error;

	if (timeout == NULL) {
		error = _do_lock_umtx32(td, m, id, 0);
		/* Mutex locking is restarted if it is interrupted. */
		if (error == EINTR)
			error = ERESTART;
	} else {
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
		for (;;) {
			error = _do_lock_umtx32(td, m, id, tvtohz(&tv));
			if (error != ETIMEDOUT)
				break;
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				break;
			}
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
		}
		/* Timed-locking is not restarted. */
		if (error == ERESTART)
			error = EINTR;
	}
	return (error);
}

/*
 * Unlock a umtx object.
 */
static int
do_unlock_umtx32(struct thread *td, uint32_t *m, uint32_t id)
{
	struct umtx_key key;
	uint32_t owner;
	uint32_t old;
	int error;
	int count;

	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(m);
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMUTEX_CONTESTED) == 0) {
		old = casuword32(m, owner, UMUTEX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK, AUTO_SHARE,
		&key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * zero or one thread is waiting for it.  Otherwise, it must
	 * be marked as contested.
	 */
	old = casuword32(m, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}
#endif

/*
 * Fetch and compare a value; sleep on the address if the value has
 * not changed.
 */
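/*
 * Note the ordering below: the thread is inserted on the wait queue
 * before the userland word is read, and umtxq_sleep() rechecks
 * UQF_UMTXQ before blocking, so a wakeup that races with the read
 * cannot be lost.
 */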
static int
do_wait(struct thread *td, void *addr, u_long id,
	struct timespec *timeout, int compat32)
{
	struct umtx_q *uq;
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	u_long tmp;
	int error = 0;

	uq = td->td_umtxq;
	if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT, AUTO_SHARE,
	    &uq->uq_key)) != 0)
		return (error);

	umtxq_lock(&uq->uq_key);
	umtxq_insert(uq);
	umtxq_unlock(&uq->uq_key);
	if (compat32 == 0)
		tmp = fuword(addr);
	else
		tmp = fuword32(addr);
	if (tmp != id) {
		umtxq_lock(&uq->uq_key);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	} else if (timeout == NULL) {
		umtxq_lock(&uq->uq_key);
		error = umtxq_sleep(uq, "ucond", 0);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	} else {
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
		umtxq_lock(&uq->uq_key);
		for (;;) {
			error = umtxq_sleep(uq, "ucond", tvtohz(&tv));
			if (!(uq->uq_flags & UQF_UMTXQ))
				break;
			if (error != ETIMEDOUT)
				break;
			umtxq_unlock(&uq->uq_key);
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				umtxq_lock(&uq->uq_key);
				break;
			}
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
			umtxq_lock(&uq->uq_key);
		}
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	}
	umtx_key_release(&uq->uq_key);
	if (error == ERESTART)
		error = EINTR;
	return (error);
}

/*
 * Wake up threads sleeping on the specified address.
 */
int
kern_umtx_wake(struct thread *td, void *uaddr, int n_wake)
{
	struct umtx_key key;
	int ret;

	if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT, AUTO_SHARE,
	   &key)) != 0)
		return (ret);
	umtxq_lock(&key);
	ret = umtxq_signal(&key, n_wake);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	return (0);
}

/*
 * Lock a PTHREAD_PRIO_NONE protocol POSIX mutex.
 */
static int
_do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags, int timo,
	int try)
{
	struct umtx_q *uq;
	uint32_t owner, old, id;
	int error = 0;

	id = td->td_tid;
	uq = td->td_umtxq;

	/*
	 * Care must be exercised when dealing with the umtx structure.
	 * It can fault on any access.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMUTEX_UNOWNED)
			return (0);

		/* The address was invalid. */
		if (owner == -1)
			return (EFAULT);

		/* If no one owns it but it is contested try to acquire it. */
		if (owner == UMUTEX_CONTESTED) {
			owner = casuword32(&m->m_owner,
			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

			if (owner == UMUTEX_CONTESTED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			/* If this failed the lock has changed, restart. */
			continue;
		}

		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
		    (owner & ~UMUTEX_CONTESTED) == id)
			return (EDEADLK);

		if (try != 0)
			return (EBUSY);

		/*
		 * If we caught a signal, we have already retried, so
		 * exit immediately.
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
		    GET_SHARE(flags), &uq->uq_key)) != 0)
			return (error);

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race to the
		 * thread unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtxn", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);
	}

	return (0);
}

/*
 * Unlock a PTHREAD_PRIO_NONE protocol POSIX mutex.
 */
static int
do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags)
{
	struct umtx_key key;
	uint32_t owner, old, id;
	int error;
	int count;

	id = td->td_tid;
	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMUTEX_CONTESTED) == 0) {
		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * zero or one thread is waiting for it.  Otherwise, it must
	 * be marked as contested.
	 */
	old = casuword32(&m->m_owner, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}

static inline struct umtx_pi *
umtx_pi_alloc(int flags)
{
	struct umtx_pi *pi;

	pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags);
	TAILQ_INIT(&pi->pi_blocked);
	atomic_add_int(&umtx_pi_allocated, 1);
	return (pi);
}

static inline void
umtx_pi_free(struct umtx_pi *pi)
{
	uma_zfree(umtx_pi_zone, pi);
	atomic_add_int(&umtx_pi_allocated, -1);
}

/*
 * Adjust the thread's position on the PI mutex's blocked list after
 * its priority has been changed.
 */
static int
umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
{
	struct umtx_q *uq, *uq1, *uq2;
	struct thread *td1;

	mtx_assert(&sched_lock, MA_OWNED);
	if (pi == NULL)
		return (0);

	uq = td->td_umtxq;

	/*
	 * Check if the thread needs to be moved on the blocked chain.
	 * It needs to be moved if either its priority is lower than
	 * the previous thread's or higher than the next thread's.
	 */
	uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
	uq2 = TAILQ_NEXT(uq, uq_lockq);
	if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
	    (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
		/*
		 * Remove thread from blocked chain and determine where
		 * it should be moved to.
		 */
		TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
		TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
			td1 = uq1->uq_thread;
			MPASS(td1->td_proc->p_magic == P_MAGIC);
			if (UPRI(td1) > UPRI(td))
				break;
		}

		if (uq1 == NULL)
			TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
		else
			TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
	}
	return (1);
}

/*
 * Propagate priority when a thread is blocked on POSIX
 * PI mutex.
 */
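/*
 * The loop below walks the ownership chain: boost the owner of the
 * mutex we are blocked on, then move to the mutex that owner is
 * itself blocked on, and so on.  It terminates when it reaches a
 * mutex with no recorded owner, or an owner that is not itself
 * blocked on a PI mutex (umtx_pi_adjust_thread() returns 0 for a
 * NULL pi).
 */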
static void
umtx_propagate_priority(struct thread *td)
{
	struct umtx_q *uq;
	struct umtx_pi *pi;
	int pri;

	mtx_assert(&sched_lock, MA_OWNED);
	pri = UPRI(td);
	uq = td->td_umtxq;
	pi = uq->uq_pi_blocked;
	if (pi == NULL)
		return;

	for (;;) {
		td = pi->pi_owner;
		if (td == NULL)
			return;

		MPASS(td->td_proc != NULL);
		MPASS(td->td_proc->p_magic == P_MAGIC);

		if (UPRI(td) <= pri)
			return;

		sched_lend_user_prio(td, pri);

		/*
		 * Pick up the lock that td is blocked on.
		 */
		uq = td->td_umtxq;
		pi = uq->uq_pi_blocked;
		/* Resort td on the list if needed. */
		if (!umtx_pi_adjust_thread(pi, td))
			break;
	}
}

/*
 * Unpropagate priority for a PI mutex when a thread blocked on
 * it is interrupted by a signal or resumed by another thread.
 */
static void
umtx_unpropagate_priority(struct umtx_pi *pi)
{
	struct umtx_q *uq, *uq_owner;
	struct umtx_pi *pi2;
	int pri;

	mtx_assert(&sched_lock, MA_OWNED);

	while (pi != NULL && pi->pi_owner != NULL) {
		pri = PRI_MAX;
		uq_owner = pi->pi_owner->td_umtxq;

		TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
			uq = TAILQ_FIRST(&pi2->pi_blocked);
			if (uq != NULL) {
				if (pri > UPRI(uq->uq_thread))
					pri = UPRI(uq->uq_thread);
			}
		}

		if (pri > uq_owner->uq_inherited_pri)
			pri = uq_owner->uq_inherited_pri;
		sched_unlend_user_prio(pi->pi_owner, pri);
		pi = uq_owner->uq_pi_blocked;
	}
}

/*
 * Insert a PI mutex into the owner's held list.
 */
static void
umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
{
	struct umtx_q *uq_owner;

	uq_owner = owner->td_umtxq;
	mtx_assert(&sched_lock, MA_OWNED);
	if (pi->pi_owner != NULL)
		panic("pi_owner != NULL");
	pi->pi_owner = owner;
	TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
}

/*
 * Claim ownership of a PI mutex.
 */
static int
umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
{
	struct umtx_q *uq, *uq_owner;

	uq_owner = owner->td_umtxq;
	mtx_lock_spin(&sched_lock);
	if (pi->pi_owner == owner) {
		mtx_unlock_spin(&sched_lock);
		return (0);
	}

	if (pi->pi_owner != NULL) {
		/*
		 * Userland may have already messed up the mutex, sigh.
		 */
		mtx_unlock_spin(&sched_lock);
		return (EPERM);
	}
	umtx_pi_setowner(pi, owner);
	uq = TAILQ_FIRST(&pi->pi_blocked);
	if (uq != NULL) {
		int pri;

		pri = UPRI(uq->uq_thread);
		if (pri < UPRI(owner))
			sched_lend_user_prio(owner, pri);
	}
	mtx_unlock_spin(&sched_lock);
	return (0);
}

/*
 * Adjust a thread's position on the blocked list of its PI mutex;
 * this may start a new round of priority propagation.
 */
void
umtx_pi_adjust(struct thread *td, u_char oldpri)
{
	struct umtx_q *uq;
	struct umtx_pi *pi;

	uq = td->td_umtxq;

	mtx_assert(&sched_lock, MA_OWNED);
	MPASS(TD_ON_UPILOCK(td));

	/*
	 * Pick up the lock that td is blocked on.
	 */
	pi = uq->uq_pi_blocked;
	MPASS(pi != NULL);

	/* Resort the turnstile on the list. */
	if (!umtx_pi_adjust_thread(pi, td))
		return;

	/*
	 * If our priority was lowered and we are at the head of the
	 * turnstile, then propagate our new priority up the chain.
	 */
	if (uq == TAILQ_FIRST(&pi->pi_blocked) && UPRI(td) < oldpri)
		umtx_propagate_priority(td);
}

/*
 * Sleep on a PI mutex.
 */
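/*
 * On entry the chain is locked.  The thread is put on the wait queue
 * and linked into the mutex's pi_blocked list in priority order; its
 * priority is then propagated to the owner before it goes to sleep.
 * Lock ordering is the chain lock first, then sched_lock.
 */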
static int
umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi,
	uint32_t owner, const char *wmesg, int timo)
{
	struct umtxq_chain *uc;
	struct thread *td, *td1;
	struct umtx_q *uq1;
	int pri;
	int error = 0;

	td = uq->uq_thread;
	KASSERT(td == curthread, ("inconsistent uq_thread"));
	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	umtxq_insert(uq);
	if (pi->pi_owner == NULL) {
		/* XXX
		 * Currently we only support process-private PI mutexes;
		 * non-contended PI mutexes are locked in userland.
		 * Process-shared PI mutexes should always be initialized
		 * and registered in the kernel, and locking should always
		 * be done by the kernel to avoid security problems.
		 * For a process-private PI mutex, we can find the owner
		 * thread and boost its priority safely.
		 */
		PROC_LOCK(curproc);
		td1 = thread_find(curproc, owner);
		mtx_lock_spin(&sched_lock);
		if (td1 != NULL && pi->pi_owner == NULL) {
			uq1 = td1->td_umtxq;
			umtx_pi_setowner(pi, td1);
		}
		PROC_UNLOCK(curproc);
	} else {
		mtx_lock_spin(&sched_lock);
	}

	TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
		pri = UPRI(uq1->uq_thread);
		if (pri > UPRI(td))
			break;
	}

	if (uq1 != NULL)
		TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
	else
		TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);

	uq->uq_pi_blocked = pi;
	td->td_flags |= TDF_UPIBLOCKED;
	mtx_unlock_spin(&sched_lock);
	umtxq_unlock(&uq->uq_key);

	mtx_lock_spin(&sched_lock);
	umtx_propagate_priority(td);
	mtx_unlock_spin(&sched_lock);

	umtxq_lock(&uq->uq_key);
	if (uq->uq_flags & UQF_UMTXQ) {
		error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
		if (error == EWOULDBLOCK)
			error = ETIMEDOUT;
		if (uq->uq_flags & UQF_UMTXQ) {
			umtxq_busy(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unbusy(&uq->uq_key);
		}
	}
	umtxq_unlock(&uq->uq_key);

	mtx_lock_spin(&sched_lock);
	uq->uq_pi_blocked = NULL;
	td->td_flags &= ~TDF_UPIBLOCKED;
	TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
	umtx_unpropagate_priority(pi);
	mtx_unlock_spin(&sched_lock);

	umtxq_lock(&uq->uq_key);

	return (error);
}

/*
 * Add a reference to a PI mutex.
 */
static void
umtx_pi_ref(struct umtx_pi *pi)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&pi->pi_key);
	UMTXQ_LOCKED_ASSERT(uc);
	pi->pi_refcount++;
}

/*
 * Drop a reference to a PI mutex; when the count reaches zero, its
 * memory is freed.
 */
static void
umtx_pi_unref(struct umtx_pi *pi)
{
	struct umtxq_chain *uc;
	int free = 0;

	uc = umtxq_getchain(&pi->pi_key);
	UMTXQ_LOCKED_ASSERT(uc);
	KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
	if (--pi->pi_refcount == 0) {
		mtx_lock_spin(&sched_lock);
		if (pi->pi_owner != NULL) {
			TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested,
				pi, pi_link);
			pi->pi_owner = NULL;
		}
		KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
			("blocked queue not empty"));
		mtx_unlock_spin(&sched_lock);
		TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
		free = 1;
	}
	if (free)
		umtx_pi_free(pi);
}

/*
 * Find a PI mutex in the hash table.
 */
static struct umtx_pi *
umtx_pi_lookup(struct umtx_key *key)
{
	struct umtxq_chain *uc;
	struct umtx_pi *pi;

	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);

	TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
		if (umtx_key_match(&pi->pi_key, key)) {
			return (pi);
		}
	}
	return (NULL);
}

/*
 * Insert a PI mutex into the hash table.
 */
static inline void
umtx_pi_insert(struct umtx_pi *pi)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&pi->pi_key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
}

/*
 * Lock a PI mutex.
 */
static int
_do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags, int timo,
	int try)
{
	struct umtx_q *uq;
	struct umtx_pi *pi, *new_pi;
	uint32_t id, owner, old;
	int error;

	id = td->td_tid;
	uq = td->td_umtxq;

	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
	    &uq->uq_key)) != 0)
		return (error);
	umtxq_lock(&uq->uq_key);
	pi = umtx_pi_lookup(&uq->uq_key);
	if (pi == NULL) {
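		/*
		 * No umtx_pi for this key yet.  Try an M_NOWAIT
		 * allocation first because the chain lock is held; if
		 * that fails, drop the lock, allocate with M_WAITOK,
		 * and redo the lookup, since another thread may have
		 * installed a umtx_pi while we slept.
		 */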
		new_pi = umtx_pi_alloc(M_NOWAIT);
		if (new_pi == NULL) {
			umtxq_unlock(&uq->uq_key);
			new_pi = umtx_pi_alloc(M_WAITOK);
			new_pi->pi_key = uq->uq_key;
			umtxq_lock(&uq->uq_key);
			pi = umtx_pi_lookup(&uq->uq_key);
			if (pi != NULL) {
				umtx_pi_free(new_pi);
				new_pi = NULL;
			}
		}
		if (new_pi != NULL) {
			new_pi->pi_key = uq->uq_key;
			umtx_pi_insert(new_pi);
			pi = new_pi;
		}
	}
	umtx_pi_ref(pi);
	umtxq_unlock(&uq->uq_key);

	/*
	 * Care must be exercised when dealing with the umtx structure.
	 * It can fault on any access.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMUTEX_UNOWNED) {
			error = 0;
			break;
		}

		/* The address was invalid. */
		if (owner == -1) {
			error = EFAULT;
			break;
		}

		/* If no one owns it but it is contested try to acquire it. */
		if (owner == UMUTEX_CONTESTED) {
			owner = casuword32(&m->m_owner,
			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

			if (owner == UMUTEX_CONTESTED) {
				umtxq_lock(&uq->uq_key);
				error = umtx_pi_claim(pi, td);
				umtxq_unlock(&uq->uq_key);
				break;
			}

			/* The address was invalid. */
			if (owner == -1) {
				error = EFAULT;
				break;
			}

			/* If this failed the lock has changed, restart. */
			continue;
		}

		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
		    (owner & ~UMUTEX_CONTESTED) == id) {
			error = EDEADLK;
			break;
		}

		if (try != 0) {
			error = EBUSY;
			break;
		}

		/*
		 * If we caught a signal, we have already retried, so
		 * exit immediately.
		 */
		if (error != 0)
			break;

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_unbusy(&uq->uq_key);
			umtxq_unlock(&uq->uq_key);
			error = EFAULT;
			break;
		}

		umtxq_lock(&uq->uq_key);
		umtxq_unbusy(&uq->uq_key);
		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race to the
		 * thread unlocking the umtx.
		 */
		if (old == owner)
			error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED,
				 "umtxpi", timo);
		umtxq_unlock(&uq->uq_key);
	}

	umtxq_lock(&uq->uq_key);
	umtx_pi_unref(pi);
	umtxq_unlock(&uq->uq_key);

	umtx_key_release(&uq->uq_key);
	return (error);
}

/*
 * Unlock a PI mutex.
 */
static int
do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags)
{
	struct umtx_key key;
	struct umtx_q *uq_first, *uq_first2, *uq_me;
	struct umtx_pi *pi, *pi2;
	uint32_t owner, old, id;
	int error;
	int count;
	int pri;

	id = td->td_tid;
	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMUTEX_CONTESTED) == 0) {
		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count_pi(&key, &uq_first);
	if (uq_first != NULL) {
		pi = uq_first->uq_pi_blocked;
		if (pi->pi_owner != curthread) {
			umtxq_unbusy(&key);
			umtxq_unlock(&key);
			/* Userland messed up the mutex. */
			return (EPERM);
		}
		uq_me = curthread->td_umtxq;
		mtx_lock_spin(&sched_lock);
		pi->pi_owner = NULL;
		TAILQ_REMOVE(&uq_me->uq_pi_contested, pi, pi_link);
		uq_first = TAILQ_FIRST(&pi->pi_blocked);
		pri = PRI_MAX;
		TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
			uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
			if (uq_first2 != NULL) {
				if (pri > UPRI(uq_first2->uq_thread))
					pri = UPRI(uq_first2->uq_thread);
			}
		}
		sched_unlend_user_prio(curthread, pri);
		mtx_unlock_spin(&sched_lock);
	}
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * zero or one thread is waiting for it.  Otherwise, it must
	 * be marked as contested.
	 */
	old = casuword32(&m->m_owner, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);

	umtxq_lock(&key);
	if (uq_first != NULL)
		umtxq_signal_thread(uq_first);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}

/*
 * Lock a PP mutex.
 */
static int
_do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags, int timo,
	int try)
{
	struct umtx_q *uq, *uq2;
	struct umtx_pi *pi;
	uint32_t ceiling;
	uint32_t owner, id;
	int error, pri, old_inherited_pri, su;

	id = td->td_tid;
	uq = td->td_umtxq;
	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
	    &uq->uq_key)) != 0)
		return (error);
	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
	for (;;) {
		old_inherited_pri = uq->uq_inherited_pri;
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		ceiling = RTP_PRIO_MAX - fuword32(&m->m_ceilings[0]);
		if (ceiling > RTP_PRIO_MAX) {
			error = EINVAL;
			goto out;
		}

		mtx_lock_spin(&sched_lock);
		if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
			mtx_unlock_spin(&sched_lock);
			error = EINVAL;
			goto out;
		}
		if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
			uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
			if (uq->uq_inherited_pri < UPRI(td))
				sched_lend_user_prio(td, uq->uq_inherited_pri);
		}
		mtx_unlock_spin(&sched_lock);

		owner = casuword32(&m->m_owner,
		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

		if (owner == UMUTEX_CONTESTED) {
			error = 0;
			break;
		}

		/* The address was invalid. */
		if (owner == -1) {
			error = EFAULT;
			break;
		}

		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
		    (owner & ~UMUTEX_CONTESTED) == id) {
			error = EDEADLK;
			break;
		}

		if (try != 0) {
			error = EBUSY;
			break;
		}

		/*
		 * If we caught a signal, we have already retried, so
		 * exit immediately.
		 */
		if (error != 0)
			break;

		umtxq_lock(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		error = umtxq_sleep(uq, "umtxpp", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);

		mtx_lock_spin(&sched_lock);
		uq->uq_inherited_pri = old_inherited_pri;
		pri = PRI_MAX;
		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
			uq2 = TAILQ_FIRST(&pi->pi_blocked);
			if (uq2 != NULL) {
				if (pri > UPRI(uq2->uq_thread))
					pri = UPRI(uq2->uq_thread);
			}
		}
		if (pri > uq->uq_inherited_pri)
			pri = uq->uq_inherited_pri;
		sched_unlend_user_prio(td, pri);
		mtx_unlock_spin(&sched_lock);
	}

	if (error != 0) {
		mtx_lock_spin(&sched_lock);
		uq->uq_inherited_pri = old_inherited_pri;
		pri = PRI_MAX;
		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
			uq2 = TAILQ_FIRST(&pi->pi_blocked);
			if (uq2 != NULL) {
				if (pri > UPRI(uq2->uq_thread))
					pri = UPRI(uq2->uq_thread);
			}
		}
		if (pri > uq->uq_inherited_pri)
			pri = uq->uq_inherited_pri;
		sched_unlend_user_prio(td, pri);
		mtx_unlock_spin(&sched_lock);
	}

out:
	umtxq_lock(&uq->uq_key);
	umtxq_unbusy(&uq->uq_key);
	umtxq_unlock(&uq->uq_key);
	umtx_key_release(&uq->uq_key);
	return (error);
}

/*
 * Unlock a PP mutex.
 */
static int
do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags)
{
	struct umtx_key key;
	struct umtx_q *uq, *uq2;
	struct umtx_pi *pi;
	uint32_t owner, id;
	uint32_t rceiling;
	int error, pri, new_inherited_pri, su;

	id = td->td_tid;
	uq = td->td_umtxq;
	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);

	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
	if (error != 0)
		return (error);

	if (rceiling == -1)
		new_inherited_pri = PRI_MAX;
	else {
		rceiling = RTP_PRIO_MAX - rceiling;
		if (rceiling > RTP_PRIO_MAX)
			return (EINVAL);
		new_inherited_pri = PRI_MIN_REALTIME + rceiling;
	}

	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);
	umtxq_lock(&key);
	umtxq_busy(&key);
	umtxq_unlock(&key);
	/*
	 * For a priority protected mutex, always set the unlocked state
	 * to UMUTEX_CONTESTED so that userland always enters the kernel
	 * to lock it.  This is necessary because thread priority has to
	 * be adjusted for such a mutex.
	 */
	error = suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
		UMUTEX_CONTESTED);

	umtxq_lock(&key);
	if (error == 0)
		umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);

	if (error == -1)
		error = EFAULT;
	else {
		mtx_lock_spin(&sched_lock);
		if (su != 0)
			uq->uq_inherited_pri = new_inherited_pri;
		pri = PRI_MAX;
		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
			uq2 = TAILQ_FIRST(&pi->pi_blocked);
			if (uq2 != NULL) {
				if (pri > UPRI(uq2->uq_thread))
					pri = UPRI(uq2->uq_thread);
			}
		}
		if (pri > uq->uq_inherited_pri)
			pri = uq->uq_inherited_pri;
		sched_unlend_user_prio(td, pri);
		mtx_unlock_spin(&sched_lock);
	}
	umtx_key_release(&key);
	return (error);
}

static int
do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
	uint32_t *old_ceiling)
{
	struct umtx_q *uq;
	uint32_t save_ceiling;
	uint32_t owner, id;
	uint32_t flags;
	int error;

	flags = fuword32(&m->m_flags);
	if ((flags & UMUTEX_PRIO_PROTECT) == 0)
		return (EINVAL);
	if (ceiling > RTP_PRIO_MAX)
		return (EINVAL);
	id = td->td_tid;
	uq = td->td_umtxq;
	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
	   &uq->uq_key)) != 0)
		return (error);
	for (;;) {
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		save_ceiling = fuword32(&m->m_ceilings[0]);

		owner = casuword32(&m->m_owner,
		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

		if (owner == UMUTEX_CONTESTED) {
			suword32(&m->m_ceilings[0], ceiling);
			suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
				UMUTEX_CONTESTED);
			error = 0;
			break;
		}

		/* The address was invalid. */
		if (owner == -1) {
			error = EFAULT;
			break;
		}

		if ((owner & ~UMUTEX_CONTESTED) == id) {
			suword32(&m->m_ceilings[0], ceiling);
			error = 0;
			break;
		}

		/*
		 * If we caught a signal, we have already retried, so
		 * exit immediately.
		 */
		if (error != 0)
			break;

		/*
		 * Someone else owns the mutex; sleep until it is
		 * released, then retry.
		 */
		umtxq_lock(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		error = umtxq_sleep(uq, "umtxpp", 0);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	}
	umtxq_lock(&uq->uq_key);
	if (error == 0)
		umtxq_signal(&uq->uq_key, INT_MAX);
	umtxq_unbusy(&uq->uq_key);
	umtxq_unlock(&uq->uq_key);
	umtx_key_release(&uq->uq_key);
	if (error == 0 && old_ceiling != NULL)
		suword32(old_ceiling, save_ceiling);
	return (error);
}

static int
_do_lock_umutex(struct thread *td, struct umutex *m, int flags, int timo,
	int try)
{
	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
	case 0:
		return (_do_lock_normal(td, m, flags, timo, try));
	case UMUTEX_PRIO_INHERIT:
		return (_do_lock_pi(td, m, flags, timo, try));
	case UMUTEX_PRIO_PROTECT:
		return (_do_lock_pp(td, m, flags, timo, try));
	}
	return (EINVAL);
}

/*
 * Lock a userland POSIX mutex.
 */
static int
do_lock_umutex(struct thread *td, struct umutex *m,
	struct timespec *timeout, int try)
{
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	uint32_t flags;
	int error;

	flags = fuword32(&m->m_flags);
	if (flags == -1)
		return (EFAULT);

	if (timeout == NULL) {
		error = _do_lock_umutex(td, m, flags, 0, try);
		/* Mutex locking is restarted if it is interrupted. */
		if (error == EINTR)
			error = ERESTART;
	} else {
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
		for (;;) {
			error = _do_lock_umutex(td, m, flags, tvtohz(&tv), try);
			if (error != ETIMEDOUT)
				break;
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				break;
			}
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
		}
		/* Timed-locking is not restarted. */
		if (error == ERESTART)
			error = EINTR;
	}
	return (error);
}

/*
 * Unlock a userland POSIX mutex.
 */
static int
do_unlock_umutex(struct thread *td, struct umutex *m)
{
	uint32_t flags;

	flags = fuword32(&m->m_flags);
	if (flags == -1)
		return (EFAULT);

	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
	case 0:
		return (do_unlock_normal(td, m, flags));
	case UMUTEX_PRIO_INHERIT:
		return (do_unlock_pi(td, m, flags));
	case UMUTEX_PRIO_PROTECT:
		return (do_unlock_pp(td, m, flags));
	}

	return (EINVAL);
}

int
_umtx_lock(struct thread *td, struct _umtx_lock_args *uap)
    /* struct umtx *umtx */
{
	return _do_lock_umtx(td, uap->umtx, td->td_tid, 0);
}

int
_umtx_unlock(struct thread *td, struct _umtx_unlock_args *uap)
    /* struct umtx *umtx */
{
	return do_unlock_umtx(td, uap->umtx, td->td_tid);
}

static int
__umtx_op_lock_umtx(struct thread *td, struct _umtx_op_args *uap)
{
	struct timespec *ts, timeout;
	int error;

	/* Allow a null timespec (wait forever). */
	if (uap->uaddr2 == NULL)
		ts = NULL;
	else {
		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
		if (error != 0)
			return (error);
		if (timeout.tv_nsec >= 1000000000 ||
		    timeout.tv_nsec < 0) {
			return (EINVAL);
		}
		ts = &timeout;
	}
	return (do_lock_umtx(td, uap->obj, uap->val, ts));
}

static int
__umtx_op_unlock_umtx(struct thread *td, struct _umtx_op_args *uap)
{
	return (do_unlock_umtx(td, uap->obj, uap->val));
}

static int
__umtx_op_wait(struct thread *td, struct _umtx_op_args *uap)
{
	struct timespec *ts, timeout;
	int error;

	if (uap->uaddr2 == NULL)
		ts = NULL;
	else {
		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
		if (error != 0)
			return (error);
		if (timeout.tv_nsec >= 1000000000 ||
		    timeout.tv_nsec < 0)
			return (EINVAL);
		ts = &timeout;
	}
	return do_wait(td, uap->obj, uap->val, ts, 0);
}

static int
__umtx_op_wake(struct thread *td, struct _umtx_op_args *uap)
{
	return (kern_umtx_wake(td, uap->obj, uap->val));
}

static int
__umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap)
{
	struct timespec *ts, timeout;
	int error;

	/* Allow a null timespec (wait forever). */
	if (uap->uaddr2 == NULL)
		ts = NULL;
	else {
		error = copyin(uap->uaddr2, &timeout,
		    sizeof(timeout));
		if (error != 0)
			return (error);
		if (timeout.tv_nsec >= 1000000000 ||
		    timeout.tv_nsec < 0) {
			return (EINVAL);
		}
		ts = &timeout;
	}
	return do_lock_umutex(td, uap->obj, ts, 0);
}

static int
__umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap)
{
	return do_lock_umutex(td, uap->obj, NULL, 1);
}

static int
__umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap)
{
	return do_unlock_umutex(td, uap->obj);
}

static int
__umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap)
{
	return do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1);
}
2279
2280typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap);
2281
2282static _umtx_op_func op_table[] = {
2283	__umtx_op_lock_umtx,		/* UMTX_OP_LOCK */
2284	__umtx_op_unlock_umtx,		/* UMTX_OP_UNLOCK */
2285	__umtx_op_wait,			/* UMTX_OP_WAIT */
2286	__umtx_op_wake,			/* UMTX_OP_WAKE */
2287	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
2288	__umtx_op_lock_umutex,		/* UMTX_OP_MUTEX_LOCK */
2289	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
2290	__umtx_op_set_ceiling		/* UMTX_OP_SET_CEILING */
2291};
2292
2293int
2294_umtx_op(struct thread *td, struct _umtx_op_args *uap)
2295{
2296	if ((unsigned)uap->op < UMTX_OP_MAX)
2297		return (*op_table[uap->op])(td, uap);
2298	return (EINVAL);
2299}
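
/*
 * Hedged usage sketch (illustration only): userland invokes the
 * operations above through the _umtx_op(2) syscall, declared in
 * <sys/umtx.h> as roughly "int _umtx_op(void *obj, int op,
 * u_long val, void *uaddr, void *uaddr2)".  A simple wait/wake
 * pairing on a hypothetical "futex_word" might look like:
 *
 *	#include <sys/umtx.h>
 *	#include <limits.h>
 *
 *	u_long futex_word = 1;
 *	// Sleep while futex_word still equals 1 (no timeout):
 *	_umtx_op(&futex_word, UMTX_OP_WAIT, 1, NULL, NULL);
 *	// From another thread, wake every waiter on the word:
 *	_umtx_op(&futex_word, UMTX_OP_WAKE, INT_MAX, NULL, NULL);
 */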

#ifdef COMPAT_IA32

int
freebsd32_umtx_lock(struct thread *td, struct freebsd32_umtx_lock_args *uap)
    /* struct umtx *umtx */
{
	return (do_lock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid, NULL));
}

int
freebsd32_umtx_unlock(struct thread *td, struct freebsd32_umtx_unlock_args *uap)
    /* struct umtx *umtx */
{
	return (do_unlock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid));
}

struct timespec32 {
	u_int32_t tv_sec;
	u_int32_t tv_nsec;
};

static inline int
copyin_timeout32(void *addr, struct timespec *tsp)
{
	struct timespec32 ts32;
	int error;

	error = copyin(addr, &ts32, sizeof(struct timespec32));
	if (error == 0) {
		tsp->tv_sec = ts32.tv_sec;
		tsp->tv_nsec = ts32.tv_nsec;
	}
	return (error);
}
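
/*
 * Note (reading aid): copyin_timeout32() only widens the 32-bit
 * fields into the kernel's native struct timespec; range checking of
 * tv_nsec is left to the per-operation handlers below, which reject
 * values outside [0, 1000000000).  A negative 32-bit tv_nsec arrives
 * here as a large unsigned value and is likewise rejected by the
 * upper bound.
 */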

static int
__umtx_op_lock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
{
	struct timespec *ts, timeout;
	int error;

	/* Allow a null timespec (wait forever). */
	if (uap->uaddr2 == NULL)
		ts = NULL;
	else {
		error = copyin_timeout32(uap->uaddr2, &timeout);
		if (error != 0)
			return (error);
		if (timeout.tv_nsec >= 1000000000 ||
		    timeout.tv_nsec < 0) {
			return (EINVAL);
		}
		ts = &timeout;
	}
	return (do_lock_umtx32(td, uap->obj, uap->val, ts));
}

static int
__umtx_op_unlock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
{
	return (do_unlock_umtx32(td, uap->obj, (uint32_t)uap->val));
}

static int
__umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
{
	struct timespec *ts, timeout;
	int error;

	if (uap->uaddr2 == NULL)
		ts = NULL;
	else {
		error = copyin_timeout32(uap->uaddr2, &timeout);
		if (error != 0)
			return (error);
		if (timeout.tv_nsec >= 1000000000 ||
		    timeout.tv_nsec < 0)
			return (EINVAL);
		ts = &timeout;
	}
	return (do_wait(td, uap->obj, uap->val, ts, 1));
}

static int
__umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
{
	struct timespec *ts, timeout;
	int error;

	/* Allow a null timespec (wait forever). */
	if (uap->uaddr2 == NULL)
		ts = NULL;
	else {
		error = copyin_timeout32(uap->uaddr2, &timeout);
		if (error != 0)
			return (error);
		if (timeout.tv_nsec >= 1000000000 ||
		    timeout.tv_nsec < 0)
			return (EINVAL);
		ts = &timeout;
	}
	return (do_lock_umutex(td, uap->obj, ts, 0));
}

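/*
 * This table must stay in the same order as op_table above: the index
 * of each entry is the UMTX_OP_* value dispatched on in
 * freebsd32_umtx_op().
 */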
static _umtx_op_func op_table_compat32[] = {
	__umtx_op_lock_umtx_compat32,	/* UMTX_OP_LOCK */
	__umtx_op_unlock_umtx_compat32,	/* UMTX_OP_UNLOCK */
	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT */
	__umtx_op_wake,			/* UMTX_OP_WAKE */
	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
	__umtx_op_lock_umutex_compat32,	/* UMTX_OP_MUTEX_LOCK */
	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
	__umtx_op_set_ceiling		/* UMTX_OP_SET_CEILING */
};

int
freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap)
{
	if ((unsigned)uap->op < UMTX_OP_MAX)
		return ((*op_table_compat32[uap->op])(td,
			(struct _umtx_op_args *)uap));
	return (EINVAL);
}
#endif

void
umtx_thread_init(struct thread *td)
{
	td->td_umtxq = umtxq_alloc();
	td->td_umtxq->uq_thread = td;
}

void
umtx_thread_fini(struct thread *td)
{
	umtxq_free(td->td_umtxq);
}

/*
 * Called when a new thread is created, e.g. by fork().
 */
void
umtx_thread_alloc(struct thread *td)
{
	struct umtx_q *uq;

	uq = td->td_umtxq;
	uq->uq_inherited_pri = PRI_MAX;

	KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
	KASSERT(uq->uq_thread == td, ("uq_thread != td"));
	KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
	KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested),
	    ("uq_pi_contested is not empty"));
}

/*
 * exec() hook.
 */
static void
umtx_exec_hook(void *arg __unused, struct proc *p __unused,
	struct image_params *imgp __unused)
{
	umtx_thread_cleanup(curthread);
}

/*
 * thread_exit() hook.
 */
void
umtx_thread_exit(struct thread *td)
{
	umtx_thread_cleanup(td);
}

/*
 * Clean up umtx data.
 */
static void
umtx_thread_cleanup(struct thread *td)
{
	struct umtx_q *uq;
	struct umtx_pi *pi;

	if ((uq = td->td_umtxq) == NULL)
		return;

	mtx_lock_spin(&sched_lock);
	/* Reset any priority inherited through umtx objects. */
	uq->uq_inherited_pri = PRI_MAX;
	/* Disown contested PI mutexes still owned by this thread. */
	while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
		pi->pi_owner = NULL;
		TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
	}
	td->td_flags &= ~TDF_UBORROWING;
	mtx_unlock_spin(&sched_lock);
}
