kern_umtx.c revision 170368
/*-
 * Copyright (c) 2004, David Xu <davidxu@freebsd.org>
 * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/kern_umtx.c 170368 2007-06-06 07:35:08Z davidxu $");

#include "opt_compat.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/eventhandler.h>
#include <sys/umtx.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>

#include <machine/cpu.h>

#ifdef COMPAT_IA32
#include <compat/freebsd32/freebsd32_proto.h>
#endif

#define TYPE_SIMPLE_LOCK	0
#define TYPE_SIMPLE_WAIT	1
#define TYPE_NORMAL_UMUTEX	2
#define TYPE_PI_UMUTEX		3
#define TYPE_PP_UMUTEX		4
#define TYPE_CV			5

/* Key to represent a unique userland synchronization object */
struct umtx_key {
	int	hash;
	int	type;
	int	shared;
	union {
		struct {
			vm_object_t	object;
			uintptr_t	offset;
		} shared;
		struct {
			struct vmspace	*vs;
			uintptr_t	addr;
		} private;
		struct {
			void		*a;
			uintptr_t	b;
		} both;
	} info;
};

/* Priority inheritance mutex info. */
struct umtx_pi {
	/* Owner thread */
	struct thread		*pi_owner;

	/* Reference count */
	int			pi_refcount;

	/* List entry to link umtx being held by a thread */
	TAILQ_ENTRY(umtx_pi)	pi_link;

	/* List entry in hash */
	TAILQ_ENTRY(umtx_pi)	pi_hashlink;

	/* List for waiters */
	TAILQ_HEAD(,umtx_q)	pi_blocked;

	/* Identify a userland lock object */
	struct umtx_key		pi_key;
};

/* A userland synchronization object user. */
struct umtx_q {
	/* Linked list for the hash. */
	TAILQ_ENTRY(umtx_q)	uq_link;

	/* Umtx key. */
	struct umtx_key		uq_key;

	/* Umtx flags. */
	int			uq_flags;
#define UQF_UMTXQ	0x0001

	/* The thread that is waiting. */
	struct thread		*uq_thread;

	/*
	 * Blocked on a PI mutex.  Reads may be done while holding either
	 * the chain lock or umtx_lock; writes must hold both.
	 */
	struct umtx_pi		*uq_pi_blocked;

	/* On blocked list */
	TAILQ_ENTRY(umtx_q)	uq_lockq;

	/* PI mutexes that other threads are contending with us for */
	TAILQ_HEAD(,umtx_pi)	uq_pi_contested;

	/* Inherited priority from PP mutex */
	u_char			uq_inherited_pri;
};

TAILQ_HEAD(umtxq_head, umtx_q);

/* Userland lock object's wait-queue chain */
struct umtxq_chain {
	/* Lock for this chain. */
	struct mtx		uc_lock;

	/* List of sleep queues. */
	struct umtxq_head	uc_queue;

	/* Busy flag */
	char			uc_busy;

	/* Chain lock waiters */
	int			uc_waiters;

	/* All PIs in the list */
	TAILQ_HEAD(,umtx_pi)	uc_pi_list;
};

#define	UMTXQ_LOCKED_ASSERT(uc)		mtx_assert(&(uc)->uc_lock, MA_OWNED)

/*
 * Don't propagate time-sharing priority; there is a security reason:
 * a user can simply create a PI mutex, let thread A lock it, and let
 * another thread B block on it.  Because B is sleeping, its priority
 * would be boosted, which would boost A's priority via priority
 * propagation too, and A's priority would never be lowered even if it
 * were using 100% CPU.  This is unfair to other processes.
 */

#define UPRI(td)	(((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\
			  (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\
			 PRI_MAX_TIMESHARE : (td)->td_user_pri)

#define	GOLDEN_RATIO_PRIME	2654404609U
#define	UMTX_CHAINS		128
#define	UMTX_SHIFTS		(__WORD_BIT - 7)

#define THREAD_SHARE		0
#define PROCESS_SHARE		1
#define AUTO_SHARE		2

#define	GET_SHARE(flags)	\
    (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)

static uma_zone_t		umtx_pi_zone;
static struct umtxq_chain	umtxq_chains[UMTX_CHAINS];
static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
static int			umtx_pi_allocated;

SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
    &umtx_pi_allocated, 0, "Allocated umtx_pi");

static void umtxq_sysinit(void *);
static void umtxq_hash(struct umtx_key *key);
static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
static void umtxq_lock(struct umtx_key *key);
static void umtxq_unlock(struct umtx_key *key);
static void umtxq_busy(struct umtx_key *key);
static void umtxq_unbusy(struct umtx_key *key);
static void umtxq_insert(struct umtx_q *uq);
static void umtxq_remove(struct umtx_q *uq);
static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo);
static int umtxq_count(struct umtx_key *key);
static int umtxq_signal(struct umtx_key *key, int nr_wakeup);
static int umtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2);
static int umtx_key_get(void *addr, int type, int share,
	struct umtx_key *key);
static void umtx_key_release(struct umtx_key *key);
static struct umtx_pi *umtx_pi_alloc(int);
static void umtx_pi_free(struct umtx_pi *pi);
static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags);
static void umtx_thread_cleanup(struct thread *td);
static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
	struct image_params *imgp __unused);
SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);

static struct mtx umtx_lock;

static void
umtxq_sysinit(void *arg __unused)
{
	int i;

	umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	for (i = 0; i < UMTX_CHAINS; ++i) {
		mtx_init(&umtxq_chains[i].uc_lock, "umtxql", NULL,
			 MTX_DEF | MTX_DUPOK);
		TAILQ_INIT(&umtxq_chains[i].uc_queue);
		TAILQ_INIT(&umtxq_chains[i].uc_pi_list);
		umtxq_chains[i].uc_busy = 0;
		umtxq_chains[i].uc_waiters = 0;
	}
	mtx_init(&umtx_lock, "umtx lock", NULL, MTX_SPIN);
	EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
	    EVENTHANDLER_PRI_ANY);
}

struct umtx_q *
umtxq_alloc(void)
{
	struct umtx_q *uq;

	uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
	TAILQ_INIT(&uq->uq_pi_contested);
	uq->uq_inherited_pri = PRI_MAX;
	return (uq);
}

void
umtxq_free(struct umtx_q *uq)
{
	free(uq, M_UMTX);
}

static inline void
umtxq_hash(struct umtx_key *key)
{
	unsigned n = (uintptr_t)key->info.both.a + key->info.both.b;
	key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
}
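
/*
 * Illustrative note (not part of the original revision): the hash above
 * is Fibonacci hashing.  Assuming 32-bit unsigned arithmetic, the
 * multiplication by GOLDEN_RATIO_PRIME scatters nearby key values across
 * the upper bits of the product, and the shift by UMTX_SHIFTS
 * (__WORD_BIT - 7 == 25) keeps the top seven bits, i.e. a chain index in
 * [0, 127].  Two objects adjacent in memory, e.g. keys 0x1000 and 0x1008,
 * therefore usually land on different chains instead of clustering on
 * one, which keeps the per-chain queues and their locks evenly loaded.
 */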

static inline int
umtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2)
{
	return (k1->type == k2->type &&
		k1->info.both.a == k2->info.both.a &&
		k1->info.both.b == k2->info.both.b);
}

static inline struct umtxq_chain *
umtxq_getchain(struct umtx_key *key)
{
	return (&umtxq_chains[key->hash]);
}

/*
 * Set the chain to busy state when a following operation
 * may block (a kernel mutex can not be used).
 */
static inline void
umtxq_busy(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_assert(&uc->uc_lock, MA_OWNED);
	while (uc->uc_busy != 0) {
		uc->uc_waiters++;
		msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
		uc->uc_waiters--;
	}
	uc->uc_busy = 1;
}

/*
 * Unbusy a chain.
 */
static inline void
umtxq_unbusy(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_assert(&uc->uc_lock, MA_OWNED);
	KASSERT(uc->uc_busy != 0, ("not busy"));
	uc->uc_busy = 0;
	if (uc->uc_waiters)
		wakeup_one(uc);
}

/*
 * Lock a chain.
 */
static inline void
umtxq_lock(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_lock(&uc->uc_lock);
}

/*
 * Unlock a chain.
 */
static inline void
umtxq_unlock(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_unlock(&uc->uc_lock);
}

/*
 * Insert a thread onto the umtx queue.
 */
static inline void
umtxq_insert(struct umtx_q *uq)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_INSERT_TAIL(&uc->uc_queue, uq, uq_link);
	uq->uq_flags |= UQF_UMTXQ;
}

/*
 * Remove thread from the umtx queue.
 */
static inline void
umtxq_remove(struct umtx_q *uq)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	if (uq->uq_flags & UQF_UMTXQ) {
		TAILQ_REMOVE(&uc->uc_queue, uq, uq_link);
		uq->uq_flags &= ~UQF_UMTXQ;
	}
}

/*
 * Check if there are multiple waiters
 */
static int
umtxq_count(struct umtx_key *key)
{
	struct umtxq_chain *uc;
	struct umtx_q *uq;
	int count = 0;

	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_FOREACH(uq, &uc->uc_queue, uq_link) {
		if (umtx_key_match(&uq->uq_key, key)) {
			if (++count > 1)
				break;
		}
	}
	return (count);
}

/*
 * Check if there are multiple PI waiters and return the first
 * waiter.
 */
static int
umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
{
	struct umtxq_chain *uc;
	struct umtx_q *uq;
	int count = 0;

	*first = NULL;
	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_FOREACH(uq, &uc->uc_queue, uq_link) {
		if (umtx_key_match(&uq->uq_key, key)) {
			if (++count > 1)
				break;
			*first = uq;
		}
	}
	return (count);
}

/*
 * Wake up threads waiting on a userland object.
 */
static int
umtxq_signal(struct umtx_key *key, int n_wake)
{
	struct umtxq_chain *uc;
	struct umtx_q *uq, *next;
	int ret;

	ret = 0;
	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_FOREACH_SAFE(uq, &uc->uc_queue, uq_link, next) {
		if (umtx_key_match(&uq->uq_key, key)) {
			umtxq_remove(uq);
			wakeup(uq);
			if (++ret >= n_wake)
				break;
		}
	}
	return (ret);
}

/*
 * Wake up specified thread.
 */
static inline void
umtxq_signal_thread(struct umtx_q *uq)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	umtxq_remove(uq);
	wakeup(uq);
}

/*
 * Put the thread into a sleep state; before sleeping, check whether
 * the thread was removed from the umtx queue.
 */
static inline int
umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo)
{
	struct umtxq_chain *uc;
	int error;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	if (!(uq->uq_flags & UQF_UMTXQ))
		return (0);
	error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
	if (error == EWOULDBLOCK)
		error = ETIMEDOUT;
	return (error);
}

/*
 * Convert userspace address into unique logical address.
 */
static int
umtx_key_get(void *addr, int type, int share, struct umtx_key *key)
{
	struct thread *td = curthread;
	vm_map_t map;
	vm_map_entry_t entry;
	vm_pindex_t pindex;
	vm_prot_t prot;
	boolean_t wired;

	key->type = type;
	if (share == THREAD_SHARE) {
		key->shared = 0;
		key->info.private.vs = td->td_proc->p_vmspace;
		key->info.private.addr = (uintptr_t)addr;
	} else {
		MPASS(share == PROCESS_SHARE || share == AUTO_SHARE);
		map = &td->td_proc->p_vmspace->vm_map;
		if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
		    &entry, &key->info.shared.object, &pindex, &prot,
		    &wired) != KERN_SUCCESS) {
			return (EFAULT);
		}

		if ((share == PROCESS_SHARE) ||
		    (share == AUTO_SHARE &&
		     VM_INHERIT_SHARE == entry->inheritance)) {
			key->shared = 1;
			key->info.shared.offset = entry->offset + entry->start -
				(vm_offset_t)addr;
			vm_object_reference(key->info.shared.object);
		} else {
			key->shared = 0;
			key->info.private.vs = td->td_proc->p_vmspace;
			key->info.private.addr = (uintptr_t)addr;
		}
		vm_map_lookup_done(map, entry);
	}

	umtxq_hash(key);
	return (0);
}

/*
 * Release key.
 */
static inline void
umtx_key_release(struct umtx_key *key)
{
	if (key->shared)
		vm_object_deallocate(key->info.shared.object);
}

/*
 * Lock a umtx object.
 */
static int
_do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id, int timo)
{
	struct umtx_q *uq;
	u_long owner;
	u_long old;
	int error = 0;

	uq = td->td_umtxq;

	/*
	 * Care must be exercised when dealing with the umtx structure.
	 * It can fault on any access.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword(&umtx->u_owner, UMTX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMTX_UNOWNED)
			return (0);

		/* The address was invalid. */
		if (owner == -1)
			return (EFAULT);

		/* If no one owns it but it is contested try to acquire it. */
		if (owner == UMTX_CONTESTED) {
			owner = casuword(&umtx->u_owner,
			    UMTX_CONTESTED, id | UMTX_CONTESTED);

			if (owner == UMTX_CONTESTED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			/* If this failed the lock has changed, restart. */
			continue;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK,
			AUTO_SHARE, &uq->uq_key)) != 0)
			return (error);

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword(&umtx->u_owner, owner, owner | UMTX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * If we set the contested bit, sleep; otherwise the lock
		 * changed and we need to retry, or we lost a race to the
		 * thread unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtx", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);
	}

	return (0);
}

/*
 * Lock a umtx object.
 */
static int
do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id,
	struct timespec *timeout)
{
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	int error;

	if (timeout == NULL) {
		error = _do_lock_umtx(td, umtx, id, 0);
		/* Mutex locking is restarted if it is interrupted. */
		if (error == EINTR)
			error = ERESTART;
	} else {
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
		for (;;) {
			error = _do_lock_umtx(td, umtx, id, tvtohz(&tv));
			if (error != ETIMEDOUT)
				break;
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				break;
			}
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
		}
		/* Timed-locking is not restarted. */
		if (error == ERESTART)
			error = EINTR;
	}
	return (error);
}

/*
 * Unlock a umtx object.
 */
static int
do_unlock_umtx(struct thread *td, struct umtx *umtx, u_long id)
{
	struct umtx_key key;
	u_long owner;
	u_long old;
	int error;
	int count;

	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword(__DEVOLATILE(u_long *, &umtx->u_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMTX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMTX_CONTESTED) == 0) {
		old = casuword(&umtx->u_owner, owner, UMTX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK, AUTO_SHARE,
		&key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * no more than one thread is waiting on it; otherwise, it
	 * must be marked as contested.
	 */
	old = casuword(&umtx->u_owner, owner,
		count <= 1 ? UMTX_UNOWNED : UMTX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}

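/*
 * Illustrative userland sketch of the protocol implemented above; this
 * is an assumption-laden example, not part of the original revision.
 * The fast path is a single CAS in user space; the kernel is entered
 * only on contention, via the _umtx_lock()/_umtx_unlock() syscalls:
 *
 *	#include <sys/types.h>
 *	#include <sys/umtx.h>
 *	#include <machine/atomic.h>
 *
 *	static void
 *	my_lock(struct umtx *mtx, u_long tid)
 *	{
 *		// Uncontested: CAS UMTX_UNOWNED -> tid, no syscall.
 *		if (!atomic_cmpset_acq_long(&mtx->u_owner, UMTX_UNOWNED, tid))
 *			_umtx_lock(mtx);	// contested: kernel queues us
 *	}
 *
 *	static void
 *	my_unlock(struct umtx *mtx, u_long tid)
 *	{
 *		// Uncontested: CAS tid -> UMTX_UNOWNED, no syscall.
 *		if (!atomic_cmpset_rel_long(&mtx->u_owner, tid, UMTX_UNOWNED))
 *			_umtx_unlock(mtx);	// wake a waiter via kernel
 *	}
 *
 * The UMTX_CONTESTED bit that _do_lock_umtx() sets with casuword() is
 * what makes the unlock CAS fail, forcing the release through
 * do_unlock_umtx() so that a waiter is woken.
 */
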
#ifdef COMPAT_IA32

/*
 * Lock a umtx object.
 */
static int
_do_lock_umtx32(struct thread *td, uint32_t *m, uint32_t id, int timo)
{
	struct umtx_q *uq;
	uint32_t owner;
	uint32_t old;
	int error = 0;

	uq = td->td_umtxq;

	/*
	 * Care must be exercised when dealing with the umtx structure.
	 * It can fault on any access.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword32(m, UMUTEX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMUTEX_UNOWNED)
			return (0);

		/* The address was invalid. */
		if (owner == -1)
			return (EFAULT);

		/* If no one owns it but it is contested try to acquire it. */
		if (owner == UMUTEX_CONTESTED) {
			owner = casuword32(m,
			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
			if (owner == UMUTEX_CONTESTED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			/* If this failed the lock has changed, restart. */
			continue;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK,
			AUTO_SHARE, &uq->uq_key)) != 0)
			return (error);

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword32(m, owner, owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * If we set the contested bit, sleep; otherwise the lock
		 * changed and we need to retry, or we lost a race to the
		 * thread unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtx", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);
	}

	return (0);
}

/*
 * Lock a umtx object.
 */
static int
do_lock_umtx32(struct thread *td, void *m, uint32_t id,
	struct timespec *timeout)
{
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	int error;

	if (timeout == NULL) {
		error = _do_lock_umtx32(td, m, id, 0);
		/* Mutex locking is restarted if it is interrupted. */
		if (error == EINTR)
			error = ERESTART;
	} else {
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
		for (;;) {
			error = _do_lock_umtx32(td, m, id, tvtohz(&tv));
			if (error != ETIMEDOUT)
				break;
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				break;
			}
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
		}
		/* Timed-locking is not restarted. */
		if (error == ERESTART)
			error = EINTR;
	}
	return (error);
}

/*
 * Unlock a umtx object.
 */
static int
do_unlock_umtx32(struct thread *td, uint32_t *m, uint32_t id)
{
	struct umtx_key key;
	uint32_t owner;
	uint32_t old;
	int error;
	int count;

	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(m);
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMUTEX_CONTESTED) == 0) {
		old = casuword32(m, owner, UMUTEX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK, AUTO_SHARE,
		&key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * no more than one thread is waiting on it; otherwise, it
	 * must be marked as contested.
	 */
	old = casuword32(m, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}
#endif

/*
 * Fetch and compare a value; sleep on the address if the value
 * has not changed.
 */
static int
do_wait(struct thread *td, void *addr, u_long id,
	struct timespec *timeout, int compat32)
{
	struct umtx_q *uq;
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	u_long tmp;
	int error = 0;

	uq = td->td_umtxq;
	if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT, AUTO_SHARE,
	    &uq->uq_key)) != 0)
		return (error);

	umtxq_lock(&uq->uq_key);
	umtxq_insert(uq);
	umtxq_unlock(&uq->uq_key);
	if (compat32 == 0)
		tmp = fuword(addr);
	else
		tmp = fuword32(addr);
	if (tmp != id) {
		umtxq_lock(&uq->uq_key);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	} else if (timeout == NULL) {
		umtxq_lock(&uq->uq_key);
		error = umtxq_sleep(uq, "uwait", 0);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	} else {
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
		umtxq_lock(&uq->uq_key);
		for (;;) {
			error = umtxq_sleep(uq, "uwait", tvtohz(&tv));
			if (!(uq->uq_flags & UQF_UMTXQ))
				break;
			if (error != ETIMEDOUT)
				break;
			umtxq_unlock(&uq->uq_key);
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				umtxq_lock(&uq->uq_key);
				break;
			}
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
			umtxq_lock(&uq->uq_key);
		}
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	}
	umtx_key_release(&uq->uq_key);
	if (error == ERESTART)
		error = EINTR;
	return (error);
}

/*
 * Wake up threads sleeping on the specified address.
 */
int
kern_umtx_wake(struct thread *td, void *uaddr, int n_wake)
{
	struct umtx_key key;
	int ret;

	if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT, AUTO_SHARE,
	   &key)) != 0)
		return (ret);
	umtxq_lock(&key);
	ret = umtxq_signal(&key, n_wake);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	return (0);
}
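
/*
 * Illustrative sketch (an assumption, not part of the original
 * revision): do_wait() and kern_umtx_wake() behave like a futex.  The
 * waiter passes the value it last observed; the kernel queues the
 * thread first and only then re-reads the word, so a wakeup issued
 * between the userland read and the syscall cannot be lost:
 *
 *	while (flag == 0)	// userland waiter
 *		_umtx_op(&flag, UMTX_OP_WAIT, 0, NULL, NULL);
 *
 *	flag = 1;		// userland waker
 *	_umtx_op(&flag, UMTX_OP_WAKE, 1, NULL, NULL);
 */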

/*
 * Lock a PTHREAD_PRIO_NONE protocol POSIX mutex.
 */
static int
_do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags, int timo,
	int try)
{
	struct umtx_q *uq;
	uint32_t owner, old, id;
	int error = 0;

	id = td->td_tid;
	uq = td->td_umtxq;

	/*
	 * Care must be exercised when dealing with the umtx structure.
	 * It can fault on any access.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMUTEX_UNOWNED)
			return (0);

		/* The address was invalid. */
		if (owner == -1)
			return (EFAULT);

		/* If no one owns it but it is contested try to acquire it. */
		if (owner == UMUTEX_CONTESTED) {
			owner = casuword32(&m->m_owner,
			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

			if (owner == UMUTEX_CONTESTED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			/* If this failed the lock has changed, restart. */
			continue;
		}

		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
		    (owner & ~UMUTEX_CONTESTED) == id)
			return (EDEADLK);

		if (try != 0)
			return (EBUSY);

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
		    GET_SHARE(flags), &uq->uq_key)) != 0)
			return (error);

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * If we set the contested bit, sleep; otherwise the lock
		 * changed and we need to retry, or we lost a race to the
		 * thread unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtxn", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);
	}

	return (0);
}

/*
 * Unlock a PTHREAD_PRIO_NONE protocol POSIX mutex.
 */
static int
do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags)
{
	struct umtx_key key;
	uint32_t owner, old, id;
	int error;
	int count;

	id = td->td_tid;
	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMUTEX_CONTESTED) == 0) {
		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * no more than one thread is waiting on it; otherwise, it
	 * must be marked as contested.
	 */
	old = casuword32(&m->m_owner, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}

static inline struct umtx_pi *
umtx_pi_alloc(int flags)
{
	struct umtx_pi *pi;

	pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags);
	TAILQ_INIT(&pi->pi_blocked);
	atomic_add_int(&umtx_pi_allocated, 1);
	return (pi);
}

static inline void
umtx_pi_free(struct umtx_pi *pi)
{
	uma_zfree(umtx_pi_zone, pi);
	atomic_add_int(&umtx_pi_allocated, -1);
}

/*
 * Adjust the thread's position on the blocked list of a PI mutex
 * after its priority has been changed.
 */
static int
umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
{
	struct umtx_q *uq, *uq1, *uq2;
	struct thread *td1;

	mtx_assert(&umtx_lock, MA_OWNED);
	if (pi == NULL)
		return (0);

	uq = td->td_umtxq;

	/*
	 * Check if the thread needs to be moved on the blocked chain.
	 * It needs to be moved if either its priority is lower than
	 * the previous thread or higher than the next thread.
	 */
	uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
	uq2 = TAILQ_NEXT(uq, uq_lockq);
	if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
	    (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
		/*
		 * Remove thread from blocked chain and determine where
		 * it should be moved to.
		 */
		TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
		TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
			td1 = uq1->uq_thread;
			MPASS(td1->td_proc->p_magic == P_MAGIC);
			if (UPRI(td1) > UPRI(td))
				break;
		}

		if (uq1 == NULL)
			TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
		else
			TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
	}
	return (1);
}

/*
 * Propagate priority when a thread is blocked on POSIX
 * PI mutex.
 */
static void
umtx_propagate_priority(struct thread *td)
{
	struct umtx_q *uq;
	struct umtx_pi *pi;
	int pri;

	mtx_assert(&umtx_lock, MA_OWNED);
	pri = UPRI(td);
	uq = td->td_umtxq;
	pi = uq->uq_pi_blocked;
	if (pi == NULL)
		return;

	for (;;) {
		td = pi->pi_owner;
		if (td == NULL)
			return;

		MPASS(td->td_proc != NULL);
		MPASS(td->td_proc->p_magic == P_MAGIC);

		if (UPRI(td) <= pri)
			return;

		thread_lock(td);
		sched_lend_user_prio(td, pri);
		thread_unlock(td);

		/*
		 * Pick up the lock that td is blocked on.
		 */
		uq = td->td_umtxq;
		pi = uq->uq_pi_blocked;
		/* Resort td on the list if needed. */
		if (!umtx_pi_adjust_thread(pi, td))
			break;
	}
}
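
/*
 * Worked example of the loop above (illustrative, not part of the
 * original revision): suppose thread A (user priority 120) owns PI
 * mutex M1 and is itself blocked on M2, owned by thread B (priority
 * 130).  When thread C (priority 100; numerically lower is stronger)
 * blocks on M1, the loop lends priority 100 to A; because A is blocked
 * on a PI mutex, it follows A's uq_pi_blocked to M2 and lends 100 to B
 * as well.  The walk stops at the first owner already running at or
 * above the propagated priority, or at an owner that is not blocked on
 * a PI mutex.
 */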

/*
 * Unpropagate priority for a PI mutex when a thread blocked on
 * it is interrupted by signal or resumed by others.
 */
static void
umtx_unpropagate_priority(struct umtx_pi *pi)
{
	struct umtx_q *uq, *uq_owner;
	struct umtx_pi *pi2;
	int pri;

	mtx_assert(&umtx_lock, MA_OWNED);

	while (pi != NULL && pi->pi_owner != NULL) {
		pri = PRI_MAX;
		uq_owner = pi->pi_owner->td_umtxq;

		TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
			uq = TAILQ_FIRST(&pi2->pi_blocked);
			if (uq != NULL) {
				if (pri > UPRI(uq->uq_thread))
					pri = UPRI(uq->uq_thread);
			}
		}

		if (pri > uq_owner->uq_inherited_pri)
			pri = uq_owner->uq_inherited_pri;
		thread_lock(pi->pi_owner);
		sched_unlend_user_prio(pi->pi_owner, pri);
		thread_unlock(pi->pi_owner);
		pi = uq_owner->uq_pi_blocked;
	}
}

/*
 * Insert a PI mutex into owned list.
 */
static void
umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
{
	struct umtx_q *uq_owner;

	uq_owner = owner->td_umtxq;
	mtx_assert(&umtx_lock, MA_OWNED);
	if (pi->pi_owner != NULL)
		panic("pi_owner != NULL");
	pi->pi_owner = owner;
	TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
}

/*
 * Claim ownership of a PI mutex.
 */
static int
umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
{
	struct umtx_q *uq, *uq_owner;

	uq_owner = owner->td_umtxq;
	mtx_lock_spin(&umtx_lock);
	if (pi->pi_owner == owner) {
		mtx_unlock_spin(&umtx_lock);
		return (0);
	}

	if (pi->pi_owner != NULL) {
		/*
		 * userland may have already messed up the mutex, sigh.
		 */
		mtx_unlock_spin(&umtx_lock);
		return (EPERM);
	}
	umtx_pi_setowner(pi, owner);
	uq = TAILQ_FIRST(&pi->pi_blocked);
	if (uq != NULL) {
		int pri;

		pri = UPRI(uq->uq_thread);
		thread_lock(owner);
		if (pri < UPRI(owner))
			sched_lend_user_prio(owner, pri);
		thread_unlock(owner);
	}
	mtx_unlock_spin(&umtx_lock);
	return (0);
}

/*
 * Adjust a thread's position in the queue of the PI mutex it is
 * blocked on; this may start a new round of priority propagation.
 */
void
umtx_pi_adjust(struct thread *td, u_char oldpri)
{
	struct umtx_q *uq;
	struct umtx_pi *pi;

	uq = td->td_umtxq;

	mtx_assert(&umtx_lock, MA_OWNED);
	MPASS(TD_ON_UPILOCK(td));

	/*
	 * Pick up the lock that td is blocked on.
	 */
	pi = uq->uq_pi_blocked;
	MPASS(pi != NULL);

	/* Resort the turnstile on the list. */
	if (!umtx_pi_adjust_thread(pi, td))
		return;

	/*
	 * If our priority was lowered and we are at the head of the
	 * turnstile, then propagate our new priority up the chain.
	 */
	if (uq == TAILQ_FIRST(&pi->pi_blocked) && UPRI(td) < oldpri)
		umtx_propagate_priority(td);
}

/*
 * Sleep on a PI mutex.
 */
static int
umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi,
	uint32_t owner, const char *wmesg, int timo)
{
	struct umtxq_chain *uc;
	struct thread *td, *td1;
	struct umtx_q *uq1;
	int pri;
	int error = 0;

	td = uq->uq_thread;
	KASSERT(td == curthread, ("inconsistent uq_thread"));
	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	umtxq_insert(uq);
	if (pi->pi_owner == NULL) {
		/* XXX
		 * Currently, we only support process-private PI mutexes;
		 * non-contended PI mutexes are locked in userland.
		 * Process-shared PI mutexes should always be initialized
		 * by the kernel and registered in the kernel, and locking
		 * should always be done by the kernel to avoid security
		 * problems.  For a process-private PI mutex, we can find
		 * the owner thread and boost its priority safely.
		 */
		PROC_LOCK(curproc);
		td1 = thread_find(curproc, owner);
		mtx_lock_spin(&umtx_lock);
		if (td1 != NULL && pi->pi_owner == NULL) {
			uq1 = td1->td_umtxq;
			umtx_pi_setowner(pi, td1);
		}
		PROC_UNLOCK(curproc);
	} else {
		mtx_lock_spin(&umtx_lock);
	}

	TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
		pri = UPRI(uq1->uq_thread);
		if (pri > UPRI(td))
			break;
	}

	if (uq1 != NULL)
		TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
	else
		TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);

	uq->uq_pi_blocked = pi;
	td->td_flags |= TDF_UPIBLOCKED;
	mtx_unlock_spin(&umtx_lock);
	umtxq_unlock(&uq->uq_key);

	mtx_lock_spin(&umtx_lock);
	umtx_propagate_priority(td);
	mtx_unlock_spin(&umtx_lock);

	umtxq_lock(&uq->uq_key);
	if (uq->uq_flags & UQF_UMTXQ) {
		error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
		if (error == EWOULDBLOCK)
			error = ETIMEDOUT;
		if (uq->uq_flags & UQF_UMTXQ) {
			umtxq_busy(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unbusy(&uq->uq_key);
		}
	}
	umtxq_unlock(&uq->uq_key);

	mtx_lock_spin(&umtx_lock);
	uq->uq_pi_blocked = NULL;
	td->td_flags &= ~TDF_UPIBLOCKED;
	TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
	umtx_unpropagate_priority(pi);
	mtx_unlock_spin(&umtx_lock);

	umtxq_lock(&uq->uq_key);

	return (error);
}

/*
 * Add reference count for a PI mutex.
 */
static void
umtx_pi_ref(struct umtx_pi *pi)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&pi->pi_key);
	UMTXQ_LOCKED_ASSERT(uc);
	pi->pi_refcount++;
}

/*
 * Decrease the reference count for a PI mutex; if the counter
 * drops to zero, its memory is freed.
 */
static void
umtx_pi_unref(struct umtx_pi *pi)
{
	struct umtxq_chain *uc;
	int free = 0;

	uc = umtxq_getchain(&pi->pi_key);
	UMTXQ_LOCKED_ASSERT(uc);
	KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
	if (--pi->pi_refcount == 0) {
		mtx_lock_spin(&umtx_lock);
		if (pi->pi_owner != NULL) {
			TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested,
				pi, pi_link);
			pi->pi_owner = NULL;
		}
		KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
			("blocked queue not empty"));
		mtx_unlock_spin(&umtx_lock);
		TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
		free = 1;
	}
	if (free)
		umtx_pi_free(pi);
}

/*
 * Find a PI mutex in hash table.
 */
static struct umtx_pi *
umtx_pi_lookup(struct umtx_key *key)
{
	struct umtxq_chain *uc;
	struct umtx_pi *pi;

	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);

	TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
		if (umtx_key_match(&pi->pi_key, key)) {
			return (pi);
		}
	}
	return (NULL);
}

/*
 * Insert a PI mutex into hash table.
 */
static inline void
umtx_pi_insert(struct umtx_pi *pi)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&pi->pi_key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
}

/*
 * Lock a PI mutex.
 */
static int
_do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags, int timo,
	int try)
{
	struct umtx_q *uq;
	struct umtx_pi *pi, *new_pi;
	uint32_t id, owner, old;
	int error;

	id = td->td_tid;
	uq = td->td_umtxq;

	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
	    &uq->uq_key)) != 0)
		return (error);
	umtxq_lock(&uq->uq_key);
	pi = umtx_pi_lookup(&uq->uq_key);
	if (pi == NULL) {
		new_pi = umtx_pi_alloc(M_NOWAIT);
		if (new_pi == NULL) {
			umtxq_unlock(&uq->uq_key);
			new_pi = umtx_pi_alloc(M_WAITOK);
			new_pi->pi_key = uq->uq_key;
			umtxq_lock(&uq->uq_key);
			pi = umtx_pi_lookup(&uq->uq_key);
			if (pi != NULL) {
				umtx_pi_free(new_pi);
				new_pi = NULL;
			}
		}
		if (new_pi != NULL) {
			new_pi->pi_key = uq->uq_key;
			umtx_pi_insert(new_pi);
			pi = new_pi;
		}
	}
	umtx_pi_ref(pi);
	umtxq_unlock(&uq->uq_key);

	/*
	 * Care must be exercised when dealing with the umtx structure.
	 * It can fault on any access.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMUTEX_UNOWNED) {
			error = 0;
			break;
		}

		/* The address was invalid. */
		if (owner == -1) {
			error = EFAULT;
			break;
		}

		/* If no one owns it but it is contested try to acquire it. */
		if (owner == UMUTEX_CONTESTED) {
			owner = casuword32(&m->m_owner,
			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

			if (owner == UMUTEX_CONTESTED) {
				umtxq_lock(&uq->uq_key);
				error = umtx_pi_claim(pi, td);
				umtxq_unlock(&uq->uq_key);
				break;
			}

			/* The address was invalid. */
			if (owner == -1) {
				error = EFAULT;
				break;
			}

			/* If this failed the lock has changed, restart. */
			continue;
		}

		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
		    (owner & ~UMUTEX_CONTESTED) == id) {
			error = EDEADLK;
			break;
		}

		if (try != 0) {
			error = EBUSY;
			break;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			break;

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_unbusy(&uq->uq_key);
			umtxq_unlock(&uq->uq_key);
			error = EFAULT;
			break;
		}

		umtxq_lock(&uq->uq_key);
		umtxq_unbusy(&uq->uq_key);
		/*
		 * If we set the contested bit, sleep; otherwise the lock
		 * changed and we need to retry, or we lost a race to the
		 * thread unlocking the umtx.
		 */
		if (old == owner)
			error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED,
				 "umtxpi", timo);
		umtxq_unlock(&uq->uq_key);
	}

	umtxq_lock(&uq->uq_key);
	umtx_pi_unref(pi);
	umtxq_unlock(&uq->uq_key);

	umtx_key_release(&uq->uq_key);
	return (error);
}

/*
 * Unlock a PI mutex.
 */
static int
do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags)
{
	struct umtx_key key;
	struct umtx_q *uq_first, *uq_first2, *uq_me;
	struct umtx_pi *pi, *pi2;
	uint32_t owner, old, id;
	int error;
	int count;
	int pri;

	id = td->td_tid;
	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMUTEX_CONTESTED) == 0) {
		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count_pi(&key, &uq_first);
	if (uq_first != NULL) {
		pi = uq_first->uq_pi_blocked;
		if (pi->pi_owner != curthread) {
			umtxq_unbusy(&key);
			umtxq_unlock(&key);
			/* userland messed up the mutex */
			return (EPERM);
		}
		uq_me = curthread->td_umtxq;
		mtx_lock_spin(&umtx_lock);
		pi->pi_owner = NULL;
		TAILQ_REMOVE(&uq_me->uq_pi_contested, pi, pi_link);
		uq_first = TAILQ_FIRST(&pi->pi_blocked);
		pri = PRI_MAX;
		TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
			uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
			if (uq_first2 != NULL) {
				if (pri > UPRI(uq_first2->uq_thread))
					pri = UPRI(uq_first2->uq_thread);
			}
		}
		thread_lock(curthread);
		sched_unlend_user_prio(curthread, pri);
		thread_unlock(curthread);
		mtx_unlock_spin(&umtx_lock);
	}
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * no more than one thread is waiting on it; otherwise, it
	 * must be marked as contested.
	 */
	old = casuword32(&m->m_owner, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);

	umtxq_lock(&key);
	if (uq_first != NULL)
		umtxq_signal_thread(uq_first);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}

/*
 * Lock a PP mutex.
 */
static int
_do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags, int timo,
	int try)
{
	struct umtx_q *uq, *uq2;
	struct umtx_pi *pi;
	uint32_t ceiling;
	uint32_t owner, id;
	int error, pri, old_inherited_pri, su;

	id = td->td_tid;
	uq = td->td_umtxq;
	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
	    &uq->uq_key)) != 0)
		return (error);
	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
	for (;;) {
		old_inherited_pri = uq->uq_inherited_pri;
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		ceiling = RTP_PRIO_MAX - fuword32(&m->m_ceilings[0]);
		if (ceiling > RTP_PRIO_MAX) {
			error = EINVAL;
			goto out;
		}

		mtx_lock_spin(&umtx_lock);
		if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
			mtx_unlock_spin(&umtx_lock);
			error = EINVAL;
			goto out;
		}
		if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
			uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
			thread_lock(td);
			if (uq->uq_inherited_pri < UPRI(td))
				sched_lend_user_prio(td, uq->uq_inherited_pri);
			thread_unlock(td);
		}
		mtx_unlock_spin(&umtx_lock);

		owner = casuword32(&m->m_owner,
		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

		if (owner == UMUTEX_CONTESTED) {
			error = 0;
			break;
		}

		/* The address was invalid. */
		if (owner == -1) {
			error = EFAULT;
			break;
		}

		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
		    (owner & ~UMUTEX_CONTESTED) == id) {
			error = EDEADLK;
			break;
		}

		if (try != 0) {
			error = EBUSY;
			break;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			break;

		umtxq_lock(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		error = umtxq_sleep(uq, "umtxpp", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);

		mtx_lock_spin(&umtx_lock);
		uq->uq_inherited_pri = old_inherited_pri;
		pri = PRI_MAX;
		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
			uq2 = TAILQ_FIRST(&pi->pi_blocked);
			if (uq2 != NULL) {
				if (pri > UPRI(uq2->uq_thread))
					pri = UPRI(uq2->uq_thread);
			}
		}
		if (pri > uq->uq_inherited_pri)
			pri = uq->uq_inherited_pri;
		thread_lock(td);
		sched_unlend_user_prio(td, pri);
		thread_unlock(td);
		mtx_unlock_spin(&umtx_lock);
	}

	if (error != 0) {
		mtx_lock_spin(&umtx_lock);
		uq->uq_inherited_pri = old_inherited_pri;
		pri = PRI_MAX;
		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
			uq2 = TAILQ_FIRST(&pi->pi_blocked);
			if (uq2 != NULL) {
				if (pri > UPRI(uq2->uq_thread))
					pri = UPRI(uq2->uq_thread);
			}
		}
		if (pri > uq->uq_inherited_pri)
			pri = uq->uq_inherited_pri;
		thread_lock(td);
		sched_unlend_user_prio(td, pri);
		thread_unlock(td);
		mtx_unlock_spin(&umtx_lock);
	}

out:
	umtxq_lock(&uq->uq_key);
	umtxq_unbusy(&uq->uq_key);
	umtxq_unlock(&uq->uq_key);
	umtx_key_release(&uq->uq_key);
	return (error);
}
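
/*
 * Worked example of the ceiling arithmetic above (illustrative, not
 * part of the original revision).  The user-supplied ceiling in
 * m_ceilings[0] follows the POSIX convention in which a larger number
 * means a stronger priority, while kernel priorities are inverted
 * (numerically smaller is stronger), so the value is flipped and
 * rebased:
 *
 *	ceiling = RTP_PRIO_MAX - m_ceilings[0];
 *	kernel priority = PRI_MIN_REALTIME + ceiling;
 *
 * e.g. a user ceiling of RTP_PRIO_MAX (31) maps to PRI_MIN_REALTIME,
 * the strongest priority the mutex may lend its holder.  Because the
 * subtraction is unsigned, any out-of-range user value wraps far above
 * RTP_PRIO_MAX and is rejected with EINVAL.
 */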

/*
 * Unlock a PP mutex.
 */
static int
do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags)
{
	struct umtx_key key;
	struct umtx_q *uq, *uq2;
	struct umtx_pi *pi;
	uint32_t owner, id;
	uint32_t rceiling;
	int error, pri, new_inherited_pri, su;

	id = td->td_tid;
	uq = td->td_umtxq;
	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);

	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
	if (error != 0)
		return (error);

	if (rceiling == -1)
		new_inherited_pri = PRI_MAX;
	else {
		rceiling = RTP_PRIO_MAX - rceiling;
		if (rceiling > RTP_PRIO_MAX)
			return (EINVAL);
		new_inherited_pri = PRI_MIN_REALTIME + rceiling;
	}

	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);
	umtxq_lock(&key);
	umtxq_busy(&key);
	umtxq_unlock(&key);
	/*
	 * For a priority-protected mutex, always set the unlocked state
	 * to UMUTEX_CONTESTED so that userland always enters the kernel
	 * to lock the mutex.  This is necessary because thread priority
	 * has to be adjusted for such mutexes.
	 */
	error = suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
		UMUTEX_CONTESTED);

	umtxq_lock(&key);
	if (error == 0)
		umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);

	if (error == -1)
		error = EFAULT;
	else {
		mtx_lock_spin(&umtx_lock);
		if (su != 0)
			uq->uq_inherited_pri = new_inherited_pri;
		pri = PRI_MAX;
		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
			uq2 = TAILQ_FIRST(&pi->pi_blocked);
			if (uq2 != NULL) {
				if (pri > UPRI(uq2->uq_thread))
					pri = UPRI(uq2->uq_thread);
			}
		}
		if (pri > uq->uq_inherited_pri)
			pri = uq->uq_inherited_pri;
		thread_lock(td);
		sched_unlend_user_prio(td, pri);
		thread_unlock(td);
		mtx_unlock_spin(&umtx_lock);
	}
	umtx_key_release(&key);
	return (error);
}

static int
do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
	uint32_t *old_ceiling)
{
	struct umtx_q *uq;
	uint32_t save_ceiling;
	uint32_t owner, id;
	uint32_t flags;
	int error;

	flags = fuword32(&m->m_flags);
	if ((flags & UMUTEX_PRIO_PROTECT) == 0)
		return (EINVAL);
	if (ceiling > RTP_PRIO_MAX)
		return (EINVAL);
	id = td->td_tid;
	uq = td->td_umtxq;
	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
	   &uq->uq_key)) != 0)
		return (error);
	for (;;) {
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		save_ceiling = fuword32(&m->m_ceilings[0]);

		owner = casuword32(&m->m_owner,
		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

		if (owner == UMUTEX_CONTESTED) {
			suword32(&m->m_ceilings[0], ceiling);
			suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
				UMUTEX_CONTESTED);
			error = 0;
			break;
		}

		/* The address was invalid. */
		if (owner == -1) {
			error = EFAULT;
			break;
		}

		if ((owner & ~UMUTEX_CONTESTED) == id) {
			suword32(&m->m_ceilings[0], ceiling);
			error = 0;
			break;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			break;

		/*
		 * If we set the contested bit, sleep; otherwise the lock
		 * changed and we need to retry, or we lost a race to the
		 * thread unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		error = umtxq_sleep(uq, "umtxpp", 0);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	}
	umtxq_lock(&uq->uq_key);
	if (error == 0)
		umtxq_signal(&uq->uq_key, INT_MAX);
	umtxq_unbusy(&uq->uq_key);
	umtxq_unlock(&uq->uq_key);
	umtx_key_release(&uq->uq_key);
	if (error == 0 && old_ceiling != NULL)
		suword32(old_ceiling, save_ceiling);
	return (error);
}

static int
_do_lock_umutex(struct thread *td, struct umutex *m, int flags, int timo,
	int try)
{
	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
	case 0:
		return (_do_lock_normal(td, m, flags, timo, try));
	case UMUTEX_PRIO_INHERIT:
		return (_do_lock_pi(td, m, flags, timo, try));
	case UMUTEX_PRIO_PROTECT:
		return (_do_lock_pp(td, m, flags, timo, try));
	}
	return (EINVAL);
}

/*
 * Lock a userland POSIX mutex.
 */
static int
do_lock_umutex(struct thread *td, struct umutex *m,
	struct timespec *timeout, int try)
{
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	uint32_t flags;
	int error;

	flags = fuword32(&m->m_flags);
	if (flags == -1)
		return (EFAULT);

	if (timeout == NULL) {
		error = _do_lock_umutex(td, m, flags, 0, try);
		/* Mutex locking is restarted if it is interrupted. */
		if (error == EINTR)
			error = ERESTART;
	} else {
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
		for (;;) {
			error = _do_lock_umutex(td, m, flags, tvtohz(&tv), try);
			if (error != ETIMEDOUT)
				break;
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				break;
			}
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
		}
		/* Timed-locking is not restarted. */
		if (error == ERESTART)
			error = EINTR;
	}
	return (error);
}

/*
 * Unlock a userland POSIX mutex.
 */
static int
do_unlock_umutex(struct thread *td, struct umutex *m)
{
	uint32_t flags;

	flags = fuword32(&m->m_flags);
	if (flags == -1)
		return (EFAULT);

	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
	case 0:
		return (do_unlock_normal(td, m, flags));
	case UMUTEX_PRIO_INHERIT:
		return (do_unlock_pi(td, m, flags));
	case UMUTEX_PRIO_PROTECT:
		return (do_unlock_pp(td, m, flags));
	}

	return (EINVAL);
}

static int
do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m,
	struct timespec *timeout, u_long wflags)
{
	struct umtx_q *uq;
	struct timeval tv;
	struct timespec cts, ets, tts;
	uint32_t flags;
	int error;

	uq = td->td_umtxq;
	flags = fuword32(&cv->c_flags);
	error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key);
	if (error != 0)
		return (error);
	umtxq_lock(&uq->uq_key);
	umtxq_busy(&uq->uq_key);
	umtxq_insert(uq);
	umtxq_unlock(&uq->uq_key);

	/*
	 * The important thing is that we must set c_has_waiters to 1
	 * before releasing the user mutex.
	 */
	suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 1);

	umtxq_lock(&uq->uq_key);
	umtxq_unbusy(&uq->uq_key);
	umtxq_unlock(&uq->uq_key);

	error = do_unlock_umutex(td, m);

	umtxq_lock(&uq->uq_key);
	if (error == 0) {
		if ((wflags & UMTX_CHECK_UNPARKING) &&
		    (td->td_pflags & TDP_WAKEUP)) {
			td->td_pflags &= ~TDP_WAKEUP;
			error = EINTR;
		} else if (timeout == NULL) {
			error = umtxq_sleep(uq, "ucond", 0);
		} else {
			getnanouptime(&ets);
			timespecadd(&ets, timeout);
			TIMESPEC_TO_TIMEVAL(&tv, timeout);
			for (;;) {
				error = umtxq_sleep(uq, "ucond", tvtohz(&tv));
				if (error != ETIMEDOUT)
					break;
				getnanouptime(&cts);
				if (timespeccmp(&cts, &ets, >=)) {
					error = ETIMEDOUT;
					break;
				}
				tts = ets;
				timespecsub(&tts, &cts);
				TIMESPEC_TO_TIMEVAL(&tv, &tts);
			}
		}
	}

	if (error != 0) {
		if ((uq->uq_flags & UQF_UMTXQ) == 0) {
			/*
			 * If we were concurrently do_cv_signal()ed and we
			 * got an error, a UNIX signal, or a timeout, perform
			 * another umtxq_signal() to avoid consuming the
			 * wakeup.  This may cause a spurious wakeup of
			 * another thread which was just queued, but SUSv3
			 * explicitly allows spurious wakeups to occur, and
			 * indeed a kernel-based implementation cannot avoid
			 * them.
			 */
			if (!umtxq_signal(&uq->uq_key, 1))
				error = 0;
		}
		if (error == ERESTART)
			error = EINTR;
	}
	umtxq_remove(uq);
	umtxq_unlock(&uq->uq_key);
	umtx_key_release(&uq->uq_key);
	return (error);
}
2270
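/*
 * Illustrative sketch, not kernel code: do_cv_wait() follows the
 * pthread_cond_wait() contract.  The caller enters holding m; the
 * kernel queues the thread, drops m, and sleeps, and it does NOT
 * reacquire m on wakeup, so the userland wrapper must relock before
 * re-checking its predicate.  Spurious wakeups are permitted, hence
 * the loop.  All names except _umtx_op() are hypothetical.
 *
 *	static void
 *	cond_wait(struct ucond *cv, struct umutex *m)
 *	{
 *		while (!predicate_holds()) {
 *			_umtx_op(cv, UMTX_OP_CV_WAIT, 0, m, NULL);
 *			umutex_lock(m);
 *		}
 *	}
 *
 * Here umutex_lock() is any blocking relock of m, for example the
 * umutex_timedlock() sketch above with a NULL timeout.
 */
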
/*
 * Signal a userland condition variable.
 */
static int
do_cv_signal(struct thread *td, struct ucond *cv)
{
	struct umtx_key key;
	int error, cnt, nwake;
	uint32_t flags;

	flags = fuword32(&cv->c_flags);
	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
		return (error);
	umtxq_lock(&key);
	umtxq_busy(&key);
	cnt = umtxq_count(&key);
	nwake = umtxq_signal(&key, 1);
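	/*
	 * If this wakeup emptied the queue, clear c_has_waiters to
	 * record that no waiters remain.
	 */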
	if (cnt <= nwake) {
		umtxq_unlock(&key);
		error = suword32(
		    __DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
		umtxq_lock(&key);
	}
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	return (error);
}

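/*
 * Broadcast a userland condition variable, waking all waiters.
 */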
static int
do_cv_broadcast(struct thread *td, struct ucond *cv)
{
	struct umtx_key key;
	int error;
	uint32_t flags;

	flags = fuword32(&cv->c_flags);
	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	umtxq_signal(&key, INT_MAX);
	umtxq_unlock(&key);

	error = suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);

	umtxq_lock(&key);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);

	umtx_key_release(&key);
	return (error);
}

int
_umtx_lock(struct thread *td, struct _umtx_lock_args *uap)
    /* struct umtx *umtx */
{
	return (_do_lock_umtx(td, uap->umtx, td->td_tid, 0));
}

int
_umtx_unlock(struct thread *td, struct _umtx_unlock_args *uap)
    /* struct umtx *umtx */
{
	return (do_unlock_umtx(td, uap->umtx, td->td_tid));
}

static int
__umtx_op_lock_umtx(struct thread *td, struct _umtx_op_args *uap)
{
	struct timespec *ts, timeout;
	int error;

	/* Allow a null timespec (wait forever). */
	if (uap->uaddr2 == NULL)
		ts = NULL;
	else {
		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
		if (error != 0)
			return (error);
		if (timeout.tv_nsec >= 1000000000 ||
		    timeout.tv_nsec < 0) {
			return (EINVAL);
		}
		ts = &timeout;
	}
	return (do_lock_umtx(td, uap->obj, uap->val, ts));
}

static int
__umtx_op_unlock_umtx(struct thread *td, struct _umtx_op_args *uap)
{
	return (do_unlock_umtx(td, uap->obj, uap->val));
}

static int
__umtx_op_wait(struct thread *td, struct _umtx_op_args *uap)
{
	struct timespec *ts, timeout;
	int error;

	if (uap->uaddr2 == NULL)
		ts = NULL;
	else {
		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
		if (error != 0)
			return (error);
		if (timeout.tv_nsec >= 1000000000 ||
		    timeout.tv_nsec < 0)
			return (EINVAL);
		ts = &timeout;
	}
	return (do_wait(td, uap->obj, uap->val, ts, 0));
}

static int
__umtx_op_wake(struct thread *td, struct _umtx_op_args *uap)
{
	return (kern_umtx_wake(td, uap->obj, uap->val));
}
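
/*
 * Illustrative sketch, not kernel code: UMTX_OP_WAIT and UMTX_OP_WAKE
 * form a futex-style primitive.  The waiter passes the value it last
 * observed; the kernel re-reads the word and sleeps only if it still
 * matches, which closes the race with a waker that changes the word
 * and then wakes.  A hypothetical one-shot event built on the
 * primitive, with memory-ordering details omitted for brevity:
 *
 *	#include <limits.h>
 *	#include <sys/types.h>
 *	#include <sys/umtx.h>
 *
 *	static long event_state;
 *
 *	static void
 *	event_wait(void)
 *	{
 *		while (event_state == 0)
 *			_umtx_op(&event_state, UMTX_OP_WAIT, 0, NULL, NULL);
 *	}
 *
 *	static void
 *	event_set(void)
 *	{
 *		event_state = 1;
 *		_umtx_op(&event_state, UMTX_OP_WAKE, INT_MAX, NULL, NULL);
 *	}
 */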

static int
__umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap)
{
	struct timespec *ts, timeout;
	int error;

	/* Allow a null timespec (wait forever). */
	if (uap->uaddr2 == NULL)
		ts = NULL;
	else {
		error = copyin(uap->uaddr2, &timeout,
		    sizeof(timeout));
		if (error != 0)
			return (error);
		if (timeout.tv_nsec >= 1000000000 ||
		    timeout.tv_nsec < 0) {
			return (EINVAL);
		}
		ts = &timeout;
	}
	return (do_lock_umutex(td, uap->obj, ts, 0));
}

static int
__umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap)
{
	return (do_lock_umutex(td, uap->obj, NULL, 1));
}

static int
__umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap)
{
	return (do_unlock_umutex(td, uap->obj));
}

static int
__umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap)
{
	return (do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1));
}

static int
__umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap)
{
	struct timespec *ts, timeout;
	int error;

	/* Allow a null timespec (wait forever). */
	if (uap->uaddr2 == NULL)
		ts = NULL;
	else {
		error = copyin(uap->uaddr2, &timeout,
		    sizeof(timeout));
		if (error != 0)
			return (error);
		if (timeout.tv_nsec >= 1000000000 ||
		    timeout.tv_nsec < 0) {
			return (EINVAL);
		}
		ts = &timeout;
	}
	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
}

static int
__umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap)
{
	return (do_cv_signal(td, uap->obj));
}

static int
__umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap)
{
	return (do_cv_broadcast(td, uap->obj));
}

typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap);

static _umtx_op_func op_table[] = {
	__umtx_op_lock_umtx,		/* UMTX_OP_LOCK */
	__umtx_op_unlock_umtx,		/* UMTX_OP_UNLOCK */
	__umtx_op_wait,			/* UMTX_OP_WAIT */
	__umtx_op_wake,			/* UMTX_OP_WAKE */
	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
	__umtx_op_lock_umutex,		/* UMTX_OP_MUTEX_LOCK */
	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
	__umtx_op_cv_wait,		/* UMTX_OP_CV_WAIT */
	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
	__umtx_op_cv_broadcast		/* UMTX_OP_CV_BROADCAST */
};

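/*
 * System call entry point: dispatch on the operation code.  The
 * unsigned cast rejects negative op values with a single comparison;
 * each table slot must stay in step with its UMTX_OP_* constant.
 */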
int
_umtx_op(struct thread *td, struct _umtx_op_args *uap)
{
	if ((unsigned)uap->op < UMTX_OP_MAX)
		return ((*op_table[uap->op])(td, uap));
	return (EINVAL);
}

#ifdef COMPAT_IA32
int
freebsd32_umtx_lock(struct thread *td, struct freebsd32_umtx_lock_args *uap)
    /* struct umtx *umtx */
{
	return (do_lock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid, NULL));
}

int
freebsd32_umtx_unlock(struct thread *td, struct freebsd32_umtx_unlock_args *uap)
    /* struct umtx *umtx */
{
	return (do_unlock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid));
}

struct timespec32 {
	int32_t tv_sec;
	int32_t tv_nsec;
};

static inline int
copyin_timeout32(void *addr, struct timespec *tsp)
{
	struct timespec32 ts32;
	int error;

	error = copyin(addr, &ts32, sizeof(struct timespec32));
	if (error == 0) {
		tsp->tv_sec = ts32.tv_sec;
		tsp->tv_nsec = ts32.tv_nsec;
	}
	return (error);
}

static int
__umtx_op_lock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
{
	struct timespec *ts, timeout;
	int error;

	/* Allow a null timespec (wait forever). */
	if (uap->uaddr2 == NULL)
		ts = NULL;
	else {
		error = copyin_timeout32(uap->uaddr2, &timeout);
		if (error != 0)
			return (error);
		if (timeout.tv_nsec >= 1000000000 ||
		    timeout.tv_nsec < 0) {
			return (EINVAL);
		}
		ts = &timeout;
	}
	return (do_lock_umtx32(td, uap->obj, uap->val, ts));
}

static int
__umtx_op_unlock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
{
	return (do_unlock_umtx32(td, uap->obj, (uint32_t)uap->val));
}

static int
__umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
{
	struct timespec *ts, timeout;
	int error;

	if (uap->uaddr2 == NULL)
		ts = NULL;
	else {
		error = copyin_timeout32(uap->uaddr2, &timeout);
		if (error != 0)
			return (error);
		if (timeout.tv_nsec >= 1000000000 ||
		    timeout.tv_nsec < 0)
			return (EINVAL);
		ts = &timeout;
	}
	return (do_wait(td, uap->obj, uap->val, ts, 1));
}

static int
__umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
{
	struct timespec *ts, timeout;
	int error;

	/* Allow a null timespec (wait forever). */
	if (uap->uaddr2 == NULL)
		ts = NULL;
	else {
		error = copyin_timeout32(uap->uaddr2, &timeout);
		if (error != 0)
			return (error);
		if (timeout.tv_nsec >= 1000000000 ||
		    timeout.tv_nsec < 0)
			return (EINVAL);
		ts = &timeout;
	}
	return (do_lock_umutex(td, uap->obj, ts, 0));
}

static int
__umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
{
	struct timespec *ts, timeout;
	int error;

	/* Allow a null timespec (wait forever). */
	if (uap->uaddr2 == NULL)
		ts = NULL;
	else {
		error = copyin_timeout32(uap->uaddr2, &timeout);
		if (error != 0)
			return (error);
		if (timeout.tv_nsec >= 1000000000 ||
		    timeout.tv_nsec < 0)
			return (EINVAL);
		ts = &timeout;
	}
	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
}

static _umtx_op_func op_table_compat32[] = {
	__umtx_op_lock_umtx_compat32,	/* UMTX_OP_LOCK */
	__umtx_op_unlock_umtx_compat32,	/* UMTX_OP_UNLOCK */
	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT */
	__umtx_op_wake,			/* UMTX_OP_WAKE */
	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
	__umtx_op_lock_umutex_compat32,	/* UMTX_OP_MUTEX_LOCK */
	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
	__umtx_op_cv_wait_compat32,	/* UMTX_OP_CV_WAIT */
	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
	__umtx_op_cv_broadcast		/* UMTX_OP_CV_BROADCAST */
};

int
freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap)
{
	if ((unsigned)uap->op < UMTX_OP_MAX)
		return ((*op_table_compat32[uap->op])(td,
			(struct _umtx_op_args *)uap));
	return (EINVAL);
}
#endif

void
umtx_thread_init(struct thread *td)
{
	td->td_umtxq = umtxq_alloc();
	td->td_umtxq->uq_thread = td;
}

void
umtx_thread_fini(struct thread *td)
{
	umtxq_free(td->td_umtxq);
}

/*
 * Called when a new thread is created, e.g. by fork().
 */
void
umtx_thread_alloc(struct thread *td)
{
	struct umtx_q *uq;

	uq = td->td_umtxq;
	uq->uq_inherited_pri = PRI_MAX;

	KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
	KASSERT(uq->uq_thread == td, ("uq_thread != td"));
	KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
	KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
}

/*
 * exec() hook.
 */
static void
umtx_exec_hook(void *arg __unused, struct proc *p __unused,
	struct image_params *imgp __unused)
{
	umtx_thread_cleanup(curthread);
}

/*
 * thread_exit() hook.
 */
void
umtx_thread_exit(struct thread *td)
{
	umtx_thread_cleanup(td);
}

/*
 * Clean up umtx data: reset the inherited priority and disown any
 * priority-inheritance mutexes still contested by this thread.
 */
static void
umtx_thread_cleanup(struct thread *td)
{
	struct umtx_q *uq;
	struct umtx_pi *pi;

	if ((uq = td->td_umtxq) == NULL)
		return;

	mtx_lock_spin(&umtx_lock);
	uq->uq_inherited_pri = PRI_MAX;
	while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
		pi->pi_owner = NULL;
		TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
	}
	td->td_flags &= ~TDF_UBORROWING;
	mtx_unlock_spin(&umtx_lock);
}
