kern_umtx.c revision 174701
/*-
 * Copyright (c) 2004, David Xu <davidxu@freebsd.org>
 * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/kern_umtx.c 174701 2007-12-17 05:55:07Z davidxu $");

#include "opt_compat.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/eventhandler.h>
#include <sys/umtx.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>

#include <machine/cpu.h>

#ifdef COMPAT_IA32
#include <compat/freebsd32/freebsd32_proto.h>
#endif

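/*
 * Each kind of userland synchronization object gets its own key type,
 * so keys for different primitives never match even when the objects
 * share a user address.
 */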
#define TYPE_SIMPLE_LOCK	0
#define TYPE_SIMPLE_WAIT	1
#define TYPE_NORMAL_UMUTEX	2
#define TYPE_PI_UMUTEX		3
#define TYPE_PP_UMUTEX		4
#define TYPE_CV			5

/* Key to represent a unique userland synchronization object */
struct umtx_key {
	int	hash;
	int	type;
	int	shared;
	union {
		struct {
			vm_object_t	object;
			uintptr_t	offset;
		} shared;
		struct {
			struct vmspace	*vs;
			uintptr_t	addr;
		} private;
		struct {
			void		*a;
			uintptr_t	b;
		} both;
	} info;
};

/* Priority inheritance mutex info. */
struct umtx_pi {
	/* Owner thread */
	struct thread		*pi_owner;

	/* Reference count */
	int			pi_refcount;

	/* List entry on the owning thread's contested list */
	TAILQ_ENTRY(umtx_pi)	pi_link;

	/* List entry on the hash chain */
	TAILQ_ENTRY(umtx_pi)	pi_hashlink;

	/* List of blocked waiters */
	TAILQ_HEAD(,umtx_q)	pi_blocked;

	/* Identify a userland lock object */
	struct umtx_key		pi_key;
};

/* A waiter on a userland synchronization object. */
struct umtx_q {
	/* Linked list for the hash. */
	TAILQ_ENTRY(umtx_q)	uq_link;

	/* Umtx key. */
	struct umtx_key		uq_key;

	/* Umtx flags. */
	int			uq_flags;
#define UQF_UMTXQ	0x0001

	/* The waiting thread. */
	struct thread		*uq_thread;

	/*
	 * The PI mutex this thread is blocked on.  Reads may be done
	 * with either the chain lock or umtx_lock held; writes must
	 * hold both.
	 */
	struct umtx_pi		*uq_pi_blocked;

	/* Entry on a PI mutex's blocked list */
	TAILQ_ENTRY(umtx_q)	uq_lockq;

	/* Contested PI mutexes owned by this thread */
	TAILQ_HEAD(,umtx_pi)	uq_pi_contested;

	/* Inherited priority from PP mutex */
	u_char			uq_inherited_pri;
};

TAILQ_HEAD(umtxq_head, umtx_q);

/* Userland lock object's wait-queue chain */
struct umtxq_chain {
	/* Lock for this chain. */
	struct mtx		uc_lock;

	/* List of sleep queues. */
	struct umtxq_head	uc_queue;

	/* Busy flag */
	char			uc_busy;

	/* Chain lock waiters */
	int			uc_waiters;

	/* All PI in the list */
	TAILQ_HEAD(,umtx_pi)	uc_pi_list;
};

#define	UMTXQ_LOCKED_ASSERT(uc)		mtx_assert(&(uc)->uc_lock, MA_OWNED)

/*
 * Don't propagate time-sharing priority.  There is a security reason:
 * a user could create a PI mutex, let thread A lock it, and let
 * another thread B block on it.  Because B is sleeping, its priority
 * would be boosted, and A's priority would be boosted via priority
 * propagation too; A's priority would then never be lowered even if
 * it were using 100% of the CPU, which is unfair to other processes.
 */

#define UPRI(td)	(((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\
			  (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\
			 PRI_MAX_TIMESHARE : (td)->td_user_pri)

#define	GOLDEN_RATIO_PRIME	2654404609U
#define	UMTX_CHAINS		128
#define	UMTX_SHIFTS		(__WORD_BIT - 7)

#define THREAD_SHARE		0
#define PROCESS_SHARE		1
#define AUTO_SHARE		2

#define	GET_SHARE(flags)	\
    (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)
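
/*
 * THREAD_SHARE keys are private to a process and are identified by
 * (vmspace, virtual address); PROCESS_SHARE keys are identified by
 * (VM object, offset) so that all mappings of the same object agree.
 * AUTO_SHARE defers the decision to umtx_key_get(), which treats only
 * VM_INHERIT_SHARE map entries as shared.
 */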

static uma_zone_t		umtx_pi_zone;
static struct umtxq_chain	umtxq_chains[UMTX_CHAINS];
static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
static int			umtx_pi_allocated;

SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
    &umtx_pi_allocated, 0, "Allocated umtx_pi");

static void umtxq_sysinit(void *);
static void umtxq_hash(struct umtx_key *key);
static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
static void umtxq_lock(struct umtx_key *key);
static void umtxq_unlock(struct umtx_key *key);
static void umtxq_busy(struct umtx_key *key);
static void umtxq_unbusy(struct umtx_key *key);
static void umtxq_insert(struct umtx_q *uq);
static void umtxq_remove(struct umtx_q *uq);
static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo);
static int umtxq_count(struct umtx_key *key);
static int umtxq_signal(struct umtx_key *key, int nr_wakeup);
static int umtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2);
static int umtx_key_get(void *addr, int type, int share,
	struct umtx_key *key);
static void umtx_key_release(struct umtx_key *key);
static struct umtx_pi *umtx_pi_alloc(int);
static void umtx_pi_free(struct umtx_pi *pi);
static void umtx_pi_adjust_locked(struct thread *td, u_char oldpri);
static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags);
static void umtx_thread_cleanup(struct thread *td);
static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
	struct image_params *imgp __unused);
SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);

static struct mtx umtx_lock;

static void
umtxq_sysinit(void *arg __unused)
{
	int i;

	umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	for (i = 0; i < UMTX_CHAINS; ++i) {
		mtx_init(&umtxq_chains[i].uc_lock, "umtxql", NULL,
			 MTX_DEF | MTX_DUPOK);
		TAILQ_INIT(&umtxq_chains[i].uc_queue);
		TAILQ_INIT(&umtxq_chains[i].uc_pi_list);
		umtxq_chains[i].uc_busy = 0;
		umtxq_chains[i].uc_waiters = 0;
	}
	mtx_init(&umtx_lock, "umtx lock", NULL, MTX_SPIN);
	EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
	    EVENTHANDLER_PRI_ANY);
}

struct umtx_q *
umtxq_alloc(void)
{
	struct umtx_q *uq;

	uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
	TAILQ_INIT(&uq->uq_pi_contested);
	uq->uq_inherited_pri = PRI_MAX;
	return (uq);
}

void
umtxq_free(struct umtx_q *uq)
{
	free(uq, M_UMTX);
}

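/*
 * Hash a key to a chain index with multiplicative (Fibonacci) hashing:
 * GOLDEN_RATIO_PRIME is a prime close to 2^32 divided by the golden
 * ratio, and the high-order bits of the product are kept because they
 * are the best mixed; UMTX_SHIFTS leaves 7 bits, matching
 * UMTX_CHAINS == 128.
 */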
static inline void
umtxq_hash(struct umtx_key *key)
{
	unsigned n = (uintptr_t)key->info.both.a + key->info.both.b;
	key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
}

static inline int
umtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2)
{
	return (k1->type == k2->type &&
		k1->info.both.a == k2->info.both.a &&
		k1->info.both.b == k2->info.both.b);
}

static inline struct umtxq_chain *
umtxq_getchain(struct umtx_key *key)
{
	return (&umtxq_chains[key->hash]);
}

/*
 * Set the chain to the busy state when a following operation
 * may block (a kernel mutex cannot be held across it).
 */
static inline void
umtxq_busy(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_assert(&uc->uc_lock, MA_OWNED);
	while (uc->uc_busy != 0) {
		uc->uc_waiters++;
		msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
		uc->uc_waiters--;
	}
	uc->uc_busy = 1;
}

/*
 * Unbusy a chain.
 */
static inline void
umtxq_unbusy(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_assert(&uc->uc_lock, MA_OWNED);
	KASSERT(uc->uc_busy != 0, ("not busy"));
	uc->uc_busy = 0;
	if (uc->uc_waiters)
		wakeup_one(uc);
}

/*
 * Lock a chain.
 */
static inline void
umtxq_lock(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_lock(&uc->uc_lock);
}

/*
 * Unlock a chain.
 */
static inline void
umtxq_unlock(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_unlock(&uc->uc_lock);
}

/*
 * Insert a thread onto the umtx queue.
 */
static inline void
umtxq_insert(struct umtx_q *uq)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_INSERT_TAIL(&uc->uc_queue, uq, uq_link);
	uq->uq_flags |= UQF_UMTXQ;
}

/*
 * Remove a thread from the umtx queue.
 */
static inline void
umtxq_remove(struct umtx_q *uq)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	if (uq->uq_flags & UQF_UMTXQ) {
		TAILQ_REMOVE(&uc->uc_queue, uq, uq_link);
		uq->uq_flags &= ~UQF_UMTXQ;
	}
}

/*
 * Check if there are multiple waiters.
 */
static int
umtxq_count(struct umtx_key *key)
{
	struct umtxq_chain *uc;
	struct umtx_q *uq;
	int count = 0;

	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_FOREACH(uq, &uc->uc_queue, uq_link) {
		if (umtx_key_match(&uq->uq_key, key)) {
			if (++count > 1)
				break;
		}
	}
	return (count);
}

/*
 * Check if there are multiple PI waiters and return the first
 * waiter.
 */
static int
umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
{
	struct umtxq_chain *uc;
	struct umtx_q *uq;
	int count = 0;

	*first = NULL;
	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_FOREACH(uq, &uc->uc_queue, uq_link) {
		if (umtx_key_match(&uq->uq_key, key)) {
			if (++count > 1)
				break;
			*first = uq;
		}
	}
	return (count);
}

/*
 * Wake up threads waiting on a userland object.
 */
static int
umtxq_signal(struct umtx_key *key, int n_wake)
{
	struct umtxq_chain *uc;
	struct umtx_q *uq, *next;
	int ret;

	ret = 0;
	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_FOREACH_SAFE(uq, &uc->uc_queue, uq_link, next) {
		if (umtx_key_match(&uq->uq_key, key)) {
			umtxq_remove(uq);
			wakeup(uq);
			if (++ret >= n_wake)
				break;
		}
	}
	return (ret);
}

/*
 * Wake up the specified thread.
 */
static inline void
umtxq_signal_thread(struct umtx_q *uq)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	umtxq_remove(uq);
	wakeup(uq);
}

/*
 * Put the thread into a sleep state; before sleeping, check whether
 * the thread was already removed from the umtx queue.
 */
static inline int
umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo)
{
	struct umtxq_chain *uc;
	int error;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	if (!(uq->uq_flags & UQF_UMTXQ))
		return (0);
	error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
	if (error == EWOULDBLOCK)
		error = ETIMEDOUT;
	return (error);
}

/*
 * Convert a userspace address into a unique logical address.
 */
static int
umtx_key_get(void *addr, int type, int share, struct umtx_key *key)
{
	struct thread *td = curthread;
	vm_map_t map;
	vm_map_entry_t entry;
	vm_pindex_t pindex;
	vm_prot_t prot;
	boolean_t wired;

	key->type = type;
	if (share == THREAD_SHARE) {
		key->shared = 0;
		key->info.private.vs = td->td_proc->p_vmspace;
		key->info.private.addr = (uintptr_t)addr;
	} else {
		MPASS(share == PROCESS_SHARE || share == AUTO_SHARE);
		map = &td->td_proc->p_vmspace->vm_map;
		if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
		    &entry, &key->info.shared.object, &pindex, &prot,
		    &wired) != KERN_SUCCESS) {
			return (EFAULT);
		}

		if ((share == PROCESS_SHARE) ||
		    (share == AUTO_SHARE &&
		     VM_INHERIT_SHARE == entry->inheritance)) {
			key->shared = 1;
			key->info.shared.offset = entry->offset + entry->start -
				(vm_offset_t)addr;
			vm_object_reference(key->info.shared.object);
		} else {
			key->shared = 0;
			key->info.private.vs = td->td_proc->p_vmspace;
			key->info.private.addr = (uintptr_t)addr;
		}
		vm_map_lookup_done(map, entry);
	}

	umtxq_hash(key);
	return (0);
}
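
/*
 * Typical calling pattern (an illustrative sketch, mirroring
 * kern_umtx_wake() below):
 *
 *	struct umtx_key key;
 *
 *	if ((error = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT, AUTO_SHARE,
 *	    &key)) != 0)
 *		return (error);
 *	umtxq_lock(&key);
 *	... operate on the wait queue ...
 *	umtxq_unlock(&key);
 *	umtx_key_release(&key);
 */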

/*
 * Release key.
 */
static inline void
umtx_key_release(struct umtx_key *key)
{
	if (key->shared)
		vm_object_deallocate(key->info.shared.object);
}

/*
 * Lock a umtx object.
 */
static int
_do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id, int timo)
{
	struct umtx_q *uq;
	u_long owner;
	u_long old;
	int error = 0;

	uq = td->td_umtxq;

	/*
	 * Care must be exercised when dealing with the umtx structure.
	 * It can fault on any access.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword(&umtx->u_owner, UMTX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMTX_UNOWNED)
			return (0);

		/* The address was invalid. */
		if (owner == -1)
			return (EFAULT);

		/* If no one owns it but it is contested try to acquire it. */
		if (owner == UMTX_CONTESTED) {
			owner = casuword(&umtx->u_owner,
			    UMTX_CONTESTED, id | UMTX_CONTESTED);

			if (owner == UMTX_CONTESTED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			/* If this failed the lock has changed, restart. */
			continue;
		}

		/*
		 * If we caught a signal or timed out on the previous
		 * iteration, we have already retried once; exit now.
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK,
			AUTO_SHARE, &uq->uq_key)) != 0)
			return (error);

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword(&umtx->u_owner, owner, owner | UMTX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race to the
		 * thread unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtx", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);
	}

	return (0);
}
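
/*
 * The loop above is only the slow path: userland is expected to try
 * the uncontested case itself and enter the kernel only on contention.
 * An illustrative sketch of such a fast path (not the actual libthr
 * code) is:
 *
 *	if (atomic_cmpset_acq_long(&umtx->u_owner, UMTX_UNOWNED, id))
 *		return (0);		// locked without a syscall
 *	return (_umtx_lock(umtx));	// contested, take the slow path
 */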

/*
 * Lock a umtx object.
 */
static int
do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id,
	struct timespec *timeout)
{
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	int error;

	if (timeout == NULL) {
		error = _do_lock_umtx(td, umtx, id, 0);
		/* Mutex locking is restarted if it is interrupted. */
		if (error == EINTR)
			error = ERESTART;
	} else {
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
		for (;;) {
			error = _do_lock_umtx(td, umtx, id, tvtohz(&tv));
			if (error != ETIMEDOUT)
				break;
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				break;
			}
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
		}
		/* Timed-locking is not restarted. */
		if (error == ERESTART)
			error = EINTR;
	}
	return (error);
}
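
/*
 * Note on the timeout handling above: the relative timeout is turned
 * into an absolute uptime deadline once, and after every premature
 * ETIMEDOUT wakeup the remaining time is recomputed, so repeated
 * sleeps never extend the total wait.  Untimed locks translate EINTR
 * to ERESTART so the syscall is transparently restarted; timed locks
 * do the reverse, since restarting would start a fresh timeout.
 */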

/*
 * Unlock a umtx object.
 */
static int
do_unlock_umtx(struct thread *td, struct umtx *umtx, u_long id)
{
	struct umtx_key key;
	u_long owner;
	u_long old;
	int error;
	int count;

	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword(__DEVOLATILE(u_long *, &umtx->u_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMTX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMTX_CONTESTED) == 0) {
		old = casuword(&umtx->u_owner, owner, UMTX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK, AUTO_SHARE,
		&key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * there are zero or one waiting threads; otherwise it must
	 * be marked as contested.
	 */
	old = casuword(&umtx->u_owner, owner,
		count <= 1 ? UMTX_UNOWNED : UMTX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}

#ifdef COMPAT_IA32

/*
 * Lock a umtx object.
 */
static int
_do_lock_umtx32(struct thread *td, uint32_t *m, uint32_t id, int timo)
{
	struct umtx_q *uq;
	uint32_t owner;
	uint32_t old;
	int error = 0;

	uq = td->td_umtxq;

	/*
	 * Care must be exercised when dealing with the umtx structure.
	 * It can fault on any access.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword32(m, UMUTEX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMUTEX_UNOWNED)
			return (0);

		/* The address was invalid. */
		if (owner == -1)
			return (EFAULT);

		/* If no one owns it but it is contested try to acquire it. */
		if (owner == UMUTEX_CONTESTED) {
			owner = casuword32(m,
			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
			if (owner == UMUTEX_CONTESTED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			/* If this failed the lock has changed, restart. */
			continue;
		}

		/*
		 * If we caught a signal or timed out on the previous
		 * iteration, we have already retried once; exit now.
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK,
			AUTO_SHARE, &uq->uq_key)) != 0)
			return (error);

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword32(m, owner, owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race to the
		 * thread unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtx", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);
	}

	return (0);
}

/*
 * Lock a umtx object.
 */
static int
do_lock_umtx32(struct thread *td, void *m, uint32_t id,
	struct timespec *timeout)
{
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	int error;

	if (timeout == NULL) {
		error = _do_lock_umtx32(td, m, id, 0);
		/* Mutex locking is restarted if it is interrupted. */
		if (error == EINTR)
			error = ERESTART;
	} else {
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
		for (;;) {
			error = _do_lock_umtx32(td, m, id, tvtohz(&tv));
			if (error != ETIMEDOUT)
				break;
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				break;
			}
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
		}
		/* Timed-locking is not restarted. */
		if (error == ERESTART)
			error = EINTR;
	}
	return (error);
}

/*
 * Unlock a umtx object.
 */
static int
do_unlock_umtx32(struct thread *td, uint32_t *m, uint32_t id)
{
	struct umtx_key key;
	uint32_t owner;
	uint32_t old;
	int error;
	int count;

	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(m);
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMUTEX_CONTESTED) == 0) {
		old = casuword32(m, owner, UMUTEX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK, AUTO_SHARE,
		&key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * there are zero or one waiting threads; otherwise it must
	 * be marked as contested.
	 */
	old = casuword32(m, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}
#endif

/*
 * Fetch and compare a value; sleep on the address if the value has
 * not changed.
 */
static int
do_wait(struct thread *td, void *addr, u_long id,
	struct timespec *timeout, int compat32)
{
	struct umtx_q *uq;
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	u_long tmp;
	int error = 0;

	uq = td->td_umtxq;
	if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT, AUTO_SHARE,
	    &uq->uq_key)) != 0)
		return (error);

	umtxq_lock(&uq->uq_key);
	umtxq_insert(uq);
	umtxq_unlock(&uq->uq_key);
	if (compat32 == 0)
		tmp = fuword(addr);
	else
		tmp = fuword32(addr);
	if (tmp != id) {
		umtxq_lock(&uq->uq_key);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	} else if (timeout == NULL) {
		umtxq_lock(&uq->uq_key);
		error = umtxq_sleep(uq, "uwait", 0);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	} else {
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
		umtxq_lock(&uq->uq_key);
		for (;;) {
			error = umtxq_sleep(uq, "uwait", tvtohz(&tv));
			if (!(uq->uq_flags & UQF_UMTXQ))
				break;
			if (error != ETIMEDOUT)
				break;
			umtxq_unlock(&uq->uq_key);
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				umtxq_lock(&uq->uq_key);
				break;
			}
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
			umtxq_lock(&uq->uq_key);
		}
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	}
	umtx_key_release(&uq->uq_key);
	if (error == ERESTART)
		error = EINTR;
	return (error);
}

/*
 * Wake up threads sleeping on the specified address.
 */
int
kern_umtx_wake(struct thread *td, void *uaddr, int n_wake)
{
	struct umtx_key key;
	int ret;

	if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT, AUTO_SHARE,
	    &key)) != 0)
		return (ret);
	umtxq_lock(&key);
	ret = umtxq_signal(&key, n_wake);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	return (0);
}
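
/*
 * do_wait() and kern_umtx_wake() form a futex-style pair: the waiter
 * queues itself first and only then re-reads the word, so a waker that
 * changes the word before calling the wake path cannot be missed.
 * Hypothetical userland usage via the _umtx_op(2) interface (the
 * `state', BUSY and IDLE names are illustrative only):
 *
 *	while (atomic_load_acq_long(&state) == BUSY)
 *		_umtx_op(&state, UMTX_OP_WAIT, BUSY, NULL, NULL);
 *	...
 *	atomic_store_rel_long(&state, IDLE);
 *	_umtx_op(&state, UMTX_OP_WAKE, 1, NULL, NULL);
 */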

/*
 * Lock PTHREAD_PRIO_NONE protocol POSIX mutex.
 */
static int
_do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags, int timo,
	int try)
{
	struct umtx_q *uq;
	uint32_t owner, old, id;
	int error = 0;

	id = td->td_tid;
	uq = td->td_umtxq;

	/*
	 * Care must be exercised when dealing with the umtx structure.
	 * It can fault on any access.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMUTEX_UNOWNED)
			return (0);

		/* The address was invalid. */
		if (owner == -1)
			return (EFAULT);

		/* If no one owns it but it is contested try to acquire it. */
		if (owner == UMUTEX_CONTESTED) {
			owner = casuword32(&m->m_owner,
			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

			if (owner == UMUTEX_CONTESTED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			/* If this failed the lock has changed, restart. */
			continue;
		}

		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
		    (owner & ~UMUTEX_CONTESTED) == id)
			return (EDEADLK);

		if (try != 0)
			return (EBUSY);

		/*
		 * If we caught a signal or timed out on the previous
		 * iteration, we have already retried once; exit now.
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
		    GET_SHARE(flags), &uq->uq_key)) != 0)
			return (error);

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race to the
		 * thread unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtxn", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);
	}

	return (0);
}
/*
 * Unlock PTHREAD_PRIO_NONE protocol POSIX mutex.
 */
static int
do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags)
{
	struct umtx_key key;
	uint32_t owner, old, id;
	int error;
	int count;

	id = td->td_tid;
	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMUTEX_CONTESTED) == 0) {
		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * there are zero or one waiting threads; otherwise it must
	 * be marked as contested.
	 */
	old = casuword32(&m->m_owner, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}

static inline struct umtx_pi *
umtx_pi_alloc(int flags)
{
	struct umtx_pi *pi;

	pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags);
	TAILQ_INIT(&pi->pi_blocked);
	atomic_add_int(&umtx_pi_allocated, 1);
	return (pi);
}

static inline void
umtx_pi_free(struct umtx_pi *pi)
{
	uma_zfree(umtx_pi_zone, pi);
	atomic_add_int(&umtx_pi_allocated, -1);
}

/*
 * Adjust the thread's position on the PI mutex's blocked list after
 * its priority has been changed.
 */
static int
umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
{
	struct umtx_q *uq, *uq1, *uq2;
	struct thread *td1;

	mtx_assert(&umtx_lock, MA_OWNED);
	if (pi == NULL)
		return (0);

	uq = td->td_umtxq;

	/*
	 * Check if the thread needs to be moved on the blocked chain.
	 * It needs to be moved if either its priority is lower than
	 * the previous thread or higher than the next thread.
	 */
	uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
	uq2 = TAILQ_NEXT(uq, uq_lockq);
	if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
	    (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
		/*
		 * Remove thread from blocked chain and determine where
		 * it should be moved to.
		 */
		TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
		TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
			td1 = uq1->uq_thread;
			MPASS(td1->td_proc->p_magic == P_MAGIC);
			if (UPRI(td1) > UPRI(td))
				break;
		}

		if (uq1 == NULL)
			TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
		else
			TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
	}
	return (1);
}

/*
 * Propagate priority when a thread is blocked on a POSIX
 * PI mutex.
 */
static void
umtx_propagate_priority(struct thread *td)
{
	struct umtx_q *uq;
	struct umtx_pi *pi;
	int pri;

	mtx_assert(&umtx_lock, MA_OWNED);
	pri = UPRI(td);
	uq = td->td_umtxq;
	pi = uq->uq_pi_blocked;
	if (pi == NULL)
		return;

	for (;;) {
		td = pi->pi_owner;
		if (td == NULL)
			return;

		MPASS(td->td_proc != NULL);
		MPASS(td->td_proc->p_magic == P_MAGIC);

		if (UPRI(td) <= pri)
			return;

		thread_lock(td);
		sched_lend_user_prio(td, pri);
		thread_unlock(td);

		/*
		 * Pick up the lock that td is blocked on.
		 */
		uq = td->td_umtxq;
		pi = uq->uq_pi_blocked;
		/* Resort td on the list if needed. */
		if (!umtx_pi_adjust_thread(pi, td))
			break;
	}
}
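
/*
 * Example of the walk above: if T1 blocks on M1 owned by T2, and T2
 * is itself blocked on M2 owned by T3, then T1's priority is lent to
 * T2 and, through T2's uq_pi_blocked link, to T3 as well.  The loop
 * terminates at the first owner whose priority is already at least as
 * good, or at a thread that is not blocked on any PI mutex.
 */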

/*
 * Unpropagate priority for a PI mutex when a thread blocked on
 * it is interrupted by a signal or resumed by others.
 */
static void
umtx_unpropagate_priority(struct umtx_pi *pi)
{
	struct umtx_q *uq, *uq_owner;
	struct umtx_pi *pi2;
	int pri, oldpri;

	mtx_assert(&umtx_lock, MA_OWNED);

	while (pi != NULL && pi->pi_owner != NULL) {
		pri = PRI_MAX;
		uq_owner = pi->pi_owner->td_umtxq;

		TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
			uq = TAILQ_FIRST(&pi2->pi_blocked);
			if (uq != NULL) {
				if (pri > UPRI(uq->uq_thread))
					pri = UPRI(uq->uq_thread);
			}
		}

		if (pri > uq_owner->uq_inherited_pri)
			pri = uq_owner->uq_inherited_pri;
		thread_lock(pi->pi_owner);
		oldpri = pi->pi_owner->td_user_pri;
		sched_unlend_user_prio(pi->pi_owner, pri);
		thread_unlock(pi->pi_owner);
		umtx_pi_adjust_locked(pi->pi_owner, oldpri);
		pi = uq_owner->uq_pi_blocked;
	}
}

/*
 * Insert a PI mutex into the owning thread's list.
 */
static void
umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
{
	struct umtx_q *uq_owner;

	uq_owner = owner->td_umtxq;
	mtx_assert(&umtx_lock, MA_OWNED);
	if (pi->pi_owner != NULL)
		panic("pi_owner != NULL");
	pi->pi_owner = owner;
	TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
}

/*
 * Claim ownership of a PI mutex.
 */
static int
umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
{
	struct umtx_q *uq, *uq_owner;

	uq_owner = owner->td_umtxq;
	mtx_lock_spin(&umtx_lock);
	if (pi->pi_owner == owner) {
		mtx_unlock_spin(&umtx_lock);
		return (0);
	}

	if (pi->pi_owner != NULL) {
		/*
		 * userland may have already messed the mutex, sigh.
		 */
		mtx_unlock_spin(&umtx_lock);
		return (EPERM);
	}
	umtx_pi_setowner(pi, owner);
	uq = TAILQ_FIRST(&pi->pi_blocked);
	if (uq != NULL) {
		int pri;

		pri = UPRI(uq->uq_thread);
		thread_lock(owner);
		if (pri < UPRI(owner))
			sched_lend_user_prio(owner, pri);
		thread_unlock(owner);
	}
	mtx_unlock_spin(&umtx_lock);
	return (0);
}

static void
umtx_pi_adjust_locked(struct thread *td, u_char oldpri)
{
	struct umtx_q *uq;
	struct umtx_pi *pi;

	uq = td->td_umtxq;
	/*
	 * Pick up the lock that td is blocked on.
	 */
	pi = uq->uq_pi_blocked;
	MPASS(pi != NULL);

	/* Resort the turnstile on the list. */
	if (!umtx_pi_adjust_thread(pi, td))
		return;

	/*
	 * If our priority was lowered and we are at the head of the
	 * turnstile, then propagate our new priority up the chain.
	 */
	if (uq == TAILQ_FIRST(&pi->pi_blocked) && UPRI(td) < oldpri)
		umtx_propagate_priority(td);
}

/*
 * Adjust a thread's position on the blocked list of its PI mutex;
 * this may trigger a new round of priority propagation.
 */
void
umtx_pi_adjust(struct thread *td, u_char oldpri)
{
	mtx_lock_spin(&umtx_lock);
	umtx_pi_adjust_locked(td, oldpri);
	mtx_unlock_spin(&umtx_lock);
}

/*
 * Sleep on a PI mutex.
 */
static int
umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi,
	uint32_t owner, const char *wmesg, int timo)
{
	struct umtxq_chain *uc;
	struct thread *td, *td1;
	struct umtx_q *uq1;
	int pri;
	int error = 0;

	td = uq->uq_thread;
	KASSERT(td == curthread, ("inconsistent uq_thread"));
	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	umtxq_insert(uq);
	if (pi->pi_owner == NULL) {
		/* XXX
		 * Currently, we only support process-private PI mutexes;
		 * non-contended PI mutexes are locked in userland.
		 * A process-shared PI mutex should always be initialized
		 * and registered by the kernel, and locking should always
		 * be done by the kernel, to avoid security problems.
		 * For a process-private PI mutex, we can find the owner
		 * thread and boost its priority safely.
		 */
		PROC_LOCK(curproc);
		td1 = thread_find(curproc, owner);
		mtx_lock_spin(&umtx_lock);
		if (td1 != NULL && pi->pi_owner == NULL) {
			uq1 = td1->td_umtxq;
			umtx_pi_setowner(pi, td1);
		}
		PROC_UNLOCK(curproc);
	} else {
		mtx_lock_spin(&umtx_lock);
	}

	TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
		pri = UPRI(uq1->uq_thread);
		if (pri > UPRI(td))
			break;
	}

	if (uq1 != NULL)
		TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
	else
		TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);

	uq->uq_pi_blocked = pi;
	thread_lock(td);
	td->td_flags |= TDF_UPIBLOCKED;
	thread_unlock(td);
	mtx_unlock_spin(&umtx_lock);
	umtxq_unlock(&uq->uq_key);

	mtx_lock_spin(&umtx_lock);
	umtx_propagate_priority(td);
	mtx_unlock_spin(&umtx_lock);

	umtxq_lock(&uq->uq_key);
	if (uq->uq_flags & UQF_UMTXQ) {
		error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
		if (error == EWOULDBLOCK)
			error = ETIMEDOUT;
		if (uq->uq_flags & UQF_UMTXQ) {
			umtxq_busy(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unbusy(&uq->uq_key);
		}
	}
	umtxq_unlock(&uq->uq_key);

	mtx_lock_spin(&umtx_lock);
	uq->uq_pi_blocked = NULL;
	thread_lock(td);
	td->td_flags &= ~TDF_UPIBLOCKED;
	thread_unlock(td);
	TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
	umtx_unpropagate_priority(pi);
	mtx_unlock_spin(&umtx_lock);

	umtxq_lock(&uq->uq_key);

	return (error);
}

/*
 * Add a reference to a PI mutex.
 */
static void
umtx_pi_ref(struct umtx_pi *pi)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&pi->pi_key);
	UMTXQ_LOCKED_ASSERT(uc);
	pi->pi_refcount++;
}

/*
 * Decrease the reference count of a PI mutex; when the count drops
 * to zero, its memory is freed.
 */
static void
umtx_pi_unref(struct umtx_pi *pi)
{
	struct umtxq_chain *uc;
	int free = 0;

	uc = umtxq_getchain(&pi->pi_key);
	UMTXQ_LOCKED_ASSERT(uc);
	KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
	if (--pi->pi_refcount == 0) {
		mtx_lock_spin(&umtx_lock);
		if (pi->pi_owner != NULL) {
			TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested,
				pi, pi_link);
			pi->pi_owner = NULL;
		}
		KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
			("blocked queue not empty"));
		mtx_unlock_spin(&umtx_lock);
		TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
		free = 1;
	}
	if (free)
		umtx_pi_free(pi);
}
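
/*
 * Locking note for the PI bookkeeping above: pi_refcount and the hash
 * linkage are protected by the chain lock, while pi_owner and the
 * owner's uq_pi_contested list are protected by the spin mutex
 * umtx_lock, so umtx_pi_unref() takes both before the final teardown
 * and frees the structure only after it is off the hash chain.
 */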

/*
 * Find a PI mutex in the hash table.
 */
static struct umtx_pi *
umtx_pi_lookup(struct umtx_key *key)
{
	struct umtxq_chain *uc;
	struct umtx_pi *pi;

	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);

	TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
		if (umtx_key_match(&pi->pi_key, key)) {
			return (pi);
		}
	}
	return (NULL);
}

/*
 * Insert a PI mutex into the hash table.
 */
static inline void
umtx_pi_insert(struct umtx_pi *pi)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&pi->pi_key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
}

/*
 * Lock a PI mutex.
 */
static int
_do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags, int timo,
	int try)
{
	struct umtx_q *uq;
	struct umtx_pi *pi, *new_pi;
	uint32_t id, owner, old;
	int error;

	id = td->td_tid;
	uq = td->td_umtxq;

	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
	    &uq->uq_key)) != 0)
		return (error);
	umtxq_lock(&uq->uq_key);
	pi = umtx_pi_lookup(&uq->uq_key);
	if (pi == NULL) {
		new_pi = umtx_pi_alloc(M_NOWAIT);
		if (new_pi == NULL) {
			umtxq_unlock(&uq->uq_key);
			new_pi = umtx_pi_alloc(M_WAITOK);
			new_pi->pi_key = uq->uq_key;
			umtxq_lock(&uq->uq_key);
			pi = umtx_pi_lookup(&uq->uq_key);
			if (pi != NULL) {
				umtx_pi_free(new_pi);
				new_pi = NULL;
			}
		}
		if (new_pi != NULL) {
			new_pi->pi_key = uq->uq_key;
			umtx_pi_insert(new_pi);
			pi = new_pi;
		}
	}
	umtx_pi_ref(pi);
	umtxq_unlock(&uq->uq_key);

	/*
	 * Care must be exercised when dealing with the umtx structure.
	 * It can fault on any access.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMUTEX_UNOWNED) {
			error = 0;
			break;
		}

		/* The address was invalid. */
		if (owner == -1) {
			error = EFAULT;
			break;
		}

		/* If no one owns it but it is contested try to acquire it. */
		if (owner == UMUTEX_CONTESTED) {
			owner = casuword32(&m->m_owner,
			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

			if (owner == UMUTEX_CONTESTED) {
				umtxq_lock(&uq->uq_key);
				error = umtx_pi_claim(pi, td);
				umtxq_unlock(&uq->uq_key);
				break;
			}

			/* The address was invalid. */
			if (owner == -1) {
				error = EFAULT;
				break;
			}

			/* If this failed the lock has changed, restart. */
			continue;
		}

		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
		    (owner & ~UMUTEX_CONTESTED) == id) {
			error = EDEADLK;
			break;
		}

		if (try != 0) {
			error = EBUSY;
			break;
		}

		/*
		 * If we caught a signal or timed out on the previous
		 * iteration, we have already retried once; exit now.
		 */
		if (error != 0)
			break;

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_unbusy(&uq->uq_key);
			umtxq_unlock(&uq->uq_key);
			error = EFAULT;
			break;
		}

		umtxq_lock(&uq->uq_key);
		umtxq_unbusy(&uq->uq_key);
		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race to the
		 * thread unlocking the umtx.
		 */
		if (old == owner)
			error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED,
				 "umtxpi", timo);
		umtxq_unlock(&uq->uq_key);
	}

	umtxq_lock(&uq->uq_key);
	umtx_pi_unref(pi);
	umtxq_unlock(&uq->uq_key);

	umtx_key_release(&uq->uq_key);
	return (error);
}

/*
 * Unlock a PI mutex.
 */
static int
do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags)
{
	struct umtx_key key;
	struct umtx_q *uq_first, *uq_first2, *uq_me;
	struct umtx_pi *pi, *pi2;
	uint32_t owner, old, id;
	int error;
	int count;
	int pri;

	id = td->td_tid;
	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMUTEX_CONTESTED) == 0) {
		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count_pi(&key, &uq_first);
	if (uq_first != NULL) {
		pi = uq_first->uq_pi_blocked;
		if (pi->pi_owner != curthread) {
			umtxq_unbusy(&key);
			umtxq_unlock(&key);
			/* userland messed the mutex */
			return (EPERM);
		}
		uq_me = curthread->td_umtxq;
		mtx_lock_spin(&umtx_lock);
		pi->pi_owner = NULL;
		TAILQ_REMOVE(&uq_me->uq_pi_contested, pi, pi_link);
		uq_first = TAILQ_FIRST(&pi->pi_blocked);
		pri = PRI_MAX;
		TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
			uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
			if (uq_first2 != NULL) {
				if (pri > UPRI(uq_first2->uq_thread))
					pri = UPRI(uq_first2->uq_thread);
			}
		}
		thread_lock(curthread);
		sched_unlend_user_prio(curthread, pri);
		thread_unlock(curthread);
		mtx_unlock_spin(&umtx_lock);
	}
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * there are zero or one waiting threads; otherwise it must
	 * be marked as contested.
	 */
	old = casuword32(&m->m_owner, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);

	umtxq_lock(&key);
	if (uq_first != NULL)
		umtxq_signal_thread(uq_first);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}

/*
 * Lock a PP mutex.
 */
static int
_do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags, int timo,
	int try)
{
	struct umtx_q *uq, *uq2;
	struct umtx_pi *pi;
	uint32_t ceiling;
	uint32_t owner, id;
	int error, pri, old_inherited_pri, su;

	id = td->td_tid;
	uq = td->td_umtxq;
	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
	    &uq->uq_key)) != 0)
		return (error);
	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
	for (;;) {
		old_inherited_pri = uq->uq_inherited_pri;
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		ceiling = RTP_PRIO_MAX - fuword32(&m->m_ceilings[0]);
		if (ceiling > RTP_PRIO_MAX) {
			error = EINVAL;
			goto out;
		}

		mtx_lock_spin(&umtx_lock);
		if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
			mtx_unlock_spin(&umtx_lock);
			error = EINVAL;
			goto out;
		}
		if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
			uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
			thread_lock(td);
			if (uq->uq_inherited_pri < UPRI(td))
				sched_lend_user_prio(td, uq->uq_inherited_pri);
			thread_unlock(td);
		}
		mtx_unlock_spin(&umtx_lock);

		owner = casuword32(&m->m_owner,
		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

		if (owner == UMUTEX_CONTESTED) {
			error = 0;
			break;
		}

		/* The address was invalid. */
		if (owner == -1) {
			error = EFAULT;
			break;
		}

		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
		    (owner & ~UMUTEX_CONTESTED) == id) {
			error = EDEADLK;
			break;
		}

		if (try != 0) {
			error = EBUSY;
			break;
		}

		/*
		 * If we caught a signal or timed out on the previous
		 * iteration, we have already retried once; exit now.
		 */
		if (error != 0)
			break;

		umtxq_lock(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		error = umtxq_sleep(uq, "umtxpp", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);

		mtx_lock_spin(&umtx_lock);
		uq->uq_inherited_pri = old_inherited_pri;
		pri = PRI_MAX;
		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
			uq2 = TAILQ_FIRST(&pi->pi_blocked);
			if (uq2 != NULL) {
				if (pri > UPRI(uq2->uq_thread))
					pri = UPRI(uq2->uq_thread);
			}
		}
		if (pri > uq->uq_inherited_pri)
			pri = uq->uq_inherited_pri;
		thread_lock(td);
		sched_unlend_user_prio(td, pri);
		thread_unlock(td);
		mtx_unlock_spin(&umtx_lock);
	}

	if (error != 0) {
		mtx_lock_spin(&umtx_lock);
		uq->uq_inherited_pri = old_inherited_pri;
		pri = PRI_MAX;
		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
			uq2 = TAILQ_FIRST(&pi->pi_blocked);
			if (uq2 != NULL) {
				if (pri > UPRI(uq2->uq_thread))
					pri = UPRI(uq2->uq_thread);
			}
		}
		if (pri > uq->uq_inherited_pri)
			pri = uq->uq_inherited_pri;
		thread_lock(td);
		sched_unlend_user_prio(td, pri);
		thread_unlock(td);
		mtx_unlock_spin(&umtx_lock);
	}

out:
	umtxq_lock(&uq->uq_key);
	umtxq_unbusy(&uq->uq_key);
	umtxq_unlock(&uq->uq_key);
	umtx_key_release(&uq->uq_key);
	return (error);
}

/*
 * Unlock a PP mutex.
 */
static int
do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags)
{
	struct umtx_key key;
	struct umtx_q *uq, *uq2;
	struct umtx_pi *pi;
	uint32_t owner, id;
	uint32_t rceiling;
	int error, pri, new_inherited_pri, su;

	id = td->td_tid;
	uq = td->td_umtxq;
	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);

	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
	if (error != 0)
		return (error);

	if (rceiling == -1)
		new_inherited_pri = PRI_MAX;
	else {
		rceiling = RTP_PRIO_MAX - rceiling;
		if (rceiling > RTP_PRIO_MAX)
			return (EINVAL);
		new_inherited_pri = PRI_MIN_REALTIME + rceiling;
	}

	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);
	umtxq_lock(&key);
	umtxq_busy(&key);
	umtxq_unlock(&key);
	/*
	 * For a priority-protected mutex, always set the unlocked state
	 * to UMUTEX_CONTESTED so that userland always enters the kernel
	 * to lock the mutex.  This is necessary because the thread
	 * priority has to be adjusted for such a mutex.
	 */
	error = suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
		UMUTEX_CONTESTED);

	umtxq_lock(&key);
	if (error == 0)
		umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);

	if (error == -1)
		error = EFAULT;
	else {
		mtx_lock_spin(&umtx_lock);
		if (su != 0)
			uq->uq_inherited_pri = new_inherited_pri;
		pri = PRI_MAX;
		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
			uq2 = TAILQ_FIRST(&pi->pi_blocked);
			if (uq2 != NULL) {
				if (pri > UPRI(uq2->uq_thread))
					pri = UPRI(uq2->uq_thread);
			}
		}
		if (pri > uq->uq_inherited_pri)
			pri = uq->uq_inherited_pri;
		thread_lock(td);
		sched_unlend_user_prio(td, pri);
		thread_unlock(td);
		mtx_unlock_spin(&umtx_lock);
	}
	umtx_key_release(&key);
	return (error);
}

static int
do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
	uint32_t *old_ceiling)
{
	struct umtx_q *uq;
	uint32_t save_ceiling;
	uint32_t owner, id;
	uint32_t flags;
	int error;

	flags = fuword32(&m->m_flags);
	if ((flags & UMUTEX_PRIO_PROTECT) == 0)
		return (EINVAL);
	if (ceiling > RTP_PRIO_MAX)
		return (EINVAL);
	id = td->td_tid;
	uq = td->td_umtxq;
	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
	    &uq->uq_key)) != 0)
		return (error);
	for (;;) {
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		save_ceiling = fuword32(&m->m_ceilings[0]);

		owner = casuword32(&m->m_owner,
		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

		if (owner == UMUTEX_CONTESTED) {
			suword32(&m->m_ceilings[0], ceiling);
			suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
				UMUTEX_CONTESTED);
			error = 0;
			break;
		}

		/* The address was invalid. */
		if (owner == -1) {
			error = EFAULT;
			break;
		}

		if ((owner & ~UMUTEX_CONTESTED) == id) {
			suword32(&m->m_ceilings[0], ceiling);
			error = 0;
			break;
		}

		/*
		 * If we caught a signal or timed out on the previous
		 * iteration, we have already retried once; exit now.
		 */
		if (error != 0)
			break;

		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race to the
		 * thread unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		error = umtxq_sleep(uq, "umtxpp", 0);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	}
	umtxq_lock(&uq->uq_key);
	if (error == 0)
		umtxq_signal(&uq->uq_key, INT_MAX);
	umtxq_unbusy(&uq->uq_key);
	umtxq_unlock(&uq->uq_key);
	umtx_key_release(&uq->uq_key);
	if (error == 0 && old_ceiling != NULL)
		suword32(old_ceiling, save_ceiling);
	return (error);
}

static int
_do_lock_umutex(struct thread *td, struct umutex *m, int flags, int timo,
	int try)
{
	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
	case 0:
		return (_do_lock_normal(td, m, flags, timo, try));
	case UMUTEX_PRIO_INHERIT:
		return (_do_lock_pi(td, m, flags, timo, try));
	case UMUTEX_PRIO_PROTECT:
		return (_do_lock_pp(td, m, flags, timo, try));
	}
	return (EINVAL);
}
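
/*
 * Protocol selection: a umutex with neither flag set uses the plain
 * sleep/wake protocol, UMUTEX_PRIO_INHERIT selects the priority
 * inheritance path and UMUTEX_PRIO_PROTECT the priority protection
 * path; both flags together, or any unknown combination, is rejected
 * with EINVAL.
 */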
2128
2129/*
2130 * Lock a userland POSIX mutex.
2131 */
2132static int
2133do_lock_umutex(struct thread *td, struct umutex *m,
2134	struct timespec *timeout, int try)
2135{
2136	struct timespec ts, ts2, ts3;
2137	struct timeval tv;
2138	uint32_t flags;
2139	int error;
2140
2141	flags = fuword32(&m->m_flags);
2142	if (flags == -1)
2143		return (EFAULT);
2144
2145	if (timeout == NULL) {
2146		error = _do_lock_umutex(td, m, flags, 0, try);
2147		/* Mutex locking is restarted if it is interrupted. */
2148		if (error == EINTR)
2149			error = ERESTART;
2150	} else {
2151		getnanouptime(&ts);
2152		timespecadd(&ts, timeout);
2153		TIMESPEC_TO_TIMEVAL(&tv, timeout);
2154		for (;;) {
2155			error = _do_lock_umutex(td, m, flags, tvtohz(&tv), try);
2156			if (error != ETIMEDOUT)
2157				break;
2158			getnanouptime(&ts2);
2159			if (timespeccmp(&ts2, &ts, >=)) {
2160				error = ETIMEDOUT;
2161				break;
2162			}
2163			ts3 = ts;
2164			timespecsub(&ts3, &ts2);
2165			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
2166		}
2167		/* Timed-locking is not restarted. */
2168		if (error == ERESTART)
2169			error = EINTR;
2170	}
2171	return (error);
2172}
2173
2174/*
2175 * Unlock a userland POSIX mutex.
2176 */
2177static int
2178do_unlock_umutex(struct thread *td, struct umutex *m)
2179{
2180	uint32_t flags;
2181
2182	flags = fuword32(&m->m_flags);
2183	if (flags == -1)
2184		return (EFAULT);
2185
2186	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2187	case 0:
2188		return (do_unlock_normal(td, m, flags));
2189	case UMUTEX_PRIO_INHERIT:
2190		return (do_unlock_pi(td, m, flags));
2191	case UMUTEX_PRIO_PROTECT:
2192		return (do_unlock_pp(td, m, flags));
2193	}
2194
2195	return (EINVAL);
2196}
2197
2198static int
2199do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m,
2200	struct timespec *timeout, u_long wflags)
2201{
2202	struct umtx_q *uq;
2203	struct timeval tv;
2204	struct timespec cts, ets, tts;
2205	uint32_t flags;
2206	int error;
2207
2208	uq = td->td_umtxq;
2209	flags = fuword32(&cv->c_flags);
2210	error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key);
2211	if (error != 0)
2212		return (error);
2213	umtxq_lock(&uq->uq_key);
2214	umtxq_busy(&uq->uq_key);
2215	umtxq_insert(uq);
2216	umtxq_unlock(&uq->uq_key);
2217
2218	/*
2219	 * c_has_waiters must be set to 1 before the user mutex is
2220	 * released; otherwise a signaller could skip the wakeup.
2221	 */
2222	suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 1);
2223
2224	umtxq_lock(&uq->uq_key);
2225	umtxq_unbusy(&uq->uq_key);
2226	umtxq_unlock(&uq->uq_key);
2227
2228	error = do_unlock_umutex(td, m);
2229
2230	umtxq_lock(&uq->uq_key);
2231	if (error == 0) {
2232		if ((wflags & UMTX_CHECK_UNPARKING) &&
2233		    (td->td_pflags & TDP_WAKEUP)) {
2234			td->td_pflags &= ~TDP_WAKEUP;
2235			error = EINTR;
2236		} else if (timeout == NULL) {
2237			error = umtxq_sleep(uq, "ucond", 0);
2238		} else {
2239			getnanouptime(&ets);
2240			timespecadd(&ets, timeout);
2241			TIMESPEC_TO_TIMEVAL(&tv, timeout);
2242			for (;;) {
2243				error = umtxq_sleep(uq, "ucond", tvtohz(&tv));
2244				if (error != ETIMEDOUT)
2245					break;
2246				getnanouptime(&cts);
2247				if (timespeccmp(&cts, &ets, >=)) {
2248					error = ETIMEDOUT;
2249					break;
2250				}
2251				tts = ets;
2252				timespecsub(&tts, &cts);
2253				TIMESPEC_TO_TIMEVAL(&tv, &tts);
2254			}
2255		}
2256	}
2257
2258	if (error != 0) {
2259		if ((uq->uq_flags & UQF_UMTXQ) == 0) {
2260			/*
2261			 * A concurrent do_cv_signal() removed us from the
2262			 * queue, but we are returning with an error (a UNIX
2263			 * signal or a timeout).  Perform another umtxq_signal()
2264			 * so that the wakeup is not consumed.  This may cause
2265			 * a spurious wakeup of another thread which was just
2266			 * queued, but SUSv3 explicitly allows spurious wakeups
2267			 * to occur, and indeed a kernel-based implementation
2268			 * cannot avoid them.
2269			 */
2270			if (!umtxq_signal(&uq->uq_key, 1))
2271				error = 0;
2272		}
2273		if (error == ERESTART)
2274			error = EINTR;
2275	}
2276	umtxq_remove(uq);
2277	umtxq_unlock(&uq->uq_key);
2278	umtx_key_release(&uq->uq_key);
2279	return (error);
2280}
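
/*
 * A hedged sketch of the userland waiter protocol served by
 * do_cv_wait() above; `cv' and `m' are hypothetical zero-initialized
 * struct ucond / struct umutex objects and `cond' is application
 * state protected by `m'.  Note that the kernel releases the mutex
 * but does not re-acquire it on wakeup, so the caller must relock
 * before re-testing the predicate:
 *
 *	_umtx_op(&m, UMTX_OP_MUTEX_LOCK, 0, NULL, NULL);
 *	while (!cond) {
 *		_umtx_op(&cv, UMTX_OP_CV_WAIT, 0, &m, NULL);
 *		_umtx_op(&m, UMTX_OP_MUTEX_LOCK, 0, NULL, NULL);
 *	}
 *	_umtx_op(&m, UMTX_OP_MUTEX_UNLOCK, 0, NULL, NULL);
 *
 * The predicate loop also absorbs the spurious wakeups that the
 * comment above notes are unavoidable.  A signalling thread stores
 * `cond' under the mutex and calls _umtx_op(&cv, UMTX_OP_CV_SIGNAL,
 * 0, NULL, NULL).
 */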
2281
2282/*
2283 * Signal a userland condition variable.
2284 */
2285static int
2286do_cv_signal(struct thread *td, struct ucond *cv)
2287{
2288	struct umtx_key key;
2289	int error, cnt, nwake;
2290	uint32_t flags;
2291
2292	flags = fuword32(&cv->c_flags);
2293	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2294		return (error);
2295	umtxq_lock(&key);
2296	umtxq_busy(&key);
2297	cnt = umtxq_count(&key);
2298	nwake = umtxq_signal(&key, 1);
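	/*
	 * If every queued waiter was woken, the userland
	 * c_has_waiters hint can be cleared.  The chain is still
	 * marked busy here, so a thread concurrently entering
	 * do_cv_wait() is held off and cannot set the flag between
	 * our count and this store.
	 */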
2299	if (cnt <= nwake) {
2300		umtxq_unlock(&key);
2301		error = suword32(
2302		    __DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
2303		umtxq_lock(&key);
2304	}
2305	umtxq_unbusy(&key);
2306	umtxq_unlock(&key);
2307	umtx_key_release(&key);
2308	return (error);
2309}
2310
2311static int
2312do_cv_broadcast(struct thread *td, struct ucond *cv)
2313{
2314	struct umtx_key key;
2315	int error;
2316	uint32_t flags;
2317
2318	flags = fuword32(&cv->c_flags);
2319	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2320		return (error);
2321
2322	umtxq_lock(&key);
2323	umtxq_busy(&key);
2324	umtxq_signal(&key, INT_MAX);
2325	umtxq_unlock(&key);
2326
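	/*
	 * Every waiter has been woken, so clear the userland
	 * c_has_waiters hint; keeping the chain busy across the
	 * store prevents a newly arriving waiter's flag from being
	 * overwritten here.
	 */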
2327	error = suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
2328
2329	umtxq_lock(&key);
2330	umtxq_unbusy(&key);
2331	umtxq_unlock(&key);
2332
2333	umtx_key_release(&key);
2334	return (error);
2335}
2336
2337int
2338_umtx_lock(struct thread *td, struct _umtx_lock_args *uap)
2339    /* struct umtx *umtx */
2340{
2341	return (_do_lock_umtx(td, uap->umtx, td->td_tid, 0));
2342}
2343
2344int
2345_umtx_unlock(struct thread *td, struct _umtx_unlock_args *uap)
2346    /* struct umtx *umtx */
2347{
2348	return (do_unlock_umtx(td, uap->umtx, td->td_tid));
2349}
2350
2351static int
2352__umtx_op_lock_umtx(struct thread *td, struct _umtx_op_args *uap)
2353{
2354	struct timespec *ts, timeout;
2355	int error;
2356
2357	/* Allow a null timespec (wait forever). */
2358	if (uap->uaddr2 == NULL)
2359		ts = NULL;
2360	else {
2361		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2362		if (error != 0)
2363			return (error);
2364		if (timeout.tv_nsec >= 1000000000 ||
2365		    timeout.tv_nsec < 0) {
2366			return (EINVAL);
2367		}
2368		ts = &timeout;
2369	}
2370	return (do_lock_umtx(td, uap->obj, uap->val, ts));
2371}
2372
2373static int
2374__umtx_op_unlock_umtx(struct thread *td, struct _umtx_op_args *uap)
2375{
2376	return (do_unlock_umtx(td, uap->obj, uap->val));
2377}
2378
2379static int
2380__umtx_op_wait(struct thread *td, struct _umtx_op_args *uap)
2381{
2382	struct timespec *ts, timeout;
2383	int error;
2384
2385	if (uap->uaddr2 == NULL)
2386		ts = NULL;
2387	else {
2388		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2389		if (error != 0)
2390			return (error);
2391		if (timeout.tv_nsec >= 1000000000 ||
2392		    timeout.tv_nsec < 0)
2393			return (EINVAL);
2394		ts = &timeout;
2395	}
2396	return (do_wait(td, uap->obj, uap->val, ts, 0));
2397}
2398
2399static int
2400__umtx_op_wait_uint(struct thread *td, struct _umtx_op_args *uap)
2401{
2402	struct timespec *ts, timeout;
2403	int error;
2404
2405	if (uap->uaddr2 == NULL)
2406		ts = NULL;
2407	else {
2408		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2409		if (error != 0)
2410			return (error);
2411		if (timeout.tv_nsec >= 1000000000 ||
2412		    timeout.tv_nsec < 0)
2413			return (EINVAL);
2414		ts = &timeout;
2415	}
2416	return (do_wait(td, uap->obj, uap->val, ts, 1));
2417}
2418
2419static int
2420__umtx_op_wake(struct thread *td, struct _umtx_op_args *uap)
2421{
2422	return (kern_umtx_wake(td, uap->obj, uap->val));
2423}
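
/*
 * A hedged sketch of pairing UMTX_OP_WAIT with UMTX_OP_WAKE from
 * userland as a futex-style primitive; `word' is a hypothetical
 * u_long shared between the threads.  do_wait() re-reads the word in
 * the kernel and sleeps only while it still equals the expected
 * value, so the test and the sleep cannot race with the waker's
 * store:
 *
 *	Waiter:
 *		while (word == 0)
 *			_umtx_op(&word, UMTX_OP_WAIT, 0, NULL, NULL);
 *	Waker:
 *		word = 1;
 *		_umtx_op(&word, UMTX_OP_WAKE, INT_MAX, NULL, NULL);
 *
 * A production version would access `word' with atomic operations.
 */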
2424
2425static int
2426__umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap)
2427{
2428	struct timespec *ts, timeout;
2429	int error;
2430
2431	/* Allow a null timespec (wait forever). */
2432	if (uap->uaddr2 == NULL)
2433		ts = NULL;
2434	else {
2435		error = copyin(uap->uaddr2, &timeout,
2436		    sizeof(timeout));
2437		if (error != 0)
2438			return (error);
2439		if (timeout.tv_nsec >= 1000000000 ||
2440		    timeout.tv_nsec < 0) {
2441			return (EINVAL);
2442		}
2443		ts = &timeout;
2444	}
2445	return (do_lock_umutex(td, uap->obj, ts, 0));
2446}
2447
2448static int
2449__umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap)
2450{
2451	return (do_lock_umutex(td, uap->obj, NULL, 1));
2452}
2453
2454static int
2455__umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap)
2456{
2457	return (do_unlock_umutex(td, uap->obj));
2458}
2459
2460static int
2461__umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap)
2462{
2463	return (do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1));
2464}
2465
2466static int
2467__umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap)
2468{
2469	struct timespec *ts, timeout;
2470	int error;
2471
2472	/* Allow a null timespec (wait forever). */
2473	if (uap->uaddr2 == NULL)
2474		ts = NULL;
2475	else {
2476		error = copyin(uap->uaddr2, &timeout,
2477		    sizeof(timeout));
2478		if (error != 0)
2479			return (error);
2480		if (timeout.tv_nsec >= 1000000000 ||
2481		    timeout.tv_nsec < 0) {
2482			return (EINVAL);
2483		}
2484		ts = &timeout;
2485	}
2486	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
2487}
2488
2489static int
2490__umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap)
2491{
2492	return (do_cv_signal(td, uap->obj));
2493}
2494
2495static int
2496__umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap)
2497{
2498	return (do_cv_broadcast(td, uap->obj));
2499}
2500
2501typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap);
2502
2503static _umtx_op_func op_table[] = {
2504	__umtx_op_lock_umtx,		/* UMTX_OP_LOCK */
2505	__umtx_op_unlock_umtx,		/* UMTX_OP_UNLOCK */
2506	__umtx_op_wait,			/* UMTX_OP_WAIT */
2507	__umtx_op_wake,			/* UMTX_OP_WAKE */
2508	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
2509	__umtx_op_lock_umutex,		/* UMTX_OP_MUTEX_LOCK */
2510	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
2511	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
2512	__umtx_op_cv_wait,		/* UMTX_OP_CV_WAIT */
2513	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
2514	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
2515	__umtx_op_wait_uint		/* UMTX_OP_WAIT_UINT */
2516};
2517
2518int
2519_umtx_op(struct thread *td, struct _umtx_op_args *uap)
2520{
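	/*
	 * The unsigned cast maps a negative op onto a huge value, so
	 * this single comparison rejects both out-of-range ends.
	 */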
2521	if ((unsigned)uap->op < UMTX_OP_MAX)
2522		return (*op_table[uap->op])(td, uap);
2523	return (EINVAL);
2524}
2525
2526#ifdef COMPAT_IA32
2527int
2528freebsd32_umtx_lock(struct thread *td, struct freebsd32_umtx_lock_args *uap)
2529    /* struct umtx *umtx */
2530{
2531	return (do_lock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid, NULL));
2532}
2533
2534int
2535freebsd32_umtx_unlock(struct thread *td, struct freebsd32_umtx_unlock_args *uap)
2536    /* struct umtx *umtx */
2537{
2538	return (do_unlock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid));
2539}
2540
2541struct timespec32 {
2542	u_int32_t tv_sec;
2543	u_int32_t tv_nsec;
2544};
2545
2546static inline int
2547copyin_timeout32(void *addr, struct timespec *tsp)
2548{
2549	struct timespec32 ts32;
2550	int error;
2551
2552	error = copyin(addr, &ts32, sizeof(struct timespec32));
2553	if (error == 0) {
2554		tsp->tv_sec = ts32.tv_sec;
2555		tsp->tv_nsec = ts32.tv_nsec;
2556	}
2557	return (error);
2558}
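
/*
 * Both fields widen from u_int32_t, so the resulting timespec is
 * always non-negative; the tv_nsec < 0 checks in the compat32 paths
 * below can never fire, while the tv_nsec >= 1000000000 checks still
 * reject out-of-range values.
 */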
2559
2560static int
2561__umtx_op_lock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
2562{
2563	struct timespec *ts, timeout;
2564	int error;
2565
2566	/* Allow a null timespec (wait forever). */
2567	if (uap->uaddr2 == NULL)
2568		ts = NULL;
2569	else {
2570		error = copyin_timeout32(uap->uaddr2, &timeout);
2571		if (error != 0)
2572			return (error);
2573		if (timeout.tv_nsec >= 1000000000 ||
2574		    timeout.tv_nsec < 0) {
2575			return (EINVAL);
2576		}
2577		ts = &timeout;
2578	}
2579	return (do_lock_umtx32(td, uap->obj, uap->val, ts));
2580}
2581
2582static int
2583__umtx_op_unlock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
2584{
2585	return (do_unlock_umtx32(td, uap->obj, (uint32_t)uap->val));
2586}
2587
2588static int
2589__umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
2590{
2591	struct timespec *ts, timeout;
2592	int error;
2593
2594	if (uap->uaddr2 == NULL)
2595		ts = NULL;
2596	else {
2597		error = copyin_timeout32(uap->uaddr2, &timeout);
2598		if (error != 0)
2599			return (error);
2600		if (timeout.tv_nsec >= 1000000000 ||
2601		    timeout.tv_nsec < 0)
2602			return (EINVAL);
2603		ts = &timeout;
2604	}
2605	return (do_wait(td, uap->obj, uap->val, ts, 1));
2606}
2607
2608static int
2609__umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
2610{
2611	struct timespec *ts, timeout;
2612	int error;
2613
2614	/* Allow a null timespec (wait forever). */
2615	if (uap->uaddr2 == NULL)
2616		ts = NULL;
2617	else {
2618		error = copyin_timeout32(uap->uaddr2, &timeout);
2619		if (error != 0)
2620			return (error);
2621		if (timeout.tv_nsec >= 1000000000 ||
2622		    timeout.tv_nsec < 0)
2623			return (EINVAL);
2624		ts = &timeout;
2625	}
2626	return (do_lock_umutex(td, uap->obj, ts, 0));
2627}
2628
2629static int
2630__umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
2631{
2632	struct timespec *ts, timeout;
2633	int error;
2634
2635	/* Allow a null timespec (wait forever). */
2636	if (uap->uaddr2 == NULL)
2637		ts = NULL;
2638	else {
2639		error = copyin_timeout32(uap->uaddr2, &timeout);
2640		if (error != 0)
2641			return (error);
2642		if (timeout.tv_nsec >= 1000000000 ||
2643		    timeout.tv_nsec < 0)
2644			return (EINVAL);
2645		ts = &timeout;
2646	}
2647	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
2648}
2649
2650static _umtx_op_func op_table_compat32[] = {
2651	__umtx_op_lock_umtx_compat32,	/* UMTX_OP_LOCK */
2652	__umtx_op_unlock_umtx_compat32,	/* UMTX_OP_UNLOCK */
2653	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT */
2654	__umtx_op_wake,			/* UMTX_OP_WAKE */
2655	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
2656	__umtx_op_lock_umutex_compat32,	/* UMTX_OP_MUTEX_LOCK */
2657	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
2658	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
2659	__umtx_op_cv_wait_compat32,	/* UMTX_OP_CV_WAIT */
2660	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
2661	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
2662	__umtx_op_wait_compat32		/* UMTX_OP_WAIT_UINT */
2663};
2664
2665int
2666freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap)
2667{
2668	if ((unsigned)uap->op < UMTX_OP_MAX)
2669		return (*op_table_compat32[uap->op])(td,
2670			(struct _umtx_op_args *)uap);
2671	return (EINVAL);
2672}
2673#endif
2674
2675void
2676umtx_thread_init(struct thread *td)
2677{
2678	td->td_umtxq = umtxq_alloc();
2679	td->td_umtxq->uq_thread = td;
2680}
2681
2682void
2683umtx_thread_fini(struct thread *td)
2684{
2685	umtxq_free(td->td_umtxq);
2686}
2687
2688/*
2689 * Called when a new thread is created, e.g. by fork().
2690 */
2691void
2692umtx_thread_alloc(struct thread *td)
2693{
2694	struct umtx_q *uq;
2695
2696	uq = td->td_umtxq;
2697	uq->uq_inherited_pri = PRI_MAX;
2698
2699	KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
2700	KASSERT(uq->uq_thread == td, ("uq_thread != td"));
2701	KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
2702	KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
2703}
2704
2705/*
2706 * exec() hook.
2707 */
2708static void
2709umtx_exec_hook(void *arg __unused, struct proc *p __unused,
2710	struct image_params *imgp __unused)
2711{
2712	umtx_thread_cleanup(curthread);
2713}
2714
2715/*
2716 * thread_exit() hook.
2717 */
2718void
2719umtx_thread_exit(struct thread *td)
2720{
2721	umtx_thread_cleanup(td);
2722}
2723
2724/*
2725 * Clean up a thread's umtx state on exec() and thread exit.
2726 */
2727static void
2728umtx_thread_cleanup(struct thread *td)
2729{
2730	struct umtx_q *uq;
2731	struct umtx_pi *pi;
2732
2733	if ((uq = td->td_umtxq) == NULL)
2734		return;
2735
2736	mtx_lock_spin(&umtx_lock);
2737	uq->uq_inherited_pri = PRI_MAX;
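	/*
	 * Disown any PI mutexes still recorded as contested by this
	 * thread so that no umtx_pi is left pointing at a destroyed
	 * thread; ownership is re-established when another thread
	 * acquires the userland mutex.
	 */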
2738	while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
2739		pi->pi_owner = NULL;
2740		TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
2741	}
2742	thread_lock(td);
2743	td->td_flags &= ~TDF_UBORROWING;
2744	thread_unlock(td);
2745	mtx_unlock_spin(&umtx_lock);
2746}
2747