1/*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 *
4 * Copyright (c) 2015, 2016 The FreeBSD Foundation
5 * Copyright (c) 2004, David Xu <davidxu@freebsd.org>
6 * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
7 * All rights reserved.
8 *
9 * Portions of this software were developed by Konstantin Belousov
10 * under sponsorship from the FreeBSD Foundation.
11 *
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 *    notice unmodified, this list of conditions, and the following
17 *    disclaimer.
18 * 2. Redistributions in binary form must reproduce the above copyright
19 *    notice, this list of conditions and the following disclaimer in the
20 *    documentation and/or other materials provided with the distribution.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
23 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
24 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
25 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
27 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
31 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33
34#include <sys/cdefs.h>
35__FBSDID("$FreeBSD$");
36
37#include "opt_umtx_profiling.h"
38
39#include <sys/param.h>
40#include <sys/kernel.h>
41#include <sys/fcntl.h>
42#include <sys/file.h>
43#include <sys/filedesc.h>
44#include <sys/limits.h>
45#include <sys/lock.h>
46#include <sys/malloc.h>
47#include <sys/mman.h>
48#include <sys/mutex.h>
49#include <sys/priv.h>
50#include <sys/proc.h>
51#include <sys/resource.h>
52#include <sys/resourcevar.h>
53#include <sys/rwlock.h>
54#include <sys/sbuf.h>
55#include <sys/sched.h>
56#include <sys/smp.h>
57#include <sys/sysctl.h>
58#include <sys/sysent.h>
59#include <sys/systm.h>
60#include <sys/sysproto.h>
61#include <sys/syscallsubr.h>
62#include <sys/taskqueue.h>
63#include <sys/time.h>
64#include <sys/eventhandler.h>
65#include <sys/umtx.h>
66
67#include <security/mac/mac_framework.h>
68
69#include <vm/vm.h>
70#include <vm/vm_param.h>
71#include <vm/pmap.h>
72#include <vm/vm_map.h>
73#include <vm/vm_object.h>
74
75#include <machine/atomic.h>
76#include <machine/cpu.h>
77
78#include <compat/freebsd32/freebsd32.h>
79#ifdef COMPAT_FREEBSD32
80#include <compat/freebsd32/freebsd32_proto.h>
81#endif
82
83#define _UMUTEX_TRY		1
84#define _UMUTEX_WAIT		2
85
86#ifdef UMTX_PROFILING
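/*
 * Compare two percentages, each expressed as a whole part and a
 * fractional part: (w, f) is "bigger" than (sw, sf) if its whole part
 * is larger, or the whole parts are equal and its fractional part is
 * larger.
 */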
87#define	UPROF_PERC_BIGGER(w, f, sw, sf)					\
88	(((w) > (sw)) || ((w) == (sw) && (f) > (sf)))
89#endif
90
91/* Priority inheritance mutex info. */
92struct umtx_pi {
93	/* Owner thread */
94	struct thread		*pi_owner;
95
96	/* Reference count */
97	int			pi_refcount;
98
	/* List entry linking PI mutexes held by a thread */
100	TAILQ_ENTRY(umtx_pi)	pi_link;
101
102	/* List entry in hash */
103	TAILQ_ENTRY(umtx_pi)	pi_hashlink;
104
105	/* List for waiters */
106	TAILQ_HEAD(,umtx_q)	pi_blocked;
107
108	/* Identify a userland lock object */
109	struct umtx_key		pi_key;
110};
111
/* A waiter on a userland synchronization object. */
113struct umtx_q {
114	/* Linked list for the hash. */
115	TAILQ_ENTRY(umtx_q)	uq_link;
116
117	/* Umtx key. */
118	struct umtx_key		uq_key;
119
120	/* Umtx flags. */
121	int			uq_flags;
122#define UQF_UMTXQ	0x0001
123
	/* The thread that is waiting. */
125	struct thread		*uq_thread;
126
	/*
	 * The PI mutex this thread is blocked on.  Reads may be done
	 * holding either the chain lock or umtx_lock; writes require
	 * both the chain lock and umtx_lock to be held.
	 */
132	struct umtx_pi		*uq_pi_blocked;
133
134	/* On blocked list */
135	TAILQ_ENTRY(umtx_q)	uq_lockq;
136
	/* PI mutexes owned by us that other threads contend on */
138	TAILQ_HEAD(,umtx_pi)	uq_pi_contested;
139
140	/* Inherited priority from PP mutex */
141	u_char			uq_inherited_pri;
142
143	/* Spare queue ready to be reused */
144	struct umtxq_queue	*uq_spare_queue;
145
	/* The queue we are on */
147	struct umtxq_queue	*uq_cur_queue;
148};
149
150TAILQ_HEAD(umtxq_head, umtx_q);
151
152/* Per-key wait-queue */
153struct umtxq_queue {
154	struct umtxq_head	head;
155	struct umtx_key		key;
156	LIST_ENTRY(umtxq_queue)	link;
157	int			length;
158};
159
160LIST_HEAD(umtxq_list, umtxq_queue);
161
162/* Userland lock object's wait-queue chain */
163struct umtxq_chain {
164	/* Lock for this chain. */
165	struct mtx		uc_lock;
166
167	/* List of sleep queues. */
168	struct umtxq_list	uc_queue[2];
169#define UMTX_SHARED_QUEUE	0
170#define UMTX_EXCLUSIVE_QUEUE	1
171
172	LIST_HEAD(, umtxq_queue) uc_spare_queue;
173
174	/* Busy flag */
175	char			uc_busy;
176
177	/* Chain lock waiters */
178	int			uc_waiters;
179
	/* All PI mutexes hashed to this chain */
181	TAILQ_HEAD(,umtx_pi)	uc_pi_list;
182
183#ifdef UMTX_PROFILING
184	u_int			length;
185	u_int			max_length;
186#endif
187};
188
189#define	UMTXQ_LOCKED_ASSERT(uc)		mtx_assert(&(uc)->uc_lock, MA_OWNED)
190
/*
 * Don't propagate time-sharing priority.  There is a security reason:
 * a user can simply create a PI mutex, let thread A lock it, and let
 * another thread B block on it.  Because B is sleeping, its priority
 * is boosted, which boosts A's priority through priority propagation
 * as well; A's priority would then never be lowered even if it is
 * using 100% CPU, which is unfair to other processes.
 */
199
200#define UPRI(td)	(((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\
201			  (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\
202			 PRI_MAX_TIMESHARE : (td)->td_user_pri)
203
204#define	GOLDEN_RATIO_PRIME	2654404609U
205#ifndef	UMTX_CHAINS
206#define	UMTX_CHAINS		512
207#endif
208#define	UMTX_SHIFTS		(__WORD_BIT - 9)
209
210#define	GET_SHARE(flags)	\
211    (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)
212
213#define BUSY_SPINS		200
214
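/*
 * A sleep deadline tracked against a specific clock: "cur" caches the
 * most recent reading of that clock and "end" is the absolute
 * expiration time.
 */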
215struct abs_timeout {
216	int clockid;
217	bool is_abs_real;	/* TIMER_ABSTIME && CLOCK_REALTIME* */
218	struct timespec cur;
219	struct timespec end;
220};
221
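/*
 * Per-ABI operations for copying umtx arguments in and out of
 * userspace.  Presumably one instance exists for the native ABI and,
 * under COMPAT_FREEBSD32, another for 32-bit processes (note the
 * compat32 flag and the per-ABI structure sizes).
 */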
222struct umtx_copyops {
223	int	(*copyin_timeout)(const void *uaddr, struct timespec *tsp);
224	int	(*copyin_umtx_time)(const void *uaddr, size_t size,
225	    struct _umtx_time *tp);
226	int	(*copyin_robust_lists)(const void *uaddr, size_t size,
227	    struct umtx_robust_lists_params *rbp);
228	int	(*copyout_timeout)(void *uaddr, size_t size,
229	    struct timespec *tsp);
230	const size_t	timespec_sz;
231	const size_t	umtx_time_sz;
232	const bool	compat32;
233};
234
235_Static_assert(sizeof(struct umutex) == sizeof(struct umutex32), "umutex32");
236_Static_assert(__offsetof(struct umutex, m_spare[0]) ==
237    __offsetof(struct umutex32, m_spare[0]), "m_spare32");
238
239int umtx_shm_vnobj_persistent = 0;
240SYSCTL_INT(_kern_ipc, OID_AUTO, umtx_vnode_persistent, CTLFLAG_RWTUN,
241    &umtx_shm_vnobj_persistent, 0,
242    "False forces destruction of umtx attached to file, on last close");
243static int umtx_max_rb = 1000;
244SYSCTL_INT(_kern_ipc, OID_AUTO, umtx_max_robust, CTLFLAG_RWTUN,
245    &umtx_max_rb, 0,
246    "Maximum number of robust mutexes allowed for each thread");
247
248static uma_zone_t		umtx_pi_zone;
249static struct umtxq_chain	umtxq_chains[2][UMTX_CHAINS];
250static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
251static int			umtx_pi_allocated;
252
253static SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
254    "umtx debug");
255SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
256    &umtx_pi_allocated, 0, "Allocated umtx_pi");
257static int umtx_verbose_rb = 1;
258SYSCTL_INT(_debug_umtx, OID_AUTO, robust_faults_verbose, CTLFLAG_RWTUN,
259    &umtx_verbose_rb, 0,
260    "");
261
262#ifdef UMTX_PROFILING
263static long max_length;
264SYSCTL_LONG(_debug_umtx, OID_AUTO, max_length, CTLFLAG_RD, &max_length, 0, "max_length");
265static SYSCTL_NODE(_debug_umtx, OID_AUTO, chains, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
266    "umtx chain stats");
267#endif
268
269static void abs_timeout_update(struct abs_timeout *timo);
270
271static void umtx_shm_init(void);
272static void umtxq_sysinit(void *);
273static void umtxq_hash(struct umtx_key *key);
274static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
275static void umtxq_unlock(struct umtx_key *key);
276static void umtxq_busy(struct umtx_key *key);
277static void umtxq_unbusy(struct umtx_key *key);
278static void umtxq_insert_queue(struct umtx_q *uq, int q);
279static void umtxq_remove_queue(struct umtx_q *uq, int q);
280static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *);
281static int umtxq_count(struct umtx_key *key);
282static struct umtx_pi *umtx_pi_alloc(int);
283static void umtx_pi_free(struct umtx_pi *pi);
284static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags,
285    bool rb);
286static void umtx_thread_cleanup(struct thread *td);
287SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);
288
289#define umtxq_signal(key, nwake)	umtxq_signal_queue((key), (nwake), UMTX_SHARED_QUEUE)
290#define umtxq_insert(uq)	umtxq_insert_queue((uq), UMTX_SHARED_QUEUE)
291#define umtxq_remove(uq)	umtxq_remove_queue((uq), UMTX_SHARED_QUEUE)
292
293static struct mtx umtx_lock;
294
295#ifdef UMTX_PROFILING
296static void
297umtx_init_profiling(void)
298{
299	struct sysctl_oid *chain_oid;
300	char chain_name[10];
301	int i;
302
303	for (i = 0; i < UMTX_CHAINS; ++i) {
304		snprintf(chain_name, sizeof(chain_name), "%d", i);
305		chain_oid = SYSCTL_ADD_NODE(NULL,
306		    SYSCTL_STATIC_CHILDREN(_debug_umtx_chains), OID_AUTO,
307		    chain_name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
308		    "umtx hash stats");
309		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
310		    "max_length0", CTLFLAG_RD, &umtxq_chains[0][i].max_length, 0, NULL);
311		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
312		    "max_length1", CTLFLAG_RD, &umtxq_chains[1][i].max_length, 0, NULL);
313	}
314}
315
316static int
317sysctl_debug_umtx_chains_peaks(SYSCTL_HANDLER_ARGS)
318{
319	char buf[512];
320	struct sbuf sb;
321	struct umtxq_chain *uc;
322	u_int fract, i, j, tot, whole;
323	u_int sf0, sf1, sf2, sf3, sf4;
324	u_int si0, si1, si2, si3, si4;
325	u_int sw0, sw1, sw2, sw3, sw4;
326
327	sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
328	for (i = 0; i < 2; i++) {
329		tot = 0;
330		for (j = 0; j < UMTX_CHAINS; ++j) {
331			uc = &umtxq_chains[i][j];
332			mtx_lock(&uc->uc_lock);
333			tot += uc->max_length;
334			mtx_unlock(&uc->uc_lock);
335		}
336		if (tot == 0)
337			sbuf_printf(&sb, "%u) Empty ", i);
338		else {
339			sf0 = sf1 = sf2 = sf3 = sf4 = 0;
340			si0 = si1 = si2 = si3 = si4 = 0;
341			sw0 = sw1 = sw2 = sw3 = sw4 = 0;
342			for (j = 0; j < UMTX_CHAINS; j++) {
343				uc = &umtxq_chains[i][j];
344				mtx_lock(&uc->uc_lock);
345				whole = uc->max_length * 100;
346				mtx_unlock(&uc->uc_lock);
347				fract = (whole % tot) * 100;
348				if (UPROF_PERC_BIGGER(whole, fract, sw0, sf0)) {
349					sf0 = fract;
350					si0 = j;
351					sw0 = whole;
352				} else if (UPROF_PERC_BIGGER(whole, fract, sw1,
353				    sf1)) {
354					sf1 = fract;
355					si1 = j;
356					sw1 = whole;
357				} else if (UPROF_PERC_BIGGER(whole, fract, sw2,
358				    sf2)) {
359					sf2 = fract;
360					si2 = j;
361					sw2 = whole;
362				} else if (UPROF_PERC_BIGGER(whole, fract, sw3,
363				    sf3)) {
364					sf3 = fract;
365					si3 = j;
366					sw3 = whole;
367				} else if (UPROF_PERC_BIGGER(whole, fract, sw4,
368				    sf4)) {
369					sf4 = fract;
370					si4 = j;
371					sw4 = whole;
372				}
373			}
374			sbuf_printf(&sb, "queue %u:\n", i);
375			sbuf_printf(&sb, "1st: %u.%u%% idx: %u\n", sw0 / tot,
376			    sf0 / tot, si0);
377			sbuf_printf(&sb, "2nd: %u.%u%% idx: %u\n", sw1 / tot,
378			    sf1 / tot, si1);
379			sbuf_printf(&sb, "3rd: %u.%u%% idx: %u\n", sw2 / tot,
380			    sf2 / tot, si2);
381			sbuf_printf(&sb, "4th: %u.%u%% idx: %u\n", sw3 / tot,
382			    sf3 / tot, si3);
383			sbuf_printf(&sb, "5th: %u.%u%% idx: %u\n", sw4 / tot,
384			    sf4 / tot, si4);
385		}
386	}
387	sbuf_trim(&sb);
388	sbuf_finish(&sb);
389	sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
390	sbuf_delete(&sb);
391	return (0);
392}
393
394static int
395sysctl_debug_umtx_chains_clear(SYSCTL_HANDLER_ARGS)
396{
397	struct umtxq_chain *uc;
398	u_int i, j;
399	int clear, error;
400
401	clear = 0;
402	error = sysctl_handle_int(oidp, &clear, 0, req);
403	if (error != 0 || req->newptr == NULL)
404		return (error);
405
406	if (clear != 0) {
407		for (i = 0; i < 2; ++i) {
408			for (j = 0; j < UMTX_CHAINS; ++j) {
409				uc = &umtxq_chains[i][j];
410				mtx_lock(&uc->uc_lock);
411				uc->length = 0;
412				uc->max_length = 0;
413				mtx_unlock(&uc->uc_lock);
414			}
415		}
416	}
417	return (0);
418}
419
420SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, clear,
421    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0,
422    sysctl_debug_umtx_chains_clear, "I",
423    "Clear umtx chains statistics");
424SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, peaks,
425    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0,
426    sysctl_debug_umtx_chains_peaks, "A",
427    "Highest peaks in chains max length");
428#endif
429
430static void
431umtxq_sysinit(void *arg __unused)
432{
433	int i, j;
434
435	umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
436		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
437	for (i = 0; i < 2; ++i) {
438		for (j = 0; j < UMTX_CHAINS; ++j) {
439			mtx_init(&umtxq_chains[i][j].uc_lock, "umtxql", NULL,
440				 MTX_DEF | MTX_DUPOK);
441			LIST_INIT(&umtxq_chains[i][j].uc_queue[0]);
442			LIST_INIT(&umtxq_chains[i][j].uc_queue[1]);
443			LIST_INIT(&umtxq_chains[i][j].uc_spare_queue);
444			TAILQ_INIT(&umtxq_chains[i][j].uc_pi_list);
445			umtxq_chains[i][j].uc_busy = 0;
446			umtxq_chains[i][j].uc_waiters = 0;
447#ifdef UMTX_PROFILING
448			umtxq_chains[i][j].length = 0;
449			umtxq_chains[i][j].max_length = 0;
450#endif
451		}
452	}
453#ifdef UMTX_PROFILING
454	umtx_init_profiling();
455#endif
456	mtx_init(&umtx_lock, "umtx lock", NULL, MTX_DEF);
457	umtx_shm_init();
458}
459
460struct umtx_q *
461umtxq_alloc(void)
462{
463	struct umtx_q *uq;
464
465	uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
466	uq->uq_spare_queue = malloc(sizeof(struct umtxq_queue), M_UMTX,
467	    M_WAITOK | M_ZERO);
468	TAILQ_INIT(&uq->uq_spare_queue->head);
469	TAILQ_INIT(&uq->uq_pi_contested);
470	uq->uq_inherited_pri = PRI_MAX;
471	return (uq);
472}
473
474void
475umtxq_free(struct umtx_q *uq)
476{
477
478	MPASS(uq->uq_spare_queue != NULL);
479	free(uq->uq_spare_queue, M_UMTX);
480	free(uq, M_UMTX);
481}
482
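/*
 * Hash a umtx key into a chain index.  The two words of the key (the
 * backing object or vmspace pointer, and the offset or address) are
 * summed and scrambled with a multiplicative hash.
 */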
483static inline void
484umtxq_hash(struct umtx_key *key)
485{
486	unsigned n;
487
488	n = (uintptr_t)key->info.both.a + key->info.both.b;
489	key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
490}
491
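/*
 * Select the chain for a key.  Two chain arrays are kept: keys whose
 * type is TYPE_SEM or below use the second array, everything else the
 * first, presumably so that wait/semaphore traffic does not collide
 * with mutex traffic in the same hash buckets.
 */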
492static inline struct umtxq_chain *
493umtxq_getchain(struct umtx_key *key)
494{
495
496	if (key->type <= TYPE_SEM)
497		return (&umtxq_chains[1][key->hash]);
498	return (&umtxq_chains[0][key->hash]);
499}
500
501/*
502 * Lock a chain.
503 *
504 * The code is a macro so that file/line information is taken from the caller.
505 */
506#define umtxq_lock(key) do {		\
507	struct umtx_key *_key = (key);	\
508	struct umtxq_chain *_uc;	\
509					\
510	_uc = umtxq_getchain(_key);	\
511	mtx_lock(&_uc->uc_lock);	\
512} while (0)
513
514/*
515 * Unlock a chain.
516 */
517static inline void
518umtxq_unlock(struct umtx_key *key)
519{
520	struct umtxq_chain *uc;
521
522	uc = umtxq_getchain(key);
523	mtx_unlock(&uc->uc_lock);
524}
525
/*
 * Set the chain to the busy state when the following operation
 * may block (a kernel mutex cannot be held across it).
 */
530static inline void
531umtxq_busy(struct umtx_key *key)
532{
533	struct umtxq_chain *uc;
534
535	uc = umtxq_getchain(key);
536	mtx_assert(&uc->uc_lock, MA_OWNED);
537	if (uc->uc_busy) {
538#ifdef SMP
539		if (smp_cpus > 1) {
540			int count = BUSY_SPINS;
541			if (count > 0) {
542				umtxq_unlock(key);
543				while (uc->uc_busy && --count > 0)
544					cpu_spinwait();
545				umtxq_lock(key);
546			}
547		}
548#endif
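		/* Still busy after spinning (or a UP machine); sleep until woken. */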
549		while (uc->uc_busy) {
550			uc->uc_waiters++;
551			msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
552			uc->uc_waiters--;
553		}
554	}
555	uc->uc_busy = 1;
556}
557
558/*
559 * Unbusy a chain.
560 */
561static inline void
562umtxq_unbusy(struct umtx_key *key)
563{
564	struct umtxq_chain *uc;
565
566	uc = umtxq_getchain(key);
567	mtx_assert(&uc->uc_lock, MA_OWNED);
568	KASSERT(uc->uc_busy != 0, ("not busy"));
569	uc->uc_busy = 0;
570	if (uc->uc_waiters)
571		wakeup_one(uc);
572}
573
574static inline void
575umtxq_unbusy_unlocked(struct umtx_key *key)
576{
577
578	umtxq_lock(key);
579	umtxq_unbusy(key);
580	umtxq_unlock(key);
581}
582
583static struct umtxq_queue *
584umtxq_queue_lookup(struct umtx_key *key, int q)
585{
586	struct umtxq_queue *uh;
587	struct umtxq_chain *uc;
588
589	uc = umtxq_getchain(key);
590	UMTXQ_LOCKED_ASSERT(uc);
591	LIST_FOREACH(uh, &uc->uc_queue[q], link) {
592		if (umtx_key_match(&uh->key, key))
593			return (uh);
594	}
595
596	return (NULL);
597}
598
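/*
 * Queue headers are recycled through per-waiter spare queues: each
 * umtx_q carries a pre-allocated umtxq_queue that is donated to the
 * chain's spare list when the waiter joins an existing queue and
 * taken back when the waiter leaves, so no allocation is needed while
 * the chain lock is held.
 */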
599static inline void
600umtxq_insert_queue(struct umtx_q *uq, int q)
601{
602	struct umtxq_queue *uh;
603	struct umtxq_chain *uc;
604
605	uc = umtxq_getchain(&uq->uq_key);
606	UMTXQ_LOCKED_ASSERT(uc);
607	KASSERT((uq->uq_flags & UQF_UMTXQ) == 0, ("umtx_q is already on queue"));
608	uh = umtxq_queue_lookup(&uq->uq_key, q);
609	if (uh != NULL) {
610		LIST_INSERT_HEAD(&uc->uc_spare_queue, uq->uq_spare_queue, link);
611	} else {
612		uh = uq->uq_spare_queue;
613		uh->key = uq->uq_key;
614		LIST_INSERT_HEAD(&uc->uc_queue[q], uh, link);
615#ifdef UMTX_PROFILING
616		uc->length++;
617		if (uc->length > uc->max_length) {
618			uc->max_length = uc->length;
619			if (uc->max_length > max_length)
620				max_length = uc->max_length;
621		}
622#endif
623	}
624	uq->uq_spare_queue = NULL;
625
626	TAILQ_INSERT_TAIL(&uh->head, uq, uq_link);
627	uh->length++;
628	uq->uq_flags |= UQF_UMTXQ;
629	uq->uq_cur_queue = uh;
630	return;
631}
632
633static inline void
634umtxq_remove_queue(struct umtx_q *uq, int q)
635{
636	struct umtxq_chain *uc;
637	struct umtxq_queue *uh;
638
639	uc = umtxq_getchain(&uq->uq_key);
640	UMTXQ_LOCKED_ASSERT(uc);
641	if (uq->uq_flags & UQF_UMTXQ) {
642		uh = uq->uq_cur_queue;
643		TAILQ_REMOVE(&uh->head, uq, uq_link);
644		uh->length--;
645		uq->uq_flags &= ~UQF_UMTXQ;
646		if (TAILQ_EMPTY(&uh->head)) {
647			KASSERT(uh->length == 0,
648			    ("inconsistent umtxq_queue length"));
649#ifdef UMTX_PROFILING
650			uc->length--;
651#endif
652			LIST_REMOVE(uh, link);
653		} else {
654			uh = LIST_FIRST(&uc->uc_spare_queue);
655			KASSERT(uh != NULL, ("uc_spare_queue is empty"));
656			LIST_REMOVE(uh, link);
657		}
658		uq->uq_spare_queue = uh;
659		uq->uq_cur_queue = NULL;
660	}
661}
662
/*
 * Return the number of waiters for a key.
 */
666static int
667umtxq_count(struct umtx_key *key)
668{
669	struct umtxq_queue *uh;
670
671	UMTXQ_LOCKED_ASSERT(umtxq_getchain(key));
672	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
673	if (uh != NULL)
674		return (uh->length);
675	return (0);
676}
677
/*
 * Return the number of PI waiters and the first waiter.
 */
682static int
683umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
684{
685	struct umtxq_queue *uh;
686
687	*first = NULL;
688	UMTXQ_LOCKED_ASSERT(umtxq_getchain(key));
689	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
690	if (uh != NULL) {
691		*first = TAILQ_FIRST(&uh->head);
692		return (uh->length);
693	}
694	return (0);
695}
696
697/*
 * Wake up threads waiting on a userland object.
699 */
700
701static int
702umtxq_signal_queue(struct umtx_key *key, int n_wake, int q)
703{
704	struct umtxq_queue *uh;
705	struct umtx_q *uq;
706	int ret;
707
708	ret = 0;
709	UMTXQ_LOCKED_ASSERT(umtxq_getchain(key));
710	uh = umtxq_queue_lookup(key, q);
711	if (uh != NULL) {
712		while ((uq = TAILQ_FIRST(&uh->head)) != NULL) {
713			umtxq_remove_queue(uq, q);
714			wakeup(uq);
715			if (++ret >= n_wake)
716				return (ret);
717		}
718	}
719	return (ret);
720}
721
722/*
723 * Wake up specified thread.
724 */
725static inline void
726umtxq_signal_thread(struct umtx_q *uq)
727{
728
729	UMTXQ_LOCKED_ASSERT(umtxq_getchain(&uq->uq_key));
730	umtxq_remove(uq);
731	wakeup(uq);
732}
733
734static inline int
735tstohz(const struct timespec *tsp)
736{
737	struct timeval tv;
738
739	TIMESPEC_TO_TIMEVAL(&tv, tsp);
740	return tvtohz(&tv);
741}
742
743static void
744abs_timeout_init(struct abs_timeout *timo, int clockid, int absolute,
745	const struct timespec *timeout)
746{
747
748	timo->clockid = clockid;
749	if (!absolute) {
750		timo->is_abs_real = false;
751		abs_timeout_update(timo);
752		timespecadd(&timo->cur, timeout, &timo->end);
753	} else {
754		timo->end = *timeout;
755		timo->is_abs_real = clockid == CLOCK_REALTIME ||
756		    clockid == CLOCK_REALTIME_FAST ||
757		    clockid == CLOCK_REALTIME_PRECISE;
758		/*
759		 * If is_abs_real, umtxq_sleep will read the clock
760		 * after setting td_rtcgen; otherwise, read it here.
761		 */
762		if (!timo->is_abs_real) {
763			abs_timeout_update(timo);
764		}
765	}
766}
767
768static void
769abs_timeout_init2(struct abs_timeout *timo, const struct _umtx_time *umtxtime)
770{
771
772	abs_timeout_init(timo, umtxtime->_clockid,
773	    (umtxtime->_flags & UMTX_ABSTIME) != 0, &umtxtime->_timeout);
774}
775
776static inline void
777abs_timeout_update(struct abs_timeout *timo)
778{
779
780	kern_clock_gettime(curthread, timo->clockid, &timo->cur);
781}
782
783static int
784abs_timeout_gethz(struct abs_timeout *timo)
785{
786	struct timespec tts;
787
788	if (timespeccmp(&timo->end, &timo->cur, <=))
789		return (-1);
790	timespecsub(&timo->end, &timo->cur, &tts);
791	return (tstohz(&tts));
792}
793
794static uint32_t
795umtx_unlock_val(uint32_t flags, bool rb)
796{
797
798	if (rb)
799		return (UMUTEX_RB_OWNERDEAD);
800	else if ((flags & UMUTEX_NONCONSISTENT) != 0)
801		return (UMUTEX_RB_NOTRECOV);
802	else
803		return (UMUTEX_UNOWNED);
804
805}
806
/*
 * Put the thread into a sleep state.  Before sleeping, check whether
 * the thread was removed from the umtx queue.
 */
811static inline int
812umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *abstime)
813{
814	struct umtxq_chain *uc;
815	int error, timo;
816
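	/*
	 * For absolute CLOCK_REALTIME* timeouts, latch the RTC generation
	 * before reading the clock, so that a step of the real-time clock
	 * can wake the sleep and the timeout is re-evaluated.
	 */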
817	if (abstime != NULL && abstime->is_abs_real) {
818		curthread->td_rtcgen = atomic_load_acq_int(&rtc_generation);
819		abs_timeout_update(abstime);
820	}
821
822	uc = umtxq_getchain(&uq->uq_key);
823	UMTXQ_LOCKED_ASSERT(uc);
824	for (;;) {
825		if (!(uq->uq_flags & UQF_UMTXQ)) {
826			error = 0;
827			break;
828		}
829		if (abstime != NULL) {
830			timo = abs_timeout_gethz(abstime);
831			if (timo < 0) {
832				error = ETIMEDOUT;
833				break;
834			}
835		} else
836			timo = 0;
837		error = msleep(uq, &uc->uc_lock, PCATCH | PDROP, wmesg, timo);
838		if (error == EINTR || error == ERESTART) {
839			umtxq_lock(&uq->uq_key);
840			break;
841		}
842		if (abstime != NULL) {
843			if (abstime->is_abs_real)
844				curthread->td_rtcgen =
845				    atomic_load_acq_int(&rtc_generation);
846			abs_timeout_update(abstime);
847		}
848		umtxq_lock(&uq->uq_key);
849	}
850
851	curthread->td_rtcgen = 0;
852	return (error);
853}
854
855/*
856 * Convert userspace address into unique logical address.
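 * For process-shared objects the key is the backing VM object plus the
 * offset within it; for process-private objects it is the process's
 * vmspace plus the virtual address.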
857 */
858int
859umtx_key_get(const void *addr, int type, int share, struct umtx_key *key)
860{
861	struct thread *td = curthread;
862	vm_map_t map;
863	vm_map_entry_t entry;
864	vm_pindex_t pindex;
865	vm_prot_t prot;
866	boolean_t wired;
867
868	key->type = type;
869	if (share == THREAD_SHARE) {
870		key->shared = 0;
871		key->info.private.vs = td->td_proc->p_vmspace;
872		key->info.private.addr = (uintptr_t)addr;
873	} else {
874		MPASS(share == PROCESS_SHARE || share == AUTO_SHARE);
875		map = &td->td_proc->p_vmspace->vm_map;
876		if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
877		    &entry, &key->info.shared.object, &pindex, &prot,
878		    &wired) != KERN_SUCCESS) {
879			return (EFAULT);
880		}
881
882		if ((share == PROCESS_SHARE) ||
883		    (share == AUTO_SHARE &&
884		     VM_INHERIT_SHARE == entry->inheritance)) {
885			key->shared = 1;
886			key->info.shared.offset = (vm_offset_t)addr -
887			    entry->start + entry->offset;
888			vm_object_reference(key->info.shared.object);
889		} else {
890			key->shared = 0;
891			key->info.private.vs = td->td_proc->p_vmspace;
892			key->info.private.addr = (uintptr_t)addr;
893		}
894		vm_map_lookup_done(map, entry);
895	}
896
897	umtxq_hash(key);
898	return (0);
899}
900
901/*
902 * Release key.
903 */
904void
905umtx_key_release(struct umtx_key *key)
906{
907	if (key->shared)
908		vm_object_deallocate(key->info.shared.object);
909}
910
911/*
912 * Fetch and compare value, sleep on the address if value is not changed.
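 * The waiter is inserted on the queue before the value is read, so a
 * wakeup that follows the value change cannot be missed.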
913 */
914static int
915do_wait(struct thread *td, void *addr, u_long id,
916    struct _umtx_time *timeout, int compat32, int is_private)
917{
918	struct abs_timeout timo;
919	struct umtx_q *uq;
920	u_long tmp;
921	uint32_t tmp32;
922	int error = 0;
923
924	uq = td->td_umtxq;
925	if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT,
926		is_private ? THREAD_SHARE : AUTO_SHARE, &uq->uq_key)) != 0)
927		return (error);
928
929	if (timeout != NULL)
930		abs_timeout_init2(&timo, timeout);
931
932	umtxq_lock(&uq->uq_key);
933	umtxq_insert(uq);
934	umtxq_unlock(&uq->uq_key);
935	if (compat32 == 0) {
936		error = fueword(addr, &tmp);
937		if (error != 0)
938			error = EFAULT;
939	} else {
940		error = fueword32(addr, &tmp32);
941		if (error == 0)
942			tmp = tmp32;
943		else
944			error = EFAULT;
945	}
946	umtxq_lock(&uq->uq_key);
947	if (error == 0) {
948		if (tmp == id)
949			error = umtxq_sleep(uq, "uwait", timeout == NULL ?
950			    NULL : &timo);
951		if ((uq->uq_flags & UQF_UMTXQ) == 0)
952			error = 0;
953		else
954			umtxq_remove(uq);
955	} else if ((uq->uq_flags & UQF_UMTXQ) != 0) {
956		umtxq_remove(uq);
957	}
958	umtxq_unlock(&uq->uq_key);
959	umtx_key_release(&uq->uq_key);
960	if (error == ERESTART)
961		error = EINTR;
962	return (error);
963}
964
965/*
966 * Wake up threads sleeping on the specified address.
967 */
968int
969kern_umtx_wake(struct thread *td, void *uaddr, int n_wake, int is_private)
970{
971	struct umtx_key key;
972	int ret;
973
974	if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT,
975	    is_private ? THREAD_SHARE : AUTO_SHARE, &key)) != 0)
976		return (ret);
977	umtxq_lock(&key);
978	umtxq_signal(&key, n_wake);
979	umtxq_unlock(&key);
980	umtx_key_release(&key);
981	return (0);
982}
983
984/*
985 * Lock PTHREAD_PRIO_NONE protocol POSIX mutex.
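 *
 * The expected userland fast path (a sketch, not the actual libthr
 * code) is a single CAS from UMUTEX_UNOWNED to the thread id; only
 * when that fails, or the contested bit has to be handled, does
 * userland enter the kernel through this function.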
986 */
987static int
988do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags,
989    struct _umtx_time *timeout, int mode)
990{
991	struct abs_timeout timo;
992	struct umtx_q *uq;
993	uint32_t owner, old, id;
994	int error, rv;
995
996	id = td->td_tid;
997	uq = td->td_umtxq;
998	error = 0;
999	if (timeout != NULL)
1000		abs_timeout_init2(&timo, timeout);
1001
1002	/*
1003	 * Care must be exercised when dealing with umtx structure. It
1004	 * can fault on any access.
1005	 */
1006	for (;;) {
1007		rv = fueword32(&m->m_owner, &owner);
1008		if (rv == -1)
1009			return (EFAULT);
1010		if (mode == _UMUTEX_WAIT) {
1011			if (owner == UMUTEX_UNOWNED ||
1012			    owner == UMUTEX_CONTESTED ||
1013			    owner == UMUTEX_RB_OWNERDEAD ||
1014			    owner == UMUTEX_RB_NOTRECOV)
1015				return (0);
1016		} else {
			/*
			 * Robust mutex terminated.  The kernel's duty is to
			 * return EOWNERDEAD to userspace.  The
			 * umutex.m_flags UMUTEX_NONCONSISTENT flag is set
			 * by the common userspace code.
			 */
1023			if (owner == UMUTEX_RB_OWNERDEAD) {
1024				rv = casueword32(&m->m_owner,
1025				    UMUTEX_RB_OWNERDEAD, &owner,
1026				    id | UMUTEX_CONTESTED);
1027				if (rv == -1)
1028					return (EFAULT);
1029				if (rv == 0) {
1030					MPASS(owner == UMUTEX_RB_OWNERDEAD);
1031					return (EOWNERDEAD); /* success */
1032				}
1033				MPASS(rv == 1);
1034				rv = thread_check_susp(td, false);
1035				if (rv != 0)
1036					return (rv);
1037				continue;
1038			}
1039			if (owner == UMUTEX_RB_NOTRECOV)
1040				return (ENOTRECOVERABLE);
1041
1042			/*
1043			 * Try the uncontested case.  This should be
1044			 * done in userland.
1045			 */
1046			rv = casueword32(&m->m_owner, UMUTEX_UNOWNED,
1047			    &owner, id);
1048			/* The address was invalid. */
1049			if (rv == -1)
1050				return (EFAULT);
1051
1052			/* The acquire succeeded. */
1053			if (rv == 0) {
1054				MPASS(owner == UMUTEX_UNOWNED);
1055				return (0);
1056			}
1057
1058			/*
			 * If no one owns it but it is contested, try
1060			 * to acquire it.
1061			 */
1062			MPASS(rv == 1);
1063			if (owner == UMUTEX_CONTESTED) {
1064				rv = casueword32(&m->m_owner,
1065				    UMUTEX_CONTESTED, &owner,
1066				    id | UMUTEX_CONTESTED);
1067				/* The address was invalid. */
1068				if (rv == -1)
1069					return (EFAULT);
1070				if (rv == 0) {
1071					MPASS(owner == UMUTEX_CONTESTED);
1072					return (0);
1073				}
1074				if (rv == 1) {
1075					rv = thread_check_susp(td, false);
1076					if (rv != 0)
1077						return (rv);
1078				}
1079
1080				/*
				 * If this failed, the lock has
				 * changed; restart.
1083				 */
1084				continue;
1085			}
1086
1087			/* rv == 1 but not contested, likely store failure */
1088			rv = thread_check_susp(td, false);
1089			if (rv != 0)
1090				return (rv);
1091		}
1092
1093		if (mode == _UMUTEX_TRY)
1094			return (EBUSY);
1095
1096		/*
1097		 * If we caught a signal, we have retried and now
1098		 * exit immediately.
1099		 */
1100		if (error != 0)
1101			return (error);
1102
1103		if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
1104		    GET_SHARE(flags), &uq->uq_key)) != 0)
1105			return (error);
1106
1107		umtxq_lock(&uq->uq_key);
1108		umtxq_busy(&uq->uq_key);
1109		umtxq_insert(uq);
1110		umtxq_unlock(&uq->uq_key);
1111
1112		/*
1113		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails,
		 * either someone else has acquired the lock or it has been
		 * released.
1117		 */
1118		rv = casueword32(&m->m_owner, owner, &old,
1119		    owner | UMUTEX_CONTESTED);
1120
1121		/* The address was invalid or casueword failed to store. */
1122		if (rv == -1 || rv == 1) {
1123			umtxq_lock(&uq->uq_key);
1124			umtxq_remove(uq);
1125			umtxq_unbusy(&uq->uq_key);
1126			umtxq_unlock(&uq->uq_key);
1127			umtx_key_release(&uq->uq_key);
1128			if (rv == -1)
1129				return (EFAULT);
1130			if (rv == 1) {
1131				rv = thread_check_susp(td, false);
1132				if (rv != 0)
1133					return (rv);
1134			}
1135			continue;
1136		}
1137
1138		/*
		 * We set the contested bit, so sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race to the
		 * thread unlocking the umtx.
1142		 */
1143		umtxq_lock(&uq->uq_key);
1144		umtxq_unbusy(&uq->uq_key);
1145		MPASS(old == owner);
1146		error = umtxq_sleep(uq, "umtxn", timeout == NULL ?
1147		    NULL : &timo);
1148		umtxq_remove(uq);
1149		umtxq_unlock(&uq->uq_key);
1150		umtx_key_release(&uq->uq_key);
1151
1152		if (error == 0)
1153			error = thread_check_susp(td, false);
1154	}
1155
1156	return (0);
1157}
1158
1159/*
1160 * Unlock PTHREAD_PRIO_NONE protocol POSIX mutex.
1161 */
1162static int
1163do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags, bool rb)
1164{
1165	struct umtx_key key;
1166	uint32_t owner, old, id, newlock;
1167	int error, count;
1168
1169	id = td->td_tid;
1170
1171again:
1172	/*
1173	 * Make sure we own this mtx.
1174	 */
1175	error = fueword32(&m->m_owner, &owner);
1176	if (error == -1)
1177		return (EFAULT);
1178
1179	if ((owner & ~UMUTEX_CONTESTED) != id)
1180		return (EPERM);
1181
1182	newlock = umtx_unlock_val(flags, rb);
1183	if ((owner & UMUTEX_CONTESTED) == 0) {
1184		error = casueword32(&m->m_owner, owner, &old, newlock);
1185		if (error == -1)
1186			return (EFAULT);
1187		if (error == 1) {
1188			error = thread_check_susp(td, false);
1189			if (error != 0)
1190				return (error);
1191			goto again;
1192		}
1193		MPASS(old == owner);
1194		return (0);
1195	}
1196
1197	/* We should only ever be in here for contested locks */
1198	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
1199	    &key)) != 0)
1200		return (error);
1201
1202	umtxq_lock(&key);
1203	umtxq_busy(&key);
1204	count = umtxq_count(&key);
1205	umtxq_unlock(&key);
1206
1207	/*
1208	 * When unlocking the umtx, it must be marked as unowned if
1209	 * there is zero or one thread only waiting for it.
1210	 * Otherwise, it must be marked as contested.
1211	 */
1212	if (count > 1)
1213		newlock |= UMUTEX_CONTESTED;
1214	error = casueword32(&m->m_owner, owner, &old, newlock);
1215	umtxq_lock(&key);
1216	umtxq_signal(&key, 1);
1217	umtxq_unbusy(&key);
1218	umtxq_unlock(&key);
1219	umtx_key_release(&key);
1220	if (error == -1)
1221		return (EFAULT);
1222	if (error == 1) {
1223		if (old != owner)
1224			return (EINVAL);
1225		error = thread_check_susp(td, false);
1226		if (error != 0)
1227			return (error);
1228		goto again;
1229	}
1230	return (0);
1231}
1232
1233/*
 * Check if the mutex is available and wake up a waiter;
 * this is only for simple (non-PI, non-PP) mutexes.
1236 */
1237static int
1238do_wake_umutex(struct thread *td, struct umutex *m)
1239{
1240	struct umtx_key key;
1241	uint32_t owner;
1242	uint32_t flags;
1243	int error;
1244	int count;
1245
1246again:
1247	error = fueword32(&m->m_owner, &owner);
1248	if (error == -1)
1249		return (EFAULT);
1250
1251	if ((owner & ~UMUTEX_CONTESTED) != 0 && owner != UMUTEX_RB_OWNERDEAD &&
1252	    owner != UMUTEX_RB_NOTRECOV)
1253		return (0);
1254
1255	error = fueword32(&m->m_flags, &flags);
1256	if (error == -1)
1257		return (EFAULT);
1258
1259	/* We should only ever be in here for contested locks */
1260	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
1261	    &key)) != 0)
1262		return (error);
1263
1264	umtxq_lock(&key);
1265	umtxq_busy(&key);
1266	count = umtxq_count(&key);
1267	umtxq_unlock(&key);
1268
1269	if (count <= 1 && owner != UMUTEX_RB_OWNERDEAD &&
1270	    owner != UMUTEX_RB_NOTRECOV) {
1271		error = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner,
1272		    UMUTEX_UNOWNED);
1273		if (error == -1) {
1274			error = EFAULT;
1275		} else if (error == 1) {
1276			umtxq_lock(&key);
1277			umtxq_unbusy(&key);
1278			umtxq_unlock(&key);
1279			umtx_key_release(&key);
1280			error = thread_check_susp(td, false);
1281			if (error != 0)
1282				return (error);
1283			goto again;
1284		}
1285	}
1286
1287	umtxq_lock(&key);
1288	if (error == 0 && count != 0) {
1289		MPASS((owner & ~UMUTEX_CONTESTED) == 0 ||
1290		    owner == UMUTEX_RB_OWNERDEAD ||
1291		    owner == UMUTEX_RB_NOTRECOV);
1292		umtxq_signal(&key, 1);
1293	}
1294	umtxq_unbusy(&key);
1295	umtxq_unlock(&key);
1296	umtx_key_release(&key);
1297	return (error);
1298}
1299
1300/*
 * Check if the mutex has waiters and try to fix the contention bit.
1302 */
1303static int
1304do_wake2_umutex(struct thread *td, struct umutex *m, uint32_t flags)
1305{
1306	struct umtx_key key;
1307	uint32_t owner, old;
1308	int type;
1309	int error;
1310	int count;
1311
1312	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT |
1313	    UMUTEX_ROBUST)) {
1314	case 0:
1315	case UMUTEX_ROBUST:
1316		type = TYPE_NORMAL_UMUTEX;
1317		break;
1318	case UMUTEX_PRIO_INHERIT:
1319		type = TYPE_PI_UMUTEX;
1320		break;
1321	case (UMUTEX_PRIO_INHERIT | UMUTEX_ROBUST):
1322		type = TYPE_PI_ROBUST_UMUTEX;
1323		break;
1324	case UMUTEX_PRIO_PROTECT:
1325		type = TYPE_PP_UMUTEX;
1326		break;
1327	case (UMUTEX_PRIO_PROTECT | UMUTEX_ROBUST):
1328		type = TYPE_PP_ROBUST_UMUTEX;
1329		break;
1330	default:
1331		return (EINVAL);
1332	}
1333	if ((error = umtx_key_get(m, type, GET_SHARE(flags), &key)) != 0)
1334		return (error);
1335
1336	owner = 0;
1337	umtxq_lock(&key);
1338	umtxq_busy(&key);
1339	count = umtxq_count(&key);
1340	umtxq_unlock(&key);
1341
1342	error = fueword32(&m->m_owner, &owner);
1343	if (error == -1)
1344		error = EFAULT;
1345
1346	/*
	 * Only repair the contention bit if there is a waiter; this means
	 * the mutex is still being referenced by userland code.
	 * Otherwise, don't update any memory.
1350	 */
1351	while (error == 0 && (owner & UMUTEX_CONTESTED) == 0 &&
1352	    (count > 1 || (count == 1 && (owner & ~UMUTEX_CONTESTED) != 0))) {
1353		error = casueword32(&m->m_owner, owner, &old,
1354		    owner | UMUTEX_CONTESTED);
1355		if (error == -1) {
1356			error = EFAULT;
1357			break;
1358		}
1359		if (error == 0) {
1360			MPASS(old == owner);
1361			break;
1362		}
1363		owner = old;
1364		error = thread_check_susp(td, false);
1365	}
1366
1367	umtxq_lock(&key);
1368	if (error == EFAULT) {
1369		umtxq_signal(&key, INT_MAX);
1370	} else if (count != 0 && ((owner & ~UMUTEX_CONTESTED) == 0 ||
1371	    owner == UMUTEX_RB_OWNERDEAD || owner == UMUTEX_RB_NOTRECOV))
1372		umtxq_signal(&key, 1);
1373	umtxq_unbusy(&key);
1374	umtxq_unlock(&key);
1375	umtx_key_release(&key);
1376	return (error);
1377}
1378
1379static inline struct umtx_pi *
1380umtx_pi_alloc(int flags)
1381{
1382	struct umtx_pi *pi;
1383
1384	pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags);
1385	TAILQ_INIT(&pi->pi_blocked);
1386	atomic_add_int(&umtx_pi_allocated, 1);
1387	return (pi);
1388}
1389
1390static inline void
1391umtx_pi_free(struct umtx_pi *pi)
1392{
1393	uma_zfree(umtx_pi_zone, pi);
1394	atomic_add_int(&umtx_pi_allocated, -1);
1395}
1396
1397/*
1398 * Adjust the thread's position on a pi_state after its priority has been
1399 * changed.
1400 */
1401static int
1402umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
1403{
1404	struct umtx_q *uq, *uq1, *uq2;
1405	struct thread *td1;
1406
1407	mtx_assert(&umtx_lock, MA_OWNED);
1408	if (pi == NULL)
1409		return (0);
1410
1411	uq = td->td_umtxq;
1412
1413	/*
1414	 * Check if the thread needs to be moved on the blocked chain.
1415	 * It needs to be moved if either its priority is lower than
1416	 * the previous thread or higher than the next thread.
1417	 */
1418	uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
1419	uq2 = TAILQ_NEXT(uq, uq_lockq);
1420	if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
1421	    (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
1422		/*
1423		 * Remove thread from blocked chain and determine where
1424		 * it should be moved to.
1425		 */
1426		TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1427		TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1428			td1 = uq1->uq_thread;
1429			MPASS(td1->td_proc->p_magic == P_MAGIC);
1430			if (UPRI(td1) > UPRI(td))
1431				break;
1432		}
1433
1434		if (uq1 == NULL)
1435			TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1436		else
1437			TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1438	}
1439	return (1);
1440}
1441
1442static struct umtx_pi *
1443umtx_pi_next(struct umtx_pi *pi)
1444{
1445	struct umtx_q *uq_owner;
1446
1447	if (pi->pi_owner == NULL)
1448		return (NULL);
1449	uq_owner = pi->pi_owner->td_umtxq;
1450	if (uq_owner == NULL)
1451		return (NULL);
1452	return (uq_owner->uq_pi_blocked);
1453}
1454
1455/*
1456 * Floyd's Cycle-Finding Algorithm.
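 * Walk the owner -> blocked-on-mutex chain with a slow and a fast
 * pointer; if the two ever meet, the chain contains a cycle (a
 * deadlock among PI mutex owners) and propagation must stop.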
1457 */
1458static bool
1459umtx_pi_check_loop(struct umtx_pi *pi)
1460{
1461	struct umtx_pi *pi1;	/* fast iterator */
1462
1463	mtx_assert(&umtx_lock, MA_OWNED);
1464	if (pi == NULL)
1465		return (false);
1466	pi1 = pi;
1467	for (;;) {
1468		pi = umtx_pi_next(pi);
1469		if (pi == NULL)
1470			break;
1471		pi1 = umtx_pi_next(pi1);
1472		if (pi1 == NULL)
1473			break;
1474		pi1 = umtx_pi_next(pi1);
1475		if (pi1 == NULL)
1476			break;
1477		if (pi == pi1)
1478			return (true);
1479	}
1480	return (false);
1481}
1482
1483/*
 * Propagate priority when a thread is blocked on a POSIX
 * PI mutex.
1486 */
1487static void
1488umtx_propagate_priority(struct thread *td)
1489{
1490	struct umtx_q *uq;
1491	struct umtx_pi *pi;
1492	int pri;
1493
1494	mtx_assert(&umtx_lock, MA_OWNED);
1495	pri = UPRI(td);
1496	uq = td->td_umtxq;
1497	pi = uq->uq_pi_blocked;
1498	if (pi == NULL)
1499		return;
1500	if (umtx_pi_check_loop(pi))
1501		return;
1502
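	/*
	 * Walk up the chain of PI mutex owners, lending our priority to
	 * each owner whose effective priority is lower (numerically
	 * greater) than ours.
	 */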
1503	for (;;) {
1504		td = pi->pi_owner;
1505		if (td == NULL || td == curthread)
1506			return;
1507
1508		MPASS(td->td_proc != NULL);
1509		MPASS(td->td_proc->p_magic == P_MAGIC);
1510
1511		thread_lock(td);
1512		if (td->td_lend_user_pri > pri)
1513			sched_lend_user_prio(td, pri);
1514		else {
1515			thread_unlock(td);
1516			break;
1517		}
1518		thread_unlock(td);
1519
1520		/*
1521		 * Pick up the lock that td is blocked on.
1522		 */
1523		uq = td->td_umtxq;
1524		pi = uq->uq_pi_blocked;
1525		if (pi == NULL)
1526			break;
1527		/* Resort td on the list if needed. */
1528		umtx_pi_adjust_thread(pi, td);
1529	}
1530}
1531
1532/*
1533 * Unpropagate priority for a PI mutex when a thread blocked on
 * it is interrupted by a signal or resumed by others.
1535 */
1536static void
1537umtx_repropagate_priority(struct umtx_pi *pi)
1538{
1539	struct umtx_q *uq, *uq_owner;
1540	struct umtx_pi *pi2;
1541	int pri;
1542
1543	mtx_assert(&umtx_lock, MA_OWNED);
1544
1545	if (umtx_pi_check_loop(pi))
1546		return;
1547	while (pi != NULL && pi->pi_owner != NULL) {
1548		pri = PRI_MAX;
1549		uq_owner = pi->pi_owner->td_umtxq;
1550
1551		TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
1552			uq = TAILQ_FIRST(&pi2->pi_blocked);
1553			if (uq != NULL) {
1554				if (pri > UPRI(uq->uq_thread))
1555					pri = UPRI(uq->uq_thread);
1556			}
1557		}
1558
1559		if (pri > uq_owner->uq_inherited_pri)
1560			pri = uq_owner->uq_inherited_pri;
1561		thread_lock(pi->pi_owner);
1562		sched_lend_user_prio(pi->pi_owner, pri);
1563		thread_unlock(pi->pi_owner);
1564		if ((pi = uq_owner->uq_pi_blocked) != NULL)
1565			umtx_pi_adjust_thread(pi, uq_owner->uq_thread);
1566	}
1567}
1568
1569/*
1570 * Insert a PI mutex into owned list.
1571 */
1572static void
1573umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
1574{
1575	struct umtx_q *uq_owner;
1576
1577	uq_owner = owner->td_umtxq;
1578	mtx_assert(&umtx_lock, MA_OWNED);
1579	MPASS(pi->pi_owner == NULL);
1580	pi->pi_owner = owner;
1581	TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
1582}
1583
1584/*
1585 * Disown a PI mutex, and remove it from the owned list.
1586 */
1587static void
1588umtx_pi_disown(struct umtx_pi *pi)
1589{
1590
1591	mtx_assert(&umtx_lock, MA_OWNED);
1592	TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested, pi, pi_link);
1593	pi->pi_owner = NULL;
1594}
1595
1596/*
1597 * Claim ownership of a PI mutex.
1598 */
1599static int
1600umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
1601{
1602	struct umtx_q *uq;
1603	int pri;
1604
1605	mtx_lock(&umtx_lock);
1606	if (pi->pi_owner == owner) {
1607		mtx_unlock(&umtx_lock);
1608		return (0);
1609	}
1610
1611	if (pi->pi_owner != NULL) {
1612		/*
1613		 * userland may have already messed the mutex, sigh.
1614		 */
1615		mtx_unlock(&umtx_lock);
1616		return (EPERM);
1617	}
1618	umtx_pi_setowner(pi, owner);
1619	uq = TAILQ_FIRST(&pi->pi_blocked);
1620	if (uq != NULL) {
1621		pri = UPRI(uq->uq_thread);
1622		thread_lock(owner);
1623		if (pri < UPRI(owner))
1624			sched_lend_user_prio(owner, pri);
1625		thread_unlock(owner);
1626	}
1627	mtx_unlock(&umtx_lock);
1628	return (0);
1629}
1630
1631/*
 * Adjust a thread's position on the blocked list of the PI mutex it
 * is blocked on; this may trigger a new round of priority propagation.
1634 */
1635void
1636umtx_pi_adjust(struct thread *td, u_char oldpri)
1637{
1638	struct umtx_q *uq;
1639	struct umtx_pi *pi;
1640
1641	uq = td->td_umtxq;
1642	mtx_lock(&umtx_lock);
1643	/*
1644	 * Pick up the lock that td is blocked on.
1645	 */
1646	pi = uq->uq_pi_blocked;
1647	if (pi != NULL) {
1648		umtx_pi_adjust_thread(pi, td);
1649		umtx_repropagate_priority(pi);
1650	}
1651	mtx_unlock(&umtx_lock);
1652}
1653
1654/*
1655 * Sleep on a PI mutex.
1656 */
1657static int
1658umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi, uint32_t owner,
1659    const char *wmesg, struct abs_timeout *timo, bool shared)
1660{
1661	struct thread *td, *td1;
1662	struct umtx_q *uq1;
1663	int error, pri;
1664#ifdef INVARIANTS
1665	struct umtxq_chain *uc;
1666
1667	uc = umtxq_getchain(&pi->pi_key);
1668#endif
1669	error = 0;
1670	td = uq->uq_thread;
1671	KASSERT(td == curthread, ("inconsistent uq_thread"));
1672	UMTXQ_LOCKED_ASSERT(umtxq_getchain(&uq->uq_key));
1673	KASSERT(uc->uc_busy != 0, ("umtx chain is not busy"));
1674	umtxq_insert(uq);
1675	mtx_lock(&umtx_lock);
1676	if (pi->pi_owner == NULL) {
1677		mtx_unlock(&umtx_lock);
1678		td1 = tdfind(owner, shared ? -1 : td->td_proc->p_pid);
1679		mtx_lock(&umtx_lock);
1680		if (td1 != NULL) {
1681			if (pi->pi_owner == NULL)
1682				umtx_pi_setowner(pi, td1);
1683			PROC_UNLOCK(td1->td_proc);
1684		}
1685	}
1686
1687	TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1688		pri = UPRI(uq1->uq_thread);
1689		if (pri > UPRI(td))
1690			break;
1691	}
1692
1693	if (uq1 != NULL)
1694		TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1695	else
1696		TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1697
1698	uq->uq_pi_blocked = pi;
1699	thread_lock(td);
1700	td->td_flags |= TDF_UPIBLOCKED;
1701	thread_unlock(td);
1702	umtx_propagate_priority(td);
1703	mtx_unlock(&umtx_lock);
1704	umtxq_unbusy(&uq->uq_key);
1705
1706	error = umtxq_sleep(uq, wmesg, timo);
1707	umtxq_remove(uq);
1708
1709	mtx_lock(&umtx_lock);
1710	uq->uq_pi_blocked = NULL;
1711	thread_lock(td);
1712	td->td_flags &= ~TDF_UPIBLOCKED;
1713	thread_unlock(td);
1714	TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1715	umtx_repropagate_priority(pi);
1716	mtx_unlock(&umtx_lock);
1717	umtxq_unlock(&uq->uq_key);
1718
1719	return (error);
1720}
1721
1722/*
 * Increment the reference count of a PI mutex.
1724 */
1725static void
1726umtx_pi_ref(struct umtx_pi *pi)
1727{
1728
1729	UMTXQ_LOCKED_ASSERT(umtxq_getchain(&pi->pi_key));
1730	pi->pi_refcount++;
1731}
1732
1733/*
 * Decrement the reference count of a PI mutex; if the count
 * drops to zero, its memory is freed.
1736 */
1737static void
1738umtx_pi_unref(struct umtx_pi *pi)
1739{
1740	struct umtxq_chain *uc;
1741
1742	uc = umtxq_getchain(&pi->pi_key);
1743	UMTXQ_LOCKED_ASSERT(uc);
1744	KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
1745	if (--pi->pi_refcount == 0) {
1746		mtx_lock(&umtx_lock);
1747		if (pi->pi_owner != NULL)
1748			umtx_pi_disown(pi);
1749		KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
1750			("blocked queue not empty"));
1751		mtx_unlock(&umtx_lock);
1752		TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
1753		umtx_pi_free(pi);
1754	}
1755}
1756
1757/*
1758 * Find a PI mutex in hash table.
1759 */
1760static struct umtx_pi *
1761umtx_pi_lookup(struct umtx_key *key)
1762{
1763	struct umtxq_chain *uc;
1764	struct umtx_pi *pi;
1765
1766	uc = umtxq_getchain(key);
1767	UMTXQ_LOCKED_ASSERT(uc);
1768
1769	TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
1770		if (umtx_key_match(&pi->pi_key, key)) {
1771			return (pi);
1772		}
1773	}
1774	return (NULL);
1775}
1776
1777/*
1778 * Insert a PI mutex into hash table.
1779 */
1780static inline void
1781umtx_pi_insert(struct umtx_pi *pi)
1782{
1783	struct umtxq_chain *uc;
1784
1785	uc = umtxq_getchain(&pi->pi_key);
1786	UMTXQ_LOCKED_ASSERT(uc);
1787	TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
1788}
1789
1790/*
1791 * Lock a PI mutex.
1792 */
1793static int
1794do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags,
1795    struct _umtx_time *timeout, int try)
1796{
1797	struct abs_timeout timo;
1798	struct umtx_q *uq;
1799	struct umtx_pi *pi, *new_pi;
1800	uint32_t id, old_owner, owner, old;
1801	int error, rv;
1802
1803	id = td->td_tid;
1804	uq = td->td_umtxq;
1805
1806	if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ?
1807	    TYPE_PI_ROBUST_UMUTEX : TYPE_PI_UMUTEX, GET_SHARE(flags),
1808	    &uq->uq_key)) != 0)
1809		return (error);
1810
1811	if (timeout != NULL)
1812		abs_timeout_init2(&timo, timeout);
1813
1814	umtxq_lock(&uq->uq_key);
1815	pi = umtx_pi_lookup(&uq->uq_key);
1816	if (pi == NULL) {
1817		new_pi = umtx_pi_alloc(M_NOWAIT);
1818		if (new_pi == NULL) {
1819			umtxq_unlock(&uq->uq_key);
1820			new_pi = umtx_pi_alloc(M_WAITOK);
1821			umtxq_lock(&uq->uq_key);
1822			pi = umtx_pi_lookup(&uq->uq_key);
1823			if (pi != NULL) {
1824				umtx_pi_free(new_pi);
1825				new_pi = NULL;
1826			}
1827		}
1828		if (new_pi != NULL) {
1829			new_pi->pi_key = uq->uq_key;
1830			umtx_pi_insert(new_pi);
1831			pi = new_pi;
1832		}
1833	}
1834	umtx_pi_ref(pi);
1835	umtxq_unlock(&uq->uq_key);
1836
1837	/*
1838	 * Care must be exercised when dealing with umtx structure.  It
1839	 * can fault on any access.
1840	 */
1841	for (;;) {
1842		/*
1843		 * Try the uncontested case.  This should be done in userland.
1844		 */
1845		rv = casueword32(&m->m_owner, UMUTEX_UNOWNED, &owner, id);
1846		/* The address was invalid. */
1847		if (rv == -1) {
1848			error = EFAULT;
1849			break;
1850		}
1851		/* The acquire succeeded. */
1852		if (rv == 0) {
1853			MPASS(owner == UMUTEX_UNOWNED);
1854			error = 0;
1855			break;
1856		}
1857
1858		if (owner == UMUTEX_RB_NOTRECOV) {
1859			error = ENOTRECOVERABLE;
1860			break;
1861		}
1862
1863		/*
		 * Avoid overwriting a possible error from sleep due
		 * to a pending signal with the suspension check result.
1866		 */
1867		if (error == 0) {
1868			error = thread_check_susp(td, true);
1869			if (error != 0)
1870				break;
1871		}
1872
		/* If no one owns it but it is contested, try to acquire it. */
1874		if (owner == UMUTEX_CONTESTED || owner == UMUTEX_RB_OWNERDEAD) {
1875			old_owner = owner;
1876			rv = casueword32(&m->m_owner, owner, &owner,
1877			    id | UMUTEX_CONTESTED);
1878			/* The address was invalid. */
1879			if (rv == -1) {
1880				error = EFAULT;
1881				break;
1882			}
1883			if (rv == 1) {
1884				if (error == 0) {
1885					error = thread_check_susp(td, true);
1886					if (error != 0)
1887						break;
1888				}
1889
1890				/*
				 * If this failed, the lock could have
				 * changed; restart.
1893				 */
1894				continue;
1895			}
1896
1897			MPASS(rv == 0);
1898			MPASS(owner == old_owner);
1899			umtxq_lock(&uq->uq_key);
1900			umtxq_busy(&uq->uq_key);
1901			error = umtx_pi_claim(pi, td);
1902			umtxq_unbusy(&uq->uq_key);
1903			umtxq_unlock(&uq->uq_key);
1904			if (error != 0) {
1905				/*
1906				 * Since we're going to return an
1907				 * error, restore the m_owner to its
1908				 * previous, unowned state to avoid
1909				 * compounding the problem.
1910				 */
1911				(void)casuword32(&m->m_owner,
1912				    id | UMUTEX_CONTESTED, old_owner);
1913			}
1914			if (error == 0 && old_owner == UMUTEX_RB_OWNERDEAD)
1915				error = EOWNERDEAD;
1916			break;
1917		}
1918
1919		if ((owner & ~UMUTEX_CONTESTED) == id) {
1920			error = EDEADLK;
1921			break;
1922		}
1923
1924		if (try != 0) {
1925			error = EBUSY;
1926			break;
1927		}
1928
1929		/*
1930		 * If we caught a signal, we have retried and now
1931		 * exit immediately.
1932		 */
1933		if (error != 0)
1934			break;
1935
1936		umtxq_lock(&uq->uq_key);
1937		umtxq_busy(&uq->uq_key);
1938		umtxq_unlock(&uq->uq_key);
1939
1940		/*
1941		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails,
		 * either someone else has acquired the lock or it has been
		 * released.
1945		 */
1946		rv = casueword32(&m->m_owner, owner, &old, owner |
1947		    UMUTEX_CONTESTED);
1948
1949		/* The address was invalid. */
1950		if (rv == -1) {
1951			umtxq_unbusy_unlocked(&uq->uq_key);
1952			error = EFAULT;
1953			break;
1954		}
1955		if (rv == 1) {
1956			umtxq_unbusy_unlocked(&uq->uq_key);
1957			error = thread_check_susp(td, true);
1958			if (error != 0)
1959				break;
1960
1961			/*
			 * The lock changed and we need to retry, or we
			 * lost a race to the thread unlocking the
			 * umtx.  Note that the UMUTEX_RB_OWNERDEAD
			 * value for owner is impossible here.
1966			 */
1967			continue;
1968		}
1969
1970		umtxq_lock(&uq->uq_key);
1971
1972		/* We set the contested bit, sleep. */
1973		MPASS(old == owner);
1974		error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED,
1975		    "umtxpi", timeout == NULL ? NULL : &timo,
1976		    (flags & USYNC_PROCESS_SHARED) != 0);
1977		if (error != 0)
1978			continue;
1979
1980		error = thread_check_susp(td, false);
1981		if (error != 0)
1982			break;
1983	}
1984
1985	umtxq_lock(&uq->uq_key);
1986	umtx_pi_unref(pi);
1987	umtxq_unlock(&uq->uq_key);
1988
1989	umtx_key_release(&uq->uq_key);
1990	return (error);
1991}
1992
1993/*
1994 * Unlock a PI mutex.
1995 */
1996static int
1997do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags, bool rb)
1998{
1999	struct umtx_key key;
2000	struct umtx_q *uq_first, *uq_first2, *uq_me;
2001	struct umtx_pi *pi, *pi2;
2002	uint32_t id, new_owner, old, owner;
2003	int count, error, pri;
2004
2005	id = td->td_tid;
2006
2007usrloop:
2008	/*
2009	 * Make sure we own this mtx.
2010	 */
2011	error = fueword32(&m->m_owner, &owner);
2012	if (error == -1)
2013		return (EFAULT);
2014
2015	if ((owner & ~UMUTEX_CONTESTED) != id)
2016		return (EPERM);
2017
2018	new_owner = umtx_unlock_val(flags, rb);
2019
2020	/* This should be done in userland */
2021	if ((owner & UMUTEX_CONTESTED) == 0) {
2022		error = casueword32(&m->m_owner, owner, &old, new_owner);
2023		if (error == -1)
2024			return (EFAULT);
2025		if (error == 1) {
2026			error = thread_check_susp(td, true);
2027			if (error != 0)
2028				return (error);
2029			goto usrloop;
2030		}
2031		if (old == owner)
2032			return (0);
2033		owner = old;
2034	}
2035
2036	/* We should only ever be in here for contested locks */
2037	if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ?
2038	    TYPE_PI_ROBUST_UMUTEX : TYPE_PI_UMUTEX, GET_SHARE(flags),
2039	    &key)) != 0)
2040		return (error);
2041
2042	umtxq_lock(&key);
2043	umtxq_busy(&key);
2044	count = umtxq_count_pi(&key, &uq_first);
2045	if (uq_first != NULL) {
2046		mtx_lock(&umtx_lock);
2047		pi = uq_first->uq_pi_blocked;
2048		KASSERT(pi != NULL, ("pi == NULL?"));
2049		if (pi->pi_owner != td && !(rb && pi->pi_owner == NULL)) {
2050			mtx_unlock(&umtx_lock);
2051			umtxq_unbusy(&key);
2052			umtxq_unlock(&key);
2053			umtx_key_release(&key);
2054			/* userland messed the mutex */
2055			return (EPERM);
2056		}
2057		uq_me = td->td_umtxq;
2058		if (pi->pi_owner == td)
2059			umtx_pi_disown(pi);
2060		/* get highest priority thread which is still sleeping. */
2061		uq_first = TAILQ_FIRST(&pi->pi_blocked);
2062		while (uq_first != NULL &&
2063		    (uq_first->uq_flags & UQF_UMTXQ) == 0) {
2064			uq_first = TAILQ_NEXT(uq_first, uq_lockq);
2065		}
2066		pri = PRI_MAX;
2067		TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
2068			uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
2069			if (uq_first2 != NULL) {
2070				if (pri > UPRI(uq_first2->uq_thread))
2071					pri = UPRI(uq_first2->uq_thread);
2072			}
2073		}
2074		thread_lock(td);
2075		sched_lend_user_prio(td, pri);
2076		thread_unlock(td);
2077		mtx_unlock(&umtx_lock);
2078		if (uq_first)
2079			umtxq_signal_thread(uq_first);
2080	} else {
2081		pi = umtx_pi_lookup(&key);
2082		/*
2083		 * A umtx_pi can exist if a signal or timeout removed the
2084		 * last waiter from the umtxq, but there is still
2085		 * a thread in do_lock_pi() holding the umtx_pi.
2086		 */
2087		if (pi != NULL) {
2088			/*
2089			 * The umtx_pi can be unowned, such as when a thread
2090			 * has just entered do_lock_pi(), allocated the
2091			 * umtx_pi, and unlocked the umtxq.
2092			 * If the current thread owns it, it must disown it.
2093			 */
2094			mtx_lock(&umtx_lock);
2095			if (pi->pi_owner == td)
2096				umtx_pi_disown(pi);
2097			mtx_unlock(&umtx_lock);
2098		}
2099	}
2100	umtxq_unlock(&key);
2101
2102	/*
2103	 * When unlocking the umtx, it must be marked as unowned if
2104	 * there is zero or one thread only waiting for it.
2105	 * Otherwise, it must be marked as contested.
2106	 */
2107
2108	if (count > 1)
2109		new_owner |= UMUTEX_CONTESTED;
2110again:
2111	error = casueword32(&m->m_owner, owner, &old, new_owner);
2112	if (error == 1) {
2113		error = thread_check_susp(td, false);
2114		if (error == 0)
2115			goto again;
2116	}
2117	umtxq_unbusy_unlocked(&key);
2118	umtx_key_release(&key);
2119	if (error == -1)
2120		return (EFAULT);
2121	if (error == 0 && old != owner)
2122		return (EINVAL);
2123	return (error);
2124}
2125
2126/*
2127 * Lock a PP mutex.
2128 */
2129static int
2130do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags,
2131    struct _umtx_time *timeout, int try)
2132{
2133	struct abs_timeout timo;
2134	struct umtx_q *uq, *uq2;
2135	struct umtx_pi *pi;
2136	uint32_t ceiling;
2137	uint32_t owner, id;
2138	int error, pri, old_inherited_pri, su, rv;
2139
2140	id = td->td_tid;
2141	uq = td->td_umtxq;
2142	if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ?
2143	    TYPE_PP_ROBUST_UMUTEX : TYPE_PP_UMUTEX, GET_SHARE(flags),
2144	    &uq->uq_key)) != 0)
2145		return (error);
2146
2147	if (timeout != NULL)
2148		abs_timeout_init2(&timo, timeout);
2149
2150	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
2151	for (;;) {
2152		old_inherited_pri = uq->uq_inherited_pri;
2153		umtxq_lock(&uq->uq_key);
2154		umtxq_busy(&uq->uq_key);
2155		umtxq_unlock(&uq->uq_key);
2156
2157		rv = fueword32(&m->m_ceilings[0], &ceiling);
2158		if (rv == -1) {
2159			error = EFAULT;
2160			goto out;
2161		}
2162		ceiling = RTP_PRIO_MAX - ceiling;
2163		if (ceiling > RTP_PRIO_MAX) {
2164			error = EINVAL;
2165			goto out;
2166		}
2167
2168		mtx_lock(&umtx_lock);
2169		if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
2170			mtx_unlock(&umtx_lock);
2171			error = EINVAL;
2172			goto out;
2173		}
2174		if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
2175			uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
2176			thread_lock(td);
2177			if (uq->uq_inherited_pri < UPRI(td))
2178				sched_lend_user_prio(td, uq->uq_inherited_pri);
2179			thread_unlock(td);
2180		}
2181		mtx_unlock(&umtx_lock);
2182
2183		rv = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner,
2184		    id | UMUTEX_CONTESTED);
2185		/* The address was invalid. */
2186		if (rv == -1) {
2187			error = EFAULT;
2188			break;
2189		}
2190		if (rv == 0) {
2191			MPASS(owner == UMUTEX_CONTESTED);
2192			error = 0;
2193			break;
2194		}
2195		/* rv == 1 */
2196		if (owner == UMUTEX_RB_OWNERDEAD) {
2197			rv = casueword32(&m->m_owner, UMUTEX_RB_OWNERDEAD,
2198			    &owner, id | UMUTEX_CONTESTED);
2199			if (rv == -1) {
2200				error = EFAULT;
2201				break;
2202			}
2203			if (rv == 0) {
2204				MPASS(owner == UMUTEX_RB_OWNERDEAD);
2205				error = EOWNERDEAD; /* success */
2206				break;
2207			}
2208
			/*
			 * rv == 1: only check for suspension if we have
			 * not already caught a signal.  If the check
			 * returns an error, the same condition is checked
			 * again by the umtxq_sleep() call below, so clear
			 * the error here to avoid skipping the last loop
			 * iteration.
			 */
2217			if (error == 0) {
2218				error = thread_check_susp(td, false);
2219				if (error == 0) {
2220					if (try != 0)
2221						error = EBUSY;
2222					else
2223						continue;
2224				}
2225				error = 0;
2226			}
2227		} else if (owner == UMUTEX_RB_NOTRECOV) {
2228			error = ENOTRECOVERABLE;
2229		}
2230
2231		if (try != 0)
2232			error = EBUSY;
2233
2234		/*
2235		 * If we caught a signal, we have retried and now
2236		 * exit immediately.
2237		 */
2238		if (error != 0)
2239			break;
2240
2241		umtxq_lock(&uq->uq_key);
2242		umtxq_insert(uq);
2243		umtxq_unbusy(&uq->uq_key);
2244		error = umtxq_sleep(uq, "umtxpp", timeout == NULL ?
2245		    NULL : &timo);
2246		umtxq_remove(uq);
2247		umtxq_unlock(&uq->uq_key);
2248
2249		mtx_lock(&umtx_lock);
2250		uq->uq_inherited_pri = old_inherited_pri;
2251		pri = PRI_MAX;
2252		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2253			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2254			if (uq2 != NULL) {
2255				if (pri > UPRI(uq2->uq_thread))
2256					pri = UPRI(uq2->uq_thread);
2257			}
2258		}
2259		if (pri > uq->uq_inherited_pri)
2260			pri = uq->uq_inherited_pri;
2261		thread_lock(td);
2262		sched_lend_user_prio(td, pri);
2263		thread_unlock(td);
2264		mtx_unlock(&umtx_lock);
2265	}
2266
2267	if (error != 0 && error != EOWNERDEAD) {
2268		mtx_lock(&umtx_lock);
2269		uq->uq_inherited_pri = old_inherited_pri;
2270		pri = PRI_MAX;
2271		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2272			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2273			if (uq2 != NULL) {
2274				if (pri > UPRI(uq2->uq_thread))
2275					pri = UPRI(uq2->uq_thread);
2276			}
2277		}
2278		if (pri > uq->uq_inherited_pri)
2279			pri = uq->uq_inherited_pri;
2280		thread_lock(td);
2281		sched_lend_user_prio(td, pri);
2282		thread_unlock(td);
2283		mtx_unlock(&umtx_lock);
2284	}
2285
2286out:
2287	umtxq_unbusy_unlocked(&uq->uq_key);
2288	umtx_key_release(&uq->uq_key);
2289	return (error);
2290}
2291
2292/*
2293 * Unlock a PP mutex.
2294 */
2295static int
2296do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags, bool rb)
2297{
2298	struct umtx_key key;
2299	struct umtx_q *uq, *uq2;
2300	struct umtx_pi *pi;
2301	uint32_t id, owner, rceiling;
2302	int error, pri, new_inherited_pri, su;
2303
2304	id = td->td_tid;
2305	uq = td->td_umtxq;
2306	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
2307
2308	/*
2309	 * Make sure we own this mtx.
2310	 */
2311	error = fueword32(&m->m_owner, &owner);
2312	if (error == -1)
2313		return (EFAULT);
2314
2315	if ((owner & ~UMUTEX_CONTESTED) != id)
2316		return (EPERM);
2317
2318	error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
2319	if (error != 0)
2320		return (error);
2321
2322	if (rceiling == -1)
2323		new_inherited_pri = PRI_MAX;
2324	else {
2325		rceiling = RTP_PRIO_MAX - rceiling;
2326		if (rceiling > RTP_PRIO_MAX)
2327			return (EINVAL);
2328		new_inherited_pri = PRI_MIN_REALTIME + rceiling;
2329	}
2330
2331	if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ?
2332	    TYPE_PP_ROBUST_UMUTEX : TYPE_PP_UMUTEX, GET_SHARE(flags),
2333	    &key)) != 0)
2334		return (error);
2335	umtxq_lock(&key);
2336	umtxq_busy(&key);
2337	umtxq_unlock(&key);
	/*
	 * For a priority-protected mutex, always set the unlocked state
	 * to UMUTEX_CONTESTED, so that userland always enters the kernel
	 * to lock the mutex.  This is necessary because the thread
	 * priority has to be adjusted for such a mutex.
	 */
2344	error = suword32(&m->m_owner, umtx_unlock_val(flags, rb) |
2345	    UMUTEX_CONTESTED);
2346
2347	umtxq_lock(&key);
2348	if (error == 0)
2349		umtxq_signal(&key, 1);
2350	umtxq_unbusy(&key);
2351	umtxq_unlock(&key);
2352
2353	if (error == -1)
2354		error = EFAULT;
2355	else {
2356		mtx_lock(&umtx_lock);
2357		if (su != 0)
2358			uq->uq_inherited_pri = new_inherited_pri;
2359		pri = PRI_MAX;
2360		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2361			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2362			if (uq2 != NULL) {
2363				if (pri > UPRI(uq2->uq_thread))
2364					pri = UPRI(uq2->uq_thread);
2365			}
2366		}
2367		if (pri > uq->uq_inherited_pri)
2368			pri = uq->uq_inherited_pri;
2369		thread_lock(td);
2370		sched_lend_user_prio(td, pri);
2371		thread_unlock(td);
2372		mtx_unlock(&umtx_lock);
2373	}
2374	umtx_key_release(&key);
2375	return (error);
2376}
2377
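/*
 * Change the priority ceiling of a PP mutex and optionally copy the
 * previous ceiling out to userland.
 */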
2378static int
2379do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
2380    uint32_t *old_ceiling)
2381{
2382	struct umtx_q *uq;
2383	uint32_t flags, id, owner, save_ceiling;
2384	int error, rv, rv1;
2385
2386	error = fueword32(&m->m_flags, &flags);
2387	if (error == -1)
2388		return (EFAULT);
2389	if ((flags & UMUTEX_PRIO_PROTECT) == 0)
2390		return (EINVAL);
2391	if (ceiling > RTP_PRIO_MAX)
2392		return (EINVAL);
2393	id = td->td_tid;
2394	uq = td->td_umtxq;
2395	if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ?
2396	    TYPE_PP_ROBUST_UMUTEX : TYPE_PP_UMUTEX, GET_SHARE(flags),
2397	    &uq->uq_key)) != 0)
2398		return (error);
2399	for (;;) {
2400		umtxq_lock(&uq->uq_key);
2401		umtxq_busy(&uq->uq_key);
2402		umtxq_unlock(&uq->uq_key);
2403
2404		rv = fueword32(&m->m_ceilings[0], &save_ceiling);
2405		if (rv == -1) {
2406			error = EFAULT;
2407			break;
2408		}
2409
2410		rv = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner,
2411		    id | UMUTEX_CONTESTED);
2412		if (rv == -1) {
2413			error = EFAULT;
2414			break;
2415		}
2416
2417		if (rv == 0) {
2418			MPASS(owner == UMUTEX_CONTESTED);
2419			rv = suword32(&m->m_ceilings[0], ceiling);
2420			rv1 = suword32(&m->m_owner, UMUTEX_CONTESTED);
			error = (rv == 0 && rv1 == 0) ? 0 : EFAULT;
2422			break;
2423		}
2424
2425		if ((owner & ~UMUTEX_CONTESTED) == id) {
2426			rv = suword32(&m->m_ceilings[0], ceiling);
2427			error = rv == 0 ? 0 : EFAULT;
2428			break;
2429		}
2430
2431		if (owner == UMUTEX_RB_OWNERDEAD) {
2432			error = EOWNERDEAD;
2433			break;
2434		} else if (owner == UMUTEX_RB_NOTRECOV) {
2435			error = ENOTRECOVERABLE;
2436			break;
2437		}
2438
2439		/*
2440		 * If we caught a signal, we have retried and now
2441		 * exit immediately.
2442		 */
2443		if (error != 0)
2444			break;
2445
		/*
		 * We set the contested bit, so sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race to the
		 * thread unlocking the umtx.
		 */
2451		umtxq_lock(&uq->uq_key);
2452		umtxq_insert(uq);
2453		umtxq_unbusy(&uq->uq_key);
2454		error = umtxq_sleep(uq, "umtxpp", NULL);
2455		umtxq_remove(uq);
2456		umtxq_unlock(&uq->uq_key);
2457	}
2458	umtxq_lock(&uq->uq_key);
2459	if (error == 0)
2460		umtxq_signal(&uq->uq_key, INT_MAX);
2461	umtxq_unbusy(&uq->uq_key);
2462	umtxq_unlock(&uq->uq_key);
2463	umtx_key_release(&uq->uq_key);
2464	if (error == 0 && old_ceiling != NULL) {
2465		rv = suword32(old_ceiling, save_ceiling);
2466		error = rv == 0 ? 0 : EFAULT;
2467	}
2468	return (error);
2469}
2470
2471/*
2472 * Lock a userland POSIX mutex.
2473 */
2474static int
2475do_lock_umutex(struct thread *td, struct umutex *m,
2476    struct _umtx_time *timeout, int mode)
2477{
2478	uint32_t flags;
2479	int error;
2480
2481	error = fueword32(&m->m_flags, &flags);
2482	if (error == -1)
2483		return (EFAULT);
2484
2485	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2486	case 0:
2487		error = do_lock_normal(td, m, flags, timeout, mode);
2488		break;
2489	case UMUTEX_PRIO_INHERIT:
2490		error = do_lock_pi(td, m, flags, timeout, mode);
2491		break;
2492	case UMUTEX_PRIO_PROTECT:
2493		error = do_lock_pp(td, m, flags, timeout, mode);
2494		break;
2495	default:
2496		return (EINVAL);
2497	}
2498	if (timeout == NULL) {
2499		if (error == EINTR && mode != _UMUTEX_WAIT)
2500			error = ERESTART;
2501	} else {
2502		/* Timed-locking is not restarted. */
2503		if (error == ERESTART)
2504			error = EINTR;
2505	}
2506	return (error);
2507}
2508
2509/*
2510 * Unlock a userland POSIX mutex.
2511 */
2512static int
2513do_unlock_umutex(struct thread *td, struct umutex *m, bool rb)
2514{
2515	uint32_t flags;
2516	int error;
2517
2518	error = fueword32(&m->m_flags, &flags);
2519	if (error == -1)
2520		return (EFAULT);
2521
2522	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2523	case 0:
2524		return (do_unlock_normal(td, m, flags, rb));
2525	case UMUTEX_PRIO_INHERIT:
2526		return (do_unlock_pi(td, m, flags, rb));
2527	case UMUTEX_PRIO_PROTECT:
2528		return (do_unlock_pp(td, m, flags, rb));
2529	}
2530
2531	return (EINVAL);
2532}
2533
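/*
 * Wait on a userland condition variable: mark it as having waiters,
 * unlock the associated mutex and sleep until signalled, interrupted
 * or timed out.
 */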
2534static int
2535do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m,
2536    struct timespec *timeout, u_long wflags)
2537{
2538	struct abs_timeout timo;
2539	struct umtx_q *uq;
2540	uint32_t flags, clockid, hasw;
2541	int error;
2542
2543	uq = td->td_umtxq;
2544	error = fueword32(&cv->c_flags, &flags);
2545	if (error == -1)
2546		return (EFAULT);
2547	error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key);
2548	if (error != 0)
2549		return (error);
2550
2551	if ((wflags & CVWAIT_CLOCKID) != 0) {
2552		error = fueword32(&cv->c_clockid, &clockid);
2553		if (error == -1) {
2554			umtx_key_release(&uq->uq_key);
2555			return (EFAULT);
2556		}
2557		if (clockid < CLOCK_REALTIME ||
2558		    clockid >= CLOCK_THREAD_CPUTIME_ID) {
			/* Only the predefined hardware clock ids will work. */
2560			umtx_key_release(&uq->uq_key);
2561			return (EINVAL);
2562		}
2563	} else {
2564		clockid = CLOCK_REALTIME;
2565	}
2566
2567	umtxq_lock(&uq->uq_key);
2568	umtxq_busy(&uq->uq_key);
2569	umtxq_insert(uq);
2570	umtxq_unlock(&uq->uq_key);
2571
	/*
	 * Set c_has_waiters to 1 before releasing the user mutex, but
	 * avoid dirtying the cache line when it is already set.
	 */
2576	error = fueword32(&cv->c_has_waiters, &hasw);
2577	if (error == 0 && hasw == 0)
2578		suword32(&cv->c_has_waiters, 1);
2579
2580	umtxq_unbusy_unlocked(&uq->uq_key);
2581
2582	error = do_unlock_umutex(td, m, false);
2583
2584	if (timeout != NULL)
2585		abs_timeout_init(&timo, clockid, (wflags & CVWAIT_ABSTIME) != 0,
2586		    timeout);
2587
2588	umtxq_lock(&uq->uq_key);
2589	if (error == 0) {
2590		error = umtxq_sleep(uq, "ucond", timeout == NULL ?
2591		    NULL : &timo);
2592	}
2593
2594	if ((uq->uq_flags & UQF_UMTXQ) == 0)
2595		error = 0;
2596	else {
		/*
		 * This must be a timeout, an interruption by a signal, or
		 * a spurious wakeup; clear the c_has_waiters flag when
		 * necessary.
		 */
2602		umtxq_busy(&uq->uq_key);
2603		if ((uq->uq_flags & UQF_UMTXQ) != 0) {
2604			int oldlen = uq->uq_cur_queue->length;
2605			umtxq_remove(uq);
2606			if (oldlen == 1) {
2607				umtxq_unlock(&uq->uq_key);
2608				suword32(&cv->c_has_waiters, 0);
2609				umtxq_lock(&uq->uq_key);
2610			}
2611		}
2612		umtxq_unbusy(&uq->uq_key);
2613		if (error == ERESTART)
2614			error = EINTR;
2615	}
2616
2617	umtxq_unlock(&uq->uq_key);
2618	umtx_key_release(&uq->uq_key);
2619	return (error);
2620}
2621
2622/*
2623 * Signal a userland condition variable.
2624 */
2625static int
2626do_cv_signal(struct thread *td, struct ucond *cv)
2627{
2628	struct umtx_key key;
2629	int error, cnt, nwake;
2630	uint32_t flags;
2631
2632	error = fueword32(&cv->c_flags, &flags);
2633	if (error == -1)
2634		return (EFAULT);
2635	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2636		return (error);
2637	umtxq_lock(&key);
2638	umtxq_busy(&key);
2639	cnt = umtxq_count(&key);
2640	nwake = umtxq_signal(&key, 1);
2641	if (cnt <= nwake) {
2642		umtxq_unlock(&key);
2643		error = suword32(&cv->c_has_waiters, 0);
2644		if (error == -1)
2645			error = EFAULT;
2646		umtxq_lock(&key);
2647	}
2648	umtxq_unbusy(&key);
2649	umtxq_unlock(&key);
2650	umtx_key_release(&key);
2651	return (error);
2652}
2653
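/*
 * Broadcast a userland condition variable, waking all waiters.
 */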
2654static int
2655do_cv_broadcast(struct thread *td, struct ucond *cv)
2656{
2657	struct umtx_key key;
2658	int error;
2659	uint32_t flags;
2660
2661	error = fueword32(&cv->c_flags, &flags);
2662	if (error == -1)
2663		return (EFAULT);
2664	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2665		return (error);
2666
2667	umtxq_lock(&key);
2668	umtxq_busy(&key);
2669	umtxq_signal(&key, INT_MAX);
2670	umtxq_unlock(&key);
2671
2672	error = suword32(&cv->c_has_waiters, 0);
2673	if (error == -1)
2674		error = EFAULT;
2675
2676	umtxq_unbusy_unlocked(&key);
2677
2678	umtx_key_release(&key);
2679	return (error);
2680}
2681
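/*
 * Lock a userland reader/writer lock for reading.
 */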
2682static int
2683do_rw_rdlock(struct thread *td, struct urwlock *rwlock, long fflag,
2684    struct _umtx_time *timeout)
2685{
2686	struct abs_timeout timo;
2687	struct umtx_q *uq;
2688	uint32_t flags, wrflags;
2689	int32_t state, oldstate;
2690	int32_t blocked_readers;
2691	int error, error1, rv;
2692
2693	uq = td->td_umtxq;
2694	error = fueword32(&rwlock->rw_flags, &flags);
2695	if (error == -1)
2696		return (EFAULT);
	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags),
	    &uq->uq_key);
2698	if (error != 0)
2699		return (error);
2700
2701	if (timeout != NULL)
2702		abs_timeout_init2(&timo, timeout);
2703
2704	wrflags = URWLOCK_WRITE_OWNER;
2705	if (!(fflag & URWLOCK_PREFER_READER) && !(flags & URWLOCK_PREFER_READER))
2706		wrflags |= URWLOCK_WRITE_WAITERS;
2707
2708	for (;;) {
2709		rv = fueword32(&rwlock->rw_state, &state);
2710		if (rv == -1) {
2711			umtx_key_release(&uq->uq_key);
2712			return (EFAULT);
2713		}
2714
2715		/* try to lock it */
2716		while (!(state & wrflags)) {
2717			if (__predict_false(URWLOCK_READER_COUNT(state) ==
2718			    URWLOCK_MAX_READERS)) {
2719				umtx_key_release(&uq->uq_key);
2720				return (EAGAIN);
2721			}
2722			rv = casueword32(&rwlock->rw_state, state,
2723			    &oldstate, state + 1);
2724			if (rv == -1) {
2725				umtx_key_release(&uq->uq_key);
2726				return (EFAULT);
2727			}
2728			if (rv == 0) {
2729				MPASS(oldstate == state);
2730				umtx_key_release(&uq->uq_key);
2731				return (0);
2732			}
2733			error = thread_check_susp(td, true);
2734			if (error != 0)
2735				break;
2736			state = oldstate;
2737		}
2738
2739		if (error)
2740			break;
2741
2742		/* grab monitor lock */
2743		umtxq_lock(&uq->uq_key);
2744		umtxq_busy(&uq->uq_key);
2745		umtxq_unlock(&uq->uq_key);
2746
		/*
		 * Re-read the state, in case it changed between the
		 * try-lock above and the check below.
		 */
2751		rv = fueword32(&rwlock->rw_state, &state);
2752		if (rv == -1)
2753			error = EFAULT;
2754
2755		/* set read contention bit */
2756		while (error == 0 && (state & wrflags) &&
2757		    !(state & URWLOCK_READ_WAITERS)) {
2758			rv = casueword32(&rwlock->rw_state, state,
2759			    &oldstate, state | URWLOCK_READ_WAITERS);
2760			if (rv == -1) {
2761				error = EFAULT;
2762				break;
2763			}
2764			if (rv == 0) {
2765				MPASS(oldstate == state);
2766				goto sleep;
2767			}
2768			state = oldstate;
2769			error = thread_check_susp(td, false);
2770			if (error != 0)
2771				break;
2772		}
2773		if (error != 0) {
2774			umtxq_unbusy_unlocked(&uq->uq_key);
2775			break;
2776		}
2777
		/* The state changed while we were setting the flags; restart. */
2779		if (!(state & wrflags)) {
2780			umtxq_unbusy_unlocked(&uq->uq_key);
2781			error = thread_check_susp(td, true);
2782			if (error != 0)
2783				break;
2784			continue;
2785		}
2786
2787sleep:
		/*
		 * The contention bit is set; before sleeping, increase
		 * the read waiter count.
		 */
2792		rv = fueword32(&rwlock->rw_blocked_readers,
2793		    &blocked_readers);
2794		if (rv == -1) {
2795			umtxq_unbusy_unlocked(&uq->uq_key);
2796			error = EFAULT;
2797			break;
2798		}
		suword32(&rwlock->rw_blocked_readers, blocked_readers + 1);
2800
2801		while (state & wrflags) {
2802			umtxq_lock(&uq->uq_key);
2803			umtxq_insert(uq);
2804			umtxq_unbusy(&uq->uq_key);
2805
2806			error = umtxq_sleep(uq, "urdlck", timeout == NULL ?
2807			    NULL : &timo);
2808
2809			umtxq_busy(&uq->uq_key);
2810			umtxq_remove(uq);
2811			umtxq_unlock(&uq->uq_key);
2812			if (error)
2813				break;
2814			rv = fueword32(&rwlock->rw_state, &state);
2815			if (rv == -1) {
2816				error = EFAULT;
2817				break;
2818			}
2819		}
2820
		/*
		 * Decrease the read waiter count, and possibly clear the
		 * read contention bit.
		 */
2822		rv = fueword32(&rwlock->rw_blocked_readers,
2823		    &blocked_readers);
2824		if (rv == -1) {
2825			umtxq_unbusy_unlocked(&uq->uq_key);
2826			error = EFAULT;
2827			break;
2828		}
		suword32(&rwlock->rw_blocked_readers, blocked_readers - 1);
2830		if (blocked_readers == 1) {
2831			rv = fueword32(&rwlock->rw_state, &state);
2832			if (rv == -1) {
2833				umtxq_unbusy_unlocked(&uq->uq_key);
2834				error = EFAULT;
2835				break;
2836			}
2837			for (;;) {
2838				rv = casueword32(&rwlock->rw_state, state,
2839				    &oldstate, state & ~URWLOCK_READ_WAITERS);
2840				if (rv == -1) {
2841					error = EFAULT;
2842					break;
2843				}
2844				if (rv == 0) {
2845					MPASS(oldstate == state);
2846					break;
2847				}
2848				state = oldstate;
2849				error1 = thread_check_susp(td, false);
2850				if (error1 != 0) {
2851					if (error == 0)
2852						error = error1;
2853					break;
2854				}
2855			}
2856		}
2857
2858		umtxq_unbusy_unlocked(&uq->uq_key);
2859		if (error != 0)
2860			break;
2861	}
2862	umtx_key_release(&uq->uq_key);
2863	if (error == ERESTART)
2864		error = EINTR;
2865	return (error);
2866}
2867
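/*
 * Lock a userland reader/writer lock for writing.
 */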
2868static int
do_rw_wrlock(struct thread *td, struct urwlock *rwlock,
    struct _umtx_time *timeout)
2870{
2871	struct abs_timeout timo;
2872	struct umtx_q *uq;
2873	uint32_t flags;
2874	int32_t state, oldstate;
2875	int32_t blocked_writers;
2876	int32_t blocked_readers;
2877	int error, error1, rv;
2878
2879	uq = td->td_umtxq;
2880	error = fueword32(&rwlock->rw_flags, &flags);
2881	if (error == -1)
2882		return (EFAULT);
	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags),
	    &uq->uq_key);
2884	if (error != 0)
2885		return (error);
2886
2887	if (timeout != NULL)
2888		abs_timeout_init2(&timo, timeout);
2889
2890	blocked_readers = 0;
2891	for (;;) {
2892		rv = fueword32(&rwlock->rw_state, &state);
2893		if (rv == -1) {
2894			umtx_key_release(&uq->uq_key);
2895			return (EFAULT);
2896		}
2897		while ((state & URWLOCK_WRITE_OWNER) == 0 &&
2898		    URWLOCK_READER_COUNT(state) == 0) {
2899			rv = casueword32(&rwlock->rw_state, state,
2900			    &oldstate, state | URWLOCK_WRITE_OWNER);
2901			if (rv == -1) {
2902				umtx_key_release(&uq->uq_key);
2903				return (EFAULT);
2904			}
2905			if (rv == 0) {
2906				MPASS(oldstate == state);
2907				umtx_key_release(&uq->uq_key);
2908				return (0);
2909			}
2910			state = oldstate;
2911			error = thread_check_susp(td, true);
2912			if (error != 0)
2913				break;
2914		}
2915
2916		if (error) {
2917			if ((state & (URWLOCK_WRITE_OWNER |
2918			    URWLOCK_WRITE_WAITERS)) == 0 &&
2919			    blocked_readers != 0) {
2920				umtxq_lock(&uq->uq_key);
2921				umtxq_busy(&uq->uq_key);
2922				umtxq_signal_queue(&uq->uq_key, INT_MAX,
2923				    UMTX_SHARED_QUEUE);
2924				umtxq_unbusy(&uq->uq_key);
2925				umtxq_unlock(&uq->uq_key);
2926			}
2927
2928			break;
2929		}
2930
2931		/* grab monitor lock */
2932		umtxq_lock(&uq->uq_key);
2933		umtxq_busy(&uq->uq_key);
2934		umtxq_unlock(&uq->uq_key);
2935
2936		/*
2937		 * Re-read the state, in case it changed between the
2938		 * try-lock above and the check below.
2939		 */
2940		rv = fueword32(&rwlock->rw_state, &state);
2941		if (rv == -1)
2942			error = EFAULT;
2943
2944		while (error == 0 && ((state & URWLOCK_WRITE_OWNER) ||
2945		    URWLOCK_READER_COUNT(state) != 0) &&
2946		    (state & URWLOCK_WRITE_WAITERS) == 0) {
2947			rv = casueword32(&rwlock->rw_state, state,
2948			    &oldstate, state | URWLOCK_WRITE_WAITERS);
2949			if (rv == -1) {
2950				error = EFAULT;
2951				break;
2952			}
2953			if (rv == 0) {
2954				MPASS(oldstate == state);
2955				goto sleep;
2956			}
2957			state = oldstate;
2958			error = thread_check_susp(td, false);
2959			if (error != 0)
2960				break;
2961		}
2962		if (error != 0) {
2963			umtxq_unbusy_unlocked(&uq->uq_key);
2964			break;
2965		}
2966
2967		if ((state & URWLOCK_WRITE_OWNER) == 0 &&
2968		    URWLOCK_READER_COUNT(state) == 0) {
2969			umtxq_unbusy_unlocked(&uq->uq_key);
2970			error = thread_check_susp(td, false);
2971			if (error != 0)
2972				break;
2973			continue;
2974		}
2975sleep:
2976		rv = fueword32(&rwlock->rw_blocked_writers,
2977		    &blocked_writers);
2978		if (rv == -1) {
2979			umtxq_unbusy_unlocked(&uq->uq_key);
2980			error = EFAULT;
2981			break;
2982		}
2983		suword32(&rwlock->rw_blocked_writers, blocked_writers + 1);
2984
2985		while ((state & URWLOCK_WRITE_OWNER) ||
2986		    URWLOCK_READER_COUNT(state) != 0) {
2987			umtxq_lock(&uq->uq_key);
2988			umtxq_insert_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2989			umtxq_unbusy(&uq->uq_key);
2990
2991			error = umtxq_sleep(uq, "uwrlck", timeout == NULL ?
2992			    NULL : &timo);
2993
2994			umtxq_busy(&uq->uq_key);
2995			umtxq_remove_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2996			umtxq_unlock(&uq->uq_key);
2997			if (error)
2998				break;
2999			rv = fueword32(&rwlock->rw_state, &state);
3000			if (rv == -1) {
3001				error = EFAULT;
3002				break;
3003			}
3004		}
3005
3006		rv = fueword32(&rwlock->rw_blocked_writers,
3007		    &blocked_writers);
3008		if (rv == -1) {
3009			umtxq_unbusy_unlocked(&uq->uq_key);
3010			error = EFAULT;
3011			break;
3012		}
		suword32(&rwlock->rw_blocked_writers, blocked_writers - 1);
3014		if (blocked_writers == 1) {
3015			rv = fueword32(&rwlock->rw_state, &state);
3016			if (rv == -1) {
3017				umtxq_unbusy_unlocked(&uq->uq_key);
3018				error = EFAULT;
3019				break;
3020			}
3021			for (;;) {
3022				rv = casueword32(&rwlock->rw_state, state,
3023				    &oldstate, state & ~URWLOCK_WRITE_WAITERS);
3024				if (rv == -1) {
3025					error = EFAULT;
3026					break;
3027				}
3028				if (rv == 0) {
3029					MPASS(oldstate == state);
3030					break;
3031				}
3032				state = oldstate;
3033				error1 = thread_check_susp(td, false);
				/*
				 * We may be leaving the URWLOCK_WRITE_WAITERS
				 * flag set behind, but this should not harm
				 * correctness.
				 */
3039				if (error1 != 0) {
3040					if (error == 0)
3041						error = error1;
3042					break;
3043				}
3044			}
3045			rv = fueword32(&rwlock->rw_blocked_readers,
3046			    &blocked_readers);
3047			if (rv == -1) {
3048				umtxq_unbusy_unlocked(&uq->uq_key);
3049				error = EFAULT;
3050				break;
3051			}
3052		} else
3053			blocked_readers = 0;
3054
3055		umtxq_unbusy_unlocked(&uq->uq_key);
3056	}
3057
3058	umtx_key_release(&uq->uq_key);
3059	if (error == ERESTART)
3060		error = EINTR;
3061	return (error);
3062}
3063
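/*
 * Unlock a userland reader/writer lock and wake up either one writer
 * or all readers, according to the lock's reader/writer preference.
 */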
3064static int
3065do_rw_unlock(struct thread *td, struct urwlock *rwlock)
3066{
3067	struct umtx_q *uq;
3068	uint32_t flags;
3069	int32_t state, oldstate;
3070	int error, rv, q, count;
3071
3072	uq = td->td_umtxq;
3073	error = fueword32(&rwlock->rw_flags, &flags);
3074	if (error == -1)
3075		return (EFAULT);
	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags),
	    &uq->uq_key);
3077	if (error != 0)
3078		return (error);
3079
3080	error = fueword32(&rwlock->rw_state, &state);
3081	if (error == -1) {
3082		error = EFAULT;
3083		goto out;
3084	}
3085	if (state & URWLOCK_WRITE_OWNER) {
3086		for (;;) {
3087			rv = casueword32(&rwlock->rw_state, state,
3088			    &oldstate, state & ~URWLOCK_WRITE_OWNER);
3089			if (rv == -1) {
3090				error = EFAULT;
3091				goto out;
3092			}
3093			if (rv == 1) {
3094				state = oldstate;
3095				if (!(oldstate & URWLOCK_WRITE_OWNER)) {
3096					error = EPERM;
3097					goto out;
3098				}
3099				error = thread_check_susp(td, true);
3100				if (error != 0)
3101					goto out;
3102			} else
3103				break;
3104		}
3105	} else if (URWLOCK_READER_COUNT(state) != 0) {
3106		for (;;) {
3107			rv = casueword32(&rwlock->rw_state, state,
3108			    &oldstate, state - 1);
3109			if (rv == -1) {
3110				error = EFAULT;
3111				goto out;
3112			}
3113			if (rv == 1) {
3114				state = oldstate;
3115				if (URWLOCK_READER_COUNT(oldstate) == 0) {
3116					error = EPERM;
3117					goto out;
3118				}
3119				error = thread_check_susp(td, true);
3120				if (error != 0)
3121					goto out;
3122			} else
3123				break;
3124		}
3125	} else {
3126		error = EPERM;
3127		goto out;
3128	}
3129
3130	count = 0;
3131
3132	if (!(flags & URWLOCK_PREFER_READER)) {
3133		if (state & URWLOCK_WRITE_WAITERS) {
3134			count = 1;
3135			q = UMTX_EXCLUSIVE_QUEUE;
3136		} else if (state & URWLOCK_READ_WAITERS) {
3137			count = INT_MAX;
3138			q = UMTX_SHARED_QUEUE;
3139		}
3140	} else {
3141		if (state & URWLOCK_READ_WAITERS) {
3142			count = INT_MAX;
3143			q = UMTX_SHARED_QUEUE;
3144		} else if (state & URWLOCK_WRITE_WAITERS) {
3145			count = 1;
3146			q = UMTX_EXCLUSIVE_QUEUE;
3147		}
3148	}
3149
3150	if (count) {
3151		umtxq_lock(&uq->uq_key);
3152		umtxq_busy(&uq->uq_key);
3153		umtxq_signal_queue(&uq->uq_key, count, q);
3154		umtxq_unbusy(&uq->uq_key);
3155		umtxq_unlock(&uq->uq_key);
3156	}
3157out:
3158	umtx_key_release(&uq->uq_key);
3159	return (error);
3160}
3161
3162#if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
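/*
 * Wait on an old-style userland semaphore (struct _usem); compiled only
 * for COMPAT_FREEBSD9/COMPAT_FREEBSD10.
 */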
3163static int
3164do_sem_wait(struct thread *td, struct _usem *sem, struct _umtx_time *timeout)
3165{
3166	struct abs_timeout timo;
3167	struct umtx_q *uq;
3168	uint32_t flags, count, count1;
3169	int error, rv, rv1;
3170
3171	uq = td->td_umtxq;
3172	error = fueword32(&sem->_flags, &flags);
3173	if (error == -1)
3174		return (EFAULT);
3175	error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key);
3176	if (error != 0)
3177		return (error);
3178
3179	if (timeout != NULL)
3180		abs_timeout_init2(&timo, timeout);
3181
3182again:
3183	umtxq_lock(&uq->uq_key);
3184	umtxq_busy(&uq->uq_key);
3185	umtxq_insert(uq);
3186	umtxq_unlock(&uq->uq_key);
3187	rv = casueword32(&sem->_has_waiters, 0, &count1, 1);
3188	if (rv == 0)
3189		rv1 = fueword32(&sem->_count, &count);
3190	if (rv == -1 || (rv == 0 && (rv1 == -1 || count != 0)) ||
3191	    (rv == 1 && count1 == 0)) {
3192		umtxq_lock(&uq->uq_key);
3193		umtxq_unbusy(&uq->uq_key);
3194		umtxq_remove(uq);
3195		umtxq_unlock(&uq->uq_key);
3196		if (rv == 1) {
3197			rv = thread_check_susp(td, true);
3198			if (rv == 0)
3199				goto again;
3200			error = rv;
3201			goto out;
3202		}
3203		if (rv == 0)
3204			rv = rv1;
3205		error = rv == -1 ? EFAULT : 0;
3206		goto out;
3207	}
3208	umtxq_lock(&uq->uq_key);
3209	umtxq_unbusy(&uq->uq_key);
3210
3211	error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo);
3212
3213	if ((uq->uq_flags & UQF_UMTXQ) == 0)
3214		error = 0;
3215	else {
3216		umtxq_remove(uq);
3217		/* A relative timeout cannot be restarted. */
3218		if (error == ERESTART && timeout != NULL &&
3219		    (timeout->_flags & UMTX_ABSTIME) == 0)
3220			error = EINTR;
3221	}
3222	umtxq_unlock(&uq->uq_key);
3223out:
3224	umtx_key_release(&uq->uq_key);
3225	return (error);
3226}
3227
3228/*
3229 * Signal a userland semaphore.
3230 */
3231static int
3232do_sem_wake(struct thread *td, struct _usem *sem)
3233{
3234	struct umtx_key key;
3235	int error, cnt;
3236	uint32_t flags;
3237
3238	error = fueword32(&sem->_flags, &flags);
3239	if (error == -1)
3240		return (EFAULT);
3241	if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0)
3242		return (error);
3243	umtxq_lock(&key);
3244	umtxq_busy(&key);
3245	cnt = umtxq_count(&key);
3246	if (cnt > 0) {
		/*
		 * The wait count is greater than 0, which means the memory
		 * is still being referenced by user code, so we can safely
		 * update the _has_waiters flag.
		 */
3252		if (cnt == 1) {
3253			umtxq_unlock(&key);
3254			error = suword32(&sem->_has_waiters, 0);
3255			umtxq_lock(&key);
3256			if (error == -1)
3257				error = EFAULT;
3258		}
3259		umtxq_signal(&key, 1);
3260	}
3261	umtxq_unbusy(&key);
3262	umtxq_unlock(&key);
3263	umtx_key_release(&key);
3264	return (error);
3265}
3266#endif
3267
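/*
 * Wait on a userland semaphore (struct _usem2): set USEM_HAS_WAITERS
 * and sleep while the count is zero.
 */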
3268static int
3269do_sem2_wait(struct thread *td, struct _usem2 *sem, struct _umtx_time *timeout)
3270{
3271	struct abs_timeout timo;
3272	struct umtx_q *uq;
3273	uint32_t count, flags;
3274	int error, rv;
3275
3276	uq = td->td_umtxq;
3277	flags = fuword32(&sem->_flags);
3278	if (timeout != NULL)
3279		abs_timeout_init2(&timo, timeout);
3280
3281again:
3282	error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key);
3283	if (error != 0)
3284		return (error);
3285	umtxq_lock(&uq->uq_key);
3286	umtxq_busy(&uq->uq_key);
3287	umtxq_insert(uq);
3288	umtxq_unlock(&uq->uq_key);
3289	rv = fueword32(&sem->_count, &count);
3290	if (rv == -1) {
3291		umtxq_lock(&uq->uq_key);
3292		umtxq_unbusy(&uq->uq_key);
3293		umtxq_remove(uq);
3294		umtxq_unlock(&uq->uq_key);
3295		umtx_key_release(&uq->uq_key);
3296		return (EFAULT);
3297	}
3298	for (;;) {
3299		if (USEM_COUNT(count) != 0) {
3300			umtxq_lock(&uq->uq_key);
3301			umtxq_unbusy(&uq->uq_key);
3302			umtxq_remove(uq);
3303			umtxq_unlock(&uq->uq_key);
3304			umtx_key_release(&uq->uq_key);
3305			return (0);
3306		}
3307		if (count == USEM_HAS_WAITERS)
3308			break;
3309		rv = casueword32(&sem->_count, 0, &count, USEM_HAS_WAITERS);
3310		if (rv == 0)
3311			break;
3312		umtxq_lock(&uq->uq_key);
3313		umtxq_unbusy(&uq->uq_key);
3314		umtxq_remove(uq);
3315		umtxq_unlock(&uq->uq_key);
3316		umtx_key_release(&uq->uq_key);
3317		if (rv == -1)
3318			return (EFAULT);
3319		rv = thread_check_susp(td, true);
3320		if (rv != 0)
3321			return (rv);
3322		goto again;
3323	}
3324	umtxq_lock(&uq->uq_key);
3325	umtxq_unbusy(&uq->uq_key);
3326
3327	error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo);
3328
3329	if ((uq->uq_flags & UQF_UMTXQ) == 0)
3330		error = 0;
3331	else {
3332		umtxq_remove(uq);
3333		if (timeout != NULL && (timeout->_flags & UMTX_ABSTIME) == 0) {
3334			/* A relative timeout cannot be restarted. */
3335			if (error == ERESTART)
3336				error = EINTR;
3337			if (error == EINTR) {
3338				abs_timeout_update(&timo);
3339				timespecsub(&timo.end, &timo.cur,
3340				    &timeout->_timeout);
3341			}
3342		}
3343	}
3344	umtxq_unlock(&uq->uq_key);
3345	umtx_key_release(&uq->uq_key);
3346	return (error);
3347}
3348
3349/*
3350 * Signal a userland semaphore.
3351 */
3352static int
3353do_sem2_wake(struct thread *td, struct _usem2 *sem)
3354{
3355	struct umtx_key key;
3356	int error, cnt, rv;
3357	uint32_t count, flags;
3358
3359	rv = fueword32(&sem->_flags, &flags);
3360	if (rv == -1)
3361		return (EFAULT);
3362	if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0)
3363		return (error);
3364	umtxq_lock(&key);
3365	umtxq_busy(&key);
3366	cnt = umtxq_count(&key);
3367	if (cnt > 0) {
3368		/*
3369		 * If this was the last sleeping thread, clear the waiters
3370		 * flag in _count.
3371		 */
3372		if (cnt == 1) {
3373			umtxq_unlock(&key);
3374			rv = fueword32(&sem->_count, &count);
3375			while (rv != -1 && count & USEM_HAS_WAITERS) {
3376				rv = casueword32(&sem->_count, count, &count,
3377				    count & ~USEM_HAS_WAITERS);
3378				if (rv == 1) {
3379					rv = thread_check_susp(td, true);
3380					if (rv != 0)
3381						break;
3382				}
3383			}
3384			if (rv == -1)
3385				error = EFAULT;
3386			else if (rv > 0) {
3387				error = rv;
3388			}
3389			umtxq_lock(&key);
3390		}
3391
3392		umtxq_signal(&key, 1);
3393	}
3394	umtxq_unbusy(&key);
3395	umtxq_unlock(&key);
3396	umtx_key_release(&key);
3397	return (error);
3398}
3399
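/*
 * Copy in a timespec from userland and check that it is well-formed.
 */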
3400inline int
3401umtx_copyin_timeout(const void *uaddr, struct timespec *tsp)
3402{
3403	int error;
3404
3405	error = copyin(uaddr, tsp, sizeof(*tsp));
3406	if (error == 0) {
3407		if (tsp->tv_sec < 0 ||
3408		    tsp->tv_nsec >= 1000000000 ||
3409		    tsp->tv_nsec < 0)
3410			error = EINVAL;
3411	}
3412	return (error);
3413}
3414
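/*
 * Copy in a struct _umtx_time from userland.  A plain timespec is also
 * accepted, in which case CLOCK_REALTIME and no flags are assumed.
 */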
3415static inline int
3416umtx_copyin_umtx_time(const void *uaddr, size_t size, struct _umtx_time *tp)
3417{
3418	int error;
3419
3420	if (size <= sizeof(tp->_timeout)) {
3421		tp->_clockid = CLOCK_REALTIME;
3422		tp->_flags = 0;
3423		error = copyin(uaddr, &tp->_timeout, sizeof(tp->_timeout));
3424	} else
3425		error = copyin(uaddr, tp, sizeof(*tp));
3426	if (error != 0)
3427		return (error);
3428	if (tp->_timeout.tv_sec < 0 ||
3429	    tp->_timeout.tv_nsec >= 1000000000 || tp->_timeout.tv_nsec < 0)
3430		return (EINVAL);
3431	return (0);
3432}
3433
3434static int
3435umtx_copyin_robust_lists(const void *uaddr, size_t size,
3436    struct umtx_robust_lists_params *rb)
3437{
3438
3439	if (size > sizeof(*rb))
3440		return (EINVAL);
3441	return (copyin(uaddr, rb, size));
3442}
3443
3444static int
3445umtx_copyout_timeout(void *uaddr, size_t sz, struct timespec *tsp)
3446{
3447
	/*
	 * The caller should guarantee that sz == uaddr1 - sizeof(_umtx_time),
	 * and we are only called if sz >= sizeof(timespec), as supplied in
	 * the copyops.
	 */
3453	KASSERT(sz >= sizeof(*tsp),
3454	    ("umtx_copyops specifies incorrect sizes"));
3455
3456	return (copyout(tsp, uaddr, sizeof(*tsp)));
3457}
3458
3459static int
3460__umtx_op_unimpl(struct thread *td, struct _umtx_op_args *uap,
3461    const struct umtx_copyops *ops __unused)
3462{
3463
3464	return (EOPNOTSUPP);
3465}
3466
3467static int
3468__umtx_op_wait(struct thread *td, struct _umtx_op_args *uap,
3469    const struct umtx_copyops *ops)
3470{
3471	struct _umtx_time timeout, *tm_p;
3472	int error;
3473
3474	if (uap->uaddr2 == NULL)
3475		tm_p = NULL;
3476	else {
3477		error = ops->copyin_umtx_time(
3478		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3479		if (error != 0)
3480			return (error);
3481		tm_p = &timeout;
3482	}
3483	return (do_wait(td, uap->obj, uap->val, tm_p, ops->compat32, 0));
3484}
3485
3486static int
3487__umtx_op_wait_uint(struct thread *td, struct _umtx_op_args *uap,
3488    const struct umtx_copyops *ops)
3489{
3490	struct _umtx_time timeout, *tm_p;
3491	int error;
3492
3493	if (uap->uaddr2 == NULL)
3494		tm_p = NULL;
3495	else {
3496		error = ops->copyin_umtx_time(
3497		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3498		if (error != 0)
3499			return (error);
3500		tm_p = &timeout;
3501	}
3502	return (do_wait(td, uap->obj, uap->val, tm_p, 1, 0));
3503}
3504
3505static int
3506__umtx_op_wait_uint_private(struct thread *td, struct _umtx_op_args *uap,
3507    const struct umtx_copyops *ops)
3508{
3509	struct _umtx_time *tm_p, timeout;
3510	int error;
3511
3512	if (uap->uaddr2 == NULL)
3513		tm_p = NULL;
3514	else {
3515		error = ops->copyin_umtx_time(
3516		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3517		if (error != 0)
3518			return (error);
3519		tm_p = &timeout;
3520	}
3521	return (do_wait(td, uap->obj, uap->val, tm_p, 1, 1));
3522}
3523
3524static int
3525__umtx_op_wake(struct thread *td, struct _umtx_op_args *uap,
3526    const struct umtx_copyops *ops __unused)
3527{
3528
3529	return (kern_umtx_wake(td, uap->obj, uap->val, 0));
3530}
3531
3532#define BATCH_SIZE	128
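/*
 * Wake up waiters on an array of private umtx addresses passed in from
 * userland, copying the pointers in BATCH_SIZE-sized chunks.
 */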
3533static int
3534__umtx_op_nwake_private_native(struct thread *td, struct _umtx_op_args *uap)
3535{
3536	char *uaddrs[BATCH_SIZE], **upp;
3537	int count, error, i, pos, tocopy;
3538
3539	upp = (char **)uap->obj;
3540	error = 0;
3541	for (count = uap->val, pos = 0; count > 0; count -= tocopy,
3542	    pos += tocopy) {
3543		tocopy = MIN(count, BATCH_SIZE);
3544		error = copyin(upp + pos, uaddrs, tocopy * sizeof(char *));
3545		if (error != 0)
3546			break;
3547		for (i = 0; i < tocopy; ++i) {
3548			kern_umtx_wake(td, uaddrs[i], INT_MAX, 1);
3549		}
3550		maybe_yield();
3551	}
3552	return (error);
3553}
3554
3555static int
3556__umtx_op_nwake_private_compat32(struct thread *td, struct _umtx_op_args *uap)
3557{
3558	uint32_t uaddrs[BATCH_SIZE], *upp;
3559	int count, error, i, pos, tocopy;
3560
3561	upp = (uint32_t *)uap->obj;
3562	error = 0;
3563	for (count = uap->val, pos = 0; count > 0; count -= tocopy,
3564	    pos += tocopy) {
3565		tocopy = MIN(count, BATCH_SIZE);
3566		error = copyin(upp + pos, uaddrs, tocopy * sizeof(uint32_t));
3567		if (error != 0)
3568			break;
3569		for (i = 0; i < tocopy; ++i) {
3570			kern_umtx_wake(td, (void *)(uintptr_t)uaddrs[i],
3571			    INT_MAX, 1);
3572		}
3573		maybe_yield();
3574	}
3575	return (error);
3576}
3577
3578static int
3579__umtx_op_nwake_private(struct thread *td, struct _umtx_op_args *uap,
3580    const struct umtx_copyops *ops)
3581{
3582
3583	if (ops->compat32)
3584		return (__umtx_op_nwake_private_compat32(td, uap));
3585	return (__umtx_op_nwake_private_native(td, uap));
3586}
3587
3588static int
3589__umtx_op_wake_private(struct thread *td, struct _umtx_op_args *uap,
3590    const struct umtx_copyops *ops __unused)
3591{
3592
3593	return (kern_umtx_wake(td, uap->obj, uap->val, 1));
3594}
3595
3596static int
3597__umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap,
3598   const struct umtx_copyops *ops)
3599{
3600	struct _umtx_time *tm_p, timeout;
3601	int error;
3602
3603	/* Allow a null timespec (wait forever). */
3604	if (uap->uaddr2 == NULL)
3605		tm_p = NULL;
3606	else {
3607		error = ops->copyin_umtx_time(
3608		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3609		if (error != 0)
3610			return (error);
3611		tm_p = &timeout;
3612	}
3613	return (do_lock_umutex(td, uap->obj, tm_p, 0));
3614}
3615
3616static int
3617__umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap,
3618    const struct umtx_copyops *ops __unused)
3619{
3620
3621	return (do_lock_umutex(td, uap->obj, NULL, _UMUTEX_TRY));
3622}
3623
3624static int
3625__umtx_op_wait_umutex(struct thread *td, struct _umtx_op_args *uap,
3626    const struct umtx_copyops *ops)
3627{
3628	struct _umtx_time *tm_p, timeout;
3629	int error;
3630
3631	/* Allow a null timespec (wait forever). */
3632	if (uap->uaddr2 == NULL)
3633		tm_p = NULL;
3634	else {
3635		error = ops->copyin_umtx_time(
3636		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3637		if (error != 0)
3638			return (error);
3639		tm_p = &timeout;
3640	}
3641	return (do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT));
3642}
3643
3644static int
3645__umtx_op_wake_umutex(struct thread *td, struct _umtx_op_args *uap,
3646    const struct umtx_copyops *ops __unused)
3647{
3648
3649	return (do_wake_umutex(td, uap->obj));
3650}
3651
3652static int
3653__umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap,
3654    const struct umtx_copyops *ops __unused)
3655{
3656
3657	return (do_unlock_umutex(td, uap->obj, false));
3658}
3659
3660static int
3661__umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap,
3662    const struct umtx_copyops *ops __unused)
3663{
3664
3665	return (do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1));
3666}
3667
3668static int
3669__umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap,
3670    const struct umtx_copyops *ops)
3671{
3672	struct timespec *ts, timeout;
3673	int error;
3674
3675	/* Allow a null timespec (wait forever). */
3676	if (uap->uaddr2 == NULL)
3677		ts = NULL;
3678	else {
3679		error = ops->copyin_timeout(uap->uaddr2, &timeout);
3680		if (error != 0)
3681			return (error);
3682		ts = &timeout;
3683	}
3684	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3685}
3686
3687static int
3688__umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap,
3689    const struct umtx_copyops *ops __unused)
3690{
3691
3692	return (do_cv_signal(td, uap->obj));
3693}
3694
3695static int
3696__umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap,
3697    const struct umtx_copyops *ops __unused)
3698{
3699
3700	return (do_cv_broadcast(td, uap->obj));
3701}
3702
3703static int
3704__umtx_op_rw_rdlock(struct thread *td, struct _umtx_op_args *uap,
3705    const struct umtx_copyops *ops)
3706{
3707	struct _umtx_time timeout;
3708	int error;
3709
3710	/* Allow a null timespec (wait forever). */
3711	if (uap->uaddr2 == NULL) {
3712		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3713	} else {
3714		error = ops->copyin_umtx_time(uap->uaddr2,
3715		   (size_t)uap->uaddr1, &timeout);
3716		if (error != 0)
3717			return (error);
3718		error = do_rw_rdlock(td, uap->obj, uap->val, &timeout);
3719	}
3720	return (error);
3721}
3722
3723static int
3724__umtx_op_rw_wrlock(struct thread *td, struct _umtx_op_args *uap,
3725    const struct umtx_copyops *ops)
3726{
3727	struct _umtx_time timeout;
3728	int error;
3729
3730	/* Allow a null timespec (wait forever). */
3731	if (uap->uaddr2 == NULL) {
3732		error = do_rw_wrlock(td, uap->obj, 0);
3733	} else {
3734		error = ops->copyin_umtx_time(uap->uaddr2,
3735		   (size_t)uap->uaddr1, &timeout);
3736		if (error != 0)
3737			return (error);
3738
3739		error = do_rw_wrlock(td, uap->obj, &timeout);
3740	}
3741	return (error);
3742}
3743
3744static int
3745__umtx_op_rw_unlock(struct thread *td, struct _umtx_op_args *uap,
3746    const struct umtx_copyops *ops __unused)
3747{
3748
3749	return (do_rw_unlock(td, uap->obj));
3750}
3751
3752#if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
3753static int
3754__umtx_op_sem_wait(struct thread *td, struct _umtx_op_args *uap,
3755    const struct umtx_copyops *ops)
3756{
3757	struct _umtx_time *tm_p, timeout;
3758	int error;
3759
3760	/* Allow a null timespec (wait forever). */
3761	if (uap->uaddr2 == NULL)
3762		tm_p = NULL;
3763	else {
3764		error = ops->copyin_umtx_time(
3765		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3766		if (error != 0)
3767			return (error);
3768		tm_p = &timeout;
3769	}
3770	return (do_sem_wait(td, uap->obj, tm_p));
3771}
3772
3773static int
3774__umtx_op_sem_wake(struct thread *td, struct _umtx_op_args *uap,
3775    const struct umtx_copyops *ops __unused)
3776{
3777
3778	return (do_sem_wake(td, uap->obj));
3779}
3780#endif
3781
3782static int
3783__umtx_op_wake2_umutex(struct thread *td, struct _umtx_op_args *uap,
3784    const struct umtx_copyops *ops __unused)
3785{
3786
3787	return (do_wake2_umutex(td, uap->obj, uap->val));
3788}
3789
3790static int
3791__umtx_op_sem2_wait(struct thread *td, struct _umtx_op_args *uap,
3792    const struct umtx_copyops *ops)
3793{
3794	struct _umtx_time *tm_p, timeout;
3795	size_t uasize;
3796	int error;
3797
3798	/* Allow a null timespec (wait forever). */
3799	if (uap->uaddr2 == NULL) {
3800		uasize = 0;
3801		tm_p = NULL;
3802	} else {
3803		uasize = (size_t)uap->uaddr1;
3804		error = ops->copyin_umtx_time(uap->uaddr2, uasize, &timeout);
3805		if (error != 0)
3806			return (error);
3807		tm_p = &timeout;
3808	}
3809	error = do_sem2_wait(td, uap->obj, tm_p);
3810	if (error == EINTR && uap->uaddr2 != NULL &&
3811	    (timeout._flags & UMTX_ABSTIME) == 0 &&
3812	    uasize >= ops->umtx_time_sz + ops->timespec_sz) {
3813		error = ops->copyout_timeout(
3814		    (void *)((uintptr_t)uap->uaddr2 + ops->umtx_time_sz),
3815		    uasize - ops->umtx_time_sz, &timeout._timeout);
3816		if (error == 0) {
3817			error = EINTR;
3818		}
3819	}
3820
3821	return (error);
3822}
3823
3824static int
3825__umtx_op_sem2_wake(struct thread *td, struct _umtx_op_args *uap,
3826    const struct umtx_copyops *ops __unused)
3827{
3828
3829	return (do_sem2_wake(td, uap->obj));
3830}
3831
3832#define	USHM_OBJ_UMTX(o)						\
3833    ((struct umtx_shm_obj_list *)(&(o)->umtx_data))
3834
3835#define	USHMF_REG_LINKED	0x0001
3836#define	USHMF_OBJ_LINKED	0x0002
3837struct umtx_shm_reg {
3838	TAILQ_ENTRY(umtx_shm_reg) ushm_reg_link;
3839	LIST_ENTRY(umtx_shm_reg) ushm_obj_link;
3840	struct umtx_key		ushm_key;
3841	struct ucred		*ushm_cred;
3842	struct shmfd		*ushm_obj;
3843	u_int			ushm_refcnt;
3844	u_int			ushm_flags;
3845};
3846
3847LIST_HEAD(umtx_shm_obj_list, umtx_shm_reg);
3848TAILQ_HEAD(umtx_shm_reg_head, umtx_shm_reg);
3849
3850static uma_zone_t umtx_shm_reg_zone;
3851static struct umtx_shm_reg_head umtx_shm_registry[UMTX_CHAINS];
3852static struct mtx umtx_shm_lock;
3853static struct umtx_shm_reg_head umtx_shm_reg_delfree =
3854    TAILQ_HEAD_INITIALIZER(umtx_shm_reg_delfree);
3855
3856static void umtx_shm_free_reg(struct umtx_shm_reg *reg);
3857
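/*
 * Taskqueue callback: free shared-memory registry entries that were
 * queued for deferred destruction.
 */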
3858static void
3859umtx_shm_reg_delfree_tq(void *context __unused, int pending __unused)
3860{
3861	struct umtx_shm_reg_head d;
3862	struct umtx_shm_reg *reg, *reg1;
3863
3864	TAILQ_INIT(&d);
3865	mtx_lock(&umtx_shm_lock);
3866	TAILQ_CONCAT(&d, &umtx_shm_reg_delfree, ushm_reg_link);
3867	mtx_unlock(&umtx_shm_lock);
3868	TAILQ_FOREACH_SAFE(reg, &d, ushm_reg_link, reg1) {
3869		TAILQ_REMOVE(&d, reg, ushm_reg_link);
3870		umtx_shm_free_reg(reg);
3871	}
3872}
3873
3874static struct task umtx_shm_reg_delfree_task =
3875    TASK_INITIALIZER(0, umtx_shm_reg_delfree_tq, NULL);
3876
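/*
 * Look up the shared-memory registry entry for a shared key and take a
 * new reference on it.  The umtx_shm_lock must be held.
 */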
3877static struct umtx_shm_reg *
3878umtx_shm_find_reg_locked(const struct umtx_key *key)
3879{
3880	struct umtx_shm_reg *reg;
3881	struct umtx_shm_reg_head *reg_head;
3882
3883	KASSERT(key->shared, ("umtx_p_find_rg: private key"));
3884	mtx_assert(&umtx_shm_lock, MA_OWNED);
3885	reg_head = &umtx_shm_registry[key->hash];
3886	TAILQ_FOREACH(reg, reg_head, ushm_reg_link) {
3887		KASSERT(reg->ushm_key.shared,
3888		    ("non-shared key on reg %p %d", reg, reg->ushm_key.shared));
3889		if (reg->ushm_key.info.shared.object ==
3890		    key->info.shared.object &&
3891		    reg->ushm_key.info.shared.offset ==
3892		    key->info.shared.offset) {
3893			KASSERT(reg->ushm_key.type == TYPE_SHM, ("TYPE_USHM"));
3894			KASSERT(reg->ushm_refcnt > 0,
3895			    ("reg %p refcnt 0 onlist", reg));
3896			KASSERT((reg->ushm_flags & USHMF_REG_LINKED) != 0,
3897			    ("reg %p not linked", reg));
3898			reg->ushm_refcnt++;
3899			return (reg);
3900		}
3901	}
3902	return (NULL);
3903}
3904
3905static struct umtx_shm_reg *
3906umtx_shm_find_reg(const struct umtx_key *key)
3907{
3908	struct umtx_shm_reg *reg;
3909
3910	mtx_lock(&umtx_shm_lock);
3911	reg = umtx_shm_find_reg_locked(key);
3912	mtx_unlock(&umtx_shm_lock);
3913	return (reg);
3914}
3915
3916static void
3917umtx_shm_free_reg(struct umtx_shm_reg *reg)
3918{
3919
3920	chgumtxcnt(reg->ushm_cred->cr_ruidinfo, -1, 0);
3921	crfree(reg->ushm_cred);
3922	shm_drop(reg->ushm_obj);
3923	uma_zfree(umtx_shm_reg_zone, reg);
3924}
3925
3926static bool
3927umtx_shm_unref_reg_locked(struct umtx_shm_reg *reg, bool force)
3928{
3929	bool res;
3930
3931	mtx_assert(&umtx_shm_lock, MA_OWNED);
3932	KASSERT(reg->ushm_refcnt > 0, ("ushm_reg %p refcnt 0", reg));
3933	reg->ushm_refcnt--;
3934	res = reg->ushm_refcnt == 0;
3935	if (res || force) {
3936		if ((reg->ushm_flags & USHMF_REG_LINKED) != 0) {
3937			TAILQ_REMOVE(&umtx_shm_registry[reg->ushm_key.hash],
3938			    reg, ushm_reg_link);
3939			reg->ushm_flags &= ~USHMF_REG_LINKED;
3940		}
3941		if ((reg->ushm_flags & USHMF_OBJ_LINKED) != 0) {
3942			LIST_REMOVE(reg, ushm_obj_link);
3943			reg->ushm_flags &= ~USHMF_OBJ_LINKED;
3944		}
3945	}
3946	return (res);
3947}
3948
3949static void
3950umtx_shm_unref_reg(struct umtx_shm_reg *reg, bool force)
3951{
3952	vm_object_t object;
3953	bool dofree;
3954
3955	if (force) {
3956		object = reg->ushm_obj->shm_object;
3957		VM_OBJECT_WLOCK(object);
3958		object->flags |= OBJ_UMTXDEAD;
3959		VM_OBJECT_WUNLOCK(object);
3960	}
3961	mtx_lock(&umtx_shm_lock);
3962	dofree = umtx_shm_unref_reg_locked(reg, force);
3963	mtx_unlock(&umtx_shm_lock);
3964	if (dofree)
3965		umtx_shm_free_reg(reg);
3966}
3967
3968void
3969umtx_shm_object_init(vm_object_t object)
3970{
3971
3972	LIST_INIT(USHM_OBJ_UMTX(object));
3973}
3974
3975void
3976umtx_shm_object_terminated(vm_object_t object)
3977{
3978	struct umtx_shm_reg *reg, *reg1;
3979	bool dofree;
3980
3981	if (LIST_EMPTY(USHM_OBJ_UMTX(object)))
3982		return;
3983
3984	dofree = false;
3985	mtx_lock(&umtx_shm_lock);
3986	LIST_FOREACH_SAFE(reg, USHM_OBJ_UMTX(object), ushm_obj_link, reg1) {
3987		if (umtx_shm_unref_reg_locked(reg, true)) {
3988			TAILQ_INSERT_TAIL(&umtx_shm_reg_delfree, reg,
3989			    ushm_reg_link);
3990			dofree = true;
3991		}
3992	}
3993	mtx_unlock(&umtx_shm_lock);
3994	if (dofree)
3995		taskqueue_enqueue(taskqueue_thread, &umtx_shm_reg_delfree_task);
3996}
3997
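/*
 * Find or create the shared-memory registry entry for a key.  A new
 * entry is backed by a one-page anonymous shm object and is charged
 * against the RLIMIT_UMTXP limit of the creating user.
 */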
3998static int
3999umtx_shm_create_reg(struct thread *td, const struct umtx_key *key,
4000    struct umtx_shm_reg **res)
4001{
4002	struct umtx_shm_reg *reg, *reg1;
4003	struct ucred *cred;
4004	int error;
4005
4006	reg = umtx_shm_find_reg(key);
4007	if (reg != NULL) {
4008		*res = reg;
4009		return (0);
4010	}
4011	cred = td->td_ucred;
4012	if (!chgumtxcnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_UMTXP)))
4013		return (ENOMEM);
4014	reg = uma_zalloc(umtx_shm_reg_zone, M_WAITOK | M_ZERO);
4015	reg->ushm_refcnt = 1;
4016	bcopy(key, &reg->ushm_key, sizeof(*key));
4017	reg->ushm_obj = shm_alloc(td->td_ucred, O_RDWR, false);
4018	reg->ushm_cred = crhold(cred);
4019	error = shm_dotruncate(reg->ushm_obj, PAGE_SIZE);
4020	if (error != 0) {
4021		umtx_shm_free_reg(reg);
4022		return (error);
4023	}
4024	mtx_lock(&umtx_shm_lock);
4025	reg1 = umtx_shm_find_reg_locked(key);
4026	if (reg1 != NULL) {
4027		mtx_unlock(&umtx_shm_lock);
4028		umtx_shm_free_reg(reg);
4029		*res = reg1;
4030		return (0);
4031	}
4032	reg->ushm_refcnt++;
4033	TAILQ_INSERT_TAIL(&umtx_shm_registry[key->hash], reg, ushm_reg_link);
4034	LIST_INSERT_HEAD(USHM_OBJ_UMTX(key->info.shared.object), reg,
4035	    ushm_obj_link);
4036	reg->ushm_flags = USHMF_REG_LINKED | USHMF_OBJ_LINKED;
4037	mtx_unlock(&umtx_shm_lock);
4038	*res = reg;
4039	return (0);
4040}
4041
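/*
 * Check whether the VM object backing the given userland address is
 * still alive, i.e. has not been marked OBJ_UMTXDEAD.
 */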
4042static int
4043umtx_shm_alive(struct thread *td, void *addr)
4044{
4045	vm_map_t map;
4046	vm_map_entry_t entry;
4047	vm_object_t object;
4048	vm_pindex_t pindex;
4049	vm_prot_t prot;
4050	int res, ret;
4051	boolean_t wired;
4052
4053	map = &td->td_proc->p_vmspace->vm_map;
4054	res = vm_map_lookup(&map, (uintptr_t)addr, VM_PROT_READ, &entry,
4055	    &object, &pindex, &prot, &wired);
4056	if (res != KERN_SUCCESS)
4057		return (EFAULT);
4058	if (object == NULL)
4059		ret = EINVAL;
4060	else
4061		ret = (object->flags & OBJ_UMTXDEAD) != 0 ? ENOTTY : 0;
4062	vm_map_lookup_done(map, entry);
4063	return (ret);
4064}
4065
4066static void
4067umtx_shm_init(void)
4068{
4069	int i;
4070
4071	umtx_shm_reg_zone = uma_zcreate("umtx_shm", sizeof(struct umtx_shm_reg),
4072	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
4073	mtx_init(&umtx_shm_lock, "umtxshm", NULL, MTX_DEF);
4074	for (i = 0; i < nitems(umtx_shm_registry); i++)
4075		TAILQ_INIT(&umtx_shm_registry[i]);
4076}
4077
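/*
 * Implement UMTX_OP_SHM: create, look up, destroy or check liveness of
 * the anonymous shared memory object keyed by a userland address.  On
 * create and lookup, a file descriptor referencing the object is
 * returned to the caller.
 */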
4078static int
4079umtx_shm(struct thread *td, void *addr, u_int flags)
4080{
4081	struct umtx_key key;
4082	struct umtx_shm_reg *reg;
4083	struct file *fp;
4084	int error, fd;
4085
4086	if (__bitcount(flags & (UMTX_SHM_CREAT | UMTX_SHM_LOOKUP |
	    UMTX_SHM_DESTROY | UMTX_SHM_ALIVE)) != 1)
4088		return (EINVAL);
4089	if ((flags & UMTX_SHM_ALIVE) != 0)
4090		return (umtx_shm_alive(td, addr));
4091	error = umtx_key_get(addr, TYPE_SHM, PROCESS_SHARE, &key);
4092	if (error != 0)
4093		return (error);
4094	KASSERT(key.shared == 1, ("non-shared key"));
4095	if ((flags & UMTX_SHM_CREAT) != 0) {
4096		error = umtx_shm_create_reg(td, &key, &reg);
4097	} else {
4098		reg = umtx_shm_find_reg(&key);
4099		if (reg == NULL)
4100			error = ESRCH;
4101	}
4102	umtx_key_release(&key);
4103	if (error != 0)
4104		return (error);
4105	KASSERT(reg != NULL, ("no reg"));
4106	if ((flags & UMTX_SHM_DESTROY) != 0) {
4107		umtx_shm_unref_reg(reg, true);
4108	} else {
4109#if 0
4110#ifdef MAC
4111		error = mac_posixshm_check_open(td->td_ucred,
4112		    reg->ushm_obj, FFLAGS(O_RDWR));
4113		if (error == 0)
4114#endif
4115			error = shm_access(reg->ushm_obj, td->td_ucred,
4116			    FFLAGS(O_RDWR));
4117		if (error == 0)
4118#endif
4119			error = falloc_caps(td, &fp, &fd, O_CLOEXEC, NULL);
4120		if (error == 0) {
4121			shm_hold(reg->ushm_obj);
4122			finit(fp, FFLAGS(O_RDWR), DTYPE_SHM, reg->ushm_obj,
4123			    &shm_ops);
4124			td->td_retval[0] = fd;
4125			fdrop(fp, td);
4126		}
4127	}
4128	umtx_shm_unref_reg(reg, false);
4129	return (error);
4130}
4131
4132static int
4133__umtx_op_shm(struct thread *td, struct _umtx_op_args *uap,
4134    const struct umtx_copyops *ops __unused)
4135{
4136
4137	return (umtx_shm(td, uap->uaddr1, uap->val));
4138}
4139
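/*
 * Register the calling thread's robust mutex list pointers, rejecting
 * attempts to mix 32-bit and native registrations.
 */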
4140static int
4141__umtx_op_robust_lists(struct thread *td, struct _umtx_op_args *uap,
4142    const struct umtx_copyops *ops)
4143{
4144	struct umtx_robust_lists_params rb;
4145	int error;
4146
4147	if (ops->compat32) {
4148		if ((td->td_pflags2 & TDP2_COMPAT32RB) == 0 &&
4149		    (td->td_rb_list != 0 || td->td_rbp_list != 0 ||
4150		    td->td_rb_inact != 0))
4151			return (EBUSY);
4152	} else if ((td->td_pflags2 & TDP2_COMPAT32RB) != 0) {
4153		return (EBUSY);
4154	}
4155
4156	bzero(&rb, sizeof(rb));
4157	error = ops->copyin_robust_lists(uap->uaddr1, uap->val, &rb);
4158	if (error != 0)
4159		return (error);
4160
4161	if (ops->compat32)
4162		td->td_pflags2 |= TDP2_COMPAT32RB;
4163
4164	td->td_rb_list = rb.robust_list_offset;
4165	td->td_rbp_list = rb.robust_priv_list_offset;
4166	td->td_rb_inact = rb.robust_inact_offset;
4167	return (0);
4168}
4169
4170#if defined(__i386__) || defined(__amd64__)
4171/*
4172 * Provide the standard 32-bit definitions for x86, since native/compat32 use a
4173 * 32-bit time_t there.  Other architectures just need the i386 definitions
4174 * along with their standard compat32.
4175 */
4176struct timespecx32 {
4177	int64_t			tv_sec;
4178	int32_t			tv_nsec;
4179};
4180
4181struct umtx_timex32 {
4182	struct	timespecx32	_timeout;
4183	uint32_t		_flags;
4184	uint32_t		_clockid;
4185};
4186
4187#ifndef __i386__
4188#define	timespeci386	timespec32
4189#define	umtx_timei386	umtx_time32
4190#endif
4191#else /* !__i386__ && !__amd64__ */
4192/* 32-bit architectures can emulate i386, so define these almost everywhere. */
4193struct timespeci386 {
4194	int32_t			tv_sec;
4195	int32_t			tv_nsec;
4196};
4197
4198struct umtx_timei386 {
4199	struct	timespeci386	_timeout;
4200	uint32_t		_flags;
4201	uint32_t		_clockid;
4202};
4203
4204#if defined(__LP64__)
4205#define	timespecx32	timespec32
4206#define	umtx_timex32	umtx_time32
4207#endif
4208#endif
4209
4210static int
4211umtx_copyin_robust_lists32(const void *uaddr, size_t size,
4212    struct umtx_robust_lists_params *rbp)
4213{
4214	struct umtx_robust_lists_params_compat32 rb32;
4215	int error;
4216
4217	if (size > sizeof(rb32))
4218		return (EINVAL);
4219	bzero(&rb32, sizeof(rb32));
4220	error = copyin(uaddr, &rb32, size);
4221	if (error != 0)
4222		return (error);
4223	CP(rb32, *rbp, robust_list_offset);
4224	CP(rb32, *rbp, robust_priv_list_offset);
4225	CP(rb32, *rbp, robust_inact_offset);
4226	return (0);
4227}
4228
4229#ifndef __i386__
4230static inline int
4231umtx_copyin_timeouti386(const void *uaddr, struct timespec *tsp)
4232{
4233	struct timespeci386 ts32;
4234	int error;
4235
4236	error = copyin(uaddr, &ts32, sizeof(ts32));
4237	if (error == 0) {
4238		if (ts32.tv_sec < 0 ||
4239		    ts32.tv_nsec >= 1000000000 ||
4240		    ts32.tv_nsec < 0)
4241			error = EINVAL;
4242		else {
4243			CP(ts32, *tsp, tv_sec);
4244			CP(ts32, *tsp, tv_nsec);
4245		}
4246	}
4247	return (error);
4248}
4249
4250static inline int
4251umtx_copyin_umtx_timei386(const void *uaddr, size_t size, struct _umtx_time *tp)
4252{
4253	struct umtx_timei386 t32;
4254	int error;
4255
4256	t32._clockid = CLOCK_REALTIME;
4257	t32._flags   = 0;
4258	if (size <= sizeof(t32._timeout))
4259		error = copyin(uaddr, &t32._timeout, sizeof(t32._timeout));
4260	else
4261		error = copyin(uaddr, &t32, sizeof(t32));
4262	if (error != 0)
4263		return (error);
4264	if (t32._timeout.tv_sec < 0 ||
4265	    t32._timeout.tv_nsec >= 1000000000 || t32._timeout.tv_nsec < 0)
4266		return (EINVAL);
4267	TS_CP(t32, *tp, _timeout);
4268	CP(t32, *tp, _flags);
4269	CP(t32, *tp, _clockid);
4270	return (0);
4271}
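
/*
 * Illustration only: the copyin above accepts either a bare timespec
 * (treated as a relative CLOCK_REALTIME timeout) or a full umtx_time,
 * with the user-supplied size discriminating between the two.  A
 * hypothetical native sketch of the two call shapes (the size travels
 * in uaddr1, the pointer in uaddr2):
 *
 *	struct timespec ts = { .tv_sec = 1 };
 *	_umtx_op(&word, UMTX_OP_WAIT_UINT_PRIVATE, expected,
 *	    (void *)sizeof(ts), &ts);
 *
 *	struct _umtx_time ut = {
 *		._timeout = { .tv_sec = 1 },
 *		._flags = UMTX_ABSTIME,
 *		._clockid = CLOCK_MONOTONIC,
 *	};
 *	_umtx_op(&word, UMTX_OP_WAIT_UINT_PRIVATE, expected,
 *	    (void *)sizeof(ut), &ut);
 *
 * An emulator running i386 guest binaries would additionally OR
 * UMTX_OP__I386 into the op so that this i386-layout copyin is chosen.
 */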
4272
4273static int
4274umtx_copyout_timeouti386(void *uaddr, size_t sz, struct timespec *tsp)
4275{
4276	struct timespeci386 remain32 = {
4277		.tv_sec = tsp->tv_sec,
4278		.tv_nsec = tsp->tv_nsec,
4279	};
4280
	/*
	 * Guaranteed by the caller: sz is the user-supplied buffer size
	 * (passed in uaddr1) minus the umtx_time size from the copyops,
	 * and we are only called when sz is at least the timespec size
	 * advertised by the copyops.
	 */
4286	KASSERT(sz >= sizeof(remain32),
4287	    ("umtx_copyops specifies incorrect sizes"));
4288
4289	return (copyout(&remain32, uaddr, sizeof(remain32)));
4290}
4291#endif /* !__i386__ */
4292
4293#if defined(__i386__) || defined(__LP64__)
4294static inline int
4295umtx_copyin_timeoutx32(const void *uaddr, struct timespec *tsp)
4296{
4297	struct timespecx32 ts32;
4298	int error;
4299
4300	error = copyin(uaddr, &ts32, sizeof(ts32));
4301	if (error == 0) {
4302		if (ts32.tv_sec < 0 ||
4303		    ts32.tv_nsec >= 1000000000 ||
4304		    ts32.tv_nsec < 0)
4305			error = EINVAL;
4306		else {
4307			CP(ts32, *tsp, tv_sec);
4308			CP(ts32, *tsp, tv_nsec);
4309		}
4310	}
4311	return (error);
4312}
4313
4314static inline int
4315umtx_copyin_umtx_timex32(const void *uaddr, size_t size, struct _umtx_time *tp)
4316{
4317	struct umtx_timex32 t32;
4318	int error;
4319
4320	t32._clockid = CLOCK_REALTIME;
4321	t32._flags   = 0;
4322	if (size <= sizeof(t32._timeout))
4323		error = copyin(uaddr, &t32._timeout, sizeof(t32._timeout));
4324	else
4325		error = copyin(uaddr, &t32, sizeof(t32));
4326	if (error != 0)
4327		return (error);
4328	if (t32._timeout.tv_sec < 0 ||
4329	    t32._timeout.tv_nsec >= 1000000000 || t32._timeout.tv_nsec < 0)
4330		return (EINVAL);
4331	TS_CP(t32, *tp, _timeout);
4332	CP(t32, *tp, _flags);
4333	CP(t32, *tp, _clockid);
4334	return (0);
4335}
4336
4337static int
4338umtx_copyout_timeoutx32(void *uaddr, size_t sz, struct timespec *tsp)
4339{
4340	struct timespecx32 remain32 = {
4341		.tv_sec = tsp->tv_sec,
4342		.tv_nsec = tsp->tv_nsec,
4343	};
4344
	/*
	 * Guaranteed by the caller: sz is the user-supplied buffer size
	 * (passed in uaddr1) minus the umtx_time size from the copyops,
	 * and we are only called when sz is at least the timespec size
	 * advertised by the copyops.
	 */
4350	KASSERT(sz >= sizeof(remain32),
4351	    ("umtx_copyops specifies incorrect sizes"));
4352
4353	return (copyout(&remain32, uaddr, sizeof(remain32)));
4354}
4355#endif /* __i386__ || __LP64__ */
4356
4357typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap,
4358    const struct umtx_copyops *umtx_ops);
4359
4360static const _umtx_op_func op_table[] = {
4361	[UMTX_OP_RESERVED0]	= __umtx_op_unimpl,
4362	[UMTX_OP_RESERVED1]	= __umtx_op_unimpl,
4363	[UMTX_OP_WAIT]		= __umtx_op_wait,
4364	[UMTX_OP_WAKE]		= __umtx_op_wake,
4365	[UMTX_OP_MUTEX_TRYLOCK]	= __umtx_op_trylock_umutex,
4366	[UMTX_OP_MUTEX_LOCK]	= __umtx_op_lock_umutex,
4367	[UMTX_OP_MUTEX_UNLOCK]	= __umtx_op_unlock_umutex,
4368	[UMTX_OP_SET_CEILING]	= __umtx_op_set_ceiling,
4369	[UMTX_OP_CV_WAIT]	= __umtx_op_cv_wait,
4370	[UMTX_OP_CV_SIGNAL]	= __umtx_op_cv_signal,
4371	[UMTX_OP_CV_BROADCAST]	= __umtx_op_cv_broadcast,
4372	[UMTX_OP_WAIT_UINT]	= __umtx_op_wait_uint,
4373	[UMTX_OP_RW_RDLOCK]	= __umtx_op_rw_rdlock,
4374	[UMTX_OP_RW_WRLOCK]	= __umtx_op_rw_wrlock,
4375	[UMTX_OP_RW_UNLOCK]	= __umtx_op_rw_unlock,
4376	[UMTX_OP_WAIT_UINT_PRIVATE] = __umtx_op_wait_uint_private,
4377	[UMTX_OP_WAKE_PRIVATE]	= __umtx_op_wake_private,
4378	[UMTX_OP_MUTEX_WAIT]	= __umtx_op_wait_umutex,
4379	[UMTX_OP_MUTEX_WAKE]	= __umtx_op_wake_umutex,
4380#if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
4381	[UMTX_OP_SEM_WAIT]	= __umtx_op_sem_wait,
4382	[UMTX_OP_SEM_WAKE]	= __umtx_op_sem_wake,
4383#else
4384	[UMTX_OP_SEM_WAIT]	= __umtx_op_unimpl,
4385	[UMTX_OP_SEM_WAKE]	= __umtx_op_unimpl,
4386#endif
4387	[UMTX_OP_NWAKE_PRIVATE]	= __umtx_op_nwake_private,
4388	[UMTX_OP_MUTEX_WAKE2]	= __umtx_op_wake2_umutex,
4389	[UMTX_OP_SEM2_WAIT]	= __umtx_op_sem2_wait,
4390	[UMTX_OP_SEM2_WAKE]	= __umtx_op_sem2_wake,
4391	[UMTX_OP_SHM]		= __umtx_op_shm,
4392	[UMTX_OP_ROBUST_LISTS]	= __umtx_op_robust_lists,
4393};
4394
4395static const struct umtx_copyops umtx_native_ops = {
4396	.copyin_timeout = umtx_copyin_timeout,
4397	.copyin_umtx_time = umtx_copyin_umtx_time,
4398	.copyin_robust_lists = umtx_copyin_robust_lists,
4399	.copyout_timeout = umtx_copyout_timeout,
4400	.timespec_sz = sizeof(struct timespec),
4401	.umtx_time_sz = sizeof(struct _umtx_time),
4402};
4403
4404#ifndef __i386__
4405static const struct umtx_copyops umtx_native_opsi386 = {
4406	.copyin_timeout = umtx_copyin_timeouti386,
4407	.copyin_umtx_time = umtx_copyin_umtx_timei386,
4408	.copyin_robust_lists = umtx_copyin_robust_lists32,
4409	.copyout_timeout = umtx_copyout_timeouti386,
4410	.timespec_sz = sizeof(struct timespeci386),
4411	.umtx_time_sz = sizeof(struct umtx_timei386),
4412	.compat32 = true,
4413};
4414#endif
4415
4416#if defined(__i386__) || defined(__LP64__)
4417/* i386 can emulate other 32-bit archs, too! */
4418static const struct umtx_copyops umtx_native_opsx32 = {
4419	.copyin_timeout = umtx_copyin_timeoutx32,
4420	.copyin_umtx_time = umtx_copyin_umtx_timex32,
4421	.copyin_robust_lists = umtx_copyin_robust_lists32,
4422	.copyout_timeout = umtx_copyout_timeoutx32,
4423	.timespec_sz = sizeof(struct timespecx32),
4424	.umtx_time_sz = sizeof(struct umtx_timex32),
4425	.compat32 = true,
4426};
4427
4428#ifdef COMPAT_FREEBSD32
4429#ifdef __amd64__
4430#define	umtx_native_ops32	umtx_native_opsi386
4431#else
4432#define	umtx_native_ops32	umtx_native_opsx32
4433#endif
4434#endif /* COMPAT_FREEBSD32 */
4435#endif /* __i386__ || __LP64__ */
4436
4437#define	UMTX_OP__FLAGS	(UMTX_OP__32BIT | UMTX_OP__I386)
4438
4439static int
4440kern__umtx_op(struct thread *td, void *obj, int op, unsigned long val,
4441    void *uaddr1, void *uaddr2, const struct umtx_copyops *ops)
4442{
4443	struct _umtx_op_args uap = {
4444		.obj = obj,
4445		.op = op & ~UMTX_OP__FLAGS,
4446		.val = val,
4447		.uaddr1 = uaddr1,
4448		.uaddr2 = uaddr2
4449	};
4450
	if (uap.op >= nitems(op_table))
4452		return (EINVAL);
4453	return ((*op_table[uap.op])(td, &uap, ops));
4454}
4455
4456int
4457sys__umtx_op(struct thread *td, struct _umtx_op_args *uap)
4458{
	const struct umtx_copyops *umtx_ops;

	umtx_ops = &umtx_native_ops;
4462#ifdef __LP64__
4463	if ((uap->op & (UMTX_OP__32BIT | UMTX_OP__I386)) != 0) {
4464		if ((uap->op & UMTX_OP__I386) != 0)
4465			umtx_ops = &umtx_native_opsi386;
4466		else
4467			umtx_ops = &umtx_native_opsx32;
4468	}
4469#elif !defined(__i386__)
4470	/* We consider UMTX_OP__32BIT a nop on !i386 ILP32. */
4471	if ((uap->op & UMTX_OP__I386) != 0)
4472		umtx_ops = &umtx_native_opsi386;
4473#else
4474	/* Likewise, UMTX_OP__I386 is a nop on i386. */
4475	if ((uap->op & UMTX_OP__32BIT) != 0)
4476		umtx_ops = &umtx_native_opsx32;
4477#endif
4478	return (kern__umtx_op(td, uap->obj, uap->op, uap->val, uap->uaddr1,
4479	    uap->uaddr2, umtx_ops));
4480}
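
/*
 * Illustration only: the UMTX_OP__32BIT/UMTX_OP__I386 modifier bits are
 * meant for userland emulators that run foreign-ABI binaries and need
 * the kernel to interpret the guest's umtx structures with the matching
 * layout, e.g. (hypothetical) an LP64 host running a 32-bit guest that
 * uses a 64-bit time_t:
 *
 *	_umtx_op(guest_obj, UMTX_OP_WAIT_UINT | UMTX_OP__32BIT, expected,
 *	    guest_size, guest_timeout);
 *
 * Native callers leave both bits clear and get umtx_native_ops.
 */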
4481
4482#ifdef COMPAT_FREEBSD32
4483int
4484freebsd32__umtx_op(struct thread *td, struct freebsd32__umtx_op_args *uap)
4485{
4486
4487	return (kern__umtx_op(td, uap->obj, uap->op, uap->val, uap->uaddr,
4488	    uap->uaddr2, &umtx_native_ops32));
4489}
4490#endif
4491
4492void
4493umtx_thread_init(struct thread *td)
4494{
4495
4496	td->td_umtxq = umtxq_alloc();
4497	td->td_umtxq->uq_thread = td;
4498}
4499
4500void
4501umtx_thread_fini(struct thread *td)
4502{
4503
4504	umtxq_free(td->td_umtxq);
4505}
4506
/*
 * Called when a new thread is created, e.g. by fork().
 */
4510void
4511umtx_thread_alloc(struct thread *td)
4512{
4513	struct umtx_q *uq;
4514
4515	uq = td->td_umtxq;
4516	uq->uq_inherited_pri = PRI_MAX;
4517
4518	KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
4519	KASSERT(uq->uq_thread == td, ("uq_thread != td"));
4520	KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
	KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested),
	    ("uq_pi_contested is not empty"));
4522}
4523
/*
 * exec() hook.
 *
 * Clear the robust lists for all of the process's threads, rather than
 * delaying the cleanup until thread exit, since the relevant address
 * space is being destroyed right now.
 */
4531void
4532umtx_exec(struct proc *p)
4533{
4534	struct thread *td;
4535
4536	KASSERT(p == curproc, ("need curproc"));
4537	KASSERT((p->p_flag & P_HADTHREADS) == 0 ||
4538	    (p->p_flag & P_STOPPED_SINGLE) != 0,
4539	    ("curproc must be single-threaded"));
4540	/*
4541	 * There is no need to lock the list as only this thread can be
4542	 * running.
4543	 */
4544	FOREACH_THREAD_IN_PROC(p, td) {
4545		KASSERT(td == curthread ||
4546		    ((td->td_flags & TDF_BOUNDARY) != 0 && TD_IS_SUSPENDED(td)),
4547		    ("running thread %p %p", p, td));
4548		umtx_thread_cleanup(td);
4549		td->td_rb_list = td->td_rbp_list = td->td_rb_inact = 0;
4550	}
4551}
4552
/*
 * Thread exit hook.
 */
4556void
4557umtx_thread_exit(struct thread *td)
4558{
4559
4560	umtx_thread_cleanup(td);
4561}
4562
4563static int
4564umtx_read_uptr(struct thread *td, uintptr_t ptr, uintptr_t *res, bool compat32)
4565{
4566	u_long res1;
4567	uint32_t res32;
4568	int error;
4569
4570	if (compat32) {
4571		error = fueword32((void *)ptr, &res32);
4572		if (error == 0)
4573			res1 = res32;
4574	} else {
4575		error = fueword((void *)ptr, &res1);
4576	}
4577	if (error == 0)
4578		*res = res1;
4579	else
4580		error = EFAULT;
4581	return (error);
4582}
4583
4584static void
4585umtx_read_rb_list(struct thread *td, struct umutex *m, uintptr_t *rb_list,
4586    bool compat32)
4587{
4588	struct umutex32 m32;
4589
4590	if (compat32) {
4591		memcpy(&m32, m, sizeof(m32));
4592		*rb_list = m32.m_rb_lnk;
4593	} else {
4594		*rb_list = m->m_rb_lnk;
4595	}
4596}
4597
4598static int
4599umtx_handle_rb(struct thread *td, uintptr_t rbp, uintptr_t *rb_list, bool inact,
4600    bool compat32)
4601{
4602	struct umutex m;
4603	int error;
4604
4605	KASSERT(td->td_proc == curproc, ("need current vmspace"));
4606	error = copyin((void *)rbp, &m, sizeof(m));
4607	if (error != 0)
4608		return (error);
4609	if (rb_list != NULL)
4610		umtx_read_rb_list(td, &m, rb_list, compat32);
4611	if ((m.m_flags & UMUTEX_ROBUST) == 0)
4612		return (EINVAL);
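	/*
	 * Only mutexes still owned by the exiting thread are
	 * force-unlocked; the robust unlock path below is expected to
	 * mark them UMUTEX_RB_OWNERDEAD so that the next locker can
	 * observe the owner's death.
	 */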
4613	if ((m.m_owner & ~UMUTEX_CONTESTED) != td->td_tid)
4614		/* inact is cleared after unlock, allow the inconsistency */
4615		return (inact ? 0 : EINVAL);
4616	return (do_unlock_umutex(td, (struct umutex *)rbp, true));
4617}
4618
4619static void
4620umtx_cleanup_rb_list(struct thread *td, uintptr_t rb_list, uintptr_t *rb_inact,
4621    const char *name, bool compat32)
4622{
4623	int error, i;
4624	uintptr_t rbp;
4625	bool inact;
4626
4627	if (rb_list == 0)
4628		return;
4629	error = umtx_read_uptr(td, rb_list, &rbp, compat32);
4630	for (i = 0; error == 0 && rbp != 0 && i < umtx_max_rb; i++) {
4631		if (rbp == *rb_inact) {
4632			inact = true;
4633			*rb_inact = 0;
4634		} else
4635			inact = false;
4636		error = umtx_handle_rb(td, rbp, &rbp, inact, compat32);
4637	}
4638	if (i == umtx_max_rb && umtx_verbose_rb) {
4639		uprintf("comm %s pid %d: reached umtx %smax rb %d\n",
4640		    td->td_proc->p_comm, td->td_proc->p_pid, name, umtx_max_rb);
4641	}
4642	if (error != 0 && umtx_verbose_rb) {
4643		uprintf("comm %s pid %d: handling %srb error %d\n",
4644		    td->td_proc->p_comm, td->td_proc->p_pid, name, error);
4645	}
4646}
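
/*
 * Illustration only: the traversal above assumes the userland layout
 * registered via UMTX_OP_ROBUST_LISTS, where the per-thread head points
 * at the first robust umutex and each umutex links to the next through
 * m_rb_lnk, roughly:
 *
 *	head ---> m1.m_rb_lnk ---> m2.m_rb_lnk ---> 0
 *	inact --> a mutex currently being locked/unlocked, if any
 *
 * The walk is bounded by umtx_max_rb to defend against loops in
 * corrupted or malicious user memory.
 */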
4647
4648/*
4649 * Clean up umtx data.
4650 */
4651static void
4652umtx_thread_cleanup(struct thread *td)
4653{
4654	struct umtx_q *uq;
4655	struct umtx_pi *pi;
4656	uintptr_t rb_inact;
4657	bool compat32;
4658
4659	/*
4660	 * Disown pi mutexes.
4661	 */
4662	uq = td->td_umtxq;
4663	if (uq != NULL) {
4664		if (uq->uq_inherited_pri != PRI_MAX ||
4665		    !TAILQ_EMPTY(&uq->uq_pi_contested)) {
4666			mtx_lock(&umtx_lock);
4667			uq->uq_inherited_pri = PRI_MAX;
4668			while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
4669				pi->pi_owner = NULL;
4670				TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
4671			}
4672			mtx_unlock(&umtx_lock);
4673		}
4674		sched_lend_user_prio_cond(td, PRI_MAX);
4675	}
4676
4677	compat32 = (td->td_pflags2 & TDP2_COMPAT32RB) != 0;
4678	td->td_pflags2 &= ~TDP2_COMPAT32RB;
4679
4680	if (td->td_rb_inact == 0 && td->td_rb_list == 0 && td->td_rbp_list == 0)
4681		return;
4682
4683	/*
4684	 * Handle terminated robust mutexes.  Must be done after
4685	 * robust pi disown, otherwise unlock could see unowned
4686	 * entries.
4687	 */
4688	rb_inact = td->td_rb_inact;
4689	if (rb_inact != 0)
4690		(void)umtx_read_uptr(td, rb_inact, &rb_inact, compat32);
4691	umtx_cleanup_rb_list(td, td->td_rb_list, &rb_inact, "", compat32);
4692	umtx_cleanup_rb_list(td, td->td_rbp_list, &rb_inact, "priv ", compat32);
4693	if (rb_inact != 0)
4694		(void)umtx_handle_rb(td, rb_inact, NULL, true, compat32);
4695}
4696