1/*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 *
4 * Copyright (c) 2015, 2016 The FreeBSD Foundation
5 * Copyright (c) 2004, David Xu <davidxu@freebsd.org>
6 * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
7 * All rights reserved.
8 *
9 * Portions of this software were developed by Konstantin Belousov
10 * under sponsorship from the FreeBSD Foundation.
11 *
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 *    notice unmodified, this list of conditions, and the following
17 *    disclaimer.
18 * 2. Redistributions in binary form must reproduce the above copyright
19 *    notice, this list of conditions and the following disclaimer in the
20 *    documentation and/or other materials provided with the distribution.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
23 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
24 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
25 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
27 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
31 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33
34#include <sys/cdefs.h>
35__FBSDID("$FreeBSD$");
36
37#include "opt_umtx_profiling.h"
38
39#include <sys/param.h>
40#include <sys/kernel.h>
41#include <sys/fcntl.h>
42#include <sys/file.h>
43#include <sys/filedesc.h>
44#include <sys/limits.h>
45#include <sys/lock.h>
46#include <sys/malloc.h>
47#include <sys/mman.h>
48#include <sys/mutex.h>
49#include <sys/priv.h>
50#include <sys/proc.h>
51#include <sys/resource.h>
52#include <sys/resourcevar.h>
53#include <sys/rwlock.h>
54#include <sys/sbuf.h>
55#include <sys/sched.h>
56#include <sys/smp.h>
57#include <sys/sysctl.h>
58#include <sys/sysent.h>
59#include <sys/systm.h>
60#include <sys/sysproto.h>
61#include <sys/syscallsubr.h>
62#include <sys/taskqueue.h>
63#include <sys/time.h>
64#include <sys/eventhandler.h>
65#include <sys/umtx.h>
66
67#include <security/mac/mac_framework.h>
68
69#include <vm/vm.h>
70#include <vm/vm_param.h>
71#include <vm/pmap.h>
72#include <vm/vm_map.h>
73#include <vm/vm_object.h>
74
75#include <machine/atomic.h>
76#include <machine/cpu.h>
77
78#include <compat/freebsd32/freebsd32.h>
79#ifdef COMPAT_FREEBSD32
80#include <compat/freebsd32/freebsd32_proto.h>
81#endif
82
83#define _UMUTEX_TRY		1
84#define _UMUTEX_WAIT		2
85
86#ifdef UMTX_PROFILING
87#define	UPROF_PERC_BIGGER(w, f, sw, sf)					\
88	(((w) > (sw)) || ((w) == (sw) && (f) > (sf)))
89#endif
90
91/* Priority inheritance mutex info. */
92struct umtx_pi {
93	/* Owner thread */
94	struct thread		*pi_owner;
95
96	/* Reference count */
97	int			pi_refcount;
98
99	/* List entry to link umtx holding by thread */
100	TAILQ_ENTRY(umtx_pi)	pi_link;
101
102	/* List entry in hash */
103	TAILQ_ENTRY(umtx_pi)	pi_hashlink;
104
105	/* List for waiters */
106	TAILQ_HEAD(,umtx_q)	pi_blocked;
107
108	/* Identify a userland lock object */
109	struct umtx_key		pi_key;
110};
111
112/* A userland synchronous object user. */
113struct umtx_q {
114	/* Linked list for the hash. */
115	TAILQ_ENTRY(umtx_q)	uq_link;
116
117	/* Umtx key. */
118	struct umtx_key		uq_key;
119
120	/* Umtx flags. */
121	int			uq_flags;
122#define UQF_UMTXQ	0x0001
123
124	/* The thread waits on. */
125	struct thread		*uq_thread;
126
127	/*
128	 * Blocked on PI mutex. read can use chain lock
129	 * or umtx_lock, write must have both chain lock and
130	 * umtx_lock being hold.
131	 */
132	struct umtx_pi		*uq_pi_blocked;
133
134	/* On blocked list */
135	TAILQ_ENTRY(umtx_q)	uq_lockq;
136
137	/* Thread contending with us */
138	TAILQ_HEAD(,umtx_pi)	uq_pi_contested;
139
140	/* Inherited priority from PP mutex */
141	u_char			uq_inherited_pri;
142
143	/* Spare queue ready to be reused */
144	struct umtxq_queue	*uq_spare_queue;
145
146	/* The queue we on */
147	struct umtxq_queue	*uq_cur_queue;
148};
149
150TAILQ_HEAD(umtxq_head, umtx_q);
151
152/* Per-key wait-queue */
153struct umtxq_queue {
154	struct umtxq_head	head;
155	struct umtx_key		key;
156	LIST_ENTRY(umtxq_queue)	link;
157	int			length;
158};
159
160LIST_HEAD(umtxq_list, umtxq_queue);
161
162/* Userland lock object's wait-queue chain */
163struct umtxq_chain {
164	/* Lock for this chain. */
165	struct mtx		uc_lock;
166
167	/* List of sleep queues. */
168	struct umtxq_list	uc_queue[2];
169#define UMTX_SHARED_QUEUE	0
170#define UMTX_EXCLUSIVE_QUEUE	1
171
172	LIST_HEAD(, umtxq_queue) uc_spare_queue;
173
174	/* Busy flag */
175	char			uc_busy;
176
177	/* Chain lock waiters */
178	int			uc_waiters;
179
180	/* All PI in the list */
181	TAILQ_HEAD(,umtx_pi)	uc_pi_list;
182
183#ifdef UMTX_PROFILING
184	u_int			length;
185	u_int			max_length;
186#endif
187};
188
189#define	UMTXQ_LOCKED_ASSERT(uc)		mtx_assert(&(uc)->uc_lock, MA_OWNED)
190
191/*
192 * Don't propagate time-sharing priority, there is a security reason,
193 * a user can simply introduce PI-mutex, let thread A lock the mutex,
194 * and let another thread B block on the mutex, because B is
195 * sleeping, its priority will be boosted, this causes A's priority to
196 * be boosted via priority propagating too and will never be lowered even
197 * if it is using 100%CPU, this is unfair to other processes.
198 */
199
200#define UPRI(td)	(((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\
201			  (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\
202			 PRI_MAX_TIMESHARE : (td)->td_user_pri)
203
204#define	GOLDEN_RATIO_PRIME	2654404609U
205#ifndef	UMTX_CHAINS
206#define	UMTX_CHAINS		512
207#endif
208#define	UMTX_SHIFTS		(__WORD_BIT - 9)
209
210#define	GET_SHARE(flags)	\
211    (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)
212
213#define BUSY_SPINS		200
214
215struct abs_timeout {
216	int clockid;
217	bool is_abs_real;	/* TIMER_ABSTIME && CLOCK_REALTIME* */
218	struct timespec cur;
219	struct timespec end;
220};
221
222struct umtx_copyops {
223	int	(*copyin_timeout)(const void *uaddr, struct timespec *tsp);
224	int	(*copyin_umtx_time)(const void *uaddr, size_t size,
225	    struct _umtx_time *tp);
226	int	(*copyin_robust_lists)(const void *uaddr, size_t size,
227	    struct umtx_robust_lists_params *rbp);
228	int	(*copyout_timeout)(void *uaddr, size_t size,
229	    struct timespec *tsp);
230	const size_t	timespec_sz;
231	const size_t	umtx_time_sz;
232	const bool	compat32;
233};
234
235_Static_assert(sizeof(struct umutex) == sizeof(struct umutex32), "umutex32");
236_Static_assert(__offsetof(struct umutex, m_spare[0]) ==
237    __offsetof(struct umutex32, m_spare[0]), "m_spare32");
238
239int umtx_shm_vnobj_persistent = 0;
240SYSCTL_INT(_kern_ipc, OID_AUTO, umtx_vnode_persistent, CTLFLAG_RWTUN,
241    &umtx_shm_vnobj_persistent, 0,
242    "False forces destruction of umtx attached to file, on last close");
243static int umtx_max_rb = 1000;
244SYSCTL_INT(_kern_ipc, OID_AUTO, umtx_max_robust, CTLFLAG_RWTUN,
245    &umtx_max_rb, 0,
246    "");
247
248static uma_zone_t		umtx_pi_zone;
249static struct umtxq_chain	umtxq_chains[2][UMTX_CHAINS];
250static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
251static int			umtx_pi_allocated;
252
253static SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
254SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
255    &umtx_pi_allocated, 0, "Allocated umtx_pi");
256static int umtx_verbose_rb = 1;
257SYSCTL_INT(_debug_umtx, OID_AUTO, robust_faults_verbose, CTLFLAG_RWTUN,
258    &umtx_verbose_rb, 0,
259    "");
260
261#ifdef UMTX_PROFILING
262static long max_length;
263SYSCTL_LONG(_debug_umtx, OID_AUTO, max_length, CTLFLAG_RD, &max_length, 0, "max_length");
264static SYSCTL_NODE(_debug_umtx, OID_AUTO, chains, CTLFLAG_RD, 0, "umtx chain stats");
265#endif
266
267static void abs_timeout_update(struct abs_timeout *timo);
268
269static void umtx_shm_init(void);
270static void umtxq_sysinit(void *);
271static void umtxq_hash(struct umtx_key *key);
272static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
273static void umtxq_lock(struct umtx_key *key);
274static void umtxq_unlock(struct umtx_key *key);
275static void umtxq_busy(struct umtx_key *key);
276static void umtxq_unbusy(struct umtx_key *key);
277static void umtxq_insert_queue(struct umtx_q *uq, int q);
278static void umtxq_remove_queue(struct umtx_q *uq, int q);
279static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *);
280static int umtxq_count(struct umtx_key *key);
281static struct umtx_pi *umtx_pi_alloc(int);
282static void umtx_pi_free(struct umtx_pi *pi);
283static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags,
284    bool rb);
285static void umtx_thread_cleanup(struct thread *td);
286SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);
287
288#define umtxq_signal(key, nwake)	umtxq_signal_queue((key), (nwake), UMTX_SHARED_QUEUE)
289#define umtxq_insert(uq)	umtxq_insert_queue((uq), UMTX_SHARED_QUEUE)
290#define umtxq_remove(uq)	umtxq_remove_queue((uq), UMTX_SHARED_QUEUE)
291
292static struct mtx umtx_lock;
293
294#ifdef UMTX_PROFILING
295static void
296umtx_init_profiling(void)
297{
298	struct sysctl_oid *chain_oid;
299	char chain_name[10];
300	int i;
301
302	for (i = 0; i < UMTX_CHAINS; ++i) {
303		snprintf(chain_name, sizeof(chain_name), "%d", i);
304		chain_oid = SYSCTL_ADD_NODE(NULL,
305		    SYSCTL_STATIC_CHILDREN(_debug_umtx_chains), OID_AUTO,
306		    chain_name, CTLFLAG_RD, NULL, "umtx hash stats");
307		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
308		    "max_length0", CTLFLAG_RD, &umtxq_chains[0][i].max_length, 0, NULL);
309		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
310		    "max_length1", CTLFLAG_RD, &umtxq_chains[1][i].max_length, 0, NULL);
311	}
312}
313
314static int
315sysctl_debug_umtx_chains_peaks(SYSCTL_HANDLER_ARGS)
316{
317	char buf[512];
318	struct sbuf sb;
319	struct umtxq_chain *uc;
320	u_int fract, i, j, tot, whole;
321	u_int sf0, sf1, sf2, sf3, sf4;
322	u_int si0, si1, si2, si3, si4;
323	u_int sw0, sw1, sw2, sw3, sw4;
324
325	sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
326	for (i = 0; i < 2; i++) {
327		tot = 0;
328		for (j = 0; j < UMTX_CHAINS; ++j) {
329			uc = &umtxq_chains[i][j];
330			mtx_lock(&uc->uc_lock);
331			tot += uc->max_length;
332			mtx_unlock(&uc->uc_lock);
333		}
334		if (tot == 0)
335			sbuf_printf(&sb, "%u) Empty ", i);
336		else {
337			sf0 = sf1 = sf2 = sf3 = sf4 = 0;
338			si0 = si1 = si2 = si3 = si4 = 0;
339			sw0 = sw1 = sw2 = sw3 = sw4 = 0;
340			for (j = 0; j < UMTX_CHAINS; j++) {
341				uc = &umtxq_chains[i][j];
342				mtx_lock(&uc->uc_lock);
343				whole = uc->max_length * 100;
344				mtx_unlock(&uc->uc_lock);
345				fract = (whole % tot) * 100;
346				if (UPROF_PERC_BIGGER(whole, fract, sw0, sf0)) {
347					sf0 = fract;
348					si0 = j;
349					sw0 = whole;
350				} else if (UPROF_PERC_BIGGER(whole, fract, sw1,
351				    sf1)) {
352					sf1 = fract;
353					si1 = j;
354					sw1 = whole;
355				} else if (UPROF_PERC_BIGGER(whole, fract, sw2,
356				    sf2)) {
357					sf2 = fract;
358					si2 = j;
359					sw2 = whole;
360				} else if (UPROF_PERC_BIGGER(whole, fract, sw3,
361				    sf3)) {
362					sf3 = fract;
363					si3 = j;
364					sw3 = whole;
365				} else if (UPROF_PERC_BIGGER(whole, fract, sw4,
366				    sf4)) {
367					sf4 = fract;
368					si4 = j;
369					sw4 = whole;
370				}
371			}
372			sbuf_printf(&sb, "queue %u:\n", i);
373			sbuf_printf(&sb, "1st: %u.%u%% idx: %u\n", sw0 / tot,
374			    sf0 / tot, si0);
375			sbuf_printf(&sb, "2nd: %u.%u%% idx: %u\n", sw1 / tot,
376			    sf1 / tot, si1);
377			sbuf_printf(&sb, "3rd: %u.%u%% idx: %u\n", sw2 / tot,
378			    sf2 / tot, si2);
379			sbuf_printf(&sb, "4th: %u.%u%% idx: %u\n", sw3 / tot,
380			    sf3 / tot, si3);
381			sbuf_printf(&sb, "5th: %u.%u%% idx: %u\n", sw4 / tot,
382			    sf4 / tot, si4);
383		}
384	}
385	sbuf_trim(&sb);
386	sbuf_finish(&sb);
387	sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
388	sbuf_delete(&sb);
389	return (0);
390}
391
392static int
393sysctl_debug_umtx_chains_clear(SYSCTL_HANDLER_ARGS)
394{
395	struct umtxq_chain *uc;
396	u_int i, j;
397	int clear, error;
398
399	clear = 0;
400	error = sysctl_handle_int(oidp, &clear, 0, req);
401	if (error != 0 || req->newptr == NULL)
402		return (error);
403
404	if (clear != 0) {
405		for (i = 0; i < 2; ++i) {
406			for (j = 0; j < UMTX_CHAINS; ++j) {
407				uc = &umtxq_chains[i][j];
408				mtx_lock(&uc->uc_lock);
409				uc->length = 0;
410				uc->max_length = 0;
411				mtx_unlock(&uc->uc_lock);
412			}
413		}
414	}
415	return (0);
416}
417
418SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, clear,
419    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0,
420    sysctl_debug_umtx_chains_clear, "I", "Clear umtx chains statistics");
421SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, peaks,
422    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0,
423    sysctl_debug_umtx_chains_peaks, "A", "Highest peaks in chains max length");
424#endif
425
426static void
427umtxq_sysinit(void *arg __unused)
428{
429	int i, j;
430
431	umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
432		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
433	for (i = 0; i < 2; ++i) {
434		for (j = 0; j < UMTX_CHAINS; ++j) {
435			mtx_init(&umtxq_chains[i][j].uc_lock, "umtxql", NULL,
436				 MTX_DEF | MTX_DUPOK);
437			LIST_INIT(&umtxq_chains[i][j].uc_queue[0]);
438			LIST_INIT(&umtxq_chains[i][j].uc_queue[1]);
439			LIST_INIT(&umtxq_chains[i][j].uc_spare_queue);
440			TAILQ_INIT(&umtxq_chains[i][j].uc_pi_list);
441			umtxq_chains[i][j].uc_busy = 0;
442			umtxq_chains[i][j].uc_waiters = 0;
443#ifdef UMTX_PROFILING
444			umtxq_chains[i][j].length = 0;
445			umtxq_chains[i][j].max_length = 0;
446#endif
447		}
448	}
449#ifdef UMTX_PROFILING
450	umtx_init_profiling();
451#endif
452	mtx_init(&umtx_lock, "umtx lock", NULL, MTX_DEF);
453	umtx_shm_init();
454}
455
456struct umtx_q *
457umtxq_alloc(void)
458{
459	struct umtx_q *uq;
460
461	uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
462	uq->uq_spare_queue = malloc(sizeof(struct umtxq_queue), M_UMTX,
463	    M_WAITOK | M_ZERO);
464	TAILQ_INIT(&uq->uq_spare_queue->head);
465	TAILQ_INIT(&uq->uq_pi_contested);
466	uq->uq_inherited_pri = PRI_MAX;
467	return (uq);
468}
469
470void
471umtxq_free(struct umtx_q *uq)
472{
473
474	MPASS(uq->uq_spare_queue != NULL);
475	free(uq->uq_spare_queue, M_UMTX);
476	free(uq, M_UMTX);
477}
478
479static inline void
480umtxq_hash(struct umtx_key *key)
481{
482	unsigned n;
483
484	n = (uintptr_t)key->info.both.a + key->info.both.b;
485	key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
486}
487
488static inline struct umtxq_chain *
489umtxq_getchain(struct umtx_key *key)
490{
491
492	if (key->type <= TYPE_SEM)
493		return (&umtxq_chains[1][key->hash]);
494	return (&umtxq_chains[0][key->hash]);
495}
496
497/*
498 * Lock a chain.
499 */
500static inline void
501umtxq_lock(struct umtx_key *key)
502{
503	struct umtxq_chain *uc;
504
505	uc = umtxq_getchain(key);
506	mtx_lock(&uc->uc_lock);
507}
508
509/*
510 * Unlock a chain.
511 */
512static inline void
513umtxq_unlock(struct umtx_key *key)
514{
515	struct umtxq_chain *uc;
516
517	uc = umtxq_getchain(key);
518	mtx_unlock(&uc->uc_lock);
519}
520
521/*
522 * Set chain to busy state when following operation
523 * may be blocked (kernel mutex can not be used).
524 */
525static inline void
526umtxq_busy(struct umtx_key *key)
527{
528	struct umtxq_chain *uc;
529
530	uc = umtxq_getchain(key);
531	mtx_assert(&uc->uc_lock, MA_OWNED);
532	if (uc->uc_busy) {
533#ifdef SMP
534		if (smp_cpus > 1) {
535			int count = BUSY_SPINS;
536			if (count > 0) {
537				umtxq_unlock(key);
538				while (uc->uc_busy && --count > 0)
539					cpu_spinwait();
540				umtxq_lock(key);
541			}
542		}
543#endif
544		while (uc->uc_busy) {
545			uc->uc_waiters++;
546			msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
547			uc->uc_waiters--;
548		}
549	}
550	uc->uc_busy = 1;
551}
552
553/*
554 * Unbusy a chain.
555 */
556static inline void
557umtxq_unbusy(struct umtx_key *key)
558{
559	struct umtxq_chain *uc;
560
561	uc = umtxq_getchain(key);
562	mtx_assert(&uc->uc_lock, MA_OWNED);
563	KASSERT(uc->uc_busy != 0, ("not busy"));
564	uc->uc_busy = 0;
565	if (uc->uc_waiters)
566		wakeup_one(uc);
567}
568
569static inline void
570umtxq_unbusy_unlocked(struct umtx_key *key)
571{
572
573	umtxq_lock(key);
574	umtxq_unbusy(key);
575	umtxq_unlock(key);
576}
577
578static struct umtxq_queue *
579umtxq_queue_lookup(struct umtx_key *key, int q)
580{
581	struct umtxq_queue *uh;
582	struct umtxq_chain *uc;
583
584	uc = umtxq_getchain(key);
585	UMTXQ_LOCKED_ASSERT(uc);
586	LIST_FOREACH(uh, &uc->uc_queue[q], link) {
587		if (umtx_key_match(&uh->key, key))
588			return (uh);
589	}
590
591	return (NULL);
592}
593
594static inline void
595umtxq_insert_queue(struct umtx_q *uq, int q)
596{
597	struct umtxq_queue *uh;
598	struct umtxq_chain *uc;
599
600	uc = umtxq_getchain(&uq->uq_key);
601	UMTXQ_LOCKED_ASSERT(uc);
602	KASSERT((uq->uq_flags & UQF_UMTXQ) == 0, ("umtx_q is already on queue"));
603	uh = umtxq_queue_lookup(&uq->uq_key, q);
604	if (uh != NULL) {
605		LIST_INSERT_HEAD(&uc->uc_spare_queue, uq->uq_spare_queue, link);
606	} else {
607		uh = uq->uq_spare_queue;
608		uh->key = uq->uq_key;
609		LIST_INSERT_HEAD(&uc->uc_queue[q], uh, link);
610#ifdef UMTX_PROFILING
611		uc->length++;
612		if (uc->length > uc->max_length) {
613			uc->max_length = uc->length;
614			if (uc->max_length > max_length)
615				max_length = uc->max_length;
616		}
617#endif
618	}
619	uq->uq_spare_queue = NULL;
620
621	TAILQ_INSERT_TAIL(&uh->head, uq, uq_link);
622	uh->length++;
623	uq->uq_flags |= UQF_UMTXQ;
624	uq->uq_cur_queue = uh;
625	return;
626}
627
628static inline void
629umtxq_remove_queue(struct umtx_q *uq, int q)
630{
631	struct umtxq_chain *uc;
632	struct umtxq_queue *uh;
633
634	uc = umtxq_getchain(&uq->uq_key);
635	UMTXQ_LOCKED_ASSERT(uc);
636	if (uq->uq_flags & UQF_UMTXQ) {
637		uh = uq->uq_cur_queue;
638		TAILQ_REMOVE(&uh->head, uq, uq_link);
639		uh->length--;
640		uq->uq_flags &= ~UQF_UMTXQ;
641		if (TAILQ_EMPTY(&uh->head)) {
642			KASSERT(uh->length == 0,
643			    ("inconsistent umtxq_queue length"));
644#ifdef UMTX_PROFILING
645			uc->length--;
646#endif
647			LIST_REMOVE(uh, link);
648		} else {
649			uh = LIST_FIRST(&uc->uc_spare_queue);
650			KASSERT(uh != NULL, ("uc_spare_queue is empty"));
651			LIST_REMOVE(uh, link);
652		}
653		uq->uq_spare_queue = uh;
654		uq->uq_cur_queue = NULL;
655	}
656}
657
658/*
659 * Check if there are multiple waiters
660 */
661static int
662umtxq_count(struct umtx_key *key)
663{
664	struct umtxq_queue *uh;
665
666	UMTXQ_LOCKED_ASSERT(umtxq_getchain(key));
667	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
668	if (uh != NULL)
669		return (uh->length);
670	return (0);
671}
672
673/*
674 * Check if there are multiple PI waiters and returns first
675 * waiter.
676 */
677static int
678umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
679{
680	struct umtxq_queue *uh;
681
682	*first = NULL;
683	UMTXQ_LOCKED_ASSERT(umtxq_getchain(key));
684	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
685	if (uh != NULL) {
686		*first = TAILQ_FIRST(&uh->head);
687		return (uh->length);
688	}
689	return (0);
690}
691
692/*
693 * Wake up threads waiting on an userland object.
694 */
695
696static int
697umtxq_signal_queue(struct umtx_key *key, int n_wake, int q)
698{
699	struct umtxq_queue *uh;
700	struct umtx_q *uq;
701	int ret;
702
703	ret = 0;
704	UMTXQ_LOCKED_ASSERT(umtxq_getchain(key));
705	uh = umtxq_queue_lookup(key, q);
706	if (uh != NULL) {
707		while ((uq = TAILQ_FIRST(&uh->head)) != NULL) {
708			umtxq_remove_queue(uq, q);
709			wakeup(uq);
710			if (++ret >= n_wake)
711				return (ret);
712		}
713	}
714	return (ret);
715}
716
717
718/*
719 * Wake up specified thread.
720 */
721static inline void
722umtxq_signal_thread(struct umtx_q *uq)
723{
724
725	UMTXQ_LOCKED_ASSERT(umtxq_getchain(&uq->uq_key));
726	umtxq_remove(uq);
727	wakeup(uq);
728}
729
730static inline int
731tstohz(const struct timespec *tsp)
732{
733	struct timeval tv;
734
735	TIMESPEC_TO_TIMEVAL(&tv, tsp);
736	return tvtohz(&tv);
737}
738
739static void
740abs_timeout_init(struct abs_timeout *timo, int clockid, int absolute,
741	const struct timespec *timeout)
742{
743
744	timo->clockid = clockid;
745	if (!absolute) {
746		timo->is_abs_real = false;
747		abs_timeout_update(timo);
748		timespecadd(&timo->cur, timeout, &timo->end);
749	} else {
750		timo->end = *timeout;
751		timo->is_abs_real = clockid == CLOCK_REALTIME ||
752		    clockid == CLOCK_REALTIME_FAST ||
753		    clockid == CLOCK_REALTIME_PRECISE;
754		/*
755		 * If is_abs_real, umtxq_sleep will read the clock
756		 * after setting td_rtcgen; otherwise, read it here.
757		 */
758		if (!timo->is_abs_real) {
759			abs_timeout_update(timo);
760		}
761	}
762}
763
764static void
765abs_timeout_init2(struct abs_timeout *timo, const struct _umtx_time *umtxtime)
766{
767
768	abs_timeout_init(timo, umtxtime->_clockid,
769	    (umtxtime->_flags & UMTX_ABSTIME) != 0, &umtxtime->_timeout);
770}
771
772static inline void
773abs_timeout_update(struct abs_timeout *timo)
774{
775
776	kern_clock_gettime(curthread, timo->clockid, &timo->cur);
777}
778
779static int
780abs_timeout_gethz(struct abs_timeout *timo)
781{
782	struct timespec tts;
783
784	if (timespeccmp(&timo->end, &timo->cur, <=))
785		return (-1);
786	timespecsub(&timo->end, &timo->cur, &tts);
787	return (tstohz(&tts));
788}
789
790static uint32_t
791umtx_unlock_val(uint32_t flags, bool rb)
792{
793
794	if (rb)
795		return (UMUTEX_RB_OWNERDEAD);
796	else if ((flags & UMUTEX_NONCONSISTENT) != 0)
797		return (UMUTEX_RB_NOTRECOV);
798	else
799		return (UMUTEX_UNOWNED);
800
801}
802
803/*
804 * Put thread into sleep state, before sleeping, check if
805 * thread was removed from umtx queue.
806 */
807static inline int
808umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *abstime)
809{
810	struct umtxq_chain *uc;
811	int error, timo;
812
813	if (abstime != NULL && abstime->is_abs_real) {
814		curthread->td_rtcgen = atomic_load_acq_int(&rtc_generation);
815		abs_timeout_update(abstime);
816	}
817
818	uc = umtxq_getchain(&uq->uq_key);
819	UMTXQ_LOCKED_ASSERT(uc);
820	for (;;) {
821		if (!(uq->uq_flags & UQF_UMTXQ)) {
822			error = 0;
823			break;
824		}
825		if (abstime != NULL) {
826			timo = abs_timeout_gethz(abstime);
827			if (timo < 0) {
828				error = ETIMEDOUT;
829				break;
830			}
831		} else
832			timo = 0;
833		error = msleep(uq, &uc->uc_lock, PCATCH | PDROP, wmesg, timo);
834		if (error == EINTR || error == ERESTART) {
835			umtxq_lock(&uq->uq_key);
836			break;
837		}
838		if (abstime != NULL) {
839			if (abstime->is_abs_real)
840				curthread->td_rtcgen =
841				    atomic_load_acq_int(&rtc_generation);
842			abs_timeout_update(abstime);
843		}
844		umtxq_lock(&uq->uq_key);
845	}
846
847	curthread->td_rtcgen = 0;
848	return (error);
849}
850
851/*
852 * Convert userspace address into unique logical address.
853 */
854int
855umtx_key_get(const void *addr, int type, int share, struct umtx_key *key)
856{
857	struct thread *td = curthread;
858	vm_map_t map;
859	vm_map_entry_t entry;
860	vm_pindex_t pindex;
861	vm_prot_t prot;
862	boolean_t wired;
863
864	key->type = type;
865	if (share == THREAD_SHARE) {
866		key->shared = 0;
867		key->info.private.vs = td->td_proc->p_vmspace;
868		key->info.private.addr = (uintptr_t)addr;
869	} else {
870		MPASS(share == PROCESS_SHARE || share == AUTO_SHARE);
871		map = &td->td_proc->p_vmspace->vm_map;
872		if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
873		    &entry, &key->info.shared.object, &pindex, &prot,
874		    &wired) != KERN_SUCCESS) {
875			return (EFAULT);
876		}
877
878		if ((share == PROCESS_SHARE) ||
879		    (share == AUTO_SHARE &&
880		     VM_INHERIT_SHARE == entry->inheritance)) {
881			key->shared = 1;
882			key->info.shared.offset = (vm_offset_t)addr -
883			    entry->start + entry->offset;
884			vm_object_reference(key->info.shared.object);
885		} else {
886			key->shared = 0;
887			key->info.private.vs = td->td_proc->p_vmspace;
888			key->info.private.addr = (uintptr_t)addr;
889		}
890		vm_map_lookup_done(map, entry);
891	}
892
893	umtxq_hash(key);
894	return (0);
895}
896
897/*
898 * Release key.
899 */
900void
901umtx_key_release(struct umtx_key *key)
902{
903	if (key->shared)
904		vm_object_deallocate(key->info.shared.object);
905}
906
907/*
908 * Fetch and compare value, sleep on the address if value is not changed.
909 */
910static int
911do_wait(struct thread *td, void *addr, u_long id,
912    struct _umtx_time *timeout, int compat32, int is_private)
913{
914	struct abs_timeout timo;
915	struct umtx_q *uq;
916	u_long tmp;
917	uint32_t tmp32;
918	int error = 0;
919
920	uq = td->td_umtxq;
921	if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT,
922		is_private ? THREAD_SHARE : AUTO_SHARE, &uq->uq_key)) != 0)
923		return (error);
924
925	if (timeout != NULL)
926		abs_timeout_init2(&timo, timeout);
927
928	umtxq_lock(&uq->uq_key);
929	umtxq_insert(uq);
930	umtxq_unlock(&uq->uq_key);
931	if (compat32 == 0) {
932		error = fueword(addr, &tmp);
933		if (error != 0)
934			error = EFAULT;
935	} else {
936		error = fueword32(addr, &tmp32);
937		if (error == 0)
938			tmp = tmp32;
939		else
940			error = EFAULT;
941	}
942	umtxq_lock(&uq->uq_key);
943	if (error == 0) {
944		if (tmp == id)
945			error = umtxq_sleep(uq, "uwait", timeout == NULL ?
946			    NULL : &timo);
947		if ((uq->uq_flags & UQF_UMTXQ) == 0)
948			error = 0;
949		else
950			umtxq_remove(uq);
951	} else if ((uq->uq_flags & UQF_UMTXQ) != 0) {
952		umtxq_remove(uq);
953	}
954	umtxq_unlock(&uq->uq_key);
955	umtx_key_release(&uq->uq_key);
956	if (error == ERESTART)
957		error = EINTR;
958	return (error);
959}
960
961/*
962 * Wake up threads sleeping on the specified address.
963 */
964int
965kern_umtx_wake(struct thread *td, void *uaddr, int n_wake, int is_private)
966{
967	struct umtx_key key;
968	int ret;
969
970	if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT,
971	    is_private ? THREAD_SHARE : AUTO_SHARE, &key)) != 0)
972		return (ret);
973	umtxq_lock(&key);
974	umtxq_signal(&key, n_wake);
975	umtxq_unlock(&key);
976	umtx_key_release(&key);
977	return (0);
978}
979
980/*
981 * Lock PTHREAD_PRIO_NONE protocol POSIX mutex.
982 */
983static int
984do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags,
985    struct _umtx_time *timeout, int mode)
986{
987	struct abs_timeout timo;
988	struct umtx_q *uq;
989	uint32_t owner, old, id;
990	int error, rv;
991
992	id = td->td_tid;
993	uq = td->td_umtxq;
994	error = 0;
995	if (timeout != NULL)
996		abs_timeout_init2(&timo, timeout);
997
998	/*
999	 * Care must be exercised when dealing with umtx structure. It
1000	 * can fault on any access.
1001	 */
1002	for (;;) {
1003		rv = fueword32(&m->m_owner, &owner);
1004		if (rv == -1)
1005			return (EFAULT);
1006		if (mode == _UMUTEX_WAIT) {
1007			if (owner == UMUTEX_UNOWNED ||
1008			    owner == UMUTEX_CONTESTED ||
1009			    owner == UMUTEX_RB_OWNERDEAD ||
1010			    owner == UMUTEX_RB_NOTRECOV)
1011				return (0);
1012		} else {
1013			/*
1014			 * Robust mutex terminated.  Kernel duty is to
1015			 * return EOWNERDEAD to the userspace.  The
1016			 * umutex.m_flags UMUTEX_NONCONSISTENT is set
1017			 * by the common userspace code.
1018			 */
1019			if (owner == UMUTEX_RB_OWNERDEAD) {
1020				rv = casueword32(&m->m_owner,
1021				    UMUTEX_RB_OWNERDEAD, &owner,
1022				    id | UMUTEX_CONTESTED);
1023				if (rv == -1)
1024					return (EFAULT);
1025				if (rv == 0) {
1026					MPASS(owner == UMUTEX_RB_OWNERDEAD);
1027					return (EOWNERDEAD); /* success */
1028				}
1029				MPASS(rv == 1);
1030				rv = thread_check_susp(td, false);
1031				if (rv != 0)
1032					return (rv);
1033				continue;
1034			}
1035			if (owner == UMUTEX_RB_NOTRECOV)
1036				return (ENOTRECOVERABLE);
1037
1038			/*
1039			 * Try the uncontested case.  This should be
1040			 * done in userland.
1041			 */
1042			rv = casueword32(&m->m_owner, UMUTEX_UNOWNED,
1043			    &owner, id);
1044			/* The address was invalid. */
1045			if (rv == -1)
1046				return (EFAULT);
1047
1048			/* The acquire succeeded. */
1049			if (rv == 0) {
1050				MPASS(owner == UMUTEX_UNOWNED);
1051				return (0);
1052			}
1053
1054			/*
1055			 * If no one owns it but it is contested try
1056			 * to acquire it.
1057			 */
1058			MPASS(rv == 1);
1059			if (owner == UMUTEX_CONTESTED) {
1060				rv = casueword32(&m->m_owner,
1061				    UMUTEX_CONTESTED, &owner,
1062				    id | UMUTEX_CONTESTED);
1063				/* The address was invalid. */
1064				if (rv == -1)
1065					return (EFAULT);
1066				if (rv == 0) {
1067					MPASS(owner == UMUTEX_CONTESTED);
1068					return (0);
1069				}
1070				if (rv == 1) {
1071					rv = thread_check_susp(td, false);
1072					if (rv != 0)
1073						return (rv);
1074				}
1075
1076				/*
1077				 * If this failed the lock has
1078				 * changed, restart.
1079				 */
1080				continue;
1081			}
1082
1083			/* rv == 1 but not contested, likely store failure */
1084			rv = thread_check_susp(td, false);
1085			if (rv != 0)
1086				return (rv);
1087		}
1088
1089		if (mode == _UMUTEX_TRY)
1090			return (EBUSY);
1091
1092		/*
1093		 * If we caught a signal, we have retried and now
1094		 * exit immediately.
1095		 */
1096		if (error != 0)
1097			return (error);
1098
1099		if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
1100		    GET_SHARE(flags), &uq->uq_key)) != 0)
1101			return (error);
1102
1103		umtxq_lock(&uq->uq_key);
1104		umtxq_busy(&uq->uq_key);
1105		umtxq_insert(uq);
1106		umtxq_unlock(&uq->uq_key);
1107
1108		/*
1109		 * Set the contested bit so that a release in user space
1110		 * knows to use the system call for unlock.  If this fails
1111		 * either some one else has acquired the lock or it has been
1112		 * released.
1113		 */
1114		rv = casueword32(&m->m_owner, owner, &old,
1115		    owner | UMUTEX_CONTESTED);
1116
1117		/* The address was invalid or casueword failed to store. */
1118		if (rv == -1 || rv == 1) {
1119			umtxq_lock(&uq->uq_key);
1120			umtxq_remove(uq);
1121			umtxq_unbusy(&uq->uq_key);
1122			umtxq_unlock(&uq->uq_key);
1123			umtx_key_release(&uq->uq_key);
1124			if (rv == -1)
1125				return (EFAULT);
1126			if (rv == 1) {
1127				rv = thread_check_susp(td, false);
1128				if (rv != 0)
1129					return (rv);
1130			}
1131			continue;
1132		}
1133
1134		/*
1135		 * We set the contested bit, sleep. Otherwise the lock changed
1136		 * and we need to retry or we lost a race to the thread
1137		 * unlocking the umtx.
1138		 */
1139		umtxq_lock(&uq->uq_key);
1140		umtxq_unbusy(&uq->uq_key);
1141		MPASS(old == owner);
1142		error = umtxq_sleep(uq, "umtxn", timeout == NULL ?
1143		    NULL : &timo);
1144		umtxq_remove(uq);
1145		umtxq_unlock(&uq->uq_key);
1146		umtx_key_release(&uq->uq_key);
1147
1148		if (error == 0)
1149			error = thread_check_susp(td, false);
1150	}
1151
1152	return (0);
1153}
1154
1155/*
1156 * Unlock PTHREAD_PRIO_NONE protocol POSIX mutex.
1157 */
1158static int
1159do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags, bool rb)
1160{
1161	struct umtx_key key;
1162	uint32_t owner, old, id, newlock;
1163	int error, count;
1164
1165	id = td->td_tid;
1166
1167again:
1168	/*
1169	 * Make sure we own this mtx.
1170	 */
1171	error = fueword32(&m->m_owner, &owner);
1172	if (error == -1)
1173		return (EFAULT);
1174
1175	if ((owner & ~UMUTEX_CONTESTED) != id)
1176		return (EPERM);
1177
1178	newlock = umtx_unlock_val(flags, rb);
1179	if ((owner & UMUTEX_CONTESTED) == 0) {
1180		error = casueword32(&m->m_owner, owner, &old, newlock);
1181		if (error == -1)
1182			return (EFAULT);
1183		if (error == 1) {
1184			error = thread_check_susp(td, false);
1185			if (error != 0)
1186				return (error);
1187			goto again;
1188		}
1189		MPASS(old == owner);
1190		return (0);
1191	}
1192
1193	/* We should only ever be in here for contested locks */
1194	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
1195	    &key)) != 0)
1196		return (error);
1197
1198	umtxq_lock(&key);
1199	umtxq_busy(&key);
1200	count = umtxq_count(&key);
1201	umtxq_unlock(&key);
1202
1203	/*
1204	 * When unlocking the umtx, it must be marked as unowned if
1205	 * there is zero or one thread only waiting for it.
1206	 * Otherwise, it must be marked as contested.
1207	 */
1208	if (count > 1)
1209		newlock |= UMUTEX_CONTESTED;
1210	error = casueword32(&m->m_owner, owner, &old, newlock);
1211	umtxq_lock(&key);
1212	umtxq_signal(&key, 1);
1213	umtxq_unbusy(&key);
1214	umtxq_unlock(&key);
1215	umtx_key_release(&key);
1216	if (error == -1)
1217		return (EFAULT);
1218	if (error == 1) {
1219		if (old != owner)
1220			return (EINVAL);
1221		error = thread_check_susp(td, false);
1222		if (error != 0)
1223			return (error);
1224		goto again;
1225	}
1226	return (0);
1227}
1228
1229/*
1230 * Check if the mutex is available and wake up a waiter,
1231 * only for simple mutex.
1232 */
1233static int
1234do_wake_umutex(struct thread *td, struct umutex *m)
1235{
1236	struct umtx_key key;
1237	uint32_t owner;
1238	uint32_t flags;
1239	int error;
1240	int count;
1241
1242again:
1243	error = fueword32(&m->m_owner, &owner);
1244	if (error == -1)
1245		return (EFAULT);
1246
1247	if ((owner & ~UMUTEX_CONTESTED) != 0 && owner != UMUTEX_RB_OWNERDEAD &&
1248	    owner != UMUTEX_RB_NOTRECOV)
1249		return (0);
1250
1251	error = fueword32(&m->m_flags, &flags);
1252	if (error == -1)
1253		return (EFAULT);
1254
1255	/* We should only ever be in here for contested locks */
1256	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
1257	    &key)) != 0)
1258		return (error);
1259
1260	umtxq_lock(&key);
1261	umtxq_busy(&key);
1262	count = umtxq_count(&key);
1263	umtxq_unlock(&key);
1264
1265	if (count <= 1 && owner != UMUTEX_RB_OWNERDEAD &&
1266	    owner != UMUTEX_RB_NOTRECOV) {
1267		error = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner,
1268		    UMUTEX_UNOWNED);
1269		if (error == -1) {
1270			error = EFAULT;
1271		} else if (error == 1) {
1272			umtxq_lock(&key);
1273			umtxq_unbusy(&key);
1274			umtxq_unlock(&key);
1275			umtx_key_release(&key);
1276			error = thread_check_susp(td, false);
1277			if (error != 0)
1278				return (error);
1279			goto again;
1280		}
1281	}
1282
1283	umtxq_lock(&key);
1284	if (error == 0 && count != 0) {
1285		MPASS((owner & ~UMUTEX_CONTESTED) == 0 ||
1286		    owner == UMUTEX_RB_OWNERDEAD ||
1287		    owner == UMUTEX_RB_NOTRECOV);
1288		umtxq_signal(&key, 1);
1289	}
1290	umtxq_unbusy(&key);
1291	umtxq_unlock(&key);
1292	umtx_key_release(&key);
1293	return (error);
1294}
1295
1296/*
1297 * Check if the mutex has waiters and tries to fix contention bit.
1298 */
1299static int
1300do_wake2_umutex(struct thread *td, struct umutex *m, uint32_t flags)
1301{
1302	struct umtx_key key;
1303	uint32_t owner, old;
1304	int type;
1305	int error;
1306	int count;
1307
1308	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT |
1309	    UMUTEX_ROBUST)) {
1310	case 0:
1311	case UMUTEX_ROBUST:
1312		type = TYPE_NORMAL_UMUTEX;
1313		break;
1314	case UMUTEX_PRIO_INHERIT:
1315		type = TYPE_PI_UMUTEX;
1316		break;
1317	case (UMUTEX_PRIO_INHERIT | UMUTEX_ROBUST):
1318		type = TYPE_PI_ROBUST_UMUTEX;
1319		break;
1320	case UMUTEX_PRIO_PROTECT:
1321		type = TYPE_PP_UMUTEX;
1322		break;
1323	case (UMUTEX_PRIO_PROTECT | UMUTEX_ROBUST):
1324		type = TYPE_PP_ROBUST_UMUTEX;
1325		break;
1326	default:
1327		return (EINVAL);
1328	}
1329	if ((error = umtx_key_get(m, type, GET_SHARE(flags), &key)) != 0)
1330		return (error);
1331
1332	owner = 0;
1333	umtxq_lock(&key);
1334	umtxq_busy(&key);
1335	count = umtxq_count(&key);
1336	umtxq_unlock(&key);
1337
1338	error = fueword32(&m->m_owner, &owner);
1339	if (error == -1)
1340		error = EFAULT;
1341
1342	/*
1343	 * Only repair contention bit if there is a waiter, this means
1344	 * the mutex is still being referenced by userland code,
1345	 * otherwise don't update any memory.
1346	 */
1347	while (error == 0 && (owner & UMUTEX_CONTESTED) == 0 &&
1348	    (count > 1 || (count == 1 && (owner & ~UMUTEX_CONTESTED) != 0))) {
1349		error = casueword32(&m->m_owner, owner, &old,
1350		    owner | UMUTEX_CONTESTED);
1351		if (error == -1) {
1352			error = EFAULT;
1353			break;
1354		}
1355		if (error == 0) {
1356			MPASS(old == owner);
1357			break;
1358		}
1359		owner = old;
1360		error = thread_check_susp(td, false);
1361	}
1362
1363	umtxq_lock(&key);
1364	if (error == EFAULT) {
1365		umtxq_signal(&key, INT_MAX);
1366	} else if (count != 0 && ((owner & ~UMUTEX_CONTESTED) == 0 ||
1367	    owner == UMUTEX_RB_OWNERDEAD || owner == UMUTEX_RB_NOTRECOV))
1368		umtxq_signal(&key, 1);
1369	umtxq_unbusy(&key);
1370	umtxq_unlock(&key);
1371	umtx_key_release(&key);
1372	return (error);
1373}
1374
1375static inline struct umtx_pi *
1376umtx_pi_alloc(int flags)
1377{
1378	struct umtx_pi *pi;
1379
1380	pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags);
1381	TAILQ_INIT(&pi->pi_blocked);
1382	atomic_add_int(&umtx_pi_allocated, 1);
1383	return (pi);
1384}
1385
1386static inline void
1387umtx_pi_free(struct umtx_pi *pi)
1388{
1389	uma_zfree(umtx_pi_zone, pi);
1390	atomic_add_int(&umtx_pi_allocated, -1);
1391}
1392
1393/*
1394 * Adjust the thread's position on a pi_state after its priority has been
1395 * changed.
1396 */
1397static int
1398umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
1399{
1400	struct umtx_q *uq, *uq1, *uq2;
1401	struct thread *td1;
1402
1403	mtx_assert(&umtx_lock, MA_OWNED);
1404	if (pi == NULL)
1405		return (0);
1406
1407	uq = td->td_umtxq;
1408
1409	/*
1410	 * Check if the thread needs to be moved on the blocked chain.
1411	 * It needs to be moved if either its priority is lower than
1412	 * the previous thread or higher than the next thread.
1413	 */
1414	uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
1415	uq2 = TAILQ_NEXT(uq, uq_lockq);
1416	if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
1417	    (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
1418		/*
1419		 * Remove thread from blocked chain and determine where
1420		 * it should be moved to.
1421		 */
1422		TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1423		TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1424			td1 = uq1->uq_thread;
1425			MPASS(td1->td_proc->p_magic == P_MAGIC);
1426			if (UPRI(td1) > UPRI(td))
1427				break;
1428		}
1429
1430		if (uq1 == NULL)
1431			TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1432		else
1433			TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1434	}
1435	return (1);
1436}
1437
1438static struct umtx_pi *
1439umtx_pi_next(struct umtx_pi *pi)
1440{
1441	struct umtx_q *uq_owner;
1442
1443	if (pi->pi_owner == NULL)
1444		return (NULL);
1445	uq_owner = pi->pi_owner->td_umtxq;
1446	if (uq_owner == NULL)
1447		return (NULL);
1448	return (uq_owner->uq_pi_blocked);
1449}
1450
1451/*
1452 * Floyd's Cycle-Finding Algorithm.
1453 */
1454static bool
1455umtx_pi_check_loop(struct umtx_pi *pi)
1456{
1457	struct umtx_pi *pi1;	/* fast iterator */
1458
1459	mtx_assert(&umtx_lock, MA_OWNED);
1460	if (pi == NULL)
1461		return (false);
1462	pi1 = pi;
1463	for (;;) {
1464		pi = umtx_pi_next(pi);
1465		if (pi == NULL)
1466			break;
1467		pi1 = umtx_pi_next(pi1);
1468		if (pi1 == NULL)
1469			break;
1470		pi1 = umtx_pi_next(pi1);
1471		if (pi1 == NULL)
1472			break;
1473		if (pi == pi1)
1474			return (true);
1475	}
1476	return (false);
1477}
1478
1479/*
1480 * Propagate priority when a thread is blocked on POSIX
1481 * PI mutex.
1482 */
1483static void
1484umtx_propagate_priority(struct thread *td)
1485{
1486	struct umtx_q *uq;
1487	struct umtx_pi *pi;
1488	int pri;
1489
1490	mtx_assert(&umtx_lock, MA_OWNED);
1491	pri = UPRI(td);
1492	uq = td->td_umtxq;
1493	pi = uq->uq_pi_blocked;
1494	if (pi == NULL)
1495		return;
1496	if (umtx_pi_check_loop(pi))
1497		return;
1498
1499	for (;;) {
1500		td = pi->pi_owner;
1501		if (td == NULL || td == curthread)
1502			return;
1503
1504		MPASS(td->td_proc != NULL);
1505		MPASS(td->td_proc->p_magic == P_MAGIC);
1506
1507		thread_lock(td);
1508		if (td->td_lend_user_pri > pri)
1509			sched_lend_user_prio(td, pri);
1510		else {
1511			thread_unlock(td);
1512			break;
1513		}
1514		thread_unlock(td);
1515
1516		/*
1517		 * Pick up the lock that td is blocked on.
1518		 */
1519		uq = td->td_umtxq;
1520		pi = uq->uq_pi_blocked;
1521		if (pi == NULL)
1522			break;
1523		/* Resort td on the list if needed. */
1524		umtx_pi_adjust_thread(pi, td);
1525	}
1526}
1527
1528/*
1529 * Unpropagate priority for a PI mutex when a thread blocked on
1530 * it is interrupted by signal or resumed by others.
1531 */
1532static void
1533umtx_repropagate_priority(struct umtx_pi *pi)
1534{
1535	struct umtx_q *uq, *uq_owner;
1536	struct umtx_pi *pi2;
1537	int pri;
1538
1539	mtx_assert(&umtx_lock, MA_OWNED);
1540
1541	if (umtx_pi_check_loop(pi))
1542		return;
1543	while (pi != NULL && pi->pi_owner != NULL) {
1544		pri = PRI_MAX;
1545		uq_owner = pi->pi_owner->td_umtxq;
1546
1547		TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
1548			uq = TAILQ_FIRST(&pi2->pi_blocked);
1549			if (uq != NULL) {
1550				if (pri > UPRI(uq->uq_thread))
1551					pri = UPRI(uq->uq_thread);
1552			}
1553		}
1554
1555		if (pri > uq_owner->uq_inherited_pri)
1556			pri = uq_owner->uq_inherited_pri;
1557		thread_lock(pi->pi_owner);
1558		sched_lend_user_prio(pi->pi_owner, pri);
1559		thread_unlock(pi->pi_owner);
1560		if ((pi = uq_owner->uq_pi_blocked) != NULL)
1561			umtx_pi_adjust_thread(pi, uq_owner->uq_thread);
1562	}
1563}
1564
1565/*
1566 * Insert a PI mutex into owned list.
1567 */
1568static void
1569umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
1570{
1571	struct umtx_q *uq_owner;
1572
1573	uq_owner = owner->td_umtxq;
1574	mtx_assert(&umtx_lock, MA_OWNED);
1575	MPASS(pi->pi_owner == NULL);
1576	pi->pi_owner = owner;
1577	TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
1578}
1579
1580
1581/*
1582 * Disown a PI mutex, and remove it from the owned list.
1583 */
1584static void
1585umtx_pi_disown(struct umtx_pi *pi)
1586{
1587
1588	mtx_assert(&umtx_lock, MA_OWNED);
1589	TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested, pi, pi_link);
1590	pi->pi_owner = NULL;
1591}
1592
1593/*
1594 * Claim ownership of a PI mutex.
1595 */
1596static int
1597umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
1598{
1599	struct umtx_q *uq;
1600	int pri;
1601
1602	mtx_lock(&umtx_lock);
1603	if (pi->pi_owner == owner) {
1604		mtx_unlock(&umtx_lock);
1605		return (0);
1606	}
1607
1608	if (pi->pi_owner != NULL) {
1609		/*
1610		 * userland may have already messed the mutex, sigh.
1611		 */
1612		mtx_unlock(&umtx_lock);
1613		return (EPERM);
1614	}
1615	umtx_pi_setowner(pi, owner);
1616	uq = TAILQ_FIRST(&pi->pi_blocked);
1617	if (uq != NULL) {
1618		pri = UPRI(uq->uq_thread);
1619		thread_lock(owner);
1620		if (pri < UPRI(owner))
1621			sched_lend_user_prio(owner, pri);
1622		thread_unlock(owner);
1623	}
1624	mtx_unlock(&umtx_lock);
1625	return (0);
1626}
1627
1628/*
1629 * Adjust a thread's order position in its blocked PI mutex,
1630 * this may result new priority propagating process.
1631 */
1632void
1633umtx_pi_adjust(struct thread *td, u_char oldpri)
1634{
1635	struct umtx_q *uq;
1636	struct umtx_pi *pi;
1637
1638	uq = td->td_umtxq;
1639	mtx_lock(&umtx_lock);
1640	/*
1641	 * Pick up the lock that td is blocked on.
1642	 */
1643	pi = uq->uq_pi_blocked;
1644	if (pi != NULL) {
1645		umtx_pi_adjust_thread(pi, td);
1646		umtx_repropagate_priority(pi);
1647	}
1648	mtx_unlock(&umtx_lock);
1649}
1650
1651/*
1652 * Sleep on a PI mutex.
1653 */
1654static int
1655umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi, uint32_t owner,
1656    const char *wmesg, struct abs_timeout *timo, bool shared)
1657{
1658	struct thread *td, *td1;
1659	struct umtx_q *uq1;
1660	int error, pri;
1661#ifdef INVARIANTS
1662	struct umtxq_chain *uc;
1663
1664	uc = umtxq_getchain(&pi->pi_key);
1665#endif
1666	error = 0;
1667	td = uq->uq_thread;
1668	KASSERT(td == curthread, ("inconsistent uq_thread"));
1669	UMTXQ_LOCKED_ASSERT(umtxq_getchain(&uq->uq_key));
1670	KASSERT(uc->uc_busy != 0, ("umtx chain is not busy"));
1671	umtxq_insert(uq);
1672	mtx_lock(&umtx_lock);
1673	if (pi->pi_owner == NULL) {
1674		mtx_unlock(&umtx_lock);
1675		td1 = tdfind(owner, shared ? -1 : td->td_proc->p_pid);
1676		mtx_lock(&umtx_lock);
1677		if (td1 != NULL) {
1678			if (pi->pi_owner == NULL)
1679				umtx_pi_setowner(pi, td1);
1680			PROC_UNLOCK(td1->td_proc);
1681		}
1682	}
1683
1684	TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1685		pri = UPRI(uq1->uq_thread);
1686		if (pri > UPRI(td))
1687			break;
1688	}
1689
1690	if (uq1 != NULL)
1691		TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1692	else
1693		TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1694
1695	uq->uq_pi_blocked = pi;
1696	thread_lock(td);
1697	td->td_flags |= TDF_UPIBLOCKED;
1698	thread_unlock(td);
1699	umtx_propagate_priority(td);
1700	mtx_unlock(&umtx_lock);
1701	umtxq_unbusy(&uq->uq_key);
1702
1703	error = umtxq_sleep(uq, wmesg, timo);
1704	umtxq_remove(uq);
1705
1706	mtx_lock(&umtx_lock);
1707	uq->uq_pi_blocked = NULL;
1708	thread_lock(td);
1709	td->td_flags &= ~TDF_UPIBLOCKED;
1710	thread_unlock(td);
1711	TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1712	umtx_repropagate_priority(pi);
1713	mtx_unlock(&umtx_lock);
1714	umtxq_unlock(&uq->uq_key);
1715
1716	return (error);
1717}
1718
1719/*
1720 * Add reference count for a PI mutex.
1721 */
1722static void
1723umtx_pi_ref(struct umtx_pi *pi)
1724{
1725
1726	UMTXQ_LOCKED_ASSERT(umtxq_getchain(&pi->pi_key));
1727	pi->pi_refcount++;
1728}
1729
1730/*
1731 * Decrease reference count for a PI mutex, if the counter
1732 * is decreased to zero, its memory space is freed.
1733 */
1734static void
1735umtx_pi_unref(struct umtx_pi *pi)
1736{
1737	struct umtxq_chain *uc;
1738
1739	uc = umtxq_getchain(&pi->pi_key);
1740	UMTXQ_LOCKED_ASSERT(uc);
1741	KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
1742	if (--pi->pi_refcount == 0) {
1743		mtx_lock(&umtx_lock);
1744		if (pi->pi_owner != NULL)
1745			umtx_pi_disown(pi);
1746		KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
1747			("blocked queue not empty"));
1748		mtx_unlock(&umtx_lock);
1749		TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
1750		umtx_pi_free(pi);
1751	}
1752}
1753
1754/*
1755 * Find a PI mutex in hash table.
1756 */
1757static struct umtx_pi *
1758umtx_pi_lookup(struct umtx_key *key)
1759{
1760	struct umtxq_chain *uc;
1761	struct umtx_pi *pi;
1762
1763	uc = umtxq_getchain(key);
1764	UMTXQ_LOCKED_ASSERT(uc);
1765
1766	TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
1767		if (umtx_key_match(&pi->pi_key, key)) {
1768			return (pi);
1769		}
1770	}
1771	return (NULL);
1772}
1773
1774/*
1775 * Insert a PI mutex into hash table.
1776 */
1777static inline void
1778umtx_pi_insert(struct umtx_pi *pi)
1779{
1780	struct umtxq_chain *uc;
1781
1782	uc = umtxq_getchain(&pi->pi_key);
1783	UMTXQ_LOCKED_ASSERT(uc);
1784	TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
1785}
1786
1787/*
1788 * Lock a PI mutex.
1789 */
1790static int
1791do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags,
1792    struct _umtx_time *timeout, int try)
1793{
1794	struct abs_timeout timo;
1795	struct umtx_q *uq;
1796	struct umtx_pi *pi, *new_pi;
1797	uint32_t id, old_owner, owner, old;
1798	int error, rv;
1799
1800	id = td->td_tid;
1801	uq = td->td_umtxq;
1802
1803	if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ?
1804	    TYPE_PI_ROBUST_UMUTEX : TYPE_PI_UMUTEX, GET_SHARE(flags),
1805	    &uq->uq_key)) != 0)
1806		return (error);
1807
1808	if (timeout != NULL)
1809		abs_timeout_init2(&timo, timeout);
1810
1811	umtxq_lock(&uq->uq_key);
1812	pi = umtx_pi_lookup(&uq->uq_key);
1813	if (pi == NULL) {
1814		new_pi = umtx_pi_alloc(M_NOWAIT);
1815		if (new_pi == NULL) {
1816			umtxq_unlock(&uq->uq_key);
1817			new_pi = umtx_pi_alloc(M_WAITOK);
1818			umtxq_lock(&uq->uq_key);
1819			pi = umtx_pi_lookup(&uq->uq_key);
1820			if (pi != NULL) {
1821				umtx_pi_free(new_pi);
1822				new_pi = NULL;
1823			}
1824		}
1825		if (new_pi != NULL) {
1826			new_pi->pi_key = uq->uq_key;
1827			umtx_pi_insert(new_pi);
1828			pi = new_pi;
1829		}
1830	}
1831	umtx_pi_ref(pi);
1832	umtxq_unlock(&uq->uq_key);
1833
1834	/*
1835	 * Care must be exercised when dealing with umtx structure.  It
1836	 * can fault on any access.
1837	 */
1838	for (;;) {
1839		/*
1840		 * Try the uncontested case.  This should be done in userland.
1841		 */
1842		rv = casueword32(&m->m_owner, UMUTEX_UNOWNED, &owner, id);
1843		/* The address was invalid. */
1844		if (rv == -1) {
1845			error = EFAULT;
1846			break;
1847		}
1848		/* The acquire succeeded. */
1849		if (rv == 0) {
1850			MPASS(owner == UMUTEX_UNOWNED);
1851			error = 0;
1852			break;
1853		}
1854
1855		if (owner == UMUTEX_RB_NOTRECOV) {
1856			error = ENOTRECOVERABLE;
1857			break;
1858		}
1859
1860		/*
1861		 * Avoid overwriting a possible error from sleep due
1862		 * to the pending signal with suspension check result.
1863		 */
1864		if (error == 0) {
1865			error = thread_check_susp(td, true);
1866			if (error != 0)
1867				break;
1868		}
1869
1870		/* If no one owns it but it is contested try to acquire it. */
1871		if (owner == UMUTEX_CONTESTED || owner == UMUTEX_RB_OWNERDEAD) {
1872			old_owner = owner;
1873			rv = casueword32(&m->m_owner, owner, &owner,
1874			    id | UMUTEX_CONTESTED);
1875			/* The address was invalid. */
1876			if (rv == -1) {
1877				error = EFAULT;
1878				break;
1879			}
1880			if (rv == 1) {
1881				if (error == 0) {
1882					error = thread_check_susp(td, true);
1883					if (error != 0)
1884						break;
1885				}
1886
1887				/*
1888				 * If this failed the lock could
1889				 * changed, restart.
1890				 */
1891				continue;
1892			}
1893
1894			MPASS(rv == 0);
1895			MPASS(owner == old_owner);
1896			umtxq_lock(&uq->uq_key);
1897			umtxq_busy(&uq->uq_key);
1898			error = umtx_pi_claim(pi, td);
1899			umtxq_unbusy(&uq->uq_key);
1900			umtxq_unlock(&uq->uq_key);
1901			if (error != 0) {
1902				/*
1903				 * Since we're going to return an
1904				 * error, restore the m_owner to its
1905				 * previous, unowned state to avoid
1906				 * compounding the problem.
1907				 */
1908				(void)casuword32(&m->m_owner,
1909				    id | UMUTEX_CONTESTED, old_owner);
1910			}
1911			if (error == 0 && old_owner == UMUTEX_RB_OWNERDEAD)
1912				error = EOWNERDEAD;
1913			break;
1914		}
1915
1916		if ((owner & ~UMUTEX_CONTESTED) == id) {
1917			error = EDEADLK;
1918			break;
1919		}
1920
1921		if (try != 0) {
1922			error = EBUSY;
1923			break;
1924		}
1925
1926		/*
1927		 * If we caught a signal, we have retried and now
1928		 * exit immediately.
1929		 */
1930		if (error != 0)
1931			break;
1932
1933		umtxq_lock(&uq->uq_key);
1934		umtxq_busy(&uq->uq_key);
1935		umtxq_unlock(&uq->uq_key);
1936
1937		/*
1938		 * Set the contested bit so that a release in user space
1939		 * knows to use the system call for unlock.  If this fails
1940		 * either some one else has acquired the lock or it has been
1941		 * released.
1942		 */
1943		rv = casueword32(&m->m_owner, owner, &old, owner |
1944		    UMUTEX_CONTESTED);
1945
1946		/* The address was invalid. */
1947		if (rv == -1) {
1948			umtxq_unbusy_unlocked(&uq->uq_key);
1949			error = EFAULT;
1950			break;
1951		}
1952		if (rv == 1) {
1953			umtxq_unbusy_unlocked(&uq->uq_key);
1954			error = thread_check_susp(td, true);
1955			if (error != 0)
1956				break;
1957
1958			/*
1959			 * The lock changed and we need to retry or we
1960			 * lost a race to the thread unlocking the
1961			 * umtx.  Note that the UMUTEX_RB_OWNERDEAD
1962			 * value for owner is impossible there.
1963			 */
1964			continue;
1965		}
1966
1967		umtxq_lock(&uq->uq_key);
1968
1969		/* We set the contested bit, sleep. */
1970		MPASS(old == owner);
1971		error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED,
1972		    "umtxpi", timeout == NULL ? NULL : &timo,
1973		    (flags & USYNC_PROCESS_SHARED) != 0);
1974		if (error != 0)
1975			continue;
1976
1977		error = thread_check_susp(td, false);
1978		if (error != 0)
1979			break;
1980	}
1981
1982	umtxq_lock(&uq->uq_key);
1983	umtx_pi_unref(pi);
1984	umtxq_unlock(&uq->uq_key);
1985
1986	umtx_key_release(&uq->uq_key);
1987	return (error);
1988}
1989
1990/*
1991 * Unlock a PI mutex.
1992 */
1993static int
1994do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags, bool rb)
1995{
1996	struct umtx_key key;
1997	struct umtx_q *uq_first, *uq_first2, *uq_me;
1998	struct umtx_pi *pi, *pi2;
1999	uint32_t id, new_owner, old, owner;
2000	int count, error, pri;
2001
2002	id = td->td_tid;
2003
2004usrloop:
2005	/*
2006	 * Make sure we own this mtx.
2007	 */
2008	error = fueword32(&m->m_owner, &owner);
2009	if (error == -1)
2010		return (EFAULT);
2011
2012	if ((owner & ~UMUTEX_CONTESTED) != id)
2013		return (EPERM);
2014
2015	new_owner = umtx_unlock_val(flags, rb);
2016
2017	/* This should be done in userland */
2018	if ((owner & UMUTEX_CONTESTED) == 0) {
2019		error = casueword32(&m->m_owner, owner, &old, new_owner);
2020		if (error == -1)
2021			return (EFAULT);
2022		if (error == 1) {
2023			error = thread_check_susp(td, true);
2024			if (error != 0)
2025				return (error);
2026			goto usrloop;
2027		}
2028		if (old == owner)
2029			return (0);
2030		owner = old;
2031	}
2032
2033	/* We should only ever be in here for contested locks */
2034	if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ?
2035	    TYPE_PI_ROBUST_UMUTEX : TYPE_PI_UMUTEX, GET_SHARE(flags),
2036	    &key)) != 0)
2037		return (error);
2038
2039	umtxq_lock(&key);
2040	umtxq_busy(&key);
2041	count = umtxq_count_pi(&key, &uq_first);
2042	if (uq_first != NULL) {
2043		mtx_lock(&umtx_lock);
2044		pi = uq_first->uq_pi_blocked;
2045		KASSERT(pi != NULL, ("pi == NULL?"));
2046		if (pi->pi_owner != td && !(rb && pi->pi_owner == NULL)) {
2047			mtx_unlock(&umtx_lock);
2048			umtxq_unbusy(&key);
2049			umtxq_unlock(&key);
2050			umtx_key_release(&key);
2051			/* userland messed the mutex */
2052			return (EPERM);
2053		}
2054		uq_me = td->td_umtxq;
2055		if (pi->pi_owner == td)
2056			umtx_pi_disown(pi);
2057		/* get highest priority thread which is still sleeping. */
2058		uq_first = TAILQ_FIRST(&pi->pi_blocked);
2059		while (uq_first != NULL &&
2060		    (uq_first->uq_flags & UQF_UMTXQ) == 0) {
2061			uq_first = TAILQ_NEXT(uq_first, uq_lockq);
2062		}
2063		pri = PRI_MAX;
2064		TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
2065			uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
2066			if (uq_first2 != NULL) {
2067				if (pri > UPRI(uq_first2->uq_thread))
2068					pri = UPRI(uq_first2->uq_thread);
2069			}
2070		}
2071		thread_lock(td);
2072		sched_lend_user_prio(td, pri);
2073		thread_unlock(td);
2074		mtx_unlock(&umtx_lock);
2075		if (uq_first)
2076			umtxq_signal_thread(uq_first);
2077	} else {
2078		pi = umtx_pi_lookup(&key);
2079		/*
2080		 * A umtx_pi can exist if a signal or timeout removed the
2081		 * last waiter from the umtxq, but there is still
2082		 * a thread in do_lock_pi() holding the umtx_pi.
2083		 */
2084		if (pi != NULL) {
2085			/*
2086			 * The umtx_pi can be unowned, such as when a thread
2087			 * has just entered do_lock_pi(), allocated the
2088			 * umtx_pi, and unlocked the umtxq.
2089			 * If the current thread owns it, it must disown it.
2090			 */
2091			mtx_lock(&umtx_lock);
2092			if (pi->pi_owner == td)
2093				umtx_pi_disown(pi);
2094			mtx_unlock(&umtx_lock);
2095		}
2096	}
2097	umtxq_unlock(&key);
2098
2099	/*
2100	 * When unlocking the umtx, it must be marked as unowned if
2101	 * there is zero or one thread only waiting for it.
2102	 * Otherwise, it must be marked as contested.
2103	 */
2104
2105	if (count > 1)
2106		new_owner |= UMUTEX_CONTESTED;
2107again:
2108	error = casueword32(&m->m_owner, owner, &old, new_owner);
2109	if (error == 1) {
2110		error = thread_check_susp(td, false);
2111		if (error == 0)
2112			goto again;
2113	}
2114	umtxq_unbusy_unlocked(&key);
2115	umtx_key_release(&key);
2116	if (error == -1)
2117		return (EFAULT);
2118	if (error == 0 && old != owner)
2119		return (EINVAL);
2120	return (error);
2121}
2122
2123/*
2124 * Lock a PP mutex.
2125 */
2126static int
2127do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags,
2128    struct _umtx_time *timeout, int try)
2129{
2130	struct abs_timeout timo;
2131	struct umtx_q *uq, *uq2;
2132	struct umtx_pi *pi;
2133	uint32_t ceiling;
2134	uint32_t owner, id;
2135	int error, pri, old_inherited_pri, su, rv;
2136
2137	id = td->td_tid;
2138	uq = td->td_umtxq;
2139	if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ?
2140	    TYPE_PP_ROBUST_UMUTEX : TYPE_PP_UMUTEX, GET_SHARE(flags),
2141	    &uq->uq_key)) != 0)
2142		return (error);
2143
2144	if (timeout != NULL)
2145		abs_timeout_init2(&timo, timeout);
2146
2147	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
2148	for (;;) {
2149		old_inherited_pri = uq->uq_inherited_pri;
2150		umtxq_lock(&uq->uq_key);
2151		umtxq_busy(&uq->uq_key);
2152		umtxq_unlock(&uq->uq_key);
2153
2154		rv = fueword32(&m->m_ceilings[0], &ceiling);
2155		if (rv == -1) {
2156			error = EFAULT;
2157			goto out;
2158		}
2159		ceiling = RTP_PRIO_MAX - ceiling;
2160		if (ceiling > RTP_PRIO_MAX) {
2161			error = EINVAL;
2162			goto out;
2163		}
2164
2165		mtx_lock(&umtx_lock);
2166		if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
2167			mtx_unlock(&umtx_lock);
2168			error = EINVAL;
2169			goto out;
2170		}
2171		if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
2172			uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
2173			thread_lock(td);
2174			if (uq->uq_inherited_pri < UPRI(td))
2175				sched_lend_user_prio(td, uq->uq_inherited_pri);
2176			thread_unlock(td);
2177		}
2178		mtx_unlock(&umtx_lock);
2179
2180		rv = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner,
2181		    id | UMUTEX_CONTESTED);
2182		/* The address was invalid. */
2183		if (rv == -1) {
2184			error = EFAULT;
2185			break;
2186		}
2187		if (rv == 0) {
2188			MPASS(owner == UMUTEX_CONTESTED);
2189			error = 0;
2190			break;
2191		}
2192		/* rv == 1 */
2193		if (owner == UMUTEX_RB_OWNERDEAD) {
2194			rv = casueword32(&m->m_owner, UMUTEX_RB_OWNERDEAD,
2195			    &owner, id | UMUTEX_CONTESTED);
2196			if (rv == -1) {
2197				error = EFAULT;
2198				break;
2199			}
2200			if (rv == 0) {
2201				MPASS(owner == UMUTEX_RB_OWNERDEAD);
2202				error = EOWNERDEAD; /* success */
2203				break;
2204			}
2205
2206			/*
2207			 *  rv == 1, only check for suspension if we
2208			 *  did not already catched a signal.  If we
2209			 *  get an error from the check, the same
2210			 *  condition is checked by the umtxq_sleep()
2211			 *  call below, so we should obliterate the
2212			 *  error to not skip the last loop iteration.
2213			 */
2214			if (error == 0) {
2215				error = thread_check_susp(td, false);
2216				if (error == 0) {
2217					if (try != 0)
2218						error = EBUSY;
2219					else
2220						continue;
2221				}
2222				error = 0;
2223			}
2224		} else if (owner == UMUTEX_RB_NOTRECOV) {
2225			error = ENOTRECOVERABLE;
2226		}
2227
2228		if (try != 0)
2229			error = EBUSY;
2230
2231		/*
2232		 * If we caught a signal, we have retried and now
2233		 * exit immediately.
2234		 */
2235		if (error != 0)
2236			break;
2237
2238		umtxq_lock(&uq->uq_key);
2239		umtxq_insert(uq);
2240		umtxq_unbusy(&uq->uq_key);
2241		error = umtxq_sleep(uq, "umtxpp", timeout == NULL ?
2242		    NULL : &timo);
2243		umtxq_remove(uq);
2244		umtxq_unlock(&uq->uq_key);
2245
2246		mtx_lock(&umtx_lock);
2247		uq->uq_inherited_pri = old_inherited_pri;
2248		pri = PRI_MAX;
2249		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2250			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2251			if (uq2 != NULL) {
2252				if (pri > UPRI(uq2->uq_thread))
2253					pri = UPRI(uq2->uq_thread);
2254			}
2255		}
2256		if (pri > uq->uq_inherited_pri)
2257			pri = uq->uq_inherited_pri;
2258		thread_lock(td);
2259		sched_lend_user_prio(td, pri);
2260		thread_unlock(td);
2261		mtx_unlock(&umtx_lock);
2262	}
2263
2264	if (error != 0 && error != EOWNERDEAD) {
2265		mtx_lock(&umtx_lock);
2266		uq->uq_inherited_pri = old_inherited_pri;
2267		pri = PRI_MAX;
2268		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2269			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2270			if (uq2 != NULL) {
2271				if (pri > UPRI(uq2->uq_thread))
2272					pri = UPRI(uq2->uq_thread);
2273			}
2274		}
2275		if (pri > uq->uq_inherited_pri)
2276			pri = uq->uq_inherited_pri;
2277		thread_lock(td);
2278		sched_lend_user_prio(td, pri);
2279		thread_unlock(td);
2280		mtx_unlock(&umtx_lock);
2281	}
2282
2283out:
2284	umtxq_unbusy_unlocked(&uq->uq_key);
2285	umtx_key_release(&uq->uq_key);
2286	return (error);
2287}
2288
2289/*
2290 * Unlock a PP mutex.
2291 */
2292static int
2293do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags, bool rb)
2294{
2295	struct umtx_key key;
2296	struct umtx_q *uq, *uq2;
2297	struct umtx_pi *pi;
2298	uint32_t id, owner, rceiling;
2299	int error, pri, new_inherited_pri, su;
2300
2301	id = td->td_tid;
2302	uq = td->td_umtxq;
2303	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
2304
2305	/*
2306	 * Make sure we own this mtx.
2307	 */
2308	error = fueword32(&m->m_owner, &owner);
2309	if (error == -1)
2310		return (EFAULT);
2311
2312	if ((owner & ~UMUTEX_CONTESTED) != id)
2313		return (EPERM);
2314
2315	error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
2316	if (error != 0)
2317		return (error);
2318
2319	if (rceiling == -1)
2320		new_inherited_pri = PRI_MAX;
2321	else {
2322		rceiling = RTP_PRIO_MAX - rceiling;
2323		if (rceiling > RTP_PRIO_MAX)
2324			return (EINVAL);
2325		new_inherited_pri = PRI_MIN_REALTIME + rceiling;
2326	}
2327
2328	if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ?
2329	    TYPE_PP_ROBUST_UMUTEX : TYPE_PP_UMUTEX, GET_SHARE(flags),
2330	    &key)) != 0)
2331		return (error);
2332	umtxq_lock(&key);
2333	umtxq_busy(&key);
2334	umtxq_unlock(&key);
2335	/*
2336	 * For priority protected mutex, always set unlocked state
2337	 * to UMUTEX_CONTESTED, so that userland always enters kernel
2338	 * to lock the mutex, it is necessary because thread priority
2339	 * has to be adjusted for such mutex.
2340	 */
2341	error = suword32(&m->m_owner, umtx_unlock_val(flags, rb) |
2342	    UMUTEX_CONTESTED);
2343
2344	umtxq_lock(&key);
2345	if (error == 0)
2346		umtxq_signal(&key, 1);
2347	umtxq_unbusy(&key);
2348	umtxq_unlock(&key);
2349
2350	if (error == -1)
2351		error = EFAULT;
2352	else {
2353		mtx_lock(&umtx_lock);
2354		if (su != 0)
2355			uq->uq_inherited_pri = new_inherited_pri;
2356		pri = PRI_MAX;
2357		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2358			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2359			if (uq2 != NULL) {
2360				if (pri > UPRI(uq2->uq_thread))
2361					pri = UPRI(uq2->uq_thread);
2362			}
2363		}
2364		if (pri > uq->uq_inherited_pri)
2365			pri = uq->uq_inherited_pri;
2366		thread_lock(td);
2367		sched_lend_user_prio(td, pri);
2368		thread_unlock(td);
2369		mtx_unlock(&umtx_lock);
2370	}
2371	umtx_key_release(&key);
2372	return (error);
2373}
2374
2375static int
2376do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
2377    uint32_t *old_ceiling)
2378{
2379	struct umtx_q *uq;
2380	uint32_t flags, id, owner, save_ceiling;
2381	int error, rv, rv1;
2382
2383	error = fueword32(&m->m_flags, &flags);
2384	if (error == -1)
2385		return (EFAULT);
2386	if ((flags & UMUTEX_PRIO_PROTECT) == 0)
2387		return (EINVAL);
2388	if (ceiling > RTP_PRIO_MAX)
2389		return (EINVAL);
2390	id = td->td_tid;
2391	uq = td->td_umtxq;
2392	if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ?
2393	    TYPE_PP_ROBUST_UMUTEX : TYPE_PP_UMUTEX, GET_SHARE(flags),
2394	    &uq->uq_key)) != 0)
2395		return (error);
2396	for (;;) {
2397		umtxq_lock(&uq->uq_key);
2398		umtxq_busy(&uq->uq_key);
2399		umtxq_unlock(&uq->uq_key);
2400
2401		rv = fueword32(&m->m_ceilings[0], &save_ceiling);
2402		if (rv == -1) {
2403			error = EFAULT;
2404			break;
2405		}
2406
2407		rv = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner,
2408		    id | UMUTEX_CONTESTED);
2409		if (rv == -1) {
2410			error = EFAULT;
2411			break;
2412		}
2413
2414		if (rv == 0) {
2415			MPASS(owner == UMUTEX_CONTESTED);
2416			rv = suword32(&m->m_ceilings[0], ceiling);
2417			rv1 = suword32(&m->m_owner, UMUTEX_CONTESTED);
2418			error = (rv == 0 && rv1 == 0) ? 0: EFAULT;
2419			break;
2420		}
2421
2422		if ((owner & ~UMUTEX_CONTESTED) == id) {
2423			rv = suword32(&m->m_ceilings[0], ceiling);
2424			error = rv == 0 ? 0 : EFAULT;
2425			break;
2426		}
2427
2428		if (owner == UMUTEX_RB_OWNERDEAD) {
2429			error = EOWNERDEAD;
2430			break;
2431		} else if (owner == UMUTEX_RB_NOTRECOV) {
2432			error = ENOTRECOVERABLE;
2433			break;
2434		}
2435
2436		/*
2437		 * If we caught a signal, we have retried and now
2438		 * exit immediately.
2439		 */
2440		if (error != 0)
2441			break;
2442
2443		/*
2444		 * We set the contested bit, sleep. Otherwise the lock changed
2445		 * and we need to retry or we lost a race to the thread
2446		 * unlocking the umtx.
2447		 */
2448		umtxq_lock(&uq->uq_key);
2449		umtxq_insert(uq);
2450		umtxq_unbusy(&uq->uq_key);
2451		error = umtxq_sleep(uq, "umtxpp", NULL);
2452		umtxq_remove(uq);
2453		umtxq_unlock(&uq->uq_key);
2454	}
2455	umtxq_lock(&uq->uq_key);
2456	if (error == 0)
2457		umtxq_signal(&uq->uq_key, INT_MAX);
2458	umtxq_unbusy(&uq->uq_key);
2459	umtxq_unlock(&uq->uq_key);
2460	umtx_key_release(&uq->uq_key);
2461	if (error == 0 && old_ceiling != NULL) {
2462		rv = suword32(old_ceiling, save_ceiling);
2463		error = rv == 0 ? 0 : EFAULT;
2464	}
2465	return (error);
2466}
2467
2468/*
2469 * Lock a userland POSIX mutex.
2470 */
2471static int
2472do_lock_umutex(struct thread *td, struct umutex *m,
2473    struct _umtx_time *timeout, int mode)
2474{
2475	uint32_t flags;
2476	int error;
2477
2478	error = fueword32(&m->m_flags, &flags);
2479	if (error == -1)
2480		return (EFAULT);
2481
2482	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2483	case 0:
2484		error = do_lock_normal(td, m, flags, timeout, mode);
2485		break;
2486	case UMUTEX_PRIO_INHERIT:
2487		error = do_lock_pi(td, m, flags, timeout, mode);
2488		break;
2489	case UMUTEX_PRIO_PROTECT:
2490		error = do_lock_pp(td, m, flags, timeout, mode);
2491		break;
2492	default:
2493		return (EINVAL);
2494	}
2495	if (timeout == NULL) {
2496		if (error == EINTR && mode != _UMUTEX_WAIT)
2497			error = ERESTART;
2498	} else {
2499		/* Timed-locking is not restarted. */
2500		if (error == ERESTART)
2501			error = EINTR;
2502	}
2503	return (error);
2504}
2505
2506/*
2507 * Unlock a userland POSIX mutex.
2508 */
2509static int
2510do_unlock_umutex(struct thread *td, struct umutex *m, bool rb)
2511{
2512	uint32_t flags;
2513	int error;
2514
2515	error = fueword32(&m->m_flags, &flags);
2516	if (error == -1)
2517		return (EFAULT);
2518
2519	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2520	case 0:
2521		return (do_unlock_normal(td, m, flags, rb));
2522	case UMUTEX_PRIO_INHERIT:
2523		return (do_unlock_pi(td, m, flags, rb));
2524	case UMUTEX_PRIO_PROTECT:
2525		return (do_unlock_pp(td, m, flags, rb));
2526	}
2527
2528	return (EINVAL);
2529}
2530
2531static int
2532do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m,
2533    struct timespec *timeout, u_long wflags)
2534{
2535	struct abs_timeout timo;
2536	struct umtx_q *uq;
2537	uint32_t flags, clockid, hasw;
2538	int error;
2539
2540	uq = td->td_umtxq;
2541	error = fueword32(&cv->c_flags, &flags);
2542	if (error == -1)
2543		return (EFAULT);
2544	error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key);
2545	if (error != 0)
2546		return (error);
2547
2548	if ((wflags & CVWAIT_CLOCKID) != 0) {
2549		error = fueword32(&cv->c_clockid, &clockid);
2550		if (error == -1) {
2551			umtx_key_release(&uq->uq_key);
2552			return (EFAULT);
2553		}
2554		if (clockid < CLOCK_REALTIME ||
2555		    clockid >= CLOCK_THREAD_CPUTIME_ID) {
2556			/* hmm, only HW clock id will work. */
2557			umtx_key_release(&uq->uq_key);
2558			return (EINVAL);
2559		}
2560	} else {
2561		clockid = CLOCK_REALTIME;
2562	}
2563
2564	umtxq_lock(&uq->uq_key);
2565	umtxq_busy(&uq->uq_key);
2566	umtxq_insert(uq);
2567	umtxq_unlock(&uq->uq_key);
2568
2569	/*
2570	 * Set c_has_waiters to 1 before releasing user mutex, also
2571	 * don't modify cache line when unnecessary.
2572	 */
2573	error = fueword32(&cv->c_has_waiters, &hasw);
2574	if (error == 0 && hasw == 0)
2575		suword32(&cv->c_has_waiters, 1);
2576
2577	umtxq_unbusy_unlocked(&uq->uq_key);
2578
2579	error = do_unlock_umutex(td, m, false);
2580
2581	if (timeout != NULL)
2582		abs_timeout_init(&timo, clockid, (wflags & CVWAIT_ABSTIME) != 0,
2583		    timeout);
2584
2585	umtxq_lock(&uq->uq_key);
2586	if (error == 0) {
2587		error = umtxq_sleep(uq, "ucond", timeout == NULL ?
2588		    NULL : &timo);
2589	}
2590
2591	if ((uq->uq_flags & UQF_UMTXQ) == 0)
2592		error = 0;
2593	else {
2594		/*
2595		 * This must be timeout,interrupted by signal or
2596		 * surprious wakeup, clear c_has_waiter flag when
2597		 * necessary.
2598		 */
2599		umtxq_busy(&uq->uq_key);
2600		if ((uq->uq_flags & UQF_UMTXQ) != 0) {
2601			int oldlen = uq->uq_cur_queue->length;
2602			umtxq_remove(uq);
2603			if (oldlen == 1) {
2604				umtxq_unlock(&uq->uq_key);
2605				suword32(&cv->c_has_waiters, 0);
2606				umtxq_lock(&uq->uq_key);
2607			}
2608		}
2609		umtxq_unbusy(&uq->uq_key);
2610		if (error == ERESTART)
2611			error = EINTR;
2612	}
2613
2614	umtxq_unlock(&uq->uq_key);
2615	umtx_key_release(&uq->uq_key);
2616	return (error);
2617}
2618
2619/*
2620 * Signal a userland condition variable.
2621 */
2622static int
2623do_cv_signal(struct thread *td, struct ucond *cv)
2624{
2625	struct umtx_key key;
2626	int error, cnt, nwake;
2627	uint32_t flags;
2628
2629	error = fueword32(&cv->c_flags, &flags);
2630	if (error == -1)
2631		return (EFAULT);
2632	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2633		return (error);
2634	umtxq_lock(&key);
2635	umtxq_busy(&key);
2636	cnt = umtxq_count(&key);
2637	nwake = umtxq_signal(&key, 1);
2638	if (cnt <= nwake) {
2639		umtxq_unlock(&key);
2640		error = suword32(&cv->c_has_waiters, 0);
2641		if (error == -1)
2642			error = EFAULT;
2643		umtxq_lock(&key);
2644	}
2645	umtxq_unbusy(&key);
2646	umtxq_unlock(&key);
2647	umtx_key_release(&key);
2648	return (error);
2649}
2650
2651static int
2652do_cv_broadcast(struct thread *td, struct ucond *cv)
2653{
2654	struct umtx_key key;
2655	int error;
2656	uint32_t flags;
2657
2658	error = fueword32(&cv->c_flags, &flags);
2659	if (error == -1)
2660		return (EFAULT);
2661	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2662		return (error);
2663
2664	umtxq_lock(&key);
2665	umtxq_busy(&key);
2666	umtxq_signal(&key, INT_MAX);
2667	umtxq_unlock(&key);
2668
2669	error = suword32(&cv->c_has_waiters, 0);
2670	if (error == -1)
2671		error = EFAULT;
2672
2673	umtxq_unbusy_unlocked(&key);
2674
2675	umtx_key_release(&key);
2676	return (error);
2677}
2678
2679static int
2680do_rw_rdlock(struct thread *td, struct urwlock *rwlock, long fflag,
2681    struct _umtx_time *timeout)
2682{
2683	struct abs_timeout timo;
2684	struct umtx_q *uq;
2685	uint32_t flags, wrflags;
2686	int32_t state, oldstate;
2687	int32_t blocked_readers;
2688	int error, error1, rv;
2689
2690	uq = td->td_umtxq;
2691	error = fueword32(&rwlock->rw_flags, &flags);
2692	if (error == -1)
2693		return (EFAULT);
2694	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2695	if (error != 0)
2696		return (error);
2697
2698	if (timeout != NULL)
2699		abs_timeout_init2(&timo, timeout);
2700
2701	wrflags = URWLOCK_WRITE_OWNER;
2702	if (!(fflag & URWLOCK_PREFER_READER) && !(flags & URWLOCK_PREFER_READER))
2703		wrflags |= URWLOCK_WRITE_WAITERS;
2704
2705	for (;;) {
2706		rv = fueword32(&rwlock->rw_state, &state);
2707		if (rv == -1) {
2708			umtx_key_release(&uq->uq_key);
2709			return (EFAULT);
2710		}
2711
2712		/* try to lock it */
2713		while (!(state & wrflags)) {
2714			if (__predict_false(URWLOCK_READER_COUNT(state) ==
2715			    URWLOCK_MAX_READERS)) {
2716				umtx_key_release(&uq->uq_key);
2717				return (EAGAIN);
2718			}
2719			rv = casueword32(&rwlock->rw_state, state,
2720			    &oldstate, state + 1);
2721			if (rv == -1) {
2722				umtx_key_release(&uq->uq_key);
2723				return (EFAULT);
2724			}
2725			if (rv == 0) {
2726				MPASS(oldstate == state);
2727				umtx_key_release(&uq->uq_key);
2728				return (0);
2729			}
2730			error = thread_check_susp(td, true);
2731			if (error != 0)
2732				break;
2733			state = oldstate;
2734		}
2735
2736		if (error)
2737			break;
2738
2739		/* grab monitor lock */
2740		umtxq_lock(&uq->uq_key);
2741		umtxq_busy(&uq->uq_key);
2742		umtxq_unlock(&uq->uq_key);
2743
2744		/*
2745		 * re-read the state, in case it changed between the try-lock above
2746		 * and the check below
2747		 */
2748		rv = fueword32(&rwlock->rw_state, &state);
2749		if (rv == -1)
2750			error = EFAULT;
2751
2752		/* set read contention bit */
2753		while (error == 0 && (state & wrflags) &&
2754		    !(state & URWLOCK_READ_WAITERS)) {
2755			rv = casueword32(&rwlock->rw_state, state,
2756			    &oldstate, state | URWLOCK_READ_WAITERS);
2757			if (rv == -1) {
2758				error = EFAULT;
2759				break;
2760			}
2761			if (rv == 0) {
2762				MPASS(oldstate == state);
2763				goto sleep;
2764			}
2765			state = oldstate;
2766			error = thread_check_susp(td, false);
2767			if (error != 0)
2768				break;
2769		}
2770		if (error != 0) {
2771			umtxq_unbusy_unlocked(&uq->uq_key);
2772			break;
2773		}
2774
2775		/* state is changed while setting flags, restart */
2776		if (!(state & wrflags)) {
2777			umtxq_unbusy_unlocked(&uq->uq_key);
2778			error = thread_check_susp(td, true);
2779			if (error != 0)
2780				break;
2781			continue;
2782		}
2783
2784sleep:
2785		/*
2786		 * Contention bit is set, before sleeping, increase
2787		 * read waiter count.
2788		 */
2789		rv = fueword32(&rwlock->rw_blocked_readers,
2790		    &blocked_readers);
2791		if (rv == -1) {
2792			umtxq_unbusy_unlocked(&uq->uq_key);
2793			error = EFAULT;
2794			break;
2795		}
2796		suword32(&rwlock->rw_blocked_readers, blocked_readers+1);
2797
2798		while (state & wrflags) {
2799			umtxq_lock(&uq->uq_key);
2800			umtxq_insert(uq);
2801			umtxq_unbusy(&uq->uq_key);
2802
2803			error = umtxq_sleep(uq, "urdlck", timeout == NULL ?
2804			    NULL : &timo);
2805
2806			umtxq_busy(&uq->uq_key);
2807			umtxq_remove(uq);
2808			umtxq_unlock(&uq->uq_key);
2809			if (error)
2810				break;
2811			rv = fueword32(&rwlock->rw_state, &state);
2812			if (rv == -1) {
2813				error = EFAULT;
2814				break;
2815			}
2816		}
2817
2818		/* decrease read waiter count, and may clear read contention bit */
2819		rv = fueword32(&rwlock->rw_blocked_readers,
2820		    &blocked_readers);
2821		if (rv == -1) {
2822			umtxq_unbusy_unlocked(&uq->uq_key);
2823			error = EFAULT;
2824			break;
2825		}
2826		suword32(&rwlock->rw_blocked_readers, blocked_readers-1);
2827		if (blocked_readers == 1) {
2828			rv = fueword32(&rwlock->rw_state, &state);
2829			if (rv == -1) {
2830				umtxq_unbusy_unlocked(&uq->uq_key);
2831				error = EFAULT;
2832				break;
2833			}
2834			for (;;) {
2835				rv = casueword32(&rwlock->rw_state, state,
2836				    &oldstate, state & ~URWLOCK_READ_WAITERS);
2837				if (rv == -1) {
2838					error = EFAULT;
2839					break;
2840				}
2841				if (rv == 0) {
2842					MPASS(oldstate == state);
2843					break;
2844				}
2845				state = oldstate;
2846				error1 = thread_check_susp(td, false);
2847				if (error1 != 0) {
2848					if (error == 0)
2849						error = error1;
2850					break;
2851				}
2852			}
2853		}
2854
2855		umtxq_unbusy_unlocked(&uq->uq_key);
2856		if (error != 0)
2857			break;
2858	}
2859	umtx_key_release(&uq->uq_key);
2860	if (error == ERESTART)
2861		error = EINTR;
2862	return (error);
2863}
2864
2865static int
2866do_rw_wrlock(struct thread *td, struct urwlock *rwlock, struct _umtx_time *timeout)
2867{
2868	struct abs_timeout timo;
2869	struct umtx_q *uq;
2870	uint32_t flags;
2871	int32_t state, oldstate;
2872	int32_t blocked_writers;
2873	int32_t blocked_readers;
2874	int error, error1, rv;
2875
2876	uq = td->td_umtxq;
2877	error = fueword32(&rwlock->rw_flags, &flags);
2878	if (error == -1)
2879		return (EFAULT);
2880	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2881	if (error != 0)
2882		return (error);
2883
2884	if (timeout != NULL)
2885		abs_timeout_init2(&timo, timeout);
2886
2887	blocked_readers = 0;
2888	for (;;) {
2889		rv = fueword32(&rwlock->rw_state, &state);
2890		if (rv == -1) {
2891			umtx_key_release(&uq->uq_key);
2892			return (EFAULT);
2893		}
2894		while ((state & URWLOCK_WRITE_OWNER) == 0 &&
2895		    URWLOCK_READER_COUNT(state) == 0) {
2896			rv = casueword32(&rwlock->rw_state, state,
2897			    &oldstate, state | URWLOCK_WRITE_OWNER);
2898			if (rv == -1) {
2899				umtx_key_release(&uq->uq_key);
2900				return (EFAULT);
2901			}
2902			if (rv == 0) {
2903				MPASS(oldstate == state);
2904				umtx_key_release(&uq->uq_key);
2905				return (0);
2906			}
2907			state = oldstate;
2908			error = thread_check_susp(td, true);
2909			if (error != 0)
2910				break;
2911		}
2912
2913		if (error) {
2914			if ((state & (URWLOCK_WRITE_OWNER |
2915			    URWLOCK_WRITE_WAITERS)) == 0 &&
2916			    blocked_readers != 0) {
2917				umtxq_lock(&uq->uq_key);
2918				umtxq_busy(&uq->uq_key);
2919				umtxq_signal_queue(&uq->uq_key, INT_MAX,
2920				    UMTX_SHARED_QUEUE);
2921				umtxq_unbusy(&uq->uq_key);
2922				umtxq_unlock(&uq->uq_key);
2923			}
2924
2925			break;
2926		}
2927
2928		/* grab monitor lock */
2929		umtxq_lock(&uq->uq_key);
2930		umtxq_busy(&uq->uq_key);
2931		umtxq_unlock(&uq->uq_key);
2932
2933		/*
2934		 * Re-read the state, in case it changed between the
2935		 * try-lock above and the check below.
2936		 */
2937		rv = fueword32(&rwlock->rw_state, &state);
2938		if (rv == -1)
2939			error = EFAULT;
2940
2941		while (error == 0 && ((state & URWLOCK_WRITE_OWNER) ||
2942		    URWLOCK_READER_COUNT(state) != 0) &&
2943		    (state & URWLOCK_WRITE_WAITERS) == 0) {
2944			rv = casueword32(&rwlock->rw_state, state,
2945			    &oldstate, state | URWLOCK_WRITE_WAITERS);
2946			if (rv == -1) {
2947				error = EFAULT;
2948				break;
2949			}
2950			if (rv == 0) {
2951				MPASS(oldstate == state);
2952				goto sleep;
2953			}
2954			state = oldstate;
2955			error = thread_check_susp(td, false);
2956			if (error != 0)
2957				break;
2958		}
2959		if (error != 0) {
2960			umtxq_unbusy_unlocked(&uq->uq_key);
2961			break;
2962		}
2963
2964		if ((state & URWLOCK_WRITE_OWNER) == 0 &&
2965		    URWLOCK_READER_COUNT(state) == 0) {
2966			umtxq_unbusy_unlocked(&uq->uq_key);
2967			error = thread_check_susp(td, false);
2968			if (error != 0)
2969				break;
2970			continue;
2971		}
2972sleep:
2973		rv = fueword32(&rwlock->rw_blocked_writers,
2974		    &blocked_writers);
2975		if (rv == -1) {
2976			umtxq_unbusy_unlocked(&uq->uq_key);
2977			error = EFAULT;
2978			break;
2979		}
2980		suword32(&rwlock->rw_blocked_writers, blocked_writers + 1);
2981
2982		while ((state & URWLOCK_WRITE_OWNER) ||
2983		    URWLOCK_READER_COUNT(state) != 0) {
2984			umtxq_lock(&uq->uq_key);
2985			umtxq_insert_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2986			umtxq_unbusy(&uq->uq_key);
2987
2988			error = umtxq_sleep(uq, "uwrlck", timeout == NULL ?
2989			    NULL : &timo);
2990
2991			umtxq_busy(&uq->uq_key);
2992			umtxq_remove_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2993			umtxq_unlock(&uq->uq_key);
2994			if (error)
2995				break;
2996			rv = fueword32(&rwlock->rw_state, &state);
2997			if (rv == -1) {
2998				error = EFAULT;
2999				break;
3000			}
3001		}
3002
3003		rv = fueword32(&rwlock->rw_blocked_writers,
3004		    &blocked_writers);
3005		if (rv == -1) {
3006			umtxq_unbusy_unlocked(&uq->uq_key);
3007			error = EFAULT;
3008			break;
3009		}
3010		suword32(&rwlock->rw_blocked_writers, blocked_writers-1);
3011		if (blocked_writers == 1) {
3012			rv = fueword32(&rwlock->rw_state, &state);
3013			if (rv == -1) {
3014				umtxq_unbusy_unlocked(&uq->uq_key);
3015				error = EFAULT;
3016				break;
3017			}
3018			for (;;) {
3019				rv = casueword32(&rwlock->rw_state, state,
3020				    &oldstate, state & ~URWLOCK_WRITE_WAITERS);
3021				if (rv == -1) {
3022					error = EFAULT;
3023					break;
3024				}
3025				if (rv == 0) {
3026					MPASS(oldstate == state);
3027					break;
3028				}
3029				state = oldstate;
3030				error1 = thread_check_susp(td, false);
3031				/*
3032				 * We are leaving the URWLOCK_WRITE_WAITERS
3033				 * behind, but this should not harm the
3034				 * correctness.
3035				 */
3036				if (error1 != 0) {
3037					if (error == 0)
3038						error = error1;
3039					break;
3040				}
3041			}
3042			rv = fueword32(&rwlock->rw_blocked_readers,
3043			    &blocked_readers);
3044			if (rv == -1) {
3045				umtxq_unbusy_unlocked(&uq->uq_key);
3046				error = EFAULT;
3047				break;
3048			}
3049		} else
3050			blocked_readers = 0;
3051
3052		umtxq_unbusy_unlocked(&uq->uq_key);
3053	}
3054
3055	umtx_key_release(&uq->uq_key);
3056	if (error == ERESTART)
3057		error = EINTR;
3058	return (error);
3059}
3060
3061static int
3062do_rw_unlock(struct thread *td, struct urwlock *rwlock)
3063{
3064	struct umtx_q *uq;
3065	uint32_t flags;
3066	int32_t state, oldstate;
3067	int error, rv, q, count;
3068
3069	uq = td->td_umtxq;
3070	error = fueword32(&rwlock->rw_flags, &flags);
3071	if (error == -1)
3072		return (EFAULT);
3073	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
3074	if (error != 0)
3075		return (error);
3076
3077	error = fueword32(&rwlock->rw_state, &state);
3078	if (error == -1) {
3079		error = EFAULT;
3080		goto out;
3081	}
3082	if (state & URWLOCK_WRITE_OWNER) {
3083		for (;;) {
3084			rv = casueword32(&rwlock->rw_state, state,
3085			    &oldstate, state & ~URWLOCK_WRITE_OWNER);
3086			if (rv == -1) {
3087				error = EFAULT;
3088				goto out;
3089			}
3090			if (rv == 1) {
3091				state = oldstate;
3092				if (!(oldstate & URWLOCK_WRITE_OWNER)) {
3093					error = EPERM;
3094					goto out;
3095				}
3096				error = thread_check_susp(td, true);
3097				if (error != 0)
3098					goto out;
3099			} else
3100				break;
3101		}
3102	} else if (URWLOCK_READER_COUNT(state) != 0) {
3103		for (;;) {
3104			rv = casueword32(&rwlock->rw_state, state,
3105			    &oldstate, state - 1);
3106			if (rv == -1) {
3107				error = EFAULT;
3108				goto out;
3109			}
3110			if (rv == 1) {
3111				state = oldstate;
3112				if (URWLOCK_READER_COUNT(oldstate) == 0) {
3113					error = EPERM;
3114					goto out;
3115				}
3116				error = thread_check_susp(td, true);
3117				if (error != 0)
3118					goto out;
3119			} else
3120				break;
3121		}
3122	} else {
3123		error = EPERM;
3124		goto out;
3125	}
3126
3127	count = 0;
3128
3129	if (!(flags & URWLOCK_PREFER_READER)) {
3130		if (state & URWLOCK_WRITE_WAITERS) {
3131			count = 1;
3132			q = UMTX_EXCLUSIVE_QUEUE;
3133		} else if (state & URWLOCK_READ_WAITERS) {
3134			count = INT_MAX;
3135			q = UMTX_SHARED_QUEUE;
3136		}
3137	} else {
3138		if (state & URWLOCK_READ_WAITERS) {
3139			count = INT_MAX;
3140			q = UMTX_SHARED_QUEUE;
3141		} else if (state & URWLOCK_WRITE_WAITERS) {
3142			count = 1;
3143			q = UMTX_EXCLUSIVE_QUEUE;
3144		}
3145	}
3146
3147	if (count) {
3148		umtxq_lock(&uq->uq_key);
3149		umtxq_busy(&uq->uq_key);
3150		umtxq_signal_queue(&uq->uq_key, count, q);
3151		umtxq_unbusy(&uq->uq_key);
3152		umtxq_unlock(&uq->uq_key);
3153	}
3154out:
3155	umtx_key_release(&uq->uq_key);
3156	return (error);
3157}
3158
3159#if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
3160static int
3161do_sem_wait(struct thread *td, struct _usem *sem, struct _umtx_time *timeout)
3162{
3163	struct abs_timeout timo;
3164	struct umtx_q *uq;
3165	uint32_t flags, count, count1;
3166	int error, rv, rv1;
3167
3168	uq = td->td_umtxq;
3169	error = fueword32(&sem->_flags, &flags);
3170	if (error == -1)
3171		return (EFAULT);
3172	error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key);
3173	if (error != 0)
3174		return (error);
3175
3176	if (timeout != NULL)
3177		abs_timeout_init2(&timo, timeout);
3178
3179again:
3180	umtxq_lock(&uq->uq_key);
3181	umtxq_busy(&uq->uq_key);
3182	umtxq_insert(uq);
3183	umtxq_unlock(&uq->uq_key);
3184	rv = casueword32(&sem->_has_waiters, 0, &count1, 1);
3185	if (rv == 0)
3186		rv1 = fueword32(&sem->_count, &count);
3187	if (rv == -1 || (rv == 0 && (rv1 == -1 || count != 0)) ||
3188	    (rv == 1 && count1 == 0)) {
3189		umtxq_lock(&uq->uq_key);
3190		umtxq_unbusy(&uq->uq_key);
3191		umtxq_remove(uq);
3192		umtxq_unlock(&uq->uq_key);
3193		if (rv == 1) {
3194			rv = thread_check_susp(td, true);
3195			if (rv == 0)
3196				goto again;
3197			error = rv;
3198			goto out;
3199		}
3200		if (rv == 0)
3201			rv = rv1;
3202		error = rv == -1 ? EFAULT : 0;
3203		goto out;
3204	}
3205	umtxq_lock(&uq->uq_key);
3206	umtxq_unbusy(&uq->uq_key);
3207
3208	error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo);
3209
3210	if ((uq->uq_flags & UQF_UMTXQ) == 0)
3211		error = 0;
3212	else {
3213		umtxq_remove(uq);
3214		/* A relative timeout cannot be restarted. */
3215		if (error == ERESTART && timeout != NULL &&
3216		    (timeout->_flags & UMTX_ABSTIME) == 0)
3217			error = EINTR;
3218	}
3219	umtxq_unlock(&uq->uq_key);
3220out:
3221	umtx_key_release(&uq->uq_key);
3222	return (error);
3223}
3224
3225/*
3226 * Signal a userland semaphore.
3227 */
3228static int
3229do_sem_wake(struct thread *td, struct _usem *sem)
3230{
3231	struct umtx_key key;
3232	int error, cnt;
3233	uint32_t flags;
3234
3235	error = fueword32(&sem->_flags, &flags);
3236	if (error == -1)
3237		return (EFAULT);
3238	if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0)
3239		return (error);
3240	umtxq_lock(&key);
3241	umtxq_busy(&key);
3242	cnt = umtxq_count(&key);
3243	if (cnt > 0) {
3244		/*
3245		 * Check if count is greater than 0, this means the memory is
3246		 * still being referenced by user code, so we can safely
3247		 * update _has_waiters flag.
3248		 */
3249		if (cnt == 1) {
3250			umtxq_unlock(&key);
3251			error = suword32(&sem->_has_waiters, 0);
3252			umtxq_lock(&key);
3253			if (error == -1)
3254				error = EFAULT;
3255		}
3256		umtxq_signal(&key, 1);
3257	}
3258	umtxq_unbusy(&key);
3259	umtxq_unlock(&key);
3260	umtx_key_release(&key);
3261	return (error);
3262}
3263#endif
3264
3265static int
3266do_sem2_wait(struct thread *td, struct _usem2 *sem, struct _umtx_time *timeout)
3267{
3268	struct abs_timeout timo;
3269	struct umtx_q *uq;
3270	uint32_t count, flags;
3271	int error, rv;
3272
3273	uq = td->td_umtxq;
3274	flags = fuword32(&sem->_flags);
3275	if (timeout != NULL)
3276		abs_timeout_init2(&timo, timeout);
3277
3278again:
3279	error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key);
3280	if (error != 0)
3281		return (error);
3282	umtxq_lock(&uq->uq_key);
3283	umtxq_busy(&uq->uq_key);
3284	umtxq_insert(uq);
3285	umtxq_unlock(&uq->uq_key);
3286	rv = fueword32(&sem->_count, &count);
3287	if (rv == -1) {
3288		umtxq_lock(&uq->uq_key);
3289		umtxq_unbusy(&uq->uq_key);
3290		umtxq_remove(uq);
3291		umtxq_unlock(&uq->uq_key);
3292		umtx_key_release(&uq->uq_key);
3293		return (EFAULT);
3294	}
3295	for (;;) {
3296		if (USEM_COUNT(count) != 0) {
3297			umtxq_lock(&uq->uq_key);
3298			umtxq_unbusy(&uq->uq_key);
3299			umtxq_remove(uq);
3300			umtxq_unlock(&uq->uq_key);
3301			umtx_key_release(&uq->uq_key);
3302			return (0);
3303		}
3304		if (count == USEM_HAS_WAITERS)
3305			break;
3306		rv = casueword32(&sem->_count, 0, &count, USEM_HAS_WAITERS);
3307		if (rv == 0)
3308			break;
3309		umtxq_lock(&uq->uq_key);
3310		umtxq_unbusy(&uq->uq_key);
3311		umtxq_remove(uq);
3312		umtxq_unlock(&uq->uq_key);
3313		umtx_key_release(&uq->uq_key);
3314		if (rv == -1)
3315			return (EFAULT);
3316		rv = thread_check_susp(td, true);
3317		if (rv != 0)
3318			return (rv);
3319		goto again;
3320	}
3321	umtxq_lock(&uq->uq_key);
3322	umtxq_unbusy(&uq->uq_key);
3323
3324	error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo);
3325
3326	if ((uq->uq_flags & UQF_UMTXQ) == 0)
3327		error = 0;
3328	else {
3329		umtxq_remove(uq);
3330		if (timeout != NULL && (timeout->_flags & UMTX_ABSTIME) == 0) {
3331			/* A relative timeout cannot be restarted. */
3332			if (error == ERESTART)
3333				error = EINTR;
3334			if (error == EINTR) {
3335				abs_timeout_update(&timo);
3336				timespecsub(&timo.end, &timo.cur,
3337				    &timeout->_timeout);
3338			}
3339		}
3340	}
3341	umtxq_unlock(&uq->uq_key);
3342	umtx_key_release(&uq->uq_key);
3343	return (error);
3344}
3345
3346/*
3347 * Signal a userland semaphore.
3348 */
3349static int
3350do_sem2_wake(struct thread *td, struct _usem2 *sem)
3351{
3352	struct umtx_key key;
3353	int error, cnt, rv;
3354	uint32_t count, flags;
3355
3356	rv = fueword32(&sem->_flags, &flags);
3357	if (rv == -1)
3358		return (EFAULT);
3359	if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0)
3360		return (error);
3361	umtxq_lock(&key);
3362	umtxq_busy(&key);
3363	cnt = umtxq_count(&key);
3364	if (cnt > 0) {
3365		/*
3366		 * If this was the last sleeping thread, clear the waiters
3367		 * flag in _count.
3368		 */
3369		if (cnt == 1) {
3370			umtxq_unlock(&key);
3371			rv = fueword32(&sem->_count, &count);
3372			while (rv != -1 && count & USEM_HAS_WAITERS) {
3373				rv = casueword32(&sem->_count, count, &count,
3374				    count & ~USEM_HAS_WAITERS);
3375				if (rv == 1) {
3376					rv = thread_check_susp(td, true);
3377					if (rv != 0)
3378						break;
3379				}
3380			}
3381			if (rv == -1)
3382				error = EFAULT;
3383			else if (rv > 0) {
3384				error = rv;
3385			}
3386			umtxq_lock(&key);
3387		}
3388
3389		umtxq_signal(&key, 1);
3390	}
3391	umtxq_unbusy(&key);
3392	umtxq_unlock(&key);
3393	umtx_key_release(&key);
3394	return (error);
3395}
3396
3397inline int
3398umtx_copyin_timeout(const void *uaddr, struct timespec *tsp)
3399{
3400	int error;
3401
3402	error = copyin(uaddr, tsp, sizeof(*tsp));
3403	if (error == 0) {
3404		if (tsp->tv_sec < 0 ||
3405		    tsp->tv_nsec >= 1000000000 ||
3406		    tsp->tv_nsec < 0)
3407			error = EINVAL;
3408	}
3409	return (error);
3410}
3411
3412static inline int
3413umtx_copyin_umtx_time(const void *uaddr, size_t size, struct _umtx_time *tp)
3414{
3415	int error;
3416
3417	if (size <= sizeof(tp->_timeout)) {
3418		tp->_clockid = CLOCK_REALTIME;
3419		tp->_flags = 0;
3420		error = copyin(uaddr, &tp->_timeout, sizeof(tp->_timeout));
3421	} else
3422		error = copyin(uaddr, tp, sizeof(*tp));
3423	if (error != 0)
3424		return (error);
3425	if (tp->_timeout.tv_sec < 0 ||
3426	    tp->_timeout.tv_nsec >= 1000000000 || tp->_timeout.tv_nsec < 0)
3427		return (EINVAL);
3428	return (0);
3429}
3430
3431static int
3432umtx_copyin_robust_lists(const void *uaddr, size_t size,
3433    struct umtx_robust_lists_params *rb)
3434{
3435
3436	if (size > sizeof(*rb))
3437		return (EINVAL);
3438	return (copyin(uaddr, rb, size));
3439}
3440
3441static int
3442umtx_copyout_timeout(void *uaddr, size_t sz, struct timespec *tsp)
3443{
3444
3445	/*
3446	 * Should be guaranteed by the caller, sz == uaddr1 - sizeof(_umtx_time)
3447	 * and we're only called if sz >= sizeof(timespec) as supplied in the
3448	 * copyops.
3449	 */
3450	KASSERT(sz >= sizeof(*tsp),
3451	    ("umtx_copyops specifies incorrect sizes"));
3452
3453	return (copyout(tsp, uaddr, sizeof(*tsp)));
3454}
3455
3456static int
3457__umtx_op_unimpl(struct thread *td, struct _umtx_op_args *uap,
3458    const struct umtx_copyops *ops __unused)
3459{
3460
3461	return (EOPNOTSUPP);
3462}
3463
3464static int
3465__umtx_op_wait(struct thread *td, struct _umtx_op_args *uap,
3466    const struct umtx_copyops *ops)
3467{
3468	struct _umtx_time timeout, *tm_p;
3469	int error;
3470
3471	if (uap->uaddr2 == NULL)
3472		tm_p = NULL;
3473	else {
3474		error = ops->copyin_umtx_time(
3475		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3476		if (error != 0)
3477			return (error);
3478		tm_p = &timeout;
3479	}
3480	return (do_wait(td, uap->obj, uap->val, tm_p, ops->compat32, 0));
3481}
3482
3483static int
3484__umtx_op_wait_uint(struct thread *td, struct _umtx_op_args *uap,
3485    const struct umtx_copyops *ops)
3486{
3487	struct _umtx_time timeout, *tm_p;
3488	int error;
3489
3490	if (uap->uaddr2 == NULL)
3491		tm_p = NULL;
3492	else {
3493		error = ops->copyin_umtx_time(
3494		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3495		if (error != 0)
3496			return (error);
3497		tm_p = &timeout;
3498	}
3499	return (do_wait(td, uap->obj, uap->val, tm_p, 1, 0));
3500}
3501
3502static int
3503__umtx_op_wait_uint_private(struct thread *td, struct _umtx_op_args *uap,
3504    const struct umtx_copyops *ops)
3505{
3506	struct _umtx_time *tm_p, timeout;
3507	int error;
3508
3509	if (uap->uaddr2 == NULL)
3510		tm_p = NULL;
3511	else {
3512		error = ops->copyin_umtx_time(
3513		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3514		if (error != 0)
3515			return (error);
3516		tm_p = &timeout;
3517	}
3518	return (do_wait(td, uap->obj, uap->val, tm_p, 1, 1));
3519}
3520
3521static int
3522__umtx_op_wake(struct thread *td, struct _umtx_op_args *uap,
3523    const struct umtx_copyops *ops __unused)
3524{
3525
3526	return (kern_umtx_wake(td, uap->obj, uap->val, 0));
3527}
3528
3529#define BATCH_SIZE	128
3530static int
3531__umtx_op_nwake_private_native(struct thread *td, struct _umtx_op_args *uap)
3532{
3533	char *uaddrs[BATCH_SIZE], **upp;
3534	int count, error, i, pos, tocopy;
3535
3536	upp = (char **)uap->obj;
3537	error = 0;
3538	for (count = uap->val, pos = 0; count > 0; count -= tocopy,
3539	    pos += tocopy) {
3540		tocopy = MIN(count, BATCH_SIZE);
3541		error = copyin(upp + pos, uaddrs, tocopy * sizeof(char *));
3542		if (error != 0)
3543			break;
3544		for (i = 0; i < tocopy; ++i) {
3545			kern_umtx_wake(td, uaddrs[i], INT_MAX, 1);
3546		}
3547		maybe_yield();
3548	}
3549	return (error);
3550}
3551
3552static int
3553__umtx_op_nwake_private_compat32(struct thread *td, struct _umtx_op_args *uap)
3554{
3555	uint32_t uaddrs[BATCH_SIZE], *upp;
3556	int count, error, i, pos, tocopy;
3557
3558	upp = (uint32_t *)uap->obj;
3559	error = 0;
3560	for (count = uap->val, pos = 0; count > 0; count -= tocopy,
3561	    pos += tocopy) {
3562		tocopy = MIN(count, BATCH_SIZE);
3563		error = copyin(upp + pos, uaddrs, tocopy * sizeof(uint32_t));
3564		if (error != 0)
3565			break;
3566		for (i = 0; i < tocopy; ++i) {
3567			kern_umtx_wake(td, (void *)(uintptr_t)uaddrs[i],
3568			    INT_MAX, 1);
3569		}
3570		maybe_yield();
3571	}
3572	return (error);
3573}
3574
3575static int
3576__umtx_op_nwake_private(struct thread *td, struct _umtx_op_args *uap,
3577    const struct umtx_copyops *ops)
3578{
3579
3580	if (ops->compat32)
3581		return (__umtx_op_nwake_private_compat32(td, uap));
3582	return (__umtx_op_nwake_private_native(td, uap));
3583}
3584
3585static int
3586__umtx_op_wake_private(struct thread *td, struct _umtx_op_args *uap,
3587    const struct umtx_copyops *ops __unused)
3588{
3589
3590	return (kern_umtx_wake(td, uap->obj, uap->val, 1));
3591}
3592
3593static int
3594__umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap,
3595   const struct umtx_copyops *ops)
3596{
3597	struct _umtx_time *tm_p, timeout;
3598	int error;
3599
3600	/* Allow a null timespec (wait forever). */
3601	if (uap->uaddr2 == NULL)
3602		tm_p = NULL;
3603	else {
3604		error = ops->copyin_umtx_time(
3605		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3606		if (error != 0)
3607			return (error);
3608		tm_p = &timeout;
3609	}
3610	return (do_lock_umutex(td, uap->obj, tm_p, 0));
3611}
3612
3613static int
3614__umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap,
3615    const struct umtx_copyops *ops __unused)
3616{
3617
3618	return (do_lock_umutex(td, uap->obj, NULL, _UMUTEX_TRY));
3619}
3620
3621static int
3622__umtx_op_wait_umutex(struct thread *td, struct _umtx_op_args *uap,
3623    const struct umtx_copyops *ops)
3624{
3625	struct _umtx_time *tm_p, timeout;
3626	int error;
3627
3628	/* Allow a null timespec (wait forever). */
3629	if (uap->uaddr2 == NULL)
3630		tm_p = NULL;
3631	else {
3632		error = ops->copyin_umtx_time(
3633		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3634		if (error != 0)
3635			return (error);
3636		tm_p = &timeout;
3637	}
3638	return (do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT));
3639}
3640
3641static int
3642__umtx_op_wake_umutex(struct thread *td, struct _umtx_op_args *uap,
3643    const struct umtx_copyops *ops __unused)
3644{
3645
3646	return (do_wake_umutex(td, uap->obj));
3647}
3648
3649static int
3650__umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap,
3651    const struct umtx_copyops *ops __unused)
3652{
3653
3654	return (do_unlock_umutex(td, uap->obj, false));
3655}
3656
3657static int
3658__umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap,
3659    const struct umtx_copyops *ops __unused)
3660{
3661
3662	return (do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1));
3663}
3664
3665static int
3666__umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap,
3667    const struct umtx_copyops *ops)
3668{
3669	struct timespec *ts, timeout;
3670	int error;
3671
3672	/* Allow a null timespec (wait forever). */
3673	if (uap->uaddr2 == NULL)
3674		ts = NULL;
3675	else {
3676		error = ops->copyin_timeout(uap->uaddr2, &timeout);
3677		if (error != 0)
3678			return (error);
3679		ts = &timeout;
3680	}
3681	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3682}
3683
3684static int
3685__umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap,
3686    const struct umtx_copyops *ops __unused)
3687{
3688
3689	return (do_cv_signal(td, uap->obj));
3690}
3691
3692static int
3693__umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap,
3694    const struct umtx_copyops *ops __unused)
3695{
3696
3697	return (do_cv_broadcast(td, uap->obj));
3698}
3699
3700static int
3701__umtx_op_rw_rdlock(struct thread *td, struct _umtx_op_args *uap,
3702    const struct umtx_copyops *ops)
3703{
3704	struct _umtx_time timeout;
3705	int error;
3706
3707	/* Allow a null timespec (wait forever). */
3708	if (uap->uaddr2 == NULL) {
3709		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3710	} else {
3711		error = ops->copyin_umtx_time(uap->uaddr2,
3712		   (size_t)uap->uaddr1, &timeout);
3713		if (error != 0)
3714			return (error);
3715		error = do_rw_rdlock(td, uap->obj, uap->val, &timeout);
3716	}
3717	return (error);
3718}
3719
3720static int
3721__umtx_op_rw_wrlock(struct thread *td, struct _umtx_op_args *uap,
3722    const struct umtx_copyops *ops)
3723{
3724	struct _umtx_time timeout;
3725	int error;
3726
3727	/* Allow a null timespec (wait forever). */
3728	if (uap->uaddr2 == NULL) {
3729		error = do_rw_wrlock(td, uap->obj, 0);
3730	} else {
3731		error = ops->copyin_umtx_time(uap->uaddr2,
3732		   (size_t)uap->uaddr1, &timeout);
3733		if (error != 0)
3734			return (error);
3735
3736		error = do_rw_wrlock(td, uap->obj, &timeout);
3737	}
3738	return (error);
3739}
3740
3741static int
3742__umtx_op_rw_unlock(struct thread *td, struct _umtx_op_args *uap,
3743    const struct umtx_copyops *ops __unused)
3744{
3745
3746	return (do_rw_unlock(td, uap->obj));
3747}
3748
3749#if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
3750static int
3751__umtx_op_sem_wait(struct thread *td, struct _umtx_op_args *uap,
3752    const struct umtx_copyops *ops)
3753{
3754	struct _umtx_time *tm_p, timeout;
3755	int error;
3756
3757	/* Allow a null timespec (wait forever). */
3758	if (uap->uaddr2 == NULL)
3759		tm_p = NULL;
3760	else {
3761		error = ops->copyin_umtx_time(
3762		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3763		if (error != 0)
3764			return (error);
3765		tm_p = &timeout;
3766	}
3767	return (do_sem_wait(td, uap->obj, tm_p));
3768}
3769
3770static int
3771__umtx_op_sem_wake(struct thread *td, struct _umtx_op_args *uap,
3772    const struct umtx_copyops *ops __unused)
3773{
3774
3775	return (do_sem_wake(td, uap->obj));
3776}
3777#endif
3778
3779static int
3780__umtx_op_wake2_umutex(struct thread *td, struct _umtx_op_args *uap,
3781    const struct umtx_copyops *ops __unused)
3782{
3783
3784	return (do_wake2_umutex(td, uap->obj, uap->val));
3785}
3786
3787static int
3788__umtx_op_sem2_wait(struct thread *td, struct _umtx_op_args *uap,
3789    const struct umtx_copyops *ops)
3790{
3791	struct _umtx_time *tm_p, timeout;
3792	size_t uasize;
3793	int error;
3794
3795	/* Allow a null timespec (wait forever). */
3796	if (uap->uaddr2 == NULL) {
3797		uasize = 0;
3798		tm_p = NULL;
3799	} else {
3800		uasize = (size_t)uap->uaddr1;
3801		error = ops->copyin_umtx_time(uap->uaddr2, uasize, &timeout);
3802		if (error != 0)
3803			return (error);
3804		tm_p = &timeout;
3805	}
3806	error = do_sem2_wait(td, uap->obj, tm_p);
3807	if (error == EINTR && uap->uaddr2 != NULL &&
3808	    (timeout._flags & UMTX_ABSTIME) == 0 &&
3809	    uasize >= ops->umtx_time_sz + ops->timespec_sz) {
3810		error = ops->copyout_timeout(
3811		    (void *)((uintptr_t)uap->uaddr2 + ops->umtx_time_sz),
3812		    uasize - ops->umtx_time_sz, &timeout._timeout);
3813		if (error == 0) {
3814			error = EINTR;
3815		}
3816	}
3817
3818	return (error);
3819}
3820
3821static int
3822__umtx_op_sem2_wake(struct thread *td, struct _umtx_op_args *uap,
3823    const struct umtx_copyops *ops __unused)
3824{
3825
3826	return (do_sem2_wake(td, uap->obj));
3827}
3828
3829#define	USHM_OBJ_UMTX(o)						\
3830    ((struct umtx_shm_obj_list *)(&(o)->umtx_data))
3831
3832#define	USHMF_REG_LINKED	0x0001
3833#define	USHMF_OBJ_LINKED	0x0002
3834struct umtx_shm_reg {
3835	TAILQ_ENTRY(umtx_shm_reg) ushm_reg_link;
3836	LIST_ENTRY(umtx_shm_reg) ushm_obj_link;
3837	struct umtx_key		ushm_key;
3838	struct ucred		*ushm_cred;
3839	struct shmfd		*ushm_obj;
3840	u_int			ushm_refcnt;
3841	u_int			ushm_flags;
3842};
3843
3844LIST_HEAD(umtx_shm_obj_list, umtx_shm_reg);
3845TAILQ_HEAD(umtx_shm_reg_head, umtx_shm_reg);
3846
3847static uma_zone_t umtx_shm_reg_zone;
3848static struct umtx_shm_reg_head umtx_shm_registry[UMTX_CHAINS];
3849static struct mtx umtx_shm_lock;
3850static struct umtx_shm_reg_head umtx_shm_reg_delfree =
3851    TAILQ_HEAD_INITIALIZER(umtx_shm_reg_delfree);
3852
3853static void umtx_shm_free_reg(struct umtx_shm_reg *reg);
3854
3855static void
3856umtx_shm_reg_delfree_tq(void *context __unused, int pending __unused)
3857{
3858	struct umtx_shm_reg_head d;
3859	struct umtx_shm_reg *reg, *reg1;
3860
3861	TAILQ_INIT(&d);
3862	mtx_lock(&umtx_shm_lock);
3863	TAILQ_CONCAT(&d, &umtx_shm_reg_delfree, ushm_reg_link);
3864	mtx_unlock(&umtx_shm_lock);
3865	TAILQ_FOREACH_SAFE(reg, &d, ushm_reg_link, reg1) {
3866		TAILQ_REMOVE(&d, reg, ushm_reg_link);
3867		umtx_shm_free_reg(reg);
3868	}
3869}
3870
3871static struct task umtx_shm_reg_delfree_task =
3872    TASK_INITIALIZER(0, umtx_shm_reg_delfree_tq, NULL);
3873
3874static struct umtx_shm_reg *
3875umtx_shm_find_reg_locked(const struct umtx_key *key)
3876{
3877	struct umtx_shm_reg *reg;
3878	struct umtx_shm_reg_head *reg_head;
3879
3880	KASSERT(key->shared, ("umtx_p_find_rg: private key"));
3881	mtx_assert(&umtx_shm_lock, MA_OWNED);
3882	reg_head = &umtx_shm_registry[key->hash];
3883	TAILQ_FOREACH(reg, reg_head, ushm_reg_link) {
3884		KASSERT(reg->ushm_key.shared,
3885		    ("non-shared key on reg %p %d", reg, reg->ushm_key.shared));
3886		if (reg->ushm_key.info.shared.object ==
3887		    key->info.shared.object &&
3888		    reg->ushm_key.info.shared.offset ==
3889		    key->info.shared.offset) {
3890			KASSERT(reg->ushm_key.type == TYPE_SHM, ("TYPE_USHM"));
3891			KASSERT(reg->ushm_refcnt > 0,
3892			    ("reg %p refcnt 0 onlist", reg));
3893			KASSERT((reg->ushm_flags & USHMF_REG_LINKED) != 0,
3894			    ("reg %p not linked", reg));
3895			reg->ushm_refcnt++;
3896			return (reg);
3897		}
3898	}
3899	return (NULL);
3900}
3901
3902static struct umtx_shm_reg *
3903umtx_shm_find_reg(const struct umtx_key *key)
3904{
3905	struct umtx_shm_reg *reg;
3906
3907	mtx_lock(&umtx_shm_lock);
3908	reg = umtx_shm_find_reg_locked(key);
3909	mtx_unlock(&umtx_shm_lock);
3910	return (reg);
3911}
3912
3913static void
3914umtx_shm_free_reg(struct umtx_shm_reg *reg)
3915{
3916
3917	chgumtxcnt(reg->ushm_cred->cr_ruidinfo, -1, 0);
3918	crfree(reg->ushm_cred);
3919	shm_drop(reg->ushm_obj);
3920	uma_zfree(umtx_shm_reg_zone, reg);
3921}
3922
3923static bool
3924umtx_shm_unref_reg_locked(struct umtx_shm_reg *reg, bool force)
3925{
3926	bool res;
3927
3928	mtx_assert(&umtx_shm_lock, MA_OWNED);
3929	KASSERT(reg->ushm_refcnt > 0, ("ushm_reg %p refcnt 0", reg));
3930	reg->ushm_refcnt--;
3931	res = reg->ushm_refcnt == 0;
3932	if (res || force) {
3933		if ((reg->ushm_flags & USHMF_REG_LINKED) != 0) {
3934			TAILQ_REMOVE(&umtx_shm_registry[reg->ushm_key.hash],
3935			    reg, ushm_reg_link);
3936			reg->ushm_flags &= ~USHMF_REG_LINKED;
3937		}
3938		if ((reg->ushm_flags & USHMF_OBJ_LINKED) != 0) {
3939			LIST_REMOVE(reg, ushm_obj_link);
3940			reg->ushm_flags &= ~USHMF_OBJ_LINKED;
3941		}
3942	}
3943	return (res);
3944}
3945
3946static void
3947umtx_shm_unref_reg(struct umtx_shm_reg *reg, bool force)
3948{
3949	vm_object_t object;
3950	bool dofree;
3951
3952	if (force) {
3953		object = reg->ushm_obj->shm_object;
3954		VM_OBJECT_WLOCK(object);
3955		object->flags |= OBJ_UMTXDEAD;
3956		VM_OBJECT_WUNLOCK(object);
3957	}
3958	mtx_lock(&umtx_shm_lock);
3959	dofree = umtx_shm_unref_reg_locked(reg, force);
3960	mtx_unlock(&umtx_shm_lock);
3961	if (dofree)
3962		umtx_shm_free_reg(reg);
3963}
3964
3965void
3966umtx_shm_object_init(vm_object_t object)
3967{
3968
3969	LIST_INIT(USHM_OBJ_UMTX(object));
3970}
3971
3972void
3973umtx_shm_object_terminated(vm_object_t object)
3974{
3975	struct umtx_shm_reg *reg, *reg1;
3976	bool dofree;
3977
3978	dofree = false;
3979	mtx_lock(&umtx_shm_lock);
3980	LIST_FOREACH_SAFE(reg, USHM_OBJ_UMTX(object), ushm_obj_link, reg1) {
3981		if (umtx_shm_unref_reg_locked(reg, true)) {
3982			TAILQ_INSERT_TAIL(&umtx_shm_reg_delfree, reg,
3983			    ushm_reg_link);
3984			dofree = true;
3985		}
3986	}
3987	mtx_unlock(&umtx_shm_lock);
3988	if (dofree)
3989		taskqueue_enqueue(taskqueue_thread, &umtx_shm_reg_delfree_task);
3990}
3991
3992static int
3993umtx_shm_create_reg(struct thread *td, const struct umtx_key *key,
3994    struct umtx_shm_reg **res)
3995{
3996	struct umtx_shm_reg *reg, *reg1;
3997	struct ucred *cred;
3998	int error;
3999
4000	reg = umtx_shm_find_reg(key);
4001	if (reg != NULL) {
4002		*res = reg;
4003		return (0);
4004	}
4005	cred = td->td_ucred;
4006	if (!chgumtxcnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_UMTXP)))
4007		return (ENOMEM);
4008	reg = uma_zalloc(umtx_shm_reg_zone, M_WAITOK | M_ZERO);
4009	reg->ushm_refcnt = 1;
4010	bcopy(key, &reg->ushm_key, sizeof(*key));
4011	reg->ushm_obj = shm_alloc(td->td_ucred, O_RDWR);
4012	reg->ushm_cred = crhold(cred);
4013	error = shm_dotruncate(reg->ushm_obj, PAGE_SIZE);
4014	if (error != 0) {
4015		umtx_shm_free_reg(reg);
4016		return (error);
4017	}
4018	mtx_lock(&umtx_shm_lock);
4019	reg1 = umtx_shm_find_reg_locked(key);
4020	if (reg1 != NULL) {
4021		mtx_unlock(&umtx_shm_lock);
4022		umtx_shm_free_reg(reg);
4023		*res = reg1;
4024		return (0);
4025	}
4026	reg->ushm_refcnt++;
4027	TAILQ_INSERT_TAIL(&umtx_shm_registry[key->hash], reg, ushm_reg_link);
4028	LIST_INSERT_HEAD(USHM_OBJ_UMTX(key->info.shared.object), reg,
4029	    ushm_obj_link);
4030	reg->ushm_flags = USHMF_REG_LINKED | USHMF_OBJ_LINKED;
4031	mtx_unlock(&umtx_shm_lock);
4032	*res = reg;
4033	return (0);
4034}
4035
4036static int
4037umtx_shm_alive(struct thread *td, void *addr)
4038{
4039	vm_map_t map;
4040	vm_map_entry_t entry;
4041	vm_object_t object;
4042	vm_pindex_t pindex;
4043	vm_prot_t prot;
4044	int res, ret;
4045	boolean_t wired;
4046
4047	map = &td->td_proc->p_vmspace->vm_map;
4048	res = vm_map_lookup(&map, (uintptr_t)addr, VM_PROT_READ, &entry,
4049	    &object, &pindex, &prot, &wired);
4050	if (res != KERN_SUCCESS)
4051		return (EFAULT);
4052	if (object == NULL)
4053		ret = EINVAL;
4054	else
4055		ret = (object->flags & OBJ_UMTXDEAD) != 0 ? ENOTTY : 0;
4056	vm_map_lookup_done(map, entry);
4057	return (ret);
4058}
4059
4060static void
4061umtx_shm_init(void)
4062{
4063	int i;
4064
4065	umtx_shm_reg_zone = uma_zcreate("umtx_shm", sizeof(struct umtx_shm_reg),
4066	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
4067	mtx_init(&umtx_shm_lock, "umtxshm", NULL, MTX_DEF);
4068	for (i = 0; i < nitems(umtx_shm_registry); i++)
4069		TAILQ_INIT(&umtx_shm_registry[i]);
4070}
4071
4072static int
4073umtx_shm(struct thread *td, void *addr, u_int flags)
4074{
4075	struct umtx_key key;
4076	struct umtx_shm_reg *reg;
4077	struct file *fp;
4078	int error, fd;
4079
4080	if (__bitcount(flags & (UMTX_SHM_CREAT | UMTX_SHM_LOOKUP |
4081	    UMTX_SHM_DESTROY| UMTX_SHM_ALIVE)) != 1)
4082		return (EINVAL);
4083	if ((flags & UMTX_SHM_ALIVE) != 0)
4084		return (umtx_shm_alive(td, addr));
4085	error = umtx_key_get(addr, TYPE_SHM, PROCESS_SHARE, &key);
4086	if (error != 0)
4087		return (error);
4088	KASSERT(key.shared == 1, ("non-shared key"));
4089	if ((flags & UMTX_SHM_CREAT) != 0) {
4090		error = umtx_shm_create_reg(td, &key, &reg);
4091	} else {
4092		reg = umtx_shm_find_reg(&key);
4093		if (reg == NULL)
4094			error = ESRCH;
4095	}
4096	umtx_key_release(&key);
4097	if (error != 0)
4098		return (error);
4099	KASSERT(reg != NULL, ("no reg"));
4100	if ((flags & UMTX_SHM_DESTROY) != 0) {
4101		umtx_shm_unref_reg(reg, true);
4102	} else {
4103#if 0
4104#ifdef MAC
4105		error = mac_posixshm_check_open(td->td_ucred,
4106		    reg->ushm_obj, FFLAGS(O_RDWR));
4107		if (error == 0)
4108#endif
4109			error = shm_access(reg->ushm_obj, td->td_ucred,
4110			    FFLAGS(O_RDWR));
4111		if (error == 0)
4112#endif
4113			error = falloc_caps(td, &fp, &fd, O_CLOEXEC, NULL);
4114		if (error == 0) {
4115			shm_hold(reg->ushm_obj);
4116			finit(fp, FFLAGS(O_RDWR), DTYPE_SHM, reg->ushm_obj,
4117			    &shm_ops);
4118			td->td_retval[0] = fd;
4119			fdrop(fp, td);
4120		}
4121	}
4122	umtx_shm_unref_reg(reg, false);
4123	return (error);
4124}
4125
4126static int
4127__umtx_op_shm(struct thread *td, struct _umtx_op_args *uap,
4128    const struct umtx_copyops *ops __unused)
4129{
4130
4131	return (umtx_shm(td, uap->uaddr1, uap->val));
4132}
4133
4134static int
4135__umtx_op_robust_lists(struct thread *td, struct _umtx_op_args *uap,
4136    const struct umtx_copyops *ops)
4137{
4138	struct umtx_robust_lists_params rb;
4139	int error;
4140
4141	if (ops->compat32) {
4142		if ((td->td_pflags2 & TDP2_COMPAT32RB) == 0 &&
4143		    (td->td_rb_list != 0 || td->td_rbp_list != 0 ||
4144		    td->td_rb_inact != 0))
4145			return (EBUSY);
4146	} else if ((td->td_pflags2 & TDP2_COMPAT32RB) != 0) {
4147		return (EBUSY);
4148	}
4149
4150	bzero(&rb, sizeof(rb));
4151	error = ops->copyin_robust_lists(uap->uaddr1, uap->val, &rb);
4152	if (error != 0)
4153		return (error);
4154
4155	if (ops->compat32)
4156		td->td_pflags2 |= TDP2_COMPAT32RB;
4157
4158	td->td_rb_list = rb.robust_list_offset;
4159	td->td_rbp_list = rb.robust_priv_list_offset;
4160	td->td_rb_inact = rb.robust_inact_offset;
4161	return (0);
4162}
4163
4164#if defined(__i386__) || defined(__amd64__)
4165/*
4166 * Provide the standard 32-bit definitions for x86, since native/compat32 use a
4167 * 32-bit time_t there.  Other architectures just need the i386 definitions
4168 * along with their standard compat32.
4169 */
4170struct timespecx32 {
4171	int64_t			tv_sec;
4172	int32_t			tv_nsec;
4173};
4174
4175struct umtx_timex32 {
4176	struct	timespecx32	_timeout;
4177	uint32_t		_flags;
4178	uint32_t		_clockid;
4179};
4180
4181#ifndef __i386__
4182#define	timespeci386	timespec32
4183#define	umtx_timei386	umtx_time32
4184#endif
4185#else /* !__i386__ && !__amd64__ */
4186/* 32-bit architectures can emulate i386, so define these almost everywhere. */
4187struct timespeci386 {
4188	int32_t			tv_sec;
4189	int32_t			tv_nsec;
4190};
4191
4192struct umtx_timei386 {
4193	struct	timespeci386	_timeout;
4194	uint32_t		_flags;
4195	uint32_t		_clockid;
4196};
4197
4198#if defined(__LP64__)
4199#define	timespecx32	timespec32
4200#define	umtx_timex32	umtx_time32
4201#endif
4202#endif
4203
4204static int
4205umtx_copyin_robust_lists32(const void *uaddr, size_t size,
4206    struct umtx_robust_lists_params *rbp)
4207{
4208	struct umtx_robust_lists_params_compat32 rb32;
4209	int error;
4210
4211	if (size > sizeof(rb32))
4212		return (EINVAL);
4213	bzero(&rb32, sizeof(rb32));
4214	error = copyin(uaddr, &rb32, size);
4215	if (error != 0)
4216		return (error);
4217	CP(rb32, *rbp, robust_list_offset);
4218	CP(rb32, *rbp, robust_priv_list_offset);
4219	CP(rb32, *rbp, robust_inact_offset);
4220	return (0);
4221}
4222
4223#ifndef __i386__
4224static inline int
4225umtx_copyin_timeouti386(const void *uaddr, struct timespec *tsp)
4226{
4227	struct timespeci386 ts32;
4228	int error;
4229
4230	error = copyin(uaddr, &ts32, sizeof(ts32));
4231	if (error == 0) {
4232		if (ts32.tv_sec < 0 ||
4233		    ts32.tv_nsec >= 1000000000 ||
4234		    ts32.tv_nsec < 0)
4235			error = EINVAL;
4236		else {
4237			CP(ts32, *tsp, tv_sec);
4238			CP(ts32, *tsp, tv_nsec);
4239		}
4240	}
4241	return (error);
4242}
4243
4244static inline int
4245umtx_copyin_umtx_timei386(const void *uaddr, size_t size, struct _umtx_time *tp)
4246{
4247	struct umtx_timei386 t32;
4248	int error;
4249
4250	t32._clockid = CLOCK_REALTIME;
4251	t32._flags   = 0;
4252	if (size <= sizeof(t32._timeout))
4253		error = copyin(uaddr, &t32._timeout, sizeof(t32._timeout));
4254	else
4255		error = copyin(uaddr, &t32, sizeof(t32));
4256	if (error != 0)
4257		return (error);
4258	if (t32._timeout.tv_sec < 0 ||
4259	    t32._timeout.tv_nsec >= 1000000000 || t32._timeout.tv_nsec < 0)
4260		return (EINVAL);
4261	TS_CP(t32, *tp, _timeout);
4262	CP(t32, *tp, _flags);
4263	CP(t32, *tp, _clockid);
4264	return (0);
4265}
4266
4267static int
4268umtx_copyout_timeouti386(void *uaddr, size_t sz, struct timespec *tsp)
4269{
4270	struct timespeci386 remain32 = {
4271		.tv_sec = tsp->tv_sec,
4272		.tv_nsec = tsp->tv_nsec,
4273	};
4274
4275	/*
4276	 * Should be guaranteed by the caller, sz == uaddr1 - sizeof(_umtx_time)
4277	 * and we're only called if sz >= sizeof(timespec) as supplied in the
4278	 * copyops.
4279	 */
4280	KASSERT(sz >= sizeof(remain32),
4281	    ("umtx_copyops specifies incorrect sizes"));
4282
4283	return (copyout(&remain32, uaddr, sizeof(remain32)));
4284}
4285#endif /* !__i386__ */
4286
4287#if defined(__i386__) || defined(__LP64__)
4288static inline int
4289umtx_copyin_timeoutx32(const void *uaddr, struct timespec *tsp)
4290{
4291	struct timespecx32 ts32;
4292	int error;
4293
4294	error = copyin(uaddr, &ts32, sizeof(ts32));
4295	if (error == 0) {
4296		if (ts32.tv_sec < 0 ||
4297		    ts32.tv_nsec >= 1000000000 ||
4298		    ts32.tv_nsec < 0)
4299			error = EINVAL;
4300		else {
4301			CP(ts32, *tsp, tv_sec);
4302			CP(ts32, *tsp, tv_nsec);
4303		}
4304	}
4305	return (error);
4306}
4307
4308static inline int
4309umtx_copyin_umtx_timex32(const void *uaddr, size_t size, struct _umtx_time *tp)
4310{
4311	struct umtx_timex32 t32;
4312	int error;
4313
4314	t32._clockid = CLOCK_REALTIME;
4315	t32._flags   = 0;
4316	if (size <= sizeof(t32._timeout))
4317		error = copyin(uaddr, &t32._timeout, sizeof(t32._timeout));
4318	else
4319		error = copyin(uaddr, &t32, sizeof(t32));
4320	if (error != 0)
4321		return (error);
4322	if (t32._timeout.tv_sec < 0 ||
4323	    t32._timeout.tv_nsec >= 1000000000 || t32._timeout.tv_nsec < 0)
4324		return (EINVAL);
4325	TS_CP(t32, *tp, _timeout);
4326	CP(t32, *tp, _flags);
4327	CP(t32, *tp, _clockid);
4328	return (0);
4329}
4330
4331static int
4332umtx_copyout_timeoutx32(void *uaddr, size_t sz, struct timespec *tsp)
4333{
4334	struct timespecx32 remain32 = {
4335		.tv_sec = tsp->tv_sec,
4336		.tv_nsec = tsp->tv_nsec,
4337	};
4338
4339	/*
4340	 * Should be guaranteed by the caller, sz == uaddr1 - sizeof(_umtx_time)
4341	 * and we're only called if sz >= sizeof(timespec) as supplied in the
4342	 * copyops.
4343	 */
4344	KASSERT(sz >= sizeof(remain32),
4345	    ("umtx_copyops specifies incorrect sizes"));
4346
4347	return (copyout(&remain32, uaddr, sizeof(remain32)));
4348}
4349#endif /* __i386__ || __LP64__ */
4350
4351typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap,
4352    const struct umtx_copyops *umtx_ops);
4353
4354static const _umtx_op_func op_table[] = {
4355	[UMTX_OP_RESERVED0]	= __umtx_op_unimpl,
4356	[UMTX_OP_RESERVED1]	= __umtx_op_unimpl,
4357	[UMTX_OP_WAIT]		= __umtx_op_wait,
4358	[UMTX_OP_WAKE]		= __umtx_op_wake,
4359	[UMTX_OP_MUTEX_TRYLOCK]	= __umtx_op_trylock_umutex,
4360	[UMTX_OP_MUTEX_LOCK]	= __umtx_op_lock_umutex,
4361	[UMTX_OP_MUTEX_UNLOCK]	= __umtx_op_unlock_umutex,
4362	[UMTX_OP_SET_CEILING]	= __umtx_op_set_ceiling,
4363	[UMTX_OP_CV_WAIT]	= __umtx_op_cv_wait,
4364	[UMTX_OP_CV_SIGNAL]	= __umtx_op_cv_signal,
4365	[UMTX_OP_CV_BROADCAST]	= __umtx_op_cv_broadcast,
4366	[UMTX_OP_WAIT_UINT]	= __umtx_op_wait_uint,
4367	[UMTX_OP_RW_RDLOCK]	= __umtx_op_rw_rdlock,
4368	[UMTX_OP_RW_WRLOCK]	= __umtx_op_rw_wrlock,
4369	[UMTX_OP_RW_UNLOCK]	= __umtx_op_rw_unlock,
4370	[UMTX_OP_WAIT_UINT_PRIVATE] = __umtx_op_wait_uint_private,
4371	[UMTX_OP_WAKE_PRIVATE]	= __umtx_op_wake_private,
4372	[UMTX_OP_MUTEX_WAIT]	= __umtx_op_wait_umutex,
4373	[UMTX_OP_MUTEX_WAKE]	= __umtx_op_wake_umutex,
4374#if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
4375	[UMTX_OP_SEM_WAIT]	= __umtx_op_sem_wait,
4376	[UMTX_OP_SEM_WAKE]	= __umtx_op_sem_wake,
4377#else
4378	[UMTX_OP_SEM_WAIT]	= __umtx_op_unimpl,
4379	[UMTX_OP_SEM_WAKE]	= __umtx_op_unimpl,
4380#endif
4381	[UMTX_OP_NWAKE_PRIVATE]	= __umtx_op_nwake_private,
4382	[UMTX_OP_MUTEX_WAKE2]	= __umtx_op_wake2_umutex,
4383	[UMTX_OP_SEM2_WAIT]	= __umtx_op_sem2_wait,
4384	[UMTX_OP_SEM2_WAKE]	= __umtx_op_sem2_wake,
4385	[UMTX_OP_SHM]		= __umtx_op_shm,
4386	[UMTX_OP_ROBUST_LISTS]	= __umtx_op_robust_lists,
4387};
4388
4389static const struct umtx_copyops umtx_native_ops = {
4390	.copyin_timeout = umtx_copyin_timeout,
4391	.copyin_umtx_time = umtx_copyin_umtx_time,
4392	.copyin_robust_lists = umtx_copyin_robust_lists,
4393	.copyout_timeout = umtx_copyout_timeout,
4394	.timespec_sz = sizeof(struct timespec),
4395	.umtx_time_sz = sizeof(struct _umtx_time),
4396};
4397
4398#ifndef __i386__
4399static const struct umtx_copyops umtx_native_opsi386 = {
4400	.copyin_timeout = umtx_copyin_timeouti386,
4401	.copyin_umtx_time = umtx_copyin_umtx_timei386,
4402	.copyin_robust_lists = umtx_copyin_robust_lists32,
4403	.copyout_timeout = umtx_copyout_timeouti386,
4404	.timespec_sz = sizeof(struct timespeci386),
4405	.umtx_time_sz = sizeof(struct umtx_timei386),
4406	.compat32 = true,
4407};
4408#endif
4409
4410#if defined(__i386__) || defined(__LP64__)
4411/* i386 can emulate other 32-bit archs, too! */
4412static const struct umtx_copyops umtx_native_opsx32 = {
4413	.copyin_timeout = umtx_copyin_timeoutx32,
4414	.copyin_umtx_time = umtx_copyin_umtx_timex32,
4415	.copyin_robust_lists = umtx_copyin_robust_lists32,
4416	.copyout_timeout = umtx_copyout_timeoutx32,
4417	.timespec_sz = sizeof(struct timespecx32),
4418	.umtx_time_sz = sizeof(struct umtx_timex32),
4419	.compat32 = true,
4420};
4421
4422#ifdef COMPAT_FREEBSD32
4423#ifdef __amd64__
4424#define	umtx_native_ops32	umtx_native_opsi386
4425#else
4426#define	umtx_native_ops32	umtx_native_opsx32
4427#endif
4428#endif /* COMPAT_FREEBSD32 */
4429#endif /* __i386__ || __LP64__ */
4430
4431#define	UMTX_OP__FLAGS	(UMTX_OP__32BIT | UMTX_OP__I386)
4432
4433static int
4434kern__umtx_op(struct thread *td, void *obj, int op, unsigned long val,
4435    void *uaddr1, void *uaddr2, const struct umtx_copyops *ops)
4436{
4437	struct _umtx_op_args uap = {
4438		.obj = obj,
4439		.op = op & ~UMTX_OP__FLAGS,
4440		.val = val,
4441		.uaddr1 = uaddr1,
4442		.uaddr2 = uaddr2
4443	};
4444
4445	if ((uap.op >= nitems(op_table)))
4446		return (EINVAL);
4447	return ((*op_table[uap.op])(td, &uap, ops));
4448}
4449
4450int
4451sys__umtx_op(struct thread *td, struct _umtx_op_args *uap)
4452{
4453	static const struct umtx_copyops *umtx_ops;
4454
4455	umtx_ops = &umtx_native_ops;
4456#ifdef __LP64__
4457	if ((uap->op & (UMTX_OP__32BIT | UMTX_OP__I386)) != 0) {
4458		if ((uap->op & UMTX_OP__I386) != 0)
4459			umtx_ops = &umtx_native_opsi386;
4460		else
4461			umtx_ops = &umtx_native_opsx32;
4462	}
4463#elif !defined(__i386__)
4464	/* We consider UMTX_OP__32BIT a nop on !i386 ILP32. */
4465	if ((uap->op & UMTX_OP__I386) != 0)
4466		umtx_ops = &umtx_native_opsi386;
4467#else
4468	/* Likewise, UMTX_OP__I386 is a nop on i386. */
4469	if ((uap->op & UMTX_OP__32BIT) != 0)
4470		umtx_ops = &umtx_native_opsx32;
4471#endif
4472	return (kern__umtx_op(td, uap->obj, uap->op, uap->val, uap->uaddr1,
4473	    uap->uaddr2, umtx_ops));
4474}
4475
4476#ifdef COMPAT_FREEBSD32
4477int
4478freebsd32__umtx_op(struct thread *td, struct freebsd32__umtx_op_args *uap)
4479{
4480
4481	return (kern__umtx_op(td, uap->obj, uap->op, uap->val, uap->uaddr,
4482	    uap->uaddr2, &umtx_native_ops32));
4483}
4484#endif
4485
4486void
4487umtx_thread_init(struct thread *td)
4488{
4489
4490	td->td_umtxq = umtxq_alloc();
4491	td->td_umtxq->uq_thread = td;
4492}
4493
4494void
4495umtx_thread_fini(struct thread *td)
4496{
4497
4498	umtxq_free(td->td_umtxq);
4499}
4500
4501/*
4502 * It will be called when new thread is created, e.g fork().
4503 */
4504void
4505umtx_thread_alloc(struct thread *td)
4506{
4507	struct umtx_q *uq;
4508
4509	uq = td->td_umtxq;
4510	uq->uq_inherited_pri = PRI_MAX;
4511
4512	KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
4513	KASSERT(uq->uq_thread == td, ("uq_thread != td"));
4514	KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
4515	KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
4516}
4517
4518/*
4519 * exec() hook.
4520 *
4521 * Clear robust lists for all process' threads, not delaying the
4522 * cleanup to thread exit, since the relevant address space is
4523 * destroyed right now.
4524 */
4525void
4526umtx_exec(struct proc *p)
4527{
4528	struct thread *td;
4529
4530	KASSERT(p == curproc, ("need curproc"));
4531	PROC_LOCK(p);
4532	KASSERT((p->p_flag & P_HADTHREADS) == 0 ||
4533	    (p->p_flag & P_STOPPED_SINGLE) != 0,
4534	    ("curproc must be single-threaded"));
4535	FOREACH_THREAD_IN_PROC(p, td) {
4536		KASSERT(td == curthread ||
4537		    ((td->td_flags & TDF_BOUNDARY) != 0 && TD_IS_SUSPENDED(td)),
4538		    ("running thread %p %p", p, td));
4539		PROC_UNLOCK(p);
4540		umtx_thread_cleanup(td);
4541		PROC_LOCK(p);
4542		td->td_rb_list = td->td_rbp_list = td->td_rb_inact = 0;
4543	}
4544	PROC_UNLOCK(p);
4545}
4546
4547/*
4548 * thread exit hook.
4549 */
4550void
4551umtx_thread_exit(struct thread *td)
4552{
4553
4554	umtx_thread_cleanup(td);
4555}
4556
4557static int
4558umtx_read_uptr(struct thread *td, uintptr_t ptr, uintptr_t *res, bool compat32)
4559{
4560	u_long res1;
4561	uint32_t res32;
4562	int error;
4563
4564	if (compat32) {
4565		error = fueword32((void *)ptr, &res32);
4566		if (error == 0)
4567			res1 = res32;
4568	} else {
4569		error = fueword((void *)ptr, &res1);
4570	}
4571	if (error == 0)
4572		*res = res1;
4573	else
4574		error = EFAULT;
4575	return (error);
4576}
4577
4578static void
4579umtx_read_rb_list(struct thread *td, struct umutex *m, uintptr_t *rb_list,
4580    bool compat32)
4581{
4582	struct umutex32 m32;
4583
4584	if (compat32) {
4585		memcpy(&m32, m, sizeof(m32));
4586		*rb_list = m32.m_rb_lnk;
4587	} else {
4588		*rb_list = m->m_rb_lnk;
4589	}
4590}
4591
4592static int
4593umtx_handle_rb(struct thread *td, uintptr_t rbp, uintptr_t *rb_list, bool inact,
4594    bool compat32)
4595{
4596	struct umutex m;
4597	int error;
4598
4599	KASSERT(td->td_proc == curproc, ("need current vmspace"));
4600	error = copyin((void *)rbp, &m, sizeof(m));
4601	if (error != 0)
4602		return (error);
4603	if (rb_list != NULL)
4604		umtx_read_rb_list(td, &m, rb_list, compat32);
4605	if ((m.m_flags & UMUTEX_ROBUST) == 0)
4606		return (EINVAL);
4607	if ((m.m_owner & ~UMUTEX_CONTESTED) != td->td_tid)
4608		/* inact is cleared after unlock, allow the inconsistency */
4609		return (inact ? 0 : EINVAL);
4610	return (do_unlock_umutex(td, (struct umutex *)rbp, true));
4611}
4612
4613static void
4614umtx_cleanup_rb_list(struct thread *td, uintptr_t rb_list, uintptr_t *rb_inact,
4615    const char *name, bool compat32)
4616{
4617	int error, i;
4618	uintptr_t rbp;
4619	bool inact;
4620
4621	if (rb_list == 0)
4622		return;
4623	error = umtx_read_uptr(td, rb_list, &rbp, compat32);
4624	for (i = 0; error == 0 && rbp != 0 && i < umtx_max_rb; i++) {
4625		if (rbp == *rb_inact) {
4626			inact = true;
4627			*rb_inact = 0;
4628		} else
4629			inact = false;
4630		error = umtx_handle_rb(td, rbp, &rbp, inact, compat32);
4631	}
4632	if (i == umtx_max_rb && umtx_verbose_rb) {
4633		uprintf("comm %s pid %d: reached umtx %smax rb %d\n",
4634		    td->td_proc->p_comm, td->td_proc->p_pid, name, umtx_max_rb);
4635	}
4636	if (error != 0 && umtx_verbose_rb) {
4637		uprintf("comm %s pid %d: handling %srb error %d\n",
4638		    td->td_proc->p_comm, td->td_proc->p_pid, name, error);
4639	}
4640}
4641
4642/*
4643 * Clean up umtx data.
4644 */
4645static void
4646umtx_thread_cleanup(struct thread *td)
4647{
4648	struct umtx_q *uq;
4649	struct umtx_pi *pi;
4650	uintptr_t rb_inact;
4651	bool compat32;
4652
4653	/*
4654	 * Disown pi mutexes.
4655	 */
4656	uq = td->td_umtxq;
4657	if (uq != NULL) {
4658		mtx_lock(&umtx_lock);
4659		uq->uq_inherited_pri = PRI_MAX;
4660		while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
4661			pi->pi_owner = NULL;
4662			TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
4663		}
4664		mtx_unlock(&umtx_lock);
4665		thread_lock(td);
4666		sched_lend_user_prio(td, PRI_MAX);
4667		thread_unlock(td);
4668	}
4669
4670	compat32 = (td->td_pflags2 & TDP2_COMPAT32RB) != 0;
4671	td->td_pflags2 &= ~TDP2_COMPAT32RB;
4672
4673	/*
4674	 * Handle terminated robust mutexes.  Must be done after
4675	 * robust pi disown, otherwise unlock could see unowned
4676	 * entries.
4677	 */
4678	rb_inact = td->td_rb_inact;
4679	if (rb_inact != 0)
4680		(void)umtx_read_uptr(td, rb_inact, &rb_inact, compat32);
4681	umtx_cleanup_rb_list(td, td->td_rb_list, &rb_inact, "", compat32);
4682	umtx_cleanup_rb_list(td, td->td_rbp_list, &rb_inact, "priv ", compat32);
4683	if (rb_inact != 0)
4684		(void)umtx_handle_rb(td, rb_inact, NULL, true, compat32);
4685}
4686