1/*
2 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/* Copyright (c) 1995-2005 Apple Computer, Inc. All Rights Reserved */
29/*
30 *	pthread_support.c
31 */
32
33#include <sys/param.h>
34#include <sys/queue.h>
35#include <sys/resourcevar.h>
36//#include <sys/proc_internal.h>
37#include <sys/kauth.h>
38#include <sys/systm.h>
39#include <sys/timeb.h>
40#include <sys/times.h>
41#include <sys/time.h>
42#include <sys/acct.h>
43#include <sys/kernel.h>
44#include <sys/wait.h>
45#include <sys/signalvar.h>
46#include <sys/syslog.h>
47#include <sys/stat.h>
48#include <sys/lock.h>
49#include <sys/kdebug.h>
50//#include <sys/sysproto.h>
51//#include <sys/pthread_internal.h>
52#include <sys/vm.h>
53#include <sys/user.h>
54
55#include <mach/mach_types.h>
56#include <mach/vm_prot.h>
57#include <mach/semaphore.h>
58#include <mach/sync_policy.h>
59#include <mach/task.h>
60#include <kern/kern_types.h>
61#include <kern/task.h>
62#include <kern/clock.h>
63#include <mach/kern_return.h>
64#include <kern/thread.h>
65#include <kern/sched_prim.h>
66#include <kern/thread_call.h>
67#include <kern/kalloc.h>
68#include <kern/zalloc.h>
69#include <kern/sched_prim.h>
70#include <kern/processor.h>
71#include <kern/wait_queue.h>
72//#include <kern/mach_param.h>
73#include <mach/mach_vm.h>
74#include <mach/mach_param.h>
75#include <mach/thread_policy.h>
76#include <mach/message.h>
77#include <mach/port.h>
78//#include <vm/vm_protos.h>
79#include <vm/vm_map.h>
80#include <mach/vm_region.h>
81
82#include <libkern/OSAtomic.h>
83
84#include <pexpert/pexpert.h>
85#include <sys/pthread_shims.h>
86
87#include "kern_internal.h"
88#include "synch_internal.h"
89#include "kern_trace.h"
90
91typedef struct uthread *uthread_t;
92
93//#define __FAILEDUSERTEST__(s) do { panic(s); } while (0)
94#define __FAILEDUSERTEST__(s) do { printf("PSYNCH: pid[%d]: %s\n", proc_pid(current_proc()), s); } while (0)
95
96#define ECVCERORR	256
97#define ECVPERORR	512
98
99lck_mtx_t *pthread_list_mlock;
100
101#define PTH_HASHSIZE 100
102
103static LIST_HEAD(pthhashhead, ksyn_wait_queue) *pth_glob_hashtbl;
104static unsigned long pthhash;
105
106static LIST_HEAD(, ksyn_wait_queue) pth_free_list;
107
108static zone_t kwq_zone; /* zone for allocation of ksyn_queue */
109static zone_t kwe_zone;	/* zone for allocation of ksyn_waitq_element */
110
111#define SEQFIT 0
112#define FIRSTFIT 1
113
114struct ksyn_queue {
115	TAILQ_HEAD(ksynq_kwelist_head, ksyn_waitq_element) ksynq_kwelist;
116	uint32_t	ksynq_count;		/* number of entries in queue */
117	uint32_t	ksynq_firstnum;		/* lowest seq in queue */
118	uint32_t	ksynq_lastnum;		/* highest seq in queue */
119};
120typedef struct ksyn_queue *ksyn_queue_t;
121
122enum {
123	KSYN_QUEUE_READ = 0,
124	KSYN_QUEUE_WRITER,
125	KSYN_QUEUE_MAX,
126};
127
128struct ksyn_wait_queue {
129	LIST_ENTRY(ksyn_wait_queue) kw_hash;
130	LIST_ENTRY(ksyn_wait_queue) kw_list;
131	user_addr_t kw_addr;
132	uint64_t kw_owner;
133	uint64_t kw_object;		/* object backing in shared mode */
134	uint64_t kw_offset;		/* offset inside the object in shared mode */
135	int	kw_pflags;		/* flags under listlock protection */
136	struct timeval kw_ts;		/* timeval need for upkeep before free */
137	int	kw_iocount;		/* inuse reference */
138	int 	kw_dropcount;		/* current users unlocking... */
139
140	int	kw_type;		/* queue type like mutex, cvar, etc */
141	uint32_t kw_inqueue;		/* num of waiters held */
142	uint32_t kw_fakecount;		/* number of error/prepost fakes */
143	uint32_t kw_highseq;		/* highest seq in the queue */
144	uint32_t kw_lowseq;		/* lowest seq in the queue */
145	uint32_t kw_lword;		/* L value from userland */
146	uint32_t kw_uword;		/* U world value from userland */
147	uint32_t kw_sword;		/* S word value from userland */
148	uint32_t kw_lastunlockseq;	/* the last seq that unlocked */
149	/* for CV to be used as the seq kernel has seen so far */
150#define kw_cvkernelseq kw_lastunlockseq
151	uint32_t kw_lastseqword;		/* the last seq that unlocked */
152	/* for mutex and cvar we need to track I bit values */
153	uint32_t kw_nextseqword;	/* the last seq that unlocked; with num of waiters */
154	uint32_t kw_overlapwatch;	/* chance for overlaps */
155	uint32_t kw_pre_rwwc;		/* prepost count */
156	uint32_t kw_pre_lockseq;	/* prepost target seq */
157	uint32_t kw_pre_sseq;		/* prepost target sword, in cvar used for mutexowned */
158	uint32_t kw_pre_intrcount;	/* prepost of missed wakeup due to intrs */
159	uint32_t kw_pre_intrseq;	/* prepost of missed wakeup limit seq */
160	uint32_t kw_pre_intrretbits;	/* return bits value for missed wakeup threads */
161	uint32_t kw_pre_intrtype;	/* type of failed wakueps*/
162
163	int 	kw_kflags;
164	int		kw_qos_override;	/* QoS of max waiter during contention period */
165	struct ksyn_queue kw_ksynqueues[KSYN_QUEUE_MAX];	/* queues to hold threads */
166	lck_mtx_t kw_lock;		/* mutex lock protecting this structure */
167};
168typedef struct ksyn_wait_queue * ksyn_wait_queue_t;
169
170#define TID_ZERO (uint64_t)0
171
172/* bits needed in handling the rwlock unlock */
173#define PTH_RW_TYPE_READ	0x01
174#define PTH_RW_TYPE_WRITE	0x04
175#define PTH_RW_TYPE_MASK	0xff
176#define PTH_RW_TYPE_SHIFT	8
177
178#define PTH_RWSHFT_TYPE_READ	0x0100
179#define PTH_RWSHFT_TYPE_WRITE	0x0400
180#define PTH_RWSHFT_TYPE_MASK	0xff00
181
182/*
183 * Mutex pshared attributes
184 */
185#define PTHREAD_PROCESS_SHARED		_PTHREAD_MTX_OPT_PSHARED
186#define PTHREAD_PROCESS_PRIVATE		0x20
187#define PTHREAD_PSHARED_FLAGS_MASK	0x30
188
189/*
190 * Mutex policy attributes
191 */
192#define _PTHREAD_MUTEX_POLICY_NONE		0
193#define _PTHREAD_MUTEX_POLICY_FAIRSHARE		0x040	/* 1 */
194#define _PTHREAD_MUTEX_POLICY_FIRSTFIT		0x080	/* 2 */
195#define _PTHREAD_MUTEX_POLICY_REALTIME		0x0c0	/* 3 */
196#define _PTHREAD_MUTEX_POLICY_ADAPTIVE		0x100	/* 4 */
197#define _PTHREAD_MUTEX_POLICY_PRIPROTECT	0x140	/* 5 */
198#define _PTHREAD_MUTEX_POLICY_PRIINHERIT	0x180	/* 6 */
199#define PTHREAD_POLICY_FLAGS_MASK		0x1c0
200
201/* pflags */
202#define KSYN_WQ_INHASH	2
203#define KSYN_WQ_SHARED	4
204#define KSYN_WQ_WAITING 8	/* threads waiting for this wq to be available */
205#define KSYN_WQ_FLIST 	0X10	/* in free list to be freed after a short delay */
206
207/* kflags */
208#define KSYN_KWF_INITCLEARED	1	/* the init status found and preposts cleared */
209#define KSYN_KWF_ZEROEDOUT	2	/* the lword, etc are inited to 0 */
210#define KSYN_KWF_QOS_APPLIED	4	/* QoS override applied to owner */
211
212#define KSYN_CLEANUP_DEADLINE 10
213static int psynch_cleanupset;
214thread_call_t psynch_thcall;
215
216#define KSYN_WQTYPE_INWAIT	0x1000
217#define KSYN_WQTYPE_INDROP	0x2000
218#define KSYN_WQTYPE_MTX		0x01
219#define KSYN_WQTYPE_CVAR	0x02
220#define KSYN_WQTYPE_RWLOCK	0x04
221#define KSYN_WQTYPE_SEMA	0x08
222#define KSYN_WQTYPE_MASK	0xff
223
224#define KSYN_WQTYPE_MUTEXDROP	(KSYN_WQTYPE_INDROP | KSYN_WQTYPE_MTX)
225
226#define KW_UNLOCK_PREPOST 		0x01
227#define KW_UNLOCK_PREPOST_READLOCK 	0x08
228#define KW_UNLOCK_PREPOST_WRLOCK 	0x20
229
230static void
231CLEAR_PREPOST_BITS(ksyn_wait_queue_t kwq)
232{
233	kwq->kw_pre_lockseq = 0;
234	kwq->kw_pre_sseq = PTHRW_RWS_INIT;
235	kwq->kw_pre_rwwc = 0;
236}
237
238static void
239CLEAR_INTR_PREPOST_BITS(ksyn_wait_queue_t kwq)
240{
241	kwq->kw_pre_intrcount = 0;
242	kwq->kw_pre_intrseq = 0;
243	kwq->kw_pre_intrretbits = 0;
244	kwq->kw_pre_intrtype = 0;
245}
246
247static void
248CLEAR_REINIT_BITS(ksyn_wait_queue_t kwq)
249{
250	if ((kwq->kw_type & KSYN_WQTYPE_MASK) == KSYN_WQTYPE_CVAR) {
251		if (kwq->kw_inqueue != 0 && kwq->kw_inqueue != kwq->kw_fakecount) {
252			panic("CV:entries in queue durinmg reinit %d:%d\n",kwq->kw_inqueue, kwq->kw_fakecount);
253		}
254	};
255	if ((kwq->kw_type & KSYN_WQTYPE_MASK) == KSYN_WQTYPE_RWLOCK) {
256		kwq->kw_nextseqword = PTHRW_RWS_INIT;
257		kwq->kw_overlapwatch = 0;
258	};
259	CLEAR_PREPOST_BITS(kwq);
260	kwq->kw_lastunlockseq = PTHRW_RWL_INIT;
261	kwq->kw_lastseqword = PTHRW_RWS_INIT;
262	CLEAR_INTR_PREPOST_BITS(kwq);
263	kwq->kw_lword = 0;
264	kwq->kw_uword = 0;
265	kwq->kw_sword = PTHRW_RWS_INIT;
266}
267
268static int ksyn_wq_hash_lookup(user_addr_t uaddr, proc_t p, int flags, ksyn_wait_queue_t *kwq, struct pthhashhead **hashptr, uint64_t *object, uint64_t *offset);
269static int ksyn_wqfind(user_addr_t mutex, uint32_t mgen, uint32_t ugen, uint32_t rw_wc, int flags, int wqtype , ksyn_wait_queue_t *wq);
270static void ksyn_wqrelease(ksyn_wait_queue_t mkwq, int qfreenow, int wqtype);
271static int ksyn_findobj(user_addr_t uaddr, uint64_t *objectp, uint64_t *offsetp);
272
273static int _wait_result_to_errno(wait_result_t result);
274
275static int ksyn_wait(ksyn_wait_queue_t, int, uint32_t, int, uint64_t, thread_continue_t);
276static kern_return_t ksyn_signal(ksyn_wait_queue_t, int, ksyn_waitq_element_t, uint32_t);
277static void ksyn_freeallkwe(ksyn_queue_t kq);
278
279static kern_return_t ksyn_mtxsignal(ksyn_wait_queue_t, ksyn_waitq_element_t kwe, uint32_t);
280static void ksyn_mtx_update_owner_qos_override(ksyn_wait_queue_t, uint64_t tid, boolean_t prepost);
281static void ksyn_mtx_transfer_qos_override(ksyn_wait_queue_t, ksyn_waitq_element_t);
282static void ksyn_mtx_drop_qos_override(ksyn_wait_queue_t);
283
284static int kwq_handle_unlock(ksyn_wait_queue_t, uint32_t mgen, uint32_t rw_wc, uint32_t *updatep, int flags, int *blockp, uint32_t premgen);
285
286static void ksyn_queue_init(ksyn_queue_t kq);
287static int ksyn_queue_insert(ksyn_wait_queue_t kwq, int kqi, ksyn_waitq_element_t kwe, uint32_t mgen, int firstfit);
288static void ksyn_queue_remove_item(ksyn_wait_queue_t kwq, ksyn_queue_t kq, ksyn_waitq_element_t kwe);
289static void ksyn_queue_free_items(ksyn_wait_queue_t kwq, int kqi, uint32_t upto, int all);
290
291static void update_low_high(ksyn_wait_queue_t kwq, uint32_t lockseq);
292static uint32_t find_nextlowseq(ksyn_wait_queue_t kwq);
293static uint32_t find_nexthighseq(ksyn_wait_queue_t kwq);
294static int find_seq_till(ksyn_wait_queue_t kwq, uint32_t upto, uint32_t nwaiters, uint32_t *countp);
295
296static uint32_t ksyn_queue_count_tolowest(ksyn_queue_t kq, uint32_t upto);
297
298static ksyn_waitq_element_t ksyn_queue_find_cvpreposeq(ksyn_queue_t kq, uint32_t cgen);
299static void ksyn_handle_cvbroad(ksyn_wait_queue_t ckwq, uint32_t upto, uint32_t *updatep);
300static void ksyn_cvupdate_fixup(ksyn_wait_queue_t ckwq, uint32_t *updatep);
301static ksyn_waitq_element_t ksyn_queue_find_signalseq(ksyn_wait_queue_t kwq, ksyn_queue_t kq, uint32_t toseq, uint32_t lockseq);
302
303static void psynch_cvcontinue(void *, wait_result_t);
304static void psynch_mtxcontinue(void *, wait_result_t);
305
306static int ksyn_wakeupreaders(ksyn_wait_queue_t kwq, uint32_t limitread, int allreaders, uint32_t updatebits, int *wokenp);
307static int kwq_find_rw_lowest(ksyn_wait_queue_t kwq, int flags, uint32_t premgen, int *type, uint32_t lowest[]);
308static ksyn_waitq_element_t ksyn_queue_find_seq(ksyn_wait_queue_t kwq, ksyn_queue_t kq, uint32_t seq);
309
310static void
311UPDATE_CVKWQ(ksyn_wait_queue_t kwq, uint32_t mgen, uint32_t ugen, uint32_t rw_wc)
312{
313	int sinit = ((rw_wc & PTH_RWS_CV_CBIT) != 0);
314
315	// assert((kwq->kw_type & KSYN_WQTYPE_MASK) == KSYN_WQTYPE_CVAR);
316
317	if ((kwq->kw_kflags & KSYN_KWF_ZEROEDOUT) != 0) {
318		/* the values of L,U and S are cleared out due to L==S in previous transition */
319		kwq->kw_lword = mgen;
320		kwq->kw_uword = ugen;
321		kwq->kw_sword = rw_wc;
322		kwq->kw_kflags &= ~KSYN_KWF_ZEROEDOUT;
323	} else {
324		if (is_seqhigher(mgen, kwq->kw_lword)) {
325			kwq->kw_lword = mgen;
326		}
327		if (is_seqhigher(ugen, kwq->kw_uword)) {
328			kwq->kw_uword = ugen;
329		}
330		if (sinit && is_seqhigher(rw_wc, kwq->kw_sword)) {
331			kwq->kw_sword = rw_wc;
332		}
333	}
334	if (sinit && is_seqlower(kwq->kw_cvkernelseq, rw_wc)) {
335		kwq->kw_cvkernelseq = (rw_wc & PTHRW_COUNT_MASK);
336	}
337}
338
339static void
340pthread_list_lock(void)
341{
342	lck_mtx_lock(pthread_list_mlock);
343}
344
345static void
346pthread_list_unlock(void)
347{
348	lck_mtx_unlock(pthread_list_mlock);
349}
350
351static void
352ksyn_wqlock(ksyn_wait_queue_t kwq)
353{
354
355	lck_mtx_lock(&kwq->kw_lock);
356}
357
358static void
359ksyn_wqunlock(ksyn_wait_queue_t kwq)
360{
361	lck_mtx_unlock(&kwq->kw_lock);
362}
363
364
365/* routine to drop the mutex unlocks , used both for mutexunlock system call and drop during cond wait */
366static uint32_t
367_psynch_mutexdrop_internal(ksyn_wait_queue_t kwq, uint32_t mgen, uint32_t ugen, int flags)
368{
369	kern_return_t ret;
370	uint32_t returnbits = 0;
371	int firstfit = (flags & PTHREAD_POLICY_FLAGS_MASK) == _PTHREAD_MUTEX_POLICY_FIRSTFIT;
372	uint32_t nextgen = (ugen + PTHRW_INC);
373
374	ksyn_wqlock(kwq);
375	kwq->kw_lastunlockseq = (ugen & PTHRW_COUNT_MASK);
376	uint32_t updatebits = (kwq->kw_highseq & PTHRW_COUNT_MASK) | (PTH_RWL_EBIT | PTH_RWL_KBIT);
377
378redrive:
379	if (firstfit) {
380		if (kwq->kw_inqueue == 0) {
381			// not set or the new lock sequence is higher
382			if (kwq->kw_pre_rwwc == 0 || is_seqhigher(mgen, kwq->kw_pre_lockseq)) {
383				kwq->kw_pre_lockseq = (mgen & PTHRW_COUNT_MASK);
384			}
385			kwq->kw_pre_rwwc = 1;
386			ksyn_mtx_drop_qos_override(kwq);
387			kwq->kw_owner = 0;
388			// indicate prepost content in kernel
389			returnbits = mgen | PTH_RWL_PBIT;
390		} else {
391			// signal first waiter
392			ret = ksyn_mtxsignal(kwq, NULL, updatebits);
393			if (ret == KERN_NOT_WAITING) {
394				goto redrive;
395			}
396		}
397	} else {
398		int prepost = 0;
399		if (kwq->kw_inqueue == 0) {
400			// No waiters in the queue.
401			prepost = 1;
402		} else {
403			uint32_t low_writer = (kwq->kw_ksynqueues[KSYN_QUEUE_WRITER].ksynq_firstnum & PTHRW_COUNT_MASK);
404			if (low_writer == nextgen) {
405				/* next seq to be granted found */
406				/* since the grant could be cv, make sure mutex wait is set incase the thread interrupted out */
407				ret = ksyn_mtxsignal(kwq, NULL, updatebits | PTH_RWL_MTX_WAIT);
408				if (ret == KERN_NOT_WAITING) {
409					/* interrupt post */
410					kwq->kw_pre_intrcount = 1;
411					kwq->kw_pre_intrseq = nextgen;
412					kwq->kw_pre_intrretbits = updatebits;
413					kwq->kw_pre_intrtype = PTH_RW_TYPE_WRITE;
414				}
415
416			} else if (is_seqhigher(low_writer, nextgen)) {
417				prepost = 1;
418			} else {
419				//__FAILEDUSERTEST__("psynch_mutexdrop_internal: FS mutex unlock sequence higher than the lowest one is queue\n");
420				ksyn_waitq_element_t kwe;
421				kwe = ksyn_queue_find_seq(kwq, &kwq->kw_ksynqueues[KSYN_QUEUE_WRITER], nextgen);
422				if (kwe != NULL) {
423					/* next seq to be granted found */
424					/* since the grant could be cv, make sure mutex wait is set incase the thread interrupted out */
425					ret = ksyn_mtxsignal(kwq, kwe, updatebits | PTH_RWL_MTX_WAIT);
426					if (ret == KERN_NOT_WAITING) {
427						goto redrive;
428					}
429				} else {
430					prepost = 1;
431				}
432			}
433		}
434		if (prepost) {
435			ksyn_mtx_drop_qos_override(kwq);
436			kwq->kw_owner = 0;
437			if (++kwq->kw_pre_rwwc > 1) {
438				__FAILEDUSERTEST__("_psynch_mutexdrop_internal: multiple preposts\n");
439			} else {
440				kwq->kw_pre_lockseq = (nextgen & PTHRW_COUNT_MASK);
441			}
442		}
443	}
444
445	ksyn_wqunlock(kwq);
446	ksyn_wqrelease(kwq, 1, KSYN_WQTYPE_MUTEXDROP);
447	return returnbits;
448}
449
450static int
451_ksyn_check_init(ksyn_wait_queue_t kwq, uint32_t lgenval)
452{
453	int res = (lgenval & PTHRW_RWL_INIT) != 0;
454	if (res) {
455		if ((kwq->kw_kflags & KSYN_KWF_INITCLEARED) == 0) {
456			/* first to notice the reset of the lock, clear preposts */
457			CLEAR_REINIT_BITS(kwq);
458			kwq->kw_kflags |= KSYN_KWF_INITCLEARED;
459		}
460	}
461	return res;
462}
463
464static int
465_ksyn_handle_missed_wakeups(ksyn_wait_queue_t kwq,
466			    uint32_t type,
467			    uint32_t lockseq,
468			    uint32_t *retval)
469{
470	int res = 0;
471	if (kwq->kw_pre_intrcount != 0 &&
472	    kwq->kw_pre_intrtype == type &&
473	    is_seqlower_eq(lockseq, kwq->kw_pre_intrseq)) {
474		kwq->kw_pre_intrcount--;
475		*retval = kwq->kw_pre_intrretbits;
476		if (kwq->kw_pre_intrcount == 0) {
477			CLEAR_INTR_PREPOST_BITS(kwq);
478		}
479		res = 1;
480	}
481	return res;
482}
483
484static int
485_ksyn_handle_overlap(ksyn_wait_queue_t kwq,
486		     uint32_t lgenval,
487		     uint32_t rw_wc,
488		     uint32_t *retval)
489{
490	int res = 0;
491
492	// check for overlap and no pending W bit (indicates writers)
493	if (kwq->kw_overlapwatch != 0 &&
494	    (rw_wc & PTHRW_RWS_SAVEMASK) == 0 &&
495	    (lgenval & PTH_RWL_WBIT) == 0) {
496		/* overlap is set, so no need to check for valid state for overlap */
497
498		if (is_seqlower_eq(rw_wc, kwq->kw_nextseqword) || is_seqhigher_eq(kwq->kw_lastseqword, rw_wc)) {
499			/* increase the next expected seq by one */
500			kwq->kw_nextseqword += PTHRW_INC;
501			/* set count by one & bits from the nextseq and add M bit */
502			*retval = PTHRW_INC | ((kwq->kw_nextseqword & PTHRW_BIT_MASK) | PTH_RWL_MBIT);
503			res = 1;
504		}
505	}
506	return res;
507}
508
509static int
510_ksyn_handle_prepost(ksyn_wait_queue_t kwq,
511		     uint32_t type,
512		     uint32_t lockseq,
513		     uint32_t *retval)
514{
515	int res = 0;
516	if (kwq->kw_pre_rwwc != 0 && is_seqlower_eq(lockseq, kwq->kw_pre_lockseq)) {
517		kwq->kw_pre_rwwc--;
518		if (kwq->kw_pre_rwwc == 0) {
519			uint32_t preseq = kwq->kw_pre_lockseq;
520			uint32_t prerw_wc = kwq->kw_pre_sseq;
521			CLEAR_PREPOST_BITS(kwq);
522			if ((kwq->kw_kflags & KSYN_KWF_INITCLEARED) != 0){
523				kwq->kw_kflags &= ~KSYN_KWF_INITCLEARED;
524			}
525
526			int error, block;
527			uint32_t updatebits;
528			error = kwq_handle_unlock(kwq, preseq, prerw_wc, &updatebits, (type|KW_UNLOCK_PREPOST), &block, lockseq);
529			if (error != 0) {
530				panic("kwq_handle_unlock failed %d\n", error);
531			}
532
533			if (block == 0) {
534				*retval = updatebits;
535				res = 1;
536			}
537		}
538	}
539	return res;
540}
541
542/* Helpers for QoS override management. Only applies to mutexes */
543static void ksyn_mtx_update_owner_qos_override(ksyn_wait_queue_t kwq, uint64_t tid, boolean_t prepost)
544{
545	if (!(kwq->kw_pflags & KSYN_WQ_SHARED)) {
546		boolean_t wasboosted = (kwq->kw_kflags & KSYN_KWF_QOS_APPLIED) ? TRUE : FALSE;
547		int waiter_qos = pthread_kern->proc_usynch_get_requested_thread_qos(current_uthread());
548
549		kwq->kw_qos_override = MAX(waiter_qos, kwq->kw_qos_override);
550
551		if (prepost && kwq->kw_inqueue == 0) {
552			// if there are no more waiters in the queue after the new (prepost-receiving) owner, we do not set an
553			// override, because the receiving owner may not re-enter the kernel to signal someone else if it is
554			// the last one to unlock. If other waiters end up entering the kernel, they will boost the owner
555			tid = 0;
556		}
557
558		if (tid != 0) {
559			if ((tid == kwq->kw_owner) && (kwq->kw_kflags & KSYN_KWF_QOS_APPLIED)) {
560				// hint continues to be accurate, and a boost was already applied
561				pthread_kern->proc_usynch_thread_qos_add_override(NULL, tid, kwq->kw_qos_override, FALSE);
562			} else {
563				// either hint did not match previous owner, or hint was accurate but mutex was not contended enough for a boost previously
564				boolean_t boostsucceded;
565
566				boostsucceded = pthread_kern->proc_usynch_thread_qos_add_override(NULL, tid, kwq->kw_qos_override, TRUE);
567
568				if (boostsucceded) {
569					kwq->kw_kflags |= KSYN_KWF_QOS_APPLIED;
570				}
571
572				if (wasboosted && (tid != kwq->kw_owner) && (kwq->kw_owner != 0)) {
573					// the hint did not match the previous owner, so drop overrides
574					PTHREAD_TRACE(TRACE_psynch_ksyn_incorrect_owner, kwq->kw_owner, 0, 0, 0, 0);
575					pthread_kern->proc_usynch_thread_qos_remove_override(NULL, kwq->kw_owner);
576				}
577			}
578		} else {
579			// new hint tells us that we don't know the owner, so drop any existing overrides
580			kwq->kw_kflags &= ~KSYN_KWF_QOS_APPLIED;
581			kwq->kw_qos_override = THREAD_QOS_UNSPECIFIED;
582
583			if (wasboosted && (kwq->kw_owner != 0)) {
584				// the hint did not match the previous owner, so drop overrides
585				PTHREAD_TRACE(TRACE_psynch_ksyn_incorrect_owner, kwq->kw_owner, 0, 0, 0, 0);
586				pthread_kern->proc_usynch_thread_qos_remove_override(NULL, kwq->kw_owner);
587			}
588		}
589	}
590}
591
592static void ksyn_mtx_transfer_qos_override(ksyn_wait_queue_t kwq, ksyn_waitq_element_t kwe)
593{
594	if (!(kwq->kw_pflags & KSYN_WQ_SHARED)) {
595		boolean_t wasboosted = (kwq->kw_kflags & KSYN_KWF_QOS_APPLIED) ? TRUE : FALSE;
596
597		if (kwq->kw_inqueue > 1) {
598			boolean_t boostsucceeded;
599
600			// More than one waiter, so resource will still be contended after handing off ownership
601			boostsucceeded = pthread_kern->proc_usynch_thread_qos_add_override(kwe->kwe_uth, 0, kwq->kw_qos_override, TRUE);
602
603			if (boostsucceeded) {
604				kwq->kw_kflags |= KSYN_KWF_QOS_APPLIED;
605			}
606		} else {
607			// kw_inqueue == 1 to get to this point, which means there will be no contention after this point
608			kwq->kw_kflags &= ~KSYN_KWF_QOS_APPLIED;
609			kwq->kw_qos_override = THREAD_QOS_UNSPECIFIED;
610		}
611
612		// Remove the override that was applied to kw_owner. There may have been a race,
613		// in which case it may not match the current thread
614		if (wasboosted) {
615			if (kwq->kw_owner == 0) {
616				PTHREAD_TRACE(TRACE_psynch_ksyn_incorrect_owner, 0, 0, 0, 0, 0);
617			} else if (thread_tid(current_thread()) != kwq->kw_owner) {
618				PTHREAD_TRACE(TRACE_psynch_ksyn_incorrect_owner, kwq->kw_owner, 0, 0, 0, 0);
619				pthread_kern->proc_usynch_thread_qos_remove_override(NULL, kwq->kw_owner);
620			} else {
621				pthread_kern->proc_usynch_thread_qos_remove_override(current_uthread(), 0);
622			}
623		}
624	}
625}
626
627static void ksyn_mtx_drop_qos_override(ksyn_wait_queue_t kwq)
628{
629	if (!(kwq->kw_pflags & KSYN_WQ_SHARED)) {
630		boolean_t wasboosted = (kwq->kw_kflags & KSYN_KWF_QOS_APPLIED) ? TRUE : FALSE;
631
632		// assume nobody else in queue if this routine was called
633		kwq->kw_kflags &= ~KSYN_KWF_QOS_APPLIED;
634		kwq->kw_qos_override = THREAD_QOS_UNSPECIFIED;
635
636		// Remove the override that was applied to kw_owner. There may have been a race,
637		// in which case it may not match the current thread
638		if (wasboosted) {
639			if (kwq->kw_owner == 0) {
640				PTHREAD_TRACE(TRACE_psynch_ksyn_incorrect_owner, 0, 0, 0, 0, 0);
641			} else if (thread_tid(current_thread()) != kwq->kw_owner) {
642				PTHREAD_TRACE(TRACE_psynch_ksyn_incorrect_owner, kwq->kw_owner, 0, 0, 0, 0);
643				pthread_kern->proc_usynch_thread_qos_remove_override(NULL, kwq->kw_owner);
644			} else {
645				pthread_kern->proc_usynch_thread_qos_remove_override(current_uthread(), 0);
646			}
647		}
648	}
649}
650
651/*
652 * psynch_mutexwait: This system call is used for contended psynch mutexes to block.
653 */
654
655int
656_psynch_mutexwait(__unused proc_t p,
657		  user_addr_t mutex,
658		  uint32_t mgen,
659		  uint32_t ugen,
660		  uint64_t tid,
661		  uint32_t flags,
662		  uint32_t *retval)
663{
664	ksyn_wait_queue_t kwq;
665	int error=0;
666	int ins_flags;
667
668	int firstfit = (flags & PTHREAD_POLICY_FLAGS_MASK) == _PTHREAD_MUTEX_POLICY_FIRSTFIT;
669	uint32_t updatebits = 0;
670
671	uint32_t lockseq = (mgen & PTHRW_COUNT_MASK);
672
673	if (firstfit == 0) {
674		ins_flags = SEQFIT;
675	} else {
676		/* first fit */
677		ins_flags = FIRSTFIT;
678	}
679
680	error = ksyn_wqfind(mutex, mgen, ugen, 0, flags, (KSYN_WQTYPE_INWAIT|KSYN_WQTYPE_MTX), &kwq);
681	if (error != 0) {
682		return(error);
683	}
684
685	ksyn_wqlock(kwq);
686
687	// mutexwait passes in an owner hint at the time userspace contended for the mutex, however, the
688	// owner tid in the userspace data structure may be unset or SWITCHING (-1), or it may correspond
689	// to a stale snapshot after the lock has subsequently been unlocked by another thread.
690	if (tid == 0) {
691		// contender came in before owner could write TID
692		tid = 0;
693	} else if (kwq->kw_lastunlockseq != PTHRW_RWL_INIT && is_seqlower(ugen, kwq->kw_lastunlockseq)) {
694		// owner is stale, someone has come in and unlocked since this contended read the TID, so
695		// assume what is known in the kernel is accurate
696		tid = kwq->kw_owner;
697	} else if (tid == PTHREAD_MTX_TID_SWITCHING) {
698		// userspace didn't know the owner because it was being unlocked, but that unlocker hasn't
699		// reached the kernel yet. So assume what is known in the kernel is accurate
700		tid = kwq->kw_owner;
701	} else {
702		// hint is being passed in for a specific thread, and we have no reason not to trust
703		// it (like the kernel unlock sequence being higher
704	}
705
706
707	if (_ksyn_handle_missed_wakeups(kwq, PTH_RW_TYPE_WRITE, lockseq, retval)) {
708		ksyn_mtx_update_owner_qos_override(kwq, thread_tid(current_thread()), TRUE);
709		kwq->kw_owner = thread_tid(current_thread());
710
711		ksyn_wqunlock(kwq);
712		goto out;
713	}
714
715	if ((kwq->kw_pre_rwwc != 0) && ((ins_flags == FIRSTFIT) || ((lockseq & PTHRW_COUNT_MASK) == (kwq->kw_pre_lockseq & PTHRW_COUNT_MASK) ))) {
716		/* got preposted lock */
717		kwq->kw_pre_rwwc--;
718		if (kwq->kw_pre_rwwc == 0) {
719			CLEAR_PREPOST_BITS(kwq);
720			if (kwq->kw_inqueue == 0) {
721				updatebits = lockseq | (PTH_RWL_KBIT | PTH_RWL_EBIT);
722			} else {
723				updatebits = (kwq->kw_highseq & PTHRW_COUNT_MASK) | (PTH_RWL_KBIT | PTH_RWL_EBIT);
724			}
725			updatebits &= ~PTH_RWL_MTX_WAIT;
726
727			if (updatebits == 0) {
728				__FAILEDUSERTEST__("psynch_mutexwait(prepost): returning 0 lseq in mutexwait with no EBIT \n");
729			}
730
731			ksyn_mtx_update_owner_qos_override(kwq, thread_tid(current_thread()), TRUE);
732			kwq->kw_owner = thread_tid(current_thread());
733
734			ksyn_wqunlock(kwq);
735			*retval = updatebits;
736			goto out;
737		} else {
738			__FAILEDUSERTEST__("psynch_mutexwait: more than one prepost\n");
739			kwq->kw_pre_lockseq += PTHRW_INC; /* look for next one */
740			ksyn_wqunlock(kwq);
741			error = EINVAL;
742			goto out;
743		}
744	}
745
746	ksyn_mtx_update_owner_qos_override(kwq, tid, FALSE);
747	kwq->kw_owner = tid;
748
749	error = ksyn_wait(kwq, KSYN_QUEUE_WRITER, mgen, ins_flags, 0, psynch_mtxcontinue);
750	// ksyn_wait drops wait queue lock
751out:
752	ksyn_wqrelease(kwq, 1, (KSYN_WQTYPE_INWAIT|KSYN_WQTYPE_MTX));
753	return error;
754}
755
756void
757psynch_mtxcontinue(void *parameter, wait_result_t result)
758{
759	uthread_t uth = current_uthread();
760	ksyn_wait_queue_t kwq = parameter;
761	ksyn_waitq_element_t kwe = pthread_kern->uthread_get_uukwe(uth);
762
763	int error = _wait_result_to_errno(result);
764	if (error != 0) {
765		ksyn_wqlock(kwq);
766		if (kwe->kwe_kwqqueue) {
767			ksyn_queue_remove_item(kwq, &kwq->kw_ksynqueues[KSYN_QUEUE_WRITER], kwe);
768		}
769		ksyn_wqunlock(kwq);
770	} else {
771		uint32_t updatebits = kwe->kwe_psynchretval & ~PTH_RWL_MTX_WAIT;
772		pthread_kern->uthread_set_returnval(uth, updatebits);
773
774		if (updatebits == 0)
775			__FAILEDUSERTEST__("psynch_mutexwait: returning 0 lseq in mutexwait with no EBIT \n");
776	}
777	ksyn_wqrelease(kwq, 1, (KSYN_WQTYPE_INWAIT|KSYN_WQTYPE_MTX));
778	pthread_kern->unix_syscall_return(error);
779}
780
781/*
782 * psynch_mutexdrop: This system call is used for unlock postings on contended psynch mutexes.
783 */
784int
785_psynch_mutexdrop(__unused proc_t p,
786		  user_addr_t mutex,
787		  uint32_t mgen,
788		  uint32_t ugen,
789		  uint64_t tid __unused,
790		  uint32_t flags,
791		  uint32_t *retval)
792{
793	int res;
794	ksyn_wait_queue_t kwq;
795
796	res = ksyn_wqfind(mutex, mgen, ugen, 0, flags, KSYN_WQTYPE_MUTEXDROP, &kwq);
797	if (res == 0) {
798		uint32_t updateval = _psynch_mutexdrop_internal(kwq, mgen, ugen, flags);
799		/* drops the kwq reference */
800		if (retval) {
801			*retval = updateval;
802		}
803	}
804
805	return res;
806}
807
808static kern_return_t
809ksyn_mtxsignal(ksyn_wait_queue_t kwq, ksyn_waitq_element_t kwe, uint32_t updateval)
810{
811	kern_return_t ret;
812
813	if (!kwe) {
814		kwe = TAILQ_FIRST(&kwq->kw_ksynqueues[KSYN_QUEUE_WRITER].ksynq_kwelist);
815		if (!kwe) {
816			panic("ksyn_mtxsignal: panic signaling empty queue");
817		}
818	}
819
820	ksyn_mtx_transfer_qos_override(kwq, kwe);
821	kwq->kw_owner = kwe->kwe_tid;
822
823	ret = ksyn_signal(kwq, KSYN_QUEUE_WRITER, kwe, updateval);
824
825	// if waking the new owner failed, remove any overrides
826	if (ret != KERN_SUCCESS) {
827		ksyn_mtx_drop_qos_override(kwq);
828		kwq->kw_owner = 0;
829	}
830
831	return ret;
832}
833
834
835static void
836ksyn_prepost(ksyn_wait_queue_t kwq,
837	     ksyn_waitq_element_t kwe,
838	     uint32_t state,
839	     uint32_t lockseq)
840{
841	bzero(kwe, sizeof(*kwe));
842	kwe->kwe_state = state;
843	kwe->kwe_lockseq = lockseq;
844	kwe->kwe_count = 1;
845
846	(void)ksyn_queue_insert(kwq, KSYN_QUEUE_WRITER, kwe, lockseq, SEQFIT);
847	kwq->kw_fakecount++;
848}
849
850static void
851ksyn_cvsignal(ksyn_wait_queue_t ckwq,
852	      thread_t th,
853	      uint32_t uptoseq,
854	      uint32_t signalseq,
855	      uint32_t *updatebits,
856	      int *broadcast,
857	      ksyn_waitq_element_t *nkwep)
858{
859	ksyn_waitq_element_t kwe = NULL;
860	ksyn_waitq_element_t nkwe = NULL;
861	ksyn_queue_t kq = &ckwq->kw_ksynqueues[KSYN_QUEUE_WRITER];
862
863	uptoseq &= PTHRW_COUNT_MASK;
864
865	// Find the specified thread to wake.
866	if (th != THREAD_NULL) {
867		uthread_t uth = pthread_kern->get_bsdthread_info(th);
868		kwe = pthread_kern->uthread_get_uukwe(uth);
869		if (kwe->kwe_kwqqueue != ckwq ||
870		    is_seqhigher(kwe->kwe_lockseq, uptoseq)) {
871			// Unless it's no longer waiting on this CV...
872			kwe = NULL;
873			// ...in which case we post a broadcast instead.
874			*broadcast = 1;
875			return;
876		}
877	}
878
879	// If no thread was specified, find any thread to wake (with the right
880	// sequence number).
881	while (th == THREAD_NULL) {
882		if (kwe == NULL) {
883			kwe = ksyn_queue_find_signalseq(ckwq, kq, uptoseq, signalseq);
884		}
885		if (kwe == NULL && nkwe == NULL) {
886			// No eligible entries; need to allocate a new
887			// entry to prepost. Loop to rescan after
888			// reacquiring the lock after allocation in
889			// case anything new shows up.
890			ksyn_wqunlock(ckwq);
891			nkwe = (ksyn_waitq_element_t)pthread_kern->zalloc(kwe_zone);
892			ksyn_wqlock(ckwq);
893		} else {
894			break;
895		}
896	}
897
898	if (kwe != NULL) {
899		// If we found a thread to wake...
900		if (kwe->kwe_state == KWE_THREAD_INWAIT) {
901			if (is_seqlower(kwe->kwe_lockseq, signalseq)) {
902				/*
903				 * A valid thread in our range, but lower than our signal.
904				 * Matching it may leave our match with nobody to wake it if/when
905				 * it arrives (the signal originally meant for this thread might
906				 * not successfully wake it).
907				 *
908				 * Convert to broadcast - may cause some spurious wakeups
909				 * (allowed by spec), but avoids starvation (better choice).
910				 */
911				*broadcast = 1;
912			} else {
913				(void)ksyn_signal(ckwq, KSYN_QUEUE_WRITER, kwe, PTH_RWL_MTX_WAIT);
914				*updatebits += PTHRW_INC;
915			}
916		} else if (kwe->kwe_state == KWE_THREAD_PREPOST) {
917			// Merge with existing prepost at same uptoseq.
918			kwe->kwe_count += 1;
919		} else if (kwe->kwe_state == KWE_THREAD_BROADCAST) {
920			// Existing broadcasts subsume this signal.
921		} else {
922			panic("unknown kwe state\n");
923		}
924		if (nkwe) {
925			/*
926			 * If we allocated a new kwe above but then found a different kwe to
927			 * use then we need to deallocate the spare one.
928			 */
929			pthread_kern->zfree(kwe_zone, nkwe);
930			nkwe = NULL;
931		}
932	} else if (nkwe != NULL) {
933		// ... otherwise, insert the newly allocated prepost.
934		ksyn_prepost(ckwq, nkwe, KWE_THREAD_PREPOST, uptoseq);
935		nkwe = NULL;
936	} else {
937		panic("failed to allocate kwe\n");
938	}
939
940	*nkwep = nkwe;
941}
942
943static int
944__psynch_cvsignal(user_addr_t cv,
945		  uint32_t cgen,
946		  uint32_t cugen,
947		  uint32_t csgen,
948		  uint32_t flags,
949		  int broadcast,
950		  mach_port_name_t threadport,
951		  uint32_t *retval)
952{
953	int error = 0;
954	thread_t th = THREAD_NULL;
955	ksyn_wait_queue_t kwq;
956
957	uint32_t uptoseq = cgen & PTHRW_COUNT_MASK;
958	uint32_t fromseq = (cugen & PTHRW_COUNT_MASK) + PTHRW_INC;
959
960	// validate sane L, U, and S values
961	if ((threadport == 0 && is_seqhigher(fromseq, uptoseq)) || is_seqhigher(csgen, uptoseq)) {
962		__FAILEDUSERTEST__("cvbroad: invalid L, U and S values\n");
963		return EINVAL;
964	}
965
966	if (threadport != 0) {
967		th = port_name_to_thread((mach_port_name_t)threadport);
968		if (th == THREAD_NULL) {
969			return ESRCH;
970		}
971	}
972
973	error = ksyn_wqfind(cv, cgen, cugen, csgen, flags, (KSYN_WQTYPE_CVAR | KSYN_WQTYPE_INDROP), &kwq);
974	if (error == 0) {
975		uint32_t updatebits = 0;
976		ksyn_waitq_element_t nkwe = NULL;
977
978		ksyn_wqlock(kwq);
979
980		// update L, U and S...
981		UPDATE_CVKWQ(kwq, cgen, cugen, csgen);
982
983		if (!broadcast) {
984			// No need to signal if the CV is already balanced.
985			if (diff_genseq(kwq->kw_lword, kwq->kw_sword)) {
986				ksyn_cvsignal(kwq, th, uptoseq, fromseq, &updatebits, &broadcast, &nkwe);
987			}
988		}
989
990		if (broadcast) {
991			ksyn_handle_cvbroad(kwq, uptoseq, &updatebits);
992		}
993
994		kwq->kw_sword += (updatebits & PTHRW_COUNT_MASK);
995		// set C or P bits and free if needed
996		ksyn_cvupdate_fixup(kwq, &updatebits);
997		*retval = updatebits;
998
999		ksyn_wqunlock(kwq);
1000
1001		if (nkwe != NULL) {
1002			pthread_kern->zfree(kwe_zone, nkwe);
1003		}
1004
1005		ksyn_wqrelease(kwq, 1, (KSYN_WQTYPE_INDROP | KSYN_WQTYPE_CVAR));
1006	}
1007
1008	if (th != NULL) {
1009		thread_deallocate(th);
1010	}
1011
1012	return error;
1013}
1014
1015/*
1016 * psynch_cvbroad: This system call is used for broadcast posting on blocked waiters of psynch cvars.
1017 */
1018int
1019_psynch_cvbroad(__unused proc_t p,
1020		user_addr_t cv,
1021		uint64_t cvlsgen,
1022		uint64_t cvudgen,
1023		uint32_t flags,
1024		__unused user_addr_t mutex,
1025		__unused uint64_t mugen,
1026		__unused uint64_t tid,
1027		uint32_t *retval)
1028{
1029	uint32_t diffgen = cvudgen & 0xffffffff;
1030	uint32_t count = diffgen >> PTHRW_COUNT_SHIFT;
1031	if (count > pthread_kern->get_task_threadmax()) {
1032		__FAILEDUSERTEST__("cvbroad: difference greater than maximum possible thread count\n");
1033		return EBUSY;
1034	}
1035
1036	uint32_t csgen = (cvlsgen >> 32) & 0xffffffff;
1037	uint32_t cgen = cvlsgen & 0xffffffff;
1038	uint32_t cugen = (cvudgen >> 32) & 0xffffffff;
1039
1040	return __psynch_cvsignal(cv, cgen, cugen, csgen, flags, 1, 0, retval);
1041}
1042
1043/*
1044 * psynch_cvsignal: This system call is used for signalling the blocked waiters of psynch cvars.
1045 */
1046int
1047_psynch_cvsignal(__unused proc_t p,
1048		 user_addr_t cv,
1049		 uint64_t cvlsgen,
1050		 uint32_t cvugen,
1051		 int threadport,
1052		 __unused user_addr_t mutex,
1053		 __unused uint64_t mugen,
1054		 __unused uint64_t tid,
1055		 uint32_t flags,
1056		 uint32_t *retval)
1057{
1058	uint32_t csgen = (cvlsgen >> 32) & 0xffffffff;
1059	uint32_t cgen = cvlsgen & 0xffffffff;
1060
1061	return __psynch_cvsignal(cv, cgen, cvugen, csgen, flags, 0, threadport, retval);
1062}
1063
1064/*
1065 * psynch_cvwait: This system call is used for psynch cvar waiters to block in kernel.
1066 */
1067int
1068_psynch_cvwait(__unused proc_t p,
1069	       user_addr_t cv,
1070	       uint64_t cvlsgen,
1071	       uint32_t cvugen,
1072	       user_addr_t mutex,
1073	       uint64_t mugen,
1074	       uint32_t flags,
1075	       int64_t sec,
1076	       uint32_t nsec,
1077	       uint32_t *retval)
1078{
1079	int error = 0;
1080	uint32_t updatebits = 0;
1081	ksyn_wait_queue_t ckwq = NULL;
1082	ksyn_waitq_element_t kwe, nkwe = NULL;
1083
1084	/* for conformance reasons */
1085	pthread_kern->__pthread_testcancel(0);
1086
1087	uint32_t csgen = (cvlsgen >> 32) & 0xffffffff;
1088	uint32_t cgen = cvlsgen & 0xffffffff;
1089	uint32_t ugen = (mugen >> 32) & 0xffffffff;
1090	uint32_t mgen = mugen & 0xffffffff;
1091
1092	uint32_t lockseq = (cgen & PTHRW_COUNT_MASK);
1093
1094	/*
1095	 * In cvwait U word can be out of range as cv could be used only for
1096	 * timeouts. However S word needs to be within bounds and validated at
1097	 * user level as well.
1098	 */
1099	if (is_seqhigher_eq(csgen, lockseq) != 0) {
1100		__FAILEDUSERTEST__("psync_cvwait; invalid sequence numbers\n");
1101		return EINVAL;
1102	}
1103
1104	error = ksyn_wqfind(cv, cgen, cvugen, csgen, flags, KSYN_WQTYPE_CVAR | KSYN_WQTYPE_INWAIT, &ckwq);
1105	if (error != 0) {
1106		return error;
1107	}
1108
1109	if (mutex != 0) {
1110		error = _psynch_mutexdrop(NULL, mutex, mgen, ugen, 0, flags, NULL);
1111		if (error != 0) {
1112			goto out;
1113		}
1114	}
1115
1116	ksyn_wqlock(ckwq);
1117
1118	// update L, U and S...
1119	UPDATE_CVKWQ(ckwq, cgen, cvugen, csgen);
1120
1121	/* Look for the sequence for prepost (or conflicting thread */
1122	ksyn_queue_t kq = &ckwq->kw_ksynqueues[KSYN_QUEUE_WRITER];
1123	kwe = ksyn_queue_find_cvpreposeq(kq, lockseq);
1124	if (kwe != NULL) {
1125		if (kwe->kwe_state == KWE_THREAD_PREPOST) {
1126			if ((kwe->kwe_lockseq & PTHRW_COUNT_MASK) == lockseq) {
1127				/* we can safely consume a reference, so do so */
1128				if (--kwe->kwe_count == 0) {
1129					ksyn_queue_remove_item(ckwq, kq, kwe);
1130					ckwq->kw_fakecount--;
1131					nkwe = kwe;
1132				}
1133			} else {
1134				/*
1135				 * consuming a prepost higher than our lock sequence is valid, but
1136				 * can leave the higher thread without a match. Convert the entry
1137				 * to a broadcast to compensate for this.
1138				 */
1139				ksyn_handle_cvbroad(ckwq, kwe->kwe_lockseq, &updatebits);
1140#if __TESTPANICS__
1141				if (updatebits != 0)
1142					panic("psync_cvwait: convert pre-post to broadcast: woke up %d threads that shouldn't be there\n", updatebits);
1143#endif /* __TESTPANICS__ */
1144			}
1145		} else if (kwe->kwe_state == KWE_THREAD_BROADCAST) {
1146			// XXX
1147			// Nothing to do.
1148		} else if (kwe->kwe_state == KWE_THREAD_INWAIT) {
1149			__FAILEDUSERTEST__("cvwait: thread entry with same sequence already present\n");
1150			error = EBUSY;
1151		} else {
1152			panic("psync_cvwait: unexpected wait queue element type\n");
1153		}
1154
1155		if (error == 0) {
1156			updatebits = PTHRW_INC;
1157			ckwq->kw_sword += PTHRW_INC;
1158
1159			/* set C or P bits and free if needed */
1160			ksyn_cvupdate_fixup(ckwq, &updatebits);
1161			*retval = updatebits;
1162		}
1163	} else {
1164		uint64_t abstime = 0;
1165
1166		if (sec != 0 || (nsec & 0x3fffffff) != 0) {
1167			struct timespec ts;
1168			ts.tv_sec = (__darwin_time_t)sec;
1169			ts.tv_nsec = (nsec & 0x3fffffff);
1170			nanoseconds_to_absolutetime((uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec, &abstime);
1171			clock_absolutetime_interval_to_deadline(abstime, &abstime);
1172		}
1173
1174		error = ksyn_wait(ckwq, KSYN_QUEUE_WRITER, cgen, SEQFIT, abstime, psynch_cvcontinue);
1175		// ksyn_wait drops wait queue lock
1176	}
1177
1178	ksyn_wqunlock(ckwq);
1179
1180	if (nkwe != NULL) {
1181		pthread_kern->zfree(kwe_zone, nkwe);
1182	}
1183out:
1184	ksyn_wqrelease(ckwq, 1, (KSYN_WQTYPE_INWAIT | KSYN_WQTYPE_CVAR));
1185	return error;
1186}
1187
1188
1189void
1190psynch_cvcontinue(void *parameter, wait_result_t result)
1191{
1192	uthread_t uth = current_uthread();
1193	ksyn_wait_queue_t ckwq = parameter;
1194	ksyn_waitq_element_t kwe = pthread_kern->uthread_get_uukwe(uth);
1195
1196	int error = _wait_result_to_errno(result);
1197	if (error != 0) {
1198		ksyn_wqlock(ckwq);
1199		/* just in case it got woken up as we were granting */
1200		pthread_kern->uthread_set_returnval(uth, kwe->kwe_psynchretval);
1201
1202		if (kwe->kwe_kwqqueue) {
1203			ksyn_queue_remove_item(ckwq, &ckwq->kw_ksynqueues[KSYN_QUEUE_WRITER], kwe);
1204		}
1205		if ((kwe->kwe_psynchretval & PTH_RWL_MTX_WAIT) != 0) {
1206			/* the condition var granted.
1207			 * reset the error so that the thread returns back.
1208			 */
1209			error = 0;
1210			/* no need to set any bits just return as cvsig/broad covers this */
1211		} else {
1212			ckwq->kw_sword += PTHRW_INC;
1213
1214			/* set C and P bits, in the local error */
1215			if ((ckwq->kw_lword & PTHRW_COUNT_MASK) == (ckwq->kw_sword & PTHRW_COUNT_MASK)) {
1216				error |= ECVCERORR;
1217				if (ckwq->kw_inqueue != 0) {
1218					ksyn_queue_free_items(ckwq, KSYN_QUEUE_WRITER, ckwq->kw_lword, 1);
1219				}
1220				ckwq->kw_lword = ckwq->kw_uword = ckwq->kw_sword = 0;
1221				ckwq->kw_kflags |= KSYN_KWF_ZEROEDOUT;
1222			} else {
1223				/* everythig in the queue is a fake entry ? */
1224				if (ckwq->kw_inqueue != 0 && ckwq->kw_fakecount == ckwq->kw_inqueue) {
1225					error |= ECVPERORR;
1226				}
1227			}
1228		}
1229		ksyn_wqunlock(ckwq);
1230	} else {
1231		int val = 0;
1232		// PTH_RWL_MTX_WAIT is removed
1233		if ((kwe->kwe_psynchretval & PTH_RWS_CV_MBIT) != 0) {
1234			val = PTHRW_INC | PTH_RWS_CV_CBIT;
1235		}
1236		pthread_kern->uthread_set_returnval(uth, val);
1237	}
1238
1239	ksyn_wqrelease(ckwq, 1, (KSYN_WQTYPE_INWAIT | KSYN_WQTYPE_CVAR));
1240	pthread_kern->unix_syscall_return(error);
1241}
1242
1243/*
1244 * psynch_cvclrprepost: This system call clears pending prepost if present.
1245 */
1246int
1247_psynch_cvclrprepost(__unused proc_t p,
1248		     user_addr_t cv,
1249		     uint32_t cvgen,
1250		     uint32_t cvugen,
1251		     uint32_t cvsgen,
1252		     __unused uint32_t prepocnt,
1253		     uint32_t preposeq,
1254		     uint32_t flags,
1255		     int *retval)
1256{
1257	int error = 0;
1258	int mutex = (flags & _PTHREAD_MTX_OPT_MUTEX);
1259	int wqtype = (mutex ? KSYN_WQTYPE_MTX : KSYN_WQTYPE_CVAR) | KSYN_WQTYPE_INDROP;
1260	ksyn_wait_queue_t kwq = NULL;
1261
1262	*retval = 0;
1263
1264	error = ksyn_wqfind(cv, cvgen, cvugen, mutex ? 0 : cvsgen, flags, wqtype, &kwq);
1265	if (error != 0) {
1266		return error;
1267	}
1268
1269	ksyn_wqlock(kwq);
1270
1271	if (mutex) {
1272		int firstfit = (flags & PTHREAD_POLICY_FLAGS_MASK) == _PTHREAD_MUTEX_POLICY_FIRSTFIT;
1273		if (firstfit && kwq->kw_pre_rwwc != 0) {
1274			if (is_seqlower_eq(kwq->kw_pre_lockseq, cvgen)) {
1275				// clear prepost
1276				kwq->kw_pre_rwwc = 0;
1277				kwq->kw_pre_lockseq = 0;
1278			}
1279		}
1280	} else {
1281		ksyn_queue_free_items(kwq, KSYN_QUEUE_WRITER, preposeq, 0);
1282	}
1283
1284	ksyn_wqunlock(kwq);
1285	ksyn_wqrelease(kwq, 1, wqtype);
1286	return error;
1287}
1288
1289/* ***************** pthread_rwlock ************************ */
1290
1291static int
1292__psynch_rw_lock(int type,
1293		 user_addr_t rwlock,
1294		 uint32_t lgenval,
1295		 uint32_t ugenval,
1296		 uint32_t rw_wc,
1297		 int flags,
1298		 uint32_t *retval)
1299{
1300	int prepost_type, kqi;
1301
1302	if (type == PTH_RW_TYPE_READ) {
1303		prepost_type = KW_UNLOCK_PREPOST_READLOCK;
1304		kqi = KSYN_QUEUE_READ;
1305	} else {
1306		prepost_type = KW_UNLOCK_PREPOST_WRLOCK;
1307		kqi = KSYN_QUEUE_WRITER;
1308	}
1309
1310	uint32_t lockseq = lgenval & PTHRW_COUNT_MASK;
1311
1312	int error;
1313	ksyn_wait_queue_t kwq;
1314	error = ksyn_wqfind(rwlock, lgenval, ugenval, rw_wc, flags, (KSYN_WQTYPE_INWAIT|KSYN_WQTYPE_RWLOCK), &kwq);
1315	if (error == 0) {
1316		ksyn_wqlock(kwq);
1317		_ksyn_check_init(kwq, lgenval);
1318		if (_ksyn_handle_missed_wakeups(kwq, type, lockseq, retval) ||
1319		    // handle overlap first as they are not counted against pre_rwwc
1320		    (type == PTH_RW_TYPE_READ && _ksyn_handle_overlap(kwq, lgenval, rw_wc, retval)) ||
1321		    _ksyn_handle_prepost(kwq, prepost_type, lockseq, retval)) {
1322			ksyn_wqunlock(kwq);
1323		} else {
1324			error = ksyn_wait(kwq, kqi, lgenval, SEQFIT, 0, THREAD_CONTINUE_NULL);
1325			// ksyn_wait drops wait queue lock
1326			if (error == 0) {
1327				uthread_t uth = current_uthread();
1328				ksyn_waitq_element_t kwe = pthread_kern->uthread_get_uukwe(uth);
1329				*retval = kwe->kwe_psynchretval;
1330			}
1331		}
1332		ksyn_wqrelease(kwq, 0, (KSYN_WQTYPE_INWAIT|KSYN_WQTYPE_RWLOCK));
1333	}
1334	return error;
1335}
1336
1337/*
1338 * psynch_rw_rdlock: This system call is used for psync rwlock readers to block.
1339 */
1340int
1341_psynch_rw_rdlock(__unused proc_t p,
1342		  user_addr_t rwlock,
1343		  uint32_t lgenval,
1344		  uint32_t ugenval,
1345		  uint32_t rw_wc,
1346		  int flags,
1347		  uint32_t *retval)
1348{
1349	return __psynch_rw_lock(PTH_RW_TYPE_READ, rwlock, lgenval, ugenval, rw_wc, flags, retval);
1350}
1351
1352/*
1353 * psynch_rw_longrdlock: This system call is used for psync rwlock long readers to block.
1354 */
1355int
1356_psynch_rw_longrdlock(__unused proc_t p,
1357		      __unused user_addr_t rwlock,
1358		      __unused uint32_t lgenval,
1359		      __unused uint32_t ugenval,
1360		      __unused uint32_t rw_wc,
1361		      __unused int flags,
1362		      __unused uint32_t *retval)
1363{
1364	return ESRCH;
1365}
1366
1367
1368/*
1369 * psynch_rw_wrlock: This system call is used for psync rwlock writers to block.
1370 */
1371int
1372_psynch_rw_wrlock(__unused proc_t p,
1373		  user_addr_t rwlock,
1374		  uint32_t lgenval,
1375		  uint32_t ugenval,
1376		  uint32_t rw_wc,
1377		  int flags,
1378		  uint32_t *retval)
1379{
1380	return __psynch_rw_lock(PTH_RW_TYPE_WRITE, rwlock, lgenval, ugenval, rw_wc, flags, retval);
1381}
1382
1383/*
1384 * psynch_rw_yieldwrlock: This system call is used for psync rwlock yielding writers to block.
1385 */
1386int
1387_psynch_rw_yieldwrlock(__unused proc_t p,
1388		       __unused user_addr_t rwlock,
1389		       __unused uint32_t lgenval,
1390		       __unused uint32_t ugenval,
1391		       __unused uint32_t rw_wc,
1392		       __unused int flags,
1393		       __unused uint32_t *retval)
1394{
1395	return ESRCH;
1396}
1397
1398/*
1399 * psynch_rw_unlock: This system call is used for unlock state postings. This will grant appropriate
1400 *			reader/writer variety lock.
1401 */
1402int
1403_psynch_rw_unlock(__unused proc_t p,
1404		  user_addr_t rwlock,
1405		  uint32_t lgenval,
1406		  uint32_t ugenval,
1407		  uint32_t rw_wc,
1408		  int flags,
1409		  uint32_t *retval)
1410{
1411	int error = 0;
1412	ksyn_wait_queue_t kwq;
1413	uint32_t updatebits = 0;
1414	int diff;
1415	uint32_t count = 0;
1416	uint32_t curgen = lgenval & PTHRW_COUNT_MASK;
1417
1418	error = ksyn_wqfind(rwlock, lgenval, ugenval, rw_wc, flags, (KSYN_WQTYPE_INDROP | KSYN_WQTYPE_RWLOCK), &kwq);
1419	if (error != 0) {
1420		return(error);
1421	}
1422
1423	ksyn_wqlock(kwq);
1424	int isinit = _ksyn_check_init(kwq, lgenval);
1425
1426	/* if lastunlock seq is set, ensure the current one is not lower than that, as it would be spurious */
1427	if ((kwq->kw_lastunlockseq != PTHRW_RWL_INIT) && (is_seqlower(ugenval, kwq->kw_lastunlockseq)!= 0)) {
1428		error = 0;
1429		goto out;
1430	}
1431
1432	/* If L-U != num of waiters, then it needs to be preposted or spr */
1433	diff = find_diff(lgenval, ugenval);
1434
1435	if (find_seq_till(kwq, curgen, diff, &count) == 0) {
1436		if ((count == 0) || (count < (uint32_t)diff))
1437			goto prepost;
1438	}
1439
1440	/* no prepost and all threads are in place, reset the bit */
1441	if ((isinit != 0) && ((kwq->kw_kflags & KSYN_KWF_INITCLEARED) != 0)){
1442		kwq->kw_kflags &= ~KSYN_KWF_INITCLEARED;
1443	}
1444
1445	/* can handle unlock now */
1446
1447	CLEAR_PREPOST_BITS(kwq);
1448
1449	error = kwq_handle_unlock(kwq, lgenval, rw_wc, &updatebits, 0, NULL, 0);
1450#if __TESTPANICS__
1451	if (error != 0)
1452		panic("psynch_rw_unlock: kwq_handle_unlock failed %d\n",error);
1453#endif /* __TESTPANICS__ */
1454out:
1455	if (error == 0) {
1456		/* update bits?? */
1457		*retval = updatebits;
1458	}
1459
1460
1461	ksyn_wqunlock(kwq);
1462	ksyn_wqrelease(kwq, 0, (KSYN_WQTYPE_INDROP | KSYN_WQTYPE_RWLOCK));
1463
1464	return(error);
1465
1466prepost:
1467	/* update if the new seq is higher than prev prepost, or first set */
1468	if (is_rws_setseq(kwq->kw_pre_sseq) ||
1469	    is_seqhigher_eq(rw_wc, kwq->kw_pre_sseq)) {
1470		kwq->kw_pre_rwwc = (diff - count);
1471		kwq->kw_pre_lockseq = curgen;
1472		kwq->kw_pre_sseq = rw_wc;
1473		updatebits = lgenval;	/* let this not do unlock handling */
1474	}
1475	error = 0;
1476	goto out;
1477}
1478
1479
1480/* ************************************************************************** */
1481void
1482pth_global_hashinit(void)
1483{
1484	pth_glob_hashtbl = hashinit(PTH_HASHSIZE * 4, M_PROC, &pthhash);
1485}
1486
1487void
1488_pth_proc_hashinit(proc_t p)
1489{
1490	void *ptr = hashinit(PTH_HASHSIZE, M_PCB, &pthhash);
1491	if (ptr == NULL) {
1492		panic("pth_proc_hashinit: hash init returned 0\n");
1493	}
1494
1495	pthread_kern->proc_set_pthhash(p, ptr);
1496}
1497
1498
1499static int
1500ksyn_wq_hash_lookup(user_addr_t uaddr,
1501		    proc_t p,
1502		    int flags,
1503		    ksyn_wait_queue_t *out_kwq,
1504		    struct pthhashhead **out_hashptr,
1505		    uint64_t *out_object,
1506		    uint64_t *out_offset)
1507{
1508	int res = 0;
1509	ksyn_wait_queue_t kwq;
1510	uint64_t object = 0, offset = 0;
1511	struct pthhashhead *hashptr;
1512	if ((flags & PTHREAD_PSHARED_FLAGS_MASK) == PTHREAD_PROCESS_SHARED) {
1513		hashptr = pth_glob_hashtbl;
1514		res = ksyn_findobj(uaddr, &object, &offset);
1515		if (res == 0) {
1516			LIST_FOREACH(kwq, &hashptr[object & pthhash], kw_hash) {
1517				if (kwq->kw_object == object && kwq->kw_offset == offset) {
1518					break;
1519				}
1520			}
1521		} else {
1522			kwq = NULL;
1523		}
1524	} else {
1525		hashptr = pthread_kern->proc_get_pthhash(p);
1526		LIST_FOREACH(kwq, &hashptr[uaddr & pthhash], kw_hash) {
1527			if (kwq->kw_addr == uaddr) {
1528				break;
1529			}
1530		}
1531	}
1532	*out_kwq = kwq;
1533	*out_object = object;
1534	*out_offset = offset;
1535	*out_hashptr = hashptr;
1536	return res;
1537}
1538
1539void
1540_pth_proc_hashdelete(proc_t p)
1541{
1542	struct pthhashhead * hashptr;
1543	ksyn_wait_queue_t kwq;
1544	unsigned long hashsize = pthhash + 1;
1545	unsigned long i;
1546
1547	hashptr = pthread_kern->proc_get_pthhash(p);
1548	pthread_kern->proc_set_pthhash(p, NULL);
1549	if (hashptr == NULL) {
1550		return;
1551	}
1552
1553	pthread_list_lock();
1554	for(i= 0; i < hashsize; i++) {
1555		while ((kwq = LIST_FIRST(&hashptr[i])) != NULL) {
1556			if ((kwq->kw_pflags & KSYN_WQ_INHASH) != 0) {
1557				kwq->kw_pflags &= ~KSYN_WQ_INHASH;
1558				LIST_REMOVE(kwq, kw_hash);
1559			}
1560			if ((kwq->kw_pflags & KSYN_WQ_FLIST) != 0) {
1561				kwq->kw_pflags &= ~KSYN_WQ_FLIST;
1562				LIST_REMOVE(kwq, kw_list);
1563			}
1564			pthread_list_unlock();
1565			/* release fake entries if present for cvars */
1566			if (((kwq->kw_type & KSYN_WQTYPE_MASK) == KSYN_WQTYPE_CVAR) && (kwq->kw_inqueue != 0))
1567				ksyn_freeallkwe(&kwq->kw_ksynqueues[KSYN_QUEUE_WRITER]);
1568			lck_mtx_destroy(&kwq->kw_lock, pthread_lck_grp);
1569			pthread_kern->zfree(kwq_zone, kwq);
1570			pthread_list_lock();
1571		}
1572	}
1573	pthread_list_unlock();
1574	FREE(hashptr, M_PROC);
1575}
1576
1577/* no lock held for this as the waitqueue is getting freed */
1578void
1579ksyn_freeallkwe(ksyn_queue_t kq)
1580{
1581	ksyn_waitq_element_t kwe;
1582	while ((kwe = TAILQ_FIRST(&kq->ksynq_kwelist)) != NULL) {
1583		TAILQ_REMOVE(&kq->ksynq_kwelist, kwe, kwe_list);
1584		if (kwe->kwe_state != KWE_THREAD_INWAIT) {
1585			pthread_kern->zfree(kwe_zone, kwe);
1586		}
1587	}
1588}
1589
1590/* find kernel waitqueue, if not present create one. Grants a reference  */
1591int
1592ksyn_wqfind(user_addr_t uaddr, uint32_t mgen, uint32_t ugen, uint32_t sgen, int flags, int wqtype, ksyn_wait_queue_t *kwqp)
1593{
1594	int res = 0;
1595	ksyn_wait_queue_t kwq = NULL;
1596	ksyn_wait_queue_t nkwq = NULL;
1597	struct pthhashhead *hashptr;
1598	proc_t p = current_proc();
1599
1600	uint64_t object = 0, offset = 0;
1601	if ((flags & PTHREAD_PSHARED_FLAGS_MASK) == PTHREAD_PROCESS_SHARED) {
1602		res = ksyn_findobj(uaddr, &object, &offset);
1603		hashptr = pth_glob_hashtbl;
1604	} else {
1605		hashptr = pthread_kern->proc_get_pthhash(p);
1606	}
1607
1608	while (res == 0) {
1609		pthread_list_lock();
1610		res = ksyn_wq_hash_lookup(uaddr, current_proc(), flags, &kwq, &hashptr, &object, &offset);
1611		if (res != 0) {
1612			break;
1613		}
1614		if (kwq == NULL && nkwq == NULL) {
1615			// Drop the lock to allocate a new kwq and retry.
1616			pthread_list_unlock();
1617
1618			nkwq = (ksyn_wait_queue_t)pthread_kern->zalloc(kwq_zone);
1619			bzero(nkwq, sizeof(struct ksyn_wait_queue));
1620			int i;
1621			for (i = 0; i < KSYN_QUEUE_MAX; i++) {
1622				ksyn_queue_init(&nkwq->kw_ksynqueues[i]);
1623			}
1624			lck_mtx_init(&nkwq->kw_lock, pthread_lck_grp, pthread_lck_attr);
1625			continue;
1626		} else if (kwq == NULL && nkwq != NULL) {
1627			// Still not found, add the new kwq to the hash.
1628			kwq = nkwq;
1629			nkwq = NULL; // Don't free.
1630			if ((flags & PTHREAD_PSHARED_FLAGS_MASK) == PTHREAD_PROCESS_SHARED) {
1631				kwq->kw_pflags |= KSYN_WQ_SHARED;
1632				LIST_INSERT_HEAD(&hashptr[object & pthhash], kwq, kw_hash);
1633			} else {
1634				LIST_INSERT_HEAD(&hashptr[uaddr & pthhash], kwq, kw_hash);
1635			}
1636			kwq->kw_pflags |= KSYN_WQ_INHASH;
1637		} else if (kwq != NULL) {
1638			// Found an existing kwq, use it.
1639			if ((kwq->kw_pflags & KSYN_WQ_FLIST) != 0) {
1640				LIST_REMOVE(kwq, kw_list);
1641				kwq->kw_pflags &= ~KSYN_WQ_FLIST;
1642			}
1643			if ((kwq->kw_type & KSYN_WQTYPE_MASK) != (wqtype & KSYN_WQTYPE_MASK)) {
1644				if (kwq->kw_inqueue == 0 && kwq->kw_pre_rwwc == 0 && kwq->kw_pre_intrcount == 0) {
1645					if (kwq->kw_iocount == 0) {
1646						kwq->kw_type = 0; // mark for reinitialization
1647					} else if (kwq->kw_iocount == 1 && kwq->kw_dropcount == kwq->kw_iocount) {
1648						/* if all users are unlockers then wait for it to finish */
1649						kwq->kw_pflags |= KSYN_WQ_WAITING;
1650						// Drop the lock and wait for the kwq to be free.
1651						(void)msleep(&kwq->kw_pflags, pthread_list_mlock, PDROP, "ksyn_wqfind", 0);
1652						continue;
1653					} else {
1654						__FAILEDUSERTEST__("address already known to kernel for another [busy] synchronizer type\n");
1655						res = EINVAL;
1656					}
1657				} else {
1658					__FAILEDUSERTEST__("address already known to kernel for another [busy] synchronizer type\n");
1659					res = EINVAL;
1660				}
1661			}
1662		}
1663		if (res == 0) {
1664			if (kwq->kw_type == 0) {
1665				kwq->kw_addr = uaddr;
1666				kwq->kw_object = object;
1667				kwq->kw_offset = offset;
1668				kwq->kw_type = (wqtype & KSYN_WQTYPE_MASK);
1669				CLEAR_REINIT_BITS(kwq);
1670				kwq->kw_lword = mgen;
1671				kwq->kw_uword = ugen;
1672				kwq->kw_sword = sgen;
1673				kwq->kw_owner = 0;
1674				kwq->kw_kflags = 0;
1675				kwq->kw_qos_override = THREAD_QOS_UNSPECIFIED;
1676			}
1677			kwq->kw_iocount++;
1678			if (wqtype == KSYN_WQTYPE_MUTEXDROP) {
1679				kwq->kw_dropcount++;
1680			}
1681		}
1682		break;
1683	}
1684	pthread_list_unlock();
1685	if (kwqp != NULL) {
1686		*kwqp = kwq;
1687	}
1688	if (nkwq) {
1689		lck_mtx_destroy(&nkwq->kw_lock, pthread_lck_grp);
1690		pthread_kern->zfree(kwq_zone, nkwq);
1691	}
1692	return res;
1693}
1694
1695/* Reference from find is dropped here. Starts the free process if needed */
1696void
1697ksyn_wqrelease(ksyn_wait_queue_t kwq, int qfreenow, int wqtype)
1698{
1699	uint64_t deadline;
1700	ksyn_wait_queue_t free_elem = NULL;
1701
1702	pthread_list_lock();
1703	if (wqtype == KSYN_WQTYPE_MUTEXDROP) {
1704		kwq->kw_dropcount--;
1705	}
1706	if (--kwq->kw_iocount == 0) {
1707		if ((kwq->kw_pflags & KSYN_WQ_WAITING) != 0) {
1708			/* some one is waiting for the waitqueue, wake them up */
1709			kwq->kw_pflags &= ~KSYN_WQ_WAITING;
1710			wakeup(&kwq->kw_pflags);
1711		}
1712
1713		if (kwq->kw_pre_rwwc == 0 && kwq->kw_inqueue == 0 && kwq->kw_pre_intrcount == 0) {
1714			if (qfreenow == 0) {
1715				microuptime(&kwq->kw_ts);
1716				LIST_INSERT_HEAD(&pth_free_list, kwq, kw_list);
1717				kwq->kw_pflags |= KSYN_WQ_FLIST;
1718				if (psynch_cleanupset == 0) {
1719					struct timeval t;
1720					microuptime(&t);
1721					t.tv_sec += KSYN_CLEANUP_DEADLINE;
1722					deadline = tvtoabstime(&t);
1723					thread_call_enter_delayed(psynch_thcall, deadline);
1724					psynch_cleanupset = 1;
1725				}
1726			} else {
1727				kwq->kw_pflags &= ~KSYN_WQ_INHASH;
1728				LIST_REMOVE(kwq, kw_hash);
1729				free_elem = kwq;
1730			}
1731		}
1732	}
1733	pthread_list_unlock();
1734	if (free_elem != NULL) {
1735		lck_mtx_destroy(&free_elem->kw_lock, pthread_lck_grp);
1736		pthread_kern->zfree(kwq_zone, free_elem);
1737	}
1738}
1739
1740/* responsible to free the waitqueues */
1741void
1742psynch_wq_cleanup(__unused void *param, __unused void * param1)
1743{
1744	ksyn_wait_queue_t kwq;
1745	struct timeval t;
1746	int reschedule = 0;
1747	uint64_t deadline = 0;
1748	LIST_HEAD(, ksyn_wait_queue) freelist;
1749	LIST_INIT(&freelist);
1750
1751	pthread_list_lock();
1752
1753	microuptime(&t);
1754
1755	LIST_FOREACH(kwq, &pth_free_list, kw_list) {
1756		if (kwq->kw_iocount != 0 || kwq->kw_pre_rwwc != 0 || kwq->kw_inqueue != 0 || kwq->kw_pre_intrcount != 0) {
1757			// still in use
1758			continue;
1759		}
1760		__darwin_time_t diff = t.tv_sec - kwq->kw_ts.tv_sec;
1761		if (diff < 0)
1762			diff *= -1;
1763		if (diff >= KSYN_CLEANUP_DEADLINE) {
1764			kwq->kw_pflags &= ~(KSYN_WQ_FLIST | KSYN_WQ_INHASH);
1765			LIST_REMOVE(kwq, kw_hash);
1766			LIST_REMOVE(kwq, kw_list);
1767			LIST_INSERT_HEAD(&freelist, kwq, kw_list);
1768		} else {
1769			reschedule = 1;
1770		}
1771
1772	}
1773	if (reschedule != 0) {
1774		t.tv_sec += KSYN_CLEANUP_DEADLINE;
1775		deadline = tvtoabstime(&t);
1776		thread_call_enter_delayed(psynch_thcall, deadline);
1777		psynch_cleanupset = 1;
1778	} else {
1779		psynch_cleanupset = 0;
1780	}
1781	pthread_list_unlock();
1782
1783	while ((kwq = LIST_FIRST(&freelist)) != NULL) {
1784		LIST_REMOVE(kwq, kw_list);
1785		lck_mtx_destroy(&kwq->kw_lock, pthread_lck_grp);
1786		pthread_kern->zfree(kwq_zone, kwq);
1787	}
1788}
1789
1790static int
1791_wait_result_to_errno(wait_result_t result)
1792{
1793	int res = 0;
1794	switch (result) {
1795		case THREAD_TIMED_OUT:
1796			res = ETIMEDOUT;
1797			break;
1798		case THREAD_INTERRUPTED:
1799			res = EINTR;
1800			break;
1801	}
1802	return res;
1803}
1804
1805int
1806ksyn_wait(ksyn_wait_queue_t kwq,
1807	  int kqi,
1808	  uint32_t lockseq,
1809	  int fit,
1810	  uint64_t abstime,
1811	  thread_continue_t continuation)
1812{
1813	int res;
1814
1815	thread_t th = current_thread();
1816	uthread_t uth = pthread_kern->get_bsdthread_info(th);
1817	ksyn_waitq_element_t kwe = pthread_kern->uthread_get_uukwe(uth);
1818	bzero(kwe, sizeof(*kwe));
1819	kwe->kwe_count = 1;
1820	kwe->kwe_lockseq = lockseq & PTHRW_COUNT_MASK;
1821	kwe->kwe_state = KWE_THREAD_INWAIT;
1822	kwe->kwe_uth = uth;
1823	kwe->kwe_tid = thread_tid(th);
1824
1825	res = ksyn_queue_insert(kwq, kqi, kwe, lockseq, fit);
1826	if (res != 0) {
1827		//panic("psynch_rw_wrlock: failed to enqueue\n"); // XXX
1828		ksyn_wqunlock(kwq);
1829		return res;
1830	}
1831
1832	assert_wait_deadline_with_leeway(&kwe->kwe_psynchretval, THREAD_ABORTSAFE, TIMEOUT_URGENCY_USER_NORMAL, abstime, 0);
1833	ksyn_wqunlock(kwq);
1834
1835	kern_return_t ret;
1836	if (continuation == THREAD_CONTINUE_NULL) {
1837		ret = thread_block(NULL);
1838	} else {
1839		ret = thread_block_parameter(continuation, kwq);
1840
1841		// If thread_block_parameter returns (interrupted) call the
1842		// continuation manually to clean up.
1843		continuation(kwq, ret);
1844
1845		// NOT REACHED
1846		panic("ksyn_wait continuation returned");
1847	}
1848
1849	res = _wait_result_to_errno(ret);
1850	if (res != 0) {
1851		ksyn_wqlock(kwq);
1852		if (kwe->kwe_kwqqueue) {
1853			ksyn_queue_remove_item(kwq, &kwq->kw_ksynqueues[kqi], kwe);
1854		}
1855		ksyn_wqunlock(kwq);
1856	}
1857	return res;
1858}
1859
1860kern_return_t
1861ksyn_signal(ksyn_wait_queue_t kwq,
1862	    int kqi,
1863	    ksyn_waitq_element_t kwe,
1864	    uint32_t updateval)
1865{
1866	kern_return_t ret;
1867
1868	// If no wait element was specified, wake the first.
1869	if (!kwe) {
1870		kwe = TAILQ_FIRST(&kwq->kw_ksynqueues[kqi].ksynq_kwelist);
1871		if (!kwe) {
1872			panic("ksyn_signal: panic signaling empty queue");
1873		}
1874	}
1875
1876	if (kwe->kwe_state != KWE_THREAD_INWAIT) {
1877		panic("ksyn_signal: panic signaling non-waiting element");
1878	}
1879
1880	ksyn_queue_remove_item(kwq, &kwq->kw_ksynqueues[kqi], kwe);
1881	kwe->kwe_psynchretval = updateval;
1882
1883	ret = thread_wakeup_one((caddr_t)&kwe->kwe_psynchretval);
1884	if (ret != KERN_SUCCESS && ret != KERN_NOT_WAITING) {
1885		panic("ksyn_signal: panic waking up thread %x\n", ret);
1886	}
1887	return ret;
1888}
1889
1890int
1891ksyn_findobj(user_addr_t uaddr, uint64_t *objectp, uint64_t *offsetp)
1892{
1893	kern_return_t ret;
1894	vm_page_info_basic_data_t info;
1895	mach_msg_type_number_t count = VM_PAGE_INFO_BASIC_COUNT;
1896	ret = pthread_kern->vm_map_page_info(pthread_kern->current_map(), uaddr, VM_PAGE_INFO_BASIC, (vm_page_info_t)&info, &count);
1897	if (ret != KERN_SUCCESS) {
1898		return EINVAL;
1899	}
1900
1901	if (objectp != NULL) {
1902		*objectp = (uint64_t)info.object_id;
1903	}
1904	if (offsetp != NULL) {
1905		*offsetp = (uint64_t)info.offset;
1906	}
1907
1908	return(0);
1909}
1910
1911
1912/* lowest of kw_fr, kw_flr, kw_fwr, kw_fywr */
1913int
1914kwq_find_rw_lowest(ksyn_wait_queue_t kwq, int flags, uint32_t premgen, int *typep, uint32_t lowest[])
1915{
1916	uint32_t kw_fr, kw_fwr, low;
1917	int type = 0, lowtype, typenum[2] = { 0 };
1918	uint32_t numbers[2] = { 0 };
1919	int count = 0, i;
1920
1921
1922	if ((kwq->kw_ksynqueues[KSYN_QUEUE_READ].ksynq_count != 0) || ((flags & KW_UNLOCK_PREPOST_READLOCK) != 0)) {
1923		type |= PTH_RWSHFT_TYPE_READ;
1924		/* read entries are present */
1925		if (kwq->kw_ksynqueues[KSYN_QUEUE_READ].ksynq_count != 0) {
1926			kw_fr = kwq->kw_ksynqueues[KSYN_QUEUE_READ].ksynq_firstnum;
1927			if (((flags & KW_UNLOCK_PREPOST_READLOCK) != 0) && (is_seqlower(premgen, kw_fr) != 0))
1928				kw_fr = premgen;
1929		} else
1930			kw_fr = premgen;
1931
1932		lowest[KSYN_QUEUE_READ] = kw_fr;
1933		numbers[count]= kw_fr;
1934		typenum[count] = PTH_RW_TYPE_READ;
1935		count++;
1936	} else
1937		lowest[KSYN_QUEUE_READ] = 0;
1938
1939	if ((kwq->kw_ksynqueues[KSYN_QUEUE_WRITER].ksynq_count != 0) || ((flags & KW_UNLOCK_PREPOST_WRLOCK) != 0)) {
1940		type |= PTH_RWSHFT_TYPE_WRITE;
1941		/* read entries are present */
1942		if (kwq->kw_ksynqueues[KSYN_QUEUE_WRITER].ksynq_count != 0) {
1943			kw_fwr = kwq->kw_ksynqueues[KSYN_QUEUE_WRITER].ksynq_firstnum;
1944			if (((flags & KW_UNLOCK_PREPOST_WRLOCK) != 0) && (is_seqlower(premgen, kw_fwr) != 0))
1945				kw_fwr = premgen;
1946		} else
1947			kw_fwr = premgen;
1948
1949		lowest[KSYN_QUEUE_WRITER] = kw_fwr;
1950		numbers[count]= kw_fwr;
1951		typenum[count] = PTH_RW_TYPE_WRITE;
1952		count++;
1953	} else
1954		lowest[KSYN_QUEUE_WRITER] = 0;
1955
1956#if __TESTPANICS__
1957	if (count == 0)
1958		panic("nothing in the queue???\n");
1959#endif /* __TESTPANICS__ */
1960
1961	low = numbers[0];
1962	lowtype = typenum[0];
1963	if (count > 1) {
1964		for (i = 1; i< count; i++) {
1965			if (is_seqlower(numbers[i] , low) != 0) {
1966				low = numbers[i];
1967				lowtype = typenum[i];
1968			}
1969		}
1970	}
1971	type |= lowtype;
1972
1973	if (typep != 0)
1974		*typep = type;
1975	return(0);
1976}
1977
1978/* wakeup readers to upto the writer limits */
1979int
1980ksyn_wakeupreaders(ksyn_wait_queue_t kwq, uint32_t limitread, int allreaders, uint32_t updatebits, int *wokenp)
1981{
1982	ksyn_queue_t kq;
1983	int failedwakeup = 0;
1984	int numwoken = 0;
1985	kern_return_t kret = KERN_SUCCESS;
1986	uint32_t lbits = 0;
1987
1988	lbits = updatebits;
1989
1990	kq = &kwq->kw_ksynqueues[KSYN_QUEUE_READ];
1991	while ((kq->ksynq_count != 0) && (allreaders || (is_seqlower(kq->ksynq_firstnum, limitread) != 0))) {
1992		kret = ksyn_signal(kwq, KSYN_QUEUE_READ, NULL, lbits);
1993		if (kret == KERN_NOT_WAITING) {
1994			failedwakeup++;
1995		}
1996		numwoken++;
1997	}
1998
1999	if (wokenp != NULL)
2000		*wokenp = numwoken;
2001	return(failedwakeup);
2002}
2003
2004
2005/* This handles the unlock grants for next set on rw_unlock() or on arrival of all preposted waiters */
2006int
2007kwq_handle_unlock(ksyn_wait_queue_t kwq,
2008		  __unused uint32_t mgen,
2009		  uint32_t rw_wc,
2010		  uint32_t *updatep,
2011		  int flags,
2012		  int *blockp,
2013		  uint32_t premgen)
2014{
2015	uint32_t low_writer, limitrdnum;
2016	int rwtype, error=0;
2017	int allreaders, failed;
2018	uint32_t updatebits=0, numneeded = 0;;
2019	int prepost = flags & KW_UNLOCK_PREPOST;
2020	thread_t preth = THREAD_NULL;
2021	ksyn_waitq_element_t kwe;
2022	uthread_t uth;
2023	thread_t th;
2024	int woken = 0;
2025	int block = 1;
2026	uint32_t lowest[KSYN_QUEUE_MAX]; /* np need for upgrade as it is handled separately */
2027	kern_return_t kret = KERN_SUCCESS;
2028	ksyn_queue_t kq;
2029	int curthreturns = 0;
2030
2031	if (prepost != 0) {
2032		preth = current_thread();
2033	}
2034
2035	kq = &kwq->kw_ksynqueues[KSYN_QUEUE_READ];
2036	kwq->kw_lastseqword = rw_wc;
2037	kwq->kw_lastunlockseq = (rw_wc & PTHRW_COUNT_MASK);
2038	kwq->kw_overlapwatch = 0;
2039
2040	error = kwq_find_rw_lowest(kwq, flags, premgen, &rwtype, lowest);
2041#if __TESTPANICS__
2042	if (error != 0)
2043		panic("rwunlock: cannot fails to slot next round of threads");
2044#endif /* __TESTPANICS__ */
2045
2046	low_writer = lowest[KSYN_QUEUE_WRITER];
2047
2048	allreaders = 0;
2049	updatebits = 0;
2050
2051	switch (rwtype & PTH_RW_TYPE_MASK) {
2052		case PTH_RW_TYPE_READ: {
2053			// XXX
2054			/* what about the preflight which is LREAD or READ ?? */
2055			if ((rwtype & PTH_RWSHFT_TYPE_MASK) != 0) {
2056				if (rwtype & PTH_RWSHFT_TYPE_WRITE) {
2057					updatebits |= (PTH_RWL_WBIT | PTH_RWL_KBIT);
2058				}
2059			}
2060			limitrdnum = 0;
2061			if ((rwtype & PTH_RWSHFT_TYPE_WRITE) != 0) {
2062				limitrdnum = low_writer;
2063			} else {
2064				allreaders = 1;
2065			}
2066
2067			numneeded = 0;
2068
2069			if ((rwtype & PTH_RWSHFT_TYPE_WRITE) != 0) {
2070				limitrdnum = low_writer;
2071				numneeded = ksyn_queue_count_tolowest(kq, limitrdnum);
2072				if (((flags & KW_UNLOCK_PREPOST_READLOCK) != 0) && (is_seqlower(premgen, limitrdnum) != 0)) {
2073					curthreturns = 1;
2074					numneeded += 1;
2075				}
2076			} else {
2077				// no writers at all
2078				// no other waiters only readers
2079				kwq->kw_overlapwatch = 1;
2080				numneeded += kwq->kw_ksynqueues[KSYN_QUEUE_READ].ksynq_count;
2081				if ((flags & KW_UNLOCK_PREPOST_READLOCK) != 0) {
2082					curthreturns = 1;
2083					numneeded += 1;
2084				}
2085			}
2086
2087			updatebits += (numneeded << PTHRW_COUNT_SHIFT);
2088
2089			kwq->kw_nextseqword = (rw_wc & PTHRW_COUNT_MASK) + updatebits;
2090
2091			if (curthreturns != 0) {
2092				block = 0;
2093				uth = current_uthread();
2094				kwe = pthread_kern->uthread_get_uukwe(uth);
2095				kwe->kwe_psynchretval = updatebits;
2096			}
2097
2098
2099			failed = ksyn_wakeupreaders(kwq, limitrdnum, allreaders, updatebits, &woken);
2100			if (failed != 0) {
2101				kwq->kw_pre_intrcount = failed;	/* actually a count */
2102				kwq->kw_pre_intrseq = limitrdnum;
2103				kwq->kw_pre_intrretbits = updatebits;
2104				kwq->kw_pre_intrtype = PTH_RW_TYPE_READ;
2105			}
2106
2107			error = 0;
2108
2109			if ((kwq->kw_ksynqueues[KSYN_QUEUE_WRITER].ksynq_count != 0) && ((updatebits & PTH_RWL_WBIT) == 0))
2110				panic("kwq_handle_unlock: writer pending but no writebit set %x\n", updatebits);
2111		}
2112			break;
2113
2114		case PTH_RW_TYPE_WRITE: {
2115
2116			/* only one thread is goin to be granted */
2117			updatebits |= (PTHRW_INC);
2118			updatebits |= PTH_RWL_KBIT| PTH_RWL_EBIT;
2119
2120			if (((flags & KW_UNLOCK_PREPOST_WRLOCK) != 0) && (low_writer == premgen)) {
2121				block = 0;
2122				if (kwq->kw_ksynqueues[KSYN_QUEUE_WRITER].ksynq_count != 0) {
2123					updatebits |= PTH_RWL_WBIT;
2124				}
2125				th = preth;
2126				uth = pthread_kern->get_bsdthread_info(th);
2127				kwe = pthread_kern->uthread_get_uukwe(uth);
2128				kwe->kwe_psynchretval = updatebits;
2129			} else {
2130				/* we are not granting writelock to the preposting thread */
2131				/* if there are writers present or the preposting write thread then W bit is to be set */
2132				if (kwq->kw_ksynqueues[KSYN_QUEUE_WRITER].ksynq_count > 1 ||
2133				    (flags & KW_UNLOCK_PREPOST_WRLOCK) != 0) {
2134					updatebits |= PTH_RWL_WBIT;
2135				}
2136				/* setup next in the queue */
2137				kret = ksyn_signal(kwq, KSYN_QUEUE_WRITER, NULL, updatebits);
2138				if (kret == KERN_NOT_WAITING) {
2139					kwq->kw_pre_intrcount = 1;	/* actually a count */
2140					kwq->kw_pre_intrseq = low_writer;
2141					kwq->kw_pre_intrretbits = updatebits;
2142					kwq->kw_pre_intrtype = PTH_RW_TYPE_WRITE;
2143				}
2144				error = 0;
2145			}
2146			kwq->kw_nextseqword = (rw_wc & PTHRW_COUNT_MASK) + updatebits;
2147			if ((updatebits & (PTH_RWL_KBIT | PTH_RWL_EBIT)) != (PTH_RWL_KBIT | PTH_RWL_EBIT))
2148				panic("kwq_handle_unlock: writer lock granted but no ke set %x\n", updatebits);
2149		}
2150			break;
2151
2152		default:
2153			panic("rwunlock: invalid type for lock grants");
2154
2155	};
2156
2157	if (updatep != NULL)
2158		*updatep = updatebits;
2159	if (blockp != NULL)
2160		*blockp = block;
2161	return(error);
2162}
2163
2164/************* Indiv queue support routines ************************/
2165void
2166ksyn_queue_init(ksyn_queue_t kq)
2167{
2168	TAILQ_INIT(&kq->ksynq_kwelist);
2169	kq->ksynq_count = 0;
2170	kq->ksynq_firstnum = 0;
2171	kq->ksynq_lastnum = 0;
2172}
2173
2174int
2175ksyn_queue_insert(ksyn_wait_queue_t kwq, int kqi, ksyn_waitq_element_t kwe, uint32_t mgen, int fit)
2176{
2177	ksyn_queue_t kq = &kwq->kw_ksynqueues[kqi];
2178	uint32_t lockseq = mgen & PTHRW_COUNT_MASK;
2179	int res = 0;
2180
2181	if (kwe->kwe_kwqqueue != NULL) {
2182		panic("adding enqueued item to another queue");
2183	}
2184
2185	if (kq->ksynq_count == 0) {
2186		TAILQ_INSERT_HEAD(&kq->ksynq_kwelist, kwe, kwe_list);
2187		kq->ksynq_firstnum = lockseq;
2188		kq->ksynq_lastnum = lockseq;
2189	} else if (fit == FIRSTFIT) {
2190		/* TBD: if retry bit is set for mutex, add it to the head */
2191		/* firstfit, arriving order */
2192		TAILQ_INSERT_TAIL(&kq->ksynq_kwelist, kwe, kwe_list);
2193		if (is_seqlower(lockseq, kq->ksynq_firstnum)) {
2194			kq->ksynq_firstnum = lockseq;
2195		}
2196		if (is_seqhigher(lockseq, kq->ksynq_lastnum)) {
2197			kq->ksynq_lastnum = lockseq;
2198		}
2199	} else if (lockseq == kq->ksynq_firstnum || lockseq == kq->ksynq_lastnum) {
2200		/* During prepost when a thread is getting cancelled, we could have two with same seq */
2201		res = EBUSY;
2202		if (kwe->kwe_state == KWE_THREAD_PREPOST) {
2203			ksyn_waitq_element_t tmp = ksyn_queue_find_seq(kwq, kq, lockseq);
2204			if (tmp != NULL && tmp->kwe_uth != NULL && pthread_kern->uthread_is_cancelled(tmp->kwe_uth)) {
2205				TAILQ_INSERT_TAIL(&kq->ksynq_kwelist, kwe, kwe_list);
2206				res = 0;
2207			}
2208		}
2209	} else if (is_seqlower(kq->ksynq_lastnum, lockseq)) { // XXX is_seqhigher
2210		TAILQ_INSERT_TAIL(&kq->ksynq_kwelist, kwe, kwe_list);
2211		kq->ksynq_lastnum = lockseq;
2212	} else if (is_seqlower(lockseq, kq->ksynq_firstnum)) {
2213		TAILQ_INSERT_HEAD(&kq->ksynq_kwelist, kwe, kwe_list);
2214		kq->ksynq_firstnum = lockseq;
2215	} else {
2216		ksyn_waitq_element_t q_kwe, r_kwe;
2217
2218		res = ESRCH;
2219		TAILQ_FOREACH_SAFE(q_kwe, &kq->ksynq_kwelist, kwe_list, r_kwe) {
2220			if (is_seqhigher(q_kwe->kwe_lockseq, lockseq)) {
2221				TAILQ_INSERT_BEFORE(q_kwe, kwe, kwe_list);
2222				res = 0;
2223				break;
2224			}
2225		}
2226	}
2227
2228	if (res == 0) {
2229		kwe->kwe_kwqqueue = kwq;
2230		kq->ksynq_count++;
2231		kwq->kw_inqueue++;
2232		update_low_high(kwq, lockseq);
2233	}
2234	return res;
2235}
2236
2237void
2238ksyn_queue_remove_item(ksyn_wait_queue_t kwq, ksyn_queue_t kq, ksyn_waitq_element_t kwe)
2239{
2240	if (kq->ksynq_count == 0) {
2241		panic("removing item from empty queue");
2242	}
2243
2244	if (kwe->kwe_kwqqueue != kwq) {
2245		panic("removing item from wrong queue");
2246	}
2247
2248	TAILQ_REMOVE(&kq->ksynq_kwelist, kwe, kwe_list);
2249	kwe->kwe_list.tqe_next = NULL;
2250	kwe->kwe_list.tqe_prev = NULL;
2251	kwe->kwe_kwqqueue = NULL;
2252
2253	if (--kq->ksynq_count > 0) {
2254		ksyn_waitq_element_t tmp;
2255		tmp = TAILQ_FIRST(&kq->ksynq_kwelist);
2256		kq->ksynq_firstnum = tmp->kwe_lockseq & PTHRW_COUNT_MASK;
2257		tmp = TAILQ_LAST(&kq->ksynq_kwelist, ksynq_kwelist_head);
2258		kq->ksynq_lastnum = tmp->kwe_lockseq & PTHRW_COUNT_MASK;
2259	} else {
2260		kq->ksynq_firstnum = 0;
2261		kq->ksynq_lastnum = 0;
2262	}
2263
2264	if (--kwq->kw_inqueue > 0) {
2265		uint32_t curseq = kwe->kwe_lockseq & PTHRW_COUNT_MASK;
2266		if (kwq->kw_lowseq == curseq) {
2267			kwq->kw_lowseq = find_nextlowseq(kwq);
2268		}
2269		if (kwq->kw_highseq == curseq) {
2270			kwq->kw_highseq = find_nexthighseq(kwq);
2271		}
2272	} else {
2273		kwq->kw_lowseq = 0;
2274		kwq->kw_highseq = 0;
2275	}
2276}
2277
2278ksyn_waitq_element_t
2279ksyn_queue_find_seq(__unused ksyn_wait_queue_t kwq, ksyn_queue_t kq, uint32_t seq)
2280{
2281	ksyn_waitq_element_t kwe;
2282
2283	// XXX: should stop searching when higher sequence number is seen
2284	TAILQ_FOREACH(kwe, &kq->ksynq_kwelist, kwe_list) {
2285		if ((kwe->kwe_lockseq & PTHRW_COUNT_MASK) == seq) {
2286			return kwe;
2287		}
2288	}
2289	return NULL;
2290}
2291
2292/* find the thread at the target sequence (or a broadcast/prepost at or above) */
2293ksyn_waitq_element_t
2294ksyn_queue_find_cvpreposeq(ksyn_queue_t kq, uint32_t cgen)
2295{
2296	ksyn_waitq_element_t result = NULL;
2297	ksyn_waitq_element_t kwe;
2298	uint32_t lgen = (cgen & PTHRW_COUNT_MASK);
2299
2300	TAILQ_FOREACH(kwe, &kq->ksynq_kwelist, kwe_list) {
2301		if (is_seqhigher_eq(kwe->kwe_lockseq, cgen)) {
2302			result = kwe;
2303
2304			// KWE_THREAD_INWAIT must be strictly equal
2305			if (kwe->kwe_state == KWE_THREAD_INWAIT && (kwe->kwe_lockseq & PTHRW_COUNT_MASK) != lgen) {
2306				result = NULL;
2307			}
2308			break;
2309		}
2310	}
2311	return result;
2312}
2313
2314/* look for a thread at lockseq, a */
2315ksyn_waitq_element_t
2316ksyn_queue_find_signalseq(__unused ksyn_wait_queue_t kwq, ksyn_queue_t kq, uint32_t uptoseq, uint32_t signalseq)
2317{
2318	ksyn_waitq_element_t result = NULL;
2319	ksyn_waitq_element_t q_kwe, r_kwe;
2320
2321	// XXX
2322	/* case where wrap in the tail of the queue exists */
2323	TAILQ_FOREACH_SAFE(q_kwe, &kq->ksynq_kwelist, kwe_list, r_kwe) {
2324		if (q_kwe->kwe_state == KWE_THREAD_PREPOST) {
2325			if (is_seqhigher(q_kwe->kwe_lockseq, uptoseq)) {
2326				return result;
2327			}
2328		}
2329		if (q_kwe->kwe_state == KWE_THREAD_PREPOST || q_kwe->kwe_state == KWE_THREAD_BROADCAST) {
2330			/* match any prepost at our same uptoseq or any broadcast above */
2331			if (is_seqlower(q_kwe->kwe_lockseq, uptoseq)) {
2332				continue;
2333			}
2334			return q_kwe;
2335		} else if (q_kwe->kwe_state == KWE_THREAD_INWAIT) {
2336			/*
2337			 * Match any (non-cancelled) thread at or below our upto sequence -
2338			 * but prefer an exact match to our signal sequence (if present) to
2339			 * keep exact matches happening.
2340			 */
2341			if (is_seqhigher(q_kwe->kwe_lockseq, uptoseq)) {
2342				return result;
2343			}
2344			if (q_kwe->kwe_kwqqueue == kwq) {
2345				if (!pthread_kern->uthread_is_cancelled(q_kwe->kwe_uth)) {
2346					/* if equal or higher than our signal sequence, return this one */
2347					if (is_seqhigher_eq(q_kwe->kwe_lockseq, signalseq)) {
2348						return q_kwe;
2349					}
2350
2351					/* otherwise, just remember this eligible thread and move on */
2352					if (result == NULL) {
2353						result = q_kwe;
2354					}
2355				}
2356			}
2357		} else {
2358			panic("ksyn_queue_find_signalseq(): unknown wait queue element type (%d)\n", q_kwe->kwe_state);
2359		}
2360	}
2361	return result;
2362}
2363
2364void
2365ksyn_queue_free_items(ksyn_wait_queue_t kwq, int kqi, uint32_t upto, int all)
2366{
2367	ksyn_waitq_element_t kwe;
2368	uint32_t tseq = upto & PTHRW_COUNT_MASK;
2369	ksyn_queue_t kq = &kwq->kw_ksynqueues[kqi];
2370
2371	while ((kwe = TAILQ_FIRST(&kq->ksynq_kwelist)) != NULL) {
2372		if (all == 0 && is_seqhigher(kwe->kwe_lockseq, tseq)) {
2373			break;
2374		}
2375		if (kwe->kwe_state == KWE_THREAD_INWAIT) {
2376			/*
2377			 * This scenario is typically noticed when the cvar is
2378			 * reinited and the new waiters are waiting. We can
2379			 * return them as spurious wait so the cvar state gets
2380			 * reset correctly.
2381			 */
2382
2383			/* skip canceled ones */
2384			/* wake the rest */
2385			/* set M bit to indicate to waking CV to retun Inc val */
2386			(void)ksyn_signal(kwq, kqi, kwe, PTHRW_INC | PTH_RWS_CV_MBIT | PTH_RWL_MTX_WAIT);
2387		} else {
2388			ksyn_queue_remove_item(kwq, kq, kwe);
2389			pthread_kern->zfree(kwe_zone, kwe);
2390			kwq->kw_fakecount--;
2391		}
2392	}
2393}
2394
2395/*************************************************************************/
2396
2397void
2398update_low_high(ksyn_wait_queue_t kwq, uint32_t lockseq)
2399{
2400	if (kwq->kw_inqueue == 1) {
2401		kwq->kw_lowseq = lockseq;
2402		kwq->kw_highseq = lockseq;
2403	} else {
2404		if (is_seqlower(lockseq, kwq->kw_lowseq)) {
2405			kwq->kw_lowseq = lockseq;
2406		}
2407		if (is_seqhigher(lockseq, kwq->kw_highseq)) {
2408			kwq->kw_highseq = lockseq;
2409		}
2410	}
2411}
2412
2413uint32_t
2414find_nextlowseq(ksyn_wait_queue_t kwq)
2415{
2416	uint32_t lowest = 0;
2417	int first = 1;
2418	int i;
2419
2420	for (i = 0; i < KSYN_QUEUE_MAX; i++) {
2421		if (kwq->kw_ksynqueues[i].ksynq_count > 0) {
2422			uint32_t current = kwq->kw_ksynqueues[i].ksynq_firstnum;
2423			if (first || is_seqlower(current, lowest)) {
2424				lowest = current;
2425				first = 0;
2426			}
2427		}
2428	}
2429
2430	return lowest;
2431}
2432
2433uint32_t
2434find_nexthighseq(ksyn_wait_queue_t kwq)
2435{
2436	uint32_t highest = 0;
2437	int first = 1;
2438	int i;
2439
2440	for (i = 0; i < KSYN_QUEUE_MAX; i++) {
2441		if (kwq->kw_ksynqueues[i].ksynq_count > 0) {
2442			uint32_t current = kwq->kw_ksynqueues[i].ksynq_lastnum;
2443			if (first || is_seqhigher(current, highest)) {
2444				highest = current;
2445				first = 0;
2446			}
2447		}
2448	}
2449
2450	return highest;
2451}
2452
2453int
2454find_seq_till(ksyn_wait_queue_t kwq, uint32_t upto, uint32_t nwaiters, uint32_t *countp)
2455{
2456	int i;
2457	uint32_t count = 0;
2458
2459	for (i = 0; i< KSYN_QUEUE_MAX; i++) {
2460		count += ksyn_queue_count_tolowest(&kwq->kw_ksynqueues[i], upto);
2461		if (count >= nwaiters) {
2462			break;
2463		}
2464	}
2465
2466	if (countp != NULL) {
2467		*countp = count;
2468	}
2469
2470	if (count == 0) {
2471		return 0;
2472	} else if (count >= nwaiters) {
2473		return 1;
2474	} else {
2475		return 0;
2476	}
2477}
2478
2479
2480uint32_t
2481ksyn_queue_count_tolowest(ksyn_queue_t kq, uint32_t upto)
2482{
2483	uint32_t i = 0;
2484	ksyn_waitq_element_t kwe, newkwe;
2485
2486	if (kq->ksynq_count == 0 || is_seqhigher(kq->ksynq_firstnum, upto)) {
2487		return 0;
2488	}
2489	if (upto == kq->ksynq_firstnum) {
2490		return 1;
2491	}
2492	TAILQ_FOREACH_SAFE(kwe, &kq->ksynq_kwelist, kwe_list, newkwe) {
2493		uint32_t curval = (kwe->kwe_lockseq & PTHRW_COUNT_MASK);
2494		if (is_seqhigher(curval, upto)) {
2495			break;
2496		}
2497		++i;
2498		if (upto == curval) {
2499			break;
2500		}
2501	}
2502	return i;
2503}
2504
2505/* handles the cond broadcast of cvar and returns number of woken threads and bits for syscall return */
2506void
2507ksyn_handle_cvbroad(ksyn_wait_queue_t ckwq, uint32_t upto, uint32_t *updatep)
2508{
2509	ksyn_waitq_element_t kwe, newkwe;
2510	uint32_t updatebits = 0;
2511	ksyn_queue_t kq = &ckwq->kw_ksynqueues[KSYN_QUEUE_WRITER];
2512
2513	struct ksyn_queue kfreeq;
2514	ksyn_queue_init(&kfreeq);
2515
2516retry:
2517	TAILQ_FOREACH_SAFE(kwe, &kq->ksynq_kwelist, kwe_list, newkwe) {
2518		if (is_seqhigher(kwe->kwe_lockseq, upto)) {
2519			// outside our range
2520			break;
2521		}
2522
2523		if (kwe->kwe_state == KWE_THREAD_INWAIT) {
2524			// Wake only non-canceled threads waiting on this CV.
2525			if (!pthread_kern->uthread_is_cancelled(kwe->kwe_uth)) {
2526				(void)ksyn_signal(ckwq, KSYN_QUEUE_WRITER, kwe, PTH_RWL_MTX_WAIT);
2527				updatebits += PTHRW_INC;
2528			}
2529		} else if (kwe->kwe_state == KWE_THREAD_BROADCAST ||
2530			   kwe->kwe_state == KWE_THREAD_PREPOST) {
2531			ksyn_queue_remove_item(ckwq, kq, kwe);
2532			TAILQ_INSERT_TAIL(&kfreeq.ksynq_kwelist, kwe, kwe_list);
2533			ckwq->kw_fakecount--;
2534		} else {
2535			panic("unknown kwe state\n");
2536		}
2537	}
2538
2539	/* Need to enter a broadcast in the queue (if not already at L == S) */
2540
2541	if (diff_genseq(ckwq->kw_lword, ckwq->kw_sword)) {
2542		newkwe = TAILQ_FIRST(&kfreeq.ksynq_kwelist);
2543		if (newkwe == NULL) {
2544			ksyn_wqunlock(ckwq);
2545			newkwe = (ksyn_waitq_element_t)pthread_kern->zalloc(kwe_zone);
2546			TAILQ_INSERT_TAIL(&kfreeq.ksynq_kwelist, newkwe, kwe_list);
2547			ksyn_wqlock(ckwq);
2548			goto retry;
2549		} else {
2550			TAILQ_REMOVE(&kfreeq.ksynq_kwelist, newkwe, kwe_list);
2551			ksyn_prepost(ckwq, newkwe, KWE_THREAD_BROADCAST, upto);
2552		}
2553	}
2554
2555	// free up any remaining things stumbled across above
2556	while ((kwe = TAILQ_FIRST(&kfreeq.ksynq_kwelist)) != NULL) {
2557		TAILQ_REMOVE(&kfreeq.ksynq_kwelist, kwe, kwe_list);
2558		pthread_kern->zfree(kwe_zone, kwe);
2559	}
2560
2561	if (updatep != NULL) {
2562		*updatep = updatebits;
2563	}
2564}
2565
2566void
2567ksyn_cvupdate_fixup(ksyn_wait_queue_t ckwq, uint32_t *updatebits)
2568{
2569	if ((ckwq->kw_lword & PTHRW_COUNT_MASK) == (ckwq->kw_sword & PTHRW_COUNT_MASK)) {
2570		if (ckwq->kw_inqueue != 0) {
2571			/* FREE THE QUEUE */
2572			ksyn_queue_free_items(ckwq, KSYN_QUEUE_WRITER, ckwq->kw_lword, 0);
2573#if __TESTPANICS__
2574			if (ckwq->kw_inqueue != 0)
2575				panic("ksyn_cvupdate_fixup: L == S, but entries in queue beyond S");
2576#endif /* __TESTPANICS__ */
2577		}
2578		ckwq->kw_lword = ckwq->kw_uword = ckwq->kw_sword = 0;
2579		ckwq->kw_kflags |= KSYN_KWF_ZEROEDOUT;
2580		*updatebits |= PTH_RWS_CV_CBIT;
2581	} else if (ckwq->kw_inqueue != 0 && ckwq->kw_fakecount == ckwq->kw_inqueue) {
2582		// only fake entries are present in the queue
2583		*updatebits |= PTH_RWS_CV_PBIT;
2584	}
2585}
2586
2587void
2588psynch_zoneinit(void)
2589{
2590	kwq_zone = (zone_t)pthread_kern->zinit(sizeof(struct ksyn_wait_queue), 8192 * sizeof(struct ksyn_wait_queue), 4096, "ksyn_wait_queue");
2591	kwe_zone = (zone_t)pthread_kern->zinit(sizeof(struct ksyn_waitq_element), 8192 * sizeof(struct ksyn_waitq_element), 4096, "ksyn_waitq_element");
2592}
2593