1/*
2 * Copyright (C) 2003 Daniel M. Eischen <deischen@freebsd.org>
3 * Copyright (C) 2002 Jonathon Mini <mini@freebsd.org>
4 * Copyright (c) 1995-1998 John Birrell <jb@cimlogic.com.au>
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 * 3. All advertising materials mentioning features or use of this software
16 *    must display the following acknowledgement:
17 *	This product includes software developed by John Birrell.
18 * 4. Neither the name of the author nor the names of any co-contributors
19 *    may be used to endorse or promote products derived from this software
20 *    without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY JOHN BIRRELL AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 */
35#include <sys/cdefs.h>
36__FBSDID("$FreeBSD: releng/10.3/lib/libkse/thread/thr_kern.c 174335 2007-12-06 06:04:01Z deischen $");
37
38#include <sys/types.h>
39#include <sys/kse.h>
40#include <sys/ptrace.h>
41#include <sys/signalvar.h>
42#include <sys/queue.h>
43#include <machine/atomic.h>
44#include <machine/sigframe.h>
45
46#include <assert.h>
47#include <errno.h>
48#include <signal.h>
49#include <stdlib.h>
50#include <string.h>
51#include <time.h>
52#include <ucontext.h>
53#include <unistd.h>
54
55#include "atomic_ops.h"
56#include "thr_private.h"
57#include "libc_private.h"
58#ifdef NOTYET
59#include "spinlock.h"
60#endif
61
62/* #define DEBUG_THREAD_KERN */
63#ifdef DEBUG_THREAD_KERN
64#define DBG_MSG		stdout_debug
65#else
66#define DBG_MSG(x...)
67#endif
68
69/*
70 * Define a high water mark for the maximum number of threads that
71 * will be cached.  Once this level is reached, any extra threads
72 * will be free()'d.
73 */
74#define	MAX_CACHED_THREADS	100
75/*
76 * Define high water marks for the maximum number of KSEs and KSE groups
77 * that will be cached. Because we support 1:1 threading, there could have
78 * same number of KSEs and KSE groups as threads. Once these levels are
79 * reached, any extra KSE and KSE groups will be free()'d.
80 */
81#define	MAX_CACHED_KSES		((_thread_scope_system <= 0) ? 50 : 100)
82#define	MAX_CACHED_KSEGS	((_thread_scope_system <= 0) ? 50 : 100)
83
84#define	KSE_SET_MBOX(kse, thrd) \
85	(kse)->k_kcb->kcb_kmbx.km_curthread = &(thrd)->tcb->tcb_tmbx
86
87#define	KSE_SET_EXITED(kse)	(kse)->k_flags |= KF_EXITED
88
89/*
90 * Macros for manipulating the run queues.  The priority queue
91 * routines use the thread's pqe link and also handle the setting
92 * and clearing of the thread's THR_FLAGS_IN_RUNQ flag.
93 */
94#define	KSE_RUNQ_INSERT_HEAD(kse, thrd)			\
95	_pq_insert_head(&(kse)->k_schedq->sq_runq, thrd)
96#define	KSE_RUNQ_INSERT_TAIL(kse, thrd)			\
97	_pq_insert_tail(&(kse)->k_schedq->sq_runq, thrd)
98#define	KSE_RUNQ_REMOVE(kse, thrd)			\
99	_pq_remove(&(kse)->k_schedq->sq_runq, thrd)
100#define	KSE_RUNQ_FIRST(kse)				\
101	((_libkse_debug == 0) ?				\
102	 _pq_first(&(kse)->k_schedq->sq_runq) :		\
103	 _pq_first_debug(&(kse)->k_schedq->sq_runq))
104
105#define KSE_RUNQ_THREADS(kse)	((kse)->k_schedq->sq_runq.pq_threads)
106
107#define THR_NEED_CANCEL(thrd)						\
108	 (((thrd)->cancelflags & THR_CANCELLING) != 0 &&		\
109	  ((thrd)->cancelflags & PTHREAD_CANCEL_DISABLE) == 0 &&	\
110	  (((thrd)->cancelflags & THR_AT_CANCEL_POINT) != 0 ||		\
111	   ((thrd)->cancelflags & PTHREAD_CANCEL_ASYNCHRONOUS) != 0))
112
113#define THR_NEED_ASYNC_CANCEL(thrd)					\
114	 (((thrd)->cancelflags & THR_CANCELLING) != 0 &&		\
115	  ((thrd)->cancelflags & PTHREAD_CANCEL_DISABLE) == 0 &&	\
116	  (((thrd)->cancelflags & THR_AT_CANCEL_POINT) == 0 &&		\
117	   ((thrd)->cancelflags & PTHREAD_CANCEL_ASYNCHRONOUS) != 0))
118
119/*
120 * We've got to keep track of everything that is allocated, not only
121 * to have a speedy free list, but also so they can be deallocated
122 * after a fork().
123 */
124static TAILQ_HEAD(, kse)	active_kseq;
125static TAILQ_HEAD(, kse)	free_kseq;
126static TAILQ_HEAD(, kse_group)	free_kse_groupq;
127static TAILQ_HEAD(, kse_group)	active_kse_groupq;
128static TAILQ_HEAD(, kse_group)	gc_ksegq;
129static struct lock		kse_lock;	/* also used for kseg queue */
130static int			free_kse_count = 0;
131static int			free_kseg_count = 0;
132static TAILQ_HEAD(, pthread)	free_threadq;
133static struct lock		thread_lock;
134static int			free_thread_count = 0;
135static int			inited = 0;
136static int			active_kse_count = 0;
137static int			active_kseg_count = 0;
138static u_int64_t		next_uniqueid = 1;
139
140LIST_HEAD(thread_hash_head, pthread);
141#define THREAD_HASH_QUEUES	127
142static struct thread_hash_head	thr_hashtable[THREAD_HASH_QUEUES];
143#define	THREAD_HASH(thrd)	((unsigned long)thrd % THREAD_HASH_QUEUES)
144
145/* Lock for thread tcb constructor/destructor */
146static pthread_mutex_t		_tcb_mutex;
147
148#ifdef DEBUG_THREAD_KERN
149static void	dump_queues(struct kse *curkse);
150#endif
151static void	kse_check_completed(struct kse *kse);
152static void	kse_check_waitq(struct kse *kse);
153static void	kse_fini(struct kse *curkse);
154static void	kse_reinit(struct kse *kse, int sys_scope);
155static void	kse_sched_multi(struct kse_mailbox *kmbx);
156static void	kse_sched_single(struct kse_mailbox *kmbx);
157static void	kse_switchout_thread(struct kse *kse, struct pthread *thread);
158static void	kse_wait(struct kse *kse, struct pthread *td_wait, int sigseq);
159static void	kse_free_unlocked(struct kse *kse);
160static void	kse_destroy(struct kse *kse);
161static void	kseg_free_unlocked(struct kse_group *kseg);
162static void	kseg_init(struct kse_group *kseg);
163static void	kseg_reinit(struct kse_group *kseg);
164static void	kseg_destroy(struct kse_group *kseg);
165static void	kse_waitq_insert(struct pthread *thread);
166static void	kse_wakeup_multi(struct kse *curkse);
167static struct kse_mailbox *kse_wakeup_one(struct pthread *thread);
168static void	thr_cleanup(struct kse *kse, struct pthread *curthread);
169static void	thr_link(struct pthread *thread);
170static void	thr_resume_wrapper(int sig, siginfo_t *, ucontext_t *);
171static void	thr_resume_check(struct pthread *curthread, ucontext_t *ucp);
172static int	thr_timedout(struct pthread *thread, struct timespec *curtime);
173static void	thr_unlink(struct pthread *thread);
174static void	thr_destroy(struct pthread *curthread, struct pthread *thread);
175static void	thread_gc(struct pthread *thread);
176static void	kse_gc(struct pthread *thread);
177static void	kseg_gc(struct pthread *thread);
178
179static __inline void
180thr_accounting(struct pthread *thread)
181{
182	if ((thread->slice_usec != -1) &&
183	    (thread->slice_usec <= TIMESLICE_USEC) &&
184	    (thread->attr.sched_policy != SCHED_FIFO)) {
185		thread->slice_usec += (thread->tcb->tcb_tmbx.tm_uticks
186		    + thread->tcb->tcb_tmbx.tm_sticks) * _clock_res_usec;
187		/* Check for time quantum exceeded: */
188		if (thread->slice_usec > TIMESLICE_USEC)
189			thread->slice_usec = -1;
190	}
191	thread->tcb->tcb_tmbx.tm_uticks = 0;
192	thread->tcb->tcb_tmbx.tm_sticks = 0;
193}
194
195/*
196 * This is called after a fork().
197 * No locks need to be taken here since we are guaranteed to be
198 * single threaded.
199 *
200 * XXX
201 * POSIX says for threaded process, fork() function is used
202 * only to run new programs, and the effects of calling functions
203 * that require certain resources between the call to fork() and
204 * the call to an exec function are undefined.
205 *
206 * It is not safe to free memory after fork(), because these data
207 * structures may be in inconsistent state.
208 */
209void
210_kse_single_thread(struct pthread *curthread)
211{
212#ifdef NOTYET
213	struct kse *kse;
214	struct kse_group *kseg;
215	struct pthread *thread;
216
217	_thr_spinlock_init();
218	*__malloc_lock = (spinlock_t)_SPINLOCK_INITIALIZER;
219	if (__isthreaded) {
220		_thr_rtld_fini();
221		_thr_signal_deinit();
222	}
223	__isthreaded = 0;
224	/*
225	 * Restore signal mask early, so any memory problems could
226	 * dump core.
227	 */
228	__sys_sigprocmask(SIG_SETMASK, &curthread->sigmask, NULL);
229	_thread_active_threads = 1;
230
231	curthread->kse->k_kcb->kcb_kmbx.km_curthread = NULL;
232	curthread->attr.flags &= ~PTHREAD_SCOPE_PROCESS;
233	curthread->attr.flags |= PTHREAD_SCOPE_SYSTEM;
234
235	/*
236	 * Enter a loop to remove and free all threads other than
237	 * the running thread from the active thread list:
238	 */
239	while ((thread = TAILQ_FIRST(&_thread_list)) != NULL) {
240		THR_GCLIST_REMOVE(thread);
241		/*
242		 * Remove this thread from the list (the current
243		 * thread will be removed but re-added by libpthread
244		 * initialization.
245		 */
246		TAILQ_REMOVE(&_thread_list, thread, tle);
247		/* Make sure this isn't the running thread: */
248		if (thread != curthread) {
249			_thr_stack_free(&thread->attr);
250			if (thread->specific != NULL)
251				free(thread->specific);
252			thr_destroy(curthread, thread);
253		}
254	}
255
256	TAILQ_INIT(&curthread->mutexq);		/* initialize mutex queue */
257	curthread->joiner = NULL;		/* no joining threads yet */
258	curthread->refcount = 0;
259	SIGEMPTYSET(curthread->sigpend);	/* clear pending signals */
260
261	/* Don't free thread-specific data as the caller may require it */
262
263	/* Free the free KSEs: */
264	while ((kse = TAILQ_FIRST(&free_kseq)) != NULL) {
265		TAILQ_REMOVE(&free_kseq, kse, k_qe);
266		kse_destroy(kse);
267	}
268	free_kse_count = 0;
269
270	/* Free the active KSEs: */
271	while ((kse = TAILQ_FIRST(&active_kseq)) != NULL) {
272		TAILQ_REMOVE(&active_kseq, kse, k_qe);
273		kse_destroy(kse);
274	}
275	active_kse_count = 0;
276
277	/* Free the free KSEGs: */
278	while ((kseg = TAILQ_FIRST(&free_kse_groupq)) != NULL) {
279		TAILQ_REMOVE(&free_kse_groupq, kseg, kg_qe);
280		kseg_destroy(kseg);
281	}
282	free_kseg_count = 0;
283
284	/* Free the active KSEGs: */
285	while ((kseg = TAILQ_FIRST(&active_kse_groupq)) != NULL) {
286		TAILQ_REMOVE(&active_kse_groupq, kseg, kg_qe);
287		kseg_destroy(kseg);
288	}
289	active_kseg_count = 0;
290
291	/* Free the free threads. */
292	while ((thread = TAILQ_FIRST(&free_threadq)) != NULL) {
293		TAILQ_REMOVE(&free_threadq, thread, tle);
294		thr_destroy(curthread, thread);
295	}
296	free_thread_count = 0;
297
298	/* Free the to-be-gc'd threads. */
299	while ((thread = TAILQ_FIRST(&_thread_gc_list)) != NULL) {
300		TAILQ_REMOVE(&_thread_gc_list, thread, gcle);
301		thr_destroy(curthread, thread);
302	}
303	TAILQ_INIT(&gc_ksegq);
304	_gc_count = 0;
305
306	if (inited != 0) {
307		/*
308		 * Destroy these locks; they'll be recreated to assure they
309		 * are in the unlocked state.
310		 */
311		_lock_destroy(&kse_lock);
312		_lock_destroy(&thread_lock);
313		_lock_destroy(&_thread_list_lock);
314		inited = 0;
315	}
316
317	/* We're no longer part of any lists */
318	curthread->tlflags = 0;
319
320	/*
321	 * After a fork, we are still operating on the thread's original
322	 * stack.  Don't clear the THR_FLAGS_USER from the thread's
323	 * attribute flags.
324	 */
325
326	/* Initialize the threads library. */
327	curthread->kse = NULL;
328	curthread->kseg = NULL;
329	_kse_initial = NULL;
330	_libpthread_init(curthread);
331#else
332	int i;
333
334	/* Reset the current thread and KSE lock data. */
335	for (i = 0; i < curthread->locklevel; i++) {
336		_lockuser_reinit(&curthread->lockusers[i], (void *)curthread);
337	}
338	curthread->locklevel = 0;
339	for (i = 0; i < curthread->kse->k_locklevel; i++) {
340		_lockuser_reinit(&curthread->kse->k_lockusers[i],
341		    (void *)curthread->kse);
342		_LCK_SET_PRIVATE2(&curthread->kse->k_lockusers[i], NULL);
343	}
344	curthread->kse->k_locklevel = 0;
345
346	/*
347	 * Reinitialize the thread and signal locks so that
348	 * sigaction() will work after a fork().
349	 */
350	_lock_reinit(&curthread->lock, LCK_ADAPTIVE, _thr_lock_wait,
351	    _thr_lock_wakeup);
352	_lock_reinit(&_thread_signal_lock, LCK_ADAPTIVE, _kse_lock_wait,
353	    _kse_lock_wakeup);
354
355	_thr_spinlock_init();
356	if (__isthreaded) {
357		_thr_rtld_fini();
358		_thr_signal_deinit();
359	}
360	__isthreaded = 0;
361	curthread->kse->k_kcb->kcb_kmbx.km_curthread = NULL;
362	curthread->attr.flags |= PTHREAD_SCOPE_SYSTEM;
363
364	/*
365	 * After a fork, it is possible that an upcall occurs in
366	 * the parent KSE that fork()'d before the child process
367	 * is fully created and before its vm space is copied.
368	 * During the upcall, the tcb is set to null or to another
369	 * thread, and this is what gets copied in the child process
370	 * when the vm space is cloned sometime after the upcall
371	 * occurs.  Note that we shouldn't have to set the kcb, but
372	 * we do it for completeness.
373	 */
374	_kcb_set(curthread->kse->k_kcb);
375	_tcb_set(curthread->kse->k_kcb, curthread->tcb);
376
377	/* After a fork(), there child should have no pending signals. */
378	sigemptyset(&curthread->sigpend);
379
380	/*
381	 * Restore signal mask early, so any memory problems could
382	 * dump core.
383	 */
384	sigprocmask(SIG_SETMASK, &curthread->sigmask, NULL);
385	_thread_active_threads = 1;
386#endif
387}
388
389/*
390 * This is used to initialize housekeeping and to initialize the
391 * KSD for the KSE.
392 */
393void
394_kse_init(void)
395{
396	if (inited == 0) {
397		TAILQ_INIT(&active_kseq);
398		TAILQ_INIT(&active_kse_groupq);
399		TAILQ_INIT(&free_kseq);
400		TAILQ_INIT(&free_kse_groupq);
401		TAILQ_INIT(&free_threadq);
402		TAILQ_INIT(&gc_ksegq);
403		if (_lock_init(&kse_lock, LCK_ADAPTIVE,
404		    _kse_lock_wait, _kse_lock_wakeup, calloc) != 0)
405			PANIC("Unable to initialize free KSE queue lock");
406		if (_lock_init(&thread_lock, LCK_ADAPTIVE,
407		    _kse_lock_wait, _kse_lock_wakeup, calloc) != 0)
408			PANIC("Unable to initialize free thread queue lock");
409		if (_lock_init(&_thread_list_lock, LCK_ADAPTIVE,
410		    _kse_lock_wait, _kse_lock_wakeup, calloc) != 0)
411			PANIC("Unable to initialize thread list lock");
412		_pthread_mutex_init(&_tcb_mutex, NULL);
413		active_kse_count = 0;
414		active_kseg_count = 0;
415		_gc_count = 0;
416		inited = 1;
417	}
418}
419
420/*
421 * This is called when the first thread (other than the initial
422 * thread) is created.
423 */
424int
425_kse_setthreaded(int threaded)
426{
427	sigset_t sigset;
428
429	if ((threaded != 0) && (__isthreaded == 0)) {
430		SIGFILLSET(sigset);
431		__sys_sigprocmask(SIG_SETMASK, &sigset, &_thr_initial->sigmask);
432
433		/*
434		 * Tell the kernel to create a KSE for the initial thread
435		 * and enable upcalls in it.
436		 */
437		_kse_initial->k_flags |= KF_STARTED;
438
439		if (_thread_scope_system <= 0) {
440			_thr_initial->attr.flags &= ~PTHREAD_SCOPE_SYSTEM;
441			_kse_initial->k_kseg->kg_flags &= ~KGF_SINGLE_THREAD;
442			_kse_initial->k_kcb->kcb_kmbx.km_curthread = NULL;
443		}
444		else {
445			/*
446			 * For bound thread, kernel reads mailbox pointer
447			 * once, we'd set it here before calling kse_create.
448			 */
449			_tcb_set(_kse_initial->k_kcb, _thr_initial->tcb);
450			KSE_SET_MBOX(_kse_initial, _thr_initial);
451			_kse_initial->k_kcb->kcb_kmbx.km_flags |= KMF_BOUND;
452		}
453
454		/*
455		 * Locking functions in libc are required when there are
456		 * threads other than the initial thread.
457		 */
458		_thr_rtld_init();
459
460		__isthreaded = 1;
461		if (kse_create(&_kse_initial->k_kcb->kcb_kmbx, 0) != 0) {
462			_kse_initial->k_flags &= ~KF_STARTED;
463			__isthreaded = 0;
464			PANIC("kse_create() failed\n");
465			return (-1);
466		}
467		_thr_initial->tcb->tcb_tmbx.tm_lwp =
468			_kse_initial->k_kcb->kcb_kmbx.km_lwp;
469		_thread_activated = 1;
470
471#ifndef SYSTEM_SCOPE_ONLY
472		if (_thread_scope_system <= 0) {
473			/* Set current thread to initial thread */
474			_tcb_set(_kse_initial->k_kcb, _thr_initial->tcb);
475			KSE_SET_MBOX(_kse_initial, _thr_initial);
476			_thr_start_sig_daemon();
477			_thr_setmaxconcurrency();
478		}
479		else
480#endif
481			__sys_sigprocmask(SIG_SETMASK, &_thr_initial->sigmask,
482			    NULL);
483	}
484	return (0);
485}
486
487/*
488 * Lock wait and wakeup handlers for KSE locks.  These are only used by
489 * KSEs, and should never be used by threads.  KSE locks include the
490 * KSE group lock (used for locking the scheduling queue) and the
491 * kse_lock defined above.
492 *
493 * When a KSE lock attempt blocks, the entire KSE blocks allowing another
494 * KSE to run.  For the most part, it doesn't make much sense to try and
495 * schedule another thread because you need to lock the scheduling queue
496 * in order to do that.  And since the KSE lock is used to lock the scheduling
497 * queue, you would just end up blocking again.
498 */
499void
500_kse_lock_wait(struct lock *lock __unused, struct lockuser *lu)
501{
502	struct kse *curkse = (struct kse *)_LCK_GET_PRIVATE(lu);
503	struct timespec ts;
504	int saved_flags;
505
506	if (curkse->k_kcb->kcb_kmbx.km_curthread != NULL)
507		PANIC("kse_lock_wait does not disable upcall.\n");
508	/*
509	 * Enter a loop to wait until we get the lock.
510	 */
511	ts.tv_sec = 0;
512	ts.tv_nsec = 1000000;  /* 1 sec */
513	while (!_LCK_GRANTED(lu)) {
514		/*
515		 * Yield the kse and wait to be notified when the lock
516		 * is granted.
517		 */
518		saved_flags = curkse->k_kcb->kcb_kmbx.km_flags;
519		curkse->k_kcb->kcb_kmbx.km_flags |= KMF_NOUPCALL |
520		    KMF_NOCOMPLETED;
521		kse_release(&ts);
522		curkse->k_kcb->kcb_kmbx.km_flags = saved_flags;
523	}
524}
525
526void
527_kse_lock_wakeup(struct lock *lock, struct lockuser *lu)
528{
529	struct kse *curkse;
530	struct kse *kse;
531	struct kse_mailbox *mbx;
532
533	curkse = _get_curkse();
534	kse = (struct kse *)_LCK_GET_PRIVATE(lu);
535
536	if (kse == curkse)
537		PANIC("KSE trying to wake itself up in lock");
538	else {
539		mbx = &kse->k_kcb->kcb_kmbx;
540		_lock_grant(lock, lu);
541		/*
542		 * Notify the owning kse that it has the lock.
543		 * It is safe to pass invalid address to kse_wakeup
544		 * even if the mailbox is not in kernel at all,
545		 * and waking up a wrong kse is also harmless.
546		 */
547		kse_wakeup(mbx);
548	}
549}
550
551/*
552 * Thread wait and wakeup handlers for thread locks.  These are only used
553 * by threads, never by KSEs.  Thread locks include the per-thread lock
554 * (defined in its structure), and condition variable and mutex locks.
555 */
556void
557_thr_lock_wait(struct lock *lock __unused, struct lockuser *lu)
558{
559	struct pthread *curthread = (struct pthread *)lu->lu_private;
560
561	do {
562		THR_LOCK_SWITCH(curthread);
563		THR_SET_STATE(curthread, PS_LOCKWAIT);
564		_thr_sched_switch_unlocked(curthread);
565	} while (!_LCK_GRANTED(lu));
566}
567
568void
569_thr_lock_wakeup(struct lock *lock __unused, struct lockuser *lu)
570{
571	struct pthread *thread;
572	struct pthread *curthread;
573	struct kse_mailbox *kmbx;
574
575	curthread = _get_curthread();
576	thread = (struct pthread *)_LCK_GET_PRIVATE(lu);
577
578	THR_SCHED_LOCK(curthread, thread);
579	_lock_grant(lock, lu);
580	kmbx = _thr_setrunnable_unlocked(thread);
581	THR_SCHED_UNLOCK(curthread, thread);
582	if (kmbx != NULL)
583		kse_wakeup(kmbx);
584}
585
586kse_critical_t
587_kse_critical_enter(void)
588{
589	kse_critical_t crit;
590
591	crit = (kse_critical_t)_kcb_critical_enter();
592	return (crit);
593}
594
595void
596_kse_critical_leave(kse_critical_t crit)
597{
598	struct pthread *curthread;
599
600	_kcb_critical_leave((struct kse_thr_mailbox *)crit);
601	if ((crit != NULL) && ((curthread = _get_curthread()) != NULL))
602		THR_YIELD_CHECK(curthread);
603}
604
605int
606_kse_in_critical(void)
607{
608	return (_kcb_in_critical());
609}
610
611void
612_thr_critical_enter(struct pthread *thread)
613{
614	thread->critical_count++;
615}
616
617void
618_thr_critical_leave(struct pthread *thread)
619{
620	thread->critical_count--;
621	THR_YIELD_CHECK(thread);
622}
623
624void
625_thr_sched_switch(struct pthread *curthread)
626{
627	struct kse *curkse;
628
629	(void)_kse_critical_enter();
630	curkse = _get_curkse();
631	KSE_SCHED_LOCK(curkse, curkse->k_kseg);
632	_thr_sched_switch_unlocked(curthread);
633}
634
635/*
636 * XXX - We may need to take the scheduling lock before calling
637 *       this, or perhaps take the lock within here before
638 *       doing anything else.
639 */
640void
641_thr_sched_switch_unlocked(struct pthread *curthread)
642{
643	struct kse *curkse;
644	volatile int resume_once = 0;
645	ucontext_t *uc;
646
647	/* We're in the scheduler, 5 by 5: */
648	curkse = curthread->kse;
649
650	curthread->need_switchout = 1;	/* The thread yielded on its own. */
651	curthread->critical_yield = 0;	/* No need to yield anymore. */
652
653	/* Thread can unlock the scheduler lock. */
654	curthread->lock_switch = 1;
655
656	if (curthread->attr.flags & PTHREAD_SCOPE_SYSTEM)
657		kse_sched_single(&curkse->k_kcb->kcb_kmbx);
658	else {
659		if (__predict_false(_libkse_debug != 0)) {
660			/*
661			 * Because debugger saves single step status in thread
662			 * mailbox's tm_dflags, we can safely clear single
663			 * step status here. the single step status will be
664			 * restored by kse_switchin when the thread is
665			 * switched in again. This also lets uts run in full
666			 * speed.
667			 */
668			 ptrace(PT_CLEARSTEP, curkse->k_kcb->kcb_kmbx.km_lwp,
669				(caddr_t) 1, 0);
670		}
671
672		KSE_SET_SWITCH(curkse);
673		_thread_enter_uts(curthread->tcb, curkse->k_kcb);
674	}
675
676	/*
677	 * Unlock the scheduling queue and leave the
678	 * critical region.
679	 */
680	/* Don't trust this after a switch! */
681	curkse = curthread->kse;
682
683	curthread->lock_switch = 0;
684	KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
685	_kse_critical_leave(&curthread->tcb->tcb_tmbx);
686
687	/*
688	 * This thread is being resumed; check for cancellations.
689	 */
690	if (THR_NEED_ASYNC_CANCEL(curthread) && !THR_IN_CRITICAL(curthread)) {
691		uc = alloca(sizeof(ucontext_t));
692		resume_once = 0;
693		THR_GETCONTEXT(uc);
694		if (resume_once == 0) {
695			resume_once = 1;
696			curthread->check_pending = 0;
697			thr_resume_check(curthread, uc);
698		}
699	}
700	THR_ACTIVATE_LAST_LOCK(curthread);
701}
702
703/*
704 * This is the scheduler for a KSE which runs a scope system thread.
705 * The multi-thread KSE scheduler should also work for a single threaded
706 * KSE, but we use a separate scheduler so that it can be fine-tuned
707 * to be more efficient (and perhaps not need a separate stack for
708 * the KSE, allowing it to use the thread's stack).
709 */
710
711static void
712kse_sched_single(struct kse_mailbox *kmbx)
713{
714	struct kse *curkse;
715	struct pthread *curthread;
716	struct timespec ts;
717	sigset_t sigmask;
718	int i, sigseqno, level, first = 0;
719
720	curkse = (struct kse *)kmbx->km_udata;
721	curthread = curkse->k_curthread;
722
723	if (__predict_false((curkse->k_flags & KF_INITIALIZED) == 0)) {
724		/* Setup this KSEs specific data. */
725		_kcb_set(curkse->k_kcb);
726		_tcb_set(curkse->k_kcb, curthread->tcb);
727		curkse->k_flags |= KF_INITIALIZED;
728		first = 1;
729		curthread->active = 1;
730
731		/* Setup kernel signal masks for new thread. */
732		__sys_sigprocmask(SIG_SETMASK, &curthread->sigmask, NULL);
733		/*
734		 * Enter critical region, this is meanless for bound thread,
735		 * It is used to let other code work, those code want mailbox
736		 * to be cleared.
737		 */
738		(void)_kse_critical_enter();
739 	} else {
740		/*
741		 * Bound thread always has tcb set, this prevent some
742		 * code from blindly setting bound thread tcb to NULL,
743		 * buggy code ?
744		 */
745		_tcb_set(curkse->k_kcb, curthread->tcb);
746	}
747
748	curthread->critical_yield = 0;
749	curthread->need_switchout = 0;
750
751	/*
752	 * Lock the scheduling queue.
753	 *
754	 * There is no scheduling queue for single threaded KSEs,
755	 * but we need a lock for protection regardless.
756	 */
757	if (curthread->lock_switch == 0)
758		KSE_SCHED_LOCK(curkse, curkse->k_kseg);
759
760	/*
761	 * This has to do the job of kse_switchout_thread(), only
762	 * for a single threaded KSE/KSEG.
763	 */
764
765	switch (curthread->state) {
766	case PS_MUTEX_WAIT:
767	case PS_COND_WAIT:
768		if (THR_NEED_CANCEL(curthread)) {
769			curthread->interrupted = 1;
770			curthread->continuation = _thr_finish_cancellation;
771			THR_SET_STATE(curthread, PS_RUNNING);
772		}
773		break;
774
775	case PS_LOCKWAIT:
776		/*
777		 * This state doesn't timeout.
778		 */
779		curthread->wakeup_time.tv_sec = -1;
780		curthread->wakeup_time.tv_nsec = -1;
781		level = curthread->locklevel - 1;
782		if (_LCK_GRANTED(&curthread->lockusers[level]))
783			THR_SET_STATE(curthread, PS_RUNNING);
784		break;
785
786	case PS_DEAD:
787		/* Unlock the scheduling queue and exit the KSE and thread. */
788		thr_cleanup(curkse, curthread);
789		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
790		PANIC("bound thread shouldn't get here\n");
791		break;
792
793	case PS_JOIN:
794		if (THR_NEED_CANCEL(curthread)) {
795			curthread->join_status.thread = NULL;
796			THR_SET_STATE(curthread, PS_RUNNING);
797		} else {
798			/*
799			 * This state doesn't timeout.
800			 */
801			curthread->wakeup_time.tv_sec = -1;
802			curthread->wakeup_time.tv_nsec = -1;
803		}
804		break;
805
806	case PS_SUSPENDED:
807		if (THR_NEED_CANCEL(curthread)) {
808			curthread->interrupted = 1;
809			THR_SET_STATE(curthread, PS_RUNNING);
810		} else {
811			/*
812			 * These states don't timeout.
813			 */
814			curthread->wakeup_time.tv_sec = -1;
815			curthread->wakeup_time.tv_nsec = -1;
816		}
817		break;
818
819	case PS_RUNNING:
820		if ((curthread->flags & THR_FLAGS_SUSPENDED) != 0 &&
821		    !THR_NEED_CANCEL(curthread)) {
822			THR_SET_STATE(curthread, PS_SUSPENDED);
823			/*
824			 * These states don't timeout.
825			 */
826			curthread->wakeup_time.tv_sec = -1;
827			curthread->wakeup_time.tv_nsec = -1;
828		}
829		break;
830
831	case PS_SIGWAIT:
832		PANIC("bound thread does not have SIGWAIT state\n");
833
834	case PS_SLEEP_WAIT:
835		PANIC("bound thread does not have SLEEP_WAIT state\n");
836
837	case PS_SIGSUSPEND:
838		PANIC("bound thread does not have SIGSUSPEND state\n");
839
840	case PS_DEADLOCK:
841		/*
842		 * These states don't timeout and don't need
843		 * to be in the waiting queue.
844		 */
845		curthread->wakeup_time.tv_sec = -1;
846		curthread->wakeup_time.tv_nsec = -1;
847		break;
848
849	default:
850		PANIC("Unknown state\n");
851		break;
852	}
853
854	while (curthread->state != PS_RUNNING) {
855		sigseqno = curkse->k_sigseqno;
856		if (curthread->check_pending != 0) {
857			/*
858			 * Install pending signals into the frame, possible
859			 * cause mutex or condvar backout.
860			 */
861			curthread->check_pending = 0;
862			SIGFILLSET(sigmask);
863
864			/*
865			 * Lock out kernel signal code when we are processing
866			 * signals, and get a fresh copy of signal mask.
867			 */
868			__sys_sigprocmask(SIG_SETMASK, &sigmask,
869					  &curthread->sigmask);
870			for (i = 1; i <= _SIG_MAXSIG; i++) {
871				if (SIGISMEMBER(curthread->sigmask, i))
872					continue;
873				if (SIGISMEMBER(curthread->sigpend, i))
874					(void)_thr_sig_add(curthread, i,
875					    &curthread->siginfo[i-1]);
876			}
877			__sys_sigprocmask(SIG_SETMASK, &curthread->sigmask,
878				NULL);
879			/* The above code might make thread runnable */
880			if (curthread->state == PS_RUNNING)
881				break;
882		}
883		THR_DEACTIVATE_LAST_LOCK(curthread);
884		kse_wait(curkse, curthread, sigseqno);
885		THR_ACTIVATE_LAST_LOCK(curthread);
886		if (curthread->wakeup_time.tv_sec >= 0) {
887			KSE_GET_TOD(curkse, &ts);
888			if (thr_timedout(curthread, &ts)) {
889				/* Indicate the thread timedout: */
890				curthread->timeout = 1;
891				/* Make the thread runnable. */
892				THR_SET_STATE(curthread, PS_RUNNING);
893			}
894		}
895	}
896
897	if (curthread->lock_switch == 0) {
898		/* Unlock the scheduling queue. */
899		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
900	}
901
902	DBG_MSG("Continuing bound thread %p\n", curthread);
903	if (first) {
904		_kse_critical_leave(&curthread->tcb->tcb_tmbx);
905		pthread_exit(curthread->start_routine(curthread->arg));
906	}
907}
908
909#ifdef DEBUG_THREAD_KERN
910static void
911dump_queues(struct kse *curkse)
912{
913	struct pthread *thread;
914
915	DBG_MSG("Threads in waiting queue:\n");
916	TAILQ_FOREACH(thread, &curkse->k_kseg->kg_schedq.sq_waitq, pqe) {
917		DBG_MSG("  thread %p, state %d, blocked %d\n",
918		    thread, thread->state, thread->blocked);
919	}
920}
921#endif
922
923/*
924 * This is the scheduler for a KSE which runs multiple threads.
925 */
926static void
927kse_sched_multi(struct kse_mailbox *kmbx)
928{
929	struct kse *curkse;
930	struct pthread *curthread, *td_wait;
931	int ret;
932
933	curkse = (struct kse *)kmbx->km_udata;
934	THR_ASSERT(curkse->k_kcb->kcb_kmbx.km_curthread == NULL,
935	    "Mailbox not null in kse_sched_multi");
936
937	/* Check for first time initialization: */
938	if (__predict_false((curkse->k_flags & KF_INITIALIZED) == 0)) {
939		/* Setup this KSEs specific data. */
940		_kcb_set(curkse->k_kcb);
941
942		/* Set this before grabbing the context. */
943		curkse->k_flags |= KF_INITIALIZED;
944	}
945
946	/*
947	 * No current thread anymore, calling _get_curthread in UTS
948	 * should dump core
949	 */
950	_tcb_set(curkse->k_kcb, NULL);
951
952	/* If this is an upcall; take the scheduler lock. */
953	if (!KSE_IS_SWITCH(curkse))
954		KSE_SCHED_LOCK(curkse, curkse->k_kseg);
955	else
956		KSE_CLEAR_SWITCH(curkse);
957
958	if (KSE_IS_IDLE(curkse)) {
959		KSE_CLEAR_IDLE(curkse);
960		curkse->k_kseg->kg_idle_kses--;
961	}
962
963	/*
964	 * Now that the scheduler lock is held, get the current
965	 * thread.  The KSE's current thread cannot be safely
966	 * examined without the lock because it could have returned
967	 * as completed on another KSE.  See kse_check_completed().
968	 */
969	curthread = curkse->k_curthread;
970
971	/*
972	 * If the current thread was completed in another KSE, then
973	 * it will be in the run queue.  Don't mark it as being blocked.
974	 */
975	if ((curthread != NULL) &&
976	    ((curthread->flags & THR_FLAGS_IN_RUNQ) == 0) &&
977	    (curthread->need_switchout == 0)) {
978		/*
979		 * Assume the current thread is blocked; when the
980		 * completed threads are checked and if the current
981		 * thread is among the completed, the blocked flag
982		 * will be cleared.
983		 */
984		curthread->blocked = 1;
985		DBG_MSG("Running thread %p is now blocked in kernel.\n",
986		    curthread);
987	}
988
989	/* Check for any unblocked threads in the kernel. */
990	kse_check_completed(curkse);
991
992	/*
993	 * Check for threads that have timed-out.
994	 */
995	kse_check_waitq(curkse);
996
997	/*
998	 * Switchout the current thread, if necessary, as the last step
999	 * so that it is inserted into the run queue (if it's runnable)
1000	 * _after_ any other threads that were added to it above.
1001	 */
1002	if (curthread == NULL)
1003		;  /* Nothing to do here. */
1004	else if ((curthread->need_switchout == 0) && DBG_CAN_RUN(curthread) &&
1005	    (curthread->blocked == 0) && (THR_IN_CRITICAL(curthread))) {
1006		/*
1007		 * Resume the thread and tell it to yield when
1008		 * it leaves the critical region.
1009		 */
1010		curthread->critical_yield = 1;
1011		curthread->active = 1;
1012		if ((curthread->flags & THR_FLAGS_IN_RUNQ) != 0)
1013			KSE_RUNQ_REMOVE(curkse, curthread);
1014		curkse->k_curthread = curthread;
1015		curthread->kse = curkse;
1016		DBG_MSG("Continuing thread %p in critical region\n",
1017		    curthread);
1018		kse_wakeup_multi(curkse);
1019		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
1020		ret = _thread_switch(curkse->k_kcb, curthread->tcb, 1);
1021		if (ret != 0)
1022			PANIC("Can't resume thread in critical region\n");
1023	}
1024	else if ((curthread->flags & THR_FLAGS_IN_RUNQ) == 0) {
1025		curthread->tcb->tcb_tmbx.tm_lwp = 0;
1026		kse_switchout_thread(curkse, curthread);
1027	}
1028	curkse->k_curthread = NULL;
1029
1030#ifdef DEBUG_THREAD_KERN
1031	dump_queues(curkse);
1032#endif
1033
1034	/* Check if there are no threads ready to run: */
1035	while (((curthread = KSE_RUNQ_FIRST(curkse)) == NULL) &&
1036	    (curkse->k_kseg->kg_threadcount != 0) &&
1037	    ((curkse->k_flags & KF_TERMINATED) == 0)) {
1038		/*
1039		 * Wait for a thread to become active or until there are
1040		 * no more threads.
1041		 */
1042		td_wait = KSE_WAITQ_FIRST(curkse);
1043		kse_wait(curkse, td_wait, 0);
1044		kse_check_completed(curkse);
1045		kse_check_waitq(curkse);
1046	}
1047
1048	/* Check for no more threads: */
1049	if ((curkse->k_kseg->kg_threadcount == 0) ||
1050	    ((curkse->k_flags & KF_TERMINATED) != 0)) {
1051		/*
1052		 * Normally this shouldn't return, but it will if there
1053		 * are other KSEs running that create new threads that
1054		 * are assigned to this KSE[G].  For instance, if a scope
1055		 * system thread were to create a scope process thread
1056		 * and this kse[g] is the initial kse[g], then that newly
1057		 * created thread would be assigned to us (the initial
1058		 * kse[g]).
1059		 */
1060		kse_wakeup_multi(curkse);
1061		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
1062		kse_fini(curkse);
1063		/* never returns */
1064	}
1065
1066	THR_ASSERT(curthread != NULL,
1067	    "Return from kse_wait/fini without thread.");
1068	THR_ASSERT(curthread->state != PS_DEAD,
1069	    "Trying to resume dead thread!");
1070	KSE_RUNQ_REMOVE(curkse, curthread);
1071
1072	/*
1073	 * Make the selected thread the current thread.
1074	 */
1075	curkse->k_curthread = curthread;
1076
1077	/*
1078	 * Make sure the current thread's kse points to this kse.
1079	 */
1080	curthread->kse = curkse;
1081
1082	/*
1083	 * Reset the time slice if this thread is running for the first
1084	 * time or running again after using its full time slice allocation.
1085	 */
1086	if (curthread->slice_usec == -1)
1087		curthread->slice_usec = 0;
1088
1089	/* Mark the thread active. */
1090	curthread->active = 1;
1091
1092	/*
1093	 * The thread's current signal frame will only be NULL if it
1094	 * is being resumed after being blocked in the kernel.  In
1095	 * this case, and if the thread needs to run down pending
1096	 * signals or needs a cancellation check, we need to add a
1097	 * signal frame to the thread's context.
1098	 */
1099	if (curthread->lock_switch == 0 && curthread->state == PS_RUNNING &&
1100	    (curthread->check_pending != 0 ||
1101	     THR_NEED_ASYNC_CANCEL(curthread)) &&
1102	    !THR_IN_CRITICAL(curthread)) {
1103		curthread->check_pending = 0;
1104		signalcontext(&curthread->tcb->tcb_tmbx.tm_context, 0,
1105		    (__sighandler_t *)thr_resume_wrapper);
1106	}
1107	kse_wakeup_multi(curkse);
1108	/*
1109	 * Continue the thread at its current frame:
1110	 */
1111	if (curthread->lock_switch != 0) {
1112		/*
1113		 * This thread came from a scheduler switch; it will
1114		 * unlock the scheduler lock and set the mailbox.
1115		 */
1116		ret = _thread_switch(curkse->k_kcb, curthread->tcb, 0);
1117	} else {
1118		/* This thread won't unlock the scheduler lock. */
1119		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
1120		ret = _thread_switch(curkse->k_kcb, curthread->tcb, 1);
1121	}
1122	if (ret != 0)
1123		PANIC("Thread has returned from _thread_switch");
1124
1125	/* This point should not be reached. */
1126	PANIC("Thread has returned from _thread_switch");
1127}
1128
1129static void
1130thr_resume_wrapper(int sig __unused, siginfo_t *siginfo __unused,
1131    ucontext_t *ucp)
1132{
1133	struct pthread *curthread = _get_curthread();
1134	struct kse *curkse;
1135	int ret, err_save = errno;
1136
1137	DBG_MSG(">>> sig wrapper\n");
1138	if (curthread->lock_switch)
1139		PANIC("thr_resume_wrapper, lock_switch != 0\n");
1140	thr_resume_check(curthread, ucp);
1141	errno = err_save;
1142	_kse_critical_enter();
1143	curkse = curthread->kse;
1144	curthread->tcb->tcb_tmbx.tm_context = *ucp;
1145	ret = _thread_switch(curkse->k_kcb, curthread->tcb, 1);
1146	if (ret != 0)
1147		PANIC("thr_resume_wrapper: thread has returned "
1148		      "from _thread_switch");
1149	/* THR_SETCONTEXT(ucp); */ /* not work, why ? */
1150}
1151
1152static void
1153thr_resume_check(struct pthread *curthread, ucontext_t *ucp)
1154{
1155	_thr_sig_rundown(curthread, ucp);
1156
1157	if (THR_NEED_ASYNC_CANCEL(curthread))
1158		pthread_testcancel();
1159}
1160
1161/*
1162 * Clean up a thread.  This must be called with the thread's KSE
1163 * scheduling lock held.  The thread must be a thread from the
1164 * KSE's group.
1165 */
1166static void
1167thr_cleanup(struct kse *curkse, struct pthread *thread)
1168{
1169	struct pthread *joiner;
1170	struct kse_mailbox *kmbx = NULL;
1171	int sys_scope;
1172
1173	thread->active = 0;
1174	thread->need_switchout = 0;
1175	thread->lock_switch = 0;
1176	thread->check_pending = 0;
1177
1178	if ((joiner = thread->joiner) != NULL) {
1179		/* Joinee scheduler lock held; joiner won't leave. */
1180		if (joiner->kseg == curkse->k_kseg) {
1181			if (joiner->join_status.thread == thread) {
1182				joiner->join_status.thread = NULL;
1183				joiner->join_status.ret = thread->ret;
1184				(void)_thr_setrunnable_unlocked(joiner);
1185			}
1186		} else {
1187			KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
1188			/* The joiner may have removed itself and exited. */
1189			if (_thr_ref_add(thread, joiner, 0) == 0) {
1190				KSE_SCHED_LOCK(curkse, joiner->kseg);
1191				if (joiner->join_status.thread == thread) {
1192					joiner->join_status.thread = NULL;
1193					joiner->join_status.ret = thread->ret;
1194					kmbx = _thr_setrunnable_unlocked(joiner);
1195				}
1196				KSE_SCHED_UNLOCK(curkse, joiner->kseg);
1197				_thr_ref_delete(thread, joiner);
1198				if (kmbx != NULL)
1199					kse_wakeup(kmbx);
1200			}
1201			KSE_SCHED_LOCK(curkse, curkse->k_kseg);
1202		}
1203		thread->attr.flags |= PTHREAD_DETACHED;
1204	}
1205
1206	if (!(sys_scope = (thread->attr.flags & PTHREAD_SCOPE_SYSTEM))) {
1207		/*
1208		 * Remove the thread from the KSEG's list of threads.
1209	 	 */
1210		KSEG_THRQ_REMOVE(thread->kseg, thread);
1211		/*
1212		 * Migrate the thread to the main KSE so that this
1213		 * KSE and KSEG can be cleaned when their last thread
1214		 * exits.
1215		 */
1216		thread->kseg = _kse_initial->k_kseg;
1217		thread->kse = _kse_initial;
1218	}
1219
1220	/*
1221	 * We can't hold the thread list lock while holding the
1222	 * scheduler lock.
1223	 */
1224	KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
1225	DBG_MSG("Adding thread %p to GC list\n", thread);
1226	KSE_LOCK_ACQUIRE(curkse, &_thread_list_lock);
1227	thread->tlflags |= TLFLAGS_GC_SAFE;
1228	THR_GCLIST_ADD(thread);
1229	KSE_LOCK_RELEASE(curkse, &_thread_list_lock);
1230	if (sys_scope) {
1231		/*
1232		 * System scope thread is single thread group,
1233		 * when thread is exited, its kse and ksegrp should
1234		 * be recycled as well.
1235		 * kse upcall stack belongs to thread, clear it here.
1236		 */
1237		curkse->k_stack.ss_sp = 0;
1238		curkse->k_stack.ss_size = 0;
1239		kse_exit();
1240		PANIC("kse_exit() failed for system scope thread");
1241	}
1242	KSE_SCHED_LOCK(curkse, curkse->k_kseg);
1243}
1244
1245void
1246_thr_gc(struct pthread *curthread)
1247{
1248	thread_gc(curthread);
1249	kse_gc(curthread);
1250	kseg_gc(curthread);
1251}
1252
1253static void
1254thread_gc(struct pthread *curthread)
1255{
1256	struct pthread *td, *td_next;
1257	kse_critical_t crit;
1258	TAILQ_HEAD(, pthread) worklist;
1259
1260	TAILQ_INIT(&worklist);
1261	crit = _kse_critical_enter();
1262	KSE_LOCK_ACQUIRE(curthread->kse, &_thread_list_lock);
1263
1264	/* Check the threads waiting for GC. */
1265	for (td = TAILQ_FIRST(&_thread_gc_list); td != NULL; td = td_next) {
1266		td_next = TAILQ_NEXT(td, gcle);
1267		if ((td->tlflags & TLFLAGS_GC_SAFE) == 0)
1268			continue;
1269		else if (((td->attr.flags & PTHREAD_SCOPE_SYSTEM) != 0) &&
1270		    ((td->kse->k_kcb->kcb_kmbx.km_flags & KMF_DONE) == 0)) {
1271			/*
1272			 * The thread and KSE are operating on the same
1273			 * stack.  Wait for the KSE to exit before freeing
1274			 * the thread's stack as well as everything else.
1275			 */
1276			continue;
1277		}
1278		/*
1279		 * Remove the thread from the GC list.  If the thread
1280		 * isn't yet detached, it will get added back to the
1281		 * GC list at a later time.
1282		 */
1283		THR_GCLIST_REMOVE(td);
1284		DBG_MSG("Freeing thread %p stack\n", td);
1285		/*
1286		 * We can free the thread stack since it's no longer
1287		 * in use.
1288		 */
1289		_thr_stack_free(&td->attr);
1290		if (((td->attr.flags & PTHREAD_DETACHED) != 0) &&
1291		    (td->refcount == 0)) {
1292			/*
1293			 * The thread has detached and is no longer
1294			 * referenced.  It is safe to remove all
1295			 * remnants of the thread.
1296			 */
1297			THR_LIST_REMOVE(td);
1298			TAILQ_INSERT_HEAD(&worklist, td, gcle);
1299		}
1300	}
1301	KSE_LOCK_RELEASE(curthread->kse, &_thread_list_lock);
1302	_kse_critical_leave(crit);
1303
1304	while ((td = TAILQ_FIRST(&worklist)) != NULL) {
1305		TAILQ_REMOVE(&worklist, td, gcle);
1306		/*
1307		 * XXX we don't free initial thread and its kse
1308		 * (if thread is a bound thread), because there might
1309		 * have some code referencing initial thread and kse.
1310		 */
1311		if (td == _thr_initial) {
1312			DBG_MSG("Initial thread won't be freed\n");
1313			continue;
1314		}
1315
1316		if ((td->attr.flags & PTHREAD_SCOPE_SYSTEM) != 0) {
1317			crit = _kse_critical_enter();
1318			KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
1319			kse_free_unlocked(td->kse);
1320			kseg_free_unlocked(td->kseg);
1321			KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
1322			_kse_critical_leave(crit);
1323		}
1324		DBG_MSG("Freeing thread %p\n", td);
1325		_thr_free(curthread, td);
1326	}
1327}
1328
1329static void
1330kse_gc(struct pthread *curthread)
1331{
1332	kse_critical_t crit;
1333	TAILQ_HEAD(, kse) worklist;
1334	struct kse *kse;
1335
1336	if (free_kse_count <= MAX_CACHED_KSES)
1337		return;
1338	TAILQ_INIT(&worklist);
1339	crit = _kse_critical_enter();
1340	KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
1341	while (free_kse_count > MAX_CACHED_KSES) {
1342		kse = TAILQ_FIRST(&free_kseq);
1343		TAILQ_REMOVE(&free_kseq, kse, k_qe);
1344		TAILQ_INSERT_HEAD(&worklist, kse, k_qe);
1345		free_kse_count--;
1346	}
1347	KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
1348	_kse_critical_leave(crit);
1349
1350	while ((kse = TAILQ_FIRST(&worklist))) {
1351		TAILQ_REMOVE(&worklist, kse, k_qe);
1352		kse_destroy(kse);
1353	}
1354}
1355
1356static void
1357kseg_gc(struct pthread *curthread)
1358{
1359	kse_critical_t crit;
1360	TAILQ_HEAD(, kse_group) worklist;
1361	struct kse_group *kseg;
1362
1363	if (free_kseg_count <= MAX_CACHED_KSEGS)
1364		return;
1365	TAILQ_INIT(&worklist);
1366	crit = _kse_critical_enter();
1367	KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
1368	while (free_kseg_count > MAX_CACHED_KSEGS) {
1369		kseg = TAILQ_FIRST(&free_kse_groupq);
1370		TAILQ_REMOVE(&free_kse_groupq, kseg, kg_qe);
1371		free_kseg_count--;
1372		TAILQ_INSERT_HEAD(&worklist, kseg, kg_qe);
1373	}
1374	KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
1375	_kse_critical_leave(crit);
1376
1377	while ((kseg = TAILQ_FIRST(&worklist))) {
1378		TAILQ_REMOVE(&worklist, kseg, kg_qe);
1379		kseg_destroy(kseg);
1380	}
1381}
1382
1383/*
1384 * Only new threads that are running or suspended may be scheduled.
1385 */
1386int
1387_thr_schedule_add(struct pthread *curthread, struct pthread *newthread)
1388{
1389	kse_critical_t crit;
1390	int ret;
1391
1392	/* Add the new thread. */
1393	thr_link(newthread);
1394
1395	/*
1396	 * If this is the first time creating a thread, make sure
1397	 * the mailbox is set for the current thread.
1398	 */
1399	if ((newthread->attr.flags & PTHREAD_SCOPE_SYSTEM) != 0) {
1400		/* We use the thread's stack as the KSE's stack. */
1401		newthread->kse->k_kcb->kcb_kmbx.km_stack.ss_sp =
1402		    newthread->attr.stackaddr_attr;
1403		newthread->kse->k_kcb->kcb_kmbx.km_stack.ss_size =
1404		    newthread->attr.stacksize_attr;
1405
1406		/*
1407		 * No need to lock the scheduling queue since the
1408		 * KSE/KSEG pair have not yet been started.
1409		 */
1410		KSEG_THRQ_ADD(newthread->kseg, newthread);
1411		/* this thread never gives up kse */
1412		newthread->active = 1;
1413		newthread->kse->k_curthread = newthread;
1414		newthread->kse->k_kcb->kcb_kmbx.km_flags = KMF_BOUND;
1415		newthread->kse->k_kcb->kcb_kmbx.km_func =
1416		    (kse_func_t *)kse_sched_single;
1417		newthread->kse->k_kcb->kcb_kmbx.km_quantum = 0;
1418		KSE_SET_MBOX(newthread->kse, newthread);
1419		/*
1420		 * This thread needs a new KSE and KSEG.
1421		 */
1422		newthread->kse->k_flags &= ~KF_INITIALIZED;
1423		newthread->kse->k_flags |= KF_STARTED;
1424		/* Fire up! */
1425		ret = kse_create(&newthread->kse->k_kcb->kcb_kmbx, 1);
1426		if (ret != 0)
1427			ret = errno;
1428	}
1429	else {
1430		/*
1431		 * Lock the KSE and add the new thread to its list of
1432		 * assigned threads.  If the new thread is runnable, also
1433		 * add it to the KSE's run queue.
1434		 */
1435		crit = _kse_critical_enter();
1436		KSE_SCHED_LOCK(curthread->kse, newthread->kseg);
1437		KSEG_THRQ_ADD(newthread->kseg, newthread);
1438		if (newthread->state == PS_RUNNING)
1439			THR_RUNQ_INSERT_TAIL(newthread);
1440		if ((newthread->kse->k_flags & KF_STARTED) == 0) {
1441			/*
1442			 * This KSE hasn't been started yet.  Start it
1443			 * outside of holding the lock.
1444			 */
1445			newthread->kse->k_flags |= KF_STARTED;
1446			newthread->kse->k_kcb->kcb_kmbx.km_func =
1447			    (kse_func_t *)kse_sched_multi;
1448			newthread->kse->k_kcb->kcb_kmbx.km_flags = 0;
1449			kse_create(&newthread->kse->k_kcb->kcb_kmbx, 0);
1450		 } else if ((newthread->state == PS_RUNNING) &&
1451		     KSE_IS_IDLE(newthread->kse)) {
1452			/*
1453			 * The thread is being scheduled on another KSEG.
1454			 */
1455			kse_wakeup_one(newthread);
1456		}
1457		KSE_SCHED_UNLOCK(curthread->kse, newthread->kseg);
1458		_kse_critical_leave(crit);
1459		ret = 0;
1460	}
1461	if (ret != 0)
1462		thr_unlink(newthread);
1463
1464	return (ret);
1465}
1466
1467void
1468kse_waitq_insert(struct pthread *thread)
1469{
1470	struct pthread *td;
1471
1472	if (thread->wakeup_time.tv_sec == -1)
1473		TAILQ_INSERT_TAIL(&thread->kse->k_schedq->sq_waitq, thread,
1474		    pqe);
1475	else {
1476		td = TAILQ_FIRST(&thread->kse->k_schedq->sq_waitq);
1477		while ((td != NULL) && (td->wakeup_time.tv_sec != -1) &&
1478		    ((td->wakeup_time.tv_sec < thread->wakeup_time.tv_sec) ||
1479		    ((td->wakeup_time.tv_sec == thread->wakeup_time.tv_sec) &&
1480		    (td->wakeup_time.tv_nsec <= thread->wakeup_time.tv_nsec))))
1481			td = TAILQ_NEXT(td, pqe);
1482		if (td == NULL)
1483			TAILQ_INSERT_TAIL(&thread->kse->k_schedq->sq_waitq,
1484			    thread, pqe);
1485		else
1486			TAILQ_INSERT_BEFORE(td, thread, pqe);
1487	}
1488	thread->flags |= THR_FLAGS_IN_WAITQ;
1489}
1490
1491/*
1492 * This must be called with the scheduling lock held.
1493 */
1494static void
1495kse_check_completed(struct kse *kse)
1496{
1497	struct pthread *thread;
1498	struct kse_thr_mailbox *completed;
1499	int sig;
1500
1501	if ((completed = kse->k_kcb->kcb_kmbx.km_completed) != NULL) {
1502		kse->k_kcb->kcb_kmbx.km_completed = NULL;
1503		while (completed != NULL) {
1504			thread = completed->tm_udata;
1505			DBG_MSG("Found completed thread %p, name %s\n",
1506			    thread,
1507			    (thread->name == NULL) ? "none" : thread->name);
1508			thread->blocked = 0;
1509			if (thread != kse->k_curthread) {
1510				thr_accounting(thread);
1511				if ((thread->flags & THR_FLAGS_SUSPENDED) != 0)
1512					THR_SET_STATE(thread, PS_SUSPENDED);
1513				else
1514					KSE_RUNQ_INSERT_TAIL(kse, thread);
1515				if ((thread->kse != kse) &&
1516				    (thread->kse->k_curthread == thread)) {
1517					/*
1518					 * Remove this thread from its
1519					 * previous KSE so that it (the KSE)
1520					 * doesn't think it is still active.
1521					 */
1522					thread->kse->k_curthread = NULL;
1523					thread->active = 0;
1524				}
1525			}
1526			if ((sig = thread->tcb->tcb_tmbx.tm_syncsig.si_signo)
1527			    != 0) {
1528				if (SIGISMEMBER(thread->sigmask, sig))
1529					SIGADDSET(thread->sigpend, sig);
1530				else if (THR_IN_CRITICAL(thread))
1531					kse_thr_interrupt(NULL, KSE_INTR_SIGEXIT, sig);
1532				else
1533					(void)_thr_sig_add(thread, sig,
1534					    &thread->tcb->tcb_tmbx.tm_syncsig);
1535				thread->tcb->tcb_tmbx.tm_syncsig.si_signo = 0;
1536			}
1537			completed = completed->tm_next;
1538		}
1539	}
1540}
1541
1542/*
1543 * This must be called with the scheduling lock held.
1544 */
1545static void
1546kse_check_waitq(struct kse *kse)
1547{
1548	struct pthread	*pthread;
1549	struct timespec ts;
1550
1551	KSE_GET_TOD(kse, &ts);
1552
1553	/*
1554	 * Wake up threads that have timedout.  This has to be
1555	 * done before adding the current thread to the run queue
1556	 * so that a CPU intensive thread doesn't get preference
1557	 * over waiting threads.
1558	 */
1559	while (((pthread = KSE_WAITQ_FIRST(kse)) != NULL) &&
1560	    thr_timedout(pthread, &ts)) {
1561		/* Remove the thread from the wait queue: */
1562		KSE_WAITQ_REMOVE(kse, pthread);
1563		DBG_MSG("Found timedout thread %p in waitq\n", pthread);
1564
1565		/* Indicate the thread timedout: */
1566		pthread->timeout = 1;
1567
1568		/* Add the thread to the priority queue: */
1569		if ((pthread->flags & THR_FLAGS_SUSPENDED) != 0)
1570			THR_SET_STATE(pthread, PS_SUSPENDED);
1571		else {
1572			THR_SET_STATE(pthread, PS_RUNNING);
1573			KSE_RUNQ_INSERT_TAIL(kse, pthread);
1574		}
1575	}
1576}
1577
1578static int
1579thr_timedout(struct pthread *thread, struct timespec *curtime)
1580{
1581	if (thread->wakeup_time.tv_sec < 0)
1582		return (0);
1583	else if (thread->wakeup_time.tv_sec > curtime->tv_sec)
1584		return (0);
1585	else if ((thread->wakeup_time.tv_sec == curtime->tv_sec) &&
1586	    (thread->wakeup_time.tv_nsec > curtime->tv_nsec))
1587		return (0);
1588	else
1589		return (1);
1590}
1591
1592/*
1593 * This must be called with the scheduling lock held.
1594 *
1595 * Each thread has a time slice, a wakeup time (used when it wants
1596 * to wait for a specified amount of time), a run state, and an
1597 * active flag.
1598 *
1599 * When a thread gets run by the scheduler, the active flag is
1600 * set to non-zero (1).  When a thread performs an explicit yield
1601 * or schedules a state change, it enters the scheduler and the
1602 * active flag is cleared.  When the active flag is still seen
1603 * set in the scheduler, that means that the thread is blocked in
1604 * the kernel (because it is cleared before entering the scheduler
1605 * in all other instances).
1606 *
1607 * The wakeup time is only set for those states that can timeout.
1608 * It is set to (-1, -1) for all other instances.
1609 *
1610 * The thread's run state, aside from being useful when debugging,
1611 * is used to place the thread in an appropriate queue.  There
1612 * are 2 basic queues:
1613 *
1614 *   o run queue - queue ordered by priority for all threads
1615 *                 that are runnable
1616 *   o waiting queue - queue sorted by wakeup time for all threads
1617 *                     that are not otherwise runnable (not blocked
1618 *                     in kernel, not waiting for locks)
1619 *
1620 * The thread's time slice is used for round-robin scheduling
1621 * (the default scheduling policy).  While a SCHED_RR thread
1622 * is runnable it's time slice accumulates.  When it reaches
1623 * the time slice interval, it gets reset and added to the end
1624 * of the queue of threads at its priority.  When a thread no
1625 * longer becomes runnable (blocks in kernel, waits, etc), its
1626 * time slice is reset.
1627 *
1628 * The job of kse_switchout_thread() is to handle all of the above.
1629 */
1630static void
1631kse_switchout_thread(struct kse *kse, struct pthread *thread)
1632{
1633	int level;
1634	int i;
1635	int restart;
1636	siginfo_t siginfo;
1637
1638	/*
1639	 * Place the currently running thread into the
1640	 * appropriate queue(s).
1641	 */
1642	DBG_MSG("Switching out thread %p, state %d\n", thread, thread->state);
1643
1644	THR_DEACTIVATE_LAST_LOCK(thread);
1645	if (thread->blocked != 0) {
1646		thread->active = 0;
1647		thread->need_switchout = 0;
1648		/* This thread must have blocked in the kernel. */
1649		/*
1650		 * Check for pending signals and cancellation for
1651		 * this thread to see if we need to interrupt it
1652		 * in the kernel.
1653		 */
1654		if (THR_NEED_CANCEL(thread)) {
1655			kse_thr_interrupt(&thread->tcb->tcb_tmbx,
1656					  KSE_INTR_INTERRUPT, 0);
1657		} else if (thread->check_pending != 0) {
1658			for (i = 1; i <= _SIG_MAXSIG; ++i) {
1659				if (SIGISMEMBER(thread->sigpend, i) &&
1660				    !SIGISMEMBER(thread->sigmask, i)) {
1661					restart = _thread_sigact[i - 1].sa_flags & SA_RESTART;
1662					kse_thr_interrupt(&thread->tcb->tcb_tmbx,
1663					    restart ? KSE_INTR_RESTART : KSE_INTR_INTERRUPT, 0);
1664					break;
1665				}
1666			}
1667		}
1668	}
1669	else {
1670		switch (thread->state) {
1671		case PS_MUTEX_WAIT:
1672		case PS_COND_WAIT:
1673			if (THR_NEED_CANCEL(thread)) {
1674				thread->interrupted = 1;
1675				thread->continuation = _thr_finish_cancellation;
1676				THR_SET_STATE(thread, PS_RUNNING);
1677			} else {
1678				/* Insert into the waiting queue: */
1679				KSE_WAITQ_INSERT(kse, thread);
1680			}
1681			break;
1682
1683		case PS_LOCKWAIT:
1684			/*
1685			 * This state doesn't timeout.
1686			 */
1687			thread->wakeup_time.tv_sec = -1;
1688			thread->wakeup_time.tv_nsec = -1;
1689			level = thread->locklevel - 1;
1690			if (!_LCK_GRANTED(&thread->lockusers[level]))
1691				KSE_WAITQ_INSERT(kse, thread);
1692			else
1693				THR_SET_STATE(thread, PS_RUNNING);
1694			break;
1695
1696		case PS_SLEEP_WAIT:
1697		case PS_SIGWAIT:
1698			if (THR_NEED_CANCEL(thread)) {
1699				thread->interrupted = 1;
1700				THR_SET_STATE(thread, PS_RUNNING);
1701			} else {
1702				KSE_WAITQ_INSERT(kse, thread);
1703			}
1704			break;
1705
1706		case PS_JOIN:
1707			if (THR_NEED_CANCEL(thread)) {
1708				thread->join_status.thread = NULL;
1709				THR_SET_STATE(thread, PS_RUNNING);
1710			} else {
1711				/*
1712				 * This state doesn't timeout.
1713				 */
1714				thread->wakeup_time.tv_sec = -1;
1715				thread->wakeup_time.tv_nsec = -1;
1716
1717				/* Insert into the waiting queue: */
1718				KSE_WAITQ_INSERT(kse, thread);
1719			}
1720			break;
1721
1722		case PS_SIGSUSPEND:
1723		case PS_SUSPENDED:
1724			if (THR_NEED_CANCEL(thread)) {
1725				thread->interrupted = 1;
1726				THR_SET_STATE(thread, PS_RUNNING);
1727			} else {
1728				/*
1729				 * These states don't timeout.
1730				 */
1731				thread->wakeup_time.tv_sec = -1;
1732				thread->wakeup_time.tv_nsec = -1;
1733
1734				/* Insert into the waiting queue: */
1735				KSE_WAITQ_INSERT(kse, thread);
1736			}
1737			break;
1738
1739		case PS_DEAD:
1740			/*
1741			 * The scheduler is operating on a different
1742			 * stack.  It is safe to do garbage collecting
1743			 * here.
1744			 */
1745			thr_cleanup(kse, thread);
1746			return;
1747			break;
1748
1749		case PS_RUNNING:
1750			if ((thread->flags & THR_FLAGS_SUSPENDED) != 0 &&
1751			    !THR_NEED_CANCEL(thread))
1752				THR_SET_STATE(thread, PS_SUSPENDED);
1753			break;
1754
1755		case PS_DEADLOCK:
1756			/*
1757			 * These states don't timeout.
1758			 */
1759			thread->wakeup_time.tv_sec = -1;
1760			thread->wakeup_time.tv_nsec = -1;
1761
1762			/* Insert into the waiting queue: */
1763			KSE_WAITQ_INSERT(kse, thread);
1764			break;
1765
1766		default:
1767			PANIC("Unknown state\n");
1768			break;
1769		}
1770
1771		thr_accounting(thread);
1772		if (thread->state == PS_RUNNING) {
1773			if (thread->slice_usec == -1) {
1774				/*
1775				 * The thread exceeded its time quantum or
1776				 * it yielded the CPU; place it at the tail
1777				 * of the queue for its priority.
1778				 */
1779				KSE_RUNQ_INSERT_TAIL(kse, thread);
1780			} else {
1781				/*
				 * The thread hasn't exceeded its interval;
				 * place it at the head of the queue for its
1784				 * priority.
1785				 */
1786				KSE_RUNQ_INSERT_HEAD(kse, thread);
1787			}
1788		}
1789	}
1790	thread->active = 0;
1791	thread->need_switchout = 0;
1792	if (thread->check_pending != 0) {
1793		/* Install pending signals into the frame. */
1794		thread->check_pending = 0;
1795		KSE_LOCK_ACQUIRE(kse, &_thread_signal_lock);
1796		for (i = 1; i <= _SIG_MAXSIG; i++) {
1797			if (SIGISMEMBER(thread->sigmask, i))
1798				continue;
1799			if (SIGISMEMBER(thread->sigpend, i))
1800				(void)_thr_sig_add(thread, i,
1801				    &thread->siginfo[i-1]);
1802			else if (SIGISMEMBER(_thr_proc_sigpending, i) &&
1803				_thr_getprocsig_unlocked(i, &siginfo)) {
1804				(void)_thr_sig_add(thread, i, &siginfo);
1805			}
1806		}
1807		KSE_LOCK_RELEASE(kse, &_thread_signal_lock);
1808	}
1809}
1810
1811/*
1812 * This function waits for the smallest timeout value of any waiting
1813 * thread, or until it receives a message from another KSE.
1814 *
1815 * This must be called with the scheduling lock held.
1816 */
1817static void
1818kse_wait(struct kse *kse, struct pthread *td_wait, int sigseqno)
1819{
1820	struct timespec ts, ts_sleep;
1821	int saved_flags;
1822
1823	if ((td_wait == NULL) || (td_wait->wakeup_time.tv_sec < 0)) {
1824		/* Limit sleep to no more than 1 minute. */
1825		ts_sleep.tv_sec = 60;
1826		ts_sleep.tv_nsec = 0;
1827	} else {
1828		KSE_GET_TOD(kse, &ts);
1829		TIMESPEC_SUB(&ts_sleep, &td_wait->wakeup_time, &ts);
1830		if (ts_sleep.tv_sec > 60) {
1831			ts_sleep.tv_sec = 60;
1832			ts_sleep.tv_nsec = 0;
1833		}
1834	}
1835	/* Don't sleep for negative times. */
1836	if ((ts_sleep.tv_sec >= 0) && (ts_sleep.tv_nsec >= 0)) {
1837		KSE_SET_IDLE(kse);
1838		kse->k_kseg->kg_idle_kses++;
1839		KSE_SCHED_UNLOCK(kse, kse->k_kseg);
1840		if ((kse->k_kseg->kg_flags & KGF_SINGLE_THREAD) &&
1841		    (kse->k_sigseqno != sigseqno))
1842			; /* don't sleep */
1843		else {
1844			saved_flags = kse->k_kcb->kcb_kmbx.km_flags;
1845			kse->k_kcb->kcb_kmbx.km_flags |= KMF_NOUPCALL;
1846			kse_release(&ts_sleep);
1847			kse->k_kcb->kcb_kmbx.km_flags = saved_flags;
1848		}
1849		KSE_SCHED_LOCK(kse, kse->k_kseg);
1850		if (KSE_IS_IDLE(kse)) {
1851			KSE_CLEAR_IDLE(kse);
1852			kse->k_kseg->kg_idle_kses--;
1853		}
1854	}
1855}
1856
1857/*
 * This is deliberately not named kse_exit() so as not to confuse it
 * with the system call of the same name.
1860 */
1861static void
1862kse_fini(struct kse *kse)
1863{
1864	/* struct kse_group *free_kseg = NULL; */
1865	struct timespec ts;
1866	struct pthread *td;
1867
1868	/*
1869	 * Check to see if this is one of the main kses.
1870	 */
1871	if (kse->k_kseg != _kse_initial->k_kseg) {
1872		PANIC("shouldn't get here");
1873		/* This is for supporting thread groups. */
1874#ifdef NOT_YET
1875		/* Remove this KSE from the KSEG's list of KSEs. */
1876		KSE_SCHED_LOCK(kse, kse->k_kseg);
1877		TAILQ_REMOVE(&kse->k_kseg->kg_kseq, kse, k_kgqe);
1878		kse->k_kseg->kg_ksecount--;
1879		if (TAILQ_EMPTY(&kse->k_kseg->kg_kseq))
1880			free_kseg = kse->k_kseg;
1881		KSE_SCHED_UNLOCK(kse, kse->k_kseg);
1882
1883		/*
1884		 * Add this KSE to the list of free KSEs along with
		 * the KSEG if it is now orphaned.
1886		 */
1887		KSE_LOCK_ACQUIRE(kse, &kse_lock);
1888		if (free_kseg != NULL)
1889			kseg_free_unlocked(free_kseg);
1890		kse_free_unlocked(kse);
1891		KSE_LOCK_RELEASE(kse, &kse_lock);
1892		kse_exit();
1893		/* Never returns. */
1894		PANIC("kse_exit()");
1895#endif
1896	} else {
1897		/*
		 * We allow the program to kill KSEs in the initial group
		 * (by lowering the concurrency).
1900		 */
1901		if ((kse != _kse_initial) &&
1902		    ((kse->k_flags & KF_TERMINATED) != 0)) {
1903			KSE_SCHED_LOCK(kse, kse->k_kseg);
1904			TAILQ_REMOVE(&kse->k_kseg->kg_kseq, kse, k_kgqe);
1905			kse->k_kseg->kg_ksecount--;
1906			/*
			 * Migrate threads to _kse_initial if the last
			 * KSE they ran on is this KSE.
1909			 */
1910			td = TAILQ_FIRST(&kse->k_kseg->kg_threadq);
1911			while (td != NULL) {
1912				if (td->kse == kse)
1913					td->kse = _kse_initial;
1914				td = TAILQ_NEXT(td, kle);
1915			}
1916			KSE_SCHED_UNLOCK(kse, kse->k_kseg);
1917			KSE_LOCK_ACQUIRE(kse, &kse_lock);
1918			kse_free_unlocked(kse);
1919			KSE_LOCK_RELEASE(kse, &kse_lock);
			/* Make sure there is always at least one KSE awake. */
1921			KSE_WAKEUP(_kse_initial);
1922			kse_exit();
			/* Never returns. */
			PANIC("kse_exit() failed for initial kseg");
		}
1926		KSE_SCHED_LOCK(kse, kse->k_kseg);
1927		KSE_SET_IDLE(kse);
1928		kse->k_kseg->kg_idle_kses++;
1929		KSE_SCHED_UNLOCK(kse, kse->k_kseg);
1930		ts.tv_sec = 120;
1931		ts.tv_nsec = 0;
1932		kse->k_kcb->kcb_kmbx.km_flags = 0;
1933		kse_release(&ts);
		/* Never reached. */
1935	}
1936}
1937
1938void
1939_thr_set_timeout(const struct timespec *timeout)
1940{
1941	struct pthread	*curthread = _get_curthread();
1942	struct timespec ts;
1943
1944	/* Reset the timeout flag for the running thread: */
1945	curthread->timeout = 0;
1946
1947	/* Check if the thread is to wait forever: */
1948	if (timeout == NULL) {
1949		/*
1950		 * Set the wakeup time to something that can be recognised as
		 * different from an actual time of day:
1952		 */
1953		curthread->wakeup_time.tv_sec = -1;
1954		curthread->wakeup_time.tv_nsec = -1;
1955	}
1956	/* Check if no waiting is required: */
1957	else if ((timeout->tv_sec == 0) && (timeout->tv_nsec == 0)) {
1958		/* Set the wake up time to 'immediately': */
1959		curthread->wakeup_time.tv_sec = 0;
1960		curthread->wakeup_time.tv_nsec = 0;
1961	} else {
1962		/* Calculate the time for the current thread to wakeup: */
1963		KSE_GET_TOD(curthread->kse, &ts);
1964		TIMESPEC_ADD(&curthread->wakeup_time, &ts, timeout);
1965	}
1966}
1967
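/*
 * Format a panic message, write it to standard error, and abort
 * the process.
 */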
1968void
1969_thr_panic_exit(char *file, int line, char *msg)
1970{
1971	char buf[256];
1972
1973	snprintf(buf, sizeof(buf), "(%s:%d) %s\n", file, line, msg);
1974	__sys_write(2, buf, strlen(buf));
1975	abort();
1976}
1977
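/*
 * Make a thread runnable.  The scheduling lock for the thread's KSEG
 * is acquired here; if a KSE needs waking, it is woken after the lock
 * has been released.
 */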
1978void
1979_thr_setrunnable(struct pthread *curthread, struct pthread *thread)
1980{
1981	kse_critical_t crit;
1982	struct kse_mailbox *kmbx;
1983
1984	crit = _kse_critical_enter();
1985	KSE_SCHED_LOCK(curthread->kse, thread->kseg);
1986	kmbx = _thr_setrunnable_unlocked(thread);
1987	KSE_SCHED_UNLOCK(curthread->kse, thread->kseg);
1988	_kse_critical_leave(crit);
1989	if ((kmbx != NULL) && (__isthreaded != 0))
1990		kse_wakeup(kmbx);
1991}
1992
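/*
 * Same as _thr_setrunnable(), except that the caller must already hold
 * the scheduling lock for the thread's KSEG.  Returns the mailbox of a
 * KSE that should be woken, or NULL if no wakeup is needed.
 */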
1993struct kse_mailbox *
1994_thr_setrunnable_unlocked(struct pthread *thread)
1995{
1996	struct kse_mailbox *kmbx = NULL;
1997
1998	if ((thread->kseg->kg_flags & KGF_SINGLE_THREAD) != 0) {
1999		/* No silly queues for these threads. */
2000		if ((thread->flags & THR_FLAGS_SUSPENDED) != 0)
2001			THR_SET_STATE(thread, PS_SUSPENDED);
2002		else {
2003			THR_SET_STATE(thread, PS_RUNNING);
2004			kmbx = kse_wakeup_one(thread);
2005		}
2006
2007	} else if (thread->state != PS_RUNNING) {
2008		if ((thread->flags & THR_FLAGS_IN_WAITQ) != 0)
2009			KSE_WAITQ_REMOVE(thread->kse, thread);
2010		if ((thread->flags & THR_FLAGS_SUSPENDED) != 0)
2011			THR_SET_STATE(thread, PS_SUSPENDED);
2012		else {
2013			THR_SET_STATE(thread, PS_RUNNING);
2014			if ((thread->blocked == 0) && (thread->active == 0) &&
2015			    (thread->flags & THR_FLAGS_IN_RUNQ) == 0)
2016				THR_RUNQ_INSERT_TAIL(thread);
2017			/*
2018			 * XXX - Threads are not yet assigned to specific
2019			 *       KSEs; they are assigned to the KSEG.  So
2020			 *       the fact that a thread's KSE is waiting
2021			 *       doesn't necessarily mean that it will be
2022			 *       the KSE that runs the thread after the
2023			 *       lock is granted.  But we don't know if the
2024			 *       other KSEs within the same KSEG are also
			 *       in a waiting state or not, so we err on the
			 *       side of caution and wake up the thread's
			 *       last known KSE.  We ensure that the
			 *       thread's KSE doesn't change while its
			 *       scheduling lock is held so it is safe to
2030			 *       reference it (the KSE).  If the KSE wakes
2031			 *       up and doesn't find any more work it will
2032			 *       again go back to waiting so no harm is
2033			 *       done.
2034			 */
2035			kmbx = kse_wakeup_one(thread);
2036		}
2037	}
2038	return (kmbx);
2039}
2040
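/*
 * Find a KSE to run the newly runnable thread.  The thread's last KSE
 * is preferred if it is idle; otherwise any idle KSE in the thread's
 * KSEG is used.  The chosen KSE has its idle flag cleared and its
 * mailbox is returned; NULL is returned if no KSE is idle.
 */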
2041static struct kse_mailbox *
2042kse_wakeup_one(struct pthread *thread)
2043{
2044	struct kse *ke;
2045
2046	if (KSE_IS_IDLE(thread->kse)) {
2047		KSE_CLEAR_IDLE(thread->kse);
2048		thread->kseg->kg_idle_kses--;
2049		return (&thread->kse->k_kcb->kcb_kmbx);
2050	} else {
2051		TAILQ_FOREACH(ke, &thread->kseg->kg_kseq, k_kgqe) {
2052			if (KSE_IS_IDLE(ke)) {
2053				KSE_CLEAR_IDLE(ke);
2054				ke->k_kseg->kg_idle_kses--;
2055				return (&ke->k_kcb->kcb_kmbx);
2056			}
2057		}
2058	}
2059	return (NULL);
2060}
2061
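/*
 * Wake up idle KSEs in the current KSE's group, one for each thread
 * on the run queue, until either the runnable threads or the idle
 * KSEs are exhausted.
 */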
2062static void
2063kse_wakeup_multi(struct kse *curkse)
2064{
2065	struct kse *ke;
2066	int tmp;
2067
2068	if ((tmp = KSE_RUNQ_THREADS(curkse)) && curkse->k_kseg->kg_idle_kses) {
2069		TAILQ_FOREACH(ke, &curkse->k_kseg->kg_kseq, k_kgqe) {
2070			if (KSE_IS_IDLE(ke)) {
2071				KSE_CLEAR_IDLE(ke);
2072				ke->k_kseg->kg_idle_kses--;
2073				KSE_WAKEUP(ke);
2074				if (--tmp == 0)
2075					break;
2076			}
2077		}
2078	}
2079}
2080
2081/*
2082 * Allocate a new KSEG.
2083 *
2084 * We allow the current thread to be NULL in the case that this
2085 * is the first time a KSEG is being created (library initialization).
2086 * In this case, we don't need to (and can't) take any locks.
2087 */
2088struct kse_group *
2089_kseg_alloc(struct pthread *curthread)
2090{
2091	struct kse_group *kseg = NULL;
2092	kse_critical_t crit;
2093
2094	if ((curthread != NULL) && (free_kseg_count > 0)) {
2095		/* Use the kse lock for the kseg queue. */
2096		crit = _kse_critical_enter();
2097		KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
2098		if ((kseg = TAILQ_FIRST(&free_kse_groupq)) != NULL) {
2099			TAILQ_REMOVE(&free_kse_groupq, kseg, kg_qe);
2100			free_kseg_count--;
2101			active_kseg_count++;
2102			TAILQ_INSERT_TAIL(&active_kse_groupq, kseg, kg_qe);
2103		}
2104		KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
2105		_kse_critical_leave(crit);
2106		if (kseg)
2107			kseg_reinit(kseg);
2108	}
2109
	/*
	 * If a KSE group wasn't found in the free list, attempt to
	 * allocate a new one.
	 */
2115	if ((kseg == NULL) &&
2116	    ((kseg = (struct kse_group *)malloc(sizeof(*kseg))) != NULL)) {
2117		if (_pq_alloc(&kseg->kg_schedq.sq_runq,
2118		    THR_MIN_PRIORITY, THR_LAST_PRIORITY) != 0) {
2119			free(kseg);
2120			kseg = NULL;
2121		} else {
2122			kseg_init(kseg);
2123			/* Add the KSEG to the list of active KSEGs. */
2124			if (curthread != NULL) {
2125				crit = _kse_critical_enter();
2126				KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
2127				active_kseg_count++;
2128				TAILQ_INSERT_TAIL(&active_kse_groupq,
2129				    kseg, kg_qe);
2130				KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
2131				_kse_critical_leave(crit);
2132			} else {
2133				active_kseg_count++;
2134				TAILQ_INSERT_TAIL(&active_kse_groupq,
2135				    kseg, kg_qe);
2136			}
2137		}
2138	}
2139	return (kseg);
2140}
2141
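/*
 * Fully initialize a newly allocated KSEG, including its lock.
 */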
2142static void
2143kseg_init(struct kse_group *kseg)
2144{
2145	kseg_reinit(kseg);
2146	_lock_init(&kseg->kg_lock, LCK_ADAPTIVE, _kse_lock_wait,
2147	    _kse_lock_wakeup, calloc);
2148}
2149
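/*
 * Reset a KSEG to its empty state.  The KSEG's lock is left alone so
 * that cached KSEGs can be reused without reinitializing it.
 */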
2150static void
2151kseg_reinit(struct kse_group *kseg)
2152{
2153	TAILQ_INIT(&kseg->kg_kseq);
2154	TAILQ_INIT(&kseg->kg_threadq);
2155	TAILQ_INIT(&kseg->kg_schedq.sq_waitq);
2156	kseg->kg_threadcount = 0;
2157	kseg->kg_ksecount = 0;
2158	kseg->kg_idle_kses = 0;
2159	kseg->kg_flags = 0;
2160}
2161
2162/*
2163 * This must be called with the kse lock held and when there are
2164 * no more threads that reference it.
2165 */
2166static void
2167kseg_free_unlocked(struct kse_group *kseg)
2168{
2169	TAILQ_REMOVE(&active_kse_groupq, kseg, kg_qe);
2170	TAILQ_INSERT_HEAD(&free_kse_groupq, kseg, kg_qe);
2171	free_kseg_count++;
2172	active_kseg_count--;
2173}
2174
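/*
 * Same as kseg_free_unlocked(), but acquires the kse lock itself.
 */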
2175void
2176_kseg_free(struct kse_group *kseg)
2177{
2178	struct kse *curkse;
2179	kse_critical_t crit;
2180
2181	crit = _kse_critical_enter();
2182	curkse = _get_curkse();
2183	KSE_LOCK_ACQUIRE(curkse, &kse_lock);
2184	kseg_free_unlocked(kseg);
2185	KSE_LOCK_RELEASE(curkse, &kse_lock);
2186	_kse_critical_leave(crit);
2187}
2188
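/*
 * Release all resources held by a KSEG.
 */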
2189static void
2190kseg_destroy(struct kse_group *kseg)
2191{
2192	_lock_destroy(&kseg->kg_lock);
2193	_pq_free(&kseg->kg_schedq.sq_runq);
2194	free(kseg);
2195}
2196
2197/*
2198 * Allocate a new KSE.
2199 *
2200 * We allow the current thread to be NULL in the case that this
2201 * is the first time a KSE is being created (library initialization).
2202 * In this case, we don't need to (and can't) take any locks.
2203 */
2204struct kse *
2205_kse_alloc(struct pthread *curthread, int sys_scope)
2206{
2207	struct kse *kse = NULL;
2208	char *stack;
2209	kse_critical_t crit;
2210	int i;
2211
2212	if ((curthread != NULL) && (free_kse_count > 0)) {
2213		crit = _kse_critical_enter();
2214		KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
2215		/* Search for a finished KSE. */
2216		kse = TAILQ_FIRST(&free_kseq);
2217		while ((kse != NULL) &&
2218		    ((kse->k_kcb->kcb_kmbx.km_flags & KMF_DONE) == 0)) {
2219			kse = TAILQ_NEXT(kse, k_qe);
2220		}
2221		if (kse != NULL) {
2222			DBG_MSG("found an unused kse.\n");
2223			TAILQ_REMOVE(&free_kseq, kse, k_qe);
2224			free_kse_count--;
2225			TAILQ_INSERT_TAIL(&active_kseq, kse, k_qe);
2226			active_kse_count++;
2227		}
2228		KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
2229		_kse_critical_leave(crit);
2230		if (kse != NULL)
2231			kse_reinit(kse, sys_scope);
2232	}
2233	if ((kse == NULL) &&
2234	    ((kse = (struct kse *)malloc(sizeof(*kse))) != NULL)) {
2235		if (sys_scope != 0)
2236			stack = NULL;
2237		else if ((stack = malloc(KSE_STACKSIZE)) == NULL) {
2238			free(kse);
2239			return (NULL);
2240		}
2241		bzero(kse, sizeof(*kse));
2242
2243		/* Initialize KCB without the lock. */
2244		if ((kse->k_kcb = _kcb_ctor(kse)) == NULL) {
2245			if (stack != NULL)
2246				free(stack);
2247			free(kse);
2248			return (NULL);
2249		}
2250
2251		/* Initialize the lockusers. */
2252		for (i = 0; i < MAX_KSE_LOCKLEVEL; i++) {
2253			_lockuser_init(&kse->k_lockusers[i], (void *)kse);
2254			_LCK_SET_PRIVATE2(&kse->k_lockusers[i], NULL);
2255		}
2256		/* _lock_init(kse->k_lock, ...) */
2257
2258		if (curthread != NULL) {
2259			crit = _kse_critical_enter();
2260			KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
2261		}
2262		kse->k_flags = 0;
2263		TAILQ_INSERT_TAIL(&active_kseq, kse, k_qe);
2264		active_kse_count++;
2265		if (curthread != NULL) {
2266			KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
2267			_kse_critical_leave(crit);
2268		}
2269		/*
2270		 * Create the KSE context.
2271		 * Scope system threads (one thread per KSE) are not required
2272		 * to have a stack for an unneeded kse upcall.
2273		 */
2274		if (!sys_scope) {
2275			kse->k_kcb->kcb_kmbx.km_func = (kse_func_t *)kse_sched_multi;
2276			kse->k_stack.ss_sp = stack;
2277			kse->k_stack.ss_size = KSE_STACKSIZE;
2278		} else {
2279			kse->k_kcb->kcb_kmbx.km_func = (kse_func_t *)kse_sched_single;
2280			kse->k_stack.ss_sp = NULL;
2281			kse->k_stack.ss_size = 0;
2282		}
2283		kse->k_kcb->kcb_kmbx.km_udata = (void *)kse;
2284		kse->k_kcb->kcb_kmbx.km_quantum = 20000;
2285		/*
2286		 * We need to keep a copy of the stack in case it
2287		 * doesn't get used; a KSE running a scope system
2288		 * thread will use that thread's stack.
2289		 */
2290		kse->k_kcb->kcb_kmbx.km_stack = kse->k_stack;
2291	}
2292	return (kse);
2293}
2294
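/*
 * Reinitialize a cached KSE for reuse.  Scope process KSEs get an
 * upcall stack and kse_sched_multi() as their upcall function; scope
 * system KSEs use kse_sched_single() and no separate upcall stack.
 */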
2295static void
2296kse_reinit(struct kse *kse, int sys_scope)
2297{
2298	if (!sys_scope) {
2299		kse->k_kcb->kcb_kmbx.km_func = (kse_func_t *)kse_sched_multi;
2300		if (kse->k_stack.ss_sp == NULL) {
2301			/* XXX check allocation failure */
2302			kse->k_stack.ss_sp = (char *) malloc(KSE_STACKSIZE);
2303			kse->k_stack.ss_size = KSE_STACKSIZE;
2304		}
2305		kse->k_kcb->kcb_kmbx.km_quantum = 20000;
2306	} else {
2307		kse->k_kcb->kcb_kmbx.km_func = (kse_func_t *)kse_sched_single;
2308		if (kse->k_stack.ss_sp)
2309			free(kse->k_stack.ss_sp);
2310		kse->k_stack.ss_sp = NULL;
2311		kse->k_stack.ss_size = 0;
2312		kse->k_kcb->kcb_kmbx.km_quantum = 0;
2313	}
2314	kse->k_kcb->kcb_kmbx.km_stack = kse->k_stack;
2315	kse->k_kcb->kcb_kmbx.km_udata = (void *)kse;
2316	kse->k_kcb->kcb_kmbx.km_curthread = NULL;
2317	kse->k_kcb->kcb_kmbx.km_flags = 0;
2318	kse->k_curthread = NULL;
	kse->k_kseg = NULL;
	kse->k_schedq = NULL;
2321	kse->k_locklevel = 0;
2322	kse->k_flags = 0;
2323	kse->k_error = 0;
2324	kse->k_cpu = 0;
2325	kse->k_sigseqno = 0;
2326}
2327
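/*
 * Move a KSE from the active list to the free list.  The caller is
 * expected to hold the kse lock (except during library initialization,
 * when no locks are taken).
 */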
2328void
2329kse_free_unlocked(struct kse *kse)
2330{
2331	TAILQ_REMOVE(&active_kseq, kse, k_qe);
2332	active_kse_count--;
2333	kse->k_kseg = NULL;
2334	kse->k_kcb->kcb_kmbx.km_quantum = 20000;
2335	kse->k_flags = 0;
2336	TAILQ_INSERT_HEAD(&free_kseq, kse, k_qe);
2337	free_kse_count++;
2338}
2339
2340void
2341_kse_free(struct pthread *curthread, struct kse *kse)
2342{
2343	kse_critical_t crit;
2344
2345	if (curthread == NULL)
2346		kse_free_unlocked(kse);
2347	else {
2348		crit = _kse_critical_enter();
2349		KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
2350		kse_free_unlocked(kse);
2351		KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
2352		_kse_critical_leave(crit);
2353	}
2354}
2355
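/*
 * Free all resources associated with a KSE: its upcall stack, KCB,
 * lockusers, and lock.
 */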
2356static void
2357kse_destroy(struct kse *kse)
2358{
2359	int i;
2360
2361	if (kse->k_stack.ss_sp != NULL)
2362		free(kse->k_stack.ss_sp);
2363	_kcb_dtor(kse->k_kcb);
2364	for (i = 0; i < MAX_KSE_LOCKLEVEL; ++i)
2365		_lockuser_destroy(&kse->k_lockusers[i]);
2366	_lock_destroy(&kse->k_lock);
2367	free(kse);
2368}
2369
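/*
 * Allocate a thread structure, reusing a cached one when possible.
 * For a newly malloc'd thread, the siginfo array, TCB, thread lock,
 * and lockusers are also initialized here.
 */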
2370struct pthread *
2371_thr_alloc(struct pthread *curthread)
2372{
2373	kse_critical_t	crit;
2374	struct pthread	*thread = NULL;
2375	int i;
2376
2377	if (curthread != NULL) {
2378		if (GC_NEEDED())
2379			_thr_gc(curthread);
2380		if (free_thread_count > 0) {
2381			crit = _kse_critical_enter();
2382			KSE_LOCK_ACQUIRE(curthread->kse, &thread_lock);
2383			if ((thread = TAILQ_FIRST(&free_threadq)) != NULL) {
2384				TAILQ_REMOVE(&free_threadq, thread, tle);
2385				free_thread_count--;
2386			}
2387			KSE_LOCK_RELEASE(curthread->kse, &thread_lock);
2388			_kse_critical_leave(crit);
2389		}
2390	}
2391	if ((thread == NULL) &&
2392	    ((thread = malloc(sizeof(struct pthread))) != NULL)) {
2393		bzero(thread, sizeof(struct pthread));
2394		thread->siginfo = calloc(_SIG_MAXSIG, sizeof(siginfo_t));
2395		if (thread->siginfo == NULL) {
2396			free(thread);
2397			return (NULL);
2398		}
2399		if (curthread) {
2400			_pthread_mutex_lock(&_tcb_mutex);
2401			thread->tcb = _tcb_ctor(thread, 0 /* not initial tls */);
2402			_pthread_mutex_unlock(&_tcb_mutex);
2403		} else {
2404			thread->tcb = _tcb_ctor(thread, 1 /* initial tls */);
2405		}
2406		if (thread->tcb == NULL) {
2407			free(thread->siginfo);
2408			free(thread);
2409			return (NULL);
2410		}
2411		/*
2412		 * Initialize thread locking.
		 * Lock initialization needs malloc, so don't
		 * enter a critical region before doing this!
2415		 */
2416		if (_lock_init(&thread->lock, LCK_ADAPTIVE,
2417		    _thr_lock_wait, _thr_lock_wakeup, calloc) != 0)
2418			PANIC("Cannot initialize thread lock");
2419		for (i = 0; i < MAX_THR_LOCKLEVEL; i++) {
2420			_lockuser_init(&thread->lockusers[i], (void *)thread);
2421			_LCK_SET_PRIVATE2(&thread->lockusers[i],
2422			    (void *)thread);
2423		}
2424	}
2425	return (thread);
2426}
2427
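/*
 * Free a thread structure.  Threads are cached on the free list up to
 * MAX_CACHED_THREADS; beyond that they are destroyed outright.
 */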
2428void
2429_thr_free(struct pthread *curthread, struct pthread *thread)
2430{
2431	kse_critical_t crit;
2432
2433	DBG_MSG("Freeing thread %p\n", thread);
2434	if (thread->name) {
2435		free(thread->name);
2436		thread->name = NULL;
2437	}
2438	if ((curthread == NULL) || (free_thread_count >= MAX_CACHED_THREADS)) {
2439		thr_destroy(curthread, thread);
2440	} else {
2441		/* Add the thread to the free thread list. */
2442		crit = _kse_critical_enter();
2443		KSE_LOCK_ACQUIRE(curthread->kse, &thread_lock);
2444		TAILQ_INSERT_TAIL(&free_threadq, thread, tle);
2445		free_thread_count++;
2446		KSE_LOCK_RELEASE(curthread->kse, &thread_lock);
2447		_kse_critical_leave(crit);
2448	}
2449}
2450
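/*
 * Release all resources held by a thread structure.
 */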
2451static void
2452thr_destroy(struct pthread *curthread, struct pthread *thread)
2453{
2454	int i;
2455
2456	for (i = 0; i < MAX_THR_LOCKLEVEL; i++)
2457		_lockuser_destroy(&thread->lockusers[i]);
2458	_lock_destroy(&thread->lock);
2459	if (curthread) {
2460		_pthread_mutex_lock(&_tcb_mutex);
2461		_tcb_dtor(thread->tcb);
2462		_pthread_mutex_unlock(&_tcb_mutex);
2463	} else {
2464		_tcb_dtor(thread->tcb);
2465	}
2466	free(thread->siginfo);
2467	free(thread);
2468}
2469
2470/*
2471 * Add an active thread:
2472 *
2473 *   o Assign the thread a unique id (which GDB uses to track
 *     threads).
 *   o Add the thread to the list of all threads and increment
 *     the number of active threads.
2477 */
2478static void
2479thr_link(struct pthread *thread)
2480{
2481	kse_critical_t crit;
2482	struct kse *curkse;
2483
2484	crit = _kse_critical_enter();
2485	curkse = _get_curkse();
2486	KSE_LOCK_ACQUIRE(curkse, &_thread_list_lock);
2487	/*
2488	 * Initialize the unique id (which GDB uses to track
2489	 * threads), add the thread to the list of all threads,
	 * and increment the count of active threads.
2491	 */
2492	thread->uniqueid = next_uniqueid++;
2493	THR_LIST_ADD(thread);
2494	_thread_active_threads++;
2495	KSE_LOCK_RELEASE(curkse, &_thread_list_lock);
2496	_kse_critical_leave(crit);
2497}
2498
2499/*
2500 * Remove an active thread.
2501 */
2502static void
2503thr_unlink(struct pthread *thread)
2504{
2505	kse_critical_t crit;
2506	struct kse *curkse;
2507
2508	crit = _kse_critical_enter();
2509	curkse = _get_curkse();
2510	KSE_LOCK_ACQUIRE(curkse, &_thread_list_lock);
2511	THR_LIST_REMOVE(thread);
2512	_thread_active_threads--;
2513	KSE_LOCK_RELEASE(curkse, &_thread_list_lock);
2514	_kse_critical_leave(crit);
2515}
2516
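/*
 * The thread hash table maps thread pointers to thread structures;
 * _thr_hash_find() is used to check whether a given pointer refers to
 * a known thread.
 */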
2517void
2518_thr_hash_add(struct pthread *thread)
2519{
2520	struct thread_hash_head *head;
2521
2522	head = &thr_hashtable[THREAD_HASH(thread)];
2523	LIST_INSERT_HEAD(head, thread, hle);
2524}
2525
2526void
2527_thr_hash_remove(struct pthread *thread)
2528{
2529	LIST_REMOVE(thread, hle);
2530}
2531
2532struct pthread *
2533_thr_hash_find(struct pthread *thread)
2534{
2535	struct pthread *td;
2536	struct thread_hash_head *head;
2537
2538	head = &thr_hashtable[THREAD_HASH(thread)];
2539	LIST_FOREACH(td, head, hle) {
2540		if (td == thread)
2541			return (thread);
2542	}
2543	return (NULL);
2544}
2545
2546void
2547_thr_debug_check_yield(struct pthread *curthread)
2548{
2549	/*
	 * Note that TMDF_SUSPEND is set after the process is suspended.
	 * When we are being debugged, every process suspension causes
	 * all KSEs to schedule an upcall in the kernel, unless the KSE
	 * is in a critical region.
	 * If this function is being called, the KSE is no longer in a
	 * critical region.  If TMDF_SUSPEND was set by the debugger
	 * before the KSE left the critical region, we will catch it
	 * here.  If the flag changes while we are testing it, that is
	 * not a problem either, because the change only occurs after a
	 * process suspension event, and a suspension event always
	 * causes the KSE to schedule an upcall.  In that case, because
	 * we are not in a critical region, the upcall will be scheduled
	 * successfully and the flag will be checked again in
	 * kse_sched_multi(); we won't come back until the debugger
	 * clears the flag, which happens at the next suspension event.
2565	 */
2566	if (!DBG_CAN_RUN(curthread)) {
2567		if ((curthread->attr.flags & PTHREAD_SCOPE_SYSTEM) == 0)
2568			_thr_sched_switch(curthread);
2569		else
2570			kse_thr_interrupt(&curthread->tcb->tcb_tmbx,
2571				KSE_INTR_DBSUSPEND, 0);
2572	}
2573}
2574