thr_kern.c revision 149617
1/*
2 * Copyright (C) 2003 Daniel M. Eischen <deischen@freebsd.org>
3 * Copyright (C) 2002 Jonathon Mini <mini@freebsd.org>
4 * Copyright (c) 1995-1998 John Birrell <jb@cimlogic.com.au>
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 * 3. All advertising materials mentioning features or use of this software
16 *    must display the following acknowledgement:
17 *	This product includes software developed by John Birrell.
18 * 4. Neither the name of the author nor the names of any co-contributors
19 *    may be used to endorse or promote products derived from this software
20 *    without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY JOHN BIRRELL AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 */
35#include <sys/cdefs.h>
36__FBSDID("$FreeBSD: head/lib/libkse/thread/thr_kern.c 149617 2005-08-30 12:42:00Z deischen $");
37
38#include <sys/types.h>
39#include <sys/kse.h>
40#include <sys/ptrace.h>
41#include <sys/signalvar.h>
42#include <sys/queue.h>
43#include <machine/atomic.h>
44#include <machine/sigframe.h>
45
46#include <assert.h>
47#include <errno.h>
48#include <signal.h>
49#include <stdlib.h>
50#include <string.h>
51#include <time.h>
52#include <ucontext.h>
53#include <unistd.h>
54
55#include "atomic_ops.h"
56#include "thr_private.h"
57#include "libc_private.h"
58
59/* #define DEBUG_THREAD_KERN */
60#ifdef DEBUG_THREAD_KERN
61#define DBG_MSG		stdout_debug
62#else
63#define DBG_MSG(x...)
64#endif
65
66/*
67 * Define a high water mark for the maximum number of threads that
68 * will be cached.  Once this level is reached, any extra threads
69 * will be free()'d.
70 */
71#define	MAX_CACHED_THREADS	100
72/*
73 * Define high water marks for the maximum number of KSEs and KSE groups
74 * that will be cached.  Because we support 1:1 threading, there could be the
75 * same number of KSEs and KSE groups as threads.  Once these levels are
76 * reached, any extra KSEs and KSE groups will be free()'d.
77 */
78#define	MAX_CACHED_KSES		((_thread_scope_system <= 0) ? 50 : 100)
79#define	MAX_CACHED_KSEGS	((_thread_scope_system <= 0) ? 50 : 100)
80
81#define	KSE_SET_MBOX(kse, thrd) \
82	(kse)->k_kcb->kcb_kmbx.km_curthread = &(thrd)->tcb->tcb_tmbx
83
84#define	KSE_SET_EXITED(kse)	(kse)->k_flags |= KF_EXITED
85
86/*
87 * Macros for manipulating the run queues.  The priority queue
88 * routines use the thread's pqe link and also handle the setting
89 * and clearing of the thread's THR_FLAGS_IN_RUNQ flag.
90 */
91#define	KSE_RUNQ_INSERT_HEAD(kse, thrd)			\
92	_pq_insert_head(&(kse)->k_schedq->sq_runq, thrd)
93#define	KSE_RUNQ_INSERT_TAIL(kse, thrd)			\
94	_pq_insert_tail(&(kse)->k_schedq->sq_runq, thrd)
95#define	KSE_RUNQ_REMOVE(kse, thrd)			\
96	_pq_remove(&(kse)->k_schedq->sq_runq, thrd)
97#define	KSE_RUNQ_FIRST(kse)				\
98	((_libkse_debug == 0) ?				\
99	 _pq_first(&(kse)->k_schedq->sq_runq) :		\
100	 _pq_first_debug(&(kse)->k_schedq->sq_runq))
101
102#define KSE_RUNQ_THREADS(kse)	((kse)->k_schedq->sq_runq.pq_threads)
103
104#define THR_NEED_CANCEL(thrd)						\
105	 (((thrd)->cancelflags & THR_CANCELLING) != 0 &&		\
106	  ((thrd)->cancelflags & PTHREAD_CANCEL_DISABLE) == 0 &&	\
107	  (((thrd)->cancelflags & THR_AT_CANCEL_POINT) != 0 ||		\
108	   ((thrd)->cancelflags & PTHREAD_CANCEL_ASYNCHRONOUS) != 0))
109
110#define THR_NEED_ASYNC_CANCEL(thrd)					\
111	 (((thrd)->cancelflags & THR_CANCELLING) != 0 &&		\
112	  ((thrd)->cancelflags & PTHREAD_CANCEL_DISABLE) == 0 &&	\
113	  (((thrd)->cancelflags & THR_AT_CANCEL_POINT) == 0 &&		\
114	   ((thrd)->cancelflags & PTHREAD_CANCEL_ASYNCHRONOUS) != 0))
115
116/*
117 * We've got to keep track of everything that is allocated, not only
118 * to have a speedy free list, but also so they can be deallocated
119 * after a fork().
120 */
121static TAILQ_HEAD(, kse)	active_kseq;
122static TAILQ_HEAD(, kse)	free_kseq;
123static TAILQ_HEAD(, kse_group)	free_kse_groupq;
124static TAILQ_HEAD(, kse_group)	active_kse_groupq;
125static TAILQ_HEAD(, kse_group)	gc_ksegq;
126static struct lock		kse_lock;	/* also used for kseg queue */
127static int			free_kse_count = 0;
128static int			free_kseg_count = 0;
129static TAILQ_HEAD(, pthread)	free_threadq;
130static struct lock		thread_lock;
131static int			free_thread_count = 0;
132static int			inited = 0;
133static int			active_kse_count = 0;
134static int			active_kseg_count = 0;
135static u_int64_t		next_uniqueid = 1;
136
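/* Hash table of threads, hashed on the thread pointer. */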
137LIST_HEAD(thread_hash_head, pthread);
138#define THREAD_HASH_QUEUES	127
139static struct thread_hash_head	thr_hashtable[THREAD_HASH_QUEUES];
140#define	THREAD_HASH(thrd)	((unsigned long)thrd % THREAD_HASH_QUEUES)
141
142/* Lock for thread tcb constructor/destructor */
143static pthread_mutex_t		_tcb_mutex;
144
145#ifdef DEBUG_THREAD_KERN
146static void	dump_queues(struct kse *curkse);
147#endif
148static void	kse_check_completed(struct kse *kse);
149static void	kse_check_waitq(struct kse *kse);
150static void	kse_fini(struct kse *curkse);
151static void	kse_reinit(struct kse *kse, int sys_scope);
152static void	kse_sched_multi(struct kse_mailbox *kmbx);
153static void	kse_sched_single(struct kse_mailbox *kmbx);
154static void	kse_switchout_thread(struct kse *kse, struct pthread *thread);
155static void	kse_wait(struct kse *kse, struct pthread *td_wait, int sigseq);
156static void	kse_free_unlocked(struct kse *kse);
157static void	kse_destroy(struct kse *kse);
158static void	kseg_free_unlocked(struct kse_group *kseg);
159static void	kseg_init(struct kse_group *kseg);
160static void	kseg_reinit(struct kse_group *kseg);
161static void	kseg_destroy(struct kse_group *kseg);
162static void	kse_waitq_insert(struct pthread *thread);
163static void	kse_wakeup_multi(struct kse *curkse);
164static struct kse_mailbox *kse_wakeup_one(struct pthread *thread);
165static void	thr_cleanup(struct kse *kse, struct pthread *curthread);
166static void	thr_link(struct pthread *thread);
167static void	thr_resume_wrapper(int sig, siginfo_t *, ucontext_t *);
168static void	thr_resume_check(struct pthread *curthread, ucontext_t *ucp);
169static int	thr_timedout(struct pthread *thread, struct timespec *curtime);
170static void	thr_unlink(struct pthread *thread);
171static void	thr_destroy(struct pthread *curthread, struct pthread *thread);
172static void	thread_gc(struct pthread *thread);
173static void	kse_gc(struct pthread *thread);
174static void	kseg_gc(struct pthread *thread);
175
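/*
 * Charge the thread's accumulated user and system ticks against its
 * time slice; once the slice exceeds TIMESLICE_USEC it is marked as
 * expired (-1).  SCHED_FIFO threads are not time-sliced.  The tick
 * counters are always cleared.
 */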
176static void __inline
177thr_accounting(struct pthread *thread)
178{
179	if ((thread->slice_usec != -1) &&
180	    (thread->slice_usec <= TIMESLICE_USEC) &&
181	    (thread->attr.sched_policy != SCHED_FIFO)) {
182		thread->slice_usec += (thread->tcb->tcb_tmbx.tm_uticks
183		    + thread->tcb->tcb_tmbx.tm_sticks) * _clock_res_usec;
184		/* Check for time quantum exceeded: */
185		if (thread->slice_usec > TIMESLICE_USEC)
186			thread->slice_usec = -1;
187	}
188	thread->tcb->tcb_tmbx.tm_uticks = 0;
189	thread->tcb->tcb_tmbx.tm_sticks = 0;
190}
191
192/*
193 * This is called after a fork().
194 * No locks need to be taken here since we are guaranteed to be
195 * single threaded.
196 *
197 * XXX
198 * POSIX says that in a threaded process, fork() should be used
199 * only to run new programs, and that the effects of calling functions
200 * that require certain resources between the call to fork() and
201 * the call to an exec function are undefined.
202 *
203 * It is not safe to free memory after fork(), because these data
204 * structures may be in an inconsistent state.
205 */
206void
207_kse_single_thread(struct pthread *curthread)
208{
209#ifdef NOTYET
210	struct kse *kse;
211	struct kse_group *kseg;
212	struct pthread *thread;
213	kse_critical_t crit;
214	int i;
215
216	if (__isthreaded) {
217		_thr_rtld_fini();
218		_thr_signal_deinit();
219	}
220	__isthreaded = 0;
221	/*
222	 * Restore the signal mask early so that any memory
223	 * problems can dump core.
224	 */
225	sigprocmask(SIG_SETMASK, &curthread->sigmask, NULL);
226	_thread_active_threads = 1;
227
228	/*
229	 * Enter a loop to remove and free all threads other than
230	 * the running thread from the active thread list:
231	 */
232	while ((thread = TAILQ_FIRST(&_thread_list)) != NULL) {
233		THR_GCLIST_REMOVE(thread);
234		/*
235		 * Remove this thread from the list (the current
236		 * thread will be removed but re-added by libpthread
237		 * initialization).
238		 */
239		TAILQ_REMOVE(&_thread_list, thread, tle);
240		/* Make sure this isn't the running thread: */
241		if (thread != curthread) {
242			_thr_stack_free(&thread->attr);
243			if (thread->specific != NULL)
244				free(thread->specific);
245			thr_destroy(curthread, thread);
246		}
247	}
248
249	TAILQ_INIT(&curthread->mutexq);		/* initialize mutex queue */
250	curthread->joiner = NULL;		/* no joining threads yet */
251	curthread->refcount = 0;
252	SIGEMPTYSET(curthread->sigpend);	/* clear pending signals */
253	if (curthread->specific != NULL) {
254		free(curthread->specific);
255		curthread->specific = NULL;
256		curthread->specific_data_count = 0;
257	}
258
259	/* Free the free KSEs: */
260	while ((kse = TAILQ_FIRST(&free_kseq)) != NULL) {
261		TAILQ_REMOVE(&free_kseq, kse, k_qe);
262		kse_destroy(kse);
263	}
264	free_kse_count = 0;
265
266	/* Free the active KSEs: */
267	while ((kse = TAILQ_FIRST(&active_kseq)) != NULL) {
268		TAILQ_REMOVE(&active_kseq, kse, k_qe);
269		kse_destroy(kse);
270	}
271	active_kse_count = 0;
272
273	/* Free the free KSEGs: */
274	while ((kseg = TAILQ_FIRST(&free_kse_groupq)) != NULL) {
275		TAILQ_REMOVE(&free_kse_groupq, kseg, kg_qe);
276		kseg_destroy(kseg);
277	}
278	free_kseg_count = 0;
279
280	/* Free the active KSEGs: */
281	while ((kseg = TAILQ_FIRST(&active_kse_groupq)) != NULL) {
282		TAILQ_REMOVE(&active_kse_groupq, kseg, kg_qe);
283		kseg_destroy(kseg);
284	}
285	active_kseg_count = 0;
286
287	/* Free the free threads. */
288	while ((thread = TAILQ_FIRST(&free_threadq)) != NULL) {
289		TAILQ_REMOVE(&free_threadq, thread, tle);
290		thr_destroy(curthread, thread);
291	}
292	free_thread_count = 0;
293
294	/* Free the to-be-gc'd threads. */
295	while ((thread = TAILQ_FIRST(&_thread_gc_list)) != NULL) {
296		TAILQ_REMOVE(&_thread_gc_list, thread, gcle);
297		thr_destroy(curthread, thread);
298	}
299	TAILQ_INIT(&gc_ksegq);
300	_gc_count = 0;
301
302	if (inited != 0) {
303		/*
304		 * Destroy these locks; they'll be recreated to assure they
305		 * are in the unlocked state.
306		 */
307		_lock_destroy(&kse_lock);
308		_lock_destroy(&thread_lock);
309		_lock_destroy(&_thread_list_lock);
310		inited = 0;
311	}
312
313	/*
314	 * After a fork(), the leftover thread goes back to being
315	 * a process scope thread.
316	 */
317	curthread->attr.flags &= ~PTHREAD_SCOPE_SYSTEM;
318	curthread->attr.flags |= PTHREAD_SCOPE_PROCESS;
319
320	/*
321	 * After a fork, we are still operating on the thread's original
322	 * stack.  Don't clear the THR_FLAGS_USER from the thread's
323	 * attribute flags.
324	 */
325
326	/* Initialize the threads library. */
327	curthread->kse = NULL;
328	curthread->kseg = NULL;
329	_kse_initial = NULL;
330	_libpthread_init(curthread);
331#else
332	int i;
333
334	/* Reset the current thread and KSE lock data. */
335	for (i = 0; i < curthread->locklevel; i++) {
336		_lockuser_reinit(&curthread->lockusers[i], (void *)curthread);
337	}
338	curthread->locklevel = 0;
339	for (i = 0; i < curthread->kse->k_locklevel; i++) {
340		_lockuser_reinit(&curthread->kse->k_lockusers[i],
341		    (void *)curthread->kse);
342		_LCK_SET_PRIVATE2(&curthread->kse->k_lockusers[i], NULL);
343	}
344	curthread->kse->k_locklevel = 0;
345	_thr_spinlock_init();
346	if (__isthreaded) {
347		_thr_rtld_fini();
348		_thr_signal_deinit();
349	}
350	__isthreaded = 0;
351	curthread->kse->k_kcb->kcb_kmbx.km_curthread = NULL;
352	curthread->attr.flags |= PTHREAD_SCOPE_SYSTEM;
353
354	/* After a fork(), the child should have no pending signals. */
355	sigemptyset(&curthread->sigpend);
356
357	/*
358	 * Restore the signal mask early so that any memory
359	 * problems can dump core.
360	 */
361	sigprocmask(SIG_SETMASK, &curthread->sigmask, NULL);
362	_thread_active_threads = 1;
363#endif
364}
365
366/*
367 * This is used to initialize housekeeping and to initialize the
368 * KSD for the KSE.
369 */
370void
371_kse_init(void)
372{
373	if (inited == 0) {
374		TAILQ_INIT(&active_kseq);
375		TAILQ_INIT(&active_kse_groupq);
376		TAILQ_INIT(&free_kseq);
377		TAILQ_INIT(&free_kse_groupq);
378		TAILQ_INIT(&free_threadq);
379		TAILQ_INIT(&gc_ksegq);
380		if (_lock_init(&kse_lock, LCK_ADAPTIVE,
381		    _kse_lock_wait, _kse_lock_wakeup) != 0)
382			PANIC("Unable to initialize free KSE queue lock");
383		if (_lock_init(&thread_lock, LCK_ADAPTIVE,
384		    _kse_lock_wait, _kse_lock_wakeup) != 0)
385			PANIC("Unable to initialize free thread queue lock");
386		if (_lock_init(&_thread_list_lock, LCK_ADAPTIVE,
387		    _kse_lock_wait, _kse_lock_wakeup) != 0)
388			PANIC("Unable to initialize thread list lock");
389		_pthread_mutex_init(&_tcb_mutex, NULL);
390		active_kse_count = 0;
391		active_kseg_count = 0;
392		_gc_count = 0;
393		inited = 1;
394	}
395}
396
397/*
398 * This is called when the first thread (other than the initial
399 * thread) is created.
400 */
401int
402_kse_setthreaded(int threaded)
403{
404	sigset_t sigset;
405
406	if ((threaded != 0) && (__isthreaded == 0)) {
407		SIGFILLSET(sigset);
408		__sys_sigprocmask(SIG_SETMASK, &sigset, &_thr_initial->sigmask);
409
410		/*
411		 * Tell the kernel to create a KSE for the initial thread
412		 * and enable upcalls in it.
413		 */
414		_kse_initial->k_flags |= KF_STARTED;
415
416		if (_thread_scope_system <= 0) {
417			_thr_initial->attr.flags &= ~PTHREAD_SCOPE_SYSTEM;
418			_kse_initial->k_kseg->kg_flags &= ~KGF_SINGLE_THREAD;
419			_kse_initial->k_kcb->kcb_kmbx.km_curthread = NULL;
420		}
421		else {
422			/*
423			 * For a bound thread, the kernel reads the mailbox pointer
424			 * only once, so set it here before calling kse_create().
425			 */
426			_tcb_set(_kse_initial->k_kcb, _thr_initial->tcb);
427			KSE_SET_MBOX(_kse_initial, _thr_initial);
428			_kse_initial->k_kcb->kcb_kmbx.km_flags |= KMF_BOUND;
429		}
430
431		/*
432		 * Locking functions in libc are required when there are
433		 * threads other than the initial thread.
434		 */
435		_thr_rtld_init();
436
437		__isthreaded = 1;
438		if (kse_create(&_kse_initial->k_kcb->kcb_kmbx, 0) != 0) {
439			_kse_initial->k_flags &= ~KF_STARTED;
440			__isthreaded = 0;
441			PANIC("kse_create() failed\n");
442			return (-1);
443		}
444		_thr_initial->tcb->tcb_tmbx.tm_lwp =
445			_kse_initial->k_kcb->kcb_kmbx.km_lwp;
446		_thread_activated = 1;
447
448#ifndef SYSTEM_SCOPE_ONLY
449		if (_thread_scope_system <= 0) {
450			/* Set current thread to initial thread */
451			_tcb_set(_kse_initial->k_kcb, _thr_initial->tcb);
452			KSE_SET_MBOX(_kse_initial, _thr_initial);
453			_thr_start_sig_daemon();
454			_thr_setmaxconcurrency();
455		}
456		else
457#endif
458			__sys_sigprocmask(SIG_SETMASK, &_thr_initial->sigmask,
459			    NULL);
460	}
461	return (0);
462}
463
464/*
465 * Lock wait and wakeup handlers for KSE locks.  These are only used by
466 * KSEs, and should never be used by threads.  KSE locks include the
467 * KSE group lock (used for locking the scheduling queue) and the
468 * kse_lock defined above.
469 *
470 * When a KSE lock attempt blocks, the entire KSE blocks allowing another
471 * KSE to run.  For the most part, it doesn't make much sense to try and
472 * schedule another thread because you need to lock the scheduling queue
473 * in order to do that.  And since the KSE lock is used to lock the scheduling
474 * queue, you would just end up blocking again.
475 */
476void
477_kse_lock_wait(struct lock *lock, struct lockuser *lu)
478{
479	struct kse *curkse = (struct kse *)_LCK_GET_PRIVATE(lu);
480	struct timespec ts;
481	int saved_flags;
482
483	if (curkse->k_kcb->kcb_kmbx.km_curthread != NULL)
484		PANIC("kse_lock_wait does not disable upcall.\n");
485	/*
486	 * Enter a loop to wait until we get the lock.
487	 */
488	ts.tv_sec = 0;
489	ts.tv_nsec = 1000000;  /* 1 msec */
490	while (!_LCK_GRANTED(lu)) {
491		/*
492		 * Yield the kse and wait to be notified when the lock
493		 * is granted.
494		 */
495		saved_flags = curkse->k_kcb->kcb_kmbx.km_flags;
496		curkse->k_kcb->kcb_kmbx.km_flags |= KMF_NOUPCALL |
497		    KMF_NOCOMPLETED;
498		kse_release(&ts);
499		curkse->k_kcb->kcb_kmbx.km_flags = saved_flags;
500	}
501}
502
503void
504_kse_lock_wakeup(struct lock *lock, struct lockuser *lu)
505{
506	struct kse *curkse;
507	struct kse *kse;
508	struct kse_mailbox *mbx;
509
510	curkse = _get_curkse();
511	kse = (struct kse *)_LCK_GET_PRIVATE(lu);
512
513	if (kse == curkse)
514		PANIC("KSE trying to wake itself up in lock");
515	else {
516		mbx = &kse->k_kcb->kcb_kmbx;
517		_lock_grant(lock, lu);
518		/*
519		 * Notify the owning kse that it has the lock.
520		 * It is safe to pass an invalid address to kse_wakeup
521		 * even if the mailbox is not in the kernel at all,
522		 * and waking up the wrong kse is also harmless.
523		 */
524		kse_wakeup(mbx);
525	}
526}
527
528/*
529 * Thread wait and wakeup handlers for thread locks.  These are only used
530 * by threads, never by KSEs.  Thread locks include the per-thread lock
531 * (defined in its structure), and condition variable and mutex locks.
532 */
533void
534_thr_lock_wait(struct lock *lock, struct lockuser *lu)
535{
536	struct pthread *curthread = (struct pthread *)lu->lu_private;
537
538	do {
539		THR_LOCK_SWITCH(curthread);
540		THR_SET_STATE(curthread, PS_LOCKWAIT);
541		_thr_sched_switch_unlocked(curthread);
542	} while (!_LCK_GRANTED(lu));
543}
544
545void
546_thr_lock_wakeup(struct lock *lock, struct lockuser *lu)
547{
548	struct pthread *thread;
549	struct pthread *curthread;
550	struct kse_mailbox *kmbx;
551
552	curthread = _get_curthread();
553	thread = (struct pthread *)_LCK_GET_PRIVATE(lu);
554
555	THR_SCHED_LOCK(curthread, thread);
556	_lock_grant(lock, lu);
557	kmbx = _thr_setrunnable_unlocked(thread);
558	THR_SCHED_UNLOCK(curthread, thread);
559	if (kmbx != NULL)
560		kse_wakeup(kmbx);
561}
562
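/*
 * Entering a KSE critical region clears the current-thread pointer in
 * the KSE's mailbox, which disables upcalls; the previous value is
 * returned as a cookie and restored by _kse_critical_leave().
 */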
563kse_critical_t
564_kse_critical_enter(void)
565{
566	kse_critical_t crit;
567
568	crit = (kse_critical_t)_kcb_critical_enter();
569	return (crit);
570}
571
572void
573_kse_critical_leave(kse_critical_t crit)
574{
575	struct pthread *curthread;
576
577	_kcb_critical_leave((struct kse_thr_mailbox *)crit);
578	if ((crit != NULL) && ((curthread = _get_curthread()) != NULL))
579		THR_YIELD_CHECK(curthread);
580}
581
582int
583_kse_in_critical(void)
584{
585	return (_kcb_in_critical());
586}
587
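/*
 * Per-thread critical regions are just a nesting count; a deferred
 * yield is checked for when the count drops in _thr_critical_leave().
 */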
588void
589_thr_critical_enter(struct pthread *thread)
590{
591	thread->critical_count++;
592}
593
594void
595_thr_critical_leave(struct pthread *thread)
596{
597	thread->critical_count--;
598	THR_YIELD_CHECK(thread);
599}
600
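/*
 * Voluntarily enter the scheduler: enter a critical region, take the
 * KSE group's scheduling lock and switch the current thread out.
 */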
601void
602_thr_sched_switch(struct pthread *curthread)
603{
604	struct kse *curkse;
605
606	(void)_kse_critical_enter();
607	curkse = _get_curkse();
608	KSE_SCHED_LOCK(curkse, curkse->k_kseg);
609	_thr_sched_switch_unlocked(curthread);
610}
611
612/*
613 * XXX - We may need to take the scheduling lock before calling
614 *       this, or perhaps take the lock within here before
615 *       doing anything else.
616 */
617void
618_thr_sched_switch_unlocked(struct pthread *curthread)
619{
620	struct kse *curkse;
621	volatile int resume_once = 0;
622	ucontext_t *uc;
623
624	/* We're in the scheduler, 5 by 5: */
625	curkse = curthread->kse;
626
627	curthread->need_switchout = 1;	/* The thread yielded on its own. */
628	curthread->critical_yield = 0;	/* No need to yield anymore. */
629
630	/* Thread can unlock the scheduler lock. */
631	curthread->lock_switch = 1;
632
633	if (curthread->attr.flags & PTHREAD_SCOPE_SYSTEM)
634		kse_sched_single(&curkse->k_kcb->kcb_kmbx);
635	else {
636		if (__predict_false(_libkse_debug != 0)) {
637			/*
638			 * Because the debugger saves the single-step status
639			 * in the thread mailbox's tm_dflags, we can safely
640			 * clear the single-step status here.  It will be
641			 * restored by kse_switchin when the thread is
642			 * switched in again.  This also lets the UTS run at
643			 * full speed.
644			 */
645			 ptrace(PT_CLEARSTEP, curkse->k_kcb->kcb_kmbx.km_lwp,
646				(caddr_t) 1, 0);
647		}
648
649		KSE_SET_SWITCH(curkse);
650		_thread_enter_uts(curthread->tcb, curkse->k_kcb);
651	}
652
653	/*
654	 * Unlock the scheduling queue and leave the
655	 * critical region.
656	 */
657	/* Don't trust this after a switch! */
658	curkse = curthread->kse;
659
660	curthread->lock_switch = 0;
661	KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
662	_kse_critical_leave(&curthread->tcb->tcb_tmbx);
663
664	/*
665	 * This thread is being resumed; check for cancellations.
666	 */
667	if (THR_NEED_ASYNC_CANCEL(curthread) && !THR_IN_CRITICAL(curthread)) {
668		uc = alloca(sizeof(ucontext_t));
669		resume_once = 0;
670		THR_GETCONTEXT(uc);
671		if (resume_once == 0) {
672			resume_once = 1;
673			curthread->check_pending = 0;
674			thr_resume_check(curthread, uc);
675		}
676	}
677	THR_ACTIVATE_LAST_LOCK(curthread);
678}
679
680/*
681 * This is the scheduler for a KSE which runs a scope system thread.
682 * The multi-thread KSE scheduler should also work for a single threaded
683 * KSE, but we use a separate scheduler so that it can be fine-tuned
684 * to be more efficient (and perhaps not need a separate stack for
685 * the KSE, allowing it to use the thread's stack).
686 */
687
688static void
689kse_sched_single(struct kse_mailbox *kmbx)
690{
691	struct kse *curkse;
692	struct pthread *curthread;
693	struct timespec ts;
694	sigset_t sigmask;
695	int i, sigseqno, level, first = 0;
696
697	curkse = (struct kse *)kmbx->km_udata;
698	curthread = curkse->k_curthread;
699
700	if (__predict_false((curkse->k_flags & KF_INITIALIZED) == 0)) {
701		/* Set up this KSE's specific data. */
702		_kcb_set(curkse->k_kcb);
703		_tcb_set(curkse->k_kcb, curthread->tcb);
704		curkse->k_flags |= KF_INITIALIZED;
705		first = 1;
706		curthread->active = 1;
707
708		/* Set up the kernel signal mask for the new thread. */
709		__sys_sigprocmask(SIG_SETMASK, &curthread->sigmask, NULL);
710		/*
711		 * Enter a critical region.  This is meaningless for a
712		 * bound thread, but it satisfies other code that expects
713		 * the mailbox to be cleared.
714		 */
715		(void)_kse_critical_enter();
716 	} else {
717		/*
718		 * A bound thread always has its tcb set; this prevents
719		 * some code from blindly setting the bound thread's tcb
720		 * to NULL (buggy code?).
721		 */
722		_tcb_set(curkse->k_kcb, curthread->tcb);
723	}
724
725	curthread->critical_yield = 0;
726	curthread->need_switchout = 0;
727
728	/*
729	 * Lock the scheduling queue.
730	 *
731	 * There is no scheduling queue for single threaded KSEs,
732	 * but we need a lock for protection regardless.
733	 */
734	if (curthread->lock_switch == 0)
735		KSE_SCHED_LOCK(curkse, curkse->k_kseg);
736
737	/*
738	 * This has to do the job of kse_switchout_thread(), only
739	 * for a single threaded KSE/KSEG.
740	 */
741
742	switch (curthread->state) {
743	case PS_MUTEX_WAIT:
744	case PS_COND_WAIT:
745		if (THR_NEED_CANCEL(curthread)) {
746			curthread->interrupted = 1;
747			curthread->continuation = _thr_finish_cancellation;
748			THR_SET_STATE(curthread, PS_RUNNING);
749		}
750		break;
751
752	case PS_LOCKWAIT:
753		/*
754		 * This state doesn't timeout.
755		 */
756		curthread->wakeup_time.tv_sec = -1;
757		curthread->wakeup_time.tv_nsec = -1;
758		level = curthread->locklevel - 1;
759		if (_LCK_GRANTED(&curthread->lockusers[level]))
760			THR_SET_STATE(curthread, PS_RUNNING);
761		break;
762
763	case PS_DEAD:
764		curthread->check_pending = 0;
765		/* Unlock the scheduling queue and exit the KSE and thread. */
766		thr_cleanup(curkse, curthread);
767		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
768		PANIC("bound thread shouldn't get here\n");
769		break;
770
771	case PS_JOIN:
772		if (THR_NEED_CANCEL(curthread)) {
773			curthread->join_status.thread = NULL;
774			THR_SET_STATE(curthread, PS_RUNNING);
775		} else {
776			/*
777			 * This state doesn't timeout.
778			 */
779			curthread->wakeup_time.tv_sec = -1;
780			curthread->wakeup_time.tv_nsec = -1;
781		}
782		break;
783
784	case PS_SUSPENDED:
785		if (THR_NEED_CANCEL(curthread)) {
786			curthread->interrupted = 1;
787			THR_SET_STATE(curthread, PS_RUNNING);
788		} else {
789			/*
790			 * These states don't timeout.
791			 */
792			curthread->wakeup_time.tv_sec = -1;
793			curthread->wakeup_time.tv_nsec = -1;
794		}
795		break;
796
797	case PS_RUNNING:
798		if ((curthread->flags & THR_FLAGS_SUSPENDED) != 0 &&
799		    !THR_NEED_CANCEL(curthread)) {
800			THR_SET_STATE(curthread, PS_SUSPENDED);
801			/*
802			 * These states don't timeout.
803			 */
804			curthread->wakeup_time.tv_sec = -1;
805			curthread->wakeup_time.tv_nsec = -1;
806		}
807		break;
808
809	case PS_SIGWAIT:
810		PANIC("bound thread does not have SIGWAIT state\n");
811
812	case PS_SLEEP_WAIT:
813		PANIC("bound thread does not have SLEEP_WAIT state\n");
814
815	case PS_SIGSUSPEND:
816		PANIC("bound thread does not have SIGSUSPEND state\n");
817
818	case PS_DEADLOCK:
819		/*
820		 * These states don't timeout and don't need
821		 * to be in the waiting queue.
822		 */
823		curthread->wakeup_time.tv_sec = -1;
824		curthread->wakeup_time.tv_nsec = -1;
825		break;
826
827	default:
828		PANIC("Unknown state\n");
829		break;
830	}
831
832	while (curthread->state != PS_RUNNING) {
833		sigseqno = curkse->k_sigseqno;
834		if (curthread->check_pending != 0) {
835			/*
836			 * Install pending signals into the frame; this may
837			 * cause a mutex or condvar backout.
838			 */
839			curthread->check_pending = 0;
840			SIGFILLSET(sigmask);
841
842			/*
843			 * Lock out kernel signal code when we are processing
844			 * signals, and get a fresh copy of the signal mask.
845			 */
846			__sys_sigprocmask(SIG_SETMASK, &sigmask,
847					  &curthread->sigmask);
848			for (i = 1; i <= _SIG_MAXSIG; i++) {
849				if (SIGISMEMBER(curthread->sigmask, i))
850					continue;
851				if (SIGISMEMBER(curthread->sigpend, i))
852					(void)_thr_sig_add(curthread, i,
853					    &curthread->siginfo[i-1]);
854			}
855			__sys_sigprocmask(SIG_SETMASK, &curthread->sigmask,
856				NULL);
857			/* The above code might make the thread runnable. */
858			if (curthread->state == PS_RUNNING)
859				break;
860		}
861		THR_DEACTIVATE_LAST_LOCK(curthread);
862		kse_wait(curkse, curthread, sigseqno);
863		THR_ACTIVATE_LAST_LOCK(curthread);
864		if (curthread->wakeup_time.tv_sec >= 0) {
865			KSE_GET_TOD(curkse, &ts);
866			if (thr_timedout(curthread, &ts)) {
867				/* Indicate the thread timed out: */
868				curthread->timeout = 1;
869				/* Make the thread runnable. */
870				THR_SET_STATE(curthread, PS_RUNNING);
871			}
872		}
873	}
874
875	if (curthread->lock_switch == 0) {
876		/* Unlock the scheduling queue. */
877		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
878	}
879
880	DBG_MSG("Continuing bound thread %p\n", curthread);
881	if (first) {
882		_kse_critical_leave(&curthread->tcb->tcb_tmbx);
883		pthread_exit(curthread->start_routine(curthread->arg));
884	}
885}
886
887#ifdef DEBUG_THREAD_KERN
888static void
889dump_queues(struct kse *curkse)
890{
891	struct pthread *thread;
892
893	DBG_MSG("Threads in waiting queue:\n");
894	TAILQ_FOREACH(thread, &curkse->k_kseg->kg_schedq.sq_waitq, pqe) {
895		DBG_MSG("  thread %p, state %d, blocked %d\n",
896		    thread, thread->state, thread->blocked);
897	}
898}
899#endif
900
901/*
902 * This is the scheduler for a KSE which runs multiple threads.
903 */
904static void
905kse_sched_multi(struct kse_mailbox *kmbx)
906{
907	struct kse *curkse;
908	struct pthread *curthread, *td_wait;
909	int ret;
910
911	curkse = (struct kse *)kmbx->km_udata;
912	THR_ASSERT(curkse->k_kcb->kcb_kmbx.km_curthread == NULL,
913	    "Mailbox not null in kse_sched_multi");
914
915	/* Check for first time initialization: */
916	if (__predict_false((curkse->k_flags & KF_INITIALIZED) == 0)) {
917		/* Set up this KSE's specific data. */
918		_kcb_set(curkse->k_kcb);
919
920		/* Set this before grabbing the context. */
921		curkse->k_flags |= KF_INITIALIZED;
922	}
923
924	/*
925	 * There is no current thread anymore; calling _get_curthread
926	 * in the UTS should dump core.
927	 */
928	_tcb_set(curkse->k_kcb, NULL);
929
930	/* If this is an upcall, take the scheduler lock. */
931	if (!KSE_IS_SWITCH(curkse))
932		KSE_SCHED_LOCK(curkse, curkse->k_kseg);
933	else
934		KSE_CLEAR_SWITCH(curkse);
935
936	if (KSE_IS_IDLE(curkse)) {
937		KSE_CLEAR_IDLE(curkse);
938		curkse->k_kseg->kg_idle_kses--;
939	}
940
941	/*
942	 * Now that the scheduler lock is held, get the current
943	 * thread.  The KSE's current thread cannot be safely
944	 * examined without the lock because it could have returned
945	 * as completed on another KSE.  See kse_check_completed().
946	 */
947	curthread = curkse->k_curthread;
948
949	/*
950	 * If the current thread was completed in another KSE, then
951	 * it will be in the run queue.  Don't mark it as being blocked.
952	 */
953	if ((curthread != NULL) &&
954	    ((curthread->flags & THR_FLAGS_IN_RUNQ) == 0) &&
955	    (curthread->need_switchout == 0)) {
956		/*
957		 * Assume the current thread is blocked; if it turns out
958		 * to be among the completed threads when they are checked
959		 * below, the blocked flag will be cleared.
961		 */
962		curthread->blocked = 1;
963		DBG_MSG("Running thread %p is now blocked in kernel.\n",
964		    curthread);
965	}
966
967	/* Check for any unblocked threads in the kernel. */
968	kse_check_completed(curkse);
969
970	/*
971	 * Check for threads that have timed-out.
972	 */
973	kse_check_waitq(curkse);
974
975	/*
976	 * Switchout the current thread, if necessary, as the last step
977	 * so that it is inserted into the run queue (if it's runnable)
978	 * _after_ any other threads that were added to it above.
979	 */
980	if (curthread == NULL)
981		;  /* Nothing to do here. */
982	else if ((curthread->need_switchout == 0) && DBG_CAN_RUN(curthread) &&
983	    (curthread->blocked == 0) && (THR_IN_CRITICAL(curthread))) {
984		/*
985		 * Resume the thread and tell it to yield when
986		 * it leaves the critical region.
987		 */
988		curthread->critical_yield = 1;
989		curthread->active = 1;
990		if ((curthread->flags & THR_FLAGS_IN_RUNQ) != 0)
991			KSE_RUNQ_REMOVE(curkse, curthread);
992		curkse->k_curthread = curthread;
993		curthread->kse = curkse;
994		DBG_MSG("Continuing thread %p in critical region\n",
995		    curthread);
996		kse_wakeup_multi(curkse);
997		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
998		ret = _thread_switch(curkse->k_kcb, curthread->tcb, 1);
999		if (ret != 0)
1000			PANIC("Can't resume thread in critical region\n");
1001	}
1002	else if ((curthread->flags & THR_FLAGS_IN_RUNQ) == 0) {
1003		curthread->tcb->tcb_tmbx.tm_lwp = 0;
1004		kse_switchout_thread(curkse, curthread);
1005	}
1006	curkse->k_curthread = NULL;
1007
1008#ifdef DEBUG_THREAD_KERN
1009	dump_queues(curkse);
1010#endif
1011
1012	/* Wait while there are no threads ready to run: */
1013	while (((curthread = KSE_RUNQ_FIRST(curkse)) == NULL) &&
1014	    (curkse->k_kseg->kg_threadcount != 0) &&
1015	    ((curkse->k_flags & KF_TERMINATED) == 0)) {
1016		/*
1017		 * Wait for a thread to become active or until there are
1018		 * no more threads.
1019		 */
1020		td_wait = KSE_WAITQ_FIRST(curkse);
1021		kse_wait(curkse, td_wait, 0);
1022		kse_check_completed(curkse);
1023		kse_check_waitq(curkse);
1024	}
1025
1026	/* Check for no more threads: */
1027	if ((curkse->k_kseg->kg_threadcount == 0) ||
1028	    ((curkse->k_flags & KF_TERMINATED) != 0)) {
1029		/*
1030		 * Normally this shouldn't return, but it will if there
1031		 * are other KSEs running that create new threads that
1032		 * are assigned to this KSE[G].  For instance, if a scope
1033		 * system thread were to create a scope process thread
1034		 * and this kse[g] is the initial kse[g], then that newly
1035		 * created thread would be assigned to us (the initial
1036		 * kse[g]).
1037		 */
1038		kse_wakeup_multi(curkse);
1039		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
1040		kse_fini(curkse);
1041		/* never returns */
1042	}
1043
1044	THR_ASSERT(curthread != NULL,
1045	    "Return from kse_wait/fini without thread.");
1046	THR_ASSERT(curthread->state != PS_DEAD,
1047	    "Trying to resume dead thread!");
1048	KSE_RUNQ_REMOVE(curkse, curthread);
1049
1050	/*
1051	 * Make the selected thread the current thread.
1052	 */
1053	curkse->k_curthread = curthread;
1054
1055	/*
1056	 * Make sure the current thread's kse points to this kse.
1057	 */
1058	curthread->kse = curkse;
1059
1060	/*
1061	 * Reset the time slice if this thread is running for the first
1062	 * time or running again after using its full time slice allocation.
1063	 */
1064	if (curthread->slice_usec == -1)
1065		curthread->slice_usec = 0;
1066
1067	/* Mark the thread active. */
1068	curthread->active = 1;
1069
1070	/*
1071	 * The thread's current signal frame will only be NULL if it
1072	 * is being resumed after being blocked in the kernel.  In
1073	 * this case, and if the thread needs to run down pending
1074	 * signals or needs a cancellation check, we need to add a
1075	 * signal frame to the thread's context.
1076	 */
1077	if (curthread->lock_switch == 0 && curthread->state == PS_RUNNING &&
1078	    (curthread->check_pending != 0 ||
1079	     THR_NEED_ASYNC_CANCEL(curthread)) &&
1080	    !THR_IN_CRITICAL(curthread)) {
1081		curthread->check_pending = 0;
1082		signalcontext(&curthread->tcb->tcb_tmbx.tm_context, 0,
1083		    (__sighandler_t *)thr_resume_wrapper);
1084	}
1085	kse_wakeup_multi(curkse);
1086	/*
1087	 * Continue the thread at its current frame:
1088	 */
1089	if (curthread->lock_switch != 0) {
1090		/*
1091		 * This thread came from a scheduler switch; it will
1092		 * unlock the scheduler lock and set the mailbox.
1093		 */
1094		ret = _thread_switch(curkse->k_kcb, curthread->tcb, 0);
1095	} else {
1096		/* This thread won't unlock the scheduler lock. */
1097		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
1098		ret = _thread_switch(curkse->k_kcb, curthread->tcb, 1);
1099	}
1100	if (ret != 0)
1101		PANIC("Thread has returned from _thread_switch");
1102
1103	/* This point should not be reached. */
1104	PANIC("Thread has returned from _thread_switch");
1105}
1106
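/*
 * Entry point for the signal frame installed by kse_sched_multi():
 * run down pending signals and cancellation checks for the resuming
 * thread, then switch back to its saved context.
 */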
1107static void
1108thr_resume_wrapper(int sig, siginfo_t *siginfo, ucontext_t *ucp)
1109{
1110	struct pthread *curthread = _get_curthread();
1111	struct kse *curkse;
1112	int ret, err_save = errno;
1113
1114	DBG_MSG(">>> sig wrapper\n");
1115	if (curthread->lock_switch)
1116		PANIC("thr_resume_wrapper, lock_switch != 0\n");
1117	thr_resume_check(curthread, ucp);
1118	errno = err_save;
1119	_kse_critical_enter();
1120	curkse = curthread->kse;
1121	curthread->tcb->tcb_tmbx.tm_context = *ucp;
1122	ret = _thread_switch(curkse->k_kcb, curthread->tcb, 1);
1123	if (ret != 0)
1124		PANIC("thr_resume_wrapper: thread has returned "
1125		      "from _thread_switch");
1126	/* THR_SETCONTEXT(ucp); */	/* does not work, why? */
1127}
1128
1129static void
1130thr_resume_check(struct pthread *curthread, ucontext_t *ucp)
1131{
1132	_thr_sig_rundown(curthread, ucp);
1133
1134	if (THR_NEED_ASYNC_CANCEL(curthread))
1135		pthread_testcancel();
1136}
1137
1138/*
1139 * Clean up a thread.  This must be called with the thread's KSE
1140 * scheduling lock held.  The thread must be a thread from the
1141 * KSE's group.
1142 */
1143static void
1144thr_cleanup(struct kse *curkse, struct pthread *thread)
1145{
1146	struct pthread *joiner;
1147	struct kse_mailbox *kmbx = NULL;
1148	int sys_scope;
1149
1150	if ((joiner = thread->joiner) != NULL) {
1151		/* Joinee scheduler lock held; joiner won't leave. */
1152		if (joiner->kseg == curkse->k_kseg) {
1153			if (joiner->join_status.thread == thread) {
1154				joiner->join_status.thread = NULL;
1155				joiner->join_status.ret = thread->ret;
1156				(void)_thr_setrunnable_unlocked(joiner);
1157			}
1158		} else {
1159			KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
1160			/* The joiner may have removed itself and exited. */
1161			if (_thr_ref_add(thread, joiner, 0) == 0) {
1162				KSE_SCHED_LOCK(curkse, joiner->kseg);
1163				if (joiner->join_status.thread == thread) {
1164					joiner->join_status.thread = NULL;
1165					joiner->join_status.ret = thread->ret;
1166					kmbx = _thr_setrunnable_unlocked(joiner);
1167				}
1168				KSE_SCHED_UNLOCK(curkse, joiner->kseg);
1169				_thr_ref_delete(thread, joiner);
1170				if (kmbx != NULL)
1171					kse_wakeup(kmbx);
1172			}
1173			KSE_SCHED_LOCK(curkse, curkse->k_kseg);
1174		}
1175		thread->attr.flags |= PTHREAD_DETACHED;
1176	}
1177
1178	if (!(sys_scope = (thread->attr.flags & PTHREAD_SCOPE_SYSTEM))) {
1179		/*
1180		 * Remove the thread from the KSEG's list of threads.
1181	 	 */
1182		KSEG_THRQ_REMOVE(thread->kseg, thread);
1183		/*
1184		 * Migrate the thread to the main KSE so that this
1185		 * KSE and KSEG can be cleaned when their last thread
1186		 * exits.
1187		 */
1188		thread->kseg = _kse_initial->k_kseg;
1189		thread->kse = _kse_initial;
1190	}
1191
1192	/*
1193	 * We can't hold the thread list lock while holding the
1194	 * scheduler lock.
1195	 */
1196	KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
1197	DBG_MSG("Adding thread %p to GC list\n", thread);
1198	KSE_LOCK_ACQUIRE(curkse, &_thread_list_lock);
1199	thread->tlflags |= TLFLAGS_GC_SAFE;
1200	THR_GCLIST_ADD(thread);
1201	KSE_LOCK_RELEASE(curkse, &_thread_list_lock);
1202	if (sys_scope) {
1203		/*
1204		 * A system scope thread has its own thread group; when
1205		 * the thread exits, its kse and ksegrp should be recycled
1206		 * as well.  The kse upcall stack belongs to the thread,
1207		 * so clear it here.
1208		 */
1209		curkse->k_stack.ss_sp = 0;
1210		curkse->k_stack.ss_size = 0;
1211		kse_exit();
1212		PANIC("kse_exit() failed for system scope thread");
1213	}
1214	KSE_SCHED_LOCK(curkse, curkse->k_kseg);
1215}
1216
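/*
 * Garbage collect exited threads and trim the caches of free KSEs and
 * KSE groups down to their high water marks.
 */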
1217void
1218_thr_gc(struct pthread *curthread)
1219{
1220	thread_gc(curthread);
1221	kse_gc(curthread);
1222	kseg_gc(curthread);
1223}
1224
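/*
 * Reclaim threads on the GC list: free the stack of each GC-safe
 * thread and fully destroy those that are detached and unreferenced
 * (along with their KSE and KSEG for system scope threads).
 */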
1225static void
1226thread_gc(struct pthread *curthread)
1227{
1228	struct pthread *td, *td_next;
1229	kse_critical_t crit;
1230	TAILQ_HEAD(, pthread) worklist;
1231
1232	TAILQ_INIT(&worklist);
1233	crit = _kse_critical_enter();
1234	KSE_LOCK_ACQUIRE(curthread->kse, &_thread_list_lock);
1235
1236	/* Check the threads waiting for GC. */
1237	for (td = TAILQ_FIRST(&_thread_gc_list); td != NULL; td = td_next) {
1238		td_next = TAILQ_NEXT(td, gcle);
1239		if ((td->tlflags & TLFLAGS_GC_SAFE) == 0)
1240			continue;
1241		else if (((td->attr.flags & PTHREAD_SCOPE_SYSTEM) != 0) &&
1242		    ((td->kse->k_kcb->kcb_kmbx.km_flags & KMF_DONE) == 0)) {
1243			/*
1244			 * The thread and KSE are operating on the same
1245			 * stack.  Wait for the KSE to exit before freeing
1246			 * the thread's stack as well as everything else.
1247			 */
1248			continue;
1249		}
1250		/*
1251		 * Remove the thread from the GC list.  If the thread
1252		 * isn't yet detached, it will get added back to the
1253		 * GC list at a later time.
1254		 */
1255		THR_GCLIST_REMOVE(td);
1256		DBG_MSG("Freeing thread %p stack\n", td);
1257		/*
1258		 * We can free the thread stack since it's no longer
1259		 * in use.
1260		 */
1261		_thr_stack_free(&td->attr);
1262		if (((td->attr.flags & PTHREAD_DETACHED) != 0) &&
1263		    (td->refcount == 0)) {
1264			/*
1265			 * The thread has detached and is no longer
1266			 * referenced.  It is safe to remove all
1267			 * remnants of the thread.
1268			 */
1269			THR_LIST_REMOVE(td);
1270			TAILQ_INSERT_HEAD(&worklist, td, gcle);
1271		}
1272	}
1273	KSE_LOCK_RELEASE(curthread->kse, &_thread_list_lock);
1274	_kse_critical_leave(crit);
1275
1276	while ((td = TAILQ_FIRST(&worklist)) != NULL) {
1277		TAILQ_REMOVE(&worklist, td, gcle);
1278		/*
1279		 * XXX we don't free the initial thread and its kse
1280		 * (if the thread is a bound thread), because there might
1281		 * be code still referencing the initial thread and kse.
1282		 */
1283		if (td == _thr_initial) {
1284			DBG_MSG("Initial thread won't be freed\n");
1285			continue;
1286		}
1287
1288		if ((td->attr.flags & PTHREAD_SCOPE_SYSTEM) != 0) {
1289			crit = _kse_critical_enter();
1290			KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
1291			kse_free_unlocked(td->kse);
1292			kseg_free_unlocked(td->kseg);
1293			KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
1294			_kse_critical_leave(crit);
1295		}
1296		DBG_MSG("Freeing thread %p\n", td);
1297		_thr_free(curthread, td);
1298	}
1299}
1300
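/* Trim the free KSE cache down to MAX_CACHED_KSES. */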
1301static void
1302kse_gc(struct pthread *curthread)
1303{
1304	kse_critical_t crit;
1305	TAILQ_HEAD(, kse) worklist;
1306	struct kse *kse;
1307
1308	if (free_kse_count <= MAX_CACHED_KSES)
1309		return;
1310	TAILQ_INIT(&worklist);
1311	crit = _kse_critical_enter();
1312	KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
1313	while (free_kse_count > MAX_CACHED_KSES) {
1314		kse = TAILQ_FIRST(&free_kseq);
1315		TAILQ_REMOVE(&free_kseq, kse, k_qe);
1316		TAILQ_INSERT_HEAD(&worklist, kse, k_qe);
1317		free_kse_count--;
1318	}
1319	KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
1320	_kse_critical_leave(crit);
1321
1322	while ((kse = TAILQ_FIRST(&worklist))) {
1323		TAILQ_REMOVE(&worklist, kse, k_qe);
1324		kse_destroy(kse);
1325	}
1326}
1327
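/* Trim the free KSE group cache down to MAX_CACHED_KSEGS. */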
1328static void
1329kseg_gc(struct pthread *curthread)
1330{
1331	kse_critical_t crit;
1332	TAILQ_HEAD(, kse_group) worklist;
1333	struct kse_group *kseg;
1334
1335	if (free_kseg_count <= MAX_CACHED_KSEGS)
1336		return;
1337	crit = _kse_critical_enter();
1338	KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
1339	while (free_kseg_count > MAX_CACHED_KSEGS) {
1340		kseg = TAILQ_FIRST(&free_kse_groupq);
1341		TAILQ_REMOVE(&free_kse_groupq, kseg, kg_qe);
1342		free_kseg_count--;
1343		TAILQ_INSERT_HEAD(&worklist, kseg, kg_qe);
1344	}
1345	KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
1346	_kse_critical_leave(crit);
1347
1348	while ((kseg = TAILQ_FIRST(&worklist))) {
1349		TAILQ_REMOVE(&worklist, kseg, kg_qe);
1350		kseg_destroy(kseg);
1351	}
1352}
1353
1354/*
1355 * Only new threads that are running or suspended may be scheduled.
1356 */
1357int
1358_thr_schedule_add(struct pthread *curthread, struct pthread *newthread)
1359{
1360	kse_critical_t crit;
1361	int ret;
1362
1363	/* Add the new thread. */
1364	thr_link(newthread);
1365
1366	/*
1367	 * If this is the first time creating a thread, make sure
1368	 * the mailbox is set for the current thread.
1369	 */
1370	if ((newthread->attr.flags & PTHREAD_SCOPE_SYSTEM) != 0) {
1371		/* We use the thread's stack as the KSE's stack. */
1372		newthread->kse->k_kcb->kcb_kmbx.km_stack.ss_sp =
1373		    newthread->attr.stackaddr_attr;
1374		newthread->kse->k_kcb->kcb_kmbx.km_stack.ss_size =
1375		    newthread->attr.stacksize_attr;
1376
1377		/*
1378		 * No need to lock the scheduling queue since the
1379		 * KSE/KSEG pair have not yet been started.
1380		 */
1381		KSEG_THRQ_ADD(newthread->kseg, newthread);
1382		/* this thread never gives up kse */
1383		newthread->active = 1;
1384		newthread->kse->k_curthread = newthread;
1385		newthread->kse->k_kcb->kcb_kmbx.km_flags = KMF_BOUND;
1386		newthread->kse->k_kcb->kcb_kmbx.km_func =
1387		    (kse_func_t *)kse_sched_single;
1388		newthread->kse->k_kcb->kcb_kmbx.km_quantum = 0;
1389		KSE_SET_MBOX(newthread->kse, newthread);
1390		/*
1391		 * This thread needs a new KSE and KSEG.
1392		 */
1393		newthread->kse->k_flags &= ~KF_INITIALIZED;
1394		newthread->kse->k_flags |= KF_STARTED;
1395		/* Fire up! */
1396		ret = kse_create(&newthread->kse->k_kcb->kcb_kmbx, 1);
1397		if (ret != 0)
1398			ret = errno;
1399	}
1400	else {
1401		/*
1402		 * Lock the KSE and add the new thread to its list of
1403		 * assigned threads.  If the new thread is runnable, also
1404		 * add it to the KSE's run queue.
1405		 */
1406		crit = _kse_critical_enter();
1407		KSE_SCHED_LOCK(curthread->kse, newthread->kseg);
1408		KSEG_THRQ_ADD(newthread->kseg, newthread);
1409		if (newthread->state == PS_RUNNING)
1410			THR_RUNQ_INSERT_TAIL(newthread);
1411		if ((newthread->kse->k_flags & KF_STARTED) == 0) {
1412			/*
1413			 * This KSE hasn't been started yet.  Start it
1414			 * outside of holding the lock.
1415			 */
1416			newthread->kse->k_flags |= KF_STARTED;
1417			newthread->kse->k_kcb->kcb_kmbx.km_func =
1418			    (kse_func_t *)kse_sched_multi;
1419			newthread->kse->k_kcb->kcb_kmbx.km_flags = 0;
1420			kse_create(&newthread->kse->k_kcb->kcb_kmbx, 0);
1421		 } else if ((newthread->state == PS_RUNNING) &&
1422		     KSE_IS_IDLE(newthread->kse)) {
1423			/*
1424			 * The thread is being scheduled on another KSEG.
1425			 */
1426			kse_wakeup_one(newthread);
1427		}
1428		KSE_SCHED_UNLOCK(curthread->kse, newthread->kseg);
1429		_kse_critical_leave(crit);
1430		ret = 0;
1431	}
1432	if (ret != 0)
1433		thr_unlink(newthread);
1434
1435	return (ret);
1436}
1437
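/*
 * Insert a thread into its KSE's wait queue, which is kept sorted by
 * wakeup time; threads with no timeout (tv_sec == -1) go at the tail.
 */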
1438void
1439kse_waitq_insert(struct pthread *thread)
1440{
1441	struct pthread *td;
1442
1443	if (thread->wakeup_time.tv_sec == -1)
1444		TAILQ_INSERT_TAIL(&thread->kse->k_schedq->sq_waitq, thread,
1445		    pqe);
1446	else {
1447		td = TAILQ_FIRST(&thread->kse->k_schedq->sq_waitq);
1448		while ((td != NULL) && (td->wakeup_time.tv_sec != -1) &&
1449		    ((td->wakeup_time.tv_sec < thread->wakeup_time.tv_sec) ||
1450		    ((td->wakeup_time.tv_sec == thread->wakeup_time.tv_sec) &&
1451		    (td->wakeup_time.tv_nsec <= thread->wakeup_time.tv_nsec))))
1452			td = TAILQ_NEXT(td, pqe);
1453		if (td == NULL)
1454			TAILQ_INSERT_TAIL(&thread->kse->k_schedq->sq_waitq,
1455			    thread, pqe);
1456		else
1457			TAILQ_INSERT_BEFORE(td, thread, pqe);
1458	}
1459	thread->flags |= THR_FLAGS_IN_WAITQ;
1460}
1461
1462/*
1463 * This must be called with the scheduling lock held.
1464 */
1465static void
1466kse_check_completed(struct kse *kse)
1467{
1468	struct pthread *thread;
1469	struct kse_thr_mailbox *completed;
1470	int sig;
1471
1472	if ((completed = kse->k_kcb->kcb_kmbx.km_completed) != NULL) {
1473		kse->k_kcb->kcb_kmbx.km_completed = NULL;
1474		while (completed != NULL) {
1475			thread = completed->tm_udata;
1476			DBG_MSG("Found completed thread %p, name %s\n",
1477			    thread,
1478			    (thread->name == NULL) ? "none" : thread->name);
1479			thread->blocked = 0;
1480			if (thread != kse->k_curthread) {
1481				thr_accounting(thread);
1482				if ((thread->flags & THR_FLAGS_SUSPENDED) != 0)
1483					THR_SET_STATE(thread, PS_SUSPENDED);
1484				else
1485					KSE_RUNQ_INSERT_TAIL(kse, thread);
1486				if ((thread->kse != kse) &&
1487				    (thread->kse->k_curthread == thread)) {
1488					/*
1489					 * Remove this thread from its
1490					 * previous KSE so that it (the KSE)
1491					 * doesn't think it is still active.
1492					 */
1493					thread->kse->k_curthread = NULL;
1494					thread->active = 0;
1495				}
1496			}
1497			if ((sig = thread->tcb->tcb_tmbx.tm_syncsig.si_signo)
1498			    != 0) {
1499				if (SIGISMEMBER(thread->sigmask, sig))
1500					SIGADDSET(thread->sigpend, sig);
1501				else if (THR_IN_CRITICAL(thread))
1502					kse_thr_interrupt(NULL, KSE_INTR_SIGEXIT, sig);
1503				else
1504					(void)_thr_sig_add(thread, sig,
1505					    &thread->tcb->tcb_tmbx.tm_syncsig);
1506				thread->tcb->tcb_tmbx.tm_syncsig.si_signo = 0;
1507			}
1508			completed = completed->tm_next;
1509		}
1510	}
1511}
1512
1513/*
1514 * This must be called with the scheduling lock held.
1515 */
1516static void
1517kse_check_waitq(struct kse *kse)
1518{
1519	struct pthread	*pthread;
1520	struct timespec ts;
1521
1522	KSE_GET_TOD(kse, &ts);
1523
1524	/*
1525	 * Wake up threads that have timed out.  This has to be
1526	 * done before adding the current thread to the run queue
1527	 * so that a CPU intensive thread doesn't get preference
1528	 * over waiting threads.
1529	 */
1530	while (((pthread = KSE_WAITQ_FIRST(kse)) != NULL) &&
1531	    thr_timedout(pthread, &ts)) {
1532		/* Remove the thread from the wait queue: */
1533		KSE_WAITQ_REMOVE(kse, pthread);
1534		DBG_MSG("Found timedout thread %p in waitq\n", pthread);
1535
1536		/* Indicate the thread timed out: */
1537		pthread->timeout = 1;
1538
1539		/* Add the thread to the priority queue: */
1540		if ((pthread->flags & THR_FLAGS_SUSPENDED) != 0)
1541			THR_SET_STATE(pthread, PS_SUSPENDED);
1542		else {
1543			THR_SET_STATE(pthread, PS_RUNNING);
1544			KSE_RUNQ_INSERT_TAIL(kse, pthread);
1545		}
1546	}
1547}
1548
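/*
 * Return non-zero if the thread's wakeup time has passed; a negative
 * tv_sec means the thread never times out.
 */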
1549static int
1550thr_timedout(struct pthread *thread, struct timespec *curtime)
1551{
1552	if (thread->wakeup_time.tv_sec < 0)
1553		return (0);
1554	else if (thread->wakeup_time.tv_sec > curtime->tv_sec)
1555		return (0);
1556	else if ((thread->wakeup_time.tv_sec == curtime->tv_sec) &&
1557	    (thread->wakeup_time.tv_nsec > curtime->tv_nsec))
1558		return (0);
1559	else
1560		return (1);
1561}
1562
1563/*
1564 * This must be called with the scheduling lock held.
1565 *
1566 * Each thread has a time slice, a wakeup time (used when it wants
1567 * to wait for a specified amount of time), a run state, and an
1568 * active flag.
1569 *
1570 * When a thread gets run by the scheduler, the active flag is
1571 * set to non-zero (1).  When a thread performs an explicit yield
1572 * or schedules a state change, it enters the scheduler and the
1573 * active flag is cleared.  When the active flag is still seen
1574 * set in the scheduler, that means that the thread is blocked in
1575 * the kernel (because it is cleared before entering the scheduler
1576 * in all other instances).
1577 *
1578 * The wakeup time is only set for those states that can timeout.
1579 * It is set to (-1, -1) for all other instances.
1580 *
1581 * The thread's run state, aside from being useful when debugging,
1582 * is used to place the thread in an appropriate queue.  There
1583 * are 2 basic queues:
1584 *
1585 *   o run queue - queue ordered by priority for all threads
1586 *                 that are runnable
1587 *   o waiting queue - queue sorted by wakeup time for all threads
1588 *                     that are not otherwise runnable (not blocked
1589 *                     in kernel, not waiting for locks)
1590 *
1591 * The thread's time slice is used for round-robin scheduling
1592 * (the default scheduling policy).  While a SCHED_RR thread
1593 * is runnable, its time slice accumulates.  When it reaches
1594 * the time slice interval, it gets reset and the thread is added
1595 * to the end of the queue of threads at its priority.  When a
1596 * thread is no longer runnable (blocks in the kernel, waits, etc.), its
1597 * time slice is reset.
1598 *
1599 * The job of kse_switchout_thread() is to handle all of the above.
1600 */
1601static void
1602kse_switchout_thread(struct kse *kse, struct pthread *thread)
1603{
1604	int level;
1605	int i;
1606	int restart;
1607	siginfo_t siginfo;
1608
1609	/*
1610	 * Place the currently running thread into the
1611	 * appropriate queue(s).
1612	 */
1613	DBG_MSG("Switching out thread %p, state %d\n", thread, thread->state);
1614
1615	THR_DEACTIVATE_LAST_LOCK(thread);
1616	if (thread->blocked != 0) {
1617		thread->active = 0;
1618		thread->need_switchout = 0;
1619		/* This thread must have blocked in the kernel. */
1620		/*
1621		 * Check for pending signals and cancellation for
1622		 * this thread to see if we need to interrupt it
1623		 * in the kernel.
1624		 */
1625		if (THR_NEED_CANCEL(thread)) {
1626			kse_thr_interrupt(&thread->tcb->tcb_tmbx,
1627					  KSE_INTR_INTERRUPT, 0);
1628		} else if (thread->check_pending != 0) {
1629			for (i = 1; i <= _SIG_MAXSIG; ++i) {
1630				if (SIGISMEMBER(thread->sigpend, i) &&
1631				    !SIGISMEMBER(thread->sigmask, i)) {
1632					restart = _thread_sigact[i - 1].sa_flags & SA_RESTART;
1633					kse_thr_interrupt(&thread->tcb->tcb_tmbx,
1634					    restart ? KSE_INTR_RESTART : KSE_INTR_INTERRUPT, 0);
1635					break;
1636				}
1637			}
1638		}
1639	}
1640	else {
1641		switch (thread->state) {
1642		case PS_MUTEX_WAIT:
1643		case PS_COND_WAIT:
1644			if (THR_NEED_CANCEL(thread)) {
1645				thread->interrupted = 1;
1646				thread->continuation = _thr_finish_cancellation;
1647				THR_SET_STATE(thread, PS_RUNNING);
1648			} else {
1649				/* Insert into the waiting queue: */
1650				KSE_WAITQ_INSERT(kse, thread);
1651			}
1652			break;
1653
1654		case PS_LOCKWAIT:
1655			/*
1656			 * This state doesn't timeout.
1657			 */
1658			thread->wakeup_time.tv_sec = -1;
1659			thread->wakeup_time.tv_nsec = -1;
1660			level = thread->locklevel - 1;
1661			if (!_LCK_GRANTED(&thread->lockusers[level]))
1662				KSE_WAITQ_INSERT(kse, thread);
1663			else
1664				THR_SET_STATE(thread, PS_RUNNING);
1665			break;
1666
1667		case PS_SLEEP_WAIT:
1668		case PS_SIGWAIT:
1669			if (THR_NEED_CANCEL(thread)) {
1670				thread->interrupted = 1;
1671				THR_SET_STATE(thread, PS_RUNNING);
1672			} else {
1673				KSE_WAITQ_INSERT(kse, thread);
1674			}
1675			break;
1676
1677		case PS_JOIN:
1678			if (THR_NEED_CANCEL(thread)) {
1679				thread->join_status.thread = NULL;
1680				THR_SET_STATE(thread, PS_RUNNING);
1681			} else {
1682				/*
1683				 * This state doesn't timeout.
1684				 */
1685				thread->wakeup_time.tv_sec = -1;
1686				thread->wakeup_time.tv_nsec = -1;
1687
1688				/* Insert into the waiting queue: */
1689				KSE_WAITQ_INSERT(kse, thread);
1690			}
1691			break;
1692
1693		case PS_SIGSUSPEND:
1694		case PS_SUSPENDED:
1695			if (THR_NEED_CANCEL(thread)) {
1696				thread->interrupted = 1;
1697				THR_SET_STATE(thread, PS_RUNNING);
1698			} else {
1699				/*
1700				 * These states don't timeout.
1701				 */
1702				thread->wakeup_time.tv_sec = -1;
1703				thread->wakeup_time.tv_nsec = -1;
1704
1705				/* Insert into the waiting queue: */
1706				KSE_WAITQ_INSERT(kse, thread);
1707			}
1708			break;
1709
1710		case PS_DEAD:
1711			/*
1712			 * The scheduler is operating on a different
1713			 * stack.  It is safe to do garbage collection
1714			 * here.
1715			 */
1716			thread->active = 0;
1717			thread->need_switchout = 0;
1718			thread->lock_switch = 0;
1719			thr_cleanup(kse, thread);
1720			return;
1721			break;
1722
1723		case PS_RUNNING:
1724			if ((thread->flags & THR_FLAGS_SUSPENDED) != 0 &&
1725			    !THR_NEED_CANCEL(thread))
1726				THR_SET_STATE(thread, PS_SUSPENDED);
1727			break;
1728
1729		case PS_DEADLOCK:
1730			/*
1731			 * These states don't timeout.
1732			 */
1733			thread->wakeup_time.tv_sec = -1;
1734			thread->wakeup_time.tv_nsec = -1;
1735
1736			/* Insert into the waiting queue: */
1737			KSE_WAITQ_INSERT(kse, thread);
1738			break;
1739
1740		default:
1741			PANIC("Unknown state\n");
1742			break;
1743		}
1744
1745		thr_accounting(thread);
1746		if (thread->state == PS_RUNNING) {
1747			if (thread->slice_usec == -1) {
1748				/*
1749				 * The thread exceeded its time quantum or
1750				 * it yielded the CPU; place it at the tail
1751				 * of the queue for its priority.
1752				 */
1753				KSE_RUNQ_INSERT_TAIL(kse, thread);
1754			} else {
1755				/*
1756				 * The thread hasn't exceeded its interval.
1757				 * Place it at the head of the queue for its
1758				 * priority.
1759				 */
1760				KSE_RUNQ_INSERT_HEAD(kse, thread);
1761			}
1762		}
1763	}
1764	thread->active = 0;
1765	thread->need_switchout = 0;
1766	if (thread->check_pending != 0) {
1767		/* Install pending signals into the frame. */
1768		thread->check_pending = 0;
1769		KSE_LOCK_ACQUIRE(kse, &_thread_signal_lock);
1770		for (i = 1; i <= _SIG_MAXSIG; i++) {
1771			if (SIGISMEMBER(thread->sigmask, i))
1772				continue;
1773			if (SIGISMEMBER(thread->sigpend, i))
1774				(void)_thr_sig_add(thread, i,
1775				    &thread->siginfo[i-1]);
1776			else if (SIGISMEMBER(_thr_proc_sigpending, i) &&
1777				_thr_getprocsig_unlocked(i, &siginfo)) {
1778				(void)_thr_sig_add(thread, i, &siginfo);
1779			}
1780		}
1781		KSE_LOCK_RELEASE(kse, &_thread_signal_lock);
1782	}
1783}
1784
1785/*
1786 * This function waits for the smallest timeout value of any waiting
1787 * thread, or until it receives a message from another KSE.
1788 *
1789 * This must be called with the scheduling lock held.
1790 */
1791static void
1792kse_wait(struct kse *kse, struct pthread *td_wait, int sigseqno)
1793{
1794	struct timespec ts, ts_sleep;
1795	int saved_flags;
1796
1797	if ((td_wait == NULL) || (td_wait->wakeup_time.tv_sec < 0)) {
1798		/* Limit sleep to no more than 1 minute. */
1799		ts_sleep.tv_sec = 60;
1800		ts_sleep.tv_nsec = 0;
1801	} else {
1802		KSE_GET_TOD(kse, &ts);
1803		TIMESPEC_SUB(&ts_sleep, &td_wait->wakeup_time, &ts);
1804		if (ts_sleep.tv_sec > 60) {
1805			ts_sleep.tv_sec = 60;
1806			ts_sleep.tv_nsec = 0;
1807		}
1808	}
1809	/* Don't sleep for negative times. */
1810	if ((ts_sleep.tv_sec >= 0) && (ts_sleep.tv_nsec >= 0)) {
1811		KSE_SET_IDLE(kse);
1812		kse->k_kseg->kg_idle_kses++;
1813		KSE_SCHED_UNLOCK(kse, kse->k_kseg);
1814		if ((kse->k_kseg->kg_flags & KGF_SINGLE_THREAD) &&
1815		    (kse->k_sigseqno != sigseqno))
1816			; /* don't sleep */
1817		else {
1818			saved_flags = kse->k_kcb->kcb_kmbx.km_flags;
1819			kse->k_kcb->kcb_kmbx.km_flags |= KMF_NOUPCALL;
1820			kse_release(&ts_sleep);
1821			kse->k_kcb->kcb_kmbx.km_flags = saved_flags;
1822		}
1823		KSE_SCHED_LOCK(kse, kse->k_kseg);
1824		if (KSE_IS_IDLE(kse)) {
1825			KSE_CLEAR_IDLE(kse);
1826			kse->k_kseg->kg_idle_kses--;
1827		}
1828	}
1829}
1830
1831/*
1832 * This function is named kse_fini() rather than kse_exit() to avoid
1833 * confusion with the system call of the same name.
1834 */
1835static void
1836kse_fini(struct kse *kse)
1837{
1838	/* struct kse_group *free_kseg = NULL; */
1839	struct timespec ts;
1840	struct pthread *td;
1841
1842	/*
1843	 * Check to see if this is one of the main kses.
1844	 */
1845	if (kse->k_kseg != _kse_initial->k_kseg) {
1846		PANIC("shouldn't get here");
1847		/* This is for supporting thread groups. */
1848#ifdef NOT_YET
1849		/* Remove this KSE from the KSEG's list of KSEs. */
1850		KSE_SCHED_LOCK(kse, kse->k_kseg);
1851		TAILQ_REMOVE(&kse->k_kseg->kg_kseq, kse, k_kgqe);
1852		kse->k_kseg->kg_ksecount--;
1853		if (TAILQ_EMPTY(&kse->k_kseg->kg_kseq))
1854			free_kseg = kse->k_kseg;
1855		KSE_SCHED_UNLOCK(kse, kse->k_kseg);
1856
1857		/*
1858		 * Add this KSE to the list of free KSEs along with
1859		 * the KSEG if it is now orphaned.
1860		 */
1861		KSE_LOCK_ACQUIRE(kse, &kse_lock);
1862		if (free_kseg != NULL)
1863			kseg_free_unlocked(free_kseg);
1864		kse_free_unlocked(kse);
1865		KSE_LOCK_RELEASE(kse, &kse_lock);
1866		kse_exit();
1867		/* Never returns. */
1868		PANIC("kse_exit()");
1869#endif
1870	} else {
1871		/*
1872		 * We allow the program to kill KSEs in the initial group
1873		 * (by lowering the concurrency).
1874		 */
1875		if ((kse != _kse_initial) &&
1876		    ((kse->k_flags & KF_TERMINATED) != 0)) {
1877			KSE_SCHED_LOCK(kse, kse->k_kseg);
1878			TAILQ_REMOVE(&kse->k_kseg->kg_kseq, kse, k_kgqe);
1879			kse->k_kseg->kg_ksecount--;
1880			/*
1881			 * Migrate threads to _kse_initial if the last
1882			 * KSE they ran on is this KSE.
1883			 */
1884			td = TAILQ_FIRST(&kse->k_kseg->kg_threadq);
1885			while (td != NULL) {
1886				if (td->kse == kse)
1887					td->kse = _kse_initial;
1888				td = TAILQ_NEXT(td, kle);
1889			}
1890			KSE_SCHED_UNLOCK(kse, kse->k_kseg);
1891			KSE_LOCK_ACQUIRE(kse, &kse_lock);
1892			kse_free_unlocked(kse);
1893			KSE_LOCK_RELEASE(kse, &kse_lock);
1894			/* Make sure there is always at least one KSE awake. */
1895			KSE_WAKEUP(_kse_initial);
1896			kse_exit();
1897			/* Never returns. */
1898			PANIC("kse_exit() failed for initial kseg");
1899		}
1900		KSE_SCHED_LOCK(kse, kse->k_kseg);
1901		KSE_SET_IDLE(kse);
1902		kse->k_kseg->kg_idle_kses++;
1903		KSE_SCHED_UNLOCK(kse, kse->k_kseg);
1904		ts.tv_sec = 120;
1905		ts.tv_nsec = 0;
1906		kse->k_kcb->kcb_kmbx.km_flags = 0;
1907		kse_release(&ts);
1908		/* Never returns. */
1909	}
1910}
1911
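/*
 * Set the current thread's wakeup time from a relative timeout:
 * NULL means wait forever, a zero timespec means "immediately", and
 * anything else is added to the current time of day.
 *
 * Hypothetical caller sketch (not code from this file): a timed
 * wait would typically look something like
 *
 *	_thr_set_timeout(&rel_timeout);
 *	THR_SET_STATE(curthread, PS_COND_WAIT);
 *	_thr_sched_switch(curthread);
 *	if (curthread->timeout)
 *		... the wait timed out ...
 *
 * with the scheduler setting curthread->timeout once the wakeup
 * time has passed.
 */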
1912void
1913_thr_set_timeout(const struct timespec *timeout)
1914{
1915	struct pthread	*curthread = _get_curthread();
1916	struct timespec ts;
1917
1918	/* Reset the timeout flag for the running thread: */
1919	curthread->timeout = 0;
1920
1921	/* Check if the thread is to wait forever: */
1922	if (timeout == NULL) {
1923		/*
1924		 * Set the wakeup time to something that can be recognized as
1925		 * different from an actual time of day:
1926		 */
1927		curthread->wakeup_time.tv_sec = -1;
1928		curthread->wakeup_time.tv_nsec = -1;
1929	}
1930	/* Check if no waiting is required: */
1931	else if ((timeout->tv_sec == 0) && (timeout->tv_nsec == 0)) {
1932		/* Set the wake up time to 'immediately': */
1933		curthread->wakeup_time.tv_sec = 0;
1934		curthread->wakeup_time.tv_nsec = 0;
1935	} else {
1936		/* Calculate the time for the current thread to wakeup: */
1937		KSE_GET_TOD(curthread->kse, &ts);
1938		TIMESPEC_ADD(&curthread->wakeup_time, &ts, timeout);
1939	}
1940}
1941
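/*
 * Format a fatal error message (file, line, and text), write it
 * directly to stderr using the raw system call, and abort.
 */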
1942void
1943_thr_panic_exit(char *file, int line, char *msg)
1944{
1945	char buf[256];
1946
1947	snprintf(buf, sizeof(buf), "(%s:%d) %s\n", file, line, msg);
1948	__sys_write(2, buf, strlen(buf));
1949	abort();
1950}
1951
1952void
1953_thr_setrunnable(struct pthread *curthread, struct pthread *thread)
1954{
1955	kse_critical_t crit;
1956	struct kse_mailbox *kmbx;
1957
1958	crit = _kse_critical_enter();
1959	KSE_SCHED_LOCK(curthread->kse, thread->kseg);
1960	kmbx = _thr_setrunnable_unlocked(thread);
1961	KSE_SCHED_UNLOCK(curthread->kse, thread->kseg);
1962	_kse_critical_leave(crit);
1963	if ((kmbx != NULL) && (__isthreaded != 0))
1964		kse_wakeup(kmbx);
1965}
1966
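/*
 * Same as _thr_setrunnable(), but the caller must already hold the
 * target thread's scheduling lock.  Returns the mailbox of a KSE
 * that should be woken with kse_wakeup() after the lock is dropped,
 * or NULL if no wakeup is needed.
 */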
1967struct kse_mailbox *
1968_thr_setrunnable_unlocked(struct pthread *thread)
1969{
1970	struct kse_mailbox *kmbx = NULL;
1971
1972	if ((thread->kseg->kg_flags & KGF_SINGLE_THREAD) != 0) {
1973		/* No silly queues for these threads. */
1974		if ((thread->flags & THR_FLAGS_SUSPENDED) != 0)
1975			THR_SET_STATE(thread, PS_SUSPENDED);
1976		else {
1977			THR_SET_STATE(thread, PS_RUNNING);
1978			kmbx = kse_wakeup_one(thread);
1979		}
1980
1981	} else if (thread->state != PS_RUNNING) {
1982		if ((thread->flags & THR_FLAGS_IN_WAITQ) != 0)
1983			KSE_WAITQ_REMOVE(thread->kse, thread);
1984		if ((thread->flags & THR_FLAGS_SUSPENDED) != 0)
1985			THR_SET_STATE(thread, PS_SUSPENDED);
1986		else {
1987			THR_SET_STATE(thread, PS_RUNNING);
1988			if ((thread->blocked == 0) && (thread->active == 0) &&
1989			    (thread->flags & THR_FLAGS_IN_RUNQ) == 0)
1990				THR_RUNQ_INSERT_TAIL(thread);
1991			/*
1992			 * XXX - Threads are not yet assigned to specific
1993			 *       KSEs; they are assigned to the KSEG.  So
1994			 *       the fact that a thread's KSE is waiting
1995			 *       doesn't necessarily mean that it will be
1996			 *       the KSE that runs the thread after the
1997			 *       lock is granted.  But we don't know if the
1998			 *       other KSEs within the same KSEG are also
1999			 *       in a waiting state or not, so we err on the
2000			 *       side of caution and wake up the thread's
2001			 *       last known KSE.  We ensure that the
2002			 *       thread's KSE doesn't change while its
2003			 *       scheduling lock is held, so it is safe to
2004			 *       reference it (the KSE).  If the KSE wakes
2005			 *       up and doesn't find any more work it will
2006			 *       again go back to waiting so no harm is
2007			 *       done.
2008			 */
2009			kmbx = kse_wakeup_one(thread);
2010		}
2011	}
2012	return (kmbx);
2013}
2014
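/*
 * Pick a KSE to run a newly runnable thread: the thread's last KSE
 * if it is idle, otherwise any idle KSE in the thread's group.  The
 * chosen KSE is marked as no longer idle and its mailbox is returned
 * so the caller can kse_wakeup() it; NULL means no KSE in the group
 * is idle.
 */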
2015static struct kse_mailbox *
2016kse_wakeup_one(struct pthread *thread)
2017{
2018	struct kse *ke;
2019
2020	if (KSE_IS_IDLE(thread->kse)) {
2021		KSE_CLEAR_IDLE(thread->kse);
2022		thread->kseg->kg_idle_kses--;
2023		return (&thread->kse->k_kcb->kcb_kmbx);
2024	} else {
2025		TAILQ_FOREACH(ke, &thread->kseg->kg_kseq, k_kgqe) {
2026			if (KSE_IS_IDLE(ke)) {
2027				KSE_CLEAR_IDLE(ke);
2028				ke->k_kseg->kg_idle_kses--;
2029				return (&ke->k_kcb->kcb_kmbx);
2030			}
2031		}
2032	}
2033	return (NULL);
2034}
2035
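/*
 * Wake one idle KSE in the current group for each runnable thread
 * on the group's run queue (up to the number of idle KSEs), so that
 * queued threads are not left waiting while KSEs sleep.
 */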
2036static void
2037kse_wakeup_multi(struct kse *curkse)
2038{
2039	struct kse *ke;
2040	int tmp;
2041
2042	if ((tmp = KSE_RUNQ_THREADS(curkse)) && curkse->k_kseg->kg_idle_kses) {
2043		TAILQ_FOREACH(ke, &curkse->k_kseg->kg_kseq, k_kgqe) {
2044			if (KSE_IS_IDLE(ke)) {
2045				KSE_CLEAR_IDLE(ke);
2046				ke->k_kseg->kg_idle_kses--;
2047				KSE_WAKEUP(ke);
2048				if (--tmp == 0)
2049					break;
2050			}
2051		}
2052	}
2053}
2054
2055/*
2056 * Allocate a new KSEG.
2057 *
2058 * We allow the current thread to be NULL in the case that this
2059 * is the first time a KSEG is being created (library initialization).
2060 * In this case, we don't need to (and can't) take any locks.
2061 */
2062struct kse_group *
2063_kseg_alloc(struct pthread *curthread)
2064{
2065	struct kse_group *kseg = NULL;
2066	kse_critical_t crit;
2067
2068	if ((curthread != NULL) && (free_kseg_count > 0)) {
2069		/* Use the kse lock for the kseg queue. */
2070		crit = _kse_critical_enter();
2071		KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
2072		if ((kseg = TAILQ_FIRST(&free_kse_groupq)) != NULL) {
2073			TAILQ_REMOVE(&free_kse_groupq, kseg, kg_qe);
2074			free_kseg_count--;
2075			active_kseg_count++;
2076			TAILQ_INSERT_TAIL(&active_kse_groupq, kseg, kg_qe);
2077		}
2078		KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
2079		_kse_critical_leave(crit);
2080		if (kseg)
2081			kseg_reinit(kseg);
2082	}
2083
2084	/*
2085	 * If a KSE group wasn't found in the free list, attempt to
2086	 * allocate a new one and add it to the list of active KSE
2087	 * groups.
2088	 */
2089	if ((kseg == NULL) &&
2090	    ((kseg = (struct kse_group *)malloc(sizeof(*kseg))) != NULL)) {
2091		if (_pq_alloc(&kseg->kg_schedq.sq_runq,
2092		    THR_MIN_PRIORITY, THR_LAST_PRIORITY) != 0) {
2093			free(kseg);
2094			kseg = NULL;
2095		} else {
2096			kseg_init(kseg);
2097			/* Add the KSEG to the list of active KSEGs. */
2098			if (curthread != NULL) {
2099				crit = _kse_critical_enter();
2100				KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
2101				active_kseg_count++;
2102				TAILQ_INSERT_TAIL(&active_kse_groupq,
2103				    kseg, kg_qe);
2104				KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
2105				_kse_critical_leave(crit);
2106			} else {
2107				active_kseg_count++;
2108				TAILQ_INSERT_TAIL(&active_kse_groupq,
2109				    kseg, kg_qe);
2110			}
2111		}
2112	}
2113	return (kseg);
2114}
2115
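/* Initialize a freshly malloc()'d KSE group, including its lock. */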
2116static void
2117kseg_init(struct kse_group *kseg)
2118{
2119	kseg_reinit(kseg);
2120	_lock_init(&kseg->kg_lock, LCK_ADAPTIVE, _kse_lock_wait,
2121	    _kse_lock_wakeup);
2122}
2123
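/*
 * Reset a KSE group's queues, counters, and flags; the group's lock
 * is not touched.
 */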
2124static void
2125kseg_reinit(struct kse_group *kseg)
2126{
2127	TAILQ_INIT(&kseg->kg_kseq);
2128	TAILQ_INIT(&kseg->kg_threadq);
2129	TAILQ_INIT(&kseg->kg_schedq.sq_waitq);
2130	kseg->kg_threadcount = 0;
2131	kseg->kg_ksecount = 0;
2132	kseg->kg_idle_kses = 0;
2133	kseg->kg_flags = 0;
2134}
2135
2136/*
2137 * This must be called with the kse lock held and when there are
2138 * no more threads that reference it.
2139 */
2140static void
2141kseg_free_unlocked(struct kse_group *kseg)
2142{
2143	TAILQ_REMOVE(&active_kse_groupq, kseg, kg_qe);
2144	TAILQ_INSERT_HEAD(&free_kse_groupq, kseg, kg_qe);
2145	free_kseg_count++;
2146	active_kseg_count--;
2147}
2148
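/*
 * Locked wrapper around kseg_free_unlocked(): enters a critical
 * region and takes the kse lock before returning the group to the
 * free list.
 */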
2149void
2150_kseg_free(struct kse_group *kseg)
2151{
2152	struct kse *curkse;
2153	kse_critical_t crit;
2154
2155	crit = _kse_critical_enter();
2156	curkse = _get_curkse();
2157	KSE_LOCK_ACQUIRE(curkse, &kse_lock);
2158	kseg_free_unlocked(kseg);
2159	KSE_LOCK_RELEASE(curkse, &kse_lock);
2160	_kse_critical_leave(crit);
2161}
2162
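/*
 * Fully release a KSE group: its lock, its run queue, and the group
 * itself.
 */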
2163static void
2164kseg_destroy(struct kse_group *kseg)
2165{
2166	_lock_destroy(&kseg->kg_lock);
2167	_pq_free(&kseg->kg_schedq.sq_runq);
2168	free(kseg);
2169}
2170
2171/*
2172 * Allocate a new KSE.
2173 *
2174 * We allow the current thread to be NULL in the case that this
2175 * is the first time a KSE is being created (library initialization).
2176 * In this case, we don't need to (and can't) take any locks.
2177 */
2178struct kse *
2179_kse_alloc(struct pthread *curthread, int sys_scope)
2180{
2181	struct kse *kse = NULL;
2182	char *stack;
2183	kse_critical_t crit;
2184	int i;
2185
2186	if ((curthread != NULL) && (free_kse_count > 0)) {
2187		crit = _kse_critical_enter();
2188		KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
2189		/* Search for a finished KSE. */
2190		kse = TAILQ_FIRST(&free_kseq);
2191		while ((kse != NULL) &&
2192		    ((kse->k_kcb->kcb_kmbx.km_flags & KMF_DONE) == 0)) {
2193			kse = TAILQ_NEXT(kse, k_qe);
2194		}
2195		if (kse != NULL) {
2196			DBG_MSG("found an unused kse.\n");
2197			TAILQ_REMOVE(&free_kseq, kse, k_qe);
2198			free_kse_count--;
2199			TAILQ_INSERT_TAIL(&active_kseq, kse, k_qe);
2200			active_kse_count++;
2201		}
2202		KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
2203		_kse_critical_leave(crit);
2204		if (kse != NULL)
2205			kse_reinit(kse, sys_scope);
2206	}
2207	if ((kse == NULL) &&
2208	    ((kse = (struct kse *)malloc(sizeof(*kse))) != NULL)) {
2209		if (sys_scope != 0)
2210			stack = NULL;
2211		else if ((stack = malloc(KSE_STACKSIZE)) == NULL) {
2212			free(kse);
2213			return (NULL);
2214		}
2215		bzero(kse, sizeof(*kse));
2216
2217		/* Initialize KCB without the lock. */
2218		if ((kse->k_kcb = _kcb_ctor(kse)) == NULL) {
2219			if (stack != NULL)
2220				free(stack);
2221			free(kse);
2222			return (NULL);
2223		}
2224
2225		/* Initialize the lockusers. */
2226		for (i = 0; i < MAX_KSE_LOCKLEVEL; i++) {
2227			_lockuser_init(&kse->k_lockusers[i], (void *)kse);
2228			_LCK_SET_PRIVATE2(&kse->k_lockusers[i], NULL);
2229		}
2230		/* _lock_init(kse->k_lock, ...) */
2231
2232		if (curthread != NULL) {
2233			crit = _kse_critical_enter();
2234			KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
2235		}
2236		kse->k_flags = 0;
2237		TAILQ_INSERT_TAIL(&active_kseq, kse, k_qe);
2238		active_kse_count++;
2239		if (curthread != NULL) {
2240			KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
2241			_kse_critical_leave(crit);
2242		}
2243		/*
2244		 * Create the KSE context.
2245		 * Scope system threads (one thread per KSE) do not need a
2246		 * separate upcall stack; such a KSE uses its thread's stack.
2247		 */
2248		if (!sys_scope) {
2249			kse->k_kcb->kcb_kmbx.km_func = (kse_func_t *)kse_sched_multi;
2250			kse->k_stack.ss_sp = stack;
2251			kse->k_stack.ss_size = KSE_STACKSIZE;
2252		} else {
2253			kse->k_kcb->kcb_kmbx.km_func = (kse_func_t *)kse_sched_single;
2254			kse->k_stack.ss_sp = NULL;
2255			kse->k_stack.ss_size = 0;
2256		}
2257		kse->k_kcb->kcb_kmbx.km_udata = (void *)kse;
2258		kse->k_kcb->kcb_kmbx.km_quantum = 20000;
2259		/*
2260		 * We need to keep a copy of the stack in case it
2261		 * doesn't get used; a KSE running a scope system
2262		 * thread will use that thread's stack.
2263		 */
2264		kse->k_kcb->kcb_kmbx.km_stack = kse->k_stack;
2265	}
2266	return (kse);
2267}
2268
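/*
 * Reset a cached KSE for reuse.  Process-scope KSEs get an upcall
 * stack (allocating one if needed) and kse_sched_multi() as their
 * upcall function; system-scope KSEs run kse_sched_single() and use
 * the thread's own stack, so any upcall stack is freed.
 */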
2269static void
2270kse_reinit(struct kse *kse, int sys_scope)
2271{
2272	if (!sys_scope) {
2273		kse->k_kcb->kcb_kmbx.km_func = (kse_func_t *)kse_sched_multi;
2274		if (kse->k_stack.ss_sp == NULL) {
2275			/* XXX check allocation failure */
2276			kse->k_stack.ss_sp = (char *) malloc(KSE_STACKSIZE);
2277			kse->k_stack.ss_size = KSE_STACKSIZE;
2278		}
2279		kse->k_kcb->kcb_kmbx.km_quantum = 20000;
2280	} else {
2281		kse->k_kcb->kcb_kmbx.km_func = (kse_func_t *)kse_sched_single;
2282		if (kse->k_stack.ss_sp)
2283			free(kse->k_stack.ss_sp);
2284		kse->k_stack.ss_sp = NULL;
2285		kse->k_stack.ss_size = 0;
2286		kse->k_kcb->kcb_kmbx.km_quantum = 0;
2287	}
2288	kse->k_kcb->kcb_kmbx.km_stack = kse->k_stack;
2289	kse->k_kcb->kcb_kmbx.km_udata = (void *)kse;
2290	kse->k_kcb->kcb_kmbx.km_curthread = NULL;
2291	kse->k_kcb->kcb_kmbx.km_flags = 0;
2292	kse->k_curthread = NULL;
2293	kse->k_kseg = NULL;
2294	kse->k_schedq = NULL;
2295	kse->k_locklevel = 0;
2296	kse->k_flags = 0;
2297	kse->k_error = 0;
2298	kse->k_cpu = 0;
2299	kse->k_sigseqno = 0;
2300}
2301
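/*
 * Move a KSE from the active list onto the free list for reuse.
 * The kse lock is expected to be held by the caller (see
 * _kse_free()), except during early initialization.
 */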
2302void
2303kse_free_unlocked(struct kse *kse)
2304{
2305	TAILQ_REMOVE(&active_kseq, kse, k_qe);
2306	active_kse_count--;
2307	kse->k_kseg = NULL;
2308	kse->k_kcb->kcb_kmbx.km_quantum = 20000;
2309	kse->k_flags = 0;
2310	TAILQ_INSERT_HEAD(&free_kseq, kse, k_qe);
2311	free_kse_count++;
2312}
2313
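/*
 * Return a KSE to the free list, taking the kse lock first unless
 * there is no current thread yet (early initialization).
 */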
2314void
2315_kse_free(struct pthread *curthread, struct kse *kse)
2316{
2317	kse_critical_t crit;
2318
2319	if (curthread == NULL)
2320		kse_free_unlocked(kse);
2321	else {
2322		crit = _kse_critical_enter();
2323		KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
2324		kse_free_unlocked(kse);
2325		KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
2326		_kse_critical_leave(crit);
2327	}
2328}
2329
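/*
 * Tear down a KSE completely: upcall stack, KCB, lockusers, lock,
 * and the KSE structure itself.
 */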
2330static void
2331kse_destroy(struct kse *kse)
2332{
2333	int i;
2334
2335	if (kse->k_stack.ss_sp != NULL)
2336		free(kse->k_stack.ss_sp);
2337	_kcb_dtor(kse->k_kcb);
2338	for (i = 0; i < MAX_KSE_LOCKLEVEL; ++i)
2339		_lockuser_destroy(&kse->k_lockusers[i]);
2340	_lock_destroy(&kse->k_lock);
2341	free(kse);
2342}
2343
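/*
 * Allocate a thread structure, preferring the free thread cache
 * (after giving the garbage collector a chance to run) and falling
 * back to malloc() plus full initialization of the TCB, siginfo
 * array, and per-thread locks.
 */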
2344struct pthread *
2345_thr_alloc(struct pthread *curthread)
2346{
2347	kse_critical_t	crit;
2348	struct pthread	*thread = NULL;
2349	int i;
2350
2351	if (curthread != NULL) {
2352		if (GC_NEEDED())
2353			_thr_gc(curthread);
2354		if (free_thread_count > 0) {
2355			crit = _kse_critical_enter();
2356			KSE_LOCK_ACQUIRE(curthread->kse, &thread_lock);
2357			if ((thread = TAILQ_FIRST(&free_threadq)) != NULL) {
2358				TAILQ_REMOVE(&free_threadq, thread, tle);
2359				free_thread_count--;
2360			}
2361			KSE_LOCK_RELEASE(curthread->kse, &thread_lock);
2362			_kse_critical_leave(crit);
2363		}
2364	}
2365	if ((thread == NULL) &&
2366	    ((thread = malloc(sizeof(struct pthread))) != NULL)) {
2367		bzero(thread, sizeof(struct pthread));
2368		thread->siginfo = calloc(_SIG_MAXSIG, sizeof(siginfo_t));
2369		if (thread->siginfo == NULL) {
2370			free(thread);
2371			return (NULL);
2372		}
2373		if (curthread) {
2374			_pthread_mutex_lock(&_tcb_mutex);
2375			thread->tcb = _tcb_ctor(thread, 0 /* not initial tls */);
2376			_pthread_mutex_unlock(&_tcb_mutex);
2377		} else {
2378			thread->tcb = _tcb_ctor(thread, 1 /* initial tls */);
2379		}
2380		if (thread->tcb == NULL) {
2381			free(thread->siginfo);
2382			free(thread);
2383			return (NULL);
2384		}
2385		/*
2386		 * Initialize thread locking.
2387		 * Lock initializing needs malloc, so don't
2388		 * enter critical region before doing this!
2389		 */
2390		if (_lock_init(&thread->lock, LCK_ADAPTIVE,
2391		    _thr_lock_wait, _thr_lock_wakeup) != 0)
2392			PANIC("Cannot initialize thread lock");
2393		for (i = 0; i < MAX_THR_LOCKLEVEL; i++) {
2394			_lockuser_init(&thread->lockusers[i], (void *)thread);
2395			_LCK_SET_PRIVATE2(&thread->lockusers[i],
2396			    (void *)thread);
2397		}
2398	}
2399	return (thread);
2400}
2401
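/*
 * Release a thread structure: destroy it outright when there is no
 * current thread or the free-thread cache is full, otherwise stash
 * it on the free thread list for reuse.
 */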
2402void
2403_thr_free(struct pthread *curthread, struct pthread *thread)
2404{
2405	kse_critical_t crit;
2406
2407	DBG_MSG("Freeing thread %p\n", thread);
2408	if (thread->name) {
2409		free(thread->name);
2410		thread->name = NULL;
2411	}
2412	if ((curthread == NULL) || (free_thread_count >= MAX_CACHED_THREADS)) {
2413		thr_destroy(curthread, thread);
2414	} else {
2415		/* Add the thread to the free thread list. */
2416		crit = _kse_critical_enter();
2417		KSE_LOCK_ACQUIRE(curthread->kse, &thread_lock);
2418		TAILQ_INSERT_TAIL(&free_threadq, thread, tle);
2419		free_thread_count++;
2420		KSE_LOCK_RELEASE(curthread->kse, &thread_lock);
2421		_kse_critical_leave(crit);
2422	}
2423}
2424
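/*
 * Free everything owned by a thread structure: lockusers, lock,
 * TCB, siginfo array, and the thread itself.
 */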
2425static void
2426thr_destroy(struct pthread *curthread, struct pthread *thread)
2427{
2428	int i;
2429
2430	for (i = 0; i < MAX_THR_LOCKLEVEL; i++)
2431		_lockuser_destroy(&thread->lockusers[i]);
2432	_lock_destroy(&thread->lock);
2433	if (curthread) {
2434		_pthread_mutex_lock(&_tcb_mutex);
2435		_tcb_dtor(thread->tcb);
2436		_pthread_mutex_unlock(&_tcb_mutex);
2437	} else {
2438		_tcb_dtor(thread->tcb);
2439	}
2440	free(thread->siginfo);
2441	free(thread);
2442}
2443
2444/*
2445 * Add an active thread:
2446 *
2447 *   o Assign the thread a unique id (which GDB uses to track
2448 *     threads).
2449 *   o Add the thread to the list of all threads and increment
2450 *     the number of active threads.
2451 */
2452static void
2453thr_link(struct pthread *thread)
2454{
2455	kse_critical_t crit;
2456	struct kse *curkse;
2457
2458	crit = _kse_critical_enter();
2459	curkse = _get_curkse();
2460	KSE_LOCK_ACQUIRE(curkse, &_thread_list_lock);
2461	/*
2462	 * Initialize the unique id (which GDB uses to track
2463	 * threads), add the thread to the list of all threads,
2464	 * and increment the number of active threads.
2465	 */
2466	thread->uniqueid = next_uniqueid++;
2467	THR_LIST_ADD(thread);
2468	_thread_active_threads++;
2469	KSE_LOCK_RELEASE(curkse, &_thread_list_lock);
2470	_kse_critical_leave(crit);
2471}
2472
2473/*
2474 * Remove an active thread.
2475 */
2476static void
2477thr_unlink(struct pthread *thread)
2478{
2479	kse_critical_t crit;
2480	struct kse *curkse;
2481
2482	crit = _kse_critical_enter();
2483	curkse = _get_curkse();
2484	KSE_LOCK_ACQUIRE(curkse, &_thread_list_lock);
2485	THR_LIST_REMOVE(thread);
2486	_thread_active_threads--;
2487	KSE_LOCK_RELEASE(curkse, &_thread_list_lock);
2488	_kse_critical_leave(crit);
2489}
2490
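/*
 * Thread hash table helpers.  _thr_hash_find() returns the thread
 * pointer if it is a known (hashed) thread and NULL otherwise, which
 * callers can use to check whether a pthread pointer refers to a
 * valid thread.
 */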
2491void
2492_thr_hash_add(struct pthread *thread)
2493{
2494	struct thread_hash_head *head;
2495
2496	head = &thr_hashtable[THREAD_HASH(thread)];
2497	LIST_INSERT_HEAD(head, thread, hle);
2498}
2499
2500void
2501_thr_hash_remove(struct pthread *thread)
2502{
2503	LIST_REMOVE(thread, hle);
2504}
2505
2506struct pthread *
2507_thr_hash_find(struct pthread *thread)
2508{
2509	struct pthread *td;
2510	struct thread_hash_head *head;
2511
2512	head = &thr_hashtable[THREAD_HASH(thread)];
2513	LIST_FOREACH(td, head, hle) {
2514		if (td == thread)
2515			return (thread);
2516	}
2517	return (NULL);
2518}
2519
2520void
2521_thr_debug_check_yield(struct pthread *curthread)
2522{
2523	/*
2524	 * Note that TMDF_SUSPEND is set after the process is
2525	 * suspended.  When we are being debugged, every process
2526	 * suspension causes all KSEs to schedule an upcall in the
2527	 * kernel, unless a KSE is in a critical region.  If this
2528	 * function is called, the KSE is no longer in a critical
2529	 * region; if the debugger set TMDF_SUSPEND before the KSE
2530	 * left its critical region, we catch it here.  If the flag
2531	 * changes while we are testing it, that is not a problem
2532	 * either, because the change only happens after a process
2533	 * suspension event, and a suspension event always causes the
2534	 * KSE to schedule an upcall.  Since we are not in a critical
2535	 * region, the upcall is scheduled successfully and the flag
2536	 * is checked again in kse_sched_multi(); we do not come back
2537	 * until the debugger clears the flag, which happens at the
2538	 * next suspension event.
2539	 */
2540	if (!DBG_CAN_RUN(curthread)) {
2541		if ((curthread->attr.flags & PTHREAD_SCOPE_SYSTEM) == 0)
2542			_thr_sched_switch(curthread);
2543		else
2544			kse_thr_interrupt(&curthread->tcb->tcb_tmbx,
2545				KSE_INTR_DBSUSPEND, 0);
2546	}
2547}
2548