1/*
2 * Copyright (C) 2003 Daniel M. Eischen <deischen@freebsd.org>
3 * Copyright (C) 2002 Jonathon Mini <mini@freebsd.org>
4 * Copyright (c) 1995-1998 John Birrell <jb@cimlogic.com.au>
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 * 3. All advertising materials mentioning features or use of this software
16 *    must display the following acknowledgement:
17 *	This product includes software developed by John Birrell.
18 * 4. Neither the name of the author nor the names of any co-contributors
19 *    may be used to endorse or promote products derived from this software
20 *    without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY JOHN BIRRELL AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 */
35#include <sys/cdefs.h>
36__FBSDID("$FreeBSD: head/lib/libkse/thread/thr_kern.c 135714 2004-09-24 06:36:31Z ssouhlal $");
37
38#include <sys/types.h>
39#include <sys/kse.h>
40#include <sys/ptrace.h>
41#include <sys/signalvar.h>
42#include <sys/queue.h>
43#include <machine/atomic.h>
44#include <machine/sigframe.h>
45
46#include <assert.h>
47#include <errno.h>
48#include <signal.h>
49#include <stdlib.h>
50#include <string.h>
51#include <time.h>
52#include <ucontext.h>
53#include <unistd.h>
54
55#include "atomic_ops.h"
56#include "thr_private.h"
57#include "libc_private.h"
58
59/*#define DEBUG_THREAD_KERN */
60#ifdef DEBUG_THREAD_KERN
61#define DBG_MSG		stdout_debug
62#else
63#define DBG_MSG(x...)
64#endif
65
66/*
67 * Define a high water mark for the maximum number of threads that
68 * will be cached.  Once this level is reached, any extra threads
69 * will be free()'d.
70 */
71#define	MAX_CACHED_THREADS	100
72/*
73 * Define high water marks for the maximum number of KSEs and KSE groups
74 * that will be cached. Because we support 1:1 threading, there can be as
75 * many KSEs and KSE groups as there are threads. Once these levels are
76 * reached, any extra KSEs and KSE groups will be free()'d.
77 */
78#define	MAX_CACHED_KSES		((_thread_scope_system <= 0) ? 50 : 100)
79#define	MAX_CACHED_KSEGS	((_thread_scope_system <= 0) ? 50 : 100)
80
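/* Point the KSE's kernel mailbox current-thread pointer at the given thread's mailbox. */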
81#define	KSE_SET_MBOX(kse, thrd) \
82	(kse)->k_kcb->kcb_kmbx.km_curthread = &(thrd)->tcb->tcb_tmbx
83
84#define	KSE_SET_EXITED(kse)	(kse)->k_flags |= KF_EXITED
85
86/*
87 * Macros for manipulating the run queues.  The priority queue
88 * routines use the thread's pqe link and also handle the setting
89 * and clearing of the thread's THR_FLAGS_IN_RUNQ flag.
90 */
91#define	KSE_RUNQ_INSERT_HEAD(kse, thrd)			\
92	_pq_insert_head(&(kse)->k_schedq->sq_runq, thrd)
93#define	KSE_RUNQ_INSERT_TAIL(kse, thrd)			\
94	_pq_insert_tail(&(kse)->k_schedq->sq_runq, thrd)
95#define	KSE_RUNQ_REMOVE(kse, thrd)			\
96	_pq_remove(&(kse)->k_schedq->sq_runq, thrd)
97#define	KSE_RUNQ_FIRST(kse)				\
98	((_libkse_debug == 0) ?				\
99	 _pq_first(&(kse)->k_schedq->sq_runq) :		\
100	 _pq_first_debug(&(kse)->k_schedq->sq_runq))
101
102#define KSE_RUNQ_THREADS(kse)	((kse)->k_schedq->sq_runq.pq_threads)
103
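/*
 * Cancellation checks.  THR_NEED_CANCEL() is true when a cancel is
 * pending, cancellation is enabled, and the thread is either at a
 * cancellation point or in asynchronous cancellation mode.
 * THR_NEED_ASYNC_CANCEL() is the narrower test that is only true for
 * asynchronous cancellation outside of a cancellation point.
 */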
104#define THR_NEED_CANCEL(thrd)						\
105	 (((thrd)->cancelflags & THR_CANCELLING) != 0 &&		\
106	  ((thrd)->cancelflags & PTHREAD_CANCEL_DISABLE) == 0 &&	\
107	  (((thrd)->cancelflags & THR_AT_CANCEL_POINT) != 0 ||		\
108	   ((thrd)->cancelflags & PTHREAD_CANCEL_ASYNCHRONOUS) != 0))
109
110#define THR_NEED_ASYNC_CANCEL(thrd)					\
111	 (((thrd)->cancelflags & THR_CANCELLING) != 0 &&		\
112	  ((thrd)->cancelflags & PTHREAD_CANCEL_DISABLE) == 0 &&	\
113	  (((thrd)->cancelflags & THR_AT_CANCEL_POINT) == 0 &&		\
114	   ((thrd)->cancelflags & PTHREAD_CANCEL_ASYNCHRONOUS) != 0))
115
116/*
117 * We've got to keep track of everything that is allocated, not only
118 * to have a speedy free list, but also so they can be deallocated
119 * after a fork().
120 */
121static TAILQ_HEAD(, kse)	active_kseq;
122static TAILQ_HEAD(, kse)	free_kseq;
123static TAILQ_HEAD(, kse_group)	free_kse_groupq;
124static TAILQ_HEAD(, kse_group)	active_kse_groupq;
125static TAILQ_HEAD(, kse_group)	gc_ksegq;
126static struct lock		kse_lock;	/* also used for kseg queue */
127static int			free_kse_count = 0;
128static int			free_kseg_count = 0;
129static TAILQ_HEAD(, pthread)	free_threadq;
130static struct lock		thread_lock;
131static int			free_thread_count = 0;
132static int			inited = 0;
133static int			active_kse_count = 0;
134static int			active_kseg_count = 0;
135static u_int64_t		next_uniqueid = 1;
136
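/*
 * Hash table of known threads.  THREAD_HASH() simply reduces the
 * thread pointer modulo the number of hash buckets.
 */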
137LIST_HEAD(thread_hash_head, pthread);
138#define THREAD_HASH_QUEUES	127
139static struct thread_hash_head	thr_hashtable[THREAD_HASH_QUEUES];
140#define	THREAD_HASH(thrd)	((unsigned long)thrd % THREAD_HASH_QUEUES)
141
142#ifdef DEBUG_THREAD_KERN
143static void	dump_queues(struct kse *curkse);
144#endif
145static void	kse_check_completed(struct kse *kse);
146static void	kse_check_waitq(struct kse *kse);
147static void	kse_fini(struct kse *curkse);
148static void	kse_reinit(struct kse *kse, int sys_scope);
149static void	kse_sched_multi(struct kse_mailbox *kmbx);
150static void	kse_sched_single(struct kse_mailbox *kmbx);
151static void	kse_switchout_thread(struct kse *kse, struct pthread *thread);
152static void	kse_wait(struct kse *kse, struct pthread *td_wait, int sigseq);
153static void	kse_free_unlocked(struct kse *kse);
154static void	kse_destroy(struct kse *kse);
155static void	kseg_free_unlocked(struct kse_group *kseg);
156static void	kseg_init(struct kse_group *kseg);
157static void	kseg_reinit(struct kse_group *kseg);
158static void	kseg_destroy(struct kse_group *kseg);
159static void	kse_waitq_insert(struct pthread *thread);
160static void	kse_wakeup_multi(struct kse *curkse);
161static struct kse_mailbox *kse_wakeup_one(struct pthread *thread);
162static void	thr_cleanup(struct kse *kse, struct pthread *curthread);
163static void	thr_link(struct pthread *thread);
164static void	thr_resume_wrapper(int sig, siginfo_t *, ucontext_t *);
165static void	thr_resume_check(struct pthread *curthread, ucontext_t *ucp,
166		    struct pthread_sigframe *psf);
167static int	thr_timedout(struct pthread *thread, struct timespec *curtime);
168static void	thr_unlink(struct pthread *thread);
169static void	thr_destroy(struct pthread *thread);
170static void	thread_gc(struct pthread *thread);
171static void	kse_gc(struct pthread *thread);
172static void	kseg_gc(struct pthread *thread);
173
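/*
 * Charge the user and system ticks accumulated in the thread's mailbox
 * against its time slice.  When the slice is used up, slice_usec is set
 * to -1 so the scheduler knows to place the thread at the tail of its
 * priority queue.  SCHED_FIFO threads are not time sliced.
 */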
174static void __inline
175thr_accounting(struct pthread *thread)
176{
177	if ((thread->slice_usec != -1) &&
178	    (thread->slice_usec <= TIMESLICE_USEC) &&
179	    (thread->attr.sched_policy != SCHED_FIFO)) {
180		thread->slice_usec += (thread->tcb->tcb_tmbx.tm_uticks
181		    + thread->tcb->tcb_tmbx.tm_sticks) * _clock_res_usec;
182		/* Check for time quantum exceeded: */
183		if (thread->slice_usec > TIMESLICE_USEC)
184			thread->slice_usec = -1;
185	}
186	thread->tcb->tcb_tmbx.tm_uticks = 0;
187	thread->tcb->tcb_tmbx.tm_sticks = 0;
188}
189
190/*
191 * This is called after a fork().
192 * No locks need to be taken here since we are guaranteed to be
193 * single threaded.
194 *
195 * XXX
196 * POSIX says that for a threaded process, the fork() function is used
197 * only to run new programs, and the effects of calling functions
198 * that require certain resources between the call to fork() and
199 * the call to an exec function are undefined.
200 *
201 * It is not safe to free memory after fork(), because these data
202 * structures may be in an inconsistent state.
203 */
204void
205_kse_single_thread(struct pthread *curthread)
206{
207#ifdef NOTYET
208	struct kse *kse;
209	struct kse_group *kseg;
210	struct pthread *thread;
211	kse_critical_t crit;
212	int i;
213
214	if (__isthreaded) {
215		_thr_rtld_fini();
216		_thr_signal_deinit();
217	}
218	__isthreaded = 0;
219	/*
220	 * Restore signal mask early, so any memory problems could
221	 * dump core.
222	 */
223	sigprocmask(SIG_SETMASK, &curthread->sigmask, NULL);
224	_thread_active_threads = 1;
225
226	/*
227	 * Enter a loop to remove and free all threads other than
228	 * the running thread from the active thread list:
229	 */
230	while ((thread = TAILQ_FIRST(&_thread_list)) != NULL) {
231		THR_GCLIST_REMOVE(thread);
232		/*
233		 * Remove this thread from the list (the current
234		 * thread will be removed but re-added by libpthread
235		 * initialization).
236		 */
237		TAILQ_REMOVE(&_thread_list, thread, tle);
238		/* Make sure this isn't the running thread: */
239		if (thread != curthread) {
240			_thr_stack_free(&thread->attr);
241			if (thread->specific != NULL)
242				free(thread->specific);
243			thr_destroy(thread);
244		}
245	}
246
247	TAILQ_INIT(&curthread->mutexq);		/* initialize mutex queue */
248	curthread->joiner = NULL;		/* no joining threads yet */
249	curthread->refcount = 0;
250	SIGEMPTYSET(curthread->sigpend);	/* clear pending signals */
251	if (curthread->specific != NULL) {
252		free(curthread->specific);
253		curthread->specific = NULL;
254		curthread->specific_data_count = 0;
255	}
256
257	/* Free the free KSEs: */
258	while ((kse = TAILQ_FIRST(&free_kseq)) != NULL) {
259		TAILQ_REMOVE(&free_kseq, kse, k_qe);
260		kse_destroy(kse);
261	}
262	free_kse_count = 0;
263
264	/* Free the active KSEs: */
265	while ((kse = TAILQ_FIRST(&active_kseq)) != NULL) {
266		TAILQ_REMOVE(&active_kseq, kse, k_qe);
267		kse_destroy(kse);
268	}
269	active_kse_count = 0;
270
271	/* Free the free KSEGs: */
272	while ((kseg = TAILQ_FIRST(&free_kse_groupq)) != NULL) {
273		TAILQ_REMOVE(&free_kse_groupq, kseg, kg_qe);
274		kseg_destroy(kseg);
275	}
276	free_kseg_count = 0;
277
278	/* Free the active KSEGs: */
279	while ((kseg = TAILQ_FIRST(&active_kse_groupq)) != NULL) {
280		TAILQ_REMOVE(&active_kse_groupq, kseg, kg_qe);
281		kseg_destroy(kseg);
282	}
283	active_kseg_count = 0;
284
285	/* Free the free threads. */
286	while ((thread = TAILQ_FIRST(&free_threadq)) != NULL) {
287		TAILQ_REMOVE(&free_threadq, thread, tle);
288		thr_destroy(thread);
289	}
290	free_thread_count = 0;
291
292	/* Free the to-be-gc'd threads. */
293	while ((thread = TAILQ_FIRST(&_thread_gc_list)) != NULL) {
294		TAILQ_REMOVE(&_thread_gc_list, thread, gcle);
295		thr_destroy(thread);
296	}
297	TAILQ_INIT(&gc_ksegq);
298	_gc_count = 0;
299
300	if (inited != 0) {
301		/*
302		 * Destroy these locks; they'll be recreated to ensure they
303		 * are in the unlocked state.
304		 */
305		_lock_destroy(&kse_lock);
306		_lock_destroy(&thread_lock);
307		_lock_destroy(&_thread_list_lock);
308		inited = 0;
309	}
310
311	/*
312	 * After a fork(), the leftover thread goes back to being
313	 * scope process.
314	 */
315	curthread->attr.flags &= ~PTHREAD_SCOPE_SYSTEM;
316	curthread->attr.flags |= PTHREAD_SCOPE_PROCESS;
317
318	/*
319	 * After a fork, we are still operating on the thread's original
320	 * stack.  Don't clear the THR_FLAGS_USER from the thread's
321	 * attribute flags.
322	 */
323
324	/* Initialize the threads library. */
325	curthread->kse = NULL;
326	curthread->kseg = NULL;
327	_kse_initial = NULL;
328	_libpthread_init(curthread);
329#else
330	int i;
331
332	/* Reset the current thread and KSE lock data. */
333	for (i = 0; i < curthread->locklevel; i++) {
334		_lockuser_reinit(&curthread->lockusers[i], (void *)curthread);
335	}
336	curthread->locklevel = 0;
337	for (i = 0; i < curthread->kse->k_locklevel; i++) {
338		_lockuser_reinit(&curthread->kse->k_lockusers[i],
339		    (void *)curthread->kse);
340		_LCK_SET_PRIVATE2(&curthread->kse->k_lockusers[i], NULL);
341	}
342	curthread->kse->k_locklevel = 0;
343	_thr_spinlock_init();
344	if (__isthreaded) {
345		_thr_rtld_fini();
346		_thr_signal_deinit();
347	}
348	__isthreaded = 0;
349	curthread->kse->k_kcb->kcb_kmbx.km_curthread = NULL;
350	curthread->attr.flags |= PTHREAD_SCOPE_SYSTEM;
351
352	/*
353	 * Restore signal mask early, so any memory problems could
354	 * dump core.
355	 */
356	sigprocmask(SIG_SETMASK, &curthread->sigmask, NULL);
357	_thread_active_threads = 1;
358#endif
359}
360
361/*
362 * This is used to initialize housekeeping and to initialize the
363 * KSD for the KSE.
364 */
365void
366_kse_init(void)
367{
368	if (inited == 0) {
369		TAILQ_INIT(&active_kseq);
370		TAILQ_INIT(&active_kse_groupq);
371		TAILQ_INIT(&free_kseq);
372		TAILQ_INIT(&free_kse_groupq);
373		TAILQ_INIT(&free_threadq);
374		TAILQ_INIT(&gc_ksegq);
375		if (_lock_init(&kse_lock, LCK_ADAPTIVE,
376		    _kse_lock_wait, _kse_lock_wakeup) != 0)
377			PANIC("Unable to initialize free KSE queue lock");
378		if (_lock_init(&thread_lock, LCK_ADAPTIVE,
379		    _kse_lock_wait, _kse_lock_wakeup) != 0)
380			PANIC("Unable to initialize free thread queue lock");
381		if (_lock_init(&_thread_list_lock, LCK_ADAPTIVE,
382		    _kse_lock_wait, _kse_lock_wakeup) != 0)
383			PANIC("Unable to initialize thread list lock");
384		active_kse_count = 0;
385		active_kseg_count = 0;
386		_gc_count = 0;
387		inited = 1;
388	}
389}
390
391/*
392 * This is called when the first thread (other than the initial
393 * thread) is created.
394 */
395int
396_kse_setthreaded(int threaded)
397{
398	sigset_t sigset;
399
400	if ((threaded != 0) && (__isthreaded == 0)) {
401		SIGFILLSET(sigset);
402		__sys_sigprocmask(SIG_SETMASK, &sigset, &_thr_initial->sigmask);
403
404		/*
405		 * Tell the kernel to create a KSE for the initial thread
406		 * and enable upcalls in it.
407		 */
408		_kse_initial->k_flags |= KF_STARTED;
409
410		if (_thread_scope_system <= 0) {
411			_thr_initial->attr.flags &= ~PTHREAD_SCOPE_SYSTEM;
412			_kse_initial->k_kseg->kg_flags &= ~KGF_SINGLE_THREAD;
413			_kse_initial->k_kcb->kcb_kmbx.km_curthread = NULL;
414		}
415		else {
416			/*
417			 * For a bound thread, the kernel reads the mailbox pointer
418			 * only once, so we set it here before calling kse_create().
419			 */
420			_tcb_set(_kse_initial->k_kcb, _thr_initial->tcb);
421			KSE_SET_MBOX(_kse_initial, _thr_initial);
422			_kse_initial->k_kcb->kcb_kmbx.km_flags |= KMF_BOUND;
423		}
424
425		/*
426		 * Locking functions in libc are required when there are
427		 * threads other than the initial thread.
428		 */
429		_thr_rtld_init();
430
431		__isthreaded = 1;
432		if (kse_create(&_kse_initial->k_kcb->kcb_kmbx, 0) != 0) {
433			_kse_initial->k_flags &= ~KF_STARTED;
434			__isthreaded = 0;
435			PANIC("kse_create() failed\n");
436			return (-1);
437		}
438		_thr_initial->tcb->tcb_tmbx.tm_lwp =
439			_kse_initial->k_kcb->kcb_kmbx.km_lwp;
440		_thread_activated = 1;
441
442#ifndef SYSTEM_SCOPE_ONLY
443		if (_thread_scope_system <= 0) {
444			/* Set current thread to initial thread */
445			_tcb_set(_kse_initial->k_kcb, _thr_initial->tcb);
446			KSE_SET_MBOX(_kse_initial, _thr_initial);
447			_thr_start_sig_daemon();
448			_thr_setmaxconcurrency();
449		}
450		else
451#endif
452			__sys_sigprocmask(SIG_SETMASK, &_thr_initial->sigmask,
453			    NULL);
454	}
455	return (0);
456}
457
458/*
459 * Lock wait and wakeup handlers for KSE locks.  These are only used by
460 * KSEs, and should never be used by threads.  KSE locks include the
461 * KSE group lock (used for locking the scheduling queue) and the
462 * kse_lock defined above.
463 *
464 * When a KSE lock attempt blocks, the entire KSE blocks allowing another
465 * KSE to run.  For the most part, it doesn't make much sense to try and
466 * schedule another thread because you need to lock the scheduling queue
467 * in order to do that.  And since the KSE lock is used to lock the scheduling
468 * queue, you would just end up blocking again.
469 */
470void
471_kse_lock_wait(struct lock *lock, struct lockuser *lu)
472{
473	struct kse *curkse = (struct kse *)_LCK_GET_PRIVATE(lu);
474	struct timespec ts;
475	int saved_flags;
476
477	if (curkse->k_kcb->kcb_kmbx.km_curthread != NULL)
478		PANIC("kse_lock_wait does not disable upcall.\n");
479	/*
480	 * Enter a loop to wait until we get the lock.
481	 */
482	ts.tv_sec = 0;
483	ts.tv_nsec = 1000000;  /* 1 ms */
484	while (!_LCK_GRANTED(lu)) {
485		/*
486		 * Yield the kse and wait to be notified when the lock
487		 * is granted.
488		 */
489		saved_flags = curkse->k_kcb->kcb_kmbx.km_flags;
490		curkse->k_kcb->kcb_kmbx.km_flags |= KMF_NOUPCALL |
491		    KMF_NOCOMPLETED;
492		kse_release(&ts);
493		curkse->k_kcb->kcb_kmbx.km_flags = saved_flags;
494	}
495}
496
497void
498_kse_lock_wakeup(struct lock *lock, struct lockuser *lu)
499{
500	struct kse *curkse;
501	struct kse *kse;
502	struct kse_mailbox *mbx;
503
504	curkse = _get_curkse();
505	kse = (struct kse *)_LCK_GET_PRIVATE(lu);
506
507	if (kse == curkse)
508		PANIC("KSE trying to wake itself up in lock");
509	else {
510		mbx = &kse->k_kcb->kcb_kmbx;
511		_lock_grant(lock, lu);
512		/*
513		 * Notify the owning kse that it has the lock.
514		 * It is safe to pass an invalid address to kse_wakeup
515		 * even if the mailbox is not in the kernel at all,
516		 * and waking up the wrong KSE is also harmless.
517		 */
518		kse_wakeup(mbx);
519	}
520}
521
522/*
523 * Thread wait and wakeup handlers for thread locks.  These are only used
524 * by threads, never by KSEs.  Thread locks include the per-thread lock
525 * (defined in its structure), and condition variable and mutex locks.
526 */
527void
528_thr_lock_wait(struct lock *lock, struct lockuser *lu)
529{
530	struct pthread *curthread = (struct pthread *)lu->lu_private;
531
532	do {
533		THR_LOCK_SWITCH(curthread);
534		THR_SET_STATE(curthread, PS_LOCKWAIT);
535		_thr_sched_switch_unlocked(curthread);
536	} while (!_LCK_GRANTED(lu));
537}
538
539void
540_thr_lock_wakeup(struct lock *lock, struct lockuser *lu)
541{
542	struct pthread *thread;
543	struct pthread *curthread;
544	struct kse_mailbox *kmbx;
545
546	curthread = _get_curthread();
547	thread = (struct pthread *)_LCK_GET_PRIVATE(lu);
548
549	THR_SCHED_LOCK(curthread, thread);
550	_lock_grant(lock, lu);
551	kmbx = _thr_setrunnable_unlocked(thread);
552	THR_SCHED_UNLOCK(curthread, thread);
553	if (kmbx != NULL)
554		kse_wakeup(kmbx);
555}
556
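/*
 * Enter a KSE critical region.  _kcb_critical_enter() clears the KSE
 * mailbox's current-thread pointer, which effectively disables upcalls;
 * the old pointer is returned as a cookie that must be handed back to
 * _kse_critical_leave().
 */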
557kse_critical_t
558_kse_critical_enter(void)
559{
560	kse_critical_t crit;
561
562	crit = (kse_critical_t)_kcb_critical_enter();
563	return (crit);
564}
565
566void
567_kse_critical_leave(kse_critical_t crit)
568{
569	struct pthread *curthread;
570
571	_kcb_critical_leave((struct kse_thr_mailbox *)crit);
572	if ((crit != NULL) && ((curthread = _get_curthread()) != NULL))
573		THR_YIELD_CHECK(curthread);
574}
575
576int
577_kse_in_critical(void)
578{
579	return (_kcb_in_critical());
580}
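
/*
 * Typical usage of the critical section API elsewhere in this file
 * (illustrative sketch only):
 *
 *	kse_critical_t crit;
 *
 *	crit = _kse_critical_enter();
 *	KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
 *	... manipulate the free/active KSE queues ...
 *	KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
 *	_kse_critical_leave(crit);
 */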
581
582void
583_thr_critical_enter(struct pthread *thread)
584{
585	thread->critical_count++;
586}
587
588void
589_thr_critical_leave(struct pthread *thread)
590{
591	thread->critical_count--;
592	THR_YIELD_CHECK(thread);
593}
594
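/*
 * Voluntarily enter the scheduler.  This variant enters a critical
 * region and takes the KSE group's scheduling lock itself;
 * _thr_sched_switch_unlocked() below assumes the caller has already
 * done so.
 */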
595void
596_thr_sched_switch(struct pthread *curthread)
597{
598	struct kse *curkse;
599
600	(void)_kse_critical_enter();
601	curkse = _get_curkse();
602	KSE_SCHED_LOCK(curkse, curkse->k_kseg);
603	_thr_sched_switch_unlocked(curthread);
604}
605
606/*
607 * XXX - We may need to take the scheduling lock before calling
608 *       this, or perhaps take the lock within here before
609 *       doing anything else.
610 */
611void
612_thr_sched_switch_unlocked(struct pthread *curthread)
613{
614	struct pthread_sigframe psf;
615	struct kse *curkse;
616	volatile int resume_once = 0;
617	ucontext_t *uc;
618
619	/* We're in the scheduler, 5 by 5: */
620	curkse = _get_curkse();
621
622	curthread->need_switchout = 1;	/* The thread yielded on its own. */
623	curthread->critical_yield = 0;	/* No need to yield anymore. */
624
625	/* Thread can unlock the scheduler lock. */
626	curthread->lock_switch = 1;
627
628	/*
629	 * The signal frame is allocated off the stack because
630	 * a thread can be interrupted by other signals while
631	 * it is running down pending signals.
632	 */
633	psf.psf_valid = 0;
634	curthread->curframe = &psf;
635
636	if (curthread->attr.flags & PTHREAD_SCOPE_SYSTEM)
637		kse_sched_single(&curkse->k_kcb->kcb_kmbx);
638	else {
639		if (__predict_false(_libkse_debug != 0)) {
640			/*
641			 * Because the debugger saves single-step status in the
642			 * thread mailbox's tm_dflags, we can safely clear the
643			 * single-step status here.  It will be restored by
644			 * kse_switchin when the thread is switched in again.
645			 * This also lets the UTS run at full speed.
647			 */
648			 ptrace(PT_CLEARSTEP, curkse->k_kcb->kcb_kmbx.km_lwp,
649				(caddr_t) 1, 0);
650		}
651
652		KSE_SET_SWITCH(curkse);
653		_thread_enter_uts(curthread->tcb, curkse->k_kcb);
654	}
655
656	/*
657	 * It is ugly that we must increase the critical count, but since
658	 * we have a frame saved, the state in psf must be backed out
659	 * before we can process signals.
660 	 */
661	curthread->critical_count += psf.psf_valid;
662
663	/*
664	 * Unlock the scheduling queue and leave the
665	 * critical region.
666	 */
667	/* Don't trust this after a switch! */
668	curkse = _get_curkse();
669
670	curthread->lock_switch = 0;
671	KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
672	_kse_critical_leave(&curthread->tcb->tcb_tmbx);
673
674	/*
675	 * This thread is being resumed; check for cancellations.
676	 */
677	if ((psf.psf_valid ||
678	    ((curthread->check_pending || THR_NEED_ASYNC_CANCEL(curthread))
679	    && !THR_IN_CRITICAL(curthread)))) {
680		uc = alloca(sizeof(ucontext_t));
681		resume_once = 0;
682		THR_GETCONTEXT(uc);
683		if (resume_once == 0) {
684			resume_once = 1;
685			curthread->check_pending = 0;
686			thr_resume_check(curthread, uc, &psf);
687		}
688	}
689	THR_ACTIVATE_LAST_LOCK(curthread);
690}
691
692/*
693 * This is the scheduler for a KSE which runs a scope system thread.
694 * The multi-thread KSE scheduler should also work for a single threaded
695 * KSE, but we use a separate scheduler so that it can be fine-tuned
696 * to be more efficient (and perhaps not need a separate stack for
697 * the KSE, allowing it to use the thread's stack).
698 */
699
700static void
701kse_sched_single(struct kse_mailbox *kmbx)
702{
703	struct kse *curkse;
704	struct pthread *curthread;
705	struct timespec ts;
706	sigset_t sigmask;
707	int i, sigseqno, level, first = 0;
708
709	curkse = (struct kse *)kmbx->km_udata;
710	curthread = curkse->k_curthread;
711
712	if (__predict_false((curkse->k_flags & KF_INITIALIZED) == 0)) {
713		/* Set up this KSE's specific data. */
714		_kcb_set(curkse->k_kcb);
715		_tcb_set(curkse->k_kcb, curthread->tcb);
716		curkse->k_flags |= KF_INITIALIZED;
717		first = 1;
718		curthread->active = 1;
719
720		/* Setup kernel signal masks for new thread. */
721		__sys_sigprocmask(SIG_SETMASK, &curthread->sigmask, NULL);
722		/*
723		 * Enter a critical region.  This is meaningless for a bound
724		 * thread; it is done only to satisfy other code that expects
725		 * the mailbox to be cleared.
726		 */
727		(void)_kse_critical_enter();
728 	} else {
729		/*
730		 * A bound thread always has its tcb set; this prevents
731		 * some (possibly buggy) code from blindly setting a bound
732		 * thread's tcb to NULL.
733		 */
734		_tcb_set(curkse->k_kcb, curthread->tcb);
735	}
736
737	curthread->critical_yield = 0;
738	curthread->need_switchout = 0;
739
740	/*
741	 * Lock the scheduling queue.
742	 *
743	 * There is no scheduling queue for single threaded KSEs,
744	 * but we need a lock for protection regardless.
745	 */
746	if (curthread->lock_switch == 0)
747		KSE_SCHED_LOCK(curkse, curkse->k_kseg);
748
749	/*
750	 * This has to do the job of kse_switchout_thread(), only
751	 * for a single threaded KSE/KSEG.
752	 */
753
754	switch (curthread->state) {
755	case PS_MUTEX_WAIT:
756	case PS_COND_WAIT:
757		if (THR_NEED_CANCEL(curthread)) {
758			curthread->interrupted = 1;
759			curthread->continuation = _thr_finish_cancellation;
760			THR_SET_STATE(curthread, PS_RUNNING);
761		}
762		break;
763
764	case PS_LOCKWAIT:
765		/*
766		 * This state doesn't timeout.
767		 */
768		curthread->wakeup_time.tv_sec = -1;
769		curthread->wakeup_time.tv_nsec = -1;
770		level = curthread->locklevel - 1;
771		if (_LCK_GRANTED(&curthread->lockusers[level]))
772			THR_SET_STATE(curthread, PS_RUNNING);
773		break;
774
775	case PS_DEAD:
776		curthread->check_pending = 0;
777		/* Unlock the scheduling queue and exit the KSE and thread. */
778		thr_cleanup(curkse, curthread);
779		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
780		PANIC("bound thread shouldn't get here\n");
781		break;
782
783	case PS_JOIN:
784		if (THR_NEED_CANCEL(curthread)) {
785			curthread->join_status.thread = NULL;
786			THR_SET_STATE(curthread, PS_RUNNING);
787		} else {
788			/*
789			 * This state doesn't timeout.
790			 */
791			curthread->wakeup_time.tv_sec = -1;
792			curthread->wakeup_time.tv_nsec = -1;
793		}
794		break;
795
796	case PS_SUSPENDED:
797		if (THR_NEED_CANCEL(curthread)) {
798			curthread->interrupted = 1;
799			THR_SET_STATE(curthread, PS_RUNNING);
800		} else {
801			/*
802			 * These states don't timeout.
803			 */
804			curthread->wakeup_time.tv_sec = -1;
805			curthread->wakeup_time.tv_nsec = -1;
806		}
807		break;
808
809	case PS_RUNNING:
810		if ((curthread->flags & THR_FLAGS_SUSPENDED) != 0 &&
811		    !THR_NEED_CANCEL(curthread)) {
812			THR_SET_STATE(curthread, PS_SUSPENDED);
813			/*
814			 * These states don't timeout.
815			 */
816			curthread->wakeup_time.tv_sec = -1;
817			curthread->wakeup_time.tv_nsec = -1;
818		}
819		break;
820
821	case PS_SIGWAIT:
822		PANIC("bound thread does not have SIGWAIT state\n");
823
824	case PS_SLEEP_WAIT:
825		PANIC("bound thread does not have SLEEP_WAIT state\n");
826
827	case PS_SIGSUSPEND:
828		PANIC("bound thread does not have SIGSUSPEND state\n");
829
830	case PS_DEADLOCK:
831		/*
832		 * These states don't timeout and don't need
833		 * to be in the waiting queue.
834		 */
835		curthread->wakeup_time.tv_sec = -1;
836		curthread->wakeup_time.tv_nsec = -1;
837		break;
838
839	default:
840		PANIC("Unknown state\n");
841		break;
842	}
843
844	while (curthread->state != PS_RUNNING) {
845		sigseqno = curkse->k_sigseqno;
846		if (curthread->check_pending != 0) {
847			/*
848			 * Install pending signals into the frame; this may
849			 * cause a mutex or condvar backout.
850			 */
851			curthread->check_pending = 0;
852			SIGFILLSET(sigmask);
853
854			/*
855			 * Lock out kernel signal code when we are processing
856			 * signals, and get a fresh copy of the signal mask.
857			 */
858			__sys_sigprocmask(SIG_SETMASK, &sigmask,
859					  &curthread->sigmask);
860			for (i = 1; i <= _SIG_MAXSIG; i++) {
861				if (SIGISMEMBER(curthread->sigmask, i))
862					continue;
863				if (SIGISMEMBER(curthread->sigpend, i))
864					(void)_thr_sig_add(curthread, i,
865					    &curthread->siginfo[i-1]);
866			}
867			__sys_sigprocmask(SIG_SETMASK, &curthread->sigmask,
868				NULL);
869			/* The above code might make the thread runnable. */
870			if (curthread->state == PS_RUNNING)
871				break;
872		}
873		THR_DEACTIVATE_LAST_LOCK(curthread);
874		kse_wait(curkse, curthread, sigseqno);
875		THR_ACTIVATE_LAST_LOCK(curthread);
876		KSE_GET_TOD(curkse, &ts);
877		if (thr_timedout(curthread, &ts)) {
878			/* Indicate that the thread timed out: */
879			curthread->timeout = 1;
880			/* Make the thread runnable. */
881			THR_SET_STATE(curthread, PS_RUNNING);
882		}
883	}
884
885	/* Remove the frame reference. */
886	curthread->curframe = NULL;
887
888	if (curthread->lock_switch == 0) {
889		/* Unlock the scheduling queue. */
890		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
891	}
892
893	DBG_MSG("Continuing bound thread %p\n", curthread);
894	if (first) {
895		_kse_critical_leave(&curthread->tcb->tcb_tmbx);
896		pthread_exit(curthread->start_routine(curthread->arg));
897	}
898}
899
900#ifdef DEBUG_THREAD_KERN
901static void
902dump_queues(struct kse *curkse)
903{
904	struct pthread *thread;
905
906	DBG_MSG("Threads in waiting queue:\n");
907	TAILQ_FOREACH(thread, &curkse->k_kseg->kg_schedq.sq_waitq, pqe) {
908		DBG_MSG("  thread %p, state %d, blocked %d\n",
909		    thread, thread->state, thread->blocked);
910	}
911}
912#endif
913
914/*
915 * This is the scheduler for a KSE which runs multiple threads.
916 */
917static void
918kse_sched_multi(struct kse_mailbox *kmbx)
919{
920	struct kse *curkse;
921	struct pthread *curthread, *td_wait;
922	struct pthread_sigframe *curframe;
923	int ret;
924
925	curkse = (struct kse *)kmbx->km_udata;
926	THR_ASSERT(curkse->k_kcb->kcb_kmbx.km_curthread == NULL,
927	    "Mailbox not null in kse_sched_multi");
928
929	/* Check for first time initialization: */
930	if (__predict_false((curkse->k_flags & KF_INITIALIZED) == 0)) {
931		/* Set up this KSE's specific data. */
932		_kcb_set(curkse->k_kcb);
933
934		/* Set this before grabbing the context. */
935		curkse->k_flags |= KF_INITIALIZED;
936	}
937
938	/*
939	 * There is no current thread anymore; calling _get_curthread in
940	 * the UTS should dump core.
941	 */
942	_tcb_set(curkse->k_kcb, NULL);
943
944	/* If this is an upcall; take the scheduler lock. */
945	if (!KSE_IS_SWITCH(curkse))
946		KSE_SCHED_LOCK(curkse, curkse->k_kseg);
947	else
948		KSE_CLEAR_SWITCH(curkse);
949
950	if (KSE_IS_IDLE(curkse)) {
951		KSE_CLEAR_IDLE(curkse);
952		curkse->k_kseg->kg_idle_kses--;
953	}
954
955	/*
956	 * Now that the scheduler lock is held, get the current
957	 * thread.  The KSE's current thread cannot be safely
958	 * examined without the lock because it could have returned
959	 * as completed on another KSE.  See kse_check_completed().
960	 */
961	curthread = curkse->k_curthread;
962
963	/*
964	 * If the current thread was completed in another KSE, then
965	 * it will be in the run queue.  Don't mark it as being blocked.
966	 */
967	if ((curthread != NULL) &&
968	    ((curthread->flags & THR_FLAGS_IN_RUNQ) == 0) &&
969	    (curthread->need_switchout == 0)) {
970		/*
971		 * Assume the current thread is blocked; when the
972		 * completed threads are checked and if the current
973		 * thread is among the completed, the blocked flag
974		 * will be cleared.
975		 */
976		curthread->blocked = 1;
977	}
978
979	/* Check for any unblocked threads in the kernel. */
980	kse_check_completed(curkse);
981
982	/*
983	 * Check for threads that have timed-out.
984	 */
985	kse_check_waitq(curkse);
986
987	/*
988	 * Switchout the current thread, if necessary, as the last step
989	 * so that it is inserted into the run queue (if it's runnable)
990	 * _after_ any other threads that were added to it above.
991	 */
992	if (curthread == NULL)
993		;  /* Nothing to do here. */
994	else if ((curthread->need_switchout == 0) && DBG_CAN_RUN(curthread) &&
995	    (curthread->blocked == 0) && (THR_IN_CRITICAL(curthread))) {
996		/*
997		 * Resume the thread and tell it to yield when
998		 * it leaves the critical region.
999		 */
1000		curthread->critical_yield = 1;
1001		curthread->active = 1;
1002		if ((curthread->flags & THR_FLAGS_IN_RUNQ) != 0)
1003			KSE_RUNQ_REMOVE(curkse, curthread);
1004		curkse->k_curthread = curthread;
1005		curthread->kse = curkse;
1006		DBG_MSG("Continuing thread %p in critical region\n",
1007		    curthread);
1008		kse_wakeup_multi(curkse);
1009		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
1010		ret = _thread_switch(curkse->k_kcb, curthread->tcb, 1);
1011		if (ret != 0)
1012			PANIC("Can't resume thread in critical region\n");
1013	}
1014	else if ((curthread->flags & THR_FLAGS_IN_RUNQ) == 0) {
1015		curthread->tcb->tcb_tmbx.tm_lwp = 0;
1016		kse_switchout_thread(curkse, curthread);
1017	}
1018	curkse->k_curthread = NULL;
1019
1020#ifdef DEBUG_THREAD_KERN
1021	dump_queues(curkse);
1022#endif
1023
1024	/* Check if there are no threads ready to run: */
1025	while (((curthread = KSE_RUNQ_FIRST(curkse)) == NULL) &&
1026	    (curkse->k_kseg->kg_threadcount != 0) &&
1027	    ((curkse->k_flags & KF_TERMINATED) == 0)) {
1028		/*
1029		 * Wait for a thread to become active or until there are
1030		 * no more threads.
1031		 */
1032		td_wait = KSE_WAITQ_FIRST(curkse);
1033		kse_wait(curkse, td_wait, 0);
1034		kse_check_completed(curkse);
1035		kse_check_waitq(curkse);
1036	}
1037
1038	/* Check for no more threads: */
1039	if ((curkse->k_kseg->kg_threadcount == 0) ||
1040	    ((curkse->k_flags & KF_TERMINATED) != 0)) {
1041		/*
1042		 * Normally this shouldn't return, but it will if there
1043		 * are other KSEs running that create new threads that
1044		 * are assigned to this KSE[G].  For instance, if a scope
1045		 * system thread were to create a scope process thread
1046		 * and this kse[g] is the initial kse[g], then that newly
1047		 * created thread would be assigned to us (the initial
1048		 * kse[g]).
1049		 */
1050		kse_wakeup_multi(curkse);
1051		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
1052		kse_fini(curkse);
1053		/* never returns */
1054	}
1055
1056	THR_ASSERT(curthread != NULL,
1057	    "Return from kse_wait/fini without thread.");
1058	THR_ASSERT(curthread->state != PS_DEAD,
1059	    "Trying to resume dead thread!");
1060	KSE_RUNQ_REMOVE(curkse, curthread);
1061
1062	/*
1063	 * Make the selected thread the current thread.
1064	 */
1065	curkse->k_curthread = curthread;
1066
1067	/*
1068	 * Make sure the current thread's kse points to this kse.
1069	 */
1070	curthread->kse = curkse;
1071
1072	/*
1073	 * Reset the time slice if this thread is running for the first
1074	 * time or running again after using its full time slice allocation.
1075	 */
1076	if (curthread->slice_usec == -1)
1077		curthread->slice_usec = 0;
1078
1079	/* Mark the thread active. */
1080	curthread->active = 1;
1081
1082	/* Remove the frame reference. */
1083	curframe = curthread->curframe;
1084	curthread->curframe = NULL;
1085
1086	/*
1087	 * The thread's current signal frame will only be NULL if it
1088	 * is being resumed after being blocked in the kernel.  In
1089	 * this case, and if the thread needs to run down pending
1090	 * signals or needs a cancellation check, we need to add a
1091	 * signal frame to the thread's context.
1092	 */
1093	if ((curframe == NULL) && (curthread->state == PS_RUNNING) &&
1094	    (curthread->check_pending != 0 ||
1095	     THR_NEED_ASYNC_CANCEL(curthread)) &&
1096	    !THR_IN_CRITICAL(curthread)) {
1097		curthread->check_pending = 0;
1098		signalcontext(&curthread->tcb->tcb_tmbx.tm_context, 0,
1099		    (__sighandler_t *)thr_resume_wrapper);
1100	}
1101	kse_wakeup_multi(curkse);
1102	/*
1103	 * Continue the thread at its current frame:
1104	 */
1105	if (curthread->lock_switch != 0) {
1106		/*
1107		 * This thread came from a scheduler switch; it will
1108		 * unlock the scheduler lock and set the mailbox.
1109		 */
1110		ret = _thread_switch(curkse->k_kcb, curthread->tcb, 0);
1111	} else {
1112		/* This thread won't unlock the scheduler lock. */
1113		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
1114		ret = _thread_switch(curkse->k_kcb, curthread->tcb, 1);
1115	}
1116	if (ret != 0)
1117		PANIC("Thread has returned from _thread_switch");
1118
1119	/* This point should not be reached. */
1120	PANIC("Thread has returned from _thread_switch");
1121}
1122
1123static void
1124thr_resume_wrapper(int sig, siginfo_t *siginfo, ucontext_t *ucp)
1125{
1126	struct pthread *curthread = _get_curthread();
1127	struct kse *curkse;
1128	int ret, err_save = errno;
1129
1130	DBG_MSG(">>> sig wrapper\n");
1131	if (curthread->lock_switch)
1132		PANIC("thr_resume_wrapper, lock_switch != 0\n");
1133	thr_resume_check(curthread, ucp, NULL);
1134	errno = err_save;
1135	_kse_critical_enter();
1136	curkse = _get_curkse();
1137	curthread->tcb->tcb_tmbx.tm_context = *ucp;
1138	ret = _thread_switch(curkse->k_kcb, curthread->tcb, 1);
1139	if (ret != 0)
1140		PANIC("thr_resume_wrapper: thread has returned "
1141		      "from _thread_switch");
1142	/* THR_SETCONTEXT(ucp); */ /* does not work; why? */
1143}
1144
1145static void
1146thr_resume_check(struct pthread *curthread, ucontext_t *ucp,
1147    struct pthread_sigframe *psf)
1148{
1149	_thr_sig_rundown(curthread, ucp, psf);
1150
1151	if (THR_NEED_ASYNC_CANCEL(curthread))
1152		pthread_testcancel();
1153}
1154
1155/*
1156 * Clean up a thread.  This must be called with the thread's KSE
1157 * scheduling lock held.  The thread must be a thread from the
1158 * KSE's group.
1159 */
1160static void
1161thr_cleanup(struct kse *curkse, struct pthread *thread)
1162{
1163	struct pthread *joiner;
1164	struct kse_mailbox *kmbx = NULL;
1165	int sys_scope;
1166
1167	if ((joiner = thread->joiner) != NULL) {
1168		/* Joinee scheduler lock held; joiner won't leave. */
1169		if (joiner->kseg == curkse->k_kseg) {
1170			if (joiner->join_status.thread == thread) {
1171				joiner->join_status.thread = NULL;
1172				joiner->join_status.ret = thread->ret;
1173				(void)_thr_setrunnable_unlocked(joiner);
1174			}
1175		} else {
1176			KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
1177			/* The joiner may have removed itself and exited. */
1178			if (_thr_ref_add(thread, joiner, 0) == 0) {
1179				KSE_SCHED_LOCK(curkse, joiner->kseg);
1180				if (joiner->join_status.thread == thread) {
1181					joiner->join_status.thread = NULL;
1182					joiner->join_status.ret = thread->ret;
1183					kmbx = _thr_setrunnable_unlocked(joiner);
1184				}
1185				KSE_SCHED_UNLOCK(curkse, joiner->kseg);
1186				_thr_ref_delete(thread, joiner);
1187				if (kmbx != NULL)
1188					kse_wakeup(kmbx);
1189			}
1190			KSE_SCHED_LOCK(curkse, curkse->k_kseg);
1191		}
1192		thread->attr.flags |= PTHREAD_DETACHED;
1193	}
1194
1195	if (!(sys_scope = (thread->attr.flags & PTHREAD_SCOPE_SYSTEM))) {
1196		/*
1197		 * Remove the thread from the KSEG's list of threads.
1198	 	 */
1199		KSEG_THRQ_REMOVE(thread->kseg, thread);
1200		/*
1201		 * Migrate the thread to the main KSE so that this
1202		 * KSE and KSEG can be cleaned when their last thread
1203		 * exits.
1204		 */
1205		thread->kseg = _kse_initial->k_kseg;
1206		thread->kse = _kse_initial;
1207	}
1208	thread->flags |= THR_FLAGS_GC_SAFE;
1209
1210	/*
1211	 * We can't hold the thread list lock while holding the
1212	 * scheduler lock.
1213	 */
1214	KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
1215	DBG_MSG("Adding thread %p to GC list\n", thread);
1216	KSE_LOCK_ACQUIRE(curkse, &_thread_list_lock);
1217	THR_GCLIST_ADD(thread);
1218	KSE_LOCK_RELEASE(curkse, &_thread_list_lock);
1219	if (sys_scope) {
1220		/*
1221		 * A system scope thread is its own thread group; when
1222		 * the thread exits, its KSE and KSEG should be recycled
1223		 * as well.  The KSE upcall stack belongs to the thread,
1224		 * so clear it here.
1225		 */
1226		curkse->k_stack.ss_sp = 0;
1227		curkse->k_stack.ss_size = 0;
1228		kse_exit();
1229		PANIC("kse_exit() failed for system scope thread");
1230	}
1231	KSE_SCHED_LOCK(curkse, curkse->k_kseg);
1232}
1233
1234void
1235_thr_gc(struct pthread *curthread)
1236{
1237	thread_gc(curthread);
1238	kse_gc(curthread);
1239	kseg_gc(curthread);
1240}
1241
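/*
 * Walk the GC list and free the stacks of threads that are safe to
 * collect; threads that are also detached and unreferenced are removed
 * from the thread list and freed entirely (along with their KSE and
 * KSEG in the case of system scope threads).
 */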
1242static void
1243thread_gc(struct pthread *curthread)
1244{
1245	struct pthread *td, *td_next;
1246	kse_critical_t crit;
1247	TAILQ_HEAD(, pthread) worklist;
1248
1249	TAILQ_INIT(&worklist);
1250	crit = _kse_critical_enter();
1251	KSE_LOCK_ACQUIRE(curthread->kse, &_thread_list_lock);
1252
1253	/* Check the threads waiting for GC. */
1254	for (td = TAILQ_FIRST(&_thread_gc_list); td != NULL; td = td_next) {
1255		td_next = TAILQ_NEXT(td, gcle);
1256		if ((td->flags & THR_FLAGS_GC_SAFE) == 0)
1257			continue;
1258		else if (((td->attr.flags & PTHREAD_SCOPE_SYSTEM) != 0) &&
1259		    ((td->kse->k_kcb->kcb_kmbx.km_flags & KMF_DONE) == 0)) {
1260			/*
1261			 * The thread and KSE are operating on the same
1262			 * stack.  Wait for the KSE to exit before freeing
1263			 * the thread's stack as well as everything else.
1264			 */
1265			continue;
1266		}
1267		/*
1268		 * Remove the thread from the GC list.  If the thread
1269		 * isn't yet detached, it will get added back to the
1270		 * GC list at a later time.
1271		 */
1272		THR_GCLIST_REMOVE(td);
1273		DBG_MSG("Freeing thread %p stack\n", td);
1274		/*
1275		 * We can free the thread stack since it's no longer
1276		 * in use.
1277		 */
1278		_thr_stack_free(&td->attr);
1279		if (((td->attr.flags & PTHREAD_DETACHED) != 0) &&
1280		    (td->refcount == 0)) {
1281			/*
1282			 * The thread has detached and is no longer
1283			 * referenced.  It is safe to remove all
1284			 * remnants of the thread.
1285			 */
1286			THR_LIST_REMOVE(td);
1287			TAILQ_INSERT_HEAD(&worklist, td, gcle);
1288		}
1289	}
1290	KSE_LOCK_RELEASE(curthread->kse, &_thread_list_lock);
1291	_kse_critical_leave(crit);
1292
1293	while ((td = TAILQ_FIRST(&worklist)) != NULL) {
1294		TAILQ_REMOVE(&worklist, td, gcle);
1295		/*
1296		 * XXX we don't free the initial thread and its KSE
1297		 * (if the thread is bound), because there might still
1298		 * be code referencing the initial thread and KSE.
1299		 */
1300		if (td == _thr_initial) {
1301			DBG_MSG("Initial thread won't be freed\n");
1302			continue;
1303		}
1304
1305		if ((td->attr.flags & PTHREAD_SCOPE_SYSTEM) != 0) {
1306			crit = _kse_critical_enter();
1307			KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
1308			kse_free_unlocked(td->kse);
1309			kseg_free_unlocked(td->kseg);
1310			KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
1311			_kse_critical_leave(crit);
1312		}
1313		DBG_MSG("Freeing thread %p\n", td);
1314		_thr_free(curthread, td);
1315	}
1316}
1317
1318static void
1319kse_gc(struct pthread *curthread)
1320{
1321	kse_critical_t crit;
1322	TAILQ_HEAD(, kse) worklist;
1323	struct kse *kse;
1324
1325	if (free_kse_count <= MAX_CACHED_KSES)
1326		return;
1327	TAILQ_INIT(&worklist);
1328	crit = _kse_critical_enter();
1329	KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
1330	while (free_kse_count > MAX_CACHED_KSES) {
1331		kse = TAILQ_FIRST(&free_kseq);
1332		TAILQ_REMOVE(&free_kseq, kse, k_qe);
1333		TAILQ_INSERT_HEAD(&worklist, kse, k_qe);
1334		free_kse_count--;
1335	}
1336	KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
1337	_kse_critical_leave(crit);
1338
1339	while ((kse = TAILQ_FIRST(&worklist))) {
1340		TAILQ_REMOVE(&worklist, kse, k_qe);
1341		kse_destroy(kse);
1342	}
1343}
1344
1345static void
1346kseg_gc(struct pthread *curthread)
1347{
1348	kse_critical_t crit;
1349	TAILQ_HEAD(, kse_group) worklist;
1350	struct kse_group *kseg;
1351
1352	if (free_kseg_count <= MAX_CACHED_KSEGS)
1353		return;
1354	crit = _kse_critical_enter();
1355	KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
1356	while (free_kseg_count > MAX_CACHED_KSEGS) {
1357		kseg = TAILQ_FIRST(&free_kse_groupq);
1358		TAILQ_REMOVE(&free_kse_groupq, kseg, kg_qe);
1359		free_kseg_count--;
1360		TAILQ_INSERT_HEAD(&worklist, kseg, kg_qe);
1361	}
1362	KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
1363	_kse_critical_leave(crit);
1364
1365	while ((kseg = TAILQ_FIRST(&worklist))) {
1366		TAILQ_REMOVE(&worklist, kseg, kg_qe);
1367		kseg_destroy(kseg);
1368	}
1369}
1370
1371/*
1372 * Only new threads that are running or suspended may be scheduled.
1373 */
1374int
1375_thr_schedule_add(struct pthread *curthread, struct pthread *newthread)
1376{
1377	kse_critical_t crit;
1378	int ret;
1379
1380	/* Add the new thread. */
1381	thr_link(newthread);
1382
1383	/*
1384	 * If this is the first time creating a thread, make sure
1385	 * the mailbox is set for the current thread.
1386	 */
1387	if ((newthread->attr.flags & PTHREAD_SCOPE_SYSTEM) != 0) {
1388		/* We use the thread's stack as the KSE's stack. */
1389		newthread->kse->k_kcb->kcb_kmbx.km_stack.ss_sp =
1390		    newthread->attr.stackaddr_attr;
1391		newthread->kse->k_kcb->kcb_kmbx.km_stack.ss_size =
1392		    newthread->attr.stacksize_attr;
1393
1394		/*
1395		 * No need to lock the scheduling queue since the
1396		 * KSE/KSEG pair have not yet been started.
1397		 */
1398		KSEG_THRQ_ADD(newthread->kseg, newthread);
1399		/* This thread never gives up its KSE. */
1400		newthread->active = 1;
1401		newthread->kse->k_curthread = newthread;
1402		newthread->kse->k_kcb->kcb_kmbx.km_flags = KMF_BOUND;
1403		newthread->kse->k_kcb->kcb_kmbx.km_func =
1404		    (kse_func_t *)kse_sched_single;
1405		newthread->kse->k_kcb->kcb_kmbx.km_quantum = 0;
1406		KSE_SET_MBOX(newthread->kse, newthread);
1407		/*
1408		 * This thread needs a new KSE and KSEG.
1409		 */
1410		newthread->kse->k_flags &= ~KF_INITIALIZED;
1411		newthread->kse->k_flags |= KF_STARTED;
1412		/* Fire up! */
1413		ret = kse_create(&newthread->kse->k_kcb->kcb_kmbx, 1);
1414		if (ret != 0)
1415			ret = errno;
1416	}
1417	else {
1418		/*
1419		 * Lock the KSE and add the new thread to its list of
1420		 * assigned threads.  If the new thread is runnable, also
1421		 * add it to the KSE's run queue.
1422		 */
1423		crit = _kse_critical_enter();
1424		KSE_SCHED_LOCK(curthread->kse, newthread->kseg);
1425		KSEG_THRQ_ADD(newthread->kseg, newthread);
1426		if (newthread->state == PS_RUNNING)
1427			THR_RUNQ_INSERT_TAIL(newthread);
1428		if ((newthread->kse->k_flags & KF_STARTED) == 0) {
1429			/*
1430			 * This KSE hasn't been started yet.  Start it
1431			 * outside of holding the lock.
1432			 */
1433			newthread->kse->k_flags |= KF_STARTED;
1434			newthread->kse->k_kcb->kcb_kmbx.km_func =
1435			    (kse_func_t *)kse_sched_multi;
1436			newthread->kse->k_kcb->kcb_kmbx.km_flags = 0;
1437			kse_create(&newthread->kse->k_kcb->kcb_kmbx, 0);
1438		 } else if ((newthread->state == PS_RUNNING) &&
1439		     KSE_IS_IDLE(newthread->kse)) {
1440			/*
1441			 * The thread is being scheduled on another KSEG.
1442			 */
1443			kse_wakeup_one(newthread);
1444		}
1445		KSE_SCHED_UNLOCK(curthread->kse, newthread->kseg);
1446		_kse_critical_leave(crit);
1447		ret = 0;
1448	}
1449	if (ret != 0)
1450		thr_unlink(newthread);
1451
1452	return (ret);
1453}
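/*
 * Insert a thread into its KSE's wait queue, which is kept sorted by
 * wakeup time; threads with no timeout (tv_sec == -1) go at the tail.
 * This must be called with the scheduling lock held.
 */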
1454
1455void
1456kse_waitq_insert(struct pthread *thread)
1457{
1458	struct pthread *td;
1459
1460	if (thread->wakeup_time.tv_sec == -1)
1461		TAILQ_INSERT_TAIL(&thread->kse->k_schedq->sq_waitq, thread,
1462		    pqe);
1463	else {
1464		td = TAILQ_FIRST(&thread->kse->k_schedq->sq_waitq);
1465		while ((td != NULL) && (td->wakeup_time.tv_sec != -1) &&
1466		    ((td->wakeup_time.tv_sec < thread->wakeup_time.tv_sec) ||
1467		    ((td->wakeup_time.tv_sec == thread->wakeup_time.tv_sec) &&
1468		    (td->wakeup_time.tv_nsec <= thread->wakeup_time.tv_nsec))))
1469			td = TAILQ_NEXT(td, pqe);
1470		if (td == NULL)
1471			TAILQ_INSERT_TAIL(&thread->kse->k_schedq->sq_waitq,
1472			    thread, pqe);
1473		else
1474			TAILQ_INSERT_BEFORE(td, thread, pqe);
1475	}
1476	thread->flags |= THR_FLAGS_IN_WAITQ;
1477}
1478
1479/*
1480 * This must be called with the scheduling lock held.
1481 */
1482static void
1483kse_check_completed(struct kse *kse)
1484{
1485	struct pthread *thread;
1486	struct kse_thr_mailbox *completed;
1487	int sig;
1488
1489	if ((completed = kse->k_kcb->kcb_kmbx.km_completed) != NULL) {
1490		kse->k_kcb->kcb_kmbx.km_completed = NULL;
1491		while (completed != NULL) {
1492			thread = completed->tm_udata;
1493			DBG_MSG("Found completed thread %p, name %s\n",
1494			    thread,
1495			    (thread->name == NULL) ? "none" : thread->name);
1496			thread->blocked = 0;
1497			if (thread != kse->k_curthread) {
1498				thr_accounting(thread);
1499				if ((thread->flags & THR_FLAGS_SUSPENDED) != 0)
1500					THR_SET_STATE(thread, PS_SUSPENDED);
1501				else
1502					KSE_RUNQ_INSERT_TAIL(kse, thread);
1503				if ((thread->kse != kse) &&
1504				    (thread->kse->k_curthread == thread)) {
1505					/*
1506					 * Remove this thread from its
1507					 * previous KSE so that it (the KSE)
1508					 * doesn't think it is still active.
1509					 */
1510					thread->kse->k_curthread = NULL;
1511					thread->active = 0;
1512				}
1513			}
1514			if ((sig = thread->tcb->tcb_tmbx.tm_syncsig.si_signo)
1515			    != 0) {
1516				if (SIGISMEMBER(thread->sigmask, sig))
1517					SIGADDSET(thread->sigpend, sig);
1518				else if (THR_IN_CRITICAL(thread))
1519					kse_thr_interrupt(NULL, KSE_INTR_SIGEXIT, sig);
1520				else
1521					(void)_thr_sig_add(thread, sig,
1522					    &thread->tcb->tcb_tmbx.tm_syncsig);
1523				thread->tcb->tcb_tmbx.tm_syncsig.si_signo = 0;
1524			}
1525			completed = completed->tm_next;
1526		}
1527	}
1528}
1529
1530/*
1531 * This must be called with the scheduling lock held.
1532 */
1533static void
1534kse_check_waitq(struct kse *kse)
1535{
1536	struct pthread	*pthread;
1537	struct timespec ts;
1538
1539	KSE_GET_TOD(kse, &ts);
1540
1541	/*
1542	 * Wake up threads that have timed out.  This has to be
1543	 * done before adding the current thread to the run queue
1544	 * so that a CPU-intensive thread doesn't get preference
1545	 * over waiting threads.
1546	 */
1547	while (((pthread = KSE_WAITQ_FIRST(kse)) != NULL) &&
1548	    thr_timedout(pthread, &ts)) {
1549		/* Remove the thread from the wait queue: */
1550		KSE_WAITQ_REMOVE(kse, pthread);
1551		DBG_MSG("Found timedout thread %p in waitq\n", pthread);
1552
1553		/* Indicate that the thread timed out: */
1554		pthread->timeout = 1;
1555
1556		/* Add the thread to the priority queue: */
1557		if ((pthread->flags & THR_FLAGS_SUSPENDED) != 0)
1558			THR_SET_STATE(pthread, PS_SUSPENDED);
1559		else {
1560			THR_SET_STATE(pthread, PS_RUNNING);
1561			KSE_RUNQ_INSERT_TAIL(kse, pthread);
1562		}
1563	}
1564}
1565
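/*
 * Return non-zero if the thread's wakeup time has passed.  A negative
 * wakeup_time.tv_sec means the thread has no timeout.
 */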
1566static int
1567thr_timedout(struct pthread *thread, struct timespec *curtime)
1568{
1569	if (thread->wakeup_time.tv_sec < 0)
1570		return (0);
1571	else if (thread->wakeup_time.tv_sec > curtime->tv_sec)
1572		return (0);
1573	else if ((thread->wakeup_time.tv_sec == curtime->tv_sec) &&
1574	    (thread->wakeup_time.tv_nsec > curtime->tv_nsec))
1575		return (0);
1576	else
1577		return (1);
1578}
1579
1580/*
1581 * This must be called with the scheduling lock held.
1582 *
1583 * Each thread has a time slice, a wakeup time (used when it wants
1584 * to wait for a specified amount of time), a run state, and an
1585 * active flag.
1586 *
1587 * When a thread gets run by the scheduler, the active flag is
1588 * set to non-zero (1).  When a thread performs an explicit yield
1589 * or schedules a state change, it enters the scheduler and the
1590 * active flag is cleared.  When the active flag is still seen
1591 * set in the scheduler, that means that the thread is blocked in
1592 * the kernel (because it is cleared before entering the scheduler
1593 * in all other instances).
1594 *
1595 * The wakeup time is only set for those states that can timeout.
1596 * It is set to (-1, -1) for all other instances.
1597 *
1598 * The thread's run state, aside from being useful when debugging,
1599 * is used to place the thread in an appropriate queue.  There
1600 * are 2 basic queues:
1601 *
1602 *   o run queue - queue ordered by priority for all threads
1603 *                 that are runnable
1604 *   o waiting queue - queue sorted by wakeup time for all threads
1605 *                     that are not otherwise runnable (not blocked
1606 *                     in kernel, not waiting for locks)
1607 *
1608 * The thread's time slice is used for round-robin scheduling
1609 * (the default scheduling policy).  While a SCHED_RR thread
1610 * is runnable, its time slice accumulates.  When it reaches
1611 * the time slice interval, the slice is reset and the thread is
1612 * added to the end of the queue for its priority.  When a thread
1613 * is no longer runnable (blocks in the kernel, waits, etc.), its
1614 * time slice is reset.
1615 *
1616 * The job of kse_switchout_thread() is to handle all of the above.
1617 */
1618static void
1619kse_switchout_thread(struct kse *kse, struct pthread *thread)
1620{
1621	int level;
1622	int i;
1623	int restart;
1624	siginfo_t siginfo;
1625
1626	/*
1627	 * Place the currently running thread into the
1628	 * appropriate queue(s).
1629	 */
1630	DBG_MSG("Switching out thread %p, state %d\n", thread, thread->state);
1631
1632	THR_DEACTIVATE_LAST_LOCK(thread);
1633	if (thread->blocked != 0) {
1634		thread->active = 0;
1635		thread->need_switchout = 0;
1636		/* This thread must have blocked in the kernel. */
1637		/*
1638		 * Check for pending signals and cancellation for
1639		 * this thread to see if we need to interrupt it
1640		 * in the kernel.
1641		 */
1642		if (THR_NEED_CANCEL(thread)) {
1643			kse_thr_interrupt(&thread->tcb->tcb_tmbx,
1644					  KSE_INTR_INTERRUPT, 0);
1645		} else if (thread->check_pending != 0) {
1646			for (i = 1; i <= _SIG_MAXSIG; ++i) {
1647				if (SIGISMEMBER(thread->sigpend, i) &&
1648				    !SIGISMEMBER(thread->sigmask, i)) {
1649					restart = _thread_sigact[i - 1].sa_flags & SA_RESTART;
1650					kse_thr_interrupt(&thread->tcb->tcb_tmbx,
1651					    restart ? KSE_INTR_RESTART : KSE_INTR_INTERRUPT, 0);
1652					break;
1653				}
1654			}
1655		}
1656	}
1657	else {
1658		switch (thread->state) {
1659		case PS_MUTEX_WAIT:
1660		case PS_COND_WAIT:
1661			if (THR_NEED_CANCEL(thread)) {
1662				thread->interrupted = 1;
1663				thread->continuation = _thr_finish_cancellation;
1664				THR_SET_STATE(thread, PS_RUNNING);
1665			} else {
1666				/* Insert into the waiting queue: */
1667				KSE_WAITQ_INSERT(kse, thread);
1668			}
1669			break;
1670
1671		case PS_LOCKWAIT:
1672			/*
1673			 * This state doesn't timeout.
1674			 */
1675			thread->wakeup_time.tv_sec = -1;
1676			thread->wakeup_time.tv_nsec = -1;
1677			level = thread->locklevel - 1;
1678			if (!_LCK_GRANTED(&thread->lockusers[level]))
1679				KSE_WAITQ_INSERT(kse, thread);
1680			else
1681				THR_SET_STATE(thread, PS_RUNNING);
1682			break;
1683
1684		case PS_SLEEP_WAIT:
1685		case PS_SIGWAIT:
1686			if (THR_NEED_CANCEL(thread)) {
1687				thread->interrupted = 1;
1688				THR_SET_STATE(thread, PS_RUNNING);
1689			} else {
1690				KSE_WAITQ_INSERT(kse, thread);
1691			}
1692			break;
1693
1694		case PS_JOIN:
1695			if (THR_NEED_CANCEL(thread)) {
1696				thread->join_status.thread = NULL;
1697				THR_SET_STATE(thread, PS_RUNNING);
1698			} else {
1699				/*
1700				 * This state doesn't timeout.
1701				 */
1702				thread->wakeup_time.tv_sec = -1;
1703				thread->wakeup_time.tv_nsec = -1;
1704
1705				/* Insert into the waiting queue: */
1706				KSE_WAITQ_INSERT(kse, thread);
1707			}
1708			break;
1709
1710		case PS_SIGSUSPEND:
1711		case PS_SUSPENDED:
1712			if (THR_NEED_CANCEL(thread)) {
1713				thread->interrupted = 1;
1714				THR_SET_STATE(thread, PS_RUNNING);
1715			} else {
1716				/*
1717				 * These states don't timeout.
1718				 */
1719				thread->wakeup_time.tv_sec = -1;
1720				thread->wakeup_time.tv_nsec = -1;
1721
1722				/* Insert into the waiting queue: */
1723				KSE_WAITQ_INSERT(kse, thread);
1724			}
1725			break;
1726
1727		case PS_DEAD:
1728			/*
1729			 * The scheduler is operating on a different
1730			 * stack.  It is safe to do garbage collecting
1731			 * here.
1732			 */
1733			thread->active = 0;
1734			thread->need_switchout = 0;
1735			thread->lock_switch = 0;
1736			thr_cleanup(kse, thread);
1737			return;
1738			break;
1739
1740		case PS_RUNNING:
1741			if ((thread->flags & THR_FLAGS_SUSPENDED) != 0 &&
1742			    !THR_NEED_CANCEL(thread))
1743				THR_SET_STATE(thread, PS_SUSPENDED);
1744			break;
1745
1746		case PS_DEADLOCK:
1747			/*
1748			 * These states don't timeout.
1749			 */
1750			thread->wakeup_time.tv_sec = -1;
1751			thread->wakeup_time.tv_nsec = -1;
1752
1753			/* Insert into the waiting queue: */
1754			KSE_WAITQ_INSERT(kse, thread);
1755			break;
1756
1757		default:
1758			PANIC("Unknown state\n");
1759			break;
1760		}
1761
1762		thr_accounting(thread);
1763		if (thread->state == PS_RUNNING) {
1764			if (thread->slice_usec == -1) {
1765				/*
1766				 * The thread exceeded its time quantum or
1767				 * it yielded the CPU; place it at the tail
1768				 * of the queue for its priority.
1769				 */
1770				KSE_RUNQ_INSERT_TAIL(kse, thread);
1771			} else {
1772				/*
1773				 * The thread hasn't exceeded its interval
1774				 * The thread hasn't exceeded its interval.
1775				 * priority.
1776				 */
1777				KSE_RUNQ_INSERT_HEAD(kse, thread);
1778			}
1779		}
1780	}
1781	thread->active = 0;
1782	thread->need_switchout = 0;
1783	if (thread->check_pending != 0) {
1784		/* Install pending signals into the frame. */
1785		thread->check_pending = 0;
1786		KSE_LOCK_ACQUIRE(kse, &_thread_signal_lock);
1787		for (i = 1; i <= _SIG_MAXSIG; i++) {
1788			if (SIGISMEMBER(thread->sigmask, i))
1789				continue;
1790			if (SIGISMEMBER(thread->sigpend, i))
1791				(void)_thr_sig_add(thread, i,
1792				    &thread->siginfo[i-1]);
1793			else if (SIGISMEMBER(_thr_proc_sigpending, i) &&
1794				_thr_getprocsig_unlocked(i, &siginfo)) {
1795				(void)_thr_sig_add(thread, i, &siginfo);
1796			}
1797		}
1798		KSE_LOCK_RELEASE(kse, &_thread_signal_lock);
1799	}
1800}
1801
1802/*
1803 * This function waits for the smallest timeout value of any waiting
1804 * thread, or until it receives a message from another KSE.
1805 *
1806 * This must be called with the scheduling lock held.
1807 */
1808static void
1809kse_wait(struct kse *kse, struct pthread *td_wait, int sigseqno)
1810{
1811	struct timespec ts, ts_sleep;
1812	int saved_flags;
1813
1814	KSE_GET_TOD(kse, &ts);
1815
1816	if ((td_wait == NULL) || (td_wait->wakeup_time.tv_sec < 0)) {
1817		/* Limit sleep to no more than 1 minute. */
1818		ts_sleep.tv_sec = 60;
1819		ts_sleep.tv_nsec = 0;
1820	} else {
1821		TIMESPEC_SUB(&ts_sleep, &td_wait->wakeup_time, &ts);
1822		if (ts_sleep.tv_sec > 60) {
1823			ts_sleep.tv_sec = 60;
1824			ts_sleep.tv_nsec = 0;
1825		}
1826	}
1827	/* Don't sleep for negative times. */
1828	if ((ts_sleep.tv_sec >= 0) && (ts_sleep.tv_nsec >= 0)) {
1829		KSE_SET_IDLE(kse);
1830		kse->k_kseg->kg_idle_kses++;
1831		KSE_SCHED_UNLOCK(kse, kse->k_kseg);
1832		if ((kse->k_kseg->kg_flags & KGF_SINGLE_THREAD) &&
1833		    (kse->k_sigseqno != sigseqno))
1834			; /* don't sleep */
1835		else {
1836			saved_flags = kse->k_kcb->kcb_kmbx.km_flags;
1837			kse->k_kcb->kcb_kmbx.km_flags |= KMF_NOUPCALL;
1838			kse_release(&ts_sleep);
1839			kse->k_kcb->kcb_kmbx.km_flags = saved_flags;
1840		}
1841		KSE_SCHED_LOCK(kse, kse->k_kseg);
1842		if (KSE_IS_IDLE(kse)) {
1843			KSE_CLEAR_IDLE(kse);
1844			kse->k_kseg->kg_idle_kses--;
1845		}
1846	}
1847}
1848
1849/*
1850 * This is not named kse_exit() so as to avoid confusing it with the
1851 * system call of the same name.
1852 */
1853static void
1854kse_fini(struct kse *kse)
1855{
1856	/* struct kse_group *free_kseg = NULL; */
1857	struct timespec ts;
1858	struct pthread *td;
1859
1860	/*
1861	 * Check to see if this is one of the main kses.
1862	 */
1863	if (kse->k_kseg != _kse_initial->k_kseg) {
1864		PANIC("shouldn't get here");
1865		/* This is for supporting thread groups. */
1866#ifdef NOT_YET
1867		/* Remove this KSE from the KSEG's list of KSEs. */
1868		KSE_SCHED_LOCK(kse, kse->k_kseg);
1869		TAILQ_REMOVE(&kse->k_kseg->kg_kseq, kse, k_kgqe);
1870		kse->k_kseg->kg_ksecount--;
1871		if (TAILQ_EMPTY(&kse->k_kseg->kg_kseq))
1872			free_kseg = kse->k_kseg;
1873		KSE_SCHED_UNLOCK(kse, kse->k_kseg);
1874
1875		/*
1876		 * Add this KSE to the list of free KSEs along with
1877		 * the KSEG if it is now orphaned.
1878		 */
1879		KSE_LOCK_ACQUIRE(kse, &kse_lock);
1880		if (free_kseg != NULL)
1881			kseg_free_unlocked(free_kseg);
1882		kse_free_unlocked(kse);
1883		KSE_LOCK_RELEASE(kse, &kse_lock);
1884		kse_exit();
1885		/* Never returns. */
1886		PANIC("kse_exit()");
1887#endif
1888	} else {
1889		/*
1890		 * We allow the program to kill KSEs in the initial group
1891		 * (by lowering the concurrency).
1892		 */
1893		if ((kse != _kse_initial) &&
1894		    ((kse->k_flags & KF_TERMINATED) != 0)) {
1895			KSE_SCHED_LOCK(kse, kse->k_kseg);
1896			TAILQ_REMOVE(&kse->k_kseg->kg_kseq, kse, k_kgqe);
1897			kse->k_kseg->kg_ksecount--;
1898			/*
1899			 * Migrate a thread to _kse_initial if the last
1900			 * KSE it ran on is this KSE.
1901			 */
1902			td = TAILQ_FIRST(&kse->k_kseg->kg_threadq);
1903			while (td != NULL) {
1904				if (td->kse == kse)
1905					td->kse = _kse_initial;
1906				td = TAILQ_NEXT(td, kle);
1907			}
1908			KSE_SCHED_UNLOCK(kse, kse->k_kseg);
1909			KSE_LOCK_ACQUIRE(kse, &kse_lock);
1910			kse_free_unlocked(kse);
1911			KSE_LOCK_RELEASE(kse, &kse_lock);
1912			/* Make sure there is always at least one KSE awake. */
1913			KSE_WAKEUP(_kse_initial);
1914			kse_exit();
1915			/* Never returns. */
1916			PANIC("kse_exit() failed for initial kseg");
1917		}
1918		KSE_SCHED_LOCK(kse, kse->k_kseg);
1919		KSE_SET_IDLE(kse);
1920		kse->k_kseg->kg_idle_kses++;
1921		KSE_SCHED_UNLOCK(kse, kse->k_kseg);
1922		ts.tv_sec = 120;
1923		ts.tv_nsec = 0;
1924		kse->k_kcb->kcb_kmbx.km_flags = 0;
1925		kse_release(&ts);
1926		/* Not reached. */
1927	}
1928}
1929
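/*
 * Compute the current thread's absolute wakeup time from a relative
 * timeout.  A NULL timeout means wait forever (the wakeup time is set
 * to -1), and a zero timeout means wake up immediately.
 */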
1930void
1931_thr_set_timeout(const struct timespec *timeout)
1932{
1933	struct pthread	*curthread = _get_curthread();
1934	struct timespec ts;
1935
1936	/* Reset the timeout flag for the running thread: */
1937	curthread->timeout = 0;
1938
1939	/* Check if the thread is to wait forever: */
1940	if (timeout == NULL) {
1941		/*
1942		 * Set the wakeup time to something that can be recognised as
1943		 * different to an actual time of day:
1944		 */
1945		curthread->wakeup_time.tv_sec = -1;
1946		curthread->wakeup_time.tv_nsec = -1;
1947	}
1948	/* Check if no waiting is required: */
1949	else if ((timeout->tv_sec == 0) && (timeout->tv_nsec == 0)) {
1950		/* Set the wake up time to 'immediately': */
1951		curthread->wakeup_time.tv_sec = 0;
1952		curthread->wakeup_time.tv_nsec = 0;
1953	} else {
1954		/* Calculate the time for the current thread to wakeup: */
1955		KSE_GET_TOD(curthread->kse, &ts);
1956		TIMESPEC_ADD(&curthread->wakeup_time, &ts, timeout);
1957	}
1958}
1959
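/*
 * Format a panic message, including the file and line where the panic
 * occurred, write it to stderr, and abort the process.
 */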
1960void
1961_thr_panic_exit(char *file, int line, char *msg)
1962{
1963	char buf[256];
1964
1965	snprintf(buf, sizeof(buf), "(%s:%d) %s\n", file, line, msg);
1966	__sys_write(2, buf, strlen(buf));
1967	abort();
1968}
1969
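/*
 * Make a thread runnable while holding the scheduling lock of its KSE
 * group, then wake up an idle KSE to run it if one was found.
 */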
1970void
1971_thr_setrunnable(struct pthread *curthread, struct pthread *thread)
1972{
1973	kse_critical_t crit;
1974	struct kse_mailbox *kmbx;
1975
1976	crit = _kse_critical_enter();
1977	KSE_SCHED_LOCK(curthread->kse, thread->kseg);
1978	kmbx = _thr_setrunnable_unlocked(thread);
1979	KSE_SCHED_UNLOCK(curthread->kse, thread->kseg);
1980	_kse_critical_leave(crit);
1981	if ((kmbx != NULL) && (__isthreaded != 0))
1982		kse_wakeup(kmbx);
1983}
1984
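/*
 * Make a thread runnable.  The caller is expected to hold the
 * scheduling lock for the thread's KSE group (as _thr_setrunnable()
 * does).  Returns the mailbox of an idle KSE that should be woken up
 * to run the thread, or NULL if no KSE needs waking.
 */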
1985struct kse_mailbox *
1986_thr_setrunnable_unlocked(struct pthread *thread)
1987{
1988	struct kse_mailbox *kmbx = NULL;
1989
1990	if ((thread->kseg->kg_flags & KGF_SINGLE_THREAD) != 0) {
1991		/* No silly queues for these threads. */
1992		if ((thread->flags & THR_FLAGS_SUSPENDED) != 0)
1993			THR_SET_STATE(thread, PS_SUSPENDED);
1994		else {
1995			THR_SET_STATE(thread, PS_RUNNING);
1996			kmbx = kse_wakeup_one(thread);
1997		}
1998
1999	} else if (thread->state != PS_RUNNING) {
2000		if ((thread->flags & THR_FLAGS_IN_WAITQ) != 0)
2001			KSE_WAITQ_REMOVE(thread->kse, thread);
2002		if ((thread->flags & THR_FLAGS_SUSPENDED) != 0)
2003			THR_SET_STATE(thread, PS_SUSPENDED);
2004		else {
2005			THR_SET_STATE(thread, PS_RUNNING);
2006			if ((thread->blocked == 0) && (thread->active == 0) &&
2007			    (thread->flags & THR_FLAGS_IN_RUNQ) == 0)
2008				THR_RUNQ_INSERT_TAIL(thread);
2009			/*
2010			 * XXX - Threads are not yet assigned to specific
2011			 *       KSEs; they are assigned to the KSEG.  So
2012			 *       the fact that a thread's KSE is waiting
2013			 *       doesn't necessarily mean that it will be
2014			 *       the KSE that runs the thread after the
2015			 *       lock is granted.  But we don't know if the
2016			 *       other KSEs within the same KSEG are also
2017			 *       in a waiting state or not, so we err on the
2018			 *       side of caution and wake up the thread's
2019			 *       last known KSE.  We ensure that the
2020			 *       thread's KSE doesn't change while its
2021			 *       scheduling lock is held, so it is safe to
2022			 *       reference it (the KSE).  If the KSE wakes
2023			 *       up and doesn't find any more work it will
2024			 *       again go back to waiting so no harm is
2025			 *       done.
2026			 */
2027			kmbx = kse_wakeup_one(thread);
2028		}
2029	}
2030	return (kmbx);
2031}
2032
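/*
 * Find an idle KSE to run the given thread, preferring the thread's
 * own KSE.  The chosen KSE is marked as no longer idle and its mailbox
 * is returned so the caller can wake it up; NULL is returned if no KSE
 * in the thread's group is idle.
 */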
2033static struct kse_mailbox *
2034kse_wakeup_one(struct pthread *thread)
2035{
2036	struct kse *ke;
2037
2038	if (KSE_IS_IDLE(thread->kse)) {
2039		KSE_CLEAR_IDLE(thread->kse);
2040		thread->kseg->kg_idle_kses--;
2041		return (&thread->kse->k_kcb->kcb_kmbx);
2042	} else {
2043		TAILQ_FOREACH(ke, &thread->kseg->kg_kseq, k_kgqe) {
2044			if (KSE_IS_IDLE(ke)) {
2045				KSE_CLEAR_IDLE(ke);
2046				ke->k_kseg->kg_idle_kses--;
2047				return (&ke->k_kcb->kcb_kmbx);
2048			}
2049		}
2050	}
2051	return (NULL);
2052}
2053
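/*
 * Wake up as many idle KSEs in the current KSE group as there are
 * threads on the run queue, clearing each KSE's idle state as it is
 * woken.
 */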
2054static void
2055kse_wakeup_multi(struct kse *curkse)
2056{
2057	struct kse *ke;
2058	int tmp;
2059
2060	if ((tmp = KSE_RUNQ_THREADS(curkse)) && curkse->k_kseg->kg_idle_kses) {
2061		TAILQ_FOREACH(ke, &curkse->k_kseg->kg_kseq, k_kgqe) {
2062			if (KSE_IS_IDLE(ke)) {
2063				KSE_CLEAR_IDLE(ke);
2064				ke->k_kseg->kg_idle_kses--;
2065				KSE_WAKEUP(ke);
2066				if (--tmp == 0)
2067					break;
2068			}
2069		}
2070	}
2071}
2072
2073/*
2074 * Allocate a new KSEG.
2075 *
2076 * We allow the current thread to be NULL in the case that this
2077 * is the first time a KSEG is being created (library initialization).
2078 * In this case, we don't need to (and can't) take any locks.
2079 */
2080struct kse_group *
2081_kseg_alloc(struct pthread *curthread)
2082{
2083	struct kse_group *kseg = NULL;
2084	kse_critical_t crit;
2085
2086	if ((curthread != NULL) && (free_kseg_count > 0)) {
2087		/* Use the kse lock for the kseg queue. */
2088		crit = _kse_critical_enter();
2089		KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
2090		if ((kseg = TAILQ_FIRST(&free_kse_groupq)) != NULL) {
2091			TAILQ_REMOVE(&free_kse_groupq, kseg, kg_qe);
2092			free_kseg_count--;
2093			active_kseg_count++;
2094			TAILQ_INSERT_TAIL(&active_kse_groupq, kseg, kg_qe);
2095		}
2096		KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
2097		_kse_critical_leave(crit);
2098		if (kseg)
2099			kseg_reinit(kseg);
2100	}
2101
2102	/*
2103	 * If a KSE group wasn't found in the free list (or there was no
2104	 * current thread to search it with), attempt to allocate a new
2105	 * one.
2106	 */
2107	if ((kseg == NULL) &&
2108	    ((kseg = (struct kse_group *)malloc(sizeof(*kseg))) != NULL)) {
2109		if (_pq_alloc(&kseg->kg_schedq.sq_runq,
2110		    THR_MIN_PRIORITY, THR_LAST_PRIORITY) != 0) {
2111			free(kseg);
2112			kseg = NULL;
2113		} else {
2114			kseg_init(kseg);
2115			/* Add the KSEG to the list of active KSEGs. */
2116			if (curthread != NULL) {
2117				crit = _kse_critical_enter();
2118				KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
2119				active_kseg_count++;
2120				TAILQ_INSERT_TAIL(&active_kse_groupq,
2121				    kseg, kg_qe);
2122				KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
2123				_kse_critical_leave(crit);
2124			} else {
2125				active_kseg_count++;
2126				TAILQ_INSERT_TAIL(&active_kse_groupq,
2127				    kseg, kg_qe);
2128			}
2129		}
2130	}
2131	return (kseg);
2132}
2133
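/*
 * Initialize a newly allocated KSE group, including its scheduling
 * lock.  Recycled groups go through kseg_reinit() instead, which only
 * resets the queues, counters, and flags.
 */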
2134static void
2135kseg_init(struct kse_group *kseg)
2136{
2137	kseg_reinit(kseg);
2138	_lock_init(&kseg->kg_lock, LCK_ADAPTIVE, _kse_lock_wait,
2139	    _kse_lock_wakeup);
2140}
2141
2142static void
2143kseg_reinit(struct kse_group *kseg)
2144{
2145	TAILQ_INIT(&kseg->kg_kseq);
2146	TAILQ_INIT(&kseg->kg_threadq);
2147	TAILQ_INIT(&kseg->kg_schedq.sq_waitq);
2148	kseg->kg_threadcount = 0;
2149	kseg->kg_ksecount = 0;
2150	kseg->kg_idle_kses = 0;
2151	kseg->kg_flags = 0;
2152}
2153
2154/*
2155 * This must be called with the kse lock held and when there are
2156 * no more threads that reference it.
2157 */
2158static void
2159kseg_free_unlocked(struct kse_group *kseg)
2160{
2161	TAILQ_REMOVE(&active_kse_groupq, kseg, kg_qe);
2162	TAILQ_INSERT_HEAD(&free_kse_groupq, kseg, kg_qe);
2163	free_kseg_count++;
2164	active_kseg_count--;
2165}
2166
2167void
2168_kseg_free(struct kse_group *kseg)
2169{
2170	struct kse *curkse;
2171	kse_critical_t crit;
2172
2173	crit = _kse_critical_enter();
2174	curkse = _get_curkse();
2175	KSE_LOCK_ACQUIRE(curkse, &kse_lock);
2176	kseg_free_unlocked(kseg);
2177	KSE_LOCK_RELEASE(curkse, &kse_lock);
2178	_kse_critical_leave(crit);
2179}
2180
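/*
 * Release all resources held by a KSE group that is not going to be
 * cached for reuse.
 */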
2181static void
2182kseg_destroy(struct kse_group *kseg)
2183{
2184	_lock_destroy(&kseg->kg_lock);
2185	_pq_free(&kseg->kg_schedq.sq_runq);
2186	free(kseg);
2187}
2188
2189/*
2190 * Allocate a new KSE.
2191 *
2192 * We allow the current thread to be NULL in the case that this
2193 * is the first time a KSE is being created (library initialization).
2194 * In this case, we don't need to (and can't) take any locks.
2195 */
2196struct kse *
2197_kse_alloc(struct pthread *curthread, int sys_scope)
2198{
2199	struct kse *kse = NULL;
2200	char *stack;
2201	kse_critical_t crit;
2202	int i;
2203
2204	if ((curthread != NULL) && (free_kse_count > 0)) {
2205		crit = _kse_critical_enter();
2206		KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
2207		/* Search for a finished KSE. */
2208		kse = TAILQ_FIRST(&free_kseq);
2209		while ((kse != NULL) &&
2210		    ((kse->k_kcb->kcb_kmbx.km_flags & KMF_DONE) == 0)) {
2211			kse = TAILQ_NEXT(kse, k_qe);
2212		}
2213		if (kse != NULL) {
2214			DBG_MSG("found an unused kse.\n");
2215			TAILQ_REMOVE(&free_kseq, kse, k_qe);
2216			free_kse_count--;
2217			TAILQ_INSERT_TAIL(&active_kseq, kse, k_qe);
2218			active_kse_count++;
2219		}
2220		KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
2221		_kse_critical_leave(crit);
2222		if (kse != NULL)
2223			kse_reinit(kse, sys_scope);
2224	}
2225	if ((kse == NULL) &&
2226	    ((kse = (struct kse *)malloc(sizeof(*kse))) != NULL)) {
2227		if (sys_scope != 0)
2228			stack = NULL;
2229		else if ((stack = malloc(KSE_STACKSIZE)) == NULL) {
2230			free(kse);
2231			return (NULL);
2232		}
2233		bzero(kse, sizeof(*kse));
2234
2235		/* Initialize KCB without the lock. */
2236		if ((kse->k_kcb = _kcb_ctor(kse)) == NULL) {
2237			if (stack != NULL)
2238				free(stack);
2239			free(kse);
2240			return (NULL);
2241		}
2242
2243		/* Initialize the lockusers. */
2244		for (i = 0; i < MAX_KSE_LOCKLEVEL; i++) {
2245			_lockuser_init(&kse->k_lockusers[i], (void *)kse);
2246			_LCK_SET_PRIVATE2(&kse->k_lockusers[i], NULL);
2247		}
2248		/* _lock_init(kse->k_lock, ...) */
2249
2250		if (curthread != NULL) {
2251			crit = _kse_critical_enter();
2252			KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
2253		}
2254		kse->k_flags = 0;
2255		TAILQ_INSERT_TAIL(&active_kseq, kse, k_qe);
2256		active_kse_count++;
2257		if (curthread != NULL) {
2258			KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
2259			_kse_critical_leave(crit);
2260		}
2261		/*
2262		 * Create the KSE context.
2263		 * Scope system threads (one thread per KSE) are not required
2264		 * to have a stack for an unneeded kse upcall.
2265		 */
2266		if (!sys_scope) {
2267			kse->k_kcb->kcb_kmbx.km_func = (kse_func_t *)kse_sched_multi;
2268			kse->k_stack.ss_sp = stack;
2269			kse->k_stack.ss_size = KSE_STACKSIZE;
2270		} else {
2271			kse->k_kcb->kcb_kmbx.km_func = (kse_func_t *)kse_sched_single;
2272			kse->k_stack.ss_sp = NULL;
2273			kse->k_stack.ss_size = 0;
2274		}
2275		kse->k_kcb->kcb_kmbx.km_udata = (void *)kse;
2276		kse->k_kcb->kcb_kmbx.km_quantum = 20000;
2277		/*
2278		 * We need to keep a copy of the stack in case it
2279		 * doesn't get used; a KSE running a scope system
2280		 * thread will use that thread's stack.
2281		 */
2282		kse->k_kcb->kcb_kmbx.km_stack = kse->k_stack;
2283	}
2284	return (kse);
2285}
2286
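/*
 * Reinitialize a cached KSE for reuse, setting its upcall function,
 * stack, and quantum according to whether it will run a scope system
 * thread, and clearing its per-KSE state.
 */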
2287static void
2288kse_reinit(struct kse *kse, int sys_scope)
2289{
2290	if (!sys_scope) {
2291		kse->k_kcb->kcb_kmbx.km_func = (kse_func_t *)kse_sched_multi;
2292		if (kse->k_stack.ss_sp == NULL) {
2293			/* XXX check allocation failure */
2294			kse->k_stack.ss_sp = (char *) malloc(KSE_STACKSIZE);
2295			kse->k_stack.ss_size = KSE_STACKSIZE;
2296		}
2297		kse->k_kcb->kcb_kmbx.km_quantum = 20000;
2298	} else {
2299		kse->k_kcb->kcb_kmbx.km_func = (kse_func_t *)kse_sched_single;
2300		if (kse->k_stack.ss_sp)
2301			free(kse->k_stack.ss_sp);
2302		kse->k_stack.ss_sp = NULL;
2303		kse->k_stack.ss_size = 0;
2304		kse->k_kcb->kcb_kmbx.km_quantum = 0;
2305	}
2306	kse->k_kcb->kcb_kmbx.km_stack = kse->k_stack;
2307	kse->k_kcb->kcb_kmbx.km_udata = (void *)kse;
2308	kse->k_kcb->kcb_kmbx.km_curthread = NULL;
2309	kse->k_kcb->kcb_kmbx.km_flags = 0;
2310	kse->k_curthread = NULL;
2311	kse->k_kseg = 0;
2312	kse->k_schedq = 0;
2313	kse->k_locklevel = 0;
2314	kse->k_flags = 0;
2315	kse->k_error = 0;
2316	kse->k_cpu = 0;
2317	kse->k_sigseqno = 0;
2318}
2319
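/*
 * Move a KSE from the active list to the free list.  This is normally
 * called with the kse lock held; the exception is during library
 * initialization, when no other threads exist.
 */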
2320void
2321kse_free_unlocked(struct kse *kse)
2322{
2323	TAILQ_REMOVE(&active_kseq, kse, k_qe);
2324	active_kse_count--;
2325	kse->k_kseg = NULL;
2326	kse->k_kcb->kcb_kmbx.km_quantum = 20000;
2327	kse->k_flags = 0;
2328	TAILQ_INSERT_HEAD(&free_kseq, kse, k_qe);
2329	free_kse_count++;
2330}
2331
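/*
 * Free a KSE, taking the kse lock first unless this is called during
 * library initialization (curthread == NULL), when no locking is
 * needed.
 */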
2332void
2333_kse_free(struct pthread *curthread, struct kse *kse)
2334{
2335	kse_critical_t crit;
2336
2337	if (curthread == NULL)
2338		kse_free_unlocked(kse);
2339	else {
2340		crit = _kse_critical_enter();
2341		KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
2342		kse_free_unlocked(kse);
2343		KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
2344		_kse_critical_leave(crit);
2345	}
2346}
2347
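/*
 * Release all resources held by a KSE that is not going to be cached
 * for reuse.
 */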
2348static void
2349kse_destroy(struct kse *kse)
2350{
2351	int i;
2352
2353	if (kse->k_stack.ss_sp != NULL)
2354		free(kse->k_stack.ss_sp);
2355	_kcb_dtor(kse->k_kcb);
2356	for (i = 0; i < MAX_KSE_LOCKLEVEL; ++i)
2357		_lockuser_destroy(&kse->k_lockusers[i]);
2358	_lock_destroy(&kse->k_lock);
2359	free(kse);
2360}
2361
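/*
 * Allocate a thread structure, reusing one from the free thread list
 * when possible.  Garbage collection is run first if it is needed.
 * When a new structure is malloc'd, its TCB, siginfo array, lock, and
 * lockusers are initialized here as well.
 */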
2362struct pthread *
2363_thr_alloc(struct pthread *curthread)
2364{
2365	kse_critical_t	crit;
2366	struct pthread	*thread = NULL;
2367	int i;
2368
2369	if (curthread != NULL) {
2370		if (GC_NEEDED())
2371			_thr_gc(curthread);
2372		if (free_thread_count > 0) {
2373			crit = _kse_critical_enter();
2374			KSE_LOCK_ACQUIRE(curthread->kse, &thread_lock);
2375			if ((thread = TAILQ_FIRST(&free_threadq)) != NULL) {
2376				TAILQ_REMOVE(&free_threadq, thread, tle);
2377				free_thread_count--;
2378			}
2379			KSE_LOCK_RELEASE(curthread->kse, &thread_lock);
2380			_kse_critical_leave(crit);
2381		}
2382	}
2383	if ((thread == NULL) &&
2384	    ((thread = malloc(sizeof(struct pthread))) != NULL)) {
2385		bzero(thread, sizeof(struct pthread));
2386		if ((thread->tcb = _tcb_ctor(thread, curthread == NULL)) == NULL) {
2387			free(thread);
2388			thread = NULL;
2389		} else {
2390			thread->siginfo = calloc(_SIG_MAXSIG,
2391				sizeof(siginfo_t));
2392			/*
2393			 * Initialize thread locking.
2394			 * Lock initialization needs malloc, so don't
2395			 * enter a critical region before doing this!
2396			 */
2397			if (_lock_init(&thread->lock, LCK_ADAPTIVE,
2398			    _thr_lock_wait, _thr_lock_wakeup) != 0)
2399				PANIC("Cannot initialize thread lock");
2400			for (i = 0; i < MAX_THR_LOCKLEVEL; i++) {
2401				_lockuser_init(&thread->lockusers[i],
2402				    (void *)thread);
2403				_LCK_SET_PRIVATE2(&thread->lockusers[i],
2404				    (void *)thread);
2405			}
2406		}
2407	}
2408	return (thread);
2409}
2410
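/*
 * Free a thread structure.  The structure is cached on the free thread
 * list unless the cache already holds MAX_CACHED_THREADS entries or
 * there is no current thread to do the locking with.
 */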
2411void
2412_thr_free(struct pthread *curthread, struct pthread *thread)
2413{
2414	kse_critical_t crit;
2415
2416	DBG_MSG("Freeing thread %p\n", thread);
2417	if (thread->name) {
2418		free(thread->name);
2419		thread->name = NULL;
2420	}
2421	if ((curthread == NULL) || (free_thread_count >= MAX_CACHED_THREADS)) {
2422		thr_destroy(thread);
2423	} else {
2424		/* Add the thread to the free thread list. */
2425		crit = _kse_critical_enter();
2426		KSE_LOCK_ACQUIRE(curthread->kse, &thread_lock);
2427		TAILQ_INSERT_TAIL(&free_threadq, thread, tle);
2428		free_thread_count++;
2429		KSE_LOCK_RELEASE(curthread->kse, &thread_lock);
2430		_kse_critical_leave(crit);
2431	}
2432}
2433
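/*
 * Release all resources held by a thread structure that is not going
 * to be cached for reuse.
 */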
2434static void
2435thr_destroy(struct pthread *thread)
2436{
2437	int i;
2438
2439	for (i = 0; i < MAX_THR_LOCKLEVEL; i++)
2440		_lockuser_destroy(&thread->lockusers[i]);
2441	_lock_destroy(&thread->lock);
2442	_tcb_dtor(thread->tcb);
2443	free(thread->siginfo);
2444	free(thread);
2445}
2446
2447/*
2448 * Add an active thread:
2449 *
2450 *   o Assign the thread a unique id (which GDB uses to track
2451 *     threads).
2452 *   o Add the thread to the list of all threads and increment
2453 *     the number of active threads.
2454 */
2455static void
2456thr_link(struct pthread *thread)
2457{
2458	kse_critical_t crit;
2459	struct kse *curkse;
2460
2461	crit = _kse_critical_enter();
2462	curkse = _get_curkse();
2463	KSE_LOCK_ACQUIRE(curkse, &_thread_list_lock);
2464	/*
2465	 * Initialize the unique id (which GDB uses to track
2466	 * threads), add the thread to the list of all threads,
2467	 * and increment the number of active threads.
2468	 */
2469	thread->uniqueid = next_uniqueid++;
2470	THR_LIST_ADD(thread);
2471	_thread_active_threads++;
2472	KSE_LOCK_RELEASE(curkse, &_thread_list_lock);
2473	_kse_critical_leave(crit);
2474}
2475
2476/*
2477 * Remove an active thread.
2478 */
2479static void
2480thr_unlink(struct pthread *thread)
2481{
2482	kse_critical_t crit;
2483	struct kse *curkse;
2484
2485	crit = _kse_critical_enter();
2486	curkse = _get_curkse();
2487	KSE_LOCK_ACQUIRE(curkse, &_thread_list_lock);
2488	THR_LIST_REMOVE(thread);
2489	_thread_active_threads--;
2490	KSE_LOCK_RELEASE(curkse, &_thread_list_lock);
2491	_kse_critical_leave(crit);
2492}
2493
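/*
 * Helpers for the thread hash table.  _thr_hash_find() returns the
 * thread if the given pointer is present in the table, and NULL
 * otherwise.
 */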
2494void
2495_thr_hash_add(struct pthread *thread)
2496{
2497	struct thread_hash_head *head;
2498
2499	head = &thr_hashtable[THREAD_HASH(thread)];
2500	LIST_INSERT_HEAD(head, thread, hle);
2501}
2502
2503void
2504_thr_hash_remove(struct pthread *thread)
2505{
2506	LIST_REMOVE(thread, hle);
2507}
2508
2509struct pthread *
2510_thr_hash_find(struct pthread *thread)
2511{
2512	struct pthread *td;
2513	struct thread_hash_head *head;
2514
2515	head = &thr_hashtable[THREAD_HASH(thread)];
2516	LIST_FOREACH(td, head, hle) {
2517		if (td == thread)
2518			return (thread);
2519	}
2520	return (NULL);
2521}
2522
2523void
2524_thr_debug_check_yield(struct pthread *curthread)
2525{
2526	/*
2527	 * Note that TMDF_SUSPEND is set after the process is suspended.
2528	 * When we are being debugged, every suspension of the process
2529	 * causes all KSEs to schedule an upcall in the kernel, unless a
2530	 * KSE is in a critical region.
2531	 * If this function is called, the KSE is no longer in a critical
2532	 * region.  If TMDF_SUSPEND was set by the debugger before the
2533	 * KSE left the critical region, we catch it here.  If the flag
2534	 * changes while we are testing it, that is not a problem either,
2535	 * because the change only occurs after a process suspension
2536	 * event, and a suspension event always causes the KSE to
2537	 * schedule an upcall.  Because we are not in a critical region,
2538	 * the upcall will be scheduled successfully, and the flag will
2539	 * be checked again in kse_sched_multi.  We will not run again
2540	 * until the debugger clears the flag, which it does at the next
2541	 * suspension event.
2542	 */
2543	if (!DBG_CAN_RUN(curthread)) {
2544		if ((curthread->attr.flags & PTHREAD_SCOPE_SYSTEM) == 0)
2545			_thr_sched_switch(curthread);
2546		else
2547			kse_thr_interrupt(&curthread->tcb->tcb_tmbx,
2548				KSE_INTR_DBSUSPEND, 0);
2549	}
2550}
2551