thr_kern.c revision 167241
117680Spst/*
217680Spst * Copyright (C) 2003 Daniel M. Eischen <deischen@freebsd.org>
317680Spst * Copyright (C) 2002 Jonathon Mini <mini@freebsd.org>
417680Spst * Copyright (c) 1995-1998 John Birrell <jb@cimlogic.com.au>
517680Spst * All rights reserved.
617680Spst *
717680Spst * Redistribution and use in source and binary forms, with or without
817680Spst * modification, are permitted provided that the following conditions
917680Spst * are met:
1017680Spst * 1. Redistributions of source code must retain the above copyright
1117680Spst *    notice, this list of conditions and the following disclaimer.
1217680Spst * 2. Redistributions in binary form must reproduce the above copyright
1317680Spst *    notice, this list of conditions and the following disclaimer in the
1417680Spst *    documentation and/or other materials provided with the distribution.
1517680Spst * 3. All advertising materials mentioning features or use of this software
1617680Spst *    must display the following acknowledgement:
1717680Spst *	This product includes software developed by John Birrell.
1817680Spst * 4. Neither the name of the author nor the names of any co-contributors
1917680Spst *    may be used to endorse or promote products derived from this software
2026183Sfenner *    without specific prior written permission.
2117680Spst *
2217680Spst * THIS SOFTWARE IS PROVIDED BY JOHN BIRRELL AND CONTRIBUTORS ``AS IS'' AND
2356896Sfenner * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
2456896Sfenner * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
2517680Spst * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
2626183Sfenner * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
2717680Spst * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28127675Sbms * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29147904Ssam * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
3017680Spst * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
3117680Spst * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
3256896Sfenner * SUCH DAMAGE.
3356896Sfenner *
3456896Sfenner */
3556896Sfenner#include <sys/cdefs.h>
36127675Sbms__FBSDID("$FreeBSD: head/lib/libkse/thread/thr_kern.c 167241 2007-03-05 17:47:27Z brian $");
3717680Spst
3817680Spst#include <sys/types.h>
3917680Spst#include <sys/kse.h>
4017680Spst#include <sys/ptrace.h>
4117680Spst#include <sys/signalvar.h>
4217680Spst#include <sys/queue.h>
4317680Spst#include <machine/atomic.h>
4417680Spst#include <machine/sigframe.h>
4517680Spst
4617680Spst#include <assert.h>
4717680Spst#include <errno.h>
4817680Spst#include <signal.h>
4917680Spst#include <stdlib.h>
5017680Spst#include <string.h>
5117680Spst#include <time.h>
5217680Spst#include <ucontext.h>
5317680Spst#include <unistd.h>
5417680Spst
5517680Spst#include "atomic_ops.h"
5617680Spst#include "thr_private.h"
5717680Spst#include "libc_private.h"
5817680Spst#ifdef NOTYET
5917680Spst#include "spinlock.h"
6017680Spst#endif
61146778Ssam
62146778Ssam/* #define DEBUG_THREAD_KERN */
63146778Ssam#ifdef DEBUG_THREAD_KERN
6417680Spst#define DBG_MSG		stdout_debug
65127675Sbms#else
6617680Spst#define DBG_MSG(x...)
6717680Spst#endif
6817680Spst
69146778Ssam/*
7017680Spst * Define a high water mark for the maximum number of threads that
7117680Spst * will be cached.  Once this level is reached, any extra threads
7217680Spst * will be free()'d.
7317680Spst */
7417680Spst#define	MAX_CACHED_THREADS	100
/*
 * Define high water marks for the maximum number of KSEs and KSE groups
 * that will be cached.  Because we support 1:1 threading, there can be
 * as many KSEs and KSE groups as there are threads.  Once these levels
 * are reached, any extra KSEs and KSE groups will be free()'d.
 */
8117680Spst#define	MAX_CACHED_KSES		((_thread_scope_system <= 0) ? 50 : 100)
8217680Spst#define	MAX_CACHED_KSEGS	((_thread_scope_system <= 0) ? 50 : 100)
8317680Spst
8417680Spst#define	KSE_SET_MBOX(kse, thrd) \
8517680Spst	(kse)->k_kcb->kcb_kmbx.km_curthread = &(thrd)->tcb->tcb_tmbx
8617680Spst
8717680Spst#define	KSE_SET_EXITED(kse)	(kse)->k_flags |= KF_EXITED
88127675Sbms
8917680Spst/*
9017680Spst * Macros for manipulating the run queues.  The priority queue
9117680Spst * routines use the thread's pqe link and also handle the setting
9217680Spst * and clearing of the thread's THR_FLAGS_IN_RUNQ flag.
9317680Spst */
9417680Spst#define	KSE_RUNQ_INSERT_HEAD(kse, thrd)			\
9517680Spst	_pq_insert_head(&(kse)->k_schedq->sq_runq, thrd)
9617680Spst#define	KSE_RUNQ_INSERT_TAIL(kse, thrd)			\
9717680Spst	_pq_insert_tail(&(kse)->k_schedq->sq_runq, thrd)
9817680Spst#define	KSE_RUNQ_REMOVE(kse, thrd)			\
9917680Spst	_pq_remove(&(kse)->k_schedq->sq_runq, thrd)
10017680Spst#define	KSE_RUNQ_FIRST(kse)				\
10117680Spst	((_libkse_debug == 0) ?				\
102146778Ssam	 _pq_first(&(kse)->k_schedq->sq_runq) :		\
10317680Spst	 _pq_first_debug(&(kse)->k_schedq->sq_runq))
10417680Spst
10517680Spst#define KSE_RUNQ_THREADS(kse)	((kse)->k_schedq->sq_runq.pq_threads)
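
/*
 * These run queue macros are used with the KSE's scheduling lock held;
 * for illustration, the selection path in kse_sched_multi() below looks
 * roughly like:
 *
 *	KSE_SCHED_LOCK(curkse, curkse->k_kseg);
 *	if ((curthread = KSE_RUNQ_FIRST(curkse)) != NULL) {
 *		KSE_RUNQ_REMOVE(curkse, curthread);
 *		curkse->k_curthread = curthread;
 *	}
 */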
10617680Spst
10717680Spst#define THR_NEED_CANCEL(thrd)						\
10817680Spst	 (((thrd)->cancelflags & THR_CANCELLING) != 0 &&		\
10917680Spst	  ((thrd)->cancelflags & PTHREAD_CANCEL_DISABLE) == 0 &&	\
11017680Spst	  (((thrd)->cancelflags & THR_AT_CANCEL_POINT) != 0 ||		\
111146778Ssam	   ((thrd)->cancelflags & PTHREAD_CANCEL_ASYNCHRONOUS) != 0))
11298527Sfenner
11398527Sfenner#define THR_NEED_ASYNC_CANCEL(thrd)					\
11498527Sfenner	 (((thrd)->cancelflags & THR_CANCELLING) != 0 &&		\
11517680Spst	  ((thrd)->cancelflags & PTHREAD_CANCEL_DISABLE) == 0 &&	\
11617680Spst	  (((thrd)->cancelflags & THR_AT_CANCEL_POINT) == 0 &&		\
117146778Ssam	   ((thrd)->cancelflags & PTHREAD_CANCEL_ASYNCHRONOUS) != 0))
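
/*
 * In other words, THR_NEED_CANCEL() is true whenever a pending
 * cancellation request may be acted on (cancellation not disabled and
 * the thread either at a cancellation point or in asynchronous mode),
 * while THR_NEED_ASYNC_CANCEL() is the narrower case: cancellation not
 * disabled, asynchronous mode set, and the thread not at a cancellation
 * point.
 */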
11817680Spst
/*
 * We've got to keep track of everything that is allocated, not only
 * to have a speedy free list, but also so that it can all be
 * deallocated after a fork().
 */
12498527Sfennerstatic TAILQ_HEAD(, kse)	active_kseq;
12598527Sfennerstatic TAILQ_HEAD(, kse)	free_kseq;
126146778Ssamstatic TAILQ_HEAD(, kse_group)	free_kse_groupq;
12798527Sfennerstatic TAILQ_HEAD(, kse_group)	active_kse_groupq;
12817680Spststatic TAILQ_HEAD(, kse_group)	gc_ksegq;
129146778Ssamstatic struct lock		kse_lock;	/* also used for kseg queue */
13017680Spststatic int			free_kse_count = 0;
13117680Spststatic int			free_kseg_count = 0;
13217680Spststatic TAILQ_HEAD(, pthread)	free_threadq;
13317680Spststatic struct lock		thread_lock;
13417680Spststatic int			free_thread_count = 0;
13517680Spststatic int			inited = 0;
13617680Spststatic int			active_kse_count = 0;
13717680Spststatic int			active_kseg_count = 0;
13817680Spststatic u_int64_t		next_uniqueid = 1;
13917680Spst
14017680SpstLIST_HEAD(thread_hash_head, pthread);
14117680Spst#define THREAD_HASH_QUEUES	127
14217680Spststatic struct thread_hash_head	thr_hashtable[THREAD_HASH_QUEUES];
14317680Spst#define	THREAD_HASH(thrd)	((unsigned long)thrd % THREAD_HASH_QUEUES)
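
/*
 * The hash key is simply the thread pointer reduced modulo the number
 * of buckets, so thr_hashtable[THREAD_HASH(thrd)] names the bucket a
 * given thread hashes to.
 */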
14417680Spst
14517680Spst/* Lock for thread tcb constructor/destructor */
14617680Spststatic pthread_mutex_t		_tcb_mutex;
14717680Spst
14817680Spst#ifdef DEBUG_THREAD_KERN
14917680Spststatic void	dump_queues(struct kse *curkse);
15017680Spst#endif
15117680Spststatic void	kse_check_completed(struct kse *kse);
15298527Sfennerstatic void	kse_check_waitq(struct kse *kse);
153127675Sbmsstatic void	kse_fini(struct kse *curkse);
15417680Spststatic void	kse_reinit(struct kse *kse, int sys_scope);
15517680Spststatic void	kse_sched_multi(struct kse_mailbox *kmbx);
15617680Spststatic void	kse_sched_single(struct kse_mailbox *kmbx);
15717680Spststatic void	kse_switchout_thread(struct kse *kse, struct pthread *thread);
15817680Spststatic void	kse_wait(struct kse *kse, struct pthread *td_wait, int sigseq);
15917680Spststatic void	kse_free_unlocked(struct kse *kse);
16017680Spststatic void	kse_destroy(struct kse *kse);
16117680Spststatic void	kseg_free_unlocked(struct kse_group *kseg);
16217680Spststatic void	kseg_init(struct kse_group *kseg);
16317680Spststatic void	kseg_reinit(struct kse_group *kseg);
164147904Ssamstatic void	kseg_destroy(struct kse_group *kseg);
165147904Ssamstatic void	kse_waitq_insert(struct pthread *thread);
166147904Ssamstatic void	kse_wakeup_multi(struct kse *curkse);
167147904Ssamstatic struct kse_mailbox *kse_wakeup_one(struct pthread *thread);
168147904Ssamstatic void	thr_cleanup(struct kse *kse, struct pthread *curthread);
169147904Ssamstatic void	thr_link(struct pthread *thread);
17098527Sfennerstatic void	thr_resume_wrapper(int sig, siginfo_t *, ucontext_t *);
17117680Spststatic void	thr_resume_check(struct pthread *curthread, ucontext_t *ucp);
17217680Spststatic int	thr_timedout(struct pthread *thread, struct timespec *curtime);
17317680Spststatic void	thr_unlink(struct pthread *thread);
17417680Spststatic void	thr_destroy(struct pthread *curthread, struct pthread *thread);
17517680Spststatic void	thread_gc(struct pthread *thread);
17617680Spststatic void	kse_gc(struct pthread *thread);
17717680Spststatic void	kseg_gc(struct pthread *thread);
17898527Sfenner
17917680Spststatic void __inline
18017680Spstthr_accounting(struct pthread *thread)
18198527Sfenner{
18217680Spst	if ((thread->slice_usec != -1) &&
18398527Sfenner	    (thread->slice_usec <= TIMESLICE_USEC) &&
18417680Spst	    (thread->attr.sched_policy != SCHED_FIFO)) {
18517680Spst		thread->slice_usec += (thread->tcb->tcb_tmbx.tm_uticks
18617680Spst		    + thread->tcb->tcb_tmbx.tm_sticks) * _clock_res_usec;
18717680Spst		/* Check for time quantum exceeded: */
18817680Spst		if (thread->slice_usec > TIMESLICE_USEC)
18917680Spst			thread->slice_usec = -1;
19017680Spst	}
19117680Spst	thread->tcb->tcb_tmbx.tm_uticks = 0;
19217680Spst	thread->tcb->tcb_tmbx.tm_sticks = 0;
19317680Spst}
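
/*
 * For example, with a (hypothetical) clock resolution of 10000 usec per
 * tick, a SCHED_RR thread that accumulated 3 user ticks and 2 system
 * ticks since its last switch would have 50000 usec added to
 * slice_usec; once the total exceeds TIMESLICE_USEC the slice is
 * flagged as expired (-1), and kse_switchout_thread() will requeue the
 * thread at the tail of the run queue for its priority.
 */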
19417680Spst
/*
 * This is called after a fork().
 * No locks need to be taken here since we are guaranteed to be
 * single threaded.
 *
 * XXX
 * POSIX says that for a threaded process, fork() is to be used
 * only to run new programs, and the effects of calling functions
 * that require certain resources between the call to fork() and
 * the call to an exec function are undefined.
 *
 * It is not safe to free memory after fork(), because these data
 * structures may be in an inconsistent state.
 */
20917692Spstvoid
21017680Spst_kse_single_thread(struct pthread *curthread)
21117680Spst{
21217680Spst#ifdef NOTYET
21317680Spst	struct kse *kse;
21417680Spst	struct kse_group *kseg;
21517680Spst	struct pthread *thread;
21617680Spst
21798527Sfenner	_thr_spinlock_init();
21898527Sfenner	*__malloc_lock = (spinlock_t)_SPINLOCK_INITIALIZER;
21917680Spst	if (__isthreaded) {
22098527Sfenner		_thr_rtld_fini();
22117680Spst		_thr_signal_deinit();
22298527Sfenner	}
22317680Spst	__isthreaded = 0;
	/*
	 * Restore the signal mask early so that any memory problems
	 * can dump core.
	 */
228	__sys_sigprocmask(SIG_SETMASK, &curthread->sigmask, NULL);
229	_thread_active_threads = 1;
230
231	curthread->kse->k_kcb->kcb_kmbx.km_curthread = NULL;
232	curthread->attr.flags &= ~PTHREAD_SCOPE_SYSTEM;
233	curthread->attr.flags |= PTHREAD_SCOPE_SYSTEM;
234
235	/*
236	 * Enter a loop to remove and free all threads other than
237	 * the running thread from the active thread list:
238	 */
239	while ((thread = TAILQ_FIRST(&_thread_list)) != NULL) {
240		THR_GCLIST_REMOVE(thread);
		/*
		 * Remove this thread from the list (the current
		 * thread will be removed but re-added by libpthread
		 * initialization).
		 */
246		TAILQ_REMOVE(&_thread_list, thread, tle);
247		/* Make sure this isn't the running thread: */
248		if (thread != curthread) {
249			_thr_stack_free(&thread->attr);
250			if (thread->specific != NULL)
251				free(thread->specific);
252			thr_destroy(curthread, thread);
253		}
254	}
255
256	TAILQ_INIT(&curthread->mutexq);		/* initialize mutex queue */
257	curthread->joiner = NULL;		/* no joining threads yet */
258	curthread->refcount = 0;
259	SIGEMPTYSET(curthread->sigpend);	/* clear pending signals */
260
261	/* Don't free thread-specific data as the caller may require it */
262
263	/* Free the free KSEs: */
264	while ((kse = TAILQ_FIRST(&free_kseq)) != NULL) {
265		TAILQ_REMOVE(&free_kseq, kse, k_qe);
266		kse_destroy(kse);
267	}
268	free_kse_count = 0;
269
270	/* Free the active KSEs: */
271	while ((kse = TAILQ_FIRST(&active_kseq)) != NULL) {
272		TAILQ_REMOVE(&active_kseq, kse, k_qe);
273		kse_destroy(kse);
274	}
275	active_kse_count = 0;
276
277	/* Free the free KSEGs: */
278	while ((kseg = TAILQ_FIRST(&free_kse_groupq)) != NULL) {
279		TAILQ_REMOVE(&free_kse_groupq, kseg, kg_qe);
280		kseg_destroy(kseg);
281	}
282	free_kseg_count = 0;
283
284	/* Free the active KSEGs: */
285	while ((kseg = TAILQ_FIRST(&active_kse_groupq)) != NULL) {
286		TAILQ_REMOVE(&active_kse_groupq, kseg, kg_qe);
287		kseg_destroy(kseg);
288	}
289	active_kseg_count = 0;
290
291	/* Free the free threads. */
292	while ((thread = TAILQ_FIRST(&free_threadq)) != NULL) {
293		TAILQ_REMOVE(&free_threadq, thread, tle);
294		thr_destroy(curthread, thread);
295	}
296	free_thread_count = 0;
297
298	/* Free the to-be-gc'd threads. */
299	while ((thread = TAILQ_FIRST(&_thread_gc_list)) != NULL) {
300		TAILQ_REMOVE(&_thread_gc_list, thread, gcle);
301		thr_destroy(curthread, thread);
302	}
303	TAILQ_INIT(&gc_ksegq);
304	_gc_count = 0;
305
306	if (inited != 0) {
307		/*
308		 * Destroy these locks; they'll be recreated to assure they
309		 * are in the unlocked state.
310		 */
311		_lock_destroy(&kse_lock);
312		_lock_destroy(&thread_lock);
313		_lock_destroy(&_thread_list_lock);
314		inited = 0;
315	}
316
317	/* We're no longer part of any lists */
318	curthread->tlflags = 0;
319
320	/*
321	 * After a fork, we are still operating on the thread's original
322	 * stack.  Don't clear the THR_FLAGS_USER from the thread's
323	 * attribute flags.
324	 */
325
326	/* Initialize the threads library. */
327	curthread->kse = NULL;
328	curthread->kseg = NULL;
329	_kse_initial = NULL;
330	_libpthread_init(curthread);
331#else
332	int i;
333
334	/* Reset the current thread and KSE lock data. */
335	for (i = 0; i < curthread->locklevel; i++) {
336		_lockuser_reinit(&curthread->lockusers[i], (void *)curthread);
337	}
338	curthread->locklevel = 0;
339	for (i = 0; i < curthread->kse->k_locklevel; i++) {
340		_lockuser_reinit(&curthread->kse->k_lockusers[i],
341		    (void *)curthread->kse);
342		_LCK_SET_PRIVATE2(&curthread->kse->k_lockusers[i], NULL);
343	}
344	curthread->kse->k_locklevel = 0;
345	_thr_spinlock_init();
346	if (__isthreaded) {
347		_thr_rtld_fini();
348		_thr_signal_deinit();
349	}
350	__isthreaded = 0;
351	curthread->kse->k_kcb->kcb_kmbx.km_curthread = NULL;
352	curthread->attr.flags |= PTHREAD_SCOPE_SYSTEM;
353
	/* After a fork(), the child should have no pending signals. */
355	sigemptyset(&curthread->sigpend);
356
	/*
	 * Restore the signal mask early so that any memory problems
	 * can dump core.
	 */
361	sigprocmask(SIG_SETMASK, &curthread->sigmask, NULL);
362	_thread_active_threads = 1;
363#endif
364}
365
366/*
367 * This is used to initialize housekeeping and to initialize the
368 * KSD for the KSE.
369 */
370void
371_kse_init(void)
372{
373	if (inited == 0) {
374		TAILQ_INIT(&active_kseq);
375		TAILQ_INIT(&active_kse_groupq);
376		TAILQ_INIT(&free_kseq);
377		TAILQ_INIT(&free_kse_groupq);
378		TAILQ_INIT(&free_threadq);
379		TAILQ_INIT(&gc_ksegq);
380		if (_lock_init(&kse_lock, LCK_ADAPTIVE,
381		    _kse_lock_wait, _kse_lock_wakeup) != 0)
382			PANIC("Unable to initialize free KSE queue lock");
383		if (_lock_init(&thread_lock, LCK_ADAPTIVE,
384		    _kse_lock_wait, _kse_lock_wakeup) != 0)
385			PANIC("Unable to initialize free thread queue lock");
386		if (_lock_init(&_thread_list_lock, LCK_ADAPTIVE,
387		    _kse_lock_wait, _kse_lock_wakeup) != 0)
388			PANIC("Unable to initialize thread list lock");
389		_pthread_mutex_init(&_tcb_mutex, NULL);
390		active_kse_count = 0;
391		active_kseg_count = 0;
392		_gc_count = 0;
393		inited = 1;
394	}
395}
396
397/*
398 * This is called when the first thread (other than the initial
399 * thread) is created.
400 */
401int
402_kse_setthreaded(int threaded)
403{
404	sigset_t sigset;
405
406	if ((threaded != 0) && (__isthreaded == 0)) {
407		SIGFILLSET(sigset);
408		__sys_sigprocmask(SIG_SETMASK, &sigset, &_thr_initial->sigmask);
409
410		/*
411		 * Tell the kernel to create a KSE for the initial thread
412		 * and enable upcalls in it.
413		 */
414		_kse_initial->k_flags |= KF_STARTED;
415
416		if (_thread_scope_system <= 0) {
417			_thr_initial->attr.flags &= ~PTHREAD_SCOPE_SYSTEM;
418			_kse_initial->k_kseg->kg_flags &= ~KGF_SINGLE_THREAD;
419			_kse_initial->k_kcb->kcb_kmbx.km_curthread = NULL;
420		}
421		else {
			/*
			 * For a bound thread, the kernel reads the mailbox
			 * pointer only once, so set it here before calling
			 * kse_create.
			 */
426			_tcb_set(_kse_initial->k_kcb, _thr_initial->tcb);
427			KSE_SET_MBOX(_kse_initial, _thr_initial);
428			_kse_initial->k_kcb->kcb_kmbx.km_flags |= KMF_BOUND;
429		}
430
431		/*
432		 * Locking functions in libc are required when there are
433		 * threads other than the initial thread.
434		 */
435		_thr_rtld_init();
436
437		__isthreaded = 1;
438		if (kse_create(&_kse_initial->k_kcb->kcb_kmbx, 0) != 0) {
439			_kse_initial->k_flags &= ~KF_STARTED;
440			__isthreaded = 0;
441			PANIC("kse_create() failed\n");
442			return (-1);
443		}
444		_thr_initial->tcb->tcb_tmbx.tm_lwp =
445			_kse_initial->k_kcb->kcb_kmbx.km_lwp;
446		_thread_activated = 1;
447
448#ifndef SYSTEM_SCOPE_ONLY
449		if (_thread_scope_system <= 0) {
450			/* Set current thread to initial thread */
451			_tcb_set(_kse_initial->k_kcb, _thr_initial->tcb);
452			KSE_SET_MBOX(_kse_initial, _thr_initial);
453			_thr_start_sig_daemon();
454			_thr_setmaxconcurrency();
455		}
456		else
457#endif
458			__sys_sigprocmask(SIG_SETMASK, &_thr_initial->sigmask,
459			    NULL);
460	}
461	return (0);
462}
463
464/*
465 * Lock wait and wakeup handlers for KSE locks.  These are only used by
466 * KSEs, and should never be used by threads.  KSE locks include the
467 * KSE group lock (used for locking the scheduling queue) and the
468 * kse_lock defined above.
469 *
470 * When a KSE lock attempt blocks, the entire KSE blocks allowing another
471 * KSE to run.  For the most part, it doesn't make much sense to try and
472 * schedule another thread because you need to lock the scheduling queue
473 * in order to do that.  And since the KSE lock is used to lock the scheduling
474 * queue, you would just end up blocking again.
475 */
476void
477_kse_lock_wait(struct lock *lock, struct lockuser *lu)
478{
479	struct kse *curkse = (struct kse *)_LCK_GET_PRIVATE(lu);
480	struct timespec ts;
481	int saved_flags;
482
483	if (curkse->k_kcb->kcb_kmbx.km_curthread != NULL)
484		PANIC("kse_lock_wait does not disable upcall.\n");
485	/*
486	 * Enter a loop to wait until we get the lock.
487	 */
488	ts.tv_sec = 0;
	ts.tv_nsec = 1000000;  /* 1 msec */
490	while (!_LCK_GRANTED(lu)) {
491		/*
492		 * Yield the kse and wait to be notified when the lock
493		 * is granted.
494		 */
495		saved_flags = curkse->k_kcb->kcb_kmbx.km_flags;
496		curkse->k_kcb->kcb_kmbx.km_flags |= KMF_NOUPCALL |
497		    KMF_NOCOMPLETED;
498		kse_release(&ts);
499		curkse->k_kcb->kcb_kmbx.km_flags = saved_flags;
500	}
501}
502
503void
504_kse_lock_wakeup(struct lock *lock, struct lockuser *lu)
505{
506	struct kse *curkse;
507	struct kse *kse;
508	struct kse_mailbox *mbx;
509
510	curkse = _get_curkse();
511	kse = (struct kse *)_LCK_GET_PRIVATE(lu);
512
513	if (kse == curkse)
514		PANIC("KSE trying to wake itself up in lock");
515	else {
516		mbx = &kse->k_kcb->kcb_kmbx;
517		_lock_grant(lock, lu);
		/*
		 * Notify the owning KSE that it has the lock.
		 * It is safe to pass an invalid address to kse_wakeup
		 * even if the mailbox is not in the kernel at all;
		 * waking up the wrong KSE is also harmless.
		 */
524		kse_wakeup(mbx);
525	}
526}
527
528/*
529 * Thread wait and wakeup handlers for thread locks.  These are only used
530 * by threads, never by KSEs.  Thread locks include the per-thread lock
531 * (defined in its structure), and condition variable and mutex locks.
532 */
533void
534_thr_lock_wait(struct lock *lock, struct lockuser *lu)
535{
536	struct pthread *curthread = (struct pthread *)lu->lu_private;
537
538	do {
539		THR_LOCK_SWITCH(curthread);
540		THR_SET_STATE(curthread, PS_LOCKWAIT);
541		_thr_sched_switch_unlocked(curthread);
542	} while (!_LCK_GRANTED(lu));
543}
544
545void
546_thr_lock_wakeup(struct lock *lock, struct lockuser *lu)
547{
548	struct pthread *thread;
549	struct pthread *curthread;
550	struct kse_mailbox *kmbx;
551
552	curthread = _get_curthread();
553	thread = (struct pthread *)_LCK_GET_PRIVATE(lu);
554
555	THR_SCHED_LOCK(curthread, thread);
556	_lock_grant(lock, lu);
557	kmbx = _thr_setrunnable_unlocked(thread);
558	THR_SCHED_UNLOCK(curthread, thread);
559	if (kmbx != NULL)
560		kse_wakeup(kmbx);
561}
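
/*
 * Presumably these are installed for the per-thread, mutex and
 * condition variable locks in the same way _kse_init() installs the
 * KSE handlers above, i.e. via something like
 *
 *	_lock_init(&lck, LCK_ADAPTIVE, _thr_lock_wait, _thr_lock_wakeup);
 *
 * where "lck" stands for whichever thread-level lock is being set up.
 */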
562
563kse_critical_t
564_kse_critical_enter(void)
565{
566	kse_critical_t crit;
567
568	crit = (kse_critical_t)_kcb_critical_enter();
569	return (crit);
570}
571
572void
573_kse_critical_leave(kse_critical_t crit)
574{
575	struct pthread *curthread;
576
577	_kcb_critical_leave((struct kse_thr_mailbox *)crit);
578	if ((crit != NULL) && ((curthread = _get_curthread()) != NULL))
579		THR_YIELD_CHECK(curthread);
580}
581
582int
583_kse_in_critical(void)
584{
585	return (_kcb_in_critical());
586}
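
/*
 * The usual pattern for the functions above, as used by the GC
 * routines later in this file, is:
 *
 *	crit = _kse_critical_enter();
 *	KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
 *	... manipulate the free/active KSE and KSEG lists ...
 *	KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
 *	_kse_critical_leave(crit);
 */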
587
588void
589_thr_critical_enter(struct pthread *thread)
590{
591	thread->critical_count++;
592}
593
594void
595_thr_critical_leave(struct pthread *thread)
596{
597	thread->critical_count--;
598	THR_YIELD_CHECK(thread);
599}
600
601void
602_thr_sched_switch(struct pthread *curthread)
603{
604	struct kse *curkse;
605
606	(void)_kse_critical_enter();
607	curkse = _get_curkse();
608	KSE_SCHED_LOCK(curkse, curkse->k_kseg);
609	_thr_sched_switch_unlocked(curthread);
610}
611
612/*
613 * XXX - We may need to take the scheduling lock before calling
614 *       this, or perhaps take the lock within here before
615 *       doing anything else.
616 */
617void
618_thr_sched_switch_unlocked(struct pthread *curthread)
619{
620	struct kse *curkse;
621	volatile int resume_once = 0;
622	ucontext_t *uc;
623
624	/* We're in the scheduler, 5 by 5: */
625	curkse = curthread->kse;
626
627	curthread->need_switchout = 1;	/* The thread yielded on its own. */
628	curthread->critical_yield = 0;	/* No need to yield anymore. */
629
630	/* Thread can unlock the scheduler lock. */
631	curthread->lock_switch = 1;
632
633	if (curthread->attr.flags & PTHREAD_SCOPE_SYSTEM)
634		kse_sched_single(&curkse->k_kcb->kcb_kmbx);
635	else {
636		if (__predict_false(_libkse_debug != 0)) {
			/*
			 * Because the debugger saves the single-step status
			 * in the thread mailbox's tm_dflags, we can safely
			 * clear the single-step status here.  It will be
			 * restored by kse_switchin when the thread is
			 * switched in again.  This also lets the UTS run at
			 * full speed.
			 */
645			 ptrace(PT_CLEARSTEP, curkse->k_kcb->kcb_kmbx.km_lwp,
646				(caddr_t) 1, 0);
647		}
648
649		KSE_SET_SWITCH(curkse);
650		_thread_enter_uts(curthread->tcb, curkse->k_kcb);
651	}
652
653	/*
654	 * Unlock the scheduling queue and leave the
655	 * critical region.
656	 */
657	/* Don't trust this after a switch! */
658	curkse = curthread->kse;
659
660	curthread->lock_switch = 0;
661	KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
662	_kse_critical_leave(&curthread->tcb->tcb_tmbx);
663
664	/*
665	 * This thread is being resumed; check for cancellations.
666	 */
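	/*
	 * The volatile resume_once flag guards against running the
	 * signal/cancellation rundown twice: THR_GETCONTEXT() records
	 * this point, so if execution is later resumed through the
	 * context saved here, it continues past the check with
	 * resume_once already set.
	 */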
667	if (THR_NEED_ASYNC_CANCEL(curthread) && !THR_IN_CRITICAL(curthread)) {
668		uc = alloca(sizeof(ucontext_t));
669		resume_once = 0;
670		THR_GETCONTEXT(uc);
671		if (resume_once == 0) {
672			resume_once = 1;
673			curthread->check_pending = 0;
674			thr_resume_check(curthread, uc);
675		}
676	}
677	THR_ACTIVATE_LAST_LOCK(curthread);
678}
679
680/*
681 * This is the scheduler for a KSE which runs a scope system thread.
682 * The multi-thread KSE scheduler should also work for a single threaded
683 * KSE, but we use a separate scheduler so that it can be fine-tuned
684 * to be more efficient (and perhaps not need a separate stack for
685 * the KSE, allowing it to use the thread's stack).
686 */
687
688static void
689kse_sched_single(struct kse_mailbox *kmbx)
690{
691	struct kse *curkse;
692	struct pthread *curthread;
693	struct timespec ts;
694	sigset_t sigmask;
695	int i, sigseqno, level, first = 0;
696
697	curkse = (struct kse *)kmbx->km_udata;
698	curthread = curkse->k_curthread;
699
700	if (__predict_false((curkse->k_flags & KF_INITIALIZED) == 0)) {
701		/* Setup this KSEs specific data. */
702		_kcb_set(curkse->k_kcb);
703		_tcb_set(curkse->k_kcb, curthread->tcb);
704		curkse->k_flags |= KF_INITIALIZED;
705		first = 1;
706		curthread->active = 1;
707
708		/* Setup kernel signal masks for new thread. */
709		__sys_sigprocmask(SIG_SETMASK, &curthread->sigmask, NULL);
		/*
		 * Enter a critical region.  This is meaningless for a bound
		 * thread, but it lets other code that expects the mailbox
		 * to be cleared work correctly.
		 */
715		(void)_kse_critical_enter();
716 	} else {
		/*
		 * A bound thread always has its tcb set; this prevents
		 * other (possibly buggy?) code from blindly setting the
		 * bound thread's tcb to NULL.
		 */
722		_tcb_set(curkse->k_kcb, curthread->tcb);
723	}
724
725	curthread->critical_yield = 0;
726	curthread->need_switchout = 0;
727
728	/*
729	 * Lock the scheduling queue.
730	 *
731	 * There is no scheduling queue for single threaded KSEs,
732	 * but we need a lock for protection regardless.
733	 */
734	if (curthread->lock_switch == 0)
735		KSE_SCHED_LOCK(curkse, curkse->k_kseg);
736
737	/*
738	 * This has to do the job of kse_switchout_thread(), only
739	 * for a single threaded KSE/KSEG.
740	 */
741
742	switch (curthread->state) {
743	case PS_MUTEX_WAIT:
744	case PS_COND_WAIT:
745		if (THR_NEED_CANCEL(curthread)) {
746			curthread->interrupted = 1;
747			curthread->continuation = _thr_finish_cancellation;
748			THR_SET_STATE(curthread, PS_RUNNING);
749		}
750		break;
751
752	case PS_LOCKWAIT:
753		/*
754		 * This state doesn't timeout.
755		 */
756		curthread->wakeup_time.tv_sec = -1;
757		curthread->wakeup_time.tv_nsec = -1;
758		level = curthread->locklevel - 1;
759		if (_LCK_GRANTED(&curthread->lockusers[level]))
760			THR_SET_STATE(curthread, PS_RUNNING);
761		break;
762
763	case PS_DEAD:
764		/* Unlock the scheduling queue and exit the KSE and thread. */
765		thr_cleanup(curkse, curthread);
766		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
767		PANIC("bound thread shouldn't get here\n");
768		break;
769
770	case PS_JOIN:
771		if (THR_NEED_CANCEL(curthread)) {
772			curthread->join_status.thread = NULL;
773			THR_SET_STATE(curthread, PS_RUNNING);
774		} else {
775			/*
776			 * This state doesn't timeout.
777			 */
778			curthread->wakeup_time.tv_sec = -1;
779			curthread->wakeup_time.tv_nsec = -1;
780		}
781		break;
782
783	case PS_SUSPENDED:
784		if (THR_NEED_CANCEL(curthread)) {
785			curthread->interrupted = 1;
786			THR_SET_STATE(curthread, PS_RUNNING);
787		} else {
788			/*
789			 * These states don't timeout.
790			 */
791			curthread->wakeup_time.tv_sec = -1;
792			curthread->wakeup_time.tv_nsec = -1;
793		}
794		break;
795
796	case PS_RUNNING:
797		if ((curthread->flags & THR_FLAGS_SUSPENDED) != 0 &&
798		    !THR_NEED_CANCEL(curthread)) {
799			THR_SET_STATE(curthread, PS_SUSPENDED);
800			/*
801			 * These states don't timeout.
802			 */
803			curthread->wakeup_time.tv_sec = -1;
804			curthread->wakeup_time.tv_nsec = -1;
805		}
806		break;
807
808	case PS_SIGWAIT:
809		PANIC("bound thread does not have SIGWAIT state\n");
810
811	case PS_SLEEP_WAIT:
812		PANIC("bound thread does not have SLEEP_WAIT state\n");
813
814	case PS_SIGSUSPEND:
815		PANIC("bound thread does not have SIGSUSPEND state\n");
816
817	case PS_DEADLOCK:
818		/*
819		 * These states don't timeout and don't need
820		 * to be in the waiting queue.
821		 */
822		curthread->wakeup_time.tv_sec = -1;
823		curthread->wakeup_time.tv_nsec = -1;
824		break;
825
826	default:
827		PANIC("Unknown state\n");
828		break;
829	}
830
831	while (curthread->state != PS_RUNNING) {
832		sigseqno = curkse->k_sigseqno;
833		if (curthread->check_pending != 0) {
			/*
			 * Install pending signals into the frame; this may
			 * cause a mutex or condvar backout.
			 */
838			curthread->check_pending = 0;
839			SIGFILLSET(sigmask);
840
841			/*
842			 * Lock out kernel signal code when we are processing
843			 * signals, and get a fresh copy of signal mask.
844			 */
845			__sys_sigprocmask(SIG_SETMASK, &sigmask,
846					  &curthread->sigmask);
847			for (i = 1; i <= _SIG_MAXSIG; i++) {
848				if (SIGISMEMBER(curthread->sigmask, i))
849					continue;
850				if (SIGISMEMBER(curthread->sigpend, i))
851					(void)_thr_sig_add(curthread, i,
852					    &curthread->siginfo[i-1]);
853			}
854			__sys_sigprocmask(SIG_SETMASK, &curthread->sigmask,
855				NULL);
856			/* The above code might make thread runnable */
857			if (curthread->state == PS_RUNNING)
858				break;
859		}
860		THR_DEACTIVATE_LAST_LOCK(curthread);
861		kse_wait(curkse, curthread, sigseqno);
862		THR_ACTIVATE_LAST_LOCK(curthread);
863		if (curthread->wakeup_time.tv_sec >= 0) {
864			KSE_GET_TOD(curkse, &ts);
865			if (thr_timedout(curthread, &ts)) {
866				/* Indicate the thread timedout: */
867				curthread->timeout = 1;
868				/* Make the thread runnable. */
869				THR_SET_STATE(curthread, PS_RUNNING);
870			}
871		}
872	}
873
874	if (curthread->lock_switch == 0) {
875		/* Unlock the scheduling queue. */
876		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
877	}
878
879	DBG_MSG("Continuing bound thread %p\n", curthread);
880	if (first) {
881		_kse_critical_leave(&curthread->tcb->tcb_tmbx);
882		pthread_exit(curthread->start_routine(curthread->arg));
883	}
884}
885
886#ifdef DEBUG_THREAD_KERN
887static void
888dump_queues(struct kse *curkse)
889{
890	struct pthread *thread;
891
892	DBG_MSG("Threads in waiting queue:\n");
893	TAILQ_FOREACH(thread, &curkse->k_kseg->kg_schedq.sq_waitq, pqe) {
894		DBG_MSG("  thread %p, state %d, blocked %d\n",
895		    thread, thread->state, thread->blocked);
896	}
897}
898#endif
899
900/*
901 * This is the scheduler for a KSE which runs multiple threads.
902 */
903static void
904kse_sched_multi(struct kse_mailbox *kmbx)
905{
906	struct kse *curkse;
907	struct pthread *curthread, *td_wait;
908	int ret;
909
910	curkse = (struct kse *)kmbx->km_udata;
911	THR_ASSERT(curkse->k_kcb->kcb_kmbx.km_curthread == NULL,
912	    "Mailbox not null in kse_sched_multi");
913
914	/* Check for first time initialization: */
915	if (__predict_false((curkse->k_flags & KF_INITIALIZED) == 0)) {
916		/* Setup this KSEs specific data. */
917		_kcb_set(curkse->k_kcb);
918
919		/* Set this before grabbing the context. */
920		curkse->k_flags |= KF_INITIALIZED;
921	}
922
	/*
	 * There is no current thread anymore; calling _get_curthread
	 * in the UTS should dump core.
	 */
927	_tcb_set(curkse->k_kcb, NULL);
928
929	/* If this is an upcall; take the scheduler lock. */
930	if (!KSE_IS_SWITCH(curkse))
931		KSE_SCHED_LOCK(curkse, curkse->k_kseg);
932	else
933		KSE_CLEAR_SWITCH(curkse);
934
935	if (KSE_IS_IDLE(curkse)) {
936		KSE_CLEAR_IDLE(curkse);
937		curkse->k_kseg->kg_idle_kses--;
938	}
939
940	/*
941	 * Now that the scheduler lock is held, get the current
942	 * thread.  The KSE's current thread cannot be safely
943	 * examined without the lock because it could have returned
944	 * as completed on another KSE.  See kse_check_completed().
945	 */
946	curthread = curkse->k_curthread;
947
948	/*
949	 * If the current thread was completed in another KSE, then
950	 * it will be in the run queue.  Don't mark it as being blocked.
951	 */
952	if ((curthread != NULL) &&
953	    ((curthread->flags & THR_FLAGS_IN_RUNQ) == 0) &&
954	    (curthread->need_switchout == 0)) {
955		/*
956		 * Assume the current thread is blocked; when the
957		 * completed threads are checked and if the current
958		 * thread is among the completed, the blocked flag
959		 * will be cleared.
960		 */
961		curthread->blocked = 1;
962		DBG_MSG("Running thread %p is now blocked in kernel.\n",
963		    curthread);
964	}
965
966	/* Check for any unblocked threads in the kernel. */
967	kse_check_completed(curkse);
968
969	/*
970	 * Check for threads that have timed-out.
971	 */
972	kse_check_waitq(curkse);
973
974	/*
975	 * Switchout the current thread, if necessary, as the last step
976	 * so that it is inserted into the run queue (if it's runnable)
977	 * _after_ any other threads that were added to it above.
978	 */
979	if (curthread == NULL)
980		;  /* Nothing to do here. */
981	else if ((curthread->need_switchout == 0) && DBG_CAN_RUN(curthread) &&
982	    (curthread->blocked == 0) && (THR_IN_CRITICAL(curthread))) {
983		/*
984		 * Resume the thread and tell it to yield when
985		 * it leaves the critical region.
986		 */
987		curthread->critical_yield = 1;
988		curthread->active = 1;
989		if ((curthread->flags & THR_FLAGS_IN_RUNQ) != 0)
990			KSE_RUNQ_REMOVE(curkse, curthread);
991		curkse->k_curthread = curthread;
992		curthread->kse = curkse;
993		DBG_MSG("Continuing thread %p in critical region\n",
994		    curthread);
995		kse_wakeup_multi(curkse);
996		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
997		ret = _thread_switch(curkse->k_kcb, curthread->tcb, 1);
998		if (ret != 0)
999			PANIC("Can't resume thread in critical region\n");
1000	}
1001	else if ((curthread->flags & THR_FLAGS_IN_RUNQ) == 0) {
1002		curthread->tcb->tcb_tmbx.tm_lwp = 0;
1003		kse_switchout_thread(curkse, curthread);
1004	}
1005	curkse->k_curthread = NULL;
1006
1007#ifdef DEBUG_THREAD_KERN
1008	dump_queues(curkse);
1009#endif
1010
1011	/* Check if there are no threads ready to run: */
1012	while (((curthread = KSE_RUNQ_FIRST(curkse)) == NULL) &&
1013	    (curkse->k_kseg->kg_threadcount != 0) &&
1014	    ((curkse->k_flags & KF_TERMINATED) == 0)) {
1015		/*
1016		 * Wait for a thread to become active or until there are
1017		 * no more threads.
1018		 */
1019		td_wait = KSE_WAITQ_FIRST(curkse);
1020		kse_wait(curkse, td_wait, 0);
1021		kse_check_completed(curkse);
1022		kse_check_waitq(curkse);
1023	}
1024
1025	/* Check for no more threads: */
1026	if ((curkse->k_kseg->kg_threadcount == 0) ||
1027	    ((curkse->k_flags & KF_TERMINATED) != 0)) {
1028		/*
1029		 * Normally this shouldn't return, but it will if there
1030		 * are other KSEs running that create new threads that
1031		 * are assigned to this KSE[G].  For instance, if a scope
1032		 * system thread were to create a scope process thread
1033		 * and this kse[g] is the initial kse[g], then that newly
1034		 * created thread would be assigned to us (the initial
1035		 * kse[g]).
1036		 */
1037		kse_wakeup_multi(curkse);
1038		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
1039		kse_fini(curkse);
1040		/* never returns */
1041	}
1042
1043	THR_ASSERT(curthread != NULL,
1044	    "Return from kse_wait/fini without thread.");
1045	THR_ASSERT(curthread->state != PS_DEAD,
1046	    "Trying to resume dead thread!");
1047	KSE_RUNQ_REMOVE(curkse, curthread);
1048
1049	/*
1050	 * Make the selected thread the current thread.
1051	 */
1052	curkse->k_curthread = curthread;
1053
1054	/*
1055	 * Make sure the current thread's kse points to this kse.
1056	 */
1057	curthread->kse = curkse;
1058
1059	/*
1060	 * Reset the time slice if this thread is running for the first
1061	 * time or running again after using its full time slice allocation.
1062	 */
1063	if (curthread->slice_usec == -1)
1064		curthread->slice_usec = 0;
1065
1066	/* Mark the thread active. */
1067	curthread->active = 1;
1068
1069	/*
1070	 * The thread's current signal frame will only be NULL if it
1071	 * is being resumed after being blocked in the kernel.  In
1072	 * this case, and if the thread needs to run down pending
1073	 * signals or needs a cancellation check, we need to add a
1074	 * signal frame to the thread's context.
1075	 */
1076	if (curthread->lock_switch == 0 && curthread->state == PS_RUNNING &&
1077	    (curthread->check_pending != 0 ||
1078	     THR_NEED_ASYNC_CANCEL(curthread)) &&
1079	    !THR_IN_CRITICAL(curthread)) {
1080		curthread->check_pending = 0;
1081		signalcontext(&curthread->tcb->tcb_tmbx.tm_context, 0,
1082		    (__sighandler_t *)thr_resume_wrapper);
1083	}
1084	kse_wakeup_multi(curkse);
1085	/*
1086	 * Continue the thread at its current frame:
1087	 */
1088	if (curthread->lock_switch != 0) {
1089		/*
1090		 * This thread came from a scheduler switch; it will
1091		 * unlock the scheduler lock and set the mailbox.
1092		 */
1093		ret = _thread_switch(curkse->k_kcb, curthread->tcb, 0);
1094	} else {
1095		/* This thread won't unlock the scheduler lock. */
1096		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
1097		ret = _thread_switch(curkse->k_kcb, curthread->tcb, 1);
1098	}
1099	if (ret != 0)
1100		PANIC("Thread has returned from _thread_switch");
1101
1102	/* This point should not be reached. */
1103	PANIC("Thread has returned from _thread_switch");
1104}
1105
1106static void
1107thr_resume_wrapper(int sig, siginfo_t *siginfo, ucontext_t *ucp)
1108{
1109	struct pthread *curthread = _get_curthread();
1110	struct kse *curkse;
1111	int ret, err_save = errno;
1112
1113	DBG_MSG(">>> sig wrapper\n");
1114	if (curthread->lock_switch)
1115		PANIC("thr_resume_wrapper, lock_switch != 0\n");
1116	thr_resume_check(curthread, ucp);
1117	errno = err_save;
1118	_kse_critical_enter();
1119	curkse = curthread->kse;
1120	curthread->tcb->tcb_tmbx.tm_context = *ucp;
1121	ret = _thread_switch(curkse->k_kcb, curthread->tcb, 1);
1122	if (ret != 0)
1123		PANIC("thr_resume_wrapper: thread has returned "
1124		      "from _thread_switch");
	/* THR_SETCONTEXT(ucp); */	/* doesn't work, why? */
1126}
1127
1128static void
1129thr_resume_check(struct pthread *curthread, ucontext_t *ucp)
1130{
1131	_thr_sig_rundown(curthread, ucp);
1132
1133	if (THR_NEED_ASYNC_CANCEL(curthread))
1134		pthread_testcancel();
1135}
1136
1137/*
1138 * Clean up a thread.  This must be called with the thread's KSE
1139 * scheduling lock held.  The thread must be a thread from the
1140 * KSE's group.
1141 */
1142static void
1143thr_cleanup(struct kse *curkse, struct pthread *thread)
1144{
1145	struct pthread *joiner;
1146	struct kse_mailbox *kmbx = NULL;
1147	int sys_scope;
1148
1149	thread->active = 0;
1150	thread->need_switchout = 0;
1151	thread->lock_switch = 0;
1152	thread->check_pending = 0;
1153
1154	if ((joiner = thread->joiner) != NULL) {
1155		/* Joinee scheduler lock held; joiner won't leave. */
1156		if (joiner->kseg == curkse->k_kseg) {
1157			if (joiner->join_status.thread == thread) {
1158				joiner->join_status.thread = NULL;
1159				joiner->join_status.ret = thread->ret;
1160				(void)_thr_setrunnable_unlocked(joiner);
1161			}
1162		} else {
1163			KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
1164			/* The joiner may have removed itself and exited. */
1165			if (_thr_ref_add(thread, joiner, 0) == 0) {
1166				KSE_SCHED_LOCK(curkse, joiner->kseg);
1167				if (joiner->join_status.thread == thread) {
1168					joiner->join_status.thread = NULL;
1169					joiner->join_status.ret = thread->ret;
1170					kmbx = _thr_setrunnable_unlocked(joiner);
1171				}
1172				KSE_SCHED_UNLOCK(curkse, joiner->kseg);
1173				_thr_ref_delete(thread, joiner);
1174				if (kmbx != NULL)
1175					kse_wakeup(kmbx);
1176			}
1177			KSE_SCHED_LOCK(curkse, curkse->k_kseg);
1178		}
1179		thread->attr.flags |= PTHREAD_DETACHED;
1180	}
1181
1182	if (!(sys_scope = (thread->attr.flags & PTHREAD_SCOPE_SYSTEM))) {
1183		/*
1184		 * Remove the thread from the KSEG's list of threads.
1185	 	 */
1186		KSEG_THRQ_REMOVE(thread->kseg, thread);
1187		/*
1188		 * Migrate the thread to the main KSE so that this
1189		 * KSE and KSEG can be cleaned when their last thread
1190		 * exits.
1191		 */
1192		thread->kseg = _kse_initial->k_kseg;
1193		thread->kse = _kse_initial;
1194	}
1195
1196	/*
1197	 * We can't hold the thread list lock while holding the
1198	 * scheduler lock.
1199	 */
1200	KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
1201	DBG_MSG("Adding thread %p to GC list\n", thread);
1202	KSE_LOCK_ACQUIRE(curkse, &_thread_list_lock);
1203	thread->tlflags |= TLFLAGS_GC_SAFE;
1204	THR_GCLIST_ADD(thread);
1205	KSE_LOCK_RELEASE(curkse, &_thread_list_lock);
1206	if (sys_scope) {
		/*
		 * A system scope thread has its own KSE group; when the
		 * thread exits, its kse and ksegrp should be recycled
		 * as well.
		 * The kse upcall stack belongs to the thread, so clear
		 * it here.
		 */
1213		curkse->k_stack.ss_sp = 0;
1214		curkse->k_stack.ss_size = 0;
1215		kse_exit();
1216		PANIC("kse_exit() failed for system scope thread");
1217	}
1218	KSE_SCHED_LOCK(curkse, curkse->k_kseg);
1219}
1220
1221void
1222_thr_gc(struct pthread *curthread)
1223{
1224	thread_gc(curthread);
1225	kse_gc(curthread);
1226	kseg_gc(curthread);
1227}
1228
1229static void
1230thread_gc(struct pthread *curthread)
1231{
1232	struct pthread *td, *td_next;
1233	kse_critical_t crit;
1234	TAILQ_HEAD(, pthread) worklist;
1235
1236	TAILQ_INIT(&worklist);
1237	crit = _kse_critical_enter();
1238	KSE_LOCK_ACQUIRE(curthread->kse, &_thread_list_lock);
1239
1240	/* Check the threads waiting for GC. */
1241	for (td = TAILQ_FIRST(&_thread_gc_list); td != NULL; td = td_next) {
1242		td_next = TAILQ_NEXT(td, gcle);
1243		if ((td->tlflags & TLFLAGS_GC_SAFE) == 0)
1244			continue;
1245		else if (((td->attr.flags & PTHREAD_SCOPE_SYSTEM) != 0) &&
1246		    ((td->kse->k_kcb->kcb_kmbx.km_flags & KMF_DONE) == 0)) {
1247			/*
1248			 * The thread and KSE are operating on the same
1249			 * stack.  Wait for the KSE to exit before freeing
1250			 * the thread's stack as well as everything else.
1251			 */
1252			continue;
1253		}
1254		/*
1255		 * Remove the thread from the GC list.  If the thread
1256		 * isn't yet detached, it will get added back to the
1257		 * GC list at a later time.
1258		 */
1259		THR_GCLIST_REMOVE(td);
1260		DBG_MSG("Freeing thread %p stack\n", td);
1261		/*
1262		 * We can free the thread stack since it's no longer
1263		 * in use.
1264		 */
1265		_thr_stack_free(&td->attr);
1266		if (((td->attr.flags & PTHREAD_DETACHED) != 0) &&
1267		    (td->refcount == 0)) {
1268			/*
1269			 * The thread has detached and is no longer
1270			 * referenced.  It is safe to remove all
1271			 * remnants of the thread.
1272			 */
1273			THR_LIST_REMOVE(td);
1274			TAILQ_INSERT_HEAD(&worklist, td, gcle);
1275		}
1276	}
1277	KSE_LOCK_RELEASE(curthread->kse, &_thread_list_lock);
1278	_kse_critical_leave(crit);
1279
1280	while ((td = TAILQ_FIRST(&worklist)) != NULL) {
1281		TAILQ_REMOVE(&worklist, td, gcle);
		/*
		 * XXX we don't free the initial thread and its kse
		 * (if the thread is a bound thread), because some code
		 * might still be referencing the initial thread and kse.
		 */
1287		if (td == _thr_initial) {
1288			DBG_MSG("Initial thread won't be freed\n");
1289			continue;
1290		}
1291
1292		if ((td->attr.flags & PTHREAD_SCOPE_SYSTEM) != 0) {
1293			crit = _kse_critical_enter();
1294			KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
1295			kse_free_unlocked(td->kse);
1296			kseg_free_unlocked(td->kseg);
1297			KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
1298			_kse_critical_leave(crit);
1299		}
1300		DBG_MSG("Freeing thread %p\n", td);
1301		_thr_free(curthread, td);
1302	}
1303}
1304
1305static void
1306kse_gc(struct pthread *curthread)
1307{
1308	kse_critical_t crit;
1309	TAILQ_HEAD(, kse) worklist;
1310	struct kse *kse;
1311
1312	if (free_kse_count <= MAX_CACHED_KSES)
1313		return;
1314	TAILQ_INIT(&worklist);
1315	crit = _kse_critical_enter();
1316	KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
1317	while (free_kse_count > MAX_CACHED_KSES) {
1318		kse = TAILQ_FIRST(&free_kseq);
1319		TAILQ_REMOVE(&free_kseq, kse, k_qe);
1320		TAILQ_INSERT_HEAD(&worklist, kse, k_qe);
1321		free_kse_count--;
1322	}
1323	KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
1324	_kse_critical_leave(crit);
1325
1326	while ((kse = TAILQ_FIRST(&worklist))) {
1327		TAILQ_REMOVE(&worklist, kse, k_qe);
1328		kse_destroy(kse);
1329	}
1330}
1331
1332static void
1333kseg_gc(struct pthread *curthread)
1334{
1335	kse_critical_t crit;
1336	TAILQ_HEAD(, kse_group) worklist;
1337	struct kse_group *kseg;
1338
1339	if (free_kseg_count <= MAX_CACHED_KSEGS)
1340		return;
1341	TAILQ_INIT(&worklist);
1342	crit = _kse_critical_enter();
1343	KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
1344	while (free_kseg_count > MAX_CACHED_KSEGS) {
1345		kseg = TAILQ_FIRST(&free_kse_groupq);
1346		TAILQ_REMOVE(&free_kse_groupq, kseg, kg_qe);
1347		free_kseg_count--;
1348		TAILQ_INSERT_HEAD(&worklist, kseg, kg_qe);
1349	}
1350	KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
1351	_kse_critical_leave(crit);
1352
1353	while ((kseg = TAILQ_FIRST(&worklist))) {
1354		TAILQ_REMOVE(&worklist, kseg, kg_qe);
1355		kseg_destroy(kseg);
1356	}
1357}
1358
1359/*
1360 * Only new threads that are running or suspended may be scheduled.
1361 */
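/*
 * A PTHREAD_SCOPE_SYSTEM thread gets its own bound KSE/KSEG pair, runs
 * kse_sched_single() on the thread's own stack and is started with
 * kse_create(..., 1); a process scope thread is linked into an existing
 * KSEG, placed on its run queue, and the shared KSE running
 * kse_sched_multi() is started or woken up as needed.
 */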
1362int
1363_thr_schedule_add(struct pthread *curthread, struct pthread *newthread)
1364{
1365	kse_critical_t crit;
1366	int ret;
1367
1368	/* Add the new thread. */
1369	thr_link(newthread);
1370
1371	/*
1372	 * If this is the first time creating a thread, make sure
1373	 * the mailbox is set for the current thread.
1374	 */
1375	if ((newthread->attr.flags & PTHREAD_SCOPE_SYSTEM) != 0) {
1376		/* We use the thread's stack as the KSE's stack. */
1377		newthread->kse->k_kcb->kcb_kmbx.km_stack.ss_sp =
1378		    newthread->attr.stackaddr_attr;
1379		newthread->kse->k_kcb->kcb_kmbx.km_stack.ss_size =
1380		    newthread->attr.stacksize_attr;
1381
1382		/*
1383		 * No need to lock the scheduling queue since the
1384		 * KSE/KSEG pair have not yet been started.
1385		 */
1386		KSEG_THRQ_ADD(newthread->kseg, newthread);
1387		/* this thread never gives up kse */
1388		newthread->active = 1;
1389		newthread->kse->k_curthread = newthread;
1390		newthread->kse->k_kcb->kcb_kmbx.km_flags = KMF_BOUND;
1391		newthread->kse->k_kcb->kcb_kmbx.km_func =
1392		    (kse_func_t *)kse_sched_single;
1393		newthread->kse->k_kcb->kcb_kmbx.km_quantum = 0;
1394		KSE_SET_MBOX(newthread->kse, newthread);
1395		/*
1396		 * This thread needs a new KSE and KSEG.
1397		 */
1398		newthread->kse->k_flags &= ~KF_INITIALIZED;
1399		newthread->kse->k_flags |= KF_STARTED;
1400		/* Fire up! */
1401		ret = kse_create(&newthread->kse->k_kcb->kcb_kmbx, 1);
1402		if (ret != 0)
1403			ret = errno;
1404	}
1405	else {
1406		/*
1407		 * Lock the KSE and add the new thread to its list of
1408		 * assigned threads.  If the new thread is runnable, also
1409		 * add it to the KSE's run queue.
1410		 */
1411		crit = _kse_critical_enter();
1412		KSE_SCHED_LOCK(curthread->kse, newthread->kseg);
1413		KSEG_THRQ_ADD(newthread->kseg, newthread);
1414		if (newthread->state == PS_RUNNING)
1415			THR_RUNQ_INSERT_TAIL(newthread);
1416		if ((newthread->kse->k_flags & KF_STARTED) == 0) {
1417			/*
1418			 * This KSE hasn't been started yet.  Start it
1419			 * outside of holding the lock.
1420			 */
1421			newthread->kse->k_flags |= KF_STARTED;
1422			newthread->kse->k_kcb->kcb_kmbx.km_func =
1423			    (kse_func_t *)kse_sched_multi;
1424			newthread->kse->k_kcb->kcb_kmbx.km_flags = 0;
1425			kse_create(&newthread->kse->k_kcb->kcb_kmbx, 0);
1426		 } else if ((newthread->state == PS_RUNNING) &&
1427		     KSE_IS_IDLE(newthread->kse)) {
1428			/*
1429			 * The thread is being scheduled on another KSEG.
1430			 */
1431			kse_wakeup_one(newthread);
1432		}
1433		KSE_SCHED_UNLOCK(curthread->kse, newthread->kseg);
1434		_kse_critical_leave(crit);
1435		ret = 0;
1436	}
1437	if (ret != 0)
1438		thr_unlink(newthread);
1439
1440	return (ret);
1441}
1442
1443void
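/*
 * Insert a thread into its KSE's wait queue, keeping the queue sorted
 * by increasing wakeup time; threads that never time out
 * (wakeup_time.tv_sec == -1) are collected at the tail, so that
 * kse_check_waitq() only has to examine the head for expired timeouts.
 */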
1444kse_waitq_insert(struct pthread *thread)
1445{
1446	struct pthread *td;
1447
1448	if (thread->wakeup_time.tv_sec == -1)
1449		TAILQ_INSERT_TAIL(&thread->kse->k_schedq->sq_waitq, thread,
1450		    pqe);
1451	else {
1452		td = TAILQ_FIRST(&thread->kse->k_schedq->sq_waitq);
1453		while ((td != NULL) && (td->wakeup_time.tv_sec != -1) &&
1454		    ((td->wakeup_time.tv_sec < thread->wakeup_time.tv_sec) ||
1455		    ((td->wakeup_time.tv_sec == thread->wakeup_time.tv_sec) &&
1456		    (td->wakeup_time.tv_nsec <= thread->wakeup_time.tv_nsec))))
1457			td = TAILQ_NEXT(td, pqe);
1458		if (td == NULL)
1459			TAILQ_INSERT_TAIL(&thread->kse->k_schedq->sq_waitq,
1460			    thread, pqe);
1461		else
1462			TAILQ_INSERT_BEFORE(td, thread, pqe);
1463	}
1464	thread->flags |= THR_FLAGS_IN_WAITQ;
1465}
1466
1467/*
1468 * This must be called with the scheduling lock held.
1469 */
1470static void
1471kse_check_completed(struct kse *kse)
1472{
1473	struct pthread *thread;
1474	struct kse_thr_mailbox *completed;
1475	int sig;
1476
1477	if ((completed = kse->k_kcb->kcb_kmbx.km_completed) != NULL) {
1478		kse->k_kcb->kcb_kmbx.km_completed = NULL;
1479		while (completed != NULL) {
1480			thread = completed->tm_udata;
1481			DBG_MSG("Found completed thread %p, name %s\n",
1482			    thread,
1483			    (thread->name == NULL) ? "none" : thread->name);
1484			thread->blocked = 0;
1485			if (thread != kse->k_curthread) {
1486				thr_accounting(thread);
1487				if ((thread->flags & THR_FLAGS_SUSPENDED) != 0)
1488					THR_SET_STATE(thread, PS_SUSPENDED);
1489				else
1490					KSE_RUNQ_INSERT_TAIL(kse, thread);
1491				if ((thread->kse != kse) &&
1492				    (thread->kse->k_curthread == thread)) {
1493					/*
1494					 * Remove this thread from its
1495					 * previous KSE so that it (the KSE)
1496					 * doesn't think it is still active.
1497					 */
1498					thread->kse->k_curthread = NULL;
1499					thread->active = 0;
1500				}
1501			}
1502			if ((sig = thread->tcb->tcb_tmbx.tm_syncsig.si_signo)
1503			    != 0) {
1504				if (SIGISMEMBER(thread->sigmask, sig))
1505					SIGADDSET(thread->sigpend, sig);
1506				else if (THR_IN_CRITICAL(thread))
1507					kse_thr_interrupt(NULL, KSE_INTR_SIGEXIT, sig);
1508				else
1509					(void)_thr_sig_add(thread, sig,
1510					    &thread->tcb->tcb_tmbx.tm_syncsig);
1511				thread->tcb->tcb_tmbx.tm_syncsig.si_signo = 0;
1512			}
1513			completed = completed->tm_next;
1514		}
1515	}
1516}
1517
1518/*
1519 * This must be called with the scheduling lock held.
1520 */
1521static void
1522kse_check_waitq(struct kse *kse)
1523{
1524	struct pthread	*pthread;
1525	struct timespec ts;
1526
1527	KSE_GET_TOD(kse, &ts);
1528
1529	/*
1530	 * Wake up threads that have timedout.  This has to be
1531	 * done before adding the current thread to the run queue
1532	 * so that a CPU intensive thread doesn't get preference
1533	 * over waiting threads.
1534	 */
1535	while (((pthread = KSE_WAITQ_FIRST(kse)) != NULL) &&
1536	    thr_timedout(pthread, &ts)) {
1537		/* Remove the thread from the wait queue: */
1538		KSE_WAITQ_REMOVE(kse, pthread);
1539		DBG_MSG("Found timedout thread %p in waitq\n", pthread);
1540
1541		/* Indicate the thread timedout: */
1542		pthread->timeout = 1;
1543
1544		/* Add the thread to the priority queue: */
1545		if ((pthread->flags & THR_FLAGS_SUSPENDED) != 0)
1546			THR_SET_STATE(pthread, PS_SUSPENDED);
1547		else {
1548			THR_SET_STATE(pthread, PS_RUNNING);
1549			KSE_RUNQ_INSERT_TAIL(kse, pthread);
1550		}
1551	}
1552}
1553
1554static int
1555thr_timedout(struct pthread *thread, struct timespec *curtime)
1556{
1557	if (thread->wakeup_time.tv_sec < 0)
1558		return (0);
1559	else if (thread->wakeup_time.tv_sec > curtime->tv_sec)
1560		return (0);
1561	else if ((thread->wakeup_time.tv_sec == curtime->tv_sec) &&
1562	    (thread->wakeup_time.tv_nsec > curtime->tv_nsec))
1563		return (0);
1564	else
1565		return (1);
1566}
1567
1568/*
1569 * This must be called with the scheduling lock held.
1570 *
1571 * Each thread has a time slice, a wakeup time (used when it wants
1572 * to wait for a specified amount of time), a run state, and an
1573 * active flag.
1574 *
1575 * When a thread gets run by the scheduler, the active flag is
1576 * set to non-zero (1).  When a thread performs an explicit yield
1577 * or schedules a state change, it enters the scheduler and the
1578 * active flag is cleared.  When the active flag is still seen
1579 * set in the scheduler, that means that the thread is blocked in
1580 * the kernel (because it is cleared before entering the scheduler
1581 * in all other instances).
1582 *
1583 * The wakeup time is only set for those states that can timeout.
1584 * It is set to (-1, -1) for all other instances.
1585 *
1586 * The thread's run state, aside from being useful when debugging,
1587 * is used to place the thread in an appropriate queue.  There
1588 * are 2 basic queues:
1589 *
1590 *   o run queue - queue ordered by priority for all threads
1591 *                 that are runnable
1592 *   o waiting queue - queue sorted by wakeup time for all threads
1593 *                     that are not otherwise runnable (not blocked
1594 *                     in kernel, not waiting for locks)
1595 *
1596 * The thread's time slice is used for round-robin scheduling
 * (the default scheduling policy).  While a SCHED_RR thread
 * is runnable, its time slice accumulates.  When it reaches
 * the time slice interval, it gets reset and the thread is added
 * to the end of the queue of threads at its priority.  When a
 * thread is no longer runnable (blocks in kernel, waits, etc), its
1602 * time slice is reset.
1603 *
1604 * The job of kse_switchout_thread() is to handle all of the above.
1605 */
1606static void
1607kse_switchout_thread(struct kse *kse, struct pthread *thread)
1608{
1609	int level;
1610	int i;
1611	int restart;
1612	siginfo_t siginfo;
1613
1614	/*
1615	 * Place the currently running thread into the
1616	 * appropriate queue(s).
1617	 */
1618	DBG_MSG("Switching out thread %p, state %d\n", thread, thread->state);
1619
1620	THR_DEACTIVATE_LAST_LOCK(thread);
1621	if (thread->blocked != 0) {
1622		thread->active = 0;
1623		thread->need_switchout = 0;
1624		/* This thread must have blocked in the kernel. */
1625		/*
1626		 * Check for pending signals and cancellation for
1627		 * this thread to see if we need to interrupt it
1628		 * in the kernel.
1629		 */
1630		if (THR_NEED_CANCEL(thread)) {
1631			kse_thr_interrupt(&thread->tcb->tcb_tmbx,
1632					  KSE_INTR_INTERRUPT, 0);
1633		} else if (thread->check_pending != 0) {
1634			for (i = 1; i <= _SIG_MAXSIG; ++i) {
1635				if (SIGISMEMBER(thread->sigpend, i) &&
1636				    !SIGISMEMBER(thread->sigmask, i)) {
1637					restart = _thread_sigact[i - 1].sa_flags & SA_RESTART;
1638					kse_thr_interrupt(&thread->tcb->tcb_tmbx,
1639					    restart ? KSE_INTR_RESTART : KSE_INTR_INTERRUPT, 0);
1640					break;
1641				}
1642			}
1643		}
1644	}
1645	else {
1646		switch (thread->state) {
1647		case PS_MUTEX_WAIT:
1648		case PS_COND_WAIT:
1649			if (THR_NEED_CANCEL(thread)) {
1650				thread->interrupted = 1;
1651				thread->continuation = _thr_finish_cancellation;
1652				THR_SET_STATE(thread, PS_RUNNING);
1653			} else {
1654				/* Insert into the waiting queue: */
1655				KSE_WAITQ_INSERT(kse, thread);
1656			}
1657			break;
1658
1659		case PS_LOCKWAIT:
1660			/*
1661			 * This state doesn't timeout.
1662			 */
1663			thread->wakeup_time.tv_sec = -1;
1664			thread->wakeup_time.tv_nsec = -1;
1665			level = thread->locklevel - 1;
1666			if (!_LCK_GRANTED(&thread->lockusers[level]))
1667				KSE_WAITQ_INSERT(kse, thread);
1668			else
1669				THR_SET_STATE(thread, PS_RUNNING);
1670			break;
1671
1672		case PS_SLEEP_WAIT:
1673		case PS_SIGWAIT:
1674			if (THR_NEED_CANCEL(thread)) {
1675				thread->interrupted = 1;
1676				THR_SET_STATE(thread, PS_RUNNING);
1677			} else {
1678				KSE_WAITQ_INSERT(kse, thread);
1679			}
1680			break;
1681
1682		case PS_JOIN:
1683			if (THR_NEED_CANCEL(thread)) {
1684				thread->join_status.thread = NULL;
1685				THR_SET_STATE(thread, PS_RUNNING);
1686			} else {
1687				/*
1688				 * This state doesn't timeout.
1689				 */
1690				thread->wakeup_time.tv_sec = -1;
1691				thread->wakeup_time.tv_nsec = -1;
1692
1693				/* Insert into the waiting queue: */
1694				KSE_WAITQ_INSERT(kse, thread);
1695			}
1696			break;
1697
1698		case PS_SIGSUSPEND:
1699		case PS_SUSPENDED:
1700			if (THR_NEED_CANCEL(thread)) {
1701				thread->interrupted = 1;
1702				THR_SET_STATE(thread, PS_RUNNING);
1703			} else {
1704				/*
				 * These states don't time out.
1706				 */
1707				thread->wakeup_time.tv_sec = -1;
1708				thread->wakeup_time.tv_nsec = -1;
1709
1710				/* Insert into the waiting queue: */
1711				KSE_WAITQ_INSERT(kse, thread);
1712			}
1713			break;
1714
1715		case PS_DEAD:
1716			/*
1717			 * The scheduler is operating on a different
			 * stack.  It is safe to do garbage collection
			 * here.
1720			 */
1721			thr_cleanup(kse, thread);
1722			return;
1723			break;
1724
1725		case PS_RUNNING:
1726			if ((thread->flags & THR_FLAGS_SUSPENDED) != 0 &&
1727			    !THR_NEED_CANCEL(thread))
1728				THR_SET_STATE(thread, PS_SUSPENDED);
1729			break;
1730
1731		case PS_DEADLOCK:
1732			/*
			 * These states don't time out.
1734			 */
1735			thread->wakeup_time.tv_sec = -1;
1736			thread->wakeup_time.tv_nsec = -1;
1737
1738			/* Insert into the waiting queue: */
1739			KSE_WAITQ_INSERT(kse, thread);
1740			break;
1741
1742		default:
1743			PANIC("Unknown state\n");
1744			break;
1745		}
1746
1747		thr_accounting(thread);
1748		if (thread->state == PS_RUNNING) {
1749			if (thread->slice_usec == -1) {
1750				/*
1751				 * The thread exceeded its time quantum or
1752				 * it yielded the CPU; place it at the tail
1753				 * of the queue for its priority.
1754				 */
1755				KSE_RUNQ_INSERT_TAIL(kse, thread);
1756			} else {
1757				/*
				 * The thread hasn't exceeded its interval.
				 * Place it at the head of the queue for its
				 * priority.
1761				 */
1762				KSE_RUNQ_INSERT_HEAD(kse, thread);
1763			}
1764		}
1765	}
1766	thread->active = 0;
1767	thread->need_switchout = 0;
1768	if (thread->check_pending != 0) {
1769		/* Install pending signals into the frame. */
1770		thread->check_pending = 0;
1771		KSE_LOCK_ACQUIRE(kse, &_thread_signal_lock);
1772		for (i = 1; i <= _SIG_MAXSIG; i++) {
1773			if (SIGISMEMBER(thread->sigmask, i))
1774				continue;
1775			if (SIGISMEMBER(thread->sigpend, i))
1776				(void)_thr_sig_add(thread, i,
1777				    &thread->siginfo[i-1]);
1778			else if (SIGISMEMBER(_thr_proc_sigpending, i) &&
1779				_thr_getprocsig_unlocked(i, &siginfo)) {
1780				(void)_thr_sig_add(thread, i, &siginfo);
1781			}
1782		}
1783		KSE_LOCK_RELEASE(kse, &_thread_signal_lock);
1784	}
1785}
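
/*
 * Illustrative sketch (not compiled): the round-robin requeue rule used
 * at the end of kse_switchout_thread() above, isolated for clarity.  A
 * slice_usec of -1 marks a thread whose quantum expired or that yielded
 * the CPU; any other value means the quantum is not yet used up.  The
 * helper name is hypothetical.
 */
#if 0
static void
example_requeue_runnable(struct kse *kse, struct pthread *thread)
{
	if (thread->slice_usec == -1) {
		/* Quantum used up (or CPU yielded): go to the tail. */
		KSE_RUNQ_INSERT_TAIL(kse, thread);
	} else {
		/* Quantum not yet used up: resume at the head. */
		KSE_RUNQ_INSERT_HEAD(kse, thread);
	}
}
#endif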
1786
1787/*
1788 * This function waits for the smallest timeout value of any waiting
1789 * thread, or until it receives a message from another KSE.
1790 *
1791 * This must be called with the scheduling lock held.
1792 */
1793static void
1794kse_wait(struct kse *kse, struct pthread *td_wait, int sigseqno)
1795{
1796	struct timespec ts, ts_sleep;
1797	int saved_flags;
1798
1799	if ((td_wait == NULL) || (td_wait->wakeup_time.tv_sec < 0)) {
1800		/* Limit sleep to no more than 1 minute. */
1801		ts_sleep.tv_sec = 60;
1802		ts_sleep.tv_nsec = 0;
1803	} else {
1804		KSE_GET_TOD(kse, &ts);
1805		TIMESPEC_SUB(&ts_sleep, &td_wait->wakeup_time, &ts);
1806		if (ts_sleep.tv_sec > 60) {
1807			ts_sleep.tv_sec = 60;
1808			ts_sleep.tv_nsec = 0;
1809		}
1810	}
1811	/* Don't sleep for negative times. */
1812	if ((ts_sleep.tv_sec >= 0) && (ts_sleep.tv_nsec >= 0)) {
1813		KSE_SET_IDLE(kse);
1814		kse->k_kseg->kg_idle_kses++;
1815		KSE_SCHED_UNLOCK(kse, kse->k_kseg);
1816		if ((kse->k_kseg->kg_flags & KGF_SINGLE_THREAD) &&
1817		    (kse->k_sigseqno != sigseqno))
1818			; /* don't sleep */
1819		else {
1820			saved_flags = kse->k_kcb->kcb_kmbx.km_flags;
1821			kse->k_kcb->kcb_kmbx.km_flags |= KMF_NOUPCALL;
1822			kse_release(&ts_sleep);
1823			kse->k_kcb->kcb_kmbx.km_flags = saved_flags;
1824		}
1825		KSE_SCHED_LOCK(kse, kse->k_kseg);
1826		if (KSE_IS_IDLE(kse)) {
1827			KSE_CLEAR_IDLE(kse);
1828			kse->k_kseg->kg_idle_kses--;
1829		}
1830	}
1831}
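
/*
 * Illustrative sketch (not compiled): how kse_wait() above bounds its
 * sleep.  The earliest wakeup time is converted to a relative interval
 * and clamped to 60 seconds; a missing thread or a "never" (negative)
 * wakeup time also falls back to the 60 second cap.  The helper name is
 * hypothetical.
 */
#if 0
static void
example_sleep_interval(struct kse *kse, struct pthread *td_wait,
    struct timespec *ts_sleep)
{
	struct timespec now;

	if ((td_wait == NULL) || (td_wait->wakeup_time.tv_sec < 0)) {
		ts_sleep->tv_sec = 60;
		ts_sleep->tv_nsec = 0;
		return;
	}
	KSE_GET_TOD(kse, &now);
	TIMESPEC_SUB(ts_sleep, &td_wait->wakeup_time, &now);
	if (ts_sleep->tv_sec > 60) {
		ts_sleep->tv_sec = 60;
		ts_sleep->tv_nsec = 0;
	}
}
#endif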
1832
1833/*
 * This is named kse_fini() rather than kse_exit() so as not to
 * confuse it with the system call of the same name.
1836 */
1837static void
1838kse_fini(struct kse *kse)
1839{
1840	/* struct kse_group *free_kseg = NULL; */
1841	struct timespec ts;
1842	struct pthread *td;
1843
1844	/*
1845	 * Check to see if this is one of the main kses.
1846	 */
1847	if (kse->k_kseg != _kse_initial->k_kseg) {
1848		PANIC("shouldn't get here");
1849		/* This is for supporting thread groups. */
1850#ifdef NOT_YET
1851		/* Remove this KSE from the KSEG's list of KSEs. */
1852		KSE_SCHED_LOCK(kse, kse->k_kseg);
1853		TAILQ_REMOVE(&kse->k_kseg->kg_kseq, kse, k_kgqe);
1854		kse->k_kseg->kg_ksecount--;
1855		if (TAILQ_EMPTY(&kse->k_kseg->kg_kseq))
1856			free_kseg = kse->k_kseg;
1857		KSE_SCHED_UNLOCK(kse, kse->k_kseg);
1858
1859		/*
1860		 * Add this KSE to the list of free KSEs along with
		 * the KSEG if it is now orphaned.
1862		 */
1863		KSE_LOCK_ACQUIRE(kse, &kse_lock);
1864		if (free_kseg != NULL)
1865			kseg_free_unlocked(free_kseg);
1866		kse_free_unlocked(kse);
1867		KSE_LOCK_RELEASE(kse, &kse_lock);
1868		kse_exit();
1869		/* Never returns. */
1870		PANIC("kse_exit()");
1871#endif
1872	} else {
1873		/*
		 * We allow the program to kill a KSE in the initial group
		 * (by lowering the concurrency).
1876		 */
1877		if ((kse != _kse_initial) &&
1878		    ((kse->k_flags & KF_TERMINATED) != 0)) {
1879			KSE_SCHED_LOCK(kse, kse->k_kseg);
1880			TAILQ_REMOVE(&kse->k_kseg->kg_kseq, kse, k_kgqe);
1881			kse->k_kseg->kg_ksecount--;
1882			/*
			 * Migrate a thread to _kse_initial if the last
			 * KSE it ran on is this one.
1885			 */
1886			td = TAILQ_FIRST(&kse->k_kseg->kg_threadq);
1887			while (td != NULL) {
1888				if (td->kse == kse)
1889					td->kse = _kse_initial;
1890				td = TAILQ_NEXT(td, kle);
1891			}
1892			KSE_SCHED_UNLOCK(kse, kse->k_kseg);
1893			KSE_LOCK_ACQUIRE(kse, &kse_lock);
1894			kse_free_unlocked(kse);
1895			KSE_LOCK_RELEASE(kse, &kse_lock);
			/* Make sure there is always at least one KSE awake. */
1897			KSE_WAKEUP(_kse_initial);
1898			kse_exit();
			/* Never returns. */
			PANIC("kse_exit() failed for initial kseg");
		}
1902		KSE_SCHED_LOCK(kse, kse->k_kseg);
1903		KSE_SET_IDLE(kse);
1904		kse->k_kseg->kg_idle_kses++;
1905		KSE_SCHED_UNLOCK(kse, kse->k_kseg);
1906		ts.tv_sec = 120;
1907		ts.tv_nsec = 0;
1908		kse->k_kcb->kcb_kmbx.km_flags = 0;
1909		kse_release(&ts);
		/* Never reached. */
1911	}
1912}
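
/*
 * Illustrative sketch (not compiled): the thread-migration loop in
 * kse_fini() above, rewritten with the TAILQ_FOREACH() idiom from
 * <sys/queue.h>.  The behaviour is the same; it must run with the
 * KSEG's scheduling lock held, just as above.  The helper name is
 * hypothetical.
 */
#if 0
static void
example_migrate_threads(struct kse *kse)
{
	struct pthread *td;

	TAILQ_FOREACH(td, &kse->k_kseg->kg_threadq, kle) {
		if (td->kse == kse)
			td->kse = _kse_initial;
	}
}
#endif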
1913
1914void
1915_thr_set_timeout(const struct timespec *timeout)
1916{
1917	struct pthread	*curthread = _get_curthread();
1918	struct timespec ts;
1919
1920	/* Reset the timeout flag for the running thread: */
1921	curthread->timeout = 0;
1922
1923	/* Check if the thread is to wait forever: */
1924	if (timeout == NULL) {
1925		/*
		 * Set the wakeup time to something that can be recognized
		 * as different from an actual time of day:
1928		 */
1929		curthread->wakeup_time.tv_sec = -1;
1930		curthread->wakeup_time.tv_nsec = -1;
1931	}
1932	/* Check if no waiting is required: */
1933	else if ((timeout->tv_sec == 0) && (timeout->tv_nsec == 0)) {
1934		/* Set the wake up time to 'immediately': */
1935		curthread->wakeup_time.tv_sec = 0;
1936		curthread->wakeup_time.tv_nsec = 0;
1937	} else {
1938		/* Calculate the time for the current thread to wakeup: */
1939		KSE_GET_TOD(curthread->kse, &ts);
1940		TIMESPEC_ADD(&curthread->wakeup_time, &ts, timeout);
1941	}
1942}
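
/*
 * Illustrative sketch (not compiled): how a blocking primitive would use
 * _thr_set_timeout().  A relative timeout becomes an absolute
 * wakeup_time, NULL means "wait forever" ({ -1, -1 }), and a zero
 * timespec means "don't wait at all".  The values and helper name are
 * only examples.
 */
#if 0
static void
example_set_timeouts(void)
{
	struct timespec two_seconds = { 2, 0 };

	_thr_set_timeout(&two_seconds);	/* wakeup_time = now + 2s */
	_thr_set_timeout(NULL);		/* wakeup_time = { -1, -1 } */
}
#endif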
1943
1944void
1945_thr_panic_exit(char *file, int line, char *msg)
1946{
1947	char buf[256];
1948
1949	snprintf(buf, sizeof(buf), "(%s:%d) %s\n", file, line, msg);
1950	__sys_write(2, buf, strlen(buf));
1951	abort();
1952}
1953
1954void
1955_thr_setrunnable(struct pthread *curthread, struct pthread *thread)
1956{
1957	kse_critical_t crit;
1958	struct kse_mailbox *kmbx;
1959
1960	crit = _kse_critical_enter();
1961	KSE_SCHED_LOCK(curthread->kse, thread->kseg);
1962	kmbx = _thr_setrunnable_unlocked(thread);
1963	KSE_SCHED_UNLOCK(curthread->kse, thread->kseg);
1964	_kse_critical_leave(crit);
1965	if ((kmbx != NULL) && (__isthreaded != 0))
1966		kse_wakeup(kmbx);
1967}
1968
1969struct kse_mailbox *
1970_thr_setrunnable_unlocked(struct pthread *thread)
1971{
1972	struct kse_mailbox *kmbx = NULL;
1973
1974	if ((thread->kseg->kg_flags & KGF_SINGLE_THREAD) != 0) {
1975		/* No silly queues for these threads. */
1976		if ((thread->flags & THR_FLAGS_SUSPENDED) != 0)
1977			THR_SET_STATE(thread, PS_SUSPENDED);
1978		else {
1979			THR_SET_STATE(thread, PS_RUNNING);
1980			kmbx = kse_wakeup_one(thread);
1981		}
1982
1983	} else if (thread->state != PS_RUNNING) {
1984		if ((thread->flags & THR_FLAGS_IN_WAITQ) != 0)
1985			KSE_WAITQ_REMOVE(thread->kse, thread);
1986		if ((thread->flags & THR_FLAGS_SUSPENDED) != 0)
1987			THR_SET_STATE(thread, PS_SUSPENDED);
1988		else {
1989			THR_SET_STATE(thread, PS_RUNNING);
1990			if ((thread->blocked == 0) && (thread->active == 0) &&
1991			    (thread->flags & THR_FLAGS_IN_RUNQ) == 0)
1992				THR_RUNQ_INSERT_TAIL(thread);
1993			/*
1994			 * XXX - Threads are not yet assigned to specific
1995			 *       KSEs; they are assigned to the KSEG.  So
1996			 *       the fact that a thread's KSE is waiting
1997			 *       doesn't necessarily mean that it will be
1998			 *       the KSE that runs the thread after the
1999			 *       lock is granted.  But we don't know if the
2000			 *       other KSEs within the same KSEG are also
			 *       in a waiting state or not, so we err on the
			 *       side of caution and wake up the thread's
			 *       last known KSE.  We ensure that the
			 *       thread's KSE doesn't change while its
			 *       scheduling lock is held, so it is safe to
2006			 *       reference it (the KSE).  If the KSE wakes
2007			 *       up and doesn't find any more work it will
2008			 *       again go back to waiting so no harm is
2009			 *       done.
2010			 */
2011			kmbx = kse_wakeup_one(thread);
2012		}
2013	}
2014	return (kmbx);
2015}
2016
2017static struct kse_mailbox *
2018kse_wakeup_one(struct pthread *thread)
2019{
2020	struct kse *ke;
2021
2022	if (KSE_IS_IDLE(thread->kse)) {
2023		KSE_CLEAR_IDLE(thread->kse);
2024		thread->kseg->kg_idle_kses--;
2025		return (&thread->kse->k_kcb->kcb_kmbx);
2026	} else {
2027		TAILQ_FOREACH(ke, &thread->kseg->kg_kseq, k_kgqe) {
2028			if (KSE_IS_IDLE(ke)) {
2029				KSE_CLEAR_IDLE(ke);
2030				ke->k_kseg->kg_idle_kses--;
2031				return (&ke->k_kcb->kcb_kmbx);
2032			}
2033		}
2034	}
2035	return (NULL);
2036}
2037
2038static void
2039kse_wakeup_multi(struct kse *curkse)
2040{
2041	struct kse *ke;
2042	int tmp;
2043
2044	if ((tmp = KSE_RUNQ_THREADS(curkse)) && curkse->k_kseg->kg_idle_kses) {
2045		TAILQ_FOREACH(ke, &curkse->k_kseg->kg_kseq, k_kgqe) {
2046			if (KSE_IS_IDLE(ke)) {
2047				KSE_CLEAR_IDLE(ke);
2048				ke->k_kseg->kg_idle_kses--;
2049				KSE_WAKEUP(ke);
2050				if (--tmp == 0)
2051					break;
2052			}
2053		}
2054	}
2055}
2056
2057/*
2058 * Allocate a new KSEG.
2059 *
2060 * We allow the current thread to be NULL in the case that this
2061 * is the first time a KSEG is being created (library initialization).
2062 * In this case, we don't need to (and can't) take any locks.
2063 */
2064struct kse_group *
2065_kseg_alloc(struct pthread *curthread)
2066{
2067	struct kse_group *kseg = NULL;
2068	kse_critical_t crit;
2069
2070	if ((curthread != NULL) && (free_kseg_count > 0)) {
2071		/* Use the kse lock for the kseg queue. */
2072		crit = _kse_critical_enter();
2073		KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
2074		if ((kseg = TAILQ_FIRST(&free_kse_groupq)) != NULL) {
2075			TAILQ_REMOVE(&free_kse_groupq, kseg, kg_qe);
2076			free_kseg_count--;
2077			active_kseg_count++;
2078			TAILQ_INSERT_TAIL(&active_kse_groupq, kseg, kg_qe);
2079		}
2080		KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
2081		_kse_critical_leave(crit);
2082		if (kseg)
2083			kseg_reinit(kseg);
2084	}
2085
2086	/*
	 * Allocate a new KSE group only if one couldn't be recycled
	 * from the free list above.
2090	 */
2091	if ((kseg == NULL) &&
2092	    ((kseg = (struct kse_group *)malloc(sizeof(*kseg))) != NULL)) {
2093		if (_pq_alloc(&kseg->kg_schedq.sq_runq,
2094		    THR_MIN_PRIORITY, THR_LAST_PRIORITY) != 0) {
2095			free(kseg);
2096			kseg = NULL;
2097		} else {
2098			kseg_init(kseg);
2099			/* Add the KSEG to the list of active KSEGs. */
2100			if (curthread != NULL) {
2101				crit = _kse_critical_enter();
2102				KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
2103				active_kseg_count++;
2104				TAILQ_INSERT_TAIL(&active_kse_groupq,
2105				    kseg, kg_qe);
2106				KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
2107				_kse_critical_leave(crit);
2108			} else {
2109				active_kseg_count++;
2110				TAILQ_INSERT_TAIL(&active_kse_groupq,
2111				    kseg, kg_qe);
2112			}
2113		}
2114	}
2115	return (kseg);
2116}
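
/*
 * Illustrative sketch (not compiled): the allocation pattern used by
 * _kseg_alloc() above (and by _kse_alloc() and _thr_alloc() below),
 * reduced to its two steps -- recycle from the free list under the kse
 * lock, and fall back to malloc() only if that fails.  The helper name
 * is hypothetical and further initialization is omitted.
 */
#if 0
static struct kse_group *
example_kseg_get(struct pthread *curthread)
{
	struct kse_group *kseg = NULL;
	kse_critical_t crit;

	if ((curthread != NULL) && (free_kseg_count > 0)) {
		crit = _kse_critical_enter();
		KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
		if ((kseg = TAILQ_FIRST(&free_kse_groupq)) != NULL) {
			TAILQ_REMOVE(&free_kse_groupq, kseg, kg_qe);
			free_kseg_count--;
		}
		KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
		_kse_critical_leave(crit);
	}
	if (kseg == NULL)
		kseg = (struct kse_group *)malloc(sizeof(*kseg));
	return (kseg);
}
#endif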
2117
2118static void
2119kseg_init(struct kse_group *kseg)
2120{
2121	kseg_reinit(kseg);
2122	_lock_init(&kseg->kg_lock, LCK_ADAPTIVE, _kse_lock_wait,
2123	    _kse_lock_wakeup);
2124}
2125
2126static void
2127kseg_reinit(struct kse_group *kseg)
2128{
2129	TAILQ_INIT(&kseg->kg_kseq);
2130	TAILQ_INIT(&kseg->kg_threadq);
2131	TAILQ_INIT(&kseg->kg_schedq.sq_waitq);
2132	kseg->kg_threadcount = 0;
2133	kseg->kg_ksecount = 0;
2134	kseg->kg_idle_kses = 0;
2135	kseg->kg_flags = 0;
2136}
2137
2138/*
2139 * This must be called with the kse lock held and when there are
2140 * no more threads that reference it.
2141 */
2142static void
2143kseg_free_unlocked(struct kse_group *kseg)
2144{
2145	TAILQ_REMOVE(&active_kse_groupq, kseg, kg_qe);
2146	TAILQ_INSERT_HEAD(&free_kse_groupq, kseg, kg_qe);
2147	free_kseg_count++;
2148	active_kseg_count--;
2149}
2150
2151void
2152_kseg_free(struct kse_group *kseg)
2153{
2154	struct kse *curkse;
2155	kse_critical_t crit;
2156
2157	crit = _kse_critical_enter();
2158	curkse = _get_curkse();
2159	KSE_LOCK_ACQUIRE(curkse, &kse_lock);
2160	kseg_free_unlocked(kseg);
2161	KSE_LOCK_RELEASE(curkse, &kse_lock);
2162	_kse_critical_leave(crit);
2163}
2164
2165static void
2166kseg_destroy(struct kse_group *kseg)
2167{
2168	_lock_destroy(&kseg->kg_lock);
2169	_pq_free(&kseg->kg_schedq.sq_runq);
2170	free(kseg);
2171}
2172
2173/*
2174 * Allocate a new KSE.
2175 *
2176 * We allow the current thread to be NULL in the case that this
2177 * is the first time a KSE is being created (library initialization).
2178 * In this case, we don't need to (and can't) take any locks.
2179 */
2180struct kse *
2181_kse_alloc(struct pthread *curthread, int sys_scope)
2182{
2183	struct kse *kse = NULL;
2184	char *stack;
2185	kse_critical_t crit;
2186	int i;
2187
2188	if ((curthread != NULL) && (free_kse_count > 0)) {
2189		crit = _kse_critical_enter();
2190		KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
2191		/* Search for a finished KSE. */
2192		kse = TAILQ_FIRST(&free_kseq);
2193		while ((kse != NULL) &&
2194		    ((kse->k_kcb->kcb_kmbx.km_flags & KMF_DONE) == 0)) {
2195			kse = TAILQ_NEXT(kse, k_qe);
2196		}
2197		if (kse != NULL) {
2198			DBG_MSG("found an unused kse.\n");
2199			TAILQ_REMOVE(&free_kseq, kse, k_qe);
2200			free_kse_count--;
2201			TAILQ_INSERT_TAIL(&active_kseq, kse, k_qe);
2202			active_kse_count++;
2203		}
2204		KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
2205		_kse_critical_leave(crit);
2206		if (kse != NULL)
2207			kse_reinit(kse, sys_scope);
2208	}
2209	if ((kse == NULL) &&
2210	    ((kse = (struct kse *)malloc(sizeof(*kse))) != NULL)) {
2211		if (sys_scope != 0)
2212			stack = NULL;
2213		else if ((stack = malloc(KSE_STACKSIZE)) == NULL) {
2214			free(kse);
2215			return (NULL);
2216		}
2217		bzero(kse, sizeof(*kse));
2218
2219		/* Initialize KCB without the lock. */
2220		if ((kse->k_kcb = _kcb_ctor(kse)) == NULL) {
2221			if (stack != NULL)
2222				free(stack);
2223			free(kse);
2224			return (NULL);
2225		}
2226
2227		/* Initialize the lockusers. */
2228		for (i = 0; i < MAX_KSE_LOCKLEVEL; i++) {
2229			_lockuser_init(&kse->k_lockusers[i], (void *)kse);
2230			_LCK_SET_PRIVATE2(&kse->k_lockusers[i], NULL);
2231		}
2232		/* _lock_init(kse->k_lock, ...) */
2233
2234		if (curthread != NULL) {
2235			crit = _kse_critical_enter();
2236			KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
2237		}
2238		kse->k_flags = 0;
2239		TAILQ_INSERT_TAIL(&active_kseq, kse, k_qe);
2240		active_kse_count++;
2241		if (curthread != NULL) {
2242			KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
2243			_kse_critical_leave(crit);
2244		}
2245		/*
2246		 * Create the KSE context.
		 * Scope system threads (one thread per KSE) do not need
		 * a stack for the KSE upcall, which is never used for them.
2249		 */
2250		if (!sys_scope) {
2251			kse->k_kcb->kcb_kmbx.km_func = (kse_func_t *)kse_sched_multi;
2252			kse->k_stack.ss_sp = stack;
2253			kse->k_stack.ss_size = KSE_STACKSIZE;
2254		} else {
2255			kse->k_kcb->kcb_kmbx.km_func = (kse_func_t *)kse_sched_single;
2256			kse->k_stack.ss_sp = NULL;
2257			kse->k_stack.ss_size = 0;
2258		}
2259		kse->k_kcb->kcb_kmbx.km_udata = (void *)kse;
2260		kse->k_kcb->kcb_kmbx.km_quantum = 20000;
2261		/*
2262		 * We need to keep a copy of the stack in case it
2263		 * doesn't get used; a KSE running a scope system
2264		 * thread will use that thread's stack.
2265		 */
2266		kse->k_kcb->kcb_kmbx.km_stack = kse->k_stack;
2267	}
2268	return (kse);
2269}
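
/*
 * Illustrative sketch (not compiled): the two KSE configurations set up
 * by _kse_alloc() above and kse_reinit() below.  A process-scope KSE
 * gets its own upcall stack and the multi-threaded scheduler; a
 * system-scope KSE uses its thread's stack (no upcall stack) and the
 * single-threaded scheduler.  The helper name is hypothetical.
 */
#if 0
static void
example_kse_configure(struct kse *kse, char *stack, int sys_scope)
{
	if (!sys_scope) {
		kse->k_kcb->kcb_kmbx.km_func = (kse_func_t *)kse_sched_multi;
		kse->k_stack.ss_sp = stack;
		kse->k_stack.ss_size = KSE_STACKSIZE;
	} else {
		kse->k_kcb->kcb_kmbx.km_func = (kse_func_t *)kse_sched_single;
		kse->k_stack.ss_sp = NULL;
		kse->k_stack.ss_size = 0;
	}
	kse->k_kcb->kcb_kmbx.km_stack = kse->k_stack;
}
#endif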
2270
2271static void
2272kse_reinit(struct kse *kse, int sys_scope)
2273{
2274	if (!sys_scope) {
2275		kse->k_kcb->kcb_kmbx.km_func = (kse_func_t *)kse_sched_multi;
		if (kse->k_stack.ss_sp == NULL) {
			/* Don't leave a NULL stack on allocation failure. */
			if ((kse->k_stack.ss_sp =
			    (char *)malloc(KSE_STACKSIZE)) == NULL)
				PANIC("unable to allocate KSE stack");
			kse->k_stack.ss_size = KSE_STACKSIZE;
		}
2281		kse->k_kcb->kcb_kmbx.km_quantum = 20000;
2282	} else {
2283		kse->k_kcb->kcb_kmbx.km_func = (kse_func_t *)kse_sched_single;
2284		if (kse->k_stack.ss_sp)
2285			free(kse->k_stack.ss_sp);
2286		kse->k_stack.ss_sp = NULL;
2287		kse->k_stack.ss_size = 0;
2288		kse->k_kcb->kcb_kmbx.km_quantum = 0;
2289	}
2290	kse->k_kcb->kcb_kmbx.km_stack = kse->k_stack;
2291	kse->k_kcb->kcb_kmbx.km_udata = (void *)kse;
2292	kse->k_kcb->kcb_kmbx.km_curthread = NULL;
2293	kse->k_kcb->kcb_kmbx.km_flags = 0;
2294	kse->k_curthread = NULL;
	kse->k_kseg = NULL;
	kse->k_schedq = NULL;
2297	kse->k_locklevel = 0;
2298	kse->k_flags = 0;
2299	kse->k_error = 0;
2300	kse->k_cpu = 0;
2301	kse->k_sigseqno = 0;
2302}
2303
2304void
2305kse_free_unlocked(struct kse *kse)
2306{
2307	TAILQ_REMOVE(&active_kseq, kse, k_qe);
2308	active_kse_count--;
2309	kse->k_kseg = NULL;
2310	kse->k_kcb->kcb_kmbx.km_quantum = 20000;
2311	kse->k_flags = 0;
2312	TAILQ_INSERT_HEAD(&free_kseq, kse, k_qe);
2313	free_kse_count++;
2314}
2315
2316void
2317_kse_free(struct pthread *curthread, struct kse *kse)
2318{
2319	kse_critical_t crit;
2320
2321	if (curthread == NULL)
2322		kse_free_unlocked(kse);
2323	else {
2324		crit = _kse_critical_enter();
2325		KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
2326		kse_free_unlocked(kse);
2327		KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
2328		_kse_critical_leave(crit);
2329	}
2330}
2331
2332static void
2333kse_destroy(struct kse *kse)
2334{
2335	int i;
2336
2337	if (kse->k_stack.ss_sp != NULL)
2338		free(kse->k_stack.ss_sp);
2339	_kcb_dtor(kse->k_kcb);
2340	for (i = 0; i < MAX_KSE_LOCKLEVEL; ++i)
2341		_lockuser_destroy(&kse->k_lockusers[i]);
2342	_lock_destroy(&kse->k_lock);
2343	free(kse);
2344}
2345
2346struct pthread *
2347_thr_alloc(struct pthread *curthread)
2348{
2349	kse_critical_t	crit;
2350	struct pthread	*thread = NULL;
2351	int i;
2352
2353	if (curthread != NULL) {
2354		if (GC_NEEDED())
2355			_thr_gc(curthread);
2356		if (free_thread_count > 0) {
2357			crit = _kse_critical_enter();
2358			KSE_LOCK_ACQUIRE(curthread->kse, &thread_lock);
2359			if ((thread = TAILQ_FIRST(&free_threadq)) != NULL) {
2360				TAILQ_REMOVE(&free_threadq, thread, tle);
2361				free_thread_count--;
2362			}
2363			KSE_LOCK_RELEASE(curthread->kse, &thread_lock);
2364			_kse_critical_leave(crit);
2365		}
2366	}
2367	if ((thread == NULL) &&
2368	    ((thread = malloc(sizeof(struct pthread))) != NULL)) {
2369		bzero(thread, sizeof(struct pthread));
2370		thread->siginfo = calloc(_SIG_MAXSIG, sizeof(siginfo_t));
2371		if (thread->siginfo == NULL) {
2372			free(thread);
2373			return (NULL);
2374		}
2375		if (curthread) {
2376			_pthread_mutex_lock(&_tcb_mutex);
2377			thread->tcb = _tcb_ctor(thread, 0 /* not initial tls */);
2378			_pthread_mutex_unlock(&_tcb_mutex);
2379		} else {
2380			thread->tcb = _tcb_ctor(thread, 1 /* initial tls */);
2381		}
2382		if (thread->tcb == NULL) {
2383			free(thread->siginfo);
2384			free(thread);
2385			return (NULL);
2386		}
2387		/*
2388		 * Initialize thread locking.
		 * Lock initialization needs malloc, so don't
		 * enter a critical region before doing this!
2391		 */
2392		if (_lock_init(&thread->lock, LCK_ADAPTIVE,
2393		    _thr_lock_wait, _thr_lock_wakeup) != 0)
2394			PANIC("Cannot initialize thread lock");
2395		for (i = 0; i < MAX_THR_LOCKLEVEL; i++) {
2396			_lockuser_init(&thread->lockusers[i], (void *)thread);
2397			_LCK_SET_PRIVATE2(&thread->lockusers[i],
2398			    (void *)thread);
2399		}
2400	}
2401	return (thread);
2402}
2403
2404void
2405_thr_free(struct pthread *curthread, struct pthread *thread)
2406{
2407	kse_critical_t crit;
2408
2409	DBG_MSG("Freeing thread %p\n", thread);
2410	if (thread->name) {
2411		free(thread->name);
2412		thread->name = NULL;
2413	}
2414	if ((curthread == NULL) || (free_thread_count >= MAX_CACHED_THREADS)) {
2415		thr_destroy(curthread, thread);
2416	} else {
2417		/* Add the thread to the free thread list. */
2418		crit = _kse_critical_enter();
2419		KSE_LOCK_ACQUIRE(curthread->kse, &thread_lock);
2420		TAILQ_INSERT_TAIL(&free_threadq, thread, tle);
2421		free_thread_count++;
2422		KSE_LOCK_RELEASE(curthread->kse, &thread_lock);
2423		_kse_critical_leave(crit);
2424	}
2425}
2426
2427static void
2428thr_destroy(struct pthread *curthread, struct pthread *thread)
2429{
2430	int i;
2431
2432	for (i = 0; i < MAX_THR_LOCKLEVEL; i++)
2433		_lockuser_destroy(&thread->lockusers[i]);
2434	_lock_destroy(&thread->lock);
2435	if (curthread) {
2436		_pthread_mutex_lock(&_tcb_mutex);
2437		_tcb_dtor(thread->tcb);
2438		_pthread_mutex_unlock(&_tcb_mutex);
2439	} else {
2440		_tcb_dtor(thread->tcb);
2441	}
2442	free(thread->siginfo);
2443	free(thread);
2444}
2445
2446/*
2447 * Add an active thread:
2448 *
 *   o Assign the thread a unique id (which GDB uses to track
 *     threads).
 *   o Add the thread to the list of all threads and increment
 *     the number of active threads.
2453 */
2454static void
2455thr_link(struct pthread *thread)
2456{
2457	kse_critical_t crit;
2458	struct kse *curkse;
2459
2460	crit = _kse_critical_enter();
2461	curkse = _get_curkse();
2462	KSE_LOCK_ACQUIRE(curkse, &_thread_list_lock);
2463	/*
2464	 * Initialize the unique id (which GDB uses to track
	 * threads), add the thread to the list of all threads,
	 * and increment the number of active threads.
2467	 */
2468	thread->uniqueid = next_uniqueid++;
2469	THR_LIST_ADD(thread);
2470	_thread_active_threads++;
2471	KSE_LOCK_RELEASE(curkse, &_thread_list_lock);
2472	_kse_critical_leave(crit);
2473}
2474
2475/*
2476 * Remove an active thread.
2477 */
2478static void
2479thr_unlink(struct pthread *thread)
2480{
2481	kse_critical_t crit;
2482	struct kse *curkse;
2483
2484	crit = _kse_critical_enter();
2485	curkse = _get_curkse();
2486	KSE_LOCK_ACQUIRE(curkse, &_thread_list_lock);
2487	THR_LIST_REMOVE(thread);
2488	_thread_active_threads--;
2489	KSE_LOCK_RELEASE(curkse, &_thread_list_lock);
2490	_kse_critical_leave(crit);
2491}
2492
2493void
2494_thr_hash_add(struct pthread *thread)
2495{
2496	struct thread_hash_head *head;
2497
2498	head = &thr_hashtable[THREAD_HASH(thread)];
2499	LIST_INSERT_HEAD(head, thread, hle);
2500}
2501
2502void
2503_thr_hash_remove(struct pthread *thread)
2504{
2505	LIST_REMOVE(thread, hle);
2506}
2507
2508struct pthread *
2509_thr_hash_find(struct pthread *thread)
2510{
2511	struct pthread *td;
2512	struct thread_hash_head *head;
2513
2514	head = &thr_hashtable[THREAD_HASH(thread)];
2515	LIST_FOREACH(td, head, hle) {
2516		if (td == thread)
2517			return (thread);
2518	}
2519	return (NULL);
2520}
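
/*
 * Illustrative sketch (not compiled): the typical use of the hash table
 * above -- checking that a pthread_t still refers to a known thread
 * before dereferencing it.  The caller and its locking are hypothetical;
 * real callers typically hold the thread list lock around the lookup.
 */
#if 0
static int
example_thread_exists(struct pthread *maybe_thread)
{
	/* Only pointers present in the hash refer to known threads. */
	return (_thr_hash_find(maybe_thread) != NULL);
}
#endif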
2521
2522void
2523_thr_debug_check_yield(struct pthread *curthread)
2524{
2525	/*
	 * Note that TMDF_SUSPEND is set after the process is suspended.
	 * While we are being debugged, every suspension of the process
	 * causes all KSEs to schedule an upcall in the kernel, unless a
	 * KSE is in a critical region.
	 * If this function is being called, the KSE is no longer in a
	 * critical region.  If the debugger set TMDF_SUSPEND before the
	 * KSE left its critical region, we catch it here.  If the flag
	 * changes while we are testing it, that is also not a problem:
	 * the change only happens after a process suspension event, and
	 * a suspension event always makes the KSEs schedule an upcall.
	 * Since we are not in a critical region, the upcall is scheduled
	 * successfully, the flag is checked again in kse_sched_multi(),
	 * and we do not come back until the debugger clears the flag,
	 * which it does at the next suspension event.
2541	 */
2542	if (!DBG_CAN_RUN(curthread)) {
2543		if ((curthread->attr.flags & PTHREAD_SCOPE_SYSTEM) == 0)
2544			_thr_sched_switch(curthread);
2545		else
2546			kse_thr_interrupt(&curthread->tcb->tcb_tmbx,
2547				KSE_INTR_DBSUSPEND, 0);
2548	}
2549}
2550