thr_kern.c revision 119732
1/*
2 * Copyright (C) 2003 Daniel M. Eischen <deischen@freebsd.org>
3 * Copyright (C) 2002 Jonathon Mini <mini@freebsd.org>
4 * Copyright (c) 1995-1998 John Birrell <jb@cimlogic.com.au>
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 * 3. All advertising materials mentioning features or use of this software
16 *    must display the following acknowledgement:
17 *	This product includes software developed by John Birrell.
18 * 4. Neither the name of the author nor the names of any co-contributors
19 *    may be used to endorse or promote products derived from this software
20 *    without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY JOHN BIRRELL AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 */
35#include <sys/cdefs.h>
36__FBSDID("$FreeBSD: head/lib/libkse/thread/thr_kern.c 119732 2003-09-04 05:24:53Z davidxu $");
37
38#include <sys/types.h>
39#include <sys/kse.h>
40#include <sys/signalvar.h>
41#include <sys/queue.h>
42#include <machine/atomic.h>
43#include <machine/sigframe.h>
44
45#include <assert.h>
46#include <errno.h>
47#include <signal.h>
48#include <stdlib.h>
49#include <string.h>
50#include <time.h>
51#include <ucontext.h>
52#include <unistd.h>
53
54#include "atomic_ops.h"
55#include "thr_private.h"
56#include "libc_private.h"
57
58/*#define DEBUG_THREAD_KERN */
59#ifdef DEBUG_THREAD_KERN
60#define DBG_MSG		stdout_debug
61#else
62#define DBG_MSG(x...)
63#endif
64
65/*
66 * Define a high water mark for the maximum number of threads that
67 * will be cached.  Once this level is reached, any extra threads
68 * will be free()'d.
69 */
70#define	MAX_CACHED_THREADS	100
71/*
72 * Define high water marks for the maximum number of KSEs and KSE groups
73 * that will be cached. Because we support 1:1 threading, there could have
74 * same number of KSEs and KSE groups as threads. Once these levels are
75 * reached, any extra KSE and KSE groups will be free()'d.
76 */
77#ifdef SYSTEM_SCOPE_ONLY
78#define	MAX_CACHED_KSES		100
79#define	MAX_CACHED_KSEGS	100
80#else
81#define	MAX_CACHED_KSES		50
82#define	MAX_CACHED_KSEGS	50
83#endif
84
85#define	KSE_STACKSIZE		16384
86
87#define	KSE_SET_MBOX(kse, thrd) \
88	(kse)->k_kcb->kcb_kmbx.km_curthread = &(thrd)->tcb->tcb_tmbx
89
90#define	KSE_SET_EXITED(kse)	(kse)->k_flags |= KF_EXITED
91
92/*
93 * Macros for manipulating the run queues.  The priority queue
94 * routines use the thread's pqe link and also handle the setting
95 * and clearing of the thread's THR_FLAGS_IN_RUNQ flag.
96 */
97#define	KSE_RUNQ_INSERT_HEAD(kse, thrd)			\
98	_pq_insert_head(&(kse)->k_schedq->sq_runq, thrd)
99#define	KSE_RUNQ_INSERT_TAIL(kse, thrd)			\
100	_pq_insert_tail(&(kse)->k_schedq->sq_runq, thrd)
101#define	KSE_RUNQ_REMOVE(kse, thrd)			\
102	_pq_remove(&(kse)->k_schedq->sq_runq, thrd)
103#define	KSE_RUNQ_FIRST(kse)	_pq_first(&(kse)->k_schedq->sq_runq)
104
105#define KSE_RUNQ_THREADS(kse)	((kse)->k_schedq->sq_runq.pq_threads)
106
107/*
108 * We've got to keep track of everything that is allocated, not only
109 * to have a speedy free list, but also so they can be deallocated
110 * after a fork().
111 */
112static TAILQ_HEAD(, kse)	active_kseq;
113static TAILQ_HEAD(, kse)	free_kseq;
114static TAILQ_HEAD(, kse_group)	free_kse_groupq;
115static TAILQ_HEAD(, kse_group)	active_kse_groupq;
116static TAILQ_HEAD(, kse_group)	gc_ksegq;
117static struct lock		kse_lock;	/* also used for kseg queue */
118static int			free_kse_count = 0;
119static int			free_kseg_count = 0;
120static TAILQ_HEAD(, pthread)	free_threadq;
121static struct lock		thread_lock;
122static int			free_thread_count = 0;
123static int			inited = 0;
124static int			active_threads = 1;
125static int			active_kse_count = 0;
126static int			active_kseg_count = 0;
127static u_int64_t		next_uniqueid = 1;
128
129LIST_HEAD(thread_hash_head, pthread);
130#define THREAD_HASH_QUEUES	127
131static struct thread_hash_head	thr_hashtable[THREAD_HASH_QUEUES];
132#define	THREAD_HASH(thrd)	((unsigned long)thrd % THREAD_HASH_QUEUES)
133
134#ifdef DEBUG_THREAD_KERN
135static void	dump_queues(struct kse *curkse);
136#endif
137static void	kse_check_completed(struct kse *kse);
138static void	kse_check_waitq(struct kse *kse);
139static void	kse_fini(struct kse *curkse);
140static void	kse_reinit(struct kse *kse, int sys_scope);
141static void	kse_sched_multi(struct kse_mailbox *kmbx);
142static void	kse_sched_single(struct kse_mailbox *kmbx);
143static void	kse_switchout_thread(struct kse *kse, struct pthread *thread);
144static void	kse_wait(struct kse *kse, struct pthread *td_wait, int sigseq);
145static void	kse_free_unlocked(struct kse *kse);
146static void	kse_destroy(struct kse *kse);
147static void	kseg_free_unlocked(struct kse_group *kseg);
148static void	kseg_init(struct kse_group *kseg);
149static void	kseg_reinit(struct kse_group *kseg);
150static void	kseg_destroy(struct kse_group *kseg);
151static void	kse_waitq_insert(struct pthread *thread);
152static void	kse_wakeup_multi(struct kse *curkse);
153static struct kse_mailbox *kse_wakeup_one(struct pthread *thread);
154static void	thr_cleanup(struct kse *kse, struct pthread *curthread);
155static void	thr_link(struct pthread *thread);
156static void	thr_resume_wrapper(int sig, siginfo_t *, ucontext_t *);
157static void	thr_resume_check(struct pthread *curthread, ucontext_t *ucp,
158		    struct pthread_sigframe *psf);
159static int	thr_timedout(struct pthread *thread, struct timespec *curtime);
160static void	thr_unlink(struct pthread *thread);
161static void	thread_gc(struct pthread *thread);
162static void	kse_gc(struct pthread *thread);
163static void	kseg_gc(struct pthread *thread);
164
165static void __inline
166thr_accounting(struct pthread *thread)
167{
168	if ((thread->slice_usec != -1) &&
169	    (thread->slice_usec <= TIMESLICE_USEC) &&
170	    (thread->attr.sched_policy != SCHED_FIFO)) {
171		thread->slice_usec += (thread->tcb->tcb_tmbx.tm_uticks
172		    + thread->tcb->tcb_tmbx.tm_sticks) * _clock_res_usec;
173		/* Check for time quantum exceeded: */
174		if (thread->slice_usec > TIMESLICE_USEC)
175			thread->slice_usec = -1;
176	}
177	thread->tcb->tcb_tmbx.tm_uticks = 0;
178	thread->tcb->tcb_tmbx.tm_sticks = 0;
179}
180
181/*
182 * This is called after a fork().
183 * No locks need to be taken here since we are guaranteed to be
184 * single threaded.
185 *
186 * XXX
187 * POSIX says for threaded process, fork() function is used
188 * only to run new programs, and the effects of calling functions
189 * that require certain resources between the call to fork() and
190 * the call to an exec function are undefined.
191 *
192 * Here it is not safe to reinitialize the library after fork().
193 * Because memory management may be corrupted, further calling
194 * malloc()/free() may cause undefined behavior.
195 */
196void
197_kse_single_thread(struct pthread *curthread)
198{
199#ifdef NOTYET
200	struct kse *kse;
201	struct kse_group *kseg;
202	struct pthread *thread;
203	kse_critical_t crit;
204	int i;
205
206	if (__isthreaded) {
207		_thr_rtld_fini();
208		_thr_signal_deinit();
209	}
210	__isthreaded = 0;
211	/*
212	 * Restore signal mask early, so any memory problems could
213	 * dump core.
214	 */
215	sigprocmask(SIG_SETMASK, &curthread->sigmask, NULL);
216	active_threads = 1;
217
218	/*
219	 * Enter a loop to remove and free all threads other than
220	 * the running thread from the active thread list:
221	 */
222	while ((thread = TAILQ_FIRST(&_thread_list)) != NULL) {
223		THR_GCLIST_REMOVE(thread);
224		/*
225		 * Remove this thread from the list (the current
226		 * thread will be removed but re-added by libpthread
227		 * initialization.
228		 */
229		TAILQ_REMOVE(&_thread_list, thread, tle);
230		/* Make sure this isn't the running thread: */
231		if (thread != curthread) {
232			_thr_stack_free(&thread->attr);
233			if (thread->specific != NULL)
234				free(thread->specific);
235			for (i = 0; i < MAX_THR_LOCKLEVEL; i++) {
236				_lockuser_destroy(&thread->lockusers[i]);
237			}
238			_lock_destroy(&thread->lock);
239			free(thread);
240		}
241	}
242
243	TAILQ_INIT(&curthread->mutexq);		/* initialize mutex queue */
244	curthread->joiner = NULL;		/* no joining threads yet */
245	curthread->refcount = 0;
246	SIGEMPTYSET(curthread->sigpend);	/* clear pending signals */
247	if (curthread->specific != NULL) {
248		free(curthread->specific);
249		curthread->specific = NULL;
250		curthread->specific_data_count = 0;
251	}
252
253	/* Free the free KSEs: */
254	while ((kse = TAILQ_FIRST(&free_kseq)) != NULL) {
255		TAILQ_REMOVE(&free_kseq, kse, k_qe);
256		for (i = 0; i < MAX_KSE_LOCKLEVEL; i++) {
257			_lockuser_destroy(&kse->k_lockusers[i]);
258		}
259		_lock_destroy(&kse->k_lock);
260		_kcb_dtor(kse->k_kcb);
261		if (kse->k_stack.ss_sp != NULL)
262			free(kse->k_stack.ss_sp);
263		free(kse);
264	}
265	free_kse_count = 0;
266
267	/* Free the active KSEs: */
268	while ((kse = TAILQ_FIRST(&active_kseq)) != NULL) {
269		TAILQ_REMOVE(&active_kseq, kse, k_qe);
270		for (i = 0; i < MAX_KSE_LOCKLEVEL; i++) {
271			_lockuser_destroy(&kse->k_lockusers[i]);
272		}
273		_lock_destroy(&kse->k_lock);
274		if (kse->k_stack.ss_sp != NULL)
275			free(kse->k_stack.ss_sp);
276		free(kse);
277	}
278	active_kse_count = 0;
279
280	/* Free the free KSEGs: */
281	while ((kseg = TAILQ_FIRST(&free_kse_groupq)) != NULL) {
282		TAILQ_REMOVE(&free_kse_groupq, kseg, kg_qe);
283		_lock_destroy(&kseg->kg_lock);
284		_pq_free(&kseg->kg_schedq.sq_runq);
285		free(kseg);
286	}
287	free_kseg_count = 0;
288
289	/* Free the active KSEGs: */
290	while ((kseg = TAILQ_FIRST(&active_kse_groupq)) != NULL) {
291		TAILQ_REMOVE(&active_kse_groupq, kseg, kg_qe);
292		_lock_destroy(&kseg->kg_lock);
293		_pq_free(&kseg->kg_schedq.sq_runq);
294		free(kseg);
295	}
296	active_kseg_count = 0;
297
298	/* Free the free threads. */
299	while ((thread = TAILQ_FIRST(&free_threadq)) != NULL) {
300		TAILQ_REMOVE(&free_threadq, thread, tle);
301		if (thread->specific != NULL)
302			free(thread->specific);
303		for (i = 0; i < MAX_THR_LOCKLEVEL; i++) {
304			_lockuser_destroy(&thread->lockusers[i]);
305		}
306		_lock_destroy(&thread->lock);
307		free(thread);
308	}
309	free_thread_count = 0;
310
311	/* Free the to-be-gc'd threads. */
312	while ((thread = TAILQ_FIRST(&_thread_gc_list)) != NULL) {
313		TAILQ_REMOVE(&_thread_gc_list, thread, gcle);
314		for (i = 0; i < MAX_THR_LOCKLEVEL; i++) {
315			_lockuser_destroy(&thread->lockusers[i]);
316		}
317		_lock_destroy(&thread->lock);
318		free(thread);
319	}
320	TAILQ_INIT(&gc_ksegq);
321	_gc_count = 0;
322
323	if (inited != 0) {
324		/*
325		 * Destroy these locks; they'll be recreated to assure they
326		 * are in the unlocked state.
327		 */
328		_lock_destroy(&kse_lock);
329		_lock_destroy(&thread_lock);
330		_lock_destroy(&_thread_list_lock);
331		inited = 0;
332	}
333
334	/*
335	 * After a fork(), the leftover thread goes back to being
336	 * scope process.
337	 */
338	curthread->attr.flags &= ~PTHREAD_SCOPE_SYSTEM;
339	curthread->attr.flags |= PTHREAD_SCOPE_PROCESS;
340
341	/*
342	 * After a fork, we are still operating on the thread's original
343	 * stack.  Don't clear the THR_FLAGS_USER from the thread's
344	 * attribute flags.
345	 */
346
347	/* Initialize the threads library. */
348	curthread->kse = NULL;
349	curthread->kseg = NULL;
350	_kse_initial = NULL;
351	_libpthread_init(curthread);
352#else
353	if (__isthreaded) {
354		_thr_rtld_fini();
355		_thr_signal_deinit();
356	}
357	__isthreaded = 0;
358	/*
359	 * Restore signal mask early, so any memory problems could
360	 * dump core.
361	 */
362	sigprocmask(SIG_SETMASK, &curthread->sigmask, NULL);
363	curthread->kse->k_kcb->kcb_kmbx.km_curthread = NULL;
364	active_threads = 1;
365#endif
366}
367
368/*
369 * This is used to initialize housekeeping and to initialize the
370 * KSD for the KSE.
371 */
372void
373_kse_init(void)
374{
375	if (inited == 0) {
376		TAILQ_INIT(&active_kseq);
377		TAILQ_INIT(&active_kse_groupq);
378		TAILQ_INIT(&free_kseq);
379		TAILQ_INIT(&free_kse_groupq);
380		TAILQ_INIT(&free_threadq);
381		TAILQ_INIT(&gc_ksegq);
382		if (_lock_init(&kse_lock, LCK_ADAPTIVE,
383		    _kse_lock_wait, _kse_lock_wakeup) != 0)
384			PANIC("Unable to initialize free KSE queue lock");
385		if (_lock_init(&thread_lock, LCK_ADAPTIVE,
386		    _kse_lock_wait, _kse_lock_wakeup) != 0)
387			PANIC("Unable to initialize free thread queue lock");
388		if (_lock_init(&_thread_list_lock, LCK_ADAPTIVE,
389		    _kse_lock_wait, _kse_lock_wakeup) != 0)
390			PANIC("Unable to initialize thread list lock");
391		active_kse_count = 0;
392		active_kseg_count = 0;
393		_gc_count = 0;
394		inited = 1;
395	}
396}
397
398int
399_kse_isthreaded(void)
400{
401	return (__isthreaded != 0);
402}
403
404/*
405 * This is called when the first thread (other than the initial
406 * thread) is created.
407 */
408int
409_kse_setthreaded(int threaded)
410{
411	sigset_t sigset;
412
413	if ((threaded != 0) && (__isthreaded == 0)) {
414		SIGFILLSET(sigset);
415		__sys_sigprocmask(SIG_SETMASK, &sigset, &_thr_initial->sigmask);
416
417		/*
418		 * Tell the kernel to create a KSE for the initial thread
419		 * and enable upcalls in it.
420		 */
421		_kse_initial->k_flags |= KF_STARTED;
422
423#ifdef SYSTEM_SCOPE_ONLY
424		/*
425		 * For bound thread, kernel reads mailbox pointer once,
426		 * we'd set it here before calling kse_create
427		 */
428		_tcb_set(_kse_initial->k_kcb, _thr_initial->tcb);
429		KSE_SET_MBOX(_kse_initial, _thr_initial);
430		_kse_initial->k_kcb->kcb_kmbx.km_flags |= KMF_BOUND;
431#else
432		_thr_initial->attr.flags &= ~PTHREAD_SCOPE_SYSTEM;
433		_kse_initial->k_kseg->kg_flags &= ~KGF_SINGLE_THREAD;
434		_kse_initial->k_kcb->kcb_kmbx.km_curthread = NULL;
435#endif
436
437		/*
438		 * Locking functions in libc are required when there are
439		 * threads other than the initial thread.
440		 */
441		_thr_rtld_init();
442
443		__isthreaded = 1;
444		if (kse_create(&_kse_initial->k_kcb->kcb_kmbx, 0) != 0) {
445			_kse_initial->k_flags &= ~KF_STARTED;
446			__isthreaded = 0;
447			PANIC("kse_create() failed\n");
448			return (-1);
449		}
450
451#ifndef SYSTEM_SCOPE_ONLY
452		/* Set current thread to initial thread */
453		_tcb_set(_kse_initial->k_kcb, _thr_initial->tcb);
454		KSE_SET_MBOX(_kse_initial, _thr_initial);
455		_thr_start_sig_daemon();
456		_thr_setmaxconcurrency();
457#else
458		__sys_sigprocmask(SIG_SETMASK, &_thr_initial->sigmask, NULL);
459#endif
460	}
461	return (0);
462}
463
464/*
465 * Lock wait and wakeup handlers for KSE locks.  These are only used by
466 * KSEs, and should never be used by threads.  KSE locks include the
467 * KSE group lock (used for locking the scheduling queue) and the
468 * kse_lock defined above.
469 *
470 * When a KSE lock attempt blocks, the entire KSE blocks allowing another
471 * KSE to run.  For the most part, it doesn't make much sense to try and
472 * schedule another thread because you need to lock the scheduling queue
473 * in order to do that.  And since the KSE lock is used to lock the scheduling
474 * queue, you would just end up blocking again.
475 */
476void
477_kse_lock_wait(struct lock *lock, struct lockuser *lu)
478{
479	struct kse *curkse = (struct kse *)_LCK_GET_PRIVATE(lu);
480	struct timespec ts;
481	int saved_flags;
482
483	if (curkse->k_kcb->kcb_kmbx.km_curthread != NULL)
484		PANIC("kse_lock_wait does not disable upcall.\n");
485	/*
486	 * Enter a loop to wait until we get the lock.
487	 */
488	ts.tv_sec = 0;
489	ts.tv_nsec = 1000000;  /* 1 sec */
490	while (!_LCK_GRANTED(lu)) {
491		/*
492		 * Yield the kse and wait to be notified when the lock
493		 * is granted.
494		 */
495		saved_flags = curkse->k_kcb->kcb_kmbx.km_flags;
496		curkse->k_kcb->kcb_kmbx.km_flags |= KMF_NOUPCALL |
497		    KMF_NOCOMPLETED;
498		kse_release(&ts);
499		curkse->k_kcb->kcb_kmbx.km_flags = saved_flags;
500	}
501}
502
503void
504_kse_lock_wakeup(struct lock *lock, struct lockuser *lu)
505{
506	struct kse *curkse;
507	struct kse *kse;
508	struct kse_mailbox *mbx;
509
510	curkse = _get_curkse();
511	kse = (struct kse *)_LCK_GET_PRIVATE(lu);
512
513	if (kse == curkse)
514		PANIC("KSE trying to wake itself up in lock");
515	else {
516		mbx = &kse->k_kcb->kcb_kmbx;
517		_lock_grant(lock, lu);
518		/*
519		 * Notify the owning kse that it has the lock.
520		 * It is safe to pass invalid address to kse_wakeup
521		 * even if the mailbox is not in kernel at all,
522		 * and waking up a wrong kse is also harmless.
523		 */
524		kse_wakeup(mbx);
525	}
526}
527
528/*
529 * Thread wait and wakeup handlers for thread locks.  These are only used
530 * by threads, never by KSEs.  Thread locks include the per-thread lock
531 * (defined in its structure), and condition variable and mutex locks.
532 */
533void
534_thr_lock_wait(struct lock *lock, struct lockuser *lu)
535{
536	struct pthread *curthread = (struct pthread *)lu->lu_private;
537
538	do {
539		THR_LOCK_SWITCH(curthread);
540		THR_SET_STATE(curthread, PS_LOCKWAIT);
541		_thr_sched_switch_unlocked(curthread);
542	} while (!_LCK_GRANTED(lu));
543}
544
545void
546_thr_lock_wakeup(struct lock *lock, struct lockuser *lu)
547{
548	struct pthread *thread;
549	struct pthread *curthread;
550	struct kse_mailbox *kmbx;
551
552	curthread = _get_curthread();
553	thread = (struct pthread *)_LCK_GET_PRIVATE(lu);
554
555	THR_SCHED_LOCK(curthread, thread);
556	_lock_grant(lock, lu);
557	kmbx = _thr_setrunnable_unlocked(thread);
558	THR_SCHED_UNLOCK(curthread, thread);
559	if (kmbx != NULL)
560		kse_wakeup(kmbx);
561}
562
563kse_critical_t
564_kse_critical_enter(void)
565{
566	kse_critical_t crit;
567
568	crit = (kse_critical_t)_kcb_critical_enter();
569	return (crit);
570}
571
572void
573_kse_critical_leave(kse_critical_t crit)
574{
575	struct pthread *curthread;
576
577	_kcb_critical_leave((struct kse_thr_mailbox *)crit);
578	if ((crit != NULL) && ((curthread = _get_curthread()) != NULL))
579		THR_YIELD_CHECK(curthread);
580}
581
582int
583_kse_in_critical(void)
584{
585	return (_kcb_in_critical());
586}
587
588void
589_thr_critical_enter(struct pthread *thread)
590{
591	thread->critical_count++;
592}
593
594void
595_thr_critical_leave(struct pthread *thread)
596{
597	thread->critical_count--;
598	THR_YIELD_CHECK(thread);
599}
600
601void
602_thr_sched_switch(struct pthread *curthread)
603{
604	struct kse *curkse;
605
606	(void)_kse_critical_enter();
607	curkse = _get_curkse();
608	KSE_SCHED_LOCK(curkse, curkse->k_kseg);
609	_thr_sched_switch_unlocked(curthread);
610}
611
612/*
613 * XXX - We may need to take the scheduling lock before calling
614 *       this, or perhaps take the lock within here before
615 *       doing anything else.
616 */
617void
618_thr_sched_switch_unlocked(struct pthread *curthread)
619{
620	struct pthread *td;
621	struct pthread_sigframe psf;
622	struct kse *curkse;
623	int ret;
624	volatile int uts_once;
625	volatile int resume_once = 0;
626	ucontext_t uc;
627
628	/* We're in the scheduler, 5 by 5: */
629	curkse = _get_curkse();
630
631	curthread->need_switchout = 1;	/* The thread yielded on its own. */
632	curthread->critical_yield = 0;	/* No need to yield anymore. */
633	thr_accounting(curthread);
634
635
636	/* Thread can unlock the scheduler lock. */
637	curthread->lock_switch = 1;
638
639	/*
640	 * The signal frame is allocated off the stack because
641	 * a thread can be interrupted by other signals while
642	 * it is running down pending signals.
643	 */
644	psf.psf_valid = 0;
645	curthread->curframe = &psf;
646
647	/*
648	 * Enter the scheduler if any one of the following is true:
649	 *
650	 *   o The current thread is dead; it's stack needs to be
651	 *     cleaned up and it can't be done while operating on
652	 *     it.
653	 *   o The current thread has signals pending, should
654	 *     let scheduler install signal trampoline for us.
655	 *   o There are no runnable threads.
656	 *   o The next thread to run won't unlock the scheduler
657	 *     lock.  A side note: the current thread may be run
658	 *     instead of the next thread in the run queue, but
659	 *     we don't bother checking for that.
660	 */
661	if (curthread->attr.flags & PTHREAD_SCOPE_SYSTEM)
662		kse_sched_single(&curkse->k_kcb->kcb_kmbx);
663	else if ((curthread->state == PS_DEAD) ||
664	    (((td = KSE_RUNQ_FIRST(curkse)) == NULL) &&
665	    (curthread->state != PS_RUNNING)) ||
666	    ((td != NULL) && (td->lock_switch == 0))) {
667		curkse->k_switch = 1;
668		_thread_enter_uts(curthread->tcb, curkse->k_kcb);
669	}
670	else {
671		uts_once = 0;
672		THR_GETCONTEXT(&curthread->tcb->tcb_tmbx.tm_context);
673		if (uts_once == 0) {
674			uts_once = 1;
675
676			/* Switchout the current thread. */
677			kse_switchout_thread(curkse, curthread);
678			_tcb_set(curkse->k_kcb, NULL);
679
680		 	/* Choose another thread to run. */
681			td = KSE_RUNQ_FIRST(curkse);
682			KSE_RUNQ_REMOVE(curkse, td);
683			curkse->k_curthread = td;
684
685			/*
686			 * Make sure the current thread's kse points to
687			 * this kse.
688			 */
689			td->kse = curkse;
690
691			/*
692			 * Reset the time slice if this thread is running
693			 * for the first time or running again after using
694			 * its full time slice allocation.
695			 */
696			if (td->slice_usec == -1)
697				td->slice_usec = 0;
698
699			/* Mark the thread active. */
700			td->active = 1;
701
702			/* Remove the frame reference. */
703			td->curframe = NULL;
704
705			/*
706			 * Continue the thread at its current frame.
707			 * Note: TCB is set in _thread_switch
708			 */
709			ret = _thread_switch(curkse->k_kcb, td->tcb, 0);
710			/* This point should not be reached. */
711			if (ret != 0)
712				PANIC("Bad return from _thread_switch");
713			PANIC("Thread has returned from _thread_switch");
714		}
715	}
716
717	if (psf.psf_valid) {
718		/*
719		 * It is ugly we must increase critical count, because we
720		 * have a frame saved, we must backout state in psf
721		 * before we can process signals.
722 		 */
723		curthread->critical_count++;
724	}
725
726	if (curthread->lock_switch != 0) {
727		/*
728		 * Unlock the scheduling queue and leave the
729		 * critical region.
730		 */
731		/* Don't trust this after a switch! */
732		curkse = _get_curkse();
733
734		curthread->lock_switch = 0;
735		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
736		_kse_critical_leave(&curthread->tcb->tcb_tmbx);
737	}
738	/*
739	 * This thread is being resumed; check for cancellations.
740	 */
741	if ((psf.psf_valid ||
742	    (curthread->check_pending && !THR_IN_CRITICAL(curthread)))) {
743		resume_once = 0;
744		THR_GETCONTEXT(&uc);
745		if (resume_once == 0) {
746			resume_once = 1;
747			curthread->check_pending = 0;
748			thr_resume_check(curthread, &uc, &psf);
749		}
750	}
751	THR_ACTIVATE_LAST_LOCK(curthread);
752}
753
754/*
755 * This is the scheduler for a KSE which runs a scope system thread.
756 * The multi-thread KSE scheduler should also work for a single threaded
757 * KSE, but we use a separate scheduler so that it can be fine-tuned
758 * to be more efficient (and perhaps not need a separate stack for
759 * the KSE, allowing it to use the thread's stack).
760 */
761
762static void
763kse_sched_single(struct kse_mailbox *kmbx)
764{
765	struct kse *curkse;
766	struct pthread *curthread;
767	struct timespec ts;
768	sigset_t sigmask;
769	int i, sigseqno, level, first = 0;
770
771	curkse = (struct kse *)kmbx->km_udata;
772	curthread = curkse->k_curthread;
773
774	if ((curkse->k_flags & KF_INITIALIZED) == 0) {
775		/* Setup this KSEs specific data. */
776		_kcb_set(curkse->k_kcb);
777		_tcb_set(curkse->k_kcb, curthread->tcb);
778		curkse->k_flags |= KF_INITIALIZED;
779		first = 1;
780		curthread->active = 1;
781
782		/* Setup kernel signal masks for new thread. */
783		__sys_sigprocmask(SIG_SETMASK, &curthread->sigmask, NULL);
784		/*
785		 * Enter critical region, this is meanless for bound thread,
786		 * It is used to let other code work, those code want mailbox
787		 * to be cleared.
788		 */
789		(void)_kse_critical_enter();
790 	} else {
791		/*
792		 * Bound thread always has tcb set, this prevent some
793		 * code from blindly setting bound thread tcb to NULL,
794		 * buggy code ?
795		 */
796		_tcb_set(curkse->k_kcb, curthread->tcb);
797	}
798
799	curthread->critical_yield = 0;
800	curthread->need_switchout = 0;
801
802	/*
803	 * Lock the scheduling queue.
804	 *
805	 * There is no scheduling queue for single threaded KSEs,
806	 * but we need a lock for protection regardless.
807	 */
808	if (curthread->lock_switch == 0)
809		KSE_SCHED_LOCK(curkse, curkse->k_kseg);
810
811	/*
812	 * This has to do the job of kse_switchout_thread(), only
813	 * for a single threaded KSE/KSEG.
814	 */
815
816	switch (curthread->state) {
817	case PS_DEAD:
818		curthread->check_pending = 0;
819		/* Unlock the scheduling queue and exit the KSE and thread. */
820		thr_cleanup(curkse, curthread);
821		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
822		PANIC("bound thread shouldn't get here\n");
823		break;
824
825	case PS_SIGWAIT:
826		PANIC("bound thread does not have SIGWAIT state\n");
827
828	case PS_SLEEP_WAIT:
829		PANIC("bound thread does not have SLEEP_WAIT state\n");
830
831	case PS_SIGSUSPEND:
832		PANIC("bound thread does not have SIGSUSPEND state\n");
833
834	case PS_COND_WAIT:
835		break;
836
837	case PS_LOCKWAIT:
838		/*
839		 * This state doesn't timeout.
840		 */
841		curthread->wakeup_time.tv_sec = -1;
842		curthread->wakeup_time.tv_nsec = -1;
843		level = curthread->locklevel - 1;
844		if (_LCK_GRANTED(&curthread->lockusers[level]))
845			THR_SET_STATE(curthread, PS_RUNNING);
846		break;
847
848	case PS_RUNNING:
849		if ((curthread->flags & THR_FLAGS_SUSPENDED) != 0) {
850			THR_SET_STATE(curthread, PS_SUSPENDED);
851		}
852		curthread->wakeup_time.tv_sec = -1;
853		curthread->wakeup_time.tv_nsec = -1;
854		break;
855
856	case PS_JOIN:
857	case PS_MUTEX_WAIT:
858	case PS_SUSPENDED:
859	case PS_DEADLOCK:
860	default:
861		/*
862		 * These states don't timeout and don't need
863		 * to be in the waiting queue.
864		 */
865		curthread->wakeup_time.tv_sec = -1;
866		curthread->wakeup_time.tv_nsec = -1;
867		break;
868	}
869
870	while (curthread->state != PS_RUNNING) {
871		sigseqno = curkse->k_sigseqno;
872		if (curthread->check_pending != 0) {
873			/*
874			 * Install pending signals into the frame, possible
875			 * cause mutex or condvar backout.
876			 */
877			curthread->check_pending = 0;
878			SIGFILLSET(sigmask);
879
880			/*
881			 * Lock out kernel signal code when we are processing
882			 * signals, and get a fresh copy of signal mask.
883			 */
884			__sys_sigprocmask(SIG_SETMASK, &sigmask,
885					  &curthread->sigmask);
886			for (i = 1; i <= _SIG_MAXSIG; i++) {
887				if (SIGISMEMBER(curthread->sigmask, i))
888					continue;
889				if (SIGISMEMBER(curthread->sigpend, i))
890					(void)_thr_sig_add(curthread, i,
891					    &curthread->siginfo[i-1]);
892			}
893			__sys_sigprocmask(SIG_SETMASK, &curthread->sigmask,
894				NULL);
895			/* The above code might make thread runnable */
896			if (curthread->state == PS_RUNNING)
897				break;
898		}
899		THR_DEACTIVATE_LAST_LOCK(curthread);
900		kse_wait(curkse, curthread, sigseqno);
901		THR_ACTIVATE_LAST_LOCK(curthread);
902		KSE_GET_TOD(curkse, &ts);
903		if (thr_timedout(curthread, &ts)) {
904			/* Indicate the thread timedout: */
905			curthread->timeout = 1;
906			/* Make the thread runnable. */
907			THR_SET_STATE(curthread, PS_RUNNING);
908		}
909	}
910
911	/* Remove the frame reference. */
912	curthread->curframe = NULL;
913
914	if (curthread->lock_switch == 0) {
915		/* Unlock the scheduling queue. */
916		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
917	}
918
919	DBG_MSG("Continuing bound thread %p\n", curthread);
920	if (first) {
921		_kse_critical_leave(&curthread->tcb->tcb_tmbx);
922		pthread_exit(curthread->start_routine(curthread->arg));
923	}
924}
925
926#ifdef DEBUG_THREAD_KERN
927static void
928dump_queues(struct kse *curkse)
929{
930	struct pthread *thread;
931
932	DBG_MSG("Threads in waiting queue:\n");
933	TAILQ_FOREACH(thread, &curkse->k_kseg->kg_schedq.sq_waitq, pqe) {
934		DBG_MSG("  thread %p, state %d, blocked %d\n",
935		    thread, thread->state, thread->blocked);
936	}
937}
938#endif
939
940/*
941 * This is the scheduler for a KSE which runs multiple threads.
942 */
943static void
944kse_sched_multi(struct kse_mailbox *kmbx)
945{
946	struct kse *curkse;
947	struct pthread *curthread, *td_wait;
948	struct pthread_sigframe *curframe;
949	int ret;
950
951	curkse = (struct kse *)kmbx->km_udata;
952	THR_ASSERT(curkse->k_kcb->kcb_kmbx.km_curthread == NULL,
953	    "Mailbox not null in kse_sched_multi");
954
955	/* Check for first time initialization: */
956	if ((curkse->k_flags & KF_INITIALIZED) == 0) {
957		/* Setup this KSEs specific data. */
958		_kcb_set(curkse->k_kcb);
959
960		/* Set this before grabbing the context. */
961		curkse->k_flags |= KF_INITIALIZED;
962	}
963
964	/*
965	 * No current thread anymore, calling _get_curthread in UTS
966	 * should dump core
967	 */
968	_tcb_set(curkse->k_kcb, NULL);
969
970	/* If this is an upcall; take the scheduler lock. */
971	if (curkse->k_switch == 0)
972		KSE_SCHED_LOCK(curkse, curkse->k_kseg);
973	curkse->k_switch = 0;
974
975	/*
976	 * Now that the scheduler lock is held, get the current
977	 * thread.  The KSE's current thread cannot be safely
978	 * examined without the lock because it could have returned
979	 * as completed on another KSE.  See kse_check_completed().
980	 */
981	curthread = curkse->k_curthread;
982
983	if (KSE_IS_IDLE(curkse)) {
984		KSE_CLEAR_IDLE(curkse);
985		curkse->k_kseg->kg_idle_kses--;
986	}
987	/*
988	 * If the current thread was completed in another KSE, then
989	 * it will be in the run queue.  Don't mark it as being blocked.
990	 */
991	if ((curthread != NULL) &&
992	    ((curthread->flags & THR_FLAGS_IN_RUNQ) == 0) &&
993	    (curthread->need_switchout == 0)) {
994		/*
995		 * Assume the current thread is blocked; when the
996		 * completed threads are checked and if the current
997		 * thread is among the completed, the blocked flag
998		 * will be cleared.
999		 */
1000		curthread->blocked = 1;
1001	}
1002
1003	/* Check for any unblocked threads in the kernel. */
1004	kse_check_completed(curkse);
1005
1006	/*
1007	 * Check for threads that have timed-out.
1008	 */
1009	kse_check_waitq(curkse);
1010
1011	/*
1012	 * Switchout the current thread, if necessary, as the last step
1013	 * so that it is inserted into the run queue (if it's runnable)
1014	 * _after_ any other threads that were added to it above.
1015	 */
1016	if (curthread == NULL)
1017		;  /* Nothing to do here. */
1018	else if ((curthread->need_switchout == 0) &&
1019	    (curthread->blocked == 0) && (THR_IN_CRITICAL(curthread))) {
1020		/*
1021		 * Resume the thread and tell it to yield when
1022		 * it leaves the critical region.
1023		 */
1024		curthread->critical_yield = 1;
1025		curthread->active = 1;
1026		if ((curthread->flags & THR_FLAGS_IN_RUNQ) != 0)
1027			KSE_RUNQ_REMOVE(curkse, curthread);
1028		curkse->k_curthread = curthread;
1029		curthread->kse = curkse;
1030		DBG_MSG("Continuing thread %p in critical region\n",
1031		    curthread);
1032		kse_wakeup_multi(curkse);
1033		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
1034		ret = _thread_switch(curkse->k_kcb, curthread->tcb, 1);
1035		if (ret != 0)
1036			PANIC("Can't resume thread in critical region\n");
1037	}
1038	else if ((curthread->flags & THR_FLAGS_IN_RUNQ) == 0)
1039		kse_switchout_thread(curkse, curthread);
1040	curkse->k_curthread = NULL;
1041
1042#ifdef DEBUG_THREAD_KERN
1043	dump_queues(curkse);
1044#endif
1045
1046	/* Check if there are no threads ready to run: */
1047	while (((curthread = KSE_RUNQ_FIRST(curkse)) == NULL) &&
1048	    (curkse->k_kseg->kg_threadcount != 0) &&
1049	    ((curkse->k_flags & KF_TERMINATED) == 0)) {
1050		/*
1051		 * Wait for a thread to become active or until there are
1052		 * no more threads.
1053		 */
1054		td_wait = KSE_WAITQ_FIRST(curkse);
1055		kse_wait(curkse, td_wait, 0);
1056		kse_check_completed(curkse);
1057		kse_check_waitq(curkse);
1058	}
1059
1060	/* Check for no more threads: */
1061	if ((curkse->k_kseg->kg_threadcount == 0) ||
1062	    ((curkse->k_flags & KF_TERMINATED) != 0)) {
1063		/*
1064		 * Normally this shouldn't return, but it will if there
1065		 * are other KSEs running that create new threads that
1066		 * are assigned to this KSE[G].  For instance, if a scope
1067		 * system thread were to create a scope process thread
1068		 * and this kse[g] is the initial kse[g], then that newly
1069		 * created thread would be assigned to us (the initial
1070		 * kse[g]).
1071		 */
1072		kse_wakeup_multi(curkse);
1073		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
1074		kse_fini(curkse);
1075		/* never returns */
1076	}
1077
1078	THR_ASSERT(curthread != NULL,
1079	    "Return from kse_wait/fini without thread.");
1080	THR_ASSERT(curthread->state != PS_DEAD,
1081	    "Trying to resume dead thread!");
1082	KSE_RUNQ_REMOVE(curkse, curthread);
1083
1084	/*
1085	 * Make the selected thread the current thread.
1086	 */
1087	curkse->k_curthread = curthread;
1088
1089	/*
1090	 * Make sure the current thread's kse points to this kse.
1091	 */
1092	curthread->kse = curkse;
1093
1094	/*
1095	 * Reset the time slice if this thread is running for the first
1096	 * time or running again after using its full time slice allocation.
1097	 */
1098	if (curthread->slice_usec == -1)
1099		curthread->slice_usec = 0;
1100
1101	/* Mark the thread active. */
1102	curthread->active = 1;
1103
1104	/* Remove the frame reference. */
1105	curframe = curthread->curframe;
1106	curthread->curframe = NULL;
1107
1108	/*
1109	 * The thread's current signal frame will only be NULL if it
1110	 * is being resumed after being blocked in the kernel.  In
1111	 * this case, and if the thread needs to run down pending
1112	 * signals or needs a cancellation check, we need to add a
1113	 * signal frame to the thread's context.
1114	 */
1115#ifdef NOT_YET
1116	if ((((curframe == NULL) && (curthread->check_pending != 0)) ||
1117	    (((curthread->cancelflags & THR_AT_CANCEL_POINT) == 0) &&
1118	     ((curthread->cancelflags & PTHREAD_CANCEL_ASYNCHRONOUS) != 0))) &&
1119	     !THR_IN_CRITICAL(curthread))
1120		signalcontext(&curthread->tcb->tcb_tmbx.tm_context, 0,
1121		    (__sighandler_t *)thr_resume_wrapper);
1122#else
1123	if ((curframe == NULL) && (curthread->state == PS_RUNNING) &&
1124	    (curthread->check_pending != 0) && !THR_IN_CRITICAL(curthread)) {
1125		curthread->check_pending = 0;
1126		signalcontext(&curthread->tcb->tcb_tmbx.tm_context, 0,
1127		    (__sighandler_t *)thr_resume_wrapper);
1128	}
1129#endif
1130	kse_wakeup_multi(curkse);
1131	/*
1132	 * Continue the thread at its current frame:
1133	 */
1134	if (curthread->lock_switch != 0) {
1135		/*
1136		 * This thread came from a scheduler switch; it will
1137		 * unlock the scheduler lock and set the mailbox.
1138		 */
1139		ret = _thread_switch(curkse->k_kcb, curthread->tcb, 0);
1140	} else {
1141		/* This thread won't unlock the scheduler lock. */
1142		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
1143		ret = _thread_switch(curkse->k_kcb, curthread->tcb, 1);
1144	}
1145	if (ret != 0)
1146		PANIC("Thread has returned from _thread_switch");
1147
1148	/* This point should not be reached. */
1149	PANIC("Thread has returned from _thread_switch");
1150}
1151
1152static void
1153thr_resume_wrapper(int sig, siginfo_t *siginfo, ucontext_t *ucp)
1154{
1155	struct pthread *curthread = _get_curthread();
1156	struct kse *curkse;
1157	int ret, err_save = errno;
1158
1159	DBG_MSG(">>> sig wrapper\n");
1160	if (curthread->lock_switch)
1161		PANIC("thr_resume_wrapper, lock_switch != 0\n");
1162	thr_resume_check(curthread, ucp, NULL);
1163	errno = err_save;
1164	_kse_critical_enter();
1165	curkse = _get_curkse();
1166	curthread->tcb->tcb_tmbx.tm_context = *ucp;
1167	ret = _thread_switch(curkse->k_kcb, curthread->tcb, 1);
1168	if (ret != 0)
1169		PANIC("thr_resume_wrapper: thread has returned "
1170		      "from _thread_switch");
1171	/* THR_SETCONTEXT(ucp); */ /* not work, why ? */
1172}
1173
1174static void
1175thr_resume_check(struct pthread *curthread, ucontext_t *ucp,
1176    struct pthread_sigframe *psf)
1177{
1178	_thr_sig_rundown(curthread, ucp, psf);
1179
1180#ifdef NOT_YET
1181	if (((curthread->cancelflags & THR_AT_CANCEL_POINT) == 0) &&
1182	    ((curthread->cancelflags & PTHREAD_CANCEL_ASYNCHRONOUS) != 0))
1183		pthread_testcancel();
1184#endif
1185}
1186
1187/*
1188 * Clean up a thread.  This must be called with the thread's KSE
1189 * scheduling lock held.  The thread must be a thread from the
1190 * KSE's group.
1191 */
1192static void
1193thr_cleanup(struct kse *curkse, struct pthread *thread)
1194{
1195	struct pthread *joiner;
1196	struct kse_mailbox *kmbx = NULL;
1197	int sys_scope;
1198
1199	if ((joiner = thread->joiner) != NULL) {
1200		/* Joinee scheduler lock held; joiner won't leave. */
1201		if (joiner->kseg == curkse->k_kseg) {
1202			if (joiner->join_status.thread == thread) {
1203				joiner->join_status.thread = NULL;
1204				joiner->join_status.ret = thread->ret;
1205				(void)_thr_setrunnable_unlocked(joiner);
1206			}
1207		} else {
1208			KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
1209			/* The joiner may have removed itself and exited. */
1210			if (_thr_ref_add(thread, joiner, 0) == 0) {
1211				KSE_SCHED_LOCK(curkse, joiner->kseg);
1212				if (joiner->join_status.thread == thread) {
1213					joiner->join_status.thread = NULL;
1214					joiner->join_status.ret = thread->ret;
1215					kmbx = _thr_setrunnable_unlocked(joiner);
1216				}
1217				KSE_SCHED_UNLOCK(curkse, joiner->kseg);
1218				_thr_ref_delete(thread, joiner);
1219				if (kmbx != NULL)
1220					kse_wakeup(kmbx);
1221			}
1222			KSE_SCHED_LOCK(curkse, curkse->k_kseg);
1223		}
1224		thread->attr.flags |= PTHREAD_DETACHED;
1225	}
1226
1227	if (!(sys_scope = (thread->attr.flags & PTHREAD_SCOPE_SYSTEM))) {
1228		/*
1229		 * Remove the thread from the KSEG's list of threads.
1230	 	 */
1231		KSEG_THRQ_REMOVE(thread->kseg, thread);
1232		/*
1233		 * Migrate the thread to the main KSE so that this
1234		 * KSE and KSEG can be cleaned when their last thread
1235		 * exits.
1236		 */
1237		thread->kseg = _kse_initial->k_kseg;
1238		thread->kse = _kse_initial;
1239	}
1240	thread->flags |= THR_FLAGS_GC_SAFE;
1241
1242	/*
1243	 * We can't hold the thread list lock while holding the
1244	 * scheduler lock.
1245	 */
1246	KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
1247	DBG_MSG("Adding thread %p to GC list\n", thread);
1248	KSE_LOCK_ACQUIRE(curkse, &_thread_list_lock);
1249	/* Use thread_list_lock */
1250	active_threads--;
1251#ifdef SYSTEM_SCOPE_ONLY
1252	if (active_threads == 0) {
1253#else
1254	if (active_threads == 1) {
1255#endif
1256		KSE_LOCK_RELEASE(curkse, &_thread_list_lock);
1257		/* Possible use a signalcontext wrapper to call exit ? */
1258		curkse->k_curthread = thread;
1259		_tcb_set(curkse->k_kcb, thread->tcb);
1260		exit(0);
1261        }
1262	THR_GCLIST_ADD(thread);
1263	KSE_LOCK_RELEASE(curkse, &_thread_list_lock);
1264	if (sys_scope) {
1265		/*
1266		 * System scope thread is single thread group,
1267		 * when thread is exited, its kse and ksegrp should
1268		 * be recycled as well.
1269		 * kse upcall stack belongs to thread, clear it here.
1270		 */
1271		curkse->k_stack.ss_sp = 0;
1272		curkse->k_stack.ss_size = 0;
1273		kse_exit();
1274		PANIC("kse_exit() failed for system scope thread");
1275	}
1276	KSE_SCHED_LOCK(curkse, curkse->k_kseg);
1277}
1278
1279void
1280_thr_gc(struct pthread *curthread)
1281{
1282	thread_gc(curthread);
1283	kse_gc(curthread);
1284	kseg_gc(curthread);
1285}
1286
1287static void
1288thread_gc(struct pthread *curthread)
1289{
1290	struct pthread *td, *td_next;
1291	kse_critical_t crit;
1292	TAILQ_HEAD(, pthread) worklist;
1293
1294	TAILQ_INIT(&worklist);
1295	crit = _kse_critical_enter();
1296	KSE_LOCK_ACQUIRE(curthread->kse, &_thread_list_lock);
1297
1298	/* Check the threads waiting for GC. */
1299	for (td = TAILQ_FIRST(&_thread_gc_list); td != NULL; td = td_next) {
1300		td_next = TAILQ_NEXT(td, gcle);
1301		if ((td->flags & THR_FLAGS_GC_SAFE) == 0)
1302			continue;
1303		else if (((td->attr.flags & PTHREAD_SCOPE_SYSTEM) != 0) &&
1304		    ((td->kse->k_kcb->kcb_kmbx.km_flags & KMF_DONE) == 0)) {
1305			/*
1306			 * The thread and KSE are operating on the same
1307			 * stack.  Wait for the KSE to exit before freeing
1308			 * the thread's stack as well as everything else.
1309			 */
1310			continue;
1311		}
1312		/*
1313		 * Remove the thread from the GC list.  If the thread
1314		 * isn't yet detached, it will get added back to the
1315		 * GC list at a later time.
1316		 */
1317		THR_GCLIST_REMOVE(td);
1318		DBG_MSG("Freeing thread %p stack\n", td);
1319		/*
1320		 * We can free the thread stack since it's no longer
1321		 * in use.
1322		 */
1323		_thr_stack_free(&td->attr);
1324		if (((td->attr.flags & PTHREAD_DETACHED) != 0) &&
1325		    (td->refcount == 0)) {
1326			/*
1327			 * The thread has detached and is no longer
1328			 * referenced.  It is safe to remove all
1329			 * remnants of the thread.
1330			 */
1331			THR_LIST_REMOVE(td);
1332			TAILQ_INSERT_HEAD(&worklist, td, gcle);
1333		}
1334	}
1335	KSE_LOCK_RELEASE(curthread->kse, &_thread_list_lock);
1336	_kse_critical_leave(crit);
1337
1338	while ((td = TAILQ_FIRST(&worklist)) != NULL) {
1339		TAILQ_REMOVE(&worklist, td, gcle);
1340		/*
1341		 * XXX we don't free initial thread and its kse
1342		 * (if thread is a bound thread), because there might
1343		 * have some code referencing initial thread and kse.
1344		 */
1345		if (td == _thr_initial) {
1346			DBG_MSG("Initial thread won't be freed\n");
1347			continue;
1348		}
1349
1350		if ((td->attr.flags & PTHREAD_SCOPE_SYSTEM) != 0) {
1351			crit = _kse_critical_enter();
1352			KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
1353			kse_free_unlocked(td->kse);
1354			kseg_free_unlocked(td->kseg);
1355			KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
1356			_kse_critical_leave(crit);
1357		}
1358		DBG_MSG("Freeing thread %p\n", td);
1359		_thr_free(curthread, td);
1360	}
1361}
1362
1363static void
1364kse_gc(struct pthread *curthread)
1365{
1366	kse_critical_t crit;
1367	TAILQ_HEAD(, kse) worklist;
1368	struct kse *kse;
1369
1370	if (free_kse_count <= MAX_CACHED_KSES)
1371		return;
1372	TAILQ_INIT(&worklist);
1373	crit = _kse_critical_enter();
1374	KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
1375	while (free_kse_count > MAX_CACHED_KSES) {
1376		kse = TAILQ_FIRST(&free_kseq);
1377		TAILQ_REMOVE(&free_kseq, kse, k_qe);
1378		TAILQ_INSERT_HEAD(&worklist, kse, k_qe);
1379		free_kse_count--;
1380	}
1381	KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
1382	_kse_critical_leave(crit);
1383
1384	while ((kse = TAILQ_FIRST(&worklist))) {
1385		TAILQ_REMOVE(&worklist, kse, k_qe);
1386		kse_destroy(kse);
1387	}
1388}
1389
1390static void
1391kseg_gc(struct pthread *curthread)
1392{
1393	kse_critical_t crit;
1394	TAILQ_HEAD(, kse_group) worklist;
1395	struct kse_group *kseg;
1396
1397	if (free_kseg_count <= MAX_CACHED_KSEGS)
1398		return;
1399	crit = _kse_critical_enter();
1400	KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
1401	while (free_kseg_count > MAX_CACHED_KSEGS) {
1402		kseg = TAILQ_FIRST(&free_kse_groupq);
1403		TAILQ_REMOVE(&free_kse_groupq, kseg, kg_qe);
1404		free_kseg_count--;
1405		TAILQ_INSERT_HEAD(&worklist, kseg, kg_qe);
1406	}
1407	KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
1408	_kse_critical_leave(crit);
1409
1410	while ((kseg = TAILQ_FIRST(&worklist))) {
1411		TAILQ_REMOVE(&worklist, kseg, kg_qe);
1412		kseg_destroy(kseg);
1413	}
1414}
1415
1416/*
1417 * Only new threads that are running or suspended may be scheduled.
1418 */
1419int
1420_thr_schedule_add(struct pthread *curthread, struct pthread *newthread)
1421{
1422	kse_critical_t crit;
1423	int ret;
1424
1425	/* Add the new thread. */
1426	thr_link(newthread);
1427
1428	/*
1429	 * If this is the first time creating a thread, make sure
1430	 * the mailbox is set for the current thread.
1431	 */
1432	if ((newthread->attr.flags & PTHREAD_SCOPE_SYSTEM) != 0) {
1433		/* We use the thread's stack as the KSE's stack. */
1434		newthread->kse->k_kcb->kcb_kmbx.km_stack.ss_sp =
1435		    newthread->attr.stackaddr_attr;
1436		newthread->kse->k_kcb->kcb_kmbx.km_stack.ss_size =
1437		    newthread->attr.stacksize_attr;
1438
1439		/*
1440		 * No need to lock the scheduling queue since the
1441		 * KSE/KSEG pair have not yet been started.
1442		 */
1443		KSEG_THRQ_ADD(newthread->kseg, newthread);
1444		/* this thread never gives up kse */
1445		newthread->active = 1;
1446		newthread->kse->k_curthread = newthread;
1447		newthread->kse->k_kcb->kcb_kmbx.km_flags = KMF_BOUND;
1448		newthread->kse->k_kcb->kcb_kmbx.km_func =
1449		    (kse_func_t *)kse_sched_single;
1450		newthread->kse->k_kcb->kcb_kmbx.km_quantum = 0;
1451		KSE_SET_MBOX(newthread->kse, newthread);
1452		/*
1453		 * This thread needs a new KSE and KSEG.
1454		 */
1455		newthread->kse->k_flags &= ~KF_INITIALIZED;
1456		newthread->kse->k_flags |= KF_STARTED;
1457		/* Fire up! */
1458		ret = kse_create(&newthread->kse->k_kcb->kcb_kmbx, 1);
1459		if (ret != 0)
1460			ret = errno;
1461	}
1462	else {
1463		/*
1464		 * Lock the KSE and add the new thread to its list of
1465		 * assigned threads.  If the new thread is runnable, also
1466		 * add it to the KSE's run queue.
1467		 */
1468		crit = _kse_critical_enter();
1469		KSE_SCHED_LOCK(curthread->kse, newthread->kseg);
1470		KSEG_THRQ_ADD(newthread->kseg, newthread);
1471		if (newthread->state == PS_RUNNING)
1472			THR_RUNQ_INSERT_TAIL(newthread);
1473		if ((newthread->kse->k_flags & KF_STARTED) == 0) {
1474			/*
1475			 * This KSE hasn't been started yet.  Start it
1476			 * outside of holding the lock.
1477			 */
1478			newthread->kse->k_flags |= KF_STARTED;
1479			newthread->kse->k_kcb->kcb_kmbx.km_func =
1480			    (kse_func_t *)kse_sched_multi;
1481			newthread->kse->k_kcb->kcb_kmbx.km_flags = 0;
1482			kse_create(&newthread->kse->k_kcb->kcb_kmbx, 0);
1483		 } else if ((newthread->state == PS_RUNNING) &&
1484		     KSE_IS_IDLE(newthread->kse)) {
1485			/*
1486			 * The thread is being scheduled on another KSEG.
1487			 */
1488			kse_wakeup_one(newthread);
1489		}
1490		KSE_SCHED_UNLOCK(curthread->kse, newthread->kseg);
1491		_kse_critical_leave(crit);
1492		ret = 0;
1493	}
1494	if (ret != 0)
1495		thr_unlink(newthread);
1496
1497	return (ret);
1498}
1499
1500void
1501kse_waitq_insert(struct pthread *thread)
1502{
1503	struct pthread *td;
1504
1505	if (thread->wakeup_time.tv_sec == -1)
1506		TAILQ_INSERT_TAIL(&thread->kse->k_schedq->sq_waitq, thread,
1507		    pqe);
1508	else {
1509		td = TAILQ_FIRST(&thread->kse->k_schedq->sq_waitq);
1510		while ((td != NULL) && (td->wakeup_time.tv_sec != -1) &&
1511		    ((td->wakeup_time.tv_sec < thread->wakeup_time.tv_sec) ||
1512		    ((td->wakeup_time.tv_sec == thread->wakeup_time.tv_sec) &&
1513		    (td->wakeup_time.tv_nsec <= thread->wakeup_time.tv_nsec))))
1514			td = TAILQ_NEXT(td, pqe);
1515		if (td == NULL)
1516			TAILQ_INSERT_TAIL(&thread->kse->k_schedq->sq_waitq,
1517			    thread, pqe);
1518		else
1519			TAILQ_INSERT_BEFORE(td, thread, pqe);
1520	}
1521	thread->flags |= THR_FLAGS_IN_WAITQ;
1522}
1523
1524/*
1525 * This must be called with the scheduling lock held.
1526 */
1527static void
1528kse_check_completed(struct kse *kse)
1529{
1530	struct pthread *thread;
1531	struct kse_thr_mailbox *completed;
1532	int sig;
1533
1534	if ((completed = kse->k_kcb->kcb_kmbx.km_completed) != NULL) {
1535		kse->k_kcb->kcb_kmbx.km_completed = NULL;
1536		while (completed != NULL) {
1537			thread = completed->tm_udata;
1538			DBG_MSG("Found completed thread %p, name %s\n",
1539			    thread,
1540			    (thread->name == NULL) ? "none" : thread->name);
1541			thread->blocked = 0;
1542			if (thread != kse->k_curthread) {
1543				thr_accounting(thread);
1544				if ((thread->flags & THR_FLAGS_SUSPENDED) != 0)
1545					THR_SET_STATE(thread, PS_SUSPENDED);
1546				else
1547					KSE_RUNQ_INSERT_TAIL(kse, thread);
1548				if ((thread->kse != kse) &&
1549				    (thread->kse->k_curthread == thread)) {
1550					/*
1551					 * Remove this thread from its
1552					 * previous KSE so that it (the KSE)
1553					 * doesn't think it is still active.
1554					 */
1555					thread->kse->k_curthread = NULL;
1556					thread->active = 0;
1557				}
1558			}
1559			if ((sig = thread->tcb->tcb_tmbx.tm_syncsig.si_signo)
1560			    != 0) {
1561				if (SIGISMEMBER(thread->sigmask, sig))
1562					SIGADDSET(thread->sigpend, sig);
1563				else
1564					(void)_thr_sig_add(thread, sig,
1565					    &thread->tcb->tcb_tmbx.tm_syncsig);
1566				thread->tcb->tcb_tmbx.tm_syncsig.si_signo = 0;
1567			}
1568			completed = completed->tm_next;
1569		}
1570	}
1571}
1572
1573/*
1574 * This must be called with the scheduling lock held.
1575 */
1576static void
1577kse_check_waitq(struct kse *kse)
1578{
1579	struct pthread	*pthread;
1580	struct timespec ts;
1581
1582	KSE_GET_TOD(kse, &ts);
1583
1584	/*
1585	 * Wake up threads that have timedout.  This has to be
1586	 * done before adding the current thread to the run queue
1587	 * so that a CPU intensive thread doesn't get preference
1588	 * over waiting threads.
1589	 */
1590	while (((pthread = KSE_WAITQ_FIRST(kse)) != NULL) &&
1591	    thr_timedout(pthread, &ts)) {
1592		/* Remove the thread from the wait queue: */
1593		KSE_WAITQ_REMOVE(kse, pthread);
1594		DBG_MSG("Found timedout thread %p in waitq\n", pthread);
1595
1596		/* Indicate the thread timedout: */
1597		pthread->timeout = 1;
1598
1599		/* Add the thread to the priority queue: */
1600		if ((pthread->flags & THR_FLAGS_SUSPENDED) != 0)
1601			THR_SET_STATE(pthread, PS_SUSPENDED);
1602		else {
1603			THR_SET_STATE(pthread, PS_RUNNING);
1604			KSE_RUNQ_INSERT_TAIL(kse, pthread);
1605		}
1606	}
1607}
1608
1609static int
1610thr_timedout(struct pthread *thread, struct timespec *curtime)
1611{
1612	if (thread->wakeup_time.tv_sec < 0)
1613		return (0);
1614	else if (thread->wakeup_time.tv_sec > curtime->tv_sec)
1615		return (0);
1616	else if ((thread->wakeup_time.tv_sec == curtime->tv_sec) &&
1617	    (thread->wakeup_time.tv_nsec > curtime->tv_nsec))
1618		return (0);
1619	else
1620		return (1);
1621}
1622
1623/*
1624 * This must be called with the scheduling lock held.
1625 *
1626 * Each thread has a time slice, a wakeup time (used when it wants
1627 * to wait for a specified amount of time), a run state, and an
1628 * active flag.
1629 *
1630 * When a thread gets run by the scheduler, the active flag is
1631 * set to non-zero (1).  When a thread performs an explicit yield
1632 * or schedules a state change, it enters the scheduler and the
1633 * active flag is cleared.  When the active flag is still seen
1634 * set in the scheduler, that means that the thread is blocked in
1635 * the kernel (because it is cleared before entering the scheduler
1636 * in all other instances).
1637 *
1638 * The wakeup time is only set for those states that can timeout.
1639 * It is set to (-1, -1) for all other instances.
1640 *
1641 * The thread's run state, aside from being useful when debugging,
1642 * is used to place the thread in an appropriate queue.  There
1643 * are 2 basic queues:
1644 *
1645 *   o run queue - queue ordered by priority for all threads
1646 *                 that are runnable
1647 *   o waiting queue - queue sorted by wakeup time for all threads
1648 *                     that are not otherwise runnable (not blocked
1649 *                     in kernel, not waiting for locks)
1650 *
1651 * The thread's time slice is used for round-robin scheduling
1652 * (the default scheduling policy).  While a SCHED_RR thread
1653 * is runnable it's time slice accumulates.  When it reaches
1654 * the time slice interval, it gets reset and added to the end
1655 * of the queue of threads at its priority.  When a thread no
1656 * longer becomes runnable (blocks in kernel, waits, etc), its
1657 * time slice is reset.
1658 *
1659 * The job of kse_switchout_thread() is to handle all of the above.
1660 */
1661static void
1662kse_switchout_thread(struct kse *kse, struct pthread *thread)
1663{
1664	int level;
1665	int i;
1666	int restart;
1667	siginfo_t siginfo;
1668
1669	/*
1670	 * Place the currently running thread into the
1671	 * appropriate queue(s).
1672	 */
1673	DBG_MSG("Switching out thread %p, state %d\n", thread, thread->state);
1674
1675	THR_DEACTIVATE_LAST_LOCK(thread);
1676	if (thread->blocked != 0) {
1677		thread->active = 0;
1678		thread->need_switchout = 0;
1679		/* This thread must have blocked in the kernel. */
1680		/*
1681		 *  Check for pending signals for this thread to
1682		 *  see if we need to interrupt it in the kernel.
1683		 */
1684		if (thread->check_pending != 0) {
1685			for (i = 1; i <= _SIG_MAXSIG; ++i) {
1686				if (SIGISMEMBER(thread->sigpend, i) &&
1687				    !SIGISMEMBER(thread->sigmask, i)) {
1688					restart = _thread_sigact[1 - 1].sa_flags & SA_RESTART;
1689					kse_thr_interrupt(&thread->tcb->tcb_tmbx,
1690					    restart ? KSE_INTR_RESTART : KSE_INTR_INTERRUPT, 0);
1691					break;
1692				}
1693			}
1694		}
1695	}
1696	else {
1697		switch (thread->state) {
1698		case PS_DEAD:
1699			/*
1700			 * The scheduler is operating on a different
1701			 * stack.  It is safe to do garbage collecting
1702			 * here.
1703			 */
1704			thread->active = 0;
1705			thread->need_switchout = 0;
1706			thread->lock_switch = 0;
1707			thr_cleanup(kse, thread);
1708			return;
1709			break;
1710
1711		case PS_RUNNING:
1712			if ((thread->flags & THR_FLAGS_SUSPENDED) != 0)
1713				THR_SET_STATE(thread, PS_SUSPENDED);
1714			break;
1715
1716		case PS_COND_WAIT:
1717		case PS_SLEEP_WAIT:
1718			/* Insert into the waiting queue: */
1719			KSE_WAITQ_INSERT(kse, thread);
1720			break;
1721
1722		case PS_LOCKWAIT:
1723			/*
1724			 * This state doesn't timeout.
1725			 */
1726			thread->wakeup_time.tv_sec = -1;
1727			thread->wakeup_time.tv_nsec = -1;
1728			level = thread->locklevel - 1;
1729			if (!_LCK_GRANTED(&thread->lockusers[level]))
1730				KSE_WAITQ_INSERT(kse, thread);
1731			else
1732				THR_SET_STATE(thread, PS_RUNNING);
1733			break;
1734
1735		case PS_SIGWAIT:
1736			KSE_WAITQ_INSERT(kse, thread);
1737			break;
1738		case PS_JOIN:
1739		case PS_MUTEX_WAIT:
1740		case PS_SIGSUSPEND:
1741		case PS_SUSPENDED:
1742		case PS_DEADLOCK:
1743		default:
1744			/*
1745			 * These states don't timeout.
1746			 */
1747			thread->wakeup_time.tv_sec = -1;
1748			thread->wakeup_time.tv_nsec = -1;
1749
1750			/* Insert into the waiting queue: */
1751			KSE_WAITQ_INSERT(kse, thread);
1752			break;
1753		}
1754		thr_accounting(thread);
1755		if (thread->state == PS_RUNNING) {
1756			if (thread->slice_usec == -1) {
1757				/*
1758				 * The thread exceeded its time quantum or
1759				 * it yielded the CPU; place it at the tail
1760				 * of the queue for its priority.
1761				 */
1762				KSE_RUNQ_INSERT_TAIL(kse, thread);
1763			} else {
1764				/*
1765				 * The thread hasn't exceeded its interval
1766				 * Place it at the head of the queue for its
1767				 * priority.
1768				 */
1769				KSE_RUNQ_INSERT_HEAD(kse, thread);
1770			}
1771		}
1772	}
1773	thread->active = 0;
1774	thread->need_switchout = 0;
1775	if (thread->check_pending != 0) {
1776		/* Install pending signals into the frame. */
1777		thread->check_pending = 0;
1778		KSE_LOCK_ACQUIRE(kse, &_thread_signal_lock);
1779		for (i = 1; i <= _SIG_MAXSIG; i++) {
1780			if (SIGISMEMBER(thread->sigmask, i))
1781				continue;
1782			if (SIGISMEMBER(thread->sigpend, i))
1783				(void)_thr_sig_add(thread, i,
1784				    &thread->siginfo[i-1]);
1785			else if (SIGISMEMBER(_thr_proc_sigpending, i) &&
1786				_thr_getprocsig_unlocked(i, &siginfo)) {
1787				(void)_thr_sig_add(thread, i, &siginfo);
1788			}
1789		}
1790		KSE_LOCK_RELEASE(kse, &_thread_signal_lock);
1791	}
1792}
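
/*
 * Worked example (editorial note, not part of the original sources):
 * a thread blocked in a timed condition wait reaches this point in
 * state PS_COND_WAIT.  It is placed on the KSE's wait queue by
 * KSE_WAITQ_INSERT(), and the wakeup_time that the wait primitive
 * filled in before switching out bounds how long kse_wait() below
 * will sleep before the scheduler re-examines the wait queue.
 */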
1793
1794/*
1795 * This function waits for the smallest timeout value of any waiting
1796 * thread, or until it receives a message from another KSE.
1797 *
1798 * This must be called with the scheduling lock held.
1799 */
1800static void
1801kse_wait(struct kse *kse, struct pthread *td_wait, int sigseqno)
1802{
1803	struct timespec ts, ts_sleep;
1804	int saved_flags;
1805
1806	KSE_GET_TOD(kse, &ts);
1807
1808	if ((td_wait == NULL) || (td_wait->wakeup_time.tv_sec < 0)) {
1809		/* Limit sleep to no more than 1 minute. */
1810		ts_sleep.tv_sec = 60;
1811		ts_sleep.tv_nsec = 0;
1812	} else {
1813		TIMESPEC_SUB(&ts_sleep, &td_wait->wakeup_time, &ts);
1814		if (ts_sleep.tv_sec > 60) {
1815			ts_sleep.tv_sec = 60;
1816			ts_sleep.tv_nsec = 0;
1817		}
1818	}
1819	/* Don't sleep for negative times. */
1820	if ((ts_sleep.tv_sec >= 0) && (ts_sleep.tv_nsec >= 0)) {
1821		KSE_SET_IDLE(kse);
1822		kse->k_kseg->kg_idle_kses++;
1823		KSE_SCHED_UNLOCK(kse, kse->k_kseg);
1824		if ((kse->k_kseg->kg_flags & KGF_SINGLE_THREAD) &&
1825		    (kse->k_sigseqno != sigseqno))
1826			; /* don't sleep */
1827		else {
1828			saved_flags = kse->k_kcb->kcb_kmbx.km_flags;
1829			kse->k_kcb->kcb_kmbx.km_flags |= KMF_NOUPCALL;
1830			kse_release(&ts_sleep);
1831			kse->k_kcb->kcb_kmbx.km_flags = saved_flags;
1832		}
1833		KSE_SCHED_LOCK(kse, kse->k_kseg);
1834		if (KSE_IS_IDLE(kse)) {
1835			KSE_CLEAR_IDLE(kse);
1836			kse->k_kseg->kg_idle_kses--;
1837		}
1838	}
1839}
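
/*
 * Example (editorial illustration): if the earliest waiter's
 * wakeup_time is 90 seconds away, TIMESPEC_SUB() yields a 90 second
 * ts_sleep that is then clamped to the 60 second maximum; if the
 * earliest waiter is due in 2.5 seconds, kse_release() sleeps for at
 * most 2.5 seconds.  A KSE with no timed waiters (td_wait == NULL)
 * sleeps for the full minute and relies on a kse_wakeup() from
 * another KSE (or a kernel upcall) to end the wait early.
 */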
1840
1841/*
1842 * This function is deliberately not named kse_exit(), so as not to
1843 * confuse it with the system call of the same name.
1844 */
1845static void
1846kse_fini(struct kse *kse)
1847{
1848	/* struct kse_group *free_kseg = NULL; */
1849	struct timespec ts;
1850
1851	/*
1852	 * Check to see if this KSE belongs to the initial KSE group.
1853	 */
1854	if (kse->k_kseg != _kse_initial->k_kseg) {
1855		PANIC("shouldn't get here");
1856		/* This is for supporting thread groups. */
1857#ifdef NOT_YET
1858		/* Remove this KSE from the KSEG's list of KSEs. */
1859		KSE_SCHED_LOCK(kse, kse->k_kseg);
1860		TAILQ_REMOVE(&kse->k_kseg->kg_kseq, kse, k_kgqe);
1861		kse->k_kseg->kg_ksecount--;
1862		if (TAILQ_EMPTY(&kse->k_kseg->kg_kseq))
1863			free_kseg = kse->k_kseg;
1864		KSE_SCHED_UNLOCK(kse, kse->k_kseg);
1865
1866		/*
1867		 * Add this KSE to the list of free KSEs along with
1868		 * the KSEG if it is now orphaned.
1869		 */
1870		KSE_LOCK_ACQUIRE(kse, &kse_lock);
1871		if (free_kseg != NULL)
1872			kseg_free_unlocked(free_kseg);
1873		kse_free_unlocked(kse);
1874		KSE_LOCK_RELEASE(kse, &kse_lock);
1875		kse_exit();
1876		/* Never returns. */
1877		PANIC("kse_exit()");
1878#endif
1879	} else {
1880		/*
1881		 * We allow the program to kill KSEs in the initial group
1882		 * (by lowering the concurrency).
1883		 */
1884		if ((kse != _kse_initial) &&
1885		    ((kse->k_flags & KF_TERMINATED) != 0)) {
1886			KSE_SCHED_LOCK(kse, kse->k_kseg);
1887			TAILQ_REMOVE(&kse->k_kseg->kg_kseq, kse, k_kgqe);
1888			kse->k_kseg->kg_ksecount--;
1889			KSE_SCHED_UNLOCK(kse, kse->k_kseg);
1890			KSE_LOCK_ACQUIRE(kse, &kse_lock);
1891			kse_free_unlocked(kse);
1892			KSE_LOCK_RELEASE(kse, &kse_lock);
1893			kse_exit();
1894			/* Never returns. */
1895			PANIC("kse_exit() failed for initial kseg");
1896		}
1897		KSE_SCHED_LOCK(kse, kse->k_kseg);
1898		KSE_SET_IDLE(kse);
1899		kse->k_kseg->kg_idle_kses++;
1900		KSE_SCHED_UNLOCK(kse, kse->k_kseg);
1901		ts.tv_sec = 120;
1902		ts.tv_nsec = 0;
1903		kse->k_kcb->kcb_kmbx.km_flags = 0;
1904		kse_release(&ts);
1905		/* Never reached. */
1906	}
1907}
1908
1909void
1910_thr_set_timeout(const struct timespec *timeout)
1911{
1912	struct pthread	*curthread = _get_curthread();
1913	struct timespec ts;
1914
1915	/* Reset the timeout flag for the running thread: */
1916	curthread->timeout = 0;
1917
1918	/* Check if the thread is to wait forever: */
1919	if (timeout == NULL) {
1920		/*
1921		 * Set the wakeup time to something that can be recognised as
1922		 * different to an actual time of day:
1923		 */
1924		curthread->wakeup_time.tv_sec = -1;
1925		curthread->wakeup_time.tv_nsec = -1;
1926	}
1927	/* Check if no waiting is required: */
1928	else if ((timeout->tv_sec == 0) && (timeout->tv_nsec == 0)) {
1929		/* Set the wake up time to 'immediately': */
1930		curthread->wakeup_time.tv_sec = 0;
1931		curthread->wakeup_time.tv_nsec = 0;
1932	} else {
1933		/* Calculate the time for the current thread to wakeup: */
1934		KSE_GET_TOD(curthread->kse, &ts);
1935		TIMESPEC_ADD(&curthread->wakeup_time, &ts, timeout);
1936	}
1937}
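
/*
 * Minimal usage sketch (editorial, not verbatim from the other
 * thr_*.c files): a blocking primitive would typically set the
 * timeout, mark the thread's new state, and then switch out:
 *
 *	struct timespec rel = { 2, 500000000 };	(2.5 seconds from now)
 *
 *	_thr_set_timeout(&rel);
 *	THR_SET_STATE(curthread, PS_SLEEP_WAIT);
 *	_thr_sched_switch(curthread);
 *	if (curthread->timeout != 0)
 *		... the wait timed out ...
 */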
1938
1939void
1940_thr_panic_exit(char *file, int line, char *msg)
1941{
1942	char buf[256];
1943
1944	snprintf(buf, sizeof(buf), "(%s:%d) %s\n", file, line, msg);
1945	__sys_write(2, buf, strlen(buf));
1946	abort();
1947}
1948
1949void
1950_thr_setrunnable(struct pthread *curthread, struct pthread *thread)
1951{
1952	kse_critical_t crit;
1953	struct kse_mailbox *kmbx;
1954
1955	crit = _kse_critical_enter();
1956	KSE_SCHED_LOCK(curthread->kse, thread->kseg);
1957	kmbx = _thr_setrunnable_unlocked(thread);
1958	KSE_SCHED_UNLOCK(curthread->kse, thread->kseg);
1959	_kse_critical_leave(crit);
1960	if (kmbx != NULL)
1961		kse_wakeup(kmbx);
1962}
1963
1964struct kse_mailbox *
1965_thr_setrunnable_unlocked(struct pthread *thread)
1966{
1967	struct kse_mailbox *kmbx = NULL;
1968
1969	if ((thread->kseg->kg_flags & KGF_SINGLE_THREAD) != 0) {
1970		/* No silly queues for these threads. */
1971		if ((thread->flags & THR_FLAGS_SUSPENDED) != 0)
1972			THR_SET_STATE(thread, PS_SUSPENDED);
1973		else {
1974			THR_SET_STATE(thread, PS_RUNNING);
1975			kmbx = kse_wakeup_one(thread);
1976		}
1977
1978	} else if (thread->state != PS_RUNNING) {
1979		if ((thread->flags & THR_FLAGS_IN_WAITQ) != 0)
1980			KSE_WAITQ_REMOVE(thread->kse, thread);
1981		if ((thread->flags & THR_FLAGS_SUSPENDED) != 0)
1982			THR_SET_STATE(thread, PS_SUSPENDED);
1983		else {
1984			THR_SET_STATE(thread, PS_RUNNING);
1985			if ((thread->blocked == 0) && (thread->active == 0) &&
1986			    (thread->flags & THR_FLAGS_IN_RUNQ) == 0)
1987				THR_RUNQ_INSERT_TAIL(thread);
1988			/*
1989			 * XXX - Threads are not yet assigned to specific
1990			 *       KSEs; they are assigned to the KSEG.  So
1991			 *       the fact that a thread's KSE is waiting
1992			 *       doesn't necessarily mean that it will be
1993			 *       the KSE that runs the thread after the
1994			 *       lock is granted.  But we don't know if the
1995			 *       other KSEs within the same KSEG are also
1996			 *       in a waiting state or not so we err on the
1997			 *       side of caution and wake up the thread's
1998			 *       last known KSE.  We ensure that the
1999			 *       thread's KSE doesn't change while its
2000			 *       scheduling lock is held so it is safe to
2001			 *       reference it (the KSE).  If the KSE wakes
2002			 *       up and doesn't find any more work it will
2003			 *       again go back to waiting so no harm is
2004			 *       done.
2005			 */
2006			kmbx = kse_wakeup_one(thread);
2007		}
2008	}
2009	return (kmbx);
2010}
2011
2012static struct kse_mailbox *
2013kse_wakeup_one(struct pthread *thread)
2014{
2015	struct kse *ke;
2016
2017	if (KSE_IS_IDLE(thread->kse)) {
2018		KSE_CLEAR_IDLE(thread->kse);
2019		thread->kseg->kg_idle_kses--;
2020		return (&thread->kse->k_kcb->kcb_kmbx);
2021	} else {
2022		TAILQ_FOREACH(ke, &thread->kseg->kg_kseq, k_kgqe) {
2023			if (KSE_IS_IDLE(ke)) {
2024				KSE_CLEAR_IDLE(ke);
2025				ke->k_kseg->kg_idle_kses--;
2026				return (&ke->k_kcb->kcb_kmbx);
2027			}
2028		}
2029	}
2030	return (NULL);
2031}
2032
2033static void
2034kse_wakeup_multi(struct kse *curkse)
2035{
2036	struct kse *ke;
2037	int tmp;
2038
2039	if ((tmp = KSE_RUNQ_THREADS(curkse)) && curkse->k_kseg->kg_idle_kses) {
2040		TAILQ_FOREACH(ke, &curkse->k_kseg->kg_kseq, k_kgqe) {
2041			if (KSE_IS_IDLE(ke)) {
2042				KSE_CLEAR_IDLE(ke);
2043				ke->k_kseg->kg_idle_kses--;
2044				KSE_WAKEUP(ke);
2045				if (--tmp == 0)
2046					break;
2047			}
2048		}
2049	}
2050}
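
/*
 * Example (editorial): with three runnable threads and two idle KSEs
 * in the group, the loop above wakes both idle KSEs and then falls
 * off the end of the KSE list; with one runnable thread and two idle
 * KSEs it wakes a single KSE and breaks, so the group never wakes
 * more KSEs than it has runnable work for.
 */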
2051
2052/*
2053 * Allocate a new KSEG.
2054 *
2055 * We allow the current thread to be NULL in the case that this
2056 * is the first time a KSEG is being created (library initialization).
2057 * In this case, we don't need to (and can't) take any locks.
2058 */
2059struct kse_group *
2060_kseg_alloc(struct pthread *curthread)
2061{
2062	struct kse_group *kseg = NULL;
2063	kse_critical_t crit;
2064
2065	if ((curthread != NULL) && (free_kseg_count > 0)) {
2066		/* Use the kse lock for the kseg queue. */
2067		crit = _kse_critical_enter();
2068		KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
2069		if ((kseg = TAILQ_FIRST(&free_kse_groupq)) != NULL) {
2070			TAILQ_REMOVE(&free_kse_groupq, kseg, kg_qe);
2071			free_kseg_count--;
2072			active_kseg_count++;
2073			TAILQ_INSERT_TAIL(&active_kse_groupq, kseg, kg_qe);
2074		}
2075		KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
2076		_kse_critical_leave(crit);
2077		if (kseg)
2078			kseg_reinit(kseg);
2079	}
2080
2081	/*
2082	 * If a KSE group wasn't found in the free list (including during
2083	 * library initialization, when the free list is still empty),
2084	 * attempt to allocate a new one.
2085	 */
2086	if ((kseg == NULL) &&
2087	    ((kseg = (struct kse_group *)malloc(sizeof(*kseg))) != NULL)) {
2088		if (_pq_alloc(&kseg->kg_schedq.sq_runq,
2089		    THR_MIN_PRIORITY, THR_LAST_PRIORITY) != 0) {
2090			free(kseg);
2091			kseg = NULL;
2092		} else {
2093			kseg_init(kseg);
2094			/* Add the KSEG to the list of active KSEGs. */
2095			if (curthread != NULL) {
2096				crit = _kse_critical_enter();
2097				KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
2098				active_kseg_count++;
2099				TAILQ_INSERT_TAIL(&active_kse_groupq,
2100				    kseg, kg_qe);
2101				KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
2102				_kse_critical_leave(crit);
2103			} else {
2104				active_kseg_count++;
2105				TAILQ_INSERT_TAIL(&active_kse_groupq,
2106				    kseg, kg_qe);
2107			}
2108		}
2109	}
2110	return (kseg);
2111}
2112
2113static void
2114kseg_init(struct kse_group *kseg)
2115{
2116	kseg_reinit(kseg);
2117	_lock_init(&kseg->kg_lock, LCK_ADAPTIVE, _kse_lock_wait,
2118	    _kse_lock_wakeup);
2119}
2120
2121static void
2122kseg_reinit(struct kse_group *kseg)
2123{
2124	TAILQ_INIT(&kseg->kg_kseq);
2125	TAILQ_INIT(&kseg->kg_threadq);
2126	TAILQ_INIT(&kseg->kg_schedq.sq_waitq);
2127	kseg->kg_threadcount = 0;
2128	kseg->kg_ksecount = 0;
2129	kseg->kg_idle_kses = 0;
2130	kseg->kg_flags = 0;
2131}
2132
2133/*
2134 * This must be called with the kse lock held and when there are
2135 * no more threads that reference it.
2136 */
2137static void
2138kseg_free_unlocked(struct kse_group *kseg)
2139{
2140	TAILQ_REMOVE(&active_kse_groupq, kseg, kg_qe);
2141	TAILQ_INSERT_HEAD(&free_kse_groupq, kseg, kg_qe);
2142	free_kseg_count++;
2143	active_kseg_count--;
2144}
2145
2146void
2147_kseg_free(struct kse_group *kseg)
2148{
2149	struct kse *curkse;
2150	kse_critical_t crit;
2151
2152	crit = _kse_critical_enter();
2153	curkse = _get_curkse();
2154	KSE_LOCK_ACQUIRE(curkse, &kse_lock);
2155	kseg_free_unlocked(kseg);
2156	KSE_LOCK_RELEASE(curkse, &kse_lock);
2157	_kse_critical_leave(crit);
2158}
2159
2160static void
2161kseg_destroy(struct kse_group *kseg)
2162{
2163	_lock_destroy(&kseg->kg_lock);
2164	_pq_free(&kseg->kg_schedq.sq_runq);
2165	free(kseg);
2166}
2167
2168/*
2169 * Allocate a new KSE.
2170 *
2171 * We allow the current thread to be NULL in the case that this
2172 * is the first time a KSE is being created (library initialization).
2173 * In this case, we don't need to (and can't) take any locks.
2174 */
2175struct kse *
2176_kse_alloc(struct pthread *curthread, int sys_scope)
2177{
2178	struct kse *kse = NULL;
2179	char *stack;
2180	kse_critical_t crit;
2181	int i;
2182
2183	if ((curthread != NULL) && (free_kse_count > 0)) {
2184		crit = _kse_critical_enter();
2185		KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
2186		/* Search for a finished KSE. */
2187		kse = TAILQ_FIRST(&free_kseq);
2188		while ((kse != NULL) &&
2189		    ((kse->k_kcb->kcb_kmbx.km_flags & KMF_DONE) == 0)) {
2190			kse = TAILQ_NEXT(kse, k_qe);
2191		}
2192		if (kse != NULL) {
2193			DBG_MSG("found an unused kse.\n");
2194			TAILQ_REMOVE(&free_kseq, kse, k_qe);
2195			free_kse_count--;
2196			TAILQ_INSERT_TAIL(&active_kseq, kse, k_qe);
2197			active_kse_count++;
2198		}
2199		KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
2200		_kse_critical_leave(crit);
2201		if (kse != NULL)
2202			kse_reinit(kse, sys_scope);
2203	}
2204	if ((kse == NULL) &&
2205	    ((kse = (struct kse *)malloc(sizeof(*kse))) != NULL)) {
2206		if (sys_scope != 0)
2207			stack = NULL;
2208		else if ((stack = malloc(KSE_STACKSIZE)) == NULL) {
2209			free(kse);
2210			return (NULL);
2211		}
2212		bzero(kse, sizeof(*kse));
2213
2214		/* Initialize KCB without the lock. */
2215		if ((kse->k_kcb = _kcb_ctor(kse)) == NULL) {
2216			if (stack != NULL)
2217				free(stack);
2218			free(kse);
2219			return (NULL);
2220		}
2221
2222		/* Initialize the lockusers. */
2223		for (i = 0; i < MAX_KSE_LOCKLEVEL; i++) {
2224			_lockuser_init(&kse->k_lockusers[i], (void *)kse);
2225			_LCK_SET_PRIVATE2(&kse->k_lockusers[i], NULL);
2226		}
2227		/* _lock_init(kse->k_lock, ...) */
2228
2229		if (curthread != NULL) {
2230			crit = _kse_critical_enter();
2231			KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
2232		}
2233		kse->k_flags = 0;
2234		TAILQ_INSERT_TAIL(&active_kseq, kse, k_qe);
2235		active_kse_count++;
2236		if (curthread != NULL) {
2237			KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
2238			_kse_critical_leave(crit);
2239		}
2240		/*
2241		 * Create the KSE context.
2242		 * Scope system threads (one thread per KSE) do not need a
2243		 * KSE stack, since the KSE upcall is not used for them.
2244		 */
2245		if (!sys_scope) {
2246			kse->k_kcb->kcb_kmbx.km_func = (kse_func_t *)kse_sched_multi;
2247			kse->k_stack.ss_sp = stack;
2248			kse->k_stack.ss_size = KSE_STACKSIZE;
2249		} else {
2250			kse->k_kcb->kcb_kmbx.km_func = (kse_func_t *)kse_sched_single;
2251			kse->k_stack.ss_sp = NULL;
2252			kse->k_stack.ss_size = 0;
2253		}
2254		kse->k_kcb->kcb_kmbx.km_udata = (void *)kse;
2255		kse->k_kcb->kcb_kmbx.km_quantum = 20000;
2256		/*
2257		 * We need to keep a copy of the stack in case it
2258		 * doesn't get used; a KSE running a scope system
2259		 * thread will use that thread's stack.
2260		 */
2261		kse->k_kcb->kcb_kmbx.km_stack = kse->k_stack;
2262	}
2263	return (kse);
2264}
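
/*
 * Usage sketch (editorial; the actual creation path lives outside
 * this file): a system scope thread gets its own KSE and KSE group,
 * roughly
 *
 *	struct kse_group *kseg;
 *	struct kse *kse;
 *
 *	kse = _kse_alloc(curthread, 1);
 *	kseg = _kseg_alloc(curthread);
 *	if ((kse == NULL) || (kseg == NULL))
 *		... back out and fail with EAGAIN ...
 *	kse->k_kseg = kseg;
 *	kse->k_schedq = &kseg->kg_schedq;
 *	TAILQ_INSERT_TAIL(&kseg->kg_kseq, kse, k_kgqe);
 *	kseg->kg_ksecount = 1;
 */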
2265
2266static void
2267kse_reinit(struct kse *kse, int sys_scope)
2268{
2269	if (!sys_scope) {
2270		kse->k_kcb->kcb_kmbx.km_func = (kse_func_t *)kse_sched_multi;
2271		if (kse->k_stack.ss_sp == NULL) {
2272			/* XXX check allocation failure */
2273			kse->k_stack.ss_sp = (char *) malloc(KSE_STACKSIZE);
2274			kse->k_stack.ss_size = KSE_STACKSIZE;
2275		}
2276		kse->k_kcb->kcb_kmbx.km_quantum = 20000;
2277	} else {
2278		kse->k_kcb->kcb_kmbx.km_func = (kse_func_t *)kse_sched_single;
2279		if (kse->k_stack.ss_sp)
2280			free(kse->k_stack.ss_sp);
2281		kse->k_stack.ss_sp = NULL;
2282		kse->k_stack.ss_size = 0;
2283		kse->k_kcb->kcb_kmbx.km_quantum = 0;
2284	}
2285	kse->k_kcb->kcb_kmbx.km_stack = kse->k_stack;
2286	kse->k_kcb->kcb_kmbx.km_udata = (void *)kse;
2287	kse->k_kcb->kcb_kmbx.km_curthread = NULL;
2288	kse->k_kcb->kcb_kmbx.km_flags = 0;
2289	kse->k_curthread = NULL;
2290	kse->k_kseg = NULL;
2291	kse->k_schedq = NULL;
2292	kse->k_locklevel = 0;
2293	kse->k_flags = 0;
2294	kse->k_idle = 0;
2295	kse->k_error = 0;
2296	kse->k_cpu = 0;
2297	kse->k_done = 0;
2298	kse->k_switch = 0;
2299	kse->k_sigseqno = 0;
2300}
2301
2302void
2303kse_free_unlocked(struct kse *kse)
2304{
2305	TAILQ_REMOVE(&active_kseq, kse, k_qe);
2306	active_kse_count--;
2307	kse->k_kseg = NULL;
2308	kse->k_kcb->kcb_kmbx.km_quantum = 20000;
2309	kse->k_flags = 0;
2310	TAILQ_INSERT_HEAD(&free_kseq, kse, k_qe);
2311	free_kse_count++;
2312}
2313
2314void
2315_kse_free(struct pthread *curthread, struct kse *kse)
2316{
2317	kse_critical_t crit;
2318
2319	if (curthread == NULL)
2320		kse_free_unlocked(kse);
2321	else {
2322		crit = _kse_critical_enter();
2323		KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
2324		kse_free_unlocked(kse);
2325		KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
2326		_kse_critical_leave(crit);
2327	}
2328}
2329
2330static void
2331kse_destroy(struct kse *kse)
2332{
2333	int i;
2334
2335	if (kse->k_stack.ss_sp != NULL)
2336		free(kse->k_stack.ss_sp);
2337	_kcb_dtor(kse->k_kcb);
2338	for (i = 0; i < MAX_KSE_LOCKLEVEL; ++i)
2339		_lockuser_destroy(&kse->k_lockusers[i]);
2340	_lock_destroy(&kse->k_lock);
2341	free(kse);
2342}
2343
2344struct pthread *
2345_thr_alloc(struct pthread *curthread)
2346{
2347	kse_critical_t crit;
2348	struct pthread *thread = NULL;
2349
2350	if (curthread != NULL) {
2351		if (GC_NEEDED())
2352			_thr_gc(curthread);
2353		if (free_thread_count > 0) {
2354			crit = _kse_critical_enter();
2355			KSE_LOCK_ACQUIRE(curthread->kse, &thread_lock);
2356			if ((thread = TAILQ_FIRST(&free_threadq)) != NULL) {
2357				TAILQ_REMOVE(&free_threadq, thread, tle);
2358				free_thread_count--;
2359			}
2360			KSE_LOCK_RELEASE(curthread->kse, &thread_lock);
2361			_kse_critical_leave(crit);
2362		}
2363	}
2364	if ((thread == NULL) &&
2365	    ((thread = malloc(sizeof(struct pthread))) != NULL)) {
2366		bzero(thread, sizeof(struct pthread));
2367		if ((thread->tcb = _tcb_ctor(thread)) == NULL) {
2368			free(thread);
2369			thread = NULL;
2370		}
2371	}
2372	return (thread);
2373}
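
/*
 * Usage sketch (editorial): thread creation is the expected consumer;
 * allocation and its error path pair up as
 *
 *	struct pthread *new_thread;
 *
 *	if ((new_thread = _thr_alloc(curthread)) == NULL)
 *		return (EAGAIN);
 *	... initialize attributes, stack and locks ...
 *	... on any later failure: _thr_free(curthread, new_thread) ...
 */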
2374
2375void
2376_thr_free(struct pthread *curthread, struct pthread *thread)
2377{
2378	kse_critical_t crit;
2379	int i;
2380
2381	DBG_MSG("Freeing thread %p\n", thread);
2382	if ((curthread == NULL) || (free_thread_count >= MAX_CACHED_THREADS)) {
2383		for (i = 0; i < MAX_THR_LOCKLEVEL; i++) {
2384			_lockuser_destroy(&thread->lockusers[i]);
2385		}
2386		_lock_destroy(&thread->lock);
2387		_tcb_dtor(thread->tcb);
2388		free(thread);
2389	}
2390	else {
2391		/* Reinitialize any important fields here. */
2392		thread->lock_switch = 0;
2393		sigemptyset(&thread->sigpend);
2394		thread->check_pending = 0;
2395
2396		/* Add the thread to the free thread list. */
2397		crit = _kse_critical_enter();
2398		KSE_LOCK_ACQUIRE(curthread->kse, &thread_lock);
2399		TAILQ_INSERT_TAIL(&free_threadq, thread, tle);
2400		free_thread_count++;
2401		KSE_LOCK_RELEASE(curthread->kse, &thread_lock);
2402		_kse_critical_leave(crit);
2403	}
2404}
2405
2406/*
2407 * Add an active thread:
2408 *
2409 *   o Assign the thread a unique id (which GDB uses to track
2410 *     threads).
2411 *   o Add the thread to the list of all threads and increment
2412 *     the number of active threads.
2413 */
2414static void
2415thr_link(struct pthread *thread)
2416{
2417	kse_critical_t crit;
2418	struct kse *curkse;
2419	struct pthread *curthread;
2420
2421	crit = _kse_critical_enter();
2422	curkse = _get_curkse();
2423	curthread = _get_curthread();
2424	thread->sigmask = curthread->sigmask;
2425	KSE_LOCK_ACQUIRE(curkse, &_thread_list_lock);
2426	/*
2427	 * Initialize the unique id (which GDB uses to track
2428	 * threads), add the thread to the list of all threads,
2429	 * and increment the number of active threads.
2430	 */
2431	thread->uniqueid = next_uniqueid++;
2432	THR_LIST_ADD(thread);
2433	active_threads++;
2434	KSE_LOCK_RELEASE(curkse, &_thread_list_lock);
2435	_kse_critical_leave(crit);
2436}
2437
2438/*
2439 * Remove an active thread.
2440 */
2441static void
2442thr_unlink(struct pthread *thread)
2443{
2444	kse_critical_t crit;
2445	struct kse *curkse;
2446
2447	crit = _kse_critical_enter();
2448	curkse = _get_curkse();
2449	KSE_LOCK_ACQUIRE(curkse, &_thread_list_lock);
2450	THR_LIST_REMOVE(thread);
2451	active_threads--;
2452	KSE_LOCK_RELEASE(curkse, &_thread_list_lock);
2453	_kse_critical_leave(crit);
2454}
2455
2456void
2457_thr_hash_add(struct pthread *thread)
2458{
2459	struct thread_hash_head *head;
2460
2461	head = &thr_hashtable[THREAD_HASH(thread)];
2462	LIST_INSERT_HEAD(head, thread, hle);
2463}
2464
2465void
2466_thr_hash_remove(struct pthread *thread)
2467{
2468	LIST_REMOVE(thread, hle);
2469}
2470
2471struct pthread *
2472_thr_hash_find(struct pthread *thread)
2473{
2474	struct pthread *td;
2475	struct thread_hash_head *head;
2476
2477	head = &thr_hashtable[THREAD_HASH(thread)];
2478	LIST_FOREACH(td, head, hle) {
2479		if (td == thread)
2480			return (thread);
2481	}
2482	return (NULL);
2483}
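
/*
 * Usage sketch (editorial): the hash table allows a pthread_t passed
 * in from application code to be validated cheaply; a hypothetical
 * check would read
 *
 *	if (_thr_hash_find(thread) == NULL)
 *		return (ESRCH);
 */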
2484
2485