thr_kern.c revision 118850
1/*
2 * Copyright (C) 2003 Daniel M. Eischen <deischen@freebsd.org>
3 * Copyright (C) 2002 Jonathon Mini <mini@freebsd.org>
4 * Copyright (c) 1995-1998 John Birrell <jb@cimlogic.com.au>
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 * 3. All advertising materials mentioning features or use of this software
16 *    must display the following acknowledgement:
17 *	This product includes software developed by John Birrell.
18 * 4. Neither the name of the author nor the names of any co-contributors
19 *    may be used to endorse or promote products derived from this software
20 *    without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY JOHN BIRRELL AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 */
35#include <sys/cdefs.h>
36__FBSDID("$FreeBSD: head/lib/libkse/thread/thr_kern.c 118850 2003-08-13 01:49:07Z davidxu $");
37
38#include <sys/types.h>
39#include <sys/kse.h>
40#include <sys/signalvar.h>
41#include <sys/queue.h>
42#include <machine/atomic.h>
43#include <machine/sigframe.h>
44
45#include <assert.h>
46#include <errno.h>
47#include <signal.h>
48#include <stdlib.h>
49#include <string.h>
50#include <time.h>
51#include <ucontext.h>
52#include <unistd.h>
53
54#include "atomic_ops.h"
55#include "thr_private.h"
56#include "libc_private.h"
57
58/*#define DEBUG_THREAD_KERN */
59#ifdef DEBUG_THREAD_KERN
60#define DBG_MSG		stdout_debug
61#else
62#define DBG_MSG(x...)
63#endif
64
65/*
66 * Define a high water mark for the maximum number of threads that
67 * will be cached.  Once this level is reached, any extra threads
68 * will be free()'d.
69 */
70#define	MAX_CACHED_THREADS	100
71/*
72 * Define high water marks for the maximum number of KSEs and KSE groups
73 * that will be cached. Because we support 1:1 threading, there could be
74 * as many KSEs and KSE groups as there are threads. Once these levels are
75 * reached, any extra KSEs and KSE groups will be free()'d.
76 */
77#ifdef SYSTEM_SCOPE_ONLY
78#define	MAX_CACHED_KSES		100
79#define	MAX_CACHED_KSEGS	100
80#else
81#define	MAX_CACHED_KSES		50
82#define	MAX_CACHED_KSEGS	50
83#endif
84
85#define	KSE_STACKSIZE		16384
86
87#define	KSE_SET_MBOX(kse, thrd) \
88	(kse)->k_kcb->kcb_kmbx.km_curthread = &(thrd)->tcb->tcb_tmbx
89
90#define	KSE_SET_EXITED(kse)	(kse)->k_flags |= KF_EXITED
91
92/*
93 * Macros for manipulating the run queues.  The priority queue
94 * routines use the thread's pqe link and also handle the setting
95 * and clearing of the thread's THR_FLAGS_IN_RUNQ flag.
96 */
97#define	KSE_RUNQ_INSERT_HEAD(kse, thrd)			\
98	_pq_insert_head(&(kse)->k_schedq->sq_runq, thrd)
99#define	KSE_RUNQ_INSERT_TAIL(kse, thrd)			\
100	_pq_insert_tail(&(kse)->k_schedq->sq_runq, thrd)
101#define	KSE_RUNQ_REMOVE(kse, thrd)			\
102	_pq_remove(&(kse)->k_schedq->sq_runq, thrd)
103#define	KSE_RUNQ_FIRST(kse)	_pq_first(&(kse)->k_schedq->sq_runq)
104
105#define KSE_RUNQ_THREADS(kse)	((kse)->k_schedq->sq_runq.pq_threads)
106
107/*
108 * We've got to keep track of everything that is allocated, not only
109 * to have a speedy free list, but also so that it can all be
110 * deallocated after a fork().
111 */
112static TAILQ_HEAD(, kse)	active_kseq;
113static TAILQ_HEAD(, kse)	free_kseq;
114static TAILQ_HEAD(, kse_group)	free_kse_groupq;
115static TAILQ_HEAD(, kse_group)	active_kse_groupq;
116static TAILQ_HEAD(, kse_group)	gc_ksegq;
117static struct lock		kse_lock;	/* also used for kseg queue */
118static int			free_kse_count = 0;
119static int			free_kseg_count = 0;
120static TAILQ_HEAD(, pthread)	free_threadq;
121static struct lock		thread_lock;
122static int			free_thread_count = 0;
123static int			inited = 0;
124static int			active_threads = 1;
125static int			active_kse_count = 0;
126static int			active_kseg_count = 0;
127static u_int64_t		next_uniqueid = 1;
128
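/*
 * Threads are also hashed on the thread pointer (via THREAD_HASH())
 * into this table for fast lookup.
 */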
129LIST_HEAD(thread_hash_head, pthread);
130#define THREAD_HASH_QUEUES	127
131static struct thread_hash_head	thr_hashtable[THREAD_HASH_QUEUES];
132#define	THREAD_HASH(thrd)	((unsigned long)thrd % THREAD_HASH_QUEUES)
133
134#ifdef DEBUG_THREAD_KERN
135static void	dump_queues(struct kse *curkse);
136#endif
137static void	kse_check_completed(struct kse *kse);
138static void	kse_check_waitq(struct kse *kse);
139static void	kse_fini(struct kse *curkse);
140static void	kse_reinit(struct kse *kse, int sys_scope);
141static void	kse_sched_multi(struct kse_mailbox *kmbx);
142static void	kse_sched_single(struct kse_mailbox *kmbx);
143static void	kse_switchout_thread(struct kse *kse, struct pthread *thread);
144static void	kse_wait(struct kse *kse, struct pthread *td_wait, int sigseq);
145static void	kse_free_unlocked(struct kse *kse);
146static void	kse_destroy(struct kse *kse);
147static void	kseg_free_unlocked(struct kse_group *kseg);
148static void	kseg_init(struct kse_group *kseg);
149static void	kseg_reinit(struct kse_group *kseg);
150static void	kseg_destroy(struct kse_group *kseg);
151static void	kse_waitq_insert(struct pthread *thread);
152static void	kse_wakeup_multi(struct kse *curkse);
153static struct kse_mailbox *kse_wakeup_one(struct pthread *thread);
154static void	thr_cleanup(struct kse *kse, struct pthread *curthread);
155static void	thr_link(struct pthread *thread);
156static void	thr_resume_wrapper(int sig, siginfo_t *, ucontext_t *);
157static void	thr_resume_check(struct pthread *curthread, ucontext_t *ucp,
158		    struct pthread_sigframe *psf);
159static int	thr_timedout(struct pthread *thread, struct timespec *curtime);
160static void	thr_unlink(struct pthread *thread);
161static void	thread_gc(struct pthread *thread);
162static void	kse_gc(struct pthread *thread);
163static void	kseg_gc(struct pthread *thread);
164
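/*
 * Charge the user and system ticks accumulated in the thread's
 * mailbox against its time slice (SCHED_FIFO threads are not
 * time-sliced).  Once the slice is used up, slice_usec is set to
 * -1, which tells the scheduler to requeue the thread at the tail
 * of its priority queue.
 */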
165static void __inline
166thr_accounting(struct pthread *thread)
167{
168	if ((thread->slice_usec != -1) &&
169	    (thread->slice_usec <= TIMESLICE_USEC) &&
170	    (thread->attr.sched_policy != SCHED_FIFO)) {
171		thread->slice_usec += (thread->tcb->tcb_tmbx.tm_uticks
172		    + thread->tcb->tcb_tmbx.tm_sticks) * _clock_res_usec;
173		/* Check for time quantum exceeded: */
174		if (thread->slice_usec > TIMESLICE_USEC)
175			thread->slice_usec = -1;
176	}
177	thread->tcb->tcb_tmbx.tm_uticks = 0;
178	thread->tcb->tcb_tmbx.tm_sticks = 0;
179}
180
181/*
182 * This is called after a fork().
183 * No locks need to be taken here since we are guaranteed to be
184 * single threaded.
185 *
186 * XXX
187 * POSIX says that in a threaded process, fork() should be used
188 * only to run new programs, and the effects of calling functions
189 * that require certain resources between the call to fork() and
190 * the call to an exec function are undefined.
191 *
192 * It is therefore not safe to reinitialize the library after fork():
193 * the memory allocator may be in an inconsistent state, so further
194 * calls to malloc()/free() may cause undefined behavior.
195 */
196void
197_kse_single_thread(struct pthread *curthread)
198{
199#ifdef NOTYET
200	struct kse *kse;
201	struct kse_group *kseg;
202	struct pthread *thread;
203	kse_critical_t crit;
204	int i;
205
206	if (__isthreaded) {
207		_thr_rtld_fini();
208		_thr_signal_deinit();
209	}
210	__isthreaded = 0;
211	/*
212	 * Restore signal mask early, so any memory problems could
213	 * dump core.
214	 */
215	sigprocmask(SIG_SETMASK, &curthread->sigmask, NULL);
216	active_threads = 1;
217
218	/*
219	 * Enter a loop to remove and free all threads other than
220	 * the running thread from the active thread list:
221	 */
222	while ((thread = TAILQ_FIRST(&_thread_list)) != NULL) {
223		THR_GCLIST_REMOVE(thread);
224		/*
225		 * Remove this thread from the list (the current
226		 * thread will be removed but re-added by libpthread
227		 * initialization).
228		 */
229		TAILQ_REMOVE(&_thread_list, thread, tle);
230		/* Make sure this isn't the running thread: */
231		if (thread != curthread) {
232			_thr_stack_free(&thread->attr);
233			if (thread->specific != NULL)
234				free(thread->specific);
235			for (i = 0; i < MAX_THR_LOCKLEVEL; i++) {
236				_lockuser_destroy(&thread->lockusers[i]);
237			}
238			_lock_destroy(&thread->lock);
239			free(thread);
240		}
241	}
242
243	TAILQ_INIT(&curthread->mutexq);		/* initialize mutex queue */
244	curthread->joiner = NULL;		/* no joining threads yet */
245	curthread->refcount = 0;
246	SIGEMPTYSET(curthread->sigpend);	/* clear pending signals */
247	if (curthread->specific != NULL) {
248		free(curthread->specific);
249		curthread->specific = NULL;
250		curthread->specific_data_count = 0;
251	}
252
253	/* Free the free KSEs: */
254	while ((kse = TAILQ_FIRST(&free_kseq)) != NULL) {
255		TAILQ_REMOVE(&free_kseq, kse, k_qe);
256		for (i = 0; i < MAX_KSE_LOCKLEVEL; i++) {
257			_lockuser_destroy(&kse->k_lockusers[i]);
258		}
259		_lock_destroy(&kse->k_lock);
260		_kcb_dtor(kse->k_kcb);
261		if (kse->k_stack.ss_sp != NULL)
262			free(kse->k_stack.ss_sp);
263		free(kse);
264	}
265	free_kse_count = 0;
266
267	/* Free the active KSEs: */
268	while ((kse = TAILQ_FIRST(&active_kseq)) != NULL) {
269		TAILQ_REMOVE(&active_kseq, kse, k_qe);
270		for (i = 0; i < MAX_KSE_LOCKLEVEL; i++) {
271			_lockuser_destroy(&kse->k_lockusers[i]);
272		}
273		_lock_destroy(&kse->k_lock);
274		if (kse->k_stack.ss_sp != NULL)
275			free(kse->k_stack.ss_sp);
276		free(kse);
277	}
278	active_kse_count = 0;
279
280	/* Free the free KSEGs: */
281	while ((kseg = TAILQ_FIRST(&free_kse_groupq)) != NULL) {
282		TAILQ_REMOVE(&free_kse_groupq, kseg, kg_qe);
283		_lock_destroy(&kseg->kg_lock);
284		_pq_free(&kseg->kg_schedq.sq_runq);
285		free(kseg);
286	}
287	free_kseg_count = 0;
288
289	/* Free the active KSEGs: */
290	while ((kseg = TAILQ_FIRST(&active_kse_groupq)) != NULL) {
291		TAILQ_REMOVE(&active_kse_groupq, kseg, kg_qe);
292		_lock_destroy(&kseg->kg_lock);
293		_pq_free(&kseg->kg_schedq.sq_runq);
294		free(kseg);
295	}
296	active_kseg_count = 0;
297
298	/* Free the free threads. */
299	while ((thread = TAILQ_FIRST(&free_threadq)) != NULL) {
300		TAILQ_REMOVE(&free_threadq, thread, tle);
301		if (thread->specific != NULL)
302			free(thread->specific);
303		for (i = 0; i < MAX_THR_LOCKLEVEL; i++) {
304			_lockuser_destroy(&thread->lockusers[i]);
305		}
306		_lock_destroy(&thread->lock);
307		free(thread);
308	}
309	free_thread_count = 0;
310
311	/* Free the to-be-gc'd threads. */
312	while ((thread = TAILQ_FIRST(&_thread_gc_list)) != NULL) {
313		TAILQ_REMOVE(&_thread_gc_list, thread, gcle);
314		for (i = 0; i < MAX_THR_LOCKLEVEL; i++) {
315			_lockuser_destroy(&thread->lockusers[i]);
316		}
317		_lock_destroy(&thread->lock);
318		free(thread);
319	}
320	TAILQ_INIT(&gc_ksegq);
321	_gc_count = 0;
322
323	if (inited != 0) {
324		/*
325		 * Destroy these locks; they'll be recreated to assure they
326		 * are in the unlocked state.
327		 */
328		_lock_destroy(&kse_lock);
329		_lock_destroy(&thread_lock);
330		_lock_destroy(&_thread_list_lock);
331		inited = 0;
332	}
333
334	/*
335	 * After a fork(), the leftover thread goes back to being
336	 * scope process.
337	 */
338	curthread->attr.flags &= ~PTHREAD_SCOPE_SYSTEM;
339	curthread->attr.flags |= PTHREAD_SCOPE_PROCESS;
340
341	/*
342	 * After a fork, we are still operating on the thread's original
343	 * stack.  Don't clear the THR_FLAGS_USER from the thread's
344	 * attribute flags.
345	 */
346
347	/* Initialize the threads library. */
348	curthread->kse = NULL;
349	curthread->kseg = NULL;
350	_kse_initial = NULL;
351	_libpthread_init(curthread);
352#else
353	if (__isthreaded) {
354		_thr_rtld_fini();
355		_thr_signal_deinit();
356	}
357	__isthreaded = 0;
358	/*
359	 * Restore signal mask early, so any memory problems could
360	 * dump core.
361	 */
362	sigprocmask(SIG_SETMASK, &curthread->sigmask, NULL);
363	curthread->kse->k_kcb->kcb_kmbx.km_curthread = NULL;
364	active_threads = 1;
365#endif
366}
367
368/*
369 * This is used to initialize housekeeping and to initialize the
370 * KSD for the KSE.
371 */
372void
373_kse_init(void)
374{
375	if (inited == 0) {
376		TAILQ_INIT(&active_kseq);
377		TAILQ_INIT(&active_kse_groupq);
378		TAILQ_INIT(&free_kseq);
379		TAILQ_INIT(&free_kse_groupq);
380		TAILQ_INIT(&free_threadq);
381		TAILQ_INIT(&gc_ksegq);
382		if (_lock_init(&kse_lock, LCK_ADAPTIVE,
383		    _kse_lock_wait, _kse_lock_wakeup) != 0)
384			PANIC("Unable to initialize free KSE queue lock");
385		if (_lock_init(&thread_lock, LCK_ADAPTIVE,
386		    _kse_lock_wait, _kse_lock_wakeup) != 0)
387			PANIC("Unable to initialize free thread queue lock");
388		if (_lock_init(&_thread_list_lock, LCK_ADAPTIVE,
389		    _kse_lock_wait, _kse_lock_wakeup) != 0)
390			PANIC("Unable to initialize thread list lock");
391		active_kse_count = 0;
392		active_kseg_count = 0;
393		_gc_count = 0;
394		inited = 1;
395	}
396}
397
398int
399_kse_isthreaded(void)
400{
401	return (__isthreaded != 0);
402}
403
404/*
405 * This is called when the first thread (other than the initial
406 * thread) is created.
407 */
408int
409_kse_setthreaded(int threaded)
410{
411	sigset_t sigset;
412
413	if ((threaded != 0) && (__isthreaded == 0)) {
414		/*
415		 * Tell the kernel to create a KSE for the initial thread
416		 * and enable upcalls in it.
417		 */
418		_kse_initial->k_flags |= KF_STARTED;
419
420#ifdef SYSTEM_SCOPE_ONLY
421		/*
422		 * For a bound thread, the kernel reads the mailbox pointer
423		 * only once, so we set it here before calling kse_create().
424		 */
425		_tcb_set(_kse_initial->k_kcb, _thr_initial->tcb);
426		KSE_SET_MBOX(_kse_initial, _thr_initial);
427		_kse_initial->k_kcb->kcb_kmbx.km_flags |= KMF_BOUND;
428#endif
429		SIGFILLSET(sigset);
430		__sys_sigprocmask(SIG_SETMASK, &sigset, &_thr_initial->sigmask);
431		_thr_signal_init();
432
433		/*
434		 * Locking functions in libc are required when there are
435		 * threads other than the initial thread.
436		 */
437		_thr_rtld_init();
438
439		__isthreaded = 1;
440		if (kse_create(&_kse_initial->k_kcb->kcb_kmbx, 0) != 0) {
441			_kse_initial->k_flags &= ~KF_STARTED;
442			__isthreaded = 0;
443			PANIC("kse_create() failed\n");
444			return (-1);
445		}
446
447#ifndef SYSTEM_SCOPE_ONLY
448		/* Set current thread to initial thread */
449		_tcb_set(_kse_initial->k_kcb, _thr_initial->tcb);
450		KSE_SET_MBOX(_kse_initial, _thr_initial);
451		_thr_start_sig_daemon();
452		_thr_setmaxconcurrency();
453#else
454		__sys_sigprocmask(SIG_SETMASK, &_thr_initial->sigmask, NULL);
455#endif
456	}
457	return (0);
458}
459
460/*
461 * Lock wait and wakeup handlers for KSE locks.  These are only used by
462 * KSEs, and should never be used by threads.  KSE locks include the
463 * KSE group lock (used for locking the scheduling queue) and the
464 * kse_lock defined above.
465 *
466 * When a KSE lock attempt blocks, the entire KSE blocks allowing another
467 * KSE to run.  For the most part, it doesn't make much sense to try and
468 * schedule another thread because you need to lock the scheduling queue
469 * in order to do that.  And since the KSE lock is used to lock the scheduling
470 * queue, you would just end up blocking again.
471 */
472void
473_kse_lock_wait(struct lock *lock, struct lockuser *lu)
474{
475	struct kse *curkse = (struct kse *)_LCK_GET_PRIVATE(lu);
476	struct timespec ts;
477	int saved_flags;
478
479	if (curkse->k_kcb->kcb_kmbx.km_curthread != NULL)
480		PANIC("kse_lock_wait does not disable upcall.\n");
481	/*
482	 * Enter a loop to wait until we get the lock.
483	 */
484	ts.tv_sec = 0;
485	ts.tv_nsec = 1000000;  /* 1 msec */
486	while (!_LCK_GRANTED(lu)) {
487		/*
488		 * Yield the kse and wait to be notified when the lock
489		 * is granted.
490		 */
491		saved_flags = curkse->k_kcb->kcb_kmbx.km_flags;
492		curkse->k_kcb->kcb_kmbx.km_flags |= KMF_NOUPCALL |
493		    KMF_NOCOMPLETED;
494		kse_release(&ts);
495		curkse->k_kcb->kcb_kmbx.km_flags = saved_flags;
496	}
497}
498
499void
500_kse_lock_wakeup(struct lock *lock, struct lockuser *lu)
501{
502	struct kse *curkse;
503	struct kse *kse;
504	struct kse_mailbox *mbx;
505
506	curkse = _get_curkse();
507	kse = (struct kse *)_LCK_GET_PRIVATE(lu);
508
509	if (kse == curkse)
510		PANIC("KSE trying to wake itself up in lock");
511	else {
512		mbx = &kse->k_kcb->kcb_kmbx;
513		_lock_grant(lock, lu);
514		/*
515		 * Notify the owning kse that it has the lock.
516		 * It is safe to pass an invalid address to kse_wakeup
517		 * even if the mailbox is not in the kernel at all;
518		 * waking up the wrong kse is also harmless.
519		 */
520		kse_wakeup(mbx);
521	}
522}
523
524/*
525 * Thread wait and wakeup handlers for thread locks.  These are only used
526 * by threads, never by KSEs.  Thread locks include the per-thread lock
527 * (defined in its structure), and condition variable and mutex locks.
528 */
529void
530_thr_lock_wait(struct lock *lock, struct lockuser *lu)
531{
532	struct pthread *curthread = (struct pthread *)lu->lu_private;
533
534	do {
535		THR_LOCK_SWITCH(curthread);
536		THR_SET_STATE(curthread, PS_LOCKWAIT);
537		_thr_sched_switch_unlocked(curthread);
538	} while (!_LCK_GRANTED(lu));
539}
540
541void
542_thr_lock_wakeup(struct lock *lock, struct lockuser *lu)
543{
544	struct pthread *thread;
545	struct pthread *curthread;
546	struct kse_mailbox *kmbx;
547
548	curthread = _get_curthread();
549	thread = (struct pthread *)_LCK_GET_PRIVATE(lu);
550
551	THR_SCHED_LOCK(curthread, thread);
552	_lock_grant(lock, lu);
553	kmbx = _thr_setrunnable_unlocked(thread);
554	THR_SCHED_UNLOCK(curthread, thread);
555	if (kmbx != NULL)
556		kse_wakeup(kmbx);
557}
558
559kse_critical_t
560_kse_critical_enter(void)
561{
562	kse_critical_t crit;
563
564	crit = (kse_critical_t)_kcb_critical_enter();
565	return (crit);
566}
567
568void
569_kse_critical_leave(kse_critical_t crit)
570{
571	struct pthread *curthread;
572
573	_kcb_critical_leave((struct kse_thr_mailbox *)crit);
574	if ((crit != NULL) && ((curthread = _get_curthread()) != NULL))
575		THR_YIELD_CHECK(curthread);
576}
577
578int
579_kse_in_critical(void)
580{
581	return (_kcb_in_critical());
582}
583
584void
585_thr_critical_enter(struct pthread *thread)
586{
587	thread->critical_count++;
588}
589
590void
591_thr_critical_leave(struct pthread *thread)
592{
593	thread->critical_count--;
594	THR_YIELD_CHECK(thread);
595}
596
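/*
 * Voluntary scheduler entry: enter a critical region, take the
 * scheduling lock, and hand off to _thr_sched_switch_unlocked().
 */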
597void
598_thr_sched_switch(struct pthread *curthread)
599{
600	struct kse *curkse;
601
602	(void)_kse_critical_enter();
603	curkse = _get_curkse();
604	KSE_SCHED_LOCK(curkse, curkse->k_kseg);
605	_thr_sched_switch_unlocked(curthread);
606}
607
608/*
609 * XXX - We may need to take the scheduling lock before calling
610 *       this, or perhaps take the lock within here before
611 *       doing anything else.
612 */
613void
614_thr_sched_switch_unlocked(struct pthread *curthread)
615{
616	struct pthread *td;
617	struct pthread_sigframe psf;
618	struct kse *curkse;
619	int ret;
620	volatile int uts_once;
621	volatile int resume_once = 0;
622	ucontext_t uc;
623
624	/* We're in the scheduler, 5 by 5: */
625	curkse = _get_curkse();
626
627	curthread->need_switchout = 1;	/* The thread yielded on its own. */
628	curthread->critical_yield = 0;	/* No need to yield anymore. */
629	thr_accounting(curthread);
630
631
632	/* Thread can unlock the scheduler lock. */
633	curthread->lock_switch = 1;
634
635	/*
636	 * The signal frame is allocated off the stack because
637	 * a thread can be interrupted by other signals while
638	 * it is running down pending signals.
639	 */
640	psf.psf_valid = 0;
641	curthread->curframe = &psf;
642
643	/*
644	 * Enter the scheduler if any one of the following is true:
645	 *
646	 *   o The current thread is dead; its stack needs to be
647	 *     cleaned up, and that can't be done while operating on
648	 *     it.
649	 *   o The current thread has signals pending; the scheduler
650	 *     should install the signal trampoline for us.
651	 *   o There are no runnable threads.
652	 *   o The next thread to run won't unlock the scheduler
653	 *     lock.  A side note: the current thread may be run
654	 *     instead of the next thread in the run queue, but
655	 *     we don't bother checking for that.
656	 */
657	if (curthread->attr.flags & PTHREAD_SCOPE_SYSTEM)
658		kse_sched_single(&curkse->k_kcb->kcb_kmbx);
659	else if ((curthread->state == PS_DEAD) ||
660	    (((td = KSE_RUNQ_FIRST(curkse)) == NULL) &&
661	    (curthread->state != PS_RUNNING)) ||
662	    ((td != NULL) && (td->lock_switch == 0))) {
663		curkse->k_switch = 1;
664		_thread_enter_uts(curthread->tcb, curkse->k_kcb);
665	}
666	else {
667		uts_once = 0;
668		THR_GETCONTEXT(&curthread->tcb->tcb_tmbx.tm_context);
669		if (uts_once == 0) {
670			uts_once = 1;
671
672			/* Switchout the current thread. */
673			kse_switchout_thread(curkse, curthread);
674			_tcb_set(curkse->k_kcb, NULL);
675
676		 	/* Choose another thread to run. */
677			td = KSE_RUNQ_FIRST(curkse);
678			KSE_RUNQ_REMOVE(curkse, td);
679			curkse->k_curthread = td;
680
681			/*
682			 * Make sure the current thread's kse points to
683			 * this kse.
684			 */
685			td->kse = curkse;
686
687			/*
688			 * Reset the time slice if this thread is running
689			 * for the first time or running again after using
690			 * its full time slice allocation.
691			 */
692			if (td->slice_usec == -1)
693				td->slice_usec = 0;
694
695			/* Mark the thread active. */
696			td->active = 1;
697
698			/* Remove the frame reference. */
699			td->curframe = NULL;
700
701			/*
702			 * Continue the thread at its current frame.
703			 * Note: TCB is set in _thread_switch
704			 */
705			ret = _thread_switch(curkse->k_kcb, td->tcb, 0);
706			/* This point should not be reached. */
707			if (ret != 0)
708				PANIC("Bad return from _thread_switch");
709			PANIC("Thread has returned from _thread_switch");
710		}
711	}
712
713	if (psf.psf_valid) {
714		/*
715		 * It is ugly that we must increase the critical count here,
716		 * but since we have a frame saved, the state in psf must be
717		 * backed out before we can process signals.
718 		 */
719		curthread->critical_count++;
720	}
721
722	if (curthread->lock_switch != 0) {
723		/*
724		 * Unlock the scheduling queue and leave the
725		 * critical region.
726		 */
727		/* Don't trust this after a switch! */
728		curkse = _get_curkse();
729
730		curthread->lock_switch = 0;
731		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
732		_kse_critical_leave(&curthread->tcb->tcb_tmbx);
733	}
734	/*
735	 * This thread is being resumed; check for cancellations.
736	 */
737	if ((psf.psf_valid ||
738	    (curthread->check_pending && !THR_IN_CRITICAL(curthread)))) {
739		resume_once = 0;
740		THR_GETCONTEXT(&uc);
741		if (resume_once == 0) {
742			resume_once = 1;
743			curthread->check_pending = 0;
744			thr_resume_check(curthread, &uc, &psf);
745		}
746	}
747	THR_ACTIVATE_LAST_LOCK(curthread);
748}
749
750/*
751 * This is the scheduler for a KSE which runs a scope system thread.
752 * The multi-thread KSE scheduler should also work for a single threaded
753 * KSE, but we use a separate scheduler so that it can be fine-tuned
754 * to be more efficient (and perhaps not need a separate stack for
755 * the KSE, allowing it to use the thread's stack).
756 */
757
758static void
759kse_sched_single(struct kse_mailbox *kmbx)
760{
761	struct kse *curkse;
762	struct pthread *curthread;
763	struct timespec ts;
764	sigset_t sigmask;
765	int i, sigseqno, level, first = 0;
766
767	curkse = (struct kse *)kmbx->km_udata;
768	curthread = curkse->k_curthread;
769
770	if ((curkse->k_flags & KF_INITIALIZED) == 0) {
771		/* Setup this KSEs specific data. */
772		_kcb_set(curkse->k_kcb);
773		_tcb_set(curkse->k_kcb, curthread->tcb);
774		curkse->k_flags |= KF_INITIALIZED;
775		first = 1;
776		curthread->active = 1;
777
778		/* Setup kernel signal masks for new thread. */
779		__sys_sigprocmask(SIG_SETMASK, &curthread->sigmask, NULL);
780		/*
781		 * Enter a critical region.  This is meaningless for a bound
782		 * thread, but it keeps other code happy that expects the
783		 * mailbox to be cleared.
784		 */
785		(void)_kse_critical_enter();
786 	} else {
787		/*
788		 * A bound thread always has its tcb set; this prevents other
789		 * code from blindly setting the bound thread's tcb to NULL
790		 * (buggy code?).
791		 */
792		_tcb_set(curkse->k_kcb, curthread->tcb);
793	}
794
795	curthread->critical_yield = 0;
796	curthread->need_switchout = 0;
797
798	/*
799	 * Lock the scheduling queue.
800	 *
801	 * There is no scheduling queue for single threaded KSEs,
802	 * but we need a lock for protection regardless.
803	 */
804	if (curthread->lock_switch == 0)
805		KSE_SCHED_LOCK(curkse, curkse->k_kseg);
806
807	/*
808	 * This has to do the job of kse_switchout_thread(), only
809	 * for a single threaded KSE/KSEG.
810	 */
811
812	switch (curthread->state) {
813	case PS_DEAD:
814		curthread->check_pending = 0;
815		/* Unlock the scheduling queue and exit the KSE and thread. */
816		thr_cleanup(curkse, curthread);
817		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
818		PANIC("bound thread shouldn't get here\n");
819		break;
820
821	case PS_SIGWAIT:
822		PANIC("bound thread does not have SIGWAIT state\n");
823
824	case PS_SLEEP_WAIT:
825		PANIC("bound thread does not have SLEEP_WAIT state\n");
826
827	case PS_SIGSUSPEND:
828		PANIC("bound thread does not have SIGSUSPEND state\n");
829
830	case PS_COND_WAIT:
831		break;
832
833	case PS_LOCKWAIT:
834		/*
835		 * This state doesn't timeout.
836		 */
837		curthread->wakeup_time.tv_sec = -1;
838		curthread->wakeup_time.tv_nsec = -1;
839		level = curthread->locklevel - 1;
840		if (_LCK_GRANTED(&curthread->lockusers[level]))
841			THR_SET_STATE(curthread, PS_RUNNING);
842		break;
843
844	case PS_RUNNING:
845		if ((curthread->flags & THR_FLAGS_SUSPENDED) != 0) {
846			THR_SET_STATE(curthread, PS_SUSPENDED);
847		}
848		curthread->wakeup_time.tv_sec = -1;
849		curthread->wakeup_time.tv_nsec = -1;
850		break;
851
852	case PS_JOIN:
853	case PS_MUTEX_WAIT:
854	case PS_SUSPENDED:
855	case PS_DEADLOCK:
856	default:
857		/*
858		 * These states don't timeout and don't need
859		 * to be in the waiting queue.
860		 */
861		curthread->wakeup_time.tv_sec = -1;
862		curthread->wakeup_time.tv_nsec = -1;
863		break;
864	}
865
866	while (curthread->state != PS_RUNNING) {
867		sigseqno = curkse->k_sigseqno;
868		if (curthread->check_pending != 0) {
869			/*
870			 * Install pending signals into the frame; this may
871			 * cause a mutex or condvar backout.
872			 */
873			curthread->check_pending = 0;
874			SIGFILLSET(sigmask);
875
876			/*
877			 * Lock out kernel signal code when we are processing
878			 * signals, and get a fresh copy of signal mask.
879			 */
880			__sys_sigprocmask(SIG_SETMASK, &sigmask,
881					  &curthread->sigmask);
882			for (i = 1; i <= _SIG_MAXSIG; i++) {
883				if (SIGISMEMBER(curthread->sigmask, i))
884					continue;
885				if (SIGISMEMBER(curthread->sigpend, i))
886					(void)_thr_sig_add(curthread, i,
887					    &curthread->siginfo[i-1]);
888			}
889			__sys_sigprocmask(SIG_SETMASK, &curthread->sigmask,
890				NULL);
891			/* The above code might make thread runnable */
892			if (curthread->state == PS_RUNNING)
893				break;
894		}
895		THR_DEACTIVATE_LAST_LOCK(curthread);
896		kse_wait(curkse, curthread, sigseqno);
897		THR_ACTIVATE_LAST_LOCK(curthread);
898		KSE_GET_TOD(curkse, &ts);
899		if (thr_timedout(curthread, &ts)) {
900			/* Indicate the thread timedout: */
901			curthread->timeout = 1;
902			/* Make the thread runnable. */
903			THR_SET_STATE(curthread, PS_RUNNING);
904		}
905	}
906
907	/* Remove the frame reference. */
908	curthread->curframe = NULL;
909
910	if (curthread->lock_switch == 0) {
911		/* Unlock the scheduling queue. */
912		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
913	}
914
915	DBG_MSG("Continuing bound thread %p\n", curthread);
916	if (first) {
917		_kse_critical_leave(&curthread->tcb->tcb_tmbx);
918		pthread_exit(curthread->start_routine(curthread->arg));
919	}
920}
921
922#ifdef DEBUG_THREAD_KERN
923static void
924dump_queues(struct kse *curkse)
925{
926	struct pthread *thread;
927
928	DBG_MSG("Threads in waiting queue:\n");
929	TAILQ_FOREACH(thread, &curkse->k_kseg->kg_schedq.sq_waitq, pqe) {
930		DBG_MSG("  thread %p, state %d, blocked %d\n",
931		    thread, thread->state, thread->blocked);
932	}
933}
934#endif
935
936/*
937 * This is the scheduler for a KSE which runs multiple threads.
938 */
939static void
940kse_sched_multi(struct kse_mailbox *kmbx)
941{
942	struct kse *curkse;
943	struct pthread *curthread, *td_wait;
944	struct pthread_sigframe *curframe;
945	int ret;
946
947	curkse = (struct kse *)kmbx->km_udata;
948	THR_ASSERT(curkse->k_kcb->kcb_kmbx.km_curthread == NULL,
949	    "Mailbox not null in kse_sched_multi");
950
951	/* Check for first time initialization: */
952	if ((curkse->k_flags & KF_INITIALIZED) == 0) {
953		/* Setup this KSEs specific data. */
954		_kcb_set(curkse->k_kcb);
955
956		/* Set this before grabbing the context. */
957		curkse->k_flags |= KF_INITIALIZED;
958	}
959
960	/*
961	 * There is no current thread anymore; calling _get_curthread()
962	 * in the UTS should dump core.
963	 */
964	_tcb_set(curkse->k_kcb, NULL);
965
966	/* This may have returned from a kse_release(). */
967	if (KSE_WAITING(curkse)) {
968		DBG_MSG("Entered upcall when KSE is waiting.");
969		KSE_CLEAR_WAIT(curkse);
970	}
971
972	/* If this is an upcall, take the scheduler lock. */
973	if (curkse->k_switch == 0)
974		KSE_SCHED_LOCK(curkse, curkse->k_kseg);
975	curkse->k_switch = 0;
976
977	/*
978	 * Now that the scheduler lock is held, get the current
979	 * thread.  The KSE's current thread cannot be safely
980	 * examined without the lock because it could have returned
981	 * as completed on another KSE.  See kse_check_completed().
982	 */
983	curthread = curkse->k_curthread;
984
985	if (KSE_IS_IDLE(curkse)) {
986		KSE_CLEAR_IDLE(curkse);
987		curkse->k_kseg->kg_idle_kses--;
988	}
989	/*
990	 * If the current thread was completed in another KSE, then
991	 * it will be in the run queue.  Don't mark it as being blocked.
992	 */
993	if ((curthread != NULL) &&
994	    ((curthread->flags & THR_FLAGS_IN_RUNQ) == 0) &&
995	    (curthread->need_switchout == 0)) {
996		/*
997		 * Assume the current thread is blocked; when the
998		 * completed threads are checked, if the current
999		 * thread is among them, the blocked flag
1000		 * will be cleared.
1001		 */
1002		curthread->blocked = 1;
1003	}
1004
1005	/* Check for any unblocked threads in the kernel. */
1006	kse_check_completed(curkse);
1007
1008	/*
1009	 * Check for threads that have timed-out.
1010	 */
1011	kse_check_waitq(curkse);
1012
1013	/*
1014	 * Switchout the current thread, if necessary, as the last step
1015	 * so that it is inserted into the run queue (if it's runnable)
1016	 * _after_ any other threads that were added to it above.
1017	 */
1018	if (curthread == NULL)
1019		;  /* Nothing to do here. */
1020	else if ((curthread->need_switchout == 0) &&
1021	    (curthread->blocked == 0) && (THR_IN_CRITICAL(curthread))) {
1022		/*
1023		 * Resume the thread and tell it to yield when
1024		 * it leaves the critical region.
1025		 */
1026		curthread->critical_yield = 1;
1027		curthread->active = 1;
1028		if ((curthread->flags & THR_FLAGS_IN_RUNQ) != 0)
1029			KSE_RUNQ_REMOVE(curkse, curthread);
1030		curkse->k_curthread = curthread;
1031		curthread->kse = curkse;
1032		DBG_MSG("Continuing thread %p in critical region\n",
1033		    curthread);
1034		kse_wakeup_multi(curkse);
1035		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
1036		ret = _thread_switch(curkse->k_kcb, curthread->tcb, 1);
1037		if (ret != 0)
1038			PANIC("Can't resume thread in critical region\n");
1039	}
1040	else if ((curthread->flags & THR_FLAGS_IN_RUNQ) == 0)
1041		kse_switchout_thread(curkse, curthread);
1042	curkse->k_curthread = NULL;
1043
1044	kse_wakeup_multi(curkse);
1045
1046#ifdef DEBUG_THREAD_KERN
1047	dump_queues(curkse);
1048#endif
1049
1050	/* Check if there are no threads ready to run: */
1051	while (((curthread = KSE_RUNQ_FIRST(curkse)) == NULL) &&
1052	    (curkse->k_kseg->kg_threadcount != 0)) {
1053		/*
1054		 * Wait for a thread to become active or until there are
1055		 * no more threads.
1056		 */
1057		td_wait = KSE_WAITQ_FIRST(curkse);
1058		kse_wait(curkse, td_wait, 0);
1059		kse_check_completed(curkse);
1060		kse_check_waitq(curkse);
1061	}
1062
1063	/* Check for no more threads: */
1064	if (curkse->k_kseg->kg_threadcount == 0) {
1065		/*
1066		 * Normally this shouldn't return, but it will if there
1067		 * are other KSEs running that create new threads that
1068		 * are assigned to this KSE[G].  For instance, if a scope
1069		 * system thread were to create a scope process thread
1070		 * and this kse[g] is the initial kse[g], then that newly
1071		 * created thread would be assigned to us (the initial
1072		 * kse[g]).
1073		 */
1074		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
1075		kse_fini(curkse);
1076		/* never returns */
1077	}
1078
1079	THR_ASSERT(curthread != NULL,
1080	    "Return from kse_wait/fini without thread.");
1081	THR_ASSERT(curthread->state != PS_DEAD,
1082	    "Trying to resume dead thread!");
1083	KSE_RUNQ_REMOVE(curkse, curthread);
1084
1085	/*
1086	 * Make the selected thread the current thread.
1087	 */
1088	curkse->k_curthread = curthread;
1089
1090	/*
1091	 * Make sure the current thread's kse points to this kse.
1092	 */
1093	curthread->kse = curkse;
1094
1095	/*
1096	 * Reset the time slice if this thread is running for the first
1097	 * time or running again after using its full time slice allocation.
1098	 */
1099	if (curthread->slice_usec == -1)
1100		curthread->slice_usec = 0;
1101
1102	/* Mark the thread active. */
1103	curthread->active = 1;
1104
1105	/* Remove the frame reference. */
1106	curframe = curthread->curframe;
1107	curthread->curframe = NULL;
1108
1109	kse_wakeup_multi(curkse);
1110
1111	/*
1112	 * The thread's current signal frame will only be NULL if it
1113	 * is being resumed after being blocked in the kernel.  In
1114	 * this case, and if the thread needs to run down pending
1115	 * signals or needs a cancellation check, we need to add a
1116	 * signal frame to the thread's context.
1117	 */
1118#ifdef NOT_YET
1119	if ((((curframe == NULL) && (curthread->check_pending != 0)) ||
1120	    (((curthread->cancelflags & THR_AT_CANCEL_POINT) == 0) &&
1121	     ((curthread->cancelflags & PTHREAD_CANCEL_ASYNCHRONOUS) != 0))) &&
1122	     !THR_IN_CRITICAL(curthread))
1123		signalcontext(&curthread->tcb->tcb_tmbx.tm_context, 0,
1124		    (__sighandler_t *)thr_resume_wrapper);
1125#else
1126	if ((curframe == NULL) && (curthread->state == PS_RUNNING) &&
1127	    (curthread->check_pending != 0) && !THR_IN_CRITICAL(curthread)) {
1128		curthread->check_pending = 0;
1129		signalcontext(&curthread->tcb->tcb_tmbx.tm_context, 0,
1130		    (__sighandler_t *)thr_resume_wrapper);
1131	}
1132#endif
1133	/*
1134	 * Continue the thread at its current frame:
1135	 */
1136	if (curthread->lock_switch != 0) {
1137		/*
1138		 * This thread came from a scheduler switch; it will
1139		 * unlock the scheduler lock and set the mailbox.
1140		 */
1141		ret = _thread_switch(curkse->k_kcb, curthread->tcb, 0);
1142	} else {
1143		/* This thread won't unlock the scheduler lock. */
1144		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
1145		ret = _thread_switch(curkse->k_kcb, curthread->tcb, 1);
1146	}
1147	if (ret != 0)
1148		PANIC("Thread has returned from _thread_switch");
1149
1150	/* This point should not be reached. */
1151	PANIC("Thread has returned from _thread_switch");
1152}
1153
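/*
 * Signal trampoline installed by signalcontext() in kse_sched_multi():
 * run down pending signals for the resuming thread and then switch
 * back to its saved context.
 */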
1154static void
1155thr_resume_wrapper(int sig, siginfo_t *siginfo, ucontext_t *ucp)
1156{
1157	struct pthread *curthread = _get_curthread();
1158	struct kse *curkse;
1159	int ret, err_save = curthread->error;
1160
1161	DBG_MSG(">>> sig wrapper\n");
1162	if (curthread->lock_switch)
1163		PANIC("thr_resume_wrapper, lock_switch != 0\n");
1164	thr_resume_check(curthread, ucp, NULL);
1165	_kse_critical_enter();
1166	curkse = _get_curkse();
1167	curthread->tcb->tcb_tmbx.tm_context = *ucp;
1168	curthread->error = err_save;
1169	ret = _thread_switch(curkse->k_kcb, curthread->tcb, 1);
1170	if (ret != 0)
1171		PANIC("thr_resume_wrapper: thread has returned "
1172		      "from _thread_switch");
1173	/* THR_SETCONTEXT(ucp); */ /* doesn't work -- why? */
1174}
1175
1176static void
1177thr_resume_check(struct pthread *curthread, ucontext_t *ucp,
1178    struct pthread_sigframe *psf)
1179{
1180	_thr_sig_rundown(curthread, ucp, psf);
1181
1182#ifdef NOT_YET
1183	if (((curthread->cancelflags & THR_AT_CANCEL_POINT) == 0) &&
1184	    ((curthread->cancelflags & PTHREAD_CANCEL_ASYNCHRONOUS) != 0))
1185		pthread_testcancel();
1186#endif
1187}
1188
1189/*
1190 * Clean up a thread.  This must be called with the thread's KSE
1191 * scheduling lock held.  The thread must be a thread from the
1192 * KSE's group.
1193 */
1194static void
1195thr_cleanup(struct kse *curkse, struct pthread *thread)
1196{
1197	struct pthread *joiner;
1198	struct kse_mailbox *kmbx = NULL;
1199	int sys_scope;
1200
1201	if ((joiner = thread->joiner) != NULL) {
1202		/* Joinee scheduler lock held; joiner won't leave. */
1203		if (joiner->kseg == curkse->k_kseg) {
1204			if (joiner->join_status.thread == thread) {
1205				joiner->join_status.thread = NULL;
1206				joiner->join_status.ret = thread->ret;
1207				(void)_thr_setrunnable_unlocked(joiner);
1208			}
1209		} else {
1210			KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
1211			/* The joiner may have removed itself and exited. */
1212			if (_thr_ref_add(thread, joiner, 0) == 0) {
1213				KSE_SCHED_LOCK(curkse, joiner->kseg);
1214				if (joiner->join_status.thread == thread) {
1215					joiner->join_status.thread = NULL;
1216					joiner->join_status.ret = thread->ret;
1217					kmbx = _thr_setrunnable_unlocked(joiner);
1218				}
1219				KSE_SCHED_UNLOCK(curkse, joiner->kseg);
1220				_thr_ref_delete(thread, joiner);
1221				if (kmbx != NULL)
1222					kse_wakeup(kmbx);
1223			}
1224			KSE_SCHED_LOCK(curkse, curkse->k_kseg);
1225		}
1226		thread->attr.flags |= PTHREAD_DETACHED;
1227	}
1228
1229	if (!(sys_scope = (thread->attr.flags & PTHREAD_SCOPE_SYSTEM))) {
1230		/*
1231		 * Remove the thread from the KSEG's list of threads.
1232	 	 */
1233		KSEG_THRQ_REMOVE(thread->kseg, thread);
1234		/*
1235		 * Migrate the thread to the main KSE so that this
1236		 * KSE and KSEG can be cleaned when their last thread
1237		 * exits.
1238		 */
1239		thread->kseg = _kse_initial->k_kseg;
1240		thread->kse = _kse_initial;
1241	}
1242	thread->flags |= THR_FLAGS_GC_SAFE;
1243
1244	/*
1245	 * We can't hold the thread list lock while holding the
1246	 * scheduler lock.
1247	 */
1248	KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
1249	DBG_MSG("Adding thread %p to GC list\n", thread);
1250	KSE_LOCK_ACQUIRE(curkse, &_thread_list_lock);
1251	THR_GCLIST_ADD(thread);
1252	/* Use thread_list_lock */
1253	active_threads--;
1254#ifdef SYSTEM_SCOPE_ONLY
1255	if (active_threads == 0) {
1256#else
1257	if (active_threads == 1) {
1258#endif
1259		KSE_LOCK_RELEASE(curkse, &_thread_list_lock);
1260		exit(0);
1261        }
1262	KSE_LOCK_RELEASE(curkse, &_thread_list_lock);
1263	if (sys_scope) {
1264		/*
1265		 * A system scope thread is alone in its thread group;
1266		 * when the thread exits, its kse and ksegrp should be
1267		 * recycled as well.  The kse's upcall stack belongs to
1268		 * the thread, so clear it here.
1269		 */
1270		curkse->k_stack.ss_sp = 0;
1271		curkse->k_stack.ss_size = 0;
1272		kse_exit();
1273		PANIC("kse_exit() failed for system scope thread");
1274	}
1275	KSE_SCHED_LOCK(curkse, curkse->k_kseg);
1276}
1277
1278void
1279_thr_gc(struct pthread *curthread)
1280{
1281	thread_gc(curthread);
1282	kse_gc(curthread);
1283	kseg_gc(curthread);
1284}
1285
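/*
 * Reclaim threads on the GC list whose stacks are no longer in use;
 * threads that are detached and unreferenced (and, for system scope
 * threads, their KSE and KSEG) are freed outright.
 */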
1286static void
1287thread_gc(struct pthread *curthread)
1288{
1289	struct pthread *td, *td_next;
1290	kse_critical_t crit;
1291	TAILQ_HEAD(, pthread) worklist;
1292
1293	TAILQ_INIT(&worklist);
1294	crit = _kse_critical_enter();
1295	KSE_LOCK_ACQUIRE(curthread->kse, &_thread_list_lock);
1296
1297	/* Check the threads waiting for GC. */
1298	for (td = TAILQ_FIRST(&_thread_gc_list); td != NULL; td = td_next) {
1299		td_next = TAILQ_NEXT(td, gcle);
1300		if ((td->flags & THR_FLAGS_GC_SAFE) == 0)
1301			continue;
1302		else if (((td->attr.flags & PTHREAD_SCOPE_SYSTEM) != 0) &&
1303		    ((td->kse->k_kcb->kcb_kmbx.km_flags & KMF_DONE) == 0)) {
1304			/*
1305			 * The thread and KSE are operating on the same
1306			 * stack.  Wait for the KSE to exit before freeing
1307			 * the thread's stack as well as everything else.
1308			 */
1309			continue;
1310		}
1311		/*
1312		 * Remove the thread from the GC list.  If the thread
1313		 * isn't yet detached, it will get added back to the
1314		 * GC list at a later time.
1315		 */
1316		THR_GCLIST_REMOVE(td);
1317		DBG_MSG("Freeing thread %p stack\n", td);
1318		/*
1319		 * We can free the thread stack since it's no longer
1320		 * in use.
1321		 */
1322		_thr_stack_free(&td->attr);
1323		if (((td->attr.flags & PTHREAD_DETACHED) != 0) &&
1324		    (td->refcount == 0)) {
1325			/*
1326			 * The thread has detached and is no longer
1327			 * referenced.  It is safe to remove all
1328			 * remnants of the thread.
1329			 */
1330			THR_LIST_REMOVE(td);
1331			TAILQ_INSERT_HEAD(&worklist, td, gcle);
1332		}
1333	}
1334	KSE_LOCK_RELEASE(curthread->kse, &_thread_list_lock);
1335	_kse_critical_leave(crit);
1336
1337	while ((td = TAILQ_FIRST(&worklist)) != NULL) {
1338		TAILQ_REMOVE(&worklist, td, gcle);
1339
1340		if ((td->attr.flags & PTHREAD_SCOPE_SYSTEM) != 0) {
1341			crit = _kse_critical_enter();
1342			KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
1343			kse_free_unlocked(td->kse);
1344			kseg_free_unlocked(td->kseg);
1345			KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
1346			_kse_critical_leave(crit);
1347		}
1348		/*
1349		 * XXX we don't free the initial thread, because there
1350		 * might still be code referencing it.
1351		 */
1352		if (td != _thr_initial) {
1353			DBG_MSG("Freeing thread %p\n", td);
1354			_thr_free(curthread, td);
1355		} else
1356			DBG_MSG("Initial thread won't be freed\n");
1357	}
1358}
1359
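/*
 * Trim the free KSE cache back down to MAX_CACHED_KSES, destroying
 * the excess KSEs; kseg_gc() below does the same for KSE groups.
 */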
1360static void
1361kse_gc(struct pthread *curthread)
1362{
1363	kse_critical_t crit;
1364	TAILQ_HEAD(, kse) worklist;
1365	struct kse *kse;
1366
1367	if (free_kse_count <= MAX_CACHED_KSES)
1368		return;
1369	TAILQ_INIT(&worklist);
1370	crit = _kse_critical_enter();
1371	KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
1372	while (free_kse_count > MAX_CACHED_KSES) {
1373		kse = TAILQ_FIRST(&free_kseq);
1374		TAILQ_REMOVE(&free_kseq, kse, k_qe);
1375		TAILQ_INSERT_HEAD(&worklist, kse, k_qe);
1376		free_kse_count--;
1377	}
1378	KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
1379	_kse_critical_leave(crit);
1380
1381	while ((kse = TAILQ_FIRST(&worklist))) {
1382		TAILQ_REMOVE(&worklist, kse, k_qe);
1383		kse_destroy(kse);
1384	}
1385}
1386
1387static void
1388kseg_gc(struct pthread *curthread)
1389{
1390	kse_critical_t crit;
1391	TAILQ_HEAD(, kse_group) worklist;
1392	struct kse_group *kseg;
1393
1394	if (free_kseg_count <= MAX_CACHED_KSEGS)
1395		return;
1396	crit = _kse_critical_enter();
1397	KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
1398	while (free_kseg_count > MAX_CACHED_KSEGS) {
1399		kseg = TAILQ_FIRST(&free_kse_groupq);
1400		TAILQ_REMOVE(&free_kse_groupq, kseg, kg_qe);
1401		free_kseg_count--;
1402		TAILQ_INSERT_HEAD(&worklist, kseg, kg_qe);
1403	}
1404	KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
1405	_kse_critical_leave(crit);
1406
1407	while ((kseg = TAILQ_FIRST(&worklist))) {
1408		TAILQ_REMOVE(&worklist, kseg, kg_qe);
1409		kseg_destroy(kseg);
1410	}
1411}
1412
1413/*
1414 * Only new threads that are running or suspended may be scheduled.
1415 */
1416int
1417_thr_schedule_add(struct pthread *curthread, struct pthread *newthread)
1418{
1419	kse_critical_t crit;
1420	int ret;
1421
1422	/* Add the new thread. */
1423	thr_link(newthread);
1424
1425	/*
1426	 * If this is the first time creating a thread, make sure
1427	 * the mailbox is set for the current thread.
1428	 */
1429	if ((newthread->attr.flags & PTHREAD_SCOPE_SYSTEM) != 0) {
1430		/* We use the thread's stack as the KSE's stack. */
1431		newthread->kse->k_kcb->kcb_kmbx.km_stack.ss_sp =
1432		    newthread->attr.stackaddr_attr;
1433		newthread->kse->k_kcb->kcb_kmbx.km_stack.ss_size =
1434		    newthread->attr.stacksize_attr;
1435
1436		/*
1437		 * No need to lock the scheduling queue since the
1438		 * KSE/KSEG pair have not yet been started.
1439		 */
1440		KSEG_THRQ_ADD(newthread->kseg, newthread);
1441		/* this thread never gives up kse */
1442		newthread->active = 1;
1443		newthread->kse->k_curthread = newthread;
1444		newthread->kse->k_kcb->kcb_kmbx.km_flags = KMF_BOUND;
1445		newthread->kse->k_kcb->kcb_kmbx.km_func =
1446		    (kse_func_t *)kse_sched_single;
1447		newthread->kse->k_kcb->kcb_kmbx.km_quantum = 0;
1448		KSE_SET_MBOX(newthread->kse, newthread);
1449		/*
1450		 * This thread needs a new KSE and KSEG.
1451		 */
1452		newthread->kse->k_flags &= ~KF_INITIALIZED;
1453		newthread->kse->k_flags |= KF_STARTED;
1454		/* Fire up! */
1455		ret = kse_create(&newthread->kse->k_kcb->kcb_kmbx, 1);
1456		if (ret != 0)
1457			ret = errno;
1458	}
1459	else {
1460		/*
1461		 * Lock the KSE and add the new thread to its list of
1462		 * assigned threads.  If the new thread is runnable, also
1463		 * add it to the KSE's run queue.
1464		 */
1465		crit = _kse_critical_enter();
1466		KSE_SCHED_LOCK(curthread->kse, newthread->kseg);
1467		KSEG_THRQ_ADD(newthread->kseg, newthread);
1468		if (newthread->state == PS_RUNNING)
1469			THR_RUNQ_INSERT_TAIL(newthread);
1470		if ((newthread->kse->k_flags & KF_STARTED) == 0) {
1471			/*
1472			 * This KSE hasn't been started yet.  Start it
1473			 * outside of holding the lock.
1474			 */
1475			newthread->kse->k_flags |= KF_STARTED;
1476			newthread->kse->k_kcb->kcb_kmbx.km_func =
1477			    (kse_func_t *)kse_sched_multi;
1478			newthread->kse->k_kcb->kcb_kmbx.km_flags = 0;
1479			kse_create(&newthread->kse->k_kcb->kcb_kmbx, 0);
1480		 } else if ((newthread->state == PS_RUNNING) &&
1481		     KSE_IS_IDLE(newthread->kse)) {
1482			/*
1483			 * The thread is being scheduled on another KSEG.
1484			 */
1485			kse_wakeup_one(newthread);
1486		}
1487		KSE_SCHED_UNLOCK(curthread->kse, newthread->kseg);
1488		_kse_critical_leave(crit);
1489		ret = 0;
1490	}
1491	if (ret != 0)
1492		thr_unlink(newthread);
1493
1494	return (ret);
1495}
1496
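/*
 * Insert a thread into its KSE's wait queue, which is kept sorted by
 * wakeup time; threads with no timeout (tv_sec == -1) go to the tail.
 * This is called with the scheduling lock held.
 */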
1497void
1498kse_waitq_insert(struct pthread *thread)
1499{
1500	struct pthread *td;
1501
1502	if (thread->wakeup_time.tv_sec == -1)
1503		TAILQ_INSERT_TAIL(&thread->kse->k_schedq->sq_waitq, thread,
1504		    pqe);
1505	else {
1506		td = TAILQ_FIRST(&thread->kse->k_schedq->sq_waitq);
1507		while ((td != NULL) && (td->wakeup_time.tv_sec != -1) &&
1508		    ((td->wakeup_time.tv_sec < thread->wakeup_time.tv_sec) ||
1509		    ((td->wakeup_time.tv_sec == thread->wakeup_time.tv_sec) &&
1510		    (td->wakeup_time.tv_nsec <= thread->wakeup_time.tv_nsec))))
1511			td = TAILQ_NEXT(td, pqe);
1512		if (td == NULL)
1513			TAILQ_INSERT_TAIL(&thread->kse->k_schedq->sq_waitq,
1514			    thread, pqe);
1515		else
1516			TAILQ_INSERT_BEFORE(td, thread, pqe);
1517	}
1518	thread->flags |= THR_FLAGS_IN_WAITQ;
1519}
1520
1521/*
1522 * This must be called with the scheduling lock held.
1523 */
1524static void
1525kse_check_completed(struct kse *kse)
1526{
1527	struct pthread *thread;
1528	struct kse_thr_mailbox *completed;
1529	int sig;
1530
1531	if ((completed = kse->k_kcb->kcb_kmbx.km_completed) != NULL) {
1532		kse->k_kcb->kcb_kmbx.km_completed = NULL;
1533		while (completed != NULL) {
1534			thread = completed->tm_udata;
1535			DBG_MSG("Found completed thread %p, name %s\n",
1536			    thread,
1537			    (thread->name == NULL) ? "none" : thread->name);
1538			thread->blocked = 0;
1539			if (thread != kse->k_curthread) {
1540				thr_accounting(thread);
1541				if ((thread->flags & THR_FLAGS_SUSPENDED) != 0)
1542					THR_SET_STATE(thread, PS_SUSPENDED);
1543				else
1544					KSE_RUNQ_INSERT_TAIL(kse, thread);
1545				if ((thread->kse != kse) &&
1546				    (thread->kse->k_curthread == thread)) {
1547					/*
1548					 * Remove this thread from its
1549					 * previous KSE so that it (the KSE)
1550					 * doesn't think it is still active.
1551					 */
1552					thread->kse->k_curthread = NULL;
1553					thread->active = 0;
1554				}
1555			}
1556			if ((sig = thread->tcb->tcb_tmbx.tm_syncsig.si_signo)
1557			    != 0) {
1558				if (SIGISMEMBER(thread->sigmask, sig))
1559					SIGADDSET(thread->sigpend, sig);
1560				else
1561					(void)_thr_sig_add(thread, sig,
1562					    &thread->tcb->tcb_tmbx.tm_syncsig);
1563				thread->tcb->tcb_tmbx.tm_syncsig.si_signo = 0;
1564			}
1565			completed = completed->tm_next;
1566		}
1567	}
1568}
1569
1570/*
1571 * This must be called with the scheduling lock held.
1572 */
1573static void
1574kse_check_waitq(struct kse *kse)
1575{
1576	struct pthread	*pthread;
1577	struct timespec ts;
1578
1579	KSE_GET_TOD(kse, &ts);
1580
1581	/*
1582	 * Wake up threads that have timedout.  This has to be
1583	 * done before adding the current thread to the run queue
1584	 * so that a CPU intensive thread doesn't get preference
1585	 * over waiting threads.
1586	 */
1587	while (((pthread = KSE_WAITQ_FIRST(kse)) != NULL) &&
1588	    thr_timedout(pthread, &ts)) {
1589		/* Remove the thread from the wait queue: */
1590		KSE_WAITQ_REMOVE(kse, pthread);
1591		DBG_MSG("Found timedout thread %p in waitq\n", pthread);
1592
1593		/* Indicate the thread timedout: */
1594		pthread->timeout = 1;
1595
1596		/* Add the thread to the priority queue: */
1597		if ((pthread->flags & THR_FLAGS_SUSPENDED) != 0)
1598			THR_SET_STATE(pthread, PS_SUSPENDED);
1599		else {
1600			THR_SET_STATE(pthread, PS_RUNNING);
1601			KSE_RUNQ_INSERT_TAIL(kse, pthread);
1602		}
1603	}
1604}
1605
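/*
 * Return non-zero if the thread's wakeup time has passed; a wakeup
 * time of (-1, -1) means the thread never times out.
 */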
1606static int
1607thr_timedout(struct pthread *thread, struct timespec *curtime)
1608{
1609	if (thread->wakeup_time.tv_sec < 0)
1610		return (0);
1611	else if (thread->wakeup_time.tv_sec > curtime->tv_sec)
1612		return (0);
1613	else if ((thread->wakeup_time.tv_sec == curtime->tv_sec) &&
1614	    (thread->wakeup_time.tv_nsec > curtime->tv_nsec))
1615		return (0);
1616	else
1617		return (1);
1618}
1619
1620/*
1621 * This must be called with the scheduling lock held.
1622 *
1623 * Each thread has a time slice, a wakeup time (used when it wants
1624 * to wait for a specified amount of time), a run state, and an
1625 * active flag.
1626 *
1627 * When a thread gets run by the scheduler, the active flag is
1628 * set to non-zero (1).  When a thread performs an explicit yield
1629 * or schedules a state change, it enters the scheduler and the
1630 * active flag is cleared.  When the active flag is still seen
1631 * set in the scheduler, that means that the thread is blocked in
1632 * the kernel (because it is cleared before entering the scheduler
1633 * in all other instances).
1634 *
1635 * The wakeup time is only set for those states that can timeout.
1636 * It is set to (-1, -1) for all other instances.
1637 *
1638 * The thread's run state, aside from being useful when debugging,
1639 * is used to place the thread in an appropriate queue.  There
1640 * are 2 basic queues:
1641 *
1642 *   o run queue - queue ordered by priority for all threads
1643 *                 that are runnable
1644 *   o waiting queue - queue sorted by wakeup time for all threads
1645 *                     that are not otherwise runnable (not blocked
1646 *                     in kernel, not waiting for locks)
1647 *
1648 * The thread's time slice is used for round-robin scheduling
1649 * (the default scheduling policy).  While a SCHED_RR thread
1650 * is runnable its time slice accumulates.  When it reaches
1651 * the time slice interval, the slice is reset and the thread is
1652 * added to the end of the queue of threads at its priority.  When
1653 * a thread is no longer runnable (blocks in the kernel, waits,
1654 * etc.), its time slice is reset.
1655 *
1656 * The job of kse_switchout_thread() is to handle all of the above.
1657 */
1658static void
1659kse_switchout_thread(struct kse *kse, struct pthread *thread)
1660{
1661	int level;
1662	int i;
1663	int restart;
1664	siginfo_t siginfo;
1665
1666	/*
1667	 * Place the currently running thread into the
1668	 * appropriate queue(s).
1669	 */
1670	DBG_MSG("Switching out thread %p, state %d\n", thread, thread->state);
1671
1672	THR_DEACTIVATE_LAST_LOCK(thread);
1673	if (thread->blocked != 0) {
1674		thread->active = 0;
1675		thread->need_switchout = 0;
1676		/* This thread must have blocked in the kernel. */
1677		/*
1678		 *  Check for pending signals for this thread to
1679		 *  see if we need to interrupt it in the kernel.
1680		 */
1681		if (thread->check_pending != 0) {
1682			for (i = 1; i <= _SIG_MAXSIG; ++i) {
1683				if (SIGISMEMBER(thread->sigpend, i) &&
1684				    !SIGISMEMBER(thread->sigmask, i)) {
1685					restart = _thread_sigact[i - 1].sa_flags & SA_RESTART;
1686					kse_thr_interrupt(&thread->tcb->tcb_tmbx,
1687					    restart ? KSE_INTR_RESTART : KSE_INTR_INTERRUPT, 0);
1688					break;
1689				}
1690			}
1691		}
1692	}
1693	else {
1694		switch (thread->state) {
1695		case PS_DEAD:
1696			/*
1697			 * The scheduler is operating on a different
1698			 * stack.  It is safe to do garbage collecting
1699			 * here.
1700			 */
1701			thread->active = 0;
1702			thread->need_switchout = 0;
1703			thread->lock_switch = 0;
1704			thr_cleanup(kse, thread);
1705			return;
1706			break;
1707
1708		case PS_RUNNING:
1709			if ((thread->flags & THR_FLAGS_SUSPENDED) != 0)
1710				THR_SET_STATE(thread, PS_SUSPENDED);
1711			break;
1712
1713		case PS_COND_WAIT:
1714		case PS_SLEEP_WAIT:
1715			/* Insert into the waiting queue: */
1716			KSE_WAITQ_INSERT(kse, thread);
1717			break;
1718
1719		case PS_LOCKWAIT:
1720			/*
1721			 * This state doesn't timeout.
1722			 */
1723			thread->wakeup_time.tv_sec = -1;
1724			thread->wakeup_time.tv_nsec = -1;
1725			level = thread->locklevel - 1;
1726			if (!_LCK_GRANTED(&thread->lockusers[level]))
1727				KSE_WAITQ_INSERT(kse, thread);
1728			else
1729				THR_SET_STATE(thread, PS_RUNNING);
1730			break;
1731
1732		case PS_SIGWAIT:
1733			KSE_WAITQ_INSERT(kse, thread);
1734			break;
1735		case PS_JOIN:
1736		case PS_MUTEX_WAIT:
1737		case PS_SIGSUSPEND:
1738		case PS_SUSPENDED:
1739		case PS_DEADLOCK:
1740		default:
1741			/*
1742			 * These states don't timeout.
1743			 */
1744			thread->wakeup_time.tv_sec = -1;
1745			thread->wakeup_time.tv_nsec = -1;
1746
1747			/* Insert into the waiting queue: */
1748			KSE_WAITQ_INSERT(kse, thread);
1749			break;
1750		}
1751		thr_accounting(thread);
1752		if (thread->state == PS_RUNNING) {
1753			if (thread->slice_usec == -1) {
1754				/*
1755				 * The thread exceeded its time quantum or
1756				 * it yielded the CPU; place it at the tail
1757				 * of the queue for its priority.
1758				 */
1759				KSE_RUNQ_INSERT_TAIL(kse, thread);
1760			} else {
1761				/*
1762				 * The thread hasn't exceeded its interval
1763				 * The thread hasn't exceeded its interval.
1764				 * Place it at the head of the queue for its
1765				 */
1766				KSE_RUNQ_INSERT_HEAD(kse, thread);
1767			}
1768		}
1769	}
1770	thread->active = 0;
1771	thread->need_switchout = 0;
1772	if (thread->check_pending != 0) {
1773		/* Install pending signals into the frame. */
1774		thread->check_pending = 0;
1775		KSE_LOCK_ACQUIRE(kse, &_thread_signal_lock);
1776		for (i = 1; i <= _SIG_MAXSIG; i++) {
1777			if (SIGISMEMBER(thread->sigmask, i))
1778				continue;
1779			if (SIGISMEMBER(thread->sigpend, i))
1780				(void)_thr_sig_add(thread, i,
1781				    &thread->siginfo[i-1]);
1782			else if (SIGISMEMBER(_thr_proc_sigpending, i) &&
1783				_thr_getprocsig_unlocked(i, &siginfo)) {
1784				(void)_thr_sig_add(thread, i, &siginfo);
1785			}
1786		}
1787		KSE_LOCK_RELEASE(kse, &_thread_signal_lock);
1788	}
1789}
1790
1791/*
1792 * This function waits for the smallest timeout value of any waiting
1793 * thread, or until it receives a message from another KSE.
1794 *
1795 * This must be called with the scheduling lock held.
1796 */
1797static void
1798kse_wait(struct kse *kse, struct pthread *td_wait, int sigseqno)
1799{
1800	struct timespec ts, ts_sleep;
1801	int saved_flags;
1802
1803	KSE_GET_TOD(kse, &ts);
1804
1805	if ((td_wait == NULL) || (td_wait->wakeup_time.tv_sec < 0)) {
1806		/* Limit sleep to no more than 1 minute. */
1807		ts_sleep.tv_sec = 60;
1808		ts_sleep.tv_nsec = 0;
1809	} else {
1810		TIMESPEC_SUB(&ts_sleep, &td_wait->wakeup_time, &ts);
1811		if (ts_sleep.tv_sec > 60) {
1812			ts_sleep.tv_sec = 60;
1813			ts_sleep.tv_nsec = 0;
1814		}
1815	}
1816	/* Don't sleep for negative times. */
1817	if ((ts_sleep.tv_sec >= 0) && (ts_sleep.tv_nsec >= 0)) {
1818		KSE_SET_IDLE(kse);
1819		kse->k_kseg->kg_idle_kses++;
1820		KSE_SCHED_UNLOCK(kse, kse->k_kseg);
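		/*
		 * Don't sleep if this KSE is bound to a single thread and
		 * a signal has arrived (k_sigseqno has advanced) since the
		 * caller sampled sigseqno; the wakeup would otherwise be
		 * missed.
		 */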
1821		if ((kse->k_kseg->kg_flags & KGF_SINGLE_THREAD) &&
1822		    (kse->k_sigseqno != sigseqno))
1823			; /* don't sleep */
1824		else {
1825			saved_flags = kse->k_kcb->kcb_kmbx.km_flags;
1826			kse->k_kcb->kcb_kmbx.km_flags |= KMF_NOUPCALL;
1827			kse_release(&ts_sleep);
1828			kse->k_kcb->kcb_kmbx.km_flags = saved_flags;
1829		}
1830		KSE_SCHED_LOCK(kse, kse->k_kseg);
1831		if (KSE_IS_IDLE(kse)) {
1832			KSE_CLEAR_IDLE(kse);
1833			kse->k_kseg->kg_idle_kses--;
1834		}
1835	}
1836}
1837
1838/*
1839 * Avoid calling this kse_exit() so as not to confuse it with the
1840 * system call of the same name.
1841 */
1842static void
1843kse_fini(struct kse *kse)
1844{
1845	/* struct kse_group *free_kseg = NULL; */
1846	struct timespec ts;
1847
1848	/*
1849	 * Check to see if this is one of the main kses.
1850	 */
1851	if (kse->k_kseg != _kse_initial->k_kseg) {
1852		PANIC("shouldn't get here");
1853		/* This is for supporting thread groups. */
1854#ifdef NOT_YET
1855		/* Remove this KSE from the KSEG's list of KSEs. */
1856		KSE_SCHED_LOCK(kse, kse->k_kseg);
1857		TAILQ_REMOVE(&kse->k_kseg->kg_kseq, kse, k_kgqe);
1858		kse->k_kseg->kg_ksecount--;
1859		if (TAILQ_EMPTY(&kse->k_kseg->kg_kseq))
1860			free_kseg = kse->k_kseg;
1861		KSE_SCHED_UNLOCK(kse, kse->k_kseg);
1862
1863		/*
1864		 * Add this KSE to the list of free KSEs along with
		 * the KSEG if it is now orphaned.
1866		 */
1867		KSE_LOCK_ACQUIRE(kse, &kse_lock);
1868		if (free_kseg != NULL)
1869			kseg_free_unlocked(free_kseg);
1870		kse_free_unlocked(kse);
1871		KSE_LOCK_RELEASE(kse, &kse_lock);
1872		kse_exit();
1873		/* Never returns. */
1874		PANIC("kse_exit()");
1875#endif
1876	} else {
1877#ifdef NOT_YET
1878		/*
1879		 * In the future, we might allow a program to kill
1880		 * KSEs in the initial group.
1881		 */
1882		if (kse != _kse_initial) {
1883			KSE_SCHED_LOCK(kse, kse->k_kseg);
1884			TAILQ_REMOVE(&kse->k_kseg->kg_kseq, kse, k_kgqe);
1885			kse->k_kseg->kg_ksecount--;
1886			KSE_SCHED_UNLOCK(kse, kse->k_kseg);
1887			KSE_LOCK_ACQUIRE(kse, &kse_lock);
1888			kse_free_unlocked(kse);
1889			KSE_LOCK_RELEASE(kse, &kse_lock);
1890			kse_exit();
1891			/* Never returns. */
1892			PANIC("kse_exit() failed for initial kseg");
1893		}
1894#endif
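		/*
		 * KSEs in the initial group are never destroyed.  Mark
		 * this one idle and let it block in the kernel; new work
		 * arrives as an upcall into the scheduler rather than as
		 * a return from kse_release().
		 */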
1895		KSE_SCHED_LOCK(kse, kse->k_kseg);
1896		KSE_SET_IDLE(kse);
1897		kse->k_kseg->kg_idle_kses++;
1898		KSE_SCHED_UNLOCK(kse, kse->k_kseg);
1899		ts.tv_sec = 120;
1900		ts.tv_nsec = 0;
1901		kse->k_kcb->kcb_kmbx.km_flags = 0;
1902		kse_release(&ts);
1903		/* Never reached. */
1904	}
1905}
1906
1907void
1908_thr_set_timeout(const struct timespec *timeout)
1909{
1910	struct pthread	*curthread = _get_curthread();
1911	struct timespec ts;
1912
1913	/* Reset the timeout flag for the running thread: */
1914	curthread->timeout = 0;
1915
1916	/* Check if the thread is to wait forever: */
1917	if (timeout == NULL) {
1918		/*
1919		 * Set the wakeup time to something that can be recognised as
1920		 * different to an actual time of day:
1921		 */
1922		curthread->wakeup_time.tv_sec = -1;
1923		curthread->wakeup_time.tv_nsec = -1;
1924	}
1925	/* Check if no waiting is required: */
1926	else if ((timeout->tv_sec == 0) && (timeout->tv_nsec == 0)) {
1927		/* Set the wake up time to 'immediately': */
1928		curthread->wakeup_time.tv_sec = 0;
1929		curthread->wakeup_time.tv_nsec = 0;
1930	} else {
1931		/* Calculate the time for the current thread to wakeup: */
1932		KSE_GET_TOD(curthread->kse, &ts);
1933		TIMESPEC_ADD(&curthread->wakeup_time, &ts, timeout);
1934	}
1935}
1936
1937void
1938_thr_panic_exit(char *file, int line, char *msg)
1939{
1940	char buf[256];
1941
1942	snprintf(buf, sizeof(buf), "(%s:%d) %s\n", file, line, msg);
1943	__sys_write(2, buf, strlen(buf));
1944	abort();
1945}
1946
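/*
 * Make a thread runnable.  The scheduling lock of the thread's KSE
 * group is held across _thr_setrunnable_unlocked(); waking the chosen
 * KSE's mailbox is deferred until after the lock has been dropped.
 */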
1947void
1948_thr_setrunnable(struct pthread *curthread, struct pthread *thread)
1949{
1950	kse_critical_t crit;
1951	struct kse_mailbox *kmbx;
1952
1953	crit = _kse_critical_enter();
1954	KSE_SCHED_LOCK(curthread->kse, thread->kseg);
1955	kmbx = _thr_setrunnable_unlocked(thread);
1956	KSE_SCHED_UNLOCK(curthread->kse, thread->kseg);
1957	_kse_critical_leave(crit);
1958	if (kmbx != NULL)
1959		kse_wakeup(kmbx);
1960}
1961
1962struct kse_mailbox *
1963_thr_setrunnable_unlocked(struct pthread *thread)
1964{
1965	struct kse_mailbox *kmbx = NULL;
1966
1967	if ((thread->kseg->kg_flags & KGF_SINGLE_THREAD) != 0) {
1968		/* No silly queues for these threads. */
1969		if ((thread->flags & THR_FLAGS_SUSPENDED) != 0)
1970			THR_SET_STATE(thread, PS_SUSPENDED);
1971		else {
1972			THR_SET_STATE(thread, PS_RUNNING);
1973			kmbx = kse_wakeup_one(thread);
1974		}
1975
1976	} else if (thread->state != PS_RUNNING) {
1977		if ((thread->flags & THR_FLAGS_IN_WAITQ) != 0)
1978			KSE_WAITQ_REMOVE(thread->kse, thread);
1979		if ((thread->flags & THR_FLAGS_SUSPENDED) != 0)
1980			THR_SET_STATE(thread, PS_SUSPENDED);
1981		else {
1982			THR_SET_STATE(thread, PS_RUNNING);
1983			if ((thread->blocked == 0) && (thread->active == 0) &&
1984			    (thread->flags & THR_FLAGS_IN_RUNQ) == 0)
1985				THR_RUNQ_INSERT_TAIL(thread);
1986			/*
1987			 * XXX - Threads are not yet assigned to specific
1988			 *       KSEs; they are assigned to the KSEG.  So
1989			 *       the fact that a thread's KSE is waiting
1990			 *       doesn't necessarily mean that it will be
1991			 *       the KSE that runs the thread after the
1992			 *       lock is granted.  But we don't know if the
1993			 *       other KSEs within the same KSEG are also
1994			 *       in a waiting state or not so we err on the
1995			 *       side of caution and wake up the thread's
1996			 *       last known KSE.  We ensure that the
1997			 *       thread's KSE doesn't change while its
1998			 *       scheduling lock is held, so it is safe to
1999			 *       reference it (the KSE).  If the KSE wakes
2000			 *       up and doesn't find any more work it will
2001			 *       again go back to waiting so no harm is
2002			 *       done.
2003			 */
2004			kmbx = kse_wakeup_one(thread);
2005		}
2006	}
2007	return (kmbx);
2008}
2009
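/*
 * Find an idle KSE to run the given thread.  The thread's last known
 * KSE is preferred if it is idle; otherwise any idle KSE in the
 * thread's group is used.  The chosen KSE is marked non-idle and its
 * mailbox is returned for kse_wakeup(); NULL is returned if no KSE
 * in the group is idle.
 */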
2010static struct kse_mailbox *
2011kse_wakeup_one(struct pthread *thread)
2012{
2013	struct kse *ke;
2014
2015	if (KSE_IS_IDLE(thread->kse)) {
2016		KSE_CLEAR_IDLE(thread->kse);
2017		thread->kseg->kg_idle_kses--;
2018		return (&thread->kse->k_kcb->kcb_kmbx);
2019	} else {
2020		TAILQ_FOREACH(ke, &thread->kseg->kg_kseq, k_kgqe) {
2021			if (KSE_IS_IDLE(ke)) {
2022				KSE_CLEAR_IDLE(ke);
2023				ke->k_kseg->kg_idle_kses--;
2024				return (&ke->k_kcb->kcb_kmbx);
2025			}
2026		}
2027	}
2028	return (NULL);
2029}
2030
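/*
 * Wake up idle KSEs in the current KSE's group, one for each thread
 * on this KSE's run queue, so runnable threads don't sit queued while
 * KSEs in the group are sleeping.
 */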
2031static void
2032kse_wakeup_multi(struct kse *curkse)
2033{
2034	struct kse *ke;
2035	int tmp;
2036
2037	if ((tmp = KSE_RUNQ_THREADS(curkse)) && curkse->k_kseg->kg_idle_kses) {
2038		TAILQ_FOREACH(ke, &curkse->k_kseg->kg_kseq, k_kgqe) {
2039			if (KSE_IS_IDLE(ke)) {
2040				KSE_CLEAR_IDLE(ke);
2041				ke->k_kseg->kg_idle_kses--;
2042				KSE_WAKEUP(ke);
2043				if (--tmp == 0)
2044					break;
2045			}
2046		}
2047	}
2048}
2049
2050/*
2051 * Allocate a new KSEG.
2052 *
2053 * We allow the current thread to be NULL in the case that this
2054 * is the first time a KSEG is being created (library initialization).
2055 * In this case, we don't need to (and can't) take any locks.
2056 */
2057struct kse_group *
2058_kseg_alloc(struct pthread *curthread)
2059{
2060	struct kse_group *kseg = NULL;
2061	kse_critical_t crit;
2062
2063	if ((curthread != NULL) && (free_kseg_count > 0)) {
2064		/* Use the kse lock for the kseg queue. */
2065		crit = _kse_critical_enter();
2066		KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
2067		if ((kseg = TAILQ_FIRST(&free_kse_groupq)) != NULL) {
2068			TAILQ_REMOVE(&free_kse_groupq, kseg, kg_qe);
2069			free_kseg_count--;
2070			active_kseg_count++;
2071			TAILQ_INSERT_TAIL(&active_kse_groupq, kseg, kg_qe);
2072		}
2073		KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
2074		_kse_critical_leave(crit);
2075		if (kseg)
2076			kseg_reinit(kseg);
2077	}
2078
2079	/*
2080	 * If a cached KSE group wasn't found in the free list above,
2081	 * attempt to allocate a new one along with its priority run
2082	 * queue.
2083	 */
2084	if ((kseg == NULL) &&
2085	    ((kseg = (struct kse_group *)malloc(sizeof(*kseg))) != NULL)) {
2086		if (_pq_alloc(&kseg->kg_schedq.sq_runq,
2087		    THR_MIN_PRIORITY, THR_LAST_PRIORITY) != 0) {
2088			free(kseg);
2089			kseg = NULL;
2090		} else {
2091			kseg_init(kseg);
2092			/* Add the KSEG to the list of active KSEGs. */
2093			if (curthread != NULL) {
2094				crit = _kse_critical_enter();
2095				KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
2096				active_kseg_count++;
2097				TAILQ_INSERT_TAIL(&active_kse_groupq,
2098				    kseg, kg_qe);
2099				KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
2100				_kse_critical_leave(crit);
2101			} else {
2102				active_kseg_count++;
2103				TAILQ_INSERT_TAIL(&active_kse_groupq,
2104				    kseg, kg_qe);
2105			}
2106		}
2107	}
2108	return (kseg);
2109}
2110
2111static void
2112kseg_init(struct kse_group *kseg)
2113{
2114	kseg_reinit(kseg);
2115	_lock_init(&kseg->kg_lock, LCK_ADAPTIVE, _kse_lock_wait,
2116	    _kse_lock_wakeup);
2117}
2118
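/*
 * Reset the queues and counters of a KSE group.  This is used both
 * when initializing a newly allocated group (via kseg_init()) and
 * when recycling a group from the free list; the group's lock is
 * not touched here.
 */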
2119static void
2120kseg_reinit(struct kse_group *kseg)
2121{
2122	TAILQ_INIT(&kseg->kg_kseq);
2123	TAILQ_INIT(&kseg->kg_threadq);
2124	TAILQ_INIT(&kseg->kg_schedq.sq_waitq);
2125	kseg->kg_threadcount = 0;
2126	kseg->kg_ksecount = 0;
2127	kseg->kg_idle_kses = 0;
2128	kseg->kg_flags = 0;
2129}
2130
2131/*
2132 * This must be called with the kse lock held and when there are
2133 * no more threads that reference it.
2134 */
2135static void
2136kseg_free_unlocked(struct kse_group *kseg)
2137{
2138	TAILQ_REMOVE(&active_kse_groupq, kseg, kg_qe);
2139	TAILQ_INSERT_HEAD(&free_kse_groupq, kseg, kg_qe);
2140	free_kseg_count++;
2141	active_kseg_count--;
2142}
2143
2144void
2145_kseg_free(struct kse_group *kseg)
2146{
2147	struct kse *curkse;
2148	kse_critical_t crit;
2149
2150	crit = _kse_critical_enter();
2151	curkse = _get_curkse();
2152	KSE_LOCK_ACQUIRE(curkse, &kse_lock);
2153	kseg_free_unlocked(kseg);
2154	KSE_LOCK_RELEASE(curkse, &kse_lock);
2155	_kse_critical_leave(crit);
2156}
2157
2158static void
2159kseg_destroy(struct kse_group *kseg)
2160{
2161	_lock_destroy(&kseg->kg_lock);
2162	_pq_free(&kseg->kg_schedq.sq_runq);
2163	free(kseg);
2164}
2165
2166/*
2167 * Allocate a new KSE.
2168 *
2169 * We allow the current thread to be NULL in the case that this
2170 * is the first time a KSE is being created (library initialization).
2171 * In this case, we don't need to (and can't) take any locks.
2172 */
2173struct kse *
2174_kse_alloc(struct pthread *curthread, int sys_scope)
2175{
2176	struct kse *kse = NULL;
2177	char *stack;
2178	kse_critical_t crit;
2179	int i;
2180
2181	if ((curthread != NULL) && (free_kse_count > 0)) {
2182		crit = _kse_critical_enter();
2183		KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
2184		/* Search for a finished KSE. */
2185		kse = TAILQ_FIRST(&free_kseq);
2186		while ((kse != NULL) &&
2187		    ((kse->k_kcb->kcb_kmbx.km_flags & KMF_DONE) == 0)) {
2188			kse = TAILQ_NEXT(kse, k_qe);
2189		}
2190		if (kse != NULL) {
2191			DBG_MSG("found an unused kse.\n");
2192			TAILQ_REMOVE(&free_kseq, kse, k_qe);
2193			free_kse_count--;
2194			TAILQ_INSERT_TAIL(&active_kseq, kse, k_qe);
2195			active_kse_count++;
2196		}
2197		KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
2198		_kse_critical_leave(crit);
2199		if (kse != NULL)
2200			kse_reinit(kse, sys_scope);
2201	}
2202	if ((kse == NULL) &&
2203	    ((kse = (struct kse *)malloc(sizeof(*kse))) != NULL)) {
2204		if (sys_scope != 0)
2205			stack = NULL;
2206		else if ((stack = malloc(KSE_STACKSIZE)) == NULL) {
2207			free(kse);
2208			return (NULL);
2209		}
2210		bzero(kse, sizeof(*kse));
2211
2212		/* Initialize KCB without the lock. */
2213		if ((kse->k_kcb = _kcb_ctor(kse)) == NULL) {
2214			if (stack != NULL)
2215				free(stack);
2216			free(kse);
2217			return (NULL);
2218		}
2219
2220		/* Initialize the lockusers. */
2221		for (i = 0; i < MAX_KSE_LOCKLEVEL; i++) {
2222			_lockuser_init(&kse->k_lockusers[i], (void *)kse);
2223			_LCK_SET_PRIVATE2(&kse->k_lockusers[i], NULL);
2224		}
2225		/* _lock_init(kse->k_lock, ...) */
2226
2227		if (curthread != NULL) {
2228			crit = _kse_critical_enter();
2229			KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
2230		}
2231		kse->k_flags = 0;
2232		TAILQ_INSERT_TAIL(&active_kseq, kse, k_qe);
2233		active_kse_count++;
2234		if (curthread != NULL) {
2235			KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
2236			_kse_critical_leave(crit);
2237		}
2238		/*
2239		 * Create the KSE context.
2240		 * Scope system threads (one thread per KSE) are not required
2241		 * to have a stack for an unneeded kse upcall.
2242		 */
2243		if (!sys_scope) {
2244			kse->k_kcb->kcb_kmbx.km_func = (kse_func_t *)kse_sched_multi;
2245			kse->k_stack.ss_sp = stack;
2246			kse->k_stack.ss_size = KSE_STACKSIZE;
2247		} else {
2248			kse->k_kcb->kcb_kmbx.km_func = (kse_func_t *)kse_sched_single;
2249			kse->k_stack.ss_sp = NULL;
2250			kse->k_stack.ss_size = 0;
2251		}
2252		kse->k_kcb->kcb_kmbx.km_udata = (void *)kse;
2253		kse->k_kcb->kcb_kmbx.km_quantum = 20000;
2254		/*
2255		 * We need to keep a copy of the stack in case it
2256		 * doesn't get used; a KSE running a scope system
2257		 * thread will use that thread's stack.
2258		 */
2259		kse->k_kcb->kcb_kmbx.km_stack = kse->k_stack;
2260	}
2261	return (kse);
2262}
2263
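/*
 * Reset a cached KSE for reuse.  Process-scope KSEs get an upcall
 * stack and the multi-threaded scheduler; system-scope KSEs use the
 * single-threaded scheduler and run on their thread's stack, so any
 * upcall stack is released.
 */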
2264static void
2265kse_reinit(struct kse *kse, int sys_scope)
2266{
2267	if (!sys_scope) {
2268		kse->k_kcb->kcb_kmbx.km_func = (kse_func_t *)kse_sched_multi;
2269		if (kse->k_stack.ss_sp == NULL) {
2270			/* XXX check allocation failure */
2271			kse->k_stack.ss_sp = (char *) malloc(KSE_STACKSIZE);
2272			kse->k_stack.ss_size = KSE_STACKSIZE;
2273		}
2274		kse->k_kcb->kcb_kmbx.km_quantum = 20000;
2275	} else {
2276		kse->k_kcb->kcb_kmbx.km_func = (kse_func_t *)kse_sched_single;
2277		if (kse->k_stack.ss_sp)
2278			free(kse->k_stack.ss_sp);
2279		kse->k_stack.ss_sp = NULL;
2280		kse->k_stack.ss_size = 0;
2281		kse->k_kcb->kcb_kmbx.km_quantum = 0;
2282	}
2283	kse->k_kcb->kcb_kmbx.km_stack = kse->k_stack;
2284	kse->k_kcb->kcb_kmbx.km_udata = (void *)kse;
2285	kse->k_kcb->kcb_kmbx.km_curthread = NULL;
2286	kse->k_kcb->kcb_kmbx.km_flags = 0;
2287	kse->k_curthread = NULL;
2288	kse->k_kseg = NULL;
2289	kse->k_schedq = NULL;
2290	kse->k_locklevel = 0;
2291	SIGEMPTYSET(kse->k_sigmask);
2292	bzero(&kse->k_sigq, sizeof(kse->k_sigq));
2293	kse->k_check_sigq = 0;
2294	kse->k_flags = 0;
2295	kse->k_waiting = 0;
2296	kse->k_idle = 0;
2297	kse->k_error = 0;
2298	kse->k_cpu = 0;
2299	kse->k_done = 0;
2300	kse->k_switch = 0;
2301	kse->k_sigseqno = 0;
2302}
2303
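/*
 * Move a KSE from the active list to the free list and reset the
 * fields needed for reuse.  As with kseg_free_unlocked(), the caller
 * is expected to hold the kse lock (or to be running single-threaded
 * during initialization).
 */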
2304void
2305kse_free_unlocked(struct kse *kse)
2306{
2307	TAILQ_REMOVE(&active_kseq, kse, k_qe);
2308	active_kse_count--;
2309	kse->k_kseg = NULL;
2310	kse->k_kcb->kcb_kmbx.km_quantum = 20000;
2311	kse->k_flags = 0;
2312	TAILQ_INSERT_HEAD(&free_kseq, kse, k_qe);
2313	free_kse_count++;
2314}
2315
2316void
2317_kse_free(struct pthread *curthread, struct kse *kse)
2318{
2319	kse_critical_t crit;
2320
2321	if (curthread == NULL)
2322		kse_free_unlocked(kse);
2323	else {
2324		crit = _kse_critical_enter();
2325		KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
2326		kse_free_unlocked(kse);
2327		KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
2328		_kse_critical_leave(crit);
2329	}
2330}
2331
2332static void
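/*
 * Completely tear down a KSE that is not being cached: free its
 * upcall stack, destroy its KCB, lockusers, and lock, then release
 * the structure itself.
 */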
2333kse_destroy(struct kse *kse)
2334{
2335	int i;
2336
2337	if (kse->k_stack.ss_sp != NULL)
2338		free(kse->k_stack.ss_sp);
2339	_kcb_dtor(kse->k_kcb);
2340	for (i = 0; i < MAX_KSE_LOCKLEVEL; ++i)
2341		_lockuser_destroy(&kse->k_lockusers[i]);
2342	_lock_destroy(&kse->k_lock);
2343	free(kse);
2344}
2345
2346struct pthread *
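/*
 * Allocate a thread structure.  A cached thread from the free list
 * is preferred (after running the garbage collector when needed);
 * otherwise a new structure and its TCB are allocated.
 */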
2347_thr_alloc(struct pthread *curthread)
2348{
2349	kse_critical_t crit;
2350	struct pthread *thread = NULL;
2351
2352	if (curthread != NULL) {
2353		if (GC_NEEDED())
2354			_thr_gc(curthread);
2355		if (free_thread_count > 0) {
2356			crit = _kse_critical_enter();
2357			KSE_LOCK_ACQUIRE(curthread->kse, &thread_lock);
2358			if ((thread = TAILQ_FIRST(&free_threadq)) != NULL) {
2359				TAILQ_REMOVE(&free_threadq, thread, tle);
2360				free_thread_count--;
2361			}
2362			KSE_LOCK_RELEASE(curthread->kse, &thread_lock);
2363			_kse_critical_leave(crit);
2364		}
2365	}
2366	if ((thread == NULL) &&
2367	    ((thread = malloc(sizeof(struct pthread))) != NULL)) {
2368		bzero(thread, sizeof(struct pthread));
2369		if ((thread->tcb = _tcb_ctor(thread)) == NULL) {
2370			free(thread);
2371			thread = NULL;
2372		}
2373	}
2374	return (thread);
2375}
2376
2377void
2378_thr_free(struct pthread *curthread, struct pthread *thread)
2379{
2380	kse_critical_t crit;
2381	int i;
2382
2383	DBG_MSG("Freeing thread %p\n", thread);
2384	if ((curthread == NULL) || (free_thread_count >= MAX_CACHED_THREADS)) {
2385		for (i = 0; i < MAX_THR_LOCKLEVEL; i++) {
2386			_lockuser_destroy(&thread->lockusers[i]);
2387		}
2388		_lock_destroy(&thread->lock);
2389		_tcb_dtor(thread->tcb);
2390		free(thread);
2391	}
2392	else {
2393		/* Reinitialize any important fields here. */
2394		thread->lock_switch = 0;
2395		sigemptyset(&thread->sigpend);
2396		thread->check_pending = 0;
2397
2398		/* Add the thread to the free thread list. */
2399		crit = _kse_critical_enter();
2400		KSE_LOCK_ACQUIRE(curthread->kse, &thread_lock);
2401		TAILQ_INSERT_TAIL(&free_threadq, thread, tle);
2402		free_thread_count++;
2403		KSE_LOCK_RELEASE(curthread->kse, &thread_lock);
2404		_kse_critical_leave(crit);
2405	}
2406}
2407
2408/*
2409 * Add an active thread:
2410 *
2411 *   o Assign the thread a unique id (which GDB uses to track
2412 *     threads).
2413 *   o Add the thread to the list of all threads and increment
2414 *     number of active threads.
2415 */
2416static void
2417thr_link(struct pthread *thread)
2418{
2419	kse_critical_t crit;
2420	struct kse *curkse;
2421	struct pthread *curthread;
2422
2423	crit = _kse_critical_enter();
2424	curkse = _get_curkse();
2425	curthread = _get_curthread();
2426	thread->sigmask = curthread->sigmask;
2427	KSE_LOCK_ACQUIRE(curkse, &_thread_list_lock);
2428	/*
2429	 * Initialize the unique id (which GDB uses to track
2430	 * threads), add the thread to the list of all threads,
2431	 * and increment the number of active threads.
2432	 */
2433	thread->uniqueid = next_uniqueid++;
2434	THR_LIST_ADD(thread);
2435	active_threads++;
2436	KSE_LOCK_RELEASE(curkse, &_thread_list_lock);
2437	_kse_critical_leave(crit);
2438}
2439
2440/*
2441 * Remove an active thread.
2442 */
2443static void
2444thr_unlink(struct pthread *thread)
2445{
2446	kse_critical_t crit;
2447	struct kse *curkse;
2448
2449	crit = _kse_critical_enter();
2450	curkse = _get_curkse();
2451	KSE_LOCK_ACQUIRE(curkse, &_thread_list_lock);
2452	THR_LIST_REMOVE(thread);
2453	active_threads--;
2454	KSE_LOCK_RELEASE(curkse, &_thread_list_lock);
2455	_kse_critical_leave(crit);
2456}
2457
2458void
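/*
 * Thread hash table helpers.  Threads are hashed by their pointer;
 * _thr_hash_find() returns the thread if it is present in the table,
 * or NULL if it is not.
 */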
2459_thr_hash_add(struct pthread *thread)
2460{
2461	struct thread_hash_head *head;
2462
2463	head = &thr_hashtable[THREAD_HASH(thread)];
2464	LIST_INSERT_HEAD(head, thread, hle);
2465}
2466
2467void
2468_thr_hash_remove(struct pthread *thread)
2469{
2470	LIST_REMOVE(thread, hle);
2471}
2472
2473struct pthread *
2474_thr_hash_find(struct pthread *thread)
2475{
2476	struct pthread *td;
2477	struct thread_hash_head *head;
2478
2479	head = &thr_hashtable[THREAD_HASH(thread)];
2480	LIST_FOREACH(td, head, hle) {
2481		if (td == thread)
2482			return (thread);
2483	}
2484	return (NULL);
2485}
2486
2487