1/*
2 * Copyright (C) 2003 Daniel M. Eischen <deischen@freebsd.org>
3 * Copyright (C) 2002 Jonathon Mini <mini@freebsd.org>
4 * Copyright (c) 1995-1998 John Birrell <jb@cimlogic.com.au>
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 * 3. All advertising materials mentioning features or use of this software
16 *    must display the following acknowledgement:
17 *	This product includes software developed by John Birrell.
18 * 4. Neither the name of the author nor the names of any co-contributors
19 *    may be used to endorse or promote products derived from this software
20 *    without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY JOHN BIRRELL AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 */
35#include <sys/cdefs.h>
36__FBSDID("$FreeBSD: head/lib/libkse/thread/thr_kern.c 118817 2003-08-12 08:01:34Z davidxu $");
37
38#include <sys/types.h>
39#include <sys/kse.h>
40#include <sys/signalvar.h>
41#include <sys/queue.h>
42#include <machine/atomic.h>
43#include <machine/sigframe.h>
44
45#include <assert.h>
46#include <errno.h>
47#include <signal.h>
48#include <stdlib.h>
49#include <string.h>
50#include <time.h>
51#include <ucontext.h>
52#include <unistd.h>
53
54#include "atomic_ops.h"
55#include "thr_private.h"
56#include "libc_private.h"
57
58/*#define DEBUG_THREAD_KERN */
59#ifdef DEBUG_THREAD_KERN
60#define DBG_MSG		stdout_debug
61#else
62#define DBG_MSG(x...)
63#endif
64
65/*
66 * Define a high water mark for the maximum number of threads that
67 * will be cached.  Once this level is reached, any extra threads
68 * will be free()'d.
69 */
70#define	MAX_CACHED_THREADS	100
71/*
72 * Define high water marks for the maximum number of KSEs and KSE groups
73 * that will be cached. Because we support 1:1 threading, there could be
74 * the same number of KSEs and KSE groups as threads. Once these levels are
75 * reached, any extra KSEs and KSE groups will be free()'d.
76 */
77#ifdef SYSTEM_SCOPE_ONLY
78#define	MAX_CACHED_KSES		100
79#define	MAX_CACHED_KSEGS	100
80#else
81#define	MAX_CACHED_KSES		50
82#define	MAX_CACHED_KSEGS	50
83#endif
84
85#define	KSE_STACKSIZE		16384
86
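/*
 * Point the KSE's mailbox at the thread's thread mailbox so the kernel
 * knows which thread this KSE is currently running.  In this file it is
 * used when binding a thread to a KSE (system scope threads) and when
 * handing the initial thread to the initial KSE.
 */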
87#define	KSE_SET_MBOX(kse, thrd) \
88	(kse)->k_kcb->kcb_kmbx.km_curthread = &(thrd)->tcb->tcb_tmbx
89
90#define	KSE_SET_EXITED(kse)	(kse)->k_flags |= KF_EXITED
91
92/*
93 * Macros for manipulating the run queues.  The priority queue
94 * routines use the thread's pqe link and also handle the setting
95 * and clearing of the thread's THR_FLAGS_IN_RUNQ flag.
96 */
97#define	KSE_RUNQ_INSERT_HEAD(kse, thrd)			\
98	_pq_insert_head(&(kse)->k_schedq->sq_runq, thrd)
99#define	KSE_RUNQ_INSERT_TAIL(kse, thrd)			\
100	_pq_insert_tail(&(kse)->k_schedq->sq_runq, thrd)
101#define	KSE_RUNQ_REMOVE(kse, thrd)			\
102	_pq_remove(&(kse)->k_schedq->sq_runq, thrd)
103#define	KSE_RUNQ_FIRST(kse)	_pq_first(&(kse)->k_schedq->sq_runq)
104
105#define KSE_RUNQ_THREADS(kse)	((kse)->k_schedq->sq_runq.pq_threads)
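
/*
 * Typical use of the run queue macros (as in kse_sched_multi()): with
 * the KSEG's scheduling lock held, pick and dequeue the highest
 * priority runnable thread:
 *
 *	td = KSE_RUNQ_FIRST(curkse);
 *	if (td != NULL)
 *		KSE_RUNQ_REMOVE(curkse, td);
 */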
106
107/*
108 * We've got to keep track of everything that is allocated, not only
109 * to have a speedy free list, but also so they can be deallocated
110 * after a fork().
111 */
112static TAILQ_HEAD(, kse)	active_kseq;
113static TAILQ_HEAD(, kse)	free_kseq;
114static TAILQ_HEAD(, kse_group)	free_kse_groupq;
115static TAILQ_HEAD(, kse_group)	active_kse_groupq;
116static TAILQ_HEAD(, kse_group)	gc_ksegq;
117static struct lock		kse_lock;	/* also used for kseg queue */
118static int			free_kse_count = 0;
119static int			free_kseg_count = 0;
120static TAILQ_HEAD(, pthread)	free_threadq;
121static struct lock		thread_lock;
122static int			free_thread_count = 0;
123static int			inited = 0;
124static int			active_threads = 1;
125static int			active_kse_count = 0;
126static int			active_kseg_count = 0;
127static u_int64_t		next_uniqueid = 1;
128
129LIST_HEAD(thread_hash_head, pthread);
130#define THREAD_HASH_QUEUES	127
131static struct thread_hash_head	thr_hashtable[THREAD_HASH_QUEUES];
132#define	THREAD_HASH(thrd)	((unsigned long)thrd % THREAD_HASH_QUEUES)
133
134#ifdef DEBUG_THREAD_KERN
135static void	dump_queues(struct kse *curkse);
136#endif
137static void	kse_check_completed(struct kse *kse);
138static void	kse_check_waitq(struct kse *kse);
139static void	kse_fini(struct kse *curkse);
140static void	kse_reinit(struct kse *kse, int sys_scope);
141static void	kse_sched_multi(struct kse_mailbox *kmbx);
142static void	kse_sched_single(struct kse_mailbox *kmbx);
143static void	kse_switchout_thread(struct kse *kse, struct pthread *thread);
144static void	kse_wait(struct kse *kse, struct pthread *td_wait, int sigseq);
145static void	kse_free_unlocked(struct kse *kse);
146static void	kse_destroy(struct kse *kse);
147static void	kseg_free_unlocked(struct kse_group *kseg);
148static void	kseg_init(struct kse_group *kseg);
149static void	kseg_reinit(struct kse_group *kseg);
150static void	kseg_destroy(struct kse_group *kseg);
151static void	kse_waitq_insert(struct pthread *thread);
152static void	kse_wakeup_multi(struct kse *curkse);
153static struct kse_mailbox *kse_wakeup_one(struct pthread *thread);
154static void	thr_cleanup(struct kse *kse, struct pthread *curthread);
155static void	thr_link(struct pthread *thread);
156static void	thr_resume_wrapper(int sig, siginfo_t *, ucontext_t *);
157static void	thr_resume_check(struct pthread *curthread, ucontext_t *ucp,
158		    struct pthread_sigframe *psf);
159static int	thr_timedout(struct pthread *thread, struct timespec *curtime);
160static void	thr_unlink(struct pthread *thread);
161static void	thread_gc(struct pthread *thread);
162static void	kse_gc(struct pthread *thread);
163static void	kseg_gc(struct pthread *thread);
164
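/*
 * Charge the user and system ticks reported in the thread's mailbox
 * against its time slice.  Once the slice exceeds TIMESLICE_USEC it is
 * marked expired (-1) so the scheduler will requeue the thread at the
 * tail of its priority queue; SCHED_FIFO threads skip this accounting.
 * The mailbox tick counters are cleared afterwards.
 */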
165static void __inline
166thr_accounting(struct pthread *thread)
167{
168	if ((thread->slice_usec != -1) &&
169	    (thread->slice_usec <= TIMESLICE_USEC) &&
170	    (thread->attr.sched_policy != SCHED_FIFO)) {
171		thread->slice_usec += (thread->tcb->tcb_tmbx.tm_uticks
172		    + thread->tcb->tcb_tmbx.tm_sticks) * _clock_res_usec;
173		/* Check for time quantum exceeded: */
174		if (thread->slice_usec > TIMESLICE_USEC)
175			thread->slice_usec = -1;
176	}
177	thread->tcb->tcb_tmbx.tm_uticks = 0;
178	thread->tcb->tcb_tmbx.tm_sticks = 0;
179}
180
181/*
182 * This is called after a fork().
183 * No locks need to be taken here since we are guaranteed to be
184 * single threaded.
185 *
186 * XXX
187 * POSIX says that in a threaded process, fork() should be used
188 * only to run new programs, and the effects of calling functions
189 * that require certain resources between the call to fork() and
190 * the call to an exec function are undefined.
191 *
192 * Here it is not safe to reinitialize the library after fork();
193 * memory management may be corrupted, so further calls to
194 * malloc()/free() may cause undefined behavior.
195 */
196void
197_kse_single_thread(struct pthread *curthread)
198{
199#ifdef NOTYET
200	struct kse *kse;
201	struct kse_group *kseg;
202	struct pthread *thread;
203	kse_critical_t crit;
204	int i;
205
206	if (__isthreaded) {
207		_thr_rtld_fini();
208		_thr_signal_deinit();
209	}
210	__isthreaded = 0;
211	/*
212	 * Restore the signal mask early so that if there are memory
213	 * problems we can still dump core.
214	 */
215	sigprocmask(SIG_SETMASK, &curthread->sigmask, NULL);
216	active_threads = 1;
217
218	/*
219	 * Enter a loop to remove and free all threads other than
220	 * the running thread from the active thread list:
221	 */
222	while ((thread = TAILQ_FIRST(&_thread_list)) != NULL) {
223		THR_GCLIST_REMOVE(thread);
224		/*
225		 * Remove this thread from the list (the current
226		 * thread will be removed but re-added by libpthread
227		 * initialization).
228		 */
229		TAILQ_REMOVE(&_thread_list, thread, tle);
230		/* Make sure this isn't the running thread: */
231		if (thread != curthread) {
232			_thr_stack_free(&thread->attr);
233			if (thread->specific != NULL)
234				free(thread->specific);
235			for (i = 0; i < MAX_THR_LOCKLEVEL; i++) {
236				_lockuser_destroy(&thread->lockusers[i]);
237			}
238			_lock_destroy(&thread->lock);
239			free(thread);
240		}
241	}
242
243	TAILQ_INIT(&curthread->mutexq);		/* initialize mutex queue */
244	curthread->joiner = NULL;		/* no joining threads yet */
245	curthread->refcount = 0;
246	SIGEMPTYSET(curthread->sigpend);	/* clear pending signals */
247	if (curthread->specific != NULL) {
248		free(curthread->specific);
249		curthread->specific = NULL;
250		curthread->specific_data_count = 0;
251	}
252
253	/* Free the free KSEs: */
254	while ((kse = TAILQ_FIRST(&free_kseq)) != NULL) {
255		TAILQ_REMOVE(&free_kseq, kse, k_qe);
256		for (i = 0; i < MAX_KSE_LOCKLEVEL; i++) {
257			_lockuser_destroy(&kse->k_lockusers[i]);
258		}
259		_lock_destroy(&kse->k_lock);
260		_kcb_dtor(kse->k_kcb);
261		if (kse->k_stack.ss_sp != NULL)
262			free(kse->k_stack.ss_sp);
263		free(kse);
264	}
265	free_kse_count = 0;
266
267	/* Free the active KSEs: */
268	while ((kse = TAILQ_FIRST(&active_kseq)) != NULL) {
269		TAILQ_REMOVE(&active_kseq, kse, k_qe);
270		for (i = 0; i < MAX_KSE_LOCKLEVEL; i++) {
271			_lockuser_destroy(&kse->k_lockusers[i]);
272		}
273		_lock_destroy(&kse->k_lock);
274		if (kse->k_stack.ss_sp != NULL)
275			free(kse->k_stack.ss_sp);
276		free(kse);
277	}
278	active_kse_count = 0;
279
280	/* Free the free KSEGs: */
281	while ((kseg = TAILQ_FIRST(&free_kse_groupq)) != NULL) {
282		TAILQ_REMOVE(&free_kse_groupq, kseg, kg_qe);
283		_lock_destroy(&kseg->kg_lock);
284		_pq_free(&kseg->kg_schedq.sq_runq);
285		free(kseg);
286	}
287	free_kseg_count = 0;
288
289	/* Free the active KSEGs: */
290	while ((kseg = TAILQ_FIRST(&active_kse_groupq)) != NULL) {
291		TAILQ_REMOVE(&active_kse_groupq, kseg, kg_qe);
292		_lock_destroy(&kseg->kg_lock);
293		_pq_free(&kseg->kg_schedq.sq_runq);
294		free(kseg);
295	}
296	active_kseg_count = 0;
297
298	/* Free the free threads. */
299	while ((thread = TAILQ_FIRST(&free_threadq)) != NULL) {
300		TAILQ_REMOVE(&free_threadq, thread, tle);
301		if (thread->specific != NULL)
302			free(thread->specific);
303		for (i = 0; i < MAX_THR_LOCKLEVEL; i++) {
304			_lockuser_destroy(&thread->lockusers[i]);
305		}
306		_lock_destroy(&thread->lock);
307		free(thread);
308	}
309	free_thread_count = 0;
310
311	/* Free the to-be-gc'd threads. */
312	while ((thread = TAILQ_FIRST(&_thread_gc_list)) != NULL) {
313		TAILQ_REMOVE(&_thread_gc_list, thread, gcle);
314		for (i = 0; i < MAX_THR_LOCKLEVEL; i++) {
315			_lockuser_destroy(&thread->lockusers[i]);
316		}
317		_lock_destroy(&thread->lock);
318		free(thread);
319	}
320	TAILQ_INIT(&gc_ksegq);
321	_gc_count = 0;
322
323	if (inited != 0) {
324		/*
325		 * Destroy these locks; they'll be recreated to assure they
326		 * are in the unlocked state.
327		 */
328		_lock_destroy(&kse_lock);
329		_lock_destroy(&thread_lock);
330		_lock_destroy(&_thread_list_lock);
331		inited = 0;
332	}
333
334	/*
335	 * After a fork(), the leftover thread goes back to being
336	 * scope process.
337	 * process scope.
338	curthread->attr.flags &= ~PTHREAD_SCOPE_SYSTEM;
339	curthread->attr.flags |= PTHREAD_SCOPE_PROCESS;
340
341	/*
342	 * After a fork, we are still operating on the thread's original
343	 * stack.  Don't clear the THR_FLAGS_USER from the thread's
344	 * attribute flags.
345	 */
346
347	/* Initialize the threads library. */
348	curthread->kse = NULL;
349	curthread->kseg = NULL;
350	_kse_initial = NULL;
351	_libpthread_init(curthread);
352#else
353	if (__isthreaded) {
354		_thr_rtld_fini();
355		_thr_signal_deinit();
356	}
357	__isthreaded = 0;
358	/*
359	 * Restore the signal mask early so that if there are memory
360	 * problems we can still dump core.
361	 */
362	sigprocmask(SIG_SETMASK, &curthread->sigmask, NULL);
363	curthread->kse->k_kcb->kcb_kmbx.km_curthread = NULL;
364	active_threads = 1;
365#endif
366}
367
368/*
369 * This is used to initialize housekeeping and to initialize the
370 * KSD for the KSE.
371 */
372void
373_kse_init(void)
374{
375	if (inited == 0) {
376		TAILQ_INIT(&active_kseq);
377		TAILQ_INIT(&active_kse_groupq);
378		TAILQ_INIT(&free_kseq);
379		TAILQ_INIT(&free_kse_groupq);
380		TAILQ_INIT(&free_threadq);
381		TAILQ_INIT(&gc_ksegq);
382		if (_lock_init(&kse_lock, LCK_ADAPTIVE,
383		    _kse_lock_wait, _kse_lock_wakeup) != 0)
384			PANIC("Unable to initialize free KSE queue lock");
385		if (_lock_init(&thread_lock, LCK_ADAPTIVE,
386		    _kse_lock_wait, _kse_lock_wakeup) != 0)
387			PANIC("Unable to initialize free thread queue lock");
388		if (_lock_init(&_thread_list_lock, LCK_ADAPTIVE,
389		    _kse_lock_wait, _kse_lock_wakeup) != 0)
390			PANIC("Unable to initialize thread list lock");
391		active_kse_count = 0;
392		active_kseg_count = 0;
393		_gc_count = 0;
394		inited = 1;
395	}
396}
397
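/* Return non-zero if the process has become multi-threaded. */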
398int
399_kse_isthreaded(void)
400{
401	return (__isthreaded != 0);
402}
403
404/*
405 * This is called when the first thread (other than the initial
406 * thread) is created.
407 */
408int
409_kse_setthreaded(int threaded)
410{
411	sigset_t sigset;
412
413	if ((threaded != 0) && (__isthreaded == 0)) {
414		/*
415		 * Tell the kernel to create a KSE for the initial thread
416		 * and enable upcalls in it.
417		 */
418		_kse_initial->k_flags |= KF_STARTED;
419
420#ifdef SYSTEM_SCOPE_ONLY
421		/*
422		 * For a bound thread, the kernel reads the mailbox pointer
423		 * only once, so set it here before calling kse_create().
424		 */
425		_tcb_set(_kse_initial->k_kcb, _thr_initial->tcb);
426		KSE_SET_MBOX(_kse_initial, _thr_initial);
427		_kse_initial->k_kcb->kcb_kmbx.km_flags |= KMF_BOUND;
428#endif
429		SIGFILLSET(sigset);
430		__sys_sigprocmask(SIG_SETMASK, &sigset, &_thr_initial->sigmask);
431		_thr_signal_init();
432
433		/*
434		 * Locking functions in libc are required when there are
435		 * threads other than the initial thread.
436		 */
437		_thr_rtld_init();
438
439		__isthreaded = 1;
440		if (kse_create(&_kse_initial->k_kcb->kcb_kmbx, 0) != 0) {
441			_kse_initial->k_flags &= ~KF_STARTED;
442			__isthreaded = 0;
443			PANIC("kse_create() failed\n");
444			return (-1);
445		}
446
447#ifndef SYSTEM_SCOPE_ONLY
448		/* Set current thread to initial thread */
449		_tcb_set(_kse_initial->k_kcb, _thr_initial->tcb);
450		KSE_SET_MBOX(_kse_initial, _thr_initial);
451		_thr_start_sig_daemon();
452		_thr_setmaxconcurrency();
453#else
454		__sys_sigprocmask(SIG_SETMASK, &_thr_initial->sigmask, NULL);
455#endif
456	}
457	return (0);
458}
459
460/*
461 * Lock wait and wakeup handlers for KSE locks.  These are only used by
462 * KSEs, and should never be used by threads.  KSE locks include the
463 * KSE group lock (used for locking the scheduling queue) and the
464 * kse_lock defined above.
465 *
466 * When a KSE lock attempt blocks, the entire KSE blocks allowing another
467 * KSE to run.  For the most part, it doesn't make much sense to try and
468 * schedule another thread because you need to lock the scheduling queue
469 * in order to do that.  And since the KSE lock is used to lock the scheduling
470 * queue, you would just end up blocking again.
471 */
472void
473_kse_lock_wait(struct lock *lock, struct lockuser *lu)
474{
475	struct kse *curkse = (struct kse *)_LCK_GET_PRIVATE(lu);
476	struct timespec ts;
477	int saved_flags;
478
479	if (curkse->k_kcb->kcb_kmbx.km_curthread != NULL)
480		PANIC("kse_lock_wait does not disable upcall.\n");
481	/*
482	 * Enter a loop to wait until we get the lock.
483	 */
484	ts.tv_sec = 0;
485	ts.tv_nsec = 1000000;  /* 1 ms */
486	while (!_LCK_GRANTED(lu)) {
487		/*
488		 * Yield the kse and wait to be notified when the lock
489		 * is granted.
490		 */
491		saved_flags = curkse->k_kcb->kcb_kmbx.km_flags;
492		curkse->k_kcb->kcb_kmbx.km_flags |= KMF_NOUPCALL |
493		    KMF_NOCOMPLETED;
494		kse_release(&ts);
495		curkse->k_kcb->kcb_kmbx.km_flags = saved_flags;
496	}
497}
498
499void
500_kse_lock_wakeup(struct lock *lock, struct lockuser *lu)
501{
502	struct kse *curkse;
503	struct kse *kse;
504	struct kse_mailbox *mbx;
505
506	curkse = _get_curkse();
507	kse = (struct kse *)_LCK_GET_PRIVATE(lu);
508
509	if (kse == curkse)
510		PANIC("KSE trying to wake itself up in lock");
511	else {
512		mbx = &kse->k_kcb->kcb_kmbx;
513		_lock_grant(lock, lu);
514		/*
515		 * Notify the owning kse that it has the lock.
516		 * It is safe to pass an invalid address to kse_wakeup
517		 * even if the mailbox is not in the kernel at all;
518		 * waking up the wrong kse is also harmless.
519		 */
520		kse_wakeup(mbx);
521	}
522}
523
524/*
525 * Thread wait and wakeup handlers for thread locks.  These are only used
526 * by threads, never by KSEs.  Thread locks include the per-thread lock
527 * (defined in its structure), and condition variable and mutex locks.
528 */
529void
530_thr_lock_wait(struct lock *lock, struct lockuser *lu)
531{
532	struct pthread *curthread = (struct pthread *)lu->lu_private;
533
534	do {
535		THR_LOCK_SWITCH(curthread);
536		THR_SET_STATE(curthread, PS_LOCKWAIT);
537		_thr_sched_switch_unlocked(curthread);
538	} while (!_LCK_GRANTED(lu));
539}
540
541void
542_thr_lock_wakeup(struct lock *lock, struct lockuser *lu)
543{
544	struct pthread *thread;
545	struct pthread *curthread;
546	struct kse_mailbox *kmbx;
547
548	curthread = _get_curthread();
549	thread = (struct pthread *)_LCK_GET_PRIVATE(lu);
550
551	THR_SCHED_LOCK(curthread, thread);
552	_lock_grant(lock, lu);
553	kmbx = _thr_setrunnable_unlocked(thread);
554	THR_SCHED_UNLOCK(curthread, thread);
555	if (kmbx != NULL)
556		kse_wakeup(kmbx);
557}
558
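/*
 * KSE critical regions.  Entering a critical region clears the KSE
 * mailbox's current-thread pointer (see _kcb_critical_enter()), which
 * keeps the kernel from making normal upcalls until the region is
 * left; _kse_critical_leave() restores the pointer and checks whether
 * the current thread should yield.  The usual pattern in this file is:
 *
 *	crit = _kse_critical_enter();
 *	KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
 *	... manipulate the free lists ...
 *	KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
 *	_kse_critical_leave(crit);
 */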
559kse_critical_t
560_kse_critical_enter(void)
561{
562	kse_critical_t crit;
563
564	crit = (kse_critical_t)_kcb_critical_enter();
565	return (crit);
566}
567
568void
569_kse_critical_leave(kse_critical_t crit)
570{
571	struct pthread *curthread;
572
573	_kcb_critical_leave((struct kse_thr_mailbox *)crit);
574	if ((crit != NULL) && ((curthread = _get_curthread()) != NULL))
575		THR_YIELD_CHECK(curthread);
576}
577
578int
579_kse_in_critical(void)
580{
581	return (_kcb_in_critical());
582}
583
584void
585_thr_critical_enter(struct pthread *thread)
586{
587	thread->critical_count++;
588}
589
590void
591_thr_critical_leave(struct pthread *thread)
592{
593	thread->critical_count--;
594	THR_YIELD_CHECK(thread);
595}
596
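/*
 * Voluntarily enter the scheduler: enter a KSE critical region, take
 * the KSEG's scheduling lock, and then switch out the current thread.
 */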
597void
598_thr_sched_switch(struct pthread *curthread)
599{
600	struct kse *curkse;
601
602	(void)_kse_critical_enter();
603	curkse = _get_curkse();
604	KSE_SCHED_LOCK(curkse, curkse->k_kseg);
605	_thr_sched_switch_unlocked(curthread);
606}
607
608/*
609 * XXX - We may need to take the scheduling lock before calling
610 *       this, or perhaps take the lock within here before
611 *       doing anything else.
612 */
613void
614_thr_sched_switch_unlocked(struct pthread *curthread)
615{
616	struct pthread *td;
617	struct pthread_sigframe psf;
618	struct kse *curkse;
619	int ret;
620	volatile int uts_once;
621	volatile int resume_once = 0;
622	ucontext_t uc;
623
624	/* We're in the scheduler, 5 by 5: */
625	curkse = _get_curkse();
626	_tcb_set(curkse->k_kcb, NULL);
627
628	curthread->need_switchout = 1;	/* The thread yielded on its own. */
629	curthread->critical_yield = 0;	/* No need to yield anymore. */
630	thr_accounting(curthread);
631
632
633	/* Thread can unlock the scheduler lock. */
634	curthread->lock_switch = 1;
635
636	/*
637	 * The signal frame is allocated off the stack because
638	 * a thread can be interrupted by other signals while
639	 * it is running down pending signals.
640	 */
641	psf.psf_valid = 0;
642	curthread->curframe = &psf;
643
644	/*
645	 * Enter the scheduler if any one of the following is true:
646	 *
647	 *   o The current thread is dead; its stack needs to be
648	 *     cleaned up and it can't be done while operating on
649	 *     it.
650	 *   o The current thread has signals pending and should
651	 *     let the scheduler install a signal trampoline for us.
652	 *   o There are no runnable threads.
653	 *   o The next thread to run won't unlock the scheduler
654	 *     lock.  A side note: the current thread may be run
655	 *     instead of the next thread in the run queue, but
656	 *     we don't bother checking for that.
657	 */
658	if (curthread->attr.flags & PTHREAD_SCOPE_SYSTEM)
659		kse_sched_single(&curkse->k_kcb->kcb_kmbx);
660	else if ((curthread->state == PS_DEAD) ||
661	    (((td = KSE_RUNQ_FIRST(curkse)) == NULL) &&
662	    (curthread->state != PS_RUNNING)) ||
663	    ((td != NULL) && (td->lock_switch == 0))) {
664		curkse->k_switch = 1;
665		_thread_enter_uts(curthread->tcb, curkse->k_kcb);
666	}
667	else {
668		uts_once = 0;
669		THR_GETCONTEXT(&curthread->tcb->tcb_tmbx.tm_context);
670		if (uts_once == 0) {
671			uts_once = 1;
672
673			/* Switchout the current thread. */
674			kse_switchout_thread(curkse, curthread);
675
676		 	/* Choose another thread to run. */
677			td = KSE_RUNQ_FIRST(curkse);
678			KSE_RUNQ_REMOVE(curkse, td);
679			curkse->k_curthread = td;
680
681			/*
682			 * Make sure the current thread's kse points to
683			 * this kse.
684			 */
685			td->kse = curkse;
686
687			/*
688			 * Reset the time slice if this thread is running
689			 * for the first time or running again after using
690			 * its full time slice allocation.
691			 */
692			if (td->slice_usec == -1)
693				td->slice_usec = 0;
694
695			/* Mark the thread active. */
696			td->active = 1;
697
698			/* Remove the frame reference. */
699			td->curframe = NULL;
700
701			/*
702			 * Continue the thread at its current frame:
703			 */
704			ret = _thread_switch(curkse->k_kcb, td->tcb, 0);
705			/* This point should not be reached. */
706			if (ret != 0)
707				PANIC("Bad return from _thread_switch");
708			PANIC("Thread has returned from _thread_switch");
709		}
710	}
711
712	if (psf.psf_valid) {
713		/*
714		 * It is ugly that we must increase the critical count:
715		 * because we have a frame saved, we must back out the
716		 * state in psf before we can process signals.
717		 */
718		curthread->critical_count++;
719	}
720
721	if (curthread->lock_switch != 0) {
722		/*
723		 * Unlock the scheduling queue and leave the
724		 * critical region.
725		 */
726		/* Don't trust this after a switch! */
727		curkse = _get_curkse();
728
729		curthread->lock_switch = 0;
730		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
731		_kse_critical_leave(&curthread->tcb->tcb_tmbx);
732	}
733	/*
734	 * This thread is being resumed; check for cancellations.
735	 */
736	if ((psf.psf_valid ||
737	    (curthread->check_pending && !THR_IN_CRITICAL(curthread)))) {
738		resume_once = 0;
739		THR_GETCONTEXT(&uc);
740		if (resume_once == 0) {
741			resume_once = 1;
742			curthread->check_pending = 0;
743			thr_resume_check(curthread, &uc, &psf);
744		}
745	}
746	THR_ACTIVATE_LAST_LOCK(curthread);
747}
748
749/*
750 * This is the scheduler for a KSE which runs a scope system thread.
751 * The multi-thread KSE scheduler should also work for a single threaded
752 * KSE, but we use a separate scheduler so that it can be fine-tuned
753 * to be more efficient (and perhaps not need a separate stack for
754 * the KSE, allowing it to use the thread's stack).
755 */
756
757static void
758kse_sched_single(struct kse_mailbox *kmbx)
759{
760	struct kse *curkse;
761	struct pthread *curthread;
762	struct timespec ts;
763	sigset_t sigmask;
764	int i, sigseqno, level, first = 0;
765
766	curkse = (struct kse *)kmbx->km_udata;
767	curthread = curkse->k_curthread;
768
769	if ((curkse->k_flags & KF_INITIALIZED) == 0) {
770		/* Set up this KSE's specific data. */
771		_kcb_set(curkse->k_kcb);
772		_tcb_set(curkse->k_kcb, curthread->tcb);
773		curkse->k_flags |= KF_INITIALIZED;
774		first = 1;
775		curthread->active = 1;
776
777		/* Setup kernel signal masks for new thread. */
778		__sys_sigprocmask(SIG_SETMASK, &curthread->sigmask, NULL);
779		/*
780		 * Enter a critical region. This is meaningless for a bound
781		 * thread; it is only here so that other code which expects
782		 * the mailbox to be cleared still works.
783		 */
784		(void)_kse_critical_enter();
785 	}
786
787	curthread->critical_yield = 0;
788	curthread->need_switchout = 0;
789
790	/*
791	 * Lock the scheduling queue.
792	 *
793	 * There is no scheduling queue for single threaded KSEs,
794	 * but we need a lock for protection regardless.
795	 */
796	if (curthread->lock_switch == 0)
797		KSE_SCHED_LOCK(curkse, curkse->k_kseg);
798
799	/*
800	 * This has to do the job of kse_switchout_thread(), only
801	 * for a single threaded KSE/KSEG.
802	 */
803
804	switch (curthread->state) {
805	case PS_DEAD:
806		curthread->check_pending = 0;
807		/* Unlock the scheduling queue and exit the KSE and thread. */
808		thr_cleanup(curkse, curthread);
809		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
810		PANIC("bound thread shouldn't get here\n");
811		break;
812
813	case PS_SIGWAIT:
814		PANIC("bound thread does not have SIGWAIT state\n");
815
816	case PS_SLEEP_WAIT:
817		PANIC("bound thread does not have SLEEP_WAIT state\n");
818
819	case PS_SIGSUSPEND:
820		PANIC("bound thread does not have SIGSUSPEND state\n");
821
822	case PS_COND_WAIT:
823		break;
824
825	case PS_LOCKWAIT:
826		/*
827		 * This state doesn't timeout.
828		 */
829		curthread->wakeup_time.tv_sec = -1;
830		curthread->wakeup_time.tv_nsec = -1;
831		level = curthread->locklevel - 1;
832		if (_LCK_GRANTED(&curthread->lockusers[level]))
833			THR_SET_STATE(curthread, PS_RUNNING);
834		break;
835
836	case PS_RUNNING:
837		if ((curthread->flags & THR_FLAGS_SUSPENDED) != 0) {
838			THR_SET_STATE(curthread, PS_SUSPENDED);
839		}
840		curthread->wakeup_time.tv_sec = -1;
841		curthread->wakeup_time.tv_nsec = -1;
842		break;
843
844	case PS_JOIN:
845	case PS_MUTEX_WAIT:
846	case PS_SUSPENDED:
847	case PS_DEADLOCK:
848	default:
849		/*
850		 * These states don't timeout and don't need
851		 * to be in the waiting queue.
852		 */
853		curthread->wakeup_time.tv_sec = -1;
854		curthread->wakeup_time.tv_nsec = -1;
855		break;
856	}
857
858	while (curthread->state != PS_RUNNING) {
859		sigseqno = curkse->k_sigseqno;
860		if (curthread->check_pending != 0) {
861			/*
862			 * Install pending signals into the frame; this may
863			 * cause a mutex or condvar backout.
864			 */
865			curthread->check_pending = 0;
866			SIGFILLSET(sigmask);
867
868			/*
869			 * Lock out kernel signal code when we are processing
870			 * signals, and get a fresh copy of signal mask.
871			 */
872			__sys_sigprocmask(SIG_SETMASK, &sigmask,
873					  &curthread->sigmask);
874			for (i = 1; i <= _SIG_MAXSIG; i++) {
875				if (SIGISMEMBER(curthread->sigmask, i))
876					continue;
877				if (SIGISMEMBER(curthread->sigpend, i))
878					(void)_thr_sig_add(curthread, i,
879					    &curthread->siginfo[i-1]);
880			}
881			__sys_sigprocmask(SIG_SETMASK, &curthread->sigmask,
882				NULL);
883			/* The above code might make the thread runnable. */
884			if (curthread->state == PS_RUNNING)
885				break;
886		}
887		THR_DEACTIVATE_LAST_LOCK(curthread);
888		kse_wait(curkse, curthread, sigseqno);
889		THR_ACTIVATE_LAST_LOCK(curthread);
890		KSE_GET_TOD(curkse, &ts);
891		if (thr_timedout(curthread, &ts)) {
892			/* Indicate the thread timedout: */
893			curthread->timeout = 1;
894			/* Make the thread runnable. */
895			THR_SET_STATE(curthread, PS_RUNNING);
896		}
897	}
898
899	/* Remove the frame reference. */
900	curthread->curframe = NULL;
901
902	if (curthread->lock_switch == 0) {
903		/* Unlock the scheduling queue. */
904		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
905	}
906
907	DBG_MSG("Continuing bound thread %p\n", curthread);
908	if (first) {
909		_kse_critical_leave(&curthread->tcb->tcb_tmbx);
910		pthread_exit(curthread->start_routine(curthread->arg));
911	}
912}
913
914#ifdef DEBUG_THREAD_KERN
915static void
916dump_queues(struct kse *curkse)
917{
918	struct pthread *thread;
919
920	DBG_MSG("Threads in waiting queue:\n");
921	TAILQ_FOREACH(thread, &curkse->k_kseg->kg_schedq.sq_waitq, pqe) {
922		DBG_MSG("  thread %p, state %d, blocked %d\n",
923		    thread, thread->state, thread->blocked);
924	}
925}
926#endif
927
928/*
929 * This is the scheduler for a KSE which runs multiple threads.
930 */
931static void
932kse_sched_multi(struct kse_mailbox *kmbx)
933{
934	struct kse *curkse;
935	struct pthread *curthread, *td_wait;
936	struct pthread_sigframe *curframe;
937	int ret;
938
939	curkse = (struct kse *)kmbx->km_udata;
940	THR_ASSERT(curkse->k_kcb->kcb_kmbx.km_curthread == NULL,
941	    "Mailbox not null in kse_sched_multi");
942
943	/* Check for first time initialization: */
944	if ((curkse->k_flags & KF_INITIALIZED) == 0) {
945		/* Set up this KSE's specific data. */
946		_kcb_set(curkse->k_kcb);
947
948		/* Set this before grabbing the context. */
949		curkse->k_flags |= KF_INITIALIZED;
950	}
951
952	/* This may have returned from a kse_release(). */
953	if (KSE_WAITING(curkse)) {
954		DBG_MSG("Entered upcall when KSE is waiting.");
955		KSE_CLEAR_WAIT(curkse);
956	}
957
958	/* If this is an upcall, take the scheduler lock. */
959	if (curkse->k_switch == 0) {
960		/* Set fake kcb */
961		_tcb_set(curkse->k_kcb, NULL);
962		KSE_SCHED_LOCK(curkse, curkse->k_kseg);
963	}
964	curkse->k_switch = 0;
965
966	/*
967	 * Now that the scheduler lock is held, get the current
968	 * thread.  The KSE's current thread cannot be safely
969	 * examined without the lock because it could have returned
970	 * as completed on another KSE.  See kse_check_completed().
971	 */
972	curthread = curkse->k_curthread;
973
974	if (KSE_IS_IDLE(curkse)) {
975		KSE_CLEAR_IDLE(curkse);
976		curkse->k_kseg->kg_idle_kses--;
977	}
978	/*
979	 * If the current thread was completed in another KSE, then
980	 * it will be in the run queue.  Don't mark it as being blocked.
981	 */
982	if ((curthread != NULL) &&
983	    ((curthread->flags & THR_FLAGS_IN_RUNQ) == 0) &&
984	    (curthread->need_switchout == 0)) {
985		/*
986		 * Assume the current thread is blocked; when the
987		 * completed threads are checked and if the current
988		 * thread is among the completed, the blocked flag
989		 * will be cleared.
990		 */
991		curthread->blocked = 1;
992	}
993
994	/* Check for any unblocked threads in the kernel. */
995	kse_check_completed(curkse);
996
997	/*
998	 * Check for threads that have timed-out.
999	 */
1000	kse_check_waitq(curkse);
1001
1002	/*
1003	 * Switchout the current thread, if necessary, as the last step
1004	 * so that it is inserted into the run queue (if it's runnable)
1005	 * _after_ any other threads that were added to it above.
1006	 */
1007	if (curthread == NULL)
1008		;  /* Nothing to do here. */
1009	else if ((curthread->need_switchout == 0) &&
1010	    (curthread->blocked == 0) && (THR_IN_CRITICAL(curthread))) {
1011		/*
1012		 * Resume the thread and tell it to yield when
1013		 * it leaves the critical region.
1014		 */
1015		curthread->critical_yield = 1;
1016		curthread->active = 1;
1017		if ((curthread->flags & THR_FLAGS_IN_RUNQ) != 0)
1018			KSE_RUNQ_REMOVE(curkse, curthread);
1019		curkse->k_curthread = curthread;
1020		curthread->kse = curkse;
1021		DBG_MSG("Continuing thread %p in critical region\n",
1022		    curthread);
1023		kse_wakeup_multi(curkse);
1024		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
1025		ret = _thread_switch(curkse->k_kcb, curthread->tcb, 1);
1026		if (ret != 0)
1027			PANIC("Can't resume thread in critical region\n");
1028	}
1029	else if ((curthread->flags & THR_FLAGS_IN_RUNQ) == 0)
1030		kse_switchout_thread(curkse, curthread);
1031	curkse->k_curthread = NULL;
1032
1033	kse_wakeup_multi(curkse);
1034
1035#ifdef DEBUG_THREAD_KERN
1036	dump_queues(curkse);
1037#endif
1038
1039	/* Check if there are no threads ready to run: */
1040	while (((curthread = KSE_RUNQ_FIRST(curkse)) == NULL) &&
1041	    (curkse->k_kseg->kg_threadcount != 0)) {
1042		/*
1043		 * Wait for a thread to become active or until there are
1044		 * no more threads.
1045		 */
1046		td_wait = KSE_WAITQ_FIRST(curkse);
1047		kse_wait(curkse, td_wait, 0);
1048		kse_check_completed(curkse);
1049		kse_check_waitq(curkse);
1050	}
1051
1052	/* Check for no more threads: */
1053	if (curkse->k_kseg->kg_threadcount == 0) {
1054		/*
1055		 * Normally this shouldn't return, but it will if there
1056		 * are other KSEs running that create new threads that
1057		 * are assigned to this KSE[G].  For instance, if a scope
1058		 * system thread were to create a scope process thread
1059		 * and this kse[g] is the initial kse[g], then that newly
1060		 * created thread would be assigned to us (the initial
1061		 * kse[g]).
1062		 */
1063		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
1064		kse_fini(curkse);
1065		/* never returns */
1066	}
1067
1068	THR_ASSERT(curthread != NULL,
1069	    "Return from kse_wait/fini without thread.");
1070	THR_ASSERT(curthread->state != PS_DEAD,
1071	    "Trying to resume dead thread!");
1072	KSE_RUNQ_REMOVE(curkse, curthread);
1073
1074	/*
1075	 * Make the selected thread the current thread.
1076	 */
1077	curkse->k_curthread = curthread;
1078
1079	/*
1080	 * Make sure the current thread's kse points to this kse.
1081	 */
1082	curthread->kse = curkse;
1083
1084	/*
1085	 * Reset the time slice if this thread is running for the first
1086	 * time or running again after using its full time slice allocation.
1087	 */
1088	if (curthread->slice_usec == -1)
1089		curthread->slice_usec = 0;
1090
1091	/* Mark the thread active. */
1092	curthread->active = 1;
1093
1094	/* Remove the frame reference. */
1095	curframe = curthread->curframe;
1096	curthread->curframe = NULL;
1097
1098	kse_wakeup_multi(curkse);
1099
1100	/*
1101	 * The thread's current signal frame will only be NULL if it
1102	 * is being resumed after being blocked in the kernel.  In
1103	 * this case, and if the thread needs to run down pending
1104	 * signals or needs a cancellation check, we need to add a
1105	 * signal frame to the thread's context.
1106	 */
1107#ifdef NOT_YET
1108	if ((((curframe == NULL) && (curthread->check_pending != 0)) ||
1109	    (((curthread->cancelflags & THR_AT_CANCEL_POINT) == 0) &&
1110	     ((curthread->cancelflags & PTHREAD_CANCEL_ASYNCHRONOUS) != 0))) &&
1111	     !THR_IN_CRITICAL(curthread))
1112		signalcontext(&curthread->tcb->tcb_tmbx.tm_context, 0,
1113		    (__sighandler_t *)thr_resume_wrapper);
1114#else
1115	if ((curframe == NULL) && (curthread->state == PS_RUNNING) &&
1116	    (curthread->check_pending != 0) && !THR_IN_CRITICAL(curthread)) {
1117		curthread->check_pending = 0;
1118		signalcontext(&curthread->tcb->tcb_tmbx.tm_context, 0,
1119		    (__sighandler_t *)thr_resume_wrapper);
1120	}
1121#endif
1122	/*
1123	 * Continue the thread at its current frame:
1124	 */
1125	if (curthread->lock_switch != 0) {
1126		/*
1127		 * This thread came from a scheduler switch; it will
1128		 * unlock the scheduler lock and set the mailbox.
1129		 */
1130		ret = _thread_switch(curkse->k_kcb, curthread->tcb, 0);
1131	} else {
1132		/* This thread won't unlock the scheduler lock. */
1133		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
1134		ret = _thread_switch(curkse->k_kcb, curthread->tcb, 1);
1135	}
1136	if (ret != 0)
1137		PANIC("Thread has returned from _thread_switch");
1138
1139	/* This point should not be reached. */
1140	PANIC("Thread has returned from _thread_switch");
1141}
1142
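/*
 * Signal-frame trampoline installed by kse_sched_multi() via
 * signalcontext().  It runs down any pending signals for the resuming
 * thread and then switches back to the thread's saved context.
 */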
1143static void
1144thr_resume_wrapper(int sig, siginfo_t *siginfo, ucontext_t *ucp)
1145{
1146	struct pthread *curthread = _get_curthread();
1147	struct kse *curkse;
1148	int ret, err_save = curthread->error;
1149
1150	DBG_MSG(">>> sig wrapper\n");
1151	if (curthread->lock_switch)
1152		PANIC("thr_resume_wrapper, lock_switch != 0\n");
1153	thr_resume_check(curthread, ucp, NULL);
1154	_kse_critical_enter();
1155	curkse = _get_curkse();
1156	curthread->tcb->tcb_tmbx.tm_context = *ucp;
1157	curthread->error = err_save;
1158	ret = _thread_switch(curkse->k_kcb, curthread->tcb, 1);
1159	if (ret != 0)
1160		PANIC("thr_resume_wrapper: thread has returned "
1161		      "from _thread_switch");
1162	/* THR_SETCONTEXT(ucp); */ /* XXX doesn't work; why? */
1163}
1164
1165static void
1166thr_resume_check(struct pthread *curthread, ucontext_t *ucp,
1167    struct pthread_sigframe *psf)
1168{
1169	_thr_sig_rundown(curthread, ucp, psf);
1170
1171#ifdef NOT_YET
1172	if (((curthread->cancelflags & THR_AT_CANCEL_POINT) == 0) &&
1173	    ((curthread->cancelflags & PTHREAD_CANCEL_ASYNCHRONOUS) != 0))
1174		pthread_testcancel();
1175#endif
1176}
1177
1178/*
1179 * Clean up a thread.  This must be called with the thread's KSE
1180 * scheduling lock held.  The thread must be a thread from the
1181 * KSE's group.
1182 */
1183static void
1184thr_cleanup(struct kse *curkse, struct pthread *thread)
1185{
1186	struct pthread *joiner;
1187	struct kse_mailbox *kmbx = NULL;
1188	int sys_scope;
1189
1190	if ((joiner = thread->joiner) != NULL) {
1191		/* Joinee scheduler lock held; joiner won't leave. */
1192		if (joiner->kseg == curkse->k_kseg) {
1193			if (joiner->join_status.thread == thread) {
1194				joiner->join_status.thread = NULL;
1195				joiner->join_status.ret = thread->ret;
1196				(void)_thr_setrunnable_unlocked(joiner);
1197			}
1198		} else {
1199			KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
1200			/* The joiner may have removed itself and exited. */
1201			if (_thr_ref_add(thread, joiner, 0) == 0) {
1202				KSE_SCHED_LOCK(curkse, joiner->kseg);
1203				if (joiner->join_status.thread == thread) {
1204					joiner->join_status.thread = NULL;
1205					joiner->join_status.ret = thread->ret;
1206					kmbx = _thr_setrunnable_unlocked(joiner);
1207				}
1208				KSE_SCHED_UNLOCK(curkse, joiner->kseg);
1209				_thr_ref_delete(thread, joiner);
1210				if (kmbx != NULL)
1211					kse_wakeup(kmbx);
1212			}
1213			KSE_SCHED_LOCK(curkse, curkse->k_kseg);
1214		}
1215		thread->attr.flags |= PTHREAD_DETACHED;
1216	}
1217
1218	if (!(sys_scope = (thread->attr.flags & PTHREAD_SCOPE_SYSTEM))) {
1219		/*
1220		 * Remove the thread from the KSEG's list of threads.
1221	 	 */
1222		KSEG_THRQ_REMOVE(thread->kseg, thread);
1223		/*
1224		 * Migrate the thread to the main KSE so that this
1225		 * KSE and KSEG can be cleaned when their last thread
1226		 * exits.
1227		 */
1228		thread->kseg = _kse_initial->k_kseg;
1229		thread->kse = _kse_initial;
1230	}
1231	thread->flags |= THR_FLAGS_GC_SAFE;
1232
1233	/*
1234	 * We can't hold the thread list lock while holding the
1235	 * scheduler lock.
1236	 */
1237	KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
1238	DBG_MSG("Adding thread %p to GC list\n", thread);
1239	KSE_LOCK_ACQUIRE(curkse, &_thread_list_lock);
1240	THR_GCLIST_ADD(thread);
1241	/* Use thread_list_lock */
1242	active_threads--;
1243#ifdef SYSTEM_SCOPE_ONLY
1244	if (active_threads == 0) {
1245#else
1246	if (active_threads == 1) {
1247#endif
1248		KSE_LOCK_RELEASE(curkse, &_thread_list_lock);
1249		exit(0);
1250        }
1251	KSE_LOCK_RELEASE(curkse, &_thread_list_lock);
1252	if (sys_scope) {
1253		/*
1254		 * A system scope thread is in its own thread group; when
1255		 * the thread exits, its kse and ksegrp should be recycled
1256		 * as well.
1257		 * The kse upcall stack belongs to the thread, so clear it here.
1258		 */
1259		curkse->k_stack.ss_sp = 0;
1260		curkse->k_stack.ss_size = 0;
1261		kse_exit();
1262		PANIC("kse_exit() failed for system scope thread");
1263	}
1264	KSE_SCHED_LOCK(curkse, curkse->k_kseg);
1265}
1266
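/*
 * Garbage collector entry point: free threads that have exited and are
 * GC-safe, then trim the cached KSE and KSE group free lists down to
 * their high water marks.
 */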
1267void
1268_thr_gc(struct pthread *curthread)
1269{
1270	thread_gc(curthread);
1271	kse_gc(curthread);
1272	kseg_gc(curthread);
1273}
1274
1275static void
1276thread_gc(struct pthread *curthread)
1277{
1278	struct pthread *td, *td_next;
1279	kse_critical_t crit;
1280	TAILQ_HEAD(, pthread) worklist;
1281
1282	TAILQ_INIT(&worklist);
1283	crit = _kse_critical_enter();
1284	KSE_LOCK_ACQUIRE(curthread->kse, &_thread_list_lock);
1285
1286	/* Check the threads waiting for GC. */
1287	for (td = TAILQ_FIRST(&_thread_gc_list); td != NULL; td = td_next) {
1288		td_next = TAILQ_NEXT(td, gcle);
1289		if ((td->flags & THR_FLAGS_GC_SAFE) == 0)
1290			continue;
1291		else if (((td->attr.flags & PTHREAD_SCOPE_SYSTEM) != 0) &&
1292		    ((td->kse->k_kcb->kcb_kmbx.km_flags & KMF_DONE) == 0)) {
1293			/*
1294			 * The thread and KSE are operating on the same
1295			 * stack.  Wait for the KSE to exit before freeing
1296			 * the thread's stack as well as everything else.
1297			 */
1298			continue;
1299		}
1300		/*
1301		 * Remove the thread from the GC list.  If the thread
1302		 * isn't yet detached, it will get added back to the
1303		 * GC list at a later time.
1304		 */
1305		THR_GCLIST_REMOVE(td);
1306		DBG_MSG("Freeing thread %p stack\n", td);
1307		/*
1308		 * We can free the thread stack since it's no longer
1309		 * in use.
1310		 */
1311		_thr_stack_free(&td->attr);
1312		if (((td->attr.flags & PTHREAD_DETACHED) != 0) &&
1313		    (td->refcount == 0)) {
1314			/*
1315			 * The thread has detached and is no longer
1316			 * referenced.  It is safe to remove all
1317			 * remnants of the thread.
1318			 */
1319			THR_LIST_REMOVE(td);
1320			TAILQ_INSERT_HEAD(&worklist, td, gcle);
1321		}
1322	}
1323	KSE_LOCK_RELEASE(curthread->kse, &_thread_list_lock);
1324	_kse_critical_leave(crit);
1325
1326	while ((td = TAILQ_FIRST(&worklist)) != NULL) {
1327		TAILQ_REMOVE(&worklist, td, gcle);
1328
1329		if ((td->attr.flags & PTHREAD_SCOPE_SYSTEM) != 0) {
1330			crit = _kse_critical_enter();
1331			KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
1332			kse_free_unlocked(td->kse);
1333			kseg_free_unlocked(td->kseg);
1334			KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
1335			_kse_critical_leave(crit);
1336		}
1337		/*
1338		 * XXX we don't free the initial thread, because there
1339		 * might still be code referencing it.
1340		 */
1341		if (td != _thr_initial) {
1342			DBG_MSG("Freeing thread %p\n", td);
1343			_thr_free(curthread, td);
1344		} else
1345			DBG_MSG("Initial thread won't be freed\n");
1346	}
1347}
1348
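/*
 * Trim the free KSE list down to MAX_CACHED_KSES; the excess KSEs are
 * destroyed outside of the kse_lock.
 */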
1349static void
1350kse_gc(struct pthread *curthread)
1351{
1352	kse_critical_t crit;
1353	TAILQ_HEAD(, kse) worklist;
1354	struct kse *kse;
1355
1356	if (free_kse_count <= MAX_CACHED_KSES)
1357		return;
1358	TAILQ_INIT(&worklist);
1359	crit = _kse_critical_enter();
1360	KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
1361	while (free_kse_count > MAX_CACHED_KSES) {
1362		kse = TAILQ_FIRST(&free_kseq);
1363		TAILQ_REMOVE(&free_kseq, kse, k_qe);
1364		TAILQ_INSERT_HEAD(&worklist, kse, k_qe);
1365		free_kse_count--;
1366	}
1367	KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
1368	_kse_critical_leave(crit);
1369
1370	while ((kse = TAILQ_FIRST(&worklist))) {
1371		TAILQ_REMOVE(&worklist, kse, k_qe);
1372		kse_destroy(kse);
1373	}
1374}
1375
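/*
 * Trim the free KSE group list down to MAX_CACHED_KSEGS; the excess
 * groups are destroyed outside of the kse_lock.
 */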
1376static void
1377kseg_gc(struct pthread *curthread)
1378{
1379	kse_critical_t crit;
1380	TAILQ_HEAD(, kse_group) worklist;
1381	struct kse_group *kseg;
1382
1383	if (free_kseg_count <= MAX_CACHED_KSEGS)
1384		return;
1385	crit = _kse_critical_enter();
1386	KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
1387	while (free_kseg_count > MAX_CACHED_KSEGS) {
1388		kseg = TAILQ_FIRST(&free_kse_groupq);
1389		TAILQ_REMOVE(&free_kse_groupq, kseg, kg_qe);
1390		free_kseg_count--;
1391		TAILQ_INSERT_HEAD(&worklist, kseg, kg_qe);
1392	}
1393	KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
1394	_kse_critical_leave(crit);
1395
1396	while ((kseg = TAILQ_FIRST(&worklist))) {
1397		TAILQ_REMOVE(&worklist, kseg, kg_qe);
1398		kseg_destroy(kseg);
1399	}
1400}
1401
1402/*
1403 * Only new threads that are running or suspended may be scheduled.
1404 */
1405int
1406_thr_schedule_add(struct pthread *curthread, struct pthread *newthread)
1407{
1408	kse_critical_t crit;
1409	int ret;
1410
1411	/* Add the new thread. */
1412	thr_link(newthread);
1413
1414	/*
1415	 * If this is the first time creating a thread, make sure
1416	 * the mailbox is set for the current thread.
1417	 */
1418	if ((newthread->attr.flags & PTHREAD_SCOPE_SYSTEM) != 0) {
1419		/* We use the thread's stack as the KSE's stack. */
1420		newthread->kse->k_kcb->kcb_kmbx.km_stack.ss_sp =
1421		    newthread->attr.stackaddr_attr;
1422		newthread->kse->k_kcb->kcb_kmbx.km_stack.ss_size =
1423		    newthread->attr.stacksize_attr;
1424
1425		/*
1426		 * No need to lock the scheduling queue since the
1427		 * KSE/KSEG pair have not yet been started.
1428		 */
1429		KSEG_THRQ_ADD(newthread->kseg, newthread);
1430		/* this thread never gives up kse */
1431		newthread->active = 1;
1432		newthread->kse->k_curthread = newthread;
1433		newthread->kse->k_kcb->kcb_kmbx.km_flags = KMF_BOUND;
1434		newthread->kse->k_kcb->kcb_kmbx.km_func =
1435		    (kse_func_t *)kse_sched_single;
1436		newthread->kse->k_kcb->kcb_kmbx.km_quantum = 0;
1437		KSE_SET_MBOX(newthread->kse, newthread);
1438		/*
1439		 * This thread needs a new KSE and KSEG.
1440		 */
1441		newthread->kse->k_flags &= ~KF_INITIALIZED;
1442		newthread->kse->k_flags |= KF_STARTED;
1443		/* Fire up! */
1444		ret = kse_create(&newthread->kse->k_kcb->kcb_kmbx, 1);
1445		if (ret != 0)
1446			ret = errno;
1447	}
1448	else {
1449		/*
1450		 * Lock the KSE and add the new thread to its list of
1451		 * assigned threads.  If the new thread is runnable, also
1452		 * add it to the KSE's run queue.
1453		 */
1454		crit = _kse_critical_enter();
1455		KSE_SCHED_LOCK(curthread->kse, newthread->kseg);
1456		KSEG_THRQ_ADD(newthread->kseg, newthread);
1457		if (newthread->state == PS_RUNNING)
1458			THR_RUNQ_INSERT_TAIL(newthread);
1459		if ((newthread->kse->k_flags & KF_STARTED) == 0) {
1460			/*
1461			 * This KSE hasn't been started yet.  Start it
1462			 * outside of holding the lock.
1463			 */
1464			newthread->kse->k_flags |= KF_STARTED;
1465			newthread->kse->k_kcb->kcb_kmbx.km_func =
1466			    (kse_func_t *)kse_sched_multi;
1467			newthread->kse->k_kcb->kcb_kmbx.km_flags = 0;
1468			kse_create(&newthread->kse->k_kcb->kcb_kmbx, 0);
1469		 } else if ((newthread->state == PS_RUNNING) &&
1470		     KSE_IS_IDLE(newthread->kse)) {
1471			/*
1472			 * The thread is being scheduled on another KSEG.
1473			 */
1474			kse_wakeup_one(newthread);
1475		}
1476		KSE_SCHED_UNLOCK(curthread->kse, newthread->kseg);
1477		_kse_critical_leave(crit);
1478		ret = 0;
1479	}
1480	if (ret != 0)
1481		thr_unlink(newthread);
1482
1483	return (ret);
1484}
1485
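/*
 * Insert a thread into its KSE's wait queue.  The queue is kept sorted
 * by wakeup time; threads with no timeout (wakeup_time.tv_sec == -1)
 * are appended at the tail.
 */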
1486void
1487kse_waitq_insert(struct pthread *thread)
1488{
1489	struct pthread *td;
1490
1491	if (thread->wakeup_time.tv_sec == -1)
1492		TAILQ_INSERT_TAIL(&thread->kse->k_schedq->sq_waitq, thread,
1493		    pqe);
1494	else {
1495		td = TAILQ_FIRST(&thread->kse->k_schedq->sq_waitq);
1496		while ((td != NULL) && (td->wakeup_time.tv_sec != -1) &&
1497		    ((td->wakeup_time.tv_sec < thread->wakeup_time.tv_sec) ||
1498		    ((td->wakeup_time.tv_sec == thread->wakeup_time.tv_sec) &&
1499		    (td->wakeup_time.tv_nsec <= thread->wakeup_time.tv_nsec))))
1500			td = TAILQ_NEXT(td, pqe);
1501		if (td == NULL)
1502			TAILQ_INSERT_TAIL(&thread->kse->k_schedq->sq_waitq,
1503			    thread, pqe);
1504		else
1505			TAILQ_INSERT_BEFORE(td, thread, pqe);
1506	}
1507	thread->flags |= THR_FLAGS_IN_WAITQ;
1508}
1509
1510/*
1511 * This must be called with the scheduling lock held.
1512 */
1513static void
1514kse_check_completed(struct kse *kse)
1515{
1516	struct pthread *thread;
1517	struct kse_thr_mailbox *completed;
1518	int sig;
1519
1520	if ((completed = kse->k_kcb->kcb_kmbx.km_completed) != NULL) {
1521		kse->k_kcb->kcb_kmbx.km_completed = NULL;
1522		while (completed != NULL) {
1523			thread = completed->tm_udata;
1524			DBG_MSG("Found completed thread %p, name %s\n",
1525			    thread,
1526			    (thread->name == NULL) ? "none" : thread->name);
1527			thread->blocked = 0;
1528			if (thread != kse->k_curthread) {
1529				thr_accounting(thread);
1530				if ((thread->flags & THR_FLAGS_SUSPENDED) != 0)
1531					THR_SET_STATE(thread, PS_SUSPENDED);
1532				else
1533					KSE_RUNQ_INSERT_TAIL(kse, thread);
1534				if ((thread->kse != kse) &&
1535				    (thread->kse->k_curthread == thread)) {
1536					/*
1537					 * Remove this thread from its
1538					 * previous KSE so that it (the KSE)
1539					 * doesn't think it is still active.
1540					 */
1541					thread->kse->k_curthread = NULL;
1542					thread->active = 0;
1543				}
1544			}
1545			if ((sig = thread->tcb->tcb_tmbx.tm_syncsig.si_signo)
1546			    != 0) {
1547				if (SIGISMEMBER(thread->sigmask, sig))
1548					SIGADDSET(thread->sigpend, sig);
1549				else
1550					(void)_thr_sig_add(thread, sig,
1551					    &thread->tcb->tcb_tmbx.tm_syncsig);
1552				thread->tcb->tcb_tmbx.tm_syncsig.si_signo = 0;
1553			}
1554			completed = completed->tm_next;
1555		}
1556	}
1557}
1558
1559/*
1560 * This must be called with the scheduling lock held.
1561 */
1562static void
1563kse_check_waitq(struct kse *kse)
1564{
1565	struct pthread	*pthread;
1566	struct timespec ts;
1567
1568	KSE_GET_TOD(kse, &ts);
1569
1570	/*
1571	 * Wake up threads that have timedout.  This has to be
1572	 * done before adding the current thread to the run queue
1573	 * so that a CPU intensive thread doesn't get preference
1574	 * over waiting threads.
1575	 */
1576	while (((pthread = KSE_WAITQ_FIRST(kse)) != NULL) &&
1577	    thr_timedout(pthread, &ts)) {
1578		/* Remove the thread from the wait queue: */
1579		KSE_WAITQ_REMOVE(kse, pthread);
1580		DBG_MSG("Found timedout thread %p in waitq\n", pthread);
1581
1582		/* Indicate the thread timedout: */
1583		pthread->timeout = 1;
1584
1585		/* Add the thread to the priority queue: */
1586		if ((pthread->flags & THR_FLAGS_SUSPENDED) != 0)
1587			THR_SET_STATE(pthread, PS_SUSPENDED);
1588		else {
1589			THR_SET_STATE(pthread, PS_RUNNING);
1590			KSE_RUNQ_INSERT_TAIL(kse, pthread);
1591		}
1592	}
1593}
1594
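/*
 * Return non-zero if the thread's wakeup time has arrived, i.e.
 * curtime >= wakeup_time.  A negative wakeup_time means no timeout.
 */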
1595static int
1596thr_timedout(struct pthread *thread, struct timespec *curtime)
1597{
1598	if (thread->wakeup_time.tv_sec < 0)
1599		return (0);
1600	else if (thread->wakeup_time.tv_sec > curtime->tv_sec)
1601		return (0);
1602	else if ((thread->wakeup_time.tv_sec == curtime->tv_sec) &&
1603	    (thread->wakeup_time.tv_nsec > curtime->tv_nsec))
1604		return (0);
1605	else
1606		return (1);
1607}
1608
1609/*
1610 * This must be called with the scheduling lock held.
1611 *
1612 * Each thread has a time slice, a wakeup time (used when it wants
1613 * to wait for a specified amount of time), a run state, and an
1614 * active flag.
1615 *
1616 * When a thread gets run by the scheduler, the active flag is
1617 * set to non-zero (1).  When a thread performs an explicit yield
1618 * or schedules a state change, it enters the scheduler and the
1619 * active flag is cleared.  When the active flag is still seen
1620 * set in the scheduler, that means that the thread is blocked in
1621 * the kernel (because it is cleared before entering the scheduler
1622 * in all other instances).
1623 *
1624 * The wakeup time is only set for those states that can timeout.
1625 * It is set to (-1, -1) for all other instances.
1626 *
1627 * The thread's run state, aside from being useful when debugging,
1628 * is used to place the thread in an appropriate queue.  There
1629 * are 2 basic queues:
1630 *
1631 *   o run queue - queue ordered by priority for all threads
1632 *                 that are runnable
1633 *   o waiting queue - queue sorted by wakeup time for all threads
1634 *                     that are not otherwise runnable (not blocked
1635 *                     in kernel, not waiting for locks)
1636 *
1637 * The thread's time slice is used for round-robin scheduling
1638 * (the default scheduling policy).  While a SCHED_RR thread
1639 * is runnable its time slice accumulates.  When it reaches
1640 * the time slice interval, it gets reset and added to the end
1641 * of the queue of threads at its priority.  When a thread is
1642 * no longer runnable (blocks in the kernel, waits, etc.), its
1643 * time slice is reset.
1644 *
1645 * The job of kse_switchout_thread() is to handle all of the above.
1646 */
1647static void
1648kse_switchout_thread(struct kse *kse, struct pthread *thread)
1649{
1650	int level;
1651	int i;
1652	int restart;
1653	siginfo_t siginfo;
1654
1655	/*
1656	 * Place the currently running thread into the
1657	 * appropriate queue(s).
1658	 */
1659	DBG_MSG("Switching out thread %p, state %d\n", thread, thread->state);
1660
1661	THR_DEACTIVATE_LAST_LOCK(thread);
1662	if (thread->blocked != 0) {
1663		thread->active = 0;
1664		thread->need_switchout = 0;
1665		/* This thread must have blocked in the kernel. */
1666		/*
1667		 *  Check for pending signals for this thread to
1668		 *  see if we need to interrupt it in the kernel.
1669		 */
1670		if (thread->check_pending != 0) {
1671			for (i = 1; i <= _SIG_MAXSIG; ++i) {
1672				if (SIGISMEMBER(thread->sigpend, i) &&
1673				    !SIGISMEMBER(thread->sigmask, i)) {
1674					restart = _thread_sigact[i - 1].sa_flags & SA_RESTART;
1675					kse_thr_interrupt(&thread->tcb->tcb_tmbx,
1676					    restart ? KSE_INTR_RESTART : KSE_INTR_INTERRUPT, 0);
1677					break;
1678				}
1679			}
1680		}
1681	}
1682	else {
1683		switch (thread->state) {
1684		case PS_DEAD:
1685			/*
1686			 * The scheduler is operating on a different
1687			 * stack.  It is safe to do garbage collecting
1688			 * here.
1689			 */
1690			thread->active = 0;
1691			thread->need_switchout = 0;
1692			thread->lock_switch = 0;
1693			thr_cleanup(kse, thread);
1694			return;
1695			break;
1696
1697		case PS_RUNNING:
1698			if ((thread->flags & THR_FLAGS_SUSPENDED) != 0)
1699				THR_SET_STATE(thread, PS_SUSPENDED);
1700			break;
1701
1702		case PS_COND_WAIT:
1703		case PS_SLEEP_WAIT:
1704			/* Insert into the waiting queue: */
1705			KSE_WAITQ_INSERT(kse, thread);
1706			break;
1707
1708		case PS_LOCKWAIT:
1709			/*
1710			 * This state doesn't timeout.
1711			 */
1712			thread->wakeup_time.tv_sec = -1;
1713			thread->wakeup_time.tv_nsec = -1;
1714			level = thread->locklevel - 1;
1715			if (!_LCK_GRANTED(&thread->lockusers[level]))
1716				KSE_WAITQ_INSERT(kse, thread);
1717			else
1718				THR_SET_STATE(thread, PS_RUNNING);
1719			break;
1720
1721		case PS_SIGWAIT:
1722			KSE_WAITQ_INSERT(kse, thread);
1723			break;
1724		case PS_JOIN:
1725		case PS_MUTEX_WAIT:
1726		case PS_SIGSUSPEND:
1727		case PS_SUSPENDED:
1728		case PS_DEADLOCK:
1729		default:
1730			/*
1731			 * These states don't timeout.
1732			 */
1733			thread->wakeup_time.tv_sec = -1;
1734			thread->wakeup_time.tv_nsec = -1;
1735
1736			/* Insert into the waiting queue: */
1737			KSE_WAITQ_INSERT(kse, thread);
1738			break;
1739		}
1740		thr_accounting(thread);
1741		if (thread->state == PS_RUNNING) {
1742			if (thread->slice_usec == -1) {
1743				/*
1744				 * The thread exceeded its time quantum or
1745				 * it yielded the CPU; place it at the tail
1746				 * of the queue for its priority.
1747				 */
1748				KSE_RUNQ_INSERT_TAIL(kse, thread);
1749			} else {
1750				/*
1751				 * The thread hasn't exceeded its interval.
1752				 * Place it at the head of the queue for its
1753				 * priority.
1754				 */
1755				KSE_RUNQ_INSERT_HEAD(kse, thread);
1756			}
1757		}
1758	}
1759	thread->active = 0;
1760	thread->need_switchout = 0;
1761	if (thread->check_pending != 0) {
1762		/* Install pending signals into the frame. */
1763		thread->check_pending = 0;
1764		KSE_LOCK_ACQUIRE(kse, &_thread_signal_lock);
1765		for (i = 1; i <= _SIG_MAXSIG; i++) {
1766			if (SIGISMEMBER(thread->sigmask, i))
1767				continue;
1768			if (SIGISMEMBER(thread->sigpend, i))
1769				(void)_thr_sig_add(thread, i,
1770				    &thread->siginfo[i-1]);
1771			else if (SIGISMEMBER(_thr_proc_sigpending, i) &&
1772				_thr_getprocsig_unlocked(i, &siginfo)) {
1773				(void)_thr_sig_add(thread, i, &siginfo);
1774			}
1775		}
1776		KSE_LOCK_RELEASE(kse, &_thread_signal_lock);
1777	}
1778}
1779
1780/*
1781 * This function waits for the smallest timeout value of any waiting
1782 * thread, or until it receives a message from another KSE.
1783 *
1784 * This must be called with the scheduling lock held.
1785 */
1786static void
1787kse_wait(struct kse *kse, struct pthread *td_wait, int sigseqno)
1788{
1789	struct timespec ts, ts_sleep;
1790	int saved_flags;
1791
1792	KSE_GET_TOD(kse, &ts);
1793
1794	if ((td_wait == NULL) || (td_wait->wakeup_time.tv_sec < 0)) {
1795		/* Limit sleep to no more than 1 minute. */
1796		ts_sleep.tv_sec = 60;
1797		ts_sleep.tv_nsec = 0;
1798	} else {
1799		TIMESPEC_SUB(&ts_sleep, &td_wait->wakeup_time, &ts);
1800		if (ts_sleep.tv_sec > 60) {
1801			ts_sleep.tv_sec = 60;
1802			ts_sleep.tv_nsec = 0;
1803		}
1804	}
1805	/* Don't sleep for negative times. */
1806	if ((ts_sleep.tv_sec >= 0) && (ts_sleep.tv_nsec >= 0)) {
1807		KSE_SET_IDLE(kse);
1808		kse->k_kseg->kg_idle_kses++;
1809		KSE_SCHED_UNLOCK(kse, kse->k_kseg);
1810		if ((kse->k_kseg->kg_flags & KGF_SINGLE_THREAD) &&
1811		    (kse->k_sigseqno != sigseqno))
1812			; /* don't sleep */
1813		else {
1814			saved_flags = kse->k_kcb->kcb_kmbx.km_flags;
1815			kse->k_kcb->kcb_kmbx.km_flags |= KMF_NOUPCALL;
1816			kse_release(&ts_sleep);
1817			kse->k_kcb->kcb_kmbx.km_flags = saved_flags;
1818		}
1819		KSE_SCHED_LOCK(kse, kse->k_kseg);
1820		if (KSE_IS_IDLE(kse)) {
1821			KSE_CLEAR_IDLE(kse);
1822			kse->k_kseg->kg_idle_kses--;
1823		}
1824	}
1825}
1826
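/*
 * A worked example of the sleep bound computed above (illustrative
 * numbers only): if the earliest waiter wakes 90 seconds from now,
 * TIMESPEC_SUB() yields ts_sleep = {90, 0}, which the cap reduces to
 * {60, 0}; a waiter due in 2.5 seconds yields {2, 500000000} and is
 * used unchanged.  A waiter whose wakeup time has already passed
 * produces a negative ts_sleep, in which case the idle bookkeeping
 * and kse_release() are skipped and kse_wait() returns immediately.
 */
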
1827/*
1828 * This function is deliberately not named kse_exit(), to avoid
1829 * confusing it with the system call of the same name.
1830 */
1831static void
1832kse_fini(struct kse *kse)
1833{
1834	/* struct kse_group *free_kseg = NULL; */
1835	struct timespec ts;
1836
1837	/*
1838	 * Check to see if this is one of the main kses.
1839	 */
1840	if (kse->k_kseg != _kse_initial->k_kseg) {
1841		PANIC("shouldn't get here");
1842		/* This is for supporting thread groups. */
1843#ifdef NOT_YET
1844		/* Remove this KSE from the KSEG's list of KSEs. */
1845		KSE_SCHED_LOCK(kse, kse->k_kseg);
1846		TAILQ_REMOVE(&kse->k_kseg->kg_kseq, kse, k_kgqe);
1847		kse->k_kseg->kg_ksecount--;
1848		if (TAILQ_EMPTY(&kse->k_kseg->kg_kseq))
1849			free_kseg = kse->k_kseg;
1850		KSE_SCHED_UNLOCK(kse, kse->k_kseg);
1851
1852		/*
1853		 * Add this KSE to the list of free KSEs along with
1854		 * the KSEG if it is now orphaned.
1855		 */
1856		KSE_LOCK_ACQUIRE(kse, &kse_lock);
1857		if (free_kseg != NULL)
1858			kseg_free_unlocked(free_kseg);
1859		kse_free_unlocked(kse);
1860		KSE_LOCK_RELEASE(kse, &kse_lock);
1861		kse_exit();
1862		/* Never returns. */
1863		PANIC("kse_exit()");
1864#endif
1865	} else {
1866#ifdef NOT_YET
1867		/*
1868		 * In the future, we might allow the program to kill
1869		 * a KSE in the initial group.
1870		 */
1871		if (kse != _kse_initial) {
1872			KSE_SCHED_LOCK(kse, kse->k_kseg);
1873			TAILQ_REMOVE(&kse->k_kseg->kg_kseq, kse, k_kgqe);
1874			kse->k_kseg->kg_ksecount--;
1875			KSE_SCHED_UNLOCK(kse, kse->k_kseg);
1876			KSE_LOCK_ACQUIRE(kse, &kse_lock);
1877			kse_free_unlocked(kse);
1878			KSE_LOCK_RELEASE(kse, &kse_lock);
1879			kse_exit();
1880			/* Never returns. */
1881			PANIC("kse_exit() failed for initial kseg");
1882		}
1883#endif
1884		KSE_SCHED_LOCK(kse, kse->k_kseg);
1885		KSE_SET_IDLE(kse);
1886		kse->k_kseg->kg_idle_kses++;
1887		KSE_SCHED_UNLOCK(kse, kse->k_kseg);
1888		ts.tv_sec = 120;
1889		ts.tv_nsec = 0;
1890		kse->k_kcb->kcb_kmbx.km_flags = 0;
1891		kse_release(&ts);
1892		/* Never returns. */
1893	}
1894}
1895
1896void
1897_thr_set_timeout(const struct timespec *timeout)
1898{
1899	struct pthread	*curthread = _get_curthread();
1900	struct timespec ts;
1901
1902	/* Reset the timeout flag for the running thread: */
1903	curthread->timeout = 0;
1904
1905	/* Check if the thread is to wait forever: */
1906	if (timeout == NULL) {
1907		/*
1908		 * Set the wakeup time to something that can be recognised as
1909		 * different from an actual time of day:
1910		 */
1911		curthread->wakeup_time.tv_sec = -1;
1912		curthread->wakeup_time.tv_nsec = -1;
1913	}
1914	/* Check if no waiting is required: */
1915	else if ((timeout->tv_sec == 0) && (timeout->tv_nsec == 0)) {
1916		/* Set the wake up time to 'immediately': */
1917		curthread->wakeup_time.tv_sec = 0;
1918		curthread->wakeup_time.tv_nsec = 0;
1919	} else {
1920		/* Calculate the time for the current thread to wakeup: */
1921		KSE_GET_TOD(curthread->kse, &ts);
1922		TIMESPEC_ADD(&curthread->wakeup_time, &ts, timeout);
1923	}
1924}
1925
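/*
 * Usage sketch (illustrative, not code from this file): a blocking
 * primitive would typically convert its caller-supplied relative
 * timeout into a wakeup time, block, and then check the timeout flag
 * once it runs again:
 *
 *	_thr_set_timeout(&rel_timeout);
 *	THR_SET_STATE(curthread, PS_COND_WAIT);
 *	_thr_sched_switch(curthread);
 *	if (curthread->timeout != 0)
 *		return (ETIMEDOUT);
 *
 * rel_timeout is a placeholder for the caller's interval (NULL means
 * wait forever), and the switch routine named here stands in for
 * whatever scheduler entry the real wait paths use.
 */
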
1926void
1927_thr_panic_exit(char *file, int line, char *msg)
1928{
1929	char buf[256];
1930
1931	snprintf(buf, sizeof(buf), "(%s:%d) %s\n", file, line, msg);
1932	__sys_write(2, buf, strlen(buf));
1933	abort();
1934}
1935
1936void
1937_thr_setrunnable(struct pthread *curthread, struct pthread *thread)
1938{
1939	kse_critical_t crit;
1940	struct kse_mailbox *kmbx;
1941
1942	crit = _kse_critical_enter();
1943	KSE_SCHED_LOCK(curthread->kse, thread->kseg);
1944	kmbx = _thr_setrunnable_unlocked(thread);
1945	KSE_SCHED_UNLOCK(curthread->kse, thread->kseg);
1946	_kse_critical_leave(crit);
1947	if (kmbx != NULL)
1948		kse_wakeup(kmbx);
1949}
1950
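/*
 * Make a thread runnable without taking any locks; the caller must
 * already hold the scheduling lock for the thread's KSEG (see
 * _thr_setrunnable() above).  Returns the mailbox of an idle KSE that
 * the caller should wake with kse_wakeup(), or NULL if no wakeup is
 * needed.
 */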
1951struct kse_mailbox *
1952_thr_setrunnable_unlocked(struct pthread *thread)
1953{
1954	struct kse_mailbox *kmbx = NULL;
1955
1956	if ((thread->kseg->kg_flags & KGF_SINGLE_THREAD) != 0) {
1957		/* No silly queues for these threads. */
1958		if ((thread->flags & THR_FLAGS_SUSPENDED) != 0)
1959			THR_SET_STATE(thread, PS_SUSPENDED);
1960		else {
1961			THR_SET_STATE(thread, PS_RUNNING);
1962			kmbx = kse_wakeup_one(thread);
1963		}
1964
1965	} else if (thread->state != PS_RUNNING) {
1966		if ((thread->flags & THR_FLAGS_IN_WAITQ) != 0)
1967			KSE_WAITQ_REMOVE(thread->kse, thread);
1968		if ((thread->flags & THR_FLAGS_SUSPENDED) != 0)
1969			THR_SET_STATE(thread, PS_SUSPENDED);
1970		else {
1971			THR_SET_STATE(thread, PS_RUNNING);
1972			if ((thread->blocked == 0) && (thread->active == 0) &&
1973			    (thread->flags & THR_FLAGS_IN_RUNQ) == 0)
1974				THR_RUNQ_INSERT_TAIL(thread);
1975			/*
1976			 * XXX - Threads are not yet assigned to specific
1977			 *       KSEs; they are assigned to the KSEG.  So
1978			 *       the fact that a thread's KSE is waiting
1979			 *       doesn't necessarily mean that it will be
1980			 *       the KSE that runs the thread after the
1981			 *       lock is granted.  But we don't know if the
1982			 *       other KSEs within the same KSEG are also
1983			 *       in a waiting state or not, so we err on the
1984			 *       side of caution and wake up the thread's
1985			 *       last known KSE.  We ensure that the
1986			 *       thread's KSE doesn't change while its
1987			 *       scheduling lock is held, so it is safe to
1988			 *       reference it (the KSE).  If the KSE wakes
1989			 *       up and doesn't find any more work it will
1990			 *       again go back to waiting so no harm is
1991			 *       done.
1992			 */
1993			kmbx = kse_wakeup_one(thread);
1994		}
1995	}
1996	return (kmbx);
1997}
1998
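/*
 * Find an idle KSE that can run the given thread, preferring the
 * thread's last known KSE and otherwise searching its KSE group.
 * The chosen KSE has its idle state cleared and its mailbox returned
 * so the caller can issue the kse_wakeup(); NULL means no KSE in the
 * group is idle.
 */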
1999static struct kse_mailbox *
2000kse_wakeup_one(struct pthread *thread)
2001{
2002	struct kse *ke;
2003
2004	if (KSE_IS_IDLE(thread->kse)) {
2005		KSE_CLEAR_IDLE(thread->kse);
2006		thread->kseg->kg_idle_kses--;
2007		return (&thread->kse->k_kcb->kcb_kmbx);
2008	} else {
2009		TAILQ_FOREACH(ke, &thread->kseg->kg_kseq, k_kgqe) {
2010			if (KSE_IS_IDLE(ke)) {
2011				KSE_CLEAR_IDLE(ke);
2012				ke->k_kseg->kg_idle_kses--;
2013				return (&ke->k_kcb->kcb_kmbx);
2014			}
2015		}
2016	}
2017	return (NULL);
2018}
2019
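/*
 * Wake up idle KSEs in the current KSE group, at most one for each
 * thread currently sitting on the run queue.
 */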
2020static void
2021kse_wakeup_multi(struct kse *curkse)
2022{
2023	struct kse *ke;
2024	int tmp;
2025
2026	if ((tmp = KSE_RUNQ_THREADS(curkse)) && curkse->k_kseg->kg_idle_kses) {
2027		TAILQ_FOREACH(ke, &curkse->k_kseg->kg_kseq, k_kgqe) {
2028			if (KSE_IS_IDLE(ke)) {
2029				KSE_CLEAR_IDLE(ke);
2030				ke->k_kseg->kg_idle_kses--;
2031				KSE_WAKEUP(ke);
2032				if (--tmp == 0)
2033					break;
2034			}
2035		}
2036	}
2037}
2038
2039/*
2040 * Allocate a new KSEG.
2041 *
2042 * We allow the current thread to be NULL in the case that this
2043 * is the first time a KSEG is being created (library initialization).
2044 * In this case, we don't need to (and can't) take any locks.
2045 */
2046struct kse_group *
2047_kseg_alloc(struct pthread *curthread)
2048{
2049	struct kse_group *kseg = NULL;
2050	kse_critical_t crit;
2051
2052	if ((curthread != NULL) && (free_kseg_count > 0)) {
2053		/* Use the kse lock for the kseg queue. */
2054		crit = _kse_critical_enter();
2055		KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
2056		if ((kseg = TAILQ_FIRST(&free_kse_groupq)) != NULL) {
2057			TAILQ_REMOVE(&free_kse_groupq, kseg, kg_qe);
2058			free_kseg_count--;
2059			active_kseg_count++;
2060			TAILQ_INSERT_TAIL(&active_kse_groupq, kseg, kg_qe);
2061		}
2062		KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
2063		_kse_critical_leave(crit);
2064		if (kseg)
2065			kseg_reinit(kseg);
2066	}
2067
2068	/*
2069	 * If a KSE group wasn't found in the free list above, attempt to
2070	 * allocate a new one.  It is given a priority run queue and then
2071	 * added to the list of active KSE groups.
2072	 */
2073	if ((kseg == NULL) &&
2074	    ((kseg = (struct kse_group *)malloc(sizeof(*kseg))) != NULL)) {
2075		if (_pq_alloc(&kseg->kg_schedq.sq_runq,
2076		    THR_MIN_PRIORITY, THR_LAST_PRIORITY) != 0) {
2077			free(kseg);
2078			kseg = NULL;
2079		} else {
2080			kseg_init(kseg);
2081			/* Add the KSEG to the list of active KSEGs. */
2082			if (curthread != NULL) {
2083				crit = _kse_critical_enter();
2084				KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
2085				active_kseg_count++;
2086				TAILQ_INSERT_TAIL(&active_kse_groupq,
2087				    kseg, kg_qe);
2088				KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
2089				_kse_critical_leave(crit);
2090			} else {
2091				active_kseg_count++;
2092				TAILQ_INSERT_TAIL(&active_kse_groupq,
2093				    kseg, kg_qe);
2094			}
2095		}
2096	}
2097	return (kseg);
2098}
2099
2100static void
2101kseg_init(struct kse_group *kseg)
2102{
2103	kseg_reinit(kseg);
2104	_lock_init(&kseg->kg_lock, LCK_ADAPTIVE, _kse_lock_wait,
2105	    _kse_lock_wakeup);
2106}
2107
2108static void
2109kseg_reinit(struct kse_group *kseg)
2110{
2111	TAILQ_INIT(&kseg->kg_kseq);
2112	TAILQ_INIT(&kseg->kg_threadq);
2113	TAILQ_INIT(&kseg->kg_schedq.sq_waitq);
2114	kseg->kg_threadcount = 0;
2115	kseg->kg_ksecount = 0;
2116	kseg->kg_idle_kses = 0;
2117	kseg->kg_flags = 0;
2118}
2119
2120/*
2121 * This must be called with the kse lock held and when there are
2122 * no more threads that reference the KSE group.
2123 */
2124static void
2125kseg_free_unlocked(struct kse_group *kseg)
2126{
2127	TAILQ_REMOVE(&active_kse_groupq, kseg, kg_qe);
2128	TAILQ_INSERT_HEAD(&free_kse_groupq, kseg, kg_qe);
2129	free_kseg_count++;
2130	active_kseg_count--;
2131}
2132
2133void
2134_kseg_free(struct kse_group *kseg)
2135{
2136	struct kse *curkse;
2137	kse_critical_t crit;
2138
2139	crit = _kse_critical_enter();
2140	curkse = _get_curkse();
2141	KSE_LOCK_ACQUIRE(curkse, &kse_lock);
2142	kseg_free_unlocked(kseg);
2143	KSE_LOCK_RELEASE(curkse, &kse_lock);
2144	_kse_critical_leave(crit);
2145}
2146
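/*
 * Fully release a KSE group: destroy its lock and its priority run
 * queue and free the memory.  This is the counterpart of caching a
 * KSEG on the free list via kseg_free_unlocked().
 */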
2147static void
2148kseg_destroy(struct kse_group *kseg)
2149{
2150	_lock_destroy(&kseg->kg_lock);
2151	_pq_free(&kseg->kg_schedq.sq_runq);
2152	free(kseg);
2153}
2154
2155/*
2156 * Allocate a new KSE.
2157 *
2158 * We allow the current thread to be NULL in the case that this
2159 * is the first time a KSE is being created (library initialization).
2160 * In this case, we don't need to (and can't) take any locks.
2161 */
2162struct kse *
2163_kse_alloc(struct pthread *curthread, int sys_scope)
2164{
2165	struct kse *kse = NULL;
2166	char *stack;
2167	kse_critical_t crit;
2168	int i;
2169
2170	if ((curthread != NULL) && (free_kse_count > 0)) {
2171		crit = _kse_critical_enter();
2172		KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
2173		/* Search for a finished KSE. */
2174		kse = TAILQ_FIRST(&free_kseq);
2175		while ((kse != NULL) &&
2176		    ((kse->k_kcb->kcb_kmbx.km_flags & KMF_DONE) == 0)) {
2177			kse = TAILQ_NEXT(kse, k_qe);
2178		}
2179		if (kse != NULL) {
2180			DBG_MSG("found an unused kse.\n");
2181			TAILQ_REMOVE(&free_kseq, kse, k_qe);
2182			free_kse_count--;
2183			TAILQ_INSERT_TAIL(&active_kseq, kse, k_qe);
2184			active_kse_count++;
2185		}
2186		KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
2187		_kse_critical_leave(crit);
2188		if (kse != NULL)
2189			kse_reinit(kse, sys_scope);
2190	}
2191	if ((kse == NULL) &&
2192	    ((kse = (struct kse *)malloc(sizeof(*kse))) != NULL)) {
2193		if (sys_scope != 0)
2194			stack = NULL;
2195		else if ((stack = malloc(KSE_STACKSIZE)) == NULL) {
2196			free(kse);
2197			return (NULL);
2198		}
2199		bzero(kse, sizeof(*kse));
2200
2201		/* Initialize KCB without the lock. */
2202		if ((kse->k_kcb = _kcb_ctor(kse)) == NULL) {
2203			if (stack != NULL)
2204				free(stack);
2205			free(kse);
2206			return (NULL);
2207		}
2208
2209		/* Initialize the lockusers. */
2210		for (i = 0; i < MAX_KSE_LOCKLEVEL; i++) {
2211			_lockuser_init(&kse->k_lockusers[i], (void *)kse);
2212			_LCK_SET_PRIVATE2(&kse->k_lockusers[i], NULL);
2213		}
2214		/* _lock_init(kse->k_lock, ...) */
2215
2216		if (curthread != NULL) {
2217			crit = _kse_critical_enter();
2218			KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
2219		}
2220		kse->k_flags = 0;
2221		TAILQ_INSERT_TAIL(&active_kseq, kse, k_qe);
2222		active_kse_count++;
2223		if (curthread != NULL) {
2224			KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
2225			_kse_critical_leave(crit);
2226		}
2227		/*
2228		 * Create the KSE context.
2229		 * Scope system threads (one thread per KSE) are not required
2230		 * to have a stack for an unneeded kse upcall.
2231		 */
2232		if (!sys_scope) {
2233			kse->k_kcb->kcb_kmbx.km_func = (kse_func_t *)kse_sched_multi;
2234			kse->k_stack.ss_sp = stack;
2235			kse->k_stack.ss_size = KSE_STACKSIZE;
2236		} else {
2237			kse->k_kcb->kcb_kmbx.km_func = (kse_func_t *)kse_sched_single;
2238			kse->k_stack.ss_sp = NULL;
2239			kse->k_stack.ss_size = 0;
2240		}
2241		kse->k_kcb->kcb_kmbx.km_udata = (void *)kse;
2242		kse->k_kcb->kcb_kmbx.km_quantum = 20000;
2243		/*
2244		 * We need to keep a copy of the stack in case it
2245		 * doesn't get used; a KSE running a scope system
2246		 * thread will use that thread's stack.
2247		 */
2248		kse->k_kcb->kcb_kmbx.km_stack = kse->k_stack;
2249	}
2250	return (kse);
2251}
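
/*
 * Allocation sketch (illustrative only; the real thread-creation path
 * lives elsewhere): a caller building a system scope thread would
 * typically allocate a KSE group and a KSE, then attach the KSE to the
 * group before handing both to the new thread:
 *
 *	if ((kseg = _kseg_alloc(curthread)) == NULL)
 *		goto fail;
 *	if ((kse = _kse_alloc(curthread, 1)) == NULL)
 *		goto fail;
 *	kse->k_kseg = kseg;
 *	kse->k_schedq = &kseg->kg_schedq;
 *	TAILQ_INSERT_TAIL(&kseg->kg_kseq, kse, k_kgqe);
 *	kseg->kg_ksecount = 1;
 *
 * Error handling and the remaining thread setup are omitted; on
 * failure the pieces would be returned with _kseg_free() and
 * _kse_free().
 */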
2252
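/*
 * Reset a KSE taken from the free list so it can be reused.  Process
 * scope KSEs get (or keep) an upcall stack and the multi-threaded
 * scheduler entry point; system scope KSEs drop the stack and use the
 * single-threaded scheduler with no quantum.
 */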
2253static void
2254kse_reinit(struct kse *kse, int sys_scope)
2255{
2256	if (!sys_scope) {
2257		kse->k_kcb->kcb_kmbx.km_func = (kse_func_t *)kse_sched_multi;
2258		if (kse->k_stack.ss_sp == NULL) {
2259			if ((kse->k_stack.ss_sp = malloc(KSE_STACKSIZE)) == NULL)
2260				PANIC("kse_reinit: unable to allocate KSE stack");
2261			kse->k_stack.ss_size = KSE_STACKSIZE;
2262		}
2263		kse->k_kcb->kcb_kmbx.km_quantum = 20000;
2264	} else {
2265		kse->k_kcb->kcb_kmbx.km_func = (kse_func_t *)kse_sched_single;
2266		if (kse->k_stack.ss_sp)
2267			free(kse->k_stack.ss_sp);
2268		kse->k_stack.ss_sp = NULL;
2269		kse->k_stack.ss_size = 0;
2270		kse->k_kcb->kcb_kmbx.km_quantum = 0;
2271	}
2272	kse->k_kcb->kcb_kmbx.km_stack = kse->k_stack;
2273	kse->k_kcb->kcb_kmbx.km_udata = (void *)kse;
2274	kse->k_kcb->kcb_kmbx.km_curthread = NULL;
2275	kse->k_kcb->kcb_kmbx.km_flags = 0;
2276	kse->k_curthread = NULL;
2277	kse->k_kseg = NULL;
2278	kse->k_schedq = NULL;
2279	kse->k_locklevel = 0;
2280	SIGEMPTYSET(kse->k_sigmask);
2281	bzero(&kse->k_sigq, sizeof(kse->k_sigq));
2282	kse->k_check_sigq = 0;
2283	kse->k_flags = 0;
2284	kse->k_waiting = 0;
2285	kse->k_idle = 0;
2286	kse->k_error = 0;
2287	kse->k_cpu = 0;
2288	kse->k_done = 0;
2289	kse->k_switch = 0;
2290	kse->k_sigseqno = 0;
2291}
2292
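/*
 * Move a KSE from the active list to the free list without locking;
 * the caller either holds the kse lock (see _kse_free() below) or
 * knows that no other KSEs can be running yet.
 */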
2293void
2294kse_free_unlocked(struct kse *kse)
2295{
2296	TAILQ_REMOVE(&active_kseq, kse, k_qe);
2297	active_kse_count--;
2298	kse->k_kseg = NULL;
2299	kse->k_kcb->kcb_kmbx.km_quantum = 20000;
2300	kse->k_flags = 0;
2301	TAILQ_INSERT_HEAD(&free_kseq, kse, k_qe);
2302	free_kse_count++;
2303}
2304
2305void
2306_kse_free(struct pthread *curthread, struct kse *kse)
2307{
2308	kse_critical_t crit;
2309
2310	if (curthread == NULL)
2311		kse_free_unlocked(kse);
2312	else {
2313		crit = _kse_critical_enter();
2314		KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
2315		kse_free_unlocked(kse);
2316		KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
2317		_kse_critical_leave(crit);
2318	}
2319}
2320
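/*
 * Release everything a cached KSE owns: its upcall stack, its KCB,
 * its lockusers, and its lock, followed by the structure itself.
 */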
2321static void
2322kse_destroy(struct kse *kse)
2323{
2324	int i;
2325
2326	if (kse->k_stack.ss_sp != NULL)
2327		free(kse->k_stack.ss_sp);
2328	_kcb_dtor(kse->k_kcb);
2329	for (i = 0; i < MAX_KSE_LOCKLEVEL; ++i)
2330		_lockuser_destroy(&kse->k_lockusers[i]);
2331	_lock_destroy(&kse->k_lock);
2332	free(kse);
2333}
2334
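/*
 * Allocate a thread structure.  A cached structure from the free
 * thread list is preferred (after giving the garbage collector a
 * chance to run); otherwise a new one is malloc()'d and given a TCB.
 */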
2335struct pthread *
2336_thr_alloc(struct pthread *curthread)
2337{
2338	kse_critical_t crit;
2339	struct pthread *thread = NULL;
2340
2341	if (curthread != NULL) {
2342		if (GC_NEEDED())
2343			_thr_gc(curthread);
2344		if (free_thread_count > 0) {
2345			crit = _kse_critical_enter();
2346			KSE_LOCK_ACQUIRE(curthread->kse, &thread_lock);
2347			if ((thread = TAILQ_FIRST(&free_threadq)) != NULL) {
2348				TAILQ_REMOVE(&free_threadq, thread, tle);
2349				free_thread_count--;
2350			}
2351			KSE_LOCK_RELEASE(curthread->kse, &thread_lock);
2352			_kse_critical_leave(crit);
2353		}
2354	}
2355	if ((thread == NULL) &&
2356	    ((thread = malloc(sizeof(struct pthread))) != NULL)) {
2357		bzero(thread, sizeof(struct pthread));
2358		if ((thread->tcb = _tcb_ctor(thread)) == NULL) {
2359			free(thread);
2360			thread = NULL;
2361		}
2362	}
2363	return (thread);
2364}
2365
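/*
 * Release a thread structure.  When there is no current thread, or
 * once the cache already holds MAX_CACHED_THREADS entries, the
 * structure is destroyed outright; otherwise it is reset and placed
 * on the free thread list for reuse by _thr_alloc().
 */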
2366void
2367_thr_free(struct pthread *curthread, struct pthread *thread)
2368{
2369	kse_critical_t crit;
2370	int i;
2371
2372	DBG_MSG("Freeing thread %p\n", thread);
2373	if ((curthread == NULL) || (free_thread_count >= MAX_CACHED_THREADS)) {
2374		for (i = 0; i < MAX_THR_LOCKLEVEL; i++) {
2375			_lockuser_destroy(&thread->lockusers[i]);
2376		}
2377		_lock_destroy(&thread->lock);
2378		_tcb_dtor(thread->tcb);
2379		free(thread);
2380	}
2381	else {
2382		/* Reinitialize any important fields here. */
2383		thread->lock_switch = 0;
2384		sigemptyset(&thread->sigpend);
2385		thread->check_pending = 0;
2386
2387		/* Add the thread to the free thread list. */
2388		crit = _kse_critical_enter();
2389		KSE_LOCK_ACQUIRE(curthread->kse, &thread_lock);
2390		TAILQ_INSERT_TAIL(&free_threadq, thread, tle);
2391		free_thread_count++;
2392		KSE_LOCK_RELEASE(curthread->kse, &thread_lock);
2393		_kse_critical_leave(crit);
2394	}
2395}
2396
2397/*
2398 * Add an active thread:
2399 *
2400 *   o Assign the thread a unique id (which GDB uses to track
2401 *     threads).
2402 *   o Add the thread to the list of all threads and increment
2403 *     the number of active threads.
2404 */
2405static void
2406thr_link(struct pthread *thread)
2407{
2408	kse_critical_t crit;
2409	struct kse *curkse;
2410	struct pthread *curthread;
2411
2412	crit = _kse_critical_enter();
2413	curkse = _get_curkse();
2414	curthread = _get_curthread();
2415	thread->sigmask = curthread->sigmask;
2416	KSE_LOCK_ACQUIRE(curkse, &_thread_list_lock);
2417	/*
2418	 * Initialize the unique id (which GDB uses to track
2419	 * threads), add the thread to the list of all threads,
2420	 * and increment the number of active threads.
2421	 */
2422	thread->uniqueid = next_uniqueid++;
2423	THR_LIST_ADD(thread);
2424	active_threads++;
2425	KSE_LOCK_RELEASE(curkse, &_thread_list_lock);
2426	_kse_critical_leave(crit);
2427}
2428
2429/*
2430 * Remove an active thread.
2431 */
2432static void
2433thr_unlink(struct pthread *thread)
2434{
2435	kse_critical_t crit;
2436	struct kse *curkse;
2437
2438	crit = _kse_critical_enter();
2439	curkse = _get_curkse();
2440	KSE_LOCK_ACQUIRE(curkse, &_thread_list_lock);
2441	THR_LIST_REMOVE(thread);
2442	active_threads--;
2443	KSE_LOCK_RELEASE(curkse, &_thread_list_lock);
2444	_kse_critical_leave(crit);
2445}
2446
2447void
2448_thr_hash_add(struct pthread *thread)
2449{
2450	struct thread_hash_head *head;
2451
2452	head = &thr_hashtable[THREAD_HASH(thread)];
2453	LIST_INSERT_HEAD(head, thread, hle);
2454}
2455
2456void
2457_thr_hash_remove(struct pthread *thread)
2458{
2459	LIST_REMOVE(thread, hle);
2460}
2461
2462struct pthread *
2463_thr_hash_find(struct pthread *thread)
2464{
2465	struct pthread *td;
2466	struct thread_hash_head *head;
2467
2468	head = &thr_hashtable[THREAD_HASH(thread)];
2469	LIST_FOREACH(td, head, hle) {
2470		if (td == thread)
2471			return (thread);
2472	}
2473	return (NULL);
2474}
2475
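/*
 * Usage sketch (illustrative only): the hash table maps a thread
 * pointer back to itself only while the thread is registered, so it
 * can be used to validate a possibly stale pthread_t:
 *
 *	_thr_hash_add(thread);
 *	...
 *	if (_thr_hash_find(thread) == thread) {
 *		...	the thread is still known to the library
 *	}
 *	...
 *	_thr_hash_remove(thread);
 *
 * The add and remove calls would normally be made while holding the
 * thread list lock; that locking is the caller's responsibility.
 */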
2476