thr_kern.c revision 118676
1/*
2 * Copyright (C) 2003 Daniel M. Eischen <deischen@freebsd.org>
3 * Copyright (C) 2002 Jonathon Mini <mini@freebsd.org>
4 * Copyright (c) 1995-1998 John Birrell <jb@cimlogic.com.au>
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 * 3. All advertising materials mentioning features or use of this software
16 *    must display the following acknowledgement:
17 *	This product includes software developed by John Birrell.
18 * 4. Neither the name of the author nor the names of any co-contributors
19 *    may be used to endorse or promote products derived from this software
20 *    without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY JOHN BIRRELL AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 */
35#include <sys/cdefs.h>
36__FBSDID("$FreeBSD: head/lib/libkse/thread/thr_kern.c 118676 2003-08-08 22:20:59Z davidxu $");
37
38#include <sys/types.h>
39#include <sys/kse.h>
40#include <sys/signalvar.h>
41#include <sys/queue.h>
42#include <machine/atomic.h>
43#include <machine/sigframe.h>
44
45#include <assert.h>
46#include <errno.h>
47#include <signal.h>
48#include <stdlib.h>
49#include <string.h>
50#include <time.h>
51#include <ucontext.h>
52#include <unistd.h>
53
54#include "atomic_ops.h"
55#include "thr_private.h"
56#include "libc_private.h"
57
58/*#define DEBUG_THREAD_KERN */
59#ifdef DEBUG_THREAD_KERN
60#define DBG_MSG		stdout_debug
61#else
62#define DBG_MSG(x...)
63#endif
64
65/*
66 * Define a high water mark for the maximum number of threads that
67 * will be cached.  Once this level is reached, any extra threads
68 * will be free()'d.
69 */
70#define	MAX_CACHED_THREADS	100
71/*
72 * Define high water marks for the maximum number of KSEs and KSE groups
73 * that will be cached.  Because we support 1:1 threading, there can be as
74 * many KSEs and KSE groups as there are threads.  Once these levels are
75 * reached, any extra KSEs and KSE groups will be free()'d.
76 */
77#ifdef SYSTEM_SCOPE_ONLY
78#define	MAX_CACHED_KSES		100
79#define	MAX_CACHED_KSEGS	100
80#else
81#define	MAX_CACHED_KSES		50
82#define	MAX_CACHED_KSEGS	50
83#endif
84
85#define	KSE_STACKSIZE		16384
86
87#define	KSE_SET_MBOX(kse, thrd) \
88	(kse)->k_kcb->kcb_kmbx.km_curthread = &(thrd)->tcb->tcb_tmbx
89
90#define	KSE_SET_EXITED(kse)	(kse)->k_flags |= KF_EXITED
91
92/*
93 * Macros for manipulating the run queues.  The priority queue
94 * routines use the thread's pqe link and also handle the setting
95 * and clearing of the thread's THR_FLAGS_IN_RUNQ flag.
96 */
97#define	KSE_RUNQ_INSERT_HEAD(kse, thrd)			\
98	_pq_insert_head(&(kse)->k_schedq->sq_runq, thrd)
99#define	KSE_RUNQ_INSERT_TAIL(kse, thrd)			\
100	_pq_insert_tail(&(kse)->k_schedq->sq_runq, thrd)
101#define	KSE_RUNQ_REMOVE(kse, thrd)			\
102	_pq_remove(&(kse)->k_schedq->sq_runq, thrd)
103#define	KSE_RUNQ_FIRST(kse)	_pq_first(&(kse)->k_schedq->sq_runq)
104
105#define KSE_RUNQ_THREADS(kse)	((kse)->k_schedq->sq_runq.pq_threads)
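
/*
 * Illustrative use of the run queue macros by the scheduler, with the
 * KSE's scheduling lock held (a sketch of the pattern used in
 * kse_sched_multi() below):
 *
 *	KSE_SCHED_LOCK(curkse, curkse->k_kseg);
 *	td = KSE_RUNQ_FIRST(curkse);
 *	if (td != NULL)
 *		KSE_RUNQ_REMOVE(curkse, td);
 *	KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
 */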
106
107/*
108 * We've got to keep track of everything that is allocated, not only
109 * to have a speedy free list, but also so they can be deallocated
110 * after a fork().
111 */
112static TAILQ_HEAD(, kse)	active_kseq;
113static TAILQ_HEAD(, kse)	free_kseq;
114static TAILQ_HEAD(, kse_group)	free_kse_groupq;
115static TAILQ_HEAD(, kse_group)	active_kse_groupq;
116static TAILQ_HEAD(, kse_group)	gc_ksegq;
117static struct lock		kse_lock;	/* also used for kseg queue */
118static int			free_kse_count = 0;
119static int			free_kseg_count = 0;
120static TAILQ_HEAD(, pthread)	free_threadq;
121static struct lock		thread_lock;
122static int			free_thread_count = 0;
123static int			inited = 0;
124static int			active_threads = 1;
125static int			active_kse_count = 0;
126static int			active_kseg_count = 0;
127static u_int64_t		next_uniqueid = 1;
128
129LIST_HEAD(thread_hash_head, pthread);
130#define THREAD_HASH_QUEUES	127
131static struct thread_hash_head	thr_hashtable[THREAD_HASH_QUEUES];
132#define	THREAD_HASH(thrd)	((unsigned long)thrd % THREAD_HASH_QUEUES)
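
/*
 * THREAD_HASH reduces the thread pointer modulo the number of buckets,
 * so a given struct pthread * always maps to the same chain in
 * thr_hashtable; for example (illustrative only):
 *
 *	struct thread_hash_head *head = &thr_hashtable[THREAD_HASH(thrd)];
 */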
133
134#ifdef DEBUG_THREAD_KERN
135static void	dump_queues(struct kse *curkse);
136#endif
137static void	kse_check_completed(struct kse *kse);
138static void	kse_check_waitq(struct kse *kse);
139static void	kse_fini(struct kse *curkse);
140static void	kse_reinit(struct kse *kse, int sys_scope);
141static void	kse_sched_multi(struct kse_mailbox *kmbx);
142static void	kse_sched_single(struct kse_mailbox *kmbx);
143static void	kse_switchout_thread(struct kse *kse, struct pthread *thread);
144static void	kse_wait(struct kse *kse, struct pthread *td_wait, int sigseq);
145static void	kse_free_unlocked(struct kse *kse);
146static void	kse_destroy(struct kse *kse);
147static void	kseg_free_unlocked(struct kse_group *kseg);
148static void	kseg_init(struct kse_group *kseg);
149static void	kseg_reinit(struct kse_group *kseg);
150static void	kseg_destroy(struct kse_group *kseg);
151static void	kse_waitq_insert(struct pthread *thread);
152static void	kse_wakeup_multi(struct kse *curkse);
153static struct kse_mailbox *kse_wakeup_one(struct pthread *thread);
154static void	thr_cleanup(struct kse *kse, struct pthread *curthread);
155static void	thr_link(struct pthread *thread);
156static void	thr_resume_wrapper(int sig, siginfo_t *, ucontext_t *);
157static void	thr_resume_check(struct pthread *curthread, ucontext_t *ucp,
158		    struct pthread_sigframe *psf);
159static int	thr_timedout(struct pthread *thread, struct timespec *curtime);
160static void	thr_unlink(struct pthread *thread);
161static void	thread_gc(struct pthread *thread);
162static void	kse_gc(struct pthread *thread);
163static void	kseg_gc(struct pthread *thread);
164
165static __inline void
166kse_set_curthread(struct kse *kse, struct pthread *td)
167{
168	kse->k_curthread = td;
169	if (td != NULL)
170		_tcb_set(kse->k_kcb, td->tcb);
171	else
172		_tcb_set(kse->k_kcb, NULL);
173}
174
175static void __inline
176thr_accounting(struct pthread *thread)
177{
178	if ((thread->slice_usec != -1) &&
179	    (thread->slice_usec <= TIMESLICE_USEC) &&
180	    (thread->attr.sched_policy != SCHED_FIFO)) {
181		thread->slice_usec += (thread->tcb->tcb_tmbx.tm_uticks
182		    + thread->tcb->tcb_tmbx.tm_sticks) * _clock_res_usec;
183		/* Check for time quantum exceeded: */
184		if (thread->slice_usec > TIMESLICE_USEC)
185			thread->slice_usec = -1;
186	}
187	thread->tcb->tcb_tmbx.tm_uticks = 0;
188	thread->tcb->tcb_tmbx.tm_sticks = 0;
189}
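
/*
 * Note: tm_uticks and tm_sticks count the user and system clock ticks
 * charged to the thread since the last update, so each call above adds
 * (ticks * _clock_res_usec) microseconds to slice_usec.  A slice_usec
 * of -1 means the quantum has been used up (or the thread has not yet
 * run); the scheduler resets it to 0 when the thread is run again.
 */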
190
191/*
192 * This is called after a fork().
193 * No locks need to be taken here since we are guaranteed to be
194 * single threaded.
195 *
196 * XXX
197 * POSIX says that in a threaded process, fork() should be used
198 * only to run new programs, and the effects of calling functions
199 * that require certain resources between the call to fork() and
200 * the call to an exec function are undefined.
201 *
202 * For that reason it is not safe to reinitialize the library after
203 * fork(): memory management may be corrupted, so further calls to
204 * malloc()/free() may cause undefined behavior.
205 */
206void
207_kse_single_thread(struct pthread *curthread)
208{
209#ifdef NOTYET
210	struct kse *kse;
211	struct kse_group *kseg;
212	struct pthread *thread;
213	kse_critical_t crit;
214	int i;
215
216
217	/*
218	 * Disable upcalls and clear the threaded flag.
219	 * XXX - I don't think we need to disable upcalls after a fork(),
220	 *       but it doesn't hurt.
221	 */
222	crit = _kse_critical_enter();
223	__isthreaded = 0;
224	active_threads = 1;
225	_thr_signal_deinit();
226
227	/*
228	 * Enter a loop to remove and free all threads other than
229	 * the running thread from the active thread list:
230	 */
231	while ((thread = TAILQ_FIRST(&_thread_list)) != NULL) {
232		THR_GCLIST_REMOVE(thread);
233		/*
234		 * Remove this thread from the list (the current
235		 * thread will be removed but re-added by libpthread
236		 * initialization).
237		 */
238		TAILQ_REMOVE(&_thread_list, thread, tle);
239		/* Make sure this isn't the running thread: */
240		if (thread != curthread) {
241			_thr_stack_free(&thread->attr);
242			if (thread->specific != NULL)
243				free(thread->specific);
244			for (i = 0; i < MAX_THR_LOCKLEVEL; i++) {
245				_lockuser_destroy(&thread->lockusers[i]);
246			}
247			_lock_destroy(&thread->lock);
248			free(thread);
249		}
250	}
251
252	TAILQ_INIT(&curthread->mutexq);		/* initialize mutex queue */
253	curthread->joiner = NULL;		/* no joining threads yet */
254	curthread->refcount = 0;
255	SIGEMPTYSET(curthread->sigpend);	/* clear pending signals */
256	if (curthread->specific != NULL) {
257		free(curthread->specific);
258		curthread->specific = NULL;
259		curthread->specific_data_count = 0;
260	}
261
262	/* Free the free KSEs: */
263	while ((kse = TAILQ_FIRST(&free_kseq)) != NULL) {
264		TAILQ_REMOVE(&free_kseq, kse, k_qe);
265		for (i = 0; i < MAX_KSE_LOCKLEVEL; i++) {
266			_lockuser_destroy(&kse->k_lockusers[i]);
267		}
268		_lock_destroy(&kse->k_lock);
269		_kcb_dtor(kse->k_kcb);
270		if (kse->k_stack.ss_sp != NULL)
271			free(kse->k_stack.ss_sp);
272		free(kse);
273	}
274	free_kse_count = 0;
275
276	/* Free the active KSEs: */
277	while ((kse = TAILQ_FIRST(&active_kseq)) != NULL) {
278		TAILQ_REMOVE(&active_kseq, kse, k_qe);
279		for (i = 0; i < MAX_KSE_LOCKLEVEL; i++) {
280			_lockuser_destroy(&kse->k_lockusers[i]);
281		}
282		_lock_destroy(&kse->k_lock);
283		if (kse->k_stack.ss_sp != NULL)
284			free(kse->k_stack.ss_sp);
285		free(kse);
286	}
287	active_kse_count = 0;
288
289	/* Free the free KSEGs: */
290	while ((kseg = TAILQ_FIRST(&free_kse_groupq)) != NULL) {
291		TAILQ_REMOVE(&free_kse_groupq, kseg, kg_qe);
292		_lock_destroy(&kseg->kg_lock);
293		_pq_free(&kseg->kg_schedq.sq_runq);
294		free(kseg);
295	}
296	free_kseg_count = 0;
297
298	/* Free the active KSEGs: */
299	while ((kseg = TAILQ_FIRST(&active_kse_groupq)) != NULL) {
300		TAILQ_REMOVE(&active_kse_groupq, kseg, kg_qe);
301		_lock_destroy(&kseg->kg_lock);
302		_pq_free(&kseg->kg_schedq.sq_runq);
303		free(kseg);
304	}
305	active_kseg_count = 0;
306
307	/* Free the free threads. */
308	while ((thread = TAILQ_FIRST(&free_threadq)) != NULL) {
309		TAILQ_REMOVE(&free_threadq, thread, tle);
310		if (thread->specific != NULL)
311			free(thread->specific);
312		for (i = 0; i < MAX_THR_LOCKLEVEL; i++) {
313			_lockuser_destroy(&thread->lockusers[i]);
314		}
315		_lock_destroy(&thread->lock);
316		free(thread);
317	}
318	free_thread_count = 0;
319
320	/* Free the to-be-gc'd threads. */
321	while ((thread = TAILQ_FIRST(&_thread_gc_list)) != NULL) {
322		TAILQ_REMOVE(&_thread_gc_list, thread, gcle);
323		for (i = 0; i < MAX_THR_LOCKLEVEL; i++) {
324			_lockuser_destroy(&thread->lockusers[i]);
325		}
326		_lock_destroy(&thread->lock);
327		free(thread);
328	}
329	TAILQ_INIT(&gc_ksegq);
330	_gc_count = 0;
331
332	if (inited != 0) {
333		/*
334		 * Destroy these locks; they'll be recreated to assure they
335		 * are in the unlocked state.
336		 */
337		_lock_destroy(&kse_lock);
338		_lock_destroy(&thread_lock);
339		_lock_destroy(&_thread_list_lock);
340		inited = 0;
341	}
342
343	/*
344	 * After a fork(), the leftover thread goes back to being
345	 * scope process.
346	 */
347	curthread->attr.flags &= ~PTHREAD_SCOPE_SYSTEM;
348	curthread->attr.flags |= PTHREAD_SCOPE_PROCESS;
349
350	/*
351	 * After a fork, we are still operating on the thread's original
352	 * stack.  Don't clear the THR_FLAGS_USER from the thread's
353	 * attribute flags.
354	 */
355
356	/* Initialize the threads library. */
357	curthread->kse = NULL;
358	curthread->kseg = NULL;
359	_kse_initial = NULL;
360	_libpthread_init(curthread);
361#else
362	if (__isthreaded)
363		_thr_signal_deinit();
364	curthread->kse->k_kcb->kcb_kmbx.km_curthread = NULL;
365	__isthreaded   = 0;
366	active_threads = 0;
367#endif
368}
369
370/*
371 * This is used to initialize housekeeping and to initialize the
372 * KSD for the KSE.
373 */
374void
375_kse_init(void)
376{
377	if (inited == 0) {
378		TAILQ_INIT(&active_kseq);
379		TAILQ_INIT(&active_kse_groupq);
380		TAILQ_INIT(&free_kseq);
381		TAILQ_INIT(&free_kse_groupq);
382		TAILQ_INIT(&free_threadq);
383		TAILQ_INIT(&gc_ksegq);
384		if (_lock_init(&kse_lock, LCK_ADAPTIVE,
385		    _kse_lock_wait, _kse_lock_wakeup) != 0)
386			PANIC("Unable to initialize free KSE queue lock");
387		if (_lock_init(&thread_lock, LCK_ADAPTIVE,
388		    _kse_lock_wait, _kse_lock_wakeup) != 0)
389			PANIC("Unable to initialize free thread queue lock");
390		if (_lock_init(&_thread_list_lock, LCK_ADAPTIVE,
391		    _kse_lock_wait, _kse_lock_wakeup) != 0)
392			PANIC("Unable to initialize thread list lock");
393		active_kse_count = 0;
394		active_kseg_count = 0;
395		_gc_count = 0;
396		inited = 1;
397	}
398}
399
400int
401_kse_isthreaded(void)
402{
403	return (__isthreaded != 0);
404}
405
406/*
407 * This is called when the first thread (other than the initial
408 * thread) is created.
409 */
410int
411_kse_setthreaded(int threaded)
412{
413	if ((threaded != 0) && (__isthreaded == 0)) {
414		/*
415		 * Locking functions in libc are required when there are
416		 * threads other than the initial thread.
417		 */
418		__isthreaded = 1;
419
420		/*
421		 * Tell the kernel to create a KSE for the initial thread
422		 * and enable upcalls in it.
423		 */
424		_thr_signal_init();
425		_kse_initial->k_flags |= KF_STARTED;
426
427#ifdef SYSTEM_SCOPE_ONLY
428		/*
429		 * For a bound thread, the kernel reads the mailbox pointer
430		 * only once, so set it here before calling kse_create().
431		 */
432		_tcb_set(_kse_initial->k_kcb, _thr_initial->tcb);
433		KSE_SET_MBOX(_kse_initial, _thr_initial);
434		_kse_initial->k_kcb->kcb_kmbx.km_flags |= KMF_BOUND;
435#endif
436
437		if (kse_create(&_kse_initial->k_kcb->kcb_kmbx, 0) != 0) {
438			_kse_initial->k_flags &= ~KF_STARTED;
439			__isthreaded = 0;
440			PANIC("kse_create() failed\n");
441			return (-1);
442		}
443
444#ifndef SYSTEM_SCOPE_ONLY
445		/* Set current thread to initial thread */
446		_tcb_set(_kse_initial->k_kcb, _thr_initial->tcb);
447		KSE_SET_MBOX(_kse_initial, _thr_initial);
448		_thr_start_sig_daemon();
449		_thr_setmaxconcurrency();
450#endif
451
452	}
453	return (0);
454}
455
456/*
457 * Lock wait and wakeup handlers for KSE locks.  These are only used by
458 * KSEs, and should never be used by threads.  KSE locks include the
459 * KSE group lock (used for locking the scheduling queue) and the
460 * kse_lock defined above.
461 *
462 * When a KSE lock attempt blocks, the entire KSE blocks allowing another
463 * KSE to run.  For the most part, it doesn't make much sense to try and
464 * schedule another thread because you need to lock the scheduling queue
465 * in order to do that.  And since the KSE lock is used to lock the scheduling
466 * queue, you would just end up blocking again.
467 */
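
/*
 * These handlers are wired up when the KSE locks are created; see
 * _kse_init() below, e.g.:
 *
 *	_lock_init(&kse_lock, LCK_ADAPTIVE,
 *	    _kse_lock_wait, _kse_lock_wakeup);
 */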
468void
469_kse_lock_wait(struct lock *lock, struct lockuser *lu)
470{
471	struct kse *curkse = (struct kse *)_LCK_GET_PRIVATE(lu);
472	struct timespec ts;
473	int saved_flags;
474
475	if (curkse->k_kcb->kcb_kmbx.km_curthread != NULL)
476		PANIC("kse_lock_wait does not disable upcall.\n");
477	/*
478	 * Enter a loop to wait until we get the lock.
479	 */
480	ts.tv_sec = 0;
481	ts.tv_nsec = 1000000;  /* 1 ms */
482	while (!_LCK_GRANTED(lu)) {
483		/*
484		 * Yield the kse and wait to be notified when the lock
485		 * is granted.
486		 */
487		saved_flags = curkse->k_kcb->kcb_kmbx.km_flags;
488		curkse->k_kcb->kcb_kmbx.km_flags |= KMF_NOUPCALL |
489		    KMF_NOCOMPLETED;
490		kse_release(&ts);
491		curkse->k_kcb->kcb_kmbx.km_flags = saved_flags;
492	}
493}
494
495void
496_kse_lock_wakeup(struct lock *lock, struct lockuser *lu)
497{
498	struct kse *curkse;
499	struct kse *kse;
500	struct kse_mailbox *mbx;
501
502	curkse = _get_curkse();
503	kse = (struct kse *)_LCK_GET_PRIVATE(lu);
504
505	if (kse == curkse)
506		PANIC("KSE trying to wake itself up in lock");
507	else {
508		mbx = &kse->k_kcb->kcb_kmbx;
509		_lock_grant(lock, lu);
510		/*
511		 * Notify the owning kse that it has the lock.
512		 * It is safe to pass an invalid address to kse_wakeup()
513		 * even if the mailbox is not known to the kernel at all,
514		 * and waking up the wrong KSE is also harmless.
515		 */
516		kse_wakeup(mbx);
517	}
518}
519
520/*
521 * Thread wait and wakeup handlers for thread locks.  These are only used
522 * by threads, never by KSEs.  Thread locks include the per-thread lock
523 * (defined in its structure), and condition variable and mutex locks.
524 */
525void
526_thr_lock_wait(struct lock *lock, struct lockuser *lu)
527{
528	struct pthread *curthread = (struct pthread *)lu->lu_private;
529
530	do {
531		THR_LOCK_SWITCH(curthread);
532		THR_SET_STATE(curthread, PS_LOCKWAIT);
533		_thr_sched_switch_unlocked(curthread);
534	} while (!_LCK_GRANTED(lu));
535}
536
537void
538_thr_lock_wakeup(struct lock *lock, struct lockuser *lu)
539{
540	struct pthread *thread;
541	struct pthread *curthread;
542	struct kse_mailbox *kmbx;
543
544	curthread = _get_curthread();
545	thread = (struct pthread *)_LCK_GET_PRIVATE(lu);
546
547	THR_SCHED_LOCK(curthread, thread);
548	_lock_grant(lock, lu);
549	kmbx = _thr_setrunnable_unlocked(thread);
550	THR_SCHED_UNLOCK(curthread, thread);
551	if (kmbx != NULL)
552		kse_wakeup(kmbx);
553}
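
/*
 * As with the KSE lock handlers above, these callbacks are registered
 * (via _lock_init()) when the per-thread, mutex, and condition variable
 * locks are created elsewhere in the library.
 */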
554
555kse_critical_t
556_kse_critical_enter(void)
557{
558	kse_critical_t crit;
559
560	crit = (kse_critical_t)_kcb_critical_enter();
561	return (crit);
562}
563
564void
565_kse_critical_leave(kse_critical_t crit)
566{
567	struct pthread *curthread;
568
569	_kcb_critical_leave((struct kse_thr_mailbox *)crit);
570	if ((crit != NULL) && ((curthread = _get_curthread()) != NULL))
571		THR_YIELD_CHECK(curthread);
572}
573
574int
575_kse_in_critical(void)
576{
577	return (_kcb_in_critical());
578}
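
/*
 * Canonical usage of the critical section API, as seen throughout this
 * file (sketch only):
 *
 *	kse_critical_t crit;
 *
 *	crit = _kse_critical_enter();
 *	KSE_LOCK_ACQUIRE(curthread->kse, &_thread_list_lock);
 *	... manipulate shared lists ...
 *	KSE_LOCK_RELEASE(curthread->kse, &_thread_list_lock);
 *	_kse_critical_leave(crit);
 */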
579
580void
581_thr_critical_enter(struct pthread *thread)
582{
583	thread->critical_count++;
584}
585
586void
587_thr_critical_leave(struct pthread *thread)
588{
589	thread->critical_count--;
590	THR_YIELD_CHECK(thread);
591}
592
593void
594_thr_sched_switch(struct pthread *curthread)
595{
596	struct kse *curkse;
597
598	(void)_kse_critical_enter();
599	curkse = _get_curkse();
600	KSE_SCHED_LOCK(curkse, curkse->k_kseg);
601	_thr_sched_switch_unlocked(curthread);
602}
603
604/*
605 * XXX - We may need to take the scheduling lock before calling
606 *       this, or perhaps take the lock within here before
607 *       doing anything else.
608 */
609void
610_thr_sched_switch_unlocked(struct pthread *curthread)
611{
612	struct pthread *td;
613	struct pthread_sigframe psf;
614	struct kse *curkse;
615	int ret;
616	volatile int uts_once;
617	volatile int resume_once = 0;
618	ucontext_t uc;
619
620	/* We're in the scheduler, 5 by 5: */
621	curkse = _get_curkse();
622
623	curthread->need_switchout = 1;	/* The thread yielded on its own. */
624	curthread->critical_yield = 0;	/* No need to yield anymore. */
625	thr_accounting(curthread);
626
627
628	/* Thread can unlock the scheduler lock. */
629	curthread->lock_switch = 1;
630
631	/*
632	 * The signal frame is allocated off the stack because
633	 * a thread can be interrupted by other signals while
634	 * it is running down pending signals.
635	 */
636	psf.psf_valid = 0;
637	curthread->curframe = &psf;
638
639	/*
640	 * Enter the scheduler if any one of the following is true:
641	 *
642	 *   o The current thread is dead; its stack needs to be
643	 *     cleaned up and that can't be done while operating on
644	 *     it.
645	 *   o The current thread has signals pending; the scheduler
646	 *     should install the signal trampoline for us.
647	 *   o There are no runnable threads.
648	 *   o The next thread to run won't unlock the scheduler
649	 *     lock.  A side note: the current thread may be run
650	 *     instead of the next thread in the run queue, but
651	 *     we don't bother checking for that.
652	 */
653	if (curthread->attr.flags & PTHREAD_SCOPE_SYSTEM)
654		kse_sched_single(&curkse->k_kcb->kcb_kmbx);
655	else if ((curthread->state == PS_DEAD) ||
656	    (((td = KSE_RUNQ_FIRST(curkse)) == NULL) &&
657	    (curthread->state != PS_RUNNING)) ||
658	    ((td != NULL) && (td->lock_switch == 0))) {
659		curkse->k_switch = 1;
660		_thread_enter_uts(curthread->tcb, curkse->k_kcb);
661	}
662	else {
663		uts_once = 0;
664		THR_GETCONTEXT(&curthread->tcb->tcb_tmbx.tm_context);
665		if (uts_once == 0) {
666			uts_once = 1;
667
668			/* Switchout the current thread. */
669			kse_switchout_thread(curkse, curthread);
670
671		 	/* Choose another thread to run. */
672			td = KSE_RUNQ_FIRST(curkse);
673			KSE_RUNQ_REMOVE(curkse, td);
674			kse_set_curthread(curkse, td);
675
676			/*
677			 * Make sure the current thread's kse points to
678			 * this kse.
679			 */
680			td->kse = curkse;
681
682			/*
683			 * Reset the time slice if this thread is running
684			 * for the first time or running again after using
685			 * its full time slice allocation.
686			 */
687			if (td->slice_usec == -1)
688				td->slice_usec = 0;
689
690			/* Mark the thread active. */
691			td->active = 1;
692
693			/* Remove the frame reference. */
694			td->curframe = NULL;
695
696			/*
697			 * Continue the thread at its current frame:
698			 */
699			ret = _thread_switch(curkse->k_kcb, td->tcb, 0);
700			/* This point should not be reached. */
701			if (ret != 0)
702				PANIC("Bad return from _thread_switch");
703			PANIC("Thread has returned from _thread_switch");
704		}
705	}
706
707	if (psf.psf_valid) {
708		/*
709		 * It is ugly, but we must increase the critical count:
710		 * because we have a frame saved, we must back out the
711		 * state in psf before we can process signals.
712 		 */
713		curthread->critical_count++;
714	}
715
716	if (curthread->lock_switch != 0) {
717		/*
718		 * Unlock the scheduling queue and leave the
719		 * critical region.
720		 */
721		/* Don't trust this after a switch! */
722		curkse = _get_curkse();
723
724		curthread->lock_switch = 0;
725		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
726		_kse_critical_leave(&curthread->tcb->tcb_tmbx);
727	}
728	/*
729	 * This thread is being resumed; check for cancellations.
730	 */
731	if ((psf.psf_valid ||
732	    (curthread->check_pending && !THR_IN_CRITICAL(curthread)))) {
733		resume_once = 0;
734		THR_GETCONTEXT(&uc);
735		if (resume_once == 0) {
736			resume_once = 1;
737			curthread->check_pending = 0;
738			thr_resume_check(curthread, &uc, &psf);
739		}
740	}
741	THR_ACTIVATE_LAST_LOCK(curthread);
742}
743
744/*
745 * This is the scheduler for a KSE which runs a scope system thread.
746 * The multi-thread KSE scheduler should also work for a single threaded
747 * KSE, but we use a separate scheduler so that it can be fine-tuned
748 * to be more efficient (and perhaps not need a separate stack for
749 * the KSE, allowing it to use the thread's stack).
750 */
751
752static void
753kse_sched_single(struct kse_mailbox *kmbx)
754{
755	struct kse *curkse;
756	struct pthread *curthread;
757	struct timespec ts;
758	sigset_t sigmask;
759	int i, sigseqno, level, first = 0;
760
761	curkse = (struct kse *)kmbx->km_udata;
762	curthread = curkse->k_curthread;
763
764	if ((curkse->k_flags & KF_INITIALIZED) == 0) {
765		/* Set up this KSE's specific data. */
766		_kcb_set(curkse->k_kcb);
767		_tcb_set(curkse->k_kcb, curthread->tcb);
768		curkse->k_flags |= KF_INITIALIZED;
769		first = 1;
770		curthread->active = 1;
771
772		/* Setup kernel signal masks for new thread. */
773		__sys_sigprocmask(SIG_SETMASK, &curthread->sigmask, NULL);
774		/*
775		 * Enter a critical region.  This is meaningless for a bound
776		 * thread, but it lets other code that expects the mailbox
777		 * to be cleared keep working.
778		 */
779		(void)_kse_critical_enter();
780 	}
781
782	curthread->critical_yield = 0;
783	curthread->need_switchout = 0;
784
785	/*
786	 * Lock the scheduling queue.
787	 *
788	 * There is no scheduling queue for single threaded KSEs,
789	 * but we need a lock for protection regardless.
790	 */
791	if (curthread->lock_switch == 0)
792		KSE_SCHED_LOCK(curkse, curkse->k_kseg);
793
794	/*
795	 * This has to do the job of kse_switchout_thread(), only
796	 * for a single threaded KSE/KSEG.
797	 */
798
799	switch (curthread->state) {
800	case PS_DEAD:
801		curthread->check_pending = 0;
802		/* Unlock the scheduling queue and exit the KSE and thread. */
803		thr_cleanup(curkse, curthread);
804		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
805		PANIC("bound thread shouldn't get here\n");
806		break;
807
808	case PS_SIGWAIT:
809		PANIC("bound thread does not have SIGWAIT state\n");
810
811	case PS_SLEEP_WAIT:
812		PANIC("bound thread does not have SLEEP_WAIT state\n");
813
814	case PS_SIGSUSPEND:
815		PANIC("bound thread does not have SIGSUSPEND state\n");
816
817	case PS_COND_WAIT:
818		break;
819
820	case PS_LOCKWAIT:
821		/*
822		 * This state doesn't time out.
823		 */
824		curthread->wakeup_time.tv_sec = -1;
825		curthread->wakeup_time.tv_nsec = -1;
826		level = curthread->locklevel - 1;
827		if (_LCK_GRANTED(&curthread->lockusers[level]))
828			THR_SET_STATE(curthread, PS_RUNNING);
829		break;
830
831	case PS_RUNNING:
832		if ((curthread->flags & THR_FLAGS_SUSPENDED) != 0) {
833			THR_SET_STATE(curthread, PS_SUSPENDED);
834		}
835		curthread->wakeup_time.tv_sec = -1;
836		curthread->wakeup_time.tv_nsec = -1;
837		break;
838
839	case PS_JOIN:
840	case PS_MUTEX_WAIT:
841	case PS_SUSPENDED:
842	case PS_DEADLOCK:
843	default:
844		/*
845		 * These states don't time out and don't need
846		 * to be in the waiting queue.
847		 */
848		curthread->wakeup_time.tv_sec = -1;
849		curthread->wakeup_time.tv_nsec = -1;
850		break;
851	}
852
853	while (curthread->state != PS_RUNNING) {
854		sigseqno = curkse->k_sigseqno;
855		if (curthread->check_pending != 0) {
856			/*
857			 * Install pending signals into the frame; this may
858			 * cause a mutex or condvar backout.
859			 */
860			curthread->check_pending = 0;
861			SIGFILLSET(sigmask);
862
863			/*
864			 * Lock out kernel signal code when we are processing
865			 * signals, and get a fresh copy of signal mask.
866			 */
867			__sys_sigprocmask(SIG_SETMASK, &sigmask,
868					  &curthread->sigmask);
869			for (i = 1; i <= _SIG_MAXSIG; i++) {
870				if (SIGISMEMBER(curthread->sigmask, i))
871					continue;
872				if (SIGISMEMBER(curthread->sigpend, i))
873					(void)_thr_sig_add(curthread, i,
874					    &curthread->siginfo[i-1]);
875			}
876			__sys_sigprocmask(SIG_SETMASK, &curthread->sigmask,
877				NULL);
878			/* The above code might make the thread runnable. */
879			if (curthread->state == PS_RUNNING)
880				break;
881		}
882		THR_DEACTIVATE_LAST_LOCK(curthread);
883		kse_wait(curkse, curthread, sigseqno);
884		THR_ACTIVATE_LAST_LOCK(curthread);
885		KSE_GET_TOD(curkse, &ts);
886		if (thr_timedout(curthread, &ts)) {
887			/* Indicate the thread timed out: */
888			curthread->timeout = 1;
889			/* Make the thread runnable. */
890			THR_SET_STATE(curthread, PS_RUNNING);
891		}
892	}
893
894	/* Remove the frame reference. */
895	curthread->curframe = NULL;
896
897	if (curthread->lock_switch == 0) {
898		/* Unlock the scheduling queue. */
899		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
900	}
901
902	DBG_MSG("Continuing bound thread %p\n", curthread);
903	if (first) {
904		_kse_critical_leave(&curthread->tcb->tcb_tmbx);
905		pthread_exit(curthread->start_routine(curthread->arg));
906	}
907}
908
909#ifdef DEBUG_THREAD_KERN
910static void
911dump_queues(struct kse *curkse)
912{
913	struct pthread *thread;
914
915	DBG_MSG("Threads in waiting queue:\n");
916	TAILQ_FOREACH(thread, &curkse->k_kseg->kg_schedq.sq_waitq, pqe) {
917		DBG_MSG("  thread %p, state %d, blocked %d\n",
918		    thread, thread->state, thread->blocked);
919	}
920}
921#endif
922
923/*
924 * This is the scheduler for a KSE which runs multiple threads.
925 */
926static void
927kse_sched_multi(struct kse_mailbox *kmbx)
928{
929	struct kse *curkse;
930	struct pthread *curthread, *td_wait;
931	struct pthread_sigframe *curframe;
932	int ret;
933
934	curkse = (struct kse *)kmbx->km_udata;
935	THR_ASSERT(curkse->k_kcb->kcb_kmbx.km_curthread == NULL,
936	    "Mailbox not null in kse_sched_multi");
937
938	/* Check for first time initialization: */
939	if ((curkse->k_flags & KF_INITIALIZED) == 0) {
940		/* Set up this KSE's specific data. */
941		_kcb_set(curkse->k_kcb);
942
943		/* Set this before grabbing the context. */
944		curkse->k_flags |= KF_INITIALIZED;
945	}
946
947	/* This may have returned from a kse_release(). */
948	if (KSE_WAITING(curkse)) {
949		DBG_MSG("Entered upcall when KSE is waiting.");
950		KSE_CLEAR_WAIT(curkse);
951	}
952
953	/* If this is an upcall, take the scheduler lock. */
954	if (curkse->k_switch == 0)
955		KSE_SCHED_LOCK(curkse, curkse->k_kseg);
956	curkse->k_switch = 0;
957
958	/*
959	 * Now that the scheduler lock is held, get the current
960	 * thread.  The KSE's current thread cannot be safely
961	 * examined without the lock because it could have returned
962	 * as completed on another KSE.  See kse_check_completed().
963	 */
964	curthread = curkse->k_curthread;
965
966	if (KSE_IS_IDLE(curkse)) {
967		KSE_CLEAR_IDLE(curkse);
968		curkse->k_kseg->kg_idle_kses--;
969	}
970	/*
971	 * If the current thread was completed in another KSE, then
972	 * it will be in the run queue.  Don't mark it as being blocked.
973	 */
974	if ((curthread != NULL) &&
975	    ((curthread->flags & THR_FLAGS_IN_RUNQ) == 0) &&
976	    (curthread->need_switchout == 0)) {
977		/*
978		 * Assume the current thread is blocked; when the
979		 * completed threads are checked and if the current
980		 * thread is among the completed, the blocked flag
981		 * will be cleared.
982		 */
983		curthread->blocked = 1;
984	}
985
986	/* Check for any unblocked threads in the kernel. */
987	kse_check_completed(curkse);
988
989	/*
990	 * Check for threads that have timed out.
991	 */
992	kse_check_waitq(curkse);
993
994	/*
995	 * Switchout the current thread, if necessary, as the last step
996	 * so that it is inserted into the run queue (if it's runnable)
997	 * _after_ any other threads that were added to it above.
998	 */
999	if (curthread == NULL)
1000		;  /* Nothing to do here. */
1001	else if ((curthread->need_switchout == 0) &&
1002	    (curthread->blocked == 0) && (THR_IN_CRITICAL(curthread))) {
1003		/*
1004		 * Resume the thread and tell it to yield when
1005		 * it leaves the critical region.
1006		 */
1007		curthread->critical_yield = 1;
1008		curthread->active = 1;
1009		if ((curthread->flags & THR_FLAGS_IN_RUNQ) != 0)
1010			KSE_RUNQ_REMOVE(curkse, curthread);
1011		kse_set_curthread(curkse, curthread);
1012		curthread->kse = curkse;
1013		DBG_MSG("Continuing thread %p in critical region\n",
1014		    curthread);
1015		kse_wakeup_multi(curkse);
1016		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
1017		ret = _thread_switch(curkse->k_kcb, curthread->tcb, 1);
1018		if (ret != 0)
1019			PANIC("Can't resume thread in critical region\n");
1020	}
1021	else if ((curthread->flags & THR_FLAGS_IN_RUNQ) == 0)
1022		kse_switchout_thread(curkse, curthread);
1023	kse_set_curthread(curkse, NULL);
1024
1025	kse_wakeup_multi(curkse);
1026
1027#ifdef DEBUG_THREAD_KERN
1028	dump_queues(curkse);
1029#endif
1030
1031	/* Check if there are no threads ready to run: */
1032	while (((curthread = KSE_RUNQ_FIRST(curkse)) == NULL) &&
1033	    (curkse->k_kseg->kg_threadcount != 0)) {
1034		/*
1035		 * Wait for a thread to become active or until there are
1036		 * no more threads.
1037		 */
1038		td_wait = KSE_WAITQ_FIRST(curkse);
1039		kse_wait(curkse, td_wait, 0);
1040		kse_check_completed(curkse);
1041		kse_check_waitq(curkse);
1042	}
1043
1044	/* Check for no more threads: */
1045	if (curkse->k_kseg->kg_threadcount == 0) {
1046		/*
1047		 * Normally this shouldn't return, but it will if there
1048		 * are other KSEs running that create new threads that
1049		 * are assigned to this KSE[G].  For instance, if a scope
1050		 * system thread were to create a scope process thread
1051		 * and this kse[g] is the initial kse[g], then that newly
1052		 * created thread would be assigned to us (the initial
1053		 * kse[g]).
1054		 */
1055		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
1056		kse_fini(curkse);
1057		/* never returns */
1058	}
1059
1060	THR_ASSERT(curthread != NULL,
1061	    "Return from kse_wait/fini without thread.");
1062	THR_ASSERT(curthread->state != PS_DEAD,
1063	    "Trying to resume dead thread!");
1064	KSE_RUNQ_REMOVE(curkse, curthread);
1065
1066	/*
1067	 * Make the selected thread the current thread.
1068	 */
1069	kse_set_curthread(curkse, curthread);
1070
1071	/*
1072	 * Make sure the current thread's kse points to this kse.
1073	 */
1074	curthread->kse = curkse;
1075
1076	/*
1077	 * Reset the time slice if this thread is running for the first
1078	 * time or running again after using its full time slice allocation.
1079	 */
1080	if (curthread->slice_usec == -1)
1081		curthread->slice_usec = 0;
1082
1083	/* Mark the thread active. */
1084	curthread->active = 1;
1085
1086	/* Remove the frame reference. */
1087	curframe = curthread->curframe;
1088	curthread->curframe = NULL;
1089
1090	kse_wakeup_multi(curkse);
1091
1092	/*
1093	 * The thread's current signal frame will only be NULL if it
1094	 * is being resumed after being blocked in the kernel.  In
1095	 * this case, and if the thread needs to run down pending
1096	 * signals or needs a cancellation check, we need to add a
1097	 * signal frame to the thread's context.
1098	 */
1099#ifdef NOT_YET
1100	if ((((curframe == NULL) && (curthread->check_pending != 0)) ||
1101	    (((curthread->cancelflags & THR_AT_CANCEL_POINT) == 0) &&
1102	     ((curthread->cancelflags & PTHREAD_CANCEL_ASYNCHRONOUS) != 0))) &&
1103	     !THR_IN_CRITICAL(curthread))
1104		signalcontext(&curthread->tcb->tcb_tmbx.tm_context, 0,
1105		    (__sighandler_t *)thr_resume_wrapper);
1106#else
1107	if ((curframe == NULL) && (curthread->state == PS_RUNNING) &&
1108	    (curthread->check_pending != 0) && !THR_IN_CRITICAL(curthread)) {
1109		curthread->check_pending = 0;
1110		signalcontext(&curthread->tcb->tcb_tmbx.tm_context, 0,
1111		    (__sighandler_t *)thr_resume_wrapper);
1112	}
1113#endif
1114	/*
1115	 * Continue the thread at its current frame:
1116	 */
1117	if (curthread->lock_switch != 0) {
1118		/*
1119		 * This thread came from a scheduler switch; it will
1120		 * unlock the scheduler lock and set the mailbox.
1121		 */
1122		ret = _thread_switch(curkse->k_kcb, curthread->tcb, 0);
1123	} else {
1124		/* This thread won't unlock the scheduler lock. */
1125		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
1126		ret = _thread_switch(curkse->k_kcb, curthread->tcb, 1);
1127	}
1128	if (ret != 0)
1129		PANIC("Thread has returned from _thread_switch");
1130
1131	/* This point should not be reached. */
1132	PANIC("Thread has returned from _thread_switch");
1133}
1134
1135static void
1136thr_resume_wrapper(int sig, siginfo_t *siginfo, ucontext_t *ucp)
1137{
1138	struct pthread *curthread = _get_curthread();
1139	struct kse *curkse;
1140	int ret, err_save = curthread->error;
1141
1142	DBG_MSG(">>> sig wrapper\n");
1143	if (curthread->lock_switch)
1144		PANIC("thr_resume_wrapper, lock_switch != 0\n");
1145	thr_resume_check(curthread, ucp, NULL);
1146	_kse_critical_enter();
1147	curkse = _get_curkse();
1148	curthread->tcb->tcb_tmbx.tm_context = *ucp;
1149	curthread->error = err_save;
1150	ret = _thread_switch(curkse->k_kcb, curthread->tcb, 1);
1151	if (ret != 0)
1152		PANIC("thr_resume_wrapper: thread has returned "
1153		      "from _thread_switch");
1154	/* THR_SETCONTEXT(ucp); */ /* doesn't work, why? */
1155}
1156
1157static void
1158thr_resume_check(struct pthread *curthread, ucontext_t *ucp,
1159    struct pthread_sigframe *psf)
1160{
1161	_thr_sig_rundown(curthread, ucp, psf);
1162
1163#ifdef NOT_YET
1164	if (((curthread->cancelflags & THR_AT_CANCEL_POINT) == 0) &&
1165	    ((curthread->cancelflags & PTHREAD_CANCEL_ASYNCHRONOUS) != 0))
1166		pthread_testcancel();
1167#endif
1168}
1169
1170/*
1171 * Clean up a thread.  This must be called with the thread's KSE
1172 * scheduling lock held.  The thread must be a thread from the
1173 * KSE's group.
1174 */
1175static void
1176thr_cleanup(struct kse *curkse, struct pthread *thread)
1177{
1178	struct pthread *joiner;
1179	struct kse_mailbox *kmbx = NULL;
1180	int sys_scope;
1181
1182	if ((joiner = thread->joiner) != NULL) {
1183		/* Joinee scheduler lock held; joiner won't leave. */
1184		if (joiner->kseg == curkse->k_kseg) {
1185			if (joiner->join_status.thread == thread) {
1186				joiner->join_status.thread = NULL;
1187				joiner->join_status.ret = thread->ret;
1188				(void)_thr_setrunnable_unlocked(joiner);
1189			}
1190		} else {
1191			KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
1192			/* The joiner may have removed itself and exited. */
1193			if (_thr_ref_add(thread, joiner, 0) == 0) {
1194				KSE_SCHED_LOCK(curkse, joiner->kseg);
1195				if (joiner->join_status.thread == thread) {
1196					joiner->join_status.thread = NULL;
1197					joiner->join_status.ret = thread->ret;
1198					kmbx = _thr_setrunnable_unlocked(joiner);
1199				}
1200				KSE_SCHED_UNLOCK(curkse, joiner->kseg);
1201				_thr_ref_delete(thread, joiner);
1202				if (kmbx != NULL)
1203					kse_wakeup(kmbx);
1204			}
1205			KSE_SCHED_LOCK(curkse, curkse->k_kseg);
1206		}
1207		thread->attr.flags |= PTHREAD_DETACHED;
1208	}
1209
1210	if (!(sys_scope = (thread->attr.flags & PTHREAD_SCOPE_SYSTEM))) {
1211		/*
1212		 * Remove the thread from the KSEG's list of threads.
1213	 	 */
1214		KSEG_THRQ_REMOVE(thread->kseg, thread);
1215		/*
1216		 * Migrate the thread to the main KSE so that this
1217		 * KSE and KSEG can be cleaned when their last thread
1218		 * exits.
1219		 */
1220		thread->kseg = _kse_initial->k_kseg;
1221		thread->kse = _kse_initial;
1222	}
1223	thread->flags |= THR_FLAGS_GC_SAFE;
1224
1225	/*
1226	 * We can't hold the thread list lock while holding the
1227	 * scheduler lock.
1228	 */
1229	KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
1230	DBG_MSG("Adding thread %p to GC list\n", thread);
1231	KSE_LOCK_ACQUIRE(curkse, &_thread_list_lock);
1232	THR_GCLIST_ADD(thread);
1233	/* Use thread_list_lock */
1234	active_threads--;
1235#ifdef SYSTEM_SCOPE_ONLY
1236	if (active_threads == 0) {
1237#else
1238	if (active_threads == 1) {
1239#endif
1240		KSE_LOCK_RELEASE(curkse, &_thread_list_lock);
1241		exit(0);
1242	}
1243	KSE_LOCK_RELEASE(curkse, &_thread_list_lock);
1244	if (sys_scope) {
1245		/*
1246		 * A system scope thread is the only thread in its group;
1247		 * when the thread exits, its KSE and KSE group should
1248		 * be recycled as well.
1249		 * The KSE upcall stack belongs to the thread; clear it here.
1250		 */
1251		curkse->k_stack.ss_sp = 0;
1252		curkse->k_stack.ss_size = 0;
1253		kse_exit();
1254		PANIC("kse_exit() failed for system scope thread");
1255	}
1256	KSE_SCHED_LOCK(curkse, curkse->k_kseg);
1257}
1258
1259void
1260_thr_gc(struct pthread *curthread)
1261{
1262	thread_gc(curthread);
1263	kse_gc(curthread);
1264	kseg_gc(curthread);
1265}
1266
1267static void
1268thread_gc(struct pthread *curthread)
1269{
1270	struct pthread *td, *td_next;
1271	kse_critical_t crit;
1272	TAILQ_HEAD(, pthread) worklist;
1273
1274	TAILQ_INIT(&worklist);
1275	crit = _kse_critical_enter();
1276	KSE_LOCK_ACQUIRE(curthread->kse, &_thread_list_lock);
1277
1278	/* Check the threads waiting for GC. */
1279	for (td = TAILQ_FIRST(&_thread_gc_list); td != NULL; td = td_next) {
1280		td_next = TAILQ_NEXT(td, gcle);
1281		if ((td->flags & THR_FLAGS_GC_SAFE) == 0)
1282			continue;
1283		else if (((td->attr.flags & PTHREAD_SCOPE_SYSTEM) != 0) &&
1284		    ((td->kse->k_kcb->kcb_kmbx.km_flags & KMF_DONE) == 0)) {
1285			/*
1286			 * The thread and KSE are operating on the same
1287			 * stack.  Wait for the KSE to exit before freeing
1288			 * the thread's stack as well as everything else.
1289			 */
1290			continue;
1291		}
1292		/*
1293		 * Remove the thread from the GC list.  If the thread
1294		 * isn't yet detached, it will get added back to the
1295		 * GC list at a later time.
1296		 */
1297		THR_GCLIST_REMOVE(td);
1298		DBG_MSG("Freeing thread %p stack\n", td);
1299		/*
1300		 * We can free the thread stack since it's no longer
1301		 * in use.
1302		 */
1303		_thr_stack_free(&td->attr);
1304		if (((td->attr.flags & PTHREAD_DETACHED) != 0) &&
1305		    (td->refcount == 0)) {
1306			/*
1307			 * The thread has detached and is no longer
1308			 * referenced.  It is safe to remove all
1309			 * remnants of the thread.
1310			 */
1311			THR_LIST_REMOVE(td);
1312			TAILQ_INSERT_HEAD(&worklist, td, gcle);
1313		}
1314	}
1315	KSE_LOCK_RELEASE(curthread->kse, &_thread_list_lock);
1316	_kse_critical_leave(crit);
1317
1318	while ((td = TAILQ_FIRST(&worklist)) != NULL) {
1319		TAILQ_REMOVE(&worklist, td, gcle);
1320
1321		if ((td->attr.flags & PTHREAD_SCOPE_SYSTEM) != 0) {
1322			crit = _kse_critical_enter();
1323			KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
1324			kse_free_unlocked(td->kse);
1325			kseg_free_unlocked(td->kseg);
1326			KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
1327			_kse_critical_leave(crit);
1328		}
1329		/*
1330		 * XXX we don't free the initial thread, because there
1331		 * might be code still referencing it.
1332		 */
1333		if (td != _thr_initial) {
1334			DBG_MSG("Freeing thread %p\n", td);
1335			_thr_free(curthread, td);
1336		} else
1337			DBG_MSG("Initial thread won't be freed\n");
1338	}
1339}
1340
1341static void
1342kse_gc(struct pthread *curthread)
1343{
1344	kse_critical_t crit;
1345	TAILQ_HEAD(, kse) worklist;
1346	struct kse *kse;
1347
1348	if (free_kse_count <= MAX_CACHED_KSES)
1349		return;
1350	TAILQ_INIT(&worklist);
1351	crit = _kse_critical_enter();
1352	KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
1353	while (free_kse_count > MAX_CACHED_KSES) {
1354		kse = TAILQ_FIRST(&free_kseq);
1355		TAILQ_REMOVE(&free_kseq, kse, k_qe);
1356		TAILQ_INSERT_HEAD(&worklist, kse, k_qe);
1357		free_kse_count--;
1358	}
1359	KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
1360	_kse_critical_leave(crit);
1361
1362	while ((kse = TAILQ_FIRST(&worklist))) {
1363		TAILQ_REMOVE(&worklist, kse, k_qe);
1364		kse_destroy(kse);
1365	}
1366}
1367
1368static void
1369kseg_gc(struct pthread *curthread)
1370{
1371	kse_critical_t crit;
1372	TAILQ_HEAD(, kse_group) worklist;
1373	struct kse_group *kseg;
1374
1375	if (free_kseg_count <= MAX_CACHED_KSEGS)
1376		return;
1377	crit = _kse_critical_enter();
1378	KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
1379	while (free_kseg_count > MAX_CACHED_KSEGS) {
1380		kseg = TAILQ_FIRST(&free_kse_groupq);
1381		TAILQ_REMOVE(&free_kse_groupq, kseg, kg_qe);
1382		free_kseg_count--;
1383		TAILQ_INSERT_HEAD(&worklist, kseg, kg_qe);
1384	}
1385	KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
1386	_kse_critical_leave(crit);
1387
1388	while ((kseg = TAILQ_FIRST(&worklist))) {
1389		TAILQ_REMOVE(&worklist, kseg, kg_qe);
1390		kseg_destroy(kseg);
1391	}
1392}
1393
1394/*
1395 * Only new threads that are running or suspended may be scheduled.
1396 */
1397int
1398_thr_schedule_add(struct pthread *curthread, struct pthread *newthread)
1399{
1400	kse_critical_t crit;
1401	int ret;
1402
1403	/* Add the new thread. */
1404	thr_link(newthread);
1405
1406	/*
1407	 * If this is the first time creating a thread, make sure
1408	 * the mailbox is set for the current thread.
1409	 */
1410	if ((newthread->attr.flags & PTHREAD_SCOPE_SYSTEM) != 0) {
1411		/* We use the thread's stack as the KSE's stack. */
1412		newthread->kse->k_kcb->kcb_kmbx.km_stack.ss_sp =
1413		    newthread->attr.stackaddr_attr;
1414		newthread->kse->k_kcb->kcb_kmbx.km_stack.ss_size =
1415		    newthread->attr.stacksize_attr;
1416
1417		/*
1418		 * No need to lock the scheduling queue since the
1419		 * KSE/KSEG pair have not yet been started.
1420		 */
1421		KSEG_THRQ_ADD(newthread->kseg, newthread);
1422		/* this thread never gives up kse */
1423		newthread->active = 1;
1424		newthread->kse->k_curthread = newthread;
1425		newthread->kse->k_kcb->kcb_kmbx.km_flags = KMF_BOUND;
1426		newthread->kse->k_kcb->kcb_kmbx.km_func =
1427		    (kse_func_t *)kse_sched_single;
1428		newthread->kse->k_kcb->kcb_kmbx.km_quantum = 0;
1429		KSE_SET_MBOX(newthread->kse, newthread);
1430		/*
1431		 * This thread needs a new KSE and KSEG.
1432		 */
1433		newthread->kse->k_flags &= ~KF_INITIALIZED;
1434		newthread->kse->k_flags |= KF_STARTED;
1435		/* Fire up! */
1436		ret = kse_create(&newthread->kse->k_kcb->kcb_kmbx, 1);
1437		if (ret != 0)
1438			ret = errno;
1439	}
1440	else {
1441		/*
1442		 * Lock the KSE and add the new thread to its list of
1443		 * assigned threads.  If the new thread is runnable, also
1444		 * add it to the KSE's run queue.
1445		 */
1446		crit = _kse_critical_enter();
1447		KSE_SCHED_LOCK(curthread->kse, newthread->kseg);
1448		KSEG_THRQ_ADD(newthread->kseg, newthread);
1449		if (newthread->state == PS_RUNNING)
1450			THR_RUNQ_INSERT_TAIL(newthread);
1451		if ((newthread->kse->k_flags & KF_STARTED) == 0) {
1452			/*
1453			 * This KSE hasn't been started yet.  Start it
1454			 * outside of holding the lock.
1455			 */
1456			newthread->kse->k_flags |= KF_STARTED;
1457			newthread->kse->k_kcb->kcb_kmbx.km_func =
1458			    (kse_func_t *)kse_sched_multi;
1459			newthread->kse->k_kcb->kcb_kmbx.km_flags = 0;
1460			kse_create(&newthread->kse->k_kcb->kcb_kmbx, 0);
1461		 } else if ((newthread->state == PS_RUNNING) &&
1462		     KSE_IS_IDLE(newthread->kse)) {
1463			/*
1464			 * The thread is being scheduled on another KSEG.
1465			 */
1466			kse_wakeup_one(newthread);
1467		}
1468		KSE_SCHED_UNLOCK(curthread->kse, newthread->kseg);
1469		_kse_critical_leave(crit);
1470		ret = 0;
1471	}
1472	if (ret != 0)
1473		thr_unlink(newthread);
1474
1475	return (ret);
1476}
1477
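/*
 * Insert a thread into its KSE's wait queue, keeping the queue ordered
 * by wakeup time so that the earliest deadline is at the head; threads
 * with no timeout (wakeup_time.tv_sec == -1) are appended at the tail.
 */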
1478void
1479kse_waitq_insert(struct pthread *thread)
1480{
1481	struct pthread *td;
1482
1483	if (thread->wakeup_time.tv_sec == -1)
1484		TAILQ_INSERT_TAIL(&thread->kse->k_schedq->sq_waitq, thread,
1485		    pqe);
1486	else {
1487		td = TAILQ_FIRST(&thread->kse->k_schedq->sq_waitq);
1488		while ((td != NULL) && (td->wakeup_time.tv_sec != -1) &&
1489		    ((td->wakeup_time.tv_sec < thread->wakeup_time.tv_sec) ||
1490		    ((td->wakeup_time.tv_sec == thread->wakeup_time.tv_sec) &&
1491		    (td->wakeup_time.tv_nsec <= thread->wakeup_time.tv_nsec))))
1492			td = TAILQ_NEXT(td, pqe);
1493		if (td == NULL)
1494			TAILQ_INSERT_TAIL(&thread->kse->k_schedq->sq_waitq,
1495			    thread, pqe);
1496		else
1497			TAILQ_INSERT_BEFORE(td, thread, pqe);
1498	}
1499	thread->flags |= THR_FLAGS_IN_WAITQ;
1500}
1501
1502/*
1503 * This must be called with the scheduling lock held.
1504 */
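/*
 * km_completed is a list (linked through tm_next) of thread mailboxes
 * that the kernel has finished with, i.e. threads that have unblocked
 * in the kernel.  The loop below marks each one unblocked, returns it
 * to the run queue (or suspends it), and forwards any synchronously
 * generated signal recorded in tm_syncsig.
 */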
1505static void
1506kse_check_completed(struct kse *kse)
1507{
1508	struct pthread *thread;
1509	struct kse_thr_mailbox *completed;
1510	int sig;
1511
1512	if ((completed = kse->k_kcb->kcb_kmbx.km_completed) != NULL) {
1513		kse->k_kcb->kcb_kmbx.km_completed = NULL;
1514		while (completed != NULL) {
1515			thread = completed->tm_udata;
1516			DBG_MSG("Found completed thread %p, name %s\n",
1517			    thread,
1518			    (thread->name == NULL) ? "none" : thread->name);
1519			thread->blocked = 0;
1520			if (thread != kse->k_curthread) {
1521				thr_accounting(thread);
1522				if ((thread->flags & THR_FLAGS_SUSPENDED) != 0)
1523					THR_SET_STATE(thread, PS_SUSPENDED);
1524				else
1525					KSE_RUNQ_INSERT_TAIL(kse, thread);
1526				if ((thread->kse != kse) &&
1527				    (thread->kse->k_curthread == thread)) {
1528					/*
1529					 * Remove this thread from its
1530					 * previous KSE so that it (the KSE)
1531					 * doesn't think it is still active.
1532					 */
1533					kse_set_curthread(thread->kse, NULL);
1534					thread->active = 0;
1535				}
1536			}
1537			if ((sig = thread->tcb->tcb_tmbx.tm_syncsig.si_signo)
1538			    != 0) {
1539				if (SIGISMEMBER(thread->sigmask, sig))
1540					SIGADDSET(thread->sigpend, sig);
1541				else
1542					(void)_thr_sig_add(thread, sig,
1543					    &thread->tcb->tcb_tmbx.tm_syncsig);
1544				thread->tcb->tcb_tmbx.tm_syncsig.si_signo = 0;
1545			}
1546			completed = completed->tm_next;
1547		}
1548	}
1549}
1550
1551/*
1552 * This must be called with the scheduling lock held.
1553 */
1554static void
1555kse_check_waitq(struct kse *kse)
1556{
1557	struct pthread	*pthread;
1558	struct timespec ts;
1559
1560	KSE_GET_TOD(kse, &ts);
1561
1562	/*
1563	 * Wake up threads that have timed out.  This has to be
1564	 * done before adding the current thread to the run queue
1565	 * so that a CPU intensive thread doesn't get preference
1566	 * over waiting threads.
1567	 */
1568	while (((pthread = KSE_WAITQ_FIRST(kse)) != NULL) &&
1569	    thr_timedout(pthread, &ts)) {
1570		/* Remove the thread from the wait queue: */
1571		KSE_WAITQ_REMOVE(kse, pthread);
1572		DBG_MSG("Found timedout thread %p in waitq\n", pthread);
1573
1574		/* Indicate the thread timed out: */
1575		pthread->timeout = 1;
1576
1577		/* Add the thread to the priority queue: */
1578		if ((pthread->flags & THR_FLAGS_SUSPENDED) != 0)
1579			THR_SET_STATE(pthread, PS_SUSPENDED);
1580		else {
1581			THR_SET_STATE(pthread, PS_RUNNING);
1582			KSE_RUNQ_INSERT_TAIL(kse, pthread);
1583		}
1584	}
1585}
1586
1587static int
1588thr_timedout(struct pthread *thread, struct timespec *curtime)
1589{
1590	if (thread->wakeup_time.tv_sec < 0)
1591		return (0);
1592	else if (thread->wakeup_time.tv_sec > curtime->tv_sec)
1593		return (0);
1594	else if ((thread->wakeup_time.tv_sec == curtime->tv_sec) &&
1595	    (thread->wakeup_time.tv_nsec > curtime->tv_nsec))
1596		return (0);
1597	else
1598		return (1);
1599}
1600
1601/*
1602 * This must be called with the scheduling lock held.
1603 *
1604 * Each thread has a time slice, a wakeup time (used when it wants
1605 * to wait for a specified amount of time), a run state, and an
1606 * active flag.
1607 *
1608 * When a thread gets run by the scheduler, the active flag is
1609 * set to non-zero (1).  When a thread performs an explicit yield
1610 * or schedules a state change, it enters the scheduler and the
1611 * active flag is cleared.  When the active flag is still seen
1612 * set in the scheduler, that means that the thread is blocked in
1613 * the kernel (because it is cleared before entering the scheduler
1614 * in all other instances).
1615 *
1616 * The wakeup time is only set for those states that can timeout.
1617 * It is set to (-1, -1) for all other instances.
1618 *
1619 * The thread's run state, aside from being useful when debugging,
1620 * is used to place the thread in an appropriate queue.  There
1621 * are 2 basic queues:
1622 *
1623 *   o run queue - queue ordered by priority for all threads
1624 *                 that are runnable
1625 *   o waiting queue - queue sorted by wakeup time for all threads
1626 *                     that are not otherwise runnable (not blocked
1627 *                     in kernel, not waiting for locks)
1628 *
1629 * The thread's time slice is used for round-robin scheduling
1630 * (the default scheduling policy).  While a SCHED_RR thread
1631 * is runnable, its time slice accumulates.  When it reaches
1632 * the time slice interval, the slice is reset and the thread is
1633 * added to the end of the queue of threads at its priority.  When
1634 * a thread is no longer runnable (blocks in kernel, waits, etc.), its
1635 * time slice is reset.
1636 *
1637 * The job of kse_switchout_thread() is to handle all of the above.
1638 */
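
/*
 * A sketch of the resulting round-robin behaviour (illustrative only):
 * while a SCHED_RR thread keeps running, thr_accounting() adds
 * (tm_uticks + tm_sticks) * _clock_res_usec to its slice_usec on each
 * switch; once that sum exceeds TIMESLICE_USEC, slice_usec becomes -1
 * and kse_switchout_thread() below requeues the thread at the tail of
 * its priority queue rather than at the head.
 */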
1639static void
1640kse_switchout_thread(struct kse *kse, struct pthread *thread)
1641{
1642	int level;
1643	int i;
1644	int restart;
1645	siginfo_t siginfo;
1646
1647	/*
1648	 * Place the currently running thread into the
1649	 * appropriate queue(s).
1650	 */
1651	DBG_MSG("Switching out thread %p, state %d\n", thread, thread->state);
1652
1653	THR_DEACTIVATE_LAST_LOCK(thread);
1654	if (thread->blocked != 0) {
1655		thread->active = 0;
1656		thread->need_switchout = 0;
1657		/* This thread must have blocked in the kernel. */
1658		/*
1659		 *  Check for pending signals for this thread to
1660		 *  see if we need to interrupt it in the kernel.
1661		 */
1662		if (thread->check_pending != 0) {
1663			for (i = 1; i <= _SIG_MAXSIG; ++i) {
1664				if (SIGISMEMBER(thread->sigpend, i) &&
1665				    !SIGISMEMBER(thread->sigmask, i)) {
1666					restart = _thread_sigact[i - 1].sa_flags & SA_RESTART;
1667					kse_thr_interrupt(&thread->tcb->tcb_tmbx,
1668					    restart ? KSE_INTR_RESTART : KSE_INTR_INTERRUPT, 0);
1669					break;
1670				}
1671			}
1672		}
1673	}
1674	else {
1675		switch (thread->state) {
1676		case PS_DEAD:
1677			/*
1678			 * The scheduler is operating on a different
1679			 * stack.  It is safe to do garbage collecting
1680			 * here.
1681			 */
1682			thread->active = 0;
1683			thread->need_switchout = 0;
1684			thread->lock_switch = 0;
1685			thr_cleanup(kse, thread);
1686			return;
1687			break;
1688
1689		case PS_RUNNING:
1690			if ((thread->flags & THR_FLAGS_SUSPENDED) != 0)
1691				THR_SET_STATE(thread, PS_SUSPENDED);
1692			break;
1693
1694		case PS_COND_WAIT:
1695		case PS_SLEEP_WAIT:
1696			/* Insert into the waiting queue: */
1697			KSE_WAITQ_INSERT(kse, thread);
1698			break;
1699
1700		case PS_LOCKWAIT:
1701			/*
1702			 * This state doesn't time out.
1703			 */
1704			thread->wakeup_time.tv_sec = -1;
1705			thread->wakeup_time.tv_nsec = -1;
1706			level = thread->locklevel - 1;
1707			if (!_LCK_GRANTED(&thread->lockusers[level]))
1708				KSE_WAITQ_INSERT(kse, thread);
1709			else
1710				THR_SET_STATE(thread, PS_RUNNING);
1711			break;
1712
1713		case PS_SIGWAIT:
1714			KSE_WAITQ_INSERT(kse, thread);
1715			break;
1716		case PS_JOIN:
1717		case PS_MUTEX_WAIT:
1718		case PS_SIGSUSPEND:
1719		case PS_SUSPENDED:
1720		case PS_DEADLOCK:
1721		default:
1722			/*
1723			 * These states don't time out.
1724			 */
1725			thread->wakeup_time.tv_sec = -1;
1726			thread->wakeup_time.tv_nsec = -1;
1727
1728			/* Insert into the waiting queue: */
1729			KSE_WAITQ_INSERT(kse, thread);
1730			break;
1731		}
1732		thr_accounting(thread);
1733		if (thread->state == PS_RUNNING) {
1734			if (thread->slice_usec == -1) {
1735				/*
1736				 * The thread exceeded its time quantum or
1737				 * it yielded the CPU; place it at the tail
1738				 * of the queue for its priority.
1739				 */
1740				KSE_RUNQ_INSERT_TAIL(kse, thread);
1741			} else {
1742				/*
1743				 * The thread hasn't exceeded its interval.
1744				 * Place it at the head of the queue for its
1745				 * priority.
1746				 */
1747				KSE_RUNQ_INSERT_HEAD(kse, thread);
1748			}
1749		}
1750	}
1751	thread->active = 0;
1752	thread->need_switchout = 0;
1753	if (thread->check_pending != 0) {
1754		/* Install pending signals into the frame. */
1755		thread->check_pending = 0;
1756		KSE_LOCK_ACQUIRE(kse, &_thread_signal_lock);
1757		for (i = 1; i <= _SIG_MAXSIG; i++) {
1758			if (SIGISMEMBER(thread->sigmask, i))
1759				continue;
1760			if (SIGISMEMBER(thread->sigpend, i))
1761				(void)_thr_sig_add(thread, i,
1762				    &thread->siginfo[i-1]);
1763			else if (SIGISMEMBER(_thr_proc_sigpending, i) &&
1764				_thr_getprocsig_unlocked(i, &siginfo)) {
1765				(void)_thr_sig_add(thread, i, &siginfo);
1766			}
1767		}
1768		KSE_LOCK_RELEASE(kse, &_thread_signal_lock);
1769	}
1770}
1771
1772/*
1773 * This function waits for the smallest timeout value of any waiting
1774 * thread, or until it receives a message from another KSE.
1775 *
1776 * This must be called with the scheduling lock held.
1777 */
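/*
 * Note: the sleep below is capped at 60 seconds, the KSE is marked idle
 * around the kse_release() call, and for a single-threaded (bound)
 * group the sleep is skipped entirely if a signal has arrived since the
 * caller sampled k_sigseqno.
 */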
1778static void
1779kse_wait(struct kse *kse, struct pthread *td_wait, int sigseqno)
1780{
1781	struct timespec ts, ts_sleep;
1782	int saved_flags;
1783
1784	KSE_GET_TOD(kse, &ts);
1785
1786	if ((td_wait == NULL) || (td_wait->wakeup_time.tv_sec < 0)) {
1787		/* Limit sleep to no more than 1 minute. */
1788		ts_sleep.tv_sec = 60;
1789		ts_sleep.tv_nsec = 0;
1790	} else {
1791		TIMESPEC_SUB(&ts_sleep, &td_wait->wakeup_time, &ts);
1792		if (ts_sleep.tv_sec > 60) {
1793			ts_sleep.tv_sec = 60;
1794			ts_sleep.tv_nsec = 0;
1795		}
1796	}
1797	/* Don't sleep for negative times. */
1798	if ((ts_sleep.tv_sec >= 0) && (ts_sleep.tv_nsec >= 0)) {
1799		KSE_SET_IDLE(kse);
1800		kse->k_kseg->kg_idle_kses++;
1801		KSE_SCHED_UNLOCK(kse, kse->k_kseg);
1802		if ((kse->k_kseg->kg_flags & KGF_SINGLE_THREAD) &&
1803		    (kse->k_sigseqno != sigseqno))
1804			; /* don't sleep */
1805		else {
1806			saved_flags = kse->k_kcb->kcb_kmbx.km_flags;
1807			kse->k_kcb->kcb_kmbx.km_flags |= KMF_NOUPCALL;
1808			kse_release(&ts_sleep);
1809			kse->k_kcb->kcb_kmbx.km_flags = saved_flags;
1810		}
1811		KSE_SCHED_LOCK(kse, kse->k_kseg);
1812		if (KSE_IS_IDLE(kse)) {
1813			KSE_CLEAR_IDLE(kse);
1814			kse->k_kseg->kg_idle_kses--;
1815		}
1816	}
1817}
1818
1819/*
1820 * This is called kse_fini() rather than kse_exit() to avoid confusing
1821 * it with the system call of the same name.
1822 */
1823static void
1824kse_fini(struct kse *kse)
1825{
1826	/* struct kse_group *free_kseg = NULL; */
1827	struct timespec ts;
1828
1829	/*
1830	 * Check to see if this KSE belongs to the initial KSE group.
1831	 */
1832	if (kse->k_kseg != _kse_initial->k_kseg) {
1833		PANIC("shouldn't get here");
1834		/* This is for supporting thread groups. */
1835#ifdef NOT_YET
1836		/* Remove this KSE from the KSEG's list of KSEs. */
1837		KSE_SCHED_LOCK(kse, kse->k_kseg);
1838		TAILQ_REMOVE(&kse->k_kseg->kg_kseq, kse, k_kgqe);
1839		kse->k_kseg->kg_ksecount--;
1840		if (TAILQ_EMPTY(&kse->k_kseg->kg_kseq))
1841			free_kseg = kse->k_kseg;
1842		KSE_SCHED_UNLOCK(kse, kse->k_kseg);
1843
1844		/*
1845		 * Add this KSE to the list of free KSEs, along with
1846		 * the KSEG if it is now orphaned.
1847		 */
1848		KSE_LOCK_ACQUIRE(kse, &kse_lock);
1849		if (free_kseg != NULL)
1850			kseg_free_unlocked(free_kseg);
1851		kse_free_unlocked(kse);
1852		KSE_LOCK_RELEASE(kse, &kse_lock);
1853		kse_exit();
1854		/* Never returns. */
1855		PANIC("kse_exit()");
1856#endif
1857	} else {
1858#ifdef NOT_YET
1859		/*
1860		 * In the future, we might allow a program to kill
1861		 * a KSE in the initial group.
1862		 */
1863		if (kse != _kse_initial) {
1864			KSE_SCHED_LOCK(kse, kse->k_kseg);
1865			TAILQ_REMOVE(&kse->k_kseg->kg_kseq, kse, k_kgqe);
1866			kse->k_kseg->kg_ksecount--;
1867			KSE_SCHED_UNLOCK(kse, kse->k_kseg);
1868			KSE_LOCK_ACQUIRE(kse, &kse_lock);
1869			kse_free_unlocked(kse);
1870			KSE_LOCK_RELEASE(kse, &kse_lock);
1871			kse_exit();
1872			/* Never returns. */
1873			PANIC("kse_exit() failed for initial kseg");
1874		}
1875#endif
1876		KSE_SCHED_LOCK(kse, kse->k_kseg);
1877		KSE_SET_IDLE(kse);
1878		kse->k_kseg->kg_idle_kses++;
1879		KSE_SCHED_UNLOCK(kse, kse->k_kseg);
1880		ts.tv_sec = 120;
1881		ts.tv_nsec = 0;
1882		kse->k_kcb->kcb_kmbx.km_flags = 0;
1883		kse_release(&ts);
1884		/* Never returns. */
1885	}
1886}
1887
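/*
 * Compute the running thread's wakeup time from a relative timeout.
 * A NULL timeout means wait forever (wakeup_time is set to -1); a
 * zero timeout means wake up immediately.  As an illustrative sketch
 * (the caller and the "abstime" variable below are hypothetical), a
 * caller holding an absolute deadline would first convert it to a
 * relative interval:
 *
 *	struct timespec now, rel;
 *	KSE_GET_TOD(curthread->kse, &now);
 *	TIMESPEC_SUB(&rel, abstime, &now);
 *	_thr_set_timeout(&rel);
 */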
1888void
1889_thr_set_timeout(const struct timespec *timeout)
1890{
1891	struct pthread	*curthread = _get_curthread();
1892	struct timespec ts;
1893
1894	/* Reset the timeout flag for the running thread: */
1895	curthread->timeout = 0;
1896
1897	/* Check if the thread is to wait forever: */
1898	if (timeout == NULL) {
1899		/*
1900		 * Set the wakeup time to something that can be recognised as
1901		 * different from an actual time of day:
1902		 */
1903		curthread->wakeup_time.tv_sec = -1;
1904		curthread->wakeup_time.tv_nsec = -1;
1905	}
1906	/* Check if no waiting is required: */
1907	else if ((timeout->tv_sec == 0) && (timeout->tv_nsec == 0)) {
1908		/* Set the wake up time to 'immediately': */
1909		curthread->wakeup_time.tv_sec = 0;
1910		curthread->wakeup_time.tv_nsec = 0;
1911	} else {
1912		/* Calculate the time for the current thread to wakeup: */
1913		KSE_GET_TOD(curthread->kse, &ts);
1914		TIMESPEC_ADD(&curthread->wakeup_time, &ts, timeout);
1915	}
1916}
1917
1918void
1919_thr_panic_exit(char *file, int line, char *msg)
1920{
1921	char buf[256];
1922
1923	snprintf(buf, sizeof(buf), "(%s:%d) %s\n", file, line, msg);
1924	__sys_write(2, buf, strlen(buf));
1925	abort();
1926}
1927
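/*
 * Make a thread runnable.  This acquires the thread's scheduling
 * lock, calls _thr_setrunnable_unlocked(), and then wakes up the
 * returned KSE mailbox (if any) after the lock has been dropped.
 */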
1928void
1929_thr_setrunnable(struct pthread *curthread, struct pthread *thread)
1930{
1931	kse_critical_t crit;
1932	struct kse_mailbox *kmbx;
1933
1934	crit = _kse_critical_enter();
1935	KSE_SCHED_LOCK(curthread->kse, thread->kseg);
1936	kmbx = _thr_setrunnable_unlocked(thread);
1937	KSE_SCHED_UNLOCK(curthread->kse, thread->kseg);
1938	_kse_critical_leave(crit);
1939	if (kmbx != NULL)
1940		kse_wakeup(kmbx);
1941}
1942
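/*
 * Make a thread runnable without taking any locks; the caller is
 * expected to hold the thread's scheduling lock (see
 * _thr_setrunnable() above).  Returns the mailbox of a KSE that
 * should be woken up, or NULL if no wakeup is needed.
 */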
1943struct kse_mailbox *
1944_thr_setrunnable_unlocked(struct pthread *thread)
1945{
1946	struct kse_mailbox *kmbx = NULL;
1947
1948	if ((thread->kseg->kg_flags & KGF_SINGLE_THREAD) != 0) {
1949		/* No silly queues for these threads. */
1950		if ((thread->flags & THR_FLAGS_SUSPENDED) != 0)
1951			THR_SET_STATE(thread, PS_SUSPENDED);
1952		else {
1953			THR_SET_STATE(thread, PS_RUNNING);
1954			kmbx = kse_wakeup_one(thread);
1955		}
1956
1957	} else if (thread->state != PS_RUNNING) {
1958		if ((thread->flags & THR_FLAGS_IN_WAITQ) != 0)
1959			KSE_WAITQ_REMOVE(thread->kse, thread);
1960		if ((thread->flags & THR_FLAGS_SUSPENDED) != 0)
1961			THR_SET_STATE(thread, PS_SUSPENDED);
1962		else {
1963			THR_SET_STATE(thread, PS_RUNNING);
1964			if ((thread->blocked == 0) && (thread->active == 0) &&
1965			    (thread->flags & THR_FLAGS_IN_RUNQ) == 0)
1966				THR_RUNQ_INSERT_TAIL(thread);
1967			/*
1968			 * XXX - Threads are not yet assigned to specific
1969			 *       KSEs; they are assigned to the KSEG.  So
1970			 *       the fact that a thread's KSE is waiting
1971			 *       doesn't necessarily mean that it will be
1972			 *       the KSE that runs the thread after the
1973			 *       lock is granted.  But we don't know if the
1974			 *       other KSEs within the same KSEG are also
1975			 *       in a waiting state or not, so we err on the
1976			 *       side of caution and wake up the thread's
1977			 *       last known KSE.  We ensure that the
1978			 *       thread's KSE doesn't change while its
1979			 *       scheduling lock is held, so it is safe to
1980			 *       reference it (the KSE).  If the KSE wakes
1981			 *       up and doesn't find any more work, it
1982			 *       will go back to waiting, so no harm
1983			 *       is done.
1984			 */
1985			kmbx = kse_wakeup_one(thread);
1986		}
1987	}
1988	return (kmbx);
1989}
1990
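/*
 * Find an idle KSE in the thread's KSE group that can run the thread,
 * preferring the thread's last known KSE.  The chosen KSE is marked
 * busy (its idle flag is cleared and the group's idle count is
 * decremented) and its mailbox is returned; NULL is returned if no
 * KSE in the group is idle.
 */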
1991static struct kse_mailbox *
1992kse_wakeup_one(struct pthread *thread)
1993{
1994	struct kse *ke;
1995
1996	if (KSE_IS_IDLE(thread->kse)) {
1997		KSE_CLEAR_IDLE(thread->kse);
1998		thread->kseg->kg_idle_kses--;
1999		return (&thread->kse->k_kcb->kcb_kmbx);
2000	} else {
2001		TAILQ_FOREACH(ke, &thread->kseg->kg_kseq, k_kgqe) {
2002			if (KSE_IS_IDLE(ke)) {
2003				KSE_CLEAR_IDLE(ke);
2004				ke->k_kseg->kg_idle_kses--;
2005				return (&ke->k_kcb->kcb_kmbx);
2006			}
2007		}
2008	}
2009	return (NULL);
2010}
2011
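/*
 * Wake up idle KSEs in the current KSE group, at most one for each
 * thread on the current KSE's run queue.
 */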
2012static void
2013kse_wakeup_multi(struct kse *curkse)
2014{
2015	struct kse *ke;
2016	int tmp;
2017
2018	if ((tmp = KSE_RUNQ_THREADS(curkse)) && curkse->k_kseg->kg_idle_kses) {
2019		TAILQ_FOREACH(ke, &curkse->k_kseg->kg_kseq, k_kgqe) {
2020			if (KSE_IS_IDLE(ke)) {
2021				KSE_CLEAR_IDLE(ke);
2022				ke->k_kseg->kg_idle_kses--;
2023				KSE_WAKEUP(ke);
2024				if (--tmp == 0)
2025					break;
2026			}
2027		}
2028	}
2029}
2030
2031/*
2032 * Allocate a new KSEG.
2033 *
2034 * We allow the current thread to be NULL in the case that this
2035 * is the first time a KSEG is being created (library initialization).
2036 * In this case, we don't need to (and can't) take any locks.
2037 */
2038struct kse_group *
2039_kseg_alloc(struct pthread *curthread)
2040{
2041	struct kse_group *kseg = NULL;
2042	kse_critical_t crit;
2043
2044	if ((curthread != NULL) && (free_kseg_count > 0)) {
2045		/* Use the kse lock for the kseg queue. */
2046		crit = _kse_critical_enter();
2047		KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
2048		if ((kseg = TAILQ_FIRST(&free_kse_groupq)) != NULL) {
2049			TAILQ_REMOVE(&free_kse_groupq, kseg, kg_qe);
2050			free_kseg_count--;
2051			active_kseg_count++;
2052			TAILQ_INSERT_TAIL(&active_kse_groupq, kseg, kg_qe);
2053		}
2054		KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
2055		_kse_critical_leave(crit);
2056		if (kseg)
2057			kseg_reinit(kseg);
2058	}
2059
2060	/*
2061	 * If a KSE group wasn't found in the free list, attempt to
2062	 * allocate a new one and initialize its priority-based
2063	 * run queue.
2064	 */
2065	if ((kseg == NULL) &&
2066	    ((kseg = (struct kse_group *)malloc(sizeof(*kseg))) != NULL)) {
2067		if (_pq_alloc(&kseg->kg_schedq.sq_runq,
2068		    THR_MIN_PRIORITY, THR_LAST_PRIORITY) != 0) {
2069			free(kseg);
2070			kseg = NULL;
2071		} else {
2072			kseg_init(kseg);
2073			/* Add the KSEG to the list of active KSEGs. */
2074			if (curthread != NULL) {
2075				crit = _kse_critical_enter();
2076				KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
2077				active_kseg_count++;
2078				TAILQ_INSERT_TAIL(&active_kse_groupq,
2079				    kseg, kg_qe);
2080				KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
2081				_kse_critical_leave(crit);
2082			} else {
2083				active_kseg_count++;
2084				TAILQ_INSERT_TAIL(&active_kse_groupq,
2085				    kseg, kg_qe);
2086			}
2087		}
2088	}
2089	return (kseg);
2090}
2091
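/*
 * Initialize a newly allocated KSE group: reset its queues and
 * counters and create its scheduling lock.
 */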
2092static void
2093kseg_init(struct kse_group *kseg)
2094{
2095	kseg_reinit(kseg);
2096	_lock_init(&kseg->kg_lock, LCK_ADAPTIVE, _kse_lock_wait,
2097	    _kse_lock_wakeup);
2098}
2099
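/*
 * Reset the queues, counters, and flags of a KSE group so that it
 * can be (re)used.  The group's scheduling lock and run queue are
 * left alone; those are set up by kseg_init() and _kseg_alloc().
 */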
2100static void
2101kseg_reinit(struct kse_group *kseg)
2102{
2103	TAILQ_INIT(&kseg->kg_kseq);
2104	TAILQ_INIT(&kseg->kg_threadq);
2105	TAILQ_INIT(&kseg->kg_schedq.sq_waitq);
2106	kseg->kg_threadcount = 0;
2107	kseg->kg_ksecount = 0;
2108	kseg->kg_idle_kses = 0;
2109	kseg->kg_flags = 0;
2110}
2111
2112/*
2113 * This must be called with the kse lock held and when there are
2114 * no more threads that reference the KSE group.
2115 */
2116static void
2117kseg_free_unlocked(struct kse_group *kseg)
2118{
2119	TAILQ_REMOVE(&active_kse_groupq, kseg, kg_qe);
2120	TAILQ_INSERT_HEAD(&free_kse_groupq, kseg, kg_qe);
2121	free_kseg_count++;
2122	active_kseg_count--;
2123}
2124
2125void
2126_kseg_free(struct kse_group *kseg)
2127{
2128	struct kse *curkse;
2129	kse_critical_t crit;
2130
2131	crit = _kse_critical_enter();
2132	curkse = _get_curkse();
2133	KSE_LOCK_ACQUIRE(curkse, &kse_lock);
2134	kseg_free_unlocked(kseg);
2135	KSE_LOCK_RELEASE(curkse, &kse_lock);
2136	_kse_critical_leave(crit);
2137}
2138
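/*
 * Fully destroy a KSE group: release its scheduling lock and run
 * queue, then free the structure itself.
 */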
2139static void
2140kseg_destroy(struct kse_group *kseg)
2141{
2142	_lock_destroy(&kseg->kg_lock);
2143	_pq_free(&kseg->kg_schedq.sq_runq);
2144	free(kseg);
2145}
2146
2147/*
2148 * Allocate a new KSE.
2149 *
2150 * We allow the current thread to be NULL in the case that this
2151 * is the first time a KSE is being created (library initialization).
2152 * In this case, we don't need to (and can't) take any locks.
2153 */
2154struct kse *
2155_kse_alloc(struct pthread *curthread, int sys_scope)
2156{
2157	struct kse *kse = NULL;
2158	char *stack;
2159	kse_critical_t crit;
2160	int i;
2161
2162	if ((curthread != NULL) && (free_kse_count > 0)) {
2163		crit = _kse_critical_enter();
2164		KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
2165		/* Search for a finished KSE. */
2166		kse = TAILQ_FIRST(&free_kseq);
2167		while ((kse != NULL) &&
2168		    ((kse->k_kcb->kcb_kmbx.km_flags & KMF_DONE) == 0)) {
2169			kse = TAILQ_NEXT(kse, k_qe);
2170		}
2171		if (kse != NULL) {
2172			DBG_MSG("found an unused kse.\n");
2173			TAILQ_REMOVE(&free_kseq, kse, k_qe);
2174			free_kse_count--;
2175			TAILQ_INSERT_TAIL(&active_kseq, kse, k_qe);
2176			active_kse_count++;
2177		}
2178		KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
2179		_kse_critical_leave(crit);
2180		if (kse != NULL)
2181			kse_reinit(kse, sys_scope);
2182	}
2183	if ((kse == NULL) &&
2184	    ((kse = (struct kse *)malloc(sizeof(*kse))) != NULL)) {
2185		if (sys_scope != 0)
2186			stack = NULL;
2187		else if ((stack = malloc(KSE_STACKSIZE)) == NULL) {
2188			free(kse);
2189			return (NULL);
2190		}
2191		bzero(kse, sizeof(*kse));
2192
2193		/* Initialize KCB without the lock. */
2194		if ((kse->k_kcb = _kcb_ctor(kse)) == NULL) {
2195			if (stack != NULL)
2196				free(stack);
2197			free(kse);
2198			return (NULL);
2199		}
2200
2201		/* Initialize the lockusers. */
2202		for (i = 0; i < MAX_KSE_LOCKLEVEL; i++) {
2203			_lockuser_init(&kse->k_lockusers[i], (void *)kse);
2204			_LCK_SET_PRIVATE2(&kse->k_lockusers[i], NULL);
2205		}
2206		/* _lock_init(kse->k_lock, ...) */
2207
2208		if (curthread != NULL) {
2209			crit = _kse_critical_enter();
2210			KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
2211		}
2212		kse->k_flags = 0;
2213		TAILQ_INSERT_TAIL(&active_kseq, kse, k_qe);
2214		active_kse_count++;
2215		if (curthread != NULL) {
2216			KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
2217			_kse_critical_leave(crit);
2218		}
2219		/*
2220		 * Create the KSE context.
2221		 * Scope system threads (one thread per KSE) do not need a
2222		 * stack here, since the KSE upcall is not used for them.
2223		 */
2224		if (!sys_scope) {
2225			kse->k_kcb->kcb_kmbx.km_func = (kse_func_t *)kse_sched_multi;
2226			kse->k_stack.ss_sp = stack;
2227			kse->k_stack.ss_size = KSE_STACKSIZE;
2228		} else {
2229			kse->k_kcb->kcb_kmbx.km_func = (kse_func_t *)kse_sched_single;
2230			kse->k_stack.ss_sp = NULL;
2231			kse->k_stack.ss_size = 0;
2232		}
2233		kse->k_kcb->kcb_kmbx.km_udata = (void *)kse;
2234		kse->k_kcb->kcb_kmbx.km_quantum = 20000;
2235		/*
2236		 * We need to keep a copy of the stack in case it
2237		 * doesn't get used; a KSE running a scope system
2238		 * thread will use that thread's stack.
2239		 */
2240		kse->k_kcb->kcb_kmbx.km_stack = kse->k_stack;
2241	}
2242	return (kse);
2243}
2244
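/*
 * Reinitialize a cached KSE for reuse.  For process scope use, make
 * sure the KSE has an upcall stack and use the multi-threaded
 * scheduler; for system scope use, free any upcall stack and use the
 * single-threaded scheduler.  All other per-KSE state is cleared.
 */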
2245static void
2246kse_reinit(struct kse *kse, int sys_scope)
2247{
2248	if (!sys_scope) {
2249		kse->k_kcb->kcb_kmbx.km_func = (kse_func_t *)kse_sched_multi;
2250		if (kse->k_stack.ss_sp == NULL) {
2251			/* XXX check allocation failure */
2252			kse->k_stack.ss_sp = (char *) malloc(KSE_STACKSIZE);
2253			kse->k_stack.ss_size = KSE_STACKSIZE;
2254		}
2255		kse->k_kcb->kcb_kmbx.km_quantum = 20000;
2256	} else {
2257		kse->k_kcb->kcb_kmbx.km_func = (kse_func_t *)kse_sched_single;
2258		if (kse->k_stack.ss_sp)
2259			free(kse->k_stack.ss_sp);
2260		kse->k_stack.ss_sp = NULL;
2261		kse->k_stack.ss_size = 0;
2262		kse->k_kcb->kcb_kmbx.km_quantum = 0;
2263	}
2264	kse->k_kcb->kcb_kmbx.km_stack = kse->k_stack;
2265	kse->k_kcb->kcb_kmbx.km_udata = (void *)kse;
2266	kse->k_kcb->kcb_kmbx.km_curthread = NULL;
2267	kse->k_kcb->kcb_kmbx.km_flags = 0;
2268	kse->k_curthread = NULL;
2269	kse->k_kseg = 0;
2270	kse->k_schedq = 0;
2271	kse->k_locklevel = 0;
2272	SIGEMPTYSET(kse->k_sigmask);
2273	bzero(&kse->k_sigq, sizeof(kse->k_sigq));
2274	kse->k_check_sigq = 0;
2275	kse->k_flags = 0;
2276	kse->k_waiting = 0;
2277	kse->k_idle = 0;
2278	kse->k_error = 0;
2279	kse->k_cpu = 0;
2280	kse->k_done = 0;
2281	kse->k_switch = 0;
2282	kse->k_sigseqno = 0;
2283}
2284
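/*
 * Move a KSE from the active list to the free list.  As with
 * kseg_free_unlocked(), the caller is expected to hold the kse lock
 * (see _kse_free() below).
 */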
2285void
2286kse_free_unlocked(struct kse *kse)
2287{
2288	TAILQ_REMOVE(&active_kseq, kse, k_qe);
2289	active_kse_count--;
2290	kse->k_kseg = NULL;
2291	kse->k_kcb->kcb_kmbx.km_quantum = 20000;
2292	kse->k_flags = 0;
2293	TAILQ_INSERT_HEAD(&free_kseq, kse, k_qe);
2294	free_kse_count++;
2295}
2296
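/*
 * Free a KSE, taking the kse lock unless this is called during early
 * initialization (curthread == NULL), when no locks can be taken.
 */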
2297void
2298_kse_free(struct pthread *curthread, struct kse *kse)
2299{
2300	kse_critical_t crit;
2301
2302	if (curthread == NULL)
2303		kse_free_unlocked(kse);
2304	else {
2305		crit = _kse_critical_enter();
2306		KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
2307		kse_free_unlocked(kse);
2308		KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
2309		_kse_critical_leave(crit);
2310	}
2311}
2312
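/*
 * Fully destroy a KSE: free its upcall stack (if any), its KCB, its
 * lockusers and lock, and the structure itself.
 */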
2313static void
2314kse_destroy(struct kse *kse)
2315{
2316	int i;
2317
2318	if (kse->k_stack.ss_sp != NULL)
2319		free(kse->k_stack.ss_sp);
2320	_kcb_dtor(kse->k_kcb);
2321	for (i = 0; i < MAX_KSE_LOCKLEVEL; ++i)
2322		_lockuser_destroy(&kse->k_lockusers[i]);
2323	_lock_destroy(&kse->k_lock);
2324	free(kse);
2325}
2326
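/*
 * Allocate a thread structure, preferring one from the free thread
 * list.  Garbage collection is run first if it is needed.  When no
 * cached thread is available, a new one is malloc'd and its TCB is
 * constructed.
 */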
2327struct pthread *
2328_thr_alloc(struct pthread *curthread)
2329{
2330	kse_critical_t crit;
2331	struct pthread *thread = NULL;
2332
2333	if (curthread != NULL) {
2334		if (GC_NEEDED())
2335			_thr_gc(curthread);
2336		if (free_thread_count > 0) {
2337			crit = _kse_critical_enter();
2338			KSE_LOCK_ACQUIRE(curthread->kse, &thread_lock);
2339			if ((thread = TAILQ_FIRST(&free_threadq)) != NULL) {
2340				TAILQ_REMOVE(&free_threadq, thread, tle);
2341				free_thread_count--;
2342			}
2343			KSE_LOCK_RELEASE(curthread->kse, &thread_lock);
2344			_kse_critical_leave(crit);
2345		}
2346	}
2347	if ((thread == NULL) &&
2348	    ((thread = malloc(sizeof(struct pthread))) != NULL)) {
2349		bzero(thread, sizeof(struct pthread));
2350		if ((thread->tcb = _tcb_ctor(thread)) == NULL) {
2351			free(thread);
2352			thread = NULL;
2353		}
2354	}
2355	return (thread);
2356}
2357
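/*
 * Free a thread structure.  If there is no current thread or the
 * cache is full (MAX_CACHED_THREADS), the thread is destroyed;
 * otherwise it is reset and placed on the free thread list for reuse.
 */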
2358void
2359_thr_free(struct pthread *curthread, struct pthread *thread)
2360{
2361	kse_critical_t crit;
2362	int i;
2363
2364	DBG_MSG("Freeing thread %p\n", thread);
2365	if ((curthread == NULL) || (free_thread_count >= MAX_CACHED_THREADS)) {
2366		for (i = 0; i < MAX_THR_LOCKLEVEL; i++) {
2367			_lockuser_destroy(&thread->lockusers[i]);
2368		}
2369		_lock_destroy(&thread->lock);
2370		_tcb_dtor(thread->tcb);
2371		free(thread);
2372	}
2373	else {
2374		/* Reinitialize any important fields here. */
2375		thread->lock_switch = 0;
2376		sigemptyset(&thread->sigpend);
2377		thread->check_pending = 0;
2378
2379		/* Add the thread to the free thread list. */
2380		crit = _kse_critical_enter();
2381		KSE_LOCK_ACQUIRE(curthread->kse, &thread_lock);
2382		TAILQ_INSERT_TAIL(&free_threadq, thread, tle);
2383		free_thread_count++;
2384		KSE_LOCK_RELEASE(curthread->kse, &thread_lock);
2385		_kse_critical_leave(crit);
2386	}
2387}
2388
2389/*
2390 * Add an active thread:
2391 *
2392 *   o Assign the thread a unique id (which GDB uses to track
2393 *     threads).
2394 *   o Add the thread to the list of all threads and increment
2395 *     number of active threads.
2396 */
2397static void
2398thr_link(struct pthread *thread)
2399{
2400	kse_critical_t crit;
2401	struct kse *curkse;
2402	struct pthread *curthread;
2403
2404	crit = _kse_critical_enter();
2405	curkse = _get_curkse();
2406	curthread = _get_curthread();
2407	thread->sigmask = curthread->sigmask;
2408	KSE_LOCK_ACQUIRE(curkse, &_thread_list_lock);
2409	/*
2410	 * Initialize the unique id (which GDB uses to track
2411	 * threads), add the thread to the list of all threads,
2412	 * and increment the number of active threads.
2413	 */
2414	thread->uniqueid = next_uniqueid++;
2415	THR_LIST_ADD(thread);
2416	active_threads++;
2417	KSE_LOCK_RELEASE(curkse, &_thread_list_lock);
2418	_kse_critical_leave(crit);
2419}
2420
2421/*
2422 * Remove an active thread.
2423 */
2424static void
2425thr_unlink(struct pthread *thread)
2426{
2427	kse_critical_t crit;
2428	struct kse *curkse;
2429
2430	crit = _kse_critical_enter();
2431	curkse = _get_curkse();
2432	KSE_LOCK_ACQUIRE(curkse, &_thread_list_lock);
2433	THR_LIST_REMOVE(thread);
2434	active_threads--;
2435	KSE_LOCK_RELEASE(curkse, &_thread_list_lock);
2436	_kse_critical_leave(crit);
2437}
2438
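/*
 * Thread hash table helpers.  The hash table maps a pthread pointer
 * to itself so that _thr_hash_find() can check whether a given
 * pointer refers to a known thread.
 */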
2439void
2440_thr_hash_add(struct pthread *thread)
2441{
2442	struct thread_hash_head *head;
2443
2444	head = &thr_hashtable[THREAD_HASH(thread)];
2445	LIST_INSERT_HEAD(head, thread, hle);
2446}
2447
2448void
2449_thr_hash_remove(struct pthread *thread)
2450{
2451	LIST_REMOVE(thread, hle);
2452}
2453
2454struct pthread *
2455_thr_hash_find(struct pthread *thread)
2456{
2457	struct pthread *td;
2458	struct thread_hash_head *head;
2459
2460	head = &thr_hashtable[THREAD_HASH(thread)];
2461	LIST_FOREACH(td, head, hle) {
2462		if (td == thread)
2463			return (thread);
2464	}
2465	return (NULL);
2466}
2467
2468