1/*
2 * Copyright (C) 2003 Daniel M. Eischen <deischen@freebsd.org>
3 * Copyright (C) 2002 Jonathon Mini <mini@freebsd.org>
4 * Copyright (c) 1995-1998 John Birrell <jb@cimlogic.com.au>
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 * 3. All advertising materials mentioning features or use of this software
16 *    must display the following acknowledgement:
17 *	This product includes software developed by John Birrell.
18 * 4. Neither the name of the author nor the names of any co-contributors
19 *    may be used to endorse or promote products derived from this software
20 *    without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY JOHN BIRRELL AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 */
35#include <sys/cdefs.h>
36__FBSDID("$FreeBSD: head/lib/libkse/thread/thr_kern.c 117344 2003-07-09 01:06:12Z davidxu $");
37
38#include <sys/types.h>
39#include <sys/kse.h>
40#include <sys/signalvar.h>
41#include <sys/queue.h>
42#include <machine/atomic.h>
43#include <machine/sigframe.h>
44
45#include <assert.h>
46#include <errno.h>
47#include <signal.h>
48#include <stdlib.h>
49#include <string.h>
50#include <time.h>
51#include <ucontext.h>
52#include <unistd.h>
53
54#include "atomic_ops.h"
55#include "thr_private.h"
56#include "libc_private.h"
57#include "ksd.h"
58
59/*#define DEBUG_THREAD_KERN */
60#ifdef DEBUG_THREAD_KERN
61#define DBG_MSG		stdout_debug
62#else
63#define DBG_MSG(x...)
64#endif
65
66/*
67 * Define a high water mark for the maximum number of threads that
68 * will be cached.  Once this level is reached, any extra threads
69 * will be free()'d.
70 *
71 * XXX - It doesn't make sense to worry about the maximum number of
72 *       KSEs that we can cache because the system will limit us to
73 *       something *much* less than the maximum number of threads
74 *       that we can have.  Disregarding KSEs in their own group,
75 *       the maximum number of KSEs is the number of processors in
76 *       the system.
77 */
78#define	MAX_CACHED_THREADS	100
79#define	KSE_STACKSIZE		16384
80
81#define	KSE_SET_MBOX(kse, thrd) \
82	(kse)->k_mbx.km_curthread = &(thrd)->tmbx
83
84#define	KSE_SET_EXITED(kse)	(kse)->k_flags |= KF_EXITED
85
86/*
87 * Macros for manipulating the run queues.  The priority queue
88 * routines use the thread's pqe link and also handle the setting
89 * and clearing of the thread's THR_FLAGS_IN_RUNQ flag.
90 */
91#define	KSE_RUNQ_INSERT_HEAD(kse, thrd)			\
92	_pq_insert_head(&(kse)->k_schedq->sq_runq, thrd)
93#define	KSE_RUNQ_INSERT_TAIL(kse, thrd)			\
94	_pq_insert_tail(&(kse)->k_schedq->sq_runq, thrd)
95#define	KSE_RUNQ_REMOVE(kse, thrd)			\
96	_pq_remove(&(kse)->k_schedq->sq_runq, thrd)
97#define	KSE_RUNQ_FIRST(kse)	_pq_first(&(kse)->k_schedq->sq_runq)
98
99#define KSE_RUNQ_THREADS(kse)	((kse)->k_schedq->sq_runq.pq_threads)
100
101/*
102 * We've got to keep track of everything that is allocated, not only
103 * to have a speedy free list, but also so they can be deallocated
104 * after a fork().
105 */
106static TAILQ_HEAD(, kse)	active_kseq;
107static TAILQ_HEAD(, kse)	free_kseq;
108static TAILQ_HEAD(, kse_group)	free_kse_groupq;
109static TAILQ_HEAD(, kse_group)	active_kse_groupq;
110static TAILQ_HEAD(, kse_group)	gc_ksegq;
111static struct lock		kse_lock;	/* also used for kseg queue */
112static int			free_kse_count = 0;
113static int			free_kseg_count = 0;
114static TAILQ_HEAD(, pthread)	free_threadq;
115static struct lock		thread_lock;
116static int			free_thread_count = 0;
117static int			inited = 0;
118static int			active_threads = 1;
119static int			active_kse_count = 0;
120static int			active_kseg_count = 0;
121static u_int64_t		next_uniqueid = 1;
122
123
124#ifdef DEBUG_THREAD_KERN
125static void	dump_queues(struct kse *curkse);
126#endif
127static void	kse_check_completed(struct kse *kse);
128static void	kse_check_waitq(struct kse *kse);
129static void	kse_fini(struct kse *curkse);
130static void	kse_reinit(struct kse *kse);
131static void	kse_sched_multi(struct kse *curkse);
132#ifdef NOT_YET
133static void	kse_sched_single(struct kse *curkse);
134#endif
135static void	kse_switchout_thread(struct kse *kse, struct pthread *thread);
136static void	kse_wait(struct kse *kse, struct pthread *td_wait);
137static void	kse_free_unlocked(struct kse *kse);
138static void	kseg_free_unlocked(struct kse_group *kseg);
139static void	kseg_init(struct kse_group *kseg);
140static void	kseg_reinit(struct kse_group *kseg);
141static void	kse_waitq_insert(struct pthread *thread);
142static void	kse_wakeup_multi(struct kse *curkse);
143static void	kse_wakeup_one(struct pthread *thread);
144static void	thr_cleanup(struct kse *kse, struct pthread *curthread);
145static void	thr_link(struct pthread *thread);
146static void	thr_resume_wrapper(int sig, siginfo_t *, ucontext_t *);
147static void	thr_resume_check(struct pthread *curthread, ucontext_t *ucp,
148		    struct pthread_sigframe *psf);
149static int	thr_timedout(struct pthread *thread, struct timespec *curtime);
150static void	thr_unlink(struct pthread *thread);
151
152
153/*
154 * This is called after a fork().
155 * No locks need to be taken here since we are guaranteed to be
156 * single threaded.
157 *
158 * XXX
 * POSIX says that in a threaded process, fork() should be used
 * only to run new programs, and the effects of calling functions
 * that require certain resources between the call to fork() and
 * the call to an exec function are undefined.
 *
 * Thus it is not safe to reinitialize the library after fork():
 * memory management may be corrupted, so further calls to
 * malloc()/free() may cause undefined behavior.
167 */
168void
169_kse_single_thread(struct pthread *curthread)
170{
171#ifdef NOTYET
172	struct kse *kse;
173	struct kse_group *kseg;
174	struct pthread *thread;
175	kse_critical_t crit;
176	int i;
177
178
179	/*
180	 * Disable upcalls and clear the threaded flag.
181	 * XXX - I don't think we need to disable upcalls after a fork().
	 * XXX - I don't think we need to disable upcalls after a fork(),
	 *       but it doesn't hurt.
184	crit = _kse_critical_enter();
185	__isthreaded = 0;
186	active_threads = 1;
187	_thr_signal_deinit();
188
189	/*
190	 * Enter a loop to remove and free all threads other than
191	 * the running thread from the active thread list:
192	 */
193	while ((thread = TAILQ_FIRST(&_thread_list)) != NULL) {
194		THR_GCLIST_REMOVE(thread);
195		/*
196		 * Remove this thread from the list (the current
197		 * thread will be removed but re-added by libpthread
		 * initialization).
199		 */
200		TAILQ_REMOVE(&_thread_list, thread, tle);
201		/* Make sure this isn't the running thread: */
202		if (thread != curthread) {
203			_thr_stack_free(&thread->attr);
204			if (thread->specific != NULL)
205				free(thread->specific);
206			for (i = 0; i < MAX_THR_LOCKLEVEL; i++) {
207				_lockuser_destroy(&thread->lockusers[i]);
208			}
209			_lock_destroy(&thread->lock);
210			free(thread);
211		}
212	}
213
214	TAILQ_INIT(&curthread->mutexq);		/* initialize mutex queue */
215	curthread->joiner = NULL;		/* no joining threads yet */
216	curthread->refcount = 0;
217	SIGEMPTYSET(curthread->sigpend);	/* clear pending signals */
218	if (curthread->specific != NULL) {
219		free(curthread->specific);
220		curthread->specific = NULL;
221		curthread->specific_data_count = 0;
222	}
223
224	/* Free the free KSEs: */
225	while ((kse = TAILQ_FIRST(&free_kseq)) != NULL) {
226		TAILQ_REMOVE(&free_kseq, kse, k_qe);
227		for (i = 0; i < MAX_KSE_LOCKLEVEL; i++) {
228			_lockuser_destroy(&kse->k_lockusers[i]);
229		}
230		_lock_destroy(&kse->k_lock);
231		_ksd_destroy(&kse->k_ksd);
232		if (kse->k_stack.ss_sp != NULL)
233			free(kse->k_stack.ss_sp);
234		free(kse);
235	}
236	free_kse_count = 0;
237
238	/* Free the active KSEs: */
239	while ((kse = TAILQ_FIRST(&active_kseq)) != NULL) {
240		TAILQ_REMOVE(&active_kseq, kse, k_qe);
241		for (i = 0; i < MAX_KSE_LOCKLEVEL; i++) {
242			_lockuser_destroy(&kse->k_lockusers[i]);
243		}
244		_lock_destroy(&kse->k_lock);
245		if (kse->k_stack.ss_sp != NULL)
246			free(kse->k_stack.ss_sp);
247		free(kse);
248	}
249	active_kse_count = 0;
250
251	/* Free the free KSEGs: */
252	while ((kseg = TAILQ_FIRST(&free_kse_groupq)) != NULL) {
253		TAILQ_REMOVE(&free_kse_groupq, kseg, kg_qe);
254		_lock_destroy(&kseg->kg_lock);
255		_pq_free(&kseg->kg_schedq.sq_runq);
256		free(kseg);
257	}
258	free_kseg_count = 0;
259
260	/* Free the active KSEGs: */
261	while ((kseg = TAILQ_FIRST(&active_kse_groupq)) != NULL) {
262		TAILQ_REMOVE(&active_kse_groupq, kseg, kg_qe);
263		_lock_destroy(&kseg->kg_lock);
264		_pq_free(&kseg->kg_schedq.sq_runq);
265		free(kseg);
266	}
267	active_kseg_count = 0;
268
269	/* Free the free threads. */
270	while ((thread = TAILQ_FIRST(&free_threadq)) != NULL) {
271		TAILQ_REMOVE(&free_threadq, thread, tle);
272		if (thread->specific != NULL)
273			free(thread->specific);
274		for (i = 0; i < MAX_THR_LOCKLEVEL; i++) {
275			_lockuser_destroy(&thread->lockusers[i]);
276		}
277		_lock_destroy(&thread->lock);
278		free(thread);
279	}
280	free_thread_count = 0;
281
282	/* Free the to-be-gc'd threads. */
283	while ((thread = TAILQ_FIRST(&_thread_gc_list)) != NULL) {
284		TAILQ_REMOVE(&_thread_gc_list, thread, gcle);
285		for (i = 0; i < MAX_THR_LOCKLEVEL; i++) {
286			_lockuser_destroy(&thread->lockusers[i]);
287		}
288		_lock_destroy(&thread->lock);
289		free(thread);
290	}
291	TAILQ_INIT(&gc_ksegq);
292	_gc_count = 0;
293
294	if (inited != 0) {
295		/*
296		 * Destroy these locks; they'll be recreated to assure they
297		 * are in the unlocked state.
298		 */
299		_lock_destroy(&kse_lock);
300		_lock_destroy(&thread_lock);
301		_lock_destroy(&_thread_list_lock);
302		inited = 0;
303	}
304
305	/*
306	 * After a fork(), the leftover thread goes back to being
307	 * scope process.
308	 */
309	curthread->attr.flags &= ~PTHREAD_SCOPE_SYSTEM;
310	curthread->attr.flags |= PTHREAD_SCOPE_PROCESS;
311
312	/*
313	 * After a fork, we are still operating on the thread's original
314	 * stack.  Don't clear the THR_FLAGS_USER from the thread's
315	 * attribute flags.
316	 */
317
318	/* Initialize the threads library. */
319	curthread->kse = NULL;
320	curthread->kseg = NULL;
321	_kse_initial = NULL;
322	_libpthread_init(curthread);
323#else
324	_ksd_readandclear_tmbx();
325	__isthreaded   = 0;
326	active_threads = 0;
327	_thr_signal_deinit();
328#endif
329}
330
331/*
332 * This is used to initialize housekeeping and to initialize the
333 * KSD for the KSE.
334 */
335void
336_kse_init(void)
337{
338	if (inited == 0) {
339		TAILQ_INIT(&active_kseq);
340		TAILQ_INIT(&active_kse_groupq);
341		TAILQ_INIT(&free_kseq);
342		TAILQ_INIT(&free_kse_groupq);
343		TAILQ_INIT(&free_threadq);
344		TAILQ_INIT(&gc_ksegq);
345		if (_lock_init(&kse_lock, LCK_ADAPTIVE,
346		    _kse_lock_wait, _kse_lock_wakeup) != 0)
347			PANIC("Unable to initialize free KSE queue lock");
348		if (_lock_init(&thread_lock, LCK_ADAPTIVE,
349		    _kse_lock_wait, _kse_lock_wakeup) != 0)
350			PANIC("Unable to initialize free thread queue lock");
351		if (_lock_init(&_thread_list_lock, LCK_ADAPTIVE,
352		    _kse_lock_wait, _kse_lock_wakeup) != 0)
353			PANIC("Unable to initialize thread list lock");
354		active_kse_count = 0;
355		active_kseg_count = 0;
356		_gc_count = 0;
357		inited = 1;
358	}
359}
360
361int
362_kse_isthreaded(void)
363{
364	return (__isthreaded != 0);
365}
366
367/*
368 * This is called when the first thread (other than the initial
369 * thread) is created.
370 */
371int
372_kse_setthreaded(int threaded)
373{
374	if ((threaded != 0) && (__isthreaded == 0)) {
375		/*
376		 * Locking functions in libc are required when there are
377		 * threads other than the initial thread.
378		 */
379		__isthreaded = 1;
380
381		/*
382		 * Tell the kernel to create a KSE for the initial thread
383		 * and enable upcalls in it.
384		 */
385		_thr_signal_init();
386		_kse_initial->k_flags |= KF_STARTED;
387		if (kse_create(&_kse_initial->k_mbx, 0) != 0) {
388			_kse_initial->k_flags &= ~KF_STARTED;
389			__isthreaded = 0;
390			/* may abort() */
391			PANIC("kse_create() failed\n");
392			return (-1);
393		}
394		KSE_SET_MBOX(_kse_initial, _thr_initial);
395		_thr_start_sig_daemon();
396		_thr_setmaxconcurrency();
397	}
398	return (0);
399}
400
401/*
402 * Lock wait and wakeup handlers for KSE locks.  These are only used by
403 * KSEs, and should never be used by threads.  KSE locks include the
404 * KSE group lock (used for locking the scheduling queue) and the
405 * kse_lock defined above.
406 *
407 * When a KSE lock attempt blocks, the entire KSE blocks allowing another
408 * KSE to run.  For the most part, it doesn't make much sense to try and
409 * schedule another thread because you need to lock the scheduling queue
410 * in order to do that.  And since the KSE lock is used to lock the scheduling
411 * queue, you would just end up blocking again.
412 */
413void
414_kse_lock_wait(struct lock *lock, struct lockuser *lu)
415{
416	struct kse *curkse = (struct kse *)_LCK_GET_PRIVATE(lu);
417	struct timespec ts;
418	int saved_flags;
419
420	if (curkse->k_mbx.km_curthread != NULL)
421		PANIC("kse_lock_wait does not disable upcall.\n");
422	/*
423	 * Enter a loop to wait until we get the lock.
424	 */
425	ts.tv_sec = 0;
	ts.tv_nsec = 1000000;  /* 1 ms */
427	while (!_LCK_GRANTED(lu)) {
428		/*
429		 * Yield the kse and wait to be notified when the lock
430		 * is granted.
431		 */
432		saved_flags = curkse->k_mbx.km_flags;
433		curkse->k_mbx.km_flags |= KMF_NOUPCALL | KMF_NOCOMPLETED;
434		kse_release(&ts);
435		curkse->k_mbx.km_flags = saved_flags;
436	}
437}
438
439void
440_kse_lock_wakeup(struct lock *lock, struct lockuser *lu)
441{
442	struct kse *curkse;
443	struct kse *kse;
444	struct kse_mailbox *mbx;
445
446	curkse = _get_curkse();
447	kse = (struct kse *)_LCK_GET_PRIVATE(lu);
448
449	if (kse == curkse)
450		PANIC("KSE trying to wake itself up in lock");
451	else {
452		mbx = &kse->k_mbx;
453		_lock_grant(lock, lu);
454		/*
455		 * Notify the owning kse that it has the lock.
		 * It is safe to pass an invalid address to kse_wakeup
		 * even if the mailbox is not known to the kernel at all,
		 * and waking up the wrong KSE is also harmless.
459		 */
460		kse_wakeup(mbx);
461	}
462}
463
464/*
465 * Thread wait and wakeup handlers for thread locks.  These are only used
466 * by threads, never by KSEs.  Thread locks include the per-thread lock
467 * (defined in its structure), and condition variable and mutex locks.
468 */
469void
470_thr_lock_wait(struct lock *lock, struct lockuser *lu)
471{
472	struct pthread *curthread = (struct pthread *)lu->lu_private;
473
474	do {
475		THR_SCHED_LOCK(curthread, curthread);
476		THR_SET_STATE(curthread, PS_LOCKWAIT);
477		THR_SCHED_UNLOCK(curthread, curthread);
478		_thr_sched_switch(curthread);
479	} while (!_LCK_GRANTED(lu));
480}
481
482void
483_thr_lock_wakeup(struct lock *lock, struct lockuser *lu)
484{
485	struct pthread *thread;
486	struct pthread *curthread;
487
488	curthread = _get_curthread();
489	thread = (struct pthread *)_LCK_GET_PRIVATE(lu);
490
491	THR_SCHED_LOCK(curthread, thread);
492	_lock_grant(lock, lu);
493	_thr_setrunnable_unlocked(thread);
494	THR_SCHED_UNLOCK(curthread, thread);
495}
496
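/*
 * Enter a KSE critical region.  Reading and clearing the KSE's
 * current-thread mailbox pointer disables upcalls for this KSE;
 * the old value is returned so that _kse_critical_leave() can
 * restore it and re-enable upcalls.
 */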
497kse_critical_t
498_kse_critical_enter(void)
499{
500	kse_critical_t crit;
501
502	crit = _ksd_readandclear_tmbx();
503	return (crit);
504}
505
506void
507_kse_critical_leave(kse_critical_t crit)
508{
509	struct pthread *curthread;
510
511	_ksd_set_tmbx(crit);
512	if ((crit != NULL) && ((curthread = _get_curthread()) != NULL))
513		THR_YIELD_CHECK(curthread);
514}
515
516int
517_kse_in_critical(void)
518{
519	return (_ksd_get_tmbx() == NULL);
520}
521
522void
523_thr_critical_enter(struct pthread *thread)
524{
525	thread->critical_count++;
526}
527
528void
529_thr_critical_leave(struct pthread *thread)
530{
531	thread->critical_count--;
532	THR_YIELD_CHECK(thread);
533}
534
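/*
 * Voluntarily switch out the current thread: enter a KSE critical
 * region and take the KSE group's scheduling lock, then hand off
 * to _thr_sched_switch_unlocked().
 */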
535void
536_thr_sched_switch(struct pthread *curthread)
537{
538	struct kse *curkse;
539
540	(void)_kse_critical_enter();
541	curkse = _get_curkse();
542	KSE_SCHED_LOCK(curkse, curkse->k_kseg);
543	_thr_sched_switch_unlocked(curthread);
544}
545
546/*
547 * XXX - We may need to take the scheduling lock before calling
548 *       this, or perhaps take the lock within here before
549 *       doing anything else.
550 */
551void
552_thr_sched_switch_unlocked(struct pthread *curthread)
553{
554	struct pthread *td;
555	struct pthread_sigframe psf;
556	struct kse *curkse;
557	int ret;
558	volatile int uts_once;
559	volatile int resume_once = 0;
560	ucontext_t uc;
561
562	/* We're in the scheduler, 5 by 5: */
563	curkse = _get_curkse();
564
565	curthread->need_switchout = 1;	/* The thread yielded on its own. */
566	curthread->critical_yield = 0;	/* No need to yield anymore. */
567	curthread->slice_usec = -1;	/* Restart the time slice. */
568
569	/* Thread can unlock the scheduler lock. */
570	curthread->lock_switch = 1;
571
572	/*
573	 * The signal frame is allocated off the stack because
574	 * a thread can be interrupted by other signals while
575	 * it is running down pending signals.
576	 */
577	psf.psf_valid = 0;
578	curthread->curframe = &psf;
579
580	/*
581	 * Enter the scheduler if any one of the following is true:
582	 *
	 *   o The current thread is dead; its stack needs to be
	 *     cleaned up and that can't be done while operating on
	 *     it.
	 *   o The current thread has signals pending; the scheduler
	 *     should install the signal trampoline for us.
588	 *   o There are no runnable threads.
589	 *   o The next thread to run won't unlock the scheduler
590	 *     lock.  A side note: the current thread may be run
591	 *     instead of the next thread in the run queue, but
592	 *     we don't bother checking for that.
593	 */
594	if ((curthread->state == PS_DEAD) ||
595	    (((td = KSE_RUNQ_FIRST(curkse)) == NULL) &&
596	    (curthread->state != PS_RUNNING)) ||
597	    ((td != NULL) && (td->lock_switch == 0))) {
598		curkse->k_switch = 1;
599		_thread_enter_uts(&curthread->tmbx, &curkse->k_mbx);
600	}
601	else {
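		/*
		 * Switch directly to the next runnable thread without
		 * entering the UTS.  THR_GETCONTEXT() returns twice:
		 * once now and once more when this thread's saved
		 * context is resumed, so uts_once distinguishes the
		 * first return (perform the switch) from the second
		 * (simply continue running).
		 */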
602		uts_once = 0;
603		THR_GETCONTEXT(&curthread->tmbx.tm_context);
604		if (uts_once == 0) {
605			uts_once = 1;
606
607			/* Switchout the current thread. */
608			kse_switchout_thread(curkse, curthread);
609
610		 	/* Choose another thread to run. */
611			td = KSE_RUNQ_FIRST(curkse);
612			KSE_RUNQ_REMOVE(curkse, td);
613			curkse->k_curthread = td;
614
615			/*
616			 * Make sure the current thread's kse points to
617			 * this kse.
618			 */
619			td->kse = curkse;
620
621			/*
622			 * Reset accounting.
623			 */
624			td->tmbx.tm_uticks = 0;
625			td->tmbx.tm_sticks = 0;
626
627			/*
628			 * Reset the time slice if this thread is running
629			 * for the first time or running again after using
630			 * its full time slice allocation.
631			 */
632			if (td->slice_usec == -1)
633				td->slice_usec = 0;
634
635			/* Mark the thread active. */
636			td->active = 1;
637
638			/* Remove the frame reference. */
639			td->curframe = NULL;
640
641			/*
642			 * Continue the thread at its current frame:
643			 */
644			ret = _thread_switch(&td->tmbx, NULL);
645			/* This point should not be reached. */
646			if (ret != 0)
647				PANIC("Bad return from _thread_switch");
648			PANIC("Thread has returned from _thread_switch");
649		}
650	}
651
652	if (psf.psf_valid) {
653		/*
		 * It is ugly that we must increase the critical count:
		 * because we have a frame saved, we must back out the
		 * state in psf before we can process signals.
		 */
658		curthread->critical_count++;
659	}
660
661	if (curthread->lock_switch != 0) {
662		/*
663		 * Unlock the scheduling queue and leave the
664		 * critical region.
665		 */
666		/* Don't trust this after a switch! */
667		curkse = _get_curkse();
668
669		curthread->lock_switch = 0;
670		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
671		_kse_critical_leave(&curthread->tmbx);
672	}
673	/*
674	 * This thread is being resumed; check for cancellations.
675	 */
676	if ((psf.psf_valid ||
677	    (curthread->check_pending && !THR_IN_CRITICAL(curthread)))) {
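		/*
		 * As with uts_once above, THR_GETCONTEXT() returns a
		 * second time when the saved context is resumed, and
		 * resume_once keeps the signal/cancellation rundown
		 * from being performed twice.
		 */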
678		resume_once = 0;
679		THR_GETCONTEXT(&uc);
680		if (resume_once == 0) {
681			resume_once = 1;
682			curthread->check_pending = 0;
683			thr_resume_check(curthread, &uc, &psf);
684		}
685	}
686	THR_ACTIVATE_LAST_LOCK(curthread);
687}
688
689/*
690 * This is the scheduler for a KSE which runs a scope system thread.
691 * The multi-thread KSE scheduler should also work for a single threaded
692 * KSE, but we use a separate scheduler so that it can be fine-tuned
693 * to be more efficient (and perhaps not need a separate stack for
694 * the KSE, allowing it to use the thread's stack).
695 *
696 * XXX - This probably needs some work.
697 */
698#ifdef NOT_YET
699static void
700kse_sched_single(struct kse *curkse)
701{
702	struct pthread *curthread = curkse->k_curthread;
703	struct pthread *td_wait;
704	struct timespec ts;
705	int level;
706
707	if (curthread->active == 0) {
708		if (curthread->state != PS_RUNNING) {
709			/* Check to see if the thread has timed out. */
710			KSE_GET_TOD(curkse, &ts);
711			if (thr_timedout(curthread, &ts) != 0) {
712				curthread->timeout = 1;
713				curthread->state = PS_RUNNING;
714			}
715		}
716	}
717
718	/* This thread no longer needs to yield the CPU: */
719	curthread->critical_yield = 0;
720	curthread->need_switchout = 0;
721
722	/*
723	 * Lock the scheduling queue.
724	 *
725	 * There is no scheduling queue for single threaded KSEs,
726	 * but we need a lock for protection regardless.
727	 */
728	KSE_SCHED_LOCK(curkse, curkse->k_kseg);
729
730	/*
731	 * This has to do the job of kse_switchout_thread(), only
732	 * for a single threaded KSE/KSEG.
733	 */
734
735	switch (curthread->state) {
736	case PS_DEAD:
737		/* Unlock the scheduling queue and exit the KSE and thread. */
		thr_cleanup(curkse, curthread);
739		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
740		break;
741
742	case PS_COND_WAIT:
743	case PS_SLEEP_WAIT:
744		/* Only insert threads that can timeout: */
745		if (curthread->wakeup_time.tv_sec != -1) {
746			/* Insert into the waiting queue: */
747			KSE_WAITQ_INSERT(curkse, curthread);
748		}
749		break;
750
751	case PS_LOCKWAIT:
752		level = curthread->locklevel - 1;
753		if (!_LCK_GRANTED(&curthread->lockusers[level]))
754			KSE_WAITQ_INSERT(curkse, curthread);
755		else
756			THR_SET_STATE(curthread, PS_RUNNING);
757		break;
758
759	case PS_JOIN:
760	case PS_MUTEX_WAIT:
761	case PS_RUNNING:
762	case PS_SIGSUSPEND:
763	case PS_SIGWAIT:
764	case PS_SUSPENDED:
765	case PS_DEADLOCK:
766	default:
767		/*
768		 * These states don't timeout and don't need
769		 * to be in the waiting queue.
770		 */
771		break;
772	}
773	while (curthread->state != PS_RUNNING) {
774		curthread->active = 0;
775		td_wait = KSE_WAITQ_FIRST(curkse);
776
777		kse_wait(curkse, td_wait);
778
779	    	if (td_wait != NULL) {
780			KSE_GET_TOD(curkse, &ts);
			if (thr_timedout(td_wait, &ts)) {
782				/* Indicate the thread timedout: */
783				td_wait->timeout = 1;
784
785				/* Make the thread runnable. */
786				THR_SET_STATE(td_wait, PS_RUNNING);
787				KSE_WAITQ_REMOVE(curkse, td_wait);
788			}
789		}
790	}
791
792	/* Remove the frame reference. */
793	curthread->curframe = NULL;
794
795	/* Unlock the scheduling queue. */
796	KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
797
798	/*
799	 * Continue the thread at its current frame:
800	 */
801	DBG_MSG("Continuing bound thread %p\n", curthread);
802	_thread_switch(&curthread->tmbx, &curkse->k_mbx.km_curthread);
803	PANIC("Thread has returned from _thread_switch");
804}
805#endif
806
807#ifdef DEBUG_THREAD_KERN
808static void
809dump_queues(struct kse *curkse)
810{
811	struct pthread *thread;
812
813	DBG_MSG("Threads in waiting queue:\n");
814	TAILQ_FOREACH(thread, &curkse->k_kseg->kg_schedq.sq_waitq, pqe) {
815		DBG_MSG("  thread %p, state %d, blocked %d\n",
816		    thread, thread->state, thread->blocked);
817	}
818}
819#endif
820
821/*
822 * This is the scheduler for a KSE which runs multiple threads.
823 */
824static void
825kse_sched_multi(struct kse *curkse)
826{
827	struct pthread *curthread, *td_wait;
828	struct pthread_sigframe *curframe;
829	int ret;
830
831	THR_ASSERT(curkse->k_mbx.km_curthread == NULL,
832	    "Mailbox not null in kse_sched_multi");
833
834	/* Check for first time initialization: */
835	if ((curkse->k_flags & KF_INITIALIZED) == 0) {
		/* Set up this KSE's specific data. */
837		_ksd_setprivate(&curkse->k_ksd);
838		_set_curkse(curkse);
839
840		/* Set this before grabbing the context. */
841		curkse->k_flags |= KF_INITIALIZED;
842	}
843
844	/* This may have returned from a kse_release(). */
845	if (KSE_WAITING(curkse)) {
846		DBG_MSG("Entered upcall when KSE is waiting.");
847		KSE_CLEAR_WAIT(curkse);
848	}
849
	/* If this is an upcall, take the scheduler lock. */
851	if (curkse->k_switch == 0)
852		KSE_SCHED_LOCK(curkse, curkse->k_kseg);
853	curkse->k_switch = 0;
854
855	curthread = curkse->k_curthread;
856
857	if (KSE_IS_IDLE(curkse)) {
858		KSE_CLEAR_IDLE(curkse);
859		curkse->k_kseg->kg_idle_kses--;
860	}
861	/*
862	 * If the current thread was completed in another KSE, then
863	 * it will be in the run queue.  Don't mark it as being blocked.
864	 */
865	if ((curthread != NULL) &&
866	    ((curthread->flags & THR_FLAGS_IN_RUNQ) == 0) &&
867	    (curthread->need_switchout == 0)) {
868		/*
869		 * Assume the current thread is blocked; when the
870		 * completed threads are checked and if the current
871		 * thread is among the completed, the blocked flag
872		 * will be cleared.
873		 */
874		curthread->blocked = 1;
875	}
876
877	/* Check for any unblocked threads in the kernel. */
878	kse_check_completed(curkse);
879
880	/*
881	 * Check for threads that have timed-out.
882	 */
883	kse_check_waitq(curkse);
884
885	/*
886	 * Switchout the current thread, if necessary, as the last step
887	 * so that it is inserted into the run queue (if it's runnable)
888	 * _after_ any other threads that were added to it above.
889	 */
890	if (curthread == NULL)
891		;  /* Nothing to do here. */
892	else if ((curthread->need_switchout == 0) &&
893	    (curthread->blocked == 0) && (THR_IN_CRITICAL(curthread))) {
894		/*
895		 * Resume the thread and tell it to yield when
896		 * it leaves the critical region.
897		 */
898		curthread->critical_yield = 1;
899		curthread->active = 1;
900		if ((curthread->flags & THR_FLAGS_IN_RUNQ) != 0)
901			KSE_RUNQ_REMOVE(curkse, curthread);
902		curkse->k_curthread = curthread;
903		curthread->kse = curkse;
904		DBG_MSG("Continuing thread %p in critical region\n",
905		    curthread);
906		kse_wakeup_multi(curkse);
907		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
908		ret = _thread_switch(&curthread->tmbx,
909		    &curkse->k_mbx.km_curthread);
910		if (ret != 0)
911			PANIC("Can't resume thread in critical region\n");
912	}
913	else if ((curthread->flags & THR_FLAGS_IN_RUNQ) == 0)
914		kse_switchout_thread(curkse, curthread);
915	curkse->k_curthread = NULL;
916
917	kse_wakeup_multi(curkse);
918
919#ifdef DEBUG_THREAD_KERN
920	dump_queues(curkse);
921#endif
922
923	/* Check if there are no threads ready to run: */
924	while (((curthread = KSE_RUNQ_FIRST(curkse)) == NULL) &&
925	    (curkse->k_kseg->kg_threadcount != 0)) {
926		/*
927		 * Wait for a thread to become active or until there are
928		 * no more threads.
929		 */
930		td_wait = KSE_WAITQ_FIRST(curkse);
931		kse_wait(curkse, td_wait);
932		kse_check_completed(curkse);
933		kse_check_waitq(curkse);
934	}
935
936	/* Check for no more threads: */
937	if (curkse->k_kseg->kg_threadcount == 0) {
938		/*
939		 * Normally this shouldn't return, but it will if there
940		 * are other KSEs running that create new threads that
941		 * are assigned to this KSE[G].  For instance, if a scope
942		 * system thread were to create a scope process thread
943		 * and this kse[g] is the initial kse[g], then that newly
944		 * created thread would be assigned to us (the initial
945		 * kse[g]).
946		 */
947		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
948		kse_fini(curkse);
949		/* never returns */
950	}
951
952	THR_ASSERT(curthread != NULL,
953	    "Return from kse_wait/fini without thread.");
954	THR_ASSERT(curthread->state != PS_DEAD,
955	    "Trying to resume dead thread!");
956	KSE_RUNQ_REMOVE(curkse, curthread);
957
958	/*
959	 * Make the selected thread the current thread.
960	 */
961	curkse->k_curthread = curthread;
962
963	/*
964	 * Make sure the current thread's kse points to this kse.
965	 */
966	curthread->kse = curkse;
967
968	/*
969	 * Reset accounting.
970	 */
971	curthread->tmbx.tm_uticks = 0;
972	curthread->tmbx.tm_sticks = 0;
973
974	/*
975	 * Reset the time slice if this thread is running for the first
976	 * time or running again after using its full time slice allocation.
977	 */
978	if (curthread->slice_usec == -1)
979		curthread->slice_usec = 0;
980
981	/* Mark the thread active. */
982	curthread->active = 1;
983
984	/* Remove the frame reference. */
985	curframe = curthread->curframe;
986	curthread->curframe = NULL;
987
988	kse_wakeup_multi(curkse);
989
990	/*
991	 * The thread's current signal frame will only be NULL if it
992	 * is being resumed after being blocked in the kernel.  In
993	 * this case, and if the thread needs to run down pending
994	 * signals or needs a cancellation check, we need to add a
995	 * signal frame to the thread's context.
996	 */
997#ifdef NOT_YET
998	if ((((curframe == NULL) && (curthread->check_pending != 0)) ||
999	    (((curthread->cancelflags & THR_AT_CANCEL_POINT) == 0) &&
1000	     ((curthread->cancelflags & PTHREAD_CANCEL_ASYNCHRONOUS) != 0))) &&
1001	     !THR_IN_CRITICAL(curthread))
1002		signalcontext(&curthread->tmbx.tm_context, 0,
1003		    (__sighandler_t *)thr_resume_wrapper);
1004#else
1005	if ((curframe == NULL) && (curthread->check_pending != 0) &&
1006	    !THR_IN_CRITICAL(curthread)) {
1007		curthread->check_pending = 0;
1008		signalcontext(&curthread->tmbx.tm_context, 0,
1009		    (__sighandler_t *)thr_resume_wrapper);
1010	}
1011#endif
1012	/*
1013	 * Continue the thread at its current frame:
1014	 */
1015	if (curthread->lock_switch != 0) {
1016		/*
1017		 * This thread came from a scheduler switch; it will
1018		 * unlock the scheduler lock and set the mailbox.
1019		 */
1020		ret = _thread_switch(&curthread->tmbx, NULL);
1021	} else {
1022		/* This thread won't unlock the scheduler lock. */
1023		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
1024		ret = _thread_switch(&curthread->tmbx,
1025		    &curkse->k_mbx.km_curthread);
1026	}
1027	if (ret != 0)
1028		PANIC("Thread has returned from _thread_switch");
1029
1030	/* This point should not be reached. */
1031	PANIC("Thread has returned from _thread_switch");
1032}
1033
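/*
 * Signal-frame trampoline installed via signalcontext() by the
 * scheduler when a resuming thread has pending signals to run
 * down.  It performs the checks and then switches back to the
 * thread's saved context.
 */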
1034static void
1035thr_resume_wrapper(int sig, siginfo_t *siginfo, ucontext_t *ucp)
1036{
1037	struct pthread *curthread = _get_curthread();
1038	struct kse *curkse;
1039	int ret, err_save = curthread->error;
1040
1041	DBG_MSG(">>> sig wrapper\n");
1042	if (curthread->lock_switch)
1043		PANIC("thr_resume_wrapper, lock_switch != 0\n");
1044	thr_resume_check(curthread, ucp, NULL);
1045	_kse_critical_enter();
1046	curkse = _get_curkse();
1047	curthread->tmbx.tm_context = *ucp;
1048	curthread->error = err_save;
1049	ret = _thread_switch(&curthread->tmbx, &curkse->k_mbx.km_curthread);
1050	if (ret != 0)
1051		PANIC("thr_resume_wrapper: thread has returned "
1052		      "from _thread_switch");
	/* THR_SETCONTEXT(ucp); */ /* XXX - doesn't work; why? */
1054}
1055
1056static void
1057thr_resume_check(struct pthread *curthread, ucontext_t *ucp,
1058    struct pthread_sigframe *psf)
1059{
1060	_thr_sig_rundown(curthread, ucp, psf);
1061
1062#ifdef NOT_YET
1063	if (((curthread->cancelflags & THR_AT_CANCEL_POINT) == 0) &&
1064	    ((curthread->cancelflags & PTHREAD_CANCEL_ASYNCHRONOUS) != 0))
1065		pthread_testcancel();
1066#endif
1067}
1068
1069/*
1070 * Clean up a thread.  This must be called with the thread's KSE
1071 * scheduling lock held.  The thread must be a thread from the
1072 * KSE's group.
1073 */
1074static void
1075thr_cleanup(struct kse *curkse, struct pthread *thread)
1076{
1077	struct pthread *joiner;
1078	int sys_scope;
1079
1080	if ((joiner = thread->joiner) != NULL) {
1081		/* Joinee scheduler lock held; joiner won't leave. */
1082		if (joiner->kseg == curkse->k_kseg) {
1083			if (joiner->join_status.thread == thread) {
1084				joiner->join_status.thread = NULL;
1085				joiner->join_status.ret = thread->ret;
1086				_thr_setrunnable_unlocked(joiner);
1087			}
1088		} else {
1089			KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
1090			/* The joiner may have removed itself and exited. */
1091			if (_thr_ref_add(thread, joiner, 0) == 0) {
1092				KSE_SCHED_LOCK(curkse, joiner->kseg);
1093				if (joiner->join_status.thread == thread) {
1094					joiner->join_status.thread = NULL;
1095					joiner->join_status.ret = thread->ret;
1096					_thr_setrunnable_unlocked(joiner);
1097				}
1098				KSE_SCHED_UNLOCK(curkse, joiner->kseg);
1099				_thr_ref_delete(thread, joiner);
1100			}
1101			KSE_SCHED_LOCK(curkse, curkse->k_kseg);
1102		}
1103		thread->attr.flags |= PTHREAD_DETACHED;
1104	}
1105
1106	if (!(sys_scope = (thread->attr.flags & PTHREAD_SCOPE_SYSTEM))) {
1107		/*
1108		 * Remove the thread from the KSEG's list of threads.
1109	 	 */
1110		KSEG_THRQ_REMOVE(thread->kseg, thread);
1111		/*
1112		 * Migrate the thread to the main KSE so that this
1113		 * KSE and KSEG can be cleaned when their last thread
1114		 * exits.
1115		 */
1116		thread->kseg = _kse_initial->k_kseg;
1117		thread->kse = _kse_initial;
1118	}
1119	thread->flags |= THR_FLAGS_GC_SAFE;
1120
1121	/*
1122	 * We can't hold the thread list lock while holding the
1123	 * scheduler lock.
1124	 */
1125	KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
1126	DBG_MSG("Adding thread %p to GC list\n", thread);
1127	KSE_LOCK_ACQUIRE(curkse, &_thread_list_lock);
1128	THR_GCLIST_ADD(thread);
1129	/* Use thread_list_lock */
1130	active_threads--;
1131	if (active_threads == 1) {
1132		KSE_LOCK_RELEASE(curkse, &_thread_list_lock);
1133		exit(0);
1134        }
1135	KSE_LOCK_RELEASE(curkse, &_thread_list_lock);
1136	if (sys_scope) {
1137		/*
		 * A system scope thread is in its own thread group;
		 * when the thread exits, its kse and ksegrp should
		 * be recycled as well.
1141		 */
1142		kse_exit();
1143		PANIC("kse_exit() failed for system scope thread");
1144	}
1145	KSE_SCHED_LOCK(curkse, curkse->k_kseg);
1146}
1147
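/*
 * Garbage collect exited threads.  GC-safe threads have their
 * stacks freed; once a thread is also detached and unreferenced,
 * its KSE and KSE group (for system scope threads) and the thread
 * structure itself are freed as well.  The initial thread is never
 * freed.
 */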
1148void
1149_thr_gc(struct pthread *curthread)
1150{
1151	struct pthread *td, *td_next;
1152	kse_critical_t crit;
1153	TAILQ_HEAD(, pthread) worklist;
1154
1155	TAILQ_INIT(&worklist);
1156	crit = _kse_critical_enter();
1157	KSE_LOCK_ACQUIRE(curthread->kse, &_thread_list_lock);
1158
1159	/* Check the threads waiting for GC. */
1160	for (td = TAILQ_FIRST(&_thread_gc_list); td != NULL; td = td_next) {
1161		td_next = TAILQ_NEXT(td, gcle);
1162		if ((td->flags & THR_FLAGS_GC_SAFE) == 0)
1163			continue;
1164		else if (((td->attr.flags & PTHREAD_SCOPE_SYSTEM) != 0) &&
1165		    ((td->kse->k_mbx.km_flags & KMF_DONE) == 0)) {
1166			/*
1167			 * The thread and KSE are operating on the same
1168			 * stack.  Wait for the KSE to exit before freeing
1169			 * the thread's stack as well as everything else.
1170			 */
1171			continue;
1172		}
1173		/*
1174		 * Remove the thread from the GC list.  If the thread
1175		 * isn't yet detached, it will get added back to the
1176		 * GC list at a later time.
1177		 */
1178		THR_GCLIST_REMOVE(td);
1179		DBG_MSG("Freeing thread %p stack\n", td);
1180		/*
1181		 * We can free the thread stack since it's no longer
1182		 * in use.
1183		 */
1184		_thr_stack_free(&td->attr);
1185		if (((td->attr.flags & PTHREAD_DETACHED) != 0) &&
1186		    (td->refcount == 0)) {
1187			/*
1188			 * The thread has detached and is no longer
1189			 * referenced.  It is safe to remove all
1190			 * remnants of the thread.
1191			 */
1192			THR_LIST_REMOVE(td);
1193			TAILQ_INSERT_HEAD(&worklist, td, gcle);
1194		}
1195	}
1196	KSE_LOCK_RELEASE(curthread->kse, &_thread_list_lock);
1197	_kse_critical_leave(crit);
1198
1199	while ((td = TAILQ_FIRST(&worklist)) != NULL) {
1200		TAILQ_REMOVE(&worklist, td, gcle);
1201
1202		if ((td->attr.flags & PTHREAD_SCOPE_SYSTEM) != 0) {
1203			crit = _kse_critical_enter();
1204			KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
1205			kse_free_unlocked(td->kse);
1206			kseg_free_unlocked(td->kseg);
1207			KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
1208			_kse_critical_leave(crit);
1209		}
1210		/*
		 * XXX we don't free the initial thread, because some
		 * code might still be referencing it.
1213		 */
1214		if (td != _thr_initial) {
1215			DBG_MSG("Freeing thread %p\n", td);
1216			_thr_free(curthread, td);
1217		} else
1218			DBG_MSG("Initial thread won't be freed\n");
1219	}
	/* XXX the free kse and ksegrp lists should be examined as well */
1221}
1222
1223
1224/*
1225 * Only new threads that are running or suspended may be scheduled.
1226 */
1227int
1228_thr_schedule_add(struct pthread *curthread, struct pthread *newthread)
1229{
1230	kse_critical_t crit;
1231	int ret;
1232
1233	/* Add the new thread. */
1234	thr_link(newthread);
1235
1236	/*
1237	 * If this is the first time creating a thread, make sure
1238	 * the mailbox is set for the current thread.
1239	 */
1240	if ((newthread->attr.flags & PTHREAD_SCOPE_SYSTEM) != 0) {
1241#ifdef NOT_YET
1242		/* We use the thread's stack as the KSE's stack. */
		newthread->kse->k_mbx.km_stack.ss_sp =
		    newthread->attr.stackaddr_attr;
		newthread->kse->k_mbx.km_stack.ss_size =
		    newthread->attr.stacksize_attr;
1247#endif
1248		/*
1249		 * No need to lock the scheduling queue since the
1250		 * KSE/KSEG pair have not yet been started.
1251		 */
1252		KSEG_THRQ_ADD(newthread->kseg, newthread);
1253		if (newthread->state == PS_RUNNING)
1254			THR_RUNQ_INSERT_TAIL(newthread);
1255		newthread->kse->k_curthread = NULL;
1256		newthread->kse->k_mbx.km_flags = 0;
1257		newthread->kse->k_mbx.km_func = (kse_func_t *)kse_sched_multi;
1258		newthread->kse->k_mbx.km_quantum = 0;
1259
1260		/*
1261		 * This thread needs a new KSE and KSEG.
1262		 */
1263		newthread->kse->k_flags &= ~KF_INITIALIZED;
1264		newthread->kse->k_flags |= KF_STARTED;
1265		ret = kse_create(&newthread->kse->k_mbx, 1);
1266		if (ret != 0)
1267			ret = errno;
1268	}
1269	else {
1270		/*
1271		 * Lock the KSE and add the new thread to its list of
1272		 * assigned threads.  If the new thread is runnable, also
1273		 * add it to the KSE's run queue.
1274		 */
1275		crit = _kse_critical_enter();
1276		KSE_SCHED_LOCK(curthread->kse, newthread->kseg);
1277		KSEG_THRQ_ADD(newthread->kseg, newthread);
1278		if (newthread->state == PS_RUNNING)
1279			THR_RUNQ_INSERT_TAIL(newthread);
1280		if ((newthread->kse->k_flags & KF_STARTED) == 0) {
1281			/*
1282			 * This KSE hasn't been started yet.  Start it
1283			 * outside of holding the lock.
1284			 */
1285			newthread->kse->k_flags |= KF_STARTED;
1286			newthread->kse->k_mbx.km_func =
1287			    (kse_func_t *)kse_sched_multi;
1288			newthread->kse->k_mbx.km_flags = 0;
1289			kse_create(&newthread->kse->k_mbx, 0);
1290		 } else if ((newthread->state == PS_RUNNING) &&
1291		     KSE_IS_IDLE(newthread->kse)) {
1292			/*
1293			 * The thread is being scheduled on another KSEG.
1294			 */
1295			kse_wakeup_one(newthread);
1296		}
1297		KSE_SCHED_UNLOCK(curthread->kse, newthread->kseg);
1298		_kse_critical_leave(crit);
1299		ret = 0;
1300	}
1301	if (ret != 0)
1302		thr_unlink(newthread);
1303
1304	return (ret);
1305}
1306
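/*
 * Insert a thread into its KSE's wait queue.  Threads with no
 * timeout (wakeup_time.tv_sec == -1) go at the tail; all others
 * are inserted in order of increasing wakeup time, so the thread
 * with the earliest wakeup is always at the front of the queue.
 */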
1307void
1308kse_waitq_insert(struct pthread *thread)
1309{
1310	struct pthread *td;
1311
1312	if (thread->wakeup_time.tv_sec == -1)
1313		TAILQ_INSERT_TAIL(&thread->kse->k_schedq->sq_waitq, thread,
1314		    pqe);
1315	else {
1316		td = TAILQ_FIRST(&thread->kse->k_schedq->sq_waitq);
1317		while ((td != NULL) && (td->wakeup_time.tv_sec != -1) &&
1318		    ((td->wakeup_time.tv_sec < thread->wakeup_time.tv_sec) ||
1319		    ((td->wakeup_time.tv_sec == thread->wakeup_time.tv_sec) &&
1320		    (td->wakeup_time.tv_nsec <= thread->wakeup_time.tv_nsec))))
1321			td = TAILQ_NEXT(td, pqe);
1322		if (td == NULL)
1323			TAILQ_INSERT_TAIL(&thread->kse->k_schedq->sq_waitq,
1324			    thread, pqe);
1325		else
1326			TAILQ_INSERT_BEFORE(td, thread, pqe);
1327	}
1328	thread->flags |= THR_FLAGS_IN_WAITQ;
1329}
1330
1331/*
1332 * This must be called with the scheduling lock held.
1333 */
1334static void
1335kse_check_completed(struct kse *kse)
1336{
1337	struct pthread *thread;
1338	struct kse_thr_mailbox *completed;
1339	int sig;
1340
1341	if ((completed = kse->k_mbx.km_completed) != NULL) {
1342		kse->k_mbx.km_completed = NULL;
1343		while (completed != NULL) {
1344			thread = completed->tm_udata;
1345			DBG_MSG("Found completed thread %p, name %s\n",
1346			    thread,
1347			    (thread->name == NULL) ? "none" : thread->name);
1348			thread->blocked = 0;
1349			if (thread != kse->k_curthread) {
1350				if ((thread->flags & THR_FLAGS_SUSPENDED) != 0)
1351					THR_SET_STATE(thread, PS_SUSPENDED);
1352				else
1353					KSE_RUNQ_INSERT_TAIL(kse, thread);
1354				if ((thread->kse != kse) &&
1355				    (thread->kse->k_curthread == thread)) {
1356					thread->kse->k_curthread = NULL;
1357					thread->active = 0;
1358				}
1359			}
1360			if ((sig = thread->tmbx.tm_syncsig.si_signo) != 0) {
1361				if (SIGISMEMBER(thread->sigmask, sig))
1362					SIGADDSET(thread->sigpend, sig);
1363				else
1364					_thr_sig_add(thread, sig, &thread->tmbx.tm_syncsig);
1365				thread->tmbx.tm_syncsig.si_signo = 0;
1366			}
1367			completed = completed->tm_next;
1368		}
1369	}
1370}
1371
1372/*
1373 * This must be called with the scheduling lock held.
1374 */
1375static void
1376kse_check_waitq(struct kse *kse)
1377{
1378	struct pthread	*pthread;
1379	struct timespec ts;
1380
1381	KSE_GET_TOD(kse, &ts);
1382
1383	/*
1384	 * Wake up threads that have timedout.  This has to be
1385	 * done before adding the current thread to the run queue
1386	 * so that a CPU intensive thread doesn't get preference
1387	 * over waiting threads.
1388	 */
1389	while (((pthread = KSE_WAITQ_FIRST(kse)) != NULL) &&
1390	    thr_timedout(pthread, &ts)) {
1391		/* Remove the thread from the wait queue: */
1392		KSE_WAITQ_REMOVE(kse, pthread);
1393		DBG_MSG("Found timedout thread %p in waitq\n", pthread);
1394
1395		/* Indicate the thread timedout: */
1396		pthread->timeout = 1;
1397
1398		/* Add the thread to the priority queue: */
1399		if ((pthread->flags & THR_FLAGS_SUSPENDED) != 0)
1400			THR_SET_STATE(pthread, PS_SUSPENDED);
1401		else {
1402			THR_SET_STATE(pthread, PS_RUNNING);
1403			KSE_RUNQ_INSERT_TAIL(kse, pthread);
1404		}
1405	}
1406}
1407
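/*
 * Return non-zero if the thread's wakeup time has passed relative
 * to curtime.  A negative tv_sec means the thread never times out.
 */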
1408static int
1409thr_timedout(struct pthread *thread, struct timespec *curtime)
1410{
1411	if (thread->wakeup_time.tv_sec < 0)
1412		return (0);
1413	else if (thread->wakeup_time.tv_sec > curtime->tv_sec)
1414		return (0);
1415	else if ((thread->wakeup_time.tv_sec == curtime->tv_sec) &&
1416	    (thread->wakeup_time.tv_nsec > curtime->tv_nsec))
1417		return (0);
1418	else
1419		return (1);
1420}
1421
1422/*
1423 * This must be called with the scheduling lock held.
1424 *
1425 * Each thread has a time slice, a wakeup time (used when it wants
1426 * to wait for a specified amount of time), a run state, and an
1427 * active flag.
1428 *
1429 * When a thread gets run by the scheduler, the active flag is
1430 * set to non-zero (1).  When a thread performs an explicit yield
1431 * or schedules a state change, it enters the scheduler and the
1432 * active flag is cleared.  When the active flag is still seen
1433 * set in the scheduler, that means that the thread is blocked in
1434 * the kernel (because it is cleared before entering the scheduler
1435 * in all other instances).
1436 *
1437 * The wakeup time is only set for those states that can timeout.
1438 * It is set to (-1, -1) for all other instances.
1439 *
1440 * The thread's run state, aside from being useful when debugging,
1441 * is used to place the thread in an appropriate queue.  There
1442 * are 2 basic queues:
1443 *
1444 *   o run queue - queue ordered by priority for all threads
1445 *                 that are runnable
1446 *   o waiting queue - queue sorted by wakeup time for all threads
1447 *                     that are not otherwise runnable (not blocked
1448 *                     in kernel, not waiting for locks)
1449 *
1450 * The thread's time slice is used for round-robin scheduling
1451 * (the default scheduling policy).  While a SCHED_RR thread
1452 * is runnable it's time slice accumulates.  When it reaches
 * is runnable, its time slice accumulates.  When it reaches
 * the time slice interval, it gets reset and the thread is added
 * to the end of the queue of threads at its priority.  When a
 * thread is no longer runnable (blocks in kernel, waits, etc), its
1457 *
1458 * The job of kse_switchout_thread() is to handle all of the above.
1459 */
1460static void
1461kse_switchout_thread(struct kse *kse, struct pthread *thread)
1462{
1463	int level;
1464	int i;
1465	int restart;
1466	siginfo_t siginfo;
1467
1468	/*
1469	 * Place the currently running thread into the
1470	 * appropriate queue(s).
1471	 */
1472	DBG_MSG("Switching out thread %p, state %d\n", thread, thread->state);
1473
1474	THR_DEACTIVATE_LAST_LOCK(thread);
1475	if (thread->blocked != 0) {
1476		thread->active = 0;
1477		thread->need_switchout = 0;
1478		/* This thread must have blocked in the kernel. */
1479		/* thread->slice_usec = -1;*/	/* restart timeslice */
1480		if ((thread->slice_usec != -1) &&
1481		    (thread->attr.sched_policy != SCHED_FIFO))
1482			thread->slice_usec += (thread->tmbx.tm_uticks
1483			    + thread->tmbx.tm_sticks) * _clock_res_usec;
1484		/*
1485		 *  Check for pending signals for this thread to
1486		 *  see if we need to interrupt it in the kernel.
1487		 */
1488		if (thread->check_pending != 0) {
1489			for (i = 1; i <= _SIG_MAXSIG; ++i) {
1490				if (SIGISMEMBER(thread->sigpend, i) &&
1491				    !SIGISMEMBER(thread->sigmask, i)) {
					restart = _thread_sigact[i - 1].sa_flags & SA_RESTART;
1493					kse_thr_interrupt(&thread->tmbx,
1494					    restart ? -2 : -1);
1495					break;
1496				}
1497			}
1498		}
1499	}
1500	else {
1501		switch (thread->state) {
1502		case PS_DEAD:
1503			/*
1504			 * The scheduler is operating on a different
1505			 * stack.  It is safe to do garbage collecting
1506			 * here.
1507			 */
1508			thread->active = 0;
1509			thread->need_switchout = 0;
1510			thr_cleanup(kse, thread);
1511			return;
1512			break;
1513
1514		case PS_RUNNING:
1515			if ((thread->flags & THR_FLAGS_SUSPENDED) != 0)
1516				THR_SET_STATE(thread, PS_SUSPENDED);
1517			break;
1518
1519		case PS_COND_WAIT:
1520		case PS_SLEEP_WAIT:
1521			/* Insert into the waiting queue: */
1522			KSE_WAITQ_INSERT(kse, thread);
1523			break;
1524
1525		case PS_LOCKWAIT:
1526			/*
1527			 * This state doesn't timeout.
1528			 */
1529			thread->wakeup_time.tv_sec = -1;
1530			thread->wakeup_time.tv_nsec = -1;
1531			level = thread->locklevel - 1;
1532			if (!_LCK_GRANTED(&thread->lockusers[level]))
1533				KSE_WAITQ_INSERT(kse, thread);
1534			else
1535				THR_SET_STATE(thread, PS_RUNNING);
1536			break;
1537
1538		case PS_SIGWAIT:
1539			KSE_WAITQ_INSERT(kse, thread);
1540			break;
1541		case PS_JOIN:
1542		case PS_MUTEX_WAIT:
1543		case PS_SIGSUSPEND:
1544		case PS_SUSPENDED:
1545		case PS_DEADLOCK:
1546		default:
1547			/*
1548			 * These states don't timeout.
1549			 */
1550			thread->wakeup_time.tv_sec = -1;
1551			thread->wakeup_time.tv_nsec = -1;
1552
1553			/* Insert into the waiting queue: */
1554			KSE_WAITQ_INSERT(kse, thread);
1555			break;
1556		}
1557		if (thread->state != PS_RUNNING) {
1558			/* Restart the time slice: */
1559			thread->slice_usec = -1;
1560		} else {
1561			if (thread->need_switchout != 0)
1562				/*
1563				 * The thread yielded on its own;
1564				 * restart the timeslice.
1565				 */
1566				thread->slice_usec = -1;
1567			else if ((thread->slice_usec != -1) &&
1568	   		    (thread->attr.sched_policy != SCHED_FIFO)) {
1569				thread->slice_usec += (thread->tmbx.tm_uticks
1570				    + thread->tmbx.tm_sticks) * _clock_res_usec;
1571				/* Check for time quantum exceeded: */
1572				if (thread->slice_usec > TIMESLICE_USEC)
1573					thread->slice_usec = -1;
1574			}
1575			if (thread->slice_usec == -1) {
1576				/*
1577				 * The thread exceeded its time quantum or
1578				 * it yielded the CPU; place it at the tail
1579				 * of the queue for its priority.
1580				 */
1581				KSE_RUNQ_INSERT_TAIL(kse, thread);
1582			} else {
1583				/*
				 * The thread hasn't exceeded its interval.
				 * Place it at the head of the queue for its
1586				 * priority.
1587				 */
1588				KSE_RUNQ_INSERT_HEAD(kse, thread);
1589			}
1590		}
1591	}
1592	thread->active = 0;
1593	thread->need_switchout = 0;
1594	if (thread->check_pending != 0) {
1595		/* Install pending signals into the frame. */
1596		thread->check_pending = 0;
1597		KSE_LOCK_ACQUIRE(kse, &_thread_signal_lock);
1598		for (i = 1; i <= _SIG_MAXSIG; i++) {
1599			if (SIGISMEMBER(thread->sigmask, i))
1600				continue;
1601			if (SIGISMEMBER(thread->sigpend, i))
1602				_thr_sig_add(thread, i, &thread->siginfo[i-1]);
1603			else if (SIGISMEMBER(_thr_proc_sigpending, i) &&
1604				_thr_getprocsig_unlocked(i, &siginfo)) {
1605				_thr_sig_add(thread, i, &siginfo);
1606			}
1607		}
1608		KSE_LOCK_RELEASE(kse, &_thread_signal_lock);
1609	}
1610}
1611
1612/*
1613 * This function waits for the smallest timeout value of any waiting
1614 * thread, or until it receives a message from another KSE.
1615 *
1616 * This must be called with the scheduling lock held.
1617 */
1618static void
1619kse_wait(struct kse *kse, struct pthread *td_wait)
1620{
1621	struct timespec ts, ts_sleep;
1622	int saved_flags;
1623
1624	KSE_GET_TOD(kse, &ts);
1625
1626	if ((td_wait == NULL) || (td_wait->wakeup_time.tv_sec < 0)) {
1627		/* Limit sleep to no more than 1 minute. */
1628		ts_sleep.tv_sec = 60;
1629		ts_sleep.tv_nsec = 0;
1630	} else {
1631		TIMESPEC_SUB(&ts_sleep, &td_wait->wakeup_time, &ts);
1632		if (ts_sleep.tv_sec > 60) {
1633			ts_sleep.tv_sec = 60;
1634			ts_sleep.tv_nsec = 0;
1635		}
1636	}
1637	/* Don't sleep for negative times. */
1638	if ((ts_sleep.tv_sec >= 0) && (ts_sleep.tv_nsec >= 0)) {
1639		KSE_SET_IDLE(kse);
1640		kse->k_kseg->kg_idle_kses++;
1641		KSE_SCHED_UNLOCK(kse, kse->k_kseg);
1642		saved_flags = kse->k_mbx.km_flags;
1643		kse->k_mbx.km_flags |= KMF_NOUPCALL;
1644		kse_release(&ts_sleep);
1645		kse->k_mbx.km_flags = saved_flags;
1646		KSE_SCHED_LOCK(kse, kse->k_kseg);
1647		if (KSE_IS_IDLE(kse)) {
1648			KSE_CLEAR_IDLE(kse);
1649			kse->k_kseg->kg_idle_kses--;
1650		}
1651	}
1652}
1653
1654/*
 * This function is deliberately not named kse_exit(), so as not to
 * confuse it with the system call of the same name.
1657 */
1658static void
1659kse_fini(struct kse *kse)
1660{
1661	/* struct kse_group *free_kseg = NULL; */
1662	struct timespec ts;
1663
1664	/*
1665	 * Check to see if this is one of the main kses.
1666	 */
1667	if (kse->k_kseg != _kse_initial->k_kseg) {
1668		PANIC("shouldn't get here");
1669		/* This is for supporting thread groups. */
1670#ifdef NOT_YET
1671		/* Remove this KSE from the KSEG's list of KSEs. */
1672		KSE_SCHED_LOCK(kse, kse->k_kseg);
1673		TAILQ_REMOVE(&kse->k_kseg->kg_kseq, kse, k_kgqe);
1674		kse->k_kseg->kg_ksecount--;
1675		if (TAILQ_EMPTY(&kse->k_kseg->kg_kseq))
1676			free_kseg = kse->k_kseg;
1677		KSE_SCHED_UNLOCK(kse, kse->k_kseg);
1678
1679		/*
1680		 * Add this KSE to the list of free KSEs along with
1681		 * the KSEG if is now orphaned.
		 * the KSEG if it is now orphaned.
1683		KSE_LOCK_ACQUIRE(kse, &kse_lock);
1684		if (free_kseg != NULL)
1685			kseg_free_unlocked(free_kseg);
1686		kse_free_unlocked(kse);
1687		KSE_LOCK_RELEASE(kse, &kse_lock);
1688		kse_exit();
1689		/* Never returns. */
1690		PANIC("kse_exit()");
1691#endif
1692	} else {
1693#ifdef NOT_YET
1694		/*
		 * In the future, we might allow a program to kill
		 * KSEs in the initial group.
1697		 */
1698		if (kse != _kse_initial) {
1699			KSE_SCHED_LOCK(kse, kse->k_kseg);
1700			TAILQ_REMOVE(&kse->k_kseg->kg_kseq, kse, k_kgqe);
1701			kse->k_kseg->kg_ksecount--;
1702			KSE_SCHED_UNLOCK(kse, kse->k_kseg);
1703			KSE_LOCK_ACQUIRE(kse, &kse_lock);
1704			kse_free_unlocked(kse);
1705			KSE_LOCK_RELEASE(kse, &kse_lock);
1706			kse_exit();
			/* Never returns. */
			PANIC("kse_exit() failed for initial kseg");
		}
1710#endif
1711		KSE_SCHED_LOCK(kse, kse->k_kseg);
1712		KSE_SET_IDLE(kse);
1713		kse->k_kseg->kg_idle_kses++;
1714		KSE_SCHED_UNLOCK(kse, kse->k_kseg);
1715		ts.tv_sec = 120;
1716		ts.tv_nsec = 0;
1717		kse->k_mbx.km_flags = 0;
1718		kse_release(&ts);
		/* Never reached. */
1720	}
1721}
1722
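/*
 * Compute the running thread's absolute wakeup time from a relative
 * timeout.  A NULL timeout means wait forever; a zero timeout means
 * don't wait at all.
 */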
1723void
1724_thr_set_timeout(const struct timespec *timeout)
1725{
1726	struct pthread	*curthread = _get_curthread();
1727	struct timespec ts;
1728
1729	/* Reset the timeout flag for the running thread: */
1730	curthread->timeout = 0;
1731
1732	/* Check if the thread is to wait forever: */
1733	if (timeout == NULL) {
1734		/*
1735		 * Set the wakeup time to something that can be recognised as
1736		 * different to an actual time of day:
1737		 */
1738		curthread->wakeup_time.tv_sec = -1;
1739		curthread->wakeup_time.tv_nsec = -1;
1740	}
1741	/* Check if no waiting is required: */
1742	else if ((timeout->tv_sec == 0) && (timeout->tv_nsec == 0)) {
1743		/* Set the wake up time to 'immediately': */
1744		curthread->wakeup_time.tv_sec = 0;
1745		curthread->wakeup_time.tv_nsec = 0;
1746	} else {
1747		/* Calculate the time for the current thread to wakeup: */
1748		KSE_GET_TOD(curthread->kse, &ts);
1749		TIMESPEC_ADD(&curthread->wakeup_time, &ts, timeout);
1750	}
1751}
1752
1753void
1754_thr_panic_exit(char *file, int line, char *msg)
1755{
1756	char buf[256];
1757
1758	snprintf(buf, sizeof(buf), "(%s:%d) %s\n", file, line, msg);
1759	__sys_write(2, buf, strlen(buf));
1760	abort();
1761}
1762
1763void
1764_thr_setrunnable(struct pthread *curthread, struct pthread *thread)
1765{
1766	kse_critical_t crit;
1767
1768	crit = _kse_critical_enter();
1769	KSE_SCHED_LOCK(curthread->kse, thread->kseg);
1770	_thr_setrunnable_unlocked(thread);
1771	KSE_SCHED_UNLOCK(curthread->kse, thread->kseg);
1772	_kse_critical_leave(crit);
1773}
1774
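/*
 * Make a thread runnable.  The caller must already hold the
 * scheduling lock of the thread's KSE group; _thr_setrunnable()
 * is the locking wrapper around this function.
 */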
1775void
1776_thr_setrunnable_unlocked(struct pthread *thread)
1777{
1778	if ((thread->kseg->kg_flags & KGF_SINGLE_THREAD) != 0) {
1779		/* No silly queues for these threads. */
1780		if ((thread->flags & THR_FLAGS_SUSPENDED) != 0)
1781			THR_SET_STATE(thread, PS_SUSPENDED);
1782		else
1783			THR_SET_STATE(thread, PS_RUNNING);
1784	} else if (thread->state != PS_RUNNING) {
1785		if ((thread->flags & THR_FLAGS_IN_WAITQ) != 0)
1786			KSE_WAITQ_REMOVE(thread->kse, thread);
1787		if ((thread->flags & THR_FLAGS_SUSPENDED) != 0)
1788			THR_SET_STATE(thread, PS_SUSPENDED);
1789		else {
1790			THR_SET_STATE(thread, PS_RUNNING);
1791			if ((thread->blocked == 0) && (thread->active == 0) &&
1792			    (thread->flags & THR_FLAGS_IN_RUNQ) == 0)
1793				THR_RUNQ_INSERT_TAIL(thread);
1794		}
1795	}
	/*
	 * XXX - Threads are not yet assigned to specific KSEs; they are
	 *       assigned to the KSEG.  So the fact that a thread's KSE is
	 *       waiting doesn't necessarily mean that it will be the KSE
	 *       that runs the thread after the lock is granted.  But we
	 *       don't know if the other KSEs within the same KSEG are
	 *       also in a waiting state or not, so we err on the side of
	 *       caution and wake up the thread's last known KSE.  We
	 *       ensure that the thread's KSE doesn't change while its
	 *       scheduling lock is held, so it is safe to reference it
	 *       (the KSE).  If the KSE wakes up and doesn't find any more
	 *       work, it will go back to waiting again, so no harm is done.
	 */
1809	kse_wakeup_one(thread);
1810}
1811
1812static void
1813kse_wakeup_one(struct pthread *thread)
1814{
1815	struct kse *ke;
1816
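	/*
	 * Prefer the thread's last known KSE if it is idle; otherwise
	 * wake any idle KSE in the same KSE group.
	 */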
1817	if (KSE_IS_IDLE(thread->kse)) {
1818		KSE_CLEAR_IDLE(thread->kse);
1819		thread->kseg->kg_idle_kses--;
1820		KSE_WAKEUP(thread->kse);
1821	} else {
1822		TAILQ_FOREACH(ke, &thread->kseg->kg_kseq, k_kgqe) {
1823			if (KSE_IS_IDLE(ke)) {
1824				KSE_CLEAR_IDLE(ke);
1825				ke->k_kseg->kg_idle_kses--;
1826				KSE_WAKEUP(ke);
1827				return;
1828			}
1829		}
1830	}
1831}
1832
1833static void
1834kse_wakeup_multi(struct kse *curkse)
1835{
1836	struct kse *ke;
1837	int tmp;
1838
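
	/*
	 * If there are runnable threads and idle KSEs in this group,
	 * wake one idle KSE for each runnable thread, up to the number
	 * of idle KSEs available.
	 */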
1839	if ((tmp = KSE_RUNQ_THREADS(curkse)) && curkse->k_kseg->kg_idle_kses) {
1840		TAILQ_FOREACH(ke, &curkse->k_kseg->kg_kseq, k_kgqe) {
1841			if (KSE_IS_IDLE(ke)) {
1842				KSE_CLEAR_IDLE(ke);
1843				ke->k_kseg->kg_idle_kses--;
1844				KSE_WAKEUP(ke);
1845				if (--tmp == 0)
1846					break;
1847			}
1848		}
1849	}
1850}
1851
1852struct pthread *
1853_get_curthread(void)
1854{
1855	return (_ksd_curthread());
1856}
1857
1858/* This assumes the caller has disabled upcalls. */
1859struct kse *
1860_get_curkse(void)
1861{
1862	return (_ksd_curkse());
1863}
1864
1865void
1866_set_curkse(struct kse *kse)
1867{
1868	_ksd_setprivate(&kse->k_ksd);
1869}
1870
1871/*
1872 * Allocate a new KSEG.
1873 *
1874 * We allow the current thread to be NULL in the case that this
1875 * is the first time a KSEG is being created (library initialization).
1876 * In this case, we don't need to (and can't) take any locks.
1877 */
1878struct kse_group *
1879_kseg_alloc(struct pthread *curthread)
1880{
1881	struct kse_group *kseg = NULL;
1882	kse_critical_t crit;
1883
1884	if ((curthread != NULL) && (free_kseg_count > 0)) {
1885		/* Use the kse lock for the kseg queue. */
1886		crit = _kse_critical_enter();
1887		KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
1888		if ((kseg = TAILQ_FIRST(&free_kse_groupq)) != NULL) {
1889			TAILQ_REMOVE(&free_kse_groupq, kseg, kg_qe);
1890			free_kseg_count--;
1891			active_kseg_count++;
1892			TAILQ_INSERT_TAIL(&active_kse_groupq, kseg, kg_qe);
1893		}
1894		KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
1895		_kse_critical_leave(crit);
1896		if (kseg)
1897			kseg_reinit(kseg);
1898	}
1899
1900	/*
1901	 * Attempt to allocate a new KSE group if one wasn't found in
1902	 * the free list (or the free list couldn't be checked because
1903	 * there is no current thread).
1904	 */
1905	if ((kseg == NULL) &&
1906	    ((kseg = (struct kse_group *)malloc(sizeof(*kseg))) != NULL)) {
1907		if (_pq_alloc(&kseg->kg_schedq.sq_runq,
1908		    THR_MIN_PRIORITY, THR_LAST_PRIORITY) != 0) {
1909			free(kseg);
1910			kseg = NULL;
1911		} else {
1912			kseg_init(kseg);
1913			/* Add the KSEG to the list of active KSEGs. */
1914			if (curthread != NULL) {
1915				crit = _kse_critical_enter();
1916				KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
1917				active_kseg_count++;
1918				TAILQ_INSERT_TAIL(&active_kse_groupq,
1919				    kseg, kg_qe);
1920				KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
1921				_kse_critical_leave(crit);
1922			} else {
1923				active_kseg_count++;
1924				TAILQ_INSERT_TAIL(&active_kse_groupq,
1925				    kseg, kg_qe);
1926			}
1927		}
1928	}
1929	return (kseg);
1930}
1931
1932/*
1933 * This must be called with the kse lock held and when there are
1934 * no more threads that reference it.
1935 */
1936static void
1937kseg_free_unlocked(struct kse_group *kseg)
1938{
1939	TAILQ_REMOVE(&active_kse_groupq, kseg, kg_qe);
1940	TAILQ_INSERT_HEAD(&free_kse_groupq, kseg, kg_qe);
1941	free_kseg_count++;
1942	active_kseg_count--;
1943}
1944
1945void
1946_kseg_free(struct kse_group *kseg)
1947{
1948	struct kse *curkse;
1949	kse_critical_t crit;
1950
1951	crit = _kse_critical_enter();
1952	curkse = _get_curkse();
1953	KSE_LOCK_ACQUIRE(curkse, &kse_lock);
1954	kseg_free_unlocked(kseg);
1955	KSE_LOCK_RELEASE(curkse, &kse_lock);
1956	_kse_critical_leave(crit);
1957}
1958
1959/*
1960 * Allocate a new KSE.
1961 *
1962 * We allow the current thread to be NULL in the case that this
1963 * is the first time a KSE is being created (library initialization).
1964 * In this case, we don't need to (and can't) take any locks.
1965 */
1966struct kse *
1967_kse_alloc(struct pthread *curthread)
1968{
1969	struct kse *kse = NULL;
1970	kse_critical_t crit;
1971	int need_ksd = 0;
1972	int i;
1973
1974	if ((curthread != NULL) && (free_kse_count > 0)) {
1975		crit = _kse_critical_enter();
1976		KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
1977		/* Search for a finished KSE. */
1978		kse = TAILQ_FIRST(&free_kseq);
1979		while ((kse != NULL) &&
1980		    ((kse->k_mbx.km_flags & KMF_DONE) == 0)) {
1981			kse = TAILQ_NEXT(kse, k_qe);
1982		}
1983		if (kse != NULL) {
1984			DBG_MSG("found an unused kse.\n");
1985			TAILQ_REMOVE(&free_kseq, kse, k_qe);
1986			free_kse_count--;
1987			TAILQ_INSERT_TAIL(&active_kseq, kse, k_qe);
1988			active_kse_count++;
1989		}
1990		KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
1991		_kse_critical_leave(crit);
1992		if (kse != NULL)
1993			kse_reinit(kse);
1994	}
1995	if ((kse == NULL) &&
1996	    ((kse = (struct kse *)malloc(sizeof(*kse))) != NULL)) {
1997		bzero(kse, sizeof(*kse));
1998
1999		/* Initialize the lockusers. */
2000		for (i = 0; i < MAX_KSE_LOCKLEVEL; i++) {
2001			_lockuser_init(&kse->k_lockusers[i], (void *)kse);
2002			_LCK_SET_PRIVATE2(&kse->k_lockusers[i], NULL);
2003		}
2004		/* _lock_init(kse->k_lock, ...) */
2005
2006		/* We had to malloc a kse; mark it as needing a new KSD. */
2007		need_ksd = 1;
2008
2009		/*
2010		 * Create the KSE context.
2011		 *
2012		 * XXX - For now this is done here in the allocation.
2013		 *       In the future, we may want to have it done
2014		 *       outside the allocation so that scope system
2015		 *       threads (one thread per KSE) are not required
2016		 *       to have a stack for an unneeded kse upcall.
2017		 */
2018		kse->k_mbx.km_func = (kse_func_t *)kse_sched_multi;
2019		kse->k_mbx.km_stack.ss_sp = (char *)malloc(KSE_STACKSIZE);
2020		kse->k_mbx.km_stack.ss_size = KSE_STACKSIZE;
2021		kse->k_mbx.km_udata = (void *)kse;
2022		kse->k_mbx.km_quantum = 20000;
2023		/*
2024		 * We need to keep a copy of the stack in case it
2025		 * doesn't get used; a KSE running a scope system
2026		 * thread will use that thread's stack.
2027		 */
2028		kse->k_stack.ss_sp = kse->k_mbx.km_stack.ss_sp;
2029		kse->k_stack.ss_size = kse->k_mbx.km_stack.ss_size;
2030		if (kse->k_mbx.km_stack.ss_sp == NULL) {
2031			for (i = 0; i < MAX_KSE_LOCKLEVEL; i++) {
2032				_lockuser_destroy(&kse->k_lockusers[i]);
2033			}
2034			/* _lock_destroy(&kse->k_lock); */
2035			free(kse);
2036			kse = NULL;
2037		}
2038	}
2039	if ((kse != NULL) && (need_ksd != 0)) {
2040		/* This KSE needs initialization. */
2041		if (curthread != NULL) {
2042			crit = _kse_critical_enter();
2043			KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
2044		}
2045		/* Initialize KSD inside of the lock. */
2046		if (_ksd_create(&kse->k_ksd, (void *)kse, sizeof(*kse)) != 0) {
2047			if (curthread != NULL) {
2048				KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
2049				_kse_critical_leave(crit);
2050			}
2051			free(kse->k_mbx.km_stack.ss_sp);
2052			for (i = 0; i < MAX_KSE_LOCKLEVEL; i++) {
2053				_lockuser_destroy(&kse->k_lockusers[i]);
2054			}
2055			free(kse);
2056			return (NULL);
2057		}
2058		kse->k_flags = 0;
2059		TAILQ_INSERT_TAIL(&active_kseq, kse, k_qe);
2060		active_kse_count++;
2061		if (curthread != NULL) {
2062			KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
2063			_kse_critical_leave(crit);
2064		}
2065	}
2066	return (kse);
2067}
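
/*
 * Illustrative sketch (not compiled): how the allocators above are meant
 * to be called.  Passing NULL for the current thread is only valid during
 * library initialization, when no locks can be taken.  The helper name
 * and the cleanup path are assumptions for illustration only.
 */
#if 0
static int
example_alloc_kse_pair(struct pthread *curthread)
{
	struct kse_group *kseg;
	struct kse *kse;

	/* curthread == NULL => no locking, free lists are not consulted. */
	if ((kseg = _kseg_alloc(curthread)) == NULL)
		return (-1);
	if ((kse = _kse_alloc(curthread)) == NULL) {
		/* Assumed cleanup; a real caller may handle this differently. */
		_kseg_free(kseg);
		return (-1);
	}
	/* The caller would then link the KSE into the KSE group. */
	return (0);
}
#endif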
2068
2069static void
2070kse_reinit(struct kse *kse)
2071{
2072	/*
2073	 * XXX - For now every kse has its own stack.
2074	 *       In the future, we may want the stack allocated
2075	 *       outside of the KSE allocation so that scope system
2076	 *       threads (one thread per KSE) are not required
2077	 *       to have a stack for an unneeded kse upcall.
2078	 */
2079	kse->k_mbx.km_flags = 0;
2080	kse->k_curthread = NULL;
2081	kse->k_kseg = NULL;
2082	kse->k_schedq = NULL;
2083	kse->k_locklevel = 0;
2084	SIGEMPTYSET(kse->k_sigmask);
2085	bzero(&kse->k_sigq, sizeof(kse->k_sigq));
2086	kse->k_check_sigq = 0;
2087	kse->k_flags = 0;
2088	kse->k_waiting = 0;
2089	kse->k_idle = 0;
2090	kse->k_error = 0;
2091	kse->k_cpu = 0;
2092	kse->k_done = 0;
2093	kse->k_switch = 0;
2094}
2095
2096void
2097kse_free_unlocked(struct kse *kse)
2098{
2099	TAILQ_REMOVE(&active_kseq, kse, k_qe);
2100	active_kse_count--;
2101	kse->k_kseg = NULL;
2102	kse->k_mbx.km_quantum = 20000;
2103	kse->k_flags = 0;
2104	TAILQ_INSERT_HEAD(&free_kseq, kse, k_qe);
2105	free_kse_count++;
2106}
2107
2108void
2109_kse_free(struct pthread *curthread, struct kse *kse)
2110{
2111	kse_critical_t crit;
2112
2113	if (curthread == NULL)
2114		kse_free_unlocked(kse);
2115	else {
2116		crit = _kse_critical_enter();
2117		KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
2118		kse_free_unlocked(kse);
2119		KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
2120		_kse_critical_leave(crit);
2121	}
2122}
2123
2124static void
2125kseg_init(struct kse_group *kseg)
2126{
2127	kseg_reinit(kseg);
2128	_lock_init(&kseg->kg_lock, LCK_ADAPTIVE, _kse_lock_wait,
2129	    _kse_lock_wakeup);
2130}
2131
2132static void
2133kseg_reinit(struct kse_group *kseg)
2134{
2135	TAILQ_INIT(&kseg->kg_kseq);
2136	TAILQ_INIT(&kseg->kg_threadq);
2137	TAILQ_INIT(&kseg->kg_schedq.sq_waitq);
2138	kseg->kg_threadcount = 0;
2139	kseg->kg_ksecount = 0;
2140	kseg->kg_idle_kses = 0;
2141	kseg->kg_flags = 0;
2142}
2143
2144struct pthread *
2145_thr_alloc(struct pthread *curthread)
2146{
2147	kse_critical_t crit;
2148	void *p;
2149	struct pthread *thread = NULL;
2150
2151	if (curthread != NULL) {
2152		if (GC_NEEDED())
2153			_thr_gc(curthread);
2154		if (free_thread_count > 0) {
2155			crit = _kse_critical_enter();
2156			KSE_LOCK_ACQUIRE(curthread->kse, &thread_lock);
2157			if ((thread = TAILQ_FIRST(&free_threadq)) != NULL) {
2158				TAILQ_REMOVE(&free_threadq, thread, tle);
2159				free_thread_count--;
2160			}
2161			KSE_LOCK_RELEASE(curthread->kse, &thread_lock);
2162			_kse_critical_leave(crit);
2163		}
2164	}
2165	if (thread == NULL) {
2166		p = malloc(sizeof(struct pthread) + THR_ALIGNBYTES);
2167		if (p != NULL) {
2168			thread = (struct pthread *)THR_ALIGN(p);
2169			thread->alloc_addr = p;
2170		}
2171	}
2172	return (thread);
2173}
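
/*
 * Illustrative sketch (not compiled) of the over-allocate-and-align idiom
 * used by _thr_alloc() above: allocate THR_ALIGNBYTES extra bytes, round
 * the pointer up to the alignment boundary, and remember the original
 * pointer so it can be passed to free() later.  The 64-byte alignment and
 * the helper name below are assumptions; the library's THR_ALIGN() and
 * THR_ALIGNBYTES macros serve the same purpose.
 */
#if 0
#define	EX_ALIGN	64
#define	EX_ALIGNBYTES	(EX_ALIGN - 1)

static struct pthread *
example_aligned_alloc(void)
{
	struct pthread *thread;
	void *p;

	p = malloc(sizeof(struct pthread) + EX_ALIGNBYTES);
	if (p == NULL)
		return (NULL);
	/* Round up to the next EX_ALIGN-byte boundary. */
	thread = (struct pthread *)
	    (((uintptr_t)p + EX_ALIGNBYTES) & ~(uintptr_t)EX_ALIGNBYTES);
	/* free() must later be given the original pointer, not the
	   aligned one. */
	thread->alloc_addr = p;
	return (thread);
}
#endif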
2174
2175void
2176_thr_free(struct pthread *curthread, struct pthread *thread)
2177{
2178	kse_critical_t crit;
2179	int i;
2180
2181	DBG_MSG("Freeing thread %p\n", thread);
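	/*
	 * Tear the thread down and free its memory if there is no current
	 * thread to take the lock, or if the free-thread cache is already
	 * at its high-water mark; otherwise return it to the cache.
	 */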
2182	if ((curthread == NULL) || (free_thread_count >= MAX_CACHED_THREADS)) {
2183		for (i = 0; i < MAX_THR_LOCKLEVEL; i++) {
2184			_lockuser_destroy(&thread->lockusers[i]);
2185		}
2186		_lock_destroy(&thread->lock);
2187		free(thread->alloc_addr);
2188	}
2189	else {
2190		crit = _kse_critical_enter();
2191		KSE_LOCK_ACQUIRE(curthread->kse, &thread_lock);
2192		TAILQ_INSERT_TAIL(&free_threadq, thread, tle);
2193		free_thread_count++;
2194		KSE_LOCK_RELEASE(curthread->kse, &thread_lock);
2195		_kse_critical_leave(crit);
2196	}
2197}
2198
2199/*
2200 * Add an active thread:
2201 *
2202 *   o Assign the thread a unique id (which GDB uses to track
2203 *     threads).
2204 *   o Add the thread to the list of all threads and increment
2205 *     number of active threads.
2206 */
2207static void
2208thr_link(struct pthread *thread)
2209{
2210	kse_critical_t crit;
2211	struct kse *curkse;
2212	struct pthread *curthread;
2213
2214	crit = _kse_critical_enter();
2215	curkse = _get_curkse();
2216	curthread = _get_curthread();
2217	thread->sigmask = curthread->sigmask;
2218	KSE_LOCK_ACQUIRE(curkse, &_thread_list_lock);
2219	/*
2220	 * Initialize the unique id (which GDB uses to track
2221	 * threads), add the thread to the list of all threads,
2222	 * and increment the number of active threads:
2223	 */
2224	thread->uniqueid = next_uniqueid++;
2225	THR_LIST_ADD(thread);
2226	active_threads++;
2227	KSE_LOCK_RELEASE(curkse, &_thread_list_lock);
2228
2229	_kse_critical_leave(crit);
2230}
2231
2232/*
2233 * Remove an active thread.
2234 */
2235static void
2236thr_unlink(struct pthread *thread)
2237{
2238	kse_critical_t crit;
2239	struct kse *curkse;
2240
2241	crit = _kse_critical_enter();
2242	curkse = _get_curkse();
2243
2244	KSE_LOCK_ACQUIRE(curkse, &_thread_list_lock);
2245	THR_LIST_REMOVE(thread);
2246	active_threads--;
2247	KSE_LOCK_RELEASE(curkse, &_thread_list_lock);
2248
2249	_kse_critical_leave(crit);
2250}
2251