thr_kern.c revision 117066
1/*
2 * Copyright (C) 2003 Daniel M. Eischen <deischen@freebsd.org>
3 * Copyright (C) 2002 Jonathon Mini <mini@freebsd.org>
4 * Copyright (c) 1995-1998 John Birrell <jb@cimlogic.com.au>
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 * 3. All advertising materials mentioning features or use of this software
16 *    must display the following acknowledgement:
17 *	This product includes software developed by John Birrell.
18 * 4. Neither the name of the author nor the names of any co-contributors
19 *    may be used to endorse or promote products derived from this software
20 *    without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY JOHN BIRRELL AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 */
35#include <sys/cdefs.h>
36__FBSDID("$FreeBSD: head/lib/libkse/thread/thr_kern.c 117066 2003-06-30 06:16:50Z davidxu $");
37
38#include <sys/types.h>
39#include <sys/kse.h>
40#include <sys/signalvar.h>
41#include <sys/queue.h>
42#include <machine/atomic.h>
43#include <machine/sigframe.h>
44
45#include <assert.h>
46#include <errno.h>
47#include <signal.h>
48#include <stdlib.h>
49#include <string.h>
50#include <time.h>
51#include <ucontext.h>
52#include <unistd.h>
53
54#include "atomic_ops.h"
55#include "thr_private.h"
56#include "libc_private.h"
57#include "ksd.h"
58
59/*#define DEBUG_THREAD_KERN */
60#ifdef DEBUG_THREAD_KERN
61#define DBG_MSG		stdout_debug
62#else
63#define DBG_MSG(x...)
64#endif
65
66/*
67 * Define a high water mark for the maximum number of threads that
68 * will be cached.  Once this level is reached, any extra threads
69 * will be free()'d.
70 *
71 * XXX - It doesn't make sense to worry about the maximum number of
72 *       KSEs that we can cache because the system will limit us to
73 *       something *much* less than the maximum number of threads
74 *       that we can have.  Disregarding KSEs in their own group,
75 *       the maximum number of KSEs is the number of processors in
76 *       the system.
77 */
78#define	MAX_CACHED_THREADS	100
79#define	KSE_STACKSIZE		16384
80
81#define	KSE_SET_MBOX(kse, thrd) \
82	(kse)->k_mbx.km_curthread = &(thrd)->tmbx
83
84#define	KSE_SET_EXITED(kse)	(kse)->k_flags |= KF_EXITED
85
86/*
87 * Macros for manipulating the run queues.  The priority queue
88 * routines use the thread's pqe link and also handle the setting
89 * and clearing of the thread's THR_FLAGS_IN_RUNQ flag.
90 */
91#define	KSE_RUNQ_INSERT_HEAD(kse, thrd)			\
92	_pq_insert_head(&(kse)->k_schedq->sq_runq, thrd)
93#define	KSE_RUNQ_INSERT_TAIL(kse, thrd)			\
94	_pq_insert_tail(&(kse)->k_schedq->sq_runq, thrd)
95#define	KSE_RUNQ_REMOVE(kse, thrd)			\
96	_pq_remove(&(kse)->k_schedq->sq_runq, thrd)
97#define	KSE_RUNQ_FIRST(kse)	_pq_first(&(kse)->k_schedq->sq_runq)
98
99#define KSE_RUNQ_THREADS(kse)	((kse)->k_schedq->sq_runq.pq_threads)
100
101/*
102 * We've got to keep track of everything that is allocated, not only
103 * to have a speedy free list, but also so they can be deallocated
104 * after a fork().
105 */
106static TAILQ_HEAD(, kse)	active_kseq;
107static TAILQ_HEAD(, kse)	free_kseq;
108static TAILQ_HEAD(, kse_group)	free_kse_groupq;
109static TAILQ_HEAD(, kse_group)	active_kse_groupq;
110static TAILQ_HEAD(, kse_group)	gc_ksegq;
111static struct lock		kse_lock;	/* also used for kseg queue */
112static int			free_kse_count = 0;
113static int			free_kseg_count = 0;
114static TAILQ_HEAD(, pthread)	free_threadq;
115static struct lock		thread_lock;
116static int			free_thread_count = 0;
117static int			inited = 0;
118static int			active_threads = 1;
119static int			active_kse_count = 0;
120static int			active_kseg_count = 0;
121static u_int64_t		next_uniqueid = 1;
122
123
124#ifdef DEBUG_THREAD_KERN
125static void	dump_queues(struct kse *curkse);
126#endif
127static void	kse_check_completed(struct kse *kse);
128static void	kse_check_waitq(struct kse *kse);
129static void	kse_fini(struct kse *curkse);
130static void	kse_reinit(struct kse *kse);
131static void	kse_sched_multi(struct kse *curkse);
132#ifdef NOT_YET
133static void	kse_sched_single(struct kse *curkse);
134#endif
135static void	kse_switchout_thread(struct kse *kse, struct pthread *thread);
136static void	kse_wait(struct kse *kse, struct pthread *td_wait);
137static void	kse_free_unlocked(struct kse *kse);
138static void	kseg_free_unlocked(struct kse_group *kseg);
139static void	kseg_init(struct kse_group *kseg);
140static void	kseg_reinit(struct kse_group *kseg);
141static void	kse_waitq_insert(struct pthread *thread);
142static void	kse_wakeup_multi(struct kse *curkse);
143static void	kse_wakeup_one(struct pthread *thread);
144static void	thr_cleanup(struct kse *kse, struct pthread *curthread);
145static void	thr_link(struct pthread *thread);
146static void	thr_resume_wrapper(int sig, siginfo_t *, ucontext_t *);
147static void	thr_resume_check(struct pthread *curthread, ucontext_t *ucp,
148		    struct pthread_sigframe *psf);
149static int	thr_timedout(struct pthread *thread, struct timespec *curtime);
150static void	thr_unlink(struct pthread *thread);
151
152
153/*
154 * This is called after a fork().
155 * No locks need to be taken here since we are guaranteed to be
156 * single threaded.
157 *
158 * XXX
159 * POSIX says for threaded process, fork() function is used
160 * only to run new programs, and the effects of calling functions
161 * that require certain resources between the call to fork() and
162 * the call to an exec function are undefined.
163 *
164 * Here it is not safe to reinitialize the library after fork().
165 * Because memory management may be corrupted, further calling
166 * malloc()/free() may cause undefined behavior.
167 */
168void
169_kse_single_thread(struct pthread *curthread)
170{
171#ifdef NOTYET
172	struct kse *kse;
173	struct kse_group *kseg;
174	struct pthread *thread;
175	kse_critical_t crit;
176	int i;
177
178
179	/*
180	 * Disable upcalls and clear the threaded flag.
181	 * XXX - I don't think we need to disable upcalls after a fork().
182	 *       but it doesn't hurt.
183	 */
184	crit = _kse_critical_enter();
185	__isthreaded = 0;
186	active_threads = 1;
187	_thr_signal_deinit();
188
189	/*
190	 * Enter a loop to remove and free all threads other than
191	 * the running thread from the active thread list:
192	 */
193	while ((thread = TAILQ_FIRST(&_thread_list)) != NULL) {
194		THR_GCLIST_REMOVE(thread);
195		/*
196		 * Remove this thread from the list (the current
197		 * thread will be removed but re-added by libpthread
198		 * initialization.
199		 */
200		TAILQ_REMOVE(&_thread_list, thread, tle);
201		/* Make sure this isn't the running thread: */
202		if (thread != curthread) {
203			_thr_stack_free(&thread->attr);
204			if (thread->specific != NULL)
205				free(thread->specific);
206			for (i = 0; i < MAX_THR_LOCKLEVEL; i++) {
207				_lockuser_destroy(&thread->lockusers[i]);
208			}
209			_lock_destroy(&thread->lock);
210			free(thread);
211		}
212	}
213
214	TAILQ_INIT(&curthread->mutexq);		/* initialize mutex queue */
215	curthread->joiner = NULL;		/* no joining threads yet */
216	curthread->refcount = 0;
217	SIGEMPTYSET(curthread->sigpend);	/* clear pending signals */
218	if (curthread->specific != NULL) {
219		free(curthread->specific);
220		curthread->specific = NULL;
221		curthread->specific_data_count = 0;
222	}
223
224	/* Free the free KSEs: */
225	while ((kse = TAILQ_FIRST(&free_kseq)) != NULL) {
226		TAILQ_REMOVE(&free_kseq, kse, k_qe);
227		for (i = 0; i < MAX_KSE_LOCKLEVEL; i++) {
228			_lockuser_destroy(&kse->k_lockusers[i]);
229		}
230		_lock_destroy(&kse->k_lock);
231		_ksd_destroy(&kse->k_ksd);
232		if (kse->k_stack.ss_sp != NULL)
233			free(kse->k_stack.ss_sp);
234		free(kse);
235	}
236	free_kse_count = 0;
237
238	/* Free the active KSEs: */
239	while ((kse = TAILQ_FIRST(&active_kseq)) != NULL) {
240		TAILQ_REMOVE(&active_kseq, kse, k_qe);
241		for (i = 0; i < MAX_KSE_LOCKLEVEL; i++) {
242			_lockuser_destroy(&kse->k_lockusers[i]);
243		}
244		_lock_destroy(&kse->k_lock);
245		if (kse->k_stack.ss_sp != NULL)
246			free(kse->k_stack.ss_sp);
247		free(kse);
248	}
249	active_kse_count = 0;
250
251	/* Free the free KSEGs: */
252	while ((kseg = TAILQ_FIRST(&free_kse_groupq)) != NULL) {
253		TAILQ_REMOVE(&free_kse_groupq, kseg, kg_qe);
254		_lock_destroy(&kseg->kg_lock);
255		_pq_free(&kseg->kg_schedq.sq_runq);
256		free(kseg);
257	}
258	free_kseg_count = 0;
259
260	/* Free the active KSEGs: */
261	while ((kseg = TAILQ_FIRST(&active_kse_groupq)) != NULL) {
262		TAILQ_REMOVE(&active_kse_groupq, kseg, kg_qe);
263		_lock_destroy(&kseg->kg_lock);
264		_pq_free(&kseg->kg_schedq.sq_runq);
265		free(kseg);
266	}
267	active_kseg_count = 0;
268
269	/* Free the free threads. */
270	while ((thread = TAILQ_FIRST(&free_threadq)) != NULL) {
271		TAILQ_REMOVE(&free_threadq, thread, tle);
272		if (thread->specific != NULL)
273			free(thread->specific);
274		for (i = 0; i < MAX_THR_LOCKLEVEL; i++) {
275			_lockuser_destroy(&thread->lockusers[i]);
276		}
277		_lock_destroy(&thread->lock);
278		free(thread);
279	}
280	free_thread_count = 0;
281
282	/* Free the to-be-gc'd threads. */
283	while ((thread = TAILQ_FIRST(&_thread_gc_list)) != NULL) {
284		TAILQ_REMOVE(&_thread_gc_list, thread, gcle);
285		for (i = 0; i < MAX_THR_LOCKLEVEL; i++) {
286			_lockuser_destroy(&thread->lockusers[i]);
287		}
288		_lock_destroy(&thread->lock);
289		free(thread);
290	}
291	TAILQ_INIT(&gc_ksegq);
292	_gc_count = 0;
293
294	if (inited != 0) {
295		/*
296		 * Destroy these locks; they'll be recreated to assure they
297		 * are in the unlocked state.
298		 */
299		_lock_destroy(&kse_lock);
300		_lock_destroy(&thread_lock);
301		_lock_destroy(&_thread_list_lock);
302		inited = 0;
303	}
304
305	/*
306	 * After a fork(), the leftover thread goes back to being
307	 * scope process.
308	 */
309	curthread->attr.flags &= ~PTHREAD_SCOPE_SYSTEM;
310	curthread->attr.flags |= PTHREAD_SCOPE_PROCESS;
311
312	/*
313	 * After a fork, we are still operating on the thread's original
314	 * stack.  Don't clear the THR_FLAGS_USER from the thread's
315	 * attribute flags.
316	 */
317
318	/* Initialize the threads library. */
319	curthread->kse = NULL;
320	curthread->kseg = NULL;
321	_kse_initial = NULL;
322	_libpthread_init(curthread);
323#else
324	_ksd_readandclear_tmbx();
325	__isthreaded   = 0;
326	active_threads = 0;
327	_thr_signal_deinit();
328#endif
329}
330
331/*
332 * This is used to initialize housekeeping and to initialize the
333 * KSD for the KSE.
334 */
335void
336_kse_init(void)
337{
338	if (inited == 0) {
339		TAILQ_INIT(&active_kseq);
340		TAILQ_INIT(&active_kse_groupq);
341		TAILQ_INIT(&free_kseq);
342		TAILQ_INIT(&free_kse_groupq);
343		TAILQ_INIT(&free_threadq);
344		TAILQ_INIT(&gc_ksegq);
345		if (_lock_init(&kse_lock, LCK_ADAPTIVE,
346		    _kse_lock_wait, _kse_lock_wakeup) != 0)
347			PANIC("Unable to initialize free KSE queue lock");
348		if (_lock_init(&thread_lock, LCK_ADAPTIVE,
349		    _kse_lock_wait, _kse_lock_wakeup) != 0)
350			PANIC("Unable to initialize free thread queue lock");
351		if (_lock_init(&_thread_list_lock, LCK_ADAPTIVE,
352		    _kse_lock_wait, _kse_lock_wakeup) != 0)
353			PANIC("Unable to initialize thread list lock");
354		active_kse_count = 0;
355		active_kseg_count = 0;
356		_gc_count = 0;
357		inited = 1;
358	}
359}
360
361int
362_kse_isthreaded(void)
363{
364	return (__isthreaded != 0);
365}
366
367/*
368 * This is called when the first thread (other than the initial
369 * thread) is created.
370 */
371int
372_kse_setthreaded(int threaded)
373{
374	if ((threaded != 0) && (__isthreaded == 0)) {
375		/*
376		 * Locking functions in libc are required when there are
377		 * threads other than the initial thread.
378		 */
379		__isthreaded = 1;
380
381		/*
382		 * Tell the kernel to create a KSE for the initial thread
383		 * and enable upcalls in it.
384		 */
385		_thr_signal_init();
386		_kse_initial->k_flags |= KF_STARTED;
387		if (kse_create(&_kse_initial->k_mbx, 0) != 0) {
388			_kse_initial->k_flags &= ~KF_STARTED;
389			__isthreaded = 0;
390			/* may abort() */
391			PANIC("kse_create() failed\n");
392			return (-1);
393		}
394		KSE_SET_MBOX(_kse_initial, _thr_initial);
395		_thr_start_sig_daemon();
396		_thr_setmaxconcurrency();
397	}
398	return (0);
399}
400
401/*
402 * Lock wait and wakeup handlers for KSE locks.  These are only used by
403 * KSEs, and should never be used by threads.  KSE locks include the
404 * KSE group lock (used for locking the scheduling queue) and the
405 * kse_lock defined above.
406 *
407 * When a KSE lock attempt blocks, the entire KSE blocks allowing another
408 * KSE to run.  For the most part, it doesn't make much sense to try and
409 * schedule another thread because you need to lock the scheduling queue
410 * in order to do that.  And since the KSE lock is used to lock the scheduling
411 * queue, you would just end up blocking again.
412 */
413void
414_kse_lock_wait(struct lock *lock, struct lockuser *lu)
415{
416	struct kse *curkse = (struct kse *)_LCK_GET_PRIVATE(lu);
417	struct timespec ts;
418	int saved_flags;
419
420	if (curkse->k_mbx.km_curthread != NULL)
421		PANIC("kse_lock_wait does not disable upcall.\n");
422	/*
423	 * Enter a loop to wait until we get the lock.
424	 */
425	ts.tv_sec = 0;
426	ts.tv_nsec = 1000000;  /* 1 sec */
427	while (!_LCK_GRANTED(lu)) {
428		/*
429		 * Yield the kse and wait to be notified when the lock
430		 * is granted.
431		 */
432		saved_flags = curkse->k_mbx.km_flags;
433		curkse->k_mbx.km_flags |= KMF_NOUPCALL | KMF_NOCOMPLETED;
434		kse_release(&ts);
435		curkse->k_mbx.km_flags = saved_flags;
436	}
437}
438
439void
440_kse_lock_wakeup(struct lock *lock, struct lockuser *lu)
441{
442	struct kse *curkse;
443	struct kse *kse;
444	struct kse_mailbox *mbx;
445
446	curkse = _get_curkse();
447	kse = (struct kse *)_LCK_GET_PRIVATE(lu);
448
449	if (kse == curkse)
450		PANIC("KSE trying to wake itself up in lock");
451	else {
452		mbx = &kse->k_mbx;
453		_lock_grant(lock, lu);
454		/*
455		 * Notify the owning kse that it has the lock.
456		 * It is safe to pass invalid address to kse_wakeup
457		 * even if the mailbox is not in kernel at all,
458		 * and waking up a wrong kse is also harmless.
459		 */
460		kse_wakeup(mbx);
461	}
462}
463
464/*
465 * Thread wait and wakeup handlers for thread locks.  These are only used
466 * by threads, never by KSEs.  Thread locks include the per-thread lock
467 * (defined in its structure), and condition variable and mutex locks.
468 */
469void
470_thr_lock_wait(struct lock *lock, struct lockuser *lu)
471{
472	struct pthread *curthread = (struct pthread *)lu->lu_private;
473
474	do {
475		THR_SCHED_LOCK(curthread, curthread);
476		THR_SET_STATE(curthread, PS_LOCKWAIT);
477		THR_SCHED_UNLOCK(curthread, curthread);
478		_thr_sched_switch(curthread);
479	} while (!_LCK_GRANTED(lu));
480}
481
482void
483_thr_lock_wakeup(struct lock *lock, struct lockuser *lu)
484{
485	struct pthread *thread;
486	struct pthread *curthread;
487
488	curthread = _get_curthread();
489	thread = (struct pthread *)_LCK_GET_PRIVATE(lu);
490
491	THR_SCHED_LOCK(curthread, thread);
492	_lock_grant(lock, lu);
493	_thr_setrunnable_unlocked(thread);
494	THR_SCHED_UNLOCK(curthread, thread);
495}
496
497kse_critical_t
498_kse_critical_enter(void)
499{
500	kse_critical_t crit;
501
502	crit = _ksd_readandclear_tmbx();
503	return (crit);
504}
505
506void
507_kse_critical_leave(kse_critical_t crit)
508{
509	struct pthread *curthread;
510
511	_ksd_set_tmbx(crit);
512	if ((crit != NULL) && ((curthread = _get_curthread()) != NULL))
513		THR_YIELD_CHECK(curthread);
514}
515
516int
517_kse_in_critical(void)
518{
519	return (_ksd_get_tmbx() == NULL);
520}
521
522void
523_thr_critical_enter(struct pthread *thread)
524{
525	thread->critical_count++;
526}
527
528void
529_thr_critical_leave(struct pthread *thread)
530{
531	thread->critical_count--;
532	THR_YIELD_CHECK(thread);
533}
534
535void
536_thr_sched_switch(struct pthread *curthread)
537{
538	struct kse *curkse;
539
540	(void)_kse_critical_enter();
541	curkse = _get_curkse();
542	KSE_SCHED_LOCK(curkse, curkse->k_kseg);
543	_thr_sched_switch_unlocked(curthread);
544}
545
546/*
547 * XXX - We may need to take the scheduling lock before calling
548 *       this, or perhaps take the lock within here before
549 *       doing anything else.
550 */
551void
552_thr_sched_switch_unlocked(struct pthread *curthread)
553{
554	struct pthread *td;
555	struct pthread_sigframe psf;
556	struct kse *curkse;
557	int ret;
558	volatile int uts_once;
559	volatile int resume_once = 0;
560	ucontext_t uc;
561
562	/* We're in the scheduler, 5 by 5: */
563	curkse = _get_curkse();
564
565	curthread->need_switchout = 1;	/* The thread yielded on its own. */
566	curthread->critical_yield = 0;	/* No need to yield anymore. */
567	curthread->slice_usec = -1;	/* Restart the time slice. */
568
569	/* Thread can unlock the scheduler lock. */
570	curthread->lock_switch = 1;
571
572	/*
573	 * The signal frame is allocated off the stack because
574	 * a thread can be interrupted by other signals while
575	 * it is running down pending signals.
576	 */
577	psf.psf_valid = 0;
578	curthread->curframe = &psf;
579
580	/*
581	 * Enter the scheduler if any one of the following is true:
582	 *
583	 *   o The current thread is dead; it's stack needs to be
584	 *     cleaned up and it can't be done while operating on
585	 *     it.
586	 *   o The current thread has signals pending, should
587	 *     let scheduler install signal trampoline for us.
588	 *   o There are no runnable threads.
589	 *   o The next thread to run won't unlock the scheduler
590	 *     lock.  A side note: the current thread may be run
591	 *     instead of the next thread in the run queue, but
592	 *     we don't bother checking for that.
593	 */
594	if ((curthread->state == PS_DEAD) ||
595	    (((td = KSE_RUNQ_FIRST(curkse)) == NULL) &&
596	    (curthread->state != PS_RUNNING)) ||
597	    ((td != NULL) && (td->lock_switch == 0))) {
598		curkse->k_switch = 1;
599		_thread_enter_uts(&curthread->tmbx, &curkse->k_mbx);
600	}
601	else {
602		uts_once = 0;
603		THR_GETCONTEXT(&curthread->tmbx.tm_context);
604		if (uts_once == 0) {
605			uts_once = 1;
606
607			/* Switchout the current thread. */
608			kse_switchout_thread(curkse, curthread);
609
610		 	/* Choose another thread to run. */
611			td = KSE_RUNQ_FIRST(curkse);
612			KSE_RUNQ_REMOVE(curkse, td);
613			curkse->k_curthread = td;
614
615			/*
616			 * Make sure the current thread's kse points to
617			 * this kse.
618			 */
619			td->kse = curkse;
620
621			/*
622			 * Reset accounting.
623			 */
624			td->tmbx.tm_uticks = 0;
625			td->tmbx.tm_sticks = 0;
626
627			/*
628			 * Reset the time slice if this thread is running
629			 * for the first time or running again after using
630			 * its full time slice allocation.
631			 */
632			if (td->slice_usec == -1)
633				td->slice_usec = 0;
634
635			/* Mark the thread active. */
636			td->active = 1;
637
638			/* Remove the frame reference. */
639			td->curframe = NULL;
640
641			/*
642			 * Continue the thread at its current frame:
643			 */
644			ret = _thread_switch(&td->tmbx, NULL);
645			/* This point should not be reached. */
646			if (ret != 0)
647				PANIC("Bad return from _thread_switch");
648			PANIC("Thread has returned from _thread_switch");
649		}
650	}
651
652	if (psf.psf_valid) {
653		/*
654		 * It is ugly we must increase critical count, because we
655		 * have a frame saved, we must backout state in psf
656		 * before we can process signals.
657 		 */
658		curthread->critical_count++;
659	}
660
661	if (curthread->lock_switch != 0) {
662		/*
663		 * Unlock the scheduling queue and leave the
664		 * critical region.
665		 */
666		/* Don't trust this after a switch! */
667		curkse = _get_curkse();
668
669		curthread->lock_switch = 0;
670		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
671		_kse_critical_leave(&curthread->tmbx);
672	}
673	/*
674	 * This thread is being resumed; check for cancellations.
675	 */
676	if ((psf.psf_valid || curthread->check_pending)) {
677		resume_once = 0;
678		THR_GETCONTEXT(&uc);
679		if (resume_once == 0) {
680			resume_once = 1;
681			curthread->check_pending = 0;
682			thr_resume_check(curthread, &uc, &psf);
683		}
684	}
685	THR_ACTIVATE_LAST_LOCK(curthread);
686}
687
688/*
689 * This is the scheduler for a KSE which runs a scope system thread.
690 * The multi-thread KSE scheduler should also work for a single threaded
691 * KSE, but we use a separate scheduler so that it can be fine-tuned
692 * to be more efficient (and perhaps not need a separate stack for
693 * the KSE, allowing it to use the thread's stack).
694 *
695 * XXX - This probably needs some work.
696 */
697#ifdef NOT_YET
698static void
699kse_sched_single(struct kse *curkse)
700{
701	struct pthread *curthread = curkse->k_curthread;
702	struct pthread *td_wait;
703	struct timespec ts;
704	int level;
705
706	if (curthread->active == 0) {
707		if (curthread->state != PS_RUNNING) {
708			/* Check to see if the thread has timed out. */
709			KSE_GET_TOD(curkse, &ts);
710			if (thr_timedout(curthread, &ts) != 0) {
711				curthread->timeout = 1;
712				curthread->state = PS_RUNNING;
713			}
714		}
715	}
716
717	/* This thread no longer needs to yield the CPU: */
718	curthread->critical_yield = 0;
719	curthread->need_switchout = 0;
720
721	/*
722	 * Lock the scheduling queue.
723	 *
724	 * There is no scheduling queue for single threaded KSEs,
725	 * but we need a lock for protection regardless.
726	 */
727	KSE_SCHED_LOCK(curkse, curkse->k_kseg);
728
729	/*
730	 * This has to do the job of kse_switchout_thread(), only
731	 * for a single threaded KSE/KSEG.
732	 */
733
734	switch (curthread->state) {
735	case PS_DEAD:
736		/* Unlock the scheduling queue and exit the KSE and thread. */
737		thr_cleaup(curkse, curthread);
738		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
739		break;
740
741	case PS_COND_WAIT:
742	case PS_SLEEP_WAIT:
743		/* Only insert threads that can timeout: */
744		if (curthread->wakeup_time.tv_sec != -1) {
745			/* Insert into the waiting queue: */
746			KSE_WAITQ_INSERT(curkse, curthread);
747		}
748		break;
749
750	case PS_LOCKWAIT:
751		level = curthread->locklevel - 1;
752		if (!_LCK_GRANTED(&curthread->lockusers[level]))
753			KSE_WAITQ_INSERT(curkse, curthread);
754		else
755			THR_SET_STATE(curthread, PS_RUNNING);
756		break;
757
758	case PS_JOIN:
759	case PS_MUTEX_WAIT:
760	case PS_RUNNING:
761	case PS_SIGSUSPEND:
762	case PS_SIGWAIT:
763	case PS_SUSPENDED:
764	case PS_DEADLOCK:
765	default:
766		/*
767		 * These states don't timeout and don't need
768		 * to be in the waiting queue.
769		 */
770		break;
771	}
772	while (curthread->state != PS_RUNNING) {
773		curthread->active = 0;
774		td_wait = KSE_WAITQ_FIRST(curkse);
775
776		kse_wait(curkse, td_wait);
777
778	    	if (td_wait != NULL) {
779			KSE_GET_TOD(curkse, &ts);
780			if (thr_timedout(curthread, &ts)) {
781				/* Indicate the thread timedout: */
782				td_wait->timeout = 1;
783
784				/* Make the thread runnable. */
785				THR_SET_STATE(td_wait, PS_RUNNING);
786				KSE_WAITQ_REMOVE(curkse, td_wait);
787			}
788		}
789	}
790
791	/* Remove the frame reference. */
792	curthread->curframe = NULL;
793
794	/* Unlock the scheduling queue. */
795	KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
796
797	/*
798	 * Continue the thread at its current frame:
799	 */
800	DBG_MSG("Continuing bound thread %p\n", curthread);
801	_thread_switch(&curthread->tmbx, &curkse->k_mbx.km_curthread);
802	PANIC("Thread has returned from _thread_switch");
803}
804#endif
805
806#ifdef DEBUG_THREAD_KERN
807static void
808dump_queues(struct kse *curkse)
809{
810	struct pthread *thread;
811
812	DBG_MSG("Threads in waiting queue:\n");
813	TAILQ_FOREACH(thread, &curkse->k_kseg->kg_schedq.sq_waitq, pqe) {
814		DBG_MSG("  thread %p, state %d, blocked %d\n",
815		    thread, thread->state, thread->blocked);
816	}
817}
818#endif
819
820/*
821 * This is the scheduler for a KSE which runs multiple threads.
822 */
823static void
824kse_sched_multi(struct kse *curkse)
825{
826	struct pthread *curthread, *td_wait;
827	struct pthread_sigframe *curframe;
828	int ret;
829
830	THR_ASSERT(curkse->k_mbx.km_curthread == NULL,
831	    "Mailbox not null in kse_sched_multi");
832
833	/* Check for first time initialization: */
834	if ((curkse->k_flags & KF_INITIALIZED) == 0) {
835		/* Setup this KSEs specific data. */
836		_ksd_setprivate(&curkse->k_ksd);
837		_set_curkse(curkse);
838
839		/* Set this before grabbing the context. */
840		curkse->k_flags |= KF_INITIALIZED;
841	}
842
843	/* This may have returned from a kse_release(). */
844	if (KSE_WAITING(curkse)) {
845		DBG_MSG("Entered upcall when KSE is waiting.");
846		KSE_CLEAR_WAIT(curkse);
847	}
848
849	/*If this is an upcall; take the scheduler lock. */
850	if (curkse->k_switch == 0)
851		KSE_SCHED_LOCK(curkse, curkse->k_kseg);
852	curkse->k_switch = 0;
853
854	curthread = curkse->k_curthread;
855
856	if (KSE_IS_IDLE(curkse)) {
857		KSE_CLEAR_IDLE(curkse);
858		curkse->k_kseg->kg_idle_kses--;
859	}
860	/*
861	 * If the current thread was completed in another KSE, then
862	 * it will be in the run queue.  Don't mark it as being blocked.
863	 */
864	if ((curthread != NULL) &&
865	    ((curthread->flags & THR_FLAGS_IN_RUNQ) == 0) &&
866	    (curthread->need_switchout == 0)) {
867		/*
868		 * Assume the current thread is blocked; when the
869		 * completed threads are checked and if the current
870		 * thread is among the completed, the blocked flag
871		 * will be cleared.
872		 */
873		curthread->blocked = 1;
874	}
875
876	/* Check for any unblocked threads in the kernel. */
877	kse_check_completed(curkse);
878
879	/*
880	 * Check for threads that have timed-out.
881	 */
882	kse_check_waitq(curkse);
883
884	/*
885	 * Switchout the current thread, if necessary, as the last step
886	 * so that it is inserted into the run queue (if it's runnable)
887	 * _after_ any other threads that were added to it above.
888	 */
889	if (curthread == NULL)
890		;  /* Nothing to do here. */
891	else if ((curthread->need_switchout == 0) &&
892	    (curthread->blocked == 0) && (THR_IN_CRITICAL(curthread))) {
893		/*
894		 * Resume the thread and tell it to yield when
895		 * it leaves the critical region.
896		 */
897		curthread->critical_yield = 1;
898		curthread->active = 1;
899		if ((curthread->flags & THR_FLAGS_IN_RUNQ) != 0)
900			KSE_RUNQ_REMOVE(curkse, curthread);
901		curkse->k_curthread = curthread;
902		curthread->kse = curkse;
903		DBG_MSG("Continuing thread %p in critical region\n",
904		    curthread);
905		kse_wakeup_multi(curkse);
906		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
907		ret = _thread_switch(&curthread->tmbx,
908		    &curkse->k_mbx.km_curthread);
909		if (ret != 0)
910			PANIC("Can't resume thread in critical region\n");
911	}
912	else if ((curthread->flags & THR_FLAGS_IN_RUNQ) == 0)
913		kse_switchout_thread(curkse, curthread);
914	curkse->k_curthread = NULL;
915
916	kse_wakeup_multi(curkse);
917
918#ifdef DEBUG_THREAD_KERN
919	dump_queues(curkse);
920#endif
921
922	/* Check if there are no threads ready to run: */
923	while (((curthread = KSE_RUNQ_FIRST(curkse)) == NULL) &&
924	    (curkse->k_kseg->kg_threadcount != 0)) {
925		/*
926		 * Wait for a thread to become active or until there are
927		 * no more threads.
928		 */
929		td_wait = KSE_WAITQ_FIRST(curkse);
930		kse_wait(curkse, td_wait);
931		kse_check_completed(curkse);
932		kse_check_waitq(curkse);
933	}
934
935	/* Check for no more threads: */
936	if (curkse->k_kseg->kg_threadcount == 0) {
937		/*
938		 * Normally this shouldn't return, but it will if there
939		 * are other KSEs running that create new threads that
940		 * are assigned to this KSE[G].  For instance, if a scope
941		 * system thread were to create a scope process thread
942		 * and this kse[g] is the initial kse[g], then that newly
943		 * created thread would be assigned to us (the initial
944		 * kse[g]).
945		 */
946		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
947		kse_fini(curkse);
948		/* never returns */
949	}
950
951	THR_ASSERT(curthread != NULL,
952	    "Return from kse_wait/fini without thread.");
953	THR_ASSERT(curthread->state != PS_DEAD,
954	    "Trying to resume dead thread!");
955	KSE_RUNQ_REMOVE(curkse, curthread);
956
957	/*
958	 * Make the selected thread the current thread.
959	 */
960	curkse->k_curthread = curthread;
961
962	/*
963	 * Make sure the current thread's kse points to this kse.
964	 */
965	curthread->kse = curkse;
966
967	/*
968	 * Reset accounting.
969	 */
970	curthread->tmbx.tm_uticks = 0;
971	curthread->tmbx.tm_sticks = 0;
972
973	/*
974	 * Reset the time slice if this thread is running for the first
975	 * time or running again after using its full time slice allocation.
976	 */
977	if (curthread->slice_usec == -1)
978		curthread->slice_usec = 0;
979
980	/* Mark the thread active. */
981	curthread->active = 1;
982
983	/* Remove the frame reference. */
984	curframe = curthread->curframe;
985	curthread->curframe = NULL;
986
987	kse_wakeup_multi(curkse);
988
989	/*
990	 * The thread's current signal frame will only be NULL if it
991	 * is being resumed after being blocked in the kernel.  In
992	 * this case, and if the thread needs to run down pending
993	 * signals or needs a cancellation check, we need to add a
994	 * signal frame to the thread's context.
995	 */
996#ifdef NOT_YET
997	if ((((curframe == NULL) && (curthread->check_pending != 0)) ||
998	    (((curthread->cancelflags & THR_AT_CANCEL_POINT) == 0) &&
999	     ((curthread->cancelflags & PTHREAD_CANCEL_ASYNCHRONOUS) != 0))) &&
1000	     !THR_IN_CRITICAL(curthread))
1001		signalcontext(&curthread->tmbx.tm_context, 0,
1002		    (__sighandler_t *)thr_resume_wrapper);
1003#else
1004	if ((curframe == NULL) && (curthread->check_pending != 0) &&
1005	    !THR_IN_CRITICAL(curthread)) {
1006		curthread->check_pending = 0;
1007		signalcontext(&curthread->tmbx.tm_context, 0,
1008		    (__sighandler_t *)thr_resume_wrapper);
1009	}
1010#endif
1011	/*
1012	 * Continue the thread at its current frame:
1013	 */
1014	if (curthread->lock_switch != 0) {
1015		/*
1016		 * This thread came from a scheduler switch; it will
1017		 * unlock the scheduler lock and set the mailbox.
1018		 */
1019		ret = _thread_switch(&curthread->tmbx, NULL);
1020	} else {
1021		/* This thread won't unlock the scheduler lock. */
1022		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
1023		ret = _thread_switch(&curthread->tmbx,
1024		    &curkse->k_mbx.km_curthread);
1025	}
1026	if (ret != 0)
1027		PANIC("Thread has returned from _thread_switch");
1028
1029	/* This point should not be reached. */
1030	PANIC("Thread has returned from _thread_switch");
1031}
1032
1033static void
1034thr_resume_wrapper(int sig, siginfo_t *siginfo, ucontext_t *ucp)
1035{
1036	struct pthread *curthread = _get_curthread();
1037	struct kse *curkse;
1038	int ret;
1039
1040	DBG_MSG(">>> sig wrapper\n");
1041	if (curthread->lock_switch)
1042		PANIC("thr_resume_wrapper, lock_switch != 0\n");
1043	thr_resume_check(curthread, ucp, NULL);
1044	_kse_critical_enter();
1045	curkse = _get_curkse();
1046	curthread->tmbx.tm_context = *ucp;
1047	ret = _thread_switch(&curthread->tmbx, &curkse->k_mbx.km_curthread);
1048	if (ret != 0)
1049		PANIC("thr_resume_wrapper: thread has returned "
1050		      "from _thread_switch");
1051	/* THR_SETCONTEXT(ucp); */ /* not work, why ? */
1052}
1053
1054static void
1055thr_resume_check(struct pthread *curthread, ucontext_t *ucp,
1056    struct pthread_sigframe *psf)
1057{
1058	_thr_sig_rundown(curthread, ucp, psf);
1059
1060#ifdef NOT_YET
1061	if (((curthread->cancelflags & THR_AT_CANCEL_POINT) == 0) &&
1062	    ((curthread->cancelflags & PTHREAD_CANCEL_ASYNCHRONOUS) != 0))
1063		pthread_testcancel();
1064#endif
1065}
1066
1067/*
1068 * Clean up a thread.  This must be called with the thread's KSE
1069 * scheduling lock held.  The thread must be a thread from the
1070 * KSE's group.
1071 */
1072static void
1073thr_cleanup(struct kse *curkse, struct pthread *thread)
1074{
1075	struct pthread *joiner;
1076	int sys_scope;
1077
1078	if ((joiner = thread->joiner) != NULL) {
1079		/* Joinee scheduler lock held; joiner won't leave. */
1080		if (joiner->kseg == curkse->k_kseg) {
1081			if (joiner->join_status.thread == thread) {
1082				joiner->join_status.thread = NULL;
1083				joiner->join_status.ret = thread->ret;
1084				_thr_setrunnable_unlocked(joiner);
1085			}
1086		} else {
1087			KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
1088			/* The joiner may have removed itself and exited. */
1089			if (_thr_ref_add(thread, joiner, 0) == 0) {
1090				KSE_SCHED_LOCK(curkse, joiner->kseg);
1091				if (joiner->join_status.thread == thread) {
1092					joiner->join_status.thread = NULL;
1093					joiner->join_status.ret = thread->ret;
1094					_thr_setrunnable_unlocked(joiner);
1095				}
1096				KSE_SCHED_UNLOCK(curkse, joiner->kseg);
1097				_thr_ref_delete(thread, joiner);
1098			}
1099			KSE_SCHED_LOCK(curkse, curkse->k_kseg);
1100		}
1101		thread->attr.flags |= PTHREAD_DETACHED;
1102	}
1103
1104	if (!(sys_scope = (thread->attr.flags & PTHREAD_SCOPE_SYSTEM))) {
1105		/*
1106		 * Remove the thread from the KSEG's list of threads.
1107	 	 */
1108		KSEG_THRQ_REMOVE(thread->kseg, thread);
1109		/*
1110		 * Migrate the thread to the main KSE so that this
1111		 * KSE and KSEG can be cleaned when their last thread
1112		 * exits.
1113		 */
1114		thread->kseg = _kse_initial->k_kseg;
1115		thread->kse = _kse_initial;
1116	}
1117	thread->flags |= THR_FLAGS_GC_SAFE;
1118
1119	/*
1120	 * We can't hold the thread list lock while holding the
1121	 * scheduler lock.
1122	 */
1123	KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
1124	DBG_MSG("Adding thread %p to GC list\n", thread);
1125	KSE_LOCK_ACQUIRE(curkse, &_thread_list_lock);
1126	THR_GCLIST_ADD(thread);
1127	/* Use thread_list_lock */
1128	active_threads--;
1129	if (active_threads == 1) {
1130		KSE_LOCK_RELEASE(curkse, &_thread_list_lock);
1131		exit(0);
1132        }
1133	KSE_LOCK_RELEASE(curkse, &_thread_list_lock);
1134	if (sys_scope) {
1135		/*
1136		 * System scope thread is single thread group,
1137		 * when thread is exited, its kse and ksegrp should
1138		 * be recycled as well.
1139		 */
1140		kse_exit();
1141		PANIC("kse_exit() failed for system scope thread");
1142	}
1143	KSE_SCHED_LOCK(curkse, curkse->k_kseg);
1144}
1145
1146void
1147_thr_gc(struct pthread *curthread)
1148{
1149	struct pthread *td, *td_next;
1150	kse_critical_t crit;
1151	TAILQ_HEAD(, pthread) worklist;
1152
1153	TAILQ_INIT(&worklist);
1154	crit = _kse_critical_enter();
1155	KSE_LOCK_ACQUIRE(curthread->kse, &_thread_list_lock);
1156
1157	/* Check the threads waiting for GC. */
1158	for (td = TAILQ_FIRST(&_thread_gc_list); td != NULL; td = td_next) {
1159		td_next = TAILQ_NEXT(td, gcle);
1160		if ((td->flags & THR_FLAGS_GC_SAFE) == 0)
1161			continue;
1162		else if (((td->attr.flags & PTHREAD_SCOPE_SYSTEM) != 0) &&
1163		    ((td->kse->k_mbx.km_flags & KMF_DONE) == 0)) {
1164			/*
1165			 * The thread and KSE are operating on the same
1166			 * stack.  Wait for the KSE to exit before freeing
1167			 * the thread's stack as well as everything else.
1168			 */
1169			continue;
1170		}
1171		/*
1172		 * Remove the thread from the GC list.  If the thread
1173		 * isn't yet detached, it will get added back to the
1174		 * GC list at a later time.
1175		 */
1176		THR_GCLIST_REMOVE(td);
1177		DBG_MSG("Freeing thread %p stack\n", td);
1178		/*
1179		 * We can free the thread stack since it's no longer
1180		 * in use.
1181		 */
1182		_thr_stack_free(&td->attr);
1183		if (((td->attr.flags & PTHREAD_DETACHED) != 0) &&
1184		    (td->refcount == 0)) {
1185			/*
1186			 * The thread has detached and is no longer
1187			 * referenced.  It is safe to remove all
1188			 * remnants of the thread.
1189			 */
1190			THR_LIST_REMOVE(td);
1191			TAILQ_INSERT_HEAD(&worklist, td, gcle);
1192		}
1193	}
1194	KSE_LOCK_RELEASE(curthread->kse, &_thread_list_lock);
1195	_kse_critical_leave(crit);
1196
1197	while ((td = TAILQ_FIRST(&worklist)) != NULL) {
1198		TAILQ_REMOVE(&worklist, td, gcle);
1199
1200		if ((td->attr.flags & PTHREAD_SCOPE_SYSTEM) != 0) {
1201			crit = _kse_critical_enter();
1202			KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
1203			kse_free_unlocked(td->kse);
1204			kseg_free_unlocked(td->kseg);
1205			KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
1206			_kse_critical_leave(crit);
1207		}
1208		/*
1209		 * XXX we don't free initial thread, because there might
1210		 * have some code referencing initial thread.
1211		 */
1212		if (td != _thr_initial) {
1213			DBG_MSG("Freeing thread %p\n", td);
1214			_thr_free(curthread, td);
1215		} else
1216			DBG_MSG("Initial thread won't be freed\n");
1217	}
1218	/* XXX free kse and ksegrp list should be looked as well */
1219}
1220
1221
1222/*
1223 * Only new threads that are running or suspended may be scheduled.
1224 */
1225int
1226_thr_schedule_add(struct pthread *curthread, struct pthread *newthread)
1227{
1228	kse_critical_t crit;
1229	int ret;
1230
1231	/* Add the new thread. */
1232	thr_link(newthread);
1233
1234	/*
1235	 * If this is the first time creating a thread, make sure
1236	 * the mailbox is set for the current thread.
1237	 */
1238	if ((newthread->attr.flags & PTHREAD_SCOPE_SYSTEM) != 0) {
1239#ifdef NOT_YET
1240		/* We use the thread's stack as the KSE's stack. */
1241		new_thread->kse->k_mbx.km_stack.ss_sp =
1242		    new_thread->attr.stackaddr_attr;
1243		new_thread->kse->k_mbx.km_stack.ss_size =
1244		    new_thread->attr.stacksize_attr;
1245#endif
1246		/*
1247		 * No need to lock the scheduling queue since the
1248		 * KSE/KSEG pair have not yet been started.
1249		 */
1250		KSEG_THRQ_ADD(newthread->kseg, newthread);
1251		if (newthread->state == PS_RUNNING)
1252			THR_RUNQ_INSERT_TAIL(newthread);
1253		newthread->kse->k_curthread = NULL;
1254		newthread->kse->k_mbx.km_flags = 0;
1255		newthread->kse->k_mbx.km_func = (kse_func_t *)kse_sched_multi;
1256		newthread->kse->k_mbx.km_quantum = 0;
1257
1258		/*
1259		 * This thread needs a new KSE and KSEG.
1260		 */
1261		newthread->kse->k_flags &= ~KF_INITIALIZED;
1262		newthread->kse->k_flags |= KF_STARTED;
1263		ret = kse_create(&newthread->kse->k_mbx, 1);
1264		if (ret != 0)
1265			ret = errno;
1266	}
1267	else {
1268		/*
1269		 * Lock the KSE and add the new thread to its list of
1270		 * assigned threads.  If the new thread is runnable, also
1271		 * add it to the KSE's run queue.
1272		 */
1273		crit = _kse_critical_enter();
1274		KSE_SCHED_LOCK(curthread->kse, newthread->kseg);
1275		KSEG_THRQ_ADD(newthread->kseg, newthread);
1276		if (newthread->state == PS_RUNNING)
1277			THR_RUNQ_INSERT_TAIL(newthread);
1278		if ((newthread->kse->k_flags & KF_STARTED) == 0) {
1279			/*
1280			 * This KSE hasn't been started yet.  Start it
1281			 * outside of holding the lock.
1282			 */
1283			newthread->kse->k_flags |= KF_STARTED;
1284			newthread->kse->k_mbx.km_func =
1285			    (kse_func_t *)kse_sched_multi;
1286			newthread->kse->k_mbx.km_flags = 0;
1287			kse_create(&newthread->kse->k_mbx, 0);
1288		 } else if ((newthread->state == PS_RUNNING) &&
1289		     KSE_IS_IDLE(newthread->kse)) {
1290			/*
1291			 * The thread is being scheduled on another KSEG.
1292			 */
1293			kse_wakeup_one(newthread);
1294		}
1295		KSE_SCHED_UNLOCK(curthread->kse, newthread->kseg);
1296		_kse_critical_leave(crit);
1297		ret = 0;
1298	}
1299	if (ret != 0)
1300		thr_unlink(newthread);
1301
1302	return (ret);
1303}
1304
1305void
1306kse_waitq_insert(struct pthread *thread)
1307{
1308	struct pthread *td;
1309
1310	if (thread->wakeup_time.tv_sec == -1)
1311		TAILQ_INSERT_TAIL(&thread->kse->k_schedq->sq_waitq, thread,
1312		    pqe);
1313	else {
1314		td = TAILQ_FIRST(&thread->kse->k_schedq->sq_waitq);
1315		while ((td != NULL) && (td->wakeup_time.tv_sec != -1) &&
1316		    ((td->wakeup_time.tv_sec < thread->wakeup_time.tv_sec) ||
1317		    ((td->wakeup_time.tv_sec == thread->wakeup_time.tv_sec) &&
1318		    (td->wakeup_time.tv_nsec <= thread->wakeup_time.tv_nsec))))
1319			td = TAILQ_NEXT(td, pqe);
1320		if (td == NULL)
1321			TAILQ_INSERT_TAIL(&thread->kse->k_schedq->sq_waitq,
1322			    thread, pqe);
1323		else
1324			TAILQ_INSERT_BEFORE(td, thread, pqe);
1325	}
1326	thread->flags |= THR_FLAGS_IN_WAITQ;
1327}
1328
1329/*
1330 * This must be called with the scheduling lock held.
1331 */
1332static void
1333kse_check_completed(struct kse *kse)
1334{
1335	struct pthread *thread;
1336	struct kse_thr_mailbox *completed;
1337	int sig;
1338
1339	if ((completed = kse->k_mbx.km_completed) != NULL) {
1340		kse->k_mbx.km_completed = NULL;
1341		while (completed != NULL) {
1342			thread = completed->tm_udata;
1343			DBG_MSG("Found completed thread %p, name %s\n",
1344			    thread,
1345			    (thread->name == NULL) ? "none" : thread->name);
1346			thread->blocked = 0;
1347			if (thread != kse->k_curthread) {
1348				if ((thread->flags & THR_FLAGS_SUSPENDED) != 0)
1349					THR_SET_STATE(thread, PS_SUSPENDED);
1350				else
1351					KSE_RUNQ_INSERT_TAIL(kse, thread);
1352				if ((thread->kse != kse) &&
1353				    (thread->kse->k_curthread == thread)) {
1354					thread->kse->k_curthread = NULL;
1355					thread->active = 0;
1356				}
1357			}
1358			if ((sig = thread->tmbx.tm_syncsig.si_signo) != 0) {
1359				if (SIGISMEMBER(thread->sigmask, sig))
1360					SIGADDSET(thread->sigpend, sig);
1361				else
1362					_thr_sig_add(thread, sig, &thread->tmbx.tm_syncsig);
1363				thread->tmbx.tm_syncsig.si_signo = 0;
1364			}
1365			completed = completed->tm_next;
1366		}
1367	}
1368}
1369
1370/*
1371 * This must be called with the scheduling lock held.
1372 */
1373static void
1374kse_check_waitq(struct kse *kse)
1375{
1376	struct pthread	*pthread;
1377	struct timespec ts;
1378
1379	KSE_GET_TOD(kse, &ts);
1380
1381	/*
1382	 * Wake up threads that have timedout.  This has to be
1383	 * done before adding the current thread to the run queue
1384	 * so that a CPU intensive thread doesn't get preference
1385	 * over waiting threads.
1386	 */
1387	while (((pthread = KSE_WAITQ_FIRST(kse)) != NULL) &&
1388	    thr_timedout(pthread, &ts)) {
1389		/* Remove the thread from the wait queue: */
1390		KSE_WAITQ_REMOVE(kse, pthread);
1391		DBG_MSG("Found timedout thread %p in waitq\n", pthread);
1392
1393		/* Indicate the thread timedout: */
1394		pthread->timeout = 1;
1395
1396		/* Add the thread to the priority queue: */
1397		if ((pthread->flags & THR_FLAGS_SUSPENDED) != 0)
1398			THR_SET_STATE(pthread, PS_SUSPENDED);
1399		else {
1400			THR_SET_STATE(pthread, PS_RUNNING);
1401			KSE_RUNQ_INSERT_TAIL(kse, pthread);
1402		}
1403	}
1404}
1405
1406static int
1407thr_timedout(struct pthread *thread, struct timespec *curtime)
1408{
1409	if (thread->wakeup_time.tv_sec < 0)
1410		return (0);
1411	else if (thread->wakeup_time.tv_sec > curtime->tv_sec)
1412		return (0);
1413	else if ((thread->wakeup_time.tv_sec == curtime->tv_sec) &&
1414	    (thread->wakeup_time.tv_nsec > curtime->tv_nsec))
1415		return (0);
1416	else
1417		return (1);
1418}
1419
1420/*
1421 * This must be called with the scheduling lock held.
1422 *
1423 * Each thread has a time slice, a wakeup time (used when it wants
1424 * to wait for a specified amount of time), a run state, and an
1425 * active flag.
1426 *
1427 * When a thread gets run by the scheduler, the active flag is
1428 * set to non-zero (1).  When a thread performs an explicit yield
1429 * or schedules a state change, it enters the scheduler and the
1430 * active flag is cleared.  When the active flag is still seen
1431 * set in the scheduler, that means that the thread is blocked in
1432 * the kernel (because it is cleared before entering the scheduler
1433 * in all other instances).
1434 *
1435 * The wakeup time is only set for those states that can timeout.
1436 * It is set to (-1, -1) for all other instances.
1437 *
1438 * The thread's run state, aside from being useful when debugging,
1439 * is used to place the thread in an appropriate queue.  There
1440 * are 2 basic queues:
1441 *
1442 *   o run queue - queue ordered by priority for all threads
1443 *                 that are runnable
1444 *   o waiting queue - queue sorted by wakeup time for all threads
1445 *                     that are not otherwise runnable (not blocked
1446 *                     in kernel, not waiting for locks)
1447 *
1448 * The thread's time slice is used for round-robin scheduling
1449 * (the default scheduling policy).  While a SCHED_RR thread
1450 * is runnable it's time slice accumulates.  When it reaches
1451 * the time slice interval, it gets reset and added to the end
1452 * of the queue of threads at its priority.  When a thread no
1453 * longer becomes runnable (blocks in kernel, waits, etc), its
1454 * time slice is reset.
1455 *
1456 * The job of kse_switchout_thread() is to handle all of the above.
1457 */
1458static void
1459kse_switchout_thread(struct kse *kse, struct pthread *thread)
1460{
1461	int level;
1462	int i;
1463	int restart;
1464	siginfo_t siginfo;
1465
1466	/*
1467	 * Place the currently running thread into the
1468	 * appropriate queue(s).
1469	 */
1470	DBG_MSG("Switching out thread %p, state %d\n", thread, thread->state);
1471
1472	THR_DEACTIVATE_LAST_LOCK(thread);
1473	if (thread->blocked != 0) {
1474		thread->active = 0;
1475		thread->need_switchout = 0;
1476		/* This thread must have blocked in the kernel. */
1477		/* thread->slice_usec = -1;*/	/* restart timeslice */
1478		if ((thread->slice_usec != -1) &&
1479		    (thread->attr.sched_policy != SCHED_FIFO))
1480			thread->slice_usec += (thread->tmbx.tm_uticks
1481			    + thread->tmbx.tm_sticks) * _clock_res_usec;
1482		/*
1483		 *  Check for pending signals for this thread to
1484		 *  see if we need to interrupt it in the kernel.
1485		 */
1486		if (thread->check_pending != 0) {
1487			for (i = 1; i <= _SIG_MAXSIG; ++i) {
1488				if (SIGISMEMBER(thread->sigpend, i) &&
1489				    !SIGISMEMBER(thread->sigmask, i)) {
1490					restart = _thread_sigact[1 - 1].sa_flags & SA_RESTART;
1491					kse_thr_interrupt(&thread->tmbx,
1492					    restart ? -2 : -1);
1493					break;
1494				}
1495			}
1496		}
1497	}
1498	else {
1499		switch (thread->state) {
1500		case PS_DEAD:
1501			/*
1502			 * The scheduler is operating on a different
1503			 * stack.  It is safe to do garbage collecting
1504			 * here.
1505			 */
1506			thread->active = 0;
1507			thread->need_switchout = 0;
1508			thr_cleanup(kse, thread);
1509			return;
1510			break;
1511
1512		case PS_RUNNING:
1513			if ((thread->flags & THR_FLAGS_SUSPENDED) != 0)
1514				THR_SET_STATE(thread, PS_SUSPENDED);
1515			break;
1516
1517		case PS_COND_WAIT:
1518		case PS_SLEEP_WAIT:
1519			/* Insert into the waiting queue: */
1520			KSE_WAITQ_INSERT(kse, thread);
1521			break;
1522
1523		case PS_LOCKWAIT:
1524			/*
1525			 * This state doesn't timeout.
1526			 */
1527			thread->wakeup_time.tv_sec = -1;
1528			thread->wakeup_time.tv_nsec = -1;
1529			level = thread->locklevel - 1;
1530			if (!_LCK_GRANTED(&thread->lockusers[level]))
1531				KSE_WAITQ_INSERT(kse, thread);
1532			else
1533				THR_SET_STATE(thread, PS_RUNNING);
1534			break;
1535
1536		case PS_SIGWAIT:
1537			KSE_WAITQ_INSERT(kse, thread);
1538			break;
1539		case PS_JOIN:
1540		case PS_MUTEX_WAIT:
1541		case PS_SIGSUSPEND:
1542		case PS_SUSPENDED:
1543		case PS_DEADLOCK:
1544		default:
1545			/*
1546			 * These states don't timeout.
1547			 */
1548			thread->wakeup_time.tv_sec = -1;
1549			thread->wakeup_time.tv_nsec = -1;
1550
1551			/* Insert into the waiting queue: */
1552			KSE_WAITQ_INSERT(kse, thread);
1553			break;
1554		}
1555		if (thread->state != PS_RUNNING) {
1556			/* Restart the time slice: */
1557			thread->slice_usec = -1;
1558		} else {
1559			if (thread->need_switchout != 0)
1560				/*
1561				 * The thread yielded on its own;
1562				 * restart the timeslice.
1563				 */
1564				thread->slice_usec = -1;
1565			else if ((thread->slice_usec != -1) &&
1566	   		    (thread->attr.sched_policy != SCHED_FIFO)) {
1567				thread->slice_usec += (thread->tmbx.tm_uticks
1568				    + thread->tmbx.tm_sticks) * _clock_res_usec;
1569				/* Check for time quantum exceeded: */
1570				if (thread->slice_usec > TIMESLICE_USEC)
1571					thread->slice_usec = -1;
1572			}
1573			if (thread->slice_usec == -1) {
1574				/*
1575				 * The thread exceeded its time quantum or
1576				 * it yielded the CPU; place it at the tail
1577				 * of the queue for its priority.
1578				 */
1579				KSE_RUNQ_INSERT_TAIL(kse, thread);
1580			} else {
1581				/*
1582				 * The thread hasn't exceeded its interval
1583				 * Place it at the head of the queue for its
1584				 * priority.
1585				 */
1586				KSE_RUNQ_INSERT_HEAD(kse, thread);
1587			}
1588		}
1589	}
1590	thread->active = 0;
1591	thread->need_switchout = 0;
1592	if (thread->check_pending != 0) {
1593		/* Install pending signals into the frame. */
1594		thread->check_pending = 0;
1595		KSE_LOCK_ACQUIRE(kse, &_thread_signal_lock);
1596		for (i = 1; i <= _SIG_MAXSIG; i++) {
1597			if (SIGISMEMBER(thread->sigmask, i))
1598				continue;
1599			if (SIGISMEMBER(thread->sigpend, i))
1600				_thr_sig_add(thread, i, &thread->siginfo[i-1]);
1601			else if (SIGISMEMBER(_thr_proc_sigpending, i) &&
1602				_thr_getprocsig_unlocked(i, &siginfo)) {
1603				_thr_sig_add(thread, i, &siginfo);
1604			}
1605		}
1606		KSE_LOCK_RELEASE(kse, &_thread_signal_lock);
1607	}
1608}
1609
1610/*
1611 * This function waits for the smallest timeout value of any waiting
1612 * thread, or until it receives a message from another KSE.
1613 *
1614 * This must be called with the scheduling lock held.
1615 */
1616static void
1617kse_wait(struct kse *kse, struct pthread *td_wait)
1618{
1619	struct timespec ts, ts_sleep;
1620	int saved_flags;
1621
1622	KSE_GET_TOD(kse, &ts);
1623
1624	if ((td_wait == NULL) || (td_wait->wakeup_time.tv_sec < 0)) {
1625		/* Limit sleep to no more than 1 minute. */
1626		ts_sleep.tv_sec = 60;
1627		ts_sleep.tv_nsec = 0;
1628	} else {
1629		TIMESPEC_SUB(&ts_sleep, &td_wait->wakeup_time, &ts);
1630		if (ts_sleep.tv_sec > 60) {
1631			ts_sleep.tv_sec = 60;
1632			ts_sleep.tv_nsec = 0;
1633		}
1634	}
1635	/* Don't sleep for negative times. */
1636	if ((ts_sleep.tv_sec >= 0) && (ts_sleep.tv_nsec >= 0)) {
1637		KSE_SET_IDLE(kse);
1638		kse->k_kseg->kg_idle_kses++;
1639		KSE_SCHED_UNLOCK(kse, kse->k_kseg);
1640		saved_flags = kse->k_mbx.km_flags;
1641		kse->k_mbx.km_flags |= KMF_NOUPCALL;
1642		kse_release(&ts_sleep);
1643		kse->k_mbx.km_flags = saved_flags;
1644		KSE_SCHED_LOCK(kse, kse->k_kseg);
1645		if (KSE_IS_IDLE(kse)) {
1646			KSE_CLEAR_IDLE(kse);
1647			kse->k_kseg->kg_idle_kses--;
1648		}
1649	}
1650}
1651
1652/*
1653 * Avoid calling this kse_exit() so as not to confuse it with the
1654 * system call of the same name.
1655 */
1656static void
1657kse_fini(struct kse *kse)
1658{
1659	/* struct kse_group *free_kseg = NULL; */
1660	struct timespec ts;
1661
1662	/*
1663	 * Check to see if this is one of the main kses.
1664	 */
1665	if (kse->k_kseg != _kse_initial->k_kseg) {
1666		PANIC("shouldn't get here");
1667		/* This is for supporting thread groups. */
1668#ifdef NOT_YET
1669		/* Remove this KSE from the KSEG's list of KSEs. */
1670		KSE_SCHED_LOCK(kse, kse->k_kseg);
1671		TAILQ_REMOVE(&kse->k_kseg->kg_kseq, kse, k_kgqe);
1672		kse->k_kseg->kg_ksecount--;
1673		if (TAILQ_EMPTY(&kse->k_kseg->kg_kseq))
1674			free_kseg = kse->k_kseg;
1675		KSE_SCHED_UNLOCK(kse, kse->k_kseg);
1676
1677		/*
1678		 * Add this KSE to the list of free KSEs along with
1679		 * the KSEG if is now orphaned.
1680		 */
1681		KSE_LOCK_ACQUIRE(kse, &kse_lock);
1682		if (free_kseg != NULL)
1683			kseg_free_unlocked(free_kseg);
1684		kse_free_unlocked(kse);
1685		KSE_LOCK_RELEASE(kse, &kse_lock);
1686		kse_exit();
1687		/* Never returns. */
1688		PANIC("kse_exit()");
1689#endif
1690	} else {
1691#ifdef NOT_YET
1692		/*
1693		 * In future, we might allow program to kill
1694		 * kse in initial group.
1695		 */
1696		if (kse != _kse_initial) {
1697			KSE_SCHED_LOCK(kse, kse->k_kseg);
1698			TAILQ_REMOVE(&kse->k_kseg->kg_kseq, kse, k_kgqe);
1699			kse->k_kseg->kg_ksecount--;
1700			KSE_SCHED_UNLOCK(kse, kse->k_kseg);
1701			KSE_LOCK_ACQUIRE(kse, &kse_lock);
1702			kse_free_unlocked(kse);
1703			KSE_LOCK_RELEASE(kse, &kse_lock);
1704			kse_exit();
1705                        /* Never returns. */
1706                        PANIC("kse_exit() failed for initial kseg");
1707                }
1708#endif
1709		KSE_SCHED_LOCK(kse, kse->k_kseg);
1710		KSE_SET_IDLE(kse);
1711		kse->k_kseg->kg_idle_kses++;
1712		KSE_SCHED_UNLOCK(kse, kse->k_kseg);
1713		ts.tv_sec = 120;
1714		ts.tv_nsec = 0;
1715		kse->k_mbx.km_flags = 0;
1716		kse_release(&ts);
1717		/* Never reach */
1718	}
1719}
1720
1721void
1722_thr_set_timeout(const struct timespec *timeout)
1723{
1724	struct pthread	*curthread = _get_curthread();
1725	struct timespec ts;
1726
1727	/* Reset the timeout flag for the running thread: */
1728	curthread->timeout = 0;
1729
1730	/* Check if the thread is to wait forever: */
1731	if (timeout == NULL) {
1732		/*
1733		 * Set the wakeup time to something that can be recognised as
1734		 * different to an actual time of day:
1735		 */
1736		curthread->wakeup_time.tv_sec = -1;
1737		curthread->wakeup_time.tv_nsec = -1;
1738	}
1739	/* Check if no waiting is required: */
1740	else if ((timeout->tv_sec == 0) && (timeout->tv_nsec == 0)) {
1741		/* Set the wake up time to 'immediately': */
1742		curthread->wakeup_time.tv_sec = 0;
1743		curthread->wakeup_time.tv_nsec = 0;
1744	} else {
1745		/* Calculate the time for the current thread to wakeup: */
1746		KSE_GET_TOD(curthread->kse, &ts);
1747		TIMESPEC_ADD(&curthread->wakeup_time, &ts, timeout);
1748	}
1749}
1750
1751void
1752_thr_panic_exit(char *file, int line, char *msg)
1753{
1754	char buf[256];
1755
1756	snprintf(buf, sizeof(buf), "(%s:%d) %s\n", file, line, msg);
1757	__sys_write(2, buf, strlen(buf));
1758	abort();
1759}
1760
1761void
1762_thr_setrunnable(struct pthread *curthread, struct pthread *thread)
1763{
1764	kse_critical_t crit;
1765
1766	crit = _kse_critical_enter();
1767	KSE_SCHED_LOCK(curthread->kse, thread->kseg);
1768	_thr_setrunnable_unlocked(thread);
1769	KSE_SCHED_UNLOCK(curthread->kse, thread->kseg);
1770	_kse_critical_leave(crit);
1771}
1772
1773void
1774_thr_setrunnable_unlocked(struct pthread *thread)
1775{
1776	if ((thread->kseg->kg_flags & KGF_SINGLE_THREAD) != 0) {
1777		/* No silly queues for these threads. */
1778		if ((thread->flags & THR_FLAGS_SUSPENDED) != 0)
1779			THR_SET_STATE(thread, PS_SUSPENDED);
1780		else
1781			THR_SET_STATE(thread, PS_RUNNING);
1782	} else if (thread->state != PS_RUNNING) {
1783		if ((thread->flags & THR_FLAGS_IN_WAITQ) != 0)
1784			KSE_WAITQ_REMOVE(thread->kse, thread);
1785		if ((thread->flags & THR_FLAGS_SUSPENDED) != 0)
1786			THR_SET_STATE(thread, PS_SUSPENDED);
1787		else {
1788			THR_SET_STATE(thread, PS_RUNNING);
1789			if ((thread->blocked == 0) && (thread->active == 0) &&
1790			    (thread->flags & THR_FLAGS_IN_RUNQ) == 0)
1791				THR_RUNQ_INSERT_TAIL(thread);
1792		}
1793	}
1794        /*
1795         * XXX - Threads are not yet assigned to specific KSEs; they are
1796         *       assigned to the KSEG.  So the fact that a thread's KSE is
1797         *       waiting doesn't necessarily mean that it will be the KSE
1798         *       that runs the thread after the lock is granted.  But we
1799         *       don't know if the other KSEs within the same KSEG are
1800         *       also in a waiting state or not so we err on the side of
1801         *       caution and wakeup the thread's last known KSE.  We
1802         *       ensure that the threads KSE doesn't change while it's
1803         *       scheduling lock is held so it is safe to reference it
1804         *       (the KSE).  If the KSE wakes up and doesn't find any more
1805         *       work it will again go back to waiting so no harm is done.
1806         */
1807	kse_wakeup_one(thread);
1808}
1809
1810static void
1811kse_wakeup_one(struct pthread *thread)
1812{
1813	struct kse *ke;
1814
1815	if (KSE_IS_IDLE(thread->kse)) {
1816		KSE_CLEAR_IDLE(thread->kse);
1817		thread->kseg->kg_idle_kses--;
1818		KSE_WAKEUP(thread->kse);
1819	} else {
1820		TAILQ_FOREACH(ke, &thread->kseg->kg_kseq, k_kgqe) {
1821			if (KSE_IS_IDLE(ke)) {
1822				KSE_CLEAR_IDLE(ke);
1823				ke->k_kseg->kg_idle_kses--;
1824				KSE_WAKEUP(ke);
1825				return;
1826			}
1827		}
1828	}
1829}
1830
1831static void
1832kse_wakeup_multi(struct kse *curkse)
1833{
1834	struct kse *ke;
1835	int tmp;
1836
1837	if ((tmp = KSE_RUNQ_THREADS(curkse)) && curkse->k_kseg->kg_idle_kses) {
1838		TAILQ_FOREACH(ke, &curkse->k_kseg->kg_kseq, k_kgqe) {
1839			if (KSE_IS_IDLE(ke)) {
1840				KSE_CLEAR_IDLE(ke);
1841				ke->k_kseg->kg_idle_kses--;
1842				KSE_WAKEUP(ke);
1843				if (--tmp == 0)
1844					break;
1845			}
1846		}
1847	}
1848}
1849
1850struct pthread *
1851_get_curthread(void)
1852{
1853	return (_ksd_curthread());
1854}
1855
1856/* This assumes the caller has disabled upcalls. */
1857struct kse *
1858_get_curkse(void)
1859{
1860	return (_ksd_curkse());
1861}
1862
1863void
1864_set_curkse(struct kse *kse)
1865{
1866	_ksd_setprivate(&kse->k_ksd);
1867}
1868
1869/*
1870 * Allocate a new KSEG.
1871 *
1872 * We allow the current thread to be NULL in the case that this
1873 * is the first time a KSEG is being created (library initialization).
1874 * In this case, we don't need to (and can't) take any locks.
1875 */
1876struct kse_group *
1877_kseg_alloc(struct pthread *curthread)
1878{
1879	struct kse_group *kseg = NULL;
1880	kse_critical_t crit;
1881
1882	if ((curthread != NULL) && (free_kseg_count > 0)) {
1883		/* Use the kse lock for the kseg queue. */
1884		crit = _kse_critical_enter();
1885		KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
1886		if ((kseg = TAILQ_FIRST(&free_kse_groupq)) != NULL) {
1887			TAILQ_REMOVE(&free_kse_groupq, kseg, kg_qe);
1888			free_kseg_count--;
1889			active_kseg_count++;
1890			TAILQ_INSERT_TAIL(&active_kse_groupq, kseg, kg_qe);
1891		}
1892		KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
1893		_kse_critical_leave(crit);
1894		if (kseg)
1895			kseg_reinit(kseg);
1896	}
1897
1898	/*
1899	 * If requested, attempt to allocate a new KSE group only if the
1900	 * KSE allocation was successful and a KSE group wasn't found in
1901	 * the free list.
1902	 */
1903	if ((kseg == NULL) &&
1904	    ((kseg = (struct kse_group *)malloc(sizeof(*kseg))) != NULL)) {
1905		if (_pq_alloc(&kseg->kg_schedq.sq_runq,
1906		    THR_MIN_PRIORITY, THR_LAST_PRIORITY) != 0) {
1907			free(kseg);
1908			kseg = NULL;
1909		} else {
1910			kseg_init(kseg);
1911			/* Add the KSEG to the list of active KSEGs. */
1912			if (curthread != NULL) {
1913				crit = _kse_critical_enter();
1914				KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
1915				active_kseg_count++;
1916				TAILQ_INSERT_TAIL(&active_kse_groupq,
1917				    kseg, kg_qe);
1918				KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
1919				_kse_critical_leave(crit);
1920			} else {
1921				active_kseg_count++;
1922				TAILQ_INSERT_TAIL(&active_kse_groupq,
1923				    kseg, kg_qe);
1924			}
1925		}
1926	}
1927	return (kseg);
1928}
1929
1930/*
1931 * This must be called with the kse lock held and when there are
1932 * no more threads that reference it.
1933 */
1934static void
1935kseg_free_unlocked(struct kse_group *kseg)
1936{
1937	TAILQ_REMOVE(&active_kse_groupq, kseg, kg_qe);
1938	TAILQ_INSERT_HEAD(&free_kse_groupq, kseg, kg_qe);
1939	free_kseg_count++;
1940	active_kseg_count--;
1941}
1942
1943void
1944_kseg_free(struct kse_group *kseg)
1945{
1946	struct kse *curkse;
1947	kse_critical_t crit;
1948
1949	crit = _kse_critical_enter();
1950	curkse = _get_curkse();
1951	KSE_LOCK_ACQUIRE(curkse, &kse_lock);
1952	kseg_free_unlocked(kseg);
1953	KSE_LOCK_RELEASE(curkse, &kse_lock);
1954	_kse_critical_leave(crit);
1955}
1956
1957/*
1958 * Allocate a new KSE.
1959 *
1960 * We allow the current thread to be NULL in the case that this
1961 * is the first time a KSE is being created (library initialization).
1962 * In this case, we don't need to (and can't) take any locks.
1963 */
1964struct kse *
1965_kse_alloc(struct pthread *curthread)
1966{
1967	struct kse *kse = NULL;
1968	kse_critical_t crit;
1969	int need_ksd = 0;
1970	int i;
1971
1972	if ((curthread != NULL) && (free_kse_count > 0)) {
1973		crit = _kse_critical_enter();
1974		KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
1975		/* Search for a finished KSE. */
1976		kse = TAILQ_FIRST(&free_kseq);
1977		while ((kse != NULL) &&
1978		    ((kse->k_mbx.km_flags & KMF_DONE) == 0)) {
1979			kse = TAILQ_NEXT(kse, k_qe);
1980		}
1981		if (kse != NULL) {
1982			DBG_MSG("found an unused kse.\n");
1983			TAILQ_REMOVE(&free_kseq, kse, k_qe);
1984			free_kse_count--;
1985			TAILQ_INSERT_TAIL(&active_kseq, kse, k_qe);
1986			active_kse_count++;
1987		}
1988		KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
1989		_kse_critical_leave(crit);
1990		if (kse != NULL)
1991			kse_reinit(kse);
1992	}
1993	if ((kse == NULL) &&
1994	    ((kse = (struct kse *)malloc(sizeof(*kse))) != NULL)) {
1995		bzero(kse, sizeof(*kse));
1996
1997		/* Initialize the lockusers. */
1998		for (i = 0; i < MAX_KSE_LOCKLEVEL; i++) {
1999			_lockuser_init(&kse->k_lockusers[i], (void *)kse);
2000			_LCK_SET_PRIVATE2(&kse->k_lockusers[i], NULL);
2001		}
2002		/* _lock_init(kse->k_lock, ...) */
2003
2004		/* We had to malloc a kse; mark it as needing a new ID.*/
2005		need_ksd = 1;
2006
2007		/*
2008		 * Create the KSE context.
2009		 *
2010		 * XXX - For now this is done here in the allocation.
2011		 *       In the future, we may want to have it done
2012		 *       outside the allocation so that scope system
2013		 *       threads (one thread per KSE) are not required
2014		 *       to have a stack for an unneeded kse upcall.
2015		 */
2016		kse->k_mbx.km_func = (kse_func_t *)kse_sched_multi;
2017		kse->k_mbx.km_stack.ss_sp = (char *)malloc(KSE_STACKSIZE);
2018		kse->k_mbx.km_stack.ss_size = KSE_STACKSIZE;
2019		kse->k_mbx.km_udata = (void *)kse;
2020		kse->k_mbx.km_quantum = 20000;
2021		/*
2022		 * We need to keep a copy of the stack in case it
2023		 * doesn't get used; a KSE running a scope system
2024		 * thread will use that thread's stack.
2025		 */
2026		kse->k_stack.ss_sp = kse->k_mbx.km_stack.ss_sp;
2027		kse->k_stack.ss_size = kse->k_mbx.km_stack.ss_size;
2028		if (kse->k_mbx.km_stack.ss_sp == NULL) {
2029			for (i = 0; i < MAX_KSE_LOCKLEVEL; i++) {
2030				_lockuser_destroy(&kse->k_lockusers[i]);
2031			}
2032			/* _lock_destroy(&kse->k_lock); */
2033			free(kse);
2034			kse = NULL;
2035		}
2036	}
2037	if ((kse != NULL) && (need_ksd != 0)) {
2038		/* This KSE needs initialization. */
2039		if (curthread != NULL) {
2040			crit = _kse_critical_enter();
2041			KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
2042		}
2043		/* Initialize KSD inside of the lock. */
2044		if (_ksd_create(&kse->k_ksd, (void *)kse, sizeof(*kse)) != 0) {
2045			if (curthread != NULL) {
2046				KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
2047				_kse_critical_leave(crit);
2048			}
2049			free(kse->k_mbx.km_stack.ss_sp);
2050			for (i = 0; i < MAX_KSE_LOCKLEVEL; i++) {
2051				_lockuser_destroy(&kse->k_lockusers[i]);
2052			}
2053			free(kse);
2054			return (NULL);
2055		}
2056		kse->k_flags = 0;
2057		TAILQ_INSERT_TAIL(&active_kseq, kse, k_qe);
2058		active_kse_count++;
2059		if (curthread != NULL) {
2060			KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
2061			_kse_critical_leave(crit);
2062		}
2063	}
2064	return (kse);
2065}
2066
2067static void
2068kse_reinit(struct kse *kse)
2069{
2070	/*
2071	 * XXX - For now every kse has its stack.
2072	 *       In the future, we may want to have it done
2073	 *       outside the allocation so that scope system
2074	 *       threads (one thread per KSE) are not required
2075	 *       to have a stack for an unneeded kse upcall.
2076	 */
2077	kse->k_mbx.km_flags = 0;
2078	kse->k_curthread = 0;
2079	kse->k_kseg = 0;
2080	kse->k_schedq = 0;
2081	kse->k_locklevel = 0;
2082	SIGEMPTYSET(kse->k_sigmask);
2083	bzero(&kse->k_sigq, sizeof(kse->k_sigq));
2084	kse->k_check_sigq = 0;
2085	kse->k_flags = 0;
2086	kse->k_waiting = 0;
2087	kse->k_idle = 0;
2088	kse->k_error = 0;
2089	kse->k_cpu = 0;
2090	kse->k_done = 0;
2091	kse->k_switch = 0;
2092}
2093
2094void
2095kse_free_unlocked(struct kse *kse)
2096{
2097	TAILQ_REMOVE(&active_kseq, kse, k_qe);
2098	active_kse_count--;
2099	kse->k_kseg = NULL;
2100	kse->k_mbx.km_quantum = 20000;
2101	kse->k_flags = 0;
2102	TAILQ_INSERT_HEAD(&free_kseq, kse, k_qe);
2103	free_kse_count++;
2104}
2105
2106void
2107_kse_free(struct pthread *curthread, struct kse *kse)
2108{
2109	kse_critical_t crit;
2110
2111	if (curthread == NULL)
2112		kse_free_unlocked(kse);
2113	else {
2114		crit = _kse_critical_enter();
2115		KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
2116		kse_free_unlocked(kse);
2117		KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
2118		_kse_critical_leave(crit);
2119	}
2120}
2121
2122static void
2123kseg_init(struct kse_group *kseg)
2124{
2125	kseg_reinit(kseg);
2126	_lock_init(&kseg->kg_lock, LCK_ADAPTIVE, _kse_lock_wait,
2127	    _kse_lock_wakeup);
2128}
2129
2130static void
2131kseg_reinit(struct kse_group *kseg)
2132{
2133	TAILQ_INIT(&kseg->kg_kseq);
2134	TAILQ_INIT(&kseg->kg_threadq);
2135	TAILQ_INIT(&kseg->kg_schedq.sq_waitq);
2136	kseg->kg_threadcount = 0;
2137	kseg->kg_ksecount = 0;
2138	kseg->kg_idle_kses = 0;
2139	kseg->kg_flags = 0;
2140}
2141
2142struct pthread *
2143_thr_alloc(struct pthread *curthread)
2144{
2145	kse_critical_t crit;
2146	void *p;
2147	struct pthread *thread = NULL;
2148
2149	if (curthread != NULL) {
2150		if (GC_NEEDED())
2151			_thr_gc(curthread);
2152		if (free_thread_count > 0) {
2153			crit = _kse_critical_enter();
2154			KSE_LOCK_ACQUIRE(curthread->kse, &thread_lock);
2155			if ((thread = TAILQ_FIRST(&free_threadq)) != NULL) {
2156				TAILQ_REMOVE(&free_threadq, thread, tle);
2157				free_thread_count--;
2158			}
2159			KSE_LOCK_RELEASE(curthread->kse, &thread_lock);
2160			_kse_critical_leave(crit);
2161		}
2162	}
2163	if (thread == NULL) {
2164		p = malloc(sizeof(struct pthread) + THR_ALIGNBYTES);
2165		if (p != NULL) {
2166			thread = (struct pthread *)THR_ALIGN(p);
2167			thread->alloc_addr = p;
2168		}
2169	}
2170	return (thread);
2171}
2172
2173void
2174_thr_free(struct pthread *curthread, struct pthread *thread)
2175{
2176	kse_critical_t crit;
2177	int i;
2178
2179	DBG_MSG("Freeing thread %p\n", thread);
2180	if ((curthread == NULL) || (free_thread_count >= MAX_CACHED_THREADS)) {
2181		for (i = 0; i < MAX_THR_LOCKLEVEL; i++) {
2182			_lockuser_destroy(&thread->lockusers[i]);
2183		}
2184		_lock_destroy(&thread->lock);
2185		free(thread->alloc_addr);
2186	}
2187	else {
2188		crit = _kse_critical_enter();
2189		KSE_LOCK_ACQUIRE(curthread->kse, &thread_lock);
2190		TAILQ_INSERT_TAIL(&free_threadq, thread, tle);
2191		free_thread_count++;
2192		KSE_LOCK_RELEASE(curthread->kse, &thread_lock);
2193		_kse_critical_leave(crit);
2194	}
2195}
2196
2197/*
2198 * Add an active thread:
2199 *
2200 *   o Assign the thread a unique id (which GDB uses to track
2201 *     threads.
2202 *   o Add the thread to the list of all threads and increment
2203 *     number of active threads.
2204 */
2205static void
2206thr_link(struct pthread *thread)
2207{
2208	kse_critical_t crit;
2209	struct kse *curkse;
2210	struct pthread *curthread;
2211
2212	crit = _kse_critical_enter();
2213	curkse = _get_curkse();
2214	curthread = _get_curthread();
2215	thread->sigmask = curthread->sigmask;
2216	KSE_LOCK_ACQUIRE(curkse, &_thread_list_lock);
2217	/*
2218	 * Initialize the unique id (which GDB uses to track
2219	 * threads), add the thread to the list of all threads,
2220	 * and
2221	 */
2222	thread->uniqueid = next_uniqueid++;
2223	THR_LIST_ADD(thread);
2224	active_threads++;
2225	KSE_LOCK_RELEASE(curkse, &_thread_list_lock);
2226
2227	_kse_critical_leave(crit);
2228}
2229
2230/*
2231 * Remove an active thread.
2232 */
2233static void
2234thr_unlink(struct pthread *thread)
2235{
2236	kse_critical_t crit;
2237	struct kse *curkse;
2238
2239	crit = _kse_critical_enter();
2240	curkse = _get_curkse();
2241
2242	KSE_LOCK_ACQUIRE(curkse, &_thread_list_lock);
2243	THR_LIST_REMOVE(thread);
2244	active_threads--;
2245	KSE_LOCK_RELEASE(curkse, &_thread_list_lock);
2246
2247	_kse_critical_leave(crit);
2248}
2249