thr_kern.c revision 115798
1/*
2 * Copyright (C) 2003 Daniel M. Eischen <deischen@freebsd.org>
3 * Copyright (C) 2002 Jonathon Mini <mini@freebsd.org>
4 * Copyright (c) 1995-1998 John Birrell <jb@cimlogic.com.au>
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 * 3. All advertising materials mentioning features or use of this software
16 *    must display the following acknowledgement:
17 *	This product includes software developed by John Birrell.
18 * 4. Neither the name of the author nor the names of any co-contributors
19 *    may be used to endorse or promote products derived from this software
20 *    without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY JOHN BIRRELL AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 */
35#include <sys/cdefs.h>
36__FBSDID("$FreeBSD: head/lib/libkse/thread/thr_kern.c 115798 2003-06-04 03:22:59Z davidxu $");
37
38#include <sys/types.h>
39#include <sys/kse.h>
40#include <sys/signalvar.h>
41#include <sys/queue.h>
42#include <machine/atomic.h>
43
44#include <assert.h>
45#include <errno.h>
46#include <signal.h>
47#include <stdlib.h>
48#include <string.h>
49#include <time.h>
50#include <ucontext.h>
51#include <unistd.h>
52
53#include "atomic_ops.h"
54#include "thr_private.h"
55#include "pthread_md.h"
56#include "libc_private.h"
57
58/*#define DEBUG_THREAD_KERN */
59#ifdef DEBUG_THREAD_KERN
60#define DBG_MSG		stdout_debug
61#else
62#define DBG_MSG(x...)
63#endif
64
65/*
66 * Define a high water mark for the maximum number of threads that
67 * will be cached.  Once this level is reached, any extra threads
68 * will be free()'d.
69 *
70 * XXX - It doesn't make sense to worry about the maximum number of
71 *       KSEs that we can cache because the system will limit us to
72 *       something *much* less than the maximum number of threads
73 *       that we can have.  Disregarding KSEs in their own group,
74 *       the maximum number of KSEs is the number of processors in
75 *       the system.
76 */
77#define	MAX_CACHED_THREADS	100
78#define	KSE_STACKSIZE		16384
79
80#define	KSE_SET_MBOX(kse, thrd) \
81	(kse)->k_mbx.km_curthread = &(thrd)->tmbx
82
83#define	KSE_SET_EXITED(kse)	(kse)->k_flags |= KF_EXITED
84
85/*
86 * Macros for manipulating the run queues.  The priority queue
87 * routines use the thread's pqe link and also handle the setting
88 * and clearing of the thread's THR_FLAGS_IN_RUNQ flag.
89 */
90#define	KSE_RUNQ_INSERT_HEAD(kse, thrd)			\
91	_pq_insert_head(&(kse)->k_schedq->sq_runq, thrd)
92#define	KSE_RUNQ_INSERT_TAIL(kse, thrd)			\
93	_pq_insert_tail(&(kse)->k_schedq->sq_runq, thrd)
94#define	KSE_RUNQ_REMOVE(kse, thrd)			\
95	_pq_remove(&(kse)->k_schedq->sq_runq, thrd)
96#define	KSE_RUNQ_FIRST(kse)	_pq_first(&(kse)->k_schedq->sq_runq)
97
98#define KSE_RUNQ_THREADS(kse)	((kse)->k_schedq->sq_runq.pq_threads)
99
100/*
101 * We've got to keep track of everything that is allocated, not only
102 * to have a speedy free list, but also so they can be deallocated
103 * after a fork().
104 */
105static TAILQ_HEAD(, kse)	active_kseq;
106static TAILQ_HEAD(, kse)	free_kseq;
107static TAILQ_HEAD(, kse_group)	free_kse_groupq;
108static TAILQ_HEAD(, kse_group)	active_kse_groupq;
109static TAILQ_HEAD(, kse_group)	gc_ksegq;
110static struct lock		kse_lock;	/* also used for kseg queue */
111static int			free_kse_count = 0;
112static int			free_kseg_count = 0;
113static TAILQ_HEAD(, pthread)	free_threadq;
114static struct lock		thread_lock;
115static int			free_thread_count = 0;
116static int			inited = 0;
117static int			active_threads = 1;
118static int			active_kse_count = 0;
119static int			active_kseg_count = 0;
120static u_int64_t		next_uniqueid = 1;
121
122
123#ifdef DEBUG_THREAD_KERN
124static void	dump_queues(struct kse *curkse);
125#endif
126static void	kse_check_completed(struct kse *kse);
127static void	kse_check_waitq(struct kse *kse);
128static void	kse_check_signals(struct kse *kse);
129static void	kse_fini(struct kse *curkse);
130static void	kse_reinit(struct kse *kse);
131static void	kse_sched_multi(struct kse *curkse);
132#ifdef NOT_YET
133static void	kse_sched_single(struct kse *curkse);
134#endif
135static void	kse_switchout_thread(struct kse *kse, struct pthread *thread);
136static void	kse_wait(struct kse *kse, struct pthread *td_wait);
137static void	kse_free_unlocked(struct kse *kse);
138static void	kseg_free_unlocked(struct kse_group *kseg);
139static void	kseg_init(struct kse_group *kseg);
140static void	kseg_reinit(struct kse_group *kseg);
141static void	kse_waitq_insert(struct pthread *thread);
142static void	kse_wakeup_multi(struct kse *curkse);
143static void	kse_wakeup_one(struct pthread *thread);
144static void	thr_cleanup(struct kse *kse, struct pthread *curthread);
145static void	thr_link(struct pthread *thread);
146static void	thr_resume_wrapper(int unused_1, siginfo_t *unused_2,
147		    ucontext_t *ucp);
148static void	thr_resume_check(struct pthread *curthread, ucontext_t *ucp,
149		    struct pthread_sigframe *psf);
150static int	thr_timedout(struct pthread *thread, struct timespec *curtime);
151static void	thr_unlink(struct pthread *thread);
152
153/*
154 * This is called after a fork().
155 * No locks need to be taken here since we are guaranteed to be
156 * single threaded.
157 */
158void
159_kse_single_thread(struct pthread *curthread)
160{
161	struct kse *kse;
162	struct kse_group *kseg;
163	struct pthread *thread;
164	kse_critical_t crit;
165	int i;
166
167	/*
168	 * Disable upcalls and clear the threaded flag.
169	 * XXX - I don't think we need to disable upcalls after a fork().
170	 *       but it doesn't hurt.
171	 */
172	crit = _kse_critical_enter();
173	__isthreaded = 0;
174	active_threads = 1;
175
176	/*
177	 * Enter a loop to remove and free all threads other than
178	 * the running thread from the active thread list:
179	 */
180	while ((thread = TAILQ_FIRST(&_thread_list)) != NULL) {
181		THR_GCLIST_REMOVE(thread);
182		/*
183		 * Remove this thread from the list (the current
184		 * thread will be removed but re-added by libpthread
185		 * initialization.
186		 */
187		TAILQ_REMOVE(&_thread_list, thread, tle);
188		/* Make sure this isn't the running thread: */
189		if (thread != curthread) {
190			_thr_stack_free(&thread->attr);
191			if (thread->specific != NULL)
192				free(thread->specific);
193			for (i = 0; i < MAX_THR_LOCKLEVEL; i++) {
194				_lockuser_destroy(&thread->lockusers[i]);
195			}
196			_lock_destroy(&thread->lock);
197			free(thread);
198		}
199	}
200
201	TAILQ_INIT(&curthread->mutexq);		/* initialize mutex queue */
202	curthread->joiner = NULL;		/* no joining threads yet */
203	curthread->refcount = 0;
204	sigemptyset(&curthread->sigpend);	/* clear pending signals */
205	if (curthread->specific != NULL) {
206		free(curthread->specific);
207		curthread->specific = NULL;
208		curthread->specific_data_count = 0;
209	}
210
211	/* Free the free KSEs: */
212	while ((kse = TAILQ_FIRST(&free_kseq)) != NULL) {
213		TAILQ_REMOVE(&free_kseq, kse, k_qe);
214		for (i = 0; i < MAX_KSE_LOCKLEVEL; i++) {
215			_lockuser_destroy(&kse->k_lockusers[i]);
216		}
217		_lock_destroy(&kse->k_lock);
218		_ksd_destroy(&kse->k_ksd);
219		if (kse->k_stack.ss_sp != NULL)
220			free(kse->k_stack.ss_sp);
221		free(kse);
222	}
223	free_kse_count = 0;
224
225	/* Free the active KSEs: */
226	while ((kse = TAILQ_FIRST(&active_kseq)) != NULL) {
227		TAILQ_REMOVE(&active_kseq, kse, k_qe);
228		for (i = 0; i < MAX_KSE_LOCKLEVEL; i++) {
229			_lockuser_destroy(&kse->k_lockusers[i]);
230		}
231		_lock_destroy(&kse->k_lock);
232		if (kse->k_stack.ss_sp != NULL)
233			free(kse->k_stack.ss_sp);
234		free(kse);
235	}
236	active_kse_count = 0;
237
238	/* Free the free KSEGs: */
239	while ((kseg = TAILQ_FIRST(&free_kse_groupq)) != NULL) {
240		TAILQ_REMOVE(&free_kse_groupq, kseg, kg_qe);
241		_lock_destroy(&kseg->kg_lock);
242		_pq_free(&kseg->kg_schedq.sq_runq);
243		free(kseg);
244	}
245	free_kseg_count = 0;
246
247	/* Free the active KSEGs: */
248	while ((kseg = TAILQ_FIRST(&active_kse_groupq)) != NULL) {
249		TAILQ_REMOVE(&active_kse_groupq, kseg, kg_qe);
250		_lock_destroy(&kseg->kg_lock);
251		_pq_free(&kseg->kg_schedq.sq_runq);
252		free(kseg);
253	}
254	active_kseg_count = 0;
255
256	/* Free the free threads. */
257	while ((thread = TAILQ_FIRST(&free_threadq)) != NULL) {
258		TAILQ_REMOVE(&free_threadq, thread, tle);
259		if (thread->specific != NULL)
260			free(thread->specific);
261		for (i = 0; i < MAX_THR_LOCKLEVEL; i++) {
262			_lockuser_destroy(&thread->lockusers[i]);
263		}
264		_lock_destroy(&thread->lock);
265		free(thread);
266	}
267	free_thread_count = 0;
268
269	/* Free the to-be-gc'd threads. */
270	while ((thread = TAILQ_FIRST(&_thread_gc_list)) != NULL) {
271		TAILQ_REMOVE(&_thread_gc_list, thread, gcle);
272		for (i = 0; i < MAX_THR_LOCKLEVEL; i++) {
273			_lockuser_destroy(&thread->lockusers[i]);
274		}
275		_lock_destroy(&thread->lock);
276		free(thread);
277	}
278	TAILQ_INIT(&gc_ksegq);
279	_gc_count = 0;
280
281	if (inited != 0) {
282		/*
283		 * Destroy these locks; they'll be recreated to assure they
284		 * are in the unlocked state.
285		 */
286		_lock_destroy(&kse_lock);
287		_lock_destroy(&thread_lock);
288		_lock_destroy(&_thread_list_lock);
289		inited = 0;
290	}
291
292	/*
293	 * After a fork(), the leftover thread goes back to being
294	 * scope process.
295	 */
296	curthread->attr.flags &= ~PTHREAD_SCOPE_SYSTEM;
297	curthread->attr.flags |= PTHREAD_SCOPE_PROCESS;
298
299	/*
300	 * After a fork, we are still operating on the thread's original
301	 * stack.  Don't clear the THR_FLAGS_USER from the thread's
302	 * attribute flags.
303	 */
304
305	/* Initialize the threads library. */
306	curthread->kse = NULL;
307	curthread->kseg = NULL;
308	_kse_initial = NULL;
309	_libpthread_init(curthread);
310}
311
312/*
313 * This is used to initialize housekeeping and to initialize the
314 * KSD for the KSE.
315 */
316void
317_kse_init(void)
318{
319	if (inited == 0) {
320		TAILQ_INIT(&active_kseq);
321		TAILQ_INIT(&active_kse_groupq);
322		TAILQ_INIT(&free_kseq);
323		TAILQ_INIT(&free_kse_groupq);
324		TAILQ_INIT(&free_threadq);
325		TAILQ_INIT(&gc_ksegq);
326		if (_lock_init(&kse_lock, LCK_ADAPTIVE,
327		    _kse_lock_wait, _kse_lock_wakeup) != 0)
328			PANIC("Unable to initialize free KSE queue lock");
329		if (_lock_init(&thread_lock, LCK_ADAPTIVE,
330		    _kse_lock_wait, _kse_lock_wakeup) != 0)
331			PANIC("Unable to initialize free thread queue lock");
332		if (_lock_init(&_thread_list_lock, LCK_ADAPTIVE,
333		    _kse_lock_wait, _kse_lock_wakeup) != 0)
334			PANIC("Unable to initialize thread list lock");
335		active_kse_count = 0;
336		active_kseg_count = 0;
337		_gc_count = 0;
338		inited = 1;
339	}
340}
341
342int
343_kse_isthreaded(void)
344{
345	return (__isthreaded != 0);
346}
347
348/*
349 * This is called when the first thread (other than the initial
350 * thread) is created.
351 */
352int
353_kse_setthreaded(int threaded)
354{
355	if ((threaded != 0) && (__isthreaded == 0)) {
356		/*
357		 * Locking functions in libc are required when there are
358		 * threads other than the initial thread.
359		 */
360		__isthreaded = 1;
361
362		/*
363		 * Tell the kernel to create a KSE for the initial thread
364		 * and enable upcalls in it.
365		 */
366		_kse_initial->k_flags |= KF_STARTED;
367		if (kse_create(&_kse_initial->k_mbx, 0) != 0) {
368			_kse_initial->k_flags &= ~KF_STARTED;
369			__isthreaded = 0;
370			/* may abort() */
371			DBG_MSG("kse_create failed\n");
372			return (-1);
373		}
374		KSE_SET_MBOX(_kse_initial, _thr_initial);
375		_thr_setmaxconcurrency();
376	}
377	return (0);
378}
379
380/*
381 * Lock wait and wakeup handlers for KSE locks.  These are only used by
382 * KSEs, and should never be used by threads.  KSE locks include the
383 * KSE group lock (used for locking the scheduling queue) and the
384 * kse_lock defined above.
385 *
386 * When a KSE lock attempt blocks, the entire KSE blocks allowing another
387 * KSE to run.  For the most part, it doesn't make much sense to try and
388 * schedule another thread because you need to lock the scheduling queue
389 * in order to do that.  And since the KSE lock is used to lock the scheduling
390 * queue, you would just end up blocking again.
391 */
392void
393_kse_lock_wait(struct lock *lock, struct lockuser *lu)
394{
395	struct kse *curkse = (struct kse *)_LCK_GET_PRIVATE(lu);
396	struct timespec ts;
397	int saved_flags;
398
399	if (curkse->k_mbx.km_curthread != NULL)
400		PANIC("kse_lock_wait does not disable upcall.\n");
401	/*
402	 * Enter a loop to wait until we get the lock.
403	 */
404	ts.tv_sec = 0;
405	ts.tv_nsec = 1000000;  /* 1 sec */
406	while (!_LCK_GRANTED(lu)) {
407		/*
408		 * Yield the kse and wait to be notified when the lock
409		 * is granted.
410		 */
411		saved_flags = curkse->k_mbx.km_flags;
412		curkse->k_mbx.km_flags |= KMF_NOUPCALL | KMF_NOCOMPLETED;
413		kse_release(&ts);
414		curkse->k_mbx.km_flags = saved_flags;
415	}
416}
417
418void
419_kse_lock_wakeup(struct lock *lock, struct lockuser *lu)
420{
421	struct kse *curkse;
422	struct kse *kse;
423	struct kse_mailbox *mbx;
424
425	curkse = _get_curkse();
426	kse = (struct kse *)_LCK_GET_PRIVATE(lu);
427
428	if (kse == curkse)
429		PANIC("KSE trying to wake itself up in lock");
430	else {
431		mbx = &kse->k_mbx;
432		_lock_grant(lock, lu);
433		/*
434		 * Notify the owning kse that it has the lock.
435		 * It is safe to pass invalid address to kse_wakeup
436		 * even if the mailbox is not in kernel at all,
437		 * and waking up a wrong kse is also harmless.
438		 */
439		kse_wakeup(mbx);
440	}
441}
442
443/*
444 * Thread wait and wakeup handlers for thread locks.  These are only used
445 * by threads, never by KSEs.  Thread locks include the per-thread lock
446 * (defined in its structure), and condition variable and mutex locks.
447 */
448void
449_thr_lock_wait(struct lock *lock, struct lockuser *lu)
450{
451	struct pthread *curthread = (struct pthread *)lu->lu_private;
452
453	do {
454		THR_SCHED_LOCK(curthread, curthread);
455		THR_SET_STATE(curthread, PS_LOCKWAIT);
456		THR_SCHED_UNLOCK(curthread, curthread);
457		_thr_sched_switch(curthread);
458	} while (!_LCK_GRANTED(lu));
459}
460
461void
462_thr_lock_wakeup(struct lock *lock, struct lockuser *lu)
463{
464	struct pthread *thread;
465	struct pthread *curthread;
466
467	curthread = _get_curthread();
468	thread = (struct pthread *)_LCK_GET_PRIVATE(lu);
469
470	THR_SCHED_LOCK(curthread, thread);
471	_lock_grant(lock, lu);
472	_thr_setrunnable_unlocked(thread);
473	THR_SCHED_UNLOCK(curthread, thread);
474}
475
476kse_critical_t
477_kse_critical_enter(void)
478{
479	kse_critical_t crit;
480
481	crit = _ksd_readandclear_tmbx;
482	return (crit);
483}
484
485void
486_kse_critical_leave(kse_critical_t crit)
487{
488	struct pthread *curthread;
489
490	_ksd_set_tmbx(crit);
491	if ((crit != NULL) && ((curthread = _get_curthread()) != NULL))
492		THR_YIELD_CHECK(curthread);
493}
494
495int
496_kse_in_critical(void)
497{
498	return (_ksd_get_tmbx() == NULL);
499}
500
501void
502_thr_critical_enter(struct pthread *thread)
503{
504	thread->critical_count++;
505}
506
507void
508_thr_critical_leave(struct pthread *thread)
509{
510	thread->critical_count--;
511	THR_YIELD_CHECK(thread);
512}
513
514void
515_thr_sched_switch(struct pthread *curthread)
516{
517	struct kse *curkse;
518
519	(void)_kse_critical_enter();
520	curkse = _get_curkse();
521	KSE_SCHED_LOCK(curkse, curkse->k_kseg);
522	_thr_sched_switch_unlocked(curthread);
523}
524
525/*
526 * XXX - We may need to take the scheduling lock before calling
527 *       this, or perhaps take the lock within here before
528 *       doing anything else.
529 */
530void
531_thr_sched_switch_unlocked(struct pthread *curthread)
532{
533	struct pthread *td;
534	struct pthread_sigframe psf;
535	struct kse *curkse;
536	int ret;
537	volatile int uts_once;
538	volatile int resume_once = 0;
539
540	/* We're in the scheduler, 5 by 5: */
541	curkse = _get_curkse();
542
543	curthread->need_switchout = 1;	/* The thread yielded on its own. */
544	curthread->critical_yield = 0;	/* No need to yield anymore. */
545	curthread->slice_usec = -1;	/* Restart the time slice. */
546
547	/* Thread can unlock the scheduler lock. */
548	curthread->lock_switch = 1;
549
550	/*
551	 * The signal frame is allocated off the stack because
552	 * a thread can be interrupted by other signals while
553	 * it is running down pending signals.
554	 */
555	sigemptyset(&psf.psf_sigset);
556	curthread->curframe = &psf;
557
558	/*
559	 * Enter the scheduler if any one of the following is true:
560	 *
561	 *   o The current thread is dead; it's stack needs to be
562	 *     cleaned up and it can't be done while operating on
563	 *     it.
564	 *   o There are no runnable threads.
565	 *   o The next thread to run won't unlock the scheduler
566	 *     lock.  A side note: the current thread may be run
567	 *     instead of the next thread in the run queue, but
568	 *     we don't bother checking for that.
569	 */
570	if ((curthread->state == PS_DEAD) ||
571	    (((td = KSE_RUNQ_FIRST(curkse)) == NULL) &&
572	    (curthread->state != PS_RUNNING)) ||
573	    ((td != NULL) && (td->lock_switch == 0)))
574		_thread_enter_uts(&curthread->tmbx, &curkse->k_mbx);
575	else {
576		uts_once = 0;
577		THR_GETCONTEXT(&curthread->tmbx.tm_context);
578		if (uts_once == 0) {
579			uts_once = 1;
580
581			/* Switchout the current thread. */
582			kse_switchout_thread(curkse, curthread);
583
584		 	/* Choose another thread to run. */
585			td = KSE_RUNQ_FIRST(curkse);
586			KSE_RUNQ_REMOVE(curkse, td);
587			curkse->k_curthread = td;
588
589			/*
590			 * Make sure the current thread's kse points to
591			 * this kse.
592			 */
593			td->kse = curkse;
594
595			/*
596			 * Reset accounting.
597			 */
598			td->tmbx.tm_uticks = 0;
599			td->tmbx.tm_sticks = 0;
600
601			/*
602			 * Reset the time slice if this thread is running
603			 * for the first time or running again after using
604			 * its full time slice allocation.
605			 */
606			if (td->slice_usec == -1)
607				td->slice_usec = 0;
608
609			/* Mark the thread active. */
610			td->active = 1;
611
612			/* Remove the frame reference. */
613			td->curframe = NULL;
614
615			/*
616			 * Continue the thread at its current frame:
617			 */
618			ret = _thread_switch(&td->tmbx, NULL);
619			/* This point should not be reached. */
620			if (ret != 0)
621				PANIC("Bad return from _thread_switch");
622			PANIC("Thread has returned from _thread_switch");
623		}
624	}
625
626	if (curthread->lock_switch != 0) {
627		/*
628		 * Unlock the scheduling queue and leave the
629		 * critical region.
630		 */
631		/* Don't trust this after a switch! */
632		curkse = _get_curkse();
633
634		curthread->lock_switch = 0;
635		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
636		_kse_critical_leave(&curthread->tmbx);
637	}
638	/*
639	 * This thread is being resumed; check for cancellations.
640	 */
641	if ((resume_once == 0) && (!THR_IN_CRITICAL(curthread))) {
642		resume_once = 1;
643		thr_resume_check(curthread, &curthread->tmbx.tm_context, &psf);
644	}
645
646	THR_ACTIVATE_LAST_LOCK(curthread);
647}
648
649/*
650 * This is the scheduler for a KSE which runs a scope system thread.
651 * The multi-thread KSE scheduler should also work for a single threaded
652 * KSE, but we use a separate scheduler so that it can be fine-tuned
653 * to be more efficient (and perhaps not need a separate stack for
654 * the KSE, allowing it to use the thread's stack).
655 *
656 * XXX - This probably needs some work.
657 */
658#ifdef NOT_YET
659static void
660kse_sched_single(struct kse *curkse)
661{
662	struct pthread *curthread = curkse->k_curthread;
663	struct pthread *td_wait;
664	struct timespec ts;
665	int level;
666
667	if (curthread->active == 0) {
668		if (curthread->state != PS_RUNNING) {
669			/* Check to see if the thread has timed out. */
670			KSE_GET_TOD(curkse, &ts);
671			if (thr_timedout(curthread, &ts) != 0) {
672				curthread->timeout = 1;
673				curthread->state = PS_RUNNING;
674			}
675		}
676	}
677
678	/* This thread no longer needs to yield the CPU: */
679	curthread->critical_yield = 0;
680	curthread->need_switchout = 0;
681
682	/*
683	 * Lock the scheduling queue.
684	 *
685	 * There is no scheduling queue for single threaded KSEs,
686	 * but we need a lock for protection regardless.
687	 */
688	KSE_SCHED_LOCK(curkse, curkse->k_kseg);
689
690	/*
691	 * This has to do the job of kse_switchout_thread(), only
692	 * for a single threaded KSE/KSEG.
693	 */
694
695	switch (curthread->state) {
696	case PS_DEAD:
697		/* Unlock the scheduling queue and exit the KSE and thread. */
698		thr_cleaup(curkse, curthread);
699		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
700		break;
701
702	case PS_COND_WAIT:
703	case PS_SLEEP_WAIT:
704		/* Only insert threads that can timeout: */
705		if (curthread->wakeup_time.tv_sec != -1) {
706			/* Insert into the waiting queue: */
707			KSE_WAITQ_INSERT(curkse, curthread);
708		}
709		break;
710
711	case PS_LOCKWAIT:
712		level = curthread->locklevel - 1;
713		if (!_LCK_GRANTED(&curthread->lockusers[level]))
714			KSE_WAITQ_INSERT(curkse, curthread);
715		else
716			THR_SET_STATE(curthread, PS_RUNNING);
717		break;
718
719	case PS_JOIN:
720	case PS_MUTEX_WAIT:
721	case PS_RUNNING:
722	case PS_SIGSUSPEND:
723	case PS_SIGWAIT:
724	case PS_SUSPENDED:
725	case PS_DEADLOCK:
726	default:
727		/*
728		 * These states don't timeout and don't need
729		 * to be in the waiting queue.
730		 */
731		break;
732	}
733	while (curthread->state != PS_RUNNING) {
734		curthread->active = 0;
735		td_wait = KSE_WAITQ_FIRST(curkse);
736
737		kse_wait(curkse, td_wait);
738
739	    	if (td_wait != NULL) {
740			KSE_GET_TOD(curkse, &ts);
741			if (thr_timedout(curthread, &ts)) {
742				/* Indicate the thread timedout: */
743				td_wait->timeout = 1;
744
745				/* Make the thread runnable. */
746				THR_SET_STATE(td_wait, PS_RUNNING);
747				KSE_WAITQ_REMOVE(curkse, td_wait);
748			}
749		}
750		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
751		kse_check_signals(curkse);
752		KSE_SCHED_LOCK(curkse, curkse->k_kseg);
753	}
754
755	/* Remove the frame reference. */
756	curthread->curframe = NULL;
757
758	/* Unlock the scheduling queue. */
759	KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
760
761	/*
762	 * Continue the thread at its current frame:
763	 */
764	DBG_MSG("Continuing bound thread %p\n", curthread);
765	_thread_switch(&curthread->tmbx, &curkse->k_mbx.km_curthread);
766	PANIC("Thread has returned from _thread_switch");
767}
768#endif
769
770#ifdef DEBUG_THREAD_KERN
771static void
772dump_queues(struct kse *curkse)
773{
774	struct pthread *thread;
775
776	DBG_MSG("Threads in waiting queue:\n");
777	TAILQ_FOREACH(thread, &curkse->k_kseg->kg_schedq.sq_waitq, pqe) {
778		DBG_MSG("  thread %p, state %d, blocked %d\n",
779		    thread, thread->state, thread->blocked);
780	}
781}
782#endif
783
784/*
785 * This is the scheduler for a KSE which runs multiple threads.
786 */
787static void
788kse_sched_multi(struct kse *curkse)
789{
790	struct pthread *curthread, *td_wait;
791	struct pthread_sigframe *curframe;
792	int ret;
793
794	THR_ASSERT(curkse->k_mbx.km_curthread == NULL,
795	    "Mailbox not null in kse_sched_multi");
796
797	/* Check for first time initialization: */
798	if ((curkse->k_flags & KF_INITIALIZED) == 0) {
799		/* Setup this KSEs specific data. */
800		_ksd_setprivate(&curkse->k_ksd);
801		_set_curkse(curkse);
802
803		/* Set this before grabbing the context. */
804		curkse->k_flags |= KF_INITIALIZED;
805	}
806
807	/* This may have returned from a kse_release(). */
808	if (KSE_WAITING(curkse)) {
809		DBG_MSG("Entered upcall when KSE is waiting.");
810		KSE_CLEAR_WAIT(curkse);
811	}
812
813	/* Lock the scheduling lock. */
814	curthread = curkse->k_curthread;
815	if ((curthread == NULL) || (curthread->need_switchout == 0)) {
816		/* This is an upcall; take the scheduler lock. */
817		KSE_SCHED_LOCK(curkse, curkse->k_kseg);
818	}
819
820	if (KSE_IS_IDLE(curkse)) {
821		KSE_CLEAR_IDLE(curkse);
822		curkse->k_kseg->kg_idle_kses--;
823	}
824	/*
825	 * If the current thread was completed in another KSE, then
826	 * it will be in the run queue.  Don't mark it as being blocked.
827	 */
828	if ((curthread != NULL) &&
829	    ((curthread->flags & THR_FLAGS_IN_RUNQ) == 0) &&
830	    (curthread->need_switchout == 0)) {
831		/*
832		 * Assume the current thread is blocked; when the
833		 * completed threads are checked and if the current
834		 * thread is among the completed, the blocked flag
835		 * will be cleared.
836		 */
837		curthread->blocked = 1;
838	}
839
840	/* Check for any unblocked threads in the kernel. */
841	kse_check_completed(curkse);
842
843	/*
844	 * Check for threads that have timed-out.
845	 */
846	kse_check_waitq(curkse);
847
848	/*
849	 * Switchout the current thread, if necessary, as the last step
850	 * so that it is inserted into the run queue (if it's runnable)
851	 * _after_ any other threads that were added to it above.
852	 */
853	if (curthread == NULL)
854		;  /* Nothing to do here. */
855	else if ((curthread->need_switchout == 0) &&
856	    (curthread->blocked == 0) && (THR_IN_CRITICAL(curthread))) {
857		/*
858		 * Resume the thread and tell it to yield when
859		 * it leaves the critical region.
860		 */
861		curthread->critical_yield = 1;
862		curthread->active = 1;
863		if ((curthread->flags & THR_FLAGS_IN_RUNQ) != 0)
864			KSE_RUNQ_REMOVE(curkse, curthread);
865		curkse->k_curthread = curthread;
866		curthread->kse = curkse;
867		DBG_MSG("Continuing thread %p in critical region\n",
868		    curthread);
869		kse_wakeup_multi(curkse);
870		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
871		ret = _thread_switch(&curthread->tmbx,
872		    &curkse->k_mbx.km_curthread);
873		if (ret != 0)
874			PANIC("Can't resume thread in critical region\n");
875	}
876	else if ((curthread->flags & THR_FLAGS_IN_RUNQ) == 0)
877		kse_switchout_thread(curkse, curthread);
878	curkse->k_curthread = NULL;
879
880	kse_wakeup_multi(curkse);
881
882	/* This has to be done without the scheduling lock held. */
883	KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
884	kse_check_signals(curkse);
885	KSE_SCHED_LOCK(curkse, curkse->k_kseg);
886
887#ifdef DEBUG_THREAD_KERN
888	dump_queues(curkse);
889#endif
890
891	/* Check if there are no threads ready to run: */
892	while (((curthread = KSE_RUNQ_FIRST(curkse)) == NULL) &&
893	    (curkse->k_kseg->kg_threadcount != 0)) {
894		/*
895		 * Wait for a thread to become active or until there are
896		 * no more threads.
897		 */
898		td_wait = KSE_WAITQ_FIRST(curkse);
899		kse_wait(curkse, td_wait);
900		kse_check_completed(curkse);
901		kse_check_waitq(curkse);
902		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
903		kse_check_signals(curkse);
904		KSE_SCHED_LOCK(curkse, curkse->k_kseg);
905	}
906
907	/* Check for no more threads: */
908	if (curkse->k_kseg->kg_threadcount == 0) {
909		/*
910		 * Normally this shouldn't return, but it will if there
911		 * are other KSEs running that create new threads that
912		 * are assigned to this KSE[G].  For instance, if a scope
913		 * system thread were to create a scope process thread
914		 * and this kse[g] is the initial kse[g], then that newly
915		 * created thread would be assigned to us (the initial
916		 * kse[g]).
917		 */
918		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
919		kse_fini(curkse);
920		/* never returns */
921	}
922
923	THR_ASSERT(curthread != NULL,
924	    "Return from kse_wait/fini without thread.");
925	THR_ASSERT(curthread->state != PS_DEAD,
926	    "Trying to resume dead thread!");
927	KSE_RUNQ_REMOVE(curkse, curthread);
928
929	/*
930	 * Make the selected thread the current thread.
931	 */
932	curkse->k_curthread = curthread;
933
934	/*
935	 * Make sure the current thread's kse points to this kse.
936	 */
937	curthread->kse = curkse;
938
939	/*
940	 * Reset accounting.
941	 */
942	curthread->tmbx.tm_uticks = 0;
943	curthread->tmbx.tm_sticks = 0;
944
945	/*
946	 * Reset the time slice if this thread is running for the first
947	 * time or running again after using its full time slice allocation.
948	 */
949	if (curthread->slice_usec == -1)
950		curthread->slice_usec = 0;
951
952	/* Mark the thread active. */
953	curthread->active = 1;
954
955	/* Remove the frame reference. */
956	curframe = curthread->curframe;
957	curthread->curframe = NULL;
958
959	kse_wakeup_multi(curkse);
960
961	/*
962	 * The thread's current signal frame will only be NULL if it
963	 * is being resumed after being blocked in the kernel.  In
964	 * this case, and if the thread needs to run down pending
965	 * signals or needs a cancellation check, we need to add a
966	 * signal frame to the thread's context.
967	 */
968#ifdef NOT_YET
969	if ((curframe == NULL) && ((curthread->have_signals != 0) ||
970	    (((curthread->cancelflags & THR_AT_CANCEL_POINT) == 0) &&
971	    ((curthread->cancelflags & PTHREAD_CANCEL_ASYNCHRONOUS) != 0))))
972		signalcontext(&curthread->tmbx.tm_context, 0,
973		    (__sighandler_t *)thr_resume_wrapper);
974#else
975	if ((curframe == NULL) && (curthread->have_signals != 0))
976		signalcontext(&curthread->tmbx.tm_context, 0,
977		    (__sighandler_t *)thr_resume_wrapper);
978#endif
979	/*
980	 * Continue the thread at its current frame:
981	 */
982	if (curthread->lock_switch != 0) {
983		/*
984		 * This thread came from a scheduler switch; it will
985		 * unlock the scheduler lock and set the mailbox.
986		 */
987		ret = _thread_switch(&curthread->tmbx, NULL);
988	} else {
989		/* This thread won't unlock the scheduler lock. */
990		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
991		ret = _thread_switch(&curthread->tmbx,
992		    &curkse->k_mbx.km_curthread);
993	}
994	if (ret != 0)
995		PANIC("Thread has returned from _thread_switch");
996
997	/* This point should not be reached. */
998	PANIC("Thread has returned from _thread_switch");
999}
1000
1001static void
1002kse_check_signals(struct kse *curkse)
1003{
1004	sigset_t sigset;
1005	int i;
1006
1007	/* Deliver posted signals. */
1008	for (i = 0; i < _SIG_WORDS; i++) {
1009		atomic_swap_int(&curkse->k_mbx.km_sigscaught.__bits[i],
1010		    0, &sigset.__bits[i]);
1011	}
1012	if (SIGNOTEMPTY(sigset)) {
1013		/*
1014		 * Dispatch each signal.
1015		 *
1016		 * XXX - There is no siginfo for any of these.
1017		 *       I think there should be, especially for
1018		 *       signals from other processes (si_pid, si_uid).
1019		 */
1020		for (i = 1; i < NSIG; i++) {
1021			if (sigismember(&sigset, i) != 0) {
1022				DBG_MSG("Dispatching signal %d\n", i);
1023				_thr_sig_dispatch(curkse, i,
1024				    NULL /* no siginfo */);
1025			}
1026		}
1027		sigemptyset(&sigset);
1028		__sys_sigprocmask(SIG_SETMASK, &sigset, NULL);
1029	}
1030}
1031
1032static void
1033thr_resume_wrapper(int unused_1, siginfo_t *unused_2, ucontext_t *ucp)
1034{
1035	struct pthread *curthread = _get_curthread();
1036
1037	thr_resume_check(curthread, ucp, NULL);
1038}
1039
1040static void
1041thr_resume_check(struct pthread *curthread, ucontext_t *ucp,
1042    struct pthread_sigframe *psf)
1043{
1044	/* Check signals before cancellations. */
1045	while (curthread->have_signals != 0) {
1046		/* Clear the pending flag. */
1047		curthread->have_signals = 0;
1048
1049		/*
1050		 * It's perfectly valid, though not portable, for
1051		 * signal handlers to munge their interrupted context
1052		 * and expect to return to it.  Ensure we use the
1053		 * correct context when running down signals.
1054		 */
1055		_thr_sig_rundown(curthread, ucp, psf);
1056	}
1057
1058#ifdef NOT_YET
1059	if (((curthread->cancelflags & THR_AT_CANCEL_POINT) == 0) &&
1060	    ((curthread->cancelflags & PTHREAD_CANCEL_ASYNCHRONOUS) != 0))
1061		pthread_testcancel();
1062#endif
1063}
1064
1065/*
1066 * Clean up a thread.  This must be called with the thread's KSE
1067 * scheduling lock held.  The thread must be a thread from the
1068 * KSE's group.
1069 */
1070static void
1071thr_cleanup(struct kse *curkse, struct pthread *thread)
1072{
1073	struct pthread *joiner;
1074	int sys_scope;
1075
1076	if ((joiner = thread->joiner) != NULL) {
1077		/* Joinee scheduler lock held; joiner won't leave. */
1078		if (joiner->kseg == curkse->k_kseg) {
1079			if (joiner->join_status.thread == thread) {
1080				joiner->join_status.thread = NULL;
1081				joiner->join_status.ret = thread->ret;
1082				_thr_setrunnable_unlocked(joiner);
1083			}
1084		} else {
1085			KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
1086			/* The joiner may have removed itself and exited. */
1087			if (_thr_ref_add(thread, joiner, 0) == 0) {
1088				KSE_SCHED_LOCK(curkse, joiner->kseg);
1089				if (joiner->join_status.thread == thread) {
1090					joiner->join_status.thread = NULL;
1091					joiner->join_status.ret = thread->ret;
1092					_thr_setrunnable_unlocked(joiner);
1093				}
1094				KSE_SCHED_UNLOCK(curkse, joiner->kseg);
1095				_thr_ref_delete(thread, joiner);
1096			}
1097			KSE_SCHED_LOCK(curkse, curkse->k_kseg);
1098		}
1099		thread->attr.flags |= PTHREAD_DETACHED;
1100	}
1101
1102	if (!(sys_scope = (thread->attr.flags & PTHREAD_SCOPE_SYSTEM))) {
1103		/*
1104		 * Remove the thread from the KSEG's list of threads.
1105	 	 */
1106		KSEG_THRQ_REMOVE(thread->kseg, thread);
1107		/*
1108		 * Migrate the thread to the main KSE so that this
1109		 * KSE and KSEG can be cleaned when their last thread
1110		 * exits.
1111		 */
1112		thread->kseg = _kse_initial->k_kseg;
1113		thread->kse = _kse_initial;
1114	}
1115	thread->flags |= THR_FLAGS_GC_SAFE;
1116
1117	/*
1118	 * We can't hold the thread list lock while holding the
1119	 * scheduler lock.
1120	 */
1121	KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
1122	DBG_MSG("Adding thread %p to GC list\n", thread);
1123	KSE_LOCK_ACQUIRE(curkse, &_thread_list_lock);
1124	THR_GCLIST_ADD(thread);
1125	/* Use thread_list_lock */
1126	active_threads--;
1127	if (active_threads == 0) {
1128		KSE_LOCK_RELEASE(curkse, &_thread_list_lock);
1129		exit(0);
1130        }
1131	KSE_LOCK_RELEASE(curkse, &_thread_list_lock);
1132	if (sys_scope) {
1133		/*
1134		 * System scope thread is single thread group,
1135		 * when thread is exited, its kse and ksegrp should
1136		 * be recycled as well.
1137		 */
1138		kse_exit();
1139		PANIC("kse_exit() failed for system scope thread");
1140	}
1141	KSE_SCHED_LOCK(curkse, curkse->k_kseg);
1142}
1143
1144void
1145_thr_gc(struct pthread *curthread)
1146{
1147	struct pthread *td, *td_next;
1148	kse_critical_t crit;
1149	TAILQ_HEAD(, pthread) worklist;
1150
1151	TAILQ_INIT(&worklist);
1152	crit = _kse_critical_enter();
1153	KSE_LOCK_ACQUIRE(curthread->kse, &_thread_list_lock);
1154
1155	/* Check the threads waiting for GC. */
1156	for (td = TAILQ_FIRST(&_thread_gc_list); td != NULL; td = td_next) {
1157		td_next = TAILQ_NEXT(td, gcle);
1158		if ((td->flags & THR_FLAGS_GC_SAFE) == 0)
1159			continue;
1160		else if (((td->attr.flags & PTHREAD_SCOPE_SYSTEM) != 0) &&
1161		    ((td->kse->k_mbx.km_flags & KMF_DONE) == 0)) {
1162			/*
1163			 * The thread and KSE are operating on the same
1164			 * stack.  Wait for the KSE to exit before freeing
1165			 * the thread's stack as well as everything else.
1166			 */
1167			continue;
1168		}
1169		/*
1170		 * Remove the thread from the GC list.  If the thread
1171		 * isn't yet detached, it will get added back to the
1172		 * GC list at a later time.
1173		 */
1174		THR_GCLIST_REMOVE(td);
1175		DBG_MSG("Freeing thread %p stack\n", td);
1176		/*
1177		 * We can free the thread stack since it's no longer
1178		 * in use.
1179		 */
1180		_thr_stack_free(&td->attr);
1181		if (((td->attr.flags & PTHREAD_DETACHED) != 0) &&
1182		    (td->refcount == 0)) {
1183			/*
1184			 * The thread has detached and is no longer
1185			 * referenced.  It is safe to remove all
1186			 * remnants of the thread.
1187			 */
1188			THR_LIST_REMOVE(td);
1189			TAILQ_INSERT_HEAD(&worklist, td, gcle);
1190		}
1191	}
1192	KSE_LOCK_RELEASE(curthread->kse, &_thread_list_lock);
1193	_kse_critical_leave(crit);
1194
1195	while ((td = TAILQ_FIRST(&worklist)) != NULL) {
1196		TAILQ_REMOVE(&worklist, td, gcle);
1197
1198		if ((td->attr.flags & PTHREAD_SCOPE_SYSTEM) != 0) {
1199			crit = _kse_critical_enter();
1200			KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
1201			kse_free_unlocked(td->kse);
1202			kseg_free_unlocked(td->kseg);
1203			KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
1204			_kse_critical_leave(crit);
1205		}
1206		DBG_MSG("Freeing thread %p\n", td);
1207		_thr_free(curthread, td);
1208	}
1209	/* XXX free kse and ksegrp list should be looked as well */
1210}
1211
1212
1213/*
1214 * Only new threads that are running or suspended may be scheduled.
1215 */
1216int
1217_thr_schedule_add(struct pthread *curthread, struct pthread *newthread)
1218{
1219	struct kse *curkse;
1220	kse_critical_t crit;
1221	int ret;
1222
1223	/* Add the new thread. */
1224	thr_link(newthread);
1225
1226	/*
1227	 * If this is the first time creating a thread, make sure
1228	 * the mailbox is set for the current thread.
1229	 */
1230	if ((newthread->attr.flags & PTHREAD_SCOPE_SYSTEM) != 0) {
1231#ifdef NOT_YET
1232		/* We use the thread's stack as the KSE's stack. */
1233		new_thread->kse->k_mbx.km_stack.ss_sp =
1234		    new_thread->attr.stackaddr_attr;
1235		new_thread->kse->k_mbx.km_stack.ss_size =
1236		    new_thread->attr.stacksize_attr;
1237#endif
1238		/*
1239		 * No need to lock the scheduling queue since the
1240		 * KSE/KSEG pair have not yet been started.
1241		 */
1242		KSEG_THRQ_ADD(newthread->kseg, newthread);
1243		if (newthread->state == PS_RUNNING)
1244			THR_RUNQ_INSERT_TAIL(newthread);
1245		newthread->kse->k_curthread = NULL;
1246		newthread->kse->k_mbx.km_flags = 0;
1247		newthread->kse->k_mbx.km_func = (kse_func_t *)kse_sched_multi;
1248		newthread->kse->k_mbx.km_quantum = 0;
1249
1250		/*
1251		 * This thread needs a new KSE and KSEG.
1252		 */
1253		crit = _kse_critical_enter();
1254		curkse = _get_curkse();
1255		_ksd_setprivate(&newthread->kse->k_ksd);
1256		newthread->kse->k_flags |= KF_INITIALIZED|KF_STARTED;
1257		ret = kse_create(&newthread->kse->k_mbx, 1);
1258		if (ret != 0)
1259			ret = errno;
1260		_ksd_setprivate(&curkse->k_ksd);
1261		_kse_critical_leave(crit);
1262	}
1263	else {
1264		/*
1265		 * Lock the KSE and add the new thread to its list of
1266		 * assigned threads.  If the new thread is runnable, also
1267		 * add it to the KSE's run queue.
1268		 */
1269		KSE_SCHED_LOCK(curthread->kse, newthread->kseg);
1270		KSEG_THRQ_ADD(newthread->kseg, newthread);
1271		if (newthread->state == PS_RUNNING)
1272			THR_RUNQ_INSERT_TAIL(newthread);
1273		if ((newthread->kse->k_flags & KF_STARTED) == 0) {
1274			/*
1275			 * This KSE hasn't been started yet.  Start it
1276			 * outside of holding the lock.
1277			 */
1278			newthread->kse->k_flags |= KF_STARTED;
1279			newthread->kse->k_mbx.km_func =
1280			    (kse_func_t *)kse_sched_multi;
1281			newthread->kse->k_mbx.km_flags = 0;
1282			kse_create(&newthread->kse->k_mbx, 0);
1283		 } else if ((newthread->state == PS_RUNNING) &&
1284		     KSE_IS_IDLE(newthread->kse)) {
1285			/*
1286			 * The thread is being scheduled on another KSEG.
1287			 */
1288			kse_wakeup_one(newthread);
1289		}
1290		KSE_SCHED_UNLOCK(curthread->kse, newthread->kseg);
1291		ret = 0;
1292	}
1293	if (ret != 0)
1294		thr_unlink(newthread);
1295
1296	return (ret);
1297}
1298
1299void
1300kse_waitq_insert(struct pthread *thread)
1301{
1302	struct pthread *td;
1303
1304	if (thread->wakeup_time.tv_sec == -1)
1305		TAILQ_INSERT_TAIL(&thread->kse->k_schedq->sq_waitq, thread,
1306		    pqe);
1307	else {
1308		td = TAILQ_FIRST(&thread->kse->k_schedq->sq_waitq);
1309		while ((td != NULL) && (td->wakeup_time.tv_sec != -1) &&
1310		    ((td->wakeup_time.tv_sec < thread->wakeup_time.tv_sec) ||
1311		    ((td->wakeup_time.tv_sec == thread->wakeup_time.tv_sec) &&
1312		    (td->wakeup_time.tv_nsec <= thread->wakeup_time.tv_nsec))))
1313			td = TAILQ_NEXT(td, pqe);
1314		if (td == NULL)
1315			TAILQ_INSERT_TAIL(&thread->kse->k_schedq->sq_waitq,
1316			    thread, pqe);
1317		else
1318			TAILQ_INSERT_BEFORE(td, thread, pqe);
1319	}
1320	thread->flags |= THR_FLAGS_IN_WAITQ;
1321}
1322
1323/*
1324 * This must be called with the scheduling lock held.
1325 */
1326static void
1327kse_check_completed(struct kse *kse)
1328{
1329	struct pthread *thread;
1330	struct kse_thr_mailbox *completed;
1331
1332	if ((completed = kse->k_mbx.km_completed) != NULL) {
1333		kse->k_mbx.km_completed = NULL;
1334		while (completed != NULL) {
1335			thread = completed->tm_udata;
1336			DBG_MSG("Found completed thread %p, name %s\n",
1337			    thread,
1338			    (thread->name == NULL) ? "none" : thread->name);
1339			thread->blocked = 0;
1340			if (thread != kse->k_curthread) {
1341				if ((thread->flags & THR_FLAGS_SUSPENDED) != 0)
1342					THR_SET_STATE(thread, PS_SUSPENDED);
1343				else
1344					KSE_RUNQ_INSERT_TAIL(kse, thread);
1345				if ((thread->kse != kse) &&
1346				    (thread->kse->k_curthread == thread)) {
1347					thread->kse->k_curthread = NULL;
1348					thread->active = 0;
1349				}
1350			}
1351			completed = completed->tm_next;
1352		}
1353	}
1354}
1355
1356/*
1357 * This must be called with the scheduling lock held.
1358 */
1359static void
1360kse_check_waitq(struct kse *kse)
1361{
1362	struct pthread	*pthread;
1363	struct timespec ts;
1364
1365	KSE_GET_TOD(kse, &ts);
1366
1367	/*
1368	 * Wake up threads that have timedout.  This has to be
1369	 * done before adding the current thread to the run queue
1370	 * so that a CPU intensive thread doesn't get preference
1371	 * over waiting threads.
1372	 */
1373	while (((pthread = KSE_WAITQ_FIRST(kse)) != NULL) &&
1374	    thr_timedout(pthread, &ts)) {
1375		/* Remove the thread from the wait queue: */
1376		KSE_WAITQ_REMOVE(kse, pthread);
1377		DBG_MSG("Found timedout thread %p in waitq\n", pthread);
1378
1379		/* Indicate the thread timedout: */
1380		pthread->timeout = 1;
1381
1382		/* Add the thread to the priority queue: */
1383		if ((pthread->flags & THR_FLAGS_SUSPENDED) != 0)
1384			THR_SET_STATE(pthread, PS_SUSPENDED);
1385		else {
1386			THR_SET_STATE(pthread, PS_RUNNING);
1387			KSE_RUNQ_INSERT_TAIL(kse, pthread);
1388		}
1389	}
1390}
1391
1392static int
1393thr_timedout(struct pthread *thread, struct timespec *curtime)
1394{
1395	if (thread->wakeup_time.tv_sec < 0)
1396		return (0);
1397	else if (thread->wakeup_time.tv_sec > curtime->tv_sec)
1398		return (0);
1399	else if ((thread->wakeup_time.tv_sec == curtime->tv_sec) &&
1400	    (thread->wakeup_time.tv_nsec > curtime->tv_nsec))
1401		return (0);
1402	else
1403		return (1);
1404}
1405
1406/*
1407 * This must be called with the scheduling lock held.
1408 *
1409 * Each thread has a time slice, a wakeup time (used when it wants
1410 * to wait for a specified amount of time), a run state, and an
1411 * active flag.
1412 *
1413 * When a thread gets run by the scheduler, the active flag is
1414 * set to non-zero (1).  When a thread performs an explicit yield
1415 * or schedules a state change, it enters the scheduler and the
1416 * active flag is cleared.  When the active flag is still seen
1417 * set in the scheduler, that means that the thread is blocked in
1418 * the kernel (because it is cleared before entering the scheduler
1419 * in all other instances).
1420 *
1421 * The wakeup time is only set for those states that can timeout.
1422 * It is set to (-1, -1) for all other instances.
1423 *
1424 * The thread's run state, aside from being useful when debugging,
1425 * is used to place the thread in an appropriate queue.  There
1426 * are 2 basic queues:
1427 *
1428 *   o run queue - queue ordered by priority for all threads
1429 *                 that are runnable
1430 *   o waiting queue - queue sorted by wakeup time for all threads
1431 *                     that are not otherwise runnable (not blocked
1432 *                     in kernel, not waiting for locks)
1433 *
1434 * The thread's time slice is used for round-robin scheduling
1435 * (the default scheduling policy).  While a SCHED_RR thread
1436 * is runnable it's time slice accumulates.  When it reaches
1437 * the time slice interval, it gets reset and added to the end
1438 * of the queue of threads at its priority.  When a thread no
1439 * longer becomes runnable (blocks in kernel, waits, etc), its
1440 * time slice is reset.
1441 *
1442 * The job of kse_switchout_thread() is to handle all of the above.
1443 */
1444static void
1445kse_switchout_thread(struct kse *kse, struct pthread *thread)
1446{
1447	int level;
1448	int i;
1449
1450	/*
1451	 * Place the currently running thread into the
1452	 * appropriate queue(s).
1453	 */
1454	DBG_MSG("Switching out thread %p, state %d\n", thread, thread->state);
1455
1456	THR_DEACTIVATE_LAST_LOCK(thread);
1457	if (thread->blocked != 0) {
1458		thread->active = 0;
1459		thread->need_switchout = 0;
1460		/* This thread must have blocked in the kernel. */
1461		/* thread->slice_usec = -1;*/	/* restart timeslice */
1462		/*
1463		 * XXX - Check for pending signals for this thread to
1464		 *       see if we need to interrupt it in the kernel.
1465		 */
1466		/* if (thread->check_pending != 0) */
1467		if ((thread->slice_usec != -1) &&
1468		    (thread->attr.sched_policy != SCHED_FIFO))
1469			thread->slice_usec += (thread->tmbx.tm_uticks
1470			    + thread->tmbx.tm_sticks) * _clock_res_usec;
1471	}
1472	else {
1473		switch (thread->state) {
1474		case PS_DEAD:
1475			/*
1476			 * The scheduler is operating on a different
1477			 * stack.  It is safe to do garbage collecting
1478			 * here.
1479			 */
1480			thread->active = 0;
1481			thread->need_switchout = 0;
1482			thr_cleanup(kse, thread);
1483			return;
1484			break;
1485
1486		case PS_RUNNING:
1487			if ((thread->flags & THR_FLAGS_SUSPENDED) != 0)
1488				THR_SET_STATE(thread, PS_SUSPENDED);
1489			break;
1490
1491		case PS_COND_WAIT:
1492		case PS_SLEEP_WAIT:
1493			/* Insert into the waiting queue: */
1494			KSE_WAITQ_INSERT(kse, thread);
1495			break;
1496
1497		case PS_LOCKWAIT:
1498			/*
1499			 * This state doesn't timeout.
1500			 */
1501			thread->wakeup_time.tv_sec = -1;
1502			thread->wakeup_time.tv_nsec = -1;
1503			level = thread->locklevel - 1;
1504			if (!_LCK_GRANTED(&thread->lockusers[level]))
1505				KSE_WAITQ_INSERT(kse, thread);
1506			else
1507				THR_SET_STATE(thread, PS_RUNNING);
1508			break;
1509
1510		case PS_JOIN:
1511		case PS_MUTEX_WAIT:
1512		case PS_SIGSUSPEND:
1513		case PS_SIGWAIT:
1514		case PS_SUSPENDED:
1515		case PS_DEADLOCK:
1516		default:
1517			/*
1518			 * These states don't timeout.
1519			 */
1520			thread->wakeup_time.tv_sec = -1;
1521			thread->wakeup_time.tv_nsec = -1;
1522
1523			/* Insert into the waiting queue: */
1524			KSE_WAITQ_INSERT(kse, thread);
1525			break;
1526		}
1527		if (thread->state != PS_RUNNING) {
1528			/* Restart the time slice: */
1529			thread->slice_usec = -1;
1530		} else {
1531			if (thread->need_switchout != 0)
1532				/*
1533				 * The thread yielded on its own;
1534				 * restart the timeslice.
1535				 */
1536				thread->slice_usec = -1;
1537			else if ((thread->slice_usec != -1) &&
1538	   		    (thread->attr.sched_policy != SCHED_FIFO)) {
1539				thread->slice_usec += (thread->tmbx.tm_uticks
1540				    + thread->tmbx.tm_sticks) * _clock_res_usec;
1541				/* Check for time quantum exceeded: */
1542				if (thread->slice_usec > TIMESLICE_USEC)
1543					thread->slice_usec = -1;
1544			}
1545			if (thread->slice_usec == -1) {
1546				/*
1547				 * The thread exceeded its time quantum or
1548				 * it yielded the CPU; place it at the tail
1549				 * of the queue for its priority.
1550				 */
1551				KSE_RUNQ_INSERT_TAIL(kse, thread);
1552			} else {
1553				/*
1554				 * The thread hasn't exceeded its interval
1555				 * Place it at the head of the queue for its
1556				 * priority.
1557				 */
1558				KSE_RUNQ_INSERT_HEAD(kse, thread);
1559			}
1560		}
1561	}
1562	thread->active = 0;
1563	thread->need_switchout = 0;
1564	if (thread->check_pending != 0) {
1565		/* Install pending signals into the frame. */
1566		thread->check_pending = 0;
1567		for (i = 0; i < _SIG_MAXSIG; i++) {
1568			if (sigismember(&thread->sigpend, i) &&
1569			    !sigismember(&thread->tmbx.tm_context.uc_sigmask, i))
1570				_thr_sig_add(thread, i, &thread->siginfo[i]);
1571		}
1572	}
1573}
1574
1575/*
1576 * This function waits for the smallest timeout value of any waiting
1577 * thread, or until it receives a message from another KSE.
1578 *
1579 * This must be called with the scheduling lock held.
1580 */
1581static void
1582kse_wait(struct kse *kse, struct pthread *td_wait)
1583{
1584	struct timespec ts, ts_sleep;
1585	int saved_flags;
1586
1587	KSE_GET_TOD(kse, &ts);
1588
1589	if ((td_wait == NULL) || (td_wait->wakeup_time.tv_sec < 0)) {
1590		/* Limit sleep to no more than 1 minute. */
1591		ts_sleep.tv_sec = 60;
1592		ts_sleep.tv_nsec = 0;
1593	} else {
1594		TIMESPEC_SUB(&ts_sleep, &td_wait->wakeup_time, &ts);
1595		if (ts_sleep.tv_sec > 60) {
1596			ts_sleep.tv_sec = 60;
1597			ts_sleep.tv_nsec = 0;
1598		}
1599	}
1600	/* Don't sleep for negative times. */
1601	if ((ts_sleep.tv_sec >= 0) && (ts_sleep.tv_nsec >= 0)) {
1602		KSE_SET_IDLE(kse);
1603		kse->k_kseg->kg_idle_kses++;
1604		KSE_SCHED_UNLOCK(kse, kse->k_kseg);
1605		saved_flags = kse->k_mbx.km_flags;
1606		kse->k_mbx.km_flags |= KMF_NOUPCALL;
1607		kse_release(&ts_sleep);
1608		kse->k_mbx.km_flags = saved_flags;
1609		KSE_SCHED_LOCK(kse, kse->k_kseg);
1610		if (KSE_IS_IDLE(kse)) {
1611			KSE_CLEAR_IDLE(kse);
1612			kse->k_kseg->kg_idle_kses--;
1613		}
1614	}
1615}
1616
1617/*
1618 * Avoid calling this kse_exit() so as not to confuse it with the
1619 * system call of the same name.
1620 */
1621static void
1622kse_fini(struct kse *kse)
1623{
1624	/* struct kse_group *free_kseg = NULL; */
1625	struct timespec ts;
1626
1627	/*
1628	 * Check to see if this is one of the main kses.
1629	 */
1630	if (kse->k_kseg != _kse_initial->k_kseg) {
1631		PANIC("shouldn't get here");
1632		/* This is for supporting thread groups. */
1633#ifdef NOT_YET
1634		/* Remove this KSE from the KSEG's list of KSEs. */
1635		KSE_SCHED_LOCK(kse, kse->k_kseg);
1636		TAILQ_REMOVE(&kse->k_kseg->kg_kseq, kse, k_kgqe);
1637		kse->k_kseg->kg_ksecount--;
1638		if (TAILQ_EMPTY(&kse->k_kseg->kg_kseq))
1639			free_kseg = kse->k_kseg;
1640		KSE_SCHED_UNLOCK(kse, kse->k_kseg);
1641
1642		/*
1643		 * Add this KSE to the list of free KSEs along with
1644		 * the KSEG if is now orphaned.
1645		 */
1646		KSE_LOCK_ACQUIRE(kse, &kse_lock);
1647		if (free_kseg != NULL)
1648			kseg_free_unlocked(free_kseg);
1649		kse_free_unlocked(kse);
1650		KSE_LOCK_RELEASE(kse, &kse_lock);
1651		kse_exit();
1652		/* Never returns. */
1653		PANIC("kse_exit()");
1654#endif
1655	} else {
1656#ifdef NOT_YET
1657		/*
1658		 * In future, we might allow program to kill
1659		 * kse in initial group.
1660		 */
1661		if (kse != _kse_initial) {
1662			KSE_SCHED_LOCK(kse, kse->k_kseg);
1663			TAILQ_REMOVE(&kse->k_kseg->kg_kseq, kse, k_kgqe);
1664			kse->k_kseg->kg_ksecount--;
1665			KSE_SCHED_UNLOCK(kse, kse->k_kseg);
1666			KSE_LOCK_ACQUIRE(kse, &kse_lock);
1667			kse_free_unlocked(kse);
1668			KSE_LOCK_RELEASE(kse, &kse_lock);
1669			kse_exit();
1670                        /* Never returns. */
1671                        PANIC("kse_exit() failed for initial kseg");
1672                }
1673#endif
1674		KSE_SCHED_LOCK(kse, kse->k_kseg);
1675		KSE_SET_IDLE(kse);
1676		kse->k_kseg->kg_idle_kses++;
1677		KSE_SCHED_UNLOCK(kse, kse->k_kseg);
1678		ts.tv_sec = 120;
1679		ts.tv_nsec = 0;
1680		kse->k_mbx.km_flags = 0;
1681		kse_release(&ts);
1682		/* Never reach */
1683	}
1684}
1685
1686void
1687_thr_set_timeout(const struct timespec *timeout)
1688{
1689	struct pthread	*curthread = _get_curthread();
1690	struct timespec ts;
1691
1692	/* Reset the timeout flag for the running thread: */
1693	curthread->timeout = 0;
1694
1695	/* Check if the thread is to wait forever: */
1696	if (timeout == NULL) {
1697		/*
1698		 * Set the wakeup time to something that can be recognised as
1699		 * different to an actual time of day:
1700		 */
1701		curthread->wakeup_time.tv_sec = -1;
1702		curthread->wakeup_time.tv_nsec = -1;
1703	}
1704	/* Check if no waiting is required: */
1705	else if ((timeout->tv_sec == 0) && (timeout->tv_nsec == 0)) {
1706		/* Set the wake up time to 'immediately': */
1707		curthread->wakeup_time.tv_sec = 0;
1708		curthread->wakeup_time.tv_nsec = 0;
1709	} else {
1710		/* Calculate the time for the current thread to wakeup: */
1711		KSE_GET_TOD(curthread->kse, &ts);
1712		TIMESPEC_ADD(&curthread->wakeup_time, &ts, timeout);
1713	}
1714}
1715
1716void
1717_thr_panic_exit(char *file, int line, char *msg)
1718{
1719	char buf[256];
1720
1721	snprintf(buf, sizeof(buf), "(%s:%d) %s\n", file, line, msg);
1722	__sys_write(2, buf, strlen(buf));
1723	abort();
1724}
1725
1726void
1727_thr_setrunnable(struct pthread *curthread, struct pthread *thread)
1728{
1729	kse_critical_t crit;
1730
1731	crit = _kse_critical_enter();
1732	KSE_SCHED_LOCK(curthread->kse, thread->kseg);
1733	_thr_setrunnable_unlocked(thread);
1734	KSE_SCHED_UNLOCK(curthread->kse, thread->kseg);
1735	_kse_critical_leave(crit);
1736}
1737
1738void
1739_thr_setrunnable_unlocked(struct pthread *thread)
1740{
1741	if ((thread->kseg->kg_flags & KGF_SINGLE_THREAD) != 0) {
1742		/* No silly queues for these threads. */
1743		if ((thread->flags & THR_FLAGS_SUSPENDED) != 0)
1744			THR_SET_STATE(thread, PS_SUSPENDED);
1745		else
1746			THR_SET_STATE(thread, PS_RUNNING);
1747	} else if (thread->state != PS_RUNNING) {
1748		if ((thread->flags & THR_FLAGS_IN_WAITQ) != 0)
1749			KSE_WAITQ_REMOVE(thread->kse, thread);
1750		if ((thread->flags & THR_FLAGS_SUSPENDED) != 0)
1751			THR_SET_STATE(thread, PS_SUSPENDED);
1752		else {
1753			THR_SET_STATE(thread, PS_RUNNING);
1754			if ((thread->blocked == 0) && (thread->active == 0) &&
1755			    (thread->flags & THR_FLAGS_IN_RUNQ) == 0)
1756				THR_RUNQ_INSERT_TAIL(thread);
1757		}
1758	}
1759        /*
1760         * XXX - Threads are not yet assigned to specific KSEs; they are
1761         *       assigned to the KSEG.  So the fact that a thread's KSE is
1762         *       waiting doesn't necessarily mean that it will be the KSE
1763         *       that runs the thread after the lock is granted.  But we
1764         *       don't know if the other KSEs within the same KSEG are
1765         *       also in a waiting state or not so we err on the side of
1766         *       caution and wakeup the thread's last known KSE.  We
1767         *       ensure that the threads KSE doesn't change while it's
1768         *       scheduling lock is held so it is safe to reference it
1769         *       (the KSE).  If the KSE wakes up and doesn't find any more
1770         *       work it will again go back to waiting so no harm is done.
1771         */
1772	kse_wakeup_one(thread);
1773}
1774
1775static void
1776kse_wakeup_one(struct pthread *thread)
1777{
1778	struct kse *ke;
1779
1780	if (KSE_IS_IDLE(thread->kse)) {
1781		KSE_CLEAR_IDLE(thread->kse);
1782		thread->kseg->kg_idle_kses--;
1783		KSE_WAKEUP(thread->kse);
1784	} else {
1785		TAILQ_FOREACH(ke, &thread->kseg->kg_kseq, k_kgqe) {
1786			if (KSE_IS_IDLE(ke)) {
1787				KSE_CLEAR_IDLE(ke);
1788				ke->k_kseg->kg_idle_kses--;
1789				KSE_WAKEUP(ke);
1790				return;
1791			}
1792		}
1793	}
1794}
1795
1796static void
1797kse_wakeup_multi(struct kse *curkse)
1798{
1799	struct kse *ke;
1800	int tmp;
1801
1802	if ((tmp = KSE_RUNQ_THREADS(curkse)) && curkse->k_kseg->kg_idle_kses) {
1803		TAILQ_FOREACH(ke, &curkse->k_kseg->kg_kseq, k_kgqe) {
1804			if (KSE_IS_IDLE(ke)) {
1805				KSE_CLEAR_IDLE(ke);
1806				ke->k_kseg->kg_idle_kses--;
1807				KSE_WAKEUP(ke);
1808				if (--tmp == 0)
1809					break;
1810			}
1811		}
1812	}
1813}
1814
1815struct pthread *
1816_get_curthread(void)
1817{
1818	return (_ksd_curthread);
1819}
1820
1821/* This assumes the caller has disabled upcalls. */
1822struct kse *
1823_get_curkse(void)
1824{
1825	return (_ksd_curkse);
1826}
1827
1828void
1829_set_curkse(struct kse *kse)
1830{
1831	_ksd_setprivate(&kse->k_ksd);
1832}
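/*
 * The accessors above read and write KSE-specific data (KSD), which is
 * only stable while upcalls are disabled.  A typical caller therefore
 * brackets _get_curkse() with a critical section, as the functions
 * further below do:
 *
 *	kse_critical_t crit;
 *	struct kse *curkse;
 *
 *	crit = _kse_critical_enter();
 *	curkse = _get_curkse();
 *	KSE_LOCK_ACQUIRE(curkse, &kse_lock);
 *	...
 *	KSE_LOCK_RELEASE(curkse, &kse_lock);
 *	_kse_critical_leave(crit);
 */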
1833
1834/*
1835 * Allocate a new KSEG.
1836 *
1837 * We allow the current thread to be NULL in the case that this
1838 * is the first time a KSEG is being created (library initialization).
1839 * In this case, we don't need to (and can't) take any locks.
1840 */
1841struct kse_group *
1842_kseg_alloc(struct pthread *curthread)
1843{
1844	struct kse_group *kseg = NULL;
1845	kse_critical_t crit;
1846
1847	if ((curthread != NULL) && (free_kseg_count > 0)) {
1848		/* Use the kse lock for the kseg queue. */
1849		crit = _kse_critical_enter();
1850		KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
1851		if ((kseg = TAILQ_FIRST(&free_kse_groupq)) != NULL) {
1852			TAILQ_REMOVE(&free_kse_groupq, kseg, kg_qe);
1853			free_kseg_count--;
1854			active_kseg_count++;
1855			TAILQ_INSERT_TAIL(&active_kse_groupq, kseg, kg_qe);
1856		}
1857		KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
1858		_kse_critical_leave(crit);
1859		if (kseg != NULL)
1860			kseg_reinit(kseg);
1861	}
1862
1863	/*
1864	 * If a KSE group wasn't obtained from the free list (or the free
1865	 * list couldn't be consulted because there is no current thread
1866	 * yet), attempt to allocate a new one from the heap.
1867	 */
1868	if ((kseg == NULL) &&
1869	    ((kseg = (struct kse_group *)malloc(sizeof(*kseg))) != NULL)) {
1870		if (_pq_alloc(&kseg->kg_schedq.sq_runq,
1871		    THR_MIN_PRIORITY, THR_LAST_PRIORITY) != 0) {
1872			free(kseg);
1873			kseg = NULL;
1874		} else {
1875			kseg_init(kseg);
1876			/* Add the KSEG to the list of active KSEGs. */
1877			if (curthread != NULL) {
1878				crit = _kse_critical_enter();
1879				KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
1880				active_kseg_count++;
1881				TAILQ_INSERT_TAIL(&active_kse_groupq,
1882				    kseg, kg_qe);
1883				KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
1884				_kse_critical_leave(crit);
1885			} else {
1886				active_kseg_count++;
1887				TAILQ_INSERT_TAIL(&active_kse_groupq,
1888				    kseg, kg_qe);
1889			}
1890		}
1891	}
1892	return (kseg);
1893}
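/*
 * Usage sketch: _kseg_alloc() returns NULL when neither the free list
 * nor the heap can supply a usable KSE group, so callers must check
 * the result.  "newkseg" and the error handling below are illustrative
 * only:
 *
 *	struct kse_group *newkseg;
 *
 *	if ((newkseg = _kseg_alloc(curthread)) == NULL)
 *		return (EAGAIN);
 */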
1894
1895/*
1896 * This must be called with the kse lock held and when there are
1897 * no more threads that reference the KSE group.
1898 */
1899static void
1900kseg_free_unlocked(struct kse_group *kseg)
1901{
1902	TAILQ_REMOVE(&active_kse_groupq, kseg, kg_qe);
1903	TAILQ_INSERT_HEAD(&free_kse_groupq, kseg, kg_qe);
1904	free_kseg_count++;
1905	active_kseg_count--;
1906}
1907
1908void
1909_kseg_free(struct kse_group *kseg)
1910{
1911	struct kse *curkse;
1912	kse_critical_t crit;
1913
1914	crit = _kse_critical_enter();
1915	curkse = _get_curkse();
1916	KSE_LOCK_ACQUIRE(curkse, &kse_lock);
1917	kseg_free_unlocked(kseg);
1918	KSE_LOCK_RELEASE(curkse, &kse_lock);
1919	_kse_critical_leave(crit);
1920}
1921
1922/*
1923 * Allocate a new KSE.
1924 *
1925 * We allow the current thread to be NULL in the case that this
1926 * is the first time a KSE is being created (library initialization).
1927 * In this case, we don't need to (and can't) take any locks.
1928 */
1929struct kse *
1930_kse_alloc(struct pthread *curthread)
1931{
1932	struct kse *kse = NULL;
1933	kse_critical_t crit;
1934	int need_ksd = 0;
1935	int i;
1936
1937	if ((curthread != NULL) && (free_kse_count > 0)) {
1938		crit = _kse_critical_enter();
1939		KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
1940		/* Search for a finished KSE. */
1941		kse = TAILQ_FIRST(&free_kseq);
1942		while ((kse != NULL) &&
1943		    ((kse->k_mbx.km_flags & KMF_DONE) == 0)) {
1944			kse = TAILQ_NEXT(kse, k_qe);
1945		}
1946		if (kse != NULL) {
1947			DBG_MSG("found an unused kse.\n");
1948			TAILQ_REMOVE(&free_kseq, kse, k_qe);
1949			free_kse_count--;
1950			TAILQ_INSERT_TAIL(&active_kseq, kse, k_qe);
1951			active_kse_count++;
1952		}
1953		KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
1954		_kse_critical_leave(crit);
1955		if (kse != NULL)
1956			kse_reinit(kse);
1957	}
1958	if ((kse == NULL) &&
1959	    ((kse = (struct kse *)malloc(sizeof(*kse))) != NULL)) {
1960		bzero(kse, sizeof(*kse));
1961
1962		/* Initialize the lockusers. */
1963		for (i = 0; i < MAX_KSE_LOCKLEVEL; i++) {
1964			_lockuser_init(&kse->k_lockusers[i], (void *)kse);
1965			_LCK_SET_PRIVATE2(&kse->k_lockusers[i], NULL);
1966		}
1967		/* _lock_init(kse->k_lock, ...) */
1968
1969		/* We had to malloc a kse; mark it as needing a new KSD. */
1970		need_ksd = 1;
1971
1972		/*
1973		 * Create the KSE context.
1974		 *
1975		 * XXX - For now this is done here in the allocation.
1976		 *       In the future, we may want to have it done
1977		 *       outside the allocation so that scope system
1978		 *       threads (one thread per KSE) are not required
1979		 *       to have a stack for an unneeded kse upcall.
1980		 */
1981		kse->k_mbx.km_func = (kse_func_t *)kse_sched_multi;
1982		kse->k_mbx.km_stack.ss_sp = (char *)malloc(KSE_STACKSIZE);
1983		kse->k_mbx.km_stack.ss_size = KSE_STACKSIZE;
1984		kse->k_mbx.km_udata = (void *)kse;
1985		kse->k_mbx.km_quantum = 20000;
1986		/*
1987		 * We need to keep a copy of the stack in case it
1988		 * doesn't get used; a KSE running a scope system
1989		 * thread will use that thread's stack.
1990		 */
1991		kse->k_stack.ss_sp = kse->k_mbx.km_stack.ss_sp;
1992		kse->k_stack.ss_size = kse->k_mbx.km_stack.ss_size;
1993		if (kse->k_mbx.km_stack.ss_sp == NULL) {
1994			for (i = 0; i < MAX_KSE_LOCKLEVEL; i++) {
1995				_lockuser_destroy(&kse->k_lockusers[i]);
1996			}
1997			/* _lock_destroy(&kse->k_lock); */
1998			free(kse);
1999			kse = NULL;
2000		}
2001	}
2002	if ((kse != NULL) && (need_ksd != 0)) {
2003		/* This KSE needs initialization. */
2004		if (curthread != NULL) {
2005			crit = _kse_critical_enter();
2006			KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
2007		}
2008		/* Initialize KSD inside of the lock. */
2009		if (_ksd_create(&kse->k_ksd, (void *)kse, sizeof(*kse)) != 0) {
2010			if (curthread != NULL) {
2011				KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
2012				_kse_critical_leave(crit);
2013			}
2014			free(kse->k_mbx.km_stack.ss_sp);
2015			for (i = 0; i < MAX_KSE_LOCKLEVEL; i++) {
2016				_lockuser_destroy(&kse->k_lockusers[i]);
2017			}
2018			free(kse);
2019			return (NULL);
2020		}
2021		kse->k_flags = 0;
2022		TAILQ_INSERT_TAIL(&active_kseq, kse, k_qe);
2023		active_kse_count++;
2024		if (curthread != NULL) {
2025			KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
2026			_kse_critical_leave(crit);
2027		}
2028	}
2029	return (kse);
2030}
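/*
 * Usage sketch: a KSE obtained from _kse_alloc() is handed back with
 * _kse_free() once nothing references it.  Both calls accept a NULL
 * current thread during library initialization, in which case no
 * locking is done.  "newkse" and the error handling are illustrative
 * only:
 *
 *	struct kse *newkse;
 *
 *	if ((newkse = _kse_alloc(curthread)) == NULL)
 *		return (EAGAIN);
 *	...
 *	_kse_free(curthread, newkse);
 */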
2031
2032static void
2033kse_reinit(struct kse *kse)
2034{
2035	/*
2036	 * XXX - For now every KSE keeps its own upcall stack.
2037	 *       In the future, we may want to create the KSE context
2038	 *       outside the allocation so that scope system
2039	 *       threads (one thread per KSE) are not required
2040	 *       to have a stack for an unneeded kse upcall.
2041	 */
2042	kse->k_mbx.km_flags = 0;
2043	kse->k_curthread = NULL;
2044	kse->k_kseg = NULL;
2045	kse->k_schedq = NULL;
2046	kse->k_locklevel = 0;
2047	sigemptyset(&kse->k_sigmask);
2048	bzero(&kse->k_sigq, sizeof(kse->k_sigq));
2049	kse->k_check_sigq = 0;
2050	kse->k_flags = 0;
2051	kse->k_waiting = 0;
2052	kse->k_idle = 0;
2053	kse->k_error = 0;
2054	kse->k_cpu = 0;
2055	kse->k_done = 0;
2056}
2057
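/*
 * Return a KSE to the free list.  Like kseg_free_unlocked(), this
 * expects the caller to hold the kse lock (or to be running during
 * library initialization, when no locking is possible); _kse_free()
 * below is the locking wrapper.
 */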
2058void
2059kse_free_unlocked(struct kse *kse)
2060{
2061	TAILQ_REMOVE(&active_kseq, kse, k_qe);
2062	active_kse_count--;
2063	kse->k_kseg = NULL;
2064	kse->k_mbx.km_quantum = 20000;
2065	kse->k_flags = 0;
2066	TAILQ_INSERT_HEAD(&free_kseq, kse, k_qe);
2067	free_kse_count++;
2068}
2069
2070void
2071_kse_free(struct pthread *curthread, struct kse *kse)
2072{
2073	kse_critical_t crit;
2074
2075	if (curthread == NULL)
2076		kse_free_unlocked(kse);
2077	else {
2078		crit = _kse_critical_enter();
2079		KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
2080		kse_free_unlocked(kse);
2081		KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
2082		_kse_critical_leave(crit);
2083	}
2084}
2085
2086static void
2087kseg_init(struct kse_group *kseg)
2088{
2089	kseg_reinit(kseg);
2090	_lock_init(&kseg->kg_lock, LCK_ADAPTIVE, _kse_lock_wait,
2091	    _kse_lock_wakeup);
2092}
2093
2094static void
2095kseg_reinit(struct kse_group *kseg)
2096{
2097	TAILQ_INIT(&kseg->kg_kseq);
2098	TAILQ_INIT(&kseg->kg_threadq);
2099	TAILQ_INIT(&kseg->kg_schedq.sq_waitq);
2100	kseg->kg_threadcount = 0;
2101	kseg->kg_ksecount = 0;
2102	kseg->kg_idle_kses = 0;
2103	kseg->kg_flags = 0;
2104}
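/*
 * kseg_init() is for a freshly malloc'd KSE group and also initializes
 * the group's scheduling lock; kseg_reinit() is used for groups taken
 * off the free list, whose lock is assumed to still be valid and so is
 * left untouched.
 */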
2105
2106struct pthread *
2107_thr_alloc(struct pthread *curthread)
2108{
2109	kse_critical_t crit;
2110	void *p;
2111	struct pthread *thread = NULL;
2112
2113	if (curthread != NULL) {
2114		if (GC_NEEDED())
2115			_thr_gc(curthread);
2116		if (free_thread_count > 0) {
2117			crit = _kse_critical_enter();
2118			KSE_LOCK_ACQUIRE(curthread->kse, &thread_lock);
2119			if ((thread = TAILQ_FIRST(&free_threadq)) != NULL) {
2120				TAILQ_REMOVE(&free_threadq, thread, tle);
2121				free_thread_count--;
2122			}
2123			KSE_LOCK_RELEASE(curthread->kse, &thread_lock);
2124			_kse_critical_leave(crit);
2125		}
2126	}
2127	if (thread == NULL) {
2128		p = malloc(sizeof(struct pthread) + THR_ALIGNBYTES);
2129		if (p != NULL) {
2130			thread = (struct pthread *)THR_ALIGN(p);
2131			thread->alloc_addr = p;
2132		}
2133	}
2134	return (thread);
2135}
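/*
 * Note on the allocation above: the thread structure is over-allocated
 * by THR_ALIGNBYTES and the pointer is rounded up with THR_ALIGN(), so
 * the structure is suitably aligned regardless of what malloc()
 * returns.  The original, unaligned pointer is saved in alloc_addr
 * because that is what must eventually be passed to free() (see
 * _thr_free() below).
 */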
2136
2137void
2138_thr_free(struct pthread *curthread, struct pthread *thread)
2139{
2140	kse_critical_t crit;
2141	int i;
2142
2143	DBG_MSG("Freeing thread %p\n", thread);
2144	if ((curthread == NULL) || (free_thread_count >= MAX_CACHED_THREADS)) {
2145		for (i = 0; i < MAX_THR_LOCKLEVEL; i++) {
2146			_lockuser_destroy(&thread->lockusers[i]);
2147		}
2148		_lock_destroy(&thread->lock);
2149		free(thread->alloc_addr);
2150	}
2151	else {
2152		crit = _kse_critical_enter();
2153		KSE_LOCK_ACQUIRE(curthread->kse, &thread_lock);
2154		TAILQ_INSERT_HEAD(&free_threadq, thread, tle);
2155		free_thread_count++;
2156		KSE_LOCK_RELEASE(curthread->kse, &thread_lock);
2157		_kse_critical_leave(crit);
2158	}
2159}
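/*
 * Cached threads keep their locks and lockusers intact so that
 * _thr_alloc() can hand them out again cheaply; only threads freed
 * beyond the MAX_CACHED_THREADS limit (or freed before there is a
 * current thread) are torn down completely and released with free().
 */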
2160
2161/*
2162 * Add an active thread:
2163 *
2164 *   o Assign the thread a unique id (which GDB uses to track
2165 *     threads).
2166 *   o Add the thread to the list of all threads and increment
2167 *     number of active threads.
2168 */
2169static void
2170thr_link(struct pthread *thread)
2171{
2172	kse_critical_t crit;
2173	struct kse *curkse;
2174
2175	crit = _kse_critical_enter();
2176	curkse = _get_curkse();
2177
2178	KSE_LOCK_ACQUIRE(curkse, &_thread_list_lock);
2179	/*
2180	 * Initialize the unique id (which GDB uses to track
2181	 * threads), add the thread to the list of all threads,
2182	 * and increment the number of active threads.
2183	 */
2184	thread->uniqueid = next_uniqueid++;
2185	THR_LIST_ADD(thread);
2186	active_threads++;
2187	KSE_LOCK_RELEASE(curkse, &_thread_list_lock);
2188
2189	_kse_critical_leave(crit);
2190}
2191
2192/*
2193 * Remove an active thread.
2194 */
2195static void
2196thr_unlink(struct pthread *thread)
2197{
2198	kse_critical_t crit;
2199	struct kse *curkse;
2200
2201	crit = _kse_critical_enter();
2202	curkse = _get_curkse();
2203
2204	KSE_LOCK_ACQUIRE(curkse, &_thread_list_lock);
2205	THR_LIST_REMOVE(thread);
2206	active_threads--;
2207	KSE_LOCK_RELEASE(curkse, &_thread_list_lock);
2208
2209	_kse_critical_leave(crit);
2210}
2211