thr_kern.c revision 115278
1/*
2 * Copyright (C) 2003 Daniel M. Eischen <deischen@freebsd.org>
3 * Copyright (C) 2002 Jonathon Mini <mini@freebsd.org>
4 * Copyright (c) 1995-1998 John Birrell <jb@cimlogic.com.au>
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 * 3. All advertising materials mentioning features or use of this software
16 *    must display the following acknowledgement:
17 *	This product includes software developed by John Birrell.
18 * 4. Neither the name of the author nor the names of any co-contributors
19 *    may be used to endorse or promote products derived from this software
20 *    without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY JOHN BIRRELL AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 */
35#include <sys/cdefs.h>
36__FBSDID("$FreeBSD: head/lib/libkse/thread/thr_kern.c 115278 2003-05-24 02:29:25Z deischen $");
37
38#include <sys/types.h>
39#include <sys/kse.h>
40#include <sys/signalvar.h>
41#include <sys/queue.h>
42#include <machine/atomic.h>
43
44#include <assert.h>
45#include <errno.h>
46#include <signal.h>
47#include <stdlib.h>
48#include <string.h>
49#include <time.h>
50#include <ucontext.h>
51#include <unistd.h>
52
53#include "atomic_ops.h"
54#include "thr_private.h"
55#include "pthread_md.h"
56#include "libc_private.h"
57
58/*#define DEBUG_THREAD_KERN */
59#ifdef DEBUG_THREAD_KERN
60#define DBG_MSG		stdout_debug
61#else
62#define DBG_MSG(x...)
63#endif
64
65/*
66 * Define a high water mark for the maximum number of threads that
67 * will be cached.  Once this level is reached, any extra threads
68 * will be free()'d.
69 *
70 * XXX - It doesn't make sense to worry about the maximum number of
71 *       KSEs that we can cache because the system will limit us to
72 *       something *much* less than the maximum number of threads
73 *       that we can have.  Disregarding KSEs in their own group,
74 *       the maximum number of KSEs is the number of processors in
75 *       the system.
76 */
77#define	MAX_CACHED_THREADS	100
78#define	KSE_STACKSIZE		16384
79
80#define	KSE_SET_MBOX(kse, thrd) \
81	(kse)->k_mbx.km_curthread = &(thrd)->tmbx
82
83#define	KSE_SET_EXITED(kse)	(kse)->k_flags |= KF_EXITED
84
85/*
86 * Macros for manipulating the run queues.  The priority queue
87 * routines use the thread's pqe link and also handle the setting
88 * and clearing of the thread's THR_FLAGS_IN_RUNQ flag.
89 */
90#define	KSE_RUNQ_INSERT_HEAD(kse, thrd)			\
91	_pq_insert_head(&(kse)->k_schedq->sq_runq, thrd)
92#define	KSE_RUNQ_INSERT_TAIL(kse, thrd)			\
93	_pq_insert_tail(&(kse)->k_schedq->sq_runq, thrd)
94#define	KSE_RUNQ_REMOVE(kse, thrd)			\
95	_pq_remove(&(kse)->k_schedq->sq_runq, thrd)
96#define	KSE_RUNQ_FIRST(kse)	_pq_first(&(kse)->k_schedq->sq_runq)
97
98#define KSE_RUNQ_THREADS(kse)	((kse)->k_schedq->sq_runq.pq_threads)
99
100#ifndef	KMF_DONE
101#define	KMF_DONE	0x04
102#endif
103
104/*
105 * We've got to keep track of everything that is allocated, not only
106 * to have a speedy free list, but also so they can be deallocated
107 * after a fork().
108 */
109static TAILQ_HEAD(, kse)	active_kseq;
110static TAILQ_HEAD(, kse)	free_kseq;
111static TAILQ_HEAD(, kse_group)	free_kse_groupq;
112static TAILQ_HEAD(, kse_group)	active_kse_groupq;
113static TAILQ_HEAD(, kse_group)	gc_ksegq;
114static struct lock		kse_lock;	/* also used for kseg queue */
115static int			free_kse_count = 0;
116static int			free_kseg_count = 0;
117static TAILQ_HEAD(, pthread)	free_threadq;
118static struct lock		thread_lock;
119static int			free_thread_count = 0;
120static int			inited = 0;
121static int			active_threads = 1;
122static int			active_kse_count = 0;
123static int			active_kseg_count = 0;
124static u_int64_t		next_uniqueid = 1;
125
126
127#ifdef DEBUG_THREAD_KERN
128static void	dump_queues(struct kse *curkse);
129#endif
130static void	kse_check_completed(struct kse *kse);
131static void	kse_check_waitq(struct kse *kse);
132static void	kse_check_signals(struct kse *kse);
133static void	kse_fini(struct kse *curkse);
134static void	kse_reinit(struct kse *kse);
135static void	kse_sched_multi(struct kse *curkse);
136#ifdef NOT_YET
137static void	kse_sched_single(struct kse *curkse);
138#endif
139static void	kse_switchout_thread(struct kse *kse, struct pthread *thread);
140static void	kse_wait(struct kse *kse, struct pthread *td_wait);
141static void	kse_free_unlocked(struct kse *kse);
142static void	kseg_free_unlocked(struct kse_group *kseg);
143static void	kseg_init(struct kse_group *kseg);
144static void	kseg_reinit(struct kse_group *kseg);
145static void	kse_waitq_insert(struct pthread *thread);
146static void	kse_wakeup_multi(struct kse *curkse);
147static void	kse_wakeup_one(struct pthread *thread);
148static void	thr_cleanup(struct kse *kse, struct pthread *curthread);
149static void	thr_link(struct pthread *thread);
150static void	thr_resume_wrapper(int unused_1, siginfo_t *unused_2,
151		    ucontext_t *ucp);
152static void	thr_resume_check(struct pthread *curthread, ucontext_t *ucp,
153		    struct pthread_sigframe *psf);
154static int	thr_timedout(struct pthread *thread, struct timespec *curtime);
155static void	thr_unlink(struct pthread *thread);
156
157/*
158 * This is called after a fork().
159 * No locks need to be taken here since we are guaranteed to be
160 * single threaded.
161 */
162void
163_kse_single_thread(struct pthread *curthread)
164{
165	struct kse *kse;
166	struct kse_group *kseg;
167	struct pthread *thread;
168	kse_critical_t crit;
169	int i;
170
171	/*
172	 * Disable upcalls and clear the threaded flag.
173	 * XXX - I don't think we need to disable upcalls after a fork().
174	 *       but it doesn't hurt.
175	 */
176	crit = _kse_critical_enter();
177	__isthreaded = 0;
178	active_threads = 1;
179
180	/*
181	 * Enter a loop to remove and free all threads other than
182	 * the running thread from the active thread list:
183	 */
184	while ((thread = TAILQ_FIRST(&_thread_list)) != NULL) {
185		THR_GCLIST_REMOVE(thread);
186		/*
187		 * Remove this thread from the list (the current
188		 * thread will be removed but re-added by libpthread
189		 * initialization.
190		 */
191		TAILQ_REMOVE(&_thread_list, thread, tle);
192		/* Make sure this isn't the running thread: */
193		if (thread != curthread) {
194			_thr_stack_free(&thread->attr);
195			if (thread->specific != NULL)
196				free(thread->specific);
197			for (i = 0; i < MAX_THR_LOCKLEVEL; i++) {
198				_lockuser_destroy(&thread->lockusers[i]);
199			}
200			_lock_destroy(&thread->lock);
201			free(thread);
202		}
203	}
204
205	TAILQ_INIT(&curthread->mutexq);		/* initialize mutex queue */
206	curthread->joiner = NULL;		/* no joining threads yet */
207	curthread->refcount = 0;
208	sigemptyset(&curthread->sigpend);	/* clear pending signals */
209	if (curthread->specific != NULL) {
210		free(curthread->specific);
211		curthread->specific = NULL;
212		curthread->specific_data_count = 0;
213	}
214
215	/* Free the free KSEs: */
216	while ((kse = TAILQ_FIRST(&free_kseq)) != NULL) {
217		TAILQ_REMOVE(&free_kseq, kse, k_qe);
218		for (i = 0; i < MAX_KSE_LOCKLEVEL; i++) {
219			_lockuser_destroy(&kse->k_lockusers[i]);
220		}
221		_lock_destroy(&kse->k_lock);
222		_ksd_destroy(&kse->k_ksd);
223		if (kse->k_stack.ss_sp != NULL)
224			free(kse->k_stack.ss_sp);
225		free(kse);
226	}
227	free_kse_count = 0;
228
229	/* Free the active KSEs: */
230	while ((kse = TAILQ_FIRST(&active_kseq)) != NULL) {
231		TAILQ_REMOVE(&active_kseq, kse, k_qe);
232		for (i = 0; i < MAX_KSE_LOCKLEVEL; i++) {
233			_lockuser_destroy(&kse->k_lockusers[i]);
234		}
235		_lock_destroy(&kse->k_lock);
236		if (kse->k_stack.ss_sp != NULL)
237			free(kse->k_stack.ss_sp);
238		free(kse);
239	}
240	active_kse_count = 0;
241
242	/* Free the free KSEGs: */
243	while ((kseg = TAILQ_FIRST(&free_kse_groupq)) != NULL) {
244		TAILQ_REMOVE(&free_kse_groupq, kseg, kg_qe);
245		_lock_destroy(&kseg->kg_lock);
246		_pq_free(&kseg->kg_schedq.sq_runq);
247		free(kseg);
248	}
249	free_kseg_count = 0;
250
251	/* Free the active KSEGs: */
252	while ((kseg = TAILQ_FIRST(&active_kse_groupq)) != NULL) {
253		TAILQ_REMOVE(&active_kse_groupq, kseg, kg_qe);
254		_lock_destroy(&kseg->kg_lock);
255		_pq_free(&kseg->kg_schedq.sq_runq);
256		free(kseg);
257	}
258	active_kseg_count = 0;
259
260	/* Free the free threads. */
261	while ((thread = TAILQ_FIRST(&free_threadq)) != NULL) {
262		TAILQ_REMOVE(&free_threadq, thread, tle);
263		if (thread->specific != NULL)
264			free(thread->specific);
265		for (i = 0; i < MAX_THR_LOCKLEVEL; i++) {
266			_lockuser_destroy(&thread->lockusers[i]);
267		}
268		_lock_destroy(&thread->lock);
269		free(thread);
270	}
271	free_thread_count = 0;
272
273	/* Free the to-be-gc'd threads. */
274	while ((thread = TAILQ_FIRST(&_thread_gc_list)) != NULL) {
275		TAILQ_REMOVE(&_thread_gc_list, thread, gcle);
276		for (i = 0; i < MAX_THR_LOCKLEVEL; i++) {
277			_lockuser_destroy(&thread->lockusers[i]);
278		}
279		_lock_destroy(&thread->lock);
280		free(thread);
281	}
282	TAILQ_INIT(&gc_ksegq);
283	_gc_count = 0;
284
285	if (inited != 0) {
286		/*
287		 * Destroy these locks; they'll be recreated to assure they
288		 * are in the unlocked state.
289		 */
290		_lock_destroy(&kse_lock);
291		_lock_destroy(&thread_lock);
292		_lock_destroy(&_thread_list_lock);
293		inited = 0;
294	}
295
296	/*
297	 * After a fork(), the leftover thread goes back to being
298	 * scope process.
299	 */
300	curthread->attr.flags &= ~PTHREAD_SCOPE_SYSTEM;
301	curthread->attr.flags |= PTHREAD_SCOPE_PROCESS;
302
303	/*
304	 * After a fork, we are still operating on the thread's original
305	 * stack.  Don't clear the THR_FLAGS_USER from the thread's
306	 * attribute flags.
307	 */
308
309	/* Initialize the threads library. */
310	curthread->kse = NULL;
311	curthread->kseg = NULL;
312	_kse_initial = NULL;
313	_libpthread_init(curthread);
314}
315
316/*
317 * This is used to initialize housekeeping and to initialize the
318 * KSD for the KSE.
319 */
320void
321_kse_init(void)
322{
323	if (inited == 0) {
324		TAILQ_INIT(&active_kseq);
325		TAILQ_INIT(&active_kse_groupq);
326		TAILQ_INIT(&free_kseq);
327		TAILQ_INIT(&free_kse_groupq);
328		TAILQ_INIT(&free_threadq);
329		TAILQ_INIT(&gc_ksegq);
330		if (_lock_init(&kse_lock, LCK_ADAPTIVE,
331		    _kse_lock_wait, _kse_lock_wakeup) != 0)
332			PANIC("Unable to initialize free KSE queue lock");
333		if (_lock_init(&thread_lock, LCK_ADAPTIVE,
334		    _kse_lock_wait, _kse_lock_wakeup) != 0)
335			PANIC("Unable to initialize free thread queue lock");
336		if (_lock_init(&_thread_list_lock, LCK_ADAPTIVE,
337		    _kse_lock_wait, _kse_lock_wakeup) != 0)
338			PANIC("Unable to initialize thread list lock");
339		active_kse_count = 0;
340		active_kseg_count = 0;
341		_gc_count = 0;
342		inited = 1;
343	}
344}
345
346int
347_kse_isthreaded(void)
348{
349	return (__isthreaded != 0);
350}
351
352/*
353 * This is called when the first thread (other than the initial
354 * thread) is created.
355 */
356int
357_kse_setthreaded(int threaded)
358{
359	if ((threaded != 0) && (__isthreaded == 0)) {
360		/*
361		 * Locking functions in libc are required when there are
362		 * threads other than the initial thread.
363		 */
364		__isthreaded = 1;
365
366		/*
367		 * Tell the kernel to create a KSE for the initial thread
368		 * and enable upcalls in it.
369		 */
370		_kse_initial->k_flags |= KF_STARTED;
371		if (kse_create(&_kse_initial->k_mbx, 0) != 0) {
372			_kse_initial->k_flags &= ~KF_STARTED;
373			__isthreaded = 0;
374			/* may abort() */
375			DBG_MSG("kse_create failed\n");
376			return (-1);
377		}
378		KSE_SET_MBOX(_kse_initial, _thr_initial);
379		_thr_setmaxconcurrency();
380	}
381	return (0);
382}
383
384/*
385 * Lock wait and wakeup handlers for KSE locks.  These are only used by
386 * KSEs, and should never be used by threads.  KSE locks include the
387 * KSE group lock (used for locking the scheduling queue) and the
388 * kse_lock defined above.
389 *
390 * When a KSE lock attempt blocks, the entire KSE blocks allowing another
391 * KSE to run.  For the most part, it doesn't make much sense to try and
392 * schedule another thread because you need to lock the scheduling queue
393 * in order to do that.  And since the KSE lock is used to lock the scheduling
394 * queue, you would just end up blocking again.
395 */
396void
397_kse_lock_wait(struct lock *lock, struct lockuser *lu)
398{
399	struct kse *curkse = (struct kse *)_LCK_GET_PRIVATE(lu);
400	struct timespec ts;
401	int saved_flags;
402
403	if (curkse->k_mbx.km_curthread != NULL)
404		PANIC("kse_lock_wait does not disable upcall.\n");
405	/*
406	 * Enter a loop to wait until we get the lock.
407	 */
408	ts.tv_sec = 0;
409	ts.tv_nsec = 1000000;  /* 1 sec */
410	while (!_LCK_GRANTED(lu)) {
411		/*
412		 * Yield the kse and wait to be notified when the lock
413		 * is granted.
414		 */
415		saved_flags = curkse->k_mbx.km_flags;
416		curkse->k_mbx.km_flags |= KMF_NOUPCALL | KMF_NOCOMPLETED;
417		kse_release(&ts);
418		curkse->k_mbx.km_flags = saved_flags;
419	}
420}
421
422void
423_kse_lock_wakeup(struct lock *lock, struct lockuser *lu)
424{
425	struct kse *curkse;
426	struct kse *kse;
427	struct kse_mailbox *mbx;
428
429	curkse = _get_curkse();
430	kse = (struct kse *)_LCK_GET_PRIVATE(lu);
431
432	if (kse == curkse)
433		PANIC("KSE trying to wake itself up in lock");
434	else {
435		mbx = &kse->k_mbx;
436		_lock_grant(lock, lu);
437		/*
438		 * Notify the owning kse that it has the lock.
439		 * It is safe to pass invalid address to kse_wakeup
440		 * even if the mailbox is not in kernel at all,
441		 * and waking up a wrong kse is also harmless.
442		 */
443		kse_wakeup(mbx);
444	}
445}
446
447/*
448 * Thread wait and wakeup handlers for thread locks.  These are only used
449 * by threads, never by KSEs.  Thread locks include the per-thread lock
450 * (defined in its structure), and condition variable and mutex locks.
451 */
452void
453_thr_lock_wait(struct lock *lock, struct lockuser *lu)
454{
455	struct pthread *curthread = (struct pthread *)lu->lu_private;
456
457	do {
458		THR_SCHED_LOCK(curthread, curthread);
459		THR_SET_STATE(curthread, PS_LOCKWAIT);
460		THR_SCHED_UNLOCK(curthread, curthread);
461		_thr_sched_switch(curthread);
462	} while (!_LCK_GRANTED(lu));
463}
464
465void
466_thr_lock_wakeup(struct lock *lock, struct lockuser *lu)
467{
468	struct pthread *thread;
469	struct pthread *curthread;
470
471	curthread = _get_curthread();
472	thread = (struct pthread *)_LCK_GET_PRIVATE(lu);
473
474	THR_SCHED_LOCK(curthread, thread);
475	_lock_grant(lock, lu);
476	_thr_setrunnable_unlocked(thread);
477	THR_SCHED_UNLOCK(curthread, thread);
478}
479
480kse_critical_t
481_kse_critical_enter(void)
482{
483	kse_critical_t crit;
484
485	crit = _ksd_readandclear_tmbx;
486	return (crit);
487}
488
489void
490_kse_critical_leave(kse_critical_t crit)
491{
492	struct pthread *curthread;
493
494	_ksd_set_tmbx(crit);
495	if ((crit != NULL) && ((curthread = _get_curthread()) != NULL))
496		THR_YIELD_CHECK(curthread);
497}
498
499int
500_kse_in_critical(void)
501{
502	return (_ksd_get_tmbx() == NULL);
503}
504
505void
506_thr_critical_enter(struct pthread *thread)
507{
508	thread->critical_count++;
509}
510
511void
512_thr_critical_leave(struct pthread *thread)
513{
514	thread->critical_count--;
515	THR_YIELD_CHECK(thread);
516}
517
518void
519_thr_sched_switch(struct pthread *curthread)
520{
521	struct kse *curkse;
522
523	(void)_kse_critical_enter();
524	curkse = _get_curkse();
525	KSE_SCHED_LOCK(curkse, curkse->k_kseg);
526	_thr_sched_switch_unlocked(curthread);
527}
528
529/*
530 * XXX - We may need to take the scheduling lock before calling
531 *       this, or perhaps take the lock within here before
532 *       doing anything else.
533 */
534void
535_thr_sched_switch_unlocked(struct pthread *curthread)
536{
537	struct pthread *td;
538	struct pthread_sigframe psf;
539	struct kse *curkse;
540	int ret;
541	volatile int uts_once;
542	volatile int resume_once = 0;
543
544	/* We're in the scheduler, 5 by 5: */
545	curkse = _get_curkse();
546
547	curthread->need_switchout = 1;	/* The thread yielded on its own. */
548	curthread->critical_yield = 0;	/* No need to yield anymore. */
549	curthread->slice_usec = -1;	/* Restart the time slice. */
550
551	/* Thread can unlock the scheduler lock. */
552	curthread->lock_switch = 1;
553
554	/*
555	 * The signal frame is allocated off the stack because
556	 * a thread can be interrupted by other signals while
557	 * it is running down pending signals.
558	 */
559	sigemptyset(&psf.psf_sigset);
560	curthread->curframe = &psf;
561
562	/*
563	 * Enter the scheduler if any one of the following is true:
564	 *
565	 *   o The current thread is dead; it's stack needs to be
566	 *     cleaned up and it can't be done while operating on
567	 *     it.
568	 *   o There are no runnable threads.
569	 *   o The next thread to run won't unlock the scheduler
570	 *     lock.  A side note: the current thread may be run
571	 *     instead of the next thread in the run queue, but
572	 *     we don't bother checking for that.
573	 */
574	if ((curthread->state == PS_DEAD) ||
575	    (((td = KSE_RUNQ_FIRST(curkse)) == NULL) &&
576	    (curthread->state != PS_RUNNING)) ||
577	    ((td != NULL) && (td->lock_switch == 0)))
578		_thread_enter_uts(&curthread->tmbx, &curkse->k_mbx);
579	else {
580		uts_once = 0;
581		THR_GETCONTEXT(&curthread->tmbx.tm_context);
582		if (uts_once == 0) {
583			uts_once = 1;
584
585			/* Switchout the current thread. */
586			kse_switchout_thread(curkse, curthread);
587
588		 	/* Choose another thread to run. */
589			td = KSE_RUNQ_FIRST(curkse);
590			KSE_RUNQ_REMOVE(curkse, td);
591			curkse->k_curthread = td;
592
593			/*
594			 * Make sure the current thread's kse points to
595			 * this kse.
596			 */
597			td->kse = curkse;
598
599			/*
600			 * Reset accounting.
601			 */
602			td->tmbx.tm_uticks = 0;
603			td->tmbx.tm_sticks = 0;
604
605			/*
606			 * Reset the time slice if this thread is running
607			 * for the first time or running again after using
608			 * its full time slice allocation.
609			 */
610			if (td->slice_usec == -1)
611				td->slice_usec = 0;
612
613			/* Mark the thread active. */
614			td->active = 1;
615
616			/* Remove the frame reference. */
617			td->curframe = NULL;
618
619			/*
620			 * Continue the thread at its current frame:
621			 */
622			ret = _thread_switch(&td->tmbx, NULL);
623			/* This point should not be reached. */
624			if (ret != 0)
625				PANIC("Bad return from _thread_switch");
626			PANIC("Thread has returned from _thread_switch");
627		}
628	}
629
630	if (curthread->lock_switch != 0) {
631		/*
632		 * Unlock the scheduling queue and leave the
633		 * critical region.
634		 */
635		/* Don't trust this after a switch! */
636		curkse = _get_curkse();
637
638		curthread->lock_switch = 0;
639		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
640		_kse_critical_leave(&curthread->tmbx);
641	}
642	/*
643	 * This thread is being resumed; check for cancellations.
644	 */
645	if ((resume_once == 0) && (!THR_IN_CRITICAL(curthread))) {
646		resume_once = 1;
647		thr_resume_check(curthread, &curthread->tmbx.tm_context, &psf);
648	}
649
650	THR_ACTIVATE_LAST_LOCK(curthread);
651}
652
653/*
654 * This is the scheduler for a KSE which runs a scope system thread.
655 * The multi-thread KSE scheduler should also work for a single threaded
656 * KSE, but we use a separate scheduler so that it can be fine-tuned
657 * to be more efficient (and perhaps not need a separate stack for
658 * the KSE, allowing it to use the thread's stack).
659 *
660 * XXX - This probably needs some work.
661 */
662#ifdef NOT_YET
663static void
664kse_sched_single(struct kse *curkse)
665{
666	struct pthread *curthread = curkse->k_curthread;
667	struct pthread *td_wait;
668	struct timespec ts;
669	int level;
670
671	if (curthread->active == 0) {
672		if (curthread->state != PS_RUNNING) {
673			/* Check to see if the thread has timed out. */
674			KSE_GET_TOD(curkse, &ts);
675			if (thr_timedout(curthread, &ts) != 0) {
676				curthread->timeout = 1;
677				curthread->state = PS_RUNNING;
678			}
679		}
680	}
681
682	/* This thread no longer needs to yield the CPU: */
683	curthread->critical_yield = 0;
684	curthread->need_switchout = 0;
685
686	/*
687	 * Lock the scheduling queue.
688	 *
689	 * There is no scheduling queue for single threaded KSEs,
690	 * but we need a lock for protection regardless.
691	 */
692	KSE_SCHED_LOCK(curkse, curkse->k_kseg);
693
694	/*
695	 * This has to do the job of kse_switchout_thread(), only
696	 * for a single threaded KSE/KSEG.
697	 */
698
699	switch (curthread->state) {
700	case PS_DEAD:
701		/* Unlock the scheduling queue and exit the KSE and thread. */
702		thr_cleaup(curkse, curthread);
703		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
704		break;
705
706	case PS_COND_WAIT:
707	case PS_SLEEP_WAIT:
708		/* Only insert threads that can timeout: */
709		if (curthread->wakeup_time.tv_sec != -1) {
710			/* Insert into the waiting queue: */
711			KSE_WAITQ_INSERT(curkse, curthread);
712		}
713		break;
714
715	case PS_LOCKWAIT:
716		level = curthread->locklevel - 1;
717		if (!_LCK_GRANTED(&curthread->lockusers[level]))
718			KSE_WAITQ_INSERT(curkse, curthread);
719		else
720			THR_SET_STATE(curthread, PS_RUNNING);
721		break;
722
723	case PS_JOIN:
724	case PS_MUTEX_WAIT:
725	case PS_RUNNING:
726	case PS_SIGSUSPEND:
727	case PS_SIGWAIT:
728	case PS_SUSPENDED:
729	case PS_DEADLOCK:
730	default:
731		/*
732		 * These states don't timeout and don't need
733		 * to be in the waiting queue.
734		 */
735		break;
736	}
737	while (curthread->state != PS_RUNNING) {
738		curthread->active = 0;
739		td_wait = KSE_WAITQ_FIRST(curkse);
740
741		kse_wait(curkse, td_wait);
742
743	    	if (td_wait != NULL) {
744			KSE_GET_TOD(curkse, &ts);
745			if (thr_timedout(curthread, &ts)) {
746				/* Indicate the thread timedout: */
747				td_wait->timeout = 1;
748
749				/* Make the thread runnable. */
750				THR_SET_STATE(td_wait, PS_RUNNING);
751				KSE_WAITQ_REMOVE(curkse, td_wait);
752			}
753		}
754		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
755		kse_check_signals(curkse);
756		KSE_SCHED_LOCK(curkse, curkse->k_kseg);
757	}
758
759	/* Remove the frame reference. */
760	curthread->curframe = NULL;
761
762	/* Unlock the scheduling queue. */
763	KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
764
765	/*
766	 * Continue the thread at its current frame:
767	 */
768	DBG_MSG("Continuing bound thread %p\n", curthread);
769	_thread_switch(&curthread->tmbx, &curkse->k_mbx.km_curthread);
770	PANIC("Thread has returned from _thread_switch");
771}
772#endif
773
774#ifdef DEBUG_THREAD_KERN
775static void
776dump_queues(struct kse *curkse)
777{
778	struct pthread *thread;
779
780	DBG_MSG("Threads in waiting queue:\n");
781	TAILQ_FOREACH(thread, &curkse->k_kseg->kg_schedq.sq_waitq, pqe) {
782		DBG_MSG("  thread %p, state %d, blocked %d\n",
783		    thread, thread->state, thread->blocked);
784	}
785}
786#endif
787
788/*
789 * This is the scheduler for a KSE which runs multiple threads.
790 */
791static void
792kse_sched_multi(struct kse *curkse)
793{
794	struct pthread *curthread, *td_wait;
795	struct pthread_sigframe *curframe;
796	int ret;
797
798	THR_ASSERT(curkse->k_mbx.km_curthread == NULL,
799	    "Mailbox not null in kse_sched_multi");
800
801	/* Check for first time initialization: */
802	if ((curkse->k_flags & KF_INITIALIZED) == 0) {
803		/* Setup this KSEs specific data. */
804		_ksd_setprivate(&curkse->k_ksd);
805		_set_curkse(curkse);
806
807		/* Set this before grabbing the context. */
808		curkse->k_flags |= KF_INITIALIZED;
809	}
810
811	/* This may have returned from a kse_release(). */
812	if (KSE_WAITING(curkse)) {
813		DBG_MSG("Entered upcall when KSE is waiting.");
814		KSE_CLEAR_WAIT(curkse);
815	}
816
817	/* Lock the scheduling lock. */
818	curthread = curkse->k_curthread;
819	if ((curthread == NULL) || (curthread->need_switchout == 0)) {
820		/* This is an upcall; take the scheduler lock. */
821		KSE_SCHED_LOCK(curkse, curkse->k_kseg);
822	}
823
824	if (KSE_IS_IDLE(curkse)) {
825		KSE_CLEAR_IDLE(curkse);
826		curkse->k_kseg->kg_idle_kses--;
827	}
828	/*
829	 * If the current thread was completed in another KSE, then
830	 * it will be in the run queue.  Don't mark it as being blocked.
831	 */
832	if ((curthread != NULL) &&
833	    ((curthread->flags & THR_FLAGS_IN_RUNQ) == 0) &&
834	    (curthread->need_switchout == 0)) {
835		/*
836		 * Assume the current thread is blocked; when the
837		 * completed threads are checked and if the current
838		 * thread is among the completed, the blocked flag
839		 * will be cleared.
840		 */
841		curthread->blocked = 1;
842	}
843
844	/* Check for any unblocked threads in the kernel. */
845	kse_check_completed(curkse);
846
847	/*
848	 * Check for threads that have timed-out.
849	 */
850	kse_check_waitq(curkse);
851
852	/*
853	 * Switchout the current thread, if necessary, as the last step
854	 * so that it is inserted into the run queue (if it's runnable)
855	 * _after_ any other threads that were added to it above.
856	 */
857	if (curthread == NULL)
858		;  /* Nothing to do here. */
859	else if ((curthread->need_switchout == 0) &&
860	    (curthread->blocked == 0) && (THR_IN_CRITICAL(curthread))) {
861		/*
862		 * Resume the thread and tell it to yield when
863		 * it leaves the critical region.
864		 */
865		curthread->critical_yield = 1;
866		curthread->active = 1;
867		if ((curthread->flags & THR_FLAGS_IN_RUNQ) != 0)
868			KSE_RUNQ_REMOVE(curkse, curthread);
869		curkse->k_curthread = curthread;
870		curthread->kse = curkse;
871		DBG_MSG("Continuing thread %p in critical region\n",
872		    curthread);
873		kse_wakeup_multi(curkse);
874		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
875		ret = _thread_switch(&curthread->tmbx,
876		    &curkse->k_mbx.km_curthread);
877		if (ret != 0)
878			PANIC("Can't resume thread in critical region\n");
879	}
880	else if ((curthread->flags & THR_FLAGS_IN_RUNQ) == 0)
881		kse_switchout_thread(curkse, curthread);
882	curkse->k_curthread = NULL;
883
884	kse_wakeup_multi(curkse);
885
886	/* This has to be done without the scheduling lock held. */
887	KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
888	kse_check_signals(curkse);
889	KSE_SCHED_LOCK(curkse, curkse->k_kseg);
890
891#ifdef DEBUG_THREAD_KERN
892	dump_queues(curkse);
893#endif
894
895	/* Check if there are no threads ready to run: */
896	while (((curthread = KSE_RUNQ_FIRST(curkse)) == NULL) &&
897	    (curkse->k_kseg->kg_threadcount != 0)) {
898		/*
899		 * Wait for a thread to become active or until there are
900		 * no more threads.
901		 */
902		td_wait = KSE_WAITQ_FIRST(curkse);
903		kse_wait(curkse, td_wait);
904		kse_check_completed(curkse);
905		kse_check_waitq(curkse);
906		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
907		kse_check_signals(curkse);
908		KSE_SCHED_LOCK(curkse, curkse->k_kseg);
909	}
910
911	/* Check for no more threads: */
912	if (curkse->k_kseg->kg_threadcount == 0) {
913		/*
914		 * Normally this shouldn't return, but it will if there
915		 * are other KSEs running that create new threads that
916		 * are assigned to this KSE[G].  For instance, if a scope
917		 * system thread were to create a scope process thread
918		 * and this kse[g] is the initial kse[g], then that newly
919		 * created thread would be assigned to us (the initial
920		 * kse[g]).
921		 */
922		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
923		kse_fini(curkse);
924		/* never returns */
925	}
926
927	THR_ASSERT(curthread != NULL,
928	    "Return from kse_wait/fini without thread.");
929	THR_ASSERT(curthread->state != PS_DEAD,
930	    "Trying to resume dead thread!");
931	KSE_RUNQ_REMOVE(curkse, curthread);
932
933	/*
934	 * Make the selected thread the current thread.
935	 */
936	curkse->k_curthread = curthread;
937
938	/*
939	 * Make sure the current thread's kse points to this kse.
940	 */
941	curthread->kse = curkse;
942
943	/*
944	 * Reset accounting.
945	 */
946	curthread->tmbx.tm_uticks = 0;
947	curthread->tmbx.tm_sticks = 0;
948
949	/*
950	 * Reset the time slice if this thread is running for the first
951	 * time or running again after using its full time slice allocation.
952	 */
953	if (curthread->slice_usec == -1)
954		curthread->slice_usec = 0;
955
956	/* Mark the thread active. */
957	curthread->active = 1;
958
959	/* Remove the frame reference. */
960	curframe = curthread->curframe;
961	curthread->curframe = NULL;
962
963	kse_wakeup_multi(curkse);
964
965	/*
966	 * The thread's current signal frame will only be NULL if it
967	 * is being resumed after being blocked in the kernel.  In
968	 * this case, and if the thread needs to run down pending
969	 * signals or needs a cancellation check, we need to add a
970	 * signal frame to the thread's context.
971	 */
972#ifdef NOT_YET
973	if ((curframe == NULL) && ((curthread->have_signals != 0) ||
974	    (((curthread->cancelflags & THR_AT_CANCEL_POINT) == 0) &&
975	    ((curthread->cancelflags & PTHREAD_CANCEL_ASYNCHRONOUS) != 0))))
976		signalcontext(&curthread->tmbx.tm_context, 0,
977		    (__sighandler_t *)thr_resume_wrapper);
978#else
979	if ((curframe == NULL) && (curthread->have_signals != 0))
980		signalcontext(&curthread->tmbx.tm_context, 0,
981		    (__sighandler_t *)thr_resume_wrapper);
982#endif
983	/*
984	 * Continue the thread at its current frame:
985	 */
986	if (curthread->lock_switch != 0) {
987		/*
988		 * This thread came from a scheduler switch; it will
989		 * unlock the scheduler lock and set the mailbox.
990		 */
991		ret = _thread_switch(&curthread->tmbx, NULL);
992	} else {
993		/* This thread won't unlock the scheduler lock. */
994		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
995		ret = _thread_switch(&curthread->tmbx,
996		    &curkse->k_mbx.km_curthread);
997	}
998	if (ret != 0)
999		PANIC("Thread has returned from _thread_switch");
1000
1001	/* This point should not be reached. */
1002	PANIC("Thread has returned from _thread_switch");
1003}
1004
1005static void
1006kse_check_signals(struct kse *curkse)
1007{
1008	sigset_t sigset;
1009	int i;
1010
1011	/* Deliver posted signals. */
1012	for (i = 0; i < _SIG_WORDS; i++) {
1013		atomic_swap_int(&curkse->k_mbx.km_sigscaught.__bits[i],
1014		    0, &sigset.__bits[i]);
1015	}
1016	if (SIGNOTEMPTY(sigset)) {
1017		/*
1018		 * Dispatch each signal.
1019		 *
1020		 * XXX - There is no siginfo for any of these.
1021		 *       I think there should be, especially for
1022		 *       signals from other processes (si_pid, si_uid).
1023		 */
1024		for (i = 1; i < NSIG; i++) {
1025			if (sigismember(&sigset, i) != 0) {
1026				DBG_MSG("Dispatching signal %d\n", i);
1027				_thr_sig_dispatch(curkse, i,
1028				    NULL /* no siginfo */);
1029			}
1030		}
1031		sigemptyset(&sigset);
1032		__sys_sigprocmask(SIG_SETMASK, &sigset, NULL);
1033	}
1034}
1035
1036static void
1037thr_resume_wrapper(int unused_1, siginfo_t *unused_2, ucontext_t *ucp)
1038{
1039	struct pthread *curthread = _get_curthread();
1040
1041	thr_resume_check(curthread, ucp, NULL);
1042}
1043
1044static void
1045thr_resume_check(struct pthread *curthread, ucontext_t *ucp,
1046    struct pthread_sigframe *psf)
1047{
1048	/* Check signals before cancellations. */
1049	while (curthread->have_signals != 0) {
1050		/* Clear the pending flag. */
1051		curthread->have_signals = 0;
1052
1053		/*
1054		 * It's perfectly valid, though not portable, for
1055		 * signal handlers to munge their interrupted context
1056		 * and expect to return to it.  Ensure we use the
1057		 * correct context when running down signals.
1058		 */
1059		_thr_sig_rundown(curthread, ucp, psf);
1060	}
1061
1062#ifdef NOT_YET
1063	if (((curthread->cancelflags & THR_AT_CANCEL_POINT) == 0) &&
1064	    ((curthread->cancelflags & PTHREAD_CANCEL_ASYNCHRONOUS) != 0))
1065		pthread_testcancel();
1066#endif
1067}
1068
1069/*
1070 * Clean up a thread.  This must be called with the thread's KSE
1071 * scheduling lock held.  The thread must be a thread from the
1072 * KSE's group.
1073 */
1074static void
1075thr_cleanup(struct kse *curkse, struct pthread *thread)
1076{
1077	struct pthread *joiner;
1078	int sys_scope;
1079
1080	if ((joiner = thread->joiner) != NULL) {
1081		/* Joinee scheduler lock held; joiner won't leave. */
1082		if (joiner->kseg == curkse->k_kseg) {
1083			if (joiner->join_status.thread == thread) {
1084				joiner->join_status.thread = NULL;
1085				joiner->join_status.ret = thread->ret;
1086				_thr_setrunnable_unlocked(joiner);
1087			}
1088		} else {
1089			KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
1090			/* The joiner may have removed itself and exited. */
1091			if (_thr_ref_add(thread, joiner, 0) == 0) {
1092				KSE_SCHED_LOCK(curkse, joiner->kseg);
1093				if (joiner->join_status.thread == thread) {
1094					joiner->join_status.thread = NULL;
1095					joiner->join_status.ret = thread->ret;
1096					_thr_setrunnable_unlocked(joiner);
1097				}
1098				KSE_SCHED_UNLOCK(curkse, joiner->kseg);
1099				_thr_ref_delete(thread, joiner);
1100			}
1101			KSE_SCHED_LOCK(curkse, curkse->k_kseg);
1102		}
1103		thread->attr.flags |= PTHREAD_DETACHED;
1104	}
1105
1106	if (!(sys_scope = (thread->attr.flags & PTHREAD_SCOPE_SYSTEM))) {
1107		/*
1108		 * Remove the thread from the KSEG's list of threads.
1109	 	 */
1110		KSEG_THRQ_REMOVE(thread->kseg, thread);
1111		/*
1112		 * Migrate the thread to the main KSE so that this
1113		 * KSE and KSEG can be cleaned when their last thread
1114		 * exits.
1115		 */
1116		thread->kseg = _kse_initial->k_kseg;
1117		thread->kse = _kse_initial;
1118	}
1119	thread->flags |= THR_FLAGS_GC_SAFE;
1120
1121	/*
1122	 * We can't hold the thread list lock while holding the
1123	 * scheduler lock.
1124	 */
1125	KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
1126	DBG_MSG("Adding thread %p to GC list\n", thread);
1127	KSE_LOCK_ACQUIRE(curkse, &_thread_list_lock);
1128	THR_GCLIST_ADD(thread);
1129	/* Use thread_list_lock */
1130	active_threads--;
1131	if (active_threads == 0) {
1132		KSE_LOCK_RELEASE(curkse, &_thread_list_lock);
1133		exit(0);
1134        }
1135	KSE_LOCK_RELEASE(curkse, &_thread_list_lock);
1136	if (sys_scope) {
1137		/*
1138		 * System scope thread is single thread group,
1139		 * when thread is exited, its kse and ksegrp should
1140		 * be recycled as well.
1141		 */
1142		kse_exit();
1143		PANIC("kse_exit() failed for system scope thread");
1144	}
1145	KSE_SCHED_LOCK(curkse, curkse->k_kseg);
1146}
1147
1148void
1149_thr_gc(struct pthread *curthread)
1150{
1151	struct pthread *td, *td_next;
1152	kse_critical_t crit;
1153	TAILQ_HEAD(, pthread) worklist;
1154
1155	TAILQ_INIT(&worklist);
1156	crit = _kse_critical_enter();
1157	KSE_LOCK_ACQUIRE(curthread->kse, &_thread_list_lock);
1158
1159	/* Check the threads waiting for GC. */
1160	for (td = TAILQ_FIRST(&_thread_gc_list); td != NULL; td = td_next) {
1161		td_next = TAILQ_NEXT(td, gcle);
1162		if ((td->flags & THR_FLAGS_GC_SAFE) == 0)
1163			continue;
1164		else if (((td->attr.flags & PTHREAD_SCOPE_SYSTEM) != 0) &&
1165		    ((td->kse->k_mbx.km_flags & KMF_DONE) == 0)) {
1166			/*
1167			 * The thread and KSE are operating on the same
1168			 * stack.  Wait for the KSE to exit before freeing
1169			 * the thread's stack as well as everything else.
1170			 */
1171			continue;
1172		}
1173		/*
1174		 * Remove the thread from the GC list.  If the thread
1175		 * isn't yet detached, it will get added back to the
1176		 * GC list at a later time.
1177		 */
1178		THR_GCLIST_REMOVE(td);
1179		DBG_MSG("Freeing thread %p stack\n", td);
1180		/*
1181		 * We can free the thread stack since it's no longer
1182		 * in use.
1183		 */
1184		_thr_stack_free(&td->attr);
1185		if (((td->attr.flags & PTHREAD_DETACHED) != 0) &&
1186		    (td->refcount == 0)) {
1187			/*
1188			 * The thread has detached and is no longer
1189			 * referenced.  It is safe to remove all
1190			 * remnants of the thread.
1191			 */
1192			THR_LIST_REMOVE(td);
1193			TAILQ_INSERT_HEAD(&worklist, td, gcle);
1194		}
1195	}
1196	KSE_LOCK_RELEASE(curthread->kse, &_thread_list_lock);
1197	_kse_critical_leave(crit);
1198
1199	while ((td = TAILQ_FIRST(&worklist)) != NULL) {
1200		TAILQ_REMOVE(&worklist, td, gcle);
1201
1202		if ((td->attr.flags & PTHREAD_SCOPE_SYSTEM) != 0) {
1203			crit = _kse_critical_enter();
1204			KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
1205			kse_free_unlocked(td->kse);
1206			kseg_free_unlocked(td->kseg);
1207			KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
1208			_kse_critical_leave(crit);
1209		}
1210		DBG_MSG("Freeing thread %p\n", td);
1211		_thr_free(curthread, td);
1212	}
1213	/* XXX free kse and ksegrp list should be looked as well */
1214}
1215
1216
1217/*
1218 * Only new threads that are running or suspended may be scheduled.
1219 */
1220int
1221_thr_schedule_add(struct pthread *curthread, struct pthread *newthread)
1222{
1223	struct kse *curkse;
1224	kse_critical_t crit;
1225	int ret;
1226
1227	/* Add the new thread. */
1228	thr_link(newthread);
1229
1230	/*
1231	 * If this is the first time creating a thread, make sure
1232	 * the mailbox is set for the current thread.
1233	 */
1234	if ((newthread->attr.flags & PTHREAD_SCOPE_SYSTEM) != 0) {
1235#ifdef NOT_YET
1236		/* We use the thread's stack as the KSE's stack. */
1237		new_thread->kse->k_mbx.km_stack.ss_sp =
1238		    new_thread->attr.stackaddr_attr;
1239		new_thread->kse->k_mbx.km_stack.ss_size =
1240		    new_thread->attr.stacksize_attr;
1241#endif
1242		/*
1243		 * No need to lock the scheduling queue since the
1244		 * KSE/KSEG pair have not yet been started.
1245		 */
1246		KSEG_THRQ_ADD(newthread->kseg, newthread);
1247		if (newthread->state == PS_RUNNING)
1248			THR_RUNQ_INSERT_TAIL(newthread);
1249		newthread->kse->k_curthread = NULL;
1250		newthread->kse->k_mbx.km_flags = 0;
1251		newthread->kse->k_mbx.km_func = (kse_func_t *)kse_sched_multi;
1252		newthread->kse->k_mbx.km_quantum = 0;
1253
1254		/*
1255		 * This thread needs a new KSE and KSEG.
1256		 */
1257		crit = _kse_critical_enter();
1258		curkse = _get_curkse();
1259		_ksd_setprivate(&newthread->kse->k_ksd);
1260		newthread->kse->k_flags |= KF_INITIALIZED|KF_STARTED;
1261		ret = kse_create(&newthread->kse->k_mbx, 1);
1262		if (ret != 0)
1263			ret = errno;
1264		_ksd_setprivate(&curkse->k_ksd);
1265		_kse_critical_leave(crit);
1266	}
1267	else {
1268		/*
1269		 * Lock the KSE and add the new thread to its list of
1270		 * assigned threads.  If the new thread is runnable, also
1271		 * add it to the KSE's run queue.
1272		 */
1273		KSE_SCHED_LOCK(curthread->kse, newthread->kseg);
1274		KSEG_THRQ_ADD(newthread->kseg, newthread);
1275		if (newthread->state == PS_RUNNING)
1276			THR_RUNQ_INSERT_TAIL(newthread);
1277		if ((newthread->kse->k_flags & KF_STARTED) == 0) {
1278			/*
1279			 * This KSE hasn't been started yet.  Start it
1280			 * outside of holding the lock.
1281			 */
1282			newthread->kse->k_flags |= KF_STARTED;
1283			newthread->kse->k_mbx.km_func =
1284			    (kse_func_t *)kse_sched_multi;
1285			newthread->kse->k_mbx.km_flags = 0;
1286			kse_create(&newthread->kse->k_mbx, 0);
1287		 } else if ((newthread->state == PS_RUNNING) &&
1288		     KSE_IS_IDLE(newthread->kse)) {
1289			/*
1290			 * The thread is being scheduled on another KSEG.
1291			 */
1292			kse_wakeup_one(newthread);
1293		}
1294		KSE_SCHED_UNLOCK(curthread->kse, newthread->kseg);
1295		ret = 0;
1296	}
1297	if (ret != 0)
1298		thr_unlink(newthread);
1299
1300	return (ret);
1301}
1302
1303void
1304kse_waitq_insert(struct pthread *thread)
1305{
1306	struct pthread *td;
1307
1308	if (thread->wakeup_time.tv_sec == -1)
1309		TAILQ_INSERT_TAIL(&thread->kse->k_schedq->sq_waitq, thread,
1310		    pqe);
1311	else {
1312		td = TAILQ_FIRST(&thread->kse->k_schedq->sq_waitq);
1313		while ((td != NULL) && (td->wakeup_time.tv_sec != -1) &&
1314		    ((td->wakeup_time.tv_sec < thread->wakeup_time.tv_sec) ||
1315		    ((td->wakeup_time.tv_sec == thread->wakeup_time.tv_sec) &&
1316		    (td->wakeup_time.tv_nsec <= thread->wakeup_time.tv_nsec))))
1317			td = TAILQ_NEXT(td, pqe);
1318		if (td == NULL)
1319			TAILQ_INSERT_TAIL(&thread->kse->k_schedq->sq_waitq,
1320			    thread, pqe);
1321		else
1322			TAILQ_INSERT_BEFORE(td, thread, pqe);
1323	}
1324	thread->flags |= THR_FLAGS_IN_WAITQ;
1325}
1326
1327/*
1328 * This must be called with the scheduling lock held.
1329 */
1330static void
1331kse_check_completed(struct kse *kse)
1332{
1333	struct pthread *thread;
1334	struct kse_thr_mailbox *completed;
1335
1336	if ((completed = kse->k_mbx.km_completed) != NULL) {
1337		kse->k_mbx.km_completed = NULL;
1338		while (completed != NULL) {
1339			thread = completed->tm_udata;
1340			DBG_MSG("Found completed thread %p, name %s\n",
1341			    thread,
1342			    (thread->name == NULL) ? "none" : thread->name);
1343			thread->blocked = 0;
1344			if (thread != kse->k_curthread) {
1345				if ((thread->flags & THR_FLAGS_SUSPENDED) != 0)
1346					THR_SET_STATE(thread, PS_SUSPENDED);
1347				else
1348					KSE_RUNQ_INSERT_TAIL(kse, thread);
1349				if ((thread->kse != kse) &&
1350				    (thread->kse->k_curthread == thread)) {
1351					thread->kse->k_curthread = NULL;
1352					thread->active = 0;
1353				}
1354			}
1355			completed = completed->tm_next;
1356		}
1357	}
1358}
1359
1360/*
1361 * This must be called with the scheduling lock held.
1362 */
1363static void
1364kse_check_waitq(struct kse *kse)
1365{
1366	struct pthread	*pthread;
1367	struct timespec ts;
1368
1369	KSE_GET_TOD(kse, &ts);
1370
1371	/*
1372	 * Wake up threads that have timedout.  This has to be
1373	 * done before adding the current thread to the run queue
1374	 * so that a CPU intensive thread doesn't get preference
1375	 * over waiting threads.
1376	 */
1377	while (((pthread = KSE_WAITQ_FIRST(kse)) != NULL) &&
1378	    thr_timedout(pthread, &ts)) {
1379		/* Remove the thread from the wait queue: */
1380		KSE_WAITQ_REMOVE(kse, pthread);
1381		DBG_MSG("Found timedout thread %p in waitq\n", pthread);
1382
1383		/* Indicate the thread timedout: */
1384		pthread->timeout = 1;
1385
1386		/* Add the thread to the priority queue: */
1387		if ((pthread->flags & THR_FLAGS_SUSPENDED) != 0)
1388			THR_SET_STATE(pthread, PS_SUSPENDED);
1389		else {
1390			THR_SET_STATE(pthread, PS_RUNNING);
1391			KSE_RUNQ_INSERT_TAIL(kse, pthread);
1392		}
1393	}
1394}
1395
1396static int
1397thr_timedout(struct pthread *thread, struct timespec *curtime)
1398{
1399	if (thread->wakeup_time.tv_sec < 0)
1400		return (0);
1401	else if (thread->wakeup_time.tv_sec > curtime->tv_sec)
1402		return (0);
1403	else if ((thread->wakeup_time.tv_sec == curtime->tv_sec) &&
1404	    (thread->wakeup_time.tv_nsec > curtime->tv_nsec))
1405		return (0);
1406	else
1407		return (1);
1408}
1409
1410/*
1411 * This must be called with the scheduling lock held.
1412 *
1413 * Each thread has a time slice, a wakeup time (used when it wants
1414 * to wait for a specified amount of time), a run state, and an
1415 * active flag.
1416 *
1417 * When a thread gets run by the scheduler, the active flag is
1418 * set to non-zero (1).  When a thread performs an explicit yield
1419 * or schedules a state change, it enters the scheduler and the
1420 * active flag is cleared.  When the active flag is still seen
1421 * set in the scheduler, that means that the thread is blocked in
1422 * the kernel (because it is cleared before entering the scheduler
1423 * in all other instances).
1424 *
1425 * The wakeup time is only set for those states that can timeout.
1426 * It is set to (-1, -1) for all other instances.
1427 *
1428 * The thread's run state, aside from being useful when debugging,
1429 * is used to place the thread in an appropriate queue.  There
1430 * are 2 basic queues:
1431 *
1432 *   o run queue - queue ordered by priority for all threads
1433 *                 that are runnable
1434 *   o waiting queue - queue sorted by wakeup time for all threads
1435 *                     that are not otherwise runnable (not blocked
1436 *                     in kernel, not waiting for locks)
1437 *
1438 * The thread's time slice is used for round-robin scheduling
1439 * (the default scheduling policy).  While a SCHED_RR thread
1440 * is runnable it's time slice accumulates.  When it reaches
1441 * the time slice interval, it gets reset and added to the end
1442 * of the queue of threads at its priority.  When a thread no
1443 * longer becomes runnable (blocks in kernel, waits, etc), its
1444 * time slice is reset.
1445 *
1446 * The job of kse_switchout_thread() is to handle all of the above.
1447 */
1448static void
1449kse_switchout_thread(struct kse *kse, struct pthread *thread)
1450{
1451	int level;
1452	int i;
1453
1454	/*
1455	 * Place the currently running thread into the
1456	 * appropriate queue(s).
1457	 */
1458	DBG_MSG("Switching out thread %p, state %d\n", thread, thread->state);
1459
1460	THR_DEACTIVATE_LAST_LOCK(thread);
1461	if (thread->blocked != 0) {
1462		thread->active = 0;
1463		thread->need_switchout = 0;
1464		/* This thread must have blocked in the kernel. */
1465		/* thread->slice_usec = -1;*/	/* restart timeslice */
1466		/*
1467		 * XXX - Check for pending signals for this thread to
1468		 *       see if we need to interrupt it in the kernel.
1469		 */
1470		/* if (thread->check_pending != 0) */
1471		if ((thread->slice_usec != -1) &&
1472		    (thread->attr.sched_policy != SCHED_FIFO))
1473			thread->slice_usec += (thread->tmbx.tm_uticks
1474			    + thread->tmbx.tm_sticks) * _clock_res_usec;
1475	}
1476	else {
1477		switch (thread->state) {
1478		case PS_DEAD:
1479			/*
1480			 * The scheduler is operating on a different
1481			 * stack.  It is safe to do garbage collecting
1482			 * here.
1483			 */
1484			thread->active = 0;
1485			thread->need_switchout = 0;
1486			thr_cleanup(kse, thread);
1487			return;
1488			break;
1489
1490		case PS_RUNNING:
1491			if ((thread->flags & THR_FLAGS_SUSPENDED) != 0)
1492				THR_SET_STATE(thread, PS_SUSPENDED);
1493			break;
1494
1495		case PS_COND_WAIT:
1496		case PS_SLEEP_WAIT:
1497			/* Insert into the waiting queue: */
1498			KSE_WAITQ_INSERT(kse, thread);
1499			break;
1500
1501		case PS_LOCKWAIT:
1502			/*
1503			 * This state doesn't timeout.
1504			 */
1505			thread->wakeup_time.tv_sec = -1;
1506			thread->wakeup_time.tv_nsec = -1;
1507			level = thread->locklevel - 1;
1508			if (!_LCK_GRANTED(&thread->lockusers[level]))
1509				KSE_WAITQ_INSERT(kse, thread);
1510			else
1511				THR_SET_STATE(thread, PS_RUNNING);
1512			break;
1513
1514		case PS_JOIN:
1515		case PS_MUTEX_WAIT:
1516		case PS_SIGSUSPEND:
1517		case PS_SIGWAIT:
1518		case PS_SUSPENDED:
1519		case PS_DEADLOCK:
1520		default:
1521			/*
1522			 * These states don't timeout.
1523			 */
1524			thread->wakeup_time.tv_sec = -1;
1525			thread->wakeup_time.tv_nsec = -1;
1526
1527			/* Insert into the waiting queue: */
1528			KSE_WAITQ_INSERT(kse, thread);
1529			break;
1530		}
1531		if (thread->state != PS_RUNNING) {
1532			/* Restart the time slice: */
1533			thread->slice_usec = -1;
1534		} else {
1535			if (thread->need_switchout != 0)
1536				/*
1537				 * The thread yielded on its own;
1538				 * restart the timeslice.
1539				 */
1540				thread->slice_usec = -1;
1541			else if ((thread->slice_usec != -1) &&
1542	   		    (thread->attr.sched_policy != SCHED_FIFO)) {
1543				thread->slice_usec += (thread->tmbx.tm_uticks
1544				    + thread->tmbx.tm_sticks) * _clock_res_usec;
1545				/* Check for time quantum exceeded: */
1546				if (thread->slice_usec > TIMESLICE_USEC)
1547					thread->slice_usec = -1;
1548			}
1549			if (thread->slice_usec == -1) {
1550				/*
1551				 * The thread exceeded its time quantum or
1552				 * it yielded the CPU; place it at the tail
1553				 * of the queue for its priority.
1554				 */
1555				KSE_RUNQ_INSERT_TAIL(kse, thread);
1556			} else {
1557				/*
1558				 * The thread hasn't exceeded its interval
1559				 * Place it at the head of the queue for its
1560				 * priority.
1561				 */
1562				KSE_RUNQ_INSERT_HEAD(kse, thread);
1563			}
1564		}
1565	}
1566	thread->active = 0;
1567	thread->need_switchout = 0;
1568	if (thread->check_pending != 0) {
1569		/* Install pending signals into the frame. */
1570		thread->check_pending = 0;
1571		for (i = 0; i < _SIG_MAXSIG; i++) {
1572			if (sigismember(&thread->sigpend, i) &&
1573			    !sigismember(&thread->tmbx.tm_context.uc_sigmask, i))
1574				_thr_sig_add(thread, i, &thread->siginfo[i]);
1575		}
1576	}
1577}
1578
1579/*
1580 * This function waits for the smallest timeout value of any waiting
1581 * thread, or until it receives a message from another KSE.
1582 *
1583 * This must be called with the scheduling lock held.
1584 */
1585static void
1586kse_wait(struct kse *kse, struct pthread *td_wait)
1587{
1588	struct timespec ts, ts_sleep;
1589	int saved_flags;
1590
1591	KSE_GET_TOD(kse, &ts);
1592
1593	if ((td_wait == NULL) || (td_wait->wakeup_time.tv_sec < 0)) {
1594		/* Limit sleep to no more than 1 minute. */
1595		ts_sleep.tv_sec = 60;
1596		ts_sleep.tv_nsec = 0;
1597	} else {
1598		TIMESPEC_SUB(&ts_sleep, &td_wait->wakeup_time, &ts);
1599		if (ts_sleep.tv_sec > 60) {
1600			ts_sleep.tv_sec = 60;
1601			ts_sleep.tv_nsec = 0;
1602		}
1603	}
1604	/* Don't sleep for negative times. */
1605	if ((ts_sleep.tv_sec >= 0) && (ts_sleep.tv_nsec >= 0)) {
1606		KSE_SET_IDLE(kse);
1607		kse->k_kseg->kg_idle_kses++;
1608		KSE_SCHED_UNLOCK(kse, kse->k_kseg);
1609		saved_flags = kse->k_mbx.km_flags;
1610		kse->k_mbx.km_flags |= KMF_NOUPCALL;
1611		kse_release(&ts_sleep);
1612		kse->k_mbx.km_flags = saved_flags;
1613		KSE_SCHED_LOCK(kse, kse->k_kseg);
1614		if (KSE_IS_IDLE(kse)) {
1615			KSE_CLEAR_IDLE(kse);
1616			kse->k_kseg->kg_idle_kses--;
1617		}
1618	}
1619}
1620
1621/*
1622 * Avoid calling this kse_exit() so as not to confuse it with the
1623 * system call of the same name.
1624 */
1625static void
1626kse_fini(struct kse *kse)
1627{
1628	/* struct kse_group *free_kseg = NULL; */
1629	struct timespec ts;
1630
1631	/*
1632	 * Check to see if this is one of the main kses.
1633	 */
1634	if (kse->k_kseg != _kse_initial->k_kseg) {
1635		PANIC("shouldn't get here");
1636		/* This is for supporting thread groups. */
1637#ifdef NOT_YET
1638		/* Remove this KSE from the KSEG's list of KSEs. */
1639		KSE_SCHED_LOCK(kse, kse->k_kseg);
1640		TAILQ_REMOVE(&kse->k_kseg->kg_kseq, kse, k_kgqe);
1641		kse->k_kseg->kg_ksecount--;
1642		if (TAILQ_EMPTY(&kse->k_kseg->kg_kseq))
1643			free_kseg = kse->k_kseg;
1644		KSE_SCHED_UNLOCK(kse, kse->k_kseg);
1645
1646		/*
1647		 * Add this KSE to the list of free KSEs along with
1648		 * the KSEG if is now orphaned.
1649		 */
1650		KSE_LOCK_ACQUIRE(kse, &kse_lock);
1651		if (free_kseg != NULL)
1652			kseg_free_unlocked(free_kseg);
1653		kse_free_unlocked(kse);
1654		KSE_LOCK_RELEASE(kse, &kse_lock);
1655		kse_exit();
1656		/* Never returns. */
1657		PANIC("kse_exit()");
1658#endif
1659	} else {
1660#ifdef NOT_YET
1661		/*
1662		 * In future, we might allow program to kill
1663		 * kse in initial group.
1664		 */
1665		if (kse != _kse_initial) {
1666			KSE_SCHED_LOCK(kse, kse->k_kseg);
1667			TAILQ_REMOVE(&kse->k_kseg->kg_kseq, kse, k_kgqe);
1668			kse->k_kseg->kg_ksecount--;
1669			KSE_SCHED_UNLOCK(kse, kse->k_kseg);
1670			KSE_LOCK_ACQUIRE(kse, &kse_lock);
1671			kse_free_unlocked(kse);
1672			KSE_LOCK_RELEASE(kse, &kse_lock);
1673			kse_exit();
1674                        /* Never returns. */
1675                        PANIC("kse_exit() failed for initial kseg");
1676                }
1677#endif
1678		KSE_SCHED_LOCK(kse, kse->k_kseg);
1679		KSE_SET_IDLE(kse);
1680		kse->k_kseg->kg_idle_kses++;
1681		KSE_SCHED_UNLOCK(kse, kse->k_kseg);
1682		ts.tv_sec = 120;
1683		ts.tv_nsec = 0;
1684		kse->k_mbx.km_flags = 0;
1685		kse_release(&ts);
1686		/* Never reach */
1687	}
1688}
1689
1690void
1691_thr_set_timeout(const struct timespec *timeout)
1692{
1693	struct pthread	*curthread = _get_curthread();
1694	struct timespec ts;
1695
1696	/* Reset the timeout flag for the running thread: */
1697	curthread->timeout = 0;
1698
1699	/* Check if the thread is to wait forever: */
1700	if (timeout == NULL) {
1701		/*
1702		 * Set the wakeup time to something that can be recognised as
1703		 * different to an actual time of day:
1704		 */
1705		curthread->wakeup_time.tv_sec = -1;
1706		curthread->wakeup_time.tv_nsec = -1;
1707	}
1708	/* Check if no waiting is required: */
1709	else if ((timeout->tv_sec == 0) && (timeout->tv_nsec == 0)) {
1710		/* Set the wake up time to 'immediately': */
1711		curthread->wakeup_time.tv_sec = 0;
1712		curthread->wakeup_time.tv_nsec = 0;
1713	} else {
1714		/* Calculate the time for the current thread to wakeup: */
1715		KSE_GET_TOD(curthread->kse, &ts);
1716		TIMESPEC_ADD(&curthread->wakeup_time, &ts, timeout);
1717	}
1718}
1719
1720void
1721_thr_panic_exit(char *file, int line, char *msg)
1722{
1723	char buf[256];
1724
1725	snprintf(buf, sizeof(buf), "(%s:%d) %s\n", file, line, msg);
1726	__sys_write(2, buf, strlen(buf));
1727	abort();
1728}
1729
1730void
1731_thr_setrunnable(struct pthread *curthread, struct pthread *thread)
1732{
1733	kse_critical_t crit;
1734
1735	crit = _kse_critical_enter();
1736	KSE_SCHED_LOCK(curthread->kse, thread->kseg);
1737	_thr_setrunnable_unlocked(thread);
1738	KSE_SCHED_UNLOCK(curthread->kse, thread->kseg);
1739	_kse_critical_leave(crit);
1740}
1741
1742void
1743_thr_setrunnable_unlocked(struct pthread *thread)
1744{
1745	if ((thread->kseg->kg_flags & KGF_SINGLE_THREAD) != 0) {
1746		/* No silly queues for these threads. */
1747		if ((thread->flags & THR_FLAGS_SUSPENDED) != 0)
1748			THR_SET_STATE(thread, PS_SUSPENDED);
1749		else
1750			THR_SET_STATE(thread, PS_RUNNING);
1751	} else if (thread->state != PS_RUNNING) {
1752		if ((thread->flags & THR_FLAGS_IN_WAITQ) != 0)
1753			KSE_WAITQ_REMOVE(thread->kse, thread);
1754		if ((thread->flags & THR_FLAGS_SUSPENDED) != 0)
1755			THR_SET_STATE(thread, PS_SUSPENDED);
1756		else {
1757			THR_SET_STATE(thread, PS_RUNNING);
1758			if ((thread->blocked == 0) && (thread->active == 0) &&
1759			    (thread->flags & THR_FLAGS_IN_RUNQ) == 0)
1760				THR_RUNQ_INSERT_TAIL(thread);
1761		}
1762	}
1763        /*
1764         * XXX - Threads are not yet assigned to specific KSEs; they are
1765         *       assigned to the KSEG.  So the fact that a thread's KSE is
1766         *       waiting doesn't necessarily mean that it will be the KSE
1767         *       that runs the thread after the lock is granted.  But we
1768         *       don't know if the other KSEs within the same KSEG are
1769         *       also in a waiting state or not so we err on the side of
1770         *       caution and wakeup the thread's last known KSE.  We
1771         *       ensure that the threads KSE doesn't change while it's
1772         *       scheduling lock is held so it is safe to reference it
1773         *       (the KSE).  If the KSE wakes up and doesn't find any more
1774         *       work it will again go back to waiting so no harm is done.
1775         */
1776	kse_wakeup_one(thread);
1777}
1778
1779static void
1780kse_wakeup_one(struct pthread *thread)
1781{
1782	struct kse *ke;
1783
1784	if (KSE_IS_IDLE(thread->kse)) {
1785		KSE_CLEAR_IDLE(thread->kse);
1786		thread->kseg->kg_idle_kses--;
1787		KSE_WAKEUP(thread->kse);
1788	} else {
1789		TAILQ_FOREACH(ke, &thread->kseg->kg_kseq, k_kgqe) {
1790			if (KSE_IS_IDLE(ke)) {
1791				KSE_CLEAR_IDLE(ke);
1792				ke->k_kseg->kg_idle_kses--;
1793				KSE_WAKEUP(ke);
1794				return;
1795			}
1796		}
1797	}
1798}
1799
1800static void
1801kse_wakeup_multi(struct kse *curkse)
1802{
1803	struct kse *ke;
1804	int tmp;
1805
1806	if ((tmp = KSE_RUNQ_THREADS(curkse)) && curkse->k_kseg->kg_idle_kses) {
1807		TAILQ_FOREACH(ke, &curkse->k_kseg->kg_kseq, k_kgqe) {
1808			if (KSE_IS_IDLE(ke)) {
1809				KSE_CLEAR_IDLE(ke);
1810				ke->k_kseg->kg_idle_kses--;
1811				KSE_WAKEUP(ke);
1812				if (--tmp == 0)
1813					break;
1814			}
1815		}
1816	}
1817}
1818
1819struct pthread *
1820_get_curthread(void)
1821{
1822	return (_ksd_curthread);
1823}
1824
1825/* This assumes the caller has disabled upcalls. */
1826struct kse *
1827_get_curkse(void)
1828{
1829	return (_ksd_curkse);
1830}
1831
1832void
1833_set_curkse(struct kse *kse)
1834{
1835	_ksd_setprivate(&kse->k_ksd);
1836}
1837
1838/*
1839 * Allocate a new KSEG.
1840 *
1841 * We allow the current thread to be NULL in the case that this
1842 * is the first time a KSEG is being created (library initialization).
1843 * In this case, we don't need to (and can't) take any locks.
1844 */
1845struct kse_group *
1846_kseg_alloc(struct pthread *curthread)
1847{
1848	struct kse_group *kseg = NULL;
1849	kse_critical_t crit;
1850
1851	if ((curthread != NULL) && (free_kseg_count > 0)) {
1852		/* Use the kse lock for the kseg queue. */
1853		crit = _kse_critical_enter();
1854		KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
1855		if ((kseg = TAILQ_FIRST(&free_kse_groupq)) != NULL) {
1856			TAILQ_REMOVE(&free_kse_groupq, kseg, kg_qe);
1857			free_kseg_count--;
1858			active_kseg_count++;
1859			TAILQ_INSERT_TAIL(&active_kse_groupq, kseg, kg_qe);
1860		}
1861		KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
1862		_kse_critical_leave(crit);
1863		if (kseg)
1864			kseg_reinit(kseg);
1865	}
1866
1867	/*
1868	 * If requested, attempt to allocate a new KSE group only if the
1869	 * KSE allocation was successful and a KSE group wasn't found in
1870	 * the free list.
1871	 */
1872	if ((kseg == NULL) &&
1873	    ((kseg = (struct kse_group *)malloc(sizeof(*kseg))) != NULL)) {
1874		if (_pq_alloc(&kseg->kg_schedq.sq_runq,
1875		    THR_MIN_PRIORITY, THR_LAST_PRIORITY) != 0) {
1876			free(kseg);
1877			kseg = NULL;
1878		} else {
1879			kseg_init(kseg);
1880			/* Add the KSEG to the list of active KSEGs. */
1881			if (curthread != NULL) {
1882				crit = _kse_critical_enter();
1883				KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
1884				active_kseg_count++;
1885				TAILQ_INSERT_TAIL(&active_kse_groupq,
1886				    kseg, kg_qe);
1887				KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
1888				_kse_critical_leave(crit);
1889			} else {
1890				active_kseg_count++;
1891				TAILQ_INSERT_TAIL(&active_kse_groupq,
1892				    kseg, kg_qe);
1893			}
1894		}
1895	}
1896	return (kseg);
1897}
1898
1899/*
1900 * This must be called with the kse lock held and when there are
1901 * no more threads that reference it.
1902 */
1903static void
1904kseg_free_unlocked(struct kse_group *kseg)
1905{
1906	TAILQ_REMOVE(&active_kse_groupq, kseg, kg_qe);
1907	TAILQ_INSERT_HEAD(&free_kse_groupq, kseg, kg_qe);
1908	free_kseg_count++;
1909	active_kseg_count--;
1910}
1911
1912void
1913_kseg_free(struct kse_group *kseg)
1914{
1915	struct kse *curkse;
1916	kse_critical_t crit;
1917
1918	crit = _kse_critical_enter();
1919	curkse = _get_curkse();
1920	KSE_LOCK_ACQUIRE(curkse, &kse_lock);
1921	kseg_free_unlocked(kseg);
1922	KSE_LOCK_RELEASE(curkse, &kse_lock);
1923	_kse_critical_leave(crit);
1924}
1925
1926/*
1927 * Allocate a new KSE.
1928 *
1929 * We allow the current thread to be NULL in the case that this
1930 * is the first time a KSE is being created (library initialization).
1931 * In this case, we don't need to (and can't) take any locks.
1932 */
1933struct kse *
1934_kse_alloc(struct pthread *curthread)
1935{
1936	struct kse *kse = NULL;
1937	kse_critical_t crit;
1938	int need_ksd = 0;
1939	int i;
1940
1941	if ((curthread != NULL) && (free_kse_count > 0)) {
1942		crit = _kse_critical_enter();
1943		KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
1944		/* Search for a finished KSE. */
1945		kse = TAILQ_FIRST(&free_kseq);
1946		while ((kse != NULL) &&
1947		    ((kse->k_mbx.km_flags & KMF_DONE) == 0)) {
1948			kse = TAILQ_NEXT(kse, k_qe);
1949		}
1950		if (kse != NULL) {
1951			DBG_MSG("found an unused kse.\n");
1952			TAILQ_REMOVE(&free_kseq, kse, k_qe);
1953			free_kse_count--;
1954			TAILQ_INSERT_TAIL(&active_kseq, kse, k_qe);
1955			active_kse_count++;
1956		}
1957		KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
1958		_kse_critical_leave(crit);
1959		if (kse != NULL)
1960			kse_reinit(kse);
1961	}
1962	if ((kse == NULL) &&
1963	    ((kse = (struct kse *)malloc(sizeof(*kse))) != NULL)) {
1964		bzero(kse, sizeof(*kse));
1965
1966		/* Initialize the lockusers. */
1967		for (i = 0; i < MAX_KSE_LOCKLEVEL; i++) {
1968			_lockuser_init(&kse->k_lockusers[i], (void *)kse);
1969			_LCK_SET_PRIVATE2(&kse->k_lockusers[i], NULL);
1970		}
1971		/* _lock_init(kse->k_lock, ...) */
1972
1973		/* We had to malloc a kse; mark it as needing a new ID.*/
1974		need_ksd = 1;
1975
1976		/*
1977		 * Create the KSE context.
1978		 *
1979		 * XXX - For now this is done here in the allocation.
1980		 *       In the future, we may want to have it done
1981		 *       outside the allocation so that scope system
1982		 *       threads (one thread per KSE) are not required
1983		 *       to have a stack for an unneeded kse upcall.
1984		 */
1985		kse->k_mbx.km_func = (kse_func_t *)kse_sched_multi;
1986		kse->k_mbx.km_stack.ss_sp = (char *)malloc(KSE_STACKSIZE);
1987		kse->k_mbx.km_stack.ss_size = KSE_STACKSIZE;
1988		kse->k_mbx.km_udata = (void *)kse;
1989		kse->k_mbx.km_quantum = 20000;
1990		/*
1991		 * We need to keep a copy of the stack in case it
1992		 * doesn't get used; a KSE running a scope system
1993		 * thread will use that thread's stack.
1994		 */
1995		kse->k_stack.ss_sp = kse->k_mbx.km_stack.ss_sp;
1996		kse->k_stack.ss_size = kse->k_mbx.km_stack.ss_size;
1997		if (kse->k_mbx.km_stack.ss_sp == NULL) {
1998			for (i = 0; i < MAX_KSE_LOCKLEVEL; i++) {
1999				_lockuser_destroy(&kse->k_lockusers[i]);
2000			}
2001			/* _lock_destroy(&kse->k_lock); */
2002			free(kse);
2003			kse = NULL;
2004		}
2005	}
2006	if ((kse != NULL) && (need_ksd != 0)) {
2007		/* This KSE needs initialization. */
2008		if (curthread != NULL) {
2009			crit = _kse_critical_enter();
2010			KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
2011		}
2012		/* Initialize KSD inside of the lock. */
2013		if (_ksd_create(&kse->k_ksd, (void *)kse, sizeof(*kse)) != 0) {
2014			if (curthread != NULL) {
2015				KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
2016				_kse_critical_leave(crit);
2017			}
2018			free(kse->k_mbx.km_stack.ss_sp);
2019			for (i = 0; i < MAX_KSE_LOCKLEVEL; i++) {
2020				_lockuser_destroy(&kse->k_lockusers[i]);
2021			}
2022			free(kse);
2023			return (NULL);
2024		}
2025		kse->k_flags = 0;
2026		TAILQ_INSERT_TAIL(&active_kseq, kse, k_qe);
2027		active_kse_count++;
2028		if (curthread != NULL) {
2029			KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
2030			_kse_critical_leave(crit);
2031		}
2032	}
2033	return (kse);
2034}
2035
2036static void
2037kse_reinit(struct kse *kse)
2038{
2039	/*
2040	 * XXX - For now every kse has its stack.
2041	 *       In the future, we may want to have it done
2042	 *       outside the allocation so that scope system
2043	 *       threads (one thread per KSE) are not required
2044	 *       to have a stack for an unneeded kse upcall.
2045	 */
2046	kse->k_mbx.km_flags = 0;
2047	kse->k_curthread = 0;
2048	kse->k_kseg = 0;
2049	kse->k_schedq = 0;
2050	kse->k_locklevel = 0;
2051	sigemptyset(&kse->k_sigmask);
2052	bzero(&kse->k_sigq, sizeof(kse->k_sigq));
2053	kse->k_check_sigq = 0;
2054	kse->k_flags = 0;
2055	kse->k_waiting = 0;
2056	kse->k_idle = 0;
2057	kse->k_error = 0;
2058	kse->k_cpu = 0;
2059	kse->k_done = 0;
2060}
2061
2062void
2063kse_free_unlocked(struct kse *kse)
2064{
2065	TAILQ_REMOVE(&active_kseq, kse, k_qe);
2066	active_kse_count--;
2067	kse->k_kseg = NULL;
2068	kse->k_mbx.km_quantum = 20000;
2069	kse->k_flags = 0;
2070	TAILQ_INSERT_HEAD(&free_kseq, kse, k_qe);
2071	free_kse_count++;
2072}
2073
2074void
2075_kse_free(struct pthread *curthread, struct kse *kse)
2076{
2077	kse_critical_t crit;
2078
2079	if (curthread == NULL)
2080		kse_free_unlocked(kse);
2081	else {
2082		crit = _kse_critical_enter();
2083		KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
2084		kse_free_unlocked(kse);
2085		KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
2086		_kse_critical_leave(crit);
2087	}
2088}
2089
2090static void
2091kseg_init(struct kse_group *kseg)
2092{
2093	kseg_reinit(kseg);
2094	_lock_init(&kseg->kg_lock, LCK_ADAPTIVE, _kse_lock_wait,
2095	    _kse_lock_wakeup);
2096}
2097
2098static void
2099kseg_reinit(struct kse_group *kseg)
2100{
2101	TAILQ_INIT(&kseg->kg_kseq);
2102	TAILQ_INIT(&kseg->kg_threadq);
2103	TAILQ_INIT(&kseg->kg_schedq.sq_waitq);
2104	kseg->kg_threadcount = 0;
2105	kseg->kg_ksecount = 0;
2106	kseg->kg_idle_kses = 0;
2107	kseg->kg_flags = 0;
2108}
2109
2110struct pthread *
2111_thr_alloc(struct pthread *curthread)
2112{
2113	kse_critical_t crit;
2114	void *p;
2115	struct pthread *thread = NULL;
2116
2117	if (curthread != NULL) {
2118		if (GC_NEEDED())
2119			_thr_gc(curthread);
2120		if (free_thread_count > 0) {
2121			crit = _kse_critical_enter();
2122			KSE_LOCK_ACQUIRE(curthread->kse, &thread_lock);
2123			if ((thread = TAILQ_FIRST(&free_threadq)) != NULL) {
2124				TAILQ_REMOVE(&free_threadq, thread, tle);
2125				free_thread_count--;
2126			}
2127			KSE_LOCK_RELEASE(curthread->kse, &thread_lock);
2128			_kse_critical_leave(crit);
2129		}
2130	}
2131	if (thread == NULL) {
2132		p = malloc(sizeof(struct pthread) + THR_ALIGNBYTES);
2133		if (p != NULL) {
2134			thread = (struct pthread *)THR_ALIGN(p);
2135			thread->alloc_addr = p;
2136		}
2137	}
2138	return (thread);
2139}
2140
2141void
2142_thr_free(struct pthread *curthread, struct pthread *thread)
2143{
2144	kse_critical_t crit;
2145	int i;
2146
2147	DBG_MSG("Freeing thread %p\n", thread);
2148	if ((curthread == NULL) || (free_thread_count >= MAX_CACHED_THREADS)) {
2149		for (i = 0; i < MAX_THR_LOCKLEVEL; i++) {
2150			_lockuser_destroy(&thread->lockusers[i]);
2151		}
2152		_lock_destroy(&thread->lock);
2153		free(thread->alloc_addr);
2154	}
2155	else {
2156		crit = _kse_critical_enter();
2157		KSE_LOCK_ACQUIRE(curthread->kse, &thread_lock);
2158		TAILQ_INSERT_HEAD(&free_threadq, thread, tle);
2159		free_thread_count++;
2160		KSE_LOCK_RELEASE(curthread->kse, &thread_lock);
2161		_kse_critical_leave(crit);
2162	}
2163}
2164
2165/*
2166 * Add an active thread:
2167 *
2168 *   o Assign the thread a unique id (which GDB uses to track
2169 *     threads.
2170 *   o Add the thread to the list of all threads and increment
2171 *     number of active threads.
2172 */
2173static void
2174thr_link(struct pthread *thread)
2175{
2176	kse_critical_t crit;
2177	struct kse *curkse;
2178
2179	crit = _kse_critical_enter();
2180	curkse = _get_curkse();
2181
2182	KSE_LOCK_ACQUIRE(curkse, &_thread_list_lock);
2183	/*
2184	 * Initialize the unique id (which GDB uses to track
2185	 * threads), add the thread to the list of all threads,
2186	 * and
2187	 */
2188	thread->uniqueid = next_uniqueid++;
2189	THR_LIST_ADD(thread);
2190	active_threads++;
2191	KSE_LOCK_RELEASE(curkse, &_thread_list_lock);
2192
2193	_kse_critical_leave(crit);
2194}
2195
2196/*
2197 * Remove an active thread.
2198 */
2199static void
2200thr_unlink(struct pthread *thread)
2201{
2202	kse_critical_t crit;
2203	struct kse *curkse;
2204
2205	crit = _kse_critical_enter();
2206	curkse = _get_curkse();
2207
2208	KSE_LOCK_ACQUIRE(curkse, &_thread_list_lock);
2209	THR_LIST_REMOVE(thread);
2210	active_threads--;
2211	KSE_LOCK_RELEASE(curkse, &_thread_list_lock);
2212
2213	_kse_critical_leave(crit);
2214}
2215