thr_kern.c revision 117345
1/*
2 * Copyright (C) 2003 Daniel M. Eischen <deischen@freebsd.org>
3 * Copyright (C) 2002 Jonathon Mini <mini@freebsd.org>
4 * Copyright (c) 1995-1998 John Birrell <jb@cimlogic.com.au>
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 * 3. All advertising materials mentioning features or use of this software
16 *    must display the following acknowledgement:
17 *	This product includes software developed by John Birrell.
18 * 4. Neither the name of the author nor the names of any co-contributors
19 *    may be used to endorse or promote products derived from this software
20 *    without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY JOHN BIRRELL AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 */
35#include <sys/cdefs.h>
36__FBSDID("$FreeBSD: head/lib/libkse/thread/thr_kern.c 117345 2003-07-09 01:39:24Z davidxu $");
37
38#include <sys/types.h>
39#include <sys/kse.h>
40#include <sys/signalvar.h>
41#include <sys/queue.h>
42#include <machine/atomic.h>
43#include <machine/sigframe.h>
44
45#include <assert.h>
46#include <errno.h>
47#include <signal.h>
48#include <stdlib.h>
49#include <string.h>
50#include <time.h>
51#include <ucontext.h>
52#include <unistd.h>
53
54#include "atomic_ops.h"
55#include "thr_private.h"
56#include "libc_private.h"
57#include "ksd.h"
58
59/*#define DEBUG_THREAD_KERN */
60#ifdef DEBUG_THREAD_KERN
61#define DBG_MSG		stdout_debug
62#else
63#define DBG_MSG(x...)
64#endif
65
66/*
67 * Define a high water mark for the maximum number of threads that
68 * will be cached.  Once this level is reached, any extra threads
69 * will be free()'d.
70 *
71 * XXX - It doesn't make sense to worry about the maximum number of
72 *       KSEs that we can cache because the system will limit us to
73 *       something *much* less than the maximum number of threads
74 *       that we can have.  Disregarding KSEs in their own group,
75 *       the maximum number of KSEs is the number of processors in
76 *       the system.
77 */
78#define	MAX_CACHED_THREADS	100
79#define	KSE_STACKSIZE		16384
80
81#define	KSE_SET_MBOX(kse, thrd) \
82	(kse)->k_mbx.km_curthread = &(thrd)->tmbx
83
84#define	KSE_SET_EXITED(kse)	(kse)->k_flags |= KF_EXITED
85
86/*
87 * Macros for manipulating the run queues.  The priority queue
88 * routines use the thread's pqe link and also handle the setting
89 * and clearing of the thread's THR_FLAGS_IN_RUNQ flag.
90 */
91#define	KSE_RUNQ_INSERT_HEAD(kse, thrd)			\
92	_pq_insert_head(&(kse)->k_schedq->sq_runq, thrd)
93#define	KSE_RUNQ_INSERT_TAIL(kse, thrd)			\
94	_pq_insert_tail(&(kse)->k_schedq->sq_runq, thrd)
95#define	KSE_RUNQ_REMOVE(kse, thrd)			\
96	_pq_remove(&(kse)->k_schedq->sq_runq, thrd)
97#define	KSE_RUNQ_FIRST(kse)	_pq_first(&(kse)->k_schedq->sq_runq)
98
99#define KSE_RUNQ_THREADS(kse)	((kse)->k_schedq->sq_runq.pq_threads)
100
101/*
102 * We've got to keep track of everything that is allocated, not only
103 * to have a speedy free list, but also so they can be deallocated
104 * after a fork().
105 */
106static TAILQ_HEAD(, kse)	active_kseq;
107static TAILQ_HEAD(, kse)	free_kseq;
108static TAILQ_HEAD(, kse_group)	free_kse_groupq;
109static TAILQ_HEAD(, kse_group)	active_kse_groupq;
110static TAILQ_HEAD(, kse_group)	gc_ksegq;
111static struct lock		kse_lock;	/* also used for kseg queue */
112static int			free_kse_count = 0;
113static int			free_kseg_count = 0;
114static TAILQ_HEAD(, pthread)	free_threadq;
115static struct lock		thread_lock;
116static int			free_thread_count = 0;
117static int			inited = 0;
118static int			active_threads = 1;
119static int			active_kse_count = 0;
120static int			active_kseg_count = 0;
121static u_int64_t		next_uniqueid = 1;
122
123
124#ifdef DEBUG_THREAD_KERN
125static void	dump_queues(struct kse *curkse);
126#endif
127static void	kse_check_completed(struct kse *kse);
128static void	kse_check_waitq(struct kse *kse);
129static void	kse_fini(struct kse *curkse);
130static void	kse_reinit(struct kse *kse);
131static void	kse_sched_multi(struct kse *curkse);
132#ifdef NOT_YET
133static void	kse_sched_single(struct kse *curkse);
134#endif
135static void	kse_switchout_thread(struct kse *kse, struct pthread *thread);
136static void	kse_wait(struct kse *kse, struct pthread *td_wait);
137static void	kse_free_unlocked(struct kse *kse);
138static void	kseg_free_unlocked(struct kse_group *kseg);
139static void	kseg_init(struct kse_group *kseg);
140static void	kseg_reinit(struct kse_group *kseg);
141static void	kse_waitq_insert(struct pthread *thread);
142static void	kse_wakeup_multi(struct kse *curkse);
143static void	kse_wakeup_one(struct pthread *thread);
144static void	thr_cleanup(struct kse *kse, struct pthread *curthread);
145static void	thr_link(struct pthread *thread);
146static void	thr_resume_wrapper(int sig, siginfo_t *, ucontext_t *);
147static void	thr_resume_check(struct pthread *curthread, ucontext_t *ucp,
148		    struct pthread_sigframe *psf);
149static int	thr_timedout(struct pthread *thread, struct timespec *curtime);
150static void	thr_unlink(struct pthread *thread);
151
152
153/*
154 * This is called after a fork().
155 * No locks need to be taken here since we are guaranteed to be
156 * single threaded.
157 *
158 * XXX
159 * POSIX says that in a threaded process, fork() is to be used
160 * only to run new programs, and the effects of calling functions
161 * that require certain resources between the call to fork() and
162 * the call to an exec function are undefined.
163 *
164 * It is therefore not safe to reinitialize the library after fork():
165 * the memory allocator may be in an inconsistent state, and further
166 * calls to malloc()/free() may cause undefined behavior.
167 */
168void
169_kse_single_thread(struct pthread *curthread)
170{
171#ifdef NOTYET
172	struct kse *kse;
173	struct kse_group *kseg;
174	struct pthread *thread;
175	kse_critical_t crit;
176	int i;
177
178
179	/*
180	 * Disable upcalls and clear the threaded flag.
181	 * XXX - I don't think we need to disable upcalls after a fork(),
182	 *       but it doesn't hurt.
183	 */
184	crit = _kse_critical_enter();
185	__isthreaded = 0;
186	active_threads = 1;
187	_thr_signal_deinit();
188
189	/*
190	 * Enter a loop to remove and free all threads other than
191	 * the running thread from the active thread list:
192	 */
193	while ((thread = TAILQ_FIRST(&_thread_list)) != NULL) {
194		THR_GCLIST_REMOVE(thread);
195		/*
196		 * Remove this thread from the list (the current
197		 * thread will be removed but re-added by libpthread
198		 * initialization).
199		 */
200		TAILQ_REMOVE(&_thread_list, thread, tle);
201		/* Make sure this isn't the running thread: */
202		if (thread != curthread) {
203			_thr_stack_free(&thread->attr);
204			if (thread->specific != NULL)
205				free(thread->specific);
206			for (i = 0; i < MAX_THR_LOCKLEVEL; i++) {
207				_lockuser_destroy(&thread->lockusers[i]);
208			}
209			_lock_destroy(&thread->lock);
210			free(thread);
211		}
212	}
213
214	TAILQ_INIT(&curthread->mutexq);		/* initialize mutex queue */
215	curthread->joiner = NULL;		/* no joining threads yet */
216	curthread->refcount = 0;
217	SIGEMPTYSET(curthread->sigpend);	/* clear pending signals */
218	if (curthread->specific != NULL) {
219		free(curthread->specific);
220		curthread->specific = NULL;
221		curthread->specific_data_count = 0;
222	}
223
224	/* Free the free KSEs: */
225	while ((kse = TAILQ_FIRST(&free_kseq)) != NULL) {
226		TAILQ_REMOVE(&free_kseq, kse, k_qe);
227		for (i = 0; i < MAX_KSE_LOCKLEVEL; i++) {
228			_lockuser_destroy(&kse->k_lockusers[i]);
229		}
230		_lock_destroy(&kse->k_lock);
231		_ksd_destroy(&kse->k_ksd);
232		if (kse->k_stack.ss_sp != NULL)
233			free(kse->k_stack.ss_sp);
234		free(kse);
235	}
236	free_kse_count = 0;
237
238	/* Free the active KSEs: */
239	while ((kse = TAILQ_FIRST(&active_kseq)) != NULL) {
240		TAILQ_REMOVE(&active_kseq, kse, k_qe);
241		for (i = 0; i < MAX_KSE_LOCKLEVEL; i++) {
242			_lockuser_destroy(&kse->k_lockusers[i]);
243		}
244		_lock_destroy(&kse->k_lock);
245		if (kse->k_stack.ss_sp != NULL)
246			free(kse->k_stack.ss_sp);
247		free(kse);
248	}
249	active_kse_count = 0;
250
251	/* Free the free KSEGs: */
252	while ((kseg = TAILQ_FIRST(&free_kse_groupq)) != NULL) {
253		TAILQ_REMOVE(&free_kse_groupq, kseg, kg_qe);
254		_lock_destroy(&kseg->kg_lock);
255		_pq_free(&kseg->kg_schedq.sq_runq);
256		free(kseg);
257	}
258	free_kseg_count = 0;
259
260	/* Free the active KSEGs: */
261	while ((kseg = TAILQ_FIRST(&active_kse_groupq)) != NULL) {
262		TAILQ_REMOVE(&active_kse_groupq, kseg, kg_qe);
263		_lock_destroy(&kseg->kg_lock);
264		_pq_free(&kseg->kg_schedq.sq_runq);
265		free(kseg);
266	}
267	active_kseg_count = 0;
268
269	/* Free the free threads. */
270	while ((thread = TAILQ_FIRST(&free_threadq)) != NULL) {
271		TAILQ_REMOVE(&free_threadq, thread, tle);
272		if (thread->specific != NULL)
273			free(thread->specific);
274		for (i = 0; i < MAX_THR_LOCKLEVEL; i++) {
275			_lockuser_destroy(&thread->lockusers[i]);
276		}
277		_lock_destroy(&thread->lock);
278		free(thread);
279	}
280	free_thread_count = 0;
281
282	/* Free the to-be-gc'd threads. */
283	while ((thread = TAILQ_FIRST(&_thread_gc_list)) != NULL) {
284		TAILQ_REMOVE(&_thread_gc_list, thread, gcle);
285		for (i = 0; i < MAX_THR_LOCKLEVEL; i++) {
286			_lockuser_destroy(&thread->lockusers[i]);
287		}
288		_lock_destroy(&thread->lock);
289		free(thread);
290	}
291	TAILQ_INIT(&gc_ksegq);
292	_gc_count = 0;
293
294	if (inited != 0) {
295		/*
296		 * Destroy these locks; they'll be recreated to assure they
297		 * are in the unlocked state.
298		 */
299		_lock_destroy(&kse_lock);
300		_lock_destroy(&thread_lock);
301		_lock_destroy(&_thread_list_lock);
302		inited = 0;
303	}
304
305	/*
306	 * After a fork(), the leftover thread goes back to being
307	 * scope process.
308	 */
309	curthread->attr.flags &= ~PTHREAD_SCOPE_SYSTEM;
310	curthread->attr.flags |= PTHREAD_SCOPE_PROCESS;
311
312	/*
313	 * After a fork, we are still operating on the thread's original
314	 * stack.  Don't clear the THR_FLAGS_USER from the thread's
315	 * attribute flags.
316	 */
317
318	/* Initialize the threads library. */
319	curthread->kse = NULL;
320	curthread->kseg = NULL;
321	_kse_initial = NULL;
322	_libpthread_init(curthread);
323#else
324	if (__isthreaded)
325		_thr_signal_deinit();
326	_ksd_readandclear_tmbx();
327	__isthreaded   = 0;
328	active_threads = 0;
329#endif
330}
331
332/*
333 * This is used to initialize housekeeping and to initialize the
334 * KSD for the KSE.
335 */
336void
337_kse_init(void)
338{
339	if (inited == 0) {
340		TAILQ_INIT(&active_kseq);
341		TAILQ_INIT(&active_kse_groupq);
342		TAILQ_INIT(&free_kseq);
343		TAILQ_INIT(&free_kse_groupq);
344		TAILQ_INIT(&free_threadq);
345		TAILQ_INIT(&gc_ksegq);
346		if (_lock_init(&kse_lock, LCK_ADAPTIVE,
347		    _kse_lock_wait, _kse_lock_wakeup) != 0)
348			PANIC("Unable to initialize free KSE queue lock");
349		if (_lock_init(&thread_lock, LCK_ADAPTIVE,
350		    _kse_lock_wait, _kse_lock_wakeup) != 0)
351			PANIC("Unable to initialize free thread queue lock");
352		if (_lock_init(&_thread_list_lock, LCK_ADAPTIVE,
353		    _kse_lock_wait, _kse_lock_wakeup) != 0)
354			PANIC("Unable to initialize thread list lock");
355		active_kse_count = 0;
356		active_kseg_count = 0;
357		_gc_count = 0;
358		inited = 1;
359	}
360}
361
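/*
 * Report whether the process has become multi-threaded.
 */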
362int
363_kse_isthreaded(void)
364{
365	return (__isthreaded != 0);
366}
367
368/*
369 * This is called when the first thread (other than the initial
370 * thread) is created.
371 */
372int
373_kse_setthreaded(int threaded)
374{
375	if ((threaded != 0) && (__isthreaded == 0)) {
376		/*
377		 * Locking functions in libc are required when there are
378		 * threads other than the initial thread.
379		 */
380		__isthreaded = 1;
381
382		/*
383		 * Tell the kernel to create a KSE for the initial thread
384		 * and enable upcalls in it.
385		 */
386		_thr_signal_init();
387		_kse_initial->k_flags |= KF_STARTED;
388		if (kse_create(&_kse_initial->k_mbx, 0) != 0) {
389			_kse_initial->k_flags &= ~KF_STARTED;
390			__isthreaded = 0;
391			/* may abort() */
392			PANIC("kse_create() failed\n");
393			return (-1);
394		}
395		KSE_SET_MBOX(_kse_initial, _thr_initial);
396		_thr_start_sig_daemon();
397		_thr_setmaxconcurrency();
398	}
399	return (0);
400}
401
402/*
403 * Lock wait and wakeup handlers for KSE locks.  These are only used by
404 * KSEs, and should never be used by threads.  KSE locks include the
405 * KSE group lock (used for locking the scheduling queue) and the
406 * kse_lock defined above.
407 *
408 * When a KSE lock attempt blocks, the entire KSE blocks, allowing another
409 * KSE to run.  For the most part, it doesn't make much sense to try to
410 * schedule another thread because you need to lock the scheduling queue
411 * in order to do that.  And since the KSE lock is used to lock the scheduling
412 * queue, you would just end up blocking again.
413 */
414void
415_kse_lock_wait(struct lock *lock, struct lockuser *lu)
416{
417	struct kse *curkse = (struct kse *)_LCK_GET_PRIVATE(lu);
418	struct timespec ts;
419	int saved_flags;
420
421	if (curkse->k_mbx.km_curthread != NULL)
422		PANIC("_kse_lock_wait called without upcalls disabled.\n");
423	/*
424	 * Enter a loop to wait until we get the lock.
425	 */
426	ts.tv_sec = 0;
427	ts.tv_nsec = 1000000;  /* 1 ms */
428	while (!_LCK_GRANTED(lu)) {
429		/*
430		 * Yield the kse and wait to be notified when the lock
431		 * is granted.
432		 */
433		saved_flags = curkse->k_mbx.km_flags;
434		curkse->k_mbx.km_flags |= KMF_NOUPCALL | KMF_NOCOMPLETED;
435		kse_release(&ts);
436		curkse->k_mbx.km_flags = saved_flags;
437	}
438}
439
440void
441_kse_lock_wakeup(struct lock *lock, struct lockuser *lu)
442{
443	struct kse *curkse;
444	struct kse *kse;
445	struct kse_mailbox *mbx;
446
447	curkse = _get_curkse();
448	kse = (struct kse *)_LCK_GET_PRIVATE(lu);
449
450	if (kse == curkse)
451		PANIC("KSE trying to wake itself up in lock");
452	else {
453		mbx = &kse->k_mbx;
454		_lock_grant(lock, lu);
455		/*
456		 * Notify the owning kse that it has the lock.
457		 * It is safe to pass an invalid address to kse_wakeup
458		 * even if the mailbox is not in the kernel at all,
459		 * and waking up the wrong kse is also harmless.
460		 */
461		kse_wakeup(mbx);
462	}
463}
464
465/*
466 * Thread wait and wakeup handlers for thread locks.  These are only used
467 * by threads, never by KSEs.  Thread locks include the per-thread lock
468 * (defined in its structure), and condition variable and mutex locks.
469 */
470void
471_thr_lock_wait(struct lock *lock, struct lockuser *lu)
472{
473	struct pthread *curthread = (struct pthread *)lu->lu_private;
474
475	do {
476		THR_SCHED_LOCK(curthread, curthread);
477		THR_SET_STATE(curthread, PS_LOCKWAIT);
478		THR_SCHED_UNLOCK(curthread, curthread);
479		_thr_sched_switch(curthread);
480	} while (!_LCK_GRANTED(lu));
481}
482
483void
484_thr_lock_wakeup(struct lock *lock, struct lockuser *lu)
485{
486	struct pthread *thread;
487	struct pthread *curthread;
488
489	curthread = _get_curthread();
490	thread = (struct pthread *)_LCK_GET_PRIVATE(lu);
491
492	THR_SCHED_LOCK(curthread, thread);
493	_lock_grant(lock, lu);
494	_thr_setrunnable_unlocked(thread);
495	THR_SCHED_UNLOCK(curthread, thread);
496}
497
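/*
 * Critical regions are entered by reading and clearing the KSE
 * mailbox's current-thread pointer, which keeps upcalls from
 * interrupting the region; the saved pointer is the token that
 * _kse_critical_leave() restores.  A NULL current-thread pointer
 * therefore means the caller is in a critical region (see
 * _kse_in_critical() below).
 */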
498kse_critical_t
499_kse_critical_enter(void)
500{
501	kse_critical_t crit;
502
503	crit = _ksd_readandclear_tmbx();
504	return (crit);
505}
506
507void
508_kse_critical_leave(kse_critical_t crit)
509{
510	struct pthread *curthread;
511
512	_ksd_set_tmbx(crit);
513	if ((crit != NULL) && ((curthread = _get_curthread()) != NULL))
514		THR_YIELD_CHECK(curthread);
515}
516
517int
518_kse_in_critical(void)
519{
520	return (_ksd_get_tmbx() == NULL);
521}
522
523void
524_thr_critical_enter(struct pthread *thread)
525{
526	thread->critical_count++;
527}
528
529void
530_thr_critical_leave(struct pthread *thread)
531{
532	thread->critical_count--;
533	THR_YIELD_CHECK(thread);
534}
535
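/*
 * Voluntarily switch out the current thread: enter a critical
 * region, take the KSE's scheduling lock, and fall through to the
 * unlocked variant below.
 */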
536void
537_thr_sched_switch(struct pthread *curthread)
538{
539	struct kse *curkse;
540
541	(void)_kse_critical_enter();
542	curkse = _get_curkse();
543	KSE_SCHED_LOCK(curkse, curkse->k_kseg);
544	_thr_sched_switch_unlocked(curthread);
545}
546
547/*
548 * XXX - We may need to take the scheduling lock before calling
549 *       this, or perhaps take the lock within here before
550 *       doing anything else.
551 */
552void
553_thr_sched_switch_unlocked(struct pthread *curthread)
554{
555	struct pthread *td;
556	struct pthread_sigframe psf;
557	struct kse *curkse;
558	int ret;
559	volatile int uts_once;
560	volatile int resume_once = 0;
561	ucontext_t uc;
562
563	/* We're in the scheduler, 5 by 5: */
564	curkse = _get_curkse();
565
566	curthread->need_switchout = 1;	/* The thread yielded on its own. */
567	curthread->critical_yield = 0;	/* No need to yield anymore. */
568	curthread->slice_usec = -1;	/* Restart the time slice. */
569
570	/* Thread can unlock the scheduler lock. */
571	curthread->lock_switch = 1;
572
573	/*
574	 * The signal frame is allocated off the stack because
575	 * a thread can be interrupted by other signals while
576	 * it is running down pending signals.
577	 */
578	psf.psf_valid = 0;
579	curthread->curframe = &psf;
580
581	/*
582	 * Enter the scheduler if any one of the following is true:
583	 *
584	 *   o The current thread is dead; its stack needs to be
585	 *     cleaned up and that can't be done while operating on
586	 *     it.
587	 *   o The current thread has signals pending; the scheduler
588	 *     should install a signal trampoline for us.
589	 *   o There are no runnable threads.
590	 *   o The next thread to run won't unlock the scheduler
591	 *     lock.  A side note: the current thread may be run
592	 *     instead of the next thread in the run queue, but
593	 *     we don't bother checking for that.
594	 */
595	if ((curthread->state == PS_DEAD) ||
596	    (((td = KSE_RUNQ_FIRST(curkse)) == NULL) &&
597	    (curthread->state != PS_RUNNING)) ||
598	    ((td != NULL) && (td->lock_switch == 0))) {
599		curkse->k_switch = 1;
600		_thread_enter_uts(&curthread->tmbx, &curkse->k_mbx);
601	}
602	else {
603		uts_once = 0;
604		THR_GETCONTEXT(&curthread->tmbx.tm_context);
605		if (uts_once == 0) {
606			uts_once = 1;
607
608			/* Switchout the current thread. */
609			kse_switchout_thread(curkse, curthread);
610
611		 	/* Choose another thread to run. */
612			td = KSE_RUNQ_FIRST(curkse);
613			KSE_RUNQ_REMOVE(curkse, td);
614			curkse->k_curthread = td;
615
616			/*
617			 * Make sure the current thread's kse points to
618			 * this kse.
619			 */
620			td->kse = curkse;
621
622			/*
623			 * Reset accounting.
624			 */
625			td->tmbx.tm_uticks = 0;
626			td->tmbx.tm_sticks = 0;
627
628			/*
629			 * Reset the time slice if this thread is running
630			 * for the first time or running again after using
631			 * its full time slice allocation.
632			 */
633			if (td->slice_usec == -1)
634				td->slice_usec = 0;
635
636			/* Mark the thread active. */
637			td->active = 1;
638
639			/* Remove the frame reference. */
640			td->curframe = NULL;
641
642			/*
643			 * Continue the thread at its current frame:
644			 */
645			ret = _thread_switch(&td->tmbx, NULL);
646			/* This point should not be reached. */
647			if (ret != 0)
648				PANIC("Bad return from _thread_switch");
649			PANIC("Thread has returned from _thread_switch");
650		}
651	}
652
653	if (psf.psf_valid) {
654		/*
655		 * It is ugly that we must increase the critical count, but
656		 * because we have a frame saved, we must back out the state
657		 * in psf before we can process signals.
658		 */
659		curthread->critical_count++;
660	}
661
662	if (curthread->lock_switch != 0) {
663		/*
664		 * Unlock the scheduling queue and leave the
665		 * critical region.
666		 */
667		/* Don't trust this after a switch! */
668		curkse = _get_curkse();
669
670		curthread->lock_switch = 0;
671		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
672		_kse_critical_leave(&curthread->tmbx);
673	}
674	/*
675	 * This thread is being resumed; check for cancellations.
676	 */
677	if ((psf.psf_valid ||
678	    (curthread->check_pending && !THR_IN_CRITICAL(curthread)))) {
679		resume_once = 0;
680		THR_GETCONTEXT(&uc);
681		if (resume_once == 0) {
682			resume_once = 1;
683			curthread->check_pending = 0;
684			thr_resume_check(curthread, &uc, &psf);
685		}
686	}
687	THR_ACTIVATE_LAST_LOCK(curthread);
688}
689
690/*
691 * This is the scheduler for a KSE which runs a scope system thread.
692 * The multi-thread KSE scheduler should also work for a single threaded
693 * KSE, but we use a separate scheduler so that it can be fine-tuned
694 * to be more efficient (and perhaps not need a separate stack for
695 * the KSE, allowing it to use the thread's stack).
696 *
697 * XXX - This probably needs some work.
698 */
699#ifdef NOT_YET
700static void
701kse_sched_single(struct kse *curkse)
702{
703	struct pthread *curthread = curkse->k_curthread;
704	struct pthread *td_wait;
705	struct timespec ts;
706	int level;
707
708	if (curthread->active == 0) {
709		if (curthread->state != PS_RUNNING) {
710			/* Check to see if the thread has timed out. */
711			KSE_GET_TOD(curkse, &ts);
712			if (thr_timedout(curthread, &ts) != 0) {
713				curthread->timeout = 1;
714				curthread->state = PS_RUNNING;
715			}
716		}
717	}
718
719	/* This thread no longer needs to yield the CPU: */
720	curthread->critical_yield = 0;
721	curthread->need_switchout = 0;
722
723	/*
724	 * Lock the scheduling queue.
725	 *
726	 * There is no scheduling queue for single threaded KSEs,
727	 * but we need a lock for protection regardless.
728	 */
729	KSE_SCHED_LOCK(curkse, curkse->k_kseg);
730
731	/*
732	 * This has to do the job of kse_switchout_thread(), only
733	 * for a single threaded KSE/KSEG.
734	 */
735
736	switch (curthread->state) {
737	case PS_DEAD:
738		/* Unlock the scheduling queue and exit the KSE and thread. */
739		thr_cleanup(curkse, curthread);
740		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
741		break;
742
743	case PS_COND_WAIT:
744	case PS_SLEEP_WAIT:
745		/* Only insert threads that can timeout: */
746		if (curthread->wakeup_time.tv_sec != -1) {
747			/* Insert into the waiting queue: */
748			KSE_WAITQ_INSERT(curkse, curthread);
749		}
750		break;
751
752	case PS_LOCKWAIT:
753		level = curthread->locklevel - 1;
754		if (!_LCK_GRANTED(&curthread->lockusers[level]))
755			KSE_WAITQ_INSERT(curkse, curthread);
756		else
757			THR_SET_STATE(curthread, PS_RUNNING);
758		break;
759
760	case PS_JOIN:
761	case PS_MUTEX_WAIT:
762	case PS_RUNNING:
763	case PS_SIGSUSPEND:
764	case PS_SIGWAIT:
765	case PS_SUSPENDED:
766	case PS_DEADLOCK:
767	default:
768		/*
769		 * These states don't timeout and don't need
770		 * to be in the waiting queue.
771		 */
772		break;
773	}
774	while (curthread->state != PS_RUNNING) {
775		curthread->active = 0;
776		td_wait = KSE_WAITQ_FIRST(curkse);
777
778		kse_wait(curkse, td_wait);
779
780	    	if (td_wait != NULL) {
781			KSE_GET_TOD(curkse, &ts);
782			if (thr_timedout(curthread, &ts)) {
783				/* Indicate the thread timed out: */
784				td_wait->timeout = 1;
785
786				/* Make the thread runnable. */
787				THR_SET_STATE(td_wait, PS_RUNNING);
788				KSE_WAITQ_REMOVE(curkse, td_wait);
789			}
790		}
791	}
792
793	/* Remove the frame reference. */
794	curthread->curframe = NULL;
795
796	/* Unlock the scheduling queue. */
797	KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
798
799	/*
800	 * Continue the thread at its current frame:
801	 */
802	DBG_MSG("Continuing bound thread %p\n", curthread);
803	_thread_switch(&curthread->tmbx, &curkse->k_mbx.km_curthread);
804	PANIC("Thread has returned from _thread_switch");
805}
806#endif
807
808#ifdef DEBUG_THREAD_KERN
809static void
810dump_queues(struct kse *curkse)
811{
812	struct pthread *thread;
813
814	DBG_MSG("Threads in waiting queue:\n");
815	TAILQ_FOREACH(thread, &curkse->k_kseg->kg_schedq.sq_waitq, pqe) {
816		DBG_MSG("  thread %p, state %d, blocked %d\n",
817		    thread, thread->state, thread->blocked);
818	}
819}
820#endif
821
822/*
823 * This is the scheduler for a KSE which runs multiple threads.
824 */
825static void
826kse_sched_multi(struct kse *curkse)
827{
828	struct pthread *curthread, *td_wait;
829	struct pthread_sigframe *curframe;
830	int ret;
831
832	THR_ASSERT(curkse->k_mbx.km_curthread == NULL,
833	    "Mailbox not null in kse_sched_multi");
834
835	/* Check for first time initialization: */
836	if ((curkse->k_flags & KF_INITIALIZED) == 0) {
837		/* Setup this KSEs specific data. */
838		_ksd_setprivate(&curkse->k_ksd);
839		_set_curkse(curkse);
840
841		/* Set this before grabbing the context. */
842		curkse->k_flags |= KF_INITIALIZED;
843	}
844
845	/* This may have returned from a kse_release(). */
846	if (KSE_WAITING(curkse)) {
847		DBG_MSG("Entered upcall when KSE is waiting.");
848		KSE_CLEAR_WAIT(curkse);
849	}
850
851	/* If this is an upcall, take the scheduler lock. */
852	if (curkse->k_switch == 0)
853		KSE_SCHED_LOCK(curkse, curkse->k_kseg);
854	curkse->k_switch = 0;
855
856	curthread = curkse->k_curthread;
857
858	if (KSE_IS_IDLE(curkse)) {
859		KSE_CLEAR_IDLE(curkse);
860		curkse->k_kseg->kg_idle_kses--;
861	}
862	/*
863	 * If the current thread was completed in another KSE, then
864	 * it will be in the run queue.  Don't mark it as being blocked.
865	 */
866	if ((curthread != NULL) &&
867	    ((curthread->flags & THR_FLAGS_IN_RUNQ) == 0) &&
868	    (curthread->need_switchout == 0)) {
869		/*
870		 * Assume the current thread is blocked; when the
871		 * completed threads are checked and if the current
872		 * thread is among the completed, the blocked flag
873		 * will be cleared.
874		 */
875		curthread->blocked = 1;
876	}
877
878	/* Check for any unblocked threads in the kernel. */
879	kse_check_completed(curkse);
880
881	/*
882	 * Check for threads that have timed-out.
883	 */
884	kse_check_waitq(curkse);
885
886	/*
887	 * Switchout the current thread, if necessary, as the last step
888	 * so that it is inserted into the run queue (if it's runnable)
889	 * _after_ any other threads that were added to it above.
890	 */
891	if (curthread == NULL)
892		;  /* Nothing to do here. */
893	else if ((curthread->need_switchout == 0) &&
894	    (curthread->blocked == 0) && (THR_IN_CRITICAL(curthread))) {
895		/*
896		 * Resume the thread and tell it to yield when
897		 * it leaves the critical region.
898		 */
899		curthread->critical_yield = 1;
900		curthread->active = 1;
901		if ((curthread->flags & THR_FLAGS_IN_RUNQ) != 0)
902			KSE_RUNQ_REMOVE(curkse, curthread);
903		curkse->k_curthread = curthread;
904		curthread->kse = curkse;
905		DBG_MSG("Continuing thread %p in critical region\n",
906		    curthread);
907		kse_wakeup_multi(curkse);
908		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
909		ret = _thread_switch(&curthread->tmbx,
910		    &curkse->k_mbx.km_curthread);
911		if (ret != 0)
912			PANIC("Can't resume thread in critical region\n");
913	}
914	else if ((curthread->flags & THR_FLAGS_IN_RUNQ) == 0)
915		kse_switchout_thread(curkse, curthread);
916	curkse->k_curthread = NULL;
917
918	kse_wakeup_multi(curkse);
919
920#ifdef DEBUG_THREAD_KERN
921	dump_queues(curkse);
922#endif
923
924	/* Check if there are no threads ready to run: */
925	while (((curthread = KSE_RUNQ_FIRST(curkse)) == NULL) &&
926	    (curkse->k_kseg->kg_threadcount != 0)) {
927		/*
928		 * Wait for a thread to become active or until there are
929		 * no more threads.
930		 */
931		td_wait = KSE_WAITQ_FIRST(curkse);
932		kse_wait(curkse, td_wait);
933		kse_check_completed(curkse);
934		kse_check_waitq(curkse);
935	}
936
937	/* Check for no more threads: */
938	if (curkse->k_kseg->kg_threadcount == 0) {
939		/*
940		 * Normally this shouldn't return, but it will if there
941		 * are other KSEs running that create new threads that
942		 * are assigned to this KSE[G].  For instance, if a scope
943		 * system thread were to create a scope process thread
944		 * and this kse[g] is the initial kse[g], then that newly
945		 * created thread would be assigned to us (the initial
946		 * kse[g]).
947		 */
948		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
949		kse_fini(curkse);
950		/* never returns */
951	}
952
953	THR_ASSERT(curthread != NULL,
954	    "Return from kse_wait/fini without thread.");
955	THR_ASSERT(curthread->state != PS_DEAD,
956	    "Trying to resume dead thread!");
957	KSE_RUNQ_REMOVE(curkse, curthread);
958
959	/*
960	 * Make the selected thread the current thread.
961	 */
962	curkse->k_curthread = curthread;
963
964	/*
965	 * Make sure the current thread's kse points to this kse.
966	 */
967	curthread->kse = curkse;
968
969	/*
970	 * Reset accounting.
971	 */
972	curthread->tmbx.tm_uticks = 0;
973	curthread->tmbx.tm_sticks = 0;
974
975	/*
976	 * Reset the time slice if this thread is running for the first
977	 * time or running again after using its full time slice allocation.
978	 */
979	if (curthread->slice_usec == -1)
980		curthread->slice_usec = 0;
981
982	/* Mark the thread active. */
983	curthread->active = 1;
984
985	/* Remove the frame reference. */
986	curframe = curthread->curframe;
987	curthread->curframe = NULL;
988
989	kse_wakeup_multi(curkse);
990
991	/*
992	 * The thread's current signal frame will only be NULL if it
993	 * is being resumed after being blocked in the kernel.  In
994	 * this case, and if the thread needs to run down pending
995	 * signals or needs a cancellation check, we need to add a
996	 * signal frame to the thread's context.
997	 */
998#ifdef NOT_YET
999	if ((((curframe == NULL) && (curthread->check_pending != 0)) ||
1000	    (((curthread->cancelflags & THR_AT_CANCEL_POINT) == 0) &&
1001	     ((curthread->cancelflags & PTHREAD_CANCEL_ASYNCHRONOUS) != 0))) &&
1002	     !THR_IN_CRITICAL(curthread))
1003		signalcontext(&curthread->tmbx.tm_context, 0,
1004		    (__sighandler_t *)thr_resume_wrapper);
1005#else
1006	if ((curframe == NULL) && (curthread->check_pending != 0) &&
1007	    !THR_IN_CRITICAL(curthread)) {
1008		curthread->check_pending = 0;
1009		signalcontext(&curthread->tmbx.tm_context, 0,
1010		    (__sighandler_t *)thr_resume_wrapper);
1011	}
1012#endif
1013	/*
1014	 * Continue the thread at its current frame:
1015	 */
1016	if (curthread->lock_switch != 0) {
1017		/*
1018		 * This thread came from a scheduler switch; it will
1019		 * unlock the scheduler lock and set the mailbox.
1020		 */
1021		ret = _thread_switch(&curthread->tmbx, NULL);
1022	} else {
1023		/* This thread won't unlock the scheduler lock. */
1024		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
1025		ret = _thread_switch(&curthread->tmbx,
1026		    &curkse->k_mbx.km_curthread);
1027	}
1028	if (ret != 0)
1029		PANIC("Thread has returned from _thread_switch");
1030
1031	/* This point should not be reached. */
1032	PANIC("Thread has returned from _thread_switch");
1033}
1034
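/*
 * Signal trampoline installed via signalcontext() in kse_sched_multi();
 * it runs down pending signals for the resuming thread and then
 * switches back to the thread's saved context.
 */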
1035static void
1036thr_resume_wrapper(int sig, siginfo_t *siginfo, ucontext_t *ucp)
1037{
1038	struct pthread *curthread = _get_curthread();
1039	struct kse *curkse;
1040	int ret, err_save = curthread->error;
1041
1042	DBG_MSG(">>> sig wrapper\n");
1043	if (curthread->lock_switch)
1044		PANIC("thr_resume_wrapper, lock_switch != 0\n");
1045	thr_resume_check(curthread, ucp, NULL);
1046	_kse_critical_enter();
1047	curkse = _get_curkse();
1048	curthread->tmbx.tm_context = *ucp;
1049	curthread->error = err_save;
1050	ret = _thread_switch(&curthread->tmbx, &curkse->k_mbx.km_curthread);
1051	if (ret != 0)
1052		PANIC("thr_resume_wrapper: thread has returned "
1053		      "from _thread_switch");
1054	/* THR_SETCONTEXT(ucp); */ /* doesn't work, why? */
1055}
1056
1057static void
1058thr_resume_check(struct pthread *curthread, ucontext_t *ucp,
1059    struct pthread_sigframe *psf)
1060{
1061	_thr_sig_rundown(curthread, ucp, psf);
1062
1063#ifdef NOT_YET
1064	if (((curthread->cancelflags & THR_AT_CANCEL_POINT) == 0) &&
1065	    ((curthread->cancelflags & PTHREAD_CANCEL_ASYNCHRONOUS) != 0))
1066		pthread_testcancel();
1067#endif
1068}
1069
1070/*
1071 * Clean up a thread.  This must be called with the thread's KSE
1072 * scheduling lock held.  The thread must be a thread from the
1073 * KSE's group.
1074 */
1075static void
1076thr_cleanup(struct kse *curkse, struct pthread *thread)
1077{
1078	struct pthread *joiner;
1079	int sys_scope;
1080
1081	if ((joiner = thread->joiner) != NULL) {
1082		/* Joinee scheduler lock held; joiner won't leave. */
1083		if (joiner->kseg == curkse->k_kseg) {
1084			if (joiner->join_status.thread == thread) {
1085				joiner->join_status.thread = NULL;
1086				joiner->join_status.ret = thread->ret;
1087				_thr_setrunnable_unlocked(joiner);
1088			}
1089		} else {
1090			KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
1091			/* The joiner may have removed itself and exited. */
1092			if (_thr_ref_add(thread, joiner, 0) == 0) {
1093				KSE_SCHED_LOCK(curkse, joiner->kseg);
1094				if (joiner->join_status.thread == thread) {
1095					joiner->join_status.thread = NULL;
1096					joiner->join_status.ret = thread->ret;
1097					_thr_setrunnable_unlocked(joiner);
1098				}
1099				KSE_SCHED_UNLOCK(curkse, joiner->kseg);
1100				_thr_ref_delete(thread, joiner);
1101			}
1102			KSE_SCHED_LOCK(curkse, curkse->k_kseg);
1103		}
1104		thread->attr.flags |= PTHREAD_DETACHED;
1105	}
1106
1107	if (!(sys_scope = (thread->attr.flags & PTHREAD_SCOPE_SYSTEM))) {
1108		/*
1109		 * Remove the thread from the KSEG's list of threads.
1110	 	 */
1111		KSEG_THRQ_REMOVE(thread->kseg, thread);
1112		/*
1113		 * Migrate the thread to the main KSE so that this
1114		 * KSE and KSEG can be cleaned when their last thread
1115		 * exits.
1116		 */
1117		thread->kseg = _kse_initial->k_kseg;
1118		thread->kse = _kse_initial;
1119	}
1120	thread->flags |= THR_FLAGS_GC_SAFE;
1121
1122	/*
1123	 * We can't hold the thread list lock while holding the
1124	 * scheduler lock.
1125	 */
1126	KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
1127	DBG_MSG("Adding thread %p to GC list\n", thread);
1128	KSE_LOCK_ACQUIRE(curkse, &_thread_list_lock);
1129	THR_GCLIST_ADD(thread);
1130	/* Use thread_list_lock */
1131	active_threads--;
1132	if (active_threads == 1) {
1133		KSE_LOCK_RELEASE(curkse, &_thread_list_lock);
1134		exit(0);
1135        }
1136	KSE_LOCK_RELEASE(curkse, &_thread_list_lock);
1137	if (sys_scope) {
1138		/*
1139		 * A system scope thread is in its own thread group; when
1140		 * the thread exits, its kse and ksegrp should be recycled
1141		 * as well.
1142		 */
1143		kse_exit();
1144		PANIC("kse_exit() failed for system scope thread");
1145	}
1146	KSE_SCHED_LOCK(curkse, curkse->k_kseg);
1147}
1148
1149void
1150_thr_gc(struct pthread *curthread)
1151{
1152	struct pthread *td, *td_next;
1153	kse_critical_t crit;
1154	TAILQ_HEAD(, pthread) worklist;
1155
1156	TAILQ_INIT(&worklist);
1157	crit = _kse_critical_enter();
1158	KSE_LOCK_ACQUIRE(curthread->kse, &_thread_list_lock);
1159
1160	/* Check the threads waiting for GC. */
1161	for (td = TAILQ_FIRST(&_thread_gc_list); td != NULL; td = td_next) {
1162		td_next = TAILQ_NEXT(td, gcle);
1163		if ((td->flags & THR_FLAGS_GC_SAFE) == 0)
1164			continue;
1165		else if (((td->attr.flags & PTHREAD_SCOPE_SYSTEM) != 0) &&
1166		    ((td->kse->k_mbx.km_flags & KMF_DONE) == 0)) {
1167			/*
1168			 * The thread and KSE are operating on the same
1169			 * stack.  Wait for the KSE to exit before freeing
1170			 * the thread's stack as well as everything else.
1171			 */
1172			continue;
1173		}
1174		/*
1175		 * Remove the thread from the GC list.  If the thread
1176		 * isn't yet detached, it will get added back to the
1177		 * GC list at a later time.
1178		 */
1179		THR_GCLIST_REMOVE(td);
1180		DBG_MSG("Freeing thread %p stack\n", td);
1181		/*
1182		 * We can free the thread stack since it's no longer
1183		 * in use.
1184		 */
1185		_thr_stack_free(&td->attr);
1186		if (((td->attr.flags & PTHREAD_DETACHED) != 0) &&
1187		    (td->refcount == 0)) {
1188			/*
1189			 * The thread has detached and is no longer
1190			 * referenced.  It is safe to remove all
1191			 * remnants of the thread.
1192			 */
1193			THR_LIST_REMOVE(td);
1194			TAILQ_INSERT_HEAD(&worklist, td, gcle);
1195		}
1196	}
1197	KSE_LOCK_RELEASE(curthread->kse, &_thread_list_lock);
1198	_kse_critical_leave(crit);
1199
1200	while ((td = TAILQ_FIRST(&worklist)) != NULL) {
1201		TAILQ_REMOVE(&worklist, td, gcle);
1202
1203		if ((td->attr.flags & PTHREAD_SCOPE_SYSTEM) != 0) {
1204			crit = _kse_critical_enter();
1205			KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
1206			kse_free_unlocked(td->kse);
1207			kseg_free_unlocked(td->kseg);
1208			KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
1209			_kse_critical_leave(crit);
1210		}
1211		/*
1212		 * XXX we don't free the initial thread, because some code
1213		 * might still be referencing it.
1214		 */
1215		if (td != _thr_initial) {
1216			DBG_MSG("Freeing thread %p\n", td);
1217			_thr_free(curthread, td);
1218		} else
1219			DBG_MSG("Initial thread won't be freed\n");
1220	}
1221	/* XXX the free kse and ksegrp lists should be examined as well */
1222}
1223
1224
1225/*
1226 * Only new threads that are running or suspended may be scheduled.
1227 */
1228int
1229_thr_schedule_add(struct pthread *curthread, struct pthread *newthread)
1230{
1231	kse_critical_t crit;
1232	int ret;
1233
1234	/* Add the new thread. */
1235	thr_link(newthread);
1236
1237	/*
1238	 * A system scope thread has its own KSE/KSEG pair which hasn't
1239	 * been started yet; set up its mailbox and start it here.
1240	 */
1241	if ((newthread->attr.flags & PTHREAD_SCOPE_SYSTEM) != 0) {
1242#ifdef NOT_YET
1243		/* We use the thread's stack as the KSE's stack. */
1244		newthread->kse->k_mbx.km_stack.ss_sp =
1245		    newthread->attr.stackaddr_attr;
1246		newthread->kse->k_mbx.km_stack.ss_size =
1247		    newthread->attr.stacksize_attr;
1248#endif
1249		/*
1250		 * No need to lock the scheduling queue since the
1251		 * KSE/KSEG pair have not yet been started.
1252		 * KSE/KSEG pair has not yet been started.
1253		KSEG_THRQ_ADD(newthread->kseg, newthread);
1254		if (newthread->state == PS_RUNNING)
1255			THR_RUNQ_INSERT_TAIL(newthread);
1256		newthread->kse->k_curthread = NULL;
1257		newthread->kse->k_mbx.km_flags = 0;
1258		newthread->kse->k_mbx.km_func = (kse_func_t *)kse_sched_multi;
1259		newthread->kse->k_mbx.km_quantum = 0;
1260
1261		/*
1262		 * This thread needs a new KSE and KSEG.
1263		 */
1264		newthread->kse->k_flags &= ~KF_INITIALIZED;
1265		newthread->kse->k_flags |= KF_STARTED;
1266		ret = kse_create(&newthread->kse->k_mbx, 1);
1267		if (ret != 0)
1268			ret = errno;
1269	}
1270	else {
1271		/*
1272		 * Lock the KSE and add the new thread to its list of
1273		 * assigned threads.  If the new thread is runnable, also
1274		 * add it to the KSE's run queue.
1275		 */
1276		crit = _kse_critical_enter();
1277		KSE_SCHED_LOCK(curthread->kse, newthread->kseg);
1278		KSEG_THRQ_ADD(newthread->kseg, newthread);
1279		if (newthread->state == PS_RUNNING)
1280			THR_RUNQ_INSERT_TAIL(newthread);
1281		if ((newthread->kse->k_flags & KF_STARTED) == 0) {
1282			/*
1283			 * This KSE hasn't been started yet.  Start it
1284			 * outside of holding the lock.
1285			 */
1286			newthread->kse->k_flags |= KF_STARTED;
1287			newthread->kse->k_mbx.km_func =
1288			    (kse_func_t *)kse_sched_multi;
1289			newthread->kse->k_mbx.km_flags = 0;
1290			kse_create(&newthread->kse->k_mbx, 0);
1291		} else if ((newthread->state == PS_RUNNING) &&
1292		     KSE_IS_IDLE(newthread->kse)) {
1293			/*
1294			 * The thread is being scheduled on another KSEG.
1295			 */
1296			kse_wakeup_one(newthread);
1297		}
1298		KSE_SCHED_UNLOCK(curthread->kse, newthread->kseg);
1299		_kse_critical_leave(crit);
1300		ret = 0;
1301	}
1302	if (ret != 0)
1303		thr_unlink(newthread);
1304
1305	return (ret);
1306}
1307
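/*
 * Insert a thread into its KSE's wait queue.  The queue is kept
 * ordered by wakeup time; threads with no timeout (tv_sec == -1)
 * are kept at the tail.
 */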
1308void
1309kse_waitq_insert(struct pthread *thread)
1310{
1311	struct pthread *td;
1312
1313	if (thread->wakeup_time.tv_sec == -1)
1314		TAILQ_INSERT_TAIL(&thread->kse->k_schedq->sq_waitq, thread,
1315		    pqe);
1316	else {
1317		td = TAILQ_FIRST(&thread->kse->k_schedq->sq_waitq);
1318		while ((td != NULL) && (td->wakeup_time.tv_sec != -1) &&
1319		    ((td->wakeup_time.tv_sec < thread->wakeup_time.tv_sec) ||
1320		    ((td->wakeup_time.tv_sec == thread->wakeup_time.tv_sec) &&
1321		    (td->wakeup_time.tv_nsec <= thread->wakeup_time.tv_nsec))))
1322			td = TAILQ_NEXT(td, pqe);
1323		if (td == NULL)
1324			TAILQ_INSERT_TAIL(&thread->kse->k_schedq->sq_waitq,
1325			    thread, pqe);
1326		else
1327			TAILQ_INSERT_BEFORE(td, thread, pqe);
1328	}
1329	thread->flags |= THR_FLAGS_IN_WAITQ;
1330}
1331
1332/*
1333 * This must be called with the scheduling lock held.
1334 */
1335static void
1336kse_check_completed(struct kse *kse)
1337{
1338	struct pthread *thread;
1339	struct kse_thr_mailbox *completed;
1340	int sig;
1341
1342	if ((completed = kse->k_mbx.km_completed) != NULL) {
1343		kse->k_mbx.km_completed = NULL;
1344		while (completed != NULL) {
1345			thread = completed->tm_udata;
1346			DBG_MSG("Found completed thread %p, name %s\n",
1347			    thread,
1348			    (thread->name == NULL) ? "none" : thread->name);
1349			thread->blocked = 0;
1350			if (thread != kse->k_curthread) {
1351				if ((thread->flags & THR_FLAGS_SUSPENDED) != 0)
1352					THR_SET_STATE(thread, PS_SUSPENDED);
1353				else
1354					KSE_RUNQ_INSERT_TAIL(kse, thread);
1355				if ((thread->kse != kse) &&
1356				    (thread->kse->k_curthread == thread)) {
1357					thread->kse->k_curthread = NULL;
1358					thread->active = 0;
1359				}
1360			}
1361			if ((sig = thread->tmbx.tm_syncsig.si_signo) != 0) {
1362				if (SIGISMEMBER(thread->sigmask, sig))
1363					SIGADDSET(thread->sigpend, sig);
1364				else
1365					_thr_sig_add(thread, sig, &thread->tmbx.tm_syncsig);
1366				thread->tmbx.tm_syncsig.si_signo = 0;
1367			}
1368			completed = completed->tm_next;
1369		}
1370	}
1371}
1372
1373/*
1374 * This must be called with the scheduling lock held.
1375 */
1376static void
1377kse_check_waitq(struct kse *kse)
1378{
1379	struct pthread	*pthread;
1380	struct timespec ts;
1381
1382	KSE_GET_TOD(kse, &ts);
1383
1384	/*
1385	 * Wake up threads that have timed out.  This has to be
1386	 * done before adding the current thread to the run queue
1387	 * so that a CPU intensive thread doesn't get preference
1388	 * over waiting threads.
1389	 */
1390	while (((pthread = KSE_WAITQ_FIRST(kse)) != NULL) &&
1391	    thr_timedout(pthread, &ts)) {
1392		/* Remove the thread from the wait queue: */
1393		KSE_WAITQ_REMOVE(kse, pthread);
1394		DBG_MSG("Found timedout thread %p in waitq\n", pthread);
1395
1396		/* Indicate the thread timed out: */
1397		pthread->timeout = 1;
1398
1399		/* Add the thread to the priority queue: */
1400		if ((pthread->flags & THR_FLAGS_SUSPENDED) != 0)
1401			THR_SET_STATE(pthread, PS_SUSPENDED);
1402		else {
1403			THR_SET_STATE(pthread, PS_RUNNING);
1404			KSE_RUNQ_INSERT_TAIL(kse, pthread);
1405		}
1406	}
1407}
1408
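/*
 * Return non-zero if the thread's wakeup time has passed.  A
 * wakeup_time.tv_sec of -1 means the thread has no timeout.
 */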
1409static int
1410thr_timedout(struct pthread *thread, struct timespec *curtime)
1411{
1412	if (thread->wakeup_time.tv_sec < 0)
1413		return (0);
1414	else if (thread->wakeup_time.tv_sec > curtime->tv_sec)
1415		return (0);
1416	else if ((thread->wakeup_time.tv_sec == curtime->tv_sec) &&
1417	    (thread->wakeup_time.tv_nsec > curtime->tv_nsec))
1418		return (0);
1419	else
1420		return (1);
1421}
1422
1423/*
1424 * This must be called with the scheduling lock held.
1425 *
1426 * Each thread has a time slice, a wakeup time (used when it wants
1427 * to wait for a specified amount of time), a run state, and an
1428 * active flag.
1429 *
1430 * When a thread gets run by the scheduler, the active flag is
1431 * set to non-zero (1).  When a thread performs an explicit yield
1432 * or schedules a state change, it enters the scheduler and the
1433 * active flag is cleared.  When the active flag is still seen
1434 * set in the scheduler, that means that the thread is blocked in
1435 * the kernel (because it is cleared before entering the scheduler
1436 * in all other instances).
1437 *
1438 * The wakeup time is only set for those states that can timeout.
1439 * It is set to (-1, -1) for all other instances.
1440 *
1441 * The thread's run state, aside from being useful when debugging,
1442 * is used to place the thread in an appropriate queue.  There
1443 * are 2 basic queues:
1444 *
1445 *   o run queue - queue ordered by priority for all threads
1446 *                 that are runnable
1447 *   o waiting queue - queue sorted by wakeup time for all threads
1448 *                     that are not otherwise runnable (not blocked
1449 *                     in kernel, not waiting for locks)
1450 *
1451 * The thread's time slice is used for round-robin scheduling
1452 * (the default scheduling policy).  While a SCHED_RR thread
1453 * is runnable, its time slice accumulates.  When the slice reaches
1454 * the time slice interval, the thread is moved to the end of the
1455 * queue of threads at its priority and the slice is reset.  When a
1456 * thread is no longer runnable (blocks in kernel, waits, etc), its
1457 * time slice is reset.
1458 *
1459 * The job of kse_switchout_thread() is to handle all of the above.
1460 */
1461static void
1462kse_switchout_thread(struct kse *kse, struct pthread *thread)
1463{
1464	int level;
1465	int i;
1466	int restart;
1467	siginfo_t siginfo;
1468
1469	/*
1470	 * Place the currently running thread into the
1471	 * appropriate queue(s).
1472	 */
1473	DBG_MSG("Switching out thread %p, state %d\n", thread, thread->state);
1474
1475	THR_DEACTIVATE_LAST_LOCK(thread);
1476	if (thread->blocked != 0) {
1477		thread->active = 0;
1478		thread->need_switchout = 0;
1479		/* This thread must have blocked in the kernel. */
1480		/* thread->slice_usec = -1;*/	/* restart timeslice */
1481		if ((thread->slice_usec != -1) &&
1482		    (thread->attr.sched_policy != SCHED_FIFO))
1483			thread->slice_usec += (thread->tmbx.tm_uticks
1484			    + thread->tmbx.tm_sticks) * _clock_res_usec;
1485		/*
1486		 *  Check for pending signals for this thread to
1487		 *  see if we need to interrupt it in the kernel.
1488		 */
1489		if (thread->check_pending != 0) {
1490			for (i = 1; i <= _SIG_MAXSIG; ++i) {
1491				if (SIGISMEMBER(thread->sigpend, i) &&
1492				    !SIGISMEMBER(thread->sigmask, i)) {
1493					restart = _thread_sigact[i - 1].sa_flags & SA_RESTART;
1494					kse_thr_interrupt(&thread->tmbx,
1495					    restart ? -2 : -1);
1496					break;
1497				}
1498			}
1499		}
1500	}
1501	else {
1502		switch (thread->state) {
1503		case PS_DEAD:
1504			/*
1505			 * The scheduler is operating on a different
1506			 * stack.  It is safe to do garbage collecting
1507			 * here.
1508			 */
1509			thread->active = 0;
1510			thread->need_switchout = 0;
1511			thr_cleanup(kse, thread);
1512			return;
1513			break;
1514
1515		case PS_RUNNING:
1516			if ((thread->flags & THR_FLAGS_SUSPENDED) != 0)
1517				THR_SET_STATE(thread, PS_SUSPENDED);
1518			break;
1519
1520		case PS_COND_WAIT:
1521		case PS_SLEEP_WAIT:
1522			/* Insert into the waiting queue: */
1523			KSE_WAITQ_INSERT(kse, thread);
1524			break;
1525
1526		case PS_LOCKWAIT:
1527			/*
1528			 * This state doesn't timeout.
1529			 */
1530			thread->wakeup_time.tv_sec = -1;
1531			thread->wakeup_time.tv_nsec = -1;
1532			level = thread->locklevel - 1;
1533			if (!_LCK_GRANTED(&thread->lockusers[level]))
1534				KSE_WAITQ_INSERT(kse, thread);
1535			else
1536				THR_SET_STATE(thread, PS_RUNNING);
1537			break;
1538
1539		case PS_SIGWAIT:
1540			KSE_WAITQ_INSERT(kse, thread);
1541			break;
1542		case PS_JOIN:
1543		case PS_MUTEX_WAIT:
1544		case PS_SIGSUSPEND:
1545		case PS_SUSPENDED:
1546		case PS_DEADLOCK:
1547		default:
1548			/*
1549			 * These states don't timeout.
1550			 */
1551			thread->wakeup_time.tv_sec = -1;
1552			thread->wakeup_time.tv_nsec = -1;
1553
1554			/* Insert into the waiting queue: */
1555			KSE_WAITQ_INSERT(kse, thread);
1556			break;
1557		}
1558		if (thread->state != PS_RUNNING) {
1559			/* Restart the time slice: */
1560			thread->slice_usec = -1;
1561		} else {
1562			if (thread->need_switchout != 0)
1563				/*
1564				 * The thread yielded on its own;
1565				 * restart the timeslice.
1566				 */
1567				thread->slice_usec = -1;
1568			else if ((thread->slice_usec != -1) &&
1569	   		    (thread->attr.sched_policy != SCHED_FIFO)) {
1570				thread->slice_usec += (thread->tmbx.tm_uticks
1571				    + thread->tmbx.tm_sticks) * _clock_res_usec;
1572				/* Check for time quantum exceeded: */
1573				if (thread->slice_usec > TIMESLICE_USEC)
1574					thread->slice_usec = -1;
1575			}
1576			if (thread->slice_usec == -1) {
1577				/*
1578				 * The thread exceeded its time quantum or
1579				 * it yielded the CPU; place it at the tail
1580				 * of the queue for its priority.
1581				 */
1582				KSE_RUNQ_INSERT_TAIL(kse, thread);
1583			} else {
1584				/*
1585				 * The thread hasn't exceeded its interval.
1586				 * Place it at the head of the queue for its
1587				 * priority.
1588				 */
1589				KSE_RUNQ_INSERT_HEAD(kse, thread);
1590			}
1591		}
1592	}
1593	thread->active = 0;
1594	thread->need_switchout = 0;
1595	if (thread->check_pending != 0) {
1596		/* Install pending signals into the frame. */
1597		thread->check_pending = 0;
1598		KSE_LOCK_ACQUIRE(kse, &_thread_signal_lock);
1599		for (i = 1; i <= _SIG_MAXSIG; i++) {
1600			if (SIGISMEMBER(thread->sigmask, i))
1601				continue;
1602			if (SIGISMEMBER(thread->sigpend, i))
1603				_thr_sig_add(thread, i, &thread->siginfo[i-1]);
1604			else if (SIGISMEMBER(_thr_proc_sigpending, i) &&
1605				_thr_getprocsig_unlocked(i, &siginfo)) {
1606				_thr_sig_add(thread, i, &siginfo);
1607			}
1608		}
1609		KSE_LOCK_RELEASE(kse, &_thread_signal_lock);
1610	}
1611}
1612
1613/*
1614 * This function waits for the smallest timeout value of any waiting
1615 * thread, or until it receives a message from another KSE.
1616 *
1617 * This must be called with the scheduling lock held.
1618 */
1619static void
1620kse_wait(struct kse *kse, struct pthread *td_wait)
1621{
1622	struct timespec ts, ts_sleep;
1623	int saved_flags;
1624
1625	KSE_GET_TOD(kse, &ts);
1626
1627	if ((td_wait == NULL) || (td_wait->wakeup_time.tv_sec < 0)) {
1628		/* Limit sleep to no more than 1 minute. */
1629		ts_sleep.tv_sec = 60;
1630		ts_sleep.tv_nsec = 0;
1631	} else {
1632		TIMESPEC_SUB(&ts_sleep, &td_wait->wakeup_time, &ts);
1633		if (ts_sleep.tv_sec > 60) {
1634			ts_sleep.tv_sec = 60;
1635			ts_sleep.tv_nsec = 0;
1636		}
1637	}
1638	/* Don't sleep for negative times. */
1639	if ((ts_sleep.tv_sec >= 0) && (ts_sleep.tv_nsec >= 0)) {
1640		KSE_SET_IDLE(kse);
1641		kse->k_kseg->kg_idle_kses++;
1642		KSE_SCHED_UNLOCK(kse, kse->k_kseg);
1643		saved_flags = kse->k_mbx.km_flags;
1644		kse->k_mbx.km_flags |= KMF_NOUPCALL;
1645		kse_release(&ts_sleep);
1646		kse->k_mbx.km_flags = saved_flags;
1647		KSE_SCHED_LOCK(kse, kse->k_kseg);
1648		if (KSE_IS_IDLE(kse)) {
1649			KSE_CLEAR_IDLE(kse);
1650			kse->k_kseg->kg_idle_kses--;
1651		}
1652	}
1653}
1654
1655/*
1656 * This is deliberately not named kse_exit() so as not to confuse it
1657 * with the system call of the same name.
1658 */
1659static void
1660kse_fini(struct kse *kse)
1661{
1662	/* struct kse_group *free_kseg = NULL; */
1663	struct timespec ts;
1664
1665	/*
1666	 * Check to see if this is one of the main kses.
1667	 */
1668	if (kse->k_kseg != _kse_initial->k_kseg) {
1669		PANIC("shouldn't get here");
1670		/* This is for supporting thread groups. */
1671#ifdef NOT_YET
1672		/* Remove this KSE from the KSEG's list of KSEs. */
1673		KSE_SCHED_LOCK(kse, kse->k_kseg);
1674		TAILQ_REMOVE(&kse->k_kseg->kg_kseq, kse, k_kgqe);
1675		kse->k_kseg->kg_ksecount--;
1676		if (TAILQ_EMPTY(&kse->k_kseg->kg_kseq))
1677			free_kseg = kse->k_kseg;
1678		KSE_SCHED_UNLOCK(kse, kse->k_kseg);
1679
1680		/*
1681		 * Add this KSE to the list of free KSEs along with
1682		 * the KSEG if it is now orphaned.
1683		 */
1684		KSE_LOCK_ACQUIRE(kse, &kse_lock);
1685		if (free_kseg != NULL)
1686			kseg_free_unlocked(free_kseg);
1687		kse_free_unlocked(kse);
1688		KSE_LOCK_RELEASE(kse, &kse_lock);
1689		kse_exit();
1690		/* Never returns. */
1691		PANIC("kse_exit()");
1692#endif
1693	} else {
1694#ifdef NOT_YET
1695		/*
1696		 * In future, we might allow program to kill
1697		 * kse in initial group.
1698		 */
1699		if (kse != _kse_initial) {
1700			KSE_SCHED_LOCK(kse, kse->k_kseg);
1701			TAILQ_REMOVE(&kse->k_kseg->kg_kseq, kse, k_kgqe);
1702			kse->k_kseg->kg_ksecount--;
1703			KSE_SCHED_UNLOCK(kse, kse->k_kseg);
1704			KSE_LOCK_ACQUIRE(kse, &kse_lock);
1705			kse_free_unlocked(kse);
1706			KSE_LOCK_RELEASE(kse, &kse_lock);
1707			kse_exit();
1708                        /* Never returns. */
1709                        PANIC("kse_exit() failed for initial kseg");
1710                }
1711#endif
1712		KSE_SCHED_LOCK(kse, kse->k_kseg);
1713		KSE_SET_IDLE(kse);
1714		kse->k_kseg->kg_idle_kses++;
1715		KSE_SCHED_UNLOCK(kse, kse->k_kseg);
1716		ts.tv_sec = 120;
1717		ts.tv_nsec = 0;
1718		kse->k_mbx.km_flags = 0;
1719		kse_release(&ts);
1720		/* Never reached. */
1721	}
1722}
1723
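/*
 * Set the current thread's wakeup time from a relative timeout:
 * NULL means wait forever (-1, -1), a zero timeout means wake up
 * immediately, and anything else is added to the current time of day.
 */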
1724void
1725_thr_set_timeout(const struct timespec *timeout)
1726{
1727	struct pthread	*curthread = _get_curthread();
1728	struct timespec ts;
1729
1730	/* Reset the timeout flag for the running thread: */
1731	curthread->timeout = 0;
1732
1733	/* Check if the thread is to wait forever: */
1734	if (timeout == NULL) {
1735		/*
1736		 * Set the wakeup time to something that can be recognised as
1737		 * different to an actual time of day:
1738		 */
1739		curthread->wakeup_time.tv_sec = -1;
1740		curthread->wakeup_time.tv_nsec = -1;
1741	}
1742	/* Check if no waiting is required: */
1743	else if ((timeout->tv_sec == 0) && (timeout->tv_nsec == 0)) {
1744		/* Set the wake up time to 'immediately': */
1745		curthread->wakeup_time.tv_sec = 0;
1746		curthread->wakeup_time.tv_nsec = 0;
1747	} else {
1748		/* Calculate the time for the current thread to wakeup: */
1749		KSE_GET_TOD(curthread->kse, &ts);
1750		TIMESPEC_ADD(&curthread->wakeup_time, &ts, timeout);
1751	}
1752}
1753
1754void
1755_thr_panic_exit(char *file, int line, char *msg)
1756{
1757	char buf[256];
1758
1759	snprintf(buf, sizeof(buf), "(%s:%d) %s\n", file, line, msg);
1760	__sys_write(2, buf, strlen(buf));
1761	abort();
1762}
1763
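/*
 * Locked wrapper around _thr_setrunnable_unlocked(): the target
 * thread's KSE group scheduling lock is held across the state change.
 */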
1764void
1765_thr_setrunnable(struct pthread *curthread, struct pthread *thread)
1766{
1767	kse_critical_t crit;
1768
1769	crit = _kse_critical_enter();
1770	KSE_SCHED_LOCK(curthread->kse, thread->kseg);
1771	_thr_setrunnable_unlocked(thread);
1772	KSE_SCHED_UNLOCK(curthread->kse, thread->kseg);
1773	_kse_critical_leave(crit);
1774}
1775
1776void
1777_thr_setrunnable_unlocked(struct pthread *thread)
1778{
1779	if ((thread->kseg->kg_flags & KGF_SINGLE_THREAD) != 0) {
1780		/* No silly queues for these threads. */
1781		if ((thread->flags & THR_FLAGS_SUSPENDED) != 0)
1782			THR_SET_STATE(thread, PS_SUSPENDED);
1783		else
1784			THR_SET_STATE(thread, PS_RUNNING);
1785	} else if (thread->state != PS_RUNNING) {
1786		if ((thread->flags & THR_FLAGS_IN_WAITQ) != 0)
1787			KSE_WAITQ_REMOVE(thread->kse, thread);
1788		if ((thread->flags & THR_FLAGS_SUSPENDED) != 0)
1789			THR_SET_STATE(thread, PS_SUSPENDED);
1790		else {
1791			THR_SET_STATE(thread, PS_RUNNING);
1792			if ((thread->blocked == 0) && (thread->active == 0) &&
1793			    (thread->flags & THR_FLAGS_IN_RUNQ) == 0)
1794				THR_RUNQ_INSERT_TAIL(thread);
1795		}
1796	}
1797        /*
1798         * XXX - Threads are not yet assigned to specific KSEs; they are
1799         *       assigned to the KSEG.  So the fact that a thread's KSE is
1800         *       waiting doesn't necessarily mean that it will be the KSE
1801         *       that runs the thread after the lock is granted.  But we
1802         *       don't know if the other KSEs within the same KSEG are
1803         *       also in a waiting state or not, so we err on the side of
1804         *       caution and wake up the thread's last known KSE.  We
1805         *       ensure that the thread's KSE doesn't change while its
1806         *       scheduling lock is held, so it is safe to reference it
1807         *       (the KSE).  If the KSE wakes up and doesn't find any more
1808         *       work it will again go back to waiting so no harm is done.
1809         */
1810	kse_wakeup_one(thread);
1811}
1812
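/*
 * Wake the thread's last known KSE if it is idle; otherwise wake
 * the first idle KSE found in the thread's KSE group, if any.
 */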
1813static void
1814kse_wakeup_one(struct pthread *thread)
1815{
1816	struct kse *ke;
1817
1818	if (KSE_IS_IDLE(thread->kse)) {
1819		KSE_CLEAR_IDLE(thread->kse);
1820		thread->kseg->kg_idle_kses--;
1821		KSE_WAKEUP(thread->kse);
1822	} else {
1823		TAILQ_FOREACH(ke, &thread->kseg->kg_kseq, k_kgqe) {
1824			if (KSE_IS_IDLE(ke)) {
1825				KSE_CLEAR_IDLE(ke);
1826				ke->k_kseg->kg_idle_kses--;
1827				KSE_WAKEUP(ke);
1828				return;
1829			}
1830		}
1831	}
1832}
1833
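/*
 * Wake one idle KSE in the current KSE's group for each thread on
 * the current run queue, up to the number of idle KSEs available.
 */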
1834static void
1835kse_wakeup_multi(struct kse *curkse)
1836{
1837	struct kse *ke;
1838	int tmp;
1839
1840	if ((tmp = KSE_RUNQ_THREADS(curkse)) && curkse->k_kseg->kg_idle_kses) {
1841		TAILQ_FOREACH(ke, &curkse->k_kseg->kg_kseq, k_kgqe) {
1842			if (KSE_IS_IDLE(ke)) {
1843				KSE_CLEAR_IDLE(ke);
1844				ke->k_kseg->kg_idle_kses--;
1845				KSE_WAKEUP(ke);
1846				if (--tmp == 0)
1847					break;
1848			}
1849		}
1850	}
1851}
1852
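/* Return the currently running thread (from the KSE-specific data). */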
1853struct pthread *
1854_get_curthread(void)
1855{
1856	return (_ksd_curthread());
1857}
1858
1859/* This assumes the caller has disabled upcalls. */
1860struct kse *
1861_get_curkse(void)
1862{
1863	return (_ksd_curkse());
1864}
1865
1866void
1867_set_curkse(struct kse *kse)
1868{
1869	_ksd_setprivate(&kse->k_ksd);
1870}
1871
1872/*
1873 * Allocate a new KSEG.
1874 *
1875 * We allow the current thread to be NULL in the case that this
1876 * is the first time a KSEG is being created (library initialization).
1877 * In this case, we don't need to (and can't) take any locks.
1878 */
1879struct kse_group *
1880_kseg_alloc(struct pthread *curthread)
1881{
1882	struct kse_group *kseg = NULL;
1883	kse_critical_t crit;
1884
1885	if ((curthread != NULL) && (free_kseg_count > 0)) {
1886		/* Use the kse lock for the kseg queue. */
1887		crit = _kse_critical_enter();
1888		KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
1889		if ((kseg = TAILQ_FIRST(&free_kse_groupq)) != NULL) {
1890			TAILQ_REMOVE(&free_kse_groupq, kseg, kg_qe);
1891			free_kseg_count--;
1892			active_kseg_count++;
1893			TAILQ_INSERT_TAIL(&active_kse_groupq, kseg, kg_qe);
1894		}
1895		KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
1896		_kse_critical_leave(crit);
1897		if (kseg)
1898			kseg_reinit(kseg);
1899	}
1900
1901	/*
1902	 * If a KSE group couldn't be taken from the free list (or the
1903	 * free list couldn't be used because this is initialization),
1904	 * attempt to malloc a new one.
1905	 */
1906	if ((kseg == NULL) &&
1907	    ((kseg = (struct kse_group *)malloc(sizeof(*kseg))) != NULL)) {
1908		if (_pq_alloc(&kseg->kg_schedq.sq_runq,
1909		    THR_MIN_PRIORITY, THR_LAST_PRIORITY) != 0) {
1910			free(kseg);
1911			kseg = NULL;
1912		} else {
1913			kseg_init(kseg);
1914			/* Add the KSEG to the list of active KSEGs. */
1915			if (curthread != NULL) {
1916				crit = _kse_critical_enter();
1917				KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
1918				active_kseg_count++;
1919				TAILQ_INSERT_TAIL(&active_kse_groupq,
1920				    kseg, kg_qe);
1921				KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
1922				_kse_critical_leave(crit);
1923			} else {
1924				active_kseg_count++;
1925				TAILQ_INSERT_TAIL(&active_kse_groupq,
1926				    kseg, kg_qe);
1927			}
1928		}
1929	}
1930	return (kseg);
1931}
1932
1933/*
1934 * This must be called with the kse lock held and when there are
1935 * no more threads that reference it.
1936 */
1937static void
1938kseg_free_unlocked(struct kse_group *kseg)
1939{
1940	TAILQ_REMOVE(&active_kse_groupq, kseg, kg_qe);
1941	TAILQ_INSERT_HEAD(&free_kse_groupq, kseg, kg_qe);
1942	free_kseg_count++;
1943	active_kseg_count--;
1944}
1945
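/* Free a KSEG, acquiring the kse lock around kseg_free_unlocked(). */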
1946void
1947_kseg_free(struct kse_group *kseg)
1948{
1949	struct kse *curkse;
1950	kse_critical_t crit;
1951
1952	crit = _kse_critical_enter();
1953	curkse = _get_curkse();
1954	KSE_LOCK_ACQUIRE(curkse, &kse_lock);
1955	kseg_free_unlocked(kseg);
1956	KSE_LOCK_RELEASE(curkse, &kse_lock);
1957	_kse_critical_leave(crit);
1958}
1959
1960/*
1961 * Allocate a new KSE.
1962 *
1963 * We allow the current thread to be NULL in the case that this
1964 * is the first time a KSE is being created (library initialization).
1965 * In this case, we don't need to (and can't) take any locks.
1966 */
1967struct kse *
1968_kse_alloc(struct pthread *curthread)
1969{
1970	struct kse *kse = NULL;
1971	kse_critical_t crit;
1972	int need_ksd = 0;
1973	int i;
1974
1975	if ((curthread != NULL) && (free_kse_count > 0)) {
1976		crit = _kse_critical_enter();
1977		KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
1978		/* Search for a finished KSE. */
1979		kse = TAILQ_FIRST(&free_kseq);
1980		while ((kse != NULL) &&
1981		    ((kse->k_mbx.km_flags & KMF_DONE) == 0)) {
1982			kse = TAILQ_NEXT(kse, k_qe);
1983		}
1984		if (kse != NULL) {
1985			DBG_MSG("found an unused kse.\n");
1986			TAILQ_REMOVE(&free_kseq, kse, k_qe);
1987			free_kse_count--;
1988			TAILQ_INSERT_TAIL(&active_kseq, kse, k_qe);
1989			active_kse_count++;
1990		}
1991		KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
1992		_kse_critical_leave(crit);
1993		if (kse != NULL)
1994			kse_reinit(kse);
1995	}
1996	if ((kse == NULL) &&
1997	    ((kse = (struct kse *)malloc(sizeof(*kse))) != NULL)) {
1998		bzero(kse, sizeof(*kse));
1999
2000		/* Initialize the lockusers. */
2001		for (i = 0; i < MAX_KSE_LOCKLEVEL; i++) {
2002			_lockuser_init(&kse->k_lockusers[i], (void *)kse);
2003			_LCK_SET_PRIVATE2(&kse->k_lockusers[i], NULL);
2004		}
2005		/* _lock_init(kse->k_lock, ...) */
2006
2007		/* We had to malloc a kse; mark it as needing a new ID. */
2008		need_ksd = 1;
2009
2010		/*
2011		 * Create the KSE context.
2012		 *
2013		 * XXX - For now this is done here in the allocation.
2014		 *       In the future, we may want to have it done
2015		 *       outside the allocation so that scope system
2016		 *       threads (one thread per KSE) are not required
2017		 *       to have a stack for an unneeded kse upcall.
2018		 */
2019		kse->k_mbx.km_func = (kse_func_t *)kse_sched_multi;
2020		kse->k_mbx.km_stack.ss_sp = (char *)malloc(KSE_STACKSIZE);
2021		kse->k_mbx.km_stack.ss_size = KSE_STACKSIZE;
2022		kse->k_mbx.km_udata = (void *)kse;
2023		kse->k_mbx.km_quantum = 20000;
2024		/*
2025		 * We need to keep a copy of the stack in case it
2026		 * doesn't get used; a KSE running a scope system
2027		 * thread will use that thread's stack.
2028		 */
2029		kse->k_stack.ss_sp = kse->k_mbx.km_stack.ss_sp;
2030		kse->k_stack.ss_size = kse->k_mbx.km_stack.ss_size;
2031		if (kse->k_mbx.km_stack.ss_sp == NULL) {
2032			for (i = 0; i < MAX_KSE_LOCKLEVEL; i++) {
2033				_lockuser_destroy(&kse->k_lockusers[i]);
2034			}
2035			/* _lock_destroy(&kse->k_lock); */
2036			free(kse);
2037			kse = NULL;
2038		}
2039	}
2040	if ((kse != NULL) && (need_ksd != 0)) {
2041		/* This KSE needs initialization. */
2042		if (curthread != NULL) {
2043			crit = _kse_critical_enter();
2044			KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
2045		}
2046		/* Initialize KSD inside of the lock. */
2047		if (_ksd_create(&kse->k_ksd, (void *)kse, sizeof(*kse)) != 0) {
2048			if (curthread != NULL) {
2049				KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
2050				_kse_critical_leave(crit);
2051			}
2052			free(kse->k_mbx.km_stack.ss_sp);
2053			for (i = 0; i < MAX_KSE_LOCKLEVEL; i++) {
2054				_lockuser_destroy(&kse->k_lockusers[i]);
2055			}
2056			free(kse);
2057			return (NULL);
2058		}
2059		kse->k_flags = 0;
2060		TAILQ_INSERT_TAIL(&active_kseq, kse, k_qe);
2061		active_kse_count++;
2062		if (curthread != NULL) {
2063			KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
2064			_kse_critical_leave(crit);
2065		}
2066	}
2067	return (kse);
2068}
2069
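/* Reset a cached KSE's state so that it can be reused. */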
2070static void
2071kse_reinit(struct kse *kse)
2072{
2073	/*
2074	 * XXX - For now every kse has its own stack.
2075	 *       In the future, we may want to have the stack
2076	 *       allocated outside of kse allocation so that scope
2077	 *       system threads (one thread per KSE) are not
2078	 *       required to have a stack for an unneeded kse upcall.
2079	 */
2080	kse->k_mbx.km_flags = 0;
2081	kse->k_curthread = NULL;
2082	kse->k_kseg = NULL;
2083	kse->k_schedq = NULL;
2084	kse->k_locklevel = 0;
2085	SIGEMPTYSET(kse->k_sigmask);
2086	bzero(&kse->k_sigq, sizeof(kse->k_sigq));
2087	kse->k_check_sigq = 0;
2088	kse->k_flags = 0;
2089	kse->k_waiting = 0;
2090	kse->k_idle = 0;
2091	kse->k_error = 0;
2092	kse->k_cpu = 0;
2093	kse->k_done = 0;
2094	kse->k_switch = 0;
2095}
2096
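/*
 * Move a KSE from the active list to the free list.  Like
 * kseg_free_unlocked(), this expects the caller to hold the kse
 * lock (except during library initialization).
 */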
2097void
2098kse_free_unlocked(struct kse *kse)
2099{
2100	TAILQ_REMOVE(&active_kseq, kse, k_qe);
2101	active_kse_count--;
2102	kse->k_kseg = NULL;
2103	kse->k_mbx.km_quantum = 20000;
2104	kse->k_flags = 0;
2105	TAILQ_INSERT_HEAD(&free_kseq, kse, k_qe);
2106	free_kse_count++;
2107}
2108
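/*
 * Free a KSE.  The kse lock is acquired unless this is library
 * initialization (curthread == NULL), when locking isn't possible.
 */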
2109void
2110_kse_free(struct pthread *curthread, struct kse *kse)
2111{
2112	kse_critical_t crit;
2113
2114	if (curthread == NULL)
2115		kse_free_unlocked(kse);
2116	else {
2117		crit = _kse_critical_enter();
2118		KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
2119		kse_free_unlocked(kse);
2120		KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
2121		_kse_critical_leave(crit);
2122	}
2123}
2124
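/* Initialize a newly allocated KSE group, including its lock. */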
2125static void
2126kseg_init(struct kse_group *kseg)
2127{
2128	kseg_reinit(kseg);
2129	_lock_init(&kseg->kg_lock, LCK_ADAPTIVE, _kse_lock_wait,
2130	    _kse_lock_wakeup);
2131}
2132
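/* Reset a KSE group's queues and counters so that it can be reused. */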
2133static void
2134kseg_reinit(struct kse_group *kseg)
2135{
2136	TAILQ_INIT(&kseg->kg_kseq);
2137	TAILQ_INIT(&kseg->kg_threadq);
2138	TAILQ_INIT(&kseg->kg_schedq.sq_waitq);
2139	kseg->kg_threadcount = 0;
2140	kseg->kg_ksecount = 0;
2141	kseg->kg_idle_kses = 0;
2142	kseg->kg_flags = 0;
2143}
2144
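/*
 * Allocate a thread structure, preferring the cache of free threads;
 * fall back to malloc() (with alignment padding) when the cache is
 * empty or can't be used.
 */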
2145struct pthread *
2146_thr_alloc(struct pthread *curthread)
2147{
2148	kse_critical_t crit;
2149	void *p;
2150	struct pthread *thread = NULL;
2151
2152	if (curthread != NULL) {
2153		if (GC_NEEDED())
2154			_thr_gc(curthread);
2155		if (free_thread_count > 0) {
2156			crit = _kse_critical_enter();
2157			KSE_LOCK_ACQUIRE(curthread->kse, &thread_lock);
2158			if ((thread = TAILQ_FIRST(&free_threadq)) != NULL) {
2159				TAILQ_REMOVE(&free_threadq, thread, tle);
2160				free_thread_count--;
2161			}
2162			KSE_LOCK_RELEASE(curthread->kse, &thread_lock);
2163			_kse_critical_leave(crit);
2164		}
2165	}
2166	if (thread == NULL) {
2167		p = malloc(sizeof(struct pthread) + THR_ALIGNBYTES);
2168		if (p != NULL) {
2169			thread = (struct pthread *)THR_ALIGN(p);
2170			thread->alloc_addr = p;
2171		}
2172	}
2173	return (thread);
2174}
2175
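/*
 * Free a thread structure.  Threads are cached on the free list up
 * to MAX_CACHED_THREADS; beyond that limit (or during initialization)
 * they are destroyed and free()'d.
 */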
2176void
2177_thr_free(struct pthread *curthread, struct pthread *thread)
2178{
2179	kse_critical_t crit;
2180	int i;
2181
2182	DBG_MSG("Freeing thread %p\n", thread);
2183	if ((curthread == NULL) || (free_thread_count >= MAX_CACHED_THREADS)) {
2184		for (i = 0; i < MAX_THR_LOCKLEVEL; i++) {
2185			_lockuser_destroy(&thread->lockusers[i]);
2186		}
2187		_lock_destroy(&thread->lock);
2188		free(thread->alloc_addr);
2189	}
2190	else {
2191		crit = _kse_critical_enter();
2192		KSE_LOCK_ACQUIRE(curthread->kse, &thread_lock);
2193		TAILQ_INSERT_TAIL(&free_threadq, thread, tle);
2194		free_thread_count++;
2195		KSE_LOCK_RELEASE(curthread->kse, &thread_lock);
2196		_kse_critical_leave(crit);
2197	}
2198}
2199
2200/*
2201 * Add an active thread:
2202 *
2203 *   o Assign the thread a unique id (which GDB uses to track
2204 *     threads).
2205 *   o Add the thread to the list of all threads and increment
2206 *     the number of active threads.
2207 */
2208static void
2209thr_link(struct pthread *thread)
2210{
2211	kse_critical_t crit;
2212	struct kse *curkse;
2213	struct pthread *curthread;
2214
2215	crit = _kse_critical_enter();
2216	curkse = _get_curkse();
2217	curthread = _get_curthread();
2218	thread->sigmask = curthread->sigmask;
2219	KSE_LOCK_ACQUIRE(curkse, &_thread_list_lock);
2220	/*
2221	 * Initialize the unique id (which GDB uses to track
2222	 * threads), add the thread to the list of all threads,
2223	 * and increment the number of active threads.
2224	 */
2225	thread->uniqueid = next_uniqueid++;
2226	THR_LIST_ADD(thread);
2227	active_threads++;
2228	KSE_LOCK_RELEASE(curkse, &_thread_list_lock);
2229
2230	_kse_critical_leave(crit);
2231}
2232
2233/*
2234 * Remove an active thread.
2235 */
2236static void
2237thr_unlink(struct pthread *thread)
2238{
2239	kse_critical_t crit;
2240	struct kse *curkse;
2241
2242	crit = _kse_critical_enter();
2243	curkse = _get_curkse();
2244
2245	KSE_LOCK_ACQUIRE(curkse, &_thread_list_lock);
2246	THR_LIST_REMOVE(thread);
2247	active_threads--;
2248	KSE_LOCK_RELEASE(curkse, &_thread_list_lock);
2249
2250	_kse_critical_leave(crit);
2251}
2252