thr_kern.c revision 114664
1/*
2 * Copyright (C) 2003 Daniel M. Eischen <deischen@freebsd.org>
3 * Copyright (C) 2002 Jonathon Mini <mini@freebsd.org>
4 * Copyright (c) 1995-1998 John Birrell <jb@cimlogic.com.au>
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 * 3. All advertising materials mentioning features or use of this software
16 *    must display the following acknowledgement:
17 *	This product includes software developed by John Birrell.
18 * 4. Neither the name of the author nor the names of any co-contributors
19 *    may be used to endorse or promote products derived from this software
20 *    without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY JOHN BIRRELL AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 */
35#include <sys/cdefs.h>
36__FBSDID("$FreeBSD: head/lib/libkse/thread/thr_kern.c 114664 2003-05-04 16:17:01Z deischen $");
37
38#include <sys/types.h>
39#include <sys/kse.h>
40#include <sys/signalvar.h>
41#include <sys/queue.h>
42#include <machine/atomic.h>
43
44#include <assert.h>
45#include <errno.h>
46#include <signal.h>
47#include <stdlib.h>
48#include <string.h>
49#include <time.h>
50#include <ucontext.h>
51#include <unistd.h>
52
53#include "atomic_ops.h"
54#include "thr_private.h"
55#include "pthread_md.h"
56#include "libc_private.h"
57
58/*#define DEBUG_THREAD_KERN */
59#ifdef DEBUG_THREAD_KERN
60#define DBG_MSG		stdout_debug
61#else
62#define DBG_MSG(x...)
63#endif
64
65/*
66 * Define a high water mark for the maximum number of threads that
67 * will be cached.  Once this level is reached, any extra threads
68 * will be free()'d.
69 *
70 * XXX - It doesn't make sense to worry about the maximum number of
71 *       KSEs that we can cache because the system will limit us to
72 *       something *much* less than the maximum number of threads
73 *       that we can have.  Disregarding KSEs in their own group,
74 *       the maximum number of KSEs is the number of processors in
75 *       the system.
76 */
77#define	MAX_CACHED_THREADS	100
78#define	KSE_STACKSIZE		16384
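
/*
 * Illustrative only -- a minimal sketch, assuming the cache policy
 * described above, of how a thread-free path could honor the high
 * water mark.  The helper name is hypothetical, and locking and
 * lock-user teardown are omitted; the real free path may differ.
 */
#if 0
static void
example_thr_cache_or_free(struct pthread *thread)
{
	if (free_thread_count >= MAX_CACHED_THREADS) {
		/* The cache is full; give the memory back. */
		free(thread);
	} else {
		/* Cache the thread structure for reuse. */
		TAILQ_INSERT_TAIL(&free_threadq, thread, tle);
		free_thread_count++;
	}
}
#endif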
79
80#define	KSE_SET_MBOX(kse, thrd) \
81	(kse)->k_mbx.km_curthread = &(thrd)->tmbx
82
83#define	KSE_SET_EXITED(kse)	(kse)->k_flags |= KF_EXITED
84
85/*
86 * Macros for manipulating the run queues.  The priority queue
87 * routines use the thread's pqe link and also handle the setting
88 * and clearing of the thread's THR_FLAGS_IN_RUNQ flag.
89 */
90#define	KSE_RUNQ_INSERT_HEAD(kse, thrd)			\
91	_pq_insert_head(&(kse)->k_schedq->sq_runq, thrd)
92#define	KSE_RUNQ_INSERT_TAIL(kse, thrd)			\
93	_pq_insert_tail(&(kse)->k_schedq->sq_runq, thrd)
94#define	KSE_RUNQ_REMOVE(kse, thrd)			\
95	_pq_remove(&(kse)->k_schedq->sq_runq, thrd)
96#define	KSE_RUNQ_FIRST(kse)	_pq_first(&(kse)->k_schedq->sq_runq)
97
98#define KSE_RUNQ_THREADS(kse)	((kse)->k_schedq->sq_runq.pq_threads)
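
/*
 * Illustrative only -- a minimal sketch of how the scheduler below
 * uses these macros to requeue a thread; the helper name is
 * hypothetical.
 */
#if 0
static void
example_runq_requeue(struct kse *kse, struct pthread *thrd)
{
	/* The priority queue code clears THR_FLAGS_IN_RUNQ for us... */
	if ((thrd->flags & THR_FLAGS_IN_RUNQ) != 0)
		KSE_RUNQ_REMOVE(kse, thrd);
	/* ...and sets it again on insertion. */
	KSE_RUNQ_INSERT_TAIL(kse, thrd);
}
#endif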
99
100/*
101 * We've got to keep track of everything that is allocated, not only
102 * to have a speedy free list, but also so they can be deallocated
103 * after a fork().
104 */
105static TAILQ_HEAD(, kse)	active_kseq;
106static TAILQ_HEAD(, kse)	free_kseq;
107static TAILQ_HEAD(, kse_group)	free_kse_groupq;
108static TAILQ_HEAD(, kse_group)	active_kse_groupq;
109static TAILQ_HEAD(, kse_group)	gc_ksegq;
110static struct lock		kse_lock;	/* also used for kseg queue */
111static int			free_kse_count = 0;
112static int			free_kseg_count = 0;
113static TAILQ_HEAD(, pthread)	free_threadq;
114static struct lock		thread_lock;
115static int			free_thread_count = 0;
116static int			inited = 0;
117static int			active_kse_count = 0;
118static int			active_kseg_count = 0;
119
120static void	kse_check_completed(struct kse *kse);
121static void	kse_check_waitq(struct kse *kse);
122static void	kse_check_signals(struct kse *kse);
123static void	kse_fini(struct kse *curkse);
124static void	kse_reinit(struct kse *kse);
125static void	kse_sched_multi(struct kse *curkse);
126#ifdef NOT_YET
127static void	kse_sched_single(struct kse *curkse);
128#endif
129static void	kse_switchout_thread(struct kse *kse, struct pthread *thread);
130static void	kse_wait(struct kse *kse, struct pthread *td_wait);
131static void	kse_free_unlocked(struct kse *kse);
132static void	kseg_free_unlocked(struct kse_group *kseg);
133static void	kseg_init(struct kse_group *kseg);
134static void	kseg_reinit(struct kse_group *kseg);
135static void	kse_waitq_insert(struct pthread *thread);
136static void	kse_wakeup_multi(struct kse *curkse);
137static void	kse_wakeup_one(struct pthread *thread);
138static void	thr_cleanup(struct kse *kse, struct pthread *curthread);
139static void	thr_resume_wrapper(int unused_1, siginfo_t *unused_2,
140		    ucontext_t *ucp);
141static void	thr_resume_check(struct pthread *curthread, ucontext_t *ucp,
142		    struct pthread_sigframe *psf);
143static int	thr_timedout(struct pthread *thread, struct timespec *curtime);
144
145/*
146 * This is called after a fork().
147 * No locks need to be taken here since we are guaranteed to be
148 * single threaded.
149 */
150void
151_kse_single_thread(struct pthread *curthread)
152{
153	struct kse *kse, *kse_next;
154	struct kse_group *kseg, *kseg_next;
155	struct pthread *thread, *thread_next;
156	kse_critical_t crit;
157	int i;
158
159	/*
160	 * Disable upcalls and clear the threaded flag.
161	 * XXX - I don't think we need to disable upcalls after a fork(),
162	 *       but it doesn't hurt.
163	 */
164	crit = _kse_critical_enter();
165	__isthreaded = 0;
166
167	/*
168	 * Enter a loop to remove and free all threads other than
169	 * the running thread from the active thread list:
170	 */
171	for (thread = TAILQ_FIRST(&_thread_list); thread != NULL;
172	    thread = thread_next) {
173		/*
174		 * Advance to the next thread before destroying
175		 * the current thread.
176		 */
177		thread_next = TAILQ_NEXT(thread, tle);
178
179		/*
180		 * Remove this thread from the list (the current
181		 * thread will be removed but re-added by libpthread
182		 * initialization).
183		 */
184		TAILQ_REMOVE(&_thread_list, thread, tle);
185		/* Make sure this isn't the running thread: */
186		if (thread != curthread) {
187			_thr_stack_free(&thread->attr);
188			if (thread->specific != NULL)
189				free(thread->specific);
190			for (i = 0; i < MAX_THR_LOCKLEVEL; i++) {
191				_lockuser_destroy(&thread->lockusers[i]);
192			}
193			_lock_destroy(&thread->lock);
194			free(thread);
195		}
196	}
197
198	TAILQ_INIT(&curthread->mutexq);		/* initialize mutex queue */
199	curthread->joiner = NULL;		/* no joining threads yet */
200	sigemptyset(&curthread->sigpend);	/* clear pending signals */
201	if (curthread->specific != NULL) {
202		free(curthread->specific);
203		curthread->specific = NULL;
204		curthread->specific_data_count = 0;
205	}
206
207	/* Free the free KSEs: */
208	while ((kse = TAILQ_FIRST(&free_kseq)) != NULL) {
209		TAILQ_REMOVE(&free_kseq, kse, k_qe);
210		_ksd_destroy(&kse->k_ksd);
211		if (kse->k_stack.ss_sp != NULL)
212			free(kse->k_stack.ss_sp);
213		free(kse);
214	}
215	free_kse_count = 0;
216
217	/* Free the active KSEs: */
218	for (kse = TAILQ_FIRST(&active_kseq); kse != NULL; kse = kse_next) {
219		kse_next = TAILQ_NEXT(kse, k_qe);
220		TAILQ_REMOVE(&active_kseq, kse, k_qe);
221		for (i = 0; i < MAX_KSE_LOCKLEVEL; i++) {
222			_lockuser_destroy(&kse->k_lockusers[i]);
223		}
224		if (kse->k_stack.ss_sp != NULL)
225			free(kse->k_stack.ss_sp);
226		_lock_destroy(&kse->k_lock);
227		free(kse);
228	}
229	active_kse_count = 0;
230
231	/* Free the free KSEGs: */
232	while ((kseg = TAILQ_FIRST(&free_kse_groupq)) != NULL) {
233		TAILQ_REMOVE(&free_kse_groupq, kseg, kg_qe);
234		_lock_destroy(&kseg->kg_lock);
235		_pq_free(&kseg->kg_schedq.sq_runq);
236		free(kseg);
237	}
238	free_kseg_count = 0;
239
240	/* Free the active KSEGs: */
241	for (kseg = TAILQ_FIRST(&active_kse_groupq);
242	    kseg != NULL; kseg = kseg_next) {
243		kseg_next = TAILQ_NEXT(kseg, kg_qe);
244		TAILQ_REMOVE(&active_kse_groupq, kseg, kg_qe);
245		_lock_destroy(&kseg->kg_lock);
246		_pq_free(&kseg->kg_schedq.sq_runq);
247		free(kseg);
248	}
249	active_kseg_count = 0;
250
251	/* Free the free threads. */
252	while ((thread = TAILQ_FIRST(&free_threadq)) != NULL) {
253		TAILQ_REMOVE(&free_threadq, thread, tle);
254		if (thread->specific != NULL)
255			free(thread->specific);
256		for (i = 0; i < MAX_THR_LOCKLEVEL; i++) {
257			_lockuser_destroy(&thread->lockusers[i]);
258		}
259		_lock_destroy(&thread->lock);
260		free(thread);
261	}
262	free_thread_count = 0;
263
264	/* Free the to-be-gc'd threads. */
265	while ((thread = TAILQ_FIRST(&_thread_gc_list)) != NULL) {
266		TAILQ_REMOVE(&_thread_gc_list, thread, gcle);
267		for (i = 0; i < MAX_THR_LOCKLEVEL; i++) {
268			_lockuser_destroy(&thread->lockusers[i]);
269		}
270		_lock_destroy(&thread->lock);
271		free(thread);
272	}
273	TAILQ_INIT(&gc_ksegq);
274	_gc_count = 0;
275
276	if (inited != 0) {
277		/*
278		 * Destroy these locks; they'll be recreated to assure they
279		 * are in the unlocked state.
280		 */
281		_lock_destroy(&kse_lock);
282		_lock_destroy(&thread_lock);
283		_lock_destroy(&_thread_list_lock);
284		inited = 0;
285	}
286
287	/*
288	 * After a fork(), the leftover thread goes back to being
289	 * a scope process thread.
290	 */
291	curthread->attr.flags &= ~PTHREAD_SCOPE_SYSTEM;
292	curthread->attr.flags |= PTHREAD_SCOPE_PROCESS;
293
294	/*
295	 * After a fork, we are still operating on the thread's original
296	 * stack.  Don't clear the THR_FLAGS_USER from the thread's
297	 * attribute flags.
298	 */
299
300	/* Initialize the threads library. */
301	curthread->kse = NULL;
302	curthread->kseg = NULL;
303	_kse_initial = NULL;
304	_libpthread_init(curthread);
305}
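
/*
 * Illustrative only -- a hedged sketch of the child side of a fork()
 * wrapper that would call _kse_single_thread().  The real wrapper
 * lives in thr_fork.c and may differ; __sys_fork() is assumed to be
 * the raw system call stub.
 */
#if 0
static pid_t
example_fork(void)
{
	struct pthread *curthread = _get_curthread();
	pid_t pid;

	if ((pid = __sys_fork()) == 0)
		/* In the child: collapse back to a single thread. */
		_kse_single_thread(curthread);
	return (pid);
}
#endif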
306
307/*
308 * This is used to initialize housekeeping and to initialize the
309 * KSD for the KSE.
310 */
311void
312_kse_init(void)
313{
314	if (inited == 0) {
315		TAILQ_INIT(&active_kseq);
316		TAILQ_INIT(&active_kse_groupq);
317		TAILQ_INIT(&free_kseq);
318		TAILQ_INIT(&free_kse_groupq);
319		TAILQ_INIT(&free_threadq);
320		TAILQ_INIT(&gc_ksegq);
321		if (_lock_init(&kse_lock, LCK_ADAPTIVE,
322		    _kse_lock_wait, _kse_lock_wakeup) != 0)
323			PANIC("Unable to initialize free KSE queue lock");
324		if (_lock_init(&thread_lock, LCK_ADAPTIVE,
325		    _kse_lock_wait, _kse_lock_wakeup) != 0)
326			PANIC("Unable to initialize free thread queue lock");
327		if (_lock_init(&_thread_list_lock, LCK_ADAPTIVE,
328		    _kse_lock_wait, _kse_lock_wakeup) != 0)
329			PANIC("Unable to initialize thread list lock");
330		active_kse_count = 0;
331		active_kseg_count = 0;
332		_gc_count = 0;
333		inited = 1;
334	}
335}
336
337int
338_kse_isthreaded(void)
339{
340	return (__isthreaded != 0);
341}
342
343/*
344 * This is called when the first thread (other than the initial
345 * thread) is created.
346 */
347int
348_kse_setthreaded(int threaded)
349{
350	if ((threaded != 0) && (__isthreaded == 0)) {
351		/*
352		 * Locking functions in libc are required when there are
353		 * threads other than the initial thread.
354		 */
355		__isthreaded = 1;
356
357		/*
358		 * Tell the kernel to create a KSE for the initial thread
359		 * and enable upcalls in it.
360		 */
361		_kse_initial->k_flags |= KF_STARTED;
362		if (kse_create(&_kse_initial->k_mbx, 0) != 0) {
363			_kse_initial->k_flags &= ~KF_STARTED;
364			__isthreaded = 0;
365			/* may abort() */
366			DBG_MSG("kse_create failed\n");
367			return (-1);
368		}
369		KSE_SET_MBOX(_kse_initial, _thr_initial);
370		_thr_setmaxconcurrency();
371	}
372	return (0);
373}
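
/*
 * Illustrative only -- a hedged sketch of how a thread creation path
 * (roughly, thr_create.c) might switch the library into threaded
 * mode; the helper name is hypothetical and the real code may differ.
 */
#if 0
static int
example_ensure_threaded(void)
{
	/* _kse_setthreaded() is a no-op once the library is threaded. */
	if (_kse_setthreaded(1) != 0)
		return (EAGAIN);	/* could not start the initial KSE */
	return (0);
}
#endif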
374
375/*
376 * Lock wait and wakeup handlers for KSE locks.  These are only used by
377 * KSEs, and should never be used by threads.  KSE locks include the
378 * KSE group lock (used for locking the scheduling queue) and the
379 * kse_lock defined above.
380 *
381 * When a KSE lock attempt blocks, the entire KSE blocks allowing another
382 * KSE to run.  For the most part, it doesn't make much sense to try and
383 * schedule another thread because you need to lock the scheduling queue
384 * in order to do that.  And since the KSE lock is used to lock the scheduling
385 * queue, you would just end up blocking again.
386 */
387void
388_kse_lock_wait(struct lock *lock, struct lockuser *lu)
389{
390	struct kse *curkse = (struct kse *)_LCK_GET_PRIVATE(lu);
391	struct timespec ts;
392	int saved_flags;
393
394	if (curkse->k_mbx.km_curthread != NULL)
395		PANIC("kse_lock_wait does not disable upcall.\n");
396	/*
397	 * Enter a loop to wait until we get the lock.
398	 */
399	ts.tv_sec = 0;
400	ts.tv_nsec = 1000000;  /* 1 msec */
401	KSE_SET_WAIT(curkse);
402	while (_LCK_BUSY(lu)) {
403		/*
404		 * Yield the kse and wait to be notified when the lock
405		 * is granted.
406		 */
407		saved_flags = curkse->k_mbx.km_flags;
408		curkse->k_mbx.km_flags |= KMF_NOUPCALL | KMF_NOCOMPLETED;
409		kse_release(&ts);
410		curkse->k_mbx.km_flags = saved_flags;
411
412		/*
413		 * Make sure that the wait flag is set again in case
414		 * we woke up without the lock being granted.
415		 */
416		KSE_SET_WAIT(curkse);
417	}
418	KSE_CLEAR_WAIT(curkse);
419}
420
421void
422_kse_lock_wakeup(struct lock *lock, struct lockuser *lu)
423{
424	struct kse *curkse;
425	struct kse *kse;
426
427	curkse = _get_curkse();
428	kse = (struct kse *)_LCK_GET_PRIVATE(lu);
429
430	if (kse == curkse)
431		PANIC("KSE trying to wake itself up in lock");
432	else if (KSE_WAITING(kse)) {
433		/*
434		 * Notify the owning kse that it has the lock.
435		 */
436		KSE_WAKEUP(kse);
437	}
438}
439
440/*
441 * Thread wait and wakeup handlers for thread locks.  These are only used
442 * by threads, never by KSEs.  Thread locks include the per-thread lock
443 * (defined in its structure), and condition variable and mutex locks.
444 */
445void
446_thr_lock_wait(struct lock *lock, struct lockuser *lu)
447{
448	struct pthread *curthread = (struct pthread *)lu->lu_private;
449	int count;
450
451	/*
452	 * Spin for a bit.
453	 *
454	 * XXX - We probably want to make this a bit smarter.  It
455	 *       doesn't make sense to spin unless there is more
456	 *       than 1 CPU.  A thread that is holding one of these
457	 *       locks is prevented from being swapped out for another
458	 *       thread within the same scheduling entity.
459	 */
460	count = 0;
461	while (_LCK_BUSY(lu) && count < 300)
462		count++;
463	while (_LCK_BUSY(lu)) {
464		THR_LOCK_SWITCH(curthread);
465		if (_LCK_BUSY(lu)) {
466			/* Wait for the lock: */
467			atomic_store_rel_int(&curthread->need_wakeup, 1);
468			THR_SET_STATE(curthread, PS_LOCKWAIT);
469			_thr_sched_switch(curthread);
470		}
471		THR_UNLOCK_SWITCH(curthread);
472	}
473}
474
475void
476_thr_lock_wakeup(struct lock *lock, struct lockuser *lu)
477{
478	struct pthread *thread;
479	struct pthread *curthread;
480	int unlock;
481
482	curthread = _get_curthread();
483	thread = (struct pthread *)_LCK_GET_PRIVATE(lu);
484
485	unlock = 0;
486	if (curthread->kseg == thread->kseg) {
487		/* Not already locked */
488		if (curthread->lock_switch == 0) {
489			THR_SCHED_LOCK(curthread, thread);
490			unlock = 1;
491		}
492	} else {
493		THR_SCHED_LOCK(curthread, thread);
494		unlock = 1;
495	}
496	_thr_setrunnable_unlocked(thread);
497	atomic_store_rel_int(&thread->need_wakeup, 0);
498	if (unlock)
499		THR_SCHED_UNLOCK(curthread, thread);
500}
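
/*
 * Illustrative only -- a minimal sketch, assuming the lock API used
 * elsewhere in this file, of wiring a per-thread lock to the handlers
 * above; the real initialization happens when the thread structure is
 * allocated and may differ in detail.
 */
#if 0
static int
example_thread_lock_init(struct pthread *thread)
{
	int i;

	if (_lock_init(&thread->lock, LCK_ADAPTIVE,
	    _thr_lock_wait, _thr_lock_wakeup) != 0)
		return (-1);
	for (i = 0; i < MAX_THR_LOCKLEVEL; i++)
		_lockuser_init(&thread->lockusers[i], (void *)thread);
	return (0);
}
#endif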
501
502kse_critical_t
503_kse_critical_enter(void)
504{
505	kse_critical_t crit;
506
507	crit = _ksd_readandclear_tmbx();
508	return (crit);
509}
510
511void
512_kse_critical_leave(kse_critical_t crit)
513{
514	struct pthread *curthread;
515
516	_ksd_set_tmbx(crit);
517	if ((crit != NULL) && ((curthread = _get_curthread()) != NULL))
518		THR_YIELD_CHECK(curthread);
519}
520
521int
522_kse_in_critical(void)
523{
524	return (_ksd_get_tmbx() == NULL);
525}
526
527void
528_thr_critical_enter(struct pthread *thread)
529{
530	thread->critical_count++;
531}
532
533void
534_thr_critical_leave(struct pthread *thread)
535{
536	thread->critical_count--;
537	THR_YIELD_CHECK(thread);
538}
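
/*
 * Illustrative only -- the usual pattern for combining the critical
 * region primitives with a low-level KSE lock, as done elsewhere in
 * this file (for example in _kseg_free() below); the statistic being
 * read here is just an example.
 */
#if 0
static int
example_read_free_kse_count(struct pthread *curthread)
{
	kse_critical_t crit;
	int count;

	crit = _kse_critical_enter();		/* disable upcalls */
	KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
	count = free_kse_count;
	KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
	_kse_critical_leave(crit);		/* may yield */
	return (count);
}
#endif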
539
540/*
541 * XXX - We may need to take the scheduling lock before calling
542 *       this, or perhaps take the lock within here before
543 *       doing anything else.
544 */
545void
546_thr_sched_switch(struct pthread *curthread)
547{
548	struct pthread_sigframe psf;
549	struct kse *curkse;
550	volatile int once = 0;
551
552	/* We're in the scheduler, 5 by 5: */
553	THR_ASSERT(curthread->lock_switch, "lock_switch");
554	THR_ASSERT(_kse_in_critical(), "not in critical region");
555	curkse = _get_curkse();
556
557	curthread->need_switchout = 1;	/* The thread yielded on its own. */
558	curthread->critical_yield = 0;	/* No need to yield anymore. */
559	curthread->slice_usec = -1;	/* Restart the time slice. */
560
561	/*
562	 * The signal frame is allocated off the stack because
563	 * a thread can be interrupted by other signals while
564	 * it is running down pending signals.
565	 */
566	sigemptyset(&psf.psf_sigset);
567	curthread->curframe = &psf;
568
569	_thread_enter_uts(&curthread->tmbx, &curkse->k_mbx);
570
571	/*
572	 * This thread is being resumed; check for cancellations.
573	 */
574	if ((once == 0) && (!THR_IN_CRITICAL(curthread))) {
575		once = 1;
576		curthread->critical_count++;
577		THR_UNLOCK_SWITCH(curthread);
578		curthread->critical_count--;
579		thr_resume_check(curthread, &curthread->tmbx.tm_context, &psf);
580		THR_LOCK_SWITCH(curthread);
581	}
582}
583
584/*
585 * This is the scheduler for a KSE which runs a scope system thread.
586 * The multi-thread KSE scheduler should also work for a single threaded
587 * KSE, but we use a separate scheduler so that it can be fine-tuned
588 * to be more efficient (and perhaps not need a separate stack for
589 * the KSE, allowing it to use the thread's stack).
590 *
591 * XXX - This probably needs some work.
592 */
593#ifdef NOT_YET
594static void
595kse_sched_single(struct kse *curkse)
596{
597	struct pthread *curthread = curkse->k_curthread;
598	struct pthread *td_wait;
599	struct timespec ts;
600	int level;
601
602	if (curthread->active == 0) {
603		if (curthread->state != PS_RUNNING) {
604			/* Check to see if the thread has timed out. */
605			KSE_GET_TOD(curkse, &ts);
606			if (thr_timedout(curthread, &ts) != 0) {
607				curthread->timeout = 1;
608				curthread->state = PS_RUNNING;
609			}
610		}
611	}
612
613	/* This thread no longer needs to yield the CPU: */
614	curthread->critical_yield = 0;
615	curthread->need_switchout = 0;
616
617	/*
618	 * Lock the scheduling queue.
619	 *
620	 * There is no scheduling queue for single threaded KSEs,
621	 * but we need a lock for protection regardless.
622	 */
623	KSE_SCHED_LOCK(curkse, curkse->k_kseg);
624
625	/*
626	 * This has to do the job of kse_switchout_thread(), only
627	 * for a single threaded KSE/KSEG.
628	 */
629
630	switch (curthread->state) {
631	case PS_DEAD:
632		/* Unlock the scheduling queue and exit the KSE. */
633		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
634		kse_fini(curkse);	/* does not return */
635		break;
636
637	case PS_COND_WAIT:
638	case PS_SLEEP_WAIT:
639		/* Only insert threads that can timeout: */
640		if (curthread->wakeup_time.tv_sec != -1) {
641			/* Insert into the waiting queue: */
642			KSE_WAITQ_INSERT(curkse, curthread);
643		}
644		break;
645
646	case PS_LOCKWAIT:
647		level = curthread->locklevel - 1;
648		if (_LCK_BUSY(&curthread->lockusers[level]))
649			KSE_WAITQ_INSERT(curkse, curthread);
650		else
651			THR_SET_STATE(curthread, PS_RUNNING);
652		break;
653
654	case PS_JOIN:
655	case PS_MUTEX_WAIT:
656	case PS_RUNNING:
657	case PS_SIGSUSPEND:
658	case PS_SIGWAIT:
659	case PS_SUSPENDED:
660	case PS_DEADLOCK:
661	default:
662		/*
663		 * These states don't timeout and don't need
664		 * to be in the waiting queue.
665		 */
666		break;
667	}
668	while (curthread->state != PS_RUNNING) {
669		curthread->active = 0;
670		td_wait = KSE_WAITQ_FIRST(curkse);
671
672		kse_wait(curkse, td_wait);
673
674	    	if (td_wait != NULL) {
675			KSE_GET_TOD(curkse, &ts);
676			if (thr_timedout(td_wait, &ts)) {
677				/* Indicate the thread timedout: */
678				/* Indicate the thread timed out: */
679
680				/* Make the thread runnable. */
681				THR_SET_STATE(td_wait, PS_RUNNING);
682				KSE_WAITQ_REMOVE(curkse, td_wait);
683			}
684		}
685		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
686		kse_check_signals(curkse);
687		KSE_SCHED_LOCK(curkse, curkse->k_kseg);
688	}
689
690	/* Remove the frame reference. */
691	curthread->curframe = NULL;
692
693	/* Unlock the scheduling queue. */
694	KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
695
696	/*
697	 * Continue the thread at its current frame:
698	 */
699	DBG_MSG("Continuing bound thread %p\n", curthread);
700	_thread_switch(&curthread->tmbx, &curkse->k_mbx.km_curthread);
701	PANIC("Thread has returned from _thread_switch");
702}
703#endif
704
705void
706dump_queues(struct kse *curkse)
707{
708	struct pthread *thread;
709
710	DBG_MSG("Threads in waiting queue:\n");
711	TAILQ_FOREACH(thread, &curkse->k_kseg->kg_schedq.sq_waitq, pqe) {
712		DBG_MSG("  thread %p, state %d, blocked %d\n",
713		    thread, thread->state, thread->blocked);
714	}
715}
716
717/*
718 * This is the scheduler for a KSE which runs multiple threads.
719 */
720static void
721kse_sched_multi(struct kse *curkse)
722{
723	struct pthread *curthread, *td_wait;
724	struct pthread_sigframe *curframe;
725	int ret;
726
727	THR_ASSERT(curkse->k_mbx.km_curthread == NULL,
728	    "Mailbox not null in kse_sched_multi");
729
730	/* Check for first time initialization: */
731	if ((curkse->k_flags & KF_INITIALIZED) == 0) {
732		/* Set up this KSE's specific data. */
733		_ksd_setprivate(&curkse->k_ksd);
734		_set_curkse(curkse);
735
736		/* Set this before grabbing the context. */
737		curkse->k_flags |= KF_INITIALIZED;
738	}
739
740	/* This may have returned from a kse_release(). */
741	if (KSE_WAITING(curkse)) {
742		DBG_MSG("Entered upcall when KSE is waiting.");
743		KSE_CLEAR_WAIT(curkse);
744	}
745
746	curthread = curkse->k_curthread;
747	if (curthread == NULL || curthread->lock_switch == 0) {
748		/*
749		 * curthread was preempted by an upcall; this is not a
750		 * voluntary context switch.  Lock the scheduling lock.
751		 */
752		KSE_SCHED_LOCK(curkse, curkse->k_kseg);
753	}
754
755	/*
756	 * If the current thread was completed in another KSE, then
757	 * it will be in the run queue.  Don't mark it as being blocked.
758	 */
759	if ((curthread != NULL) &&
760	    ((curthread->flags & THR_FLAGS_IN_RUNQ) == 0) &&
761	    (curthread->need_switchout == 0)) {
762		/*
763		 * Assume the current thread is blocked.  If it turns
764		 * out to be among the completed threads when they
765		 * are checked below, the blocked flag will be
766		 * cleared there.
767		 */
768		curthread->blocked = 1;
769	}
770
771	/* Check for any unblocked threads in the kernel. */
772	kse_check_completed(curkse);
773
774	/*
775	 * Check for threads that have timed-out.
776	 */
777	kse_check_waitq(curkse);
778
779	/*
780	 * Switchout the current thread, if necessary, as the last step
781	 * so that it is inserted into the run queue (if it's runnable)
782	 * _after_ any other threads that were added to it above.
783	 */
784	if (curthread == NULL)
785		;  /* Nothing to do here. */
786	else if ((curthread->need_switchout == 0) &&
787	    (curthread->blocked == 0) && (THR_IN_CRITICAL(curthread))) {
788		/*
789		 * Resume the thread and tell it to yield when
790		 * it leaves the critical region.
791		 */
792		curthread->critical_yield = 1;
793		curthread->active = 1;
794		if ((curthread->flags & THR_FLAGS_IN_RUNQ) != 0)
795			KSE_RUNQ_REMOVE(curkse, curthread);
796		curkse->k_curthread = curthread;
797		curthread->kse = curkse;
798		DBG_MSG("Continuing thread %p in critical region\n",
799		    curthread);
800		kse_wakeup_multi(curkse);
801		if (curthread->lock_switch) {
802			KSE_SCHED_LOCK(curkse, curkse->k_kseg);
803			ret = _thread_switch(&curthread->tmbx, 0);
804		} else {
805			KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
806			ret = _thread_switch(&curthread->tmbx,
807		    		&curkse->k_mbx.km_curthread);
808		}
809		if (ret != 0)
810			PANIC("Can't resume thread in critical region\n");
811	}
812	else if ((curthread->flags & THR_FLAGS_IN_RUNQ) == 0)
813		kse_switchout_thread(curkse, curthread);
814	curkse->k_curthread = NULL;
815
816	kse_wakeup_multi(curkse);
817
818	/* This has to be done without the scheduling lock held. */
819	KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
820	kse_check_signals(curkse);
821	KSE_SCHED_LOCK(curkse, curkse->k_kseg);
822
823	dump_queues(curkse);
824
825	/* Check if there are no threads ready to run: */
826	while (((curthread = KSE_RUNQ_FIRST(curkse)) == NULL) &&
827	    (curkse->k_kseg->kg_threadcount != 0)) {
828		/*
829		 * Wait for a thread to become active or until there are
830		 * no more threads.
831		 */
832		td_wait = KSE_WAITQ_FIRST(curkse);
833		kse_wait(curkse, td_wait);
834		kse_check_completed(curkse);
835		kse_check_waitq(curkse);
836		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
837		kse_check_signals(curkse);
838		KSE_SCHED_LOCK(curkse, curkse->k_kseg);
839	}
840
841	/* Check for no more threads: */
842	if (curkse->k_kseg->kg_threadcount == 0) {
843		/*
844		 * Normally this shouldn't return, but it will if there
845		 * are other KSEs running that create new threads that
846		 * are assigned to this KSE[G].  For instance, if a scope
847		 * system thread were to create a scope process thread
848		 * and this kse[g] is the initial kse[g], then that newly
849		 * created thread would be assigned to us (the initial
850		 * kse[g]).
851		 */
852		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
853		kse_fini(curkse);
854		KSE_SCHED_LOCK(curkse, curkse->k_kseg);
855		curthread = KSE_RUNQ_FIRST(curkse);
856	}
857
858	THR_ASSERT(curthread != NULL,
859	    "Return from kse_wait/fini without thread.");
860	THR_ASSERT(curthread->state != PS_DEAD,
861	    "Trying to resume dead thread!");
862	KSE_RUNQ_REMOVE(curkse, curthread);
863
864	/*
865	 * Make the selected thread the current thread.
866	 */
867	curkse->k_curthread = curthread;
868
869	/*
870	 * Make sure the current thread's kse points to this kse.
871	 */
872	curthread->kse = curkse;
873
874	/*
875	 * Reset accounting.
876	 */
877	curthread->tmbx.tm_uticks = 0;
878	curthread->tmbx.tm_sticks = 0;
879
880	/*
881	 * Reset the time slice if this thread is running for the first
882	 * time or running again after using its full time slice allocation.
883	 */
884	if (curthread->slice_usec == -1)
885		curthread->slice_usec = 0;
886
887	/* Mark the thread active. */
888	curthread->active = 1;
889
890	/* Remove the frame reference. */
891	curframe = curthread->curframe;
892	curthread->curframe = NULL;
893
894	kse_wakeup_multi(curkse);
895
896	/* Unlock the scheduling queue: */
897	KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
898
899	/*
900	 * The thread's current signal frame will only be NULL if it
901	 * is being resumed after being blocked in the kernel.  In
902	 * this case, and if the thread needs to run down pending
903	 * signals or needs a cancellation check, we need to add a
904	 * signal frame to the thread's context.
905	 */
906#ifdef NOT_YET
907	if ((curframe == NULL) && ((curthread->check_pending != 0) ||
908	    (((curthread->cancelflags & THR_AT_CANCEL_POINT) == 0) &&
909	    ((curthread->cancelflags & PTHREAD_CANCEL_ASYNCHRONOUS) != 0))))
910		signalcontext(&curthread->tmbx.tm_context, 0,
911		    (__sighandler_t *)thr_resume_wrapper);
912#else
913	if ((curframe == NULL) && (curthread->check_pending != 0))
914		signalcontext(&curthread->tmbx.tm_context, 0,
915		    (__sighandler_t *)thr_resume_wrapper);
916#endif
917	/*
918	 * Continue the thread at its current frame:
919	 */
920	if (curthread->lock_switch) {
921		KSE_SCHED_LOCK(curkse, curkse->k_kseg);
922		ret = _thread_switch(&curthread->tmbx, 0);
923	} else {
924		ret = _thread_switch(&curthread->tmbx,
925		 	&curkse->k_mbx.km_curthread);
926	}
927	if (ret != 0)
928		PANIC("Thread has returned from _thread_switch");
929
930	/* This point should not be reached. */
931	PANIC("Thread has returned from _thread_switch");
932}
933
934static void
935kse_check_signals(struct kse *curkse)
936{
937	sigset_t sigset;
938	int i;
939
940	/* Deliver posted signals. */
941	for (i = 0; i < _SIG_WORDS; i++) {
942		atomic_swap_int(&curkse->k_mbx.km_sigscaught.__bits[i],
943		    0, &sigset.__bits[i]);
944	}
945	if (SIGNOTEMPTY(sigset)) {
946		/*
947		 * Dispatch each signal.
948		 *
949		 * XXX - There is no siginfo for any of these.
950		 *       I think there should be, especially for
951		 *       signals from other processes (si_pid, si_uid).
952		 */
953		for (i = 1; i < NSIG; i++) {
954			if (sigismember(&sigset, i) != 0) {
955				DBG_MSG("Dispatching signal %d\n", i);
956				_thr_sig_dispatch(curkse, i,
957				    NULL /* no siginfo */);
958			}
959		}
960		sigemptyset(&sigset);
961		__sys_sigprocmask(SIG_SETMASK, &sigset, NULL);
962	}
963}
964
965static void
966thr_resume_wrapper(int unused_1, siginfo_t *unused_2, ucontext_t *ucp)
967{
968	struct pthread *curthread = _get_curthread();
969
970	thr_resume_check(curthread, ucp, NULL);
971}
972
973static void
974thr_resume_check(struct pthread *curthread, ucontext_t *ucp,
975    struct pthread_sigframe *psf)
976{
977	/* Check signals before cancellations. */
978	while (curthread->check_pending != 0) {
979		/* Clear the pending flag. */
980		curthread->check_pending = 0;
981
982		/*
983		 * It's perfectly valid, though not portable, for
984		 * signal handlers to munge their interrupted context
985		 * and expect to return to it.  Ensure we use the
986		 * correct context when running down signals.
987		 */
988		_thr_sig_rundown(curthread, ucp, psf);
989	}
990
991#ifdef NOT_YET
992	if (((curthread->cancelflags & THR_AT_CANCEL_POINT) == 0) &&
993	    ((curthread->cancelflags & PTHREAD_CANCEL_ASYNCHRONOUS) != 0))
994		pthread_testcancel();
995#endif
996}
997
998/*
999 * Clean up a thread.  This must be called with the thread's KSE
1000 * scheduling lock held.  The thread must be a thread from the
1001 * KSE's group.
1002 */
1003static void
1004thr_cleanup(struct kse *curkse, struct pthread *thread)
1005{
1006	struct pthread *joiner;
1007
1008	if ((joiner = thread->joiner) != NULL) {
1009		thread->joiner = NULL;
1010		if ((joiner->state == PS_JOIN) &&
1011		    (joiner->join_status.thread == thread)) {
1012			joiner->join_status.thread = NULL;
1013
1014			/* Set the return status for the joining thread: */
1015			joiner->join_status.ret = thread->ret;
1016
1017			/* Make the thread runnable. */
1018			if (joiner->kseg == curkse->k_kseg)
1019				_thr_setrunnable_unlocked(joiner);
1020			else {
1021				KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
1022				KSE_SCHED_LOCK(curkse, joiner->kseg);
1023				_thr_setrunnable_unlocked(joiner);
1024				KSE_SCHED_UNLOCK(curkse, joiner->kseg);
1025				KSE_SCHED_LOCK(curkse, curkse->k_kseg);
1026			}
1027		}
1028		thread->attr.flags |= PTHREAD_DETACHED;
1029	}
1030
1031	if ((thread->attr.flags & PTHREAD_SCOPE_PROCESS) == 0) {
1032		/*
1033		 * Remove the thread from the KSEG's list of threads.
1034	 	 */
1035		KSEG_THRQ_REMOVE(thread->kseg, thread);
1036		/*
1037		 * Migrate the thread to the main KSE so that this
1038		 * KSE and KSEG can be cleaned when their last thread
1039		 * exits.
1040		 */
1041		thread->kseg = _kse_initial->k_kseg;
1042		thread->kse = _kse_initial;
1043	}
1044	thread->flags |= THR_FLAGS_GC_SAFE;
1045
1046	/*
1047	 * We can't hold the thread list lock while holding the
1048	 * scheduler lock.
1049	 */
1050	KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
1051	DBG_MSG("Adding thread %p to GC list\n", thread);
1052	KSE_LOCK_ACQUIRE(curkse, &_thread_list_lock);
1053	THR_GCLIST_ADD(thread);
1054	KSE_LOCK_RELEASE(curkse, &_thread_list_lock);
1055	KSE_SCHED_LOCK(curkse, curkse->k_kseg);
1056}
1057
1058void
1059_thr_gc(struct pthread *curthread)
1060{
1061	struct pthread *td, *td_next;
1062	kse_critical_t crit;
1063	TAILQ_HEAD(, pthread) worklist;
1064
1065	TAILQ_INIT(&worklist);
1066	crit = _kse_critical_enter();
1067	KSE_LOCK_ACQUIRE(curthread->kse, &_thread_list_lock);
1068
1069	/* Check the threads waiting for GC. */
1070	for (td = TAILQ_FIRST(&_thread_gc_list); td != NULL; td = td_next) {
1071		td_next = TAILQ_NEXT(td, gcle);
1072		if ((td->flags & THR_FLAGS_GC_SAFE) == 0)
1073			continue;
1074#ifdef NOT_YET
1075		else if (((td->attr.flags & PTHREAD_SCOPE_PROCESS) != 0) &&
1076		    (td->kse->k_mbx.km_flags == 0)) {
1077			/*
1078			 * The thread and KSE are operating on the same
1079			 * stack.  Wait for the KSE to exit before freeing
1080			 * the thread's stack as well as everything else.
1081			 */
1082			continue;
1083		}
1084#endif
1085		/*
1086		 * Remove the thread from the GC list.  If the thread
1087		 * isn't yet detached, it will get added back to the
1088		 * GC list at a later time.
1089		 */
1090		THR_GCLIST_REMOVE(td);
1091		DBG_MSG("Freeing thread %p stack\n", td);
1092		/*
1093		 * We can free the thread stack since it's no longer
1094		 * in use.
1095		 */
1096		_thr_stack_free(&td->attr);
1097		if (((td->attr.flags & PTHREAD_DETACHED) != 0) &&
1098		    (td->refcount == 0)) {
1099			/*
1100			 * The thread has detached and is no longer
1101			 * referenced.  It is safe to remove all
1102			 * remnants of the thread.
1103			 */
1104			THR_LIST_REMOVE(td);
1105			TAILQ_INSERT_HEAD(&worklist, td, gcle);
1106		}
1107	}
1108	KSE_LOCK_RELEASE(curthread->kse, &_thread_list_lock);
1109	_kse_critical_leave(crit);
1110
1111	while ((td = TAILQ_FIRST(&worklist)) != NULL) {
1112		TAILQ_REMOVE(&worklist, td, gcle);
1113
1114		if ((td->attr.flags & PTHREAD_SCOPE_PROCESS) != 0) {
1115			crit = _kse_critical_enter();
1116			KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
1117			kse_free_unlocked(td->kse);
1118			kseg_free_unlocked(td->kseg);
1119			KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
1120			_kse_critical_leave(crit);
1121		}
1122		DBG_MSG("Freeing thread %p\n", td);
1123		_thr_free(curthread, td);
1124	}
1125}
1126
1127
1128/*
1129 * Only new threads that are running or suspended may be scheduled.
1130 */
1131int
1132_thr_schedule_add(struct pthread *curthread, struct pthread *newthread)
1133{
1134	struct kse *curkse;
1135	kse_critical_t crit;
1136	int need_start;
1137	int ret;
1138
1139	/*
1140	 * If this is the first time creating a thread, make sure
1141	 * the mailbox is set for the current thread.
1142	 */
1143	if ((newthread->attr.flags & PTHREAD_SCOPE_SYSTEM) != 0) {
1144#ifdef NOT_YET
1145		/* We use the thread's stack as the KSE's stack. */
1146		newthread->kse->k_mbx.km_stack.ss_sp =
1147		    newthread->attr.stackaddr_attr;
1148		newthread->kse->k_mbx.km_stack.ss_size =
1149		    newthread->attr.stacksize_attr;
1150#endif
1151		/*
1152		 * No need to lock the scheduling queue since the
1153		 * KSE/KSEG pair have not yet been started.
1154		 */
1155		KSEG_THRQ_ADD(newthread->kseg, newthread);
1156		TAILQ_INSERT_TAIL(&newthread->kseg->kg_kseq, newthread->kse,
1157		    k_kgqe);
1158		newthread->kseg->kg_ksecount = 1;
1159		if (newthread->state == PS_RUNNING)
1160			THR_RUNQ_INSERT_TAIL(newthread);
1161		newthread->kse->k_curthread = NULL;
1162		newthread->kse->k_mbx.km_flags = 0;
1163		newthread->kse->k_mbx.km_func = (kse_func_t *)kse_sched_multi;
1164		newthread->kse->k_mbx.km_quantum = 0;
1165
1166		/*
1167		 * This thread needs a new KSE and KSEG.
1168		 */
1169		crit = _kse_critical_enter();
1170		curkse = _get_curkse();
1171		_ksd_setprivate(&newthread->kse->k_ksd);
1172		newthread->kse->k_flags |= KF_INITIALIZED;
1173		ret = kse_create(&newthread->kse->k_mbx, 1);
1174		if (ret != 0)
1175			ret = errno;
1176		_ksd_setprivate(&curkse->k_ksd);
1177		_kse_critical_leave(crit);
1178	}
1179	else {
1180		/*
1181		 * Lock the KSE and add the new thread to its list of
1182		 * assigned threads.  If the new thread is runnable, also
1183		 * add it to the KSE's run queue.
1184		 */
1185		need_start = 0;
1186		KSE_SCHED_LOCK(curthread->kse, newthread->kseg);
1187		KSEG_THRQ_ADD(newthread->kseg, newthread);
1188		if (newthread->state == PS_RUNNING)
1189			THR_RUNQ_INSERT_TAIL(newthread);
1190		if ((newthread->kse->k_flags & KF_STARTED) == 0) {
1191			/*
1192			 * This KSE hasn't been started yet.  Start it
1193			 * outside of holding the lock.
1194			 */
1195			newthread->kse->k_flags |= KF_STARTED;
1196			newthread->kse->k_mbx.km_func =
1197			    (kse_func_t *)kse_sched_multi;
1198			newthread->kse->k_mbx.km_flags = 0;
1199			need_start = 1;
1200		}
1201		KSE_SCHED_UNLOCK(curthread->kse, newthread->kseg);
1202
1203	  	if (need_start != 0)
1204			kse_create(&newthread->kse->k_mbx, 0);
1205		else if ((newthread->state == PS_RUNNING) &&
1206		    KSE_IS_IDLE(newthread->kse)) {
1207			/*
1208			 * The thread is being scheduled on another KSEG.
1209			 */
1210			kse_wakeup_one(newthread);
1211		}
1212		ret = 0;
1213	}
1214	return (ret);
1215}
1216
1217void
1218kse_waitq_insert(struct pthread *thread)
1219{
1220	struct pthread *td;
1221
1222	if (thread->wakeup_time.tv_sec == -1)
1223		TAILQ_INSERT_TAIL(&thread->kse->k_schedq->sq_waitq, thread,
1224		    pqe);
1225	else {
1226		td = TAILQ_FIRST(&thread->kse->k_schedq->sq_waitq);
1227		while ((td != NULL) && (td->wakeup_time.tv_sec != -1) &&
1228		    ((td->wakeup_time.tv_sec < thread->wakeup_time.tv_sec) ||
1229		    ((td->wakeup_time.tv_sec == thread->wakeup_time.tv_sec) &&
1230		    (td->wakeup_time.tv_nsec <= thread->wakeup_time.tv_nsec))))
1231			td = TAILQ_NEXT(td, pqe);
1232		if (td == NULL)
1233			TAILQ_INSERT_TAIL(&thread->kse->k_schedq->sq_waitq,
1234			    thread, pqe);
1235		else
1236			TAILQ_INSERT_BEFORE(td, thread, pqe);
1237	}
1238	thread->flags |= THR_FLAGS_IN_WAITQ;
1239}
1240
1241/*
1242 * This must be called with the scheduling lock held.
1243 */
1244static void
1245kse_check_completed(struct kse *kse)
1246{
1247	struct pthread *thread;
1248	struct kse_thr_mailbox *completed;
1249
1250	if ((completed = kse->k_mbx.km_completed) != NULL) {
1251		kse->k_mbx.km_completed = NULL;
1252		while (completed != NULL) {
1253			thread = completed->tm_udata;
1254			DBG_MSG("Found completed thread %p, name %s\n",
1255			    thread,
1256			    (thread->name == NULL) ? "none" : thread->name);
1257			thread->blocked = 0;
1258			if (thread != kse->k_curthread) {
1259				if ((thread->flags & THR_FLAGS_SUSPENDED) != 0)
1260					THR_SET_STATE(thread, PS_SUSPENDED);
1261				else
1262					KSE_RUNQ_INSERT_TAIL(kse, thread);
1263			}
1264			completed = completed->tm_next;
1265		}
1266	}
1267}
1268
1269/*
1270 * This must be called with the scheduling lock held.
1271 */
1272static void
1273kse_check_waitq(struct kse *kse)
1274{
1275	struct pthread	*pthread;
1276	struct timespec ts;
1277
1278	KSE_GET_TOD(kse, &ts);
1279
1280	/*
1281	 * Wake up threads that have timed out.  This has to be
1282	 * done before adding the current thread to the run queue
1283	 * so that a CPU intensive thread doesn't get preference
1284	 * over waiting threads.
1285	 */
1286	while (((pthread = KSE_WAITQ_FIRST(kse)) != NULL) &&
1287	    thr_timedout(pthread, &ts)) {
1288		/* Remove the thread from the wait queue: */
1289		KSE_WAITQ_REMOVE(kse, pthread);
1290		DBG_MSG("Found timedout thread %p in waitq\n", pthread);
1291
1292		/* Indicate the thread timedout: */
1293		/* Indicate the thread timed out: */
1294
1295		/* Add the thread to the priority queue: */
1296		if ((pthread->flags & THR_FLAGS_SUSPENDED) != 0)
1297			THR_SET_STATE(pthread, PS_SUSPENDED);
1298		else {
1299			THR_SET_STATE(pthread, PS_RUNNING);
1300			KSE_RUNQ_INSERT_TAIL(kse, pthread);
1301		}
1302	}
1303}
1304
1305static int
1306thr_timedout(struct pthread *thread, struct timespec *curtime)
1307{
1308	if (thread->wakeup_time.tv_sec < 0)
1309		return (0);
1310	else if (thread->wakeup_time.tv_sec > curtime->tv_sec)
1311		return (0);
1312	else if ((thread->wakeup_time.tv_sec == curtime->tv_sec) &&
1313	    (thread->wakeup_time.tv_nsec > curtime->tv_nsec))
1314		return (0);
1315	else
1316		return (1);
1317}
1318
1319/*
1320 * This must be called with the scheduling lock held.
1321 *
1322 * Each thread has a time slice, a wakeup time (used when it wants
1323 * to wait for a specified amount of time), a run state, and an
1324 * active flag.
1325 *
1326 * When a thread gets run by the scheduler, the active flag is
1327 * set to non-zero (1).  When a thread performs an explicit yield
1328 * or schedules a state change, it enters the scheduler and the
1329 * active flag is cleared.  When the active flag is still seen
1330 * set in the scheduler, that means that the thread is blocked in
1331 * the kernel (because it is cleared before entering the scheduler
1332 * in all other instances).
1333 *
1334 * The wakeup time is only set for those states that can timeout.
1335 * It is set to (-1, -1) for all other instances.
1336 *
1337 * The thread's run state, aside from being useful when debugging,
1338 * is used to place the thread in an appropriate queue.  There
1339 * are 2 basic queues:
1340 *
1341 *   o run queue - queue ordered by priority for all threads
1342 *                 that are runnable
1343 *   o waiting queue - queue sorted by wakeup time for all threads
1344 *                     that are not otherwise runnable (not blocked
1345 *                     in kernel, not waiting for locks)
1346 *
1347 * The thread's time slice is used for round-robin scheduling
1348 * (the default scheduling policy).  While a SCHED_RR thread
1349 * is runnable, its time slice accumulates.  When it reaches
1350 * the time slice interval, it gets reset and added to the end
1351 * of the queue of threads at its priority.  When a thread is
1352 * no longer runnable (blocks in kernel, waits, etc.), its
1353 * time slice is reset.
1354 *
1355 * The job of kse_switchout_thread() is to handle all of the above.
1356 */
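/*
 * Worked example (illustrative numbers only; the real values of
 * _clock_res_usec and TIMESLICE_USEC come from elsewhere): with a
 * clock resolution of 10000 usec per tick and a 100000 usec time
 * slice, a preempted SCHED_RR thread charged 11 combined user and
 * system ticks accumulates 11 * 10000 = 110000 usec, exceeds its
 * quantum, has slice_usec reset to -1, and is queued at the tail of
 * its priority; with only 9 ticks it keeps its partial slice and is
 * queued at the head instead.
 */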
1357static void
1358kse_switchout_thread(struct kse *kse, struct pthread *thread)
1359{
1360	int level;
1361
1362	/*
1363	 * Place the currently running thread into the
1364	 * appropriate queue(s).
1365	 */
1366	DBG_MSG("Switching out thread %p, state %d\n", thread, thread->state);
1367	if (thread->blocked != 0) {
1368		thread->active = 0;
1369		thread->need_switchout = 0;
1370		/* This thread must have blocked in the kernel. */
1371		/* thread->slice_usec = -1;*/	/* restart timeslice */
1372		/*
1373		 * XXX - Check for pending signals for this thread to
1374		 *       see if we need to interrupt it in the kernel.
1375		 */
1376		/* if (thread->check_pending != 0) */
1377		if ((thread->slice_usec != -1) &&
1378		    (thread->attr.sched_policy != SCHED_FIFO))
1379			thread->slice_usec += (thread->tmbx.tm_uticks
1380			    + thread->tmbx.tm_sticks) * _clock_res_usec;
1381	}
1382	else {
1383		switch (thread->state) {
1384		case PS_DEAD:
1385			/*
1386			 * The scheduler is operating on a different
1387			 * stack.  It is safe to do garbage collecting
1388			 * here.
1389			 */
1390			thread->active = 0;
1391			thread->need_switchout = 0;
1392			thr_cleanup(kse, thread);
1393			return;
1394			break;
1395
1396		case PS_RUNNING:
1397			if ((thread->flags & THR_FLAGS_SUSPENDED) != 0)
1398				THR_SET_STATE(thread, PS_SUSPENDED);
1399			break;
1400
1401		case PS_COND_WAIT:
1402		case PS_SLEEP_WAIT:
1403			/* Insert into the waiting queue: */
1404			KSE_WAITQ_INSERT(kse, thread);
1405			break;
1406
1407		case PS_LOCKWAIT:
1408			/*
1409			 * This state doesn't timeout.
1410			 */
1411			thread->wakeup_time.tv_sec = -1;
1412			thread->wakeup_time.tv_nsec = -1;
1413			level = thread->locklevel - 1;
1414			if (_LCK_BUSY(&thread->lockusers[level]))
1415				KSE_WAITQ_INSERT(kse, thread);
1416			else
1417				THR_SET_STATE(thread, PS_RUNNING);
1418			break;
1419
1420		case PS_JOIN:
1421		case PS_MUTEX_WAIT:
1422		case PS_SIGSUSPEND:
1423		case PS_SIGWAIT:
1424		case PS_SUSPENDED:
1425		case PS_DEADLOCK:
1426		default:
1427			/*
1428			 * These states don't timeout.
1429			 */
1430			thread->wakeup_time.tv_sec = -1;
1431			thread->wakeup_time.tv_nsec = -1;
1432
1433			/* Insert into the waiting queue: */
1434			KSE_WAITQ_INSERT(kse, thread);
1435			break;
1436		}
1437		if (thread->state != PS_RUNNING) {
1438			/* Restart the time slice: */
1439			thread->slice_usec = -1;
1440		} else {
1441			if (thread->need_switchout != 0)
1442				/*
1443				 * The thread yielded on its own;
1444				 * restart the timeslice.
1445				 */
1446				thread->slice_usec = -1;
1447			else if ((thread->slice_usec != -1) &&
1448	   		    (thread->attr.sched_policy != SCHED_FIFO)) {
1449				thread->slice_usec += (thread->tmbx.tm_uticks
1450				    + thread->tmbx.tm_sticks) * _clock_res_usec;
1451				/* Check for time quantum exceeded: */
1452				if (thread->slice_usec > TIMESLICE_USEC)
1453					thread->slice_usec = -1;
1454			}
1455			if (thread->slice_usec == -1) {
1456				/*
1457				 * The thread exceeded its time quantum or
1458				 * it yielded the CPU; place it at the tail
1459				 * of the queue for its priority.
1460				 */
1461				KSE_RUNQ_INSERT_TAIL(kse, thread);
1462			} else {
1463				/*
1464				 * The thread hasn't exceeded its interval
1465				 * The thread hasn't exceeded its interval.
1466				 * priority.
1467				 */
1468				KSE_RUNQ_INSERT_HEAD(kse, thread);
1469			}
1470		}
1471	}
1472	thread->active = 0;
1473	thread->need_switchout = 0;
1474}
1475
1476/*
1477 * This function waits for the smallest timeout value of any waiting
1478 * thread, or until it receives a message from another KSE.
1479 *
1480 * This must be called with the scheduling lock held.
1481 */
1482static void
1483kse_wait(struct kse *kse, struct pthread *td_wait)
1484{
1485	struct timespec ts, ts_sleep;
1486	int saved_flags;
1487
1488	KSE_GET_TOD(kse, &ts);
1489
1490	if ((td_wait == NULL) || (td_wait->wakeup_time.tv_sec < 0)) {
1491		/* Limit sleep to no more than 1 minute. */
1492		ts_sleep.tv_sec = 60;
1493		ts_sleep.tv_nsec = 0;
1494	} else {
1495		TIMESPEC_SUB(&ts_sleep, &td_wait->wakeup_time, &ts);
1496		if (ts_sleep.tv_sec > 60) {
1497			ts_sleep.tv_sec = 60;
1498			ts_sleep.tv_nsec = 0;
1499		}
1500	}
1501	/* Don't sleep for negative times. */
1502	if ((ts_sleep.tv_sec >= 0) && (ts_sleep.tv_nsec >= 0)) {
1503		KSE_SET_IDLE(kse);
1504		kse->k_kseg->kg_idle_kses++;
1505		KSE_SCHED_UNLOCK(kse, kse->k_kseg);
1506		saved_flags = kse->k_mbx.km_flags;
1507		kse->k_mbx.km_flags |= KMF_NOUPCALL;
1508		kse_release(&ts_sleep);
1509		kse->k_mbx.km_flags = saved_flags;
1510		KSE_SCHED_LOCK(kse, kse->k_kseg);
1511		if (KSE_IS_IDLE(kse)) {
1512			KSE_CLEAR_IDLE(kse);
1513			kse->k_kseg->kg_idle_kses--;
1514		}
1515	}
1516}
1517
1518/*
1519 * Avoid calling this kse_exit() so as not to confuse it with the
1520 * system call of the same name.
1521 */
1522static void
1523kse_fini(struct kse *kse)
1524{
1525	struct timespec ts;
1526	struct kse_group *free_kseg = NULL;
1527
1528	if ((kse->k_kseg->kg_flags & KGF_SINGLE_THREAD) != 0)
1529		kse_exit();
1530	/*
1531	 * Check to see if this is one of the main kses.
1532	 */
1533	else if (kse->k_kseg != _kse_initial->k_kseg) {
1534		/* Remove this KSE from the KSEG's list of KSEs. */
1535		KSE_SCHED_LOCK(kse, kse->k_kseg);
1536		TAILQ_REMOVE(&kse->k_kseg->kg_kseq, kse, k_kgqe);
1537		kse->k_kseg->kg_ksecount--;
1538		if (TAILQ_EMPTY(&kse->k_kseg->kg_kseq))
1539			free_kseg = kse->k_kseg;
1540		KSE_SCHED_UNLOCK(kse, kse->k_kseg);
1541
1542		/*
1543		 * Add this KSE to the list of free KSEs along with
1544		 * the KSEG if it is now orphaned.
1545		 */
1546#ifdef NOT_YET
1547		KSE_LOCK_ACQUIRE(kse, &kse_lock);
1548		if (free_kseg != NULL)
1549			kseg_free_unlocked(free_kseg);
1550		kse_free_unlocked(kse);
1551		KSE_LOCK_RELEASE(kse, &kse_lock);
1552#endif
1553		kse_exit();
1554		/* Never returns. */
1555	} else {
1556		/*
1557		 * Wait for the last KSE/thread to exit, or for more
1558		 * threads to be created (it is possible for additional
1559		 * scope process threads to be created after the main
1560		 * thread exits).
1561		 */
1562		ts.tv_sec = 120;
1563		ts.tv_nsec = 0;
1564		KSE_SET_WAIT(kse);
1565		KSE_SCHED_LOCK(kse, kse->k_kseg);
1566		if ((active_kse_count > 1) &&
1567		    (kse->k_kseg->kg_threadcount == 0)) {
1568			KSE_SCHED_UNLOCK(kse, kse->k_kseg);
1569			kse_release(&ts);
1570			/* The above never returns. */
1571		}
1572		else
1573			KSE_SCHED_UNLOCK(kse, kse->k_kseg);
1574
1575		/* There are no more threads; exit this process: */
1576		if (kse->k_kseg->kg_threadcount == 0) {
1577			/* kse_exit(); */
1578			__isthreaded = 0;
1579			exit(0);
1580		}
1581	}
1582}
1583
1584void
1585_thr_sig_add(struct pthread *thread, int sig, siginfo_t *info, ucontext_t *ucp)
1586{
1587	struct kse *curkse;
1588
1589	curkse = _get_curkse();
1590
1591	KSE_SCHED_LOCK(curkse, thread->kseg);
1592	/*
1593	 * A thread's assigned KSE can't change out from under us
1594	 * when we hold the scheduler lock.
1595	 */
1596	if (THR_IS_ACTIVE(thread)) {
1597		/* Thread is active.  Can't install the signal for it. */
1598		/* Make a note in the thread that it has a signal. */
1599		sigaddset(&thread->sigpend, sig);
1600		thread->check_pending = 1;
1601	}
1602	else {
1603		/* Make a note in the thread that it has a signal. */
1604		sigaddset(&thread->sigpend, sig);
1605		thread->check_pending = 1;
1606
1607		if (thread->blocked != 0) {
1608			/* Tell the kernel to interrupt the thread. */
1609			kse_thr_interrupt(&thread->tmbx);
1610		}
1611	}
1612	KSE_SCHED_UNLOCK(curkse, thread->kseg);
1613}
1614
1615void
1616_thr_set_timeout(const struct timespec *timeout)
1617{
1618	struct pthread	*curthread = _get_curthread();
1619	struct timespec ts;
1620
1621	/* Reset the timeout flag for the running thread: */
1622	curthread->timeout = 0;
1623
1624	/* Check if the thread is to wait forever: */
1625	if (timeout == NULL) {
1626		/*
1627		 * Set the wakeup time to something that can be recognised as
1628		 * different to an actual time of day:
1629		 */
1630		curthread->wakeup_time.tv_sec = -1;
1631		curthread->wakeup_time.tv_nsec = -1;
1632	}
1633	/* Check if no waiting is required: */
1634	else if ((timeout->tv_sec == 0) && (timeout->tv_nsec == 0)) {
1635		/* Set the wake up time to 'immediately': */
1636		curthread->wakeup_time.tv_sec = 0;
1637		curthread->wakeup_time.tv_nsec = 0;
1638	} else {
1639		/* Calculate the time for the current thread to wakeup: */
1640		KSE_GET_TOD(curthread->kse, &ts);
1641		TIMESPEC_ADD(&curthread->wakeup_time, &ts, timeout);
1642	}
1643}
1644
1645void
1646_thr_panic_exit(char *file, int line, char *msg)
1647{
1648	char buf[256];
1649
1650	snprintf(buf, sizeof(buf), "(%s:%d) %s\n", file, line, msg);
1651	__sys_write(2, buf, strlen(buf));
1652	abort();
1653}
1654
1655void
1656_thr_setrunnable(struct pthread *curthread, struct pthread *thread)
1657{
1658	kse_critical_t crit;
1659
1660	crit = _kse_critical_enter();
1661	KSE_SCHED_LOCK(curthread->kse, thread->kseg);
1662	_thr_setrunnable_unlocked(thread);
1663	KSE_SCHED_UNLOCK(curthread->kse, thread->kseg);
1664	_kse_critical_leave(crit);
1665}
1666
1667void
1668_thr_setrunnable_unlocked(struct pthread *thread)
1669{
1670	if ((thread->kseg->kg_flags & KGF_SINGLE_THREAD) != 0) {
1671		/* No silly queues for these threads. */
1672		if ((thread->flags & THR_FLAGS_SUSPENDED) != 0)
1673			THR_SET_STATE(thread, PS_SUSPENDED);
1674		else
1675			THR_SET_STATE(thread, PS_RUNNING);
1676	} else if (thread->state != PS_RUNNING) {
1677		if ((thread->flags & THR_FLAGS_IN_WAITQ) != 0)
1678			KSE_WAITQ_REMOVE(thread->kse, thread);
1679		if ((thread->flags & THR_FLAGS_SUSPENDED) != 0)
1680			THR_SET_STATE(thread, PS_SUSPENDED);
1681		else {
1682			THR_SET_STATE(thread, PS_RUNNING);
1683			if ((thread->blocked == 0) &&
1684			    (thread->flags & THR_FLAGS_IN_RUNQ) == 0)
1685				THR_RUNQ_INSERT_TAIL(thread);
1686		}
1687	}
1688        /*
1689         * XXX - Threads are not yet assigned to specific KSEs; they are
1690         *       assigned to the KSEG.  So the fact that a thread's KSE is
1691         *       waiting doesn't necessarily mean that it will be the KSE
1692         *       that runs the thread after the lock is granted.  But we
1693         *       don't know if the other KSEs within the same KSEG are
1694         *       also in a waiting state or not, so we err on the side of
1695         *       caution and wake up the thread's last known KSE.  We
1696         *       ensure that the thread's KSE doesn't change while its
1697         *       scheduling lock is held, so it is safe to reference it
1698         *       (the KSE).  If the KSE wakes up and doesn't find any more
1699         *       work it will again go back to waiting so no harm is done.
1700         */
1701	kse_wakeup_one(thread);
1702}
1703
1704static void
1705kse_wakeup_one(struct pthread *thread)
1706{
1707	struct kse *ke;
1708
1709	if (KSE_IS_IDLE(thread->kse)) {
1710		KSE_CLEAR_IDLE(thread->kse);
1711		thread->kseg->kg_idle_kses--;
1712		KSE_WAKEUP(thread->kse);
1713	} else {
1714		TAILQ_FOREACH(ke, &thread->kseg->kg_kseq, k_kgqe) {
1715			if (KSE_IS_IDLE(ke)) {
1716				KSE_CLEAR_IDLE(ke);
1717				ke->k_kseg->kg_idle_kses--;
1718				KSE_WAKEUP(ke);
1719				return;
1720			}
1721		}
1722	}
1723}
1724
1725static void
1726kse_wakeup_multi(struct kse *curkse)
1727{
1728	struct kse *ke;
1729	int tmp;
1730
1731	if ((tmp = KSE_RUNQ_THREADS(curkse)) && curkse->k_kseg->kg_idle_kses) {
1732		TAILQ_FOREACH(ke, &curkse->k_kseg->kg_kseq, k_kgqe) {
1733			if (KSE_IS_IDLE(ke)) {
1734				KSE_CLEAR_IDLE(ke);
1735				ke->k_kseg->kg_idle_kses--;
1736				KSE_WAKEUP(ke);
1737				if (--tmp == 0)
1738					break;
1739			}
1740		}
1741	}
1742}
1743
1744struct pthread *
1745_get_curthread(void)
1746{
1747	return (_ksd_curthread);
1748}
1749
1750/* This assumes the caller has disabled upcalls. */
1751struct kse *
1752_get_curkse(void)
1753{
1754	return (_ksd_curkse);
1755}
1756
1757void
1758_set_curkse(struct kse *kse)
1759{
1760	_ksd_setprivate(&kse->k_ksd);
1761}
1762
1763/*
1764 * Allocate a new KSEG.
1765 *
1766 * We allow the current thread to be NULL in the case that this
1767 * is the first time a KSEG is being created (library initialization).
1768 * In this case, we don't need to (and can't) take any locks.
1769 */
1770struct kse_group *
1771_kseg_alloc(struct pthread *curthread)
1772{
1773	struct kse_group *kseg = NULL;
1774	kse_critical_t crit;
1775
1776	if ((curthread != NULL) && (free_kseg_count > 0)) {
1777		/* Use the kse lock for the kseg queue. */
1778		crit = _kse_critical_enter();
1779		KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
1780		if ((kseg = TAILQ_FIRST(&free_kse_groupq)) != NULL) {
1781			TAILQ_REMOVE(&free_kse_groupq, kseg, kg_qe);
1782			free_kseg_count--;
1783			active_kseg_count++;
1784			TAILQ_INSERT_TAIL(&active_kse_groupq, kseg, kg_qe);
1785		}
1786		KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
1787		_kse_critical_leave(crit);
1788		if (kseg)
1789			kseg_reinit(kseg);
1790	}
1791
1792	/*
1793	 * A KSE group wasn't found in the free list (or we could not
1794	 * check it); attempt to allocate a new one.  This succeeds only
1795	 * if both the malloc() and the run queue allocation succeed.
1796	 */
1797	if ((kseg == NULL) &&
1798	    ((kseg = (struct kse_group *)malloc(sizeof(*kseg))) != NULL)) {
1799		if (_pq_alloc(&kseg->kg_schedq.sq_runq,
1800		    THR_MIN_PRIORITY, THR_LAST_PRIORITY) != 0) {
1801			free(kseg);
1802			kseg = NULL;
1803		} else {
1804			kseg_init(kseg);
1805			/* Add the KSEG to the list of active KSEGs. */
1806			if (curthread != NULL) {
1807				crit = _kse_critical_enter();
1808				KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
1809				active_kseg_count++;
1810				TAILQ_INSERT_TAIL(&active_kse_groupq,
1811				    kseg, kg_qe);
1812				KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
1813				_kse_critical_leave(crit);
1814			} else {
1815				active_kseg_count++;
1816				TAILQ_INSERT_TAIL(&active_kse_groupq,
1817				    kseg, kg_qe);
1818			}
1819		}
1820	}
1821	return (kseg);
1822}
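
/*
 * Illustrative sketch (an assumption about typical use, not a quote
 * of the actual callers): code that needs a fresh KSE plus KSE group,
 * for example when setting up a system-scope thread, would pair the
 * two allocators and release both on partial failure:
 *
 *	kse = _kse_alloc(curthread);
 *	kseg = _kseg_alloc(curthread);
 *	if (kse == NULL || kseg == NULL) {
 *		if (kseg != NULL)
 *			_kseg_free(kseg);
 *		if (kse != NULL)
 *			_kse_free(curthread, kse);
 *	}
 */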
1823
1824/*
1825 * This must be called with the kse lock held and when there are
1826 * no more threads that reference it.
1827 */
1828static void
1829kseg_free_unlocked(struct kse_group *kseg)
1830{
1831	TAILQ_REMOVE(&active_kse_groupq, kseg, kg_qe);
1832	TAILQ_INSERT_HEAD(&free_kse_groupq, kseg, kg_qe);
1833	free_kseg_count++;
1834	active_kseg_count--;
1835}
1836
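/*
 * Free a KSE group, entering a critical section and taking the kse
 * lock around kseg_free_unlocked().
 */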
1837void
1838_kseg_free(struct kse_group *kseg)
1839{
1840	struct kse *curkse;
1841	kse_critical_t crit;
1842
1843	crit = _kse_critical_enter();
1844	curkse = _get_curkse();
1845	KSE_LOCK_ACQUIRE(curkse, &kse_lock);
1846	kseg_free_unlocked(kseg);
1847	KSE_LOCK_RELEASE(curkse, &kse_lock);
1848	_kse_critical_leave(crit);
1849}
1850
1851/*
1852 * Allocate a new KSE.
1853 *
1854 * We allow the current thread to be NULL in the case that this
1855 * is the first time a KSE is being created (library initialization).
1856 * In this case, we don't need to (and can't) take any locks.
1857 */
1858struct kse *
1859_kse_alloc(struct pthread *curthread)
1860{
1861	struct kse *kse = NULL;
1862	kse_critical_t crit;
1863	int need_ksd = 0;
1864	int i;
1865
1866	if ((curthread != NULL) && (free_kse_count > 0)) {
1867		crit = _kse_critical_enter();
1868		KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
1869		/* Search for a finished KSE. */
1870		kse = TAILQ_FIRST(&free_kseq);
1871#ifdef NOT_YET
1872#define KEMBX_DONE	0x04
1873		while ((kse != NULL) &&
1874		    ((kse->k_mbx.km_flags & KEMBX_DONE) == 0)) {
1875			kse = TAILQ_NEXT(kse, k_qe);
1876		}
1877#undef KEMBX_DONE
1878#endif
1879		if (kse != NULL) {
1880			TAILQ_REMOVE(&free_kseq, kse, k_qe);
1881			free_kse_count--;
1882			TAILQ_INSERT_TAIL(&active_kseq, kse, k_qe);
1883			active_kse_count++;
1884		}
1885		KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
1886		_kse_critical_leave(crit);
1887		if (kse != NULL)
1888			kse_reinit(kse);
1889	}
1890	if ((kse == NULL) &&
1891	    ((kse = (struct kse *)malloc(sizeof(*kse))) != NULL)) {
1892		bzero(kse, sizeof(*kse));
1893
1894		/* Initialize the lockusers. */
1895		for (i = 0; i < MAX_KSE_LOCKLEVEL; i++) {
1896			_lockuser_init(&kse->k_lockusers[i], (void *)kse);
1897			_LCK_SET_PRIVATE2(&kse->k_lockusers[i], NULL);
1898		}
1899		/* _lock_init(kse->k_lock, ...) */
1900
1901		/* We had to malloc a kse; mark it as needing a new ID. */
1902		need_ksd = 1;
1903
1904		/*
1905		 * Create the KSE context.
1906		 *
1907		 * XXX - For now this is done here in the allocation.
1908		 *       In the future, we may want to have it done
1909		 *       outside the allocation so that scope system
1910		 *       threads (one thread per KSE) are not required
1911		 *       to have a stack for an unneeded kse upcall.
1912		 */
1913		kse->k_mbx.km_func = (kse_func_t *)kse_sched_multi;
1914		kse->k_mbx.km_stack.ss_sp = (char *)malloc(KSE_STACKSIZE);
1915		kse->k_mbx.km_stack.ss_size = KSE_STACKSIZE;
1916		kse->k_mbx.km_udata = (void *)kse;
1917		kse->k_mbx.km_quantum = 20000;
1918		/*
1919		 * We need to keep a copy of the stack in case it
1920		 * doesn't get used; a KSE running a scope system
1921		 * thread will use that thread's stack.
1922		 */
1923		kse->k_stack.ss_sp = kse->k_mbx.km_stack.ss_sp;
1924		kse->k_stack.ss_size = kse->k_mbx.km_stack.ss_size;
1925		if (kse->k_mbx.km_stack.ss_sp == NULL) {
1926			for (i = 0; i < MAX_KSE_LOCKLEVEL; i++) {
1927				_lockuser_destroy(&kse->k_lockusers[i]);
1928			}
1929			/* _lock_destroy(&kse->k_lock); */
1930			free(kse);
1931			kse = NULL;
1932		}
1933	}
1934	if ((kse != NULL) && (need_ksd != 0)) {
1935		/* This KSE needs initialization. */
1936		if (curthread != NULL) {
1937			crit = _kse_critical_enter();
1938			KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
1939		}
1940		/* Initialize KSD inside of the lock. */
1941		if (_ksd_create(&kse->k_ksd, (void *)kse, sizeof(*kse)) != 0) {
1942			if (curthread != NULL) {
1943				KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
1944				_kse_critical_leave(crit);
1945			}
1946			free(kse->k_mbx.km_stack.ss_sp);
1947			for (i = 0; i < MAX_KSE_LOCKLEVEL; i++) {
1948				_lockuser_destroy(&kse->k_lockusers[i]);
1949			}
1950			free(kse);
1951			return (NULL);
1952		}
1953		kse->k_flags = 0;
1954		TAILQ_INSERT_TAIL(&active_kseq, kse, k_qe);
1955		active_kse_count++;
1956		if (curthread != NULL) {
1957			KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
1958			_kse_critical_leave(crit);
1959		}
1960	}
1961	return (kse);
1962}
1963
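/*
 * Reset a cached KSE before it is handed out again from the free
 * list.  The mailbox and per-KSE scheduling state are cleared; the
 * KSD, the lockusers, and the saved upcall stack (k_stack) from the
 * original allocation are left intact.
 */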
1964static void
1965kse_reinit(struct kse *kse)
1966{
1967	bzero(&kse->k_mbx, sizeof(struct kse_mailbox));
1968	kse->k_curthread = NULL;
1969	kse->k_kseg = NULL;
1970	kse->k_schedq = NULL;
1971	kse->k_locklevel = 0;
1972	sigemptyset(&kse->k_sigmask);
1973	bzero(&kse->k_sigq, sizeof(kse->k_sigq));
1974	kse->k_check_sigq = 0;
1975	kse->k_flags = 0;
1976	kse->k_waiting = 0;
1977	kse->k_error = 0;
1978	kse->k_cpu = 0;
1979	kse->k_done = 0;
1980}
1981
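/*
 * Move a KSE from the active list to the free list.  The caller must
 * hold the kse lock, except during library initialization when no
 * locks (or current thread) exist yet; see _kse_free().
 */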
1982void
1983kse_free_unlocked(struct kse *kse)
1984{
1985	TAILQ_REMOVE(&active_kseq, kse, k_qe);
1986	active_kse_count--;
1987	kse->k_kseg = NULL;
1988	kse->k_mbx.km_quantum = 20000;
1989	kse->k_flags &= ~KF_INITIALIZED;
1990	TAILQ_INSERT_HEAD(&free_kseq, kse, k_qe);
1991	free_kse_count++;
1992}
1993
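/*
 * Return a KSE to the free cache, taking the kse lock unless this is
 * called during library initialization (curthread == NULL), when no
 * locks can be taken.
 */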
1994void
1995_kse_free(struct pthread *curthread, struct kse *kse)
1996{
1997	kse_critical_t crit;
1998
1999	if (curthread == NULL)
2000		kse_free_unlocked(kse);
2001	else {
2002		crit = _kse_critical_enter();
2003		KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
2004		kse_free_unlocked(kse);
2005		KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
2006		_kse_critical_leave(crit);
2007	}
2008}
2009
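/*
 * One-time initialization of a newly malloc()'d KSE group, including
 * its lock; kseg_reinit() handles the part that is redone each time a
 * group is reused from the free list.
 */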
2010static void
2011kseg_init(struct kse_group *kseg)
2012{
2013	kseg_reinit(kseg);
2014	_lock_init(&kseg->kg_lock, LCK_ADAPTIVE, _kse_lock_wait,
2015	    _kse_lock_wakeup);
2016}
2017
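/*
 * Reset a KSE group to its empty state: no KSEs, no threads, and an
 * empty wait queue.
 */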
2018static void
2019kseg_reinit(struct kse_group *kseg)
2020{
2021	TAILQ_INIT(&kseg->kg_kseq);
2022	TAILQ_INIT(&kseg->kg_threadq);
2023	TAILQ_INIT(&kseg->kg_schedq.sq_waitq);
2024	kseg->kg_threadcount = 0;
2025	kseg->kg_ksecount = 0;
2026	kseg->kg_idle_kses = 0;
2027	kseg->kg_flags = 0;
2028}
2029
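/*
 * Allocate a thread structure, preferring the cache of free threads.
 * If none are cached, malloc() a new one with enough padding to align
 * it to THR_ALIGNBYTES, remembering the unaligned address for free().
 * The garbage collector is run first if it has work pending.
 */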
2030struct pthread *
2031_thr_alloc(struct pthread *curthread)
2032{
2033	kse_critical_t crit;
2034	void *p;
2035	struct pthread *thread = NULL;
2036
2037	if (curthread != NULL) {
2038		if (GC_NEEDED())
2039			_thr_gc(curthread);
2040		if (free_thread_count > 0) {
2041			crit = _kse_critical_enter();
2042			KSE_LOCK_ACQUIRE(curthread->kse, &thread_lock);
2043			if ((thread = TAILQ_FIRST(&free_threadq)) != NULL) {
2044				TAILQ_REMOVE(&free_threadq, thread, tle);
2045				free_thread_count--;
2046			}
2047			KSE_LOCK_RELEASE(curthread->kse, &thread_lock);
2048			_kse_critical_leave(crit);
2049		}
2050	}
2051	if (thread == NULL) {
2052		p = malloc(sizeof(struct pthread) + THR_ALIGNBYTES);
2053		if (p != NULL) {
2054			thread = (struct pthread *)THR_ALIGN(p);
2055			thread->alloc_addr = p;
2056		}
2057	}
2058	return (thread);
2059}
2060
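/*
 * Free a thread structure.  Up to MAX_CACHED_THREADS threads are kept
 * on the free list for reuse; beyond that limit, or during library
 * initialization, the thread's locks are destroyed and its (unaligned)
 * allocation address is returned to the heap.
 */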
2061void
2062_thr_free(struct pthread *curthread, struct pthread *thread)
2063{
2064	kse_critical_t crit;
2065	int i;
2066
2067	DBG_MSG("Freeing thread %p\n", thread);
2068	if ((curthread == NULL) || (free_thread_count >= MAX_CACHED_THREADS)) {
2069		for (i = 0; i < MAX_THR_LOCKLEVEL; i++) {
2070			_lockuser_destroy(&thread->lockusers[i]);
2071		}
2072		_lock_destroy(&thread->lock);
2073		free(thread->alloc_addr);
2074	}
2075	else {
2076		crit = _kse_critical_enter();
2077		KSE_LOCK_ACQUIRE(curthread->kse, &thread_lock);
2078		TAILQ_INSERT_HEAD(&free_threadq, thread, tle);
2079		free_thread_count++;
2080		KSE_LOCK_RELEASE(curthread->kse, &thread_lock);
2081		_kse_critical_leave(crit);
2082	}
2083}
2084