thr_kern.c revision 117706
1/*
2 * Copyright (C) 2003 Daniel M. Eischen <deischen@freebsd.org>
3 * Copyright (C) 2002 Jonathon Mini <mini@freebsd.org>
4 * Copyright (c) 1995-1998 John Birrell <jb@cimlogic.com.au>
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 * 3. All advertising materials mentioning features or use of this software
16 *    must display the following acknowledgement:
17 *	This product includes software developed by John Birrell.
18 * 4. Neither the name of the author nor the names of any co-contributors
19 *    may be used to endorse or promote products derived from this software
20 *    without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY JOHN BIRRELL AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 */
35#include <sys/cdefs.h>
36__FBSDID("$FreeBSD: head/lib/libkse/thread/thr_kern.c 117706 2003-07-17 23:02:30Z davidxu $");
37
38#include <sys/types.h>
39#include <sys/kse.h>
40#include <sys/signalvar.h>
41#include <sys/queue.h>
42#include <machine/atomic.h>
43#include <machine/sigframe.h>
44
45#include <assert.h>
46#include <errno.h>
47#include <signal.h>
48#include <stdlib.h>
49#include <string.h>
50#include <time.h>
51#include <ucontext.h>
52#include <unistd.h>
53
54#include "atomic_ops.h"
55#include "thr_private.h"
56#include "libc_private.h"
57#include "ksd.h"
58
59/*#define DEBUG_THREAD_KERN */
60#ifdef DEBUG_THREAD_KERN
61#define DBG_MSG		stdout_debug
62#else
63#define DBG_MSG(x...)
64#endif
65
66/*
67 * Define a high water mark for the maximum number of threads that
68 * will be cached.  Once this level is reached, any extra threads
69 * will be free()'d.
70 *
71 * XXX - It doesn't make sense to worry about the maximum number of
72 *       KSEs that we can cache because the system will limit us to
73 *       something *much* less than the maximum number of threads
74 *       that we can have.  Disregarding KSEs in their own group,
75 *       the maximum number of KSEs is the number of processors in
76 *       the system.
77 */
78#define	MAX_CACHED_THREADS	100
79#define	KSE_STACKSIZE		16384
80
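/*
 * Illustrative sketch only, not the library's code path: the thread
 * free routines that use free_threadq honor the high water mark
 * roughly like this, caching an exiting thread's structure until the
 * limit is reached and plain free()'ing it afterwards:
 *
 *	if (free_thread_count < MAX_CACHED_THREADS) {
 *		TAILQ_INSERT_HEAD(&free_threadq, thread, tle);
 *		free_thread_count++;
 *	} else
 *		free(thread);
 */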
81#define	KSE_SET_MBOX(kse, thrd) \
82	(kse)->k_mbx.km_curthread = &(thrd)->tmbx
83
84#define	KSE_SET_EXITED(kse)	(kse)->k_flags |= KF_EXITED
85
86/*
87 * Macros for manipulating the run queues.  The priority queue
88 * routines use the thread's pqe link and also handle the setting
89 * and clearing of the thread's THR_FLAGS_IN_RUNQ flag.
90 */
91#define	KSE_RUNQ_INSERT_HEAD(kse, thrd)			\
92	_pq_insert_head(&(kse)->k_schedq->sq_runq, thrd)
93#define	KSE_RUNQ_INSERT_TAIL(kse, thrd)			\
94	_pq_insert_tail(&(kse)->k_schedq->sq_runq, thrd)
95#define	KSE_RUNQ_REMOVE(kse, thrd)			\
96	_pq_remove(&(kse)->k_schedq->sq_runq, thrd)
97#define	KSE_RUNQ_FIRST(kse)	_pq_first(&(kse)->k_schedq->sq_runq)
98
99#define KSE_RUNQ_THREADS(kse)	((kse)->k_schedq->sq_runq.pq_threads)
100
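/*
 * For illustration (condensed from kse_sched_multi() and
 * kse_switchout_thread() below), a scheduling pass uses these macros
 * roughly as follows:
 *
 *	td = KSE_RUNQ_FIRST(curkse);	(highest priority runnable thread)
 *	KSE_RUNQ_REMOVE(curkse, td);
 *	... run td until it yields, blocks, or is switched out ...
 *	if (td exhausted its time slice)
 *		KSE_RUNQ_INSERT_TAIL(curkse, td);
 *	else
 *		KSE_RUNQ_INSERT_HEAD(curkse, td);
 */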
101/*
102 * We've got to keep track of everything that is allocated, not only
103 * to have a speedy free list, but also so they can be deallocated
104 * after a fork().
105 */
106static TAILQ_HEAD(, kse)	active_kseq;
107static TAILQ_HEAD(, kse)	free_kseq;
108static TAILQ_HEAD(, kse_group)	free_kse_groupq;
109static TAILQ_HEAD(, kse_group)	active_kse_groupq;
110static TAILQ_HEAD(, kse_group)	gc_ksegq;
111static struct lock		kse_lock;	/* also used for kseg queue */
112static int			free_kse_count = 0;
113static int			free_kseg_count = 0;
114static TAILQ_HEAD(, pthread)	free_threadq;
115static struct lock		thread_lock;
116static int			free_thread_count = 0;
117static int			inited = 0;
118static int			active_threads = 1;
119static int			active_kse_count = 0;
120static int			active_kseg_count = 0;
121static u_int64_t		next_uniqueid = 1;
122
123LIST_HEAD(thread_hash_head, pthread);
124#define THREAD_HASH_QUEUES	127
125static struct thread_hash_head	thr_hashtable[THREAD_HASH_QUEUES];
126#define	THREAD_HASH(thrd)	((unsigned long)thrd % THREAD_HASH_QUEUES)
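/*
 * For example (a sketch; thr_link()/thr_unlink() below are presumed
 * to do the equivalent), the hash bucket for a thread is found with:
 *
 *	struct thread_hash_head *head;
 *
 *	head = &thr_hashtable[THREAD_HASH(thread)];
 */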
127
128#ifdef DEBUG_THREAD_KERN
129static void	dump_queues(struct kse *curkse);
130#endif
131static void	kse_check_completed(struct kse *kse);
132static void	kse_check_waitq(struct kse *kse);
133static void	kse_fini(struct kse *curkse);
134static void	kse_reinit(struct kse *kse, int sys_scope);
135static void	kse_sched_multi(struct kse *curkse);
136static void	kse_sched_single(struct kse *curkse);
137static void	kse_switchout_thread(struct kse *kse, struct pthread *thread);
138static void	kse_wait(struct kse *kse, struct pthread *td_wait, int sigseq);
139static void	kse_free_unlocked(struct kse *kse);
140static void	kseg_free_unlocked(struct kse_group *kseg);
141static void	kseg_init(struct kse_group *kseg);
142static void	kseg_reinit(struct kse_group *kseg);
143static void	kse_waitq_insert(struct pthread *thread);
144static void	kse_wakeup_multi(struct kse *curkse);
145static void	kse_wakeup_one(struct pthread *thread);
146static void	thr_cleanup(struct kse *kse, struct pthread *curthread);
147static void	thr_link(struct pthread *thread);
148static void	thr_resume_wrapper(int sig, siginfo_t *, ucontext_t *);
149static void	thr_resume_check(struct pthread *curthread, ucontext_t *ucp,
150		    struct pthread_sigframe *psf);
151static int	thr_timedout(struct pthread *thread, struct timespec *curtime);
152static void	thr_unlink(struct pthread *thread);
153
154
155/*
156 * This is called after a fork().
157 * No locks need to be taken here since we are guaranteed to be
158 * single threaded.
159 *
160 * XXX
161 * POSIX says that in a threaded process, fork() should be used
162 * only to run a new program, and the effects of calling functions
163 * that require certain resources between the call to fork() and
164 * the call to an exec function are undefined.
165 *
166 * It is therefore not safe to reinitialize the library after a
167 * fork(): the memory allocator may be corrupted, so further calls
168 * to malloc()/free() may cause undefined behavior.
169 */
170void
171_kse_single_thread(struct pthread *curthread)
172{
173#ifdef NOTYET
174	struct kse *kse;
175	struct kse_group *kseg;
176	struct pthread *thread;
177	kse_critical_t crit;
178	int i;
179
180
181	/*
182	 * Disable upcalls and clear the threaded flag.
183	 * XXX - I don't think we need to disable upcalls after a fork(),
184	 *       but it doesn't hurt.
185	 */
186	crit = _kse_critical_enter();
187	__isthreaded = 0;
188	active_threads = 1;
189	_thr_signal_deinit();
190
191	/*
192	 * Enter a loop to remove and free all threads other than
193	 * the running thread from the active thread list:
194	 */
195	while ((thread = TAILQ_FIRST(&_thread_list)) != NULL) {
196		THR_GCLIST_REMOVE(thread);
197		/*
198		 * Remove this thread from the list (the current
199		 * thread will be removed but re-added by libpthread
200		 * initialization.
201		 * initialization).
202		TAILQ_REMOVE(&_thread_list, thread, tle);
203		/* Make sure this isn't the running thread: */
204		if (thread != curthread) {
205			_thr_stack_free(&thread->attr);
206			if (thread->specific != NULL)
207				free(thread->specific);
208			for (i = 0; i < MAX_THR_LOCKLEVEL; i++) {
209				_lockuser_destroy(&thread->lockusers[i]);
210			}
211			_lock_destroy(&thread->lock);
212			free(thread);
213		}
214	}
215
216	TAILQ_INIT(&curthread->mutexq);		/* initialize mutex queue */
217	curthread->joiner = NULL;		/* no joining threads yet */
218	curthread->refcount = 0;
219	SIGEMPTYSET(curthread->sigpend);	/* clear pending signals */
220	if (curthread->specific != NULL) {
221		free(curthread->specific);
222		curthread->specific = NULL;
223		curthread->specific_data_count = 0;
224	}
225
226	/* Free the free KSEs: */
227	while ((kse = TAILQ_FIRST(&free_kseq)) != NULL) {
228		TAILQ_REMOVE(&free_kseq, kse, k_qe);
229		for (i = 0; i < MAX_KSE_LOCKLEVEL; i++) {
230			_lockuser_destroy(&kse->k_lockusers[i]);
231		}
232		_lock_destroy(&kse->k_lock);
233		_ksd_destroy(&kse->k_ksd);
234		if (kse->k_stack.ss_sp != NULL)
235			free(kse->k_stack.ss_sp);
236		free(kse);
237	}
238	free_kse_count = 0;
239
240	/* Free the active KSEs: */
241	while ((kse = TAILQ_FIRST(&active_kseq)) != NULL) {
242		TAILQ_REMOVE(&active_kseq, kse, k_qe);
243		for (i = 0; i < MAX_KSE_LOCKLEVEL; i++) {
244			_lockuser_destroy(&kse->k_lockusers[i]);
245		}
246		_lock_destroy(&kse->k_lock);
247		if (kse->k_stack.ss_sp != NULL)
248			free(kse->k_stack.ss_sp);
249		free(kse);
250	}
251	active_kse_count = 0;
252
253	/* Free the free KSEGs: */
254	while ((kseg = TAILQ_FIRST(&free_kse_groupq)) != NULL) {
255		TAILQ_REMOVE(&free_kse_groupq, kseg, kg_qe);
256		_lock_destroy(&kseg->kg_lock);
257		_pq_free(&kseg->kg_schedq.sq_runq);
258		free(kseg);
259	}
260	free_kseg_count = 0;
261
262	/* Free the active KSEGs: */
263	while ((kseg = TAILQ_FIRST(&active_kse_groupq)) != NULL) {
264		TAILQ_REMOVE(&active_kse_groupq, kseg, kg_qe);
265		_lock_destroy(&kseg->kg_lock);
266		_pq_free(&kseg->kg_schedq.sq_runq);
267		free(kseg);
268	}
269	active_kseg_count = 0;
270
271	/* Free the free threads. */
272	while ((thread = TAILQ_FIRST(&free_threadq)) != NULL) {
273		TAILQ_REMOVE(&free_threadq, thread, tle);
274		if (thread->specific != NULL)
275			free(thread->specific);
276		for (i = 0; i < MAX_THR_LOCKLEVEL; i++) {
277			_lockuser_destroy(&thread->lockusers[i]);
278		}
279		_lock_destroy(&thread->lock);
280		free(thread);
281	}
282	free_thread_count = 0;
283
284	/* Free the to-be-gc'd threads. */
285	while ((thread = TAILQ_FIRST(&_thread_gc_list)) != NULL) {
286		TAILQ_REMOVE(&_thread_gc_list, thread, gcle);
287		for (i = 0; i < MAX_THR_LOCKLEVEL; i++) {
288			_lockuser_destroy(&thread->lockusers[i]);
289		}
290		_lock_destroy(&thread->lock);
291		free(thread);
292	}
293	TAILQ_INIT(&gc_ksegq);
294	_gc_count = 0;
295
296	if (inited != 0) {
297		/*
298		 * Destroy these locks; they'll be recreated to assure they
299		 * are in the unlocked state.
300		 */
301		_lock_destroy(&kse_lock);
302		_lock_destroy(&thread_lock);
303		_lock_destroy(&_thread_list_lock);
304		inited = 0;
305	}
306
307	/*
308	 * After a fork(), the leftover thread goes back to being
309	 * scope process.
310	 */
311	curthread->attr.flags &= ~PTHREAD_SCOPE_SYSTEM;
312	curthread->attr.flags |= PTHREAD_SCOPE_PROCESS;
313
314	/*
315	 * After a fork, we are still operating on the thread's original
316	 * stack.  Don't clear the THR_FLAGS_USER from the thread's
317	 * attribute flags.
318	 */
319
320	/* Initialize the threads library. */
321	curthread->kse = NULL;
322	curthread->kseg = NULL;
323	_kse_initial = NULL;
324	_libpthread_init(curthread);
325#else
326	if (__isthreaded)
327		_thr_signal_deinit();
328	_ksd_readandclear_tmbx();
329	__isthreaded   = 0;
330	active_threads = 0;
331#endif
332}
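/*
 * Illustrative note, not library code: given the POSIX restriction
 * described above, the only portable thing for a threaded process to
 * do after fork() is to exec a new program right away, e.g.:
 *
 *	pid_t pid = fork();
 *
 *	if (pid == 0) {
 *		execl("/bin/sh", "sh", "-c", cmd, (char *)NULL);
 *		_exit(127);
 *	}
 *
 * where cmd is whatever command the child should run.  Anything more
 * elaborate between fork() and exec may depend on state (such as the
 * malloc arena) that this function cannot safely reinitialize.
 */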
333
334/*
335 * This is used to initialize housekeeping and to initialize the
336 * KSD for the KSE.
337 */
338void
339_kse_init(void)
340{
341	if (inited == 0) {
342		TAILQ_INIT(&active_kseq);
343		TAILQ_INIT(&active_kse_groupq);
344		TAILQ_INIT(&free_kseq);
345		TAILQ_INIT(&free_kse_groupq);
346		TAILQ_INIT(&free_threadq);
347		TAILQ_INIT(&gc_ksegq);
348		if (_lock_init(&kse_lock, LCK_ADAPTIVE,
349		    _kse_lock_wait, _kse_lock_wakeup) != 0)
350			PANIC("Unable to initialize free KSE queue lock");
351		if (_lock_init(&thread_lock, LCK_ADAPTIVE,
352		    _kse_lock_wait, _kse_lock_wakeup) != 0)
353			PANIC("Unable to initialize free thread queue lock");
354		if (_lock_init(&_thread_list_lock, LCK_ADAPTIVE,
355		    _kse_lock_wait, _kse_lock_wakeup) != 0)
356			PANIC("Unable to initialize thread list lock");
357		active_kse_count = 0;
358		active_kseg_count = 0;
359		_gc_count = 0;
360		inited = 1;
361	}
362}
363
364int
365_kse_isthreaded(void)
366{
367	return (__isthreaded != 0);
368}
369
370/*
371 * This is called when the first thread (other than the initial
372 * thread) is created.
373 */
374int
375_kse_setthreaded(int threaded)
376{
377	if ((threaded != 0) && (__isthreaded == 0)) {
378		/*
379		 * Locking functions in libc are required when there are
380		 * threads other than the initial thread.
381		 */
382		__isthreaded = 1;
383
384		/*
385		 * Tell the kernel to create a KSE for the initial thread
386		 * and enable upcalls in it.
387		 */
388		_thr_signal_init();
389		_kse_initial->k_flags |= KF_STARTED;
390
391#ifdef SYSTEM_SCOPE_ONLY
392		/*
393		 * For a bound thread the kernel reads the mailbox pointer
394		 * only once, so we set it here before calling kse_create().
395		 */
396		KSE_SET_MBOX(_kse_initial, _thr_initial);
397		_kse_initial->k_mbx.km_flags |= KMF_BOUND;
398#endif
399
400		if (kse_create(&_kse_initial->k_mbx, 0) != 0) {
401			_kse_initial->k_flags &= ~KF_STARTED;
402			__isthreaded = 0;
403			PANIC("kse_create() failed\n");
404			return (-1);
405		}
406
407#ifndef SYSTEM_SCOPE_ONLY
408		/* Set current thread to initial thread */
409		KSE_SET_MBOX(_kse_initial, _thr_initial);
410		_thr_start_sig_daemon();
411		_thr_setmaxconcurrency();
412#endif
413
414	}
415	return (0);
416}
417
418/*
419 * Lock wait and wakeup handlers for KSE locks.  These are only used by
420 * KSEs, and should never be used by threads.  KSE locks include the
421 * KSE group lock (used for locking the scheduling queue) and the
422 * kse_lock defined above.
423 *
424 * When a KSE lock attempt blocks, the entire KSE blocks allowing another
425 * KSE to run.  For the most part, it doesn't make much sense to try and
426 * schedule another thread because you need to lock the scheduling queue
427 * in order to do that.  And since the KSE lock is used to lock the scheduling
428 * queue, you would just end up blocking again.
429 */
430void
431_kse_lock_wait(struct lock *lock, struct lockuser *lu)
432{
433	struct kse *curkse = (struct kse *)_LCK_GET_PRIVATE(lu);
434	struct timespec ts;
435	int saved_flags;
436
437	if (curkse->k_mbx.km_curthread != NULL)
438		PANIC("kse_lock_wait does not disable upcall.\n");
439	/*
440	 * Enter a loop to wait until we get the lock.
441	 */
442	ts.tv_sec = 0;
443	ts.tv_nsec = 1000000;  /* 1 msec */
444	while (!_LCK_GRANTED(lu)) {
445		/*
446		 * Yield the kse and wait to be notified when the lock
447		 * is granted.
448		 */
449		saved_flags = curkse->k_mbx.km_flags;
450		curkse->k_mbx.km_flags |= KMF_NOUPCALL | KMF_NOCOMPLETED;
451		kse_release(&ts);
452		curkse->k_mbx.km_flags = saved_flags;
453	}
454}
455
456void
457_kse_lock_wakeup(struct lock *lock, struct lockuser *lu)
458{
459	struct kse *curkse;
460	struct kse *kse;
461	struct kse_mailbox *mbx;
462
463	curkse = _get_curkse();
464	kse = (struct kse *)_LCK_GET_PRIVATE(lu);
465
466	if (kse == curkse)
467		PANIC("KSE trying to wake itself up in lock");
468	else {
469		mbx = &kse->k_mbx;
470		_lock_grant(lock, lu);
471		/*
472		 * Notify the owning kse that it has the lock.
473		 * It is safe to pass an invalid address to kse_wakeup()
474		 * even if the mailbox is not in the kernel at all, and
475		 * waking up the wrong KSE is also harmless.
476		 */
477		kse_wakeup(mbx);
478	}
479}
480
481/*
482 * Thread wait and wakeup handlers for thread locks.  These are only used
483 * by threads, never by KSEs.  Thread locks include the per-thread lock
484 * (defined in its structure), and condition variable and mutex locks.
485 */
486void
487_thr_lock_wait(struct lock *lock, struct lockuser *lu)
488{
489	struct pthread *curthread = (struct pthread *)lu->lu_private;
490
491	do {
492		THR_SCHED_LOCK(curthread, curthread);
493		THR_SET_STATE(curthread, PS_LOCKWAIT);
494		THR_SCHED_UNLOCK(curthread, curthread);
495		_thr_sched_switch(curthread);
496	} while (!_LCK_GRANTED(lu));
497}
498
499void
500_thr_lock_wakeup(struct lock *lock, struct lockuser *lu)
501{
502	struct pthread *thread;
503	struct pthread *curthread;
504
505	curthread = _get_curthread();
506	thread = (struct pthread *)_LCK_GET_PRIVATE(lu);
507
508	THR_SCHED_LOCK(curthread, thread);
509	_lock_grant(lock, lu);
510	_thr_setrunnable_unlocked(thread);
511	THR_SCHED_UNLOCK(curthread, thread);
512}
513
514kse_critical_t
515_kse_critical_enter(void)
516{
517	kse_critical_t crit;
518
519	crit = _ksd_readandclear_tmbx();
520	return (crit);
521}
522
523void
524_kse_critical_leave(kse_critical_t crit)
525{
526	struct pthread *curthread;
527
528	_ksd_set_tmbx(crit);
529	if ((crit != NULL) && ((curthread = _get_curthread()) != NULL))
530		THR_YIELD_CHECK(curthread);
531}
532
533int
534_kse_in_critical(void)
535{
536	return (_ksd_get_tmbx() == NULL);
537}
538
539void
540_thr_critical_enter(struct pthread *thread)
541{
542	thread->critical_count++;
543}
544
545void
546_thr_critical_leave(struct pthread *thread)
547{
548	thread->critical_count--;
549	THR_YIELD_CHECK(thread);
550}
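/*
 * Example usage, mirroring the pattern used by _thr_gc() and others
 * below (illustrative only): code that must touch data protected by a
 * KSE lock first enters a critical region so it cannot be interrupted
 * by an upcall, then takes the lock:
 *
 *	kse_critical_t crit;
 *
 *	crit = _kse_critical_enter();
 *	KSE_LOCK_ACQUIRE(curthread->kse, &_thread_list_lock);
 *	... walk or modify the thread list ...
 *	KSE_LOCK_RELEASE(curthread->kse, &_thread_list_lock);
 *	_kse_critical_leave(crit);
 */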
551
552void
553_thr_sched_switch(struct pthread *curthread)
554{
555	struct kse *curkse;
556
557	(void)_kse_critical_enter();
558	curkse = _get_curkse();
559	KSE_SCHED_LOCK(curkse, curkse->k_kseg);
560	_thr_sched_switch_unlocked(curthread);
561}
562
563/*
564 * XXX - We may need to take the scheduling lock before calling
565 *       this, or perhaps take the lock within here before
566 *       doing anything else.
567 */
568void
569_thr_sched_switch_unlocked(struct pthread *curthread)
570{
571	struct pthread *td;
572	struct pthread_sigframe psf;
573	struct kse *curkse;
574	int ret;
575	volatile int uts_once;
576	volatile int resume_once = 0;
577	ucontext_t uc;
578
579	/* We're in the scheduler, 5 by 5: */
580	curkse = _get_curkse();
581
582	curthread->need_switchout = 1;	/* The thread yielded on its own. */
583	curthread->critical_yield = 0;	/* No need to yield anymore. */
584	curthread->slice_usec = -1;	/* Restart the time slice. */
585
586	/* Thread can unlock the scheduler lock. */
587	curthread->lock_switch = 1;
588
589	/*
590	 * The signal frame is allocated off the stack because
591	 * a thread can be interrupted by other signals while
592	 * it is running down pending signals.
593	 */
594	psf.psf_valid = 0;
595	curthread->curframe = &psf;
596
597	/*
598	 * Enter the scheduler if any one of the following is true:
599	 *
600	 *   o The current thread is dead; its stack needs to be
601	 *     cleaned up and that can't be done while operating on
602	 *     it.
603	 *   o The current thread has signals pending; the scheduler
604	 *     should install the signal trampoline for us.
605	 *   o There are no runnable threads.
606	 *   o The next thread to run won't unlock the scheduler
607	 *     lock.  A side note: the current thread may be run
608	 *     instead of the next thread in the run queue, but
609	 *     we don't bother checking for that.
610	 */
611	if (curthread->attr.flags & PTHREAD_SCOPE_SYSTEM)
612		kse_sched_single(curkse);
613	else if ((curthread->state == PS_DEAD) ||
614	    (((td = KSE_RUNQ_FIRST(curkse)) == NULL) &&
615	    (curthread->state != PS_RUNNING)) ||
616	    ((td != NULL) && (td->lock_switch == 0))) {
617		curkse->k_switch = 1;
618		_thread_enter_uts(&curthread->tmbx, &curkse->k_mbx);
619	}
620	else {
621		uts_once = 0;
622		THR_GETCONTEXT(&curthread->tmbx.tm_context);
623		if (uts_once == 0) {
624			uts_once = 1;
625
626			/* Switchout the current thread. */
627			kse_switchout_thread(curkse, curthread);
628
629		 	/* Choose another thread to run. */
630			td = KSE_RUNQ_FIRST(curkse);
631			KSE_RUNQ_REMOVE(curkse, td);
632			curkse->k_curthread = td;
633
634			/*
635			 * Make sure the current thread's kse points to
636			 * this kse.
637			 */
638			td->kse = curkse;
639
640			/*
641			 * Reset accounting.
642			 */
643			td->tmbx.tm_uticks = 0;
644			td->tmbx.tm_sticks = 0;
645
646			/*
647			 * Reset the time slice if this thread is running
648			 * for the first time or running again after using
649			 * its full time slice allocation.
650			 */
651			if (td->slice_usec == -1)
652				td->slice_usec = 0;
653
654			/* Mark the thread active. */
655			td->active = 1;
656
657			/* Remove the frame reference. */
658			td->curframe = NULL;
659
660			/*
661			 * Continue the thread at its current frame:
662			 */
663			ret = _thread_switch(&td->tmbx, NULL);
664			/* This point should not be reached. */
665			if (ret != 0)
666				PANIC("Bad return from _thread_switch");
667			PANIC("Thread has returned from _thread_switch");
668		}
669	}
670
671	if (psf.psf_valid) {
672		/*
673		 * It is ugly that we must bump the critical count, but with
674		 * a frame saved we must back out the state in psf before
675		 * we can process signals.
676 		 */
677		curthread->critical_count++;
678	}
679
680	if (curthread->lock_switch != 0) {
681		/*
682		 * Unlock the scheduling queue and leave the
683		 * critical region.
684		 */
685		/* Don't trust this after a switch! */
686		curkse = _get_curkse();
687
688		curthread->lock_switch = 0;
689		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
690		_kse_critical_leave(&curthread->tmbx);
691	}
692	/*
693	 * This thread is being resumed; check for cancellations.
694	 */
695	if ((psf.psf_valid ||
696	    (curthread->check_pending && !THR_IN_CRITICAL(curthread)))) {
697		resume_once = 0;
698		THR_GETCONTEXT(&uc);
699		if (resume_once == 0) {
700			resume_once = 1;
701			curthread->check_pending = 0;
702			thr_resume_check(curthread, &uc, &psf);
703		}
704	}
705	THR_ACTIVATE_LAST_LOCK(curthread);
706}
707
708/*
709 * This is the scheduler for a KSE which runs a scope system thread.
710 * The multi-thread KSE scheduler should also work for a single threaded
711 * KSE, but we use a separate scheduler so that it can be fine-tuned
712 * to be more efficient (and perhaps not need a separate stack for
713 * the KSE, allowing it to use the thread's stack).
714 */
715
716static void
717kse_sched_single(struct kse *curkse)
718{
719	struct pthread *curthread = curkse->k_curthread;
720	struct timespec ts;
721	sigset_t sigmask;
722	int i, sigseqno, level, first = 0;
723
724	if ((curkse->k_flags & KF_INITIALIZED) == 0) {
725		/* Set up this KSE's specific data. */
726		_ksd_setprivate(&curkse->k_ksd);
727		_set_curkse(curkse);
728		curkse->k_flags |= KF_INITIALIZED;
729		first = 1;
730		curthread->active = 1;
731
732		/* Setup kernel signal masks for new thread. */
733		__sys_sigprocmask(SIG_SETMASK, &curthread->sigmask, NULL);
734		/*
735		 * Enter a critical region.  This is meaningless for a bound
736		 * thread, but it satisfies other code that expects the
737		 * mailbox to be cleared.
738		 */
739		_kse_critical_enter();
740 	}
741
742	curthread->critical_yield = 0;
743	curthread->need_switchout = 0;
744
745	/*
746	 * Lock the scheduling queue.
747	 *
748	 * There is no scheduling queue for single threaded KSEs,
749	 * but we need a lock for protection regardless.
750	 */
751	if (curthread->lock_switch == 0)
752		KSE_SCHED_LOCK(curkse, curkse->k_kseg);
753
754	/*
755	 * This has to do the job of kse_switchout_thread(), only
756	 * for a single threaded KSE/KSEG.
757	 */
758
759	switch (curthread->state) {
760	case PS_DEAD:
761		curthread->check_pending = 0;
762		/* Unlock the scheduling queue and exit the KSE and thread. */
763		thr_cleanup(curkse, curthread);
764		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
765		PANIC("bound thread shouldn't get here\n");
766		break;
767
768	case PS_SIGWAIT:
769		PANIC("bound thread does not have SIGWAIT state\n");
770
771	case PS_SLEEP_WAIT:
772		PANIC("bound thread does not have SLEEP_WAIT state\n");
773
774	case PS_SIGSUSPEND:
775		PANIC("bound thread does not have SIGSUSPEND state\n");
776
777	case PS_COND_WAIT:
778		break;
779
780	case PS_LOCKWAIT:
781		/*
782		 * This state doesn't timeout.
783		 */
784		curthread->wakeup_time.tv_sec = -1;
785		curthread->wakeup_time.tv_nsec = -1;
786		level = curthread->locklevel - 1;
787		if (_LCK_GRANTED(&curthread->lockusers[level]))
788			THR_SET_STATE(curthread, PS_RUNNING);
789		break;
790
791	case PS_RUNNING:
792		if ((curthread->flags & THR_FLAGS_SUSPENDED) != 0) {
793			THR_SET_STATE(curthread, PS_SUSPENDED);
794		}
795		curthread->wakeup_time.tv_sec = -1;
796		curthread->wakeup_time.tv_nsec = -1;
797		break;
798
799	case PS_JOIN:
800	case PS_MUTEX_WAIT:
801	case PS_SUSPENDED:
802	case PS_DEADLOCK:
803	default:
804		/*
805		 * These states don't timeout and don't need
806		 * to be in the waiting queue.
807		 */
808		curthread->wakeup_time.tv_sec = -1;
809		curthread->wakeup_time.tv_nsec = -1;
810		break;
811	}
812
813	while (curthread->state != PS_RUNNING) {
814		sigseqno = curkse->k_sigseqno;
815		if (curthread->check_pending != 0) {
816			/*
817			 * Install pending signals into the frame; this may
818			 * cause a mutex or condvar backout.
819			 */
820			curthread->check_pending = 0;
821			SIGFILLSET(sigmask);
822
823			/*
824			 * Lock out kernel signal code when we are processing
825			 * signals, and get a fresh copy of the signal mask.
826			 */
827			__sys_sigprocmask(SIG_SETMASK, &sigmask,
828					  &curthread->sigmask);
829			for (i = 1; i <= _SIG_MAXSIG; i++) {
830				if (SIGISMEMBER(curthread->sigmask, i))
831					continue;
832				if (SIGISMEMBER(curthread->sigpend, i))
833					_thr_sig_add(curthread, i,
834						&curthread->siginfo[i-1]);
835			}
836			__sys_sigprocmask(SIG_SETMASK, &curthread->sigmask,
837				NULL);
838			/* The above code might make the thread runnable. */
839			if (curthread->state == PS_RUNNING)
840				break;
841		}
842		THR_DEACTIVATE_LAST_LOCK(curthread);
843		kse_wait(curkse, curthread, sigseqno);
844		THR_ACTIVATE_LAST_LOCK(curthread);
845		KSE_GET_TOD(curkse, &ts);
846		if (thr_timedout(curthread, &ts)) {
847			/* Indicate the thread timed out: */
848			curthread->timeout = 1;
849			/* Make the thread runnable. */
850			THR_SET_STATE(curthread, PS_RUNNING);
851		}
852	}
853
854	/* Remove the frame reference. */
855	curthread->curframe = NULL;
856
857	if (curthread->lock_switch == 0) {
858		/* Unlock the scheduling queue. */
859		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
860	}
861
862	DBG_MSG("Continuing bound thread %p\n", curthread);
863	if (first) {
864		_kse_critical_leave(&curthread->tmbx);
865		pthread_exit(curthread->start_routine(curthread->arg));
866	}
867}
868
869#ifdef DEBUG_THREAD_KERN
870static void
871dump_queues(struct kse *curkse)
872{
873	struct pthread *thread;
874
875	DBG_MSG("Threads in waiting queue:\n");
876	TAILQ_FOREACH(thread, &curkse->k_kseg->kg_schedq.sq_waitq, pqe) {
877		DBG_MSG("  thread %p, state %d, blocked %d\n",
878		    thread, thread->state, thread->blocked);
879	}
880}
881#endif
882
883/*
884 * This is the scheduler for a KSE which runs multiple threads.
885 */
886static void
887kse_sched_multi(struct kse *curkse)
888{
889	struct pthread *curthread, *td_wait;
890	struct pthread_sigframe *curframe;
891	int ret;
892
893	THR_ASSERT(curkse->k_mbx.km_curthread == NULL,
894	    "Mailbox not null in kse_sched_multi");
895
896	/* Check for first time initialization: */
897	if ((curkse->k_flags & KF_INITIALIZED) == 0) {
898		/* Set up this KSE's specific data. */
899		_ksd_setprivate(&curkse->k_ksd);
900		_set_curkse(curkse);
901
902		/* Set this before grabbing the context. */
903		curkse->k_flags |= KF_INITIALIZED;
904	}
905
906	/* This may have returned from a kse_release(). */
907	if (KSE_WAITING(curkse)) {
908		DBG_MSG("Entered upcall when KSE is waiting.");
909		KSE_CLEAR_WAIT(curkse);
910	}
911
912	/* If this is an upcall, take the scheduler lock. */
913	if (curkse->k_switch == 0)
914		KSE_SCHED_LOCK(curkse, curkse->k_kseg);
915	curkse->k_switch = 0;
916
917	curthread = curkse->k_curthread;
918
919	if (KSE_IS_IDLE(curkse)) {
920		KSE_CLEAR_IDLE(curkse);
921		curkse->k_kseg->kg_idle_kses--;
922	}
923	/*
924	 * If the current thread was completed in another KSE, then
925	 * it will be in the run queue.  Don't mark it as being blocked.
926	 */
927	if ((curthread != NULL) &&
928	    ((curthread->flags & THR_FLAGS_IN_RUNQ) == 0) &&
929	    (curthread->need_switchout == 0)) {
930		/*
931		 * Assume the current thread is blocked; when the
932		 * completed threads are checked and if the current
933		 * thread is among the completed, the blocked flag
934		 * will be cleared.
935		 */
936		curthread->blocked = 1;
937	}
938
939	/* Check for any unblocked threads in the kernel. */
940	kse_check_completed(curkse);
941
942	/*
943	 * Check for threads that have timed-out.
944	 */
945	kse_check_waitq(curkse);
946
947	/*
948	 * Switchout the current thread, if necessary, as the last step
949	 * so that it is inserted into the run queue (if it's runnable)
950	 * _after_ any other threads that were added to it above.
951	 */
952	if (curthread == NULL)
953		;  /* Nothing to do here. */
954	else if ((curthread->need_switchout == 0) &&
955	    (curthread->blocked == 0) && (THR_IN_CRITICAL(curthread))) {
956		/*
957		 * Resume the thread and tell it to yield when
958		 * it leaves the critical region.
959		 */
960		curthread->critical_yield = 1;
961		curthread->active = 1;
962		if ((curthread->flags & THR_FLAGS_IN_RUNQ) != 0)
963			KSE_RUNQ_REMOVE(curkse, curthread);
964		curkse->k_curthread = curthread;
965		curthread->kse = curkse;
966		DBG_MSG("Continuing thread %p in critical region\n",
967		    curthread);
968		kse_wakeup_multi(curkse);
969		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
970		ret = _thread_switch(&curthread->tmbx,
971		    &curkse->k_mbx.km_curthread);
972		if (ret != 0)
973			PANIC("Can't resume thread in critical region\n");
974	}
975	else if ((curthread->flags & THR_FLAGS_IN_RUNQ) == 0)
976		kse_switchout_thread(curkse, curthread);
977	curkse->k_curthread = NULL;
978
979	kse_wakeup_multi(curkse);
980
981#ifdef DEBUG_THREAD_KERN
982	dump_queues(curkse);
983#endif
984
985	/* Check if there are no threads ready to run: */
986	while (((curthread = KSE_RUNQ_FIRST(curkse)) == NULL) &&
987	    (curkse->k_kseg->kg_threadcount != 0)) {
988		/*
989		 * Wait for a thread to become active or until there are
990		 * no more threads.
991		 */
992		td_wait = KSE_WAITQ_FIRST(curkse);
993		kse_wait(curkse, td_wait, 0);
994		kse_check_completed(curkse);
995		kse_check_waitq(curkse);
996	}
997
998	/* Check for no more threads: */
999	if (curkse->k_kseg->kg_threadcount == 0) {
1000		/*
1001		 * Normally this shouldn't return, but it will if there
1002		 * are other KSEs running that create new threads that
1003		 * are assigned to this KSE[G].  For instance, if a scope
1004		 * system thread were to create a scope process thread
1005		 * and this kse[g] is the initial kse[g], then that newly
1006		 * created thread would be assigned to us (the initial
1007		 * kse[g]).
1008		 */
1009		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
1010		kse_fini(curkse);
1011		/* never returns */
1012	}
1013
1014	THR_ASSERT(curthread != NULL,
1015	    "Return from kse_wait/fini without thread.");
1016	THR_ASSERT(curthread->state != PS_DEAD,
1017	    "Trying to resume dead thread!");
1018	KSE_RUNQ_REMOVE(curkse, curthread);
1019
1020	/*
1021	 * Make the selected thread the current thread.
1022	 */
1023	curkse->k_curthread = curthread;
1024
1025	/*
1026	 * Make sure the current thread's kse points to this kse.
1027	 */
1028	curthread->kse = curkse;
1029
1030	/*
1031	 * Reset accounting.
1032	 */
1033	curthread->tmbx.tm_uticks = 0;
1034	curthread->tmbx.tm_sticks = 0;
1035
1036	/*
1037	 * Reset the time slice if this thread is running for the first
1038	 * time or running again after using its full time slice allocation.
1039	 */
1040	if (curthread->slice_usec == -1)
1041		curthread->slice_usec = 0;
1042
1043	/* Mark the thread active. */
1044	curthread->active = 1;
1045
1046	/* Remove the frame reference. */
1047	curframe = curthread->curframe;
1048	curthread->curframe = NULL;
1049
1050	kse_wakeup_multi(curkse);
1051
1052	/*
1053	 * The thread's current signal frame will only be NULL if it
1054	 * is being resumed after being blocked in the kernel.  In
1055	 * this case, and if the thread needs to run down pending
1056	 * signals or needs a cancellation check, we need to add a
1057	 * signal frame to the thread's context.
1058	 */
1059#ifdef NOT_YET
1060	if ((((curframe == NULL) && (curthread->check_pending != 0)) ||
1061	    (((curthread->cancelflags & THR_AT_CANCEL_POINT) == 0) &&
1062	     ((curthread->cancelflags & PTHREAD_CANCEL_ASYNCHRONOUS) != 0))) &&
1063	     !THR_IN_CRITICAL(curthread))
1064		signalcontext(&curthread->tmbx.tm_context, 0,
1065		    (__sighandler_t *)thr_resume_wrapper);
1066#else
1067	if ((curframe == NULL) && (curthread->state == PS_RUNNING) &&
1068	    (curthread->check_pending != 0) && !THR_IN_CRITICAL(curthread)) {
1069		curthread->check_pending = 0;
1070		signalcontext(&curthread->tmbx.tm_context, 0,
1071		    (__sighandler_t *)thr_resume_wrapper);
1072	}
1073#endif
1074	/*
1075	 * Continue the thread at its current frame:
1076	 */
1077	if (curthread->lock_switch != 0) {
1078		/*
1079		 * This thread came from a scheduler switch; it will
1080		 * unlock the scheduler lock and set the mailbox.
1081		 */
1082		ret = _thread_switch(&curthread->tmbx, NULL);
1083	} else {
1084		/* This thread won't unlock the scheduler lock. */
1085		KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
1086		ret = _thread_switch(&curthread->tmbx,
1087		    &curkse->k_mbx.km_curthread);
1088	}
1089	if (ret != 0)
1090		PANIC("Thread has returned from _thread_switch");
1091
1092	/* This point should not be reached. */
1093	PANIC("Thread has returned from _thread_switch");
1094}
1095
1096static void
1097thr_resume_wrapper(int sig, siginfo_t *siginfo, ucontext_t *ucp)
1098{
1099	struct pthread *curthread = _get_curthread();
1100	struct kse *curkse;
1101	int ret, err_save = curthread->error;
1102
1103	DBG_MSG(">>> sig wrapper\n");
1104	if (curthread->lock_switch)
1105		PANIC("thr_resume_wrapper, lock_switch != 0\n");
1106	thr_resume_check(curthread, ucp, NULL);
1107	_kse_critical_enter();
1108	curkse = _get_curkse();
1109	curthread->tmbx.tm_context = *ucp;
1110	curthread->error = err_save;
1111	ret = _thread_switch(&curthread->tmbx, &curkse->k_mbx.km_curthread);
1112	if (ret != 0)
1113		PANIC("thr_resume_wrapper: thread has returned "
1114		      "from _thread_switch");
1115	/* THR_SETCONTEXT(ucp); */ /* doesn't work; why? */
1116}
1117
1118static void
1119thr_resume_check(struct pthread *curthread, ucontext_t *ucp,
1120    struct pthread_sigframe *psf)
1121{
1122	_thr_sig_rundown(curthread, ucp, psf);
1123
1124#ifdef NOT_YET
1125	if (((curthread->cancelflags & THR_AT_CANCEL_POINT) == 0) &&
1126	    ((curthread->cancelflags & PTHREAD_CANCEL_ASYNCHRONOUS) != 0))
1127		pthread_testcancel();
1128#endif
1129}
1130
1131/*
1132 * Clean up a thread.  This must be called with the thread's KSE
1133 * scheduling lock held.  The thread must be a thread from the
1134 * KSE's group.
1135 */
1136static void
1137thr_cleanup(struct kse *curkse, struct pthread *thread)
1138{
1139	struct pthread *joiner;
1140	int sys_scope;
1141
1142	if ((joiner = thread->joiner) != NULL) {
1143		/* Joinee scheduler lock held; joiner won't leave. */
1144		if (joiner->kseg == curkse->k_kseg) {
1145			if (joiner->join_status.thread == thread) {
1146				joiner->join_status.thread = NULL;
1147				joiner->join_status.ret = thread->ret;
1148				_thr_setrunnable_unlocked(joiner);
1149			}
1150		} else {
1151			KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
1152			/* The joiner may have removed itself and exited. */
1153			if (_thr_ref_add(thread, joiner, 0) == 0) {
1154				KSE_SCHED_LOCK(curkse, joiner->kseg);
1155				if (joiner->join_status.thread == thread) {
1156					joiner->join_status.thread = NULL;
1157					joiner->join_status.ret = thread->ret;
1158					_thr_setrunnable_unlocked(joiner);
1159				}
1160				KSE_SCHED_UNLOCK(curkse, joiner->kseg);
1161				_thr_ref_delete(thread, joiner);
1162			}
1163			KSE_SCHED_LOCK(curkse, curkse->k_kseg);
1164		}
1165		thread->attr.flags |= PTHREAD_DETACHED;
1166	}
1167
1168	if (!(sys_scope = (thread->attr.flags & PTHREAD_SCOPE_SYSTEM))) {
1169		/*
1170		 * Remove the thread from the KSEG's list of threads.
1171	 	 */
1172		KSEG_THRQ_REMOVE(thread->kseg, thread);
1173		/*
1174		 * Migrate the thread to the main KSE so that this
1175		 * KSE and KSEG can be cleaned when their last thread
1176		 * exits.
1177		 */
1178		thread->kseg = _kse_initial->k_kseg;
1179		thread->kse = _kse_initial;
1180	}
1181	thread->flags |= THR_FLAGS_GC_SAFE;
1182
1183	/*
1184	 * We can't hold the thread list lock while holding the
1185	 * scheduler lock.
1186	 */
1187	KSE_SCHED_UNLOCK(curkse, curkse->k_kseg);
1188	DBG_MSG("Adding thread %p to GC list\n", thread);
1189	KSE_LOCK_ACQUIRE(curkse, &_thread_list_lock);
1190	THR_GCLIST_ADD(thread);
1191	/* active_threads is protected by the thread list lock. */
1192	active_threads--;
1193#ifdef SYSTEM_SCOPE_ONLY
1194	if (active_threads == 0) {
1195#else
1196	if (active_threads == 1) {
1197#endif
1198		KSE_LOCK_RELEASE(curkse, &_thread_list_lock);
1199		exit(0);
1200        }
1201	KSE_LOCK_RELEASE(curkse, &_thread_list_lock);
1202	if (sys_scope) {
1203		/*
1204		 * A system scope thread is alone in its thread group, so
1205		 * when the thread exits, its KSE and ksegrp should be
1206		 * recycled as well.  The KSE's upcall stack belongs to
1207		 * the thread, so clear it here.
1208		 */
1209		curkse->k_stack.ss_sp = NULL;
1210		curkse->k_stack.ss_size = 0;
1211		kse_exit();
1212		PANIC("kse_exit() failed for system scope thread");
1213	}
1214	KSE_SCHED_LOCK(curkse, curkse->k_kseg);
1215}
1216
1217void
1218_thr_gc(struct pthread *curthread)
1219{
1220	struct pthread *td, *td_next;
1221	kse_critical_t crit;
1222	TAILQ_HEAD(, pthread) worklist;
1223
1224	TAILQ_INIT(&worklist);
1225	crit = _kse_critical_enter();
1226	KSE_LOCK_ACQUIRE(curthread->kse, &_thread_list_lock);
1227
1228	/* Check the threads waiting for GC. */
1229	for (td = TAILQ_FIRST(&_thread_gc_list); td != NULL; td = td_next) {
1230		td_next = TAILQ_NEXT(td, gcle);
1231		if ((td->flags & THR_FLAGS_GC_SAFE) == 0)
1232			continue;
1233		else if (((td->attr.flags & PTHREAD_SCOPE_SYSTEM) != 0) &&
1234		    ((td->kse->k_mbx.km_flags & KMF_DONE) == 0)) {
1235			/*
1236			 * The thread and KSE are operating on the same
1237			 * stack.  Wait for the KSE to exit before freeing
1238			 * the thread's stack as well as everything else.
1239			 */
1240			continue;
1241		}
1242		/*
1243		 * Remove the thread from the GC list.  If the thread
1244		 * isn't yet detached, it will get added back to the
1245		 * GC list at a later time.
1246		 */
1247		THR_GCLIST_REMOVE(td);
1248		DBG_MSG("Freeing thread %p stack\n", td);
1249		/*
1250		 * We can free the thread stack since it's no longer
1251		 * in use.
1252		 */
1253		_thr_stack_free(&td->attr);
1254		if (((td->attr.flags & PTHREAD_DETACHED) != 0) &&
1255		    (td->refcount == 0)) {
1256			/*
1257			 * The thread has detached and is no longer
1258			 * referenced.  It is safe to remove all
1259			 * remnants of the thread.
1260			 */
1261			THR_LIST_REMOVE(td);
1262			TAILQ_INSERT_HEAD(&worklist, td, gcle);
1263		}
1264	}
1265	KSE_LOCK_RELEASE(curthread->kse, &_thread_list_lock);
1266	_kse_critical_leave(crit);
1267
1268	while ((td = TAILQ_FIRST(&worklist)) != NULL) {
1269		TAILQ_REMOVE(&worklist, td, gcle);
1270
1271		if ((td->attr.flags & PTHREAD_SCOPE_SYSTEM) != 0) {
1272			crit = _kse_critical_enter();
1273			KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
1274			kse_free_unlocked(td->kse);
1275			kseg_free_unlocked(td->kseg);
1276			KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
1277			_kse_critical_leave(crit);
1278		}
1279		/*
1280		 * XXX we don't free the initial thread, because some code
1281		 * might still be referencing it.
1282		 */
1283		if (td != _thr_initial) {
1284			DBG_MSG("Freeing thread %p\n", td);
1285			_thr_free(curthread, td);
1286		} else
1287			DBG_MSG("Initial thread won't be freed\n");
1288	}
1289	/* XXX the free kse and ksegrp lists should be examined as well. */
1290}
1291
1292
1293/*
1294 * Only new threads that are running or suspended may be scheduled.
1295 */
1296int
1297_thr_schedule_add(struct pthread *curthread, struct pthread *newthread)
1298{
1299	kse_critical_t crit;
1300	int ret;
1301
1302	/* Add the new thread. */
1303	thr_link(newthread);
1304
1305	/*
1306	 * If this is the first time creating a thread, make sure
1307	 * the mailbox is set for the current thread.
1308	 */
1309	if ((newthread->attr.flags & PTHREAD_SCOPE_SYSTEM) != 0) {
1310		/* We use the thread's stack as the KSE's stack. */
1311		newthread->kse->k_mbx.km_stack.ss_sp =
1312		    newthread->attr.stackaddr_attr;
1313		newthread->kse->k_mbx.km_stack.ss_size =
1314		    newthread->attr.stacksize_attr;
1315
1316		/*
1317		 * No need to lock the scheduling queue since the
1318		 * KSE/KSEG pair have not yet been started.
1319		 */
1320		KSEG_THRQ_ADD(newthread->kseg, newthread);
1321		/* this thread never gives up kse */
1322		newthread->active = 1;
1323		newthread->kse->k_curthread = newthread;
1324		newthread->kse->k_mbx.km_flags = KMF_BOUND;
1325		newthread->kse->k_mbx.km_func = (kse_func_t *)kse_sched_single;
1326		newthread->kse->k_mbx.km_quantum = 0;
1327		KSE_SET_MBOX(newthread->kse, newthread);
1328		/*
1329		 * This thread needs a new KSE and KSEG.
1330		 */
1331		newthread->kse->k_flags &= ~KF_INITIALIZED;
1332		newthread->kse->k_flags |= KF_STARTED;
1333		/* Fire up! */
1334		ret = kse_create(&newthread->kse->k_mbx, 1);
1335		if (ret != 0)
1336			ret = errno;
1337	}
1338	else {
1339		/*
1340		 * Lock the KSE and add the new thread to its list of
1341		 * assigned threads.  If the new thread is runnable, also
1342		 * add it to the KSE's run queue.
1343		 */
1344		crit = _kse_critical_enter();
1345		KSE_SCHED_LOCK(curthread->kse, newthread->kseg);
1346		KSEG_THRQ_ADD(newthread->kseg, newthread);
1347		if (newthread->state == PS_RUNNING)
1348			THR_RUNQ_INSERT_TAIL(newthread);
1349		if ((newthread->kse->k_flags & KF_STARTED) == 0) {
1350			/*
1351			 * This KSE hasn't been started yet.  Start it
1352			 * outside of holding the lock.
1353			 */
1354			newthread->kse->k_flags |= KF_STARTED;
1355			newthread->kse->k_mbx.km_func =
1356			    (kse_func_t *)kse_sched_multi;
1357			newthread->kse->k_mbx.km_flags = 0;
1358			kse_create(&newthread->kse->k_mbx, 0);
1359		 } else if ((newthread->state == PS_RUNNING) &&
1360		     KSE_IS_IDLE(newthread->kse)) {
1361			/*
1362			 * The thread is being scheduled on another KSEG.
1363			 */
1364			kse_wakeup_one(newthread);
1365		}
1366		KSE_SCHED_UNLOCK(curthread->kse, newthread->kseg);
1367		_kse_critical_leave(crit);
1368		ret = 0;
1369	}
1370	if (ret != 0)
1371		thr_unlink(newthread);
1372
1373	return (ret);
1374}
1375
1376void
1377kse_waitq_insert(struct pthread *thread)
1378{
1379	struct pthread *td;
1380
1381	if (thread->wakeup_time.tv_sec == -1)
1382		TAILQ_INSERT_TAIL(&thread->kse->k_schedq->sq_waitq, thread,
1383		    pqe);
1384	else {
1385		td = TAILQ_FIRST(&thread->kse->k_schedq->sq_waitq);
1386		while ((td != NULL) && (td->wakeup_time.tv_sec != -1) &&
1387		    ((td->wakeup_time.tv_sec < thread->wakeup_time.tv_sec) ||
1388		    ((td->wakeup_time.tv_sec == thread->wakeup_time.tv_sec) &&
1389		    (td->wakeup_time.tv_nsec <= thread->wakeup_time.tv_nsec))))
1390			td = TAILQ_NEXT(td, pqe);
1391		if (td == NULL)
1392			TAILQ_INSERT_TAIL(&thread->kse->k_schedq->sq_waitq,
1393			    thread, pqe);
1394		else
1395			TAILQ_INSERT_BEFORE(td, thread, pqe);
1396	}
1397	thread->flags |= THR_FLAGS_IN_WAITQ;
1398}
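/*
 * Illustrative example of the resulting order: if threads with wakeup
 * times of 5s and 2s and one untimed waiter (tv_sec == -1) are
 * inserted above, the wait queue reads, head to tail:
 *
 *	2s waiter, 5s waiter, untimed waiter
 *
 * Timed waiters stay sorted by wakeup time and untimed waiters collect
 * at the tail, which is why kse_check_waitq() and kse_wait() only need
 * to look at KSE_WAITQ_FIRST().
 */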
1399
1400/*
1401 * This must be called with the scheduling lock held.
1402 */
1403static void
1404kse_check_completed(struct kse *kse)
1405{
1406	struct pthread *thread;
1407	struct kse_thr_mailbox *completed;
1408	int sig;
1409
1410	if ((completed = kse->k_mbx.km_completed) != NULL) {
1411		kse->k_mbx.km_completed = NULL;
1412		while (completed != NULL) {
1413			thread = completed->tm_udata;
1414			DBG_MSG("Found completed thread %p, name %s\n",
1415			    thread,
1416			    (thread->name == NULL) ? "none" : thread->name);
1417			thread->blocked = 0;
1418			if (thread != kse->k_curthread) {
1419				if ((thread->flags & THR_FLAGS_SUSPENDED) != 0)
1420					THR_SET_STATE(thread, PS_SUSPENDED);
1421				else
1422					KSE_RUNQ_INSERT_TAIL(kse, thread);
1423				if ((thread->kse != kse) &&
1424				    (thread->kse->k_curthread == thread)) {
1425					thread->kse->k_curthread = NULL;
1426					thread->active = 0;
1427				}
1428			}
1429			if ((sig = thread->tmbx.tm_syncsig.si_signo) != 0) {
1430				if (SIGISMEMBER(thread->sigmask, sig))
1431					SIGADDSET(thread->sigpend, sig);
1432				else
1433					_thr_sig_add(thread, sig, &thread->tmbx.tm_syncsig);
1434				thread->tmbx.tm_syncsig.si_signo = 0;
1435			}
1436			completed = completed->tm_next;
1437		}
1438	}
1439}
1440
1441/*
1442 * This must be called with the scheduling lock held.
1443 */
1444static void
1445kse_check_waitq(struct kse *kse)
1446{
1447	struct pthread	*pthread;
1448	struct timespec ts;
1449
1450	KSE_GET_TOD(kse, &ts);
1451
1452	/*
1453	 * Wake up threads that have timed out.  This has to be
1454	 * done before adding the current thread to the run queue
1455	 * so that a CPU intensive thread doesn't get preference
1456	 * over waiting threads.
1457	 */
1458	while (((pthread = KSE_WAITQ_FIRST(kse)) != NULL) &&
1459	    thr_timedout(pthread, &ts)) {
1460		/* Remove the thread from the wait queue: */
1461		KSE_WAITQ_REMOVE(kse, pthread);
1462		DBG_MSG("Found timedout thread %p in waitq\n", pthread);
1463
1464		/* Indicate the thread timed out: */
1465		pthread->timeout = 1;
1466
1467		/* Add the thread to the priority queue: */
1468		if ((pthread->flags & THR_FLAGS_SUSPENDED) != 0)
1469			THR_SET_STATE(pthread, PS_SUSPENDED);
1470		else {
1471			THR_SET_STATE(pthread, PS_RUNNING);
1472			KSE_RUNQ_INSERT_TAIL(kse, pthread);
1473		}
1474	}
1475}
1476
1477static int
1478thr_timedout(struct pthread *thread, struct timespec *curtime)
1479{
1480	if (thread->wakeup_time.tv_sec < 0)
1481		return (0);
1482	else if (thread->wakeup_time.tv_sec > curtime->tv_sec)
1483		return (0);
1484	else if ((thread->wakeup_time.tv_sec == curtime->tv_sec) &&
1485	    (thread->wakeup_time.tv_nsec > curtime->tv_nsec))
1486		return (0);
1487	else
1488		return (1);
1489}
1490
1491/*
1492 * This must be called with the scheduling lock held.
1493 *
1494 * Each thread has a time slice, a wakeup time (used when it wants
1495 * to wait for a specified amount of time), a run state, and an
1496 * active flag.
1497 *
1498 * When a thread gets run by the scheduler, the active flag is
1499 * set to non-zero (1).  When a thread performs an explicit yield
1500 * or schedules a state change, it enters the scheduler and the
1501 * active flag is cleared.  When the active flag is still seen
1502 * set in the scheduler, that means that the thread is blocked in
1503 * the kernel (because it is cleared before entering the scheduler
1504 * in all other instances).
1505 *
1506 * The wakeup time is only set for those states that can timeout.
1507 * It is set to (-1, -1) for all other instances.
1508 *
1509 * The thread's run state, aside from being useful when debugging,
1510 * is used to place the thread in an appropriate queue.  There
1511 * are 2 basic queues:
1512 *
1513 *   o run queue - queue ordered by priority for all threads
1514 *                 that are runnable
1515 *   o waiting queue - queue sorted by wakeup time for all threads
1516 *                     that are not otherwise runnable (not blocked
1517 *                     in kernel, not waiting for locks)
1518 *
1519 * The thread's time slice is used for round-robin scheduling
1520 * (the default scheduling policy).  While a SCHED_RR thread
1521 * is runnable its time slice accumulates.  When it reaches
1522 * the time slice interval, it gets reset and added to the end
1523 * of the queue of threads at its priority.  When a thread is
1524 * no longer runnable (blocks in the kernel, waits, etc.), its
1525 * time slice is reset.
1526 *
1527 * The job of kse_switchout_thread() is to handle all of the above.
1528 */
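/*
 * Worked example of the accounting above (the numbers are purely
 * illustrative; _clock_res_usec and TIMESLICE_USEC come from the
 * library's configuration, not from this sketch): with a clock
 * resolution of 10000 usec, a thread that accumulated 3 user ticks
 * and 1 system tick since it was switched in is charged
 *
 *	slice_usec += (3 + 1) * 10000;		(40000 usec)
 *
 * in kse_switchout_thread() below.  Once slice_usec exceeds
 * TIMESLICE_USEC the slice is reset to -1 and the thread goes to the
 * tail of the run queue for its priority; otherwise it is reinserted
 * at the head.
 */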
1529static void
1530kse_switchout_thread(struct kse *kse, struct pthread *thread)
1531{
1532	int level;
1533	int i;
1534	int restart;
1535	siginfo_t siginfo;
1536
1537	/*
1538	 * Place the currently running thread into the
1539	 * appropriate queue(s).
1540	 */
1541	DBG_MSG("Switching out thread %p, state %d\n", thread, thread->state);
1542
1543	THR_DEACTIVATE_LAST_LOCK(thread);
1544	if (thread->blocked != 0) {
1545		thread->active = 0;
1546		thread->need_switchout = 0;
1547		/* This thread must have blocked in the kernel. */
1548		/* thread->slice_usec = -1;*/	/* restart timeslice */
1549		if ((thread->slice_usec != -1) &&
1550		    (thread->attr.sched_policy != SCHED_FIFO))
1551			thread->slice_usec += (thread->tmbx.tm_uticks
1552			    + thread->tmbx.tm_sticks) * _clock_res_usec;
1553		/*
1554		 *  Check for pending signals for this thread to
1555		 *  see if we need to interrupt it in the kernel.
1556		 */
1557		if (thread->check_pending != 0) {
1558			for (i = 1; i <= _SIG_MAXSIG; ++i) {
1559				if (SIGISMEMBER(thread->sigpend, i) &&
1560				    !SIGISMEMBER(thread->sigmask, i)) {
1561					restart = _thread_sigact[i - 1].sa_flags & SA_RESTART;
1562					kse_thr_interrupt(&thread->tmbx,
1563					    restart ? KSE_INTR_RESTART : KSE_INTR_INTERRUPT, 0);
1564					break;
1565				}
1566			}
1567		}
1568	}
1569	else {
1570		switch (thread->state) {
1571		case PS_DEAD:
1572			/*
1573			 * The scheduler is operating on a different
1574			 * stack.  It is safe to do garbage collecting
1575			 * here.
1576			 */
1577			thread->active = 0;
1578			thread->need_switchout = 0;
1579			thr_cleanup(kse, thread);
1580			return;
1581			break;
1582
1583		case PS_RUNNING:
1584			if ((thread->flags & THR_FLAGS_SUSPENDED) != 0)
1585				THR_SET_STATE(thread, PS_SUSPENDED);
1586			break;
1587
1588		case PS_COND_WAIT:
1589		case PS_SLEEP_WAIT:
1590			/* Insert into the waiting queue: */
1591			KSE_WAITQ_INSERT(kse, thread);
1592			break;
1593
1594		case PS_LOCKWAIT:
1595			/*
1596			 * This state doesn't timeout.
1597			 */
1598			thread->wakeup_time.tv_sec = -1;
1599			thread->wakeup_time.tv_nsec = -1;
1600			level = thread->locklevel - 1;
1601			if (!_LCK_GRANTED(&thread->lockusers[level]))
1602				KSE_WAITQ_INSERT(kse, thread);
1603			else
1604				THR_SET_STATE(thread, PS_RUNNING);
1605			break;
1606
1607		case PS_SIGWAIT:
1608			KSE_WAITQ_INSERT(kse, thread);
1609			break;
1610		case PS_JOIN:
1611		case PS_MUTEX_WAIT:
1612		case PS_SIGSUSPEND:
1613		case PS_SUSPENDED:
1614		case PS_DEADLOCK:
1615		default:
1616			/*
1617			 * These states don't timeout.
1618			 */
1619			thread->wakeup_time.tv_sec = -1;
1620			thread->wakeup_time.tv_nsec = -1;
1621
1622			/* Insert into the waiting queue: */
1623			KSE_WAITQ_INSERT(kse, thread);
1624			break;
1625		}
1626		if (thread->state != PS_RUNNING) {
1627			/* Restart the time slice: */
1628			thread->slice_usec = -1;
1629		} else {
1630			if (thread->need_switchout != 0)
1631				/*
1632				 * The thread yielded on its own;
1633				 * restart the timeslice.
1634				 */
1635				thread->slice_usec = -1;
1636			else if ((thread->slice_usec != -1) &&
1637	   		    (thread->attr.sched_policy != SCHED_FIFO)) {
1638				thread->slice_usec += (thread->tmbx.tm_uticks
1639				    + thread->tmbx.tm_sticks) * _clock_res_usec;
1640				/* Check for time quantum exceeded: */
1641				if (thread->slice_usec > TIMESLICE_USEC)
1642					thread->slice_usec = -1;
1643			}
1644			if (thread->slice_usec == -1) {
1645				/*
1646				 * The thread exceeded its time quantum or
1647				 * it yielded the CPU; place it at the tail
1648				 * of the queue for its priority.
1649				 */
1650				KSE_RUNQ_INSERT_TAIL(kse, thread);
1651			} else {
1652				/*
1653				 * The thread hasn't exceeded its interval.
1654				 * Place it at the head of the queue for its
1655				 * priority.
1656				 */
1657				KSE_RUNQ_INSERT_HEAD(kse, thread);
1658			}
1659		}
1660	}
1661	thread->active = 0;
1662	thread->need_switchout = 0;
1663	if (thread->check_pending != 0) {
1664		/* Install pending signals into the frame. */
1665		thread->check_pending = 0;
1666		KSE_LOCK_ACQUIRE(kse, &_thread_signal_lock);
1667		for (i = 1; i <= _SIG_MAXSIG; i++) {
1668			if (SIGISMEMBER(thread->sigmask, i))
1669				continue;
1670			if (SIGISMEMBER(thread->sigpend, i))
1671				_thr_sig_add(thread, i, &thread->siginfo[i-1]);
1672			else if (SIGISMEMBER(_thr_proc_sigpending, i) &&
1673				_thr_getprocsig_unlocked(i, &siginfo)) {
1674				_thr_sig_add(thread, i, &siginfo);
1675			}
1676		}
1677		KSE_LOCK_RELEASE(kse, &_thread_signal_lock);
1678	}
1679}
1680
1681/*
1682 * This function waits for the smallest timeout value of any waiting
1683 * thread, or until it receives a message from another KSE.
1684 *
1685 * This must be called with the scheduling lock held.
1686 */
1687static void
1688kse_wait(struct kse *kse, struct pthread *td_wait, int sigseqno)
1689{
1690	struct timespec ts, ts_sleep;
1691	int saved_flags;
1692
1693	KSE_GET_TOD(kse, &ts);
1694
1695	if ((td_wait == NULL) || (td_wait->wakeup_time.tv_sec < 0)) {
1696		/* Limit sleep to no more than 1 minute. */
1697		ts_sleep.tv_sec = 60;
1698		ts_sleep.tv_nsec = 0;
1699	} else {
1700		TIMESPEC_SUB(&ts_sleep, &td_wait->wakeup_time, &ts);
1701		if (ts_sleep.tv_sec > 60) {
1702			ts_sleep.tv_sec = 60;
1703			ts_sleep.tv_nsec = 0;
1704		}
1705	}
1706	/* Don't sleep for negative times. */
1707	if ((ts_sleep.tv_sec >= 0) && (ts_sleep.tv_nsec >= 0)) {
1708		KSE_SET_IDLE(kse);
1709		kse->k_kseg->kg_idle_kses++;
1710		KSE_SCHED_UNLOCK(kse, kse->k_kseg);
1711		if ((kse->k_kseg->kg_flags & KGF_SINGLE_THREAD) &&
1712		    (kse->k_sigseqno != sigseqno))
1713			; /* don't sleep */
1714		else {
1715			saved_flags = kse->k_mbx.km_flags;
1716			kse->k_mbx.km_flags |= KMF_NOUPCALL;
1717			kse_release(&ts_sleep);
1718			kse->k_mbx.km_flags = saved_flags;
1719		}
1720		KSE_SCHED_LOCK(kse, kse->k_kseg);
1721		if (KSE_IS_IDLE(kse)) {
1722			KSE_CLEAR_IDLE(kse);
1723			kse->k_kseg->kg_idle_kses--;
1724		}
1725	}
1726}
1727
1728/*
1729 * Avoid calling this kse_exit() so as not to confuse it with the
1730 * system call of the same name.
1731 */
1732static void
1733kse_fini(struct kse *kse)
1734{
1735	/* struct kse_group *free_kseg = NULL; */
1736	struct timespec ts;
1737
1738	/*
1739	 * Check to see if this is one of the main kses.
1740	 */
1741	if (kse->k_kseg != _kse_initial->k_kseg) {
1742		PANIC("shouldn't get here");
1743		/* This is for supporting thread groups. */
1744#ifdef NOT_YET
1745		/* Remove this KSE from the KSEG's list of KSEs. */
1746		KSE_SCHED_LOCK(kse, kse->k_kseg);
1747		TAILQ_REMOVE(&kse->k_kseg->kg_kseq, kse, k_kgqe);
1748		kse->k_kseg->kg_ksecount--;
1749		if (TAILQ_EMPTY(&kse->k_kseg->kg_kseq))
1750			free_kseg = kse->k_kseg;
1751		KSE_SCHED_UNLOCK(kse, kse->k_kseg);
1752
1753		/*
1754		 * Add this KSE to the list of free KSEs along with
1755		 * the KSEG if it is now orphaned.
1756		 */
1757		KSE_LOCK_ACQUIRE(kse, &kse_lock);
1758		if (free_kseg != NULL)
1759			kseg_free_unlocked(free_kseg);
1760		kse_free_unlocked(kse);
1761		KSE_LOCK_RELEASE(kse, &kse_lock);
1762		kse_exit();
1763		/* Never returns. */
1764		PANIC("kse_exit()");
1765#endif
1766	} else {
1767#ifdef NOT_YET
1768		/*
1769		 * In the future, we might allow a program to kill
1770		 * a KSE in the initial group.
1771		 */
1772		if (kse != _kse_initial) {
1773			KSE_SCHED_LOCK(kse, kse->k_kseg);
1774			TAILQ_REMOVE(&kse->k_kseg->kg_kseq, kse, k_kgqe);
1775			kse->k_kseg->kg_ksecount--;
1776			KSE_SCHED_UNLOCK(kse, kse->k_kseg);
1777			KSE_LOCK_ACQUIRE(kse, &kse_lock);
1778			kse_free_unlocked(kse);
1779			KSE_LOCK_RELEASE(kse, &kse_lock);
1780			kse_exit();
1781                        /* Never returns. */
1782                        PANIC("kse_exit() failed for initial kseg");
1783                }
1784#endif
1785		KSE_SCHED_LOCK(kse, kse->k_kseg);
1786		KSE_SET_IDLE(kse);
1787		kse->k_kseg->kg_idle_kses++;
1788		KSE_SCHED_UNLOCK(kse, kse->k_kseg);
1789		ts.tv_sec = 120;
1790		ts.tv_nsec = 0;
1791		kse->k_mbx.km_flags = 0;
1792		kse_release(&ts);
1793		/* Never reached. */
1794	}
1795}
1796
1797void
1798_thr_set_timeout(const struct timespec *timeout)
1799{
1800	struct pthread	*curthread = _get_curthread();
1801	struct timespec ts;
1802
1803	/* Reset the timeout flag for the running thread: */
1804	curthread->timeout = 0;
1805
1806	/* Check if the thread is to wait forever: */
1807	if (timeout == NULL) {
1808		/*
1809		 * Set the wakeup time to something that can be recognized as
1810		 * different from an actual time of day:
1811		 */
1812		curthread->wakeup_time.tv_sec = -1;
1813		curthread->wakeup_time.tv_nsec = -1;
1814	}
1815	/* Check if no waiting is required: */
1816	else if ((timeout->tv_sec == 0) && (timeout->tv_nsec == 0)) {
1817		/* Set the wake up time to 'immediately': */
1818		curthread->wakeup_time.tv_sec = 0;
1819		curthread->wakeup_time.tv_nsec = 0;
1820	} else {
1821		/* Calculate the time for the current thread to wakeup: */
1822		KSE_GET_TOD(curthread->kse, &ts);
1823		TIMESPEC_ADD(&curthread->wakeup_time, &ts, timeout);
1824	}
1825}
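
/*
 * Illustrative examples (not part of the library) of the three cases
 * handled by _thr_set_timeout() above, and the wakeup_time values
 * that result:
 *
 *	struct timespec to = { 0, 0 };
 *
 *	_thr_set_timeout(NULL);		wakeup_time = { -1, -1 } (forever)
 *	_thr_set_timeout(&to);		wakeup_time = {  0,  0 } (no wait)
 *	to.tv_sec = 5;
 *	_thr_set_timeout(&to);		wakeup_time = current time + 5s
 */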
1826
1827void
1828_thr_panic_exit(char *file, int line, char *msg)
1829{
1830	char buf[256];
1831
1832	snprintf(buf, sizeof(buf), "(%s:%d) %s\n", file, line, msg);
1833	__sys_write(2, buf, strlen(buf));
1834	abort();
1835}
1836
1837void
1838_thr_setrunnable(struct pthread *curthread, struct pthread *thread)
1839{
1840	kse_critical_t crit;
1841
1842	crit = _kse_critical_enter();
1843	KSE_SCHED_LOCK(curthread->kse, thread->kseg);
1844	_thr_setrunnable_unlocked(thread);
1845	KSE_SCHED_UNLOCK(curthread->kse, thread->kseg);
1846	_kse_critical_leave(crit);
1847}
1848
1849void
1850_thr_setrunnable_unlocked(struct pthread *thread)
1851{
1852	if ((thread->kseg->kg_flags & KGF_SINGLE_THREAD) != 0) {
1853		/* No silly queues for these threads. */
1854		if ((thread->flags & THR_FLAGS_SUSPENDED) != 0)
1855			THR_SET_STATE(thread, PS_SUSPENDED);
1856		else
1857			THR_SET_STATE(thread, PS_RUNNING);
1858	} else if (thread->state != PS_RUNNING) {
1859		if ((thread->flags & THR_FLAGS_IN_WAITQ) != 0)
1860			KSE_WAITQ_REMOVE(thread->kse, thread);
1861		if ((thread->flags & THR_FLAGS_SUSPENDED) != 0)
1862			THR_SET_STATE(thread, PS_SUSPENDED);
1863		else {
1864			THR_SET_STATE(thread, PS_RUNNING);
1865			if ((thread->blocked == 0) && (thread->active == 0) &&
1866			    (thread->flags & THR_FLAGS_IN_RUNQ) == 0)
1867				THR_RUNQ_INSERT_TAIL(thread);
1868		}
1869	}
1870	/*
1871	 * XXX - Threads are not yet assigned to specific KSEs; they are
1872	 *       assigned to the KSEG.  So the fact that a thread's KSE is
1873	 *       waiting doesn't necessarily mean that it will be the KSE
1874	 *       that runs the thread after the lock is granted.  But we
1875	 *       don't know whether the other KSEs within the same KSEG
1876	 *       are also waiting, so we err on the side of caution and
1877	 *       wake up the thread's last known KSE.  We ensure that the
1878	 *       thread's KSE doesn't change while its scheduling lock is
1879	 *       held, so it is safe to reference it (the KSE).  If the
1880	 *       KSE wakes up and doesn't find any more work, it goes
1881	 *       back to waiting, so no harm is done.
1882	 */
1883	kse_wakeup_one(thread);
1884}
1885
1886static void
1887kse_wakeup_one(struct pthread *thread)
1888{
1889	struct kse *ke;
1890
1891	if (KSE_IS_IDLE(thread->kse)) {
1892		KSE_CLEAR_IDLE(thread->kse);
1893		thread->kseg->kg_idle_kses--;
1894		KSE_WAKEUP(thread->kse);
1895	} else {
1896		TAILQ_FOREACH(ke, &thread->kseg->kg_kseq, k_kgqe) {
1897			if (KSE_IS_IDLE(ke)) {
1898				KSE_CLEAR_IDLE(ke);
1899				ke->k_kseg->kg_idle_kses--;
1900				KSE_WAKEUP(ke);
1901				return;
1902			}
1903		}
1904	}
1905}
1906
1907static void
1908kse_wakeup_multi(struct kse *curkse)
1909{
1910	struct kse *ke;
1911	int tmp;
1912
1913	if ((tmp = KSE_RUNQ_THREADS(curkse)) && curkse->k_kseg->kg_idle_kses) {
1914		TAILQ_FOREACH(ke, &curkse->k_kseg->kg_kseq, k_kgqe) {
1915			if (KSE_IS_IDLE(ke)) {
1916				KSE_CLEAR_IDLE(ke);
1917				ke->k_kseg->kg_idle_kses--;
1918				KSE_WAKEUP(ke);
1919				if (--tmp == 0)
1920					break;
1921			}
1922		}
1923	}
1924}
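
/*
 * Worked example (illustrative only): if KSE_RUNQ_THREADS(curkse)
 * returns 3 and the group has 2 idle KSEs, the loop above wakes both
 * idle KSEs and tmp never reaches 0, so the TAILQ_FOREACH() simply
 * runs off the end of the list.  With 1 runnable thread and 2 idle
 * KSEs, only the first idle KSE found is woken.
 */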
1925
1926struct pthread *
1927_get_curthread(void)
1928{
1929	return (_ksd_curthread());
1930}
1931
1932/* This assumes the caller has disabled upcalls. */
1933struct kse *
1934_get_curkse(void)
1935{
1936	return (_ksd_curkse());
1937}
1938
1939void
1940_set_curkse(struct kse *kse)
1941{
1942	_ksd_setprivate(&kse->k_ksd);
1943}
1944
1945/*
1946 * Allocate a new KSEG.
1947 *
1948 * We allow the current thread to be NULL in the case that this
1949 * is the first time a KSEG is being created (library initialization).
1950 * In this case, we don't need to (and can't) take any locks.
1951 */
1952struct kse_group *
1953_kseg_alloc(struct pthread *curthread)
1954{
1955	struct kse_group *kseg = NULL;
1956	kse_critical_t crit;
1957
1958	if ((curthread != NULL) && (free_kseg_count > 0)) {
1959		/* Use the kse lock for the kseg queue. */
1960		crit = _kse_critical_enter();
1961		KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
1962		if ((kseg = TAILQ_FIRST(&free_kse_groupq)) != NULL) {
1963			TAILQ_REMOVE(&free_kse_groupq, kseg, kg_qe);
1964			free_kseg_count--;
1965			active_kseg_count++;
1966			TAILQ_INSERT_TAIL(&active_kse_groupq, kseg, kg_qe);
1967		}
1968		KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
1969		_kse_critical_leave(crit);
1970		if (kseg)
1971			kseg_reinit(kseg);
1972	}
1973
1974	/*
1975	 * Attempt to allocate a new KSE group only if one wasn't found
1976	 * in the free list above (the free list is only searched when
1977	 * there is a current thread available to take the locks).
1978	 */
1979	if ((kseg == NULL) &&
1980	    ((kseg = (struct kse_group *)malloc(sizeof(*kseg))) != NULL)) {
1981		if (_pq_alloc(&kseg->kg_schedq.sq_runq,
1982		    THR_MIN_PRIORITY, THR_LAST_PRIORITY) != 0) {
1983			free(kseg);
1984			kseg = NULL;
1985		} else {
1986			kseg_init(kseg);
1987			/* Add the KSEG to the list of active KSEGs. */
1988			if (curthread != NULL) {
1989				crit = _kse_critical_enter();
1990				KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
1991				active_kseg_count++;
1992				TAILQ_INSERT_TAIL(&active_kse_groupq,
1993				    kseg, kg_qe);
1994				KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
1995				_kse_critical_leave(crit);
1996			} else {
1997				active_kseg_count++;
1998				TAILQ_INSERT_TAIL(&active_kse_groupq,
1999				    kseg, kg_qe);
2000			}
2001		}
2002	}
2003	return (kseg);
2004}
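
/*
 * A minimal, self-contained sketch of the cache-then-malloc pattern
 * used by _kseg_alloc() above (and by _kse_alloc() and _thr_alloc()
 * below): recycle an object from a free list under a lock when one
 * is available, otherwise fall back to malloc().  All names in the
 * sketch (obj, free_objq, lock(), obj_init(), ...) are hypothetical.
 *
 *	struct obj *
 *	obj_alloc(void)
 *	{
 *		struct obj *o = NULL;
 *
 *		lock();
 *		if ((o = TAILQ_FIRST(&free_objq)) != NULL)
 *			TAILQ_REMOVE(&free_objq, o, qe);
 *		unlock();
 *		if (o != NULL)
 *			obj_reinit(o);		recycle a cached object
 *		else if ((o = malloc(sizeof(*o))) != NULL)
 *			obj_init(o);		first-time initialization
 *		return (o);
 *	}
 */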
2005
2006/*
2007 * This must be called with the kse lock held and when there are
2008 * no more threads that reference it.
2009 */
2010static void
2011kseg_free_unlocked(struct kse_group *kseg)
2012{
2013	TAILQ_REMOVE(&active_kse_groupq, kseg, kg_qe);
2014	TAILQ_INSERT_HEAD(&free_kse_groupq, kseg, kg_qe);
2015	free_kseg_count++;
2016	active_kseg_count--;
2017}
2018
2019void
2020_kseg_free(struct kse_group *kseg)
2021{
2022	struct kse *curkse;
2023	kse_critical_t crit;
2024
2025	crit = _kse_critical_enter();
2026	curkse = _get_curkse();
2027	KSE_LOCK_ACQUIRE(curkse, &kse_lock);
2028	kseg_free_unlocked(kseg);
2029	KSE_LOCK_RELEASE(curkse, &kse_lock);
2030	_kse_critical_leave(crit);
2031}
2032
2033/*
2034 * Allocate a new KSE.
2035 *
2036 * We allow the current thread to be NULL in the case that this
2037 * is the first time a KSE is being created (library initialization).
2038 * In this case, we don't need to (and can't) take any locks.
2039 */
2040struct kse *
2041_kse_alloc(struct pthread *curthread, int sys_scope)
2042{
2043	struct kse *kse = NULL;
2044	kse_critical_t crit;
2045	int need_ksd = 0;
2046	int i;
2047
2048	if ((curthread != NULL) && (free_kse_count > 0)) {
2049		crit = _kse_critical_enter();
2050		KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
2051		/* Search for a finished KSE. */
2052		kse = TAILQ_FIRST(&free_kseq);
2053		while ((kse != NULL) &&
2054		    ((kse->k_mbx.km_flags & KMF_DONE) == 0)) {
2055			kse = TAILQ_NEXT(kse, k_qe);
2056		}
2057		if (kse != NULL) {
2058			DBG_MSG("found an unused kse.\n");
2059			TAILQ_REMOVE(&free_kseq, kse, k_qe);
2060			free_kse_count--;
2061			TAILQ_INSERT_TAIL(&active_kseq, kse, k_qe);
2062			active_kse_count++;
2063		}
2064		KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
2065		_kse_critical_leave(crit);
2066		if (kse != NULL)
2067			kse_reinit(kse, sys_scope);
2068	}
2069	if ((kse == NULL) &&
2070	    ((kse = (struct kse *)malloc(sizeof(*kse))) != NULL)) {
2071		bzero(kse, sizeof(*kse));
2072
2073		/* Initialize the lockusers. */
2074		for (i = 0; i < MAX_KSE_LOCKLEVEL; i++) {
2075			_lockuser_init(&kse->k_lockusers[i], (void *)kse);
2076			_LCK_SET_PRIVATE2(&kse->k_lockusers[i], NULL);
2077		}
2078		/* _lock_init(kse->k_lock, ...) */
2079
2080		/* We had to malloc a kse; mark it as needing its KSD initialized. */
2081		need_ksd = 1;
2082
2083		/*
2084		 * Create the KSE context.
2085		 * Scope system threads (one thread per KSE) are not required
2086		 * to have a separate stack for the (unneeded) KSE upcall.
2087		 */
2088		if (!sys_scope) {
2089			kse->k_mbx.km_func = (kse_func_t *)kse_sched_multi;
2090			kse->k_stack.ss_sp = (char *) malloc(KSE_STACKSIZE);
2091			kse->k_stack.ss_size = KSE_STACKSIZE;
2092		} else {
2093			kse->k_mbx.km_func = (kse_func_t *)kse_sched_single;
2094		}
2095		kse->k_mbx.km_udata = (void *)kse;
2096		kse->k_mbx.km_quantum = 20000;
2097		/*
2098		 * We need to keep a copy of the stack in case it
2099		 * doesn't get used; a KSE running a scope system
2100		 * thread will use that thread's stack.
2101		 */
2102		kse->k_mbx.km_stack = kse->k_stack;
2103		if (!sys_scope && kse->k_stack.ss_sp == NULL) {
2104			for (i = 0; i < MAX_KSE_LOCKLEVEL; i++) {
2105				_lockuser_destroy(&kse->k_lockusers[i]);
2106			}
2107			/* _lock_destroy(&kse->k_lock); */
2108			free(kse);
2109			kse = NULL;
2110		}
2111	}
2112	if ((kse != NULL) && (need_ksd != 0)) {
2113		/* This KSE needs initialization. */
2114		if (curthread != NULL) {
2115			crit = _kse_critical_enter();
2116			KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
2117		}
2118		/* Initialize KSD inside of the lock. */
2119		if (_ksd_create(&kse->k_ksd, (void *)kse, sizeof(*kse)) != 0) {
2120			if (curthread != NULL) {
2121				KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
2122				_kse_critical_leave(crit);
2123			}
2124			if (kse->k_stack.ss_sp)
2125				free(kse->k_stack.ss_sp);
2126			for (i = 0; i < MAX_KSE_LOCKLEVEL; i++) {
2127				_lockuser_destroy(&kse->k_lockusers[i]);
2128			}
2129			free(kse);
2130			return (NULL);
2131		}
2132		kse->k_flags = 0;
2133		TAILQ_INSERT_TAIL(&active_kseq, kse, k_qe);
2134		active_kse_count++;
2135		if (curthread != NULL) {
2136			KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
2137			_kse_critical_leave(crit);
2138		}
2139	}
2140	return (kse);
2141}
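
/*
 * Summary (illustrative only) of how the two scopes are configured
 * here and in kse_reinit() below:
 *
 *	process scope (sys_scope == 0):
 *		km_func  = kse_sched_multi
 *		km_stack = a private KSE_STACKSIZE upcall stack
 *
 *	system scope (sys_scope != 0):
 *		km_func  = kse_sched_single
 *		km_stack = none; the KSE's single thread runs on its
 *			   own stack
 */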
2142
2143static void
2144kse_reinit(struct kse *kse, int sys_scope)
2145{
2146	if (!sys_scope) {
2147		kse->k_mbx.km_func = (kse_func_t *)kse_sched_multi;
2148		if (kse->k_stack.ss_sp == NULL) {
2149			/* XXX check allocation failure */
2150			kse->k_stack.ss_sp = (char *) malloc(KSE_STACKSIZE);
2151			kse->k_stack.ss_size = KSE_STACKSIZE;
2152		}
2153		kse->k_mbx.km_quantum = 20000;
2154	} else {
2155		kse->k_mbx.km_func = (kse_func_t *)kse_sched_single;
2156		if (kse->k_stack.ss_sp)
2157			free(kse->k_stack.ss_sp);
2158		kse->k_stack.ss_sp = NULL;
2159		kse->k_stack.ss_size = 0;
2160		kse->k_mbx.km_quantum = 0;
2161	}
2162	kse->k_mbx.km_stack = kse->k_stack;
2163	kse->k_mbx.km_udata = (void *)kse;
2164	kse->k_mbx.km_curthread = NULL;
2165	kse->k_mbx.km_flags = 0;
2166	kse->k_curthread = NULL;
2167	kse->k_kseg = NULL;
2168	kse->k_schedq = NULL;
2169	kse->k_locklevel = 0;
2170	SIGEMPTYSET(kse->k_sigmask);
2171	bzero(&kse->k_sigq, sizeof(kse->k_sigq));
2172	kse->k_check_sigq = 0;
2173	kse->k_flags = 0;
2174	kse->k_waiting = 0;
2175	kse->k_idle = 0;
2176	kse->k_error = 0;
2177	kse->k_cpu = 0;
2178	kse->k_done = 0;
2179	kse->k_switch = 0;
2180	kse->k_sigseqno = 0;
2181}
2182
2183void
2184kse_free_unlocked(struct kse *kse)
2185{
2186	TAILQ_REMOVE(&active_kseq, kse, k_qe);
2187	active_kse_count--;
2188	kse->k_kseg = NULL;
2189	kse->k_mbx.km_quantum = 20000;
2190	kse->k_flags = 0;
2191	TAILQ_INSERT_HEAD(&free_kseq, kse, k_qe);
2192	free_kse_count++;
2193}
2194
2195void
2196_kse_free(struct pthread *curthread, struct kse *kse)
2197{
2198	kse_critical_t crit;
2199
2200	if (curthread == NULL)
2201		kse_free_unlocked(kse);
2202	else {
2203		crit = _kse_critical_enter();
2204		KSE_LOCK_ACQUIRE(curthread->kse, &kse_lock);
2205		kse_free_unlocked(kse);
2206		KSE_LOCK_RELEASE(curthread->kse, &kse_lock);
2207		_kse_critical_leave(crit);
2208	}
2209}
2210
2211static void
2212kseg_init(struct kse_group *kseg)
2213{
2214	kseg_reinit(kseg);
2215	_lock_init(&kseg->kg_lock, LCK_ADAPTIVE, _kse_lock_wait,
2216	    _kse_lock_wakeup);
2217}
2218
2219static void
2220kseg_reinit(struct kse_group *kseg)
2221{
2222	TAILQ_INIT(&kseg->kg_kseq);
2223	TAILQ_INIT(&kseg->kg_threadq);
2224	TAILQ_INIT(&kseg->kg_schedq.sq_waitq);
2225	kseg->kg_threadcount = 0;
2226	kseg->kg_ksecount = 0;
2227	kseg->kg_idle_kses = 0;
2228	kseg->kg_flags = 0;
2229}
2230
2231struct pthread *
2232_thr_alloc(struct pthread *curthread)
2233{
2234	kse_critical_t crit;
2235	void *p;
2236	struct pthread *thread = NULL;
2237
2238	if (curthread != NULL) {
2239		if (GC_NEEDED())
2240			_thr_gc(curthread);
2241		if (free_thread_count > 0) {
2242			crit = _kse_critical_enter();
2243			KSE_LOCK_ACQUIRE(curthread->kse, &thread_lock);
2244			if ((thread = TAILQ_FIRST(&free_threadq)) != NULL) {
2245				TAILQ_REMOVE(&free_threadq, thread, tle);
2246				free_thread_count--;
2247			}
2248			KSE_LOCK_RELEASE(curthread->kse, &thread_lock);
2249			_kse_critical_leave(crit);
2250		}
2251	}
2252	if (thread == NULL) {
2253		p = malloc(sizeof(struct pthread) + THR_ALIGNBYTES);
2254		if (p != NULL) {
2255			thread = (struct pthread *)THR_ALIGN(p);
2256			thread->alloc_addr = p;
2257		}
2258	}
2259	return (thread);
2260}
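
/*
 * A self-contained sketch (not the library's code) of the
 * over-allocate-and-align trick used at the end of _thr_alloc()
 * above: allocate extra bytes, round the pointer up to the required
 * alignment, and remember the original pointer so it can later be
 * handed to free().  ALIGN is a hypothetical power-of-two alignment;
 * the library uses its THR_ALIGN()/THR_ALIGNBYTES macros instead
 * (uintptr_t comes from <stdint.h>).
 *
 *	#define ALIGN		16
 *	#define ALIGNBYTES	(ALIGN - 1)
 *
 *	void *raw = malloc(sizeof(struct pthread) + ALIGNBYTES);
 *	struct pthread *thr = NULL;
 *
 *	if (raw != NULL) {
 *		thr = (struct pthread *)
 *		    (((uintptr_t)raw + ALIGNBYTES) & ~(uintptr_t)ALIGNBYTES);
 *		thr->alloc_addr = raw;		later: free(thr->alloc_addr)
 *	}
 */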
2261
2262void
2263_thr_free(struct pthread *curthread, struct pthread *thread)
2264{
2265	kse_critical_t crit;
2266	int i;
2267
2268	DBG_MSG("Freeing thread %p\n", thread);
2269	if ((curthread == NULL) || (free_thread_count >= MAX_CACHED_THREADS)) {
2270		for (i = 0; i < MAX_THR_LOCKLEVEL; i++) {
2271			_lockuser_destroy(&thread->lockusers[i]);
2272		}
2273		_lock_destroy(&thread->lock);
2274		free(thread->alloc_addr);
2275	}
2276	else {
2277		crit = _kse_critical_enter();
2278		KSE_LOCK_ACQUIRE(curthread->kse, &thread_lock);
2279		TAILQ_INSERT_TAIL(&free_threadq, thread, tle);
2280		free_thread_count++;
2281		KSE_LOCK_RELEASE(curthread->kse, &thread_lock);
2282		_kse_critical_leave(crit);
2283	}
2284}
2285
2286/*
2287 * Add an active thread:
2288 *
2289 *   o Assign the thread a unique id (which GDB uses to track
2290 *     threads).
2291 *   o Add the thread to the list of all threads and increment
2292 *     the number of active threads.
2293 */
2294static void
2295thr_link(struct pthread *thread)
2296{
2297	kse_critical_t crit;
2298	struct kse *curkse;
2299	struct pthread *curthread;
2300
2301	crit = _kse_critical_enter();
2302	curkse = _get_curkse();
2303	curthread = _get_curthread();
2304	thread->sigmask = curthread->sigmask;
2305	KSE_LOCK_ACQUIRE(curkse, &_thread_list_lock);
2306	/*
2307	 * Initialize the unique id (which GDB uses to track
2308	 * threads), add the thread to the list of all threads,
2309	 * and increment the number of active threads.
2310	 */
2311	thread->uniqueid = next_uniqueid++;
2312	THR_LIST_ADD(thread);
2313	active_threads++;
2314	KSE_LOCK_RELEASE(curkse, &_thread_list_lock);
2315	_kse_critical_leave(crit);
2316}
2317
2318/*
2319 * Remove an active thread.
2320 */
2321static void
2322thr_unlink(struct pthread *thread)
2323{
2324	kse_critical_t crit;
2325	struct kse *curkse;
2326
2327	crit = _kse_critical_enter();
2328	curkse = _get_curkse();
2329	KSE_LOCK_ACQUIRE(curkse, &_thread_list_lock);
2330	THR_LIST_REMOVE(thread);
2331	active_threads--;
2332	KSE_LOCK_RELEASE(curkse, &_thread_list_lock);
2333	_kse_critical_leave(crit);
2334}
2335
2336void
2337_thr_hash_add(struct pthread *thread)
2338{
2339	struct thread_hash_head *head;
2340
2341	head = &thr_hashtable[THREAD_HASH(thread)];
2342	LIST_INSERT_HEAD(head, thread, hle);
2343}
2344
2345void
2346_thr_hash_remove(struct pthread *thread)
2347{
2348	LIST_REMOVE(thread, hle);
2349}
2350
2351struct pthread *
2352_thr_hash_find(struct pthread *thread)
2353{
2354	struct pthread *td;
2355	struct thread_hash_head *head;
2356
2357	head = &thr_hashtable[THREAD_HASH(thread)];
2358	LIST_FOREACH(td, head, hle) {
2359		if (td == thread)
2360			return (thread);
2361	}
2362	return (NULL);
2363}
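
/*
 * A small, self-contained sketch (hypothetical names) of the
 * hash-bucket membership test used by _thr_hash_find() above, built
 * on the same <sys/queue.h> LIST macros: hash the pointer to pick a
 * bucket, then walk that bucket's list looking for an exact match.
 *
 *	#define NBUCKETS	251
 *	#define HASH(p)		(((unsigned long)(p) >> 8) % NBUCKETS)
 *
 *	struct item {
 *		LIST_ENTRY(item) link;
 *	};
 *	LIST_HEAD(bucket, item) table[NBUCKETS];
 *
 *	int
 *	item_present(struct item *ip)
 *	{
 *		struct item *p;
 *
 *		LIST_FOREACH(p, &table[HASH(ip)], link) {
 *			if (p == ip)
 *				return (1);
 *		}
 *		return (0);
 *	}
 */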
2364
2365