kern_thread.c revision 107060
1/*
2 * Copyright (C) 2001 Julian Elischer <julian@freebsd.org>.
3 *  All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice(s), this list of conditions and the following disclaimer as
10 *    the first lines of this file unmodified other than the possible
11 *    addition of one or more copyright notices.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice(s), this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY
17 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY
20 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
26 * DAMAGE.
27 *
28 * $FreeBSD: head/sys/kern/kern_thread.c 107060 2002-11-18 12:28:15Z davidxu $
29 */
30
31#include <sys/param.h>
32#include <sys/systm.h>
33#include <sys/kernel.h>
34#include <sys/lock.h>
35#include <sys/malloc.h>
36#include <sys/mutex.h>
37#include <sys/proc.h>
38#include <sys/smp.h>
39#include <sys/sysctl.h>
40#include <sys/sysproto.h>
41#include <sys/filedesc.h>
42#include <sys/tty.h>
43#include <sys/signalvar.h>
44#include <sys/sx.h>
45#include <sys/user.h>
46#include <sys/jail.h>
47#include <sys/kse.h>
48#include <sys/ktr.h>
49#include <sys/ucontext.h>
50
51#include <vm/vm.h>
52#include <vm/vm_object.h>
53#include <vm/pmap.h>
54#include <vm/uma.h>
55#include <vm/vm_map.h>
56
57#include <machine/frame.h>
58
59/*
60 * KSEGRP related storage.
61 */
62static uma_zone_t ksegrp_zone;
63static uma_zone_t kse_zone;
64static uma_zone_t thread_zone;
65
66/* DEBUG ONLY */
67SYSCTL_NODE(_kern, OID_AUTO, threads, CTLFLAG_RW, 0, "thread allocation");
68static int oiks_debug = 0;	/* 0 disable, 1 printf, 2 enter debugger */
69SYSCTL_INT(_kern_threads, OID_AUTO, oiks, CTLFLAG_RW,
70	&oiks_debug, 0, "OIKS thread debug");
71
72static int oiks_max_threads_per_proc = 10;
73SYSCTL_INT(_kern_threads, OID_AUTO, oiks_max_per_proc, CTLFLAG_RW,
74	&oiks_max_threads_per_proc, 0, "Debug limit on threads per proc");
75
76static int max_threads_per_proc = 30;
77SYSCTL_INT(_kern_threads, OID_AUTO, max_threads_per_proc, CTLFLAG_RW,
78	&max_threads_per_proc, 0, "Limit on threads per proc");
79
80static int max_groups_per_proc = 5;
81SYSCTL_INT(_kern_threads, OID_AUTO, max_groups_per_proc, CTLFLAG_RW,
82	&max_groups_per_proc, 0, "Limit on thread groups per proc");
83
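/*
 * RANGEOF() gives the size in bytes of the region of a structure lying
 * between two named members; it is used with bzero()/bcopy() below on the
 * startzero/endzero and startcopy/endcopy marker fields.
 */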
84#define RANGEOF(type, start, end) (offsetof(type, end) - offsetof(type, start))
85
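/*
 * Queues of zombie threads, KSEs and ksegrps that have been discarded but
 * not yet reaped by thread_reap(); all three are protected by
 * zombie_thread_lock.
 */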
86struct threadqueue zombie_threads = TAILQ_HEAD_INITIALIZER(zombie_threads);
87TAILQ_HEAD(, kse) zombie_kses = TAILQ_HEAD_INITIALIZER(zombie_kses);
88TAILQ_HEAD(, ksegrp) zombie_ksegrps = TAILQ_HEAD_INITIALIZER(zombie_ksegrps);
89struct mtx zombie_thread_lock;
90MTX_SYSINIT(zombie_thread_lock, &zombie_thread_lock,
91    "zombie_thread_lock", MTX_SPIN);
92
93
94
95void kse_purge(struct proc *p, struct thread *td);
96/*
97 * Prepare a thread for use.
98 */
99static void
100thread_ctor(void *mem, int size, void *arg)
101{
102	struct thread	*td;
103
104	KASSERT((size == sizeof(struct thread)),
105	    ("size mismatch: %d != %d\n", size, (int)sizeof(struct thread)));
106
107	td = (struct thread *)mem;
108	td->td_state = TDS_INACTIVE;
109	td->td_flags |= TDF_UNBOUND;
110}
111
112/*
113 * Reclaim a thread after use.
114 */
115static void
116thread_dtor(void *mem, int size, void *arg)
117{
118	struct thread	*td;
119
120	KASSERT((size == sizeof(struct thread)),
121	    ("size mismatch: %d != %d\n", size, (int)sizeof(struct thread)));
122
123	td = (struct thread *)mem;
124
125#ifdef INVARIANTS
126	/* Verify that this thread is in a safe state to free. */
127	switch (td->td_state) {
128	case TDS_INHIBITED:
129	case TDS_RUNNING:
130	case TDS_CAN_RUN:
131	case TDS_RUNQ:
132		/*
133		 * We must never unlink a thread that is in one of
134		 * these states, because it is currently active.
135		 */
136		panic("bad state for thread unlinking");
137		/* NOTREACHED */
138	case TDS_INACTIVE:
139		break;
140	default:
141		panic("bad thread state");
142		/* NOTREACHED */
143	}
144#endif
145}
146
147/*
148 * Initialize type-stable parts of a thread (when newly created).
149 */
150static void
151thread_init(void *mem, int size)
152{
153	struct thread	*td;
154
155	KASSERT((size == sizeof(struct thread)),
156	    ("size mismatch: %d != %d\n", size, (int)sizeof(struct thread)));
157
158	td = (struct thread *)mem;
159	mtx_lock(&Giant);
160	pmap_new_thread(td, 0);
161	mtx_unlock(&Giant);
162	cpu_thread_setup(td);
163}
164
165/*
166 * Tear down type-stable parts of a thread (just before being discarded).
167 */
168static void
169thread_fini(void *mem, int size)
170{
171	struct thread	*td;
172
173	KASSERT((size == sizeof(struct thread)),
174	    ("size mismatch: %d != %d\n", size, (int)sizeof(struct thread)));
175
176	td = (struct thread *)mem;
177	pmap_dispose_thread(td);
178}
179
180/*
181 * Link a KSE into its KSEGRP's list of KSEs.
182 */
183void
184kse_link(struct kse *ke, struct ksegrp *kg)
185{
186	struct proc *p = kg->kg_proc;
187
188	TAILQ_INSERT_HEAD(&kg->kg_kseq, ke, ke_kglist);
189	kg->kg_kses++;
190	ke->ke_state = KES_UNQUEUED;
191	ke->ke_proc	= p;
192	ke->ke_ksegrp	= kg;
193	ke->ke_thread	= NULL;
194	ke->ke_oncpu = NOCPU;
195}
196
197void
198kse_unlink(struct kse *ke)
199{
200	struct ksegrp *kg;
201
202	mtx_assert(&sched_lock, MA_OWNED);
203	kg = ke->ke_ksegrp;
204	if (ke->ke_state == KES_IDLE) {
205		kg->kg_idle_kses--;
206		TAILQ_REMOVE(&kg->kg_iq, ke, ke_kgrlist);
207	}
208
209	TAILQ_REMOVE(&kg->kg_kseq, ke, ke_kglist);
210	if (--kg->kg_kses == 0) {
211			ksegrp_unlink(kg);
212	}
213	/*
214	 * Aggregate stats from the KSE
215	 */
216	kse_stash(ke);
217}
218
219void
220ksegrp_link(struct ksegrp *kg, struct proc *p)
221{
222
223	TAILQ_INIT(&kg->kg_threads);
224	TAILQ_INIT(&kg->kg_runq);	/* links with td_runq */
225	TAILQ_INIT(&kg->kg_slpq);	/* links with td_runq */
226	TAILQ_INIT(&kg->kg_kseq);	/* all kses in ksegrp */
227	TAILQ_INIT(&kg->kg_iq);		/* idle kses in ksegrp */
228	TAILQ_INIT(&kg->kg_lq);		/* loan kses in ksegrp */
229	kg->kg_proc	= p;
230/* the following counters are in the -zero- section and may not need clearing */
231	kg->kg_numthreads = 0;
232	kg->kg_runnable = 0;
233	kg->kg_kses = 0;
234	kg->kg_idle_kses = 0;
235	kg->kg_loan_kses = 0;
236	kg->kg_runq_kses = 0; /* XXXKSE change name */
237/* link it in now that it's consistent */
238	p->p_numksegrps++;
239	TAILQ_INSERT_HEAD(&p->p_ksegrps, kg, kg_ksegrp);
240}
241
242void
243ksegrp_unlink(struct ksegrp *kg)
244{
245	struct proc *p;
246
247	mtx_assert(&sched_lock, MA_OWNED);
248	p = kg->kg_proc;
249	KASSERT(((kg->kg_numthreads == 0) && (kg->kg_kses == 0)),
250	    ("kseg_unlink: residual threads or KSEs"));
251	TAILQ_REMOVE(&p->p_ksegrps, kg, kg_ksegrp);
252	p->p_numksegrps--;
253	/*
254	 * Aggregate stats from the KSE
255	 */
256	ksegrp_stash(kg);
257}
258
259/*
260 * For a newly created process,
261 * link up the structures and its initial threads etc.
262 */
263void
264proc_linkup(struct proc *p, struct ksegrp *kg,
265			struct kse *ke, struct thread *td)
266{
267
268	TAILQ_INIT(&p->p_ksegrps);	     /* all ksegrps in proc */
269	TAILQ_INIT(&p->p_threads);	     /* all threads in proc */
270	TAILQ_INIT(&p->p_suspended);	     /* Threads suspended */
271	p->p_numksegrps = 0;
272	p->p_numthreads = 0;
273
274	ksegrp_link(kg, p);
275	kse_link(ke, kg);
276	thread_link(td, kg);
277}
278
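/*
 * kse_thr_interrupt: find the thread in this process whose mailbox matches
 * uap->tmbx, mark it with TDF_INTERRUPT and, if it is in an interruptible
 * sleep, abort that sleep so it can notice the interrupt.
 */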
279int
280kse_thr_interrupt(struct thread *td, struct kse_thr_interrupt_args *uap)
281{
282	struct proc *p;
283	struct thread *td2;
284
285	p = td->td_proc;
286	/* KSE-enabled processes only, please. */
287	if (!(p->p_flag & P_KSES))
288		return (EINVAL);
289	if (uap->tmbx == NULL)
290		return (EINVAL);
291	mtx_lock_spin(&sched_lock);
292	FOREACH_THREAD_IN_PROC(p, td2) {
293		if (td2->td_mailbox == uap->tmbx) {
294			td2->td_flags |= TDF_INTERRUPT;
295			if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR)) {
296				if (td2->td_flags & TDF_CVWAITQ)
297					cv_abort(td2);
298				else
299					abortsleep(td2);
300			}
301			mtx_unlock_spin(&sched_lock);
302			td->td_retval[0] = 0;
303			td->td_retval[1] = 0;
304			return (0);
305		}
306	}
307	mtx_unlock_spin(&sched_lock);
308	return (ESRCH);
309}
310
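/*
 * kse_exit: exit a bound thread together with its KSE.  Fails with EDEADLK
 * if this is the last KSE in a group that still has other threads; if this
 * is the last thread of the process, just drop back out of KSE mode.
 */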
311int
312kse_exit(struct thread *td, struct kse_exit_args *uap)
313{
314	struct proc *p;
315	struct ksegrp *kg;
316
317	p = td->td_proc;
318	/* KSE-enabled processes only, please. */
319	if (!(p->p_flag & P_KSES))
320		return (EINVAL);
321	/* must be a bound thread */
322	if (td->td_flags & TDF_UNBOUND)
323		return (EINVAL);
324	kg = td->td_ksegrp;
325	/* serialize killing kse */
326	PROC_LOCK(p);
327	mtx_lock_spin(&sched_lock);
328	if ((kg->kg_kses == 1) && (kg->kg_numthreads > 1)) {
329		mtx_unlock_spin(&sched_lock);
330		PROC_UNLOCK(p);
331		return (EDEADLK);
332	}
333	if ((p->p_numthreads == 1) && (p->p_numksegrps == 1)) {
334		p->p_flag &= ~P_KSES;
335		mtx_unlock_spin(&sched_lock);
336		PROC_UNLOCK(p);
337	} else {
338		while (mtx_owned(&Giant))
339			mtx_unlock(&Giant);
340		td->td_kse->ke_flags |= KEF_EXIT;
341		thread_exit();
342		/* NOTREACHED */
343	}
344	return (0);
345}
346
347int
348kse_release(struct thread *td, struct kse_release_args *uap)
349{
350	struct proc *p;
351
352	p = td->td_proc;
353	/* KSE-enabled processes only */
354	if (!(p->p_flag & P_KSES))
355		return (EINVAL);
356	/*
357	 * Must be a bound thread.  And the kse must have a mailbox ready;
358	 * if not, the kse cannot generate an upcall.
359	 */
360	if (!(td->td_flags & TDF_UNBOUND) && (td->td_kse->ke_mailbox != NULL)) {
361		PROC_LOCK(p);
362		mtx_lock_spin(&sched_lock);
363		/* prevent last thread from exiting */
364		if (p->p_numthreads == 1) {
365			mtx_unlock_spin(&sched_lock);
366			if (td->td_standin == NULL) {
367				PROC_UNLOCK(p);
368				td->td_standin = thread_alloc();
369				PROC_LOCK(p);
370			}
371			msleep(p->p_sigacts, &p->p_mtx, PPAUSE|PCATCH,
372			       "pause", 0);
373			mtx_lock_spin(&sched_lock);
374			td->td_flags |= TDF_UNBOUND;
375			thread_schedule_upcall(td, td->td_kse);
376		}
377		thread_exit();
378		/* NOTREACHED */
379	}
380	return (EINVAL);
381}
382
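/*
 * kse_wakeup: schedule an upcall on an idle KSE.  If a mailbox is supplied,
 * look for the KSE owning it in any of the process's groups; otherwise use
 * the first idle KSE in the caller's own group.
 */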
383/* struct kse_wakeup_args {
384	struct kse_mailbox *mbx;
385}; */
386int
387kse_wakeup(struct thread *td, struct kse_wakeup_args *uap)
388{
389	struct proc *p;
390	struct kse *ke, *ke2;
391	struct ksegrp *kg;
392
393	p = td->td_proc;
394	/* KSE-enabled processes only, please. */
395	if (!(p->p_flag & P_KSES))
396		return EINVAL;
397	if (td->td_standin == NULL)
398		td->td_standin = thread_alloc();
399	ke = NULL;
400	mtx_lock_spin(&sched_lock);
401	if (uap->mbx) {
402		FOREACH_KSEGRP_IN_PROC(p, kg) {
403			FOREACH_KSE_IN_GROUP(kg, ke2) {
404				if (ke2->ke_mailbox != uap->mbx)
405					continue;
406				if (ke2->ke_state == KES_IDLE) {
407					ke = ke2;
408					goto found;
409				} else {
410					mtx_unlock_spin(&sched_lock);
411					td->td_retval[0] = 0;
412					td->td_retval[1] = 0;
413					return (0);
414				}
415			}
416		}
417	} else {
418		kg = td->td_ksegrp;
419		ke = TAILQ_FIRST(&kg->kg_iq);
420	}
421	if (ke == NULL) {
422		mtx_unlock_spin(&sched_lock);
423		return (ESRCH);
424	}
425found:
426	thread_schedule_upcall(td, ke);
427	mtx_unlock_spin(&sched_lock);
428	td->td_retval[0] = 0;
429	td->td_retval[1] = 0;
430	return (0);
431}
432
433/*
434 * No new KSEGRP: first call: use the current KSE, don't schedule an upcall.
435 * In all other situations, allocate a new KSE and schedule an upcall on it.
436 */
437/* struct kse_create_args {
438	struct kse_mailbox *mbx;
439	int newgroup;
440}; */
441int
442kse_create(struct thread *td, struct kse_create_args *uap)
443{
444	struct kse *newke;
445	struct kse *ke;
446	struct ksegrp *newkg;
447	struct ksegrp *kg;
448	struct proc *p;
449	struct kse_mailbox mbx;
450	int err;
451
452	p = td->td_proc;
453	if ((err = copyin(uap->mbx, &mbx, sizeof(mbx))))
454		return (err);
455
456	p->p_flag |= P_KSES; /* easier to just set it than to test and set */
457	kg = td->td_ksegrp;
458	if (uap->newgroup) {
459		if (p->p_numksegrps >= max_groups_per_proc)
460			return (EPROCLIM);
461		/*
462		 * If we want a new KSEGRP it doesn't matter whether
463		 * we have already fired up KSE mode before or not.
464		 * We put the process in KSE mode and create a new KSEGRP
465		 * and KSE. If our KSE has not got a mailbox yet then
466		 * that doesn't matter, just leave it that way. It will
467		 * ensure that this thread stays BOUND. It's possible
468		 * that the call came from a threaded library and the main
469		 * program knows nothing of threads.
470		 */
471		newkg = ksegrp_alloc();
472		bzero(&newkg->kg_startzero, RANGEOF(struct ksegrp,
473		      kg_startzero, kg_endzero));
474		bcopy(&kg->kg_startcopy, &newkg->kg_startcopy,
475		      RANGEOF(struct ksegrp, kg_startcopy, kg_endcopy));
476		newke = kse_alloc();
477	} else {
478		/*
479		 * Otherwise, if we have already set this KSE
480		 * to have a mailbox, we want to make another KSE here,
481		 * but only if we are not already at the limit, which
482		 * is 1 per CPU max.
483		 *
484		 * If the current KSE doesn't have a mailbox we just use it
485		 * and give it one.
486		 *
487		 * Because we don't like to access the KSE outside of
488		 * schedlock if we are UNBOUND (because it can change if
489		 * we are preempted by an interrupt), we simply assume that
490		 * it has a mailbox whenever we are UNBOUND, and only need
491		 * to actually look at it if we are BOUND,
492		 * which is safe.
493		 */
494		if ((td->td_flags & TDF_UNBOUND) || td->td_kse->ke_mailbox) {
495			if (oiks_debug == 0) {
496#ifdef SMP
497			if (kg->kg_kses > mp_ncpus)
498#endif
499				return (EPROCLIM);
500			}
501			newke = kse_alloc();
502		} else {
503			newke = NULL;
504		}
505		newkg = NULL;
506	}
507	if (newke) {
508		bzero(&newke->ke_startzero, RANGEOF(struct kse,
509		      ke_startzero, ke_endzero));
510#if 0
511		bcopy(&ke->ke_startcopy, &newke->ke_startcopy,
512		      RANGEOF(struct kse, ke_startcopy, ke_endcopy));
513#endif
514		/* For the first call this may not have been set */
515		if (td->td_standin == NULL) {
516			td->td_standin = thread_alloc();
517		}
518		mtx_lock_spin(&sched_lock);
519		if (newkg) {
520			if (p->p_numksegrps >= max_groups_per_proc) {
521				mtx_unlock_spin(&sched_lock);
522				ksegrp_free(newkg);
523				kse_free(newke);
524				return (EPROCLIM);
525			}
526			ksegrp_link(newkg, p);
527		}
528		else
529			newkg = kg;
530		kse_link(newke, newkg);
531		if (p->p_sflag & PS_NEEDSIGCHK)
532			newke->ke_flags |= KEF_ASTPENDING;
533		newke->ke_mailbox = uap->mbx;
534		newke->ke_upcall = mbx.km_func;
535		bcopy(&mbx.km_stack, &newke->ke_stack, sizeof(stack_t));
536		thread_schedule_upcall(td, newke);
537		mtx_unlock_spin(&sched_lock);
538	} else {
539		/*
540		 * If we didn't allocate a new KSE then we are using
541		 * the existing (BOUND) kse.
542		 */
543		ke = td->td_kse;
544		ke->ke_mailbox = uap->mbx;
545		ke->ke_upcall = mbx.km_func;
546		bcopy(&mbx.km_stack, &ke->ke_stack, sizeof(stack_t));
547	}
548	/*
549	 * Fill out the KSE-mode specific fields of the new kse.
550	 */
551
552	td->td_retval[0] = 0;
553	td->td_retval[1] = 0;
554	return (0);
555}
556
557/*
558 * Fill a ucontext_t with a thread's context information.
559 *
560 * This is an analogue to getcontext(3).
561 */
562void
563thread_getcontext(struct thread *td, ucontext_t *uc)
564{
565
566/*
567 * XXX this is declared in a MD include file, i386/include/ucontext.h but
568 * is used in MI code.
569 */
570#ifdef __i386__
571	get_mcontext(td, &uc->uc_mcontext);
572#endif
573	uc->uc_sigmask = td->td_proc->p_sigmask;
574}
575
576/*
577 * Set a thread's context from a ucontext_t.
578 *
579 * This is an analogue to setcontext(3).
580 */
581int
582thread_setcontext(struct thread *td, ucontext_t *uc)
583{
584	int ret;
585
586/*
587 * XXX this is declared in a MD include file, i386/include/ucontext.h but
588 * is used in MI code.
589 */
590#ifdef __i386__
591	ret = set_mcontext(td, &uc->uc_mcontext);
592#else
593	ret = ENOSYS;
594#endif
595	if (ret == 0) {
596		SIG_CANTMASK(uc->uc_sigmask);
597		PROC_LOCK(td->td_proc);
598		td->td_proc->p_sigmask = uc->uc_sigmask;
599		PROC_UNLOCK(td->td_proc);
600	}
601	return (ret);
602}
603
604/*
605 * Initialize global thread allocation resources.
606 */
607void
608threadinit(void)
609{
610
611#ifndef __ia64__
612	thread_zone = uma_zcreate("THREAD", sizeof (struct thread),
613	    thread_ctor, thread_dtor, thread_init, thread_fini,
614	    UMA_ALIGN_CACHE, 0);
615#else
616	/*
617	 * XXX the ia64 kstack allocator is really lame and is at the mercy
618	 * of contigmalloc().  This hackery is to pre-construct a whole
619	 * pile of thread structures with associated kernel stacks early
620	 * in the system startup while contigmalloc() still works. Once we
621	 * have them, keep them.  Sigh.
622	 */
623	thread_zone = uma_zcreate("THREAD", sizeof (struct thread),
624	    thread_ctor, thread_dtor, thread_init, thread_fini,
625	    UMA_ALIGN_CACHE, UMA_ZONE_NOFREE);
626	uma_prealloc(thread_zone, 512);		/* XXX arbitrary */
627#endif
628	ksegrp_zone = uma_zcreate("KSEGRP", sizeof (struct ksegrp),
629	    NULL, NULL, NULL, NULL,
630	    UMA_ALIGN_CACHE, 0);
631	kse_zone = uma_zcreate("KSE", sizeof (struct kse),
632	    NULL, NULL, NULL, NULL,
633	    UMA_ALIGN_CACHE, 0);
634}
635
636/*
637 * Stash an embarrassingly extra thread into the zombie thread queue.
638 */
639void
640thread_stash(struct thread *td)
641{
642	mtx_lock_spin(&zombie_thread_lock);
643	TAILQ_INSERT_HEAD(&zombie_threads, td, td_runq);
644	mtx_unlock_spin(&zombie_thread_lock);
645}
646
647/*
648 * Stash an embarrassingly extra kse into the zombie kse queue.
649 */
650void
651kse_stash(struct kse *ke)
652{
653	mtx_lock_spin(&zombie_thread_lock);
654	TAILQ_INSERT_HEAD(&zombie_kses, ke, ke_procq);
655	mtx_unlock_spin(&zombie_thread_lock);
656}
657
658/*
659 * Stash an embarrassingly extra ksegrp into the zombie ksegrp queue.
660 */
661void
662ksegrp_stash(struct ksegrp *kg)
663{
664	mtx_lock_spin(&zombie_thread_lock);
665	TAILQ_INSERT_HEAD(&zombie_ksegrps, kg, kg_ksegrp);
666	mtx_unlock_spin(&zombie_thread_lock);
667}
668
669/*
670 * Reap zombie threads.
671 */
672void
673thread_reap(void)
674{
675	struct thread *td_first, *td_next;
676	struct kse *ke_first, *ke_next;
677	struct ksegrp *kg_first, * kg_next;
678
679	/*
680	 * Don't even bother to lock if there are none at this instant;
681	 * we really don't care about the next instant..
682	 */
683	if ((!TAILQ_EMPTY(&zombie_threads))
684	    || (!TAILQ_EMPTY(&zombie_kses))
685	    || (!TAILQ_EMPTY(&zombie_ksegrps))) {
686		mtx_lock_spin(&zombie_thread_lock);
687		td_first = TAILQ_FIRST(&zombie_threads);
688		ke_first = TAILQ_FIRST(&zombie_kses);
689		kg_first = TAILQ_FIRST(&zombie_ksegrps);
690		if (td_first)
691			TAILQ_INIT(&zombie_threads);
692		if (ke_first)
693			TAILQ_INIT(&zombie_kses);
694		if (kg_first)
695			TAILQ_INIT(&zombie_ksegrps);
696		mtx_unlock_spin(&zombie_thread_lock);
697		while (td_first) {
698			td_next = TAILQ_NEXT(td_first, td_runq);
699			thread_free(td_first);
700			td_first = td_next;
701		}
702		while (ke_first) {
703			ke_next = TAILQ_NEXT(ke_first, ke_procq);
704			kse_free(ke_first);
705			ke_first = ke_next;
706		}
707		while (kg_first) {
708			kg_next = TAILQ_NEXT(kg_first, kg_ksegrp);
709			ksegrp_free(kg_first);
710			kg_first = kg_next;
711		}
712	}
713}
714
715/*
716 * Allocate a ksegrp.
717 */
718struct ksegrp *
719ksegrp_alloc(void)
720{
721	return (uma_zalloc(ksegrp_zone, M_WAITOK));
722}
723
724/*
725 * Allocate a kse.
726 */
727struct kse *
728kse_alloc(void)
729{
730	return (uma_zalloc(kse_zone, M_WAITOK));
731}
732
733/*
734 * Allocate a thread.
735 */
736struct thread *
737thread_alloc(void)
738{
739	thread_reap(); /* check if any zombies to get */
740	return (uma_zalloc(thread_zone, M_WAITOK));
741}
742
743/*
744 * Deallocate a ksegrp.
745 */
746void
747ksegrp_free(struct ksegrp *td)
748{
749	uma_zfree(ksegrp_zone, td);
750}
751
752/*
753 * Deallocate a kse.
754 */
755void
756kse_free(struct kse *td)
757{
758	uma_zfree(kse_zone, td);
759}
760
761/*
762 * Deallocate a thread.
763 */
764void
765thread_free(struct thread *td)
766{
767	uma_zfree(thread_zone, td);
768}
769
770/*
771 * Store the thread context in the UTS's mailbox.
772 * Then add the mailbox at the head of a list we are building in user space.
773 * The list is anchored in the ksegrp structure.
774 */
775int
776thread_export_context(struct thread *td)
777{
778	struct proc *p;
779	struct ksegrp *kg;
780	uintptr_t mbx;
781	void *addr;
782	int error;
783	ucontext_t uc;
784	uint temp;
785
786	p = td->td_proc;
787	kg = td->td_ksegrp;
788
789	/* Export the user/machine context. */
790#if 0
791	addr = (caddr_t)td->td_mailbox +
792	    offsetof(struct kse_thr_mailbox, tm_context);
793#else /* if user pointer arithmetic is valid in the kernel */
794		addr = (void *)(&td->td_mailbox->tm_context);
795#endif
796	error = copyin(addr, &uc, sizeof(ucontext_t));
797	if (error == 0) {
798		thread_getcontext(td, &uc);
799		error = copyout(&uc, addr, sizeof(ucontext_t));
800
801	}
802	if (error) {
803		PROC_LOCK(p);
804		psignal(p, SIGSEGV);
805		PROC_UNLOCK(p);
806		return (error);
807	}
808	/* Get the address of the list pointer in this thread's mailbox. */
809#if 0
810	addr = (caddr_t)td->td_mailbox
811	    + offsetof(struct kse_thr_mailbox , tm_next);
812#else /* if user pointer arithmetic is valid in the kernel */
813	addr = (void *)(&td->td_mailbox->tm_next);
814#endif
815	/*
816	 * Put the saved address of the previous first
817	 * entry into this one
818	 */
819	for (;;) {
820		mbx = (uintptr_t)kg->kg_completed;
821		if (suword(addr, mbx)) {
822			goto bad;
823		}
824		PROC_LOCK(p);
825		if (mbx == (uintptr_t)kg->kg_completed) {
826			kg->kg_completed = td->td_mailbox;
827			PROC_UNLOCK(p);
828			break;
829		}
830		PROC_UNLOCK(p);
831	}
832	addr = (caddr_t)td->td_mailbox
833		 + offsetof(struct kse_thr_mailbox, tm_sticks);
834	temp = fuword(addr) + td->td_usticks;
835	if (suword(addr, temp))
836		goto bad;
837	return (0);
838
839bad:
840	PROC_LOCK(p);
841	psignal(p, SIGSEGV);
842	PROC_UNLOCK(p);
843	return (EFAULT);
844}
845
846/*
847 * Take the list of completed mailboxes for this KSEGRP and put them on this
848 * KSE's mailbox as it's the next one going up.
849 */
850static int
851thread_link_mboxes(struct ksegrp *kg, struct kse *ke)
852{
853	struct proc *p = kg->kg_proc;
854	void *addr;
855	uintptr_t mbx;
856
857#if 0
858	addr = (caddr_t)ke->ke_mailbox
859	    + offsetof(struct kse_mailbox, km_completed);
860#else /* if user pointer arithmetic is valid in the kernel */
861		addr = (void *)(&ke->ke_mailbox->km_completed);
862#endif
863	for (;;) {
864		mbx = (uintptr_t)kg->kg_completed;
865		if (suword(addr, mbx)) {
866			PROC_LOCK(p);
867			psignal(p, SIGSEGV);
868			PROC_UNLOCK(p);
869			return (EFAULT);
870		}
871		/* XXXKSE could use atomic CMPXCH here */
872		PROC_LOCK(p);
873		if (mbx == (uintptr_t)kg->kg_completed) {
874			kg->kg_completed = NULL;
875			PROC_UNLOCK(p);
876			break;
877		}
878		PROC_UNLOCK(p);
879	}
880	return (0);
881}
882
883/*
884 * This function should be called at statclock interrupt time
885 */
886int
887thread_add_ticks_intr(int user, uint ticks)
888{
889	struct thread *td = curthread;
890	struct kse *ke = td->td_kse;
891
892	if (ke->ke_mailbox == NULL)
893		return -1;
894	if (user) {
895		/* Currently always done via ast() */
896		ke->ke_flags |= KEF_ASTPENDING;
897		ke->ke_uuticks += ticks;
898	} else {
899		if (td->td_mailbox != NULL)
900			td->td_usticks += ticks;
901		else
902			ke->ke_usticks += ticks;
903	}
904	return 0;
905}
906
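/*
 * Push the statclock ticks accumulated in the KSE (ke_uuticks/ke_usticks)
 * out to the tm_uticks/tm_sticks counters of the thread mailbox currently
 * named by the KSE mailbox's km_curthread pointer.
 */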
907static int
908thread_update_uticks(void)
909{
910	struct thread *td = curthread;
911	struct proc *p = td->td_proc;
912	struct kse *ke = td->td_kse;
913	struct kse_thr_mailbox *tmbx;
914	caddr_t addr;
915	uint uticks, sticks;
916
917	KASSERT(!(td->td_flags & TDF_UNBOUND), ("thread not bound."));
918
919	if (ke->ke_mailbox == NULL)
920		return 0;
921
922	uticks = ke->ke_uuticks;
923	ke->ke_uuticks = 0;
924	sticks = ke->ke_usticks;
925	ke->ke_usticks = 0;
926	tmbx = (void *)fuword((caddr_t)ke->ke_mailbox
927			+ offsetof(struct kse_mailbox, km_curthread));
928	if ((tmbx == NULL) || (tmbx == (void *)-1))
929		return 0;
930	if (uticks) {
931		addr = (caddr_t)tmbx + offsetof(struct kse_thr_mailbox, tm_uticks);
932		uticks += fuword(addr);
933		if (suword(addr, uticks))
934			goto bad;
935	}
936	if (sticks) {
937		addr = (caddr_t)tmbx + offsetof(struct kse_thr_mailbox, tm_sticks);
938		sticks += fuword(addr);
939		if (suword(addr, sticks))
940			goto bad;
941	}
942	return 0;
943bad:
944	PROC_LOCK(p);
945	psignal(p, SIGSEGV);
946	PROC_UNLOCK(p);
947	return -1;
948}
949
950/*
951 * Discard the current thread and exit from its context.
952 *
953 * Because we can't free a thread while we're operating under its context,
954 * push the current thread into our KSE's ke_tdspare slot, freeing the
955 * thread that might be there currently. Because we know that only this
956 * processor will run our KSE, we needn't worry about someone else grabbing
957 * our context before we do a cpu_throw.
958 */
959void
960thread_exit(void)
961{
962	struct thread *td;
963	struct kse *ke;
964	struct proc *p;
965	struct ksegrp	*kg;
966
967	td = curthread;
968	kg = td->td_ksegrp;
969	p = td->td_proc;
970	ke = td->td_kse;
971
972	mtx_assert(&sched_lock, MA_OWNED);
973	KASSERT(p != NULL, ("thread exiting without a process"));
974	KASSERT(ke != NULL, ("thread exiting without a kse"));
975	KASSERT(kg != NULL, ("thread exiting without a kse group"));
976	PROC_LOCK_ASSERT(p, MA_OWNED);
977	CTR1(KTR_PROC, "thread_exit: thread %p", td);
978	KASSERT(!mtx_owned(&Giant), ("dying thread owns giant"));
979
980	if (ke->ke_tdspare != NULL) {
981		thread_stash(ke->ke_tdspare);
982		ke->ke_tdspare = NULL;
983	}
984	if (td->td_standin != NULL) {
985		thread_stash(td->td_standin);
986		td->td_standin = NULL;
987	}
988
989	cpu_thread_exit(td);	/* XXXSMP */
990
991	/*
992	 * The last thread is left attached to the process
993	 * so that the whole bundle gets recycled. Skip
994	 * all this stuff.
995	 */
996	if (p->p_numthreads > 1) {
997		/*
998		 * Unlink this thread from its proc and the kseg.
999		 * In keeping with the other structs we probably should
1000		 * have a thread_unlink() that does some of this but it
1001		 * would only be called from here (I think) so it would
1002		 * be a waste. (might be useful for proc_fini() as well.)
1003 		 */
1004		TAILQ_REMOVE(&p->p_threads, td, td_plist);
1005		p->p_numthreads--;
1006		TAILQ_REMOVE(&kg->kg_threads, td, td_kglist);
1007		kg->kg_numthreads--;
1008		/*
1009		 * The test below is NOT true if we are the
1010		 * sole exiting thread. P_STOPPED_SINGLE is unset
1011		 * in exit1() after it is the only survivor.
1012		 */
1013		if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
1014			if (p->p_numthreads == p->p_suspcount) {
1015				thread_unsuspend_one(p->p_singlethread);
1016			}
1017		}
1018
1019		/* Reassign this thread's KSE. */
1020		ke->ke_thread = NULL;
1021		td->td_kse = NULL;
1022		ke->ke_state = KES_UNQUEUED;
1023		KASSERT((ke->ke_bound != td),
1024		    ("thread_exit: entered with ke_bound set"));
1025
1026		/*
1027		 * The reason for all this hoopla is
1028		 * an attempt to stop our thread stack from being freed
1029		 * until AFTER we have stopped running on it.
1030		 * Since we are under schedlock, almost any method where
1031		 * it is eventually freed by someone else is probably ok.
1032		 * (Especially if they do it under schedlock). We could
1033		 * almost free it here if we could be certain that
1034		 * the uma code wouldn't pull it apart immediately,
1035		 * but unfortunately we cannot guarantee that.
1036		 *
1037		 * For threads that are exiting and NOT killing their
1038		 * KSEs we can just stash it in the KSE, however
1039		 * in the case where the KSE is also being deallocated,
1040		 * we need to store it somewhere else. It turns out that
1041		 * we will never free the last KSE, so there is always one
1042		 * other KSE available. We might as well just choose one
1043		 * and stash it there. Being under schedlock should make that
1044		 * safe.
1045		 *
1046		 * In borrower threads, we can stash it in the lender
1047		 * where it won't be needed until this thread is long gone.
1048		 * Borrower threads can't kill their KSE anyhow, so even
1049		 * the KSE would be a safe place for them. It is not
1050		 * necessary to have a KSE (or KSEGRP) at all beyond this
1051		 * point, while we are under the protection of schedlock.
1052		 *
1053		 * Either give the KSE to another thread to use (or make
1054		 * it idle), or free it entirely, possibly along with its
1055		 * ksegrp if it's the last one.
1056		 */
1057		if (ke->ke_flags & KEF_EXIT) {
1058			kse_unlink(ke);
1059			/*
1060			 * Designate another KSE to hold our thread.
1061			 * Safe as long as we abide by whatever lock
1062			 * we control it with.. The other KSE will not
1063			 * be able to run it until we release the schedlock,
1064			 * but we need to be careful about it deciding to
1065			 * write to the stack before then. Luckily
1066			 * I believe that while another thread's
1067			 * standin thread can be used in this way, the
1068			 * spare thread for the KSE cannot be used without
1069			 * holding schedlock at least once.
1070			 */
1071			ke =  FIRST_KSE_IN_PROC(p);
1072		} else {
1073			kse_reassign(ke);
1074		}
1075#if 0
1076		if (ke->ke_bound) {
1077			/*
1078			 * WE are a borrower..
1079			 * stash our thread with the owner.
1080			 */
1081			if (ke->ke_bound->td_standin) {
1082				thread_stash(ke->ke_bound->td_standin);
1083			}
1084			ke->ke_bound->td_standin = td;
1085		} else {
1086#endif
1087			if (ke->ke_tdspare != NULL) {
1088				thread_stash(ke->ke_tdspare);
1089				ke->ke_tdspare = NULL;
1090			}
1091			ke->ke_tdspare = td;
1092#if 0
1093		}
1094#endif
1095		PROC_UNLOCK(p);
1096		td->td_state	= TDS_INACTIVE;
1097		td->td_proc	= NULL;
1098		td->td_ksegrp	= NULL;
1099		td->td_last_kse	= NULL;
1100	} else {
1101		PROC_UNLOCK(p);
1102	}
1103
1104	cpu_throw();
1105	/* NOTREACHED */
1106}
1107
1108/*
1109 * Link a thread to a process.
1110 * Set up anything that needs to be initialized for it to
1111 * be used by the process.
1112 *
1113 * Note that we do not link to the proc's ucred here.
1114 * The thread is linked as if running but no KSE assigned.
1115 */
1116void
1117thread_link(struct thread *td, struct ksegrp *kg)
1118{
1119	struct proc *p;
1120
1121	p = kg->kg_proc;
1122	td->td_state = TDS_INACTIVE;
1123	td->td_proc	= p;
1124	td->td_ksegrp	= kg;
1125	td->td_last_kse	= NULL;
1126
1127	LIST_INIT(&td->td_contested);
1128	callout_init(&td->td_slpcallout, 1);
1129	TAILQ_INSERT_HEAD(&p->p_threads, td, td_plist);
1130	TAILQ_INSERT_HEAD(&kg->kg_threads, td, td_kglist);
1131	p->p_numthreads++;
1132	kg->kg_numthreads++;
1133	if (oiks_debug && (p->p_numthreads > oiks_max_threads_per_proc)) {
1134		printf("OIKS %d\n", p->p_numthreads);
1135		if (oiks_debug > 1)
1136			Debugger("OIKS");
1137	}
1138	td->td_kse	= NULL;
1139}
1140
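/*
 * kse_purge: called once a process is back down to a single thread.
 * Stash every idle KSE and every ksegrp other than the surviving thread's,
 * leaving only that thread's ksegrp linked to the process.
 */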
1141void
1142kse_purge(struct proc *p, struct thread *td)
1143{
1144	struct kse *ke;
1145	struct ksegrp *kg;
1146
1147 	KASSERT(p->p_numthreads == 1, ("bad thread number"));
1148	mtx_lock_spin(&sched_lock);
1149	while ((kg = TAILQ_FIRST(&p->p_ksegrps)) != NULL) {
1150		while ((ke = TAILQ_FIRST(&kg->kg_iq)) != NULL) {
1151			TAILQ_REMOVE(&kg->kg_iq, ke, ke_kgrlist);
1152			kg->kg_idle_kses--;
1153			TAILQ_REMOVE(&kg->kg_kseq, ke, ke_kglist);
1154			kg->kg_kses--;
1155			if (ke->ke_tdspare)
1156				thread_stash(ke->ke_tdspare);
1157   			kse_stash(ke);
1158		}
1159		TAILQ_REMOVE(&p->p_ksegrps, kg, kg_ksegrp);
1160		p->p_numksegrps--;
1161		KASSERT(((kg->kg_kses == 0) && (kg != td->td_ksegrp)) ||
1162		    ((kg->kg_kses == 1) && (kg == td->td_ksegrp)),
1163			("wrong kg_kses"));
1164		if (kg != td->td_ksegrp) {
1165			ksegrp_stash(kg);
1166		}
1167	}
1168	TAILQ_INSERT_HEAD(&p->p_ksegrps, td->td_ksegrp, kg_ksegrp);
1169	p->p_numksegrps++;
1170	mtx_unlock_spin(&sched_lock);
1171}
1172
1173
1174/*
1175 * Create a thread and schedule it for upcall on the KSE given.
1176 */
1177struct thread *
1178thread_schedule_upcall(struct thread *td, struct kse *ke)
1179{
1180	struct thread *td2;
1181	struct ksegrp *kg;
1182	int newkse;
1183
1184	mtx_assert(&sched_lock, MA_OWNED);
1185	newkse = (ke != td->td_kse);
1186
1187	/*
1188	 * If the kse is already owned by another thread then we can't
1189	 * schedule an upcall because the other thread must be BOUND
1190	 * which means it is not in a position to take an upcall.
1191	 * We must be borrowing the KSE to allow us to complete some in-kernel
1192	 * work. When we complete, the Bound thread will have the chance to
1193	 * complete. This thread will sleep as planned. Hopefully there will
1194	 * eventually be an unbound thread that can be converted to an
1195	 * upcall to report the completion of this thread.
1196	 */
1197	if (ke->ke_bound && ((ke->ke_bound->td_flags & TDF_UNBOUND) == 0)) {
1198		return (NULL);
1199	}
1200	KASSERT((ke->ke_bound == NULL), ("kse already bound"));
1201
1202	if (ke->ke_state == KES_IDLE) {
1203		kg = ke->ke_ksegrp;
1204		TAILQ_REMOVE(&kg->kg_iq, ke, ke_kgrlist);
1205		kg->kg_idle_kses--;
1206		ke->ke_state = KES_UNQUEUED;
1207	}
1208	if ((td2 = td->td_standin) != NULL) {
1209		td->td_standin = NULL;
1210	} else {
1211		if (newkse)
1212			panic("no reserve thread when called with a new kse");
1213		/*
1214		 * If called from (e.g.) sleep and we do not have
1215		 * a reserve thread, then we've used it, so do not
1216		 * create an upcall.
1217		 */
1218		return (NULL);
1219	}
1220	CTR3(KTR_PROC, "thread_schedule_upcall: thread %p (pid %d, %s)",
1221	     td2, td->td_proc->p_pid, td->td_proc->p_comm);
1222	bzero(&td2->td_startzero,
1223	    (unsigned)RANGEOF(struct thread, td_startzero, td_endzero));
1224	bcopy(&td->td_startcopy, &td2->td_startcopy,
1225	    (unsigned) RANGEOF(struct thread, td_startcopy, td_endcopy));
1226	thread_link(td2, ke->ke_ksegrp);
1227	cpu_set_upcall(td2, td->td_pcb);
1228
1229	/*
1230	 * XXXKSE do we really need this? (default values for the
1231	 * frame).
1232	 */
1233	bcopy(td->td_frame, td2->td_frame, sizeof(struct trapframe));
1234
1235	/*
1236	 * Bind the new thread to the KSE,
1237	 * and if it's our KSE, lend it back to ourself
1238	 * so we can continue running.
1239	 */
1240	td2->td_ucred = crhold(td->td_ucred);
1241	td2->td_flags = TDF_UPCALLING; /* note: BOUND */
1242	td2->td_kse = ke;
1243	td2->td_state = TDS_CAN_RUN;
1244	td2->td_inhibitors = 0;
1245	/*
1246	 * If called from msleep(), we are working on the current
1247	 * KSE so fake that we borrowed it. If called from
1248	 * kse_create(), don't, as we have a new kse too.
1249	 */
1250	if (!newkse) {
1251		/*
1252		 * This thread will be scheduled when the current thread
1253		 * blocks, exits or tries to enter userspace (whichever
1254		 * happens first). When that happens the KSE will "revert"
1255		 * to this thread in a BOUND manner. Since we are called
1256		 * from msleep() this is going to be "very soon" in nearly
1257		 * all cases.
1258		 */
1259		ke->ke_bound = td2;
1260		TD_SET_LOAN(td2);
1261	} else {
1262		ke->ke_bound = NULL;
1263		ke->ke_thread = td2;
1264		ke->ke_state = KES_THREAD;
1265		setrunqueue(td2);
1266	}
1267	return (td2);	/* bogus.. should be a void function */
1268}
1269
1270/*
1271 * Schedule an upcall to notify a KSE process that it received signals.
1272 *
1273 * XXX - Modifying a sigset_t like this is totally bogus.
1274 */
1275struct thread *
1276signal_upcall(struct proc *p, int sig)
1277{
1278	struct thread *td, *td2;
1279	struct kse *ke;
1280	sigset_t ss;
1281	int error;
1282
1283	PROC_LOCK_ASSERT(p, MA_OWNED);
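	/* XXXKSE: the unconditional return below disables this function. */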
1284return (NULL);
1285
1286	td = FIRST_THREAD_IN_PROC(p);
1287	ke = td->td_kse;
1288	PROC_UNLOCK(p);
1289	error = copyin(&ke->ke_mailbox->km_sigscaught, &ss, sizeof(sigset_t));
1290	PROC_LOCK(p);
1291	if (error)
1292		return (NULL);
1293	SIGADDSET(ss, sig);
1294	PROC_UNLOCK(p);
1295	error = copyout(&ss, &ke->ke_mailbox->km_sigscaught, sizeof(sigset_t));
1296	PROC_LOCK(p);
1297	if (error)
1298		return (NULL);
1299	if (td->td_standin == NULL)
1300		td->td_standin = thread_alloc();
1301	mtx_lock_spin(&sched_lock);
1302	td2 = thread_schedule_upcall(td, ke); /* Bogus JRE */
1303	mtx_unlock_spin(&sched_lock);
1304	return (td2);
1305}
1306
1307/*
1308 * Setup done on the thread when it enters the kernel.
1309 * XXXKSE Presently only for syscalls but eventually all kernel entries.
1310 */
1311void
1312thread_user_enter(struct proc *p, struct thread *td)
1313{
1314	struct kse *ke;
1315
1316	/*
1317	 * First check that we shouldn't just abort.
1318	 * But check if we are the single thread first!
1319	 * XXX p_singlethread not locked, but should be safe.
1320	 */
1321	if ((p->p_flag & P_WEXIT) && (p->p_singlethread != td)) {
1322		PROC_LOCK(p);
1323		mtx_lock_spin(&sched_lock);
1324		thread_exit();
1325		/* NOTREACHED */
1326	}
1327
1328	/*
1329	 * If we are doing a syscall in a KSE environment,
1330	 * note where our mailbox is. There is always the
1331	 * possibility that we could do this lazily (in sleep()),
1332	 * but for now do it every time.
1333	 */
1334	ke = td->td_kse;
1335	if (ke->ke_mailbox != NULL) {
1336#if 0
1337		td->td_mailbox = (void *)fuword((caddr_t)ke->ke_mailbox
1338		    + offsetof(struct kse_mailbox, km_curthread));
1339#else /* if user pointer arithmetic is ok in the kernel */
1340		td->td_mailbox =
1341		    (void *)fuword( (void *)&ke->ke_mailbox->km_curthread);
1342#endif
1343		if ((td->td_mailbox == NULL) ||
1344		    (td->td_mailbox == (void *)-1)) {
1345			td->td_mailbox = NULL;	/* single thread it.. */
1346			mtx_lock_spin(&sched_lock);
1347			td->td_flags &= ~TDF_UNBOUND;
1348			mtx_unlock_spin(&sched_lock);
1349		} else {
1350			/*
1351			 * When the thread limit is reached, act as if the thread
1352			 * has already done an upcall.
1353			 */
1354		    	if (p->p_numthreads > max_threads_per_proc) {
1355				if (td->td_standin != NULL)
1356					thread_stash(td->td_standin);
1357				td->td_standin = NULL;
1358			} else {
1359				if (td->td_standin == NULL)
1360					td->td_standin = thread_alloc();
1361			}
1362			mtx_lock_spin(&sched_lock);
1363			td->td_flags |= TDF_UNBOUND;
1364			mtx_unlock_spin(&sched_lock);
1365			td->td_usticks = 0;
1366		}
1367	}
1368}
1369
1370/*
1371 * The extra work we go through if we are a threaded process when we
1372 * return to userland.
1373 *
1374 * If we are a KSE process and returning to user mode, check for
1375 * extra work to do before we return (e.g. for more syscalls
1376 * to complete first).  If we were in a critical section, we should
1377 * just return to let it finish. Same if we were in the UTS (in
1378 * which case the mailbox's context's busy indicator will be set).
1379 * The only traps we support will have set the mailbox.
1380 * We will clear it here.
1381 */
1382int
1383thread_userret(struct thread *td, struct trapframe *frame)
1384{
1385	int error;
1386	int unbound;
1387	struct kse *ke;
1388	struct ksegrp *kg;
1389	struct thread *td2;
1390	struct proc *p;
1391	struct timespec ts;
1392
1393	error = 0;
1394
1395	unbound = td->td_flags & TDF_UNBOUND;
1396
1397	kg = td->td_ksegrp;
1398	p = td->td_proc;
1399
1400	/*
1401	 * Originally bound threads never upcall but they may
1402	 * loan out their KSE at this point.
1403 * Upcalls imply bound.. They also may want to do some Philanthropy.
1404	 * Unbound threads on the other hand either yield to other work
1405	 * or transform into an upcall.
1406	 * (having saved their context to user space in both cases)
1407	 */
1408	if (unbound) {
1409		/*
1410		 * We are an unbound thread, looking to return to
1411		 * user space.
1412		 * There are several possibilities:
1413		 * 1) we are using a borrowed KSE. save state and exit.
1414		 *    kse_reassign() will recycle the kse as needed,
1415		 * 2) we are not.. save state, and then convert ourself
1416		 *    to be an upcall, bound to the KSE.
1417		 *    if there are others that need the kse,
1418		 *    give them a chance by doing an mi_switch().
1419		 *    Because we are bound, control will eventually return
1420		 *    to us here.
1421		 * ***
1422		 * Save the thread's context, and link it
1423		 * into the KSEGRP's list of completed threads.
1424		 */
1425		error = thread_export_context(td);
1426		td->td_mailbox = NULL;
1427		td->td_usticks = 0;
1428		if (error) {
1429			/*
1430			 * If we are not running on a borrowed KSE, then
1431			 * failing to do the KSE operation just defaults
1432			 * back to synchronous operation, so just return from
1433			 * the syscall. If it IS borrowed, there is nothing
1434			 * we can do. We just lose that context. We
1435			 * probably should note this somewhere and send
1436			 * the process a signal.
1437			 */
1438			PROC_LOCK(td->td_proc);
1439			psignal(td->td_proc, SIGSEGV);
1440			mtx_lock_spin(&sched_lock);
1441			if (td->td_kse->ke_bound == NULL) {
1442				td->td_flags &= ~TDF_UNBOUND;
1443				PROC_UNLOCK(td->td_proc);
1444				mtx_unlock_spin(&sched_lock);
1445				thread_update_uticks();
1446				return (error);	/* go sync */
1447			}
1448			thread_exit();
1449		}
1450
1451		/*
1452		 * if the KSE is owned and we are borrowing it,
1453		 * don't make an upcall, just exit so that the owner
1454		 * can get its KSE if it wants it.
1455		 * Our context is already safely stored for later
1456		 * use by the UTS.
1457		 */
1458		PROC_LOCK(p);
1459		mtx_lock_spin(&sched_lock);
1460		if (td->td_kse->ke_bound) {
1461			thread_exit();
1462		}
1463		PROC_UNLOCK(p);
1464
1465		/*
1466		 * Turn ourself into a bound upcall.
1467		 * We will rely on kse_reassign()
1468		 * to make us run at a later time.
1469		 * We should look just like a scheduled upcall
1470		 * from msleep() or cv_wait().
1471		 */
1472		td->td_flags &= ~TDF_UNBOUND;
1473		td->td_flags |= TDF_UPCALLING;
1474		/* Only get here if we have become an upcall */
1475
1476	} else {
1477		mtx_lock_spin(&sched_lock);
1478	}
1479	/*
1480	 * We ARE going back to userland with this KSE.
1481	 * Check for threads that need to borrow it.
1482	 * Optimisation: don't call mi_switch if no-one wants the KSE.
1483	 * Any other thread that comes ready after this missed the boat.
1484	 */
1485	ke = td->td_kse;
1486	if ((td2 = kg->kg_last_assigned))
1487		td2 = TAILQ_NEXT(td2, td_runq);
1488	else
1489		td2 = TAILQ_FIRST(&kg->kg_runq);
1490	if (td2)  {
1491		/*
1492		 * force a switch to more urgent 'in kernel'
1493		 * work. Control will return to this thread
1494		 * when there is no more work to do.
1495		 * kse_reassign() will do that for us.
1496		 */
1497		TD_SET_LOAN(td);
1498		ke->ke_bound = td;
1499		ke->ke_thread = NULL;
1500		mi_switch(); /* kse_reassign() will (re)find td2 */
1501	}
1502	mtx_unlock_spin(&sched_lock);
1503
1504	/*
1505	 * Optimisation:
1506	 * Ensure that we have a spare thread available,
1507	 * for when we re-enter the kernel.
1508	 */
1509	if (td->td_standin == NULL) {
1510		if (ke->ke_tdspare) {
1511			td->td_standin = ke->ke_tdspare;
1512			ke->ke_tdspare = NULL;
1513		} else {
1514			td->td_standin = thread_alloc();
1515		}
1516	}
1517
1518	thread_update_uticks();
1519	/*
1520	 * To get here, we know there is no other need for our
1521	 * KSE so we can proceed. If not upcalling, go back to
1522	 * userspace. If we are, get the upcall set up.
1523	 */
1524	if ((td->td_flags & TDF_UPCALLING) == 0)
1525		return (0);
1526
1527	/*
1528	 * We must be an upcall to get this far.
1529	 * There is no more work to do and we are going to ride
1530 * this thread/KSE up to userland as an upcall.
1531	 * Do the last parts of the setup needed for the upcall.
1532	 */
1533	CTR3(KTR_PROC, "userret: upcall thread %p (pid %d, %s)",
1534	    td, td->td_proc->p_pid, td->td_proc->p_comm);
1535
1536	/*
1537	 * Set user context to the UTS.
1538	 */
1539	cpu_set_upcall_kse(td, ke);
1540
1541	/*
1542	 * Put any completed mailboxes on this KSE's list.
1543	 */
1544	error = thread_link_mboxes(kg, ke);
1545	if (error)
1546		goto bad;
1547
1548	/*
1549	 * Set state and mailbox.
1550	 * From now on we are just a bound outgoing process.
1551	 * **Problem** userret is often called several times.
1552 * It would be nice if this all happened only on the first time
1553	 * through. (the scan for extra work etc.)
1554	 */
1555	mtx_lock_spin(&sched_lock);
1556	td->td_flags &= ~TDF_UPCALLING;
1557	mtx_unlock_spin(&sched_lock);
1558#if 0
1559	error = suword((caddr_t)ke->ke_mailbox +
1560	    offsetof(struct kse_mailbox, km_curthread), 0);
1561#else	/* if user pointer arithmetic is ok in the kernel */
1562	error = suword((caddr_t)&ke->ke_mailbox->km_curthread, 0);
1563#endif
1564	ke->ke_uuticks = ke->ke_usticks = 0;
1565	if (!error) {
1566		nanotime(&ts);
1567		if (copyout(&ts, (caddr_t)&ke->ke_mailbox->km_timeofday,
1568		    sizeof(ts))) {
1569			goto bad;
1570		}
1571	}
1572	return (0);
1573
1574bad:
1575	/*
1576	 * Things are going to be so screwed we should just kill the process.
1577 	 * How do we do that?
1578	 */
1579	PROC_LOCK(td->td_proc);
1580	psignal(td->td_proc, SIGSEGV);
1581	PROC_UNLOCK(td->td_proc);
1582	return (error);	/* go sync */
1583}
1584
1585/*
1586 * Enforce single-threading.
1587 *
1588 * Returns 1 if the caller must abort (another thread is waiting to
1589 * exit the process or similar). Process is locked!
1590 * Returns 0 when you are successfully the only thread running.
1591 * A process has successfully single threaded in the suspend mode when
1592 * there are no threads in user mode. Threads in the kernel must be
1593 * allowed to continue until they get to the user boundary. They may even
1594 * copy out their return values and data before suspending. They may however be
1595 * accelerated in reaching the user boundary as we will wake up
1596 * any sleeping threads that are interruptible. (PCATCH).
1597 */
1598int
1599thread_single(int force_exit)
1600{
1601	struct thread *td;
1602	struct thread *td2;
1603	struct proc *p;
1604
1605	td = curthread;
1606	p = td->td_proc;
1607	PROC_LOCK_ASSERT(p, MA_OWNED);
1608	KASSERT((td != NULL), ("curthread is NULL"));
1609
1610	if ((p->p_flag & P_KSES) == 0)
1611		return (0);
1612
1613	/* Is someone already single threading? */
1614	if (p->p_singlethread)
1615		return (1);
1616
1617	if (force_exit == SINGLE_EXIT)
1618		p->p_flag |= P_SINGLE_EXIT;
1619	else
1620		p->p_flag &= ~P_SINGLE_EXIT;
1621	p->p_flag |= P_STOPPED_SINGLE;
1622	p->p_singlethread = td;
1623	/* XXXKSE Which lock protects the below values? */
1624	while ((p->p_numthreads - p->p_suspcount) != 1) {
1625		mtx_lock_spin(&sched_lock);
1626		FOREACH_THREAD_IN_PROC(p, td2) {
1627			if (td2 == td)
1628				continue;
1629			if (TD_IS_INHIBITED(td2)) {
1630				if (force_exit == SINGLE_EXIT) {
1631					if (TD_IS_SUSPENDED(td2)) {
1632						thread_unsuspend_one(td2);
1633					}
1634					if (TD_ON_SLEEPQ(td2) &&
1635					    (td2->td_flags & TDF_SINTR)) {
1636						if (td2->td_flags & TDF_CVWAITQ)
1637							cv_abort(td2);
1638						else
1639							abortsleep(td2);
1640					}
1641				} else {
1642					if (TD_IS_SUSPENDED(td2))
1643						continue;
1644					/* maybe other inhibited states too? */
1645					if (TD_IS_SLEEPING(td2))
1646						thread_suspend_one(td2);
1647				}
1648			}
1649		}
1650		/*
1651		 * Maybe we suspended some threads.. was it enough?
1652		 */
1653		if ((p->p_numthreads - p->p_suspcount) == 1) {
1654			mtx_unlock_spin(&sched_lock);
1655			break;
1656		}
1657
1658		/*
1659		 * Wake us up when everyone else has suspended.
1660		 * In the meantime we suspend as well.
1661		 */
1662		thread_suspend_one(td);
1663		mtx_unlock(&Giant);
1664		PROC_UNLOCK(p);
1665		mi_switch();
1666		mtx_unlock_spin(&sched_lock);
1667		mtx_lock(&Giant);
1668		PROC_LOCK(p);
1669	}
1670	if (force_exit == SINGLE_EXIT)
1671		kse_purge(p, td);
1672	return (0);
1673}
1674
1675/*
1676 * Called in from locations that can safely check to see
1677 * whether we have to suspend or at least throttle for a
1678 * single-thread event (e.g. fork).
1679 *
1680 * Such locations include userret().
1681 * If the "return_instead" argument is non zero, the thread must be able to
1682 * accept 0 (caller may continue), or 1 (caller must abort) as a result.
1683 *
1684 * The 'return_instead' argument tells the function if it may do a
1685 * thread_exit() or suspend, or whether the caller must abort and back
1686 * out instead.
1687 *
1688 * If the thread that set the single_threading request has set the
1689 * P_SINGLE_EXIT bit in the process flags then this call will never return
1690 * if 'return_instead' is false, but will exit.
1691 *
1692 * P_SINGLE_EXIT | return_instead == 0| return_instead != 0
1693 *---------------+--------------------+---------------------
1694 *       0       | returns 0          |   returns 0 or 1
1695 *               | when ST ends       |   immediately
1696 *---------------+--------------------+---------------------
1697 *       1       | thread exits       |   returns 1
1698 *               |                    |  immediately
1699 * 0 = thread_exit() or suspension ok,
1700 * other = return error instead of stopping the thread.
1701 *
1702 * While a full suspension is under effect, even a single threading
1703 * thread would be suspended if it made this call (but it shouldn't).
1704 * This call should only be made from places where
1705 * thread_exit() would be safe as that may be the outcome unless
1706 * return_instead is set.
1707 */
1708int
1709thread_suspend_check(int return_instead)
1710{
1711	struct thread *td;
1712	struct proc *p;
1713	struct kse *ke;
1714	struct ksegrp *kg;
1715
1716	td = curthread;
1717	p = td->td_proc;
1718	kg = td->td_ksegrp;
1719	PROC_LOCK_ASSERT(p, MA_OWNED);
1720	while (P_SHOULDSTOP(p)) {
1721		if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
1722			KASSERT(p->p_singlethread != NULL,
1723			    ("singlethread not set"));
1724			/*
1725			 * The only suspension in action is a
1726			 * single-threading. Single threader need not stop.
1727			 * XXX Should be safe to access unlocked
1728			 * as it can only be set to be true by us.
1729			 */
1730			if (p->p_singlethread == td)
1731				return (0);	/* Exempt from stopping. */
1732		}
1733		if (return_instead)
1734			return (1);
1735
1736		/*
1737		 * If the process is waiting for us to exit,
1738		 * this thread should just suicide.
1739		 * Assumes that P_SINGLE_EXIT implies P_STOPPED_SINGLE.
1740		 */
1741		if ((p->p_flag & P_SINGLE_EXIT) && (p->p_singlethread != td)) {
1742			mtx_lock_spin(&sched_lock);
1743			while (mtx_owned(&Giant))
1744				mtx_unlock(&Giant);
1745			/*
1746			 * Free extra kses and ksegrps.  We needn't worry about
1747			 * whether the current thread is in the same ksegrp as
1748			 * p_singlethread and the last kse in the group could be
1749			 * killed; this is protected by kg_numthreads, which we
1750			 * can deduce must be > 1 in that case.
1751			 */
1752			ke = td->td_kse;
1753			if (ke->ke_bound == NULL &&
1754			    ((kg->kg_kses != 1) || (kg->kg_numthreads == 1)))
1755				ke->ke_flags |= KEF_EXIT;
1756			thread_exit();
1757		}
1758
1759		/*
1760		 * When a thread suspends, it just
1761		 * moves to the process's suspend queue
1762		 * and stays there.
1763		 *
1764		 * XXXKSE if TDF_BOUND is true
1765		 * it will not release its KSE which might
1766		 * lead to deadlock if there are not enough KSEs
1767		 * to complete all waiting threads.
1768		 * Maybe be able to 'lend' it out again.
1769		 * (lent kse's can not go back to userland?)
1770		 * and can only be lent in STOPPED state.
1771		 */
1772		mtx_lock_spin(&sched_lock);
1773		if ((p->p_flag & P_STOPPED_SIG) &&
1774		    (p->p_suspcount+1 == p->p_numthreads)) {
1775			mtx_unlock_spin(&sched_lock);
1776			PROC_LOCK(p->p_pptr);
1777			if ((p->p_pptr->p_procsig->ps_flag &
1778				PS_NOCLDSTOP) == 0) {
1779				psignal(p->p_pptr, SIGCHLD);
1780			}
1781			PROC_UNLOCK(p->p_pptr);
1782			mtx_lock_spin(&sched_lock);
1783		}
1784		mtx_assert(&Giant, MA_NOTOWNED);
1785		thread_suspend_one(td);
1786		PROC_UNLOCK(p);
1787		if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
1788			if (p->p_numthreads == p->p_suspcount) {
1789				thread_unsuspend_one(p->p_singlethread);
1790			}
1791		}
1792		p->p_stats->p_ru.ru_nivcsw++;
1793		mi_switch();
1794		mtx_unlock_spin(&sched_lock);
1795		PROC_LOCK(p);
1796	}
1797	return (0);
1798}
1799
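/*
 * Suspend one thread: mark it suspended and move it onto the process's
 * queue of suspended threads.  Called with sched_lock held.
 */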
1800void
1801thread_suspend_one(struct thread *td)
1802{
1803	struct proc *p = td->td_proc;
1804
1805	mtx_assert(&sched_lock, MA_OWNED);
1806	p->p_suspcount++;
1807	TD_SET_SUSPENDED(td);
1808	TAILQ_INSERT_TAIL(&p->p_suspended, td, td_runq);
1809	/*
1810	 * Hack: If we are suspending but are on the sleep queue
1811	 * then we are in msleep or the cv equivalent. We
1812	 * want to look like we have two Inhibitors.
1813	 * May already be set.. doesn't matter.
1814	 */
1815	if (TD_ON_SLEEPQ(td))
1816		TD_SET_SLEEPING(td);
1817}
1818
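/*
 * Undo a single suspension: take the thread off the suspended queue and
 * make it runnable again.  Called with sched_lock held.
 */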
1819void
1820thread_unsuspend_one(struct thread *td)
1821{
1822	struct proc *p = td->td_proc;
1823
1824	mtx_assert(&sched_lock, MA_OWNED);
1825	TAILQ_REMOVE(&p->p_suspended, td, td_runq);
1826	TD_CLR_SUSPENDED(td);
1827	p->p_suspcount--;
1828	setrunnable(td);
1829}
1830
1831/*
1832 * Allow all threads blocked by single threading to continue running.
1833 */
1834void
1835thread_unsuspend(struct proc *p)
1836{
1837	struct thread *td;
1838
1839	mtx_assert(&sched_lock, MA_OWNED);
1840	PROC_LOCK_ASSERT(p, MA_OWNED);
1841	if (!P_SHOULDSTOP(p)) {
1842		while (( td = TAILQ_FIRST(&p->p_suspended))) {
1843			thread_unsuspend_one(td);
1844		}
1845	} else if ((P_SHOULDSTOP(p) == P_STOPPED_SINGLE) &&
1846	    (p->p_numthreads == p->p_suspcount)) {
1847		/*
1848		 * Stopping everything also did the job for the single
1849		 * threading request. Now we've downgraded to single-threaded,
1850		 * let it continue.
1851		 */
1852		thread_unsuspend_one(p->p_singlethread);
1853	}
1854}
1855
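/*
 * End the single-threading begun by thread_single(): clear the
 * single-threading state and let any suspended threads run again, unless
 * the whole process is under a blanket stop order.
 */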
1856void
1857thread_single_end(void)
1858{
1859	struct thread *td;
1860	struct proc *p;
1861
1862	td = curthread;
1863	p = td->td_proc;
1864	PROC_LOCK_ASSERT(p, MA_OWNED);
1865	p->p_flag &= ~P_STOPPED_SINGLE;
1866	p->p_singlethread = NULL;
1867	/*
1868	 * If there are other threads they may now run,
1869	 * unless of course there is a blanket 'stop order'
1870	 * on the process. The single threader must be allowed
1871	 * to continue however as this is a bad place to stop.
1872	 */
1873	if ((p->p_numthreads != 1) && (!P_SHOULDSTOP(p))) {
1874		mtx_lock_spin(&sched_lock);
1875		while (( td = TAILQ_FIRST(&p->p_suspended))) {
1876			thread_unsuspend_one(td);
1877		}
1878		mtx_unlock_spin(&sched_lock);
1879	}
1880}
1881
1882
1883