kern_thread.c revision 109550
1/*
2 * Copyright (C) 2001 Julian Elischer <julian@freebsd.org>.
3 *  All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice(s), this list of conditions and the following disclaimer as
10 *    the first lines of this file unmodified other than the possible
11 *    addition of one or more copyright notices.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice(s), this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY
17 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY
20 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
26 * DAMAGE.
27 *
28 * $FreeBSD: head/sys/kern/kern_thread.c 109550 2003-01-20 03:41:04Z julian $
29 */
30
31#include <sys/param.h>
32#include <sys/systm.h>
33#include <sys/kernel.h>
34#include <sys/lock.h>
35#include <sys/malloc.h>
36#include <sys/mutex.h>
37#include <sys/proc.h>
38#include <sys/smp.h>
39#include <sys/sysctl.h>
40#include <sys/sysproto.h>
41#include <sys/filedesc.h>
42#include <sys/sched.h>
43#include <sys/signalvar.h>
44#include <sys/sx.h>
45#include <sys/tty.h>
46#include <sys/user.h>
47#include <sys/jail.h>
48#include <sys/kse.h>
49#include <sys/ktr.h>
50#include <sys/ucontext.h>
51
52#include <vm/vm.h>
53#include <vm/vm_object.h>
54#include <vm/pmap.h>
55#include <vm/uma.h>
56#include <vm/vm_map.h>
57
58#include <machine/frame.h>
59
60/*
61 * KSEGRP related storage.
62 */
63static uma_zone_t ksegrp_zone;
64static uma_zone_t kse_zone;
65static uma_zone_t thread_zone;
66
67/* DEBUG ONLY */
68SYSCTL_NODE(_kern, OID_AUTO, threads, CTLFLAG_RW, 0, "thread allocation");
69static int thread_debug = 0;
70SYSCTL_INT(_kern_threads, OID_AUTO, debug, CTLFLAG_RW,
71	&thread_debug, 0, "thread debug");
72
73static int max_threads_per_proc = 30;
74SYSCTL_INT(_kern_threads, OID_AUTO, max_threads_per_proc, CTLFLAG_RW,
75	&max_threads_per_proc, 0, "Limit on threads per proc");
76
77static int max_groups_per_proc = 5;
78SYSCTL_INT(_kern_threads, OID_AUTO, max_groups_per_proc, CTLFLAG_RW,
79	&max_groups_per_proc, 0, "Limit on thread groups per proc");
80
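/*
 * RANGEOF() gives the size in bytes of the span of 'type' that starts at
 * member 'start' and ends just before member 'end'.  It is used with
 * bzero()/bcopy() below to clear or copy only the "zero"/"copy" subranges
 * of the thread, kse and ksegrp structures, for example:
 *
 *	bzero(&td->td_startzero,
 *	    RANGEOF(struct thread, td_startzero, td_endzero));
 */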
81#define RANGEOF(type, start, end) (offsetof(type, end) - offsetof(type, start))
82
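/*
 * Threads, KSEs and KSE groups that cannot conveniently be freed at the
 * point where they are released are parked on these "zombie" queues under
 * a spin lock and reclaimed later by thread_reap().
 */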
83struct threadqueue zombie_threads = TAILQ_HEAD_INITIALIZER(zombie_threads);
84TAILQ_HEAD(, kse) zombie_kses = TAILQ_HEAD_INITIALIZER(zombie_kses);
85TAILQ_HEAD(, ksegrp) zombie_ksegrps = TAILQ_HEAD_INITIALIZER(zombie_ksegrps);
86struct mtx zombie_thread_lock;
87MTX_SYSINIT(zombie_thread_lock, &zombie_thread_lock,
88    "zombie_thread_lock", MTX_SPIN);
89
90static void kse_purge(struct proc *p, struct thread *td);
91
92/*
93 * Prepare a thread for use.
94 */
95static void
96thread_ctor(void *mem, int size, void *arg)
97{
98	struct thread	*td;
99
100	td = (struct thread *)mem;
101	td->td_state = TDS_INACTIVE;
102	td->td_flags |= TDF_UNBOUND;
103}
104
105/*
106 * Reclaim a thread after use.
107 */
108static void
109thread_dtor(void *mem, int size, void *arg)
110{
111	struct thread	*td;
112
113	td = (struct thread *)mem;
114
115#ifdef INVARIANTS
116	/* Verify that this thread is in a safe state to free. */
117	switch (td->td_state) {
118	case TDS_INHIBITED:
119	case TDS_RUNNING:
120	case TDS_CAN_RUN:
121	case TDS_RUNQ:
122		/*
123		 * We must never unlink a thread that is in one of
124		 * these states, because it is currently active.
125		 */
126		panic("bad state for thread unlinking");
127		/* NOTREACHED */
128	case TDS_INACTIVE:
129		break;
130	default:
131		panic("bad thread state");
132		/* NOTREACHED */
133	}
134#endif
135}
136
137/*
138 * Initialize type-stable parts of a thread (when newly created).
139 */
140static void
141thread_init(void *mem, int size)
142{
143	struct thread	*td;
144
145	td = (struct thread *)mem;
146	mtx_lock(&Giant);
147	pmap_new_thread(td, 0);
148	mtx_unlock(&Giant);
149	cpu_thread_setup(td);
150	td->td_sched = (struct td_sched *)&td[1];
151}
152
153/*
154 * Tear down type-stable parts of a thread (just before being discarded).
155 */
156static void
157thread_fini(void *mem, int size)
158{
159	struct thread	*td;
160
161	td = (struct thread *)mem;
162	pmap_dispose_thread(td);
163}
164/*
165 * Initialize type-stable parts of a kse (when newly created).
166 */
167static void
168kse_init(void *mem, int size)
169{
170	struct kse	*ke;
171
172	ke = (struct kse *)mem;
173	ke->ke_sched = (struct ke_sched *)&ke[1];
174}
175/*
176 * Initialize type-stable parts of a ksegrp (when newly created).
177 */
178static void
179ksegrp_init(void *mem, int size)
180{
181	struct ksegrp	*kg;
182
183	kg = (struct ksegrp *)mem;
184	kg->kg_sched = (struct kg_sched *)&kg[1];
185}
186
187/*
188 * Link a KSE into its KSE group and initialize its fields.
189 */
190void
191kse_link(struct kse *ke, struct ksegrp *kg)
192{
193	struct proc *p = kg->kg_proc;
194
195	TAILQ_INSERT_HEAD(&kg->kg_kseq, ke, ke_kglist);
196	kg->kg_kses++;
197	ke->ke_state = KES_UNQUEUED;
198	ke->ke_proc	= p;
199	ke->ke_ksegrp	= kg;
200	ke->ke_owner	= NULL;
201	ke->ke_thread	= NULL;
202	ke->ke_oncpu = NOCPU;
203}
204
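/*
 * Remove a KSE from its group (unlinking the group too if it was the last
 * KSE in it) and stash it on the zombie list for later reaping.
 * Called with sched_lock held.
 */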
205void
206kse_unlink(struct kse *ke)
207{
208	struct ksegrp *kg;
209
210	mtx_assert(&sched_lock, MA_OWNED);
211	kg = ke->ke_ksegrp;
212
213	TAILQ_REMOVE(&kg->kg_kseq, ke, ke_kglist);
214	if (--kg->kg_kses == 0) {
215			ksegrp_unlink(kg);
216	}
217	/*
218	 * Aggregate stats from the KSE
219	 */
220	kse_stash(ke);
221}
222
223void
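/*
 * Initialize a KSE group and link it into its process.
 */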
224ksegrp_link(struct ksegrp *kg, struct proc *p)
225{
226
227	TAILQ_INIT(&kg->kg_threads);
228	TAILQ_INIT(&kg->kg_runq);	/* links with td_runq */
229	TAILQ_INIT(&kg->kg_slpq);	/* links with td_runq */
230	TAILQ_INIT(&kg->kg_kseq);	/* all kses in ksegrp */
231	TAILQ_INIT(&kg->kg_lq);		/* loan kses in ksegrp */
232	kg->kg_proc	= p;
233/* the following counters are in the -zero- section and may not need clearing */
234	kg->kg_numthreads = 0;
235	kg->kg_runnable = 0;
236	kg->kg_kses = 0;
237	kg->kg_loan_kses = 0;
238	kg->kg_runq_kses = 0; /* XXXKSE change name */
239/* link it in now that it's consistent */
240	p->p_numksegrps++;
241	TAILQ_INSERT_HEAD(&p->p_ksegrps, kg, kg_ksegrp);
242}
243
244void
245ksegrp_unlink(struct ksegrp *kg)
246{
247	struct proc *p;
248
249	mtx_assert(&sched_lock, MA_OWNED);
250	p = kg->kg_proc;
251	KASSERT(((kg->kg_numthreads == 0) && (kg->kg_kses == 0)),
252	    ("kseg_unlink: residual threads or KSEs"));
253	TAILQ_REMOVE(&p->p_ksegrps, kg, kg_ksegrp);
254	p->p_numksegrps--;
255	/*
256	 * Aggregate stats from the KSE
257	 */
258	ksegrp_stash(kg);
259}
260
261/*
262 * For a newly created process,
263 * link up the structure and its initial threads etc.
264 */
265void
266proc_linkup(struct proc *p, struct ksegrp *kg,
267			struct kse *ke, struct thread *td)
268{
269
270	TAILQ_INIT(&p->p_ksegrps);	     /* all ksegrps in proc */
271	TAILQ_INIT(&p->p_threads);	     /* all threads in proc */
272	TAILQ_INIT(&p->p_suspended);	     /* Threads suspended */
273	p->p_numksegrps = 0;
274	p->p_numthreads = 0;
275
276	ksegrp_link(kg, p);
277	kse_link(ke, kg);
278	thread_link(td, kg);
279}
280
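/*
 * Interrupt the thread whose mailbox is uap->tmbx: mark it TDF_INTERRUPT
 * and, if it is in an interruptible sleep, abort that sleep.
 */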
281int
282kse_thr_interrupt(struct thread *td, struct kse_thr_interrupt_args *uap)
283{
284	struct proc *p;
285	struct thread *td2;
286
287	p = td->td_proc;
288	/* KSE-enabled processes only, please. */
289	if (!(p->p_flag & P_KSES))
290		return (EINVAL);
291	if (uap->tmbx == NULL)
292		return (EINVAL);
293	mtx_lock_spin(&sched_lock);
294	FOREACH_THREAD_IN_PROC(p, td2) {
295		if (td2->td_mailbox == uap->tmbx) {
296			td2->td_flags |= TDF_INTERRUPT;
297			if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR)) {
298				if (td2->td_flags & TDF_CVWAITQ)
299					cv_abort(td2);
300				else
301					abortsleep(td2);
302			}
303			mtx_unlock_spin(&sched_lock);
304			return (0);
305		}
306	}
307	mtx_unlock_spin(&sched_lock);
308	return (ESRCH);
309}
310
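/*
 * Called by the UTS to shut down its KSE.  Fails with EDEADLK if this is
 * the last KSE in a group that still has other threads.  If this is the
 * last thread in the process, simply turn off KSE mode; otherwise exit
 * this thread and let the KSE be reclaimed.
 */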
311int
312kse_exit(struct thread *td, struct kse_exit_args *uap)
313{
314	struct proc *p;
315	struct ksegrp *kg;
316	struct kse *ke;
317
318	p = td->td_proc;
319	/* Only UTS can do the syscall */
320	if (!(p->p_flag & P_KSES) || (td->td_mailbox != NULL))
321		return (EINVAL);
322	kg = td->td_ksegrp;
323	/* serialize killing kse */
324	PROC_LOCK(p);
325	mtx_lock_spin(&sched_lock);
326	if ((kg->kg_kses == 1) && (kg->kg_numthreads > 1)) {
327		mtx_unlock_spin(&sched_lock);
328		PROC_UNLOCK(p);
329		return (EDEADLK);
330	}
331	ke = td->td_kse;
332	if (p->p_numthreads == 1) {
333		ke->ke_flags &= ~KEF_DOUPCALL;
334		ke->ke_mailbox = NULL;
335		p->p_flag &= ~P_KSES;
336		mtx_unlock_spin(&sched_lock);
337		PROC_UNLOCK(p);
338	} else {
339		ke->ke_flags |= KEF_EXIT;
340		thread_exit();
341		/* NOTREACHED */
342	}
343	return (0);
344}
345
346/*
347 * Either becomes an upcall or waits for an awakening event and
348 * THEN becomes an upcall. Only error cases return.
349 */
350int
351kse_release(struct thread * td, struct kse_release_args * uap)
352{
353	struct proc *p;
354	struct ksegrp *kg;
355
356	p = td->td_proc;
357	kg = td->td_ksegrp;
358	/*
359	 * kse must have a mailbox ready for upcall, and only UTS can
360	 * do the syscall.
361	 */
362	if (!(p->p_flag & P_KSES) ||
363	    (td->td_mailbox != NULL) ||
364	    (td->td_kse->ke_mailbox == NULL))
365		return (EINVAL);
366
367	PROC_LOCK(p);
368	mtx_lock_spin(&sched_lock);
369	/* Change OURSELF to become an upcall. */
370	td->td_flags = TDF_UPCALLING; /* BOUND */
371	if (!(td->td_kse->ke_flags & (KEF_DOUPCALL|KEF_ASTPENDING)) &&
372	    (kg->kg_completed == NULL)) {
373		/*
374		 * The KSE will however be lendable.
375		 */
376		TD_SET_IDLE(td);
377		PROC_UNLOCK(p);
378		p->p_stats->p_ru.ru_nvcsw++;
379		mi_switch();
380		mtx_unlock_spin(&sched_lock);
381	} else {
382		mtx_unlock_spin(&sched_lock);
383		PROC_UNLOCK(p);
384	}
385	return (0);
386}
387
388/* struct kse_wakeup_args {
389	struct kse_mailbox *mbx;
390}; */
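/*
 * Wake the KSE that owns the given mailbox (or, if no mailbox is given,
 * any idle KSE in the caller's group) so that it performs an upcall as
 * soon as possible.
 */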
391int
392kse_wakeup(struct thread *td, struct kse_wakeup_args *uap)
393{
394	struct proc *p;
395	struct kse *ke;
396	struct ksegrp *kg;
397	struct thread *td2;
398
399	p = td->td_proc;
400	td2 = NULL;
401	/* KSE-enabled processes only, please. */
402	if (!(p->p_flag & P_KSES))
403		return EINVAL;
404
405	mtx_lock_spin(&sched_lock);
406	if (uap->mbx) {
407		FOREACH_KSEGRP_IN_PROC(p, kg) {
408			FOREACH_KSE_IN_GROUP(kg, ke) {
409				if (ke->ke_mailbox != uap->mbx)
410					continue;
411				td2 = ke->ke_owner;
412				KASSERT((td2 != NULL),("KSE with no owner"));
413				break;
414			}
415			if (td2) {
416				break;
417			}
418		}
419	} else {
420		/*
421		 * look for any idle KSE to resurrect.
422		 */
423		kg = td->td_ksegrp;
424		FOREACH_KSE_IN_GROUP(kg, ke) {
425			td2 = ke->ke_owner;
426			KASSERT((td2 != NULL),("KSE with no owner2"));
427			if (TD_IS_IDLE(td2))
428				break;
429		}
430		KASSERT((td2 != NULL), ("no thread(s)"));
431	}
432	if (td2) {
433		if (TD_IS_IDLE(td2)) {
434			TD_CLR_IDLE(td2);
435			setrunnable(td2);
436		} else if (td != td2) {
437			/* guarantee an upcall is done ASAP */
438			td2->td_kse->ke_flags |= KEF_DOUPCALL;
439		}
440		mtx_unlock_spin(&sched_lock);
441		return (0);
442	}
443	mtx_unlock_spin(&sched_lock);
444	return (ESRCH);
445}
446
447/*
448 * No new KSEG: on the first call, use the current KSE and don't schedule an upcall.
449 * In all other situations, allocate a new KSE and schedule an upcall on it.
450 */
451/* struct kse_create_args {
452	struct kse_mailbox *mbx;
453	int newgroup;
454}; */
455int
456kse_create(struct thread *td, struct kse_create_args *uap)
457{
458	struct kse *newke;
459	struct kse *ke;
460	struct ksegrp *newkg;
461	struct ksegrp *kg;
462	struct proc *p;
463	struct kse_mailbox mbx;
464	int err;
465
466	p = td->td_proc;
467	if ((err = copyin(uap->mbx, &mbx, sizeof(mbx))))
468		return (err);
469
470	p->p_flag |= P_KSES; /* easier to just set it than to test and set */
471	kg = td->td_ksegrp;
472	if (uap->newgroup) {
473		if (p->p_numksegrps >= max_groups_per_proc)
474			return (EPROCLIM);
475		/*
476		 * If we want a new KSEGRP it doesn't matter whether
477		 * we have already fired up KSE mode before or not.
478		 * We put the process in KSE mode and create a new KSEGRP
479		 * and KSE. If our KSE has not got a mailbox yet then
480		 * that doesn't matter, just leave it that way. It will
481		 * ensure that this thread stays BOUND. It's possible
482		 * that the call came from a threaded library and the main
483		 * program knows nothing of threads.
484		 */
485		newkg = ksegrp_alloc();
486		bzero(&newkg->kg_startzero, RANGEOF(struct ksegrp,
487		      kg_startzero, kg_endzero));
488		bcopy(&kg->kg_startcopy, &newkg->kg_startcopy,
489		      RANGEOF(struct ksegrp, kg_startcopy, kg_endcopy));
490		newke = kse_alloc();
491	} else {
492		/*
493		 * Otherwise, if we have already set this KSE
494		 * to have a mailbox, we want to make another KSE here,
495		 * but only if we are not already at the limit, which
496		 * is 1 per CPU max.
497		 *
498		 * If the current KSE doesn't have a mailbox we just use it
499		 * and give it one.
500		 *
501		 * Because we don't like to access
502		 * the KSE outside of schedlock if we are UNBOUND,
503		 * (because it can change if we are preempted by an interrupt)
504		 * we can deduce it as having a mailbox if we are UNBOUND,
505		 * and only need to actually look at it if we are BOUND,
506		 * which is safe.
507		 */
508		if ((td->td_flags & TDF_UNBOUND) || td->td_kse->ke_mailbox) {
509			if (thread_debug == 0) { /* if debugging, allow more */
510#ifdef SMP
511			if (kg->kg_kses > mp_ncpus)
512#endif
513				return (EPROCLIM);
514			}
515			newke = kse_alloc();
516		} else {
517			newke = NULL;
518		}
519		newkg = NULL;
520	}
521	if (newke) {
522		bzero(&newke->ke_startzero, RANGEOF(struct kse,
523		      ke_startzero, ke_endzero));
524#if 0
525		bcopy(&ke->ke_startcopy, &newke->ke_startcopy,
526		      RANGEOF(struct kse, ke_startcopy, ke_endcopy));
527#endif
528		/* For the first call this may not have been set */
529		if (td->td_standin == NULL) {
530			td->td_standin = thread_alloc();
531		}
532		mtx_lock_spin(&sched_lock);
533		if (newkg) {
534			if (p->p_numksegrps >= max_groups_per_proc) {
535				mtx_unlock_spin(&sched_lock);
536				ksegrp_free(newkg);
537				kse_free(newke);
538				return (EPROCLIM);
539			}
540			ksegrp_link(newkg, p);
541		}
542		else
543			newkg = kg;
544		kse_link(newke, newkg);
545		if (p->p_sflag & PS_NEEDSIGCHK)
546			newke->ke_flags |= KEF_ASTPENDING;
547		newke->ke_mailbox = uap->mbx;
548		newke->ke_upcall = mbx.km_func;
549		bcopy(&mbx.km_stack, &newke->ke_stack, sizeof(stack_t));
550		thread_schedule_upcall(td, newke);
551		mtx_unlock_spin(&sched_lock);
552	} else {
553		/*
554		 * If we didn't allocate a new KSE then we are using
555		 * the existing (BOUND) kse.
556		 */
557		ke = td->td_kse;
558		ke->ke_mailbox = uap->mbx;
559		ke->ke_upcall = mbx.km_func;
560		bcopy(&mbx.km_stack, &ke->ke_stack, sizeof(stack_t));
561	}
562	/*
563	 * Fill out the KSE-mode specific fields of the new kse.
564	 */
565	return (0);
566}
567
568/*
569 * Fill a ucontext_t with a thread's context information.
570 *
571 * This is an analogue to getcontext(3).
572 */
573void
574thread_getcontext(struct thread *td, ucontext_t *uc)
575{
576
577/*
578 * XXX this is declared in a MD include file, i386/include/ucontext.h but
579 * is used in MI code.
580 */
581#ifdef __i386__
582	get_mcontext(td, &uc->uc_mcontext);
583#endif
584	uc->uc_sigmask = td->td_proc->p_sigmask;
585}
586
587/*
588 * Set a thread's context from a ucontext_t.
589 *
590 * This is an analogue to setcontext(3).
591 */
592int
593thread_setcontext(struct thread *td, ucontext_t *uc)
594{
595	int ret;
596
597/*
598 * XXX this is declared in a MD include file, i386/include/ucontext.h but
599 * is used in MI code.
600 */
601#ifdef __i386__
602	ret = set_mcontext(td, &uc->uc_mcontext);
603#else
604	ret = ENOSYS;
605#endif
606	if (ret == 0) {
607		SIG_CANTMASK(uc->uc_sigmask);
608		PROC_LOCK(td->td_proc);
609		td->td_proc->p_sigmask = uc->uc_sigmask;
610		PROC_UNLOCK(td->td_proc);
611	}
612	return (ret);
613}
614
615/*
616 * Initialize global thread allocation resources.
617 */
618void
619threadinit(void)
620{
621
622#ifndef __ia64__
623	thread_zone = uma_zcreate("THREAD", sched_sizeof_thread(),
624	    thread_ctor, thread_dtor, thread_init, thread_fini,
625	    UMA_ALIGN_CACHE, 0);
626#else
627	/*
628	 * XXX the ia64 kstack allocator is really lame and is at the mercy
629	 * of contigmalloc().  This hackery is to pre-construct a whole
630	 * pile of thread structures with associated kernel stacks early
631	 * in the system startup while contigmalloc() still works. Once we
632	 * have them, keep them.  Sigh.
633	 */
634	thread_zone = uma_zcreate("THREAD", sched_sizeof_thread(),
635	    thread_ctor, thread_dtor, thread_init, thread_fini,
636	    UMA_ALIGN_CACHE, UMA_ZONE_NOFREE);
637	uma_prealloc(thread_zone, 512);		/* XXX arbitrary */
638#endif
639	ksegrp_zone = uma_zcreate("KSEGRP", sched_sizeof_ksegrp(),
640	    NULL, NULL, ksegrp_init, NULL,
641	    UMA_ALIGN_CACHE, 0);
642	kse_zone = uma_zcreate("KSE", sched_sizeof_kse(),
643	    NULL, NULL, kse_init, NULL,
644	    UMA_ALIGN_CACHE, 0);
645}
646
647/*
648 * Stash an embarrassingly extra thread into the zombie thread queue.
649 */
650void
651thread_stash(struct thread *td)
652{
653	mtx_lock_spin(&zombie_thread_lock);
654	TAILQ_INSERT_HEAD(&zombie_threads, td, td_runq);
655	mtx_unlock_spin(&zombie_thread_lock);
656}
657
658/*
659 * Stash an embarrassingly extra kse into the zombie kse queue.
660 */
661void
662kse_stash(struct kse *ke)
663{
664	mtx_lock_spin(&zombie_thread_lock);
665	TAILQ_INSERT_HEAD(&zombie_kses, ke, ke_procq);
666	mtx_unlock_spin(&zombie_thread_lock);
667}
668
669/*
670 * Stash an embarrassingly extra ksegrp into the zombie ksegrp queue.
671 */
672void
673ksegrp_stash(struct ksegrp *kg)
674{
675	mtx_lock_spin(&zombie_thread_lock);
676	TAILQ_INSERT_HEAD(&zombie_ksegrps, kg, kg_ksegrp);
677	mtx_unlock_spin(&zombie_thread_lock);
678}
679
680/*
681 * Reap zombie threads.
682 */
683void
684thread_reap(void)
685{
686	struct thread *td_first, *td_next;
687	struct kse *ke_first, *ke_next;
688	struct ksegrp *kg_first, * kg_next;
689
690	/*
691	 * Don't even bother to lock if there are none at this instant;
692	 * we really don't care about the next instant.
693	 */
694	if ((!TAILQ_EMPTY(&zombie_threads))
695	    || (!TAILQ_EMPTY(&zombie_kses))
696	    || (!TAILQ_EMPTY(&zombie_ksegrps))) {
697		mtx_lock_spin(&zombie_thread_lock);
698		td_first = TAILQ_FIRST(&zombie_threads);
699		ke_first = TAILQ_FIRST(&zombie_kses);
700		kg_first = TAILQ_FIRST(&zombie_ksegrps);
701		if (td_first)
702			TAILQ_INIT(&zombie_threads);
703		if (ke_first)
704			TAILQ_INIT(&zombie_kses);
705		if (kg_first)
706			TAILQ_INIT(&zombie_ksegrps);
707		mtx_unlock_spin(&zombie_thread_lock);
708		while (td_first) {
709			td_next = TAILQ_NEXT(td_first, td_runq);
710			thread_free(td_first);
711			td_first = td_next;
712		}
713		while (ke_first) {
714			ke_next = TAILQ_NEXT(ke_first, ke_procq);
715			kse_free(ke_first);
716			ke_first = ke_next;
717		}
718		while (kg_first) {
719			kg_next = TAILQ_NEXT(kg_first, kg_ksegrp);
720			ksegrp_free(kg_first);
721			kg_first = kg_next;
722		}
723	}
724}
725
726/*
727 * Allocate a ksegrp.
728 */
729struct ksegrp *
730ksegrp_alloc(void)
731{
732	return (uma_zalloc(ksegrp_zone, M_WAITOK));
733}
734
735/*
736 * Allocate a kse.
737 */
738struct kse *
739kse_alloc(void)
740{
741	return (uma_zalloc(kse_zone, M_WAITOK));
742}
743
744/*
745 * Allocate a thread.
746 */
747struct thread *
748thread_alloc(void)
749{
750	thread_reap(); /* check if any zombies to get */
751	return (uma_zalloc(thread_zone, M_WAITOK));
752}
753
754/*
755 * Deallocate a ksegrp.
756 */
757void
758ksegrp_free(struct ksegrp *td)
759{
760	uma_zfree(ksegrp_zone, td);
761}
762
763/*
764 * Deallocate a kse.
765 */
766void
767kse_free(struct kse *td)
768{
769	uma_zfree(kse_zone, td);
770}
771
772/*
773 * Deallocate a thread.
774 */
775void
776thread_free(struct thread *td)
777{
778
779	cpu_thread_clean(td);
780	uma_zfree(thread_zone, td);
781}
782
783/*
784 * Store the thread context in the UTS's mailbox,
785 * then add the mailbox at the head of a list we are building in user space.
786 * The list is anchored in the ksegrp structure.
787 */
788int
789thread_export_context(struct thread *td)
790{
791	struct proc *p;
792	struct ksegrp *kg;
793	uintptr_t mbx;
794	void *addr;
795	int error;
796	ucontext_t uc;
797	uint temp;
798
799	p = td->td_proc;
800	kg = td->td_ksegrp;
801
802	/* Export the user/machine context. */
803#if 0
804	addr = (caddr_t)td->td_mailbox +
805	    offsetof(struct kse_thr_mailbox, tm_context);
806#else /* if user pointer arithmetic is valid in the kernel */
807		addr = (void *)(&td->td_mailbox->tm_context);
808#endif
809	error = copyin(addr, &uc, sizeof(ucontext_t));
810	if (error)
811		goto bad;
812
813	thread_getcontext(td, &uc);
814	error = copyout(&uc, addr, sizeof(ucontext_t));
815	if (error)
816		goto bad;
817
818	/* get address in latest mbox of list pointer */
819#if 0
820	addr = (caddr_t)td->td_mailbox
821	    + offsetof(struct kse_thr_mailbox , tm_next);
822#else /* if user pointer arithmetic is valid in the kernel */
823	addr = (void *)(&td->td_mailbox->tm_next);
824#endif
825	/*
826	 * Put the saved address of the previous first
827	 * entry into this one
828	 */
829	for (;;) {
830		mbx = (uintptr_t)kg->kg_completed;
831		if (suword(addr, mbx)) {
832			error = EFAULT;
833			goto bad;
834		}
835		PROC_LOCK(p);
836		if (mbx == (uintptr_t)kg->kg_completed) {
837			kg->kg_completed = td->td_mailbox;
838			PROC_UNLOCK(p);
839			break;
840		}
841		PROC_UNLOCK(p);
842	}
843	addr = (caddr_t)td->td_mailbox
844		 + offsetof(struct kse_thr_mailbox, tm_sticks);
845	temp = fuword(addr) + td->td_usticks;
846	if (suword(addr, temp))
847		goto bad;
848	return (0);
849
850bad:
851	PROC_LOCK(p);
852	psignal(p, SIGSEGV);
853	PROC_UNLOCK(p);
854	return (error);
855}
856
857/*
858 * Take the list of completed mailboxes for this KSEGRP and put them on this
859 * KSE's mailbox as it's the next one going up.
860 */
861static int
862thread_link_mboxes(struct ksegrp *kg, struct kse *ke)
863{
864	struct proc *p = kg->kg_proc;
865	void *addr;
866	uintptr_t mbx;
867
868#if 0
869	addr = (caddr_t)ke->ke_mailbox
870	    + offsetof(struct kse_mailbox, km_completed);
871#else /* if user pointer arithmetic is valid in the kernel */
872		addr = (void *)(&ke->ke_mailbox->km_completed);
873#endif
874	for (;;) {
875		mbx = (uintptr_t)kg->kg_completed;
876		if (suword(addr, mbx)) {
877			PROC_LOCK(p);
878			psignal(p, SIGSEGV);
879			PROC_UNLOCK(p);
880			return (EFAULT);
881		}
882		/* XXXKSE could use atomic CMPXCH here */
883		PROC_LOCK(p);
884		if (mbx == (uintptr_t)kg->kg_completed) {
885			kg->kg_completed = NULL;
886			PROC_UNLOCK(p);
887			break;
888		}
889		PROC_UNLOCK(p);
890	}
891	return (0);
892}
893
894/*
895 * This function should be called at statclock interrupt time.
896 */
897int
898thread_add_ticks_intr(int user, uint ticks)
899{
900	struct thread *td = curthread;
901	struct kse *ke = td->td_kse;
902
903	if (ke->ke_mailbox == NULL)
904		return -1;
905	if (user) {
906		/* Currently always done via ast() */
907		ke->ke_flags |= KEF_ASTPENDING;
908		ke->ke_uuticks += ticks;
909	} else {
910		if (td->td_mailbox != NULL)
911			td->td_usticks += ticks;
912		else
913			ke->ke_usticks += ticks;
914	}
915	return 0;
916}
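/*
 * Push the tick counts accumulated in the KSE out to the current thread
 * mailbox in userland.  On a copyout fault the process is sent SIGSEGV
 * and -1 is returned.
 */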
917
918static int
919thread_update_uticks(void)
920{
921	struct thread *td = curthread;
922	struct proc *p = td->td_proc;
923	struct kse *ke = td->td_kse;
924	struct kse_thr_mailbox *tmbx;
925	caddr_t addr;
926	uint uticks, sticks;
927
928	if (ke->ke_mailbox == NULL)
929		return 0;
930
931	uticks = ke->ke_uuticks;
932	ke->ke_uuticks = 0;
933	sticks = ke->ke_usticks;
934	ke->ke_usticks = 0;
935#if 0
936	tmbx = (void *)fuword((caddr_t)ke->ke_mailbox
937	    + offsetof(struct kse_mailbox, km_curthread));
938#else /* if user pointer arithmetic is ok in the kernel */
939	tmbx = (void *)fuword( (void *)&ke->ke_mailbox->km_curthread);
940#endif
941	if ((tmbx == NULL) || (tmbx == (void *)-1))
942		return 0;
943	if (uticks) {
944		addr = (caddr_t)tmbx + offsetof(struct kse_thr_mailbox, tm_uticks);
945		uticks += fuword(addr);
946		if (suword(addr, uticks))
947			goto bad;
948	}
949	if (sticks) {
950		addr = (caddr_t)tmbx + offsetof(struct kse_thr_mailbox, tm_sticks);
951		sticks += fuword(addr);
952		if (suword(addr, sticks))
953			goto bad;
954	}
955	return 0;
956bad:
957	PROC_LOCK(p);
958	psignal(p, SIGSEGV);
959	PROC_UNLOCK(p);
960	return -1;
961}
962
963/*
964 * Discard the current thread and exit from its context.
965 *
966 * Because we can't free a thread while we're operating under its context,
967 * push the current thread into our CPU's deadthread holder. This means
968 * we needn't worry about someone else grabbing our context before we
969 * do a cpu_throw().
970 */
971void
972thread_exit(void)
973{
974	struct thread *td;
975	struct kse *ke;
976	struct proc *p;
977	struct ksegrp	*kg;
978
979	td = curthread;
980	kg = td->td_ksegrp;
981	p = td->td_proc;
982	ke = td->td_kse;
983
984	mtx_assert(&sched_lock, MA_OWNED);
985	KASSERT(p != NULL, ("thread exiting without a process"));
986	KASSERT(ke != NULL, ("thread exiting without a kse"));
987	KASSERT(kg != NULL, ("thread exiting without a kse group"));
988	PROC_LOCK_ASSERT(p, MA_OWNED);
989	CTR1(KTR_PROC, "thread_exit: thread %p", td);
990	KASSERT(!mtx_owned(&Giant), ("dying thread owns giant"));
991
992	if (td->td_standin != NULL) {
993		thread_stash(td->td_standin);
994		td->td_standin = NULL;
995	}
996
997	cpu_thread_exit(td);	/* XXXSMP */
998
999	/*
1000	 * The last thread is left attached to the process
1001	 * so that the whole bundle gets recycled. Skip
1002	 * all this stuff.
1003	 */
1004	if (p->p_numthreads > 1) {
1005		/*
1006		 * Unlink this thread from its proc and the kseg.
1007		 * In keeping with the other structs we probably should
1008		 * have a thread_unlink() that does some of this but it
1009		 * would only be called from here (I think) so it would
1010		 * be a waste. (might be useful for proc_fini() as well.)
1011 		 */
1012		TAILQ_REMOVE(&p->p_threads, td, td_plist);
1013		p->p_numthreads--;
1014		TAILQ_REMOVE(&kg->kg_threads, td, td_kglist);
1015		kg->kg_numthreads--;
1016		/*
1017		 * The test below is NOT true if we are the
1018	 * sole exiting thread. P_STOPPED_SINGLE is unset
1019		 * in exit1() after it is the only survivor.
1020		 */
1021		if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
1022			if (p->p_numthreads == p->p_suspcount) {
1023				thread_unsuspend_one(p->p_singlethread);
1024			}
1025		}
1026
1027		/* Reassign this thread's KSE. */
1028		ke->ke_state = KES_UNQUEUED;
1029
1030		/*
1031		 * Decide what to do with the KSE attached to this thread.
1032		 * XXX Possibly kse_reassign should do both cases as it already
1033		 * does some of this.
1034		 */
1035		if (ke->ke_flags & KEF_EXIT) {
1036			KASSERT((ke->ke_owner == td),
1037		    	    ("thread_exit: KSE exiting with non-owner thread"));
1038			ke->ke_thread = NULL;
1039			td->td_kse = NULL;
1040			kse_unlink(ke);
1041		} else {
1042			TD_SET_EXITING(td);	/* definitely not runnable */
1043			kse_reassign(ke);
1044		}
1045		PROC_UNLOCK(p);
1046		td->td_state	= TDS_INACTIVE;
1047		td->td_proc	= NULL;
1048		td->td_ksegrp	= NULL;
1049		td->td_last_kse	= NULL;
1050		PCPU_SET(deadthread, td);
1051	} else {
1052		PROC_UNLOCK(p);
1053	}
1054	cpu_throw();
1055	/* NOTREACHED */
1056}
1057
1058/*
1059 * Do any thread-specific cleanups that may be needed in wait().
1060 * Called with Giant held; proc and schedlock not held.
1061 */
1062void
1063thread_wait(struct proc *p)
1064{
1065	struct thread *td;
1066
1067	KASSERT((p->p_numthreads == 1), ("Multiple threads in wait1()"));
1068	KASSERT((p->p_numksegrps == 1), ("Multiple ksegrps in wait1()"));
1069	FOREACH_THREAD_IN_PROC(p, td) {
1070		if (td->td_standin != NULL) {
1071			thread_free(td->td_standin);
1072			td->td_standin = NULL;
1073		}
1074		cpu_thread_clean(td);
1075	}
1076	thread_reap();	/* check for zombie threads etc. */
1077}
1078
1079/*
1080 * Link a thread to a process.
1081 * Set up anything that needs to be initialized for it to
1082 * be used by the process.
1083 *
1084 * Note that we do not link to the proc's ucred here.
1085 * The thread is linked as if running but no KSE assigned.
1086 */
1087void
1088thread_link(struct thread *td, struct ksegrp *kg)
1089{
1090	struct proc *p;
1091
1092	p = kg->kg_proc;
1093	td->td_state = TDS_INACTIVE;
1094	td->td_proc	= p;
1095	td->td_ksegrp	= kg;
1096	td->td_last_kse	= NULL;
1097
1098	LIST_INIT(&td->td_contested);
1099	callout_init(&td->td_slpcallout, 1);
1100	TAILQ_INSERT_HEAD(&p->p_threads, td, td_plist);
1101	TAILQ_INSERT_HEAD(&kg->kg_threads, td, td_kglist);
1102	p->p_numthreads++;
1103	kg->kg_numthreads++;
1104	td->td_kse	= NULL;
1105}
1106
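/*
 * Discard all KSE groups in the process except the one the given thread
 * belongs to.  Only called once the process is down to a single thread,
 * e.g. when single-threading for exit.
 */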
1107void
1108kse_purge(struct proc *p, struct thread *td)
1109{
1110	/* XXXKSE think about this..
1111		may need to wake up threads on loan queue. */
1112	struct ksegrp *kg;
1113
1114 	KASSERT(p->p_numthreads == 1, ("bad thread number"));
1115	mtx_lock_spin(&sched_lock);
1116	while ((kg = TAILQ_FIRST(&p->p_ksegrps)) != NULL) {
1117		TAILQ_REMOVE(&p->p_ksegrps, kg, kg_ksegrp);
1118		p->p_numksegrps--;
1119		KASSERT(((kg->kg_kses == 0) && (kg != td->td_ksegrp)) ||
1120		    ((kg->kg_kses == 1) && (kg == td->td_ksegrp)),
1121			("wrong kg_kses"));
1122		if (kg != td->td_ksegrp) {
1123			ksegrp_stash(kg);
1124		}
1125	}
1126	TAILQ_INSERT_HEAD(&p->p_ksegrps, td->td_ksegrp, kg_ksegrp);
1127	p->p_numksegrps++;
1128	mtx_unlock_spin(&sched_lock);
1129}
1130
1131
1132/*
1133 * Create a thread and schedule it for upcall on the KSE given.
1134 * Use our thread's standin so that we don't have to allocate one.
1135 */
1136struct thread *
1137thread_schedule_upcall(struct thread *td, struct kse *ke)
1138{
1139	struct thread *td2;
1140	int newkse;
1141
1142	mtx_assert(&sched_lock, MA_OWNED);
1143	newkse = (ke != td->td_kse);
1144
1145	/*
1146	 * If the owner and kse are BOUND then that thread is planning to
1147	 * go to userland and upcalls are not expected. So don't make one.
1148	 * If it is not bound then make it so with the spare thread
1149	 * and then borrow back the KSE to allow us to complete some in-kernel
1150	 * work. When we complete, the Bound thread will have the chance to
1151	 * complete. This thread will sleep as planned. Hopefully there will
1152	 * eventually be an unbound thread that can be converted to an
1153	 * upcall to report the completion of this thread.
1154	 */
1155
1156	if ((td2 = td->td_standin) != NULL) {
1157		td->td_standin = NULL;
1158	} else {
1159		if (newkse)
1160			panic("no reserve thread when called with a new kse");
1161		/*
1162		 * If called from (e.g.) sleep and we do not have
1163		 * a reserve thread, then we've used it, so do not
1164		 * create an upcall.
1165		 */
1166		return (NULL);
1167	}
1168	CTR3(KTR_PROC, "thread_schedule_upcall: thread %p (pid %d, %s)",
1169	     td2, td->td_proc->p_pid, td->td_proc->p_comm);
1170	bzero(&td2->td_startzero,
1171	    (unsigned)RANGEOF(struct thread, td_startzero, td_endzero));
1172	bcopy(&td->td_startcopy, &td2->td_startcopy,
1173	    (unsigned) RANGEOF(struct thread, td_startcopy, td_endcopy));
1174	thread_link(td2, ke->ke_ksegrp);
1175	cpu_set_upcall(td2, td->td_pcb);
1176
1177	/*
1178	 * XXXKSE do we really need this? (default values for the
1179	 * frame).
1180	 */
1181	bcopy(td->td_frame, td2->td_frame, sizeof(struct trapframe));
1182
1183	/*
1184	 * Bind the new thread to the KSE,
1185	 * and if it's our KSE, lend it back to ourself
1186	 * so we can continue running.
1187	 */
1188	td2->td_ucred = crhold(td->td_ucred);
1189	td2->td_flags = TDF_UPCALLING; /* note: BOUND */
1190	td2->td_kse = ke;
1191	td2->td_state = TDS_CAN_RUN;
1192	td2->td_inhibitors = 0;
1193	ke->ke_owner = td2;
1194	/*
1195	 * If called from kse_reassign(), we are working on the current
1196	 * KSE so fake that we borrowed it. If called from
1197	 * kse_create(), don't, as we have a new kse too.
1198	 */
1199	if (!newkse) {
1200		/*
1201		 * This thread will be scheduled when the current thread
1202		 * blocks, exits or tries to enter userspace, (which ever
1203		 * happens first). When that happens the KSE will "revert"
1204		 * to this thread in a BOUND manner. Since we are called
1205		 * from msleep() this is going to be "very soon" in nearly
1206		 * all cases.
1207		 */
1208		TD_SET_LOAN(td2);
1209	} else {
1210		ke->ke_thread = td2;
1211		ke->ke_state = KES_THREAD;
1212		setrunqueue(td2);
1213	}
1214	return (td2);	/* bogus.. should be a void function */
1215}
1216
1217/*
1218 * Schedule an upcall to notify a KSE process that it has received signals.
1219 *
1220 * XXX - Modifying a sigset_t like this is totally bogus.
1221 */
1222struct thread *
1223signal_upcall(struct proc *p, int sig)
1224{
1225	struct thread *td, *td2;
1226	struct kse *ke;
1227	sigset_t ss;
1228	int error;
1229
1230	PROC_LOCK_ASSERT(p, MA_OWNED);
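	/*
	 * XXX: currently disabled; the unconditional return below makes
	 * the rest of this function unreachable.
	 */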
1231return (NULL);
1232
1233	td = FIRST_THREAD_IN_PROC(p);
1234	ke = td->td_kse;
1235	PROC_UNLOCK(p);
1236	error = copyin(&ke->ke_mailbox->km_sigscaught, &ss, sizeof(sigset_t));
1237	PROC_LOCK(p);
1238	if (error)
1239		return (NULL);
1240	SIGADDSET(ss, sig);
1241	PROC_UNLOCK(p);
1242	error = copyout(&ss, &ke->ke_mailbox->km_sigscaught, sizeof(sigset_t));
1243	PROC_LOCK(p);
1244	if (error)
1245		return (NULL);
1246	if (td->td_standin == NULL)
1247		td->td_standin = thread_alloc();
1248	mtx_lock_spin(&sched_lock);
1249	td2 = thread_schedule_upcall(td, ke); /* Bogus JRE */
1250	mtx_unlock_spin(&sched_lock);
1251	return (td2);
1252}
1253
1254/*
1255 * setup done on the thread when it enters the kernel.
1256 * XXXKSE Presently only for syscalls but eventually all kernel entries.
1257 */
1258void
1259thread_user_enter(struct proc *p, struct thread *td)
1260{
1261	struct kse *ke;
1262
1263	/*
1264	 * First check that we shouldn't just abort.
1265	 * But check if we are the single thread first!
1266	 * XXX p_singlethread not locked, but should be safe.
1267	 */
1268	if ((p->p_flag & P_WEXIT) && (p->p_singlethread != td)) {
1269		PROC_LOCK(p);
1270		mtx_lock_spin(&sched_lock);
1271		thread_exit();
1272		/* NOTREACHED */
1273	}
1274
1275	/*
1276	 * If we are doing a syscall in a KSE environment,
1277	 * note where our mailbox is. There is always the
1278	 * possibility that we could do this lazily (in kse_reassign()),
1279	 * but for now do it every time.
1280	 */
1281	ke = td->td_kse;
1282	td->td_flags &= ~TDF_UNBOUND;
1283	if (ke->ke_mailbox != NULL) {
1284#if 0
1285		td->td_mailbox = (void *)fuword((caddr_t)ke->ke_mailbox
1286		    + offsetof(struct kse_mailbox, km_curthread));
1287#else /* if user pointer arithmetic is ok in the kernel */
1288		td->td_mailbox =
1289		    (void *)fuword( (void *)&ke->ke_mailbox->km_curthread);
1290#endif
1291		if ((td->td_mailbox == NULL) ||
1292		    (td->td_mailbox == (void *)-1)) {
1293			td->td_mailbox = NULL;	/* single thread it.. */
1294			mtx_lock_spin(&sched_lock);
1295			td->td_flags &= ~(TDF_UNBOUND|TDF_CAN_UNBIND);
1296			mtx_unlock_spin(&sched_lock);
1297		} else {
1298			/*
1299			 * When the thread limit is reached, act as if the thread
1300			 * has already done an upcall.
1301			 */
1302			if (p->p_numthreads > max_threads_per_proc) {
1303				if (td->td_standin != NULL) {
1304					thread_stash(td->td_standin);
1305					td->td_standin = NULL;
1306				}
1307			} else {
1308				if (td->td_standin == NULL)
1309					td->td_standin = thread_alloc();
1310			}
1311			mtx_lock_spin(&sched_lock);
1312			td->td_flags |= TDF_CAN_UNBIND;
1313			mtx_unlock_spin(&sched_lock);
1314			KASSERT((ke->ke_owner == td),
1315			    ("thread_user_enter: No starting owner "));
1316			ke->ke_owner = td;
1317			td->td_usticks = 0;
1318		}
1319	}
1320}
1321
1322/*
1323 * The extra work we go through if we are a threaded process when we
1324 * return to userland.
1325 *
1326 * If we are a KSE process and returning to user mode, check for
1327 * extra work to do before we return (e.g. for more syscalls
1328 * to complete first).  If we were in a critical section, we should
1329 * just return to let it finish. Same if we were in the UTS (in
1330 * which case the mailbox's context's busy indicator will be set).
1331 * The only traps we support will have set the mailbox.
1332 * We will clear it here.
1333 */
1334int
1335thread_userret(struct thread *td, struct trapframe *frame)
1336{
1337	int error;
1338	int unbound;
1339	struct kse *ke;
1340	struct ksegrp *kg;
1341	struct thread *worktodo;
1342	struct proc *p;
1343	struct timespec ts;
1344
1345	KASSERT((td->td_kse && td->td_kse->ke_thread && td->td_kse->ke_owner),
1346	    ("thread_userret: bad thread/kse pointers"));
1347	KASSERT((td == curthread),
1348	    ("thread_userret: bad thread argument"));
1349
1350
1351	kg = td->td_ksegrp;
1352	p = td->td_proc;
1353	error = 0;
1354	unbound = TD_IS_UNBOUND(td);
1355
1356	mtx_lock_spin(&sched_lock);
1357	if ((worktodo = kg->kg_last_assigned))
1358		worktodo = TAILQ_NEXT(worktodo, td_runq);
1359	else
1360		worktodo = TAILQ_FIRST(&kg->kg_runq);
1361
1362	/*
1363	 * Permanently bound threads never upcall but they may
1364	 * loan out their KSE at this point.
1365	 * Upcalls imply bound.  They also may want to do some philanthropy.
1366	 * Temporarily bound threads on the other hand either yield
1367	 * to other work and transform into an upcall, or proceed back to
1368	 * userland.
1369	 */
1370
1371	if (TD_CAN_UNBIND(td)) {
1372		td->td_flags &= ~(TDF_UNBOUND|TDF_CAN_UNBIND);
1373		if (!worktodo && (kg->kg_completed == NULL) &&
1374		    !(td->td_kse->ke_flags & KEF_DOUPCALL)) {
1375			/*
1376			 * This thread has not started any upcall.
1377			 * If there is no work to report other than
1378			 * ourself, then it can return direct to userland.
1379			 */
1380justreturn:
1381			mtx_unlock_spin(&sched_lock);
1382			thread_update_uticks();
1383			td->td_mailbox = NULL;
1384			return (0);
1385		}
1386		mtx_unlock_spin(&sched_lock);
1387		error = thread_export_context(td);
1388		td->td_usticks = 0;
1389		if (error) {
1390			/*
1391			 * As we are not running on a borrowed KSE,
1392			 * failing to do the KSE operation just defaults
1393			 * back to synchronous operation, so just return from
1394			 * the syscall.
1395			 */
1396			goto justreturn;
1397		}
1398		mtx_lock_spin(&sched_lock);
1399		/*
1400		 * Turn ourself into a bound upcall.
1401		 * We will rely on kse_reassign()
1402		 * to make us run at a later time.
1403		 */
1404		td->td_flags |= TDF_UPCALLING;
1405
1406		/* there may be more work since we re-locked schedlock */
1407		if ((worktodo = kg->kg_last_assigned))
1408			worktodo = TAILQ_NEXT(worktodo, td_runq);
1409		else
1410			worktodo = TAILQ_FIRST(&kg->kg_runq);
1411	} else if (unbound) {
1412		/*
1413		 * We are an unbound thread, looking to
1414		 * return to user space. There must be another owner
1415		 * of this KSE.
1416		 * We are using a borrowed KSE; save state and exit.
1417		 * kse_reassign() will recycle the KSE as needed.
1418		 */
1419		mtx_unlock_spin(&sched_lock);
1420		error = thread_export_context(td);
1421		td->td_usticks = 0;
1422		if (error) {
1423			/*
1424			 * There is nothing we can do.
1425			 * We just lose that context. We
1426			 * probably should note this somewhere and send
1427			 * the process a signal.
1428			 */
1429			PROC_LOCK(td->td_proc);
1430			psignal(td->td_proc, SIGSEGV);
1431			mtx_lock_spin(&sched_lock);
1432			ke = td->td_kse;
1433			/* possibly upcall with error? */
1434		} else {
1435			/*
1436			 * Don't make an upcall, just exit so that the owner
1437			 * can get its KSE if it wants it.
1438			 * Our context is already safely stored for later
1439			 * use by the UTS.
1440			 */
1441			PROC_LOCK(p);
1442			mtx_lock_spin(&sched_lock);
1443			ke = td->td_kse;
1444		}
1445		/*
1446		 * If the owner is idling, we now have something for it
1447		 * to report, so make it runnable.
1448		 * If the owner is not an upcall, make an attempt to
1449		 * ensure that at least one of any IDLED upcalls can
1450		 * wake up.
1451		 */
1452		if (ke->ke_owner->td_flags & TDF_UPCALLING) {
1453			TD_CLR_IDLE(ke->ke_owner);
1454		} else {
1455			FOREACH_KSE_IN_GROUP(kg, ke) {
1456				if (TD_IS_IDLE(ke->ke_owner)) {
1457					TD_CLR_IDLE(ke->ke_owner);
1458					setrunnable(ke->ke_owner);
1459					break;
1460				}
1461			}
1462		}
1463		thread_exit();
1464	}
1465	/*
1466	 * We ARE going back to userland with this KSE.
1467	 * We are permanently bound. We may be an upcall.
1468	 * If an upcall, check for threads that need to borrow the KSE.
1469	 * Any other thread that comes ready after this missed the boat.
1470	 */
1471	ke = td->td_kse;
1472
1473	/*
1474	 *  If not upcalling, go back to userspace.
1475	 * If we are, get the upcall set up.
1476	 */
1477	if (td->td_flags & TDF_UPCALLING) {
1478		if (worktodo)  {
1479			/*
1480			 * force a switch to more urgent 'in kernel'
1481			 * work. Control will return to this thread
1482			 * when there is no more work to do.
1483			 * kse_reassign() will do that for us.
1484			 */
1485			TD_SET_LOAN(td);
1486			p->p_stats->p_ru.ru_nvcsw++;
1487			mi_switch(); /* kse_reassign() will (re)find worktodo */
1488		}
1489		td->td_flags &= ~TDF_UPCALLING;
1490		if (ke->ke_flags & KEF_DOUPCALL)
1491			ke->ke_flags &= ~KEF_DOUPCALL;
1492		mtx_unlock_spin(&sched_lock);
1493
1494		/*
1495		 * There is no more work to do and we are going to ride
1496		 * this thread/KSE up to userland as an upcall.
1497		 * Do the last parts of the setup needed for the upcall.
1498		 */
1499		CTR3(KTR_PROC, "userret: upcall thread %p (pid %d, %s)",
1500		    td, td->td_proc->p_pid, td->td_proc->p_comm);
1501
1502		/*
1503		 * Set user context to the UTS.
1504		 * Will use Giant in cpu_thread_clean() because it uses
1505		 * kmem_free(kernel_map, ...)
1506		 */
1507		cpu_set_upcall_kse(td, ke);
1508
1509		/*
1510		 * Unhook the list of completed threads.
1511		 * anything that completes after this gets to
1512		 * come in next time.
1513		 * Put the list of completed thread mailboxes on
1514		 * this KSE's mailbox.
1515		 */
1516		error = thread_link_mboxes(kg, ke);
1517		if (error)
1518			goto bad;
1519
1520		/*
1521		 * Set state and clear the thread mailbox pointer.
1522		 * From now on we are just a bound outgoing process.
1523		 * **Problem** userret is often called several times.
1524		 * It would be nice if this all happened only on the first
1525		 * time through (the scan for extra work, etc.).
1526		 */
1527#if 0
1528		error = suword((caddr_t)ke->ke_mailbox +
1529		    offsetof(struct kse_mailbox, km_curthread), 0);
1530#else	/* if user pointer arithmetic is ok in the kernel */
1531		error = suword((caddr_t)&ke->ke_mailbox->km_curthread, 0);
1532#endif
1533		ke->ke_uuticks = ke->ke_usticks = 0;
1534		if (error)
1535			goto bad;
1536		nanotime(&ts);
1537		if (copyout(&ts,
1538		    (caddr_t)&ke->ke_mailbox->km_timeofday, sizeof(ts))) {
1539			goto bad;
1540		}
1541	} else {
1542		mtx_unlock_spin(&sched_lock);
1543	}
1544	/*
1545	 * Optimisation:
1546	 * Ensure that we have a spare thread available,
1547	 * for when we re-enter the kernel.
1548	 */
1549	if (td->td_standin == NULL) {
1550		td->td_standin = thread_alloc();
1551	}
1552
1553	thread_update_uticks();
1554	td->td_mailbox = NULL;
1555	return (0);
1556
1557bad:
1558	/*
1559	 * Things are going to be so screwed we should just kill the process.
1560	 * How do we do that?
1561	 */
1562	PROC_LOCK(td->td_proc);
1563	psignal(td->td_proc, SIGSEGV);
1564	PROC_UNLOCK(td->td_proc);
1565	td->td_mailbox = NULL;
1566	return (error);	/* go sync */
1567}
1568
1569/*
1570 * Enforce single-threading.
1571 *
1572 * Returns 1 if the caller must abort (another thread is waiting to
1573 * exit the process or similar). Process is locked!
1574 * Returns 0 when you are successfully the only thread running.
1575 * A process has successfully single-threaded in the suspend mode when
1576 * there are no threads in user mode. Threads in the kernel must be
1577 * allowed to continue until they get to the user boundary. They may even
1578 * copy out their return values and data before suspending. They may however be
1579 * accelerated in reaching the user boundary as we will wake up
1580 * any sleeping threads that are interruptible (PCATCH).
1581 */
1582int
1583thread_single(int force_exit)
1584{
1585	struct thread *td;
1586	struct thread *td2;
1587	struct proc *p;
1588
1589	td = curthread;
1590	p = td->td_proc;
1591	mtx_assert(&Giant, MA_OWNED);
1592	PROC_LOCK_ASSERT(p, MA_OWNED);
1593	KASSERT((td != NULL), ("curthread is NULL"));
1594
1595	if ((p->p_flag & P_KSES) == 0)
1596		return (0);
1597
1598	/* Is someone already single threading? */
1599	if (p->p_singlethread)
1600		return (1);
1601
1602	if (force_exit == SINGLE_EXIT) {
1603		p->p_flag |= P_SINGLE_EXIT;
1604		td->td_flags &= ~TDF_UNBOUND;
1605	} else
1606		p->p_flag &= ~P_SINGLE_EXIT;
1607	p->p_flag |= P_STOPPED_SINGLE;
1608	p->p_singlethread = td;
1609	/* XXXKSE Which lock protects the below values? */
1610	while ((p->p_numthreads - p->p_suspcount) != 1) {
1611		mtx_lock_spin(&sched_lock);
1612		FOREACH_THREAD_IN_PROC(p, td2) {
1613			if (td2 == td)
1614				continue;
1615			if (TD_IS_INHIBITED(td2)) {
1616				if (force_exit == SINGLE_EXIT) {
1617					if (TD_IS_SUSPENDED(td2)) {
1618						thread_unsuspend_one(td2);
1619					}
1620					if (TD_ON_SLEEPQ(td2) &&
1621					    (td2->td_flags & TDF_SINTR)) {
1622						if (td2->td_flags & TDF_CVWAITQ)
1623							cv_abort(td2);
1624						else
1625							abortsleep(td2);
1626					}
1627					if (TD_IS_IDLE(td2)) {
1628						TD_CLR_IDLE(td2);
1629					}
1630				} else {
1631					if (TD_IS_SUSPENDED(td2))
1632						continue;
1633					/* maybe other inhibited states too? */
1634					if (td2->td_inhibitors &
1635					    (TDI_SLEEPING | TDI_SWAPPED |
1636					    TDI_LOAN | TDI_IDLE |
1637					    TDI_EXITING))
1638						thread_suspend_one(td2);
1639				}
1640			}
1641		}
1642		/*
1643		 * Maybe we suspended some threads.. was it enough?
1644		 */
1645		if ((p->p_numthreads - p->p_suspcount) == 1) {
1646			mtx_unlock_spin(&sched_lock);
1647			break;
1648		}
1649
1650		/*
1651		 * Wake us up when everyone else has suspended.
1652		 * In the meantime we suspend as well.
1653		 */
1654		thread_suspend_one(td);
1655		mtx_unlock(&Giant);
1656		PROC_UNLOCK(p);
1657		p->p_stats->p_ru.ru_nvcsw++;
1658		mi_switch();
1659		mtx_unlock_spin(&sched_lock);
1660		mtx_lock(&Giant);
1661		PROC_LOCK(p);
1662	}
1663	if (force_exit == SINGLE_EXIT)
1664		kse_purge(p, td);
1665	return (0);
1666}
1667
1668/*
1669 * Called in from locations that can safely check to see
1670 * whether we have to suspend or at least throttle for a
1671 * single-thread event (e.g. fork).
1672 *
1673 * Such locations include userret().
1674 * If the "return_instead" argument is non-zero, the thread must be able to
1675 * accept 0 (caller may continue), or 1 (caller must abort) as a result.
1676 *
1677 * The 'return_instead' argument tells the function if it may do a
1678 * thread_exit() or suspend, or whether the caller must abort and back
1679 * out instead.
1680 *
1681 * If the thread that set the single_threading request has set the
1682 * P_SINGLE_EXIT bit in the process flags then this call will never return
1683 * if 'return_instead' is false, but will exit.
1684 *
1685 * P_SINGLE_EXIT | return_instead == 0| return_instead != 0
1686 *---------------+--------------------+---------------------
1687 *       0       | returns 0          |   returns 0 or 1
1688 *               | when ST ends       |   immediately
1689 *---------------+--------------------+---------------------
1690 *       1       | thread exits       |   returns 1
1691 *               |                    |  immediately
1692 * 0 = thread_exit() or suspension ok,
1693 * other = return error instead of stopping the thread.
1694 *
1695 * While a full suspension is under effect, even a single threading
1696 * thread would be suspended if it made this call (but it shouldn't).
1697 * This call should only be made from places where
1698 * thread_exit() would be safe as that may be the outcome unless
1699 * return_instead is set.
1700 */
1701int
1702thread_suspend_check(int return_instead)
1703{
1704	struct thread *td;
1705	struct proc *p;
1706	struct kse *ke;
1707	struct ksegrp *kg;
1708
1709	td = curthread;
1710	p = td->td_proc;
1711	kg = td->td_ksegrp;
1712	PROC_LOCK_ASSERT(p, MA_OWNED);
1713	while (P_SHOULDSTOP(p)) {
1714		if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
1715			KASSERT(p->p_singlethread != NULL,
1716			    ("singlethread not set"));
1717			/*
1718			 * The only suspension in action is a
1719			 * single-threading. Single threader need not stop.
1720			 * XXX Should be safe to access unlocked
1721			 * as it can only be set to be true by us.
1722			 */
1723			if (p->p_singlethread == td)
1724				return (0);	/* Exempt from stopping. */
1725		}
1726		if (return_instead)
1727			return (1);
1728
1729		/*
1730		 * If the process is waiting for us to exit,
1731		 * this thread should just suicide.
1732		 * Assumes that P_SINGLE_EXIT implies P_STOPPED_SINGLE.
1733		 */
1734		if ((p->p_flag & P_SINGLE_EXIT) && (p->p_singlethread != td)) {
1735			mtx_lock_spin(&sched_lock);
1736			while (mtx_owned(&Giant))
1737				mtx_unlock(&Giant);
1738			/*
1739			 * All threads should be exiting,
1740			 * unless they are the active "singlethread".
1741			 * Destroy unneeded KSEs as we go;
1742			 * KSEGRPS may implode too as #kses -> 0.
1743			 */
1744			ke = td->td_kse;
1745			if (ke->ke_owner == td &&
1746			    (kg->kg_kses >= kg->kg_numthreads ))
1747				ke->ke_flags |= KEF_EXIT;
1748			thread_exit();
1749		}
1750
1751		/*
1752		 * When a thread suspends, it just
1753		 * moves to the process's suspend queue
1754		 * and stays there.
1755		 *
1756		 * XXXKSE if TDF_BOUND is true
1757		 * it will not release its KSE which might
1758		 * lead to deadlock if there are not enough KSEs
1759		 * to complete all waiting threads.
1760		 * We might be able to 'lend' it out again.
1761		 * (Lent KSEs cannot go back to userland?)
1762		 * and can only be lent in STOPPED state.
1763		 */
1764		mtx_lock_spin(&sched_lock);
1765		if ((p->p_flag & P_STOPPED_SIG) &&
1766		    (p->p_suspcount+1 == p->p_numthreads)) {
1767			mtx_unlock_spin(&sched_lock);
1768			PROC_LOCK(p->p_pptr);
1769			if ((p->p_pptr->p_procsig->ps_flag &
1770				PS_NOCLDSTOP) == 0) {
1771				psignal(p->p_pptr, SIGCHLD);
1772			}
1773			PROC_UNLOCK(p->p_pptr);
1774			mtx_lock_spin(&sched_lock);
1775		}
1776		mtx_assert(&Giant, MA_NOTOWNED);
1777		thread_suspend_one(td);
1778		PROC_UNLOCK(p);
1779		if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
1780			if (p->p_numthreads == p->p_suspcount) {
1781				thread_unsuspend_one(p->p_singlethread);
1782			}
1783		}
1784		p->p_stats->p_ru.ru_nivcsw++;
1785		mi_switch();
1786		mtx_unlock_spin(&sched_lock);
1787		PROC_LOCK(p);
1788	}
1789	return (0);
1790}
1791
1792void
1793thread_suspend_one(struct thread *td)
1794{
1795	struct proc *p = td->td_proc;
1796
1797	mtx_assert(&sched_lock, MA_OWNED);
1798	p->p_suspcount++;
1799	TD_SET_SUSPENDED(td);
1800	TAILQ_INSERT_TAIL(&p->p_suspended, td, td_runq);
1801	/*
1802	 * Hack: If we are suspending but are on the sleep queue
1803	 * then we are in msleep or the cv equivalent. We
1804	 * want to look like we have two Inhibitors.
1805	 * May already be set.. doesn't matter.
1806	 */
1807	if (TD_ON_SLEEPQ(td))
1808		TD_SET_SLEEPING(td);
1809}
1810
1811void
1812thread_unsuspend_one(struct thread *td)
1813{
1814	struct proc *p = td->td_proc;
1815
1816	mtx_assert(&sched_lock, MA_OWNED);
1817	TAILQ_REMOVE(&p->p_suspended, td, td_runq);
1818	TD_CLR_SUSPENDED(td);
1819	p->p_suspcount--;
1820	setrunnable(td);
1821}
1822
1823/*
1824 * Allow all threads blocked by single threading to continue running.
1825 */
1826void
1827thread_unsuspend(struct proc *p)
1828{
1829	struct thread *td;
1830
1831	mtx_assert(&sched_lock, MA_OWNED);
1832	PROC_LOCK_ASSERT(p, MA_OWNED);
1833	if (!P_SHOULDSTOP(p)) {
1834		while (( td = TAILQ_FIRST(&p->p_suspended))) {
1835			thread_unsuspend_one(td);
1836		}
1837	} else if ((P_SHOULDSTOP(p) == P_STOPPED_SINGLE) &&
1838	    (p->p_numthreads == p->p_suspcount)) {
1839		/*
1840		 * Stopping everything also did the job for the single
1841		 * threading request. Now we've downgraded to single-threaded,
1842		 * let it continue.
1843		 */
1844		thread_unsuspend_one(p->p_singlethread);
1845	}
1846}
1847
1848void
1849thread_single_end(void)
1850{
1851	struct thread *td;
1852	struct proc *p;
1853
1854	td = curthread;
1855	p = td->td_proc;
1856	PROC_LOCK_ASSERT(p, MA_OWNED);
1857	p->p_flag &= ~P_STOPPED_SINGLE;
1858	p->p_singlethread = NULL;
1859	/*
1860	 * If there are other threads they may now run,
1861	 * unless of course there is a blanket 'stop order'
1862	 * on the process. The single threader must be allowed
1863	 * to continue however as this is a bad place to stop.
1864	 */
1865	if ((p->p_numthreads != 1) && (!P_SHOULDSTOP(p))) {
1866		mtx_lock_spin(&sched_lock);
1867		while (( td = TAILQ_FIRST(&p->p_suspended))) {
1868			thread_unsuspend_one(td);
1869		}
1870		mtx_unlock_spin(&sched_lock);
1871	}
1872}
1873
1874
1875