kern_thread.c revision 132372
1/*
2 * Copyright (C) 2001 Julian Elischer <julian@freebsd.org>.
3 *  All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice(s), this list of conditions and the following disclaimer as
10 *    the first lines of this file unmodified other than the possible
11 *    addition of one or more copyright notices.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice(s), this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY
17 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY
20 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
26 * DAMAGE.
27 */
28
29#include <sys/cdefs.h>
30__FBSDID("$FreeBSD: head/sys/kern/kern_thread.c 132372 2004-07-18 23:36:13Z julian $");
31
32#include <sys/param.h>
33#include <sys/systm.h>
34#include <sys/kernel.h>
35#include <sys/lock.h>
36#include <sys/mutex.h>
37#include <sys/proc.h>
38#include <sys/smp.h>
39#include <sys/sysctl.h>
40#include <sys/sched.h>
41#include <sys/sleepqueue.h>
42#include <sys/turnstile.h>
43#include <sys/ktr.h>
44
45#include <vm/vm.h>
46#include <vm/vm_extern.h>
47#include <vm/uma.h>
48
49/*
50 * Thread, KSE and KSEGRP related storage.
51 */
52static uma_zone_t ksegrp_zone;
53static uma_zone_t kse_zone;
54static uma_zone_t thread_zone;
55
56/* DEBUG ONLY */
57SYSCTL_NODE(_kern, OID_AUTO, threads, CTLFLAG_RW, 0, "thread allocation");
58static int thread_debug = 0;
59SYSCTL_INT(_kern_threads, OID_AUTO, debug, CTLFLAG_RW,
60	&thread_debug, 0, "thread debug");
61
62int max_threads_per_proc = 1500;
63SYSCTL_INT(_kern_threads, OID_AUTO, max_threads_per_proc, CTLFLAG_RW,
64	&max_threads_per_proc, 0, "Limit on threads per proc");
65
66int max_groups_per_proc = 500;
67SYSCTL_INT(_kern_threads, OID_AUTO, max_groups_per_proc, CTLFLAG_RW,
68	&max_groups_per_proc, 0, "Limit on thread groups per proc");
69
70int max_threads_hits;
71SYSCTL_INT(_kern_threads, OID_AUTO, max_threads_hits, CTLFLAG_RD,
72	&max_threads_hits, 0, "");
73
74int virtual_cpu;
75
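/*
 * RANGEOF() yields the number of bytes spanned by the members of 'type'
 * from the start of member 'start' up to (but not including) member 'end',
 * which is handy for bzero()ing a contiguous slice of a structure.
 */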
76#define RANGEOF(type, start, end) (offsetof(type, end) - offsetof(type, start))
77
78TAILQ_HEAD(, thread) zombie_threads = TAILQ_HEAD_INITIALIZER(zombie_threads);
79TAILQ_HEAD(, kse) zombie_kses = TAILQ_HEAD_INITIALIZER(zombie_kses);
80TAILQ_HEAD(, ksegrp) zombie_ksegrps = TAILQ_HEAD_INITIALIZER(zombie_ksegrps);
81struct mtx kse_zombie_lock;
82MTX_SYSINIT(kse_zombie_lock, &kse_zombie_lock, "kse zombie lock", MTX_SPIN);
83
84void kse_purge(struct proc *p, struct thread *td);
85void kse_purge_group(struct thread *td);
86
87/* move to proc.h */
88extern void	kseinit(void);
89extern void	kse_GC(void);
90
91
92static int
93sysctl_kse_virtual_cpu(SYSCTL_HANDLER_ARGS)
94{
95	int error, new_val;
96	int def_val;
97
98	def_val = mp_ncpus;
99	if (virtual_cpu == 0)
100		new_val = def_val;
101	else
102		new_val = virtual_cpu;
103	error = sysctl_handle_int(oidp, &new_val, 0, req);
104	if (error != 0 || req->newptr == NULL)
105		return (error);
106	if (new_val < 0)
107		return (EINVAL);
108	virtual_cpu = new_val;
109	return (0);
110}
111
112/* DEBUG ONLY */
113SYSCTL_PROC(_kern_threads, OID_AUTO, virtual_cpu, CTLTYPE_INT|CTLFLAG_RW,
114	0, sizeof(virtual_cpu), sysctl_kse_virtual_cpu, "I",
115	"debug virtual cpus");
116
117/*
118 * Thread ID allocator. The allocator keeps track of assigned IDs with
119 * a bitmap allocated in fixed-size parts that are linked together;
120 * new parts are added as existing ones fill up.
121 */
122typedef u_long tid_bitmap_word;
123
124#define	TID_IDS_PER_PART	1024
125#define	TID_IDS_PER_IDX		(sizeof(tid_bitmap_word) << 3)
126#define	TID_BITMAP_SIZE		(TID_IDS_PER_PART / TID_IDS_PER_IDX)
127#define	TID_MIN			(PID_MAX + 1)
128
129struct tid_bitmap_part {
130	STAILQ_ENTRY(tid_bitmap_part) bmp_next;
131	tid_bitmap_word	bmp_bitmap[TID_BITMAP_SIZE];
132	lwpid_t		bmp_base;
133	int		bmp_free;
134};
135
136static STAILQ_HEAD(, tid_bitmap_part) tid_bitmap =
137    STAILQ_HEAD_INITIALIZER(tid_bitmap);
138static uma_zone_t tid_zone;
139
140struct mtx tid_lock;
141MTX_SYSINIT(tid_lock, &tid_lock, "TID lock", MTX_DEF);
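/*
 * Worked example (illustrative only, assuming a platform where u_long is
 * 64 bits wide): TID_IDS_PER_IDX is then 64 and TID_BITMAP_SIZE is 16, so
 * each part covers 1024 IDs.  A thread whose ID is bmp_base + 130 uses
 * bmp_bitmap[2], bit 2, since 130 = 2 * 64 + 2; thread_init() clears that
 * bit when the ID is handed out and thread_fini() sets it again on release.
 */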
142
143/*
144 * Prepare a thread for use.
145 */
146static void
147thread_ctor(void *mem, int size, void *arg)
148{
149	struct thread	*td;
150
151	td = (struct thread *)mem;
152	td->td_state = TDS_INACTIVE;
153	td->td_oncpu	= NOCPU;
154
155	/*
156	 * Note that td_critnest begins life as 1 because the thread is not
157	 * running and is thereby implicitly waiting to be on the receiving
158	 * end of a context switch.  A context switch must occur inside a
159	 * critical section, and in fact, includes hand-off of the sched_lock.
160	 * After a context switch to a newly created thread, it will release
161	 * sched_lock for the first time, and its td_critnest will hit 0 for
162	 * the first time.  This happens on the far end of a context switch,
163	 * and when it context switches away from itself, it will in fact go
164	 * back into a critical section, and hand off the sched lock to the
165	 * next thread.
166	 */
167	td->td_critnest = 1;
168}
169
170/*
171 * Reclaim a thread after use.
172 */
173static void
174thread_dtor(void *mem, int size, void *arg)
175{
176	struct thread *td;
177
178	td = (struct thread *)mem;
179
180#ifdef INVARIANTS
181	/* Verify that this thread is in a safe state to free. */
182	switch (td->td_state) {
183	case TDS_INHIBITED:
184	case TDS_RUNNING:
185	case TDS_CAN_RUN:
186	case TDS_RUNQ:
187		/*
188		 * We must never unlink a thread that is in one of
189		 * these states, because it is currently active.
190		 */
191		panic("bad state for thread unlinking");
192		/* NOTREACHED */
193	case TDS_INACTIVE:
194		break;
195	default:
196		panic("bad thread state");
197		/* NOTREACHED */
198	}
199#endif
200}
201
202/*
203 * Initialize type-stable parts of a thread (when newly created).
204 */
205static void
206thread_init(void *mem, int size)
207{
208	struct thread *td;
209	struct tid_bitmap_part *bmp, *new;
210	int bit, idx;
211
212	td = (struct thread *)mem;
213
214	mtx_lock(&tid_lock);
215	STAILQ_FOREACH(bmp, &tid_bitmap, bmp_next) {
216		if (bmp->bmp_free)
217			break;
218	}
219	/* Create a new bitmap if we run out of free bits. */
220	if (bmp == NULL) {
221		mtx_unlock(&tid_lock);
222		new = uma_zalloc(tid_zone, M_WAITOK);
223		mtx_lock(&tid_lock);
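		/*
		 * We dropped tid_lock while sleeping in uma_zalloc(), so
		 * another thread may have added a part in the meantime;
		 * only link in the new part if the last one is missing or
		 * more than half used.
		 */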
224		bmp = STAILQ_LAST(&tid_bitmap, tid_bitmap_part, bmp_next);
225		if (bmp == NULL || bmp->bmp_free < TID_IDS_PER_PART/2) {
226			/* 1=free, 0=assigned. This way we can use ffsl(). */
227			memset(new->bmp_bitmap, ~0U, sizeof(new->bmp_bitmap));
228			new->bmp_base = (bmp == NULL) ? TID_MIN :
229			    bmp->bmp_base + TID_IDS_PER_PART;
230			new->bmp_free = TID_IDS_PER_PART;
231			STAILQ_INSERT_TAIL(&tid_bitmap, new, bmp_next);
232			bmp = new;
233			new = NULL;
234		}
235	} else
236		new = NULL;
237	/* We have a bitmap with available IDs. */
238	idx = 0;
239	while (idx < TID_BITMAP_SIZE && bmp->bmp_bitmap[idx] == 0UL)
240		idx++;
241	bit = ffsl(bmp->bmp_bitmap[idx]) - 1;
242	td->td_tid = bmp->bmp_base + idx * TID_IDS_PER_IDX + bit;
243	bmp->bmp_bitmap[idx] &= ~(1UL << bit);
244	bmp->bmp_free--;
245	mtx_unlock(&tid_lock);
246	if (new != NULL)
247		uma_zfree(tid_zone, new);
248
249	vm_thread_new(td, 0);
250	cpu_thread_setup(td);
251	td->td_sleepqueue = sleepq_alloc();
252	td->td_turnstile = turnstile_alloc();
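	/* The scheduler-private data sits directly after the thread itself. */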
253	td->td_sched = (struct td_sched *)&td[1];
254}
255
256/*
257 * Tear down type-stable parts of a thread (just before being discarded).
258 */
259static void
260thread_fini(void *mem, int size)
261{
262	struct thread *td;
263	struct tid_bitmap_part *bmp;
264	lwpid_t tid;
265	int bit, idx;
266
267	td = (struct thread *)mem;
268	turnstile_free(td->td_turnstile);
269	sleepq_free(td->td_sleepqueue);
270	vm_thread_dispose(td);
271
272	STAILQ_FOREACH(bmp, &tid_bitmap, bmp_next) {
273		if (td->td_tid >= bmp->bmp_base &&
274		    td->td_tid < bmp->bmp_base + TID_IDS_PER_PART)
275			break;
276	}
277	KASSERT(bmp != NULL, ("No TID bitmap?"));
278	mtx_lock(&tid_lock);
279	tid = td->td_tid - bmp->bmp_base;
280	idx = tid / TID_IDS_PER_IDX;
281	bit = tid % TID_IDS_PER_IDX;
282	bmp->bmp_bitmap[idx] |= (1UL << bit);
283	bmp->bmp_free++;
284	mtx_unlock(&tid_lock);
285}
286
287/*
288 * Initialize type-stable parts of a kse (when newly created).
289 */
290static void
291kse_init(void *mem, int size)
292{
293	struct kse	*ke;
294
295	ke = (struct kse *)mem;
296	ke->ke_sched = (struct ke_sched *)&ke[1];
297}
298
299/*
300 * Initialize type-stable parts of a ksegrp (when newly created).
301 */
302static void
303ksegrp_init(void *mem, int size)
304{
305	struct ksegrp	*kg;
306
307	kg = (struct ksegrp *)mem;
308	kg->kg_sched = (struct kg_sched *)&kg[1];
309}
310
311/*
312 * Link a KSE into its KSE group.
313 */
314void
315kse_link(struct kse *ke, struct ksegrp *kg)
316{
317	struct proc *p = kg->kg_proc;
318
319	TAILQ_INSERT_HEAD(&kg->kg_kseq, ke, ke_kglist);
320	kg->kg_kses++;
321	ke->ke_state	= KES_UNQUEUED;
322	ke->ke_proc	= p;
323	ke->ke_ksegrp	= kg;
324	ke->ke_thread	= NULL;
325	ke->ke_oncpu	= NOCPU;
326	ke->ke_flags	= 0;
327}
328
329void
330kse_unlink(struct kse *ke)
331{
332	struct ksegrp *kg;
333
334	mtx_assert(&sched_lock, MA_OWNED);
335	kg = ke->ke_ksegrp;
336	TAILQ_REMOVE(&kg->kg_kseq, ke, ke_kglist);
337	if (ke->ke_state == KES_IDLE) {
338		TAILQ_REMOVE(&kg->kg_iq, ke, ke_kgrlist);
339		kg->kg_idle_kses--;
340	}
341	--kg->kg_kses;
342	/*
343	 * Aggregate stats from the KSE
344	 */
345	kse_stash(ke);
346}
347
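/*
 * Initialize a KSE group and link it into its process.
 */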
348void
349ksegrp_link(struct ksegrp *kg, struct proc *p)
350{
351
352	TAILQ_INIT(&kg->kg_threads);
353	TAILQ_INIT(&kg->kg_runq);	/* links with td_runq */
354	TAILQ_INIT(&kg->kg_slpq);	/* links with td_runq */
355	TAILQ_INIT(&kg->kg_kseq);	/* all kses in ksegrp */
356	TAILQ_INIT(&kg->kg_iq);		/* all idle kses in ksegrp */
357	TAILQ_INIT(&kg->kg_upcalls);	/* all upcall structures in ksegrp */
358	kg->kg_proc = p;
359	/*
360	 * the following counters are in the -zero- section
361	 * and may not need clearing
362	 */
363	kg->kg_numthreads = 0;
364	kg->kg_runnable   = 0;
365	kg->kg_kses       = 0;
366	kg->kg_runq_kses  = 0; /* XXXKSE change name */
367	kg->kg_idle_kses  = 0;
368	kg->kg_numupcalls = 0;
369	/* link it in now that it's consistent */
370	p->p_numksegrps++;
371	TAILQ_INSERT_HEAD(&p->p_ksegrps, kg, kg_ksegrp);
372}
373
374void
375ksegrp_unlink(struct ksegrp *kg)
376{
377	struct proc *p;
378
379	mtx_assert(&sched_lock, MA_OWNED);
380	KASSERT((kg->kg_numthreads == 0), ("ksegrp_unlink: residual threads"));
381	KASSERT((kg->kg_kses == 0), ("ksegrp_unlink: residual kses"));
382	KASSERT((kg->kg_numupcalls == 0), ("ksegrp_unlink: residual upcalls"));
383
384	p = kg->kg_proc;
385	TAILQ_REMOVE(&p->p_ksegrps, kg, kg_ksegrp);
386	p->p_numksegrps--;
387	/*
388	 * Aggregate stats from the KSE
389	 */
390	ksegrp_stash(kg);
391}
392
393/*
394 * For a newly created process,
395 * link up all the structures and its initial thread, etc.
396 */
397void
398proc_linkup(struct proc *p, struct ksegrp *kg,
399	    struct kse *ke, struct thread *td)
400{
401
402	TAILQ_INIT(&p->p_ksegrps);	     /* all ksegrps in proc */
403	TAILQ_INIT(&p->p_threads);	     /* all threads in proc */
404	TAILQ_INIT(&p->p_suspended);	     /* Threads suspended */
405	p->p_numksegrps = 0;
406	p->p_numthreads = 0;
407
408	ksegrp_link(kg, p);
409	kse_link(ke, kg);
410	thread_link(td, kg);
411}
412
413/*
414 * Initialize global thread allocation resources.
415 */
416void
417threadinit(void)
418{
419
420	thread_zone = uma_zcreate("THREAD", sched_sizeof_thread(),
421	    thread_ctor, thread_dtor, thread_init, thread_fini,
422	    UMA_ALIGN_CACHE, 0);
423	tid_zone = uma_zcreate("TID", sizeof(struct tid_bitmap_part),
424	    NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0);
425	ksegrp_zone = uma_zcreate("KSEGRP", sched_sizeof_ksegrp(),
426	    NULL, NULL, ksegrp_init, NULL,
427	    UMA_ALIGN_CACHE, 0);
428	kse_zone = uma_zcreate("KSE", sched_sizeof_kse(),
429	    NULL, NULL, kse_init, NULL,
430	    UMA_ALIGN_CACHE, 0);
431	kseinit();
432}
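/*
 * Note on the UMA hooks registered above: the ctor/dtor pair runs on every
 * allocation and free, while the init/fini pair runs only when items enter
 * or leave the zone's type-stable backing store.  That is why TID
 * assignment and kernel stack setup live in thread_init()/thread_fini()
 * rather than in the ctor/dtor.
 */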
433
434/*
435 * Stash an embarrassingly extra thread into the zombie thread queue.
436 */
437void
438thread_stash(struct thread *td)
439{
440	mtx_lock_spin(&kse_zombie_lock);
441	TAILQ_INSERT_HEAD(&zombie_threads, td, td_runq);
442	mtx_unlock_spin(&kse_zombie_lock);
443}
444
445/*
446 * Stash an embarrassingly extra kse into the zombie kse queue.
447 */
448void
449kse_stash(struct kse *ke)
450{
451	mtx_lock_spin(&kse_zombie_lock);
452	TAILQ_INSERT_HEAD(&zombie_kses, ke, ke_procq);
453	mtx_unlock_spin(&kse_zombie_lock);
454}
455
456/*
457 * Stash an embarrassingly extra ksegrp into the zombie ksegrp queue.
458 */
459void
460ksegrp_stash(struct ksegrp *kg)
461{
462	mtx_lock_spin(&kse_zombie_lock);
463	TAILQ_INSERT_HEAD(&zombie_ksegrps, kg, kg_ksegrp);
464	mtx_unlock_spin(&kse_zombie_lock);
465}
466
467/*
468 * Reap zombie thread, KSE and KSE group resources.
469 */
470void
471thread_reap(void)
472{
473	struct thread *td_first, *td_next;
474	struct kse *ke_first, *ke_next;
475	struct ksegrp *kg_first, *kg_next;
476
477	/*
478	 * Don't even bother to lock if there are none at this instant;
479	 * we really don't care about the next instant.
480	 */
481	if ((!TAILQ_EMPTY(&zombie_threads))
482	    || (!TAILQ_EMPTY(&zombie_kses))
483	    || (!TAILQ_EMPTY(&zombie_ksegrps))) {
484		mtx_lock_spin(&kse_zombie_lock);
485		td_first = TAILQ_FIRST(&zombie_threads);
486		ke_first = TAILQ_FIRST(&zombie_kses);
487		kg_first = TAILQ_FIRST(&zombie_ksegrps);
488		if (td_first)
489			TAILQ_INIT(&zombie_threads);
490		if (ke_first)
491			TAILQ_INIT(&zombie_kses);
492		if (kg_first)
493			TAILQ_INIT(&zombie_ksegrps);
494		mtx_unlock_spin(&kse_zombie_lock);
495		while (td_first) {
496			td_next = TAILQ_NEXT(td_first, td_runq);
497			if (td_first->td_ucred)
498				crfree(td_first->td_ucred);
499			thread_free(td_first);
500			td_first = td_next;
501		}
502		while (ke_first) {
503			ke_next = TAILQ_NEXT(ke_first, ke_procq);
504			kse_free(ke_first);
505			ke_first = ke_next;
506		}
507		while (kg_first) {
508			kg_next = TAILQ_NEXT(kg_first, kg_ksegrp);
509			ksegrp_free(kg_first);
510			kg_first = kg_next;
511		}
512	}
513	kse_GC();
514}
515
516/*
517 * Allocate a ksegrp.
518 */
519struct ksegrp *
520ksegrp_alloc(void)
521{
522	return (uma_zalloc(ksegrp_zone, M_WAITOK));
523}
524
525/*
526 * Allocate a kse.
527 */
528struct kse *
529kse_alloc(void)
530{
531	return (uma_zalloc(kse_zone, M_WAITOK));
532}
533
534/*
535 * Allocate a thread.
536 */
537struct thread *
538thread_alloc(void)
539{
540	thread_reap(); /* check if any zombies to get */
541	return (uma_zalloc(thread_zone, M_WAITOK));
542}
543
544/*
545 * Deallocate a ksegrp.
546 */
547void
548ksegrp_free(struct ksegrp *kg)
549{
550	uma_zfree(ksegrp_zone, kg);
551}
552
553/*
554 * Deallocate a kse.
555 */
556void
557kse_free(struct kse *ke)
558{
559	uma_zfree(kse_zone, ke);
560}
561
562/*
563 * Deallocate a thread.
564 */
565void
566thread_free(struct thread *td)
567{
568
569	cpu_thread_clean(td);
570	uma_zfree(thread_zone, td);
571}
572
573/*
574 * Discard the current thread and exit from its context.
575 * Always called with scheduler locked.
576 *
577 * Because we can't free a thread while we're operating under its context,
578 * push the current thread into our CPU's deadthread holder. This means
579 * we needn't worry about someone else grabbing our context before we
580 * do a cpu_throw().  This may not be needed now as we are under schedlock.
581 * Maybe we can just do a thread_stash() as thr_exit1 does.
582 */
583/*  XXX
584 * libthr expects its thread exit to return for the last
585 * thread, meaning that the program is back to non-threaded
586 * mode I guess. Because we do this (cpu_throw) unconditionally
587 * here, they have their own version of it. (thr_exit1())
588 * that doesn't do it all if this was the last thread.
589 * It is also called from thread_suspend_check().
590 * Of course in the end, they end up coming here through exit1
591 * anyhow.  After fixing 'thr' to play by the rules we should be able
592 * to merge these two functions together.
593 */
594void
595thread_exit(void)
596{
597	struct thread *td;
598	struct kse *ke;
599	struct proc *p;
600	struct ksegrp	*kg;
601
602	td = curthread;
603	kg = td->td_ksegrp;
604	p = td->td_proc;
605	ke = td->td_kse;
606
607	mtx_assert(&sched_lock, MA_OWNED);
608	KASSERT(p != NULL, ("thread exiting without a process"));
609	KASSERT(ke != NULL, ("thread exiting without a kse"));
610	KASSERT(kg != NULL, ("thread exiting without a kse group"));
611	PROC_LOCK_ASSERT(p, MA_OWNED);
612	CTR1(KTR_PROC, "thread_exit: thread %p", td);
613	mtx_assert(&Giant, MA_NOTOWNED);
614
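	/* Recycle the spare (upcall) thread, if we were still holding one. */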
615	if (td->td_standin != NULL) {
616		thread_stash(td->td_standin);
617		td->td_standin = NULL;
618	}
619
620	cpu_thread_exit(td);	/* XXXSMP */
621
622	/*
623	 * The last thread is left attached to the process
624	 * so that the whole bundle gets recycled. Skip
625	 * all this stuff.
626	 */
627	if (p->p_numthreads > 1) {
628		thread_unlink(td);
629		if (p->p_maxthrwaits)
630			wakeup(&p->p_numthreads);
631		/*
632		 * The test below is NOT true if we are the
633		 * sole exiting thread. P_STOPPED_SINGLE is unset
634		 * in exit1() after it is the only survivor.
635		 */
636		if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
637			if (p->p_numthreads == p->p_suspcount) {
638				thread_unsuspend_one(p->p_singlethread);
639			}
640		}
641
642		/*
643		 * Because each upcall structure has an owner thread,
644		 * and the owner thread exits only when the process is
645		 * exiting, no upcall to userland is needed any longer and
646		 * it is safe to delete the upcall structure here. Thus,
647		 * when all threads in a group have exited, all upcalls
648		 * in the group are automatically freed.
649		 */
650		if (td->td_upcall)
651			upcall_remove(td);
652
653		sched_exit_thread(FIRST_THREAD_IN_PROC(p), td);
654		sched_exit_kse(FIRST_KSE_IN_PROC(p), td);
655		ke->ke_state = KES_UNQUEUED;
656		ke->ke_thread = NULL;
657		/*
658		 * Decide what to do with the KSE attached to this thread.
659		 */
660		if (ke->ke_flags & KEF_EXIT) {
661			kse_unlink(ke);
662			if (kg->kg_kses == 0) {
663				sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), td);
664				ksegrp_unlink(kg);
665			}
666		}
667		else
668			kse_reassign(ke);
669		PROC_UNLOCK(p);
670		td->td_kse	= NULL;
671#if 0
672		td->td_proc	= NULL;
673#endif
674		td->td_ksegrp	= NULL;
675		td->td_last_kse	= NULL;
676		PCPU_SET(deadthread, td);
677	} else {
678		PROC_UNLOCK(p);
679	}
680	td->td_state	= TDS_INACTIVE;
681	/* XXX Shouldn't cpu_throw() here. */
682	mtx_assert(&sched_lock, MA_OWNED);
683	cpu_throw(td, choosethread());
684	panic("I'm a teapot!");
685	/* NOTREACHED */
686}
687
688/*
689 * Do any thread-specific cleanups that may be needed in wait().
690 * Called with Giant, proc and schedlock not held.
691 */
692void
693thread_wait(struct proc *p)
694{
695	struct thread *td;
696
697	mtx_assert(&Giant, MA_NOTOWNED);
698	KASSERT((p->p_numthreads == 1), ("Multiple threads in wait1()"));
699	KASSERT((p->p_numksegrps == 1), ("Multiple ksegrps in wait1()"));
700	FOREACH_THREAD_IN_PROC(p, td) {
701		if (td->td_standin != NULL) {
702			thread_free(td->td_standin);
703			td->td_standin = NULL;
704		}
705		cpu_thread_clean(td);
706	}
707	thread_reap();	/* check for zombie threads etc. */
708}
709
710/*
711 * Link a thread to a process.
712 * Set up anything that needs to be initialized for it to
713 * be used by the process.
714 *
715 * Note that we do not link to the proc's ucred here.
716 * The thread is linked as if running but with no KSE assigned.
717 */
718void
719thread_link(struct thread *td, struct ksegrp *kg)
720{
721	struct proc *p;
722
723	p = kg->kg_proc;
724	td->td_state    = TDS_INACTIVE;
725	td->td_proc     = p;
726	td->td_ksegrp   = kg;
727	td->td_last_kse = NULL;
728	td->td_flags    = 0;
729	td->td_kflags	= 0;
730	td->td_kse      = NULL;
731
732	LIST_INIT(&td->td_contested);
733	callout_init(&td->td_slpcallout, CALLOUT_MPSAFE);
734	TAILQ_INSERT_HEAD(&p->p_threads, td, td_plist);
735	TAILQ_INSERT_HEAD(&kg->kg_threads, td, td_kglist);
736	p->p_numthreads++;
737	kg->kg_numthreads++;
738}
739
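/*
 * Remove a thread from its process and ksegrp: the converse of
 * thread_link().  Called with sched_lock held.
 */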
740void
741thread_unlink(struct thread *td)
742{
743	struct proc *p = td->td_proc;
744	struct ksegrp *kg = td->td_ksegrp;
745
746	mtx_assert(&sched_lock, MA_OWNED);
747	TAILQ_REMOVE(&p->p_threads, td, td_plist);
748	p->p_numthreads--;
749	TAILQ_REMOVE(&kg->kg_threads, td, td_kglist);
750	kg->kg_numthreads--;
751	/* could clear a few other things here */
752}
753
754/*
755 * Purge a ksegrp's idle KSEs. Called when the ksegrp is preparing
756 * to exit.
757 */
758void
759kse_purge_group(struct thread *td)
760{
761	struct ksegrp *kg;
762	struct kse *ke;
763
764	kg = td->td_ksegrp;
765 	KASSERT(kg->kg_numthreads == 1, ("%s: bad thread number", __func__));
766	while ((ke = TAILQ_FIRST(&kg->kg_iq)) != NULL) {
767		KASSERT(ke->ke_state == KES_IDLE,
768			("%s: wrong idle KSE state", __func__));
769		kse_unlink(ke);
770	}
771	KASSERT((kg->kg_kses == 1),
772		("%s: ksegrp still has %d KSEs", __func__, kg->kg_kses));
773	KASSERT((kg->kg_numupcalls == 0),
774	        ("%s: ksegrp still has %d upcalls",
775		__func__, kg->kg_numupcalls));
776}
777
778/*
779 * Purge a process's KSE resource. When a process is preparing to
780 * exit, it calls kse_purge to release any extra KSE resources in
781 * the process.
782 */
783void
784kse_purge(struct proc *p, struct thread *td)
785{
786	struct ksegrp *kg;
787	struct kse *ke;
788
789 	KASSERT(p->p_numthreads == 1, ("bad thread number"));
790	while ((kg = TAILQ_FIRST(&p->p_ksegrps)) != NULL) {
791		TAILQ_REMOVE(&p->p_ksegrps, kg, kg_ksegrp);
792		p->p_numksegrps--;
793		/*
794		 * KSEs are not owned by any thread; after all threads
795		 * in the group have exited, some KSEs may be left on the
796		 * idle queue, so GC them now.
797		 */
798		while ((ke = TAILQ_FIRST(&kg->kg_iq)) != NULL) {
799			KASSERT(ke->ke_state == KES_IDLE,
800			   ("%s: wrong idle KSE state", __func__));
801			TAILQ_REMOVE(&kg->kg_iq, ke, ke_kgrlist);
802			kg->kg_idle_kses--;
803			TAILQ_REMOVE(&kg->kg_kseq, ke, ke_kglist);
804			kg->kg_kses--;
805			kse_stash(ke);
806		}
807		KASSERT(((kg->kg_kses == 0) && (kg != td->td_ksegrp)) ||
808		        ((kg->kg_kses == 1) && (kg == td->td_ksegrp)),
809		        ("ksegrp has wrong kg_kses: %d", kg->kg_kses));
810		KASSERT((kg->kg_numupcalls == 0),
811		        ("%s: ksegrp still has %d upcalls",
812			__func__, kg->kg_numupcalls));
813
814		if (kg != td->td_ksegrp)
815			ksegrp_stash(kg);
816	}
817	TAILQ_INSERT_HEAD(&p->p_ksegrps, td->td_ksegrp, kg_ksegrp);
818	p->p_numksegrps++;
819}
820
821/*
822 * Enforce single-threading.
823 *
824 * Returns 1 if the caller must abort (another thread is waiting to
825 * exit the process or similar). Process is locked!
826 * Returns 0 when you are successfully the only thread running.
827 * A process has successfully single threaded in the suspend mode when
828 * there are no threads in user mode. Threads in the kernel must be
829 * allowed to continue until they get to the user boundary. They may even
830 * copy out their return values and data before suspending. They may
831 * however be accelerated in reaching the user boundary as we will wake
832 * up any sleeping threads that are interruptible (PCATCH).
833 */
834int
835thread_single(int force_exit)
836{
837	struct thread *td;
838	struct thread *td2;
839	struct proc *p;
840	int remaining;
841
842	td = curthread;
843	p = td->td_proc;
844	mtx_assert(&Giant, MA_NOTOWNED);
845	PROC_LOCK_ASSERT(p, MA_OWNED);
846	KASSERT((td != NULL), ("curthread is NULL"));
847
848	if ((p->p_flag & P_SA) == 0 && p->p_numthreads == 1)
849		return (0);
850
851	/* Is someone already single threading? */
852	if (p->p_singlethread)
853		return (1);
854
855	if (force_exit == SINGLE_EXIT) {
856		p->p_flag |= P_SINGLE_EXIT;
857	} else
858		p->p_flag &= ~P_SINGLE_EXIT;
859	p->p_flag |= P_STOPPED_SINGLE;
860	mtx_lock_spin(&sched_lock);
861	p->p_singlethread = td;
862	if (force_exit == SINGLE_EXIT)
863		remaining = p->p_numthreads;
864	else
865		remaining = p->p_numthreads - p->p_suspcount;
866	while (remaining != 1) {
867		FOREACH_THREAD_IN_PROC(p, td2) {
868			if (td2 == td)
869				continue;
870			td2->td_flags |= TDF_ASTPENDING;
871			if (TD_IS_INHIBITED(td2)) {
872				if (force_exit == SINGLE_EXIT) {
873					if (td->td_flags & TDF_DBSUSPEND)
874						td->td_flags &= ~TDF_DBSUSPEND;
875					if (TD_IS_SUSPENDED(td2)) {
876						thread_unsuspend_one(td2);
877					}
878					if (TD_ON_SLEEPQ(td2) &&
879					    (td2->td_flags & TDF_SINTR)) {
880						sleepq_abort(td2);
881					}
882				} else {
883					if (TD_IS_SUSPENDED(td2))
884						continue;
885					/*
886					 * Maybe other inhibited states too?
887					 * XXXKSE Is it totally safe to
888					 * suspend a non-interruptible thread?
889					 */
890					if (td2->td_inhibitors &
891					    (TDI_SLEEPING | TDI_SWAPPED))
892						thread_suspend_one(td2);
893				}
894			}
895		}
896		if (force_exit == SINGLE_EXIT)
897			remaining = p->p_numthreads;
898		else
899			remaining = p->p_numthreads - p->p_suspcount;
900
901		/*
902		 * Maybe we suspended some threads; was it enough?
903		 */
904		if (remaining == 1)
905			break;
906
907		/*
908		 * Wake us up when everyone else has suspended.
909		 * In the meantime we suspend as well.
910		 */
911		thread_suspend_one(td);
912		PROC_UNLOCK(p);
913		mi_switch(SW_VOL, NULL);
914		mtx_unlock_spin(&sched_lock);
915		PROC_LOCK(p);
916		mtx_lock_spin(&sched_lock);
917		if (force_exit == SINGLE_EXIT)
918			remaining = p->p_numthreads;
919		else
920			remaining = p->p_numthreads - p->p_suspcount;
921	}
922	if (force_exit == SINGLE_EXIT) {
923		if (td->td_upcall)
924			upcall_remove(td);
925		kse_purge(p, td);
926	}
927	mtx_unlock_spin(&sched_lock);
928	return (0);
929}
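/*
 * Illustrative caller sketch (hypothetical, not taken from this file):
 * a thread that must become the only one in its process, e.g. on the
 * exit path, is expected to do something along these lines:
 *
 *	PROC_LOCK(p);
 *	if (thread_single(SINGLE_EXIT))
 *		...	abort: someone else is already single threading
 *	...		from here on we are the only thread in the process
 */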
930
931/*
932 * Called from locations that can safely check to see
933 * whether we have to suspend or at least throttle for a
934 * single-thread event (e.g. fork).
935 *
936 * Such locations include userret().
937 * If the "return_instead" argument is non-zero, the caller must be able to
938 * accept 0 (caller may continue), or 1 (caller must abort) as a result.
939 *
940 * The 'return_instead' argument tells the function whether it may do a
941 * thread_exit() or suspend, or whether the caller must abort and back
942 * out instead.
943 *
944 * If the thread that set the single_threading request has set the
945 * P_SINGLE_EXIT bit in the process flags then this call will never return
946 * if 'return_instead' is false, but will exit.
947 *
948 * P_SINGLE_EXIT | return_instead == 0| return_instead != 0
949 *---------------+--------------------+---------------------
950 *       0       | returns 0          |   returns 0 or 1
951 *               | when ST ends       |   immediately
952 *---------------+--------------------+---------------------
953 *       1       | thread exits       |   returns 1
954 *               |                    |   immediately
955 * 0 = thread_exit() or suspension ok,
956 * other = return error instead of stopping the thread.
957 *
958 * While a full suspension is in effect, even a single threading
959 * thread would be suspended if it made this call (but it shouldn't).
960 * This call should only be made from places where
961 * thread_exit() would be safe as that may be the outcome unless
962 * return_instead is set.
963 */
964int
965thread_suspend_check(int return_instead)
966{
967	struct thread *td;
968	struct proc *p;
969
970	td = curthread;
971	p = td->td_proc;
972	mtx_assert(&Giant, MA_NOTOWNED);
973	PROC_LOCK_ASSERT(p, MA_OWNED);
974	while (P_SHOULDSTOP(p) ||
975	      ((p->p_flag & P_TRACED) && (td->td_flags & TDF_DBSUSPEND))) {
976		if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
977			KASSERT(p->p_singlethread != NULL,
978			    ("singlethread not set"));
979			/*
980			 * The only suspension in effect is a
981			 * single-threading. The single threader need not stop.
982			 * XXX Should be safe to access unlocked
983			 * as it can only be set by us.
984			 */
985			if (p->p_singlethread == td)
986				return (0);	/* Exempt from stopping. */
987		}
988		if (return_instead)
989			return (1);
990
991		mtx_lock_spin(&sched_lock);
992		thread_stopped(p);
993		/*
994		 * If the process is waiting for us to exit,
995		 * this thread should just suicide.
996		 * Assumes that P_SINGLE_EXIT implies P_STOPPED_SINGLE.
997		 */
998		if ((p->p_flag & P_SINGLE_EXIT) && (p->p_singlethread != td)) {
999			if (p->p_flag & P_SA)
1000				thread_exit();
1001			else
1002				thr_exit1();
1003		}
1004
1005		/*
1006		 * When a thread suspends, it just
1007		 * moves to the process's suspend queue
1008		 * and stays there.
1009		 */
1010		thread_suspend_one(td);
1011		if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
1012			if (p->p_numthreads == p->p_suspcount) {
1013				thread_unsuspend_one(p->p_singlethread);
1014			}
1015		}
1016		PROC_UNLOCK(p);
1017		mi_switch(SW_INVOL, NULL);
1018		mtx_unlock_spin(&sched_lock);
1019		PROC_LOCK(p);
1020	}
1021	return (0);
1022}
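/*
 * Illustrative usage sketch (hypothetical caller): code on its way back
 * to user mode, such as userret(), is expected to poll roughly like:
 *
 *	if (P_SHOULDSTOP(p)) {
 *		PROC_LOCK(p);
 *		thread_suspend_check(0);
 *		PROC_UNLOCK(p);
 *	}
 *
 * With return_instead == 0 the call may suspend here, or never return
 * at all if the process is single threading for exit.
 */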
1023
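/*
 * Mark a thread as suspended and put it on the process's suspend queue.
 * Called with both the proc lock and sched_lock held.
 */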
1024void
1025thread_suspend_one(struct thread *td)
1026{
1027	struct proc *p = td->td_proc;
1028
1029	mtx_assert(&sched_lock, MA_OWNED);
1030	PROC_LOCK_ASSERT(p, MA_OWNED);
1031	KASSERT(!TD_IS_SUSPENDED(td), ("already suspended"));
1032	p->p_suspcount++;
1033	TD_SET_SUSPENDED(td);
1034	TAILQ_INSERT_TAIL(&p->p_suspended, td, td_runq);
1035	/*
1036	 * Hack: If we are suspending but are on the sleep queue
1037	 * then we are in msleep or the cv equivalent. We
1038	 * want to look like we have two inhibitors.
1039	 * It may already be set; that doesn't matter.
1040	 */
1041	if (TD_ON_SLEEPQ(td))
1042		TD_SET_SLEEPING(td);
1043}
1044
1045void
1046thread_unsuspend_one(struct thread *td)
1047{
1048	struct proc *p = td->td_proc;
1049
1050	mtx_assert(&sched_lock, MA_OWNED);
1051	PROC_LOCK_ASSERT(p, MA_OWNED);
1052	TAILQ_REMOVE(&p->p_suspended, td, td_runq);
1053	TD_CLR_SUSPENDED(td);
1054	p->p_suspcount--;
1055	setrunnable(td);
1056}
1057
1058/*
1059 * Allow all threads blocked by single threading to continue running.
1060 */
1061void
1062thread_unsuspend(struct proc *p)
1063{
1064	struct thread *td;
1065
1066	mtx_assert(&sched_lock, MA_OWNED);
1067	PROC_LOCK_ASSERT(p, MA_OWNED);
1068	if (!P_SHOULDSTOP(p)) {
1069		while ((td = TAILQ_FIRST(&p->p_suspended))) {
1070			thread_unsuspend_one(td);
1071		}
1072	} else if ((P_SHOULDSTOP(p) == P_STOPPED_SINGLE) &&
1073	    (p->p_numthreads == p->p_suspcount)) {
1074		/*
1075		 * Stopping everything also did the job for the single
1076		 * threading request. Now that we've downgraded to single-threaded,
1077		 * let it continue.
1078		 */
1079		thread_unsuspend_one(p->p_singlethread);
1080	}
1081}
1082
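/*
 * End a single-threading episode started by thread_single() and allow
 * any suspended siblings to run again, unless the whole process is
 * supposed to be stopped.
 */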
1083void
1084thread_single_end(void)
1085{
1086	struct thread *td;
1087	struct proc *p;
1088
1089	td = curthread;
1090	p = td->td_proc;
1091	PROC_LOCK_ASSERT(p, MA_OWNED);
1092	p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT);
1093	mtx_lock_spin(&sched_lock);
1094	p->p_singlethread = NULL;
1095	/*
1096	 * If there are other threads they may now run,
1097	 * unless of course there is a blanket 'stop order'
1098	 * on the process. The single threader must be allowed
1099	 * to continue however as this is a bad place to stop.
1100	 */
1101	if ((p->p_numthreads != 1) && (!P_SHOULDSTOP(p))) {
1102		while ((td = TAILQ_FIRST(&p->p_suspended))) {
1103			thread_unsuspend_one(td);
1104		}
1105	}
1106	mtx_unlock_spin(&sched_lock);
1107}
1108
1109