1/*
2 * Copyright (C) 2001 Julian Elischer <julian@freebsd.org>.
3 *  All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice(s), this list of conditions and the following disclaimer as
10 *    the first lines of this file unmodified other than the possible
11 *    addition of one or more copyright notices.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice(s), this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY
17 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY
20 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
26 * DAMAGE.
27 *
28 * $FreeBSD: head/sys/kern/kern_thread.c 111595 2003-02-27 05:42:01Z davidxu $
29 */
30
31#include <sys/param.h>
32#include <sys/systm.h>
33#include <sys/kernel.h>
34#include <sys/lock.h>
35#include <sys/malloc.h>
36#include <sys/mutex.h>
37#include <sys/proc.h>
38#include <sys/smp.h>
39#include <sys/sysctl.h>
40#include <sys/sysproto.h>
41#include <sys/filedesc.h>
42#include <sys/sched.h>
43#include <sys/signalvar.h>
44#include <sys/sx.h>
45#include <sys/tty.h>
46#include <sys/user.h>
47#include <sys/jail.h>
48#include <sys/kse.h>
49#include <sys/ktr.h>
50#include <sys/ucontext.h>
51
52#include <vm/vm.h>
53#include <vm/vm_object.h>
54#include <vm/pmap.h>
55#include <vm/uma.h>
56#include <vm/vm_map.h>
57
58#include <machine/frame.h>
59
60/*
61 * KSEGRP related storage.
62 */
63static uma_zone_t ksegrp_zone;
64static uma_zone_t kse_zone;
65static uma_zone_t thread_zone;
66static uma_zone_t upcall_zone;
67
68/* DEBUG ONLY */
69SYSCTL_NODE(_kern, OID_AUTO, threads, CTLFLAG_RW, 0, "thread allocation");
70static int thread_debug = 0;
71SYSCTL_INT(_kern_threads, OID_AUTO, debug, CTLFLAG_RW,
72	&thread_debug, 0, "thread debug");
73
74static int max_threads_per_proc = 30;
75SYSCTL_INT(_kern_threads, OID_AUTO, max_threads_per_proc, CTLFLAG_RW,
76	&max_threads_per_proc, 0, "Limit on threads per proc");
77
78static int max_groups_per_proc = 5;
79SYSCTL_INT(_kern_threads, OID_AUTO, max_groups_per_proc, CTLFLAG_RW,
80	&max_groups_per_proc, 0, "Limit on thread groups per proc");
81
82static int max_threads_hits;
83SYSCTL_INT(_kern_threads, OID_AUTO, max_threads_hits, CTLFLAG_RD,
84	&max_threads_hits, 0, "Number of times the thread-per-process limit was hit");
85
86static int virtual_cpu;
87
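/*
 * RANGEOF() gives the size in bytes of the member range [start, end)
 * of a structure.  It is used below to bzero()/bcopy() only the
 * "zero" and "copy" sections of threads, kses and ksegrps.
 */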
88#define RANGEOF(type, start, end) (offsetof(type, end) - offsetof(type, start))
89
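/*
 * Zombie lists: structures that cannot be freed immediately (for
 * example a thread freeing itself, or structures released under
 * sched_lock) are parked here under kse_zombie_lock and reclaimed
 * later by thread_reap() from a context where freeing is safe.
 */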
90TAILQ_HEAD(, thread) zombie_threads = TAILQ_HEAD_INITIALIZER(zombie_threads);
91TAILQ_HEAD(, kse) zombie_kses = TAILQ_HEAD_INITIALIZER(zombie_kses);
92TAILQ_HEAD(, ksegrp) zombie_ksegrps = TAILQ_HEAD_INITIALIZER(zombie_ksegrps);
93TAILQ_HEAD(, kse_upcall) zombie_upcalls =
94	TAILQ_HEAD_INITIALIZER(zombie_upcalls);
95struct mtx kse_zombie_lock;
96MTX_SYSINIT(kse_zombie_lock, &kse_zombie_lock, "kse zombie lock", MTX_SPIN);
97
98static void kse_purge(struct proc *p, struct thread *td);
99static void kse_purge_group(struct thread *td);
100static int thread_update_usr_ticks(struct thread *td, int user);
101static void thread_alloc_spare(struct thread *td, struct thread *spare);
102
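/*
 * Sysctl handler for kern.threads.virtual_cpu.  It reports the cpu
 * count (or the current override) and accepts a non-negative override
 * that kse_create() uses instead of the physical cpu count when
 * kern.threads.debug is set.
 */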
103static int
104sysctl_kse_virtual_cpu(SYSCTL_HANDLER_ARGS)
105{
106	int error, new_val;
107	int def_val;
108
109#ifdef SMP
110	def_val = mp_ncpus;
111#else
112	def_val = 1;
113#endif
114	if (virtual_cpu == 0)
115		new_val = def_val;
116	else
117		new_val = virtual_cpu;
118	error = sysctl_handle_int(oidp, &new_val, 0, req);
119	if (error != 0 || req->newptr == NULL)
120		return (error);
121	if (new_val < 0)
122		return (EINVAL);
123	virtual_cpu = new_val;
124	return (0);
125}
126
127/* DEBUG ONLY */
128SYSCTL_PROC(_kern_threads, OID_AUTO, virtual_cpu, CTLTYPE_INT|CTLFLAG_RW,
129	0, sizeof(virtual_cpu), sysctl_kse_virtual_cpu, "I",
130	"debug virtual cpus");
131
132/*
133 * Prepare a thread for use.
134 */
135static void
136thread_ctor(void *mem, int size, void *arg)
137{
138	struct thread	*td;
139
140	td = (struct thread *)mem;
141	td->td_state = TDS_INACTIVE;
142}
143
144/*
145 * Reclaim a thread after use.
146 */
147static void
148thread_dtor(void *mem, int size, void *arg)
149{
150	struct thread	*td;
151
152	td = (struct thread *)mem;
153
154#ifdef INVARIANTS
155	/* Verify that this thread is in a safe state to free. */
156	switch (td->td_state) {
157	case TDS_INHIBITED:
158	case TDS_RUNNING:
159	case TDS_CAN_RUN:
160	case TDS_RUNQ:
161		/*
162		 * We must never unlink a thread that is in one of
163		 * these states, because it is currently active.
164		 */
165		panic("bad state for thread unlinking");
166		/* NOTREACHED */
167	case TDS_INACTIVE:
168		break;
169	default:
170		panic("bad thread state");
171		/* NOTREACHED */
172	}
173#endif
174}
175
176/*
177 * Initialize type-stable parts of a thread (when newly created).
178 */
179static void
180thread_init(void *mem, int size)
181{
182	struct thread	*td;
183
184	td = (struct thread *)mem;
185	mtx_lock(&Giant);
186	pmap_new_thread(td, 0);
187	mtx_unlock(&Giant);
188	cpu_thread_setup(td);
189	td->td_sched = (struct td_sched *)&td[1];
190}
191
192/*
193 * Tear down type-stable parts of a thread (just before being discarded).
194 */
195static void
196thread_fini(void *mem, int size)
197{
198	struct thread	*td;
199
200	td = (struct thread *)mem;
201	pmap_dispose_thread(td);
202}
203
204/*
205 * Initialize type-stable parts of a kse (when newly created).
206 */
207static void
208kse_init(void *mem, int size)
209{
210	struct kse	*ke;
211
212	ke = (struct kse *)mem;
213	ke->ke_sched = (struct ke_sched *)&ke[1];
214}
215
216/*
217 * Initialize type-stable parts of a ksegrp (when newly created).
218 */
219static void
220ksegrp_init(void *mem, int size)
221{
222	struct ksegrp	*kg;
223
224	kg = (struct ksegrp *)mem;
225	kg->kg_sched = (struct kg_sched *)&kg[1];
226}
227
228/*
229 * Link a KSE into its KSE group.
230 */
231void
232kse_link(struct kse *ke, struct ksegrp *kg)
233{
234	struct proc *p = kg->kg_proc;
235
236	TAILQ_INSERT_HEAD(&kg->kg_kseq, ke, ke_kglist);
237	kg->kg_kses++;
238	ke->ke_state	= KES_UNQUEUED;
239	ke->ke_proc	= p;
240	ke->ke_ksegrp	= kg;
241	ke->ke_thread	= NULL;
242	ke->ke_oncpu	= NOCPU;
243	ke->ke_flags	= 0;
244}
245
246void
247kse_unlink(struct kse *ke)
248{
249	struct ksegrp *kg;
250
251	mtx_assert(&sched_lock, MA_OWNED);
252	kg = ke->ke_ksegrp;
253	TAILQ_REMOVE(&kg->kg_kseq, ke, ke_kglist);
254	if (ke->ke_state == KES_IDLE) {
255		TAILQ_REMOVE(&kg->kg_iq, ke, ke_kgrlist);
256		kg->kg_idle_kses--;
257	}
258	if (--kg->kg_kses == 0)
259		ksegrp_unlink(kg);
260	/*
261	 * Aggregate stats from the KSE
262	 */
263	kse_stash(ke);
264}
265
266void
267ksegrp_link(struct ksegrp *kg, struct proc *p)
268{
269
270	TAILQ_INIT(&kg->kg_threads);
271	TAILQ_INIT(&kg->kg_runq);	/* links with td_runq */
272	TAILQ_INIT(&kg->kg_slpq);	/* links with td_runq */
273	TAILQ_INIT(&kg->kg_kseq);	/* all kses in ksegrp */
274	TAILQ_INIT(&kg->kg_iq);		/* all idle kses in ksegrp */
275	TAILQ_INIT(&kg->kg_upcalls);	/* all upcall structures in ksegrp */
276	kg->kg_proc = p;
277	/*
278	 * the following counters are in the -zero- section
279	 * and may not need clearing
280	 */
281	kg->kg_numthreads = 0;
282	kg->kg_runnable   = 0;
283	kg->kg_kses       = 0;
284	kg->kg_runq_kses  = 0; /* XXXKSE change name */
285	kg->kg_idle_kses  = 0;
286	kg->kg_numupcalls = 0;
287	/* link it in now that it's consistent */
288	p->p_numksegrps++;
289	TAILQ_INSERT_HEAD(&p->p_ksegrps, kg, kg_ksegrp);
290}
291
292void
293ksegrp_unlink(struct ksegrp *kg)
294{
295	struct proc *p;
296
297	mtx_assert(&sched_lock, MA_OWNED);
298	KASSERT((kg->kg_numthreads == 0), ("ksegrp_unlink: residual threads"));
299	KASSERT((kg->kg_kses == 0), ("ksegrp_unlink: residual kses"));
300	KASSERT((kg->kg_numupcalls == 0), ("ksegrp_unlink: residual upcalls"));
301
302	p = kg->kg_proc;
303	TAILQ_REMOVE(&p->p_ksegrps, kg, kg_ksegrp);
304	p->p_numksegrps--;
305	/*
306	 * Aggregate stats from the ksegrp
307	 */
308	ksegrp_stash(kg);
309}
310
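/*
 * kse_upcall structures are allocated from their own zone and linked
 * onto the owning ksegrp's kg_upcalls list; when the owner goes away
 * they are unlinked and stashed for thread_reap() to free.
 */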
311struct kse_upcall *
312upcall_alloc(void)
313{
314	struct kse_upcall *ku;
315
316	ku = uma_zalloc(upcall_zone, M_WAITOK);
317	bzero(ku, sizeof(*ku));
318	return (ku);
319}
320
321void
322upcall_free(struct kse_upcall *ku)
323{
324
325	uma_zfree(upcall_zone, ku);
326}
327
328void
329upcall_link(struct kse_upcall *ku, struct ksegrp *kg)
330{
331
332	mtx_assert(&sched_lock, MA_OWNED);
333	TAILQ_INSERT_TAIL(&kg->kg_upcalls, ku, ku_link);
334	ku->ku_ksegrp = kg;
335	kg->kg_numupcalls++;
336}
337
338void
339upcall_unlink(struct kse_upcall *ku)
340{
341	struct ksegrp *kg = ku->ku_ksegrp;
342
343	mtx_assert(&sched_lock, MA_OWNED);
344	KASSERT(ku->ku_owner == NULL, ("%s: have owner", __func__));
345	TAILQ_REMOVE(&kg->kg_upcalls, ku, ku_link);
346	kg->kg_numupcalls--;
347	upcall_stash(ku);
348}
349
350void
351upcall_remove(struct thread *td)
352{
353
354	if (td->td_upcall) {
355		td->td_upcall->ku_owner = NULL;
356		upcall_unlink(td->td_upcall);
357		td->td_upcall = NULL;
358	}
359}
360
361/*
362 * For a newly created process,
363 * link up all the structures and its initial threads etc.
364 */
365void
366proc_linkup(struct proc *p, struct ksegrp *kg,
367	    struct kse *ke, struct thread *td)
368{
369
370	TAILQ_INIT(&p->p_ksegrps);	     /* all ksegrps in proc */
371	TAILQ_INIT(&p->p_threads);	     /* all threads in proc */
372	TAILQ_INIT(&p->p_suspended);	     /* Threads suspended */
373	p->p_numksegrps = 0;
374	p->p_numthreads = 0;
375
376	ksegrp_link(kg, p);
377	kse_link(ke, kg);
378	thread_link(td, kg);
379}
380
381/*
382struct kse_thr_interrupt_args {
383	struct kse_thr_mailbox * tmbx;
384};
385*/
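/*
 * kse_thr_interrupt: mark the thread whose mailbox is 'tmbx' as
 * interrupted and abort any interruptible sleep it is in, so the
 * UTS can regain control of it.
 */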
386int
387kse_thr_interrupt(struct thread *td, struct kse_thr_interrupt_args *uap)
388{
389	struct proc *p;
390	struct thread *td2;
391
392	p = td->td_proc;
393	if (!(p->p_flag & P_THREADED) || (uap->tmbx == NULL))
394		return (EINVAL);
395	mtx_lock_spin(&sched_lock);
396	FOREACH_THREAD_IN_PROC(p, td2) {
397		if (td2->td_mailbox == uap->tmbx) {
398			td2->td_flags |= TDF_INTERRUPT;
399			if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR)) {
400				if (td2->td_flags & TDF_CVWAITQ)
401					cv_abort(td2);
402				else
403					abortsleep(td2);
404			}
405			mtx_unlock_spin(&sched_lock);
406			return (0);
407		}
408	}
409	mtx_unlock_spin(&sched_lock);
410	return (ESRCH);
411}
412
413/*
414struct kse_exit_args {
415	register_t dummy;
416};
417*/
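/*
 * kse_exit: called by the UTS when an upcall thread is done.  The last
 * thread in the process turns off KSE mode and keeps running;
 * otherwise the calling thread exits, tearing down its group's KSEs
 * if it was the last thread in the group.
 */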
418int
419kse_exit(struct thread *td, struct kse_exit_args *uap)
420{
421	struct proc *p;
422	struct ksegrp *kg;
423	struct kse *ke;
424
425	p = td->td_proc;
426	/*
427	 * Only the UTS can call this syscall, and the current group
428	 * must be a threaded group.
429	 */
430	if ((td->td_mailbox != NULL) || (td->td_ksegrp->kg_numupcalls == 0))
431		return (EINVAL);
432	KASSERT((td->td_upcall != NULL), ("%s: not own an upcall", __func__));
433
434	kg = td->td_ksegrp;
435	/* Serialize removing upcall */
436	PROC_LOCK(p);
437	mtx_lock_spin(&sched_lock);
438	if ((kg->kg_numupcalls == 1) && (kg->kg_numthreads > 1)) {
439		mtx_unlock_spin(&sched_lock);
440		PROC_UNLOCK(p);
441		return (EDEADLK);
442	}
443	ke = td->td_kse;
444	upcall_remove(td);
445	if (p->p_numthreads == 1) {
446		kse_purge(p, td);
447		p->p_flag &= ~P_THREADED;
448		mtx_unlock_spin(&sched_lock);
449		PROC_UNLOCK(p);
450	} else {
451		if (kg->kg_numthreads == 1) { /* Shutdown a group */
452			kse_purge_group(td);
453			ke->ke_flags |= KEF_EXIT;
454		}
455		thread_exit();
456		/* NOTREACHED */
457	}
458	return (0);
459}
460
461/*
462 * Either becomes an upcall or waits for an awakening event and
463 * then becomes an upcall. Only error cases return.
464 */
465/*
466struct kse_release_args {
467	struct timespec *timeout;
468};
469*/
470int
471kse_release(struct thread *td, struct kse_release_args *uap)
472{
473	struct proc *p;
474	struct ksegrp *kg;
475	struct timespec ts, ts2, ts3, timeout;
476	struct timeval tv;
477	int error;
478
479	p = td->td_proc;
480	kg = td->td_ksegrp;
481	/*
482	 * Only the UTS can call this syscall, and the current group
483	 * must be a threaded group.
484	 */
485	if ((td->td_mailbox != NULL) || (td->td_ksegrp->kg_numupcalls == 0))
486		return (EINVAL);
487	KASSERT((td->td_upcall != NULL), ("%s: not own an upcall", __func__));
488	if (uap->timeout != NULL) {
489		if ((error = copyin(uap->timeout, &timeout, sizeof(timeout))))
490			return (error);
491		getnanouptime(&ts);
492		timespecadd(&ts, &timeout);
493		TIMESPEC_TO_TIMEVAL(&tv, &timeout);
494	}
495	mtx_lock_spin(&sched_lock);
496	/* Change OURSELVES to become an upcall thread. */
497	td->td_flags = TDF_UPCALLING;
498	if (p->p_sflag & PS_NEEDSIGCHK)
499		td->td_flags |= TDF_ASTPENDING;
500	mtx_unlock_spin(&sched_lock);
501	PROC_LOCK(p);
502	while ((td->td_upcall->ku_flags & KUF_DOUPCALL) == 0 &&
503	       (kg->kg_completed == NULL)) {
504		kg->kg_upsleeps++;
505		error = msleep(&kg->kg_completed, &p->p_mtx, PPAUSE|PCATCH,
506			"kse_rel", (uap->timeout ? tvtohz(&tv) : 0));
507		kg->kg_upsleeps--;
508		PROC_UNLOCK(p);
509		if (uap->timeout == NULL || error != EWOULDBLOCK)
510			return (0);
511		getnanouptime(&ts2);
512		if (timespeccmp(&ts2, &ts, >=))
513			return (0);
514		ts3 = ts;
515		timespecsub(&ts3, &ts2);
516		TIMESPEC_TO_TIMEVAL(&tv, &ts3);
517		PROC_LOCK(p);
518	}
519	PROC_UNLOCK(p);
520	return (0);
521}
522
523/* struct kse_wakeup_args {
524	struct kse_mailbox *mbx;
525}; */
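/*
 * kse_wakeup: find the upcall structure matching the given mailbox (or
 * one in the caller's group if mbx is NULL) and make sure an upcall
 * happens, either by waking a sleeping upcall owner or by setting
 * KUF_DOUPCALL.
 */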
526int
527kse_wakeup(struct thread *td, struct kse_wakeup_args *uap)
528{
529	struct proc *p;
530	struct ksegrp *kg;
531	struct kse_upcall *ku;
532	struct thread *td2;
533
534	p = td->td_proc;
535	td2 = NULL;
536	ku = NULL;
537	/* KSE-enabled processes only, please. */
538	if (!(p->p_flag & P_THREADED))
539		return (EINVAL);
540	PROC_LOCK(p);
541	mtx_lock_spin(&sched_lock);
542	if (uap->mbx) {
543		FOREACH_KSEGRP_IN_PROC(p, kg) {
544			FOREACH_UPCALL_IN_GROUP(kg, ku) {
545				if (ku->ku_mailbox == uap->mbx)
546					break;
547			}
548			if (ku)
549				break;
550		}
551	} else {
552		kg = td->td_ksegrp;
553		if (kg->kg_upsleeps) {
554			wakeup_one(&kg->kg_completed);
555			mtx_unlock_spin(&sched_lock);
556			PROC_UNLOCK(p);
557			return (0);
558		}
559		ku = TAILQ_FIRST(&kg->kg_upcalls);
560	}
561	if (ku) {
562		if ((td2 = ku->ku_owner) == NULL) {
563			panic("%s: no owner", __func__);
564		} else if (TD_ON_SLEEPQ(td2) &&
565		           (td2->td_wchan == &kg->kg_completed)) {
566			abortsleep(td2);
567		} else {
568			ku->ku_flags |= KUF_DOUPCALL;
569		}
570		mtx_unlock_spin(&sched_lock);
571		PROC_UNLOCK(p);
572		return (0);
573	}
574	mtx_unlock_spin(&sched_lock);
575	PROC_UNLOCK(p);
576	return (ESRCH);
577}
578
579/*
580 * No new KSEGRP and first call: use the current KSE, don't schedule an upcall.
581 * In all other situations, allocate new KSEs (up to the max) and schedule an upcall.
582 */
583/* struct kse_create_args {
584	struct kse_mailbox *mbx;
585	int newgroup;
586}; */
587int
588kse_create(struct thread *td, struct kse_create_args *uap)
589{
590	struct kse *newke;
591	struct ksegrp *newkg;
592	struct ksegrp *kg;
593	struct proc *p;
594	struct kse_mailbox mbx;
595	struct kse_upcall *newku;
596	int err, ncpus;
597
598	p = td->td_proc;
599	if ((err = copyin(uap->mbx, &mbx, sizeof(mbx))))
600		return (err);
601
602	/* Too bad the kernel doesn't always have a cpu counter available. */
603#ifdef SMP
604	ncpus = mp_ncpus;
605#else
606	ncpus = 1;
607#endif
608	if (thread_debug && virtual_cpu != 0)
609		ncpus = virtual_cpu;
610
611	/* Easier to just set it than to test and set */
612	p->p_flag |= P_THREADED;
613	kg = td->td_ksegrp;
614	if (uap->newgroup) {
615		/* There is a race condition here, but it is cheap. */
616		if (p->p_numksegrps >= max_groups_per_proc)
617			return (EPROCLIM);
618		/*
619		 * If we want a new KSEGRP it doesn't matter whether
620		 * we have already fired up KSE mode before or not.
621		 * We put the process in KSE mode and create a new KSEGRP.
622		 */
623		newkg = ksegrp_alloc();
624		bzero(&newkg->kg_startzero, RANGEOF(struct ksegrp,
625		      kg_startzero, kg_endzero));
626		bcopy(&kg->kg_startcopy, &newkg->kg_startcopy,
627		      RANGEOF(struct ksegrp, kg_startcopy, kg_endcopy));
628		mtx_lock_spin(&sched_lock);
629		ksegrp_link(newkg, p);
630		if (p->p_numksegrps >= max_groups_per_proc) {
631			ksegrp_unlink(newkg);
632			mtx_unlock_spin(&sched_lock);
633			return (EPROCLIM);
634		}
635		mtx_unlock_spin(&sched_lock);
636	} else {
637		newkg = kg;
638	}
639
640	/*
641	 * Creating more upcalls than the number of physical cpus does
642	 * not help performance.
643	 */
644	if (newkg->kg_numupcalls >= ncpus)
645		return (EPROCLIM);
646
647	if (newkg->kg_numupcalls == 0) {
648		/*
649		 * Initialize the KSE group, optimized for MP.
650		 * Create as many KSEs as there are physical cpus; this
651		 * increases concurrency even if userland is not MP safe and
652		 * can only run on a single CPU (true for early versions of
653		 * libpthread).  In an ideal world, every physical cpu would
654		 * execute a thread.  If there are enough KSEs, threads in
655		 * the kernel can run in parallel on different cpus at full
656		 * speed; concurrency in the kernel shouldn't be restricted
657		 * by the number of upcalls userland provides.
658		 * Adding more upcall structures only increases concurrency
659		 * in userland.
660		 * The highest performance configuration is:
661		 * N kses = N upcalls = N physical cpus
662		 */
663		while (newkg->kg_kses < ncpus) {
664			newke = kse_alloc();
665			bzero(&newke->ke_startzero, RANGEOF(struct kse,
666			      ke_startzero, ke_endzero));
667#if 0
668			mtx_lock_spin(&sched_lock);
669			bcopy(&ke->ke_startcopy, &newke->ke_startcopy,
670			      RANGEOF(struct kse, ke_startcopy, ke_endcopy));
671			mtx_unlock_spin(&sched_lock);
672#endif
673			mtx_lock_spin(&sched_lock);
674			kse_link(newke, newkg);
675			/* Add engine */
676			kse_reassign(newke);
677			mtx_unlock_spin(&sched_lock);
678		}
679	}
680	newku = upcall_alloc();
681	newku->ku_mailbox = uap->mbx;
682	newku->ku_func = mbx.km_func;
683	bcopy(&mbx.km_stack, &newku->ku_stack, sizeof(stack_t));
684
685	/* For the first call this may not have been set */
686	if (td->td_standin == NULL)
687		thread_alloc_spare(td, NULL);
688
689	mtx_lock_spin(&sched_lock);
690	if (newkg->kg_numupcalls >= ncpus) {
691		mtx_unlock_spin(&sched_lock);
692		upcall_free(newku);
693		return (EPROCLIM);
694	}
695	upcall_link(newku, newkg);
696
697	/*
698	 * Each upcall structure has an owner thread, find which
699	 * one owns it.
700	 */
701	if (uap->newgroup) {
702		/*
703		 * Because the new ksegrp has no thread yet,
704		 * create an initial upcall thread to own it.
705		 */
706		thread_schedule_upcall(td, newku);
707	} else {
708		/*
709		 * If the current thread doesn't have an upcall structure,
710		 * just assign this upcall to it.
711		 */
712		if (td->td_upcall == NULL) {
713			newku->ku_owner = td;
714			td->td_upcall = newku;
715		} else {
716			/*
717			 * Create a new upcall thread to own it.
718			 */
719			thread_schedule_upcall(td, newku);
720		}
721	}
722	mtx_unlock_spin(&sched_lock);
723	return (0);
724}
725
726/*
727 * Fill a ucontext_t with a thread's context information.
728 *
729 * This is an analogue to getcontext(3).
730 */
731void
732thread_getcontext(struct thread *td, ucontext_t *uc)
733{
734
735/*
736 * XXX this is declared in a MD include file, i386/include/ucontext.h but
737 * is used in MI code.
738 */
739#ifdef __i386__
740	get_mcontext(td, &uc->uc_mcontext);
741#endif
742	uc->uc_sigmask = td->td_proc->p_sigmask;
743}
744
745/*
746 * Set a thread's context from a ucontext_t.
747 *
748 * This is an analogue to setcontext(3).
749 */
750int
751thread_setcontext(struct thread *td, ucontext_t *uc)
752{
753	int ret;
754
755/*
756 * XXX this is declared in a MD include file, i386/include/ucontext.h but
757 * is used in MI code.
758 */
759#ifdef __i386__
760	ret = set_mcontext(td, &uc->uc_mcontext);
761#else
762	ret = ENOSYS;
763#endif
764	if (ret == 0) {
765		SIG_CANTMASK(uc->uc_sigmask);
766		PROC_LOCK(td->td_proc);
767		td->td_proc->p_sigmask = uc->uc_sigmask;
768		PROC_UNLOCK(td->td_proc);
769	}
770	return (ret);
771}
772
773/*
774 * Initialize global thread allocation resources.
775 */
776void
777threadinit(void)
778{
779
780#ifndef __ia64__
781	thread_zone = uma_zcreate("THREAD", sched_sizeof_thread(),
782	    thread_ctor, thread_dtor, thread_init, thread_fini,
783	    UMA_ALIGN_CACHE, 0);
784#else
785	/*
786	 * XXX the ia64 kstack allocator is really lame and is at the mercy
787	 * of contigmalloc().  This hackery is to pre-construct a whole
788	 * pile of thread structures with associated kernel stacks early
789	 * in the system startup while contigmalloc() still works. Once we
790	 * have them, keep them.  Sigh.
791	 */
792	thread_zone = uma_zcreate("THREAD", sched_sizeof_thread(),
793	    thread_ctor, thread_dtor, thread_init, thread_fini,
794	    UMA_ALIGN_CACHE, UMA_ZONE_NOFREE);
795	uma_prealloc(thread_zone, 512);		/* XXX arbitrary */
796#endif
797	ksegrp_zone = uma_zcreate("KSEGRP", sched_sizeof_ksegrp(),
798	    NULL, NULL, ksegrp_init, NULL,
799	    UMA_ALIGN_CACHE, 0);
800	kse_zone = uma_zcreate("KSE", sched_sizeof_kse(),
801	    NULL, NULL, kse_init, NULL,
802	    UMA_ALIGN_CACHE, 0);
803	upcall_zone = uma_zcreate("UPCALL", sizeof(struct kse_upcall),
804	    NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0);
805}
806
807/*
808 * Stash an embarrassingly extra thread into the zombie thread queue.
809 */
810void
811thread_stash(struct thread *td)
812{
813	mtx_lock_spin(&kse_zombie_lock);
814	TAILQ_INSERT_HEAD(&zombie_threads, td, td_runq);
815	mtx_unlock_spin(&kse_zombie_lock);
816}
817
818/*
819 * Stash an embarrassingly extra kse into the zombie kse queue.
820 */
821void
822kse_stash(struct kse *ke)
823{
824	mtx_lock_spin(&kse_zombie_lock);
825	TAILQ_INSERT_HEAD(&zombie_kses, ke, ke_procq);
826	mtx_unlock_spin(&kse_zombie_lock);
827}
828
829/*
830 * Stash an embarrassingly extra upcall into the zombie upcall queue.
831 */
832
833void
834upcall_stash(struct kse_upcall *ku)
835{
836	mtx_lock_spin(&kse_zombie_lock);
837	TAILQ_INSERT_HEAD(&zombie_upcalls, ku, ku_link);
838	mtx_unlock_spin(&kse_zombie_lock);
839}
840
841/*
842 * Stash an embarrassingly extra ksegrp into the zombie ksegrp queue.
843 */
844void
845ksegrp_stash(struct ksegrp *kg)
846{
847	mtx_lock_spin(&kse_zombie_lock);
848	TAILQ_INSERT_HEAD(&zombie_ksegrps, kg, kg_ksegrp);
849	mtx_unlock_spin(&kse_zombie_lock);
850}
851
852/*
853 * Reap zombie thread, kse, ksegrp and upcall resources.
854 */
855void
856thread_reap(void)
857{
858	struct thread *td_first, *td_next;
859	struct kse *ke_first, *ke_next;
860	struct ksegrp *kg_first, * kg_next;
861	struct kse_upcall *ku_first, *ku_next;
862
863	/*
864	 * Don't even bother to lock if none at this instant,
865	 * we really don't care about the next instant..
866	 */
867	if ((!TAILQ_EMPTY(&zombie_threads))
868	    || (!TAILQ_EMPTY(&zombie_kses))
869	    || (!TAILQ_EMPTY(&zombie_ksegrps))
870	    || (!TAILQ_EMPTY(&zombie_upcalls))) {
871		mtx_lock_spin(&kse_zombie_lock);
872		td_first = TAILQ_FIRST(&zombie_threads);
873		ke_first = TAILQ_FIRST(&zombie_kses);
874		kg_first = TAILQ_FIRST(&zombie_ksegrps);
875		ku_first = TAILQ_FIRST(&zombie_upcalls);
876		if (td_first)
877			TAILQ_INIT(&zombie_threads);
878		if (ke_first)
879			TAILQ_INIT(&zombie_kses);
880		if (kg_first)
881			TAILQ_INIT(&zombie_ksegrps);
882		if (ku_first)
883			TAILQ_INIT(&zombie_upcalls);
884		mtx_unlock_spin(&kse_zombie_lock);
885		while (td_first) {
886			td_next = TAILQ_NEXT(td_first, td_runq);
887			if (td_first->td_ucred)
888				crfree(td_first->td_ucred);
889			thread_free(td_first);
890			td_first = td_next;
891		}
892		while (ke_first) {
893			ke_next = TAILQ_NEXT(ke_first, ke_procq);
894			kse_free(ke_first);
895			ke_first = ke_next;
896		}
897		while (kg_first) {
898			kg_next = TAILQ_NEXT(kg_first, kg_ksegrp);
899			ksegrp_free(kg_first);
900			kg_first = kg_next;
901		}
902		while (ku_first) {
903			ku_next = TAILQ_NEXT(ku_first, ku_link);
904			upcall_free(ku_first);
905			ku_first = ku_next;
906		}
907	}
908}
909
910/*
911 * Allocate a ksegrp.
912 */
913struct ksegrp *
914ksegrp_alloc(void)
915{
916	return (uma_zalloc(ksegrp_zone, M_WAITOK));
917}
918
919/*
920 * Allocate a kse.
921 */
922struct kse *
923kse_alloc(void)
924{
925	return (uma_zalloc(kse_zone, M_WAITOK));
926}
927
928/*
929 * Allocate a thread.
930 */
931struct thread *
932thread_alloc(void)
933{
934	thread_reap(); /* check if any zombies to get */
935	return (uma_zalloc(thread_zone, M_WAITOK));
936}
937
938/*
939 * Deallocate a ksegrp.
940 */
941void
942ksegrp_free(struct ksegrp *kg)
943{
944	uma_zfree(ksegrp_zone, kg);
945}
946
947/*
948 * Deallocate a kse.
949 */
950void
951kse_free(struct kse *ke)
952{
953	uma_zfree(kse_zone, ke);
954}
955
956/*
957 * Deallocate a thread.
958 */
959void
960thread_free(struct thread *td)
961{
962
963	cpu_thread_clean(td);
964	uma_zfree(thread_zone, td);
965}
966
967/*
968 * Store the thread context in the UTS's mailbox, then add the mailbox
969 * to the head of a list we are building in user space.
970 * The list is anchored in the ksegrp structure.
971 */
972int
973thread_export_context(struct thread *td)
974{
975	struct proc *p;
976	struct ksegrp *kg;
977	uintptr_t mbx;
978	void *addr;
979	int error, temp;
980	ucontext_t uc;
981
982	p = td->td_proc;
983	kg = td->td_ksegrp;
984
985	/* Export the user/machine context. */
986	addr = (void *)(&td->td_mailbox->tm_context);
987	error = copyin(addr, &uc, sizeof(ucontext_t));
988	if (error)
989		goto bad;
990
991	thread_getcontext(td, &uc);
992	error = copyout(&uc, addr, sizeof(ucontext_t));
993	if (error)
994		goto bad;
995
996	/* Export clock ticks spent in kernel mode. */
997	addr = (caddr_t)(&td->td_mailbox->tm_sticks);
998	temp = fuword(addr) + td->td_usticks;
999	if (suword(addr, temp))
1000		goto bad;
1001
1002	addr = (caddr_t)(&td->td_mailbox->tm_slices);
1003	temp = fuword(addr) - td->td_usticks;
1004	if (suword(addr, temp))
1005		goto bad;
1006
1007	/* Get address in latest mbox of list pointer */
1008	addr = (void *)(&td->td_mailbox->tm_next);
1009	/*
1010	 * Put the saved address of the previous first
1011	 * entry into this one
1012	 */
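	/*
	 * The stores to userland are done without holding the proc lock
	 * (they may fault), so re-check under the lock that kg_completed
	 * has not changed; if it has, loop and write the new head again.
	 */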
1013	for (;;) {
1014		mbx = (uintptr_t)kg->kg_completed;
1015		if (suword(addr, mbx)) {
1016			error = EFAULT;
1017			goto bad;
1018		}
1019		PROC_LOCK(p);
1020		if (mbx == (uintptr_t)kg->kg_completed) {
1021			kg->kg_completed = td->td_mailbox;
1022			/*
1023			 * The thread context may be taken away by
1024			 * other upcall threads when we unlock the
1025			 * process lock; it is no longer valid to
1026			 * use it anywhere else.
1027			 */
1028			td->td_mailbox = NULL;
1029			PROC_UNLOCK(p);
1030			break;
1031		}
1032		PROC_UNLOCK(p);
1033	}
1034	td->td_usticks = 0;
1035	return (0);
1036
1037bad:
1038	PROC_LOCK(p);
1039	psignal(p, SIGSEGV);
1040	PROC_UNLOCK(p);
1041	/* The mailbox is bad, don't use it */
1042	td->td_mailbox = NULL;
1043	td->td_usticks = 0;
1044	return (error);
1045}
1046
1047/*
1048 * Take the list of completed mailboxes for this KSEGRP and put them on this
1049 * upcall's mailbox as it's the next one going up.
1050 */
1051static int
1052thread_link_mboxes(struct ksegrp *kg, struct kse_upcall *ku)
1053{
1054	struct proc *p = kg->kg_proc;
1055	void *addr;
1056	uintptr_t mbx;
1057
1058	addr = (void *)(&ku->ku_mailbox->km_completed);
1059	for (;;) {
1060		mbx = (uintptr_t)kg->kg_completed;
1061		if (suword(addr, mbx)) {
1062			PROC_LOCK(p);
1063			psignal(p, SIGSEGV);
1064			PROC_UNLOCK(p);
1065			return (EFAULT);
1066		}
1067		PROC_LOCK(p);
1068		if (mbx == (uintptr_t)kg->kg_completed) {
1069			kg->kg_completed = NULL;
1070			PROC_UNLOCK(p);
1071			break;
1072		}
1073		PROC_UNLOCK(p);
1074	}
1075	return (0);
1076}
1077
1078/*
1079 * This function should be called at statclock interrupt time
1080 */
1081int
1082thread_statclock(int user)
1083{
1084	struct thread *td = curthread;
1085
1086	if (td->td_ksegrp->kg_numupcalls == 0)
1087		return (-1);
1088	if (user) {
1089		/* Currently always done via ast(). */
1090		td->td_flags |= (TDF_USTATCLOCK|TDF_ASTPENDING);
1091		td->td_uuticks++;
1092	} else {
1093		if (td->td_mailbox != NULL)
1094			td->td_usticks++;
1095		else {
1096			/* XXXKSE
1097			 * We will call thread_user_enter() for every
1098			 * kernel entry in the future, so if the thread mailbox
1099			 * is NULL, it must be the UTS; don't account
1100			 * clock ticks for it.
1101			 */
1102		}
1103	}
1104	return (0);
1105}
1106
1107/*
1108 * Export stat clock ticks to userland.
1109 */
1110static int
1111thread_update_usr_ticks(struct thread *td, int user)
1112{
1113	struct proc *p = td->td_proc;
1114	struct kse_thr_mailbox *tmbx;
1115	struct kse_upcall *ku;
1116	caddr_t addr;
1117	uint uticks;
1118	int slices;
1119
1120	if ((ku = td->td_upcall) == NULL)
1121		return (-1);
1122
1123	tmbx = (void *)fuword((void *)&ku->ku_mailbox->km_curthread);
1124	if ((tmbx == NULL) || (tmbx == (void *)-1))
1125		return (-1);
1126	if (user) {
1127		uticks = td->td_uuticks;
1128		td->td_uuticks = 0;
1129		addr = (caddr_t)&tmbx->tm_uticks;
1130	} else {
1131		uticks = td->td_usticks;
1132		td->td_usticks = 0;
1133		addr = (caddr_t)&tmbx->tm_sticks;
1134	}
1135	if (uticks) {
1136		if (suword(addr, uticks+fuword(addr))) {
1137			PROC_LOCK(p);
1138			psignal(p, SIGSEGV);
1139			PROC_UNLOCK(p);
1140			return (-2);
1141		}
1142		addr = (caddr_t)&tmbx->tm_slices;
1143		slices = (int)fuword(addr);
1144		if (slices > 0) {
1145			slices -= (int)uticks;
1146			if (suword(addr, slices)) {
1147				PROC_LOCK(p);
1148				psignal(p, SIGSEGV);
1149				PROC_UNLOCK(p);
1150				return (-2);
1151			}
1152			if (slices <= 0) {
1153				mtx_lock_spin(&sched_lock);
1154				td->td_upcall->ku_flags |= KUF_DOUPCALL;
1155				mtx_unlock_spin(&sched_lock);
1156			}
1157		}
1158	}
1159	return (0);
1160}
1161
1162/*
1163 * Discard the current thread and exit from its context.
1164 *
1165 * Because we can't free a thread while we're operating under its context,
1166 * push the current thread into our CPU's deadthread holder. This means
1167 * we needn't worry about someone else grabbing our context before we
1168 * do a cpu_throw().
1169 */
1170void
1171thread_exit(void)
1172{
1173	struct thread *td;
1174	struct kse *ke;
1175	struct proc *p;
1176	struct ksegrp	*kg;
1177
1178	td = curthread;
1179	kg = td->td_ksegrp;
1180	p = td->td_proc;
1181	ke = td->td_kse;
1182
1183	mtx_assert(&sched_lock, MA_OWNED);
1184	KASSERT(p != NULL, ("thread exiting without a process"));
1185	KASSERT(ke != NULL, ("thread exiting without a kse"));
1186	KASSERT(kg != NULL, ("thread exiting without a kse group"));
1187	PROC_LOCK_ASSERT(p, MA_OWNED);
1188	CTR1(KTR_PROC, "thread_exit: thread %p", td);
1189	KASSERT(!mtx_owned(&Giant), ("dying thread owns giant"));
1190
1191	if (td->td_standin != NULL) {
1192		thread_stash(td->td_standin);
1193		td->td_standin = NULL;
1194	}
1195
1196	cpu_thread_exit(td);	/* XXXSMP */
1197
1198	/*
1199	 * The last thread is left attached to the process
1200	 * so that the whole bundle gets recycled. Skip
1201	 * all this stuff.
1202	 */
1203	if (p->p_numthreads > 1) {
1204		/*
1205		 * Unlink this thread from its proc and the kseg.
1206		 * In keeping with the other structs we probably should
1207		 * have a thread_unlink() that does some of this but it
1208		 * would only be called from here (I think) so it would
1209		 * be a waste. (might be useful for proc_fini() as well.)
1210 		 */
1211		TAILQ_REMOVE(&p->p_threads, td, td_plist);
1212		p->p_numthreads--;
1213		TAILQ_REMOVE(&kg->kg_threads, td, td_kglist);
1214		kg->kg_numthreads--;
1215		if (p->p_maxthrwaits)
1216			wakeup(&p->p_numthreads);
1217		/*
1218		 * The test below is NOT true if we are the
1219		 * sole exiting thread. P_STOPPED_SINGLE is unset
1220		 * in exit1() after it is the only survivor.
1221		 */
1222		if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
1223			if (p->p_numthreads == p->p_suspcount) {
1224				thread_unsuspend_one(p->p_singlethread);
1225			}
1226		}
1227
1228		/*
1229		 * Because each upcall structure has an owner thread,
1230		 * and the owner thread exits only when the process is
1231		 * exiting, an upcall to userland is no longer needed and
1232		 * deleting the upcall structure is safe here.
1233		 * So when all threads in a group have exited, all upcalls
1234		 * in the group should be automatically freed.
1235		 */
1236		if (td->td_upcall)
1237			upcall_remove(td);
1238
1239		ke->ke_state = KES_UNQUEUED;
1240		ke->ke_thread = NULL;
1241		/*
1242		 * Decide what to do with the KSE attached to this thread.
1243		 */
1244		if (ke->ke_flags & KEF_EXIT)
1245			kse_unlink(ke);
1246		else
1247			kse_reassign(ke);
1248		PROC_UNLOCK(p);
1249		td->td_kse	= NULL;
1250		td->td_state	= TDS_INACTIVE;
1251		td->td_proc	= NULL;
1252		td->td_ksegrp	= NULL;
1253		td->td_last_kse	= NULL;
1254		PCPU_SET(deadthread, td);
1255	} else {
1256		PROC_UNLOCK(p);
1257	}
1258	cpu_throw();
1259	/* NOTREACHED */
1260}
1261
1262/*
1263 * Do any thread specific cleanups that may be needed in wait().
1264 * Called with Giant held; the proc lock and sched_lock are not held.
1265 */
1266void
1267thread_wait(struct proc *p)
1268{
1269	struct thread *td;
1270
1271	KASSERT((p->p_numthreads == 1), ("Multiple threads in wait1()"));
1272	KASSERT((p->p_numksegrps == 1), ("Multiple ksegrps in wait1()"));
1273	FOREACH_THREAD_IN_PROC(p, td) {
1274		if (td->td_standin != NULL) {
1275			thread_free(td->td_standin);
1276			td->td_standin = NULL;
1277		}
1278		cpu_thread_clean(td);
1279	}
1280	thread_reap();	/* check for zombie threads etc. */
1281}
1282
1283/*
1284 * Link a thread to a process.
1285 * Set up anything that needs to be initialized for it to
1286 * be used by the process.
1287 *
1288 * Note that we do not link to the proc's ucred here.
1289 * The thread is linked as if running but no KSE assigned.
1290 */
1291void
1292thread_link(struct thread *td, struct ksegrp *kg)
1293{
1294	struct proc *p;
1295
1296	p = kg->kg_proc;
1297	td->td_state    = TDS_INACTIVE;
1298	td->td_proc     = p;
1299	td->td_ksegrp   = kg;
1300	td->td_last_kse = NULL;
1301	td->td_flags    = 0;
1302	td->td_kse      = NULL;
1303
1304	LIST_INIT(&td->td_contested);
1305	callout_init(&td->td_slpcallout, 1);
1306	TAILQ_INSERT_HEAD(&p->p_threads, td, td_plist);
1307	TAILQ_INSERT_HEAD(&kg->kg_threads, td, td_kglist);
1308	p->p_numthreads++;
1309	kg->kg_numthreads++;
1310}
1311
1312/*
1313 * Purge a ksegrp resource. When a ksegrp is preparing to
1314 * exit, it calls this function.
1315 */
1316void
1317kse_purge_group(struct thread *td)
1318{
1319	struct ksegrp *kg;
1320	struct kse *ke;
1321
1322	kg = td->td_ksegrp;
1323 	KASSERT(kg->kg_numthreads == 1, ("%s: bad thread number", __func__));
1324	while ((ke = TAILQ_FIRST(&kg->kg_iq)) != NULL) {
1325		KASSERT(ke->ke_state == KES_IDLE,
1326			("%s: wrong idle KSE state", __func__));
1327		kse_unlink(ke);
1328	}
1329	KASSERT((kg->kg_kses == 1),
1330		("%s: ksegrp still has %d KSEs", __func__, kg->kg_kses));
1331	KASSERT((kg->kg_numupcalls == 0),
1332	        ("%s: ksegrp still has %d upcall structures",
1333		__func__, kg->kg_numupcalls));
1334}
1335
1336/*
1337 * Purge a process's KSE resource. When a process is preparing to
1338 * exit, it calls kse_purge to release any extra KSE resources in
1339 * the process.
1340 */
1341void
1342kse_purge(struct proc *p, struct thread *td)
1343{
1344	struct ksegrp *kg;
1345	struct kse *ke;
1346
1347 	KASSERT(p->p_numthreads == 1, ("bad thread number"));
1348	mtx_lock_spin(&sched_lock);
1349	while ((kg = TAILQ_FIRST(&p->p_ksegrps)) != NULL) {
1350		TAILQ_REMOVE(&p->p_ksegrps, kg, kg_ksegrp);
1351		p->p_numksegrps--;
1352		/*
1353		 * There is no ownership for KSEs; after all threads
1354		 * in the group have exited, some KSEs may have been
1355		 * left on the idle queue, so gc them now.
1356		 */
1357		while ((ke = TAILQ_FIRST(&kg->kg_iq)) != NULL) {
1358			KASSERT(ke->ke_state == KES_IDLE,
1359			   ("%s: wrong idle KSE state", __func__));
1360			TAILQ_REMOVE(&kg->kg_iq, ke, ke_kgrlist);
1361			kg->kg_idle_kses--;
1362			TAILQ_REMOVE(&kg->kg_kseq, ke, ke_kglist);
1363			kg->kg_kses--;
1364			kse_stash(ke);
1365		}
1366		KASSERT(((kg->kg_kses == 0) && (kg != td->td_ksegrp)) ||
1367		        ((kg->kg_kses == 1) && (kg == td->td_ksegrp)),
1368		        ("ksegrp has wrong kg_kses: %d", kg->kg_kses));
1369		KASSERT((kg->kg_numupcalls == 0),
1370		        ("%s: ksegrp still has %d upcall structures",
1371			__func__, kg->kg_numupcalls));
1372
1373		if (kg != td->td_ksegrp)
1374			ksegrp_stash(kg);
1375	}
1376	TAILQ_INSERT_HEAD(&p->p_ksegrps, td->td_ksegrp, kg_ksegrp);
1377	p->p_numksegrps++;
1378	mtx_unlock_spin(&sched_lock);
1379}
1380
1381/*
1382 * This function is intended to be used to initialize a spare thread
1383 * for an upcall. Initialize the thread's large data area outside
1384 * sched_lock for thread_schedule_upcall().
1385 */
1386void
1387thread_alloc_spare(struct thread *td, struct thread *spare)
1388{
1389	if (td->td_standin)
1390		return;
1391	if (spare == NULL)
1392		spare = thread_alloc();
1393	td->td_standin = spare;
1394	bzero(&spare->td_startzero,
1395	    (unsigned)RANGEOF(struct thread, td_startzero, td_endzero));
1396	spare->td_proc = td->td_proc;
1397	/* Setup PCB and fork address */
1398	cpu_set_upcall(spare, td->td_pcb);
1399	/*
1400	 * XXXKSE do we really need this? (default values for the
1401	 * frame).
1402	 */
1403	bcopy(td->td_frame, spare->td_frame, sizeof(struct trapframe));
1404	spare->td_ucred = crhold(td->td_ucred);
1405}
1406
1407/*
1408 * Create a thread and schedule it for upcall on the KSE given.
1409 * Use our thread's standin so that we don't have to allocate one.
1410 */
1411struct thread *
1412thread_schedule_upcall(struct thread *td, struct kse_upcall *ku)
1413{
1414	struct thread *td2;
1415
1416	mtx_assert(&sched_lock, MA_OWNED);
1417
1418	/*
1419	 * Schedule an upcall thread on the specified kse_upcall;
1420	 * the kse_upcall must be free.
1421	 * td must have a spare thread.
1422	 */
1423	KASSERT(ku->ku_owner == NULL, ("%s: upcall has owner", __func__));
1424	if ((td2 = td->td_standin) != NULL) {
1425		td->td_standin = NULL;
1426	} else {
1427		panic("no reserve thread when scheduling an upcall");
1428		return (NULL);
1429	}
1430	CTR3(KTR_PROC, "thread_schedule_upcall: thread %p (pid %d, %s)",
1431	     td2, td->td_proc->p_pid, td->td_proc->p_comm);
1432	bcopy(&td->td_startcopy, &td2->td_startcopy,
1433	    (unsigned) RANGEOF(struct thread, td_startcopy, td_endcopy));
1434	thread_link(td2, ku->ku_ksegrp);
1435	/* Let the new thread become owner of the upcall */
1436	ku->ku_owner   = td2;
1437	td2->td_upcall = ku;
1438	td2->td_flags  = TDF_UPCALLING;
1439	if (td->td_proc->p_sflag & PS_NEEDSIGCHK)
1440		td2->td_flags |= TDF_ASTPENDING;
1441	td2->td_kse    = NULL;
1442	td2->td_state  = TDS_CAN_RUN;
1443	td2->td_inhibitors = 0;
1444	setrunqueue(td2);
1445	return (td2);	/* bogus.. should be a void function */
1446}
1447
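/*
 * Add a caught signal to the km_sigscaught set in the upcall mailbox
 * so the UTS can see it.  If the mailbox cannot be updated the process
 * is killed with SIGILL.
 */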
1448void
1449thread_signal_add(struct thread *td, int sig)
1450{
1451	struct kse_upcall *ku;
1452	struct proc *p;
1453	sigset_t ss;
1454	int error;
1455
1456	PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
1457	td = curthread;
1458	ku = td->td_upcall;
1459	p = td->td_proc;
1460
1461	PROC_UNLOCK(p);
1462	error = copyin(&ku->ku_mailbox->km_sigscaught, &ss, sizeof(sigset_t));
1463	if (error)
1464		goto error;
1465
1466	SIGADDSET(ss, sig);
1467
1468	error = copyout(&ss, &ku->ku_mailbox->km_sigscaught, sizeof(sigset_t));
1469	if (error)
1470		goto error;
1471
1472	PROC_LOCK(p);
1473	return;
1474error:
1475	PROC_LOCK(p);
1476	sigexit(td, SIGILL);
1477}
1478
1479
1480/*
1481 * Schedule an upcall to notify a KSE process that it received signals.
1482 *
1483 */
1484void
1485thread_signal_upcall(struct thread *td)
1486{
1487	mtx_lock_spin(&sched_lock);
1488	td->td_flags |= TDF_UPCALLING;
1489	mtx_unlock_spin(&sched_lock);
1490
1491	return;
1492}
1493
1494/*
1495 * Setup done on the thread when it enters the kernel.
1496 * XXXKSE Presently only for syscalls but eventually all kernel entries.
1497 */
1498void
1499thread_user_enter(struct proc *p, struct thread *td)
1500{
1501	struct ksegrp *kg;
1502	struct kse_upcall *ku;
1503
1504	kg = td->td_ksegrp;
1505	/*
1506	 * First check that we shouldn't just abort.
1507	 * But check if we are the single thread first!
1508	 * XXX p_singlethread not locked, but should be safe.
1509	 */
1510	if ((p->p_flag & P_SINGLE_EXIT) && (p->p_singlethread != td)) {
1511		PROC_LOCK(p);
1512		mtx_lock_spin(&sched_lock);
1513		thread_exit();
1514		/* NOTREACHED */
1515	}
1516
1517	/*
1518	 * If we are doing a syscall in a KSE environment,
1519	 * note where our mailbox is. There is always the
1520	 * possibility that we could do this lazily (in kse_reassign()),
1521	 * but for now do it every time.
1522	 */
1523	kg = td->td_ksegrp;
1524	if (kg->kg_numupcalls) {
1525		ku = td->td_upcall;
1526		KASSERT(ku, ("%s: no upcall owned", __func__));
1527		KASSERT((ku->ku_owner == td), ("%s: wrong owner", __func__));
1528		td->td_mailbox =
1529		    (void *)fuword((void *)&ku->ku_mailbox->km_curthread);
1530		if ((td->td_mailbox == NULL) ||
1531		    (td->td_mailbox == (void *)-1)) {
1532		    	/* Don't schedule upcall when blocked */
1533			td->td_mailbox = NULL;
1534			mtx_lock_spin(&sched_lock);
1535			td->td_flags &= ~TDF_CAN_UNBIND;
1536			mtx_unlock_spin(&sched_lock);
1537		} else {
1538			if (td->td_standin == NULL)
1539				thread_alloc_spare(td, NULL);
1540			mtx_lock_spin(&sched_lock);
1541			td->td_flags |= TDF_CAN_UNBIND;
1542			mtx_unlock_spin(&sched_lock);
1543		}
1544	}
1545}
1546
1547/*
1548 * The extra work we go through if we are a threaded process when we
1549 * return to userland.
1550 *
1551 * If we are a KSE process and returning to user mode, check for
1552 * extra work to do before we return (e.g. for more syscalls
1553 * to complete first).  If we were in a critical section, we should
1554 * just return to let it finish. Same if we were in the UTS (in
1555 * which case the mailbox's context's busy indicator will be set).
1556 * The only traps we support will have set the mailbox.
1557 * We will clear it here.
1558 */
1559int
1560thread_userret(struct thread *td, struct trapframe *frame)
1561{
1562	int error = 0, upcalls;
1563	struct kse_upcall *ku;
1564	struct ksegrp *kg, *kg2;
1565	struct proc *p;
1566	struct timespec ts;
1567
1568	p = td->td_proc;
1569	kg = td->td_ksegrp;
1570
1571	/* Nothing to do with non-threaded group/process */
1572	if (td->td_ksegrp->kg_numupcalls == 0)
1573		return (0);
1574
1575	/*
1576	 * A stat clock interrupt hit in userland and we are
1577	 * returning from the interrupt; export the thread's
1578	 * userland ticks to the UTS.
1579	 */
1580	if (td->td_flags & TDF_USTATCLOCK) {
1581		thread_update_usr_ticks(td, 1);
1582		mtx_lock_spin(&sched_lock);
1583		td->td_flags &= ~TDF_USTATCLOCK;
1584		mtx_unlock_spin(&sched_lock);
1585		if (kg->kg_completed ||
1586		    (td->td_upcall->ku_flags & KUF_DOUPCALL))
1587			thread_user_enter(p, td);
1588	}
1589
1590	/*
1591	 * Optimisation:
1592	 * This thread has not started any upcall.
1593	 * If there is no work to report other than ourselves,
1594	 * then it can return directly to userland.
1595	 */
1596	if (TD_CAN_UNBIND(td)) {
1597		mtx_lock_spin(&sched_lock);
1598		td->td_flags &= ~TDF_CAN_UNBIND;
1599		mtx_unlock_spin(&sched_lock);
1600		if ((kg->kg_completed == NULL) &&
1601		    (td->td_upcall->ku_flags & KUF_DOUPCALL) == 0) {
1602			thread_update_usr_ticks(td, 0);
1603			if (!(kg->kg_completed ||
1604			    (td->td_upcall->ku_flags & KUF_DOUPCALL))) {
1605				td->td_mailbox = NULL;
1606				return (0);
1607			}
1608		}
1609		error = thread_export_context(td);
1610		if (error) {
1611			/*
1612			 * Failing to do the KSE operation just defaults
1613			 * back to synchronous operation, so just return from
1614			 * the syscall.
1615			 */
1616			return (0);
1617		}
1618		/*
1619		 * There is something to report, and we own an upcall
1620		 * structure, so we can go to userland.
1621		 * Turn ourselves into an upcall thread.
1622		 */
1623		mtx_lock_spin(&sched_lock);
1624		td->td_flags |= TDF_UPCALLING;
1625		mtx_unlock_spin(&sched_lock);
1626	} else if (td->td_mailbox) {
1627		error = thread_export_context(td);
1628		if (error) {
1629			PROC_LOCK(td->td_proc);
1630			mtx_lock_spin(&sched_lock);
1631			/* possibly upcall with error? */
1632		} else {
1633			PROC_LOCK(td->td_proc);
1634			mtx_lock_spin(&sched_lock);
1635			/*
1636			 * There are upcall threads waiting for
1637			 * work to do, wake one of them up.
1638			 * XXXKSE Maybe wake all of them up.
1639			 */
1640			if (kg->kg_upsleeps)
1641				wakeup_one(&kg->kg_completed);
1642		}
1643		thread_exit();
1644		/* NOTREACHED */
1645	}
1646
1647	KASSERT(TD_CAN_UNBIND(td) == 0, ("can unbind"));
1648
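	/*
	 * Throttle excessive thread creation: if the process has more
	 * threads than max_threads_per_proc, sleep until enough of them
	 * exit, unless the process is stopping or the number of upcalls
	 * alone already reaches the limit.
	 */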
1649	if (p->p_numthreads > max_threads_per_proc) {
1650		max_threads_hits++;
1651		PROC_LOCK(p);
1652		while (p->p_numthreads > max_threads_per_proc) {
1653			if (P_SHOULDSTOP(p))
1654				break;
1655			upcalls = 0;
1656			mtx_lock_spin(&sched_lock);
1657			FOREACH_KSEGRP_IN_PROC(p, kg2) {
1658				if (kg2->kg_numupcalls == 0)
1659					upcalls++;
1660				else
1661					upcalls += kg2->kg_numupcalls;
1662			}
1663			mtx_unlock_spin(&sched_lock);
1664			if (upcalls >= max_threads_per_proc)
1665				break;
1666			p->p_maxthrwaits++;
1667			msleep(&p->p_numthreads, &p->p_mtx, PPAUSE|PCATCH,
1668			    "maxthreads", NULL);
1669			p->p_maxthrwaits--;
1670		}
1671		PROC_UNLOCK(p);
1672	}
1673
1674	if (td->td_flags & TDF_UPCALLING) {
1675		ku = td->td_upcall;
1676		/*
1677		 * There is no more work to do and we are going to ride
1678		 * this thread up to userland as an upcall.
1679		 * Do the last parts of the setup needed for the upcall.
1680		 */
1681		CTR3(KTR_PROC, "userret: upcall thread %p (pid %d, %s)",
1682		    td, td->td_proc->p_pid, td->td_proc->p_comm);
1683
1684		/*
1685		 * Set user context to the UTS.
1686		 * Will use Giant in cpu_thread_clean() because it uses
1687		 * kmem_free(kernel_map, ...)
1688		 */
1689		cpu_set_upcall_kse(td, ku);
1690		mtx_lock_spin(&sched_lock);
1691		td->td_flags &= ~TDF_UPCALLING;
1692		if (ku->ku_flags & KUF_DOUPCALL)
1693			ku->ku_flags &= ~KUF_DOUPCALL;
1694		mtx_unlock_spin(&sched_lock);
1695
1696		/*
1697		 * Unhook the list of completed threads.
1698		 * Anything that completes after this gets to
1699		 * come in next time.
1700		 * Put the list of completed thread mailboxes on
1701		 * this KSE's mailbox.
1702		 */
1703		error = thread_link_mboxes(kg, ku);
1704		if (error)
1705			goto out;
1706
1707		/*
1708		 * Set state and clear the thread mailbox pointer.
1709		 * From now on we are just a bound outgoing process.
1710		 * **Problem** userret is often called several times.
1711		 * It would be nice if this all happened only on the first
1712		 * time through. (the scan for extra work etc.)
1713		 */
1714		error = suword((caddr_t)&ku->ku_mailbox->km_curthread, 0);
1715		if (error)
1716			goto out;
1717
1718		/* Export current system time */
1719		nanotime(&ts);
1720		error = copyout(&ts, (caddr_t)&ku->ku_mailbox->km_timeofday,
1721			sizeof(ts));
1722	}
1723
1724out:
1725	if (error) {
1726		/*
1727		 * Things are going to be so screwed we should just kill
1728		 * the process.
1729		 * How do we do that?
1730		 */
1731		PROC_LOCK(td->td_proc);
1732		psignal(td->td_proc, SIGSEGV);
1733		PROC_UNLOCK(td->td_proc);
1734	} else {
1735		/*
1736		 * Optimisation:
1737		 * Ensure that we have a spare thread available,
1738		 * for when we re-enter the kernel.
1739		 */
1740		if (td->td_standin == NULL)
1741			thread_alloc_spare(td, NULL);
1742	}
1743
1744	/*
1745	 * Clear thread mailbox first, then clear system tick count.
1746	 * The order is important because thread_statclock() uses the
1747	 * mailbox pointer to see if it is a userland thread or
1748	 * a UTS kernel thread.
1749	 */
1750	td->td_mailbox = NULL;
1751	td->td_usticks = 0;
1752	return (error);	/* go sync */
1753}
1754
1755/*
1756 * Enforce single-threading.
1757 *
1758 * Returns 1 if the caller must abort (another thread is waiting to
1759 * exit the process or similar). Process is locked!
1760 * Returns 0 when you are successfully the only thread running.
1761 * A process has successfully single threaded in suspend mode when
1762 * there are no threads in user mode. Threads in the kernel must be
1763 * allowed to continue until they get to the user boundary. They may even
1764 * copy out their return values and data before suspending. They may however
1765 * be accelerated in reaching the user boundary as we will wake up
1766 * any sleeping threads that are interruptible (PCATCH).
1767 */
1768int
1769thread_single(int force_exit)
1770{
1771	struct thread *td;
1772	struct thread *td2;
1773	struct proc *p;
1774
1775	td = curthread;
1776	p = td->td_proc;
1777	mtx_assert(&Giant, MA_OWNED);
1778	PROC_LOCK_ASSERT(p, MA_OWNED);
1779	KASSERT((td != NULL), ("curthread is NULL"));
1780
1781	if ((p->p_flag & P_THREADED) == 0)
1782		return (0);
1783
1784	/* Is someone already single threading? */
1785	if (p->p_singlethread)
1786		return (1);
1787
1788	if (force_exit == SINGLE_EXIT) {
1789		p->p_flag |= P_SINGLE_EXIT;
1790	} else
1791		p->p_flag &= ~P_SINGLE_EXIT;
1792	p->p_flag |= P_STOPPED_SINGLE;
1793	p->p_singlethread = td;
1794	/* XXXKSE Which lock protects the below values? */
1795	while ((p->p_numthreads - p->p_suspcount) != 1) {
1796		mtx_lock_spin(&sched_lock);
1797		FOREACH_THREAD_IN_PROC(p, td2) {
1798			if (td2 == td)
1799				continue;
1800			td2->td_flags |= TDF_ASTPENDING;
1801			if (TD_IS_INHIBITED(td2)) {
1802				if (force_exit == SINGLE_EXIT) {
1803					if (TD_IS_SUSPENDED(td2)) {
1804						thread_unsuspend_one(td2);
1805					}
1806					if (TD_ON_SLEEPQ(td2) &&
1807					    (td2->td_flags & TDF_SINTR)) {
1808						if (td2->td_flags & TDF_CVWAITQ)
1809							cv_abort(td2);
1810						else
1811							abortsleep(td2);
1812					}
1813				} else {
1814					if (TD_IS_SUSPENDED(td2))
1815						continue;
1816					/*
1817					 * Maybe other inhibited states too?
1818					 * XXXKSE Is it totally safe to
1819					 * suspend a non-interruptible thread?
1820					 */
1821					if (td2->td_inhibitors &
1822					    (TDI_SLEEPING | TDI_SWAPPED))
1823						thread_suspend_one(td2);
1824				}
1825			}
1826		}
1827		/*
1828		 * Maybe we suspended some threads.. was it enough?
1829		 */
1830		if ((p->p_numthreads - p->p_suspcount) == 1) {
1831			mtx_unlock_spin(&sched_lock);
1832			break;
1833		}
1834
1835		/*
1836		 * Wake us up when everyone else has suspended.
1837		 * In the mean time we suspend as well.
1838		 */
1839		thread_suspend_one(td);
1840		mtx_unlock(&Giant);
1841		PROC_UNLOCK(p);
1842		p->p_stats->p_ru.ru_nvcsw++;
1843		mi_switch();
1844		mtx_unlock_spin(&sched_lock);
1845		mtx_lock(&Giant);
1846		PROC_LOCK(p);
1847	}
1848	if (force_exit == SINGLE_EXIT) {
1849		if (td->td_upcall) {
1850			mtx_lock_spin(&sched_lock);
1851			upcall_remove(td);
1852			mtx_unlock_spin(&sched_lock);
1853		}
1854		kse_purge(p, td);
1855	}
1856	return (0);
1857}
1858
1859/*
1860 * Called in from locations that can safely check to see
1861 * whether we have to suspend or at least throttle for a
1862 * single-thread event (e.g. fork).
1863 *
1864 * Such locations include userret().
1865 * If the "return_instead" argument is non zero, the thread must be able to
1866 * accept 0 (caller may continue), or 1 (caller must abort) as a result.
1867 *
1868 * The 'return_instead' argument tells the function if it may do a
1869 * thread_exit() or suspend, or whether the caller must abort and back
1870 * out instead.
1871 *
1872 * If the thread that set the single_threading request has set the
1873 * P_SINGLE_EXIT bit in the process flags then this call will never return
1874 * if 'return_instead' is false, but will exit.
1875 *
1876 * P_SINGLE_EXIT | return_instead == 0| return_instead != 0
1877 *---------------+--------------------+---------------------
1878 *       0       | returns 0          |   returns 0 or 1
1879 *               | when ST ends       |   immediately
1880 *---------------+--------------------+---------------------
1881 *       1       | thread exits       |   returns 1
1882 *               |                    |  immediately
1883 * 0 = thread_exit() or suspension ok,
1884 * other = return error instead of stopping the thread.
1885 *
1886 * While a full suspension is under effect, even a single threading
1887 * thread would be suspended if it made this call (but it shouldn't).
1888 * This call should only be made from places where
1889 * thread_exit() would be safe as that may be the outcome unless
1890 * return_instead is set.
1891 */
1892int
1893thread_suspend_check(int return_instead)
1894{
1895	struct thread *td;
1896	struct proc *p;
1897	struct ksegrp *kg;
1898
1899	td = curthread;
1900	p = td->td_proc;
1901	kg = td->td_ksegrp;
1902	PROC_LOCK_ASSERT(p, MA_OWNED);
1903	while (P_SHOULDSTOP(p)) {
1904		if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
1905			KASSERT(p->p_singlethread != NULL,
1906			    ("singlethread not set"));
1907			/*
1908			 * The only suspension in action is a
1909			 * single-threading. Single threader need not stop.
1910			 * XXX Should be safe to access unlocked
1911			 * as it can only be set to be true by us.
1912			 */
1913			if (p->p_singlethread == td)
1914				return (0);	/* Exempt from stopping. */
1915		}
1916		if (return_instead)
1917			return (1);
1918
1919		/*
1920		 * If the process is waiting for us to exit,
1921		 * this thread should just suicide.
1922		 * Assumes that P_SINGLE_EXIT implies P_STOPPED_SINGLE.
1923		 */
1924		if ((p->p_flag & P_SINGLE_EXIT) && (p->p_singlethread != td)) {
1925			mtx_lock_spin(&sched_lock);
1926			while (mtx_owned(&Giant))
1927				mtx_unlock(&Giant);
1928			thread_exit();
1929		}
1930
1931		/*
1932		 * When a thread suspends, it just
1933		 * moves to the process's suspend queue
1934		 * and stays there.
1935		 */
1936		mtx_lock_spin(&sched_lock);
1937		if ((p->p_flag & P_STOPPED_SIG) &&
1938		    (p->p_suspcount+1 == p->p_numthreads)) {
1939			mtx_unlock_spin(&sched_lock);
1940			PROC_LOCK(p->p_pptr);
1941			if ((p->p_pptr->p_procsig->ps_flag &
1942				PS_NOCLDSTOP) == 0) {
1943				psignal(p->p_pptr, SIGCHLD);
1944			}
1945			PROC_UNLOCK(p->p_pptr);
1946			mtx_lock_spin(&sched_lock);
1947		}
1948		mtx_assert(&Giant, MA_NOTOWNED);
1949		thread_suspend_one(td);
1950		PROC_UNLOCK(p);
1951		if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
1952			if (p->p_numthreads == p->p_suspcount) {
1953				thread_unsuspend_one(p->p_singlethread);
1954			}
1955		}
1956		p->p_stats->p_ru.ru_nivcsw++;
1957		mi_switch();
1958		mtx_unlock_spin(&sched_lock);
1959		PROC_LOCK(p);
1960	}
1961	return (0);
1962}
1963
1964void
1965thread_suspend_one(struct thread *td)
1966{
1967	struct proc *p = td->td_proc;
1968
1969	mtx_assert(&sched_lock, MA_OWNED);
1970	p->p_suspcount++;
1971	TD_SET_SUSPENDED(td);
1972	TAILQ_INSERT_TAIL(&p->p_suspended, td, td_runq);
1973	/*
1974	 * Hack: If we are suspending but are on the sleep queue
1975	 * then we are in msleep or the cv equivalent. We
1976	 * want to look like we have two Inhibitors.
1977	 * May already be set.. doesn't matter.
1978	 */
1979	if (TD_ON_SLEEPQ(td))
1980		TD_SET_SLEEPING(td);
1981}
1982
1983void
1984thread_unsuspend_one(struct thread *td)
1985{
1986	struct proc *p = td->td_proc;
1987
1988	mtx_assert(&sched_lock, MA_OWNED);
1989	TAILQ_REMOVE(&p->p_suspended, td, td_runq);
1990	TD_CLR_SUSPENDED(td);
1991	p->p_suspcount--;
1992	setrunnable(td);
1993}
1994
1995/*
1996 * Allow all threads blocked by single threading to continue running.
1997 */
1998void
1999thread_unsuspend(struct proc *p)
2000{
2001	struct thread *td;
2002
2003	mtx_assert(&sched_lock, MA_OWNED);
2004	PROC_LOCK_ASSERT(p, MA_OWNED);
2005	if (!P_SHOULDSTOP(p)) {
2006		while (( td = TAILQ_FIRST(&p->p_suspended))) {
2007			thread_unsuspend_one(td);
2008		}
2009	} else if ((P_SHOULDSTOP(p) == P_STOPPED_SINGLE) &&
2010	    (p->p_numthreads == p->p_suspcount)) {
2011		/*
2012		 * Stopping everything also did the job for the single
2013		 * threading request. Now we've downgraded to single-threaded,
2014		 * let it continue.
2015		 */
2016		thread_unsuspend_one(p->p_singlethread);
2017	}
2018}
2019
2020void
2021thread_single_end(void)
2022{
2023	struct thread *td;
2024	struct proc *p;
2025
2026	td = curthread;
2027	p = td->td_proc;
2028	PROC_LOCK_ASSERT(p, MA_OWNED);
2029	p->p_flag &= ~P_STOPPED_SINGLE;
2030	p->p_singlethread = NULL;
2031	/*
2032	 * If there are other threads they may now run,
2033	 * unless of course there is a blanket 'stop order'
2034	 * on the process. The single threader must be allowed
2035	 * to continue however as this is a bad place to stop.
2036	 */
2037	if ((p->p_numthreads != 1) && (!P_SHOULDSTOP(p))) {
2038		mtx_lock_spin(&sched_lock);
2039		while (( td = TAILQ_FIRST(&p->p_suspended))) {
2040			thread_unsuspend_one(td);
2041		}
2042		mtx_unlock_spin(&sched_lock);
2043	}
2044}
2045