kern_switch.c revision 163709
/*-
 * Copyright (c) 2001 Jake Burkholder <jake@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#ifdef KSE
/***
Here is the logic:

If there are N processors, then there are at most N KSEs (kernel
schedulable entities) working to process threads that belong to a
KSEGROUP (kg). If there are X of these KSEs actually running at the
moment in question, then there are at most M (N-X) of these KSEs on
the run queue, as running KSEs are not on the queue.

Runnable threads are queued off the KSEGROUP in priority order.
If there are M or more threads runnable, the top M threads
(by priority) are 'preassigned' to the M KSEs not running. The KSEs take
their priority from those threads and are put on the run queue.

The last thread that had a priority high enough to have a KSE associated
with it, AND IS ON THE RUN QUEUE, is pointed to by
kg->kg_last_assigned. If no threads queued off the KSEGROUP have KSEs
assigned, either because all the available KSEs are actively running or
because there are no threads queued, that pointer is NULL.

When a KSE is removed from the run queue to become runnable, we know
it was associated with the highest priority thread in the queue (at the head
of the queue). If it is also the last assigned, we know M was 1 and must
now be 0. Since the thread is no longer queued, that pointer must be
removed from it. Since we know there were no more KSEs available
(M was 1 and is now 0), and since we are not FREEING our KSE
but using it, we know there are STILL no more KSEs available; we can prove
that the next thread in the ksegrp list will not have a KSE to assign to
it, so we can show that the pointer must be made 'invalid' (NULL).

The pointer exists so that when a new thread is made runnable, it can
have its priority compared with the last assigned thread to see if
it should 'steal' its KSE or not, i.e. whether it is 'earlier'
on the list than that thread or later. If it's earlier, then the KSE is
removed from the last assigned (which is now not assigned a KSE)
and reassigned to the new thread, which is placed earlier in the list.
The pointer is then backed up to the previous thread (which may or may not
be the new thread).

When a thread sleeps or is removed, the KSE becomes available and if there
are queued threads that are not assigned KSEs, the highest priority one of
them is assigned the KSE, which is then placed back on the run queue at
the appropriate place, and the kg->kg_last_assigned pointer is adjusted down
to point to it.

The following diagram shows 2 KSEs and 3 threads from a single process.

 RUNQ: --->KSE---KSE--...    (KSEs queued at priorities from threads)
              \    \____
               \        \
    KSEGROUP---thread--thread--thread    (queued in priority order)
        \                 /
         \_______________/
          (last_assigned)

The result of this scheme is that the M available KSEs are always
queued at the priorities they have inherited from the M highest priority
threads for that KSEGROUP. If this situation changes, the KSEs are
reassigned to keep this true.
***/
#endif
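/*
 * Illustrative sketch only: a condensed view of the 'steal' case described
 * above, roughly as setrunqueue() below implements it (the real code also
 * tracks kg_avail_opennings and re-queues the threads involved):
 *
 *	tda = kg->kg_last_assigned;
 *	if (tda != NULL && tda->td_priority > td->td_priority) {
 *		sched_rem(tda);			(take the KSE/slot back)
 *		kg->kg_last_assigned =
 *		    TAILQ_PREV(tda, threadqueue, td_runq);
 *		sched_add(td, flags);		(hand it to the new thread)
 *	}
 */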

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/kern_switch.c 163709 2006-10-26 21:42:22Z jb $");

#include "opt_sched.h"

#ifndef KERN_SWITCH_INCLUDE
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/sched.h>
#else  /* KERN_SWITCH_INCLUDE */
#if defined(SMP) && (defined(__i386__) || defined(__amd64__))
#include <sys/smp.h>
#endif
#if defined(SMP) && defined(SCHED_4BSD)
#include <sys/sysctl.h>
#endif

/* Uncomment this to enable logging of critical_enter/exit. */
#if 0
#define	KTR_CRITICAL	KTR_SCHED
#else
#define	KTR_CRITICAL	0
#endif

#ifdef FULL_PREEMPTION
#ifndef PREEMPTION
#error "The FULL_PREEMPTION option requires the PREEMPTION option"
#endif
#endif

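/*
 * The run-queue status bitmap must provide exactly one bit per queue:
 * RQB_LEN words of RQB_BPW bits each have to cover the RQ_NQS queues.
 */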
CTASSERT((RQB_BPW * RQB_LEN) == RQ_NQS);

#define td_kse td_sched

/*
 * kern.sched.preemption allows user space to determine if preemption support
 * is compiled in or not.  It is not currently a boot or runtime flag that
 * can be changed.
 */
#ifdef PREEMPTION
static int kern_sched_preemption = 1;
#else
static int kern_sched_preemption = 0;
#endif
SYSCTL_INT(_kern_sched, OID_AUTO, preemption, CTLFLAG_RD,
    &kern_sched_preemption, 0, "Kernel preemption enabled");
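/* Readable from user space as, e.g., `sysctl kern.sched.preemption`. */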

/************************************************************************
 * Functions that manipulate runnability from a thread perspective.	*
 ************************************************************************/
#ifdef KSE
/*
 * Select the KSE that will be run next.  From that find the thread, and
 * remove it from the KSEGRP's run queue.  If there is thread clustering,
 * this will be what does it.
 */
#else
/*
 * Select the thread that will be run next.
 */
#endif
struct thread *
choosethread(void)
{
#ifdef KSE
	struct kse *ke;
#endif
	struct thread *td;
#ifdef KSE
	struct ksegrp *kg;
#endif

#if defined(SMP) && (defined(__i386__) || defined(__amd64__))
	if (smp_active == 0 && PCPU_GET(cpuid) != 0) {
		/* Shutting down, run idlethread on AP's */
		td = PCPU_GET(idlethread);
#ifdef KSE
		ke = td->td_kse;
#endif
		CTR1(KTR_RUNQ, "choosethread: td=%p (idle)", td);
#ifdef KSE
		ke->ke_flags |= KEF_DIDRUN;
#else
		td->td_kse->ke_flags |= KEF_DIDRUN;
#endif
		TD_SET_RUNNING(td);
		return (td);
	}
#endif

retry:
#ifdef KSE
	ke = sched_choose();
	if (ke) {
		td = ke->ke_thread;
		KASSERT((td->td_kse == ke), ("kse/thread mismatch"));
		kg = ke->ke_ksegrp;
		if (td->td_proc->p_flag & P_HADTHREADS) {
			if (kg->kg_last_assigned == td) {
				kg->kg_last_assigned = TAILQ_PREV(td,
				    threadqueue, td_runq);
			}
			TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
		}
#else
	td = sched_choose();
	if (td) {
#endif
		CTR2(KTR_RUNQ, "choosethread: td=%p pri=%d",
		    td, td->td_priority);
	} else {
		/* Simulate runq_choose() having returned the idle thread */
		td = PCPU_GET(idlethread);
#ifdef KSE
		ke = td->td_kse;
#endif
		CTR1(KTR_RUNQ, "choosethread: td=%p (idle)", td);
	}
#ifdef KSE
	ke->ke_flags |= KEF_DIDRUN;
#else
	td->td_kse->ke_flags |= KEF_DIDRUN;
#endif

	/*
	 * If we are in panic, only allow system threads,
	 * plus the one we are running in, to be run.
	 */
	if (panicstr && ((td->td_proc->p_flag & P_SYSTEM) == 0 &&
	    (td->td_flags & TDF_INPANIC) == 0)) {
		/* note that it is no longer on the run queue */
		TD_SET_CAN_RUN(td);
		goto retry;
	}

	TD_SET_RUNNING(td);
	return (td);
}

#ifdef KSE
/*
 * Given a surplus system slot, try to assign a new runnable thread to it.
 * Called from:
 *  sched_thread_exit()  (local)
 *  sched_switch()  (local)
 *  sched_thread_exit()  (local)
 *  remrunqueue()  (local)  (not at the moment)
 */
static void
slot_fill(struct ksegrp *kg)
{
	struct thread *td;

	mtx_assert(&sched_lock, MA_OWNED);
	while (kg->kg_avail_opennings > 0) {
		/*
		 * Find the first unassigned thread
		 */
		if ((td = kg->kg_last_assigned) != NULL)
			td = TAILQ_NEXT(td, td_runq);
		else
			td = TAILQ_FIRST(&kg->kg_runq);

		/*
		 * If we found one, send it to the system scheduler.
		 */
		if (td) {
			kg->kg_last_assigned = td;
			sched_add(td, SRQ_YIELDING);
			CTR2(KTR_RUNQ, "slot_fill: td%p -> kg%p", td, kg);
		} else {
			/* no threads to use up the slots. quit now */
			break;
		}
	}
}

#ifdef	SCHED_4BSD
/*
 * Remove a thread from its KSEGRP's run queue.
 * This in turn may remove it from a KSE if it was already assigned
 * to one, possibly causing a new thread to be assigned to the KSE
 * and the KSE getting a new priority.
 */
static void
remrunqueue(struct thread *td)
{
	struct thread *td2, *td3;
	struct ksegrp *kg;
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT((TD_ON_RUNQ(td)), ("remrunqueue: Bad state on run queue"));
	kg = td->td_ksegrp;
	ke = td->td_kse;
	CTR1(KTR_RUNQ, "remrunqueue: td%p", td);
	TD_SET_CAN_RUN(td);
	/*
	 * If it is not a threaded process, take the shortcut.
	 */
	if ((td->td_proc->p_flag & P_HADTHREADS) == 0) {
		/* remove from sys run queue and free up a slot */
		sched_rem(td);
		return;
	}
	td3 = TAILQ_PREV(td, threadqueue, td_runq);
	TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
	if (ke->ke_state == KES_ONRUNQ) {
		/*
		 * This thread has been assigned to the system run queue.
		 * We need to dissociate it and try to assign the
		 * KSE to the next available thread. Then, we should
		 * see if we need to move the KSE in the run queues.
		 */
		sched_rem(td);
		td2 = kg->kg_last_assigned;
		KASSERT((td2 != NULL), ("last assigned has wrong value"));
		if (td2 == td)
			kg->kg_last_assigned = td3;
		/* slot_fill(kg); */ /* will replace it with another */
	}
}
#endif
#endif

/*
 * Change the priority of a thread that is on the run queue.
 */
void
adjustrunqueue(struct thread *td, int newpri)
{
#ifdef KSE
	struct ksegrp *kg;
#endif
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT((TD_ON_RUNQ(td)), ("adjustrunqueue: Bad state on run queue"));

	ke = td->td_kse;
	CTR1(KTR_RUNQ, "adjustrunqueue: td%p", td);
#ifdef KSE
	/*
	 * If it is not a threaded process, take the shortcut.
	 */
	if ((td->td_proc->p_flag & P_HADTHREADS) == 0) {
		/* We only care about the kse in the run queue. */
		td->td_priority = newpri;
#ifndef SCHED_CORE
		if (ke->ke_rqindex != (newpri / RQ_PPQ))
#else
		if (ke->ke_rqindex != newpri)
#endif
		{
			sched_rem(td);
			sched_add(td, SRQ_BORING);
		}
		return;
	}

	/* It is a threaded process */
	kg = td->td_ksegrp;
	if (ke->ke_state == KES_ONRUNQ
#ifdef SCHED_ULE
	 || ((ke->ke_flags & KEF_ASSIGNED) != 0 &&
	     (ke->ke_flags & KEF_REMOVED) == 0)
#endif
	   ) {
		if (kg->kg_last_assigned == td) {
			kg->kg_last_assigned =
			    TAILQ_PREV(td, threadqueue, td_runq);
		}
		sched_rem(td);
	}
	TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
	TD_SET_CAN_RUN(td);
	td->td_priority = newpri;
	setrunqueue(td, SRQ_BORING);
#else
	/* We only care about the kse in the run queue. */
	td->td_priority = newpri;
#ifndef SCHED_CORE
	if (ke->ke_rqindex != (newpri / RQ_PPQ))
#else
	if (ke->ke_rqindex != newpri)
#endif
	{
		sched_rem(td);
		sched_add(td, SRQ_BORING);
	}
#endif
}

#ifdef KSE
/*
 * This function is called when a thread is about to be put on a
 * ksegrp run queue because it has been made runnable or its
 * priority has been adjusted and the ksegrp does not have a
 * free kse slot.  It determines if a thread from the same ksegrp
 * should be preempted.  If so, it tries to switch threads
 * if the thread is on the same cpu, or notifies another cpu that
 * it should switch threads.
 */

static void
maybe_preempt_in_ksegrp(struct thread *td)
#if !defined(SMP)
{
	struct thread *running_thread;

	mtx_assert(&sched_lock, MA_OWNED);
	running_thread = curthread;

	if (running_thread->td_ksegrp != td->td_ksegrp)
		return;

	if (td->td_priority >= running_thread->td_priority)
		return;
#ifdef PREEMPTION
#ifndef FULL_PREEMPTION
	if (td->td_priority > PRI_MAX_ITHD) {
		running_thread->td_flags |= TDF_NEEDRESCHED;
		return;
	}
#endif /* FULL_PREEMPTION */

	if (running_thread->td_critnest > 1)
		running_thread->td_owepreempt = 1;
	else
		mi_switch(SW_INVOL, NULL);

#else /* PREEMPTION */
	running_thread->td_flags |= TDF_NEEDRESCHED;
#endif /* PREEMPTION */
	return;
}

#else /* SMP */
{
	struct thread *running_thread;
	int worst_pri;
	struct ksegrp *kg;
	cpumask_t cpumask, dontuse;
	struct pcpu *pc;
	struct pcpu *best_pcpu;
	struct thread *cputhread;

	mtx_assert(&sched_lock, MA_OWNED);

	running_thread = curthread;

#if !defined(KSEG_PEEMPT_BEST_CPU)
	if (running_thread->td_ksegrp != td->td_ksegrp) {
#endif
		kg = td->td_ksegrp;

		/* if someone is ahead of this thread, wait our turn */
		if (td != TAILQ_FIRST(&kg->kg_runq))
			return;

		worst_pri = td->td_priority;
		best_pcpu = NULL;
		dontuse   = stopped_cpus | idle_cpus_mask;

		/*
		 * Find the cpu with the worst priority that runs a thread
		 * from the same ksegrp; if several qualify, prefer first the
		 * thread's last-run cpu and then the current cpu.
		 */

		SLIST_FOREACH(pc, &cpuhead, pc_allcpu) {
			cpumask   = pc->pc_cpumask;
			cputhread = pc->pc_curthread;

			if ((cpumask & dontuse) ||
			    cputhread->td_ksegrp != kg)
				continue;

			if (cputhread->td_priority > worst_pri) {
				worst_pri = cputhread->td_priority;
				best_pcpu = pc;
				continue;
			}

			if (cputhread->td_priority == worst_pri &&
			    best_pcpu != NULL &&
			    (td->td_lastcpu == pc->pc_cpuid ||
				(PCPU_GET(cpumask) == cpumask &&
				    td->td_lastcpu != best_pcpu->pc_cpuid)))
			    best_pcpu = pc;
		}

		/* Check if we need to preempt someone */
		if (best_pcpu == NULL)
			return;

#if defined(IPI_PREEMPTION) && defined(PREEMPTION)
#if !defined(FULL_PREEMPTION)
		if (td->td_priority <= PRI_MAX_ITHD)
#endif /* ! FULL_PREEMPTION */
			{
				ipi_selected(best_pcpu->pc_cpumask, IPI_PREEMPT);
				return;
			}
#endif /* defined(IPI_PREEMPTION) && defined(PREEMPTION) */

		if (PCPU_GET(cpuid) != best_pcpu->pc_cpuid) {
			best_pcpu->pc_curthread->td_flags |= TDF_NEEDRESCHED;
			ipi_selected(best_pcpu->pc_cpumask, IPI_AST);
			return;
		}
#if !defined(KSEG_PEEMPT_BEST_CPU)
	}
#endif

	if (td->td_priority >= running_thread->td_priority)
		return;
#ifdef PREEMPTION

#if !defined(FULL_PREEMPTION)
	if (td->td_priority > PRI_MAX_ITHD) {
		running_thread->td_flags |= TDF_NEEDRESCHED;
	}
#endif /* ! FULL_PREEMPTION */

	if (running_thread->td_critnest > 1)
		running_thread->td_owepreempt = 1;
	else
		mi_switch(SW_INVOL, NULL);

#else /* PREEMPTION */
	running_thread->td_flags |= TDF_NEEDRESCHED;
#endif /* PREEMPTION */
	return;
}
#endif /* !SMP */


int limitcount;
#endif
void
setrunqueue(struct thread *td, int flags)
{
#ifdef KSE
	struct ksegrp *kg;
	struct thread *td2;
	struct thread *tda;

	CTR3(KTR_RUNQ, "setrunqueue: td:%p kg:%p pid:%d",
	    td, td->td_ksegrp, td->td_proc->p_pid);
#else
	CTR2(KTR_RUNQ, "setrunqueue: td:%p pid:%d",
	    td, td->td_proc->p_pid);
#endif
	CTR5(KTR_SCHED, "setrunqueue: %p(%s) prio %d by %p(%s)",
	    td, td->td_proc->p_comm, td->td_priority, curthread,
	    curthread->td_proc->p_comm);
	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT((td->td_inhibitors == 0),
	    ("setrunqueue: trying to run inhibited thread"));
	KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
	    ("setrunqueue: bad thread state"));
	TD_SET_RUNQ(td);
#ifdef KSE
	kg = td->td_ksegrp;
	if ((td->td_proc->p_flag & P_HADTHREADS) == 0) {
		/*
		 * Common path optimisation: Only one of everything
		 * and the KSE is always already attached.
		 * Totally ignore the ksegrp run queue.
		 */
		if (kg->kg_avail_opennings != 1) {
			if (limitcount < 1) {
				limitcount++;
				printf("pid %d: corrected slot count (%d->1)\n",
				    td->td_proc->p_pid, kg->kg_avail_opennings);

			}
			kg->kg_avail_opennings = 1;
		}
		sched_add(td, flags);
		return;
	}

	/*
	 * If the concurrency has been reduced, and we would land in the
	 * assigned section, then keep removing entries from the
	 * system run queue until we are no longer in that section
	 * or there is room for us to be put in that section.
	 * What we MUST avoid is the case where there are threads of lower
	 * priority than the new one scheduled, but the new one can not
	 * be scheduled itself. That would lead to a non-contiguous set
	 * of scheduled threads, and everything would break.
	 */
	tda = kg->kg_last_assigned;
	while ((kg->kg_avail_opennings <= 0) &&
	    (tda && (tda->td_priority > td->td_priority))) {
		/*
		 * None free, but there is one we can commandeer.
		 */
		CTR2(KTR_RUNQ,
		    "setrunqueue: kg:%p: take slot from td: %p", kg, tda);
		sched_rem(tda);
		tda = kg->kg_last_assigned =
		    TAILQ_PREV(tda, threadqueue, td_runq);
	}

	/*
	 * Add the thread to the ksegrp's run queue at
	 * the appropriate place.
	 */
	TAILQ_FOREACH(td2, &kg->kg_runq, td_runq) {
		if (td2->td_priority > td->td_priority) {
			TAILQ_INSERT_BEFORE(td2, td, td_runq);
			break;
		}
	}
	if (td2 == NULL) {
		/* We ran off the end of the TAILQ or it was empty. */
		TAILQ_INSERT_TAIL(&kg->kg_runq, td, td_runq);
	}

	/*
	 * If we have a slot to use, then put the thread on the system
	 * run queue and if needed, readjust the last_assigned pointer.
	 * It may be that we need to schedule something anyhow,
	 * even if the available slots are negative, so that
	 * all the items < last_assigned are scheduled.
	 */
	if (kg->kg_avail_opennings > 0) {
		if (tda == NULL) {
			/*
			 * No pre-existing last assigned, so whoever is first
			 * gets the slot (maybe us).
			 */
			td2 = TAILQ_FIRST(&kg->kg_runq);
			kg->kg_last_assigned = td2;
		} else if (tda->td_priority > td->td_priority) {
			td2 = td;
		} else {
			/*
			 * We are past last_assigned, so
			 * give the next slot to whatever is next,
			 * which may or may not be us.
			 */
			td2 = TAILQ_NEXT(tda, td_runq);
			kg->kg_last_assigned = td2;
		}
		sched_add(td2, flags);
	} else {
		CTR3(KTR_RUNQ, "setrunqueue: held: td%p kg%p pid%d",
		    td, td->td_ksegrp, td->td_proc->p_pid);
		if ((flags & SRQ_YIELDING) == 0)
			maybe_preempt_in_ksegrp(td);
	}
#else
	sched_add(td, flags);
#endif
}

/*
 * Kernel thread preemption implementation.  Critical sections mark
 * regions of code in which preemptions are not allowed.
 */
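/*
 * Enter a critical section: bump the current thread's nesting count so
 * that it will not be preempted until the matching critical_exit().
 */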
void
critical_enter(void)
{
	struct thread *td;

	td = curthread;
	td->td_critnest++;
	CTR4(KTR_CRITICAL, "critical_enter by thread %p (%ld, %s) to %d", td,
	    (long)td->td_proc->p_pid, td->td_proc->p_comm, td->td_critnest);
}

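/*
 * Leave a critical section: drop the nesting count and, when the outermost
 * section is exited with a deferred preemption pending (td_owepreempt),
 * perform the involuntary switch now.
 */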
void
critical_exit(void)
{
	struct thread *td;

	td = curthread;
	KASSERT(td->td_critnest != 0,
	    ("critical_exit: td_critnest == 0"));
#ifdef PREEMPTION
	if (td->td_critnest == 1) {
		td->td_critnest = 0;
		mtx_assert(&sched_lock, MA_NOTOWNED);
		if (td->td_owepreempt) {
			td->td_critnest = 1;
			mtx_lock_spin(&sched_lock);
			td->td_critnest--;
			mi_switch(SW_INVOL, NULL);
			mtx_unlock_spin(&sched_lock);
		}
	} else
#endif
		td->td_critnest--;

	CTR4(KTR_CRITICAL, "critical_exit by thread %p (%ld, %s) to %d", td,
	    (long)td->td_proc->p_pid, td->td_proc->p_comm, td->td_critnest);
}
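/*
 * Typical usage is to bracket a short region that must not be preempted,
 * for example (sketch only):
 *
 *	critical_enter();
 *	... touch per-CPU state ...
 *	critical_exit();
 */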

/*
 * This function is called when a thread is about to be put on run queue
 * because it has been made runnable or its priority has been adjusted.  It
 * determines if the new thread should preempt the current thread
 * immediately.  If so, it switches to it and eventually returns true.  If
 * not, it returns false so that the caller may place the thread on an
 * appropriate run queue.
 */
int
maybe_preempt(struct thread *td)
{
#ifdef PREEMPTION
	struct thread *ctd;
	int cpri, pri;
#endif

	mtx_assert(&sched_lock, MA_OWNED);
#ifdef PREEMPTION
	/*
	 * The new thread should not preempt the current thread if any of the
	 * following conditions are true:
	 *
	 *  - The kernel is in the throes of crashing (panicstr).
	 *  - The current thread has a higher (numerically lower) or
	 *    equivalent priority.  Note that this prevents curthread from
	 *    trying to preempt to itself.
	 *  - It is too early in the boot for context switches (cold is set).
	 *  - The current thread has an inhibitor set or is in the process of
	 *    exiting.  In this case, the current thread is about to switch
	 *    out anyways, so there's no point in preempting.  If we did,
	 *    the current thread would not be properly resumed as well, so
	 *    just avoid that whole landmine.
	 *  - If the new thread's priority is not a realtime priority and
	 *    the current thread's priority is not an idle priority and
	 *    FULL_PREEMPTION is disabled.
	 *
	 * If all of these conditions are false, but the current thread is in
	 * a nested critical section, then we have to defer the preemption
	 * until we exit the critical section.  Otherwise, switch immediately
	 * to the new thread.
	 */
	ctd = curthread;
	KASSERT((ctd->td_kse != NULL && ctd->td_kse->ke_thread == ctd),
	    ("thread has no (or wrong) sched-private part."));
	KASSERT((td->td_inhibitors == 0),
	    ("maybe_preempt: trying to run inhibited thread"));
	pri = td->td_priority;
	cpri = ctd->td_priority;
	if (panicstr != NULL || pri >= cpri || cold /* || dumping */ ||
	    TD_IS_INHIBITED(ctd) || td->td_kse->ke_state != KES_THREAD)
		return (0);
#ifndef FULL_PREEMPTION
	if (pri > PRI_MAX_ITHD && cpri < PRI_MIN_IDLE)
		return (0);
#endif

	if (ctd->td_critnest > 1) {
		CTR1(KTR_PROC, "maybe_preempt: in critical section %d",
		    ctd->td_critnest);
		ctd->td_owepreempt = 1;
		return (0);
	}

	/*
	 * Thread is runnable but not yet put on system run queue.
	 */
	MPASS(TD_ON_RUNQ(td));
	MPASS(td->td_sched->ke_state != KES_ONRUNQ);
#ifdef KSE
	if (td->td_proc->p_flag & P_HADTHREADS) {
		/*
		 * If this is a threaded process we actually ARE on the
		 * ksegrp run queue so take it off that first.
		 * Also undo any damage done to the last_assigned pointer.
		 * XXX Fix setrunqueue so this isn't needed.
		 */
		struct ksegrp *kg;

		kg = td->td_ksegrp;
		if (kg->kg_last_assigned == td)
			kg->kg_last_assigned =
			    TAILQ_PREV(td, threadqueue, td_runq);
		TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
	}

#endif
	TD_SET_RUNNING(td);
	CTR3(KTR_PROC, "preempting to thread %p (pid %d, %s)\n", td,
	    td->td_proc->p_pid, td->td_proc->p_comm);
	mi_switch(SW_INVOL|SW_PREEMPT, td);
	return (1);
#else
	return (0);
#endif
}

#if 0
#ifndef PREEMPTION
/* XXX: There should be a non-static version of this. */
static void
printf_caddr_t(void *data)
{
	printf("%s", (char *)data);
}
static char preempt_warning[] =
    "WARNING: Kernel preemption is disabled, expect reduced performance.\n";
SYSINIT(preempt_warning, SI_SUB_COPYRIGHT, SI_ORDER_ANY, printf_caddr_t,
    preempt_warning)
#endif
#endif

/************************************************************************
 * SYSTEM RUN QUEUE manipulations and tests				*
 ************************************************************************/
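/*
 * A struct runq is an array of RQ_NQS priority queues plus a status bitmap
 * (rq_status) with one bit per queue; a set bit means that queue is
 * non-empty, which lets runq_findbit() locate the first (best) non-empty
 * queue by scanning a few words instead of every queue.
 */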
/*
 * Initialize a run structure.
 */
void
runq_init(struct runq *rq)
{
	int i;

	bzero(rq, sizeof *rq);
	for (i = 0; i < RQ_NQS; i++)
		TAILQ_INIT(&rq->rq_queues[i]);
}

/*
 * Clear the status bit of the queue corresponding to priority level pri,
 * indicating that it is empty.
 */
static __inline void
runq_clrbit(struct runq *rq, int pri)
{
	struct rqbits *rqb;

	rqb = &rq->rq_status;
	CTR4(KTR_RUNQ, "runq_clrbit: bits=%#x %#x bit=%#x word=%d",
	    rqb->rqb_bits[RQB_WORD(pri)],
	    rqb->rqb_bits[RQB_WORD(pri)] & ~RQB_BIT(pri),
	    RQB_BIT(pri), RQB_WORD(pri));
	rqb->rqb_bits[RQB_WORD(pri)] &= ~RQB_BIT(pri);
}

/*
 * Find the index of the first non-empty run queue.  This is done by
 * scanning the status bits; a set bit indicates a non-empty queue.
 */
static __inline int
runq_findbit(struct runq *rq)
{
	struct rqbits *rqb;
	int pri;
	int i;

	rqb = &rq->rq_status;
	for (i = 0; i < RQB_LEN; i++)
		if (rqb->rqb_bits[i]) {
			pri = RQB_FFS(rqb->rqb_bits[i]) + (i << RQB_L2BPW);
			CTR3(KTR_RUNQ, "runq_findbit: bits=%#x i=%d pri=%d",
			    rqb->rqb_bits[i], i, pri);
			return (pri);
		}

	return (-1);
}

/*
 * Set the status bit of the queue corresponding to priority level pri,
 * indicating that it is non-empty.
 */
static __inline void
runq_setbit(struct runq *rq, int pri)
{
	struct rqbits *rqb;

	rqb = &rq->rq_status;
	CTR4(KTR_RUNQ, "runq_setbit: bits=%#x %#x bit=%#x word=%d",
	    rqb->rqb_bits[RQB_WORD(pri)],
	    rqb->rqb_bits[RQB_WORD(pri)] | RQB_BIT(pri),
	    RQB_BIT(pri), RQB_WORD(pri));
	rqb->rqb_bits[RQB_WORD(pri)] |= RQB_BIT(pri);
}

/*
 * Add the KSE to the queue specified by its priority, and set the
 * corresponding status bit.
 */
void
runq_add(struct runq *rq, struct kse *ke, int flags)
{
	struct rqhead *rqh;
	int pri;

	pri = ke->ke_thread->td_priority / RQ_PPQ;
	ke->ke_rqindex = pri;
	runq_setbit(rq, pri);
	rqh = &rq->rq_queues[pri];
	CTR5(KTR_RUNQ, "runq_add: td=%p ke=%p pri=%d %d rqh=%p",
	    ke->ke_thread, ke, ke->ke_thread->td_priority, pri, rqh);
	if (flags & SRQ_PREEMPTED) {
		TAILQ_INSERT_HEAD(rqh, ke, ke_procq);
	} else {
		TAILQ_INSERT_TAIL(rqh, ke, ke_procq);
	}
}

/*
 * Return true if there are runnable processes of any priority on the run
 * queue, false otherwise.  Has no side effects, does not modify the run
 * queue structure.
 */
int
runq_check(struct runq *rq)
{
	struct rqbits *rqb;
	int i;

	rqb = &rq->rq_status;
	for (i = 0; i < RQB_LEN; i++)
		if (rqb->rqb_bits[i]) {
			CTR2(KTR_RUNQ, "runq_check: bits=%#x i=%d",
			    rqb->rqb_bits[i], i);
			return (1);
		}
	CTR0(KTR_RUNQ, "runq_check: empty");

	return (0);
}

#if defined(SMP) && defined(SCHED_4BSD)
int runq_fuzz = 1;
SYSCTL_INT(_kern_sched, OID_AUTO, runq_fuzz, CTLFLAG_RW, &runq_fuzz, 0, "");
#endif

/*
 * Find the highest priority process on the run queue.
 */
struct kse *
runq_choose(struct runq *rq)
{
	struct rqhead *rqh;
	struct kse *ke;
	int pri;

	mtx_assert(&sched_lock, MA_OWNED);
	while ((pri = runq_findbit(rq)) != -1) {
		rqh = &rq->rq_queues[pri];
#if defined(SMP) && defined(SCHED_4BSD)
		/* fuzz == 1 is normal; 0 or less are ignored */
		if (runq_fuzz > 1) {
			/*
			 * In the first couple of entries, check if
			 * there is one for our CPU as a preference.
			 */
			int count = runq_fuzz;
			int cpu = PCPU_GET(cpuid);
			struct kse *ke2;
			ke2 = ke = TAILQ_FIRST(rqh);

			while (count-- && ke2) {
				if (ke2->ke_thread->td_lastcpu == cpu) {
					ke = ke2;
					break;
				}
				ke2 = TAILQ_NEXT(ke2, ke_procq);
			}
		} else
#endif
			ke = TAILQ_FIRST(rqh);
		KASSERT(ke != NULL, ("runq_choose: no proc on busy queue"));
		CTR3(KTR_RUNQ,
		    "runq_choose: pri=%d kse=%p rqh=%p", pri, ke, rqh);
		return (ke);
	}
	CTR1(KTR_RUNQ, "runq_choose: idleproc pri=%d", pri);

	return (NULL);
}

/*
 * Remove the KSE from the queue specified by its priority, and clear the
 * corresponding status bit if the queue becomes empty.
 * Caller must set ke->ke_state afterwards.
 */
void
runq_remove(struct runq *rq, struct kse *ke)
{
	struct rqhead *rqh;
	int pri;

#ifdef KSE
	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
#else
	KASSERT(ke->ke_thread->td_proc->p_sflag & PS_INMEM,
#endif
		("runq_remove: process swapped out"));
	pri = ke->ke_rqindex;
	rqh = &rq->rq_queues[pri];
	CTR5(KTR_RUNQ, "runq_remove: td=%p, ke=%p pri=%d %d rqh=%p",
	    ke->ke_thread, ke, ke->ke_thread->td_priority, pri, rqh);
	KASSERT(ke != NULL, ("runq_remove: no proc on busy queue"));
	TAILQ_REMOVE(rqh, ke, ke_procq);
	if (TAILQ_EMPTY(rqh)) {
		CTR0(KTR_RUNQ, "runq_remove: empty");
		runq_clrbit(rq, pri);
	}
}

/****** functions that are temporarily here ***********/
#include <vm/uma.h>
extern struct mtx kse_zombie_lock;

#ifdef KSE
/*
 * Allocate scheduler specific per-process resources.
 * The thread and ksegrp have already been linked in.
 * In this case just set the default concurrency value.
 *
 * Called from:
 *  proc_init() (UMA init method)
 */
void
sched_newproc(struct proc *p, struct ksegrp *kg, struct thread *td)
{

	/* This can go in sched_fork */
	sched_init_concurrency(kg);
}
#endif

/*
 * A thread is being either created or recycled.
 * Fix up the per-scheduler resources associated with it.
 * Called from:
 *  sched_fork_thread()
 *  thread_dtor()  (*may go away)
 *  thread_init()  (*may go away)
 */
void
sched_newthread(struct thread *td)
{
	struct td_sched *ke;

	ke = (struct td_sched *) (td + 1);
	bzero(ke, sizeof(*ke));
	td->td_sched	= ke;
	ke->ke_thread	= td;
	ke->ke_state	= KES_THREAD;
}

#ifdef KSE
/*
 * Set up an initial concurrency of 1
 * and set the given thread (if given) to be using that
 * concurrency slot.
 * May be used "offline", before the ksegrp is attached to the world
 * and thus wouldn't need schedlock in that case.
 * Called from:
 *  thr_create()
 *  proc_init() (UMA) via sched_newproc()
 */
void
sched_init_concurrency(struct ksegrp *kg)
{

	CTR1(KTR_RUNQ, "kg %p init slots and concurrency to 1", kg);
	kg->kg_concurrency = 1;
	kg->kg_avail_opennings = 1;
}

/*
 * Change the concurrency of an existing ksegrp to N.
 * Called from:
 *  kse_create()
 *  kse_exit()
 *  thread_exit()
 *  thread_single()
 */
void
sched_set_concurrency(struct ksegrp *kg, int concurrency)
{

	CTR4(KTR_RUNQ, "kg %p set concurrency to %d, slots %d -> %d",
	    kg,
	    concurrency,
	    kg->kg_avail_opennings,
	    kg->kg_avail_opennings + (concurrency - kg->kg_concurrency));
	kg->kg_avail_opennings += (concurrency - kg->kg_concurrency);
	kg->kg_concurrency = concurrency;
}

/*
 * Called from thread_exit() for all exiting threads.
 *
 * Not to be confused with sched_exit_thread(), which is only called from
 * thread_exit() for threads exiting without the rest of the process exiting,
 * because it is also called from sched_exit() and we wouldn't want to call
 * it twice.
 * XXX This can probably be fixed.
 */
void
sched_thread_exit(struct thread *td)
{

	SLOT_RELEASE(td->td_ksegrp);
	slot_fill(td->td_ksegrp);
}
#endif

#endif /* KERN_SWITCH_INCLUDE */
1109