kern_switch.c revision 136167
/*
 * Copyright (c) 2001 Jake Burkholder <jake@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/***
Here is the logic:

If there are N processors, then there are at most N KSEs (kernel
schedulable entities) working to process threads that belong to a
KSEGROUP (kg). If there are X of these KSEs actually running at the
moment in question, then there are at most M (= N - X) of these KSEs on
the run queue, as running KSEs are not on the queue.

Runnable threads are queued off the KSEGROUP in priority order.
If there are M or more threads runnable, the top M threads
(by priority) are 'preassigned' to the M KSEs not running. The KSEs take
their priority from those threads and are put on the run queue.

The last thread that had a priority high enough to have a KSE associated
with it, AND IS ON THE RUN QUEUE, is pointed to by
kg->kg_last_assigned. If no threads queued off the KSEGROUP have KSEs
assigned, either because all the available KSEs are actively running or
because there are no threads queued, that pointer is NULL.

When a KSE is removed from the run queue to become runnable, we know
it was associated with the highest priority thread in the queue (at the head
of the queue). If it is also the last assigned we know M was 1 and must
now be 0. Since the thread is no longer queued that pointer must be
removed from it. Since we know there were no more KSEs available
(M was 1 and is now 0), and since we are not FREEING our KSE
but using it, we know there are STILL no more KSEs available, so we can
prove that the next thread in the ksegrp list will not have a KSE to
assign to it, and we can show that the pointer must be made 'invalid' (NULL).

The pointer exists so that when a new thread is made runnable, it can
have its priority compared with the last assigned thread to see if
it should 'steal' its KSE or not, i.e. is it 'earlier'
on the list than that thread or later. If it's earlier, then the KSE is
removed from the last assigned (which is now not assigned a KSE)
and reassigned to the new thread, which is placed earlier in the list.
The pointer is then backed up to the previous thread (which may or may not
be the new thread).

When a thread sleeps or is removed, the KSE becomes available and if there
are queued threads that are not assigned KSEs, the highest priority one of
them is assigned the KSE, which is then placed back on the run queue at
the appropriate place, and the kg->kg_last_assigned pointer is adjusted down
to point to it.

The following diagram shows 2 KSEs and 3 threads from a single process.

 RUNQ: --->KSE---KSE--...    (KSEs queued at priorities from threads)
              \    \____
               \        \
    KSEGROUP---thread--thread--thread    (queued in priority order)
        \                 /
         \_______________/
          (last_assigned)

The result of this scheme is that the M available KSEs are always
queued at the priorities they have inherited from the M highest priority
threads for that KSEGROUP. If this situation changes, the KSEs are
reassigned to keep this true.
***/
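
/*
 * An illustrative sketch (comment only, not compiled) of the slot
 * bookkeeping described above.  With every KSE slot assigned, a newly
 * runnable thread of high enough priority commandeers the slot of the
 * last assigned thread, roughly as setrunqueue() does below; tda names
 * kg->kg_last_assigned:
 *
 *	if (kg->kg_avail_opennings > 0) {
 *		sched_add(td, flags);		// a free slot: just add
 *	} else if (tda != NULL && tda->td_priority > td->td_priority) {
 *		sched_rem(tda);			// steal the lowest slot
 *		kg->kg_last_assigned =
 *		    TAILQ_PREV(tda, threadqueue, td_runq);
 *	}
 */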

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/kern_switch.c 136167 2004-10-05 21:10:44Z julian $");

#include "opt_sched.h"

#ifndef KERN_SWITCH_INCLUDE
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/sched.h>
#else  /* KERN_SWITCH_INCLUDE */
#if defined(SMP) && (defined(__i386__) || defined(__amd64__))
#include <sys/smp.h>
#endif
#include <machine/critical.h>
#if defined(SMP) && defined(SCHED_4BSD)
#include <sys/sysctl.h>
#endif

#ifdef FULL_PREEMPTION
#ifndef PREEMPTION
#error "The FULL_PREEMPTION option requires the PREEMPTION option"
#endif
#endif

CTASSERT((RQB_BPW * RQB_LEN) == RQ_NQS);

#define td_kse td_sched

/************************************************************************
 * Functions that manipulate runnability from a thread perspective.	*
 ************************************************************************/
/*
 * Select the KSE that will be run next.  From that find the thread, and
 * remove it from the KSEGRP's run queue.  If there is thread clustering,
 * this will be what does it.
 */
struct thread *
choosethread(void)
{
	struct kse *ke;
	struct thread *td;
	struct ksegrp *kg;

#if defined(SMP) && (defined(__i386__) || defined(__amd64__))
	if (smp_active == 0 && PCPU_GET(cpuid) != 0) {
		/* Shutting down, run idlethread on APs */
		td = PCPU_GET(idlethread);
		ke = td->td_kse;
		CTR1(KTR_RUNQ, "choosethread: td=%p (idle)", td);
		ke->ke_flags |= KEF_DIDRUN;
		TD_SET_RUNNING(td);
		return (td);
	}
#endif

retry:
	ke = sched_choose();
	if (ke) {
		td = ke->ke_thread;
		KASSERT((td->td_kse == ke), ("kse/thread mismatch"));
		kg = ke->ke_ksegrp;
		if (td->td_proc->p_flag & P_HADTHREADS) {
			if (kg->kg_last_assigned == td) {
				kg->kg_last_assigned = TAILQ_PREV(td,
				    threadqueue, td_runq);
			}
			TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
			kg->kg_runnable--;
		}
		CTR2(KTR_RUNQ, "choosethread: td=%p pri=%d",
		    td, td->td_priority);
	} else {
		/* Simulate runq_choose() having returned the idle thread */
		td = PCPU_GET(idlethread);
		ke = td->td_kse;
		CTR1(KTR_RUNQ, "choosethread: td=%p (idle)", td);
	}
	ke->ke_flags |= KEF_DIDRUN;

	/*
	 * If we are in panic, only allow system threads,
	 * plus the one we are running in, to be run.
	 */
	if (panicstr && ((td->td_proc->p_flag & P_SYSTEM) == 0 &&
	    (td->td_flags & TDF_INPANIC) == 0)) {
		/* note that it is no longer on the run queue */
		TD_SET_CAN_RUN(td);
		goto retry;
	}

	TD_SET_RUNNING(td);
	return (td);
}

/*
 * Given a surplus system slot, try to assign a new runnable thread to it.
 * Called from:
 *  sched_thread_exit()  (local)
 *  sched_switch()  (local)
 *  remrunqueue()  (local)  (not at the moment)
 */
static void
slot_fill(struct ksegrp *kg)
{
	struct thread *td;

	mtx_assert(&sched_lock, MA_OWNED);
	while (kg->kg_avail_opennings > 0) {
		/*
		 * Find the first unassigned thread.
		 */
		if ((td = kg->kg_last_assigned) != NULL)
			td = TAILQ_NEXT(td, td_runq);
		else
			td = TAILQ_FIRST(&kg->kg_runq);

		/*
		 * If we found one, send it to the system scheduler.
		 */
		if (td) {
			kg->kg_last_assigned = td;
			sched_add(td, SRQ_BORING);
			CTR2(KTR_RUNQ, "slot_fill: td%p -> kg%p", td, kg);
		} else {
			/* No threads to use up the slots; quit now. */
			break;
		}
	}
}
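
/*
 * For example, if the ksegrp's queue holds T1-T2-T3 in priority order,
 * kg_last_assigned points at T2, and one opening appears, the loop
 * above hands T3 to sched_add() and advances kg_last_assigned to T3.
 */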

#ifdef	SCHED_4BSD
/*
 * Remove a thread from its KSEGRP's run queue.
 * This in turn may remove it from a KSE if it was already assigned
 * to one, possibly causing a new thread to be assigned to the KSE
 * and the KSE getting a new priority.
 */
static void
remrunqueue(struct thread *td)
{
	struct thread *td2, *td3;
	struct ksegrp *kg;
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT((TD_ON_RUNQ(td)), ("remrunqueue: Bad state on run queue"));
	kg = td->td_ksegrp;
	ke = td->td_kse;
	CTR1(KTR_RUNQ, "remrunqueue: td%p", td);
	TD_SET_CAN_RUN(td);
	/*
	 * If it is not a threaded process, take the shortcut.
	 */
	if ((td->td_proc->p_flag & P_HADTHREADS) == 0) {
		/* remove from sys run queue and free up a slot */
		sched_rem(td);
		ke->ke_state = KES_THREAD;
		return;
	}
	td3 = TAILQ_PREV(td, threadqueue, td_runq);
	TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
	kg->kg_runnable--;
	if (ke->ke_state == KES_ONRUNQ) {
		/*
		 * This thread has been assigned to the system run queue.
		 * We need to dissociate it and try to assign the
		 * KSE to the next available thread. Then, we should
		 * see if we need to move the KSE in the run queues.
		 */
		sched_rem(td);
		ke->ke_state = KES_THREAD;
		td2 = kg->kg_last_assigned;
		KASSERT((td2 != NULL), ("last assigned has wrong value"));
		if (td2 == td)
			kg->kg_last_assigned = td3;
		/* slot_fill(kg); */ /* will replace it with another */
	}
}
#endif

/*
 * Change the priority of a thread that is on the run queue.
 */
void
adjustrunqueue(struct thread *td, int newpri)
{
	struct ksegrp *kg;
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT((TD_ON_RUNQ(td)), ("adjustrunqueue: Bad state on run queue"));

	ke = td->td_kse;
	CTR1(KTR_RUNQ, "adjustrunqueue: td%p", td);
	/*
	 * If it is not a threaded process, take the shortcut.
	 */
	if ((td->td_proc->p_flag & P_HADTHREADS) == 0) {
		/* We only care about the kse in the run queue. */
		td->td_priority = newpri;
		if (ke->ke_rqindex != (newpri / RQ_PPQ)) {
			sched_rem(td);
			sched_add(td, SRQ_BORING);
		}
		return;
	}

	/* It is a threaded process */
	kg = td->td_ksegrp;
	if (ke->ke_state == KES_ONRUNQ) {
		if (kg->kg_last_assigned == td) {
			kg->kg_last_assigned =
			    TAILQ_PREV(td, threadqueue, td_runq);
		}
		sched_rem(td);
	}
	TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
	kg->kg_runnable--;
	TD_SET_CAN_RUN(td);
	td->td_priority = newpri;
	setrunqueue(td, SRQ_BORING);
}
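
/*
 * Worked example for the unthreaded fast path above, assuming
 * RQ_PPQ == 4 as defined in <sys/runq.h>: moving td_priority from 130
 * to 129 leaves 129 / RQ_PPQ == 32 == ke_rqindex, so the KSE stays
 * put; moving it to 127 yields index 31 and forces the remove/re-add.
 */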

int limitcount;
void
setrunqueue(struct thread *td, int flags)
{
	struct ksegrp *kg;
	struct thread *td2;
	struct thread *tda;

	CTR3(KTR_RUNQ, "setrunqueue: td:%p kg:%p pid:%d",
	    td, td->td_ksegrp, td->td_proc->p_pid);
	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT((td->td_inhibitors == 0),
	    ("setrunqueue: trying to run inhibited thread"));
	KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
	    ("setrunqueue: bad thread state"));
	TD_SET_RUNQ(td);
	kg = td->td_ksegrp;
	if ((td->td_proc->p_flag & P_HADTHREADS) == 0) {
		/*
		 * Common path optimisation: Only one of everything
		 * and the KSE is always already attached.
		 * Totally ignore the ksegrp run queue.
		 */
		if (kg->kg_avail_opennings != 1) {
			if (limitcount < 1) {
				limitcount++;
				printf("pid %d: corrected slot count (%d->1)\n",
				    td->td_proc->p_pid, kg->kg_avail_opennings);
			}
			kg->kg_avail_opennings = 1;
		}
		sched_add(td, flags);
		return;
	}

	/*
	 * If the concurrency has been reduced, and we would go in the
	 * assigned section, then keep removing entries from the
	 * system run queue until we are not in that section
	 * or there is room for us to be put in that section.
	 * What we MUST avoid is the case where there are threads of lower
	 * priority than the new one already scheduled, but the new one
	 * cannot be scheduled itself. That would lead to a non-contiguous
	 * set of scheduled threads, and everything would break.
	 */
	tda = kg->kg_last_assigned;
	while ((kg->kg_avail_opennings <= 0) &&
	    (tda && (tda->td_priority > td->td_priority))) {
		/*
		 * None free, but there is one we can commandeer.
		 */
		CTR2(KTR_RUNQ,
		    "setrunqueue: kg:%p: take slot from td: %p", kg, tda);
		sched_rem(tda);
		tda = kg->kg_last_assigned =
		    TAILQ_PREV(tda, threadqueue, td_runq);
		SLOT_RELEASE(kg);
	}

	/*
	 * Add the thread to the ksegrp's run queue at
	 * the appropriate place.
	 */
	TAILQ_FOREACH(td2, &kg->kg_runq, td_runq) {
		if (td2->td_priority > td->td_priority) {
			kg->kg_runnable++;
			TAILQ_INSERT_BEFORE(td2, td, td_runq);
			break;
		}
	}
	if (td2 == NULL) {
		/* We ran off the end of the TAILQ or it was empty. */
		kg->kg_runnable++;
		TAILQ_INSERT_TAIL(&kg->kg_runq, td, td_runq);
	}

	/*
	 * If we have a slot to use, then put the thread on the system
	 * run queue and if needed, readjust the last_assigned pointer.
	 * It may be that we need to schedule something anyhow,
	 * even if the available slots are negative, so that
	 * all the items before last_assigned are scheduled.
	 */
	if (kg->kg_avail_opennings > 0) {
		if (tda == NULL) {
			/*
			 * No pre-existing last assigned so whoever is first
			 * gets the slot.. (maybe us)
			 */
			td2 = TAILQ_FIRST(&kg->kg_runq);
			kg->kg_last_assigned = td2;
		} else if (tda->td_priority > td->td_priority) {
			td2 = td;
		} else {
			/*
			 * We are past last_assigned, so
			 * give the next slot to whatever is next,
			 * which may or may not be us.
			 */
			td2 = TAILQ_NEXT(tda, td_runq);
			kg->kg_last_assigned = td2;
		}
		sched_add(td2, flags);
	} else {
		CTR3(KTR_RUNQ, "setrunqueue: held: td%p kg%p pid%d",
		    td, td->td_ksegrp, td->td_proc->p_pid);
	}
}

/*
 * Kernel thread preemption implementation.  Critical sections mark
 * regions of code in which preemptions are not allowed.
 */
void
critical_enter(void)
{
	struct thread *td;

	td = curthread;
	if (td->td_critnest == 0)
		cpu_critical_enter(td);
	td->td_critnest++;
}

void
critical_exit(void)
{
	struct thread *td;

	td = curthread;
	KASSERT(td->td_critnest != 0,
	    ("critical_exit: td_critnest == 0"));
	if (td->td_critnest == 1) {
#ifdef PREEMPTION
		mtx_assert(&sched_lock, MA_NOTOWNED);
		if (td->td_pflags & TDP_OWEPREEMPT) {
			mtx_lock_spin(&sched_lock);
			mi_switch(SW_INVOL, NULL);
			mtx_unlock_spin(&sched_lock);
		}
#endif
		td->td_critnest = 0;
		cpu_critical_exit(td);
	} else {
		td->td_critnest--;
	}
}
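
/*
 * Typical usage, shown for illustration only; the section should be
 * short and must not sleep, since critical_exit() on the outermost
 * nesting level is where any deferred preemption finally happens:
 *
 *	critical_enter();
 *	... access per-CPU state without fear of being switched out ...
 *	critical_exit();
 */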

/*
 * This function is called when a thread is about to be put on a run queue
 * because it has been made runnable or its priority has been adjusted.  It
 * determines whether we should immediately preempt to the new thread.  If
 * so, it switches to it and eventually returns true.  If not, it returns
 * false so that the caller may place the thread on an appropriate run queue.
 */
int
maybe_preempt(struct thread *td)
{
#ifdef PREEMPTION
	struct thread *ctd;
	int cpri, pri;
#endif

	mtx_assert(&sched_lock, MA_OWNED);
#ifdef PREEMPTION
	/*
	 * The new thread should not preempt the current thread if any of the
	 * following conditions are true:
	 *
	 *  - The current thread has a higher (numerically lower) or
	 *    equivalent priority.  Note that this prevents curthread from
	 *    trying to preempt to itself.
	 *  - It is too early in the boot for context switches (cold is set).
	 *  - The current thread has an inhibitor set or is in the process of
	 *    exiting.  In this case, the current thread is about to switch
	 *    out anyway, so there's no point in preempting.  If we did,
	 *    the current thread would not be properly resumed as well, so
	 *    just avoid that whole landmine.
	 *  - The new thread's priority is not a realtime priority and
	 *    the current thread's priority is not an idle priority and
	 *    FULL_PREEMPTION is disabled.
	 *
	 * If all of these conditions are false, but the current thread is in
	 * a nested critical section, then we have to defer the preemption
	 * until we exit the critical section.  Otherwise, switch immediately
	 * to the new thread.
	 */
	ctd = curthread;
	KASSERT((ctd->td_kse != NULL && ctd->td_kse->ke_thread == ctd),
	    ("thread has no (or wrong) sched-private part."));
	KASSERT((td->td_inhibitors == 0),
	    ("maybe_preempt: trying to run inhibited thread"));
	pri = td->td_priority;
	cpri = ctd->td_priority;
	if (pri >= cpri || cold /* || dumping */ || TD_IS_INHIBITED(ctd) ||
	    td->td_kse->ke_state != KES_THREAD)
		return (0);
#ifndef FULL_PREEMPTION
	if (!(pri >= PRI_MIN_ITHD && pri <= PRI_MAX_ITHD) &&
	    !(cpri >= PRI_MIN_IDLE))
		return (0);
#endif
	if (ctd->td_critnest > 1) {
		CTR1(KTR_PROC, "maybe_preempt: in critical section %d",
		    ctd->td_critnest);
		ctd->td_pflags |= TDP_OWEPREEMPT;
		return (0);
	}

	/*
	 * Our thread state says that we are already on a run queue, so
	 * update our state as if we had been dequeued by choosethread().
	 * However we must not actually be on the system run queue yet.
	 */
	MPASS(TD_ON_RUNQ(td));
	MPASS(td->td_sched->ke_state != KES_ONRUNQ);
	if (td->td_proc->p_flag & P_HADTHREADS) {
		/*
		 * If this is a threaded process we actually ARE on the
		 * ksegrp run queue so take it off that first.
		 * Also undo any damage done to the last_assigned pointer.
		 * XXX Fix setrunqueue so this isn't needed.
		 */
		struct ksegrp *kg;

		kg = td->td_ksegrp;
		if (kg->kg_last_assigned == td)
			kg->kg_last_assigned =
			    TAILQ_PREV(td, threadqueue, td_runq);
		TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
	}

	TD_SET_RUNNING(td);
	CTR3(KTR_PROC, "preempting to thread %p (pid %d, %s)\n", td,
	    td->td_proc->p_pid, td->td_proc->p_comm);
	mi_switch(SW_INVOL, td);
	return (1);
#else
	return (0);
#endif
}
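
/*
 * A hypothetical caller, sketched only to show the contract:
 * maybe_preempt() either switches away and returns 1, or returns 0 and
 * leaves the thread for the caller to queue.  Here runq stands for a
 * scheduler-private struct runq:
 *
 *	if (maybe_preempt(td))
 *		return;				// already running td
 *	runq_add(&runq, td->td_kse);		// otherwise queue it
 */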

#if 0
#ifndef PREEMPTION
/* XXX: There should be a non-static version of this. */
static void
printf_caddr_t(void *data)
{
	printf("%s", (char *)data);
}
static char preempt_warning[] =
    "WARNING: Kernel preemption is disabled, expect reduced performance.\n";
SYSINIT(preempt_warning, SI_SUB_COPYRIGHT, SI_ORDER_ANY, printf_caddr_t,
    preempt_warning)
#endif
#endif

/************************************************************************
 * SYSTEM RUN QUEUE manipulations and tests				*
 ************************************************************************/
/*
 * Initialize a run structure.
 */
void
runq_init(struct runq *rq)
{
	int i;

	bzero(rq, sizeof *rq);
	for (i = 0; i < RQ_NQS; i++)
		TAILQ_INIT(&rq->rq_queues[i]);
}

/*
 * Clear the status bit of the queue corresponding to priority level pri,
 * indicating that it is empty.
 */
static __inline void
runq_clrbit(struct runq *rq, int pri)
{
	struct rqbits *rqb;

	rqb = &rq->rq_status;
	CTR4(KTR_RUNQ, "runq_clrbit: bits=%#x %#x bit=%#x word=%d",
	    rqb->rqb_bits[RQB_WORD(pri)],
	    rqb->rqb_bits[RQB_WORD(pri)] & ~RQB_BIT(pri),
	    RQB_BIT(pri), RQB_WORD(pri));
	rqb->rqb_bits[RQB_WORD(pri)] &= ~RQB_BIT(pri);
}

/*
 * Find the index of the first non-empty run queue.  This is done by
 * scanning the status bits; a set bit indicates a non-empty queue.
 */
static __inline int
runq_findbit(struct runq *rq)
{
	struct rqbits *rqb;
	int pri;
	int i;

	rqb = &rq->rq_status;
	for (i = 0; i < RQB_LEN; i++)
		if (rqb->rqb_bits[i]) {
			pri = RQB_FFS(rqb->rqb_bits[i]) + (i << RQB_L2BPW);
			CTR3(KTR_RUNQ, "runq_findbit: bits=%#x i=%d pri=%d",
			    rqb->rqb_bits[i], i, pri);
			return (pri);
		}

	return (-1);
}
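
/*
 * Worked example, assuming 32-bit status words (RQB_BPW == 32,
 * RQB_L2BPW == 5): priority level 37 maps to RQB_WORD(37) == 1 and
 * RQB_BIT(37) == 1 << 5.  runq_findbit() inverts the mapping:
 * RQB_FFS(0x20) + (1 << RQB_L2BPW) == 5 + 32 == 37.
 */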

/*
 * Set the status bit of the queue corresponding to priority level pri,
 * indicating that it is non-empty.
 */
static __inline void
runq_setbit(struct runq *rq, int pri)
{
	struct rqbits *rqb;

	rqb = &rq->rq_status;
	CTR4(KTR_RUNQ, "runq_setbit: bits=%#x %#x bit=%#x word=%d",
	    rqb->rqb_bits[RQB_WORD(pri)],
	    rqb->rqb_bits[RQB_WORD(pri)] | RQB_BIT(pri),
	    RQB_BIT(pri), RQB_WORD(pri));
	rqb->rqb_bits[RQB_WORD(pri)] |= RQB_BIT(pri);
}

/*
 * Add the KSE to the queue specified by its priority, and set the
 * corresponding status bit.
 */
void
runq_add(struct runq *rq, struct kse *ke)
{
	struct rqhead *rqh;
	int pri;

	pri = ke->ke_thread->td_priority / RQ_PPQ;
	ke->ke_rqindex = pri;
	runq_setbit(rq, pri);
	rqh = &rq->rq_queues[pri];
	CTR5(KTR_RUNQ, "runq_add: td=%p ke=%p pri=%d %d rqh=%p",
	    ke->ke_thread, ke, ke->ke_thread->td_priority, pri, rqh);
	TAILQ_INSERT_TAIL(rqh, ke, ke_procq);
}
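
/*
 * For instance, with RQ_PPQ == 4, a KSE whose thread is at priority 130
 * lands on queue 130 / 4 == 32, shared with priorities 128 through 131;
 * within a queue, KSEs are kept in FIFO order (TAILQ_INSERT_TAIL above).
 */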

/*
 * Return true if there are runnable processes of any priority on the run
 * queue, false otherwise.  Has no side effects, does not modify the run
 * queue structure.
 */
int
runq_check(struct runq *rq)
{
	struct rqbits *rqb;
	int i;

	rqb = &rq->rq_status;
	for (i = 0; i < RQB_LEN; i++)
		if (rqb->rqb_bits[i]) {
			CTR2(KTR_RUNQ, "runq_check: bits=%#x i=%d",
			    rqb->rqb_bits[i], i);
			return (1);
		}
	CTR0(KTR_RUNQ, "runq_check: empty");

	return (0);
}

#if defined(SMP) && defined(SCHED_4BSD)
int runq_fuzz = 1;
SYSCTL_INT(_kern_sched, OID_AUTO, runq_fuzz, CTLFLAG_RW, &runq_fuzz, 0, "");
#endif

/*
 * Find the highest priority process on the run queue.
 */
struct kse *
runq_choose(struct runq *rq)
{
	struct rqhead *rqh;
	struct kse *ke;
	int pri;

	mtx_assert(&sched_lock, MA_OWNED);
	while ((pri = runq_findbit(rq)) != -1) {
		rqh = &rq->rq_queues[pri];
#if defined(SMP) && defined(SCHED_4BSD)
		/* fuzz == 1 is normal; 0 or less are ignored */
		if (runq_fuzz > 1) {
			/*
			 * In the first couple of entries, check if
			 * there is one for our CPU as a preference.
			 */
			int count = runq_fuzz;
			int cpu = PCPU_GET(cpuid);
			struct kse *ke2;
			ke2 = ke = TAILQ_FIRST(rqh);

			while (count-- && ke2) {
				if (ke2->ke_thread->td_lastcpu == cpu) {
					ke = ke2;
					break;
				}
				ke2 = TAILQ_NEXT(ke2, ke_procq);
			}
		} else
#endif
			ke = TAILQ_FIRST(rqh);
		KASSERT(ke != NULL, ("runq_choose: no proc on busy queue"));
		CTR3(KTR_RUNQ,
		    "runq_choose: pri=%d kse=%p rqh=%p", pri, ke, rqh);
		return (ke);
	}
	CTR1(KTR_RUNQ, "runq_choose: idleproc pri=%d", pri);

	return (NULL);
}

/*
 * Remove the KSE from the queue specified by its priority, and clear the
 * corresponding status bit if the queue becomes empty.
 * Caller must set ke->ke_state afterwards.
 */
void
runq_remove(struct runq *rq, struct kse *ke)
{
	struct rqhead *rqh;
	int pri;

	KASSERT(ke != NULL, ("runq_remove: no proc on busy queue"));
	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
	    ("runq_remove: process swapped out"));
	pri = ke->ke_rqindex;
	rqh = &rq->rq_queues[pri];
	CTR5(KTR_RUNQ, "runq_remove: td=%p, ke=%p pri=%d %d rqh=%p",
	    ke->ke_thread, ke, ke->ke_thread->td_priority, pri, rqh);
	TAILQ_REMOVE(rqh, ke, ke_procq);
	if (TAILQ_EMPTY(rqh)) {
		CTR0(KTR_RUNQ, "runq_remove: empty");
		runq_clrbit(rq, pri);
	}
}

/****** functions that are temporarily here ***********/
#include <vm/uma.h>
#define RANGEOF(type, start, end) (offsetof(type, end) - offsetof(type, start))
extern struct mtx kse_zombie_lock;

/*
 * Allocate scheduler specific per-process resources.
 * The thread and ksegrp have already been linked in.
 * In this case just set the default concurrency value.
 *
 * Called from:
 *  proc_init() (UMA init method)
 */
void
sched_newproc(struct proc *p, struct ksegrp *kg, struct thread *td)
{

	/* This can go in sched_fork */
	sched_init_concurrency(kg);
}

/*
 * A thread is being either created or recycled.
 * Fix up the per-scheduler resources associated with it.
 * Called from:
 *  sched_fork_thread()
 *  thread_dtor()  (*may go away)
 *  thread_init()  (*may go away)
 */
void
sched_newthread(struct thread *td)
{
	struct td_sched *ke;

	ke = (struct td_sched *)(td + 1);
	bzero(ke, sizeof(*ke));
	td->td_sched = ke;
	ke->ke_thread = td;
	ke->ke_oncpu = NOCPU;
	ke->ke_state = KES_THREAD;
}

/*
 * Set up an initial concurrency of 1 and set the given thread
 * (if given) to be using that concurrency slot.
 * May be used "offline", i.e. before the ksegrp is attached to
 * the world, in which case schedlock is not needed.
 * Called from:
 *  thr_create()
 *  proc_init() (UMA) via sched_newproc()
 */
void
sched_init_concurrency(struct ksegrp *kg)
{

	CTR1(KTR_RUNQ, "kg %p init slots and concurrency to 1", kg);
	kg->kg_concurrency = 1;
	kg->kg_avail_opennings = 1;
}

/*
 * Change the concurrency of an existing ksegrp to N.
 * Called from:
 *  kse_create()
 *  kse_exit()
 *  thread_exit()
 *  thread_single()
 */
void
sched_set_concurrency(struct ksegrp *kg, int concurrency)
{

	CTR4(KTR_RUNQ, "kg %p set concurrency to %d, slots %d -> %d",
	    kg,
	    concurrency,
	    kg->kg_avail_opennings,
	    kg->kg_avail_opennings + (concurrency - kg->kg_concurrency));
	kg->kg_avail_opennings += (concurrency - kg->kg_concurrency);
	kg->kg_concurrency = concurrency;
}
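
/*
 * The adjustment above preserves slots already in use: e.g. a ksegrp
 * with a concurrency of 3 and 1 available opening (2 slots in use)
 * that is set to a concurrency of 2 ends up with 1 + (2 - 3) == 0
 * available openings.
 */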

/*
 * Called from thread_exit() for all exiting threads.
 *
 * Not to be confused with sched_exit_thread(), which is only called from
 * thread_exit() for threads exiting without the rest of the process
 * exiting, because it is also called from sched_exit() and we wouldn't
 * want to call it twice.
 * XXX This can probably be fixed.
 */
void
sched_thread_exit(struct thread *td)
{

	SLOT_RELEASE(td->td_ksegrp);
	slot_fill(td->td_ksegrp);
}

#endif /* KERN_SWITCH_INCLUDE */