/*
 * Copyright (c) 2001 Jake Burkholder <jake@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/***
Here is the logic:

If there are N processors, then there are at most N KSEs (kernel
schedulable entities) working to process threads that belong to a
KSEGROUP (kg). If there are X of these KSEs actually running at the
moment in question, then there are at most M (N-X) of these KSEs on
the run queue, as running KSEs are not on the queue.

Runnable threads are queued off the KSEGROUP in priority order.
If there are M or more threads runnable, the top M threads
(by priority) are 'preassigned' to the M KSEs not running. The KSEs take
their priority from those threads and are put on the run queue.

The last thread that had a priority high enough to have a KSE associated
with it, AND IS ON THE RUN QUEUE, is pointed to by
kg->kg_last_assigned. If no threads queued off the KSEGROUP have KSEs
assigned, either because all the available KSEs are actively running or
because there are no threads queued, that pointer is NULL.

When a KSE is removed from the run queue to become runnable, we know
it was associated with the highest priority thread in the queue (at the head
of the queue). If it is also the last assigned we know M was 1 and must
now be 0. Since the thread is no longer queued, the pointer must no
longer refer to it. Since we know there were no more KSEs available
(M was 1 and is now 0), and since we are not FREEING our KSE
but using it, we know there are STILL no more KSEs available; therefore
the next thread in the ksegrp list will not have a KSE to assign to
it, so the pointer must be made 'invalid' (NULL).

The pointer exists so that when a new thread is made runnable, its
priority can be compared with that of the last assigned thread to see
whether it should 'steal' that thread's KSE or not, i.e. whether it lands
'earlier' on the list than that thread or later. If it's earlier, the KSE is
removed from the last assigned thread (which is now not assigned a KSE)
and reassigned to the new thread, which is placed earlier in the list.
The pointer is then backed up to the previous thread (which may or may not
be the new thread).

When a thread sleeps or is removed, the KSE becomes available and if there
are queued threads that are not assigned KSEs, the highest priority one of
them is assigned the KSE, which is then placed back on the run queue at
the appropriate place, and the kg->kg_last_assigned pointer is adjusted down
to point to it.

The following diagram shows 2 KSEs and 3 threads from a single process.

 RUNQ: --->KSE---KSE--...    (KSEs queued at priorities from threads)
              \    \____
               \        \
    KSEGROUP---thread--thread--thread    (queued in priority order)
        \                 /
         \_______________/
          (last_assigned)

The result of this scheme is that the M available KSEs are always
queued at the priorities they have inherited from the M highest priority
threads for that KSEGROUP. If this situation changes, the KSEs are
reassigned to keep this true.
***/
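
/*
 * A minimal sketch (not compiled) of the 'steal' decision described
 * above, using a hypothetical helper; the authoritative version is
 * the KSE handling in setrunqueue() below.
 */
#if 0
static int
should_steal_kse(struct ksegrp *kg, struct thread *newtd)
{
	struct thread *tda = kg->kg_last_assigned;

	/*
	 * Nothing on the ksegrp run queue holds a KSE, so there is
	 * nothing to steal; a free KSE must be used instead.
	 */
	if (tda == NULL)
		return (0);
	/*
	 * Numerically lower td_priority is better.  If the new thread
	 * beats the lowest-priority thread that owns a KSE, it takes
	 * that KSE and kg_last_assigned backs up one place.
	 */
	return (newtd->td_priority < tda->td_priority);
}
#endif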

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/kern_switch.c 134591 2004-09-01 06:42:02Z julian $");

#include "opt_full_preemption.h"
#include "opt_sched.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/sched.h>
#if defined(SMP) && (defined(__i386__) || defined(__amd64__))
#include <sys/smp.h>
#endif
#include <machine/critical.h>
#if defined(SMP) && defined(SCHED_4BSD)
#include <sys/sysctl.h>
#endif


CTASSERT((RQB_BPW * RQB_LEN) == RQ_NQS);

void panc(char *string1, char *string2);

#if 0
static void runq_readjust(struct runq *rq, struct kse *ke);
#endif
/************************************************************************
 * Functions that manipulate runnability from a thread perspective.	*
 ************************************************************************/
/*
 * Select the KSE that will be run next.  From that find the thread, and
 * remove it from the KSEGRP's run queue.  If there is thread clustering,
 * this will be what does it.
 */
struct thread *
choosethread(void)
{
	struct kse *ke;
	struct thread *td;
	struct ksegrp *kg;

#if defined(SMP) && (defined(__i386__) || defined(__amd64__))
	if (smp_active == 0 && PCPU_GET(cpuid) != 0) {
		/* Shutting down, run idlethread on AP's */
		td = PCPU_GET(idlethread);
		ke = td->td_kse;
		CTR1(KTR_RUNQ, "choosethread: td=%p (idle)", td);
		ke->ke_flags |= KEF_DIDRUN;
		TD_SET_RUNNING(td);
		return (td);
	}
#endif

retry:
	ke = sched_choose();
	if (ke) {
		td = ke->ke_thread;
		KASSERT((td->td_kse == ke), ("kse/thread mismatch"));
		kg = ke->ke_ksegrp;
		if (td->td_proc->p_flag & P_SA) {
			if (kg->kg_last_assigned == td) {
				kg->kg_last_assigned = TAILQ_PREV(td,
				    threadqueue, td_runq);
			}
			TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
			kg->kg_runnable--;
		}
		CTR2(KTR_RUNQ, "choosethread: td=%p pri=%d",
		    td, td->td_priority);
	} else {
		/* Simulate runq_choose() having returned the idle thread */
		td = PCPU_GET(idlethread);
		ke = td->td_kse;
		CTR1(KTR_RUNQ, "choosethread: td=%p (idle)", td);
	}
	ke->ke_flags |= KEF_DIDRUN;

	/*
	 * If we are in panic, only allow system threads,
	 * plus the one we are running in, to be run.
	 */
	if (panicstr && ((td->td_proc->p_flag & P_SYSTEM) == 0 &&
	    (td->td_flags & TDF_INPANIC) == 0)) {
		/* note that it is no longer on the run queue */
		TD_SET_CAN_RUN(td);
		goto retry;
	}

	TD_SET_RUNNING(td);
	return (td);
}

/*
 * Given a surplus KSE, either assign a new runnable thread to it
 * (and put it in the run queue) or put it in the ksegrp's idle KSE list.
 * Assumes that the original thread is not runnable.
 */
void
kse_reassign(struct kse *ke)
{
	struct ksegrp *kg;
	struct thread *td;
	struct thread *original;

	mtx_assert(&sched_lock, MA_OWNED);
	original = ke->ke_thread;
	KASSERT(original == NULL || TD_IS_INHIBITED(original),
	    ("reassigning KSE with runnable thread"));
	kg = ke->ke_ksegrp;
	if (original)
		original->td_kse = NULL;

	/*
	 * Find the first unassigned thread
	 */
	if ((td = kg->kg_last_assigned) != NULL)
		td = TAILQ_NEXT(td, td_runq);
	else
		td = TAILQ_FIRST(&kg->kg_runq);

	/*
	 * If we found one, assign it the kse, otherwise idle the kse.
	 */
	if (td) {
		kg->kg_last_assigned = td;
		td->td_kse = ke;
		ke->ke_thread = td;
		CTR2(KTR_RUNQ, "kse_reassign: ke%p -> td%p", ke, td);
		sched_add(td, SRQ_BORING);
		return;
	}

	ke->ke_state = KES_IDLE;
	ke->ke_thread = NULL;
	TAILQ_INSERT_TAIL(&kg->kg_iq, ke, ke_kgrlist);
	kg->kg_idle_kses++;
	CTR1(KTR_RUNQ, "kse_reassign: ke%p on idle queue", ke);
	return;
}

#if 0
/*
 * Remove a thread from its KSEGRP's run queue.
 * This in turn may remove it from a KSE if it was already assigned
 * to one, possibly causing a new thread to be assigned to the KSE
 * and the KSE getting a new priority.
 */
static void
remrunqueue(struct thread *td)
{
	struct thread *td2, *td3;
	struct ksegrp *kg;
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT((TD_ON_RUNQ(td)), ("remrunqueue: Bad state on run queue"));
	kg = td->td_ksegrp;
	ke = td->td_kse;
	CTR1(KTR_RUNQ, "remrunqueue: td%p", td);
	TD_SET_CAN_RUN(td);
	/*
	 * If it is not a threaded process, take the shortcut.
	 */
	if ((td->td_proc->p_flag & P_SA) == 0) {
		/* Bring its kse with it, leave the thread attached */
		sched_rem(td);
		ke->ke_state = KES_THREAD;
		return;
	}
	td3 = TAILQ_PREV(td, threadqueue, td_runq);
	TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
	kg->kg_runnable--;
	if (ke) {
		/*
		 * This thread has been assigned to a KSE.
		 * We need to dissociate it and try to assign the
		 * KSE to the next available thread. Then, we should
		 * see if we need to move the KSE in the run queues.
		 */
		sched_rem(td);
		ke->ke_state = KES_THREAD;
		td2 = kg->kg_last_assigned;
		KASSERT((td2 != NULL), ("last assigned has wrong value"));
		if (td2 == td)
			kg->kg_last_assigned = td3;
		kse_reassign(ke);
	}
}
#endif

/*
 * Change the priority of a thread that is on the run queue.
 */
void
adjustrunqueue(struct thread *td, int newpri)
{
	struct ksegrp *kg;
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT((TD_ON_RUNQ(td)), ("adjustrunqueue: Bad state on run queue"));

	ke = td->td_kse;
	CTR1(KTR_RUNQ, "adjustrunqueue: td%p", td);
	/*
	 * If it is not a threaded process, take the shortcut.
	 */
	if ((td->td_proc->p_flag & P_SA) == 0) {
		/* We only care about the kse in the run queue. */
		td->td_priority = newpri;
		if (ke->ke_rqindex != (newpri / RQ_PPQ)) {
			sched_rem(td);
			sched_add(td, SRQ_BORING);
		}
		return;
	}

	/* It is a threaded process */
	kg = td->td_ksegrp;
	TD_SET_CAN_RUN(td);
	if (ke) {
		if (kg->kg_last_assigned == td) {
			kg->kg_last_assigned =
			    TAILQ_PREV(td, threadqueue, td_runq);
		}
		sched_rem(td);
	}
	TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
	kg->kg_runnable--;
	td->td_priority = newpri;
	setrunqueue(td, SRQ_BORING);
}

void
setrunqueue(struct thread *td, int flags)
{
	struct kse *ke;
	struct ksegrp *kg;
	struct thread *td2;
	struct thread *tda;
	int count;

	CTR4(KTR_RUNQ, "setrunqueue: td:%p ke:%p kg:%p pid:%d",
	    td, td->td_kse, td->td_ksegrp, td->td_proc->p_pid);
	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
	    ("setrunqueue: bad thread state"));
	TD_SET_RUNQ(td);
	kg = td->td_ksegrp;
	if ((td->td_proc->p_flag & P_SA) == 0) {
		/*
		 * Common path optimisation: Only one of everything
		 * and the KSE is always already attached.
		 * Totally ignore the ksegrp run queue.
		 */
		sched_add(td, flags);
		return;
	}

	tda = kg->kg_last_assigned;
	if ((ke = td->td_kse) == NULL) {
		if (kg->kg_idle_kses) {
			/*
			 * There is a free one so it's ours for the asking..
			 */
			ke = TAILQ_FIRST(&kg->kg_iq);
			CTR2(KTR_RUNQ, "setrunqueue: kg:%p: Use free ke:%p",
			    kg, ke);
			TAILQ_REMOVE(&kg->kg_iq, ke, ke_kgrlist);
			ke->ke_state = KES_THREAD;
			kg->kg_idle_kses--;
		} else if (tda && (tda->td_priority > td->td_priority)) {
			/*
			 * None free, but there is one we can commandeer.
			 */
			ke = tda->td_kse;
			CTR3(KTR_RUNQ,
			    "setrunqueue: kg:%p: take ke:%p from td: %p",
			    kg, ke, tda);
			sched_rem(tda);
			tda->td_kse = NULL;
			ke->ke_thread = NULL;
			tda = kg->kg_last_assigned =
			    TAILQ_PREV(tda, threadqueue, td_runq);
		}
	} else {
		/*
		 * Temporarily disassociate so it looks like the other cases.
		 */
		ke->ke_thread = NULL;
		td->td_kse = NULL;
	}

	/*
	 * Add the thread to the ksegrp's run queue at
	 * the appropriate place.
	 */
	count = 0;
	TAILQ_FOREACH(td2, &kg->kg_runq, td_runq) {
		if (td2->td_priority > td->td_priority) {
			kg->kg_runnable++;
			TAILQ_INSERT_BEFORE(td2, td, td_runq);
			break;
		}
		/* XXX Debugging hack */
		if (++count > 10000) {
			printf("setrunqueue(): corrupt kg_runq, td= %p\n", td);
			panic("deadlock in setrunqueue");
		}
	}
	if (td2 == NULL) {
		/* We ran off the end of the TAILQ or it was empty. */
		kg->kg_runnable++;
		TAILQ_INSERT_TAIL(&kg->kg_runq, td, td_runq);
	}

	/*
	 * If we have a ke to use, then put it on the run queue and,
	 * if needed, readjust the last_assigned pointer.
	 */
	if (ke) {
		if (tda == NULL) {
			/*
			 * No pre-existing last assigned so whoever is first
			 * gets the KSE we brought in.. (maybe us)
			 */
			td2 = TAILQ_FIRST(&kg->kg_runq);
			KASSERT((td2->td_kse == NULL),
			    ("unexpected ke present"));
			td2->td_kse = ke;
			ke->ke_thread = td2;
			kg->kg_last_assigned = td2;
		} else if (tda->td_priority > td->td_priority) {
			/*
			 * It's ours, grab it, but last_assigned is past us
			 * so don't change it.
			 */
			td->td_kse = ke;
			ke->ke_thread = td;
		} else {
			/*
			 * We are past last_assigned, so
			 * put the new kse on whatever is next,
			 * which may or may not be us.
			 */
			td2 = TAILQ_NEXT(tda, td_runq);
			kg->kg_last_assigned = td2;
			td2->td_kse = ke;
			ke->ke_thread = td2;
		}
		sched_add(ke->ke_thread, flags);
	} else {
		CTR3(KTR_RUNQ, "setrunqueue: held: td%p kg%p pid%d",
		    td, td->td_ksegrp, td->td_proc->p_pid);
	}
}

/*
 * Kernel thread preemption implementation.  Critical sections mark
 * regions of code in which preemptions are not allowed.
 */
void
critical_enter(void)
{
	struct thread *td;

	td = curthread;
	if (td->td_critnest == 0)
		cpu_critical_enter(td);
	td->td_critnest++;
}

void
critical_exit(void)
{
	struct thread *td;

	td = curthread;
	KASSERT(td->td_critnest != 0,
	    ("critical_exit: td_critnest == 0"));
	if (td->td_critnest == 1) {
#ifdef PREEMPTION
		mtx_assert(&sched_lock, MA_NOTOWNED);
		if (td->td_pflags & TDP_OWEPREEMPT) {
			mtx_lock_spin(&sched_lock);
			mi_switch(SW_INVOL, NULL);
			mtx_unlock_spin(&sched_lock);
		}
#endif
		td->td_critnest = 0;
		cpu_critical_exit(td);
	} else {
		td->td_critnest--;
	}
}
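
/*
 * A minimal usage sketch (not compiled), with a hypothetical caller:
 * critical_enter()/critical_exit() nest, and a preemption requested
 * inside the section is deferred until the outermost exit.
 */
#if 0
static void
critical_section_example(void)
{
	critical_enter();	/* td_critnest: 0 -> 1 */
	critical_enter();	/* nested: 1 -> 2 */
	/* ... code that must not be preempted ... */
	critical_exit();	/* 2 -> 1, still in the section */
	critical_exit();	/* 1 -> 0, deferred switch may happen here */
}
#endif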

/*
 * This function is called when a thread is about to be put on a run queue
 * because it has been made runnable or its priority has been adjusted.  It
 * determines whether the new thread should preempt the current thread
 * immediately.  If so, it switches to it and eventually returns true.  If
 * not, it returns false so that the caller may place the thread on an
 * appropriate run queue.
 */
int
maybe_preempt(struct thread *td)
{
#ifdef PREEMPTION
	struct thread *ctd;
	int cpri, pri;
#endif

	mtx_assert(&sched_lock, MA_OWNED);
#ifdef PREEMPTION
	/*
	 * The new thread should not preempt the current thread if any of the
	 * following conditions are true:
	 *
	 *  - The current thread has a higher (numerically lower) or
	 *    equivalent priority.  Note that this prevents curthread from
	 *    trying to preempt to itself.
	 *  - It is too early in the boot for context switches (cold is set).
	 *  - The current thread has an inhibitor set or is in the process of
	 *    exiting.  In this case, the current thread is about to switch
	 *    out anyway, so there's no point in preempting.  If we did,
	 *    the current thread would not be properly resumed as well, so
	 *    just avoid that whole landmine.
	 *  - The new thread's priority is not a realtime priority and
	 *    the current thread's priority is not an idle priority and
	 *    FULL_PREEMPTION is disabled.
	 *
	 * If all of these conditions are false, but the current thread is in
	 * a nested critical section, then we have to defer the preemption
	 * until we exit the critical section.  Otherwise, switch immediately
	 * to the new thread.
	 */
	ctd = curthread;
	if (ctd->td_kse == NULL || ctd->td_kse->ke_thread != ctd)
		return (0);
	pri = td->td_priority;
	cpri = ctd->td_priority;
	if (pri >= cpri || cold /* || dumping */ || TD_IS_INHIBITED(ctd) ||
	    td->td_kse->ke_state != KES_THREAD)
		return (0);
#ifndef FULL_PREEMPTION
	if (!(pri >= PRI_MIN_ITHD && pri <= PRI_MAX_ITHD) &&
	    !(cpri >= PRI_MIN_IDLE))
		return (0);
#endif
	if (ctd->td_critnest > 1) {
		CTR1(KTR_PROC, "maybe_preempt: in critical section %d",
		    ctd->td_critnest);
		ctd->td_pflags |= TDP_OWEPREEMPT;
		return (0);
	}

	/*
	 * Our thread state says that we are already on a run queue, so
	 * update our state as if we had been dequeued by choosethread().
	 */
	MPASS(TD_ON_RUNQ(td));
	TD_SET_RUNNING(td);
	CTR3(KTR_PROC, "preempting to thread %p (pid %d, %s)\n", td,
	    td->td_proc->p_pid, td->td_proc->p_comm);
	mi_switch(SW_INVOL, td);
	return (1);
#else
	return (0);
#endif
}
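
/*
 * A condensed sketch (not compiled) of the decision above for the
 * FULL_PREEMPTION-disabled case, using a hypothetical helper:
 * preemption requires a strictly better (numerically lower) priority
 * in the interrupt-thread range, unless the current thread is
 * running at an idle-class priority.
 */
#if 0
static int
would_preempt(int pri, int cpri)
{
	if (pri >= cpri)	/* not strictly better, never preempt */
		return (0);
	if ((pri >= PRI_MIN_ITHD && pri <= PRI_MAX_ITHD) ||
	    cpri >= PRI_MIN_IDLE)
		return (1);
	return (0);
}
#endif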

#if 0
#ifndef PREEMPTION
/* XXX: There should be a non-static version of this. */
static void
printf_caddr_t(void *data)
{
	printf("%s", (char *)data);
}
static char preempt_warning[] =
    "WARNING: Kernel preemption is disabled, expect reduced performance.\n";
SYSINIT(preempt_warning, SI_SUB_COPYRIGHT, SI_ORDER_ANY, printf_caddr_t,
    preempt_warning)
#endif
#endif

/************************************************************************
 * SYSTEM RUN QUEUE manipulations and tests				*
 ************************************************************************/
/*
 * Initialize a run structure.
 */
void
runq_init(struct runq *rq)
{
	int i;

	bzero(rq, sizeof *rq);
	for (i = 0; i < RQ_NQS; i++)
		TAILQ_INIT(&rq->rq_queues[i]);
}

/*
 * Clear the status bit of the queue corresponding to priority level pri,
 * indicating that it is empty.
 */
static __inline void
runq_clrbit(struct runq *rq, int pri)
{
	struct rqbits *rqb;

	rqb = &rq->rq_status;
	CTR4(KTR_RUNQ, "runq_clrbit: bits=%#x %#x bit=%#x word=%d",
	    rqb->rqb_bits[RQB_WORD(pri)],
	    rqb->rqb_bits[RQB_WORD(pri)] & ~RQB_BIT(pri),
	    RQB_BIT(pri), RQB_WORD(pri));
	rqb->rqb_bits[RQB_WORD(pri)] &= ~RQB_BIT(pri);
}

/*
 * Find the index of the first non-empty run queue.  This is done by
 * scanning the status bits; a set bit indicates a non-empty queue.
 */
static __inline int
runq_findbit(struct runq *rq)
{
	struct rqbits *rqb;
	int pri;
	int i;

	rqb = &rq->rq_status;
	for (i = 0; i < RQB_LEN; i++)
		if (rqb->rqb_bits[i]) {
			pri = RQB_FFS(rqb->rqb_bits[i]) + (i << RQB_L2BPW);
			CTR3(KTR_RUNQ, "runq_findbit: bits=%#x i=%d pri=%d",
			    rqb->rqb_bits[i], i, pri);
			return (pri);
		}

	return (-1);
}
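
/*
 * Worked example of the status-bitmap indexing, assuming RQ_NQS == 64
 * and 32-bit status words (RQB_BPW == 32, RQB_LEN == 2, RQB_L2BPW == 5):
 * queue 40 lives in word RQB_WORD(40) == 1 as bit 40 % 32 == 8, and
 * runq_findbit() recovers the index as RQB_FFS(word) + (1 << 5) == 40.
 */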

/*
 * Set the status bit of the queue corresponding to priority level pri,
 * indicating that it is non-empty.
 */
static __inline void
runq_setbit(struct runq *rq, int pri)
{
	struct rqbits *rqb;

	rqb = &rq->rq_status;
	CTR4(KTR_RUNQ, "runq_setbit: bits=%#x %#x bit=%#x word=%d",
	    rqb->rqb_bits[RQB_WORD(pri)],
	    rqb->rqb_bits[RQB_WORD(pri)] | RQB_BIT(pri),
	    RQB_BIT(pri), RQB_WORD(pri));
	rqb->rqb_bits[RQB_WORD(pri)] |= RQB_BIT(pri);
}

/*
 * Add the KSE to the queue specified by its priority, and set the
 * corresponding status bit.
 */
void
runq_add(struct runq *rq, struct kse *ke)
{
	struct rqhead *rqh;
	int pri;

	pri = ke->ke_thread->td_priority / RQ_PPQ;
	ke->ke_rqindex = pri;
	runq_setbit(rq, pri);
	rqh = &rq->rq_queues[pri];
	CTR5(KTR_RUNQ, "runq_add: td=%p ke=%p pri=%d %d rqh=%p",
	    ke->ke_thread, ke, ke->ke_thread->td_priority, pri, rqh);
	TAILQ_INSERT_TAIL(rqh, ke, ke_procq);
}
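
/*
 * For example, assuming RQ_PPQ == 4 (256 priority levels folded into
 * RQ_NQS == 64 queues), threads at td_priority 120 through 123 all
 * share run queue index 30; only a priority change that crosses a
 * multiple of RQ_PPQ moves a KSE to a different queue, which is the
 * test adjustrunqueue() makes before calling sched_rem()/sched_add().
 */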

/*
 * Return true if there are runnable processes of any priority on the run
 * queue, false otherwise.  Has no side effects, does not modify the run
 * queue structure.
 */
int
runq_check(struct runq *rq)
{
	struct rqbits *rqb;
	int i;

	rqb = &rq->rq_status;
	for (i = 0; i < RQB_LEN; i++)
		if (rqb->rqb_bits[i]) {
			CTR2(KTR_RUNQ, "runq_check: bits=%#x i=%d",
			    rqb->rqb_bits[i], i);
			return (1);
		}
	CTR0(KTR_RUNQ, "runq_check: empty");

	return (0);
}

#if defined(SMP) && defined(SCHED_4BSD)
int runq_fuzz = 1;
SYSCTL_DECL(_kern_sched);
SYSCTL_INT(_kern_sched, OID_AUTO, runq_fuzz, CTLFLAG_RW, &runq_fuzz, 0, "");
#endif

/*
 * Find the highest priority process on the run queue.
 */
struct kse *
runq_choose(struct runq *rq)
{
	struct rqhead *rqh;
	struct kse *ke;
	int pri;

	mtx_assert(&sched_lock, MA_OWNED);
	while ((pri = runq_findbit(rq)) != -1) {
		rqh = &rq->rq_queues[pri];
#if defined(SMP) && defined(SCHED_4BSD)
		/* fuzz == 1 is normal.. 0 or less are ignored */
		if (runq_fuzz > 1) {
			/*
			 * In the first couple of entries, check if
			 * there is one for our CPU as a preference.
			 */
			int count = runq_fuzz;
			int cpu = PCPU_GET(cpuid);
			struct kse *ke2;
			ke2 = ke = TAILQ_FIRST(rqh);

			while (count-- && ke2) {
				/*
				 * Test the candidate ke2, not the head,
				 * so the scan can actually find a later
				 * KSE whose last CPU matches ours.
				 */
				if (ke2->ke_thread->td_lastcpu == cpu) {
					ke = ke2;
					break;
				}
				ke2 = TAILQ_NEXT(ke2, ke_procq);
			}
		} else
#endif
			ke = TAILQ_FIRST(rqh);
		KASSERT(ke != NULL, ("runq_choose: no proc on busy queue"));
		CTR3(KTR_RUNQ,
		    "runq_choose: pri=%d kse=%p rqh=%p", pri, ke, rqh);
		return (ke);
	}
	CTR1(KTR_RUNQ, "runq_choose: idleproc pri=%d", pri);

	return (NULL);
}
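
/*
 * A minimal sketch (not compiled) of how the runq API fits together;
 * in the kernel the scheduler drives these calls, not code like this.
 */
#if 0
static void
runq_example(struct runq *rq, struct kse *ke)
{
	struct kse *best;

	runq_init(rq);			/* empty queues, cleared status bits */
	runq_add(rq, ke);		/* indexed by td_priority / RQ_PPQ */
	best = runq_choose(rq);		/* best priority; does not dequeue */
	if (best != NULL) {
		runq_remove(rq, best);	/* dequeue; may clear a status bit */
		best->ke_state = KES_THREAD;	/* caller sets ke_state */
	}
}
#endif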

/*
 * Remove the KSE from the queue specified by its priority, and clear the
 * corresponding status bit if the queue becomes empty.
 * Caller must set ke->ke_state afterwards.
 */
void
runq_remove(struct runq *rq, struct kse *ke)
{
	struct rqhead *rqh;
	int pri;

	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
	    ("runq_remove: process swapped out"));
	pri = ke->ke_rqindex;
	rqh = &rq->rq_queues[pri];
	CTR5(KTR_RUNQ, "runq_remove: td=%p, ke=%p pri=%d %d rqh=%p",
	    ke->ke_thread, ke, ke->ke_thread->td_priority, pri, rqh);
	KASSERT(ke != NULL, ("runq_remove: no proc on busy queue"));
	TAILQ_REMOVE(rqh, ke, ke_procq);
	if (TAILQ_EMPTY(rqh)) {
		CTR0(KTR_RUNQ, "runq_remove: empty");
		runq_clrbit(rq, pri);
	}
}

#if 0
void
panc(char *string1, char *string2)
{
	printf("%s", string1);
	kdb_enter(string2);
}

void
thread_sanity_check(struct thread *td, char *string)
{
	struct proc *p;
	struct ksegrp *kg;
	struct kse *ke;
	struct thread *td2 = NULL;
	unsigned int prevpri;
	int saw_lastassigned = 0;
	int unassigned = 0;
	int assigned = 0;

	p = td->td_proc;
	kg = td->td_ksegrp;
	ke = td->td_kse;

	if (ke) {
		if (p != ke->ke_proc) {
			panc(string, "wrong proc");
		}
		if (ke->ke_thread != td) {
			panc(string, "wrong thread");
		}
	}

	if ((p->p_flag & P_SA) == 0) {
		if (ke == NULL) {
			panc(string, "non KSE thread lost kse");
		}
	} else {
		prevpri = 0;
		saw_lastassigned = 0;
		unassigned = 0;
		assigned = 0;
		TAILQ_FOREACH(td2, &kg->kg_runq, td_runq) {
			if (td2->td_priority < prevpri) {
				panc(string, "thread runqueue unsorted");
			}
			if ((td2->td_state == TDS_RUNQ) &&
			    td2->td_kse &&
			    (td2->td_kse->ke_state != KES_ONRUNQ)) {
				panc(string, "KSE wrong state");
			}
			prevpri = td2->td_priority;
			if (td2->td_kse) {
				assigned++;
				if (unassigned) {
					panc(string, "unassigned before assigned");
				}
				if (kg->kg_last_assigned == NULL) {
					panc(string, "lastassigned corrupt");
				}
				if (saw_lastassigned) {
					panc(string, "last assigned not last");
				}
				if (td2->td_kse->ke_thread != td2) {
					panc(string, "mismatched kse/thread");
				}
			} else {
				unassigned++;
			}
			if (td2 == kg->kg_last_assigned) {
				saw_lastassigned = 1;
				if (td2->td_kse == NULL) {
					panc(string, "last assigned not assigned");
				}
			}
		}
		if (kg->kg_last_assigned && (saw_lastassigned == 0)) {
			panc(string, "where on earth does lastassigned point?");
		}
#if 0
		FOREACH_THREAD_IN_GROUP(kg, td2) {
			if (((td2->td_flags & TDF_UNBOUND) == 0) &&
			    (TD_ON_RUNQ(td2))) {
				assigned++;
				if (td2->td_kse == NULL) {
					panc(string, "BOUND thread with no KSE");
				}
			}
		}
#endif
#if 0
		if ((unassigned + assigned) != kg->kg_runnable) {
			panc(string, "wrong number in runnable");
		}
#endif
	}
	if (assigned == 12345) {
		printf("%p %p %p %p %p %d, %d",
		    td, td2, ke, kg, p, assigned, saw_lastassigned);
	}
}
#endif