kern_switch.c revision 134665
/*
 * Copyright (c) 2001 Jake Burkholder <jake@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/***
Here is the logic:

If there are N processors, then there are at most N KSEs (kernel
schedulable entities) working to process threads that belong to a
KSEGROUP (kg). If there are X of these KSEs actually running at the
moment in question, then there are at most M = N - X of these KSEs on
the run queue, as running KSEs are not on the queue.
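For example (illustrative numbers only): with N = 4 processors and X = 3
of the group's KSEs currently running, at most M = 1 KSE from that group
can be sitting on the run queue.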

Runnable threads are queued off the KSEGROUP in priority order.
If there are M or more threads runnable, the top M threads
(by priority) are 'preassigned' to the M KSEs not running. The KSEs take
their priority from those threads and are put on the run queue.

The last thread that has a priority high enough to have a KSE associated
with it, AND IS ON THE RUN QUEUE, is pointed to by
kg->kg_last_assigned. If no threads queued off the KSEGROUP have KSEs
assigned, either because all the available KSEs are actively running or
because there are no threads queued, that pointer is NULL.

When a KSE is removed from the run queue to be run, we know it was
associated with the highest priority thread in the queue (at the head
of the queue). If that thread is also the last assigned one, we know M
was 1 and must now be 0, and since the thread is no longer queued the
pointer must no longer refer to it. Because we are not FREEING our KSE
but using it, there are STILL no KSEs available (M was 1 and is now 0),
so the next thread in the ksegrp list cannot have a KSE assigned to it,
and the pointer must therefore be made 'invalid' (NULL).

The pointer exists so that when a new thread is made runnable, its
priority can be compared with that of the last assigned thread to see
whether it should 'steal' that thread's KSE, i.e. whether it belongs
earlier or later on the list. If it is earlier, the KSE is removed from
the last assigned thread (which is now left without a KSE) and
reassigned to the new thread, which is placed earlier in the list.
The pointer is then backed up to the previous thread (which may or may
not be the new thread).

When a thread sleeps or is removed, the KSE becomes available, and if
there are queued threads that are not assigned KSEs, the highest
priority one of them is assigned the KSE, which is then placed back on
the run queue at the appropriate place, and the kg->kg_last_assigned
pointer is adjusted down to point to it.

The following diagram shows 2 KSEs and 3 threads from a single process.

 RUNQ: --->KSE---KSE--...    (KSEs queued at priorities from threads)
              \    \____
               \        \
    KSEGROUP---thread--thread--thread    (queued in priority order)
        \                 /
         \_______________/
          (last_assigned)

The result of this scheme is that the M available KSEs are always
queued at the priorities they have inherited from the M highest priority
threads for that KSEGROUP. If this situation changes, the KSEs are
reassigned to keep this true.
***/

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/kern_switch.c 134665 2004-09-02 23:37:41Z julian $");

#include "opt_sched.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/sched.h>
#if defined(SMP) && (defined(__i386__) || defined(__amd64__))
#include <sys/smp.h>
#endif
#include <machine/critical.h>
#if defined(SMP) && defined(SCHED_4BSD)
#include <sys/sysctl.h>
#endif

#ifdef FULL_PREEMPTION
#ifndef PREEMPTION
#error "The FULL_PREEMPTION option requires the PREEMPTION option"
#endif
#endif

CTASSERT((RQB_BPW * RQB_LEN) == RQ_NQS);

/************************************************************************
 * Functions that manipulate runnability from a thread perspective.	*
 ************************************************************************/
/*
 * Select the KSE that will be run next.  From that, find the thread and
 * remove it from the KSEGRP's run queue.  If there is thread clustering,
 * this is where it will be done.
 */
struct thread *
choosethread(void)
{
	struct kse *ke;
	struct thread *td;
	struct ksegrp *kg;

#if defined(SMP) && (defined(__i386__) || defined(__amd64__))
	if (smp_active == 0 && PCPU_GET(cpuid) != 0) {
		/* Shutting down, run the idle thread on the APs */
		td = PCPU_GET(idlethread);
		ke = td->td_kse;
		CTR1(KTR_RUNQ, "choosethread: td=%p (idle)", td);
		ke->ke_flags |= KEF_DIDRUN;
		TD_SET_RUNNING(td);
		return (td);
	}
#endif

retry:
	ke = sched_choose();
	if (ke) {
		td = ke->ke_thread;
		KASSERT((td->td_kse == ke), ("kse/thread mismatch"));
		kg = ke->ke_ksegrp;
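		/*
		 * For a threaded (KSE) process the chosen thread also sits
		 * on its ksegrp's run queue, so take it off that queue and,
		 * if it was the last thread with an assigned KSE, back
		 * kg_last_assigned up to its predecessor.
		 */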
		if (td->td_proc->p_flag & P_SA) {
			if (kg->kg_last_assigned == td) {
				kg->kg_last_assigned = TAILQ_PREV(td,
				    threadqueue, td_runq);
			}
			TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
			kg->kg_runnable--;
		}
		CTR2(KTR_RUNQ, "choosethread: td=%p pri=%d",
		    td, td->td_priority);
	} else {
		/* Simulate runq_choose() having returned the idle thread */
		td = PCPU_GET(idlethread);
		ke = td->td_kse;
		CTR1(KTR_RUNQ, "choosethread: td=%p (idle)", td);
	}
	ke->ke_flags |= KEF_DIDRUN;

	/*
	 * If we are in panic, only allow system threads,
	 * plus the one we are running in, to be run.
	 */
	if (panicstr && ((td->td_proc->p_flag & P_SYSTEM) == 0 &&
	    (td->td_flags & TDF_INPANIC) == 0)) {
		/* note that it is no longer on the run queue */
		TD_SET_CAN_RUN(td);
		goto retry;
	}

	TD_SET_RUNNING(td);
	return (td);
}

/*
 * Given a surplus KSE, either assign a new runnable thread to it
 * (and put it in the run queue) or put it in the ksegrp's idle KSE list.
 * Assumes that the original thread is not runnable.
 */
void
kse_reassign(struct kse *ke)
{
	struct ksegrp *kg;
	struct thread *td;
	struct thread *original;

	mtx_assert(&sched_lock, MA_OWNED);
	original = ke->ke_thread;
	KASSERT(original == NULL || TD_IS_INHIBITED(original),
	    ("reassigning KSE with runnable thread"));
	kg = ke->ke_ksegrp;
	if (original)
		original->td_kse = NULL;

	/*
	 * Find the first unassigned thread.
	 */
	if ((td = kg->kg_last_assigned) != NULL)
		td = TAILQ_NEXT(td, td_runq);
	else
		td = TAILQ_FIRST(&kg->kg_runq);

	/*
	 * If we found one, assign it the kse, otherwise idle the kse.
	 */
	if (td) {
		kg->kg_last_assigned = td;
		td->td_kse = ke;
		ke->ke_thread = td;
		CTR2(KTR_RUNQ, "kse_reassign: ke%p -> td%p", ke, td);
		sched_add(td, SRQ_BORING);
		return;
	}

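	/*
	 * No unassigned runnable thread was found, so park the KSE on
	 * the ksegrp's idle list until one shows up.
	 */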
	ke->ke_state = KES_IDLE;
	ke->ke_thread = NULL;
	TAILQ_INSERT_TAIL(&kg->kg_iq, ke, ke_kgrlist);
	kg->kg_idle_kses++;
	CTR1(KTR_RUNQ, "kse_reassign: ke%p on idle queue", ke);
	return;
}

#if 0
/*
 * Remove a thread from its KSEGRP's run queue.
 * This in turn may remove it from a KSE if it was already assigned
 * to one, possibly causing a new thread to be assigned to the KSE
 * and the KSE being given a new priority.
 */
static void
remrunqueue(struct thread *td)
{
	struct thread *td2, *td3;
	struct ksegrp *kg;
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT((TD_ON_RUNQ(td)), ("remrunqueue: Bad state on run queue"));
	kg = td->td_ksegrp;
	ke = td->td_kse;
	CTR1(KTR_RUNQ, "remrunqueue: td%p", td);
	TD_SET_CAN_RUN(td);
	/*
	 * If it is not a threaded process, take the shortcut.
	 */
	if ((td->td_proc->p_flag & P_SA) == 0) {
		/* Bring its kse with it, leave the thread attached */
		sched_rem(td);
		ke->ke_state = KES_THREAD;
		return;
	}
	td3 = TAILQ_PREV(td, threadqueue, td_runq);
	TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
	kg->kg_runnable--;
	if (ke) {
		/*
		 * This thread has been assigned to a KSE.
		 * We need to dissociate it and try to assign the
		 * KSE to the next available thread. Then, we should
		 * see if we need to move the KSE in the run queues.
		 */
		sched_rem(td);
		ke->ke_state = KES_THREAD;
		td2 = kg->kg_last_assigned;
		KASSERT((td2 != NULL), ("last assigned has wrong value"));
		if (td2 == td)
			kg->kg_last_assigned = td3;
		kse_reassign(ke);
	}
}
#endif

/*
 * Change the priority of a thread that is on the run queue.
 */
void
adjustrunqueue(struct thread *td, int newpri)
{
	struct ksegrp *kg;
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT((TD_ON_RUNQ(td)), ("adjustrunqueue: Bad state on run queue"));

	ke = td->td_kse;
	CTR1(KTR_RUNQ, "adjustrunqueue: td%p", td);
	/*
	 * If it is not a threaded process, take the shortcut.
	 */
	if ((td->td_proc->p_flag & P_SA) == 0) {
		/* We only care about the kse in the run queue. */
		td->td_priority = newpri;
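		/*
		 * Only requeue the KSE if the new priority maps to a
		 * different run queue bucket (RQ_PPQ priorities per queue).
		 */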
		if (ke->ke_rqindex != (newpri / RQ_PPQ)) {
			sched_rem(td);
			sched_add(td, SRQ_BORING);
		}
		return;
	}

	/* It is a threaded process */
	kg = td->td_ksegrp;
	TD_SET_CAN_RUN(td);
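	/*
	 * Take the thread (and any KSE it holds) off the queues entirely,
	 * then let setrunqueue() re-insert it at the new priority and
	 * redo the KSE assignment.
	 */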
	if (ke) {
		if (kg->kg_last_assigned == td) {
			kg->kg_last_assigned =
			    TAILQ_PREV(td, threadqueue, td_runq);
		}
		sched_rem(td);
	}
	TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
	kg->kg_runnable--;
	td->td_priority = newpri;
	setrunqueue(td, SRQ_BORING);
}

void
setrunqueue(struct thread *td, int flags)
{
	struct kse *ke;
	struct ksegrp *kg;
	struct thread *td2;
	struct thread *tda;
	int count;

	CTR4(KTR_RUNQ, "setrunqueue: td:%p ke:%p kg:%p pid:%d",
	    td, td->td_kse, td->td_ksegrp, td->td_proc->p_pid);
	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
	    ("setrunqueue: bad thread state"));
	TD_SET_RUNQ(td);
	kg = td->td_ksegrp;
	if ((td->td_proc->p_flag & P_SA) == 0) {
		/*
		 * Common path optimisation: Only one of everything
		 * and the KSE is always already attached.
		 * Totally ignore the ksegrp run queue.
		 */
		sched_add(td, flags);
		return;
	}

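	/*
	 * Threaded process: try to find a KSE for this thread.  Prefer an
	 * idle KSE; failing that, commandeer the one held by the
	 * lowest-priority assigned thread if we outrank it.
	 */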
	tda = kg->kg_last_assigned;
	if ((ke = td->td_kse) == NULL) {
		if (kg->kg_idle_kses) {
			/*
			 * There is a free one so it's ours for the asking.
			 */
			ke = TAILQ_FIRST(&kg->kg_iq);
			CTR2(KTR_RUNQ, "setrunqueue: kg:%p: Use free ke:%p",
			    kg, ke);
			TAILQ_REMOVE(&kg->kg_iq, ke, ke_kgrlist);
			ke->ke_state = KES_THREAD;
			kg->kg_idle_kses--;
		} else if (tda && (tda->td_priority > td->td_priority)) {
			/*
			 * None free, but there is one we can commandeer.
			 */
			ke = tda->td_kse;
			CTR3(KTR_RUNQ,
			    "setrunqueue: kg:%p: take ke:%p from td: %p",
			    kg, ke, tda);
			sched_rem(tda);
			tda->td_kse = NULL;
			ke->ke_thread = NULL;
			tda = kg->kg_last_assigned =
			    TAILQ_PREV(tda, threadqueue, td_runq);
		}
	} else {
		/*
		 * Temporarily disassociate so it looks like the other cases.
		 */
		ke->ke_thread = NULL;
		td->td_kse = NULL;
	}

	/*
	 * Add the thread to the ksegrp's run queue at
	 * the appropriate place.
	 */
	count = 0;
	TAILQ_FOREACH(td2, &kg->kg_runq, td_runq) {
		if (td2->td_priority > td->td_priority) {
			kg->kg_runnable++;
			TAILQ_INSERT_BEFORE(td2, td, td_runq);
			break;
		}
		/* XXX Debugging hack */
		if (++count > 10000) {
			printf("setrunqueue(): corrupt kg_runq, td=%p\n", td);
			panic("deadlock in setrunqueue");
		}
	}
	if (td2 == NULL) {
		/* We ran off the end of the TAILQ or it was empty. */
		kg->kg_runnable++;
		TAILQ_INSERT_TAIL(&kg->kg_runq, td, td_runq);
	}

	/*
	 * If we have a KSE to use, put it on the system run queue and,
	 * if needed, readjust the last_assigned pointer so that the
	 * assigned KSEs still track the highest-priority queued threads.
	 */
	if (ke) {
		if (tda == NULL) {
			/*
			 * No pre-existing last assigned, so whoever is first
			 * gets the KSE we brought in (maybe us).
			 */
			td2 = TAILQ_FIRST(&kg->kg_runq);
			KASSERT((td2->td_kse == NULL),
			    ("unexpected ke present"));
			td2->td_kse = ke;
			ke->ke_thread = td2;
			kg->kg_last_assigned = td2;
		} else if (tda->td_priority > td->td_priority) {
			/*
			 * It's ours, grab it, but last_assigned is past us
			 * so don't change it.
			 */
			td->td_kse = ke;
			ke->ke_thread = td;
		} else {
			/*
			 * We are past last_assigned, so
			 * put the new kse on whatever is next,
			 * which may or may not be us.
			 */
			td2 = TAILQ_NEXT(tda, td_runq);
			kg->kg_last_assigned = td2;
			td2->td_kse = ke;
			ke->ke_thread = td2;
		}
		sched_add(ke->ke_thread, flags);
	} else {
		CTR3(KTR_RUNQ, "setrunqueue: held: td%p kg%p pid%d",
		    td, td->td_ksegrp, td->td_proc->p_pid);
	}
}

/*
 * Kernel thread preemption implementation.  Critical sections mark
 * regions of code in which preemptions are not allowed.
 */
void
critical_enter(void)
{
	struct thread *td;

	td = curthread;
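	/*
	 * Only the outermost entry needs the machine-dependent hook;
	 * deeper nesting is tracked purely by td_critnest.
	 */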
	if (td->td_critnest == 0)
		cpu_critical_enter(td);
	td->td_critnest++;
}

void
critical_exit(void)
{
	struct thread *td;

	td = curthread;
	KASSERT(td->td_critnest != 0,
	    ("critical_exit: td_critnest == 0"));
	if (td->td_critnest == 1) {
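		/*
		 * Leaving the outermost critical section: perform any
		 * preemption that was deferred (TDP_OWEPREEMPT) while we
		 * were inside it.
		 */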
#ifdef PREEMPTION
		mtx_assert(&sched_lock, MA_NOTOWNED);
		if (td->td_pflags & TDP_OWEPREEMPT) {
			mtx_lock_spin(&sched_lock);
			mi_switch(SW_INVOL, NULL);
			mtx_unlock_spin(&sched_lock);
		}
#endif
		td->td_critnest = 0;
		cpu_critical_exit(td);
	} else {
		td->td_critnest--;
	}
}

/*
 * This function is called when a thread is about to be put on a run queue
 * because it has been made runnable or its priority has been adjusted.  It
 * determines whether the current thread should immediately be preempted in
 * favor of the new thread.  If so, it switches to it and eventually returns
 * true.  If not, it returns false so that the caller may place the thread
 * on an appropriate run queue.
 */
int
maybe_preempt(struct thread *td)
{
#ifdef PREEMPTION
	struct thread *ctd;
	int cpri, pri;
#endif

	mtx_assert(&sched_lock, MA_OWNED);
#ifdef PREEMPTION
	/*
	 * The new thread should not preempt the current thread if any of the
	 * following conditions are true:
	 *
	 *  - The current thread has a higher (numerically lower) or
	 *    equivalent priority.  Note that this prevents curthread from
	 *    trying to preempt to itself.
	 *  - It is too early in the boot for context switches (cold is set).
	 *  - The current thread has an inhibitor set or is in the process of
	 *    exiting.  In this case, the current thread is about to switch
	 *    out anyway, so there's no point in preempting.  If we did,
	 *    the current thread would not be properly resumed as well, so
	 *    just avoid that whole landmine.
	 *  - The new thread's priority is not a realtime priority, the
	 *    current thread's priority is not an idle priority, and
	 *    FULL_PREEMPTION is disabled.
	 *
	 * If all of these conditions are false, but the current thread is in
	 * a nested critical section, then we have to defer the preemption
	 * until we exit the critical section.  Otherwise, switch immediately
	 * to the new thread.
	 */
	ctd = curthread;
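	/*
	 * Don't preempt if the current thread's KSE linkage is in flux
	 * (no KSE, or a KSE that no longer points back at this thread).
	 */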
	if (ctd->td_kse == NULL || ctd->td_kse->ke_thread != ctd)
		return (0);
	pri = td->td_priority;
	cpri = ctd->td_priority;
	if (pri >= cpri || cold /* || dumping */ || TD_IS_INHIBITED(ctd) ||
	    td->td_kse->ke_state != KES_THREAD)
		return (0);
#ifndef FULL_PREEMPTION
	if (!(pri >= PRI_MIN_ITHD && pri <= PRI_MAX_ITHD) &&
	    !(cpri >= PRI_MIN_IDLE))
		return (0);
#endif
	if (ctd->td_critnest > 1) {
		CTR1(KTR_PROC, "maybe_preempt: in critical section %d",
		    ctd->td_critnest);
		ctd->td_pflags |= TDP_OWEPREEMPT;
		return (0);
	}

	/*
	 * Our thread state says that we are already on a run queue, so
	 * update our state as if we had been dequeued by choosethread().
	 */
	MPASS(TD_ON_RUNQ(td));
	TD_SET_RUNNING(td);
	CTR3(KTR_PROC, "preempting to thread %p (pid %d, %s)\n", td,
	    td->td_proc->p_pid, td->td_proc->p_comm);
	mi_switch(SW_INVOL, td);
	return (1);
#else
	return (0);
#endif
}

#if 0
#ifndef PREEMPTION
/* XXX: There should be a non-static version of this. */
static void
printf_caddr_t(void *data)
{
	printf("%s", (char *)data);
}
static char preempt_warning[] =
    "WARNING: Kernel preemption is disabled, expect reduced performance.\n";
SYSINIT(preempt_warning, SI_SUB_COPYRIGHT, SI_ORDER_ANY, printf_caddr_t,
    preempt_warning)
#endif
#endif

/************************************************************************
 * SYSTEM RUN QUEUE manipulations and tests				*
 ************************************************************************/
/*
 * Initialize a run structure.
 */
void
runq_init(struct runq *rq)
{
	int i;

	bzero(rq, sizeof *rq);
	for (i = 0; i < RQ_NQS; i++)
		TAILQ_INIT(&rq->rq_queues[i]);
}

/*
 * Clear the status bit of the queue corresponding to priority level pri,
 * indicating that it is empty.
 */
static __inline void
runq_clrbit(struct runq *rq, int pri)
{
	struct rqbits *rqb;

	rqb = &rq->rq_status;
	CTR4(KTR_RUNQ, "runq_clrbit: bits=%#x %#x bit=%#x word=%d",
	    rqb->rqb_bits[RQB_WORD(pri)],
	    rqb->rqb_bits[RQB_WORD(pri)] & ~RQB_BIT(pri),
	    RQB_BIT(pri), RQB_WORD(pri));
	rqb->rqb_bits[RQB_WORD(pri)] &= ~RQB_BIT(pri);
}

/*
 * Find the index of the first non-empty run queue.  This is done by
 * scanning the status bits; a set bit indicates a non-empty queue.
 */
static __inline int
runq_findbit(struct runq *rq)
{
	struct rqbits *rqb;
	int pri;
	int i;

	rqb = &rq->rq_status;
	for (i = 0; i < RQB_LEN; i++)
		if (rqb->rqb_bits[i]) {
			pri = RQB_FFS(rqb->rqb_bits[i]) + (i << RQB_L2BPW);
			CTR3(KTR_RUNQ, "runq_findbit: bits=%#x i=%d pri=%d",
			    rqb->rqb_bits[i], i, pri);
			return (pri);
		}

	return (-1);
}

/*
 * Set the status bit of the queue corresponding to priority level pri,
 * indicating that it is non-empty.
 */
static __inline void
runq_setbit(struct runq *rq, int pri)
{
	struct rqbits *rqb;

	rqb = &rq->rq_status;
	CTR4(KTR_RUNQ, "runq_setbit: bits=%#x %#x bit=%#x word=%d",
	    rqb->rqb_bits[RQB_WORD(pri)],
	    rqb->rqb_bits[RQB_WORD(pri)] | RQB_BIT(pri),
	    RQB_BIT(pri), RQB_WORD(pri));
	rqb->rqb_bits[RQB_WORD(pri)] |= RQB_BIT(pri);
}

/*
 * Add the KSE to the queue specified by its priority, and set the
 * corresponding status bit.
 */
void
runq_add(struct runq *rq, struct kse *ke)
{
	struct rqhead *rqh;
	int pri;

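	/*
	 * Priorities are bucketed RQ_PPQ levels per queue, so the queue
	 * index is the thread priority divided by RQ_PPQ.
	 */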
	pri = ke->ke_thread->td_priority / RQ_PPQ;
	ke->ke_rqindex = pri;
	runq_setbit(rq, pri);
	rqh = &rq->rq_queues[pri];
	CTR5(KTR_RUNQ, "runq_add: td=%p ke=%p pri=%d %d rqh=%p",
	    ke->ke_thread, ke, ke->ke_thread->td_priority, pri, rqh);
	TAILQ_INSERT_TAIL(rqh, ke, ke_procq);
}

/*
 * Return true if there are runnable processes of any priority on the run
 * queue, false otherwise.  Has no side effects and does not modify the run
 * queue structure.
 */
int
runq_check(struct runq *rq)
{
	struct rqbits *rqb;
	int i;

	rqb = &rq->rq_status;
	for (i = 0; i < RQB_LEN; i++)
		if (rqb->rqb_bits[i]) {
			CTR2(KTR_RUNQ, "runq_check: bits=%#x i=%d",
			    rqb->rqb_bits[i], i);
			return (1);
		}
	CTR0(KTR_RUNQ, "runq_check: empty");

	return (0);
}

#if defined(SMP) && defined(SCHED_4BSD)
int runq_fuzz = 1;
SYSCTL_DECL(_kern_sched);
SYSCTL_INT(_kern_sched, OID_AUTO, runq_fuzz, CTLFLAG_RW, &runq_fuzz, 0, "");
#endif

/*
 * Find the highest priority process on the run queue.
 */
struct kse *
runq_choose(struct runq *rq)
{
	struct rqhead *rqh;
	struct kse *ke;
	int pri;

	mtx_assert(&sched_lock, MA_OWNED);
	while ((pri = runq_findbit(rq)) != -1) {
		rqh = &rq->rq_queues[pri];
#if defined(SMP) && defined(SCHED_4BSD)
		/* fuzz == 1 is normal; 0 or less is ignored */
		if (runq_fuzz > 1) {
			/*
			 * In the first couple of entries, check if
			 * there is one for our CPU as a preference.
			 */
			int count = runq_fuzz;
			int cpu = PCPU_GET(cpuid);
			struct kse *ke2;
			ke2 = ke = TAILQ_FIRST(rqh);

			while (count-- && ke2) {
				/* Prefer an entry whose last CPU was ours. */
				if (ke2->ke_thread->td_lastcpu == cpu) {
					ke = ke2;
					break;
				}
				ke2 = TAILQ_NEXT(ke2, ke_procq);
			}
		} else
#endif
			ke = TAILQ_FIRST(rqh);
		KASSERT(ke != NULL, ("runq_choose: no proc on busy queue"));
		CTR3(KTR_RUNQ,
		    "runq_choose: pri=%d kse=%p rqh=%p", pri, ke, rqh);
		return (ke);
	}
	CTR1(KTR_RUNQ, "runq_choose: idleproc pri=%d", pri);

	return (NULL);
}

/*
 * Remove the KSE from the queue specified by its priority, and clear the
 * corresponding status bit if the queue becomes empty.
 * Caller must set ke->ke_state afterwards.
 */
void
runq_remove(struct runq *rq, struct kse *ke)
{
	struct rqhead *rqh;
	int pri;

	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
		("runq_remove: process swapped out"));
	pri = ke->ke_rqindex;
	rqh = &rq->rq_queues[pri];
	CTR5(KTR_RUNQ, "runq_remove: td=%p, ke=%p pri=%d %d rqh=%p",
	    ke->ke_thread, ke, ke->ke_thread->td_priority, pri, rqh);
	KASSERT(ke != NULL, ("runq_remove: no proc on busy queue"));
	TAILQ_REMOVE(rqh, ke, ke_procq);
	if (TAILQ_EMPTY(rqh)) {
		CTR0(KTR_RUNQ, "runq_remove: empty");
		runq_clrbit(rq, pri);
	}
}