kern_switch.c revision 143757
1/*-
2 * Copyright (c) 2001 Jake Burkholder <jake@FreeBSD.org>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27/***
28Here is the logic:
29
30If there are N processors, then there are at most N KSEs (kernel
31schedulable entities) working to process threads that belong to a
32KSEGROUP (kg). If there are X of these KSEs actually running at the
33moment in question, then there are at most M (= N - X) of these KSEs on
34the run queue, as running KSEs are not on the queue.
35
36Runnable threads are queued off the KSEGROUP in priority order.
37If there are M or more threads runnable, the top M threads
38(by priority) are 'preassigned' to the M KSEs not running. The KSEs take
39their priority from those threads and are put on the run queue.
40
41The last thread that had a priority high enough to have a KSE associated
42with it, AND IS ON THE RUN QUEUE, is pointed to by
43kg->kg_last_assigned. If no threads queued off the KSEGROUP have KSEs
44assigned, either because all the available KSEs are actively running or
45because there are no threads queued, that pointer is NULL.
46
47When a KSE is removed from the run queue to be run, we know it was
48associated with the highest priority thread in the queue (at the head
49of the queue). If it is also the last assigned, we know M was 1 and must
50now be 0. Since the thread is no longer queued, the pointer must no
51longer refer to it. We know there were no more KSEs available
52(M was 1 and is now 0), and since we are not FREEING our KSE
53but using it, there are STILL no more KSEs available, so the next
54thread in the ksegrp list will not have a KSE to assign to it,
55and the pointer must be made 'invalid' (NULL).
56
57The pointer exists so that when a new thread is made runnable, its
58priority can be compared with that of the last assigned thread to see
59whether it should 'steal' that thread's KSE, i.e. whether it sorts
60earlier on the list than that thread or later. If it sorts earlier, the KSE is
61taken from the last assigned thread (which is then no longer assigned a KSE)
62and given to the new thread, which is placed earlier in the list.
63The pointer is then backed up to the previous thread (which may or may not
64be the new thread). A sketch of this 'steal' test follows this comment.
65
66When a thread sleeps or is removed, the KSE becomes available and, if there
67are queued threads that are not assigned KSEs, the highest priority one of
68them is assigned the KSE, which is then placed back on the run queue at
69the appropriate place, and the kg->kg_last_assigned pointer is adjusted down
70to point to it.
71
72The following diagram shows 2 KSEs and 3 threads from a single process.
73
74 RUNQ: --->KSE---KSE--...    (KSEs queued at priorities from threads)
75              \    \____
76               \        \
77    KSEGROUP---thread--thread--thread    (queued in priority order)
78        \                 /
79         \_______________/
80          (last_assigned)
81
82The result of this scheme is that the M available KSEs are always
83queued at the priorities they have inherited from the M highest priority
84threads for that KSEGROUP. If this situation changes, the KSEs are
85reassigned to keep this true.
86***/
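
/*
 * Illustrative sketch of the 'steal' test described above.  This block is
 * not compiled; it restates, in code form, the core of what setrunqueue()
 * below does when a newly runnable thread 'td' arrives and the ksegrp has
 * no free slots.
 */
#if 0
	tda = kg->kg_last_assigned;
	while (kg->kg_avail_opennings <= 0 &&
	    tda != NULL && tda->td_priority > td->td_priority) {
		/* The new thread sorts earlier, so tda loses its slot... */
		sched_rem(tda);
		/* ...and the last_assigned pointer is backed up one thread. */
		tda = kg->kg_last_assigned =
		    TAILQ_PREV(tda, threadqueue, td_runq);
		/* td is then queued ahead of tda and takes the freed slot. */
	}
#endif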
87
88#include <sys/cdefs.h>
89__FBSDID("$FreeBSD: head/sys/kern/kern_switch.c 143757 2005-03-17 15:18:01Z rwatson $");
90
91#include "opt_sched.h"
92
93#ifndef KERN_SWITCH_INCLUDE
94#include <sys/param.h>
95#include <sys/systm.h>
96#include <sys/kdb.h>
97#include <sys/kernel.h>
98#include <sys/ktr.h>
99#include <sys/lock.h>
100#include <sys/mutex.h>
101#include <sys/proc.h>
102#include <sys/queue.h>
103#include <sys/sched.h>
104#else  /* KERN_SWITCH_INCLUDE */
105#if defined(SMP) && (defined(__i386__) || defined(__amd64__))
106#include <sys/smp.h>
107#endif
108#include <machine/critical.h>
109#if defined(SMP) && defined(SCHED_4BSD)
110#include <sys/sysctl.h>
111#endif
112
113#ifdef FULL_PREEMPTION
114#ifndef PREEMPTION
115#error "The FULL_PREEMPTION option requires the PREEMPTION option"
116#endif
117#endif
118
119CTASSERT((RQB_BPW * RQB_LEN) == RQ_NQS);
120
121#define td_kse td_sched
122
123/************************************************************************
124 * Functions that manipulate runnability from a thread perspective.	*
125 ************************************************************************/
126/*
127 * Select the KSE that will be run next.  From that find the thread, and
128 * remove it from the KSEGRP's run queue.  If there is thread clustering,
129 * this will be what does it.
130 */
131struct thread *
132choosethread(void)
133{
134	struct kse *ke;
135	struct thread *td;
136	struct ksegrp *kg;
137
138#if defined(SMP) && (defined(__i386__) || defined(__amd64__))
139	if (smp_active == 0 && PCPU_GET(cpuid) != 0) {
140		/* Shutting down, run idlethread on AP's */
141		td = PCPU_GET(idlethread);
142		ke = td->td_kse;
143		CTR1(KTR_RUNQ, "choosethread: td=%p (idle)", td);
144		ke->ke_flags |= KEF_DIDRUN;
145		TD_SET_RUNNING(td);
146		return (td);
147	}
148#endif
149
150retry:
151	ke = sched_choose();
152	if (ke) {
153		td = ke->ke_thread;
154		KASSERT((td->td_kse == ke), ("kse/thread mismatch"));
155		kg = ke->ke_ksegrp;
156		if (td->td_proc->p_flag & P_HADTHREADS) {
157			if (kg->kg_last_assigned == td) {
158				kg->kg_last_assigned = TAILQ_PREV(td,
159				    threadqueue, td_runq);
160			}
161			TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
162		}
163		CTR2(KTR_RUNQ, "choosethread: td=%p pri=%d",
164		    td, td->td_priority);
165	} else {
166		/* Simulate runq_choose() having returned the idle thread */
167		td = PCPU_GET(idlethread);
168		ke = td->td_kse;
169		CTR1(KTR_RUNQ, "choosethread: td=%p (idle)", td);
170	}
171	ke->ke_flags |= KEF_DIDRUN;
172
173	/*
174	 * If we are in panic, only allow system threads,
175	 * plus the one we are running in, to be run.
176	 */
177	if (panicstr && ((td->td_proc->p_flag & P_SYSTEM) == 0 &&
178	    (td->td_flags & TDF_INPANIC) == 0)) {
179		/* note that it is no longer on the run queue */
180		TD_SET_CAN_RUN(td);
181		goto retry;
182	}
183
184	TD_SET_RUNNING(td);
185	return (td);
186}
187
188/*
189 * Given a surplus system slot, try to assign a new runnable thread to it.
190 * Called from:
191 *  sched_thread_exit()  (local)
192 *  sched_switch()  (local)
194 *  remrunqueue()  (local)  (not at the moment)
195 */
196static void
197slot_fill(struct ksegrp *kg)
198{
199	struct thread *td;
200
201	mtx_assert(&sched_lock, MA_OWNED);
202	while (kg->kg_avail_opennings > 0) {
203		/*
204		 * Find the first unassigned thread
205		 */
206		if ((td = kg->kg_last_assigned) != NULL)
207			td = TAILQ_NEXT(td, td_runq);
208		else
209			td = TAILQ_FIRST(&kg->kg_runq);
210
211		/*
212		 * If we found one, send it to the system scheduler.
213		 */
214		if (td) {
215			kg->kg_last_assigned = td;
216			sched_add(td, SRQ_YIELDING);
217			CTR2(KTR_RUNQ, "slot_fill: td%p -> kg%p", td, kg);
218		} else {
219			/* no threads to use up the slots. quit now */
220			break;
221		}
222	}
223}
224
225#ifdef	SCHED_4BSD
226/*
227 * Remove a thread from its KSEGRP's run queue.
228 * This in turn may remove it from a KSE if it was already assigned
229 * to one, possibly causing a new thread to be assigned to the KSE
230 * and the KSE getting a new priority.
231 */
232static void
233remrunqueue(struct thread *td)
234{
235	struct thread *td2, *td3;
236	struct ksegrp *kg;
237	struct kse *ke;
238
239	mtx_assert(&sched_lock, MA_OWNED);
240	KASSERT((TD_ON_RUNQ(td)), ("remrunqueue: Bad state on run queue"));
241	kg = td->td_ksegrp;
242	ke = td->td_kse;
243	CTR1(KTR_RUNQ, "remrunqueue: td%p", td);
244	TD_SET_CAN_RUN(td);
245	/*
246	 * If it is not a threaded process, take the shortcut.
247	 */
248	if ((td->td_proc->p_flag & P_HADTHREADS) == 0) {
249		/* remove from the system run queue and free up a slot */
250		sched_rem(td);
251		ke->ke_state = KES_THREAD;
252		return;
253	}
254   	td3 = TAILQ_PREV(td, threadqueue, td_runq);
255	TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
256	if (ke->ke_state == KES_ONRUNQ) {
257		/*
258		 * This thread has been assigned to the system run queue.
259		 * We need to dissociate it and try to assign the
260		 * KSE to the next available thread. Then, we should
261		 * see if we need to move the KSE in the run queues.
262		 */
263		sched_rem(td);
264		ke->ke_state = KES_THREAD;
265		td2 = kg->kg_last_assigned;
266		KASSERT((td2 != NULL), ("last assigned has wrong value"));
267		if (td2 == td)
268			kg->kg_last_assigned = td3;
269		/* slot_fill(kg); */ /* will replace it with another */
270	}
271}
272#endif
273
274/*
275 * Change the priority of a thread that is on the run queue.
276 */
277void
278adjustrunqueue(struct thread *td, int newpri)
279{
280	struct ksegrp *kg;
281	struct kse *ke;
282
283	mtx_assert(&sched_lock, MA_OWNED);
284	KASSERT((TD_ON_RUNQ(td)), ("adjustrunqueue: Bad state on run queue"));
285
286	ke = td->td_kse;
287	CTR1(KTR_RUNQ, "adjustrunqueue: td%p", td);
288	/*
289	 * If it is not a threaded process, take the shortcut.
290	 */
291	if ((td->td_proc->p_flag & P_HADTHREADS) == 0) {
292		/* We only care about the kse in the run queue. */
293		td->td_priority = newpri;
294		if (ke->ke_rqindex != (newpri / RQ_PPQ)) {
295			sched_rem(td);
296			sched_add(td, SRQ_BORING);
297		}
298		return;
299	}
300
301	/* It is a threaded process */
302	kg = td->td_ksegrp;
303	if (ke->ke_state == KES_ONRUNQ) {
304		if (kg->kg_last_assigned == td) {
305			kg->kg_last_assigned =
306			    TAILQ_PREV(td, threadqueue, td_runq);
307		}
308		sched_rem(td);
309	}
310	TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
311	TD_SET_CAN_RUN(td);
312	td->td_priority = newpri;
313	setrunqueue(td, SRQ_BORING);
314}
315
316/*
317 * This function is called when a thread is about to be put on a
318 * ksegrp run queue because it has been made runnable or its
319 * priority has been adjusted and the ksegrp does not have a
320 * free kse slot.  It determines if a thread from the same ksegrp
321 * should be preempted.  If so, it tries to switch threads when the
322 * thread to be preempted is on the current cpu, or notifies the other
323 * cpu that it should switch threads.
324 */
325
326static void
327maybe_preempt_in_ksegrp(struct thread *td)
328#if  !defined(SMP)
329{
330	struct thread *running_thread;
331
332#ifndef FULL_PREEMPTION
333	int pri;
334	pri = td->td_priority;
335	if (!(pri >= PRI_MIN_ITHD && pri <= PRI_MAX_ITHD))
336		return;
337#endif
338	mtx_assert(&sched_lock, MA_OWNED);
339	running_thread = curthread;
340
341	if (running_thread->td_ksegrp != td->td_ksegrp)
342		return;
343
344	if (td->td_priority > running_thread->td_priority)
345		return;
346#ifdef PREEMPTION
347	if (running_thread->td_critnest > 1)
348		running_thread->td_pflags |= TDP_OWEPREEMPT;
349	else
350		mi_switch(SW_INVOL, NULL);
351
352#else
353	running_thread->td_flags |= TDF_NEEDRESCHED;
354#endif
355	return;
356}
357
358#else /* SMP */
359{
360	struct thread *running_thread;
361	int worst_pri;
362	struct ksegrp *kg;
363	cpumask_t cpumask, dontuse;
364	struct pcpu *pc;
365	struct pcpu *best_pcpu;
366	struct thread *cputhread;
367
368#ifndef FULL_PREEMPTION
369	int pri;
370	pri = td->td_priority;
371	if (!(pri >= PRI_MIN_ITHD && pri <= PRI_MAX_ITHD))
372		return;
373#endif
374
375	mtx_assert(&sched_lock, MA_OWNED);
376
377	running_thread = curthread;
378
379#if !defined(KSEG_PEEMPT_BEST_CPU)
380	if (running_thread->td_ksegrp != td->td_ksegrp) {
381#endif
382		kg = td->td_ksegrp;
383
384		/* if someone is ahead of this thread, wait our turn */
385		if (td != TAILQ_FIRST(&kg->kg_runq))
386			return;
387
388		worst_pri = td->td_priority;
389		best_pcpu = NULL;
390		dontuse   = stopped_cpus | idle_cpus_mask;
391
392		/*
393	 * Find the cpu running the worst-priority (numerically highest)
394	 * thread from the same ksegrp - if several tie, prefer the cpu
395	 * this thread last ran on, and then the current cpu.
396		 */
397
398		SLIST_FOREACH(pc, &cpuhead, pc_allcpu) {
399			cpumask   = pc->pc_cpumask;
400			cputhread = pc->pc_curthread;
401
402			if ((cpumask & dontuse)  ||
403			    cputhread->td_ksegrp != kg)
404				continue;
405
406			if (cputhread->td_priority > worst_pri) {
407				worst_pri = cputhread->td_priority;
408				best_pcpu = pc;
409				continue;
410			}
411
412			if (cputhread->td_priority == worst_pri &&
413			    best_pcpu != NULL &&
414			    (td->td_lastcpu == pc->pc_cpuid ||
415				(PCPU_GET(cpumask) == cpumask &&
416				    td->td_lastcpu != best_pcpu->pc_cpuid)))
417			    best_pcpu = pc;
418		}
419
420		/* Check if we need to preempt someone */
421		if (best_pcpu == NULL)
422			return;
423
424		if (PCPU_GET(cpuid) != best_pcpu->pc_cpuid) {
425			best_pcpu->pc_curthread->td_flags |= TDF_NEEDRESCHED;
426			ipi_selected(best_pcpu->pc_cpumask, IPI_AST);
427			return;
428		}
429#if !defined(KSEG_PEEMPT_BEST_CPU)
430	}
431#endif
432
433	if (td->td_priority > running_thread->td_priority)
434		return;
435#ifdef PREEMPTION
436	if (running_thread->td_critnest > 1)
437		running_thread->td_pflags |= TDP_OWEPREEMPT;
438	else
439		mi_switch(SW_INVOL, NULL);
440
441#else
442	running_thread->td_flags |= TDF_NEEDRESCHED;
443#endif
444	return;
445}
446#endif /* !SMP */
447
448
449int limitcount;
450void
451setrunqueue(struct thread *td, int flags)
452{
453	struct ksegrp *kg;
454	struct thread *td2;
455	struct thread *tda;
456
457	CTR3(KTR_RUNQ, "setrunqueue: td:%p kg:%p pid:%d",
458	    td, td->td_ksegrp, td->td_proc->p_pid);
459	CTR5(KTR_SCHED, "setrunqueue: %p(%s) prio %d by %p(%s)",
460            td, td->td_proc->p_comm, td->td_priority, curthread,
461            curthread->td_proc->p_comm);
462	mtx_assert(&sched_lock, MA_OWNED);
463	KASSERT((td->td_inhibitors == 0),
464			("setrunqueue: trying to run inhibited thread"));
465	KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
466	    ("setrunqueue: bad thread state"));
467	TD_SET_RUNQ(td);
468	kg = td->td_ksegrp;
469	if ((td->td_proc->p_flag & P_HADTHREADS) == 0) {
470		/*
471		 * Common path optimisation: Only one of everything
472		 * and the KSE is always already attached.
473		 * Totally ignore the ksegrp run queue.
474		 */
475		if (kg->kg_avail_opennings != 1) {
476			if (limitcount < 1) {
477				limitcount++;
478				printf("pid %d: corrected slot count (%d->1)\n",
479				    td->td_proc->p_pid, kg->kg_avail_opennings);
480
481			}
482			kg->kg_avail_opennings = 1;
483		}
484		sched_add(td, flags);
485		return;
486	}
487
488	 * If the concurrency has been reduced, and we would fall in the
489	 * If the concurrency has reduced, and we would go in the
490	 * assigned section, then keep removing entries from the
491	 * system run queue, until we are not in that section
492	 * What we MUST avoid is the case where there are threads of lower
493	 * priority than the new one scheduled, but the new one can not
494	 * be scheduled itself. That would lead to a non-contiguous set
495	 * be scheduled itself. That would lead to a non contiguous set
496	 * of scheduled threads, and everything would break.
497	 */
498	tda = kg->kg_last_assigned;
499	while ((kg->kg_avail_opennings <= 0) &&
500	    (tda && (tda->td_priority > td->td_priority))) {
501		/*
502		 * None free, but there is one we can commandeer.
503		 */
504		CTR2(KTR_RUNQ,
505		    "setrunqueue: kg:%p: take slot from td: %p", kg, tda);
506		sched_rem(tda);
507		tda = kg->kg_last_assigned =
508		    TAILQ_PREV(tda, threadqueue, td_runq);
509	}
510
511	/*
512	 * Add the thread to the ksegrp's run queue at
513	 * the appropriate place.
514	 */
515	TAILQ_FOREACH(td2, &kg->kg_runq, td_runq) {
516		if (td2->td_priority > td->td_priority) {
517			TAILQ_INSERT_BEFORE(td2, td, td_runq);
518			break;
519		}
520	}
521	if (td2 == NULL) {
522		/* We ran off the end of the TAILQ or it was empty. */
523		TAILQ_INSERT_TAIL(&kg->kg_runq, td, td_runq);
524	}
525
526	/*
527	 * run queue and, if needed, readjust the last_assigned pointer.
528	 * It may be that we need to schedule something anyhow,
529	 * even if the available slots are negative, so that
530	 * all the items < last_assigned are scheduled.
531	 * all the items < last_assigned are scheduled.
532	 */
533	if (kg->kg_avail_opennings > 0) {
534		if (tda == NULL) {
535			/*
536			 * No pre-existing last assigned so whoever is first
537			 * gets the slot (maybe us).
538			 */
539			td2 = TAILQ_FIRST(&kg->kg_runq);
540			kg->kg_last_assigned = td2;
541		} else if (tda->td_priority > td->td_priority) {
542			td2 = td;
543		} else {
544			/*
545			 * We are past last_assigned, so
546			 * give the next slot to whatever is next,
547			 * which may or may not be us.
548			 */
549			td2 = TAILQ_NEXT(tda, td_runq);
550			kg->kg_last_assigned = td2;
551		}
552		sched_add(td2, flags);
553	} else {
554		CTR3(KTR_RUNQ, "setrunqueue: held: td%p kg%p pid%d",
555			td, td->td_ksegrp, td->td_proc->p_pid);
556		if ((flags & SRQ_YIELDING) == 0)
557			maybe_preempt_in_ksegrp(td);
558	}
559}
560
561/*
562 * Kernel thread preemption implementation.  Critical sections mark
563 * regions of code in which preemptions are not allowed.
564 */
565void
566critical_enter(void)
567{
568	struct thread *td;
569
570	td = curthread;
571	if (td->td_critnest == 0)
572		cpu_critical_enter(td);
573	td->td_critnest++;
574	CTR4(KTR_CRITICAL, "critical_enter by thread %p (%ld, %s) to %d", td,
575	    (long)td->td_proc->p_pid, td->td_proc->p_comm, td->td_critnest);
576}
577
578void
579critical_exit(void)
580{
581	struct thread *td;
582
583	td = curthread;
584	KASSERT(td->td_critnest != 0,
585	    ("critical_exit: td_critnest == 0"));
586	if (td->td_critnest == 1) {
587		if (td->td_pflags & TDP_WAKEPROC0) {
588			td->td_pflags &= ~TDP_WAKEPROC0;
589			wakeup(&proc0);
590		}
591#ifdef PREEMPTION
592		mtx_assert(&sched_lock, MA_NOTOWNED);
593		if (td->td_pflags & TDP_OWEPREEMPT) {
594			mtx_lock_spin(&sched_lock);
595			mi_switch(SW_INVOL, NULL);
596			mtx_unlock_spin(&sched_lock);
597		}
598#endif
599		td->td_critnest = 0;
600		cpu_critical_exit(td);
601	} else {
602		td->td_critnest--;
603	}
604	CTR4(KTR_CRITICAL, "critical_exit by thread %p (%ld, %s) to %d", td,
605	    (long)td->td_proc->p_pid, td->td_proc->p_comm, td->td_critnest);
606}
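
/*
 * Illustrative usage sketch (not compiled; the function name is
 * hypothetical).  Critical sections nest: only the outermost
 * critical_exit() re-enables preemption and performs any context switch
 * that was deferred (TDP_OWEPREEMPT) while the section was held.
 */
#if 0
static void
example_critical_nesting(void)
{
	critical_enter();	/* td_critnest 0 -> 1 */
	critical_enter();	/* nested: 1 -> 2 */
	/* ... work that must not be preempted ... */
	critical_exit();	/* 2 -> 1, still inside a critical section */
	critical_exit();	/* 1 -> 0, any deferred switch happens here */
}
#endif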
607
608/*
609 * This function is called when a thread is about to be put on a run queue
610 * because it has been made runnable or its priority has been adjusted.  It
611 * determines if the new thread should be switched to immediately.  If so,
612 * it switches to it and eventually returns true.  If not, it returns false
613 * so that the caller may place the thread on an appropriate run queue.
614 */
615int
616maybe_preempt(struct thread *td)
617{
618#ifdef PREEMPTION
619	struct thread *ctd;
620	int cpri, pri;
621#endif
622
623	mtx_assert(&sched_lock, MA_OWNED);
624#ifdef PREEMPTION
625	/*
626	 * The new thread should not preempt the current thread if any of the
627	 * following conditions are true:
628	 *
629	 *  - The kernel is in the throes of crashing (panicstr).
630	 *  - The current thread has a higher (numerically lower) or
631	 *    equivalent priority.  Note that this prevents curthread from
632	 *    trying to preempt to itself.
633	 *  - It is too early in the boot for context switches (cold is set).
634	 *  - The current thread has an inhibitor set or is in the process of
635	 *    exiting.  In this case, the current thread is about to switch
636	 *    out anyways, so there's no point in preempting.  If we did,
637	 *    the current thread would not be properly resumed as well, so
638	 *    just avoid that whole landmine.
639	 *  - If the new thread's priority is not a realtime priority and
640	 *    the current thread's priority is not an idle priority and
641	 *    FULL_PREEMPTION is disabled.
642	 *
643	 * If all of these conditions are false, but the current thread is in
644	 * a nested critical section, then we have to defer the preemption
645	 * until we exit the critical section.  Otherwise, switch immediately
646	 * to the new thread.
647	 */
648	ctd = curthread;
649	KASSERT ((ctd->td_kse != NULL && ctd->td_kse->ke_thread == ctd),
650	  ("thread has no (or wrong) sched-private part."));
651	KASSERT((td->td_inhibitors == 0),
652			("maybe_preempt: trying to run inhibited thread"));
653	pri = td->td_priority;
654	cpri = ctd->td_priority;
655	if (panicstr != NULL || pri >= cpri || cold /* || dumping */ ||
656	    TD_IS_INHIBITED(ctd) || td->td_kse->ke_state != KES_THREAD)
657		return (0);
658#ifndef FULL_PREEMPTION
659	if (!(pri >= PRI_MIN_ITHD && pri <= PRI_MAX_ITHD) &&
660	    !(cpri >= PRI_MIN_IDLE))
661		return (0);
662#endif
663	if (ctd->td_critnest > 1) {
664		CTR1(KTR_PROC, "maybe_preempt: in critical section %d",
665		    ctd->td_critnest);
666		ctd->td_pflags |= TDP_OWEPREEMPT;
667		return (0);
668	}
669
670	/*
671	 * Thread is runnable but not yet put on system run queue.
672	 */
673	MPASS(TD_ON_RUNQ(td));
674	MPASS(td->td_sched->ke_state != KES_ONRUNQ);
675	if (td->td_proc->p_flag & P_HADTHREADS) {
676		/*
677		 * If this is a threaded process we actually ARE on the
678		 * ksegrp run queue so take it off that first.
679		 * Also undo any damage done to the last_assigned pointer.
680		 * XXX Fix setrunqueue so this isn't needed
681		 */
682		struct ksegrp *kg;
683
684		kg = td->td_ksegrp;
685		if (kg->kg_last_assigned == td)
686			kg->kg_last_assigned =
687			    TAILQ_PREV(td, threadqueue, td_runq);
688		TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
689	}
690
691	TD_SET_RUNNING(td);
692	CTR3(KTR_PROC, "preempting to thread %p (pid %d, %s)\n", td,
693	    td->td_proc->p_pid, td->td_proc->p_comm);
694	mi_switch(SW_INVOL|SW_PREEMPT, td);
695	return (1);
696#else
697	return (0);
698#endif
699}
700
701#if 0
702#ifndef PREEMPTION
703/* XXX: There should be a non-static version of this. */
704static void
705printf_caddr_t(void *data)
706{
707	printf("%s", (char *)data);
708}
709static char preempt_warning[] =
710    "WARNING: Kernel preemption is disabled, expect reduced performance.\n";
711SYSINIT(preempt_warning, SI_SUB_COPYRIGHT, SI_ORDER_ANY, printf_caddr_t,
712    preempt_warning)
713#endif
714#endif
715
716/************************************************************************
717 * SYSTEM RUN QUEUE manipulations and tests				*
718 ************************************************************************/
719/*
720 * Initialize a run structure.
721 */
722void
723runq_init(struct runq *rq)
724{
725	int i;
726
727	bzero(rq, sizeof *rq);
728	for (i = 0; i < RQ_NQS; i++)
729		TAILQ_INIT(&rq->rq_queues[i]);
730}
731
732/*
733 * Clear the status bit of the queue corresponding to priority level pri,
734 * indicating that it is empty.
735 */
736static __inline void
737runq_clrbit(struct runq *rq, int pri)
738{
739	struct rqbits *rqb;
740
741	rqb = &rq->rq_status;
742	CTR4(KTR_RUNQ, "runq_clrbit: bits=%#x %#x bit=%#x word=%d",
743	    rqb->rqb_bits[RQB_WORD(pri)],
744	    rqb->rqb_bits[RQB_WORD(pri)] & ~RQB_BIT(pri),
745	    RQB_BIT(pri), RQB_WORD(pri));
746	rqb->rqb_bits[RQB_WORD(pri)] &= ~RQB_BIT(pri);
747}
748
749/*
750 * Find the index of the first non-empty run queue.  This is done by
751 * scanning the status bits; a set bit indicates a non-empty queue.
752 */
753static __inline int
754runq_findbit(struct runq *rq)
755{
756	struct rqbits *rqb;
757	int pri;
758	int i;
759
760	rqb = &rq->rq_status;
761	for (i = 0; i < RQB_LEN; i++)
762		if (rqb->rqb_bits[i]) {
763			pri = RQB_FFS(rqb->rqb_bits[i]) + (i << RQB_L2BPW);
764			CTR3(KTR_RUNQ, "runq_findbit: bits=%#x i=%d pri=%d",
765			    rqb->rqb_bits[i], i, pri);
766			return (pri);
767		}
768
769	return (-1);
770}
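
/*
 * Worked example of the two-level bitmap (illustrative; assumes 32-bit
 * status words, i.e. RQB_BPW == 32 and RQB_L2BPW == 5).  Queue index 37
 * lives in rqb_bits[RQB_WORD(37)] == rqb_bits[1] at bit (37 & 31) == 5,
 * so RQB_BIT(37) == 0x20.  If no lower-numbered queue is occupied,
 * runq_findbit() recovers the index as RQB_FFS(rqb_bits[1]) +
 * (1 << RQB_L2BPW) == 5 + 32 == 37.
 */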
771
772/*
773 * Set the status bit of the queue corresponding to priority level pri,
774 * indicating that it is non-empty.
775 */
776static __inline void
777runq_setbit(struct runq *rq, int pri)
778{
779	struct rqbits *rqb;
780
781	rqb = &rq->rq_status;
782	CTR4(KTR_RUNQ, "runq_setbit: bits=%#x %#x bit=%#x word=%d",
783	    rqb->rqb_bits[RQB_WORD(pri)],
784	    rqb->rqb_bits[RQB_WORD(pri)] | RQB_BIT(pri),
785	    RQB_BIT(pri), RQB_WORD(pri));
786	rqb->rqb_bits[RQB_WORD(pri)] |= RQB_BIT(pri);
787}
788
789/*
790 * Add the KSE to the queue specified by its priority, and set the
791 * corresponding status bit.
792 */
793void
794runq_add(struct runq *rq, struct kse *ke, int flags)
795{
796	struct rqhead *rqh;
797	int pri;
798
799	pri = ke->ke_thread->td_priority / RQ_PPQ;
800	ke->ke_rqindex = pri;
801	runq_setbit(rq, pri);
802	rqh = &rq->rq_queues[pri];
803	CTR5(KTR_RUNQ, "runq_add: td=%p ke=%p pri=%d %d rqh=%p",
804	    ke->ke_thread, ke, ke->ke_thread->td_priority, pri, rqh);
805	if (flags & SRQ_PREEMPTED) {
806		TAILQ_INSERT_HEAD(rqh, ke, ke_procq);
807	} else {
808		TAILQ_INSERT_TAIL(rqh, ke, ke_procq);
809	}
810}
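
/*
 * Worked example (illustrative; RQ_PPQ is assumed to be 4, i.e. four
 * priority levels share each queue).  A KSE whose thread is at
 * td_priority 130 goes on rq_queues[130 / 4] == rq_queues[32], and the
 * corresponding status bit is set via runq_setbit(rq, 32).
 */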
811
812/*
813 * Return true if there are runnable processes of any priority on the run
814 * queue, false otherwise.  Has no side effects and does not modify the run
815 * queue structure.
816 */
817int
818runq_check(struct runq *rq)
819{
820	struct rqbits *rqb;
821	int i;
822
823	rqb = &rq->rq_status;
824	for (i = 0; i < RQB_LEN; i++)
825		if (rqb->rqb_bits[i]) {
826			CTR2(KTR_RUNQ, "runq_check: bits=%#x i=%d",
827			    rqb->rqb_bits[i], i);
828			return (1);
829		}
830	CTR0(KTR_RUNQ, "runq_check: empty");
831
832	return (0);
833}
834
835#if defined(SMP) && defined(SCHED_4BSD)
836int runq_fuzz = 1;
837SYSCTL_INT(_kern_sched, OID_AUTO, runq_fuzz, CTLFLAG_RW, &runq_fuzz, 0, "");
838#endif
839
840/*
841 * Find the highest priority process on the run queue.
842 */
843struct kse *
844runq_choose(struct runq *rq)
845{
846	struct rqhead *rqh;
847	struct kse *ke;
848	int pri;
849
850	mtx_assert(&sched_lock, MA_OWNED);
851	while ((pri = runq_findbit(rq)) != -1) {
852		rqh = &rq->rq_queues[pri];
853#if defined(SMP) && defined(SCHED_4BSD)
854		/* fuzz == 1 is normal; 0 or less are ignored */
855		if (runq_fuzz > 1) {
856			/*
857			 * In the first couple of entries, check if
858			 * there is one for our CPU as a preference.
859			 */
860			int count = runq_fuzz;
861			int cpu = PCPU_GET(cpuid);
862			struct kse *ke2;
863			ke2 = ke = TAILQ_FIRST(rqh);
864
865			while (count-- && ke2) {
866				if (ke2->ke_thread->td_lastcpu == cpu) {
867					ke = ke2;
868					break;
869				}
870				ke2 = TAILQ_NEXT(ke2, ke_procq);
871			}
872		} else
873#endif
874			ke = TAILQ_FIRST(rqh);
875		KASSERT(ke != NULL, ("runq_choose: no proc on busy queue"));
876		CTR3(KTR_RUNQ,
877		    "runq_choose: pri=%d kse=%p rqh=%p", pri, ke, rqh);
878		return (ke);
879	}
880	CTR1(KTR_RUNQ, "runq_choose: idleproc pri=%d", pri);
881
882	return (NULL);
883}
884
885/*
886 * Remove the KSE from the queue specified by its priority, and clear the
887 * corresponding status bit if the queue becomes empty.
888 * Caller must set ke->ke_state afterwards.
889 */
890void
891runq_remove(struct runq *rq, struct kse *ke)
892{
893	struct rqhead *rqh;
894	int pri;
895
896	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
897		("runq_remove: process swapped out"));
898	pri = ke->ke_rqindex;
899	rqh = &rq->rq_queues[pri];
900	CTR5(KTR_RUNQ, "runq_remove: td=%p, ke=%p pri=%d %d rqh=%p",
901	    ke->ke_thread, ke, ke->ke_thread->td_priority, pri, rqh);
902	KASSERT(ke != NULL, ("runq_remove: no proc on busy queue"));
903	TAILQ_REMOVE(rqh, ke, ke_procq);
904	if (TAILQ_EMPTY(rqh)) {
905		CTR0(KTR_RUNQ, "runq_remove: empty");
906		runq_clrbit(rq, pri);
907	}
908}
909
910/****** functions that are temporarily here ***********/
911#include <vm/uma.h>
912extern struct mtx kse_zombie_lock;
913
914/*
915 *  Allocate scheduler specific per-process resources.
916 * The thread and ksegrp have already been linked in.
917 * In this case just set the default concurrency value.
918 *
919 * Called from:
920 *  proc_init() (UMA init method)
921 */
922void
923sched_newproc(struct proc *p, struct ksegrp *kg, struct thread *td)
924{
925
926	/* This can go in sched_fork */
927	sched_init_concurrency(kg);
928}
929
930/*
931 * A thread is being either created or recycled.
932 * Fix up the per-scheduler resources associated with it.
933 * Called from:
934 *  sched_fork_thread()
935 *  thread_dtor()  (*may go away)
936 *  thread_init()  (*may go away)
937 */
938void
939sched_newthread(struct thread *td)
940{
941	struct td_sched *ke;
942
943	ke = (struct td_sched *) (td + 1);
944	bzero(ke, sizeof(*ke));
945	td->td_sched     = ke;
946	ke->ke_thread	= td;
947	ke->ke_state	= KES_THREAD;
948}
949
950/*
951 * Set up an initial concurrency of 1
952 * and set the given thread (if given) to be using that
953 * concurrency slot.
954 * May be used "offline", i.e. before the ksegrp is attached to the world,
955 * in which case schedlock is not needed.
956 * Called from:
957 *  thr_create()
958 *  proc_init() (UMA) via sched_newproc()
959 */
960void
961sched_init_concurrency(struct ksegrp *kg)
962{
963
964	CTR1(KTR_RUNQ,"kg %p init slots and concurrency to 1", kg);
965	kg->kg_concurrency = 1;
966	kg->kg_avail_opennings = 1;
967}
968
969/*
970 * Change the concurrency of an existing ksegrp to N
971 * Called from:
972 *  kse_create()
973 *  kse_exit()
974 *  thread_exit()
975 *  thread_single()
976 */
977void
978sched_set_concurrency(struct ksegrp *kg, int concurrency)
979{
980
981	CTR4(KTR_RUNQ,"kg %p set concurrency to %d, slots %d -> %d",
982	    kg,
983	    concurrency,
984	    kg->kg_avail_opennings,
985	    kg->kg_avail_opennings + (concurrency - kg->kg_concurrency));
986	kg->kg_avail_opennings += (concurrency - kg->kg_concurrency);
987	kg->kg_concurrency = concurrency;
988}
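
/*
 * Worked example (illustrative): with kg_concurrency == 4 and
 * kg_avail_opennings == 1 (three slots in use), sched_set_concurrency(kg, 2)
 * leaves kg_avail_opennings at 1 + (2 - 4) == -1.  A negative value means
 * no new slot opens until enough running threads release theirs; see the
 * note in setrunqueue() about scheduling with negative available slots.
 */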
989
990/*
991 * Called from thread_exit() for all exiting threads.
992 *
993 * Not to be confused with sched_exit_thread(), which is only called
994 * from thread_exit() for threads exiting without the rest of the process
995 * exiting, because it is also called from sched_exit() and we wouldn't
996 * want to call it twice.
997 * XXX This can probably be fixed.
998 */
999void
1000sched_thread_exit(struct thread *td)
1001{
1002
1003	SLOT_RELEASE(td->td_ksegrp);
1004	slot_fill(td->td_ksegrp);
1005}
1006
1007#endif /* KERN_SWITCH_INCLUDE */
1008