kern_switch.c revision 139804
1/*-
2 * Copyright (c) 2001 Jake Burkholder <jake@FreeBSD.org>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27/***
28Here is the logic..
29
30If there are N processors, then there are at most N KSEs (kernel
31schedulable entities) working to process threads that belong to a
32KSEGROUP (kg). If there are X of these KSEs actually running at the
33moment in question, then there are at most M = (N - X) of these KSEs on
34the run queue, as running KSEs are not on the queue.
35
36Runnable threads are queued off the KSEGROUP in priority order.
37If there are M or more threads runnable, the top M threads
38(by priority) are 'preassigned' to the M KSEs not running. The KSEs take
39their priority from those threads and are put on the run queue.
40
41The last thread that had a priority high enough to have a KSE associated
42with it, AND IS ON THE RUN QUEUE is pointed to by
43kg->kg_last_assigned. If no threads queued off the KSEGROUP have KSEs
44assigned, either because all the available KSEs are actively running or
45because there are no threads queued, that pointer is NULL.
46
47When a KSE is removed from the run queue to be run, we know it was
48associated with the highest priority thread in the queue (at the head
49of the queue). If it is also the last assigned we know M was 1 and must
50now be 0. Since the thread is no longer queued, that pointer must no
51longer refer to it. Since we know there were no more KSEs available
52(M was 1 and is now 0), and since we are not FREEING our KSE
53but using it, we know there are STILL no more KSEs available, so
54the next thread in the ksegrp list will not have a KSE to assign to
55it, and the pointer must therefore be made 'invalid' (NULL).
56
57The pointer exists so that when a new thread is made runnable, it can
58have its priority compared with the last assigned thread to see if
59it should 'steal' its KSE or not.. i.e. is it 'earlier'
60on the list than that thread or later.. If it's earlier, then the KSE is
61removed from the last assigned (which is now not assigned a KSE)
62and reassigned to the new thread, which is placed earlier in the list.
63The pointer is then backed up to the previous thread (which may or may not
64be the new thread).
65
66When a thread sleeps or is removed, the KSE becomes available and if there
67are queued threads that are not assigned KSEs, the highest priority one of
68them is assigned the KSE, which is then placed back on the run queue at
69the appropriate place, and the kg->kg_last_assigned pointer is adjusted down
70to point to it.
71
72The following diagram shows 2 KSEs and 3 threads from a single process.
73
74 RUNQ: --->KSE---KSE--...    (KSEs queued at priorities from threads)
75              \    \____
76               \        \
77    KSEGROUP---thread--thread--thread    (queued in priority order)
78        \                 /
79         \_______________/
80          (last_assigned)
81
82The result of this scheme is that the M available KSEs are always
83queued at the priorities they have inherited from the M highest priority
84threads for that KSEGROUP. If this situation changes, the KSEs are
85reassigned to keep this true.
86***/
87
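/*
 * The sketch below is a minimal userland model of the queueing rules
 * described above, using only the priority-ordered ksegrp queue, the
 * last_assigned pointer and a free-slot count.  All names here are
 * invented for illustration; the slot-stealing and preemption paths of
 * the real setrunqueue() are deliberately omitted.
 */
#if 0	/* illustrative model only, never compiled into the kernel */
#include <sys/queue.h>

struct xthread {				/* stand-in for struct thread */
	int			priority;	/* lower value == better */
	TAILQ_ENTRY(xthread)	runq;
};
TAILQ_HEAD(xthreadq, xthread);

struct xksegrp {				/* stand-in for struct ksegrp */
	struct xthreadq	runq;			/* threads, best priority first */
	struct xthread	*last_assigned;		/* last queued thread with a slot */
	int		avail_slots;		/* M: KSEs not currently running */
};

static void
xsetrunqueue(struct xksegrp *kg, struct xthread *td)
{
	struct xthread *td2;

	/* Insert in priority order, as the real setrunqueue() does. */
	TAILQ_FOREACH(td2, &kg->runq, runq) {
		if (td2->priority > td->priority) {
			TAILQ_INSERT_BEFORE(td2, td, runq);
			break;
		}
	}
	if (td2 == NULL)
		TAILQ_INSERT_TAIL(&kg->runq, td, runq);

	/*
	 * Hand out a free slot and keep last_assigned pointing at the
	 * lowest-priority queued thread that holds one.
	 */
	if (kg->avail_slots > 0) {
		kg->avail_slots--;		/* SLOT_USE() in the kernel */
		if (kg->last_assigned == NULL)
			kg->last_assigned = TAILQ_FIRST(&kg->runq);
		else if (kg->last_assigned->priority <= td->priority)
			kg->last_assigned =
			    TAILQ_NEXT(kg->last_assigned, runq);
		/* here the kernel would sched_add() the chosen thread */
	}
}
#endif
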
88#include <sys/cdefs.h>
89__FBSDID("$FreeBSD: head/sys/kern/kern_switch.c 139804 2005-01-06 23:35:40Z imp $");
90
91#include "opt_sched.h"
92
93#ifndef KERN_SWITCH_INCLUDE
94#include <sys/param.h>
95#include <sys/systm.h>
96#include <sys/kdb.h>
97#include <sys/kernel.h>
98#include <sys/ktr.h>
99#include <sys/lock.h>
100#include <sys/mutex.h>
101#include <sys/proc.h>
102#include <sys/queue.h>
103#include <sys/sched.h>
104#else  /* KERN_SWITCH_INCLUDE */
105#if defined(SMP) && (defined(__i386__) || defined(__amd64__))
106#include <sys/smp.h>
107#endif
108#include <machine/critical.h>
109#if defined(SMP) && defined(SCHED_4BSD)
110#include <sys/sysctl.h>
111#endif
112
113#ifdef FULL_PREEMPTION
114#ifndef PREEMPTION
115#error "The FULL_PREEMPTION option requires the PREEMPTION option"
116#endif
117#endif
118
119CTASSERT((RQB_BPW * RQB_LEN) == RQ_NQS);
120
121#define td_kse td_sched
122
123/************************************************************************
124 * Functions that manipulate runnability from a thread perspective.	*
125 ************************************************************************/
126/*
127 * Select the KSE that will be run next.  From that find the thread, and
128 * remove it from the KSEGRP's run queue.  If there is thread clustering,
129 * this is the function that will implement it.
130 */
131struct thread *
132choosethread(void)
133{
134	struct kse *ke;
135	struct thread *td;
136	struct ksegrp *kg;
137
138#if defined(SMP) && (defined(__i386__) || defined(__amd64__))
139	if (smp_active == 0 && PCPU_GET(cpuid) != 0) {
140		/* Shutting down, run idlethread on APs */
141		td = PCPU_GET(idlethread);
142		ke = td->td_kse;
143		CTR1(KTR_RUNQ, "choosethread: td=%p (idle)", td);
144		ke->ke_flags |= KEF_DIDRUN;
145		TD_SET_RUNNING(td);
146		return (td);
147	}
148#endif
149
150retry:
151	ke = sched_choose();
152	if (ke) {
153		td = ke->ke_thread;
154		KASSERT((td->td_kse == ke), ("kse/thread mismatch"));
155		kg = ke->ke_ksegrp;
156		if (td->td_proc->p_flag & P_HADTHREADS) {
157			if (kg->kg_last_assigned == td) {
158				kg->kg_last_assigned = TAILQ_PREV(td,
159				    threadqueue, td_runq);
160			}
161			TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
162		}
163		CTR2(KTR_RUNQ, "choosethread: td=%p pri=%d",
164		    td, td->td_priority);
165	} else {
166		/* Simulate runq_choose() having returned the idle thread */
167		td = PCPU_GET(idlethread);
168		ke = td->td_kse;
169		CTR1(KTR_RUNQ, "choosethread: td=%p (idle)", td);
170	}
171	ke->ke_flags |= KEF_DIDRUN;
172
173	/*
174	 * If we are in panic, only allow system threads,
175	 * plus the one we are running in, to be run.
176	 */
177	if (panicstr && ((td->td_proc->p_flag & P_SYSTEM) == 0 &&
178	    (td->td_flags & TDF_INPANIC) == 0)) {
179		/* note that it is no longer on the run queue */
180		TD_SET_CAN_RUN(td);
181		goto retry;
182	}
183
184	TD_SET_RUNNING(td);
185	return (td);
186}
187
188/*
189 * Given a surplus system slot, try to assign a new runnable thread to it.
190 * Called from:
191 *  sched_thread_exit()  (local)
192 *  sched_switch()  (local)
194 *  remrunqueue()  (local)  (not at the moment)
195 */
196static void
197slot_fill(struct ksegrp *kg)
198{
199	struct thread *td;
200
201	mtx_assert(&sched_lock, MA_OWNED);
202	while (kg->kg_avail_opennings > 0) {
203		/*
204		 * Find the first unassigned thread
205		 */
206		if ((td = kg->kg_last_assigned) != NULL)
207			td = TAILQ_NEXT(td, td_runq);
208		else
209			td = TAILQ_FIRST(&kg->kg_runq);
210
211		/*
212		 * If we found one, send it to the system scheduler.
213		 */
214		if (td) {
215			kg->kg_last_assigned = td;
216			sched_add(td, SRQ_YIELDING);
217			CTR2(KTR_RUNQ, "slot_fill: td%p -> kg%p", td, kg);
218		} else {
219			/* no threads to use up the slots. quit now */
220			break;
221		}
222	}
223}
224
225#ifdef	SCHED_4BSD
226/*
227 * Remove a thread from its KSEGRP's run queue.
228 * This in turn may remove it from a KSE if it was already assigned
229 * to one, possibly causing a new thread to be assigned to the KSE
230 * and the KSE getting a new priority.
231 */
232static void
233remrunqueue(struct thread *td)
234{
235	struct thread *td2, *td3;
236	struct ksegrp *kg;
237	struct kse *ke;
238
239	mtx_assert(&sched_lock, MA_OWNED);
240	KASSERT((TD_ON_RUNQ(td)), ("remrunqueue: Bad state on run queue"));
241	kg = td->td_ksegrp;
242	ke = td->td_kse;
243	CTR1(KTR_RUNQ, "remrunqueue: td%p", td);
244	TD_SET_CAN_RUN(td);
245	/*
246	 * If it is not a threaded process, take the shortcut.
247	 */
248	if ((td->td_proc->p_flag & P_HADTHREADS) == 0) {
249		/* remove from the system run queue and free up a slot */
250		sched_rem(td);
251		ke->ke_state = KES_THREAD;
252		return;
253	}
254   	td3 = TAILQ_PREV(td, threadqueue, td_runq);
255	TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
256	if (ke->ke_state == KES_ONRUNQ) {
257		/*
258		 * This thread has been assigned to the system run queue.
259		 * We need to dissociate it and try to assign the
260		 * KSE to the next available thread. Then, we should
261		 * see if we need to move the KSE in the run queues.
262		 */
263		sched_rem(td);
264		ke->ke_state = KES_THREAD;
265		td2 = kg->kg_last_assigned;
266		KASSERT((td2 != NULL), ("last assigned has wrong value"));
267		if (td2 == td)
268			kg->kg_last_assigned = td3;
269		/* slot_fill(kg); */ /* will replace it with another */
270	}
271}
272#endif
273
274/*
275 * Change the priority of a thread that is on the run queue.
276 */
277void
278adjustrunqueue(struct thread *td, int newpri)
279{
280	struct ksegrp *kg;
281	struct kse *ke;
282
283	mtx_assert(&sched_lock, MA_OWNED);
284	KASSERT((TD_ON_RUNQ(td)), ("adjustrunqueue: Bad state on run queue"));
285
286	ke = td->td_kse;
287	CTR1(KTR_RUNQ, "adjustrunqueue: td%p", td);
288	/*
289	 * If it is not a threaded process, take the shortcut.
290	 */
291	if ((td->td_proc->p_flag & P_HADTHREADS) == 0) {
292		/* We only care about the kse in the run queue. */
293		td->td_priority = newpri;
294		if (ke->ke_rqindex != (newpri / RQ_PPQ)) {
295			sched_rem(td);
296			sched_add(td, SRQ_BORING);
297		}
298		return;
299	}
300
301	/* It is a threaded process */
302	kg = td->td_ksegrp;
303	if (ke->ke_state == KES_ONRUNQ) {
304		if (kg->kg_last_assigned == td) {
305			kg->kg_last_assigned =
306			    TAILQ_PREV(td, threadqueue, td_runq);
307		}
308		sched_rem(td);
309	}
310	TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
311	TD_SET_CAN_RUN(td);
312	td->td_priority = newpri;
313	setrunqueue(td, SRQ_BORING);
314}
315
316/*
317 * This function is called when a thread is about to be put on a
318 * ksegrp run queue because it has been made runnable or its
319 * priority has been adjusted and the ksegrp does not have a
320 * free kse slot.  It determines if a thread from the same ksegrp
321 * should be preempted.  If so, it switches threads when the thread to
322 * be preempted is running on the current cpu, or notifies the cpu
323 * running it that it should switch threads.
324 */
325
326static void
327maybe_preempt_in_ksegrp(struct thread *td)
328#if  !defined(SMP)
329{
330	struct thread *running_thread;
331
332#ifndef FULL_PREEMPTION
333	int pri;
334	pri = td->td_priority;
335	if (!(pri >= PRI_MIN_ITHD && pri <= PRI_MAX_ITHD))
336		return;
337#endif
338	mtx_assert(&sched_lock, MA_OWNED);
339	running_thread = curthread;
340
341	if (running_thread->td_ksegrp != td->td_ksegrp)
342		return;
343
344	if (td->td_priority > running_thread->td_priority)
345		return;
346#ifdef PREEMPTION
347	if (running_thread->td_critnest > 1)
348		running_thread->td_pflags |= TDP_OWEPREEMPT;
349	else
350		mi_switch(SW_INVOL, NULL);
351
352#else
353	running_thread->td_flags |= TDF_NEEDRESCHED;
354#endif
355	return;
356}
357
358#else /* SMP */
359{
360	struct thread *running_thread;
361	int worst_pri;
362	struct ksegrp *kg;
363	cpumask_t cpumask,dontuse;
364	struct pcpu *pc;
365	struct pcpu *best_pcpu;
366	struct thread *cputhread;
367
368#ifndef FULL_PREEMPTION
369	int pri;
370	pri = td->td_priority;
371	if (!(pri >= PRI_MIN_ITHD && pri <= PRI_MAX_ITHD))
372		return;
373#endif
374
375	mtx_assert(&sched_lock, MA_OWNED);
376
377	running_thread = curthread;
378
379#if !defined(KSEG_PEEMPT_BEST_CPU)
380	if (running_thread->td_ksegrp != td->td_ksegrp) {
381#endif
382		kg = td->td_ksegrp;
383
384		/* if someone is ahead of this thread, wait our turn */
385		if (td != TAILQ_FIRST(&kg->kg_runq))
386			return;
387
388		worst_pri = td->td_priority;
389		best_pcpu = NULL;
390		dontuse   = stopped_cpus | idle_cpus_mask;
391
392		/*
393		 * Find the cpu with the worst (numerically highest) priority
394		 * that is running a thread from the same ksegrp.  If several
395		 * qualify, prefer the thread's last cpu, then the current cpu.
396		 */
397
398		SLIST_FOREACH(pc, &cpuhead, pc_allcpu) {
399			cpumask   = pc->pc_cpumask;
400			cputhread = pc->pc_curthread;
401
402			if ((cpumask & dontuse)  ||
403			    cputhread->td_ksegrp != kg)
404				continue;
405
406			if (cputhread->td_priority > worst_pri) {
407				worst_pri = cputhread->td_priority;
408				best_pcpu = pc;
409				continue;
410			}
411
412			if (cputhread->td_priority == worst_pri &&
413			    best_pcpu != NULL &&
414			    (td->td_lastcpu == pc->pc_cpuid ||
415				(PCPU_GET(cpumask) == cpumask &&
416				    td->td_lastcpu != best_pcpu->pc_cpuid)))
417			    best_pcpu = pc;
418		}
419
420		/* Check if we need to preempt someone */
421		if (best_pcpu == NULL)
422			return;
423
424		if (PCPU_GET(cpuid) != best_pcpu->pc_cpuid) {
425			best_pcpu->pc_curthread->td_flags |= TDF_NEEDRESCHED;
426			ipi_selected(best_pcpu->pc_cpumask, IPI_AST);
427			return;
428		}
429#if !defined(KSEG_PEEMPT_BEST_CPU)
430	}
431#endif
432
433	if (td->td_priority > running_thread->td_priority)
434		return;
435#ifdef PREEMPTION
436	if (running_thread->td_critnest > 1)
437		running_thread->td_pflags |= TDP_OWEPREEMPT;
438	else
439		mi_switch(SW_INVOL, NULL);
440
441#else
442	running_thread->td_flags |= TDF_NEEDRESCHED;
443#endif
444	return;
445}
446#endif /* !SMP */
447
448
449int limitcount;
450void
451setrunqueue(struct thread *td, int flags)
452{
453	struct ksegrp *kg;
454	struct thread *td2;
455	struct thread *tda;
456
457	CTR3(KTR_RUNQ, "setrunqueue: td:%p kg:%p pid:%d",
458	    td, td->td_ksegrp, td->td_proc->p_pid);
459	CTR5(KTR_SCHED, "setrunqueue: %p(%s) prio %d by %p(%s)",
460            td, td->td_proc->p_comm, td->td_priority, curthread,
461            curthread->td_proc->p_comm);
462	mtx_assert(&sched_lock, MA_OWNED);
463	KASSERT((td->td_inhibitors == 0),
464			("setrunqueue: trying to run inhibited thread"));
465	KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
466	    ("setrunqueue: bad thread state"));
467	TD_SET_RUNQ(td);
468	kg = td->td_ksegrp;
469	if ((td->td_proc->p_flag & P_HADTHREADS) == 0) {
470		/*
471		 * Common path optimisation: Only one of everything
472		 * and the KSE is always already attached.
473		 * Totally ignore the ksegrp run queue.
474		 */
475		if (kg->kg_avail_opennings != 1) {
476			if (limitcount < 1) {
477				limitcount++;
478				printf("pid %d: corrected slot count (%d->1)\n",
479				    td->td_proc->p_pid, kg->kg_avail_opennings);
480
481			}
482			kg->kg_avail_opennings = 1;
483		}
484		sched_add(td, flags);
485		return;
486	}
487
488	 * If the concurrency has been reduced, and the new thread would land
489	 * in the assigned section, then keep removing entries from the
490	 * system run queue until we are no longer in that section
491	 * or until there is room for us to be put in that section.
492	 * What we MUST avoid is the case where threads of lower
493	 * priority than the new one are scheduled, but the new one can not
494	 * be scheduled itself.  That would lead to a non-contiguous set
495	 * of scheduled threads, and everything would break.
496	 * of scheduled threads, and everything would break.
497	 */
498	tda = kg->kg_last_assigned;
499	while ((kg->kg_avail_opennings <= 0) &&
500	    (tda && (tda->td_priority > td->td_priority))) {
501		/*
502		 * None free, but there is one we can commandeer.
503		 */
504		CTR2(KTR_RUNQ,
505		    "setrunqueue: kg:%p: take slot from td: %p", kg, tda);
506		sched_rem(tda);
507		tda = kg->kg_last_assigned =
508		    TAILQ_PREV(tda, threadqueue, td_runq);
509	}
510
511	/*
512	 * Add the thread to the ksegrp's run queue at
513	 * the appropriate place.
514	 */
515	TAILQ_FOREACH(td2, &kg->kg_runq, td_runq) {
516		if (td2->td_priority > td->td_priority) {
517			TAILQ_INSERT_BEFORE(td2, td, td_runq);
518			break;
519		}
520	}
521	if (td2 == NULL) {
522		/* We ran off the end of the TAILQ or it was empty. */
523		TAILQ_INSERT_TAIL(&kg->kg_runq, td, td_runq);
524	}
525
526	/*
527	 * run queue and, if needed, readjust the last_assigned pointer.
528	 * It may be that we need to schedule something anyhow,
529	 * even if the available slot count is negative, so that
530	 * all the threads ahead of last_assigned stay scheduled.
531	 * all the items < last_assigned are scheduled.
532	 */
533	if (kg->kg_avail_opennings > 0) {
534		if (tda == NULL) {
535			/*
536			 * No pre-existing last assigned so whoever is first
537			 * gets the slot.. (maybe us)
538			 */
539			td2 = TAILQ_FIRST(&kg->kg_runq);
540			kg->kg_last_assigned = td2;
541		} else if (tda->td_priority > td->td_priority) {
542			td2 = td;
543		} else {
544			/*
545			 * We are past last_assigned, so
546			 * give the next slot to whatever is next,
547			 * which may or may not be us.
548			 */
549			td2 = TAILQ_NEXT(tda, td_runq);
550			kg->kg_last_assigned = td2;
551		}
552		sched_add(td2, flags);
553	} else {
554		CTR3(KTR_RUNQ, "setrunqueue: held: td%p kg%p pid%d",
555			td, td->td_ksegrp, td->td_proc->p_pid);
556		if ((flags & SRQ_YIELDING) == 0)
557			maybe_preempt_in_ksegrp(td);
558	}
559}
560
561/*
562 * Kernel thread preemption implementation.  Critical sections mark
563 * regions of code in which preemptions are not allowed.
564 */
565void
566critical_enter(void)
567{
568	struct thread *td;
569
570	td = curthread;
571	if (td->td_critnest == 0)
572		cpu_critical_enter(td);
573	td->td_critnest++;
574	CTR4(KTR_CRITICAL, "critical_enter by thread %p (%ld, %s) to %d", td,
575	    (long)td->td_proc->p_pid, td->td_proc->p_comm, td->td_critnest);
576}
577
578void
579critical_exit(void)
580{
581	struct thread *td;
582
583	td = curthread;
584	KASSERT(td->td_critnest != 0,
585	    ("critical_exit: td_critnest == 0"));
586	if (td->td_critnest == 1) {
587		if (td->td_pflags & TDP_WAKEPROC0) {
588			td->td_pflags &= ~TDP_WAKEPROC0;
589			wakeup(&proc0);
590		}
591#ifdef PREEMPTION
592		mtx_assert(&sched_lock, MA_NOTOWNED);
593		if (td->td_pflags & TDP_OWEPREEMPT) {
594			mtx_lock_spin(&sched_lock);
595			mi_switch(SW_INVOL, NULL);
596			mtx_unlock_spin(&sched_lock);
597		}
598#endif
599		td->td_critnest = 0;
600		cpu_critical_exit(td);
601	} else {
602		td->td_critnest--;
603	}
604	CTR4(KTR_CRITICAL, "critical_exit by thread %p (%ld, %s) to %d", td,
605	    (long)td->td_proc->p_pid, td->td_proc->p_comm, td->td_critnest);
606}
607
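/*
 * A typical (hypothetical) use of the pair above; the body shown is only
 * a placeholder for per-CPU work that must not be interleaved with a
 * context switch on this CPU.
 */
#if 0	/* usage sketch only */
	critical_enter();
	/*
	 * While td_critnest is non-zero this thread will not be preempted
	 * here: any preemption request is recorded as TDP_OWEPREEMPT and
	 * acted on by the outermost critical_exit().
	 */
	critical_exit();
#endif
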
608/*
609 * This function is called when a thread is about to be put on run queue
610 * because it has been made runnable or its priority has been adjusted.  It
611 * determines whether we should immediately preempt to the new thread.  If so,
612 * it switches to it and eventually returns true.  If not, it returns false
613 * so that the caller may place the thread on an appropriate run queue.
614 */
615int
616maybe_preempt(struct thread *td)
617{
618#ifdef PREEMPTION
619	struct thread *ctd;
620	int cpri, pri;
621#endif
622
623	mtx_assert(&sched_lock, MA_OWNED);
624#ifdef PREEMPTION
625	/*
626	 * The new thread should not preempt the current thread if any of the
627	 * following conditions are true:
628	 *
629	 *  - The current thread has a higher (numerically lower) or
630	 *    equivalent priority.  Note that this prevents curthread from
631	 *    trying to preempt to itself.
632	 *  - It is too early in the boot for context switches (cold is set).
633	 *  - The current thread has an inhibitor set or is in the process of
634	 *    exiting.  In this case, the current thread is about to switch
635 *    out anyway, so there's no point in preempting.  If we did,
636	 *    the current thread would not be properly resumed as well, so
637	 *    just avoid that whole landmine.
638	 *  - If the new thread's priority is not a realtime priority and
639	 *    the current thread's priority is not an idle priority and
640	 *    FULL_PREEMPTION is disabled.
641	 *
642	 * If all of these conditions are false, but the current thread is in
643	 * a nested critical section, then we have to defer the preemption
644	 * until we exit the critical section.  Otherwise, switch immediately
645	 * to the new thread.
646	 */
647	ctd = curthread;
648	KASSERT ((ctd->td_kse != NULL && ctd->td_kse->ke_thread == ctd),
649	  ("thread has no (or wrong) sched-private part."));
650	KASSERT((td->td_inhibitors == 0),
651			("maybe_preempt: trying to run inhibited thread"));
652	pri = td->td_priority;
653	cpri = ctd->td_priority;
654	if (pri >= cpri || cold /* || dumping */ || TD_IS_INHIBITED(ctd) ||
655	    td->td_kse->ke_state != KES_THREAD)
656		return (0);
657#ifndef FULL_PREEMPTION
658	if (!(pri >= PRI_MIN_ITHD && pri <= PRI_MAX_ITHD) &&
659	    !(cpri >= PRI_MIN_IDLE))
660		return (0);
661#endif
662	if (ctd->td_critnest > 1) {
663		CTR1(KTR_PROC, "maybe_preempt: in critical section %d",
664		    ctd->td_critnest);
665		ctd->td_pflags |= TDP_OWEPREEMPT;
666		return (0);
667	}
668
669	/*
670	 * Thread is runnable but not yet put on system run queue.
671	 */
672	MPASS(TD_ON_RUNQ(td));
673	MPASS(td->td_sched->ke_state != KES_ONRUNQ);
674	if (td->td_proc->p_flag & P_HADTHREADS) {
675		/*
676		 * If this is a threaded process we actually ARE on the
677		 * ksegrp run queue so take it off that first.
678		 * Also undo any damage done to the last_assigned pointer.
679		 * XXX Fix setrunqueue so this isn't needed
680		 */
681		struct ksegrp *kg;
682
683		kg = td->td_ksegrp;
684		if (kg->kg_last_assigned == td)
685			kg->kg_last_assigned =
686			    TAILQ_PREV(td, threadqueue, td_runq);
687		TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
688	}
689
690	TD_SET_RUNNING(td);
691	CTR3(KTR_PROC, "preempting to thread %p (pid %d, %s)\n", td,
692	    td->td_proc->p_pid, td->td_proc->p_comm);
693	mi_switch(SW_INVOL|SW_PREEMPT, td);
694	return (1);
695#else
696	return (0);
697#endif
698}
699
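/*
 * The priority gate used by maybe_preempt() above, pulled out as a
 * stand-alone predicate for illustration (the cold/inhibited/critical
 * section checks are left out, and the function name is invented here).
 */
#if 0	/* distilled sketch only */
static int
would_preempt(int pri, int cpri)	/* new and current thread priority */
{

	if (pri >= cpri)		/* lower value == higher priority */
		return (0);
#ifndef FULL_PREEMPTION
	/*
	 * Only interrupt-thread priorities preempt, unless the current
	 * thread is running at an idle priority.
	 */
	if (!(pri >= PRI_MIN_ITHD && pri <= PRI_MAX_ITHD) &&
	    !(cpri >= PRI_MIN_IDLE))
		return (0);
#endif
	return (1);
}
#endif
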
700#if 0
701#ifndef PREEMPTION
702/* XXX: There should be a non-static version of this. */
703static void
704printf_caddr_t(void *data)
705{
706	printf("%s", (char *)data);
707}
708static char preempt_warning[] =
709    "WARNING: Kernel preemption is disabled, expect reduced performance.\n";
710SYSINIT(preempt_warning, SI_SUB_COPYRIGHT, SI_ORDER_ANY, printf_caddr_t,
711    preempt_warning)
712#endif
713#endif
714
715/************************************************************************
716 * SYSTEM RUN QUEUE manipulations and tests				*
717 ************************************************************************/
718/*
719 * Initialize a run structure.
720 */
721void
722runq_init(struct runq *rq)
723{
724	int i;
725
726	bzero(rq, sizeof *rq);
727	for (i = 0; i < RQ_NQS; i++)
728		TAILQ_INIT(&rq->rq_queues[i]);
729}
730
731/*
732 * Clear the status bit of the queue corresponding to priority level pri,
733 * indicating that it is empty.
734 */
735static __inline void
736runq_clrbit(struct runq *rq, int pri)
737{
738	struct rqbits *rqb;
739
740	rqb = &rq->rq_status;
741	CTR4(KTR_RUNQ, "runq_clrbit: bits=%#x %#x bit=%#x word=%d",
742	    rqb->rqb_bits[RQB_WORD(pri)],
743	    rqb->rqb_bits[RQB_WORD(pri)] & ~RQB_BIT(pri),
744	    RQB_BIT(pri), RQB_WORD(pri));
745	rqb->rqb_bits[RQB_WORD(pri)] &= ~RQB_BIT(pri);
746}
747
748/*
749 * Find the index of the first non-empty run queue.  This is done by
750 * scanning the status bits, a set bit indicates a non-empty queue.
751 */
752static __inline int
753runq_findbit(struct runq *rq)
754{
755	struct rqbits *rqb;
756	int pri;
757	int i;
758
759	rqb = &rq->rq_status;
760	for (i = 0; i < RQB_LEN; i++)
761		if (rqb->rqb_bits[i]) {
762			pri = RQB_FFS(rqb->rqb_bits[i]) + (i << RQB_L2BPW);
763			CTR3(KTR_RUNQ, "runq_findbit: bits=%#x i=%d pri=%d",
764			    rqb->rqb_bits[i], i, pri);
765			return (pri);
766		}
767
768	return (-1);
769}
770
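/*
 * The same two-level scan in stand-alone form: one status bit per priority
 * queue, ffs() to find the lowest set bit in a word.  The sizes and names
 * below are invented for the example; the kernel's real values come from
 * the RQB_* macros used above.
 */
#if 0	/* userland sketch only */
#include <strings.h>			/* ffs() */

#define	XRQB_BPW	32		/* bits per status word (assumed) */
#define	XRQB_LEN	2		/* 2 * 32 bits covers 64 queues */

static int
xrunq_findbit(const unsigned int bits[XRQB_LEN])
{
	int i;

	for (i = 0; i < XRQB_LEN; i++)
		if (bits[i] != 0)
			/* ffs() is 1-based, hence the -1. */
			return ((ffs(bits[i]) - 1) + i * XRQB_BPW);
	return (-1);			/* all queues empty */
}
#endif
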
771/*
772 * Set the status bit of the queue corresponding to priority level pri,
773 * indicating that it is non-empty.
774 */
775static __inline void
776runq_setbit(struct runq *rq, int pri)
777{
778	struct rqbits *rqb;
779
780	rqb = &rq->rq_status;
781	CTR4(KTR_RUNQ, "runq_setbit: bits=%#x %#x bit=%#x word=%d",
782	    rqb->rqb_bits[RQB_WORD(pri)],
783	    rqb->rqb_bits[RQB_WORD(pri)] | RQB_BIT(pri),
784	    RQB_BIT(pri), RQB_WORD(pri));
785	rqb->rqb_bits[RQB_WORD(pri)] |= RQB_BIT(pri);
786}
787
788/*
789 * Add the KSE to the queue specified by its priority, and set the
790 * corresponding status bit.
791 */
792void
793runq_add(struct runq *rq, struct kse *ke, int flags)
794{
795	struct rqhead *rqh;
796	int pri;
797
798	pri = ke->ke_thread->td_priority / RQ_PPQ;
799	ke->ke_rqindex = pri;
800	runq_setbit(rq, pri);
801	rqh = &rq->rq_queues[pri];
802	CTR5(KTR_RUNQ, "runq_add: td=%p ke=%p pri=%d %d rqh=%p",
803	    ke->ke_thread, ke, ke->ke_thread->td_priority, pri, rqh);
804	if (flags & SRQ_PREEMPTED) {
805		TAILQ_INSERT_HEAD(rqh, ke, ke_procq);
806	} else {
807		TAILQ_INSERT_TAIL(rqh, ke, ke_procq);
808	}
809}
810
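/*
 * Worked example of the mapping above, assuming the usual RQ_PPQ of 4
 * priorities per queue and RQ_NQS of 64 queues: td_priority 0..3 lands
 * in queue 0, priority 130 in queue 32, and priority 255 in queue 63.
 * SRQ_PREEMPTED queues a preempted KSE at the head of its queue so it
 * runs again before other KSEs of the same priority.
 */
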
811/*
812 * Return true if there are runnable processes of any priority on the run
813 * queue, false otherwise.  Has no side effects, does not modify the run
814 * queue structure.
815 */
816int
817runq_check(struct runq *rq)
818{
819	struct rqbits *rqb;
820	int i;
821
822	rqb = &rq->rq_status;
823	for (i = 0; i < RQB_LEN; i++)
824		if (rqb->rqb_bits[i]) {
825			CTR2(KTR_RUNQ, "runq_check: bits=%#x i=%d",
826			    rqb->rqb_bits[i], i);
827			return (1);
828		}
829	CTR0(KTR_RUNQ, "runq_check: empty");
830
831	return (0);
832}
833
834#if defined(SMP) && defined(SCHED_4BSD)
835int runq_fuzz = 1;
836SYSCTL_INT(_kern_sched, OID_AUTO, runq_fuzz, CTLFLAG_RW, &runq_fuzz, 0, "");
837#endif
838
839/*
840 * Find the highest priority process on the run queue.
841 */
842struct kse *
843runq_choose(struct runq *rq)
844{
845	struct rqhead *rqh;
846	struct kse *ke;
847	int pri;
848
849	mtx_assert(&sched_lock, MA_OWNED);
850	while ((pri = runq_findbit(rq)) != -1) {
851		rqh = &rq->rq_queues[pri];
852#if defined(SMP) && defined(SCHED_4BSD)
853		/* fuzz == 1 is normal.. 0 or less are ignored */
854		if (runq_fuzz > 1) {
855			/*
856			 * In the first couple of entries, check if
857			 * there is one for our CPU as a preference.
858			 */
859			int count = runq_fuzz;
860			int cpu = PCPU_GET(cpuid);
861			struct kse *ke2;
862			ke2 = ke = TAILQ_FIRST(rqh);
863
864			while (count-- && ke2) {
865				if (ke2->ke_thread->td_lastcpu == cpu) {
866					ke = ke2;
867					break;
868				}
869				ke2 = TAILQ_NEXT(ke2, ke_procq);
870			}
871		} else
872#endif
873			ke = TAILQ_FIRST(rqh);
874		KASSERT(ke != NULL, ("runq_choose: no proc on busy queue"));
875		CTR3(KTR_RUNQ,
876		    "runq_choose: pri=%d kse=%p rqh=%p", pri, ke, rqh);
877		return (ke);
878	}
879	CTR1(KTR_RUNQ, "runq_choose: idleproc pri=%d", pri);
880
881	return (NULL);
882}
883
884/*
885 * Remove the KSE from the queue specified by its priority, and clear the
886 * corresponding status bit if the queue becomes empty.
887 * Caller must set ke->ke_state afterwards.
888 */
889void
890runq_remove(struct runq *rq, struct kse *ke)
891{
892	struct rqhead *rqh;
893	int pri;
894
895	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
896		("runq_remove: process swapped out"));
897	pri = ke->ke_rqindex;
898	rqh = &rq->rq_queues[pri];
899	CTR5(KTR_RUNQ, "runq_remove: td=%p, ke=%p pri=%d %d rqh=%p",
900	    ke->ke_thread, ke, ke->ke_thread->td_priority, pri, rqh);
901	KASSERT(ke != NULL, ("runq_remove: no proc on busy queue"));
902	TAILQ_REMOVE(rqh, ke, ke_procq);
903	if (TAILQ_EMPTY(rqh)) {
904		CTR0(KTR_RUNQ, "runq_remove: empty");
905		runq_clrbit(rq, pri);
906	}
907}
908
909/****** functions that are temporarily here ***********/
910#include <vm/uma.h>
911extern struct mtx kse_zombie_lock;
912
913/*
914 *  Allocate scheduler specific per-process resources.
915 * The thread and ksegrp have already been linked in.
916 * In this case just set the default concurrency value.
917 *
918 * Called from:
919 *  proc_init() (UMA init method)
920 */
921void
922sched_newproc(struct proc *p, struct ksegrp *kg, struct thread *td)
923{
924
925	/* This can go in sched_fork */
926	sched_init_concurrency(kg);
927}
928
929/*
930 * A thread is either being created or recycled.
931 * Fix up the per-scheduler resources associated with it.
932 * Called from:
933 *  sched_fork_thread()
934 *  thread_dtor()  (*may go away)
935 *  thread_init()  (*may go away)
936 */
937void
938sched_newthread(struct thread *td)
939{
940	struct td_sched *ke;
941
942	ke = (struct td_sched *) (td + 1);
943	bzero(ke, sizeof(*ke));
944	td->td_sched     = ke;
945	ke->ke_thread	= td;
946	ke->ke_state	= KES_THREAD;
947}
948
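/*
 * The (td + 1) cast above relies on the thread and its scheduler-private
 * data being allocated back to back in a single object, i.e. conceptually:
 *
 *	allocation size = sizeof(struct thread) + sizeof(struct td_sched);
 *	td              = base of the allocation;
 *	td->td_sched    = (struct td_sched *)(td + 1);
 *
 * so the td_sched always sits immediately after its thread in memory.
 */
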
949/*
950 * Set up an initial concurrency of 1
951 * and set the given thread (if given) to be using that
952 * concurrency slot.
953 * May be used "offline"..before the ksegrp is attached to the world
954 * and thus wouldn't need schedlock in that case.
955 * Called from:
956 *  thr_create()
957 *  proc_init() (UMA) via sched_newproc()
958 */
959void
960sched_init_concurrency(struct ksegrp *kg)
961{
962
963	CTR1(KTR_RUNQ,"kg %p init slots and concurrency to 1", kg);
964	kg->kg_concurrency = 1;
965	kg->kg_avail_opennings = 1;
966}
967
968/*
969 * Change the concurrency of an existing ksegrp to N
970 * Called from:
971 *  kse_create()
972 *  kse_exit()
973 *  thread_exit()
974 *  thread_single()
975 */
976void
977sched_set_concurrency(struct ksegrp *kg, int concurrency)
978{
979
980	CTR4(KTR_RUNQ,"kg %p set concurrency to %d, slots %d -> %d",
981	    kg,
982	    concurrency,
983	    kg->kg_avail_opennings,
984	    kg->kg_avail_opennings + (concurrency - kg->kg_concurrency));
985	kg->kg_avail_opennings += (concurrency - kg->kg_concurrency);
986	kg->kg_concurrency = concurrency;
987}
988
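/*
 * Example of the arithmetic above: a ksegrp at concurrency 1 whose only
 * slot is in use (kg_avail_opennings == 0) that is raised to concurrency
 * 3 gains 3 - 1 = 2 openings, which slot_fill() can then hand out to
 * queued threads.  Lowering the concurrency can likewise drive
 * kg_avail_opennings negative until running threads release their slots.
 */
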
989/*
990 * Called from thread_exit() for all exiting threads.
991 *
992 * Not to be confused with sched_exit_thread(), which is only called from
993 * thread_exit() for threads exiting without the rest of the process
994 * exiting, because it is also called from sched_exit() and we wouldn't
995 * want to call it twice.
996 * XXX This can probably be fixed.
997 */
998void
999sched_thread_exit(struct thread *td)
1000{
1001
1002	SLOT_RELEASE(td->td_ksegrp);
1003	slot_fill(td->td_ksegrp);
1004}
1005
1006#endif /* KERN_SWITCH_INCLUDE */
1007