kern_switch.c revision 159154
/*-
 * Copyright (c) 2001 Jake Burkholder <jake@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
26
/***
Here is the logic:

If there are N processors, then there are at most N KSEs (kernel
schedulable entities) working to process threads that belong to a
KSEGROUP (kg). If X of these KSEs are actually running at the
moment in question, then there are at most M (= N - X) of these KSEs on
the run queue, as running KSEs are not on the queue.

Runnable threads are queued off the KSEGROUP in priority order.
If there are M or more threads runnable, the top M threads
(by priority) are 'preassigned' to the M KSEs not running. The KSEs take
their priority from those threads and are put on the run queue.

The last thread that had a priority high enough to have a KSE associated
with it, AND IS ON THE RUN QUEUE, is pointed to by
kg->kg_last_assigned. If no threads queued off the KSEGROUP have KSEs
assigned, either because all the available KSEs are actively running or
because there are no threads queued, that pointer is NULL.

When a KSE is removed from the run queue to become runnable, we know
it was associated with the highest priority thread in the queue (at the head
of the queue). If it is also the last assigned, we know M was 1 and must
now be 0. Since the thread is no longer queued, that pointer must be
cleared. Since we know there were no more KSEs available
(M was 1 and is now 0), and since we are not FREEING our KSE
but using it, we know there are STILL no more KSEs available; therefore
the next thread in the ksegrp list will not have a KSE to assign to
it, so the pointer must be made 'invalid' (NULL).

The pointer exists so that when a new thread is made runnable, its
priority can be compared with that of the last assigned thread to see if
it should 'steal' its KSE or not, i.e. whether it belongs 'earlier'
on the list than that thread or later. If it is earlier, the KSE is
removed from the last assigned thread (which is now not assigned a KSE)
and reassigned to the new thread, which is placed earlier in the list.
The pointer is then backed up to the previous thread (which may or may not
be the new thread).

When a thread sleeps or is removed, its KSE becomes available and if there
are queued threads that are not assigned KSEs, the highest priority one of
them is assigned the KSE, which is then placed back on the run queue at
the appropriate place, and the kg->kg_last_assigned pointer is adjusted down
to point to it.

The following diagram shows 2 KSEs and 3 threads from a single process.

 RUNQ: --->KSE---KSE--...    (KSEs queued at priorities from threads)
              \    \____
               \        \
    KSEGROUP---thread--thread--thread    (queued in priority order)
        \                 /
         \_______________/
          (last_assigned)

The result of this scheme is that the M available KSEs are always
queued at the priorities they have inherited from the M highest priority
threads for that KSEGROUP. If this situation changes, the KSEs are
reassigned to keep this true.
***/
87
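/*
 * Illustrative sketch (not compiled in): one way to state the
 * kg_last_assigned invariant described above.  Every thread up to and
 * including kg_last_assigned on the ksegrp run queue has been handed a
 * slot on the system run queue; everything after it is still waiting.
 * The helper name and the idea of returning a plain count are
 * assumptions made for illustration only.
 */
#if 0
static int
ksegrp_assigned_count(struct ksegrp *kg)
{
	struct thread *td;
	int assigned;

	assigned = 0;
	TAILQ_FOREACH(td, &kg->kg_runq, td_runq) {
		assigned++;
		if (td == kg->kg_last_assigned)
			return (assigned);	/* These threads hold slots. */
	}
	/* kg_last_assigned == NULL: no queued thread holds a slot. */
	return (0);
}
#endif
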
88#include <sys/cdefs.h>
89__FBSDID("$FreeBSD: head/sys/kern/kern_switch.c 159154 2006-06-01 22:45:56Z cognet $");
90
91#include "opt_sched.h"
92
93#ifndef KERN_SWITCH_INCLUDE
94#include <sys/param.h>
95#include <sys/systm.h>
96#include <sys/kdb.h>
97#include <sys/kernel.h>
98#include <sys/ktr.h>
99#include <sys/lock.h>
100#include <sys/mutex.h>
101#include <sys/proc.h>
102#include <sys/queue.h>
103#include <sys/sched.h>
104#else  /* KERN_SWITCH_INCLUDE */
105#if defined(SMP) && (defined(__i386__) || defined(__amd64__))
106#include <sys/smp.h>
107#endif
108#if defined(SMP) && defined(SCHED_4BSD)
109#include <sys/sysctl.h>
110#endif
111
112/* Uncomment this to enable logging of critical_enter/exit. */
113#if 0
114#define	KTR_CRITICAL	KTR_SCHED
115#else
116#define	KTR_CRITICAL	0
117#endif
118
119#ifdef FULL_PREEMPTION
120#ifndef PREEMPTION
121#error "The FULL_PREEMPTION option requires the PREEMPTION option"
122#endif
123#endif
124
125CTASSERT((RQB_BPW * RQB_LEN) == RQ_NQS);
126
127#define td_kse td_sched
128
129/*
130 * kern.sched.preemption allows user space to determine if preemption support
131 * is compiled in or not.  It is not currently a boot or runtime flag that
132 * can be changed.
133 */
134#ifdef PREEMPTION
135static int kern_sched_preemption = 1;
136#else
137static int kern_sched_preemption = 0;
138#endif
139SYSCTL_INT(_kern_sched, OID_AUTO, preemption, CTLFLAG_RD,
140    &kern_sched_preemption, 0, "Kernel preemption enabled");
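
/*
 * Usage sketch (not part of the kernel build): a userland program can
 * read the read-only knob declared above with sysctlbyname(3).  The
 * program below is illustrative only.
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int preemption;
	size_t len = sizeof(preemption);

	if (sysctlbyname("kern.sched.preemption", &preemption, &len,
	    NULL, 0) == -1)
		return (1);
	printf("kernel preemption compiled in: %d\n", preemption);
	return (0);
}
#endif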
141
142/************************************************************************
143 * Functions that manipulate runnability from a thread perspective.	*
144 ************************************************************************/
145/*
146 * Select the KSE that will be run next.  From that find the thread, and
147 * remove it from the KSEGRP's run queue.  If there is thread clustering,
148 * this will be what does it.
149 */
150struct thread *
151choosethread(void)
152{
153	struct kse *ke;
154	struct thread *td;
155	struct ksegrp *kg;
156
157#if defined(SMP) && (defined(__i386__) || defined(__amd64__))
158	if (smp_active == 0 && PCPU_GET(cpuid) != 0) {
		/* Shutting down, run idlethread on APs */
160		td = PCPU_GET(idlethread);
161		ke = td->td_kse;
162		CTR1(KTR_RUNQ, "choosethread: td=%p (idle)", td);
163		ke->ke_flags |= KEF_DIDRUN;
164		TD_SET_RUNNING(td);
165		return (td);
166	}
167#endif
168
169retry:
170	ke = sched_choose();
171	if (ke) {
172		td = ke->ke_thread;
173		KASSERT((td->td_kse == ke), ("kse/thread mismatch"));
174		kg = ke->ke_ksegrp;
175		if (td->td_proc->p_flag & P_HADTHREADS) {
176			if (kg->kg_last_assigned == td) {
177				kg->kg_last_assigned = TAILQ_PREV(td,
178				    threadqueue, td_runq);
179			}
180			TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
181		}
182		CTR2(KTR_RUNQ, "choosethread: td=%p pri=%d",
183		    td, td->td_priority);
184	} else {
185		/* Simulate runq_choose() having returned the idle thread */
186		td = PCPU_GET(idlethread);
187		ke = td->td_kse;
188		CTR1(KTR_RUNQ, "choosethread: td=%p (idle)", td);
189	}
190	ke->ke_flags |= KEF_DIDRUN;
191
192	/*
193	 * If we are in panic, only allow system threads,
194	 * plus the one we are running in, to be run.
195	 */
196	if (panicstr && ((td->td_proc->p_flag & P_SYSTEM) == 0 &&
197	    (td->td_flags & TDF_INPANIC) == 0)) {
198		/* note that it is no longer on the run queue */
199		TD_SET_CAN_RUN(td);
200		goto retry;
201	}
202
203	TD_SET_RUNNING(td);
204	return (td);
205}
206
/*
 * Given a surplus system slot, try to assign a new runnable thread to it.
 * Called from:
 *  sched_thread_exit()  (local)
 *  sched_switch()  (local)
 *  remrunqueue()  (local)  (not at the moment)
 */
215static void
216slot_fill(struct ksegrp *kg)
217{
218	struct thread *td;
219
220	mtx_assert(&sched_lock, MA_OWNED);
221	while (kg->kg_avail_opennings > 0) {
222		/*
223		 * Find the first unassigned thread
224		 */
225		if ((td = kg->kg_last_assigned) != NULL)
226			td = TAILQ_NEXT(td, td_runq);
227		else
228			td = TAILQ_FIRST(&kg->kg_runq);
229
230		/*
231		 * If we found one, send it to the system scheduler.
232		 */
233		if (td) {
234			kg->kg_last_assigned = td;
235			sched_add(td, SRQ_YIELDING);
236			CTR2(KTR_RUNQ, "slot_fill: td%p -> kg%p", td, kg);
237		} else {
238			/* no threads to use up the slots. quit now */
239			break;
240		}
241	}
242}
243
244#ifdef	SCHED_4BSD
245/*
246 * Remove a thread from its KSEGRP's run queue.
247 * This in turn may remove it from a KSE if it was already assigned
248 * to one, possibly causing a new thread to be assigned to the KSE
249 * and the KSE getting a new priority.
250 */
251static void
252remrunqueue(struct thread *td)
253{
254	struct thread *td2, *td3;
255	struct ksegrp *kg;
256	struct kse *ke;
257
258	mtx_assert(&sched_lock, MA_OWNED);
259	KASSERT((TD_ON_RUNQ(td)), ("remrunqueue: Bad state on run queue"));
260	kg = td->td_ksegrp;
261	ke = td->td_kse;
262	CTR1(KTR_RUNQ, "remrunqueue: td%p", td);
263	TD_SET_CAN_RUN(td);
264	/*
265	 * If it is not a threaded process, take the shortcut.
266	 */
267	if ((td->td_proc->p_flag & P_HADTHREADS) == 0) {
		/* remove from the system run queue and free up a slot */
269		sched_rem(td);
270		return;
271	}
272   	td3 = TAILQ_PREV(td, threadqueue, td_runq);
273	TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
274	if (ke->ke_state == KES_ONRUNQ) {
275		/*
276		 * This thread has been assigned to the system run queue.
		 * We need to dissociate it and try to assign the
278		 * KSE to the next available thread. Then, we should
279		 * see if we need to move the KSE in the run queues.
280		 */
281		sched_rem(td);
282		td2 = kg->kg_last_assigned;
283		KASSERT((td2 != NULL), ("last assigned has wrong value"));
284		if (td2 == td)
285			kg->kg_last_assigned = td3;
286		/* slot_fill(kg); */ /* will replace it with another */
287	}
288}
289#endif
290
291/*
292 * Change the priority of a thread that is on the run queue.
293 */
294void
adjustrunqueue(struct thread *td, int newpri)
296{
297	struct ksegrp *kg;
298	struct kse *ke;
299
300	mtx_assert(&sched_lock, MA_OWNED);
301	KASSERT((TD_ON_RUNQ(td)), ("adjustrunqueue: Bad state on run queue"));
302
303	ke = td->td_kse;
304	CTR1(KTR_RUNQ, "adjustrunqueue: td%p", td);
305	/*
306	 * If it is not a threaded process, take the shortcut.
307	 */
308	if ((td->td_proc->p_flag & P_HADTHREADS) == 0) {
309		/* We only care about the kse in the run queue. */
310		td->td_priority = newpri;
311		if (ke->ke_rqindex != (newpri / RQ_PPQ)) {
312			sched_rem(td);
313			sched_add(td, SRQ_BORING);
314		}
315		return;
316	}
317
318	/* It is a threaded process */
319	kg = td->td_ksegrp;
320	if (ke->ke_state == KES_ONRUNQ
321#ifdef SCHED_ULE
322	 || ((ke->ke_flags & KEF_ASSIGNED) != 0 &&
323	     (ke->ke_flags & KEF_REMOVED) == 0)
324#endif
325	   ) {
326		if (kg->kg_last_assigned == td) {
327			kg->kg_last_assigned =
328			    TAILQ_PREV(td, threadqueue, td_runq);
329		}
330		sched_rem(td);
331	}
332	TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
333	TD_SET_CAN_RUN(td);
334	td->td_priority = newpri;
335	setrunqueue(td, SRQ_BORING);
336}
337
338/*
339 * This function is called when a thread is about to be put on a
340 * ksegrp run queue because it has been made runnable or its
341 * priority has been adjusted and the ksegrp does not have a
342 * free kse slot.  It determines if a thread from the same ksegrp
343 * should be preempted.  If so, it tries to switch threads
344 * if the thread is on the same cpu or notifies another cpu that
345 * it should switch threads.
346 */
347
348static void
349maybe_preempt_in_ksegrp(struct thread *td)
350#if  !defined(SMP)
351{
352	struct thread *running_thread;
353
354	mtx_assert(&sched_lock, MA_OWNED);
355	running_thread = curthread;
356
357	if (running_thread->td_ksegrp != td->td_ksegrp)
358		return;
359
360	if (td->td_priority >= running_thread->td_priority)
361		return;
362#ifdef PREEMPTION
363#ifndef FULL_PREEMPTION
364	if (td->td_priority > PRI_MAX_ITHD) {
365		running_thread->td_flags |= TDF_NEEDRESCHED;
366		return;
367	}
368#endif /* FULL_PREEMPTION */
369
370	if (running_thread->td_critnest > 1)
371		running_thread->td_owepreempt = 1;
372	 else
373		 mi_switch(SW_INVOL, NULL);
374
375#else /* PREEMPTION */
376	running_thread->td_flags |= TDF_NEEDRESCHED;
377#endif /* PREEMPTION */
378	return;
379}
380
381#else /* SMP */
382{
383	struct thread *running_thread;
384	int worst_pri;
385	struct ksegrp *kg;
386	cpumask_t cpumask,dontuse;
387	struct pcpu *pc;
388	struct pcpu *best_pcpu;
389	struct thread *cputhread;
390
391	mtx_assert(&sched_lock, MA_OWNED);
392
393	running_thread = curthread;
394
395#if !defined(KSEG_PEEMPT_BEST_CPU)
396	if (running_thread->td_ksegrp != td->td_ksegrp) {
397#endif
398		kg = td->td_ksegrp;
399
400		/* if someone is ahead of this thread, wait our turn */
401		if (td != TAILQ_FIRST(&kg->kg_runq))
402			return;
403
404		worst_pri = td->td_priority;
405		best_pcpu = NULL;
406		dontuse   = stopped_cpus | idle_cpus_mask;
407
		/*
		 * Find the cpu with the worst (numerically highest) priority
		 * that is running a thread from the same ksegrp.  If several
		 * qualify, prefer the cpu this thread last ran on, and then
		 * the current cpu.
		 */
413
414		SLIST_FOREACH(pc, &cpuhead, pc_allcpu) {
415			cpumask   = pc->pc_cpumask;
416			cputhread = pc->pc_curthread;
417
418			if ((cpumask & dontuse)  ||
419			    cputhread->td_ksegrp != kg)
420				continue;
421
422			if (cputhread->td_priority > worst_pri) {
423				worst_pri = cputhread->td_priority;
424				best_pcpu = pc;
425				continue;
426			}
427
428			if (cputhread->td_priority == worst_pri &&
429			    best_pcpu != NULL &&
430			    (td->td_lastcpu == pc->pc_cpuid ||
431				(PCPU_GET(cpumask) == cpumask &&
432				    td->td_lastcpu != best_pcpu->pc_cpuid)))
433			    best_pcpu = pc;
434		}
435
436		/* Check if we need to preempt someone */
437		if (best_pcpu == NULL)
438			return;
439
440#if defined(IPI_PREEMPTION) && defined(PREEMPTION)
441#if !defined(FULL_PREEMPTION)
442		if (td->td_priority <= PRI_MAX_ITHD)
443#endif /* ! FULL_PREEMPTION */
444			{
445				ipi_selected(best_pcpu->pc_cpumask, IPI_PREEMPT);
446				return;
447			}
448#endif /* defined(IPI_PREEMPTION) && defined(PREEMPTION) */
449
450		if (PCPU_GET(cpuid) != best_pcpu->pc_cpuid) {
451			best_pcpu->pc_curthread->td_flags |= TDF_NEEDRESCHED;
452			ipi_selected(best_pcpu->pc_cpumask, IPI_AST);
453			return;
454		}
455#if !defined(KSEG_PEEMPT_BEST_CPU)
456	}
457#endif
458
459	if (td->td_priority >= running_thread->td_priority)
460		return;
461#ifdef PREEMPTION
462
463#if !defined(FULL_PREEMPTION)
	if (td->td_priority > PRI_MAX_ITHD) {
		running_thread->td_flags |= TDF_NEEDRESCHED;
		return;
	}
467#endif /* ! FULL_PREEMPTION */
468
469	if (running_thread->td_critnest > 1)
470		running_thread->td_owepreempt = 1;
471	 else
472		 mi_switch(SW_INVOL, NULL);
473
474#else /* PREEMPTION */
475	running_thread->td_flags |= TDF_NEEDRESCHED;
476#endif /* PREEMPTION */
477	return;
478}
479#endif /* !SMP */
480
481
482int limitcount;
483void
484setrunqueue(struct thread *td, int flags)
485{
486	struct ksegrp *kg;
487	struct thread *td2;
488	struct thread *tda;
489
490	CTR3(KTR_RUNQ, "setrunqueue: td:%p kg:%p pid:%d",
491	    td, td->td_ksegrp, td->td_proc->p_pid);
492	CTR5(KTR_SCHED, "setrunqueue: %p(%s) prio %d by %p(%s)",
493            td, td->td_proc->p_comm, td->td_priority, curthread,
494            curthread->td_proc->p_comm);
495	mtx_assert(&sched_lock, MA_OWNED);
496	KASSERT((td->td_inhibitors == 0),
			("setrunqueue: trying to run inhibited thread"));
498	KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
499	    ("setrunqueue: bad thread state"));
500	TD_SET_RUNQ(td);
501	kg = td->td_ksegrp;
502	if ((td->td_proc->p_flag & P_HADTHREADS) == 0) {
503		/*
504		 * Common path optimisation: Only one of everything
505		 * and the KSE is always already attached.
506		 * Totally ignore the ksegrp run queue.
507		 */
508		if (kg->kg_avail_opennings != 1) {
509			if (limitcount < 1) {
510				limitcount++;
511				printf("pid %d: corrected slot count (%d->1)\n",
512				    td->td_proc->p_pid, kg->kg_avail_opennings);
513
514			}
515			kg->kg_avail_opennings = 1;
516		}
517		sched_add(td, flags);
518		return;
519	}
520
	/*
	 * If the concurrency has been reduced, and we would land in the
	 * assigned section, then keep removing entries from the
	 * system run queue until we are no longer in that section
	 * or there is room for us to be put in that section.
	 * What we MUST avoid is the case where threads of lower
	 * priority than the new one are scheduled, but the new one cannot
	 * be scheduled itself. That would lead to a non-contiguous set
	 * of scheduled threads, and everything would break.
	 */
531	tda = kg->kg_last_assigned;
532	while ((kg->kg_avail_opennings <= 0) &&
533	    (tda && (tda->td_priority > td->td_priority))) {
534		/*
535		 * None free, but there is one we can commandeer.
536		 */
537		CTR2(KTR_RUNQ,
538		    "setrunqueue: kg:%p: take slot from td: %p", kg, tda);
539		sched_rem(tda);
540		tda = kg->kg_last_assigned =
541		    TAILQ_PREV(tda, threadqueue, td_runq);
542	}
543
544	/*
545	 * Add the thread to the ksegrp's run queue at
546	 * the appropriate place.
547	 */
548	TAILQ_FOREACH(td2, &kg->kg_runq, td_runq) {
549		if (td2->td_priority > td->td_priority) {
550			TAILQ_INSERT_BEFORE(td2, td, td_runq);
551			break;
552		}
553	}
554	if (td2 == NULL) {
555		/* We ran off the end of the TAILQ or it was empty. */
556		TAILQ_INSERT_TAIL(&kg->kg_runq, td, td_runq);
557	}
558
	/*
	 * If we have a slot to use, then put the thread on the system
	 * run queue and, if needed, readjust the last_assigned pointer.
	 * It may be that we need to schedule something anyhow,
	 * even if the available slot count is negative, so that
	 * all the items before last_assigned are scheduled.
	 */
566	if (kg->kg_avail_opennings > 0) {
567		if (tda == NULL) {
568			/*
569			 * No pre-existing last assigned so whoever is first
570			 * gets the slot.. (maybe us)
571			 */
572			td2 = TAILQ_FIRST(&kg->kg_runq);
573			kg->kg_last_assigned = td2;
574		} else if (tda->td_priority > td->td_priority) {
575			td2 = td;
576		} else {
577			/*
578			 * We are past last_assigned, so
579			 * give the next slot to whatever is next,
580			 * which may or may not be us.
581			 */
582			td2 = TAILQ_NEXT(tda, td_runq);
583			kg->kg_last_assigned = td2;
584		}
585		sched_add(td2, flags);
586	} else {
587		CTR3(KTR_RUNQ, "setrunqueue: held: td%p kg%p pid%d",
588			td, td->td_ksegrp, td->td_proc->p_pid);
589		if ((flags & SRQ_YIELDING) == 0)
590			maybe_preempt_in_ksegrp(td);
591	}
592}
593
594/*
595 * Kernel thread preemption implementation.  Critical sections mark
596 * regions of code in which preemptions are not allowed.
597 */
598void
599critical_enter(void)
600{
601	struct thread *td;
602
603	td = curthread;
604	td->td_critnest++;
605	CTR4(KTR_CRITICAL, "critical_enter by thread %p (%ld, %s) to %d", td,
606	    (long)td->td_proc->p_pid, td->td_proc->p_comm, td->td_critnest);
607}
608
609void
610critical_exit(void)
611{
612	struct thread *td;
613
614	td = curthread;
615	KASSERT(td->td_critnest != 0,
616	    ("critical_exit: td_critnest == 0"));
617#ifdef PREEMPTION
618	if (td->td_critnest == 1) {
619		td->td_critnest = 0;
620		mtx_assert(&sched_lock, MA_NOTOWNED);
621		if (td->td_owepreempt) {
622			td->td_critnest = 1;
623			mtx_lock_spin(&sched_lock);
624			td->td_critnest--;
625			mi_switch(SW_INVOL, NULL);
626			mtx_unlock_spin(&sched_lock);
627		}
628	} else
629#endif
630		td->td_critnest--;
631
632	CTR4(KTR_CRITICAL, "critical_exit by thread %p (%ld, %s) to %d", td,
633	    (long)td->td_proc->p_pid, td->td_proc->p_comm, td->td_critnest);
634}
635
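/*
 * Usage sketch (not compiled in): the canonical pattern for the pair
 * above.  Holding a critical section blocks preemption, so the thread
 * stays on its CPU for the duration and per-CPU state can be updated
 * safely.  The PCPU field name used here is made up for illustration.
 */
#if 0
static void
example_pcpu_update(void)
{

	critical_enter();
	PCPU_SET(example_count, PCPU_GET(example_count) + 1);
	critical_exit();
}
#endif
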
636/*
637 * This function is called when a thread is about to be put on run queue
638 * because it has been made runnable or its priority has been adjusted.  It
639 * determines if the new thread should be immediately preempted to.  If so,
640 * it switches to it and eventually returns true.  If not, it returns false
641 * so that the caller may place the thread on an appropriate run queue.
642 */
643int
644maybe_preempt(struct thread *td)
645{
646#ifdef PREEMPTION
647	struct thread *ctd;
648	int cpri, pri;
649#endif
650
651	mtx_assert(&sched_lock, MA_OWNED);
652#ifdef PREEMPTION
653	/*
654	 * The new thread should not preempt the current thread if any of the
655	 * following conditions are true:
656	 *
657	 *  - The kernel is in the throes of crashing (panicstr).
658	 *  - The current thread has a higher (numerically lower) or
659	 *    equivalent priority.  Note that this prevents curthread from
660	 *    trying to preempt to itself.
661	 *  - It is too early in the boot for context switches (cold is set).
662	 *  - The current thread has an inhibitor set or is in the process of
663	 *    exiting.  In this case, the current thread is about to switch
664	 *    out anyways, so there's no point in preempting.  If we did,
665	 *    the current thread would not be properly resumed as well, so
666	 *    just avoid that whole landmine.
667	 *  - If the new thread's priority is not a realtime priority and
668	 *    the current thread's priority is not an idle priority and
669	 *    FULL_PREEMPTION is disabled.
670	 *
671	 * If all of these conditions are false, but the current thread is in
672	 * a nested critical section, then we have to defer the preemption
673	 * until we exit the critical section.  Otherwise, switch immediately
674	 * to the new thread.
675	 */
676	ctd = curthread;
	KASSERT((ctd->td_kse != NULL && ctd->td_kse->ke_thread == ctd),
	    ("thread has no (or wrong) sched-private part."));
	KASSERT((td->td_inhibitors == 0),
	    ("maybe_preempt: trying to run inhibited thread"));
681	pri = td->td_priority;
682	cpri = ctd->td_priority;
683	if (panicstr != NULL || pri >= cpri || cold /* || dumping */ ||
684	    TD_IS_INHIBITED(ctd) || td->td_kse->ke_state != KES_THREAD)
685		return (0);
686#ifndef FULL_PREEMPTION
687	if (pri > PRI_MAX_ITHD && cpri < PRI_MIN_IDLE)
688		return (0);
689#endif
690
691	if (ctd->td_critnest > 1) {
692		CTR1(KTR_PROC, "maybe_preempt: in critical section %d",
693		    ctd->td_critnest);
694		ctd->td_owepreempt = 1;
695		return (0);
696	}
697
698	/*
699	 * Thread is runnable but not yet put on system run queue.
700	 */
701	MPASS(TD_ON_RUNQ(td));
702	MPASS(td->td_sched->ke_state != KES_ONRUNQ);
703	if (td->td_proc->p_flag & P_HADTHREADS) {
704		/*
705		 * If this is a threaded process we actually ARE on the
706		 * ksegrp run queue so take it off that first.
707		 * Also undo any damage done to the last_assigned pointer.
708		 * XXX Fix setrunqueue so this isn't needed
709		 */
710		struct ksegrp *kg;
711
712		kg = td->td_ksegrp;
713		if (kg->kg_last_assigned == td)
714			kg->kg_last_assigned =
715			    TAILQ_PREV(td, threadqueue, td_runq);
716		TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
717	}
718
719	TD_SET_RUNNING(td);
720	CTR3(KTR_PROC, "preempting to thread %p (pid %d, %s)\n", td,
721	    td->td_proc->p_pid, td->td_proc->p_comm);
722	mi_switch(SW_INVOL|SW_PREEMPT, td);
723	return (1);
724#else
725	return (0);
726#endif
727}
728
729#if 0
730#ifndef PREEMPTION
731/* XXX: There should be a non-static version of this. */
732static void
733printf_caddr_t(void *data)
734{
735	printf("%s", (char *)data);
736}
737static char preempt_warning[] =
738    "WARNING: Kernel preemption is disabled, expect reduced performance.\n";
739SYSINIT(preempt_warning, SI_SUB_COPYRIGHT, SI_ORDER_ANY, printf_caddr_t,
740    preempt_warning)
741#endif
742#endif
743
744/************************************************************************
745 * SYSTEM RUN QUEUE manipulations and tests				*
746 ************************************************************************/
747/*
748 * Initialize a run structure.
749 */
750void
751runq_init(struct runq *rq)
752{
753	int i;
754
755	bzero(rq, sizeof *rq);
756	for (i = 0; i < RQ_NQS; i++)
757		TAILQ_INIT(&rq->rq_queues[i]);
758}
759
760/*
761 * Clear the status bit of the queue corresponding to priority level pri,
762 * indicating that it is empty.
763 */
764static __inline void
765runq_clrbit(struct runq *rq, int pri)
766{
767	struct rqbits *rqb;
768
769	rqb = &rq->rq_status;
770	CTR4(KTR_RUNQ, "runq_clrbit: bits=%#x %#x bit=%#x word=%d",
771	    rqb->rqb_bits[RQB_WORD(pri)],
772	    rqb->rqb_bits[RQB_WORD(pri)] & ~RQB_BIT(pri),
773	    RQB_BIT(pri), RQB_WORD(pri));
774	rqb->rqb_bits[RQB_WORD(pri)] &= ~RQB_BIT(pri);
775}
776
777/*
778 * Find the index of the first non-empty run queue.  This is done by
779 * scanning the status bits, a set bit indicates a non-empty queue.
780 */
781static __inline int
782runq_findbit(struct runq *rq)
783{
784	struct rqbits *rqb;
785	int pri;
786	int i;
787
788	rqb = &rq->rq_status;
789	for (i = 0; i < RQB_LEN; i++)
790		if (rqb->rqb_bits[i]) {
791			pri = RQB_FFS(rqb->rqb_bits[i]) + (i << RQB_L2BPW);
792			CTR3(KTR_RUNQ, "runq_findbit: bits=%#x i=%d pri=%d",
793			    rqb->rqb_bits[i], i, pri);
794			return (pri);
795		}
796
797	return (-1);
798}
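
/*
 * Worked example (not compiled in): how a priority lands in the status
 * bits scanned above.  Assuming the stock RQ_PPQ of 4 and 32-bit status
 * words, a thread priority of 100 shares run queue index 25 with
 * priorities 100..103, which is bit 25 of word 0; runq_findbit() then
 * recovers 25 as RQB_FFS(word 0) + (0 << RQB_L2BPW).
 */
#if 0
static void
runq_bitmap_example(void)
{
	int pri = 100 / RQ_PPQ;			/* queue index 25 */

	KASSERT(RQB_WORD(pri) == 0, ("index 25 lives in word 0"));
	KASSERT(RQB_BIT(pri) == (1 << 25), ("index 25 is bit 25"));
}
#endif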
799
800/*
801 * Set the status bit of the queue corresponding to priority level pri,
802 * indicating that it is non-empty.
803 */
804static __inline void
805runq_setbit(struct runq *rq, int pri)
806{
807	struct rqbits *rqb;
808
809	rqb = &rq->rq_status;
810	CTR4(KTR_RUNQ, "runq_setbit: bits=%#x %#x bit=%#x word=%d",
811	    rqb->rqb_bits[RQB_WORD(pri)],
812	    rqb->rqb_bits[RQB_WORD(pri)] | RQB_BIT(pri),
813	    RQB_BIT(pri), RQB_WORD(pri));
814	rqb->rqb_bits[RQB_WORD(pri)] |= RQB_BIT(pri);
815}
816
817/*
818 * Add the KSE to the queue specified by its priority, and set the
819 * corresponding status bit.
820 */
821void
822runq_add(struct runq *rq, struct kse *ke, int flags)
823{
824	struct rqhead *rqh;
825	int pri;
826
827	pri = ke->ke_thread->td_priority / RQ_PPQ;
828	ke->ke_rqindex = pri;
829	runq_setbit(rq, pri);
830	rqh = &rq->rq_queues[pri];
831	CTR5(KTR_RUNQ, "runq_add: td=%p ke=%p pri=%d %d rqh=%p",
832	    ke->ke_thread, ke, ke->ke_thread->td_priority, pri, rqh);
833	if (flags & SRQ_PREEMPTED) {
834		TAILQ_INSERT_HEAD(rqh, ke, ke_procq);
835	} else {
836		TAILQ_INSERT_TAIL(rqh, ke, ke_procq);
837	}
838}
839
840/*
841 * Return true if there are runnable processes of any priority on the run
842 * queue, false otherwise.  Has no side effects, does not modify the run
843 * queue structure.
844 */
845int
846runq_check(struct runq *rq)
847{
848	struct rqbits *rqb;
849	int i;
850
851	rqb = &rq->rq_status;
852	for (i = 0; i < RQB_LEN; i++)
853		if (rqb->rqb_bits[i]) {
854			CTR2(KTR_RUNQ, "runq_check: bits=%#x i=%d",
855			    rqb->rqb_bits[i], i);
856			return (1);
857		}
858	CTR0(KTR_RUNQ, "runq_check: empty");
859
860	return (0);
861}
862
863#if defined(SMP) && defined(SCHED_4BSD)
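/*
 * runq_fuzz > 1 lets runq_choose() look that many entries deep in the
 * highest-priority queue for a thread whose last CPU is the current one;
 * the default of 1 keeps strict FIFO order within the queue.
 */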
864int runq_fuzz = 1;
865SYSCTL_INT(_kern_sched, OID_AUTO, runq_fuzz, CTLFLAG_RW, &runq_fuzz, 0, "");
866#endif
867
868/*
869 * Find the highest priority process on the run queue.
870 */
871struct kse *
872runq_choose(struct runq *rq)
873{
874	struct rqhead *rqh;
875	struct kse *ke;
876	int pri;
877
878	mtx_assert(&sched_lock, MA_OWNED);
879	while ((pri = runq_findbit(rq)) != -1) {
880		rqh = &rq->rq_queues[pri];
881#if defined(SMP) && defined(SCHED_4BSD)
882		/* fuzz == 1 is normal.. 0 or less are ignored */
883		if (runq_fuzz > 1) {
884			/*
885			 * In the first couple of entries, check if
886			 * there is one for our CPU as a preference.
887			 */
888			int count = runq_fuzz;
889			int cpu = PCPU_GET(cpuid);
890			struct kse *ke2;
891			ke2 = ke = TAILQ_FIRST(rqh);
892
893			while (count-- && ke2) {
				/* Prefer an entry that last ran on our cpu. */
				if (ke2->ke_thread->td_lastcpu == cpu) {
895					ke = ke2;
896					break;
897				}
898				ke2 = TAILQ_NEXT(ke2, ke_procq);
899			}
900		} else
901#endif
902			ke = TAILQ_FIRST(rqh);
903		KASSERT(ke != NULL, ("runq_choose: no proc on busy queue"));
904		CTR3(KTR_RUNQ,
905		    "runq_choose: pri=%d kse=%p rqh=%p", pri, ke, rqh);
906		return (ke);
907	}
908	CTR1(KTR_RUNQ, "runq_choose: idleproc pri=%d", pri);
909
910	return (NULL);
911}
912
913/*
914 * Remove the KSE from the queue specified by its priority, and clear the
915 * corresponding status bit if the queue becomes empty.
916 * Caller must set ke->ke_state afterwards.
917 */
918void
919runq_remove(struct runq *rq, struct kse *ke)
920{
921	struct rqhead *rqh;
922	int pri;
923
924	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
925		("runq_remove: process swapped out"));
926	pri = ke->ke_rqindex;
927	rqh = &rq->rq_queues[pri];
928	CTR5(KTR_RUNQ, "runq_remove: td=%p, ke=%p pri=%d %d rqh=%p",
929	    ke->ke_thread, ke, ke->ke_thread->td_priority, pri, rqh);
930	KASSERT(ke != NULL, ("runq_remove: no proc on busy queue"));
931	TAILQ_REMOVE(rqh, ke, ke_procq);
932	if (TAILQ_EMPTY(rqh)) {
933		CTR0(KTR_RUNQ, "runq_remove: empty");
934		runq_clrbit(rq, pri);
935	}
936}
937
938/****** functions that are temporarily here ***********/
939#include <vm/uma.h>
940extern struct mtx kse_zombie_lock;
941
942/*
943 *  Allocate scheduler specific per-process resources.
944 * The thread and ksegrp have already been linked in.
945 * In this case just set the default concurrency value.
946 *
947 * Called from:
948 *  proc_init() (UMA init method)
949 */
950void
951sched_newproc(struct proc *p, struct ksegrp *kg, struct thread *td)
952{
953
954	/* This can go in sched_fork */
955	sched_init_concurrency(kg);
956}
957
958/*
 * A thread is either being created or recycled.
960 * Fix up the per-scheduler resources associated with it.
961 * Called from:
962 *  sched_fork_thread()
963 *  thread_dtor()  (*may go away)
964 *  thread_init()  (*may go away)
965 */
966void
967sched_newthread(struct thread *td)
968{
969	struct td_sched *ke;
970
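	/* The scheduler-private state sits directly after struct thread. */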
971	ke = (struct td_sched *) (td + 1);
972	bzero(ke, sizeof(*ke));
973	td->td_sched     = ke;
974	ke->ke_thread	= td;
975	ke->ke_state	= KES_THREAD;
976}
977
978/*
979 * Set up an initial concurrency of 1
980 * and set the given thread (if given) to be using that
981 * concurrency slot.
 * May be used "offline", i.e. before the ksegrp is attached to the world,
 * and thus wouldn't need schedlock in that case.
984 * Called from:
985 *  thr_create()
986 *  proc_init() (UMA) via sched_newproc()
987 */
988void
989sched_init_concurrency(struct ksegrp *kg)
990{
991
992	CTR1(KTR_RUNQ,"kg %p init slots and concurrency to 1", kg);
993	kg->kg_concurrency = 1;
994	kg->kg_avail_opennings = 1;
995}
996
997/*
998 * Change the concurrency of an existing ksegrp to N
999 * Called from:
1000 *  kse_create()
1001 *  kse_exit()
1002 *  thread_exit()
1003 *  thread_single()
1004 */
1005void
1006sched_set_concurrency(struct ksegrp *kg, int concurrency)
1007{
1008
1009	CTR4(KTR_RUNQ,"kg %p set concurrency to %d, slots %d -> %d",
1010	    kg,
1011	    concurrency,
1012	    kg->kg_avail_opennings,
1013	    kg->kg_avail_opennings + (concurrency - kg->kg_concurrency));
1014	kg->kg_avail_opennings += (concurrency - kg->kg_concurrency);
1015	kg->kg_concurrency = concurrency;
1016}
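
/*
 * Arithmetic sketch (not compiled in): resizing a ksegrp that currently
 * has kg_concurrency == 2 with one slot in use (kg_avail_opennings == 1)
 * to a concurrency of 4 leaves 1 + (4 - 2) == 3 slots free; the slot
 * already in use is unaffected.  The helper below only restates that.
 */
#if 0
static void
concurrency_example(struct ksegrp *kg)
{

	/* Precondition for this example: concurrency 2, one slot free. */
	sched_set_concurrency(kg, 4);
	KASSERT(kg->kg_avail_opennings == 3,
	    ("one slot stays busy, three are now free"));
}
#endif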
1017
1018/*
 * Called from thread_exit() for all exiting threads
1020 *
1021 * Not to be confused with sched_exit_thread()
1022 * that is only called from thread_exit() for threads exiting
1023 * without the rest of the process exiting because it is also called from
1024 * sched_exit() and we wouldn't want to call it twice.
1025 * XXX This can probably be fixed.
1026 */
1027void
1028sched_thread_exit(struct thread *td)
1029{
1030
1031	SLOT_RELEASE(td->td_ksegrp);
1032	slot_fill(td->td_ksegrp);
1033}
1034
1035#endif /* KERN_SWITCH_INCLUDE */
1036