kern_switch.c revision 153797
1/*-
2 * Copyright (c) 2001 Jake Burkholder <jake@FreeBSD.org>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
/***
Here is the logic:

If there are N processors, then there are at most N KSEs (kernel
schedulable entities) working to process threads that belong to a
KSEGROUP (kg). If X of these KSEs are actually running at the
moment in question, then there are at most M = (N - X) of these KSEs on
the run queue, as running KSEs are not on the queue.

Runnable threads are queued off the KSEGROUP in priority order.
If there are M or more threads runnable, the top M threads
(by priority) are 'preassigned' to the M KSEs not running. The KSEs take
their priority from those threads and are put on the run queue.

The last thread that had a priority high enough to have a KSE associated
with it, AND IS ON THE RUN QUEUE, is pointed to by
kg->kg_last_assigned. If no threads queued off the KSEGROUP have KSEs
assigned, either because all the available KSEs are actively running or
because there are no threads queued, that pointer is NULL.

When a KSE is removed from the run queue in order to run, we know it was
associated with the highest priority thread in the queue (at the head
of the queue). If it is also the last assigned, we know M was 1 and must
now be 0. Since the thread is no longer queued, kg_last_assigned must no
longer point to it. Since we know there were no more KSEs available
(M was 1 and is now 0), and since we are not FREEING our KSE
but using it, we know there are STILL no more KSEs available, so the
next thread in the ksegrp list will not have a KSE to assign to it,
and the pointer must therefore be made 'invalid' (NULL).

The pointer exists so that when a new thread is made runnable, its
priority can be compared with that of the last assigned thread to see
whether it should 'steal' that thread's KSE, i.e. whether it belongs
'earlier' on the list than that thread or later. If it is earlier, the
KSE is removed from the last assigned thread (which is then no longer
assigned a KSE) and reassigned to the new thread, which is placed earlier
in the list. The pointer is then backed up to the previous thread (which
may or may not be the new thread).

When a thread sleeps or is removed, its KSE becomes available, and if there
are queued threads that are not assigned KSEs, the highest priority one of
them is assigned the KSE, which is then placed back on the run queue at
the appropriate place, and the kg->kg_last_assigned pointer is adjusted down
to point to it.

The following diagram shows 2 KSEs and 3 threads from a single process.

 RUNQ: --->KSE---KSE--...    (KSEs queued at priorities from threads)
              \    \____
               \        \
    KSEGROUP---thread--thread--thread    (queued in priority order)
        \                 /
         \_______________/
          (last_assigned)

The result of this scheme is that the M available KSEs are always
queued at the priorities they have inherited from the M highest priority
threads for that KSEGROUP. If this situation changes, the KSEs are
reassigned to keep this true.
***/
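
/*
 * The compiled-out sketch below is only an illustration of the invariant
 * described above: with M available slots, the M highest-priority runnable
 * threads are the 'assigned' ones and last_assigned points at the last of
 * them.  It deliberately uses made-up toy types (toy_thread, toy_ksegrp)
 * rather than the real struct thread/ksegrp, so it can be read (or built
 * as a userland program) in isolation; it is not part of the kernel.
 */
#if 0
#include <stdio.h>

struct toy_thread {
	int			 pri;	/* lower value == higher priority */
	struct toy_thread	*next;	/* priority-ordered ksegrp run queue */
};

struct toy_ksegrp {
	struct toy_thread	*runq;		/* head, best priority first */
	struct toy_thread	*last_assigned;	/* last thread holding a slot */
	int			 avail_slots;	/* M: slots not in use */
};

/* Hand the available slots to the first avail_slots queued threads. */
static void
toy_assign_slots(struct toy_ksegrp *kg)
{
	struct toy_thread *td;
	int m;

	kg->last_assigned = NULL;
	m = kg->avail_slots;
	for (td = kg->runq; td != NULL && m > 0; td = td->next, m--)
		kg->last_assigned = td;	/* this thread owns a slot */
}

int
main(void)
{
	struct toy_thread c = { 30, NULL };
	struct toy_thread b = { 20, &c };
	struct toy_thread a = { 10, &b };
	struct toy_ksegrp kg = { &a, NULL, 2 };

	toy_assign_slots(&kg);
	/* With 2 slots, threads a and b get them; last_assigned is b. */
	printf("last_assigned pri = %d\n", kg.last_assigned->pri);
	return (0);
}
#endif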
87
88#include <sys/cdefs.h>
89__FBSDID("$FreeBSD: head/sys/kern/kern_switch.c 153797 2005-12-28 17:13:31Z kan $");
90
91#include "opt_sched.h"
92
93#ifndef KERN_SWITCH_INCLUDE
94#include <sys/param.h>
95#include <sys/systm.h>
96#include <sys/kdb.h>
97#include <sys/kernel.h>
98#include <sys/ktr.h>
99#include <sys/lock.h>
100#include <sys/mutex.h>
101#include <sys/proc.h>
102#include <sys/queue.h>
103#include <sys/sched.h>
104#else  /* KERN_SWITCH_INCLUDE */
105#if defined(SMP) && (defined(__i386__) || defined(__amd64__))
106#include <sys/smp.h>
107#endif
108#if defined(SMP) && defined(SCHED_4BSD)
109#include <sys/sysctl.h>
110#endif
111
/* Set the #if below to 1 to enable logging of critical_enter/exit. */
113#if 0
114#define	KTR_CRITICAL	KTR_SCHED
115#else
116#define	KTR_CRITICAL	0
117#endif
118
119#ifdef FULL_PREEMPTION
120#ifndef PREEMPTION
121#error "The FULL_PREEMPTION option requires the PREEMPTION option"
122#endif
123#endif
124
125CTASSERT((RQB_BPW * RQB_LEN) == RQ_NQS);
126
127#define td_kse td_sched
128
129/*
130 * kern.sched.preemption allows user space to determine if preemption support
131 * is compiled in or not.  It is not currently a boot or runtime flag that
132 * can be changed.
133 */
134#ifdef PREEMPTION
135static int kern_sched_preemption = 1;
136#else
137static int kern_sched_preemption = 0;
138#endif
139SYSCTL_INT(_kern_sched, OID_AUTO, preemption, CTLFLAG_RD,
140    &kern_sched_preemption, 0, "Kernel preemption enabled");
141
142/************************************************************************
143 * Functions that manipulate runnability from a thread perspective.	*
144 ************************************************************************/
145/*
146 * Select the KSE that will be run next.  From that find the thread, and
 * remove it from the KSEGRP's run queue.  If thread clustering is ever
 * implemented, this is where it would be done.
149 */
150struct thread *
151choosethread(void)
152{
153	struct kse *ke;
154	struct thread *td;
155	struct ksegrp *kg;
156
157#if defined(SMP) && (defined(__i386__) || defined(__amd64__))
158	if (smp_active == 0 && PCPU_GET(cpuid) != 0) {
		/* Shutting down, run idlethread on APs */
160		td = PCPU_GET(idlethread);
161		ke = td->td_kse;
162		CTR1(KTR_RUNQ, "choosethread: td=%p (idle)", td);
163		ke->ke_flags |= KEF_DIDRUN;
164		TD_SET_RUNNING(td);
165		return (td);
166	}
167#endif
168
169retry:
170	ke = sched_choose();
171	if (ke) {
172		td = ke->ke_thread;
173		KASSERT((td->td_kse == ke), ("kse/thread mismatch"));
174		kg = ke->ke_ksegrp;
175		if (td->td_proc->p_flag & P_HADTHREADS) {
176			if (kg->kg_last_assigned == td) {
177				kg->kg_last_assigned = TAILQ_PREV(td,
178				    threadqueue, td_runq);
179			}
180			TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
181		}
182		CTR2(KTR_RUNQ, "choosethread: td=%p pri=%d",
183		    td, td->td_priority);
184	} else {
185		/* Simulate runq_choose() having returned the idle thread */
186		td = PCPU_GET(idlethread);
187		ke = td->td_kse;
188		CTR1(KTR_RUNQ, "choosethread: td=%p (idle)", td);
189	}
190	ke->ke_flags |= KEF_DIDRUN;
191
192	/*
193	 * If we are in panic, only allow system threads,
194	 * plus the one we are running in, to be run.
195	 */
196	if (panicstr && ((td->td_proc->p_flag & P_SYSTEM) == 0 &&
197	    (td->td_flags & TDF_INPANIC) == 0)) {
198		/* note that it is no longer on the run queue */
199		TD_SET_CAN_RUN(td);
200		goto retry;
201	}
202
203	TD_SET_RUNNING(td);
204	return (td);
205}
206
207/*
 * Given a surplus system slot, try to assign a new runnable thread to it.
 * Called from:
 *  sched_thread_exit()  (local)
 *  sched_switch()  (local)
 *  remrunqueue()  (local)  (not at the moment)
214 */
215static void
216slot_fill(struct ksegrp *kg)
217{
218	struct thread *td;
219
220	mtx_assert(&sched_lock, MA_OWNED);
221	while (kg->kg_avail_opennings > 0) {
222		/*
223		 * Find the first unassigned thread
224		 */
225		if ((td = kg->kg_last_assigned) != NULL)
226			td = TAILQ_NEXT(td, td_runq);
227		else
228			td = TAILQ_FIRST(&kg->kg_runq);
229
230		/*
231		 * If we found one, send it to the system scheduler.
232		 */
233		if (td) {
234			kg->kg_last_assigned = td;
235			sched_add(td, SRQ_YIELDING);
236			CTR2(KTR_RUNQ, "slot_fill: td%p -> kg%p", td, kg);
237		} else {
238			/* no threads to use up the slots. quit now */
239			break;
240		}
241	}
242}
243
244#ifdef	SCHED_4BSD
245/*
246 * Remove a thread from its KSEGRP's run queue.
247 * This in turn may remove it from a KSE if it was already assigned
248 * to one, possibly causing a new thread to be assigned to the KSE
249 * and the KSE getting a new priority.
250 */
251static void
252remrunqueue(struct thread *td)
253{
254	struct thread *td2, *td3;
255	struct ksegrp *kg;
256	struct kse *ke;
257
258	mtx_assert(&sched_lock, MA_OWNED);
259	KASSERT((TD_ON_RUNQ(td)), ("remrunqueue: Bad state on run queue"));
260	kg = td->td_ksegrp;
261	ke = td->td_kse;
262	CTR1(KTR_RUNQ, "remrunqueue: td%p", td);
263	TD_SET_CAN_RUN(td);
264	/*
265	 * If it is not a threaded process, take the shortcut.
266	 */
267	if ((td->td_proc->p_flag & P_HADTHREADS) == 0) {
		/* remove from the system run queue and free up a slot */
269		sched_rem(td);
270		ke->ke_state = KES_THREAD;
271		return;
272	}
273   	td3 = TAILQ_PREV(td, threadqueue, td_runq);
274	TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
275	if (ke->ke_state == KES_ONRUNQ) {
276		/*
277		 * This thread has been assigned to the system run queue.
		 * We need to dissociate it and try to assign the
279		 * KSE to the next available thread. Then, we should
280		 * see if we need to move the KSE in the run queues.
281		 */
282		sched_rem(td);
283		ke->ke_state = KES_THREAD;
284		td2 = kg->kg_last_assigned;
285		KASSERT((td2 != NULL), ("last assigned has wrong value"));
286		if (td2 == td)
287			kg->kg_last_assigned = td3;
288		/* slot_fill(kg); */ /* will replace it with another */
289	}
290}
291#endif
292
293/*
294 * Change the priority of a thread that is on the run queue.
295 */
296void
adjustrunqueue(struct thread *td, int newpri)
298{
299	struct ksegrp *kg;
300	struct kse *ke;
301
302	mtx_assert(&sched_lock, MA_OWNED);
303	KASSERT((TD_ON_RUNQ(td)), ("adjustrunqueue: Bad state on run queue"));
304
305	ke = td->td_kse;
306	CTR1(KTR_RUNQ, "adjustrunqueue: td%p", td);
307	/*
308	 * If it is not a threaded process, take the shortcut.
309	 */
310	if ((td->td_proc->p_flag & P_HADTHREADS) == 0) {
311		/* We only care about the kse in the run queue. */
312		td->td_priority = newpri;
313		if (ke->ke_rqindex != (newpri / RQ_PPQ)) {
314			sched_rem(td);
315			sched_add(td, SRQ_BORING);
316		}
317		return;
318	}
319
320	/* It is a threaded process */
321	kg = td->td_ksegrp;
322	if (ke->ke_state == KES_ONRUNQ
323#ifdef SCHED_ULE
324	 || ((ke->ke_flags & KEF_ASSIGNED) != 0 &&
325	     (ke->ke_flags & KEF_REMOVED) == 0)
326#endif
327	   ) {
328		if (kg->kg_last_assigned == td) {
329			kg->kg_last_assigned =
330			    TAILQ_PREV(td, threadqueue, td_runq);
331		}
332		sched_rem(td);
333	}
334	TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
335	TD_SET_CAN_RUN(td);
336	td->td_priority = newpri;
337	setrunqueue(td, SRQ_BORING);
338}
339
340/*
341 * This function is called when a thread is about to be put on a
342 * ksegrp run queue because it has been made runnable or its
343 * priority has been adjusted and the ksegrp does not have a
344 * free kse slot.  It determines if a thread from the same ksegrp
 * should be preempted.  If so, it switches threads directly when the
 * thread to be preempted is running on the current cpu, or notifies the
 * cpu running that thread that it should switch threads.
348 */
349
350static void
351maybe_preempt_in_ksegrp(struct thread *td)
352#if  !defined(SMP)
353{
354	struct thread *running_thread;
355
356	mtx_assert(&sched_lock, MA_OWNED);
357	running_thread = curthread;
358
359	if (running_thread->td_ksegrp != td->td_ksegrp)
360		return;
361
362	if (td->td_priority >= running_thread->td_priority)
363		return;
364#ifdef PREEMPTION
365#ifndef FULL_PREEMPTION
366	if (td->td_priority > PRI_MAX_ITHD) {
367		running_thread->td_flags |= TDF_NEEDRESCHED;
368		return;
369	}
370#endif /* FULL_PREEMPTION */
371
372	if (running_thread->td_critnest > 1)
373		running_thread->td_owepreempt = 1;
	else
		mi_switch(SW_INVOL, NULL);
376
377#else /* PREEMPTION */
378	running_thread->td_flags |= TDF_NEEDRESCHED;
379#endif /* PREEMPTION */
380	return;
381}
382
383#else /* SMP */
384{
385	struct thread *running_thread;
386	int worst_pri;
387	struct ksegrp *kg;
388	cpumask_t cpumask,dontuse;
389	struct pcpu *pc;
390	struct pcpu *best_pcpu;
391	struct thread *cputhread;
392
393	mtx_assert(&sched_lock, MA_OWNED);
394
395	running_thread = curthread;
396
397#if !defined(KSEG_PEEMPT_BEST_CPU)
398	if (running_thread->td_ksegrp != td->td_ksegrp) {
399#endif
400		kg = td->td_ksegrp;
401
402		/* if someone is ahead of this thread, wait our turn */
403		if (td != TAILQ_FIRST(&kg->kg_runq))
404			return;
405
406		worst_pri = td->td_priority;
407		best_pcpu = NULL;
408		dontuse   = stopped_cpus | idle_cpus_mask;
409
410		/*
		 * Find the cpu running the worst-priority thread from the
		 * same ksegrp.  If several qualify, prefer first the thread's
		 * last-run cpu and then the current cpu.
414		 */
415
416		SLIST_FOREACH(pc, &cpuhead, pc_allcpu) {
417			cpumask   = pc->pc_cpumask;
418			cputhread = pc->pc_curthread;
419
420			if ((cpumask & dontuse)  ||
421			    cputhread->td_ksegrp != kg)
422				continue;
423
424			if (cputhread->td_priority > worst_pri) {
425				worst_pri = cputhread->td_priority;
426				best_pcpu = pc;
427				continue;
428			}
429
430			if (cputhread->td_priority == worst_pri &&
431			    best_pcpu != NULL &&
432			    (td->td_lastcpu == pc->pc_cpuid ||
433				(PCPU_GET(cpumask) == cpumask &&
434				    td->td_lastcpu != best_pcpu->pc_cpuid)))
435			    best_pcpu = pc;
436		}
437
438		/* Check if we need to preempt someone */
439		if (best_pcpu == NULL)
440			return;
441
442#if defined(IPI_PREEMPTION) && defined(PREEMPTION)
443#if !defined(FULL_PREEMPTION)
444		if (td->td_priority <= PRI_MAX_ITHD)
445#endif /* ! FULL_PREEMPTION */
446			{
447				ipi_selected(best_pcpu->pc_cpumask, IPI_PREEMPT);
448				return;
449			}
450#endif /* defined(IPI_PREEMPTION) && defined(PREEMPTION) */
451
452		if (PCPU_GET(cpuid) != best_pcpu->pc_cpuid) {
453			best_pcpu->pc_curthread->td_flags |= TDF_NEEDRESCHED;
454			ipi_selected(best_pcpu->pc_cpumask, IPI_AST);
455			return;
456		}
457#if !defined(KSEG_PEEMPT_BEST_CPU)
458	}
459#endif
460
461	if (td->td_priority >= running_thread->td_priority)
462		return;
463#ifdef PREEMPTION
464
465#if !defined(FULL_PREEMPTION)
	if (td->td_priority > PRI_MAX_ITHD) {
		running_thread->td_flags |= TDF_NEEDRESCHED;
		return;
	}
469#endif /* ! FULL_PREEMPTION */
470
471	if (running_thread->td_critnest > 1)
472		running_thread->td_owepreempt = 1;
	else
		mi_switch(SW_INVOL, NULL);
475
476#else /* PREEMPTION */
477	running_thread->td_flags |= TDF_NEEDRESCHED;
478#endif /* PREEMPTION */
479	return;
480}
481#endif /* !SMP */
482
483
484int limitcount;
485void
486setrunqueue(struct thread *td, int flags)
487{
488	struct ksegrp *kg;
489	struct thread *td2;
490	struct thread *tda;
491
492	CTR3(KTR_RUNQ, "setrunqueue: td:%p kg:%p pid:%d",
493	    td, td->td_ksegrp, td->td_proc->p_pid);
494	CTR5(KTR_SCHED, "setrunqueue: %p(%s) prio %d by %p(%s)",
495            td, td->td_proc->p_comm, td->td_priority, curthread,
496            curthread->td_proc->p_comm);
497	mtx_assert(&sched_lock, MA_OWNED);
498	KASSERT((td->td_inhibitors == 0),
			("setrunqueue: trying to run inhibited thread"));
500	KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
501	    ("setrunqueue: bad thread state"));
502	TD_SET_RUNQ(td);
503	kg = td->td_ksegrp;
504	if ((td->td_proc->p_flag & P_HADTHREADS) == 0) {
505		/*
506		 * Common path optimisation: Only one of everything
507		 * and the KSE is always already attached.
508		 * Totally ignore the ksegrp run queue.
509		 */
510		if (kg->kg_avail_opennings != 1) {
511			if (limitcount < 1) {
512				limitcount++;
513				printf("pid %d: corrected slot count (%d->1)\n",
514				    td->td_proc->p_pid, kg->kg_avail_opennings);
515
516			}
517			kg->kg_avail_opennings = 1;
518		}
519		sched_add(td, flags);
520		return;
521	}
522
523	/*
	 * If the concurrency has been reduced and we would land in the
	 * assigned section, then keep removing entries from the
	 * system run queue until we are no longer in that section
	 * or until there is room for us to be put in that section.
	 * What we MUST avoid is the case where threads of lower priority
	 * than the new one are scheduled, but the new one can not
	 * be scheduled itself.  That would lead to a non-contiguous set
	 * of scheduled threads, and everything would break.
532	 */
533	tda = kg->kg_last_assigned;
534	while ((kg->kg_avail_opennings <= 0) &&
535	    (tda && (tda->td_priority > td->td_priority))) {
536		/*
537		 * None free, but there is one we can commandeer.
538		 */
539		CTR2(KTR_RUNQ,
540		    "setrunqueue: kg:%p: take slot from td: %p", kg, tda);
541		sched_rem(tda);
542		tda = kg->kg_last_assigned =
543		    TAILQ_PREV(tda, threadqueue, td_runq);
544	}
545
546	/*
547	 * Add the thread to the ksegrp's run queue at
548	 * the appropriate place.
549	 */
550	TAILQ_FOREACH(td2, &kg->kg_runq, td_runq) {
551		if (td2->td_priority > td->td_priority) {
552			TAILQ_INSERT_BEFORE(td2, td, td_runq);
553			break;
554		}
555	}
556	if (td2 == NULL) {
557		/* We ran off the end of the TAILQ or it was empty. */
558		TAILQ_INSERT_TAIL(&kg->kg_runq, td, td_runq);
559	}
560
561	/*
	 * If we have a slot to use, then put the thread on the system
	 * run queue and, if needed, readjust the last_assigned pointer.
	 * It may be that we need to schedule something anyhow,
	 * even if the available slot count is negative, so that
	 * all the items ahead of last_assigned are scheduled.
567	 */
568	if (kg->kg_avail_opennings > 0) {
569		if (tda == NULL) {
570			/*
571			 * No pre-existing last assigned so whoever is first
			 * gets the slot (maybe us).
573			 */
574			td2 = TAILQ_FIRST(&kg->kg_runq);
575			kg->kg_last_assigned = td2;
576		} else if (tda->td_priority > td->td_priority) {
577			td2 = td;
578		} else {
579			/*
580			 * We are past last_assigned, so
581			 * give the next slot to whatever is next,
582			 * which may or may not be us.
583			 */
584			td2 = TAILQ_NEXT(tda, td_runq);
585			kg->kg_last_assigned = td2;
586		}
587		sched_add(td2, flags);
588	} else {
589		CTR3(KTR_RUNQ, "setrunqueue: held: td%p kg%p pid%d",
590			td, td->td_ksegrp, td->td_proc->p_pid);
591		if ((flags & SRQ_YIELDING) == 0)
592			maybe_preempt_in_ksegrp(td);
593	}
594}
595
596/*
597 * Kernel thread preemption implementation.  Critical sections mark
598 * regions of code in which preemptions are not allowed.
599 */
600void
601critical_enter(void)
602{
603	struct thread *td;
604
605	td = curthread;
606	td->td_critnest++;
607	CTR4(KTR_CRITICAL, "critical_enter by thread %p (%ld, %s) to %d", td,
608	    (long)td->td_proc->p_pid, td->td_proc->p_comm, td->td_critnest);
609}
610
611void
612critical_exit(void)
613{
614	struct thread *td;
615
616	td = curthread;
617	KASSERT(td->td_critnest != 0,
618	    ("critical_exit: td_critnest == 0"));
619#ifdef PREEMPTION
620	if (td->td_critnest == 1) {
621		td->td_critnest = 0;
622		mtx_assert(&sched_lock, MA_NOTOWNED);
623		if (td->td_owepreempt) {
624			td->td_critnest = 1;
625			mtx_lock_spin(&sched_lock);
626			td->td_critnest--;
627			mi_switch(SW_INVOL, NULL);
628			mtx_unlock_spin(&sched_lock);
629		}
630	} else
631#endif
632		td->td_critnest--;
633
634	CTR4(KTR_CRITICAL, "critical_exit by thread %p (%ld, %s) to %d", td,
635	    (long)td->td_proc->p_pid, td->td_proc->p_comm, td->td_critnest);
636}
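
/*
 * A minimal, compiled-out sketch of how callers typically use the critical
 * section API above: wrap access to per-CPU state so the thread can be
 * neither preempted nor migrated while that state is being touched.  The
 * function name and the per-CPU field used here (example_counter) are
 * hypothetical and exist only for illustration.
 */
#if 0
static void
example_touch_pcpu_counter(void)
{
	critical_enter();
	/*
	 * Preemption is now deferred, so this thread cannot be switched
	 * out or migrated to another CPU between the GET and the SET.
	 */
	PCPU_SET(example_counter, PCPU_GET(example_counter) + 1);
	critical_exit();	/* may perform a deferred preemption */
}
#endif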
637
638/*
 * This function is called when a thread is about to be put on a run queue
 * because it has been made runnable or its priority has been adjusted.  It
 * determines whether the new thread should preempt the current one.  If so,
642 * it switches to it and eventually returns true.  If not, it returns false
643 * so that the caller may place the thread on an appropriate run queue.
644 */
645int
646maybe_preempt(struct thread *td)
647{
648#ifdef PREEMPTION
649	struct thread *ctd;
650	int cpri, pri;
651#endif
652
653	mtx_assert(&sched_lock, MA_OWNED);
654#ifdef PREEMPTION
655	/*
656	 * The new thread should not preempt the current thread if any of the
657	 * following conditions are true:
658	 *
659	 *  - The kernel is in the throes of crashing (panicstr).
660	 *  - The current thread has a higher (numerically lower) or
661	 *    equivalent priority.  Note that this prevents curthread from
662	 *    trying to preempt to itself.
663	 *  - It is too early in the boot for context switches (cold is set).
664	 *  - The current thread has an inhibitor set or is in the process of
665	 *    exiting.  In this case, the current thread is about to switch
666	 *    out anyways, so there's no point in preempting.  If we did,
667	 *    the current thread would not be properly resumed as well, so
668	 *    just avoid that whole landmine.
669	 *  - If the new thread's priority is not a realtime priority and
670	 *    the current thread's priority is not an idle priority and
671	 *    FULL_PREEMPTION is disabled.
672	 *
673	 * If all of these conditions are false, but the current thread is in
674	 * a nested critical section, then we have to defer the preemption
675	 * until we exit the critical section.  Otherwise, switch immediately
676	 * to the new thread.
677	 */
678	ctd = curthread;
	KASSERT((ctd->td_kse != NULL && ctd->td_kse->ke_thread == ctd),
	  ("thread has no (or wrong) sched-private part."));
	KASSERT((td->td_inhibitors == 0),
			("maybe_preempt: trying to run inhibited thread"));
683	pri = td->td_priority;
684	cpri = ctd->td_priority;
685	if (panicstr != NULL || pri >= cpri || cold /* || dumping */ ||
686	    TD_IS_INHIBITED(ctd) || td->td_kse->ke_state != KES_THREAD)
687		return (0);
688#ifndef FULL_PREEMPTION
689	if (pri > PRI_MAX_ITHD && cpri < PRI_MIN_IDLE)
690		return (0);
691#endif
692
693	if (ctd->td_critnest > 1) {
694		CTR1(KTR_PROC, "maybe_preempt: in critical section %d",
695		    ctd->td_critnest);
696		ctd->td_owepreempt = 1;
697		return (0);
698	}
699
700	/*
701	 * Thread is runnable but not yet put on system run queue.
702	 */
703	MPASS(TD_ON_RUNQ(td));
704	MPASS(td->td_sched->ke_state != KES_ONRUNQ);
705	if (td->td_proc->p_flag & P_HADTHREADS) {
706		/*
707		 * If this is a threaded process we actually ARE on the
708		 * ksegrp run queue so take it off that first.
709		 * Also undo any damage done to the last_assigned pointer.
710		 * XXX Fix setrunqueue so this isn't needed
711		 */
712		struct ksegrp *kg;
713
714		kg = td->td_ksegrp;
715		if (kg->kg_last_assigned == td)
716			kg->kg_last_assigned =
717			    TAILQ_PREV(td, threadqueue, td_runq);
718		TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
719	}
720
721	TD_SET_RUNNING(td);
722	CTR3(KTR_PROC, "preempting to thread %p (pid %d, %s)\n", td,
723	    td->td_proc->p_pid, td->td_proc->p_comm);
724	mi_switch(SW_INVOL|SW_PREEMPT, td);
725	return (1);
726#else
727	return (0);
728#endif
729}
730
731#if 0
732#ifndef PREEMPTION
733/* XXX: There should be a non-static version of this. */
734static void
735printf_caddr_t(void *data)
736{
737	printf("%s", (char *)data);
738}
739static char preempt_warning[] =
740    "WARNING: Kernel preemption is disabled, expect reduced performance.\n";
741SYSINIT(preempt_warning, SI_SUB_COPYRIGHT, SI_ORDER_ANY, printf_caddr_t,
742    preempt_warning)
743#endif
744#endif
745
746/************************************************************************
747 * SYSTEM RUN QUEUE manipulations and tests				*
748 ************************************************************************/
749/*
750 * Initialize a run structure.
751 */
752void
753runq_init(struct runq *rq)
754{
755	int i;
756
757	bzero(rq, sizeof *rq);
758	for (i = 0; i < RQ_NQS; i++)
759		TAILQ_INIT(&rq->rq_queues[i]);
760}
761
762/*
763 * Clear the status bit of the queue corresponding to priority level pri,
764 * indicating that it is empty.
765 */
766static __inline void
767runq_clrbit(struct runq *rq, int pri)
768{
769	struct rqbits *rqb;
770
771	rqb = &rq->rq_status;
772	CTR4(KTR_RUNQ, "runq_clrbit: bits=%#x %#x bit=%#x word=%d",
773	    rqb->rqb_bits[RQB_WORD(pri)],
774	    rqb->rqb_bits[RQB_WORD(pri)] & ~RQB_BIT(pri),
775	    RQB_BIT(pri), RQB_WORD(pri));
776	rqb->rqb_bits[RQB_WORD(pri)] &= ~RQB_BIT(pri);
777}
778
779/*
780 * Find the index of the first non-empty run queue.  This is done by
781 * scanning the status bits, a set bit indicates a non-empty queue.
782 */
783static __inline int
784runq_findbit(struct runq *rq)
785{
786	struct rqbits *rqb;
787	int pri;
788	int i;
789
790	rqb = &rq->rq_status;
791	for (i = 0; i < RQB_LEN; i++)
792		if (rqb->rqb_bits[i]) {
793			pri = RQB_FFS(rqb->rqb_bits[i]) + (i << RQB_L2BPW);
794			CTR3(KTR_RUNQ, "runq_findbit: bits=%#x i=%d pri=%d",
795			    rqb->rqb_bits[i], i, pri);
796			return (pri);
797		}
798
799	return (-1);
800}
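
/*
 * A compiled-out, userland-style sketch of the scan runq_findbit() does.
 * It assumes the usual shape of the <sys/runq.h> macros (a fixed number of
 * status words, RQB_FFS() yielding the zero-based index of the lowest set
 * bit); the TOY_* names and sizes below are stand-ins, not the real ones.
 */
#if 0
#include <stdio.h>
#include <strings.h>

#define	TOY_BPW		32	/* bits per status word (cf. RQB_BPW) */
#define	TOY_LEN		2	/* words covering 64 queues (cf. RQB_LEN) */

/* Return the lowest-numbered non-empty queue, or -1 if all are empty. */
static int
toy_findbit(const unsigned int bits[TOY_LEN])
{
	int i;

	for (i = 0; i < TOY_LEN; i++)
		if (bits[i] != 0)
			return ((ffs(bits[i]) - 1) + (i * TOY_BPW));
	return (-1);
}

int
main(void)
{
	/* Queues 5 and 40 are non-empty: bit 5 of word 0, bit 8 of word 1. */
	unsigned int bits[TOY_LEN] = { 1u << 5, 1u << 8 };

	printf("first non-empty queue: %d\n", toy_findbit(bits));	/* 5 */
	return (0);
}
#endif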
801
802/*
803 * Set the status bit of the queue corresponding to priority level pri,
804 * indicating that it is non-empty.
805 */
806static __inline void
807runq_setbit(struct runq *rq, int pri)
808{
809	struct rqbits *rqb;
810
811	rqb = &rq->rq_status;
812	CTR4(KTR_RUNQ, "runq_setbit: bits=%#x %#x bit=%#x word=%d",
813	    rqb->rqb_bits[RQB_WORD(pri)],
814	    rqb->rqb_bits[RQB_WORD(pri)] | RQB_BIT(pri),
815	    RQB_BIT(pri), RQB_WORD(pri));
816	rqb->rqb_bits[RQB_WORD(pri)] |= RQB_BIT(pri);
817}
818
819/*
820 * Add the KSE to the queue specified by its priority, and set the
821 * corresponding status bit.
822 */
823void
824runq_add(struct runq *rq, struct kse *ke, int flags)
825{
826	struct rqhead *rqh;
827	int pri;
828
829	pri = ke->ke_thread->td_priority / RQ_PPQ;
830	ke->ke_rqindex = pri;
831	runq_setbit(rq, pri);
832	rqh = &rq->rq_queues[pri];
833	CTR5(KTR_RUNQ, "runq_add: td=%p ke=%p pri=%d %d rqh=%p",
834	    ke->ke_thread, ke, ke->ke_thread->td_priority, pri, rqh);
835	if (flags & SRQ_PREEMPTED) {
836		TAILQ_INSERT_HEAD(rqh, ke, ke_procq);
837	} else {
838		TAILQ_INSERT_TAIL(rqh, ke, ke_procq);
839	}
840}
841
842/*
843 * Return true if there are runnable processes of any priority on the run
844 * queue, false otherwise.  Has no side effects, does not modify the run
845 * queue structure.
846 */
847int
848runq_check(struct runq *rq)
849{
850	struct rqbits *rqb;
851	int i;
852
853	rqb = &rq->rq_status;
854	for (i = 0; i < RQB_LEN; i++)
855		if (rqb->rqb_bits[i]) {
856			CTR2(KTR_RUNQ, "runq_check: bits=%#x i=%d",
857			    rqb->rqb_bits[i], i);
858			return (1);
859		}
860	CTR0(KTR_RUNQ, "runq_check: empty");
861
862	return (0);
863}
864
865#if defined(SMP) && defined(SCHED_4BSD)
866int runq_fuzz = 1;
867SYSCTL_INT(_kern_sched, OID_AUTO, runq_fuzz, CTLFLAG_RW, &runq_fuzz, 0, "");
868#endif
869
870/*
871 * Find the highest priority process on the run queue.
872 */
873struct kse *
874runq_choose(struct runq *rq)
875{
876	struct rqhead *rqh;
877	struct kse *ke;
878	int pri;
879
880	mtx_assert(&sched_lock, MA_OWNED);
881	while ((pri = runq_findbit(rq)) != -1) {
882		rqh = &rq->rq_queues[pri];
883#if defined(SMP) && defined(SCHED_4BSD)
		/* fuzz == 1 is normal; 0 or less are ignored */
885		if (runq_fuzz > 1) {
886			/*
887			 * In the first couple of entries, check if
888			 * there is one for our CPU as a preference.
889			 */
890			int count = runq_fuzz;
891			int cpu = PCPU_GET(cpuid);
892			struct kse *ke2;
893			ke2 = ke = TAILQ_FIRST(rqh);
894
895			while (count-- && ke2) {
896				if (ke->ke_thread->td_lastcpu == cpu) {
				if (ke2->ke_thread->td_lastcpu == cpu) {
898					break;
899				}
900				ke2 = TAILQ_NEXT(ke2, ke_procq);
901			}
902		} else
903#endif
904			ke = TAILQ_FIRST(rqh);
905		KASSERT(ke != NULL, ("runq_choose: no proc on busy queue"));
906		CTR3(KTR_RUNQ,
907		    "runq_choose: pri=%d kse=%p rqh=%p", pri, ke, rqh);
908		return (ke);
909	}
910	CTR1(KTR_RUNQ, "runq_choose: idleproc pri=%d", pri);
911
912	return (NULL);
913}
914
915/*
916 * Remove the KSE from the queue specified by its priority, and clear the
917 * corresponding status bit if the queue becomes empty.
918 * Caller must set ke->ke_state afterwards.
919 */
920void
921runq_remove(struct runq *rq, struct kse *ke)
922{
923	struct rqhead *rqh;
924	int pri;
925
926	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
927		("runq_remove: process swapped out"));
928	pri = ke->ke_rqindex;
929	rqh = &rq->rq_queues[pri];
930	CTR5(KTR_RUNQ, "runq_remove: td=%p, ke=%p pri=%d %d rqh=%p",
931	    ke->ke_thread, ke, ke->ke_thread->td_priority, pri, rqh);
932	KASSERT(ke != NULL, ("runq_remove: no proc on busy queue"));
933	TAILQ_REMOVE(rqh, ke, ke_procq);
934	if (TAILQ_EMPTY(rqh)) {
935		CTR0(KTR_RUNQ, "runq_remove: empty");
936		runq_clrbit(rq, pri);
937	}
938}
939
940/****** functions that are temporarily here ***********/
941#include <vm/uma.h>
942extern struct mtx kse_zombie_lock;
943
944/*
945 *  Allocate scheduler specific per-process resources.
946 * The thread and ksegrp have already been linked in.
947 * In this case just set the default concurrency value.
948 *
949 * Called from:
950 *  proc_init() (UMA init method)
951 */
952void
953sched_newproc(struct proc *p, struct ksegrp *kg, struct thread *td)
954{
955
956	/* This can go in sched_fork */
957	sched_init_concurrency(kg);
958}
959
960/*
961 * thread is being either created or recycled.
962 * Fix up the per-scheduler resources associated with it.
963 * Called from:
964 *  sched_fork_thread()
965 *  thread_dtor()  (*may go away)
966 *  thread_init()  (*may go away)
967 */
968void
969sched_newthread(struct thread *td)
970{
971	struct td_sched *ke;
972
973	ke = (struct td_sched *) (td + 1);
974	bzero(ke, sizeof(*ke));
975	td->td_sched     = ke;
976	ke->ke_thread	= td;
977	ke->ke_state	= KES_THREAD;
978}
979
980/*
 * Set up an initial concurrency of 1 and make its single
 * concurrency slot available.
 * May be used "offline", before the ksegrp is attached to the world,
 * and thus would not need schedlock in that case.
986 * Called from:
987 *  thr_create()
988 *  proc_init() (UMA) via sched_newproc()
989 */
990void
991sched_init_concurrency(struct ksegrp *kg)
992{
993
994	CTR1(KTR_RUNQ,"kg %p init slots and concurrency to 1", kg);
995	kg->kg_concurrency = 1;
996	kg->kg_avail_opennings = 1;
997}
998
999/*
1000 * Change the concurrency of an existing ksegrp to N
1001 * Called from:
1002 *  kse_create()
1003 *  kse_exit()
1004 *  thread_exit()
1005 *  thread_single()
1006 */
1007void
1008sched_set_concurrency(struct ksegrp *kg, int concurrency)
1009{
1010
1011	CTR4(KTR_RUNQ,"kg %p set concurrency to %d, slots %d -> %d",
1012	    kg,
1013	    concurrency,
1014	    kg->kg_avail_opennings,
1015	    kg->kg_avail_opennings + (concurrency - kg->kg_concurrency));
1016	kg->kg_avail_opennings += (concurrency - kg->kg_concurrency);
1017	kg->kg_concurrency = concurrency;
1018}
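
/*
 * Worked example of the accounting above (the numbers are illustrative
 * only): a ksegrp with kg_concurrency == 2 and kg_avail_opennings == 1
 * (one slot in use) that is raised to a concurrency of 4 gains
 * (4 - 2) == 2 openings, ending with kg_avail_opennings == 3 while the
 * slot already in use stays accounted for.
 */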
1019
1020/*
 * Called from thread_exit() for all exiting threads.
 *
 * Not to be confused with sched_exit_thread(), which is only called from
 * thread_exit() for threads exiting without the rest of the process
 * exiting, because it is also called from sched_exit() and we would not
 * want to call it twice.
1027 * XXX This can probably be fixed.
1028 */
1029void
1030sched_thread_exit(struct thread *td)
1031{
1032
1033	SLOT_RELEASE(td->td_ksegrp);
1034	slot_fill(td->td_ksegrp);
1035}
1036
1037#endif /* KERN_SWITCH_INCLUDE */
1038