/*-
 * Copyright (c) 2001 Jake Burkholder <jake@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/***
Here is the logic:

If there are N processors, then there are at most N KSEs (kernel
schedulable entities) working to process threads that belong to a
KSEGROUP (kg). If X of these KSEs are actually running at the
moment in question, then at most M = N - X of them are on the run
queue, as running KSEs are not on the queue.

Runnable threads are queued off the KSEGROUP in priority order.
If M or more threads are runnable, the top M threads (by priority)
are 'preassigned' to the M KSEs not running. The KSEs take their
priority from those threads and are put on the run queue.

The last thread that had a priority high enough to have a KSE associated
with it, AND IS ON THE RUN QUEUE, is pointed to by
kg->kg_last_assigned. If no threads queued off the KSEGROUP have KSEs
assigned, either because all the available KSEs are actively running or
because there are no threads queued, that pointer is NULL.

When a KSE is removed from the run queue in order to run, we know
it was associated with the highest priority thread in the queue (at the head
of the queue). If it is also the last assigned, we know M was 1 and must
now be 0. Since the thread is no longer queued, kg_last_assigned must no
longer point to it. Since we know there were no more KSEs available
(M was 1 and is now 0), and since we are not FREEING our KSE
but using it, we know there are STILL no more KSEs available, so the next
thread in the ksegrp list cannot have a KSE assigned to it, and the
pointer must be made 'invalid' (NULL).

The pointer exists so that when a new thread is made runnable, its
priority can be compared with that of the last assigned thread to see
whether it should 'steal' that thread's KSE, i.e. whether it sorts
earlier on the list than that thread or later. If it is earlier, the KSE
is removed from the last assigned thread (which is now not assigned a
KSE) and reassigned to the new thread, which is placed earlier in the
list. The pointer is then backed up to the previous thread (which may or
may not be the new thread).

When a thread sleeps or is removed, its KSE becomes available, and if
there are queued threads that are not assigned KSEs, the highest
priority one of them is assigned the KSE, which is then placed back on
the run queue at the appropriate place, and the kg->kg_last_assigned
pointer is adjusted down to point to it.

The following diagram shows 2 KSEs and 3 threads from a single process.

 RUNQ: --->KSE---KSE--...    (KSEs queued at priorities from threads)
              \    \____
               \        \
    KSEGROUP---thread--thread--thread    (queued in priority order)
        \                 /
         \_______________/
          (last_assigned)

The result of this scheme is that the M available KSEs are always
queued at the priorities they have inherited from the M highest priority
threads for that KSEGROUP. If this situation changes, the KSEs are
reassigned to keep this true.
***/
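
/*
 * Illustrative sketch only (not part of the original file): a debug-style
 * walk that spells out the kg_last_assigned invariant described above.
 * It assumes that a thread which holds a slot (i.e. has been handed to
 * sched_add()) has its td_sched in the KES_ONRUNQ state, as the code
 * below relies on elsewhere.
 */
#if 0
static void
ksegrp_runq_assert(struct ksegrp *kg)
{
	struct thread *td;
	int assigned;

	mtx_assert(&sched_lock, MA_OWNED);
	assigned = (kg->kg_last_assigned != NULL);
	TAILQ_FOREACH(td, &kg->kg_runq, td_runq) {
		if (assigned) {
			/* At or before last_assigned: must hold a slot. */
			KASSERT(td->td_sched->ke_state == KES_ONRUNQ,
			    ("thread before last_assigned has no slot"));
		} else {
			/* Past last_assigned: still waiting for a slot. */
			KASSERT(td->td_sched->ke_state != KES_ONRUNQ,
			    ("thread after last_assigned holds a slot"));
		}
		if (td == kg->kg_last_assigned)
			assigned = 0;
	}
}
#endif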

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/kern_switch.c 159570 2006-06-13 13:12:56Z davidxu $");

#include "opt_sched.h"

#ifndef KERN_SWITCH_INCLUDE
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/sched.h>
#else  /* KERN_SWITCH_INCLUDE */
#if defined(SMP) && (defined(__i386__) || defined(__amd64__))
#include <sys/smp.h>
#endif
#if defined(SMP) && defined(SCHED_4BSD)
#include <sys/sysctl.h>
#endif

/* Uncomment this to enable logging of critical_enter/exit. */
#if 0
#define	KTR_CRITICAL	KTR_SCHED
#else
#define	KTR_CRITICAL	0
#endif

#ifdef FULL_PREEMPTION
#ifndef PREEMPTION
#error "The FULL_PREEMPTION option requires the PREEMPTION option"
#endif
#endif

CTASSERT((RQB_BPW * RQB_LEN) == RQ_NQS);

#define td_kse td_sched

/*
 * kern.sched.preemption allows user space to determine if preemption support
 * is compiled in or not.  It is not currently a boot or runtime flag that
 * can be changed.
 */
#ifdef PREEMPTION
static int kern_sched_preemption = 1;
#else
static int kern_sched_preemption = 0;
#endif
SYSCTL_INT(_kern_sched, OID_AUTO, preemption, CTLFLAG_RD,
    &kern_sched_preemption, 0, "Kernel preemption enabled");

/************************************************************************
 * Functions that manipulate runnability from a thread perspective.	*
 ************************************************************************/
/*
 * Select the KSE that will be run next.  From that find the thread, and
 * remove it from the KSEGRP's run queue.  If there is thread clustering,
 * this will be what does it.
 */
struct thread *
choosethread(void)
{
	struct kse *ke;
	struct thread *td;
	struct ksegrp *kg;

#if defined(SMP) && (defined(__i386__) || defined(__amd64__))
	if (smp_active == 0 && PCPU_GET(cpuid) != 0) {
		/* Shutting down, run idlethread on APs */
		td = PCPU_GET(idlethread);
		ke = td->td_kse;
		CTR1(KTR_RUNQ, "choosethread: td=%p (idle)", td);
		ke->ke_flags |= KEF_DIDRUN;
		TD_SET_RUNNING(td);
		return (td);
	}
#endif

retry:
	ke = sched_choose();
	if (ke) {
		td = ke->ke_thread;
		KASSERT((td->td_kse == ke), ("kse/thread mismatch"));
		kg = ke->ke_ksegrp;
		if (td->td_proc->p_flag & P_HADTHREADS) {
			if (kg->kg_last_assigned == td) {
				kg->kg_last_assigned = TAILQ_PREV(td,
				    threadqueue, td_runq);
			}
			TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
		}
		CTR2(KTR_RUNQ, "choosethread: td=%p pri=%d",
		    td, td->td_priority);
	} else {
		/* Simulate runq_choose() having returned the idle thread */
		td = PCPU_GET(idlethread);
		ke = td->td_kse;
		CTR1(KTR_RUNQ, "choosethread: td=%p (idle)", td);
	}
	ke->ke_flags |= KEF_DIDRUN;

	/*
	 * If we are in panic, only allow system threads,
	 * plus the one we are running in, to be run.
	 */
	if (panicstr && ((td->td_proc->p_flag & P_SYSTEM) == 0 &&
	    (td->td_flags & TDF_INPANIC) == 0)) {
		/* note that it is no longer on the run queue */
		TD_SET_CAN_RUN(td);
		goto retry;
	}

	TD_SET_RUNNING(td);
	return (td);
}

/*
 * Given a surplus system slot, try to assign a new runnable thread to it.
 * Called from:
 *  sched_thread_exit()  (local)
 *  sched_switch()  (local)
 *  remrunqueue()  (local)  (not at the moment)
 */
static void
slot_fill(struct ksegrp *kg)
{
	struct thread *td;

	mtx_assert(&sched_lock, MA_OWNED);
	while (kg->kg_avail_opennings > 0) {
		/*
		 * Find the first unassigned thread
		 */
		if ((td = kg->kg_last_assigned) != NULL)
			td = TAILQ_NEXT(td, td_runq);
		else
			td = TAILQ_FIRST(&kg->kg_runq);

		/*
		 * If we found one, send it to the system scheduler.
		 */
		if (td) {
			kg->kg_last_assigned = td;
			sched_add(td, SRQ_YIELDING);
			CTR2(KTR_RUNQ, "slot_fill: td%p -> kg%p", td, kg);
		} else {
			/* no threads to use up the slots. quit now */
			break;
		}
	}
}

#ifdef	SCHED_4BSD
/*
 * Remove a thread from its KSEGRP's run queue.
 * This in turn may remove it from a KSE if it was already assigned
 * to one, possibly causing a new thread to be assigned to the KSE
 * and the KSE getting a new priority.
 */
static void
remrunqueue(struct thread *td)
{
	struct thread *td2, *td3;
	struct ksegrp *kg;
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT((TD_ON_RUNQ(td)), ("remrunqueue: Bad state on run queue"));
	kg = td->td_ksegrp;
	ke = td->td_kse;
	CTR1(KTR_RUNQ, "remrunqueue: td%p", td);
	TD_SET_CAN_RUN(td);
	/*
	 * If it is not a threaded process, take the shortcut.
	 */
	if ((td->td_proc->p_flag & P_HADTHREADS) == 0) {
		/* remove from sys run queue and free up a slot */
		sched_rem(td);
		return;
	}
	td3 = TAILQ_PREV(td, threadqueue, td_runq);
	TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
	if (ke->ke_state == KES_ONRUNQ) {
		/*
		 * This thread has been assigned to the system run queue.
		 * We need to dissociate it and try to assign the
		 * KSE to the next available thread. Then, we should
		 * see if we need to move the KSE in the run queues.
		 */
		sched_rem(td);
		td2 = kg->kg_last_assigned;
		KASSERT((td2 != NULL), ("last assigned has wrong value"));
		if (td2 == td)
			kg->kg_last_assigned = td3;
		/* slot_fill(kg); */ /* will replace it with another */
	}
}
#endif

/*
 * Change the priority of a thread that is on the run queue.
 */
void
adjustrunqueue(struct thread *td, int newpri)
{
	struct ksegrp *kg;
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT((TD_ON_RUNQ(td)), ("adjustrunqueue: Bad state on run queue"));

	ke = td->td_kse;
	CTR1(KTR_RUNQ, "adjustrunqueue: td%p", td);
	/*
	 * If it is not a threaded process, take the shortcut.
	 */
	if ((td->td_proc->p_flag & P_HADTHREADS) == 0) {
		/* We only care about the kse in the run queue. */
		td->td_priority = newpri;
#ifndef SCHED_CORE
		if (ke->ke_rqindex != (newpri / RQ_PPQ))
#else
		if (ke->ke_rqindex != newpri)
#endif
		{
			sched_rem(td);
			sched_add(td, SRQ_BORING);
		}
		return;
	}

	/* It is a threaded process */
	kg = td->td_ksegrp;
	if (ke->ke_state == KES_ONRUNQ
#ifdef SCHED_ULE
	 || ((ke->ke_flags & KEF_ASSIGNED) != 0 &&
	     (ke->ke_flags & KEF_REMOVED) == 0)
#endif
	   ) {
		if (kg->kg_last_assigned == td) {
			kg->kg_last_assigned =
			    TAILQ_PREV(td, threadqueue, td_runq);
		}
		sched_rem(td);
	}
	TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
	TD_SET_CAN_RUN(td);
	td->td_priority = newpri;
	setrunqueue(td, SRQ_BORING);
}

/*
 * This function is called when a thread is about to be put on a
 * ksegrp run queue because it has been made runnable or its
 * priority has been adjusted and the ksegrp does not have a
 * free kse slot.  It determines whether a thread from the same ksegrp
 * should be preempted.  If so, it either switches threads directly
 * (when the chosen thread is on the current cpu) or notifies another
 * cpu that it should switch threads.
 */

static void
maybe_preempt_in_ksegrp(struct thread *td)
#if !defined(SMP)
{
	struct thread *running_thread;

	mtx_assert(&sched_lock, MA_OWNED);
	running_thread = curthread;

	if (running_thread->td_ksegrp != td->td_ksegrp)
		return;

	if (td->td_priority >= running_thread->td_priority)
		return;
#ifdef PREEMPTION
#ifndef FULL_PREEMPTION
	if (td->td_priority > PRI_MAX_ITHD) {
		running_thread->td_flags |= TDF_NEEDRESCHED;
		return;
	}
#endif /* FULL_PREEMPTION */

	if (running_thread->td_critnest > 1)
		running_thread->td_owepreempt = 1;
	else
		mi_switch(SW_INVOL, NULL);

#else /* PREEMPTION */
	running_thread->td_flags |= TDF_NEEDRESCHED;
#endif /* PREEMPTION */
	return;
}

#else /* SMP */
{
	struct thread *running_thread;
	int worst_pri;
	struct ksegrp *kg;
	cpumask_t cpumask, dontuse;
	struct pcpu *pc;
	struct pcpu *best_pcpu;
	struct thread *cputhread;

	mtx_assert(&sched_lock, MA_OWNED);

	running_thread = curthread;

#if !defined(KSEG_PEEMPT_BEST_CPU)
	if (running_thread->td_ksegrp != td->td_ksegrp) {
#endif
		kg = td->td_ksegrp;

		/* If someone is ahead of this thread, wait our turn. */
		if (td != TAILQ_FIRST(&kg->kg_runq))
			return;

		worst_pri = td->td_priority;
		best_pcpu = NULL;
		dontuse   = stopped_cpus | idle_cpus_mask;

		/*
		 * Find a cpu with the worst priority that runs a thread from
		 * the same ksegrp - if several qualify, prefer the thread's
		 * last-run cpu and then the current cpu.
		 */

		SLIST_FOREACH(pc, &cpuhead, pc_allcpu) {
			cpumask   = pc->pc_cpumask;
			cputhread = pc->pc_curthread;

			if ((cpumask & dontuse) ||
			    cputhread->td_ksegrp != kg)
				continue;

			if (cputhread->td_priority > worst_pri) {
				worst_pri = cputhread->td_priority;
				best_pcpu = pc;
				continue;
			}

			if (cputhread->td_priority == worst_pri &&
			    best_pcpu != NULL &&
			    (td->td_lastcpu == pc->pc_cpuid ||
				(PCPU_GET(cpumask) == cpumask &&
				    td->td_lastcpu != best_pcpu->pc_cpuid)))
			    best_pcpu = pc;
		}

		/* Check if we need to preempt someone. */
		if (best_pcpu == NULL)
			return;

#if defined(IPI_PREEMPTION) && defined(PREEMPTION)
#if !defined(FULL_PREEMPTION)
		if (td->td_priority <= PRI_MAX_ITHD)
#endif /* ! FULL_PREEMPTION */
			{
				ipi_selected(best_pcpu->pc_cpumask, IPI_PREEMPT);
				return;
			}
#endif /* defined(IPI_PREEMPTION) && defined(PREEMPTION) */

		if (PCPU_GET(cpuid) != best_pcpu->pc_cpuid) {
			best_pcpu->pc_curthread->td_flags |= TDF_NEEDRESCHED;
			ipi_selected(best_pcpu->pc_cpumask, IPI_AST);
			return;
		}
#if !defined(KSEG_PEEMPT_BEST_CPU)
	}
#endif

	if (td->td_priority >= running_thread->td_priority)
		return;
#ifdef PREEMPTION

#if !defined(FULL_PREEMPTION)
	if (td->td_priority > PRI_MAX_ITHD) {
		running_thread->td_flags |= TDF_NEEDRESCHED;
		return;
	}
#endif /* ! FULL_PREEMPTION */

	if (running_thread->td_critnest > 1)
		running_thread->td_owepreempt = 1;
	else
		mi_switch(SW_INVOL, NULL);

#else /* PREEMPTION */
	running_thread->td_flags |= TDF_NEEDRESCHED;
#endif /* PREEMPTION */
	return;
}
#endif /* !SMP */


int limitcount;
void
setrunqueue(struct thread *td, int flags)
{
	struct ksegrp *kg;
	struct thread *td2;
	struct thread *tda;

	CTR3(KTR_RUNQ, "setrunqueue: td:%p kg:%p pid:%d",
	    td, td->td_ksegrp, td->td_proc->p_pid);
	CTR5(KTR_SCHED, "setrunqueue: %p(%s) prio %d by %p(%s)",
            td, td->td_proc->p_comm, td->td_priority, curthread,
            curthread->td_proc->p_comm);
	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT((td->td_inhibitors == 0),
			("setrunqueue: trying to run inhibited thread"));
	KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
	    ("setrunqueue: bad thread state"));
	TD_SET_RUNQ(td);
	kg = td->td_ksegrp;
	if ((td->td_proc->p_flag & P_HADTHREADS) == 0) {
		/*
		 * Common path optimisation: Only one of everything
		 * and the KSE is always already attached.
		 * Totally ignore the ksegrp run queue.
		 */
		if (kg->kg_avail_opennings != 1) {
			if (limitcount < 1) {
				limitcount++;
				printf("pid %d: corrected slot count (%d->1)\n",
				    td->td_proc->p_pid, kg->kg_avail_opennings);

			}
			kg->kg_avail_opennings = 1;
		}
		sched_add(td, flags);
		return;
	}

	/*
	 * If the concurrency has been reduced, and we would go in the
	 * assigned section, then keep removing entries from the
	 * system run queue until we are not in that section,
	 * or until there is room for us to be put in that section.
	 * What we MUST avoid is the case where threads of lower
	 * priority than the new one are scheduled, but the new one
	 * cannot itself be scheduled. That would lead to a non-contiguous
	 * set of scheduled threads, and everything would break.
	 */
	tda = kg->kg_last_assigned;
	while ((kg->kg_avail_opennings <= 0) &&
	    (tda && (tda->td_priority > td->td_priority))) {
		/*
		 * None free, but there is one we can commandeer.
		 */
		CTR2(KTR_RUNQ,
		    "setrunqueue: kg:%p: take slot from td: %p", kg, tda);
		sched_rem(tda);
		tda = kg->kg_last_assigned =
		    TAILQ_PREV(tda, threadqueue, td_runq);
	}

	/*
	 * Add the thread to the ksegrp's run queue at
	 * the appropriate place.
	 */
	TAILQ_FOREACH(td2, &kg->kg_runq, td_runq) {
		if (td2->td_priority > td->td_priority) {
			TAILQ_INSERT_BEFORE(td2, td, td_runq);
			break;
		}
	}
	if (td2 == NULL) {
		/* We ran off the end of the TAILQ or it was empty. */
		TAILQ_INSERT_TAIL(&kg->kg_runq, td, td_runq);
	}

	/*
	 * If we have a slot to use, then put the thread on the system
	 * run queue and, if needed, readjust the last_assigned pointer.
	 * It may be that we need to schedule something anyhow,
	 * even if the available slots are negative, so that
	 * all the items < last_assigned are scheduled.
	 */
	if (kg->kg_avail_opennings > 0) {
		if (tda == NULL) {
			/*
			 * No pre-existing last assigned, so whoever is first
			 * gets the slot (maybe us).
			 */
			td2 = TAILQ_FIRST(&kg->kg_runq);
			kg->kg_last_assigned = td2;
		} else if (tda->td_priority > td->td_priority) {
			td2 = td;
		} else {
			/*
			 * We are past last_assigned, so
			 * give the next slot to whatever is next,
			 * which may or may not be us.
			 */
			td2 = TAILQ_NEXT(tda, td_runq);
			kg->kg_last_assigned = td2;
		}
		sched_add(td2, flags);
	} else {
		CTR3(KTR_RUNQ, "setrunqueue: held: td%p kg%p pid%d",
			td, td->td_ksegrp, td->td_proc->p_pid);
		if ((flags & SRQ_YIELDING) == 0)
			maybe_preempt_in_ksegrp(td);
	}
}

/*
 * Kernel thread preemption implementation.  Critical sections mark
 * regions of code in which preemptions are not allowed.
 */
void
critical_enter(void)
{
	struct thread *td;

	td = curthread;
	td->td_critnest++;
	CTR4(KTR_CRITICAL, "critical_enter by thread %p (%ld, %s) to %d", td,
	    (long)td->td_proc->p_pid, td->td_proc->p_comm, td->td_critnest);
}

void
critical_exit(void)
{
	struct thread *td;

	td = curthread;
	KASSERT(td->td_critnest != 0,
	    ("critical_exit: td_critnest == 0"));
#ifdef PREEMPTION
	if (td->td_critnest == 1) {
		td->td_critnest = 0;
		mtx_assert(&sched_lock, MA_NOTOWNED);
		if (td->td_owepreempt) {
			td->td_critnest = 1;
			mtx_lock_spin(&sched_lock);
			td->td_critnest--;
			mi_switch(SW_INVOL, NULL);
			mtx_unlock_spin(&sched_lock);
		}
	} else
#endif
		td->td_critnest--;

	CTR4(KTR_CRITICAL, "critical_exit by thread %p (%ld, %s) to %d", td,
	    (long)td->td_proc->p_pid, td->td_proc->p_comm, td->td_critnest);
}
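
/*
 * Usage sketch (illustrative only, not from the original file):
 * critical_enter()/critical_exit() bracket a region that must not be
 * preempted, which also pins the thread to its current CPU for the
 * duration.  The 'example_stats' array is a hypothetical per-CPU counter
 * invented for this example.
 */
#if 0
static u_int example_stats[MAXCPU];

static void
example_count_event(void)
{

	critical_enter();
	/*
	 * No preemption (and hence no migration) can occur here, so the
	 * cpuid read and the increment hit the same per-CPU slot.
	 */
	example_stats[PCPU_GET(cpuid)]++;
	critical_exit();
}
#endif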

/*
 * This function is called when a thread is about to be put on a run queue
 * because it has been made runnable or its priority has been adjusted.  It
 * determines whether we should immediately preempt to the new thread.  If
 * so, it switches to it and eventually returns true.  If not, it returns
 * false so that the caller may place the thread on an appropriate run
 * queue.
 */
int
maybe_preempt(struct thread *td)
{
#ifdef PREEMPTION
	struct thread *ctd;
	int cpri, pri;
#endif

	mtx_assert(&sched_lock, MA_OWNED);
#ifdef PREEMPTION
	/*
	 * The new thread should not preempt the current thread if any of the
	 * following conditions are true:
	 *
	 *  - The kernel is in the throes of crashing (panicstr).
	 *  - The current thread has a higher (numerically lower) or
	 *    equivalent priority.  Note that this prevents curthread from
	 *    trying to preempt to itself.
	 *  - It is too early in the boot for context switches (cold is set).
	 *  - The current thread has an inhibitor set or is in the process of
	 *    exiting.  In this case, the current thread is about to switch
	 *    out anyway, so there's no point in preempting.  If we did,
	 *    the current thread would not be properly resumed as well, so
	 *    just avoid that whole landmine.
	 *  - If the new thread's priority is not a realtime priority and
	 *    the current thread's priority is not an idle priority and
	 *    FULL_PREEMPTION is disabled.
	 *
	 * If all of these conditions are false, but the current thread is in
	 * a nested critical section, then we have to defer the preemption
	 * until we exit the critical section.  Otherwise, switch immediately
	 * to the new thread.
	 */
	ctd = curthread;
	KASSERT((ctd->td_kse != NULL && ctd->td_kse->ke_thread == ctd),
	  ("thread has no (or wrong) sched-private part."));
	KASSERT((td->td_inhibitors == 0),
			("maybe_preempt: trying to run inhibited thread"));
	pri = td->td_priority;
	cpri = ctd->td_priority;
	if (panicstr != NULL || pri >= cpri || cold /* || dumping */ ||
	    TD_IS_INHIBITED(ctd) || td->td_kse->ke_state != KES_THREAD)
		return (0);
#ifndef FULL_PREEMPTION
	if (pri > PRI_MAX_ITHD && cpri < PRI_MIN_IDLE)
		return (0);
#endif

	if (ctd->td_critnest > 1) {
		CTR1(KTR_PROC, "maybe_preempt: in critical section %d",
		    ctd->td_critnest);
		ctd->td_owepreempt = 1;
		return (0);
	}

	/*
	 * Thread is runnable but not yet put on system run queue.
	 */
	MPASS(TD_ON_RUNQ(td));
	MPASS(td->td_sched->ke_state != KES_ONRUNQ);
	if (td->td_proc->p_flag & P_HADTHREADS) {
		/*
		 * If this is a threaded process we actually ARE on the
		 * ksegrp run queue so take it off that first.
		 * Also undo any damage done to the last_assigned pointer.
		 * XXX Fix setrunqueue so this isn't needed.
		 */
		struct ksegrp *kg;

		kg = td->td_ksegrp;
		if (kg->kg_last_assigned == td)
			kg->kg_last_assigned =
			    TAILQ_PREV(td, threadqueue, td_runq);
		TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
	}

	TD_SET_RUNNING(td);
	CTR3(KTR_PROC, "preempting to thread %p (pid %d, %s)\n", td,
	    td->td_proc->p_pid, td->td_proc->p_comm);
	mi_switch(SW_INVOL|SW_PREEMPT, td);
	return (1);
#else
	return (0);
#endif
}
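
/*
 * Illustrative sketch of the contract described above (not taken from any
 * particular scheduler): a caller queueing a newly runnable thread first
 * offers it to maybe_preempt(); only when preemption is declined does the
 * thread actually go onto a run queue.  'my_runq' and 'flags' are
 * placeholders for whatever the calling scheduler would use.
 */
#if 0
	if (!maybe_preempt(td)) {
		/* Not preempting: queue the thread's KSE at its priority. */
		runq_add(my_runq, td->td_kse, flags);
		td->td_kse->ke_state = KES_ONRUNQ;
	}
#endif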

#if 0
#ifndef PREEMPTION
/* XXX: There should be a non-static version of this. */
static void
printf_caddr_t(void *data)
{
	printf("%s", (char *)data);
}
static char preempt_warning[] =
    "WARNING: Kernel preemption is disabled, expect reduced performance.\n";
SYSINIT(preempt_warning, SI_SUB_COPYRIGHT, SI_ORDER_ANY, printf_caddr_t,
    preempt_warning)
#endif
#endif

/************************************************************************
 * SYSTEM RUN QUEUE manipulations and tests				*
 ************************************************************************/
/*
 * Initialize a run structure.
 */
void
runq_init(struct runq *rq)
{
	int i;

	bzero(rq, sizeof *rq);
	for (i = 0; i < RQ_NQS; i++)
		TAILQ_INIT(&rq->rq_queues[i]);
}

/*
 * Clear the status bit of the queue corresponding to priority level pri,
 * indicating that it is empty.
 */
static __inline void
runq_clrbit(struct runq *rq, int pri)
{
	struct rqbits *rqb;

	rqb = &rq->rq_status;
	CTR4(KTR_RUNQ, "runq_clrbit: bits=%#x %#x bit=%#x word=%d",
	    rqb->rqb_bits[RQB_WORD(pri)],
	    rqb->rqb_bits[RQB_WORD(pri)] & ~RQB_BIT(pri),
	    RQB_BIT(pri), RQB_WORD(pri));
	rqb->rqb_bits[RQB_WORD(pri)] &= ~RQB_BIT(pri);
}

/*
 * Find the index of the first non-empty run queue.  This is done by
 * scanning the status bits, a set bit indicating a non-empty queue.
 */
static __inline int
runq_findbit(struct runq *rq)
{
	struct rqbits *rqb;
	int pri;
	int i;

	rqb = &rq->rq_status;
	for (i = 0; i < RQB_LEN; i++)
		if (rqb->rqb_bits[i]) {
			pri = RQB_FFS(rqb->rqb_bits[i]) + (i << RQB_L2BPW);
			CTR3(KTR_RUNQ, "runq_findbit: bits=%#x i=%d pri=%d",
			    rqb->rqb_bits[i], i, pri);
			return (pri);
		}

	return (-1);
}

/*
 * Set the status bit of the queue corresponding to priority level pri,
 * indicating that it is non-empty.
 */
static __inline void
runq_setbit(struct runq *rq, int pri)
{
	struct rqbits *rqb;

	rqb = &rq->rq_status;
	CTR4(KTR_RUNQ, "runq_setbit: bits=%#x %#x bit=%#x word=%d",
	    rqb->rqb_bits[RQB_WORD(pri)],
	    rqb->rqb_bits[RQB_WORD(pri)] | RQB_BIT(pri),
	    RQB_BIT(pri), RQB_WORD(pri));
	rqb->rqb_bits[RQB_WORD(pri)] |= RQB_BIT(pri);
}

/*
 * Add the KSE to the queue specified by its priority, and set the
 * corresponding status bit.
 */
void
runq_add(struct runq *rq, struct kse *ke, int flags)
{
	struct rqhead *rqh;
	int pri;

	pri = ke->ke_thread->td_priority / RQ_PPQ;
	ke->ke_rqindex = pri;
	runq_setbit(rq, pri);
	rqh = &rq->rq_queues[pri];
	CTR5(KTR_RUNQ, "runq_add: td=%p ke=%p pri=%d %d rqh=%p",
	    ke->ke_thread, ke, ke->ke_thread->td_priority, pri, rqh);
	if (flags & SRQ_PREEMPTED) {
		TAILQ_INSERT_HEAD(rqh, ke, ke_procq);
	} else {
		TAILQ_INSERT_TAIL(rqh, ke, ke_procq);
	}
}
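
/*
 * Worked example (editor's illustration, assuming the stock RQ_PPQ of 4
 * and 32-bit status words): a thread at priority 130 maps to queue index
 * 130 / 4 = 32, so runq_setbit() sets bit 32 % 32 = 0 of status word
 * 32 / 32 = 1.  All KSEs whose threads have priorities 128-131 share that
 * queue, in FIFO order except for SRQ_PREEMPTED insertions at the head.
 */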

/*
 * Return true if there are runnable processes of any priority on the run
 * queue, false otherwise.  Has no side effects, does not modify the run
 * queue structure.
 */
int
runq_check(struct runq *rq)
{
	struct rqbits *rqb;
	int i;

	rqb = &rq->rq_status;
	for (i = 0; i < RQB_LEN; i++)
		if (rqb->rqb_bits[i]) {
			CTR2(KTR_RUNQ, "runq_check: bits=%#x i=%d",
			    rqb->rqb_bits[i], i);
			return (1);
		}
	CTR0(KTR_RUNQ, "runq_check: empty");

	return (0);
}

#if defined(SMP) && defined(SCHED_4BSD)
int runq_fuzz = 1;
SYSCTL_INT(_kern_sched, OID_AUTO, runq_fuzz, CTLFLAG_RW, &runq_fuzz, 0, "");
#endif

/*
 * Find the highest priority process on the run queue.
 */
struct kse *
runq_choose(struct runq *rq)
{
	struct rqhead *rqh;
	struct kse *ke;
	int pri;

	mtx_assert(&sched_lock, MA_OWNED);
	while ((pri = runq_findbit(rq)) != -1) {
		rqh = &rq->rq_queues[pri];
#if defined(SMP) && defined(SCHED_4BSD)
		/* fuzz == 1 is normal.. 0 or less are ignored */
		if (runq_fuzz > 1) {
			/*
			 * In the first couple of entries, check if
			 * there is one for our CPU as a preference.
			 */
			int count = runq_fuzz;
			int cpu = PCPU_GET(cpuid);
			struct kse *ke2;
			ke2 = ke = TAILQ_FIRST(rqh);

			while (count-- && ke2) {
				if (ke2->ke_thread->td_lastcpu == cpu) {
					ke = ke2;
					break;
				}
				ke2 = TAILQ_NEXT(ke2, ke_procq);
			}
		} else
#endif
			ke = TAILQ_FIRST(rqh);
		KASSERT(ke != NULL, ("runq_choose: no proc on busy queue"));
		CTR3(KTR_RUNQ,
		    "runq_choose: pri=%d kse=%p rqh=%p", pri, ke, rqh);
		return (ke);
	}
	CTR1(KTR_RUNQ, "runq_choose: idleproc pri=%d", pri);

	return (NULL);
}

/*
 * Remove the KSE from the queue specified by its priority, and clear the
 * corresponding status bit if the queue becomes empty.
 * Caller must set ke->ke_state afterwards.
 */
void
runq_remove(struct runq *rq, struct kse *ke)
{
	struct rqhead *rqh;
	int pri;

	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
		("runq_remove: process swapped out"));
	pri = ke->ke_rqindex;
	rqh = &rq->rq_queues[pri];
	CTR5(KTR_RUNQ, "runq_remove: td=%p, ke=%p pri=%d %d rqh=%p",
	    ke->ke_thread, ke, ke->ke_thread->td_priority, pri, rqh);
	KASSERT(ke != NULL, ("runq_remove: no proc on busy queue"));
	TAILQ_REMOVE(rqh, ke, ke_procq);
	if (TAILQ_EMPTY(rqh)) {
		CTR0(KTR_RUNQ, "runq_remove: empty");
		runq_clrbit(rq, pri);
	}
}
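
/*
 * Illustrative usage sketch (not part of the original file): the run-queue
 * primitives above are driven by the per-scheduler code (e.g. SCHED_4BSD)
 * roughly as follows.  'example_rq' is a placeholder object, and sched_lock
 * must be held as the assertions above require.
 */
#if 0
static struct runq example_rq;

static void
example_runq_cycle(struct kse *ke)
{

	runq_init(&example_rq);			/* once, at scheduler start-up */
	runq_add(&example_rq, ke, 0);		/* queue a runnable KSE */
	if (runq_check(&example_rq)) {
		ke = runq_choose(&example_rq);	/* highest priority KSE */
		runq_remove(&example_rq, ke);	/* dequeue it to run */
		ke->ke_state = KES_THREAD;	/* per the contract above */
	}
}
#endif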

/****** functions that are temporarily here ***********/
#include <vm/uma.h>
extern struct mtx kse_zombie_lock;

/*
 * Allocate scheduler-specific per-process resources.
 * The thread and ksegrp have already been linked in.
 * In this case just set the default concurrency value.
 *
 * Called from:
 *  proc_init() (UMA init method)
 */
void
sched_newproc(struct proc *p, struct ksegrp *kg, struct thread *td)
{

	/* This can go in sched_fork */
	sched_init_concurrency(kg);
}

/*
 * A thread is being either created or recycled.
 * Fix up the per-scheduler resources associated with it.
 * Called from:
 *  sched_fork_thread()
 *  thread_dtor()  (*may go away)
 *  thread_init()  (*may go away)
 */
void
sched_newthread(struct thread *td)
{
	struct td_sched *ke;

	ke = (struct td_sched *) (td + 1);
	bzero(ke, sizeof(*ke));
	td->td_sched     = ke;
	ke->ke_thread	= td;
	ke->ke_state	= KES_THREAD;
}

/*
 * Set up an initial concurrency of 1 and set the given thread
 * (if given) to be using that concurrency slot.
 * May be used "offline", i.e. before the ksegrp is attached to the
 * world, in which case sched_lock is not needed.
 * Called from:
 *  thr_create()
 *  proc_init() (UMA) via sched_newproc()
 */
void
sched_init_concurrency(struct ksegrp *kg)
{

	CTR1(KTR_RUNQ, "kg %p init slots and concurrency to 1", kg);
	kg->kg_concurrency = 1;
	kg->kg_avail_opennings = 1;
}

/*
 * Change the concurrency of an existing ksegrp to N
 * Called from:
 *  kse_create()
 *  kse_exit()
 *  thread_exit()
 *  thread_single()
 */
void
sched_set_concurrency(struct ksegrp *kg, int concurrency)
{

	CTR4(KTR_RUNQ, "kg %p set concurrency to %d, slots %d -> %d",
	    kg,
	    concurrency,
	    kg->kg_avail_opennings,
	    kg->kg_avail_opennings + (concurrency - kg->kg_concurrency));
	kg->kg_avail_opennings += (concurrency - kg->kg_concurrency);
	kg->kg_concurrency = concurrency;
}
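
/*
 * Worked example of the slot arithmetic above (editor's illustration): a
 * ksegrp with kg_concurrency == 2 and one slot in use (kg_avail_opennings
 * == 1) that is grown to a concurrency of 4 gains the difference,
 * 4 - 2 = 2, ending with kg_avail_opennings == 3.  Shrinking back to 2
 * subtracts the same difference and may legitimately leave
 * kg_avail_opennings negative until enough threads leave the system run
 * queue (see setrunqueue() above).
 */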

/*
 * Called from thread_exit() for every exiting thread.
 *
 * Not to be confused with sched_exit_thread(), which is only called from
 * thread_exit() for threads exiting without the rest of the process
 * exiting; it is also called from sched_exit(), and we would not want to
 * call it twice.
 * XXX This can probably be fixed.
 */
void
sched_thread_exit(struct thread *td)
{

	SLOT_RELEASE(td->td_ksegrp);
	slot_fill(td->td_ksegrp);
}

#endif /* KERN_SWITCH_INCLUDE */
