/*-
 * Copyright (c) 2001 Jake Burkholder <jake@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/kern_switch.c 194936 2009-06-25 01:33:51Z jeff $");

#include "opt_sched.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>

#include <machine/cpu.h>

/* Change the #if 0 below to #if 1 to enable logging of critical_enter/exit. */
#if 0
#define	KTR_CRITICAL	KTR_SCHED
#else
#define	KTR_CRITICAL	0
#endif

#ifdef FULL_PREEMPTION
#ifndef PREEMPTION
#error "The FULL_PREEMPTION option requires the PREEMPTION option"
#endif
#endif

CTASSERT((RQB_BPW * RQB_LEN) == RQ_NQS);
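
/*
 * A worked sketch of the assertion above, assuming a machine with
 * 32-bit status words (RQB_BPW == 32, RQB_LEN == 2, as on i386) and
 * the stock RQ_NQS == 64 run queues: 32 * 2 == 64, so there is exactly
 * one status bit per queue.  Every bit-index computation below depends
 * on this relationship, which is why a mismatch is caught at compile
 * time.
 */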

/*
 * kern.sched.preemption allows user space to determine if preemption support
 * is compiled in or not.  It is not currently a boot or runtime flag that
 * can be changed.
 */
#ifdef PREEMPTION
static int kern_sched_preemption = 1;
#else
static int kern_sched_preemption = 0;
#endif
SYSCTL_INT(_kern_sched, OID_AUTO, preemption, CTLFLAG_RD,
    &kern_sched_preemption, 0, "Kernel preemption enabled");

/*
 * Support for scheduler stats exported via kern.sched.stats.  All stats may
 * be reset with kern.sched.stats.reset = 1.  Stats may be defined elsewhere
 * with SCHED_STAT_DEFINE().
 */
#ifdef SCHED_STATS
SYSCTL_NODE(_kern_sched, OID_AUTO, stats, CTLFLAG_RW, 0, "switch stats");

/* Switch reasons from mi_switch(). */
DPCPU_DEFINE(long, sched_switch_stats[SWT_COUNT]);
SCHED_STAT_DEFINE_VAR(uncategorized,
    &DPCPU_NAME(sched_switch_stats[SWT_NONE]), "");
SCHED_STAT_DEFINE_VAR(preempt,
    &DPCPU_NAME(sched_switch_stats[SWT_PREEMPT]), "");
SCHED_STAT_DEFINE_VAR(owepreempt,
    &DPCPU_NAME(sched_switch_stats[SWT_OWEPREEMPT]), "");
SCHED_STAT_DEFINE_VAR(turnstile,
    &DPCPU_NAME(sched_switch_stats[SWT_TURNSTILE]), "");
SCHED_STAT_DEFINE_VAR(sleepq,
    &DPCPU_NAME(sched_switch_stats[SWT_SLEEPQ]), "");
SCHED_STAT_DEFINE_VAR(sleepqtimo,
    &DPCPU_NAME(sched_switch_stats[SWT_SLEEPQTIMO]), "");
SCHED_STAT_DEFINE_VAR(relinquish,
    &DPCPU_NAME(sched_switch_stats[SWT_RELINQUISH]), "");
SCHED_STAT_DEFINE_VAR(needresched,
    &DPCPU_NAME(sched_switch_stats[SWT_NEEDRESCHED]), "");
SCHED_STAT_DEFINE_VAR(idle,
    &DPCPU_NAME(sched_switch_stats[SWT_IDLE]), "");
SCHED_STAT_DEFINE_VAR(iwait,
    &DPCPU_NAME(sched_switch_stats[SWT_IWAIT]), "");
SCHED_STAT_DEFINE_VAR(suspend,
    &DPCPU_NAME(sched_switch_stats[SWT_SUSPEND]), "");
SCHED_STAT_DEFINE_VAR(remotepreempt,
    &DPCPU_NAME(sched_switch_stats[SWT_REMOTEPREEMPT]), "");
SCHED_STAT_DEFINE_VAR(remotewakeidle,
    &DPCPU_NAME(sched_switch_stats[SWT_REMOTEWAKEIDLE]), "");

static int
sysctl_stats_reset(SYSCTL_HANDLER_ARGS)
{
	struct sysctl_oid *p;
	uintptr_t counter;
	int error;
	int val;
	int i;

	val = 0;
	error = sysctl_handle_int(oidp, &val, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (val == 0)
		return (0);
	/*
	 * Traverse the list of children of _kern_sched_stats and reset each
	 * to 0.  Skip the reset entry.
	 */
	SLIST_FOREACH(p, oidp->oid_parent, oid_link) {
		if (p == oidp || p->oid_arg1 == NULL)
			continue;
		counter = (uintptr_t)p->oid_arg1;
		for (i = 0; i <= mp_maxid; i++) {
			if (CPU_ABSENT(i))
				continue;
			*(long *)(dpcpu_off[i] + counter) = 0;
		}
	}
	return (0);
}

SYSCTL_PROC(_kern_sched_stats, OID_AUTO, reset, CTLTYPE_INT | CTLFLAG_WR, NULL,
    0, sysctl_stats_reset, "I", "Reset scheduler statistics");
#endif
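
/*
 * Usage sketch: on a kernel built with SCHED_STATS, the per-CPU switch
 * counters defined above can be inspected and cleared from userland
 * with sysctl(8):
 *
 *	sysctl kern.sched.stats
 *	sysctl kern.sched.stats.reset=1
 */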

/************************************************************************
 * Functions that manipulate runnability from a thread perspective.	*
 ************************************************************************/
/*
 * Select the thread that will be run next.
 */
struct thread *
choosethread(void)
{
	struct thread *td;

retry:
	td = sched_choose();

	/*
	 * If we are in a panic, only allow system threads,
	 * plus the one we are running in, to be run.
	 */
	if (panicstr && ((td->td_proc->p_flag & P_SYSTEM) == 0 &&
	    (td->td_flags & TDF_INPANIC) == 0)) {
		/* note that it is no longer on the run queue */
		TD_SET_CAN_RUN(td);
		goto retry;
	}

	TD_SET_RUNNING(td);
	return (td);
}

/*
 * Kernel thread preemption implementation.  Critical sections mark
 * regions of code in which preemptions are not allowed.
 */
void
critical_enter(void)
{
	struct thread *td;

	td = curthread;
	td->td_critnest++;
	CTR4(KTR_CRITICAL, "critical_enter by thread %p (%ld, %s) to %d", td,
	    (long)td->td_proc->p_pid, td->td_name, td->td_critnest);
}

void
critical_exit(void)
{
	struct thread *td;
	int flags;

	td = curthread;
	KASSERT(td->td_critnest != 0,
	    ("critical_exit: td_critnest == 0"));

	if (td->td_critnest == 1) {
		td->td_critnest = 0;
		if (td->td_owepreempt) {
			td->td_critnest = 1;
			thread_lock(td);
			td->td_critnest--;
			flags = SW_INVOL | SW_PREEMPT;
			if (TD_IS_IDLETHREAD(td))
				flags |= SWT_IDLE;
			else
				flags |= SWT_OWEPREEMPT;
			mi_switch(flags, NULL);
			thread_unlock(td);
		}
	} else
		td->td_critnest--;

	CTR4(KTR_CRITICAL, "critical_exit by thread %p (%ld, %s) to %d", td,
	    (long)td->td_proc->p_pid, td->td_name, td->td_critnest);
}
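
/*
 * Usage sketch (an illustration, not code from this file): critical
 * sections nest, and no preemption or CPU migration can occur while
 * td_critnest > 0.  A preemption requested inside the section is
 * recorded in td_owepreempt and performed by the outermost
 * critical_exit():
 *
 *	critical_enter();
 *	... access per-CPU data without being switched out ...
 *	critical_exit();	(deferred preemption, if any, happens here)
 */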

/************************************************************************
 * SYSTEM RUN QUEUE manipulations and tests				*
 ************************************************************************/
/*
 * Initialize a run structure.
 */
void
runq_init(struct runq *rq)
{
	int i;

	bzero(rq, sizeof *rq);
	for (i = 0; i < RQ_NQS; i++)
		TAILQ_INIT(&rq->rq_queues[i]);
}

/*
 * Clear the status bit of the queue corresponding to priority level pri,
 * indicating that it is empty.
 */
static __inline void
runq_clrbit(struct runq *rq, int pri)
{
	struct rqbits *rqb;

	rqb = &rq->rq_status;
	CTR4(KTR_RUNQ, "runq_clrbit: bits=%#x %#x bit=%#x word=%d",
	    rqb->rqb_bits[RQB_WORD(pri)],
	    rqb->rqb_bits[RQB_WORD(pri)] & ~RQB_BIT(pri),
	    RQB_BIT(pri), RQB_WORD(pri));
	rqb->rqb_bits[RQB_WORD(pri)] &= ~RQB_BIT(pri);
}

/*
 * Find the index of the first non-empty run queue.  This is done by
 * scanning the status bits; a set bit indicates a non-empty queue.
 */
static __inline int
runq_findbit(struct runq *rq)
{
	struct rqbits *rqb;
	int pri;
	int i;

	rqb = &rq->rq_status;
	for (i = 0; i < RQB_LEN; i++)
		if (rqb->rqb_bits[i]) {
			pri = RQB_FFS(rqb->rqb_bits[i]) + (i << RQB_L2BPW);
			CTR3(KTR_RUNQ, "runq_findbit: bits=%#x i=%d pri=%d",
			    rqb->rqb_bits[i], i, pri);
			return (pri);
		}

	return (-1);
}
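
/*
 * Worked example, assuming 32-bit status words and RQB_FFS(x) ==
 * ffs(x) - 1 as on i386: with rqb_bits[0] == 0 and rqb_bits[1] ==
 * 0x120, word 1 is the first non-zero word, RQB_FFS(0x120) == 5, and
 * the result is 5 + (1 << RQB_L2BPW) == 37, the lowest-numbered (i.e.
 * highest-priority) non-empty queue.
 */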

static __inline int
runq_findbit_from(struct runq *rq, u_char pri)
{
	struct rqbits *rqb;
	rqb_word_t mask;
	int i;

	/*
	 * Set the mask for the first word so we ignore priorities before 'pri'.
	 */
	mask = (rqb_word_t)-1 << (pri & (RQB_BPW - 1));
	rqb = &rq->rq_status;
again:
	for (i = RQB_WORD(pri); i < RQB_LEN; mask = -1, i++) {
		mask = rqb->rqb_bits[i] & mask;
		if (mask == 0)
			continue;
		pri = RQB_FFS(mask) + (i << RQB_L2BPW);
		CTR3(KTR_RUNQ, "runq_findbit_from: bits=%#x i=%d pri=%d",
		    mask, i, pri);
		return (pri);
	}
	if (pri == 0)
		return (-1);
	/*
	 * Wrap back around to the beginning of the list just once so we
	 * scan the whole thing.
	 */
	pri = 0;
	goto again;
}
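
/*
 * Continuing the example above: for pri == 37 the initial mask is
 * ~(rqb_word_t)0 << (37 & 31) == 0xffffffe0, so the first pass over
 * word 1 ignores priorities 32..36.  If no queue at or after 'pri' is
 * occupied, the scan wraps to priority 0 exactly once; ULE relies on
 * this to treat its timeshare queue as circular.
 */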

/*
 * Set the status bit of the queue corresponding to priority level pri,
 * indicating that it is non-empty.
 */
static __inline void
runq_setbit(struct runq *rq, int pri)
{
	struct rqbits *rqb;

	rqb = &rq->rq_status;
	CTR4(KTR_RUNQ, "runq_setbit: bits=%#x %#x bit=%#x word=%d",
	    rqb->rqb_bits[RQB_WORD(pri)],
	    rqb->rqb_bits[RQB_WORD(pri)] | RQB_BIT(pri),
	    RQB_BIT(pri), RQB_WORD(pri));
	rqb->rqb_bits[RQB_WORD(pri)] |= RQB_BIT(pri);
}

/*
 * Add the thread to the queue specified by its priority, and set the
 * corresponding status bit.
 */
void
runq_add(struct runq *rq, struct thread *td, int flags)
{
	struct rqhead *rqh;
	int pri;

	pri = td->td_priority / RQ_PPQ;
	td->td_rqindex = pri;
	runq_setbit(rq, pri);
	rqh = &rq->rq_queues[pri];
	CTR4(KTR_RUNQ, "runq_add: td=%p pri=%d %d rqh=%p",
	    td, td->td_priority, pri, rqh);
	if (flags & SRQ_PREEMPTED) {
		TAILQ_INSERT_HEAD(rqh, td, td_runq);
	} else {
		TAILQ_INSERT_TAIL(rqh, td, td_runq);
	}
}
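
/*
 * Example, assuming the stock RQ_PPQ == 4: a thread with td_priority
 * 120 lands on queue 120 / 4 == 30 and shares it with priorities
 * 120..123.  SRQ_PREEMPTED threads are inserted at the head so that a
 * preempted thread runs again before its equal-priority peers.
 */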

void
runq_add_pri(struct runq *rq, struct thread *td, u_char pri, int flags)
{
	struct rqhead *rqh;

	KASSERT(pri < RQ_NQS, ("runq_add_pri: %d out of range", pri));
	td->td_rqindex = pri;
	runq_setbit(rq, pri);
	rqh = &rq->rq_queues[pri];
	CTR4(KTR_RUNQ, "runq_add_pri: td=%p pri=%d idx=%d rqh=%p",
	    td, td->td_priority, pri, rqh);
	if (flags & SRQ_PREEMPTED) {
		TAILQ_INSERT_HEAD(rqh, td, td_runq);
	} else {
		TAILQ_INSERT_TAIL(rqh, td, td_runq);
	}
}

/*
 * Return true if there are runnable threads of any priority on the run
 * queue, false otherwise.  Has no side effects and does not modify the run
 * queue structure.
 */
int
runq_check(struct runq *rq)
{
	struct rqbits *rqb;
	int i;

	rqb = &rq->rq_status;
	for (i = 0; i < RQB_LEN; i++)
		if (rqb->rqb_bits[i]) {
			CTR2(KTR_RUNQ, "runq_check: bits=%#x i=%d",
			    rqb->rqb_bits[i], i);
			return (1);
		}
	CTR0(KTR_RUNQ, "runq_check: empty");

	return (0);
}

/*
 * Find the highest priority thread on the run queue.
 */
struct thread *
runq_choose_fuzz(struct runq *rq, int fuzz)
{
	struct rqhead *rqh;
	struct thread *td;
	int pri;

	while ((pri = runq_findbit(rq)) != -1) {
		rqh = &rq->rq_queues[pri];
		/* fuzz == 1 is normal; 0 or less are ignored. */
		if (fuzz > 1) {
			/*
			 * In the first couple of entries, check if
			 * there is one for our CPU as a preference.
			 */
			int count = fuzz;
			int cpu = PCPU_GET(cpuid);
			struct thread *td2;
			td2 = td = TAILQ_FIRST(rqh);

			while (count-- && td2) {
				if (td2->td_lastcpu == cpu) {
					td = td2;
					break;
				}
				td2 = TAILQ_NEXT(td2, td_runq);
			}
		} else
			td = TAILQ_FIRST(rqh);
		KASSERT(td != NULL, ("runq_choose_fuzz: no thread on busy queue"));
		CTR3(KTR_RUNQ,
		    "runq_choose_fuzz: pri=%d thread=%p rqh=%p", pri, td, rqh);
		return (td);
	}
	CTR1(KTR_RUNQ, "runq_choose_fuzz: idlethread pri=%d", pri);

	return (NULL);
}

/*
 * Find the highest priority thread on the run queue.
 */
struct thread *
runq_choose(struct runq *rq)
{
	struct rqhead *rqh;
	struct thread *td;
	int pri;

	while ((pri = runq_findbit(rq)) != -1) {
		rqh = &rq->rq_queues[pri];
		td = TAILQ_FIRST(rqh);
		KASSERT(td != NULL, ("runq_choose: no thread on busy queue"));
		CTR3(KTR_RUNQ,
		    "runq_choose: pri=%d thread=%p rqh=%p", pri, td, rqh);
		return (td);
	}
	CTR1(KTR_RUNQ, "runq_choose: idlethread pri=%d", pri);

	return (NULL);
}
struct thread *
runq_choose_from(struct runq *rq, u_char idx)
{
	struct rqhead *rqh;
	struct thread *td;
	int pri;

	if ((pri = runq_findbit_from(rq, idx)) != -1) {
		rqh = &rq->rq_queues[pri];
		td = TAILQ_FIRST(rqh);
		KASSERT(td != NULL, ("runq_choose_from: no thread on busy queue"));
		CTR4(KTR_RUNQ,
		    "runq_choose_from: pri=%d thread=%p idx=%d rqh=%p",
		    pri, td, td->td_rqindex, rqh);
		return (td);
	}
	CTR1(KTR_RUNQ, "runq_choose_from: idlethread pri=%d", pri);

	return (NULL);
}

/*
 * Remove the thread from the queue specified by its priority, and clear the
 * corresponding status bit if the queue becomes empty.
 * Caller must set state afterwards.
 */
void
runq_remove(struct runq *rq, struct thread *td)
{

	runq_remove_idx(rq, td, NULL);
}

void
runq_remove_idx(struct runq *rq, struct thread *td, u_char *idx)
{
	struct rqhead *rqh;
	u_char pri;

	KASSERT(td->td_flags & TDF_INMEM,
		("runq_remove_idx: thread swapped out"));
	pri = td->td_rqindex;
	KASSERT(pri < RQ_NQS, ("runq_remove_idx: Invalid index %d\n", pri));
	rqh = &rq->rq_queues[pri];
	CTR4(KTR_RUNQ, "runq_remove_idx: td=%p, pri=%d %d rqh=%p",
	    td, td->td_priority, pri, rqh);
	TAILQ_REMOVE(rqh, td, td_runq);
	if (TAILQ_EMPTY(rqh)) {
		CTR0(KTR_RUNQ, "runq_remove_idx: empty");
		runq_clrbit(rq, pri);
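		/*
		 * If the caller maintains a circular scan position (as
		 * ULE does for its timeshare queue), advance it past
		 * the queue that just emptied, wrapping modulo RQ_NQS.
		 */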
		if (idx != NULL && *idx == pri)
			*idx = (pri + 1) % RQ_NQS;
	}
}