/*-
 * Copyright (c) 2001 Jake Burkholder <jake@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/kern_switch.c 177435 2008-03-20 05:51:16Z jeff $");

#include "opt_sched.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>

#include <machine/cpu.h>

/* Uncomment this to enable logging of critical_enter/exit. */
#if 0
#define	KTR_CRITICAL	KTR_SCHED
#else
#define	KTR_CRITICAL	0
#endif

#ifdef FULL_PREEMPTION
#ifndef PREEMPTION
#error "The FULL_PREEMPTION option requires the PREEMPTION option"
#endif
#endif

CTASSERT((RQB_BPW * RQB_LEN) == RQ_NQS);

/*
 * kern.sched.preemption allows user space to determine if preemption support
 * is compiled in or not.  It is not currently a boot or runtime flag that
 * can be changed.
 */
#ifdef PREEMPTION
static int kern_sched_preemption = 1;
#else
static int kern_sched_preemption = 0;
#endif
SYSCTL_INT(_kern_sched, OID_AUTO, preemption, CTLFLAG_RD,
    &kern_sched_preemption, 0, "Kernel preemption enabled");
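
/*
 * Userland sketch (illustrative, not part of this file): the read-only
 * flag above can be queried with sysctlbyname(3); error handling policy
 * is up to the caller.
 *
 *	#include <sys/types.h>
 *	#include <sys/sysctl.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		int preempt;
 *		size_t len = sizeof(preempt);
 *
 *		if (sysctlbyname("kern.sched.preemption", &preempt, &len,
 *		    NULL, 0) == 0)
 *			printf("preemption compiled in: %d\n", preempt);
 *		return (0);
 *	}
 */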

#ifdef SCHED_STATS
long switch_preempt;
long switch_owepreempt;
long switch_turnstile;
long switch_sleepq;
long switch_sleepqtimo;
long switch_relinquish;
long switch_needresched;
static SYSCTL_NODE(_kern_sched, OID_AUTO, stats, CTLFLAG_RW, 0, "switch stats");
SYSCTL_LONG(_kern_sched_stats, OID_AUTO, preempt, CTLFLAG_RD, &switch_preempt, 0, "");
SYSCTL_LONG(_kern_sched_stats, OID_AUTO, owepreempt, CTLFLAG_RD, &switch_owepreempt, 0, "");
SYSCTL_LONG(_kern_sched_stats, OID_AUTO, turnstile, CTLFLAG_RD, &switch_turnstile, 0, "");
SYSCTL_LONG(_kern_sched_stats, OID_AUTO, sleepq, CTLFLAG_RD, &switch_sleepq, 0, "");
SYSCTL_LONG(_kern_sched_stats, OID_AUTO, sleepqtimo, CTLFLAG_RD, &switch_sleepqtimo, 0, "");
SYSCTL_LONG(_kern_sched_stats, OID_AUTO, relinquish, CTLFLAG_RD, &switch_relinquish, 0, "");
SYSCTL_LONG(_kern_sched_stats, OID_AUTO, needresched, CTLFLAG_RD, &switch_needresched, 0, "");
static int
sysctl_stats_reset(SYSCTL_HANDLER_ARGS)
{
	int error;
	int val;

	val = 0;
	error = sysctl_handle_int(oidp, &val, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (val == 0)
		return (0);
	switch_preempt = 0;
	switch_owepreempt = 0;
	switch_turnstile = 0;
	switch_sleepq = 0;
	switch_sleepqtimo = 0;
	switch_relinquish = 0;
	switch_needresched = 0;

	return (0);
}

SYSCTL_PROC(_kern_sched_stats, OID_AUTO, reset, CTLTYPE_INT | CTLFLAG_WR, NULL,
    0, sysctl_stats_reset, "I", "Reset scheduler statistics");
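
/*
 * Usage sketch: writing any non-zero value resets every counter above;
 * from userland this could be, e.g.:
 *
 *	int one = 1;
 *	sysctlbyname("kern.sched.stats.reset", NULL, NULL, &one,
 *	    sizeof(one));
 */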
#endif

/************************************************************************
 * Functions that manipulate runnability from a thread perspective.	*
 ************************************************************************/
/*
 * Select the thread that will be run next.
 */
struct thread *
choosethread(void)
{
	struct thread *td;

retry:
	td = sched_choose();

	/*
	 * If we are panicking, only allow system threads, plus the one we
	 * are running in, to be run.
	 */
	if (panicstr && ((td->td_proc->p_flag & P_SYSTEM) == 0 &&
	    (td->td_flags & TDF_INPANIC) == 0)) {
		/* Note that it is no longer on the run queue. */
		TD_SET_CAN_RUN(td);
		goto retry;
	}

	TD_SET_RUNNING(td);
	return (td);
}

/*
 * Kernel thread preemption implementation.  Critical sections mark
 * regions of code in which preemptions are not allowed.
 */
void
critical_enter(void)
{
	struct thread *td;

	td = curthread;
	td->td_critnest++;
	CTR4(KTR_CRITICAL, "critical_enter by thread %p (%ld, %s) to %d", td,
	    (long)td->td_proc->p_pid, td->td_name, td->td_critnest);
}

void
critical_exit(void)
{
	struct thread *td;

	td = curthread;
	KASSERT(td->td_critnest != 0,
	    ("critical_exit: td_critnest == 0"));

	if (td->td_critnest == 1) {
		td->td_critnest = 0;
		if (td->td_owepreempt) {
			/*
			 * Briefly re-enter a critical section so the owed
			 * preemption cannot recurse while we acquire the
			 * thread lock, then drop the nesting count and
			 * perform the deferred switch.
			 */
			td->td_critnest = 1;
			thread_lock(td);
			td->td_critnest--;
			SCHED_STAT_INC(switch_owepreempt);
			mi_switch(SW_INVOL|SW_PREEMPT, NULL);
			thread_unlock(td);
		}
	} else
		td->td_critnest--;

	CTR4(KTR_CRITICAL, "critical_exit by thread %p (%ld, %s) to %d", td,
	    (long)td->td_proc->p_pid, td->td_name, td->td_critnest);
}
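
/*
 * Usage sketch: a critical section is typically wrapped around a short
 * access to per-CPU state, where an untimely preemption (and possible
 * migration to another CPU) must be held off:
 *
 *	critical_enter();
 *	... work on PCPU data; no preemption can occur here ...
 *	critical_exit();
 *
 * Sections nest via td_critnest; a preemption requested in the meantime
 * is recorded in td_owepreempt and honored by the outermost
 * critical_exit().
 */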

/************************************************************************
 * SYSTEM RUN QUEUE manipulations and tests				*
 ************************************************************************/
/*
 * Initialize a run structure.
 */
void
runq_init(struct runq *rq)
{
	int i;

	bzero(rq, sizeof *rq);
	for (i = 0; i < RQ_NQS; i++)
		TAILQ_INIT(&rq->rq_queues[i]);
}
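
/*
 * Usage sketch: a scheduler embeds a run queue in its own state and
 * initializes it once at startup, along the lines of:
 *
 *	static struct runq runq;
 *	...
 *	runq_init(&runq);
 */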

/*
 * Clear the status bit of the queue corresponding to priority level pri,
 * indicating that it is empty.
 */
static __inline void
runq_clrbit(struct runq *rq, int pri)
{
	struct rqbits *rqb;

	rqb = &rq->rq_status;
	CTR4(KTR_RUNQ, "runq_clrbit: bits=%#x %#x bit=%#x word=%d",
	    rqb->rqb_bits[RQB_WORD(pri)],
	    rqb->rqb_bits[RQB_WORD(pri)] & ~RQB_BIT(pri),
	    RQB_BIT(pri), RQB_WORD(pri));
	rqb->rqb_bits[RQB_WORD(pri)] &= ~RQB_BIT(pri);
}

/*
 * Find the index of the first non-empty run queue.  This is done by
 * scanning the status bits; a set bit indicates a non-empty queue.
 */
static __inline int
runq_findbit(struct runq *rq)
{
	struct rqbits *rqb;
	int pri;
	int i;

	rqb = &rq->rq_status;
	for (i = 0; i < RQB_LEN; i++)
		if (rqb->rqb_bits[i]) {
			pri = RQB_FFS(rqb->rqb_bits[i]) + (i << RQB_L2BPW);
			CTR3(KTR_RUNQ, "runq_findbit: bits=%#x i=%d pri=%d",
			    rqb->rqb_bits[i], i, pri);
			return (pri);
		}

	return (-1);
}
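
/*
 * Worked example (assuming 32-bit status words, i.e. RQB_BPW == 32 and
 * RQB_L2BPW == 5): priority level 37 maps to word RQB_WORD(37) == 1 and
 * bit RQB_BIT(37) == 1 << (37 & 31) == 1 << 5.  runq_findbit() inverts
 * the mapping: with bit 5 set in word 1, RQB_FFS() yields 5 and
 * 5 + (1 << RQB_L2BPW) recovers pri == 37.
 */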

static __inline int
runq_findbit_from(struct runq *rq, u_char pri)
{
	struct rqbits *rqb;
	rqb_word_t mask;
	int i;

	/*
	 * Set the mask for the first word so we ignore priorities before 'pri'.
	 */
	mask = (rqb_word_t)-1 << (pri & (RQB_BPW - 1));
	rqb = &rq->rq_status;
again:
	for (i = RQB_WORD(pri); i < RQB_LEN; mask = -1, i++) {
		mask = rqb->rqb_bits[i] & mask;
		if (mask == 0)
			continue;
		pri = RQB_FFS(mask) + (i << RQB_L2BPW);
		CTR3(KTR_RUNQ, "runq_findbit_from: bits=%#x i=%d pri=%d",
		    mask, i, pri);
		return (pri);
	}
	if (pri == 0)
		return (-1);
	/*
	 * Wrap back around to the beginning of the list just once so we
	 * scan the whole thing.
	 */
	pri = 0;
	goto again;
}
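
/*
 * Example: called with pri == 40, the scan covers levels 40..RQ_NQS-1
 * first; if all of those are empty it wraps around once and scans from
 * level 0, so the whole bitmap is covered before -1 is returned.
 */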

/*
 * Set the status bit of the queue corresponding to priority level pri,
 * indicating that it is non-empty.
 */
static __inline void
runq_setbit(struct runq *rq, int pri)
{
	struct rqbits *rqb;

	rqb = &rq->rq_status;
	CTR4(KTR_RUNQ, "runq_setbit: bits=%#x %#x bit=%#x word=%d",
	    rqb->rqb_bits[RQB_WORD(pri)],
	    rqb->rqb_bits[RQB_WORD(pri)] | RQB_BIT(pri),
	    RQB_BIT(pri), RQB_WORD(pri));
	rqb->rqb_bits[RQB_WORD(pri)] |= RQB_BIT(pri);
}

/*
 * Add the thread to the queue specified by its priority, and set the
 * corresponding status bit.
 */
void
runq_add(struct runq *rq, struct thread *td, int flags)
{
	struct rqhead *rqh;
	int pri;

	pri = td->td_priority / RQ_PPQ;
	td->td_rqindex = pri;
	runq_setbit(rq, pri);
	rqh = &rq->rq_queues[pri];
	CTR4(KTR_RUNQ, "runq_add: td=%p pri=%d %d rqh=%p",
	    td, td->td_priority, pri, rqh);
	if (flags & SRQ_PREEMPTED) {
		TAILQ_INSERT_HEAD(rqh, td, td_runq);
	} else {
		TAILQ_INSERT_TAIL(rqh, td, td_runq);
	}
}
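
/*
 * Example (assuming the stock RQ_PPQ == 4 and RQ_NQS == 64): a thread
 * with td_priority 130 lands on queue 130 / 4 == 32, sharing it with
 * priority levels 128..131.  SRQ_PREEMPTED inserts at the head so a
 * preempted thread runs again before its queue-mates.
 */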

void
runq_add_pri(struct runq *rq, struct thread *td, u_char pri, int flags)
{
	struct rqhead *rqh;

	KASSERT(pri < RQ_NQS, ("runq_add_pri: %d out of range", pri));
	td->td_rqindex = pri;
	runq_setbit(rq, pri);
	rqh = &rq->rq_queues[pri];
	CTR4(KTR_RUNQ, "runq_add_pri: td=%p pri=%d idx=%d rqh=%p",
	    td, td->td_priority, pri, rqh);
	if (flags & SRQ_PREEMPTED) {
		TAILQ_INSERT_HEAD(rqh, td, td_runq);
	} else {
		TAILQ_INSERT_TAIL(rqh, td, td_runq);
	}
}
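
/*
 * Note the division of labor: runq_add() derives the queue index from
 * td_priority, while runq_add_pri() lets the caller choose the index
 * directly (ULE uses this for its circular timeshare queue, where the
 * insertion index rotates independently of the thread's priority).
 */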

/*
 * Return true if there are runnable threads of any priority on the run
 * queue, false otherwise.  Has no side effects and does not modify the
 * run queue structure.
 */
int
runq_check(struct runq *rq)
{
	struct rqbits *rqb;
	int i;

	rqb = &rq->rq_status;
	for (i = 0; i < RQB_LEN; i++)
		if (rqb->rqb_bits[i]) {
			CTR2(KTR_RUNQ, "runq_check: bits=%#x i=%d",
			    rqb->rqb_bits[i], i);
			return (1);
		}
	CTR0(KTR_RUNQ, "runq_check: empty");

	return (0);
}

/*
 * Find the highest priority thread on the run queue.
 */
struct thread *
runq_choose_fuzz(struct runq *rq, int fuzz)
{
	struct rqhead *rqh;
	struct thread *td;
	int pri;

	while ((pri = runq_findbit(rq)) != -1) {
		rqh = &rq->rq_queues[pri];
		/* fuzz == 1 is normal; 0 or less are ignored. */
		if (fuzz > 1) {
			/*
			 * In the first couple of entries, check if
			 * there is one for our CPU as a preference.
			 */
			int count = fuzz;
			int cpu = PCPU_GET(cpuid);
			struct thread *td2;
			td2 = td = TAILQ_FIRST(rqh);

			while (count-- && td2) {
				if (td2->td_lastcpu == cpu) {
					td = td2;
					break;
				}
				td2 = TAILQ_NEXT(td2, td_runq);
			}
		} else
			td = TAILQ_FIRST(rqh);
		KASSERT(td != NULL, ("runq_choose_fuzz: no proc on busy queue"));
		CTR3(KTR_RUNQ,
		    "runq_choose_fuzz: pri=%d thread=%p rqh=%p", pri, td, rqh);
		return (td);
	}
	CTR1(KTR_RUNQ, "runq_choose_fuzz: idleproc pri=%d", pri);

	return (NULL);
}
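
/*
 * Usage sketch: a caller wanting mild CPU affinity within a queue might
 * widen the search window, e.g.
 *
 *	td = runq_choose_fuzz(&runq, 2);
 *
 * which prefers, among the first two threads of the best queue, one that
 * last ran on the current CPU (the 4BSD scheduler drives this from its
 * kern.sched.runq_fuzz tunable).
 */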

/*
 * Find the highest priority thread on the run queue.
 */
struct thread *
runq_choose(struct runq *rq)
{
	struct rqhead *rqh;
	struct thread *td;
	int pri;

	while ((pri = runq_findbit(rq)) != -1) {
		rqh = &rq->rq_queues[pri];
		td = TAILQ_FIRST(rqh);
		KASSERT(td != NULL, ("runq_choose: no thread on busy queue"));
		CTR3(KTR_RUNQ,
		    "runq_choose: pri=%d thread=%p rqh=%p", pri, td, rqh);
		return (td);
	}
	CTR1(KTR_RUNQ, "runq_choose: idlethread pri=%d", pri);

	return (NULL);
}

struct thread *
runq_choose_from(struct runq *rq, u_char idx)
{
	struct rqhead *rqh;
	struct thread *td;
	int pri;

	if ((pri = runq_findbit_from(rq, idx)) != -1) {
		rqh = &rq->rq_queues[pri];
		td = TAILQ_FIRST(rqh);
		KASSERT(td != NULL, ("runq_choose_from: no thread on busy queue"));
		CTR4(KTR_RUNQ,
		    "runq_choose_from: pri=%d thread=%p idx=%d rqh=%p",
		    pri, td, td->td_rqindex, rqh);
		return (td);
	}
	CTR1(KTR_RUNQ, "runq_choose_from: idlethread pri=%d", pri);

	return (NULL);
}

/*
 * Remove the thread from the queue specified by its priority, and clear the
 * corresponding status bit if the queue becomes empty.
 * Caller must set state afterwards.
 */
void
runq_remove(struct runq *rq, struct thread *td)
{

	runq_remove_idx(rq, td, NULL);
}

void
runq_remove_idx(struct runq *rq, struct thread *td, u_char *idx)
{
	struct rqhead *rqh;
	u_char pri;

	KASSERT(td->td_flags & TDF_INMEM,
	    ("runq_remove_idx: thread swapped out"));
	pri = td->td_rqindex;
	KASSERT(pri < RQ_NQS, ("runq_remove_idx: Invalid index %d", pri));
	rqh = &rq->rq_queues[pri];
	CTR4(KTR_RUNQ, "runq_remove_idx: td=%p, pri=%d %d rqh=%p",
	    td, td->td_priority, pri, rqh);
	TAILQ_REMOVE(rqh, td, td_runq);
	if (TAILQ_EMPTY(rqh)) {
		CTR0(KTR_RUNQ, "runq_remove_idx: empty");
		runq_clrbit(rq, pri);
		if (idx != NULL && *idx == pri)
			*idx = (pri + 1) % RQ_NQS;
	}
}
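
/*
 * Example: if *idx == 12 and removing td leaves queue 12 empty, *idx
 * advances to 13 (mod RQ_NQS), so a later runq_findbit_from(rq, *idx)
 * resumes its circular scan just past the queue that drained.
 */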
475