kern_switch.c revision 177419
/*-
 * Copyright (c) 2001 Jake Burkholder <jake@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/kern_switch.c 177419 2008-03-20 02:14:02Z jeff $");

#include "opt_sched.h"

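/*
 * Note: the bulk of this file lives under KERN_SWITCH_INCLUDE below.  A
 * scheduler implementation (e.g. sched_4bsd.c or sched_ule.c) is expected to
 * define KERN_SWITCH_INCLUDE and then #include this file, so that the run
 * queue and critical section code is compiled together with the scheduler
 * that uses it; compiled on its own, this file only pulls in the headers
 * listed in the first branch below.
 */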
#ifndef KERN_SWITCH_INCLUDE
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/sched.h>
#else  /* KERN_SWITCH_INCLUDE */
#if defined(SMP) && (defined(__i386__) || defined(__amd64__))
#include <sys/smp.h>
#endif

#include <machine/cpu.h>

/* Change the "#if 0" below to "#if 1" to enable logging of critical_enter/exit. */
#if 0
#define	KTR_CRITICAL	KTR_SCHED
#else
#define	KTR_CRITICAL	0
#endif

#ifdef FULL_PREEMPTION
#ifndef PREEMPTION
#error "The FULL_PREEMPTION option requires the PREEMPTION option"
#endif
#endif

CTASSERT((RQB_BPW * RQB_LEN) == RQ_NQS);

/*
 * kern.sched.preemption allows user space to determine if preemption support
 * is compiled in or not.  It is not currently a boot or runtime flag that
 * can be changed.
 */
#ifdef PREEMPTION
static int kern_sched_preemption = 1;
#else
static int kern_sched_preemption = 0;
#endif
SYSCTL_INT(_kern_sched, OID_AUTO, preemption, CTLFLAG_RD,
    &kern_sched_preemption, 0, "Kernel preemption enabled");
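/*
 * From user space, "sysctl kern.sched.preemption" reports 1 on kernels built
 * with options PREEMPTION and 0 otherwise; the node is read-only.
 */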

#ifdef SCHED_STATS
long switch_preempt;
long switch_owepreempt;
long switch_turnstile;
long switch_sleepq;
long switch_sleepqtimo;
long switch_relinquish;
long switch_needresched;
static SYSCTL_NODE(_kern_sched, OID_AUTO, stats, CTLFLAG_RW, 0, "switch stats");
SYSCTL_LONG(_kern_sched_stats, OID_AUTO, preempt, CTLFLAG_RD, &switch_preempt, 0, "");
SYSCTL_LONG(_kern_sched_stats, OID_AUTO, owepreempt, CTLFLAG_RD, &switch_owepreempt, 0, "");
SYSCTL_LONG(_kern_sched_stats, OID_AUTO, turnstile, CTLFLAG_RD, &switch_turnstile, 0, "");
SYSCTL_LONG(_kern_sched_stats, OID_AUTO, sleepq, CTLFLAG_RD, &switch_sleepq, 0, "");
SYSCTL_LONG(_kern_sched_stats, OID_AUTO, sleepqtimo, CTLFLAG_RD, &switch_sleepqtimo, 0, "");
SYSCTL_LONG(_kern_sched_stats, OID_AUTO, relinquish, CTLFLAG_RD, &switch_relinquish, 0, "");
SYSCTL_LONG(_kern_sched_stats, OID_AUTO, needresched, CTLFLAG_RD, &switch_needresched, 0, "");
static int
sysctl_stats_reset(SYSCTL_HANDLER_ARGS)
{
	int error;
	int val;

	val = 0;
	error = sysctl_handle_int(oidp, &val, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (val == 0)
		return (0);
	switch_preempt = 0;
	switch_owepreempt = 0;
	switch_turnstile = 0;
	switch_sleepq = 0;
	switch_sleepqtimo = 0;
	switch_relinquish = 0;
	switch_needresched = 0;

	return (0);
}

SYSCTL_PROC(_kern_sched_stats, OID_AUTO, reset, CTLTYPE_INT | CTLFLAG_WR, NULL,
    0, sysctl_stats_reset, "I", "Reset scheduler statistics");
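/*
 * Writing any non-zero value, e.g. "sysctl kern.sched.stats.reset=1",
 * zeroes all of the counters above; writing 0 is a no-op.
 */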
#endif

/************************************************************************
 * Functions that manipulate runnability from a thread perspective.	*
 ************************************************************************/
/*
 * Select the thread that will be run next.
 */
struct thread *
choosethread(void)
{
	struct thread *td;

retry:
	td = sched_choose();

	/*
	 * If we are in panic, only allow system threads,
	 * plus the one we are running in, to be run.
	 */
	if (panicstr && ((td->td_proc->p_flag & P_SYSTEM) == 0 &&
	    (td->td_flags & TDF_INPANIC) == 0)) {
		/* note that it is no longer on the run queue */
		TD_SET_CAN_RUN(td);
		goto retry;
	}

	TD_SET_RUNNING(td);
	return (td);
}

/*
 * Kernel thread preemption implementation.  Critical sections mark
 * regions of code in which preemptions are not allowed.
 */
void
critical_enter(void)
{
	struct thread *td;

	td = curthread;
	td->td_critnest++;
	CTR4(KTR_CRITICAL, "critical_enter by thread %p (%ld, %s) to %d", td,
	    (long)td->td_proc->p_pid, td->td_name, td->td_critnest);
}

void
critical_exit(void)
{
	struct thread *td;

	td = curthread;
	KASSERT(td->td_critnest != 0,
	    ("critical_exit: td_critnest == 0"));

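	/*
	 * Leaving the outermost critical section: if a preemption was
	 * deferred while we were inside it (td_owepreempt is set), briefly
	 * restore td_critnest to 1 so an interrupt cannot preempt us while
	 * we take the thread lock, then drop the count under the lock and
	 * perform the deferred switch.
	 */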
	if (td->td_critnest == 1) {
		td->td_critnest = 0;
		if (td->td_owepreempt) {
			td->td_critnest = 1;
			thread_lock(td);
			td->td_critnest--;
			SCHED_STAT_INC(switch_owepreempt);
			mi_switch(SW_INVOL|SW_PREEMPT, NULL);
			thread_unlock(td);
		}
	} else
		td->td_critnest--;

	CTR4(KTR_CRITICAL, "critical_exit by thread %p (%ld, %s) to %d", td,
	    (long)td->td_proc->p_pid, td->td_name, td->td_critnest);
}

/************************************************************************
 * SYSTEM RUN QUEUE manipulations and tests				*
 ************************************************************************/
/*
 * Initialize a run structure.
 */
void
runq_init(struct runq *rq)
{
	int i;

	bzero(rq, sizeof *rq);
	for (i = 0; i < RQ_NQS; i++)
		TAILQ_INIT(&rq->rq_queues[i]);
}

/*
 * Clear the status bit of the queue corresponding to priority level pri,
 * indicating that it is empty.
 */
static __inline void
runq_clrbit(struct runq *rq, int pri)
{
	struct rqbits *rqb;

	rqb = &rq->rq_status;
	CTR4(KTR_RUNQ, "runq_clrbit: bits=%#x %#x bit=%#x word=%d",
	    rqb->rqb_bits[RQB_WORD(pri)],
	    rqb->rqb_bits[RQB_WORD(pri)] & ~RQB_BIT(pri),
	    RQB_BIT(pri), RQB_WORD(pri));
	rqb->rqb_bits[RQB_WORD(pri)] &= ~RQB_BIT(pri);
}

/*
 * Find the index of the first non-empty run queue.  This is done by
 * scanning the status bits; a set bit indicates a non-empty queue.
 */
static __inline int
runq_findbit(struct runq *rq)
{
	struct rqbits *rqb;
	int pri;
	int i;

	rqb = &rq->rq_status;
	for (i = 0; i < RQB_LEN; i++)
		if (rqb->rqb_bits[i]) {
			pri = RQB_FFS(rqb->rqb_bits[i]) + (i << RQB_L2BPW);
			CTR3(KTR_RUNQ, "runq_findbit: bits=%#x i=%d pri=%d",
			    rqb->rqb_bits[i], i, pri);
			return (pri);
		}

	return (-1);
}

static __inline int
runq_findbit_from(struct runq *rq, u_char pri)
{
	struct rqbits *rqb;
	rqb_word_t mask;
	int i;

	/*
	 * Set the mask for the first word so we ignore priorities before 'pri'.
	 */
	mask = (rqb_word_t)-1 << (pri & (RQB_BPW - 1));
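	/*
	 * For example (assuming 32-bit status words), pri == 37 starts the
	 * scan at word 1 with the low 5 bits masked off, so queues 32-36 are
	 * skipped and queue 37 is the first candidate.
	 */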
	rqb = &rq->rq_status;
again:
	for (i = RQB_WORD(pri); i < RQB_LEN; mask = -1, i++) {
		mask = rqb->rqb_bits[i] & mask;
		if (mask == 0)
			continue;
		pri = RQB_FFS(mask) + (i << RQB_L2BPW);
		CTR3(KTR_RUNQ, "runq_findbit_from: bits=%#x i=%d pri=%d",
		    mask, i, pri);
		return (pri);
	}
	if (pri == 0)
		return (-1);
	/*
	 * Wrap back around to the beginning of the list just once so we
	 * scan the whole thing.
	 */
	pri = 0;
	goto again;
}

/*
 * Set the status bit of the queue corresponding to priority level pri,
 * indicating that it is non-empty.
 */
static __inline void
runq_setbit(struct runq *rq, int pri)
{
	struct rqbits *rqb;

	rqb = &rq->rq_status;
	CTR4(KTR_RUNQ, "runq_setbit: bits=%#x %#x bit=%#x word=%d",
	    rqb->rqb_bits[RQB_WORD(pri)],
	    rqb->rqb_bits[RQB_WORD(pri)] | RQB_BIT(pri),
	    RQB_BIT(pri), RQB_WORD(pri));
	rqb->rqb_bits[RQB_WORD(pri)] |= RQB_BIT(pri);
}

/*
 * Add the thread to the queue specified by its priority, and set the
 * corresponding status bit.
 */
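/*
 * With the usual RQ_PPQ of 4, the 0-255 priority space collapses onto the
 * RQ_NQS (64) queues; e.g. a thread at priority 130 lands on queue 32.
 */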
void
runq_add(struct runq *rq, struct td_sched *ts, int flags)
{
	struct rqhead *rqh;
	int pri;

	pri = ts->ts_thread->td_priority / RQ_PPQ;
	ts->ts_rqindex = pri;
	runq_setbit(rq, pri);
	rqh = &rq->rq_queues[pri];
	CTR5(KTR_RUNQ, "runq_add: td=%p ts=%p pri=%d %d rqh=%p",
	    ts->ts_thread, ts, ts->ts_thread->td_priority, pri, rqh);
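	/*
	 * A thread that was preempted goes back on the head of its queue so
	 * it runs ahead of other threads at the same priority; everything
	 * else is appended at the tail.
	 */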
	if (flags & SRQ_PREEMPTED) {
		TAILQ_INSERT_HEAD(rqh, ts, ts_procq);
	} else {
		TAILQ_INSERT_TAIL(rqh, ts, ts_procq);
	}
}

void
runq_add_pri(struct runq *rq, struct td_sched *ts, u_char pri, int flags)
{
	struct rqhead *rqh;

	KASSERT(pri < RQ_NQS, ("runq_add_pri: %d out of range", pri));
	ts->ts_rqindex = pri;
	runq_setbit(rq, pri);
	rqh = &rq->rq_queues[pri];
	CTR5(KTR_RUNQ, "runq_add_pri: td=%p ts=%p pri=%d idx=%d rqh=%p",
	    ts->ts_thread, ts, ts->ts_thread->td_priority, pri, rqh);
	if (flags & SRQ_PREEMPTED) {
		TAILQ_INSERT_HEAD(rqh, ts, ts_procq);
	} else {
		TAILQ_INSERT_TAIL(rqh, ts, ts_procq);
	}
}

/*
 * Return true if there are runnable processes of any priority on the run
 * queue, false otherwise.  Has no side effects and does not modify the run
 * queue structure.
 */
int
runq_check(struct runq *rq)
{
	struct rqbits *rqb;
	int i;

	rqb = &rq->rq_status;
	for (i = 0; i < RQB_LEN; i++)
		if (rqb->rqb_bits[i]) {
			CTR2(KTR_RUNQ, "runq_check: bits=%#x i=%d",
			    rqb->rqb_bits[i], i);
			return (1);
		}
	CTR0(KTR_RUNQ, "runq_check: empty");

	return (0);
}

/*
 * Find the highest priority process on the run queue, preferring, among the
 * first 'fuzz' entries of that queue, one that last ran on the current CPU.
 */
struct td_sched *
runq_choose_fuzz(struct runq *rq, int fuzz)
{
	struct rqhead *rqh;
	struct td_sched *ts;
	int pri;

	while ((pri = runq_findbit(rq)) != -1) {
		rqh = &rq->rq_queues[pri];
		/* fuzz == 1 is normal; 0 or less are ignored. */
		if (fuzz > 1) {
			/*
			 * In the first couple of entries, check if
			 * there is one for our CPU as a preference.
			 */
			int count = fuzz;
			int cpu = PCPU_GET(cpuid);
			struct td_sched *ts2;
			ts2 = ts = TAILQ_FIRST(rqh);

			while (count-- && ts2) {
				if (ts2->ts_thread->td_lastcpu == cpu) {
					ts = ts2;
					break;
				}
				ts2 = TAILQ_NEXT(ts2, ts_procq);
			}
		} else
			ts = TAILQ_FIRST(rqh);
		KASSERT(ts != NULL, ("runq_choose_fuzz: no proc on busy queue"));
		CTR3(KTR_RUNQ,
		    "runq_choose_fuzz: pri=%d td_sched=%p rqh=%p", pri, ts, rqh);
		return (ts);
	}
	CTR1(KTR_RUNQ, "runq_choose_fuzz: idleproc pri=%d", pri);

	return (NULL);
}

/*
 * Find the highest priority process on the run queue.
 */
struct td_sched *
runq_choose(struct runq *rq)
{
	struct rqhead *rqh;
	struct td_sched *ts;
	int pri;

	while ((pri = runq_findbit(rq)) != -1) {
		rqh = &rq->rq_queues[pri];
		ts = TAILQ_FIRST(rqh);
		KASSERT(ts != NULL, ("runq_choose: no proc on busy queue"));
		CTR3(KTR_RUNQ,
		    "runq_choose: pri=%d td_sched=%p rqh=%p", pri, ts, rqh);
		return (ts);
	}
	CTR1(KTR_RUNQ, "runq_choose: idleproc pri=%d", pri);

	return (NULL);
}

struct td_sched *
runq_choose_from(struct runq *rq, u_char idx)
{
	struct rqhead *rqh;
	struct td_sched *ts;
	int pri;

	if ((pri = runq_findbit_from(rq, idx)) != -1) {
		rqh = &rq->rq_queues[pri];
		ts = TAILQ_FIRST(rqh);
		KASSERT(ts != NULL, ("runq_choose_from: no proc on busy queue"));
		CTR4(KTR_RUNQ,
		    "runq_choose_from: pri=%d td_sched=%p idx=%d rqh=%p",
		    pri, ts, ts->ts_rqindex, rqh);
		return (ts);
	}
	CTR1(KTR_RUNQ, "runq_choose_from: idleproc pri=%d", pri);

	return (NULL);
}

/*
 * Remove the thread from the queue specified by its priority, and clear the
 * corresponding status bit if the queue becomes empty.
 * Caller must set state afterwards.
 */
void
runq_remove(struct runq *rq, struct td_sched *ts)
{

	runq_remove_idx(rq, ts, NULL);
}

void
runq_remove_idx(struct runq *rq, struct td_sched *ts, u_char *idx)
{
	struct rqhead *rqh;
	u_char pri;

	KASSERT(ts->ts_thread->td_flags & TDF_INMEM,
		("runq_remove_idx: thread swapped out"));
	pri = ts->ts_rqindex;
	KASSERT(pri < RQ_NQS, ("runq_remove_idx: Invalid index %d\n", pri));
	rqh = &rq->rq_queues[pri];
	CTR5(KTR_RUNQ, "runq_remove_idx: td=%p, ts=%p pri=%d %d rqh=%p",
	    ts->ts_thread, ts, ts->ts_thread->td_priority, pri, rqh);
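	/*
	 * Sanity check: walk the queue and make sure the thread really is on
	 * the queue its ts_rqindex claims; panic if it is not.
	 */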
	{
		struct td_sched *nts;

		TAILQ_FOREACH(nts, rqh, ts_procq)
			if (nts == ts)
				break;
		if (ts != nts)
			panic("runq_remove_idx: ts %p not on rqindex %d",
			    ts, pri);
	}
	TAILQ_REMOVE(rqh, ts, ts_procq);
	if (TAILQ_EMPTY(rqh)) {
		CTR0(KTR_RUNQ, "runq_remove_idx: empty");
		runq_clrbit(rq, pri);
		if (idx != NULL && *idx == pri)
			*idx = (pri + 1) % RQ_NQS;
	}
}

/****** functions that are temporarily here ***********/
#include <vm/uma.h>

/*
 * Allocate scheduler-specific per-process resources.
 * The thread and proc have already been linked in.
 *
 * Called from:
 *  proc_init() (UMA init method)
 */
void
sched_newproc(struct proc *p, struct thread *td)
{
}

/*
 * A thread is either being created or recycled.
 * Fix up the per-scheduler resources associated with it.
 * Called from:
 *  sched_fork_thread()
 *  thread_dtor()  (*may go away)
 *  thread_init()  (*may go away)
 */
void
sched_newthread(struct thread *td)
{
	struct td_sched *ts;

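	/*
	 * The scheduler-private td_sched is expected to live directly after
	 * the struct thread in the same allocation (the thread zone is sized
	 * for both, typically via the scheduler's sched_sizeof_thread()), so
	 * it is located by pointer arithmetic rather than allocated here.
	 */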
	ts = (struct td_sched *)(td + 1);
	bzero(ts, sizeof(*ts));
	td->td_sched = ts;
	ts->ts_thread = td;
}

#endif /* KERN_SWITCH_INCLUDE */