/*-
 * Copyright (c) 2001 Jake Burkholder <jake@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/11/sys/kern/kern_switch.c 327481 2018-01-02 00:14:46Z mjg $");

#include "opt_sched.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>

#include <machine/cpu.h>

/* Uncomment this to enable logging of critical_enter/exit. */
#if 0
#define	KTR_CRITICAL	KTR_SCHED
#else
#define	KTR_CRITICAL	0
#endif

#ifdef FULL_PREEMPTION
#ifndef PREEMPTION
#error "The FULL_PREEMPTION option requires the PREEMPTION option"
#endif
#endif

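/*
 * Sanity check: the run queue status bitmap must provide exactly one bit
 * per queue, i.e. RQB_LEN words of RQB_BPW bits each must cover all RQ_NQS
 * queues.  (The word size and RQB_* constants are machine-dependent and
 * come from <machine/runq.h>; typically this is, e.g., 64 queues covered
 * by two 32-bit words or a single 64-bit word.)
 */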
CTASSERT((RQB_BPW * RQB_LEN) == RQ_NQS);

/*
 * kern.sched.preemption allows user space to determine whether preemption
 * support is compiled in or not.  It is not currently a boot or runtime
 * flag that can be changed.
 */
#ifdef PREEMPTION
static int kern_sched_preemption = 1;
#else
static int kern_sched_preemption = 0;
#endif
SYSCTL_INT(_kern_sched, OID_AUTO, preemption, CTLFLAG_RD,
    &kern_sched_preemption, 0, "Kernel preemption enabled");
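
/*
 * For example, from user space (illustrative only):
 *
 *	$ sysctl kern.sched.preemption
 *	kern.sched.preemption: 1
 */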

/*
 * Support for scheduler stats exported via kern.sched.stats.  All stats may
 * be reset with kern.sched.stats.reset = 1.  Stats may be defined elsewhere
 * with SCHED_STAT_DEFINE().
 */
#ifdef SCHED_STATS
SYSCTL_NODE(_kern_sched, OID_AUTO, stats, CTLFLAG_RW, 0, "switch stats");

/* Switch reasons from mi_switch(). */
DPCPU_DEFINE(long, sched_switch_stats[SWT_COUNT]);
SCHED_STAT_DEFINE_VAR(uncategorized,
    &DPCPU_NAME(sched_switch_stats[SWT_NONE]), "");
SCHED_STAT_DEFINE_VAR(preempt,
    &DPCPU_NAME(sched_switch_stats[SWT_PREEMPT]), "");
SCHED_STAT_DEFINE_VAR(owepreempt,
    &DPCPU_NAME(sched_switch_stats[SWT_OWEPREEMPT]), "");
SCHED_STAT_DEFINE_VAR(turnstile,
    &DPCPU_NAME(sched_switch_stats[SWT_TURNSTILE]), "");
SCHED_STAT_DEFINE_VAR(sleepq,
    &DPCPU_NAME(sched_switch_stats[SWT_SLEEPQ]), "");
SCHED_STAT_DEFINE_VAR(sleepqtimo,
    &DPCPU_NAME(sched_switch_stats[SWT_SLEEPQTIMO]), "");
SCHED_STAT_DEFINE_VAR(relinquish,
    &DPCPU_NAME(sched_switch_stats[SWT_RELINQUISH]), "");
SCHED_STAT_DEFINE_VAR(needresched,
    &DPCPU_NAME(sched_switch_stats[SWT_NEEDRESCHED]), "");
SCHED_STAT_DEFINE_VAR(idle,
    &DPCPU_NAME(sched_switch_stats[SWT_IDLE]), "");
SCHED_STAT_DEFINE_VAR(iwait,
    &DPCPU_NAME(sched_switch_stats[SWT_IWAIT]), "");
SCHED_STAT_DEFINE_VAR(suspend,
    &DPCPU_NAME(sched_switch_stats[SWT_SUSPEND]), "");
SCHED_STAT_DEFINE_VAR(remotepreempt,
    &DPCPU_NAME(sched_switch_stats[SWT_REMOTEPREEMPT]), "");
SCHED_STAT_DEFINE_VAR(remotewakeidle,
    &DPCPU_NAME(sched_switch_stats[SWT_REMOTEWAKEIDLE]), "");

static int
sysctl_stats_reset(SYSCTL_HANDLER_ARGS)
{
	struct sysctl_oid *p;
	uintptr_t counter;
	int error;
	int val;
	int i;

	val = 0;
	error = sysctl_handle_int(oidp, &val, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (val == 0)
		return (0);
	/*
	 * Traverse the list of children of _kern_sched_stats and reset each
	 * to 0.  Skip the reset entry.
	 */
	SLIST_FOREACH(p, oidp->oid_parent, oid_link) {
		if (p == oidp || p->oid_arg1 == NULL)
			continue;
		counter = (uintptr_t)p->oid_arg1;
		CPU_FOREACH(i) {
			*(long *)(dpcpu_off[i] + counter) = 0;
		}
	}
	return (0);
}

SYSCTL_PROC(_kern_sched_stats, OID_AUTO, reset, CTLTYPE_INT | CTLFLAG_WR, NULL,
    0, sysctl_stats_reset, "I", "Reset scheduler statistics");
#endif
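
/*
 * Example (illustrative only; requires a kernel built with
 * "options SCHED_STATS"):
 *
 *	$ sysctl kern.sched.stats
 *	$ sysctl kern.sched.stats.reset=1
 */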

/************************************************************************
 * Functions that manipulate runnability from a thread perspective.	*
 ************************************************************************/
/*
 * Select the thread that will be run next.
 */

static __noinline struct thread *
choosethread_panic(struct thread *td)
{

	/*
	 * If we are in panic, only allow system threads,
	 * plus the one we are running in, to be run.
	 */
retry:
	if (((td->td_proc->p_flag & P_SYSTEM) == 0 &&
	    (td->td_flags & TDF_INPANIC) == 0)) {
		/* note that it is no longer on the run queue */
		TD_SET_CAN_RUN(td);
		td = sched_choose();
		goto retry;
	}

	TD_SET_RUNNING(td);
	return (td);
}

struct thread *
choosethread(void)
{
	struct thread *td;

	td = sched_choose();

	if (__predict_false(panicstr != NULL))
		return (choosethread_panic(td));

	TD_SET_RUNNING(td);
	return (td);
}

/*
 * Kernel thread preemption implementation.  Critical sections mark
 * regions of code in which preemption is not allowed.
 *
 * It might seem a good idea to inline critical_enter(), but in order to
 * prevent instruction reordering by the compiler, a __compiler_membar()
 * would have to be used here (as is done in sched_pin()).  The performance
 * penalty imposed by the membar could then produce slower code than the
 * function call itself, in most cases.
 */
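/*
 * A minimal usage sketch (illustrative only): code that must not be
 * preempted, e.g. while working on per-CPU state, brackets the region
 * with a critical section.  Sections nest via td_critnest.
 *
 *	critical_enter();
 *	... touch per-CPU or otherwise preemption-sensitive state ...
 *	critical_exit();
 */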
void
critical_enter(void)
{
	struct thread *td;

	td = curthread;
	td->td_critnest++;
	CTR4(KTR_CRITICAL, "critical_enter by thread %p (%ld, %s) to %d", td,
	    (long)td->td_proc->p_pid, td->td_name, td->td_critnest);
}

void
critical_exit(void)
{
	struct thread *td;
	int flags;

	td = curthread;
	KASSERT(td->td_critnest != 0,
	    ("critical_exit: td_critnest == 0"));

	if (td->td_critnest == 1) {
		td->td_critnest = 0;

		/*
		 * Interrupt handlers execute critical_exit() when they
		 * leave, and td_owepreempt may be left set by an
		 * interrupt handler only while td_critnest > 0.  If we
		 * are decrementing td_critnest from 1 to 0, read
		 * td_owepreempt after decrementing so as not to miss
		 * the preemption.  Do not let the compiler reorder
		 * these operations.
		 */
		__compiler_membar();
		if (td->td_owepreempt && !kdb_active) {
			/*
			 * Micro-optimization: we have committed to the
			 * switch, so disable preemption by interrupt
			 * handlers while spinning for the thread lock.
			 */
			td->td_critnest = 1;
			thread_lock(td);
			td->td_critnest--;
			flags = SW_INVOL | SW_PREEMPT;
			if (TD_IS_IDLETHREAD(td))
				flags |= SWT_IDLE;
			else
				flags |= SWT_OWEPREEMPT;
			mi_switch(flags, NULL);
			thread_unlock(td);
		}
	} else
		td->td_critnest--;

	CTR4(KTR_CRITICAL, "critical_exit by thread %p (%ld, %s) to %d", td,
	    (long)td->td_proc->p_pid, td->td_name, td->td_critnest);
}

/************************************************************************
 * SYSTEM RUN QUEUE manipulations and tests				*
 ************************************************************************/
/*
 * Initialize a run structure.
 */
void
runq_init(struct runq *rq)
{
	int i;

	bzero(rq, sizeof *rq);
	for (i = 0; i < RQ_NQS; i++)
		TAILQ_INIT(&rq->rq_queues[i]);
}

/*
 * Clear the status bit of the queue corresponding to priority level pri,
 * indicating that it is empty.
 */
static __inline void
runq_clrbit(struct runq *rq, int pri)
{
	struct rqbits *rqb;

	rqb = &rq->rq_status;
	CTR4(KTR_RUNQ, "runq_clrbit: bits=%#x %#x bit=%#x word=%d",
	    rqb->rqb_bits[RQB_WORD(pri)],
	    rqb->rqb_bits[RQB_WORD(pri)] & ~RQB_BIT(pri),
	    RQB_BIT(pri), RQB_WORD(pri));
	rqb->rqb_bits[RQB_WORD(pri)] &= ~RQB_BIT(pri);
}

/*
 * Find the index of the first non-empty run queue.  This is done by
 * scanning the status bits; a set bit indicates a non-empty queue.
 */
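/*
 * A worked example (a sketch only; the word size and RQB_FFS() are
 * machine-dependent, see <machine/runq.h>): with 32-bit status words, a
 * lowest set bit at position 5 in word 1 means the first non-empty queue
 * is 1 * RQB_BPW + 5 = 37, which is what
 * RQB_FFS(bits) + (i << RQB_L2BPW) computes below.
 */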
static __inline int
runq_findbit(struct runq *rq)
{
	struct rqbits *rqb;
	int pri;
	int i;

	rqb = &rq->rq_status;
	for (i = 0; i < RQB_LEN; i++)
		if (rqb->rqb_bits[i]) {
			pri = RQB_FFS(rqb->rqb_bits[i]) + (i << RQB_L2BPW);
			CTR3(KTR_RUNQ, "runq_findbit: bits=%#x i=%d pri=%d",
			    rqb->rqb_bits[i], i, pri);
			return (pri);
		}

	return (-1);
}

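/*
 * Find the index of the first non-empty run queue at or above queue index
 * 'pri', wrapping around to the beginning at most once so that the whole
 * bitmap is scanned.  Returns -1 if every queue is empty.
 */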
static __inline int
runq_findbit_from(struct runq *rq, u_char pri)
{
	struct rqbits *rqb;
	rqb_word_t mask;
	int i;

	/*
	 * Set the mask for the first word so we ignore priorities
	 * before 'pri'.
	 */
	mask = (rqb_word_t)-1 << (pri & (RQB_BPW - 1));
	rqb = &rq->rq_status;
again:
	for (i = RQB_WORD(pri); i < RQB_LEN; mask = -1, i++) {
		mask = rqb->rqb_bits[i] & mask;
		if (mask == 0)
			continue;
		pri = RQB_FFS(mask) + (i << RQB_L2BPW);
		CTR3(KTR_RUNQ, "runq_findbit_from: bits=%#x i=%d pri=%d",
		    mask, i, pri);
		return (pri);
	}
	if (pri == 0)
		return (-1);
	/*
	 * Wrap back around to the beginning of the list just once so we
	 * scan the whole thing.
	 */
	pri = 0;
	goto again;
}

/*
 * Set the status bit of the queue corresponding to priority level pri,
 * indicating that it is non-empty.
 */
static __inline void
runq_setbit(struct runq *rq, int pri)
{
	struct rqbits *rqb;

	rqb = &rq->rq_status;
	CTR4(KTR_RUNQ, "runq_setbit: bits=%#x %#x bit=%#x word=%d",
	    rqb->rqb_bits[RQB_WORD(pri)],
	    rqb->rqb_bits[RQB_WORD(pri)] | RQB_BIT(pri),
	    RQB_BIT(pri), RQB_WORD(pri));
	rqb->rqb_bits[RQB_WORD(pri)] |= RQB_BIT(pri);
}

/*
 * Add the thread to the queue specified by its priority, and set the
 * corresponding status bit.
 */
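/*
 * The queue index is td_priority / RQ_PPQ, so each queue covers RQ_PPQ
 * adjacent priority levels.  As a sketch, assuming the usual RQ_PPQ of 4,
 * a thread at priority 100 lands in queue 25.
 */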
void
runq_add(struct runq *rq, struct thread *td, int flags)
{
	struct rqhead *rqh;
	int pri;

	pri = td->td_priority / RQ_PPQ;
	td->td_rqindex = pri;
	runq_setbit(rq, pri);
	rqh = &rq->rq_queues[pri];
	CTR4(KTR_RUNQ, "runq_add: td=%p pri=%d %d rqh=%p",
	    td, td->td_priority, pri, rqh);
	if (flags & SRQ_PREEMPTED) {
		TAILQ_INSERT_HEAD(rqh, td, td_runq);
	} else {
		TAILQ_INSERT_TAIL(rqh, td, td_runq);
	}
}

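/*
 * As runq_add(), but place the thread on the queue at the caller-supplied
 * index 'pri' instead of deriving the index from td_priority.
 */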
void
runq_add_pri(struct runq *rq, struct thread *td, u_char pri, int flags)
{
	struct rqhead *rqh;

	KASSERT(pri < RQ_NQS, ("runq_add_pri: %d out of range", pri));
	td->td_rqindex = pri;
	runq_setbit(rq, pri);
	rqh = &rq->rq_queues[pri];
	CTR4(KTR_RUNQ, "runq_add_pri: td=%p pri=%d idx=%d rqh=%p",
	    td, td->td_priority, pri, rqh);
	if (flags & SRQ_PREEMPTED) {
		TAILQ_INSERT_HEAD(rqh, td, td_runq);
	} else {
		TAILQ_INSERT_TAIL(rqh, td, td_runq);
	}
}

/*
 * Return true if there are runnable threads of any priority on the run
 * queue, false otherwise.  Has no side effects and does not modify the run
 * queue structure.
 */
int
runq_check(struct runq *rq)
{
	struct rqbits *rqb;
	int i;

	rqb = &rq->rq_status;
	for (i = 0; i < RQB_LEN; i++)
		if (rqb->rqb_bits[i]) {
			CTR2(KTR_RUNQ, "runq_check: bits=%#x i=%d",
			    rqb->rqb_bits[i], i);
			return (1);
		}
	CTR0(KTR_RUNQ, "runq_check: empty");

	return (0);
}

/*
 * Find the highest priority thread on the run queue, preferring, among the
 * first 'fuzz' entries of that queue, a thread that last ran on the
 * current CPU.
 */
struct thread *
runq_choose_fuzz(struct runq *rq, int fuzz)
{
	struct rqhead *rqh;
	struct thread *td;
	int pri;

	while ((pri = runq_findbit(rq)) != -1) {
		rqh = &rq->rq_queues[pri];
		/* fuzz == 1 is normal; 0 or less is treated the same. */
		if (fuzz > 1) {
			/*
			 * In the first couple of entries, check if
			 * there is one for our CPU as a preference.
			 */
			int count = fuzz;
			int cpu = PCPU_GET(cpuid);
			struct thread *td2;
			td2 = td = TAILQ_FIRST(rqh);

			while (count-- && td2) {
				if (td2->td_lastcpu == cpu) {
					td = td2;
					break;
				}
				td2 = TAILQ_NEXT(td2, td_runq);
			}
		} else
			td = TAILQ_FIRST(rqh);
		KASSERT(td != NULL, ("runq_choose_fuzz: no proc on busy queue"));
		CTR3(KTR_RUNQ,
		    "runq_choose_fuzz: pri=%d thread=%p rqh=%p", pri, td, rqh);
		return (td);
	}
	CTR1(KTR_RUNQ, "runq_choose_fuzz: idleproc pri=%d", pri);

	return (NULL);
}

/*
 * Find the highest priority thread on the run queue.
 */
struct thread *
runq_choose(struct runq *rq)
{
	struct rqhead *rqh;
	struct thread *td;
	int pri;

	while ((pri = runq_findbit(rq)) != -1) {
		rqh = &rq->rq_queues[pri];
		td = TAILQ_FIRST(rqh);
		KASSERT(td != NULL, ("runq_choose: no thread on busy queue"));
		CTR3(KTR_RUNQ,
		    "runq_choose: pri=%d thread=%p rqh=%p", pri, td, rqh);
		return (td);
	}
	CTR1(KTR_RUNQ, "runq_choose: idlethread pri=%d", pri);

	return (NULL);
}

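/*
 * Find the highest priority thread at or after queue index 'idx', wrapping
 * around the run queue if necessary.
 */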
struct thread *
runq_choose_from(struct runq *rq, u_char idx)
{
	struct rqhead *rqh;
	struct thread *td;
	int pri;

	if ((pri = runq_findbit_from(rq, idx)) != -1) {
		rqh = &rq->rq_queues[pri];
		td = TAILQ_FIRST(rqh);
		KASSERT(td != NULL,
		    ("runq_choose_from: no thread on busy queue"));
		CTR4(KTR_RUNQ,
		    "runq_choose_from: pri=%d thread=%p idx=%d rqh=%p",
		    pri, td, td->td_rqindex, rqh);
		return (td);
	}
	CTR1(KTR_RUNQ, "runq_choose_from: idlethread pri=%d", pri);

	return (NULL);
}

/*
 * Remove the thread from the queue specified by its priority, and clear the
 * corresponding status bit if the queue becomes empty.  The caller must set
 * the thread's state afterwards.
 */
void
runq_remove(struct runq *rq, struct thread *td)
{

	runq_remove_idx(rq, td, NULL);
}

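/*
 * As runq_remove(), but also let the caller track a queue index: if 'idx'
 * is non-NULL and refers to the queue that just became empty, it is
 * advanced to the next queue index (mod RQ_NQS).
 */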
void
runq_remove_idx(struct runq *rq, struct thread *td, u_char *idx)
{
	struct rqhead *rqh;
	u_char pri;

	KASSERT(td->td_flags & TDF_INMEM,
	    ("runq_remove_idx: thread swapped out"));
	pri = td->td_rqindex;
	KASSERT(pri < RQ_NQS, ("runq_remove_idx: Invalid index %d\n", pri));
	rqh = &rq->rq_queues[pri];
	CTR4(KTR_RUNQ, "runq_remove_idx: td=%p, pri=%d %d rqh=%p",
	    td, td->td_priority, pri, rqh);
	TAILQ_REMOVE(rqh, td, td_runq);
	if (TAILQ_EMPTY(rqh)) {
		CTR0(KTR_RUNQ, "runq_remove_idx: empty");
		runq_clrbit(rq, pri);
		if (idx != NULL && *idx == pri)
			*idx = (pri + 1) % RQ_NQS;
	}
}