/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2001 Jake Burkholder <jake@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_sched.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>

#include <machine/cpu.h>

/* Change "#if 0" to "#if 1" to enable logging of critical_enter/exit. */
#if 0
#define	KTR_CRITICAL	KTR_SCHED
#else
#define	KTR_CRITICAL	0
#endif

#ifdef FULL_PREEMPTION
#ifndef PREEMPTION
#error "The FULL_PREEMPTION option requires the PREEMPTION option"
#endif
#endif

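/*
 * Compile-time check that the run queue status bitmap (RQB_LEN words of
 * RQB_BPW bits each) provides exactly one bit per run queue (RQ_NQS).
 */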
CTASSERT((RQB_BPW * RQB_LEN) == RQ_NQS);

/*
 * kern.sched.preemption allows user space to determine whether preemption
 * support is compiled in.  It is read-only; it cannot be changed at boot
 * time or at runtime.
 */
#ifdef PREEMPTION
static int kern_sched_preemption = 1;
#else
static int kern_sched_preemption = 0;
#endif
SYSCTL_INT(_kern_sched, OID_AUTO, preemption, CTLFLAG_RD,
    &kern_sched_preemption, 0, "Kernel preemption enabled");

/*
 * Support for scheduler stats exported via kern.sched.stats.  All stats may
 * be reset with kern.sched.stats.reset = 1.  Stats may be defined elsewhere
 * with SCHED_STAT_DEFINE().
 */
#ifdef SCHED_STATS
SYSCTL_NODE(_kern_sched, OID_AUTO, stats, CTLFLAG_RW, 0, "switch stats");

/* Switch reasons from mi_switch(). */
DPCPU_DEFINE(long, sched_switch_stats[SWT_COUNT]);
SCHED_STAT_DEFINE_VAR(uncategorized,
    &DPCPU_NAME(sched_switch_stats[SWT_NONE]), "");
SCHED_STAT_DEFINE_VAR(preempt,
    &DPCPU_NAME(sched_switch_stats[SWT_PREEMPT]), "");
SCHED_STAT_DEFINE_VAR(owepreempt,
    &DPCPU_NAME(sched_switch_stats[SWT_OWEPREEMPT]), "");
SCHED_STAT_DEFINE_VAR(turnstile,
    &DPCPU_NAME(sched_switch_stats[SWT_TURNSTILE]), "");
SCHED_STAT_DEFINE_VAR(sleepq,
    &DPCPU_NAME(sched_switch_stats[SWT_SLEEPQ]), "");
SCHED_STAT_DEFINE_VAR(sleepqtimo,
    &DPCPU_NAME(sched_switch_stats[SWT_SLEEPQTIMO]), "");
SCHED_STAT_DEFINE_VAR(relinquish,
    &DPCPU_NAME(sched_switch_stats[SWT_RELINQUISH]), "");
SCHED_STAT_DEFINE_VAR(needresched,
    &DPCPU_NAME(sched_switch_stats[SWT_NEEDRESCHED]), "");
SCHED_STAT_DEFINE_VAR(idle,
    &DPCPU_NAME(sched_switch_stats[SWT_IDLE]), "");
SCHED_STAT_DEFINE_VAR(iwait,
    &DPCPU_NAME(sched_switch_stats[SWT_IWAIT]), "");
SCHED_STAT_DEFINE_VAR(suspend,
    &DPCPU_NAME(sched_switch_stats[SWT_SUSPEND]), "");
SCHED_STAT_DEFINE_VAR(remotepreempt,
    &DPCPU_NAME(sched_switch_stats[SWT_REMOTEPREEMPT]), "");
SCHED_STAT_DEFINE_VAR(remotewakeidle,
    &DPCPU_NAME(sched_switch_stats[SWT_REMOTEWAKEIDLE]), "");

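/*
 * Handler for kern.sched.stats.reset: writing a non-zero value clears every
 * per-CPU switch counter registered under kern.sched.stats.
 */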
static int
sysctl_stats_reset(SYSCTL_HANDLER_ARGS)
{
	struct sysctl_oid *p;
	uintptr_t counter;
	int error;
	int val;
	int i;

	val = 0;
	error = sysctl_handle_int(oidp, &val, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (val == 0)
		return (0);
	/*
	 * Traverse the list of children of _kern_sched_stats and reset each
	 * to 0.  Skip the reset entry.
	 */
	SLIST_FOREACH(p, oidp->oid_parent, oid_link) {
		if (p == oidp || p->oid_arg1 == NULL)
			continue;
		counter = (uintptr_t)p->oid_arg1;
		CPU_FOREACH(i) {
			*(long *)(dpcpu_off[i] + counter) = 0;
		}
	}
	return (0);
}

SYSCTL_PROC(_kern_sched_stats, OID_AUTO, reset, CTLTYPE_INT | CTLFLAG_WR, NULL,
    0, sysctl_stats_reset, "I", "Reset scheduler statistics");
#endif

/************************************************************************
 * Functions that manipulate runnability from a thread perspective.	*
 ************************************************************************/
/*
 * Select the thread that will be run next.
 */

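/*
 * Slow path for choosethread() once the kernel has panicked: keep calling
 * sched_choose() until it returns a thread that is allowed to run during
 * the panic (a system thread or the thread that panicked).
 */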
static __noinline struct thread *
choosethread_panic(struct thread *td)
{

	/*
	 * If we are in a panic, allow only system threads,
	 * plus the one we are running in, to be run.
	 */
retry:
	if (((td->td_proc->p_flag & P_SYSTEM) == 0 &&
	    (td->td_flags & TDF_INPANIC) == 0)) {
		/* Note that it is no longer on the run queue. */
		TD_SET_CAN_RUN(td);
		td = sched_choose();
		goto retry;
	}

	TD_SET_RUNNING(td);
	return (td);
}

struct thread *
choosethread(void)
{
	struct thread *td;

	td = sched_choose();

	if (__predict_false(panicstr != NULL))
		return (choosethread_panic(td));

	TD_SET_RUNNING(td);
	return (td);
}

/*
 * Kernel thread preemption implementation.  Critical sections mark
 * regions of code in which preemption is not allowed.
 *
 * It might seem a good idea to inline critical_enter(), but in order
 * to prevent instruction reordering by the compiler, a __compiler_membar()
 * would have to be used here (the same as for sched_pin()).  The performance
 * penalty imposed by the membar could then produce slower code than
 * the function call itself in most cases.
 */
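/*
 * Exported (KBI) entry point for critical_enter(); also emits a KTR trace
 * record when KTR_CRITICAL logging is enabled.
 */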
void
critical_enter_KBI(void)
{
#ifdef KTR
	struct thread *td = curthread;
#endif
	critical_enter();
	CTR4(KTR_CRITICAL, "critical_enter by thread %p (%ld, %s) to %d", td,
	    (long)td->td_proc->p_pid, td->td_name, td->td_critnest);
}

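/*
 * Perform a preemption that was deferred because the thread was inside a
 * critical section; called once the outermost section has been exited with
 * td_owepreempt set.
 */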
void __noinline
critical_exit_preempt(void)
{
	struct thread *td;
	int flags;

	/*
	 * If td_critnest is 0, it is possible that we are going to get
	 * preempted again before reaching the code below.  This happens
	 * rarely and is harmless.  However, this means td_owepreempt may
	 * now be unset.
	 */
	td = curthread;
	if (td->td_critnest != 0)
		return;
	if (kdb_active)
		return;

	/*
	 * Microoptimization: since we have committed to switching, disable
	 * preemption in interrupt handlers while spinning for the thread
	 * lock.
	 */
	td->td_critnest = 1;
	thread_lock(td);
	td->td_critnest--;
	flags = SW_INVOL | SW_PREEMPT;
	if (TD_IS_IDLETHREAD(td))
		flags |= SWT_IDLE;
	else
		flags |= SWT_OWEPREEMPT;
	mi_switch(flags, NULL);
	thread_unlock(td);
}

void
critical_exit_KBI(void)
{
#ifdef KTR
	struct thread *td = curthread;
#endif
	critical_exit();
	CTR4(KTR_CRITICAL, "critical_exit by thread %p (%ld, %s) to %d", td,
	    (long)td->td_proc->p_pid, td->td_name, td->td_critnest);
}

/************************************************************************
 * SYSTEM RUN QUEUE manipulations and tests				*
 ************************************************************************/
/*
 * Initialize a run structure.
 */
void
runq_init(struct runq *rq)
{
	int i;

	bzero(rq, sizeof *rq);
	for (i = 0; i < RQ_NQS; i++)
		TAILQ_INIT(&rq->rq_queues[i]);
}

/*
 * Clear the status bit of the queue corresponding to priority level pri,
 * indicating that it is empty.
 */
static __inline void
runq_clrbit(struct runq *rq, int pri)
{
	struct rqbits *rqb;

	rqb = &rq->rq_status;
	CTR4(KTR_RUNQ, "runq_clrbit: bits=%#x %#x bit=%#x word=%d",
	    rqb->rqb_bits[RQB_WORD(pri)],
	    rqb->rqb_bits[RQB_WORD(pri)] & ~RQB_BIT(pri),
	    RQB_BIT(pri), RQB_WORD(pri));
	rqb->rqb_bits[RQB_WORD(pri)] &= ~RQB_BIT(pri);
}

/*
 * Find the index of the first non-empty run queue.  This is done by
 * scanning the status bits; a set bit indicates a non-empty queue.
 */
static __inline int
runq_findbit(struct runq *rq)
{
	struct rqbits *rqb;
	int pri;
	int i;

	rqb = &rq->rq_status;
	for (i = 0; i < RQB_LEN; i++)
		if (rqb->rqb_bits[i]) {
			pri = RQB_FFS(rqb->rqb_bits[i]) + (i << RQB_L2BPW);
			CTR3(KTR_RUNQ, "runq_findbit: bits=%#x i=%d pri=%d",
			    rqb->rqb_bits[i], i, pri);
			return (pri);
		}

	return (-1);
}

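/*
 * Find the index of the first non-empty run queue at or above priority
 * level 'pri', wrapping around to the start of the bitmap at most once.
 * Returns -1 if every queue is empty.
 */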
static __inline int
runq_findbit_from(struct runq *rq, u_char pri)
{
	struct rqbits *rqb;
	rqb_word_t mask;
	int i;

	/*
	 * Set the mask for the first word so we ignore priorities before 'pri'.
	 */
	mask = (rqb_word_t)-1 << (pri & (RQB_BPW - 1));
	rqb = &rq->rq_status;
again:
	for (i = RQB_WORD(pri); i < RQB_LEN; mask = -1, i++) {
		mask = rqb->rqb_bits[i] & mask;
		if (mask == 0)
			continue;
		pri = RQB_FFS(mask) + (i << RQB_L2BPW);
		CTR3(KTR_RUNQ, "runq_findbit_from: bits=%#x i=%d pri=%d",
		    mask, i, pri);
		return (pri);
	}
	if (pri == 0)
		return (-1);
	/*
	 * Wrap back around to the beginning of the list just once so we
	 * scan the whole thing.
	 */
	pri = 0;
	goto again;
}

/*
 * Set the status bit of the queue corresponding to priority level pri,
 * indicating that it is non-empty.
 */
static __inline void
runq_setbit(struct runq *rq, int pri)
{
	struct rqbits *rqb;

	rqb = &rq->rq_status;
	CTR4(KTR_RUNQ, "runq_setbit: bits=%#x %#x bit=%#x word=%d",
	    rqb->rqb_bits[RQB_WORD(pri)],
	    rqb->rqb_bits[RQB_WORD(pri)] | RQB_BIT(pri),
	    RQB_BIT(pri), RQB_WORD(pri));
	rqb->rqb_bits[RQB_WORD(pri)] |= RQB_BIT(pri);
}

/*
 * Add the thread to the queue specified by its priority, and set the
 * corresponding status bit.
 */
void
runq_add(struct runq *rq, struct thread *td, int flags)
{
	struct rqhead *rqh;
	int pri;

	pri = td->td_priority / RQ_PPQ;
	td->td_rqindex = pri;
	runq_setbit(rq, pri);
	rqh = &rq->rq_queues[pri];
	CTR4(KTR_RUNQ, "runq_add: td=%p pri=%d %d rqh=%p",
	    td, td->td_priority, pri, rqh);
	if (flags & SRQ_PREEMPTED) {
		TAILQ_INSERT_HEAD(rqh, td, td_runq);
	} else {
		TAILQ_INSERT_TAIL(rqh, td, td_runq);
	}
}

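/*
 * Like runq_add(), but place the thread on the queue selected by the caller
 * via 'pri' rather than deriving the queue index from td_priority.
 */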
void
runq_add_pri(struct runq *rq, struct thread *td, u_char pri, int flags)
{
	struct rqhead *rqh;

	KASSERT(pri < RQ_NQS, ("runq_add_pri: %d out of range", pri));
	td->td_rqindex = pri;
	runq_setbit(rq, pri);
	rqh = &rq->rq_queues[pri];
	CTR4(KTR_RUNQ, "runq_add_pri: td=%p pri=%d idx=%d rqh=%p",
	    td, td->td_priority, pri, rqh);
	if (flags & SRQ_PREEMPTED) {
		TAILQ_INSERT_HEAD(rqh, td, td_runq);
	} else {
		TAILQ_INSERT_TAIL(rqh, td, td_runq);
	}
}

/*
 * Return true if there are runnable threads of any priority on the run
 * queue, false otherwise.  Has no side effects and does not modify the run
 * queue structure.
 */
int
runq_check(struct runq *rq)
{
	struct rqbits *rqb;
	int i;

	rqb = &rq->rq_status;
	for (i = 0; i < RQB_LEN; i++)
		if (rqb->rqb_bits[i]) {
			CTR2(KTR_RUNQ, "runq_check: bits=%#x i=%d",
			    rqb->rqb_bits[i], i);
			return (1);
		}
	CTR0(KTR_RUNQ, "runq_check: empty");

	return (0);
}

/*
 * Find the highest priority thread on the run queue.
 */
struct thread *
runq_choose_fuzz(struct runq *rq, int fuzz)
{
	struct rqhead *rqh;
	struct thread *td;
	int pri;

	while ((pri = runq_findbit(rq)) != -1) {
		rqh = &rq->rq_queues[pri];
		/* fuzz == 1 is normal; 0 or less are ignored. */
		if (fuzz > 1) {
			/*
			 * In the first couple of entries, check if
			 * there is one for our CPU as a preference.
			 */
			int count = fuzz;
			int cpu = PCPU_GET(cpuid);
			struct thread *td2;
			td2 = td = TAILQ_FIRST(rqh);

			while (count-- && td2) {
				if (td2->td_lastcpu == cpu) {
					td = td2;
					break;
				}
				td2 = TAILQ_NEXT(td2, td_runq);
			}
		} else
			td = TAILQ_FIRST(rqh);
		KASSERT(td != NULL, ("runq_choose_fuzz: no thread on busy queue"));
		CTR3(KTR_RUNQ,
		    "runq_choose_fuzz: pri=%d thread=%p rqh=%p", pri, td, rqh);
		return (td);
	}
	CTR1(KTR_RUNQ, "runq_choose_fuzz: idlethread pri=%d", pri);

	return (NULL);
}

/*
 * Find the highest priority thread on the run queue.
 */
struct thread *
runq_choose(struct runq *rq)
{
	struct rqhead *rqh;
	struct thread *td;
	int pri;

	while ((pri = runq_findbit(rq)) != -1) {
		rqh = &rq->rq_queues[pri];
		td = TAILQ_FIRST(rqh);
		KASSERT(td != NULL, ("runq_choose: no thread on busy queue"));
		CTR3(KTR_RUNQ,
		    "runq_choose: pri=%d thread=%p rqh=%p", pri, td, rqh);
		return (td);
	}
	CTR1(KTR_RUNQ, "runq_choose: idlethread pri=%d", pri);

	return (NULL);
}

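/*
 * Find the highest priority thread on the run queue, starting the search at
 * queue index 'idx' and wrapping around to the lowest index if necessary.
 */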
struct thread *
runq_choose_from(struct runq *rq, u_char idx)
{
	struct rqhead *rqh;
	struct thread *td;
	int pri;

	if ((pri = runq_findbit_from(rq, idx)) != -1) {
		rqh = &rq->rq_queues[pri];
		td = TAILQ_FIRST(rqh);
		KASSERT(td != NULL, ("runq_choose_from: no thread on busy queue"));
		CTR4(KTR_RUNQ,
		    "runq_choose_from: pri=%d thread=%p idx=%d rqh=%p",
		    pri, td, td->td_rqindex, rqh);
		return (td);
	}
	CTR1(KTR_RUNQ, "runq_choose_from: idlethread pri=%d", pri);

	return (NULL);
}

/*
 * Remove the thread from the queue specified by its priority, and clear the
 * corresponding status bit if the queue becomes empty.
 * Caller must set state afterwards.
 */
void
runq_remove(struct runq *rq, struct thread *td)
{

	runq_remove_idx(rq, td, NULL);
}

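/*
 * As runq_remove(), but also let the caller track its scan position: if
 * 'idx' is non-NULL and points at the queue that just became empty, advance
 * it to the next queue index.
 */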
void
runq_remove_idx(struct runq *rq, struct thread *td, u_char *idx)
{
	struct rqhead *rqh;
	u_char pri;

	KASSERT(td->td_flags & TDF_INMEM,
		("runq_remove_idx: thread swapped out"));
	pri = td->td_rqindex;
	KASSERT(pri < RQ_NQS, ("runq_remove_idx: invalid index %d", pri));
	rqh = &rq->rq_queues[pri];
	CTR4(KTR_RUNQ, "runq_remove_idx: td=%p, pri=%d %d rqh=%p",
	    td, td->td_priority, pri, rqh);
	TAILQ_REMOVE(rqh, td, td_runq);
	if (TAILQ_EMPTY(rqh)) {
		CTR0(KTR_RUNQ, "runq_remove_idx: empty");
		runq_clrbit(rq, pri);
		if (idx != NULL && *idx == pri)
			*idx = (pri + 1) % RQ_NQS;
	}
}