/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: releng/11.0/sys/kern/sched_4bsd.c 303884 2016-08-09 18:56:29Z jhb $");

#include "opt_hwpmc_hooks.h"
#include "opt_sched.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/cpuset.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/kthread.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/sdt.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/sx.h>
#include <sys/turnstile.h>
#include <sys/umtx.h>
#include <machine/pcb.h>
#include <machine/smp.h>

#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif

#ifdef KDTRACE_HOOKS
#include <sys/dtrace_bsd.h>
int				dtrace_vtime_active;
dtrace_vtime_switch_func_t	dtrace_vtime_switch_func;
#endif

/*
 * INVERSE_ESTCPU_WEIGHT is only suitable for statclock() frequencies in
 * the range 100-256 Hz (approximately).
 */
#define	ESTCPULIM(e) \
    min((e), INVERSE_ESTCPU_WEIGHT * (NICE_WEIGHT * (PRIO_MAX - PRIO_MIN) - \
    RQ_PPQ) + INVERSE_ESTCPU_WEIGHT - 1)
#ifdef SMP
#define	INVERSE_ESTCPU_WEIGHT	(8 * smp_cpus)
#else
#define	INVERSE_ESTCPU_WEIGHT	8	/* 1 / (priorities per estcpu level). */
#endif
#define	NICE_WEIGHT		1	/* Priorities per nice level. */
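
/*
 * A rough worked example of the scaling above, using the UP values
 * (INVERSE_ESTCPU_WEIGHT 8, NICE_WEIGHT 1) and the stock PRIO_MIN of -20:
 * resetpriority() adds one priority step per 8 points of ts_estcpu and one
 * step per nice level above PRIO_MIN, so ts_estcpu == 40 with nice == 0
 * costs 40 / 8 + (0 - (-20)) = 25 priority steps worse than PUSER.
 * ESTCPULIM() only clamps ts_estcpu (to 8 * (40 - RQ_PPQ) + 7, i.e. 295
 * when RQ_PPQ is 4) so the estcpu term can contribute at most 36 steps.
 */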

#define	TS_NAME_LEN (MAXCOMLEN + sizeof(" td ") + sizeof(__XSTRING(UINT_MAX)))

/*
 * The schedulable entity that runs a context.
 * This is an extension to the thread structure and is tailored to
 * the requirements of this scheduler.
 * All fields are protected by the scheduler lock.
 */
struct td_sched {
	fixpt_t		ts_pctcpu;	/* %cpu during p_swtime. */
	u_int		ts_estcpu;	/* Estimated cpu utilization. */
	int		ts_cpticks;	/* Ticks of cpu time. */
	int		ts_slptime;	/* Seconds !RUNNING. */
	int		ts_slice;	/* Remaining part of time slice. */
	int		ts_flags;
	struct runq	*ts_runq;	/* runq the thread is currently on */
#ifdef KTR
	char		ts_name[TS_NAME_LEN];
#endif
};

/* flags kept in td_flags */
#define TDF_DIDRUN	TDF_SCHED0	/* thread actually ran. */
#define TDF_BOUND	TDF_SCHED1	/* Bound to one CPU. */
#define	TDF_SLICEEND	TDF_SCHED2	/* Thread time slice is over. */

/* flags kept in ts_flags */
#define	TSF_AFFINITY	0x0001		/* Has a non-"full" CPU set. */

#define SKE_RUNQ_PCPU(ts)						\
    ((ts)->ts_runq != 0 && (ts)->ts_runq != &runq)

#define	THREAD_CAN_SCHED(td, cpu)	\
    CPU_ISSET((cpu), &(td)->td_cpuset->cs_mask)

_Static_assert(sizeof(struct thread) + sizeof(struct td_sched) <=
    sizeof(struct thread0_storage),
    "increase struct thread0_storage.t0st_sched size");

static struct mtx sched_lock;

static int	realstathz = 127; /* stathz is sometimes 0 and run off of hz. */
static int	sched_tdcnt;	/* Total runnable threads in the system. */
static int	sched_slice = 12; /* Thread run time before rescheduling. */

static void	setup_runqs(void);
static void	schedcpu(void);
static void	schedcpu_thread(void);
static void	sched_priority(struct thread *td, u_char prio);
static void	sched_setup(void *dummy);
static void	maybe_resched(struct thread *td);
static void	updatepri(struct thread *td);
static void	resetpriority(struct thread *td);
static void	resetpriority_thread(struct thread *td);
#ifdef SMP
static int	sched_pickcpu(struct thread *td);
static int	forward_wakeup(int cpunum);
static void	kick_other_cpu(int pri, int cpuid);
#endif

static struct kproc_desc sched_kp = {
        "schedcpu",
        schedcpu_thread,
        NULL
};
SYSINIT(schedcpu, SI_SUB_LAST, SI_ORDER_FIRST, kproc_start,
    &sched_kp);
SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL);

static void sched_initticks(void *dummy);
SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks,
    NULL);

/*
 * Global run queue.
 */
static struct runq runq;

#ifdef SMP
/*
 * Per-CPU run queues
 */
static struct runq runq_pcpu[MAXCPU];
long runq_length[MAXCPU];

static cpuset_t idle_cpus_mask;
#endif

struct pcpuidlestat {
	u_int idlecalls;
	u_int oldidlecalls;
};
static DPCPU_DEFINE(struct pcpuidlestat, idlestat);

static void
setup_runqs(void)
{
#ifdef SMP
	int i;

	for (i = 0; i < MAXCPU; ++i)
		runq_init(&runq_pcpu[i]);
#endif

	runq_init(&runq);
}

static int
sysctl_kern_quantum(SYSCTL_HANDLER_ARGS)
{
	int error, new_val, period;

	period = 1000000 / realstathz;
	new_val = period * sched_slice;
	error = sysctl_handle_int(oidp, &new_val, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (new_val <= 0)
		return (EINVAL);
	sched_slice = imax(1, (new_val + period / 2) / period);
	hogticks = imax(1, (2 * hz * sched_slice + realstathz / 2) /
	    realstathz);
	return (0);
}
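
/*
 * For a rough sense of the units handled above, with the defaults of
 * realstathz == 127 and sched_slice == 12: a read reports
 * (1000000 / 127) * 12 = 94488 microseconds (about 94 ms), and writing
 * 50000 stores imax(1, (50000 + 3937) / 7874) = 6 stathz ticks in
 * sched_slice, so the knob is only as fine-grained as one statclock tick.
 */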

SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RD, 0, "Scheduler");

SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "4BSD", 0,
    "Scheduler name");
SYSCTL_PROC(_kern_sched, OID_AUTO, quantum, CTLTYPE_INT | CTLFLAG_RW,
    NULL, 0, sysctl_kern_quantum, "I",
    "Quantum for timeshare threads in microseconds");
SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0,
    "Quantum for timeshare threads in stathz ticks");
#ifdef SMP
/* Enable forwarding of wakeups to all other cpus */
static SYSCTL_NODE(_kern_sched, OID_AUTO, ipiwakeup, CTLFLAG_RD, NULL,
    "Kernel SMP");

static int runq_fuzz = 1;
SYSCTL_INT(_kern_sched, OID_AUTO, runq_fuzz, CTLFLAG_RW, &runq_fuzz, 0, "");

static int forward_wakeup_enabled = 1;
SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, enabled, CTLFLAG_RW,
	   &forward_wakeup_enabled, 0,
	   "Forwarding of wakeup to idle CPUs");

static int forward_wakeups_requested = 0;
SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, requested, CTLFLAG_RD,
	   &forward_wakeups_requested, 0,
	   "Requests for Forwarding of wakeup to idle CPUs");

static int forward_wakeups_delivered = 0;
SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, delivered, CTLFLAG_RD,
	   &forward_wakeups_delivered, 0,
	   "Completed Forwarding of wakeup to idle CPUs");

static int forward_wakeup_use_mask = 1;
SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, usemask, CTLFLAG_RW,
	   &forward_wakeup_use_mask, 0,
	   "Use the mask of idle cpus");

static int forward_wakeup_use_loop = 0;
SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, useloop, CTLFLAG_RW,
	   &forward_wakeup_use_loop, 0,
	   "Use a loop to find idle cpus");

#endif
#if 0
static int sched_followon = 0;
SYSCTL_INT(_kern_sched, OID_AUTO, followon, CTLFLAG_RW,
	   &sched_followon, 0,
	   "allow threads to share a quantum");
#endif

SDT_PROVIDER_DEFINE(sched);

SDT_PROBE_DEFINE3(sched, , , change__pri, "struct thread *",
    "struct proc *", "uint8_t");
SDT_PROBE_DEFINE3(sched, , , dequeue, "struct thread *",
    "struct proc *", "void *");
SDT_PROBE_DEFINE4(sched, , , enqueue, "struct thread *",
    "struct proc *", "void *", "int");
SDT_PROBE_DEFINE4(sched, , , lend__pri, "struct thread *",
    "struct proc *", "uint8_t", "struct thread *");
SDT_PROBE_DEFINE2(sched, , , load__change, "int", "int");
SDT_PROBE_DEFINE2(sched, , , off__cpu, "struct thread *",
    "struct proc *");
SDT_PROBE_DEFINE(sched, , , on__cpu);
SDT_PROBE_DEFINE(sched, , , remain__cpu);
SDT_PROBE_DEFINE2(sched, , , surrender, "struct thread *",
    "struct proc *");

static __inline void
sched_load_add(void)
{

	sched_tdcnt++;
	KTR_COUNTER0(KTR_SCHED, "load", "global load", sched_tdcnt);
	SDT_PROBE2(sched, , , load__change, NOCPU, sched_tdcnt);
}

static __inline void
sched_load_rem(void)
{

	sched_tdcnt--;
	KTR_COUNTER0(KTR_SCHED, "load", "global load", sched_tdcnt);
	SDT_PROBE2(sched, , , load__change, NOCPU, sched_tdcnt);
}
/*
 * Arrange to reschedule if necessary, taking the priorities and
 * schedulers into account.
 */
static void
maybe_resched(struct thread *td)
{

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	if (td->td_priority < curthread->td_priority)
		curthread->td_flags |= TDF_NEEDRESCHED;
}

/*
 * This function is called when a thread is about to be put on a run queue
 * because it has been made runnable or its priority has been adjusted.  It
 * determines whether the current thread should be preempted immediately in
 * favor of the new thread.  If so, it switches to the new thread and
 * eventually returns true.  If not, it returns false so that the caller may
 * place the thread on an appropriate run queue.
 */
int
maybe_preempt(struct thread *td)
{
#ifdef PREEMPTION
	struct thread *ctd;
	int cpri, pri;

	/*
	 * The new thread should not preempt the current thread if any of the
	 * following conditions are true:
	 *
	 *  - The kernel is in the throes of crashing (panicstr).
	 *  - The current thread has a higher (numerically lower) or
	 *    equivalent priority.  Note that this prevents curthread from
	 *    trying to preempt to itself.
	 *  - It is too early in the boot for context switches (cold is set).
	 *  - The current thread has an inhibitor set or is in the process of
	 *    exiting.  In this case, the current thread is about to switch
	 *    out anyways, so there's no point in preempting.  If we did,
	 *    the current thread would not be properly resumed as well, so
	 *    just avoid that whole landmine.
	 *  - If the new thread's priority is not a realtime priority and
	 *    the current thread's priority is not an idle priority and
	 *    FULL_PREEMPTION is disabled.
	 *
	 * If all of these conditions are false, but the current thread is in
	 * a nested critical section, then we have to defer the preemption
	 * until we exit the critical section.  Otherwise, switch immediately
	 * to the new thread.
	 */
	ctd = curthread;
	THREAD_LOCK_ASSERT(td, MA_OWNED);
	KASSERT((td->td_inhibitors == 0),
			("maybe_preempt: trying to run inhibited thread"));
	pri = td->td_priority;
	cpri = ctd->td_priority;
	if (panicstr != NULL || pri >= cpri || cold /* || dumping */ ||
	    TD_IS_INHIBITED(ctd))
		return (0);
#ifndef FULL_PREEMPTION
	if (pri > PRI_MAX_ITHD && cpri < PRI_MIN_IDLE)
		return (0);
#endif

	if (ctd->td_critnest > 1) {
		CTR1(KTR_PROC, "maybe_preempt: in critical section %d",
		    ctd->td_critnest);
		ctd->td_owepreempt = 1;
		return (0);
	}
	/*
	 * Thread is runnable but not yet put on system run queue.
	 */
	MPASS(ctd->td_lock == td->td_lock);
	MPASS(TD_ON_RUNQ(td));
	TD_SET_RUNNING(td);
	CTR3(KTR_PROC, "preempting to thread %p (pid %d, %s)\n", td,
	    td->td_proc->p_pid, td->td_name);
	mi_switch(SW_INVOL | SW_PREEMPT | SWT_PREEMPT, td);
	/*
	 * td's lock pointer may have changed.  We have to return with it
	 * locked.
	 */
	spinlock_enter();
	thread_unlock(ctd);
	thread_lock(td);
	spinlock_exit();
	return (1);
#else
	return (0);
#endif
}
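
/*
 * In practice, with the FULL_PREEMPTION option left out (the usual
 * configuration), the PRI_MAX_ITHD test above means one timeshare thread
 * never preempts another here: an immediate switch happens only when the
 * new thread runs at an interrupt-thread priority or the current thread is
 * an idle-priority thread, and everything else waits for the next
 * reschedule point.
 */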

/*
 * Constants for digital decay and forget:
 *	90% of (ts_estcpu) usage in 5 * loadav time
 *	95% of (ts_pctcpu) usage in 60 seconds (load insensitive)
 *          Note that, as ps(1) mentions, this can let percentages
 *          total over 100% (I've seen 137.9% for 3 processes).
 *
 * Note that schedclock() updates ts_estcpu and p_cpticks asynchronously.
 *
 * We wish to decay away 90% of ts_estcpu in (5 * loadavg) seconds.
 * That is, the system wants to compute a value of decay such
 * that the following for loop:
 * 	for (i = 0; i < (5 * loadavg); i++)
 * 		ts_estcpu *= decay;
 * will compute
 * 	ts_estcpu *= 0.1;
 * for all values of loadavg:
 *
 * Mathematically this loop can be expressed by saying:
 * 	decay ** (5 * loadavg) ~= .1
 *
 * The system computes decay as:
 * 	decay = (2 * loadavg) / (2 * loadavg + 1)
 *
 * We wish to prove that the system's computation of decay
 * will always fulfill the equation:
 * 	decay ** (5 * loadavg) ~= .1
 *
 * If we compute b as:
 * 	b = 2 * loadavg
 * then
 * 	decay = b / (b + 1)
 *
 * We now need to prove two things:
 *	1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
 *	2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
 *
 * Facts:
 *         For x close to zero, exp(x) =~ 1 + x, since
 *              exp(x) = 0! + x**1/1! + x**2/2! + ... .
 *              therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
 *         For x close to zero, ln(1+x) =~ x, since
 *              ln(1+x) = x - x**2/2 + x**3/3 - ...     -1 < x < 1
 *              therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
 *         ln(.1) =~ -2.30
 *
 * Proof of (1):
 *    Solve (factor)**(power) =~ .1 given power (5*loadav):
 *	solving for factor,
 *      ln(factor) =~ (-2.30/5*loadav), or
 *      factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
 *          exp(-1/b) =~ (b-1)/b =~ b/(b+1).                    QED
 *
 * Proof of (2):
 *    Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
 *	solving for power,
 *      power*ln(b/(b+1)) =~ -2.30, or
 *      power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.  QED
 *
 * Actual power values for the implemented algorithm are as follows:
 *      loadav: 1       2       3       4
 *      power:  5.68    10.32   14.94   19.55
 */

/* calculations for digital decay to forget 90% of usage in 5*loadav sec */
#define	loadfactor(loadav)	(2 * (loadav))
#define	decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))
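
/*
 * An illustrative sketch of the decay above; it is kept under #if 0, like
 * sched_followon earlier in this file, so it is never compiled, and it
 * assumes the stock FSHIFT of 11 (FSCALE == 2048, so a load average of 1.0
 * is stored as 2048).  It is an example only, not scheduler code.
 */
#if 0
static u_int
decay_cpu_example(void)
{
	fixpt_t loadav = 1 * FSCALE;		/* Load average of 1.0. */
	fixpt_t loadfac = loadfactor(loadav);	/* 2 * loadavg == 4096. */
	u_int estcpu = 100;
	int i;

	/* decay == loadfac / (loadfac + FSCALE) == 2/3 for a load of 1.0. */
	for (i = 0; i < 5; i++)			/* 5 * loadavg iterations. */
		estcpu = decay_cpu(loadfac, estcpu);
	return (estcpu);	/* 12, i.e. roughly 90% of 100 forgotten. */
}
#endif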

/* decay 95% of `ts_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
static fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;	/* exp(-1/20) */
SYSCTL_UINT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");

/*
 * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
 * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
 * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
 *
 * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
 *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
 *
 * If you don't want to bother with the faster/more-accurate formula, you
 * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
 * (more general) method of calculating the %age of CPU used by a process.
 */
#define	CCPU_SHIFT	11
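
/*
 * For a sense of scale, and assuming the stock FSHIFT of 11 (FSCALE ==
 * 2048): schedcpu() below multiplies ts_pctcpu by ccpu/FSCALE (about 0.951)
 * once a second and then adds a term proportional to the statclock ticks
 * charged during that second, so a thread that monopolizes a CPU converges
 * on ts_pctcpu of roughly FSCALE, which ps(1) reports as about 100%.
 */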

/*
 * Recompute process priorities, every hz ticks.
 * MP-safe, called without the Giant mutex.
 */
/* ARGSUSED */
static void
schedcpu(void)
{
	register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
	struct thread *td;
	struct proc *p;
	struct td_sched *ts;
	int awake;

	sx_slock(&allproc_lock);
	FOREACH_PROC_IN_SYSTEM(p) {
		PROC_LOCK(p);
		if (p->p_state == PRS_NEW) {
			PROC_UNLOCK(p);
			continue;
		}
		FOREACH_THREAD_IN_PROC(p, td) {
			awake = 0;
			ts = td_get_sched(td);
			thread_lock(td);
			/*
			 * Increment sleep time (if sleeping).  We
			 * ignore overflow, as above.
			 */
			/*
			 * The td_sched slptimes are not touched in wakeup
			 * because the thread may not HAVE everything in
			 * memory? XXX I think this is out of date.
			 */
			if (TD_ON_RUNQ(td)) {
				awake = 1;
				td->td_flags &= ~TDF_DIDRUN;
			} else if (TD_IS_RUNNING(td)) {
				awake = 1;
				/* Do not clear TDF_DIDRUN */
			} else if (td->td_flags & TDF_DIDRUN) {
				awake = 1;
				td->td_flags &= ~TDF_DIDRUN;
			}

			/*
			 * ts_pctcpu is only for ps and ttyinfo().
			 */
			ts->ts_pctcpu = (ts->ts_pctcpu * ccpu) >> FSHIFT;
			/*
			 * If the td_sched has been idle the entire second,
			 * stop recalculating its priority until
			 * it wakes up.
			 */
			if (ts->ts_cpticks != 0) {
#if	(FSHIFT >= CCPU_SHIFT)
				ts->ts_pctcpu += (realstathz == 100)
				    ? ((fixpt_t) ts->ts_cpticks) <<
				    (FSHIFT - CCPU_SHIFT) :
				    100 * (((fixpt_t) ts->ts_cpticks)
				    << (FSHIFT - CCPU_SHIFT)) / realstathz;
#else
				ts->ts_pctcpu += ((FSCALE - ccpu) *
				    (ts->ts_cpticks *
				    FSCALE / realstathz)) >> FSHIFT;
#endif
				ts->ts_cpticks = 0;
			}
			/*
			 * If there are ANY running threads in this process,
			 * then don't count it as sleeping.
			 * XXX: this is broken.
			 */
			if (awake) {
				if (ts->ts_slptime > 1) {
					/*
					 * In an ideal world, this should not
					 * happen, because whoever woke us
					 * up from the long sleep should have
					 * unwound the slptime and reset our
					 * priority before we run at the stale
					 * priority.  Should KASSERT at some
					 * point when all the cases are fixed.
					 */
					updatepri(td);
				}
				ts->ts_slptime = 0;
			} else
				ts->ts_slptime++;
			if (ts->ts_slptime > 1) {
				thread_unlock(td);
				continue;
			}
			ts->ts_estcpu = decay_cpu(loadfac, ts->ts_estcpu);
			resetpriority(td);
			resetpriority_thread(td);
			thread_unlock(td);
		}
		PROC_UNLOCK(p);
	}
	sx_sunlock(&allproc_lock);
}

/*
 * Main loop for a kthread that executes schedcpu once a second.
 */
static void
schedcpu_thread(void)
{

	for (;;) {
		schedcpu();
		pause("-", hz);
	}
}

/*
 * Recalculate the priority of a process after it has slept for a while.
 * For all load averages >= 1 and max ts_estcpu of 255, sleeping for at
 * least six times the loadfactor will decay ts_estcpu to zero.
 */
static void
updatepri(struct thread *td)
{
	struct td_sched *ts;
	fixpt_t loadfac;
	unsigned int newcpu;

	ts = td_get_sched(td);
	loadfac = loadfactor(averunnable.ldavg[0]);
	if (ts->ts_slptime > 5 * loadfac)
		ts->ts_estcpu = 0;
	else {
		newcpu = ts->ts_estcpu;
		ts->ts_slptime--;	/* was incremented in schedcpu() */
		while (newcpu && --ts->ts_slptime)
			newcpu = decay_cpu(loadfac, newcpu);
		ts->ts_estcpu = newcpu;
	}
}

/*
 * Compute the priority of a process when running in user mode.
 * Arrange to reschedule if the resulting priority is better
 * than that of the current process.
 */
static void
resetpriority(struct thread *td)
{
	u_int newpriority;

	if (td->td_pri_class != PRI_TIMESHARE)
		return;
	newpriority = PUSER +
	    td_get_sched(td)->ts_estcpu / INVERSE_ESTCPU_WEIGHT +
	    NICE_WEIGHT * (td->td_proc->p_nice - PRIO_MIN);
	newpriority = min(max(newpriority, PRI_MIN_TIMESHARE),
	    PRI_MAX_TIMESHARE);
	sched_user_prio(td, newpriority);
}

/*
 * Update the thread's priority when the associated process's user
 * priority changes.
 */
static void
resetpriority_thread(struct thread *td)
{

	/* Only change threads with a time sharing user priority. */
	if (td->td_priority < PRI_MIN_TIMESHARE ||
	    td->td_priority > PRI_MAX_TIMESHARE)
		return;

	/* XXX the whole needresched thing is broken, but not silly. */
	maybe_resched(td);

	sched_prio(td, td->td_user_pri);
}

/* ARGSUSED */
static void
sched_setup(void *dummy)
{

	setup_runqs();

	/* Account for thread0. */
	sched_load_add();
}

/*
 * This routine determines time constants after stathz and hz are setup.
 */
static void
sched_initticks(void *dummy)
{

	realstathz = stathz ? stathz : hz;
	sched_slice = realstathz / 10;	/* ~100ms */
	hogticks = imax(1, (2 * hz * sched_slice + realstathz / 2) /
	    realstathz);
}
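
/*
 * With the common values stathz == 127 and hz == 1000 this works out to
 * sched_slice = 127 / 10 = 12 statclock ticks and
 * hogticks = (2 * 1000 * 12 + 63) / 127 = 189 hz ticks, i.e. a quantum of
 * roughly 94 ms and a "hog" threshold of about two quanta.
 */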

/* External interfaces start here */

/*
 * Very early in the boot some setup of scheduler-specific
 * parts of proc0 and of some scheduler resources needs to be done.
 * Called from:
 *  proc0_init()
 */
void
schedinit(void)
{

	/*
	 * Set up the scheduler specific parts of thread0.
	 */
	thread0.td_lock = &sched_lock;
	td_get_sched(&thread0)->ts_slice = sched_slice;
	mtx_init(&sched_lock, "sched lock", NULL, MTX_SPIN | MTX_RECURSE);
}

int
sched_runnable(void)
{
#ifdef SMP
	return runq_check(&runq) + runq_check(&runq_pcpu[PCPU_GET(cpuid)]);
#else
	return runq_check(&runq);
#endif
}

int
sched_rr_interval(void)
{

	/* Convert sched_slice from stathz to hz. */
	return (imax(1, (sched_slice * hz + realstathz / 2) / realstathz));
}

/*
 * We adjust the priority of the current process.  The priority of a
 * process gets worse as it accumulates CPU time.  The cpu usage
 * estimator (ts_estcpu) is increased here.  resetpriority() will
 * compute a different priority each time ts_estcpu increases by
 * INVERSE_ESTCPU_WEIGHT (until PRI_MAX_TIMESHARE is reached).  The
 * cpu usage estimator ramps up quite quickly when the process is
 * running (linearly), and decays away exponentially, at a rate which
 * is proportionally slower when the system is busy.  The basic
 * principle is that the system will 90% forget that the process used
 * a lot of CPU time in 5 * loadav seconds.  This causes the system to
 * favor processes which haven't run much recently, and to round-robin
 * among other processes.
 */
void
sched_clock(struct thread *td)
{
	struct pcpuidlestat *stat;
	struct td_sched *ts;

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	ts = td_get_sched(td);

	ts->ts_cpticks++;
	ts->ts_estcpu = ESTCPULIM(ts->ts_estcpu + 1);
	if ((ts->ts_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) {
		resetpriority(td);
		resetpriority_thread(td);
	}

	/*
	 * Force a context switch if the current thread has used up a full
	 * time slice (default is 100ms).
	 */
	if (!TD_IS_IDLETHREAD(td) && --ts->ts_slice <= 0) {
		ts->ts_slice = sched_slice;
		td->td_flags |= TDF_NEEDRESCHED | TDF_SLICEEND;
	}

	stat = DPCPU_PTR(idlestat);
	stat->oldidlecalls = stat->idlecalls;
	stat->idlecalls = 0;
}

/*
 * Charge child's scheduling CPU usage to parent.
 */
void
sched_exit(struct proc *p, struct thread *td)
{

	KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "proc exit",
	    "prio:%d", td->td_priority);

	PROC_LOCK_ASSERT(p, MA_OWNED);
	sched_exit_thread(FIRST_THREAD_IN_PROC(p), td);
}

void
sched_exit_thread(struct thread *td, struct thread *child)
{

	KTR_STATE1(KTR_SCHED, "thread", sched_tdname(child), "exit",
	    "prio:%d", child->td_priority);
	thread_lock(td);
	td_get_sched(td)->ts_estcpu = ESTCPULIM(td_get_sched(td)->ts_estcpu +
	    td_get_sched(child)->ts_estcpu);
	thread_unlock(td);
	thread_lock(child);
	if ((child->td_flags & TDF_NOLOAD) == 0)
		sched_load_rem();
	thread_unlock(child);
}

void
sched_fork(struct thread *td, struct thread *childtd)
{
	sched_fork_thread(td, childtd);
}

void
sched_fork_thread(struct thread *td, struct thread *childtd)
{
	struct td_sched *ts, *tsc;

	childtd->td_oncpu = NOCPU;
	childtd->td_lastcpu = NOCPU;
	childtd->td_lock = &sched_lock;
	childtd->td_cpuset = cpuset_ref(td->td_cpuset);
	childtd->td_priority = childtd->td_base_pri;
	ts = td_get_sched(childtd);
	bzero(ts, sizeof(*ts));
	tsc = td_get_sched(td);
	ts->ts_estcpu = tsc->ts_estcpu;
	ts->ts_flags |= (tsc->ts_flags & TSF_AFFINITY);
	ts->ts_slice = 1;
}

void
sched_nice(struct proc *p, int nice)
{
	struct thread *td;

	PROC_LOCK_ASSERT(p, MA_OWNED);
	p->p_nice = nice;
	FOREACH_THREAD_IN_PROC(p, td) {
		thread_lock(td);
		resetpriority(td);
		resetpriority_thread(td);
		thread_unlock(td);
	}
}

void
sched_class(struct thread *td, int class)
{
	THREAD_LOCK_ASSERT(td, MA_OWNED);
	td->td_pri_class = class;
}

/*
 * Adjust the priority of a thread.
 */
static void
sched_priority(struct thread *td, u_char prio)
{

	KTR_POINT3(KTR_SCHED, "thread", sched_tdname(td), "priority change",
	    "prio:%d", td->td_priority, "new prio:%d", prio, KTR_ATTR_LINKED,
	    sched_tdname(curthread));
	SDT_PROBE3(sched, , , change__pri, td, td->td_proc, prio);
	if (td != curthread && prio > td->td_priority) {
		KTR_POINT3(KTR_SCHED, "thread", sched_tdname(curthread),
		    "lend prio", "prio:%d", td->td_priority, "new prio:%d",
		    prio, KTR_ATTR_LINKED, sched_tdname(td));
		SDT_PROBE4(sched, , , lend__pri, td, td->td_proc, prio,
		    curthread);
	}
	THREAD_LOCK_ASSERT(td, MA_OWNED);
	if (td->td_priority == prio)
		return;
	td->td_priority = prio;
	if (TD_ON_RUNQ(td) && td->td_rqindex != (prio / RQ_PPQ)) {
		sched_rem(td);
		sched_add(td, SRQ_BORING);
	}
}

/*
 * Update a thread's priority when it is lent another thread's
 * priority.
 */
void
sched_lend_prio(struct thread *td, u_char prio)
{

	td->td_flags |= TDF_BORROWING;
	sched_priority(td, prio);
}

/*
 * Restore a thread's priority when priority propagation is
 * over.  The prio argument is the minimum priority the thread
 * needs to have to satisfy other possible priority lending
 * requests.  If the thread's regular priority is less
 * important than prio, the thread will keep a priority boost
 * of prio.
 */
void
sched_unlend_prio(struct thread *td, u_char prio)
{
	u_char base_pri;

	if (td->td_base_pri >= PRI_MIN_TIMESHARE &&
	    td->td_base_pri <= PRI_MAX_TIMESHARE)
		base_pri = td->td_user_pri;
	else
		base_pri = td->td_base_pri;
	if (prio >= base_pri) {
		td->td_flags &= ~TDF_BORROWING;
		sched_prio(td, base_pri);
	} else
		sched_lend_prio(td, prio);
}
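
/*
 * A walk-through of the two branches above with made-up numbers: a
 * timeshare thread whose own user priority is 160 gets lent priority 120
 * while it holds a contested lock.  An unlend with prio 140 (140 < 160)
 * keeps TDF_BORROWING set and leaves the thread boosted at 140; a later
 * unlend with prio 200 (200 >= 160) clears TDF_BORROWING and sched_prio()
 * drops the thread back to 160.  The numbers are illustrative only.
 */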

void
sched_prio(struct thread *td, u_char prio)
{
	u_char oldprio;

	/* First, update the base priority. */
	td->td_base_pri = prio;

	/*
	 * If the thread is borrowing another thread's priority, don't ever
	 * lower the priority.
	 */
	if (td->td_flags & TDF_BORROWING && td->td_priority < prio)
		return;

	/* Change the real priority. */
	oldprio = td->td_priority;
	sched_priority(td, prio);

	/*
	 * If the thread is on a turnstile, then let the turnstile update
	 * its state.
	 */
	if (TD_ON_LOCK(td) && oldprio != prio)
		turnstile_adjust(td, oldprio);
}

void
sched_user_prio(struct thread *td, u_char prio)
{

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	td->td_base_user_pri = prio;
	if (td->td_lend_user_pri <= prio)
		return;
	td->td_user_pri = prio;
}

void
sched_lend_user_prio(struct thread *td, u_char prio)
{

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	td->td_lend_user_pri = prio;
	td->td_user_pri = min(prio, td->td_base_user_pri);
	if (td->td_priority > td->td_user_pri)
		sched_prio(td, td->td_user_pri);
	else if (td->td_priority != td->td_user_pri)
		td->td_flags |= TDF_NEEDRESCHED;
}

void
sched_sleep(struct thread *td, int pri)
{

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	td->td_slptick = ticks;
	td_get_sched(td)->ts_slptime = 0;
	if (pri != 0 && PRI_BASE(td->td_pri_class) == PRI_TIMESHARE)
		sched_prio(td, pri);
	if (TD_IS_SUSPENDED(td) || pri >= PSOCK)
		td->td_flags |= TDF_CANSWAP;
}

void
sched_switch(struct thread *td, struct thread *newtd, int flags)
{
	struct mtx *tmtx;
	struct td_sched *ts;
	struct proc *p;
	int preempted;

	tmtx = NULL;
	ts = td_get_sched(td);
	p = td->td_proc;

	THREAD_LOCK_ASSERT(td, MA_OWNED);

	/*
	 * Switch to the sched lock to fix things up and pick
	 * a new thread.
	 * Block the td_lock in order to avoid breaking the critical path.
	 */
	if (td->td_lock != &sched_lock) {
		mtx_lock_spin(&sched_lock);
		tmtx = thread_lock_block(td);
	}

	if ((td->td_flags & TDF_NOLOAD) == 0)
		sched_load_rem();

	td->td_lastcpu = td->td_oncpu;
	preempted = !((td->td_flags & TDF_SLICEEND) ||
	    (flags & SWT_RELINQUISH));
	td->td_flags &= ~(TDF_NEEDRESCHED | TDF_SLICEEND);
	td->td_owepreempt = 0;
	td->td_oncpu = NOCPU;

	/*
	 * At the last moment, if this thread is still marked RUNNING,
	 * then put it back on the run queue as it has not been suspended
	 * or stopped or anything else similar.  We never put the idle
	 * threads on the run queue, however.
	 */
	if (td->td_flags & TDF_IDLETD) {
		TD_SET_CAN_RUN(td);
#ifdef SMP
		CPU_CLR(PCPU_GET(cpuid), &idle_cpus_mask);
#endif
	} else {
		if (TD_IS_RUNNING(td)) {
			/* Put us back on the run queue. */
			sched_add(td, preempted ?
			    SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
			    SRQ_OURSELF|SRQ_YIELDING);
		}
	}
	if (newtd) {
		/*
		 * The thread we are about to run needs to be counted
		 * as if it had been added to the run queue and selected.
		 * It came from:
		 * * A preemption
		 * * An upcall
		 * * A followon
		 */
		KASSERT((newtd->td_inhibitors == 0),
			("trying to run inhibited thread"));
		newtd->td_flags |= TDF_DIDRUN;
		TD_SET_RUNNING(newtd);
		if ((newtd->td_flags & TDF_NOLOAD) == 0)
			sched_load_add();
	} else {
		newtd = choosethread();
		MPASS(newtd->td_lock == &sched_lock);
	}

	if (td != newtd) {
#ifdef	HWPMC_HOOKS
		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
#endif

		SDT_PROBE2(sched, , , off__cpu, newtd, newtd->td_proc);

		/* I feel sleepy */
		lock_profile_release_lock(&sched_lock.lock_object);
#ifdef KDTRACE_HOOKS
		/*
		 * If DTrace has set the active vtime enum to anything
		 * other than INACTIVE (0), then it should have set the
		 * function to call.
		 */
		if (dtrace_vtime_active)
			(*dtrace_vtime_switch_func)(newtd);
#endif

		cpu_switch(td, newtd, tmtx != NULL ? tmtx : td->td_lock);
		lock_profile_obtain_lock_success(&sched_lock.lock_object,
		    0, 0, __FILE__, __LINE__);
		/*
		 * Where am I?  What year is it?
		 * We are in the same thread that went to sleep above,
		 * but any amount of time may have passed. All our context
		 * will still be available as will local variables.
		 * PCPU values however may have changed as we may have
		 * changed CPU so don't trust cached values of them.
		 * New threads will go to fork_exit() instead of here
		 * so if you change things here you may need to change
		 * things there too.
		 *
		 * If the thread above was exiting it will never wake
		 * up again here, so either it has saved everything it
		 * needed to, or the thread_wait() or wait() will
		 * need to reap it.
		 */

		SDT_PROBE0(sched, , , on__cpu);
#ifdef	HWPMC_HOOKS
		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
#endif
	} else
		SDT_PROBE0(sched, , , remain__cpu);

#ifdef SMP
	if (td->td_flags & TDF_IDLETD)
		CPU_SET(PCPU_GET(cpuid), &idle_cpus_mask);
#endif
	sched_lock.mtx_lock = (uintptr_t)td;
	td->td_oncpu = PCPU_GET(cpuid);
	MPASS(td->td_lock == &sched_lock);
}

void
sched_wakeup(struct thread *td)
{
	struct td_sched *ts;

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	ts = td_get_sched(td);
	td->td_flags &= ~TDF_CANSWAP;
	if (ts->ts_slptime > 1) {
		updatepri(td);
		resetpriority(td);
	}
	td->td_slptick = 0;
	ts->ts_slptime = 0;
	ts->ts_slice = sched_slice;
	sched_add(td, SRQ_BORING);
}

#ifdef SMP
static int
forward_wakeup(int cpunum)
{
	struct pcpu *pc;
	cpuset_t dontuse, map, map2;
	u_int id, me;
	int iscpuset;

	mtx_assert(&sched_lock, MA_OWNED);

	CTR0(KTR_RUNQ, "forward_wakeup()");

	if ((!forward_wakeup_enabled) ||
	     (forward_wakeup_use_mask == 0 && forward_wakeup_use_loop == 0))
		return (0);
	if (!smp_started || cold || panicstr)
		return (0);

	forward_wakeups_requested++;

	/*
	 * Check the idle mask we received against what we calculated
	 * before in the old version.
	 */
	me = PCPU_GET(cpuid);

	/* Don't bother if we should be doing it ourself. */
	if (CPU_ISSET(me, &idle_cpus_mask) &&
	    (cpunum == NOCPU || me == cpunum))
		return (0);

	CPU_SETOF(me, &dontuse);
	CPU_OR(&dontuse, &stopped_cpus);
	CPU_OR(&dontuse, &hlt_cpus_mask);
	CPU_ZERO(&map2);
	if (forward_wakeup_use_loop) {
		STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) {
			id = pc->pc_cpuid;
			if (!CPU_ISSET(id, &dontuse) &&
			    pc->pc_curthread == pc->pc_idlethread) {
				CPU_SET(id, &map2);
			}
		}
	}

	if (forward_wakeup_use_mask) {
		map = idle_cpus_mask;
		CPU_NAND(&map, &dontuse);

		/* If they are both on, compare and use loop if different. */
		if (forward_wakeup_use_loop) {
			if (CPU_CMP(&map, &map2)) {
				printf("map != map2, loop method preferred\n");
				map = map2;
			}
		}
	} else {
		map = map2;
	}

	/* If we only allow a specific CPU, then mask off all the others. */
	if (cpunum != NOCPU) {
		KASSERT((cpunum <= mp_maxcpus),("forward_wakeup: bad cpunum."));
		iscpuset = CPU_ISSET(cpunum, &map);
		if (iscpuset == 0)
			CPU_ZERO(&map);
		else
			CPU_SETOF(cpunum, &map);
	}
	if (!CPU_EMPTY(&map)) {
		forward_wakeups_delivered++;
		STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) {
			id = pc->pc_cpuid;
			if (!CPU_ISSET(id, &map))
				continue;
			if (cpu_idle_wakeup(pc->pc_cpuid))
				CPU_CLR(id, &map);
		}
		if (!CPU_EMPTY(&map))
			ipi_selected(map, IPI_AST);
		return (1);
	}
	if (cpunum == NOCPU)
		printf("forward_wakeup: Idle processor not found\n");
	return (0);
}

static void
kick_other_cpu(int pri, int cpuid)
{
	struct pcpu *pcpu;
	int cpri;

	pcpu = pcpu_find(cpuid);
	if (CPU_ISSET(cpuid, &idle_cpus_mask)) {
		forward_wakeups_delivered++;
		if (!cpu_idle_wakeup(cpuid))
			ipi_cpu(cpuid, IPI_AST);
		return;
	}

	cpri = pcpu->pc_curthread->td_priority;
	if (pri >= cpri)
		return;

#if defined(IPI_PREEMPTION) && defined(PREEMPTION)
#if !defined(FULL_PREEMPTION)
	if (pri <= PRI_MAX_ITHD)
#endif /* ! FULL_PREEMPTION */
	{
		ipi_cpu(cpuid, IPI_PREEMPT);
		return;
	}
#endif /* defined(IPI_PREEMPTION) && defined(PREEMPTION) */

	pcpu->pc_curthread->td_flags |= TDF_NEEDRESCHED;
	ipi_cpu(cpuid, IPI_AST);
	return;
}
#endif /* SMP */

#ifdef SMP
static int
sched_pickcpu(struct thread *td)
{
	int best, cpu;

	mtx_assert(&sched_lock, MA_OWNED);

	if (td->td_lastcpu != NOCPU && THREAD_CAN_SCHED(td, td->td_lastcpu))
		best = td->td_lastcpu;
	else
		best = NOCPU;
	CPU_FOREACH(cpu) {
		if (!THREAD_CAN_SCHED(td, cpu))
			continue;

		if (best == NOCPU)
			best = cpu;
		else if (runq_length[cpu] < runq_length[best])
			best = cpu;
	}
	KASSERT(best != NOCPU, ("no valid CPUs"));

	return (best);
}
#endif
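
/*
 * For example, a thread that last ran on CPU 0 and may run on CPUs 0-2
 * with runq_length[] = { 3, 1, 2 } starts the scan with best == 0 and
 * ends up on CPU 1, the allowed CPU with the shortest per-CPU run queue;
 * ties are broken in favor of the candidate found first.
 */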

void
sched_add(struct thread *td, int flags)
#ifdef SMP
{
	cpuset_t tidlemsk;
	struct td_sched *ts;
	u_int cpu, cpuid;
	int forwarded = 0;
	int single_cpu = 0;

	ts = td_get_sched(td);
	THREAD_LOCK_ASSERT(td, MA_OWNED);
	KASSERT((td->td_inhibitors == 0),
	    ("sched_add: trying to run inhibited thread"));
	KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
	    ("sched_add: bad thread state"));
	KASSERT(td->td_flags & TDF_INMEM,
	    ("sched_add: thread swapped out"));

	KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq add",
	    "prio:%d", td->td_priority, KTR_ATTR_LINKED,
	    sched_tdname(curthread));
	KTR_POINT1(KTR_SCHED, "thread", sched_tdname(curthread), "wokeup",
	    KTR_ATTR_LINKED, sched_tdname(td));
	SDT_PROBE4(sched, , , enqueue, td, td->td_proc, NULL,
	    flags & SRQ_PREEMPTED);

	/*
	 * Now that the thread is moving to the run-queue, set the lock
	 * to the scheduler's lock.
	 */
	if (td->td_lock != &sched_lock) {
		mtx_lock_spin(&sched_lock);
		thread_lock_set(td, &sched_lock);
	}
	TD_SET_RUNQ(td);

	/*
	 * If SMP is started and the thread is pinned or otherwise limited to
	 * a specific set of CPUs, queue the thread to a per-CPU run queue.
	 * Otherwise, queue the thread to the global run queue.
	 *
	 * If SMP has not yet been started we must use the global run queue
	 * as per-CPU state may not be initialized yet and we may crash if we
	 * try to access the per-CPU run queues.
	 */
	if (smp_started && (td->td_pinned != 0 || td->td_flags & TDF_BOUND ||
	    ts->ts_flags & TSF_AFFINITY)) {
		if (td->td_pinned != 0)
			cpu = td->td_lastcpu;
		else if (td->td_flags & TDF_BOUND) {
			/* Find CPU from bound runq. */
			KASSERT(SKE_RUNQ_PCPU(ts),
			    ("sched_add: bound td_sched not on cpu runq"));
			cpu = ts->ts_runq - &runq_pcpu[0];
		} else
			/* Find a valid CPU for our cpuset */
			cpu = sched_pickcpu(td);
		ts->ts_runq = &runq_pcpu[cpu];
		single_cpu = 1;
		CTR3(KTR_RUNQ,
		    "sched_add: Put td_sched:%p(td:%p) on cpu%d runq", ts, td,
		    cpu);
	} else {
		CTR2(KTR_RUNQ,
		    "sched_add: adding td_sched:%p (td:%p) to gbl runq", ts,
		    td);
		cpu = NOCPU;
		ts->ts_runq = &runq;
	}

	cpuid = PCPU_GET(cpuid);
	if (single_cpu && cpu != cpuid) {
		kick_other_cpu(td->td_priority, cpu);
	} else {
		if (!single_cpu) {
			tidlemsk = idle_cpus_mask;
			CPU_NAND(&tidlemsk, &hlt_cpus_mask);
			CPU_CLR(cpuid, &tidlemsk);

			if (!CPU_ISSET(cpuid, &idle_cpus_mask) &&
			    ((flags & SRQ_INTR) == 0) &&
			    !CPU_EMPTY(&tidlemsk))
				forwarded = forward_wakeup(cpu);
		}

		if (!forwarded) {
			if ((flags & SRQ_YIELDING) == 0 && maybe_preempt(td))
				return;
			else
				maybe_resched(td);
		}
	}

	if ((td->td_flags & TDF_NOLOAD) == 0)
		sched_load_add();
	runq_add(ts->ts_runq, td, flags);
	if (cpu != NOCPU)
		runq_length[cpu]++;
}
#else /* SMP */
{
	struct td_sched *ts;

	ts = td_get_sched(td);
	THREAD_LOCK_ASSERT(td, MA_OWNED);
	KASSERT((td->td_inhibitors == 0),
	    ("sched_add: trying to run inhibited thread"));
	KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
	    ("sched_add: bad thread state"));
	KASSERT(td->td_flags & TDF_INMEM,
	    ("sched_add: thread swapped out"));
	KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq add",
	    "prio:%d", td->td_priority, KTR_ATTR_LINKED,
	    sched_tdname(curthread));
	KTR_POINT1(KTR_SCHED, "thread", sched_tdname(curthread), "wokeup",
	    KTR_ATTR_LINKED, sched_tdname(td));
	SDT_PROBE4(sched, , , enqueue, td, td->td_proc, NULL,
	    flags & SRQ_PREEMPTED);

	/*
	 * Now that the thread is moving to the run-queue, set the lock
	 * to the scheduler's lock.
	 */
	if (td->td_lock != &sched_lock) {
		mtx_lock_spin(&sched_lock);
		thread_lock_set(td, &sched_lock);
	}
	TD_SET_RUNQ(td);
	CTR2(KTR_RUNQ, "sched_add: adding td_sched:%p (td:%p) to runq", ts, td);
	ts->ts_runq = &runq;

	/*
	 * If we are yielding (on the way out anyhow) or the thread
	 * being saved is US, then don't try be smart about preemption
	 * or kicking off another CPU as it won't help and may hinder.
	 * In the YIELDING case, we are about to run whoever is being
	 * put in the queue anyhow, and in the OURSELF case, we are
	 * putting ourself on the run queue which also only happens
	 * when we are about to yield.
	 */
	if ((flags & SRQ_YIELDING) == 0) {
		if (maybe_preempt(td))
			return;
	}
	if ((td->td_flags & TDF_NOLOAD) == 0)
		sched_load_add();
	runq_add(ts->ts_runq, td, flags);
	maybe_resched(td);
}
#endif /* SMP */

void
sched_rem(struct thread *td)
{
	struct td_sched *ts;

	ts = td_get_sched(td);
	KASSERT(td->td_flags & TDF_INMEM,
	    ("sched_rem: thread swapped out"));
	KASSERT(TD_ON_RUNQ(td),
	    ("sched_rem: thread not on run queue"));
	mtx_assert(&sched_lock, MA_OWNED);
	KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq rem",
	    "prio:%d", td->td_priority, KTR_ATTR_LINKED,
	    sched_tdname(curthread));
	SDT_PROBE3(sched, , , dequeue, td, td->td_proc, NULL);

	if ((td->td_flags & TDF_NOLOAD) == 0)
		sched_load_rem();
#ifdef SMP
	if (ts->ts_runq != &runq)
		runq_length[ts->ts_runq - runq_pcpu]--;
#endif
	runq_remove(ts->ts_runq, td);
	TD_SET_CAN_RUN(td);
}

/*
 * Select threads to run.  Note that running threads still consume a
 * slot.
 */
struct thread *
sched_choose(void)
{
	struct thread *td;
	struct runq *rq;

	mtx_assert(&sched_lock,  MA_OWNED);
#ifdef SMP
	struct thread *tdcpu;

	rq = &runq;
	td = runq_choose_fuzz(&runq, runq_fuzz);
	tdcpu = runq_choose(&runq_pcpu[PCPU_GET(cpuid)]);

	if (td == NULL ||
	    (tdcpu != NULL &&
	     tdcpu->td_priority < td->td_priority)) {
		CTR2(KTR_RUNQ, "choosing td %p from pcpu runq %d", tdcpu,
		     PCPU_GET(cpuid));
		td = tdcpu;
		rq = &runq_pcpu[PCPU_GET(cpuid)];
	} else {
		CTR1(KTR_RUNQ, "choosing td_sched %p from main runq", td);
	}

#else
	rq = &runq;
	td = runq_choose(&runq);
#endif

	if (td) {
#ifdef SMP
		if (td == tdcpu)
			runq_length[PCPU_GET(cpuid)]--;
#endif
		runq_remove(rq, td);
		td->td_flags |= TDF_DIDRUN;

		KASSERT(td->td_flags & TDF_INMEM,
		    ("sched_choose: thread swapped out"));
		return (td);
	}
	return (PCPU_GET(idlethread));
}

void
sched_preempt(struct thread *td)
{

	SDT_PROBE2(sched, , , surrender, td, td->td_proc);
	thread_lock(td);
	if (td->td_critnest > 1)
		td->td_owepreempt = 1;
	else
		mi_switch(SW_INVOL | SW_PREEMPT | SWT_PREEMPT, NULL);
	thread_unlock(td);
}

void
sched_userret(struct thread *td)
{
	/*
	 * XXX we cheat slightly on the locking here to avoid locking in
	 * the usual case.  Setting td_priority here is essentially an
	 * incomplete workaround for not setting it properly elsewhere.
	 * Now that some interrupt handlers are threads, not setting it
	 * properly elsewhere can clobber it in the window between setting
	 * it here and returning to user mode, so don't waste time setting
	 * it perfectly here.
	 */
	KASSERT((td->td_flags & TDF_BORROWING) == 0,
	    ("thread with borrowed priority returning to userland"));
	if (td->td_priority != td->td_user_pri) {
		thread_lock(td);
		td->td_priority = td->td_user_pri;
		td->td_base_pri = td->td_user_pri;
		thread_unlock(td);
	}
}

void
sched_bind(struct thread *td, int cpu)
{
	struct td_sched *ts;

	THREAD_LOCK_ASSERT(td, MA_OWNED|MA_NOTRECURSED);
	KASSERT(td == curthread, ("sched_bind: can only bind curthread"));

	ts = td_get_sched(td);

	td->td_flags |= TDF_BOUND;
#ifdef SMP
	ts->ts_runq = &runq_pcpu[cpu];
	if (PCPU_GET(cpuid) == cpu)
		return;

	mi_switch(SW_VOL, NULL);
#endif
}

void
sched_unbind(struct thread* td)
{
	THREAD_LOCK_ASSERT(td, MA_OWNED);
	KASSERT(td == curthread, ("sched_unbind: can only bind curthread"));
	td->td_flags &= ~TDF_BOUND;
}

int
sched_is_bound(struct thread *td)
{
	THREAD_LOCK_ASSERT(td, MA_OWNED);
	return (td->td_flags & TDF_BOUND);
}

void
sched_relinquish(struct thread *td)
{
	thread_lock(td);
	mi_switch(SW_VOL | SWT_RELINQUISH, NULL);
	thread_unlock(td);
}

int
sched_load(void)
{
	return (sched_tdcnt);
}

int
sched_sizeof_proc(void)
{
	return (sizeof(struct proc));
}

int
sched_sizeof_thread(void)
{
	return (sizeof(struct thread) + sizeof(struct td_sched));
}

fixpt_t
sched_pctcpu(struct thread *td)
{
	struct td_sched *ts;

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	ts = td_get_sched(td);
	return (ts->ts_pctcpu);
}

#ifdef RACCT
/*
 * Calculates the contribution to the thread cpu usage for the latest
 * (unfinished) second.
 */
fixpt_t
sched_pctcpu_delta(struct thread *td)
{
	struct td_sched *ts;
	fixpt_t delta;
	int realstathz;

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	ts = td_get_sched(td);
	delta = 0;
	realstathz = stathz ? stathz : hz;
	if (ts->ts_cpticks != 0) {
#if	(FSHIFT >= CCPU_SHIFT)
		delta = (realstathz == 100)
		    ? ((fixpt_t) ts->ts_cpticks) <<
		    (FSHIFT - CCPU_SHIFT) :
		    100 * (((fixpt_t) ts->ts_cpticks)
		    << (FSHIFT - CCPU_SHIFT)) / realstathz;
#else
		delta = ((FSCALE - ccpu) *
		    (ts->ts_cpticks *
		    FSCALE / realstathz)) >> FSHIFT;
#endif
	}

	return (delta);
}
#endif

u_int
sched_estcpu(struct thread *td)
{

	return (td_get_sched(td)->ts_estcpu);
}

/*
 * The actual idle process.
 */
void
sched_idletd(void *dummy)
{
	struct pcpuidlestat *stat;

	THREAD_NO_SLEEPING();
	stat = DPCPU_PTR(idlestat);
	for (;;) {
		mtx_assert(&Giant, MA_NOTOWNED);

		while (sched_runnable() == 0) {
			cpu_idle(stat->idlecalls + stat->oldidlecalls > 64);
			stat->idlecalls++;
		}

		mtx_lock_spin(&sched_lock);
		mi_switch(SW_VOL | SWT_IDLE, NULL);
		mtx_unlock_spin(&sched_lock);
	}
}

/*
 * A CPU is entering for the first time or a thread is exiting.
 */
void
sched_throw(struct thread *td)
{
	/*
	 * Correct spinlock nesting.  The idle thread context that we are
	 * borrowing was created so that it would start out with a single
	 * spin lock (sched_lock) held in fork_trampoline().  Since we've
	 * explicitly acquired locks in this function, the nesting count
	 * is now 2 rather than 1.  Since we are nested, calling
	 * spinlock_exit() will simply adjust the counts without allowing
	 * spin lock using code to interrupt us.
	 */
	if (td == NULL) {
		mtx_lock_spin(&sched_lock);
		spinlock_exit();
		PCPU_SET(switchtime, cpu_ticks());
		PCPU_SET(switchticks, ticks);
	} else {
		lock_profile_release_lock(&sched_lock.lock_object);
		MPASS(td->td_lock == &sched_lock);
		td->td_lastcpu = td->td_oncpu;
		td->td_oncpu = NOCPU;
	}
	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count"));
	cpu_throw(td, choosethread());	/* doesn't return */
}

void
sched_fork_exit(struct thread *td)
{

	/*
	 * Finish setting up thread glue so that it begins execution in a
	 * non-nested critical section with sched_lock held but not recursed.
	 */
	td->td_oncpu = PCPU_GET(cpuid);
	sched_lock.mtx_lock = (uintptr_t)td;
	lock_profile_obtain_lock_success(&sched_lock.lock_object,
	    0, 0, __FILE__, __LINE__);
	THREAD_LOCK_ASSERT(td, MA_OWNED | MA_NOTRECURSED);
}

char *
sched_tdname(struct thread *td)
{
#ifdef KTR
	struct td_sched *ts;

	ts = td_get_sched(td);
	if (ts->ts_name[0] == '\0')
		snprintf(ts->ts_name, sizeof(ts->ts_name),
		    "%s tid %d", td->td_name, td->td_tid);
	return (ts->ts_name);
#else
	return (td->td_name);
#endif
}

#ifdef KTR
void
sched_clear_tdname(struct thread *td)
{
	struct td_sched *ts;

	ts = td_get_sched(td);
	ts->ts_name[0] = '\0';
}
#endif

void
sched_affinity(struct thread *td)
{
#ifdef SMP
	struct td_sched *ts;
	int cpu;

	THREAD_LOCK_ASSERT(td, MA_OWNED);

	/*
	 * Set the TSF_AFFINITY flag if there is at least one CPU this
	 * thread can't run on.
	 */
	ts = td_get_sched(td);
	ts->ts_flags &= ~TSF_AFFINITY;
	CPU_FOREACH(cpu) {
		if (!THREAD_CAN_SCHED(td, cpu)) {
			ts->ts_flags |= TSF_AFFINITY;
			break;
		}
	}

	/*
	 * If this thread can run on all CPUs, nothing else to do.
	 */
	if (!(ts->ts_flags & TSF_AFFINITY))
		return;

	/* Pinned threads and bound threads should be left alone. */
	if (td->td_pinned != 0 || td->td_flags & TDF_BOUND)
		return;

	switch (td->td_state) {
	case TDS_RUNQ:
		/*
		 * If we are on a per-CPU runqueue that is in the set,
		 * then nothing needs to be done.
		 */
		if (ts->ts_runq != &runq &&
		    THREAD_CAN_SCHED(td, ts->ts_runq - runq_pcpu))
			return;

		/* Put this thread on a valid per-CPU runqueue. */
		sched_rem(td);
		sched_add(td, SRQ_BORING);
		break;
	case TDS_RUNNING:
		/*
		 * See if our current CPU is in the set.  If not, force a
		 * context switch.
		 */
		if (THREAD_CAN_SCHED(td, td->td_oncpu))
			return;

		td->td_flags |= TDF_NEEDRESCHED;
		if (td != curthread)
			ipi_cpu(cpu, IPI_AST);
		break;
	default:
		break;
	}
#endif
}