/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/11/sys/kern/sched_4bsd.c 331722 2018-03-29 02:50:57Z eadler $");

#include "opt_hwpmc_hooks.h"
#include "opt_sched.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/cpuset.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/kthread.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/sdt.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/sx.h>
#include <sys/turnstile.h>
#include <sys/umtx.h>
#include <machine/pcb.h>
#include <machine/smp.h>

#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif

#ifdef KDTRACE_HOOKS
#include <sys/dtrace_bsd.h>
int				dtrace_vtime_active;
dtrace_vtime_switch_func_t	dtrace_vtime_switch_func;
#endif

/*
 * INVERSE_ESTCPU_WEIGHT is only suitable for statclock() frequencies in
 * the range 100-256 Hz (approximately).
 */
#define	ESTCPULIM(e) \
    min((e), INVERSE_ESTCPU_WEIGHT * (NICE_WEIGHT * (PRIO_MAX - PRIO_MIN) - \
    RQ_PPQ) + INVERSE_ESTCPU_WEIGHT - 1)
#ifdef SMP
#define	INVERSE_ESTCPU_WEIGHT	(8 * smp_cpus)
#else
#define	INVERSE_ESTCPU_WEIGHT	8	/* 1 / (priorities per estcpu level). */
#endif
#define	NICE_WEIGHT		1	/* Priorities per nice level. */
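/*
 * Illustrative note: ESTCPULIM() caps ts_estcpu so that the
 * ts_estcpu / INVERSE_ESTCPU_WEIGHT term used by resetpriority() can
 * contribute at most NICE_WEIGHT * (PRIO_MAX - PRIO_MIN) - RQ_PPQ
 * priority levels; resetpriority() additionally clamps its result to
 * the timeshare range.
 */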

#define	TS_NAME_LEN (MAXCOMLEN + sizeof(" td ") + sizeof(__XSTRING(UINT_MAX)))

/*
 * The schedulable entity that runs a context.
 * This is an extension to the thread structure and is tailored to
 * the requirements of this scheduler.
 * All fields are protected by the scheduler lock.
 */
struct td_sched {
	fixpt_t		ts_pctcpu;	/* %cpu during p_swtime. */
	u_int		ts_estcpu;	/* Estimated cpu utilization. */
	int		ts_cpticks;	/* Ticks of cpu time. */
	int		ts_slptime;	/* Seconds !RUNNING. */
	int		ts_slice;	/* Remaining part of time slice. */
	int		ts_flags;
	struct runq	*ts_runq;	/* runq the thread is currently on */
#ifdef KTR
	char		ts_name[TS_NAME_LEN];
#endif
};

/* flags kept in td_flags */
#define TDF_DIDRUN	TDF_SCHED0	/* thread actually ran. */
#define TDF_BOUND	TDF_SCHED1	/* Bound to one CPU. */
#define	TDF_SLICEEND	TDF_SCHED2	/* Thread time slice is over. */

/* flags kept in ts_flags */
#define	TSF_AFFINITY	0x0001		/* Has a non-"full" CPU set. */

#define SKE_RUNQ_PCPU(ts)						\
    ((ts)->ts_runq != 0 && (ts)->ts_runq != &runq)

#define	THREAD_CAN_SCHED(td, cpu)	\
    CPU_ISSET((cpu), &(td)->td_cpuset->cs_mask)

_Static_assert(sizeof(struct thread) + sizeof(struct td_sched) <=
    sizeof(struct thread0_storage),
    "increase struct thread0_storage.t0st_sched size");

static struct mtx sched_lock;

static int	realstathz = 127; /* stathz is sometimes 0 and run off of hz. */
static int	sched_tdcnt;	/* Total runnable threads in the system. */
static int	sched_slice = 12; /* Thread run time before rescheduling. */

static void	setup_runqs(void);
static void	schedcpu(void);
static void	schedcpu_thread(void);
static void	sched_priority(struct thread *td, u_char prio);
static void	sched_setup(void *dummy);
static void	maybe_resched(struct thread *td);
static void	updatepri(struct thread *td);
static void	resetpriority(struct thread *td);
static void	resetpriority_thread(struct thread *td);
#ifdef SMP
static int	sched_pickcpu(struct thread *td);
static int	forward_wakeup(int cpunum);
static void	kick_other_cpu(int pri, int cpuid);
#endif

static struct kproc_desc sched_kp = {
	"schedcpu",
	schedcpu_thread,
	NULL
};
SYSINIT(schedcpu, SI_SUB_LAST, SI_ORDER_FIRST, kproc_start,
    &sched_kp);
SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL);

static void sched_initticks(void *dummy);
SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks,
    NULL);

/*
 * Global run queue.
 */
static struct runq runq;

#ifdef SMP
/*
 * Per-CPU run queues
 */
static struct runq runq_pcpu[MAXCPU];
long runq_length[MAXCPU];

static cpuset_t idle_cpus_mask;
#endif

struct pcpuidlestat {
	u_int idlecalls;
	u_int oldidlecalls;
};
static DPCPU_DEFINE(struct pcpuidlestat, idlestat);

static void
setup_runqs(void)
{
#ifdef SMP
	int i;

	for (i = 0; i < MAXCPU; ++i)
		runq_init(&runq_pcpu[i]);
#endif

	runq_init(&runq);
}

static int
sysctl_kern_quantum(SYSCTL_HANDLER_ARGS)
{
	int error, new_val, period;

	period = 1000000 / realstathz;
	new_val = period * sched_slice;
	error = sysctl_handle_int(oidp, &new_val, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (new_val <= 0)
		return (EINVAL);
	sched_slice = imax(1, (new_val + period / 2) / period);
	hogticks = imax(1, (2 * hz * sched_slice + realstathz / 2) /
	    realstathz);
	return (0);
}
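/*
 * Worked example (assuming the defaults above): with realstathz == 127 the
 * period is 1000000 / 127 =~ 7874us, so the default sched_slice of 12
 * stathz ticks reads back through this handler as roughly 94ms.  A value
 * written in microseconds is rounded to the nearest whole number of
 * stathz ticks, never below 1.
 */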

SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RD, 0, "Scheduler");

SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "4BSD", 0,
    "Scheduler name");
SYSCTL_PROC(_kern_sched, OID_AUTO, quantum, CTLTYPE_INT | CTLFLAG_RW,
    NULL, 0, sysctl_kern_quantum, "I",
    "Quantum for timeshare threads in microseconds");
SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0,
    "Quantum for timeshare threads in stathz ticks");
#ifdef SMP
/* Enable forwarding of wakeups to all other cpus */
static SYSCTL_NODE(_kern_sched, OID_AUTO, ipiwakeup, CTLFLAG_RD, NULL,
    "Kernel SMP");

static int runq_fuzz = 1;
SYSCTL_INT(_kern_sched, OID_AUTO, runq_fuzz, CTLFLAG_RW, &runq_fuzz, 0, "");

static int forward_wakeup_enabled = 1;
SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, enabled, CTLFLAG_RW,
	   &forward_wakeup_enabled, 0,
	   "Forwarding of wakeup to idle CPUs");

static int forward_wakeups_requested = 0;
SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, requested, CTLFLAG_RD,
	   &forward_wakeups_requested, 0,
	   "Requests for Forwarding of wakeup to idle CPUs");

static int forward_wakeups_delivered = 0;
SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, delivered, CTLFLAG_RD,
	   &forward_wakeups_delivered, 0,
	   "Completed Forwarding of wakeup to idle CPUs");

static int forward_wakeup_use_mask = 1;
SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, usemask, CTLFLAG_RW,
	   &forward_wakeup_use_mask, 0,
	   "Use the mask of idle cpus");

static int forward_wakeup_use_loop = 0;
SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, useloop, CTLFLAG_RW,
	   &forward_wakeup_use_loop, 0,
	   "Use a loop to find idle cpus");

#endif
#if 0
static int sched_followon = 0;
SYSCTL_INT(_kern_sched, OID_AUTO, followon, CTLFLAG_RW,
	   &sched_followon, 0,
	   "allow threads to share a quantum");
#endif

SDT_PROVIDER_DEFINE(sched);

SDT_PROBE_DEFINE3(sched, , , change__pri, "struct thread *",
    "struct proc *", "uint8_t");
SDT_PROBE_DEFINE3(sched, , , dequeue, "struct thread *",
    "struct proc *", "void *");
SDT_PROBE_DEFINE4(sched, , , enqueue, "struct thread *",
    "struct proc *", "void *", "int");
SDT_PROBE_DEFINE4(sched, , , lend__pri, "struct thread *",
    "struct proc *", "uint8_t", "struct thread *");
SDT_PROBE_DEFINE2(sched, , , load__change, "int", "int");
SDT_PROBE_DEFINE2(sched, , , off__cpu, "struct thread *",
    "struct proc *");
SDT_PROBE_DEFINE(sched, , , on__cpu);
SDT_PROBE_DEFINE(sched, , , remain__cpu);
SDT_PROBE_DEFINE2(sched, , , surrender, "struct thread *",
    "struct proc *");

static __inline void
sched_load_add(void)
{

	sched_tdcnt++;
	KTR_COUNTER0(KTR_SCHED, "load", "global load", sched_tdcnt);
	SDT_PROBE2(sched, , , load__change, NOCPU, sched_tdcnt);
}

static __inline void
sched_load_rem(void)
{

	sched_tdcnt--;
	KTR_COUNTER0(KTR_SCHED, "load", "global load", sched_tdcnt);
	SDT_PROBE2(sched, , , load__change, NOCPU, sched_tdcnt);
}
/*
 * Arrange to reschedule if necessary, taking the priorities and
 * schedulers into account.
 */
static void
maybe_resched(struct thread *td)
{

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	if (td->td_priority < curthread->td_priority)
		curthread->td_flags |= TDF_NEEDRESCHED;
}
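/*
 * Note: maybe_resched() only marks curthread with TDF_NEEDRESCHED; the
 * actual switch happens later, typically when the flag is noticed on the
 * way back to user mode or at the next preemption point.
 */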

/*
 * This function is called when a thread is about to be put on run queue
 * because it has been made runnable or its priority has been adjusted.  It
 * determines if the new thread should preempt the current thread.  If so,
 * it sets td_owepreempt to request a preemption.
 */
int
maybe_preempt(struct thread *td)
{
#ifdef PREEMPTION
	struct thread *ctd;
	int cpri, pri;

	/*
	 * The new thread should not preempt the current thread if any of the
	 * following conditions are true:
	 *
	 *  - The kernel is in the throes of crashing (panicstr).
	 *  - The current thread has a higher (numerically lower) or
	 *    equivalent priority.  Note that this prevents curthread from
	 *    trying to preempt to itself.
	 *  - The current thread has an inhibitor set or is in the process of
	 *    exiting.  In this case, the current thread is about to switch
	 *    out anyways, so there's no point in preempting.  If we did,
	 *    the current thread would not be properly resumed as well, so
	 *    just avoid that whole landmine.
	 *  - If the new thread's priority is not a realtime priority and
	 *    the current thread's priority is not an idle priority and
	 *    FULL_PREEMPTION is disabled.
	 *
	 * If all of these conditions are false, but the current thread is in
	 * a nested critical section, then we have to defer the preemption
	 * until we exit the critical section.  Otherwise, switch immediately
	 * to the new thread.
	 */
	ctd = curthread;
	THREAD_LOCK_ASSERT(td, MA_OWNED);
	KASSERT((td->td_inhibitors == 0),
			("maybe_preempt: trying to run inhibited thread"));
	pri = td->td_priority;
	cpri = ctd->td_priority;
	if (panicstr != NULL || pri >= cpri /* || dumping */ ||
	    TD_IS_INHIBITED(ctd))
		return (0);
#ifndef FULL_PREEMPTION
	if (pri > PRI_MAX_ITHD && cpri < PRI_MIN_IDLE)
		return (0);
#endif

	CTR0(KTR_PROC, "maybe_preempt: scheduling preemption");
	ctd->td_owepreempt = 1;
	return (1);
#else
	return (0);
#endif
}

/*
 * Constants for digital decay and forget:
 *	90% of (ts_estcpu) usage in 5 * loadav time
 *	95% of (ts_pctcpu) usage in 60 seconds (load insensitive)
 *          Note that, as ps(1) mentions, this can let percentages
 *          total over 100% (I've seen 137.9% for 3 processes).
 *
 * Note that sched_clock() updates ts_estcpu and ts_cpticks asynchronously.
 *
 * We wish to decay away 90% of ts_estcpu in (5 * loadavg) seconds.
 * That is, the system wants to compute a value of decay such
 * that the following for loop:
 * 	for (i = 0; i < (5 * loadavg); i++)
 * 		ts_estcpu *= decay;
 * will compute
 * 	ts_estcpu *= 0.1;
 * for all values of loadavg:
 *
 * Mathematically this loop can be expressed by saying:
 * 	decay ** (5 * loadavg) ~= .1
 *
 * The system computes decay as:
 * 	decay = (2 * loadavg) / (2 * loadavg + 1)
 *
 * We wish to prove that the system's computation of decay
 * will always fulfill the equation:
 * 	decay ** (5 * loadavg) ~= .1
 *
 * If we compute b as:
 * 	b = 2 * loadavg
 * then
 * 	decay = b / (b + 1)
 *
 * We now need to prove two things:
 *	1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
 *	2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
 *
 * Facts:
 *         For x close to zero, exp(x) =~ 1 + x, since
 *              exp(x) = 0! + x**1/1! + x**2/2! + ... .
 *              therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
 *         For x close to zero, ln(1+x) =~ x, since
 *              ln(1+x) = x - x**2/2 + x**3/3 - ...     -1 < x < 1
 *              therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
 *         ln(.1) =~ -2.30
 *
 * Proof of (1):
 *    Solve (factor)**(power) =~ .1 given power (5*loadav):
 *	solving for factor,
 *      ln(factor) =~ (-2.30/5*loadav), or
 *      factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
 *          exp(-1/b) =~ (b-1)/b =~ b/(b+1).                    QED
 *
 * Proof of (2):
 *    Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
 *	solving for power,
 *      power*ln(b/(b+1)) =~ -2.30, or
 *      power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.  QED
 *
 * Actual power values for the implemented algorithm are as follows:
 *      loadav: 1       2       3       4
 *      power:  5.68    10.32   14.94   19.55
 */

/* calculations for digital decay to forget 90% of usage in 5*loadav sec */
#define	loadfactor(loadav)	(2 * (loadav))
#define	decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))
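/*
 * Illustrative check of the table above: for a load average of 1.0,
 * loadfactor() yields 2.0 and decay_cpu() multiplies by
 * 2*FSCALE / (2*FSCALE + FSCALE) == 2/3; (2/3)^5.68 =~ 0.1, i.e. about
 * 90% of ts_estcpu is forgotten after the 5.68 decays listed for loadav 1.
 */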

/* decay 95% of `ts_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
static fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;	/* exp(-1/20) */
SYSCTL_UINT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");

/*
 * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
 * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
 * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
 *
 * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
 *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
 *
 * If you don't want to bother with the faster/more-accurate formula, you
 * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
 * (more general) method of calculating the %age of CPU used by a process.
 */
#define	CCPU_SHIFT	11
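/*
 * Illustrative check of the shortcut above: with CCPU_SHIFT == 11,
 * (1 - exp(-1/20)) / 100 =~ 2^-11, which is why the "faster" branch in
 * schedcpu() can simply add ts_cpticks << (FSHIFT - CCPU_SHIFT) when
 * realstathz is 100 instead of multiplying by (FSCALE - ccpu) explicitly.
 */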

/*
 * Recompute process priorities, every hz ticks.
 * MP-safe, called without the Giant mutex.
 */
/* ARGSUSED */
static void
schedcpu(void)
{
	fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
	struct thread *td;
	struct proc *p;
	struct td_sched *ts;
	int awake;

	sx_slock(&allproc_lock);
	FOREACH_PROC_IN_SYSTEM(p) {
		PROC_LOCK(p);
		if (p->p_state == PRS_NEW) {
			PROC_UNLOCK(p);
			continue;
		}
		FOREACH_THREAD_IN_PROC(p, td) {
			awake = 0;
			ts = td_get_sched(td);
			thread_lock(td);
			/*
			 * Increment sleep time (if sleeping).  We
			 * ignore overflow, as above.
			 */
			/*
			 * The td_sched slptimes are not touched in wakeup
			 * because the thread may not HAVE everything in
			 * memory? XXX I think this is out of date.
			 */
			if (TD_ON_RUNQ(td)) {
				awake = 1;
				td->td_flags &= ~TDF_DIDRUN;
			} else if (TD_IS_RUNNING(td)) {
				awake = 1;
				/* Do not clear TDF_DIDRUN */
			} else if (td->td_flags & TDF_DIDRUN) {
				awake = 1;
				td->td_flags &= ~TDF_DIDRUN;
			}

			/*
			 * ts_pctcpu is only for ps and ttyinfo().
			 */
			ts->ts_pctcpu = (ts->ts_pctcpu * ccpu) >> FSHIFT;
			/*
			 * If the td_sched has been idle the entire second,
			 * stop recalculating its priority until
			 * it wakes up.
			 */
			if (ts->ts_cpticks != 0) {
#if	(FSHIFT >= CCPU_SHIFT)
				ts->ts_pctcpu += (realstathz == 100)
				    ? ((fixpt_t) ts->ts_cpticks) <<
				    (FSHIFT - CCPU_SHIFT) :
				    100 * (((fixpt_t) ts->ts_cpticks)
				    << (FSHIFT - CCPU_SHIFT)) / realstathz;
#else
				ts->ts_pctcpu += ((FSCALE - ccpu) *
				    (ts->ts_cpticks *
				    FSCALE / realstathz)) >> FSHIFT;
#endif
				ts->ts_cpticks = 0;
			}
			/*
			 * If there are ANY running threads in this process,
			 * then don't count it as sleeping.
			 * XXX: this is broken.
			 */
			if (awake) {
				if (ts->ts_slptime > 1) {
					/*
					 * In an ideal world, this should not
					 * happen, because whoever woke us
					 * up from the long sleep should have
					 * unwound the slptime and reset our
					 * priority before we run at the stale
					 * priority.  Should KASSERT at some
					 * point when all the cases are fixed.
					 */
					updatepri(td);
				}
				ts->ts_slptime = 0;
			} else
				ts->ts_slptime++;
			if (ts->ts_slptime > 1) {
				thread_unlock(td);
				continue;
			}
			ts->ts_estcpu = decay_cpu(loadfac, ts->ts_estcpu);
			resetpriority(td);
			resetpriority_thread(td);
			thread_unlock(td);
		}
		PROC_UNLOCK(p);
	}
	sx_sunlock(&allproc_lock);
}
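/*
 * Consistency note for the decay above: ccpu == exp(-1/20) applied once a
 * second means exp(-60/20) =~ 0.05 of ts_pctcpu remains after a minute,
 * matching the "95% in 60 seconds" figure quoted earlier.
 */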

/*
 * Main loop for a kthread that executes schedcpu once a second.
 */
static void
schedcpu_thread(void)
{

	for (;;) {
		schedcpu();
		pause("-", hz);
	}
}

/*
 * Recalculate the priority of a process after it has slept for a while.
 * For all load averages >= 1 and max ts_estcpu of 255, sleeping for at
 * least six times the loadfactor will decay ts_estcpu to zero.
 */
static void
updatepri(struct thread *td)
{
	struct td_sched *ts;
	fixpt_t loadfac;
	unsigned int newcpu;

	ts = td_get_sched(td);
	loadfac = loadfactor(averunnable.ldavg[0]);
	if (ts->ts_slptime > 5 * loadfac)
		ts->ts_estcpu = 0;
	else {
		newcpu = ts->ts_estcpu;
		ts->ts_slptime--;	/* was incremented in schedcpu() */
		while (newcpu && --ts->ts_slptime)
			newcpu = decay_cpu(loadfac, newcpu);
		ts->ts_estcpu = newcpu;
	}
}

/*
 * Compute the priority of a process when running in user mode.
 * Arrange to reschedule if the resulting priority is better
 * than that of the current process.
 */
static void
resetpriority(struct thread *td)
{
	u_int newpriority;

	if (td->td_pri_class != PRI_TIMESHARE)
		return;
	newpriority = PUSER +
	    td_get_sched(td)->ts_estcpu / INVERSE_ESTCPU_WEIGHT +
	    NICE_WEIGHT * (td->td_proc->p_nice - PRIO_MIN);
	newpriority = min(max(newpriority, PRI_MIN_TIMESHARE),
	    PRI_MAX_TIMESHARE);
	sched_user_prio(td, newpriority);
}
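/*
 * Worked example (uniprocessor defaults, so INVERSE_ESTCPU_WEIGHT == 8 and
 * NICE_WEIGHT == 1): a thread with ts_estcpu == 80 and p_nice == 0 gets
 * PUSER + 80 / 8 + 1 * (0 - PRIO_MIN) == PUSER + 30 before clamping; every
 * additional 8 points of ts_estcpu cost one more priority level.
 */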

/*
 * Update the thread's priority when the associated process's user
 * priority changes.
 */
static void
resetpriority_thread(struct thread *td)
{

	/* Only change threads with a time sharing user priority. */
	if (td->td_priority < PRI_MIN_TIMESHARE ||
	    td->td_priority > PRI_MAX_TIMESHARE)
		return;

	/* XXX the whole needresched thing is broken, but not silly. */
	maybe_resched(td);

	sched_prio(td, td->td_user_pri);
}

/* ARGSUSED */
static void
sched_setup(void *dummy)
{

	setup_runqs();

	/* Account for thread0. */
	sched_load_add();
}

/*
 * This routine determines time constants after stathz and hz are setup.
 */
static void
sched_initticks(void *dummy)
{

	realstathz = stathz ? stathz : hz;
	sched_slice = realstathz / 10;	/* ~100ms */
	hogticks = imax(1, (2 * hz * sched_slice + realstathz / 2) /
	    realstathz);
}

/* External interfaces start here */

/*
 * Very early in the boot some setup of scheduler-specific
 * parts of proc0 and of some scheduler resources needs to be done.
 * Called from:
 *  proc0_init()
 */
void
schedinit(void)
{

	/*
	 * Set up the scheduler specific parts of thread0.
	 */
	thread0.td_lock = &sched_lock;
	td_get_sched(&thread0)->ts_slice = sched_slice;
	mtx_init(&sched_lock, "sched lock", NULL, MTX_SPIN | MTX_RECURSE);
}

int
sched_runnable(void)
{
#ifdef SMP
	return runq_check(&runq) + runq_check(&runq_pcpu[PCPU_GET(cpuid)]);
#else
	return runq_check(&runq);
#endif
}

int
sched_rr_interval(void)
{

	/* Convert sched_slice from stathz to hz. */
	return (imax(1, (sched_slice * hz + realstathz / 2) / realstathz));
}
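/*
 * Example of the conversion above (assuming hz == 1000): with the defaults
 * realstathz == 127 and sched_slice == 12, sched_rr_interval() reports
 * (12 * 1000 + 63) / 127 == 94 hz ticks, i.e. roughly the 100ms quantum.
 */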

/*
 * We adjust the priority of the current process.  The priority of a
 * process gets worse as it accumulates CPU time.  The cpu usage
 * estimator (ts_estcpu) is increased here.  resetpriority() will
 * compute a different priority each time ts_estcpu increases by
 * INVERSE_ESTCPU_WEIGHT (until PRI_MAX_TIMESHARE is reached).  The
 * cpu usage estimator ramps up quite quickly when the process is
 * running (linearly), and decays away exponentially, at a rate which
 * is proportionally slower when the system is busy.  The basic
 * principle is that the system will 90% forget that the process used
 * a lot of CPU time in 5 * loadav seconds.  This causes the system to
 * favor processes which haven't run much recently, and to round-robin
 * among other processes.
 */
void
sched_clock(struct thread *td)
{
	struct pcpuidlestat *stat;
	struct td_sched *ts;

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	ts = td_get_sched(td);

	ts->ts_cpticks++;
	ts->ts_estcpu = ESTCPULIM(ts->ts_estcpu + 1);
	if ((ts->ts_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) {
		resetpriority(td);
		resetpriority_thread(td);
	}

	/*
	 * Force a context switch if the current thread has used up a full
	 * time slice (default is 100ms).
	 */
	if (!TD_IS_IDLETHREAD(td) && --ts->ts_slice <= 0) {
		ts->ts_slice = sched_slice;
		td->td_flags |= TDF_NEEDRESCHED | TDF_SLICEEND;
	}

	stat = DPCPU_PTR(idlestat);
	stat->oldidlecalls = stat->idlecalls;
	stat->idlecalls = 0;
}

/*
 * Charge child's scheduling CPU usage to parent.
 */
void
sched_exit(struct proc *p, struct thread *td)
{

	KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "proc exit",
	    "prio:%d", td->td_priority);

	PROC_LOCK_ASSERT(p, MA_OWNED);
	sched_exit_thread(FIRST_THREAD_IN_PROC(p), td);
}

void
sched_exit_thread(struct thread *td, struct thread *child)
{

	KTR_STATE1(KTR_SCHED, "thread", sched_tdname(child), "exit",
	    "prio:%d", child->td_priority);
	thread_lock(td);
	td_get_sched(td)->ts_estcpu = ESTCPULIM(td_get_sched(td)->ts_estcpu +
	    td_get_sched(child)->ts_estcpu);
	thread_unlock(td);
	thread_lock(child);
	if ((child->td_flags & TDF_NOLOAD) == 0)
		sched_load_rem();
	thread_unlock(child);
}

void
sched_fork(struct thread *td, struct thread *childtd)
{
	sched_fork_thread(td, childtd);
}

void
sched_fork_thread(struct thread *td, struct thread *childtd)
{
	struct td_sched *ts, *tsc;

	childtd->td_oncpu = NOCPU;
	childtd->td_lastcpu = NOCPU;
	childtd->td_lock = &sched_lock;
	childtd->td_cpuset = cpuset_ref(td->td_cpuset);
	childtd->td_priority = childtd->td_base_pri;
	ts = td_get_sched(childtd);
	bzero(ts, sizeof(*ts));
	tsc = td_get_sched(td);
	ts->ts_estcpu = tsc->ts_estcpu;
	ts->ts_flags |= (tsc->ts_flags & TSF_AFFINITY);
	ts->ts_slice = 1;
}
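/*
 * Note on the fork path above: the child inherits the parent's ts_estcpu
 * and affinity flag, but starts with ts_slice == 1, so it comes up for a
 * round-robin decision on its very first sched_clock() tick.
 */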

void
sched_nice(struct proc *p, int nice)
{
	struct thread *td;

	PROC_LOCK_ASSERT(p, MA_OWNED);
	p->p_nice = nice;
	FOREACH_THREAD_IN_PROC(p, td) {
		thread_lock(td);
		resetpriority(td);
		resetpriority_thread(td);
		thread_unlock(td);
	}
}

void
sched_class(struct thread *td, int class)
{
	THREAD_LOCK_ASSERT(td, MA_OWNED);
	td->td_pri_class = class;
}

/*
 * Adjust the priority of a thread.
 */
static void
sched_priority(struct thread *td, u_char prio)
{

	KTR_POINT3(KTR_SCHED, "thread", sched_tdname(td), "priority change",
	    "prio:%d", td->td_priority, "new prio:%d", prio, KTR_ATTR_LINKED,
	    sched_tdname(curthread));
	SDT_PROBE3(sched, , , change__pri, td, td->td_proc, prio);
	if (td != curthread && prio > td->td_priority) {
		KTR_POINT3(KTR_SCHED, "thread", sched_tdname(curthread),
		    "lend prio", "prio:%d", td->td_priority, "new prio:%d",
		    prio, KTR_ATTR_LINKED, sched_tdname(td));
		SDT_PROBE4(sched, , , lend__pri, td, td->td_proc, prio,
		    curthread);
	}
	THREAD_LOCK_ASSERT(td, MA_OWNED);
	if (td->td_priority == prio)
		return;
	td->td_priority = prio;
	if (TD_ON_RUNQ(td) && td->td_rqindex != (prio / RQ_PPQ)) {
		sched_rem(td);
		sched_add(td, SRQ_BORING);
	}
}

/*
 * Update a thread's priority when it is lent another thread's
 * priority.
 */
void
sched_lend_prio(struct thread *td, u_char prio)
{

	td->td_flags |= TDF_BORROWING;
	sched_priority(td, prio);
}

/*
 * Restore a thread's priority when priority propagation is
 * over.  The prio argument is the minimum priority the thread
 * needs to have to satisfy other possible priority lending
 * requests.  If the thread's regular priority is less
 * important than prio the thread will keep a priority boost
 * of prio.
 */
void
sched_unlend_prio(struct thread *td, u_char prio)
{
	u_char base_pri;

	if (td->td_base_pri >= PRI_MIN_TIMESHARE &&
	    td->td_base_pri <= PRI_MAX_TIMESHARE)
		base_pri = td->td_user_pri;
	else
		base_pri = td->td_base_pri;
	if (prio >= base_pri) {
		td->td_flags &= ~TDF_BORROWING;
		sched_prio(td, base_pri);
	} else
		sched_lend_prio(td, prio);
}

void
sched_prio(struct thread *td, u_char prio)
{
	u_char oldprio;

	/* First, update the base priority. */
	td->td_base_pri = prio;

	/*
	 * If the thread is borrowing another thread's priority, don't ever
	 * lower the priority.
	 */
	if (td->td_flags & TDF_BORROWING && td->td_priority < prio)
		return;

	/* Change the real priority. */
	oldprio = td->td_priority;
	sched_priority(td, prio);

	/*
	 * If the thread is on a turnstile, then let the turnstile update
	 * its state.
	 */
	if (TD_ON_LOCK(td) && oldprio != prio)
		turnstile_adjust(td, oldprio);
}

void
sched_user_prio(struct thread *td, u_char prio)
{

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	td->td_base_user_pri = prio;
	if (td->td_lend_user_pri <= prio)
		return;
	td->td_user_pri = prio;
}

void
sched_lend_user_prio(struct thread *td, u_char prio)
{

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	td->td_lend_user_pri = prio;
	td->td_user_pri = min(prio, td->td_base_user_pri);
	if (td->td_priority > td->td_user_pri)
		sched_prio(td, td->td_user_pri);
	else if (td->td_priority != td->td_user_pri)
		td->td_flags |= TDF_NEEDRESCHED;
}

void
sched_sleep(struct thread *td, int pri)
{

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	td->td_slptick = ticks;
	td_get_sched(td)->ts_slptime = 0;
	if (pri != 0 && PRI_BASE(td->td_pri_class) == PRI_TIMESHARE)
		sched_prio(td, pri);
	if (TD_IS_SUSPENDED(td) || pri >= PSOCK)
		td->td_flags |= TDF_CANSWAP;
}

void
sched_switch(struct thread *td, struct thread *newtd, int flags)
{
	struct mtx *tmtx;
	struct td_sched *ts;
	struct proc *p;
	int preempted;

	tmtx = NULL;
	ts = td_get_sched(td);
	p = td->td_proc;

	THREAD_LOCK_ASSERT(td, MA_OWNED);

	/*
	 * Switch to the sched lock to fix things up and pick
	 * a new thread.
	 * Block the td_lock in order to avoid breaking the critical path.
	 */
	if (td->td_lock != &sched_lock) {
		mtx_lock_spin(&sched_lock);
		tmtx = thread_lock_block(td);
	}

	if ((td->td_flags & TDF_NOLOAD) == 0)
		sched_load_rem();

	td->td_lastcpu = td->td_oncpu;
	preempted = (td->td_flags & TDF_SLICEEND) == 0 &&
	    (flags & SW_PREEMPT) != 0;
	td->td_flags &= ~(TDF_NEEDRESCHED | TDF_SLICEEND);
	td->td_owepreempt = 0;
	td->td_oncpu = NOCPU;

	/*
	 * At the last moment, if this thread is still marked RUNNING,
	 * then put it back on the run queue as it has not been suspended
	 * or stopped or any thing else similar.  We never put the idle
	 * threads on the run queue, however.
	 */
	if (td->td_flags & TDF_IDLETD) {
		TD_SET_CAN_RUN(td);
#ifdef SMP
		CPU_CLR(PCPU_GET(cpuid), &idle_cpus_mask);
#endif
	} else {
		if (TD_IS_RUNNING(td)) {
			/* Put us back on the run queue. */
			sched_add(td, preempted ?
			    SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
			    SRQ_OURSELF|SRQ_YIELDING);
		}
	}
	if (newtd) {
		/*
		 * The thread we are about to run needs to be counted
		 * as if it had been added to the run queue and selected.
		 * It came from:
		 * * A preemption
		 * * An upcall
		 * * A followon
		 */
		KASSERT((newtd->td_inhibitors == 0),
			("trying to run inhibited thread"));
		newtd->td_flags |= TDF_DIDRUN;
		TD_SET_RUNNING(newtd);
		if ((newtd->td_flags & TDF_NOLOAD) == 0)
			sched_load_add();
	} else {
		newtd = choosethread();
		MPASS(newtd->td_lock == &sched_lock);
	}

#if (KTR_COMPILE & KTR_SCHED) != 0
	if (TD_IS_IDLETHREAD(td))
		KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "idle",
		    "prio:%d", td->td_priority);
	else
		KTR_STATE3(KTR_SCHED, "thread", sched_tdname(td), KTDSTATE(td),
		    "prio:%d", td->td_priority, "wmesg:\"%s\"", td->td_wmesg,
		    "lockname:\"%s\"", td->td_lockname);
#endif

	if (td != newtd) {
#ifdef	HWPMC_HOOKS
		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
#endif

		SDT_PROBE2(sched, , , off__cpu, newtd, newtd->td_proc);

		/* I feel sleepy */
		lock_profile_release_lock(&sched_lock.lock_object);
#ifdef KDTRACE_HOOKS
		/*
		 * If DTrace has set the active vtime enum to anything
		 * other than INACTIVE (0), then it should have set the
		 * function to call.
		 */
		if (dtrace_vtime_active)
			(*dtrace_vtime_switch_func)(newtd);
#endif

		cpu_switch(td, newtd, tmtx != NULL ? tmtx : td->td_lock);
		lock_profile_obtain_lock_success(&sched_lock.lock_object,
		    0, 0, __FILE__, __LINE__);
		/*
		 * Where am I?  What year is it?
		 * We are in the same thread that went to sleep above,
		 * but any amount of time may have passed. All our context
		 * will still be available as will local variables.
		 * PCPU values however may have changed as we may have
		 * changed CPU so don't trust cached values of them.
		 * New threads will go to fork_exit() instead of here
		 * so if you change things here you may need to change
		 * things there too.
		 *
		 * If the thread above was exiting it will never wake
		 * up again here, so either it has saved everything it
		 * needed to, or the thread_wait() or wait() will
		 * need to reap it.
		 */

		SDT_PROBE0(sched, , , on__cpu);
#ifdef	HWPMC_HOOKS
		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
#endif
	} else
		SDT_PROBE0(sched, , , remain__cpu);

	KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "running",
	    "prio:%d", td->td_priority);

#ifdef SMP
	if (td->td_flags & TDF_IDLETD)
		CPU_SET(PCPU_GET(cpuid), &idle_cpus_mask);
#endif
	sched_lock.mtx_lock = (uintptr_t)td;
	td->td_oncpu = PCPU_GET(cpuid);
	MPASS(td->td_lock == &sched_lock);
}

void
sched_wakeup(struct thread *td)
{
	struct td_sched *ts;

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	ts = td_get_sched(td);
	td->td_flags &= ~TDF_CANSWAP;
	if (ts->ts_slptime > 1) {
		updatepri(td);
		resetpriority(td);
	}
	td->td_slptick = 0;
	ts->ts_slptime = 0;
	ts->ts_slice = sched_slice;
	sched_add(td, SRQ_BORING);
}

#ifdef SMP
static int
forward_wakeup(int cpunum)
{
	struct pcpu *pc;
	cpuset_t dontuse, map, map2;
	u_int id, me;
	int iscpuset;

	mtx_assert(&sched_lock, MA_OWNED);

	CTR0(KTR_RUNQ, "forward_wakeup()");

	if ((!forward_wakeup_enabled) ||
	     (forward_wakeup_use_mask == 0 && forward_wakeup_use_loop == 0))
		return (0);
	if (!smp_started || panicstr)
		return (0);

	forward_wakeups_requested++;

	/*
	 * Check the idle mask we received against what we calculated
	 * before in the old version.
	 */
	me = PCPU_GET(cpuid);

	/* Don't bother if we should be doing it ourself. */
	if (CPU_ISSET(me, &idle_cpus_mask) &&
	    (cpunum == NOCPU || me == cpunum))
		return (0);

	CPU_SETOF(me, &dontuse);
	CPU_OR(&dontuse, &stopped_cpus);
	CPU_OR(&dontuse, &hlt_cpus_mask);
	CPU_ZERO(&map2);
	if (forward_wakeup_use_loop) {
		STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) {
			id = pc->pc_cpuid;
			if (!CPU_ISSET(id, &dontuse) &&
			    pc->pc_curthread == pc->pc_idlethread) {
				CPU_SET(id, &map2);
			}
		}
	}

	if (forward_wakeup_use_mask) {
		map = idle_cpus_mask;
		CPU_NAND(&map, &dontuse);

		/* If they are both on, compare and use loop if different. */
		if (forward_wakeup_use_loop) {
			if (CPU_CMP(&map, &map2)) {
				printf("map != map2, loop method preferred\n");
				map = map2;
			}
		}
	} else {
		map = map2;
	}

	/* If we only allow a specific CPU, then mask off all the others. */
	if (cpunum != NOCPU) {
		KASSERT((cpunum <= mp_maxcpus),("forward_wakeup: bad cpunum."));
		iscpuset = CPU_ISSET(cpunum, &map);
		if (iscpuset == 0)
			CPU_ZERO(&map);
		else
			CPU_SETOF(cpunum, &map);
	}
	if (!CPU_EMPTY(&map)) {
		forward_wakeups_delivered++;
		STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) {
			id = pc->pc_cpuid;
			if (!CPU_ISSET(id, &map))
				continue;
			if (cpu_idle_wakeup(pc->pc_cpuid))
				CPU_CLR(id, &map);
		}
		if (!CPU_EMPTY(&map))
			ipi_selected(map, IPI_AST);
		return (1);
	}
	if (cpunum == NOCPU)
		printf("forward_wakeup: Idle processor not found\n");
	return (0);
}

static void
kick_other_cpu(int pri, int cpuid)
{
	struct pcpu *pcpu;
	int cpri;

	pcpu = pcpu_find(cpuid);
	if (CPU_ISSET(cpuid, &idle_cpus_mask)) {
		forward_wakeups_delivered++;
		if (!cpu_idle_wakeup(cpuid))
			ipi_cpu(cpuid, IPI_AST);
		return;
	}

	cpri = pcpu->pc_curthread->td_priority;
	if (pri >= cpri)
		return;

#if defined(IPI_PREEMPTION) && defined(PREEMPTION)
#if !defined(FULL_PREEMPTION)
	if (pri <= PRI_MAX_ITHD)
#endif /* ! FULL_PREEMPTION */
	{
		ipi_cpu(cpuid, IPI_PREEMPT);
		return;
	}
#endif /* defined(IPI_PREEMPTION) && defined(PREEMPTION) */

	pcpu->pc_curthread->td_flags |= TDF_NEEDRESCHED;
	ipi_cpu(cpuid, IPI_AST);
	return;
}
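/*
 * Summary of the escalation in kick_other_cpu(): an idle target CPU is
 * woken (or sent an AST); a busy target running something less important
 * gets IPI_PREEMPT when preemption is compiled in and the priority
 * qualifies, and otherwise is marked TDF_NEEDRESCHED and sent an AST.
 */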
#endif /* SMP */

#ifdef SMP
static int
sched_pickcpu(struct thread *td)
{
	int best, cpu;

	mtx_assert(&sched_lock, MA_OWNED);

	if (td->td_lastcpu != NOCPU && THREAD_CAN_SCHED(td, td->td_lastcpu))
		best = td->td_lastcpu;
	else
		best = NOCPU;
	CPU_FOREACH(cpu) {
		if (!THREAD_CAN_SCHED(td, cpu))
			continue;

		if (best == NOCPU)
			best = cpu;
		else if (runq_length[cpu] < runq_length[best])
			best = cpu;
	}
	KASSERT(best != NOCPU, ("no valid CPUs"));

	return (best);
}
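/*
 * Note: sched_pickcpu() is a simple affinity heuristic; it starts from
 * td_lastcpu when the cpuset still allows it and then prefers the allowed
 * CPU with the shortest per-CPU run queue (runq_length[]).
 */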
1251180923Sjhb#endif
1252180923Sjhb
1253104964Sjeffvoid
1254134586Sjuliansched_add(struct thread *td, int flags)
1255147182Sups#ifdef SMP
1256104964Sjeff{
1257223758Sattilio	cpuset_t tidlemsk;
1258164936Sjulian	struct td_sched *ts;
1259223758Sattilio	u_int cpu, cpuid;
1260134591Sjulian	int forwarded = 0;
1261147182Sups	int single_cpu = 0;
1262121127Sjeff
1263301456Skib	ts = td_get_sched(td);
1264170293Sjeff	THREAD_LOCK_ASSERT(td, MA_OWNED);
1265166188Sjeff	KASSERT((td->td_inhibitors == 0),
1266166188Sjeff	    ("sched_add: trying to run inhibited thread"));
1267166188Sjeff	KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
1268166188Sjeff	    ("sched_add: bad thread state"));
1269172207Sjeff	KASSERT(td->td_flags & TDF_INMEM,
1270172207Sjeff	    ("sched_add: thread swapped out"));
1271180879Sjhb
1272187357Sjeff	KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq add",
1273187357Sjeff	    "prio:%d", td->td_priority, KTR_ATTR_LINKED,
1274187357Sjeff	    sched_tdname(curthread));
1275187357Sjeff	KTR_POINT1(KTR_SCHED, "thread", sched_tdname(curthread), "wokeup",
1276187357Sjeff	    KTR_ATTR_LINKED, sched_tdname(td));
1277235459Srstone	SDT_PROBE4(sched, , , enqueue, td, td->td_proc, NULL,
1278235459Srstone	    flags & SRQ_PREEMPTED);
1279187357Sjeff
1280187357Sjeff
1281170293Sjeff	/*
1282170293Sjeff	 * Now that the thread is moving to the run-queue, set the lock
1283170293Sjeff	 * to the scheduler's lock.
1284170293Sjeff	 */
1285170293Sjeff	if (td->td_lock != &sched_lock) {
1286170293Sjeff		mtx_lock_spin(&sched_lock);
1287170293Sjeff		thread_lock_set(td, &sched_lock);
1288170293Sjeff	}
1289166188Sjeff	TD_SET_RUNQ(td);
1290131481Sjhb
1291221081Srstone	/*
1292221081Srstone	 * If SMP is started and the thread is pinned or otherwise limited to
1293221081Srstone	 * a specific set of CPUs, queue the thread to a per-CPU run queue.
1294221081Srstone	 * Otherwise, queue the thread to the global run queue.
1295221081Srstone	 *
1296221081Srstone	 * If SMP has not yet been started we must use the global run queue
1297221081Srstone	 * as per-CPU state may not be initialized yet and we may crash if we
1298221081Srstone	 * try to access the per-CPU run queues.
1299221081Srstone	 */
1300221081Srstone	if (smp_started && (td->td_pinned != 0 || td->td_flags & TDF_BOUND ||
1301221081Srstone	    ts->ts_flags & TSF_AFFINITY)) {
1302221081Srstone		if (td->td_pinned != 0)
1303221081Srstone			cpu = td->td_lastcpu;
1304221081Srstone		else if (td->td_flags & TDF_BOUND) {
1305221081Srstone			/* Find CPU from bound runq. */
1306221081Srstone			KASSERT(SKE_RUNQ_PCPU(ts),
1307221081Srstone			    ("sched_add: bound td_sched not on cpu runq"));
1308221081Srstone			cpu = ts->ts_runq - &runq_pcpu[0];
1309221081Srstone		} else
1310221081Srstone			/* Find a valid CPU for our cpuset */
1311221081Srstone			cpu = sched_pickcpu(td);
1312164936Sjulian		ts->ts_runq = &runq_pcpu[cpu];
1313147182Sups		single_cpu = 1;
1314147182Sups		CTR3(KTR_RUNQ,
1315180879Sjhb		    "sched_add: Put td_sched:%p(td:%p) on cpu%d runq", ts, td,
1316180879Sjhb		    cpu);
1317180879Sjhb	} else {
1318134591Sjulian		CTR2(KTR_RUNQ,
1319180879Sjhb		    "sched_add: adding td_sched:%p (td:%p) to gbl runq", ts,
1320180879Sjhb		    td);
1321134591Sjulian		cpu = NOCPU;
1322164936Sjulian		ts->ts_runq = &runq;
1323147182Sups	}
1324180879Sjhb
1325309446Sjhb	if ((td->td_flags & TDF_NOLOAD) == 0)
1326309446Sjhb		sched_load_add();
1327309446Sjhb	runq_add(ts->ts_runq, td, flags);
1328309446Sjhb	if (cpu != NOCPU)
1329309446Sjhb		runq_length[cpu]++;
1330309446Sjhb
1331223758Sattilio	cpuid = PCPU_GET(cpuid);
1332223758Sattilio	if (single_cpu && cpu != cpuid) {
1333180879Sjhb	        kick_other_cpu(td->td_priority, cpu);
1334124955Sjeff	} else {
1335147190Sups		if (!single_cpu) {
1336223758Sattilio			tidlemsk = idle_cpus_mask;
1337223758Sattilio			CPU_NAND(&tidlemsk, &hlt_cpus_mask);
1338223758Sattilio			CPU_CLR(cpuid, &tidlemsk);
1339147182Sups
1340223758Sattilio			if (!CPU_ISSET(cpuid, &idle_cpus_mask) &&
1341223758Sattilio			    ((flags & SRQ_INTR) == 0) &&
1342222813Sattilio			    !CPU_EMPTY(&tidlemsk))
1343147182Sups				forwarded = forward_wakeup(cpu);
1344147182Sups		}
1345147182Sups
1346147182Sups		if (!forwarded) {
1347309446Sjhb			if (!maybe_preempt(td))
1348147182Sups				maybe_resched(td);
1349147182Sups		}
1350124955Sjeff	}
1351147182Sups}
1352147182Sups#else /* SMP */
1353147182Sups{
1354164936Sjulian	struct td_sched *ts;
1355180923Sjhb
1356301456Skib	ts = td_get_sched(td);
1357170293Sjeff	THREAD_LOCK_ASSERT(td, MA_OWNED);
1358166188Sjeff	KASSERT((td->td_inhibitors == 0),
1359166188Sjeff	    ("sched_add: trying to run inhibited thread"));
1360166188Sjeff	KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
1361166188Sjeff	    ("sched_add: bad thread state"));
1362172207Sjeff	KASSERT(td->td_flags & TDF_INMEM,
1363172207Sjeff	    ("sched_add: thread swapped out"));
1364187357Sjeff	KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq add",
1365187357Sjeff	    "prio:%d", td->td_priority, KTR_ATTR_LINKED,
1366187357Sjeff	    sched_tdname(curthread));
1367187357Sjeff	KTR_POINT1(KTR_SCHED, "thread", sched_tdname(curthread), "wokeup",
1368187357Sjeff	    KTR_ATTR_LINKED, sched_tdname(td));
1369235471Spluknet	SDT_PROBE4(sched, , , enqueue, td, td->td_proc, NULL,
1370235459Srstone	    flags & SRQ_PREEMPTED);
1371180879Sjhb
1372170293Sjeff	/*
1373170293Sjeff	 * Now that the thread is moving to the run-queue, set the lock
1374170293Sjeff	 * to the scheduler's lock.
1375170293Sjeff	 */
1376170293Sjeff	if (td->td_lock != &sched_lock) {
1377170293Sjeff		mtx_lock_spin(&sched_lock);
1378170293Sjeff		thread_lock_set(td, &sched_lock);
1379170293Sjeff	}
1380166188Sjeff	TD_SET_RUNQ(td);
1381164936Sjulian	CTR2(KTR_RUNQ, "sched_add: adding td_sched:%p (td:%p) to runq", ts, td);
1382164936Sjulian	ts->ts_runq = &runq;
1383134591Sjulian
1384198854Sattilio	if ((td->td_flags & TDF_NOLOAD) == 0)
1385139317Sjeff		sched_load_add();
1386177435Sjeff	runq_add(ts->ts_runq, td, flags);
1387309446Sjhb	if (!maybe_preempt(td))
1388309446Sjhb		maybe_resched(td);
1389104964Sjeff}
1390147182Sups#endif /* SMP */
1391147182Sups
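/*
 * Remove a runnable thread from its run queue and mark it as able to
 * run but not queued.
 */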
1392104964Sjeffvoid
1393121127Sjeffsched_rem(struct thread *td)
1394104964Sjeff{
1395164936Sjulian	struct td_sched *ts;
1396121127Sjeff
1397301456Skib	ts = td_get_sched(td);
1398172207Sjeff	KASSERT(td->td_flags & TDF_INMEM,
1399172207Sjeff	    ("sched_rem: thread swapped out"));
1400166188Sjeff	KASSERT(TD_ON_RUNQ(td),
1401164936Sjulian	    ("sched_rem: thread not on run queue"));
1402104964Sjeff	mtx_assert(&sched_lock, MA_OWNED);
1403187357Sjeff	KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq rem",
1404187357Sjeff	    "prio:%d", td->td_priority, KTR_ATTR_LINKED,
1405187357Sjeff	    sched_tdname(curthread));
1406235459Srstone	SDT_PROBE3(sched, , , dequeue, td, td->td_proc, NULL);
1407104964Sjeff
1408198854Sattilio	if ((td->td_flags & TDF_NOLOAD) == 0)
1409139317Sjeff		sched_load_rem();
1410180923Sjhb#ifdef SMP
1411180923Sjhb	if (ts->ts_runq != &runq)
1412180923Sjhb		runq_length[ts->ts_runq - runq_pcpu]--;
1413180923Sjhb#endif
1414177435Sjeff	runq_remove(ts->ts_runq, td);
1415166188Sjeff	TD_SET_CAN_RUN(td);
1416104964Sjeff}
1417104964Sjeff
1418135295Sjulian/*
1419180879Sjhb * Select threads to run.  Note that running threads still consume a
1420180879Sjhb * slot.
1421135295Sjulian */
1422166188Sjeffstruct thread *
1423104964Sjeffsched_choose(void)
1424104964Sjeff{
1425177435Sjeff	struct thread *td;
1426124955Sjeff	struct runq *rq;
1427104964Sjeff
1428170293Sjeff	mtx_assert(&sched_lock, MA_OWNED);
1429124955Sjeff#ifdef SMP
1430177435Sjeff	struct thread *tdcpu;
1431124955Sjeff
1432124955Sjeff	rq = &runq;
1433177435Sjeff	td = runq_choose_fuzz(&runq, runq_fuzz);
1434177435Sjeff	tdcpu = runq_choose(&runq_pcpu[PCPU_GET(cpuid)]);
1435104964Sjeff
1436180879Sjhb	if (td == NULL ||
1437180879Sjhb	    (tdcpu != NULL &&
1438177435Sjeff	     tdcpu->td_priority < td->td_priority)) {
1439177435Sjeff		CTR2(KTR_RUNQ, "choosing td %p from pcpu runq %d", tdcpu,
1440124955Sjeff		     PCPU_GET(cpuid));
1441177435Sjeff		td = tdcpu;
1442124955Sjeff		rq = &runq_pcpu[PCPU_GET(cpuid)];
1443180879Sjhb	} else {
1444177435Sjeff		CTR1(KTR_RUNQ, "choosing td %p from main runq", td);
1445124955Sjeff	}
1446124955Sjeff
1447124955Sjeff#else
1448124955Sjeff	rq = &runq;
1449177435Sjeff	td = runq_choose(&runq);
1450124955Sjeff#endif
1451124955Sjeff
1452177435Sjeff	if (td) {
1453180923Sjhb#ifdef SMP
1454180923Sjhb		if (td == tdcpu)
1455180923Sjhb			runq_length[PCPU_GET(cpuid)]--;
1456180923Sjhb#endif
1457177435Sjeff		runq_remove(rq, td);
1458177435Sjeff		td->td_flags |= TDF_DIDRUN;
1459104964Sjeff
1460177435Sjeff		KASSERT(td->td_flags & TDF_INMEM,
1461172207Sjeff		    ("sched_choose: thread swapped out"));
1462177435Sjeff		return (td);
1463180879Sjhb	}
1464166188Sjeff	return (PCPU_GET(idlethread));
1465104964Sjeff}
1466104964Sjeff
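/*
 * Preemption request for the running thread (e.g. delivered via an
 * IPI): switch away immediately unless the thread is inside a critical
 * section, in which case just record that a preemption is owed.
 */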
1467104964Sjeffvoid
1468177004Sjeffsched_preempt(struct thread *td)
1469177004Sjeff{
1470235459Srstone
1471235459Srstone	SDT_PROBE2(sched, , , surrender, td, td->td_proc);
1472177004Sjeff	thread_lock(td);
1473177004Sjeff	if (td->td_critnest > 1)
1474177004Sjeff		td->td_owepreempt = 1;
1475177004Sjeff	else
1476178272Sjeff		mi_switch(SW_INVOL | SW_PREEMPT | SWT_PREEMPT, NULL);
1477177004Sjeff	thread_unlock(td);
1478177004Sjeff}
1479177004Sjeff
1480177004Sjeffvoid
1481104964Sjeffsched_userret(struct thread *td)
1482104964Sjeff{
1483104964Sjeff	/*
1484104964Sjeff	 * XXX we cheat slightly on the locking here to avoid locking in
1485104964Sjeff	 * the usual case.  Setting td_priority here is essentially an
1486104964Sjeff	 * incomplete workaround for not setting it properly elsewhere.
1487104964Sjeff	 * Now that some interrupt handlers are threads, not setting it
1488104964Sjeff	 * properly elsewhere can clobber it in the window between setting
1489104964Sjeff	 * it here and returning to user mode, so don't waste time setting
1490104964Sjeff	 * it perfectly here.
1491104964Sjeff	 */
1492139453Sjhb	KASSERT((td->td_flags & TDF_BORROWING) == 0,
1493139453Sjhb	    ("thread with borrowed priority returning to userland"));
1494163709Sjb	if (td->td_priority != td->td_user_pri) {
1495170293Sjeff		thread_lock(td);
1496163709Sjb		td->td_priority = td->td_user_pri;
1497163709Sjb		td->td_base_pri = td->td_user_pri;
1498170293Sjeff		thread_unlock(td);
1499163709Sjb	}
1500104964Sjeff}
1501107126Sjeff
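/*
 * Bind the current thread to the given CPU.  On SMP the thread is moved
 * to that CPU's run queue and, if it is not already executing there, a
 * context switch is forced so that it resumes on the target CPU.
 *
 * A rough sketch of a typical calling pattern (details vary by caller):
 *
 *	thread_lock(curthread);
 *	sched_bind(curthread, cpu);
 *	thread_unlock(curthread);
 *	... perform per-CPU work ...
 *	thread_lock(curthread);
 *	sched_unbind(curthread);
 *	thread_unlock(curthread);
 */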
1502124955Sjeffvoid
1503124955Sjeffsched_bind(struct thread *td, int cpu)
1504124955Sjeff{
1505164936Sjulian	struct td_sched *ts;
1506124955Sjeff
1507208391Sjhb	THREAD_LOCK_ASSERT(td, MA_OWNED|MA_NOTRECURSED);
1508208391Sjhb	KASSERT(td == curthread, ("sched_bind: can only bind curthread"));
1509124955Sjeff
1510301456Skib	ts = td_get_sched(td);
1511124955Sjeff
1512177435Sjeff	td->td_flags |= TDF_BOUND;
1513124955Sjeff#ifdef SMP
1514164936Sjulian	ts->ts_runq = &runq_pcpu[cpu];
1515124955Sjeff	if (PCPU_GET(cpuid) == cpu)
1516124955Sjeff		return;
1517124955Sjeff
1518131473Sjhb	mi_switch(SW_VOL, NULL);
1519124955Sjeff#endif
1520124955Sjeff}
1521124955Sjeff
1522124955Sjeffvoid
1523124955Sjeffsched_unbind(struct thread* td)
1524124955Sjeff{
1525170293Sjeff	THREAD_LOCK_ASSERT(td, MA_OWNED);
1526208391Sjhb	KASSERT(td == curthread, ("sched_unbind: can only unbind curthread"));
1527177435Sjeff	td->td_flags &= ~TDF_BOUND;
1528124955Sjeff}
1529124955Sjeff
1530107126Sjeffint
1531145256Sjkoshysched_is_bound(struct thread *td)
1532145256Sjkoshy{
1533170293Sjeff	THREAD_LOCK_ASSERT(td, MA_OWNED);
1534177435Sjeff	return (td->td_flags & TDF_BOUND);
1535145256Sjkoshy}
1536145256Sjkoshy
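/*
 * Voluntarily give up the CPU so that another runnable thread may be
 * scheduled.
 */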
1537159630Sdavidxuvoid
1538159630Sdavidxusched_relinquish(struct thread *td)
1539159630Sdavidxu{
1540170293Sjeff	thread_lock(td);
1541178272Sjeff	mi_switch(SW_VOL | SWT_RELINQUISH, NULL);
1542170293Sjeff	thread_unlock(td);
1543159630Sdavidxu}
1544159630Sdavidxu
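/*
 * Return the global load count (sched_tdcnt), maintained by
 * sched_load_add() and sched_load_rem() for threads that do not have
 * TDF_NOLOAD set.
 */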
1545145256Sjkoshyint
1546125288Sjeffsched_load(void)
1547125288Sjeff{
1548125288Sjeff	return (sched_tdcnt);
1549125288Sjeff}
1550125288Sjeff
1551125288Sjeffint
1552107126Sjeffsched_sizeof_proc(void)
1553107126Sjeff{
1554107126Sjeff	return (sizeof(struct proc));
1555107126Sjeff}
1556159630Sdavidxu
1557107126Sjeffint
1558107126Sjeffsched_sizeof_thread(void)
1559107126Sjeff{
1560164936Sjulian	return (sizeof(struct thread) + sizeof(struct td_sched));
1561107126Sjeff}
1562107137Sjeff
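/*
 * Return the thread's recent %CPU estimate in fixed-point form.
 */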
1563107137Sjefffixpt_t
1564121127Sjeffsched_pctcpu(struct thread *td)
1565107137Sjeff{
1566164936Sjulian	struct td_sched *ts;
1567121147Sjeff
1568208787Sjhb	THREAD_LOCK_ASSERT(td, MA_OWNED);
1569301456Skib	ts = td_get_sched(td);
1570164936Sjulian	return (ts->ts_pctcpu);
1571107137Sjeff}
1572159570Sdavidxu
1573282213Strasz#ifdef RACCT
1574242139Strasz/*
1575242139Strasz * Calculates the contribution to the thread's CPU usage for the latest
1576242139Strasz * (unfinished) second.
1577242139Strasz */
1578242139Straszfixpt_t
1579242139Straszsched_pctcpu_delta(struct thread *td)
1580242139Strasz{
1581242139Strasz	struct td_sched *ts;
1582242139Strasz	fixpt_t delta;
1583242139Strasz	int realstathz;
1584242139Strasz
1585242139Strasz	THREAD_LOCK_ASSERT(td, MA_OWNED);
1586301456Skib	ts = td_get_sched(td);
1587242139Strasz	delta = 0;
1588242139Strasz	realstathz = stathz ? stathz : hz;
1589242139Strasz	if (ts->ts_cpticks != 0) {
1590242139Strasz#if	(FSHIFT >= CCPU_SHIFT)
1591242139Strasz		delta = (realstathz == 100)
1592242139Strasz		    ? ((fixpt_t) ts->ts_cpticks) <<
1593242139Strasz		    (FSHIFT - CCPU_SHIFT) :
1594242139Strasz		    100 * (((fixpt_t) ts->ts_cpticks)
1595242139Strasz		    << (FSHIFT - CCPU_SHIFT)) / realstathz;
1596242139Strasz#else
1597242139Strasz		delta = ((FSCALE - ccpu) *
1598242139Strasz		    (ts->ts_cpticks *
1599242139Strasz		    FSCALE / realstathz)) >> FSHIFT;
1600242139Strasz#endif
1601242139Strasz	}
1602242139Strasz
1603242139Strasz	return (delta);
1604242139Strasz}
1605242139Strasz#endif
1606242139Strasz
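/*
 * Return the thread's estimated CPU usage (ts_estcpu), as maintained
 * by the scheduler's decay algorithm.
 */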
1607298145Skibu_int
1608298145Skibsched_estcpu(struct thread *td)
1609159570Sdavidxu{
1610298145Skib
1611301456Skib	return (td_get_sched(td)->ts_estcpu);
1612159570Sdavidxu}
1613166188Sjeff
1614166188Sjeff/*
1615166188Sjeff * The actual idle process.
1616166188Sjeff */
1617166188Sjeffvoid
1618166188Sjeffsched_idletd(void *dummy)
1619166188Sjeff{
1620212455Smav	struct pcpuidlestat *stat;
1621166188Sjeff
1622239585Sjhb	THREAD_NO_SLEEPING();
1623212455Smav	stat = DPCPU_PTR(idlestat);
1624166188Sjeff	for (;;) {
1625166188Sjeff		mtx_assert(&Giant, MA_NOTOWNED);
1626166188Sjeff
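		/*
		 * Idle until work appears.  The argument to cpu_idle() is a
		 * hint, based on how often this CPU has recently entered
		 * idle (more than 64 recent calls), that the MD code may use
		 * to prefer a cheaper, shallower idle method.
		 */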
1627212455Smav		while (sched_runnable() == 0) {
1628212455Smav			cpu_idle(stat->idlecalls + stat->oldidlecalls > 64);
1629212455Smav			stat->idlecalls++;
1630212455Smav		}
1631166188Sjeff
1632166188Sjeff		mtx_lock_spin(&sched_lock);
1633178272Sjeff		mi_switch(SW_VOL | SWT_IDLE, NULL);
1634166188Sjeff		mtx_unlock_spin(&sched_lock);
1635166188Sjeff	}
1636166188Sjeff}
1637166188Sjeff
1638170293Sjeff/*
1639170293Sjeff * A CPU is entering for the first time or a thread is exiting.
1640170293Sjeff */
1641170293Sjeffvoid
1642170293Sjeffsched_throw(struct thread *td)
1643170293Sjeff{
1644170293Sjeff	/*
1645170293Sjeff	 * Correct spinlock nesting.  The idle thread context that we are
1646170293Sjeff	 * borrowing was created so that it would start out with a single
1647170293Sjeff	 * spin lock (sched_lock) held in fork_trampoline().  Since we've
1648170293Sjeff	 * explicitly acquired locks in this function, the nesting count
1649170293Sjeff	 * is now 2 rather than 1.  Since we are nested, calling
1650170293Sjeff	 * spinlock_exit() will simply adjust the counts without allowing
1651170293Sjeff	 * code that uses spin locks to interrupt us.
1652170293Sjeff	 */
1653170293Sjeff	if (td == NULL) {
1654170293Sjeff		mtx_lock_spin(&sched_lock);
1655170293Sjeff		spinlock_exit();
1656229429Sjhb		PCPU_SET(switchtime, cpu_ticks());
1657229429Sjhb		PCPU_SET(switchticks, ticks);
1658170293Sjeff	} else {
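		/*
		 * A thread is exiting: release its lock profiling state for
		 * sched_lock and record that it no longer occupies a CPU
		 * before we throw to the next thread.
		 */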
1659174629Sjeff		lock_profile_release_lock(&sched_lock.lock_object);
1660170293Sjeff		MPASS(td->td_lock == &sched_lock);
1661286256Sjhb		td->td_lastcpu = td->td_oncpu;
1662286256Sjhb		td->td_oncpu = NOCPU;
1663170293Sjeff	}
1664170293Sjeff	mtx_assert(&sched_lock, MA_OWNED);
1665170293Sjeff	KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count"));
1666170293Sjeff	cpu_throw(td, choosethread());	/* doesn't return */
1667170293Sjeff}
1668170293Sjeff
1669170293Sjeffvoid
1670170600Sjeffsched_fork_exit(struct thread *td)
1671170293Sjeff{
1672170293Sjeff
1673170293Sjeff	/*
1674170293Sjeff	 * Finish setting up thread glue so that it begins execution in a
1675170293Sjeff	 * non-nested critical section with sched_lock held but not recursed.
1676170293Sjeff	 */
1677170600Sjeff	td->td_oncpu = PCPU_GET(cpuid);
1678170600Sjeff	sched_lock.mtx_lock = (uintptr_t)td;
1679174629Sjeff	lock_profile_obtain_lock_success(&sched_lock.lock_object,
1680174629Sjeff	    0, 0, __FILE__, __LINE__);
1681170600Sjeff	THREAD_LOCK_ASSERT(td, MA_OWNED | MA_NOTRECURSED);
1682315838Savg
1683315838Savg	KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "running",
1684315838Savg	    "prio:%d", td->td_priority);
1685315838Savg	SDT_PROBE0(sched, , , on__cpu);
1686170293Sjeff}
1687170293Sjeff
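/*
 * Return a printable name for the thread for KTR tracing, caching it in
 * the td_sched structure; without KTR the plain thread name is returned.
 */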
1688187357Sjeffchar *
1689187357Sjeffsched_tdname(struct thread *td)
1690187357Sjeff{
1691187357Sjeff#ifdef KTR
1692187357Sjeff	struct td_sched *ts;
1693187357Sjeff
1694301456Skib	ts = td_get_sched(td);
1695187357Sjeff	if (ts->ts_name[0] == '\0')
1696187357Sjeff		snprintf(ts->ts_name, sizeof(ts->ts_name),
1697187357Sjeff		    "%s tid %d", td->td_name, td->td_tid);
1698187357Sjeff	return (ts->ts_name);
1699187357Sjeff#else
1700187357Sjeff	return (td->td_name);
1701187357Sjeff#endif
1702187357Sjeff}
1703187357Sjeff
1704232700Sjhb#ifdef KTR
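/*
 * Discard the cached KTR name so that it is rebuilt on next use, e.g.
 * after the thread has been renamed.
 */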
1705176729Sjeffvoid
1706232700Sjhbsched_clear_tdname(struct thread *td)
1707232700Sjhb{
1708232700Sjhb	struct td_sched *ts;
1709232700Sjhb
1710301456Skib	ts = td_get_sched(td);
1711232700Sjhb	ts->ts_name[0] = '\0';
1712232700Sjhb}
1713232700Sjhb#endif
1714232700Sjhb
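/*
 * React to a change in the thread's CPU affinity set: recompute the
 * TSF_AFFINITY hint and, if the thread is currently queued or running
 * somewhere it is no longer allowed to run, move it or force a
 * reschedule.
 */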
1715232700Sjhbvoid
1716176729Sjeffsched_affinity(struct thread *td)
1717176729Sjeff{
1718180923Sjhb#ifdef SMP
1719180923Sjhb	struct td_sched *ts;
1720180923Sjhb	int cpu;
1721180923Sjhb
1722180923Sjhb	THREAD_LOCK_ASSERT(td, MA_OWNED);
1723180923Sjhb
1724180923Sjhb	/*
1725180923Sjhb	 * Set the TSF_AFFINITY flag if there is at least one CPU this
1726180923Sjhb	 * thread can't run on.
1727180923Sjhb	 */
1728301456Skib	ts = td_get_sched(td);
1729180923Sjhb	ts->ts_flags &= ~TSF_AFFINITY;
1730209059Sjhb	CPU_FOREACH(cpu) {
1731180923Sjhb		if (!THREAD_CAN_SCHED(td, cpu)) {
1732180923Sjhb			ts->ts_flags |= TSF_AFFINITY;
1733180923Sjhb			break;
1734180923Sjhb		}
1735180923Sjhb	}
1736180923Sjhb
1737180923Sjhb	/*
1738180923Sjhb	 * If this thread can run on all CPUs, nothing else to do.
1739180923Sjhb	 */
1740180923Sjhb	if (!(ts->ts_flags & TSF_AFFINITY))
1741180923Sjhb		return;
1742180923Sjhb
1743180923Sjhb	/* Pinned threads and bound threads should be left alone. */
1744180923Sjhb	if (td->td_pinned != 0 || td->td_flags & TDF_BOUND)
1745180923Sjhb		return;
1746180923Sjhb
1747180923Sjhb	switch (td->td_state) {
1748180923Sjhb	case TDS_RUNQ:
1749180923Sjhb		/*
1750180923Sjhb		 * If we are on a per-CPU runqueue that is in the set,
1751180923Sjhb		 * then nothing needs to be done.
1752180923Sjhb		 */
1753180923Sjhb		if (ts->ts_runq != &runq &&
1754180923Sjhb		    THREAD_CAN_SCHED(td, ts->ts_runq - runq_pcpu))
1755180923Sjhb			return;
1756180923Sjhb
1757180923Sjhb		/* Put this thread on a valid per-CPU runqueue. */
1758180923Sjhb		sched_rem(td);
1759180923Sjhb		sched_add(td, SRQ_BORING);
1760180923Sjhb		break;
1761180923Sjhb	case TDS_RUNNING:
1762180923Sjhb		/*
1763180923Sjhb		 * See if our current CPU is in the set.  If not, force a
1764180923Sjhb		 * context switch.
1765180923Sjhb		 */
1766180923Sjhb		if (THREAD_CAN_SCHED(td, td->td_oncpu))
1767180923Sjhb			return;
1768180923Sjhb
1769180923Sjhb		td->td_flags |= TDF_NEEDRESCHED;
1770180923Sjhb		if (td != curthread)
1771210939Sjhb			ipi_cpu(td->td_oncpu, IPI_AST);
1772180923Sjhb		break;
1773180923Sjhb	default:
1774180923Sjhb		break;
1775180923Sjhb	}
1776180923Sjhb#endif
1777176729Sjeff}
1778