sched_ule.c (212153) vs. sched_ule.c (212416)
1/*-
2 * Copyright (c) 2002-2007, Jeffrey Roberson <jeff@freebsd.org>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice unmodified, this list of conditions, and the following
10 * disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
19 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27/*
28 * This file implements the ULE scheduler. ULE supports independent CPU
 29 * run queues and fine-grained locking. It has superior interactive
30 * performance under load even on uni-processor systems.
31 *
32 * etymology:
33 * ULE is the last three letters in schedule. It owes its name to a
34 * generic user created for a scheduling system by Paul Mikesell at
35 * Isilon Systems and a general lack of creativity on the part of the author.
36 */
37
38#include <sys/cdefs.h>
39__FBSDID("$FreeBSD: head/sys/kern/sched_ule.c 212153 2010-09-02 16:23:05Z mdf $");
39__FBSDID("$FreeBSD: head/sys/kern/sched_ule.c 212416 2010-09-10 13:24:47Z mav $");
40
41#include "opt_hwpmc_hooks.h"
42#include "opt_kdtrace.h"
43#include "opt_sched.h"
44
45#include <sys/param.h>
46#include <sys/systm.h>
47#include <sys/kdb.h>
48#include <sys/kernel.h>
49#include <sys/ktr.h>
50#include <sys/lock.h>
51#include <sys/mutex.h>
52#include <sys/proc.h>
53#include <sys/resource.h>
54#include <sys/resourcevar.h>
55#include <sys/sched.h>
56#include <sys/smp.h>
57#include <sys/sx.h>
58#include <sys/sysctl.h>
59#include <sys/sysproto.h>
60#include <sys/turnstile.h>
61#include <sys/umtx.h>
62#include <sys/vmmeter.h>
63#include <sys/cpuset.h>
64#include <sys/sbuf.h>
65
66#ifdef HWPMC_HOOKS
67#include <sys/pmckern.h>
68#endif
69
70#ifdef KDTRACE_HOOKS
71#include <sys/dtrace_bsd.h>
72int dtrace_vtime_active;
73dtrace_vtime_switch_func_t dtrace_vtime_switch_func;
74#endif
75
76#include <machine/cpu.h>
77#include <machine/smp.h>
78
79#if defined(__sparc64__)
80#error "This architecture is not currently compatible with ULE"
81#endif
82
83#define KTR_ULE 0
84
85#define TS_NAME_LEN (MAXCOMLEN + sizeof(" td ") + sizeof(__XSTRING(UINT_MAX)))
86#define TDQ_NAME_LEN (sizeof("sched lock ") + sizeof(__XSTRING(MAXCPU)))
87#define TDQ_LOADNAME_LEN (PCPU_NAME_LEN + sizeof(" load"))
88
89/*
90 * Thread scheduler specific section. All fields are protected
91 * by the thread lock.
92 */
93struct td_sched {
94 struct runq *ts_runq; /* Run-queue we're queued on. */
95 short ts_flags; /* TSF_* flags. */
96 u_char ts_cpu; /* CPU that we have affinity for. */
97 int ts_rltick; /* Real last tick, for affinity. */
98 int ts_slice; /* Ticks of slice remaining. */
99 u_int ts_slptime; /* Number of ticks we vol. slept */
100 u_int ts_runtime; /* Number of ticks we were running */
101 int ts_ltick; /* Last tick that we were running on */
102 int ts_incrtick; /* Last tick that we incremented on */
103 int ts_ftick; /* First tick that we were running on */
104 int ts_ticks; /* Tick count */
105#ifdef KTR
106 char ts_name[TS_NAME_LEN];
107#endif
108};
109/* flags kept in ts_flags */
110#define TSF_BOUND 0x0001 /* Thread can not migrate. */
111#define TSF_XFERABLE 0x0002 /* Thread was added as transferable. */
112
113static struct td_sched td_sched0;
114
115#define THREAD_CAN_MIGRATE(td) ((td)->td_pinned == 0)
116#define THREAD_CAN_SCHED(td, cpu) \
117 CPU_ISSET((cpu), &(td)->td_cpuset->cs_mask)
118
119/*
120 * Cpu percentage computation macros and defines.
121 *
122 * SCHED_TICK_SECS: Number of seconds to average the cpu usage across.
123 * SCHED_TICK_TARG: Number of hz ticks to average the cpu usage across.
124 * SCHED_TICK_MAX: Maximum number of ticks before scaling back.
125 * SCHED_TICK_SHIFT: Shift factor to avoid rounding away results.
126 * SCHED_TICK_HZ: Compute the number of hz ticks for a given ticks count.
127 * SCHED_TICK_TOTAL: Gives the amount of time we've been recording ticks.
128 */
129#define SCHED_TICK_SECS 10
130#define SCHED_TICK_TARG (hz * SCHED_TICK_SECS)
131#define SCHED_TICK_MAX (SCHED_TICK_TARG + hz)
132#define SCHED_TICK_SHIFT 10
133#define SCHED_TICK_HZ(ts) ((ts)->ts_ticks >> SCHED_TICK_SHIFT)
134#define SCHED_TICK_TOTAL(ts) (max((ts)->ts_ltick - (ts)->ts_ftick, hz))
135
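
/*
 * Editor's illustration (not part of sched_ule.c): a hypothetical
 * standalone helper showing how the macros above combine.  ts_ticks is
 * accumulated left-shifted by SCHED_TICK_SHIFT, so shifting it back
 * recovers plain hz ticks (SCHED_TICK_HZ), and the ltick - ftick window
 * is floored at one second worth of ticks (SCHED_TICK_TOTAL).
 */
static int
example_cpu_pct(int ts_ticks, int ts_ltick, int ts_ftick, int hz)
{
	int hz_ticks = ts_ticks >> 10;		/* SCHED_TICK_SHIFT == 10 */
	int total = ts_ltick - ts_ftick;	/* length of the window */

	if (total < hz)				/* SCHED_TICK_TOTAL() floor */
		total = hz;
	return (100 * hz_ticks / total);	/* rough %cpu over the window */
}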
136/*
137 * These macros determine priorities for non-interactive threads. They are
138 * assigned a priority based on their recent cpu utilization as expressed
139 * by the ratio of ticks to the tick total. NHALF priorities at the start
140 * and end of the MIN to MAX timeshare range are only reachable with negative
141 * or positive nice respectively.
142 *
143 * PRI_RANGE: Priority range for utilization dependent priorities.
144 * PRI_NRESV: Number of nice values.
145 * PRI_TICKS: Compute a priority in PRI_RANGE from the ticks count and total.
146 * PRI_NICE: Determines the part of the priority inherited from nice.
147 */
148#define SCHED_PRI_NRESV (PRIO_MAX - PRIO_MIN)
149#define SCHED_PRI_NHALF (SCHED_PRI_NRESV / 2)
150#define SCHED_PRI_MIN (PRI_MIN_TIMESHARE + SCHED_PRI_NHALF)
151#define SCHED_PRI_MAX (PRI_MAX_TIMESHARE - SCHED_PRI_NHALF)
152#define SCHED_PRI_RANGE (SCHED_PRI_MAX - SCHED_PRI_MIN)
153#define SCHED_PRI_TICKS(ts) \
154 (SCHED_TICK_HZ((ts)) / \
155 (roundup(SCHED_TICK_TOTAL((ts)), SCHED_PRI_RANGE) / SCHED_PRI_RANGE))
156#define SCHED_PRI_NICE(nice) (nice)
157
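
/*
 * Editor's illustration (not part of sched_ule.c): a hypothetical
 * standalone sketch of how a timeshare priority is assembled from the
 * pieces above: recent cpu ticks are scaled into SCHED_PRI_RANGE and
 * nice is added on top.  The in-kernel computation is done by
 * sched_priority(), which is outside this hunk.
 */
static int
example_timeshare_pri(int tick_hz, int tick_total, int nice,
    int pri_min, int pri_range)
{
	int score;

	/* SCHED_PRI_TICKS(): divide the ticks by ceil(total / range). */
	score = tick_hz / ((tick_total + pri_range - 1) / pri_range);
	return (pri_min + score + nice);	/* SCHED_PRI_NICE() adds nice. */
}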
158/*
159 * These determine the interactivity of a process. Interactivity differs from
160 * cpu utilization in that it expresses the voluntary time slept vs time ran
161 * while cpu utilization includes all time not running. This more accurately
162 * models the intent of the thread.
163 *
164 * SLP_RUN_MAX: Maximum amount of sleep time + run time we'll accumulate
165 * before throttling back.
166 * SLP_RUN_FORK: Maximum slp+run time to inherit at fork time.
167 * INTERACT_MAX: Maximum interactivity value. Smaller is better.
 168 * INTERACT_THRESH: Threshold for placement on the current runq.
169 */
170#define SCHED_SLP_RUN_MAX ((hz * 5) << SCHED_TICK_SHIFT)
171#define SCHED_SLP_RUN_FORK ((hz / 2) << SCHED_TICK_SHIFT)
172#define SCHED_INTERACT_MAX (100)
173#define SCHED_INTERACT_HALF (SCHED_INTERACT_MAX / 2)
174#define SCHED_INTERACT_THRESH (30)
175
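
/*
 * Editor's illustration (not part of sched_ule.c): a simplified,
 * hypothetical sketch of the 0..SCHED_INTERACT_MAX scoring the comment
 * above describes: threads that mostly sleep score well below
 * SCHED_INTERACT_HALF, cpu hogs approach SCHED_INTERACT_MAX, and scores
 * under SCHED_INTERACT_THRESH are treated as interactive.  The real
 * scoring is done by sched_interact_score(), outside this hunk.
 */
static int
example_interact_score(int slptime, int runtime)
{
	if (slptime >= runtime)		/* sleeps at least as much as it runs */
		return (50 * runtime / (slptime + 1));
	return (100 - 50 * slptime / (runtime + 1));
}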
176/*
177 * tickincr: Converts a stathz tick into a hz domain scaled by
178 * the shift factor. Without the shift the error rate
179 * due to rounding would be unacceptably high.
 180 * realstathz: stathz is sometimes 0; we run off of hz in that case.
181 * sched_slice: Runtime of each thread before rescheduling.
182 * preempt_thresh: Priority threshold for preemption and remote IPIs.
183 */
184static int sched_interact = SCHED_INTERACT_THRESH;
185static int realstathz;
186static int tickincr;
187static int sched_slice = 1;
188#ifdef PREEMPTION
189#ifdef FULL_PREEMPTION
190static int preempt_thresh = PRI_MAX_IDLE;
191#else
192static int preempt_thresh = PRI_MIN_KERN;
193#endif
194#else
195static int preempt_thresh = 0;
196#endif
197static int static_boost = PRI_MIN_TIMESHARE;
198static int sched_idlespins = 10000;
199static int sched_idlespinthresh = 4;
199static int sched_idlespinthresh = 64;
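
/*
 * Editor's illustration (not part of sched_ule.c): tickincr above is the
 * per-stathz-tick increment in the shifted hz domain, roughly
 * (hz << SCHED_TICK_SHIFT) / stathz as computed in sched_initticks()
 * (outside this hunk).  A hypothetical standalone sketch:
 */
static int
example_tickincr(int hz, int stathz)
{
	int realstathz = stathz ? stathz : hz;	/* stathz is sometimes 0 */
	int incr = (hz << 10) / realstathz;	/* SCHED_TICK_SHIFT == 10 */

	return (incr > 0 ? incr : 1);		/* never round down to zero */
}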
200
201/*
202 * tdq - per processor runqs and statistics. All fields are protected by the
 203 * tdq_lock. The load and lowpri may be accessed without it to avoid excess
 204 * locking in sched_pickcpu().
205 */
206struct tdq {
207 /* Ordered to improve efficiency of cpu_search() and switch(). */
208 struct mtx tdq_lock; /* run queue lock. */
209 struct cpu_group *tdq_cg; /* Pointer to cpu topology. */
210 volatile int tdq_load; /* Aggregate load. */
211 volatile int tdq_cpu_idle; /* cpu_idle() is active. */
211 int tdq_sysload; /* For loadavg, !ITHD load. */
212 int tdq_transferable; /* Transferable thread count. */
213 short tdq_switchcnt; /* Switches this tick. */
214 short tdq_oldswitchcnt; /* Switches last tick. */
215 u_char tdq_lowpri; /* Lowest priority thread. */
216 u_char tdq_ipipending; /* IPI pending. */
217 u_char tdq_idx; /* Current insert index. */
218 u_char tdq_ridx; /* Current removal index. */
219 struct runq tdq_realtime; /* real-time run queue. */
220 struct runq tdq_timeshare; /* timeshare run queue. */
221 struct runq tdq_idle; /* Queue of IDLE threads. */
222 char tdq_name[TDQ_NAME_LEN];
223#ifdef KTR
224 char tdq_loadname[TDQ_LOADNAME_LEN];
225#endif
226} __aligned(64);
227
228/* Idle thread states and config. */
229#define TDQ_RUNNING 1
230#define TDQ_IDLE 2
231
232#ifdef SMP
233struct cpu_group *cpu_top; /* CPU topology */
234
235#define SCHED_AFFINITY_DEFAULT (max(1, hz / 1000))
236#define SCHED_AFFINITY(ts, t) ((ts)->ts_rltick > ticks - ((t) * affinity))
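/*
 * Editor's note: SCHED_AFFINITY(ts, t) is true when the thread last ran
 * (ts_rltick) within the past t * affinity ticks, i.e. its cache state
 * on that cpu is presumed still warm.
 */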
237
238/*
239 * Run-time tunables.
240 */
241static int rebalance = 1;
242static int balance_interval = 128; /* Default set in sched_initticks(). */
243static int affinity;
244static int steal_htt = 1;
245static int steal_idle = 1;
246static int steal_thresh = 2;
247
248/*
249 * One thread queue per processor.
250 */
251static struct tdq tdq_cpu[MAXCPU];
252static struct tdq *balance_tdq;
253static int balance_ticks;
254
255#define TDQ_SELF() (&tdq_cpu[PCPU_GET(cpuid)])
256#define TDQ_CPU(x) (&tdq_cpu[(x)])
257#define TDQ_ID(x) ((int)((x) - tdq_cpu))
258#else /* !SMP */
259static struct tdq tdq_cpu;
260
261#define TDQ_ID(x) (0)
262#define TDQ_SELF() (&tdq_cpu)
263#define TDQ_CPU(x) (&tdq_cpu)
264#endif
265
266#define TDQ_LOCK_ASSERT(t, type) mtx_assert(TDQ_LOCKPTR((t)), (type))
267#define TDQ_LOCK(t) mtx_lock_spin(TDQ_LOCKPTR((t)))
268#define TDQ_LOCK_FLAGS(t, f) mtx_lock_spin_flags(TDQ_LOCKPTR((t)), (f))
269#define TDQ_UNLOCK(t) mtx_unlock_spin(TDQ_LOCKPTR((t)))
270#define TDQ_LOCKPTR(t) (&(t)->tdq_lock)
271
272static void sched_priority(struct thread *);
273static void sched_thread_priority(struct thread *, u_char);
274static int sched_interact_score(struct thread *);
275static void sched_interact_update(struct thread *);
276static void sched_interact_fork(struct thread *);
277static void sched_pctcpu_update(struct td_sched *);
278
279/* Operations on per processor queues */
280static struct thread *tdq_choose(struct tdq *);
281static void tdq_setup(struct tdq *);
282static void tdq_load_add(struct tdq *, struct thread *);
283static void tdq_load_rem(struct tdq *, struct thread *);
284static __inline void tdq_runq_add(struct tdq *, struct thread *, int);
285static __inline void tdq_runq_rem(struct tdq *, struct thread *);
286static inline int sched_shouldpreempt(int, int, int);
287void tdq_print(int cpu);
288static void runq_print(struct runq *rq);
289static void tdq_add(struct tdq *, struct thread *, int);
290#ifdef SMP
291static int tdq_move(struct tdq *, struct tdq *);
292static int tdq_idled(struct tdq *);
293static void tdq_notify(struct tdq *, struct thread *);
294static struct thread *tdq_steal(struct tdq *, int);
295static struct thread *runq_steal(struct runq *, int);
296static int sched_pickcpu(struct thread *, int);
297static void sched_balance(void);
298static int sched_balance_pair(struct tdq *, struct tdq *);
299static inline struct tdq *sched_setcpu(struct thread *, int, int);
300static inline void thread_unblock_switch(struct thread *, struct mtx *);
301static struct mtx *sched_switch_migrate(struct tdq *, struct thread *, int);
302static int sysctl_kern_sched_topology_spec(SYSCTL_HANDLER_ARGS);
303static int sysctl_kern_sched_topology_spec_internal(struct sbuf *sb,
304 struct cpu_group *cg, int indent);
305#endif
306
307static void sched_setup(void *dummy);
308SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL);
309
310static void sched_initticks(void *dummy);
311SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks,
312 NULL);
313
314/*
315 * Print the threads waiting on a run-queue.
316 */
317static void
318runq_print(struct runq *rq)
319{
320 struct rqhead *rqh;
321 struct thread *td;
322 int pri;
323 int j;
324 int i;
325
326 for (i = 0; i < RQB_LEN; i++) {
327 printf("\t\trunq bits %d 0x%zx\n",
328 i, rq->rq_status.rqb_bits[i]);
329 for (j = 0; j < RQB_BPW; j++)
330 if (rq->rq_status.rqb_bits[i] & (1ul << j)) {
331 pri = j + (i << RQB_L2BPW);
332 rqh = &rq->rq_queues[pri];
333 TAILQ_FOREACH(td, rqh, td_runq) {
334 printf("\t\t\ttd %p(%s) priority %d rqindex %d pri %d\n",
335 td, td->td_name, td->td_priority,
336 td->td_rqindex, pri);
337 }
338 }
339 }
340}
341
342/*
343 * Print the status of a per-cpu thread queue. Should be a ddb show cmd.
344 */
345void
346tdq_print(int cpu)
347{
348 struct tdq *tdq;
349
350 tdq = TDQ_CPU(cpu);
351
352 printf("tdq %d:\n", TDQ_ID(tdq));
353 printf("\tlock %p\n", TDQ_LOCKPTR(tdq));
354 printf("\tLock name: %s\n", tdq->tdq_name);
355 printf("\tload: %d\n", tdq->tdq_load);
356 printf("\tswitch cnt: %d\n", tdq->tdq_switchcnt);
357 printf("\told switch cnt: %d\n", tdq->tdq_oldswitchcnt);
358 printf("\ttimeshare idx: %d\n", tdq->tdq_idx);
359 printf("\ttimeshare ridx: %d\n", tdq->tdq_ridx);
360 printf("\tload transferable: %d\n", tdq->tdq_transferable);
361 printf("\tlowest priority: %d\n", tdq->tdq_lowpri);
362 printf("\trealtime runq:\n");
363 runq_print(&tdq->tdq_realtime);
364 printf("\ttimeshare runq:\n");
365 runq_print(&tdq->tdq_timeshare);
366 printf("\tidle runq:\n");
367 runq_print(&tdq->tdq_idle);
368}
369
370static inline int
371sched_shouldpreempt(int pri, int cpri, int remote)
372{
373 /*
374 * If the new priority is not better than the current priority there is
375 * nothing to do.
376 */
377 if (pri >= cpri)
378 return (0);
379 /*
380 * Always preempt idle.
381 */
382 if (cpri >= PRI_MIN_IDLE)
383 return (1);
384 /*
385 * If preemption is disabled don't preempt others.
386 */
387 if (preempt_thresh == 0)
388 return (0);
389 /*
390 * Preempt if we exceed the threshold.
391 */
392 if (pri <= preempt_thresh)
393 return (1);
394 /*
395 * If we're realtime or better and there is timeshare or worse running
396 * preempt only remote processors.
397 */
398 if (remote && pri <= PRI_MAX_REALTIME && cpri > PRI_MAX_REALTIME)
399 return (1);
400 return (0);
401}
402
403#define TS_RQ_PPQ (((PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE) + 1) / RQ_NQS)
404/*
405 * Add a thread to the actual run-queue. Keeps transferable counts up to
406 * date with what is actually on the run-queue. Selects the correct
407 * queue position for timeshare threads.
408 */
409static __inline void
410tdq_runq_add(struct tdq *tdq, struct thread *td, int flags)
411{
412 struct td_sched *ts;
413 u_char pri;
414
415 TDQ_LOCK_ASSERT(tdq, MA_OWNED);
416 THREAD_LOCK_ASSERT(td, MA_OWNED);
417
418 pri = td->td_priority;
419 ts = td->td_sched;
420 TD_SET_RUNQ(td);
421 if (THREAD_CAN_MIGRATE(td)) {
422 tdq->tdq_transferable++;
423 ts->ts_flags |= TSF_XFERABLE;
424 }
425 if (pri <= PRI_MAX_REALTIME) {
426 ts->ts_runq = &tdq->tdq_realtime;
427 } else if (pri <= PRI_MAX_TIMESHARE) {
428 ts->ts_runq = &tdq->tdq_timeshare;
429 KASSERT(pri <= PRI_MAX_TIMESHARE && pri >= PRI_MIN_TIMESHARE,
430 ("Invalid priority %d on timeshare runq", pri));
431 /*
432 * This queue contains only priorities between MIN and MAX
 433 * timeshare. Use the whole queue to represent these values.
434 */
435 if ((flags & (SRQ_BORROWING|SRQ_PREEMPTED)) == 0) {
436 pri = (pri - PRI_MIN_TIMESHARE) / TS_RQ_PPQ;
437 pri = (pri + tdq->tdq_idx) % RQ_NQS;
438 /*
439 * This effectively shortens the queue by one so we
440 * can have a one slot difference between idx and
441 * ridx while we wait for threads to drain.
442 */
443 if (tdq->tdq_ridx != tdq->tdq_idx &&
444 pri == tdq->tdq_ridx)
445 pri = (unsigned char)(pri - 1) % RQ_NQS;
446 } else
447 pri = tdq->tdq_ridx;
448 runq_add_pri(ts->ts_runq, td, pri, flags);
449 return;
450 } else
451 ts->ts_runq = &tdq->tdq_idle;
452 runq_add(ts->ts_runq, td, flags);
453}
454
455/*
456 * Remove a thread from a run-queue. This typically happens when a thread
457 * is selected to run. Running threads are not on the queue and the
458 * transferable count does not reflect them.
459 */
460static __inline void
461tdq_runq_rem(struct tdq *tdq, struct thread *td)
462{
463 struct td_sched *ts;
464
465 ts = td->td_sched;
466 TDQ_LOCK_ASSERT(tdq, MA_OWNED);
467 KASSERT(ts->ts_runq != NULL,
468 ("tdq_runq_remove: thread %p null ts_runq", td));
469 if (ts->ts_flags & TSF_XFERABLE) {
470 tdq->tdq_transferable--;
471 ts->ts_flags &= ~TSF_XFERABLE;
472 }
473 if (ts->ts_runq == &tdq->tdq_timeshare) {
474 if (tdq->tdq_idx != tdq->tdq_ridx)
475 runq_remove_idx(ts->ts_runq, td, &tdq->tdq_ridx);
476 else
477 runq_remove_idx(ts->ts_runq, td, NULL);
478 } else
479 runq_remove(ts->ts_runq, td);
480}
481
482/*
483 * Load is maintained for all threads RUNNING and ON_RUNQ. Add the load
484 * for this thread to the referenced thread queue.
485 */
486static void
487tdq_load_add(struct tdq *tdq, struct thread *td)
488{
489
490 TDQ_LOCK_ASSERT(tdq, MA_OWNED);
491 THREAD_LOCK_ASSERT(td, MA_OWNED);
492
493 tdq->tdq_load++;
494 if ((td->td_flags & TDF_NOLOAD) == 0)
495 tdq->tdq_sysload++;
496 KTR_COUNTER0(KTR_SCHED, "load", tdq->tdq_loadname, tdq->tdq_load);
497}
498
499/*
500 * Remove the load from a thread that is transitioning to a sleep state or
501 * exiting.
502 */
503static void
504tdq_load_rem(struct tdq *tdq, struct thread *td)
505{
506
507 THREAD_LOCK_ASSERT(td, MA_OWNED);
508 TDQ_LOCK_ASSERT(tdq, MA_OWNED);
509 KASSERT(tdq->tdq_load != 0,
510 ("tdq_load_rem: Removing with 0 load on queue %d", TDQ_ID(tdq)));
511
512 tdq->tdq_load--;
513 if ((td->td_flags & TDF_NOLOAD) == 0)
514 tdq->tdq_sysload--;
515 KTR_COUNTER0(KTR_SCHED, "load", tdq->tdq_loadname, tdq->tdq_load);
516}
517
518/*
519 * Set lowpri to its exact value by searching the run-queue and
520 * evaluating curthread. curthread may be passed as an optimization.
521 */
522static void
523tdq_setlowpri(struct tdq *tdq, struct thread *ctd)
524{
525 struct thread *td;
526
527 TDQ_LOCK_ASSERT(tdq, MA_OWNED);
528 if (ctd == NULL)
529 ctd = pcpu_find(TDQ_ID(tdq))->pc_curthread;
530 td = tdq_choose(tdq);
531 if (td == NULL || td->td_priority > ctd->td_priority)
532 tdq->tdq_lowpri = ctd->td_priority;
533 else
534 tdq->tdq_lowpri = td->td_priority;
535}
536
537#ifdef SMP
538struct cpu_search {
539 cpuset_t cs_mask;
540 u_int cs_load;
541 u_int cs_cpu;
542 int cs_limit; /* Min priority for low min load for high. */
543};
544
545#define CPU_SEARCH_LOWEST 0x1
546#define CPU_SEARCH_HIGHEST 0x2
547#define CPU_SEARCH_BOTH (CPU_SEARCH_LOWEST|CPU_SEARCH_HIGHEST)
548
549#define CPUSET_FOREACH(cpu, mask) \
550 for ((cpu) = 0; (cpu) <= mp_maxid; (cpu)++) \
551 if ((mask) & 1 << (cpu))
552
553static __inline int cpu_search(struct cpu_group *cg, struct cpu_search *low,
554 struct cpu_search *high, const int match);
555int cpu_search_lowest(struct cpu_group *cg, struct cpu_search *low);
556int cpu_search_highest(struct cpu_group *cg, struct cpu_search *high);
557int cpu_search_both(struct cpu_group *cg, struct cpu_search *low,
558 struct cpu_search *high);
559
560/*
561 * This routine compares according to the match argument and should be
562 * reduced in actual instantiations via constant propagation and dead code
563 * elimination.
564 */
565static __inline int
566cpu_compare(int cpu, struct cpu_search *low, struct cpu_search *high,
567 const int match)
568{
569 struct tdq *tdq;
570
571 tdq = TDQ_CPU(cpu);
572 if (match & CPU_SEARCH_LOWEST)
573 if (CPU_ISSET(cpu, &low->cs_mask) &&
574 tdq->tdq_load < low->cs_load &&
575 tdq->tdq_lowpri > low->cs_limit) {
576 low->cs_cpu = cpu;
577 low->cs_load = tdq->tdq_load;
578 }
579 if (match & CPU_SEARCH_HIGHEST)
580 if (CPU_ISSET(cpu, &high->cs_mask) &&
581 tdq->tdq_load >= high->cs_limit &&
582 tdq->tdq_load > high->cs_load &&
583 tdq->tdq_transferable) {
584 high->cs_cpu = cpu;
585 high->cs_load = tdq->tdq_load;
586 }
587 return (tdq->tdq_load);
588}
589
590/*
591 * Search the tree of cpu_groups for the lowest or highest loaded cpu
592 * according to the match argument. This routine actually compares the
593 * load on all paths through the tree and finds the least loaded cpu on
594 * the least loaded path, which may differ from the least loaded cpu in
595 * the system. This balances work among caches and busses.
596 *
597 * This inline is instantiated in three forms below using constants for the
598 * match argument. It is reduced to the minimum set for each case. It is
599 * also recursive to the depth of the tree.
600 */
601static __inline int
602cpu_search(struct cpu_group *cg, struct cpu_search *low,
603 struct cpu_search *high, const int match)
604{
605 int total;
606
607 total = 0;
608 if (cg->cg_children) {
609 struct cpu_search lgroup;
610 struct cpu_search hgroup;
611 struct cpu_group *child;
612 u_int lload;
613 int hload;
614 int load;
615 int i;
616
617 lload = -1;
618 hload = -1;
619 for (i = 0; i < cg->cg_children; i++) {
620 child = &cg->cg_child[i];
621 if (match & CPU_SEARCH_LOWEST) {
622 lgroup = *low;
623 lgroup.cs_load = -1;
624 }
625 if (match & CPU_SEARCH_HIGHEST) {
626 hgroup = *high;
627 lgroup.cs_load = 0;
628 }
629 switch (match) {
630 case CPU_SEARCH_LOWEST:
631 load = cpu_search_lowest(child, &lgroup);
632 break;
633 case CPU_SEARCH_HIGHEST:
634 load = cpu_search_highest(child, &hgroup);
635 break;
636 case CPU_SEARCH_BOTH:
637 load = cpu_search_both(child, &lgroup, &hgroup);
638 break;
639 }
640 total += load;
641 if (match & CPU_SEARCH_LOWEST)
642 if (load < lload || low->cs_cpu == -1) {
643 *low = lgroup;
644 lload = load;
645 }
646 if (match & CPU_SEARCH_HIGHEST)
647 if (load > hload || high->cs_cpu == -1) {
648 hload = load;
649 *high = hgroup;
650 }
651 }
652 } else {
653 int cpu;
654
655 CPUSET_FOREACH(cpu, cg->cg_mask)
656 total += cpu_compare(cpu, low, high, match);
657 }
658 return (total);
659}
660
661/*
662 * cpu_search instantiations must pass constants to maintain the inline
663 * optimization.
664 */
665int
666cpu_search_lowest(struct cpu_group *cg, struct cpu_search *low)
667{
668 return cpu_search(cg, low, NULL, CPU_SEARCH_LOWEST);
669}
670
671int
672cpu_search_highest(struct cpu_group *cg, struct cpu_search *high)
673{
674 return cpu_search(cg, NULL, high, CPU_SEARCH_HIGHEST);
675}
676
677int
678cpu_search_both(struct cpu_group *cg, struct cpu_search *low,
679 struct cpu_search *high)
680{
681 return cpu_search(cg, low, high, CPU_SEARCH_BOTH);
682}
683
684/*
685 * Find the cpu with the least load via the least loaded path that has a
 686 * lowpri greater than pri. A pri of -1 indicates any priority is
687 * acceptable.
688 */
689static inline int
690sched_lowest(struct cpu_group *cg, cpuset_t mask, int pri)
691{
692 struct cpu_search low;
693
694 low.cs_cpu = -1;
695 low.cs_load = -1;
696 low.cs_mask = mask;
697 low.cs_limit = pri;
698 cpu_search_lowest(cg, &low);
699 return low.cs_cpu;
700}
701
702/*
703 * Find the cpu with the highest load via the highest loaded path.
704 */
705static inline int
706sched_highest(struct cpu_group *cg, cpuset_t mask, int minload)
707{
708 struct cpu_search high;
709
710 high.cs_cpu = -1;
711 high.cs_load = 0;
712 high.cs_mask = mask;
713 high.cs_limit = minload;
714 cpu_search_highest(cg, &high);
715 return high.cs_cpu;
716}
717
718/*
719 * Simultaneously find the highest and lowest loaded cpu reachable via
720 * cg.
721 */
722static inline void
723sched_both(struct cpu_group *cg, cpuset_t mask, int *lowcpu, int *highcpu)
724{
725 struct cpu_search high;
726 struct cpu_search low;
727
728 low.cs_cpu = -1;
729 low.cs_limit = -1;
730 low.cs_load = -1;
731 low.cs_mask = mask;
732 high.cs_load = 0;
733 high.cs_cpu = -1;
734 high.cs_limit = -1;
735 high.cs_mask = mask;
736 cpu_search_both(cg, &low, &high);
737 *lowcpu = low.cs_cpu;
738 *highcpu = high.cs_cpu;
739 return;
740}
741
742static void
743sched_balance_group(struct cpu_group *cg)
744{
745 cpuset_t mask;
746 int high;
747 int low;
748 int i;
749
750 CPU_FILL(&mask);
751 for (;;) {
752 sched_both(cg, mask, &low, &high);
753 if (low == high || low == -1 || high == -1)
754 break;
755 if (sched_balance_pair(TDQ_CPU(high), TDQ_CPU(low)))
756 break;
757 /*
758 * If we failed to move any threads determine which cpu
759 * to kick out of the set and try again.
760 */
761 if (TDQ_CPU(high)->tdq_transferable == 0)
762 CPU_CLR(high, &mask);
763 else
764 CPU_CLR(low, &mask);
765 }
766
767 for (i = 0; i < cg->cg_children; i++)
768 sched_balance_group(&cg->cg_child[i]);
769}
770
771static void
772sched_balance(void)
773{
774 struct tdq *tdq;
775
776 /*
777 * Select a random time between .5 * balance_interval and
778 * 1.5 * balance_interval.
779 */
780 balance_ticks = max(balance_interval / 2, 1);
781 balance_ticks += random() % balance_interval;
782 if (smp_started == 0 || rebalance == 0)
783 return;
784 tdq = TDQ_SELF();
785 TDQ_UNLOCK(tdq);
786 sched_balance_group(cpu_top);
787 TDQ_LOCK(tdq);
788}
789
790/*
791 * Lock two thread queues using their address to maintain lock order.
792 */
793static void
794tdq_lock_pair(struct tdq *one, struct tdq *two)
795{
796 if (one < two) {
797 TDQ_LOCK(one);
798 TDQ_LOCK_FLAGS(two, MTX_DUPOK);
799 } else {
800 TDQ_LOCK(two);
801 TDQ_LOCK_FLAGS(one, MTX_DUPOK);
802 }
803}
804
805/*
806 * Unlock two thread queues. Order is not important here.
807 */
808static void
809tdq_unlock_pair(struct tdq *one, struct tdq *two)
810{
811 TDQ_UNLOCK(one);
812 TDQ_UNLOCK(two);
813}
814
815/*
816 * Transfer load between two imbalanced thread queues.
817 */
818static int
819sched_balance_pair(struct tdq *high, struct tdq *low)
820{
821 int transferable;
822 int high_load;
823 int low_load;
824 int moved;
825 int move;
826 int diff;
827 int i;
828
829 tdq_lock_pair(high, low);
830 transferable = high->tdq_transferable;
831 high_load = high->tdq_load;
832 low_load = low->tdq_load;
833 moved = 0;
834 /*
835 * Determine what the imbalance is and then adjust that to how many
836 * threads we actually have to give up (transferable).
837 */
838 if (transferable != 0) {
839 diff = high_load - low_load;
840 move = diff / 2;
841 if (diff & 0x1)
842 move++;
843 move = min(move, transferable);
844 for (i = 0; i < move; i++)
845 moved += tdq_move(high, low);
846 /*
847 * IPI the target cpu to force it to reschedule with the new
848 * workload.
849 */
850 ipi_cpu(TDQ_ID(low), IPI_PREEMPT);
851 }
852 tdq_unlock_pair(high, low);
853 return (moved);
854}
855
856/*
857 * Move a thread from one thread queue to another.
858 */
859static int
860tdq_move(struct tdq *from, struct tdq *to)
861{
862 struct td_sched *ts;
863 struct thread *td;
864 struct tdq *tdq;
865 int cpu;
866
867 TDQ_LOCK_ASSERT(from, MA_OWNED);
868 TDQ_LOCK_ASSERT(to, MA_OWNED);
869
870 tdq = from;
871 cpu = TDQ_ID(to);
872 td = tdq_steal(tdq, cpu);
873 if (td == NULL)
874 return (0);
875 ts = td->td_sched;
876 /*
877 * Although the run queue is locked the thread may be blocked. Lock
878 * it to clear this and acquire the run-queue lock.
879 */
880 thread_lock(td);
881 /* Drop recursive lock on from acquired via thread_lock(). */
882 TDQ_UNLOCK(from);
883 sched_rem(td);
884 ts->ts_cpu = cpu;
885 td->td_lock = TDQ_LOCKPTR(to);
886 tdq_add(to, td, SRQ_YIELDING);
887 return (1);
888}
889
890/*
891 * This tdq has idled. Try to steal a thread from another cpu and switch
892 * to it.
893 */
894static int
895tdq_idled(struct tdq *tdq)
896{
897 struct cpu_group *cg;
898 struct tdq *steal;
899 cpuset_t mask;
900 int thresh;
901 int cpu;
902
903 if (smp_started == 0 || steal_idle == 0)
904 return (1);
905 CPU_FILL(&mask);
906 CPU_CLR(PCPU_GET(cpuid), &mask);
907 /* We don't want to be preempted while we're iterating. */
908 spinlock_enter();
909 for (cg = tdq->tdq_cg; cg != NULL; ) {
910 if ((cg->cg_flags & CG_FLAG_THREAD) == 0)
911 thresh = steal_thresh;
912 else
913 thresh = 1;
914 cpu = sched_highest(cg, mask, thresh);
915 if (cpu == -1) {
916 cg = cg->cg_parent;
917 continue;
918 }
919 steal = TDQ_CPU(cpu);
920 CPU_CLR(cpu, &mask);
921 tdq_lock_pair(tdq, steal);
922 if (steal->tdq_load < thresh || steal->tdq_transferable == 0) {
923 tdq_unlock_pair(tdq, steal);
924 continue;
925 }
926 /*
927 * If a thread was added while interrupts were disabled don't
928 * steal one here. If we fail to acquire one due to affinity
929 * restrictions loop again with this cpu removed from the
930 * set.
931 */
932 if (tdq->tdq_load == 0 && tdq_move(steal, tdq) == 0) {
933 tdq_unlock_pair(tdq, steal);
934 continue;
935 }
936 spinlock_exit();
937 TDQ_UNLOCK(steal);
938 mi_switch(SW_VOL | SWT_IDLE, NULL);
939 thread_unlock(curthread);
940
941 return (0);
942 }
943 spinlock_exit();
944 return (1);
945}
946
947/*
948 * Notify a remote cpu of new work. Sends an IPI if criteria are met.
949 */
950static void
951tdq_notify(struct tdq *tdq, struct thread *td)
952{
953 struct thread *ctd;
954 int pri;
955 int cpu;
956
957 if (tdq->tdq_ipipending)
958 return;
959 cpu = td->td_sched->ts_cpu;
960 pri = td->td_priority;
961 ctd = pcpu_find(cpu)->pc_curthread;
962 if (!sched_shouldpreempt(pri, ctd->td_priority, 1))
963 return;
964 if (TD_IS_IDLETHREAD(ctd)) {
965 /*
966 * If the MD code has an idle wakeup routine try that before
967 * falling back to IPI.
968 */
969 if (cpu_idle_wakeup(cpu))
970 if (!tdq->tdq_cpu_idle || cpu_idle_wakeup(cpu))
970 return;
971 }
972 tdq->tdq_ipipending = 1;
973 ipi_cpu(cpu, IPI_PREEMPT);
974}
975
976/*
977 * Steals load from a timeshare queue. Honors the rotating queue head
978 * index.
979 */
980static struct thread *
981runq_steal_from(struct runq *rq, int cpu, u_char start)
982{
983 struct rqbits *rqb;
984 struct rqhead *rqh;
985 struct thread *td;
986 int first;
987 int bit;
988 int pri;
989 int i;
990
991 rqb = &rq->rq_status;
992 bit = start & (RQB_BPW -1);
993 pri = 0;
994 first = 0;
995again:
996 for (i = RQB_WORD(start); i < RQB_LEN; bit = 0, i++) {
997 if (rqb->rqb_bits[i] == 0)
998 continue;
999 if (bit != 0) {
1000 for (pri = bit; pri < RQB_BPW; pri++)
1001 if (rqb->rqb_bits[i] & (1ul << pri))
1002 break;
1003 if (pri >= RQB_BPW)
1004 continue;
1005 } else
1006 pri = RQB_FFS(rqb->rqb_bits[i]);
1007 pri += (i << RQB_L2BPW);
1008 rqh = &rq->rq_queues[pri];
1009 TAILQ_FOREACH(td, rqh, td_runq) {
1010 if (first && THREAD_CAN_MIGRATE(td) &&
1011 THREAD_CAN_SCHED(td, cpu))
1012 return (td);
1013 first = 1;
1014 }
1015 }
1016 if (start != 0) {
1017 start = 0;
1018 goto again;
1019 }
1020
1021 return (NULL);
1022}
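/*
 * Illustration of the scan above (a sketch, not tied to a particular
 * word size): the loop starts at word RQB_WORD(start) and bit
 * start & (RQB_BPW - 1), walks to the end of the bitmap, and then
 * restarts from word 0 so the queues that precede the rotating head
 * are covered too. Note that the very first candidate examined is
 * never returned (first is still 0), so the thread at the front,
 * which would run next on that queue, stays put.
 */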
1023
1024/*
1025 * Steals load from a standard linear queue.
1026 */
1027static struct thread *
1028runq_steal(struct runq *rq, int cpu)
1029{
1030 struct rqhead *rqh;
1031 struct rqbits *rqb;
1032 struct thread *td;
1033 int word;
1034 int bit;
1035
1036 rqb = &rq->rq_status;
1037 for (word = 0; word < RQB_LEN; word++) {
1038 if (rqb->rqb_bits[word] == 0)
1039 continue;
1040 for (bit = 0; bit < RQB_BPW; bit++) {
1041 if ((rqb->rqb_bits[word] & (1ul << bit)) == 0)
1042 continue;
1043 rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)];
1044 TAILQ_FOREACH(td, rqh, td_runq)
1045 if (THREAD_CAN_MIGRATE(td) &&
1046 THREAD_CAN_SCHED(td, cpu))
1047 return (td);
1048 }
1049 }
1050 return (NULL);
1051}
1052
1053/*
1054 * Attempt to steal a thread in priority order from a thread queue.
1055 */
1056static struct thread *
1057tdq_steal(struct tdq *tdq, int cpu)
1058{
1059 struct thread *td;
1060
1061 TDQ_LOCK_ASSERT(tdq, MA_OWNED);
1062 if ((td = runq_steal(&tdq->tdq_realtime, cpu)) != NULL)
1063 return (td);
1064 if ((td = runq_steal_from(&tdq->tdq_timeshare,
1065 cpu, tdq->tdq_ridx)) != NULL)
1066 return (td);
1067 return (runq_steal(&tdq->tdq_idle, cpu));
1068}
1069
1070/*
1071 * Sets the thread lock and ts_cpu to match the requested cpu. Unlocks the
1072 * current lock and returns with the assigned queue locked.
1073 */
1074static inline struct tdq *
1075sched_setcpu(struct thread *td, int cpu, int flags)
1076{
1077
1078 struct tdq *tdq;
1079
1080 THREAD_LOCK_ASSERT(td, MA_OWNED);
1081 tdq = TDQ_CPU(cpu);
1082 td->td_sched->ts_cpu = cpu;
1083 /*
1084 * If the lock matches just return the queue.
1085 */
1086 if (td->td_lock == TDQ_LOCKPTR(tdq))
1087 return (tdq);
1088#ifdef notyet
1089 /*
1090 * If the thread isn't running its lockptr is a
1091 * turnstile or a sleepqueue. We can just lock_set without
1092 * blocking.
1093 */
1094 if (TD_CAN_RUN(td)) {
1095 TDQ_LOCK(tdq);
1096 thread_lock_set(td, TDQ_LOCKPTR(tdq));
1097 return (tdq);
1098 }
1099#endif
1100 /*
1101	 * The hard case, migration: we need to block the thread first to
1102	 * prevent order reversals with other cpus' locks.
1103 */
1104 spinlock_enter();
1105 thread_lock_block(td);
1106 TDQ_LOCK(tdq);
1107 thread_lock_unblock(td, TDQ_LOCKPTR(tdq));
1108 spinlock_exit();
1109 return (tdq);
1110}
1111
1112SCHED_STAT_DEFINE(pickcpu_intrbind, "Soft interrupt binding");
1113SCHED_STAT_DEFINE(pickcpu_idle_affinity, "Picked idle cpu based on affinity");
1114SCHED_STAT_DEFINE(pickcpu_affinity, "Picked cpu based on affinity");
1115SCHED_STAT_DEFINE(pickcpu_lowest, "Selected lowest load");
1116SCHED_STAT_DEFINE(pickcpu_local, "Migrated to current cpu");
1117SCHED_STAT_DEFINE(pickcpu_migration, "Selection may have caused migration");
1118
1119static int
1120sched_pickcpu(struct thread *td, int flags)
1121{
1122 struct cpu_group *cg;
1123 struct td_sched *ts;
1124 struct tdq *tdq;
1125 cpuset_t mask;
1126 int self;
1127 int pri;
1128 int cpu;
1129
1130 self = PCPU_GET(cpuid);
1131 ts = td->td_sched;
1132 if (smp_started == 0)
1133 return (self);
1134 /*
1135 * Don't migrate a running thread from sched_switch().
1136 */
1137 if ((flags & SRQ_OURSELF) || !THREAD_CAN_MIGRATE(td))
1138 return (ts->ts_cpu);
1139 /*
1140 * Prefer to run interrupt threads on the processors that generate
1141 * the interrupt.
1142 */
1143 if (td->td_priority <= PRI_MAX_ITHD && THREAD_CAN_SCHED(td, self) &&
1144 curthread->td_intr_nesting_level && ts->ts_cpu != self) {
1145 SCHED_STAT_INC(pickcpu_intrbind);
1146 ts->ts_cpu = self;
1147 }
1148 /*
1149 * If the thread can run on the last cpu and the affinity has not
1150 * expired or it is idle run it there.
1151 */
1152 pri = td->td_priority;
1153 tdq = TDQ_CPU(ts->ts_cpu);
1154 if (THREAD_CAN_SCHED(td, ts->ts_cpu)) {
1155 if (tdq->tdq_lowpri > PRI_MIN_IDLE) {
1156 SCHED_STAT_INC(pickcpu_idle_affinity);
1157 return (ts->ts_cpu);
1158 }
1159 if (SCHED_AFFINITY(ts, CG_SHARE_L2) && tdq->tdq_lowpri > pri) {
1160 SCHED_STAT_INC(pickcpu_affinity);
1161 return (ts->ts_cpu);
1162 }
1163 }
1164 /*
1165 * Search for the highest level in the tree that still has affinity.
1166 */
1167 cg = NULL;
1168 for (cg = tdq->tdq_cg; cg != NULL; cg = cg->cg_parent)
1169 if (SCHED_AFFINITY(ts, cg->cg_level))
1170 break;
1171 cpu = -1;
1172 mask = td->td_cpuset->cs_mask;
1173 if (cg)
1174 cpu = sched_lowest(cg, mask, pri);
1175 if (cpu == -1)
1176 cpu = sched_lowest(cpu_top, mask, -1);
1177 /*
1178 * Compare the lowest loaded cpu to current cpu.
1179 */
1180 if (THREAD_CAN_SCHED(td, self) && TDQ_CPU(self)->tdq_lowpri > pri &&
1181 TDQ_CPU(cpu)->tdq_lowpri < PRI_MIN_IDLE) {
1182 SCHED_STAT_INC(pickcpu_local);
1183 cpu = self;
1184 } else
1185 SCHED_STAT_INC(pickcpu_lowest);
1186 if (cpu != ts->ts_cpu)
1187 SCHED_STAT_INC(pickcpu_migration);
1188 KASSERT(cpu != -1, ("sched_pickcpu: Failed to find a cpu."));
1189 return (cpu);
1190}
1191#endif
1192
1193/*
1194 * Pick the highest priority task we have and return it.
1195 */
1196static struct thread *
1197tdq_choose(struct tdq *tdq)
1198{
1199 struct thread *td;
1200
1201 TDQ_LOCK_ASSERT(tdq, MA_OWNED);
1202 td = runq_choose(&tdq->tdq_realtime);
1203 if (td != NULL)
1204 return (td);
1205 td = runq_choose_from(&tdq->tdq_timeshare, tdq->tdq_ridx);
1206 if (td != NULL) {
1207 KASSERT(td->td_priority >= PRI_MIN_TIMESHARE,
1208 ("tdq_choose: Invalid priority on timeshare queue %d",
1209 td->td_priority));
1210 return (td);
1211 }
1212 td = runq_choose(&tdq->tdq_idle);
1213 if (td != NULL) {
1214 KASSERT(td->td_priority >= PRI_MIN_IDLE,
1215 ("tdq_choose: Invalid priority on idle queue %d",
1216 td->td_priority));
1217 return (td);
1218 }
1219
1220 return (NULL);
1221}
1222
1223/*
1224 * Initialize a thread queue.
1225 */
1226static void
1227tdq_setup(struct tdq *tdq)
1228{
1229
1230 if (bootverbose)
1231 printf("ULE: setup cpu %d\n", TDQ_ID(tdq));
1232 runq_init(&tdq->tdq_realtime);
1233 runq_init(&tdq->tdq_timeshare);
1234 runq_init(&tdq->tdq_idle);
1235 snprintf(tdq->tdq_name, sizeof(tdq->tdq_name),
1236 "sched lock %d", (int)TDQ_ID(tdq));
1237 mtx_init(&tdq->tdq_lock, tdq->tdq_name, "sched lock",
1238 MTX_SPIN | MTX_RECURSE);
1239#ifdef KTR
1240 snprintf(tdq->tdq_loadname, sizeof(tdq->tdq_loadname),
1241 "CPU %d load", (int)TDQ_ID(tdq));
1242#endif
1243}
1244
1245#ifdef SMP
1246static void
1247sched_setup_smp(void)
1248{
1249 struct tdq *tdq;
1250 int i;
1251
1252 cpu_top = smp_topo();
1253 CPU_FOREACH(i) {
1254 tdq = TDQ_CPU(i);
1255 tdq_setup(tdq);
1256 tdq->tdq_cg = smp_topo_find(cpu_top, i);
1257 if (tdq->tdq_cg == NULL)
1258 panic("Can't find cpu group for %d\n", i);
1259 }
1260 balance_tdq = TDQ_SELF();
1261 sched_balance();
1262}
1263#endif
1264
1265/*
1266 * Setup the thread queues and initialize the topology based on MD
1267 * information.
1268 */
1269static void
1270sched_setup(void *dummy)
1271{
1272 struct tdq *tdq;
1273
1274 tdq = TDQ_SELF();
1275#ifdef SMP
1276 sched_setup_smp();
1277#else
1278 tdq_setup(tdq);
1279#endif
1280 /*
1281	 * To avoid divide-by-zero, set realstathz to a dummy value in case
1282	 * sched_clock() is called before sched_initticks().
1283 */
1284 realstathz = hz;
1285 sched_slice = (realstathz/10); /* ~100ms */
1286 tickincr = 1 << SCHED_TICK_SHIFT;
1287
1288 /* Add thread0's load since it's running. */
1289 TDQ_LOCK(tdq);
1290 thread0.td_lock = TDQ_LOCKPTR(TDQ_SELF());
1291 tdq_load_add(tdq, &thread0);
1292 tdq->tdq_lowpri = thread0.td_priority;
1293 TDQ_UNLOCK(tdq);
1294}
1295
1296/*
1297 * This routine determines the tickincr after stathz and hz are setup.
1298 */
1299/* ARGSUSED */
1300static void
1301sched_initticks(void *dummy)
1302{
1303 int incr;
1304
1305 realstathz = stathz ? stathz : hz;
1306 sched_slice = (realstathz/10); /* ~100ms */
1307
1308 /*
1309 * tickincr is shifted out by 10 to avoid rounding errors due to
1310 * hz not being evenly divisible by stathz on all platforms.
1311 */
1312 incr = (hz << SCHED_TICK_SHIFT) / realstathz;
1313 /*
1314 * This does not work for values of stathz that are more than
1315 * 1 << SCHED_TICK_SHIFT * hz. In practice this does not happen.
1316 */
1317 if (incr == 0)
1318 incr = 1;
1319 tickincr = incr;
1320#ifdef SMP
1321 /*
1322 * Set the default balance interval now that we know
1323 * what realstathz is.
1324 */
1325 balance_interval = realstathz;
1326 /*
1327	 * Set steal_thresh to roughly log2(mp_ncpus) but no greater than 3.
1328 * This prevents excess thrashing on large machines and excess idle
1329 * on smaller machines.
1330 */
1331 steal_thresh = min(fls(mp_ncpus) - 1, 3);
1332 affinity = SCHED_AFFINITY_DEFAULT;
1333#endif
1334}
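/*
 * Worked example (illustrative, assuming hz == 1000 and stathz == 128):
 * realstathz == 128, sched_slice == 12 stathz ticks (about 94ms) and
 * tickincr == (1000 << 10) / 128 == 8000. For steal_thresh, an 8-cpu
 * machine gives fls(8) - 1 == 3 while a 2-cpu machine gives
 * fls(2) - 1 == 1.
 */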
1335
1336
1337/*
1338 * This is the core of the interactivity algorithm. Determines a score based
1339 * on past behavior. It is the ratio of sleep time to run time scaled to
1340 * a [0, 100] integer. This is the voluntary sleep time of a process, which
1341 * differs from the cpu usage because it does not account for time spent
1342 * waiting on a run-queue. Would be prettier if we had floating point.
1343 */
1344static int
1345sched_interact_score(struct thread *td)
1346{
1347 struct td_sched *ts;
1348 int div;
1349
1350 ts = td->td_sched;
1351 /*
1352 * The score is only needed if this is likely to be an interactive
1353 * task. Don't go through the expense of computing it if there's
1354 * no chance.
1355 */
1356 if (sched_interact <= SCHED_INTERACT_HALF &&
1357 ts->ts_runtime >= ts->ts_slptime)
1358 return (SCHED_INTERACT_HALF);
1359
1360 if (ts->ts_runtime > ts->ts_slptime) {
1361 div = max(1, ts->ts_runtime / SCHED_INTERACT_HALF);
1362 return (SCHED_INTERACT_HALF +
1363 (SCHED_INTERACT_HALF - (ts->ts_slptime / div)));
1364 }
1365 if (ts->ts_slptime > ts->ts_runtime) {
1366 div = max(1, ts->ts_slptime / SCHED_INTERACT_HALF);
1367 return (ts->ts_runtime / div);
1368 }
1369 /* runtime == slptime */
1370 if (ts->ts_runtime)
1371 return (SCHED_INTERACT_HALF);
1372
1373 /*
1374 * This can happen if slptime and runtime are 0.
1375 */
1376 return (0);
1377
1378}
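/*
 * Worked example (illustrative, assuming SCHED_INTERACT_HALF == 50 and
 * the default sched_interact threshold of 30): a thread that has slept
 * three times as long as it has run, say ts_slptime == 3000 and
 * ts_runtime == 1000, gets div == 3000 / 50 == 60 and a score of
 * 1000 / 60 == 16, comfortably interactive. A thread with
 * ts_runtime >= ts_slptime never reaches the later branches here; the
 * early return above hands back SCHED_INTERACT_HALF, which already
 * exceeds the threshold.
 */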
1379
1380/*
1381 * Scale the scheduling priority according to the "interactivity" of this
1382 * process.
1383 */
1384static void
1385sched_priority(struct thread *td)
1386{
1387 int score;
1388 int pri;
1389
1390 if (td->td_pri_class != PRI_TIMESHARE)
1391 return;
1392 /*
1393 * If the score is interactive we place the thread in the realtime
1394 * queue with a priority that is less than kernel and interrupt
1395 * priorities. These threads are not subject to nice restrictions.
1396 *
1397 * Scores greater than this are placed on the normal timeshare queue
1398 * where the priority is partially decided by the most recent cpu
1399 * utilization and the rest is decided by nice value.
1400 *
1401 * The nice value of the process has a linear effect on the calculated
1402 * score. Negative nice values make it easier for a thread to be
1403 * considered interactive.
1404 */
1405 score = imax(0, sched_interact_score(td) + td->td_proc->p_nice);
1406 if (score < sched_interact) {
1407 pri = PRI_MIN_REALTIME;
1408 pri += ((PRI_MAX_REALTIME - PRI_MIN_REALTIME) / sched_interact)
1409 * score;
1410 KASSERT(pri >= PRI_MIN_REALTIME && pri <= PRI_MAX_REALTIME,
1411 ("sched_priority: invalid interactive priority %d score %d",
1412 pri, score));
1413 } else {
1414 pri = SCHED_PRI_MIN;
1415 if (td->td_sched->ts_ticks)
1416 pri += SCHED_PRI_TICKS(td->td_sched);
1417 pri += SCHED_PRI_NICE(td->td_proc->p_nice);
1418 KASSERT(pri >= PRI_MIN_TIMESHARE && pri <= PRI_MAX_TIMESHARE,
1419 ("sched_priority: invalid priority %d: nice %d, "
1420 "ticks %d ftick %d ltick %d tick pri %d",
1421 pri, td->td_proc->p_nice, td->td_sched->ts_ticks,
1422 td->td_sched->ts_ftick, td->td_sched->ts_ltick,
1423 SCHED_PRI_TICKS(td->td_sched)));
1424 }
1425 sched_user_prio(td, pri);
1426
1427 return;
1428}
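/*
 * Worked example (illustrative, assuming the default sched_interact
 * threshold of 30): a thread scoring 10 at nice 0 is interactive and
 * is placed about a third of the way into the realtime range, at
 * PRI_MIN_REALTIME + ((PRI_MAX_REALTIME - PRI_MIN_REALTIME) / 30) * 10.
 * The same thread at nice 5 scores 15 and lands about halfway through
 * that range, while a score of 30 or more falls through to the
 * timeshare calculation driven by recent cpu ticks and nice.
 */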
1429
1430/*
1431 * This routine enforces a maximum limit on the amount of scheduling history
1432 * kept. It is called after either the slptime or runtime is adjusted. This
1433 * function is ugly due to integer math.
1434 */
1435static void
1436sched_interact_update(struct thread *td)
1437{
1438 struct td_sched *ts;
1439 u_int sum;
1440
1441 ts = td->td_sched;
1442 sum = ts->ts_runtime + ts->ts_slptime;
1443 if (sum < SCHED_SLP_RUN_MAX)
1444 return;
1445 /*
1446 * This only happens from two places:
1447 * 1) We have added an unusual amount of run time from fork_exit.
1448 * 2) We have added an unusual amount of sleep time from sched_sleep().
1449 */
1450 if (sum > SCHED_SLP_RUN_MAX * 2) {
1451 if (ts->ts_runtime > ts->ts_slptime) {
1452 ts->ts_runtime = SCHED_SLP_RUN_MAX;
1453 ts->ts_slptime = 1;
1454 } else {
1455 ts->ts_slptime = SCHED_SLP_RUN_MAX;
1456 ts->ts_runtime = 1;
1457 }
1458 return;
1459 }
1460 /*
1461 * If we have exceeded by more than 1/5th then the algorithm below
1462 * will not bring us back into range. Dividing by two here forces
1463	 * us into the range of [4/5 * SCHED_SLP_RUN_MAX, SCHED_SLP_RUN_MAX].
1464 */
1465 if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) {
1466 ts->ts_runtime /= 2;
1467 ts->ts_slptime /= 2;
1468 return;
1469 }
1470 ts->ts_runtime = (ts->ts_runtime / 5) * 4;
1471 ts->ts_slptime = (ts->ts_slptime / 5) * 4;
1472}
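/*
 * Worked example (illustrative; write M for SCHED_SLP_RUN_MAX): a sum
 * of 1.1 * M falls through to the 4/5 scaling and ends up near
 * 0.88 * M; a sum of 1.3 * M exceeds (M / 5) * 6 and is halved to
 * 0.65 * M; a sum above 2 * M is collapsed outright, keeping only
 * which of runtime or slptime dominated.
 */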
1473
1474/*
1475 * Scale back the interactivity history when a child thread is created. The
1476 * history is inherited from the parent but the thread may behave totally
1477 * differently. For example, a shell spawning a compiler process. We want
1478 * to learn that the compiler is behaving badly very quickly.
1479 */
1480static void
1481sched_interact_fork(struct thread *td)
1482{
1483 int ratio;
1484 int sum;
1485
1486 sum = td->td_sched->ts_runtime + td->td_sched->ts_slptime;
1487 if (sum > SCHED_SLP_RUN_FORK) {
1488 ratio = sum / SCHED_SLP_RUN_FORK;
1489 td->td_sched->ts_runtime /= ratio;
1490 td->td_sched->ts_slptime /= ratio;
1491 }
1492}
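/*
 * Worked example (illustrative): if the inherited sum is three times
 * SCHED_SLP_RUN_FORK, ratio == 3 and both ts_runtime and ts_slptime
 * are divided by 3. The child keeps the parent's sleep/run ratio but
 * only a third of the history, so its own behavior dominates the
 * score quickly.
 */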
1493
1494/*
1495 * Called from proc0_init() to setup the scheduler fields.
1496 */
1497void
1498schedinit(void)
1499{
1500
1501 /*
1502 * Set up the scheduler specific parts of proc0.
1503 */
1504 proc0.p_sched = NULL; /* XXX */
1505 thread0.td_sched = &td_sched0;
1506 td_sched0.ts_ltick = ticks;
1507 td_sched0.ts_ftick = ticks;
1508 td_sched0.ts_slice = sched_slice;
1509}
1510
1511/*
1512	 * This is only somewhat accurate since, given many processes of the
1513	 * same priority, they will switch when their slices run out, which
1514	 * will be at most sched_slice stathz ticks.
1515 */
1516int
1517sched_rr_interval(void)
1518{
1519
1520 /* Convert sched_slice to hz */
1521 return (hz/(realstathz/sched_slice));
1522}
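/*
 * Worked example (illustrative, assuming hz == 1000, realstathz == 128
 * and sched_slice == 12): realstathz / sched_slice == 10, so the
 * reported interval is 1000 / 10 == 100 hz ticks, about 100ms, close
 * to the 12 stathz tick (~94ms) slice itself.
 */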
1523
1524/*
1525 * Update the percent cpu tracking information when it is requested or
1526 * the total history exceeds the maximum. We keep a sliding history of
1527 * tick counts that slowly decays. This is less precise than the 4BSD
1528 * mechanism since it happens with less regular and frequent events.
1529 */
1530static void
1531sched_pctcpu_update(struct td_sched *ts)
1532{
1533
1534 if (ts->ts_ticks == 0)
1535 return;
1536 if (ticks - (hz / 10) < ts->ts_ltick &&
1537 SCHED_TICK_TOTAL(ts) < SCHED_TICK_MAX)
1538 return;
1539 /*
1540 * Adjust counters and watermark for pctcpu calc.
1541 */
1542 if (ts->ts_ltick > ticks - SCHED_TICK_TARG)
1543 ts->ts_ticks = (ts->ts_ticks / (ticks - ts->ts_ftick)) *
1544 SCHED_TICK_TARG;
1545 else
1546 ts->ts_ticks = 0;
1547 ts->ts_ltick = ticks;
1548 ts->ts_ftick = ts->ts_ltick - SCHED_TICK_TARG;
1549}
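/*
 * Illustration (a sketch, assuming SCHED_TICK_TARG is a window of
 * roughly ten seconds worth of hz ticks): if the thread ticked within
 * the window but its history spans, say, fifteen seconds, ts_ticks is
 * scaled by SCHED_TICK_TARG / (ticks - ts_ftick), i.e. to the count a
 * ten second window would hold at the same rate; if it has not ticked
 * within the window at all the history is discarded. Either way
 * ftick/ltick are pinned to a window ending now.
 */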
1550
1551/*
1552 * Adjust the priority of a thread. Move it to the appropriate run-queue
1553 * if necessary. This is the back-end for several priority related
1554 * functions.
1555 */
1556static void
1557sched_thread_priority(struct thread *td, u_char prio)
1558{
1559 struct td_sched *ts;
1560 struct tdq *tdq;
1561 int oldpri;
1562
1563 KTR_POINT3(KTR_SCHED, "thread", sched_tdname(td), "prio",
1564 "prio:%d", td->td_priority, "new prio:%d", prio,
1565 KTR_ATTR_LINKED, sched_tdname(curthread));
1566 if (td != curthread && prio > td->td_priority) {
1567 KTR_POINT3(KTR_SCHED, "thread", sched_tdname(curthread),
1568 "lend prio", "prio:%d", td->td_priority, "new prio:%d",
1569 prio, KTR_ATTR_LINKED, sched_tdname(td));
1570 }
1571 ts = td->td_sched;
1572 THREAD_LOCK_ASSERT(td, MA_OWNED);
1573 if (td->td_priority == prio)
1574 return;
1575 /*
1576 * If the priority has been elevated due to priority
1577 * propagation, we may have to move ourselves to a new
1578 * queue. This could be optimized to not re-add in some
1579 * cases.
1580 */
1581 if (TD_ON_RUNQ(td) && prio < td->td_priority) {
1582 sched_rem(td);
1583 td->td_priority = prio;
1584 sched_add(td, SRQ_BORROWING);
1585 return;
1586 }
1587 /*
1588 * If the thread is currently running we may have to adjust the lowpri
1589 * information so other cpus are aware of our current priority.
1590 */
1591 if (TD_IS_RUNNING(td)) {
1592 tdq = TDQ_CPU(ts->ts_cpu);
1593 oldpri = td->td_priority;
1594 td->td_priority = prio;
1595 if (prio < tdq->tdq_lowpri)
1596 tdq->tdq_lowpri = prio;
1597 else if (tdq->tdq_lowpri == oldpri)
1598 tdq_setlowpri(tdq, td);
1599 return;
1600 }
1601 td->td_priority = prio;
1602}
1603
1604/*
1605 * Update a thread's priority when it is lent another thread's
1606 * priority.
1607 */
1608void
1609sched_lend_prio(struct thread *td, u_char prio)
1610{
1611
1612 td->td_flags |= TDF_BORROWING;
1613 sched_thread_priority(td, prio);
1614}
1615
1616/*
1617 * Restore a thread's priority when priority propagation is
1618 * over. The prio argument is the minimum priority the thread
1619 * needs to have to satisfy other possible priority lending
1620 * requests. If the thread's regular priority is less
1621 * important than prio, the thread will keep a priority boost
1622 * of prio.
1623 */
1624void
1625sched_unlend_prio(struct thread *td, u_char prio)
1626{
1627 u_char base_pri;
1628
1629 if (td->td_base_pri >= PRI_MIN_TIMESHARE &&
1630 td->td_base_pri <= PRI_MAX_TIMESHARE)
1631 base_pri = td->td_user_pri;
1632 else
1633 base_pri = td->td_base_pri;
1634 if (prio >= base_pri) {
1635 td->td_flags &= ~TDF_BORROWING;
1636 sched_thread_priority(td, base_pri);
1637 } else
1638 sched_lend_prio(td, prio);
1639}
1640
1641/*
1642 * Standard entry for setting the priority to an absolute value.
1643 */
1644void
1645sched_prio(struct thread *td, u_char prio)
1646{
1647 u_char oldprio;
1648
1649 /* First, update the base priority. */
1650 td->td_base_pri = prio;
1651
1652 /*
1653 * If the thread is borrowing another thread's priority, don't
1654 * ever lower the priority.
1655 */
1656 if (td->td_flags & TDF_BORROWING && td->td_priority < prio)
1657 return;
1658
1659 /* Change the real priority. */
1660 oldprio = td->td_priority;
1661 sched_thread_priority(td, prio);
1662
1663 /*
1664 * If the thread is on a turnstile, then let the turnstile update
1665 * its state.
1666 */
1667 if (TD_ON_LOCK(td) && oldprio != prio)
1668 turnstile_adjust(td, oldprio);
1669}
1670
1671/*
1672	 * Set the base user priority; this does not affect the current running priority.
1673 */
1674void
1675sched_user_prio(struct thread *td, u_char prio)
1676{
1677 u_char oldprio;
1678
1679 td->td_base_user_pri = prio;
1680 if (td->td_flags & TDF_UBORROWING && td->td_user_pri <= prio)
1681 return;
1682 oldprio = td->td_user_pri;
1683 td->td_user_pri = prio;
1684}
1685
1686void
1687sched_lend_user_prio(struct thread *td, u_char prio)
1688{
1689 u_char oldprio;
1690
1691 THREAD_LOCK_ASSERT(td, MA_OWNED);
1692 td->td_flags |= TDF_UBORROWING;
1693 oldprio = td->td_user_pri;
1694 td->td_user_pri = prio;
1695}
1696
1697void
1698sched_unlend_user_prio(struct thread *td, u_char prio)
1699{
1700 u_char base_pri;
1701
1702 THREAD_LOCK_ASSERT(td, MA_OWNED);
1703 base_pri = td->td_base_user_pri;
1704 if (prio >= base_pri) {
1705 td->td_flags &= ~TDF_UBORROWING;
1706 sched_user_prio(td, base_pri);
1707 } else {
1708 sched_lend_user_prio(td, prio);
1709 }
1710}
1711
1712/*
1713 * Handle migration from sched_switch(). This happens only for
1714 * cpu binding.
1715 */
1716static struct mtx *
1717sched_switch_migrate(struct tdq *tdq, struct thread *td, int flags)
1718{
1719 struct tdq *tdn;
1720
1721 tdn = TDQ_CPU(td->td_sched->ts_cpu);
1722#ifdef SMP
1723 tdq_load_rem(tdq, td);
1724 /*
1725 * Do the lock dance required to avoid LOR. We grab an extra
1726 * spinlock nesting to prevent preemption while we're
1727 * not holding either run-queue lock.
1728 */
1729 spinlock_enter();
1730 thread_lock_block(td); /* This releases the lock on tdq. */
1731
1732 /*
1733 * Acquire both run-queue locks before placing the thread on the new
1734 * run-queue to avoid deadlocks created by placing a thread with a
1735 * blocked lock on the run-queue of a remote processor. The deadlock
1736 * occurs when a third processor attempts to lock the two queues in
1737 * question while the target processor is spinning with its own
1738 * run-queue lock held while waiting for the blocked lock to clear.
1739 */
1740 tdq_lock_pair(tdn, tdq);
1741 tdq_add(tdn, td, flags);
1742 tdq_notify(tdn, td);
1743 TDQ_UNLOCK(tdn);
1744 spinlock_exit();
1745#endif
1746 return (TDQ_LOCKPTR(tdn));
1747}
1748
1749/*
1750	 * A variant of thread_lock_unblock() that does not assume td_lock
1751 * is blocked.
1752 */
1753static inline void
1754thread_unblock_switch(struct thread *td, struct mtx *mtx)
1755{
1756 atomic_store_rel_ptr((volatile uintptr_t *)&td->td_lock,
1757 (uintptr_t)mtx);
1758}
1759
1760/*
1761 * Switch threads. This function has to handle threads coming in while
1762 * blocked for some reason, running, or idle. It also must deal with
1763 * migrating a thread from one queue to another as running threads may
1764 * be assigned elsewhere via binding.
1765 */
1766void
1767sched_switch(struct thread *td, struct thread *newtd, int flags)
1768{
1769 struct tdq *tdq;
1770 struct td_sched *ts;
1771 struct mtx *mtx;
1772 int srqflag;
1773 int cpuid;
1774
1775 THREAD_LOCK_ASSERT(td, MA_OWNED);
1776 KASSERT(newtd == NULL, ("sched_switch: Unsupported newtd argument"));
1777
1778 cpuid = PCPU_GET(cpuid);
1779 tdq = TDQ_CPU(cpuid);
1780 ts = td->td_sched;
1781 mtx = td->td_lock;
1782 ts->ts_rltick = ticks;
1783 td->td_lastcpu = td->td_oncpu;
1784 td->td_oncpu = NOCPU;
1785 td->td_flags &= ~TDF_NEEDRESCHED;
1786 td->td_owepreempt = 0;
1787 tdq->tdq_switchcnt++;
1788 /*
1789	 * The lock pointer in an idle thread should never change. Reset the
1790	 * thread state to CAN_RUN as well.
1791 */
1792 if (TD_IS_IDLETHREAD(td)) {
1793 MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
1794 TD_SET_CAN_RUN(td);
1795 } else if (TD_IS_RUNNING(td)) {
1796 MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
1797 srqflag = (flags & SW_PREEMPT) ?
1798 SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
1799 SRQ_OURSELF|SRQ_YIELDING;
1800#ifdef SMP
1801 if (THREAD_CAN_MIGRATE(td) && !THREAD_CAN_SCHED(td, ts->ts_cpu))
1802 ts->ts_cpu = sched_pickcpu(td, 0);
1803#endif
1804 if (ts->ts_cpu == cpuid)
1805 tdq_runq_add(tdq, td, srqflag);
1806 else {
1807 KASSERT(THREAD_CAN_MIGRATE(td) ||
1808 (ts->ts_flags & TSF_BOUND) != 0,
1809 ("Thread %p shouldn't migrate", td));
1810 mtx = sched_switch_migrate(tdq, td, srqflag);
1811 }
1812 } else {
1813 /* This thread must be going to sleep. */
1814 TDQ_LOCK(tdq);
1815 mtx = thread_lock_block(td);
1816 tdq_load_rem(tdq, td);
1817 }
1818 /*
1819 * We enter here with the thread blocked and assigned to the
1820 * appropriate cpu run-queue or sleep-queue and with the current
1821 * thread-queue locked.
1822 */
1823 TDQ_LOCK_ASSERT(tdq, MA_OWNED | MA_NOTRECURSED);
1824 newtd = choosethread();
1825 /*
1826 * Call the MD code to switch contexts if necessary.
1827 */
1828 if (td != newtd) {
1829#ifdef HWPMC_HOOKS
1830 if (PMC_PROC_IS_USING_PMCS(td->td_proc))
1831 PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
1832#endif
1833 lock_profile_release_lock(&TDQ_LOCKPTR(tdq)->lock_object);
1834 TDQ_LOCKPTR(tdq)->mtx_lock = (uintptr_t)newtd;
1835
1836#ifdef KDTRACE_HOOKS
1837 /*
1838 * If DTrace has set the active vtime enum to anything
1839 * other than INACTIVE (0), then it should have set the
1840 * function to call.
1841 */
1842 if (dtrace_vtime_active)
1843 (*dtrace_vtime_switch_func)(newtd);
1844#endif
1845
1846 cpu_switch(td, newtd, mtx);
1847 /*
1848 * We may return from cpu_switch on a different cpu. However,
1849 * we always return with td_lock pointing to the current cpu's
1850 * run queue lock.
1851 */
1852 cpuid = PCPU_GET(cpuid);
1853 tdq = TDQ_CPU(cpuid);
1854 lock_profile_obtain_lock_success(
1855 &TDQ_LOCKPTR(tdq)->lock_object, 0, 0, __FILE__, __LINE__);
1856#ifdef HWPMC_HOOKS
1857 if (PMC_PROC_IS_USING_PMCS(td->td_proc))
1858 PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
1859#endif
1860 } else
1861 thread_unblock_switch(td, mtx);
1862 /*
1863 * Assert that all went well and return.
1864 */
1865 TDQ_LOCK_ASSERT(tdq, MA_OWNED|MA_NOTRECURSED);
1866 MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
1867 td->td_oncpu = cpuid;
1868}
1869
1870/*
1871 * Adjust thread priorities as a result of a nice request.
1872 */
1873void
1874sched_nice(struct proc *p, int nice)
1875{
1876 struct thread *td;
1877
1878 PROC_LOCK_ASSERT(p, MA_OWNED);
1879
1880 p->p_nice = nice;
1881 FOREACH_THREAD_IN_PROC(p, td) {
1882 thread_lock(td);
1883 sched_priority(td);
1884 sched_prio(td, td->td_base_user_pri);
1885 thread_unlock(td);
1886 }
1887}
1888
1889/*
1890 * Record the sleep time for the interactivity scorer.
1891 */
1892void
1893sched_sleep(struct thread *td, int prio)
1894{
1895
1896 THREAD_LOCK_ASSERT(td, MA_OWNED);
1897
1898 td->td_slptick = ticks;
1899 if (TD_IS_SUSPENDED(td) || prio >= PSOCK)
1900 td->td_flags |= TDF_CANSWAP;
1901 if (static_boost == 1 && prio)
1902 sched_prio(td, prio);
1903 else if (static_boost && td->td_priority > static_boost)
1904 sched_prio(td, static_boost);
1905}
1906
1907/*
1908 * Schedule a thread to resume execution and record how long it voluntarily
1909 * slept. We also update the pctcpu, interactivity, and priority.
1910 */
1911void
1912sched_wakeup(struct thread *td)
1913{
1914 struct td_sched *ts;
1915 int slptick;
1916
1917 THREAD_LOCK_ASSERT(td, MA_OWNED);
1918 ts = td->td_sched;
1919 td->td_flags &= ~TDF_CANSWAP;
1920 /*
1921 * If we slept for more than a tick update our interactivity and
1922 * priority.
1923 */
1924 slptick = td->td_slptick;
1925 td->td_slptick = 0;
1926 if (slptick && slptick != ticks) {
1927 u_int hzticks;
1928
1929 hzticks = (ticks - slptick) << SCHED_TICK_SHIFT;
1930 ts->ts_slptime += hzticks;
1931 sched_interact_update(td);
1932 sched_pctcpu_update(ts);
1933 }
1934 /* Reset the slice value after we sleep. */
1935 ts->ts_slice = sched_slice;
1936 sched_add(td, SRQ_BORING);
1937}
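/*
 * Worked example (illustrative, assuming hz == 1000 and
 * SCHED_TICK_SHIFT == 10): a thread that slept for 250 hz ticks, a
 * quarter of a second, has 250 << 10 == 256000 added to ts_slptime,
 * the same fixed-point scale tickincr uses for runtime, before the
 * interactivity score and priority are recomputed.
 */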
1938
1939/*
1940 * Penalize the parent for creating a new child and initialize the child's
1941 * priority.
1942 */
1943void
1944sched_fork(struct thread *td, struct thread *child)
1945{
1946 THREAD_LOCK_ASSERT(td, MA_OWNED);
1947 sched_fork_thread(td, child);
1948 /*
1949 * Penalize the parent and child for forking.
1950 */
1951 sched_interact_fork(child);
1952 sched_priority(child);
1953 td->td_sched->ts_runtime += tickincr;
1954 sched_interact_update(td);
1955 sched_priority(td);
1956}
1957
1958/*
1959 * Fork a new thread, may be within the same process.
1960 */
1961void
1962sched_fork_thread(struct thread *td, struct thread *child)
1963{
1964 struct td_sched *ts;
1965 struct td_sched *ts2;
1966
1967 THREAD_LOCK_ASSERT(td, MA_OWNED);
1968 /*
1969 * Initialize child.
1970 */
1971 ts = td->td_sched;
1972 ts2 = child->td_sched;
1973 child->td_lock = TDQ_LOCKPTR(TDQ_SELF());
1974 child->td_cpuset = cpuset_ref(td->td_cpuset);
1975 ts2->ts_cpu = ts->ts_cpu;
1976 ts2->ts_flags = 0;
1977 /*
1978 * Grab our parents cpu estimation information and priority.
1979 */
1980 ts2->ts_ticks = ts->ts_ticks;
1981 ts2->ts_ltick = ts->ts_ltick;
1982 ts2->ts_incrtick = ts->ts_incrtick;
1983 ts2->ts_ftick = ts->ts_ftick;
1984 child->td_user_pri = td->td_user_pri;
1985 child->td_base_user_pri = td->td_base_user_pri;
1986 /*
1987 * And update interactivity score.
1988 */
1989 ts2->ts_slptime = ts->ts_slptime;
1990 ts2->ts_runtime = ts->ts_runtime;
1991 ts2->ts_slice = 1; /* Attempt to quickly learn interactivity. */
1992#ifdef KTR
1993 bzero(ts2->ts_name, sizeof(ts2->ts_name));
1994#endif
1995}
1996
1997/*
1998 * Adjust the priority class of a thread.
1999 */
2000void
2001sched_class(struct thread *td, int class)
2002{
2003
2004 THREAD_LOCK_ASSERT(td, MA_OWNED);
2005 if (td->td_pri_class == class)
2006 return;
2007 td->td_pri_class = class;
2008}
2009
2010/*
2011 * Return some of the child's priority and interactivity to the parent.
2012 */
2013void
2014sched_exit(struct proc *p, struct thread *child)
2015{
2016 struct thread *td;
2017
2018 KTR_STATE1(KTR_SCHED, "thread", sched_tdname(child), "proc exit",
2019 "prio:td", child->td_priority);
2020 PROC_LOCK_ASSERT(p, MA_OWNED);
2021 td = FIRST_THREAD_IN_PROC(p);
2022 sched_exit_thread(td, child);
2023}
2024
2025/*
2026 * Penalize another thread for the time spent on this one. This helps to
2027 * worsen the priority and interactivity of processes which schedule batch
2028 * jobs such as make. This has little effect on the make process itself but
2029 * causes new processes spawned by it to receive worse scores immediately.
2030 */
2031void
2032sched_exit_thread(struct thread *td, struct thread *child)
2033{
2034
2035 KTR_STATE1(KTR_SCHED, "thread", sched_tdname(child), "thread exit",
2036 "prio:td", child->td_priority);
2037 /*
2038 * Give the child's runtime to the parent without returning the
2039 * sleep time as a penalty to the parent. This causes shells that
2040 * launch expensive things to mark their children as expensive.
2041 */
2042 thread_lock(td);
2043 td->td_sched->ts_runtime += child->td_sched->ts_runtime;
2044 sched_interact_update(td);
2045 sched_priority(td);
2046 thread_unlock(td);
2047}
2048
2049void
2050sched_preempt(struct thread *td)
2051{
2052 struct tdq *tdq;
2053
2054 thread_lock(td);
2055 tdq = TDQ_SELF();
2056 TDQ_LOCK_ASSERT(tdq, MA_OWNED);
2057 tdq->tdq_ipipending = 0;
2058 if (td->td_priority > tdq->tdq_lowpri) {
2059 int flags;
2060
2061 flags = SW_INVOL | SW_PREEMPT;
2062 if (td->td_critnest > 1)
2063 td->td_owepreempt = 1;
2064 else if (TD_IS_IDLETHREAD(td))
2065 mi_switch(flags | SWT_REMOTEWAKEIDLE, NULL);
2066 else
2067 mi_switch(flags | SWT_REMOTEPREEMPT, NULL);
2068 }
2069 thread_unlock(td);
2070}
2071
2072/*
2073 * Fix priorities on return to user-space. Priorities may be elevated due
2074 * to static priorities in msleep() or similar.
2075 */
2076void
2077sched_userret(struct thread *td)
2078{
2079 /*
2080 * XXX we cheat slightly on the locking here to avoid locking in
2081 * the usual case. Setting td_priority here is essentially an
2082 * incomplete workaround for not setting it properly elsewhere.
2083 * Now that some interrupt handlers are threads, not setting it
2084 * properly elsewhere can clobber it in the window between setting
2085 * it here and returning to user mode, so don't waste time setting
2086 * it perfectly here.
2087 */
2088 KASSERT((td->td_flags & TDF_BORROWING) == 0,
2089 ("thread with borrowed priority returning to userland"));
2090 if (td->td_priority != td->td_user_pri) {
2091 thread_lock(td);
2092 td->td_priority = td->td_user_pri;
2093 td->td_base_pri = td->td_user_pri;
2094 tdq_setlowpri(TDQ_SELF(), td);
2095 thread_unlock(td);
2096 }
2097}
2098
2099/*
2100 * Handle a stathz tick. This is really only relevant for timeshare
2101 * threads.
2102 */
2103void
2104sched_clock(struct thread *td)
2105{
2106 struct tdq *tdq;
2107 struct td_sched *ts;
2108
2109 THREAD_LOCK_ASSERT(td, MA_OWNED);
2110 tdq = TDQ_SELF();
2111#ifdef SMP
2112 /*
2113 * We run the long term load balancer infrequently on the first cpu.
2114 */
2115 if (balance_tdq == tdq) {
2116 if (balance_ticks && --balance_ticks == 0)
2117 sched_balance();
2118 }
2119#endif
2120 /*
2121 * Save the old switch count so we have a record of the last ticks
2122 * activity. Initialize the new switch count based on our load.
2123 * If there is some activity seed it to reflect that.
2124 */
2125 tdq->tdq_oldswitchcnt = tdq->tdq_switchcnt;
2126 tdq->tdq_switchcnt = tdq->tdq_load;
2127 /*
2128 * Advance the insert index once for each tick to ensure that all
2129 * threads get a chance to run.
2130 */
2131 if (tdq->tdq_idx == tdq->tdq_ridx) {
2132 tdq->tdq_idx = (tdq->tdq_idx + 1) % RQ_NQS;
2133 if (TAILQ_EMPTY(&tdq->tdq_timeshare.rq_queues[tdq->tdq_ridx]))
2134 tdq->tdq_ridx = tdq->tdq_idx;
2135 }
2136 ts = td->td_sched;
2137 if (td->td_pri_class & PRI_FIFO_BIT)
2138 return;
2139 if (td->td_pri_class == PRI_TIMESHARE) {
2140 /*
2141 * We used a tick; charge it to the thread so
2142 * that we can compute our interactivity.
2143 */
2144 td->td_sched->ts_runtime += tickincr;
2145 sched_interact_update(td);
2146 sched_priority(td);
2147 }
2148 /*
2149 * We used up one time slice.
2150 */
2151 if (--ts->ts_slice > 0)
2152 return;
2153 /*
2154 * We're out of time, force a requeue at userret().
2155 */
2156 ts->ts_slice = sched_slice;
2157 td->td_flags |= TDF_NEEDRESCHED;
2158}
2159
2160/*
2161 * Called once per hz tick. Used for cpu utilization information. This
2162 * is easier than trying to scale based on stathz.
2163 */
2164void
2165sched_tick(void)
2166{
2167 struct td_sched *ts;
2168
2169 ts = curthread->td_sched;
2170 /*
2171 * Ticks is updated asynchronously on a single cpu. Check here to
2172 * avoid incrementing ts_ticks multiple times in a single tick.
2173 */
2174 if (ts->ts_incrtick == ticks)
2175 return;
2176 /* Adjust ticks for pctcpu */
2177 ts->ts_ticks += 1 << SCHED_TICK_SHIFT;
2178 ts->ts_ltick = ticks;
2179 ts->ts_incrtick = ticks;
2180 /*
2181	 * Update if we've exceeded our desired tick threshold by over one
2182 * second.
2183 */
2184 if (ts->ts_ftick + SCHED_TICK_MAX < ts->ts_ltick)
2185 sched_pctcpu_update(ts);
2186}
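/*
 * Illustration (assuming SCHED_TICK_SHIFT == 10): every hz tick spent
 * on cpu adds 1024 to ts_ticks, so a fully cpu-bound thread gains
 * about hz * 1024 per second. Once ts_ltick runs more than
 * SCHED_TICK_MAX past ts_ftick the history is folded back down by
 * sched_pctcpu_update() above.
 */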
2187
2188/*
2189 * Return whether the current CPU has runnable tasks. Used for in-kernel
2190 * cooperative idle threads.
2191 */
2192int
2193sched_runnable(void)
2194{
2195 struct tdq *tdq;
2196 int load;
2197
2198 load = 1;
2199
2200 tdq = TDQ_SELF();
2201 if ((curthread->td_flags & TDF_IDLETD) != 0) {
2202 if (tdq->tdq_load > 0)
2203 goto out;
2204 } else
2205 if (tdq->tdq_load - 1 > 0)
2206 goto out;
2207 load = 0;
2208out:
2209 return (load);
2210}
2211
2212/*
2213 * Choose the highest priority thread to run. The thread is removed from
2214 * the run-queue while running however the load remains. For SMP we set
2215 * the tdq in the global idle bitmask if it idles here.
2216 */
2217struct thread *
2218sched_choose(void)
2219{
2220 struct thread *td;
2221 struct tdq *tdq;
2222
2223 tdq = TDQ_SELF();
2224 TDQ_LOCK_ASSERT(tdq, MA_OWNED);
2225 td = tdq_choose(tdq);
2226 if (td) {
2227 td->td_sched->ts_ltick = ticks;
2228 tdq_runq_rem(tdq, td);
2229 tdq->tdq_lowpri = td->td_priority;
2230 return (td);
2231 }
2232 tdq->tdq_lowpri = PRI_MAX_IDLE;
2233 return (PCPU_GET(idlethread));
2234}
2235
2236/*
2237 * Set owepreempt if necessary. Preemption never happens directly in ULE,
2238 * we always request it once we exit a critical section.
2239 */
2240static inline void
2241sched_setpreempt(struct thread *td)
2242{
2243 struct thread *ctd;
2244 int cpri;
2245 int pri;
2246
2247 THREAD_LOCK_ASSERT(curthread, MA_OWNED);
2248
2249 ctd = curthread;
2250 pri = td->td_priority;
2251 cpri = ctd->td_priority;
2252 if (pri < cpri)
2253 ctd->td_flags |= TDF_NEEDRESCHED;
2254 if (panicstr != NULL || pri >= cpri || cold || TD_IS_INHIBITED(ctd))
2255 return;
2256 if (!sched_shouldpreempt(pri, cpri, 0))
2257 return;
2258 ctd->td_owepreempt = 1;
2259}
2260
2261/*
2262 * Add a thread to a thread queue. Select the appropriate runq and add the
2263 * thread to it. This is the internal function called when the tdq is
2264 * predetermined.
2265 */
2266void
2267tdq_add(struct tdq *tdq, struct thread *td, int flags)
2268{
2269
2270 TDQ_LOCK_ASSERT(tdq, MA_OWNED);
2271 KASSERT((td->td_inhibitors == 0),
2272 ("sched_add: trying to run inhibited thread"));
2273 KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
2274 ("sched_add: bad thread state"));
2275 KASSERT(td->td_flags & TDF_INMEM,
2276 ("sched_add: thread swapped out"));
2277
2278 if (td->td_priority < tdq->tdq_lowpri)
2279 tdq->tdq_lowpri = td->td_priority;
2280 tdq_runq_add(tdq, td, flags);
2281 tdq_load_add(tdq, td);
2282}
2283
2284/*
2285 * Select the target thread queue and add a thread to it. Request
2286 * preemption or IPI a remote processor if required.
2287 */
2288void
2289sched_add(struct thread *td, int flags)
2290{
2291 struct tdq *tdq;
2292#ifdef SMP
2293 int cpu;
2294#endif
2295
2296 KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq add",
2297 "prio:%d", td->td_priority, KTR_ATTR_LINKED,
2298 sched_tdname(curthread));
2299 KTR_POINT1(KTR_SCHED, "thread", sched_tdname(curthread), "wokeup",
2300 KTR_ATTR_LINKED, sched_tdname(td));
2301 THREAD_LOCK_ASSERT(td, MA_OWNED);
2302 /*
2303 * Recalculate the priority before we select the target cpu or
2304 * run-queue.
2305 */
2306 if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE)
2307 sched_priority(td);
2308#ifdef SMP
2309 /*
2310 * Pick the destination cpu and if it isn't ours transfer to the
2311 * target cpu.
2312 */
2313 cpu = sched_pickcpu(td, flags);
2314 tdq = sched_setcpu(td, cpu, flags);
2315 tdq_add(tdq, td, flags);
2316 if (cpu != PCPU_GET(cpuid)) {
2317 tdq_notify(tdq, td);
2318 return;
2319 }
2320#else
2321 tdq = TDQ_SELF();
2322 TDQ_LOCK(tdq);
2323 /*
2324 * Now that the thread is moving to the run-queue, set the lock
2325 * to the scheduler's lock.
2326 */
2327 thread_lock_set(td, TDQ_LOCKPTR(tdq));
2328 tdq_add(tdq, td, flags);
2329#endif
2330 if (!(flags & SRQ_YIELDING))
2331 sched_setpreempt(td);
2332}
2333
2334/*
2335 * Remove a thread from a run-queue without running it. This is used
2336 * when we're stealing a thread from a remote queue. Otherwise all threads
2337 * exit by calling sched_exit_thread() and sched_throw() themselves.
2338 */
2339void
2340sched_rem(struct thread *td)
2341{
2342 struct tdq *tdq;
2343
2344 KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "runq rem",
2345 "prio:%d", td->td_priority);
2346 tdq = TDQ_CPU(td->td_sched->ts_cpu);
2347 TDQ_LOCK_ASSERT(tdq, MA_OWNED);
2348 MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
2349 KASSERT(TD_ON_RUNQ(td),
2350 ("sched_rem: thread not on run queue"));
2351 tdq_runq_rem(tdq, td);
2352 tdq_load_rem(tdq, td);
2353 TD_SET_CAN_RUN(td);
2354 if (td->td_priority == tdq->tdq_lowpri)
2355 tdq_setlowpri(tdq, NULL);
2356}
2357
2358/*
2359 * Fetch cpu utilization information. Updates on demand.
2360 */
2361fixpt_t
2362sched_pctcpu(struct thread *td)
2363{
2364 fixpt_t pctcpu;
2365 struct td_sched *ts;
2366
2367 pctcpu = 0;
2368 ts = td->td_sched;
2369 if (ts == NULL)
2370 return (0);
2371
2372 THREAD_LOCK_ASSERT(td, MA_OWNED);
2373 if (ts->ts_ticks) {
2374 int rtick;
2375
2376 sched_pctcpu_update(ts);
2377 /* How many rtick per second ? */
2378 rtick = min(SCHED_TICK_HZ(ts) / SCHED_TICK_SECS, hz);
2379 pctcpu = (FSCALE * ((FSCALE * rtick)/hz)) >> FSHIFT;
2380 }
2381
2382 return (pctcpu);
2383}
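/*
 * Illustration (a sketch, assuming SCHED_TICK_HZ() yields the hz ticks
 * recorded in the sliding window and SCHED_TICK_SECS is its length in
 * seconds): rtick is the thread's average ticks per second, capped at
 * hz, so a thread that ran for half of the window gets rtick of about
 * hz / 2 and a pctcpu of about FSCALE / 2, i.e. roughly 50% once
 * converted for display.
 */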
2384
2385/*
2386 * Enforce affinity settings for a thread. Called after adjustments to
2387 * cpumask.
2388 */
2389void
2390sched_affinity(struct thread *td)
2391{
2392#ifdef SMP
2393 struct td_sched *ts;
2394
2395 THREAD_LOCK_ASSERT(td, MA_OWNED);
2396 ts = td->td_sched;
2397 if (THREAD_CAN_SCHED(td, ts->ts_cpu))
2398 return;
2399 if (TD_ON_RUNQ(td)) {
2400 sched_rem(td);
2401 sched_add(td, SRQ_BORING);
2402 return;
2403 }
2404 if (!TD_IS_RUNNING(td))
2405 return;
2406 td->td_flags |= TDF_NEEDRESCHED;
2407 /*
2408 * Force a switch before returning to userspace. If the
2409 * target thread is not running locally send an ipi to force
2410 * the issue.
2411 */
2412 if (td != curthread)
2413 ipi_cpu(ts->ts_cpu, IPI_PREEMPT);
2414#endif
2415}
2416
2417/*
2418 * Bind a thread to a target cpu.
2419 */
2420void
2421sched_bind(struct thread *td, int cpu)
2422{
2423 struct td_sched *ts;
2424
2425 THREAD_LOCK_ASSERT(td, MA_OWNED|MA_NOTRECURSED);
2426 KASSERT(td == curthread, ("sched_bind: can only bind curthread"));
2427 ts = td->td_sched;
2428 if (ts->ts_flags & TSF_BOUND)
2429 sched_unbind(td);
2430 KASSERT(THREAD_CAN_MIGRATE(td), ("%p must be migratable", td));
2431 ts->ts_flags |= TSF_BOUND;
2432 sched_pin();
2433 if (PCPU_GET(cpuid) == cpu)
2434 return;
2435 ts->ts_cpu = cpu;
2436 /* When we return from mi_switch we'll be on the correct cpu. */
2437 mi_switch(SW_VOL, NULL);
2438}
2439
2440/*
2441 * Release a bound thread.
2442 */
2443void
2444sched_unbind(struct thread *td)
2445{
2446 struct td_sched *ts;
2447
2448 THREAD_LOCK_ASSERT(td, MA_OWNED);
2449 KASSERT(td == curthread, ("sched_unbind: can only bind curthread"));
2450 ts = td->td_sched;
2451 if ((ts->ts_flags & TSF_BOUND) == 0)
2452 return;
2453 ts->ts_flags &= ~TSF_BOUND;
2454 sched_unpin();
2455}
2456
2457int
2458sched_is_bound(struct thread *td)
2459{
2460 THREAD_LOCK_ASSERT(td, MA_OWNED);
2461 return (td->td_sched->ts_flags & TSF_BOUND);
2462}
2463
2464/*
2465 * Basic yield call.
2466 */
2467void
2468sched_relinquish(struct thread *td)
2469{
2470 thread_lock(td);
2471 mi_switch(SW_VOL | SWT_RELINQUISH, NULL);
2472 thread_unlock(td);
2473}
2474
2475/*
2476 * Return the total system load.
2477 */
2478int
2479sched_load(void)
2480{
2481#ifdef SMP
2482 int total;
2483 int i;
2484
2485 total = 0;
2486 CPU_FOREACH(i)
2487 total += TDQ_CPU(i)->tdq_sysload;
2488 return (total);
2489#else
2490 return (TDQ_SELF()->tdq_sysload);
2491#endif
2492}
2493
2494int
2495sched_sizeof_proc(void)
2496{
2497 return (sizeof(struct proc));
2498}
2499
2500int
2501sched_sizeof_thread(void)
2502{
2503 return (sizeof(struct thread) + sizeof(struct td_sched));
2504}
2505
2506#ifdef SMP
2507#define TDQ_IDLESPIN(tdq) \
2508 ((tdq)->tdq_cg != NULL && ((tdq)->tdq_cg->cg_flags & CG_FLAG_THREAD) == 0)
2509#else
2510#define TDQ_IDLESPIN(tdq) 1
2511#endif
2512
2513/*
2514 * The actual idle process.
2515 */
2516void
2517sched_idletd(void *dummy)
2518{
2519 struct thread *td;
2520 struct tdq *tdq;
2521 int switchcnt;
2522 int i;
2523
2524 mtx_assert(&Giant, MA_NOTOWNED);
2525 td = curthread;
2526 tdq = TDQ_SELF();
2527 for (;;) {
2528#ifdef SMP
2529 if (tdq_idled(tdq) == 0)
2530 continue;
2531#endif
2532 switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt;
2533 /*
2534 * If we're switching very frequently, spin while checking
2535 * for load rather than entering a low power state that
2536 * may require an IPI. However, don't do any busy
2537 * loops while on SMT machines as this simply steals
2538 * cycles from cores doing useful work.
2539 */
2540 if (TDQ_IDLESPIN(tdq) && switchcnt > sched_idlespinthresh) {
2541 for (i = 0; i < sched_idlespins; i++) {
2542 if (tdq->tdq_load)
2543 break;
2544 cpu_spinwait();
2545 }
2546 }
2547 switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt;
971 return;
972 }
973 tdq->tdq_ipipending = 1;
974 ipi_cpu(cpu, IPI_PREEMPT);
975}
976
977/*
978 * Steals load from a timeshare queue. Honors the rotating queue head
979 * index.
980 */
981static struct thread *
982runq_steal_from(struct runq *rq, int cpu, u_char start)
983{
984 struct rqbits *rqb;
985 struct rqhead *rqh;
986 struct thread *td;
987 int first;
988 int bit;
989 int pri;
990 int i;
991
992 rqb = &rq->rq_status;
993 bit = start & (RQB_BPW -1);
994 pri = 0;
995 first = 0;
996again:
997 for (i = RQB_WORD(start); i < RQB_LEN; bit = 0, i++) {
998 if (rqb->rqb_bits[i] == 0)
999 continue;
1000 if (bit != 0) {
1001 for (pri = bit; pri < RQB_BPW; pri++)
1002 if (rqb->rqb_bits[i] & (1ul << pri))
1003 break;
1004 if (pri >= RQB_BPW)
1005 continue;
1006 } else
1007 pri = RQB_FFS(rqb->rqb_bits[i]);
1008 pri += (i << RQB_L2BPW);
1009 rqh = &rq->rq_queues[pri];
1010 TAILQ_FOREACH(td, rqh, td_runq) {
1011 if (first && THREAD_CAN_MIGRATE(td) &&
1012 THREAD_CAN_SCHED(td, cpu))
1013 return (td);
1014 first = 1;
1015 }
1016 }
1017 if (start != 0) {
1018 start = 0;
1019 goto again;
1020 }
1021
1022 return (NULL);
1023}
1024
1025/*
1026 * Steals load from a standard linear queue.
1027 */
1028static struct thread *
1029runq_steal(struct runq *rq, int cpu)
1030{
1031 struct rqhead *rqh;
1032 struct rqbits *rqb;
1033 struct thread *td;
1034 int word;
1035 int bit;
1036
1037 rqb = &rq->rq_status;
1038 for (word = 0; word < RQB_LEN; word++) {
1039 if (rqb->rqb_bits[word] == 0)
1040 continue;
1041 for (bit = 0; bit < RQB_BPW; bit++) {
1042 if ((rqb->rqb_bits[word] & (1ul << bit)) == 0)
1043 continue;
1044 rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)];
1045 TAILQ_FOREACH(td, rqh, td_runq)
1046 if (THREAD_CAN_MIGRATE(td) &&
1047 THREAD_CAN_SCHED(td, cpu))
1048 return (td);
1049 }
1050 }
1051 return (NULL);
1052}
1053
1054/*
1055 * Attempt to steal a thread in priority order from a thread queue.
1056 */
1057static struct thread *
1058tdq_steal(struct tdq *tdq, int cpu)
1059{
1060 struct thread *td;
1061
1062 TDQ_LOCK_ASSERT(tdq, MA_OWNED);
1063 if ((td = runq_steal(&tdq->tdq_realtime, cpu)) != NULL)
1064 return (td);
1065 if ((td = runq_steal_from(&tdq->tdq_timeshare,
1066 cpu, tdq->tdq_ridx)) != NULL)
1067 return (td);
1068 return (runq_steal(&tdq->tdq_idle, cpu));
1069}
1070
1071/*
1072 * Sets the thread lock and ts_cpu to match the requested cpu. Unlocks the
1073 * current lock and returns with the assigned queue locked.
1074 */
1075static inline struct tdq *
1076sched_setcpu(struct thread *td, int cpu, int flags)
1077{
1078
1079 struct tdq *tdq;
1080
1081 THREAD_LOCK_ASSERT(td, MA_OWNED);
1082 tdq = TDQ_CPU(cpu);
1083 td->td_sched->ts_cpu = cpu;
1084 /*
1085 * If the lock matches just return the queue.
1086 */
1087 if (td->td_lock == TDQ_LOCKPTR(tdq))
1088 return (tdq);
1089#ifdef notyet
1090 /*
1091 * If the thread isn't running its lockptr is a
1092 * turnstile or a sleepqueue. We can just lock_set without
1093 * blocking.
1094 */
1095 if (TD_CAN_RUN(td)) {
1096 TDQ_LOCK(tdq);
1097 thread_lock_set(td, TDQ_LOCKPTR(tdq));
1098 return (tdq);
1099 }
1100#endif
1101 /*
1102 * The hard case, migration, we need to block the thread first to
1103 * prevent order reversals with other cpus locks.
1104 */
1105 spinlock_enter();
1106 thread_lock_block(td);
1107 TDQ_LOCK(tdq);
1108 thread_lock_unblock(td, TDQ_LOCKPTR(tdq));
1109 spinlock_exit();
1110 return (tdq);
1111}
1112
1113SCHED_STAT_DEFINE(pickcpu_intrbind, "Soft interrupt binding");
1114SCHED_STAT_DEFINE(pickcpu_idle_affinity, "Picked idle cpu based on affinity");
1115SCHED_STAT_DEFINE(pickcpu_affinity, "Picked cpu based on affinity");
1116SCHED_STAT_DEFINE(pickcpu_lowest, "Selected lowest load");
1117SCHED_STAT_DEFINE(pickcpu_local, "Migrated to current cpu");
1118SCHED_STAT_DEFINE(pickcpu_migration, "Selection may have caused migration");
1119
1120static int
1121sched_pickcpu(struct thread *td, int flags)
1122{
1123 struct cpu_group *cg;
1124 struct td_sched *ts;
1125 struct tdq *tdq;
1126 cpuset_t mask;
1127 int self;
1128 int pri;
1129 int cpu;
1130
1131 self = PCPU_GET(cpuid);
1132 ts = td->td_sched;
1133 if (smp_started == 0)
1134 return (self);
1135 /*
1136 * Don't migrate a running thread from sched_switch().
1137 */
1138 if ((flags & SRQ_OURSELF) || !THREAD_CAN_MIGRATE(td))
1139 return (ts->ts_cpu);
1140 /*
1141 * Prefer to run interrupt threads on the processors that generate
1142 * the interrupt.
1143 */
1144 if (td->td_priority <= PRI_MAX_ITHD && THREAD_CAN_SCHED(td, self) &&
1145 curthread->td_intr_nesting_level && ts->ts_cpu != self) {
1146 SCHED_STAT_INC(pickcpu_intrbind);
1147 ts->ts_cpu = self;
1148 }
1149 /*
1150 * If the thread can run on the last cpu and the affinity has not
1151 * expired or it is idle run it there.
1152 */
1153 pri = td->td_priority;
1154 tdq = TDQ_CPU(ts->ts_cpu);
1155 if (THREAD_CAN_SCHED(td, ts->ts_cpu)) {
1156 if (tdq->tdq_lowpri > PRI_MIN_IDLE) {
1157 SCHED_STAT_INC(pickcpu_idle_affinity);
1158 return (ts->ts_cpu);
1159 }
1160 if (SCHED_AFFINITY(ts, CG_SHARE_L2) && tdq->tdq_lowpri > pri) {
1161 SCHED_STAT_INC(pickcpu_affinity);
1162 return (ts->ts_cpu);
1163 }
1164 }
1165 /*
1166 * Search for the highest level in the tree that still has affinity.
1167 */
1168 cg = NULL;
1169 for (cg = tdq->tdq_cg; cg != NULL; cg = cg->cg_parent)
1170 if (SCHED_AFFINITY(ts, cg->cg_level))
1171 break;
1172 cpu = -1;
1173 mask = td->td_cpuset->cs_mask;
1174 if (cg)
1175 cpu = sched_lowest(cg, mask, pri);
1176 if (cpu == -1)
1177 cpu = sched_lowest(cpu_top, mask, -1);
1178 /*
1179 * Compare the lowest loaded cpu to current cpu.
1180 */
1181 if (THREAD_CAN_SCHED(td, self) && TDQ_CPU(self)->tdq_lowpri > pri &&
1182 TDQ_CPU(cpu)->tdq_lowpri < PRI_MIN_IDLE) {
1183 SCHED_STAT_INC(pickcpu_local);
1184 cpu = self;
1185 } else
1186 SCHED_STAT_INC(pickcpu_lowest);
1187 if (cpu != ts->ts_cpu)
1188 SCHED_STAT_INC(pickcpu_migration);
1189 KASSERT(cpu != -1, ("sched_pickcpu: Failed to find a cpu."));
1190 return (cpu);
1191}
1192#endif
1193
1194/*
1195 * Pick the highest priority task we have and return it.
1196 */
1197static struct thread *
1198tdq_choose(struct tdq *tdq)
1199{
1200 struct thread *td;
1201
1202 TDQ_LOCK_ASSERT(tdq, MA_OWNED);
1203 td = runq_choose(&tdq->tdq_realtime);
1204 if (td != NULL)
1205 return (td);
1206 td = runq_choose_from(&tdq->tdq_timeshare, tdq->tdq_ridx);
1207 if (td != NULL) {
1208 KASSERT(td->td_priority >= PRI_MIN_TIMESHARE,
1209 ("tdq_choose: Invalid priority on timeshare queue %d",
1210 td->td_priority));
1211 return (td);
1212 }
1213 td = runq_choose(&tdq->tdq_idle);
1214 if (td != NULL) {
1215 KASSERT(td->td_priority >= PRI_MIN_IDLE,
1216 ("tdq_choose: Invalid priority on idle queue %d",
1217 td->td_priority));
1218 return (td);
1219 }
1220
1221 return (NULL);
1222}
1223
1224/*
1225 * Initialize a thread queue.
1226 */
1227static void
1228tdq_setup(struct tdq *tdq)
1229{
1230
1231 if (bootverbose)
1232 printf("ULE: setup cpu %d\n", TDQ_ID(tdq));
1233 runq_init(&tdq->tdq_realtime);
1234 runq_init(&tdq->tdq_timeshare);
1235 runq_init(&tdq->tdq_idle);
1236 snprintf(tdq->tdq_name, sizeof(tdq->tdq_name),
1237 "sched lock %d", (int)TDQ_ID(tdq));
1238 mtx_init(&tdq->tdq_lock, tdq->tdq_name, "sched lock",
1239 MTX_SPIN | MTX_RECURSE);
1240#ifdef KTR
1241 snprintf(tdq->tdq_loadname, sizeof(tdq->tdq_loadname),
1242 "CPU %d load", (int)TDQ_ID(tdq));
1243#endif
1244}
1245
1246#ifdef SMP
1247static void
1248sched_setup_smp(void)
1249{
1250 struct tdq *tdq;
1251 int i;
1252
1253 cpu_top = smp_topo();
1254 CPU_FOREACH(i) {
1255 tdq = TDQ_CPU(i);
1256 tdq_setup(tdq);
1257 tdq->tdq_cg = smp_topo_find(cpu_top, i);
1258 if (tdq->tdq_cg == NULL)
1259 panic("Can't find cpu group for %d\n", i);
1260 }
1261 balance_tdq = TDQ_SELF();
1262 sched_balance();
1263}
1264#endif
1265
1266/*
1267 * Setup the thread queues and initialize the topology based on MD
1268 * information.
1269 */
1270static void
1271sched_setup(void *dummy)
1272{
1273 struct tdq *tdq;
1274
1275 tdq = TDQ_SELF();
1276#ifdef SMP
1277 sched_setup_smp();
1278#else
1279 tdq_setup(tdq);
1280#endif
1281 /*
1282 * To avoid divide-by-zero, we set realstathz a dummy value
1283 * in case which sched_clock() called before sched_initticks().
1284 */
1285 realstathz = hz;
1286 sched_slice = (realstathz/10); /* ~100ms */
1287 tickincr = 1 << SCHED_TICK_SHIFT;
1288
1289 /* Add thread0's load since it's running. */
1290 TDQ_LOCK(tdq);
1291 thread0.td_lock = TDQ_LOCKPTR(TDQ_SELF());
1292 tdq_load_add(tdq, &thread0);
1293 tdq->tdq_lowpri = thread0.td_priority;
1294 TDQ_UNLOCK(tdq);
1295}
1296
1297/*
1298 * This routine determines the tickincr after stathz and hz are setup.
1299 */
1300/* ARGSUSED */
1301static void
1302sched_initticks(void *dummy)
1303{
1304 int incr;
1305
1306 realstathz = stathz ? stathz : hz;
1307 sched_slice = (realstathz/10); /* ~100ms */
1308
1309 /*
1310 * tickincr is shifted out by 10 to avoid rounding errors due to
1311 * hz not being evenly divisible by stathz on all platforms.
1312 */
1313 incr = (hz << SCHED_TICK_SHIFT) / realstathz;
1314 /*
1315 * This does not work for values of stathz that are more than
1316 * 1 << SCHED_TICK_SHIFT * hz. In practice this does not happen.
1317 */
1318 if (incr == 0)
1319 incr = 1;
1320 tickincr = incr;
1321#ifdef SMP
1322 /*
1323 * Set the default balance interval now that we know
1324 * what realstathz is.
1325 */
1326 balance_interval = realstathz;
1327 /*
1328 * Set steal thresh to roughly log2(mp_ncpu) but no greater than 4.
1329 * This prevents excess thrashing on large machines and excess idle
1330 * on smaller machines.
1331 */
1332 steal_thresh = min(fls(mp_ncpus) - 1, 3);
1333 affinity = SCHED_AFFINITY_DEFAULT;
1334#endif
1335}
1336
1337
1338/*
1339 * This is the core of the interactivity algorithm. Determines a score based
1340 * on past behavior. It is the ratio of sleep time to run time scaled to
1341 * a [0, 100] integer. This is the voluntary sleep time of a process, which
1342 * differs from the cpu usage because it does not account for time spent
1343 * waiting on a run-queue. Would be prettier if we had floating point.
1344 */
1345static int
1346sched_interact_score(struct thread *td)
1347{
1348 struct td_sched *ts;
1349 int div;
1350
1351 ts = td->td_sched;
1352 /*
1353 * The score is only needed if this is likely to be an interactive
1354 * task. Don't go through the expense of computing it if there's
1355 * no chance.
1356 */
1357 if (sched_interact <= SCHED_INTERACT_HALF &&
1358 ts->ts_runtime >= ts->ts_slptime)
1359 return (SCHED_INTERACT_HALF);
1360
1361 if (ts->ts_runtime > ts->ts_slptime) {
1362 div = max(1, ts->ts_runtime / SCHED_INTERACT_HALF);
1363 return (SCHED_INTERACT_HALF +
1364 (SCHED_INTERACT_HALF - (ts->ts_slptime / div)));
1365 }
1366 if (ts->ts_slptime > ts->ts_runtime) {
1367 div = max(1, ts->ts_slptime / SCHED_INTERACT_HALF);
1368 return (ts->ts_runtime / div);
1369 }
1370 /* runtime == slptime */
1371 if (ts->ts_runtime)
1372 return (SCHED_INTERACT_HALF);
1373
1374 /*
1375 * This can happen if slptime and runtime are 0.
1376 */
1377 return (0);
1378
1379}
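/*
 * Worked example, assuming SCHED_INTERACT_HALF == 50.  Take a thread
 * that slept three times as long as it ran, say ts_slptime = 300 and
 * ts_runtime = 100 (arbitrary units; real values are much larger):
 *
 *	div = max(1, 300 / 50) = 6
 *	score = 100 / 6 = 16			(interactive)
 *
 * The mirror-image cpu hog with ts_runtime = 300, ts_slptime = 100:
 *
 *	div = max(1, 300 / 50) = 6
 *	score = 50 + (50 - 100 / 6) = 84	(batch)
 */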
1380
1381/*
1382 * Scale the scheduling priority according to the "interactivity" of this
1383 * process.
1384 */
1385static void
1386sched_priority(struct thread *td)
1387{
1388 int score;
1389 int pri;
1390
1391 if (td->td_pri_class != PRI_TIMESHARE)
1392 return;
1393 /*
1394 * If the score is interactive we place the thread in the realtime
1395 * queue with a priority that is less than kernel and interrupt
1396 * priorities. These threads are not subject to nice restrictions.
1397 *
1398 * Scores greater than this are placed on the normal timeshare queue
1399 * where the priority is partially decided by the most recent cpu
1400 * utilization and the rest is decided by nice value.
1401 *
1402 * The nice value of the process has a linear effect on the calculated
1403 * score. Negative nice values make it easier for a thread to be
1404 * considered interactive.
1405 */
1406 score = imax(0, sched_interact_score(td) + td->td_proc->p_nice);
1407 if (score < sched_interact) {
1408 pri = PRI_MIN_REALTIME;
1409 pri += ((PRI_MAX_REALTIME - PRI_MIN_REALTIME) / sched_interact)
1410 * score;
1411 KASSERT(pri >= PRI_MIN_REALTIME && pri <= PRI_MAX_REALTIME,
1412 ("sched_priority: invalid interactive priority %d score %d",
1413 pri, score));
1414 } else {
1415 pri = SCHED_PRI_MIN;
1416 if (td->td_sched->ts_ticks)
1417 pri += SCHED_PRI_TICKS(td->td_sched);
1418 pri += SCHED_PRI_NICE(td->td_proc->p_nice);
1419 KASSERT(pri >= PRI_MIN_TIMESHARE && pri <= PRI_MAX_TIMESHARE,
1420 ("sched_priority: invalid priority %d: nice %d, "
1421 "ticks %d ftick %d ltick %d tick pri %d",
1422 pri, td->td_proc->p_nice, td->td_sched->ts_ticks,
1423 td->td_sched->ts_ftick, td->td_sched->ts_ltick,
1424 SCHED_PRI_TICKS(td->td_sched)));
1425 }
1426 sched_user_prio(td, pri);
1427
1428 return;
1429}
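/*
 * Rough shape of the mapping, assuming the default sched_interact
 * of 30: a score of 0 yields PRI_MIN_REALTIME, a score of 15 lands
 * about halfway between PRI_MIN_REALTIME and PRI_MAX_REALTIME, and a
 * score of 30 or more falls through to the timeshare branch, where
 * recent cpu use (SCHED_PRI_TICKS) plus nice (SCHED_PRI_NICE) select
 * a slot between PRI_MIN_TIMESHARE and PRI_MAX_TIMESHARE.
 */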
1430
1431/*
1432 * This routine enforces a maximum limit on the amount of scheduling history
1433 * kept. It is called after either the slptime or runtime is adjusted. This
1434 * function is ugly due to integer math.
1435 */
1436static void
1437sched_interact_update(struct thread *td)
1438{
1439 struct td_sched *ts;
1440 u_int sum;
1441
1442 ts = td->td_sched;
1443 sum = ts->ts_runtime + ts->ts_slptime;
1444 if (sum < SCHED_SLP_RUN_MAX)
1445 return;
1446 /*
1447 * This only happens from two places:
1448 * 1) We have added an unusual amount of run time from fork_exit.
1449 * 2) We have added an unusual amount of sleep time from sched_sleep().
1450 */
1451 if (sum > SCHED_SLP_RUN_MAX * 2) {
1452 if (ts->ts_runtime > ts->ts_slptime) {
1453 ts->ts_runtime = SCHED_SLP_RUN_MAX;
1454 ts->ts_slptime = 1;
1455 } else {
1456 ts->ts_slptime = SCHED_SLP_RUN_MAX;
1457 ts->ts_runtime = 1;
1458 }
1459 return;
1460 }
1461 /*
1462	 * If we have exceeded the limit by more than 1/5th then the 4/5
1463	 * scaling below will not bring us back into range.  Dividing by two
1464	 * here forces the sum back below SCHED_SLP_RUN_MAX.
1465 */
1466 if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) {
1467 ts->ts_runtime /= 2;
1468 ts->ts_slptime /= 2;
1469 return;
1470 }
1471 ts->ts_runtime = (ts->ts_runtime / 5) * 4;
1472 ts->ts_slptime = (ts->ts_slptime / 5) * 4;
1473}
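/*
 * Worked example with MAX standing for SCHED_SLP_RUN_MAX:
 *
 *	sum = 2.5 * MAX	-> more than twice the cap; the history is
 *			   reset so the larger of the two values keeps
 *			   MAX and the other becomes 1.
 *	sum = 1.3 * MAX	-> the 4/5 scaling alone would leave 1.04 * MAX,
 *			   still out of range, so both values are halved
 *			   (0.65 * MAX total).
 *	sum = 1.1 * MAX	-> both values are scaled by 4/5, giving
 *			   0.88 * MAX.
 */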
1474
1475/*
1476 * Scale back the interactivity history when a child thread is created. The
1477 * history is inherited from the parent but the thread may behave totally
1478 * differently. For example, a shell spawning a compiler process. We want
1479 * to learn that the compiler is behaving badly very quickly.
1480 */
1481static void
1482sched_interact_fork(struct thread *td)
1483{
1484 int ratio;
1485 int sum;
1486
1487 sum = td->td_sched->ts_runtime + td->td_sched->ts_slptime;
1488 if (sum > SCHED_SLP_RUN_FORK) {
1489 ratio = sum / SCHED_SLP_RUN_FORK;
1490 td->td_sched->ts_runtime /= ratio;
1491 td->td_sched->ts_slptime /= ratio;
1492 }
1493}
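/*
 * Example: a child inheriting ts_runtime + ts_slptime equal to four
 * times SCHED_SLP_RUN_FORK gets ratio = 4, so it starts with a
 * quarter of the parent's history and its own behavior dominates the
 * score after a relatively short amount of activity.
 */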
1494
1495/*
1496 * Called from proc0_init() to setup the scheduler fields.
1497 */
1498void
1499schedinit(void)
1500{
1501
1502 /*
1503 * Set up the scheduler specific parts of proc0.
1504 */
1505 proc0.p_sched = NULL; /* XXX */
1506 thread0.td_sched = &td_sched0;
1507 td_sched0.ts_ltick = ticks;
1508 td_sched0.ts_ftick = ticks;
1509 td_sched0.ts_slice = sched_slice;
1510}
1511
1512/*
1513 * This is only somewhat accurate since, given many processes of the
1514 * same priority, they will switch when their slices run out, which
1515 * will be at most sched_slice stathz ticks.
1516 */
1517int
1518sched_rr_interval(void)
1519{
1520
1521 /* Convert sched_slice to hz */
1522 return (hz/(realstathz/sched_slice));
1523}
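/*
 * Example arithmetic, assuming hz = 1000, realstathz = 127 and the
 * default slice of realstathz / 10 = 12 stathz ticks:
 *
 *	hz / (realstathz / sched_slice) = 1000 / (127 / 12)
 *					= 1000 / 10 = 100
 *
 * i.e. roughly 100ms at hz = 1000.
 */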
1524
1525/*
1526 * Update the percent cpu tracking information when it is requested or
1527 * the total history exceeds the maximum. We keep a sliding history of
1528 * tick counts that slowly decays.  This is less precise than the 4BSD
1529 * mechanism since it happens with less regular and less frequent events.
1530 */
1531static void
1532sched_pctcpu_update(struct td_sched *ts)
1533{
1534
1535 if (ts->ts_ticks == 0)
1536 return;
1537 if (ticks - (hz / 10) < ts->ts_ltick &&
1538 SCHED_TICK_TOTAL(ts) < SCHED_TICK_MAX)
1539 return;
1540 /*
1541 * Adjust counters and watermark for pctcpu calc.
1542 */
1543 if (ts->ts_ltick > ticks - SCHED_TICK_TARG)
1544 ts->ts_ticks = (ts->ts_ticks / (ticks - ts->ts_ftick)) *
1545 SCHED_TICK_TARG;
1546 else
1547 ts->ts_ticks = 0;
1548 ts->ts_ltick = ticks;
1549 ts->ts_ftick = ts->ts_ltick - SCHED_TICK_TARG;
1550}
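/*
 * Worked example, assuming hz = 1000 and SCHED_TICK_TARG = 10 * hz:
 * a thread whose history currently spans 12000 hz ticks
 * (ticks - ts_ftick) while being busy for the equivalent of 6000 of
 * them is rescaled to (6000 / 12000) * 10000 = 5000 busy ticks over a
 * fresh 10000-tick window; the ~50% utilization estimate is kept
 * while the excess history is dropped.
 */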
1551
1552/*
1553 * Adjust the priority of a thread. Move it to the appropriate run-queue
1554 * if necessary. This is the back-end for several priority related
1555 * functions.
1556 */
1557static void
1558sched_thread_priority(struct thread *td, u_char prio)
1559{
1560 struct td_sched *ts;
1561 struct tdq *tdq;
1562 int oldpri;
1563
1564 KTR_POINT3(KTR_SCHED, "thread", sched_tdname(td), "prio",
1565 "prio:%d", td->td_priority, "new prio:%d", prio,
1566 KTR_ATTR_LINKED, sched_tdname(curthread));
1567 if (td != curthread && prio > td->td_priority) {
1568 KTR_POINT3(KTR_SCHED, "thread", sched_tdname(curthread),
1569 "lend prio", "prio:%d", td->td_priority, "new prio:%d",
1570 prio, KTR_ATTR_LINKED, sched_tdname(td));
1571 }
1572 ts = td->td_sched;
1573 THREAD_LOCK_ASSERT(td, MA_OWNED);
1574 if (td->td_priority == prio)
1575 return;
1576 /*
1577 * If the priority has been elevated due to priority
1578 * propagation, we may have to move ourselves to a new
1579 * queue. This could be optimized to not re-add in some
1580 * cases.
1581 */
1582 if (TD_ON_RUNQ(td) && prio < td->td_priority) {
1583 sched_rem(td);
1584 td->td_priority = prio;
1585 sched_add(td, SRQ_BORROWING);
1586 return;
1587 }
1588 /*
1589 * If the thread is currently running we may have to adjust the lowpri
1590 * information so other cpus are aware of our current priority.
1591 */
1592 if (TD_IS_RUNNING(td)) {
1593 tdq = TDQ_CPU(ts->ts_cpu);
1594 oldpri = td->td_priority;
1595 td->td_priority = prio;
1596 if (prio < tdq->tdq_lowpri)
1597 tdq->tdq_lowpri = prio;
1598 else if (tdq->tdq_lowpri == oldpri)
1599 tdq_setlowpri(tdq, td);
1600 return;
1601 }
1602 td->td_priority = prio;
1603}
1604
1605/*
1606 * Update a thread's priority when it is lent another thread's
1607 * priority.
1608 */
1609void
1610sched_lend_prio(struct thread *td, u_char prio)
1611{
1612
1613 td->td_flags |= TDF_BORROWING;
1614 sched_thread_priority(td, prio);
1615}
1616
1617/*
1618 * Restore a thread's priority when priority propagation is
1619 * over. The prio argument is the minimum priority the thread
1620 * needs to have to satisfy other possible priority lending
1621 * requests. If the thread's regular priority is less
1622 * important than prio, the thread will keep a priority boost
1623 * of prio.
1624 */
1625void
1626sched_unlend_prio(struct thread *td, u_char prio)
1627{
1628 u_char base_pri;
1629
1630 if (td->td_base_pri >= PRI_MIN_TIMESHARE &&
1631 td->td_base_pri <= PRI_MAX_TIMESHARE)
1632 base_pri = td->td_user_pri;
1633 else
1634 base_pri = td->td_base_pri;
1635 if (prio >= base_pri) {
1636 td->td_flags &= ~TDF_BORROWING;
1637 sched_thread_priority(td, base_pri);
1638 } else
1639 sched_lend_prio(td, prio);
1640}
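/*
 * A minimal sketch of how the pair above is used for priority
 * propagation (the real callers live in the turnstile code);
 * waiter_prio and new_min_prio stand for whatever priorities the
 * caller computes:
 *
 *	thread_lock(owner);
 *	sched_lend_prio(owner, waiter_prio);	boost the lock owner
 *	thread_unlock(owner);
 *	...
 *	thread_lock(owner);
 *	sched_unlend_prio(owner, new_min_prio);	drop or re-lend
 *	thread_unlock(owner);
 */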
1641
1642/*
1643 * Standard entry for setting the priority to an absolute value.
1644 */
1645void
1646sched_prio(struct thread *td, u_char prio)
1647{
1648 u_char oldprio;
1649
1650 /* First, update the base priority. */
1651 td->td_base_pri = prio;
1652
1653 /*
1654 * If the thread is borrowing another thread's priority, don't
1655 * ever lower the priority.
1656 */
1657 if (td->td_flags & TDF_BORROWING && td->td_priority < prio)
1658 return;
1659
1660 /* Change the real priority. */
1661 oldprio = td->td_priority;
1662 sched_thread_priority(td, prio);
1663
1664 /*
1665 * If the thread is on a turnstile, then let the turnstile update
1666 * its state.
1667 */
1668 if (TD_ON_LOCK(td) && oldprio != prio)
1669 turnstile_adjust(td, oldprio);
1670}
1671
1672/*
1673 * Set the base user priority; this does not affect the current running priority.
1674 */
1675void
1676sched_user_prio(struct thread *td, u_char prio)
1677{
1678 u_char oldprio;
1679
1680 td->td_base_user_pri = prio;
1681 if (td->td_flags & TDF_UBORROWING && td->td_user_pri <= prio)
1682 return;
1683 oldprio = td->td_user_pri;
1684 td->td_user_pri = prio;
1685}
1686
1687void
1688sched_lend_user_prio(struct thread *td, u_char prio)
1689{
1690 u_char oldprio;
1691
1692 THREAD_LOCK_ASSERT(td, MA_OWNED);
1693 td->td_flags |= TDF_UBORROWING;
1694 oldprio = td->td_user_pri;
1695 td->td_user_pri = prio;
1696}
1697
1698void
1699sched_unlend_user_prio(struct thread *td, u_char prio)
1700{
1701 u_char base_pri;
1702
1703 THREAD_LOCK_ASSERT(td, MA_OWNED);
1704 base_pri = td->td_base_user_pri;
1705 if (prio >= base_pri) {
1706 td->td_flags &= ~TDF_UBORROWING;
1707 sched_user_prio(td, base_pri);
1708 } else {
1709 sched_lend_user_prio(td, prio);
1710 }
1711}
1712
1713/*
1714 * Handle migration from sched_switch(). This happens only for
1715 * cpu binding.
1716 */
1717static struct mtx *
1718sched_switch_migrate(struct tdq *tdq, struct thread *td, int flags)
1719{
1720 struct tdq *tdn;
1721
1722 tdn = TDQ_CPU(td->td_sched->ts_cpu);
1723#ifdef SMP
1724 tdq_load_rem(tdq, td);
1725 /*
1726 * Do the lock dance required to avoid LOR. We grab an extra
1727 * spinlock nesting to prevent preemption while we're
1728 * not holding either run-queue lock.
1729 */
1730 spinlock_enter();
1731 thread_lock_block(td); /* This releases the lock on tdq. */
1732
1733 /*
1734 * Acquire both run-queue locks before placing the thread on the new
1735 * run-queue to avoid deadlocks created by placing a thread with a
1736 * blocked lock on the run-queue of a remote processor. The deadlock
1737 * occurs when a third processor attempts to lock the two queues in
1738 * question while the target processor is spinning with its own
1739 * run-queue lock held while waiting for the blocked lock to clear.
1740 */
1741 tdq_lock_pair(tdn, tdq);
1742 tdq_add(tdn, td, flags);
1743 tdq_notify(tdn, td);
1744 TDQ_UNLOCK(tdn);
1745 spinlock_exit();
1746#endif
1747 return (TDQ_LOCKPTR(tdn));
1748}
1749
1750/*
1751 * Variant of thread_lock_unblock() that does not assume td_lock
1752 * is blocked.
1753 */
1754static inline void
1755thread_unblock_switch(struct thread *td, struct mtx *mtx)
1756{
1757 atomic_store_rel_ptr((volatile uintptr_t *)&td->td_lock,
1758 (uintptr_t)mtx);
1759}
1760
1761/*
1762 * Switch threads. This function has to handle threads coming in while
1763 * blocked for some reason, running, or idle. It also must deal with
1764 * migrating a thread from one queue to another as running threads may
1765 * be assigned elsewhere via binding.
1766 */
1767void
1768sched_switch(struct thread *td, struct thread *newtd, int flags)
1769{
1770 struct tdq *tdq;
1771 struct td_sched *ts;
1772 struct mtx *mtx;
1773 int srqflag;
1774 int cpuid;
1775
1776 THREAD_LOCK_ASSERT(td, MA_OWNED);
1777 KASSERT(newtd == NULL, ("sched_switch: Unsupported newtd argument"));
1778
1779 cpuid = PCPU_GET(cpuid);
1780 tdq = TDQ_CPU(cpuid);
1781 ts = td->td_sched;
1782 mtx = td->td_lock;
1783 ts->ts_rltick = ticks;
1784 td->td_lastcpu = td->td_oncpu;
1785 td->td_oncpu = NOCPU;
1786 td->td_flags &= ~TDF_NEEDRESCHED;
1787 td->td_owepreempt = 0;
1788 tdq->tdq_switchcnt++;
1789 /*
1790 * The lock pointer in an idle thread should never change. Reset it
1791 * to CAN_RUN as well.
1792 */
1793 if (TD_IS_IDLETHREAD(td)) {
1794 MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
1795 TD_SET_CAN_RUN(td);
1796 } else if (TD_IS_RUNNING(td)) {
1797 MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
1798 srqflag = (flags & SW_PREEMPT) ?
1799 SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
1800 SRQ_OURSELF|SRQ_YIELDING;
1801#ifdef SMP
1802 if (THREAD_CAN_MIGRATE(td) && !THREAD_CAN_SCHED(td, ts->ts_cpu))
1803 ts->ts_cpu = sched_pickcpu(td, 0);
1804#endif
1805 if (ts->ts_cpu == cpuid)
1806 tdq_runq_add(tdq, td, srqflag);
1807 else {
1808 KASSERT(THREAD_CAN_MIGRATE(td) ||
1809 (ts->ts_flags & TSF_BOUND) != 0,
1810 ("Thread %p shouldn't migrate", td));
1811 mtx = sched_switch_migrate(tdq, td, srqflag);
1812 }
1813 } else {
1814 /* This thread must be going to sleep. */
1815 TDQ_LOCK(tdq);
1816 mtx = thread_lock_block(td);
1817 tdq_load_rem(tdq, td);
1818 }
1819 /*
1820 * We enter here with the thread blocked and assigned to the
1821 * appropriate cpu run-queue or sleep-queue and with the current
1822 * thread-queue locked.
1823 */
1824 TDQ_LOCK_ASSERT(tdq, MA_OWNED | MA_NOTRECURSED);
1825 newtd = choosethread();
1826 /*
1827 * Call the MD code to switch contexts if necessary.
1828 */
1829 if (td != newtd) {
1830#ifdef HWPMC_HOOKS
1831 if (PMC_PROC_IS_USING_PMCS(td->td_proc))
1832 PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
1833#endif
1834 lock_profile_release_lock(&TDQ_LOCKPTR(tdq)->lock_object);
1835 TDQ_LOCKPTR(tdq)->mtx_lock = (uintptr_t)newtd;
1836
1837#ifdef KDTRACE_HOOKS
1838 /*
1839 * If DTrace has set the active vtime enum to anything
1840 * other than INACTIVE (0), then it should have set the
1841 * function to call.
1842 */
1843 if (dtrace_vtime_active)
1844 (*dtrace_vtime_switch_func)(newtd);
1845#endif
1846
1847 cpu_switch(td, newtd, mtx);
1848 /*
1849 * We may return from cpu_switch on a different cpu. However,
1850 * we always return with td_lock pointing to the current cpu's
1851 * run queue lock.
1852 */
1853 cpuid = PCPU_GET(cpuid);
1854 tdq = TDQ_CPU(cpuid);
1855 lock_profile_obtain_lock_success(
1856 &TDQ_LOCKPTR(tdq)->lock_object, 0, 0, __FILE__, __LINE__);
1857#ifdef HWPMC_HOOKS
1858 if (PMC_PROC_IS_USING_PMCS(td->td_proc))
1859 PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
1860#endif
1861 } else
1862 thread_unblock_switch(td, mtx);
1863 /*
1864 * Assert that all went well and return.
1865 */
1866 TDQ_LOCK_ASSERT(tdq, MA_OWNED|MA_NOTRECURSED);
1867 MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
1868 td->td_oncpu = cpuid;
1869}
1870
1871/*
1872 * Adjust thread priorities as a result of a nice request.
1873 */
1874void
1875sched_nice(struct proc *p, int nice)
1876{
1877 struct thread *td;
1878
1879 PROC_LOCK_ASSERT(p, MA_OWNED);
1880
1881 p->p_nice = nice;
1882 FOREACH_THREAD_IN_PROC(p, td) {
1883 thread_lock(td);
1884 sched_priority(td);
1885 sched_prio(td, td->td_base_user_pri);
1886 thread_unlock(td);
1887 }
1888}
1889
1890/*
1891 * Record the sleep time for the interactivity scorer.
1892 */
1893void
1894sched_sleep(struct thread *td, int prio)
1895{
1896
1897 THREAD_LOCK_ASSERT(td, MA_OWNED);
1898
1899 td->td_slptick = ticks;
1900 if (TD_IS_SUSPENDED(td) || prio >= PSOCK)
1901 td->td_flags |= TDF_CANSWAP;
1902 if (static_boost == 1 && prio)
1903 sched_prio(td, prio);
1904 else if (static_boost && td->td_priority > static_boost)
1905 sched_prio(td, static_boost);
1906}
1907
1908/*
1909 * Schedule a thread to resume execution and record how long it voluntarily
1910 * slept. We also update the pctcpu, interactivity, and priority.
1911 */
1912void
1913sched_wakeup(struct thread *td)
1914{
1915 struct td_sched *ts;
1916 int slptick;
1917
1918 THREAD_LOCK_ASSERT(td, MA_OWNED);
1919 ts = td->td_sched;
1920 td->td_flags &= ~TDF_CANSWAP;
1921 /*
1922 * If we slept for more than a tick update our interactivity and
1923 * priority.
1924 */
1925 slptick = td->td_slptick;
1926 td->td_slptick = 0;
1927 if (slptick && slptick != ticks) {
1928 u_int hzticks;
1929
1930 hzticks = (ticks - slptick) << SCHED_TICK_SHIFT;
1931 ts->ts_slptime += hzticks;
1932 sched_interact_update(td);
1933 sched_pctcpu_update(ts);
1934 }
1935 /* Reset the slice value after we sleep. */
1936 ts->ts_slice = sched_slice;
1937 sched_add(td, SRQ_BORING);
1938}
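/*
 * Example: a thread that slept for 25 hz ticks wakes with
 * hzticks = 25 << SCHED_TICK_SHIFT added to ts_slptime, i.e. the
 * sleep is recorded in the same fixed-point units that ts_runtime
 * accumulates, so sched_interact_score() can compare the two
 * directly.
 */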
1939
1940/*
1941 * Penalize the parent for creating a new child and initialize the child's
1942 * priority.
1943 */
1944void
1945sched_fork(struct thread *td, struct thread *child)
1946{
1947 THREAD_LOCK_ASSERT(td, MA_OWNED);
1948 sched_fork_thread(td, child);
1949 /*
1950 * Penalize the parent and child for forking.
1951 */
1952 sched_interact_fork(child);
1953 sched_priority(child);
1954 td->td_sched->ts_runtime += tickincr;
1955 sched_interact_update(td);
1956 sched_priority(td);
1957}
1958
1959/*
1960 * Fork a new thread, may be within the same process.
1961 */
1962void
1963sched_fork_thread(struct thread *td, struct thread *child)
1964{
1965 struct td_sched *ts;
1966 struct td_sched *ts2;
1967
1968 THREAD_LOCK_ASSERT(td, MA_OWNED);
1969 /*
1970 * Initialize child.
1971 */
1972 ts = td->td_sched;
1973 ts2 = child->td_sched;
1974 child->td_lock = TDQ_LOCKPTR(TDQ_SELF());
1975 child->td_cpuset = cpuset_ref(td->td_cpuset);
1976 ts2->ts_cpu = ts->ts_cpu;
1977 ts2->ts_flags = 0;
1978 /*
1979 * Grab our parents cpu estimation information and priority.
1980 */
1981 ts2->ts_ticks = ts->ts_ticks;
1982 ts2->ts_ltick = ts->ts_ltick;
1983 ts2->ts_incrtick = ts->ts_incrtick;
1984 ts2->ts_ftick = ts->ts_ftick;
1985 child->td_user_pri = td->td_user_pri;
1986 child->td_base_user_pri = td->td_base_user_pri;
1987 /*
1988 * And update interactivity score.
1989 */
1990 ts2->ts_slptime = ts->ts_slptime;
1991 ts2->ts_runtime = ts->ts_runtime;
1992 ts2->ts_slice = 1; /* Attempt to quickly learn interactivity. */
1993#ifdef KTR
1994 bzero(ts2->ts_name, sizeof(ts2->ts_name));
1995#endif
1996}
1997
1998/*
1999 * Adjust the priority class of a thread.
2000 */
2001void
2002sched_class(struct thread *td, int class)
2003{
2004
2005 THREAD_LOCK_ASSERT(td, MA_OWNED);
2006 if (td->td_pri_class == class)
2007 return;
2008 td->td_pri_class = class;
2009}
2010
2011/*
2012 * Return some of the child's priority and interactivity to the parent.
2013 */
2014void
2015sched_exit(struct proc *p, struct thread *child)
2016{
2017 struct thread *td;
2018
2019 KTR_STATE1(KTR_SCHED, "thread", sched_tdname(child), "proc exit",
2020 "prio:td", child->td_priority);
2021 PROC_LOCK_ASSERT(p, MA_OWNED);
2022 td = FIRST_THREAD_IN_PROC(p);
2023 sched_exit_thread(td, child);
2024}
2025
2026/*
2027 * Penalize another thread for the time spent on this one. This helps to
2028 * worsen the priority and interactivity of processes which schedule batch
2029 * jobs such as make. This has little effect on the make process itself but
2030 * causes new processes spawned by it to receive worse scores immediately.
2031 */
2032void
2033sched_exit_thread(struct thread *td, struct thread *child)
2034{
2035
2036 KTR_STATE1(KTR_SCHED, "thread", sched_tdname(child), "thread exit",
2037 "prio:td", child->td_priority);
2038 /*
2039 * Give the child's runtime to the parent without returning the
2040 * sleep time as a penalty to the parent. This causes shells that
2041 * launch expensive things to mark their children as expensive.
2042 */
2043 thread_lock(td);
2044 td->td_sched->ts_runtime += child->td_sched->ts_runtime;
2045 sched_interact_update(td);
2046 sched_priority(td);
2047 thread_unlock(td);
2048}
2049
2050void
2051sched_preempt(struct thread *td)
2052{
2053 struct tdq *tdq;
2054
2055 thread_lock(td);
2056 tdq = TDQ_SELF();
2057 TDQ_LOCK_ASSERT(tdq, MA_OWNED);
2058 tdq->tdq_ipipending = 0;
2059 if (td->td_priority > tdq->tdq_lowpri) {
2060 int flags;
2061
2062 flags = SW_INVOL | SW_PREEMPT;
2063 if (td->td_critnest > 1)
2064 td->td_owepreempt = 1;
2065 else if (TD_IS_IDLETHREAD(td))
2066 mi_switch(flags | SWT_REMOTEWAKEIDLE, NULL);
2067 else
2068 mi_switch(flags | SWT_REMOTEPREEMPT, NULL);
2069 }
2070 thread_unlock(td);
2071}
2072
2073/*
2074 * Fix priorities on return to user-space. Priorities may be elevated due
2075 * to static priorities in msleep() or similar.
2076 */
2077void
2078sched_userret(struct thread *td)
2079{
2080 /*
2081 * XXX we cheat slightly on the locking here to avoid locking in
2082 * the usual case. Setting td_priority here is essentially an
2083 * incomplete workaround for not setting it properly elsewhere.
2084 * Now that some interrupt handlers are threads, not setting it
2085 * properly elsewhere can clobber it in the window between setting
2086 * it here and returning to user mode, so don't waste time setting
2087 * it perfectly here.
2088 */
2089 KASSERT((td->td_flags & TDF_BORROWING) == 0,
2090 ("thread with borrowed priority returning to userland"));
2091 if (td->td_priority != td->td_user_pri) {
2092 thread_lock(td);
2093 td->td_priority = td->td_user_pri;
2094 td->td_base_pri = td->td_user_pri;
2095 tdq_setlowpri(TDQ_SELF(), td);
2096 thread_unlock(td);
2097 }
2098}
2099
2100/*
2101 * Handle a stathz tick. This is really only relevant for timeshare
2102 * threads.
2103 */
2104void
2105sched_clock(struct thread *td)
2106{
2107 struct tdq *tdq;
2108 struct td_sched *ts;
2109
2110 THREAD_LOCK_ASSERT(td, MA_OWNED);
2111 tdq = TDQ_SELF();
2112#ifdef SMP
2113 /*
2114 * We run the long term load balancer infrequently on the first cpu.
2115 */
2116 if (balance_tdq == tdq) {
2117 if (balance_ticks && --balance_ticks == 0)
2118 sched_balance();
2119 }
2120#endif
2121 /*
2122 * Save the old switch count so we have a record of the last ticks
2123 * activity. Initialize the new switch count based on our load.
2124 * If there is some activity seed it to reflect that.
2125 */
2126 tdq->tdq_oldswitchcnt = tdq->tdq_switchcnt;
2127 tdq->tdq_switchcnt = tdq->tdq_load;
2128 /*
2129 * Advance the insert index once for each tick to ensure that all
2130 * threads get a chance to run.
2131 */
2132 if (tdq->tdq_idx == tdq->tdq_ridx) {
2133 tdq->tdq_idx = (tdq->tdq_idx + 1) % RQ_NQS;
2134 if (TAILQ_EMPTY(&tdq->tdq_timeshare.rq_queues[tdq->tdq_ridx]))
2135 tdq->tdq_ridx = tdq->tdq_idx;
2136 }
2137 ts = td->td_sched;
2138 if (td->td_pri_class & PRI_FIFO_BIT)
2139 return;
2140 if (td->td_pri_class == PRI_TIMESHARE) {
2141 /*
2142 * We used a tick; charge it to the thread so
2143 * that we can compute our interactivity.
2144 */
2145 td->td_sched->ts_runtime += tickincr;
2146 sched_interact_update(td);
2147 sched_priority(td);
2148 }
2149 /*
2150 * We used up one time slice.
2151 */
2152 if (--ts->ts_slice > 0)
2153 return;
2154 /*
2155 * We're out of time, force a requeue at userret().
2156 */
2157 ts->ts_slice = sched_slice;
2158 td->td_flags |= TDF_NEEDRESCHED;
2159}
2160
2161/*
2162 * Called once per hz tick. Used for cpu utilization information. This
2163 * is easier than trying to scale based on stathz.
2164 */
2165void
2166sched_tick(void)
2167{
2168 struct td_sched *ts;
2169
2170 ts = curthread->td_sched;
2171 /*
2172 * Ticks is updated asynchronously on a single cpu. Check here to
2173 * avoid incrementing ts_ticks multiple times in a single tick.
2174 */
2175 if (ts->ts_incrtick == ticks)
2176 return;
2177 /* Adjust ticks for pctcpu */
2178 ts->ts_ticks += 1 << SCHED_TICK_SHIFT;
2179 ts->ts_ltick = ticks;
2180 ts->ts_incrtick = ticks;
2181 /*
2182	 * Update if we've exceeded our desired tick threshold by over one
2183 * second.
2184 */
2185 if (ts->ts_ftick + SCHED_TICK_MAX < ts->ts_ltick)
2186 sched_pctcpu_update(ts);
2187}
2188
2189/*
2190 * Return whether the current CPU has runnable tasks. Used for in-kernel
2191 * cooperative idle threads.
2192 */
2193int
2194sched_runnable(void)
2195{
2196 struct tdq *tdq;
2197 int load;
2198
2199 load = 1;
2200
2201 tdq = TDQ_SELF();
2202 if ((curthread->td_flags & TDF_IDLETD) != 0) {
2203 if (tdq->tdq_load > 0)
2204 goto out;
2205 } else
2206 if (tdq->tdq_load - 1 > 0)
2207 goto out;
2208 load = 0;
2209out:
2210 return (load);
2211}
2212
2213/*
2214 * Choose the highest priority thread to run. The thread is removed from
2215 * the run-queue while running however the load remains. For SMP we set
2216 * the tdq in the global idle bitmask if it idles here.
2217 */
2218struct thread *
2219sched_choose(void)
2220{
2221 struct thread *td;
2222 struct tdq *tdq;
2223
2224 tdq = TDQ_SELF();
2225 TDQ_LOCK_ASSERT(tdq, MA_OWNED);
2226 td = tdq_choose(tdq);
2227 if (td) {
2228 td->td_sched->ts_ltick = ticks;
2229 tdq_runq_rem(tdq, td);
2230 tdq->tdq_lowpri = td->td_priority;
2231 return (td);
2232 }
2233 tdq->tdq_lowpri = PRI_MAX_IDLE;
2234 return (PCPU_GET(idlethread));
2235}
2236
2237/*
2238 * Set owepreempt if necessary. Preemption never happens directly in ULE,
2239 * we always request it once we exit a critical section.
2240 */
2241static inline void
2242sched_setpreempt(struct thread *td)
2243{
2244 struct thread *ctd;
2245 int cpri;
2246 int pri;
2247
2248 THREAD_LOCK_ASSERT(curthread, MA_OWNED);
2249
2250 ctd = curthread;
2251 pri = td->td_priority;
2252 cpri = ctd->td_priority;
2253 if (pri < cpri)
2254 ctd->td_flags |= TDF_NEEDRESCHED;
2255 if (panicstr != NULL || pri >= cpri || cold || TD_IS_INHIBITED(ctd))
2256 return;
2257 if (!sched_shouldpreempt(pri, cpri, 0))
2258 return;
2259 ctd->td_owepreempt = 1;
2260}
2261
2262/*
2263 * Add a thread to a thread queue. Select the appropriate runq and add the
2264 * thread to it. This is the internal function called when the tdq is
2265 * predetermined.
2266 */
2267void
2268tdq_add(struct tdq *tdq, struct thread *td, int flags)
2269{
2270
2271 TDQ_LOCK_ASSERT(tdq, MA_OWNED);
2272 KASSERT((td->td_inhibitors == 0),
2273 ("sched_add: trying to run inhibited thread"));
2274 KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
2275 ("sched_add: bad thread state"));
2276 KASSERT(td->td_flags & TDF_INMEM,
2277 ("sched_add: thread swapped out"));
2278
2279 if (td->td_priority < tdq->tdq_lowpri)
2280 tdq->tdq_lowpri = td->td_priority;
2281 tdq_runq_add(tdq, td, flags);
2282 tdq_load_add(tdq, td);
2283}
2284
2285/*
2286 * Select the target thread queue and add a thread to it. Request
2287 * preemption or IPI a remote processor if required.
2288 */
2289void
2290sched_add(struct thread *td, int flags)
2291{
2292 struct tdq *tdq;
2293#ifdef SMP
2294 int cpu;
2295#endif
2296
2297 KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq add",
2298 "prio:%d", td->td_priority, KTR_ATTR_LINKED,
2299 sched_tdname(curthread));
2300 KTR_POINT1(KTR_SCHED, "thread", sched_tdname(curthread), "wokeup",
2301 KTR_ATTR_LINKED, sched_tdname(td));
2302 THREAD_LOCK_ASSERT(td, MA_OWNED);
2303 /*
2304 * Recalculate the priority before we select the target cpu or
2305 * run-queue.
2306 */
2307 if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE)
2308 sched_priority(td);
2309#ifdef SMP
2310 /*
2311 * Pick the destination cpu and if it isn't ours transfer to the
2312 * target cpu.
2313 */
2314 cpu = sched_pickcpu(td, flags);
2315 tdq = sched_setcpu(td, cpu, flags);
2316 tdq_add(tdq, td, flags);
2317 if (cpu != PCPU_GET(cpuid)) {
2318 tdq_notify(tdq, td);
2319 return;
2320 }
2321#else
2322 tdq = TDQ_SELF();
2323 TDQ_LOCK(tdq);
2324 /*
2325 * Now that the thread is moving to the run-queue, set the lock
2326 * to the scheduler's lock.
2327 */
2328 thread_lock_set(td, TDQ_LOCKPTR(tdq));
2329 tdq_add(tdq, td, flags);
2330#endif
2331 if (!(flags & SRQ_YIELDING))
2332 sched_setpreempt(td);
2333}
2334
2335/*
2336 * Remove a thread from a run-queue without running it. This is used
2337 * when we're stealing a thread from a remote queue. Otherwise all threads
2338 * exit by calling sched_exit_thread() and sched_throw() themselves.
2339 */
2340void
2341sched_rem(struct thread *td)
2342{
2343 struct tdq *tdq;
2344
2345 KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "runq rem",
2346 "prio:%d", td->td_priority);
2347 tdq = TDQ_CPU(td->td_sched->ts_cpu);
2348 TDQ_LOCK_ASSERT(tdq, MA_OWNED);
2349 MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
2350 KASSERT(TD_ON_RUNQ(td),
2351 ("sched_rem: thread not on run queue"));
2352 tdq_runq_rem(tdq, td);
2353 tdq_load_rem(tdq, td);
2354 TD_SET_CAN_RUN(td);
2355 if (td->td_priority == tdq->tdq_lowpri)
2356 tdq_setlowpri(tdq, NULL);
2357}
2358
2359/*
2360 * Fetch cpu utilization information. Updates on demand.
2361 */
2362fixpt_t
2363sched_pctcpu(struct thread *td)
2364{
2365 fixpt_t pctcpu;
2366 struct td_sched *ts;
2367
2368 pctcpu = 0;
2369 ts = td->td_sched;
2370 if (ts == NULL)
2371 return (0);
2372
2373 THREAD_LOCK_ASSERT(td, MA_OWNED);
2374 if (ts->ts_ticks) {
2375 int rtick;
2376
2377 sched_pctcpu_update(ts);
2378		/* How many rticks per second? */
2379 rtick = min(SCHED_TICK_HZ(ts) / SCHED_TICK_SECS, hz);
2380 pctcpu = (FSCALE * ((FSCALE * rtick)/hz)) >> FSHIFT;
2381 }
2382
2383 return (pctcpu);
2384}
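/*
 * Example arithmetic, assuming hz = 1000 and a ten second history
 * window (SCHED_TICK_SECS = 10): a thread that was busy for about
 * five of the last ten seconds has rtick = 500, so
 *
 *	pctcpu = (FSCALE * ((FSCALE * 500) / 1000)) >> FSHIFT
 *	       = FSCALE / 2
 *
 * which ps/top report as 50%.
 */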
2385
2386/*
2387 * Enforce affinity settings for a thread. Called after adjustments to
2388 * cpumask.
2389 */
2390void
2391sched_affinity(struct thread *td)
2392{
2393#ifdef SMP
2394 struct td_sched *ts;
2395
2396 THREAD_LOCK_ASSERT(td, MA_OWNED);
2397 ts = td->td_sched;
2398 if (THREAD_CAN_SCHED(td, ts->ts_cpu))
2399 return;
2400 if (TD_ON_RUNQ(td)) {
2401 sched_rem(td);
2402 sched_add(td, SRQ_BORING);
2403 return;
2404 }
2405 if (!TD_IS_RUNNING(td))
2406 return;
2407 td->td_flags |= TDF_NEEDRESCHED;
2408 /*
2409 * Force a switch before returning to userspace. If the
2410 * target thread is not running locally send an ipi to force
2411 * the issue.
2412 */
2413 if (td != curthread)
2414 ipi_cpu(ts->ts_cpu, IPI_PREEMPT);
2415#endif
2416}
2417
2418/*
2419 * Bind a thread to a target cpu.
2420 */
2421void
2422sched_bind(struct thread *td, int cpu)
2423{
2424 struct td_sched *ts;
2425
2426 THREAD_LOCK_ASSERT(td, MA_OWNED|MA_NOTRECURSED);
2427 KASSERT(td == curthread, ("sched_bind: can only bind curthread"));
2428 ts = td->td_sched;
2429 if (ts->ts_flags & TSF_BOUND)
2430 sched_unbind(td);
2431 KASSERT(THREAD_CAN_MIGRATE(td), ("%p must be migratable", td));
2432 ts->ts_flags |= TSF_BOUND;
2433 sched_pin();
2434 if (PCPU_GET(cpuid) == cpu)
2435 return;
2436 ts->ts_cpu = cpu;
2437 /* When we return from mi_switch we'll be on the correct cpu. */
2438 mi_switch(SW_VOL, NULL);
2439}
2440
2441/*
2442 * Release a bound thread.
2443 */
2444void
2445sched_unbind(struct thread *td)
2446{
2447 struct td_sched *ts;
2448
2449 THREAD_LOCK_ASSERT(td, MA_OWNED);
2450 KASSERT(td == curthread, ("sched_unbind: can only bind curthread"));
2451 ts = td->td_sched;
2452 if ((ts->ts_flags & TSF_BOUND) == 0)
2453 return;
2454 ts->ts_flags &= ~TSF_BOUND;
2455 sched_unpin();
2456}
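/*
 * A minimal usage sketch, with target_cpu standing for whichever cpu
 * the caller wants to run on; the thread lock must be held around
 * both calls:
 *
 *	thread_lock(curthread);
 *	sched_bind(curthread, target_cpu);
 *	thread_unlock(curthread);
 *	... per-cpu work now runs on target_cpu ...
 *	thread_lock(curthread);
 *	sched_unbind(curthread);
 *	thread_unlock(curthread);
 */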
2457
2458int
2459sched_is_bound(struct thread *td)
2460{
2461 THREAD_LOCK_ASSERT(td, MA_OWNED);
2462 return (td->td_sched->ts_flags & TSF_BOUND);
2463}
2464
2465/*
2466 * Basic yield call.
2467 */
2468void
2469sched_relinquish(struct thread *td)
2470{
2471 thread_lock(td);
2472 mi_switch(SW_VOL | SWT_RELINQUISH, NULL);
2473 thread_unlock(td);
2474}
2475
2476/*
2477 * Return the total system load.
2478 */
2479int
2480sched_load(void)
2481{
2482#ifdef SMP
2483 int total;
2484 int i;
2485
2486 total = 0;
2487 CPU_FOREACH(i)
2488 total += TDQ_CPU(i)->tdq_sysload;
2489 return (total);
2490#else
2491 return (TDQ_SELF()->tdq_sysload);
2492#endif
2493}
2494
2495int
2496sched_sizeof_proc(void)
2497{
2498 return (sizeof(struct proc));
2499}
2500
2501int
2502sched_sizeof_thread(void)
2503{
2504 return (sizeof(struct thread) + sizeof(struct td_sched));
2505}
2506
2507#ifdef SMP
2508#define TDQ_IDLESPIN(tdq) \
2509 ((tdq)->tdq_cg != NULL && ((tdq)->tdq_cg->cg_flags & CG_FLAG_THREAD) == 0)
2510#else
2511#define TDQ_IDLESPIN(tdq) 1
2512#endif
2513
2514/*
2515 * The actual idle process.
2516 */
2517void
2518sched_idletd(void *dummy)
2519{
2520 struct thread *td;
2521 struct tdq *tdq;
2522 int switchcnt;
2523 int i;
2524
2525 mtx_assert(&Giant, MA_NOTOWNED);
2526 td = curthread;
2527 tdq = TDQ_SELF();
2528 for (;;) {
2529#ifdef SMP
2530 if (tdq_idled(tdq) == 0)
2531 continue;
2532#endif
2533 switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt;
2534 /*
2535 * If we're switching very frequently, spin while checking
2536 * for load rather than entering a low power state that
2537 * may require an IPI. However, don't do any busy
2538 * loops while on SMT machines as this simply steals
2539 * cycles from cores doing useful work.
2540 */
2541 if (TDQ_IDLESPIN(tdq) && switchcnt > sched_idlespinthresh) {
2542 for (i = 0; i < sched_idlespins; i++) {
2543 if (tdq->tdq_load)
2544 break;
2545 cpu_spinwait();
2546 }
2547 }
2548 switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt;
2548 if (tdq->tdq_load == 0)
2549 cpu_idle(switchcnt > 1);
2549 if (tdq->tdq_load == 0) {
2550 tdq->tdq_cpu_idle = 1;
2551 if (tdq->tdq_load == 0) {
2552 cpu_idle(switchcnt > sched_idlespinthresh);
2553 tdq->tdq_switchcnt++;
2554 }
2555 tdq->tdq_cpu_idle = 0;
2556 }
2550 if (tdq->tdq_load) {
2551 thread_lock(td);
2552 mi_switch(SW_VOL | SWT_IDLE, NULL);
2553 thread_unlock(td);
2554 }
2555 }
2556}
2557
2558/*
2559 * A CPU is entering for the first time or a thread is exiting.
2560 */
2561void
2562sched_throw(struct thread *td)
2563{
2564 struct thread *newtd;
2565 struct tdq *tdq;
2566
2567 tdq = TDQ_SELF();
2568 if (td == NULL) {
2569 /* Correct spinlock nesting and acquire the correct lock. */
2570 TDQ_LOCK(tdq);
2571 spinlock_exit();
2572 } else {
2573 MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
2574 tdq_load_rem(tdq, td);
2575 lock_profile_release_lock(&TDQ_LOCKPTR(tdq)->lock_object);
2576 }
2577 KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count"));
2578 newtd = choosethread();
2579 TDQ_LOCKPTR(tdq)->mtx_lock = (uintptr_t)newtd;
2580 PCPU_SET(switchtime, cpu_ticks());
2581 PCPU_SET(switchticks, ticks);
2582 cpu_throw(td, newtd); /* doesn't return */
2583}
2584
2585/*
2586 * This is called from fork_exit(). Just acquire the correct locks and
2587 * let fork do the rest of the work.
2588 */
2589void
2590sched_fork_exit(struct thread *td)
2591{
2592 struct td_sched *ts;
2593 struct tdq *tdq;
2594 int cpuid;
2595
2596 /*
2597 * Finish setting up thread glue so that it begins execution in a
2598 * non-nested critical section with the scheduler lock held.
2599 */
2600 cpuid = PCPU_GET(cpuid);
2601 tdq = TDQ_CPU(cpuid);
2602 ts = td->td_sched;
2603 if (TD_IS_IDLETHREAD(td))
2604 td->td_lock = TDQ_LOCKPTR(tdq);
2605 MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
2606 td->td_oncpu = cpuid;
2607 TDQ_LOCK_ASSERT(tdq, MA_OWNED | MA_NOTRECURSED);
2608 lock_profile_obtain_lock_success(
2609 &TDQ_LOCKPTR(tdq)->lock_object, 0, 0, __FILE__, __LINE__);
2610}
2611
2612/*
2613 * Create on first use to catch odd startup conditions.
2614 */
2615char *
2616sched_tdname(struct thread *td)
2617{
2618#ifdef KTR
2619 struct td_sched *ts;
2620
2621 ts = td->td_sched;
2622 if (ts->ts_name[0] == '\0')
2623 snprintf(ts->ts_name, sizeof(ts->ts_name),
2624 "%s tid %d", td->td_name, td->td_tid);
2625 return (ts->ts_name);
2626#else
2627 return (td->td_name);
2628#endif
2629}
2630
2631#ifdef SMP
2632
2633/*
2634 * Build the CPU topology dump string. Is recursively called to collect
2635 * the topology tree.
2636 */
2637static int
2638sysctl_kern_sched_topology_spec_internal(struct sbuf *sb, struct cpu_group *cg,
2639 int indent)
2640{
2641 int i, first;
2642
2643 sbuf_printf(sb, "%*s<group level=\"%d\" cache-level=\"%d\">\n", indent,
2644 "", indent, cg->cg_level);
2645 sbuf_printf(sb, "%*s <cpu count=\"%d\" mask=\"0x%x\">", indent, "",
2646 cg->cg_count, cg->cg_mask);
2647 first = TRUE;
2648 for (i = 0; i < MAXCPU; i++) {
2649 if ((cg->cg_mask & (1 << i)) != 0) {
2650 if (!first)
2651 sbuf_printf(sb, ", ");
2652 else
2653 first = FALSE;
2654 sbuf_printf(sb, "%d", i);
2655 }
2656 }
2657 sbuf_printf(sb, "</cpu>\n");
2658
2659 if (cg->cg_flags != 0) {
2660 sbuf_printf(sb, "%*s <flags>", indent, "");
2661 if ((cg->cg_flags & CG_FLAG_HTT) != 0)
2662 sbuf_printf(sb, "<flag name=\"HTT\">HTT group</flag>");
2663 if ((cg->cg_flags & CG_FLAG_THREAD) != 0)
2664 sbuf_printf(sb, "<flag name=\"THREAD\">THREAD group</flag>");
2665 if ((cg->cg_flags & CG_FLAG_SMT) != 0)
2666 sbuf_printf(sb, "<flag name=\"SMT\">SMT group</flag>");
2667 sbuf_printf(sb, "</flags>\n");
2668 }
2669
2670 if (cg->cg_children > 0) {
2671 sbuf_printf(sb, "%*s <children>\n", indent, "");
2672 for (i = 0; i < cg->cg_children; i++)
2673 sysctl_kern_sched_topology_spec_internal(sb,
2674 &cg->cg_child[i], indent+2);
2675 sbuf_printf(sb, "%*s </children>\n", indent, "");
2676 }
2677 sbuf_printf(sb, "%*s</group>\n", indent, "");
2678 return (0);
2679}
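/*
 * Together with the <groups> wrapper added by the sysctl handler
 * below, the dump for a hypothetical two-package machine with two
 * cores per package would look roughly like (cache-level values
 * depend on the detected sharing):
 *
 *	<groups>
 *	 <group level="1" cache-level="0">
 *	  <cpu count="4" mask="0xf">0, 1, 2, 3</cpu>
 *	  <children>
 *	   <group level="3" cache-level="2">
 *	    <cpu count="2" mask="0x3">0, 1</cpu>
 *	   </group>
 *	   <group level="3" cache-level="2">
 *	    <cpu count="2" mask="0xc">2, 3</cpu>
 *	   </group>
 *	  </children>
 *	 </group>
 *	</groups>
 */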
2680
2681/*
2682 * Sysctl handler for retrieving topology dump. It's a wrapper for
2683 * the recursive sysctl_kern_sched_topology_spec_internal().
2684 */
2685static int
2686sysctl_kern_sched_topology_spec(SYSCTL_HANDLER_ARGS)
2687{
2688 struct sbuf *topo;
2689 int err;
2690
2691 KASSERT(cpu_top != NULL, ("cpu_top isn't initialized"));
2692
2693 topo = sbuf_new(NULL, NULL, 500, SBUF_AUTOEXTEND);
2694 if (topo == NULL)
2695 return (ENOMEM);
2696
2697 sbuf_printf(topo, "<groups>\n");
2698 err = sysctl_kern_sched_topology_spec_internal(topo, cpu_top, 1);
2699 sbuf_printf(topo, "</groups>\n");
2700
2701 if (err == 0) {
2702 sbuf_finish(topo);
2703 err = SYSCTL_OUT(req, sbuf_data(topo), sbuf_len(topo));
2704 }
2705 sbuf_delete(topo);
2706 return (err);
2707}
2708#endif
2709
2710SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "Scheduler");
2711SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ULE", 0,
2712 "Scheduler name");
2713SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0,
2714 "Slice size for timeshare threads");
2715SYSCTL_INT(_kern_sched, OID_AUTO, interact, CTLFLAG_RW, &sched_interact, 0,
2716 "Interactivity score threshold");
2717SYSCTL_INT(_kern_sched, OID_AUTO, preempt_thresh, CTLFLAG_RW, &preempt_thresh,
2718 0,"Min priority for preemption, lower priorities have greater precedence");
2719SYSCTL_INT(_kern_sched, OID_AUTO, static_boost, CTLFLAG_RW, &static_boost,
2720 0,"Controls whether static kernel priorities are assigned to sleeping threads.");
2721SYSCTL_INT(_kern_sched, OID_AUTO, idlespins, CTLFLAG_RW, &sched_idlespins,
2722 0,"Number of times idle will spin waiting for new work.");
2723SYSCTL_INT(_kern_sched, OID_AUTO, idlespinthresh, CTLFLAG_RW, &sched_idlespinthresh,
2724 0,"Threshold before we will permit idle spinning.");
2725#ifdef SMP
2726SYSCTL_INT(_kern_sched, OID_AUTO, affinity, CTLFLAG_RW, &affinity, 0,
2727 "Number of hz ticks to keep thread affinity for");
2728SYSCTL_INT(_kern_sched, OID_AUTO, balance, CTLFLAG_RW, &rebalance, 0,
2729 "Enables the long-term load balancer");
2730SYSCTL_INT(_kern_sched, OID_AUTO, balance_interval, CTLFLAG_RW,
2731 &balance_interval, 0,
2732 "Average frequency in stathz ticks to run the long-term balancer");
2733SYSCTL_INT(_kern_sched, OID_AUTO, steal_htt, CTLFLAG_RW, &steal_htt, 0,
2734 "Steals work from another hyper-threaded core on idle");
2735SYSCTL_INT(_kern_sched, OID_AUTO, steal_idle, CTLFLAG_RW, &steal_idle, 0,
2736 "Attempts to steal work from other cores before idling");
2737SYSCTL_INT(_kern_sched, OID_AUTO, steal_thresh, CTLFLAG_RW, &steal_thresh, 0,
2738 "Minimum load on remote cpu before we'll steal");
2739
2740/* Retrieve SMP topology */
2741SYSCTL_PROC(_kern_sched, OID_AUTO, topology_spec, CTLTYPE_STRING |
2742 CTLFLAG_RD, NULL, 0, sysctl_kern_sched_topology_spec, "A",
2743 "XML dump of detected CPU topology");
2744#endif
2745
2746/* ps compat. All cpu percentages from ULE are weighted. */
2747static int ccpu = 0;
2748SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");