1/*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 *
4 * Copyright (c) 2002-2007, Jeffrey Roberson <jeff@freebsd.org>
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice unmodified, this list of conditions, and the following
12 *    disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29/*
 * This file implements the ULE scheduler.  ULE supports independent CPU
 * run queues and fine-grained locking.  It has superior interactive
 * performance under load even on uniprocessor systems.
33 *
34 * etymology:
35 *   ULE is the last three letters in schedule.  It owes its name to a
36 * generic user created for a scheduling system by Paul Mikesell at
37 * Isilon Systems and a general lack of creativity on the part of the author.
38 */
39
40#include <sys/cdefs.h>
41__FBSDID("$FreeBSD$");
42
43#include "opt_hwpmc_hooks.h"
44#include "opt_sched.h"
45
46#include <sys/param.h>
47#include <sys/systm.h>
48#include <sys/kdb.h>
49#include <sys/kernel.h>
50#include <sys/ktr.h>
51#include <sys/limits.h>
52#include <sys/lock.h>
53#include <sys/mutex.h>
54#include <sys/proc.h>
55#include <sys/resource.h>
56#include <sys/resourcevar.h>
57#include <sys/sched.h>
58#include <sys/sdt.h>
59#include <sys/smp.h>
60#include <sys/sx.h>
61#include <sys/sysctl.h>
62#include <sys/sysproto.h>
63#include <sys/turnstile.h>
64#include <sys/umtx.h>
65#include <sys/vmmeter.h>
66#include <sys/cpuset.h>
67#include <sys/sbuf.h>
68
69#ifdef HWPMC_HOOKS
70#include <sys/pmckern.h>
71#endif
72
73#ifdef KDTRACE_HOOKS
74#include <sys/dtrace_bsd.h>
75int __read_mostly		dtrace_vtime_active;
76dtrace_vtime_switch_func_t	dtrace_vtime_switch_func;
77#endif
78
79#include <machine/cpu.h>
80#include <machine/smp.h>
81
82#define	KTR_ULE	0
83
84#define	TS_NAME_LEN (MAXCOMLEN + sizeof(" td ") + sizeof(__XSTRING(UINT_MAX)))
85#define	TDQ_NAME_LEN	(sizeof("sched lock ") + sizeof(__XSTRING(MAXCPU)))
86#define	TDQ_LOADNAME_LEN	(sizeof("CPU ") + sizeof(__XSTRING(MAXCPU)) - 1 + sizeof(" load"))
87
88/*
89 * Thread scheduler specific section.  All fields are protected
90 * by the thread lock.
91 */
92struct td_sched {
93	struct runq	*ts_runq;	/* Run-queue we're queued on. */
94	short		ts_flags;	/* TSF_* flags. */
95	int		ts_cpu;		/* CPU that we have affinity for. */
96	int		ts_rltick;	/* Real last tick, for affinity. */
97	int		ts_slice;	/* Ticks of slice remaining. */
98	u_int		ts_slptime;	/* Number of ticks we vol. slept */
99	u_int		ts_runtime;	/* Number of ticks we were running */
100	int		ts_ltick;	/* Last tick that we were running on */
101	int		ts_ftick;	/* First tick that we were running on */
102	int		ts_ticks;	/* Tick count */
103#ifdef KTR
104	char		ts_name[TS_NAME_LEN];
105#endif
106};
107/* flags kept in ts_flags */
108#define	TSF_BOUND	0x0001		/* Thread can not migrate. */
109#define	TSF_XFERABLE	0x0002		/* Thread was added as transferable. */
110
111#define	THREAD_CAN_MIGRATE(td)	((td)->td_pinned == 0)
112#define	THREAD_CAN_SCHED(td, cpu)	\
113    CPU_ISSET((cpu), &(td)->td_cpuset->cs_mask)
114
115_Static_assert(sizeof(struct thread) + sizeof(struct td_sched) <=
116    sizeof(struct thread0_storage),
117    "increase struct thread0_storage.t0st_sched size");
118
119/*
120 * Priority ranges used for interactive and non-interactive timeshare
121 * threads.  The timeshare priorities are split up into four ranges.
122 * The first range handles interactive threads.  The last three ranges
123 * (NHALF, x, and NHALF) handle non-interactive threads with the outer
124 * ranges supporting nice values.
125 */
126#define	PRI_TIMESHARE_RANGE	(PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1)
127#define	PRI_INTERACT_RANGE	((PRI_TIMESHARE_RANGE - SCHED_PRI_NRESV) / 2)
128#define	PRI_BATCH_RANGE		(PRI_TIMESHARE_RANGE - PRI_INTERACT_RANGE)
129
130#define	PRI_MIN_INTERACT	PRI_MIN_TIMESHARE
131#define	PRI_MAX_INTERACT	(PRI_MIN_TIMESHARE + PRI_INTERACT_RANGE - 1)
132#define	PRI_MIN_BATCH		(PRI_MIN_TIMESHARE + PRI_INTERACT_RANGE)
133#define	PRI_MAX_BATCH		PRI_MAX_TIMESHARE
134
135/*
136 * Cpu percentage computation macros and defines.
137 *
138 * SCHED_TICK_SECS:	Number of seconds to average the cpu usage across.
139 * SCHED_TICK_TARG:	Number of hz ticks to average the cpu usage across.
140 * SCHED_TICK_MAX:	Maximum number of ticks before scaling back.
141 * SCHED_TICK_SHIFT:	Shift factor to avoid rounding away results.
142 * SCHED_TICK_HZ:	Compute the number of hz ticks for a given ticks count.
143 * SCHED_TICK_TOTAL:	Gives the amount of time we've been recording ticks.
144 */
145#define	SCHED_TICK_SECS		10
146#define	SCHED_TICK_TARG		(hz * SCHED_TICK_SECS)
147#define	SCHED_TICK_MAX		(SCHED_TICK_TARG + hz)
148#define	SCHED_TICK_SHIFT	10
149#define	SCHED_TICK_HZ(ts)	((ts)->ts_ticks >> SCHED_TICK_SHIFT)
150#define	SCHED_TICK_TOTAL(ts)	(max((ts)->ts_ltick - (ts)->ts_ftick, hz))
151
152/*
153 * These macros determine priorities for non-interactive threads.  They are
154 * assigned a priority based on their recent cpu utilization as expressed
155 * by the ratio of ticks to the tick total.  NHALF priorities at the start
156 * and end of the MIN to MAX timeshare range are only reachable with negative
157 * or positive nice respectively.
158 *
159 * PRI_RANGE:	Priority range for utilization dependent priorities.
160 * PRI_NRESV:	Number of nice values.
161 * PRI_TICKS:	Compute a priority in PRI_RANGE from the ticks count and total.
162 * PRI_NICE:	Determines the part of the priority inherited from nice.
163 */
164#define	SCHED_PRI_NRESV		(PRIO_MAX - PRIO_MIN)
165#define	SCHED_PRI_NHALF		(SCHED_PRI_NRESV / 2)
166#define	SCHED_PRI_MIN		(PRI_MIN_BATCH + SCHED_PRI_NHALF)
167#define	SCHED_PRI_MAX		(PRI_MAX_BATCH - SCHED_PRI_NHALF)
168#define	SCHED_PRI_RANGE		(SCHED_PRI_MAX - SCHED_PRI_MIN + 1)
169#define	SCHED_PRI_TICKS(ts)						\
170    (SCHED_TICK_HZ((ts)) /						\
171    (roundup(SCHED_TICK_TOTAL((ts)), SCHED_PRI_RANGE) / SCHED_PRI_RANGE))
172#define	SCHED_PRI_NICE(nice)	(nice)
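
/*
 * Illustrative numbers (added commentary, not from the original source):
 * SCHED_TICK_HZ() is roughly the number of hz ticks the thread spent
 * running inside the sliding window and SCHED_TICK_TOTAL() is the window
 * length, so SCHED_PRI_TICKS() is approximately
 * SCHED_PRI_RANGE * run ticks / window ticks.  A fully cpu-bound thread
 * therefore gets an adjustment near SCHED_PRI_RANGE - 1 and lands near
 * SCHED_PRI_MAX, while a thread that ran only briefly stays near
 * SCHED_PRI_MIN before the nice adjustment is applied.
 */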
173
174/*
175 * These determine the interactivity of a process.  Interactivity differs from
176 * cpu utilization in that it expresses the voluntary time slept vs time ran
177 * while cpu utilization includes all time not running.  This more accurately
178 * models the intent of the thread.
179 *
180 * SLP_RUN_MAX:	Maximum amount of sleep time + run time we'll accumulate
181 *		before throttling back.
182 * SLP_RUN_FORK:	Maximum slp+run time to inherit at fork time.
183 * INTERACT_MAX:	Maximum interactivity value.  Smaller is better.
184 * INTERACT_THRESH:	Threshold for placement on the current runq.
185 */
186#define	SCHED_SLP_RUN_MAX	((hz * 5) << SCHED_TICK_SHIFT)
187#define	SCHED_SLP_RUN_FORK	((hz / 2) << SCHED_TICK_SHIFT)
188#define	SCHED_INTERACT_MAX	(100)
189#define	SCHED_INTERACT_HALF	(SCHED_INTERACT_MAX / 2)
190#define	SCHED_INTERACT_THRESH	(30)
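
/*
 * Illustrative example (added commentary; hz = 1000 assumed): the values
 * above are kept in hz ticks shifted left by SCHED_TICK_SHIFT, so
 * SCHED_SLP_RUN_MAX corresponds to five seconds of combined sleep + run
 * history and SCHED_SLP_RUN_FORK to half a second inherited at fork time.
 */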
191
192/*
193 * These parameters determine the slice behavior for batch work.
194 */
195#define	SCHED_SLICE_DEFAULT_DIVISOR	10	/* ~94 ms, 12 stathz ticks. */
196#define	SCHED_SLICE_MIN_DIVISOR		6	/* DEFAULT/MIN = ~16 ms. */
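
/*
 * Illustrative example (added commentary; stathz = 127 assumed, as on many
 * systems): sched_initticks() computes sched_slice = 127 / 10 = 12 stathz
 * ticks (~94 ms) and sched_slice_min = 12 / 6 = 2 ticks (~16 ms).
 */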
197
198/* Flags kept in td_flags. */
199#define	TDF_SLICEEND	TDF_SCHED2	/* Thread time slice is over. */
200
201/*
202 * tickincr:		Converts a stathz tick into a hz domain scaled by
203 *			the shift factor.  Without the shift the error rate
204 *			due to rounding would be unacceptably high.
 * realstathz:		stathz is sometimes 0; we then run off of hz.
206 * sched_slice:		Runtime of each thread before rescheduling.
207 * preempt_thresh:	Priority threshold for preemption and remote IPIs.
208 */
209static int __read_mostly sched_interact = SCHED_INTERACT_THRESH;
210static int __read_mostly tickincr = 8 << SCHED_TICK_SHIFT;
211static int __read_mostly realstathz = 127;	/* reset during boot. */
212static int __read_mostly sched_slice = 10;	/* reset during boot. */
213static int __read_mostly sched_slice_min = 1;	/* reset during boot. */
214#ifdef PREEMPTION
215#ifdef FULL_PREEMPTION
216static int __read_mostly preempt_thresh = PRI_MAX_IDLE;
217#else
218static int __read_mostly preempt_thresh = PRI_MIN_KERN;
219#endif
220#else
221static int __read_mostly preempt_thresh = 0;
222#endif
223static int __read_mostly static_boost = PRI_MIN_BATCH;
224static int __read_mostly sched_idlespins = 10000;
225static int __read_mostly sched_idlespinthresh = -1;
226
227/*
228 * tdq - per processor runqs and statistics.  All fields are protected by the
 * tdq_lock.  The load and lowpri may be accessed without the lock to avoid
 * excess locking in sched_pickcpu().
231 */
232struct tdq {
233	/*
234	 * Ordered to improve efficiency of cpu_search() and switch().
235	 * tdq_lock is padded to avoid false sharing with tdq_load and
236	 * tdq_cpu_idle.
237	 */
238	struct mtx_padalign tdq_lock;		/* run queue lock. */
239	struct cpu_group *tdq_cg;		/* Pointer to cpu topology. */
240	volatile int	tdq_load;		/* Aggregate load. */
241	volatile int	tdq_cpu_idle;		/* cpu_idle() is active. */
242	int		tdq_sysload;		/* For loadavg, !ITHD load. */
243	volatile int	tdq_transferable;	/* Transferable thread count. */
244	volatile short	tdq_switchcnt;		/* Switches this tick. */
245	volatile short	tdq_oldswitchcnt;	/* Switches last tick. */
246	u_char		tdq_lowpri;		/* Lowest priority thread. */
247	u_char		tdq_owepreempt;		/* Remote preemption pending. */
248	u_char		tdq_idx;		/* Current insert index. */
249	u_char		tdq_ridx;		/* Current removal index. */
250	int		tdq_id;			/* cpuid. */
251	struct runq	tdq_realtime;		/* real-time run queue. */
252	struct runq	tdq_timeshare;		/* timeshare run queue. */
253	struct runq	tdq_idle;		/* Queue of IDLE threads. */
254	char		tdq_name[TDQ_NAME_LEN];
255#ifdef KTR
256	char		tdq_loadname[TDQ_LOADNAME_LEN];
257#endif
258} __aligned(64);
259
260/* Idle thread states and config. */
261#define	TDQ_RUNNING	1
262#define	TDQ_IDLE	2
263
264#ifdef SMP
265struct cpu_group __read_mostly *cpu_top;		/* CPU topology */
266
267#define	SCHED_AFFINITY_DEFAULT	(max(1, hz / 1000))
268#define	SCHED_AFFINITY(ts, t)	((ts)->ts_rltick > ticks - ((t) * affinity))
269
270/*
271 * Run-time tunables.
272 */
273static int rebalance = 1;
274static int balance_interval = 128;	/* Default set in sched_initticks(). */
275static int __read_mostly affinity;
276static int __read_mostly steal_idle = 1;
277static int __read_mostly steal_thresh = 2;
278static int __read_mostly always_steal = 0;
279static int __read_mostly trysteal_limit = 2;
280
281/*
282 * One thread queue per processor.
283 */
284static struct tdq __read_mostly *balance_tdq;
285static int balance_ticks;
286DPCPU_DEFINE_STATIC(struct tdq, tdq);
287DPCPU_DEFINE_STATIC(uint32_t, randomval);
288
289#define	TDQ_SELF()	((struct tdq *)PCPU_GET(sched))
290#define	TDQ_CPU(x)	(DPCPU_ID_PTR((x), tdq))
291#define	TDQ_ID(x)	((x)->tdq_id)
292#else	/* !SMP */
293static struct tdq	tdq_cpu;
294
295#define	TDQ_ID(x)	(0)
296#define	TDQ_SELF()	(&tdq_cpu)
297#define	TDQ_CPU(x)	(&tdq_cpu)
298#endif
299
300#define	TDQ_LOCK_ASSERT(t, type)	mtx_assert(TDQ_LOCKPTR((t)), (type))
301#define	TDQ_LOCK(t)		mtx_lock_spin(TDQ_LOCKPTR((t)))
302#define	TDQ_LOCK_FLAGS(t, f)	mtx_lock_spin_flags(TDQ_LOCKPTR((t)), (f))
303#define	TDQ_UNLOCK(t)		mtx_unlock_spin(TDQ_LOCKPTR((t)))
304#define	TDQ_LOCKPTR(t)		((struct mtx *)(&(t)->tdq_lock))
305
306static void sched_priority(struct thread *);
307static void sched_thread_priority(struct thread *, u_char);
308static int sched_interact_score(struct thread *);
309static void sched_interact_update(struct thread *);
310static void sched_interact_fork(struct thread *);
311static void sched_pctcpu_update(struct td_sched *, int);
312
313/* Operations on per processor queues */
314static struct thread *tdq_choose(struct tdq *);
315static void tdq_setup(struct tdq *, int i);
316static void tdq_load_add(struct tdq *, struct thread *);
317static void tdq_load_rem(struct tdq *, struct thread *);
318static __inline void tdq_runq_add(struct tdq *, struct thread *, int);
319static __inline void tdq_runq_rem(struct tdq *, struct thread *);
320static inline int sched_shouldpreempt(int, int, int);
321void tdq_print(int cpu);
322static void runq_print(struct runq *rq);
323static void tdq_add(struct tdq *, struct thread *, int);
324#ifdef SMP
325static struct thread *tdq_move(struct tdq *, struct tdq *);
326static int tdq_idled(struct tdq *);
327static void tdq_notify(struct tdq *, struct thread *);
328static struct thread *tdq_steal(struct tdq *, int);
329static struct thread *runq_steal(struct runq *, int);
330static int sched_pickcpu(struct thread *, int);
331static void sched_balance(void);
332static int sched_balance_pair(struct tdq *, struct tdq *);
333static inline struct tdq *sched_setcpu(struct thread *, int, int);
334static inline void thread_unblock_switch(struct thread *, struct mtx *);
335static int sysctl_kern_sched_topology_spec(SYSCTL_HANDLER_ARGS);
336static int sysctl_kern_sched_topology_spec_internal(struct sbuf *sb,
337    struct cpu_group *cg, int indent);
338#endif
339
340static void sched_setup(void *dummy);
341SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL);
342
343static void sched_initticks(void *dummy);
344SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks,
345    NULL);
346
347SDT_PROVIDER_DEFINE(sched);
348
349SDT_PROBE_DEFINE3(sched, , , change__pri, "struct thread *",
350    "struct proc *", "uint8_t");
351SDT_PROBE_DEFINE3(sched, , , dequeue, "struct thread *",
352    "struct proc *", "void *");
353SDT_PROBE_DEFINE4(sched, , , enqueue, "struct thread *",
354    "struct proc *", "void *", "int");
355SDT_PROBE_DEFINE4(sched, , , lend__pri, "struct thread *",
356    "struct proc *", "uint8_t", "struct thread *");
357SDT_PROBE_DEFINE2(sched, , , load__change, "int", "int");
358SDT_PROBE_DEFINE2(sched, , , off__cpu, "struct thread *",
359    "struct proc *");
360SDT_PROBE_DEFINE(sched, , , on__cpu);
361SDT_PROBE_DEFINE(sched, , , remain__cpu);
362SDT_PROBE_DEFINE2(sched, , , surrender, "struct thread *",
363    "struct proc *");
364
365/*
366 * Print the threads waiting on a run-queue.
367 */
368static void
369runq_print(struct runq *rq)
370{
371	struct rqhead *rqh;
372	struct thread *td;
373	int pri;
374	int j;
375	int i;
376
377	for (i = 0; i < RQB_LEN; i++) {
378		printf("\t\trunq bits %d 0x%zx\n",
379		    i, rq->rq_status.rqb_bits[i]);
380		for (j = 0; j < RQB_BPW; j++)
381			if (rq->rq_status.rqb_bits[i] & (1ul << j)) {
382				pri = j + (i << RQB_L2BPW);
383				rqh = &rq->rq_queues[pri];
384				TAILQ_FOREACH(td, rqh, td_runq) {
385					printf("\t\t\ttd %p(%s) priority %d rqindex %d pri %d\n",
386					    td, td->td_name, td->td_priority,
387					    td->td_rqindex, pri);
388				}
389			}
390	}
391}
392
393/*
394 * Print the status of a per-cpu thread queue.  Should be a ddb show cmd.
395 */
396void
397tdq_print(int cpu)
398{
399	struct tdq *tdq;
400
401	tdq = TDQ_CPU(cpu);
402
403	printf("tdq %d:\n", TDQ_ID(tdq));
404	printf("\tlock            %p\n", TDQ_LOCKPTR(tdq));
405	printf("\tLock name:      %s\n", tdq->tdq_name);
406	printf("\tload:           %d\n", tdq->tdq_load);
407	printf("\tswitch cnt:     %d\n", tdq->tdq_switchcnt);
408	printf("\told switch cnt: %d\n", tdq->tdq_oldswitchcnt);
409	printf("\ttimeshare idx:  %d\n", tdq->tdq_idx);
410	printf("\ttimeshare ridx: %d\n", tdq->tdq_ridx);
411	printf("\tload transferable: %d\n", tdq->tdq_transferable);
412	printf("\tlowest priority:   %d\n", tdq->tdq_lowpri);
413	printf("\trealtime runq:\n");
414	runq_print(&tdq->tdq_realtime);
415	printf("\ttimeshare runq:\n");
416	runq_print(&tdq->tdq_timeshare);
417	printf("\tidle runq:\n");
418	runq_print(&tdq->tdq_idle);
419}
420
421static inline int
422sched_shouldpreempt(int pri, int cpri, int remote)
423{
424	/*
425	 * If the new priority is not better than the current priority there is
426	 * nothing to do.
427	 */
428	if (pri >= cpri)
429		return (0);
430	/*
431	 * Always preempt idle.
432	 */
433	if (cpri >= PRI_MIN_IDLE)
434		return (1);
435	/*
436	 * If preemption is disabled don't preempt others.
437	 */
438	if (preempt_thresh == 0)
439		return (0);
440	/*
441	 * Preempt if we exceed the threshold.
442	 */
443	if (pri <= preempt_thresh)
444		return (1);
445	/*
	 * If we're interactive or better and something non-interactive
	 * or worse is running, preempt only remote processors.
448	 */
449	if (remote && pri <= PRI_MAX_INTERACT && cpri > PRI_MAX_INTERACT)
450		return (1);
451	return (0);
452}
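
/*
 * Illustrative walk-through (added commentary): priorities are numeric,
 * lower is more important.  With the !FULL_PREEMPTION default of
 * preempt_thresh = PRI_MIN_KERN above, a waking interrupt or kernel
 * priority thread (pri <= preempt_thresh) preempts a running timeshare
 * thread, an idle-class thread is always preempted, and one timeshare
 * thread never preempts another on the local CPU; remotely, an
 * interactive thread will still preempt a batch thread.
 */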
453
454/*
455 * Add a thread to the actual run-queue.  Keeps transferable counts up to
456 * date with what is actually on the run-queue.  Selects the correct
457 * queue position for timeshare threads.
458 */
459static __inline void
460tdq_runq_add(struct tdq *tdq, struct thread *td, int flags)
461{
462	struct td_sched *ts;
463	u_char pri;
464
465	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
466	THREAD_LOCK_BLOCKED_ASSERT(td, MA_OWNED);
467
468	pri = td->td_priority;
469	ts = td_get_sched(td);
470	TD_SET_RUNQ(td);
471	if (THREAD_CAN_MIGRATE(td)) {
472		tdq->tdq_transferable++;
473		ts->ts_flags |= TSF_XFERABLE;
474	}
475	if (pri < PRI_MIN_BATCH) {
476		ts->ts_runq = &tdq->tdq_realtime;
477	} else if (pri <= PRI_MAX_BATCH) {
478		ts->ts_runq = &tdq->tdq_timeshare;
479		KASSERT(pri <= PRI_MAX_BATCH && pri >= PRI_MIN_BATCH,
480			("Invalid priority %d on timeshare runq", pri));
481		/*
		 * This queue contains only priorities between MIN and MAX
		 * batch.  Use the whole queue to represent these values.
484		 */
485		if ((flags & (SRQ_BORROWING|SRQ_PREEMPTED)) == 0) {
486			pri = RQ_NQS * (pri - PRI_MIN_BATCH) / PRI_BATCH_RANGE;
487			pri = (pri + tdq->tdq_idx) % RQ_NQS;
488			/*
489			 * This effectively shortens the queue by one so we
490			 * can have a one slot difference between idx and
491			 * ridx while we wait for threads to drain.
492			 */
493			if (tdq->tdq_ridx != tdq->tdq_idx &&
494			    pri == tdq->tdq_ridx)
495				pri = (unsigned char)(pri - 1) % RQ_NQS;
496		} else
497			pri = tdq->tdq_ridx;
498		runq_add_pri(ts->ts_runq, td, pri, flags);
499		return;
500	} else
501		ts->ts_runq = &tdq->tdq_idle;
502	runq_add(ts->ts_runq, td, flags);
503}
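
/*
 * Illustrative example for the timeshare insertion above (added
 * commentary; assumes the usual RQ_NQS of 64 run queues): a priority in
 * the middle of PRI_BATCH_RANGE maps to roughly RQ_NQS / 2 = 32 slots
 * ahead of tdq_idx, while the best batch priorities map close to tdq_idx
 * itself, so they are reached sooner as the calendar queue rotates.
 */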
504
505/*
506 * Remove a thread from a run-queue.  This typically happens when a thread
507 * is selected to run.  Running threads are not on the queue and the
508 * transferable count does not reflect them.
509 */
510static __inline void
511tdq_runq_rem(struct tdq *tdq, struct thread *td)
512{
513	struct td_sched *ts;
514
515	ts = td_get_sched(td);
516	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
517	THREAD_LOCK_BLOCKED_ASSERT(td, MA_OWNED);
518	KASSERT(ts->ts_runq != NULL,
519	    ("tdq_runq_remove: thread %p null ts_runq", td));
520	if (ts->ts_flags & TSF_XFERABLE) {
521		tdq->tdq_transferable--;
522		ts->ts_flags &= ~TSF_XFERABLE;
523	}
524	if (ts->ts_runq == &tdq->tdq_timeshare) {
525		if (tdq->tdq_idx != tdq->tdq_ridx)
526			runq_remove_idx(ts->ts_runq, td, &tdq->tdq_ridx);
527		else
528			runq_remove_idx(ts->ts_runq, td, NULL);
529	} else
530		runq_remove(ts->ts_runq, td);
531}
532
533/*
534 * Load is maintained for all threads RUNNING and ON_RUNQ.  Add the load
535 * for this thread to the referenced thread queue.
536 */
537static void
538tdq_load_add(struct tdq *tdq, struct thread *td)
539{
540
541	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
542	THREAD_LOCK_BLOCKED_ASSERT(td, MA_OWNED);
543
544	tdq->tdq_load++;
545	if ((td->td_flags & TDF_NOLOAD) == 0)
546		tdq->tdq_sysload++;
547	KTR_COUNTER0(KTR_SCHED, "load", tdq->tdq_loadname, tdq->tdq_load);
548	SDT_PROBE2(sched, , , load__change, (int)TDQ_ID(tdq), tdq->tdq_load);
549}
550
551/*
552 * Remove the load from a thread that is transitioning to a sleep state or
553 * exiting.
554 */
555static void
556tdq_load_rem(struct tdq *tdq, struct thread *td)
557{
558
559	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
560	THREAD_LOCK_BLOCKED_ASSERT(td, MA_OWNED);
561	KASSERT(tdq->tdq_load != 0,
562	    ("tdq_load_rem: Removing with 0 load on queue %d", TDQ_ID(tdq)));
563
564	tdq->tdq_load--;
565	if ((td->td_flags & TDF_NOLOAD) == 0)
566		tdq->tdq_sysload--;
567	KTR_COUNTER0(KTR_SCHED, "load", tdq->tdq_loadname, tdq->tdq_load);
568	SDT_PROBE2(sched, , , load__change, (int)TDQ_ID(tdq), tdq->tdq_load);
569}
570
571/*
572 * Bound timeshare latency by decreasing slice size as load increases.  We
 * consider the maximum latency as proportional to the number of threads
 * waiting to run aside from curthread, and target no more than sched_slice
 * of latency but no less than sched_slice_min of runtime.
576 */
577static inline int
578tdq_slice(struct tdq *tdq)
579{
580	int load;
581
582	/*
	 * It is safe to use tdq_sysload here because this is called from
584	 * contexts where timeshare threads are running and so there
585	 * cannot be higher priority load in the system.
586	 */
587	load = tdq->tdq_sysload - 1;
588	if (load >= SCHED_SLICE_MIN_DIVISOR)
589		return (sched_slice_min);
590	if (load <= 1)
591		return (sched_slice);
592	return (sched_slice / load);
593}
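
/*
 * Illustrative example (added commentary; assumes the default
 * sched_slice of 12 and sched_slice_min of 2 stathz ticks): with
 * tdq_sysload == 5, load is 4 and each thread receives 12 / 4 = 3 ticks;
 * once load reaches SCHED_SLICE_MIN_DIVISOR the slice is clamped to
 * sched_slice_min.
 */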
594
595/*
596 * Set lowpri to its exact value by searching the run-queue and
597 * evaluating curthread.  curthread may be passed as an optimization.
598 */
599static void
600tdq_setlowpri(struct tdq *tdq, struct thread *ctd)
601{
602	struct thread *td;
603
604	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
605	if (ctd == NULL)
606		ctd = pcpu_find(TDQ_ID(tdq))->pc_curthread;
607	td = tdq_choose(tdq);
608	if (td == NULL || td->td_priority > ctd->td_priority)
609		tdq->tdq_lowpri = ctd->td_priority;
610	else
611		tdq->tdq_lowpri = td->td_priority;
612}
613
614#ifdef SMP
615/*
616 * We need some randomness. Implement a classic Linear Congruential
617 * Generator X_{n+1}=(aX_n+c) mod m. These values are optimized for
618 * m = 2^32, a = 69069 and c = 5. We only return the upper 16 bits
619 * of the random state (in the low bits of our answer) to keep
620 * the maximum randomness.
621 */
622static uint32_t
623sched_random(void)
624{
625	uint32_t *rndptr;
626
627	rndptr = DPCPU_PTR(randomval);
628	*rndptr = *rndptr * 69069 + 5;
629
630	return (*rndptr >> 16);
631}
632
633struct cpu_search {
634	cpuset_t cs_mask;
635	u_int	cs_prefer;
636	int	cs_pri;		/* Min priority for low. */
637	int	cs_limit;	/* Max load for low, min load for high. */
638	int	cs_cpu;
639	int	cs_load;
640};
641
642#define	CPU_SEARCH_LOWEST	0x1
643#define	CPU_SEARCH_HIGHEST	0x2
644#define	CPU_SEARCH_BOTH		(CPU_SEARCH_LOWEST|CPU_SEARCH_HIGHEST)
645
646static __always_inline int cpu_search(const struct cpu_group *cg,
647    struct cpu_search *low, struct cpu_search *high, const int match);
648int __noinline cpu_search_lowest(const struct cpu_group *cg,
649    struct cpu_search *low);
650int __noinline cpu_search_highest(const struct cpu_group *cg,
651    struct cpu_search *high);
652int __noinline cpu_search_both(const struct cpu_group *cg,
653    struct cpu_search *low, struct cpu_search *high);
654
655/*
656 * Search the tree of cpu_groups for the lowest or highest loaded cpu
657 * according to the match argument.  This routine actually compares the
658 * load on all paths through the tree and finds the least loaded cpu on
659 * the least loaded path, which may differ from the least loaded cpu in
660 * the system.  This balances work among caches and buses.
661 *
662 * This inline is instantiated in three forms below using constants for the
663 * match argument.  It is reduced to the minimum set for each case.  It is
664 * also recursive to the depth of the tree.
665 */
666static __always_inline int
667cpu_search(const struct cpu_group *cg, struct cpu_search *low,
668    struct cpu_search *high, const int match)
669{
670	struct cpu_search lgroup;
671	struct cpu_search hgroup;
672	cpuset_t cpumask;
673	struct cpu_group *child;
674	struct tdq *tdq;
675	int cpu, i, hload, lload, load, total, rnd;
676
677	total = 0;
678	cpumask = cg->cg_mask;
679	if (match & CPU_SEARCH_LOWEST) {
680		lload = INT_MAX;
681		lgroup = *low;
682	}
683	if (match & CPU_SEARCH_HIGHEST) {
684		hload = INT_MIN;
685		hgroup = *high;
686	}
687
688	/* Iterate through the child CPU groups and then remaining CPUs. */
689	for (i = cg->cg_children, cpu = mp_maxid; ; ) {
690		if (i == 0) {
691#ifdef HAVE_INLINE_FFSL
692			cpu = CPU_FFS(&cpumask) - 1;
693#else
694			while (cpu >= 0 && !CPU_ISSET(cpu, &cpumask))
695				cpu--;
696#endif
697			if (cpu < 0)
698				break;
699			child = NULL;
700		} else
701			child = &cg->cg_child[i - 1];
702
703		if (match & CPU_SEARCH_LOWEST)
704			lgroup.cs_cpu = -1;
705		if (match & CPU_SEARCH_HIGHEST)
706			hgroup.cs_cpu = -1;
707		if (child) {			/* Handle child CPU group. */
708			CPU_ANDNOT(&cpumask, &child->cg_mask);
709			switch (match) {
710			case CPU_SEARCH_LOWEST:
711				load = cpu_search_lowest(child, &lgroup);
712				break;
713			case CPU_SEARCH_HIGHEST:
714				load = cpu_search_highest(child, &hgroup);
715				break;
716			case CPU_SEARCH_BOTH:
717				load = cpu_search_both(child, &lgroup, &hgroup);
718				break;
719			}
720		} else {			/* Handle child CPU. */
721			CPU_CLR(cpu, &cpumask);
722			tdq = TDQ_CPU(cpu);
723			load = tdq->tdq_load * 256;
724			rnd = sched_random() % 32;
725			if (match & CPU_SEARCH_LOWEST) {
726				if (cpu == low->cs_prefer)
727					load -= 64;
				/* If that CPU is allowed, record its load. */
729				if (tdq->tdq_lowpri > lgroup.cs_pri &&
730				    tdq->tdq_load <= lgroup.cs_limit &&
731				    CPU_ISSET(cpu, &lgroup.cs_mask)) {
732					lgroup.cs_cpu = cpu;
733					lgroup.cs_load = load - rnd;
734				}
735			}
736			if (match & CPU_SEARCH_HIGHEST)
737				if (tdq->tdq_load >= hgroup.cs_limit &&
738				    tdq->tdq_transferable &&
739				    CPU_ISSET(cpu, &hgroup.cs_mask)) {
740					hgroup.cs_cpu = cpu;
741					hgroup.cs_load = load - rnd;
742				}
743		}
744		total += load;
745
746		/* We have info about child item. Compare it. */
747		if (match & CPU_SEARCH_LOWEST) {
748			if (lgroup.cs_cpu >= 0 &&
749			    (load < lload ||
750			     (load == lload && lgroup.cs_load < low->cs_load))) {
751				lload = load;
752				low->cs_cpu = lgroup.cs_cpu;
753				low->cs_load = lgroup.cs_load;
754			}
755		}
756		if (match & CPU_SEARCH_HIGHEST)
757			if (hgroup.cs_cpu >= 0 &&
758			    (load > hload ||
759			     (load == hload && hgroup.cs_load > high->cs_load))) {
760				hload = load;
761				high->cs_cpu = hgroup.cs_cpu;
762				high->cs_load = hgroup.cs_load;
763			}
764		if (child) {
765			i--;
766			if (i == 0 && CPU_EMPTY(&cpumask))
767				break;
768		}
769#ifndef HAVE_INLINE_FFSL
770		else
771			cpu--;
772#endif
773	}
774	return (total);
775}
776
777/*
778 * cpu_search instantiations must pass constants to maintain the inline
779 * optimization.
780 */
781int
782cpu_search_lowest(const struct cpu_group *cg, struct cpu_search *low)
783{
784	return cpu_search(cg, low, NULL, CPU_SEARCH_LOWEST);
785}
786
787int
788cpu_search_highest(const struct cpu_group *cg, struct cpu_search *high)
789{
790	return cpu_search(cg, NULL, high, CPU_SEARCH_HIGHEST);
791}
792
793int
794cpu_search_both(const struct cpu_group *cg, struct cpu_search *low,
795    struct cpu_search *high)
796{
797	return cpu_search(cg, low, high, CPU_SEARCH_BOTH);
798}
799
800/*
801 * Find the cpu with the least load via the least loaded path that has a
 * lowpri greater than pri.  A pri of -1 indicates any priority is
803 * acceptable.
804 */
805static inline int
806sched_lowest(const struct cpu_group *cg, cpuset_t mask, int pri, int maxload,
807    int prefer)
808{
809	struct cpu_search low;
810
811	low.cs_cpu = -1;
812	low.cs_prefer = prefer;
813	low.cs_mask = mask;
814	low.cs_pri = pri;
815	low.cs_limit = maxload;
816	cpu_search_lowest(cg, &low);
817	return low.cs_cpu;
818}
819
820/*
821 * Find the cpu with the highest load via the highest loaded path.
822 */
823static inline int
824sched_highest(const struct cpu_group *cg, cpuset_t mask, int minload)
825{
826	struct cpu_search high;
827
828	high.cs_cpu = -1;
829	high.cs_mask = mask;
830	high.cs_limit = minload;
831	cpu_search_highest(cg, &high);
832	return high.cs_cpu;
833}
834
835static void
836sched_balance_group(struct cpu_group *cg)
837{
838	struct tdq *tdq;
839	cpuset_t hmask, lmask;
840	int high, low, anylow;
841
842	CPU_FILL(&hmask);
843	for (;;) {
844		high = sched_highest(cg, hmask, 2);
		/* Stop if there is no more CPU with transferable threads. */
846		if (high == -1)
847			break;
848		CPU_CLR(high, &hmask);
849		CPU_COPY(&hmask, &lmask);
850		/* Stop if there is no more CPU left for low. */
851		if (CPU_EMPTY(&lmask))
852			break;
853		anylow = 1;
854		tdq = TDQ_CPU(high);
855nextlow:
856		low = sched_lowest(cg, lmask, -1, tdq->tdq_load - 1, high);
		/* Stop if we searched the whole group and found no less loaded CPU. */
858		if (anylow && low == -1)
859			break;
860		/* Go to next high if we found no less loaded CPU. */
861		if (low == -1)
862			continue;
863		/* Transfer thread from high to low. */
864		if (sched_balance_pair(tdq, TDQ_CPU(low))) {
865			/* CPU that got thread can no longer be a donor. */
866			CPU_CLR(low, &hmask);
867		} else {
868			/*
			 * If we failed, then there are no threads on high
			 * that can run on this low.  Drop low from the low
			 * mask and look for a different one.
872			 */
873			CPU_CLR(low, &lmask);
874			anylow = 0;
875			goto nextlow;
876		}
877	}
878}
879
880static void
881sched_balance(void)
882{
883	struct tdq *tdq;
884
885	balance_ticks = max(balance_interval / 2, 1) +
886	    (sched_random() % balance_interval);
887	tdq = TDQ_SELF();
888	TDQ_UNLOCK(tdq);
889	sched_balance_group(cpu_top);
890	TDQ_LOCK(tdq);
891}
892
893/*
894 * Lock two thread queues using their address to maintain lock order.
895 */
896static void
897tdq_lock_pair(struct tdq *one, struct tdq *two)
898{
899	if (one < two) {
900		TDQ_LOCK(one);
901		TDQ_LOCK_FLAGS(two, MTX_DUPOK);
902	} else {
903		TDQ_LOCK(two);
904		TDQ_LOCK_FLAGS(one, MTX_DUPOK);
905	}
906}
907
908/*
909 * Unlock two thread queues.  Order is not important here.
910 */
911static void
912tdq_unlock_pair(struct tdq *one, struct tdq *two)
913{
914	TDQ_UNLOCK(one);
915	TDQ_UNLOCK(two);
916}
917
918/*
919 * Transfer load between two imbalanced thread queues.
920 */
921static int
922sched_balance_pair(struct tdq *high, struct tdq *low)
923{
924	struct thread *td;
925	int cpu;
926
927	tdq_lock_pair(high, low);
928	td = NULL;
929	/*
930	 * Transfer a thread from high to low.
931	 */
932	if (high->tdq_transferable != 0 && high->tdq_load > low->tdq_load &&
933	    (td = tdq_move(high, low)) != NULL) {
934		/*
		 * In case the target isn't the current cpu, notify it of the
		 * new load, possibly sending an IPI to force it to reschedule.
937		 */
938		cpu = TDQ_ID(low);
939		if (cpu != PCPU_GET(cpuid))
940			tdq_notify(low, td);
941	}
942	tdq_unlock_pair(high, low);
943	return (td != NULL);
944}
945
946/*
947 * Move a thread from one thread queue to another.
948 */
949static struct thread *
950tdq_move(struct tdq *from, struct tdq *to)
951{
952	struct thread *td;
953	struct tdq *tdq;
954	int cpu;
955
956	TDQ_LOCK_ASSERT(from, MA_OWNED);
957	TDQ_LOCK_ASSERT(to, MA_OWNED);
958
959	tdq = from;
960	cpu = TDQ_ID(to);
961	td = tdq_steal(tdq, cpu);
962	if (td == NULL)
963		return (NULL);
964
965	/*
966	 * Although the run queue is locked the thread may be
967	 * blocked.  We can not set the lock until it is unblocked.
968	 */
969	thread_lock_block_wait(td);
970	sched_rem(td);
971	THREAD_LOCKPTR_ASSERT(td, TDQ_LOCKPTR(from));
972	td->td_lock = TDQ_LOCKPTR(to);
973	td_get_sched(td)->ts_cpu = cpu;
974	tdq_add(to, td, SRQ_YIELDING);
975
976	return (td);
977}
978
979/*
980 * This tdq has idled.  Try to steal a thread from another cpu and switch
981 * to it.
982 */
983static int
984tdq_idled(struct tdq *tdq)
985{
986	struct cpu_group *cg;
987	struct tdq *steal;
988	cpuset_t mask;
989	int cpu, switchcnt;
990
991	if (smp_started == 0 || steal_idle == 0 || tdq->tdq_cg == NULL)
992		return (1);
993	CPU_FILL(&mask);
994	CPU_CLR(PCPU_GET(cpuid), &mask);
995    restart:
996	switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt;
997	for (cg = tdq->tdq_cg; ; ) {
998		cpu = sched_highest(cg, mask, steal_thresh);
999		/*
1000		 * We were assigned a thread but not preempted.  Returning
1001		 * 0 here will cause our caller to switch to it.
1002		 */
1003		if (tdq->tdq_load)
1004			return (0);
1005		if (cpu == -1) {
1006			cg = cg->cg_parent;
1007			if (cg == NULL)
1008				return (1);
1009			continue;
1010		}
1011		steal = TDQ_CPU(cpu);
1012		/*
1013		 * The data returned by sched_highest() is stale and
1014		 * the chosen CPU no longer has an eligible thread.
1015		 *
1016		 * Testing this ahead of tdq_lock_pair() only catches
1017		 * this situation about 20% of the time on an 8 core
1018		 * 16 thread Ryzen 7, but it still helps performance.
1019		 */
1020		if (steal->tdq_load < steal_thresh ||
1021		    steal->tdq_transferable == 0)
1022			goto restart;
1023		tdq_lock_pair(tdq, steal);
1024		/*
1025		 * We were assigned a thread while waiting for the locks.
1026		 * Switch to it now instead of stealing a thread.
1027		 */
1028		if (tdq->tdq_load)
1029			break;
1030		/*
1031		 * The data returned by sched_highest() is stale and
1032		 * the chosen CPU no longer has an eligible thread, or
1033		 * we were preempted and the CPU loading info may be out
1034		 * of date.  The latter is rare.  In either case restart
1035		 * the search.
1036		 */
1037		if (steal->tdq_load < steal_thresh ||
1038		    steal->tdq_transferable == 0 ||
1039		    switchcnt != tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt) {
1040			tdq_unlock_pair(tdq, steal);
1041			goto restart;
1042		}
1043		/*
1044		 * Steal the thread and switch to it.
1045		 */
1046		if (tdq_move(steal, tdq) != NULL)
1047			break;
1048		/*
1049		 * We failed to acquire a thread even though it looked
1050		 * like one was available.  This could be due to affinity
1051		 * restrictions or for other reasons.  Loop again after
1052		 * removing this CPU from the set.  The restart logic
1053		 * above does not restore this CPU to the set due to the
		 * likelihood of failing here again.
1055		 */
1056		CPU_CLR(cpu, &mask);
1057		tdq_unlock_pair(tdq, steal);
1058	}
1059	TDQ_UNLOCK(steal);
1060	mi_switch(SW_VOL | SWT_IDLE);
1061	return (0);
1062}
1063
1064/*
1065 * Notify a remote cpu of new work.  Sends an IPI if criteria are met.
1066 */
1067static void
1068tdq_notify(struct tdq *tdq, struct thread *td)
1069{
1070	struct thread *ctd;
1071	int pri;
1072	int cpu;
1073
1074	if (tdq->tdq_owepreempt)
1075		return;
1076	cpu = td_get_sched(td)->ts_cpu;
1077	pri = td->td_priority;
1078	ctd = pcpu_find(cpu)->pc_curthread;
1079	if (!sched_shouldpreempt(pri, ctd->td_priority, 1))
1080		return;
1081
1082	/*
1083	 * Make sure that our caller's earlier update to tdq_load is
1084	 * globally visible before we read tdq_cpu_idle.  Idle thread
1085	 * accesses both of them without locks, and the order is important.
1086	 */
1087	atomic_thread_fence_seq_cst();
1088
1089	if (TD_IS_IDLETHREAD(ctd)) {
1090		/*
1091		 * If the MD code has an idle wakeup routine try that before
1092		 * falling back to IPI.
1093		 */
1094		if (!tdq->tdq_cpu_idle || cpu_idle_wakeup(cpu))
1095			return;
1096	}
1097
1098	/*
1099	 * The run queues have been updated, so any switch on the remote CPU
1100	 * will satisfy the preemption request.
1101	 */
1102	tdq->tdq_owepreempt = 1;
1103	ipi_cpu(cpu, IPI_PREEMPT);
1104}
1105
1106/*
1107 * Steals load from a timeshare queue.  Honors the rotating queue head
1108 * index.
1109 */
1110static struct thread *
1111runq_steal_from(struct runq *rq, int cpu, u_char start)
1112{
1113	struct rqbits *rqb;
1114	struct rqhead *rqh;
1115	struct thread *td, *first;
1116	int bit;
1117	int i;
1118
1119	rqb = &rq->rq_status;
	bit = start & (RQB_BPW - 1);
1121	first = NULL;
1122again:
1123	for (i = RQB_WORD(start); i < RQB_LEN; bit = 0, i++) {
1124		if (rqb->rqb_bits[i] == 0)
1125			continue;
1126		if (bit == 0)
1127			bit = RQB_FFS(rqb->rqb_bits[i]);
1128		for (; bit < RQB_BPW; bit++) {
1129			if ((rqb->rqb_bits[i] & (1ul << bit)) == 0)
1130				continue;
1131			rqh = &rq->rq_queues[bit + (i << RQB_L2BPW)];
1132			TAILQ_FOREACH(td, rqh, td_runq) {
1133				if (first && THREAD_CAN_MIGRATE(td) &&
1134				    THREAD_CAN_SCHED(td, cpu))
1135					return (td);
1136				first = td;
1137			}
1138		}
1139	}
1140	if (start != 0) {
1141		start = 0;
1142		goto again;
1143	}
1144
1145	if (first && THREAD_CAN_MIGRATE(first) &&
1146	    THREAD_CAN_SCHED(first, cpu))
1147		return (first);
1148	return (NULL);
1149}
1150
1151/*
1152 * Steals load from a standard linear queue.
1153 */
1154static struct thread *
1155runq_steal(struct runq *rq, int cpu)
1156{
1157	struct rqhead *rqh;
1158	struct rqbits *rqb;
1159	struct thread *td;
1160	int word;
1161	int bit;
1162
1163	rqb = &rq->rq_status;
1164	for (word = 0; word < RQB_LEN; word++) {
1165		if (rqb->rqb_bits[word] == 0)
1166			continue;
1167		for (bit = 0; bit < RQB_BPW; bit++) {
1168			if ((rqb->rqb_bits[word] & (1ul << bit)) == 0)
1169				continue;
1170			rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)];
1171			TAILQ_FOREACH(td, rqh, td_runq)
1172				if (THREAD_CAN_MIGRATE(td) &&
1173				    THREAD_CAN_SCHED(td, cpu))
1174					return (td);
1175		}
1176	}
1177	return (NULL);
1178}
1179
1180/*
1181 * Attempt to steal a thread in priority order from a thread queue.
1182 */
1183static struct thread *
1184tdq_steal(struct tdq *tdq, int cpu)
1185{
1186	struct thread *td;
1187
1188	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
1189	if ((td = runq_steal(&tdq->tdq_realtime, cpu)) != NULL)
1190		return (td);
1191	if ((td = runq_steal_from(&tdq->tdq_timeshare,
1192	    cpu, tdq->tdq_ridx)) != NULL)
1193		return (td);
1194	return (runq_steal(&tdq->tdq_idle, cpu));
1195}
1196
1197/*
1198 * Sets the thread lock and ts_cpu to match the requested cpu.  Unlocks the
1199 * current lock and returns with the assigned queue locked.
1200 */
1201static inline struct tdq *
1202sched_setcpu(struct thread *td, int cpu, int flags)
1203{
1204
1205	struct tdq *tdq;
1206	struct mtx *mtx;
1207
1208	THREAD_LOCK_ASSERT(td, MA_OWNED);
1209	tdq = TDQ_CPU(cpu);
1210	td_get_sched(td)->ts_cpu = cpu;
1211	/*
1212	 * If the lock matches just return the queue.
1213	 */
1214	if (td->td_lock == TDQ_LOCKPTR(tdq)) {
1215		KASSERT((flags & SRQ_HOLD) == 0,
1216		    ("sched_setcpu: Invalid lock for SRQ_HOLD"));
1217		return (tdq);
1218	}
1219
1220	/*
1221	 * The hard case, migration, we need to block the thread first to
1222	 * prevent order reversals with other cpus locks.
1223	 */
1224	spinlock_enter();
1225	mtx = thread_lock_block(td);
1226	if ((flags & SRQ_HOLD) == 0)
1227		mtx_unlock_spin(mtx);
1228	TDQ_LOCK(tdq);
1229	thread_lock_unblock(td, TDQ_LOCKPTR(tdq));
1230	spinlock_exit();
1231	return (tdq);
1232}
1233
1234SCHED_STAT_DEFINE(pickcpu_intrbind, "Soft interrupt binding");
1235SCHED_STAT_DEFINE(pickcpu_idle_affinity, "Picked idle cpu based on affinity");
1236SCHED_STAT_DEFINE(pickcpu_affinity, "Picked cpu based on affinity");
1237SCHED_STAT_DEFINE(pickcpu_lowest, "Selected lowest load");
1238SCHED_STAT_DEFINE(pickcpu_local, "Migrated to current cpu");
1239SCHED_STAT_DEFINE(pickcpu_migration, "Selection may have caused migration");
1240
1241static int
1242sched_pickcpu(struct thread *td, int flags)
1243{
1244	struct cpu_group *cg, *ccg;
1245	struct td_sched *ts;
1246	struct tdq *tdq;
1247	cpuset_t mask;
1248	int cpu, pri, self, intr;
1249
1250	self = PCPU_GET(cpuid);
1251	ts = td_get_sched(td);
1252	KASSERT(!CPU_ABSENT(ts->ts_cpu), ("sched_pickcpu: Start scheduler on "
1253	    "absent CPU %d for thread %s.", ts->ts_cpu, td->td_name));
1254	if (smp_started == 0)
1255		return (self);
1256	/*
1257	 * Don't migrate a running thread from sched_switch().
1258	 */
1259	if ((flags & SRQ_OURSELF) || !THREAD_CAN_MIGRATE(td))
1260		return (ts->ts_cpu);
1261	/*
1262	 * Prefer to run interrupt threads on the processors that generate
1263	 * the interrupt.
1264	 */
1265	if (td->td_priority <= PRI_MAX_ITHD && THREAD_CAN_SCHED(td, self) &&
1266	    curthread->td_intr_nesting_level) {
1267		tdq = TDQ_SELF();
1268		if (tdq->tdq_lowpri >= PRI_MIN_IDLE) {
1269			SCHED_STAT_INC(pickcpu_idle_affinity);
1270			return (self);
1271		}
1272		ts->ts_cpu = self;
1273		intr = 1;
1274		cg = tdq->tdq_cg;
1275		goto llc;
1276	} else {
1277		intr = 0;
1278		tdq = TDQ_CPU(ts->ts_cpu);
1279		cg = tdq->tdq_cg;
1280	}
1281	/*
1282	 * If the thread can run on the last cpu and the affinity has not
1283	 * expired and it is idle, run it there.
1284	 */
1285	if (THREAD_CAN_SCHED(td, ts->ts_cpu) &&
1286	    tdq->tdq_lowpri >= PRI_MIN_IDLE &&
1287	    SCHED_AFFINITY(ts, CG_SHARE_L2)) {
1288		if (cg->cg_flags & CG_FLAG_THREAD) {
1289			/* Check all SMT threads for being idle. */
1290			for (cpu = CPU_FFS(&cg->cg_mask) - 1; ; cpu++) {
1291				if (CPU_ISSET(cpu, &cg->cg_mask) &&
1292				    TDQ_CPU(cpu)->tdq_lowpri < PRI_MIN_IDLE)
1293					break;
1294				if (cpu >= mp_maxid) {
1295					SCHED_STAT_INC(pickcpu_idle_affinity);
1296					return (ts->ts_cpu);
1297				}
1298			}
1299		} else {
1300			SCHED_STAT_INC(pickcpu_idle_affinity);
1301			return (ts->ts_cpu);
1302		}
1303	}
1304llc:
1305	/*
1306	 * Search for the last level cache CPU group in the tree.
1307	 * Skip SMT, identical groups and caches with expired affinity.
 * Interrupt threads' affinity is explicit and never expires.
1309	 */
1310	for (ccg = NULL; cg != NULL; cg = cg->cg_parent) {
1311		if (cg->cg_flags & CG_FLAG_THREAD)
1312			continue;
1313		if (cg->cg_children == 1 || cg->cg_count == 1)
1314			continue;
1315		if (cg->cg_level == CG_SHARE_NONE ||
1316		    (!intr && !SCHED_AFFINITY(ts, cg->cg_level)))
1317			continue;
1318		ccg = cg;
1319	}
1320	/* Found LLC shared by all CPUs, so do a global search. */
1321	if (ccg == cpu_top)
1322		ccg = NULL;
1323	cpu = -1;
1324	mask = td->td_cpuset->cs_mask;
1325	pri = td->td_priority;
1326	/*
	 * Try hard to keep interrupts within the found LLC.  Search the LLC for
1328	 * the least loaded CPU we can run now.  For NUMA systems it should
1329	 * be within target domain, and it also reduces scheduling overhead.
1330	 */
1331	if (ccg != NULL && intr) {
1332		cpu = sched_lowest(ccg, mask, pri, INT_MAX, ts->ts_cpu);
1333		if (cpu >= 0)
1334			SCHED_STAT_INC(pickcpu_intrbind);
1335	} else
1336	/* Search the LLC for the least loaded idle CPU we can run now. */
1337	if (ccg != NULL) {
1338		cpu = sched_lowest(ccg, mask, max(pri, PRI_MAX_TIMESHARE),
1339		    INT_MAX, ts->ts_cpu);
1340		if (cpu >= 0)
1341			SCHED_STAT_INC(pickcpu_affinity);
1342	}
1343	/* Search globally for the least loaded CPU we can run now. */
1344	if (cpu < 0) {
1345		cpu = sched_lowest(cpu_top, mask, pri, INT_MAX, ts->ts_cpu);
1346		if (cpu >= 0)
1347			SCHED_STAT_INC(pickcpu_lowest);
1348	}
1349	/* Search globally for the least loaded CPU. */
1350	if (cpu < 0) {
1351		cpu = sched_lowest(cpu_top, mask, -1, INT_MAX, ts->ts_cpu);
1352		if (cpu >= 0)
1353			SCHED_STAT_INC(pickcpu_lowest);
1354	}
1355	KASSERT(cpu >= 0, ("sched_pickcpu: Failed to find a cpu."));
1356	KASSERT(!CPU_ABSENT(cpu), ("sched_pickcpu: Picked absent CPU %d.", cpu));
1357	/*
1358	 * Compare the lowest loaded cpu to current cpu.
1359	 */
1360	tdq = TDQ_CPU(cpu);
1361	if (THREAD_CAN_SCHED(td, self) && TDQ_SELF()->tdq_lowpri > pri &&
1362	    tdq->tdq_lowpri < PRI_MIN_IDLE &&
1363	    TDQ_SELF()->tdq_load <= tdq->tdq_load + 1) {
1364		SCHED_STAT_INC(pickcpu_local);
1365		cpu = self;
1366	}
1367	if (cpu != ts->ts_cpu)
1368		SCHED_STAT_INC(pickcpu_migration);
1369	return (cpu);
1370}
1371#endif
1372
1373/*
1374 * Pick the highest priority task we have and return it.
1375 */
1376static struct thread *
1377tdq_choose(struct tdq *tdq)
1378{
1379	struct thread *td;
1380
1381	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
1382	td = runq_choose(&tdq->tdq_realtime);
1383	if (td != NULL)
1384		return (td);
1385	td = runq_choose_from(&tdq->tdq_timeshare, tdq->tdq_ridx);
1386	if (td != NULL) {
1387		KASSERT(td->td_priority >= PRI_MIN_BATCH,
1388		    ("tdq_choose: Invalid priority on timeshare queue %d",
1389		    td->td_priority));
1390		return (td);
1391	}
1392	td = runq_choose(&tdq->tdq_idle);
1393	if (td != NULL) {
1394		KASSERT(td->td_priority >= PRI_MIN_IDLE,
1395		    ("tdq_choose: Invalid priority on idle queue %d",
1396		    td->td_priority));
1397		return (td);
1398	}
1399
1400	return (NULL);
1401}
1402
1403/*
1404 * Initialize a thread queue.
1405 */
1406static void
1407tdq_setup(struct tdq *tdq, int id)
1408{
1409
1410	if (bootverbose)
1411		printf("ULE: setup cpu %d\n", id);
1412	runq_init(&tdq->tdq_realtime);
1413	runq_init(&tdq->tdq_timeshare);
1414	runq_init(&tdq->tdq_idle);
1415	tdq->tdq_id = id;
1416	snprintf(tdq->tdq_name, sizeof(tdq->tdq_name),
1417	    "sched lock %d", (int)TDQ_ID(tdq));
1418	mtx_init(&tdq->tdq_lock, tdq->tdq_name, "sched lock", MTX_SPIN);
1419#ifdef KTR
1420	snprintf(tdq->tdq_loadname, sizeof(tdq->tdq_loadname),
1421	    "CPU %d load", (int)TDQ_ID(tdq));
1422#endif
1423}
1424
1425#ifdef SMP
1426static void
1427sched_setup_smp(void)
1428{
1429	struct tdq *tdq;
1430	int i;
1431
1432	cpu_top = smp_topo();
1433	CPU_FOREACH(i) {
1434		tdq = DPCPU_ID_PTR(i, tdq);
1435		tdq_setup(tdq, i);
1436		tdq->tdq_cg = smp_topo_find(cpu_top, i);
1437		if (tdq->tdq_cg == NULL)
1438			panic("Can't find cpu group for %d\n", i);
1439	}
1440	PCPU_SET(sched, DPCPU_PTR(tdq));
1441	balance_tdq = TDQ_SELF();
1442}
1443#endif
1444
1445/*
1446 * Setup the thread queues and initialize the topology based on MD
1447 * information.
1448 */
1449static void
1450sched_setup(void *dummy)
1451{
1452	struct tdq *tdq;
1453
1454#ifdef SMP
1455	sched_setup_smp();
1456#else
1457	tdq_setup(TDQ_SELF(), 0);
1458#endif
1459	tdq = TDQ_SELF();
1460
1461	/* Add thread0's load since it's running. */
1462	TDQ_LOCK(tdq);
1463	thread0.td_lock = TDQ_LOCKPTR(tdq);
1464	tdq_load_add(tdq, &thread0);
1465	tdq->tdq_lowpri = thread0.td_priority;
1466	TDQ_UNLOCK(tdq);
1467}
1468
1469/*
1470 * This routine determines time constants after stathz and hz are setup.
1471 */
1472/* ARGSUSED */
1473static void
1474sched_initticks(void *dummy)
1475{
1476	int incr;
1477
1478	realstathz = stathz ? stathz : hz;
1479	sched_slice = realstathz / SCHED_SLICE_DEFAULT_DIVISOR;
1480	sched_slice_min = sched_slice / SCHED_SLICE_MIN_DIVISOR;
1481	hogticks = imax(1, (2 * hz * sched_slice + realstathz / 2) /
1482	    realstathz);
1483
1484	/*
1485	 * tickincr is shifted out by 10 to avoid rounding errors due to
1486	 * hz not being evenly divisible by stathz on all platforms.
1487	 */
1488	incr = (hz << SCHED_TICK_SHIFT) / realstathz;
1489	/*
1490	 * This does not work for values of stathz that are more than
	 * (1 << SCHED_TICK_SHIFT) * hz.  In practice this does not happen.
1492	 */
1493	if (incr == 0)
1494		incr = 1;
1495	tickincr = incr;
1496#ifdef SMP
1497	/*
1498	 * Set the default balance interval now that we know
1499	 * what realstathz is.
1500	 */
1501	balance_interval = realstathz;
1502	balance_ticks = balance_interval;
1503	affinity = SCHED_AFFINITY_DEFAULT;
1504#endif
1505	if (sched_idlespinthresh < 0)
1506		sched_idlespinthresh = 2 * max(10000, 6 * hz) / realstathz;
1507}
1508
1509/*
1510 * This is the core of the interactivity algorithm.  Determines a score based
1511 * on past behavior.  It is the ratio of sleep time to run time scaled to
1512 * a [0, 100] integer.  This is the voluntary sleep time of a process, which
1513 * differs from the cpu usage because it does not account for time spent
1514 * waiting on a run-queue.  Would be prettier if we had floating point.
1515 *
1516 * When a thread's sleep time is greater than its run time the
1517 * calculation is:
1518 *
1519 *                           scaling factor
1520 * interactivity score =  ---------------------
1521 *                        sleep time / run time
1522 *
1523 *
1524 * When a thread's run time is greater than its sleep time the
1525 * calculation is:
1526 *
1527 *                           scaling factor
1528 * interactivity score =  ---------------------    + scaling factor
1529 *                        run time / sleep time
1530 */
1531static int
1532sched_interact_score(struct thread *td)
1533{
1534	struct td_sched *ts;
1535	int div;
1536
1537	ts = td_get_sched(td);
1538	/*
1539	 * The score is only needed if this is likely to be an interactive
1540	 * task.  Don't go through the expense of computing it if there's
1541	 * no chance.
1542	 */
1543	if (sched_interact <= SCHED_INTERACT_HALF &&
1544		ts->ts_runtime >= ts->ts_slptime)
1545			return (SCHED_INTERACT_HALF);
1546
1547	if (ts->ts_runtime > ts->ts_slptime) {
1548		div = max(1, ts->ts_runtime / SCHED_INTERACT_HALF);
1549		return (SCHED_INTERACT_HALF +
1550		    (SCHED_INTERACT_HALF - (ts->ts_slptime / div)));
1551	}
1552	if (ts->ts_slptime > ts->ts_runtime) {
1553		div = max(1, ts->ts_slptime / SCHED_INTERACT_HALF);
1554		return (ts->ts_runtime / div);
1555	}
1556	/* runtime == slptime */
1557	if (ts->ts_runtime)
1558		return (SCHED_INTERACT_HALF);
1559
1560	/*
1561	 * This can happen if slptime and runtime are 0.
1562	 */
1563	return (0);
1564
1565}
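
/*
 * Illustrative example (added commentary): a thread that has slept three
 * times as long as it has run scores roughly
 * SCHED_INTERACT_HALF * runtime / slptime = 50 / 3 ~= 16, well under the
 * default SCHED_INTERACT_THRESH of 30, so it is treated as interactive.
 * A hog that runs three times as long as it sleeps scores roughly 83 and
 * stays on the batch queue.
 */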
1566
1567/*
1568 * Scale the scheduling priority according to the "interactivity" of this
1569 * process.
1570 */
1571static void
1572sched_priority(struct thread *td)
1573{
1574	int score;
1575	int pri;
1576
1577	if (PRI_BASE(td->td_pri_class) != PRI_TIMESHARE)
1578		return;
1579	/*
1580	 * If the score is interactive we place the thread in the realtime
1581	 * queue with a priority that is less than kernel and interrupt
1582	 * priorities.  These threads are not subject to nice restrictions.
1583	 *
1584	 * Scores greater than this are placed on the normal timeshare queue
1585	 * where the priority is partially decided by the most recent cpu
1586	 * utilization and the rest is decided by nice value.
1587	 *
1588	 * The nice value of the process has a linear effect on the calculated
1589	 * score.  Negative nice values make it easier for a thread to be
1590	 * considered interactive.
1591	 */
1592	score = imax(0, sched_interact_score(td) + td->td_proc->p_nice);
1593	if (score < sched_interact) {
1594		pri = PRI_MIN_INTERACT;
1595		pri += ((PRI_MAX_INTERACT - PRI_MIN_INTERACT + 1) /
1596		    sched_interact) * score;
1597		KASSERT(pri >= PRI_MIN_INTERACT && pri <= PRI_MAX_INTERACT,
1598		    ("sched_priority: invalid interactive priority %d score %d",
1599		    pri, score));
1600	} else {
1601		pri = SCHED_PRI_MIN;
1602		if (td_get_sched(td)->ts_ticks)
1603			pri += min(SCHED_PRI_TICKS(td_get_sched(td)),
1604			    SCHED_PRI_RANGE - 1);
1605		pri += SCHED_PRI_NICE(td->td_proc->p_nice);
1606		KASSERT(pri >= PRI_MIN_BATCH && pri <= PRI_MAX_BATCH,
1607		    ("sched_priority: invalid priority %d: nice %d, "
1608		    "ticks %d ftick %d ltick %d tick pri %d",
1609		    pri, td->td_proc->p_nice, td_get_sched(td)->ts_ticks,
1610		    td_get_sched(td)->ts_ftick, td_get_sched(td)->ts_ltick,
1611		    SCHED_PRI_TICKS(td_get_sched(td))));
1612	}
1613	sched_user_prio(td, pri);
1614
1615	return;
1616}
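
/*
 * Illustrative example (added commentary): with the default
 * sched_interact of 30, a nice-0 thread scoring 15 gets
 * pri = PRI_MIN_INTERACT +
 *     ((PRI_MAX_INTERACT - PRI_MIN_INTERACT + 1) / 30) * 15,
 * about the middle of the interactive range, while a score of 30 or more
 * falls through to the batch case where SCHED_PRI_TICKS() and the nice
 * value decide the final priority.
 */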
1617
1618/*
1619 * This routine enforces a maximum limit on the amount of scheduling history
1620 * kept.  It is called after either the slptime or runtime is adjusted.  This
1621 * function is ugly due to integer math.
1622 */
1623static void
1624sched_interact_update(struct thread *td)
1625{
1626	struct td_sched *ts;
1627	u_int sum;
1628
1629	ts = td_get_sched(td);
1630	sum = ts->ts_runtime + ts->ts_slptime;
1631	if (sum < SCHED_SLP_RUN_MAX)
1632		return;
1633	/*
1634	 * This only happens from two places:
1635	 * 1) We have added an unusual amount of run time from fork_exit.
1636	 * 2) We have added an unusual amount of sleep time from sched_sleep().
1637	 */
1638	if (sum > SCHED_SLP_RUN_MAX * 2) {
1639		if (ts->ts_runtime > ts->ts_slptime) {
1640			ts->ts_runtime = SCHED_SLP_RUN_MAX;
1641			ts->ts_slptime = 1;
1642		} else {
1643			ts->ts_slptime = SCHED_SLP_RUN_MAX;
1644			ts->ts_runtime = 1;
1645		}
1646		return;
1647	}
1648	/*
1649	 * If we have exceeded by more than 1/5th then the algorithm below
1650	 * will not bring us back into range.  Dividing by two here forces
	 * us into the range of roughly [3/5 * SCHED_SLP_RUN_MAX, SCHED_SLP_RUN_MAX].
1652	 */
1653	if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) {
1654		ts->ts_runtime /= 2;
1655		ts->ts_slptime /= 2;
1656		return;
1657	}
1658	ts->ts_runtime = (ts->ts_runtime / 5) * 4;
1659	ts->ts_slptime = (ts->ts_slptime / 5) * 4;
1660}
1661
1662/*
1663 * Scale back the interactivity history when a child thread is created.  The
1664 * history is inherited from the parent but the thread may behave totally
1665 * differently.  For example, a shell spawning a compiler process.  We want
1666 * to learn that the compiler is behaving badly very quickly.
1667 */
1668static void
1669sched_interact_fork(struct thread *td)
1670{
1671	struct td_sched *ts;
1672	int ratio;
1673	int sum;
1674
1675	ts = td_get_sched(td);
1676	sum = ts->ts_runtime + ts->ts_slptime;
1677	if (sum > SCHED_SLP_RUN_FORK) {
1678		ratio = sum / SCHED_SLP_RUN_FORK;
1679		ts->ts_runtime /= ratio;
1680		ts->ts_slptime /= ratio;
1681	}
1682}
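
/*
 * Illustrative example (added commentary; hz = 1000 assumed): a child
 * inheriting two seconds of combined sleep + run history (four times
 * SCHED_SLP_RUN_FORK) has ts_runtime and ts_slptime both divided by 4,
 * so only half a second of parental history survives the fork.
 */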
1683
1684/*
1685 * Called from proc0_init() to setup the scheduler fields.
1686 */
1687void
1688schedinit(void)
1689{
1690	struct td_sched *ts0;
1691
1692	/*
1693	 * Set up the scheduler specific parts of thread0.
1694	 */
1695	ts0 = td_get_sched(&thread0);
1696	ts0->ts_ltick = ticks;
1697	ts0->ts_ftick = ticks;
1698	ts0->ts_slice = 0;
1699	ts0->ts_cpu = curcpu;	/* set valid CPU number */
1700}
1701
1702/*
 * This is only somewhat accurate: given many processes of the same
 * priority, they will switch when their slices run out, which will be
 * at most sched_slice stathz ticks.
1706 */
1707int
1708sched_rr_interval(void)
1709{
1710
1711	/* Convert sched_slice from stathz to hz. */
1712	return (imax(1, (sched_slice * hz + realstathz / 2) / realstathz));
1713}
1714
1715/*
1716 * Update the percent cpu tracking information when it is requested or
1717 * the total history exceeds the maximum.  We keep a sliding history of
1718 * tick counts that slowly decays.  This is less precise than the 4BSD
 * mechanism since it is driven by less regular and less frequent events.
1720 */
1721static void
1722sched_pctcpu_update(struct td_sched *ts, int run)
1723{
1724	int t = ticks;
1725
1726	/*
1727	 * The signed difference may be negative if the thread hasn't run for
1728	 * over half of the ticks rollover period.
1729	 */
1730	if ((u_int)(t - ts->ts_ltick) >= SCHED_TICK_TARG) {
1731		ts->ts_ticks = 0;
1732		ts->ts_ftick = t - SCHED_TICK_TARG;
1733	} else if (t - ts->ts_ftick >= SCHED_TICK_MAX) {
1734		ts->ts_ticks = (ts->ts_ticks / (ts->ts_ltick - ts->ts_ftick)) *
1735		    (ts->ts_ltick - (t - SCHED_TICK_TARG));
1736		ts->ts_ftick = t - SCHED_TICK_TARG;
1737	}
1738	if (run)
1739		ts->ts_ticks += (t - ts->ts_ltick) << SCHED_TICK_SHIFT;
1740	ts->ts_ltick = t;
1741}
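
/*
 * Added commentary on the update above: if the thread has not run at all
 * during the last SCHED_TICK_TARG ticks its history is simply zeroed;
 * otherwise, once the window grows past SCHED_TICK_MAX, the accumulated
 * ticks are scaled by the fraction of the old window that still overlaps
 * the new SCHED_TICK_TARG-sized window, which gives the slow decay
 * described above.
 */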
1742
1743/*
1744 * Adjust the priority of a thread.  Move it to the appropriate run-queue
1745 * if necessary.  This is the back-end for several priority related
1746 * functions.
1747 */
1748static void
1749sched_thread_priority(struct thread *td, u_char prio)
1750{
1751	struct td_sched *ts;
1752	struct tdq *tdq;
1753	int oldpri;
1754
1755	KTR_POINT3(KTR_SCHED, "thread", sched_tdname(td), "prio",
1756	    "prio:%d", td->td_priority, "new prio:%d", prio,
1757	    KTR_ATTR_LINKED, sched_tdname(curthread));
1758	SDT_PROBE3(sched, , , change__pri, td, td->td_proc, prio);
1759	if (td != curthread && prio < td->td_priority) {
1760		KTR_POINT3(KTR_SCHED, "thread", sched_tdname(curthread),
1761		    "lend prio", "prio:%d", td->td_priority, "new prio:%d",
1762		    prio, KTR_ATTR_LINKED, sched_tdname(td));
1763		SDT_PROBE4(sched, , , lend__pri, td, td->td_proc, prio,
1764		    curthread);
1765	}
1766	ts = td_get_sched(td);
1767	THREAD_LOCK_ASSERT(td, MA_OWNED);
1768	if (td->td_priority == prio)
1769		return;
1770	/*
1771	 * If the priority has been elevated due to priority
1772	 * propagation, we may have to move ourselves to a new
1773	 * queue.  This could be optimized to not re-add in some
1774	 * cases.
1775	 */
1776	if (TD_ON_RUNQ(td) && prio < td->td_priority) {
1777		sched_rem(td);
1778		td->td_priority = prio;
1779		sched_add(td, SRQ_BORROWING | SRQ_HOLDTD);
1780		return;
1781	}
1782	/*
1783	 * If the thread is currently running we may have to adjust the lowpri
1784	 * information so other cpus are aware of our current priority.
1785	 */
1786	if (TD_IS_RUNNING(td)) {
1787		tdq = TDQ_CPU(ts->ts_cpu);
1788		oldpri = td->td_priority;
1789		td->td_priority = prio;
1790		if (prio < tdq->tdq_lowpri)
1791			tdq->tdq_lowpri = prio;
1792		else if (tdq->tdq_lowpri == oldpri)
1793			tdq_setlowpri(tdq, td);
1794		return;
1795	}
1796	td->td_priority = prio;
1797}
1798
1799/*
1800 * Update a thread's priority when it is lent another thread's
1801 * priority.
1802 */
1803void
1804sched_lend_prio(struct thread *td, u_char prio)
1805{
1806
1807	td->td_flags |= TDF_BORROWING;
1808	sched_thread_priority(td, prio);
1809}
1810
1811/*
1812 * Restore a thread's priority when priority propagation is
1813 * over.  The prio argument is the minimum priority the thread
1814 * needs to have to satisfy other possible priority lending
1815 * requests.  If the thread's regular priority is less
1816 * important than prio, the thread will keep a priority boost
1817 * of prio.
1818 */
1819void
1820sched_unlend_prio(struct thread *td, u_char prio)
1821{
1822	u_char base_pri;
1823
1824	if (td->td_base_pri >= PRI_MIN_TIMESHARE &&
1825	    td->td_base_pri <= PRI_MAX_TIMESHARE)
1826		base_pri = td->td_user_pri;
1827	else
1828		base_pri = td->td_base_pri;
1829	if (prio >= base_pri) {
1830		td->td_flags &= ~TDF_BORROWING;
1831		sched_thread_priority(td, base_pri);
1832	} else
1833		sched_lend_prio(td, prio);
1834}
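
/*
 * Example with hypothetical priority values: a timeshare thread whose
 * base (user) priority is 120 and which is still needed by a lender
 * requiring priority 100 keeps the boost, since 100 < 120.  Once the
 * remaining requests only need priority 130 or weaker, 130 >= 120, so the
 * thread drops back to 120 and TDF_BORROWING is cleared.
 */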
1835
1836/*
1837 * Standard entry for setting the priority to an absolute value.
1838 */
1839void
1840sched_prio(struct thread *td, u_char prio)
1841{
1842	u_char oldprio;
1843
1844	/* First, update the base priority. */
1845	td->td_base_pri = prio;
1846
1847	/*
1848	 * If the thread is borrowing another thread's priority, don't
1849	 * ever lower the priority.
1850	 */
1851	if (td->td_flags & TDF_BORROWING && td->td_priority < prio)
1852		return;
1853
1854	/* Change the real priority. */
1855	oldprio = td->td_priority;
1856	sched_thread_priority(td, prio);
1857
1858	/*
1859	 * If the thread is on a turnstile, then let the turnstile update
1860	 * its state.
1861	 */
1862	if (TD_ON_LOCK(td) && oldprio != prio)
1863		turnstile_adjust(td, oldprio);
1864}
1865
1866/*
1867 * Set the base user priority; this does not affect the current running priority.
1868 */
1869void
1870sched_user_prio(struct thread *td, u_char prio)
1871{
1872
1873	td->td_base_user_pri = prio;
1874	if (td->td_lend_user_pri <= prio)
1875		return;
1876	td->td_user_pri = prio;
1877}
1878
1879void
1880sched_lend_user_prio(struct thread *td, u_char prio)
1881{
1882
1883	THREAD_LOCK_ASSERT(td, MA_OWNED);
1884	td->td_lend_user_pri = prio;
1885	td->td_user_pri = min(prio, td->td_base_user_pri);
1886	if (td->td_priority > td->td_user_pri)
1887		sched_prio(td, td->td_user_pri);
1888	else if (td->td_priority != td->td_user_pri)
1889		td->td_flags |= TDF_NEEDRESCHED;
1890}
1891
1892/*
1893 * Like the above but first check if there is anything to do.
1894 */
1895void
1896sched_lend_user_prio_cond(struct thread *td, u_char prio)
1897{
1898
1899	if (td->td_lend_user_pri != prio)
1900		goto lend;
1901	if (td->td_user_pri != min(prio, td->td_base_user_pri))
1902		goto lend;
1903	if (td->td_priority != td->td_user_pri)
1904		goto lend;
1905	return;
1906
1907lend:
1908	thread_lock(td);
1909	sched_lend_user_prio(td, prio);
1910	thread_unlock(td);
1911}
1912
1913#ifdef SMP
1914/*
1915 * This tdq is about to idle.  Try to steal a thread from another CPU before
1916 * choosing the idle thread.
1917 */
1918static void
1919tdq_trysteal(struct tdq *tdq)
1920{
1921	struct cpu_group *cg;
1922	struct tdq *steal;
1923	cpuset_t mask;
1924	int cpu, i;
1925
1926	if (smp_started == 0 || trysteal_limit == 0 || tdq->tdq_cg == NULL)
1927		return;
1928	CPU_FILL(&mask);
1929	CPU_CLR(PCPU_GET(cpuid), &mask);
1930	/* We don't want to be preempted while we're iterating. */
1931	spinlock_enter();
1932	TDQ_UNLOCK(tdq);
1933	for (i = 1, cg = tdq->tdq_cg; ; ) {
1934		cpu = sched_highest(cg, mask, steal_thresh);
1935		/*
1936		 * If a thread was added while interrupts were disabled, don't
1937		 * steal one here.
1938		 */
1939		if (tdq->tdq_load > 0) {
1940			TDQ_LOCK(tdq);
1941			break;
1942		}
1943		if (cpu == -1) {
1944			i++;
1945			cg = cg->cg_parent;
1946			if (cg == NULL || i > trysteal_limit) {
1947				TDQ_LOCK(tdq);
1948				break;
1949			}
1950			continue;
1951		}
1952		steal = TDQ_CPU(cpu);
1953		/*
1954		 * The data returned by sched_highest() may be stale;
1955		 * if the chosen CPU no longer has an eligible thread, try again.
1956		 */
1957		if (steal->tdq_load < steal_thresh ||
1958		    steal->tdq_transferable == 0)
1959			continue;
1960		tdq_lock_pair(tdq, steal);
1961		/*
1962		 * If we get to this point, unconditionally exit the loop
1963		 * to bound the time spent in the critical section.
1964		 *
1965		 * If a thread was added while interrupts were disabled, don't
1966		 * steal one here.
1967		 */
1968		if (tdq->tdq_load > 0) {
1969			TDQ_UNLOCK(steal);
1970			break;
1971		}
1972		/*
1973		 * The data returned by sched_highest() may be stale;
1974		 * if the chosen CPU no longer has an eligible thread, bail out.
1975		 */
1976		if (steal->tdq_load < steal_thresh ||
1977		    steal->tdq_transferable == 0) {
1978			TDQ_UNLOCK(steal);
1979			break;
1980		}
1981		/*
1982		 * If we fail to acquire one due to affinity restrictions,
1983		 * bail out and let the idle thread do a more complete search
1984		 * outside of a critical section.
1985		 */
1986		if (tdq_move(steal, tdq) == NULL) {
1987			TDQ_UNLOCK(steal);
1988			break;
1989		}
1990		TDQ_UNLOCK(steal);
1991		break;
1992	}
1993	spinlock_exit();
1994}
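
/*
 * Note on the walk above: i counts topology levels, so with the assumed
 * default of trysteal_limit = 2 the search covers the CPU's own group
 * (tdq_cg, e.g. SMT siblings or a shared-cache level) and at most one
 * parent group before giving up and leaving the full search to the idle
 * thread in tdq_idled().
 */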
1995#endif
1996
1997/*
1998 * Handle migration from sched_switch().  This happens only for
1999 * cpu binding.
2000 */
2001static struct mtx *
2002sched_switch_migrate(struct tdq *tdq, struct thread *td, int flags)
2003{
2004	struct tdq *tdn;
2005
2006	KASSERT(THREAD_CAN_MIGRATE(td) ||
2007	    (td_get_sched(td)->ts_flags & TSF_BOUND) != 0,
2008	    ("Thread %p shouldn't migrate", td));
2009	KASSERT(!CPU_ABSENT(td_get_sched(td)->ts_cpu), ("sched_switch_migrate: "
2010	    "thread %s queued on absent CPU %d.", td->td_name,
2011	    td_get_sched(td)->ts_cpu));
2012	tdn = TDQ_CPU(td_get_sched(td)->ts_cpu);
2013#ifdef SMP
2014	tdq_load_rem(tdq, td);
2015	/*
2016	 * Do the lock dance required to avoid LOR.  We have an
2017	 * extra spinlock nesting from sched_switch() which will
2018	 * prevent preemption while we're holding neither run-queue lock.
2019	 */
2020	TDQ_UNLOCK(tdq);
2021	TDQ_LOCK(tdn);
2022	tdq_add(tdn, td, flags);
2023	tdq_notify(tdn, td);
2024	TDQ_UNLOCK(tdn);
2025	TDQ_LOCK(tdq);
2026#endif
2027	return (TDQ_LOCKPTR(tdn));
2028}
2029
2030/*
2031 * A version of thread_lock_unblock() that does not assume td_lock is blocked.
2032 */
2033static inline void
2034thread_unblock_switch(struct thread *td, struct mtx *mtx)
2035{
2036	atomic_store_rel_ptr((volatile uintptr_t *)&td->td_lock,
2037	    (uintptr_t)mtx);
2038}
2039
2040/*
2041 * Switch threads.  This function has to handle threads coming in while
2042 * blocked for some reason, running, or idle.  It also must deal with
2043 * migrating a thread from one queue to another as running threads may
2044 * be assigned elsewhere via binding.
2045 */
2046void
2047sched_switch(struct thread *td, int flags)
2048{
2049	struct thread *newtd;
2050	struct tdq *tdq;
2051	struct td_sched *ts;
2052	struct mtx *mtx;
2053	int srqflag;
2054	int cpuid, preempted;
2055
2056	THREAD_LOCK_ASSERT(td, MA_OWNED);
2057
2058	cpuid = PCPU_GET(cpuid);
2059	tdq = TDQ_SELF();
2060	ts = td_get_sched(td);
2061	sched_pctcpu_update(ts, 1);
2062	ts->ts_rltick = ticks;
2063	td->td_lastcpu = td->td_oncpu;
2064	preempted = (td->td_flags & TDF_SLICEEND) == 0 &&
2065	    (flags & SW_PREEMPT) != 0;
2066	td->td_flags &= ~(TDF_NEEDRESCHED | TDF_SLICEEND);
2067	td->td_owepreempt = 0;
2068	tdq->tdq_owepreempt = 0;
2069	if (!TD_IS_IDLETHREAD(td))
2070		tdq->tdq_switchcnt++;
2071
2072	/*
2073	 * Always block the thread lock so we can drop the tdq lock early.
2074	 */
2075	mtx = thread_lock_block(td);
2076	spinlock_enter();
2077	if (TD_IS_IDLETHREAD(td)) {
2078		MPASS(mtx == TDQ_LOCKPTR(tdq));
2079		TD_SET_CAN_RUN(td);
2080	} else if (TD_IS_RUNNING(td)) {
2081		MPASS(mtx == TDQ_LOCKPTR(tdq));
2082		srqflag = preempted ?
2083		    SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
2084		    SRQ_OURSELF|SRQ_YIELDING;
2085#ifdef SMP
2086		if (THREAD_CAN_MIGRATE(td) && !THREAD_CAN_SCHED(td, ts->ts_cpu))
2087			ts->ts_cpu = sched_pickcpu(td, 0);
2088#endif
2089		if (ts->ts_cpu == cpuid)
2090			tdq_runq_add(tdq, td, srqflag);
2091		else
2092			mtx = sched_switch_migrate(tdq, td, srqflag);
2093	} else {
2094		/* This thread must be going to sleep. */
2095		if (mtx != TDQ_LOCKPTR(tdq)) {
2096			mtx_unlock_spin(mtx);
2097			TDQ_LOCK(tdq);
2098		}
2099		tdq_load_rem(tdq, td);
2100#ifdef SMP
2101		if (tdq->tdq_load == 0)
2102			tdq_trysteal(tdq);
2103#endif
2104	}
2105
2106#if (KTR_COMPILE & KTR_SCHED) != 0
2107	if (TD_IS_IDLETHREAD(td))
2108		KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "idle",
2109		    "prio:%d", td->td_priority);
2110	else
2111		KTR_STATE3(KTR_SCHED, "thread", sched_tdname(td), KTDSTATE(td),
2112		    "prio:%d", td->td_priority, "wmesg:\"%s\"", td->td_wmesg,
2113		    "lockname:\"%s\"", td->td_lockname);
2114#endif
2115
2116	/*
2117	 * We enter here with the thread blocked and assigned to the
2118	 * appropriate cpu run-queue or sleep-queue and with the current
2119	 * thread-queue locked.
2120	 */
2121	TDQ_LOCK_ASSERT(tdq, MA_OWNED | MA_NOTRECURSED);
2122	newtd = choosethread();
2123	sched_pctcpu_update(td_get_sched(newtd), 0);
2124	TDQ_UNLOCK(tdq);
2125
2126	/*
2127	 * Call the MD code to switch contexts if necessary.
2128	 */
2129	if (td != newtd) {
2130#ifdef	HWPMC_HOOKS
2131		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
2132			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
2133#endif
2134		SDT_PROBE2(sched, , , off__cpu, newtd, newtd->td_proc);
2135
2136#ifdef KDTRACE_HOOKS
2137		/*
2138		 * If DTrace has set the active vtime enum to anything
2139		 * other than INACTIVE (0), then it should have set the
2140		 * function to call.
2141		 */
2142		if (dtrace_vtime_active)
2143			(*dtrace_vtime_switch_func)(newtd);
2144#endif
2145		td->td_oncpu = NOCPU;
2146		cpu_switch(td, newtd, mtx);
2147		cpuid = td->td_oncpu = PCPU_GET(cpuid);
2148
2149		SDT_PROBE0(sched, , , on__cpu);
2150#ifdef	HWPMC_HOOKS
2151		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
2152			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
2153#endif
2154	} else {
2155		thread_unblock_switch(td, mtx);
2156		SDT_PROBE0(sched, , , remain__cpu);
2157	}
2158	KASSERT(curthread->td_md.md_spinlock_count == 1,
2159	    ("invalid count %d", curthread->td_md.md_spinlock_count));
2160
2161	KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "running",
2162	    "prio:%d", td->td_priority);
2163}
2164
2165/*
2166 * Adjust thread priorities as a result of a nice request.
2167 */
2168void
2169sched_nice(struct proc *p, int nice)
2170{
2171	struct thread *td;
2172
2173	PROC_LOCK_ASSERT(p, MA_OWNED);
2174
2175	p->p_nice = nice;
2176	FOREACH_THREAD_IN_PROC(p, td) {
2177		thread_lock(td);
2178		sched_priority(td);
2179		sched_prio(td, td->td_base_user_pri);
2180		thread_unlock(td);
2181	}
2182}
2183
2184/*
2185 * Record the sleep time for the interactivity scorer.
2186 */
2187void
2188sched_sleep(struct thread *td, int prio)
2189{
2190
2191	THREAD_LOCK_ASSERT(td, MA_OWNED);
2192
2193	td->td_slptick = ticks;
2194	if (TD_IS_SUSPENDED(td) || prio >= PSOCK)
2195		td->td_flags |= TDF_CANSWAP;
2196	if (PRI_BASE(td->td_pri_class) != PRI_TIMESHARE)
2197		return;
2198	if (static_boost == 1 && prio)
2199		sched_prio(td, prio);
2200	else if (static_boost && td->td_priority > static_boost)
2201		sched_prio(td, static_boost);
2202}
2203
2204/*
2205 * Schedule a thread to resume execution and record how long it voluntarily
2206 * slept.  We also update the pctcpu, interactivity, and priority.
2207 *
2208 * Requires the thread lock on entry, drops on exit.
2209 */
2210void
2211sched_wakeup(struct thread *td, int srqflags)
2212{
2213	struct td_sched *ts;
2214	int slptick;
2215
2216	THREAD_LOCK_ASSERT(td, MA_OWNED);
2217	ts = td_get_sched(td);
2218	td->td_flags &= ~TDF_CANSWAP;
2219
2220	/*
2221	 * If we slept for more than a tick update our interactivity and
2222	 * priority.
2223	 */
2224	slptick = td->td_slptick;
2225	td->td_slptick = 0;
2226	if (slptick && slptick != ticks) {
2227		ts->ts_slptime += (ticks - slptick) << SCHED_TICK_SHIFT;
2228		sched_interact_update(td);
2229		sched_pctcpu_update(ts, 0);
2230	}
2231	/*
2232	 * Reset the slice value since we slept and advanced the round-robin.
2233	 */
2234	ts->ts_slice = 0;
2235	sched_add(td, SRQ_BORING | srqflags);
2236}
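
/*
 * Worked example (assuming hz = 1000 and SCHED_TICK_SHIFT = 10): a thread
 * that slept for 500 hz ticks (~0.5s) has 500 << 10 added to ts_slptime,
 * which sched_interact_update() then weighs against ts_runtime when the
 * interactivity score is next computed.
 */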
2237
2238/*
2239 * Penalize the parent for creating a new child and initialize the child's
2240 * priority.
2241 */
2242void
2243sched_fork(struct thread *td, struct thread *child)
2244{
2245	THREAD_LOCK_ASSERT(td, MA_OWNED);
2246	sched_pctcpu_update(td_get_sched(td), 1);
2247	sched_fork_thread(td, child);
2248	/*
2249	 * Penalize the parent and child for forking.
2250	 */
2251	sched_interact_fork(child);
2252	sched_priority(child);
2253	td_get_sched(td)->ts_runtime += tickincr;
2254	sched_interact_update(td);
2255	sched_priority(td);
2256}
2257
2258/*
2259 * Fork a new thread, may be within the same process.
2260 */
2261void
2262sched_fork_thread(struct thread *td, struct thread *child)
2263{
2264	struct td_sched *ts;
2265	struct td_sched *ts2;
2266	struct tdq *tdq;
2267
2268	tdq = TDQ_SELF();
2269	THREAD_LOCK_ASSERT(td, MA_OWNED);
2270	/*
2271	 * Initialize child.
2272	 */
2273	ts = td_get_sched(td);
2274	ts2 = td_get_sched(child);
2275	child->td_oncpu = NOCPU;
2276	child->td_lastcpu = NOCPU;
2277	child->td_lock = TDQ_LOCKPTR(tdq);
2278	child->td_cpuset = cpuset_ref(td->td_cpuset);
2279	child->td_domain.dr_policy = td->td_cpuset->cs_domain;
2280	ts2->ts_cpu = ts->ts_cpu;
2281	ts2->ts_flags = 0;
2282	/*
2283	 * Grab our parent's cpu estimation information.
2284	 */
2285	ts2->ts_ticks = ts->ts_ticks;
2286	ts2->ts_ltick = ts->ts_ltick;
2287	ts2->ts_ftick = ts->ts_ftick;
2288	/*
2289	 * Do not inherit any borrowed priority from the parent.
2290	 */
2291	child->td_priority = child->td_base_pri;
2292	/*
2293	 * And update interactivity score.
2294	 */
2295	ts2->ts_slptime = ts->ts_slptime;
2296	ts2->ts_runtime = ts->ts_runtime;
2297	/* Attempt to quickly learn interactivity. */
2298	ts2->ts_slice = tdq_slice(tdq) - sched_slice_min;
2299#ifdef KTR
2300	bzero(ts2->ts_name, sizeof(ts2->ts_name));
2301#endif
2302}
2303
2304/*
2305 * Adjust the priority class of a thread.
2306 */
2307void
2308sched_class(struct thread *td, int class)
2309{
2310
2311	THREAD_LOCK_ASSERT(td, MA_OWNED);
2312	if (td->td_pri_class == class)
2313		return;
2314	td->td_pri_class = class;
2315}
2316
2317/*
2318 * Return some of the child's priority and interactivity to the parent.
2319 */
2320void
2321sched_exit(struct proc *p, struct thread *child)
2322{
2323	struct thread *td;
2324
2325	KTR_STATE1(KTR_SCHED, "thread", sched_tdname(child), "proc exit",
2326	    "prio:%d", child->td_priority);
2327	PROC_LOCK_ASSERT(p, MA_OWNED);
2328	td = FIRST_THREAD_IN_PROC(p);
2329	sched_exit_thread(td, child);
2330}
2331
2332/*
2333 * Penalize another thread for the time spent on this one.  This helps to
2334 * worsen the priority and interactivity of processes which schedule batch
2335 * jobs such as make.  This has little effect on the make process itself but
2336 * causes new processes spawned by it to receive worse scores immediately.
2337 */
2338void
2339sched_exit_thread(struct thread *td, struct thread *child)
2340{
2341
2342	KTR_STATE1(KTR_SCHED, "thread", sched_tdname(child), "thread exit",
2343	    "prio:%d", child->td_priority);
2344	/*
2345	 * Give the child's runtime to the parent without returning the
2346	 * sleep time as a penalty to the parent.  This causes shells that
2347	 * launch expensive things to mark their children as expensive.
2348	 */
2349	thread_lock(td);
2350	td_get_sched(td)->ts_runtime += td_get_sched(child)->ts_runtime;
2351	sched_interact_update(td);
2352	sched_priority(td);
2353	thread_unlock(td);
2354}
2355
2356void
2357sched_preempt(struct thread *td)
2358{
2359	struct tdq *tdq;
2360	int flags;
2361
2362	SDT_PROBE2(sched, , , surrender, td, td->td_proc);
2363
2364	thread_lock(td);
2365	tdq = TDQ_SELF();
2366	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
2367	if (td->td_priority > tdq->tdq_lowpri) {
2368		if (td->td_critnest == 1) {
2369			flags = SW_INVOL | SW_PREEMPT;
2370			flags |= TD_IS_IDLETHREAD(td) ? SWT_REMOTEWAKEIDLE :
2371			    SWT_REMOTEPREEMPT;
2372			mi_switch(flags);
2373			/* Switch dropped thread lock. */
2374			return;
2375		}
2376		td->td_owepreempt = 1;
2377	} else {
2378		tdq->tdq_owepreempt = 0;
2379	}
2380	thread_unlock(td);
2381}
2382
2383/*
2384 * Fix priorities on return to user-space.  Priorities may be elevated due
2385 * to static priorities in msleep() or similar.
2386 */
2387void
2388sched_userret_slowpath(struct thread *td)
2389{
2390
2391	thread_lock(td);
2392	td->td_priority = td->td_user_pri;
2393	td->td_base_pri = td->td_user_pri;
2394	tdq_setlowpri(TDQ_SELF(), td);
2395	thread_unlock(td);
2396}
2397
2398/*
2399 * Handle a stathz tick.  This is really only relevant for timeshare
2400 * threads.
2401 */
2402void
2403sched_clock(struct thread *td, int cnt)
2404{
2405	struct tdq *tdq;
2406	struct td_sched *ts;
2407
2408	THREAD_LOCK_ASSERT(td, MA_OWNED);
2409	tdq = TDQ_SELF();
2410#ifdef SMP
2411	/*
2412	 * We run the long-term load balancer infrequently on the first cpu.
2413	 */
2414	if (balance_tdq == tdq && smp_started != 0 && rebalance != 0 &&
2415	    balance_ticks != 0) {
2416		balance_ticks -= cnt;
2417		if (balance_ticks <= 0)
2418			sched_balance();
2419	}
2420#endif
2421	/*
2422	 * Save the old switch count so we have a record of the last tick's
2423	 * activity.  Initialize the new switch count based on our load.
2424	 * If there is some activity, seed it to reflect that.
2425	 */
2426	tdq->tdq_oldswitchcnt = tdq->tdq_switchcnt;
2427	tdq->tdq_switchcnt = tdq->tdq_load;
2428	/*
2429	 * Advance the insert index once for each tick to ensure that all
2430	 * threads get a chance to run.
2431	 */
2432	if (tdq->tdq_idx == tdq->tdq_ridx) {
2433		tdq->tdq_idx = (tdq->tdq_idx + 1) % RQ_NQS;
2434		if (TAILQ_EMPTY(&tdq->tdq_timeshare.rq_queues[tdq->tdq_ridx]))
2435			tdq->tdq_ridx = tdq->tdq_idx;
2436	}
2437	ts = td_get_sched(td);
2438	sched_pctcpu_update(ts, 1);
2439	if ((td->td_pri_class & PRI_FIFO_BIT) || TD_IS_IDLETHREAD(td))
2440		return;
2441
2442	if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE) {
2443		/*
2444		 * We used a tick; charge it to the thread so
2445		 * that we can compute our interactivity.
2446		 */
2447		td_get_sched(td)->ts_runtime += tickincr * cnt;
2448		sched_interact_update(td);
2449		sched_priority(td);
2450	}
2451
2452	/*
2453	 * Force a context switch if the current thread has used up a full
2454	 * time slice (default is 100ms).
2455	 */
2456	ts->ts_slice += cnt;
2457	if (ts->ts_slice >= tdq_slice(tdq)) {
2458		ts->ts_slice = 0;
2459		td->td_flags |= TDF_NEEDRESCHED | TDF_SLICEEND;
2460	}
2461}
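
/*
 * Worked example (values assumed for illustration): with stathz = 127,
 * sched_slice = 12 and an otherwise idle CPU so that tdq_slice() returns
 * the full sched_slice, a CPU-bound timeshare thread accumulates one
 * stathz tick per call (cnt is typically 1) and is marked
 * TDF_NEEDRESCHED | TDF_SLICEEND after roughly 12 ticks, or about 94ms.
 */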
2462
2463u_int
2464sched_estcpu(struct thread *td __unused)
2465{
2466
2467	return (0);
2468}
2469
2470/*
2471 * Return whether the current CPU has runnable tasks.  Used for in-kernel
2472 * cooperative idle threads.
2473 */
2474int
2475sched_runnable(void)
2476{
2477	struct tdq *tdq;
2478	int load;
2479
2480	load = 1;
2481
2482	tdq = TDQ_SELF();
2483	if ((curthread->td_flags & TDF_IDLETD) != 0) {
2484		if (tdq->tdq_load > 0)
2485			goto out;
2486	} else
2487		if (tdq->tdq_load - 1 > 0)
2488			goto out;
2489	load = 0;
2490out:
2491	return (load);
2492}
2493
2494/*
2495 * Choose the highest priority thread to run.  The thread is removed from
2496 * the run-queue while running however the load remains.  For SMP we set
2497 * the tdq in the global idle bitmask if it idles here.
2498 */
2499struct thread *
2500sched_choose(void)
2501{
2502	struct thread *td;
2503	struct tdq *tdq;
2504
2505	tdq = TDQ_SELF();
2506	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
2507	td = tdq_choose(tdq);
2508	if (td) {
2509		tdq_runq_rem(tdq, td);
2510		tdq->tdq_lowpri = td->td_priority;
2511		return (td);
2512	}
2513	tdq->tdq_lowpri = PRI_MAX_IDLE;
2514	return (PCPU_GET(idlethread));
2515}
2516
2517/*
2518 * Set owepreempt if necessary.  Preemption never happens directly in ULE;
2519 * we always request it once we exit a critical section.
2520 */
2521static inline void
2522sched_setpreempt(struct thread *td)
2523{
2524	struct thread *ctd;
2525	int cpri;
2526	int pri;
2527
2528	THREAD_LOCK_ASSERT(curthread, MA_OWNED);
2529
2530	ctd = curthread;
2531	pri = td->td_priority;
2532	cpri = ctd->td_priority;
2533	if (pri < cpri)
2534		ctd->td_flags |= TDF_NEEDRESCHED;
2535	if (KERNEL_PANICKED() || pri >= cpri || cold || TD_IS_INHIBITED(ctd))
2536		return;
2537	if (!sched_shouldpreempt(pri, cpri, 0))
2538		return;
2539	ctd->td_owepreempt = 1;
2540}
2541
2542/*
2543 * Add a thread to a thread queue.  Select the appropriate runq and add the
2544 * thread to it.  This is the internal function called when the tdq is
2545 * predetermined.
2546 */
2547void
2548tdq_add(struct tdq *tdq, struct thread *td, int flags)
2549{
2550
2551	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
2552	THREAD_LOCK_BLOCKED_ASSERT(td, MA_OWNED);
2553	KASSERT((td->td_inhibitors == 0),
2554	    ("sched_add: trying to run inhibited thread"));
2555	KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
2556	    ("sched_add: bad thread state"));
2557	KASSERT(td->td_flags & TDF_INMEM,
2558	    ("sched_add: thread swapped out"));
2559
2560	if (td->td_priority < tdq->tdq_lowpri)
2561		tdq->tdq_lowpri = td->td_priority;
2562	tdq_runq_add(tdq, td, flags);
2563	tdq_load_add(tdq, td);
2564}
2565
2566/*
2567 * Select the target thread queue and add a thread to it.  Request
2568 * preemption or IPI a remote processor if required.
2569 *
2570 * Requires the thread lock on entry, drops on exit.
2571 */
2572void
2573sched_add(struct thread *td, int flags)
2574{
2575	struct tdq *tdq;
2576#ifdef SMP
2577	int cpu;
2578#endif
2579
2580	KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq add",
2581	    "prio:%d", td->td_priority, KTR_ATTR_LINKED,
2582	    sched_tdname(curthread));
2583	KTR_POINT1(KTR_SCHED, "thread", sched_tdname(curthread), "wokeup",
2584	    KTR_ATTR_LINKED, sched_tdname(td));
2585	SDT_PROBE4(sched, , , enqueue, td, td->td_proc, NULL,
2586	    flags & SRQ_PREEMPTED);
2587	THREAD_LOCK_ASSERT(td, MA_OWNED);
2588	/*
2589	 * Recalculate the priority before we select the target cpu or
2590	 * run-queue.
2591	 */
2592	if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE)
2593		sched_priority(td);
2594#ifdef SMP
2595	/*
2596	 * Pick the destination cpu and, if it isn't ours, transfer to the
2597	 * target cpu.
2598	 */
2599	cpu = sched_pickcpu(td, flags);
2600	tdq = sched_setcpu(td, cpu, flags);
2601	tdq_add(tdq, td, flags);
2602	if (cpu != PCPU_GET(cpuid))
2603		tdq_notify(tdq, td);
2604	else if (!(flags & SRQ_YIELDING))
2605		sched_setpreempt(td);
2606#else
2607	tdq = TDQ_SELF();
2608	/*
2609	 * Now that the thread is moving to the run-queue, set the lock
2610	 * to the scheduler's lock.
2611	 */
2612	if (td->td_lock != TDQ_LOCKPTR(tdq)) {
2613		TDQ_LOCK(tdq);
2614		if ((flags & SRQ_HOLD) != 0)
2615			td->td_lock = TDQ_LOCKPTR(tdq);
2616		else
2617			thread_lock_set(td, TDQ_LOCKPTR(tdq));
2618	}
2619	tdq_add(tdq, td, flags);
2620	if (!(flags & SRQ_YIELDING))
2621		sched_setpreempt(td);
2622#endif
2623	if (!(flags & SRQ_HOLDTD))
2624		thread_unlock(td);
2625}
2626
2627/*
2628 * Remove a thread from a run-queue without running it.  This is used
2629 * when we're stealing a thread from a remote queue.  Otherwise all threads
2630 * exit by calling sched_exit_thread() and sched_throw() themselves.
2631 */
2632void
2633sched_rem(struct thread *td)
2634{
2635	struct tdq *tdq;
2636
2637	KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "runq rem",
2638	    "prio:%d", td->td_priority);
2639	SDT_PROBE3(sched, , , dequeue, td, td->td_proc, NULL);
2640	tdq = TDQ_CPU(td_get_sched(td)->ts_cpu);
2641	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
2642	MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
2643	KASSERT(TD_ON_RUNQ(td),
2644	    ("sched_rem: thread not on run queue"));
2645	tdq_runq_rem(tdq, td);
2646	tdq_load_rem(tdq, td);
2647	TD_SET_CAN_RUN(td);
2648	if (td->td_priority == tdq->tdq_lowpri)
2649		tdq_setlowpri(tdq, NULL);
2650}
2651
2652/*
2653 * Fetch cpu utilization information.  Updates on demand.
2654 */
2655fixpt_t
2656sched_pctcpu(struct thread *td)
2657{
2658	fixpt_t pctcpu;
2659	struct td_sched *ts;
2660
2661	pctcpu = 0;
2662	ts = td_get_sched(td);
2663
2664	THREAD_LOCK_ASSERT(td, MA_OWNED);
2665	sched_pctcpu_update(ts, TD_IS_RUNNING(td));
2666	if (ts->ts_ticks) {
2667		int rtick;
2668
2669		/* How many rtick per second ? */
2670		rtick = min(SCHED_TICK_HZ(ts) / SCHED_TICK_SECS, hz);
2671		pctcpu = (FSCALE * ((FSCALE * rtick)/hz)) >> FSHIFT;
2672	}
2673
2674	return (pctcpu);
2675}
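
/*
 * Worked example (assuming hz = 1000, SCHED_TICK_SECS = 10 and that
 * SCHED_TICK_HZ() yields the hz ticks recorded in the ~10 second window):
 * a thread that ran for about 5 of the last 10 seconds has
 * SCHED_TICK_HZ(ts) ~= 5000, so rtick = 5000 / 10 = 500 and
 * pctcpu = (FSCALE * ((FSCALE * 500) / 1000)) >> FSHIFT = FSCALE / 2,
 * i.e. 50% in fixed-point form.
 */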
2676
2677/*
2678 * Enforce affinity settings for a thread.  Called after adjustments to
2679 * cpumask.
2680 */
2681void
2682sched_affinity(struct thread *td)
2683{
2684#ifdef SMP
2685	struct td_sched *ts;
2686
2687	THREAD_LOCK_ASSERT(td, MA_OWNED);
2688	ts = td_get_sched(td);
2689	if (THREAD_CAN_SCHED(td, ts->ts_cpu))
2690		return;
2691	if (TD_ON_RUNQ(td)) {
2692		sched_rem(td);
2693		sched_add(td, SRQ_BORING | SRQ_HOLDTD);
2694		return;
2695	}
2696	if (!TD_IS_RUNNING(td))
2697		return;
2698	/*
2699	 * Force a switch before returning to userspace.  If the
2700	 * target thread is not running locally, send an IPI to force
2701	 * the issue.
2702	 */
2703	td->td_flags |= TDF_NEEDRESCHED;
2704	if (td != curthread)
2705		ipi_cpu(ts->ts_cpu, IPI_PREEMPT);
2706#endif
2707}
2708
2709/*
2710 * Bind a thread to a target cpu.
2711 */
2712void
2713sched_bind(struct thread *td, int cpu)
2714{
2715	struct td_sched *ts;
2716
2717	THREAD_LOCK_ASSERT(td, MA_OWNED|MA_NOTRECURSED);
2718	KASSERT(td == curthread, ("sched_bind: can only bind curthread"));
2719	ts = td_get_sched(td);
2720	if (ts->ts_flags & TSF_BOUND)
2721		sched_unbind(td);
2722	KASSERT(THREAD_CAN_MIGRATE(td), ("%p must be migratable", td));
2723	ts->ts_flags |= TSF_BOUND;
2724	sched_pin();
2725	if (PCPU_GET(cpuid) == cpu)
2726		return;
2727	ts->ts_cpu = cpu;
2728	/* When we return from mi_switch we'll be on the correct cpu. */
2729	mi_switch(SW_VOL);
2730	thread_lock(td);
2731}
2732
2733/*
2734 * Release a bound thread.
2735 */
2736void
2737sched_unbind(struct thread *td)
2738{
2739	struct td_sched *ts;
2740
2741	THREAD_LOCK_ASSERT(td, MA_OWNED);
2742	KASSERT(td == curthread, ("sched_unbind: can only bind curthread"));
2743	ts = td_get_sched(td);
2744	if ((ts->ts_flags & TSF_BOUND) == 0)
2745		return;
2746	ts->ts_flags &= ~TSF_BOUND;
2747	sched_unpin();
2748}
2749
2750int
2751sched_is_bound(struct thread *td)
2752{
2753	THREAD_LOCK_ASSERT(td, MA_OWNED);
2754	return (td_get_sched(td)->ts_flags & TSF_BOUND);
2755}
2756
2757/*
2758 * Basic yield call.
2759 */
2760void
2761sched_relinquish(struct thread *td)
2762{
2763	thread_lock(td);
2764	mi_switch(SW_VOL | SWT_RELINQUISH);
2765}
2766
2767/*
2768 * Return the total system load.
2769 */
2770int
2771sched_load(void)
2772{
2773#ifdef SMP
2774	int total;
2775	int i;
2776
2777	total = 0;
2778	CPU_FOREACH(i)
2779		total += TDQ_CPU(i)->tdq_sysload;
2780	return (total);
2781#else
2782	return (TDQ_SELF()->tdq_sysload);
2783#endif
2784}
2785
2786int
2787sched_sizeof_proc(void)
2788{
2789	return (sizeof(struct proc));
2790}
2791
2792int
2793sched_sizeof_thread(void)
2794{
2795	return (sizeof(struct thread) + sizeof(struct td_sched));
2796}
2797
2798#ifdef SMP
2799#define	TDQ_IDLESPIN(tdq)						\
2800    ((tdq)->tdq_cg != NULL && ((tdq)->tdq_cg->cg_flags & CG_FLAG_THREAD) == 0)
2801#else
2802#define	TDQ_IDLESPIN(tdq)	1
2803#endif
2804
2805/*
2806 * The actual idle process.
2807 */
2808void
2809sched_idletd(void *dummy)
2810{
2811	struct thread *td;
2812	struct tdq *tdq;
2813	int oldswitchcnt, switchcnt;
2814	int i;
2815
2816	mtx_assert(&Giant, MA_NOTOWNED);
2817	td = curthread;
2818	tdq = TDQ_SELF();
2819	THREAD_NO_SLEEPING();
2820	oldswitchcnt = -1;
2821	for (;;) {
2822		if (tdq->tdq_load) {
2823			thread_lock(td);
2824			mi_switch(SW_VOL | SWT_IDLE);
2825		}
2826		switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt;
2827#ifdef SMP
2828		if (always_steal || switchcnt != oldswitchcnt) {
2829			oldswitchcnt = switchcnt;
2830			if (tdq_idled(tdq) == 0)
2831				continue;
2832		}
2833		switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt;
2834#else
2835		oldswitchcnt = switchcnt;
2836#endif
2837		/*
2838		 * If we're switching very frequently, spin while checking
2839		 * for load rather than entering a low power state that
2840		 * may require an IPI.  However, don't do any busy
2841		 * loops while on SMT machines as this simply steals
2842		 * cycles from cores doing useful work.
2843		 */
2844		if (TDQ_IDLESPIN(tdq) && switchcnt > sched_idlespinthresh) {
2845			for (i = 0; i < sched_idlespins; i++) {
2846				if (tdq->tdq_load)
2847					break;
2848				cpu_spinwait();
2849			}
2850		}
2851
2852		/* If there was a context switch during the spin, start over. */
2853		switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt;
2854		if (tdq->tdq_load != 0 || switchcnt != oldswitchcnt)
2855			continue;
2856
2857		/* Run main MD idle handler. */
2858		tdq->tdq_cpu_idle = 1;
2859		/*
2860		 * Make sure that the tdq_cpu_idle update is globally visible
2861		 * before cpu_idle() reads tdq_load.  The ordering is important
2862		 * to avoid a race with tdq_notify().
2863		 */
2864		atomic_thread_fence_seq_cst();
2865		/*
2866		 * Checking for load again after the fence picks up newly
2867		 * assigned threads often enough to make it worthwhile to do
2868		 * so in order to avoid calling cpu_idle().
2869		 */
2870		if (tdq->tdq_load != 0) {
2871			tdq->tdq_cpu_idle = 0;
2872			continue;
2873		}
2874		cpu_idle(switchcnt * 4 > sched_idlespinthresh);
2875		tdq->tdq_cpu_idle = 0;
2876
2877		/*
2878		 * Account thread-less hardware interrupts and
2879		 * other wakeup reasons equal to context switches.
2880		 */
2881		switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt;
2882		if (switchcnt != oldswitchcnt)
2883			continue;
2884		tdq->tdq_switchcnt++;
2885		oldswitchcnt++;
2886	}
2887}
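
/*
 * Summary of the heuristic above: the idle thread first tries to steal
 * work, then, if the recent context-switch rate exceeds
 * sched_idlespinthresh, spins up to sched_idlespins times polling
 * tdq_load, and only then hands off to cpu_idle(), whose argument is a
 * busy hint suggesting that a shallow, quick-to-wake idle state is
 * preferable.
 */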
2888
2889/*
2890 * A CPU is entering the scheduler for the first time or a thread is exiting.
2891 */
2892void
2893sched_throw(struct thread *td)
2894{
2895	struct thread *newtd;
2896	struct tdq *tdq;
2897
2898	if (__predict_false(td == NULL)) {
2899#ifdef SMP
2900		PCPU_SET(sched, DPCPU_PTR(tdq));
2901#endif
2902		/* Correct spinlock nesting and acquire the correct lock. */
2903		tdq = TDQ_SELF();
2904		TDQ_LOCK(tdq);
2905		spinlock_exit();
2906		PCPU_SET(switchtime, cpu_ticks());
2907		PCPU_SET(switchticks, ticks);
2908		PCPU_GET(idlethread)->td_lock = TDQ_LOCKPTR(tdq);
2909	} else {
2910		tdq = TDQ_SELF();
2911		THREAD_LOCK_ASSERT(td, MA_OWNED);
2912		THREAD_LOCKPTR_ASSERT(td, TDQ_LOCKPTR(tdq));
2913		tdq_load_rem(tdq, td);
2914		td->td_lastcpu = td->td_oncpu;
2915		td->td_oncpu = NOCPU;
2916		thread_lock_block(td);
2917	}
2918	newtd = choosethread();
2919	spinlock_enter();
2920	TDQ_UNLOCK(tdq);
2921	KASSERT(curthread->td_md.md_spinlock_count == 1,
2922	    ("invalid count %d", curthread->td_md.md_spinlock_count));
2923	/* doesn't return */
2924	if (__predict_false(td == NULL))
2925		cpu_throw(td, newtd);		/* doesn't return */
2926	else
2927		cpu_switch(td, newtd, TDQ_LOCKPTR(tdq));
2928}
2929
2930/*
2931 * This is called from fork_exit().  Just acquire the correct locks and
2932 * let fork do the rest of the work.
2933 */
2934void
2935sched_fork_exit(struct thread *td)
2936{
2937	struct tdq *tdq;
2938	int cpuid;
2939
2940	/*
2941	 * Finish setting up thread glue so that it begins execution in a
2942	 * non-nested critical section with the scheduler lock held.
2943	 */
2944	KASSERT(curthread->td_md.md_spinlock_count == 1,
2945	    ("invalid count %d", curthread->td_md.md_spinlock_count));
2946	cpuid = PCPU_GET(cpuid);
2947	tdq = TDQ_SELF();
2948	TDQ_LOCK(tdq);
2949	spinlock_exit();
2950	MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
2951	td->td_oncpu = cpuid;
2952	KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "running",
2953	    "prio:%d", td->td_priority);
2954	SDT_PROBE0(sched, , , on__cpu);
2955}
2956
2957/*
2958 * Create on first use to catch odd startup conditions.
2959 */
2960char *
2961sched_tdname(struct thread *td)
2962{
2963#ifdef KTR
2964	struct td_sched *ts;
2965
2966	ts = td_get_sched(td);
2967	if (ts->ts_name[0] == '\0')
2968		snprintf(ts->ts_name, sizeof(ts->ts_name),
2969		    "%s tid %d", td->td_name, td->td_tid);
2970	return (ts->ts_name);
2971#else
2972	return (td->td_name);
2973#endif
2974}
2975
2976#ifdef KTR
2977void
2978sched_clear_tdname(struct thread *td)
2979{
2980	struct td_sched *ts;
2981
2982	ts = td_get_sched(td);
2983	ts->ts_name[0] = '\0';
2984}
2985#endif
2986
2987#ifdef SMP
2988
2989/*
2990 * Build the CPU topology dump string.  It is called recursively to
2991 * collect the topology tree.
2992 */
2993static int
2994sysctl_kern_sched_topology_spec_internal(struct sbuf *sb, struct cpu_group *cg,
2995    int indent)
2996{
2997	char cpusetbuf[CPUSETBUFSIZ];
2998	int i, first;
2999
3000	sbuf_printf(sb, "%*s<group level=\"%d\" cache-level=\"%d\">\n", indent,
3001	    "", 1 + indent / 2, cg->cg_level);
3002	sbuf_printf(sb, "%*s <cpu count=\"%d\" mask=\"%s\">", indent, "",
3003	    cg->cg_count, cpusetobj_strprint(cpusetbuf, &cg->cg_mask));
3004	first = TRUE;
3005	for (i = 0; i < MAXCPU; i++) {
3006		if (CPU_ISSET(i, &cg->cg_mask)) {
3007			if (!first)
3008				sbuf_printf(sb, ", ");
3009			else
3010				first = FALSE;
3011			sbuf_printf(sb, "%d", i);
3012		}
3013	}
3014	sbuf_printf(sb, "</cpu>\n");
3015
3016	if (cg->cg_flags != 0) {
3017		sbuf_printf(sb, "%*s <flags>", indent, "");
3018		if ((cg->cg_flags & CG_FLAG_HTT) != 0)
3019			sbuf_printf(sb, "<flag name=\"HTT\">HTT group</flag>");
3020		if ((cg->cg_flags & CG_FLAG_THREAD) != 0)
3021			sbuf_printf(sb, "<flag name=\"THREAD\">THREAD group</flag>");
3022		if ((cg->cg_flags & CG_FLAG_SMT) != 0)
3023			sbuf_printf(sb, "<flag name=\"SMT\">SMT group</flag>");
3024		sbuf_printf(sb, "</flags>\n");
3025	}
3026
3027	if (cg->cg_children > 0) {
3028		sbuf_printf(sb, "%*s <children>\n", indent, "");
3029		for (i = 0; i < cg->cg_children; i++)
3030			sysctl_kern_sched_topology_spec_internal(sb,
3031			    &cg->cg_child[i], indent+2);
3032		sbuf_printf(sb, "%*s </children>\n", indent, "");
3033	}
3034	sbuf_printf(sb, "%*s</group>\n", indent, "");
3035	return (0);
3036}
3037
3038/*
3039 * Sysctl handler for retrieving the topology dump.  It is a wrapper for
3040 * the recursive sysctl_kern_sched_topology_spec_internal().
3041 */
3042static int
3043sysctl_kern_sched_topology_spec(SYSCTL_HANDLER_ARGS)
3044{
3045	struct sbuf *topo;
3046	int err;
3047
3048	KASSERT(cpu_top != NULL, ("cpu_top isn't initialized"));
3049
3050	topo = sbuf_new_for_sysctl(NULL, NULL, 512, req);
3051	if (topo == NULL)
3052		return (ENOMEM);
3053
3054	sbuf_printf(topo, "<groups>\n");
3055	err = sysctl_kern_sched_topology_spec_internal(topo, cpu_top, 1);
3056	sbuf_printf(topo, "</groups>\n");
3057
3058	if (err == 0) {
3059		err = sbuf_finish(topo);
3060	}
3061	sbuf_delete(topo);
3062	return (err);
3063}
3064
3065#endif
3066
3067static int
3068sysctl_kern_quantum(SYSCTL_HANDLER_ARGS)
3069{
3070	int error, new_val, period;
3071
3072	period = 1000000 / realstathz;
3073	new_val = period * sched_slice;
3074	error = sysctl_handle_int(oidp, &new_val, 0, req);
3075	if (error != 0 || req->newptr == NULL)
3076		return (error);
3077	if (new_val <= 0)
3078		return (EINVAL);
3079	sched_slice = imax(1, (new_val + period / 2) / period);
3080	sched_slice_min = sched_slice / SCHED_SLICE_MIN_DIVISOR;
3081	hogticks = imax(1, (2 * hz * sched_slice + realstathz / 2) /
3082	    realstathz);
3083	return (0);
3084}
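
/*
 * Worked example (assuming realstathz = 127): period = 1000000 / 127 =
 * 7874us per stathz tick, so with sched_slice = 12 the handler reports a
 * quantum of 94488us.  Writing that value back yields
 * imax(1, (94488 + 3937) / 7874) = 12, so the round trip is stable.
 */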
3085
3086SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
3087    "Scheduler");
3088SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ULE", 0,
3089    "Scheduler name");
3090SYSCTL_PROC(_kern_sched, OID_AUTO, quantum,
3091    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
3092    sysctl_kern_quantum, "I",
3093    "Quantum for timeshare threads in microseconds");
3094SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0,
3095    "Quantum for timeshare threads in stathz ticks");
3096SYSCTL_INT(_kern_sched, OID_AUTO, interact, CTLFLAG_RW, &sched_interact, 0,
3097    "Interactivity score threshold");
3098SYSCTL_INT(_kern_sched, OID_AUTO, preempt_thresh, CTLFLAG_RW,
3099    &preempt_thresh, 0,
3100    "Maximal (lowest) priority for preemption");
3101SYSCTL_INT(_kern_sched, OID_AUTO, static_boost, CTLFLAG_RW, &static_boost, 0,
3102    "Assign static kernel priorities to sleeping threads");
3103SYSCTL_INT(_kern_sched, OID_AUTO, idlespins, CTLFLAG_RW, &sched_idlespins, 0,
3104    "Number of times idle thread will spin waiting for new work");
3105SYSCTL_INT(_kern_sched, OID_AUTO, idlespinthresh, CTLFLAG_RW,
3106    &sched_idlespinthresh, 0,
3107    "Threshold before we will permit idle thread spinning");
3108#ifdef SMP
3109SYSCTL_INT(_kern_sched, OID_AUTO, affinity, CTLFLAG_RW, &affinity, 0,
3110    "Number of hz ticks to keep thread affinity for");
3111SYSCTL_INT(_kern_sched, OID_AUTO, balance, CTLFLAG_RW, &rebalance, 0,
3112    "Enables the long-term load balancer");
3113SYSCTL_INT(_kern_sched, OID_AUTO, balance_interval, CTLFLAG_RW,
3114    &balance_interval, 0,
3115    "Average period in stathz ticks to run the long-term balancer");
3116SYSCTL_INT(_kern_sched, OID_AUTO, steal_idle, CTLFLAG_RW, &steal_idle, 0,
3117    "Attempts to steal work from other cores before idling");
3118SYSCTL_INT(_kern_sched, OID_AUTO, steal_thresh, CTLFLAG_RW, &steal_thresh, 0,
3119    "Minimum load on remote CPU before we'll steal");
3120SYSCTL_INT(_kern_sched, OID_AUTO, trysteal_limit, CTLFLAG_RW, &trysteal_limit,
3121    0, "Topological distance limit for stealing threads in sched_switch()");
3122SYSCTL_INT(_kern_sched, OID_AUTO, always_steal, CTLFLAG_RW, &always_steal, 0,
3123    "Always run the stealer from the idle thread");
3124SYSCTL_PROC(_kern_sched, OID_AUTO, topology_spec, CTLTYPE_STRING |
3125    CTLFLAG_MPSAFE | CTLFLAG_RD, NULL, 0, sysctl_kern_sched_topology_spec, "A",
3126    "XML dump of detected CPU topology");
3127#endif
3128
3129/* ps compat.  All cpu percentages from ULE are weighted. */
3130static int ccpu = 0;
3131SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0,
3132    "Decay factor used for updating %CPU in 4BSD scheduler");
3133