1/*-
2 * Copyright (c) 2002-2007, Jeffrey Roberson <jeff@freebsd.org>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice unmodified, this list of conditions, and the following
10 *    disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
19 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27/*
28 * This file implements the ULE scheduler.  ULE supports independent CPU
29 * run queues and fine grain locking.  It has superior interactive
30 * performance under load even on uni-processor systems.
31 *
32 * etymology:
33 *   ULE is the last three letters in schedule.  It owes its name to a
34 * generic user created for a scheduling system by Paul Mikesell at
35 * Isilon Systems and a general lack of creativity on the part of the author.
36 */
37
38#include <sys/cdefs.h>
39__FBSDID("$FreeBSD: head/sys/kern/sched_ule.c 220198 2011-03-31 13:59:47Z fabient $");
40
41#include "opt_hwpmc_hooks.h"
42#include "opt_kdtrace.h"
43#include "opt_sched.h"
44
45#include <sys/param.h>
46#include <sys/systm.h>
47#include <sys/kdb.h>
48#include <sys/kernel.h>
49#include <sys/ktr.h>
50#include <sys/lock.h>
51#include <sys/mutex.h>
52#include <sys/proc.h>
53#include <sys/resource.h>
54#include <sys/resourcevar.h>
55#include <sys/sched.h>
56#include <sys/smp.h>
57#include <sys/sx.h>
58#include <sys/sysctl.h>
59#include <sys/sysproto.h>
60#include <sys/turnstile.h>
61#include <sys/umtx.h>
62#include <sys/vmmeter.h>
63#include <sys/cpuset.h>
64#include <sys/sbuf.h>
65
66#ifdef HWPMC_HOOKS
67#include <sys/pmckern.h>
68#endif
69
70#ifdef KDTRACE_HOOKS
71#include <sys/dtrace_bsd.h>
72int				dtrace_vtime_active;
73dtrace_vtime_switch_func_t	dtrace_vtime_switch_func;
74#endif
75
76#include <machine/cpu.h>
77#include <machine/smp.h>
78
79#if defined(__sparc64__)
80#error "This architecture is not currently compatible with ULE"
81#endif
82
83#define	KTR_ULE	0
84
85#define	TS_NAME_LEN (MAXCOMLEN + sizeof(" td ") + sizeof(__XSTRING(UINT_MAX)))
86#define	TDQ_NAME_LEN	(sizeof("sched lock ") + sizeof(__XSTRING(MAXCPU)))
87#define	TDQ_LOADNAME_LEN	(PCPU_NAME_LEN + sizeof(" load"))
88
89/*
90 * Thread scheduler specific section.  All fields are protected
91 * by the thread lock.
92 */
93struct td_sched {
94	struct runq	*ts_runq;	/* Run-queue we're queued on. */
95	short		ts_flags;	/* TSF_* flags. */
96	u_char		ts_cpu;		/* CPU that we have affinity for. */
97	int		ts_rltick;	/* Real last tick, for affinity. */
98	int		ts_slice;	/* Ticks of slice remaining. */
99	u_int		ts_slptime;	/* Number of ticks we vol. slept */
100	u_int		ts_runtime;	/* Number of ticks we were running */
101	int		ts_ltick;	/* Last tick that we were running on */
102	int		ts_incrtick;	/* Last tick that we incremented on */
103	int		ts_ftick;	/* First tick that we were running on */
104	int		ts_ticks;	/* Tick count */
105#ifdef KTR
106	char		ts_name[TS_NAME_LEN];
107#endif
108};
109/* flags kept in ts_flags */
110#define	TSF_BOUND	0x0001		/* Thread can not migrate. */
111#define	TSF_XFERABLE	0x0002		/* Thread was added as transferable. */
112
113static struct td_sched td_sched0;
114
115#define	THREAD_CAN_MIGRATE(td)	((td)->td_pinned == 0)
116#define	THREAD_CAN_SCHED(td, cpu)	\
117    CPU_ISSET((cpu), &(td)->td_cpuset->cs_mask)
118
119/*
120 * Priority ranges used for interactive and non-interactive timeshare
121 * threads.  The timeshare priorities are split up into four ranges.
122 * The first range handles interactive threads.  The last three ranges
123 * (NHALF, x, and NHALF) handle non-interactive threads with the outer
124 * ranges supporting nice values.
125 */
126#define	PRI_TIMESHARE_RANGE	(PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1)
127#define	PRI_INTERACT_RANGE	((PRI_TIMESHARE_RANGE - SCHED_PRI_NRESV) / 2)
128
129#define	PRI_MIN_INTERACT	PRI_MIN_TIMESHARE
130#define	PRI_MAX_INTERACT	(PRI_MIN_TIMESHARE + PRI_INTERACT_RANGE - 1)
131#define	PRI_MIN_BATCH		(PRI_MIN_TIMESHARE + PRI_INTERACT_RANGE)
132#define	PRI_MAX_BATCH		PRI_MAX_TIMESHARE
133
134/*
135 * Cpu percentage computation macros and defines.
136 *
137 * SCHED_TICK_SECS:	Number of seconds to average the cpu usage across.
138 * SCHED_TICK_TARG:	Number of hz ticks to average the cpu usage across.
139 * SCHED_TICK_MAX:	Maximum number of ticks before scaling back.
140 * SCHED_TICK_SHIFT:	Shift factor to avoid rounding away results.
141 * SCHED_TICK_HZ:	Compute the number of hz ticks for a given ticks count.
142 * SCHED_TICK_TOTAL:	Gives the amount of time we've been recording ticks.
143 */
144#define	SCHED_TICK_SECS		10
145#define	SCHED_TICK_TARG		(hz * SCHED_TICK_SECS)
146#define	SCHED_TICK_MAX		(SCHED_TICK_TARG + hz)
147#define	SCHED_TICK_SHIFT	10
148#define	SCHED_TICK_HZ(ts)	((ts)->ts_ticks >> SCHED_TICK_SHIFT)
149#define	SCHED_TICK_TOTAL(ts)	(max((ts)->ts_ltick - (ts)->ts_ftick, hz))
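/*
 * For illustration, assuming hz = 1000 (an assumed, typical value):
 * SCHED_TICK_TARG is 10000 ticks (10 seconds) and SCHED_TICK_MAX is 11000.
 * A thread that ran for roughly 2 of the last 10 seconds has
 * SCHED_TICK_HZ(ts) of about 2000 and SCHED_TICK_TOTAL(ts) of about 10000,
 * i.e. a tracked cpu share of about 20%.  The SCHED_TICK_SHIFT scaling keeps
 * ts_ticks from losing precision to integer truncation while accumulating.
 */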
150
151/*
152 * These macros determine priorities for non-interactive threads.  They are
153 * assigned a priority based on their recent cpu utilization as expressed
154 * by the ratio of ticks to the tick total.  NHALF priorities at the start
155 * and end of the MIN to MAX timeshare range are only reachable with negative
156 * or positive nice respectively.
157 *
158 * PRI_RANGE:	Priority range for utilization dependent priorities.
159 * PRI_NRESV:	Number of nice values.
160 * PRI_TICKS:	Compute a priority in PRI_RANGE from the ticks count and total.
161 * PRI_NICE:	Determines the part of the priority inherited from nice.
162 */
163#define	SCHED_PRI_NRESV		(PRIO_MAX - PRIO_MIN)
164#define	SCHED_PRI_NHALF		(SCHED_PRI_NRESV / 2)
165#define	SCHED_PRI_MIN		(PRI_MIN_BATCH + SCHED_PRI_NHALF)
166#define	SCHED_PRI_MAX		(PRI_MAX_BATCH - SCHED_PRI_NHALF)
167#define	SCHED_PRI_RANGE		(SCHED_PRI_MAX - SCHED_PRI_MIN + 1)
168#define	SCHED_PRI_TICKS(ts)						\
169    (SCHED_TICK_HZ((ts)) /						\
170    (roundup(SCHED_TICK_TOTAL((ts)), SCHED_PRI_RANGE) / SCHED_PRI_RANGE))
171#define	SCHED_PRI_NICE(nice)	(nice)
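/*
 * Worked example (illustrative values): with hz = 1000 and taking
 * SCHED_PRI_RANGE as 32 (the real value follows from the PRI_* constants in
 * <sys/priority.h>), a batch thread that used about half the cpu over the
 * 10 second window has SCHED_TICK_HZ(ts) of about 5000 and
 * SCHED_TICK_TOTAL(ts) of about 10000, so SCHED_PRI_TICKS(ts) =
 * 5000 / (roundup(10000, 32) / 32) = 5000 / 313 = 15, landing near the
 * middle of the utilization-dependent range.  The nice value is then added
 * separately via SCHED_PRI_NICE().
 */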
172
173/*
174 * These determine the interactivity of a process.  Interactivity differs from
175 * cpu utilization in that it expresses the voluntary time slept vs time ran
176 * while cpu utilization includes all time not running.  This more accurately
177 * models the intent of the thread.
178 *
179 * SLP_RUN_MAX:	Maximum amount of sleep time + run time we'll accumulate
180 *		before throttling back.
181 * SLP_RUN_FORK:	Maximum slp+run time to inherit at fork time.
182 * INTERACT_MAX:	Maximum interactivity value.  Smaller is better.
183 * INTERACT_THRESH:	Threshold for placement on the current runq.
184 */
185#define	SCHED_SLP_RUN_MAX	((hz * 5) << SCHED_TICK_SHIFT)
186#define	SCHED_SLP_RUN_FORK	((hz / 2) << SCHED_TICK_SHIFT)
187#define	SCHED_INTERACT_MAX	(100)
188#define	SCHED_INTERACT_HALF	(SCHED_INTERACT_MAX / 2)
189#define	SCHED_INTERACT_THRESH	(30)
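/*
 * For scale, with hz = 1000 (assumed): SCHED_SLP_RUN_MAX caps the combined
 * sleep + run history at the equivalent of 5 seconds (kept shifted by
 * SCHED_TICK_SHIFT), and a forked child inherits at most half a second of
 * history via SCHED_SLP_RUN_FORK.
 */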
190
191/*
192 * tickincr:		Converts a stathz tick into a hz domain scaled by
193 *			the shift factor.  Without the shift the error rate
194 *			due to rounding would be unacceptably high.
195 * realstathz:		stathz is sometimes 0, in which case we run off of hz.
196 * sched_slice:		Runtime of each thread before rescheduling.
197 * preempt_thresh:	Priority threshold for preemption and remote IPIs.
198 */
199static int sched_interact = SCHED_INTERACT_THRESH;
200static int realstathz;
201static int tickincr;
202static int sched_slice = 1;
203#ifdef PREEMPTION
204#ifdef FULL_PREEMPTION
205static int preempt_thresh = PRI_MAX_IDLE;
206#else
207static int preempt_thresh = PRI_MIN_KERN;
208#endif
209#else
210static int preempt_thresh = 0;
211#endif
212static int static_boost = PRI_MIN_BATCH;
213static int sched_idlespins = 10000;
214static int sched_idlespinthresh = 16;
215
216/*
217 * tdq - per processor runqs and statistics.  All fields are protected by the
218 * tdq_lock.  The load and lowpri may be accessed without the lock to avoid
219 * excess locking in sched_pickcpu().
220 */
221struct tdq {
222	/* Ordered to improve efficiency of cpu_search() and switch(). */
223	struct mtx	tdq_lock;		/* run queue lock. */
224	struct cpu_group *tdq_cg;		/* Pointer to cpu topology. */
225	volatile int	tdq_load;		/* Aggregate load. */
226	volatile int	tdq_cpu_idle;		/* cpu_idle() is active. */
227	int		tdq_sysload;		/* For loadavg, !ITHD load. */
228	int		tdq_transferable;	/* Transferable thread count. */
229	short		tdq_switchcnt;		/* Switches this tick. */
230	short		tdq_oldswitchcnt;	/* Switches last tick. */
231	u_char		tdq_lowpri;		/* Lowest priority thread. */
232	u_char		tdq_ipipending;		/* IPI pending. */
233	u_char		tdq_idx;		/* Current insert index. */
234	u_char		tdq_ridx;		/* Current removal index. */
235	struct runq	tdq_realtime;		/* real-time run queue. */
236	struct runq	tdq_timeshare;		/* timeshare run queue. */
237	struct runq	tdq_idle;		/* Queue of IDLE threads. */
238	char		tdq_name[TDQ_NAME_LEN];
239#ifdef KTR
240	char		tdq_loadname[TDQ_LOADNAME_LEN];
241#endif
242} __aligned(64);
243
244/* Idle thread states and config. */
245#define	TDQ_RUNNING	1
246#define	TDQ_IDLE	2
247
248#ifdef SMP
249struct cpu_group *cpu_top;		/* CPU topology */
250
251#define	SCHED_AFFINITY_DEFAULT	(max(1, hz / 1000))
252#define	SCHED_AFFINITY(ts, t)	((ts)->ts_rltick > ticks - ((t) * affinity))
253
254/*
255 * Run-time tunables.
256 */
257static int rebalance = 1;
258static int balance_interval = 128;	/* Default set in sched_initticks(). */
259static int affinity;
260static int steal_htt = 1;
261static int steal_idle = 1;
262static int steal_thresh = 2;
263
264/*
265 * One thread queue per processor.
266 */
267static struct tdq	tdq_cpu[MAXCPU];
268static struct tdq	*balance_tdq;
269static int balance_ticks;
270
271#define	TDQ_SELF()	(&tdq_cpu[PCPU_GET(cpuid)])
272#define	TDQ_CPU(x)	(&tdq_cpu[(x)])
273#define	TDQ_ID(x)	((int)((x) - tdq_cpu))
274#else	/* !SMP */
275static struct tdq	tdq_cpu;
276
277#define	TDQ_ID(x)	(0)
278#define	TDQ_SELF()	(&tdq_cpu)
279#define	TDQ_CPU(x)	(&tdq_cpu)
280#endif
281
282#define	TDQ_LOCK_ASSERT(t, type)	mtx_assert(TDQ_LOCKPTR((t)), (type))
283#define	TDQ_LOCK(t)		mtx_lock_spin(TDQ_LOCKPTR((t)))
284#define	TDQ_LOCK_FLAGS(t, f)	mtx_lock_spin_flags(TDQ_LOCKPTR((t)), (f))
285#define	TDQ_UNLOCK(t)		mtx_unlock_spin(TDQ_LOCKPTR((t)))
286#define	TDQ_LOCKPTR(t)		(&(t)->tdq_lock)
287
288static void sched_priority(struct thread *);
289static void sched_thread_priority(struct thread *, u_char);
290static int sched_interact_score(struct thread *);
291static void sched_interact_update(struct thread *);
292static void sched_interact_fork(struct thread *);
293static void sched_pctcpu_update(struct td_sched *);
294
295/* Operations on per processor queues */
296static struct thread *tdq_choose(struct tdq *);
297static void tdq_setup(struct tdq *);
298static void tdq_load_add(struct tdq *, struct thread *);
299static void tdq_load_rem(struct tdq *, struct thread *);
300static __inline void tdq_runq_add(struct tdq *, struct thread *, int);
301static __inline void tdq_runq_rem(struct tdq *, struct thread *);
302static inline int sched_shouldpreempt(int, int, int);
303void tdq_print(int cpu);
304static void runq_print(struct runq *rq);
305static void tdq_add(struct tdq *, struct thread *, int);
306#ifdef SMP
307static int tdq_move(struct tdq *, struct tdq *);
308static int tdq_idled(struct tdq *);
309static void tdq_notify(struct tdq *, struct thread *);
310static struct thread *tdq_steal(struct tdq *, int);
311static struct thread *runq_steal(struct runq *, int);
312static int sched_pickcpu(struct thread *, int);
313static void sched_balance(void);
314static int sched_balance_pair(struct tdq *, struct tdq *);
315static inline struct tdq *sched_setcpu(struct thread *, int, int);
316static inline void thread_unblock_switch(struct thread *, struct mtx *);
317static struct mtx *sched_switch_migrate(struct tdq *, struct thread *, int);
318static int sysctl_kern_sched_topology_spec(SYSCTL_HANDLER_ARGS);
319static int sysctl_kern_sched_topology_spec_internal(struct sbuf *sb,
320    struct cpu_group *cg, int indent);
321#endif
322
323static void sched_setup(void *dummy);
324SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL);
325
326static void sched_initticks(void *dummy);
327SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks,
328    NULL);
329
330/*
331 * Print the threads waiting on a run-queue.
332 */
333static void
334runq_print(struct runq *rq)
335{
336	struct rqhead *rqh;
337	struct thread *td;
338	int pri;
339	int j;
340	int i;
341
342	for (i = 0; i < RQB_LEN; i++) {
343		printf("\t\trunq bits %d 0x%zx\n",
344		    i, rq->rq_status.rqb_bits[i]);
345		for (j = 0; j < RQB_BPW; j++)
346			if (rq->rq_status.rqb_bits[i] & (1ul << j)) {
347				pri = j + (i << RQB_L2BPW);
348				rqh = &rq->rq_queues[pri];
349				TAILQ_FOREACH(td, rqh, td_runq) {
350					printf("\t\t\ttd %p(%s) priority %d rqindex %d pri %d\n",
351					    td, td->td_name, td->td_priority,
352					    td->td_rqindex, pri);
353				}
354			}
355	}
356}
357
358/*
359 * Print the status of a per-cpu thread queue.  Should be a ddb show cmd.
360 */
361void
362tdq_print(int cpu)
363{
364	struct tdq *tdq;
365
366	tdq = TDQ_CPU(cpu);
367
368	printf("tdq %d:\n", TDQ_ID(tdq));
369	printf("\tlock            %p\n", TDQ_LOCKPTR(tdq));
370	printf("\tLock name:      %s\n", tdq->tdq_name);
371	printf("\tload:           %d\n", tdq->tdq_load);
372	printf("\tswitch cnt:     %d\n", tdq->tdq_switchcnt);
373	printf("\told switch cnt: %d\n", tdq->tdq_oldswitchcnt);
374	printf("\ttimeshare idx:  %d\n", tdq->tdq_idx);
375	printf("\ttimeshare ridx: %d\n", tdq->tdq_ridx);
376	printf("\tload transferable: %d\n", tdq->tdq_transferable);
377	printf("\tlowest priority:   %d\n", tdq->tdq_lowpri);
378	printf("\trealtime runq:\n");
379	runq_print(&tdq->tdq_realtime);
380	printf("\ttimeshare runq:\n");
381	runq_print(&tdq->tdq_timeshare);
382	printf("\tidle runq:\n");
383	runq_print(&tdq->tdq_idle);
384}
385
386static inline int
387sched_shouldpreempt(int pri, int cpri, int remote)
388{
389	/*
390	 * If the new priority is not better than the current priority there is
391	 * nothing to do.
392	 */
393	if (pri >= cpri)
394		return (0);
395	/*
396	 * Always preempt idle.
397	 */
398	if (cpri >= PRI_MIN_IDLE)
399		return (1);
400	/*
401	 * If preemption is disabled don't preempt others.
402	 */
403	if (preempt_thresh == 0)
404		return (0);
405	/*
406	 * Preempt if we exceed the threshold.
407	 */
408	if (pri <= preempt_thresh)
409		return (1);
410	/*
411	 * If we're interactive or better and something non-interactive
412	 * or worse is running, preempt only on remote processors.
413	 */
414	if (remote && pri <= PRI_MAX_INTERACT && cpri > PRI_MAX_INTERACT)
415		return (1);
416	return (0);
417}
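/*
 * For example, with preempt_thresh at PRI_MIN_KERN (the PREEMPTION default
 * above), a waking interrupt or kernel-priority thread preempts a running
 * timeshare thread, but one timeshare thread never preempts another on the
 * local cpu unless that cpu is idle.  The final test additionally allows an
 * interactive thread to IPI a remote cpu that is running a batch thread.
 */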
418
419#define	TS_RQ_PPQ	(((PRI_MAX_BATCH - PRI_MIN_BATCH) + 1) / RQ_NQS)
420/*
421 * Add a thread to the actual run-queue.  Keeps transferable counts up to
422 * date with what is actually on the run-queue.  Selects the correct
423 * queue position for timeshare threads.
424 */
425static __inline void
426tdq_runq_add(struct tdq *tdq, struct thread *td, int flags)
427{
428	struct td_sched *ts;
429	u_char pri;
430
431	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
432	THREAD_LOCK_ASSERT(td, MA_OWNED);
433
434	pri = td->td_priority;
435	ts = td->td_sched;
436	TD_SET_RUNQ(td);
437	if (THREAD_CAN_MIGRATE(td)) {
438		tdq->tdq_transferable++;
439		ts->ts_flags |= TSF_XFERABLE;
440	}
441	if (pri < PRI_MIN_BATCH) {
442		ts->ts_runq = &tdq->tdq_realtime;
443	} else if (pri <= PRI_MAX_BATCH) {
444		ts->ts_runq = &tdq->tdq_timeshare;
445		KASSERT(pri <= PRI_MAX_BATCH && pri >= PRI_MIN_BATCH,
446			("Invalid priority %d on timeshare runq", pri));
447		/*
448		 * This queue contains only priorities between MIN and MAX
449		 * batch.  Use the whole queue to represent these values.
450		 */
451		if ((flags & (SRQ_BORROWING|SRQ_PREEMPTED)) == 0) {
452			pri = (pri - PRI_MIN_BATCH) / TS_RQ_PPQ;
453			pri = (pri + tdq->tdq_idx) % RQ_NQS;
454			/*
455			 * This effectively shortens the queue by one so we
456			 * can have a one slot difference between idx and
457			 * ridx while we wait for threads to drain.
458			 */
459			if (tdq->tdq_ridx != tdq->tdq_idx &&
460			    pri == tdq->tdq_ridx)
461				pri = (unsigned char)(pri - 1) % RQ_NQS;
462		} else
463			pri = tdq->tdq_ridx;
464		runq_add_pri(ts->ts_runq, td, pri, flags);
465		return;
466	} else
467		ts->ts_runq = &tdq->tdq_idle;
468	runq_add(ts->ts_runq, td, flags);
469}
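/*
 * Worked example of the calendar insertion above: with RQ_NQS = 64,
 * tdq_idx = 10 and a scaled batch priority of 5, the thread is inserted at
 * queue (5 + 10) % 64 = 15, so numerically lower (better) batch priorities
 * are reached sooner as the calendar rotates.  Threads queued with
 * SRQ_PREEMPTED or SRQ_BORROWING instead go to tdq_ridx, the current removal
 * head, so they run again without waiting for a full rotation.
 */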
470
471/*
472 * Remove a thread from a run-queue.  This typically happens when a thread
473 * is selected to run.  Running threads are not on the queue and the
474 * transferable count does not reflect them.
475 */
476static __inline void
477tdq_runq_rem(struct tdq *tdq, struct thread *td)
478{
479	struct td_sched *ts;
480
481	ts = td->td_sched;
482	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
483	KASSERT(ts->ts_runq != NULL,
484	    ("tdq_runq_remove: thread %p null ts_runq", td));
485	if (ts->ts_flags & TSF_XFERABLE) {
486		tdq->tdq_transferable--;
487		ts->ts_flags &= ~TSF_XFERABLE;
488	}
489	if (ts->ts_runq == &tdq->tdq_timeshare) {
490		if (tdq->tdq_idx != tdq->tdq_ridx)
491			runq_remove_idx(ts->ts_runq, td, &tdq->tdq_ridx);
492		else
493			runq_remove_idx(ts->ts_runq, td, NULL);
494	} else
495		runq_remove(ts->ts_runq, td);
496}
497
498/*
499 * Load is maintained for all threads RUNNING and ON_RUNQ.  Add the load
500 * for this thread to the referenced thread queue.
501 */
502static void
503tdq_load_add(struct tdq *tdq, struct thread *td)
504{
505
506	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
507	THREAD_LOCK_ASSERT(td, MA_OWNED);
508
509	tdq->tdq_load++;
510	if ((td->td_flags & TDF_NOLOAD) == 0)
511		tdq->tdq_sysload++;
512	KTR_COUNTER0(KTR_SCHED, "load", tdq->tdq_loadname, tdq->tdq_load);
513}
514
515/*
516 * Remove the load from a thread that is transitioning to a sleep state or
517 * exiting.
518 */
519static void
520tdq_load_rem(struct tdq *tdq, struct thread *td)
521{
522
523	THREAD_LOCK_ASSERT(td, MA_OWNED);
524	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
525	KASSERT(tdq->tdq_load != 0,
526	    ("tdq_load_rem: Removing with 0 load on queue %d", TDQ_ID(tdq)));
527
528	tdq->tdq_load--;
529	if ((td->td_flags & TDF_NOLOAD) == 0)
530		tdq->tdq_sysload--;
531	KTR_COUNTER0(KTR_SCHED, "load", tdq->tdq_loadname, tdq->tdq_load);
532}
533
534/*
535 * Set lowpri to its exact value by searching the run-queue and
536 * evaluating curthread.  curthread may be passed as an optimization.
537 */
538static void
539tdq_setlowpri(struct tdq *tdq, struct thread *ctd)
540{
541	struct thread *td;
542
543	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
544	if (ctd == NULL)
545		ctd = pcpu_find(TDQ_ID(tdq))->pc_curthread;
546	td = tdq_choose(tdq);
547	if (td == NULL || td->td_priority > ctd->td_priority)
548		tdq->tdq_lowpri = ctd->td_priority;
549	else
550		tdq->tdq_lowpri = td->td_priority;
551}
552
553#ifdef SMP
554struct cpu_search {
555	cpuset_t cs_mask;
556	u_int	cs_load;
557	u_int	cs_cpu;
558	int	cs_limit;	/* Min priority for low, min load for high. */
559};
560
561#define	CPU_SEARCH_LOWEST	0x1
562#define	CPU_SEARCH_HIGHEST	0x2
563#define	CPU_SEARCH_BOTH		(CPU_SEARCH_LOWEST|CPU_SEARCH_HIGHEST)
564
565#define	CPUSET_FOREACH(cpu, mask)				\
566	for ((cpu) = 0; (cpu) <= mp_maxid; (cpu)++)		\
567		if ((mask) & 1 << (cpu))
568
569static __inline int cpu_search(struct cpu_group *cg, struct cpu_search *low,
570    struct cpu_search *high, const int match);
571int cpu_search_lowest(struct cpu_group *cg, struct cpu_search *low);
572int cpu_search_highest(struct cpu_group *cg, struct cpu_search *high);
573int cpu_search_both(struct cpu_group *cg, struct cpu_search *low,
574    struct cpu_search *high);
575
576/*
577 * This routine compares according to the match argument and should be
578 * reduced in actual instantiations via constant propagation and dead code
579 * elimination.
580 */
581static __inline int
582cpu_compare(int cpu, struct cpu_search *low, struct cpu_search *high,
583    const int match)
584{
585	struct tdq *tdq;
586
587	tdq = TDQ_CPU(cpu);
588	if (match & CPU_SEARCH_LOWEST)
589		if (CPU_ISSET(cpu, &low->cs_mask) &&
590		    tdq->tdq_load < low->cs_load &&
591		    tdq->tdq_lowpri > low->cs_limit) {
592			low->cs_cpu = cpu;
593			low->cs_load = tdq->tdq_load;
594		}
595	if (match & CPU_SEARCH_HIGHEST)
596		if (CPU_ISSET(cpu, &high->cs_mask) &&
597		    tdq->tdq_load >= high->cs_limit &&
598		    tdq->tdq_load > high->cs_load &&
599		    tdq->tdq_transferable) {
600			high->cs_cpu = cpu;
601			high->cs_load = tdq->tdq_load;
602		}
603	return (tdq->tdq_load);
604}
605
606/*
607 * Search the tree of cpu_groups for the lowest or highest loaded cpu
608 * according to the match argument.  This routine actually compares the
609 * load on all paths through the tree and finds the least loaded cpu on
610 * the least loaded path, which may differ from the least loaded cpu in
611 * the system.  This balances work among caches and busses.
612 *
613 * This inline is instantiated in three forms below using constants for the
614 * match argument.  It is reduced to the minimum set for each case.  It is
615 * also recursive to the depth of the tree.
616 */
617static __inline int
618cpu_search(struct cpu_group *cg, struct cpu_search *low,
619    struct cpu_search *high, const int match)
620{
621	int total;
622
623	total = 0;
624	if (cg->cg_children) {
625		struct cpu_search lgroup;
626		struct cpu_search hgroup;
627		struct cpu_group *child;
628		u_int lload;
629		int hload;
630		int load;
631		int i;
632
633		lload = -1;
634		hload = -1;
635		for (i = 0; i < cg->cg_children; i++) {
636			child = &cg->cg_child[i];
637			if (match & CPU_SEARCH_LOWEST) {
638				lgroup = *low;
639				lgroup.cs_load = -1;
640			}
641			if (match & CPU_SEARCH_HIGHEST) {
642				hgroup = *high;
643				lgroup.cs_load = 0;
644			}
645			switch (match) {
646			case CPU_SEARCH_LOWEST:
647				load = cpu_search_lowest(child, &lgroup);
648				break;
649			case CPU_SEARCH_HIGHEST:
650				load = cpu_search_highest(child, &hgroup);
651				break;
652			case CPU_SEARCH_BOTH:
653				load = cpu_search_both(child, &lgroup, &hgroup);
654				break;
655			}
656			total += load;
657			if (match & CPU_SEARCH_LOWEST)
658				if (load < lload || low->cs_cpu == -1) {
659					*low = lgroup;
660					lload = load;
661				}
662			if (match & CPU_SEARCH_HIGHEST)
663				if (load > hload || high->cs_cpu == -1) {
664					hload = load;
665					*high = hgroup;
666				}
667		}
668	} else {
669		int cpu;
670
671		CPUSET_FOREACH(cpu, cg->cg_mask)
672			total += cpu_compare(cpu, low, high, match);
673	}
674	return (total);
675}
676
677/*
678 * cpu_search instantiations must pass constants to maintain the inline
679 * optimization.
680 */
681int
682cpu_search_lowest(struct cpu_group *cg, struct cpu_search *low)
683{
684	return cpu_search(cg, low, NULL, CPU_SEARCH_LOWEST);
685}
686
687int
688cpu_search_highest(struct cpu_group *cg, struct cpu_search *high)
689{
690	return cpu_search(cg, NULL, high, CPU_SEARCH_HIGHEST);
691}
692
693int
694cpu_search_both(struct cpu_group *cg, struct cpu_search *low,
695    struct cpu_search *high)
696{
697	return cpu_search(cg, low, high, CPU_SEARCH_BOTH);
698}
699
700/*
701 * Find the cpu with the least load via the least loaded path that has a
702 * lowpri greater than pri.  A pri of -1 indicates any priority is
703 * acceptable.
704 */
705static inline int
706sched_lowest(struct cpu_group *cg, cpuset_t mask, int pri)
707{
708	struct cpu_search low;
709
710	low.cs_cpu = -1;
711	low.cs_load = -1;
712	low.cs_mask = mask;
713	low.cs_limit = pri;
714	cpu_search_lowest(cg, &low);
715	return low.cs_cpu;
716}
717
718/*
719 * Find the cpu with the highest load via the highest loaded path.
720 */
721static inline int
722sched_highest(struct cpu_group *cg, cpuset_t mask, int minload)
723{
724	struct cpu_search high;
725
726	high.cs_cpu = -1;
727	high.cs_load = 0;
728	high.cs_mask = mask;
729	high.cs_limit = minload;
730	cpu_search_highest(cg, &high);
731	return high.cs_cpu;
732}
733
734/*
735 * Simultaneously find the highest and lowest loaded cpu reachable via
736 * cg.
737 */
738static inline void
739sched_both(struct cpu_group *cg, cpuset_t mask, int *lowcpu, int *highcpu)
740{
741	struct cpu_search high;
742	struct cpu_search low;
743
744	low.cs_cpu = -1;
745	low.cs_limit = -1;
746	low.cs_load = -1;
747	low.cs_mask = mask;
748	high.cs_load = 0;
749	high.cs_cpu = -1;
750	high.cs_limit = -1;
751	high.cs_mask = mask;
752	cpu_search_both(cg, &low, &high);
753	*lowcpu = low.cs_cpu;
754	*highcpu = high.cs_cpu;
755	return;
756}
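/*
 * Usage sketch (hypothetical caller; the real callers are sched_pickcpu()
 * and tdq_idled() below):
 *
 *	cpu = sched_lowest(cpu_top, mask, -1);
 *		least loaded cpu in mask along the least loaded path, with
 *		no priority restriction;
 *	cpu = sched_highest(cg, mask, steal_thresh);
 *		most loaded cpu in cg with a load of at least steal_thresh
 *		and at least one transferable thread, or -1 if none.
 */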
757
758static void
759sched_balance_group(struct cpu_group *cg)
760{
761	cpuset_t mask;
762	int high;
763	int low;
764	int i;
765
766	CPU_FILL(&mask);
767	for (;;) {
768		sched_both(cg, mask, &low, &high);
769		if (low == high || low == -1 || high == -1)
770			break;
771		if (sched_balance_pair(TDQ_CPU(high), TDQ_CPU(low)))
772			break;
773		/*
774		 * If we failed to move any threads, determine which cpu
775		 * to kick out of the set and try again.
776		 */
777		if (TDQ_CPU(high)->tdq_transferable == 0)
778			CPU_CLR(high, &mask);
779		else
780			CPU_CLR(low, &mask);
781	}
782
783	for (i = 0; i < cg->cg_children; i++)
784		sched_balance_group(&cg->cg_child[i]);
785}
786
787static void
788sched_balance(void)
789{
790	struct tdq *tdq;
791
792	/*
793	 * Select a random time between .5 * balance_interval and
794	 * 1.5 * balance_interval.
795	 */
796	balance_ticks = max(balance_interval / 2, 1);
797	balance_ticks += random() % balance_interval;
798	if (smp_started == 0 || rebalance == 0)
799		return;
800	tdq = TDQ_SELF();
801	TDQ_UNLOCK(tdq);
802	sched_balance_group(cpu_top);
803	TDQ_LOCK(tdq);
804}
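/*
 * For illustration, with balance_interval = 128 the code above arms
 * balance_ticks with 64 + (random() % 128), i.e. a value uniformly in
 * roughly [0.5, 1.5) * balance_interval, so rebalancing does not run in
 * lockstep with other periodic scheduler activity.
 */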
805
806/*
807 * Lock two thread queues using their address to maintain lock order.
808 */
809static void
810tdq_lock_pair(struct tdq *one, struct tdq *two)
811{
812	if (one < two) {
813		TDQ_LOCK(one);
814		TDQ_LOCK_FLAGS(two, MTX_DUPOK);
815	} else {
816		TDQ_LOCK(two);
817		TDQ_LOCK_FLAGS(one, MTX_DUPOK);
818	}
819}
820
821/*
822 * Unlock two thread queues.  Order is not important here.
823 */
824static void
825tdq_unlock_pair(struct tdq *one, struct tdq *two)
826{
827	TDQ_UNLOCK(one);
828	TDQ_UNLOCK(two);
829}
830
831/*
832 * Transfer load between two imbalanced thread queues.
833 */
834static int
835sched_balance_pair(struct tdq *high, struct tdq *low)
836{
837	int transferable;
838	int high_load;
839	int low_load;
840	int moved;
841	int move;
842	int diff;
843	int i;
844
845	tdq_lock_pair(high, low);
846	transferable = high->tdq_transferable;
847	high_load = high->tdq_load;
848	low_load = low->tdq_load;
849	moved = 0;
850	/*
851	 * Determine what the imbalance is and then adjust that to how many
852	 * threads we actually have to give up (transferable).
853	 */
854	if (transferable != 0) {
855		diff = high_load - low_load;
856		move = diff / 2;
857		if (diff & 0x1)
858			move++;
859		move = min(move, transferable);
860		for (i = 0; i < move; i++)
861			moved += tdq_move(high, low);
862		/*
863		 * IPI the target cpu to force it to reschedule with the new
864		 * workload.
865		 */
866		ipi_cpu(TDQ_ID(low), IPI_PREEMPT);
867	}
868	tdq_unlock_pair(high, low);
869	return (moved);
870}
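/*
 * For example, if the busy queue has a load of 7 and the other a load of 2,
 * diff = 5 and move = 3 (the odd remainder rounds up), capped by the number
 * of transferable threads; the low cpu is then IPIed so it notices the new
 * work immediately.
 */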
871
872/*
873 * Move a thread from one thread queue to another.
874 */
875static int
876tdq_move(struct tdq *from, struct tdq *to)
877{
878	struct td_sched *ts;
879	struct thread *td;
880	struct tdq *tdq;
881	int cpu;
882
883	TDQ_LOCK_ASSERT(from, MA_OWNED);
884	TDQ_LOCK_ASSERT(to, MA_OWNED);
885
886	tdq = from;
887	cpu = TDQ_ID(to);
888	td = tdq_steal(tdq, cpu);
889	if (td == NULL)
890		return (0);
891	ts = td->td_sched;
892	/*
893	 * Although the run queue is locked the thread may be blocked.  Lock
894	 * it to clear this and acquire the run-queue lock.
895	 */
896	thread_lock(td);
897	/* Drop recursive lock on from acquired via thread_lock(). */
898	TDQ_UNLOCK(from);
899	sched_rem(td);
900	ts->ts_cpu = cpu;
901	td->td_lock = TDQ_LOCKPTR(to);
902	tdq_add(to, td, SRQ_YIELDING);
903	return (1);
904}
905
906/*
907 * This tdq has idled.  Try to steal a thread from another cpu and switch
908 * to it.
909 */
910static int
911tdq_idled(struct tdq *tdq)
912{
913	struct cpu_group *cg;
914	struct tdq *steal;
915	cpuset_t mask;
916	int thresh;
917	int cpu;
918
919	if (smp_started == 0 || steal_idle == 0)
920		return (1);
921	CPU_FILL(&mask);
922	CPU_CLR(PCPU_GET(cpuid), &mask);
923	/* We don't want to be preempted while we're iterating. */
924	spinlock_enter();
925	for (cg = tdq->tdq_cg; cg != NULL; ) {
926		if ((cg->cg_flags & CG_FLAG_THREAD) == 0)
927			thresh = steal_thresh;
928		else
929			thresh = 1;
930		cpu = sched_highest(cg, mask, thresh);
931		if (cpu == -1) {
932			cg = cg->cg_parent;
933			continue;
934		}
935		steal = TDQ_CPU(cpu);
936		CPU_CLR(cpu, &mask);
937		tdq_lock_pair(tdq, steal);
938		if (steal->tdq_load < thresh || steal->tdq_transferable == 0) {
939			tdq_unlock_pair(tdq, steal);
940			continue;
941		}
942		/*
943		 * If a thread was added while interrupts were disabled don't
944		 * steal one here.  If we fail to acquire one due to affinity
945		 * restrictions loop again with this cpu removed from the
946		 * set.
947		 */
948		if (tdq->tdq_load == 0 && tdq_move(steal, tdq) == 0) {
949			tdq_unlock_pair(tdq, steal);
950			continue;
951		}
952		spinlock_exit();
953		TDQ_UNLOCK(steal);
954		mi_switch(SW_VOL | SWT_IDLE, NULL);
955		thread_unlock(curthread);
956
957		return (0);
958	}
959	spinlock_exit();
960	return (1);
961}
962
963/*
964 * Notify a remote cpu of new work.  Sends an IPI if criteria are met.
965 */
966static void
967tdq_notify(struct tdq *tdq, struct thread *td)
968{
969	struct thread *ctd;
970	int pri;
971	int cpu;
972
973	if (tdq->tdq_ipipending)
974		return;
975	cpu = td->td_sched->ts_cpu;
976	pri = td->td_priority;
977	ctd = pcpu_find(cpu)->pc_curthread;
978	if (!sched_shouldpreempt(pri, ctd->td_priority, 1))
979		return;
980	if (TD_IS_IDLETHREAD(ctd)) {
981		/*
982		 * If the MD code has an idle wakeup routine try that before
983		 * falling back to IPI.
984		 */
985		if (!tdq->tdq_cpu_idle || cpu_idle_wakeup(cpu))
986			return;
987	}
988	tdq->tdq_ipipending = 1;
989	ipi_cpu(cpu, IPI_PREEMPT);
990}
991
992/*
993 * Steals load from a timeshare queue.  Honors the rotating queue head
994 * index.
995 */
996static struct thread *
997runq_steal_from(struct runq *rq, int cpu, u_char start)
998{
999	struct rqbits *rqb;
1000	struct rqhead *rqh;
1001	struct thread *td;
1002	int first;
1003	int bit;
1004	int pri;
1005	int i;
1006
1007	rqb = &rq->rq_status;
1008	bit = start & (RQB_BPW -1);
1009	pri = 0;
1010	first = 0;
1011again:
1012	for (i = RQB_WORD(start); i < RQB_LEN; bit = 0, i++) {
1013		if (rqb->rqb_bits[i] == 0)
1014			continue;
1015		if (bit != 0) {
1016			for (pri = bit; pri < RQB_BPW; pri++)
1017				if (rqb->rqb_bits[i] & (1ul << pri))
1018					break;
1019			if (pri >= RQB_BPW)
1020				continue;
1021		} else
1022			pri = RQB_FFS(rqb->rqb_bits[i]);
1023		pri += (i << RQB_L2BPW);
1024		rqh = &rq->rq_queues[pri];
1025		TAILQ_FOREACH(td, rqh, td_runq) {
1026			if (first && THREAD_CAN_MIGRATE(td) &&
1027			    THREAD_CAN_SCHED(td, cpu))
1028				return (td);
1029			first = 1;
1030		}
1031	}
1032	if (start != 0) {
1033		start = 0;
1034		goto again;
1035	}
1036
1037	return (NULL);
1038}
1039
1040/*
1041 * Steals load from a standard linear queue.
1042 */
1043static struct thread *
1044runq_steal(struct runq *rq, int cpu)
1045{
1046	struct rqhead *rqh;
1047	struct rqbits *rqb;
1048	struct thread *td;
1049	int word;
1050	int bit;
1051
1052	rqb = &rq->rq_status;
1053	for (word = 0; word < RQB_LEN; word++) {
1054		if (rqb->rqb_bits[word] == 0)
1055			continue;
1056		for (bit = 0; bit < RQB_BPW; bit++) {
1057			if ((rqb->rqb_bits[word] & (1ul << bit)) == 0)
1058				continue;
1059			rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)];
1060			TAILQ_FOREACH(td, rqh, td_runq)
1061				if (THREAD_CAN_MIGRATE(td) &&
1062				    THREAD_CAN_SCHED(td, cpu))
1063					return (td);
1064		}
1065	}
1066	return (NULL);
1067}
1068
1069/*
1070 * Attempt to steal a thread in priority order from a thread queue.
1071 */
1072static struct thread *
1073tdq_steal(struct tdq *tdq, int cpu)
1074{
1075	struct thread *td;
1076
1077	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
1078	if ((td = runq_steal(&tdq->tdq_realtime, cpu)) != NULL)
1079		return (td);
1080	if ((td = runq_steal_from(&tdq->tdq_timeshare,
1081	    cpu, tdq->tdq_ridx)) != NULL)
1082		return (td);
1083	return (runq_steal(&tdq->tdq_idle, cpu));
1084}
1085
1086/*
1087 * Sets the thread lock and ts_cpu to match the requested cpu.  Unlocks the
1088 * current lock and returns with the assigned queue locked.
1089 */
1090static inline struct tdq *
1091sched_setcpu(struct thread *td, int cpu, int flags)
1092{
1093
1094	struct tdq *tdq;
1095
1096	THREAD_LOCK_ASSERT(td, MA_OWNED);
1097	tdq = TDQ_CPU(cpu);
1098	td->td_sched->ts_cpu = cpu;
1099	/*
1100	 * If the lock matches just return the queue.
1101	 */
1102	if (td->td_lock == TDQ_LOCKPTR(tdq))
1103		return (tdq);
1104#ifdef notyet
1105	/*
1106	 * If the thread isn't running, its lockptr is a
1107	 * turnstile or a sleepqueue.  We can just lock_set without
1108	 * blocking.
1109	 */
1110	if (TD_CAN_RUN(td)) {
1111		TDQ_LOCK(tdq);
1112		thread_lock_set(td, TDQ_LOCKPTR(tdq));
1113		return (tdq);
1114	}
1115#endif
1116	/*
1117	 * The hard case, migration, we need to block the thread first to
1118	 * prevent order reversals with other cpus locks.
1119	 */
1120	spinlock_enter();
1121	thread_lock_block(td);
1122	TDQ_LOCK(tdq);
1123	thread_lock_unblock(td, TDQ_LOCKPTR(tdq));
1124	spinlock_exit();
1125	return (tdq);
1126}
1127
1128SCHED_STAT_DEFINE(pickcpu_intrbind, "Soft interrupt binding");
1129SCHED_STAT_DEFINE(pickcpu_idle_affinity, "Picked idle cpu based on affinity");
1130SCHED_STAT_DEFINE(pickcpu_affinity, "Picked cpu based on affinity");
1131SCHED_STAT_DEFINE(pickcpu_lowest, "Selected lowest load");
1132SCHED_STAT_DEFINE(pickcpu_local, "Migrated to current cpu");
1133SCHED_STAT_DEFINE(pickcpu_migration, "Selection may have caused migration");
1134
1135static int
1136sched_pickcpu(struct thread *td, int flags)
1137{
1138	struct cpu_group *cg;
1139	struct td_sched *ts;
1140	struct tdq *tdq;
1141	cpuset_t mask;
1142	int self;
1143	int pri;
1144	int cpu;
1145
1146	self = PCPU_GET(cpuid);
1147	ts = td->td_sched;
1148	if (smp_started == 0)
1149		return (self);
1150	/*
1151	 * Don't migrate a running thread from sched_switch().
1152	 */
1153	if ((flags & SRQ_OURSELF) || !THREAD_CAN_MIGRATE(td))
1154		return (ts->ts_cpu);
1155	/*
1156	 * Prefer to run interrupt threads on the processors that generate
1157	 * the interrupt.
1158	 */
1159	if (td->td_priority <= PRI_MAX_ITHD && THREAD_CAN_SCHED(td, self) &&
1160	    curthread->td_intr_nesting_level && ts->ts_cpu != self) {
1161		SCHED_STAT_INC(pickcpu_intrbind);
1162		ts->ts_cpu = self;
1163	}
1164	/*
1165	 * If the thread can run on the last cpu and the affinity has not
1166	 * expired, or that cpu is idle, run it there.
1167	 */
1168	pri = td->td_priority;
1169	tdq = TDQ_CPU(ts->ts_cpu);
1170	if (THREAD_CAN_SCHED(td, ts->ts_cpu)) {
1171		if (tdq->tdq_lowpri > PRI_MIN_IDLE) {
1172			SCHED_STAT_INC(pickcpu_idle_affinity);
1173			return (ts->ts_cpu);
1174		}
1175		if (SCHED_AFFINITY(ts, CG_SHARE_L2) && tdq->tdq_lowpri > pri) {
1176			SCHED_STAT_INC(pickcpu_affinity);
1177			return (ts->ts_cpu);
1178		}
1179	}
1180	/*
1181	 * Search for the highest level in the tree that still has affinity.
1182	 */
1183	cg = NULL;
1184	for (cg = tdq->tdq_cg; cg != NULL; cg = cg->cg_parent)
1185		if (SCHED_AFFINITY(ts, cg->cg_level))
1186			break;
1187	cpu = -1;
1188	mask = td->td_cpuset->cs_mask;
1189	if (cg)
1190		cpu = sched_lowest(cg, mask, pri);
1191	if (cpu == -1)
1192		cpu = sched_lowest(cpu_top, mask, -1);
1193	/*
1194	 * Compare the lowest loaded cpu to current cpu.
1195	 */
1196	if (THREAD_CAN_SCHED(td, self) && TDQ_CPU(self)->tdq_lowpri > pri &&
1197	    TDQ_CPU(cpu)->tdq_lowpri < PRI_MIN_IDLE) {
1198		SCHED_STAT_INC(pickcpu_local);
1199		cpu = self;
1200	} else
1201		SCHED_STAT_INC(pickcpu_lowest);
1202	if (cpu != ts->ts_cpu)
1203		SCHED_STAT_INC(pickcpu_migration);
1204	KASSERT(cpu != -1, ("sched_pickcpu: Failed to find a cpu."));
1205	return (cpu);
1206}
1207#endif
1208
1209/*
1210 * Pick the highest priority task we have and return it.
1211 */
1212static struct thread *
1213tdq_choose(struct tdq *tdq)
1214{
1215	struct thread *td;
1216
1217	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
1218	td = runq_choose(&tdq->tdq_realtime);
1219	if (td != NULL)
1220		return (td);
1221	td = runq_choose_from(&tdq->tdq_timeshare, tdq->tdq_ridx);
1222	if (td != NULL) {
1223		KASSERT(td->td_priority >= PRI_MIN_BATCH,
1224		    ("tdq_choose: Invalid priority on timeshare queue %d",
1225		    td->td_priority));
1226		return (td);
1227	}
1228	td = runq_choose(&tdq->tdq_idle);
1229	if (td != NULL) {
1230		KASSERT(td->td_priority >= PRI_MIN_IDLE,
1231		    ("tdq_choose: Invalid priority on idle queue %d",
1232		    td->td_priority));
1233		return (td);
1234	}
1235
1236	return (NULL);
1237}
1238
1239/*
1240 * Initialize a thread queue.
1241 */
1242static void
1243tdq_setup(struct tdq *tdq)
1244{
1245
1246	if (bootverbose)
1247		printf("ULE: setup cpu %d\n", TDQ_ID(tdq));
1248	runq_init(&tdq->tdq_realtime);
1249	runq_init(&tdq->tdq_timeshare);
1250	runq_init(&tdq->tdq_idle);
1251	snprintf(tdq->tdq_name, sizeof(tdq->tdq_name),
1252	    "sched lock %d", (int)TDQ_ID(tdq));
1253	mtx_init(&tdq->tdq_lock, tdq->tdq_name, "sched lock",
1254	    MTX_SPIN | MTX_RECURSE);
1255#ifdef KTR
1256	snprintf(tdq->tdq_loadname, sizeof(tdq->tdq_loadname),
1257	    "CPU %d load", (int)TDQ_ID(tdq));
1258#endif
1259}
1260
1261#ifdef SMP
1262static void
1263sched_setup_smp(void)
1264{
1265	struct tdq *tdq;
1266	int i;
1267
1268	cpu_top = smp_topo();
1269	CPU_FOREACH(i) {
1270		tdq = TDQ_CPU(i);
1271		tdq_setup(tdq);
1272		tdq->tdq_cg = smp_topo_find(cpu_top, i);
1273		if (tdq->tdq_cg == NULL)
1274			panic("Can't find cpu group for %d\n", i);
1275	}
1276	balance_tdq = TDQ_SELF();
1277	sched_balance();
1278}
1279#endif
1280
1281/*
1282 * Setup the thread queues and initialize the topology based on MD
1283 * information.
1284 */
1285static void
1286sched_setup(void *dummy)
1287{
1288	struct tdq *tdq;
1289
1290	tdq = TDQ_SELF();
1291#ifdef SMP
1292	sched_setup_smp();
1293#else
1294	tdq_setup(tdq);
1295#endif
1296	/*
1297	 * To avoid divide-by-zero, we set realstathz to a dummy value
1298	 * in case sched_clock() is called before sched_initticks().
1299	 */
1300	realstathz = hz;
1301	sched_slice = (realstathz/10);	/* ~100ms */
1302	tickincr = 1 << SCHED_TICK_SHIFT;
1303
1304	/* Add thread0's load since it's running. */
1305	TDQ_LOCK(tdq);
1306	thread0.td_lock = TDQ_LOCKPTR(TDQ_SELF());
1307	tdq_load_add(tdq, &thread0);
1308	tdq->tdq_lowpri = thread0.td_priority;
1309	TDQ_UNLOCK(tdq);
1310}
1311
1312/*
1313 * This routine determines the tickincr after stathz and hz are setup.
1314 */
1315/* ARGSUSED */
1316static void
1317sched_initticks(void *dummy)
1318{
1319	int incr;
1320
1321	realstathz = stathz ? stathz : hz;
1322	sched_slice = (realstathz/10);	/* ~100ms */
1323
1324	/*
1325	 * tickincr is shifted out by 10 to avoid rounding errors due to
1326	 * hz not being evenly divisible by stathz on all platforms.
1327	 */
1328	incr = (hz << SCHED_TICK_SHIFT) / realstathz;
1329	/*
1330	 * This does not work for values of stathz that are more than
1331	 * 1 << SCHED_TICK_SHIFT * hz.  In practice this does not happen.
1332	 */
1333	if (incr == 0)
1334		incr = 1;
1335	tickincr = incr;
1336#ifdef SMP
1337	/*
1338	 * Set the default balance interval now that we know
1339	 * what realstathz is.
1340	 */
1341	balance_interval = realstathz;
1342	/*
1343	 * Set steal_thresh to roughly log2(mp_ncpus), but no greater than 3.
1344	 * This prevents excess thrashing on large machines and excess idle
1345	 * on smaller machines.
1346	 */
1347	steal_thresh = min(fls(mp_ncpus) - 1, 3);
1348	affinity = SCHED_AFFINITY_DEFAULT;
1349#endif
1350}
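/*
 * Worked example (assumed values): with hz = 1000 and stathz = 128,
 * realstathz = 128, sched_slice = 12 stathz ticks (~94ms) and
 * tickincr = (1000 << 10) / 128 = 8000, i.e. one stathz tick corresponds to
 * about 8000 / 1024 ~ 7.8 hz ticks after the SCHED_TICK_SHIFT scaling.
 */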
1351
1352
1353/*
1354 * This is the core of the interactivity algorithm.  Determines a score based
1355 * on past behavior.  It is the ratio of sleep time to run time scaled to
1356 * a [0, 100] integer.  This is the voluntary sleep time of a process, which
1357 * differs from the cpu usage because it does not account for time spent
1358 * waiting on a run-queue.  Would be prettier if we had floating point.
1359 */
1360static int
1361sched_interact_score(struct thread *td)
1362{
1363	struct td_sched *ts;
1364	int div;
1365
1366	ts = td->td_sched;
1367	/*
1368	 * The score is only needed if this is likely to be an interactive
1369	 * task.  Don't go through the expense of computing it if there's
1370	 * no chance.
1371	 */
1372	if (sched_interact <= SCHED_INTERACT_HALF &&
1373		ts->ts_runtime >= ts->ts_slptime)
1374			return (SCHED_INTERACT_HALF);
1375
1376	if (ts->ts_runtime > ts->ts_slptime) {
1377		div = max(1, ts->ts_runtime / SCHED_INTERACT_HALF);
1378		return (SCHED_INTERACT_HALF +
1379		    (SCHED_INTERACT_HALF - (ts->ts_slptime / div)));
1380	}
1381	if (ts->ts_slptime > ts->ts_runtime) {
1382		div = max(1, ts->ts_slptime / SCHED_INTERACT_HALF);
1383		return (ts->ts_runtime / div);
1384	}
1385	/* runtime == slptime */
1386	if (ts->ts_runtime)
1387		return (SCHED_INTERACT_HALF);
1388
1389	/*
1390	 * This can happen if slptime and runtime are 0.
1391	 */
1392	return (0);
1393
1394}
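/*
 * For example, a thread that sleeps three times as long as it runs scores
 * roughly 50 / 3 = 16, comfortably below the default threshold of 30 and
 * therefore interactive, while a thread that runs three times as long as it
 * sleeps scores roughly 50 + (50 - 16) = 84 and is treated as batch.
 */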
1395
1396/*
1397 * Scale the scheduling priority according to the "interactivity" of this
1398 * process.
1399 */
1400static void
1401sched_priority(struct thread *td)
1402{
1403	int score;
1404	int pri;
1405
1406	if (PRI_BASE(td->td_pri_class) != PRI_TIMESHARE)
1407		return;
1408	/*
1409	 * If the score is interactive we place the thread in the realtime
1410	 * queue with a priority that is less than kernel and interrupt
1411	 * priorities.  These threads are not subject to nice restrictions.
1412	 *
1413	 * Scores greater than this are placed on the normal timeshare queue
1414	 * where the priority is partially decided by the most recent cpu
1415	 * utilization and the rest is decided by nice value.
1416	 *
1417	 * The nice value of the process has a linear effect on the calculated
1418	 * score.  Negative nice values make it easier for a thread to be
1419	 * considered interactive.
1420	 */
1421	score = imax(0, sched_interact_score(td) + td->td_proc->p_nice);
1422	if (score < sched_interact) {
1423		pri = PRI_MIN_INTERACT;
1424		pri += ((PRI_MAX_INTERACT - PRI_MIN_INTERACT + 1) /
1425		    sched_interact) * score;
1426		KASSERT(pri >= PRI_MIN_INTERACT && pri <= PRI_MAX_INTERACT,
1427		    ("sched_priority: invalid interactive priority %d score %d",
1428		    pri, score));
1429	} else {
1430		pri = SCHED_PRI_MIN;
1431		if (td->td_sched->ts_ticks)
1432			pri += SCHED_PRI_TICKS(td->td_sched);
1433		pri += SCHED_PRI_NICE(td->td_proc->p_nice);
1434		KASSERT(pri >= PRI_MIN_BATCH && pri <= PRI_MAX_BATCH,
1435		    ("sched_priority: invalid priority %d: nice %d, "
1436		    "ticks %d ftick %d ltick %d tick pri %d",
1437		    pri, td->td_proc->p_nice, td->td_sched->ts_ticks,
1438		    td->td_sched->ts_ftick, td->td_sched->ts_ltick,
1439		    SCHED_PRI_TICKS(td->td_sched)));
1440	}
1441	sched_user_prio(td, pri);
1442
1443	return;
1444}
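/*
 * For illustration: with the default sched_interact of 30, an interactive
 * score of s maps linearly onto the interactive range, so s = 0 yields
 * PRI_MIN_INTERACT and s = 29 lands near PRI_MAX_INTERACT; a score of 30 or
 * more falls through to the batch calculation, where the priority is
 * SCHED_PRI_MIN plus the SCHED_PRI_TICKS() utilization term plus nice.
 */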
1445
1446/*
1447 * This routine enforces a maximum limit on the amount of scheduling history
1448 * kept.  It is called after either the slptime or runtime is adjusted.  This
1449 * function is ugly due to integer math.
1450 */
1451static void
1452sched_interact_update(struct thread *td)
1453{
1454	struct td_sched *ts;
1455	u_int sum;
1456
1457	ts = td->td_sched;
1458	sum = ts->ts_runtime + ts->ts_slptime;
1459	if (sum < SCHED_SLP_RUN_MAX)
1460		return;
1461	/*
1462	 * This only happens from two places:
1463	 * 1) We have added an unusual amount of run time from fork_exit.
1464	 * 2) We have added an unusual amount of sleep time from sched_sleep().
1465	 */
1466	if (sum > SCHED_SLP_RUN_MAX * 2) {
1467		if (ts->ts_runtime > ts->ts_slptime) {
1468			ts->ts_runtime = SCHED_SLP_RUN_MAX;
1469			ts->ts_slptime = 1;
1470		} else {
1471			ts->ts_slptime = SCHED_SLP_RUN_MAX;
1472			ts->ts_runtime = 1;
1473		}
1474		return;
1475	}
1476	/*
1477	 * If we have exceeded SCHED_SLP_RUN_MAX by more than 1/5th then the
1478	 * 4/5 scaling below will not bring us back into range.  Dividing by
1479	 * two here forces the sum back below SCHED_SLP_RUN_MAX.
1480	 */
1481	if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) {
1482		ts->ts_runtime /= 2;
1483		ts->ts_slptime /= 2;
1484		return;
1485	}
1486	ts->ts_runtime = (ts->ts_runtime / 5) * 4;
1487	ts->ts_slptime = (ts->ts_slptime / 5) * 4;
1488}
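/*
 * In effect, once the combined history reaches SCHED_SLP_RUN_MAX it is
 * decayed to 4/5 of its value on every update, so the score reflects a
 * weighted memory of roughly the last 5 seconds (with hz = 1000) rather
 * than the thread's entire lifetime.
 */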
1489
1490/*
1491 * Scale back the interactivity history when a child thread is created.  The
1492 * history is inherited from the parent but the thread may behave totally
1493 * differently.  For example, a shell spawning a compiler process.  We want
1494 * to learn that the compiler is behaving badly very quickly.
1495 */
1496static void
1497sched_interact_fork(struct thread *td)
1498{
1499	int ratio;
1500	int sum;
1501
1502	sum = td->td_sched->ts_runtime + td->td_sched->ts_slptime;
1503	if (sum > SCHED_SLP_RUN_FORK) {
1504		ratio = sum / SCHED_SLP_RUN_FORK;
1505		td->td_sched->ts_runtime /= ratio;
1506		td->td_sched->ts_slptime /= ratio;
1507	}
1508}
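/*
 * For example, a parent with 4 seconds of accumulated sleep + run history
 * (hz = 1000 assumed) forks: SCHED_SLP_RUN_FORK is half a second, so
 * ratio = 8 and the child starts with one eighth of the parent's history,
 * letting the child's own behavior dominate its score quickly.
 */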
1509
1510/*
1511 * Called from proc0_init() to setup the scheduler fields.
1512 */
1513void
1514schedinit(void)
1515{
1516
1517	/*
1518	 * Set up the scheduler specific parts of proc0.
1519	 */
1520	proc0.p_sched = NULL; /* XXX */
1521	thread0.td_sched = &td_sched0;
1522	td_sched0.ts_ltick = ticks;
1523	td_sched0.ts_ftick = ticks;
1524	td_sched0.ts_slice = sched_slice;
1525}
1526
1527/*
1528 * This is only somewhat accurate since, given many processes of the same
1529 * priority, they will switch when their slices run out, which will be
1530 * at most sched_slice stathz ticks.
1531 */
1532int
1533sched_rr_interval(void)
1534{
1535
1536	/* Convert sched_slice to hz */
1537	return (hz/(realstathz/sched_slice));
1538}
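/*
 * Worked example: with realstathz = 128 and sched_slice = 12 this returns
 * hz / (128 / 12) = hz / 10, i.e. about 100ms with hz = 1000 (assumed
 * values).
 */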
1539
1540/*
1541 * Update the percent cpu tracking information when it is requested or
1542 * the total history exceeds the maximum.  We keep a sliding history of
1543 * tick counts that slowly decays.  This is less precise than the 4BSD
1544 * mechanism since it happens with less regular and frequent events.
1545 */
1546static void
1547sched_pctcpu_update(struct td_sched *ts)
1548{
1549
1550	if (ts->ts_ticks == 0)
1551		return;
1552	if (ticks - (hz / 10) < ts->ts_ltick &&
1553	    SCHED_TICK_TOTAL(ts) < SCHED_TICK_MAX)
1554		return;
1555	/*
1556	 * Adjust counters and watermark for pctcpu calc.
1557	 */
1558	if (ts->ts_ltick > ticks - SCHED_TICK_TARG)
1559		ts->ts_ticks = (ts->ts_ticks / (ticks - ts->ts_ftick)) *
1560			    SCHED_TICK_TARG;
1561	else
1562		ts->ts_ticks = 0;
1563	ts->ts_ltick = ticks;
1564	ts->ts_ftick = ts->ts_ltick - SCHED_TICK_TARG;
1565}
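/*
 * The rescaling above multiplies ts_ticks by SCHED_TICK_TARG / (ticks -
 * ts_ftick), so the computed percentage is preserved while the window is
 * clamped back to SCHED_TICK_TARG and ts_ftick is pulled forward to
 * ts_ltick - SCHED_TICK_TARG.
 */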
1566
1567/*
1568 * Adjust the priority of a thread.  Move it to the appropriate run-queue
1569 * if necessary.  This is the back-end for several priority related
1570 * functions.
1571 */
1572static void
1573sched_thread_priority(struct thread *td, u_char prio)
1574{
1575	struct td_sched *ts;
1576	struct tdq *tdq;
1577	int oldpri;
1578
1579	KTR_POINT3(KTR_SCHED, "thread", sched_tdname(td), "prio",
1580	    "prio:%d", td->td_priority, "new prio:%d", prio,
1581	    KTR_ATTR_LINKED, sched_tdname(curthread));
1582	if (td != curthread && prio > td->td_priority) {
1583		KTR_POINT3(KTR_SCHED, "thread", sched_tdname(curthread),
1584		    "lend prio", "prio:%d", td->td_priority, "new prio:%d",
1585		    prio, KTR_ATTR_LINKED, sched_tdname(td));
1586	}
1587	ts = td->td_sched;
1588	THREAD_LOCK_ASSERT(td, MA_OWNED);
1589	if (td->td_priority == prio)
1590		return;
1591	/*
1592	 * If the priority has been elevated due to priority
1593	 * propagation, we may have to move ourselves to a new
1594	 * queue.  This could be optimized to not re-add in some
1595	 * cases.
1596	 */
1597	if (TD_ON_RUNQ(td) && prio < td->td_priority) {
1598		sched_rem(td);
1599		td->td_priority = prio;
1600		sched_add(td, SRQ_BORROWING);
1601		return;
1602	}
1603	/*
1604	 * If the thread is currently running we may have to adjust the lowpri
1605	 * information so other cpus are aware of our current priority.
1606	 */
1607	if (TD_IS_RUNNING(td)) {
1608		tdq = TDQ_CPU(ts->ts_cpu);
1609		oldpri = td->td_priority;
1610		td->td_priority = prio;
1611		if (prio < tdq->tdq_lowpri)
1612			tdq->tdq_lowpri = prio;
1613		else if (tdq->tdq_lowpri == oldpri)
1614			tdq_setlowpri(tdq, td);
1615		return;
1616	}
1617	td->td_priority = prio;
1618}
1619
1620/*
1621 * Update a thread's priority when it is lent another thread's
1622 * priority.
1623 */
1624void
1625sched_lend_prio(struct thread *td, u_char prio)
1626{
1627
1628	td->td_flags |= TDF_BORROWING;
1629	sched_thread_priority(td, prio);
1630}
1631
1632/*
1633 * Restore a thread's priority when priority propagation is
1634 * over.  The prio argument is the minimum priority the thread
1635 * needs to have to satisfy other possible priority lending
1636 * requests.  If the thread's regular priority is less
1637 * important than prio, the thread will keep a priority boost
1638 * of prio.
1639 */
1640void
1641sched_unlend_prio(struct thread *td, u_char prio)
1642{
1643	u_char base_pri;
1644
1645	if (td->td_base_pri >= PRI_MIN_TIMESHARE &&
1646	    td->td_base_pri <= PRI_MAX_TIMESHARE)
1647		base_pri = td->td_user_pri;
1648	else
1649		base_pri = td->td_base_pri;
1650	if (prio >= base_pri) {
1651		td->td_flags &= ~TDF_BORROWING;
1652		sched_thread_priority(td, base_pri);
1653	} else
1654		sched_lend_prio(td, prio);
1655}
1656
1657/*
1658 * Standard entry for setting the priority to an absolute value.
1659 */
1660void
1661sched_prio(struct thread *td, u_char prio)
1662{
1663	u_char oldprio;
1664
1665	/* First, update the base priority. */
1666	td->td_base_pri = prio;
1667
1668	/*
1669	 * If the thread is borrowing another thread's priority, don't
1670	 * ever lower the priority.
1671	 */
1672	if (td->td_flags & TDF_BORROWING && td->td_priority < prio)
1673		return;
1674
1675	/* Change the real priority. */
1676	oldprio = td->td_priority;
1677	sched_thread_priority(td, prio);
1678
1679	/*
1680	 * If the thread is on a turnstile, then let the turnstile update
1681	 * its state.
1682	 */
1683	if (TD_ON_LOCK(td) && oldprio != prio)
1684		turnstile_adjust(td, oldprio);
1685}
1686
1687/*
1688 * Set the base user priority; this does not affect the current running priority.
1689 */
1690void
1691sched_user_prio(struct thread *td, u_char prio)
1692{
1693
1694	td->td_base_user_pri = prio;
1695	if (td->td_lend_user_pri <= prio)
1696		return;
1697	td->td_user_pri = prio;
1698}
1699
1700void
1701sched_lend_user_prio(struct thread *td, u_char prio)
1702{
1703
1704	THREAD_LOCK_ASSERT(td, MA_OWNED);
1705	td->td_lend_user_pri = prio;
1706	td->td_user_pri = min(prio, td->td_base_user_pri);
1707	if (td->td_priority > td->td_user_pri)
1708		sched_prio(td, td->td_user_pri);
1709	else if (td->td_priority != td->td_user_pri)
1710		td->td_flags |= TDF_NEEDRESCHED;
1711}
1712
1713/*
1714 * Handle migration from sched_switch().  This happens only for
1715 * cpu binding.
1716 */
1717static struct mtx *
1718sched_switch_migrate(struct tdq *tdq, struct thread *td, int flags)
1719{
1720	struct tdq *tdn;
1721
1722	tdn = TDQ_CPU(td->td_sched->ts_cpu);
1723#ifdef SMP
1724	tdq_load_rem(tdq, td);
1725	/*
1726	 * Do the lock dance required to avoid LOR.  We grab an extra
1727	 * spinlock nesting to prevent preemption while we're
1728	 * not holding either run-queue lock.
1729	 */
1730	spinlock_enter();
1731	thread_lock_block(td);	/* This releases the lock on tdq. */
1732
1733	/*
1734	 * Acquire both run-queue locks before placing the thread on the new
1735	 * run-queue to avoid deadlocks created by placing a thread with a
1736	 * blocked lock on the run-queue of a remote processor.  The deadlock
1737	 * occurs when a third processor attempts to lock the two queues in
1738	 * question while the target processor is spinning with its own
1739	 * run-queue lock held while waiting for the blocked lock to clear.
1740	 */
1741	tdq_lock_pair(tdn, tdq);
1742	tdq_add(tdn, td, flags);
1743	tdq_notify(tdn, td);
1744	TDQ_UNLOCK(tdn);
1745	spinlock_exit();
1746#endif
1747	return (TDQ_LOCKPTR(tdn));
1748}
1749
1750/*
1751 * Variant of thread_lock_unblock() that does not assume td_lock
1752 * is blocked.
1753 */
1754static inline void
1755thread_unblock_switch(struct thread *td, struct mtx *mtx)
1756{
1757	atomic_store_rel_ptr((volatile uintptr_t *)&td->td_lock,
1758	    (uintptr_t)mtx);
1759}
1760
1761/*
1762 * Switch threads.  This function has to handle threads coming in while
1763 * blocked for some reason, running, or idle.  It also must deal with
1764 * migrating a thread from one queue to another as running threads may
1765 * be assigned elsewhere via binding.
1766 */
1767void
1768sched_switch(struct thread *td, struct thread *newtd, int flags)
1769{
1770	struct tdq *tdq;
1771	struct td_sched *ts;
1772	struct mtx *mtx;
1773	int srqflag;
1774	int cpuid;
1775
1776	THREAD_LOCK_ASSERT(td, MA_OWNED);
1777	KASSERT(newtd == NULL, ("sched_switch: Unsupported newtd argument"));
1778
1779	cpuid = PCPU_GET(cpuid);
1780	tdq = TDQ_CPU(cpuid);
1781	ts = td->td_sched;
1782	mtx = td->td_lock;
1783	ts->ts_rltick = ticks;
1784	td->td_lastcpu = td->td_oncpu;
1785	td->td_oncpu = NOCPU;
1786	if (!(flags & SW_PREEMPT))
1787		td->td_flags &= ~TDF_NEEDRESCHED;
1788	td->td_owepreempt = 0;
1789	tdq->tdq_switchcnt++;
1790	/*
1791	 * The lock pointer in an idle thread should never change.  Reset it
1792	 * to CAN_RUN as well.
1793	 */
1794	if (TD_IS_IDLETHREAD(td)) {
1795		MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
1796		TD_SET_CAN_RUN(td);
1797	} else if (TD_IS_RUNNING(td)) {
1798		MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
1799		srqflag = (flags & SW_PREEMPT) ?
1800		    SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
1801		    SRQ_OURSELF|SRQ_YIELDING;
1802#ifdef SMP
1803		if (THREAD_CAN_MIGRATE(td) && !THREAD_CAN_SCHED(td, ts->ts_cpu))
1804			ts->ts_cpu = sched_pickcpu(td, 0);
1805#endif
1806		if (ts->ts_cpu == cpuid)
1807			tdq_runq_add(tdq, td, srqflag);
1808		else {
1809			KASSERT(THREAD_CAN_MIGRATE(td) ||
1810			    (ts->ts_flags & TSF_BOUND) != 0,
1811			    ("Thread %p shouldn't migrate", td));
1812			mtx = sched_switch_migrate(tdq, td, srqflag);
1813		}
1814	} else {
1815		/* This thread must be going to sleep. */
1816		TDQ_LOCK(tdq);
1817		mtx = thread_lock_block(td);
1818		tdq_load_rem(tdq, td);
1819	}
1820	/*
1821	 * We enter here with the thread blocked and assigned to the
1822	 * appropriate cpu run-queue or sleep-queue and with the current
1823	 * thread-queue locked.
1824	 */
1825	TDQ_LOCK_ASSERT(tdq, MA_OWNED | MA_NOTRECURSED);
1826	newtd = choosethread();
1827	/*
1828	 * Call the MD code to switch contexts if necessary.
1829	 */
1830	if (td != newtd) {
1831#ifdef	HWPMC_HOOKS
1832		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
1833			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
1834#endif
1835		lock_profile_release_lock(&TDQ_LOCKPTR(tdq)->lock_object);
1836		TDQ_LOCKPTR(tdq)->mtx_lock = (uintptr_t)newtd;
1837
1838#ifdef KDTRACE_HOOKS
1839		/*
1840		 * If DTrace has set the active vtime enum to anything
1841		 * other than INACTIVE (0), then it should have set the
1842		 * function to call.
1843		 */
1844		if (dtrace_vtime_active)
1845			(*dtrace_vtime_switch_func)(newtd);
1846#endif
1847
1848		cpu_switch(td, newtd, mtx);
1849		/*
1850		 * We may return from cpu_switch on a different cpu.  However,
1851		 * we always return with td_lock pointing to the current cpu's
1852		 * run queue lock.
1853		 */
1854		cpuid = PCPU_GET(cpuid);
1855		tdq = TDQ_CPU(cpuid);
1856		lock_profile_obtain_lock_success(
1857		    &TDQ_LOCKPTR(tdq)->lock_object, 0, 0, __FILE__, __LINE__);
1858#ifdef	HWPMC_HOOKS
1859		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
1860			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
1861#endif
1862	} else
1863		thread_unblock_switch(td, mtx);
1864	/*
1865	 * Assert that all went well and return.
1866	 */
1867	TDQ_LOCK_ASSERT(tdq, MA_OWNED|MA_NOTRECURSED);
1868	MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
1869	td->td_oncpu = cpuid;
1870}
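
/*
 * A sketch of the lock hand-off in sched_switch() above, covering the
 * common cases only:
 *
 *  - A thread that stays runnable on this cpu keeps the tdq lock, and
 *    'mtx' remains the tdq lock captured at entry.
 *  - A thread that migrates or goes to sleep has td_lock pointed at the
 *    blocked lock; 'mtx' then names the lock that cpu_switch() stores
 *    back into the old thread's td_lock once its context is saved, at
 *    which point other cpus may run or wake it.
 *
 * In every case we return from cpu_switch() holding the run-queue lock
 * of whichever cpu we resumed on, which the trailing asserts verify.
 */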
1871
1872/*
1873 * Adjust thread priorities as a result of a nice request.
1874 */
1875void
1876sched_nice(struct proc *p, int nice)
1877{
1878	struct thread *td;
1879
1880	PROC_LOCK_ASSERT(p, MA_OWNED);
1881
1882	p->p_nice = nice;
1883	FOREACH_THREAD_IN_PROC(p, td) {
1884		thread_lock(td);
1885		sched_priority(td);
1886		sched_prio(td, td->td_base_user_pri);
1887		thread_unlock(td);
1888	}
1889}
1890
1891/*
1892 * Record the sleep time for the interactivity scorer.
1893 */
1894void
1895sched_sleep(struct thread *td, int prio)
1896{
1897
1898	THREAD_LOCK_ASSERT(td, MA_OWNED);
1899
1900	td->td_slptick = ticks;
1901	if (TD_IS_SUSPENDED(td) || prio >= PSOCK)
1902		td->td_flags |= TDF_CANSWAP;
1903	if (PRI_BASE(td->td_pri_class) != PRI_TIMESHARE)
1904		return;
1905	if (static_boost == 1 && prio)
1906		sched_prio(td, prio);
1907	else if (static_boost && td->td_priority > static_boost)
1908		sched_prio(td, static_boost);
1909}
1910
1911/*
1912 * Schedule a thread to resume execution and record how long it voluntarily
1913 * slept.  We also update the pctcpu, interactivity, and priority.
1914 */
1915void
1916sched_wakeup(struct thread *td)
1917{
1918	struct td_sched *ts;
1919	int slptick;
1920
1921	THREAD_LOCK_ASSERT(td, MA_OWNED);
1922	ts = td->td_sched;
1923	td->td_flags &= ~TDF_CANSWAP;
1924	/*
1925	 * If we slept for more than a tick update our interactivity and
1926	 * priority.
1927	 */
1928	slptick = td->td_slptick;
1929	td->td_slptick = 0;
1930	if (slptick && slptick != ticks) {
1931		u_int hzticks;
1932
1933		hzticks = (ticks - slptick) << SCHED_TICK_SHIFT;
1934		ts->ts_slptime += hzticks;
1935		sched_interact_update(td);
1936		sched_pctcpu_update(ts);
1937	}
1938	/* Reset the slice value after we sleep. */
1939	ts->ts_slice = sched_slice;
1940	sched_add(td, SRQ_BORING);
1941}
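
/*
 * A worked example of the sleep accounting above, assuming hz = 1000
 * and SCHED_TICK_SHIFT = 10: a thread that slept for two seconds sees
 * ticks - slptick == 2000, so ts_slptime grows by 2000 << 10.  The
 * shift keeps ts_slptime on the same fixed-point scale as ts_runtime,
 * which accumulates in tickincr units, letting sched_interact_update()
 * compare the two directly.
 */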
1942
1943/*
1944 * Penalize the parent for creating a new child and initialize the child's
1945 * priority.
1946 */
1947void
1948sched_fork(struct thread *td, struct thread *child)
1949{
1950	THREAD_LOCK_ASSERT(td, MA_OWNED);
1951	sched_fork_thread(td, child);
1952	/*
1953	 * Penalize the parent and child for forking.
1954	 */
1955	sched_interact_fork(child);
1956	sched_priority(child);
1957	td->td_sched->ts_runtime += tickincr;
1958	sched_interact_update(td);
1959	sched_priority(td);
1960}
1961
1962/*
1963 * Fork a new thread, which may be within the same process.
1964 */
1965void
1966sched_fork_thread(struct thread *td, struct thread *child)
1967{
1968	struct td_sched *ts;
1969	struct td_sched *ts2;
1970
1971	THREAD_LOCK_ASSERT(td, MA_OWNED);
1972	/*
1973	 * Initialize child.
1974	 */
1975	ts = td->td_sched;
1976	ts2 = child->td_sched;
1977	child->td_lock = TDQ_LOCKPTR(TDQ_SELF());
1978	child->td_cpuset = cpuset_ref(td->td_cpuset);
1979	ts2->ts_cpu = ts->ts_cpu;
1980	ts2->ts_flags = 0;
1981	/*
1982	 * Grab our parents cpu estimation information.
1983	 * Grab our parent's cpu estimation information.
1984	ts2->ts_ticks = ts->ts_ticks;
1985	ts2->ts_ltick = ts->ts_ltick;
1986	ts2->ts_incrtick = ts->ts_incrtick;
1987	ts2->ts_ftick = ts->ts_ftick;
1988	/*
1989	 * Do not inherit any borrowed priority from the parent.
1990	 */
1991	child->td_priority = child->td_base_pri;
1992	/*
1993	 * And update interactivity score.
1994	 */
1995	ts2->ts_slptime = ts->ts_slptime;
1996	ts2->ts_runtime = ts->ts_runtime;
1997	ts2->ts_slice = 1;	/* Attempt to quickly learn interactivity. */
1998#ifdef KTR
1999	bzero(ts2->ts_name, sizeof(ts2->ts_name));
2000#endif
2001}
2002
2003/*
2004 * Adjust the priority class of a thread.
2005 */
2006void
2007sched_class(struct thread *td, int class)
2008{
2009
2010	THREAD_LOCK_ASSERT(td, MA_OWNED);
2011	if (td->td_pri_class == class)
2012		return;
2013	td->td_pri_class = class;
2014}
2015
2016/*
2017 * Return some of the child's priority and interactivity to the parent.
2018 */
2019void
2020sched_exit(struct proc *p, struct thread *child)
2021{
2022	struct thread *td;
2023
2024	KTR_STATE1(KTR_SCHED, "thread", sched_tdname(child), "proc exit",
2025	    "prio:td", child->td_priority);
2026	PROC_LOCK_ASSERT(p, MA_OWNED);
2027	td = FIRST_THREAD_IN_PROC(p);
2028	sched_exit_thread(td, child);
2029}
2030
2031/*
2032 * Penalize another thread for the time spent on this one.  This helps to
2033 * worsen the priority and interactivity of processes which schedule batch
2034 * jobs such as make.  This has little effect on the make process itself but
2035 * causes new processes spawned by it to receive worse scores immediately.
2036 */
2037void
2038sched_exit_thread(struct thread *td, struct thread *child)
2039{
2040
2041	KTR_STATE1(KTR_SCHED, "thread", sched_tdname(child), "thread exit",
2042	    "prio:td", child->td_priority);
2043	/*
2044	 * Give the child's runtime to the parent without returning the
2045	 * sleep time as a penalty to the parent.  This causes shells that
2046	 * launch expensive things to mark their children as expensive.
2047	 */
2048	thread_lock(td);
2049	td->td_sched->ts_runtime += child->td_sched->ts_runtime;
2050	sched_interact_update(td);
2051	sched_priority(td);
2052	thread_unlock(td);
2053}
2054
2055void
2056sched_preempt(struct thread *td)
2057{
2058	struct tdq *tdq;
2059
2060	thread_lock(td);
2061	tdq = TDQ_SELF();
2062	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
2063	tdq->tdq_ipipending = 0;
2064	if (td->td_priority > tdq->tdq_lowpri) {
2065		int flags;
2066
2067		flags = SW_INVOL | SW_PREEMPT;
2068		if (td->td_critnest > 1)
2069			td->td_owepreempt = 1;
2070		else if (TD_IS_IDLETHREAD(td))
2071			mi_switch(flags | SWT_REMOTEWAKEIDLE, NULL);
2072		else
2073			mi_switch(flags | SWT_REMOTEPREEMPT, NULL);
2074	}
2075	thread_unlock(td);
2076}
2077
2078/*
2079 * Fix priorities on return to user-space.  Priorities may be elevated due
2080 * to static priorities in msleep() or similar.
2081 */
2082void
2083sched_userret(struct thread *td)
2084{
2085	/*
2086	 * XXX we cheat slightly on the locking here to avoid locking in
2087	 * the usual case.  Setting td_priority here is essentially an
2088	 * incomplete workaround for not setting it properly elsewhere.
2089	 * Now that some interrupt handlers are threads, not setting it
2090	 * properly elsewhere can clobber it in the window between setting
2091	 * it here and returning to user mode, so don't waste time setting
2092	 * it perfectly here.
2093	 */
2094	KASSERT((td->td_flags & TDF_BORROWING) == 0,
2095	    ("thread with borrowed priority returning to userland"));
2096	if (td->td_priority != td->td_user_pri) {
2097		thread_lock(td);
2098		td->td_priority = td->td_user_pri;
2099		td->td_base_pri = td->td_user_pri;
2100		tdq_setlowpri(TDQ_SELF(), td);
2101		thread_unlock(td);
2102	}
2103}
2104
2105/*
2106 * Handle a stathz tick.  This is really only relevant for timeshare
2107 * threads.
2108 */
2109void
2110sched_clock(struct thread *td)
2111{
2112	struct tdq *tdq;
2113	struct td_sched *ts;
2114
2115	THREAD_LOCK_ASSERT(td, MA_OWNED);
2116	tdq = TDQ_SELF();
2117#ifdef SMP
2118	/*
2119	 * We run the long term load balancer infrequently on the first cpu.
2120	 */
2121	if (balance_tdq == tdq) {
2122		if (balance_ticks && --balance_ticks == 0)
2123			sched_balance();
2124	}
2125#endif
2126	/*
2127	 * Save the old switch count so we have a record of the last tick's
2128	 * activity.  Initialize the new switch count based on our load.
2129	 * If there is some activity, seed it to reflect that.
2130	 */
2131	tdq->tdq_oldswitchcnt = tdq->tdq_switchcnt;
2132	tdq->tdq_switchcnt = tdq->tdq_load;
2133	/*
2134	 * Advance the insert index once for each tick to ensure that all
2135	 * threads get a chance to run.
2136	 */
2137	if (tdq->tdq_idx == tdq->tdq_ridx) {
2138		tdq->tdq_idx = (tdq->tdq_idx + 1) % RQ_NQS;
2139		if (TAILQ_EMPTY(&tdq->tdq_timeshare.rq_queues[tdq->tdq_ridx]))
2140			tdq->tdq_ridx = tdq->tdq_idx;
2141	}
2142	ts = td->td_sched;
2143	if (td->td_pri_class & PRI_FIFO_BIT)
2144		return;
2145	if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE) {
2146		/*
2147		 * We used a tick; charge it to the thread so
2148		 * that we can compute our interactivity.
2149		 */
2150		td->td_sched->ts_runtime += tickincr;
2151		sched_interact_update(td);
2152		sched_priority(td);
2153	}
2154	/*
2155	 * We used up one time slice.
2156	 */
2157	if (--ts->ts_slice > 0)
2158		return;
2159	/*
2160	 * We're out of time, force a requeue at userret().
2161	 */
2162	ts->ts_slice = sched_slice;
2163	td->td_flags |= TDF_NEEDRESCHED;
2164}
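
/*
 * A sketch of the timeshare queue rotation driven above: tdq_timeshare
 * acts as a circular calendar of RQ_NQS buckets, where tdq_ridx is the
 * bucket currently being drained and tdq_idx is where newly queued
 * timeshare threads are inserted.  When the two indices meet, the
 * insert index is pushed one bucket ahead, and the drain index follows
 * only once its bucket empties, so threads already queued cannot be
 * bypassed indefinitely by new arrivals.
 */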
2165
2166/*
2167 * Called once per hz tick.  Used for cpu utilization information.  This
2168 * is easier than trying to scale based on stathz.
2169 */
2170void
2171sched_tick(int cnt)
2172{
2173	struct td_sched *ts;
2174
2175	ts = curthread->td_sched;
2176	/*
2177	 * Ticks is updated asynchronously on a single cpu.  Check here to
2178	 * avoid incrementing ts_ticks multiple times in a single tick.
2179	 */
2180	if (ts->ts_incrtick == ticks)
2181		return;
2182	/* Adjust ticks for pctcpu */
2183	ts->ts_ticks += cnt << SCHED_TICK_SHIFT;
2184	ts->ts_ltick = ticks;
2185	ts->ts_incrtick = ticks;
2186	/*
2187	 * Update if we've exceeded our desired tick threshold by over one
2188	 * second.
2189	 */
2190	if (ts->ts_ftick + SCHED_TICK_MAX < ts->ts_ltick)
2191		sched_pctcpu_update(ts);
2192}
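
/*
 * A note on the scaling above: ts_ticks is a fixed-point tick count,
 * scaled up by SCHED_TICK_SHIFT, of how often this thread was running
 * when the hz clock fired.  Keeping it scaled lets
 * sched_pctcpu_update() rescale the history to roughly the last
 * SCHED_TICK_SECS seconds without collapsing small values to zero, and
 * sched_pctcpu() later converts it into a %cpu estimate.
 */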
2193
2194/*
2195 * Return whether the current CPU has runnable tasks.  Used for in-kernel
2196 * cooperative idle threads.
2197 */
2198int
2199sched_runnable(void)
2200{
2201	struct tdq *tdq;
2202	int load;
2203
2204	load = 1;
2205
2206	tdq = TDQ_SELF();
2207	if ((curthread->td_flags & TDF_IDLETD) != 0) {
2208		if (tdq->tdq_load > 0)
2209			goto out;
2210	} else
2211		if (tdq->tdq_load - 1 > 0)
2212			goto out;
2213	load = 0;
2214out:
2215	return (load);
2216}
2217
2218/*
2219 * Choose the highest priority thread to run.  The thread is removed from
2220 * the run-queue while running; however, the load remains.  For SMP we set
2221 * the tdq in the global idle bitmask if it idles here.
2222 */
2223struct thread *
2224sched_choose(void)
2225{
2226	struct thread *td;
2227	struct tdq *tdq;
2228
2229	tdq = TDQ_SELF();
2230	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
2231	td = tdq_choose(tdq);
2232	if (td) {
2233		td->td_sched->ts_ltick = ticks;
2234		tdq_runq_rem(tdq, td);
2235		tdq->tdq_lowpri = td->td_priority;
2236		return (td);
2237	}
2238	tdq->tdq_lowpri = PRI_MAX_IDLE;
2239	return (PCPU_GET(idlethread));
2240}
2241
2242/*
2243 * Set owepreempt if necessary.  Preemption never happens directly in ULE;
2244 * we request it here and it is taken once we exit a critical section.
2245 */
2246static inline void
2247sched_setpreempt(struct thread *td)
2248{
2249	struct thread *ctd;
2250	int cpri;
2251	int pri;
2252
2253	THREAD_LOCK_ASSERT(curthread, MA_OWNED);
2254
2255	ctd = curthread;
2256	pri = td->td_priority;
2257	cpri = ctd->td_priority;
2258	if (pri < cpri)
2259		ctd->td_flags |= TDF_NEEDRESCHED;
2260	if (panicstr != NULL || pri >= cpri || cold || TD_IS_INHIBITED(ctd))
2261		return;
2262	if (!sched_shouldpreempt(pri, cpri, 0))
2263		return;
2264	ctd->td_owepreempt = 1;
2265}
2266
2267/*
2268 * Add a thread to a thread queue.  Select the appropriate runq and add the
2269 * thread to it.  This is the internal function called when the tdq is
2270 * predetermined.
2271 */
2272void
2273tdq_add(struct tdq *tdq, struct thread *td, int flags)
2274{
2275
2276	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
2277	KASSERT((td->td_inhibitors == 0),
2278	    ("sched_add: trying to run inhibited thread"));
2279	KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
2280	    ("sched_add: bad thread state"));
2281	KASSERT(td->td_flags & TDF_INMEM,
2282	    ("sched_add: thread swapped out"));
2283
2284	if (td->td_priority < tdq->tdq_lowpri)
2285		tdq->tdq_lowpri = td->td_priority;
2286	tdq_runq_add(tdq, td, flags);
2287	tdq_load_add(tdq, td);
2288}
2289
2290/*
2291 * Select the target thread queue and add a thread to it.  Request
2292 * preemption or IPI a remote processor if required.
2293 */
2294void
2295sched_add(struct thread *td, int flags)
2296{
2297	struct tdq *tdq;
2298#ifdef SMP
2299	int cpu;
2300#endif
2301
2302	KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq add",
2303	    "prio:%d", td->td_priority, KTR_ATTR_LINKED,
2304	    sched_tdname(curthread));
2305	KTR_POINT1(KTR_SCHED, "thread", sched_tdname(curthread), "wokeup",
2306	    KTR_ATTR_LINKED, sched_tdname(td));
2307	THREAD_LOCK_ASSERT(td, MA_OWNED);
2308	/*
2309	 * Recalculate the priority before we select the target cpu or
2310	 * run-queue.
2311	 */
2312	if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE)
2313		sched_priority(td);
2314#ifdef SMP
2315	/*
2316	 * Pick the destination cpu and if it isn't ours transfer to the
2317	 * target cpu.
2318	 */
2319	cpu = sched_pickcpu(td, flags);
2320	tdq = sched_setcpu(td, cpu, flags);
2321	tdq_add(tdq, td, flags);
2322	if (cpu != PCPU_GET(cpuid)) {
2323		tdq_notify(tdq, td);
2324		return;
2325	}
2326#else
2327	tdq = TDQ_SELF();
2328	TDQ_LOCK(tdq);
2329	/*
2330	 * Now that the thread is moving to the run-queue, set the lock
2331	 * to the scheduler's lock.
2332	 */
2333	thread_lock_set(td, TDQ_LOCKPTR(tdq));
2334	tdq_add(tdq, td, flags);
2335#endif
2336	if (!(flags & SRQ_YIELDING))
2337		sched_setpreempt(td);
2338}
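
/*
 * A brief note on the tail of sched_add(): when the caller is itself
 * about to yield (SRQ_YIELDING) there is no point in flagging the
 * current thread for preemption, as it is already headed through
 * mi_switch().  For a remote cpu the preemption decision is made in
 * tdq_notify() instead, which is why the SMP path returns early.
 */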
2339
2340/*
2341 * Remove a thread from a run-queue without running it.  This is used
2342 * when we're stealing a thread from a remote queue.  Otherwise all threads
2343 * exit by calling sched_exit_thread() and sched_throw() themselves.
2344 */
2345void
2346sched_rem(struct thread *td)
2347{
2348	struct tdq *tdq;
2349
2350	KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "runq rem",
2351	    "prio:%d", td->td_priority);
2352	tdq = TDQ_CPU(td->td_sched->ts_cpu);
2353	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
2354	MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
2355	KASSERT(TD_ON_RUNQ(td),
2356	    ("sched_rem: thread not on run queue"));
2357	tdq_runq_rem(tdq, td);
2358	tdq_load_rem(tdq, td);
2359	TD_SET_CAN_RUN(td);
2360	if (td->td_priority == tdq->tdq_lowpri)
2361		tdq_setlowpri(tdq, NULL);
2362}
2363
2364/*
2365 * Fetch cpu utilization information.  Updates on demand.
2366 */
2367fixpt_t
2368sched_pctcpu(struct thread *td)
2369{
2370	fixpt_t pctcpu;
2371	struct td_sched *ts;
2372
2373	pctcpu = 0;
2374	ts = td->td_sched;
2375	if (ts == NULL)
2376		return (0);
2377
2378	THREAD_LOCK_ASSERT(td, MA_OWNED);
2379	if (ts->ts_ticks) {
2380		int rtick;
2381
2382		sched_pctcpu_update(ts);
2383		/* How many rticks per second? */
2384		rtick = min(SCHED_TICK_HZ(ts) / SCHED_TICK_SECS, hz);
2385		pctcpu = (FSCALE * ((FSCALE * rtick)/hz)) >> FSHIFT;
2386	}
2387
2388	return (pctcpu);
2389}
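
/*
 * A worked example of the conversion above, assuming hz = 1000: a
 * thread that was on cpu for about half of the sampled window ends up
 * with rtick of roughly 500, so
 * pctcpu ~= (FSCALE * ((FSCALE * 500) / 1000)) >> FSHIFT ~= FSCALE / 2,
 * which utilities such as ps(1) report as roughly 50%.
 */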
2390
2391/*
2392 * Enforce affinity settings for a thread.  Called after adjustments to
2393 * cpumask.
2394 */
2395void
2396sched_affinity(struct thread *td)
2397{
2398#ifdef SMP
2399	struct td_sched *ts;
2400
2401	THREAD_LOCK_ASSERT(td, MA_OWNED);
2402	ts = td->td_sched;
2403	if (THREAD_CAN_SCHED(td, ts->ts_cpu))
2404		return;
2405	if (TD_ON_RUNQ(td)) {
2406		sched_rem(td);
2407		sched_add(td, SRQ_BORING);
2408		return;
2409	}
2410	if (!TD_IS_RUNNING(td))
2411		return;
2412	/*
2413	 * Force a switch before returning to userspace.  If the
2414	 * target thread is not running locally, send an ipi to force
2415	 * the issue.
2416	 */
2417	td->td_flags |= TDF_NEEDRESCHED;
2418	if (td != curthread)
2419		ipi_cpu(ts->ts_cpu, IPI_PREEMPT);
2420#endif
2421}
2422
2423/*
2424 * Bind a thread to a target cpu.
2425 */
2426void
2427sched_bind(struct thread *td, int cpu)
2428{
2429	struct td_sched *ts;
2430
2431	THREAD_LOCK_ASSERT(td, MA_OWNED|MA_NOTRECURSED);
2432	KASSERT(td == curthread, ("sched_bind: can only bind curthread"));
2433	ts = td->td_sched;
2434	if (ts->ts_flags & TSF_BOUND)
2435		sched_unbind(td);
2436	KASSERT(THREAD_CAN_MIGRATE(td), ("%p must be migratable", td));
2437	ts->ts_flags |= TSF_BOUND;
2438	sched_pin();
2439	if (PCPU_GET(cpuid) == cpu)
2440		return;
2441	ts->ts_cpu = cpu;
2442	/* When we return from mi_switch we'll be on the correct cpu. */
2443	mi_switch(SW_VOL, NULL);
2444}
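
/*
 * A sketch of the typical calling pattern for the binding interface;
 * 'target_cpu' is a hypothetical variable supplied by the caller:
 */
#if 0
	thread_lock(curthread);
	sched_bind(curthread, target_cpu);	/* May mi_switch() to target_cpu. */
	thread_unlock(curthread);
	/* ... work that must run on target_cpu goes here ... */
	thread_lock(curthread);
	sched_unbind(curthread);
	thread_unlock(curthread);
#endif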
2445
2446/*
2447 * Release a bound thread.
2448 */
2449void
2450sched_unbind(struct thread *td)
2451{
2452	struct td_sched *ts;
2453
2454	THREAD_LOCK_ASSERT(td, MA_OWNED);
2455	KASSERT(td == curthread, ("sched_unbind: can only unbind curthread"));
2456	ts = td->td_sched;
2457	if ((ts->ts_flags & TSF_BOUND) == 0)
2458		return;
2459	ts->ts_flags &= ~TSF_BOUND;
2460	sched_unpin();
2461}
2462
2463int
2464sched_is_bound(struct thread *td)
2465{
2466	THREAD_LOCK_ASSERT(td, MA_OWNED);
2467	return (td->td_sched->ts_flags & TSF_BOUND);
2468}
2469
2470/*
2471 * Basic yield call.
2472 */
2473void
2474sched_relinquish(struct thread *td)
2475{
2476	thread_lock(td);
2477	mi_switch(SW_VOL | SWT_RELINQUISH, NULL);
2478	thread_unlock(td);
2479}
2480
2481/*
2482 * Return the total system load.
2483 */
2484int
2485sched_load(void)
2486{
2487#ifdef SMP
2488	int total;
2489	int i;
2490
2491	total = 0;
2492	CPU_FOREACH(i)
2493		total += TDQ_CPU(i)->tdq_sysload;
2494	return (total);
2495#else
2496	return (TDQ_SELF()->tdq_sysload);
2497#endif
2498}
2499
2500int
2501sched_sizeof_proc(void)
2502{
2503	return (sizeof(struct proc));
2504}
2505
2506int
2507sched_sizeof_thread(void)
2508{
2509	return (sizeof(struct thread) + sizeof(struct td_sched));
2510}
2511
2512#ifdef SMP
2513#define	TDQ_IDLESPIN(tdq)						\
2514    ((tdq)->tdq_cg != NULL && ((tdq)->tdq_cg->cg_flags & CG_FLAG_THREAD) == 0)
2515#else
2516#define	TDQ_IDLESPIN(tdq)	1
2517#endif
2518
2519/*
2520 * The actual idle process.
2521 */
2522void
2523sched_idletd(void *dummy)
2524{
2525	struct thread *td;
2526	struct tdq *tdq;
2527	int switchcnt;
2528	int i;
2529
2530	mtx_assert(&Giant, MA_NOTOWNED);
2531	td = curthread;
2532	tdq = TDQ_SELF();
2533	for (;;) {
2534#ifdef SMP
2535		if (tdq_idled(tdq) == 0)
2536			continue;
2537#endif
2538		switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt;
2539		/*
2540		 * If we're switching very frequently, spin while checking
2541		 * for load rather than entering a low power state that
2542		 * may require an IPI.  However, don't do any busy
2543		 * loops while on SMT machines as this simply steals
2544		 * cycles from cores doing useful work.
2545		 */
2546		if (TDQ_IDLESPIN(tdq) && switchcnt > sched_idlespinthresh) {
2547			for (i = 0; i < sched_idlespins; i++) {
2548				if (tdq->tdq_load)
2549					break;
2550				cpu_spinwait();
2551			}
2552		}
2553		switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt;
2554		if (tdq->tdq_load == 0) {
2555			tdq->tdq_cpu_idle = 1;
2556			if (tdq->tdq_load == 0) {
2557				cpu_idle(switchcnt > sched_idlespinthresh * 4);
2558				tdq->tdq_switchcnt++;
2559			}
2560			tdq->tdq_cpu_idle = 0;
2561		}
2562		if (tdq->tdq_load) {
2563			thread_lock(td);
2564			mi_switch(SW_VOL | SWT_IDLE, NULL);
2565			thread_unlock(td);
2566		}
2567	}
2568}
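
/*
 * A descriptive note on the ordering in sched_idletd() above:
 * tdq_cpu_idle is set before tdq_load is re-checked so that a remote
 * cpu enqueuing work either observes the flag and sends a wakeup IPI,
 * or enqueued early enough for the second load test to see it.  The
 * opposite order could let a wakeup slip in between the test and
 * cpu_idle(), leaving this cpu asleep with runnable load.
 */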
2569
2570/*
2571 * A CPU is entering the scheduler for the first time or a thread is exiting.
2572 */
2573void
2574sched_throw(struct thread *td)
2575{
2576	struct thread *newtd;
2577	struct tdq *tdq;
2578
2579	tdq = TDQ_SELF();
2580	if (td == NULL) {
2581		/* Correct spinlock nesting and acquire the correct lock. */
2582		TDQ_LOCK(tdq);
2583		spinlock_exit();
2584	} else {
2585		MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
2586		tdq_load_rem(tdq, td);
2587		lock_profile_release_lock(&TDQ_LOCKPTR(tdq)->lock_object);
2588	}
2589	KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count"));
2590	newtd = choosethread();
2591	TDQ_LOCKPTR(tdq)->mtx_lock = (uintptr_t)newtd;
2592	PCPU_SET(switchtime, cpu_ticks());
2593	PCPU_SET(switchticks, ticks);
2594	cpu_throw(td, newtd);		/* doesn't return */
2595}
2596
2597/*
2598 * This is called from fork_exit().  Just acquire the correct locks and
2599 * let fork do the rest of the work.
2600 */
2601void
2602sched_fork_exit(struct thread *td)
2603{
2604	struct td_sched *ts;
2605	struct tdq *tdq;
2606	int cpuid;
2607
2608	/*
2609	 * Finish setting up thread glue so that it begins execution in a
2610	 * non-nested critical section with the scheduler lock held.
2611	 */
2612	cpuid = PCPU_GET(cpuid);
2613	tdq = TDQ_CPU(cpuid);
2614	ts = td->td_sched;
2615	if (TD_IS_IDLETHREAD(td))
2616		td->td_lock = TDQ_LOCKPTR(tdq);
2617	MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
2618	td->td_oncpu = cpuid;
2619	TDQ_LOCK_ASSERT(tdq, MA_OWNED | MA_NOTRECURSED);
2620	lock_profile_obtain_lock_success(
2621	    &TDQ_LOCKPTR(tdq)->lock_object, 0, 0, __FILE__, __LINE__);
2622}
2623
2624/*
2625 * Create the thread's KTR name on first use to catch odd startup conditions.
2626 */
2627char *
2628sched_tdname(struct thread *td)
2629{
2630#ifdef KTR
2631	struct td_sched *ts;
2632
2633	ts = td->td_sched;
2634	if (ts->ts_name[0] == '\0')
2635		snprintf(ts->ts_name, sizeof(ts->ts_name),
2636		    "%s tid %d", td->td_name, td->td_tid);
2637	return (ts->ts_name);
2638#else
2639	return (td->td_name);
2640#endif
2641}
2642
2643#ifdef SMP
2644
2645/*
2646 * Build the CPU topology dump string.  It is called recursively to collect
2647 * the topology tree.
2648 */
2649static int
2650sysctl_kern_sched_topology_spec_internal(struct sbuf *sb, struct cpu_group *cg,
2651    int indent)
2652{
2653	int i, first;
2654
2655	sbuf_printf(sb, "%*s<group level=\"%d\" cache-level=\"%d\">\n", indent,
2656	    "", 1 + indent / 2, cg->cg_level);
2657	sbuf_printf(sb, "%*s <cpu count=\"%d\" mask=\"0x%x\">", indent, "",
2658	    cg->cg_count, cg->cg_mask);
2659	first = TRUE;
2660	for (i = 0; i < MAXCPU; i++) {
2661		if ((cg->cg_mask & (1 << i)) != 0) {
2662			if (!first)
2663				sbuf_printf(sb, ", ");
2664			else
2665				first = FALSE;
2666			sbuf_printf(sb, "%d", i);
2667		}
2668	}
2669	sbuf_printf(sb, "</cpu>\n");
2670
2671	if (cg->cg_flags != 0) {
2672		sbuf_printf(sb, "%*s <flags>", indent, "");
2673		if ((cg->cg_flags & CG_FLAG_HTT) != 0)
2674			sbuf_printf(sb, "<flag name=\"HTT\">HTT group</flag>");
2675		if ((cg->cg_flags & CG_FLAG_THREAD) != 0)
2676			sbuf_printf(sb, "<flag name=\"THREAD\">THREAD group</flag>");
2677		if ((cg->cg_flags & CG_FLAG_SMT) != 0)
2678			sbuf_printf(sb, "<flag name=\"SMT\">SMT group</flag>");
2679		sbuf_printf(sb, "</flags>\n");
2680	}
2681
2682	if (cg->cg_children > 0) {
2683		sbuf_printf(sb, "%*s <children>\n", indent, "");
2684		for (i = 0; i < cg->cg_children; i++)
2685			sysctl_kern_sched_topology_spec_internal(sb,
2686			    &cg->cg_child[i], indent+2);
2687		sbuf_printf(sb, "%*s </children>\n", indent, "");
2688	}
2689	sbuf_printf(sb, "%*s</group>\n", indent, "");
2690	return (0);
2691}
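
/*
 * Shape of the fragment generated above, shown for a hypothetical
 * single four-cpu group (actual levels, masks, flags and children
 * depend on the detected topology):
 *
 *  <group level="1" cache-level="2">
 *   <cpu count="4" mask="0xf">0, 1, 2, 3</cpu>
 *  </group>
 */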
2692
2693/*
2694 * Sysctl handler for retrieving the topology dump.  It's a wrapper for
2695 * the recursive sysctl_kern_sched_topology_spec_internal().
2696 */
2697static int
2698sysctl_kern_sched_topology_spec(SYSCTL_HANDLER_ARGS)
2699{
2700	struct sbuf *topo;
2701	int err;
2702
2703	KASSERT(cpu_top != NULL, ("cpu_top isn't initialized"));
2704
2705	topo = sbuf_new(NULL, NULL, 500, SBUF_AUTOEXTEND);
2706	if (topo == NULL)
2707		return (ENOMEM);
2708
2709	sbuf_printf(topo, "<groups>\n");
2710	err = sysctl_kern_sched_topology_spec_internal(topo, cpu_top, 1);
2711	sbuf_printf(topo, "</groups>\n");
2712
2713	if (err == 0) {
2714		sbuf_finish(topo);
2715		err = SYSCTL_OUT(req, sbuf_data(topo), sbuf_len(topo));
2716	}
2717	sbuf_delete(topo);
2718	return (err);
2719}
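
/*
 * The handler above backs the read-only kern.sched.topology_spec
 * sysctl declared below; from userland the XML dump can be retrieved
 * with, e.g., "sysctl -n kern.sched.topology_spec".
 */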
2720
2721#endif
2722
2723SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "Scheduler");
2724SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ULE", 0,
2725    "Scheduler name");
2726SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0,
2727    "Slice size for timeshare threads");
2728SYSCTL_INT(_kern_sched, OID_AUTO, interact, CTLFLAG_RW, &sched_interact, 0,
2729    "Interactivity score threshold");
2730SYSCTL_INT(_kern_sched, OID_AUTO, preempt_thresh, CTLFLAG_RW, &preempt_thresh,
2731    0, "Min priority for preemption, lower priorities have greater precedence");
2732SYSCTL_INT(_kern_sched, OID_AUTO, static_boost, CTLFLAG_RW, &static_boost,
2733    0, "Controls whether static kernel priorities are assigned to sleeping threads.");
2734SYSCTL_INT(_kern_sched, OID_AUTO, idlespins, CTLFLAG_RW, &sched_idlespins,
2735    0, "Number of times idle will spin waiting for new work.");
2736SYSCTL_INT(_kern_sched, OID_AUTO, idlespinthresh, CTLFLAG_RW, &sched_idlespinthresh,
2737    0, "Threshold before we will permit idle spinning.");
2738#ifdef SMP
2739SYSCTL_INT(_kern_sched, OID_AUTO, affinity, CTLFLAG_RW, &affinity, 0,
2740    "Number of hz ticks to keep thread affinity for");
2741SYSCTL_INT(_kern_sched, OID_AUTO, balance, CTLFLAG_RW, &rebalance, 0,
2742    "Enables the long-term load balancer");
2743SYSCTL_INT(_kern_sched, OID_AUTO, balance_interval, CTLFLAG_RW,
2744    &balance_interval, 0,
2745    "Average frequency in stathz ticks to run the long-term balancer");
2746SYSCTL_INT(_kern_sched, OID_AUTO, steal_htt, CTLFLAG_RW, &steal_htt, 0,
2747    "Steals work from another hyper-threaded core on idle");
2748SYSCTL_INT(_kern_sched, OID_AUTO, steal_idle, CTLFLAG_RW, &steal_idle, 0,
2749    "Attempts to steal work from other cores before idling");
2750SYSCTL_INT(_kern_sched, OID_AUTO, steal_thresh, CTLFLAG_RW, &steal_thresh, 0,
2751    "Minimum load on remote cpu before we'll steal");
2752
2753/* Retrieve SMP topology */
2754SYSCTL_PROC(_kern_sched, OID_AUTO, topology_spec, CTLTYPE_STRING |
2755    CTLFLAG_RD, NULL, 0, sysctl_kern_sched_topology_spec, "A",
2756    "XML dump of detected CPU topology");
2757
2758#endif
2759
2760/* ps compat.  All cpu percentages from ULE are weighted. */
2761static int ccpu = 0;
2762SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
2763