1/*-
2 * Copyright (c) 2002-2007, Jeffrey Roberson <jeff@freebsd.org>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice unmodified, this list of conditions, and the following
10 *    disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
19 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27/*
28 * This file implements the ULE scheduler.  ULE supports independent CPU
29 * run queues and fine grain locking.  It has superior interactive
30 * performance under load even on uni-processor systems.
31 *
32 * etymology:
33 *   ULE is the last three letters in schedule.  It owes its name to a
34 * generic user created for a scheduling system by Paul Mikesell at
35 * Isilon Systems and a general lack of creativity on the part of the author.
36 */
37
38#include <sys/cdefs.h>
39__FBSDID("$FreeBSD: head/sys/kern/sched_ule.c 187357 2009-01-17 07:17:57Z jeff $");
40
41#include "opt_hwpmc_hooks.h"
42#include "opt_kdtrace.h"
43#include "opt_sched.h"
44
45#include <sys/param.h>
46#include <sys/systm.h>
47#include <sys/kdb.h>
48#include <sys/kernel.h>
49#include <sys/ktr.h>
50#include <sys/lock.h>
51#include <sys/mutex.h>
52#include <sys/proc.h>
53#include <sys/resource.h>
54#include <sys/resourcevar.h>
55#include <sys/sched.h>
56#include <sys/smp.h>
57#include <sys/sx.h>
58#include <sys/sysctl.h>
59#include <sys/sysproto.h>
60#include <sys/turnstile.h>
61#include <sys/umtx.h>
62#include <sys/vmmeter.h>
63#include <sys/cpuset.h>
64#include <sys/sbuf.h>
65#ifdef KTRACE
66#include <sys/uio.h>
67#include <sys/ktrace.h>
68#endif
69
70#ifdef HWPMC_HOOKS
71#include <sys/pmckern.h>
72#endif
73
74#ifdef KDTRACE_HOOKS
75#include <sys/dtrace_bsd.h>
76int				dtrace_vtime_active;
77dtrace_vtime_switch_func_t	dtrace_vtime_switch_func;
78#endif
79
80#include <machine/cpu.h>
81#include <machine/smp.h>
82
83#if defined(__sparc64__) || defined(__mips__)
84#error "This architecture is not currently compatible with ULE"
85#endif
86
87#define	KTR_ULE	0
88
89#define	TS_NAME_LEN (MAXCOMLEN + sizeof(" td ") + sizeof(__STRING(UINT_MAX)))
90#define	TDQ_NAME_LEN	(sizeof("sched lock ") + sizeof(__STRING(MAXCPU)))
91#define	TDQ_LOADNAME_LEN	(PCPU_NAME_LEN + sizeof(" load"))
92
93/*
94 * Thread scheduler specific section.  All fields are protected
95 * by the thread lock.
96 */
97struct td_sched {
98	struct runq	*ts_runq;	/* Run-queue we're queued on. */
99	short		ts_flags;	/* TSF_* flags. */
100	u_char		ts_cpu;		/* CPU that we have affinity for. */
101	int		ts_rltick;	/* Real last tick, for affinity. */
102	int		ts_slice;	/* Ticks of slice remaining. */
103	u_int		ts_slptime;	/* Number of ticks we vol. slept */
104	u_int		ts_runtime;	/* Number of ticks we were running */
105	int		ts_ltick;	/* Last tick that we were running on */
106	int		ts_ftick;	/* First tick that we were running on */
107	int		ts_ticks;	/* Tick count */
108#ifdef KTR
109	char		ts_name[TS_NAME_LEN];
110#endif
111};
112/* flags kept in ts_flags */
113#define	TSF_BOUND	0x0001		/* Thread can not migrate. */
114#define	TSF_XFERABLE	0x0002		/* Thread was added as transferable. */
115
116static struct td_sched td_sched0;
117
118#define	THREAD_CAN_MIGRATE(td)	((td)->td_pinned == 0)
119#define	THREAD_CAN_SCHED(td, cpu)	\
120    CPU_ISSET((cpu), &(td)->td_cpuset->cs_mask)
121
122/*
123 * Cpu percentage computation macros and defines.
124 *
125 * SCHED_TICK_SECS:	Number of seconds to average the cpu usage across.
126 * SCHED_TICK_TARG:	Number of hz ticks to average the cpu usage across.
127 * SCHED_TICK_MAX:	Maximum number of ticks before scaling back.
128 * SCHED_TICK_SHIFT:	Shift factor to avoid rounding away results.
129 * SCHED_TICK_HZ:	Compute the number of hz ticks for a given ticks count.
130 * SCHED_TICK_TOTAL:	Gives the amount of time we've been recording ticks.
131 */
132#define	SCHED_TICK_SECS		10
133#define	SCHED_TICK_TARG		(hz * SCHED_TICK_SECS)
134#define	SCHED_TICK_MAX		(SCHED_TICK_TARG + hz)
135#define	SCHED_TICK_SHIFT	10
136#define	SCHED_TICK_HZ(ts)	((ts)->ts_ticks >> SCHED_TICK_SHIFT)
137#define	SCHED_TICK_TOTAL(ts)	(max((ts)->ts_ltick - (ts)->ts_ftick, hz))
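
/*
 * Worked example: assuming hz = 1000, SCHED_TICK_TARG is 10000 ticks (ten
 * seconds) and SCHED_TICK_MAX is 11000.  A thread with
 * ts_ticks = 5120 << SCHED_TICK_SHIFT over a ts_ltick - ts_ftick window of
 * 10000 ticks gives SCHED_TICK_HZ() = 5120 and SCHED_TICK_TOTAL() = 10000,
 * i.e. roughly 51% cpu across the averaging window.
 */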
138
139/*
140 * These macros determine priorities for non-interactive threads.  They are
141 * assigned a priority based on their recent cpu utilization as expressed
142 * by the ratio of ticks to the tick total.  NHALF priorities at the start
143 * and end of the MIN to MAX timeshare range are only reachable with negative
144 * or positive nice respectively.
145 *
146 * PRI_RANGE:	Priority range for utilization dependent priorities.
147 * PRI_NRESV:	Number of nice values.
148 * PRI_TICKS:	Compute a priority in PRI_RANGE from the ticks count and total.
149 * PRI_NICE:	Determines the part of the priority inherited from nice.
150 */
151#define	SCHED_PRI_NRESV		(PRIO_MAX - PRIO_MIN)
152#define	SCHED_PRI_NHALF		(SCHED_PRI_NRESV / 2)
153#define	SCHED_PRI_MIN		(PRI_MIN_TIMESHARE + SCHED_PRI_NHALF)
154#define	SCHED_PRI_MAX		(PRI_MAX_TIMESHARE - SCHED_PRI_NHALF)
155#define	SCHED_PRI_RANGE		(SCHED_PRI_MAX - SCHED_PRI_MIN)
156#define	SCHED_PRI_TICKS(ts)						\
157    (SCHED_TICK_HZ((ts)) /						\
158    (roundup(SCHED_TICK_TOTAL((ts)), SCHED_PRI_RANGE) / SCHED_PRI_RANGE))
159#define	SCHED_PRI_NICE(nice)	(nice)
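
/*
 * Worked example: SCHED_PRI_TICKS() is approximately
 * SCHED_TICK_HZ(ts) * SCHED_PRI_RANGE / SCHED_TICK_TOTAL(ts), so a thread
 * that consumed about half of the cpu over the averaging window lands near
 * the middle of the utilization dependent range, i.e. close to
 * SCHED_PRI_MIN + SCHED_PRI_RANGE / 2, before the nice offset is applied.
 */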
160
161/*
162 * These determine the interactivity of a process.  Interactivity differs from
163 * cpu utilization in that it expresses the voluntary time slept vs time ran
164 * while cpu utilization includes all time not running.  This more accurately
165 * models the intent of the thread.
166 *
167 * SLP_RUN_MAX:	Maximum amount of sleep time + run time we'll accumulate
168 *		before throttling back.
169 * SLP_RUN_FORK:	Maximum slp+run time to inherit at fork time.
170 * INTERACT_MAX:	Maximum interactivity value.  Smaller is better.
171 * INTERACT_THRESH:	Threshold for placement on the current runq.
172 */
173#define	SCHED_SLP_RUN_MAX	((hz * 5) << SCHED_TICK_SHIFT)
174#define	SCHED_SLP_RUN_FORK	((hz / 2) << SCHED_TICK_SHIFT)
175#define	SCHED_INTERACT_MAX	(100)
176#define	SCHED_INTERACT_HALF	(SCHED_INTERACT_MAX / 2)
177#define	SCHED_INTERACT_THRESH	(30)
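
/*
 * Worked example: assuming hz = 1000, SCHED_SLP_RUN_MAX corresponds to five
 * seconds of combined sleep + run history and SCHED_SLP_RUN_FORK to half a
 * second, both scaled by SCHED_TICK_SHIFT.  A thread whose interactivity
 * score falls below SCHED_INTERACT_THRESH (30) is treated as interactive.
 */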
178
179/*
180 * tickincr:		Converts a stathz tick into a hz domain scaled by
181 *			the shift factor.  Without the shift the error rate
182 *			due to rounding would be unacceptably high.
183 * realstathz:		stathz is sometimes 0, in which case we run off of hz.
184 * sched_slice:		Runtime of each thread before rescheduling.
185 * preempt_thresh:	Priority threshold for preemption and remote IPIs.
186 */
187static int sched_interact = SCHED_INTERACT_THRESH;
188static int realstathz;
189static int tickincr;
190static int sched_slice = 1;
191#ifdef PREEMPTION
192#ifdef FULL_PREEMPTION
193static int preempt_thresh = PRI_MAX_IDLE;
194#else
195static int preempt_thresh = PRI_MIN_KERN;
196#endif
197#else
198static int preempt_thresh = 0;
199#endif
200static int static_boost = PRI_MIN_TIMESHARE;
201static int sched_idlespins = 10000;
202static int sched_idlespinthresh = 4;
203
204/*
205 * tdq - per processor runqs and statistics.  All fields are protected by the
206 * tdq_lock.  The load and lowpri may be accessed without the lock to avoid
207 * excess locking in sched_pickcpu().
208 */
209struct tdq {
210	/* Ordered to improve efficiency of cpu_search() and switch(). */
211	struct mtx	tdq_lock;		/* run queue lock. */
212	struct cpu_group *tdq_cg;		/* Pointer to cpu topology. */
213	volatile int	tdq_load;		/* Aggregate load. */
214	int		tdq_sysload;		/* For loadavg, !ITHD load. */
215	int		tdq_transferable;	/* Transferable thread count. */
216	volatile int	tdq_idlestate;		/* State of the idle thread. */
217	short		tdq_switchcnt;		/* Switches this tick. */
218	short		tdq_oldswitchcnt;	/* Switches last tick. */
219	u_char		tdq_lowpri;		/* Lowest priority thread. */
220	u_char		tdq_ipipending;		/* IPI pending. */
221	u_char		tdq_idx;		/* Current insert index. */
222	u_char		tdq_ridx;		/* Current removal index. */
223	struct runq	tdq_realtime;		/* real-time run queue. */
224	struct runq	tdq_timeshare;		/* timeshare run queue. */
225	struct runq	tdq_idle;		/* Queue of IDLE threads. */
226	char		tdq_name[TDQ_NAME_LEN];
227#ifdef KTR
228	char		tdq_loadname[TDQ_LOADNAME_LEN];
229#endif
230} __aligned(64);
231
232/* Idle thread states and config. */
233#define	TDQ_RUNNING	1
234#define	TDQ_IDLE	2
235
236#ifdef SMP
237struct cpu_group *cpu_top;		/* CPU topology */
238
239#define	SCHED_AFFINITY_DEFAULT	(max(1, hz / 1000))
240#define	SCHED_AFFINITY(ts, t)	((ts)->ts_rltick > ticks - ((t) * affinity))
241
242/*
243 * Run-time tunables.
244 */
245static int rebalance = 1;
246static int balance_interval = 128;	/* Default set in sched_initticks(). */
247static int affinity;
248static int steal_htt = 1;
249static int steal_idle = 1;
250static int steal_thresh = 2;
251
252/*
253 * One thread queue per processor.
254 */
255static struct tdq	tdq_cpu[MAXCPU];
256static struct tdq	*balance_tdq;
257static int balance_ticks;
258
259#define	TDQ_SELF()	(&tdq_cpu[PCPU_GET(cpuid)])
260#define	TDQ_CPU(x)	(&tdq_cpu[(x)])
261#define	TDQ_ID(x)	((int)((x) - tdq_cpu))
262#else	/* !SMP */
263static struct tdq	tdq_cpu;
264
265#define	TDQ_ID(x)	(0)
266#define	TDQ_SELF()	(&tdq_cpu)
267#define	TDQ_CPU(x)	(&tdq_cpu)
268#endif
269
270#define	TDQ_LOCK_ASSERT(t, type)	mtx_assert(TDQ_LOCKPTR((t)), (type))
271#define	TDQ_LOCK(t)		mtx_lock_spin(TDQ_LOCKPTR((t)))
272#define	TDQ_LOCK_FLAGS(t, f)	mtx_lock_spin_flags(TDQ_LOCKPTR((t)), (f))
273#define	TDQ_UNLOCK(t)		mtx_unlock_spin(TDQ_LOCKPTR((t)))
274#define	TDQ_LOCKPTR(t)		(&(t)->tdq_lock)
275
276static void sched_priority(struct thread *);
277static void sched_thread_priority(struct thread *, u_char);
278static int sched_interact_score(struct thread *);
279static void sched_interact_update(struct thread *);
280static void sched_interact_fork(struct thread *);
281static void sched_pctcpu_update(struct td_sched *);
282
283/* Operations on per processor queues */
284static struct thread *tdq_choose(struct tdq *);
285static void tdq_setup(struct tdq *);
286static void tdq_load_add(struct tdq *, struct thread *);
287static void tdq_load_rem(struct tdq *, struct thread *);
288static __inline void tdq_runq_add(struct tdq *, struct thread *, int);
289static __inline void tdq_runq_rem(struct tdq *, struct thread *);
290static inline int sched_shouldpreempt(int, int, int);
291void tdq_print(int cpu);
292static void runq_print(struct runq *rq);
293static void tdq_add(struct tdq *, struct thread *, int);
294#ifdef SMP
295static int tdq_move(struct tdq *, struct tdq *);
296static int tdq_idled(struct tdq *);
297static void tdq_notify(struct tdq *, struct thread *);
298static struct thread *tdq_steal(struct tdq *, int);
299static struct thread *runq_steal(struct runq *, int);
300static int sched_pickcpu(struct thread *, int);
301static void sched_balance(void);
302static int sched_balance_pair(struct tdq *, struct tdq *);
303static inline struct tdq *sched_setcpu(struct thread *, int, int);
304static inline struct mtx *thread_block_switch(struct thread *);
305static inline void thread_unblock_switch(struct thread *, struct mtx *);
306static struct mtx *sched_switch_migrate(struct tdq *, struct thread *, int);
307static int sysctl_kern_sched_topology_spec(SYSCTL_HANDLER_ARGS);
308static int sysctl_kern_sched_topology_spec_internal(struct sbuf *sb,
309    struct cpu_group *cg, int indent);
310#endif
311
312static void sched_setup(void *dummy);
313SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL);
314
315static void sched_initticks(void *dummy);
316SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks,
317    NULL);
318
319/*
320 * Print the threads waiting on a run-queue.
321 */
322static void
323runq_print(struct runq *rq)
324{
325	struct rqhead *rqh;
326	struct thread *td;
327	int pri;
328	int j;
329	int i;
330
331	for (i = 0; i < RQB_LEN; i++) {
332		printf("\t\trunq bits %d 0x%zx\n",
333		    i, rq->rq_status.rqb_bits[i]);
334		for (j = 0; j < RQB_BPW; j++)
335			if (rq->rq_status.rqb_bits[i] & (1ul << j)) {
336				pri = j + (i << RQB_L2BPW);
337				rqh = &rq->rq_queues[pri];
338				TAILQ_FOREACH(td, rqh, td_runq) {
339					printf("\t\t\ttd %p(%s) priority %d rqindex %d pri %d\n",
340					    td, td->td_name, td->td_priority,
341					    td->td_rqindex, pri);
342				}
343			}
344	}
345}
346
347/*
348 * Print the status of a per-cpu thread queue.  Should be a ddb show cmd.
349 */
350void
351tdq_print(int cpu)
352{
353	struct tdq *tdq;
354
355	tdq = TDQ_CPU(cpu);
356
357	printf("tdq %d:\n", TDQ_ID(tdq));
358	printf("\tlock            %p\n", TDQ_LOCKPTR(tdq));
359	printf("\tLock name:      %s\n", tdq->tdq_name);
360	printf("\tload:           %d\n", tdq->tdq_load);
361	printf("\tswitch cnt:     %d\n", tdq->tdq_switchcnt);
362	printf("\told switch cnt: %d\n", tdq->tdq_oldswitchcnt);
363	printf("\tidle state:     %d\n", tdq->tdq_idlestate);
364	printf("\ttimeshare idx:  %d\n", tdq->tdq_idx);
365	printf("\ttimeshare ridx: %d\n", tdq->tdq_ridx);
366	printf("\tload transferable: %d\n", tdq->tdq_transferable);
367	printf("\tlowest priority:   %d\n", tdq->tdq_lowpri);
368	printf("\trealtime runq:\n");
369	runq_print(&tdq->tdq_realtime);
370	printf("\ttimeshare runq:\n");
371	runq_print(&tdq->tdq_timeshare);
372	printf("\tidle runq:\n");
373	runq_print(&tdq->tdq_idle);
374}
375
376static inline int
377sched_shouldpreempt(int pri, int cpri, int remote)
378{
379	/*
380	 * If the new priority is not better than the current priority there is
381	 * nothing to do.
382	 */
383	if (pri >= cpri)
384		return (0);
385	/*
386	 * Always preempt idle.
387	 */
388	if (cpri >= PRI_MIN_IDLE)
389		return (1);
390	/*
391	 * If preemption is disabled don't preempt others.
392	 */
393	if (preempt_thresh == 0)
394		return (0);
395	/*
396	 * Preempt if we exceed the threshold.
397	 */
398	if (pri <= preempt_thresh)
399		return (1);
400	/*
401	 * If we're realtime or better and there is timeshare or worse running
402	 * preempt only remote processors.
403	 */
404	if (remote && pri <= PRI_MAX_REALTIME && cpri > PRI_MAX_REALTIME)
405		return (1);
406	return (0);
407}
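
/*
 * Example of the default policy: with preempt_thresh at its PREEMPTION
 * default of PRI_MIN_KERN, a waking thread at kernel priority or better
 * preempts a timeshare thread, an idle cpu is always preempted, and one
 * timeshare thread never preempts another on the local cpu (a remote cpu
 * may still be IPIed when a realtime thread queues behind a timeshare one).
 */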
408
409#define	TS_RQ_PPQ	(((PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE) + 1) / RQ_NQS)
410/*
411 * Add a thread to the actual run-queue.  Keeps transferable counts up to
412 * date with what is actually on the run-queue.  Selects the correct
413 * queue position for timeshare threads.
414 */
415static __inline void
416tdq_runq_add(struct tdq *tdq, struct thread *td, int flags)
417{
418	struct td_sched *ts;
419	u_char pri;
420
421	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
422	THREAD_LOCK_ASSERT(td, MA_OWNED);
423
424	pri = td->td_priority;
425	ts = td->td_sched;
426	TD_SET_RUNQ(td);
427	if (THREAD_CAN_MIGRATE(td)) {
428		tdq->tdq_transferable++;
429		ts->ts_flags |= TSF_XFERABLE;
430	}
431	if (pri <= PRI_MAX_REALTIME) {
432		ts->ts_runq = &tdq->tdq_realtime;
433	} else if (pri <= PRI_MAX_TIMESHARE) {
434		ts->ts_runq = &tdq->tdq_timeshare;
435		KASSERT(pri <= PRI_MAX_TIMESHARE && pri >= PRI_MIN_TIMESHARE,
436			("Invalid priority %d on timeshare runq", pri));
437		/*
438		 * This queue contains only priorities between MIN and MAX
439		 * timeshare.  Use the whole queue to represent these values.
440		 */
441		if ((flags & (SRQ_BORROWING|SRQ_PREEMPTED)) == 0) {
442			pri = (pri - PRI_MIN_TIMESHARE) / TS_RQ_PPQ;
443			pri = (pri + tdq->tdq_idx) % RQ_NQS;
444			/*
445			 * This effectively shortens the queue by one so we
446			 * can have a one slot difference between idx and
447			 * ridx while we wait for threads to drain.
448			 */
449			if (tdq->tdq_ridx != tdq->tdq_idx &&
450			    pri == tdq->tdq_ridx)
451				pri = (unsigned char)(pri - 1) % RQ_NQS;
452		} else
453			pri = tdq->tdq_ridx;
454		runq_add_pri(ts->ts_runq, td, pri, flags);
455		return;
456	} else
457		ts->ts_runq = &tdq->tdq_idle;
458	runq_add(ts->ts_runq, td, flags);
459}
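
/*
 * Example of the calendar queue above: a timeshare thread's priority is
 * scaled down by TS_RQ_PPQ and the thread is inserted that many slots ahead
 * of tdq_idx, so threads with numerically larger (worse) priorities land
 * further from the removal index tdq_ridx and wait longer before being
 * chosen, while threads re-queued with SRQ_PREEMPTED or SRQ_BORROWING go
 * directly to tdq_ridx.
 */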
460
461/*
462 * Remove a thread from a run-queue.  This typically happens when a thread
463 * is selected to run.  Running threads are not on the queue and the
464 * transferable count does not reflect them.
465 */
466static __inline void
467tdq_runq_rem(struct tdq *tdq, struct thread *td)
468{
469	struct td_sched *ts;
470
471	ts = td->td_sched;
472	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
473	KASSERT(ts->ts_runq != NULL,
474	    ("tdq_runq_remove: thread %p null ts_runq", td));
475	if (ts->ts_flags & TSF_XFERABLE) {
476		tdq->tdq_transferable--;
477		ts->ts_flags &= ~TSF_XFERABLE;
478	}
479	if (ts->ts_runq == &tdq->tdq_timeshare) {
480		if (tdq->tdq_idx != tdq->tdq_ridx)
481			runq_remove_idx(ts->ts_runq, td, &tdq->tdq_ridx);
482		else
483			runq_remove_idx(ts->ts_runq, td, NULL);
484	} else
485		runq_remove(ts->ts_runq, td);
486}
487
488/*
489 * Load is maintained for all threads RUNNING and ON_RUNQ.  Add the load
490 * for this thread to the referenced thread queue.
491 */
492static void
493tdq_load_add(struct tdq *tdq, struct thread *td)
494{
495
496	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
497	THREAD_LOCK_ASSERT(td, MA_OWNED);
498
499	tdq->tdq_load++;
500	if ((td->td_proc->p_flag & P_NOLOAD) == 0)
501		tdq->tdq_sysload++;
502	KTR_COUNTER0(KTR_SCHED, "load", tdq->tdq_loadname, tdq->tdq_load);
503}
504
505/*
506 * Remove the load from a thread that is transitioning to a sleep state or
507 * exiting.
508 */
509static void
510tdq_load_rem(struct tdq *tdq, struct thread *td)
511{
512
513	THREAD_LOCK_ASSERT(td, MA_OWNED);
514	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
515	KASSERT(tdq->tdq_load != 0,
516	    ("tdq_load_rem: Removing with 0 load on queue %d", TDQ_ID(tdq)));
517
518	tdq->tdq_load--;
519	if ((td->td_proc->p_flag & P_NOLOAD) == 0)
520		tdq->tdq_sysload--;
521	KTR_COUNTER0(KTR_SCHED, "load", tdq->tdq_loadname, tdq->tdq_load);
522}
523
524/*
525 * Set lowpri to its exact value by searching the run-queue and
526 * evaluating curthread.  curthread may be passed as an optimization.
527 */
528static void
529tdq_setlowpri(struct tdq *tdq, struct thread *ctd)
530{
531	struct thread *td;
532
533	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
534	if (ctd == NULL)
535		ctd = pcpu_find(TDQ_ID(tdq))->pc_curthread;
536	td = tdq_choose(tdq);
537	if (td == NULL || td->td_priority > ctd->td_priority)
538		tdq->tdq_lowpri = ctd->td_priority;
539	else
540		tdq->tdq_lowpri = td->td_priority;
541}
542
543#ifdef SMP
544struct cpu_search {
545	cpumask_t cs_mask;	/* Mask of valid cpus. */
546	u_int	cs_load;
547	u_int	cs_cpu;
548	int	cs_limit;	/* Min priority for low, min load for high. */
549};
550
551#define	CPU_SEARCH_LOWEST	0x1
552#define	CPU_SEARCH_HIGHEST	0x2
553#define	CPU_SEARCH_BOTH		(CPU_SEARCH_LOWEST|CPU_SEARCH_HIGHEST)
554
555#define	CPUMASK_FOREACH(cpu, mask)				\
556	for ((cpu) = 0; (cpu) < sizeof((mask)) * 8; (cpu)++)	\
557		if ((mask) & 1 << (cpu))
558
559static __inline int cpu_search(struct cpu_group *cg, struct cpu_search *low,
560    struct cpu_search *high, const int match);
561int cpu_search_lowest(struct cpu_group *cg, struct cpu_search *low);
562int cpu_search_highest(struct cpu_group *cg, struct cpu_search *high);
563int cpu_search_both(struct cpu_group *cg, struct cpu_search *low,
564    struct cpu_search *high);
565
566/*
567 * This routine compares according to the match argument and should be
568 * reduced in actual instantiations via constant propagation and dead code
569 * elimination.
570 */
571static __inline int
572cpu_compare(int cpu, struct cpu_search *low, struct cpu_search *high,
573    const int match)
574{
575	struct tdq *tdq;
576
577	tdq = TDQ_CPU(cpu);
578	if (match & CPU_SEARCH_LOWEST)
579		if (low->cs_mask & (1 << cpu) &&
580		    tdq->tdq_load < low->cs_load &&
581		    tdq->tdq_lowpri > low->cs_limit) {
582			low->cs_cpu = cpu;
583			low->cs_load = tdq->tdq_load;
584		}
585	if (match & CPU_SEARCH_HIGHEST)
586		if (high->cs_mask & (1 << cpu) &&
587		    tdq->tdq_load >= high->cs_limit &&
588		    tdq->tdq_load > high->cs_load &&
589		    tdq->tdq_transferable) {
590			high->cs_cpu = cpu;
591			high->cs_load = tdq->tdq_load;
592		}
593	return (tdq->tdq_load);
594}
595
596/*
597 * Search the tree of cpu_groups for the lowest or highest loaded cpu
598 * according to the match argument.  This routine actually compares the
599 * load on all paths through the tree and finds the least loaded cpu on
600 * the least loaded path, which may differ from the least loaded cpu in
601 * the system.  This balances work among caches and busses.
602 *
603 * This inline is instantiated in three forms below using constants for the
604 * match argument.  It is reduced to the minimum set for each case.  It is
605 * also recursive to the depth of the tree.
606 */
607static __inline int
608cpu_search(struct cpu_group *cg, struct cpu_search *low,
609    struct cpu_search *high, const int match)
610{
611	int total;
612
613	total = 0;
614	if (cg->cg_children) {
615		struct cpu_search lgroup;
616		struct cpu_search hgroup;
617		struct cpu_group *child;
618		u_int lload;
619		int hload;
620		int load;
621		int i;
622
623		lload = -1;
624		hload = -1;
625		for (i = 0; i < cg->cg_children; i++) {
626			child = &cg->cg_child[i];
627			if (match & CPU_SEARCH_LOWEST) {
628				lgroup = *low;
629				lgroup.cs_load = -1;
630			}
631			if (match & CPU_SEARCH_HIGHEST) {
632				hgroup = *high;
633				hgroup.cs_load = 0;
634			}
635			switch (match) {
636			case CPU_SEARCH_LOWEST:
637				load = cpu_search_lowest(child, &lgroup);
638				break;
639			case CPU_SEARCH_HIGHEST:
640				load = cpu_search_highest(child, &hgroup);
641				break;
642			case CPU_SEARCH_BOTH:
643				load = cpu_search_both(child, &lgroup, &hgroup);
644				break;
645			}
646			total += load;
647			if (match & CPU_SEARCH_LOWEST)
648				if (load < lload || low->cs_cpu == -1) {
649					*low = lgroup;
650					lload = load;
651				}
652			if (match & CPU_SEARCH_HIGHEST)
653				if (load > hload || high->cs_cpu == -1) {
654					hload = load;
655					*high = hgroup;
656				}
657		}
658	} else {
659		int cpu;
660
661		CPUMASK_FOREACH(cpu, cg->cg_mask)
662			total += cpu_compare(cpu, low, high, match);
663	}
664	return (total);
665}
666
667/*
668 * cpu_search instantiations must pass constants to maintain the inline
669 * optimization.
670 */
671int
672cpu_search_lowest(struct cpu_group *cg, struct cpu_search *low)
673{
674	return cpu_search(cg, low, NULL, CPU_SEARCH_LOWEST);
675}
676
677int
678cpu_search_highest(struct cpu_group *cg, struct cpu_search *high)
679{
680	return cpu_search(cg, NULL, high, CPU_SEARCH_HIGHEST);
681}
682
683int
684cpu_search_both(struct cpu_group *cg, struct cpu_search *low,
685    struct cpu_search *high)
686{
687	return cpu_search(cg, low, high, CPU_SEARCH_BOTH);
688}
689
690/*
691 * Find the cpu with the least load via the least loaded path that has a
692 * lowpri greater than pri.  A pri of -1 indicates any priority is
693 * acceptable.
694 */
695static inline int
696sched_lowest(struct cpu_group *cg, cpumask_t mask, int pri)
697{
698	struct cpu_search low;
699
700	low.cs_cpu = -1;
701	low.cs_load = -1;
702	low.cs_mask = mask;
703	low.cs_limit = pri;
704	cpu_search_lowest(cg, &low);
705	return low.cs_cpu;
706}
707
708/*
709 * Find the cpu with the highest load via the highest loaded path.
710 */
711static inline int
712sched_highest(struct cpu_group *cg, cpumask_t mask, int minload)
713{
714	struct cpu_search high;
715
716	high.cs_cpu = -1;
717	high.cs_load = 0;
718	high.cs_mask = mask;
719	high.cs_limit = minload;
720	cpu_search_highest(cg, &high);
721	return high.cs_cpu;
722}
723
724/*
725 * Simultaneously find the highest and lowest loaded cpu reachable via
726 * cg.
727 */
728static inline void
729sched_both(struct cpu_group *cg, cpumask_t mask, int *lowcpu, int *highcpu)
730{
731	struct cpu_search high;
732	struct cpu_search low;
733
734	low.cs_cpu = -1;
735	low.cs_limit = -1;
736	low.cs_load = -1;
737	low.cs_mask = mask;
738	high.cs_load = 0;
739	high.cs_cpu = -1;
740	high.cs_limit = -1;
741	high.cs_mask = mask;
742	cpu_search_both(cg, &low, &high);
743	*lowcpu = low.cs_cpu;
744	*highcpu = high.cs_cpu;
745	return;
746}
747
748static void
749sched_balance_group(struct cpu_group *cg)
750{
751	cpumask_t mask;
752	int high;
753	int low;
754	int i;
755
756	mask = -1;
757	for (;;) {
758		sched_both(cg, mask, &low, &high);
759		if (low == high || low == -1 || high == -1)
760			break;
761		if (sched_balance_pair(TDQ_CPU(high), TDQ_CPU(low)))
762			break;
763		/*
764		 * If we failed to move any threads determine which cpu
765		 * to kick out of the set and try again.
766	 	 */
767		if (TDQ_CPU(high)->tdq_transferable == 0)
768			mask &= ~(1 << high);
769		else
770			mask &= ~(1 << low);
771	}
772
773	for (i = 0; i < cg->cg_children; i++)
774		sched_balance_group(&cg->cg_child[i]);
775}
776
777static void
778sched_balance(void)
779{
780	struct tdq *tdq;
781
782	/*
783	 * Select a random time between .5 * balance_interval and
784	 * 1.5 * balance_interval.
785	 */
786	balance_ticks = max(balance_interval / 2, 1);
787	balance_ticks += random() % balance_interval;
788	if (smp_started == 0 || rebalance == 0)
789		return;
790	tdq = TDQ_SELF();
791	TDQ_UNLOCK(tdq);
792	sched_balance_group(cpu_top);
793	TDQ_LOCK(tdq);
794}
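
/*
 * Example: with balance_interval equal to realstathz (say 128), balance_ticks
 * is drawn from [64, 191] stathz ticks, so rebalancing recurs at a randomized
 * point roughly every half second to a second and a half.
 */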
795
796/*
797 * Lock two thread queues using their address to maintain lock order.
798 */
799static void
800tdq_lock_pair(struct tdq *one, struct tdq *two)
801{
802	if (one < two) {
803		TDQ_LOCK(one);
804		TDQ_LOCK_FLAGS(two, MTX_DUPOK);
805	} else {
806		TDQ_LOCK(two);
807		TDQ_LOCK_FLAGS(one, MTX_DUPOK);
808	}
809}
810
811/*
812 * Unlock two thread queues.  Order is not important here.
813 */
814static void
815tdq_unlock_pair(struct tdq *one, struct tdq *two)
816{
817	TDQ_UNLOCK(one);
818	TDQ_UNLOCK(two);
819}
820
821/*
822 * Transfer load between two imbalanced thread queues.
823 */
824static int
825sched_balance_pair(struct tdq *high, struct tdq *low)
826{
827	int transferable;
828	int high_load;
829	int low_load;
830	int moved;
831	int move;
832	int diff;
833	int i;
834
835	tdq_lock_pair(high, low);
836	transferable = high->tdq_transferable;
837	high_load = high->tdq_load;
838	low_load = low->tdq_load;
839	moved = 0;
840	/*
841	 * Determine what the imbalance is and then adjust that to how many
842	 * threads we actually have to give up (transferable).
843	 */
844	if (transferable != 0) {
845		diff = high_load - low_load;
846		move = diff / 2;
847		if (diff & 0x1)
848			move++;
849		move = min(move, transferable);
850		for (i = 0; i < move; i++)
851			moved += tdq_move(high, low);
852		/*
853		 * IPI the target cpu to force it to reschedule with the new
854		 * workload.
855		 */
856		ipi_selected(1 << TDQ_ID(low), IPI_PREEMPT);
857	}
858	tdq_unlock_pair(high, low);
859	return (moved);
860}
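
/*
 * Example: if the high queue has a load of 5 and the low queue a load of 2,
 * the imbalance is 3, so up to two threads (the difference halved and
 * rounded up) are moved, limited by the transferable count.
 */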
861
862/*
863 * Move a thread from one thread queue to another.
864 */
865static int
866tdq_move(struct tdq *from, struct tdq *to)
867{
868	struct td_sched *ts;
869	struct thread *td;
870	struct tdq *tdq;
871	int cpu;
872
873	TDQ_LOCK_ASSERT(from, MA_OWNED);
874	TDQ_LOCK_ASSERT(to, MA_OWNED);
875
876	tdq = from;
877	cpu = TDQ_ID(to);
878	td = tdq_steal(tdq, cpu);
879	if (td == NULL)
880		return (0);
881	ts = td->td_sched;
882	/*
883	 * Although the run queue is locked the thread may be blocked.  Lock
884	 * it to clear this and acquire the run-queue lock.
885	 */
886	thread_lock(td);
887	/* Drop recursive lock on from acquired via thread_lock(). */
888	TDQ_UNLOCK(from);
889	sched_rem(td);
890	ts->ts_cpu = cpu;
891	td->td_lock = TDQ_LOCKPTR(to);
892	tdq_add(to, td, SRQ_YIELDING);
893	return (1);
894}
895
896/*
897 * This tdq has idled.  Try to steal a thread from another cpu and switch
898 * to it.
899 */
900static int
901tdq_idled(struct tdq *tdq)
902{
903	struct cpu_group *cg;
904	struct tdq *steal;
905	cpumask_t mask;
906	int thresh;
907	int cpu;
908
909	if (smp_started == 0 || steal_idle == 0)
910		return (1);
911	mask = -1;
912	mask &= ~PCPU_GET(cpumask);
913	/* We don't want to be preempted while we're iterating. */
914	spinlock_enter();
915	for (cg = tdq->tdq_cg; cg != NULL; ) {
916		if ((cg->cg_flags & (CG_FLAG_HTT | CG_FLAG_THREAD)) == 0)
917			thresh = steal_thresh;
918		else
919			thresh = 1;
920		cpu = sched_highest(cg, mask, thresh);
921		if (cpu == -1) {
922			cg = cg->cg_parent;
923			continue;
924		}
925		steal = TDQ_CPU(cpu);
926		mask &= ~(1 << cpu);
927		tdq_lock_pair(tdq, steal);
928		if (steal->tdq_load < thresh || steal->tdq_transferable == 0) {
929			tdq_unlock_pair(tdq, steal);
930			continue;
931		}
932		/*
933		 * If a thread was added while interrupts were disabled don't
934		 * steal one here.  If we fail to acquire one due to affinity
935		 * restrictions loop again with this cpu removed from the
936		 * set.
937		 */
938		if (tdq->tdq_load == 0 && tdq_move(steal, tdq) == 0) {
939			tdq_unlock_pair(tdq, steal);
940			continue;
941		}
942		spinlock_exit();
943		TDQ_UNLOCK(steal);
944		mi_switch(SW_VOL | SWT_IDLE, NULL);
945		thread_unlock(curthread);
946
947		return (0);
948	}
949	spinlock_exit();
950	return (1);
951}
952
953/*
954 * Notify a remote cpu of new work.  Sends an IPI if criteria are met.
955 */
956static void
957tdq_notify(struct tdq *tdq, struct thread *td)
958{
959	struct thread *ctd;
960	int pri;
961	int cpu;
962
963	if (tdq->tdq_ipipending)
964		return;
965	cpu = td->td_sched->ts_cpu;
966	pri = td->td_priority;
967	ctd = pcpu_find(cpu)->pc_curthread;
968	if (!sched_shouldpreempt(pri, ctd->td_priority, 1))
969		return;
970	if (TD_IS_IDLETHREAD(ctd)) {
971		/*
972		 * If the idle thread is still 'running' it's probably
973		 * waiting on us to release the tdq spinlock already.  No
974		 * need to ipi.
975		 */
976		if (tdq->tdq_idlestate == TDQ_RUNNING)
977			return;
978		/*
979		 * If the MD code has an idle wakeup routine try that before
980		 * falling back to IPI.
981		 */
982		if (cpu_idle_wakeup(cpu))
983			return;
984	}
985	tdq->tdq_ipipending = 1;
986	ipi_selected(1 << cpu, IPI_PREEMPT);
987}
988
989/*
990 * Steals load from a timeshare queue.  Honors the rotating queue head
991 * index.
992 */
993static struct thread *
994runq_steal_from(struct runq *rq, int cpu, u_char start)
995{
996	struct rqbits *rqb;
997	struct rqhead *rqh;
998	struct thread *td;
999	int first;
1000	int bit;
1001	int pri;
1002	int i;
1003
1004	rqb = &rq->rq_status;
1005	bit = start & (RQB_BPW -1);
1006	pri = 0;
1007	first = 0;
1008again:
1009	for (i = RQB_WORD(start); i < RQB_LEN; bit = 0, i++) {
1010		if (rqb->rqb_bits[i] == 0)
1011			continue;
1012		if (bit != 0) {
1013			for (pri = bit; pri < RQB_BPW; pri++)
1014				if (rqb->rqb_bits[i] & (1ul << pri))
1015					break;
1016			if (pri >= RQB_BPW)
1017				continue;
1018		} else
1019			pri = RQB_FFS(rqb->rqb_bits[i]);
1020		pri += (i << RQB_L2BPW);
1021		rqh = &rq->rq_queues[pri];
1022		TAILQ_FOREACH(td, rqh, td_runq) {
1023			if (first && THREAD_CAN_MIGRATE(td) &&
1024			    THREAD_CAN_SCHED(td, cpu))
1025				return (td);
1026			first = 1;
1027		}
1028	}
1029	if (start != 0) {
1030		start = 0;
1031		goto again;
1032	}
1033
1034	return (NULL);
1035}
1036
1037/*
1038 * Steals load from a standard linear queue.
1039 */
1040static struct thread *
1041runq_steal(struct runq *rq, int cpu)
1042{
1043	struct rqhead *rqh;
1044	struct rqbits *rqb;
1045	struct thread *td;
1046	int word;
1047	int bit;
1048
1049	rqb = &rq->rq_status;
1050	for (word = 0; word < RQB_LEN; word++) {
1051		if (rqb->rqb_bits[word] == 0)
1052			continue;
1053		for (bit = 0; bit < RQB_BPW; bit++) {
1054			if ((rqb->rqb_bits[word] & (1ul << bit)) == 0)
1055				continue;
1056			rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)];
1057			TAILQ_FOREACH(td, rqh, td_runq)
1058				if (THREAD_CAN_MIGRATE(td) &&
1059				    THREAD_CAN_SCHED(td, cpu))
1060					return (td);
1061		}
1062	}
1063	return (NULL);
1064}
1065
1066/*
1067 * Attempt to steal a thread in priority order from a thread queue.
1068 */
1069static struct thread *
1070tdq_steal(struct tdq *tdq, int cpu)
1071{
1072	struct thread *td;
1073
1074	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
1075	if ((td = runq_steal(&tdq->tdq_realtime, cpu)) != NULL)
1076		return (td);
1077	if ((td = runq_steal_from(&tdq->tdq_timeshare,
1078	    cpu, tdq->tdq_ridx)) != NULL)
1079		return (td);
1080	return (runq_steal(&tdq->tdq_idle, cpu));
1081}
1082
1083/*
1084 * Sets the thread lock and ts_cpu to match the requested cpu.  Unlocks the
1085 * current lock and returns with the assigned queue locked.
1086 */
1087static inline struct tdq *
1088sched_setcpu(struct thread *td, int cpu, int flags)
1089{
1090
1091	struct tdq *tdq;
1092
1093	THREAD_LOCK_ASSERT(td, MA_OWNED);
1094	tdq = TDQ_CPU(cpu);
1095	td->td_sched->ts_cpu = cpu;
1096	/*
1097	 * If the lock matches just return the queue.
1098	 */
1099	if (td->td_lock == TDQ_LOCKPTR(tdq))
1100		return (tdq);
1101#ifdef notyet
1102	/*
1103	 * If the thread isn't running its lockptr is a
1104	 * turnstile or a sleepqueue.  We can just lock_set without
1105	 * blocking.
1106	 */
1107	if (TD_CAN_RUN(td)) {
1108		TDQ_LOCK(tdq);
1109		thread_lock_set(td, TDQ_LOCKPTR(tdq));
1110		return (tdq);
1111	}
1112#endif
1113	/*
1114	 * The hard case is migration; we need to block the thread first to
1115	 * prevent order reversals with other cpus' locks.
1116	 */
1117	thread_lock_block(td);
1118	TDQ_LOCK(tdq);
1119	thread_lock_unblock(td, TDQ_LOCKPTR(tdq));
1120	return (tdq);
1121}
1122
1123SCHED_STAT_DEFINE(pickcpu_intrbind, "Soft interrupt binding");
1124SCHED_STAT_DEFINE(pickcpu_idle_affinity, "Picked idle cpu based on affinity");
1125SCHED_STAT_DEFINE(pickcpu_affinity, "Picked cpu based on affinity");
1126SCHED_STAT_DEFINE(pickcpu_lowest, "Selected lowest load");
1127SCHED_STAT_DEFINE(pickcpu_local, "Migrated to current cpu");
1128SCHED_STAT_DEFINE(pickcpu_migration, "Selection may have caused migration");
1129
1130static int
1131sched_pickcpu(struct thread *td, int flags)
1132{
1133	struct cpu_group *cg;
1134	struct td_sched *ts;
1135	struct tdq *tdq;
1136	cpumask_t mask;
1137	int self;
1138	int pri;
1139	int cpu;
1140
1141	self = PCPU_GET(cpuid);
1142	ts = td->td_sched;
1143	if (smp_started == 0)
1144		return (self);
1145	/*
1146	 * Don't migrate a running thread from sched_switch().
1147	 */
1148	if ((flags & SRQ_OURSELF) || !THREAD_CAN_MIGRATE(td))
1149		return (ts->ts_cpu);
1150	/*
1151	 * Prefer to run interrupt threads on the processors that generate
1152	 * the interrupt.
1153	 */
1154	if (td->td_priority <= PRI_MAX_ITHD && THREAD_CAN_SCHED(td, self) &&
1155	    curthread->td_intr_nesting_level && ts->ts_cpu != self) {
1156		SCHED_STAT_INC(pickcpu_intrbind);
1157		ts->ts_cpu = self;
1158	}
1159	 * If the thread can run on the last cpu and either the affinity has not
1160	 * expired or that cpu is idle, run it there.
1161	 * expired or it is idle run it there.
1162	 */
1163	pri = td->td_priority;
1164	tdq = TDQ_CPU(ts->ts_cpu);
1165	if (THREAD_CAN_SCHED(td, ts->ts_cpu)) {
1166		if (tdq->tdq_lowpri > PRI_MIN_IDLE) {
1167			SCHED_STAT_INC(pickcpu_idle_affinity);
1168			return (ts->ts_cpu);
1169		}
1170		if (SCHED_AFFINITY(ts, CG_SHARE_L2) && tdq->tdq_lowpri > pri) {
1171			SCHED_STAT_INC(pickcpu_affinity);
1172			return (ts->ts_cpu);
1173		}
1174	}
1175	/*
1176	 * Search for the highest level in the tree that still has affinity.
1177	 */
1178	cg = NULL;
1179	for (cg = tdq->tdq_cg; cg != NULL; cg = cg->cg_parent)
1180		if (SCHED_AFFINITY(ts, cg->cg_level))
1181			break;
1182	cpu = -1;
1183	mask = td->td_cpuset->cs_mask.__bits[0];
1184	if (cg)
1185		cpu = sched_lowest(cg, mask, pri);
1186	if (cpu == -1)
1187		cpu = sched_lowest(cpu_top, mask, -1);
1188	/*
1189	 * Compare the lowest loaded cpu to current cpu.
1190	 */
1191	if (THREAD_CAN_SCHED(td, self) && TDQ_CPU(self)->tdq_lowpri > pri &&
1192	    TDQ_CPU(cpu)->tdq_lowpri < PRI_MIN_IDLE) {
1193		SCHED_STAT_INC(pickcpu_local);
1194		cpu = self;
1195	} else
1196		SCHED_STAT_INC(pickcpu_lowest);
1197	if (cpu != ts->ts_cpu)
1198		SCHED_STAT_INC(pickcpu_migration);
1199	KASSERT(cpu != -1, ("sched_pickcpu: Failed to find a cpu."));
1200	return (cpu);
1201}
1202#endif
1203
1204/*
1205 * Pick the highest priority task we have and return it.
1206 */
1207static struct thread *
1208tdq_choose(struct tdq *tdq)
1209{
1210	struct thread *td;
1211
1212	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
1213	td = runq_choose(&tdq->tdq_realtime);
1214	if (td != NULL)
1215		return (td);
1216	td = runq_choose_from(&tdq->tdq_timeshare, tdq->tdq_ridx);
1217	if (td != NULL) {
1218		KASSERT(td->td_priority >= PRI_MIN_TIMESHARE,
1219		    ("tdq_choose: Invalid priority on timeshare queue %d",
1220		    td->td_priority));
1221		return (td);
1222	}
1223	td = runq_choose(&tdq->tdq_idle);
1224	if (td != NULL) {
1225		KASSERT(td->td_priority >= PRI_MIN_IDLE,
1226		    ("tdq_choose: Invalid priority on idle queue %d",
1227		    td->td_priority));
1228		return (td);
1229	}
1230
1231	return (NULL);
1232}
1233
1234/*
1235 * Initialize a thread queue.
1236 */
1237static void
1238tdq_setup(struct tdq *tdq)
1239{
1240
1241	if (bootverbose)
1242		printf("ULE: setup cpu %d\n", TDQ_ID(tdq));
1243	runq_init(&tdq->tdq_realtime);
1244	runq_init(&tdq->tdq_timeshare);
1245	runq_init(&tdq->tdq_idle);
1246	snprintf(tdq->tdq_name, sizeof(tdq->tdq_name),
1247	    "sched lock %d", (int)TDQ_ID(tdq));
1248	mtx_init(&tdq->tdq_lock, tdq->tdq_name, "sched lock",
1249	    MTX_SPIN | MTX_RECURSE);
1250#ifdef KTR
1251	snprintf(tdq->tdq_loadname, sizeof(tdq->tdq_loadname),
1252	    "CPU %d load", (int)TDQ_ID(tdq));
1253#endif
1254}
1255
1256#ifdef SMP
1257static void
1258sched_setup_smp(void)
1259{
1260	struct tdq *tdq;
1261	int i;
1262
1263	cpu_top = smp_topo();
1264	for (i = 0; i < MAXCPU; i++) {
1265		if (CPU_ABSENT(i))
1266			continue;
1267		tdq = TDQ_CPU(i);
1268		tdq_setup(tdq);
1269		tdq->tdq_cg = smp_topo_find(cpu_top, i);
1270		if (tdq->tdq_cg == NULL)
1271			panic("Can't find cpu group for %d\n", i);
1272	}
1273	balance_tdq = TDQ_SELF();
1274	sched_balance();
1275}
1276#endif
1277
1278/*
1279 * Setup the thread queues and initialize the topology based on MD
1280 * information.
1281 */
1282static void
1283sched_setup(void *dummy)
1284{
1285	struct tdq *tdq;
1286
1287	tdq = TDQ_SELF();
1288#ifdef SMP
1289	sched_setup_smp();
1290#else
1291	tdq_setup(tdq);
1292#endif
1293	/*
1294	 * To avoid divide-by-zero, we set realstathz to a dummy value
1295	 * in case sched_clock() is called before sched_initticks().
1296	 */
1297	realstathz = hz;
1298	sched_slice = (realstathz/10);	/* ~100ms */
1299	tickincr = 1 << SCHED_TICK_SHIFT;
1300
1301	/* Add thread0's load since it's running. */
1302	TDQ_LOCK(tdq);
1303	thread0.td_lock = TDQ_LOCKPTR(TDQ_SELF());
1304	tdq_load_add(tdq, &thread0);
1305	tdq->tdq_lowpri = thread0.td_priority;
1306	TDQ_UNLOCK(tdq);
1307}
1308
1309/*
1310 * This routine determines the tickincr after stathz and hz are setup.
1311 */
1312/* ARGSUSED */
1313static void
1314sched_initticks(void *dummy)
1315{
1316	int incr;
1317
1318	realstathz = stathz ? stathz : hz;
1319	sched_slice = (realstathz/10);	/* ~100ms */
1320
1321	/*
1322	 * tickincr is shifted out by 10 to avoid rounding errors due to
1323	 * hz not being evenly divisible by stathz on all platforms.
1324	 */
1325	incr = (hz << SCHED_TICK_SHIFT) / realstathz;
1326	/*
1327	 * This does not work for values of stathz that are more than
1328	 * 1 << SCHED_TICK_SHIFT * hz.  In practice this does not happen.
1329	 */
1330	if (incr == 0)
1331		incr = 1;
1332	tickincr = incr;
1333#ifdef SMP
1334	/*
1335	 * Set the default balance interval now that we know
1336	 * what realstathz is.
1337	 */
1338	balance_interval = realstathz;
1339	/*
1340	 * Set steal thresh to log2(mp_ncpu) but no greater than 4.  This
1341	 * Set steal_thresh to roughly log2(mp_ncpus) but no greater than 3.  This
1342	 * smaller machines.
1343	 */
1344	steal_thresh = min(ffs(mp_ncpus) - 1, 3);
1345	affinity = SCHED_AFFINITY_DEFAULT;
1346#endif
1347}
1348
1349
1350/*
1351 * This is the core of the interactivity algorithm.  Determines a score based
1352 * on past behavior.  It is the ratio of sleep time to run time scaled to
1353 * a [0, 100] integer.  This is the voluntary sleep time of a process, which
1354 * differs from the cpu usage because it does not account for time spent
1355 * waiting on a run-queue.  Would be prettier if we had floating point.
1356 */
1357static int
1358sched_interact_score(struct thread *td)
1359{
1360	struct td_sched *ts;
1361	int div;
1362
1363	ts = td->td_sched;
1364	/*
1365	 * The score is only needed if this is likely to be an interactive
1366	 * task.  Don't go through the expense of computing it if there's
1367	 * no chance.
1368	 */
1369	if (sched_interact <= SCHED_INTERACT_HALF &&
1370		ts->ts_runtime >= ts->ts_slptime)
1371			return (SCHED_INTERACT_HALF);
1372
1373	if (ts->ts_runtime > ts->ts_slptime) {
1374		div = max(1, ts->ts_runtime / SCHED_INTERACT_HALF);
1375		return (SCHED_INTERACT_HALF +
1376		    (SCHED_INTERACT_HALF - (ts->ts_slptime / div)));
1377	}
1378	if (ts->ts_slptime > ts->ts_runtime) {
1379		div = max(1, ts->ts_slptime / SCHED_INTERACT_HALF);
1380		return (ts->ts_runtime / div);
1381	}
1382	/* runtime == slptime */
1383	if (ts->ts_runtime)
1384		return (SCHED_INTERACT_HALF);
1385
1386	/*
1387	 * This can happen if slptime and runtime are 0.
1388	 */
1389	return (0);
1390
1391}
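
/*
 * Worked example: a thread that has slept for three seconds and run for one
 * second (in the scaled tick units kept in ts_slptime and ts_runtime) takes
 * the slptime > runtime branch: div = slptime / 50 and the score is
 * runtime / div, roughly 50 * 1 / 3 = 16, which is below the default
 * SCHED_INTERACT_THRESH of 30 and therefore counts as interactive.
 */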
1392
1393/*
1394 * Scale the scheduling priority according to the "interactivity" of this
1395 * process.
1396 */
1397static void
1398sched_priority(struct thread *td)
1399{
1400	int score;
1401	int pri;
1402
1403	if (td->td_pri_class != PRI_TIMESHARE)
1404		return;
1405	/*
1406	 * If the score is interactive we place the thread in the realtime
1407	 * queue with a priority that is less than kernel and interrupt
1408	 * priorities.  These threads are not subject to nice restrictions.
1409	 *
1410	 * Scores greater than this are placed on the normal timeshare queue
1411	 * where the priority is partially decided by the most recent cpu
1412	 * utilization and the rest is decided by nice value.
1413	 *
1414	 * The nice value of the process has a linear effect on the calculated
1415	 * score.  Negative nice values make it easier for a thread to be
1416	 * considered interactive.
1417	 */
1418	score = imax(0, sched_interact_score(td) - td->td_proc->p_nice);
1419	if (score < sched_interact) {
1420		pri = PRI_MIN_REALTIME;
1421		pri += ((PRI_MAX_REALTIME - PRI_MIN_REALTIME) / sched_interact)
1422		    * score;
1423		KASSERT(pri >= PRI_MIN_REALTIME && pri <= PRI_MAX_REALTIME,
1424		    ("sched_priority: invalid interactive priority %d score %d",
1425		    pri, score));
1426	} else {
1427		pri = SCHED_PRI_MIN;
1428		if (td->td_sched->ts_ticks)
1429			pri += SCHED_PRI_TICKS(td->td_sched);
1430		pri += SCHED_PRI_NICE(td->td_proc->p_nice);
1431		KASSERT(pri >= PRI_MIN_TIMESHARE && pri <= PRI_MAX_TIMESHARE,
1432		    ("sched_priority: invalid priority %d: nice %d, "
1433		    "ticks %d ftick %d ltick %d tick pri %d",
1434		    pri, td->td_proc->p_nice, td->td_sched->ts_ticks,
1435		    td->td_sched->ts_ftick, td->td_sched->ts_ltick,
1436		    SCHED_PRI_TICKS(td->td_sched)));
1437	}
1438	sched_user_prio(td, pri);
1439
1440	return;
1441}
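
/*
 * In other words, an interactive score of 0 maps to PRI_MIN_REALTIME and a
 * score just under sched_interact maps close to PRI_MAX_REALTIME, while
 * non-interactive threads start at SCHED_PRI_MIN and drift toward
 * SCHED_PRI_MAX as their recent cpu usage and nice value grow.
 */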
1442
1443/*
1444 * This routine enforces a maximum limit on the amount of scheduling history
1445 * kept.  It is called after either the slptime or runtime is adjusted.  This
1446 * function is ugly due to integer math.
1447 */
1448static void
1449sched_interact_update(struct thread *td)
1450{
1451	struct td_sched *ts;
1452	u_int sum;
1453
1454	ts = td->td_sched;
1455	sum = ts->ts_runtime + ts->ts_slptime;
1456	if (sum < SCHED_SLP_RUN_MAX)
1457		return;
1458	/*
1459	 * This only happens from two places:
1460	 * 1) We have added an unusual amount of run time from fork_exit.
1461	 * 2) We have added an unusual amount of sleep time from sched_sleep().
1462	 */
1463	if (sum > SCHED_SLP_RUN_MAX * 2) {
1464		if (ts->ts_runtime > ts->ts_slptime) {
1465			ts->ts_runtime = SCHED_SLP_RUN_MAX;
1466			ts->ts_slptime = 1;
1467		} else {
1468			ts->ts_slptime = SCHED_SLP_RUN_MAX;
1469			ts->ts_runtime = 1;
1470		}
1471		return;
1472	}
1473	/*
1474	 * If we have exceeded by more than 1/5th then the algorithm below
1475	 * will not bring us back into range.  Dividing by two here forces
1476	 * us into the range of [4/5 * SCHED_SLP_RUN_MAX, SCHED_SLP_RUN_MAX]
1477	 */
1478	if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) {
1479		ts->ts_runtime /= 2;
1480		ts->ts_slptime /= 2;
1481		return;
1482	}
1483	ts->ts_runtime = (ts->ts_runtime / 5) * 4;
1484	ts->ts_slptime = (ts->ts_slptime / 5) * 4;
1485}
1486
1487/*
1488 * Scale back the interactivity history when a child thread is created.  The
1489 * history is inherited from the parent but the thread may behave totally
1490 * differently.  For example, a shell spawning a compiler process.  We want
1491 * to learn that the compiler is behaving badly very quickly.
1492 */
1493static void
1494sched_interact_fork(struct thread *td)
1495{
1496	int ratio;
1497	int sum;
1498
1499	sum = td->td_sched->ts_runtime + td->td_sched->ts_slptime;
1500	if (sum > SCHED_SLP_RUN_FORK) {
1501		ratio = sum / SCHED_SLP_RUN_FORK;
1502		td->td_sched->ts_runtime /= ratio;
1503		td->td_sched->ts_slptime /= ratio;
1504	}
1505}
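
/*
 * Example: a parent carrying two seconds of combined history is scaled down
 * by a ratio of 4 against the half-second SCHED_SLP_RUN_FORK limit, so the
 * child starts with a quarter of the parent's runtime and sleep time and
 * quickly establishes its own behavior.
 */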
1506
1507/*
1508 * Called from proc0_init() to setup the scheduler fields.
1509 */
1510void
1511schedinit(void)
1512{
1513
1514	/*
1515	 * Set up the scheduler specific parts of proc0.
1516	 */
1517	proc0.p_sched = NULL; /* XXX */
1518	thread0.td_sched = &td_sched0;
1519	td_sched0.ts_ltick = ticks;
1520	td_sched0.ts_ftick = ticks;
1521	td_sched0.ts_slice = sched_slice;
1522}
1523
1524/*
1525 * This is only somewhat accurate since, given many processes of the same
1526 * priority, they will switch when their slices run out, which will be
1527 * at most sched_slice stathz ticks.
1528 */
1529int
1530sched_rr_interval(void)
1531{
1532
1533	/* Convert sched_slice to hz */
1534	return (hz/(realstathz/sched_slice));
1535}
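
/*
 * Example: with hz = 1000 and realstathz = 128, sched_slice is 12 stathz
 * ticks, so this returns 1000 / (128 / 12) = 100, i.e. a round-robin
 * granularity of roughly 100ms.
 */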
1536
1537/*
1538 * Update the percent cpu tracking information when it is requested or
1539 * the total history exceeds the maximum.  We keep a sliding history of
1540 * tick counts that slowly decays.  This is less precise than the 4BSD
1541 * mechanism since it happens with less regular and frequent events.
1542 */
1543static void
1544sched_pctcpu_update(struct td_sched *ts)
1545{
1546
1547	if (ts->ts_ticks == 0)
1548		return;
1549	if (ticks - (hz / 10) < ts->ts_ltick &&
1550	    SCHED_TICK_TOTAL(ts) < SCHED_TICK_MAX)
1551		return;
1552	/*
1553	 * Adjust counters and watermark for pctcpu calc.
1554	 */
1555	if (ts->ts_ltick > ticks - SCHED_TICK_TARG)
1556		ts->ts_ticks = (ts->ts_ticks / (ticks - ts->ts_ftick)) *
1557			    SCHED_TICK_TARG;
1558	else
1559		ts->ts_ticks = 0;
1560	ts->ts_ltick = ticks;
1561	ts->ts_ftick = ts->ts_ltick - SCHED_TICK_TARG;
1562}
1563
1564/*
1565 * Adjust the priority of a thread.  Move it to the appropriate run-queue
1566 * if necessary.  This is the back-end for several priority related
1567 * functions.
1568 */
1569static void
1570sched_thread_priority(struct thread *td, u_char prio)
1571{
1572	struct td_sched *ts;
1573	struct tdq *tdq;
1574	int oldpri;
1575
1576	KTR_POINT3(KTR_SCHED, "thread", sched_tdname(td), "prio",
1577	    "prio:%d", td->td_priority, "new prio:%d", prio,
1578	    KTR_ATTR_LINKED, sched_tdname(curthread));
1579	if (td != curthread && prio > td->td_priority) {
1580		KTR_POINT3(KTR_SCHED, "thread", sched_tdname(curthread),
1581		    "lend prio", "prio:%d", td->td_priority, "new prio:%d",
1582		    prio, KTR_ATTR_LINKED, sched_tdname(td));
1583	}
1584	ts = td->td_sched;
1585	THREAD_LOCK_ASSERT(td, MA_OWNED);
1586	if (td->td_priority == prio)
1587		return;
1588	/*
1589	 * If the priority has been elevated due to priority
1590	 * propagation, we may have to move ourselves to a new
1591	 * queue.  This could be optimized to not re-add in some
1592	 * cases.
1593	 */
1594	if (TD_ON_RUNQ(td) && prio < td->td_priority) {
1595		sched_rem(td);
1596		td->td_priority = prio;
1597		sched_add(td, SRQ_BORROWING);
1598		return;
1599	}
1600	/*
1601	 * If the thread is currently running we may have to adjust the lowpri
1602	 * information so other cpus are aware of our current priority.
1603	 */
1604	if (TD_IS_RUNNING(td)) {
1605		tdq = TDQ_CPU(ts->ts_cpu);
1606		oldpri = td->td_priority;
1607		td->td_priority = prio;
1608		if (prio < tdq->tdq_lowpri)
1609			tdq->tdq_lowpri = prio;
1610		else if (tdq->tdq_lowpri == oldpri)
1611			tdq_setlowpri(tdq, td);
1612		return;
1613	}
1614	td->td_priority = prio;
1615}
1616
1617/*
1618 * Update a thread's priority when it is lent another thread's
1619 * priority.
1620 */
1621void
1622sched_lend_prio(struct thread *td, u_char prio)
1623{
1624
1625	td->td_flags |= TDF_BORROWING;
1626	sched_thread_priority(td, prio);
1627}
1628
1629/*
1630 * Restore a thread's priority when priority propagation is
1631 * over.  The prio argument is the minimum priority the thread
1632 * needs to have to satisfy other possible priority lending
1633 * requests.  If the thread's regular priority is less
1634 * important than prio, the thread will keep a priority boost
1635 * of prio.
1636 */
1637void
1638sched_unlend_prio(struct thread *td, u_char prio)
1639{
1640	u_char base_pri;
1641
1642	if (td->td_base_pri >= PRI_MIN_TIMESHARE &&
1643	    td->td_base_pri <= PRI_MAX_TIMESHARE)
1644		base_pri = td->td_user_pri;
1645	else
1646		base_pri = td->td_base_pri;
1647	if (prio >= base_pri) {
1648		td->td_flags &= ~TDF_BORROWING;
1649		sched_thread_priority(td, base_pri);
1650	} else
1651		sched_lend_prio(td, prio);
1652}
1653
1654/*
1655 * Standard entry for setting the priority to an absolute value.
1656 */
1657void
1658sched_prio(struct thread *td, u_char prio)
1659{
1660	u_char oldprio;
1661
1662	/* First, update the base priority. */
1663	td->td_base_pri = prio;
1664
1665	/*
1666	 * If the thread is borrowing another thread's priority, don't
1667	 * ever lower the priority.
1668	 */
1669	if (td->td_flags & TDF_BORROWING && td->td_priority < prio)
1670		return;
1671
1672	/* Change the real priority. */
1673	oldprio = td->td_priority;
1674	sched_thread_priority(td, prio);
1675
1676	/*
1677	 * If the thread is on a turnstile, then let the turnstile update
1678	 * its state.
1679	 */
1680	if (TD_ON_LOCK(td) && oldprio != prio)
1681		turnstile_adjust(td, oldprio);
1682}
1683
1684/*
1685 * Set the base user priority; this does not affect the current running priority.
1686 */
1687void
1688sched_user_prio(struct thread *td, u_char prio)
1689{
1690	u_char oldprio;
1691
1692	td->td_base_user_pri = prio;
1693	if (td->td_flags & TDF_UBORROWING && td->td_user_pri <= prio)
1694                return;
1695	oldprio = td->td_user_pri;
1696	td->td_user_pri = prio;
1697}
1698
1699void
1700sched_lend_user_prio(struct thread *td, u_char prio)
1701{
1702	u_char oldprio;
1703
1704	THREAD_LOCK_ASSERT(td, MA_OWNED);
1705	td->td_flags |= TDF_UBORROWING;
1706	oldprio = td->td_user_pri;
1707	td->td_user_pri = prio;
1708}
1709
1710void
1711sched_unlend_user_prio(struct thread *td, u_char prio)
1712{
1713	u_char base_pri;
1714
1715	THREAD_LOCK_ASSERT(td, MA_OWNED);
1716	base_pri = td->td_base_user_pri;
1717	if (prio >= base_pri) {
1718		td->td_flags &= ~TDF_UBORROWING;
1719		sched_user_prio(td, base_pri);
1720	} else {
1721		sched_lend_user_prio(td, prio);
1722	}
1723}
1724
1725/*
1726 * Block a thread for switching.  Similar to thread_block() but does not
1727 * bump the spin count.
1728 */
1729static inline struct mtx *
1730thread_block_switch(struct thread *td)
1731{
1732	struct mtx *lock;
1733
1734	THREAD_LOCK_ASSERT(td, MA_OWNED);
1735	lock = td->td_lock;
1736	td->td_lock = &blocked_lock;
1737	mtx_unlock_spin(lock);
1738
1739	return (lock);
1740}
1741
1742/*
1743 * Handle migration from sched_switch().  This happens only for
1744 * cpu binding.
1745 */
1746static struct mtx *
1747sched_switch_migrate(struct tdq *tdq, struct thread *td, int flags)
1748{
1749	struct tdq *tdn;
1750
1751	tdn = TDQ_CPU(td->td_sched->ts_cpu);
1752#ifdef SMP
1753	tdq_load_rem(tdq, td);
1754	/*
1755	 * Do the lock dance required to avoid LOR.  We grab an extra
1756	 * spinlock nesting to prevent preemption while we're
1757	 * not holding either run-queue lock.
1758	 */
1759	spinlock_enter();
1760	thread_block_switch(td);	/* This releases the lock on tdq. */
1761	TDQ_LOCK(tdn);
1762	tdq_add(tdn, td, flags);
1763	tdq_notify(tdn, td);
1764	/*
1765	 * After we unlock tdn the new cpu still can't switch into this
1766	 * thread until we've unblocked it in cpu_switch().  The lock
1767	 * pointers may match in the case of HTT cores.  Don't unlock here
1768	 * or we can deadlock when the other CPU runs the IPI handler.
1769	 */
1770	if (TDQ_LOCKPTR(tdn) != TDQ_LOCKPTR(tdq)) {
1771		TDQ_UNLOCK(tdn);
1772		TDQ_LOCK(tdq);
1773	}
1774	spinlock_exit();
1775#endif
1776	return (TDQ_LOCKPTR(tdn));
1777}
1778
1779/*
1780 * Release a thread that was blocked with thread_block_switch().
1781 */
1782static inline void
1783thread_unblock_switch(struct thread *td, struct mtx *mtx)
1784{
1785	atomic_store_rel_ptr((volatile uintptr_t *)&td->td_lock,
1786	    (uintptr_t)mtx);
1787}
1788
1789/*
1790 * Switch threads.  This function has to handle threads coming in while
1791 * blocked for some reason, running, or idle.  It also must deal with
1792 * migrating a thread from one queue to another as running threads may
1793 * be assigned elsewhere via binding.
1794 */
1795void
1796sched_switch(struct thread *td, struct thread *newtd, int flags)
1797{
1798	struct tdq *tdq;
1799	struct td_sched *ts;
1800	struct mtx *mtx;
1801	int srqflag;
1802	int cpuid;
1803
1804	THREAD_LOCK_ASSERT(td, MA_OWNED);
1805	KASSERT(newtd == NULL, ("sched_switch: Unsupported newtd argument"));
1806
1807	cpuid = PCPU_GET(cpuid);
1808	tdq = TDQ_CPU(cpuid);
1809	ts = td->td_sched;
1810	mtx = td->td_lock;
1811	ts->ts_rltick = ticks;
1812	td->td_lastcpu = td->td_oncpu;
1813	td->td_oncpu = NOCPU;
1814	td->td_flags &= ~TDF_NEEDRESCHED;
1815	td->td_owepreempt = 0;
1816	tdq->tdq_switchcnt++;
1817	/*
1818	 * The lock pointer in an idle thread should never change.  Reset the
1819	 * thread state to CAN_RUN as well.
1820	 */
1821	if (TD_IS_IDLETHREAD(td)) {
1822		MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
1823		TD_SET_CAN_RUN(td);
1824	} else if (TD_IS_RUNNING(td)) {
1825		MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
1826		srqflag = (flags & SW_PREEMPT) ?
1827		    SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
1828		    SRQ_OURSELF|SRQ_YIELDING;
1829		if (ts->ts_cpu == cpuid)
1830			tdq_runq_add(tdq, td, srqflag);
1831		else
1832			mtx = sched_switch_migrate(tdq, td, srqflag);
1833	} else {
1834		/* This thread must be going to sleep. */
1835		TDQ_LOCK(tdq);
1836		mtx = thread_block_switch(td);
1837		tdq_load_rem(tdq, td);
1838	}
1839	/*
1840	 * We enter here with the thread blocked and assigned to the
1841	 * appropriate cpu run-queue or sleep-queue and with the current
1842	 * thread-queue locked.
1843	 */
1844	TDQ_LOCK_ASSERT(tdq, MA_OWNED | MA_NOTRECURSED);
1845	newtd = choosethread();
1846	/*
1847	 * Call the MD code to switch contexts if necessary.
1848	 */
1849	if (td != newtd) {
1850#ifdef	HWPMC_HOOKS
1851		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
1852			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
1853#endif
1854		lock_profile_release_lock(&TDQ_LOCKPTR(tdq)->lock_object);
1855		TDQ_LOCKPTR(tdq)->mtx_lock = (uintptr_t)newtd;
1856
1857#ifdef KDTRACE_HOOKS
1858		/*
1859		 * If DTrace has set the active vtime enum to anything
1860		 * other than INACTIVE (0), then it should have set the
1861		 * function to call.
1862		 */
1863		if (dtrace_vtime_active)
1864			(*dtrace_vtime_switch_func)(newtd);
1865#endif
1866
1867		cpu_switch(td, newtd, mtx);
1868		/*
1869		 * We may return from cpu_switch on a different cpu.  However,
1870		 * we always return with td_lock pointing to the current cpu's
1871		 * run queue lock.
1872		 */
1873		cpuid = PCPU_GET(cpuid);
1874		tdq = TDQ_CPU(cpuid);
1875		lock_profile_obtain_lock_success(
1876		    &TDQ_LOCKPTR(tdq)->lock_object, 0, 0, __FILE__, __LINE__);
1877#ifdef	HWPMC_HOOKS
1878		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
1879			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
1880#endif
1881	} else
1882		thread_unblock_switch(td, mtx);
1883	/*
1884	 * Assert that all went well and return.
1885	 */
1886	TDQ_LOCK_ASSERT(tdq, MA_OWNED|MA_NOTRECURSED);
1887	MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
1888	td->td_oncpu = cpuid;
1889}
1890
1891/*
1892 * Adjust thread priorities as a result of a nice request.
1893 */
1894void
1895sched_nice(struct proc *p, int nice)
1896{
1897	struct thread *td;
1898
1899	PROC_LOCK_ASSERT(p, MA_OWNED);
1900
1901	p->p_nice = nice;
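	/* Recompute each thread's user priority with the new nice value. */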
1902	FOREACH_THREAD_IN_PROC(p, td) {
1903		thread_lock(td);
1904		sched_priority(td);
1905		sched_prio(td, td->td_base_user_pri);
1906		thread_unlock(td);
1907	}
1908}
1909
1910/*
1911 * Record the sleep time for the interactivity scorer.
1912 */
1913void
1914sched_sleep(struct thread *td, int prio)
1915{
1916
1917	THREAD_LOCK_ASSERT(td, MA_OWNED);
1918
1919	td->td_slptick = ticks;
1920	if (TD_IS_SUSPENDED(td) || prio <= PSOCK)
1921		td->td_flags |= TDF_CANSWAP;
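	/*
	 * A static_boost of 1 means honor the priority requested by the
	 * caller; any other non-zero value is used as the boost priority
	 * for threads sleeping with a numerically larger (worse) priority.
	 */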
1922	if (static_boost == 1 && prio)
1923		sched_prio(td, prio);
1924	else if (static_boost && td->td_priority > static_boost)
1925		sched_prio(td, static_boost);
1926}
1927
1928/*
1929 * Schedule a thread to resume execution and record how long it voluntarily
1930 * slept.  We also update the pctcpu, interactivity, and priority.
1931 */
1932void
1933sched_wakeup(struct thread *td)
1934{
1935	struct td_sched *ts;
1936	int slptick;
1937
1938	THREAD_LOCK_ASSERT(td, MA_OWNED);
1939	ts = td->td_sched;
1940	td->td_flags &= ~TDF_CANSWAP;
1941	/*
1942	 * If we slept for more than a tick, update our interactivity and
1943	 * priority.
1944	 */
1945	slptick = td->td_slptick;
1946	td->td_slptick = 0;
1947	if (slptick && slptick != ticks) {
1948		u_int hzticks;
1949
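		/*
		 * Scale the elapsed ticks into the scheduler's fixed-point
		 * tick units before charging them as voluntary sleep time.
		 */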
1950		hzticks = (ticks - slptick) << SCHED_TICK_SHIFT;
1951		ts->ts_slptime += hzticks;
1952		sched_interact_update(td);
1953		sched_pctcpu_update(ts);
1954	}
1955	/* Reset the slice value after we sleep. */
1956	ts->ts_slice = sched_slice;
1957	sched_add(td, SRQ_BORING);
1958}
1959
1960/*
1961 * Penalize the parent for creating a new child and initialize the child's
1962 * priority.
1963 */
1964void
1965sched_fork(struct thread *td, struct thread *child)
1966{
1967	THREAD_LOCK_ASSERT(td, MA_OWNED);
1968	sched_fork_thread(td, child);
1969	/*
1970	 * Penalize the parent and child for forking.
1971	 */
1972	sched_interact_fork(child);
1973	sched_priority(child);
1974	td->td_sched->ts_runtime += tickincr;
1975	sched_interact_update(td);
1976	sched_priority(td);
1977}
1978
1979/*
1980 * Fork a new thread; it may be within the same process.
1981 */
1982void
1983sched_fork_thread(struct thread *td, struct thread *child)
1984{
1985	struct td_sched *ts;
1986	struct td_sched *ts2;
1987
1988	THREAD_LOCK_ASSERT(td, MA_OWNED);
1989	/*
1990	 * Initialize child.
1991	 */
1992	ts = td->td_sched;
1993	ts2 = child->td_sched;
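	/*
	 * The child borrows this cpu's run-queue lock as its thread lock
	 * until it is first scheduled.
	 */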
1994	child->td_lock = TDQ_LOCKPTR(TDQ_SELF());
1995	child->td_cpuset = cpuset_ref(td->td_cpuset);
1996	ts2->ts_cpu = ts->ts_cpu;
1997	ts2->ts_flags = 0;
1998	/*
1999	 * Grab our parent's cpu estimation information and priority.
2000	 */
2001	ts2->ts_ticks = ts->ts_ticks;
2002	ts2->ts_ltick = ts->ts_ltick;
2003	ts2->ts_ftick = ts->ts_ftick;
2004	child->td_user_pri = td->td_user_pri;
2005	child->td_base_user_pri = td->td_base_user_pri;
2006	/*
2007	 * And update interactivity score.
2008	 */
2009	ts2->ts_slptime = ts->ts_slptime;
2010	ts2->ts_runtime = ts->ts_runtime;
2011	ts2->ts_slice = 1;	/* Attempt to quickly learn interactivity. */
2012#ifdef KTR
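	/* Force sched_tdname() to rebuild the cached name for the child. */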
2013	bzero(ts2->ts_name, sizeof(ts2->ts_name));
2014#endif
2015}
2016
2017/*
2018 * Adjust the priority class of a thread.
2019 */
2020void
2021sched_class(struct thread *td, int class)
2022{
2023
2024	THREAD_LOCK_ASSERT(td, MA_OWNED);
2025	if (td->td_pri_class == class)
2026		return;
2027	td->td_pri_class = class;
2028}
2029
2030/*
2031 * Return some of the child's priority and interactivity to the parent.
2032 */
2033void
2034sched_exit(struct proc *p, struct thread *child)
2035{
2036	struct thread *td;
2037
2038	KTR_STATE1(KTR_SCHED, "thread", sched_tdname(child), "proc exit",
2039	    "prio:td", child->td_priority);
2040	PROC_LOCK_ASSERT(p, MA_OWNED);
2041	td = FIRST_THREAD_IN_PROC(p);
2042	sched_exit_thread(td, child);
2043}
2044
2045/*
2046 * Penalize another thread for the time spent on this one.  This helps to
2047 * worsen the priority and interactivity of processes which schedule batch
2048 * jobs such as make.  This has little effect on the make process itself but
2049 * causes new processes spawned by it to receive worse scores immediately.
2050 */
2051void
2052sched_exit_thread(struct thread *td, struct thread *child)
2053{
2054
2055	KTR_STATE1(KTR_SCHED, "thread", sched_tdname(child), "thread exit",
2056	    "prio:td", child->td_priority);
2057	/*
2058	 * Give the child's runtime to the parent without returning the
2059	 * sleep time as a penalty to the parent.  This causes shells that
2060	 * launch expensive things to mark their children as expensive.
2061	 */
2062	thread_lock(td);
2063	td->td_sched->ts_runtime += child->td_sched->ts_runtime;
2064	sched_interact_update(td);
2065	sched_priority(td);
2066	thread_unlock(td);
2067}
2068
2069void
2070sched_preempt(struct thread *td)
2071{
2072	struct tdq *tdq;
2073
2074	thread_lock(td);
2075	tdq = TDQ_SELF();
2076	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
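	/* This IPI has now been serviced; let tdq_notify() send another. */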
2077	tdq->tdq_ipipending = 0;
2078	if (td->td_priority > tdq->tdq_lowpri) {
2079		int flags;
2080
2081		flags = SW_INVOL | SW_PREEMPT;
2082		if (td->td_critnest > 1)
2083			td->td_owepreempt = 1;
2084		else if (TD_IS_IDLETHREAD(td))
2085			mi_switch(flags | SWT_REMOTEWAKEIDLE, NULL);
2086		else
2087			mi_switch(flags | SWT_REMOTEPREEMPT, NULL);
2088	}
2089	thread_unlock(td);
2090}
2091
2092/*
2093 * Fix priorities on return to user-space.  Priorities may be elevated due
2094 * to static priorities in msleep() or similar.
2095 */
2096void
2097sched_userret(struct thread *td)
2098{
2099	/*
2100	 * XXX we cheat slightly on the locking here to avoid locking in
2101	 * the usual case.  Setting td_priority here is essentially an
2102	 * incomplete workaround for not setting it properly elsewhere.
2103	 * Now that some interrupt handlers are threads, not setting it
2104	 * properly elsewhere can clobber it in the window between setting
2105	 * it here and returning to user mode, so don't waste time setting
2106	 * it perfectly here.
2107	 */
2108	KASSERT((td->td_flags & TDF_BORROWING) == 0,
2109	    ("thread with borrowed priority returning to userland"));
2110	if (td->td_priority != td->td_user_pri) {
2111		thread_lock(td);
2112		td->td_priority = td->td_user_pri;
2113		td->td_base_pri = td->td_user_pri;
2114		tdq_setlowpri(TDQ_SELF(), td);
2115		thread_unlock(td);
2116	}
2117}
2118
2119/*
2120 * Handle a stathz tick.  This is really only relevant for timeshare
2121 * threads.
2122 */
2123void
2124sched_clock(struct thread *td)
2125{
2126	struct tdq *tdq;
2127	struct td_sched *ts;
2128
2129	THREAD_LOCK_ASSERT(td, MA_OWNED);
2130	tdq = TDQ_SELF();
2131#ifdef SMP
2132	/*
2133	 * We run the long term load balancer infrequently on the first cpu.
2134	 */
2135	if (balance_tdq == tdq) {
2136		if (balance_ticks && --balance_ticks == 0)
2137			sched_balance();
2138	}
2139#endif
2140	/*
2141	 * Save the old switch count so we have a record of the last ticks
2142	 * activity.  Initialize the new switch count based on our load.
2143	 * If there is some activity, seed it to reflect that.
2144	 */
2145	tdq->tdq_oldswitchcnt = tdq->tdq_switchcnt;
2146	tdq->tdq_switchcnt = tdq->tdq_load;
2147	/*
2148	 * Advance the insert index once for each tick to ensure that all
2149	 * threads get a chance to run.
2150	 */
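	 * The remove index (tdq_ridx) only catches up once its queue has
	 * drained, so threads already queued at older slots run first.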
2151	if (tdq->tdq_idx == tdq->tdq_ridx) {
2152		tdq->tdq_idx = (tdq->tdq_idx + 1) % RQ_NQS;
2153		if (TAILQ_EMPTY(&tdq->tdq_timeshare.rq_queues[tdq->tdq_ridx]))
2154			tdq->tdq_ridx = tdq->tdq_idx;
2155	}
2156	ts = td->td_sched;
2157	if (td->td_pri_class & PRI_FIFO_BIT)
2158		return;
2159	if (td->td_pri_class == PRI_TIMESHARE) {
2160		/*
2161		 * We used a tick; charge it to the thread so
2162		 * that we can compute our interactivity.
2163		 */
2164		td->td_sched->ts_runtime += tickincr;
2165		sched_interact_update(td);
2166		sched_priority(td);
2167	}
2168	/*
2169	 * We used up one time slice.
2170	 */
2171	if (--ts->ts_slice > 0)
2172		return;
2173	/*
2174	 * We're out of time, force a requeue at userret().
2175	 */
2176	ts->ts_slice = sched_slice;
2177	td->td_flags |= TDF_NEEDRESCHED;
2178}
2179
2180/*
2181 * Called once per hz tick.  Used for cpu utilization information.  This
2182 * is easier than trying to scale based on stathz.
2183 */
2184void
2185sched_tick(void)
2186{
2187	struct td_sched *ts;
2188
2189	ts = curthread->td_sched;
2190	/*
2191	 * Ticks is updated asynchronously on a single cpu.  Check here to
2192	 * avoid incrementing ts_ticks multiple times in a single tick.
2193	 */
2194	if (ts->ts_ltick == ticks)
2195		return;
2196	/* Adjust ticks for pctcpu */
2197	ts->ts_ticks += 1 << SCHED_TICK_SHIFT;
2198	ts->ts_ltick = ticks;
2199	/*
2200	 * Update if we've exceeded our desired tick threshold by over one
2201	 * second.
2202	 */
2203	if (ts->ts_ftick + SCHED_TICK_MAX < ts->ts_ltick)
2204		sched_pctcpu_update(ts);
2205}
2206
2207/*
2208 * Return whether the current CPU has runnable tasks.  Used for in-kernel
2209 * cooperative idle threads.
2210 */
2211int
2212sched_runnable(void)
2213{
2214	struct tdq *tdq;
2215	int load;
2216
2217	load = 1;
2218
2219	tdq = TDQ_SELF();
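	/* A non-idle caller is itself counted in tdq_load, so discount it. */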
2220	if ((curthread->td_flags & TDF_IDLETD) != 0) {
2221		if (tdq->tdq_load > 0)
2222			goto out;
2223	} else
2224		if (tdq->tdq_load - 1 > 0)
2225			goto out;
2226	load = 0;
2227out:
2228	return (load);
2229}
2230
2231/*
2232 * Choose the highest priority thread to run.  The thread is removed from
2233 * the run-queue while running, but the load remains.  For SMP we set
2234 * the tdq in the global idle bitmask if it idles here.
2235 */
2236struct thread *
2237sched_choose(void)
2238{
2239	struct thread *td;
2240	struct tdq *tdq;
2241
2242	tdq = TDQ_SELF();
2243	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
2244	td = tdq_choose(tdq);
2245	if (td) {
2246		td->td_sched->ts_ltick = ticks;
2247		tdq_runq_rem(tdq, td);
2248		tdq->tdq_lowpri = td->td_priority;
2249		return (td);
2250	}
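	/*
	 * Nothing is runnable; advertise the idle priority so that any
	 * remote wakeup will preempt this cpu.
	 */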
2251	tdq->tdq_lowpri = PRI_MAX_IDLE;
2252	return (PCPU_GET(idlethread));
2253}
2254
2255/*
2256 * Set owepreempt if necessary.  Preemption never happens directly in ULE;
2257 * we always request it once we exit a critical section.
2258 */
2259static inline void
2260sched_setpreempt(struct thread *td)
2261{
2262	struct thread *ctd;
2263	int cpri;
2264	int pri;
2265
2266	THREAD_LOCK_ASSERT(curthread, MA_OWNED);
2267
2268	ctd = curthread;
2269	pri = td->td_priority;
2270	cpri = ctd->td_priority;
2271	if (pri < cpri)
2272		ctd->td_flags |= TDF_NEEDRESCHED;
2273	if (panicstr != NULL || pri >= cpri || cold || TD_IS_INHIBITED(ctd))
2274		return;
2275	if (!sched_shouldpreempt(pri, cpri, 0))
2276		return;
2277	ctd->td_owepreempt = 1;
2278}
2279
2280/*
2281 * Add a thread to a thread queue.  Select the appropriate runq and add the
2282 * thread to it.  This is the internal function called when the tdq is
2283 * predetermined.
2284 */
2285void
2286tdq_add(struct tdq *tdq, struct thread *td, int flags)
2287{
2288
2289	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
2290	KASSERT((td->td_inhibitors == 0),
2291	    ("sched_add: trying to run inhibited thread"));
2292	KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
2293	    ("sched_add: bad thread state"));
2294	KASSERT(td->td_flags & TDF_INMEM,
2295	    ("sched_add: thread swapped out"));
2296
2297	if (td->td_priority < tdq->tdq_lowpri)
2298		tdq->tdq_lowpri = td->td_priority;
2299	tdq_runq_add(tdq, td, flags);
2300	tdq_load_add(tdq, td);
2301}
2302
2303/*
2304 * Select the target thread queue and add a thread to it.  Request
2305 * preemption or IPI a remote processor if required.
2306 */
2307void
2308sched_add(struct thread *td, int flags)
2309{
2310	struct tdq *tdq;
2311#ifdef SMP
2312	int cpu;
2313#endif
2314
2315	KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq add",
2316	    "prio:%d", td->td_priority, KTR_ATTR_LINKED,
2317	    sched_tdname(curthread));
2318	KTR_POINT1(KTR_SCHED, "thread", sched_tdname(curthread), "wokeup",
2319	    KTR_ATTR_LINKED, sched_tdname(td));
2320	THREAD_LOCK_ASSERT(td, MA_OWNED);
2321	/*
2322	 * Recalculate the priority before we select the target cpu or
2323	 * run-queue.
2324	 */
2325	if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE)
2326		sched_priority(td);
2327#ifdef SMP
2328	/*
2329	 * Pick the destination cpu and if it isn't ours transfer to the
2330	 * target cpu.
2331	 */
2332	cpu = sched_pickcpu(td, flags);
2333	tdq = sched_setcpu(td, cpu, flags);
2334	tdq_add(tdq, td, flags);
2335	if (cpu != PCPU_GET(cpuid)) {
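		/*
		 * The thread was queued on a remote cpu; tdq_notify() sends a
		 * preemption IPI if needed, so skip the local preempt check.
		 */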
2336		tdq_notify(tdq, td);
2337		return;
2338	}
2339#else
2340	tdq = TDQ_SELF();
2341	TDQ_LOCK(tdq);
2342	/*
2343	 * Now that the thread is moving to the run-queue, set the lock
2344	 * to the scheduler's lock.
2345	 */
2346	thread_lock_set(td, TDQ_LOCKPTR(tdq));
2347	tdq_add(tdq, td, flags);
2348#endif
2349	if (!(flags & SRQ_YIELDING))
2350		sched_setpreempt(td);
2351}
2352
2353/*
2354 * Remove a thread from a run-queue without running it.  This is used
2355 * when we're stealing a thread from a remote queue.  Otherwise all threads
2356 * exit by calling sched_exit_thread() and sched_throw() themselves.
2357 */
2358void
2359sched_rem(struct thread *td)
2360{
2361	struct tdq *tdq;
2362
2363	KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "runq rem",
2364	    "prio:%d", td->td_priority);
2365	tdq = TDQ_CPU(td->td_sched->ts_cpu);
2366	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
2367	MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
2368	KASSERT(TD_ON_RUNQ(td),
2369	    ("sched_rem: thread not on run queue"));
2370	tdq_runq_rem(tdq, td);
2371	tdq_load_rem(tdq, td);
2372	TD_SET_CAN_RUN(td);
2373	if (td->td_priority == tdq->tdq_lowpri)
2374		tdq_setlowpri(tdq, NULL);
2375}
2376
2377/*
2378 * Fetch cpu utilization information.  Updates on demand.
2379 */
2380fixpt_t
2381sched_pctcpu(struct thread *td)
2382{
2383	fixpt_t pctcpu;
2384	struct td_sched *ts;
2385
2386	pctcpu = 0;
2387	ts = td->td_sched;
2388	if (ts == NULL)
2389		return (0);
2390
2391	thread_lock(td);
2392	if (ts->ts_ticks) {
2393		int rtick;
2394
2395		sched_pctcpu_update(ts);
2396		/* How many rticks per second? */
2397		rtick = min(SCHED_TICK_HZ(ts) / SCHED_TICK_SECS, hz);
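		/*
		 * Example: a thread that ran during every tick has
		 * rtick == hz, which yields pctcpu == FSCALE, i.e. 100%.
		 */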
2398		pctcpu = (FSCALE * ((FSCALE * rtick)/hz)) >> FSHIFT;
2399	}
2400	thread_unlock(td);
2401
2402	return (pctcpu);
2403}
2404
2405/*
2406 * Enforce affinity settings for a thread.  Called after adjustments to
2407 * cpumask.
2408 */
2409void
2410sched_affinity(struct thread *td)
2411{
2412#ifdef SMP
2413	struct td_sched *ts;
2414	int cpu;
2415
2416	THREAD_LOCK_ASSERT(td, MA_OWNED);
2417	ts = td->td_sched;
2418	if (THREAD_CAN_SCHED(td, ts->ts_cpu))
2419		return;
2420	if (!TD_IS_RUNNING(td))
2421		return;
2422	td->td_flags |= TDF_NEEDRESCHED;
2423	if (!THREAD_CAN_MIGRATE(td))
2424		return;
2425	/*
2426	 * Assign the new cpu and force a switch before returning to
2427	 * userspace.  If the target thread is not running locally, send
2428	 * an ipi to force the issue.
2429	 */
2430	cpu = ts->ts_cpu;
2431	ts->ts_cpu = sched_pickcpu(td, 0);
2432	if (cpu != PCPU_GET(cpuid))
2433		ipi_selected(1 << cpu, IPI_PREEMPT);
2434#endif
2435}
2436
2437/*
2438 * Bind a thread to a target cpu.
2439 */
2440void
2441sched_bind(struct thread *td, int cpu)
2442{
2443	struct td_sched *ts;
2444
2445	THREAD_LOCK_ASSERT(td, MA_OWNED|MA_NOTRECURSED);
2446	ts = td->td_sched;
2447	if (ts->ts_flags & TSF_BOUND)
2448		sched_unbind(td);
2449	ts->ts_flags |= TSF_BOUND;
2450	sched_pin();
2451	if (PCPU_GET(cpuid) == cpu)
2452		return;
2453	ts->ts_cpu = cpu;
2454	/* When we return from mi_switch we'll be on the correct cpu. */
2455	mi_switch(SW_VOL, NULL);
2456}
2457
2458/*
2459 * Release a bound thread.
2460 */
2461void
2462sched_unbind(struct thread *td)
2463{
2464	struct td_sched *ts;
2465
2466	THREAD_LOCK_ASSERT(td, MA_OWNED);
2467	ts = td->td_sched;
2468	if ((ts->ts_flags & TSF_BOUND) == 0)
2469		return;
2470	ts->ts_flags &= ~TSF_BOUND;
2471	sched_unpin();
2472}
2473
2474int
2475sched_is_bound(struct thread *td)
2476{
2477	THREAD_LOCK_ASSERT(td, MA_OWNED);
2478	return (td->td_sched->ts_flags & TSF_BOUND);
2479}
2480
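/*
 * Illustrative caller pattern for the binding interface (a sketch for
 * reference, not code taken from this file).  The thread lock must be
 * held across each call, and sched_bind() may block while migrating to
 * the target cpu:
 *
 *	thread_lock(curthread);
 *	sched_bind(curthread, cpu);	(running on 'cpu' from here on)
 *	thread_unlock(curthread);
 *	... per-cpu work ...
 *	thread_lock(curthread);
 *	sched_unbind(curthread);
 *	thread_unlock(curthread);
 */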
2481/*
2482 * Basic yield call.
2483 */
2484void
2485sched_relinquish(struct thread *td)
2486{
2487	thread_lock(td);
2488	mi_switch(SW_VOL | SWT_RELINQUISH, NULL);
2489	thread_unlock(td);
2490}
2491
2492/*
2493 * Return the total system load.
2494 */
2495int
2496sched_load(void)
2497{
2498#ifdef SMP
2499	int total;
2500	int i;
2501
2502	total = 0;
2503	for (i = 0; i <= mp_maxid; i++)
2504		total += TDQ_CPU(i)->tdq_sysload;
2505	return (total);
2506#else
2507	return (TDQ_SELF()->tdq_sysload);
2508#endif
2509}
2510
2511int
2512sched_sizeof_proc(void)
2513{
2514	return (sizeof(struct proc));
2515}
2516
2517int
2518sched_sizeof_thread(void)
2519{
2520	return (sizeof(struct thread) + sizeof(struct td_sched));
2521}
2522
2523/*
2524 * The actual idle process.
2525 */
2526void
2527sched_idletd(void *dummy)
2528{
2529	struct thread *td;
2530	struct tdq *tdq;
2531	int switchcnt;
2532	int i;
2533
2534	td = curthread;
2535	tdq = TDQ_SELF();
2536	mtx_assert(&Giant, MA_NOTOWNED);
2537	/* ULE relies on preemption for idle interruption. */
2538	for (;;) {
2539		tdq->tdq_idlestate = TDQ_RUNNING;
2540#ifdef SMP
2541		if (tdq_idled(tdq) == 0)
2542			continue;
2543#endif
2544		switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt;
2545		/*
2546		 * If we're switching very frequently, spin while checking
2547		 * for load rather than entering a low power state that
2548		 * requires an IPI.
2549		 */
2550		if (switchcnt > sched_idlespinthresh) {
2551			for (i = 0; i < sched_idlespins; i++) {
2552				if (tdq->tdq_load)
2553					break;
2554				cpu_spinwait();
2555			}
2556		}
2557		/*
2558		 * We must set our state to IDLE before checking
2559		 * tdq_load for the last time to avoid a race with
2560		 * tdq_notify().
2561		 */
2562		if (tdq->tdq_load == 0) {
2563			switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt;
2564			tdq->tdq_idlestate = TDQ_IDLE;
2565			if (tdq->tdq_load == 0)
2566				cpu_idle(switchcnt > 1);
2567		}
2568		if (tdq->tdq_load) {
2569			thread_lock(td);
2570			mi_switch(SW_VOL | SWT_IDLE, NULL);
2571			thread_unlock(td);
2572		}
2573	}
2574}
2575
2576/*
2577 * A CPU is entering the scheduler for the first time or a thread is exiting.
2578 */
2579void
2580sched_throw(struct thread *td)
2581{
2582	struct thread *newtd;
2583	struct tdq *tdq;
2584
2585	tdq = TDQ_SELF();
2586	if (td == NULL) {
2587		/* Correct spinlock nesting and acquire the correct lock. */
2588		TDQ_LOCK(tdq);
2589		spinlock_exit();
2590	} else {
2591		MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
2592		tdq_load_rem(tdq, td);
2593		lock_profile_release_lock(&TDQ_LOCKPTR(tdq)->lock_object);
2594	}
2595	KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count"));
2596	newtd = choosethread();
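	/* As in sched_switch(), hand run-queue lock ownership to newtd. */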
2597	TDQ_LOCKPTR(tdq)->mtx_lock = (uintptr_t)newtd;
2598	PCPU_SET(switchtime, cpu_ticks());
2599	PCPU_SET(switchticks, ticks);
2600	cpu_throw(td, newtd);		/* doesn't return */
2601}
2602
2603/*
2604 * This is called from fork_exit().  Just acquire the correct locks and
2605 * let fork do the rest of the work.
2606 */
2607void
2608sched_fork_exit(struct thread *td)
2609{
2610	struct td_sched *ts;
2611	struct tdq *tdq;
2612	int cpuid;
2613
2614	/*
2615	 * Finish setting up thread glue so that it begins execution in a
2616	 * non-nested critical section with the scheduler lock held.
2617	 */
2618	cpuid = PCPU_GET(cpuid);
2619	tdq = TDQ_CPU(cpuid);
2620	ts = td->td_sched;
2621	if (TD_IS_IDLETHREAD(td))
2622		td->td_lock = TDQ_LOCKPTR(tdq);
2623	MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
2624	td->td_oncpu = cpuid;
2625	TDQ_LOCK_ASSERT(tdq, MA_OWNED | MA_NOTRECURSED);
2626	lock_profile_obtain_lock_success(
2627	    &TDQ_LOCKPTR(tdq)->lock_object, 0, 0, __FILE__, __LINE__);
2628}
2629
2630/*
2631 * Create the thread name on first use to catch odd startup conditions.
2632 */
2633char *
2634sched_tdname(struct thread *td)
2635{
2636#ifdef KTR
2637	struct td_sched *ts;
2638
2639	ts = td->td_sched;
2640	if (ts->ts_name[0] == '\0')
2641		snprintf(ts->ts_name, sizeof(ts->ts_name),
2642		    "%s tid %d", td->td_name, td->td_tid);
2643	return (ts->ts_name);
2644#else
2645	return (td->td_name);
2646#endif
2647}
2648
2649#ifdef SMP
2650
2651/*
2652 * Build the CPU topology dump string.  It is called recursively to collect
2653 * the topology tree.
2654 */
2655static int
2656sysctl_kern_sched_topology_spec_internal(struct sbuf *sb, struct cpu_group *cg,
2657    int indent)
2658{
2659	int i, first;
2660
2661	sbuf_printf(sb, "%*s<group level=\"%d\" cache-level=\"%d\">\n", indent,
2662	    "", indent, cg->cg_level);
2663	sbuf_printf(sb, "%*s <cpu count=\"%d\" mask=\"0x%x\">", indent, "",
2664	    cg->cg_count, cg->cg_mask);
2665	first = TRUE;
2666	for (i = 0; i < MAXCPU; i++) {
2667		if ((cg->cg_mask & (1 << i)) != 0) {
2668			if (!first)
2669				sbuf_printf(sb, ", ");
2670			else
2671				first = FALSE;
2672			sbuf_printf(sb, "%d", i);
2673		}
2674	}
2675	sbuf_printf(sb, "</cpu>\n");
2676
2677	sbuf_printf(sb, "%*s <flags>", indent, "");
2678	if (cg->cg_flags != 0) {
2679		if ((cg->cg_flags & CG_FLAG_HTT) != 0)
2680			sbuf_printf(sb, "<flag name=\"HTT\">HTT group</flag>\n");
2681		if ((cg->cg_flags & CG_FLAG_THREAD) != 0)
2682			sbuf_printf(sb, "<flag name=\"THREAD\">SMT group</flag>\n");
2683	}
2684	sbuf_printf(sb, "</flags>\n");
2685
2686	if (cg->cg_children > 0) {
2687		sbuf_printf(sb, "%*s <children>\n", indent, "");
2688		for (i = 0; i < cg->cg_children; i++)
2689			sysctl_kern_sched_topology_spec_internal(sb,
2690			    &cg->cg_child[i], indent+2);
2691		sbuf_printf(sb, "%*s </children>\n", indent, "");
2692	}
2693	sbuf_printf(sb, "%*s</group>\n", indent, "");
2694	return (0);
2695}
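/*
 * For reference, the dump produced above is XML of roughly this shape
 * (values are illustrative, not from a real topology):
 *
 *	<groups>
 *	 <group level="1" cache-level="0">
 *	  <cpu count="2" mask="0x3">0, 1</cpu>
 *	  <flags></flags>
 *	 </group>
 *	</groups>
 */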
2696
2697/*
2698 * Sysctl handler for retrieving topology dump. It's a wrapper for
2699 * the recursive sysctl_kern_sched_topology_spec_internal().
2700 */
2701static int
2702sysctl_kern_sched_topology_spec(SYSCTL_HANDLER_ARGS)
2703{
2704	struct sbuf *topo;
2705	int err;
2706
2707	KASSERT(cpu_top != NULL, ("cpu_top isn't initialized"));
2708
2709	topo = sbuf_new(NULL, NULL, 500, SBUF_AUTOEXTEND);
2710	if (topo == NULL)
2711		return (ENOMEM);
2712
2713	sbuf_printf(topo, "<groups>\n");
2714	err = sysctl_kern_sched_topology_spec_internal(topo, cpu_top, 1);
2715	sbuf_printf(topo, "</groups>\n");
2716
2717	if (err == 0) {
2718		sbuf_finish(topo);
2719		err = SYSCTL_OUT(req, sbuf_data(topo), sbuf_len(topo));
2720	}
2721	sbuf_delete(topo);
2722	return (err);
2723}
2724#endif
2725
2726SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "Scheduler");
2727SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ULE", 0,
2728    "Scheduler name");
2729SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0,
2730    "Slice size for timeshare threads");
2731SYSCTL_INT(_kern_sched, OID_AUTO, interact, CTLFLAG_RW, &sched_interact, 0,
2732     "Interactivity score threshold");
2733SYSCTL_INT(_kern_sched, OID_AUTO, preempt_thresh, CTLFLAG_RW, &preempt_thresh,
2734     0, "Min priority for preemption, lower priorities have greater precedence");
2735SYSCTL_INT(_kern_sched, OID_AUTO, static_boost, CTLFLAG_RW, &static_boost,
2736     0, "Controls whether static kernel priorities are assigned to sleeping threads.");
2737SYSCTL_INT(_kern_sched, OID_AUTO, idlespins, CTLFLAG_RW, &sched_idlespins,
2738     0, "Number of times idle will spin waiting for new work.");
2739SYSCTL_INT(_kern_sched, OID_AUTO, idlespinthresh, CTLFLAG_RW, &sched_idlespinthresh,
2740     0, "Threshold before we will permit idle spinning.");
2741#ifdef SMP
2742SYSCTL_INT(_kern_sched, OID_AUTO, affinity, CTLFLAG_RW, &affinity, 0,
2743    "Number of hz ticks to keep thread affinity for");
2744SYSCTL_INT(_kern_sched, OID_AUTO, balance, CTLFLAG_RW, &rebalance, 0,
2745    "Enables the long-term load balancer");
2746SYSCTL_INT(_kern_sched, OID_AUTO, balance_interval, CTLFLAG_RW,
2747    &balance_interval, 0,
2748    "Average frequency in stathz ticks to run the long-term balancer");
2749SYSCTL_INT(_kern_sched, OID_AUTO, steal_htt, CTLFLAG_RW, &steal_htt, 0,
2750    "Steals work from another hyper-threaded core on idle");
2751SYSCTL_INT(_kern_sched, OID_AUTO, steal_idle, CTLFLAG_RW, &steal_idle, 0,
2752    "Attempts to steal work from other cores before idling");
2753SYSCTL_INT(_kern_sched, OID_AUTO, steal_thresh, CTLFLAG_RW, &steal_thresh, 0,
2754    "Minimum load on remote cpu before we'll steal");
2755
2756/* Retrieve SMP topology */
2757SYSCTL_PROC(_kern_sched, OID_AUTO, topology_spec, CTLTYPE_STRING |
2758    CTLFLAG_RD, NULL, 0, sysctl_kern_sched_topology_spec, "A",
2759    "XML dump of detected CPU topology");
2760#endif
2761
2762/* ps compat.  All cpu percentages from ULE are weighted. */
2763static int ccpu = 0;
2764SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
2765