sched_ule.c revision 178277
1/*-
2 * Copyright (c) 2002-2007, Jeffrey Roberson <jeff@freebsd.org>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice unmodified, this list of conditions, and the following
10 *    disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
19 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27/*
28 * This file implements the ULE scheduler.  ULE supports independent CPU
29 * run queues and fine grain locking.  It has superior interactive
30 * performance under load even on uni-processor systems.
31 *
32 * etymology:
33 *   ULE is the last three letters in schedule.  It owes its name to a
34 * generic user created for a scheduling system by Paul Mikesell at
35 * Isilon Systems and a general lack of creativity on the part of the author.
36 */
37
38#include <sys/cdefs.h>
39__FBSDID("$FreeBSD: head/sys/kern/sched_ule.c 178277 2008-04-17 09:56:01Z jeff $");
40
41#include "opt_hwpmc_hooks.h"
42#include "opt_sched.h"
43
44#include <sys/param.h>
45#include <sys/systm.h>
46#include <sys/kdb.h>
47#include <sys/kernel.h>
48#include <sys/ktr.h>
49#include <sys/lock.h>
50#include <sys/mutex.h>
51#include <sys/proc.h>
52#include <sys/resource.h>
53#include <sys/resourcevar.h>
54#include <sys/sched.h>
55#include <sys/smp.h>
56#include <sys/sx.h>
57#include <sys/sysctl.h>
58#include <sys/sysproto.h>
59#include <sys/turnstile.h>
60#include <sys/umtx.h>
61#include <sys/vmmeter.h>
62#include <sys/cpuset.h>
63#ifdef KTRACE
64#include <sys/uio.h>
65#include <sys/ktrace.h>
66#endif
67
68#ifdef HWPMC_HOOKS
69#include <sys/pmckern.h>
70#endif
71
72#include <machine/cpu.h>
73#include <machine/smp.h>
74
75#if defined(__sparc64__) || defined(__mips__)
76#error "This architecture is not currently compatible with ULE"
77#endif
78
79#define	KTR_ULE	0
80
81/*
82 * Thread scheduler specific section.  All fields are protected
83 * by the thread lock.
84 */
85struct td_sched {
86	struct runq	*ts_runq;	/* Run-queue we're queued on. */
87	short		ts_flags;	/* TSF_* flags. */
88	u_char		ts_cpu;		/* CPU that we have affinity for. */
89	int		ts_rltick;	/* Real last tick, for affinity. */
90	int		ts_slice;	/* Ticks of slice remaining. */
91	u_int		ts_slptime;	/* Number of ticks we vol. slept */
92	u_int		ts_runtime;	/* Number of ticks we were running */
93	int		ts_ltick;	/* Last tick that we were running on */
94	int		ts_ftick;	/* First tick that we were running on */
95	int		ts_ticks;	/* Tick count */
96};
97/* flags kept in ts_flags */
98#define	TSF_BOUND	0x0001		/* Thread can not migrate. */
99#define	TSF_XFERABLE	0x0002		/* Thread was added as transferable. */
100
101static struct td_sched td_sched0;
102
103#define	THREAD_CAN_MIGRATE(td)	((td)->td_pinned == 0)
104#define	THREAD_CAN_SCHED(td, cpu)	\
105    CPU_ISSET((cpu), &(td)->td_cpuset->cs_mask)
106
107/*
108 * Cpu percentage computation macros and defines.
109 *
110 * SCHED_TICK_SECS:	Number of seconds to average the cpu usage across.
111 * SCHED_TICK_TARG:	Number of hz ticks to average the cpu usage across.
112 * SCHED_TICK_MAX:	Maximum number of ticks before scaling back.
113 * SCHED_TICK_SHIFT:	Shift factor to avoid rounding away results.
114 * SCHED_TICK_HZ:	Compute the number of hz ticks for a given ticks count.
115 * SCHED_TICK_TOTAL:	Gives the amount of time we've been recording ticks.
116 */
117#define	SCHED_TICK_SECS		10
118#define	SCHED_TICK_TARG		(hz * SCHED_TICK_SECS)
119#define	SCHED_TICK_MAX		(SCHED_TICK_TARG + hz)
120#define	SCHED_TICK_SHIFT	10
121#define	SCHED_TICK_HZ(ts)	((ts)->ts_ticks >> SCHED_TICK_SHIFT)
122#define	SCHED_TICK_TOTAL(ts)	(max((ts)->ts_ltick - (ts)->ts_ftick, hz))
123
124/*
125 * These macros determine priorities for non-interactive threads.  They are
126 * assigned a priority based on their recent cpu utilization as expressed
127 * by the ratio of ticks to the tick total.  NHALF priorities at the start
128 * and end of the MIN to MAX timeshare range are only reachable with negative
129 * or positive nice respectively.
130 *
131 * PRI_RANGE:	Priority range for utilization dependent priorities.
132 * PRI_NRESV:	Number of nice values.
133 * PRI_TICKS:	Compute a priority in PRI_RANGE from the ticks count and total.
134 * PRI_NICE:	Determines the part of the priority inherited from nice.
135 */
136#define	SCHED_PRI_NRESV		(PRIO_MAX - PRIO_MIN)
137#define	SCHED_PRI_NHALF		(SCHED_PRI_NRESV / 2)
138#define	SCHED_PRI_MIN		(PRI_MIN_TIMESHARE + SCHED_PRI_NHALF)
139#define	SCHED_PRI_MAX		(PRI_MAX_TIMESHARE - SCHED_PRI_NHALF)
140#define	SCHED_PRI_RANGE		(SCHED_PRI_MAX - SCHED_PRI_MIN)
141#define	SCHED_PRI_TICKS(ts)						\
142    (SCHED_TICK_HZ((ts)) /						\
143    (roundup(SCHED_TICK_TOTAL((ts)), SCHED_PRI_RANGE) / SCHED_PRI_RANGE))
144#define	SCHED_PRI_NICE(nice)	(nice)
145
146/*
147 * These determine the interactivity of a process.  Interactivity differs from
148 * cpu utilization in that it expresses the voluntary time slept vs time ran
149 * while cpu utilization includes all time not running.  This more accurately
150 * models the intent of the thread.
151 *
152 * SLP_RUN_MAX:	Maximum amount of sleep time + run time we'll accumulate
153 *		before throttling back.
154 * SLP_RUN_FORK:	Maximum slp+run time to inherit at fork time.
155 * INTERACT_MAX:	Maximum interactivity value.  Smaller is better.
156 * INTERACT_THRESH:	Threshhold for placement on the current runq.
157 */
158#define	SCHED_SLP_RUN_MAX	((hz * 5) << SCHED_TICK_SHIFT)
159#define	SCHED_SLP_RUN_FORK	((hz / 2) << SCHED_TICK_SHIFT)
160#define	SCHED_INTERACT_MAX	(100)
161#define	SCHED_INTERACT_HALF	(SCHED_INTERACT_MAX / 2)
162#define	SCHED_INTERACT_THRESH	(30)
163
164/*
165 * tickincr:		Converts a stathz tick into a hz domain scaled by
166 *			the shift factor.  Without the shift the error rate
167 *			due to rounding would be unacceptably high.
168 * realstathz:		stathz is sometimes 0 and run off of hz.
169 * sched_slice:		Runtime of each thread before rescheduling.
170 * preempt_thresh:	Priority threshold for preemption and remote IPIs.
171 */
172static int sched_interact = SCHED_INTERACT_THRESH;
173static int realstathz;
174static int tickincr;
175static int sched_slice = 1;
176#ifdef PREEMPTION
177#ifdef FULL_PREEMPTION
178static int preempt_thresh = PRI_MAX_IDLE;
179#else
180static int preempt_thresh = PRI_MIN_KERN;
181#endif
182#else
183static int preempt_thresh = 0;
184#endif
185static int static_boost = PRI_MIN_TIMESHARE;
186static int sched_idlespins = 10000;
187static int sched_idlespinthresh = 4;
188
189/*
190 * tdq - per processor runqs and statistics.  All fields are protected by the
191 * tdq_lock.  The load and lowpri may be accessed without to avoid excess
192 * locking in sched_pickcpu();
193 */
194struct tdq {
195	/* Ordered to improve efficiency of cpu_search() and switch(). */
196	struct mtx	tdq_lock;		/* run queue lock. */
197	struct cpu_group *tdq_cg;		/* Pointer to cpu topology. */
198	volatile int	tdq_load;		/* Aggregate load. */
199	int		tdq_sysload;		/* For loadavg, !ITHD load. */
200	int		tdq_transferable;	/* Transferable thread count. */
201	volatile int	tdq_idlestate;		/* State of the idle thread. */
202	short		tdq_switchcnt;		/* Switches this tick. */
203	short		tdq_oldswitchcnt;	/* Switches last tick. */
204	u_char		tdq_lowpri;		/* Lowest priority thread. */
205	u_char		tdq_ipipending;		/* IPI pending. */
206	u_char		tdq_idx;		/* Current insert index. */
207	u_char		tdq_ridx;		/* Current removal index. */
208	struct runq	tdq_realtime;		/* real-time run queue. */
209	struct runq	tdq_timeshare;		/* timeshare run queue. */
210	struct runq	tdq_idle;		/* Queue of IDLE threads. */
211	char		tdq_name[sizeof("sched lock") + 6];
212} __aligned(64);
213
214/* Idle thread states and config. */
215#define	TDQ_RUNNING	1
216#define	TDQ_IDLE	2
217
218#ifdef SMP
219struct cpu_group *cpu_top;
220
221#define	SCHED_AFFINITY_DEFAULT	(max(1, hz / 1000))
222#define	SCHED_AFFINITY(ts, t)	((ts)->ts_rltick > ticks - ((t) * affinity))
223
224/*
225 * Run-time tunables.
226 */
227static int rebalance = 1;
228static int balance_interval = 128;	/* Default set in sched_initticks(). */
229static int affinity;
230static int steal_htt = 1;
231static int steal_idle = 1;
232static int steal_thresh = 2;
233
234/*
235 * One thread queue per processor.
236 */
237static struct tdq	tdq_cpu[MAXCPU];
238static struct tdq	*balance_tdq;
239static int balance_ticks;
240
241#define	TDQ_SELF()	(&tdq_cpu[PCPU_GET(cpuid)])
242#define	TDQ_CPU(x)	(&tdq_cpu[(x)])
243#define	TDQ_ID(x)	((int)((x) - tdq_cpu))
244#else	/* !SMP */
245static struct tdq	tdq_cpu;
246
247#define	TDQ_ID(x)	(0)
248#define	TDQ_SELF()	(&tdq_cpu)
249#define	TDQ_CPU(x)	(&tdq_cpu)
250#endif
251
252#define	TDQ_LOCK_ASSERT(t, type)	mtx_assert(TDQ_LOCKPTR((t)), (type))
253#define	TDQ_LOCK(t)		mtx_lock_spin(TDQ_LOCKPTR((t)))
254#define	TDQ_LOCK_FLAGS(t, f)	mtx_lock_spin_flags(TDQ_LOCKPTR((t)), (f))
255#define	TDQ_UNLOCK(t)		mtx_unlock_spin(TDQ_LOCKPTR((t)))
256#define	TDQ_LOCKPTR(t)		(&(t)->tdq_lock)
257
258static void sched_priority(struct thread *);
259static void sched_thread_priority(struct thread *, u_char);
260static int sched_interact_score(struct thread *);
261static void sched_interact_update(struct thread *);
262static void sched_interact_fork(struct thread *);
263static void sched_pctcpu_update(struct td_sched *);
264
265/* Operations on per processor queues */
266static struct thread *tdq_choose(struct tdq *);
267static void tdq_setup(struct tdq *);
268static void tdq_load_add(struct tdq *, struct thread *);
269static void tdq_load_rem(struct tdq *, struct thread *);
270static __inline void tdq_runq_add(struct tdq *, struct thread *, int);
271static __inline void tdq_runq_rem(struct tdq *, struct thread *);
272static inline int sched_shouldpreempt(int, int, int);
273void tdq_print(int cpu);
274static void runq_print(struct runq *rq);
275static void tdq_add(struct tdq *, struct thread *, int);
276#ifdef SMP
277static int tdq_move(struct tdq *, struct tdq *);
278static int tdq_idled(struct tdq *);
279static void tdq_notify(struct tdq *, struct thread *);
280static struct thread *tdq_steal(struct tdq *, int);
281static struct thread *runq_steal(struct runq *, int);
282static int sched_pickcpu(struct thread *, int);
283static void sched_balance(void);
284static int sched_balance_pair(struct tdq *, struct tdq *);
285static inline struct tdq *sched_setcpu(struct thread *, int, int);
286static inline struct mtx *thread_block_switch(struct thread *);
287static inline void thread_unblock_switch(struct thread *, struct mtx *);
288static struct mtx *sched_switch_migrate(struct tdq *, struct thread *, int);
289#endif
290
291static void sched_setup(void *dummy);
292SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL);
293
294static void sched_initticks(void *dummy);
295SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks,
296    NULL);
297
298/*
299 * Print the threads waiting on a run-queue.
300 */
301static void
302runq_print(struct runq *rq)
303{
304	struct rqhead *rqh;
305	struct thread *td;
306	int pri;
307	int j;
308	int i;
309
310	for (i = 0; i < RQB_LEN; i++) {
311		printf("\t\trunq bits %d 0x%zx\n",
312		    i, rq->rq_status.rqb_bits[i]);
313		for (j = 0; j < RQB_BPW; j++)
314			if (rq->rq_status.rqb_bits[i] & (1ul << j)) {
315				pri = j + (i << RQB_L2BPW);
316				rqh = &rq->rq_queues[pri];
317				TAILQ_FOREACH(td, rqh, td_runq) {
318					printf("\t\t\ttd %p(%s) priority %d rqindex %d pri %d\n",
319					    td, td->td_name, td->td_priority,
320					    td->td_rqindex, pri);
321				}
322			}
323	}
324}
325
326/*
327 * Print the status of a per-cpu thread queue.  Should be a ddb show cmd.
328 */
329void
330tdq_print(int cpu)
331{
332	struct tdq *tdq;
333
334	tdq = TDQ_CPU(cpu);
335
336	printf("tdq %d:\n", TDQ_ID(tdq));
337	printf("\tlock            %p\n", TDQ_LOCKPTR(tdq));
338	printf("\tLock name:      %s\n", tdq->tdq_name);
339	printf("\tload:           %d\n", tdq->tdq_load);
340	printf("\tswitch cnt:     %d\n", tdq->tdq_switchcnt);
341	printf("\told switch cnt: %d\n", tdq->tdq_oldswitchcnt);
342	printf("\tidle state:     %d\n", tdq->tdq_idlestate);
343	printf("\ttimeshare idx:  %d\n", tdq->tdq_idx);
344	printf("\ttimeshare ridx: %d\n", tdq->tdq_ridx);
345	printf("\tload transferable: %d\n", tdq->tdq_transferable);
346	printf("\tlowest priority:   %d\n", tdq->tdq_lowpri);
347	printf("\trealtime runq:\n");
348	runq_print(&tdq->tdq_realtime);
349	printf("\ttimeshare runq:\n");
350	runq_print(&tdq->tdq_timeshare);
351	printf("\tidle runq:\n");
352	runq_print(&tdq->tdq_idle);
353}
354
355static inline int
356sched_shouldpreempt(int pri, int cpri, int remote)
357{
358	/*
359	 * If the new priority is not better than the current priority there is
360	 * nothing to do.
361	 */
362	if (pri >= cpri)
363		return (0);
364	/*
365	 * Always preempt idle.
366	 */
367	if (cpri >= PRI_MIN_IDLE)
368		return (1);
369	/*
370	 * If preemption is disabled don't preempt others.
371	 */
372	if (preempt_thresh == 0)
373		return (0);
374	/*
375	 * Preempt if we exceed the threshold.
376	 */
377	if (pri <= preempt_thresh)
378		return (1);
379	/*
380	 * If we're realtime or better and there is timeshare or worse running
381	 * preempt only remote processors.
382	 */
383	if (remote && pri <= PRI_MAX_REALTIME && cpri > PRI_MAX_REALTIME)
384		return (1);
385	return (0);
386}
387
388#define	TS_RQ_PPQ	(((PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE) + 1) / RQ_NQS)
389/*
390 * Add a thread to the actual run-queue.  Keeps transferable counts up to
391 * date with what is actually on the run-queue.  Selects the correct
392 * queue position for timeshare threads.
393 */
394static __inline void
395tdq_runq_add(struct tdq *tdq, struct thread *td, int flags)
396{
397	struct td_sched *ts;
398	u_char pri;
399
400	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
401	THREAD_LOCK_ASSERT(td, MA_OWNED);
402
403	pri = td->td_priority;
404	ts = td->td_sched;
405	TD_SET_RUNQ(td);
406	if (THREAD_CAN_MIGRATE(td)) {
407		tdq->tdq_transferable++;
408		ts->ts_flags |= TSF_XFERABLE;
409	}
410	if (pri <= PRI_MAX_REALTIME) {
411		ts->ts_runq = &tdq->tdq_realtime;
412	} else if (pri <= PRI_MAX_TIMESHARE) {
413		ts->ts_runq = &tdq->tdq_timeshare;
414		KASSERT(pri <= PRI_MAX_TIMESHARE && pri >= PRI_MIN_TIMESHARE,
415			("Invalid priority %d on timeshare runq", pri));
416		/*
417		 * This queue contains only priorities between MIN and MAX
418		 * realtime.  Use the whole queue to represent these values.
419		 */
420		if ((flags & (SRQ_BORROWING|SRQ_PREEMPTED)) == 0) {
421			pri = (pri - PRI_MIN_TIMESHARE) / TS_RQ_PPQ;
422			pri = (pri + tdq->tdq_idx) % RQ_NQS;
423			/*
424			 * This effectively shortens the queue by one so we
425			 * can have a one slot difference between idx and
426			 * ridx while we wait for threads to drain.
427			 */
428			if (tdq->tdq_ridx != tdq->tdq_idx &&
429			    pri == tdq->tdq_ridx)
430				pri = (unsigned char)(pri - 1) % RQ_NQS;
431		} else
432			pri = tdq->tdq_ridx;
433		runq_add_pri(ts->ts_runq, td, pri, flags);
434		return;
435	} else
436		ts->ts_runq = &tdq->tdq_idle;
437	runq_add(ts->ts_runq, td, flags);
438}
439
440/*
441 * Remove a thread from a run-queue.  This typically happens when a thread
442 * is selected to run.  Running threads are not on the queue and the
443 * transferable count does not reflect them.
444 */
445static __inline void
446tdq_runq_rem(struct tdq *tdq, struct thread *td)
447{
448	struct td_sched *ts;
449
450	ts = td->td_sched;
451	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
452	KASSERT(ts->ts_runq != NULL,
453	    ("tdq_runq_remove: thread %p null ts_runq", td));
454	if (ts->ts_flags & TSF_XFERABLE) {
455		tdq->tdq_transferable--;
456		ts->ts_flags &= ~TSF_XFERABLE;
457	}
458	if (ts->ts_runq == &tdq->tdq_timeshare) {
459		if (tdq->tdq_idx != tdq->tdq_ridx)
460			runq_remove_idx(ts->ts_runq, td, &tdq->tdq_ridx);
461		else
462			runq_remove_idx(ts->ts_runq, td, NULL);
463	} else
464		runq_remove(ts->ts_runq, td);
465}
466
467/*
468 * Load is maintained for all threads RUNNING and ON_RUNQ.  Add the load
469 * for this thread to the referenced thread queue.
470 */
471static void
472tdq_load_add(struct tdq *tdq, struct thread *td)
473{
474
475	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
476	THREAD_LOCK_ASSERT(td, MA_OWNED);
477
478	tdq->tdq_load++;
479	if ((td->td_proc->p_flag & P_NOLOAD) == 0)
480		tdq->tdq_sysload++;
481	CTR2(KTR_SCHED, "cpu %d load: %d", TDQ_ID(tdq), tdq->tdq_load);
482}
483
484/*
485 * Remove the load from a thread that is transitioning to a sleep state or
486 * exiting.
487 */
488static void
489tdq_load_rem(struct tdq *tdq, struct thread *td)
490{
491
492	THREAD_LOCK_ASSERT(td, MA_OWNED);
493	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
494	KASSERT(tdq->tdq_load != 0,
495	    ("tdq_load_rem: Removing with 0 load on queue %d", TDQ_ID(tdq)));
496
497	tdq->tdq_load--;
498	if ((td->td_proc->p_flag & P_NOLOAD) == 0)
499		tdq->tdq_sysload--;
500	CTR1(KTR_SCHED, "load: %d", tdq->tdq_load);
501}
502
503/*
504 * Set lowpri to its exact value by searching the run-queue and
505 * evaluating curthread.  curthread may be passed as an optimization.
506 */
507static void
508tdq_setlowpri(struct tdq *tdq, struct thread *ctd)
509{
510	struct thread *td;
511
512	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
513	if (ctd == NULL)
514		ctd = pcpu_find(TDQ_ID(tdq))->pc_curthread;
515	td = tdq_choose(tdq);
516	if (td == NULL || td->td_priority > ctd->td_priority)
517		tdq->tdq_lowpri = ctd->td_priority;
518	else
519		tdq->tdq_lowpri = td->td_priority;
520}
521
522#ifdef SMP
523struct cpu_search {
524	cpumask_t cs_mask;	/* Mask of valid cpus. */
525	u_int	cs_load;
526	u_int	cs_cpu;
527	int	cs_limit;	/* Min priority for low min load for high. */
528};
529
530#define	CPU_SEARCH_LOWEST	0x1
531#define	CPU_SEARCH_HIGHEST	0x2
532#define	CPU_SEARCH_BOTH		(CPU_SEARCH_LOWEST|CPU_SEARCH_HIGHEST)
533
534#define	CPUMASK_FOREACH(cpu, mask)				\
535	for ((cpu) = 0; (cpu) < sizeof((mask)) * 8; (cpu)++)	\
536		if ((mask) & 1 << (cpu))
537
538static __inline int cpu_search(struct cpu_group *cg, struct cpu_search *low,
539    struct cpu_search *high, const int match);
540int cpu_search_lowest(struct cpu_group *cg, struct cpu_search *low);
541int cpu_search_highest(struct cpu_group *cg, struct cpu_search *high);
542int cpu_search_both(struct cpu_group *cg, struct cpu_search *low,
543    struct cpu_search *high);
544
545/*
546 * This routine compares according to the match argument and should be
547 * reduced in actual instantiations via constant propagation and dead code
548 * elimination.
549 */
550static __inline int
551cpu_compare(int cpu, struct cpu_search *low, struct cpu_search *high,
552    const int match)
553{
554	struct tdq *tdq;
555
556	tdq = TDQ_CPU(cpu);
557	if (match & CPU_SEARCH_LOWEST)
558		if (low->cs_mask & (1 << cpu) &&
559		    tdq->tdq_load < low->cs_load &&
560		    tdq->tdq_lowpri > low->cs_limit) {
561			low->cs_cpu = cpu;
562			low->cs_load = tdq->tdq_load;
563		}
564	if (match & CPU_SEARCH_HIGHEST)
565		if (high->cs_mask & (1 << cpu) &&
566		    tdq->tdq_load >= high->cs_limit &&
567		    tdq->tdq_load > high->cs_load &&
568		    tdq->tdq_transferable) {
569			high->cs_cpu = cpu;
570			high->cs_load = tdq->tdq_load;
571		}
572	return (tdq->tdq_load);
573}
574
575/*
576 * Search the tree of cpu_groups for the lowest or highest loaded cpu
577 * according to the match argument.  This routine actually compares the
578 * load on all paths through the tree and finds the least loaded cpu on
579 * the least loaded path, which may differ from the least loaded cpu in
580 * the system.  This balances work among caches and busses.
581 *
582 * This inline is instantiated in three forms below using constants for the
583 * match argument.  It is reduced to the minimum set for each case.  It is
584 * also recursive to the depth of the tree.
585 */
586static __inline int
587cpu_search(struct cpu_group *cg, struct cpu_search *low,
588    struct cpu_search *high, const int match)
589{
590	int total;
591
592	total = 0;
593	if (cg->cg_children) {
594		struct cpu_search lgroup;
595		struct cpu_search hgroup;
596		struct cpu_group *child;
597		u_int lload;
598		int hload;
599		int load;
600		int i;
601
602		lload = -1;
603		hload = -1;
604		for (i = 0; i < cg->cg_children; i++) {
605			child = &cg->cg_child[i];
606			if (match & CPU_SEARCH_LOWEST) {
607				lgroup = *low;
608				lgroup.cs_load = -1;
609			}
610			if (match & CPU_SEARCH_HIGHEST) {
611				hgroup = *high;
612				lgroup.cs_load = 0;
613			}
614			switch (match) {
615			case CPU_SEARCH_LOWEST:
616				load = cpu_search_lowest(child, &lgroup);
617				break;
618			case CPU_SEARCH_HIGHEST:
619				load = cpu_search_highest(child, &hgroup);
620				break;
621			case CPU_SEARCH_BOTH:
622				load = cpu_search_both(child, &lgroup, &hgroup);
623				break;
624			}
625			total += load;
626			if (match & CPU_SEARCH_LOWEST)
627				if (load < lload || low->cs_cpu == -1) {
628					*low = lgroup;
629					lload = load;
630				}
631			if (match & CPU_SEARCH_HIGHEST)
632				if (load > hload || high->cs_cpu == -1) {
633					hload = load;
634					*high = hgroup;
635				}
636		}
637	} else {
638		int cpu;
639
640		CPUMASK_FOREACH(cpu, cg->cg_mask)
641			total += cpu_compare(cpu, low, high, match);
642	}
643	return (total);
644}
645
646/*
647 * cpu_search instantiations must pass constants to maintain the inline
648 * optimization.
649 */
650int
651cpu_search_lowest(struct cpu_group *cg, struct cpu_search *low)
652{
653	return cpu_search(cg, low, NULL, CPU_SEARCH_LOWEST);
654}
655
656int
657cpu_search_highest(struct cpu_group *cg, struct cpu_search *high)
658{
659	return cpu_search(cg, NULL, high, CPU_SEARCH_HIGHEST);
660}
661
662int
663cpu_search_both(struct cpu_group *cg, struct cpu_search *low,
664    struct cpu_search *high)
665{
666	return cpu_search(cg, low, high, CPU_SEARCH_BOTH);
667}
668
669/*
670 * Find the cpu with the least load via the least loaded path that has a
671 * lowpri greater than pri  pri.  A pri of -1 indicates any priority is
672 * acceptable.
673 */
674static inline int
675sched_lowest(struct cpu_group *cg, cpumask_t mask, int pri)
676{
677	struct cpu_search low;
678
679	low.cs_cpu = -1;
680	low.cs_load = -1;
681	low.cs_mask = mask;
682	low.cs_limit = pri;
683	cpu_search_lowest(cg, &low);
684	return low.cs_cpu;
685}
686
687/*
688 * Find the cpu with the highest load via the highest loaded path.
689 */
690static inline int
691sched_highest(struct cpu_group *cg, cpumask_t mask, int minload)
692{
693	struct cpu_search high;
694
695	high.cs_cpu = -1;
696	high.cs_load = 0;
697	high.cs_mask = mask;
698	high.cs_limit = minload;
699	cpu_search_highest(cg, &high);
700	return high.cs_cpu;
701}
702
703/*
704 * Simultaneously find the highest and lowest loaded cpu reachable via
705 * cg.
706 */
707static inline void
708sched_both(struct cpu_group *cg, cpumask_t mask, int *lowcpu, int *highcpu)
709{
710	struct cpu_search high;
711	struct cpu_search low;
712
713	low.cs_cpu = -1;
714	low.cs_limit = -1;
715	low.cs_load = -1;
716	low.cs_mask = mask;
717	high.cs_load = 0;
718	high.cs_cpu = -1;
719	high.cs_limit = -1;
720	high.cs_mask = mask;
721	cpu_search_both(cg, &low, &high);
722	*lowcpu = low.cs_cpu;
723	*highcpu = high.cs_cpu;
724	return;
725}
726
727static void
728sched_balance_group(struct cpu_group *cg)
729{
730	cpumask_t mask;
731	int high;
732	int low;
733	int i;
734
735	mask = -1;
736	for (;;) {
737		sched_both(cg, mask, &low, &high);
738		if (low == high || low == -1 || high == -1)
739			break;
740		if (sched_balance_pair(TDQ_CPU(high), TDQ_CPU(low)))
741			break;
742		/*
743		 * If we failed to move any threads determine which cpu
744		 * to kick out of the set and try again.
745	 	 */
746		if (TDQ_CPU(high)->tdq_transferable == 0)
747			mask &= ~(1 << high);
748		else
749			mask &= ~(1 << low);
750	}
751
752	for (i = 0; i < cg->cg_children; i++)
753		sched_balance_group(&cg->cg_child[i]);
754}
755
756static void
757sched_balance()
758{
759	struct tdq *tdq;
760
761	/*
762	 * Select a random time between .5 * balance_interval and
763	 * 1.5 * balance_interval.
764	 */
765	balance_ticks = max(balance_interval / 2, 1);
766	balance_ticks += random() % balance_interval;
767	if (smp_started == 0 || rebalance == 0)
768		return;
769	tdq = TDQ_SELF();
770	TDQ_UNLOCK(tdq);
771	sched_balance_group(cpu_top);
772	TDQ_LOCK(tdq);
773}
774
775/*
776 * Lock two thread queues using their address to maintain lock order.
777 */
778static void
779tdq_lock_pair(struct tdq *one, struct tdq *two)
780{
781	if (one < two) {
782		TDQ_LOCK(one);
783		TDQ_LOCK_FLAGS(two, MTX_DUPOK);
784	} else {
785		TDQ_LOCK(two);
786		TDQ_LOCK_FLAGS(one, MTX_DUPOK);
787	}
788}
789
790/*
791 * Unlock two thread queues.  Order is not important here.
792 */
793static void
794tdq_unlock_pair(struct tdq *one, struct tdq *two)
795{
796	TDQ_UNLOCK(one);
797	TDQ_UNLOCK(two);
798}
799
800/*
801 * Transfer load between two imbalanced thread queues.
802 */
803static int
804sched_balance_pair(struct tdq *high, struct tdq *low)
805{
806	int transferable;
807	int high_load;
808	int low_load;
809	int moved;
810	int move;
811	int diff;
812	int i;
813
814	tdq_lock_pair(high, low);
815	transferable = high->tdq_transferable;
816	high_load = high->tdq_load;
817	low_load = low->tdq_load;
818	moved = 0;
819	/*
820	 * Determine what the imbalance is and then adjust that to how many
821	 * threads we actually have to give up (transferable).
822	 */
823	if (transferable != 0) {
824		diff = high_load - low_load;
825		move = diff / 2;
826		if (diff & 0x1)
827			move++;
828		move = min(move, transferable);
829		for (i = 0; i < move; i++)
830			moved += tdq_move(high, low);
831		/*
832		 * IPI the target cpu to force it to reschedule with the new
833		 * workload.
834		 */
835		ipi_selected(1 << TDQ_ID(low), IPI_PREEMPT);
836	}
837	tdq_unlock_pair(high, low);
838	return (moved);
839}
840
841/*
842 * Move a thread from one thread queue to another.
843 */
844static int
845tdq_move(struct tdq *from, struct tdq *to)
846{
847	struct td_sched *ts;
848	struct thread *td;
849	struct tdq *tdq;
850	int cpu;
851
852	TDQ_LOCK_ASSERT(from, MA_OWNED);
853	TDQ_LOCK_ASSERT(to, MA_OWNED);
854
855	tdq = from;
856	cpu = TDQ_ID(to);
857	td = tdq_steal(tdq, cpu);
858	if (td == NULL)
859		return (0);
860	ts = td->td_sched;
861	/*
862	 * Although the run queue is locked the thread may be blocked.  Lock
863	 * it to clear this and acquire the run-queue lock.
864	 */
865	thread_lock(td);
866	/* Drop recursive lock on from acquired via thread_lock(). */
867	TDQ_UNLOCK(from);
868	sched_rem(td);
869	ts->ts_cpu = cpu;
870	td->td_lock = TDQ_LOCKPTR(to);
871	tdq_add(to, td, SRQ_YIELDING);
872	return (1);
873}
874
875/*
876 * This tdq has idled.  Try to steal a thread from another cpu and switch
877 * to it.
878 */
879static int
880tdq_idled(struct tdq *tdq)
881{
882	struct cpu_group *cg;
883	struct tdq *steal;
884	cpumask_t mask;
885	int thresh;
886	int cpu;
887
888	if (smp_started == 0 || steal_idle == 0)
889		return (1);
890	mask = -1;
891	mask &= ~PCPU_GET(cpumask);
892	/* We don't want to be preempted while we're iterating. */
893	spinlock_enter();
894	for (cg = tdq->tdq_cg; cg != NULL; ) {
895		if ((cg->cg_flags & (CG_FLAG_HTT | CG_FLAG_THREAD)) == 0)
896			thresh = steal_thresh;
897		else
898			thresh = 1;
899		cpu = sched_highest(cg, mask, thresh);
900		if (cpu == -1) {
901			cg = cg->cg_parent;
902			continue;
903		}
904		steal = TDQ_CPU(cpu);
905		mask &= ~(1 << cpu);
906		tdq_lock_pair(tdq, steal);
907		if (steal->tdq_load < thresh || steal->tdq_transferable == 0) {
908			tdq_unlock_pair(tdq, steal);
909			continue;
910		}
911		/*
912		 * If a thread was added while interrupts were disabled don't
913		 * steal one here.  If we fail to acquire one due to affinity
914		 * restrictions loop again with this cpu removed from the
915		 * set.
916		 */
917		if (tdq->tdq_load == 0 && tdq_move(steal, tdq) == 0) {
918			tdq_unlock_pair(tdq, steal);
919			continue;
920		}
921		spinlock_exit();
922		TDQ_UNLOCK(steal);
923		mi_switch(SW_VOL | SWT_IDLE, NULL);
924		thread_unlock(curthread);
925
926		return (0);
927	}
928	spinlock_exit();
929	return (1);
930}
931
932/*
933 * Notify a remote cpu of new work.  Sends an IPI if criteria are met.
934 */
935static void
936tdq_notify(struct tdq *tdq, struct thread *td)
937{
938	int cpri;
939	int pri;
940	int cpu;
941
942	if (tdq->tdq_ipipending)
943		return;
944	cpu = td->td_sched->ts_cpu;
945	pri = td->td_priority;
946	cpri = pcpu_find(cpu)->pc_curthread->td_priority;
947	if (!sched_shouldpreempt(pri, cpri, 1))
948		return;
949	if (TD_IS_IDLETHREAD(td)) {
950		/*
951		 * If the idle thread is still 'running' it's probably
952		 * waiting on us to release the tdq spinlock already.  No
953		 * need to ipi.
954		 */
955		if (tdq->tdq_idlestate == TDQ_RUNNING)
956			return;
957	}
958	tdq->tdq_ipipending = 1;
959	ipi_selected(1 << cpu, IPI_PREEMPT);
960}
961
962/*
963 * Steals load from a timeshare queue.  Honors the rotating queue head
964 * index.
965 */
966static struct thread *
967runq_steal_from(struct runq *rq, int cpu, u_char start)
968{
969	struct rqbits *rqb;
970	struct rqhead *rqh;
971	struct thread *td;
972	int first;
973	int bit;
974	int pri;
975	int i;
976
977	rqb = &rq->rq_status;
978	bit = start & (RQB_BPW -1);
979	pri = 0;
980	first = 0;
981again:
982	for (i = RQB_WORD(start); i < RQB_LEN; bit = 0, i++) {
983		if (rqb->rqb_bits[i] == 0)
984			continue;
985		if (bit != 0) {
986			for (pri = bit; pri < RQB_BPW; pri++)
987				if (rqb->rqb_bits[i] & (1ul << pri))
988					break;
989			if (pri >= RQB_BPW)
990				continue;
991		} else
992			pri = RQB_FFS(rqb->rqb_bits[i]);
993		pri += (i << RQB_L2BPW);
994		rqh = &rq->rq_queues[pri];
995		TAILQ_FOREACH(td, rqh, td_runq) {
996			if (first && THREAD_CAN_MIGRATE(td) &&
997			    THREAD_CAN_SCHED(td, cpu))
998				return (td);
999			first = 1;
1000		}
1001	}
1002	if (start != 0) {
1003		start = 0;
1004		goto again;
1005	}
1006
1007	return (NULL);
1008}
1009
1010/*
1011 * Steals load from a standard linear queue.
1012 */
1013static struct thread *
1014runq_steal(struct runq *rq, int cpu)
1015{
1016	struct rqhead *rqh;
1017	struct rqbits *rqb;
1018	struct thread *td;
1019	int word;
1020	int bit;
1021
1022	rqb = &rq->rq_status;
1023	for (word = 0; word < RQB_LEN; word++) {
1024		if (rqb->rqb_bits[word] == 0)
1025			continue;
1026		for (bit = 0; bit < RQB_BPW; bit++) {
1027			if ((rqb->rqb_bits[word] & (1ul << bit)) == 0)
1028				continue;
1029			rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)];
1030			TAILQ_FOREACH(td, rqh, td_runq)
1031				if (THREAD_CAN_MIGRATE(td) &&
1032				    THREAD_CAN_SCHED(td, cpu))
1033					return (td);
1034		}
1035	}
1036	return (NULL);
1037}
1038
1039/*
1040 * Attempt to steal a thread in priority order from a thread queue.
1041 */
1042static struct thread *
1043tdq_steal(struct tdq *tdq, int cpu)
1044{
1045	struct thread *td;
1046
1047	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
1048	if ((td = runq_steal(&tdq->tdq_realtime, cpu)) != NULL)
1049		return (td);
1050	if ((td = runq_steal_from(&tdq->tdq_timeshare,
1051	    cpu, tdq->tdq_ridx)) != NULL)
1052		return (td);
1053	return (runq_steal(&tdq->tdq_idle, cpu));
1054}
1055
1056/*
1057 * Sets the thread lock and ts_cpu to match the requested cpu.  Unlocks the
1058 * current lock and returns with the assigned queue locked.
1059 */
1060static inline struct tdq *
1061sched_setcpu(struct thread *td, int cpu, int flags)
1062{
1063
1064	struct tdq *tdq;
1065
1066	THREAD_LOCK_ASSERT(td, MA_OWNED);
1067	tdq = TDQ_CPU(cpu);
1068	td->td_sched->ts_cpu = cpu;
1069	/*
1070	 * If the lock matches just return the queue.
1071	 */
1072	if (td->td_lock == TDQ_LOCKPTR(tdq))
1073		return (tdq);
1074#ifdef notyet
1075	/*
1076	 * If the thread isn't running its lockptr is a
1077	 * turnstile or a sleepqueue.  We can just lock_set without
1078	 * blocking.
1079	 */
1080	if (TD_CAN_RUN(td)) {
1081		TDQ_LOCK(tdq);
1082		thread_lock_set(td, TDQ_LOCKPTR(tdq));
1083		return (tdq);
1084	}
1085#endif
1086	/*
1087	 * The hard case, migration, we need to block the thread first to
1088	 * prevent order reversals with other cpus locks.
1089	 */
1090	thread_lock_block(td);
1091	TDQ_LOCK(tdq);
1092	thread_lock_unblock(td, TDQ_LOCKPTR(tdq));
1093	return (tdq);
1094}
1095
1096SCHED_STAT_DEFINE(pickcpu_intrbind, "Soft interrupt binding");
1097SCHED_STAT_DEFINE(pickcpu_idle_affinity, "Picked idle cpu based on affinity");
1098SCHED_STAT_DEFINE(pickcpu_affinity, "Picked cpu based on affinity");
1099SCHED_STAT_DEFINE(pickcpu_lowest, "Selected lowest load");
1100SCHED_STAT_DEFINE(pickcpu_local, "Migrated to current cpu");
1101SCHED_STAT_DEFINE(pickcpu_migration, "Selection may have caused migration");
1102
1103static int
1104sched_pickcpu(struct thread *td, int flags)
1105{
1106	struct cpu_group *cg;
1107	struct td_sched *ts;
1108	struct tdq *tdq;
1109	cpumask_t mask;
1110	int self;
1111	int pri;
1112	int cpu;
1113
1114	self = PCPU_GET(cpuid);
1115	ts = td->td_sched;
1116	if (smp_started == 0)
1117		return (self);
1118	/*
1119	 * Don't migrate a running thread from sched_switch().
1120	 */
1121	if ((flags & SRQ_OURSELF) || !THREAD_CAN_MIGRATE(td))
1122		return (ts->ts_cpu);
1123	/*
1124	 * Prefer to run interrupt threads on the processors that generate
1125	 * the interrupt.
1126	 */
1127	if (td->td_priority <= PRI_MAX_ITHD && THREAD_CAN_SCHED(td, self) &&
1128	    curthread->td_intr_nesting_level && ts->ts_cpu != self) {
1129		SCHED_STAT_INC(pickcpu_intrbind);
1130		ts->ts_cpu = self;
1131	}
1132	/*
1133	 * If the thread can run on the last cpu and the affinity has not
1134	 * expired or it is idle run it there.
1135	 */
1136	pri = td->td_priority;
1137	tdq = TDQ_CPU(ts->ts_cpu);
1138	if (THREAD_CAN_SCHED(td, ts->ts_cpu)) {
1139		if (tdq->tdq_lowpri > PRI_MIN_IDLE) {
1140			SCHED_STAT_INC(pickcpu_idle_affinity);
1141			return (ts->ts_cpu);
1142		}
1143		if (SCHED_AFFINITY(ts, CG_SHARE_L2) && tdq->tdq_lowpri > pri) {
1144			SCHED_STAT_INC(pickcpu_affinity);
1145			return (ts->ts_cpu);
1146		}
1147	}
1148	/*
1149	 * Search for the highest level in the tree that still has affinity.
1150	 */
1151	cg = NULL;
1152	for (cg = tdq->tdq_cg; cg != NULL; cg = cg->cg_parent)
1153		if (SCHED_AFFINITY(ts, cg->cg_level))
1154			break;
1155	cpu = -1;
1156	mask = td->td_cpuset->cs_mask.__bits[0];
1157	if (cg)
1158		cpu = sched_lowest(cg, mask, pri);
1159	if (cpu == -1)
1160		cpu = sched_lowest(cpu_top, mask, -1);
1161	/*
1162	 * Compare the lowest loaded cpu to current cpu.
1163	 */
1164	if (THREAD_CAN_SCHED(td, self) && TDQ_CPU(self)->tdq_lowpri > pri &&
1165	    TDQ_CPU(cpu)->tdq_lowpri < PRI_MIN_IDLE) {
1166		SCHED_STAT_INC(pickcpu_local);
1167		cpu = self;
1168	} else
1169		SCHED_STAT_INC(pickcpu_lowest);
1170	if (cpu != ts->ts_cpu)
1171		SCHED_STAT_INC(pickcpu_migration);
1172	KASSERT(cpu != -1, ("sched_pickcpu: Failed to find a cpu."));
1173	return (cpu);
1174}
1175#endif
1176
1177/*
1178 * Pick the highest priority task we have and return it.
1179 */
1180static struct thread *
1181tdq_choose(struct tdq *tdq)
1182{
1183	struct thread *td;
1184
1185	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
1186	td = runq_choose(&tdq->tdq_realtime);
1187	if (td != NULL)
1188		return (td);
1189	td = runq_choose_from(&tdq->tdq_timeshare, tdq->tdq_ridx);
1190	if (td != NULL) {
1191		KASSERT(td->td_priority >= PRI_MIN_TIMESHARE,
1192		    ("tdq_choose: Invalid priority on timeshare queue %d",
1193		    td->td_priority));
1194		return (td);
1195	}
1196	td = runq_choose(&tdq->tdq_idle);
1197	if (td != NULL) {
1198		KASSERT(td->td_priority >= PRI_MIN_IDLE,
1199		    ("tdq_choose: Invalid priority on idle queue %d",
1200		    td->td_priority));
1201		return (td);
1202	}
1203
1204	return (NULL);
1205}
1206
1207/*
1208 * Initialize a thread queue.
1209 */
1210static void
1211tdq_setup(struct tdq *tdq)
1212{
1213
1214	if (bootverbose)
1215		printf("ULE: setup cpu %d\n", TDQ_ID(tdq));
1216	runq_init(&tdq->tdq_realtime);
1217	runq_init(&tdq->tdq_timeshare);
1218	runq_init(&tdq->tdq_idle);
1219	snprintf(tdq->tdq_name, sizeof(tdq->tdq_name),
1220	    "sched lock %d", (int)TDQ_ID(tdq));
1221	mtx_init(&tdq->tdq_lock, tdq->tdq_name, "sched lock",
1222	    MTX_SPIN | MTX_RECURSE);
1223}
1224
1225#ifdef SMP
1226static void
1227sched_setup_smp(void)
1228{
1229	struct tdq *tdq;
1230	int i;
1231
1232	cpu_top = smp_topo();
1233	for (i = 0; i < MAXCPU; i++) {
1234		if (CPU_ABSENT(i))
1235			continue;
1236		tdq = TDQ_CPU(i);
1237		tdq_setup(tdq);
1238		tdq->tdq_cg = smp_topo_find(cpu_top, i);
1239		if (tdq->tdq_cg == NULL)
1240			panic("Can't find cpu group for %d\n", i);
1241	}
1242	balance_tdq = TDQ_SELF();
1243	sched_balance();
1244}
1245#endif
1246
1247/*
1248 * Setup the thread queues and initialize the topology based on MD
1249 * information.
1250 */
1251static void
1252sched_setup(void *dummy)
1253{
1254	struct tdq *tdq;
1255
1256	tdq = TDQ_SELF();
1257#ifdef SMP
1258	sched_setup_smp();
1259#else
1260	tdq_setup(tdq);
1261#endif
1262	/*
1263	 * To avoid divide-by-zero, we set realstathz a dummy value
1264	 * in case which sched_clock() called before sched_initticks().
1265	 */
1266	realstathz = hz;
1267	sched_slice = (realstathz/10);	/* ~100ms */
1268	tickincr = 1 << SCHED_TICK_SHIFT;
1269
1270	/* Add thread0's load since it's running. */
1271	TDQ_LOCK(tdq);
1272	thread0.td_lock = TDQ_LOCKPTR(TDQ_SELF());
1273	tdq_load_add(tdq, &thread0);
1274	tdq->tdq_lowpri = thread0.td_priority;
1275	TDQ_UNLOCK(tdq);
1276}
1277
1278/*
1279 * This routine determines the tickincr after stathz and hz are setup.
1280 */
1281/* ARGSUSED */
1282static void
1283sched_initticks(void *dummy)
1284{
1285	int incr;
1286
1287	realstathz = stathz ? stathz : hz;
1288	sched_slice = (realstathz/10);	/* ~100ms */
1289
1290	/*
1291	 * tickincr is shifted out by 10 to avoid rounding errors due to
1292	 * hz not being evenly divisible by stathz on all platforms.
1293	 */
1294	incr = (hz << SCHED_TICK_SHIFT) / realstathz;
1295	/*
1296	 * This does not work for values of stathz that are more than
1297	 * 1 << SCHED_TICK_SHIFT * hz.  In practice this does not happen.
1298	 */
1299	if (incr == 0)
1300		incr = 1;
1301	tickincr = incr;
1302#ifdef SMP
1303	/*
1304	 * Set the default balance interval now that we know
1305	 * what realstathz is.
1306	 */
1307	balance_interval = realstathz;
1308	/*
1309	 * Set steal thresh to log2(mp_ncpu) but no greater than 4.  This
1310	 * prevents excess thrashing on large machines and excess idle on
1311	 * smaller machines.
1312	 */
1313	steal_thresh = min(ffs(mp_ncpus) - 1, 3);
1314	affinity = SCHED_AFFINITY_DEFAULT;
1315#endif
1316}
1317
1318
1319/*
1320 * This is the core of the interactivity algorithm.  Determines a score based
1321 * on past behavior.  It is the ratio of sleep time to run time scaled to
1322 * a [0, 100] integer.  This is the voluntary sleep time of a process, which
1323 * differs from the cpu usage because it does not account for time spent
1324 * waiting on a run-queue.  Would be prettier if we had floating point.
1325 */
1326static int
1327sched_interact_score(struct thread *td)
1328{
1329	struct td_sched *ts;
1330	int div;
1331
1332	ts = td->td_sched;
1333	/*
1334	 * The score is only needed if this is likely to be an interactive
1335	 * task.  Don't go through the expense of computing it if there's
1336	 * no chance.
1337	 */
1338	if (sched_interact <= SCHED_INTERACT_HALF &&
1339		ts->ts_runtime >= ts->ts_slptime)
1340			return (SCHED_INTERACT_HALF);
1341
1342	if (ts->ts_runtime > ts->ts_slptime) {
1343		div = max(1, ts->ts_runtime / SCHED_INTERACT_HALF);
1344		return (SCHED_INTERACT_HALF +
1345		    (SCHED_INTERACT_HALF - (ts->ts_slptime / div)));
1346	}
1347	if (ts->ts_slptime > ts->ts_runtime) {
1348		div = max(1, ts->ts_slptime / SCHED_INTERACT_HALF);
1349		return (ts->ts_runtime / div);
1350	}
1351	/* runtime == slptime */
1352	if (ts->ts_runtime)
1353		return (SCHED_INTERACT_HALF);
1354
1355	/*
1356	 * This can happen if slptime and runtime are 0.
1357	 */
1358	return (0);
1359
1360}
1361
1362/*
1363 * Scale the scheduling priority according to the "interactivity" of this
1364 * process.
1365 */
1366static void
1367sched_priority(struct thread *td)
1368{
1369	int score;
1370	int pri;
1371
1372	if (td->td_pri_class != PRI_TIMESHARE)
1373		return;
1374	/*
1375	 * If the score is interactive we place the thread in the realtime
1376	 * queue with a priority that is less than kernel and interrupt
1377	 * priorities.  These threads are not subject to nice restrictions.
1378	 *
1379	 * Scores greater than this are placed on the normal timeshare queue
1380	 * where the priority is partially decided by the most recent cpu
1381	 * utilization and the rest is decided by nice value.
1382	 *
1383	 * The nice value of the process has a linear effect on the calculated
1384	 * score.  Negative nice values make it easier for a thread to be
1385	 * considered interactive.
1386	 */
1387	score = imax(0, sched_interact_score(td) - td->td_proc->p_nice);
1388	if (score < sched_interact) {
1389		pri = PRI_MIN_REALTIME;
1390		pri += ((PRI_MAX_REALTIME - PRI_MIN_REALTIME) / sched_interact)
1391		    * score;
1392		KASSERT(pri >= PRI_MIN_REALTIME && pri <= PRI_MAX_REALTIME,
1393		    ("sched_priority: invalid interactive priority %d score %d",
1394		    pri, score));
1395	} else {
1396		pri = SCHED_PRI_MIN;
1397		if (td->td_sched->ts_ticks)
1398			pri += SCHED_PRI_TICKS(td->td_sched);
1399		pri += SCHED_PRI_NICE(td->td_proc->p_nice);
1400		KASSERT(pri >= PRI_MIN_TIMESHARE && pri <= PRI_MAX_TIMESHARE,
1401		    ("sched_priority: invalid priority %d: nice %d, "
1402		    "ticks %d ftick %d ltick %d tick pri %d",
1403		    pri, td->td_proc->p_nice, td->td_sched->ts_ticks,
1404		    td->td_sched->ts_ftick, td->td_sched->ts_ltick,
1405		    SCHED_PRI_TICKS(td->td_sched)));
1406	}
1407	sched_user_prio(td, pri);
1408
1409	return;
1410}
1411
1412/*
1413 * This routine enforces a maximum limit on the amount of scheduling history
1414 * kept.  It is called after either the slptime or runtime is adjusted.  This
1415 * function is ugly due to integer math.
1416 */
1417static void
1418sched_interact_update(struct thread *td)
1419{
1420	struct td_sched *ts;
1421	u_int sum;
1422
1423	ts = td->td_sched;
1424	sum = ts->ts_runtime + ts->ts_slptime;
1425	if (sum < SCHED_SLP_RUN_MAX)
1426		return;
1427	/*
1428	 * This only happens from two places:
1429	 * 1) We have added an unusual amount of run time from fork_exit.
1430	 * 2) We have added an unusual amount of sleep time from sched_sleep().
1431	 */
1432	if (sum > SCHED_SLP_RUN_MAX * 2) {
1433		if (ts->ts_runtime > ts->ts_slptime) {
1434			ts->ts_runtime = SCHED_SLP_RUN_MAX;
1435			ts->ts_slptime = 1;
1436		} else {
1437			ts->ts_slptime = SCHED_SLP_RUN_MAX;
1438			ts->ts_runtime = 1;
1439		}
1440		return;
1441	}
1442	/*
1443	 * If we have exceeded by more than 1/5th then the algorithm below
1444	 * will not bring us back into range.  Dividing by two here forces
1445	 * us into the range of [4/5 * SCHED_INTERACT_MAX, SCHED_INTERACT_MAX]
1446	 */
1447	if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) {
1448		ts->ts_runtime /= 2;
1449		ts->ts_slptime /= 2;
1450		return;
1451	}
1452	ts->ts_runtime = (ts->ts_runtime / 5) * 4;
1453	ts->ts_slptime = (ts->ts_slptime / 5) * 4;
1454}
1455
1456/*
1457 * Scale back the interactivity history when a child thread is created.  The
1458 * history is inherited from the parent but the thread may behave totally
1459 * differently.  For example, a shell spawning a compiler process.  We want
1460 * to learn that the compiler is behaving badly very quickly.
1461 */
1462static void
1463sched_interact_fork(struct thread *td)
1464{
1465	int ratio;
1466	int sum;
1467
1468	sum = td->td_sched->ts_runtime + td->td_sched->ts_slptime;
1469	if (sum > SCHED_SLP_RUN_FORK) {
1470		ratio = sum / SCHED_SLP_RUN_FORK;
1471		td->td_sched->ts_runtime /= ratio;
1472		td->td_sched->ts_slptime /= ratio;
1473	}
1474}
1475
1476/*
1477 * Called from proc0_init() to setup the scheduler fields.
1478 */
1479void
1480schedinit(void)
1481{
1482
1483	/*
1484	 * Set up the scheduler specific parts of proc0.
1485	 */
1486	proc0.p_sched = NULL; /* XXX */
1487	thread0.td_sched = &td_sched0;
1488	td_sched0.ts_ltick = ticks;
1489	td_sched0.ts_ftick = ticks;
1490	td_sched0.ts_slice = sched_slice;
1491}
1492
1493/*
1494 * This is only somewhat accurate since given many processes of the same
1495 * priority they will switch when their slices run out, which will be
1496 * at most sched_slice stathz ticks.
1497 */
1498int
1499sched_rr_interval(void)
1500{
1501
1502	/* Convert sched_slice to hz */
1503	return (hz/(realstathz/sched_slice));
1504}
1505
1506/*
1507 * Update the percent cpu tracking information when it is requested or
1508 * the total history exceeds the maximum.  We keep a sliding history of
1509 * tick counts that slowly decays.  This is less precise than the 4BSD
1510 * mechanism since it happens with less regular and frequent events.
1511 */
1512static void
1513sched_pctcpu_update(struct td_sched *ts)
1514{
1515
1516	if (ts->ts_ticks == 0)
1517		return;
1518	if (ticks - (hz / 10) < ts->ts_ltick &&
1519	    SCHED_TICK_TOTAL(ts) < SCHED_TICK_MAX)
1520		return;
1521	/*
1522	 * Adjust counters and watermark for pctcpu calc.
1523	 */
1524	if (ts->ts_ltick > ticks - SCHED_TICK_TARG)
1525		ts->ts_ticks = (ts->ts_ticks / (ticks - ts->ts_ftick)) *
1526			    SCHED_TICK_TARG;
1527	else
1528		ts->ts_ticks = 0;
1529	ts->ts_ltick = ticks;
1530	ts->ts_ftick = ts->ts_ltick - SCHED_TICK_TARG;
1531}
1532
1533/*
1534 * Adjust the priority of a thread.  Move it to the appropriate run-queue
1535 * if necessary.  This is the back-end for several priority related
1536 * functions.
1537 */
1538static void
1539sched_thread_priority(struct thread *td, u_char prio)
1540{
1541	struct td_sched *ts;
1542	struct tdq *tdq;
1543	int oldpri;
1544
1545	CTR6(KTR_SCHED, "sched_prio: %p(%s) prio %d newprio %d by %p(%s)",
1546	    td, td->td_name, td->td_priority, prio, curthread,
1547	    curthread->td_name);
1548	ts = td->td_sched;
1549	THREAD_LOCK_ASSERT(td, MA_OWNED);
1550	if (td->td_priority == prio)
1551		return;
1552	/*
1553	 * If the priority has been elevated due to priority
1554	 * propagation, we may have to move ourselves to a new
1555	 * queue.  This could be optimized to not re-add in some
1556	 * cases.
1557	 */
1558	if (TD_ON_RUNQ(td) && prio < td->td_priority) {
1559		sched_rem(td);
1560		td->td_priority = prio;
1561		sched_add(td, SRQ_BORROWING);
1562		return;
1563	}
1564	/*
1565	 * If the thread is currently running we may have to adjust the lowpri
1566	 * information so other cpus are aware of our current priority.
1567	 */
1568	if (TD_IS_RUNNING(td)) {
1569		tdq = TDQ_CPU(ts->ts_cpu);
1570		oldpri = td->td_priority;
1571		td->td_priority = prio;
1572		if (prio < tdq->tdq_lowpri)
1573			tdq->tdq_lowpri = prio;
1574		else if (tdq->tdq_lowpri == oldpri)
1575			tdq_setlowpri(tdq, td);
1576		return;
1577	}
1578	td->td_priority = prio;
1579}
1580
1581/*
1582 * Update a thread's priority when it is lent another thread's
1583 * priority.
1584 */
1585void
1586sched_lend_prio(struct thread *td, u_char prio)
1587{
1588
1589	td->td_flags |= TDF_BORROWING;
1590	sched_thread_priority(td, prio);
1591}
1592
1593/*
1594 * Restore a thread's priority when priority propagation is
1595 * over.  The prio argument is the minimum priority the thread
1596 * needs to have to satisfy other possible priority lending
1597 * requests.  If the thread's regular priority is less
1598 * important than prio, the thread will keep a priority boost
1599 * of prio.
1600 */
1601void
1602sched_unlend_prio(struct thread *td, u_char prio)
1603{
1604	u_char base_pri;
1605
1606	if (td->td_base_pri >= PRI_MIN_TIMESHARE &&
1607	    td->td_base_pri <= PRI_MAX_TIMESHARE)
1608		base_pri = td->td_user_pri;
1609	else
1610		base_pri = td->td_base_pri;
1611	if (prio >= base_pri) {
1612		td->td_flags &= ~TDF_BORROWING;
1613		sched_thread_priority(td, base_pri);
1614	} else
1615		sched_lend_prio(td, prio);
1616}
1617
1618/*
1619 * Standard entry for setting the priority to an absolute value.
1620 */
1621void
1622sched_prio(struct thread *td, u_char prio)
1623{
1624	u_char oldprio;
1625
1626	/* First, update the base priority. */
1627	td->td_base_pri = prio;
1628
1629	/*
1630	 * If the thread is borrowing another thread's priority, don't
1631	 * ever lower the priority.
1632	 */
1633	if (td->td_flags & TDF_BORROWING && td->td_priority < prio)
1634		return;
1635
1636	/* Change the real priority. */
1637	oldprio = td->td_priority;
1638	sched_thread_priority(td, prio);
1639
1640	/*
1641	 * If the thread is on a turnstile, then let the turnstile update
1642	 * its state.
1643	 */
1644	if (TD_ON_LOCK(td) && oldprio != prio)
1645		turnstile_adjust(td, oldprio);
1646}
1647
1648/*
1649 * Set the base user priority, does not effect current running priority.
1650 */
1651void
1652sched_user_prio(struct thread *td, u_char prio)
1653{
1654	u_char oldprio;
1655
1656	td->td_base_user_pri = prio;
1657	if (td->td_flags & TDF_UBORROWING && td->td_user_pri <= prio)
1658                return;
1659	oldprio = td->td_user_pri;
1660	td->td_user_pri = prio;
1661}
1662
1663void
1664sched_lend_user_prio(struct thread *td, u_char prio)
1665{
1666	u_char oldprio;
1667
1668	THREAD_LOCK_ASSERT(td, MA_OWNED);
1669	td->td_flags |= TDF_UBORROWING;
1670	oldprio = td->td_user_pri;
1671	td->td_user_pri = prio;
1672}
1673
1674void
1675sched_unlend_user_prio(struct thread *td, u_char prio)
1676{
1677	u_char base_pri;
1678
1679	THREAD_LOCK_ASSERT(td, MA_OWNED);
1680	base_pri = td->td_base_user_pri;
1681	if (prio >= base_pri) {
1682		td->td_flags &= ~TDF_UBORROWING;
1683		sched_user_prio(td, base_pri);
1684	} else {
1685		sched_lend_user_prio(td, prio);
1686	}
1687}
1688
1689/*
1690 * Block a thread for switching.  Similar to thread_block() but does not
1691 * bump the spin count.
1692 */
1693static inline struct mtx *
1694thread_block_switch(struct thread *td)
1695{
1696	struct mtx *lock;
1697
1698	THREAD_LOCK_ASSERT(td, MA_OWNED);
1699	lock = td->td_lock;
1700	td->td_lock = &blocked_lock;
1701	mtx_unlock_spin(lock);
1702
1703	return (lock);
1704}
1705
1706/*
1707 * Handle migration from sched_switch().  This happens only for
1708 * cpu binding.
1709 */
1710static struct mtx *
1711sched_switch_migrate(struct tdq *tdq, struct thread *td, int flags)
1712{
1713	struct tdq *tdn;
1714
1715	tdn = TDQ_CPU(td->td_sched->ts_cpu);
1716#ifdef SMP
1717	tdq_load_rem(tdq, td);
1718	/*
1719	 * Do the lock dance required to avoid LOR.  We grab an extra
1720	 * spinlock nesting to prevent preemption while we're
1721	 * not holding either run-queue lock.
1722	 */
1723	spinlock_enter();
1724	thread_block_switch(td);	/* This releases the lock on tdq. */
1725	TDQ_LOCK(tdn);
1726	tdq_add(tdn, td, flags);
1727	tdq_notify(tdn, td);
1728	/*
1729	 * After we unlock tdn the new cpu still can't switch into this
1730	 * thread until we've unblocked it in cpu_switch().  The lock
1731	 * pointers may match in the case of HTT cores.  Don't unlock here
1732	 * or we can deadlock when the other CPU runs the IPI handler.
1733	 */
1734	if (TDQ_LOCKPTR(tdn) != TDQ_LOCKPTR(tdq)) {
1735		TDQ_UNLOCK(tdn);
1736		TDQ_LOCK(tdq);
1737	}
1738	spinlock_exit();
1739#endif
1740	return (TDQ_LOCKPTR(tdn));
1741}
1742
1743/*
1744 * Release a thread that was blocked with thread_block_switch().
1745 */
1746static inline void
1747thread_unblock_switch(struct thread *td, struct mtx *mtx)
1748{
1749	atomic_store_rel_ptr((volatile uintptr_t *)&td->td_lock,
1750	    (uintptr_t)mtx);
1751}
1752
1753/*
1754 * Switch threads.  This function has to handle threads coming in while
1755 * blocked for some reason, running, or idle.  It also must deal with
1756 * migrating a thread from one queue to another as running threads may
1757 * be assigned elsewhere via binding.
1758 */
1759void
1760sched_switch(struct thread *td, struct thread *newtd, int flags)
1761{
1762	struct tdq *tdq;
1763	struct td_sched *ts;
1764	struct mtx *mtx;
1765	int srqflag;
1766	int cpuid;
1767
1768	THREAD_LOCK_ASSERT(td, MA_OWNED);
1769	KASSERT(newtd == NULL, ("sched_switch: Unsupported newtd argument"));
1770
1771	cpuid = PCPU_GET(cpuid);
1772	tdq = TDQ_CPU(cpuid);
1773	ts = td->td_sched;
1774	mtx = td->td_lock;
1775	ts->ts_rltick = ticks;
1776	td->td_lastcpu = td->td_oncpu;
1777	td->td_oncpu = NOCPU;
1778	td->td_flags &= ~TDF_NEEDRESCHED;
1779	td->td_owepreempt = 0;
1780	tdq->tdq_switchcnt++;
1781	/*
1782	 * The lock pointer in an idle thread should never change.  Reset it
1783	 * to CAN_RUN as well.
1784	 */
1785	if (TD_IS_IDLETHREAD(td)) {
1786		MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
1787		TD_SET_CAN_RUN(td);
1788	} else if (TD_IS_RUNNING(td)) {
1789		MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
1790		srqflag = (flags & SW_PREEMPT) ?
1791		    SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
1792		    SRQ_OURSELF|SRQ_YIELDING;
1793		if (ts->ts_cpu == cpuid)
1794			tdq_runq_add(tdq, td, srqflag);
1795		else
1796			mtx = sched_switch_migrate(tdq, td, srqflag);
1797	} else {
1798		/* This thread must be going to sleep. */
1799		TDQ_LOCK(tdq);
1800		mtx = thread_block_switch(td);
1801		tdq_load_rem(tdq, td);
1802	}
1803	/*
1804	 * We enter here with the thread blocked and assigned to the
1805	 * appropriate cpu run-queue or sleep-queue and with the current
1806	 * thread-queue locked.
1807	 */
1808	TDQ_LOCK_ASSERT(tdq, MA_OWNED | MA_NOTRECURSED);
1809	newtd = choosethread();
1810	/*
1811	 * Call the MD code to switch contexts if necessary.
1812	 */
1813	if (td != newtd) {
1814#ifdef	HWPMC_HOOKS
1815		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
1816			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
1817#endif
1818		lock_profile_release_lock(&TDQ_LOCKPTR(tdq)->lock_object);
1819		TDQ_LOCKPTR(tdq)->mtx_lock = (uintptr_t)newtd;
1820		cpu_switch(td, newtd, mtx);
1821		/*
1822		 * We may return from cpu_switch on a different cpu.  However,
1823		 * we always return with td_lock pointing to the current cpu's
1824		 * run queue lock.
1825		 */
1826		cpuid = PCPU_GET(cpuid);
1827		tdq = TDQ_CPU(cpuid);
1828		lock_profile_obtain_lock_success(
1829		    &TDQ_LOCKPTR(tdq)->lock_object, 0, 0, __FILE__, __LINE__);
1830#ifdef	HWPMC_HOOKS
1831		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
1832			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
1833#endif
1834	} else
1835		thread_unblock_switch(td, mtx);
1836	/*
1837	 * Assert that all went well and return.
1838	 */
1839	TDQ_LOCK_ASSERT(tdq, MA_OWNED|MA_NOTRECURSED);
1840	MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
1841	td->td_oncpu = cpuid;
1842}
1843
1844/*
1845 * Adjust thread priorities as a result of a nice request.
1846 */
1847void
1848sched_nice(struct proc *p, int nice)
1849{
1850	struct thread *td;
1851
1852	PROC_LOCK_ASSERT(p, MA_OWNED);
1853
1854	p->p_nice = nice;
1855	FOREACH_THREAD_IN_PROC(p, td) {
1856		thread_lock(td);
1857		sched_priority(td);
1858		sched_prio(td, td->td_base_user_pri);
1859		thread_unlock(td);
1860	}
1861}
1862
1863/*
1864 * Record the sleep time for the interactivity scorer.
1865 */
1866void
1867sched_sleep(struct thread *td, int prio)
1868{
1869
1870	THREAD_LOCK_ASSERT(td, MA_OWNED);
1871
1872	td->td_slptick = ticks;
1873	if (TD_IS_SUSPENDED(td) || prio <= PSOCK)
1874		td->td_flags |= TDF_CANSWAP;
1875	if (static_boost == 1 && prio)
1876		sched_prio(td, prio);
1877	else if (static_boost && td->td_priority > static_boost)
1878		sched_prio(td, static_boost);
1879}
1880
1881/*
1882 * Schedule a thread to resume execution and record how long it voluntarily
1883 * slept.  We also update the pctcpu, interactivity, and priority.
1884 */
1885void
1886sched_wakeup(struct thread *td)
1887{
1888	struct td_sched *ts;
1889	int slptick;
1890
1891	THREAD_LOCK_ASSERT(td, MA_OWNED);
1892	ts = td->td_sched;
1893	td->td_flags &= ~TDF_CANSWAP;
1894	/*
1895	 * If we slept for more than a tick update our interactivity and
1896	 * priority.
1897	 */
1898	slptick = td->td_slptick;
1899	td->td_slptick = 0;
1900	if (slptick && slptick != ticks) {
1901		u_int hzticks;
1902
1903		hzticks = (ticks - slptick) << SCHED_TICK_SHIFT;
1904		ts->ts_slptime += hzticks;
1905		sched_interact_update(td);
1906		sched_pctcpu_update(ts);
1907	}
1908	/* Reset the slice value after we sleep. */
1909	ts->ts_slice = sched_slice;
1910	sched_add(td, SRQ_BORING);
1911}
1912
1913/*
1914 * Penalize the parent for creating a new child and initialize the child's
1915 * priority.
1916 */
1917void
1918sched_fork(struct thread *td, struct thread *child)
1919{
1920	THREAD_LOCK_ASSERT(td, MA_OWNED);
1921	sched_fork_thread(td, child);
1922	/*
1923	 * Penalize the parent and child for forking.
1924	 */
1925	sched_interact_fork(child);
1926	sched_priority(child);
1927	td->td_sched->ts_runtime += tickincr;
1928	sched_interact_update(td);
1929	sched_priority(td);
1930}
1931
1932/*
1933 * Fork a new thread, may be within the same process.
1934 */
1935void
1936sched_fork_thread(struct thread *td, struct thread *child)
1937{
1938	struct td_sched *ts;
1939	struct td_sched *ts2;
1940
1941	THREAD_LOCK_ASSERT(td, MA_OWNED);
1942	/*
1943	 * Initialize child.
1944	 */
1945	ts = td->td_sched;
1946	ts2 = child->td_sched;
1947	child->td_lock = TDQ_LOCKPTR(TDQ_SELF());
1948	child->td_cpuset = cpuset_ref(td->td_cpuset);
1949	ts2->ts_cpu = ts->ts_cpu;
1950	ts2->ts_flags = 0;
1951	/*
1952	 * Grab our parents cpu estimation information and priority.
1953	 */
1954	ts2->ts_ticks = ts->ts_ticks;
1955	ts2->ts_ltick = ts->ts_ltick;
1956	ts2->ts_ftick = ts->ts_ftick;
1957	child->td_user_pri = td->td_user_pri;
1958	child->td_base_user_pri = td->td_base_user_pri;
1959	/*
1960	 * And update interactivity score.
1961	 */
1962	ts2->ts_slptime = ts->ts_slptime;
1963	ts2->ts_runtime = ts->ts_runtime;
1964	ts2->ts_slice = 1;	/* Attempt to quickly learn interactivity. */
1965}
1966
1967/*
1968 * Adjust the priority class of a thread.
1969 */
1970void
1971sched_class(struct thread *td, int class)
1972{
1973
1974	THREAD_LOCK_ASSERT(td, MA_OWNED);
1975	if (td->td_pri_class == class)
1976		return;
1977	td->td_pri_class = class;
1978}
1979
1980/*
1981 * Return some of the child's priority and interactivity to the parent.
1982 */
1983void
1984sched_exit(struct proc *p, struct thread *child)
1985{
1986	struct thread *td;
1987
1988	CTR3(KTR_SCHED, "sched_exit: %p(%s) prio %d",
1989	    child, child->td_name, child->td_priority);
1990
1991	PROC_LOCK_ASSERT(p, MA_OWNED);
1992	td = FIRST_THREAD_IN_PROC(p);
1993	sched_exit_thread(td, child);
1994}
1995
1996/*
1997 * Penalize another thread for the time spent on this one.  This helps to
1998 * worsen the priority and interactivity of processes which schedule batch
1999 * jobs such as make.  This has little effect on the make process itself but
2000 * causes new processes spawned by it to receive worse scores immediately.
2001 */
2002void
2003sched_exit_thread(struct thread *td, struct thread *child)
2004{
2005
2006	CTR3(KTR_SCHED, "sched_exit_thread: %p(%s) prio %d",
2007	    child, child->td_name, child->td_priority);
2008
2009	/*
2010	 * Give the child's runtime to the parent without returning the
2011	 * sleep time as a penalty to the parent.  This causes shells that
2012	 * launch expensive things to mark their children as expensive.
2013	 */
2014	thread_lock(td);
2015	td->td_sched->ts_runtime += child->td_sched->ts_runtime;
2016	sched_interact_update(td);
2017	sched_priority(td);
2018	thread_unlock(td);
2019}
2020
2021void
2022sched_preempt(struct thread *td)
2023{
2024	struct tdq *tdq;
2025
2026	thread_lock(td);
2027	tdq = TDQ_SELF();
2028	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
2029	tdq->tdq_ipipending = 0;
2030	if (td->td_priority > tdq->tdq_lowpri) {
2031		int flags;
2032
2033		flags = SW_INVOL | SW_PREEMPT;
2034		if (td->td_critnest > 1)
2035			td->td_owepreempt = 1;
2036		else if (TD_IS_IDLETHREAD(td))
2037			mi_switch(flags | SWT_REMOTEWAKEIDLE, NULL);
2038		else
2039			mi_switch(flags | SWT_REMOTEPREEMPT, NULL);
2040	}
2041	thread_unlock(td);
2042}
2043
2044/*
2045 * Fix priorities on return to user-space.  Priorities may be elevated due
2046 * to static priorities in msleep() or similar.
2047 */
2048void
2049sched_userret(struct thread *td)
2050{
2051	/*
2052	 * XXX we cheat slightly on the locking here to avoid locking in
2053	 * the usual case.  Setting td_priority here is essentially an
2054	 * incomplete workaround for not setting it properly elsewhere.
2055	 * Now that some interrupt handlers are threads, not setting it
2056	 * properly elsewhere can clobber it in the window between setting
2057	 * it here and returning to user mode, so don't waste time setting
2058	 * it perfectly here.
2059	 */
2060	KASSERT((td->td_flags & TDF_BORROWING) == 0,
2061	    ("thread with borrowed priority returning to userland"));
2062	if (td->td_priority != td->td_user_pri) {
2063		thread_lock(td);
2064		td->td_priority = td->td_user_pri;
2065		td->td_base_pri = td->td_user_pri;
2066		tdq_setlowpri(TDQ_SELF(), td);
2067		thread_unlock(td);
2068        }
2069}
2070
2071/*
2072 * Handle a stathz tick.  This is really only relevant for timeshare
2073 * threads.
2074 */
2075void
2076sched_clock(struct thread *td)
2077{
2078	struct tdq *tdq;
2079	struct td_sched *ts;
2080
2081	THREAD_LOCK_ASSERT(td, MA_OWNED);
2082	tdq = TDQ_SELF();
2083#ifdef SMP
2084	/*
2085	 * We run the long term load balancer infrequently on the first cpu.
2086	 */
2087	if (balance_tdq == tdq) {
2088		if (balance_ticks && --balance_ticks == 0)
2089			sched_balance();
2090	}
2091#endif
2092	/*
2093	 * Save the old switch count so we have a record of the last ticks
2094	 * activity.   Initialize the new switch count based on our load.
2095	 * If there is some activity seed it to reflect that.
2096	 */
2097	tdq->tdq_oldswitchcnt = tdq->tdq_switchcnt;
2098	if (tdq->tdq_load)
2099		tdq->tdq_switchcnt = 2;
2100	else
2101		tdq->tdq_switchcnt = 0;
2102	/*
2103	 * Advance the insert index once for each tick to ensure that all
2104	 * threads get a chance to run.
2105	 */
2106	if (tdq->tdq_idx == tdq->tdq_ridx) {
2107		tdq->tdq_idx = (tdq->tdq_idx + 1) % RQ_NQS;
2108		if (TAILQ_EMPTY(&tdq->tdq_timeshare.rq_queues[tdq->tdq_ridx]))
2109			tdq->tdq_ridx = tdq->tdq_idx;
2110	}
2111	ts = td->td_sched;
2112	if (td->td_pri_class & PRI_FIFO_BIT)
2113		return;
2114	if (td->td_pri_class == PRI_TIMESHARE) {
2115		/*
2116		 * We used a tick; charge it to the thread so
2117		 * that we can compute our interactivity.
2118		 */
2119		td->td_sched->ts_runtime += tickincr;
2120		sched_interact_update(td);
2121		sched_priority(td);
2122	}
2123	/*
2124	 * We used up one time slice.
2125	 */
2126	if (--ts->ts_slice > 0)
2127		return;
2128	/*
2129	 * We're out of time, force a requeue at userret().
2130	 */
2131	ts->ts_slice = sched_slice;
2132	td->td_flags |= TDF_NEEDRESCHED;
2133}
2134
2135/*
2136 * Called once per hz tick.  Used for cpu utilization information.  This
2137 * is easier than trying to scale based on stathz.
2138 */
2139void
2140sched_tick(void)
2141{
2142	struct td_sched *ts;
2143
2144	ts = curthread->td_sched;
2145	/* Adjust ticks for pctcpu */
2146	ts->ts_ticks += 1 << SCHED_TICK_SHIFT;
2147	ts->ts_ltick = ticks;
2148	/*
2149	 * Update if we've exceeded our desired tick threshhold by over one
2150	 * second.
2151	 */
2152	if (ts->ts_ftick + SCHED_TICK_MAX < ts->ts_ltick)
2153		sched_pctcpu_update(ts);
2154}
2155
2156/*
2157 * Return whether the current CPU has runnable tasks.  Used for in-kernel
2158 * cooperative idle threads.
2159 */
2160int
2161sched_runnable(void)
2162{
2163	struct tdq *tdq;
2164	int load;
2165
2166	load = 1;
2167
2168	tdq = TDQ_SELF();
2169	if ((curthread->td_flags & TDF_IDLETD) != 0) {
2170		if (tdq->tdq_load > 0)
2171			goto out;
2172	} else
2173		if (tdq->tdq_load - 1 > 0)
2174			goto out;
2175	load = 0;
2176out:
2177	return (load);
2178}
2179
2180/*
2181 * Choose the highest priority thread to run.  The thread is removed from
2182 * the run-queue while running however the load remains.  For SMP we set
2183 * the tdq in the global idle bitmask if it idles here.
2184 */
2185struct thread *
2186sched_choose(void)
2187{
2188	struct thread *td;
2189	struct tdq *tdq;
2190
2191	tdq = TDQ_SELF();
2192	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
2193	td = tdq_choose(tdq);
2194	if (td) {
2195		td->td_sched->ts_ltick = ticks;
2196		tdq_runq_rem(tdq, td);
2197		tdq->tdq_lowpri = td->td_priority;
2198		return (td);
2199	}
2200	tdq->tdq_lowpri = PRI_MAX_IDLE;
2201	return (PCPU_GET(idlethread));
2202}
2203
2204/*
2205 * Set owepreempt if necessary.  Preemption never happens directly in ULE,
2206 * we always request it once we exit a critical section.
2207 */
2208static inline void
2209sched_setpreempt(struct thread *td)
2210{
2211	struct thread *ctd;
2212	int cpri;
2213	int pri;
2214
2215	THREAD_LOCK_ASSERT(curthread, MA_OWNED);
2216
2217	ctd = curthread;
2218	pri = td->td_priority;
2219	cpri = ctd->td_priority;
2220	if (pri < cpri)
2221		ctd->td_flags |= TDF_NEEDRESCHED;
2222	if (panicstr != NULL || pri >= cpri || cold || TD_IS_INHIBITED(ctd))
2223		return;
2224	if (!sched_shouldpreempt(pri, cpri, 0))
2225		return;
2226	ctd->td_owepreempt = 1;
2227}
2228
2229/*
2230 * Add a thread to a thread queue.  Select the appropriate runq and add the
2231 * thread to it.  This is the internal function called when the tdq is
2232 * predetermined.
2233 */
2234void
2235tdq_add(struct tdq *tdq, struct thread *td, int flags)
2236{
2237
2238	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
2239	KASSERT((td->td_inhibitors == 0),
2240	    ("sched_add: trying to run inhibited thread"));
2241	KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
2242	    ("sched_add: bad thread state"));
2243	KASSERT(td->td_flags & TDF_INMEM,
2244	    ("sched_add: thread swapped out"));
2245
2246	if (td->td_priority < tdq->tdq_lowpri)
2247		tdq->tdq_lowpri = td->td_priority;
2248	tdq_runq_add(tdq, td, flags);
2249	tdq_load_add(tdq, td);
2250}
2251
2252/*
2253 * Select the target thread queue and add a thread to it.  Request
2254 * preemption or IPI a remote processor if required.
2255 */
2256void
2257sched_add(struct thread *td, int flags)
2258{
2259	struct tdq *tdq;
2260#ifdef SMP
2261	int cpu;
2262#endif
2263	CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)",
2264	    td, td->td_name, td->td_priority, curthread,
2265	    curthread->td_name);
2266	THREAD_LOCK_ASSERT(td, MA_OWNED);
2267	/*
2268	 * Recalculate the priority before we select the target cpu or
2269	 * run-queue.
2270	 */
2271	if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE)
2272		sched_priority(td);
2273#ifdef SMP
2274	/*
2275	 * Pick the destination cpu and if it isn't ours transfer to the
2276	 * target cpu.
2277	 */
2278	cpu = sched_pickcpu(td, flags);
2279	tdq = sched_setcpu(td, cpu, flags);
2280	tdq_add(tdq, td, flags);
2281	if (cpu != PCPU_GET(cpuid)) {
2282		tdq_notify(tdq, td);
2283		return;
2284	}
2285#else
2286	tdq = TDQ_SELF();
2287	TDQ_LOCK(tdq);
2288	/*
2289	 * Now that the thread is moving to the run-queue, set the lock
2290	 * to the scheduler's lock.
2291	 */
2292	thread_lock_set(td, TDQ_LOCKPTR(tdq));
2293	tdq_add(tdq, td, flags);
2294#endif
2295	if (!(flags & SRQ_YIELDING))
2296		sched_setpreempt(td);
2297}
2298
2299/*
2300 * Remove a thread from a run-queue without running it.  This is used
2301 * when we're stealing a thread from a remote queue.  Otherwise all threads
2302 * exit by calling sched_exit_thread() and sched_throw() themselves.
2303 */
2304void
2305sched_rem(struct thread *td)
2306{
2307	struct tdq *tdq;
2308
2309	CTR5(KTR_SCHED, "sched_rem: %p(%s) prio %d by %p(%s)",
2310	    td, td->td_name, td->td_priority, curthread,
2311	    curthread->td_name);
2312	tdq = TDQ_CPU(td->td_sched->ts_cpu);
2313	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
2314	MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
2315	KASSERT(TD_ON_RUNQ(td),
2316	    ("sched_rem: thread not on run queue"));
2317	tdq_runq_rem(tdq, td);
2318	tdq_load_rem(tdq, td);
2319	TD_SET_CAN_RUN(td);
2320	if (td->td_priority == tdq->tdq_lowpri)
2321		tdq_setlowpri(tdq, NULL);
2322}
2323
2324/*
2325 * Fetch cpu utilization information.  Updates on demand.
2326 */
2327fixpt_t
2328sched_pctcpu(struct thread *td)
2329{
2330	fixpt_t pctcpu;
2331	struct td_sched *ts;
2332
2333	pctcpu = 0;
2334	ts = td->td_sched;
2335	if (ts == NULL)
2336		return (0);
2337
2338	thread_lock(td);
2339	if (ts->ts_ticks) {
2340		int rtick;
2341
2342		sched_pctcpu_update(ts);
2343		/* How many rtick per second ? */
2344		rtick = min(SCHED_TICK_HZ(ts) / SCHED_TICK_SECS, hz);
2345		pctcpu = (FSCALE * ((FSCALE * rtick)/hz)) >> FSHIFT;
2346	}
2347	thread_unlock(td);
2348
2349	return (pctcpu);
2350}
2351
2352/*
2353 * Enforce affinity settings for a thread.  Called after adjustments to
2354 * cpumask.
2355 */
2356void
2357sched_affinity(struct thread *td)
2358{
2359#ifdef SMP
2360	struct td_sched *ts;
2361	int cpu;
2362
2363	THREAD_LOCK_ASSERT(td, MA_OWNED);
2364	ts = td->td_sched;
2365	if (THREAD_CAN_SCHED(td, ts->ts_cpu))
2366		return;
2367	if (!TD_IS_RUNNING(td))
2368		return;
2369	td->td_flags |= TDF_NEEDRESCHED;
2370	if (!THREAD_CAN_MIGRATE(td))
2371		return;
2372	/*
2373	 * Assign the new cpu and force a switch before returning to
2374	 * userspace.  If the target thread is not running locally send
2375	 * an ipi to force the issue.
2376	 */
2377	cpu = ts->ts_cpu;
2378	ts->ts_cpu = sched_pickcpu(td, 0);
2379	if (cpu != PCPU_GET(cpuid))
2380		ipi_selected(1 << cpu, IPI_PREEMPT);
2381#endif
2382}
2383
2384/*
2385 * Bind a thread to a target cpu.
2386 */
2387void
2388sched_bind(struct thread *td, int cpu)
2389{
2390	struct td_sched *ts;
2391
2392	THREAD_LOCK_ASSERT(td, MA_OWNED|MA_NOTRECURSED);
2393	ts = td->td_sched;
2394	if (ts->ts_flags & TSF_BOUND)
2395		sched_unbind(td);
2396	ts->ts_flags |= TSF_BOUND;
2397	sched_pin();
2398	if (PCPU_GET(cpuid) == cpu)
2399		return;
2400	ts->ts_cpu = cpu;
2401	/* When we return from mi_switch we'll be on the correct cpu. */
2402	mi_switch(SW_VOL, NULL);
2403}
2404
2405/*
2406 * Release a bound thread.
2407 */
2408void
2409sched_unbind(struct thread *td)
2410{
2411	struct td_sched *ts;
2412
2413	THREAD_LOCK_ASSERT(td, MA_OWNED);
2414	ts = td->td_sched;
2415	if ((ts->ts_flags & TSF_BOUND) == 0)
2416		return;
2417	ts->ts_flags &= ~TSF_BOUND;
2418	sched_unpin();
2419}
2420
2421int
2422sched_is_bound(struct thread *td)
2423{
2424	THREAD_LOCK_ASSERT(td, MA_OWNED);
2425	return (td->td_sched->ts_flags & TSF_BOUND);
2426}
2427
2428/*
2429 * Basic yield call.
2430 */
2431void
2432sched_relinquish(struct thread *td)
2433{
2434	thread_lock(td);
2435	mi_switch(SW_VOL | SWT_RELINQUISH, NULL);
2436	thread_unlock(td);
2437}
2438
2439/*
2440 * Return the total system load.
2441 */
2442int
2443sched_load(void)
2444{
2445#ifdef SMP
2446	int total;
2447	int i;
2448
2449	total = 0;
2450	for (i = 0; i <= mp_maxid; i++)
2451		total += TDQ_CPU(i)->tdq_sysload;
2452	return (total);
2453#else
2454	return (TDQ_SELF()->tdq_sysload);
2455#endif
2456}
2457
2458int
2459sched_sizeof_proc(void)
2460{
2461	return (sizeof(struct proc));
2462}
2463
2464int
2465sched_sizeof_thread(void)
2466{
2467	return (sizeof(struct thread) + sizeof(struct td_sched));
2468}
2469
2470/*
2471 * The actual idle process.
2472 */
2473void
2474sched_idletd(void *dummy)
2475{
2476	struct thread *td;
2477	struct tdq *tdq;
2478	int switchcnt;
2479	int i;
2480
2481	td = curthread;
2482	tdq = TDQ_SELF();
2483	mtx_assert(&Giant, MA_NOTOWNED);
2484	/* ULE relies on preemption for idle interruption. */
2485	for (;;) {
2486		tdq->tdq_idlestate = TDQ_RUNNING;
2487#ifdef SMP
2488		if (tdq_idled(tdq) == 0)
2489			continue;
2490#endif
2491		switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt;
2492		/*
2493		 * If we're switching very frequently, spin while checking
2494		 * for load rather than entering a low power state that
2495		 * requires an IPI.
2496		 */
2497		if (switchcnt > sched_idlespinthresh) {
2498			for (i = 0; i < sched_idlespins; i++) {
2499				if (tdq->tdq_load)
2500					break;
2501				cpu_spinwait();
2502			}
2503		}
2504		/*
2505		 * We must set our state to IDLE before checking
2506		 * tdq_load for the last time to avoid a race with
2507		 * tdq_notify().
2508		 */
2509		if (tdq->tdq_load == 0) {
2510			tdq->tdq_idlestate = TDQ_IDLE;
2511			if (tdq->tdq_load == 0)
2512				cpu_idle();
2513		}
2514		if (tdq->tdq_load) {
2515			thread_lock(td);
2516			mi_switch(SW_VOL | SWT_IDLE, NULL);
2517			thread_unlock(td);
2518		}
2519	}
2520}
2521
2522/*
2523 * A CPU is entering for the first time or a thread is exiting.
2524 */
2525void
2526sched_throw(struct thread *td)
2527{
2528	struct thread *newtd;
2529	struct tdq *tdq;
2530
2531	tdq = TDQ_SELF();
2532	if (td == NULL) {
2533		/* Correct spinlock nesting and acquire the correct lock. */
2534		TDQ_LOCK(tdq);
2535		spinlock_exit();
2536	} else {
2537		MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
2538		tdq_load_rem(tdq, td);
2539		lock_profile_release_lock(&TDQ_LOCKPTR(tdq)->lock_object);
2540	}
2541	KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count"));
2542	newtd = choosethread();
2543	TDQ_LOCKPTR(tdq)->mtx_lock = (uintptr_t)newtd;
2544	PCPU_SET(switchtime, cpu_ticks());
2545	PCPU_SET(switchticks, ticks);
2546	cpu_throw(td, newtd);		/* doesn't return */
2547}
2548
2549/*
2550 * This is called from fork_exit().  Just acquire the correct locks and
2551 * let fork do the rest of the work.
2552 */
2553void
2554sched_fork_exit(struct thread *td)
2555{
2556	struct td_sched *ts;
2557	struct tdq *tdq;
2558	int cpuid;
2559
2560	/*
2561	 * Finish setting up thread glue so that it begins execution in a
2562	 * non-nested critical section with the scheduler lock held.
2563	 */
2564	cpuid = PCPU_GET(cpuid);
2565	tdq = TDQ_CPU(cpuid);
2566	ts = td->td_sched;
2567	if (TD_IS_IDLETHREAD(td))
2568		td->td_lock = TDQ_LOCKPTR(tdq);
2569	MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
2570	td->td_oncpu = cpuid;
2571	TDQ_LOCK_ASSERT(tdq, MA_OWNED | MA_NOTRECURSED);
2572	lock_profile_obtain_lock_success(
2573	    &TDQ_LOCKPTR(tdq)->lock_object, 0, 0, __FILE__, __LINE__);
2574}
2575
2576SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "Scheduler");
2577SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ULE", 0,
2578    "Scheduler name");
2579SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0,
2580    "Slice size for timeshare threads");
2581SYSCTL_INT(_kern_sched, OID_AUTO, interact, CTLFLAG_RW, &sched_interact, 0,
2582     "Interactivity score threshold");
2583SYSCTL_INT(_kern_sched, OID_AUTO, preempt_thresh, CTLFLAG_RW, &preempt_thresh,
2584     0,"Min priority for preemption, lower priorities have greater precedence");
2585SYSCTL_INT(_kern_sched, OID_AUTO, static_boost, CTLFLAG_RW, &static_boost,
2586     0,"Controls whether static kernel priorities are assigned to sleeping threads.");
2587SYSCTL_INT(_kern_sched, OID_AUTO, idlespins, CTLFLAG_RW, &sched_idlespins,
2588     0,"Number of times idle will spin waiting for new work.");
2589SYSCTL_INT(_kern_sched, OID_AUTO, idlespinthresh, CTLFLAG_RW, &sched_idlespinthresh,
2590     0,"Threshold before we will permit idle spinning.");
2591#ifdef SMP
2592SYSCTL_INT(_kern_sched, OID_AUTO, affinity, CTLFLAG_RW, &affinity, 0,
2593    "Number of hz ticks to keep thread affinity for");
2594SYSCTL_INT(_kern_sched, OID_AUTO, balance, CTLFLAG_RW, &rebalance, 0,
2595    "Enables the long-term load balancer");
2596SYSCTL_INT(_kern_sched, OID_AUTO, balance_interval, CTLFLAG_RW,
2597    &balance_interval, 0,
2598    "Average frequency in stathz ticks to run the long-term balancer");
2599SYSCTL_INT(_kern_sched, OID_AUTO, steal_htt, CTLFLAG_RW, &steal_htt, 0,
2600    "Steals work from another hyper-threaded core on idle");
2601SYSCTL_INT(_kern_sched, OID_AUTO, steal_idle, CTLFLAG_RW, &steal_idle, 0,
2602    "Attempts to steal work from other cores before idling");
2603SYSCTL_INT(_kern_sched, OID_AUTO, steal_thresh, CTLFLAG_RW, &steal_thresh, 0,
2604    "Minimum load on remote cpu before we'll steal");
2605#endif
2606
2607/* ps compat.  All cpu percentages from ULE are weighted. */
2608static int ccpu = 0;
2609SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
2610