1/*-
2 * Copyright (c) 2002-2007, Jeffrey Roberson <jeff@freebsd.org>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice unmodified, this list of conditions, and the following
10 *    disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
19 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27/*
28 * This file implements the ULE scheduler.  ULE supports independent CPU
29 * run queues and fine grain locking.  It has superior interactive
30 * performance under load even on uni-processor systems.
31 *
32 * etymology:
33 *   ULE is the last three letters in schedule.  It owes its name to a
34 * generic user created for a scheduling system by Paul Mikesell at
35 * Isilon Systems and a general lack of creativity on the part of the author.
36 */
37
38#include <sys/cdefs.h>
39__FBSDID("$FreeBSD: head/sys/kern/sched_ule.c 176735 2008-03-02 08:20:59Z jeff $");
40
41#include "opt_hwpmc_hooks.h"
42#include "opt_sched.h"
43
44#include <sys/param.h>
45#include <sys/systm.h>
46#include <sys/kdb.h>
47#include <sys/kernel.h>
48#include <sys/ktr.h>
49#include <sys/lock.h>
50#include <sys/mutex.h>
51#include <sys/proc.h>
52#include <sys/resource.h>
53#include <sys/resourcevar.h>
54#include <sys/sched.h>
55#include <sys/smp.h>
56#include <sys/sx.h>
57#include <sys/sysctl.h>
58#include <sys/sysproto.h>
59#include <sys/turnstile.h>
60#include <sys/umtx.h>
61#include <sys/vmmeter.h>
62#include <sys/cpuset.h>
63#ifdef KTRACE
64#include <sys/uio.h>
65#include <sys/ktrace.h>
66#endif
67
68#ifdef HWPMC_HOOKS
69#include <sys/pmckern.h>
70#endif
71
72#include <machine/cpu.h>
73#include <machine/smp.h>
74
75#if !defined(__i386__) && !defined(__amd64__) && !defined(__powerpc__) && !defined(__arm__)
76#error "This architecture is not currently compatible with ULE"
77#endif
78
79#define	KTR_ULE	0
80
81/*
82 * Thread scheduler specific section.  All fields are protected
83 * by the thread lock.
84 */
85struct td_sched {
86	TAILQ_ENTRY(td_sched) ts_procq;	/* Run queue. */
87	struct thread	*ts_thread;	/* Active associated thread. */
88	struct runq	*ts_runq;	/* Run-queue we're queued on. */
89	short		ts_flags;	/* TSF_* flags. */
90	u_char		ts_rqindex;	/* Run queue index. */
91	u_char		ts_cpu;		/* CPU that we have affinity for. */
92	int		ts_slice;	/* Ticks of slice remaining. */
93	u_int		ts_slptime;	/* Number of ticks we vol. slept */
94	u_int		ts_runtime;	/* Number of ticks we were running */
95	/* The following variables are only used for pctcpu calculation */
96	int		ts_ltick;	/* Last tick that we were running on */
97	int		ts_ftick;	/* First tick that we were running on */
98	int		ts_ticks;	/* Tick count */
99	int		ts_rltick;	/* Real last tick, for affinity. */
100};
101/* flags kept in ts_flags */
102#define	TSF_BOUND	0x0001		/* Thread can not migrate. */
103#define	TSF_XFERABLE	0x0002		/* Thread was added as transferable. */
104
105static struct td_sched td_sched0;
106
107#define	THREAD_CAN_MIGRATE(td)	((td)->td_pinned == 0)
108#define	THREAD_CAN_SCHED(td, cpu)	\
109    CPU_ISSET((cpu), &(td)->td_cpuset->cs_mask)
110
111/*
112 * Cpu percentage computation macros and defines.
113 *
114 * SCHED_TICK_SECS:	Number of seconds to average the cpu usage across.
115 * SCHED_TICK_TARG:	Number of hz ticks to average the cpu usage across.
116 * SCHED_TICK_MAX:	Maximum number of ticks before scaling back.
117 * SCHED_TICK_SHIFT:	Shift factor to avoid rounding away results.
118 * SCHED_TICK_HZ:	Compute the number of hz ticks for a given ticks count.
119 * SCHED_TICK_TOTAL:	Gives the amount of time we've been recording ticks.
120 */
121#define	SCHED_TICK_SECS		10
122#define	SCHED_TICK_TARG		(hz * SCHED_TICK_SECS)
123#define	SCHED_TICK_MAX		(SCHED_TICK_TARG + hz)
124#define	SCHED_TICK_SHIFT	10
125#define	SCHED_TICK_HZ(ts)	((ts)->ts_ticks >> SCHED_TICK_SHIFT)
126#define	SCHED_TICK_TOTAL(ts)	(max((ts)->ts_ltick - (ts)->ts_ftick, hz))
127
128/*
129 * These macros determine priorities for non-interactive threads.  They are
130 * assigned a priority based on their recent cpu utilization as expressed
131 * by the ratio of ticks to the tick total.  NHALF priorities at the start
132 * and end of the MIN to MAX timeshare range are only reachable with negative
133 * or positive nice respectively.
134 *
135 * PRI_RANGE:	Priority range for utilization dependent priorities.
136 * PRI_NRESV:	Number of nice values.
137 * PRI_TICKS:	Compute a priority in PRI_RANGE from the ticks count and total.
138 * PRI_NICE:	Determines the part of the priority inherited from nice.
139 */
140#define	SCHED_PRI_NRESV		(PRIO_MAX - PRIO_MIN)
141#define	SCHED_PRI_NHALF		(SCHED_PRI_NRESV / 2)
142#define	SCHED_PRI_MIN		(PRI_MIN_TIMESHARE + SCHED_PRI_NHALF)
143#define	SCHED_PRI_MAX		(PRI_MAX_TIMESHARE - SCHED_PRI_NHALF)
144#define	SCHED_PRI_RANGE		(SCHED_PRI_MAX - SCHED_PRI_MIN)
145#define	SCHED_PRI_TICKS(ts)						\
146    (SCHED_TICK_HZ((ts)) /						\
147    (roundup(SCHED_TICK_TOTAL((ts)), SCHED_PRI_RANGE) / SCHED_PRI_RANGE))
148#define	SCHED_PRI_NICE(nice)	(nice)
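
/*
 * For illustration (rough numbers, ignoring decay and rounding): a timeshare
 * thread that has run about half of the time over a full averaging window
 * has SCHED_TICK_HZ(ts) of roughly hz * SCHED_TICK_SECS / 2, while
 * SCHED_TICK_TOTAL(ts) is roughly hz * SCHED_TICK_SECS, so SCHED_PRI_TICKS()
 * evaluates to about SCHED_PRI_RANGE / 2 and the thread lands in the middle
 * of the utilization-dependent range before SCHED_PRI_NICE() is added.
 */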
149
150/*
151 * These determine the interactivity of a process.  Interactivity differs from
152 * cpu utilization in that it expresses the voluntary time slept vs time ran
153 * while cpu utilization includes all time not running.  This more accurately
154 * models the intent of the thread.
155 *
156 * SLP_RUN_MAX:	Maximum amount of sleep time + run time we'll accumulate
157 *		before throttling back.
158 * SLP_RUN_FORK:	Maximum slp+run time to inherit at fork time.
159 * INTERACT_MAX:	Maximum interactivity value.  Smaller is better.
 * INTERACT_THRESH:	Threshold for placement on the current runq.
161 */
162#define	SCHED_SLP_RUN_MAX	((hz * 5) << SCHED_TICK_SHIFT)
163#define	SCHED_SLP_RUN_FORK	((hz / 2) << SCHED_TICK_SHIFT)
164#define	SCHED_INTERACT_MAX	(100)
165#define	SCHED_INTERACT_HALF	(SCHED_INTERACT_MAX / 2)
166#define	SCHED_INTERACT_THRESH	(30)
167
168/*
169 * tickincr:		Converts a stathz tick into a hz domain scaled by
170 *			the shift factor.  Without the shift the error rate
171 *			due to rounding would be unacceptably high.
 * realstathz:		stathz is sometimes 0, in which case we run off of hz.
173 * sched_slice:		Runtime of each thread before rescheduling.
174 * preempt_thresh:	Priority threshold for preemption and remote IPIs.
175 */
176static int sched_interact = SCHED_INTERACT_THRESH;
177static int realstathz;
178static int tickincr;
179static int sched_slice;
180#ifdef PREEMPTION
181#ifdef FULL_PREEMPTION
182static int preempt_thresh = PRI_MAX_IDLE;
183#else
184static int preempt_thresh = PRI_MIN_KERN;
185#endif
186#else
187static int preempt_thresh = 0;
188#endif
189static int lowpri_userret = 1;
190
191/*
192 * tdq - per processor runqs and statistics.  All fields are protected by the
 * tdq_lock.  The load and lowpri may be accessed without holding the lock
 * to avoid excess locking in sched_pickcpu().
195 */
196struct tdq {
197	struct cpu_group *tdq_cg;		/* Pointer to cpu topology. */
198	struct mtx	tdq_lock;		/* run queue lock. */
199	struct runq	tdq_realtime;		/* real-time run queue. */
200	struct runq	tdq_timeshare;		/* timeshare run queue. */
201	struct runq	tdq_idle;		/* Queue of IDLE threads. */
202	int		tdq_load;		/* Aggregate load. */
203	int		tdq_sysload;		/* For loadavg, !ITHD load. */
204	u_char		tdq_idx;		/* Current insert index. */
205	u_char		tdq_ridx;		/* Current removal index. */
206	u_char		tdq_lowpri;		/* Lowest priority thread. */
207	int		tdq_transferable;	/* Transferable thread count. */
208	char		tdq_name[sizeof("sched lock") + 6];
209} __aligned(64);
210
211
212#ifdef SMP
213struct cpu_group *cpu_top;
214
215#define	SCHED_AFFINITY_DEFAULT	(max(1, hz / 1000))
216#define	SCHED_AFFINITY(ts, t)	((ts)->ts_rltick > ticks - ((t) * affinity))
217
218/*
219 * Run-time tunables.
220 */
221static int rebalance = 1;
222static int balance_interval = 128;	/* Default set in sched_initticks(). */
223static int pick_pri = 1;
224static int affinity;
225static int tryself = 1;
226static int oldtryself = 0;
227static int steal_htt = 1;
228static int steal_idle = 1;
229static int steal_thresh = 2;
230
231/*
232 * One thread queue per processor.
233 */
234static struct tdq	tdq_cpu[MAXCPU];
235static struct tdq	*balance_tdq;
236static int balance_ticks;
237
238#define	TDQ_SELF()	(&tdq_cpu[PCPU_GET(cpuid)])
239#define	TDQ_CPU(x)	(&tdq_cpu[(x)])
240#define	TDQ_ID(x)	((int)((x) - tdq_cpu))
241#else	/* !SMP */
242static struct tdq	tdq_cpu;
243
244#define	TDQ_ID(x)	(0)
245#define	TDQ_SELF()	(&tdq_cpu)
246#define	TDQ_CPU(x)	(&tdq_cpu)
247#endif
248
249#define	TDQ_LOCK_ASSERT(t, type)	mtx_assert(TDQ_LOCKPTR((t)), (type))
250#define	TDQ_LOCK(t)		mtx_lock_spin(TDQ_LOCKPTR((t)))
251#define	TDQ_LOCK_FLAGS(t, f)	mtx_lock_spin_flags(TDQ_LOCKPTR((t)), (f))
252#define	TDQ_UNLOCK(t)		mtx_unlock_spin(TDQ_LOCKPTR((t)))
253#define	TDQ_LOCKPTR(t)		(&(t)->tdq_lock)
254
255static void sched_priority(struct thread *);
256static void sched_thread_priority(struct thread *, u_char);
257static int sched_interact_score(struct thread *);
258static void sched_interact_update(struct thread *);
259static void sched_interact_fork(struct thread *);
260static void sched_pctcpu_update(struct td_sched *);
261
262/* Operations on per processor queues */
263static struct td_sched * tdq_choose(struct tdq *);
264static void tdq_setup(struct tdq *);
265static void tdq_load_add(struct tdq *, struct td_sched *);
266static void tdq_load_rem(struct tdq *, struct td_sched *);
267static __inline void tdq_runq_add(struct tdq *, struct td_sched *, int);
268static __inline void tdq_runq_rem(struct tdq *, struct td_sched *);
269void tdq_print(int cpu);
270static void runq_print(struct runq *rq);
271static void tdq_add(struct tdq *, struct thread *, int);
272#ifdef SMP
273static int tdq_move(struct tdq *, struct tdq *);
274static int tdq_idled(struct tdq *);
275static void tdq_notify(struct td_sched *);
276static struct td_sched *tdq_steal(struct tdq *, int);
277static struct td_sched *runq_steal(struct runq *, int);
278static int sched_pickcpu(struct td_sched *, int);
279static void sched_balance(void);
280static int sched_balance_pair(struct tdq *, struct tdq *);
281static inline struct tdq *sched_setcpu(struct td_sched *, int, int);
282static inline struct mtx *thread_block_switch(struct thread *);
283static inline void thread_unblock_switch(struct thread *, struct mtx *);
284static struct mtx *sched_switch_migrate(struct tdq *, struct thread *, int);
285#endif
286
287static void sched_setup(void *dummy);
288SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL)
289
290static void sched_initticks(void *dummy);
291SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks, NULL)
292
293/*
294 * Print the threads waiting on a run-queue.
295 */
296static void
297runq_print(struct runq *rq)
298{
299	struct rqhead *rqh;
300	struct td_sched *ts;
301	int pri;
302	int j;
303	int i;
304
305	for (i = 0; i < RQB_LEN; i++) {
306		printf("\t\trunq bits %d 0x%zx\n",
307		    i, rq->rq_status.rqb_bits[i]);
308		for (j = 0; j < RQB_BPW; j++)
309			if (rq->rq_status.rqb_bits[i] & (1ul << j)) {
310				pri = j + (i << RQB_L2BPW);
311				rqh = &rq->rq_queues[pri];
312				TAILQ_FOREACH(ts, rqh, ts_procq) {
313					printf("\t\t\ttd %p(%s) priority %d rqindex %d pri %d\n",
314					    ts->ts_thread, ts->ts_thread->td_name, ts->ts_thread->td_priority, ts->ts_rqindex, pri);
315				}
316			}
317	}
318}
319
320/*
321 * Print the status of a per-cpu thread queue.  Should be a ddb show cmd.
322 */
323void
324tdq_print(int cpu)
325{
326	struct tdq *tdq;
327
328	tdq = TDQ_CPU(cpu);
329
330	printf("tdq %d:\n", TDQ_ID(tdq));
331	printf("\tlock            %p\n", TDQ_LOCKPTR(tdq));
332	printf("\tLock name:      %s\n", tdq->tdq_name);
333	printf("\tload:           %d\n", tdq->tdq_load);
334	printf("\ttimeshare idx:  %d\n", tdq->tdq_idx);
335	printf("\ttimeshare ridx: %d\n", tdq->tdq_ridx);
336	printf("\trealtime runq:\n");
337	runq_print(&tdq->tdq_realtime);
338	printf("\ttimeshare runq:\n");
339	runq_print(&tdq->tdq_timeshare);
340	printf("\tidle runq:\n");
341	runq_print(&tdq->tdq_idle);
342	printf("\tload transferable: %d\n", tdq->tdq_transferable);
343	printf("\tlowest priority:   %d\n", tdq->tdq_lowpri);
344}
345
346#define	TS_RQ_PPQ	(((PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE) + 1) / RQ_NQS)
347/*
348 * Add a thread to the actual run-queue.  Keeps transferable counts up to
349 * date with what is actually on the run-queue.  Selects the correct
350 * queue position for timeshare threads.
351 */
352static __inline void
353tdq_runq_add(struct tdq *tdq, struct td_sched *ts, int flags)
354{
355	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
356	THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED);
357	if (THREAD_CAN_MIGRATE(ts->ts_thread)) {
358		tdq->tdq_transferable++;
359		ts->ts_flags |= TSF_XFERABLE;
360	}
361	if (ts->ts_runq == &tdq->tdq_timeshare) {
362		u_char pri;
363
364		pri = ts->ts_thread->td_priority;
365		KASSERT(pri <= PRI_MAX_TIMESHARE && pri >= PRI_MIN_TIMESHARE,
366			("Invalid priority %d on timeshare runq", pri));
367		/*
		 * This queue contains only priorities between MIN and MAX
		 * timeshare.  Use the whole queue to represent these values.
370		 */
371		if ((flags & (SRQ_BORROWING|SRQ_PREEMPTED)) == 0) {
372			pri = (pri - PRI_MIN_TIMESHARE) / TS_RQ_PPQ;
373			pri = (pri + tdq->tdq_idx) % RQ_NQS;
374			/*
375			 * This effectively shortens the queue by one so we
376			 * can have a one slot difference between idx and
377			 * ridx while we wait for threads to drain.
378			 */
379			if (tdq->tdq_ridx != tdq->tdq_idx &&
380			    pri == tdq->tdq_ridx)
381				pri = (unsigned char)(pri - 1) % RQ_NQS;
382		} else
383			pri = tdq->tdq_ridx;
384		runq_add_pri(ts->ts_runq, ts, pri, flags);
385	} else
386		runq_add(ts->ts_runq, ts, flags);
387}
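
/*
 * Illustration of the insertion policy above: the timeshare queue behaves
 * like a calendar queue.  The timeshare priority range is compressed into
 * RQ_NQS buckets via TS_RQ_PPQ and rotated by tdq_idx, so a numerically
 * lower (better) priority lands closer to the removal index tdq_ridx and is
 * generally serviced sooner.  Threads queued with SRQ_BORROWING or
 * SRQ_PREEMPTED are placed directly at tdq_ridx and run ahead of other
 * timeshare threads.
 */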
388
389/*
390 * Remove a thread from a run-queue.  This typically happens when a thread
391 * is selected to run.  Running threads are not on the queue and the
392 * transferable count does not reflect them.
393 */
394static __inline void
395tdq_runq_rem(struct tdq *tdq, struct td_sched *ts)
396{
397	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
398	KASSERT(ts->ts_runq != NULL,
399	    ("tdq_runq_remove: thread %p null ts_runq", ts->ts_thread));
400	if (ts->ts_flags & TSF_XFERABLE) {
401		tdq->tdq_transferable--;
402		ts->ts_flags &= ~TSF_XFERABLE;
403	}
404	if (ts->ts_runq == &tdq->tdq_timeshare) {
405		if (tdq->tdq_idx != tdq->tdq_ridx)
406			runq_remove_idx(ts->ts_runq, ts, &tdq->tdq_ridx);
407		else
408			runq_remove_idx(ts->ts_runq, ts, NULL);
409		/*
410		 * For timeshare threads we update the priority here so
411		 * the priority reflects the time we've been sleeping.
412		 */
413		ts->ts_ltick = ticks;
414		sched_pctcpu_update(ts);
415		sched_priority(ts->ts_thread);
416	} else
417		runq_remove(ts->ts_runq, ts);
418}
419
420/*
421 * Load is maintained for all threads RUNNING and ON_RUNQ.  Add the load
422 * for this thread to the referenced thread queue.
423 */
424static void
425tdq_load_add(struct tdq *tdq, struct td_sched *ts)
426{
427	int class;
428
429	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
430	THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED);
431	class = PRI_BASE(ts->ts_thread->td_pri_class);
432	tdq->tdq_load++;
433	CTR2(KTR_SCHED, "cpu %d load: %d", TDQ_ID(tdq), tdq->tdq_load);
434	if (class != PRI_ITHD &&
435	    (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0)
436		tdq->tdq_sysload++;
437}
438
439/*
440 * Remove the load from a thread that is transitioning to a sleep state or
441 * exiting.
442 */
443static void
444tdq_load_rem(struct tdq *tdq, struct td_sched *ts)
445{
446	int class;
447
448	THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED);
449	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
450	class = PRI_BASE(ts->ts_thread->td_pri_class);
451	if (class != PRI_ITHD &&
452	    (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0)
453		tdq->tdq_sysload--;
454	KASSERT(tdq->tdq_load != 0,
455	    ("tdq_load_rem: Removing with 0 load on queue %d", TDQ_ID(tdq)));
456	tdq->tdq_load--;
457	CTR1(KTR_SCHED, "load: %d", tdq->tdq_load);
458	ts->ts_runq = NULL;
459}
460
461/*
462 * Set lowpri to its exact value by searching the run-queue and
463 * evaluating curthread.  curthread may be passed as an optimization.
464 */
465static void
466tdq_setlowpri(struct tdq *tdq, struct thread *ctd)
467{
468	struct td_sched *ts;
469	struct thread *td;
470
471	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
472	if (ctd == NULL)
473		ctd = pcpu_find(TDQ_ID(tdq))->pc_curthread;
474	ts = tdq_choose(tdq);
475	if (ts)
476		td = ts->ts_thread;
477	if (ts == NULL || td->td_priority > ctd->td_priority)
478		tdq->tdq_lowpri = ctd->td_priority;
479	else
480		tdq->tdq_lowpri = td->td_priority;
481}
482
483#ifdef SMP
484struct cpu_search {
485	cpumask_t cs_mask;	/* Mask of valid cpus. */
486	u_int	cs_load;
487	u_int	cs_cpu;
	int	cs_limit;	/* Min priority for low, min load for high. */
489};
490
491#define	CPU_SEARCH_LOWEST	0x1
492#define	CPU_SEARCH_HIGHEST	0x2
493#define	CPU_SEARCH_BOTH		(CPU_SEARCH_LOWEST|CPU_SEARCH_HIGHEST)
494
495#define	CPUMASK_FOREACH(cpu, mask)				\
496	for ((cpu) = 0; (cpu) < sizeof((mask)) * 8; (cpu)++)	\
497		if ((mask) & 1 << (cpu))
498
499__inline int cpu_search(struct cpu_group *cg, struct cpu_search *low,
500    struct cpu_search *high, const int match);
501int cpu_search_lowest(struct cpu_group *cg, struct cpu_search *low);
502int cpu_search_highest(struct cpu_group *cg, struct cpu_search *high);
503int cpu_search_both(struct cpu_group *cg, struct cpu_search *low,
504    struct cpu_search *high);
505
506/*
507 * This routine compares according to the match argument and should be
508 * reduced in actual instantiations via constant propagation and dead code
509 * elimination.
510 */
511static __inline int
512cpu_compare(int cpu, struct cpu_search *low, struct cpu_search *high,
513    const int match)
514{
515	struct tdq *tdq;
516
517	tdq = TDQ_CPU(cpu);
518	if (match & CPU_SEARCH_LOWEST)
519		if (low->cs_mask & (1 << cpu) &&
520		    tdq->tdq_load < low->cs_load &&
521		    tdq->tdq_lowpri > low->cs_limit) {
522			low->cs_cpu = cpu;
523			low->cs_load = tdq->tdq_load;
524		}
525	if (match & CPU_SEARCH_HIGHEST)
526		if (high->cs_mask & (1 << cpu) &&
527		    tdq->tdq_load >= high->cs_limit &&
528		    tdq->tdq_load > high->cs_load &&
529		    tdq->tdq_transferable) {
530			high->cs_cpu = cpu;
531			high->cs_load = tdq->tdq_load;
532		}
533	return (tdq->tdq_load);
534}
535
536/*
537 * Search the tree of cpu_groups for the lowest or highest loaded cpu
538 * according to the match argument.  This routine actually compares the
539 * load on all paths through the tree and finds the least loaded cpu on
540 * the least loaded path, which may differ from the least loaded cpu in
541 * the system.  This balances work among caches and busses.
542 *
543 * This inline is instantiated in three forms below using constants for the
544 * match argument.  It is reduced to the minimum set for each case.  It is
545 * also recursive to the depth of the tree.
546 */
547static inline int
548cpu_search(struct cpu_group *cg, struct cpu_search *low,
549    struct cpu_search *high, const int match)
550{
551	int total;
552
553	total = 0;
554	if (cg->cg_children) {
555		struct cpu_search lgroup;
556		struct cpu_search hgroup;
557		struct cpu_group *child;
558		u_int lload;
559		int hload;
560		int load;
561		int i;
562
563		lload = -1;
564		hload = -1;
565		for (i = 0; i < cg->cg_children; i++) {
566			child = &cg->cg_child[i];
567			if (match & CPU_SEARCH_LOWEST) {
568				lgroup = *low;
569				lgroup.cs_load = -1;
570			}
571			if (match & CPU_SEARCH_HIGHEST) {
572				hgroup = *high;
				hgroup.cs_load = 0;
574			}
575			switch (match) {
576			case CPU_SEARCH_LOWEST:
577				load = cpu_search_lowest(child, &lgroup);
578				break;
579			case CPU_SEARCH_HIGHEST:
580				load = cpu_search_highest(child, &hgroup);
581				break;
582			case CPU_SEARCH_BOTH:
583				load = cpu_search_both(child, &lgroup, &hgroup);
584				break;
585			}
586			total += load;
587			if (match & CPU_SEARCH_LOWEST)
588				if (load < lload || low->cs_cpu == -1) {
589					*low = lgroup;
590					lload = load;
591				}
592			if (match & CPU_SEARCH_HIGHEST)
593				if (load > hload || high->cs_cpu == -1) {
594					hload = load;
595					*high = hgroup;
596				}
597		}
598	} else {
599		int cpu;
600
601		CPUMASK_FOREACH(cpu, cg->cg_mask)
602			total += cpu_compare(cpu, low, high, match);
603	}
604	return (total);
605}
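
/*
 * Hypothetical example of the path comparison above: if package A contains
 * cpus with loads 3 and 0 while package B contains cpus with loads 1 and 1,
 * a naive global minimum would pick the idle cpu in package A.  Because the
 * per-child totals are compared (A = 3, B = 2), the search instead descends
 * into the less loaded package B and returns one of its cpus, spreading work
 * across caches and busses as described above.
 */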
606
607/*
608 * cpu_search instantiations must pass constants to maintain the inline
609 * optimization.
610 */
611int
612cpu_search_lowest(struct cpu_group *cg, struct cpu_search *low)
613{
614	return cpu_search(cg, low, NULL, CPU_SEARCH_LOWEST);
615}
616
617int
618cpu_search_highest(struct cpu_group *cg, struct cpu_search *high)
619{
620	return cpu_search(cg, NULL, high, CPU_SEARCH_HIGHEST);
621}
622
623int
624cpu_search_both(struct cpu_group *cg, struct cpu_search *low,
625    struct cpu_search *high)
626{
627	return cpu_search(cg, low, high, CPU_SEARCH_BOTH);
628}
629
630/*
631 * Find the cpu with the least load via the least loaded path that has a
 * lowpri greater than pri.  A pri of -1 indicates any priority is
633 * acceptable.
634 */
635static inline int
636sched_lowest(struct cpu_group *cg, cpumask_t mask, int pri)
637{
638	struct cpu_search low;
639
640	low.cs_cpu = -1;
641	low.cs_load = -1;
642	low.cs_mask = mask;
643	low.cs_limit = pri;
644	cpu_search_lowest(cg, &low);
645	return low.cs_cpu;
646}
647
648/*
649 * Find the cpu with the highest load via the highest loaded path.
650 */
651static inline int
652sched_highest(struct cpu_group *cg, cpumask_t mask, int minload)
653{
654	struct cpu_search high;
655
656	high.cs_cpu = -1;
657	high.cs_load = 0;
658	high.cs_mask = mask;
659	high.cs_limit = minload;
660	cpu_search_highest(cg, &high);
661	return high.cs_cpu;
662}
663
664/*
665 * Simultaneously find the highest and lowest loaded cpu reachable via
666 * cg.
667 */
668static inline void
669sched_both(struct cpu_group *cg, cpumask_t mask, int *lowcpu, int *highcpu)
670{
671	struct cpu_search high;
672	struct cpu_search low;
673
674	low.cs_cpu = -1;
675	low.cs_limit = -1;
676	low.cs_load = -1;
677	low.cs_mask = mask;
678	high.cs_load = 0;
679	high.cs_cpu = -1;
680	high.cs_limit = -1;
681	high.cs_mask = mask;
682	cpu_search_both(cg, &low, &high);
683	*lowcpu = low.cs_cpu;
684	*highcpu = high.cs_cpu;
685	return;
686}
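
/*
 * These wrappers are the entry points used below: sched_lowest() backs cpu
 * selection in sched_pickcpu(), sched_highest() is used by tdq_idled() when
 * an idle cpu looks for work to steal, and sched_both() feeds the periodic
 * balancer in sched_balance_group().
 */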
687
688static void
689sched_balance_group(struct cpu_group *cg)
690{
691	cpumask_t mask;
692	int high;
693	int low;
694	int i;
695
696	mask = -1;
697	for (;;) {
698		sched_both(cg, mask, &low, &high);
699		if (low == high || low == -1 || high == -1)
700			break;
701		if (sched_balance_pair(TDQ_CPU(high), TDQ_CPU(low)))
702			break;
703		/*
		 * If we failed to move any threads, determine which cpu
		 * to kick out of the set and try again.
		 */
707		if (TDQ_CPU(high)->tdq_transferable == 0)
708			mask &= ~(1 << high);
709		else
710			mask &= ~(1 << low);
711	}
712
713	for (i = 0; i < cg->cg_children; i++)
714		sched_balance_group(&cg->cg_child[i]);
715}
716
717static void
718sched_balance()
719{
720	struct tdq *tdq;
721
722	/*
723	 * Select a random time between .5 * balance_interval and
724	 * 1.5 * balance_interval.
725	 */
726	balance_ticks = max(balance_interval / 2, 1);
727	balance_ticks += random() % balance_interval;
728	if (smp_started == 0 || rebalance == 0)
729		return;
730	tdq = TDQ_SELF();
731	TDQ_UNLOCK(tdq);
732	sched_balance_group(cpu_top);
733	TDQ_LOCK(tdq);
734}
735
736/*
737 * Lock two thread queues using their address to maintain lock order.
738 */
739static void
740tdq_lock_pair(struct tdq *one, struct tdq *two)
741{
742	if (one < two) {
743		TDQ_LOCK(one);
744		TDQ_LOCK_FLAGS(two, MTX_DUPOK);
745	} else {
746		TDQ_LOCK(two);
747		TDQ_LOCK_FLAGS(one, MTX_DUPOK);
748	}
749}
750
751/*
752 * Unlock two thread queues.  Order is not important here.
753 */
754static void
755tdq_unlock_pair(struct tdq *one, struct tdq *two)
756{
757	TDQ_UNLOCK(one);
758	TDQ_UNLOCK(two);
759}
760
761/*
762 * Transfer load between two imbalanced thread queues.
763 */
764static int
765sched_balance_pair(struct tdq *high, struct tdq *low)
766{
767	int transferable;
768	int high_load;
769	int low_load;
770	int moved;
771	int move;
772	int diff;
773	int i;
774
775	tdq_lock_pair(high, low);
776	transferable = high->tdq_transferable;
777	high_load = high->tdq_load;
778	low_load = low->tdq_load;
779	moved = 0;
780	/*
781	 * Determine what the imbalance is and then adjust that to how many
782	 * threads we actually have to give up (transferable).
783	 */
784	if (transferable != 0) {
785		diff = high_load - low_load;
786		move = diff / 2;
787		if (diff & 0x1)
788			move++;
789		move = min(move, transferable);
790		for (i = 0; i < move; i++)
791			moved += tdq_move(high, low);
792		/*
793		 * IPI the target cpu to force it to reschedule with the new
794		 * workload.
795		 */
796		ipi_selected(1 << TDQ_ID(low), IPI_PREEMPT);
797	}
798	tdq_unlock_pair(high, low);
799	return (moved);
800}
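
/*
 * For illustration: if the high queue has a load of 5 and the low queue a
 * load of 2, diff is 3 and move rounds up to 2, but the loop is still capped
 * by the transferable count, so with a single transferable thread only one
 * tdq_move() is attempted before the low cpu is IPI'd to notice the new
 * work.
 */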
801
802/*
803 * Move a thread from one thread queue to another.
804 */
805static int
806tdq_move(struct tdq *from, struct tdq *to)
807{
808	struct td_sched *ts;
809	struct thread *td;
810	struct tdq *tdq;
811	int cpu;
812
813	TDQ_LOCK_ASSERT(from, MA_OWNED);
814	TDQ_LOCK_ASSERT(to, MA_OWNED);
815
816	tdq = from;
817	cpu = TDQ_ID(to);
818	ts = tdq_steal(tdq, cpu);
819	if (ts == NULL)
820		return (0);
821	td = ts->ts_thread;
822	/*
823	 * Although the run queue is locked the thread may be blocked.  Lock
824	 * it to clear this and acquire the run-queue lock.
825	 */
826	thread_lock(td);
827	/* Drop recursive lock on from acquired via thread_lock(). */
828	TDQ_UNLOCK(from);
829	sched_rem(td);
830	ts->ts_cpu = cpu;
831	td->td_lock = TDQ_LOCKPTR(to);
832	tdq_add(to, td, SRQ_YIELDING);
833	return (1);
834}
835
836/*
837 * This tdq has idled.  Try to steal a thread from another cpu and switch
838 * to it.
839 */
840static int
841tdq_idled(struct tdq *tdq)
842{
843	struct cpu_group *cg;
844	struct tdq *steal;
845	cpumask_t mask;
846	int thresh;
847	int cpu;
848
849	if (smp_started == 0 || steal_idle == 0)
850		return (1);
851	mask = -1;
852	mask &= ~PCPU_GET(cpumask);
853	/* We don't want to be preempted while we're iterating. */
854	spinlock_enter();
855	for (cg = tdq->tdq_cg; cg != NULL; ) {
856		if ((cg->cg_flags & (CG_FLAG_HTT | CG_FLAG_THREAD)) == 0)
857			thresh = steal_thresh;
858		else
859			thresh = 1;
860		cpu = sched_highest(cg, mask, thresh);
861		if (cpu == -1) {
862			cg = cg->cg_parent;
863			continue;
864		}
865		steal = TDQ_CPU(cpu);
866		mask &= ~(1 << cpu);
867		tdq_lock_pair(tdq, steal);
868		if (steal->tdq_load < thresh || steal->tdq_transferable == 0) {
869			tdq_unlock_pair(tdq, steal);
870			continue;
871		}
872		/*
873		 * If a thread was added while interrupts were disabled don't
874		 * steal one here.  If we fail to acquire one due to affinity
875		 * restrictions loop again with this cpu removed from the
876		 * set.
877		 */
878		if (tdq->tdq_load == 0 && tdq_move(steal, tdq) == 0) {
879			tdq_unlock_pair(tdq, steal);
880			continue;
881		}
882		spinlock_exit();
883		TDQ_UNLOCK(steal);
884		mi_switch(SW_VOL, NULL);
885		thread_unlock(curthread);
886
887		return (0);
888	}
889	spinlock_exit();
890	return (1);
891}
892
893/*
894 * Notify a remote cpu of new work.  Sends an IPI if criteria are met.
895 */
896static void
897tdq_notify(struct td_sched *ts)
898{
899	struct thread *ctd;
900	struct pcpu *pcpu;
901	int cpri;
902	int pri;
903	int cpu;
904
905	cpu = ts->ts_cpu;
906	pri = ts->ts_thread->td_priority;
907	pcpu = pcpu_find(cpu);
908	ctd = pcpu->pc_curthread;
909	cpri = ctd->td_priority;
910
911	/*
912	 * If our priority is not better than the current priority there is
913	 * nothing to do.
914	 */
915	if (pri > cpri)
916		return;
917	/*
918	 * Always IPI idle.
919	 */
920	if (cpri > PRI_MIN_IDLE)
921		goto sendipi;
922	/*
923	 * If we're realtime or better and there is timeshare or worse running
924	 * send an IPI.
925	 */
926	if (pri < PRI_MAX_REALTIME && cpri > PRI_MAX_REALTIME)
927		goto sendipi;
928	/*
929	 * Otherwise only IPI if we exceed the threshold.
930	 */
931	if (pri > preempt_thresh)
932		return;
933sendipi:
934	ctd->td_flags |= TDF_NEEDRESCHED;
935	ipi_selected(1 << cpu, IPI_PREEMPT);
936}
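
/*
 * Example of the policy above, assuming the default preempt_thresh: waking a
 * normal thread onto an idle cpu sends the IPI, and waking a realtime thread
 * onto a cpu running timeshare or worse does as well.  Waking one timeshare
 * thread onto a cpu already running another does not IPI; the remote cpu
 * picks the thread up the next time it reschedules.
 */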
937
938/*
939 * Steals load from a timeshare queue.  Honors the rotating queue head
940 * index.
941 */
942static struct td_sched *
943runq_steal_from(struct runq *rq, int cpu, u_char start)
944{
945	struct td_sched *ts;
946	struct rqbits *rqb;
947	struct rqhead *rqh;
948	int first;
949	int bit;
950	int pri;
951	int i;
952
953	rqb = &rq->rq_status;
954	bit = start & (RQB_BPW -1);
955	pri = 0;
956	first = 0;
957again:
958	for (i = RQB_WORD(start); i < RQB_LEN; bit = 0, i++) {
959		if (rqb->rqb_bits[i] == 0)
960			continue;
961		if (bit != 0) {
962			for (pri = bit; pri < RQB_BPW; pri++)
963				if (rqb->rqb_bits[i] & (1ul << pri))
964					break;
965			if (pri >= RQB_BPW)
966				continue;
967		} else
968			pri = RQB_FFS(rqb->rqb_bits[i]);
969		pri += (i << RQB_L2BPW);
970		rqh = &rq->rq_queues[pri];
971		TAILQ_FOREACH(ts, rqh, ts_procq) {
972			if (first && THREAD_CAN_MIGRATE(ts->ts_thread) &&
973			    THREAD_CAN_SCHED(ts->ts_thread, cpu))
974				return (ts);
975			first = 1;
976		}
977	}
978	if (start != 0) {
979		start = 0;
980		goto again;
981	}
982
983	return (NULL);
984}
985
986/*
987 * Steals load from a standard linear queue.
988 */
989static struct td_sched *
990runq_steal(struct runq *rq, int cpu)
991{
992	struct rqhead *rqh;
993	struct rqbits *rqb;
994	struct td_sched *ts;
995	int word;
996	int bit;
997
998	rqb = &rq->rq_status;
999	for (word = 0; word < RQB_LEN; word++) {
1000		if (rqb->rqb_bits[word] == 0)
1001			continue;
1002		for (bit = 0; bit < RQB_BPW; bit++) {
1003			if ((rqb->rqb_bits[word] & (1ul << bit)) == 0)
1004				continue;
1005			rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)];
1006			TAILQ_FOREACH(ts, rqh, ts_procq)
1007				if (THREAD_CAN_MIGRATE(ts->ts_thread) &&
1008				    THREAD_CAN_SCHED(ts->ts_thread, cpu))
1009					return (ts);
1010		}
1011	}
1012	return (NULL);
1013}
1014
1015/*
1016 * Attempt to steal a thread in priority order from a thread queue.
1017 */
1018static struct td_sched *
1019tdq_steal(struct tdq *tdq, int cpu)
1020{
1021	struct td_sched *ts;
1022
1023	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
1024	if ((ts = runq_steal(&tdq->tdq_realtime, cpu)) != NULL)
1025		return (ts);
1026	if ((ts = runq_steal_from(&tdq->tdq_timeshare, cpu, tdq->tdq_ridx))
1027	    != NULL)
1028		return (ts);
1029	return (runq_steal(&tdq->tdq_idle, cpu));
1030}
1031
1032/*
1033 * Sets the thread lock and ts_cpu to match the requested cpu.  Unlocks the
1034 * current lock and returns with the assigned queue locked.
1035 */
1036static inline struct tdq *
1037sched_setcpu(struct td_sched *ts, int cpu, int flags)
1038{
1039	struct thread *td;
1040	struct tdq *tdq;
1041
1042	THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED);
1043
1044	tdq = TDQ_CPU(cpu);
1045	td = ts->ts_thread;
1046	ts->ts_cpu = cpu;
1047
1048	/* If the lock matches just return the queue. */
1049	if (td->td_lock == TDQ_LOCKPTR(tdq))
1050		return (tdq);
1051#ifdef notyet
1052	/*
1053	 * If the thread isn't running its lockptr is a
1054	 * turnstile or a sleepqueue.  We can just lock_set without
1055	 * blocking.
1056	 */
1057	if (TD_CAN_RUN(td)) {
1058		TDQ_LOCK(tdq);
1059		thread_lock_set(td, TDQ_LOCKPTR(tdq));
1060		return (tdq);
1061	}
1062#endif
1063	/*
	 * The hard case, migration: we need to block the thread first to
	 * prevent order reversals with other cpus' locks.
1066	 */
1067	thread_lock_block(td);
1068	TDQ_LOCK(tdq);
1069	thread_lock_unblock(td, TDQ_LOCKPTR(tdq));
1070	return (tdq);
1071}
1072
1073static int
1074sched_pickcpu(struct td_sched *ts, int flags)
1075{
1076	struct cpu_group *cg;
1077	struct thread *td;
1078	struct tdq *tdq;
1079	cpumask_t mask;
1080	int self;
1081	int pri;
1082	int cpu;
1083
1084	self = PCPU_GET(cpuid);
1085	td = ts->ts_thread;
1086	if (smp_started == 0)
1087		return (self);
1088	/*
1089	 * Don't migrate a running thread from sched_switch().
1090	 */
1091	if ((flags & SRQ_OURSELF) || !THREAD_CAN_MIGRATE(td))
1092		return (ts->ts_cpu);
1093	/*
1094	 * Prefer to run interrupt threads on the processors that generate
1095	 * the interrupt.
1096	 */
1097	if (td->td_priority <= PRI_MAX_ITHD && THREAD_CAN_SCHED(td, self) &&
1098	    curthread->td_intr_nesting_level)
1099		ts->ts_cpu = self;
1100	/*
1101	 * If the thread can run on the last cpu and the affinity has not
	 * expired, or that cpu is idle, run it there.
1103	 */
1104	pri = td->td_priority;
1105	tdq = TDQ_CPU(ts->ts_cpu);
1106	if (THREAD_CAN_SCHED(td, ts->ts_cpu)) {
1107		if (tdq->tdq_lowpri > PRI_MIN_IDLE)
1108			return (ts->ts_cpu);
1109		if (SCHED_AFFINITY(ts, CG_SHARE_L2) && tdq->tdq_lowpri > pri)
1110			return (ts->ts_cpu);
1111	}
1112	/*
1113	 * Search for the highest level in the tree that still has affinity.
1114	 */
1115	cg = NULL;
1116	for (cg = tdq->tdq_cg; cg != NULL; cg = cg->cg_parent)
1117		if (SCHED_AFFINITY(ts, cg->cg_level))
1118			break;
1119	cpu = -1;
1120	mask = td->td_cpuset->cs_mask.__bits[0];
1121	if (cg)
1122		cpu = sched_lowest(cg, mask, pri);
1123	if (cpu == -1)
1124		cpu = sched_lowest(cpu_top, mask, -1);
1125	/*
1126	 * Compare the lowest loaded cpu to current cpu.
1127	 */
1128	if (THREAD_CAN_SCHED(td, self) &&
1129	    TDQ_CPU(cpu)->tdq_lowpri < PRI_MIN_IDLE) {
1130		if (tryself && TDQ_CPU(self)->tdq_lowpri > pri)
1131			cpu = self;
1132		else if (oldtryself && curthread->td_priority > pri)
1133			cpu = self;
1134	}
1135	if (cpu == -1) {
1136		panic("cpu == -1, mask 0x%X cpu top %p", mask, cpu_top);
1137	}
1138	return (cpu);
1139}
1140#endif
1141
1142/*
1143 * Pick the highest priority task we have and return it.
1144 */
1145static struct td_sched *
1146tdq_choose(struct tdq *tdq)
1147{
1148	struct td_sched *ts;
1149
1150	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
1151	ts = runq_choose(&tdq->tdq_realtime);
1152	if (ts != NULL)
1153		return (ts);
1154	ts = runq_choose_from(&tdq->tdq_timeshare, tdq->tdq_ridx);
1155	if (ts != NULL) {
1156		KASSERT(ts->ts_thread->td_priority >= PRI_MIN_TIMESHARE,
1157		    ("tdq_choose: Invalid priority on timeshare queue %d",
1158		    ts->ts_thread->td_priority));
1159		return (ts);
1160	}
1161
1162	ts = runq_choose(&tdq->tdq_idle);
1163	if (ts != NULL) {
1164		KASSERT(ts->ts_thread->td_priority >= PRI_MIN_IDLE,
1165		    ("tdq_choose: Invalid priority on idle queue %d",
1166		    ts->ts_thread->td_priority));
1167		return (ts);
1168	}
1169
1170	return (NULL);
1171}
1172
1173/*
1174 * Initialize a thread queue.
1175 */
1176static void
1177tdq_setup(struct tdq *tdq)
1178{
1179
1180	if (bootverbose)
1181		printf("ULE: setup cpu %d\n", TDQ_ID(tdq));
1182	runq_init(&tdq->tdq_realtime);
1183	runq_init(&tdq->tdq_timeshare);
1184	runq_init(&tdq->tdq_idle);
1185	snprintf(tdq->tdq_name, sizeof(tdq->tdq_name),
1186	    "sched lock %d", (int)TDQ_ID(tdq));
1187	mtx_init(&tdq->tdq_lock, tdq->tdq_name, "sched lock",
1188	    MTX_SPIN | MTX_RECURSE);
1189}
1190
1191#ifdef SMP
1192static void
1193sched_setup_smp(void)
1194{
1195	struct tdq *tdq;
1196	int i;
1197
1198	cpu_top = smp_topo();
1199	for (i = 0; i < MAXCPU; i++) {
1200		if (CPU_ABSENT(i))
1201			continue;
1202		tdq = TDQ_CPU(i);
1203		tdq_setup(tdq);
1204		tdq->tdq_cg = smp_topo_find(cpu_top, i);
1205		if (tdq->tdq_cg == NULL)
1206			panic("Can't find cpu group for %d\n", i);
1207	}
1208	balance_tdq = TDQ_SELF();
1209	sched_balance();
1210}
1211#endif
1212
1213/*
1214 * Setup the thread queues and initialize the topology based on MD
1215 * information.
1216 */
1217static void
1218sched_setup(void *dummy)
1219{
1220	struct tdq *tdq;
1221
1222	tdq = TDQ_SELF();
1223#ifdef SMP
1224	sched_setup_smp();
1225#else
1226	tdq_setup(tdq);
1227#endif
1228	/*
	 * To avoid divide-by-zero, we set realstathz to a dummy value in
	 * case sched_clock() is called before sched_initticks().
1231	 */
1232	realstathz = hz;
1233	sched_slice = (realstathz/10);	/* ~100ms */
1234	tickincr = 1 << SCHED_TICK_SHIFT;
1235
1236	/* Add thread0's load since it's running. */
1237	TDQ_LOCK(tdq);
1238	thread0.td_lock = TDQ_LOCKPTR(TDQ_SELF());
1239	tdq_load_add(tdq, &td_sched0);
1240	tdq->tdq_lowpri = thread0.td_priority;
1241	TDQ_UNLOCK(tdq);
1242}
1243
1244/*
1245 * This routine determines the tickincr after stathz and hz are setup.
1246 */
1247/* ARGSUSED */
1248static void
1249sched_initticks(void *dummy)
1250{
1251	int incr;
1252
1253	realstathz = stathz ? stathz : hz;
1254	sched_slice = (realstathz/10);	/* ~100ms */
1255
1256	/*
1257	 * tickincr is shifted out by 10 to avoid rounding errors due to
1258	 * hz not being evenly divisible by stathz on all platforms.
1259	 */
1260	incr = (hz << SCHED_TICK_SHIFT) / realstathz;
1261	/*
1262	 * This does not work for values of stathz that are more than
1263	 * 1 << SCHED_TICK_SHIFT * hz.  In practice this does not happen.
1264	 */
1265	if (incr == 0)
1266		incr = 1;
1267	tickincr = incr;
1268#ifdef SMP
1269	/*
1270	 * Set the default balance interval now that we know
1271	 * what realstathz is.
1272	 */
1273	balance_interval = realstathz;
1274	/*
	 * Set steal_thresh to roughly log2(mp_ncpus) but no greater than 3.  This
1276	 * prevents excess thrashing on large machines and excess idle on
1277	 * smaller machines.
1278	 */
1279	steal_thresh = min(ffs(mp_ncpus) - 1, 3);
1280	affinity = SCHED_AFFINITY_DEFAULT;
1281#endif
1282}
1283
1284
1285/*
1286 * This is the core of the interactivity algorithm.  Determines a score based
1287 * on past behavior.  It is the ratio of sleep time to run time scaled to
1288 * a [0, 100] integer.  This is the voluntary sleep time of a process, which
1289 * differs from the cpu usage because it does not account for time spent
1290 * waiting on a run-queue.  Would be prettier if we had floating point.
1291 */
1292static int
1293sched_interact_score(struct thread *td)
1294{
1295	struct td_sched *ts;
1296	int div;
1297
1298	ts = td->td_sched;
1299	/*
1300	 * The score is only needed if this is likely to be an interactive
1301	 * task.  Don't go through the expense of computing it if there's
1302	 * no chance.
1303	 */
1304	if (sched_interact <= SCHED_INTERACT_HALF &&
1305		ts->ts_runtime >= ts->ts_slptime)
1306			return (SCHED_INTERACT_HALF);
1307
1308	if (ts->ts_runtime > ts->ts_slptime) {
1309		div = max(1, ts->ts_runtime / SCHED_INTERACT_HALF);
1310		return (SCHED_INTERACT_HALF +
1311		    (SCHED_INTERACT_HALF - (ts->ts_slptime / div)));
1312	}
1313	if (ts->ts_slptime > ts->ts_runtime) {
1314		div = max(1, ts->ts_slptime / SCHED_INTERACT_HALF);
1315		return (ts->ts_runtime / div);
1316	}
1317	/* runtime == slptime */
1318	if (ts->ts_runtime)
1319		return (SCHED_INTERACT_HALF);
1320
1321	/*
1322	 * This can happen if slptime and runtime are 0.
1323	 */
1324	return (0);
1325
1326}
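
/*
 * Rough examples of the score computed above: a thread that voluntarily
 * sleeps three times as long as it runs scores about 100/6 (~16), well under
 * the default SCHED_INTERACT_THRESH of 30, and is considered interactive.
 * A thread that runs three times as long as it sleeps scores about 83 and is
 * scheduled from the timeshare queue instead.
 */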
1327
1328/*
1329 * Scale the scheduling priority according to the "interactivity" of this
1330 * process.
1331 */
1332static void
1333sched_priority(struct thread *td)
1334{
1335	int score;
1336	int pri;
1337
1338	if (td->td_pri_class != PRI_TIMESHARE)
1339		return;
1340	/*
1341	 * If the score is interactive we place the thread in the realtime
1342	 * queue with a priority that is less than kernel and interrupt
1343	 * priorities.  These threads are not subject to nice restrictions.
1344	 *
1345	 * Scores greater than this are placed on the normal timeshare queue
1346	 * where the priority is partially decided by the most recent cpu
1347	 * utilization and the rest is decided by nice value.
1348	 *
1349	 * The nice value of the process has a linear effect on the calculated
1350	 * score.  Negative nice values make it easier for a thread to be
1351	 * considered interactive.
1352	 */
1353	score = imax(0, sched_interact_score(td) - td->td_proc->p_nice);
1354	if (score < sched_interact) {
1355		pri = PRI_MIN_REALTIME;
1356		pri += ((PRI_MAX_REALTIME - PRI_MIN_REALTIME) / sched_interact)
1357		    * score;
1358		KASSERT(pri >= PRI_MIN_REALTIME && pri <= PRI_MAX_REALTIME,
1359		    ("sched_priority: invalid interactive priority %d score %d",
1360		    pri, score));
1361	} else {
1362		pri = SCHED_PRI_MIN;
1363		if (td->td_sched->ts_ticks)
1364			pri += SCHED_PRI_TICKS(td->td_sched);
1365		pri += SCHED_PRI_NICE(td->td_proc->p_nice);
1366		KASSERT(pri >= PRI_MIN_TIMESHARE && pri <= PRI_MAX_TIMESHARE,
1367		    ("sched_priority: invalid priority %d: nice %d, "
1368		    "ticks %d ftick %d ltick %d tick pri %d",
1369		    pri, td->td_proc->p_nice, td->td_sched->ts_ticks,
1370		    td->td_sched->ts_ftick, td->td_sched->ts_ltick,
1371		    SCHED_PRI_TICKS(td->td_sched)));
1372	}
1373	sched_user_prio(td, pri);
1374
1375	return;
1376}
1377
1378/*
1379 * This routine enforces a maximum limit on the amount of scheduling history
1380 * kept.  It is called after either the slptime or runtime is adjusted.  This
1381 * function is ugly due to integer math.
1382 */
1383static void
1384sched_interact_update(struct thread *td)
1385{
1386	struct td_sched *ts;
1387	u_int sum;
1388
1389	ts = td->td_sched;
1390	sum = ts->ts_runtime + ts->ts_slptime;
1391	if (sum < SCHED_SLP_RUN_MAX)
1392		return;
1393	/*
1394	 * This only happens from two places:
1395	 * 1) We have added an unusual amount of run time from fork_exit.
1396	 * 2) We have added an unusual amount of sleep time from sched_sleep().
1397	 */
1398	if (sum > SCHED_SLP_RUN_MAX * 2) {
1399		if (ts->ts_runtime > ts->ts_slptime) {
1400			ts->ts_runtime = SCHED_SLP_RUN_MAX;
1401			ts->ts_slptime = 1;
1402		} else {
1403			ts->ts_slptime = SCHED_SLP_RUN_MAX;
1404			ts->ts_runtime = 1;
1405		}
1406		return;
1407	}
1408	/*
1409	 * If we have exceeded by more than 1/5th then the algorithm below
1410	 * will not bring us back into range.  Dividing by two here forces
	 * us into the range of [4/5 * SCHED_SLP_RUN_MAX, SCHED_SLP_RUN_MAX].
1412	 */
1413	if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) {
1414		ts->ts_runtime /= 2;
1415		ts->ts_slptime /= 2;
1416		return;
1417	}
1418	ts->ts_runtime = (ts->ts_runtime / 5) * 4;
1419	ts->ts_slptime = (ts->ts_slptime / 5) * 4;
1420}
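
/*
 * For illustration: SCHED_SLP_RUN_MAX corresponds to about five seconds of
 * combined (scaled) sleep and run history.  Once the sum crosses that cap
 * both components are scaled back by 4/5, so old behavior decays
 * geometrically; only gross overshoots (more than 2x, or more than 6/5 of
 * the cap) take the faster reset paths above.
 */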
1421
1422/*
1423 * Scale back the interactivity history when a child thread is created.  The
1424 * history is inherited from the parent but the thread may behave totally
1425 * differently.  For example, a shell spawning a compiler process.  We want
1426 * to learn that the compiler is behaving badly very quickly.
1427 */
1428static void
1429sched_interact_fork(struct thread *td)
1430{
1431	int ratio;
1432	int sum;
1433
1434	sum = td->td_sched->ts_runtime + td->td_sched->ts_slptime;
1435	if (sum > SCHED_SLP_RUN_FORK) {
1436		ratio = sum / SCHED_SLP_RUN_FORK;
1437		td->td_sched->ts_runtime /= ratio;
1438		td->td_sched->ts_slptime /= ratio;
1439	}
1440}
1441
1442/*
1443 * Called from proc0_init() to setup the scheduler fields.
1444 */
1445void
1446schedinit(void)
1447{
1448
1449	/*
1450	 * Set up the scheduler specific parts of proc0.
1451	 */
1452	proc0.p_sched = NULL; /* XXX */
1453	thread0.td_sched = &td_sched0;
1454	td_sched0.ts_ltick = ticks;
1455	td_sched0.ts_ftick = ticks;
1456	td_sched0.ts_thread = &thread0;
1457}
1458
1459/*
1460 * This is only somewhat accurate since given many processes of the same
1461 * priority they will switch when their slices run out, which will be
1462 * at most sched_slice stathz ticks.
1463 */
1464int
1465sched_rr_interval(void)
1466{
1467
1468	/* Convert sched_slice to hz */
1469	return (hz/(realstathz/sched_slice));
1470}
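
/*
 * Example, assuming typical values of hz = 1000 and stathz = 128: the
 * default slice is realstathz / 10 = 12 stathz ticks, so the value above is
 * 1000 / (128 / 12) = 100, i.e. roughly the ~100ms noted in
 * sched_initticks().
 */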
1471
1472/*
1473 * Update the percent cpu tracking information when it is requested or
1474 * the total history exceeds the maximum.  We keep a sliding history of
1475 * tick counts that slowly decays.  This is less precise than the 4BSD
1476 * mechanism since it happens with less regular and frequent events.
1477 */
1478static void
1479sched_pctcpu_update(struct td_sched *ts)
1480{
1481
1482	if (ts->ts_ticks == 0)
1483		return;
1484	if (ticks - (hz / 10) < ts->ts_ltick &&
1485	    SCHED_TICK_TOTAL(ts) < SCHED_TICK_MAX)
1486		return;
1487	/*
1488	 * Adjust counters and watermark for pctcpu calc.
1489	 */
1490	if (ts->ts_ltick > ticks - SCHED_TICK_TARG)
1491		ts->ts_ticks = (ts->ts_ticks / (ticks - ts->ts_ftick)) *
1492			    SCHED_TICK_TARG;
1493	else
1494		ts->ts_ticks = 0;
1495	ts->ts_ltick = ticks;
1496	ts->ts_ftick = ts->ts_ltick - SCHED_TICK_TARG;
1497}
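
/*
 * For illustration: once the recorded window exceeds SCHED_TICK_MAX, or the
 * counters are simply stale, the code above rescales ts_ticks to the same
 * average rate over exactly one SCHED_TICK_TARG window and slides ts_ftick
 * forward to match.  A thread whose last run was more than a full window ago
 * has its history zeroed instead.
 */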
1498
1499/*
1500 * Adjust the priority of a thread.  Move it to the appropriate run-queue
1501 * if necessary.  This is the back-end for several priority related
1502 * functions.
1503 */
1504static void
1505sched_thread_priority(struct thread *td, u_char prio)
1506{
1507	struct td_sched *ts;
1508
1509	CTR6(KTR_SCHED, "sched_prio: %p(%s) prio %d newprio %d by %p(%s)",
1510	    td, td->td_name, td->td_priority, prio, curthread,
1511	    curthread->td_name);
1512	ts = td->td_sched;
1513	THREAD_LOCK_ASSERT(td, MA_OWNED);
1514	if (td->td_priority == prio)
1515		return;
1516
1517	if (TD_ON_RUNQ(td) && prio < td->td_priority) {
1518		/*
1519		 * If the priority has been elevated due to priority
1520		 * propagation, we may have to move ourselves to a new
1521		 * queue.  This could be optimized to not re-add in some
1522		 * cases.
1523		 */
1524		sched_rem(td);
1525		td->td_priority = prio;
1526		sched_add(td, SRQ_BORROWING);
1527	} else if (TD_IS_RUNNING(td)) {
1528		struct tdq *tdq;
1529		int oldpri;
1530
1531		tdq = TDQ_CPU(ts->ts_cpu);
1532		oldpri = td->td_priority;
1533		td->td_priority = prio;
1534		if (prio < tdq->tdq_lowpri)
1535			tdq->tdq_lowpri = prio;
1536		else if (tdq->tdq_lowpri == oldpri)
1537			tdq_setlowpri(tdq, td);
1538	} else
1539		td->td_priority = prio;
1540}
1541
1542/*
1543 * Update a thread's priority when it is lent another thread's
1544 * priority.
1545 */
1546void
1547sched_lend_prio(struct thread *td, u_char prio)
1548{
1549
1550	td->td_flags |= TDF_BORROWING;
1551	sched_thread_priority(td, prio);
1552}
1553
1554/*
1555 * Restore a thread's priority when priority propagation is
1556 * over.  The prio argument is the minimum priority the thread
1557 * needs to have to satisfy other possible priority lending
1558 * requests.  If the thread's regular priority is less
1559 * important than prio, the thread will keep a priority boost
1560 * of prio.
1561 */
1562void
1563sched_unlend_prio(struct thread *td, u_char prio)
1564{
1565	u_char base_pri;
1566
1567	if (td->td_base_pri >= PRI_MIN_TIMESHARE &&
1568	    td->td_base_pri <= PRI_MAX_TIMESHARE)
1569		base_pri = td->td_user_pri;
1570	else
1571		base_pri = td->td_base_pri;
1572	if (prio >= base_pri) {
1573		td->td_flags &= ~TDF_BORROWING;
1574		sched_thread_priority(td, base_pri);
1575	} else
1576		sched_lend_prio(td, prio);
1577}
1578
1579/*
1580 * Standard entry for setting the priority to an absolute value.
1581 */
1582void
1583sched_prio(struct thread *td, u_char prio)
1584{
1585	u_char oldprio;
1586
1587	/* First, update the base priority. */
1588	td->td_base_pri = prio;
1589
1590	/*
1591	 * If the thread is borrowing another thread's priority, don't
1592	 * ever lower the priority.
1593	 */
1594	if (td->td_flags & TDF_BORROWING && td->td_priority < prio)
1595		return;
1596
1597	/* Change the real priority. */
1598	oldprio = td->td_priority;
1599	sched_thread_priority(td, prio);
1600
1601	/*
1602	 * If the thread is on a turnstile, then let the turnstile update
1603	 * its state.
1604	 */
1605	if (TD_ON_LOCK(td) && oldprio != prio)
1606		turnstile_adjust(td, oldprio);
1607}
1608
1609/*
 * Set the base user priority; does not affect the current running priority.
1611 */
1612void
1613sched_user_prio(struct thread *td, u_char prio)
1614{
1615	u_char oldprio;
1616
1617	td->td_base_user_pri = prio;
1618	if (td->td_flags & TDF_UBORROWING && td->td_user_pri <= prio)
1619                return;
1620	oldprio = td->td_user_pri;
1621	td->td_user_pri = prio;
1622}
1623
1624void
1625sched_lend_user_prio(struct thread *td, u_char prio)
1626{
1627	u_char oldprio;
1628
1629	THREAD_LOCK_ASSERT(td, MA_OWNED);
1630	td->td_flags |= TDF_UBORROWING;
1631	oldprio = td->td_user_pri;
1632	td->td_user_pri = prio;
1633}
1634
1635void
1636sched_unlend_user_prio(struct thread *td, u_char prio)
1637{
1638	u_char base_pri;
1639
1640	THREAD_LOCK_ASSERT(td, MA_OWNED);
1641	base_pri = td->td_base_user_pri;
1642	if (prio >= base_pri) {
1643		td->td_flags &= ~TDF_UBORROWING;
1644		sched_user_prio(td, base_pri);
1645	} else {
1646		sched_lend_user_prio(td, prio);
1647	}
1648}
1649
1650/*
1651 * Add the thread passed as 'newtd' to the run queue before selecting
1652 * the next thread to run.  This is only used for KSE.
1653 */
1654static void
1655sched_switchin(struct tdq *tdq, struct thread *td)
1656{
1657#ifdef SMP
1658	spinlock_enter();
1659	TDQ_UNLOCK(tdq);
1660	thread_lock(td);
1661	spinlock_exit();
1662	sched_setcpu(td->td_sched, TDQ_ID(tdq), SRQ_YIELDING);
1663#else
1664	td->td_lock = TDQ_LOCKPTR(tdq);
1665#endif
1666	tdq_add(tdq, td, SRQ_YIELDING);
1667	MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
1668}
1669
1670/*
1671 * Block a thread for switching.  Similar to thread_block() but does not
1672 * bump the spin count.
1673 */
1674static inline struct mtx *
1675thread_block_switch(struct thread *td)
1676{
1677	struct mtx *lock;
1678
1679	THREAD_LOCK_ASSERT(td, MA_OWNED);
1680	lock = td->td_lock;
1681	td->td_lock = &blocked_lock;
1682	mtx_unlock_spin(lock);
1683
1684	return (lock);
1685}
1686
1687/*
1688 * Handle migration from sched_switch().  This happens only for
1689 * cpu binding.
1690 */
1691static struct mtx *
1692sched_switch_migrate(struct tdq *tdq, struct thread *td, int flags)
1693{
1694	struct tdq *tdn;
1695
1696	tdn = TDQ_CPU(td->td_sched->ts_cpu);
1697#ifdef SMP
1698	/*
1699	 * Do the lock dance required to avoid LOR.  We grab an extra
1700	 * spinlock nesting to prevent preemption while we're
1701	 * not holding either run-queue lock.
1702	 */
1703	spinlock_enter();
1704	thread_block_switch(td);	/* This releases the lock on tdq. */
1705	TDQ_LOCK(tdn);
1706	tdq_add(tdn, td, flags);
1707	tdq_notify(td->td_sched);
1708	/*
1709	 * After we unlock tdn the new cpu still can't switch into this
1710	 * thread until we've unblocked it in cpu_switch().  The lock
1711	 * pointers may match in the case of HTT cores.  Don't unlock here
1712	 * or we can deadlock when the other CPU runs the IPI handler.
1713	 */
1714	if (TDQ_LOCKPTR(tdn) != TDQ_LOCKPTR(tdq)) {
1715		TDQ_UNLOCK(tdn);
1716		TDQ_LOCK(tdq);
1717	}
1718	spinlock_exit();
1719#endif
1720	return (TDQ_LOCKPTR(tdn));
1721}
1722
1723/*
1724 * Release a thread that was blocked with thread_block_switch().
1725 */
1726static inline void
1727thread_unblock_switch(struct thread *td, struct mtx *mtx)
1728{
1729	atomic_store_rel_ptr((volatile uintptr_t *)&td->td_lock,
1730	    (uintptr_t)mtx);
1731}
1732
1733/*
1734 * Switch threads.  This function has to handle threads coming in while
1735 * blocked for some reason, running, or idle.  It also must deal with
1736 * migrating a thread from one queue to another as running threads may
1737 * be assigned elsewhere via binding.
1738 */
1739void
1740sched_switch(struct thread *td, struct thread *newtd, int flags)
1741{
1742	struct tdq *tdq;
1743	struct td_sched *ts;
1744	struct mtx *mtx;
1745	int srqflag;
1746	int cpuid;
1747
1748	THREAD_LOCK_ASSERT(td, MA_OWNED);
1749
1750	cpuid = PCPU_GET(cpuid);
1751	tdq = TDQ_CPU(cpuid);
1752	ts = td->td_sched;
1753	mtx = td->td_lock;
1754	ts->ts_rltick = ticks;
1755	td->td_lastcpu = td->td_oncpu;
1756	td->td_oncpu = NOCPU;
1757	td->td_flags &= ~TDF_NEEDRESCHED;
1758	td->td_owepreempt = 0;
1759	/*
1760	 * The lock pointer in an idle thread should never change.  Reset it
1761	 * to CAN_RUN as well.
1762	 */
1763	if (TD_IS_IDLETHREAD(td)) {
1764		MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
1765		TD_SET_CAN_RUN(td);
1766	} else if (TD_IS_RUNNING(td)) {
1767		MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
1768		tdq_load_rem(tdq, ts);
1769		srqflag = (flags & SW_PREEMPT) ?
1770		    SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
1771		    SRQ_OURSELF|SRQ_YIELDING;
1772		if (ts->ts_cpu == cpuid)
1773			tdq_add(tdq, td, srqflag);
1774		else
1775			mtx = sched_switch_migrate(tdq, td, srqflag);
1776	} else {
1777		/* This thread must be going to sleep. */
1778		TDQ_LOCK(tdq);
1779		mtx = thread_block_switch(td);
1780		tdq_load_rem(tdq, ts);
1781	}
1782	/*
1783	 * We enter here with the thread blocked and assigned to the
1784	 * appropriate cpu run-queue or sleep-queue and with the current
1785	 * thread-queue locked.
1786	 */
1787	TDQ_LOCK_ASSERT(tdq, MA_OWNED | MA_NOTRECURSED);
1788	/*
1789	 * If KSE assigned a new thread just add it here and let choosethread
1790	 * select the best one.
1791	 */
1792	if (newtd != NULL)
1793		sched_switchin(tdq, newtd);
1794	newtd = choosethread();
1795	/*
1796	 * Call the MD code to switch contexts if necessary.
1797	 */
1798	if (td != newtd) {
1799#ifdef	HWPMC_HOOKS
1800		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
1801			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
1802#endif
1803		lock_profile_release_lock(&TDQ_LOCKPTR(tdq)->lock_object);
1804		TDQ_LOCKPTR(tdq)->mtx_lock = (uintptr_t)newtd;
1805		cpu_switch(td, newtd, mtx);
1806		/*
1807		 * We may return from cpu_switch on a different cpu.  However,
1808		 * we always return with td_lock pointing to the current cpu's
1809		 * run queue lock.
1810		 */
1811		cpuid = PCPU_GET(cpuid);
1812		tdq = TDQ_CPU(cpuid);
1813		lock_profile_obtain_lock_success(
1814		    &TDQ_LOCKPTR(tdq)->lock_object, 0, 0, __FILE__, __LINE__);
1815#ifdef	HWPMC_HOOKS
1816		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
1817			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
1818#endif
1819	} else
1820		thread_unblock_switch(td, mtx);
1821	/*
1822	 * We should always get here with the lowest priority td possible.
1823	 */
1824	tdq->tdq_lowpri = td->td_priority;
1825	/*
1826	 * Assert that all went well and return.
1827	 */
1828	TDQ_LOCK_ASSERT(tdq, MA_OWNED|MA_NOTRECURSED);
1829	MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
1830	td->td_oncpu = cpuid;
1831}
1832
1833/*
1834 * Adjust thread priorities as a result of a nice request.
1835 */
1836void
1837sched_nice(struct proc *p, int nice)
1838{
1839	struct thread *td;
1840
1841	PROC_LOCK_ASSERT(p, MA_OWNED);
1842	PROC_SLOCK_ASSERT(p, MA_OWNED);
1843
1844	p->p_nice = nice;
1845	FOREACH_THREAD_IN_PROC(p, td) {
1846		thread_lock(td);
1847		sched_priority(td);
1848		sched_prio(td, td->td_base_user_pri);
1849		thread_unlock(td);
1850	}
1851}
1852
1853/*
1854 * Record the sleep time for the interactivity scorer.
1855 */
1856void
1857sched_sleep(struct thread *td)
1858{
1859
1860	THREAD_LOCK_ASSERT(td, MA_OWNED);
1861
1862	td->td_slptick = ticks;
1863}
1864
1865/*
1866 * Schedule a thread to resume execution and record how long it voluntarily
1867 * slept.  We also update the pctcpu, interactivity, and priority.
1868 */
1869void
1870sched_wakeup(struct thread *td)
1871{
1872	struct td_sched *ts;
1873	int slptick;
1874
1875	THREAD_LOCK_ASSERT(td, MA_OWNED);
1876	ts = td->td_sched;
1877	/*
1878	 * If we slept for more than a tick update our interactivity and
1879	 * priority.
1880	 */
1881	slptick = td->td_slptick;
1882	td->td_slptick = 0;
1883	if (slptick && slptick != ticks) {
1884		u_int hzticks;
1885
1886		hzticks = (ticks - slptick) << SCHED_TICK_SHIFT;
1887		ts->ts_slptime += hzticks;
1888		sched_interact_update(td);
1889		sched_pctcpu_update(ts);
1890		sched_priority(td);
1891	}
1892	/* Reset the slice value after we sleep. */
1893	ts->ts_slice = sched_slice;
1894	sched_add(td, SRQ_BORING);
1895}
1896
1897/*
1898 * Penalize the parent for creating a new child and initialize the child's
1899 * priority.
1900 */
1901void
1902sched_fork(struct thread *td, struct thread *child)
1903{
1904	THREAD_LOCK_ASSERT(td, MA_OWNED);
1905	sched_fork_thread(td, child);
1906	/*
1907	 * Penalize the parent and child for forking.
1908	 */
1909	sched_interact_fork(child);
1910	sched_priority(child);
1911	td->td_sched->ts_runtime += tickincr;
1912	sched_interact_update(td);
1913	sched_priority(td);
1914}
1915
1916/*
1917 * Fork a new thread, which may be within the same process.
1918 */
1919void
1920sched_fork_thread(struct thread *td, struct thread *child)
1921{
1922	struct td_sched *ts;
1923	struct td_sched *ts2;
1924
1925	/*
1926	 * Initialize child.
1927	 */
1928	THREAD_LOCK_ASSERT(td, MA_OWNED);
1929	sched_newthread(child);
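	/* The child starts out protected by this cpu's run-queue lock. */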
1930	child->td_lock = TDQ_LOCKPTR(TDQ_SELF());
1931	child->td_cpuset = cpuset_ref(td->td_cpuset);
1932	ts = td->td_sched;
1933	ts2 = child->td_sched;
1934	ts2->ts_cpu = ts->ts_cpu;
1935	ts2->ts_runq = NULL;
1936	/*
1937	 * Grab our parent's cpu estimation information and priority.
1938	 */
1939	ts2->ts_ticks = ts->ts_ticks;
1940	ts2->ts_ltick = ts->ts_ltick;
1941	ts2->ts_ftick = ts->ts_ftick;
1942	child->td_user_pri = td->td_user_pri;
1943	child->td_base_user_pri = td->td_base_user_pri;
1944	/*
1945	 * And update interactivity score.
1946	 */
1947	ts2->ts_slptime = ts->ts_slptime;
1948	ts2->ts_runtime = ts->ts_runtime;
1949	ts2->ts_slice = 1;	/* Attempt to quickly learn interactivity. */
1950}
1951
1952/*
1953 * Adjust the priority class of a thread.
1954 */
1955void
1956sched_class(struct thread *td, int class)
1957{
1958
1959	THREAD_LOCK_ASSERT(td, MA_OWNED);
1960	if (td->td_pri_class == class)
1961		return;
1962	/*
1963	 * On SMP, if we're on the RUNQ, we must adjust the transferable
1964	 * count because we could be changing to or from an interrupt
1965	 * class.
1966	 */
1967	if (TD_ON_RUNQ(td)) {
1968		struct tdq *tdq;
1969
1970		tdq = TDQ_CPU(td->td_sched->ts_cpu);
1971		if (THREAD_CAN_MIGRATE(td))
1972			tdq->tdq_transferable--;
1973		td->td_pri_class = class;
1974		if (THREAD_CAN_MIGRATE(td))
1975			tdq->tdq_transferable++;
1976	}
1977	td->td_pri_class = class;
1978}
1979
1980/*
1981 * Return some of the child's priority and interactivity to the parent.
1982 */
1983void
1984sched_exit(struct proc *p, struct thread *child)
1985{
1986	struct thread *td;
1987
1988	CTR3(KTR_SCHED, "sched_exit: %p(%s) prio %d",
1989	    child, child->td_name, child->td_priority);
1990
1991	PROC_SLOCK_ASSERT(p, MA_OWNED);
1992	td = FIRST_THREAD_IN_PROC(p);
1993	sched_exit_thread(td, child);
1994}
1995
1996/*
1997 * Penalize another thread for the time spent on this one.  This helps to
1998 * worsen the priority and interactivity of processes which schedule batch
1999 * jobs such as make.  This has little effect on the make process itself but
2000 * causes new processes spawned by it to receive worse scores immediately.
2001 */
2002void
2003sched_exit_thread(struct thread *td, struct thread *child)
2004{
2005
2006	CTR3(KTR_SCHED, "sched_exit_thread: %p(%s) prio %d",
2007	    child, child->td_name, child->td_priority);
2008
2009#ifdef KSE
2010	/*
2011	 * KSE forks and exits so often that this penalty causes short-lived
2012	 * threads to always be non-interactive.  This causes mozilla to
2013	 * crawl under load.
2014	 */
2015	if ((td->td_pflags & TDP_SA) && td->td_proc == child->td_proc)
2016		return;
2017#endif
2018	/*
2019	 * Give the child's runtime to the parent without returning the
2020	 * sleep time, as a penalty to the parent.  This causes shells that
2021	 * launch expensive things to mark their children as expensive.
2022	 */
2023	thread_lock(td);
2024	td->td_sched->ts_runtime += child->td_sched->ts_runtime;
2025	sched_interact_update(td);
2026	sched_priority(td);
2027	thread_unlock(td);
2028}
2029
2030/*
2031 * Fix priorities on return to user-space.  Priorities may be elevated due
2032 * to static priorities in msleep() or similar.
2033 */
2034void
2035sched_userret(struct thread *td)
2036{
2037	/*
2038	 * XXX we cheat slightly on the locking here to avoid locking in
2039	 * the usual case.  Setting td_priority here is essentially an
2040	 * incomplete workaround for not setting it properly elsewhere.
2041	 * Now that some interrupt handlers are threads, not setting it
2042	 * properly elsewhere can clobber it in the window between setting
2043	 * it here and returning to user mode, so don't waste time setting
2044	 * it perfectly here.
2045	 */
2046	KASSERT((td->td_flags & TDF_BORROWING) == 0,
2047	    ("thread with borrowed priority returning to userland"));
2048	if (td->td_priority != td->td_user_pri) {
2049		thread_lock(td);
2050		td->td_priority = td->td_user_pri;
2051		td->td_base_pri = td->td_user_pri;
2052		if (lowpri_userret)
2053			tdq_setlowpri(TDQ_SELF(), td);
2054		thread_unlock(td);
2055	}
2056}
2057
2058/*
2059 * Handle a stathz tick.  This is really only relevant for timeshare
2060 * threads.
2061 */
2062void
2063sched_clock(struct thread *td)
2064{
2065	struct tdq *tdq;
2066	struct td_sched *ts;
2067
2068	THREAD_LOCK_ASSERT(td, MA_OWNED);
2069	tdq = TDQ_SELF();
2070#ifdef SMP
2071	/*
2072	 * We run the long term load balancer infrequently on the first cpu.
2073	 */
2074	if (balance_tdq == tdq) {
2075		if (balance_ticks && --balance_ticks == 0)
2076			sched_balance();
2077	}
2078#endif
2079	/*
2080	 * Advance the insert index once for each tick to ensure that all
2081	 * threads get a chance to run.
2082	 */
2083	if (tdq->tdq_idx == tdq->tdq_ridx) {
2084		tdq->tdq_idx = (tdq->tdq_idx + 1) % RQ_NQS;
2085		if (TAILQ_EMPTY(&tdq->tdq_timeshare.rq_queues[tdq->tdq_ridx]))
2086			tdq->tdq_ridx = tdq->tdq_idx;
2087	}
2088	ts = td->td_sched;
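	/* FIFO-class threads are not charged interactivity or slices here. */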
2089	if (td->td_pri_class & PRI_FIFO_BIT)
2090		return;
2091	if (td->td_pri_class == PRI_TIMESHARE) {
2092		/*
2093		 * We used a tick; charge it to the thread so
2094		 * that we can compute our interactivity.
2095		 */
2096		td->td_sched->ts_runtime += tickincr;
2097		sched_interact_update(td);
2098	}
2099	/*
2100	 * We used up one time slice.
2101	 */
2102	if (--ts->ts_slice > 0)
2103		return;
2104	/*
2105	 * We're out of time, recompute priorities and requeue.
2106	 */
2107	sched_priority(td);
2108	td->td_flags |= TDF_NEEDRESCHED;
2109}
2110
2111/*
2112 * Called once per hz tick.  Used for cpu utilization information.  This
2113 * is easier than trying to scale based on stathz.
2114 */
2115void
2116sched_tick(void)
2117{
2118	struct td_sched *ts;
2119
2120	ts = curthread->td_sched;
2121	/* Adjust ticks for pctcpu */
2122	ts->ts_ticks += 1 << SCHED_TICK_SHIFT;
2123	ts->ts_ltick = ticks;
2124	/*
2125	 * Update if we've exceeded our desired tick threshold by over one
2126	 * second.
2127	 */
2128	if (ts->ts_ftick + SCHED_TICK_MAX < ts->ts_ltick)
2129		sched_pctcpu_update(ts);
2130}
2131
2132/*
2133 * Return whether the current CPU has runnable tasks.  Used for in-kernel
2134 * cooperative idle threads.
2135 */
2136int
2137sched_runnable(void)
2138{
2139	struct tdq *tdq;
2140	int load;
2141
2142	load = 1;
2143
2144	tdq = TDQ_SELF();
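	/*
	 * A non-idle curthread accounts for one unit of tdq_load, so
	 * discount it when checking for other runnable threads.
	 */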
2145	if ((curthread->td_flags & TDF_IDLETD) != 0) {
2146		if (tdq->tdq_load > 0)
2147			goto out;
2148	} else
2149		if (tdq->tdq_load - 1 > 0)
2150			goto out;
2151	load = 0;
2152out:
2153	return (load);
2154}
2155
2156/*
2157 * Choose the highest-priority thread to run.  The thread is removed from
2158 * the run-queue while running; however, the load remains.  For SMP we set
2159 * the tdq in the global idle bitmask if it idles here.
2160 */
2161struct thread *
2162sched_choose(void)
2163{
2164	struct td_sched *ts;
2165	struct tdq *tdq;
2166
2167	tdq = TDQ_SELF();
2168	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
2169	ts = tdq_choose(tdq);
2170	if (ts) {
2171		tdq_runq_rem(tdq, ts);
2172		return (ts->ts_thread);
2173	}
2174	return (PCPU_GET(idlethread));
2175}
2176
2177/*
2178 * Set owepreempt if necessary.  Preemption never happens directly in ULE;
2179 * we always request it, and it takes effect once we exit a critical section.
2180 */
2181static inline void
2182sched_setpreempt(struct thread *td)
2183{
2184	struct thread *ctd;
2185	int cpri;
2186	int pri;
2187
2188	ctd = curthread;
2189	pri = td->td_priority;
2190	cpri = ctd->td_priority;
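	/* Lower numeric priority wins; ask curthread to reschedule if beaten. */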
2191	if (pri < cpri)
2192		ctd->td_flags |= TDF_NEEDRESCHED;
2193	if (panicstr != NULL || pri >= cpri || cold || TD_IS_INHIBITED(ctd))
2194		return;
2195	/*
2196	 * Always preempt IDLE threads.  Otherwise only if the preempting
2197	 * thread is an ithread.
2198	 */
2199	if (pri > preempt_thresh && cpri < PRI_MIN_IDLE)
2200		return;
2201	ctd->td_owepreempt = 1;
2202	return;
2203}
2204
2205/*
2206 * Add a thread to a thread queue.  Initializes the priority, slice, and
2207 * runq, and adds the thread to the appropriate queue.  This is the internal
2208 * function called when the tdq is predetermined.
2209 */
2210void
2211tdq_add(struct tdq *tdq, struct thread *td, int flags)
2212{
2213	struct td_sched *ts;
2214	int class;
2215
2216	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
2217	KASSERT((td->td_inhibitors == 0),
2218	    ("sched_add: trying to run inhibited thread"));
2219	KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
2220	    ("sched_add: bad thread state"));
2221	KASSERT(td->td_flags & TDF_INMEM,
2222	    ("sched_add: thread swapped out"));
2223
2224	ts = td->td_sched;
2225	class = PRI_BASE(td->td_pri_class);
2226	TD_SET_RUNQ(td);
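	/* Refill the time slice if the previous one was fully consumed. */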
2227	if (ts->ts_slice == 0)
2228		ts->ts_slice = sched_slice;
2229	/*
2230	 * Pick the run queue based on priority.
2231	 */
2232	if (td->td_priority <= PRI_MAX_REALTIME)
2233		ts->ts_runq = &tdq->tdq_realtime;
2234	else if (td->td_priority <= PRI_MAX_TIMESHARE)
2235		ts->ts_runq = &tdq->tdq_timeshare;
2236	else
2237		ts->ts_runq = &tdq->tdq_idle;
2238	if (td->td_priority < tdq->tdq_lowpri)
2239		tdq->tdq_lowpri = td->td_priority;
2240	tdq_runq_add(tdq, ts, flags);
2241	tdq_load_add(tdq, ts);
2242}
2243
2244/*
2245 * Select the target thread queue and add a thread to it.  Request
2246 * preemption or IPI a remote processor if required.
2247 */
2248void
2249sched_add(struct thread *td, int flags)
2250{
2251	struct td_sched *ts;
2252	struct tdq *tdq;
2253#ifdef SMP
2254	int cpuid;
2255	int cpu;
2256#endif
2257	CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)",
2258	    td, td->td_name, td->td_priority, curthread,
2259	    curthread->td_name);
2260	THREAD_LOCK_ASSERT(td, MA_OWNED);
2261	ts = td->td_sched;
2262	/*
2263	 * Recalculate the priority before we select the target cpu or
2264	 * run-queue.
2265	 */
2266	if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE)
2267		sched_priority(td);
2268#ifdef SMP
2269	cpuid = PCPU_GET(cpuid);
2270	/*
2271	 * Pick the destination cpu, and if it isn't ours, transfer the thread
2272	 * to the target cpu.
2273	 */
2274	cpu = sched_pickcpu(ts, flags);
2275	tdq = sched_setcpu(ts, cpu, flags);
2276	tdq_add(tdq, td, flags);
2277	if (cpu != cpuid) {
2278		tdq_notify(ts);
2279		return;
2280	}
2281#else
2282	tdq = TDQ_SELF();
2283	TDQ_LOCK(tdq);
2284	/*
2285	 * Now that the thread is moving to the run-queue, set the lock
2286	 * to the scheduler's lock.
2287	 */
2288	thread_lock_set(td, TDQ_LOCKPTR(tdq));
2289	tdq_add(tdq, td, flags);
2290#endif
2291	if (!(flags & SRQ_YIELDING))
2292		sched_setpreempt(td);
2293}
2294
2295/*
2296 * Remove a thread from a run-queue without running it.  This is used
2297 * when we're stealing a thread from a remote queue.  Otherwise all threads
2298 * exit by calling sched_exit_thread() and sched_throw() themselves.
2299 */
2300void
2301sched_rem(struct thread *td)
2302{
2303	struct tdq *tdq;
2304	struct td_sched *ts;
2305
2306	CTR5(KTR_SCHED, "sched_rem: %p(%s) prio %d by %p(%s)",
2307	    td, td->td_name, td->td_priority, curthread,
2308	    curthread->td_name);
2309	ts = td->td_sched;
2310	tdq = TDQ_CPU(ts->ts_cpu);
2311	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
2312	MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
2313	KASSERT(TD_ON_RUNQ(td),
2314	    ("sched_rem: thread not on run queue"));
2315	tdq_runq_rem(tdq, ts);
2316	tdq_load_rem(tdq, ts);
2317	TD_SET_CAN_RUN(td);
2318	if (td->td_priority == tdq->tdq_lowpri)
2319		tdq_setlowpri(tdq, NULL);
2320}
2321
2322/*
2323 * Fetch cpu utilization information.  Updates on demand.
2324 */
2325fixpt_t
2326sched_pctcpu(struct thread *td)
2327{
2328	fixpt_t pctcpu;
2329	struct td_sched *ts;
2330
2331	pctcpu = 0;
2332	ts = td->td_sched;
2333	if (ts == NULL)
2334		return (0);
2335
2336	thread_lock(td);
2337	if (ts->ts_ticks) {
2338		int rtick;
2339
2340		sched_pctcpu_update(ts);
2341		/* How many rticks per second? */
2342		rtick = min(SCHED_TICK_HZ(ts) / SCHED_TICK_SECS, hz);
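		/* Express rtick/hz as a fixpt_t fraction where FSCALE is 100%. */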
2343		pctcpu = (FSCALE * ((FSCALE * rtick)/hz)) >> FSHIFT;
2344	}
2345	thread_unlock(td);
2346
2347	return (pctcpu);
2348}
2349
2350/*
2351 * Enforce affinity settings for a thread.  Called after adjustments to
2352 * cpumask.
2353 */
2354void
2355sched_affinity(struct thread *td)
2356{
2357#ifdef SMP
2358	struct td_sched *ts;
2359	int cpu;
2360
2361	THREAD_LOCK_ASSERT(td, MA_OWNED);
2362	ts = td->td_sched;
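	/*
	 * Nothing to do if the thread may still run on its current cpu or
	 * if it is not currently running.
	 */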
2363	if (THREAD_CAN_SCHED(td, ts->ts_cpu))
2364		return;
2365	if (!TD_IS_RUNNING(td))
2366		return;
2367	td->td_flags |= TDF_NEEDRESCHED;
2368	if (!THREAD_CAN_MIGRATE(td))
2369		return;
2370	/*
2371	 * Assign the new cpu and force a switch before returning to
2372	 * userspace.  If the target thread is not running locally, send
2373	 * an ipi to force the issue.
2374	 */
2375	cpu = ts->ts_cpu;
2376	ts->ts_cpu = sched_pickcpu(ts, 0);
2377	if (cpu != PCPU_GET(cpuid))
2378		ipi_selected(1 << cpu, IPI_PREEMPT);
2379#endif
2380}
2381
2382/*
2383 * Bind a thread to a target cpu.
2384 */
2385void
2386sched_bind(struct thread *td, int cpu)
2387{
2388	struct td_sched *ts;
2389
2390	THREAD_LOCK_ASSERT(td, MA_OWNED|MA_NOTRECURSED);
2391	ts = td->td_sched;
2392	if (ts->ts_flags & TSF_BOUND)
2393		sched_unbind(td);
2394	ts->ts_flags |= TSF_BOUND;
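	/* Pin so the thread cannot migrate while the binding is in effect. */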
2395	sched_pin();
2396	if (PCPU_GET(cpuid) == cpu)
2397		return;
2398	ts->ts_cpu = cpu;
2399	/* When we return from mi_switch we'll be on the correct cpu. */
2400	mi_switch(SW_VOL, NULL);
2401}
2402
2403/*
2404 * Release a bound thread.
2405 */
2406void
2407sched_unbind(struct thread *td)
2408{
2409	struct td_sched *ts;
2410
2411	THREAD_LOCK_ASSERT(td, MA_OWNED);
2412	ts = td->td_sched;
2413	if ((ts->ts_flags & TSF_BOUND) == 0)
2414		return;
2415	ts->ts_flags &= ~TSF_BOUND;
2416	sched_unpin();
2417}
2418
2419int
2420sched_is_bound(struct thread *td)
2421{
2422	THREAD_LOCK_ASSERT(td, MA_OWNED);
2423	return (td->td_sched->ts_flags & TSF_BOUND);
2424}
2425
2426/*
2427 * Basic yield call.
2428 */
2429void
2430sched_relinquish(struct thread *td)
2431{
2432	thread_lock(td);
2433	SCHED_STAT_INC(switch_relinquish);
2434	mi_switch(SW_VOL, NULL);
2435	thread_unlock(td);
2436}
2437
2438/*
2439 * Return the total system load.
2440 */
2441int
2442sched_load(void)
2443{
2444#ifdef SMP
2445	int total;
2446	int i;
2447
2448	total = 0;
2449	for (i = 0; i <= mp_maxid; i++)
2450		total += TDQ_CPU(i)->tdq_sysload;
2451	return (total);
2452#else
2453	return (TDQ_SELF()->tdq_sysload);
2454#endif
2455}
2456
2457int
2458sched_sizeof_proc(void)
2459{
2460	return (sizeof(struct proc));
2461}
2462
2463int
2464sched_sizeof_thread(void)
2465{
2466	return (sizeof(struct thread) + sizeof(struct td_sched));
2467}
2468
2469/*
2470 * The actual idle process.
2471 */
2472void
2473sched_idletd(void *dummy)
2474{
2475	struct thread *td;
2476	struct tdq *tdq;
2477
2478	td = curthread;
2479	tdq = TDQ_SELF();
2480	mtx_assert(&Giant, MA_NOTOWNED);
2481	/* ULE relies on preemption for idle interruption. */
2482	for (;;) {
2483#ifdef SMP
2484		if (tdq_idled(tdq))
2485			cpu_idle();
2486#else
2487		cpu_idle();
2488#endif
2489	}
2490}
2491
2492/*
2493 * A CPU is entering the scheduler for the first time or a thread is exiting.
2494 */
2495void
2496sched_throw(struct thread *td)
2497{
2498	struct thread *newtd;
2499	struct tdq *tdq;
2500
2501	tdq = TDQ_SELF();
2502	if (td == NULL) {
2503		/* Correct spinlock nesting and acquire the correct lock. */
2504		TDQ_LOCK(tdq);
2505		spinlock_exit();
2506	} else {
2507		MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
2508		tdq_load_rem(tdq, td->td_sched);
2509		lock_profile_release_lock(&TDQ_LOCKPTR(tdq)->lock_object);
2510	}
2511	KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count"));
2512	newtd = choosethread();
2513	TDQ_LOCKPTR(tdq)->mtx_lock = (uintptr_t)newtd;
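	/* Start a fresh run-time accounting interval for the incoming thread. */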
2514	PCPU_SET(switchtime, cpu_ticks());
2515	PCPU_SET(switchticks, ticks);
2516	cpu_throw(td, newtd);		/* doesn't return */
2517}
2518
2519/*
2520 * This is called from fork_exit().  Just acquire the correct locks and
2521 * let fork do the rest of the work.
2522 */
2523void
2524sched_fork_exit(struct thread *td)
2525{
2526	struct td_sched *ts;
2527	struct tdq *tdq;
2528	int cpuid;
2529
2530	/*
2531	 * Finish setting up thread glue so that it begins execution in a
2532	 * non-nested critical section with the scheduler lock held.
2533	 */
2534	cpuid = PCPU_GET(cpuid);
2535	tdq = TDQ_CPU(cpuid);
2536	ts = td->td_sched;
2537	if (TD_IS_IDLETHREAD(td))
2538		td->td_lock = TDQ_LOCKPTR(tdq);
2539	MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
2540	td->td_oncpu = cpuid;
2541	TDQ_LOCK_ASSERT(tdq, MA_OWNED | MA_NOTRECURSED);
2542	lock_profile_obtain_lock_success(
2543	    &TDQ_LOCKPTR(tdq)->lock_object, 0, 0, __FILE__, __LINE__);
2544	tdq->tdq_lowpri = td->td_priority;
2545}
2546
2547static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0,
2548    "Scheduler");
2549SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ULE", 0,
2550    "Scheduler name");
2551SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0,
2552    "Slice size for timeshare threads");
2553SYSCTL_INT(_kern_sched, OID_AUTO, interact, CTLFLAG_RW, &sched_interact, 0,
2554     "Interactivity score threshold");
2555SYSCTL_INT(_kern_sched, OID_AUTO, preempt_thresh, CTLFLAG_RW, &preempt_thresh,
2556     0, "Min priority for preemption, lower priorities have greater precedence");
2557#ifdef SMP
2558SYSCTL_INT(_kern_sched, OID_AUTO, pick_pri, CTLFLAG_RW, &pick_pri, 0,
2559    "Pick the target cpu based on priority rather than load.");
2560SYSCTL_INT(_kern_sched, OID_AUTO, affinity, CTLFLAG_RW, &affinity, 0,
2561    "Number of hz ticks to keep thread affinity for");
2562SYSCTL_INT(_kern_sched, OID_AUTO, tryself, CTLFLAG_RW, &tryself, 0, "");
2563SYSCTL_INT(_kern_sched, OID_AUTO, userret, CTLFLAG_RW, &lowpri_userret, 0, "");
2564SYSCTL_INT(_kern_sched, OID_AUTO, oldtryself, CTLFLAG_RW, &oldtryself, 0, "");
2565SYSCTL_INT(_kern_sched, OID_AUTO, balance, CTLFLAG_RW, &rebalance, 0,
2566    "Enables the long-term load balancer");
2567SYSCTL_INT(_kern_sched, OID_AUTO, balance_interval, CTLFLAG_RW,
2568    &balance_interval, 0,
2569    "Average frequency in stathz ticks to run the long-term balancer");
2570SYSCTL_INT(_kern_sched, OID_AUTO, steal_htt, CTLFLAG_RW, &steal_htt, 0,
2571    "Steals work from another hyper-threaded core on idle");
2572SYSCTL_INT(_kern_sched, OID_AUTO, steal_idle, CTLFLAG_RW, &steal_idle, 0,
2573    "Attempts to steal work from other cores before idling");
2574SYSCTL_INT(_kern_sched, OID_AUTO, steal_thresh, CTLFLAG_RW, &steal_thresh, 0,
2575    "Minimum load on remote cpu before we'll steal");
2576#endif
2577
2578/* ps compat.  All cpu percentages from ULE are weighted. */
2579static int ccpu = 0;
2580SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
2581
2582
2583#define KERN_SWITCH_INCLUDE 1
2584#include "kern/kern_switch.c"
2585