sched_ule.c revision 177009
1/*-
2 * Copyright (c) 2002-2007, Jeffrey Roberson <jeff@freebsd.org>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice unmodified, this list of conditions, and the following
10 *    disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
19 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27/*
28 * This file implements the ULE scheduler.  ULE supports independent CPU
29 * run queues and fine-grained locking.  It has superior interactive
30 * performance under load even on uniprocessor systems.
31 *
32 * etymology:
33 *   ULE is the last three letters in schedule.  It owes its name to a
34 * generic user created for a scheduling system by Paul Mikesell at
35 * Isilon Systems and a general lack of creativity on the part of the author.
36 */
37
38#include <sys/cdefs.h>
39__FBSDID("$FreeBSD: head/sys/kern/sched_ule.c 177009 2008-03-10 03:15:19Z jeff $");
40
41#include "opt_hwpmc_hooks.h"
42#include "opt_sched.h"
43
44#include <sys/param.h>
45#include <sys/systm.h>
46#include <sys/kdb.h>
47#include <sys/kernel.h>
48#include <sys/ktr.h>
49#include <sys/lock.h>
50#include <sys/mutex.h>
51#include <sys/proc.h>
52#include <sys/resource.h>
53#include <sys/resourcevar.h>
54#include <sys/sched.h>
55#include <sys/smp.h>
56#include <sys/sx.h>
57#include <sys/sysctl.h>
58#include <sys/sysproto.h>
59#include <sys/turnstile.h>
60#include <sys/umtx.h>
61#include <sys/vmmeter.h>
62#include <sys/cpuset.h>
63#ifdef KTRACE
64#include <sys/uio.h>
65#include <sys/ktrace.h>
66#endif
67
68#ifdef HWPMC_HOOKS
69#include <sys/pmckern.h>
70#endif
71
72#include <machine/cpu.h>
73#include <machine/smp.h>
74
75#if !defined(__i386__) && !defined(__amd64__) && !defined(__powerpc__) && !defined(__arm__)
76#error "This architecture is not currently compatible with ULE"
77#endif
78
79#define	KTR_ULE	0
80
81/*
82 * Thread scheduler specific section.  All fields are protected
83 * by the thread lock.
84 */
85struct td_sched {
86	TAILQ_ENTRY(td_sched) ts_procq;	/* Run queue. */
87	struct thread	*ts_thread;	/* Active associated thread. */
88	struct runq	*ts_runq;	/* Run-queue we're queued on. */
89	short		ts_flags;	/* TSF_* flags. */
90	u_char		ts_rqindex;	/* Run queue index. */
91	u_char		ts_cpu;		/* CPU that we have affinity for. */
92	int		ts_rltick;	/* Real last tick, for affinity. */
93	int		ts_slice;	/* Ticks of slice remaining. */
94	u_int		ts_slptime;	/* Number of ticks we vol. slept */
95	u_int		ts_runtime;	/* Number of ticks we were running */
96	int		ts_ltick;	/* Last tick that we were running on */
97	int		ts_ftick;	/* First tick that we were running on */
98	int		ts_ticks;	/* Tick count */
99};
100/* flags kept in ts_flags */
101#define	TSF_BOUND	0x0001		/* Thread can not migrate. */
102#define	TSF_XFERABLE	0x0002		/* Thread was added as transferable. */
103
104static struct td_sched td_sched0;
105
106#define	THREAD_CAN_MIGRATE(td)	((td)->td_pinned == 0)
107#define	THREAD_CAN_SCHED(td, cpu)	\
108    CPU_ISSET((cpu), &(td)->td_cpuset->cs_mask)
109
110/*
111 * Cpu percentage computation macros and defines.
112 *
113 * SCHED_TICK_SECS:	Number of seconds to average the cpu usage across.
114 * SCHED_TICK_TARG:	Number of hz ticks to average the cpu usage across.
115 * SCHED_TICK_MAX:	Maximum number of ticks before scaling back.
116 * SCHED_TICK_SHIFT:	Shift factor to avoid rounding away results.
117 * SCHED_TICK_HZ:	Compute the number of hz ticks for a given ticks count.
118 * SCHED_TICK_TOTAL:	Gives the amount of time we've been recording ticks.
119 */
120#define	SCHED_TICK_SECS		10
121#define	SCHED_TICK_TARG		(hz * SCHED_TICK_SECS)
122#define	SCHED_TICK_MAX		(SCHED_TICK_TARG + hz)
123#define	SCHED_TICK_SHIFT	10
124#define	SCHED_TICK_HZ(ts)	((ts)->ts_ticks >> SCHED_TICK_SHIFT)
125#define	SCHED_TICK_TOTAL(ts)	(max((ts)->ts_ltick - (ts)->ts_ftick, hz))
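
/*
 * Worked example (illustrative only; hz = 1000 is an assumed value, not a
 * requirement):  a thread that was on cpu for the whole of the last five
 * seconds has ts_ltick - ts_ftick ~= 5000 and ts_ticks ~= 5000 <<
 * SCHED_TICK_SHIFT, so SCHED_TICK_HZ() yields ~5000 while SCHED_TICK_TOTAL()
 * yields max(5000, hz) = 5000, i.e. the thread consumed essentially all of
 * its recorded window.  The shift keeps the per-tick increments from
 * rounding to zero when stathz and hz differ.
 */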
126
127/*
128 * These macros determine priorities for non-interactive threads.  They are
129 * assigned a priority based on their recent cpu utilization as expressed
130 * by the ratio of ticks to the tick total.  NHALF priorities at the start
131 * and end of the MIN to MAX timeshare range are only reachable with negative
132 * or positive nice respectively.
133 *
134 * PRI_RANGE:	Priority range for utilization dependent priorities.
135 * PRI_NRESV:	Number of nice values.
136 * PRI_TICKS:	Compute a priority in PRI_RANGE from the ticks count and total.
137 * PRI_NICE:	Determines the part of the priority inherited from nice.
138 */
139#define	SCHED_PRI_NRESV		(PRIO_MAX - PRIO_MIN)
140#define	SCHED_PRI_NHALF		(SCHED_PRI_NRESV / 2)
141#define	SCHED_PRI_MIN		(PRI_MIN_TIMESHARE + SCHED_PRI_NHALF)
142#define	SCHED_PRI_MAX		(PRI_MAX_TIMESHARE - SCHED_PRI_NHALF)
143#define	SCHED_PRI_RANGE		(SCHED_PRI_MAX - SCHED_PRI_MIN)
144#define	SCHED_PRI_TICKS(ts)						\
145    (SCHED_TICK_HZ((ts)) /						\
146    (roundup(SCHED_TICK_TOTAL((ts)), SCHED_PRI_RANGE) / SCHED_PRI_RANGE))
147#define	SCHED_PRI_NICE(nice)	(nice)
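
/*
 * Worked example (symbolic, using PRIO_MIN = -20 and PRIO_MAX = 20 from
 * <sys/resource.h>):  SCHED_PRI_NRESV is 40 and SCHED_PRI_NHALF is 20, so
 * 20 slots at each end of the timeshare band are reserved for nice.  A
 * thread that was on cpu for about half of its recorded window has
 * SCHED_TICK_HZ() near half of SCHED_TICK_TOTAL(), so SCHED_PRI_TICKS()
 * lands near the middle of SCHED_PRI_RANGE; SCHED_PRI_NICE() then shifts
 * the result by the nice value, up to +/-20 slots.
 */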
148
149/*
150 * These determine the interactivity of a process.  Interactivity differs from
151 * cpu utilization in that it expresses the voluntary time slept vs time ran
152 * while cpu utilization includes all time not running.  This more accurately
153 * models the intent of the thread.
154 *
155 * SLP_RUN_MAX:	Maximum amount of sleep time + run time we'll accumulate
156 *		before throttling back.
157 * SLP_RUN_FORK:	Maximum slp+run time to inherit at fork time.
158 * INTERACT_MAX:	Maximum interactivity value.  Smaller is better.
159 * INTERACT_THRESH:	Threshold for placement on the current runq.
160 */
161#define	SCHED_SLP_RUN_MAX	((hz * 5) << SCHED_TICK_SHIFT)
162#define	SCHED_SLP_RUN_FORK	((hz / 2) << SCHED_TICK_SHIFT)
163#define	SCHED_INTERACT_MAX	(100)
164#define	SCHED_INTERACT_HALF	(SCHED_INTERACT_MAX / 2)
165#define	SCHED_INTERACT_THRESH	(30)
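
/*
 * Illustration (hz = 1000 assumed purely for the arithmetic):
 * SCHED_SLP_RUN_MAX corresponds to five seconds of combined sleep + run
 * history and SCHED_SLP_RUN_FORK to half a second, both kept in the same
 * shifted units as ts_slptime and ts_runtime.  A score below
 * SCHED_INTERACT_THRESH (30 out of 100) marks a thread as interactive.
 */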
166
167/*
168 * tickincr:		Converts a stathz tick into a hz domain scaled by
169 *			the shift factor.  Without the shift the error rate
170 *			due to rounding would be unacceptably high.
171 * realstathz:		stathz is sometimes 0; when it is, we run off of hz instead.
172 * sched_slice:		Runtime of each thread before rescheduling.
173 * preempt_thresh:	Priority threshold for preemption and remote IPIs.
174 */
175static int sched_interact = SCHED_INTERACT_THRESH;
176static int realstathz;
177static int tickincr;
178static int sched_slice = 1;
179#ifdef PREEMPTION
180#ifdef FULL_PREEMPTION
181static int preempt_thresh = PRI_MAX_IDLE;
182#else
183static int preempt_thresh = PRI_MIN_KERN;
184#endif
185#else
186static int preempt_thresh = 0;
187#endif
188
189/*
190 * tdq - per processor runqs and statistics.  All fields are protected by the
191 * tdq_lock.  The load and lowpri may be accessed without the lock held to
192 * avoid excess locking in sched_pickcpu().
193 */
194struct tdq {
195	/* Ordered to improve efficiency of cpu_search() and switch(). */
196	struct mtx	tdq_lock;		/* run queue lock. */
197	struct cpu_group *tdq_cg;		/* Pointer to cpu topology. */
198	int		tdq_load;		/* Aggregate load. */
199	int		tdq_sysload;		/* For loadavg, !ITHD load. */
200	int		tdq_transferable;	/* Transferable thread count. */
201	u_char		tdq_lowpri;		/* Lowest priority thread. */
202	u_char		tdq_ipipending;		/* IPI pending. */
203	u_char		tdq_idx;		/* Current insert index. */
204	u_char		tdq_ridx;		/* Current removal index. */
205	struct runq	tdq_realtime;		/* real-time run queue. */
206	struct runq	tdq_timeshare;		/* timeshare run queue. */
207	struct runq	tdq_idle;		/* Queue of IDLE threads. */
208	char		tdq_name[sizeof("sched lock") + 6];
209} __aligned(64);
210
211
212#ifdef SMP
213struct cpu_group *cpu_top;
214
215#define	SCHED_AFFINITY_DEFAULT	(max(1, hz / 1000))
216#define	SCHED_AFFINITY(ts, t)	((ts)->ts_rltick > ticks - ((t) * affinity))
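
/*
 * Example (hz = 1000 assumed):  SCHED_AFFINITY_DEFAULT evaluates to 1, so
 * SCHED_AFFINITY(ts, t) considers a thread still cache-warm at topology
 * level t if it last ran within roughly t milliseconds; the window simply
 * scales linearly with the level passed in.
 */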
217
218/*
219 * Run-time tunables.
220 */
221static int rebalance = 1;
222static int balance_interval = 128;	/* Default set in sched_initticks(). */
223static int affinity;
224static int steal_htt = 1;
225static int steal_idle = 1;
226static int steal_thresh = 2;
227
228/*
229 * One thread queue per processor.
230 */
231static struct tdq	tdq_cpu[MAXCPU];
232static struct tdq	*balance_tdq;
233static int balance_ticks;
234
235#define	TDQ_SELF()	(&tdq_cpu[PCPU_GET(cpuid)])
236#define	TDQ_CPU(x)	(&tdq_cpu[(x)])
237#define	TDQ_ID(x)	((int)((x) - tdq_cpu))
238#else	/* !SMP */
239static struct tdq	tdq_cpu;
240
241#define	TDQ_ID(x)	(0)
242#define	TDQ_SELF()	(&tdq_cpu)
243#define	TDQ_CPU(x)	(&tdq_cpu)
244#endif
245
246#define	TDQ_LOCK_ASSERT(t, type)	mtx_assert(TDQ_LOCKPTR((t)), (type))
247#define	TDQ_LOCK(t)		mtx_lock_spin(TDQ_LOCKPTR((t)))
248#define	TDQ_LOCK_FLAGS(t, f)	mtx_lock_spin_flags(TDQ_LOCKPTR((t)), (f))
249#define	TDQ_UNLOCK(t)		mtx_unlock_spin(TDQ_LOCKPTR((t)))
250#define	TDQ_LOCKPTR(t)		(&(t)->tdq_lock)
251
252static void sched_priority(struct thread *);
253static void sched_thread_priority(struct thread *, u_char);
254static int sched_interact_score(struct thread *);
255static void sched_interact_update(struct thread *);
256static void sched_interact_fork(struct thread *);
257static void sched_pctcpu_update(struct td_sched *);
258
259/* Operations on per processor queues */
260static struct td_sched * tdq_choose(struct tdq *);
261static void tdq_setup(struct tdq *);
262static void tdq_load_add(struct tdq *, struct td_sched *);
263static void tdq_load_rem(struct tdq *, struct td_sched *);
264static __inline void tdq_runq_add(struct tdq *, struct td_sched *, int);
265static __inline void tdq_runq_rem(struct tdq *, struct td_sched *);
266static inline int sched_shouldpreempt(int, int, int);
267void tdq_print(int cpu);
268static void runq_print(struct runq *rq);
269static void tdq_add(struct tdq *, struct thread *, int);
270#ifdef SMP
271static int tdq_move(struct tdq *, struct tdq *);
272static int tdq_idled(struct tdq *);
273static void tdq_notify(struct tdq *, struct td_sched *);
274static struct td_sched *tdq_steal(struct tdq *, int);
275static struct td_sched *runq_steal(struct runq *, int);
276static int sched_pickcpu(struct td_sched *, int);
277static void sched_balance(void);
278static int sched_balance_pair(struct tdq *, struct tdq *);
279static inline struct tdq *sched_setcpu(struct td_sched *, int, int);
280static inline struct mtx *thread_block_switch(struct thread *);
281static inline void thread_unblock_switch(struct thread *, struct mtx *);
282static struct mtx *sched_switch_migrate(struct tdq *, struct thread *, int);
283#endif
284
285static void sched_setup(void *dummy);
286SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL)
287
288static void sched_initticks(void *dummy);
289SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks, NULL)
290
291/*
292 * Print the threads waiting on a run-queue.
293 */
294static void
295runq_print(struct runq *rq)
296{
297	struct rqhead *rqh;
298	struct td_sched *ts;
299	int pri;
300	int j;
301	int i;
302
303	for (i = 0; i < RQB_LEN; i++) {
304		printf("\t\trunq bits %d 0x%zx\n",
305		    i, rq->rq_status.rqb_bits[i]);
306		for (j = 0; j < RQB_BPW; j++)
307			if (rq->rq_status.rqb_bits[i] & (1ul << j)) {
308				pri = j + (i << RQB_L2BPW);
309				rqh = &rq->rq_queues[pri];
310				TAILQ_FOREACH(ts, rqh, ts_procq) {
311					printf("\t\t\ttd %p(%s) priority %d rqindex %d pri %d\n",
312					    ts->ts_thread, ts->ts_thread->td_name, ts->ts_thread->td_priority, ts->ts_rqindex, pri);
313				}
314			}
315	}
316}
317
318/*
319 * Print the status of a per-cpu thread queue.  Should be a ddb show cmd.
320 */
321void
322tdq_print(int cpu)
323{
324	struct tdq *tdq;
325
326	tdq = TDQ_CPU(cpu);
327
328	printf("tdq %d:\n", TDQ_ID(tdq));
329	printf("\tlock            %p\n", TDQ_LOCKPTR(tdq));
330	printf("\tLock name:      %s\n", tdq->tdq_name);
331	printf("\tload:           %d\n", tdq->tdq_load);
332	printf("\ttimeshare idx:  %d\n", tdq->tdq_idx);
333	printf("\ttimeshare ridx: %d\n", tdq->tdq_ridx);
334	printf("\trealtime runq:\n");
335	runq_print(&tdq->tdq_realtime);
336	printf("\ttimeshare runq:\n");
337	runq_print(&tdq->tdq_timeshare);
338	printf("\tidle runq:\n");
339	runq_print(&tdq->tdq_idle);
340	printf("\tload transferable: %d\n", tdq->tdq_transferable);
341	printf("\tlowest priority:   %d\n", tdq->tdq_lowpri);
342}
343
344static inline int
345sched_shouldpreempt(int pri, int cpri, int remote)
346{
347	/*
348	 * If the new priority is not better than the current priority there is
349	 * nothing to do.
350	 */
351	if (pri >= cpri)
352		return (0);
353	/*
354	 * Always preempt idle.
355	 */
356	if (cpri >= PRI_MIN_IDLE)
357		return (1);
358	/*
359	 * If preemption is disabled don't preempt others.
360	 */
361	if (preempt_thresh == 0)
362		return (0);
363	/*
364	 * Preempt if we exceed the threshold.
365	 */
366	if (pri <= preempt_thresh)
367		return (1);
368	/*
369	 * If we're realtime or better and there is timeshare or worse running
370	 * preempt only remote processors.
371	 */
372	if (remote && pri <= PRI_MAX_REALTIME && cpri > PRI_MAX_REALTIME)
373		return (1);
374	return (0);
375}
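
/*
 * Example outcomes for sched_shouldpreempt() (when preempt_thresh is
 * PRI_MIN_KERN, the PREEMPTION default above):  an idle cpu is always
 * preempted; an arriving thread at or above interrupt/top kernel priority
 * (pri <= preempt_thresh) preempts a running timeshare thread; a timeshare
 * thread arriving while another timeshare thread runs does not preempt and
 * simply waits on the run queue; and a realtime thread arriving for a
 * remote cpu that is running timeshare work still triggers an IPI even
 * though its priority is above preempt_thresh.
 */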
376
377#define	TS_RQ_PPQ	(((PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE) + 1) / RQ_NQS)
378/*
379 * Add a thread to the actual run-queue.  Keeps transferable counts up to
380 * date with what is actually on the run-queue.  Selects the correct
381 * queue position for timeshare threads.
382 */
383static __inline void
384tdq_runq_add(struct tdq *tdq, struct td_sched *ts, int flags)
385{
386	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
387	THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED);
388
389	TD_SET_RUNQ(ts->ts_thread);
390	if (THREAD_CAN_MIGRATE(ts->ts_thread)) {
391		tdq->tdq_transferable++;
392		ts->ts_flags |= TSF_XFERABLE;
393	}
394	if (ts->ts_runq == &tdq->tdq_timeshare) {
395		u_char pri;
396
397		pri = ts->ts_thread->td_priority;
398		KASSERT(pri <= PRI_MAX_TIMESHARE && pri >= PRI_MIN_TIMESHARE,
399			("Invalid priority %d on timeshare runq", pri));
400		/*
401		 * This queue contains only priorities between MIN and MAX
402		 * timeshare.  Use the whole queue to represent these values.
403		 */
404		if ((flags & (SRQ_BORROWING|SRQ_PREEMPTED)) == 0) {
405			pri = (pri - PRI_MIN_TIMESHARE) / TS_RQ_PPQ;
406			pri = (pri + tdq->tdq_idx) % RQ_NQS;
407			/*
408			 * This effectively shortens the queue by one so we
409			 * can have a one slot difference between idx and
410			 * ridx while we wait for threads to drain.
411			 */
412			if (tdq->tdq_ridx != tdq->tdq_idx &&
413			    pri == tdq->tdq_ridx)
414				pri = (unsigned char)(pri - 1) % RQ_NQS;
415		} else
416			pri = tdq->tdq_ridx;
417		runq_add_pri(ts->ts_runq, ts, pri, flags);
418	} else
419		runq_add(ts->ts_runq, ts, flags);
420}
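
/*
 * Illustrative example of the timeshare insert above (numbers hypothetical):
 * with tdq_idx = 10, tdq_ridx = 7 and a thread whose scaled priority offset
 * within the timeshare band is 5, the insert bucket is (5 + 10) % RQ_NQS =
 * 15.  Threads with numerically worse priority land further ahead of
 * tdq_ridx, so the rotating removal index reaches them later; the final
 * adjustment only applies when the computed bucket collides with tdq_ridx.
 */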
421
422/*
423 * Pick the run queue based on priority.
424 */
425static __inline void
426tdq_runq_pick(struct tdq *tdq, struct td_sched *ts)
427{
428	int pri;
429
430	pri = ts->ts_thread->td_priority;
431	if (pri <= PRI_MAX_REALTIME)
432		ts->ts_runq = &tdq->tdq_realtime;
433	else if (pri <= PRI_MAX_TIMESHARE)
434		ts->ts_runq = &tdq->tdq_timeshare;
435	else
436		ts->ts_runq = &tdq->tdq_idle;
437}
438
439/*
440 * Remove a thread from a run-queue.  This typically happens when a thread
441 * is selected to run.  Running threads are not on the queue and the
442 * transferable count does not reflect them.
443 */
444static __inline void
445tdq_runq_rem(struct tdq *tdq, struct td_sched *ts)
446{
447	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
448	KASSERT(ts->ts_runq != NULL,
449	    ("tdq_runq_remove: thread %p null ts_runq", ts->ts_thread));
450	if (ts->ts_flags & TSF_XFERABLE) {
451		tdq->tdq_transferable--;
452		ts->ts_flags &= ~TSF_XFERABLE;
453	}
454	if (ts->ts_runq == &tdq->tdq_timeshare) {
455		if (tdq->tdq_idx != tdq->tdq_ridx)
456			runq_remove_idx(ts->ts_runq, ts, &tdq->tdq_ridx);
457		else
458			runq_remove_idx(ts->ts_runq, ts, NULL);
459		ts->ts_ltick = ticks;
460	} else
461		runq_remove(ts->ts_runq, ts);
462}
463
464/*
465 * Load is maintained for all threads RUNNING and ON_RUNQ.  Add the load
466 * for this thread to the referenced thread queue.
467 */
468static void
469tdq_load_add(struct tdq *tdq, struct td_sched *ts)
470{
471	int class;
472
473	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
474	THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED);
475	class = PRI_BASE(ts->ts_thread->td_pri_class);
476	tdq->tdq_load++;
477	CTR2(KTR_SCHED, "cpu %d load: %d", TDQ_ID(tdq), tdq->tdq_load);
478	if (class != PRI_ITHD &&
479	    (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0)
480		tdq->tdq_sysload++;
481}
482
483/*
484 * Remove the load from a thread that is transitioning to a sleep state or
485 * exiting.
486 */
487static void
488tdq_load_rem(struct tdq *tdq, struct td_sched *ts)
489{
490	int class;
491
492	THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED);
493	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
494	class = PRI_BASE(ts->ts_thread->td_pri_class);
495	if (class != PRI_ITHD &&
496	    (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0)
497		tdq->tdq_sysload--;
498	KASSERT(tdq->tdq_load != 0,
499	    ("tdq_load_rem: Removing with 0 load on queue %d", TDQ_ID(tdq)));
500	tdq->tdq_load--;
501	CTR1(KTR_SCHED, "load: %d", tdq->tdq_load);
502	ts->ts_runq = NULL;
503}
504
505/*
506 * Set lowpri to its exact value by searching the run-queue and
507 * evaluating curthread.  curthread may be passed as an optimization.
508 */
509static void
510tdq_setlowpri(struct tdq *tdq, struct thread *ctd)
511{
512	struct td_sched *ts;
513	struct thread *td;
514
515	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
516	if (ctd == NULL)
517		ctd = pcpu_find(TDQ_ID(tdq))->pc_curthread;
518	ts = tdq_choose(tdq);
519	if (ts)
520		td = ts->ts_thread;
521	if (ts == NULL || td->td_priority > ctd->td_priority)
522		tdq->tdq_lowpri = ctd->td_priority;
523	else
524		tdq->tdq_lowpri = td->td_priority;
525}
526
527#ifdef SMP
528struct cpu_search {
529	cpumask_t cs_mask;	/* Mask of valid cpus. */
530	u_int	cs_load;
531	u_int	cs_cpu;
532	int	cs_limit;	/* Min priority for low, min load for high. */
533};
534
535#define	CPU_SEARCH_LOWEST	0x1
536#define	CPU_SEARCH_HIGHEST	0x2
537#define	CPU_SEARCH_BOTH		(CPU_SEARCH_LOWEST|CPU_SEARCH_HIGHEST)
538
539#define	CPUMASK_FOREACH(cpu, mask)				\
540	for ((cpu) = 0; (cpu) < sizeof((mask)) * 8; (cpu)++)	\
541		if ((mask) & 1 << (cpu))
542
543__inline int cpu_search(struct cpu_group *cg, struct cpu_search *low,
544    struct cpu_search *high, const int match);
545int cpu_search_lowest(struct cpu_group *cg, struct cpu_search *low);
546int cpu_search_highest(struct cpu_group *cg, struct cpu_search *high);
547int cpu_search_both(struct cpu_group *cg, struct cpu_search *low,
548    struct cpu_search *high);
549
550/*
551 * This routine compares according to the match argument and should be
552 * reduced in actual instantiations via constant propagation and dead code
553 * elimination.
554 */
555static __inline int
556cpu_compare(int cpu, struct cpu_search *low, struct cpu_search *high,
557    const int match)
558{
559	struct tdq *tdq;
560
561	tdq = TDQ_CPU(cpu);
562	if (match & CPU_SEARCH_LOWEST)
563		if (low->cs_mask & (1 << cpu) &&
564		    tdq->tdq_load < low->cs_load &&
565		    tdq->tdq_lowpri > low->cs_limit) {
566			low->cs_cpu = cpu;
567			low->cs_load = tdq->tdq_load;
568		}
569	if (match & CPU_SEARCH_HIGHEST)
570		if (high->cs_mask & (1 << cpu) &&
571		    tdq->tdq_load >= high->cs_limit &&
572		    tdq->tdq_load > high->cs_load &&
573		    tdq->tdq_transferable) {
574			high->cs_cpu = cpu;
575			high->cs_load = tdq->tdq_load;
576		}
577	return (tdq->tdq_load);
578}
579
580/*
581 * Search the tree of cpu_groups for the lowest or highest loaded cpu
582 * according to the match argument.  This routine actually compares the
583 * load on all paths through the tree and finds the least loaded cpu on
584 * the least loaded path, which may differ from the least loaded cpu in
585 * the system.  This balances work among caches and busses.
586 *
587 * This inline is instantiated in three forms below using constants for the
588 * match argument.  It is reduced to the minimum set for each case.  It is
589 * also recursive to the depth of the tree.
590 */
591static inline int
592cpu_search(struct cpu_group *cg, struct cpu_search *low,
593    struct cpu_search *high, const int match)
594{
595	int total;
596
597	total = 0;
598	if (cg->cg_children) {
599		struct cpu_search lgroup;
600		struct cpu_search hgroup;
601		struct cpu_group *child;
602		u_int lload;
603		int hload;
604		int load;
605		int i;
606
607		lload = -1;
608		hload = -1;
609		for (i = 0; i < cg->cg_children; i++) {
610			child = &cg->cg_child[i];
611			if (match & CPU_SEARCH_LOWEST) {
612				lgroup = *low;
613				lgroup.cs_load = -1;
614			}
615			if (match & CPU_SEARCH_HIGHEST) {
616				hgroup = *high;
617				hgroup.cs_load = 0;
618			}
619			switch (match) {
620			case CPU_SEARCH_LOWEST:
621				load = cpu_search_lowest(child, &lgroup);
622				break;
623			case CPU_SEARCH_HIGHEST:
624				load = cpu_search_highest(child, &hgroup);
625				break;
626			case CPU_SEARCH_BOTH:
627				load = cpu_search_both(child, &lgroup, &hgroup);
628				break;
629			}
630			total += load;
631			if (match & CPU_SEARCH_LOWEST)
632				if (load < lload || low->cs_cpu == -1) {
633					*low = lgroup;
634					lload = load;
635				}
636			if (match & CPU_SEARCH_HIGHEST)
637				if (load > hload || high->cs_cpu == -1) {
638					hload = load;
639					*high = hgroup;
640				}
641		}
642	} else {
643		int cpu;
644
645		CPUMASK_FOREACH(cpu, cg->cg_mask)
646			total += cpu_compare(cpu, low, high, match);
647	}
648	return (total);
649}
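
/*
 * Example of the path-based search (topology and loads hypothetical):  on a
 * machine with two packages, one holding cpus with loads {5, 2} and the
 * other {3, 3}, the second package has the lower aggregate load (6 vs. 7),
 * so CPU_SEARCH_LOWEST descends into it and returns a cpu with load 3 even
 * though a cpu with load 2 exists elsewhere.  Spreading by path load keeps
 * work distributed across caches and busses as described above.
 */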
650
651/*
652 * cpu_search instantiations must pass constants to maintain the inline
653 * optimization.
654 */
655int
656cpu_search_lowest(struct cpu_group *cg, struct cpu_search *low)
657{
658	return cpu_search(cg, low, NULL, CPU_SEARCH_LOWEST);
659}
660
661int
662cpu_search_highest(struct cpu_group *cg, struct cpu_search *high)
663{
664	return cpu_search(cg, NULL, high, CPU_SEARCH_HIGHEST);
665}
666
667int
668cpu_search_both(struct cpu_group *cg, struct cpu_search *low,
669    struct cpu_search *high)
670{
671	return cpu_search(cg, low, high, CPU_SEARCH_BOTH);
672}
673
674/*
675 * Find the cpu with the least load via the least loaded path that has a
676 * lowpri greater than pri.  A pri of -1 indicates any priority is
677 * acceptable.
678 */
679static inline int
680sched_lowest(struct cpu_group *cg, cpumask_t mask, int pri)
681{
682	struct cpu_search low;
683
684	low.cs_cpu = -1;
685	low.cs_load = -1;
686	low.cs_mask = mask;
687	low.cs_limit = pri;
688	cpu_search_lowest(cg, &low);
689	return low.cs_cpu;
690}
691
692/*
693 * Find the cpu with the highest load via the highest loaded path.
694 */
695static inline int
696sched_highest(struct cpu_group *cg, cpumask_t mask, int minload)
697{
698	struct cpu_search high;
699
700	high.cs_cpu = -1;
701	high.cs_load = 0;
702	high.cs_mask = mask;
703	high.cs_limit = minload;
704	cpu_search_highest(cg, &high);
705	return high.cs_cpu;
706}
707
708/*
709 * Simultaneously find the highest and lowest loaded cpu reachable via
710 * cg.
711 */
712static inline void
713sched_both(struct cpu_group *cg, cpumask_t mask, int *lowcpu, int *highcpu)
714{
715	struct cpu_search high;
716	struct cpu_search low;
717
718	low.cs_cpu = -1;
719	low.cs_limit = -1;
720	low.cs_load = -1;
721	low.cs_mask = mask;
722	high.cs_load = 0;
723	high.cs_cpu = -1;
724	high.cs_limit = -1;
725	high.cs_mask = mask;
726	cpu_search_both(cg, &low, &high);
727	*lowcpu = low.cs_cpu;
728	*highcpu = high.cs_cpu;
729	return;
730}
731
732static void
733sched_balance_group(struct cpu_group *cg)
734{
735	cpumask_t mask;
736	int high;
737	int low;
738	int i;
739
740	mask = -1;
741	for (;;) {
742		sched_both(cg, mask, &low, &high);
743		if (low == high || low == -1 || high == -1)
744			break;
745		if (sched_balance_pair(TDQ_CPU(high), TDQ_CPU(low)))
746			break;
747		/*
748		 * If we failed to move any threads determine which cpu
749		 * to kick out of the set and try again.
750	 	 */
751		if (TDQ_CPU(high)->tdq_transferable == 0)
752			mask &= ~(1 << high);
753		else
754			mask &= ~(1 << low);
755	}
756
757	for (i = 0; i < cg->cg_children; i++)
758		sched_balance_group(&cg->cg_child[i]);
759}
760
761static void
762sched_balance()
763{
764	struct tdq *tdq;
765
766	/*
767	 * Select a random time between .5 * balance_interval and
768	 * 1.5 * balance_interval.
769	 */
770	balance_ticks = max(balance_interval / 2, 1);
771	balance_ticks += random() % balance_interval;
772	if (smp_started == 0 || rebalance == 0)
773		return;
774	tdq = TDQ_SELF();
775	TDQ_UNLOCK(tdq);
776	sched_balance_group(cpu_top);
777	TDQ_LOCK(tdq);
778}
779
780/*
781 * Lock two thread queues using their address to maintain lock order.
782 */
783static void
784tdq_lock_pair(struct tdq *one, struct tdq *two)
785{
786	if (one < two) {
787		TDQ_LOCK(one);
788		TDQ_LOCK_FLAGS(two, MTX_DUPOK);
789	} else {
790		TDQ_LOCK(two);
791		TDQ_LOCK_FLAGS(one, MTX_DUPOK);
792	}
793}
794
795/*
796 * Unlock two thread queues.  Order is not important here.
797 */
798static void
799tdq_unlock_pair(struct tdq *one, struct tdq *two)
800{
801	TDQ_UNLOCK(one);
802	TDQ_UNLOCK(two);
803}
804
805/*
806 * Transfer load between two imbalanced thread queues.
807 */
808static int
809sched_balance_pair(struct tdq *high, struct tdq *low)
810{
811	int transferable;
812	int high_load;
813	int low_load;
814	int moved;
815	int move;
816	int diff;
817	int i;
818
819	tdq_lock_pair(high, low);
820	transferable = high->tdq_transferable;
821	high_load = high->tdq_load;
822	low_load = low->tdq_load;
823	moved = 0;
824	/*
825	 * Determine what the imbalance is and then adjust that to how many
826	 * threads we actually have to give up (transferable).
827	 */
828	if (transferable != 0) {
829		diff = high_load - low_load;
830		move = diff / 2;
831		if (diff & 0x1)
832			move++;
833		move = min(move, transferable);
834		for (i = 0; i < move; i++)
835			moved += tdq_move(high, low);
836		/*
837		 * IPI the target cpu to force it to reschedule with the new
838		 * workload.
839		 */
840		ipi_selected(1 << TDQ_ID(low), IPI_PREEMPT);
841	}
842	tdq_unlock_pair(high, low);
843	return (moved);
844}
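
/*
 * Example of the transfer arithmetic above (loads hypothetical):  with
 * high_load = 7 and low_load = 2 the imbalance is 5, so move rounds up to
 * 3 threads, clipped to the transferable count; after the moves the target
 * cpu is IPIed so it notices the new work promptly.
 */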
845
846/*
847 * Move a thread from one thread queue to another.
848 */
849static int
850tdq_move(struct tdq *from, struct tdq *to)
851{
852	struct td_sched *ts;
853	struct thread *td;
854	struct tdq *tdq;
855	int cpu;
856
857	TDQ_LOCK_ASSERT(from, MA_OWNED);
858	TDQ_LOCK_ASSERT(to, MA_OWNED);
859
860	tdq = from;
861	cpu = TDQ_ID(to);
862	ts = tdq_steal(tdq, cpu);
863	if (ts == NULL)
864		return (0);
865	td = ts->ts_thread;
866	/*
867	 * Although the run queue is locked the thread may be blocked.  Lock
868	 * it to clear this and acquire the run-queue lock.
869	 */
870	thread_lock(td);
871	/* Drop recursive lock on from acquired via thread_lock(). */
872	TDQ_UNLOCK(from);
873	sched_rem(td);
874	ts->ts_cpu = cpu;
875	td->td_lock = TDQ_LOCKPTR(to);
876	tdq_add(to, td, SRQ_YIELDING);
877	return (1);
878}
879
880/*
881 * This tdq has idled.  Try to steal a thread from another cpu and switch
882 * to it.
883 */
884static int
885tdq_idled(struct tdq *tdq)
886{
887	struct cpu_group *cg;
888	struct tdq *steal;
889	cpumask_t mask;
890	int thresh;
891	int cpu;
892
893	if (smp_started == 0 || steal_idle == 0)
894		return (1);
895	mask = -1;
896	mask &= ~PCPU_GET(cpumask);
897	/* We don't want to be preempted while we're iterating. */
898	spinlock_enter();
899	for (cg = tdq->tdq_cg; cg != NULL; ) {
900		if ((cg->cg_flags & (CG_FLAG_HTT | CG_FLAG_THREAD)) == 0)
901			thresh = steal_thresh;
902		else
903			thresh = 1;
904		cpu = sched_highest(cg, mask, thresh);
905		if (cpu == -1) {
906			cg = cg->cg_parent;
907			continue;
908		}
909		steal = TDQ_CPU(cpu);
910		mask &= ~(1 << cpu);
911		tdq_lock_pair(tdq, steal);
912		if (steal->tdq_load < thresh || steal->tdq_transferable == 0) {
913			tdq_unlock_pair(tdq, steal);
914			continue;
915		}
916		/*
917		 * If a thread was added while interrupts were disabled don't
918		 * steal one here.  If we fail to acquire one due to affinity
919		 * restrictions loop again with this cpu removed from the
920		 * set.
921		 */
922		if (tdq->tdq_load == 0 && tdq_move(steal, tdq) == 0) {
923			tdq_unlock_pair(tdq, steal);
924			continue;
925		}
926		spinlock_exit();
927		TDQ_UNLOCK(steal);
928		mi_switch(SW_VOL, NULL);
929		thread_unlock(curthread);
930
931		return (0);
932	}
933	spinlock_exit();
934	return (1);
935}
936
937/*
938 * Notify a remote cpu of new work.  Sends an IPI if criteria are met.
939 */
940static void
941tdq_notify(struct tdq *tdq, struct td_sched *ts)
942{
943	int cpri;
944	int pri;
945	int cpu;
946
947	if (tdq->tdq_ipipending)
948		return;
949	cpu = ts->ts_cpu;
950	pri = ts->ts_thread->td_priority;
951	cpri = pcpu_find(cpu)->pc_curthread->td_priority;
952	if (!sched_shouldpreempt(pri, cpri, 1))
953		return;
954	tdq->tdq_ipipending = 1;
955	ipi_selected(1 << cpu, IPI_PREEMPT);
956}
957
958/*
959 * Steals load from a timeshare queue.  Honors the rotating queue head
960 * index.
961 */
962static struct td_sched *
963runq_steal_from(struct runq *rq, int cpu, u_char start)
964{
965	struct td_sched *ts;
966	struct rqbits *rqb;
967	struct rqhead *rqh;
968	int first;
969	int bit;
970	int pri;
971	int i;
972
973	rqb = &rq->rq_status;
974	bit = start & (RQB_BPW -1);
975	pri = 0;
976	first = 0;
977again:
978	for (i = RQB_WORD(start); i < RQB_LEN; bit = 0, i++) {
979		if (rqb->rqb_bits[i] == 0)
980			continue;
981		if (bit != 0) {
982			for (pri = bit; pri < RQB_BPW; pri++)
983				if (rqb->rqb_bits[i] & (1ul << pri))
984					break;
985			if (pri >= RQB_BPW)
986				continue;
987		} else
988			pri = RQB_FFS(rqb->rqb_bits[i]);
989		pri += (i << RQB_L2BPW);
990		rqh = &rq->rq_queues[pri];
991		TAILQ_FOREACH(ts, rqh, ts_procq) {
992			if (first && THREAD_CAN_MIGRATE(ts->ts_thread) &&
993			    THREAD_CAN_SCHED(ts->ts_thread, cpu))
994				return (ts);
995			first = 1;
996		}
997	}
998	if (start != 0) {
999		start = 0;
1000		goto again;
1001	}
1002
1003	return (NULL);
1004}
1005
1006/*
1007 * Steals load from a standard linear queue.
1008 */
1009static struct td_sched *
1010runq_steal(struct runq *rq, int cpu)
1011{
1012	struct rqhead *rqh;
1013	struct rqbits *rqb;
1014	struct td_sched *ts;
1015	int word;
1016	int bit;
1017
1018	rqb = &rq->rq_status;
1019	for (word = 0; word < RQB_LEN; word++) {
1020		if (rqb->rqb_bits[word] == 0)
1021			continue;
1022		for (bit = 0; bit < RQB_BPW; bit++) {
1023			if ((rqb->rqb_bits[word] & (1ul << bit)) == 0)
1024				continue;
1025			rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)];
1026			TAILQ_FOREACH(ts, rqh, ts_procq)
1027				if (THREAD_CAN_MIGRATE(ts->ts_thread) &&
1028				    THREAD_CAN_SCHED(ts->ts_thread, cpu))
1029					return (ts);
1030		}
1031	}
1032	return (NULL);
1033}
1034
1035/*
1036 * Attempt to steal a thread in priority order from a thread queue.
1037 */
1038static struct td_sched *
1039tdq_steal(struct tdq *tdq, int cpu)
1040{
1041	struct td_sched *ts;
1042
1043	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
1044	if ((ts = runq_steal(&tdq->tdq_realtime, cpu)) != NULL)
1045		return (ts);
1046	if ((ts = runq_steal_from(&tdq->tdq_timeshare, cpu, tdq->tdq_ridx))
1047	    != NULL)
1048		return (ts);
1049	return (runq_steal(&tdq->tdq_idle, cpu));
1050}
1051
1052/*
1053 * Sets the thread lock and ts_cpu to match the requested cpu.  Unlocks the
1054 * current lock and returns with the assigned queue locked.
1055 */
1056static inline struct tdq *
1057sched_setcpu(struct td_sched *ts, int cpu, int flags)
1058{
1059	struct thread *td;
1060	struct tdq *tdq;
1061
1062	THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED);
1063
1064	tdq = TDQ_CPU(cpu);
1065	td = ts->ts_thread;
1066	ts->ts_cpu = cpu;
1067
1068	/* If the lock matches just return the queue. */
1069	if (td->td_lock == TDQ_LOCKPTR(tdq))
1070		return (tdq);
1071#ifdef notyet
1072	/*
1073	 * If the thread isn't running its lockptr is a
1074	 * turnstile or a sleepqueue.  We can just lock_set without
1075	 * blocking.
1076	 */
1077	if (TD_CAN_RUN(td)) {
1078		TDQ_LOCK(tdq);
1079		thread_lock_set(td, TDQ_LOCKPTR(tdq));
1080		return (tdq);
1081	}
1082#endif
1083	/*
1084	 * The hard case, migration: we need to block the thread first to
1085	 * prevent order reversals with other cpus' locks.
1086	 */
1087	thread_lock_block(td);
1088	TDQ_LOCK(tdq);
1089	thread_lock_unblock(td, TDQ_LOCKPTR(tdq));
1090	return (tdq);
1091}
1092
1093static int
1094sched_pickcpu(struct td_sched *ts, int flags)
1095{
1096	struct cpu_group *cg;
1097	struct thread *td;
1098	struct tdq *tdq;
1099	cpumask_t mask;
1100	int self;
1101	int pri;
1102	int cpu;
1103
1104	self = PCPU_GET(cpuid);
1105	td = ts->ts_thread;
1106	if (smp_started == 0)
1107		return (self);
1108	/*
1109	 * Don't migrate a running thread from sched_switch().
1110	 */
1111	if ((flags & SRQ_OURSELF) || !THREAD_CAN_MIGRATE(td))
1112		return (ts->ts_cpu);
1113	/*
1114	 * Prefer to run interrupt threads on the processors that generate
1115	 * the interrupt.
1116	 */
1117	if (td->td_priority <= PRI_MAX_ITHD && THREAD_CAN_SCHED(td, self) &&
1118	    curthread->td_intr_nesting_level)
1119		ts->ts_cpu = self;
1120	/*
1121	 * If the thread can run on the last cpu and either that cpu is idle
1122	 * or the affinity has not expired, run it there.
1123	 */
1124	pri = td->td_priority;
1125	tdq = TDQ_CPU(ts->ts_cpu);
1126	if (THREAD_CAN_SCHED(td, ts->ts_cpu)) {
1127		if (tdq->tdq_lowpri > PRI_MIN_IDLE)
1128			return (ts->ts_cpu);
1129		if (SCHED_AFFINITY(ts, CG_SHARE_L2) && tdq->tdq_lowpri > pri)
1130			return (ts->ts_cpu);
1131	}
1132	/*
1133	 * Search for the highest level in the tree that still has affinity.
1134	 */
1135	cg = NULL;
1136	for (cg = tdq->tdq_cg; cg != NULL; cg = cg->cg_parent)
1137		if (SCHED_AFFINITY(ts, cg->cg_level))
1138			break;
1139	cpu = -1;
1140	mask = td->td_cpuset->cs_mask.__bits[0];
1141	if (cg)
1142		cpu = sched_lowest(cg, mask, pri);
1143	if (cpu == -1)
1144		cpu = sched_lowest(cpu_top, mask, -1);
1145	/*
1146	 * Compare the lowest loaded cpu to current cpu.
1147	 */
1148	if (THREAD_CAN_SCHED(td, self) && TDQ_CPU(self)->tdq_lowpri > pri &&
1149	    TDQ_CPU(cpu)->tdq_lowpri < PRI_MIN_IDLE)
1150		cpu = self;
1151	KASSERT(cpu != -1, ("sched_pickcpu: Failed to find a cpu."));
1152	return (cpu);
1153}
1154#endif
1155
1156/*
1157 * Pick the highest priority task we have and return it.
1158 */
1159static struct td_sched *
1160tdq_choose(struct tdq *tdq)
1161{
1162	struct td_sched *ts;
1163
1164	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
1165	ts = runq_choose(&tdq->tdq_realtime);
1166	if (ts != NULL)
1167		return (ts);
1168	ts = runq_choose_from(&tdq->tdq_timeshare, tdq->tdq_ridx);
1169	if (ts != NULL) {
1170		KASSERT(ts->ts_thread->td_priority >= PRI_MIN_TIMESHARE,
1171		    ("tdq_choose: Invalid priority on timeshare queue %d",
1172		    ts->ts_thread->td_priority));
1173		return (ts);
1174	}
1175
1176	ts = runq_choose(&tdq->tdq_idle);
1177	if (ts != NULL) {
1178		KASSERT(ts->ts_thread->td_priority >= PRI_MIN_IDLE,
1179		    ("tdq_choose: Invalid priority on idle queue %d",
1180		    ts->ts_thread->td_priority));
1181		return (ts);
1182	}
1183
1184	return (NULL);
1185}
1186
1187/*
1188 * Initialize a thread queue.
1189 */
1190static void
1191tdq_setup(struct tdq *tdq)
1192{
1193
1194	if (bootverbose)
1195		printf("ULE: setup cpu %d\n", TDQ_ID(tdq));
1196	runq_init(&tdq->tdq_realtime);
1197	runq_init(&tdq->tdq_timeshare);
1198	runq_init(&tdq->tdq_idle);
1199	snprintf(tdq->tdq_name, sizeof(tdq->tdq_name),
1200	    "sched lock %d", (int)TDQ_ID(tdq));
1201	mtx_init(&tdq->tdq_lock, tdq->tdq_name, "sched lock",
1202	    MTX_SPIN | MTX_RECURSE);
1203}
1204
1205#ifdef SMP
1206static void
1207sched_setup_smp(void)
1208{
1209	struct tdq *tdq;
1210	int i;
1211
1212	cpu_top = smp_topo();
1213	for (i = 0; i < MAXCPU; i++) {
1214		if (CPU_ABSENT(i))
1215			continue;
1216		tdq = TDQ_CPU(i);
1217		tdq_setup(tdq);
1218		tdq->tdq_cg = smp_topo_find(cpu_top, i);
1219		if (tdq->tdq_cg == NULL)
1220			panic("Can't find cpu group for %d\n", i);
1221	}
1222	balance_tdq = TDQ_SELF();
1223	sched_balance();
1224}
1225#endif
1226
1227/*
1228 * Setup the thread queues and initialize the topology based on MD
1229 * information.
1230 */
1231static void
1232sched_setup(void *dummy)
1233{
1234	struct tdq *tdq;
1235
1236	tdq = TDQ_SELF();
1237#ifdef SMP
1238	sched_setup_smp();
1239#else
1240	tdq_setup(tdq);
1241#endif
1242	/*
1243	 * To avoid divide-by-zero, we set realstathz to a dummy value in case
1244	 * sched_clock() is called before sched_initticks().
1245	 */
1246	realstathz = hz;
1247	sched_slice = (realstathz/10);	/* ~100ms */
1248	tickincr = 1 << SCHED_TICK_SHIFT;
1249
1250	/* Add thread0's load since it's running. */
1251	TDQ_LOCK(tdq);
1252	thread0.td_lock = TDQ_LOCKPTR(TDQ_SELF());
1253	tdq_load_add(tdq, &td_sched0);
1254	tdq->tdq_lowpri = thread0.td_priority;
1255	TDQ_UNLOCK(tdq);
1256}
1257
1258/*
1259 * This routine determines the tickincr after stathz and hz are setup.
1260 */
1261/* ARGSUSED */
1262static void
1263sched_initticks(void *dummy)
1264{
1265	int incr;
1266
1267	realstathz = stathz ? stathz : hz;
1268	sched_slice = (realstathz/10);	/* ~100ms */
1269
1270	/*
1271	 * tickincr is shifted out by 10 to avoid rounding errors due to
1272	 * hz not being evenly divisible by stathz on all platforms.
1273	 */
1274	incr = (hz << SCHED_TICK_SHIFT) / realstathz;
1275	/*
1276	 * This does not work for values of stathz that are more than
1277	 * (1 << SCHED_TICK_SHIFT) * hz.  In practice this does not happen.
1278	 */
1279	if (incr == 0)
1280		incr = 1;
1281	tickincr = incr;
1282#ifdef SMP
1283	/*
1284	 * Set the default balance interval now that we know
1285	 * what realstathz is.
1286	 */
1287	balance_interval = realstathz;
1288	/*
1289	 * Set steal_thresh to log2(mp_ncpus) but no greater than 3.  This
1290	 * prevents excess thrashing on large machines and excess idle on
1291	 * smaller machines.
1292	 */
1293	steal_thresh = min(ffs(mp_ncpus) - 1, 3);
1294	affinity = SCHED_AFFINITY_DEFAULT;
1295#endif
1296}
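
/*
 * Worked example for sched_initticks() (hz = 1000 and stathz = 127 are
 * assumed values):  realstathz = 127, sched_slice = 12 stathz ticks
 * (roughly 94ms), and tickincr = (1000 << 10) / 127 = 8062, i.e. about 7.9
 * hz ticks in fixed point charged per stathz tick.  In the SMP case with
 * mp_ncpus = 8, steal_thresh becomes min(ffs(8) - 1, 3) = 3.
 */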
1297
1298
1299/*
1300 * This is the core of the interactivity algorithm.  Determines a score based
1301 * on past behavior.  It is the ratio of sleep time to run time scaled to
1302 * a [0, 100] integer.  This is the voluntary sleep time of a process, which
1303 * differs from the cpu usage because it does not account for time spent
1304 * waiting on a run-queue.  Would be prettier if we had floating point.
1305 */
1306static int
1307sched_interact_score(struct thread *td)
1308{
1309	struct td_sched *ts;
1310	int div;
1311
1312	ts = td->td_sched;
1313	/*
1314	 * The score is only needed if this is likely to be an interactive
1315	 * task.  Don't go through the expense of computing it if there's
1316	 * no chance.
1317	 */
1318	if (sched_interact <= SCHED_INTERACT_HALF &&
1319		ts->ts_runtime >= ts->ts_slptime)
1320			return (SCHED_INTERACT_HALF);
1321
1322	if (ts->ts_runtime > ts->ts_slptime) {
1323		div = max(1, ts->ts_runtime / SCHED_INTERACT_HALF);
1324		return (SCHED_INTERACT_HALF +
1325		    (SCHED_INTERACT_HALF - (ts->ts_slptime / div)));
1326	}
1327	if (ts->ts_slptime > ts->ts_runtime) {
1328		div = max(1, ts->ts_slptime / SCHED_INTERACT_HALF);
1329		return (ts->ts_runtime / div);
1330	}
1331	/* runtime == slptime */
1332	if (ts->ts_runtime)
1333		return (SCHED_INTERACT_HALF);
1334
1335	/*
1336	 * This can happen if slptime and runtime are 0.
1337	 */
1338	return (0);
1339
1340}
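
/*
 * Worked examples for the score above (tick values hypothetical):  a thread
 * that has slept three times as long as it has run falls into the
 * slptime > runtime case and scores roughly 50 * run / sleep ~= 16, well
 * under the default threshold of 30, so it is treated as interactive.  A
 * thread that has run three times as long as it has slept scores roughly
 * 100 - 50 * sleep / run ~= 84 and is handled as a batch thread.
 */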
1341
1342/*
1343 * Scale the scheduling priority according to the "interactivity" of this
1344 * process.
1345 */
1346static void
1347sched_priority(struct thread *td)
1348{
1349	int score;
1350	int pri;
1351
1352	if (td->td_pri_class != PRI_TIMESHARE)
1353		return;
1354	/*
1355	 * If the score is interactive we place the thread in the realtime
1356	 * queue with a priority that is less than kernel and interrupt
1357	 * priorities.  These threads are not subject to nice restrictions.
1358	 *
1359	 * Scores greater than this are placed on the normal timeshare queue
1360	 * where the priority is partially decided by the most recent cpu
1361	 * utilization and the rest is decided by nice value.
1362	 *
1363	 * The nice value of the process has a linear effect on the calculated
1364	 * score.  Negative nice values make it easier for a thread to be
1365	 * considered interactive.
1366	 */
1367	score = imax(0, sched_interact_score(td) - td->td_proc->p_nice);
1368	if (score < sched_interact) {
1369		pri = PRI_MIN_REALTIME;
1370		pri += ((PRI_MAX_REALTIME - PRI_MIN_REALTIME) / sched_interact)
1371		    * score;
1372		KASSERT(pri >= PRI_MIN_REALTIME && pri <= PRI_MAX_REALTIME,
1373		    ("sched_priority: invalid interactive priority %d score %d",
1374		    pri, score));
1375	} else {
1376		pri = SCHED_PRI_MIN;
1377		if (td->td_sched->ts_ticks)
1378			pri += SCHED_PRI_TICKS(td->td_sched);
1379		pri += SCHED_PRI_NICE(td->td_proc->p_nice);
1380		KASSERT(pri >= PRI_MIN_TIMESHARE && pri <= PRI_MAX_TIMESHARE,
1381		    ("sched_priority: invalid priority %d: nice %d, "
1382		    "ticks %d ftick %d ltick %d tick pri %d",
1383		    pri, td->td_proc->p_nice, td->td_sched->ts_ticks,
1384		    td->td_sched->ts_ftick, td->td_sched->ts_ltick,
1385		    SCHED_PRI_TICKS(td->td_sched)));
1386	}
1387	sched_user_prio(td, pri);
1388
1389	return;
1390}
1391
1392/*
1393 * This routine enforces a maximum limit on the amount of scheduling history
1394 * kept.  It is called after either the slptime or runtime is adjusted.  This
1395 * function is ugly due to integer math.
1396 */
1397static void
1398sched_interact_update(struct thread *td)
1399{
1400	struct td_sched *ts;
1401	u_int sum;
1402
1403	ts = td->td_sched;
1404	sum = ts->ts_runtime + ts->ts_slptime;
1405	if (sum < SCHED_SLP_RUN_MAX)
1406		return;
1407	/*
1408	 * This only happens from two places:
1409	 * 1) We have added an unusual amount of run time from fork_exit.
1410	 * 2) We have added an unusual amount of sleep time from sched_sleep().
1411	 */
1412	if (sum > SCHED_SLP_RUN_MAX * 2) {
1413		if (ts->ts_runtime > ts->ts_slptime) {
1414			ts->ts_runtime = SCHED_SLP_RUN_MAX;
1415			ts->ts_slptime = 1;
1416		} else {
1417			ts->ts_slptime = SCHED_SLP_RUN_MAX;
1418			ts->ts_runtime = 1;
1419		}
1420		return;
1421	}
1422	/*
1423	 * If we have exceeded by more than 1/5th then the algorithm below
1424	 * will not bring us back into range.  Dividing by two here forces
1425	 * us into the range of [4/5 * SCHED_SLP_RUN_MAX, SCHED_SLP_RUN_MAX]
1426	 */
1427	if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) {
1428		ts->ts_runtime /= 2;
1429		ts->ts_slptime /= 2;
1430		return;
1431	}
1432	ts->ts_runtime = (ts->ts_runtime / 5) * 4;
1433	ts->ts_slptime = (ts->ts_slptime / 5) * 4;
1434}
1435
1436/*
1437 * Scale back the interactivity history when a child thread is created.  The
1438 * history is inherited from the parent but the thread may behave totally
1439 * differently.  For example, a shell spawning a compiler process.  We want
1440 * to learn that the compiler is behaving badly very quickly.
1441 */
1442static void
1443sched_interact_fork(struct thread *td)
1444{
1445	int ratio;
1446	int sum;
1447
1448	sum = td->td_sched->ts_runtime + td->td_sched->ts_slptime;
1449	if (sum > SCHED_SLP_RUN_FORK) {
1450		ratio = sum / SCHED_SLP_RUN_FORK;
1451		td->td_sched->ts_runtime /= ratio;
1452		td->td_sched->ts_slptime /= ratio;
1453	}
1454}
1455
1456/*
1457 * Called from proc0_init() to setup the scheduler fields.
1458 */
1459void
1460schedinit(void)
1461{
1462
1463	/*
1464	 * Set up the scheduler specific parts of proc0.
1465	 */
1466	proc0.p_sched = NULL; /* XXX */
1467	thread0.td_sched = &td_sched0;
1468	td_sched0.ts_ltick = ticks;
1469	td_sched0.ts_ftick = ticks;
1470	td_sched0.ts_thread = &thread0;
1471	td_sched0.ts_slice = sched_slice;
1472}
1473
1474/*
1475 * This is only somewhat accurate since given many processes of the same
1476 * priority they will switch when their slices run out, which will be
1477 * at most sched_slice stathz ticks.
1478 */
1479int
1480sched_rr_interval(void)
1481{
1482
1483	/* Convert sched_slice to hz */
1484	return (hz/(realstathz/sched_slice));
1485}
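
/*
 * Example (hz = 1000, realstathz = 127 and sched_slice = 12 assumed):  the
 * round-robin interval reported above is hz / (realstathz / sched_slice) =
 * 1000 / (127 / 12) = 1000 / 10 = 100 ticks, i.e. roughly 100ms between
 * forced switches among equal-priority timeshare threads.
 */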
1486
1487/*
1488 * Update the percent cpu tracking information when it is requested or
1489 * the total history exceeds the maximum.  We keep a sliding history of
1490 * tick counts that slowly decays.  This is less precise than the 4BSD
1491 * mechanism since it happens with less regular and frequent events.
1492 */
1493static void
1494sched_pctcpu_update(struct td_sched *ts)
1495{
1496
1497	if (ts->ts_ticks == 0)
1498		return;
1499	if (ticks - (hz / 10) < ts->ts_ltick &&
1500	    SCHED_TICK_TOTAL(ts) < SCHED_TICK_MAX)
1501		return;
1502	/*
1503	 * Adjust counters and watermark for pctcpu calc.
1504	 */
1505	if (ts->ts_ltick > ticks - SCHED_TICK_TARG)
1506		ts->ts_ticks = (ts->ts_ticks / (ticks - ts->ts_ftick)) *
1507			    SCHED_TICK_TARG;
1508	else
1509		ts->ts_ticks = 0;
1510	ts->ts_ltick = ticks;
1511	ts->ts_ftick = ts->ts_ltick - SCHED_TICK_TARG;
1512}
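
/*
 * Example of the rescaling above (hz = 1000 assumed):  a thread with 12
 * seconds of recorded history (ticks - ts_ftick = 12000) that spent 6 of
 * them on cpu has its ts_ticks scaled to 6/12 * SCHED_TICK_TARG, i.e. the
 * equivalent of 5 seconds within the 10 second target window, so the
 * reported %cpu stays near 50% while older history is discarded.
 */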
1513
1514/*
1515 * Adjust the priority of a thread.  Move it to the appropriate run-queue
1516 * if necessary.  This is the back-end for several priority related
1517 * functions.
1518 */
1519static void
1520sched_thread_priority(struct thread *td, u_char prio)
1521{
1522	struct td_sched *ts;
1523	struct tdq *tdq;
1524	int oldpri;
1525
1526	CTR6(KTR_SCHED, "sched_prio: %p(%s) prio %d newprio %d by %p(%s)",
1527	    td, td->td_name, td->td_priority, prio, curthread,
1528	    curthread->td_name);
1529	ts = td->td_sched;
1530	THREAD_LOCK_ASSERT(td, MA_OWNED);
1531	if (td->td_priority == prio)
1532		return;
1533
1534	if (TD_ON_RUNQ(td) && prio < td->td_priority) {
1535		/*
1536		 * If the priority has been elevated due to priority
1537		 * propagation, we may have to move ourselves to a new
1538		 * queue.  This could be optimized to not re-add in some
1539		 * cases.
1540		 */
1541		sched_rem(td);
1542		td->td_priority = prio;
1543		sched_add(td, SRQ_BORROWING);
1544		return;
1545	}
1546	tdq = TDQ_CPU(ts->ts_cpu);
1547	oldpri = td->td_priority;
1548	td->td_priority = prio;
1549	tdq_runq_pick(tdq, ts);
1550	if (TD_IS_RUNNING(td)) {
1551		if (prio < tdq->tdq_lowpri)
1552			tdq->tdq_lowpri = prio;
1553		else if (tdq->tdq_lowpri == oldpri)
1554			tdq_setlowpri(tdq, td);
1555	}
1556}
1557
1558/*
1559 * Update a thread's priority when it is lent another thread's
1560 * priority.
1561 */
1562void
1563sched_lend_prio(struct thread *td, u_char prio)
1564{
1565
1566	td->td_flags |= TDF_BORROWING;
1567	sched_thread_priority(td, prio);
1568}
1569
1570/*
1571 * Restore a thread's priority when priority propagation is
1572 * over.  The prio argument is the minimum priority the thread
1573 * needs to have to satisfy other possible priority lending
1574 * requests.  If the thread's regular priority is less
1575 * important than prio, the thread will keep a priority boost
1576 * of prio.
1577 */
1578void
1579sched_unlend_prio(struct thread *td, u_char prio)
1580{
1581	u_char base_pri;
1582
1583	if (td->td_base_pri >= PRI_MIN_TIMESHARE &&
1584	    td->td_base_pri <= PRI_MAX_TIMESHARE)
1585		base_pri = td->td_user_pri;
1586	else
1587		base_pri = td->td_base_pri;
1588	if (prio >= base_pri) {
1589		td->td_flags &= ~TDF_BORROWING;
1590		sched_thread_priority(td, base_pri);
1591	} else
1592		sched_lend_prio(td, prio);
1593}
1594
1595/*
1596 * Standard entry for setting the priority to an absolute value.
1597 */
1598void
1599sched_prio(struct thread *td, u_char prio)
1600{
1601	u_char oldprio;
1602
1603	/* First, update the base priority. */
1604	td->td_base_pri = prio;
1605
1606	/*
1607	 * If the thread is borrowing another thread's priority, don't
1608	 * ever lower the priority.
1609	 */
1610	if (td->td_flags & TDF_BORROWING && td->td_priority < prio)
1611		return;
1612
1613	/* Change the real priority. */
1614	oldprio = td->td_priority;
1615	sched_thread_priority(td, prio);
1616
1617	/*
1618	 * If the thread is on a turnstile, then let the turnstile update
1619	 * its state.
1620	 */
1621	if (TD_ON_LOCK(td) && oldprio != prio)
1622		turnstile_adjust(td, oldprio);
1623}
1624
1625/*
1626 * Set the base user priority; this does not affect the current running priority.
1627 */
1628void
1629sched_user_prio(struct thread *td, u_char prio)
1630{
1631	u_char oldprio;
1632
1633	td->td_base_user_pri = prio;
1634	if (td->td_flags & TDF_UBORROWING && td->td_user_pri <= prio)
1635                return;
1636	oldprio = td->td_user_pri;
1637	td->td_user_pri = prio;
1638}
1639
1640void
1641sched_lend_user_prio(struct thread *td, u_char prio)
1642{
1643	u_char oldprio;
1644
1645	THREAD_LOCK_ASSERT(td, MA_OWNED);
1646	td->td_flags |= TDF_UBORROWING;
1647	oldprio = td->td_user_pri;
1648	td->td_user_pri = prio;
1649}
1650
1651void
1652sched_unlend_user_prio(struct thread *td, u_char prio)
1653{
1654	u_char base_pri;
1655
1656	THREAD_LOCK_ASSERT(td, MA_OWNED);
1657	base_pri = td->td_base_user_pri;
1658	if (prio >= base_pri) {
1659		td->td_flags &= ~TDF_UBORROWING;
1660		sched_user_prio(td, base_pri);
1661	} else {
1662		sched_lend_user_prio(td, prio);
1663	}
1664}
1665
1666/*
1667 * Add the thread passed as 'newtd' to the run queue before selecting
1668 * the next thread to run.  This is only used for KSE.
1669 */
1670static void
1671sched_switchin(struct tdq *tdq, struct thread *td)
1672{
1673#ifdef SMP
1674	spinlock_enter();
1675	TDQ_UNLOCK(tdq);
1676	thread_lock(td);
1677	spinlock_exit();
1678	sched_setcpu(td->td_sched, TDQ_ID(tdq), SRQ_YIELDING);
1679#else
1680	td->td_lock = TDQ_LOCKPTR(tdq);
1681#endif
1682	tdq_add(tdq, td, SRQ_YIELDING);
1683	MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
1684}
1685
1686/*
1687 * Block a thread for switching.  Similar to thread_block() but does not
1688 * bump the spin count.
1689 */
1690static inline struct mtx *
1691thread_block_switch(struct thread *td)
1692{
1693	struct mtx *lock;
1694
1695	THREAD_LOCK_ASSERT(td, MA_OWNED);
1696	lock = td->td_lock;
1697	td->td_lock = &blocked_lock;
1698	mtx_unlock_spin(lock);
1699
1700	return (lock);
1701}
1702
1703/*
1704 * Handle migration from sched_switch().  This happens only for
1705 * cpu binding.
1706 */
1707static struct mtx *
1708sched_switch_migrate(struct tdq *tdq, struct thread *td, int flags)
1709{
1710	struct tdq *tdn;
1711
1712	tdn = TDQ_CPU(td->td_sched->ts_cpu);
1713#ifdef SMP
1714	tdq_load_rem(tdq, td->td_sched);
1715	/*
1716	 * Do the lock dance required to avoid LOR.  We grab an extra
1717	 * spinlock nesting to prevent preemption while we're
1718	 * not holding either run-queue lock.
1719	 */
1720	spinlock_enter();
1721	thread_block_switch(td);	/* This releases the lock on tdq. */
1722	TDQ_LOCK(tdn);
1723	tdq_add(tdn, td, flags);
1724	tdq_notify(tdn, td->td_sched);
1725	/*
1726	 * After we unlock tdn the new cpu still can't switch into this
1727	 * thread until we've unblocked it in cpu_switch().  The lock
1728	 * pointers may match in the case of HTT cores.  Don't unlock here
1729	 * or we can deadlock when the other CPU runs the IPI handler.
1730	 */
1731	if (TDQ_LOCKPTR(tdn) != TDQ_LOCKPTR(tdq)) {
1732		TDQ_UNLOCK(tdn);
1733		TDQ_LOCK(tdq);
1734	}
1735	spinlock_exit();
1736#endif
1737	return (TDQ_LOCKPTR(tdn));
1738}
1739
1740/*
1741 * Release a thread that was blocked with thread_block_switch().
1742 */
1743static inline void
1744thread_unblock_switch(struct thread *td, struct mtx *mtx)
1745{
1746	atomic_store_rel_ptr((volatile uintptr_t *)&td->td_lock,
1747	    (uintptr_t)mtx);
1748}
1749
1750/*
1751 * Switch threads.  This function has to handle threads coming in while
1752 * blocked for some reason, running, or idle.  It also must deal with
1753 * migrating a thread from one queue to another as running threads may
1754 * be assigned elsewhere via binding.
1755 */
1756void
1757sched_switch(struct thread *td, struct thread *newtd, int flags)
1758{
1759	struct tdq *tdq;
1760	struct td_sched *ts;
1761	struct mtx *mtx;
1762	int srqflag;
1763	int cpuid;
1764
1765	THREAD_LOCK_ASSERT(td, MA_OWNED);
1766
1767	cpuid = PCPU_GET(cpuid);
1768	tdq = TDQ_CPU(cpuid);
1769	ts = td->td_sched;
1770	mtx = td->td_lock;
1771	ts->ts_rltick = ticks;
1772	td->td_lastcpu = td->td_oncpu;
1773	td->td_oncpu = NOCPU;
1774	td->td_flags &= ~TDF_NEEDRESCHED;
1775	td->td_owepreempt = 0;
1776	/*
1777	 * The lock pointer in an idle thread should never change.  Reset it
1778	 * to CAN_RUN as well.
1779	 */
1780	if (TD_IS_IDLETHREAD(td)) {
1781		MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
1782		TD_SET_CAN_RUN(td);
1783	} else if (TD_IS_RUNNING(td)) {
1784		MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
1785		srqflag = (flags & SW_PREEMPT) ?
1786		    SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
1787		    SRQ_OURSELF|SRQ_YIELDING;
1788		if (ts->ts_cpu == cpuid)
1789			tdq_runq_add(tdq, ts, srqflag);
1790		else
1791			mtx = sched_switch_migrate(tdq, td, srqflag);
1792	} else {
1793		/* This thread must be going to sleep. */
1794		TDQ_LOCK(tdq);
1795		mtx = thread_block_switch(td);
1796		tdq_load_rem(tdq, ts);
1797	}
1798	/*
1799	 * We enter here with the thread blocked and assigned to the
1800	 * appropriate cpu run-queue or sleep-queue and with the current
1801	 * thread-queue locked.
1802	 */
1803	TDQ_LOCK_ASSERT(tdq, MA_OWNED | MA_NOTRECURSED);
1804	/*
1805	 * If KSE assigned a new thread just add it here and let choosethread
1806	 * select the best one.
1807	 */
1808	if (newtd != NULL)
1809		sched_switchin(tdq, newtd);
1810	newtd = choosethread();
1811	/*
1812	 * Call the MD code to switch contexts if necessary.
1813	 */
1814	if (td != newtd) {
1815#ifdef	HWPMC_HOOKS
1816		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
1817			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
1818#endif
1819		lock_profile_release_lock(&TDQ_LOCKPTR(tdq)->lock_object);
1820		TDQ_LOCKPTR(tdq)->mtx_lock = (uintptr_t)newtd;
1821		cpu_switch(td, newtd, mtx);
1822		/*
1823		 * We may return from cpu_switch on a different cpu.  However,
1824		 * we always return with td_lock pointing to the current cpu's
1825		 * run queue lock.
1826		 */
1827		cpuid = PCPU_GET(cpuid);
1828		tdq = TDQ_CPU(cpuid);
1829		lock_profile_obtain_lock_success(
1830		    &TDQ_LOCKPTR(tdq)->lock_object, 0, 0, __FILE__, __LINE__);
1831#ifdef	HWPMC_HOOKS
1832		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
1833			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
1834#endif
1835	} else
1836		thread_unblock_switch(td, mtx);
1837	/*
1838	 * We should always get here with the lowest priority td possible.
1839	 */
1840	tdq->tdq_lowpri = td->td_priority;
1841	/*
1842	 * Assert that all went well and return.
1843	 */
1844	TDQ_LOCK_ASSERT(tdq, MA_OWNED|MA_NOTRECURSED);
1845	MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
1846	td->td_oncpu = cpuid;
1847}
1848
1849/*
1850 * Adjust thread priorities as a result of a nice request.
1851 */
1852void
1853sched_nice(struct proc *p, int nice)
1854{
1855	struct thread *td;
1856
1857	PROC_LOCK_ASSERT(p, MA_OWNED);
1858	PROC_SLOCK_ASSERT(p, MA_OWNED);
1859
1860	p->p_nice = nice;
1861	FOREACH_THREAD_IN_PROC(p, td) {
1862		thread_lock(td);
1863		sched_priority(td);
1864		sched_prio(td, td->td_base_user_pri);
1865		thread_unlock(td);
1866	}
1867}
1868
1869/*
1870 * Record the sleep time for the interactivity scorer.
1871 */
1872void
1873sched_sleep(struct thread *td)
1874{
1875
1876	THREAD_LOCK_ASSERT(td, MA_OWNED);
1877
1878	td->td_slptick = ticks;
1879}
1880
1881/*
1882 * Schedule a thread to resume execution and record how long it voluntarily
1883 * slept.  We also update the pctcpu, interactivity, and priority.
1884 */
1885void
1886sched_wakeup(struct thread *td)
1887{
1888	struct td_sched *ts;
1889	int slptick;
1890
1891	THREAD_LOCK_ASSERT(td, MA_OWNED);
1892	ts = td->td_sched;
1893	/*
1894	 * If we slept for more than a tick update our interactivity and
1895	 * priority.
1896	 */
1897	slptick = td->td_slptick;
1898	td->td_slptick = 0;
1899	if (slptick && slptick != ticks) {
1900		u_int hzticks;
1901
1902		hzticks = (ticks - slptick) << SCHED_TICK_SHIFT;
1903		ts->ts_slptime += hzticks;
1904		sched_interact_update(td);
1905		sched_pctcpu_update(ts);
1906	}
1907	/* Reset the slice value after we sleep. */
1908	ts->ts_slice = sched_slice;
1909	sched_add(td, SRQ_BORING);
1910}
1911
1912/*
1913 * Penalize the parent for creating a new child and initialize the child's
1914 * priority.
1915 */
1916void
1917sched_fork(struct thread *td, struct thread *child)
1918{
1919	THREAD_LOCK_ASSERT(td, MA_OWNED);
1920	sched_fork_thread(td, child);
1921	/*
1922	 * Penalize the parent and child for forking.
1923	 */
1924	sched_interact_fork(child);
1925	sched_priority(child);
1926	td->td_sched->ts_runtime += tickincr;
1927	sched_interact_update(td);
1928	sched_priority(td);
1929}
1930
1931/*
1932 * Fork a new thread, which may be within the same process.
1933 */
1934void
1935sched_fork_thread(struct thread *td, struct thread *child)
1936{
1937	struct td_sched *ts;
1938	struct td_sched *ts2;
1939
1940	/*
1941	 * Initialize child.
1942	 */
1943	THREAD_LOCK_ASSERT(td, MA_OWNED);
1944	sched_newthread(child);
1945	child->td_lock = TDQ_LOCKPTR(TDQ_SELF());
1946	child->td_cpuset = cpuset_ref(td->td_cpuset);
1947	ts = td->td_sched;
1948	ts2 = child->td_sched;
1949	ts2->ts_cpu = ts->ts_cpu;
1950	ts2->ts_runq = NULL;
1951	/*
1952	 * Grab our parent's cpu estimation information and priority.
1953	 */
1954	ts2->ts_ticks = ts->ts_ticks;
1955	ts2->ts_ltick = ts->ts_ltick;
1956	ts2->ts_ftick = ts->ts_ftick;
1957	child->td_user_pri = td->td_user_pri;
1958	child->td_base_user_pri = td->td_base_user_pri;
1959	/*
1960	 * And update interactivity score.
1961	 */
1962	ts2->ts_slptime = ts->ts_slptime;
1963	ts2->ts_runtime = ts->ts_runtime;
1964	ts2->ts_slice = 1;	/* Attempt to quickly learn interactivity. */
1965}
1966
1967/*
1968 * Adjust the priority class of a thread.
1969 */
1970void
1971sched_class(struct thread *td, int class)
1972{
1973
1974	THREAD_LOCK_ASSERT(td, MA_OWNED);
1975	if (td->td_pri_class == class)
1976		return;
1977	/*
1978	 * On SMP if we're on the RUNQ we must adjust the transferable
1979	 * count because we could be changing to or from an interrupt
1980	 * class.
1981	 */
1982	if (TD_ON_RUNQ(td)) {
1983		struct tdq *tdq;
1984
1985		tdq = TDQ_CPU(td->td_sched->ts_cpu);
1986		if (THREAD_CAN_MIGRATE(td))
1987			tdq->tdq_transferable--;
1988		td->td_pri_class = class;
1989		if (THREAD_CAN_MIGRATE(td))
1990			tdq->tdq_transferable++;
1991	}
1992	td->td_pri_class = class;
1993}
1994
1995/*
1996 * Return some of the child's priority and interactivity to the parent.
1997 */
1998void
1999sched_exit(struct proc *p, struct thread *child)
2000{
2001	struct thread *td;
2002
2003	CTR3(KTR_SCHED, "sched_exit: %p(%s) prio %d",
2004	    child, child->td_name, child->td_priority);
2005
2006	PROC_SLOCK_ASSERT(p, MA_OWNED);
2007	td = FIRST_THREAD_IN_PROC(p);
2008	sched_exit_thread(td, child);
2009}
2010
2011/*
2012 * Penalize another thread for the time spent on this one.  This helps to
2013 * worsen the priority and interactivity of processes which schedule batch
2014 * jobs such as make.  This has little effect on the make process itself but
2015 * causes new processes spawned by it to receive worse scores immediately.
2016 */
2017void
2018sched_exit_thread(struct thread *td, struct thread *child)
2019{
2020
2021	CTR3(KTR_SCHED, "sched_exit_thread: %p(%s) prio %d",
2022	    child, child->td_name, child->td_priority);
2023
2024#ifdef KSE
2025	/*
2026	 * KSE forks and exits so often that this penalty causes short-lived
2027	 * threads to always be non-interactive.  This causes mozilla to
2028	 * crawl under load.
2029	 */
2030	if ((td->td_pflags & TDP_SA) && td->td_proc == child->td_proc)
2031		return;
2032#endif
2033	/*
2034	 * Give the child's runtime to the parent without returning the
2035	 * sleep time as a penalty to the parent.  This causes shells that
2036	 * launch expensive things to mark their children as expensive.
2037	 */
2038	thread_lock(td);
2039	td->td_sched->ts_runtime += child->td_sched->ts_runtime;
2040	sched_interact_update(td);
2041	sched_priority(td);
2042	thread_unlock(td);
2043}
2044
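/*
 * Preempt the current thread if a higher priority thread has been queued
 * on this cpu, typically in response to a preemption IPI from tdq_notify().
 */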
2045void
2046sched_preempt(struct thread *td)
2047{
2048	struct tdq *tdq;
2049
2050	thread_lock(td);
2051	tdq = TDQ_SELF();
2052	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
2053	tdq->tdq_ipipending = 0;
2054	if (td->td_priority > tdq->tdq_lowpri) {
2055		if (td->td_critnest > 1)
2056			td->td_owepreempt = 1;
2057		else
2058			mi_switch(SW_INVOL | SW_PREEMPT, NULL);
2059	}
2060	thread_unlock(td);
2061}
2062
2063/*
2064 * Fix priorities on return to user-space.  Priorities may be elevated due
2065 * to static priorities in msleep() or similar.
2066 */
2067void
2068sched_userret(struct thread *td)
2069{
2070	/*
2071	 * XXX we cheat slightly on the locking here to avoid locking in
2072	 * the usual case.  Setting td_priority here is essentially an
2073	 * incomplete workaround for not setting it properly elsewhere.
2074	 * Now that some interrupt handlers are threads, not setting it
2075	 * properly elsewhere can clobber it in the window between setting
2076	 * it here and returning to user mode, so don't waste time setting
2077	 * it perfectly here.
2078	 */
2079	KASSERT((td->td_flags & TDF_BORROWING) == 0,
2080	    ("thread with borrowed priority returning to userland"));
2081	if (td->td_priority != td->td_user_pri) {
2082		thread_lock(td);
2083		td->td_priority = td->td_user_pri;
2084		td->td_base_pri = td->td_user_pri;
2085		tdq_setlowpri(TDQ_SELF(), td);
2086		thread_unlock(td);
2087	}
2088}
2089
2090/*
2091 * Handle a stathz tick.  This is really only relevant for timeshare
2092 * threads.
2093 */
2094void
2095sched_clock(struct thread *td)
2096{
2097	struct tdq *tdq;
2098	struct td_sched *ts;
2099
2100	THREAD_LOCK_ASSERT(td, MA_OWNED);
2101	tdq = TDQ_SELF();
2102#ifdef SMP
2103	/*
2104	 * We run the long term load balancer infrequently on the first cpu.
2105	 */
2106	if (balance_tdq == tdq) {
2107		if (balance_ticks && --balance_ticks == 0)
2108			sched_balance();
2109	}
2110#endif
2111	/*
2112	 * Advance the insert index once for each tick to ensure that all
2113	 * threads get a chance to run.
2114	 */
2115	if (tdq->tdq_idx == tdq->tdq_ridx) {
2116		tdq->tdq_idx = (tdq->tdq_idx + 1) % RQ_NQS;
2117		if (TAILQ_EMPTY(&tdq->tdq_timeshare.rq_queues[tdq->tdq_ridx]))
2118			tdq->tdq_ridx = tdq->tdq_idx;
2119	}
2120	ts = td->td_sched;
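	/* Threads in a FIFO class are never charged a time slice. */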
2121	if (td->td_pri_class & PRI_FIFO_BIT)
2122		return;
2123	if (td->td_pri_class == PRI_TIMESHARE) {
2124		/*
2125		 * We used a tick; charge it to the thread so
2126		 * that we can compute our interactivity.
2127		 */
2128		td->td_sched->ts_runtime += tickincr;
2129		sched_interact_update(td);
2130		sched_priority(td);
2131	}
2132	/*
2133	 * We used up one time slice.
2134	 */
2135	if (--ts->ts_slice > 0)
2136		return;
2137	/*
2138	 * We're out of time; force a requeue at userret().
2139	 */
2140	ts->ts_slice = sched_slice;
2141	td->td_flags |= TDF_NEEDRESCHED;
2142}
2143
2144/*
2145 * Called once per hz tick.  Used for cpu utilization information.  This
2146 * is easier than trying to scale based on stathz.
2147 */
2148void
2149sched_tick(void)
2150{
2151	struct td_sched *ts;
2152
2153	ts = curthread->td_sched;
2154	/* Adjust ticks for pctcpu */
2155	ts->ts_ticks += 1 << SCHED_TICK_SHIFT;
2156	ts->ts_ltick = ticks;
2157	/*
2158	 * Update if we've exceeded our desired tick threshold by over one
2159	 * second.
2160	 */
2161	if (ts->ts_ftick + SCHED_TICK_MAX < ts->ts_ltick)
2162		sched_pctcpu_update(ts);
2163}
2164
2165/*
2166 * Return whether the current CPU has runnable tasks.  Used for in-kernel
2167 * cooperative idle threads.
2168 */
2169int
2170sched_runnable(void)
2171{
2172	struct tdq *tdq;
2173	int load;
2174
2175	load = 1;
2176
2177	tdq = TDQ_SELF();
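	/*
	 * The idle thread is not counted in tdq_load; any other caller
	 * must discount its own entry.
	 */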
2178	if ((curthread->td_flags & TDF_IDLETD) != 0) {
2179		if (tdq->tdq_load > 0)
2180			goto out;
2181	} else
2182		if (tdq->tdq_load - 1 > 0)
2183			goto out;
2184	load = 0;
2185out:
2186	return (load);
2187}
2188
2189/*
2190 * Choose the highest priority thread to run.  The thread is removed from
2191 * the run-queue while running; however, the load remains.  For SMP we set
2192 * the tdq in the global idle bitmask if it idles here.
2193 */
2194struct thread *
2195sched_choose(void)
2196{
2197	struct td_sched *ts;
2198	struct tdq *tdq;
2199
2200	tdq = TDQ_SELF();
2201	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
2202	ts = tdq_choose(tdq);
2203	if (ts) {
2204		tdq_runq_rem(tdq, ts);
2205		return (ts->ts_thread);
2206	}
2207	return (PCPU_GET(idlethread));
2208}
2209
2210/*
2211 * Set owepreempt if necessary.  Preemption never happens directly in ULE;
2212 * we always request it once we exit a critical section.
2213 */
2214static inline void
2215sched_setpreempt(struct thread *td)
2216{
2217	struct thread *ctd;
2218	int cpri;
2219	int pri;
2220
2221	THREAD_LOCK_ASSERT(curthread, MA_OWNED);
2222
2223	ctd = curthread;
2224	pri = td->td_priority;
2225	cpri = ctd->td_priority;
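	/*
	 * A thread with better priority than curthread always sets
	 * TDF_NEEDRESCHED; owepreempt is set only when sched_shouldpreempt()
	 * agrees and the system is in a state where preemption is safe.
	 */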
2226	if (pri < cpri)
2227		ctd->td_flags |= TDF_NEEDRESCHED;
2228	if (panicstr != NULL || pri >= cpri || cold || TD_IS_INHIBITED(ctd))
2229		return;
2230	if (!sched_shouldpreempt(pri, cpri, 0))
2231		return;
2232	ctd->td_owepreempt = 1;
2233}
2234
2235/*
2236 * Add a thread to a thread queue.  Select the appropriate runq and add the
2237 * thread to it.  This is the internal function called when the tdq is
2238 * predetermined.
2239 */
2240void
2241tdq_add(struct tdq *tdq, struct thread *td, int flags)
2242{
2243	struct td_sched *ts;
2244
2245	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
2246	KASSERT((td->td_inhibitors == 0),
2247	    ("sched_add: trying to run inhibited thread"));
2248	KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
2249	    ("sched_add: bad thread state"));
2250	KASSERT(td->td_flags & TDF_INMEM,
2251	    ("sched_add: thread swapped out"));
2252
2253	ts = td->td_sched;
2254	if (td->td_priority < tdq->tdq_lowpri)
2255		tdq->tdq_lowpri = td->td_priority;
2256	tdq_runq_pick(tdq, ts);
2257	tdq_runq_add(tdq, ts, flags);
2258	tdq_load_add(tdq, ts);
2259}
2260
2261/*
2262 * Select the target thread queue and add a thread to it.  Request
2263 * preemption or IPI a remote processor if required.
2264 */
2265void
2266sched_add(struct thread *td, int flags)
2267{
2268	struct tdq *tdq;
2269#ifdef SMP
2270	struct td_sched *ts;
2271	int cpu;
2272#endif
2273	CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)",
2274	    td, td->td_name, td->td_priority, curthread,
2275	    curthread->td_name);
2276	THREAD_LOCK_ASSERT(td, MA_OWNED);
2277	/*
2278	 * Recalculate the priority before we select the target cpu or
2279	 * run-queue.
2280	 */
2281	if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE)
2282		sched_priority(td);
2283#ifdef SMP
2284	/*
2285	 * Pick the destination cpu and, if it isn't ours, transfer to the
2286	 * target cpu.
2287	 */
2288	ts = td->td_sched;
2289	cpu = sched_pickcpu(ts, flags);
2290	tdq = sched_setcpu(ts, cpu, flags);
2291	tdq_add(tdq, td, flags);
2292	if (cpu != PCPU_GET(cpuid)) {
2293		tdq_notify(tdq, ts);
2294		return;
2295	}
2296#else
2297	tdq = TDQ_SELF();
2298	TDQ_LOCK(tdq);
2299	/*
2300	 * Now that the thread is moving to the run-queue, set the lock
2301	 * to the scheduler's lock.
2302	 */
2303	thread_lock_set(td, TDQ_LOCKPTR(tdq));
2304	tdq_add(tdq, td, flags);
2305#endif
2306	if (!(flags & SRQ_YIELDING))
2307		sched_setpreempt(td);
2308}
2309
2310/*
2311 * Remove a thread from a run-queue without running it.  This is used
2312 * when we're stealing a thread from a remote queue.  Otherwise all threads
2313 * exit by calling sched_exit_thread() and sched_throw() themselves.
2314 */
2315void
2316sched_rem(struct thread *td)
2317{
2318	struct tdq *tdq;
2319	struct td_sched *ts;
2320
2321	CTR5(KTR_SCHED, "sched_rem: %p(%s) prio %d by %p(%s)",
2322	    td, td->td_name, td->td_priority, curthread,
2323	    curthread->td_name);
2324	ts = td->td_sched;
2325	tdq = TDQ_CPU(ts->ts_cpu);
2326	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
2327	MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
2328	KASSERT(TD_ON_RUNQ(td),
2329	    ("sched_rem: thread not on run queue"));
2330	tdq_runq_rem(tdq, ts);
2331	tdq_load_rem(tdq, ts);
2332	TD_SET_CAN_RUN(td);
2333	if (td->td_priority == tdq->tdq_lowpri)
2334		tdq_setlowpri(tdq, NULL);
2335}
2336
2337/*
2338 * Fetch cpu utilization information.  Updates on demand.
2339 */
2340fixpt_t
2341sched_pctcpu(struct thread *td)
2342{
2343	fixpt_t pctcpu;
2344	struct td_sched *ts;
2345
2346	pctcpu = 0;
2347	ts = td->td_sched;
2348	if (ts == NULL)
2349		return (0);
2350
2351	thread_lock(td);
2352	if (ts->ts_ticks) {
2353		int rtick;
2354
2355		sched_pctcpu_update(ts);
2356		/* How many rticks per second? */
2357		rtick = min(SCHED_TICK_HZ(ts) / SCHED_TICK_SECS, hz);
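		/* Convert rtick/hz into a fixpt_t fraction of one cpu. */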
2358		pctcpu = (FSCALE * ((FSCALE * rtick)/hz)) >> FSHIFT;
2359	}
2360	thread_unlock(td);
2361
2362	return (pctcpu);
2363}
2364
2365/*
2366 * Enforce affinity settings for a thread.  Called after adjustments to
2367 * cpumask.
2368 */
2369void
2370sched_affinity(struct thread *td)
2371{
2372#ifdef SMP
2373	struct td_sched *ts;
2374	int cpu;
2375
2376	THREAD_LOCK_ASSERT(td, MA_OWNED);
2377	ts = td->td_sched;
2378	if (THREAD_CAN_SCHED(td, ts->ts_cpu))
2379		return;
2380	if (!TD_IS_RUNNING(td))
2381		return;
2382	td->td_flags |= TDF_NEEDRESCHED;
2383	if (!THREAD_CAN_MIGRATE(td))
2384		return;
2385	/*
2386	 * Assign the new cpu and force a switch before returning to
2387	 * userspace.  If the target thread is not running locally, send
2388	 * an ipi to force the issue.
2389	 */
2390	cpu = ts->ts_cpu;
2391	ts->ts_cpu = sched_pickcpu(ts, 0);
2392	if (cpu != PCPU_GET(cpuid))
2393		ipi_selected(1 << cpu, IPI_PREEMPT);
2394#endif
2395}
2396
2397/*
2398 * Bind a thread to a target cpu.
2399 */
2400void
2401sched_bind(struct thread *td, int cpu)
2402{
2403	struct td_sched *ts;
2404
2405	THREAD_LOCK_ASSERT(td, MA_OWNED|MA_NOTRECURSED);
2406	ts = td->td_sched;
2407	if (ts->ts_flags & TSF_BOUND)
2408		sched_unbind(td);
2409	ts->ts_flags |= TSF_BOUND;
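	/* Remain pinned until sched_unbind() unpins us. */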
2410	sched_pin();
2411	if (PCPU_GET(cpuid) == cpu)
2412		return;
2413	ts->ts_cpu = cpu;
2414	/* When we return from mi_switch we'll be on the correct cpu. */
2415	mi_switch(SW_VOL, NULL);
2416}
2417
2418/*
2419 * Release a bound thread.
2420 */
2421void
2422sched_unbind(struct thread *td)
2423{
2424	struct td_sched *ts;
2425
2426	THREAD_LOCK_ASSERT(td, MA_OWNED);
2427	ts = td->td_sched;
2428	if ((ts->ts_flags & TSF_BOUND) == 0)
2429		return;
2430	ts->ts_flags &= ~TSF_BOUND;
2431	sched_unpin();
2432}
2433
2434int
2435sched_is_bound(struct thread *td)
2436{
2437	THREAD_LOCK_ASSERT(td, MA_OWNED);
2438	return (td->td_sched->ts_flags & TSF_BOUND);
2439}
2440
2441/*
2442 * Basic yield call.
2443 */
2444void
2445sched_relinquish(struct thread *td)
2446{
2447	thread_lock(td);
2448	SCHED_STAT_INC(switch_relinquish);
2449	mi_switch(SW_VOL, NULL);
2450	thread_unlock(td);
2451}
2452
2453/*
2454 * Return the total system load.
2455 */
2456int
2457sched_load(void)
2458{
2459#ifdef SMP
2460	int total;
2461	int i;
2462
2463	total = 0;
2464	for (i = 0; i <= mp_maxid; i++)
2465		total += TDQ_CPU(i)->tdq_sysload;
2466	return (total);
2467#else
2468	return (TDQ_SELF()->tdq_sysload);
2469#endif
2470}
2471
2472int
2473sched_sizeof_proc(void)
2474{
2475	return (sizeof(struct proc));
2476}
2477
2478int
2479sched_sizeof_thread(void)
2480{
2481	return (sizeof(struct thread) + sizeof(struct td_sched));
2482}
2483
2484/*
2485 * The actual idle process.
2486 */
2487void
2488sched_idletd(void *dummy)
2489{
2490	struct thread *td;
2491	struct tdq *tdq;
2492
2493	td = curthread;
2494	tdq = TDQ_SELF();
2495	mtx_assert(&Giant, MA_NOTOWNED);
2496	/* ULE relies on preemption for idle interruption. */
2497	for (;;) {
2498#ifdef SMP
2499		if (tdq_idled(tdq))
2500			cpu_idle();
2501#else
2502		cpu_idle();
2503#endif
2504	}
2505}
2506
2507/*
2508 * A CPU is entering the scheduler for the first time or a thread is exiting.
2509 */
2510void
2511sched_throw(struct thread *td)
2512{
2513	struct thread *newtd;
2514	struct tdq *tdq;
2515
2516	tdq = TDQ_SELF();
2517	if (td == NULL) {
2518		/* Correct spinlock nesting and acquire the correct lock. */
2519		TDQ_LOCK(tdq);
2520		spinlock_exit();
2521	} else {
2522		MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
2523		tdq_load_rem(tdq, td->td_sched);
2524		lock_profile_release_lock(&TDQ_LOCKPTR(tdq)->lock_object);
2525	}
2526	KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count"));
2527	newtd = choosethread();
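	/* Hand ownership of the run-queue lock to the incoming thread. */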
2528	TDQ_LOCKPTR(tdq)->mtx_lock = (uintptr_t)newtd;
2529	PCPU_SET(switchtime, cpu_ticks());
2530	PCPU_SET(switchticks, ticks);
2531	cpu_throw(td, newtd);		/* doesn't return */
2532}
2533
2534/*
2535 * This is called from fork_exit().  Just acquire the correct locks and
2536 * let fork do the rest of the work.
2537 */
2538void
2539sched_fork_exit(struct thread *td)
2540{
2541	struct td_sched *ts;
2542	struct tdq *tdq;
2543	int cpuid;
2544
2545	/*
2546	 * Finish setting up thread glue so that it begins execution in a
2547	 * non-nested critical section with the scheduler lock held.
2548	 */
2549	cpuid = PCPU_GET(cpuid);
2550	tdq = TDQ_CPU(cpuid);
2551	ts = td->td_sched;
2552	if (TD_IS_IDLETHREAD(td))
2553		td->td_lock = TDQ_LOCKPTR(tdq);
2554	MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
2555	td->td_oncpu = cpuid;
2556	TDQ_LOCK_ASSERT(tdq, MA_OWNED | MA_NOTRECURSED);
2557	lock_profile_obtain_lock_success(
2558	    &TDQ_LOCKPTR(tdq)->lock_object, 0, 0, __FILE__, __LINE__);
2559	tdq->tdq_lowpri = td->td_priority;
2560}
2561
2562static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0,
2563    "Scheduler");
2564SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ULE", 0,
2565    "Scheduler name");
2566SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0,
2567    "Slice size for timeshare threads");
2568SYSCTL_INT(_kern_sched, OID_AUTO, interact, CTLFLAG_RW, &sched_interact, 0,
2569     "Interactivity score threshold");
2570SYSCTL_INT(_kern_sched, OID_AUTO, preempt_thresh, CTLFLAG_RW, &preempt_thresh,
2571     0, "Min priority for preemption, lower priorities have greater precedence");
2572#ifdef SMP
2573SYSCTL_INT(_kern_sched, OID_AUTO, affinity, CTLFLAG_RW, &affinity, 0,
2574    "Number of hz ticks to keep thread affinity for");
2575SYSCTL_INT(_kern_sched, OID_AUTO, balance, CTLFLAG_RW, &rebalance, 0,
2576    "Enables the long-term load balancer");
2577SYSCTL_INT(_kern_sched, OID_AUTO, balance_interval, CTLFLAG_RW,
2578    &balance_interval, 0,
2579    "Average frequency in stathz ticks to run the long-term balancer");
2580SYSCTL_INT(_kern_sched, OID_AUTO, steal_htt, CTLFLAG_RW, &steal_htt, 0,
2581    "Steals work from another hyper-threaded core on idle");
2582SYSCTL_INT(_kern_sched, OID_AUTO, steal_idle, CTLFLAG_RW, &steal_idle, 0,
2583    "Attempts to steal work from other cores before idling");
2584SYSCTL_INT(_kern_sched, OID_AUTO, steal_thresh, CTLFLAG_RW, &steal_thresh, 0,
2585    "Minimum load on remote cpu before we'll steal");
2586#endif
2587
2588/* ps compat.  All cpu percentages from ULE are weighted. */
2589static int ccpu = 0;
2590SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
2591
2592
2593#define KERN_SWITCH_INCLUDE 1
2594#include "kern/kern_switch.c"
2595