sched_ule.c revision 139453
/*-
 * Copyright (c) 2002-2003, Jeffrey Roberson <jeff@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/sched_ule.c 139453 2004-12-30 20:52:44Z jhb $");

#include <opt_sched.h>

#define kse td_sched

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/turnstile.h>
#include <sys/vmmeter.h>
#ifdef KTRACE
#include <sys/uio.h>
#include <sys/ktrace.h>
#endif

#include <machine/cpu.h>
#include <machine/smp.h>

/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
/* XXX This is bogus compatibility crap for ps */
static fixpt_t  ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");

static void sched_setup(void *dummy);
SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL)

static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "Scheduler");

SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ule", 0,
    "Scheduler name");

static int slice_min = 1;
SYSCTL_INT(_kern_sched, OID_AUTO, slice_min, CTLFLAG_RW, &slice_min, 0, "");

static int slice_max = 10;
SYSCTL_INT(_kern_sched, OID_AUTO, slice_max, CTLFLAG_RW, &slice_max, 0, "");

int realstathz;
int tickincr = 1;

/*
 * The schedulable entity that can be given a context to run.
 * A process may have several of these. Probably one per processor
 * but possibly a few more. In this universe they are grouped
 * with a KSEG that contains the priority and niceness
 * for the group.
 */
struct kse {
	TAILQ_ENTRY(kse) ke_procq;	/* (j/z) Run queue. */
	int		ke_flags;	/* (j) KEF_* flags. */
	struct thread	*ke_thread;	/* (*) Active associated thread. */
	fixpt_t		ke_pctcpu;	/* (j) %cpu during p_swtime. */
	char		ke_rqindex;	/* (j) Run queue index. */
	enum {
		KES_THREAD = 0x0,	/* slaved to thread state */
		KES_ONRUNQ
	} ke_state;			/* (j) thread sched specific status. */
	int		ke_slptime;
	int		ke_slice;
	struct runq	*ke_runq;
	u_char		ke_cpu;		/* CPU that we have affinity for. */
	/* The following variables are only used for pctcpu calculation */
	int		ke_ltick;	/* Last tick that we were running on */
	int		ke_ftick;	/* First tick that we were running on */
	int		ke_ticks;	/* Tick count */

};


#define td_kse td_sched
#define	td_slptime		td_kse->ke_slptime
#define ke_proc			ke_thread->td_proc
#define ke_ksegrp		ke_thread->td_ksegrp

/* flags kept in ke_flags */
#define	KEF_SCHED0	0x00001	/* For scheduler-specific use. */
#define	KEF_SCHED1	0x00002	/* For scheduler-specific use. */
#define	KEF_SCHED2	0x00004	/* For scheduler-specific use. */
#define	KEF_SCHED3	0x00008	/* For scheduler-specific use. */
#define	KEF_SCHED4	0x00010
#define	KEF_SCHED5	0x00020
#define	KEF_DIDRUN	0x02000	/* Thread actually ran. */
#define	KEF_EXIT	0x04000	/* Thread is being killed. */

/*
 * These datastructures are allocated within their parent datastructure but
 * are scheduler specific.
 */

#define	ke_assign	ke_procq.tqe_next

#define	KEF_ASSIGNED	0x0001		/* Thread is being migrated. */
#define	KEF_BOUND	0x0002		/* Thread can not migrate. */
#define	KEF_XFERABLE	0x0004		/* Thread was added as transferable. */
#define	KEF_HOLD	0x0008		/* Thread is temporarily bound. */
#define	KEF_REMOVED	0x0010		/* Thread was removed while ASSIGNED */
#define	KEF_INTERNAL	0x0020

struct kg_sched {
	struct thread	*skg_last_assigned; /* (j) Last thread assigned to */
					   /* the system scheduler */
	int	skg_slptime;		/* Number of ticks we vol. slept */
	int	skg_runtime;		/* Number of ticks we were running */
	int	skg_avail_opennings;	/* (j) Num unfilled slots in group.*/
	int	skg_concurrency;	/* (j) Num threads requested in group.*/
};
#define kg_last_assigned	kg_sched->skg_last_assigned
#define kg_avail_opennings	kg_sched->skg_avail_opennings
#define kg_concurrency		kg_sched->skg_concurrency
#define kg_runtime		kg_sched->skg_runtime
#define kg_slptime		kg_sched->skg_slptime

#define SLOT_RELEASE(kg)						\
do {									\
	kg->kg_avail_opennings++; 					\
	CTR3(KTR_RUNQ, "kg %p(%d) Slot released (->%d)",		\
	kg,								\
	kg->kg_concurrency,						\
	 kg->kg_avail_opennings);					\
	/*KASSERT((kg->kg_avail_opennings <= kg->kg_concurrency),	\
	    ("slots out of whack")); */					\
} while (0)

#define SLOT_USE(kg)							\
do {									\
	kg->kg_avail_opennings--; 					\
	CTR3(KTR_RUNQ, "kg %p(%d) Slot used (->%d)",			\
	kg,								\
	kg->kg_concurrency,						\
	 kg->kg_avail_opennings);					\
	/*KASSERT((kg->kg_avail_opennings >= 0),			\
	    ("slots out of whack"));*/ 					\
} while (0)

static struct kse kse0;
static struct kg_sched kg_sched0;

/*
 * The priority is primarily determined by the interactivity score.  Thus, we
 * give lower (better) priorities to kse groups that use less CPU.  The nice
 * value is then directly added to this to allow nice to have some effect
 * on latency.
 *
 * PRI_RANGE:	Total priority range for timeshare threads.
 * PRI_NRESV:	Number of nice values.
 * PRI_BASE:	The start of the dynamic range.
 */
#define	SCHED_PRI_RANGE		(PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1)
#define	SCHED_PRI_NRESV		((PRIO_MAX - PRIO_MIN) + 1)
#define	SCHED_PRI_NHALF		(SCHED_PRI_NRESV / 2)
#define	SCHED_PRI_BASE		(PRI_MIN_TIMESHARE)
#define	SCHED_PRI_INTERACT(score)					\
    ((score) * SCHED_PRI_RANGE / SCHED_INTERACT_MAX)

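/*
 * Worked example (a sketch, assuming the stock timeshare range so that
 * SCHED_PRI_RANGE is 64): an interactivity score of 50 maps to
 * SCHED_PRI_INTERACT(50) = 50 * 64 / 100 = 32 priority steps above
 * SCHED_PRI_BASE, and sched_priority() then adds the process nice value
 * before clamping the result to the timeshare range.
 */
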
/*
 * These determine the interactivity of a process.
 *
 * SLP_RUN_MAX:	Maximum amount of sleep time + run time we'll accumulate
 *		before throttling back.
 * SLP_RUN_FORK:	Maximum slp+run time to inherit at fork time.
 * INTERACT_MAX:	Maximum interactivity value.  Smaller is better.
 * INTERACT_THRESH:	Threshold for placement on the current runq.
 */
#define	SCHED_SLP_RUN_MAX	((hz * 5) << 10)
#define	SCHED_SLP_RUN_FORK	((hz / 2) << 10)
#define	SCHED_INTERACT_MAX	(100)
#define	SCHED_INTERACT_HALF	(SCHED_INTERACT_MAX / 2)
#define	SCHED_INTERACT_THRESH	(30)

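/*
 * Worked example (a sketch, assuming hz = 1000): kg_slptime and kg_runtime
 * are kept in ticks scaled by 2^10, so SCHED_SLP_RUN_MAX is
 * (1000 * 5) << 10 = 5120000, i.e. five seconds of combined history, and
 * SCHED_SLP_RUN_FORK caps what a child inherits at half a second.  A ksegrp
 * whose interactivity score stays below SCHED_INTERACT_THRESH (30) is
 * treated as interactive and keeps going to the current run queue.
 */
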
/*
 * These parameters and macros determine the size of the time slice that is
 * granted to each thread.
 *
 * SLICE_MIN:	Minimum time slice granted, in units of ticks.
 * SLICE_MAX:	Maximum time slice granted.
 * SLICE_RANGE:	Range of available time slices scaled by hz.
 * SLICE_SCALE:	The number of slices granted per val in the range of [0, max].
 * SLICE_NICE:  Determine the amount of slice granted to a scaled nice.
 * SLICE_NTHRESH:	The nice cutoff point for slice assignment.
 */
#define	SCHED_SLICE_MIN			(slice_min)
#define	SCHED_SLICE_MAX			(slice_max)
#define	SCHED_SLICE_INTERACTIVE		(slice_max)
#define	SCHED_SLICE_NTHRESH	(SCHED_PRI_NHALF - 1)
#define	SCHED_SLICE_RANGE		(SCHED_SLICE_MAX - SCHED_SLICE_MIN + 1)
#define	SCHED_SLICE_SCALE(val, max)	(((val) * SCHED_SLICE_RANGE) / (max))
#define	SCHED_SLICE_NICE(nice)						\
    (SCHED_SLICE_MAX - SCHED_SLICE_SCALE((nice), SCHED_SLICE_NTHRESH))

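/*
 * Worked example (a sketch, assuming hz = 1000 and the defaults set in
 * sched_setup(): slice_min = hz/100 = 10 and slice_max = hz/7 = 142):
 * SCHED_SLICE_RANGE is 133 and SCHED_SLICE_NTHRESH is 19 with the stock
 * -20..20 nice range.  A kse whose nice value equals the least nice kse on
 * the queue (offset 0) gets the full 142 ticks, while one 10 nice points
 * worse gets SCHED_SLICE_NICE(10) = 142 - (10 * 133) / 19 = 72 ticks.
 * Offsets beyond SCHED_SLICE_NTHRESH receive no slice at all in
 * sched_slice() unless the process is at nice 0.
 */
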
/*
 * This macro determines whether or not the thread belongs on the current or
 * next run queue.
 */
#define	SCHED_INTERACTIVE(kg)						\
    (sched_interact_score(kg) < SCHED_INTERACT_THRESH)
#define	SCHED_CURR(kg, ke)						\
    ((ke->ke_thread->td_flags & TDF_BORROWING) || SCHED_INTERACTIVE(kg))

/*
 * Cpu percentage computation macros and defines.
 *
 * SCHED_CPU_TIME:	Number of seconds to average the cpu usage across.
 * SCHED_CPU_TICKS:	Number of hz ticks to average the cpu usage across.
 */

#define	SCHED_CPU_TIME	10
#define	SCHED_CPU_TICKS	(hz * SCHED_CPU_TIME)

/*
 * kseq - per processor runqs and statistics.
 */
struct kseq {
	struct runq	ksq_idle;		/* Queue of IDLE threads. */
	struct runq	ksq_timeshare[2];	/* Run queues for !IDLE. */
	struct runq	*ksq_next;		/* Next timeshare queue. */
	struct runq	*ksq_curr;		/* Current queue. */
	int		ksq_load_timeshare;	/* Load for timeshare. */
	int		ksq_load;		/* Aggregate load. */
	short		ksq_nice[SCHED_PRI_NRESV]; /* KSEs in each nice bin. */
	short		ksq_nicemin;		/* Least nice. */
#ifdef SMP
	int			ksq_transferable;
	LIST_ENTRY(kseq)	ksq_siblings;	/* Next in kseq group. */
	struct kseq_group	*ksq_group;	/* Our processor group. */
	volatile struct kse	*ksq_assigned;	/* assigned by another CPU. */
#else
	int		ksq_sysload;		/* For loadavg, !ITHD load. */
#endif
};

#ifdef SMP
/*
 * kseq groups are groups of processors which can cheaply share threads.  When
 * one processor in the group goes idle it will check the runqs of the other
 * processors in its group prior to halting and waiting for an interrupt.
 * These groups are suitable for SMT (Simultaneous Multi-Threading) and not
 * NUMA.  In a NUMA environment we'd want an idle bitmap per group and a
 * two-tiered load balancer.
 */
struct kseq_group {
	int	ksg_cpus;		/* Count of CPUs in this kseq group. */
	cpumask_t ksg_cpumask;		/* Mask of cpus in this group. */
	cpumask_t ksg_idlemask;		/* Idle cpus in this group. */
	cpumask_t ksg_mask;		/* Bit mask for first cpu. */
	int	ksg_load;		/* Total load of this group. */
	int	ksg_transferable;	/* Transferable load of this group. */
	LIST_HEAD(, kseq)	ksg_members; /* Linked list of all members. */
};
#endif

/*
 * One kse queue per processor.
 */
#ifdef SMP
static cpumask_t kseq_idle;
static int ksg_maxid;
static struct kseq	kseq_cpu[MAXCPU];
static struct kseq_group kseq_groups[MAXCPU];
static int bal_tick;
static int gbal_tick;
static int balance_groups;

#define	KSEQ_SELF()	(&kseq_cpu[PCPU_GET(cpuid)])
#define	KSEQ_CPU(x)	(&kseq_cpu[(x)])
#define	KSEQ_ID(x)	((x) - kseq_cpu)
#define	KSEQ_GROUP(x)	(&kseq_groups[(x)])
#else	/* !SMP */
static struct kseq	kseq_cpu;

#define	KSEQ_SELF()	(&kseq_cpu)
#define	KSEQ_CPU(x)	(&kseq_cpu)
#endif

static void	slot_fill(struct ksegrp *kg);
static struct kse *sched_choose(void);		/* XXX Should be thread * */
static void sched_slice(struct kse *ke);
static void sched_priority(struct ksegrp *kg);
static void sched_thread_priority(struct thread *td, u_char prio);
static int sched_interact_score(struct ksegrp *kg);
static void sched_interact_update(struct ksegrp *kg);
static void sched_interact_fork(struct ksegrp *kg);
static void sched_pctcpu_update(struct kse *ke);

/* Operations on per processor queues */
static struct kse * kseq_choose(struct kseq *kseq);
static void kseq_setup(struct kseq *kseq);
static void kseq_load_add(struct kseq *kseq, struct kse *ke);
static void kseq_load_rem(struct kseq *kseq, struct kse *ke);
static __inline void kseq_runq_add(struct kseq *kseq, struct kse *ke, int);
static __inline void kseq_runq_rem(struct kseq *kseq, struct kse *ke);
static void kseq_nice_add(struct kseq *kseq, int nice);
static void kseq_nice_rem(struct kseq *kseq, int nice);
void kseq_print(int cpu);
#ifdef SMP
static int kseq_transfer(struct kseq *ksq, struct kse *ke, int class);
static struct kse *runq_steal(struct runq *rq);
static void sched_balance(void);
static void sched_balance_groups(void);
static void sched_balance_group(struct kseq_group *ksg);
static void sched_balance_pair(struct kseq *high, struct kseq *low);
static void kseq_move(struct kseq *from, int cpu);
static int kseq_idled(struct kseq *kseq);
static void kseq_notify(struct kse *ke, int cpu);
static void kseq_assign(struct kseq *);
static struct kse *kseq_steal(struct kseq *kseq, int stealidle);
#define	KSE_CAN_MIGRATE(ke)						\
    ((ke)->ke_thread->td_pinned == 0 && ((ke)->ke_flags & KEF_BOUND) == 0)
#endif

void
kseq_print(int cpu)
{
	struct kseq *kseq;
	int i;

	kseq = KSEQ_CPU(cpu);

	printf("kseq:\n");
	printf("\tload:           %d\n", kseq->ksq_load);
	printf("\tload TIMESHARE: %d\n", kseq->ksq_load_timeshare);
#ifdef SMP
	printf("\tload transferable: %d\n", kseq->ksq_transferable);
#endif
	printf("\tnicemin:\t%d\n", kseq->ksq_nicemin);
	printf("\tnice counts:\n");
	for (i = 0; i < SCHED_PRI_NRESV; i++)
		if (kseq->ksq_nice[i])
			printf("\t\t%d = %d\n",
			    i - SCHED_PRI_NHALF, kseq->ksq_nice[i]);
}

static __inline void
kseq_runq_add(struct kseq *kseq, struct kse *ke, int flags)
{
#ifdef SMP
	if (KSE_CAN_MIGRATE(ke)) {
		kseq->ksq_transferable++;
		kseq->ksq_group->ksg_transferable++;
		ke->ke_flags |= KEF_XFERABLE;
	}
#endif
	runq_add(ke->ke_runq, ke, flags);
}

static __inline void
kseq_runq_rem(struct kseq *kseq, struct kse *ke)
{
#ifdef SMP
	if (ke->ke_flags & KEF_XFERABLE) {
		kseq->ksq_transferable--;
		kseq->ksq_group->ksg_transferable--;
		ke->ke_flags &= ~KEF_XFERABLE;
	}
#endif
	runq_remove(ke->ke_runq, ke);
}

static void
kseq_load_add(struct kseq *kseq, struct kse *ke)
{
	int class;
	mtx_assert(&sched_lock, MA_OWNED);
	class = PRI_BASE(ke->ke_ksegrp->kg_pri_class);
	if (class == PRI_TIMESHARE)
		kseq->ksq_load_timeshare++;
	kseq->ksq_load++;
	CTR1(KTR_SCHED, "load: %d", kseq->ksq_load);
	if (class != PRI_ITHD && (ke->ke_proc->p_flag & P_NOLOAD) == 0)
#ifdef SMP
		kseq->ksq_group->ksg_load++;
#else
		kseq->ksq_sysload++;
#endif
	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
		kseq_nice_add(kseq, ke->ke_proc->p_nice);
}

static void
kseq_load_rem(struct kseq *kseq, struct kse *ke)
{
	int class;
	mtx_assert(&sched_lock, MA_OWNED);
	class = PRI_BASE(ke->ke_ksegrp->kg_pri_class);
	if (class == PRI_TIMESHARE)
		kseq->ksq_load_timeshare--;
	if (class != PRI_ITHD  && (ke->ke_proc->p_flag & P_NOLOAD) == 0)
#ifdef SMP
		kseq->ksq_group->ksg_load--;
#else
		kseq->ksq_sysload--;
#endif
	kseq->ksq_load--;
	CTR1(KTR_SCHED, "load: %d", kseq->ksq_load);
	ke->ke_runq = NULL;
	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
		kseq_nice_rem(kseq, ke->ke_proc->p_nice);
}

static void
kseq_nice_add(struct kseq *kseq, int nice)
{
	mtx_assert(&sched_lock, MA_OWNED);
	/* Normalize to zero. */
	kseq->ksq_nice[nice + SCHED_PRI_NHALF]++;
	if (nice < kseq->ksq_nicemin || kseq->ksq_load_timeshare == 1)
		kseq->ksq_nicemin = nice;
}

static void
kseq_nice_rem(struct kseq *kseq, int nice)
{
	int n;

	mtx_assert(&sched_lock, MA_OWNED);
	/* Normalize to zero. */
	n = nice + SCHED_PRI_NHALF;
	kseq->ksq_nice[n]--;
	KASSERT(kseq->ksq_nice[n] >= 0, ("Negative nice count."));

	/*
	 * If this wasn't the smallest nice value or there are more in
	 * this bucket we can just return.  Otherwise we have to recalculate
	 * the smallest nice.
	 */
	if (nice != kseq->ksq_nicemin ||
	    kseq->ksq_nice[n] != 0 ||
	    kseq->ksq_load_timeshare == 0)
		return;

	for (; n < SCHED_PRI_NRESV; n++)
		if (kseq->ksq_nice[n]) {
			kseq->ksq_nicemin = n - SCHED_PRI_NHALF;
			return;
		}
}
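
/*
 * Illustration: with the stock -20..20 nice range SCHED_PRI_NHALF is 20,
 * so the ksq_nice[] bins are indexed by nice + 20; nice -20 lands in bin 0,
 * nice 0 in bin 20 and nice +20 in bin 40, and ksq_nicemin always tracks
 * the least nice (most favored) value currently queued.
 */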

#ifdef SMP
/*
 * sched_balance is a simple CPU load balancing algorithm.  It operates by
 * finding the least loaded and most loaded cpu and equalizing their load
 * by migrating some processes.
 *
 * Dealing only with two CPUs at a time has two advantages.  Firstly, most
 * installations will only have 2 cpus.  Secondly, load balancing too much at
 * once can have an unpleasant effect on the system.  The scheduler rarely has
 * enough information to make perfect decisions.  So this algorithm chooses
 * simplicity and more gradual effects on load in larger systems.
 *
 * It could be improved by considering the priorities and slices assigned to
 * each task prior to balancing them.  There are many pathological cases with
 * any approach and so the semi random algorithm below may work as well as any.
 */
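
/*
 * Illustration of the rebalancing step (hypothetical numbers): if the most
 * loaded group reports a load of 7 and the least loaded a load of 2,
 * sched_balance_pair() computes diff = 5 and moves diff / 2 rounded up,
 * i.e. 3 KSEs, capped at the transferable count, via kseq_move().  The
 * balancer then rearms itself for a random point within the next 2 * hz
 * ticks so the CPUs do not rebalance in lockstep.
 */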
static void
sched_balance(void)
{
	struct kseq_group *high;
	struct kseq_group *low;
	struct kseq_group *ksg;
	int cnt;
	int i;

	bal_tick = ticks + (random() % (hz * 2));
	if (smp_started == 0)
		return;
	low = high = NULL;
	i = random() % (ksg_maxid + 1);
	for (cnt = 0; cnt <= ksg_maxid; cnt++) {
		ksg = KSEQ_GROUP(i);
		/*
		 * Find the CPU with the highest load that has some
		 * threads to transfer.
		 */
		if ((high == NULL || ksg->ksg_load > high->ksg_load)
		    && ksg->ksg_transferable)
			high = ksg;
		if (low == NULL || ksg->ksg_load < low->ksg_load)
			low = ksg;
		if (++i > ksg_maxid)
			i = 0;
	}
	if (low != NULL && high != NULL && high != low)
		sched_balance_pair(LIST_FIRST(&high->ksg_members),
		    LIST_FIRST(&low->ksg_members));
}

static void
sched_balance_groups(void)
{
	int i;

	gbal_tick = ticks + (random() % (hz * 2));
	mtx_assert(&sched_lock, MA_OWNED);
	if (smp_started)
		for (i = 0; i <= ksg_maxid; i++)
			sched_balance_group(KSEQ_GROUP(i));
}

static void
sched_balance_group(struct kseq_group *ksg)
{
	struct kseq *kseq;
	struct kseq *high;
	struct kseq *low;
	int load;

	if (ksg->ksg_transferable == 0)
		return;
	low = NULL;
	high = NULL;
	LIST_FOREACH(kseq, &ksg->ksg_members, ksq_siblings) {
		load = kseq->ksq_load;
		if (high == NULL || load > high->ksq_load)
			high = kseq;
		if (low == NULL || load < low->ksq_load)
			low = kseq;
	}
	if (high != NULL && low != NULL && high != low)
		sched_balance_pair(high, low);
}

static void
sched_balance_pair(struct kseq *high, struct kseq *low)
{
	int transferable;
	int high_load;
	int low_load;
	int move;
	int diff;
	int i;

	/*
	 * If we're transferring within a group we have to use this specific
	 * kseq's transferable count, otherwise we can steal from other members
	 * of the group.
	 */
	if (high->ksq_group == low->ksq_group) {
		transferable = high->ksq_transferable;
		high_load = high->ksq_load;
		low_load = low->ksq_load;
	} else {
		transferable = high->ksq_group->ksg_transferable;
		high_load = high->ksq_group->ksg_load;
		low_load = low->ksq_group->ksg_load;
	}
	if (transferable == 0)
		return;
	/*
	 * Determine what the imbalance is and then adjust that to how many
	 * kses we actually have to give up (transferable).
	 */
	diff = high_load - low_load;
	move = diff / 2;
	if (diff & 0x1)
		move++;
	move = min(move, transferable);
	for (i = 0; i < move; i++)
		kseq_move(high, KSEQ_ID(low));
	return;
}

static void
kseq_move(struct kseq *from, int cpu)
{
	struct kseq *kseq;
	struct kseq *to;
	struct kse *ke;

	kseq = from;
	to = KSEQ_CPU(cpu);
	ke = kseq_steal(kseq, 1);
	if (ke == NULL) {
		struct kseq_group *ksg;

		ksg = kseq->ksq_group;
		LIST_FOREACH(kseq, &ksg->ksg_members, ksq_siblings) {
			if (kseq == from || kseq->ksq_transferable == 0)
				continue;
			ke = kseq_steal(kseq, 1);
			break;
		}
		if (ke == NULL)
			panic("kseq_move: No KSEs available with a "
			    "transferable count of %d\n",
			    ksg->ksg_transferable);
	}
	if (kseq == to)
		return;
	ke->ke_state = KES_THREAD;
	kseq_runq_rem(kseq, ke);
	kseq_load_rem(kseq, ke);
	kseq_notify(ke, cpu);
}

static int
kseq_idled(struct kseq *kseq)
{
	struct kseq_group *ksg;
	struct kseq *steal;
	struct kse *ke;

	ksg = kseq->ksq_group;
	/*
	 * If we're in a cpu group, try and steal kses from another cpu in
	 * the group before idling.
	 */
	if (ksg->ksg_cpus > 1 && ksg->ksg_transferable) {
		LIST_FOREACH(steal, &ksg->ksg_members, ksq_siblings) {
			if (steal == kseq || steal->ksq_transferable == 0)
				continue;
			ke = kseq_steal(steal, 0);
			if (ke == NULL)
				continue;
			ke->ke_state = KES_THREAD;
			kseq_runq_rem(steal, ke);
			kseq_load_rem(steal, ke);
			ke->ke_cpu = PCPU_GET(cpuid);
			ke->ke_flags |= KEF_INTERNAL | KEF_HOLD;
			sched_add(ke->ke_thread, SRQ_YIELDING);
			return (0);
		}
	}
	/*
	 * We only set the idled bit when all of the cpus in the group are
	 * idle.  Otherwise we could get into a situation where a KSE bounces
	 * back and forth between two idle cores on separate physical CPUs.
	 */
	ksg->ksg_idlemask |= PCPU_GET(cpumask);
	if (ksg->ksg_idlemask != ksg->ksg_cpumask)
		return (1);
	atomic_set_int(&kseq_idle, ksg->ksg_mask);
	return (1);
}
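
/*
 * Illustration: for an HTT pair sharing one kseq_group, ksg_cpumask covers
 * both logical CPUs.  When the first one idles, ksg_idlemask still differs
 * from ksg_cpumask, so kseq_idled() returns without touching the global
 * kseq_idle mask; only when the second CPU idles as well is ksg_mask set in
 * kseq_idle, which lets kseq_transfer() on other CPUs push work this way.
 */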

static void
kseq_assign(struct kseq *kseq)
{
	struct kse *nke;
	struct kse *ke;

	do {
		*(volatile struct kse **)&ke = kseq->ksq_assigned;
	} while(!atomic_cmpset_ptr(&kseq->ksq_assigned, ke, NULL));
	for (; ke != NULL; ke = nke) {
		nke = ke->ke_assign;
		kseq->ksq_group->ksg_load--;
		kseq->ksq_load--;
		ke->ke_flags &= ~KEF_ASSIGNED;
		ke->ke_flags |= KEF_INTERNAL | KEF_HOLD;
		sched_add(ke->ke_thread, SRQ_YIELDING);
	}
}

static void
kseq_notify(struct kse *ke, int cpu)
{
	struct kseq *kseq;
	struct thread *td;
	struct pcpu *pcpu;
	int class;
	int prio;

	kseq = KSEQ_CPU(cpu);
	/* XXX */
	class = PRI_BASE(ke->ke_ksegrp->kg_pri_class);
	if ((class == PRI_TIMESHARE || class == PRI_REALTIME) &&
	    (kseq_idle & kseq->ksq_group->ksg_mask))
		atomic_clear_int(&kseq_idle, kseq->ksq_group->ksg_mask);
	kseq->ksq_group->ksg_load++;
	kseq->ksq_load++;
	ke->ke_cpu = cpu;
	ke->ke_flags |= KEF_ASSIGNED;
	prio = ke->ke_thread->td_priority;

	/*
	 * Place a KSE on another cpu's queue and force a resched.
	 */
	do {
		*(volatile struct kse **)&ke->ke_assign = kseq->ksq_assigned;
	} while(!atomic_cmpset_ptr(&kseq->ksq_assigned, ke->ke_assign, ke));
	/*
	 * Without sched_lock we could lose a race where we set NEEDRESCHED
	 * on a thread that is switched out before the IPI is delivered.  This
	 * would lead us to miss the resched.  This will be a problem once
	 * sched_lock is pushed down.
	 */
	pcpu = pcpu_find(cpu);
	td = pcpu->pc_curthread;
	if (ke->ke_thread->td_priority < td->td_priority ||
	    td == pcpu->pc_idlethread) {
		td->td_flags |= TDF_NEEDRESCHED;
		ipi_selected(1 << cpu, IPI_AST);
	}
}

static struct kse *
runq_steal(struct runq *rq)
{
	struct rqhead *rqh;
	struct rqbits *rqb;
	struct kse *ke;
	int word;
	int bit;

	mtx_assert(&sched_lock, MA_OWNED);
	rqb = &rq->rq_status;
	for (word = 0; word < RQB_LEN; word++) {
		if (rqb->rqb_bits[word] == 0)
			continue;
		for (bit = 0; bit < RQB_BPW; bit++) {
			if ((rqb->rqb_bits[word] & (1ul << bit)) == 0)
				continue;
			rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)];
			TAILQ_FOREACH(ke, rqh, ke_procq) {
				if (KSE_CAN_MIGRATE(ke))
					return (ke);
			}
		}
	}
	return (NULL);
}

static struct kse *
kseq_steal(struct kseq *kseq, int stealidle)
{
	struct kse *ke;

	/*
	 * Steal from next first to try to get a non-interactive task that
	 * may not have run for a while.
	 */
	if ((ke = runq_steal(kseq->ksq_next)) != NULL)
		return (ke);
	if ((ke = runq_steal(kseq->ksq_curr)) != NULL)
		return (ke);
	if (stealidle)
		return (runq_steal(&kseq->ksq_idle));
	return (NULL);
}

int
kseq_transfer(struct kseq *kseq, struct kse *ke, int class)
{
	struct kseq_group *nksg;
	struct kseq_group *ksg;
	struct kseq *old;
	int cpu;
	int idx;

	if (smp_started == 0)
		return (0);
	cpu = 0;
	/*
	 * If our load exceeds a certain threshold we should attempt to
	 * reassign this thread.  The first candidate is the cpu that
	 * originally ran the thread.  If it is idle, assign it there,
	 * otherwise, pick an idle cpu.
	 *
	 * The threshold at which we start to reassign kses has a large impact
	 * on the overall performance of the system.  Tuned too high and
	 * some CPUs may idle.  Too low and there will be excess migration
	 * and context switches.
	 */
	old = KSEQ_CPU(ke->ke_cpu);
	nksg = old->ksq_group;
	ksg = kseq->ksq_group;
	if (kseq_idle) {
		if (kseq_idle & nksg->ksg_mask) {
			cpu = ffs(nksg->ksg_idlemask);
			if (cpu) {
				CTR2(KTR_SCHED,
				    "kseq_transfer: %p found old cpu %X "
				    "in idlemask.", ke, cpu);
				goto migrate;
			}
		}
		/*
		 * Multiple cpus could find this bit simultaneously
		 * but the race shouldn't be terrible.
		 */
		cpu = ffs(kseq_idle);
		if (cpu) {
			CTR2(KTR_SCHED, "kseq_transfer: %p found %X "
			    "in idlemask.", ke, cpu);
			goto migrate;
		}
	}
	idx = 0;
#if 0
	if (old->ksq_load < kseq->ksq_load) {
		cpu = ke->ke_cpu + 1;
		CTR2(KTR_SCHED, "kseq_transfer: %p old cpu %X "
		    "load less than ours.", ke, cpu);
		goto migrate;
	}
	/*
	 * No new CPU was found, look for one with less load.
	 */
	for (idx = 0; idx <= ksg_maxid; idx++) {
		nksg = KSEQ_GROUP(idx);
		if (nksg->ksg_load /*+ (nksg->ksg_cpus  * 2)*/ < ksg->ksg_load) {
			cpu = ffs(nksg->ksg_cpumask);
			CTR2(KTR_SCHED, "kseq_transfer: %p cpu %X load less "
			    "than ours.", ke, cpu);
			goto migrate;
		}
	}
#endif
	/*
	 * If another cpu in this group has idled, assign a thread over
	 * to them after checking to see if there are idled groups.
	 */
	if (ksg->ksg_idlemask) {
		cpu = ffs(ksg->ksg_idlemask);
		if (cpu) {
			CTR2(KTR_SCHED, "kseq_transfer: %p cpu %X idle in "
			    "group.", ke, cpu);
			goto migrate;
		}
	}
	return (0);
migrate:
	/*
	 * Now that we've found an idle CPU, migrate the thread.
	 */
	cpu--;
	ke->ke_runq = NULL;
	kseq_notify(ke, cpu);

	return (1);
}

#endif	/* SMP */

/*
 * Pick the highest priority task we have and return it.
 */

static struct kse *
kseq_choose(struct kseq *kseq)
{
	struct runq *swap;
	struct kse *ke;
	int nice;

	mtx_assert(&sched_lock, MA_OWNED);
	swap = NULL;

	for (;;) {
		ke = runq_choose(kseq->ksq_curr);
		if (ke == NULL) {
			/*
			 * We already swapped once and didn't get anywhere.
			 */
			if (swap)
				break;
			swap = kseq->ksq_curr;
			kseq->ksq_curr = kseq->ksq_next;
			kseq->ksq_next = swap;
			continue;
		}
		/*
		 * If we encounter a slice of 0 the kse is in a
		 * TIMESHARE kse group and its nice was too far out
		 * of the range that receives slices.
		 */
		nice = ke->ke_proc->p_nice + (0 - kseq->ksq_nicemin);
		if (ke->ke_slice == 0 || (nice > SCHED_SLICE_NTHRESH &&
		    ke->ke_proc->p_nice != 0)) {
			runq_remove(ke->ke_runq, ke);
			sched_slice(ke);
			ke->ke_runq = kseq->ksq_next;
			runq_add(ke->ke_runq, ke, 0);
			continue;
		}
		return (ke);
	}

	return (runq_choose(&kseq->ksq_idle));
}

static void
kseq_setup(struct kseq *kseq)
{
	runq_init(&kseq->ksq_timeshare[0]);
	runq_init(&kseq->ksq_timeshare[1]);
	runq_init(&kseq->ksq_idle);
	kseq->ksq_curr = &kseq->ksq_timeshare[0];
	kseq->ksq_next = &kseq->ksq_timeshare[1];
	kseq->ksq_load = 0;
	kseq->ksq_load_timeshare = 0;
}

static void
sched_setup(void *dummy)
{
#ifdef SMP
	int i;
#endif

	slice_min = (hz/100);	/* 10ms */
	slice_max = (hz/7);	/* ~140ms */

#ifdef SMP
	balance_groups = 0;
	/*
	 * Initialize the kseqs.
	 */
	for (i = 0; i < MAXCPU; i++) {
		struct kseq *ksq;

		ksq = &kseq_cpu[i];
		ksq->ksq_assigned = NULL;
		kseq_setup(&kseq_cpu[i]);
	}
	if (smp_topology == NULL) {
		struct kseq_group *ksg;
		struct kseq *ksq;
		int cpus;

		for (cpus = 0, i = 0; i < MAXCPU; i++) {
			if (CPU_ABSENT(i))
				continue;
			ksq = &kseq_cpu[cpus];
			ksg = &kseq_groups[cpus];
			/*
			 * Setup a kseq group with one member.
			 */
			ksq->ksq_transferable = 0;
			ksq->ksq_group = ksg;
			ksg->ksg_cpus = 1;
			ksg->ksg_idlemask = 0;
			ksg->ksg_cpumask = ksg->ksg_mask = 1 << i;
			ksg->ksg_load = 0;
			ksg->ksg_transferable = 0;
			LIST_INIT(&ksg->ksg_members);
			LIST_INSERT_HEAD(&ksg->ksg_members, ksq, ksq_siblings);
			cpus++;
		}
		ksg_maxid = cpus - 1;
	} else {
		struct kseq_group *ksg;
		struct cpu_group *cg;
		int j;

		for (i = 0; i < smp_topology->ct_count; i++) {
			cg = &smp_topology->ct_group[i];
			ksg = &kseq_groups[i];
			/*
			 * Initialize the group.
			 */
			ksg->ksg_idlemask = 0;
			ksg->ksg_load = 0;
			ksg->ksg_transferable = 0;
			ksg->ksg_cpus = cg->cg_count;
			ksg->ksg_cpumask = cg->cg_mask;
			LIST_INIT(&ksg->ksg_members);
			/*
			 * Find all of the group members and add them.
			 */
			for (j = 0; j < MAXCPU; j++) {
				if ((cg->cg_mask & (1 << j)) != 0) {
					if (ksg->ksg_mask == 0)
						ksg->ksg_mask = 1 << j;
					kseq_cpu[j].ksq_transferable = 0;
					kseq_cpu[j].ksq_group = ksg;
					LIST_INSERT_HEAD(&ksg->ksg_members,
					    &kseq_cpu[j], ksq_siblings);
				}
			}
			if (ksg->ksg_cpus > 1)
				balance_groups = 1;
		}
		ksg_maxid = smp_topology->ct_count - 1;
	}
	/*
	 * Stagger the group and global load balancer so they do not
	 * interfere with each other.
	 */
	bal_tick = ticks + hz;
	if (balance_groups)
		gbal_tick = ticks + (hz / 2);
#else
	kseq_setup(KSEQ_SELF());
#endif
	mtx_lock_spin(&sched_lock);
	kseq_load_add(KSEQ_SELF(), &kse0);
	mtx_unlock_spin(&sched_lock);
}

/*
 * Scale the scheduling priority according to the "interactivity" of this
 * process.
 */
static void
sched_priority(struct ksegrp *kg)
{
	int pri;

	if (kg->kg_pri_class != PRI_TIMESHARE)
		return;

	pri = SCHED_PRI_INTERACT(sched_interact_score(kg));
	pri += SCHED_PRI_BASE;
	pri += kg->kg_proc->p_nice;

	if (pri > PRI_MAX_TIMESHARE)
		pri = PRI_MAX_TIMESHARE;
	else if (pri < PRI_MIN_TIMESHARE)
		pri = PRI_MIN_TIMESHARE;

	kg->kg_user_pri = pri;

	return;
}

/*
 * Calculate a time slice based on the properties of the kseg and the runq
 * that we're on.  This is only for PRI_TIMESHARE ksegrps.
 */
static void
sched_slice(struct kse *ke)
{
	struct kseq *kseq;
	struct ksegrp *kg;

	kg = ke->ke_ksegrp;
	kseq = KSEQ_CPU(ke->ke_cpu);

	if (ke->ke_thread->td_flags & TDF_BORROWING) {
		ke->ke_slice = SCHED_SLICE_MIN;
		return;
	}

	/*
	 * Rationale:
	 * KSEs in interactive ksegs get a minimal slice so that we
	 * quickly notice if it abuses its advantage.
	 *
	 * KSEs in non-interactive ksegs are assigned a slice that is
	 * based on the ksegs nice value relative to the least nice kseg
	 * on the run queue for this cpu.
	 *
	 * If the KSE is less nice than all others it gets the maximum
	 * slice and other KSEs will adjust their slice relative to
	 * this when they first expire.
	 *
	 * There is a 20 point window that starts relative to the least
	 * nice kse on the run queue.  Slice size is determined by
	 * the kse distance from the least nice ksegrp.
	 *
	 * If the kse is outside of the window it will get no slice
	 * and will be reevaluated each time it is selected on the
	 * run queue.  The exception to this is nice 0 ksegs when
	 * a nice -20 is running.  They are always granted a minimum
	 * slice.
	 */
	if (!SCHED_INTERACTIVE(kg)) {
		int nice;

		nice = kg->kg_proc->p_nice + (0 - kseq->ksq_nicemin);
		if (kseq->ksq_load_timeshare == 0 ||
		    kg->kg_proc->p_nice < kseq->ksq_nicemin)
			ke->ke_slice = SCHED_SLICE_MAX;
		else if (nice <= SCHED_SLICE_NTHRESH)
			ke->ke_slice = SCHED_SLICE_NICE(nice);
		else if (kg->kg_proc->p_nice == 0)
			ke->ke_slice = SCHED_SLICE_MIN;
		else
			ke->ke_slice = 0;
	} else
		ke->ke_slice = SCHED_SLICE_INTERACTIVE;

	return;
}
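
/*
 * Example of the slice window (a sketch): with the least nice ksegrp on the
 * queue at -20 (ksq_nicemin == -20), a non-interactive ksegrp at nice +5 is
 * 25 points away, which is beyond SCHED_SLICE_NTHRESH (19), so it gets a
 * slice of 0 and kseq_choose() keeps pushing it to the next queue until the
 * picture changes; a nice 0 ksegrp at the same distance of 20 still gets
 * SCHED_SLICE_MIN because of the nice 0 exception above.
 */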

/*
 * This routine enforces a maximum limit on the amount of scheduling history
 * kept.  It is called after either the slptime or runtime is adjusted.
 * This routine will not operate correctly when slp or run times have been
 * adjusted to more than double their maximum.
 */
static void
sched_interact_update(struct ksegrp *kg)
{
	int sum;

	sum = kg->kg_runtime + kg->kg_slptime;
	if (sum < SCHED_SLP_RUN_MAX)
		return;
	/*
	 * If we have exceeded by more than 1/5th then the algorithm below
	 * will not bring us back into range.  Dividing by two here forces
	 * us into the range of [4/5 * SCHED_SLP_RUN_MAX, SCHED_SLP_RUN_MAX]
	 */
	if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) {
		kg->kg_runtime /= 2;
		kg->kg_slptime /= 2;
		return;
	}
	kg->kg_runtime = (kg->kg_runtime / 5) * 4;
	kg->kg_slptime = (kg->kg_slptime / 5) * 4;
}
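
/*
 * Worked example (a sketch, assuming hz = 1000 so SCHED_SLP_RUN_MAX is
 * 5120000): if kg_runtime is 4500000 and kg_slptime is 1500000 the sum,
 * 6000000, is over the cap but under (SCHED_SLP_RUN_MAX / 5) * 6, so both
 * are scaled to 4/5, leaving 3600000 and 1200000; had the sum exceeded that
 * higher threshold, both values would simply have been halved.
 */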

static void
sched_interact_fork(struct ksegrp *kg)
{
	int ratio;
	int sum;

	sum = kg->kg_runtime + kg->kg_slptime;
	if (sum > SCHED_SLP_RUN_FORK) {
		ratio = sum / SCHED_SLP_RUN_FORK;
		kg->kg_runtime /= ratio;
		kg->kg_slptime /= ratio;
	}
}

static int
sched_interact_score(struct ksegrp *kg)
{
	int div;

	if (kg->kg_runtime > kg->kg_slptime) {
		div = max(1, kg->kg_runtime / SCHED_INTERACT_HALF);
		return (SCHED_INTERACT_HALF +
		    (SCHED_INTERACT_HALF - (kg->kg_slptime / div)));
	} if (kg->kg_slptime > kg->kg_runtime) {
		div = max(1, kg->kg_slptime / SCHED_INTERACT_HALF);
		return (kg->kg_runtime / div);
	}

	/*
	 * This can happen if slptime and runtime are 0.
	 */
	return (0);

}
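
/*
 * Worked example (hypothetical tick counts): a ksegrp with kg_slptime =
 * 400000 and kg_runtime = 100000 scores 100000 / (400000 / 50) = 12, well
 * under SCHED_INTERACT_THRESH, so it is treated as interactive; with the
 * two values swapped the score becomes 50 + (50 - 12) = 88 and the ksegrp
 * is handled as a CPU hog.
 */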

/*
 * Very early in the boot some setup of scheduler-specific
 * parts of proc0 and of some scheduler resources needs to be done.
 * Called from:
 *  proc0_init()
 */
void
schedinit(void)
{
	/*
	 * Set up the scheduler specific parts of proc0.
	 */
	proc0.p_sched = NULL; /* XXX */
	ksegrp0.kg_sched = &kg_sched0;
	thread0.td_sched = &kse0;
	kse0.ke_thread = &thread0;
	kse0.ke_state = KES_THREAD;
	kg_sched0.skg_concurrency = 1;
	kg_sched0.skg_avail_opennings = 0; /* we are already running */
}

/*
 * This is only somewhat accurate since given many processes of the same
 * priority they will switch when their slices run out, which will be
 * at most SCHED_SLICE_MAX.
 */
int
sched_rr_interval(void)
{
	return (SCHED_SLICE_MAX);
}

static void
sched_pctcpu_update(struct kse *ke)
{
	/*
	 * Adjust counters and watermark for pctcpu calc.
	 */
	if (ke->ke_ltick > ticks - SCHED_CPU_TICKS) {
		/*
		 * Shift the tick count out so that the divide doesn't
		 * round away our results.
		 */
		ke->ke_ticks <<= 10;
		ke->ke_ticks = (ke->ke_ticks / (ticks - ke->ke_ftick)) *
			    SCHED_CPU_TICKS;
		ke->ke_ticks >>= 10;
	} else
		ke->ke_ticks = 0;
	ke->ke_ltick = ticks;
	ke->ke_ftick = ke->ke_ltick - SCHED_CPU_TICKS;
}
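
/*
 * Worked example (a sketch, assuming hz = 1000 so SCHED_CPU_TICKS is 10000):
 * if ke_ftick was 15000 ticks ago and ke_ticks is 3000, the count is rescaled
 * to roughly 3000 * 10000 / 15000 = 2000 (1992 after the integer shifts and
 * truncation), so ke_ticks always describes a SCHED_CPU_TICKS-sized window
 * ending at the current tick.
 */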

void
sched_thread_priority(struct thread *td, u_char prio)
{
	struct kse *ke;

	CTR6(KTR_SCHED, "sched_prio: %p(%s) prio %d newprio %d by %p(%s)",
	    td, td->td_proc->p_comm, td->td_priority, prio, curthread,
	    curthread->td_proc->p_comm);
	ke = td->td_kse;
	mtx_assert(&sched_lock, MA_OWNED);
	if (td->td_priority == prio)
		return;
	if (TD_ON_RUNQ(td)) {
		/*
		 * If the priority has been elevated due to priority
		 * propagation, we may have to move ourselves to a new
		 * queue.  We still call adjustrunqueue below in case kse
		 * needs to fix things up.
		 */
		if (prio < td->td_priority && ke->ke_runq != NULL &&
		    (ke->ke_flags & KEF_ASSIGNED) == 0 &&
		    ke->ke_runq != KSEQ_CPU(ke->ke_cpu)->ksq_curr) {
			runq_remove(ke->ke_runq, ke);
			ke->ke_runq = KSEQ_CPU(ke->ke_cpu)->ksq_curr;
			runq_add(ke->ke_runq, ke, 0);
		}
		/*
		 * Hold this kse on this cpu so that sched_prio() doesn't
		 * cause excessive migration.  We only want migration to
		 * happen as the result of a wakeup.
		 */
		ke->ke_flags |= KEF_HOLD;
		adjustrunqueue(td, prio);
		ke->ke_flags &= ~KEF_HOLD;
	} else
		td->td_priority = prio;
}

/*
 * Update a thread's priority when it is lent another thread's
 * priority.
 */
void
sched_lend_prio(struct thread *td, u_char prio)
{

	td->td_flags |= TDF_BORROWING;
	sched_thread_priority(td, prio);
}

/*
 * Restore a thread's priority when priority propagation is
 * over.  The prio argument is the minimum priority the thread
 * needs to have to satisfy other possible priority lending
 * requests.  If the thread's regular priority is less
 * important than prio, the thread will keep a priority boost
 * of prio.
 */
void
sched_unlend_prio(struct thread *td, u_char prio)
{
	u_char base_pri;

	if (td->td_base_pri >= PRI_MIN_TIMESHARE &&
	    td->td_base_pri <= PRI_MAX_TIMESHARE)
		base_pri = td->td_ksegrp->kg_user_pri;
	else
		base_pri = td->td_base_pri;
	if (prio >= base_pri) {
		td->td_flags &= ~TDF_BORROWING;
		sched_thread_priority(td, base_pri);
	} else
		sched_lend_prio(td, prio);
}
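
/*
 * Illustration (hypothetical priorities; lower numbers are more important):
 * a timeshare thread whose kg_user_pri is 160 and which was lent priority 96
 * through a turnstile calls here when that loan is released.  If the highest
 * remaining lending request is 120, 120 < 160 so the thread keeps borrowing
 * at 120; if it is 180 or there are no requests better than 160, the thread
 * drops back to 160 and TDF_BORROWING is cleared.
 */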

void
sched_prio(struct thread *td, u_char prio)
{
	u_char oldprio;

	/* First, update the base priority. */
	td->td_base_pri = prio;

	/*
	 * If the thread is borrowing another thread's priority, don't
	 * ever lower the priority.
	 */
	if (td->td_flags & TDF_BORROWING && td->td_priority < prio)
		return;

	/* Change the real priority. */
	oldprio = td->td_priority;
	sched_thread_priority(td, prio);

	/*
	 * If the thread is on a turnstile, then let the turnstile update
	 * its state.
	 */
	if (TD_ON_LOCK(td) && oldprio != prio)
		turnstile_adjust(td, oldprio);
}

void
sched_switch(struct thread *td, struct thread *newtd, int flags)
{
	struct kseq *ksq;
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);

	ke = td->td_kse;
	ksq = KSEQ_SELF();

	td->td_lastcpu = td->td_oncpu;
	td->td_oncpu = NOCPU;
	td->td_flags &= ~TDF_NEEDRESCHED;
	td->td_pflags &= ~TDP_OWEPREEMPT;

	/*
	 * If the KSE has been assigned it may be in the process of switching
	 * to the new cpu.  This is the case in sched_bind().
	 */
	if (td == PCPU_GET(idlethread)) {
		TD_SET_CAN_RUN(td);
	} else if ((ke->ke_flags & KEF_ASSIGNED) == 0) {
		/* We are ending our run so make our slot available again */
		SLOT_RELEASE(td->td_ksegrp);
		if (ke->ke_runq == NULL)
			panic("Thread not on runq.");
		kseq_load_rem(ksq, ke);
		if (TD_IS_RUNNING(td)) {
			/*
			 * Don't allow the thread to migrate
			 * from a preemption.
			 */
			ke->ke_flags |= KEF_HOLD;
			setrunqueue(td, (flags & SW_PREEMPT) ?
			    SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
			    SRQ_OURSELF|SRQ_YIELDING);
			ke->ke_flags &= ~KEF_HOLD;
		} else if ((td->td_proc->p_flag & P_HADTHREADS) &&
		    (newtd == NULL || newtd->td_ksegrp != td->td_ksegrp))
			/*
			 * We will not be on the run queue.
			 * So we must be sleeping or similar.
			 * Don't use the slot if we will need it
			 * for newtd.
			 */
			slot_fill(td->td_ksegrp);
	}
	if (newtd != NULL) {
		/*
		 * If we bring in a thread,
		 * then account for it as if it had been added to the
		 * run queue and then chosen.
		 */
		newtd->td_kse->ke_flags |= KEF_DIDRUN;
		newtd->td_kse->ke_runq = ksq->ksq_curr;
		SLOT_USE(newtd->td_ksegrp);
		TD_SET_RUNNING(newtd);
		kseq_load_add(KSEQ_SELF(), newtd->td_kse);
	} else
		newtd = choosethread();
	if (td != newtd)
		cpu_switch(td, newtd);
	sched_lock.mtx_lock = (uintptr_t)td;

	td->td_oncpu = PCPU_GET(cpuid);
}

1403109864Sjeffvoid
1404130551Sjuliansched_nice(struct proc *p, int nice)
1405109864Sjeff{
1406130551Sjulian	struct ksegrp *kg;
1407113357Sjeff	struct kse *ke;
1408109864Sjeff	struct thread *td;
1409113357Sjeff	struct kseq *kseq;
1410109864Sjeff
1411130551Sjulian	PROC_LOCK_ASSERT(p, MA_OWNED);
1412113873Sjhb	mtx_assert(&sched_lock, MA_OWNED);
1413113357Sjeff	/*
1414113357Sjeff	 * We need to adjust the nice counts for running KSEs.
1415113357Sjeff	 */
1416130551Sjulian	FOREACH_KSEGRP_IN_PROC(p, kg) {
1417130551Sjulian		if (kg->kg_pri_class == PRI_TIMESHARE) {
1418134791Sjulian			FOREACH_THREAD_IN_GROUP(kg, td) {
1419134791Sjulian				ke = td->td_kse;
1420130551Sjulian				if (ke->ke_runq == NULL)
1421130551Sjulian					continue;
1422130551Sjulian				kseq = KSEQ_CPU(ke->ke_cpu);
1423130551Sjulian				kseq_nice_rem(kseq, p->p_nice);
1424130551Sjulian				kseq_nice_add(kseq, nice);
1425130551Sjulian			}
1426113357Sjeff		}
1427130551Sjulian	}
1428130551Sjulian	p->p_nice = nice;
1429130551Sjulian	FOREACH_KSEGRP_IN_PROC(p, kg) {
1430130551Sjulian		sched_priority(kg);
1431130551Sjulian		FOREACH_THREAD_IN_GROUP(kg, td)
1432130551Sjulian			td->td_flags |= TDF_NEEDRESCHED;
1433130551Sjulian	}
1434109864Sjeff}
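/*
 * Illustrative sketch (editor's addition, not part of the original source):
 * for a hypothetical renice of a process from nice 0 to nice 10, every
 * timeshare KSE of the process that currently has a run queue adjusts its
 * kseq's nice bookkeeping with
 *
 *	kseq_nice_rem(kseq, 0);
 *	kseq_nice_add(kseq, 10);
 *
 * after which each ksegrp's priority is recomputed and all of the process's
 * threads are flagged TDF_NEEDRESCHED so the new nice value takes effect at
 * the next context switch.
 */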
1435109864Sjeff
1436109864Sjeffvoid
1437126326Sjhbsched_sleep(struct thread *td)
1438109864Sjeff{
1439109864Sjeff	mtx_assert(&sched_lock, MA_OWNED);
1440109864Sjeff
1441109864Sjeff	td->td_slptime = ticks;
1442109864Sjeff}
1443109864Sjeff
1444109864Sjeffvoid
1445109864Sjeffsched_wakeup(struct thread *td)
1446109864Sjeff{
1447109864Sjeff	mtx_assert(&sched_lock, MA_OWNED);
1448109864Sjeff
1449109864Sjeff	/*
1450109864Sjeff	 * Let the kseg know how long we slept for.  This is because process
1451109864Sjeff	 * interactivity behavior is modeled in the kseg.
1452109864Sjeff	 */
1453111788Sjeff	if (td->td_slptime) {
1454111788Sjeff		struct ksegrp *kg;
1455113357Sjeff		int hzticks;
1456109864Sjeff
1457111788Sjeff		kg = td->td_ksegrp;
1458121868Sjeff		hzticks = (ticks - td->td_slptime) << 10;
1459121868Sjeff		if (hzticks >= SCHED_SLP_RUN_MAX) {
1460121868Sjeff			kg->kg_slptime = SCHED_SLP_RUN_MAX;
1461121868Sjeff			kg->kg_runtime = 1;
1462121868Sjeff		} else {
1463121868Sjeff			kg->kg_slptime += hzticks;
1464121868Sjeff			sched_interact_update(kg);
1465121868Sjeff		}
1466111788Sjeff		sched_priority(kg);
1467134791Sjulian		sched_slice(td->td_kse);
1468111788Sjeff		td->td_slptime = 0;
1469109864Sjeff	}
1470134586Sjulian	setrunqueue(td, SRQ_BORING);
1471109864Sjeff}
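/*
 * Worked example (editor's addition, not part of the original source): sleep
 * time is stored as ticks shifted left by 10, so a thread that slept for a
 * hypothetical 50 ticks contributes
 *
 *	hzticks = (ticks - td_slptime) << 10 = 50 << 10 = 51200
 *
 * to kg_slptime.  A sleep of SCHED_SLP_RUN_MAX or longer instead saturates
 * the history and resets kg_runtime to 1, making the group look highly
 * interactive.
 */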
1472109864Sjeff
1473109864Sjeff/*
1474109864Sjeff * Penalize the parent for creating a new child and initialize the child's
1475109864Sjeff * priority.
1476109864Sjeff */
1477109864Sjeffvoid
1478134791Sjuliansched_fork(struct thread *td, struct thread *childtd)
1479109864Sjeff{
1480109864Sjeff
1481109864Sjeff	mtx_assert(&sched_lock, MA_OWNED);
1482109864Sjeff
1483134791Sjulian	sched_fork_ksegrp(td, childtd->td_ksegrp);
1484134791Sjulian	sched_fork_thread(td, childtd);
1485113357Sjeff}
1486113357Sjeff
1487113357Sjeffvoid
1488132372Sjuliansched_fork_ksegrp(struct thread *td, struct ksegrp *child)
1489113357Sjeff{
1490132372Sjulian	struct ksegrp *kg = td->td_ksegrp;
1491134791Sjulian	mtx_assert(&sched_lock, MA_OWNED);
1492116365Sjeff
1493121868Sjeff	child->kg_slptime = kg->kg_slptime;
1494121868Sjeff	child->kg_runtime = kg->kg_runtime;
1495121868Sjeff	child->kg_user_pri = kg->kg_user_pri;
1496121868Sjeff	sched_interact_fork(child);
1497116463Sjeff	kg->kg_runtime += tickincr << 10;
1498116463Sjeff	sched_interact_update(kg);
1499113357Sjeff}
1500109864Sjeff
1501113357Sjeffvoid
1502113357Sjeffsched_fork_thread(struct thread *td, struct thread *child)
1503113357Sjeff{
1504134791Sjulian	struct kse *ke;
1505134791Sjulian	struct kse *ke2;
1506134791Sjulian
1507134791Sjulian	sched_newthread(child);
1508134791Sjulian	ke = td->td_kse;
1509134791Sjulian	ke2 = child->td_kse;
1510134791Sjulian	ke2->ke_slice = 1;	/* Attempt to quickly learn interactivity. */
1511134791Sjulian	ke2->ke_cpu = ke->ke_cpu;
1512134791Sjulian	ke2->ke_runq = NULL;
1513134791Sjulian
1514134791Sjulian	/* Grab our parent's cpu estimation information. */
1515134791Sjulian	ke2->ke_ticks = ke->ke_ticks;
1516134791Sjulian	ke2->ke_ltick = ke->ke_ltick;
1517134791Sjulian	ke2->ke_ftick = ke->ke_ftick;
1518113357Sjeff}
1519113357Sjeff
1520113357Sjeffvoid
1521113357Sjeffsched_class(struct ksegrp *kg, int class)
1522113357Sjeff{
1523113357Sjeff	struct kseq *kseq;
1524113357Sjeff	struct kse *ke;
1525134791Sjulian	struct thread *td;
1526121896Sjeff	int nclass;
1527121896Sjeff	int oclass;
1528113357Sjeff
1529113923Sjhb	mtx_assert(&sched_lock, MA_OWNED);
1530113357Sjeff	if (kg->kg_pri_class == class)
1531113357Sjeff		return;
1532113357Sjeff
1533121896Sjeff	nclass = PRI_BASE(class);
1534121896Sjeff	oclass = PRI_BASE(kg->kg_pri_class);
1535134791Sjulian	FOREACH_THREAD_IN_GROUP(kg, td) {
1536134791Sjulian		ke = td->td_kse;
1537113357Sjeff		if (ke->ke_state != KES_ONRUNQ &&
1538113357Sjeff		    ke->ke_state != KES_THREAD)
1539113357Sjeff			continue;
1540113357Sjeff		kseq = KSEQ_CPU(ke->ke_cpu);
1541113357Sjeff
1542121896Sjeff#ifdef SMP
1543122744Sjeff		/*
1544122744Sjeff		 * On SMP if we're on the RUNQ we must adjust the transferable
1545122744Sjeff		 * count because we could be changing to or from an interrupt
1546122744Sjeff		 * class.
1547122744Sjeff		 */
1548122744Sjeff		if (ke->ke_state == KES_ONRUNQ) {
1549139334Sjeff			if (KSE_CAN_MIGRATE(ke)) {
1550123433Sjeff				kseq->ksq_transferable--;
1551123433Sjeff				kseq->ksq_group->ksg_transferable--;
1552123433Sjeff			}
1553139334Sjeff			if (KSE_CAN_MIGRATE(ke)) {
1554123433Sjeff				kseq->ksq_transferable++;
1555123433Sjeff				kseq->ksq_group->ksg_transferable++;
1556123433Sjeff			}
1557122744Sjeff		}
1558121896Sjeff#endif
1559122744Sjeff		if (oclass == PRI_TIMESHARE) {
1560121896Sjeff			kseq->ksq_load_timeshare--;
1561130551Sjulian			kseq_nice_rem(kseq, kg->kg_proc->p_nice);
1562122744Sjeff		}
1563122744Sjeff		if (nclass == PRI_TIMESHARE) {
1564121896Sjeff			kseq->ksq_load_timeshare++;
1565130551Sjulian			kseq_nice_add(kseq, kg->kg_proc->p_nice);
1566122744Sjeff		}
1567109970Sjeff	}
1568109970Sjeff
1569113357Sjeff	kg->kg_pri_class = class;
1570109864Sjeff}
1571109864Sjeff
1572109864Sjeff/*
1573109864Sjeff * Return some of the child's priority and interactivity to the parent.
1574109864Sjeff */
1575109864Sjeffvoid
1576134791Sjuliansched_exit(struct proc *p, struct thread *childtd)
1577109864Sjeff{
1578109864Sjeff	mtx_assert(&sched_lock, MA_OWNED);
1579134791Sjulian	sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), childtd);
1580139316Sjeff	sched_exit_thread(NULL, childtd);
1581109864Sjeff}
1582109864Sjeff
1583109864Sjeffvoid
1584132372Sjuliansched_exit_ksegrp(struct ksegrp *kg, struct thread *td)
1585113372Sjeff{
1586132372Sjulian	/* kg->kg_slptime += td->td_ksegrp->kg_slptime; */
1587132372Sjulian	kg->kg_runtime += td->td_ksegrp->kg_runtime;
1588116463Sjeff	sched_interact_update(kg);
1589113372Sjeff}
1590113372Sjeff
1591113372Sjeffvoid
1592134791Sjuliansched_exit_thread(struct thread *td, struct thread *childtd)
1593113372Sjeff{
1594139316Sjeff	CTR3(KTR_SCHED, "sched_exit_thread: %p(%s) prio %d",
1595139316Sjeff	    childtd, childtd->td_proc->p_comm, childtd->td_priority);
1596134791Sjulian	kseq_load_rem(KSEQ_CPU(childtd->td_kse->ke_cpu), childtd->td_kse);
1597113372Sjeff}
1598113372Sjeff
1599113372Sjeffvoid
1600121127Sjeffsched_clock(struct thread *td)
1601109864Sjeff{
1602113357Sjeff	struct kseq *kseq;
1603113357Sjeff	struct ksegrp *kg;
1604121127Sjeff	struct kse *ke;
1605109864Sjeff
1606129982Sjeff	mtx_assert(&sched_lock, MA_OWNED);
1607133427Sjeff	kseq = KSEQ_SELF();
1608129982Sjeff#ifdef SMP
1609139334Sjeff	if (ticks >= bal_tick)
1610129982Sjeff		sched_balance();
1611139334Sjeff	if (ticks >= gbal_tick && balance_groups)
1612129982Sjeff		sched_balance_groups();
1613133427Sjeff	/*
1614133427Sjeff	 * We could have been assigned a non-real-time thread without an
1615133427Sjeff	 * IPI.
1616133427Sjeff	 */
1617133427Sjeff	if (kseq->ksq_assigned)
1618133427Sjeff		kseq_assign(kseq);	/* Potentially sets NEEDRESCHED */
1619129982Sjeff#endif
1620113357Sjeff	/*
1621113357Sjeff	 * sched_setup() apparently happens prior to stathz being set.  We
1622113357Sjeff	 * need to resolve the timers earlier in the boot so we can avoid
1623113357Sjeff	 * calculating this here.
1624113357Sjeff	 */
1625113357Sjeff	if (realstathz == 0) {
1626113357Sjeff		realstathz = stathz ? stathz : hz;
1627113357Sjeff		tickincr = hz / realstathz;
1628113357Sjeff		/*
1629113357Sjeff		 * XXX This does not work for values of stathz that are much
1630113357Sjeff		 * larger than hz.
1631113357Sjeff		 */
1632113357Sjeff		if (tickincr == 0)
1633113357Sjeff			tickincr = 1;
1634113357Sjeff	}
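	/*
	 * Worked example (editor's addition, not part of the original
	 * source): with hypothetical values hz = 1000 and stathz = 128 the
	 * integer division above gives tickincr = 1000 / 128 = 7, and each
	 * stat clock tick below charges 7 << 10 to kg_runtime.  Only when
	 * stathz exceeds hz does tickincr collapse to 0 and get clamped to 1.
	 */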
1635109864Sjeff
1636121127Sjeff	ke = td->td_kse;
1637113357Sjeff	kg = ke->ke_ksegrp;
1638109864Sjeff
1639110028Sjeff	/* Adjust ticks for pctcpu */
1640111793Sjeff	ke->ke_ticks++;
1641109971Sjeff	ke->ke_ltick = ticks;
1642112994Sjeff
1643109971Sjeff	/* Go up to one second beyond our max and then trim back down */
1644109971Sjeff	if (ke->ke_ftick + SCHED_CPU_TICKS + hz < ke->ke_ltick)
1645109971Sjeff		sched_pctcpu_update(ke);
1646109971Sjeff
1647114496Sjulian	if (td->td_flags & TDF_IDLETD)
1648109864Sjeff		return;
1649110028Sjeff	/*
1650113357Sjeff	 * We only run the slicing code for TIMESHARE ksegrps.
1651113357Sjeff	 */
1652113357Sjeff	if (kg->kg_pri_class != PRI_TIMESHARE)
1653113357Sjeff		return;
1654113357Sjeff	/*
1655110645Sjeff	 * We used a tick; charge it to the ksegrp so that we can compute our
1656113357Sjeff	 * interactivity.
1657109864Sjeff	 */
1658113357Sjeff	kg->kg_runtime += tickincr << 10;
1659116463Sjeff	sched_interact_update(kg);
1660110645Sjeff
1661109864Sjeff	/*
1662109864Sjeff	 * We used up one time slice.
1663109864Sjeff	 */
1664122847Sjeff	if (--ke->ke_slice > 0)
1665113357Sjeff		return;
1666109864Sjeff	/*
1667113357Sjeff	 * We're out of time, recompute priorities and requeue.
1668109864Sjeff	 */
1669122744Sjeff	kseq_load_rem(kseq, ke);
1670113357Sjeff	sched_priority(kg);
1671113357Sjeff	sched_slice(ke);
1672113357Sjeff	if (SCHED_CURR(kg, ke))
1673113357Sjeff		ke->ke_runq = kseq->ksq_curr;
1674113357Sjeff	else
1675113357Sjeff		ke->ke_runq = kseq->ksq_next;
1676122744Sjeff	kseq_load_add(kseq, ke);
1677113357Sjeff	td->td_flags |= TDF_NEEDRESCHED;
1678109864Sjeff}
1679109864Sjeff
1680109864Sjeffint
1681109864Sjeffsched_runnable(void)
1682109864Sjeff{
1683109864Sjeff	struct kseq *kseq;
1684115998Sjeff	int load;
1685109864Sjeff
1686115998Sjeff	load = 1;
1687115998Sjeff
1688110028Sjeff	kseq = KSEQ_SELF();
1689121790Sjeff#ifdef SMP
1690122094Sjeff	if (kseq->ksq_assigned) {
1691122094Sjeff		mtx_lock_spin(&sched_lock);
1692121790Sjeff		kseq_assign(kseq);
1693122094Sjeff		mtx_unlock_spin(&sched_lock);
1694122094Sjeff	}
1695121790Sjeff#endif
1696121605Sjeff	if ((curthread->td_flags & TDF_IDLETD) != 0) {
1697121605Sjeff		if (kseq->ksq_load > 0)
1698121605Sjeff			goto out;
1699121605Sjeff	} else
1700121605Sjeff		if (kseq->ksq_load - 1 > 0)
1701121605Sjeff			goto out;
1702115998Sjeff	load = 0;
1703115998Sjeffout:
1704115998Sjeff	return (load);
1705109864Sjeff}
1706109864Sjeff
1707109864Sjeffvoid
1708109864Sjeffsched_userret(struct thread *td)
1709109864Sjeff{
1710109864Sjeff	struct ksegrp *kg;
1711121605Sjeff
1712139453Sjhb	KASSERT((td->td_flags & TDF_BORROWING) == 0,
1713139453Sjhb	    ("thread with borrowed priority returning to userland"));
1714139453Sjhb	kg = td->td_ksegrp;
1715139453Sjhb	if (td->td_priority != kg->kg_user_pri) {
1716109864Sjeff		mtx_lock_spin(&sched_lock);
1717109864Sjeff		td->td_priority = kg->kg_user_pri;
1718139453Sjhb		td->td_base_pri = kg->kg_user_pri;
1719109864Sjeff		mtx_unlock_spin(&sched_lock);
1720109864Sjeff	}
1721109864Sjeff}
1722109864Sjeff
1723109864Sjeffstruct kse *
1724109970Sjeffsched_choose(void)
1725109970Sjeff{
1726110028Sjeff	struct kseq *kseq;
1727109970Sjeff	struct kse *ke;
1728109970Sjeff
1729115998Sjeff	mtx_assert(&sched_lock, MA_OWNED);
1730121790Sjeff	kseq = KSEQ_SELF();
1731113357Sjeff#ifdef SMP
1732123433Sjeffrestart:
1733121790Sjeff	if (kseq->ksq_assigned)
1734121790Sjeff		kseq_assign(kseq);
1735113357Sjeff#endif
1736121790Sjeff	ke = kseq_choose(kseq);
1737109864Sjeff	if (ke) {
1738121790Sjeff#ifdef SMP
1739121790Sjeff		if (ke->ke_ksegrp->kg_pri_class == PRI_IDLE)
1740123433Sjeff			if (kseq_idled(kseq) == 0)
1741123433Sjeff				goto restart;
1742121790Sjeff#endif
1743122744Sjeff		kseq_runq_rem(kseq, ke);
1744109864Sjeff		ke->ke_state = KES_THREAD;
1745113357Sjeff		return (ke);
1746109864Sjeff	}
1747109970Sjeff#ifdef SMP
1748123433Sjeff	if (kseq_idled(kseq) == 0)
1749123433Sjeff		goto restart;
1750109970Sjeff#endif
1751113357Sjeff	return (NULL);
1752109864Sjeff}
1753109864Sjeff
1754109864Sjeffvoid
1755134586Sjuliansched_add(struct thread *td, int flags)
1756109864Sjeff{
1757110267Sjeff	struct kseq *kseq;
1758113357Sjeff	struct ksegrp *kg;
1759121127Sjeff	struct kse *ke;
1760139334Sjeff	int preemptive;
1761133427Sjeff	int canmigrate;
1762121790Sjeff	int class;
1763109864Sjeff
1764139316Sjeff	CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)",
1765139316Sjeff	    td, td->td_proc->p_comm, td->td_priority, curthread,
1766139316Sjeff	    curthread->td_proc->p_comm);
1767121790Sjeff	mtx_assert(&sched_lock, MA_OWNED);
1768121127Sjeff	ke = td->td_kse;
1769121127Sjeff	kg = td->td_ksegrp;
1770139334Sjeff	canmigrate = 1;
1771139334Sjeff	preemptive = !(flags & SRQ_YIELDING);
1772139334Sjeff	class = PRI_BASE(kg->kg_pri_class);
1773139334Sjeff	kseq = KSEQ_SELF();
1774139334Sjeff	if ((ke->ke_flags & KEF_INTERNAL) == 0)
1775139334Sjeff		SLOT_USE(td->td_ksegrp);
1776139334Sjeff	ke->ke_flags &= ~KEF_INTERNAL;
1777139334Sjeff#ifdef SMP
1778138802Sjeff	if (ke->ke_flags & KEF_ASSIGNED) {
1779139334Sjeff		if (ke->ke_flags & KEF_REMOVED)
1780138802Sjeff			ke->ke_flags &= ~KEF_REMOVED;
1781121790Sjeff		return;
1782138802Sjeff	}
1783139334Sjeff	canmigrate = KSE_CAN_MIGRATE(ke);
1784139334Sjeff#endif
1785109864Sjeff	KASSERT(ke->ke_state != KES_ONRUNQ,
1786110267Sjeff	    ("sched_add: kse %p (%s) already in run queue", ke,
1787109864Sjeff	    ke->ke_proc->p_comm));
1788109864Sjeff	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
1789110267Sjeff	    ("sched_add: process swapped out"));
1790113387Sjeff	KASSERT(ke->ke_runq == NULL,
1791113387Sjeff	    ("sched_add: KSE %p is still assigned to a run queue", ke));
1792121790Sjeff	switch (class) {
1793112994Sjeff	case PRI_ITHD:
1794112994Sjeff	case PRI_REALTIME:
1795113357Sjeff		ke->ke_runq = kseq->ksq_curr;
1796113357Sjeff		ke->ke_slice = SCHED_SLICE_MAX;
1797139334Sjeff		if (canmigrate)
1798139334Sjeff			ke->ke_cpu = PCPU_GET(cpuid);
1799112994Sjeff		break;
1800112994Sjeff	case PRI_TIMESHARE:
1801113387Sjeff		if (SCHED_CURR(kg, ke))
1802113387Sjeff			ke->ke_runq = kseq->ksq_curr;
1803113387Sjeff		else
1804113387Sjeff			ke->ke_runq = kseq->ksq_next;
1805113357Sjeff		break;
1806112994Sjeff	case PRI_IDLE:
1807113357Sjeff		/*
1808113357Sjeff		 * This is for priority propagation.
1809113357Sjeff		 */
1810121605Sjeff		if (ke->ke_thread->td_priority < PRI_MIN_IDLE)
1811113357Sjeff			ke->ke_runq = kseq->ksq_curr;
1812113357Sjeff		else
1813113357Sjeff			ke->ke_runq = &kseq->ksq_idle;
1814113357Sjeff		ke->ke_slice = SCHED_SLICE_MIN;
1815112994Sjeff		break;
1816113357Sjeff	default:
1817121868Sjeff		panic("Unknown pri class.");
1818113357Sjeff		break;
1819112994Sjeff	}
1820121790Sjeff#ifdef SMP
1821133427Sjeff	/*
1822133427Sjeff	 * Don't migrate running threads here.  Force the long term balancer
1823133427Sjeff	 * to do it.
1824133427Sjeff	 */
1825133555Sjeff	if (ke->ke_flags & KEF_HOLD) {
1826133555Sjeff		ke->ke_flags &= ~KEF_HOLD;
1827133427Sjeff		canmigrate = 0;
1828133555Sjeff	}
1829133427Sjeff	/*
1830133427Sjeff	 * If this thread is pinned or bound, notify the target cpu.
1831133427Sjeff	 */
1832133427Sjeff	if (!canmigrate && ke->ke_cpu != PCPU_GET(cpuid)) {
1833123529Sjeff		ke->ke_runq = NULL;
1834123433Sjeff		kseq_notify(ke, ke->ke_cpu);
1835123433Sjeff		return;
1836123433Sjeff	}
1837121790Sjeff	/*
1838123685Sjeff	 * If we had been idle, clear our bit in the group and potentially
1839123685Sjeff	 * the global bitmap.  If not, see if we should transfer this thread.
1840121790Sjeff	 */
1841123433Sjeff	if ((class == PRI_TIMESHARE || class == PRI_REALTIME) &&
1842123433Sjeff	    (kseq->ksq_group->ksg_idlemask & PCPU_GET(cpumask)) != 0) {
1843121790Sjeff		/*
1844123433Sjeff		 * Check to see if our group is unidling, and if so, remove it
1845123433Sjeff		 * from the global idle mask.
1846121790Sjeff		 */
1847123433Sjeff		if (kseq->ksq_group->ksg_idlemask ==
1848123433Sjeff		    kseq->ksq_group->ksg_cpumask)
1849123433Sjeff			atomic_clear_int(&kseq_idle, kseq->ksq_group->ksg_mask);
1850123433Sjeff		/*
1851123433Sjeff		 * Now remove ourselves from the group specific idle mask.
1852123433Sjeff		 */
1853123433Sjeff		kseq->ksq_group->ksg_idlemask &= ~PCPU_GET(cpumask);
1854139334Sjeff	} else if (canmigrate && kseq->ksq_load > 1 && class != PRI_ITHD)
1855123685Sjeff		if (kseq_transfer(kseq, ke, class))
1856123685Sjeff			return;
1857133427Sjeff	ke->ke_cpu = PCPU_GET(cpuid);
1858121790Sjeff#endif
1859133555Sjeff	if (td->td_priority < curthread->td_priority &&
1860133555Sjeff	    ke->ke_runq == kseq->ksq_curr)
1861133555Sjeff		curthread->td_flags |= TDF_NEEDRESCHED;
1862131839Sjhb	if (preemptive && maybe_preempt(td))
1863131481Sjhb		return;
1864109864Sjeff	ke->ke_state = KES_ONRUNQ;
1865109864Sjeff
1866139334Sjeff	kseq_runq_add(kseq, ke, flags);
1867122744Sjeff	kseq_load_add(kseq, ke);
1868109864Sjeff}
1869109864Sjeff
1870109864Sjeffvoid
1871121127Sjeffsched_rem(struct thread *td)
1872109864Sjeff{
1873113357Sjeff	struct kseq *kseq;
1874121127Sjeff	struct kse *ke;
1875113357Sjeff
1876139316Sjeff	CTR5(KTR_SCHED, "sched_rem: %p(%s) prio %d by %p(%s)",
1877139316Sjeff	    td, td->td_proc->p_comm, td->td_priority, curthread,
1878139316Sjeff	    curthread->td_proc->p_comm);
1879139334Sjeff	mtx_assert(&sched_lock, MA_OWNED);
1880139334Sjeff	ke = td->td_kse;
1881139334Sjeff	SLOT_RELEASE(td->td_ksegrp);
1882138802Sjeff	if (ke->ke_flags & KEF_ASSIGNED) {
1883138802Sjeff		ke->ke_flags |= KEF_REMOVED;
1884121790Sjeff		return;
1885138802Sjeff	}
1886124958Sjeff	KASSERT((ke->ke_state == KES_ONRUNQ),
1887124958Sjeff	    ("sched_rem: KSE not on run queue"));
1888109864Sjeff
1889109864Sjeff	ke->ke_state = KES_THREAD;
1890113357Sjeff	kseq = KSEQ_CPU(ke->ke_cpu);
1891122744Sjeff	kseq_runq_rem(kseq, ke);
1892122744Sjeff	kseq_load_rem(kseq, ke);
1893109864Sjeff}
1894109864Sjeff
1895109864Sjefffixpt_t
1896121127Sjeffsched_pctcpu(struct thread *td)
1897109864Sjeff{
1898109864Sjeff	fixpt_t pctcpu;
1899121127Sjeff	struct kse *ke;
1900109864Sjeff
1901109864Sjeff	pctcpu = 0;
1902121127Sjeff	ke = td->td_kse;
1903121290Sjeff	if (ke == NULL)
1904121290Sjeff		return (0);
1905109864Sjeff
1906115998Sjeff	mtx_lock_spin(&sched_lock);
1907109864Sjeff	if (ke->ke_ticks) {
1908109864Sjeff		int rtick;
1909109864Sjeff
1910116365Sjeff		/*
1911116365Sjeff		 * Don't update more frequently than twice a second.  Updating
1912116365Sjeff		 * more often causes the cpu usage to decay away too quickly due to
1913116365Sjeff		 * rounding errors.
1914116365Sjeff		 */
1915123435Sjeff		if (ke->ke_ftick + SCHED_CPU_TICKS < ke->ke_ltick ||
1916123435Sjeff		    ke->ke_ltick < (ticks - (hz / 2)))
1917116365Sjeff			sched_pctcpu_update(ke);
1918109864Sjeff		/* How many rtick per second ? */
1919116365Sjeff		rtick = min(ke->ke_ticks / SCHED_CPU_TIME, SCHED_CPU_TICKS);
1920110226Sscottl		pctcpu = (FSCALE * ((FSCALE * rtick)/realstathz)) >> FSHIFT;
1921109864Sjeff	}
1922109864Sjeff
1923109864Sjeff	ke->ke_proc->p_swtime = ke->ke_ltick - ke->ke_ftick;
1924113865Sjhb	mtx_unlock_spin(&sched_lock);
1925109864Sjeff
1926109864Sjeff	return (pctcpu);
1927109864Sjeff}
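/*
 * Worked example (editor's addition, not part of the original source):
 * FSCALE is 1 << FSHIFT, so the expression above evaluates to FSCALE,
 * i.e. 100%, when rtick equals realstathz:
 *
 *	(FSCALE * ((FSCALE * realstathz) / realstathz)) >> FSHIFT == FSCALE
 *
 * A kse that collected half as many ticks per second reports roughly
 * FSCALE / 2, ignoring integer truncation.
 */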
1928109864Sjeff
1929122038Sjeffvoid
1930122038Sjeffsched_bind(struct thread *td, int cpu)
1931122038Sjeff{
1932122038Sjeff	struct kse *ke;
1933122038Sjeff
1934122038Sjeff	mtx_assert(&sched_lock, MA_OWNED);
1935122038Sjeff	ke = td->td_kse;
1936122038Sjeff	ke->ke_flags |= KEF_BOUND;
1937123433Sjeff#ifdef SMP
1938123433Sjeff	if (PCPU_GET(cpuid) == cpu)
1939122038Sjeff		return;
1940122038Sjeff	/* sched_rem without the runq_remove */
1941122038Sjeff	ke->ke_state = KES_THREAD;
1942122744Sjeff	kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke);
1943122038Sjeff	kseq_notify(ke, cpu);
1944122038Sjeff	/* When we return from mi_switch we'll be on the correct cpu. */
1945131527Sphk	mi_switch(SW_VOL, NULL);
1946122038Sjeff#endif
1947122038Sjeff}
1948122038Sjeff
1949122038Sjeffvoid
1950122038Sjeffsched_unbind(struct thread *td)
1951122038Sjeff{
1952122038Sjeff	mtx_assert(&sched_lock, MA_OWNED);
1953122038Sjeff	td->td_kse->ke_flags &= ~KEF_BOUND;
1954122038Sjeff}
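/*
 * Usage sketch (editor's addition, not part of the original source; the exact
 * locking discipline of real callers may differ).  Both routines assert that
 * sched_lock is held, and the KEF_BOUND flag persists once set, so a caller
 * pinning itself to a particular CPU might look like:
 *
 *	mtx_lock_spin(&sched_lock);
 *	sched_bind(curthread, cpu);
 *	mtx_unlock_spin(&sched_lock);
 *	... work that must run on `cpu' ...
 *	mtx_lock_spin(&sched_lock);
 *	sched_unbind(curthread);
 *	mtx_unlock_spin(&sched_lock);
 *
 * On SMP, sched_bind() forwards the kse to the target kseq and calls
 * mi_switch(); by the time it returns the thread is executing on that CPU.
 */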
1955122038Sjeff
1956109864Sjeffint
1957125289Sjeffsched_load(void)
1958125289Sjeff{
1959125289Sjeff#ifdef SMP
1960125289Sjeff	int total;
1961125289Sjeff	int i;
1962125289Sjeff
1963125289Sjeff	total = 0;
1964125289Sjeff	for (i = 0; i <= ksg_maxid; i++)
1965125289Sjeff		total += KSEQ_GROUP(i)->ksg_load;
1966125289Sjeff	return (total);
1967125289Sjeff#else
1968125289Sjeff	return (KSEQ_SELF()->ksq_sysload);
1969125289Sjeff#endif
1970125289Sjeff}
1971125289Sjeff
1972125289Sjeffint
1973109864Sjeffsched_sizeof_ksegrp(void)
1974109864Sjeff{
1975109864Sjeff	return (sizeof(struct ksegrp) + sizeof(struct kg_sched));
1976109864Sjeff}
1977109864Sjeff
1978109864Sjeffint
1979109864Sjeffsched_sizeof_proc(void)
1980109864Sjeff{
1981109864Sjeff	return (sizeof(struct proc));
1982109864Sjeff}
1983109864Sjeff
1984109864Sjeffint
1985109864Sjeffsched_sizeof_thread(void)
1986109864Sjeff{
1987109864Sjeff	return (sizeof(struct thread) + sizeof(struct td_sched));
1988109864Sjeff}
1989134791Sjulian#define KERN_SWITCH_INCLUDE 1
1990134791Sjulian#include "kern/kern_switch.c"
1991