sched_ule.c revision 116365
1109864Sjeff/*-
2113357Sjeff * Copyright (c) 2002-2003, Jeffrey Roberson <jeff@freebsd.org>
3109864Sjeff * All rights reserved.
4109864Sjeff *
5109864Sjeff * Redistribution and use in source and binary forms, with or without
6109864Sjeff * modification, are permitted provided that the following conditions
7109864Sjeff * are met:
8109864Sjeff * 1. Redistributions of source code must retain the above copyright
9109864Sjeff *    notice unmodified, this list of conditions, and the following
10109864Sjeff *    disclaimer.
11109864Sjeff * 2. Redistributions in binary form must reproduce the above copyright
12109864Sjeff *    notice, this list of conditions and the following disclaimer in the
13109864Sjeff *    documentation and/or other materials provided with the distribution.
14109864Sjeff *
15109864Sjeff * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
16109864Sjeff * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17109864Sjeff * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18109864Sjeff * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
19109864Sjeff * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20109864Sjeff * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21109864Sjeff * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22109864Sjeff * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23109864Sjeff * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24109864Sjeff * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25109864Sjeff */
26109864Sjeff
27116182Sobrien#include <sys/cdefs.h>
28116182Sobrien__FBSDID("$FreeBSD: head/sys/kern/sched_ule.c 116365 2003-06-15 02:18:29Z jeff $");
29116182Sobrien
30109864Sjeff#include <sys/param.h>
31109864Sjeff#include <sys/systm.h>
32109864Sjeff#include <sys/kernel.h>
33109864Sjeff#include <sys/ktr.h>
34109864Sjeff#include <sys/lock.h>
35109864Sjeff#include <sys/mutex.h>
36109864Sjeff#include <sys/proc.h>
37112966Sjeff#include <sys/resource.h>
38109864Sjeff#include <sys/sched.h>
39109864Sjeff#include <sys/smp.h>
40109864Sjeff#include <sys/sx.h>
41109864Sjeff#include <sys/sysctl.h>
42109864Sjeff#include <sys/sysproto.h>
43109864Sjeff#include <sys/vmmeter.h>
44109864Sjeff#ifdef DDB
45109864Sjeff#include <ddb/ddb.h>
46109864Sjeff#endif
47109864Sjeff#ifdef KTRACE
48109864Sjeff#include <sys/uio.h>
49109864Sjeff#include <sys/ktrace.h>
50109864Sjeff#endif
51109864Sjeff
52109864Sjeff#include <machine/cpu.h>
53109864Sjeff
54113357Sjeff#define KTR_ULE         KTR_NFS
55113357Sjeff
56109864Sjeff/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
57109864Sjeff/* XXX This is bogus compatibility crap for ps */
58109864Sjeffstatic fixpt_t  ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
59109864SjeffSYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
60109864Sjeff
61109864Sjeffstatic void sched_setup(void *dummy);
62109864SjeffSYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL)
63109864Sjeff
64113357Sjeffstatic SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "SCHED");
65113357Sjeff
66113357Sjeffstatic int sched_strict;
67113357SjeffSYSCTL_INT(_kern_sched, OID_AUTO, strict, CTLFLAG_RD, &sched_strict, 0, "");
68113357Sjeff
69113357Sjeffstatic int slice_min = 1;
70113357SjeffSYSCTL_INT(_kern_sched, OID_AUTO, slice_min, CTLFLAG_RW, &slice_min, 0, "");
71113357Sjeff
72116365Sjeffstatic int slice_max = 10;
73113357SjeffSYSCTL_INT(_kern_sched, OID_AUTO, slice_max, CTLFLAG_RW, &slice_max, 0, "");
74113357Sjeff
75111857Sjeffint realstathz;
76113357Sjeffint tickincr = 1;
77111857Sjeff
78116069Sjeff#ifdef SMP
79116069Sjeff/* Callout to handle load balancing SMP systems. */
80116069Sjeffstatic struct callout kseq_lb_callout;
81116069Sjeff#endif
82116069Sjeff
83109864Sjeff/*
84109864Sjeff * These data structures are allocated within their parent data structure but
85109864Sjeff * are scheduler specific.
86109864Sjeff */
87109864Sjeff
88109864Sjeffstruct ke_sched {
89109864Sjeff	int		ske_slice;
90109864Sjeff	struct runq	*ske_runq;
91109864Sjeff	/* The following variables are only used for pctcpu calculation */
92109864Sjeff	int		ske_ltick;	/* Last tick that we were running on */
93109864Sjeff	int		ske_ftick;	/* First tick that we were running on */
94109864Sjeff	int		ske_ticks;	/* Tick count */
95113357Sjeff	/* CPU that we have affinity for. */
96110260Sjeff	u_char		ske_cpu;
97109864Sjeff};
98109864Sjeff#define	ke_slice	ke_sched->ske_slice
99109864Sjeff#define	ke_runq		ke_sched->ske_runq
100109864Sjeff#define	ke_ltick	ke_sched->ske_ltick
101109864Sjeff#define	ke_ftick	ke_sched->ske_ftick
102109864Sjeff#define	ke_ticks	ke_sched->ske_ticks
103110260Sjeff#define	ke_cpu		ke_sched->ske_cpu
104109864Sjeff
105109864Sjeffstruct kg_sched {
106110645Sjeff	int	skg_slptime;		/* Number of ticks we vol. slept */
107110645Sjeff	int	skg_runtime;		/* Number of ticks we were running */
108109864Sjeff};
109109864Sjeff#define	kg_slptime	kg_sched->skg_slptime
110110645Sjeff#define	kg_runtime	kg_sched->skg_runtime
111109864Sjeff
112109864Sjeffstruct td_sched {
113109864Sjeff	int	std_slptime;
114109864Sjeff};
115109864Sjeff#define	td_slptime	td_sched->std_slptime
116109864Sjeff
117110267Sjeffstruct td_sched td_sched;
118109864Sjeffstruct ke_sched ke_sched;
119109864Sjeffstruct kg_sched kg_sched;
120109864Sjeff
121109864Sjeffstruct ke_sched *kse0_sched = &ke_sched;
122109864Sjeffstruct kg_sched *ksegrp0_sched = &kg_sched;
123109864Sjeffstruct p_sched *proc0_sched = NULL;
124109864Sjeffstruct td_sched *thread0_sched = &td_sched;
125109864Sjeff
126109864Sjeff/*
127109864Sjeff * This priority range has 20 priorities on either end that are reachable
128109864Sjeff * only through nice values.
129111857Sjeff *
130111857Sjeff * PRI_RANGE:	Total priority range for timeshare threads.
131111857Sjeff * PRI_NRESV:	Reserved priorities for nice.
132111857Sjeff * PRI_BASE:	The start of the dynamic range.
133111857Sjeff * DYN_RANGE:	Number of priorities that are available in the dynamic
134111857Sjeff *		priority range.
135109864Sjeff */
136111857Sjeff#define	SCHED_PRI_RANGE		(PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1)
137112966Sjeff#define	SCHED_PRI_NRESV		PRIO_TOTAL
138112970Sjeff#define	SCHED_PRI_NHALF		(PRIO_TOTAL / 2)
139113357Sjeff#define	SCHED_PRI_NTHRESH	(SCHED_PRI_NHALF - 1)
140111857Sjeff#define	SCHED_PRI_BASE		((SCHED_PRI_NRESV / 2) + PRI_MIN_TIMESHARE)
141111857Sjeff#define	SCHED_DYN_RANGE		(SCHED_PRI_RANGE - SCHED_PRI_NRESV)
142113357Sjeff#define	SCHED_PRI_INTERACT(score)					\
143116365Sjeff    ((score) * SCHED_DYN_RANGE / SCHED_INTERACT_MAX)
144109864Sjeff
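/*
 * Illustrative expansion of the macros above; the constants are
 * assumptions made for the example only.  If PRI_MIN_TIMESHARE were 160,
 * PRI_MAX_TIMESHARE 223 and PRIO_TOTAL 40, then:
 *
 *	SCHED_PRI_RANGE  = 223 - 160 + 1 = 64
 *	SCHED_PRI_NRESV  = 40, SCHED_PRI_NHALF = 20, SCHED_PRI_NTHRESH = 19
 *	SCHED_PRI_BASE   = 20 + 160 = 180
 *	SCHED_DYN_RANGE  = 64 - 40 = 24
 *	SCHED_PRI_INTERACT(0)   = 0	(most interactive)
 *	SCHED_PRI_INTERACT(100) = 24	(least interactive)
 *
 * sched_priority() adds SCHED_PRI_BASE and kg_nice to this, so the
 * dynamic range covers roughly 180..204 and the 20 priorities on either
 * end are reachable only through nice values, as described above.
 */
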
145109864Sjeff/*
146111857Sjeff * These determine the interactivity of a process.
147109864Sjeff *
148110645Sjeff * SLP_RUN_MAX:	Maximum amount of sleep time + run time we'll accumulate
149110645Sjeff *		before throttling back.
150111857Sjeff * SLP_RUN_THROTTLE:	Divisor for reducing slp/run time.
151116365Sjeff * INTERACT_MAX:	Maximum interactivity value.  Smaller is better.
152111857Sjeff * INTERACT_THRESH:	Threshold for placement on the current runq.
153109864Sjeff */
154113357Sjeff#define	SCHED_SLP_RUN_MAX	((hz / 10) << 10)
155110645Sjeff#define	SCHED_SLP_RUN_THROTTLE	(10)
156116365Sjeff#define	SCHED_INTERACT_MAX	(100)
157116365Sjeff#define	SCHED_INTERACT_HALF	(SCHED_INTERACT_MAX / 2)
158116365Sjeff#define	SCHED_INTERACT_THRESH	(20)
159111857Sjeff
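/*
 * Illustrative arithmetic for the values above, assuming hz = 1000:
 * kg_slptime and kg_runtime are kept as ticks shifted left by 10 bits,
 * so SCHED_SLP_RUN_MAX = (1000 / 10) << 10 = 102400, the equivalent of
 * 100 ticks of combined sleep + run history before throttling kicks in.
 * The interactivity score computed from these ranges over
 * 0..SCHED_INTERACT_MAX (100), smaller meaning more interactive, and a
 * ksegrp scoring below SCHED_INTERACT_THRESH (20) is placed on the
 * current run queue.
 */
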
160109864Sjeff/*
161109864Sjeff * These parameters and macros determine the size of the time slice that is
162109864Sjeff * granted to each thread.
163109864Sjeff *
164109864Sjeff * SLICE_MIN:	Minimum time slice granted, in units of ticks.
165109864Sjeff * SLICE_MAX:	Maximum time slice granted.
166109864Sjeff * SLICE_RANGE:	Range of available time slices scaled by hz.
167112966Sjeff * SLICE_SCALE:	The number of slices granted for a value in the range of [0, max].
168112966Sjeff * SLICE_NICE:  Determine the amount of slice granted to a scaled nice.
169109864Sjeff */
170113357Sjeff#define	SCHED_SLICE_MIN			(slice_min)
171113357Sjeff#define	SCHED_SLICE_MAX			(slice_max)
172111857Sjeff#define	SCHED_SLICE_RANGE		(SCHED_SLICE_MAX - SCHED_SLICE_MIN + 1)
173109864Sjeff#define	SCHED_SLICE_SCALE(val, max)	(((val) * SCHED_SLICE_RANGE) / (max))
174112966Sjeff#define	SCHED_SLICE_NICE(nice)						\
175113357Sjeff    (SCHED_SLICE_MAX - SCHED_SLICE_SCALE((nice), SCHED_PRI_NTHRESH))
176109864Sjeff
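/*
 * Illustrative slice arithmetic, assuming hz = 1000 (so sched_setup()
 * sets slice_min = 10 and slice_max = 100) and SCHED_PRI_NTHRESH = 19:
 *
 *	SCHED_SLICE_RANGE    = 100 - 10 + 1 = 91
 *	SCHED_SLICE_NICE(0)  = 100 - (0 * 91) / 19  = 100
 *	SCHED_SLICE_NICE(10) = 100 - (10 * 91) / 19 = 53
 *	SCHED_SLICE_NICE(19) = 100 - (19 * 91) / 19 = 9
 *
 * Here "nice" is the distance from the least nice ksegrp on the run
 * queue; sched_slice() assigns a zero slice to anything further away
 * than SCHED_PRI_NTHRESH.
 */
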
177109864Sjeff/*
178109864Sjeff * This macro determines whether or not the kse belongs on the current or
179109864Sjeff * next run queue.
180110645Sjeff *
181110645Sjeff * XXX nice value should affect how interactive a kg is.
182109864Sjeff */
183113357Sjeff#define	SCHED_INTERACTIVE(kg)						\
184113357Sjeff    (sched_interact_score(kg) < SCHED_INTERACT_THRESH)
185113417Sjeff#define	SCHED_CURR(kg, ke)						\
186113357Sjeff    (ke->ke_thread->td_priority < PRI_MIN_TIMESHARE || SCHED_INTERACTIVE(kg))
187109864Sjeff
188109864Sjeff/*
189109864Sjeff * Cpu percentage computation macros and defines.
190109864Sjeff *
191109864Sjeff * SCHED_CPU_TIME:	Number of seconds to average the cpu usage across.
192109864Sjeff * SCHED_CPU_TICKS:	Number of hz ticks to average the cpu usage across.
193109864Sjeff */
194109864Sjeff
195112971Sjeff#define	SCHED_CPU_TIME	10
196109864Sjeff#define	SCHED_CPU_TICKS	(hz * SCHED_CPU_TIME)
197109864Sjeff
198109864Sjeff/*
199113357Sjeff * kseq - per processor runqs and statistics.
200109864Sjeff */
201109864Sjeff
202113357Sjeff#define	KSEQ_NCLASS	(PRI_IDLE + 1)	/* Number of run classes. */
203113357Sjeff
204109864Sjeffstruct kseq {
205113357Sjeff	struct runq	ksq_idle;		/* Queue of IDLE threads. */
206113357Sjeff	struct runq	ksq_timeshare[2];	/* Run queues for !IDLE. */
207113357Sjeff	struct runq	*ksq_next;		/* Next timeshare queue. */
208113357Sjeff	struct runq	*ksq_curr;		/* Current queue. */
209113357Sjeff	int		ksq_loads[KSEQ_NCLASS];	/* Load for each class */
210113357Sjeff	int		ksq_load;		/* Aggregate load. */
211113357Sjeff	short		ksq_nice[PRIO_TOTAL + 1]; /* KSEs in each nice bin. */
212113357Sjeff	short		ksq_nicemin;		/* Least nice. */
213110267Sjeff#ifdef SMP
214110267Sjeff	unsigned int	ksq_rslices;	/* Slices on run queue */
215110267Sjeff#endif
216109864Sjeff};
217109864Sjeff
218109864Sjeff/*
219109864Sjeff * One kse queue per processor.
220109864Sjeff */
221110028Sjeff#ifdef SMP
222109864Sjeffstruct kseq	kseq_cpu[MAXCPU];
223110028Sjeff#define	KSEQ_SELF()	(&kseq_cpu[PCPU_GET(cpuid)])
224110028Sjeff#define	KSEQ_CPU(x)	(&kseq_cpu[(x)])
225110028Sjeff#else
226110028Sjeffstruct kseq	kseq_cpu;
227110028Sjeff#define	KSEQ_SELF()	(&kseq_cpu)
228110028Sjeff#define	KSEQ_CPU(x)	(&kseq_cpu)
229110028Sjeff#endif
230109864Sjeff
231112966Sjeffstatic void sched_slice(struct kse *ke);
232113357Sjeffstatic void sched_priority(struct ksegrp *kg);
233111857Sjeffstatic int sched_interact_score(struct ksegrp *kg);
234109864Sjeffvoid sched_pctcpu_update(struct kse *ke);
235109864Sjeffint sched_pickcpu(void);
236109864Sjeff
237110267Sjeff/* Operations on per processor queues */
238110028Sjeffstatic struct kse * kseq_choose(struct kseq *kseq);
239110028Sjeffstatic void kseq_setup(struct kseq *kseq);
240112994Sjeffstatic void kseq_add(struct kseq *kseq, struct kse *ke);
241113357Sjeffstatic void kseq_rem(struct kseq *kseq, struct kse *ke);
242113357Sjeffstatic void kseq_nice_add(struct kseq *kseq, int nice);
243113357Sjeffstatic void kseq_nice_rem(struct kseq *kseq, int nice);
244113660Sjeffvoid kseq_print(int cpu);
245110267Sjeff#ifdef SMP
246110267Sjeffstruct kseq * kseq_load_highest(void);
247116069Sjeffvoid kseq_balance(void *arg);
248116069Sjeffvoid kseq_move(struct kseq *from, int cpu);
249110267Sjeff#endif
250110028Sjeff
251113357Sjeffvoid
252113660Sjeffkseq_print(int cpu)
253110267Sjeff{
254113660Sjeff	struct kseq *kseq;
255113357Sjeff	int i;
256112994Sjeff
257113660Sjeff	kseq = KSEQ_CPU(cpu);
258112994Sjeff
259113357Sjeff	printf("kseq:\n");
260113357Sjeff	printf("\tload:           %d\n", kseq->ksq_load);
261113357Sjeff	printf("\tload ITHD:      %d\n", kseq->ksq_loads[PRI_ITHD]);
262113357Sjeff	printf("\tload REALTIME:  %d\n", kseq->ksq_loads[PRI_REALTIME]);
263113357Sjeff	printf("\tload TIMESHARE: %d\n", kseq->ksq_loads[PRI_TIMESHARE]);
264113357Sjeff	printf("\tload IDLE:      %d\n", kseq->ksq_loads[PRI_IDLE]);
265113357Sjeff	printf("\tnicemin:\t%d\n", kseq->ksq_nicemin);
266113357Sjeff	printf("\tnice counts:\n");
267113357Sjeff	for (i = 0; i < PRIO_TOTAL + 1; i++)
268113357Sjeff		if (kseq->ksq_nice[i])
269113357Sjeff			printf("\t\t%d = %d\n",
270113357Sjeff			    i - SCHED_PRI_NHALF, kseq->ksq_nice[i]);
271113357Sjeff}
272112994Sjeff
273113357Sjeffstatic void
274113357Sjeffkseq_add(struct kseq *kseq, struct kse *ke)
275113357Sjeff{
276115998Sjeff	mtx_assert(&sched_lock, MA_OWNED);
277113386Sjeff	kseq->ksq_loads[PRI_BASE(ke->ke_ksegrp->kg_pri_class)]++;
278113357Sjeff	kseq->ksq_load++;
279113357Sjeff	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
280113357Sjeff		CTR6(KTR_ULE, "Add kse %p to %p (slice: %d, pri: %d, nice: %d(%d))",
281113357Sjeff		    ke, ke->ke_runq, ke->ke_slice, ke->ke_thread->td_priority,
282113357Sjeff		    ke->ke_ksegrp->kg_nice, kseq->ksq_nicemin);
283113357Sjeff	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
284113357Sjeff		kseq_nice_add(kseq, ke->ke_ksegrp->kg_nice);
285110267Sjeff#ifdef SMP
286110267Sjeff	kseq->ksq_rslices += ke->ke_slice;
287110267Sjeff#endif
288110267Sjeff}
289113357Sjeff
290112994Sjeffstatic void
291110267Sjeffkseq_rem(struct kseq *kseq, struct kse *ke)
292110267Sjeff{
293115998Sjeff	mtx_assert(&sched_lock, MA_OWNED);
294113386Sjeff	kseq->ksq_loads[PRI_BASE(ke->ke_ksegrp->kg_pri_class)]--;
295113357Sjeff	kseq->ksq_load--;
296113357Sjeff	ke->ke_runq = NULL;
297113357Sjeff	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
298113357Sjeff		kseq_nice_rem(kseq, ke->ke_ksegrp->kg_nice);
299110267Sjeff#ifdef SMP
300110267Sjeff	kseq->ksq_rslices -= ke->ke_slice;
301110267Sjeff#endif
302110267Sjeff}
303110267Sjeff
304113357Sjeffstatic void
305113357Sjeffkseq_nice_add(struct kseq *kseq, int nice)
306110267Sjeff{
307115998Sjeff	mtx_assert(&sched_lock, MA_OWNED);
308113357Sjeff	/* Normalize to zero. */
309113357Sjeff	kseq->ksq_nice[nice + SCHED_PRI_NHALF]++;
310115998Sjeff	if (nice < kseq->ksq_nicemin || kseq->ksq_loads[PRI_TIMESHARE] == 1)
311113357Sjeff		kseq->ksq_nicemin = nice;
312110267Sjeff}
313110267Sjeff
314113357Sjeffstatic void
315113357Sjeffkseq_nice_rem(struct kseq *kseq, int nice)
316110267Sjeff{
317113357Sjeff	int n;
318113357Sjeff
319115998Sjeff	mtx_assert(&sched_lock, MA_OWNED);
320113357Sjeff	/* Normalize to zero. */
321113357Sjeff	n = nice + SCHED_PRI_NHALF;
322113357Sjeff	kseq->ksq_nice[n]--;
323113357Sjeff	KASSERT(kseq->ksq_nice[n] >= 0, ("Negative nice count."));
324113357Sjeff
325113357Sjeff	/*
326113357Sjeff	 * If this wasn't the smallest nice value or there are more in
327113357Sjeff	 * this bucket we can just return.  Otherwise we have to recalculate
328113357Sjeff	 * the smallest nice.
329113357Sjeff	 */
330113357Sjeff	if (nice != kseq->ksq_nicemin ||
331113357Sjeff	    kseq->ksq_nice[n] != 0 ||
332113357Sjeff	    kseq->ksq_loads[PRI_TIMESHARE] == 0)
333113357Sjeff		return;
334113357Sjeff
335113357Sjeff	for (; n < SCHED_PRI_NRESV + 1; n++)
336113357Sjeff		if (kseq->ksq_nice[n]) {
337113357Sjeff			kseq->ksq_nicemin = n - SCHED_PRI_NHALF;
338113357Sjeff			return;
339113357Sjeff		}
340110267Sjeff}
341110267Sjeff
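/*
 * Illustrative note on the nice bins above: nice values (nominally
 * PRIO_MIN..PRIO_MAX, e.g. -20..20) are offset by SCHED_PRI_NHALF so
 * that they index ksq_nice[] from zero.  With SCHED_PRI_NHALF = 20, a
 * nice of -20 lands in bin 0, 0 in bin 20 and +20 in bin 40, while
 * ksq_nicemin tracks the least nice value that has a non-zero count.
 */
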
342113357Sjeff#ifdef SMP
343116069Sjeff/*
344116069Sjeff * kseq_balance is a simple CPU load balancing algorithm.  It operates by
345116069Sjeff * finding the least loaded and most loaded cpu and equalizing their load
346116069Sjeff * by migrating some processes.
347116069Sjeff *
348116069Sjeff * Dealing only with two CPUs at a time has two advantages.  Firstly, most
349116069Sjeff * installations will only have 2 cpus.  Secondly, load balancing too much at
350116069Sjeff * once can have an unpleasant effect on the system.  The scheduler rarely has
351116069Sjeff * enough information to make perfect decisions.  So this algorithm chooses
352116069Sjeff * algorithmic simplicity and more gradual effects on load in larger systems.
353116069Sjeff *
354116069Sjeff * It could be improved by considering the priorities and slices assigned to
355116069Sjeff * each task prior to balancing them.  There are many pathological cases with
356116069Sjeff * any approach and so the semi-random algorithm below may work as well as any.
357116069Sjeff *
358116069Sjeff */
359116069Sjeffvoid
360116069Sjeffkseq_balance(void *arg)
361116069Sjeff{
362116069Sjeff	struct kseq *kseq;
363116069Sjeff	int high_load;
364116069Sjeff	int low_load;
365116069Sjeff	int high_cpu;
366116069Sjeff	int low_cpu;
367116069Sjeff	int move;
368116069Sjeff	int diff;
369116069Sjeff	int i;
370116069Sjeff
371116069Sjeff	high_cpu = 0;
372116069Sjeff	low_cpu = 0;
373116069Sjeff	high_load = 0;
374116069Sjeff	low_load = -1;
375116069Sjeff
376116069Sjeff	mtx_lock_spin(&sched_lock);
377116069Sjeff	for (i = 0; i < mp_maxid; i++) {
378116069Sjeff		if (CPU_ABSENT(i))
379116069Sjeff			continue;
380116069Sjeff		kseq = KSEQ_CPU(i);
381116069Sjeff		if (kseq->ksq_load > high_load) {
382116069Sjeff			high_load = kseq->ksq_load;
383116069Sjeff			high_cpu = i;
384116069Sjeff		}
385116069Sjeff		if (low_load == -1 || kseq->ksq_load < low_load) {
386116069Sjeff			low_load = kseq->ksq_load;
387116069Sjeff			low_cpu = i;
388116069Sjeff		}
389116069Sjeff	}
390116069Sjeff
391116069Sjeff	/*
392116069Sjeff	 * Nothing to do.
393116069Sjeff	 */
394116069Sjeff	if (high_load < 2 || low_load == high_load)
395116069Sjeff		goto out;
396116069Sjeff
397116069Sjeff	diff = high_load - low_load;
398116069Sjeff	move = diff / 2;
399116069Sjeff	if (diff & 0x1)
400116069Sjeff		move++;
401116069Sjeff
402116069Sjeff	for (i = 0; i < move; i++)
403116069Sjeff		kseq_move(KSEQ_CPU(high_cpu), low_cpu);
404116069Sjeff
405116069Sjeffout:
406116069Sjeff	mtx_unlock_spin(&sched_lock);
407116069Sjeff	callout_reset(&kseq_lb_callout, hz, kseq_balance, NULL);
408116069Sjeff
409116069Sjeff	return;
410116069Sjeff}
411116069Sjeff
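/*
 * Illustrative example of the balancing arithmetic above: with a high
 * load of 5 and a low load of 1, diff = 4 and move = 2, so two KSEs are
 * moved with kseq_move(); with loads of 4 and 1, diff = 3 is odd and is
 * rounded up, giving move = 2 as well.  High loads below 2 are left
 * alone entirely.
 */
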
412110267Sjeffstruct kseq *
413110267Sjeffkseq_load_highest(void)
414110267Sjeff{
415110267Sjeff	struct kseq *kseq;
416110267Sjeff	int load;
417110267Sjeff	int cpu;
418110267Sjeff	int i;
419110267Sjeff
420115998Sjeff	mtx_assert(&sched_lock, MA_OWNED);
421110267Sjeff	cpu = 0;
422110267Sjeff	load = 0;
423110267Sjeff
424110267Sjeff	for (i = 0; i < mp_maxid; i++) {
425110267Sjeff		if (CPU_ABSENT(i))
426110267Sjeff			continue;
427110267Sjeff		kseq = KSEQ_CPU(i);
428113357Sjeff		if (kseq->ksq_load > load) {
429113357Sjeff			load = kseq->ksq_load;
430110267Sjeff			cpu = i;
431110267Sjeff		}
432110267Sjeff	}
433113371Sjeff	if (load > 1)
434110267Sjeff		return (KSEQ_CPU(cpu));
435110267Sjeff
436110267Sjeff	return (NULL);
437110267Sjeff}
438116069Sjeff
439116069Sjeffvoid
440116069Sjeffkseq_move(struct kseq *from, int cpu)
441116069Sjeff{
442116069Sjeff	struct kse *ke;
443116069Sjeff
444116069Sjeff	ke = kseq_choose(from);
445116069Sjeff	runq_remove(ke->ke_runq, ke);
446116069Sjeff	ke->ke_state = KES_THREAD;
447116069Sjeff	kseq_rem(from, ke);
448116069Sjeff
449116069Sjeff	ke->ke_cpu = cpu;
450116069Sjeff	sched_add(ke);
451116069Sjeff}
452110267Sjeff#endif
453110267Sjeff
454110267Sjeffstruct kse *
455110267Sjeffkseq_choose(struct kseq *kseq)
456110267Sjeff{
457110267Sjeff	struct kse *ke;
458110267Sjeff	struct runq *swap;
459110267Sjeff
460115998Sjeff	mtx_assert(&sched_lock, MA_OWNED);
461113357Sjeff	swap = NULL;
462112994Sjeff
463113357Sjeff	for (;;) {
464113357Sjeff		ke = runq_choose(kseq->ksq_curr);
465113357Sjeff		if (ke == NULL) {
466113357Sjeff			/*
467113357Sjeff			 * We already swapped once and didn't get anywhere.
468113357Sjeff			 */
469113357Sjeff			if (swap)
470113357Sjeff				break;
471113357Sjeff			swap = kseq->ksq_curr;
472113357Sjeff			kseq->ksq_curr = kseq->ksq_next;
473113357Sjeff			kseq->ksq_next = swap;
474113357Sjeff			continue;
475113357Sjeff		}
476113357Sjeff		/*
477113357Sjeff		 * If we encounter a slice of 0 the kse is in a
478113357Sjeff		 * TIMESHARE kse group and its nice was too far out
479113357Sjeff		 * of the range that receives slices.
480113357Sjeff		 */
481113357Sjeff		if (ke->ke_slice == 0) {
482113357Sjeff			runq_remove(ke->ke_runq, ke);
483113357Sjeff			sched_slice(ke);
484113357Sjeff			ke->ke_runq = kseq->ksq_next;
485113357Sjeff			runq_add(ke->ke_runq, ke);
486113357Sjeff			continue;
487113357Sjeff		}
488113357Sjeff		return (ke);
489110267Sjeff	}
490110267Sjeff
491113357Sjeff	return (runq_choose(&kseq->ksq_idle));
492110267Sjeff}
493110267Sjeff
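/*
 * A sketch of what kseq_choose() above implements: ksq_curr and ksq_next
 * form a pair of timeshare queues that are swapped once the current one
 * drains, so KSEs parked on the next queue are not starved.  A KSE that
 * comes up with a zero slice (its nice value was too far from the least
 * nice on the queue) is handed a fresh slice and requeued on the next
 * queue rather than run immediately, and the idle queue is consulted
 * only when both timeshare queues are empty.
 */
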
494109864Sjeffstatic void
495110028Sjeffkseq_setup(struct kseq *kseq)
496110028Sjeff{
497113357Sjeff	runq_init(&kseq->ksq_timeshare[0]);
498113357Sjeff	runq_init(&kseq->ksq_timeshare[1]);
499112994Sjeff	runq_init(&kseq->ksq_idle);
500113357Sjeff
501113357Sjeff	kseq->ksq_curr = &kseq->ksq_timeshare[0];
502113357Sjeff	kseq->ksq_next = &kseq->ksq_timeshare[1];
503113357Sjeff
504113357Sjeff	kseq->ksq_loads[PRI_ITHD] = 0;
505113357Sjeff	kseq->ksq_loads[PRI_REALTIME] = 0;
506113357Sjeff	kseq->ksq_loads[PRI_TIMESHARE] = 0;
507113357Sjeff	kseq->ksq_loads[PRI_IDLE] = 0;
508113660Sjeff	kseq->ksq_load = 0;
509110267Sjeff#ifdef SMP
510110267Sjeff	kseq->ksq_rslices = 0;
511110267Sjeff#endif
512110028Sjeff}
513110028Sjeff
514110028Sjeffstatic void
515109864Sjeffsched_setup(void *dummy)
516109864Sjeff{
517109864Sjeff	int i;
518109864Sjeff
519113357Sjeff	slice_min = (hz/100);
520113357Sjeff	slice_max = (hz/10);
521111857Sjeff
522109864Sjeff	mtx_lock_spin(&sched_lock);
523109864Sjeff	/* init kseqs */
524110028Sjeff	for (i = 0; i < MAXCPU; i++)
525110028Sjeff		kseq_setup(KSEQ_CPU(i));
526113357Sjeff
527113357Sjeff	kseq_add(KSEQ_SELF(), &kse0);
528109864Sjeff	mtx_unlock_spin(&sched_lock);
529116069Sjeff#ifdef SMP
530116069Sjeff	callout_init(&kseq_lb_callout, 1);
531116069Sjeff	kseq_balance(NULL);
532116069Sjeff#endif
533109864Sjeff}
534109864Sjeff
535109864Sjeff/*
536109864Sjeff * Scale the scheduling priority according to the "interactivity" of this
537109864Sjeff * process.
538109864Sjeff */
539113357Sjeffstatic void
540109864Sjeffsched_priority(struct ksegrp *kg)
541109864Sjeff{
542109864Sjeff	int pri;
543109864Sjeff
544109864Sjeff	if (kg->kg_pri_class != PRI_TIMESHARE)
545113357Sjeff		return;
546109864Sjeff
547113357Sjeff	pri = SCHED_PRI_INTERACT(sched_interact_score(kg));
548111857Sjeff	pri += SCHED_PRI_BASE;
549109864Sjeff	pri += kg->kg_nice;
550109864Sjeff
551109864Sjeff	if (pri > PRI_MAX_TIMESHARE)
552109864Sjeff		pri = PRI_MAX_TIMESHARE;
553109864Sjeff	else if (pri < PRI_MIN_TIMESHARE)
554109864Sjeff		pri = PRI_MIN_TIMESHARE;
555109864Sjeff
556109864Sjeff	kg->kg_user_pri = pri;
557109864Sjeff
558113357Sjeff	return;
559109864Sjeff}
560109864Sjeff
561109864Sjeff/*
562112966Sjeff * Calculate a time slice based on the properties of the kseg and the runq
563112994Sjeff * that we're on.  This is only for PRI_TIMESHARE ksegrps.
564109864Sjeff */
565112966Sjeffstatic void
566112966Sjeffsched_slice(struct kse *ke)
567109864Sjeff{
568113357Sjeff	struct kseq *kseq;
569112966Sjeff	struct ksegrp *kg;
570109864Sjeff
571112966Sjeff	kg = ke->ke_ksegrp;
572113357Sjeff	kseq = KSEQ_CPU(ke->ke_cpu);
573109864Sjeff
574112966Sjeff	/*
575112966Sjeff	 * Rationale:
576112966Sjeff	 * KSEs in interactive ksegs get the minimum slice so that we
577112966Sjeff	 * quickly notice if one abuses its advantage.
578112966Sjeff	 *
579112966Sjeff	 * KSEs in non-interactive ksegs are assigned a slice that is
580112966Sjeff	 * based on the kseg's nice value relative to the least nice kseg
581112966Sjeff	 * on the run queue for this cpu.
582112966Sjeff	 *
583112966Sjeff	 * If the KSE is less nice than all others it gets the maximum
584112966Sjeff	 * slice and other KSEs will adjust their slice relative to
585112966Sjeff	 * this when they first expire.
586112966Sjeff	 *
587112966Sjeff	 * There is a 20 point window that starts relative to the least
588112966Sjeff	 * nice kse on the run queue.  Slice size is determined by
589112966Sjeff	 * the kse's distance from the least nice ksegrp.
590112966Sjeff	 *
591112966Sjeff	 * If you are outside of the window you will get no slice and
592112966Sjeff	 * you will be reevaluated each time you are selected on the
593112966Sjeff	 * run queue.
594112966Sjeff	 *
595112966Sjeff	 */
596109864Sjeff
597113357Sjeff	if (!SCHED_INTERACTIVE(kg)) {
598112966Sjeff		int nice;
599112966Sjeff
600113357Sjeff		nice = kg->kg_nice + (0 - kseq->ksq_nicemin);
601113357Sjeff		if (kseq->ksq_loads[PRI_TIMESHARE] == 0 ||
602113357Sjeff		    kg->kg_nice < kseq->ksq_nicemin)
603112966Sjeff			ke->ke_slice = SCHED_SLICE_MAX;
604113357Sjeff		else if (nice <= SCHED_PRI_NTHRESH)
605112966Sjeff			ke->ke_slice = SCHED_SLICE_NICE(nice);
606112966Sjeff		else
607112966Sjeff			ke->ke_slice = 0;
608112966Sjeff	} else
609112966Sjeff		ke->ke_slice = SCHED_SLICE_MIN;
610112966Sjeff
611113357Sjeff	CTR6(KTR_ULE,
612113357Sjeff	    "Sliced %p(%d) (nice: %d, nicemin: %d, load: %d, interactive: %d)",
613113357Sjeff	    ke, ke->ke_slice, kg->kg_nice, kseq->ksq_nicemin,
614113357Sjeff	    kseq->ksq_loads[PRI_TIMESHARE], SCHED_INTERACTIVE(kg));
615113357Sjeff
616110645Sjeff	/*
617112994Sjeff	 * Check to see if we need to scale back the slp and run time
618112994Sjeff	 * in the kg.  This will cause us to forget old interactivity
619112994Sjeff	 * while maintaining the current ratio.
620110645Sjeff	 */
621110645Sjeff	if ((kg->kg_runtime + kg->kg_slptime) >  SCHED_SLP_RUN_MAX) {
622110645Sjeff		kg->kg_runtime /= SCHED_SLP_RUN_THROTTLE;
623110645Sjeff		kg->kg_slptime /= SCHED_SLP_RUN_THROTTLE;
624110645Sjeff	}
625113357Sjeff	CTR4(KTR_ULE, "Slp vs Run(2) %p (Slp %d, Run %d, Score %d)",
626113357Sjeff	    ke, kg->kg_slptime >> 10, kg->kg_runtime >> 10,
627113357Sjeff	    sched_interact_score(kg));
628110645Sjeff
629112966Sjeff	return;
630109864Sjeff}
631109864Sjeff
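/*
 * Worked example for sched_slice(), assuming hz = 1000 (slice_min = 10,
 * slice_max = 100): a non-interactive ksegrp with kg_nice = 5 on a queue
 * whose ksq_nicemin is -3 sits 8 nice points from the least nice ksegrp
 * and receives SCHED_SLICE_NICE(8) = 100 - (8 * 91) / 19 = 62, while an
 * interactive ksegrp always gets SCHED_SLICE_MIN.  If kg_runtime and
 * kg_slptime were 90000 and 30000 (ticks << 10), their sum would exceed
 * SCHED_SLP_RUN_MAX (102400) and both would be divided by 10, keeping
 * the 3:1 ratio that feeds sched_interact_score().
 */
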
632111857Sjeffstatic int
633111857Sjeffsched_interact_score(struct ksegrp *kg)
634111857Sjeff{
635116365Sjeff	int div;
636111857Sjeff
637111857Sjeff	if (kg->kg_runtime > kg->kg_slptime) {
638116365Sjeff		div = max(1, kg->kg_runtime / SCHED_INTERACT_HALF);
639116365Sjeff		return (SCHED_INTERACT_HALF +
640116365Sjeff		    (SCHED_INTERACT_HALF - (kg->kg_slptime / div)));
641116365Sjeff	} if (kg->kg_slptime > kg->kg_runtime) {
642116365Sjeff		div = max(1, kg->kg_slptime / SCHED_INTERACT_HALF);
643116365Sjeff		return (kg->kg_runtime / div);
644111857Sjeff	}
645111857Sjeff
646116365Sjeff	/*
647116365Sjeff	 * This can happen if slptime and runtime are 0.
648116365Sjeff	 */
649116365Sjeff	return (0);
650111857Sjeff
651111857Sjeff}
652111857Sjeff
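/*
 * Worked examples for sched_interact_score(): a ksegrp that has slept
 * three times as long as it has run scores roughly
 * kg_runtime / (kg_slptime / 50) ~= 16, comfortably below
 * SCHED_INTERACT_THRESH; one that has run three times as long as it has
 * slept scores roughly 50 + (50 - 50 / 3) ~= 83, well into the
 * non-interactive half of the 0..100 range.
 */
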
653113357Sjeff/*
654113357Sjeff * This is only somewhat accurate since, given many processes of the same
655113357Sjeff * priority, they will switch when their slices run out, which will be
656113357Sjeff * at most SCHED_SLICE_MAX.
657113357Sjeff */
658109864Sjeffint
659109864Sjeffsched_rr_interval(void)
660109864Sjeff{
661109864Sjeff	return (SCHED_SLICE_MAX);
662109864Sjeff}
663109864Sjeff
664109864Sjeffvoid
665109864Sjeffsched_pctcpu_update(struct kse *ke)
666109864Sjeff{
667109864Sjeff	/*
668109864Sjeff	 * Adjust counters and watermark for pctcpu calc.
669116365Sjeff	 */
670116365Sjeff
671116365Sjeff	/*
672111793Sjeff	 * Shift the tick count out so that the divide doesn't round away
673111793Sjeff	 * our results.
674111793Sjeff	 */
675111793Sjeff	ke->ke_ticks <<= 10;
676109864Sjeff	ke->ke_ticks = (ke->ke_ticks / (ke->ke_ltick - ke->ke_ftick)) *
677109864Sjeff		    SCHED_CPU_TICKS;
678111793Sjeff	ke->ke_ticks >>= 10;
679109864Sjeff	ke->ke_ltick = ticks;
680109864Sjeff	ke->ke_ftick = ke->ke_ltick - SCHED_CPU_TICKS;
681109864Sjeff}
682109864Sjeff
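/*
 * Illustrative arithmetic for the shift above, assuming hz = 1000 so
 * SCHED_CPU_TICKS = 10000: with 2500 ticks accumulated over a
 * 12500-tick window, ((2500 << 10) / 12500) * 10000 >> 10 works out to
 * 1992, i.e. roughly 20% utilization renormalized to the 10 second
 * window, whereas without the shift 2500 / 12500 would truncate
 * straight to 0 and the history would be lost.
 */
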
683109864Sjeff#ifdef SMP
684110267Sjeff/* XXX Should be changed to kseq_load_lowest() */
685109864Sjeffint
686109864Sjeffsched_pickcpu(void)
687109864Sjeff{
688110028Sjeff	struct kseq *kseq;
689110028Sjeff	int load;
690109864Sjeff	int cpu;
691109864Sjeff	int i;
692109864Sjeff
693115998Sjeff	mtx_assert(&sched_lock, MA_OWNED);
694109864Sjeff	if (!smp_started)
695109864Sjeff		return (0);
696109864Sjeff
697110028Sjeff	load = 0;
698110028Sjeff	cpu = 0;
699109864Sjeff
700109864Sjeff	for (i = 0; i < mp_maxid; i++) {
701109864Sjeff		if (CPU_ABSENT(i))
702109864Sjeff			continue;
703110028Sjeff		kseq = KSEQ_CPU(i);
704113357Sjeff		if (kseq->ksq_load < load) {
705109864Sjeff			cpu = i;
706113357Sjeff			load = kseq->ksq_load;
707109864Sjeff		}
708109864Sjeff	}
709109864Sjeff
710109864Sjeff	CTR1(KTR_RUNQ, "sched_pickcpu: %d", cpu);
711109864Sjeff	return (cpu);
712109864Sjeff}
713109864Sjeff#else
714109864Sjeffint
715109864Sjeffsched_pickcpu(void)
716109864Sjeff{
717109864Sjeff	return (0);
718109864Sjeff}
719109864Sjeff#endif
720109864Sjeff
721109864Sjeffvoid
722109864Sjeffsched_prio(struct thread *td, u_char prio)
723109864Sjeff{
724109864Sjeff	struct kse *ke;
725109864Sjeff	struct runq *rq;
726109864Sjeff
727109864Sjeff	mtx_assert(&sched_lock, MA_OWNED);
728109864Sjeff	ke = td->td_kse;
729109864Sjeff	td->td_priority = prio;
730109864Sjeff
731109864Sjeff	if (TD_ON_RUNQ(td)) {
732109864Sjeff		rq = ke->ke_runq;
733109864Sjeff
734109864Sjeff		runq_remove(rq, ke);
735109864Sjeff		runq_add(rq, ke);
736109864Sjeff	}
737109864Sjeff}
738109864Sjeff
739109864Sjeffvoid
740109864Sjeffsched_switchout(struct thread *td)
741109864Sjeff{
742109864Sjeff	struct kse *ke;
743109864Sjeff
744109864Sjeff	mtx_assert(&sched_lock, MA_OWNED);
745109864Sjeff
746109864Sjeff	ke = td->td_kse;
747109864Sjeff
748109864Sjeff	td->td_last_kse = ke;
749113339Sjulian	td->td_lastcpu = td->td_oncpu;
750113339Sjulian	td->td_oncpu = NOCPU;
751111032Sjulian	td->td_flags &= ~TDF_NEEDRESCHED;
752109864Sjeff
753109864Sjeff	if (TD_IS_RUNNING(td)) {
754116365Sjeff		/*
755116365Sjeff		 * This queue is always correct except for idle threads which
756116365Sjeff		 * have a higher priority due to priority propagation.
757116365Sjeff		 */
758116365Sjeff		if (ke->ke_ksegrp->kg_pri_class == PRI_IDLE &&
759116365Sjeff		    ke->ke_thread->td_priority > PRI_MIN_IDLE)
760116365Sjeff			ke->ke_runq = KSEQ_SELF()->ksq_curr;
761113357Sjeff		runq_add(ke->ke_runq, ke);
762113357Sjeff		/* setrunqueue(td); */
763109864Sjeff		return;
764111857Sjeff	}
765113357Sjeff	if (ke->ke_runq)
766113357Sjeff		kseq_rem(KSEQ_CPU(ke->ke_cpu), ke);
767109864Sjeff	/*
768109864Sjeff	 * We will not be on the run queue. So we must be
769109864Sjeff	 * sleeping or similar.
770109864Sjeff	 */
771116361Sdavidxu	if (td->td_proc->p_flag & P_SA)
772109864Sjeff		kse_reassign(ke);
773109864Sjeff}
774109864Sjeff
775109864Sjeffvoid
776109864Sjeffsched_switchin(struct thread *td)
777109864Sjeff{
778109864Sjeff	/* struct kse *ke = td->td_kse; */
779109864Sjeff	mtx_assert(&sched_lock, MA_OWNED);
780109864Sjeff
781113339Sjulian	td->td_oncpu = PCPU_GET(cpuid);
782109864Sjeff}
783109864Sjeff
784109864Sjeffvoid
785109864Sjeffsched_nice(struct ksegrp *kg, int nice)
786109864Sjeff{
787113357Sjeff	struct kse *ke;
788109864Sjeff	struct thread *td;
789113357Sjeff	struct kseq *kseq;
790109864Sjeff
791113873Sjhb	PROC_LOCK_ASSERT(kg->kg_proc, MA_OWNED);
792113873Sjhb	mtx_assert(&sched_lock, MA_OWNED);
793113357Sjeff	/*
794113357Sjeff	 * We need to adjust the nice counts for running KSEs.
795113357Sjeff	 */
796113357Sjeff	if (kg->kg_pri_class == PRI_TIMESHARE)
797113357Sjeff		FOREACH_KSE_IN_GROUP(kg, ke) {
798113357Sjeff			if (ke->ke_state != KES_ONRUNQ &&
799113357Sjeff			    ke->ke_state != KES_THREAD)
800113357Sjeff				continue;
801113357Sjeff			kseq = KSEQ_CPU(ke->ke_cpu);
802113357Sjeff			kseq_nice_rem(kseq, kg->kg_nice);
803113357Sjeff			kseq_nice_add(kseq, nice);
804113357Sjeff		}
805109864Sjeff	kg->kg_nice = nice;
806109864Sjeff	sched_priority(kg);
807113357Sjeff	FOREACH_THREAD_IN_GROUP(kg, td)
808111032Sjulian		td->td_flags |= TDF_NEEDRESCHED;
809109864Sjeff}
810109864Sjeff
811109864Sjeffvoid
812109864Sjeffsched_sleep(struct thread *td, u_char prio)
813109864Sjeff{
814109864Sjeff	mtx_assert(&sched_lock, MA_OWNED);
815109864Sjeff
816109864Sjeff	td->td_slptime = ticks;
817109864Sjeff	td->td_priority = prio;
818109864Sjeff
819113357Sjeff	CTR2(KTR_ULE, "sleep kse %p (tick: %d)",
820113357Sjeff	    td->td_kse, td->td_slptime);
821109864Sjeff}
822109864Sjeff
823109864Sjeffvoid
824109864Sjeffsched_wakeup(struct thread *td)
825109864Sjeff{
826109864Sjeff	mtx_assert(&sched_lock, MA_OWNED);
827109864Sjeff
828109864Sjeff	/*
829109864Sjeff	 * Let the kseg know how long we slept for.  This is because process
830109864Sjeff	 * interactivity behavior is modeled in the kseg.
831109864Sjeff	 */
832111788Sjeff	if (td->td_slptime) {
833111788Sjeff		struct ksegrp *kg;
834113357Sjeff		int hzticks;
835109864Sjeff
836111788Sjeff		kg = td->td_ksegrp;
837113357Sjeff		hzticks = ticks - td->td_slptime;
838113357Sjeff		kg->kg_slptime += hzticks << 10;
839111788Sjeff		sched_priority(kg);
840113357Sjeff		CTR2(KTR_ULE, "wakeup kse %p (%d ticks)",
841113357Sjeff		    td->td_kse, hzticks);
842111788Sjeff		td->td_slptime = 0;
843109864Sjeff	}
844109864Sjeff	setrunqueue(td);
845109864Sjeff	if (td->td_priority < curthread->td_priority)
846111032Sjulian		curthread->td_flags |= TDF_NEEDRESCHED;
847109864Sjeff}
848109864Sjeff
849109864Sjeff/*
850109864Sjeff * Penalize the parent for creating a new child and initialize the child's
851109864Sjeff * priority.
852109864Sjeff */
853109864Sjeffvoid
854113357Sjeffsched_fork(struct proc *p, struct proc *p1)
855109864Sjeff{
856109864Sjeff
857109864Sjeff	mtx_assert(&sched_lock, MA_OWNED);
858109864Sjeff
859113357Sjeff	sched_fork_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(p1));
860113357Sjeff	sched_fork_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(p1));
861113357Sjeff	sched_fork_thread(FIRST_THREAD_IN_PROC(p), FIRST_THREAD_IN_PROC(p1));
862113357Sjeff}
863113357Sjeff
864113357Sjeffvoid
865113357Sjeffsched_fork_kse(struct kse *ke, struct kse *child)
866113357Sjeff{
867113923Sjhb
868116365Sjeff	child->ke_slice = 1;	/* Attempt to quickly learn interactivity. */
869113357Sjeff	child->ke_cpu = ke->ke_cpu; /* sched_pickcpu(); */
870113357Sjeff	child->ke_runq = NULL;
871113357Sjeff
872113357Sjeff	/*
873113357Sjeff	 * Claim that we've been running for one second for statistical
874113357Sjeff	 * purposes.
875113357Sjeff	 */
876113357Sjeff	child->ke_ticks = 0;
877113357Sjeff	child->ke_ltick = ticks;
878113357Sjeff	child->ke_ftick = ticks - hz;
879113357Sjeff}
880113357Sjeff
881113357Sjeffvoid
882113357Sjeffsched_fork_ksegrp(struct ksegrp *kg, struct ksegrp *child)
883113357Sjeff{
884113923Sjhb
885113923Sjhb	PROC_LOCK_ASSERT(child->kg_proc, MA_OWNED);
886109864Sjeff	/* XXX Need something better here */
887116365Sjeff
888116365Sjeff#if 1
889116365Sjeff	child->kg_slptime = kg->kg_slptime;
890116365Sjeff	child->kg_runtime = kg->kg_runtime;
891116365Sjeff#else
892110645Sjeff	if (kg->kg_slptime > kg->kg_runtime) {
893111857Sjeff		child->kg_slptime = SCHED_DYN_RANGE;
894111857Sjeff		child->kg_runtime = kg->kg_slptime / SCHED_DYN_RANGE;
895110645Sjeff	} else {
896111857Sjeff		child->kg_runtime = SCHED_DYN_RANGE;
897111857Sjeff		child->kg_slptime = kg->kg_runtime / SCHED_DYN_RANGE;
898110645Sjeff	}
899116365Sjeff#endif
900113357Sjeff
901109864Sjeff	child->kg_user_pri = kg->kg_user_pri;
902113357Sjeff	child->kg_nice = kg->kg_nice;
903113357Sjeff}
904109864Sjeff
905113357Sjeffvoid
906113357Sjeffsched_fork_thread(struct thread *td, struct thread *child)
907113357Sjeff{
908113357Sjeff}
909113357Sjeff
910113357Sjeffvoid
911113357Sjeffsched_class(struct ksegrp *kg, int class)
912113357Sjeff{
913113357Sjeff	struct kseq *kseq;
914113357Sjeff	struct kse *ke;
915113357Sjeff
916113923Sjhb	mtx_assert(&sched_lock, MA_OWNED);
917113357Sjeff	if (kg->kg_pri_class == class)
918113357Sjeff		return;
919113357Sjeff
920113357Sjeff	FOREACH_KSE_IN_GROUP(kg, ke) {
921113357Sjeff		if (ke->ke_state != KES_ONRUNQ &&
922113357Sjeff		    ke->ke_state != KES_THREAD)
923113357Sjeff			continue;
924113357Sjeff		kseq = KSEQ_CPU(ke->ke_cpu);
925113357Sjeff
926113386Sjeff		kseq->ksq_loads[PRI_BASE(kg->kg_pri_class)]--;
927113386Sjeff		kseq->ksq_loads[PRI_BASE(class)]++;
928113357Sjeff
929113357Sjeff		if (kg->kg_pri_class == PRI_TIMESHARE)
930113357Sjeff			kseq_nice_rem(kseq, kg->kg_nice);
931113357Sjeff		else if (class == PRI_TIMESHARE)
932113357Sjeff			kseq_nice_add(kseq, kg->kg_nice);
933109970Sjeff	}
934109970Sjeff
935113357Sjeff	kg->kg_pri_class = class;
936109864Sjeff}
937109864Sjeff
938109864Sjeff/*
939109864Sjeff * Return some of the child's priority and interactivity to the parent.
940109864Sjeff */
941109864Sjeffvoid
942113357Sjeffsched_exit(struct proc *p, struct proc *child)
943109864Sjeff{
944109864Sjeff	/* XXX Need something better here */
945109864Sjeff	mtx_assert(&sched_lock, MA_OWNED);
946113372Sjeff	sched_exit_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(child));
947116365Sjeff	sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(child));
948109864Sjeff}
949109864Sjeff
950109864Sjeffvoid
951113372Sjeffsched_exit_kse(struct kse *ke, struct kse *child)
952113372Sjeff{
953113372Sjeff	kseq_rem(KSEQ_CPU(child->ke_cpu), child);
954113372Sjeff}
955113372Sjeff
956113372Sjeffvoid
957113372Sjeffsched_exit_ksegrp(struct ksegrp *kg, struct ksegrp *child)
958113372Sjeff{
959116365Sjeff	kg->kg_slptime += child->kg_slptime;
960116365Sjeff	kg->kg_runtime += child->kg_runtime;
961113372Sjeff}
962113372Sjeff
963113372Sjeffvoid
964113372Sjeffsched_exit_thread(struct thread *td, struct thread *child)
965113372Sjeff{
966113372Sjeff}
967113372Sjeff
968113372Sjeffvoid
969113357Sjeffsched_clock(struct kse *ke)
970109864Sjeff{
971113357Sjeff	struct kseq *kseq;
972113357Sjeff	struct ksegrp *kg;
973113357Sjeff	struct thread *td;
974113357Sjeff#if 0
975109864Sjeff	struct kse *nke;
976110267Sjeff#endif
977109864Sjeff
978113357Sjeff	/*
979113357Sjeff	 * sched_setup() apparently happens prior to stathz being set.  We
980113357Sjeff	 * need to resolve the timers earlier in the boot so we can avoid
981113357Sjeff	 * calculating this here.
982113357Sjeff	 */
983113357Sjeff	if (realstathz == 0) {
984113357Sjeff		realstathz = stathz ? stathz : hz;
985113357Sjeff		tickincr = hz / realstathz;
986113357Sjeff		/*
987113357Sjeff		 * XXX This does not work for values of stathz that are much
988113357Sjeff		 * larger than hz.
989113357Sjeff		 */
990113357Sjeff		if (tickincr == 0)
991113357Sjeff			tickincr = 1;
992113357Sjeff	}
993109864Sjeff
994113357Sjeff	td = ke->ke_thread;
995113357Sjeff	kg = ke->ke_ksegrp;
996109864Sjeff
997110028Sjeff	mtx_assert(&sched_lock, MA_OWNED);
998110028Sjeff	KASSERT((td != NULL), ("schedclock: null thread pointer"));
999110028Sjeff
1000110028Sjeff	/* Adjust ticks for pctcpu */
1001111793Sjeff	ke->ke_ticks++;
1002109971Sjeff	ke->ke_ltick = ticks;
1003112994Sjeff
1004109971Sjeff	/* Go up to one second beyond our max and then trim back down */
1005109971Sjeff	if (ke->ke_ftick + SCHED_CPU_TICKS + hz < ke->ke_ltick)
1006109971Sjeff		sched_pctcpu_update(ke);
1007109971Sjeff
1008114496Sjulian	if (td->td_flags & TDF_IDLETD)
1009109864Sjeff		return;
1010110028Sjeff
1011113357Sjeff	CTR4(KTR_ULE, "Tick kse %p (slice: %d, slptime: %d, runtime: %d)",
1012113357Sjeff	    ke, ke->ke_slice, kg->kg_slptime >> 10, kg->kg_runtime >> 10);
1013113357Sjeff
1014110028Sjeff	/*
1015113357Sjeff	 * We only do slicing code for TIMESHARE ksegrps.
1016113357Sjeff	 */
1017113357Sjeff	if (kg->kg_pri_class != PRI_TIMESHARE)
1018113357Sjeff		return;
1019113357Sjeff	/*
1020110028Sjeff	 * Check for a higher priority task on the run queue.  This can happen
1021110028Sjeff	 * on SMP if another processor woke up a process on our runq.
1022110028Sjeff	 */
1023110028Sjeff	kseq = KSEQ_SELF();
1024113357Sjeff#if 0
1025113357Sjeff	if (kseq->ksq_load > 1 && (nke = kseq_choose(kseq)) != NULL) {
1026113357Sjeff		if (sched_strict &&
1027113357Sjeff		    nke->ke_thread->td_priority < td->td_priority)
1028113357Sjeff			td->td_flags |= TDF_NEEDRESCHED;
1029113357Sjeff		else if (nke->ke_thread->td_priority <
1030113357Sjeff		    td->td_priority SCHED_PRIO_SLOP)
1031113357Sjeff
1032113357Sjeff		if (nke->ke_thread->td_priority < td->td_priority)
1033113357Sjeff			td->td_flags |= TDF_NEEDRESCHED;
1034113357Sjeff	}
1035110267Sjeff#endif
1036110645Sjeff	 * We used a tick; charge it to the ksegrp so that we can compute our
1037110645Sjeff	 * We used a tick charge it to the ksegrp so that we can compute our
1038113357Sjeff	 * interactivity.
1039109864Sjeff	 */
1040113357Sjeff	kg->kg_runtime += tickincr << 10;
1041110645Sjeff
1042109864Sjeff	/*
1043109864Sjeff	 * We used up one time slice.
1044109864Sjeff	 */
1045109864Sjeff	ke->ke_slice--;
1046113357Sjeff#ifdef SMP
1047113370Sjeff	kseq->ksq_rslices--;
1048113357Sjeff#endif
1049113357Sjeff
1050113357Sjeff	if (ke->ke_slice > 0)
1051113357Sjeff		return;
1052109864Sjeff	/*
1053113357Sjeff	 * We're out of time, recompute priorities and requeue.
1054109864Sjeff	 */
1055113357Sjeff	kseq_rem(kseq, ke);
1056113357Sjeff	sched_priority(kg);
1057113357Sjeff	sched_slice(ke);
1058113357Sjeff	if (SCHED_CURR(kg, ke))
1059113357Sjeff		ke->ke_runq = kseq->ksq_curr;
1060113357Sjeff	else
1061113357Sjeff		ke->ke_runq = kseq->ksq_next;
1062113357Sjeff	kseq_add(kseq, ke);
1063113357Sjeff	td->td_flags |= TDF_NEEDRESCHED;
1064109864Sjeff}
1065109864Sjeff
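/*
 * Illustrative tick accounting for sched_clock(), assuming hz = 1000 and
 * stathz = 128: realstathz = 128 and tickincr = 1000 / 128 = 7, so each
 * stat clock tick charges 7 << 10 to kg_runtime and, for a TIMESHARE
 * KSE, consumes one unit of ke_slice.  A freshly assigned slice of 100
 * therefore lasts on the order of 100 stat ticks (under a second)
 * before the KSE is re-sliced and requeued.
 */
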
1066109864Sjeffint
1067109864Sjeffsched_runnable(void)
1068109864Sjeff{
1069109864Sjeff	struct kseq *kseq;
1070115998Sjeff	int load;
1071109864Sjeff
1072115998Sjeff	load = 1;
1073115998Sjeff
1074115998Sjeff	mtx_lock_spin(&sched_lock);
1075110028Sjeff	kseq = KSEQ_SELF();
1076109864Sjeff
1077113357Sjeff	if (kseq->ksq_load)
1078115998Sjeff		goto out;
1079109970Sjeff#ifdef SMP
1080110028Sjeff	/*
1081110028Sjeff	 * For SMP we may steal other processors' KSEs.  Just search until we
1082110028Sjeff	 * verify that at least one other cpu has a runnable task.
1083110028Sjeff	 */
1084109970Sjeff	if (smp_started) {
1085109970Sjeff		int i;
1086109970Sjeff
1087109970Sjeff		for (i = 0; i < mp_maxid; i++) {
1088109970Sjeff			if (CPU_ABSENT(i))
1089109970Sjeff				continue;
1090110028Sjeff			kseq = KSEQ_CPU(i);
1091113660Sjeff			if (kseq->ksq_load > 1)
1092115998Sjeff				goto out;
1093109970Sjeff		}
1094109970Sjeff	}
1095109970Sjeff#endif
1096115998Sjeff	load = 0;
1097115998Sjeffout:
1098115998Sjeff	mtx_unlock_spin(&sched_lock);
1099115998Sjeff	return (load);
1100109864Sjeff}
1101109864Sjeff
1102109864Sjeffvoid
1103109864Sjeffsched_userret(struct thread *td)
1104109864Sjeff{
1105109864Sjeff	struct ksegrp *kg;
1106116365Sjeff	struct kseq *kseq;
1107116365Sjeff	struct kse *ke;
1108109864Sjeff
1109109864Sjeff	kg = td->td_ksegrp;
1110109864Sjeff
1111109864Sjeff	if (td->td_priority != kg->kg_user_pri) {
1112109864Sjeff		mtx_lock_spin(&sched_lock);
1113109864Sjeff		td->td_priority = kg->kg_user_pri;
1114116365Sjeff		kseq = KSEQ_SELF();
1115116365Sjeff		if (td->td_ksegrp->kg_pri_class == PRI_TIMESHARE &&
1116116365Sjeff		    kseq->ksq_load > 1 &&
1117116365Sjeff		    (ke = kseq_choose(kseq)) != NULL &&
1118116365Sjeff		    ke->ke_thread->td_priority < td->td_priority)
1119116365Sjeff			curthread->td_flags |= TDF_NEEDRESCHED;
1120109864Sjeff		mtx_unlock_spin(&sched_lock);
1121109864Sjeff	}
1122109864Sjeff}
1123109864Sjeff
1124109864Sjeffstruct kse *
1125109970Sjeffsched_choose(void)
1126109970Sjeff{
1127110028Sjeff	struct kseq *kseq;
1128109970Sjeff	struct kse *ke;
1129109970Sjeff
1130115998Sjeff	mtx_assert(&sched_lock, MA_OWNED);
1131113357Sjeff#ifdef SMP
1132112966Sjeffretry:
1133113357Sjeff#endif
1134113370Sjeff	kseq = KSEQ_SELF();
1135110028Sjeff	ke = kseq_choose(kseq);
1136109864Sjeff	if (ke) {
1137113357Sjeff		runq_remove(ke->ke_runq, ke);
1138109864Sjeff		ke->ke_state = KES_THREAD;
1139112966Sjeff
1140113357Sjeff		if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) {
1141113357Sjeff			CTR4(KTR_ULE, "Run kse %p from %p (slice: %d, pri: %d)",
1142113357Sjeff			    ke, ke->ke_runq, ke->ke_slice,
1143113357Sjeff			    ke->ke_thread->td_priority);
1144113357Sjeff		}
1145113357Sjeff		return (ke);
1146109864Sjeff	}
1147109864Sjeff
1148109970Sjeff#ifdef SMP
1149113370Sjeff	if (smp_started) {
1150109970Sjeff		/*
1151109970Sjeff		 * Find the cpu with the highest load and steal one proc.
1152109970Sjeff		 */
1153113370Sjeff		if ((kseq = kseq_load_highest()) == NULL)
1154113370Sjeff			return (NULL);
1155113370Sjeff
1156113370Sjeff		/*
1157113370Sjeff		 * Remove this kse from this kseq and runq and then requeue
1158113370Sjeff		 * on the current processor.  Then we will dequeue it
1159113370Sjeff		 * normally above.
1160113370Sjeff		 */
1161116069Sjeff		kseq_move(kseq, PCPU_GET(cpuid));
1162113370Sjeff		goto retry;
1163109970Sjeff	}
1164109970Sjeff#endif
1165113357Sjeff
1166113357Sjeff	return (NULL);
1167109864Sjeff}
1168109864Sjeff
1169109864Sjeffvoid
1170109864Sjeffsched_add(struct kse *ke)
1171109864Sjeff{
1172110267Sjeff	struct kseq *kseq;
1173113357Sjeff	struct ksegrp *kg;
1174109864Sjeff
1175109864Sjeff	mtx_assert(&sched_lock, MA_OWNED);
1176110267Sjeff	KASSERT((ke->ke_thread != NULL), ("sched_add: No thread on KSE"));
1177109864Sjeff	KASSERT((ke->ke_thread->td_kse != NULL),
1178110267Sjeff	    ("sched_add: No KSE on thread"));
1179109864Sjeff	KASSERT(ke->ke_state != KES_ONRUNQ,
1180110267Sjeff	    ("sched_add: kse %p (%s) already in run queue", ke,
1181109864Sjeff	    ke->ke_proc->p_comm));
1182109864Sjeff	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
1183110267Sjeff	    ("sched_add: process swapped out"));
1184113387Sjeff	KASSERT(ke->ke_runq == NULL,
1185113387Sjeff	    ("sched_add: KSE %p is still assigned to a run queue", ke));
1186109864Sjeff
1187113357Sjeff	kg = ke->ke_ksegrp;
1188113357Sjeff
1189113386Sjeff	switch (PRI_BASE(kg->kg_pri_class)) {
1190112994Sjeff	case PRI_ITHD:
1191112994Sjeff	case PRI_REALTIME:
1192112994Sjeff		kseq = KSEQ_SELF();
1193113357Sjeff		ke->ke_runq = kseq->ksq_curr;
1194113357Sjeff		ke->ke_slice = SCHED_SLICE_MAX;
1195113660Sjeff		ke->ke_cpu = PCPU_GET(cpuid);
1196112994Sjeff		break;
1197112994Sjeff	case PRI_TIMESHARE:
1198113357Sjeff		kseq = KSEQ_CPU(ke->ke_cpu);
1199113387Sjeff		if (SCHED_CURR(kg, ke))
1200113387Sjeff			ke->ke_runq = kseq->ksq_curr;
1201113387Sjeff		else
1202113387Sjeff			ke->ke_runq = kseq->ksq_next;
1203113357Sjeff		break;
1204112994Sjeff	case PRI_IDLE:
1205111789Sjeff		kseq = KSEQ_CPU(ke->ke_cpu);
1206113357Sjeff		/*
1207113357Sjeff		 * This is for priority prop.
1208113357Sjeff		 * This is for priority propagation.
1209116365Sjeff		if (ke->ke_thread->td_priority > PRI_MIN_IDLE)
1210113357Sjeff			ke->ke_runq = kseq->ksq_curr;
1211113357Sjeff		else
1212113357Sjeff			ke->ke_runq = &kseq->ksq_idle;
1213113357Sjeff		ke->ke_slice = SCHED_SLICE_MIN;
1214112994Sjeff		break;
1215113357Sjeff	default:
1216113357Sjeff		panic("Unknown pri class.\n");
1217113357Sjeff		break;
1218112994Sjeff	}
1219109864Sjeff
1220109864Sjeff	ke->ke_ksegrp->kg_runq_kses++;
1221109864Sjeff	ke->ke_state = KES_ONRUNQ;
1222109864Sjeff
1223113357Sjeff	runq_add(ke->ke_runq, ke);
1224113387Sjeff	kseq_add(kseq, ke);
1225109864Sjeff}
1226109864Sjeff
1227109864Sjeffvoid
1228109864Sjeffsched_rem(struct kse *ke)
1229109864Sjeff{
1230113357Sjeff	struct kseq *kseq;
1231113357Sjeff
1232109864Sjeff	mtx_assert(&sched_lock, MA_OWNED);
1233113387Sjeff	KASSERT((ke->ke_state == KES_ONRUNQ), ("KSE not on run queue"));
1234109864Sjeff
1235109864Sjeff	ke->ke_state = KES_THREAD;
1236109864Sjeff	ke->ke_ksegrp->kg_runq_kses--;
1237113357Sjeff	kseq = KSEQ_CPU(ke->ke_cpu);
1238113357Sjeff	runq_remove(ke->ke_runq, ke);
1239113357Sjeff	kseq_rem(kseq, ke);
1240109864Sjeff}
1241109864Sjeff
1242109864Sjefffixpt_t
1243109864Sjeffsched_pctcpu(struct kse *ke)
1244109864Sjeff{
1245109864Sjeff	fixpt_t pctcpu;
1246109864Sjeff
1247109864Sjeff	pctcpu = 0;
1248109864Sjeff
1249115998Sjeff	mtx_lock_spin(&sched_lock);
1250109864Sjeff	if (ke->ke_ticks) {
1251109864Sjeff		int rtick;
1252109864Sjeff
1253109864Sjeff		/* Update to account for time potentially spent sleeping */
1254109864Sjeff		ke->ke_ltick = ticks;
1255116365Sjeff		/*
1256116365Sjeff		 * Don't update more frequently than twice a second.  Allowing
1257116365Sjeff		 * this causes the cpu usage to decay away too quickly due to
1258116365Sjeff		 * rounding errors.
1259116365Sjeff		 */
1260116365Sjeff		if (ke->ke_ltick < (ticks - (hz / 2)))
1261116365Sjeff			sched_pctcpu_update(ke);
1262109864Sjeff
1263109864Sjeff		/* How many rtick per second ? */
1264116365Sjeff		rtick = min(ke->ke_ticks / SCHED_CPU_TIME, SCHED_CPU_TICKS);
1265110226Sscottl		pctcpu = (FSCALE * ((FSCALE * rtick)/realstathz)) >> FSHIFT;
1266109864Sjeff	}
1267109864Sjeff
1268109864Sjeff	ke->ke_proc->p_swtime = ke->ke_ltick - ke->ke_ftick;
1269113865Sjhb	mtx_unlock_spin(&sched_lock);
1270109864Sjeff
1271109864Sjeff	return (pctcpu);
1272109864Sjeff}
1273109864Sjeff
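/*
 * Illustrative check of the fixed point math above: a KSE that ran on
 * every stat tick of the averaging window has rtick == realstathz, so
 * pctcpu = (FSCALE * ((FSCALE * realstathz) / realstathz)) >> FSHIFT
 * reduces to FSCALE, i.e. 100% in fixpt_t terms, independent of the
 * actual FSCALE value.
 */
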
1274109864Sjeffint
1275109864Sjeffsched_sizeof_kse(void)
1276109864Sjeff{
1277109864Sjeff	return (sizeof(struct kse) + sizeof(struct ke_sched));
1278109864Sjeff}
1279109864Sjeff
1280109864Sjeffint
1281109864Sjeffsched_sizeof_ksegrp(void)
1282109864Sjeff{
1283109864Sjeff	return (sizeof(struct ksegrp) + sizeof(struct kg_sched));
1284109864Sjeff}
1285109864Sjeff
1286109864Sjeffint
1287109864Sjeffsched_sizeof_proc(void)
1288109864Sjeff{
1289109864Sjeff	return (sizeof(struct proc));
1290109864Sjeff}
1291109864Sjeff
1292109864Sjeffint
1293109864Sjeffsched_sizeof_thread(void)
1294109864Sjeff{
1295109864Sjeff	return (sizeof(struct thread) + sizeof(struct td_sched));
1296109864Sjeff}
1297