/*-
 * Copyright (c) 2002-2003, Jeffrey Roberson <jeff@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/sched_ule.c 116463 2003-06-17 06:39:51Z jeff $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resource.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/vmmeter.h>
#ifdef DDB
#include <ddb/ddb.h>
#endif
#ifdef KTRACE
#include <sys/uio.h>
#include <sys/ktrace.h>
#endif

#include <machine/cpu.h>

#define KTR_ULE         KTR_NFS

/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
/* XXX This is bogus compatibility crap for ps */
static fixpt_t  ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");

static void sched_setup(void *dummy);
SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL)

static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "SCHED");

static int sched_strict;
SYSCTL_INT(_kern_sched, OID_AUTO, strict, CTLFLAG_RD, &sched_strict, 0, "");

static int slice_min = 1;
SYSCTL_INT(_kern_sched, OID_AUTO, slice_min, CTLFLAG_RW, &slice_min, 0, "");

static int slice_max = 10;
SYSCTL_INT(_kern_sched, OID_AUTO, slice_max, CTLFLAG_RW, &slice_max, 0, "");

int realstathz;
int tickincr = 1;

#ifdef SMP
/* Callout to handle load balancing SMP systems. */
static struct callout kseq_lb_callout;
#endif

/*
 * These datastructures are allocated within their parent datastructure but
 * are scheduler specific.
 */

struct ke_sched {
	int		ske_slice;
	struct runq	*ske_runq;
	/* The following variables are only used for pctcpu calculation */
	int		ske_ltick;	/* Last tick that we were running on */
	int		ske_ftick;	/* First tick that we were running on */
	int		ske_ticks;	/* Tick count */
	/* CPU that we have affinity for. */
	u_char		ske_cpu;
};
#define	ke_slice	ke_sched->ske_slice
#define	ke_runq		ke_sched->ske_runq
#define	ke_ltick	ke_sched->ske_ltick
#define	ke_ftick	ke_sched->ske_ftick
#define	ke_ticks	ke_sched->ske_ticks
#define	ke_cpu		ke_sched->ske_cpu

struct kg_sched {
	int	skg_slptime;		/* Number of ticks we vol. slept */
	int	skg_runtime;		/* Number of ticks we were running */
};
#define	kg_slptime	kg_sched->skg_slptime
#define	kg_runtime	kg_sched->skg_runtime

struct td_sched {
	int	std_slptime;
};
#define	td_slptime	td_sched->std_slptime

struct td_sched td_sched;
struct ke_sched ke_sched;
struct kg_sched kg_sched;

struct ke_sched *kse0_sched = &ke_sched;
struct kg_sched *ksegrp0_sched = &kg_sched;
struct p_sched *proc0_sched = NULL;
struct td_sched *thread0_sched = &td_sched;

/*
 * This priority range has 20 priorities on either end that are reachable
 * only through nice values.
 *
 * PRI_RANGE:	Total priority range for timeshare threads.
 * PRI_NRESV:	Reserved priorities for nice.
 * PRI_BASE:	The start of the dynamic range.
 * DYN_RANGE:	Number of priorities that are available in the dynamic
 *		priority range.
 */
#define	SCHED_PRI_RANGE		(PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1)
#define	SCHED_PRI_NRESV		PRIO_TOTAL
#define	SCHED_PRI_NHALF		(PRIO_TOTAL / 2)
#define	SCHED_PRI_NTHRESH	(SCHED_PRI_NHALF - 1)
#define	SCHED_PRI_BASE		((SCHED_PRI_NRESV / 2) + PRI_MIN_TIMESHARE)
#define	SCHED_DYN_RANGE		(SCHED_PRI_RANGE - SCHED_PRI_NRESV)
#define	SCHED_PRI_INTERACT(score)					\
    ((score) * SCHED_DYN_RANGE / SCHED_INTERACT_MAX)
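/*
 * Illustrative example (assumed values, not from the original comment):
 * with the stock 5.x ranges PRI_MIN_TIMESHARE = 160, PRI_MAX_TIMESHARE = 223
 * and PRIO_TOTAL = 40, these macros give PRI_RANGE = 64, PRI_NRESV = 40,
 * DYN_RANGE = 24 and PRI_BASE = 180.  An interactivity score of 50 then
 * maps to SCHED_PRI_INTERACT(50) = 50 * 24 / 100 = 12, so sched_priority()
 * below would start from a user priority of 180 + 12 = 192 before the nice
 * value is added and the result is clamped to the timeshare range.
 */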

/*
 * These determine the interactivity of a process.
 *
 * SLP_RUN_MAX:	Maximum amount of sleep time + run time we'll accumulate
 *		before throttling back.
 * SLP_RUN_THROTTLE:	Divisor for reducing slp/run time.
 * INTERACT_MAX:	Maximum interactivity value.  Smaller is better.
 * INTERACT_THRESH:	Threshold for placement on the current runq.
 */
#define	SCHED_SLP_RUN_MAX	((hz * 2) << 10)
#define	SCHED_SLP_RUN_THROTTLE	(2)
#define	SCHED_INTERACT_MAX	(100)
#define	SCHED_INTERACT_HALF	(SCHED_INTERACT_MAX / 2)
#define	SCHED_INTERACT_THRESH	(20)
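/*
 * Illustrative note (assuming hz = 1000): kg_slptime and kg_runtime are
 * kept as ticks shifted left by 10 bits, so SCHED_SLP_RUN_MAX is 2000 << 10,
 * i.e. roughly two seconds of combined sleep + run history before
 * sched_interact_update() below scales the pair back.
 */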

/*
 * These parameters and macros determine the size of the time slice that is
 * granted to each thread.
 *
 * SLICE_MIN:	Minimum time slice granted, in units of ticks.
 * SLICE_MAX:	Maximum time slice granted.
 * SLICE_RANGE:	Range of available time slices scaled by hz.
 * SLICE_SCALE:	The number of slices granted per val in the range of [0, max].
 * SLICE_NICE:  Determine the amount of slice granted to a scaled nice.
 */
#define	SCHED_SLICE_MIN			(slice_min)
#define	SCHED_SLICE_MAX			(slice_max)
#define	SCHED_SLICE_RANGE		(SCHED_SLICE_MAX - SCHED_SLICE_MIN + 1)
#define	SCHED_SLICE_SCALE(val, max)	(((val) * SCHED_SLICE_RANGE) / (max))
#define	SCHED_SLICE_NICE(nice)						\
    (SCHED_SLICE_MAX - SCHED_SLICE_SCALE((nice), SCHED_PRI_NTHRESH))
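/*
 * Illustrative example (assuming hz = 1000 and the slice_min/slice_max
 * defaults of hz/100 and hz/10 set in sched_setup()): SLICE_RANGE is
 * 100 - 10 + 1 = 91 ticks.  A kse whose nice value is 5 points above the
 * least nice kse on its queue gets SCHED_SLICE_NICE(5) =
 * 100 - (5 * 91) / 19 = 77 ticks; at SCHED_PRI_NTHRESH (19) points it
 * bottoms out at 9 ticks, and beyond that window sched_slice() hands out
 * a slice of 0.
 */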

/*
 * This macro determines whether or not the kse belongs on the current or
 * next run queue.
 *
 * XXX nice value should affect how interactive a kg is.
 */
#define	SCHED_INTERACTIVE(kg)						\
    (sched_interact_score(kg) < SCHED_INTERACT_THRESH)
#define	SCHED_CURR(kg, ke)						\
    (ke->ke_thread->td_priority < PRI_MIN_TIMESHARE || SCHED_INTERACTIVE(kg))

/*
 * Cpu percentage computation macros and defines.
 *
 * SCHED_CPU_TIME:	Number of seconds to average the cpu usage across.
 * SCHED_CPU_TICKS:	Number of hz ticks to average the cpu usage across.
 */

#define	SCHED_CPU_TIME	10
#define	SCHED_CPU_TICKS	(hz * SCHED_CPU_TIME)

/*
 * kseq - per processor runqs and statistics.
 */

#define	KSEQ_NCLASS	(PRI_IDLE + 1)	/* Number of run classes. */

struct kseq {
	struct runq	ksq_idle;		/* Queue of IDLE threads. */
	struct runq	ksq_timeshare[2];	/* Run queues for !IDLE. */
	struct runq	*ksq_next;		/* Next timeshare queue. */
	struct runq	*ksq_curr;		/* Current queue. */
	int		ksq_loads[KSEQ_NCLASS];	/* Load for each class */
	int		ksq_load;		/* Aggregate load. */
	short		ksq_nice[PRIO_TOTAL + 1]; /* KSEs in each nice bin. */
	short		ksq_nicemin;		/* Least nice. */
#ifdef SMP
	unsigned int	ksq_rslices;	/* Slices on run queue */
#endif
};

/*
 * One kse queue per processor.
 */
#ifdef SMP
struct kseq	kseq_cpu[MAXCPU];
#define	KSEQ_SELF()	(&kseq_cpu[PCPU_GET(cpuid)])
#define	KSEQ_CPU(x)	(&kseq_cpu[(x)])
#else
struct kseq	kseq_cpu;
#define	KSEQ_SELF()	(&kseq_cpu)
#define	KSEQ_CPU(x)	(&kseq_cpu)
#endif

static void sched_slice(struct kse *ke);
static void sched_priority(struct ksegrp *kg);
static int sched_interact_score(struct ksegrp *kg);
static void sched_interact_update(struct ksegrp *kg);
void sched_pctcpu_update(struct kse *ke);
int sched_pickcpu(void);

/* Operations on per processor queues */
static struct kse * kseq_choose(struct kseq *kseq);
static void kseq_setup(struct kseq *kseq);
static void kseq_add(struct kseq *kseq, struct kse *ke);
static void kseq_rem(struct kseq *kseq, struct kse *ke);
static void kseq_nice_add(struct kseq *kseq, int nice);
static void kseq_nice_rem(struct kseq *kseq, int nice);
void kseq_print(int cpu);
#ifdef SMP
struct kseq * kseq_load_highest(void);
void kseq_balance(void *arg);
void kseq_move(struct kseq *from, int cpu);
#endif

void
kseq_print(int cpu)
{
	struct kseq *kseq;
	int i;

	kseq = KSEQ_CPU(cpu);

	printf("kseq:\n");
	printf("\tload:           %d\n", kseq->ksq_load);
	printf("\tload ITHD:      %d\n", kseq->ksq_loads[PRI_ITHD]);
	printf("\tload REALTIME:  %d\n", kseq->ksq_loads[PRI_REALTIME]);
	printf("\tload TIMESHARE: %d\n", kseq->ksq_loads[PRI_TIMESHARE]);
	printf("\tload IDLE:      %d\n", kseq->ksq_loads[PRI_IDLE]);
	printf("\tnicemin:\t%d\n", kseq->ksq_nicemin);
	printf("\tnice counts:\n");
	for (i = 0; i < PRIO_TOTAL + 1; i++)
		if (kseq->ksq_nice[i])
			printf("\t\t%d = %d\n",
			    i - SCHED_PRI_NHALF, kseq->ksq_nice[i]);
}

static void
kseq_add(struct kseq *kseq, struct kse *ke)
{
	mtx_assert(&sched_lock, MA_OWNED);
	kseq->ksq_loads[PRI_BASE(ke->ke_ksegrp->kg_pri_class)]++;
	kseq->ksq_load++;
	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
		CTR6(KTR_ULE,
		    "Add kse %p to %p (slice: %d, pri: %d, nice: %d(%d))",
		    ke, ke->ke_runq, ke->ke_slice, ke->ke_thread->td_priority,
		    ke->ke_ksegrp->kg_nice, kseq->ksq_nicemin);
	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
		kseq_nice_add(kseq, ke->ke_ksegrp->kg_nice);
#ifdef SMP
	kseq->ksq_rslices += ke->ke_slice;
#endif
}

static void
kseq_rem(struct kseq *kseq, struct kse *ke)
{
	mtx_assert(&sched_lock, MA_OWNED);
	kseq->ksq_loads[PRI_BASE(ke->ke_ksegrp->kg_pri_class)]--;
	kseq->ksq_load--;
	ke->ke_runq = NULL;
	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
		kseq_nice_rem(kseq, ke->ke_ksegrp->kg_nice);
#ifdef SMP
	kseq->ksq_rslices -= ke->ke_slice;
#endif
}

static void
kseq_nice_add(struct kseq *kseq, int nice)
{
	mtx_assert(&sched_lock, MA_OWNED);
	/* Normalize to zero. */
	kseq->ksq_nice[nice + SCHED_PRI_NHALF]++;
	if (nice < kseq->ksq_nicemin || kseq->ksq_loads[PRI_TIMESHARE] == 1)
		kseq->ksq_nicemin = nice;
}

static void
kseq_nice_rem(struct kseq *kseq, int nice)
{
	int n;

	mtx_assert(&sched_lock, MA_OWNED);
	/* Normalize to zero. */
	n = nice + SCHED_PRI_NHALF;
	kseq->ksq_nice[n]--;
	KASSERT(kseq->ksq_nice[n] >= 0, ("Negative nice count."));

	/*
	 * If this wasn't the smallest nice value or there are more in
	 * this bucket we can just return.  Otherwise we have to recalculate
	 * the smallest nice.
	 */
	if (nice != kseq->ksq_nicemin ||
	    kseq->ksq_nice[n] != 0 ||
	    kseq->ksq_loads[PRI_TIMESHARE] == 0)
		return;

	for (; n < SCHED_PRI_NRESV + 1; n++)
		if (kseq->ksq_nice[n]) {
			kseq->ksq_nicemin = n - SCHED_PRI_NHALF;
			return;
		}
}

#ifdef SMP
/*
 * kseq_balance is a simple CPU load balancing algorithm.  It operates by
 * finding the least loaded and most loaded cpu and equalizing their load
 * by migrating some processes.
 *
 * Dealing only with two CPUs at a time has two advantages.  Firstly, most
 * installations will only have 2 cpus.  Secondly, load balancing too much at
 * once can have an unpleasant effect on the system.  The scheduler rarely has
 * enough information to make perfect decisions.  So this algorithm chooses
 * simplicity and more gradual effects on load in larger systems.
 *
 * It could be improved by considering the priorities and slices assigned to
 * each task prior to balancing them.  There are many pathological cases with
 * any approach and so the semi random algorithm below may work as well as any.
 *
 */
void
kseq_balance(void *arg)
{
	struct kseq *kseq;
	int high_load;
	int low_load;
	int high_cpu;
	int low_cpu;
	int move;
	int diff;
	int i;

	high_cpu = 0;
	low_cpu = 0;
	high_load = 0;
	low_load = -1;

	mtx_lock_spin(&sched_lock);
	for (i = 0; i < mp_maxid; i++) {
		if (CPU_ABSENT(i))
			continue;
		kseq = KSEQ_CPU(i);
		if (kseq->ksq_load > high_load) {
			high_load = kseq->ksq_load;
			high_cpu = i;
		}
		if (low_load == -1 || kseq->ksq_load < low_load) {
			low_load = kseq->ksq_load;
			low_cpu = i;
		}
	}

	/*
	 * Nothing to do.
	 */
	if (high_load < 2 || low_load == high_load)
		goto out;

	diff = high_load - low_load;
	move = diff / 2;
	if (diff & 0x1)
		move++;

	for (i = 0; i < move; i++)
		kseq_move(KSEQ_CPU(high_cpu), low_cpu);

out:
	mtx_unlock_spin(&sched_lock);
	callout_reset(&kseq_lb_callout, hz, kseq_balance, NULL);

	return;
}
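/*
 * Worked example (illustrative, not from the original sources): with two
 * cpus carrying loads of 5 and 1, diff is 4 and move is 2, so two KSEs are
 * pulled off the busy kseq by kseq_move() and re-added on the lightly
 * loaded cpu, leaving loads of 3 and 3.  An odd difference rounds the
 * move count up rather than down.
 */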

struct kseq *
kseq_load_highest(void)
{
	struct kseq *kseq;
	int load;
	int cpu;
	int i;

	mtx_assert(&sched_lock, MA_OWNED);
	cpu = 0;
	load = 0;

	for (i = 0; i < mp_maxid; i++) {
		if (CPU_ABSENT(i))
			continue;
		kseq = KSEQ_CPU(i);
		if (kseq->ksq_load > load) {
			load = kseq->ksq_load;
			cpu = i;
		}
	}
	if (load > 1)
		return (KSEQ_CPU(cpu));

	return (NULL);
}

void
kseq_move(struct kseq *from, int cpu)
{
	struct kse *ke;

	ke = kseq_choose(from);
	runq_remove(ke->ke_runq, ke);
	ke->ke_state = KES_THREAD;
	kseq_rem(from, ke);

	ke->ke_cpu = cpu;
	sched_add(ke);
}
#endif

struct kse *
kseq_choose(struct kseq *kseq)
{
	struct kse *ke;
	struct runq *swap;

	mtx_assert(&sched_lock, MA_OWNED);
	swap = NULL;

	for (;;) {
		ke = runq_choose(kseq->ksq_curr);
		if (ke == NULL) {
			/*
			 * We already swapped once and didn't get anywhere.
			 */
			if (swap)
				break;
			swap = kseq->ksq_curr;
			kseq->ksq_curr = kseq->ksq_next;
			kseq->ksq_next = swap;
			continue;
		}
		/*
		 * If we encounter a slice of 0 the kse is in a
		 * TIMESHARE kse group and its nice was too far out
		 * of the range that receives slices.
		 */
		if (ke->ke_slice == 0) {
			runq_remove(ke->ke_runq, ke);
			sched_slice(ke);
			ke->ke_runq = kseq->ksq_next;
			runq_add(ke->ke_runq, ke);
			continue;
		}
		return (ke);
	}

	return (runq_choose(&kseq->ksq_idle));
}

static void
kseq_setup(struct kseq *kseq)
{
	runq_init(&kseq->ksq_timeshare[0]);
	runq_init(&kseq->ksq_timeshare[1]);
	runq_init(&kseq->ksq_idle);

	kseq->ksq_curr = &kseq->ksq_timeshare[0];
	kseq->ksq_next = &kseq->ksq_timeshare[1];

	kseq->ksq_loads[PRI_ITHD] = 0;
	kseq->ksq_loads[PRI_REALTIME] = 0;
	kseq->ksq_loads[PRI_TIMESHARE] = 0;
	kseq->ksq_loads[PRI_IDLE] = 0;
	kseq->ksq_load = 0;
#ifdef SMP
	kseq->ksq_rslices = 0;
#endif
}

static void
sched_setup(void *dummy)
{
	int i;

	slice_min = (hz/100);
	slice_max = (hz/10);

	mtx_lock_spin(&sched_lock);
	/* init kseqs */
	for (i = 0; i < MAXCPU; i++)
		kseq_setup(KSEQ_CPU(i));

	kseq_add(KSEQ_SELF(), &kse0);
	mtx_unlock_spin(&sched_lock);
#ifdef SMP
	callout_init(&kseq_lb_callout, 1);
	kseq_balance(NULL);
#endif
}

/*
 * Scale the scheduling priority according to the "interactivity" of this
 * process.
 */
static void
sched_priority(struct ksegrp *kg)
{
	int pri;

	if (kg->kg_pri_class != PRI_TIMESHARE)
		return;

	pri = SCHED_PRI_INTERACT(sched_interact_score(kg));
	pri += SCHED_PRI_BASE;
	pri += kg->kg_nice;

	if (pri > PRI_MAX_TIMESHARE)
		pri = PRI_MAX_TIMESHARE;
	else if (pri < PRI_MIN_TIMESHARE)
		pri = PRI_MIN_TIMESHARE;

	kg->kg_user_pri = pri;

	return;
}

/*
 * Calculate a time slice based on the properties of the kseg and the runq
 * that we're on.  This is only for PRI_TIMESHARE ksegrps.
 */
static void
sched_slice(struct kse *ke)
{
	struct kseq *kseq;
	struct ksegrp *kg;

	kg = ke->ke_ksegrp;
	kseq = KSEQ_CPU(ke->ke_cpu);

	/*
	 * Rationale:
	 * KSEs in interactive ksegs get the minimum slice so that we
	 * quickly notice if it abuses its advantage.
	 *
	 * KSEs in non-interactive ksegs are assigned a slice that is
	 * based on the ksegs nice value relative to the least nice kseg
	 * on the run queue for this cpu.
	 *
	 * If the KSE is less nice than all others it gets the maximum
	 * slice and other KSEs will adjust their slice relative to
	 * this when they first expire.
	 *
	 * There is a 20 point window that starts relative to the least
	 * nice kse on the run queue.  Slice size is determined by
	 * the kse distance from the least nice ksegrp.
	 *
	 * If you are outside of the window you will get no slice and
	 * you will be reevaluated each time you are selected on the
	 * run queue.
	 *
	 */

	if (!SCHED_INTERACTIVE(kg)) {
		int nice;

		nice = kg->kg_nice + (0 - kseq->ksq_nicemin);
		if (kseq->ksq_loads[PRI_TIMESHARE] == 0 ||
		    kg->kg_nice < kseq->ksq_nicemin)
			ke->ke_slice = SCHED_SLICE_MAX;
		else if (nice <= SCHED_PRI_NTHRESH)
			ke->ke_slice = SCHED_SLICE_NICE(nice);
		else
			ke->ke_slice = 0;
	} else
		ke->ke_slice = SCHED_SLICE_MIN;

	CTR6(KTR_ULE,
	    "Sliced %p(%d) (nice: %d, nicemin: %d, load: %d, interactive: %d)",
	    ke, ke->ke_slice, kg->kg_nice, kseq->ksq_nicemin,
	    kseq->ksq_loads[PRI_TIMESHARE], SCHED_INTERACTIVE(kg));

	/*
	 * Check to see if we need to scale back the slp and run time
	 * in the kg.  This will cause us to forget old interactivity
	 * while maintaining the current ratio.
	 */
	sched_interact_update(kg);

	return;
}

static void
sched_interact_update(struct ksegrp *kg)
{
	if ((kg->kg_runtime + kg->kg_slptime) > SCHED_SLP_RUN_MAX) {
		kg->kg_runtime = (kg->kg_runtime / 5) * 4;
		kg->kg_slptime = (kg->kg_slptime / 5) * 4;
	}
}
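/*
 * Illustrative note (assumed hz = 1000): the 4/5 scaling keeps the
 * sleep/run ratio while shrinking the history.  A ksegrp with 1.5s of run
 * time and 1s of sleep time exceeds SCHED_SLP_RUN_MAX and is trimmed to
 * 1.2s and 0.8s, so its interactivity score is unchanged but newer
 * behavior carries more weight from then on.
 */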

static int
sched_interact_score(struct ksegrp *kg)
{
	int div;

	if (kg->kg_runtime > kg->kg_slptime) {
		div = max(1, kg->kg_runtime / SCHED_INTERACT_HALF);
		return (SCHED_INTERACT_HALF +
		    (SCHED_INTERACT_HALF - (kg->kg_slptime / div)));
	}
	if (kg->kg_slptime > kg->kg_runtime) {
		div = max(1, kg->kg_slptime / SCHED_INTERACT_HALF);
		return (kg->kg_runtime / div);
	}

	/*
	 * This can happen if slptime and runtime are 0.
	 */
	return (0);
}
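/*
 * Worked example (illustrative): with SCHED_INTERACT_HALF = 50, a ksegrp
 * that has slept three times as long as it has run, say kg_slptime = 3072
 * and kg_runtime = 1024, takes the second branch: div = max(1, 3072 / 50)
 * = 61 and the score is 1024 / 61 = 16, under SCHED_INTERACT_THRESH (20)
 * and therefore interactive.  Swapping the two values gives
 * 50 + (50 - 16) = 84, a clearly CPU-bound score.
 */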

/*
 * This is only somewhat accurate since given many processes of the same
 * priority they will switch when their slices run out, which will be
 * at most SCHED_SLICE_MAX.
 */
int
sched_rr_interval(void)
{
	return (SCHED_SLICE_MAX);
}

void
sched_pctcpu_update(struct kse *ke)
{
	/*
	 * Adjust counters and watermark for pctcpu calc.
	 */

	/*
	 * Shift the tick count out so that the divide doesn't round away
	 * our results.
	 */
	ke->ke_ticks <<= 10;
	ke->ke_ticks = (ke->ke_ticks / (ke->ke_ltick - ke->ke_ftick)) *
		    SCHED_CPU_TICKS;
	ke->ke_ticks >>= 10;
	ke->ke_ltick = ticks;
	ke->ke_ftick = ke->ke_ltick - SCHED_CPU_TICKS;
}
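/*
 * Worked example (illustrative, assumed hz = 1000): if a kse has
 * ke_ticks = 700 but its window (ke_ltick - ke_ftick) has grown to 11000
 * ticks, the scaled count is ((700 << 10) / 11000) * 10000 >> 10, roughly
 * 634 ticks over the nominal 10000-tick window; without the shift the
 * divide 700 / 11000 would truncate straight to 0.
 */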

#ifdef SMP
/* XXX Should be changed to kseq_load_lowest() */
int
sched_pickcpu(void)
{
	struct kseq *kseq;
	int load;
	int cpu;
	int i;

	mtx_assert(&sched_lock, MA_OWNED);
	if (!smp_started)
		return (0);

	load = -1;
	cpu = 0;

	for (i = 0; i < mp_maxid; i++) {
		if (CPU_ABSENT(i))
			continue;
		kseq = KSEQ_CPU(i);
		if (load == -1 || kseq->ksq_load < load) {
			cpu = i;
			load = kseq->ksq_load;
		}
	}

	CTR1(KTR_RUNQ, "sched_pickcpu: %d", cpu);
	return (cpu);
}
#else
int
sched_pickcpu(void)
{
	return (0);
}
#endif

void
sched_prio(struct thread *td, u_char prio)
{
	struct kse *ke;
	struct runq *rq;

	mtx_assert(&sched_lock, MA_OWNED);
	ke = td->td_kse;
	td->td_priority = prio;

	if (TD_ON_RUNQ(td)) {
		rq = ke->ke_runq;

		runq_remove(rq, ke);
		runq_add(rq, ke);
	}
}

void
sched_switchout(struct thread *td)
{
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);

	ke = td->td_kse;

	td->td_last_kse = ke;
	td->td_lastcpu = td->td_oncpu;
	td->td_oncpu = NOCPU;
	td->td_flags &= ~TDF_NEEDRESCHED;

	if (TD_IS_RUNNING(td)) {
		/*
		 * This queue is always correct except for idle threads which
		 * have a higher priority due to priority propagation.
		 */
		if (ke->ke_ksegrp->kg_pri_class == PRI_IDLE &&
		    ke->ke_thread->td_priority < PRI_MIN_IDLE)
			ke->ke_runq = KSEQ_SELF()->ksq_curr;
		runq_add(ke->ke_runq, ke);
		/* setrunqueue(td); */
		return;
	}
	if (ke->ke_runq)
		kseq_rem(KSEQ_CPU(ke->ke_cpu), ke);
	/*
	 * We will not be on the run queue. So we must be
	 * sleeping or similar.
	 */
	if (td->td_proc->p_flag & P_SA)
		kse_reassign(ke);
}

void
sched_switchin(struct thread *td)
{
	/* struct kse *ke = td->td_kse; */
	mtx_assert(&sched_lock, MA_OWNED);

	td->td_oncpu = PCPU_GET(cpuid);
}

void
sched_nice(struct ksegrp *kg, int nice)
{
	struct kse *ke;
	struct thread *td;
	struct kseq *kseq;

	PROC_LOCK_ASSERT(kg->kg_proc, MA_OWNED);
	mtx_assert(&sched_lock, MA_OWNED);
	/*
	 * We need to adjust the nice counts for running KSEs.
	 */
	if (kg->kg_pri_class == PRI_TIMESHARE)
		FOREACH_KSE_IN_GROUP(kg, ke) {
			if (ke->ke_state != KES_ONRUNQ &&
			    ke->ke_state != KES_THREAD)
				continue;
			kseq = KSEQ_CPU(ke->ke_cpu);
			kseq_nice_rem(kseq, kg->kg_nice);
			kseq_nice_add(kseq, nice);
		}
	kg->kg_nice = nice;
	sched_priority(kg);
	FOREACH_THREAD_IN_GROUP(kg, td)
		td->td_flags |= TDF_NEEDRESCHED;
}

void
sched_sleep(struct thread *td, u_char prio)
{
	mtx_assert(&sched_lock, MA_OWNED);

	td->td_slptime = ticks;
	td->td_priority = prio;

	CTR2(KTR_ULE, "sleep kse %p (tick: %d)",
	    td->td_kse, td->td_slptime);
}

void
sched_wakeup(struct thread *td)
{
	mtx_assert(&sched_lock, MA_OWNED);

	/*
	 * Let the kseg know how long we slept for.  This is because process
	 * interactivity behavior is modeled in the kseg.
	 */
	if (td->td_slptime) {
		struct ksegrp *kg;
		int hzticks;

		kg = td->td_ksegrp;
		hzticks = ticks - td->td_slptime;
		kg->kg_slptime += hzticks << 10;
		sched_interact_update(kg);
		sched_priority(kg);
		if (td->td_kse)
			sched_slice(td->td_kse);
		CTR2(KTR_ULE, "wakeup kse %p (%d ticks)",
		    td->td_kse, hzticks);
		td->td_slptime = 0;
	}
	setrunqueue(td);
	if (td->td_priority < curthread->td_priority)
		curthread->td_flags |= TDF_NEEDRESCHED;
}

/*
 * Penalize the parent for creating a new child and initialize the child's
 * priority.
 */
void
sched_fork(struct proc *p, struct proc *p1)
{

	mtx_assert(&sched_lock, MA_OWNED);

	sched_fork_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(p1));
	sched_fork_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(p1));
	sched_fork_thread(FIRST_THREAD_IN_PROC(p), FIRST_THREAD_IN_PROC(p1));
}

void
sched_fork_kse(struct kse *ke, struct kse *child)
{

	child->ke_slice = 1;	/* Attempt to quickly learn interactivity. */
	child->ke_cpu = ke->ke_cpu; /* sched_pickcpu(); */
	child->ke_runq = NULL;

	/*
	 * Claim that we've been running for one second for statistical
	 * purposes.
	 */
	child->ke_ticks = 0;
	child->ke_ltick = ticks;
	child->ke_ftick = ticks - hz;
}

void
sched_fork_ksegrp(struct ksegrp *kg, struct ksegrp *child)
{

	PROC_LOCK_ASSERT(child->kg_proc, MA_OWNED);
	/* XXX Need something better here */

	child->kg_slptime = kg->kg_slptime;
	child->kg_runtime = kg->kg_runtime;
	kg->kg_runtime += tickincr << 10;
	sched_interact_update(kg);

	child->kg_user_pri = kg->kg_user_pri;
	child->kg_nice = kg->kg_nice;
}

void
sched_fork_thread(struct thread *td, struct thread *child)
{
}

void
sched_class(struct ksegrp *kg, int class)
{
	struct kseq *kseq;
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);
	if (kg->kg_pri_class == class)
		return;

	FOREACH_KSE_IN_GROUP(kg, ke) {
		if (ke->ke_state != KES_ONRUNQ &&
		    ke->ke_state != KES_THREAD)
			continue;
		kseq = KSEQ_CPU(ke->ke_cpu);

		kseq->ksq_loads[PRI_BASE(kg->kg_pri_class)]--;
		kseq->ksq_loads[PRI_BASE(class)]++;

		if (kg->kg_pri_class == PRI_TIMESHARE)
			kseq_nice_rem(kseq, kg->kg_nice);
		else if (class == PRI_TIMESHARE)
			kseq_nice_add(kseq, kg->kg_nice);
	}

	kg->kg_pri_class = class;
}

/*
 * Return some of the child's priority and interactivity to the parent.
 */
void
sched_exit(struct proc *p, struct proc *child)
{
	/* XXX Need something better here */
	mtx_assert(&sched_lock, MA_OWNED);
	sched_exit_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(child));
	sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(child));
}

void
sched_exit_kse(struct kse *ke, struct kse *child)
{
	kseq_rem(KSEQ_CPU(child->ke_cpu), child);
}

void
sched_exit_ksegrp(struct ksegrp *kg, struct ksegrp *child)
{
	/* kg->kg_slptime += child->kg_slptime; */
	kg->kg_runtime += child->kg_runtime;
	sched_interact_update(kg);
}

void
sched_exit_thread(struct thread *td, struct thread *child)
{
}

void
sched_clock(struct kse *ke)
{
	struct kseq *kseq;
	struct ksegrp *kg;
	struct thread *td;
#if 0
	struct kse *nke;
#endif

	/*
	 * sched_setup() apparently happens prior to stathz being set.  We
	 * need to resolve the timers earlier in the boot so we can avoid
	 * calculating this here.
	 */
	if (realstathz == 0) {
		realstathz = stathz ? stathz : hz;
		tickincr = hz / realstathz;
		/*
		 * XXX This does not work for values of stathz that are much
		 * larger than hz.
		 */
		if (tickincr == 0)
			tickincr = 1;
	}
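	/*
	 * Illustrative note (assumed values): with hz = 1000 and
	 * stathz = 128, tickincr is 1000 / 128 = 7, so each stat clock tick
	 * below adds 7 << 10 to kg_runtime.  The integer truncation is why
	 * the XXX above warns about stathz values much larger than hz,
	 * where hz / realstathz would round to 0 without the fixup.
	 */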

	td = ke->ke_thread;
	kg = ke->ke_ksegrp;

	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT((td != NULL), ("schedclock: null thread pointer"));

	/* Adjust ticks for pctcpu */
	ke->ke_ticks++;
	ke->ke_ltick = ticks;

	/* Go up to one second beyond our max and then trim back down */
	if (ke->ke_ftick + SCHED_CPU_TICKS + hz < ke->ke_ltick)
		sched_pctcpu_update(ke);

	if (td->td_flags & TDF_IDLETD)
		return;

	CTR4(KTR_ULE, "Tick kse %p (slice: %d, slptime: %d, runtime: %d)",
	    ke, ke->ke_slice, kg->kg_slptime >> 10, kg->kg_runtime >> 10);

	/*
	 * We only do slicing code for TIMESHARE ksegrps.
	 */
	if (kg->kg_pri_class != PRI_TIMESHARE)
		return;
	/*
	 * Check for a higher priority task on the run queue.  This can happen
	 * on SMP if another processor woke up a process on our runq.
	 */
	kseq = KSEQ_SELF();
#if 0
	if (kseq->ksq_load > 1 && (nke = kseq_choose(kseq)) != NULL) {
		if (sched_strict &&
		    nke->ke_thread->td_priority < td->td_priority)
			td->td_flags |= TDF_NEEDRESCHED;
		else if (nke->ke_thread->td_priority <
		    td->td_priority SCHED_PRIO_SLOP)

		if (nke->ke_thread->td_priority < td->td_priority)
			td->td_flags |= TDF_NEEDRESCHED;
	}
#endif
	/*
	 * We used a tick; charge it to the ksegrp so that we can compute our
	 * interactivity.
	 */
	kg->kg_runtime += tickincr << 10;
	sched_interact_update(kg);

	/*
	 * We used up one time slice.
	 */
	ke->ke_slice--;
#ifdef SMP
	kseq->ksq_rslices--;
#endif

	if (ke->ke_slice > 0)
		return;
	/*
	 * We're out of time, recompute priorities and requeue.
	 */
	kseq_rem(kseq, ke);
	sched_priority(kg);
	sched_slice(ke);
	if (SCHED_CURR(kg, ke))
		ke->ke_runq = kseq->ksq_curr;
	else
		ke->ke_runq = kseq->ksq_next;
	kseq_add(kseq, ke);
	td->td_flags |= TDF_NEEDRESCHED;
}

int
sched_runnable(void)
{
	struct kseq *kseq;
	int load;

	load = 1;

	mtx_lock_spin(&sched_lock);
	kseq = KSEQ_SELF();

	if (kseq->ksq_load)
		goto out;
#ifdef SMP
	/*
	 * For SMP we may steal other processor's KSEs.  Just search until we
	 * verify that at least one other cpu has a runnable task.
	 */
	if (smp_started) {
		int i;

		for (i = 0; i < mp_maxid; i++) {
			if (CPU_ABSENT(i))
				continue;
			kseq = KSEQ_CPU(i);
			if (kseq->ksq_load > 1)
				goto out;
		}
	}
#endif
	load = 0;
out:
	mtx_unlock_spin(&sched_lock);
	return (load);
}

void
sched_userret(struct thread *td)
{
	struct ksegrp *kg;
	struct kseq *kseq;
	struct kse *ke;

	kg = td->td_ksegrp;

	if (td->td_priority != kg->kg_user_pri) {
		mtx_lock_spin(&sched_lock);
		td->td_priority = kg->kg_user_pri;
		kseq = KSEQ_SELF();
		if (td->td_ksegrp->kg_pri_class == PRI_TIMESHARE &&
		    kseq->ksq_load > 1 &&
		    (ke = kseq_choose(kseq)) != NULL &&
		    ke->ke_thread->td_priority < td->td_priority)
			curthread->td_flags |= TDF_NEEDRESCHED;
		mtx_unlock_spin(&sched_lock);
	}
}

struct kse *
sched_choose(void)
{
	struct kseq *kseq;
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);
#ifdef SMP
retry:
#endif
	kseq = KSEQ_SELF();
	ke = kseq_choose(kseq);
	if (ke) {
		runq_remove(ke->ke_runq, ke);
		ke->ke_state = KES_THREAD;

		if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) {
			CTR4(KTR_ULE, "Run kse %p from %p (slice: %d, pri: %d)",
			    ke, ke->ke_runq, ke->ke_slice,
			    ke->ke_thread->td_priority);
		}
		return (ke);
	}

#ifdef SMP
	if (smp_started) {
		/*
		 * Find the cpu with the highest load and steal one proc.
		 */
		if ((kseq = kseq_load_highest()) == NULL)
			return (NULL);

		/*
		 * Remove this kse from this kseq and runq and then requeue
		 * on the current processor.  Then we will dequeue it
		 * normally above.
		 */
		kseq_move(kseq, PCPU_GET(cpuid));
		goto retry;
	}
#endif

	return (NULL);
}

void
sched_add(struct kse *ke)
{
	struct kseq *kseq;
	struct ksegrp *kg;

	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT((ke->ke_thread != NULL), ("sched_add: No thread on KSE"));
	KASSERT((ke->ke_thread->td_kse != NULL),
	    ("sched_add: No KSE on thread"));
	KASSERT(ke->ke_state != KES_ONRUNQ,
	    ("sched_add: kse %p (%s) already in run queue", ke,
	    ke->ke_proc->p_comm));
	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
	    ("sched_add: process swapped out"));
	KASSERT(ke->ke_runq == NULL,
	    ("sched_add: KSE %p is still assigned to a run queue", ke));

	kg = ke->ke_ksegrp;

	switch (PRI_BASE(kg->kg_pri_class)) {
	case PRI_ITHD:
	case PRI_REALTIME:
		kseq = KSEQ_SELF();
		ke->ke_runq = kseq->ksq_curr;
		ke->ke_slice = SCHED_SLICE_MAX;
		ke->ke_cpu = PCPU_GET(cpuid);
		break;
	case PRI_TIMESHARE:
		kseq = KSEQ_CPU(ke->ke_cpu);
		if (SCHED_CURR(kg, ke))
			ke->ke_runq = kseq->ksq_curr;
		else
			ke->ke_runq = kseq->ksq_next;
		break;
	case PRI_IDLE:
		kseq = KSEQ_CPU(ke->ke_cpu);
		/*
		 * This is for priority prop.
		 */
		if (ke->ke_thread->td_priority < PRI_MIN_IDLE)
			ke->ke_runq = kseq->ksq_curr;
		else
			ke->ke_runq = &kseq->ksq_idle;
		ke->ke_slice = SCHED_SLICE_MIN;
		break;
	default:
		panic("Unknown pri class.\n");
		break;
	}

	ke->ke_ksegrp->kg_runq_kses++;
	ke->ke_state = KES_ONRUNQ;

	runq_add(ke->ke_runq, ke);
	kseq_add(kseq, ke);
}

void
sched_rem(struct kse *ke)
{
	struct kseq *kseq;

	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT((ke->ke_state == KES_ONRUNQ), ("KSE not on run queue"));

	ke->ke_state = KES_THREAD;
	ke->ke_ksegrp->kg_runq_kses--;
	kseq = KSEQ_CPU(ke->ke_cpu);
	runq_remove(ke->ke_runq, ke);
	kseq_rem(kseq, ke);
}

fixpt_t
sched_pctcpu(struct kse *ke)
{
	fixpt_t pctcpu;

	pctcpu = 0;

	mtx_lock_spin(&sched_lock);
	if (ke->ke_ticks) {
		int rtick;

		/* Update to account for time potentially spent sleeping */
		ke->ke_ltick = ticks;
		/*
		 * Don't update more frequently than twice a second.  Allowing
		 * this causes the cpu usage to decay away too quickly due to
		 * rounding errors.
		 */
		if (ke->ke_ltick < (ticks - (hz / 2)))
			sched_pctcpu_update(ke);

		/* How many rtick per second ? */
		rtick = min(ke->ke_ticks / SCHED_CPU_TIME, SCHED_CPU_TICKS);
		pctcpu = (FSCALE * ((FSCALE * rtick)/realstathz)) >> FSHIFT;
	}

	ke->ke_proc->p_swtime = ke->ke_ltick - ke->ke_ftick;
	mtx_unlock_spin(&sched_lock);

	return (pctcpu);
}

int
sched_sizeof_kse(void)
{
	return (sizeof(struct kse) + sizeof(struct ke_sched));
}

int
sched_sizeof_ksegrp(void)
{
	return (sizeof(struct ksegrp) + sizeof(struct kg_sched));
}

int
sched_sizeof_proc(void)
{
	return (sizeof(struct proc));
}

int
sched_sizeof_thread(void)
{
	return (sizeof(struct thread) + sizeof(struct td_sched));
}
1298