sched_ule.c revision 112966
/*-
 * Copyright (c) 2003, Jeffrey Roberson <jeff@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * $FreeBSD: head/sys/kern/sched_ule.c 112966 2003-04-02 06:46:43Z jeff $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resource.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/vmmeter.h>
#ifdef DDB
#include <ddb/ddb.h>
#endif
#ifdef KTRACE
#include <sys/uio.h>
#include <sys/ktrace.h>
#endif

#include <machine/cpu.h>

/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
/* XXX This is bogus compatibility crap for ps */
static fixpt_t  ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");

static void sched_setup(void *dummy);
SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL)

int realstathz;

#define	SCHED_STRICT_RESCHED 1

/*
 * These data structures are allocated within their parent data structure but
 * are scheduler specific.
 */

struct ke_sched {
	int		ske_slice;
	struct runq	*ske_runq;
	/* The following variables are only used for pctcpu calculation */
	int		ske_ltick;	/* Last tick that we were running on */
	int		ske_ftick;	/* First tick that we were running on */
	int		ske_ticks;	/* Tick count */
	u_char		ske_cpu;
};
#define	ke_slice	ke_sched->ske_slice
#define	ke_runq		ke_sched->ske_runq
#define	ke_ltick	ke_sched->ske_ltick
#define	ke_ftick	ke_sched->ske_ftick
#define	ke_ticks	ke_sched->ske_ticks
#define	ke_cpu		ke_sched->ske_cpu

struct kg_sched {
	int	skg_slptime;		/* Number of ticks we vol. slept */
	int	skg_runtime;		/* Number of ticks we were running */
};
#define	kg_slptime	kg_sched->skg_slptime
#define	kg_runtime	kg_sched->skg_runtime

struct td_sched {
	int	std_slptime;
	int	std_schedflag;
};
#define	td_slptime	td_sched->std_slptime
#define	td_schedflag	td_sched->std_schedflag

#define	TD_SCHED_BLOAD	0x0001		/*
					 * thread was counted as being in short
					 * term sleep.
					 */
struct td_sched td_sched;
struct ke_sched ke_sched;
struct kg_sched kg_sched;

struct ke_sched *kse0_sched = &ke_sched;
struct kg_sched *ksegrp0_sched = &kg_sched;
struct p_sched *proc0_sched = NULL;
struct td_sched *thread0_sched = &td_sched;

/*
 * This priority range has 20 priorities on either end that are reachable
 * only through nice values.
 *
 * PRI_RANGE:	Total priority range for timeshare threads.
 * PRI_NRESV:	Reserved priorities for nice.
 * PRI_BASE:	The start of the dynamic range.
 * DYN_RANGE:	Number of priorities that are available in the dynamic
 *		priority range.
 * DYN_HALF:	Half of DYN_RANGE for convenience elsewhere.
 * PRI_DYN:	The dynamic priority, derived from the number of ticks
 *		running vs the total number of ticks.
 */
#define	SCHED_PRI_RANGE		(PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1)
#define	SCHED_PRI_NRESV		PRIO_TOTAL
#define	SCHED_PRI_NHALF		(PRIO_TOTAL >> 2)
#define	SCHED_PRI_BASE		((SCHED_PRI_NRESV / 2) + PRI_MIN_TIMESHARE)
#define	SCHED_DYN_RANGE		(SCHED_PRI_RANGE - SCHED_PRI_NRESV)
#define	SCHED_DYN_HALF		(SCHED_DYN_RANGE / 2)
#define	SCHED_PRI_DYN(run, total)	(((run) * SCHED_DYN_RANGE) / (total))

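/*
 * Worked example (editorial note; the constants are assumed, not taken from
 * this file -- check <sys/priority.h> and <sys/resource.h>): with
 * PRI_MIN_TIMESHARE = 160, PRI_MAX_TIMESHARE = 223 and PRIO_TOTAL = 40,
 * the macros above evaluate to SCHED_PRI_RANGE = 64, SCHED_PRI_NRESV = 40,
 * SCHED_PRI_NHALF = 10, SCHED_PRI_BASE = 180, SCHED_DYN_RANGE = 24 and
 * SCHED_DYN_HALF = 12.  SCHED_PRI_DYN(run, total) then maps a run/total
 * ratio of 0..1 onto the 24 dynamic priority slots above the base.
 */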

/*
 * These determine the interactivity of a process.
 *
 * SLP_RUN_MAX:	Maximum amount of sleep time + run time we'll accumulate
 *		before throttling back.
 * SLP_RUN_THROTTLE:	Divisor for reducing slp/run time.
 * INTERACT_RANGE:	Range of interactivity values.  Smaller is better.
 * INTERACT_HALF:	Convenience define, half of the interactivity range.
 * INTERACT_THRESH:	Threshold for placement on the current runq.
 */
#define	SCHED_SLP_RUN_MAX	((hz * 30) << 10)
#define	SCHED_SLP_RUN_THROTTLE	(10)
#define	SCHED_INTERACT_RANGE	(100)
#define	SCHED_INTERACT_HALF	(SCHED_INTERACT_RANGE / 2)
#define	SCHED_INTERACT_THRESH	(10)

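/*
 * Editorial note: kg_slptime and kg_runtime are kept in ticks shifted left
 * by 10 bits (see sched_wakeup() and sched_clock()), so SCHED_SLP_RUN_MAX
 * is 30 seconds' worth of ticks in the same scaled units.  Once the sum of
 * the two exceeds that limit, both are divided by SCHED_SLP_RUN_THROTTLE in
 * sched_slice(), which decays the history while preserving the sleep/run
 * ratio that the interactivity score depends on.
 */
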
/*
 * These parameters and macros determine the size of the time slice that is
 * granted to each thread.
 *
 * SLICE_MIN:	Minimum time slice granted, in units of ticks.
 * SLICE_MAX:	Maximum time slice granted.
 * SLICE_RANGE:	Range of available time slices scaled by hz.
 * SLICE_SCALE:	The number of slices granted per val in the range of [0, max].
 * SLICE_NICE:	Determines the slice granted for a scaled nice value.
 */
#define	SCHED_SLICE_MIN			(hz / 100)
#define	SCHED_SLICE_MAX			(hz / 10)
#define	SCHED_SLICE_RANGE		(SCHED_SLICE_MAX - SCHED_SLICE_MIN + 1)
#define	SCHED_SLICE_SCALE(val, max)	(((val) * SCHED_SLICE_RANGE) / (max))
#define	SCHED_SLICE_NICE(nice)						\
    (SCHED_SLICE_MAX - SCHED_SLICE_SCALE((nice), SCHED_PRI_NHALF))

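/*
 * Worked example (editorial note, hz = 100 assumed): SCHED_SLICE_MIN = 1,
 * SCHED_SLICE_MAX = 10 and SCHED_SLICE_RANGE = 10 ticks.  With
 * SCHED_PRI_NHALF = 10 (see the example above), SCHED_SLICE_NICE(nice) is
 * simply 10 - nice: a kse whose nice value equals the least nice value on
 * its run queue gets the full 10 ticks, one that is 5 points nicer gets 5
 * ticks, and anything more than SCHED_PRI_NHALF points nicer falls back to
 * a zero slice in sched_slice().
 */
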
/*
 * This macro determines whether or not the kse belongs on the current or
 * next run queue.
 *
 * XXX nice value should affect how interactive a kg is.
 */
#define	SCHED_CURR(kg)	(sched_interact_score(kg) < SCHED_INTERACT_THRESH)

/*
 * Cpu percentage computation macros and defines.
 *
 * SCHED_CPU_TIME:	Number of seconds to average the cpu usage across.
 * SCHED_CPU_TICKS:	Number of hz ticks to average the cpu usage across.
 */

#define	SCHED_CPU_TIME	60
#define	SCHED_CPU_TICKS	(hz * SCHED_CPU_TIME)

/*
 * kseq - pair of runqs per processor
 */

struct kseq {
	struct runq	ksq_runqs[2];
	struct runq	*ksq_curr;
	struct runq	*ksq_next;
	int		ksq_load;	/* Total runnable */
#ifdef SMP
	unsigned int	ksq_rslices;	/* Slices on run queue */
	unsigned int	ksq_bload;	/* Threads waiting on IO */
#endif
};

/*
 * One kse queue per processor.
 */
#ifdef SMP
struct kseq	kseq_cpu[MAXCPU];
#define	KSEQ_SELF()	(&kseq_cpu[PCPU_GET(cpuid)])
#define	KSEQ_CPU(x)	(&kseq_cpu[(x)])
#else
struct kseq	kseq_cpu;
#define	KSEQ_SELF()	(&kseq_cpu)
#define	KSEQ_CPU(x)	(&kseq_cpu)
#endif

static void sched_slice(struct kse *ke);
static int sched_priority(struct ksegrp *kg);
static int sched_interact_score(struct ksegrp *kg);
void sched_pctcpu_update(struct kse *ke);
int sched_pickcpu(void);

/* Operations on per processor queues */
static struct kse * kseq_choose(struct kseq *kseq);
static int kseq_nice_min(struct kseq *kseq);
static void kseq_setup(struct kseq *kseq);
static __inline void kseq_add(struct kseq *kseq, struct kse *ke);
static __inline void kseq_rem(struct kseq *kseq, struct kse *ke);
#ifdef SMP
static __inline void kseq_sleep(struct kseq *kseq, struct kse *ke);
static __inline void kseq_wakeup(struct kseq *kseq, struct kse *ke);
struct kseq * kseq_load_highest(void);
#endif

static __inline void
kseq_add(struct kseq *kseq, struct kse *ke)
{
	runq_add(ke->ke_runq, ke);
	kseq->ksq_load++;
#ifdef SMP
	kseq->ksq_rslices += ke->ke_slice;
#endif
}
static __inline void
kseq_rem(struct kseq *kseq, struct kse *ke)
{
	kseq->ksq_load--;
	runq_remove(ke->ke_runq, ke);
#ifdef SMP
	kseq->ksq_rslices -= ke->ke_slice;
#endif
}

#ifdef SMP
static __inline void
kseq_sleep(struct kseq *kseq, struct kse *ke)
{
	kseq->ksq_bload++;
}

static __inline void
kseq_wakeup(struct kseq *kseq, struct kse *ke)
{
	kseq->ksq_bload--;
}

struct kseq *
kseq_load_highest(void)
{
	struct kseq *kseq;
	int load;
	int cpu;
	int i;

	cpu = 0;
	load = 0;

	for (i = 0; i < mp_maxid; i++) {
		if (CPU_ABSENT(i))
			continue;
		kseq = KSEQ_CPU(i);
		if (kseq->ksq_load > load) {
			load = kseq->ksq_load;
			cpu = i;
		}
	}
	if (load)
		return (KSEQ_CPU(cpu));

	return (NULL);
}
#endif

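/*
 * Pick the next kse to run from a kseq.  Everything is dequeued from the
 * "current" queue; when it runs dry, the current and next queues are
 * swapped, so kses that were parked on the next queue (because their slice
 * expired or they were judged non-interactive in sched_add()) get their
 * turn on the following round.
 */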
struct kse *
kseq_choose(struct kseq *kseq)
{
	struct kse *ke;
	struct runq *swap;

	if ((ke = runq_choose(kseq->ksq_curr)) == NULL) {
		swap = kseq->ksq_curr;
		kseq->ksq_curr = kseq->ksq_next;
		kseq->ksq_next = swap;
		ke = runq_choose(kseq->ksq_curr);
	}

	return (ke);
}

static int
kseq_nice_min(struct kseq *kseq)
{
	struct kse *ke0;
	struct kse *ke1;

	if (kseq->ksq_load == 0)
		return (0);

	ke0 = runq_choose(kseq->ksq_curr);
	ke1 = runq_choose(kseq->ksq_next);

	if (ke0 == NULL)
		return (ke1->ke_ksegrp->kg_nice);

	if (ke1 == NULL)
		return (ke0->ke_ksegrp->kg_nice);

	return (min(ke0->ke_ksegrp->kg_nice, ke1->ke_ksegrp->kg_nice));
}

static void
kseq_setup(struct kseq *kseq)
{
	kseq->ksq_curr = &kseq->ksq_runqs[0];
	kseq->ksq_next = &kseq->ksq_runqs[1];
	runq_init(kseq->ksq_curr);
	runq_init(kseq->ksq_next);
	kseq->ksq_load = 0;
#ifdef SMP
	kseq->ksq_rslices = 0;
	kseq->ksq_bload = 0;
#endif
}

static void
sched_setup(void *dummy)
{
	int i;

	realstathz = stathz ? stathz : hz;

	mtx_lock_spin(&sched_lock);
	/* init kseqs */
	for (i = 0; i < MAXCPU; i++)
		kseq_setup(KSEQ_CPU(i));
	mtx_unlock_spin(&sched_lock);
}

/*
 * Scale the scheduling priority according to the "interactivity" of this
 * process.
 */
static int
sched_priority(struct ksegrp *kg)
{
	int pri;

	if (kg->kg_pri_class != PRI_TIMESHARE)
		return (kg->kg_user_pri);

	pri = sched_interact_score(kg) * SCHED_DYN_RANGE / SCHED_INTERACT_RANGE;
	pri += SCHED_PRI_BASE;
	pri += kg->kg_nice;

	if (pri > PRI_MAX_TIMESHARE)
		pri = PRI_MAX_TIMESHARE;
	else if (pri < PRI_MIN_TIMESHARE)
		pri = PRI_MIN_TIMESHARE;

	kg->kg_user_pri = pri;

	return (kg->kg_user_pri);
}

/*
 * Calculate a time slice based on the properties of the kseg and the runq
 * that we're on.
 */
static void
sched_slice(struct kse *ke)
{
	struct ksegrp *kg;

	kg = ke->ke_ksegrp;

	/*
	 * Rationale:
	 * KSEs in interactive ksegs get the minimum slice so that we
	 * quickly notice if one abuses its advantage.
	 *
	 * KSEs in non-interactive ksegs are assigned a slice that is
	 * based on the kseg's nice value relative to the least nice kseg
	 * on the run queue for this cpu.
	 *
	 * If the KSE is less nice than all others it gets the maximum
	 * slice and other KSEs will adjust their slice relative to
	 * this when they first expire.
	 *
	 * There is a 20 point window that starts relative to the least
	 * nice kse on the run queue.  Slice size is determined by
	 * the kse's distance from the least nice ksegrp.
	 *
	 * If you are outside of the window you will get no slice and
	 * you will be reevaluated each time you are selected on the
	 * run queue.
	 *
	 */

	if (!SCHED_CURR(kg)) {
		struct kseq *kseq;
		int nice_base;
		int nice;

		kseq = KSEQ_CPU(ke->ke_cpu);
		nice_base = kseq_nice_min(kseq);
		nice = kg->kg_nice + (0 - nice_base);

		if (kseq->ksq_load == 0 || kg->kg_nice < nice_base)
			ke->ke_slice = SCHED_SLICE_MAX;
		else if (nice <= SCHED_PRI_NHALF)
			ke->ke_slice = SCHED_SLICE_NICE(nice);
		else
			ke->ke_slice = 0;
	} else
		ke->ke_slice = SCHED_SLICE_MIN;

	/*
	 * Every time we grant a new slice, check to see if we need to scale
	 * back the slp and run time in the kg.  This will cause us to forget
	 * old interactivity while maintaining the current ratio.
	 */
	if ((kg->kg_runtime + kg->kg_slptime) > SCHED_SLP_RUN_MAX) {
		kg->kg_runtime /= SCHED_SLP_RUN_THROTTLE;
		kg->kg_slptime /= SCHED_SLP_RUN_THROTTLE;
	}

	return;
}

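/*
 * Compute an interactivity score in [0, SCHED_INTERACT_RANGE) from the
 * ratio of voluntary sleep time to run time.  Scores below
 * SCHED_INTERACT_HALF mean the kseg sleeps more than it runs; scores above
 * it mean the opposite.  For example (editorial illustration): a kseg that
 * slept 9 units for every 1 unit it ran scores about 5 and stays on the
 * current queue, while one that ran 9 units for every 1 unit of sleep
 * scores about 55 and is treated as a batch job.
 */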
static int
sched_interact_score(struct ksegrp *kg)
{
	int big;
	int small;
	int base;

	if (kg->kg_runtime > kg->kg_slptime) {
		big = kg->kg_runtime;
		small = kg->kg_slptime;
		base = SCHED_INTERACT_HALF;
	} else {
		big = kg->kg_slptime;
		small = kg->kg_runtime;
		base = 0;
	}

	big /= SCHED_INTERACT_HALF;
	if (big != 0)
		small /= big;
	else
		small = 0;

	small += base;
	/* XXX Factor in nice */
	return (small);
}

int
sched_rr_interval(void)
{
	return (SCHED_SLICE_MAX);
}

void
sched_pctcpu_update(struct kse *ke)
{
	/*
	 * Adjust counters and watermark for pctcpu calc.
	 */
	/*
	 * Shift the tick count out so that the divide doesn't round away
	 * our results.
	 */
	ke->ke_ticks <<= 10;
	ke->ke_ticks = (ke->ke_ticks / (ke->ke_ltick - ke->ke_ftick)) *
		    SCHED_CPU_TICKS;
	ke->ke_ticks >>= 10;
	ke->ke_ltick = ticks;
	ke->ke_ftick = ke->ke_ltick - SCHED_CPU_TICKS;
}

#ifdef SMP
/* XXX Should be changed to kseq_load_lowest() */
int
sched_pickcpu(void)
{
	struct kseq *kseq;
	int load;
	int cpu;
	int i;

	if (!smp_started)
		return (0);

	load = 0;
	cpu = 0;

	for (i = 0; i < mp_maxid; i++) {
		if (CPU_ABSENT(i))
			continue;
		kseq = KSEQ_CPU(i);
		if (kseq->ksq_load < load) {
			cpu = i;
			load = kseq->ksq_load;
		}
	}

	CTR1(KTR_RUNQ, "sched_pickcpu: %d", cpu);
	return (cpu);
}
#else
int
sched_pickcpu(void)
{
	return (0);
}
#endif

void
sched_prio(struct thread *td, u_char prio)
{
	struct kse *ke;
	struct runq *rq;

	mtx_assert(&sched_lock, MA_OWNED);
	ke = td->td_kse;
	td->td_priority = prio;

	if (TD_ON_RUNQ(td)) {
		rq = ke->ke_runq;

		runq_remove(rq, ke);
		runq_add(rq, ke);
	}
}

void
sched_switchout(struct thread *td)
{
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);

	ke = td->td_kse;

	td->td_last_kse = ke;
	td->td_lastcpu = ke->ke_oncpu;
	ke->ke_oncpu = NOCPU;
	td->td_flags &= ~TDF_NEEDRESCHED;

	if (TD_IS_RUNNING(td)) {
		setrunqueue(td);
		return;
	}
	td->td_kse->ke_runq = NULL;

	/*
	 * We will not be on the run queue. So we must be
	 * sleeping or similar.
	 */
	if (td->td_proc->p_flag & P_THREADED)
		kse_reassign(ke);
}

void
sched_switchin(struct thread *td)
{
	/* struct kse *ke = td->td_kse; */
	mtx_assert(&sched_lock, MA_OWNED);

	td->td_kse->ke_oncpu = PCPU_GET(cpuid);
#if SCHED_STRICT_RESCHED
	if (td->td_ksegrp->kg_pri_class == PRI_TIMESHARE &&
	    td->td_priority != td->td_ksegrp->kg_user_pri)
		curthread->td_flags |= TDF_NEEDRESCHED;
#endif
}

void
sched_nice(struct ksegrp *kg, int nice)
{
	struct thread *td;

	kg->kg_nice = nice;
	sched_priority(kg);
	FOREACH_THREAD_IN_GROUP(kg, td) {
		td->td_flags |= TDF_NEEDRESCHED;
	}
}

void
sched_sleep(struct thread *td, u_char prio)
{
	mtx_assert(&sched_lock, MA_OWNED);

	td->td_slptime = ticks;
	td->td_priority = prio;

#ifdef SMP
	if (td->td_priority < PZERO) {
		kseq_sleep(KSEQ_CPU(td->td_kse->ke_cpu), td->td_kse);
		td->td_schedflag |= TD_SCHED_BLOAD;
	}
#endif
}

void
sched_wakeup(struct thread *td)
{
	mtx_assert(&sched_lock, MA_OWNED);

	/*
	 * Let the kseg know how long we slept for.  This is because process
	 * interactivity behavior is modeled in the kseg.
	 */
	if (td->td_slptime) {
		struct ksegrp *kg;

		kg = td->td_ksegrp;
		kg->kg_slptime += (ticks - td->td_slptime) << 10;
		sched_priority(kg);
		td->td_slptime = 0;
	}
#ifdef SMP
	if (td->td_priority < PZERO && td->td_schedflag & TD_SCHED_BLOAD) {
		kseq_wakeup(KSEQ_CPU(td->td_kse->ke_cpu), td->td_kse);
		td->td_schedflag &= ~TD_SCHED_BLOAD;
	}
#endif
	setrunqueue(td);
#if SCHED_STRICT_RESCHED
	if (td->td_priority < curthread->td_priority)
		curthread->td_flags |= TDF_NEEDRESCHED;
#endif
}

/*
 * Penalize the parent for creating a new child and initialize the child's
 * priority.
 */
void
sched_fork(struct ksegrp *kg, struct ksegrp *child)
{
	struct kse *ckse;
	struct kse *pkse;

	mtx_assert(&sched_lock, MA_OWNED);
	ckse = FIRST_KSE_IN_KSEGRP(child);
	pkse = FIRST_KSE_IN_KSEGRP(kg);

	/* XXX Need something better here */
	if (kg->kg_slptime > kg->kg_runtime) {
		child->kg_slptime = SCHED_DYN_RANGE;
		child->kg_runtime = kg->kg_slptime / SCHED_DYN_RANGE;
	} else {
		child->kg_runtime = SCHED_DYN_RANGE;
		child->kg_slptime = kg->kg_runtime / SCHED_DYN_RANGE;
	}
#if 0
	child->kg_slptime = kg->kg_slptime;
	child->kg_runtime = kg->kg_runtime;
#endif
	child->kg_user_pri = kg->kg_user_pri;

#if 0
	if (pkse->ke_cpu != PCPU_GET(cpuid)) {
		printf("pkse->ke_cpu = %d\n", pkse->ke_cpu);
		printf("cpuid = %d", PCPU_GET(cpuid));
		Debugger("stop");
	}
#endif

	ckse->ke_slice = pkse->ke_slice;
	ckse->ke_cpu = pkse->ke_cpu; /* sched_pickcpu(); */
	ckse->ke_runq = NULL;
	/*
	 * Claim that we've been running for one second for statistical
	 * purposes.
	 */
	ckse->ke_ticks = 0;
	ckse->ke_ltick = ticks;
	ckse->ke_ftick = ticks - hz;
}

/*
 * Return some of the child's priority and interactivity to the parent.
 */
void
sched_exit(struct ksegrp *kg, struct ksegrp *child)
{
	/* XXX Need something better here */
	mtx_assert(&sched_lock, MA_OWNED);
	kg->kg_slptime = child->kg_slptime;
	kg->kg_runtime = child->kg_runtime;
	sched_priority(kg);
}

void
sched_clock(struct thread *td)
{
	struct kse *ke;
#if SCHED_STRICT_RESCHED
	struct kse *nke;
	struct kseq *kseq;
#endif
	struct ksegrp *kg;

	ke = td->td_kse;
	kg = td->td_ksegrp;

	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT((td != NULL), ("schedclock: null thread pointer"));

	/* Adjust ticks for pctcpu */
	ke->ke_ticks++;
	ke->ke_ltick = ticks;
	/* Go up to one second beyond our max and then trim back down */
	if (ke->ke_ftick + SCHED_CPU_TICKS + hz < ke->ke_ltick)
		sched_pctcpu_update(ke);

	if (td->td_kse->ke_flags & KEF_IDLEKSE)
		return;

	/*
	 * Check for a higher priority task on the run queue.  This can happen
	 * on SMP if another processor woke up a process on our runq.
	 */
#if SCHED_STRICT_RESCHED
	kseq = KSEQ_SELF();
	nke = runq_choose(kseq->ksq_curr);

	if (nke && nke->ke_thread &&
	    nke->ke_thread->td_priority < td->td_priority)
		td->td_flags |= TDF_NEEDRESCHED;
#endif
	/*
	 * We used a tick; charge it to the ksegrp so that we can compute our
	 * "interactivity".
	 */
	kg->kg_runtime += 1 << 10;

	/*
	 * We used up one time slice.
	 */
	ke->ke_slice--;
	/*
	 * We're out of time; recompute priorities and requeue.
	 */
	if (ke->ke_slice <= 0) {
		sched_priority(kg);
		sched_slice(ke);
		td->td_flags |= TDF_NEEDRESCHED;
		ke->ke_runq = NULL;
	}
}

int
sched_runnable(void)
{
	struct kseq *kseq;

	kseq = KSEQ_SELF();

	if (kseq->ksq_load)
		return (1);
#ifdef SMP
	/*
	 * For SMP we may steal other processors' KSEs.  Just search until we
	 * verify that at least one other cpu has a runnable task.
	 */
	if (smp_started) {
		int i;

#if 0
		if (kseq->ksq_bload)
			return (0);
#endif

		for (i = 0; i < mp_maxid; i++) {
			if (CPU_ABSENT(i))
				continue;
			kseq = KSEQ_CPU(i);
			if (kseq->ksq_load)
				return (1);
		}
	}
#endif
	return (0);
}

void
sched_userret(struct thread *td)
{
	struct ksegrp *kg;

	kg = td->td_ksegrp;

	if (td->td_priority != kg->kg_user_pri) {
		mtx_lock_spin(&sched_lock);
		td->td_priority = kg->kg_user_pri;
		mtx_unlock_spin(&sched_lock);
	}
}

struct kse *
sched_choose(void)
{
	struct kseq *kseq;
	struct kse *ke;

	kseq = KSEQ_SELF();
retry:
	ke = kseq_choose(kseq);

	if (ke) {
		ke->ke_state = KES_THREAD;
		kseq_rem(kseq, ke);

		/*
		 * If we dequeue a kse with a slice of zero, it was below the
		 * nice threshold to acquire a slice.  Recalculate the slice
		 * to see if the situation has changed and then requeue.
		 */
		if (ke->ke_slice == 0) {
			sched_slice(ke);
			ke->ke_runq = kseq->ksq_next;
			kseq_add(kseq, ke);
			goto retry;
		}
	}

#ifdef SMP
	if (ke == NULL && smp_started) {
#if 0
		if (kseq->ksq_bload)
			return (NULL);
#endif
		/*
		 * Find the cpu with the highest load and steal one proc.
		 */
		kseq = kseq_load_highest();
		if (kseq == NULL)
			return (NULL);
		ke = kseq_choose(kseq);
		kseq_rem(kseq, ke);

		ke->ke_state = KES_THREAD;
		ke->ke_runq = NULL;
		ke->ke_cpu = PCPU_GET(cpuid);
	}
#endif
	return (ke);
}

void
sched_add(struct kse *ke)
{
	struct kseq *kseq;

	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT((ke->ke_thread != NULL), ("sched_add: No thread on KSE"));
	KASSERT((ke->ke_thread->td_kse != NULL),
	    ("sched_add: No KSE on thread"));
	KASSERT(ke->ke_state != KES_ONRUNQ,
	    ("sched_add: kse %p (%s) already in run queue", ke,
	    ke->ke_proc->p_comm));
	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
	    ("sched_add: process swapped out"));

	/*
	 * Timeshare threads get placed on the appropriate queue on their
	 * bound cpu.
	 */
	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) {
		kseq = KSEQ_CPU(ke->ke_cpu);

		if (ke->ke_runq == NULL) {
			if (SCHED_CURR(ke->ke_ksegrp))
				ke->ke_runq = kseq->ksq_curr;
			else
				ke->ke_runq = kseq->ksq_next;
		}
	/*
	 * If we're a real-time or interrupt thread, place us on the curr
	 * queue for the current processor.  Hopefully this will yield the
	 * lowest latency response.
	 */
	} else {
		kseq = KSEQ_SELF();
		ke->ke_runq = kseq->ksq_curr;
	}
	ke->ke_ksegrp->kg_runq_kses++;
	ke->ke_state = KES_ONRUNQ;

	kseq_add(kseq, ke);
}

void
sched_rem(struct kse *ke)
{
	mtx_assert(&sched_lock, MA_OWNED);
	/* KASSERT((ke->ke_state == KES_ONRUNQ), ("KSE not on run queue")); */

	ke->ke_runq = NULL;
	ke->ke_state = KES_THREAD;
	ke->ke_ksegrp->kg_runq_kses--;

	kseq_rem(KSEQ_CPU(ke->ke_cpu), ke);
}

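/*
 * Editorial summary: convert the per-kse tick counts into the fixed-point
 * %cpu figure that ps and top expect.  ke_ticks holds the scheduler clock
 * ticks accumulated over at most the last SCHED_CPU_TIME (60 second)
 * window, so ke_ticks / SCHED_CPU_TIME is the number of ticks used per
 * second, and pctcpu expresses that as a fraction of realstathz in FSCALE
 * fixed point.
 */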
fixpt_t
sched_pctcpu(struct kse *ke)
{
	fixpt_t pctcpu;
	int realstathz;

	pctcpu = 0;
	realstathz = stathz ? stathz : hz;

	if (ke->ke_ticks) {
		int rtick;

		/* Update to account for time potentially spent sleeping */
		ke->ke_ltick = ticks;
		sched_pctcpu_update(ke);

		/* How many rtick per second ? */
		rtick = ke->ke_ticks / SCHED_CPU_TIME;
		pctcpu = (FSCALE * ((FSCALE * rtick)/realstathz)) >> FSHIFT;
	}

	ke->ke_proc->p_swtime = ke->ke_ltick - ke->ke_ftick;

	return (pctcpu);
}

int
sched_sizeof_kse(void)
{
	return (sizeof(struct kse) + sizeof(struct ke_sched));
}

int
sched_sizeof_ksegrp(void)
{
	return (sizeof(struct ksegrp) + sizeof(struct kg_sched));
}

int
sched_sizeof_proc(void)
{
	return (sizeof(struct proc));
}

int
sched_sizeof_thread(void)
{
	return (sizeof(struct thread) + sizeof(struct td_sched));
}