sched_ule.c revision 113357
1109864Sjeff/*-
2113357Sjeff * Copyright (c) 2002-2003, Jeffrey Roberson <jeff@freebsd.org>
3109864Sjeff * All rights reserved.
4109864Sjeff *
5109864Sjeff * Redistribution and use in source and binary forms, with or without
6109864Sjeff * modification, are permitted provided that the following conditions
7109864Sjeff * are met:
8109864Sjeff * 1. Redistributions of source code must retain the above copyright
9109864Sjeff *    notice unmodified, this list of conditions, and the following
10109864Sjeff *    disclaimer.
11109864Sjeff * 2. Redistributions in binary form must reproduce the above copyright
12109864Sjeff *    notice, this list of conditions and the following disclaimer in the
13109864Sjeff *    documentation and/or other materials provided with the distribution.
14109864Sjeff *
15109864Sjeff * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
16109864Sjeff * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17109864Sjeff * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18109864Sjeff * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
19109864Sjeff * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20109864Sjeff * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21109864Sjeff * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22109864Sjeff * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23109864Sjeff * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24109864Sjeff * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25109864Sjeff *
26109864Sjeff * $FreeBSD: head/sys/kern/sched_ule.c 113357 2003-04-11 03:47:14Z jeff $
27109864Sjeff */
28109864Sjeff
29109864Sjeff#include <sys/param.h>
30109864Sjeff#include <sys/systm.h>
31109864Sjeff#include <sys/kernel.h>
32109864Sjeff#include <sys/ktr.h>
33109864Sjeff#include <sys/lock.h>
34109864Sjeff#include <sys/mutex.h>
35109864Sjeff#include <sys/proc.h>
36112966Sjeff#include <sys/resource.h>
37109864Sjeff#include <sys/sched.h>
38109864Sjeff#include <sys/smp.h>
39109864Sjeff#include <sys/sx.h>
40109864Sjeff#include <sys/sysctl.h>
41109864Sjeff#include <sys/sysproto.h>
42109864Sjeff#include <sys/vmmeter.h>
43109864Sjeff#ifdef DDB
44109864Sjeff#include <ddb/ddb.h>
45109864Sjeff#endif
46109864Sjeff#ifdef KTRACE
47109864Sjeff#include <sys/uio.h>
48109864Sjeff#include <sys/ktrace.h>
49109864Sjeff#endif
50109864Sjeff
51109864Sjeff#include <machine/cpu.h>
52109864Sjeff
53113357Sjeff#define KTR_ULE         KTR_NFS
54113357Sjeff
55109864Sjeff/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
56109864Sjeff/* XXX This is bogus compatibility crap for ps */
57109864Sjeffstatic fixpt_t  ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
58109864SjeffSYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
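/*
 * A worked check of the constant above: ccpu is exp(-1/20) scaled by FSCALE,
 * and (exp(-1/20))^60 = exp(-3) ~= 0.0498, so after sixty of the traditional
 * once-per-second decay steps only ~5% of the old value remains, i.e. ~95%
 * of `p_pctcpu' has decayed in 60 seconds as the comment claims.  ULE keeps
 * its own %cpu estimate (see sched_pctcpu() below); this sysctl exists only
 * so that ps and friends keep working.
 */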
59109864Sjeff
60109864Sjeffstatic void sched_setup(void *dummy);
61109864SjeffSYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL)
62109864Sjeff
63113357Sjeffstatic SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "SCHED");
64113357Sjeff
65113357Sjeffstatic int sched_strict;
66113357SjeffSYSCTL_INT(_kern_sched, OID_AUTO, strict, CTLFLAG_RD, &sched_strict, 0, "");
67113357Sjeff
68113357Sjeffstatic int slice_min = 1;
69113357SjeffSYSCTL_INT(_kern_sched, OID_AUTO, slice_min, CTLFLAG_RW, &slice_min, 0, "");
70113357Sjeff
71113357Sjeffstatic int slice_max = 2;
72113357SjeffSYSCTL_INT(_kern_sched, OID_AUTO, slice_max, CTLFLAG_RW, &slice_max, 0, "");
73113357Sjeff
74111857Sjeffint realstathz;
75113357Sjeffint tickincr = 1;
76111857Sjeff
77109864Sjeff/*
78109864Sjeff * These data structures are allocated within their parent data structure but
79109864Sjeff * are scheduler specific.
80109864Sjeff */
81109864Sjeff
82109864Sjeffstruct ke_sched {
83109864Sjeff	int		ske_slice;
84109864Sjeff	struct runq	*ske_runq;
85109864Sjeff	/* The following variables are only used for pctcpu calculation */
86109864Sjeff	int		ske_ltick;	/* Last tick that we were running on */
87109864Sjeff	int		ske_ftick;	/* First tick that we were running on */
88109864Sjeff	int		ske_ticks;	/* Tick count */
89113357Sjeff	/* CPU that we have affinity for. */
90110260Sjeff	u_char		ske_cpu;
91109864Sjeff};
92109864Sjeff#define	ke_slice	ke_sched->ske_slice
93109864Sjeff#define	ke_runq		ke_sched->ske_runq
94109864Sjeff#define	ke_ltick	ke_sched->ske_ltick
95109864Sjeff#define	ke_ftick	ke_sched->ske_ftick
96109864Sjeff#define	ke_ticks	ke_sched->ske_ticks
97110260Sjeff#define	ke_cpu		ke_sched->ske_cpu
98109864Sjeff
99109864Sjeffstruct kg_sched {
100110645Sjeff	int	skg_slptime;		/* Number of ticks we vol. slept */
101110645Sjeff	int	skg_runtime;		/* Number of ticks we were running */
102109864Sjeff};
103109864Sjeff#define	kg_slptime	kg_sched->skg_slptime
104110645Sjeff#define	kg_runtime	kg_sched->skg_runtime
105109864Sjeff
106109864Sjeffstruct td_sched {
107109864Sjeff	int	std_slptime;
108109864Sjeff};
109109864Sjeff#define	td_slptime	td_sched->std_slptime
110109864Sjeff
111110267Sjeffstruct td_sched td_sched;
112109864Sjeffstruct ke_sched ke_sched;
113109864Sjeffstruct kg_sched kg_sched;
114109864Sjeff
115109864Sjeffstruct ke_sched *kse0_sched = &ke_sched;
116109864Sjeffstruct kg_sched *ksegrp0_sched = &kg_sched;
117109864Sjeffstruct p_sched *proc0_sched = NULL;
118109864Sjeffstruct td_sched *thread0_sched = &td_sched;
119109864Sjeff
120109864Sjeff/*
121109864Sjeff * This priority range has 20 priorities on either end that are reachable
122109864Sjeff * only through nice values.
123111857Sjeff *
124111857Sjeff * PRI_RANGE:	Total priority range for timeshare threads.
125111857Sjeff * PRI_NRESV:	Reserved priorities for nice.
126111857Sjeff * PRI_BASE:	The start of the dynamic range.
127111857Sjeff * DYN_RANGE:	Number of priorities that are available in the dynamic
128111857Sjeff *		priority range.
129109864Sjeff */
130111857Sjeff#define	SCHED_PRI_RANGE		(PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1)
131112966Sjeff#define	SCHED_PRI_NRESV		PRIO_TOTAL
132112970Sjeff#define	SCHED_PRI_NHALF		(PRIO_TOTAL / 2)
133113357Sjeff#define	SCHED_PRI_NTHRESH	(SCHED_PRI_NHALF - 1)
134111857Sjeff#define	SCHED_PRI_BASE		((SCHED_PRI_NRESV / 2) + PRI_MIN_TIMESHARE)
135111857Sjeff#define	SCHED_DYN_RANGE		(SCHED_PRI_RANGE - SCHED_PRI_NRESV)
136113357Sjeff#define	SCHED_PRI_INTERACT(score)					\
137113357Sjeff    ((score) * SCHED_DYN_RANGE / SCHED_INTERACT_RANGE)
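/*
 * A worked example (assuming the stock 5.x priority layout where
 * PRI_MIN_TIMESHARE is 160, PRI_MAX_TIMESHARE is 223 and PRIO_TOTAL is 40):
 *
 *	SCHED_PRI_RANGE = 223 - 160 + 1 = 64
 *	SCHED_PRI_NRESV = 40	(20 reserved at each end for nice)
 *	SCHED_DYN_RANGE = 64 - 40 = 24
 *	SCHED_PRI_BASE  = 40 / 2 + 160 = 180
 *
 * sched_priority() computes SCHED_PRI_INTERACT(score) + base + nice, so an
 * interactivity score of 0 with nice 0 lands at priority 180, while a score
 * of 99 with nice +20 lands at 23 + 180 + 20 = 223, the bottom of the
 * timeshare range.
 */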
138109864Sjeff
139109864Sjeff/*
140111857Sjeff * These determine the interactivity of a process.
141109864Sjeff *
142110645Sjeff * SLP_RUN_MAX:	Maximum amount of sleep time + run time we'll accumulate
143110645Sjeff *		before throttling back.
144111857Sjeff * SLP_RUN_THROTTLE:	Divisor for reducing slp/run time.
145111857Sjeff * INTERACT_RANGE:	Range of interactivity values.  Smaller is better.
146111857Sjeff * INTERACT_HALF:	Convenience define, half of the interactivity range.
147111857Sjeff * INTERACT_THRESH:	Threshold for placement on the current runq.
148109864Sjeff */
149113357Sjeff#define	SCHED_SLP_RUN_MAX	((hz / 10) << 10)
150110645Sjeff#define	SCHED_SLP_RUN_THROTTLE	(10)
151111857Sjeff#define	SCHED_INTERACT_RANGE	(100)
152111857Sjeff#define	SCHED_INTERACT_HALF	(SCHED_INTERACT_RANGE / 2)
153111857Sjeff#define	SCHED_INTERACT_THRESH	(10)
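/*
 * For a concrete feel for the numbers (assuming hz = 100): SLP_RUN_MAX is
 * (100 / 10) << 10 = 10240, in the same <<10 fixed point units that
 * kg_slptime and kg_runtime are kept in.  Once their sum passes that cap,
 * sched_slice() divides both by SLP_RUN_THROTTLE (10), which forgets old
 * history but leaves the slp/run ratio, and therefore the interactivity
 * score, unchanged.
 */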
154111857Sjeff
155109864Sjeff/*
156109864Sjeff * These parameters and macros determine the size of the time slice that is
157109864Sjeff * granted to each thread.
158109864Sjeff *
159109864Sjeff * SLICE_MIN:	Minimum time slice granted, in units of ticks.
160109864Sjeff * SLICE_MAX:	Maximum time slice granted.
161109864Sjeff * SLICE_RANGE:	Range of available time slices scaled by hz.
162112966Sjeff * SLICE_SCALE:	The number of slices granted per val in the range of [0, max].
163112966Sjeff * SLICE_NICE:  The amount of slice granted to a scaled nice value.
164109864Sjeff */
165113357Sjeff#define	SCHED_SLICE_MIN			(slice_min)
166113357Sjeff#define	SCHED_SLICE_MAX			(slice_max)
167111857Sjeff#define	SCHED_SLICE_RANGE		(SCHED_SLICE_MAX - SCHED_SLICE_MIN + 1)
168109864Sjeff#define	SCHED_SLICE_SCALE(val, max)	(((val) * SCHED_SLICE_RANGE) / (max))
169112966Sjeff#define	SCHED_SLICE_NICE(nice)						\
170113357Sjeff    (SCHED_SLICE_MAX - SCHED_SLICE_SCALE((nice), SCHED_PRI_NTHRESH))
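/*
 * Slice arithmetic, worked through with the values sched_setup() picks for
 * hz = 100 (slice_min = 1, slice_max = 10, so SLICE_RANGE = 10):
 *
 *	SCHED_SLICE_NICE(0)  = 10 - (0 * 10 / 19)  = 10 ticks
 *	SCHED_SLICE_NICE(9)  = 10 - (9 * 10 / 19)  = 6 ticks
 *	SCHED_SLICE_NICE(19) = 10 - (19 * 10 / 19) = 0 ticks
 *
 * The argument here is not the raw nice value but the distance from the
 * least nice ksegrp on the queue (see sched_slice()), so the least nice KSE
 * always gets the full slice and anything 19 or more points nicer ends up
 * with a zero slice.
 */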
171109864Sjeff
172109864Sjeff/*
173109864Sjeff * This macro determines whether or not the kse belongs on the current or
174109864Sjeff * next run queue.
175110645Sjeff *
176110645Sjeff * XXX nice value should affect how interactive a kg is.
177109864Sjeff */
178113357Sjeff#define	SCHED_INTERACTIVE(kg)						\
179113357Sjeff    (sched_interact_score(kg) < SCHED_INTERACT_THRESH)
180113357Sjeff#define	SCHED_CURR(kg, ke)	SCHED_INTERACTIVE(kg)
181113357Sjeff#if 0
182113357Sjeff    (ke->ke_thread->td_priority < PRI_MIN_TIMESHARE || SCHED_INTERACTIVE(kg))
183113357Sjeff#endif
184109864Sjeff
185109864Sjeff/*
186109864Sjeff * Cpu percentage computation macros and defines.
187109864Sjeff *
188109864Sjeff * SCHED_CPU_TIME:	Number of seconds to average the cpu usage across.
189109864Sjeff * SCHED_CPU_TICKS:	Number of hz ticks to average the cpu usage across.
190109864Sjeff */
191109864Sjeff
192112971Sjeff#define	SCHED_CPU_TIME	10
193109864Sjeff#define	SCHED_CPU_TICKS	(hz * SCHED_CPU_TIME)
194109864Sjeff
195109864Sjeff/*
196113357Sjeff * kseq - per processor runqs and statistics.
197109864Sjeff */
198109864Sjeff
199113357Sjeff#define	KSEQ_NCLASS	(PRI_IDLE + 1)	/* Number of run classes. */
200113357Sjeff
201109864Sjeffstruct kseq {
202113357Sjeff	struct runq	ksq_idle;		/* Queue of IDLE threads. */
203113357Sjeff	struct runq	ksq_timeshare[2];	/* Run queues for !IDLE. */
204113357Sjeff	struct runq	*ksq_next;		/* Next timeshare queue. */
205113357Sjeff	struct runq	*ksq_curr;		/* Current queue. */
206113357Sjeff	int		ksq_loads[KSEQ_NCLASS];	/* Load for each class */
207113357Sjeff	int		ksq_load;		/* Aggregate load. */
208113357Sjeff	short		ksq_nice[PRIO_TOTAL + 1]; /* KSEs in each nice bin. */
209113357Sjeff	short		ksq_nicemin;		/* Least nice. */
210110267Sjeff#ifdef SMP
211110267Sjeff	unsigned int	ksq_rslices;	/* Slices on run queue */
212110267Sjeff#endif
213109864Sjeff};
214109864Sjeff
215109864Sjeff/*
216109864Sjeff * One kse queue per processor.
217109864Sjeff */
218110028Sjeff#ifdef SMP
219109864Sjeffstruct kseq	kseq_cpu[MAXCPU];
220110028Sjeff#define	KSEQ_SELF()	(&kseq_cpu[PCPU_GET(cpuid)])
221110028Sjeff#define	KSEQ_CPU(x)	(&kseq_cpu[(x)])
222110028Sjeff#else
223110028Sjeffstruct kseq	kseq_cpu;
224110028Sjeff#define	KSEQ_SELF()	(&kseq_cpu)
225110028Sjeff#define	KSEQ_CPU(x)	(&kseq_cpu)
226110028Sjeff#endif
227109864Sjeff
228112966Sjeffstatic void sched_slice(struct kse *ke);
229113357Sjeffstatic void sched_priority(struct ksegrp *kg);
230111857Sjeffstatic int sched_interact_score(struct ksegrp *kg);
231109864Sjeffvoid sched_pctcpu_update(struct kse *ke);
232109864Sjeffint sched_pickcpu(void);
233109864Sjeff
234110267Sjeff/* Operations on per processor queues */
235110028Sjeffstatic struct kse * kseq_choose(struct kseq *kseq);
236110028Sjeffstatic void kseq_setup(struct kseq *kseq);
237112994Sjeffstatic void kseq_add(struct kseq *kseq, struct kse *ke);
238113357Sjeffstatic void kseq_rem(struct kseq *kseq, struct kse *ke);
239113357Sjeffstatic void kseq_nice_add(struct kseq *kseq, int nice);
240113357Sjeffstatic void kseq_nice_rem(struct kseq *kseq, int nice);
241113357Sjeffvoid kseq_print(struct kseq *kseq);
242110267Sjeff#ifdef SMP
243110267Sjeffstruct kseq * kseq_load_highest(void);
244110267Sjeff#endif
245110028Sjeff
246113357Sjeffvoid
247113357Sjeffkseq_print(struct kseq *kseq)
248110267Sjeff{
249113357Sjeff	int i;
250112994Sjeff
251113357Sjeff	if (kseq == NULL)
252113357Sjeff		kseq = KSEQ_SELF();
253112994Sjeff
254113357Sjeff	printf("kseq:\n");
255113357Sjeff	printf("\tload:           %d\n", kseq->ksq_load);
256113357Sjeff	printf("\tload ITHD:      %d\n", kseq->ksq_loads[PRI_ITHD]);
257113357Sjeff	printf("\tload REALTIME:  %d\n", kseq->ksq_loads[PRI_REALTIME]);
258113357Sjeff	printf("\tload TIMESHARE: %d\n", kseq->ksq_loads[PRI_TIMESHARE]);
259113357Sjeff	printf("\tload IDLE:      %d\n", kseq->ksq_loads[PRI_IDLE]);
260113357Sjeff	printf("\tnicemin:\t%d\n", kseq->ksq_nicemin);
261113357Sjeff	printf("\tnice counts:\n");
262113357Sjeff	for (i = 0; i < PRIO_TOTAL + 1; i++)
263113357Sjeff		if (kseq->ksq_nice[i])
264113357Sjeff			printf("\t\t%d = %d\n",
265113357Sjeff			    i - SCHED_PRI_NHALF, kseq->ksq_nice[i]);
266113357Sjeff}
267112994Sjeff
268113357Sjeffstatic void
269113357Sjeffkseq_add(struct kseq *kseq, struct kse *ke)
270113357Sjeff{
271113357Sjeff	kseq->ksq_loads[ke->ke_ksegrp->kg_pri_class]++;
272113357Sjeff	kseq->ksq_load++;
273113357Sjeff	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
274113357Sjeff		CTR6(KTR_ULE, "Add kse %p to %p (slice: %d, pri: %d, nice: %d(%d))",
275113357Sjeff		    ke, ke->ke_runq, ke->ke_slice, ke->ke_thread->td_priority,
276113357Sjeff		    ke->ke_ksegrp->kg_nice, kseq->ksq_nicemin);
277113357Sjeff	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
278113357Sjeff		kseq_nice_add(kseq, ke->ke_ksegrp->kg_nice);
279110267Sjeff#ifdef SMP
280110267Sjeff	kseq->ksq_rslices += ke->ke_slice;
281110267Sjeff#endif
282110267Sjeff}
283113357Sjeff
284112994Sjeffstatic void
285110267Sjeffkseq_rem(struct kseq *kseq, struct kse *ke)
286110267Sjeff{
287113357Sjeff	kseq->ksq_loads[ke->ke_ksegrp->kg_pri_class]--;
288113357Sjeff	kseq->ksq_load--;
289113357Sjeff	ke->ke_runq = NULL;
290113357Sjeff	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
291113357Sjeff		kseq_nice_rem(kseq, ke->ke_ksegrp->kg_nice);
292110267Sjeff#ifdef SMP
293110267Sjeff	kseq->ksq_rslices -= ke->ke_slice;
294110267Sjeff#endif
295110267Sjeff}
296110267Sjeff
297113357Sjeffstatic void
298113357Sjeffkseq_nice_add(struct kseq *kseq, int nice)
299110267Sjeff{
300113357Sjeff	/* Normalize to zero. */
301113357Sjeff	kseq->ksq_nice[nice + SCHED_PRI_NHALF]++;
302113357Sjeff	if (nice < kseq->ksq_nicemin || kseq->ksq_loads[PRI_TIMESHARE] == 0)
303113357Sjeff		kseq->ksq_nicemin = nice;
304110267Sjeff}
305110267Sjeff
306113357Sjeffstatic void
307113357Sjeffkseq_nice_rem(struct kseq *kseq, int nice)
308110267Sjeff{
309113357Sjeff	int n;
310113357Sjeff
311113357Sjeff	/* Normalize to zero. */
312113357Sjeff	n = nice + SCHED_PRI_NHALF;
313113357Sjeff	kseq->ksq_nice[n]--;
314113357Sjeff	KASSERT(kseq->ksq_nice[n] >= 0, ("Negative nice count."));
315113357Sjeff
316113357Sjeff	/*
317113357Sjeff	 * If this wasn't the smallest nice value or there are more in
318113357Sjeff	 * this bucket we can just return.  Otherwise we have to recalculate
319113357Sjeff	 * the smallest nice.
320113357Sjeff	 */
321113357Sjeff	if (nice != kseq->ksq_nicemin ||
322113357Sjeff	    kseq->ksq_nice[n] != 0 ||
323113357Sjeff	    kseq->ksq_loads[PRI_TIMESHARE] == 0)
324113357Sjeff		return;
325113357Sjeff
326113357Sjeff	for (; n < SCHED_PRI_NRESV + 1; n++)
327113357Sjeff		if (kseq->ksq_nice[n]) {
328113357Sjeff			kseq->ksq_nicemin = n - SCHED_PRI_NHALF;
329113357Sjeff			return;
330113357Sjeff		}
331110267Sjeff}
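/*
 * Example of the bookkeeping above: with KSEs at nice -5, 0 and 0 on the
 * queue, ksq_nice[15] == 1 and ksq_nice[20] == 2 (buckets are offset by
 * SCHED_PRI_NHALF, assumed to be 20 here) and ksq_nicemin == -5.  Removing
 * the nice -5 KSE empties its bucket, and since -5 was the minimum the scan
 * walks up to bucket 20 and resets ksq_nicemin to 0.
 */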
332110267Sjeff
333113357Sjeff#ifdef SMP
334110267Sjeffstruct kseq *
335110267Sjeffkseq_load_highest(void)
336110267Sjeff{
337110267Sjeff	struct kseq *kseq;
338110267Sjeff	int load;
339110267Sjeff	int cpu;
340110267Sjeff	int i;
341110267Sjeff
342110267Sjeff	cpu = 0;
343110267Sjeff	load = 0;
344110267Sjeff
345110267Sjeff	for (i = 0; i < mp_maxid; i++) {
346110267Sjeff		if (CPU_ABSENT(i))
347110267Sjeff			continue;
348110267Sjeff		kseq = KSEQ_CPU(i);
349113357Sjeff		if (kseq->ksq_load > load) {
350113357Sjeff			load = kseq->ksq_load;
351110267Sjeff			cpu = i;
352110267Sjeff		}
353110267Sjeff	}
354110267Sjeff	if (load)
355110267Sjeff		return (KSEQ_CPU(cpu));
356110267Sjeff
357110267Sjeff	return (NULL);
358110267Sjeff}
359110267Sjeff#endif
360110267Sjeff
361110267Sjeffstruct kse *
362110267Sjeffkseq_choose(struct kseq *kseq)
363110267Sjeff{
364110267Sjeff	struct kse *ke;
365110267Sjeff	struct runq *swap;
366110267Sjeff
367113357Sjeff	swap = NULL;
368112994Sjeff
369113357Sjeff	for (;;) {
370113357Sjeff		ke = runq_choose(kseq->ksq_curr);
371113357Sjeff		if (ke == NULL) {
372113357Sjeff			/*
373113357Sjeff			 * We already swapped once and didn't get anywhere.
374113357Sjeff			 */
375113357Sjeff			if (swap)
376113357Sjeff				break;
377113357Sjeff			swap = kseq->ksq_curr;
378113357Sjeff			kseq->ksq_curr = kseq->ksq_next;
379113357Sjeff			kseq->ksq_next = swap;
380113357Sjeff			continue;
381113357Sjeff		}
382113357Sjeff		/*
383113357Sjeff		 * If we encounter a slice of 0 the kse is in a
384113357Sjeff		 * TIMESHARE kse group and its nice was too far out
385113357Sjeff		 * of the range that receives slices.
386113357Sjeff		 */
387113357Sjeff		if (ke->ke_slice == 0) {
388113357Sjeff			runq_remove(ke->ke_runq, ke);
389113357Sjeff			sched_slice(ke);
390113357Sjeff			ke->ke_runq = kseq->ksq_next;
391113357Sjeff			runq_add(ke->ke_runq, ke);
392113357Sjeff			continue;
393113357Sjeff		}
394113357Sjeff		return (ke);
395110267Sjeff	}
396110267Sjeff
397113357Sjeff	return (runq_choose(&kseq->ksq_idle));
398110267Sjeff}
399110267Sjeff
400109864Sjeffstatic void
401110028Sjeffkseq_setup(struct kseq *kseq)
402110028Sjeff{
403113357Sjeff	runq_init(&kseq->ksq_timeshare[0]);
404113357Sjeff	runq_init(&kseq->ksq_timeshare[1]);
405112994Sjeff	runq_init(&kseq->ksq_idle);
406113357Sjeff
407113357Sjeff	kseq->ksq_curr = &kseq->ksq_timeshare[0];
408113357Sjeff	kseq->ksq_next = &kseq->ksq_timeshare[1];
409113357Sjeff
410113357Sjeff	kseq->ksq_loads[PRI_ITHD] = 0;
411113357Sjeff	kseq->ksq_loads[PRI_REALTIME] = 0;
412113357Sjeff	kseq->ksq_loads[PRI_TIMESHARE] = 0;
413113357Sjeff	kseq->ksq_loads[PRI_IDLE] = 0;
414110267Sjeff#ifdef SMP
415110267Sjeff	kseq->ksq_rslices = 0;
416110267Sjeff#endif
417110028Sjeff}
418110028Sjeff
419110028Sjeffstatic void
420109864Sjeffsched_setup(void *dummy)
421109864Sjeff{
422109864Sjeff	int i;
423109864Sjeff
424113357Sjeff	slice_min = (hz/100);
425113357Sjeff	slice_max = (hz/10);
426111857Sjeff
427109864Sjeff	mtx_lock_spin(&sched_lock);
428109864Sjeff	/* init kseqs */
429110028Sjeff	for (i = 0; i < MAXCPU; i++)
430110028Sjeff		kseq_setup(KSEQ_CPU(i));
431113357Sjeff
432113357Sjeff	kseq_add(KSEQ_SELF(), &kse0);
433109864Sjeff	mtx_unlock_spin(&sched_lock);
434109864Sjeff}
435109864Sjeff
436109864Sjeff/*
437109864Sjeff * Scale the scheduling priority according to the "interactivity" of this
438109864Sjeff * process.
439109864Sjeff */
440113357Sjeffstatic void
441109864Sjeffsched_priority(struct ksegrp *kg)
442109864Sjeff{
443109864Sjeff	int pri;
444109864Sjeff
445109864Sjeff	if (kg->kg_pri_class != PRI_TIMESHARE)
446113357Sjeff		return;
447109864Sjeff
448113357Sjeff	pri = SCHED_PRI_INTERACT(sched_interact_score(kg));
449111857Sjeff	pri += SCHED_PRI_BASE;
450109864Sjeff	pri += kg->kg_nice;
451109864Sjeff
452109864Sjeff	if (pri > PRI_MAX_TIMESHARE)
453109864Sjeff		pri = PRI_MAX_TIMESHARE;
454109864Sjeff	else if (pri < PRI_MIN_TIMESHARE)
455109864Sjeff		pri = PRI_MIN_TIMESHARE;
456109864Sjeff
457109864Sjeff	kg->kg_user_pri = pri;
458109864Sjeff
459113357Sjeff	return;
460109864Sjeff}
461109864Sjeff
462109864Sjeff/*
463112966Sjeff * Calculate a time slice based on the properties of the kseg and the runq
464112994Sjeff * that we're on.  This is only for PRI_TIMESHARE ksegrps.
465109864Sjeff */
466112966Sjeffstatic void
467112966Sjeffsched_slice(struct kse *ke)
468109864Sjeff{
469113357Sjeff	struct kseq *kseq;
470112966Sjeff	struct ksegrp *kg;
471109864Sjeff
472112966Sjeff	kg = ke->ke_ksegrp;
473113357Sjeff	kseq = KSEQ_CPU(ke->ke_cpu);
474109864Sjeff
475112966Sjeff	/*
476112966Sjeff	 * Rationale:
477112966Sjeff	 * KSEs in interactive ksegs get the minimum slice so that we
478112966Sjeff	 * quickly notice if it abuses its advantage.
479112966Sjeff	 *
480112966Sjeff	 * KSEs in non-interactive ksegs are assigned a slice that is
481112966Sjeff	 * based on the kseg's nice value relative to the least nice kseg
482112966Sjeff	 * on the run queue for this cpu.
483112966Sjeff	 *
484112966Sjeff	 * If the KSE is less nice than all others it gets the maximum
485112966Sjeff	 * slice and other KSEs will adjust their slice relative to
486112966Sjeff	 * this when they first expire.
487112966Sjeff	 *
488112966Sjeff	 * There is a 20 point window that starts relative to the least
489112966Sjeff	 * nice kse on the run queue.  Slice size is determined by
490112966Sjeff	 * the kse's distance from the least nice ksegrp.
491112966Sjeff	 *
492112966Sjeff	 * If you are outside of the window you will get no slice and
493112966Sjeff	 * you will be reevaluated each time you are selected on the
494112966Sjeff	 * run queue.
495112966Sjeff	 *
496112966Sjeff	 */
497109864Sjeff
498113357Sjeff	if (!SCHED_INTERACTIVE(kg)) {
499112966Sjeff		int nice;
500112966Sjeff
501113357Sjeff		nice = kg->kg_nice + (0 - kseq->ksq_nicemin);
502113357Sjeff		if (kseq->ksq_loads[PRI_TIMESHARE] == 0 ||
503113357Sjeff		    kg->kg_nice < kseq->ksq_nicemin)
504112966Sjeff			ke->ke_slice = SCHED_SLICE_MAX;
505113357Sjeff		else if (nice <= SCHED_PRI_NTHRESH)
506112966Sjeff			ke->ke_slice = SCHED_SLICE_NICE(nice);
507112966Sjeff		else
508112966Sjeff			ke->ke_slice = 0;
509112966Sjeff	} else
510112966Sjeff		ke->ke_slice = SCHED_SLICE_MIN;
511112966Sjeff
512113357Sjeff	CTR6(KTR_ULE,
513113357Sjeff	    "Sliced %p(%d) (nice: %d, nicemin: %d, load: %d, interactive: %d)",
514113357Sjeff	    ke, ke->ke_slice, kg->kg_nice, kseq->ksq_nicemin,
515113357Sjeff	    kseq->ksq_loads[PRI_TIMESHARE], SCHED_INTERACTIVE(kg));
516113357Sjeff
517110645Sjeff	/*
518112994Sjeff	 * Check to see if we need to scale back the slp and run time
519112994Sjeff	 * in the kg.  This will cause us to forget old interactivity
520112994Sjeff	 * while maintaining the current ratio.
521110645Sjeff	 */
522113357Sjeff	CTR4(KTR_ULE, "Slp vs Run %p (Slp %d, Run %d, Score %d)",
523113357Sjeff	    ke, kg->kg_slptime >> 10, kg->kg_runtime >> 10,
524113357Sjeff	    sched_interact_score(kg));
525113357Sjeff
526110645Sjeff	if ((kg->kg_runtime + kg->kg_slptime) >  SCHED_SLP_RUN_MAX) {
527110645Sjeff		kg->kg_runtime /= SCHED_SLP_RUN_THROTTLE;
528110645Sjeff		kg->kg_slptime /= SCHED_SLP_RUN_THROTTLE;
529110645Sjeff	}
530113357Sjeff	CTR4(KTR_ULE, "Slp vs Run(2) %p (Slp %d, Run %d, Score %d)",
531113357Sjeff	    ke, kg->kg_slptime >> 10, kg->kg_runtime >> 10,
532113357Sjeff	    sched_interact_score(kg));
533110645Sjeff
534112966Sjeff	return;
535109864Sjeff}
536109864Sjeff
537111857Sjeffstatic int
538111857Sjeffsched_interact_score(struct ksegrp *kg)
539111857Sjeff{
540111857Sjeff	int big;
541111857Sjeff	int small;
542111857Sjeff	int base;
543111857Sjeff
544111857Sjeff	if (kg->kg_runtime > kg->kg_slptime) {
545111857Sjeff		big = kg->kg_runtime;
546111857Sjeff		small = kg->kg_slptime;
547111857Sjeff		base = SCHED_INTERACT_HALF;
548111857Sjeff	} else {
549111857Sjeff		big = kg->kg_slptime;
550111857Sjeff		small = kg->kg_runtime;
551111857Sjeff		base = 0;
552111857Sjeff	}
553111857Sjeff
554111857Sjeff	big /= SCHED_INTERACT_HALF;
555111857Sjeff	if (big != 0)
556111857Sjeff		small /= big;
557111857Sjeff	else
558111857Sjeff		small = 0;
559111857Sjeff
560111857Sjeff	small += base;
561111857Sjeff	/* XXX Factor in nice */
562111857Sjeff	return (small);
563111857Sjeff}
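/*
 * A user space sketch of the score above, illustrative only and never
 * compiled; the function name is made up.  It shows why a ksegrp has to
 * sleep far more than it runs to come in under SCHED_INTERACT_THRESH.
 */
#if 0
static int
interact_score_sketch(int runtime, int slptime)
{
	int big, small, base;

	if (runtime > slptime) {
		big = runtime;
		small = slptime;
		base = 50;			/* SCHED_INTERACT_HALF */
	} else {
		big = slptime;
		small = runtime;
		base = 0;
	}
	big /= 50;
	small = big ? small / big : 0;
	/* e.g. run 1000, sleep 50 -> 52; run 50, sleep 1000 -> 2. */
	return (small + base);
}
#endif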
564111857Sjeff
565113357Sjeff/*
566113357Sjeff * This is only somewhat accurate since, given many processes of the same
567113357Sjeff * priority, they will switch when their slices run out, which will be
568113357Sjeff * at most SCHED_SLICE_MAX.
569113357Sjeff */
570109864Sjeffint
571109864Sjeffsched_rr_interval(void)
572109864Sjeff{
573109864Sjeff	return (SCHED_SLICE_MAX);
574109864Sjeff}
575109864Sjeff
576109864Sjeffvoid
577109864Sjeffsched_pctcpu_update(struct kse *ke)
578109864Sjeff{
579109864Sjeff	/*
580109864Sjeff	 * Adjust counters and watermark for pctcpu calc.
581113357Sjeff	 *
582111793Sjeff	 * Shift the tick count out so that the divide doesn't round away
583111793Sjeff	 * our results.
584111793Sjeff	 */
585111793Sjeff	ke->ke_ticks <<= 10;
586109864Sjeff	ke->ke_ticks = (ke->ke_ticks / (ke->ke_ltick - ke->ke_ftick)) *
587109864Sjeff		    SCHED_CPU_TICKS;
588111793Sjeff	ke->ke_ticks >>= 10;
589109864Sjeff	ke->ke_ltick = ticks;
590109864Sjeff	ke->ke_ftick = ke->ke_ltick - SCHED_CPU_TICKS;
591109864Sjeff}
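/*
 * Worked example of the rescaling above, with hz = 100 so that
 * SCHED_CPU_TICKS is 1000: if the ke_ftick..ke_ltick window has grown to
 * 2000 ticks and ke_ticks is 500, the update computes
 * ((500 << 10) / 2000) * 1000 >> 10 = 250 and pins the window back to the
 * last 1000 ticks, preserving the 25% duty cycle while shedding history.
 */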
592109864Sjeff
593109864Sjeff#ifdef SMP
594110267Sjeff/* XXX Should be changed to kseq_load_lowest() */
595109864Sjeffint
596109864Sjeffsched_pickcpu(void)
597109864Sjeff{
598110028Sjeff	struct kseq *kseq;
599110028Sjeff	int load;
600109864Sjeff	int cpu;
601109864Sjeff	int i;
602109864Sjeff
603109864Sjeff	if (!smp_started)
604109864Sjeff		return (0);
605109864Sjeff
606110028Sjeff	load = 0;
607110028Sjeff	cpu = 0;
608109864Sjeff
609109864Sjeff	for (i = 0; i < mp_maxid; i++) {
610109864Sjeff		if (CPU_ABSENT(i))
611109864Sjeff			continue;
612110028Sjeff		kseq = KSEQ_CPU(i);
613113357Sjeff		if (kseq->ksq_load < load) {
614109864Sjeff			cpu = i;
615113357Sjeff			load = kseq->ksq_load;
616109864Sjeff		}
617109864Sjeff	}
618109864Sjeff
619109864Sjeff	CTR1(KTR_RUNQ, "sched_pickcpu: %d", cpu);
620109864Sjeff	return (cpu);
621109864Sjeff}
622109864Sjeff#else
623109864Sjeffint
624109864Sjeffsched_pickcpu(void)
625109864Sjeff{
626109864Sjeff	return (0);
627109864Sjeff}
628109864Sjeff#endif
629109864Sjeff
630109864Sjeffvoid
631109864Sjeffsched_prio(struct thread *td, u_char prio)
632109864Sjeff{
633109864Sjeff	struct kse *ke;
634109864Sjeff	struct runq *rq;
635109864Sjeff
636109864Sjeff	mtx_assert(&sched_lock, MA_OWNED);
637109864Sjeff	ke = td->td_kse;
638109864Sjeff	td->td_priority = prio;
639109864Sjeff
640109864Sjeff	if (TD_ON_RUNQ(td)) {
641109864Sjeff		rq = ke->ke_runq;
642109864Sjeff
643109864Sjeff		runq_remove(rq, ke);
644109864Sjeff		runq_add(rq, ke);
645109864Sjeff	}
646109864Sjeff}
647109864Sjeff
648109864Sjeffvoid
649109864Sjeffsched_switchout(struct thread *td)
650109864Sjeff{
651109864Sjeff	struct kse *ke;
652109864Sjeff
653109864Sjeff	mtx_assert(&sched_lock, MA_OWNED);
654109864Sjeff
655109864Sjeff	ke = td->td_kse;
656109864Sjeff
657109864Sjeff	td->td_last_kse = ke;
658113339Sjulian	td->td_lastcpu = td->td_oncpu;
659113339Sjulian	td->td_oncpu = NOCPU;
660111032Sjulian	td->td_flags &= ~TDF_NEEDRESCHED;
661109864Sjeff
662109864Sjeff	if (TD_IS_RUNNING(td)) {
663113357Sjeff		runq_add(ke->ke_runq, ke);
664113357Sjeff		/* setrunqueue(td); */
665109864Sjeff		return;
666111857Sjeff	}
667113357Sjeff	if (ke->ke_runq)
668113357Sjeff		kseq_rem(KSEQ_CPU(ke->ke_cpu), ke);
669109864Sjeff	/*
670109864Sjeff	 * We will not be on the run queue. So we must be
671109864Sjeff	 * sleeping or similar.
672109864Sjeff	 */
673111585Sjulian	if (td->td_proc->p_flag & P_THREADED)
674109864Sjeff		kse_reassign(ke);
675109864Sjeff}
676109864Sjeff
677109864Sjeffvoid
678109864Sjeffsched_switchin(struct thread *td)
679109864Sjeff{
680109864Sjeff	/* struct kse *ke = td->td_kse; */
681109864Sjeff	mtx_assert(&sched_lock, MA_OWNED);
682109864Sjeff
683113339Sjulian	td->td_oncpu = PCPU_GET(cpuid);
684113357Sjeff
685109864Sjeff	if (td->td_ksegrp->kg_pri_class == PRI_TIMESHARE &&
686109864Sjeff	    td->td_priority != td->td_ksegrp->kg_user_pri)
687111032Sjulian		curthread->td_flags |= TDF_NEEDRESCHED;
688109864Sjeff}
689109864Sjeff
690109864Sjeffvoid
691109864Sjeffsched_nice(struct ksegrp *kg, int nice)
692109864Sjeff{
693113357Sjeff	struct kse *ke;
694109864Sjeff	struct thread *td;
695113357Sjeff	struct kseq *kseq;
696109864Sjeff
697113357Sjeff	/*
698113357Sjeff	 * We need to adjust the nice counts for running KSEs.
699113357Sjeff	 */
700113357Sjeff	if (kg->kg_pri_class == PRI_TIMESHARE)
701113357Sjeff		FOREACH_KSE_IN_GROUP(kg, ke) {
702113357Sjeff			if (ke->ke_state != KES_ONRUNQ &&
703113357Sjeff			    ke->ke_state != KES_THREAD)
704113357Sjeff				continue;
705113357Sjeff			kseq = KSEQ_CPU(ke->ke_cpu);
706113357Sjeff			kseq_nice_rem(kseq, kg->kg_nice);
707113357Sjeff			kseq_nice_add(kseq, nice);
708113357Sjeff		}
709109864Sjeff	kg->kg_nice = nice;
710109864Sjeff	sched_priority(kg);
711113357Sjeff	FOREACH_THREAD_IN_GROUP(kg, td)
712111032Sjulian		td->td_flags |= TDF_NEEDRESCHED;
713109864Sjeff}
714109864Sjeff
715109864Sjeffvoid
716109864Sjeffsched_sleep(struct thread *td, u_char prio)
717109864Sjeff{
718109864Sjeff	mtx_assert(&sched_lock, MA_OWNED);
719109864Sjeff
720109864Sjeff	td->td_slptime = ticks;
721109864Sjeff	td->td_priority = prio;
722109864Sjeff
723113357Sjeff	CTR2(KTR_ULE, "sleep kse %p (tick: %d)",
724113357Sjeff	    td->td_kse, td->td_slptime);
725109864Sjeff}
726109864Sjeff
727109864Sjeffvoid
728109864Sjeffsched_wakeup(struct thread *td)
729109864Sjeff{
730109864Sjeff	mtx_assert(&sched_lock, MA_OWNED);
731109864Sjeff
732109864Sjeff	/*
733109864Sjeff	 * Let the kseg know how long we slept for.  This is because process
734109864Sjeff	 * interactivity behavior is modeled in the kseg.
735109864Sjeff	 */
736111788Sjeff	if (td->td_slptime) {
737111788Sjeff		struct ksegrp *kg;
738113357Sjeff		int hzticks;
739109864Sjeff
740111788Sjeff		kg = td->td_ksegrp;
741113357Sjeff		hzticks = ticks - td->td_slptime;
742113357Sjeff		kg->kg_slptime += hzticks << 10;
743111788Sjeff		sched_priority(kg);
744113357Sjeff		CTR2(KTR_ULE, "wakeup kse %p (%d ticks)",
745113357Sjeff		    td->td_kse, hzticks);
746111788Sjeff		td->td_slptime = 0;
747109864Sjeff	}
748109864Sjeff	setrunqueue(td);
749109864Sjeff	if (td->td_priority < curthread->td_priority)
750111032Sjulian		curthread->td_flags |= TDF_NEEDRESCHED;
751109864Sjeff}
752109864Sjeff
753109864Sjeff/*
754109864Sjeff * Penalize the parent for creating a new child and initialize the child's
755109864Sjeff * priority.
756109864Sjeff */
757109864Sjeffvoid
758113357Sjeffsched_fork(struct proc *p, struct proc *p1)
759109864Sjeff{
760109864Sjeff
761109864Sjeff	mtx_assert(&sched_lock, MA_OWNED);
762109864Sjeff
763113357Sjeff	sched_fork_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(p1));
764113357Sjeff	sched_fork_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(p1));
765113357Sjeff	sched_fork_thread(FIRST_THREAD_IN_PROC(p), FIRST_THREAD_IN_PROC(p1));
766113357Sjeff}
767113357Sjeff
768113357Sjeffvoid
769113357Sjeffsched_fork_kse(struct kse *ke, struct kse *child)
770113357Sjeff{
771113357Sjeff	child->ke_slice = ke->ke_slice;
772113357Sjeff	child->ke_cpu = ke->ke_cpu; /* sched_pickcpu(); */
773113357Sjeff	child->ke_runq = NULL;
774113357Sjeff
775113357Sjeff	/*
776113357Sjeff	 * Claim that we've been running for one second for statistical
777113357Sjeff	 * purposes.
778113357Sjeff	 */
779113357Sjeff	child->ke_ticks = 0;
780113357Sjeff	child->ke_ltick = ticks;
781113357Sjeff	child->ke_ftick = ticks - hz;
782113357Sjeff}
783113357Sjeff
784113357Sjeffvoid
785113357Sjeffsched_fork_ksegrp(struct ksegrp *kg, struct ksegrp *child)
786113357Sjeff{
787109864Sjeff	/* XXX Need something better here */
788110645Sjeff	if (kg->kg_slptime > kg->kg_runtime) {
789111857Sjeff		child->kg_slptime = SCHED_DYN_RANGE;
790111857Sjeff		child->kg_runtime = kg->kg_slptime / SCHED_DYN_RANGE;
791110645Sjeff	} else {
792111857Sjeff		child->kg_runtime = SCHED_DYN_RANGE;
793111857Sjeff		child->kg_slptime = kg->kg_runtime / SCHED_DYN_RANGE;
794110645Sjeff	}
795113357Sjeff
796109864Sjeff	child->kg_user_pri = kg->kg_user_pri;
797113357Sjeff	child->kg_nice = kg->kg_nice;
798113357Sjeff}
799109864Sjeff
800113357Sjeffvoid
801113357Sjeffsched_fork_thread(struct thread *td, struct thread *child)
802113357Sjeff{
803113357Sjeff}
804113357Sjeff
805113357Sjeffvoid
806113357Sjeffsched_class(struct ksegrp *kg, int class)
807113357Sjeff{
808113357Sjeff	struct kseq *kseq;
809113357Sjeff	struct kse *ke;
810113357Sjeff
811113357Sjeff	if (kg->kg_pri_class == class)
812113357Sjeff		return;
813113357Sjeff
814113357Sjeff	FOREACH_KSE_IN_GROUP(kg, ke) {
815113357Sjeff		if (ke->ke_state != KES_ONRUNQ &&
816113357Sjeff		    ke->ke_state != KES_THREAD)
817113357Sjeff			continue;
818113357Sjeff		kseq = KSEQ_CPU(ke->ke_cpu);
819113357Sjeff
820113357Sjeff		kseq->ksq_loads[kg->kg_pri_class]--;
821113357Sjeff		kseq->ksq_loads[class]++;
822113357Sjeff
823113357Sjeff		if (kg->kg_pri_class == PRI_TIMESHARE)
824113357Sjeff			kseq_nice_rem(kseq, kg->kg_nice);
825113357Sjeff		else if (class == PRI_TIMESHARE)
826113357Sjeff			kseq_nice_add(kseq, kg->kg_nice);
827109970Sjeff	}
828109970Sjeff
829113357Sjeff	kg->kg_pri_class = class;
830109864Sjeff}
831109864Sjeff
832109864Sjeff/*
833109864Sjeff * Return some of the child's priority and interactivity to the parent.
834109864Sjeff */
835109864Sjeffvoid
836113357Sjeffsched_exit(struct proc *p, struct proc *child)
837109864Sjeff{
838113357Sjeff	struct ksegrp *kg;
839113357Sjeff	struct kse *ke;
840113357Sjeff
841109864Sjeff	/* XXX Need something better here */
842109864Sjeff	mtx_assert(&sched_lock, MA_OWNED);
843113357Sjeff	kg = FIRST_KSEGRP_IN_PROC(child);
844113357Sjeff	ke = FIRST_KSE_IN_KSEGRP(kg);
845113357Sjeff	kseq_rem(KSEQ_CPU(ke->ke_cpu), ke);
846109864Sjeff}
847109864Sjeff
848109864Sjeffvoid
849113357Sjeffsched_clock(struct kse *ke)
850109864Sjeff{
851113357Sjeff	struct kseq *kseq;
852113357Sjeff	struct ksegrp *kg;
853113357Sjeff	struct thread *td;
854113357Sjeff#if 0
855109864Sjeff	struct kse *nke;
856110267Sjeff#endif
857109864Sjeff
858113357Sjeff	/*
859113357Sjeff	 * sched_setup() apparently happens prior to stathz being set.  We
860113357Sjeff	 * need to resolve the timers earlier in the boot so we can avoid
861113357Sjeff	 * calculating this here.
862113357Sjeff	 */
863113357Sjeff	if (realstathz == 0) {
864113357Sjeff		realstathz = stathz ? stathz : hz;
865113357Sjeff		tickincr = hz / realstathz;
866113357Sjeff		/*
867113357Sjeff		 * XXX This does not work for values of stathz that are much
868113357Sjeff		 * larger than hz.
869113357Sjeff		 */
870113357Sjeff		if (tickincr == 0)
871113357Sjeff			tickincr = 1;
872113357Sjeff	}
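	/*
	 * For example, with, say, hz = 100 and stathz = 128 the division
	 * above yields 0, so tickincr is forced to 1 and each of the 128
	 * stat ticks per second charges 1 << 10 to kg_runtime even though
	 * only 100 hz ticks actually elapse; the error grows as stathz gets
	 * larger relative to hz, which is what the XXX above warns about.
	 */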
873109864Sjeff
874113357Sjeff	td = ke->ke_thread;
875113357Sjeff	kg = ke->ke_ksegrp;
876109864Sjeff
877110028Sjeff	mtx_assert(&sched_lock, MA_OWNED);
878110028Sjeff	KASSERT((td != NULL), ("schedclock: null thread pointer"));
879110028Sjeff
880110028Sjeff	/* Adjust ticks for pctcpu */
881111793Sjeff	ke->ke_ticks++;
882109971Sjeff	ke->ke_ltick = ticks;
883112994Sjeff
884109971Sjeff	/* Go up to one second beyond our max and then trim back down */
885109971Sjeff	if (ke->ke_ftick + SCHED_CPU_TICKS + hz < ke->ke_ltick)
886109971Sjeff		sched_pctcpu_update(ke);
887109971Sjeff
888110028Sjeff	if (td->td_kse->ke_flags & KEF_IDLEKSE)
889109864Sjeff		return;
890110028Sjeff
891113357Sjeff	CTR4(KTR_ULE, "Tick kse %p (slice: %d, slptime: %d, runtime: %d)",
892113357Sjeff	    ke, ke->ke_slice, kg->kg_slptime >> 10, kg->kg_runtime >> 10);
893113357Sjeff
894110028Sjeff	/*
895113357Sjeff	 * We only do slice accounting for TIMESHARE ksegrps.
896113357Sjeff	 */
897113357Sjeff	if (kg->kg_pri_class != PRI_TIMESHARE)
898113357Sjeff		return;
899113357Sjeff	/*
900110028Sjeff	 * Check for a higher priority task on the run queue.  This can happen
901110028Sjeff	 * on SMP if another processor woke up a process on our runq.
902110028Sjeff	 */
903110028Sjeff	kseq = KSEQ_SELF();
904113357Sjeff#if 0
905113357Sjeff	if (kseq->ksq_load > 1 && (nke = kseq_choose(kseq)) != NULL) {
906113357Sjeff		if (sched_strict &&
907113357Sjeff		    nke->ke_thread->td_priority < td->td_priority)
908113357Sjeff			td->td_flags |= TDF_NEEDRESCHED;
909113357Sjeff		else if (nke->ke_thread->td_priority <
910113357Sjeff		    td->td_priority SCHED_PRIO_SLOP)
911113357Sjeff
912113357Sjeff		if (nke->ke_thread->td_priority < td->td_priority)
913113357Sjeff			td->td_flags |= TDF_NEEDRESCHED;
914113357Sjeff	}
915110267Sjeff#endif
916109864Sjeff	/*
917110645Sjeff	 * We used a tick; charge it to the ksegrp so that we can compute our
918113357Sjeff	 * interactivity.
919109864Sjeff	 */
920113357Sjeff	kg->kg_runtime += tickincr << 10;
921110645Sjeff
922109864Sjeff	/*
923109864Sjeff	 * We used up one time slice.
924109864Sjeff	 */
925109864Sjeff	ke->ke_slice--;
926113357Sjeff#ifdef SMP
927113357Sjeff	kseq->ksq_rslices--;
928113357Sjeff#endif
929113357Sjeff
930113357Sjeff	if (ke->ke_slice > 0)
931113357Sjeff		return;
932109864Sjeff	/*
933113357Sjeff	 * We're out of time, recompute priorities and requeue.
934109864Sjeff	 */
935113357Sjeff	kseq_rem(kseq, ke);
936113357Sjeff	sched_priority(kg);
937113357Sjeff	sched_slice(ke);
938113357Sjeff	if (SCHED_CURR(kg, ke))
939113357Sjeff		ke->ke_runq = kseq->ksq_curr;
940113357Sjeff	else
941113357Sjeff		ke->ke_runq = kseq->ksq_next;
942113357Sjeff	kseq_add(kseq, ke);
943113357Sjeff	td->td_flags |= TDF_NEEDRESCHED;
944109864Sjeff}
945109864Sjeff
946109864Sjeffint
947109864Sjeffsched_runnable(void)
948109864Sjeff{
949109864Sjeff	struct kseq *kseq;
950109864Sjeff
951110028Sjeff	kseq = KSEQ_SELF();
952109864Sjeff
953113357Sjeff	if (kseq->ksq_load)
954109970Sjeff		return (1);
955109970Sjeff#ifdef SMP
956110028Sjeff	/*
957110028Sjeff	 * For SMP we may steal another processor's KSEs.  Just search until we
958110028Sjeff	 * verify that at least one other cpu has a runnable task.
959110028Sjeff	 */
960109970Sjeff	if (smp_started) {
961109970Sjeff		int i;
962109970Sjeff
963109970Sjeff		for (i = 0; i < mp_maxid; i++) {
964109970Sjeff			if (CPU_ABSENT(i))
965109970Sjeff				continue;
966110028Sjeff			kseq = KSEQ_CPU(i);
967113357Sjeff			if (kseq->ksq_load)
968109970Sjeff				return (1);
969109970Sjeff		}
970109970Sjeff	}
971109970Sjeff#endif
972109970Sjeff	return (0);
973109864Sjeff}
974109864Sjeff
975109864Sjeffvoid
976109864Sjeffsched_userret(struct thread *td)
977109864Sjeff{
978109864Sjeff	struct ksegrp *kg;
979109864Sjeff
980109864Sjeff	kg = td->td_ksegrp;
981109864Sjeff
982109864Sjeff	if (td->td_priority != kg->kg_user_pri) {
983109864Sjeff		mtx_lock_spin(&sched_lock);
984109864Sjeff		td->td_priority = kg->kg_user_pri;
985109864Sjeff		mtx_unlock_spin(&sched_lock);
986109864Sjeff	}
987109864Sjeff}
988109864Sjeff
989109864Sjeffstruct kse *
990109970Sjeffsched_choose(void)
991109970Sjeff{
992110028Sjeff	struct kseq *kseq;
993109970Sjeff	struct kse *ke;
994113357Sjeff#ifdef SMP
995113357Sjeff	int steal;
996109970Sjeff
997113357Sjeff	steal = 0;
998113357Sjeff#endif
999113357Sjeff
1000110028Sjeff	kseq = KSEQ_SELF();
1001113357Sjeff#ifdef SMP
1002112966Sjeffretry:
1003113357Sjeff#endif
1004110028Sjeff	ke = kseq_choose(kseq);
1005109864Sjeff	if (ke) {
1006113357Sjeff		runq_remove(ke->ke_runq, ke);
1007109864Sjeff		ke->ke_state = KES_THREAD;
1008112966Sjeff
1009113357Sjeff		if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) {
1010113357Sjeff			CTR4(KTR_ULE, "Run kse %p from %p (slice: %d, pri: %d)",
1011113357Sjeff			    ke, ke->ke_runq, ke->ke_slice,
1012113357Sjeff			    ke->ke_thread->td_priority);
1013113357Sjeff		}
1014113357Sjeff#ifdef SMP
1015112966Sjeff		/*
1016113357Sjeff		 * If we've stolen this thread we need to kill the pointer
1017113357Sjeff		 * to the run queue and reset the cpu id.
1018112966Sjeff		 */
1019113357Sjeff		if (steal) {
1020113357Sjeff			kseq_rem(kseq, ke);
1021113357Sjeff			ke->ke_cpu = PCPU_GET(cpuid);
1022113357Sjeff			kseq_add(KSEQ_SELF(), ke);
1023112966Sjeff		}
1024113357Sjeff#endif
1025113357Sjeff		return (ke);
1026109864Sjeff	}
1027109864Sjeff
1028109970Sjeff#ifdef SMP
1029109970Sjeff	if (ke == NULL && smp_started) {
1030109970Sjeff		/*
1031109970Sjeff		 * Find the cpu with the highest load and steal one proc.
1032109970Sjeff		 */
1033113357Sjeff		steal = 1;
1034113357Sjeff		if ((kseq = kseq_load_highest()) != NULL)
1035113357Sjeff			goto retry;
1036109970Sjeff	}
1037109970Sjeff#endif
1038113357Sjeff
1039113357Sjeff	return (NULL);
1040109864Sjeff}
1041109864Sjeff
1042109864Sjeffvoid
1043109864Sjeffsched_add(struct kse *ke)
1044109864Sjeff{
1045110267Sjeff	struct kseq *kseq;
1046113357Sjeff	struct ksegrp *kg;
1047109864Sjeff
1048109864Sjeff	mtx_assert(&sched_lock, MA_OWNED);
1049110267Sjeff	KASSERT((ke->ke_thread != NULL), ("sched_add: No thread on KSE"));
1050109864Sjeff	KASSERT((ke->ke_thread->td_kse != NULL),
1051110267Sjeff	    ("sched_add: No KSE on thread"));
1052109864Sjeff	KASSERT(ke->ke_state != KES_ONRUNQ,
1053110267Sjeff	    ("sched_add: kse %p (%s) already in run queue", ke,
1054109864Sjeff	    ke->ke_proc->p_comm));
1055109864Sjeff	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
1056110267Sjeff	    ("sched_add: process swapped out"));
1057109864Sjeff
1058113357Sjeff	kg = ke->ke_ksegrp;
1059113357Sjeff
1060113357Sjeff	if (ke->ke_runq)
1061113357Sjeff		Debugger("hrm?");
1062113357Sjeff
1063113357Sjeff	switch (kg->kg_pri_class) {
1064112994Sjeff	case PRI_ITHD:
1065112994Sjeff	case PRI_REALTIME:
1066112994Sjeff		kseq = KSEQ_SELF();
1067113357Sjeff		if (ke->ke_runq == NULL)
1068113357Sjeff			kseq_add(kseq, ke);
1069113357Sjeff		ke->ke_runq = kseq->ksq_curr;
1070113357Sjeff		ke->ke_slice = SCHED_SLICE_MAX;
1071112994Sjeff		break;
1072112994Sjeff	case PRI_TIMESHARE:
1073113357Sjeff		kseq = KSEQ_CPU(ke->ke_cpu);
1074113357Sjeff		if (ke->ke_runq == NULL) {
1075113357Sjeff			if (SCHED_CURR(kg, ke))
1076113357Sjeff				ke->ke_runq = kseq->ksq_curr;
1077113357Sjeff			else
1078113357Sjeff				ke->ke_runq = kseq->ksq_next;
1079113357Sjeff			kseq_add(kseq, ke);
1080113357Sjeff		}
1081113357Sjeff		break;
1082112994Sjeff	case PRI_IDLE:
1083111789Sjeff		kseq = KSEQ_CPU(ke->ke_cpu);
1084113357Sjeff
1085113357Sjeff		if (ke->ke_runq == NULL)
1086113357Sjeff			kseq_add(kseq, ke);
1087113357Sjeff		/*
1088113357Sjeff		 * This is for priority propagation.
1089113357Sjeff		 */
1090113357Sjeff		if (ke->ke_thread->td_priority < PRI_MAX_TIMESHARE)
1091113357Sjeff			ke->ke_runq = kseq->ksq_curr;
1092113357Sjeff		else
1093113357Sjeff			ke->ke_runq = &kseq->ksq_idle;
1094113357Sjeff		ke->ke_slice = SCHED_SLICE_MIN;
1095112994Sjeff		break;
1096113357Sjeff	default:
1097113357Sjeff		panic("Unknown pri class.\n");
1098113357Sjeff		break;
1099112994Sjeff	}
1100109864Sjeff
1101109864Sjeff	ke->ke_ksegrp->kg_runq_kses++;
1102109864Sjeff	ke->ke_state = KES_ONRUNQ;
1103109864Sjeff
1104113357Sjeff	runq_add(ke->ke_runq, ke);
1105109864Sjeff}
1106109864Sjeff
1107109864Sjeffvoid
1108109864Sjeffsched_rem(struct kse *ke)
1109109864Sjeff{
1110113357Sjeff	struct kseq *kseq;
1111113357Sjeff
1112109864Sjeff	mtx_assert(&sched_lock, MA_OWNED);
1113109864Sjeff	/* KASSERT((ke->ke_state == KES_ONRUNQ), ("KSE not on run queue")); */
1114113357Sjeff	panic("WTF\n");
1115109864Sjeff
1116109864Sjeff	ke->ke_state = KES_THREAD;
1117109864Sjeff	ke->ke_ksegrp->kg_runq_kses--;
1118113357Sjeff	kseq = KSEQ_CPU(ke->ke_cpu);
1119113357Sjeff	runq_remove(ke->ke_runq, ke);
1120113357Sjeff	kseq_rem(kseq, ke);
1121109864Sjeff}
1122109864Sjeff
1123109864Sjefffixpt_t
1124109864Sjeffsched_pctcpu(struct kse *ke)
1125109864Sjeff{
1126109864Sjeff	fixpt_t pctcpu;
1127109864Sjeff
1128109864Sjeff	pctcpu = 0;
1129109864Sjeff
1130109864Sjeff	if (ke->ke_ticks) {
1131109864Sjeff		int rtick;
1132109864Sjeff
1133109864Sjeff		/* Update to account for time potentially spent sleeping */
1134109864Sjeff		ke->ke_ltick = ticks;
1135109864Sjeff		sched_pctcpu_update(ke);
1136109864Sjeff
1137109864Sjeff		/* How many rticks per second? */
1138111793Sjeff		rtick = ke->ke_ticks / SCHED_CPU_TIME;
1139110226Sscottl		pctcpu = (FSCALE * ((FSCALE * rtick)/realstathz)) >> FSHIFT;
1140109864Sjeff	}
1141109864Sjeff
1142109864Sjeff	ke->ke_proc->p_swtime = ke->ke_ltick - ke->ke_ftick;
1143109864Sjeff
1144109864Sjeff	return (pctcpu);
1145109864Sjeff}
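/*
 * To put numbers on the conversion above (assuming the usual FSHIFT of 11,
 * so FSCALE is 2048, and a realstathz of 128): a KSE that accumulated 640
 * stat ticks over the 10 second SCHED_CPU_TIME window has rtick = 64, and
 * (2048 * ((2048 * 64) / 128)) >> 11 = 1024 = FSCALE / 2, which ps reports
 * as 50% CPU.
 */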
1146109864Sjeff
1147109864Sjeffint
1148109864Sjeffsched_sizeof_kse(void)
1149109864Sjeff{
1150109864Sjeff	return (sizeof(struct kse) + sizeof(struct ke_sched));
1151109864Sjeff}
1152109864Sjeff
1153109864Sjeffint
1154109864Sjeffsched_sizeof_ksegrp(void)
1155109864Sjeff{
1156109864Sjeff	return (sizeof(struct ksegrp) + sizeof(struct kg_sched));
1157109864Sjeff}
1158109864Sjeff
1159109864Sjeffint
1160109864Sjeffsched_sizeof_proc(void)
1161109864Sjeff{
1162109864Sjeff	return (sizeof(struct proc));
1163109864Sjeff}
1164109864Sjeff
1165109864Sjeffint
1166109864Sjeffsched_sizeof_thread(void)
1167109864Sjeff{
1168109864Sjeff	return (sizeof(struct thread) + sizeof(struct td_sched));
1169109864Sjeff}
1170