1/*-
2 * Copyright (c) 2003, Jeffrey Roberson <jeff@freebsd.org>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice unmodified, this list of conditions, and the following
10 *    disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
19 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 *
26 * $FreeBSD: head/sys/kern/sched_ule.c 112994 2003-04-03 00:29:28Z jeff $
27 */
28
29#include <sys/param.h>
30#include <sys/systm.h>
31#include <sys/kernel.h>
32#include <sys/ktr.h>
33#include <sys/lock.h>
34#include <sys/mutex.h>
35#include <sys/proc.h>
36#include <sys/resource.h>
37#include <sys/sched.h>
38#include <sys/smp.h>
39#include <sys/sx.h>
40#include <sys/sysctl.h>
41#include <sys/sysproto.h>
42#include <sys/vmmeter.h>
43#ifdef DDB
44#include <ddb/ddb.h>
45#endif
46#ifdef KTRACE
47#include <sys/uio.h>
48#include <sys/ktrace.h>
49#endif
50
51#include <machine/cpu.h>
52
53/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
 * XXX This is bogus compatibility crap for ps
55static fixpt_t  ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
56SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
57
58static void sched_setup(void *dummy);
59SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL)
60
61int realstathz;
62
63#define	SCHED_STRICT_RESCHED 1
64
65/*
 * These data structures are allocated within their parent structure but
 * are scheduler specific.
68 */
69
70struct ke_sched {
71	int		ske_slice;
72	struct runq	*ske_runq;
73	/* The following variables are only used for pctcpu calculation */
74	int		ske_ltick;	/* Last tick that we were running on */
75	int		ske_ftick;	/* First tick that we were running on */
76	int		ske_ticks;	/* Tick count */
77	u_char		ske_cpu;
78};
79#define	ke_slice	ke_sched->ske_slice
80#define	ke_runq		ke_sched->ske_runq
81#define	ke_ltick	ke_sched->ske_ltick
82#define	ke_ftick	ke_sched->ske_ftick
83#define	ke_ticks	ke_sched->ske_ticks
84#define	ke_cpu		ke_sched->ske_cpu
85
86struct kg_sched {
87	int	skg_slptime;		/* Number of ticks we vol. slept */
88	int	skg_runtime;		/* Number of ticks we were running */
89};
90#define	kg_slptime	kg_sched->skg_slptime
91#define	kg_runtime	kg_sched->skg_runtime
92
93struct td_sched {
94	int	std_slptime;
95	int	std_schedflag;
96};
97#define	td_slptime	td_sched->std_slptime
98#define	td_schedflag	td_sched->std_schedflag
99
100#define	TD_SCHED_BLOAD	0x0001		/*
101					 * thread was counted as being in short
102					 * term sleep.
103					 */
104struct td_sched td_sched;
105struct ke_sched ke_sched;
106struct kg_sched kg_sched;
107
108struct ke_sched *kse0_sched = &ke_sched;
109struct kg_sched *ksegrp0_sched = &kg_sched;
110struct p_sched *proc0_sched = NULL;
111struct td_sched *thread0_sched = &td_sched;
112
113/*
114 * This priority range has 20 priorities on either end that are reachable
115 * only through nice values.
116 *
117 * PRI_RANGE:	Total priority range for timeshare threads.
118 * PRI_NRESV:	Reserved priorities for nice.
119 * PRI_BASE:	The start of the dynamic range.
 * DYN_RANGE:	Number of priorities that are available in the dynamic
121 *		priority range.
122 * DYN_HALF:	Half of DYN_RANGE for convenience elsewhere.
123 * PRI_DYN:	The dynamic priority which is derived from the number of ticks
124 *		running vs the total number of ticks.
125 */
126#define	SCHED_PRI_RANGE		(PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1)
127#define	SCHED_PRI_NRESV		PRIO_TOTAL
128#define	SCHED_PRI_NHALF		(PRIO_TOTAL / 2)
129#define	SCHED_PRI_BASE		((SCHED_PRI_NRESV / 2) + PRI_MIN_TIMESHARE)
130#define	SCHED_DYN_RANGE		(SCHED_PRI_RANGE - SCHED_PRI_NRESV)
131#define	SCHED_DYN_HALF		(SCHED_DYN_RANGE / 2)
132#define	SCHED_PRI_DYN(run, total)	(((run) * SCHED_DYN_RANGE) / (total))
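
/*
 * A worked example of the split, illustrative only (the real constants
 * come from <sys/priority.h> and <sys/resource.h>): if the timeshare
 * range is 64 priorities wide and SCHED_PRI_NRESV is 40, then
 *	SCHED_DYN_RANGE = 64 - 40 = 24
 *	SCHED_DYN_HALF  = 12
 *	SCHED_PRI_BASE  = PRI_MIN_TIMESHARE + 20
 * and SCHED_PRI_DYN(3, 4) = (3 * 24) / 4 = 18, i.e. a kg that ran for
 * three quarters of its recent ticks lands 18 slots into the 24 slot
 * dynamic band.
 */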
133
134
135/*
136 * These determine the interactivity of a process.
137 *
138 * SLP_RUN_MAX:	Maximum amount of sleep time + run time we'll accumulate
139 *		before throttling back.
140 * SLP_RUN_THROTTLE:	Divisor for reducing slp/run time.
141 * INTERACT_RANGE:	Range of interactivity values.  Smaller is better.
142 * INTERACT_HALF:	Convenience define, half of the interactivity range.
 * INTERACT_THRESH:	Threshold for placement on the current runq.
144 */
145#define	SCHED_SLP_RUN_MAX	((hz * 2) << 10)
146#define	SCHED_SLP_RUN_THROTTLE	(10)
147#define	SCHED_INTERACT_RANGE	(100)
148#define	SCHED_INTERACT_HALF	(SCHED_INTERACT_RANGE / 2)
149#define	SCHED_INTERACT_THRESH	(10)
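
/*
 * Worked example, assuming hz = 100 for illustration: kg_slptime and
 * kg_runtime are kept in ticks shifted left by 10 (see the "<< 10" in
 * sched_wakeup() and sched_clock()), so
 *	SCHED_SLP_RUN_MAX = (100 * 2) << 10 = 204800
 * is roughly two seconds of combined sleep + run history.  Once the sum
 * exceeds that, sched_slice() divides both by SCHED_SLP_RUN_THROTTLE,
 * keeping the ratio but forgetting most of the magnitude.  A score below
 * SCHED_INTERACT_THRESH (10 out of 0..100) places the ksegrp on the
 * current queue via SCHED_CURR().
 */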
150
151/*
152 * These parameters and macros determine the size of the time slice that is
153 * granted to each thread.
154 *
155 * SLICE_MIN:	Minimum time slice granted, in units of ticks.
156 * SLICE_MAX:	Maximum time slice granted.
157 * SLICE_RANGE:	Range of available time slices scaled by hz.
 * SLICE_SCALE:	The number of slices granted per val in the range of [0, max].
 * SLICE_NICE:  Determines the amount of slice granted for a scaled nice value.
160 */
161#define	SCHED_SLICE_MIN			(hz / 100)
162#define	SCHED_SLICE_MAX			(hz / 10)
163#define	SCHED_SLICE_RANGE		(SCHED_SLICE_MAX - SCHED_SLICE_MIN + 1)
164#define	SCHED_SLICE_SCALE(val, max)	(((val) * SCHED_SLICE_RANGE) / (max))
165#define	SCHED_SLICE_NICE(nice)						\
166    (SCHED_SLICE_MAX - SCHED_SLICE_SCALE((nice), SCHED_PRI_NHALF))
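
/*
 * Worked example, assuming hz = 100 and SCHED_PRI_NHALF = 20 (the comment
 * above the priority macros gives 20 nice levels on either end):
 * SLICE_MIN = 1 tick, SLICE_MAX = 10 ticks, SLICE_RANGE = 10, so
 *	SCHED_SLICE_NICE(0)  = 10 - (0 * 10) / 20  = 10 ticks
 *	SCHED_SLICE_NICE(10) = 10 - (10 * 10) / 20 = 5 ticks
 *	SCHED_SLICE_NICE(20) = 10 - (20 * 10) / 20 = 0 ticks
 * The least nice kse in the window gets the full slice and the slice
 * shrinks linearly with the distance from that nice value.  The sketch
 * below is a hypothetical userland rendering of the same arithmetic,
 * kept under #if 0 so it cannot affect the build.
 */
#if 0
static int
example_slice_for_nice_offset(int nice)
{
	int slice_min = 1;		/* hz / 100 with hz = 100 */
	int slice_max = 10;		/* hz / 10 with hz = 100 */
	int slice_range = slice_max - slice_min + 1;

	if (nice > 20)		/* sched_slice() grants no slice outside the window */
		return (0);
	return (slice_max - (nice * slice_range) / 20);
}
#endif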
167
168/*
169 * This macro determines whether or not the kse belongs on the current or
170 * next run queue.
171 *
 * XXX nice value should affect how interactive a kg is.
173 */
174#define	SCHED_CURR(kg)	(sched_interact_score(kg) < SCHED_INTERACT_THRESH)
175
176/*
177 * Cpu percentage computation macros and defines.
178 *
179 * SCHED_CPU_TIME:	Number of seconds to average the cpu usage across.
180 * SCHED_CPU_TICKS:	Number of hz ticks to average the cpu usage across.
181 */
182
183#define	SCHED_CPU_TIME	10
184#define	SCHED_CPU_TICKS	(hz * SCHED_CPU_TIME)
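
/*
 * With hz = 100, for example, the window is SCHED_CPU_TICKS = 1000 ticks.
 * ke_ftick and ke_ltick bracket the window and ke_ticks counts how many of
 * those ticks this kse actually ran; sched_pctcpu_update() renormalizes
 * the count once the window grows more than a second past SCHED_CPU_TICKS.
 */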
185
186/*
187 * kseq - pair of runqs per processor
188 */
189
190struct kseq {
191	struct runq	ksq_ithd;	/* Queue of ITHD and REALTIME tds. */
192	struct runq	ksq_idle;	/* Queue of IDLE threads. */
193	struct runq	ksq_runqs[2];	/* Run queues for TIMESHARE. */
194	struct runq	*ksq_curr;
195	struct runq	*ksq_next;
196	int		ksq_itload;	/* Total runnable for ITHD. */
197	int		ksq_tsload;	/* Total runnable for TIMESHARE. */
198	int		ksq_idload;	/* Total runnable for IDLE. */
199#ifdef SMP
200	unsigned int	ksq_rslices;	/* Slices on run queue */
201	unsigned int	ksq_bload;	/* Threads waiting on IO */
202#endif
203};
204
205/*
206 * One kse queue per processor.
207 */
208#ifdef SMP
209struct kseq	kseq_cpu[MAXCPU];
210#define	KSEQ_SELF()	(&kseq_cpu[PCPU_GET(cpuid)])
211#define	KSEQ_CPU(x)	(&kseq_cpu[(x)])
212#else
213struct kseq	kseq_cpu;
214#define	KSEQ_SELF()	(&kseq_cpu)
215#define	KSEQ_CPU(x)	(&kseq_cpu)
216#endif
217
218static void sched_slice(struct kse *ke);
219static int sched_priority(struct ksegrp *kg);
220static int sched_interact_score(struct ksegrp *kg);
221void sched_pctcpu_update(struct kse *ke);
222int sched_pickcpu(void);
223
224/* Operations on per processor queues */
225static struct kse * kseq_choose(struct kseq *kseq);
226static int kseq_nice_min(struct kseq *kseq);
227static void kseq_setup(struct kseq *kseq);
228static void kseq_add(struct kseq *kseq, struct kse *ke);
229static __inline void kseq_rem(struct kseq *kseq, struct kse *ke);
230#ifdef SMP
231static __inline void kseq_sleep(struct kseq *kseq, struct kse *ke);
232static __inline void kseq_wakeup(struct kseq *kseq, struct kse *ke);
233struct kseq * kseq_load_highest(void);
234#endif
235
236static void
237kseq_add(struct kseq *kseq, struct kse *ke)
238{
239	struct ksegrp *kg;
240
241	kg = ke->ke_ksegrp;
242
243	/*
244	 * Figure out what run queue we should go on and assign a slice.
245	 */
246	switch (kg->kg_pri_class) {
247	/*
	 * If we're a real-time or interrupt thread place us on the ithd
249	 * queue for the current processor.  Hopefully this will yield the
250	 * lowest latency response.
251	 */
252	case PRI_ITHD:
253	case PRI_REALTIME:
254		ke->ke_runq = &kseq->ksq_ithd;
255		ke->ke_slice = SCHED_SLICE_MAX;
256		kseq->ksq_itload++;
257		break;
258	/*
259	 * Timeshare threads get placed on the appropriate queue on their
260	 * bound cpu.
261	 */
262	case PRI_TIMESHARE:
263		if (ke->ke_runq == NULL) {
264			if (SCHED_CURR(kg))
265				ke->ke_runq = kseq->ksq_curr;
266			else
267				ke->ke_runq = kseq->ksq_next;
268		}
269		if (ke->ke_slice == 0)
270			sched_slice(ke);
271		kseq->ksq_tsload++;
272		break;
273	/*
274	 * Only grant PRI_IDLE processes a slice if there is nothing else
275	 * running.
276	 */
277	case PRI_IDLE:
278		ke->ke_runq = &kseq->ksq_idle;
279		ke->ke_slice = SCHED_SLICE_MIN;
280		kseq->ksq_idload++;
281		break;
282	default:
283		panic("Unknown priority class.\n");
284		break;
285	}
286
287	runq_add(ke->ke_runq, ke);
288#ifdef SMP
289	kseq->ksq_rslices += ke->ke_slice;
290#endif
291}
292static void
293kseq_rem(struct kseq *kseq, struct kse *ke)
294{
295	struct ksegrp *kg;
296
297	kg = ke->ke_ksegrp;
298
299	/*
300	 * XXX Consider making the load an array.
301	 */
302	switch (kg->kg_pri_class) {
303	case PRI_ITHD:
304	case PRI_REALTIME:
305		kseq->ksq_itload--;
306		break;
307	case PRI_TIMESHARE:
308		kseq->ksq_tsload--;
309		break;
310	case PRI_IDLE:
311		kseq->ksq_idload--;
312		break;
313	}
314	runq_remove(ke->ke_runq, ke);
315#ifdef SMP
316	kseq->ksq_rslices -= ke->ke_slice;
317#endif
318}
319
320#ifdef SMP
321static __inline void
322kseq_sleep(struct kseq *kseq, struct kse *ke)
323{
324	kseq->ksq_bload++;
325}
326
327static __inline void
328kseq_wakeup(struct kseq *kseq, struct kse *ke)
329{
330	kseq->ksq_bload--;
331}
332
333struct kseq *
334kseq_load_highest(void)
335{
336	struct kseq *kseq;
337	int load;
338	int cpu;
339	int i;
340
341	cpu = 0;
342	load = 0;
343
344	for (i = 0; i < mp_maxid; i++) {
345		if (CPU_ABSENT(i))
346			continue;
347		kseq = KSEQ_CPU(i);
348		if (kseq->ksq_tsload > load) {
349			load = kseq->ksq_tsload;
350			cpu = i;
351		}
352	}
353	if (load)
354		return (KSEQ_CPU(cpu));
355
356	return (NULL);
357}
358#endif
359
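/*
 * Pick the next kse to run from a kseq.  Interrupt and realtime threads
 * are always served first, then the two timeshare queues, then the idle
 * queue.  When the current timeshare queue is empty it is swapped with the
 * next queue, which is how expired (or too nice) kses eventually get to
 * run once everything ahead of them has drained.
 */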
360struct kse *
361kseq_choose(struct kseq *kseq)
362{
363	struct kse *ke;
364	struct runq *swap;
365
366	if (kseq->ksq_itload)
367		return (runq_choose(&kseq->ksq_ithd));
368
369	if (kseq->ksq_tsload) {
370		if ((ke = runq_choose(kseq->ksq_curr)) != NULL)
371			return (ke);
372
373		swap = kseq->ksq_curr;
374		kseq->ksq_curr = kseq->ksq_next;
375		kseq->ksq_next = swap;
376
377		return (runq_choose(kseq->ksq_curr));
378	}
379	if (kseq->ksq_idload)
380		return (runq_choose(&kseq->ksq_idle));
381
382	return (NULL);
383}
384
385static int
386kseq_nice_min(struct kseq *kseq)
387{
388	struct kse *ke0;
389	struct kse *ke1;
390
391	if (kseq->ksq_tsload == 0)
392		return (0);
393
394	ke0 = runq_choose(kseq->ksq_curr);
395	ke1 = runq_choose(kseq->ksq_next);
396
397	if (ke0 == NULL)
398		return (ke1->ke_ksegrp->kg_nice);
399
400	if (ke1 == NULL)
401		return (ke0->ke_ksegrp->kg_nice);
402
403	return (min(ke0->ke_ksegrp->kg_nice, ke1->ke_ksegrp->kg_nice));
404}
405
406static void
407kseq_setup(struct kseq *kseq)
408{
409	kseq->ksq_curr = &kseq->ksq_runqs[0];
410	kseq->ksq_next = &kseq->ksq_runqs[1];
411	runq_init(&kseq->ksq_ithd);
412	runq_init(kseq->ksq_curr);
413	runq_init(kseq->ksq_next);
414	runq_init(&kseq->ksq_idle);
415	kseq->ksq_itload = 0;
416	kseq->ksq_tsload = 0;
417	kseq->ksq_idload = 0;
418#ifdef SMP
419	kseq->ksq_rslices = 0;
420	kseq->ksq_bload = 0;
421#endif
422}
423
424static void
425sched_setup(void *dummy)
426{
427	int i;
428
429	realstathz = stathz ? stathz : hz;
430
431	mtx_lock_spin(&sched_lock);
432	/* init kseqs */
433	for (i = 0; i < MAXCPU; i++)
434		kseq_setup(KSEQ_CPU(i));
435	mtx_unlock_spin(&sched_lock);
436}
437
438/*
439 * Scale the scheduling priority according to the "interactivity" of this
440 * process.
441 */
442static int
443sched_priority(struct ksegrp *kg)
444{
445	int pri;
446
447	if (kg->kg_pri_class != PRI_TIMESHARE)
448		return (kg->kg_user_pri);
449
450	pri = sched_interact_score(kg) * SCHED_DYN_RANGE / SCHED_INTERACT_RANGE;
451	pri += SCHED_PRI_BASE;
452	pri += kg->kg_nice;
453
454	if (pri > PRI_MAX_TIMESHARE)
455		pri = PRI_MAX_TIMESHARE;
456	else if (pri < PRI_MIN_TIMESHARE)
457		pri = PRI_MIN_TIMESHARE;
458
459	kg->kg_user_pri = pri;
460
461	return (kg->kg_user_pri);
462}
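
/*
 * Worked example, using the same illustrative 24 priority dynamic range
 * as above: an interactivity score of 5 maps to (5 * 24) / 100 = 1 slot
 * above SCHED_PRI_BASE, while a score of 55 maps to (55 * 24) / 100 = 13
 * slots above it, before the nice value is added and the result is
 * clamped to the timeshare range.
 */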
463
464/*
465 * Calculate a time slice based on the properties of the kseg and the runq
466 * that we're on.  This is only for PRI_TIMESHARE ksegrps.
467 */
468static void
469sched_slice(struct kse *ke)
470{
471	struct ksegrp *kg;
472
473	kg = ke->ke_ksegrp;
474
475	/*
476	 * Rationale:
477	 * KSEs in interactive ksegs get the minimum slice so that we
478	 * quickly notice if it abuses its advantage.
479	 *
480	 * KSEs in non-interactive ksegs are assigned a slice that is
	 * based on the kseg's nice value relative to the least nice kseg
482	 * on the run queue for this cpu.
483	 *
484	 * If the KSE is less nice than all others it gets the maximum
485	 * slice and other KSEs will adjust their slice relative to
486	 * this when they first expire.
487	 *
	 * There is a 20 point window that starts relative to the least
	 * nice kse on the run queue.  Slice size is determined by
	 * the kse's distance from the least nice ksegrp.
491	 *
492	 * If you are outside of the window you will get no slice and
493	 * you will be reevaluated each time you are selected on the
494	 * run queue.
495	 *
496	 */
497
498	if (!SCHED_CURR(kg)) {
499		struct kseq *kseq;
500		int nice_base;
501		int nice;
502
503		kseq = KSEQ_CPU(ke->ke_cpu);
504		nice_base = kseq_nice_min(kseq);
505		nice = kg->kg_nice + (0 - nice_base);
506
507		if (kseq->ksq_tsload == 0 || kg->kg_nice < nice_base)
508			ke->ke_slice = SCHED_SLICE_MAX;
509		else if (nice <= SCHED_PRI_NHALF)
510			ke->ke_slice = SCHED_SLICE_NICE(nice);
511		else
512			ke->ke_slice = 0;
513	} else
514		ke->ke_slice = SCHED_SLICE_MIN;
515
516	/*
517	 * Check to see if we need to scale back the slp and run time
518	 * in the kg.  This will cause us to forget old interactivity
519	 * while maintaining the current ratio.
520	 */
521	if ((kg->kg_runtime + kg->kg_slptime) >  SCHED_SLP_RUN_MAX) {
522		kg->kg_runtime /= SCHED_SLP_RUN_THROTTLE;
523		kg->kg_slptime /= SCHED_SLP_RUN_THROTTLE;
524	}
525
526	return;
527}
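
/*
 * Worked example, again assuming hz = 100 and SCHED_PRI_NHALF = 20: a
 * non-interactive kse whose nice value matches kseq_nice_min() gets
 * SCHED_SLICE_NICE(0) = 10 ticks, one that is 10 nice values above it
 * gets 5 ticks, and one more than 20 above it gets a zero slice and is
 * simply requeued by sched_choose() until the window catches up with it.
 * Interactive kses always get SCHED_SLICE_MIN.
 */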
528
529static int
530sched_interact_score(struct ksegrp *kg)
531{
532	int big;
533	int small;
534	int base;
535
536	if (kg->kg_runtime > kg->kg_slptime) {
537		big = kg->kg_runtime;
538		small = kg->kg_slptime;
539		base = SCHED_INTERACT_HALF;
540	} else {
541		big = kg->kg_slptime;
542		small = kg->kg_runtime;
543		base = 0;
544	}
545
546	big /= SCHED_INTERACT_HALF;
547	if (big != 0)
548		small /= big;
549	else
550		small = 0;
551
552	small += base;
553	/* XXX Factor in nice */
554	return (small);
555}
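
/*
 * The score is the smaller of the two times expressed in fiftieths of the
 * larger one, offset by SCHED_INTERACT_HALF when run time dominates.  For
 * example, slptime = 900 and runtime = 100 gives 900 / 50 = 18 and
 * 100 / 18 = 5, score 5 (interactive); swapping the two gives 55, well
 * over SCHED_INTERACT_THRESH.  The hypothetical sketch below restates the
 * same rule against plain ints, kept under #if 0 so it cannot affect the
 * build.
 */
#if 0
static int
example_interact_score(int runtime, int slptime)
{
	int big, small, base;

	if (runtime > slptime) {
		big = runtime;
		small = slptime;
		base = 50;		/* SCHED_INTERACT_HALF */
	} else {
		big = slptime;
		small = runtime;
		base = 0;
	}
	big /= 50;
	small = (big != 0) ? small / big : 0;
	return (small + base);
}
#endif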
556
557int
558sched_rr_interval(void)
559{
560	return (SCHED_SLICE_MAX);
561}
562
563void
564sched_pctcpu_update(struct kse *ke)
565{
566	/*
567	 * Adjust counters and watermark for pctcpu calc.
568	 */
569	/*
570	 * Shift the tick count out so that the divide doesn't round away
571	 * our results.
572	 */
573	ke->ke_ticks <<= 10;
574	ke->ke_ticks = (ke->ke_ticks / (ke->ke_ltick - ke->ke_ftick)) *
575		    SCHED_CPU_TICKS;
576	ke->ke_ticks >>= 10;
577	ke->ke_ltick = ticks;
578	ke->ke_ftick = ke->ke_ltick - SCHED_CPU_TICKS;
579}
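
/*
 * Worked example with hz = 100, so SCHED_CPU_TICKS = 1000: if the window
 * has grown to 1100 ticks and the kse ran for 550 of them, then
 *	((550 << 10) / 1100) * 1000 >> 10 = 500
 * and ke_ticks becomes 500 over a fresh 1000 tick window, preserving the
 * roughly 50% share.  The << 10 / >> 10 pair only keeps the intermediate
 * division from rounding the ratio away.
 */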
580
581#ifdef SMP
582/* XXX Should be changed to kseq_load_lowest() */
583int
584sched_pickcpu(void)
585{
586	struct kseq *kseq;
587	int load;
588	int cpu;
589	int i;
590
591	if (!smp_started)
592		return (0);
593
	load = -1;
	cpu = 0;
596
597	for (i = 0; i < mp_maxid; i++) {
598		if (CPU_ABSENT(i))
599			continue;
600		kseq = KSEQ_CPU(i);
		if (load == -1 || kseq->ksq_tsload < load) {
602			cpu = i;
603			load = kseq->ksq_tsload;
604		}
605	}
606
607	CTR1(KTR_RUNQ, "sched_pickcpu: %d", cpu);
608	return (cpu);
609}
610#else
611int
612sched_pickcpu(void)
613{
614	return (0);
615}
616#endif
617
618void
619sched_prio(struct thread *td, u_char prio)
620{
621	struct kse *ke;
622	struct runq *rq;
623
624	mtx_assert(&sched_lock, MA_OWNED);
625	ke = td->td_kse;
626	td->td_priority = prio;
627
628	if (TD_ON_RUNQ(td)) {
629		rq = ke->ke_runq;
630
631		runq_remove(rq, ke);
632		runq_add(rq, ke);
633	}
634}
635
636void
637sched_switchout(struct thread *td)
638{
639	struct kse *ke;
640
641	mtx_assert(&sched_lock, MA_OWNED);
642
643	ke = td->td_kse;
644
645	td->td_last_kse = ke;
646        td->td_lastcpu = ke->ke_oncpu;
647	ke->ke_oncpu = NOCPU;
648        td->td_flags &= ~TDF_NEEDRESCHED;
649
650	if (TD_IS_RUNNING(td)) {
651		setrunqueue(td);
652		return;
653	}
654	td->td_kse->ke_runq = NULL;
655
656	/*
657	 * We will not be on the run queue. So we must be
658	 * sleeping or similar.
659	 */
660	if (td->td_proc->p_flag & P_THREADED)
661		kse_reassign(ke);
662}
663
664void
665sched_switchin(struct thread *td)
666{
667	/* struct kse *ke = td->td_kse; */
668	mtx_assert(&sched_lock, MA_OWNED);
669
670	td->td_kse->ke_oncpu = PCPU_GET(cpuid);
671#if SCHED_STRICT_RESCHED
672	if (td->td_ksegrp->kg_pri_class == PRI_TIMESHARE &&
673	    td->td_priority != td->td_ksegrp->kg_user_pri)
674		curthread->td_flags |= TDF_NEEDRESCHED;
675#endif
676}
677
678void
679sched_nice(struct ksegrp *kg, int nice)
680{
681	struct thread *td;
682
683	kg->kg_nice = nice;
684	sched_priority(kg);
685	FOREACH_THREAD_IN_GROUP(kg, td) {
686		td->td_flags |= TDF_NEEDRESCHED;
687	}
688}
689
690void
691sched_sleep(struct thread *td, u_char prio)
692{
693	mtx_assert(&sched_lock, MA_OWNED);
694
695	td->td_slptime = ticks;
696	td->td_priority = prio;
697
698#ifdef SMP
699	if (td->td_priority < PZERO) {
700		kseq_sleep(KSEQ_CPU(td->td_kse->ke_cpu), td->td_kse);
701		td->td_schedflag |= TD_SCHED_BLOAD;
702	}
703#endif
704}
705
706void
707sched_wakeup(struct thread *td)
708{
709	mtx_assert(&sched_lock, MA_OWNED);
710
711	/*
712	 * Let the kseg know how long we slept for.  This is because process
713	 * interactivity behavior is modeled in the kseg.
714	 */
715	if (td->td_slptime) {
716		struct ksegrp *kg;
717
718		kg = td->td_ksegrp;
719		kg->kg_slptime += (ticks - td->td_slptime) << 10;
720		sched_priority(kg);
721		td->td_slptime = 0;
722	}
723#ifdef SMP
724	if (td->td_priority < PZERO && td->td_schedflag & TD_SCHED_BLOAD) {
725		kseq_wakeup(KSEQ_CPU(td->td_kse->ke_cpu), td->td_kse);
726		td->td_schedflag &= ~TD_SCHED_BLOAD;
727	}
728#endif
729	setrunqueue(td);
730#if SCHED_STRICT_RESCHED
731        if (td->td_priority < curthread->td_priority)
732                curthread->td_flags |= TDF_NEEDRESCHED;
733#endif
734}
735
736/*
737 * Penalize the parent for creating a new child and initialize the child's
738 * priority.
739 */
740void
741sched_fork(struct ksegrp *kg, struct ksegrp *child)
742{
743	struct kse *ckse;
744	struct kse *pkse;
745
746	mtx_assert(&sched_lock, MA_OWNED);
747	ckse = FIRST_KSE_IN_KSEGRP(child);
748	pkse = FIRST_KSE_IN_KSEGRP(kg);
749
750	/* XXX Need something better here */
751	if (kg->kg_slptime > kg->kg_runtime) {
752		child->kg_slptime = SCHED_DYN_RANGE;
753		child->kg_runtime = kg->kg_slptime / SCHED_DYN_RANGE;
754	} else {
755		child->kg_runtime = SCHED_DYN_RANGE;
756		child->kg_slptime = kg->kg_runtime / SCHED_DYN_RANGE;
757	}
758#if 0
759	child->kg_slptime = kg->kg_slptime;
760	child->kg_runtime = kg->kg_runtime;
761#endif
762	child->kg_user_pri = kg->kg_user_pri;
763
764#if 0
765	if (pkse->ke_cpu != PCPU_GET(cpuid)) {
766		printf("pkse->ke_cpu = %d\n", pkse->ke_cpu);
767		printf("cpuid = %d", PCPU_GET(cpuid));
768		Debugger("stop");
769	}
770#endif
771
772	ckse->ke_slice = pkse->ke_slice;
773	ckse->ke_cpu = pkse->ke_cpu; /* sched_pickcpu(); */
774	ckse->ke_runq = NULL;
775	/*
776	 * Claim that we've been running for one second for statistical
777	 * purposes.
778	 */
779	ckse->ke_ticks = 0;
780	ckse->ke_ltick = ticks;
781	ckse->ke_ftick = ticks - hz;
782}
783
784/*
785 * Return some of the child's priority and interactivity to the parent.
786 */
787void
788sched_exit(struct ksegrp *kg, struct ksegrp *child)
789{
790	/* XXX Need something better here */
791	mtx_assert(&sched_lock, MA_OWNED);
792#if 0
793	kg->kg_slptime = child->kg_slptime;
794	kg->kg_runtime = child->kg_runtime;
795	sched_priority(kg);
796#endif
797}
798
799void
800sched_clock(struct thread *td)
801{
802	struct kse *ke;
803#if SCHED_STRICT_RESCHED
804	struct kse *nke;
805	struct kseq *kseq;
806#endif
807	struct ksegrp *kg;
808
809
810	ke = td->td_kse;
811	kg = td->td_ksegrp;
812
813	mtx_assert(&sched_lock, MA_OWNED);
814	KASSERT((td != NULL), ("schedclock: null thread pointer"));
815
816	/* Adjust ticks for pctcpu */
817	ke->ke_ticks++;
818	ke->ke_ltick = ticks;
819
820	/* Go up to one second beyond our max and then trim back down */
821	if (ke->ke_ftick + SCHED_CPU_TICKS + hz < ke->ke_ltick)
822		sched_pctcpu_update(ke);
823
824	if (td->td_kse->ke_flags & KEF_IDLEKSE)
825		return;
826
827	/*
828	 * Check for a higher priority task on the run queue.  This can happen
829	 * on SMP if another processor woke up a process on our runq.
830	 */
831#if SCHED_STRICT_RESCHED
832	kseq = KSEQ_SELF();
833	nke = runq_choose(kseq->ksq_curr);
834
835	if (nke && nke->ke_thread &&
836	    nke->ke_thread->td_priority < td->td_priority)
837		td->td_flags |= TDF_NEEDRESCHED;
838#endif
839	/*
840	 * We only do slicing code for TIMESHARE ksegrps.
841	 */
842	if (kg->kg_pri_class != PRI_TIMESHARE)
843		return;
844	/*
	 * We used a tick; charge it to the ksegrp so that we can compute our
846	 * "interactivity".
847	 */
848	kg->kg_runtime += 1 << 10;
849
850	/*
851	 * We used up one time slice.
852	 */
853	ke->ke_slice--;
854	/*
855	 * We're out of time, recompute priorities and requeue.  We'll get a
856	 * new slice when we're put back on the run queue.
857	 */
858	if (ke->ke_slice <= 0) {
859		sched_priority(kg);
860		td->td_flags |= TDF_NEEDRESCHED;
861		ke->ke_runq = NULL;
862	}
863}
864
865int
866sched_runnable(void)
867{
868	struct kseq *kseq;
869
870	kseq = KSEQ_SELF();
871
872	if (kseq->ksq_tsload || kseq->ksq_idload || kseq->ksq_itload)
873		return (1);
874#ifdef SMP
875	/*
	 * For SMP we may steal another processor's KSEs.  Just search until we
	 * verify that at least one other cpu has a runnable task.
878	 */
879	if (smp_started) {
880		int i;
881
882#if 0
883		if (kseq->ksq_bload)
884			return (0);
885#endif
886
887		for (i = 0; i < mp_maxid; i++) {
888			if (CPU_ABSENT(i))
889				continue;
890			kseq = KSEQ_CPU(i);
891			if (kseq->ksq_tsload)
892				return (1);
893		}
894	}
895#endif
896	return (0);
897}
898
899void
900sched_userret(struct thread *td)
901{
902	struct ksegrp *kg;
903
904	kg = td->td_ksegrp;
905
906	if (td->td_priority != kg->kg_user_pri) {
907		mtx_lock_spin(&sched_lock);
908		td->td_priority = kg->kg_user_pri;
909		mtx_unlock_spin(&sched_lock);
910	}
911}
912
913struct kse *
914sched_choose(void)
915{
916	struct kseq *kseq;
917	struct kse *ke;
918
919	kseq = KSEQ_SELF();
920retry:
921	ke = kseq_choose(kseq);
922
923	if (ke) {
924		ke->ke_state = KES_THREAD;
925		kseq_rem(kseq, ke);
926
927		/*
928		 * If we dequeue a kse with a slice of zero it was below the
929		 * nice threshold to acquire a slice.  Force it on to the
930		 * next run queue and let kseq_add() pick a new slice.
931		 *
932		 * XXX This code should live in a TIMESHARE specific section.
933		 */
934		if (ke->ke_slice == 0) {
935			ke->ke_runq = kseq->ksq_next;
936			kseq_add(kseq, ke);
937			goto retry;
938		}
939	}
940
941#ifdef SMP
942	if (ke == NULL && smp_started) {
943#if 0
944		if (kseq->ksq_bload)
945			return (NULL);
946#endif
947		/*
948		 * Find the cpu with the highest load and steal one proc.
949		 */
950		kseq = kseq_load_highest();
951		if (kseq == NULL)
952			return (NULL);
953		/*
954		 * XXX Do we want to migrate interrupt or realtime threads?
		 * Currently we'll only try to steal if there is a TIMESHARE
		 * thread available, but kseq_choose() below may still hand
		 * back a REALTIME or interrupt thread from that cpu.
957		 */
958		ke = kseq_choose(kseq);
959		kseq_rem(kseq, ke);
960
961		ke->ke_state = KES_THREAD;
962		ke->ke_runq = NULL;
963		ke->ke_cpu = PCPU_GET(cpuid);
964	}
965#endif
966	return (ke);
967}
968
969void
970sched_add(struct kse *ke)
971{
972	struct kseq *kseq;
973
974	mtx_assert(&sched_lock, MA_OWNED);
975	KASSERT((ke->ke_thread != NULL), ("sched_add: No thread on KSE"));
976	KASSERT((ke->ke_thread->td_kse != NULL),
977	    ("sched_add: No KSE on thread"));
978	KASSERT(ke->ke_state != KES_ONRUNQ,
979	    ("sched_add: kse %p (%s) already in run queue", ke,
980	    ke->ke_proc->p_comm));
981	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
982	    ("sched_add: process swapped out"));
983
984	switch (ke->ke_ksegrp->kg_pri_class) {
985	case PRI_ITHD:
986	case PRI_REALTIME:
987		kseq = KSEQ_SELF();
988		break;
989	case PRI_TIMESHARE:
990	case PRI_IDLE:
991	default:
992		kseq = KSEQ_CPU(ke->ke_cpu);
993		break;
994	}
995
996	ke->ke_ksegrp->kg_runq_kses++;
997	ke->ke_state = KES_ONRUNQ;
998
999	kseq_add(kseq, ke);
1000}
1001
1002void
1003sched_rem(struct kse *ke)
1004{
1005	mtx_assert(&sched_lock, MA_OWNED);
1006	/* KASSERT((ke->ke_state == KES_ONRUNQ), ("KSE not on run queue")); */
1007
1008	ke->ke_runq = NULL;
1009	ke->ke_state = KES_THREAD;
1010	ke->ke_ksegrp->kg_runq_kses--;
1011
1012	kseq_rem(KSEQ_CPU(ke->ke_cpu), ke);
1013}
1014
1015fixpt_t
1016sched_pctcpu(struct kse *ke)
1017{
1018	fixpt_t pctcpu;
1019	int realstathz;
1020
1021	pctcpu = 0;
1022	realstathz = stathz ? stathz : hz;
1023
1024	if (ke->ke_ticks) {
1025		int rtick;
1026
1027		/* Update to account for time potentially spent sleeping */
1028		ke->ke_ltick = ticks;
1029		sched_pctcpu_update(ke);
1030
1031		/* How many rtick per second ? */
1032		rtick = ke->ke_ticks / SCHED_CPU_TIME;
1033		pctcpu = (FSCALE * ((FSCALE * rtick)/realstathz)) >> FSHIFT;
1034	}
1035
1036	ke->ke_proc->p_swtime = ke->ke_ltick - ke->ke_ftick;
1037
1038	return (pctcpu);
1039}
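
/*
 * rtick is the number of ticks this kse accumulated per second of the
 * window, so the result is FSCALE * rtick / realstathz: the fraction of a
 * full cpu in the fixed point format the old p_pctcpu code used.  For
 * example, rtick = 50 with realstathz = 100 yields about FSCALE / 2,
 * i.e. roughly 50% cpu.
 */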
1040
1041int
1042sched_sizeof_kse(void)
1043{
1044	return (sizeof(struct kse) + sizeof(struct ke_sched));
1045}
1046
1047int
1048sched_sizeof_ksegrp(void)
1049{
1050	return (sizeof(struct ksegrp) + sizeof(struct kg_sched));
1051}
1052
1053int
1054sched_sizeof_proc(void)
1055{
1056	return (sizeof(struct proc));
1057}
1058
1059int
1060sched_sizeof_thread(void)
1061{
1062	return (sizeof(struct thread) + sizeof(struct td_sched));
1063}
1064