sched_ule.c revision 111857
1/*-
2 * Copyright (c) 2003, Jeffrey Roberson <jeff@freebsd.org>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice unmodified, this list of conditions, and the following
10 *    disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
19 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 *
26 * $FreeBSD: head/sys/kern/sched_ule.c 111857 2003-03-04 02:45:59Z jeff $
27 */
28
29#include <sys/param.h>
30#include <sys/systm.h>
31#include <sys/kernel.h>
32#include <sys/ktr.h>
33#include <sys/lock.h>
34#include <sys/mutex.h>
35#include <sys/proc.h>
36#include <sys/sched.h>
37#include <sys/smp.h>
38#include <sys/sx.h>
39#include <sys/sysctl.h>
40#include <sys/sysproto.h>
41#include <sys/vmmeter.h>
42#ifdef DDB
43#include <ddb/ddb.h>
44#endif
45#ifdef KTRACE
46#include <sys/uio.h>
47#include <sys/ktrace.h>
48#endif
49
50#include <machine/cpu.h>
51
52/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
53/* XXX This is bogus compatibility crap for ps */
54static fixpt_t  ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
55SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
56
57static void sched_setup(void *dummy);
58SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL)
59
60int realstathz;
61
62#define	SCHED_STRICT_RESCHED 1
63
64/*
65 * These data structures are allocated within their parent data structure
66 * but are scheduler specific.
67 */
68
69struct ke_sched {
70	int		ske_slice;
71	struct runq	*ske_runq;
72	/* The following variables are only used for pctcpu calculation */
73	int		ske_ltick;	/* Last tick that we were running on */
74	int		ske_ftick;	/* First tick that we were running on */
75	int		ske_ticks;	/* Tick count */
76	u_char		ske_cpu;
77};
78#define	ke_slice	ke_sched->ske_slice
79#define	ke_runq		ke_sched->ske_runq
80#define	ke_ltick	ke_sched->ske_ltick
81#define	ke_ftick	ke_sched->ske_ftick
82#define	ke_ticks	ke_sched->ske_ticks
83#define	ke_cpu		ke_sched->ske_cpu
84
85struct kg_sched {
86	int	skg_slptime;		/* Number of ticks we vol. slept */
87	int	skg_runtime;		/* Number of ticks we were running */
88};
89#define	kg_slptime	kg_sched->skg_slptime
90#define	kg_runtime	kg_sched->skg_runtime
91
92struct td_sched {
93	int	std_slptime;
94	int	std_schedflag;
95};
96#define	td_slptime	td_sched->std_slptime
97#define	td_schedflag	td_sched->std_schedflag
98
99#define	TD_SCHED_BLOAD	0x0001		/*
100					 * thread was counted as being in short
101					 * term sleep.
102					 */
103struct td_sched td_sched;
104struct ke_sched ke_sched;
105struct kg_sched kg_sched;
106
107struct ke_sched *kse0_sched = &ke_sched;
108struct kg_sched *ksegrp0_sched = &kg_sched;
109struct p_sched *proc0_sched = NULL;
110struct td_sched *thread0_sched = &td_sched;
111
112/*
113 * This priority range has 20 priorities on either end that are reachable
114 * only through nice values.
115 *
116 * PRI_RANGE:	Total priority range for timeshare threads.
117 * PRI_NRESV:	Reserved priorities for nice.
118 * PRI_BASE:	The start of the dynamic range.
119 * DYN_RANGE:	Number of priorities that are available in the dynamic
120 *		priority range.
121 * DYN_HALF:	Half of DYN_RANGE for convenience elsewhere.
122 * PRI_DYN:	The dynamic priority, which is derived from the number of
123 *		ticks running vs. the total number of ticks.
124 */
125#define	SCHED_PRI_RANGE		(PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1)
126#define	SCHED_PRI_NRESV		40
127#define	SCHED_PRI_BASE		((SCHED_PRI_NRESV / 2) + PRI_MIN_TIMESHARE)
128#define	SCHED_DYN_RANGE		(SCHED_PRI_RANGE - SCHED_PRI_NRESV)
129#define	SCHED_DYN_HALF		(SCHED_DYN_RANGE / 2)
130#define	SCHED_PRI_DYN(run, total)	(((run) * SCHED_DYN_RANGE) / (total))
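
/*
 * A rough worked example of the macros above, assuming the stock
 * priority.h values of this era (PRI_MIN_TIMESHARE 160 and
 * PRI_MAX_TIMESHARE 223); the exact figures are illustrative only:
 *
 *	SCHED_PRI_RANGE = 223 - 160 + 1 = 64
 *	SCHED_PRI_BASE  = (40 / 2) + 160 = 180
 *	SCHED_DYN_RANGE = 64 - 40 = 24
 *	SCHED_DYN_HALF  = 12
 *
 * A kseg that ran for 30 of the last 120 ticks would then map to a
 * dynamic priority of SCHED_PRI_DYN(30, 120) = (30 * 24) / 120 = 6.
 */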
131
132
133/*
134 * These determine the interactivity of a process.
135 *
136 * SLP_RUN_MAX:	Maximum amount of sleep time + run time we'll accumulate
137 *		before throttling back.
138 * SLP_RUN_THROTTLE:	Divisor for reducing slp/run time.
139 * INTERACT_RANGE:	Range of interactivity values.  Smaller is better.
140 * INTERACT_HALF:	Convenience define, half of the interactivity range.
141 * INTERACT_THRESH:	Threshold for placement on the current runq.
142 */
143#define	SCHED_SLP_RUN_MAX	((hz * 30) << 10)
144#define	SCHED_SLP_RUN_THROTTLE	(10)
145#define	SCHED_INTERACT_RANGE	(100)
146#define	SCHED_INTERACT_HALF	(SCHED_INTERACT_RANGE / 2)
147#define	SCHED_INTERACT_THRESH	(10)
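
/*
 * A rough sense of scale for the values above, assuming hz = 100 (the
 * common default at the time):  SCHED_SLP_RUN_MAX is (100 * 30) << 10,
 * i.e. thirty seconds of combined sleep and run time in the << 10 fixed
 * point units used for kg_slptime and kg_runtime.  Once a kseg exceeds
 * this, sched_slice() divides both values by SCHED_SLP_RUN_THROTTLE,
 * preserving the sleep/run ratio while forgetting older history.
 */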
148
149/*
150 * These parameters and macros determine the size of the time slice that is
151 * granted to each thread.
152 *
153 * SLICE_MIN:	Minimum time slice granted, in units of ticks.
154 * SLICE_MAX:	Maximum time slice granted.
155 * SLICE_RANGE:	Range of available time slices scaled by hz.
156 * SLICE_SCALE:	The number of slices granted per unit of pri or slp.
157 * PRI_TOSLICE:	Compute a slice size that is proportional to the priority.
158 * INTERACT_TOSLICE:	Compute a slice size that is inversely proportional to
159 *		the amount of time slept. (smaller slices for interactive ksegs)
160 * PRI_COMP:	This determines what fraction of the actual slice comes from
161 *		the slice size computed from the priority.
162 * INTERACT_COMP:	This determines what fraction of the actual slice comes
163 *		from the slice size computed from the interactivity score.
164 */
165#define	SCHED_SLICE_MIN			(hz / 100)
166#define	SCHED_SLICE_MAX			(hz / 10)
167#define	SCHED_SLICE_RANGE		(SCHED_SLICE_MAX - SCHED_SLICE_MIN + 1)
168#define	SCHED_SLICE_SCALE(val, max)	(((val) * SCHED_SLICE_RANGE) / (max))
169#define	SCHED_INTERACT_COMP(slice)	((slice) / 2)	/* 50% */
170#define	SCHED_PRI_COMP(slice)		((slice) / 2)	/* 50% */
171#define	SCHED_PRI_TOSLICE(pri)						\
172    (SCHED_SLICE_MAX - SCHED_SLICE_SCALE((pri), SCHED_PRI_RANGE))
173#define	SCHED_INTERACT_TOSLICE(score)					\
174    (SCHED_SLICE_SCALE((score), SCHED_INTERACT_RANGE))
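
/*
 * A rough worked example of the slice macros, again assuming hz = 100 and
 * the 64-priority timeshare range from the example above; the figures are
 * illustrative only:
 *
 *	SCHED_SLICE_MIN = 1 tick, SCHED_SLICE_MAX = 10 ticks,
 *	SCHED_SLICE_RANGE = 10
 *
 * For a kseg with kg_user_pri 180 (an offset of 20 into the range) and an
 * interactivity score of 5:
 *
 *	pslice = SCHED_PRI_TOSLICE(20)     = 10 - (20 * 10) / 64 = 7
 *	islice = SCHED_INTERACT_TOSLICE(5) = (5 * 10) / 100      = 0
 *	slice  = islice / 2 + pslice / 2   = 0 + 3                = 3 ticks
 *
 * which sched_slice() below clamps to [SCHED_SLICE_MIN, SCHED_SLICE_MAX].
 */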
175
176/*
177 * This macro determines whether or not the kse belongs on the current or
178 * next run queue.
179 *
180 * XXX nice value should affect how interactive a kg is.
181 */
182#define	SCHED_CURR(kg)	(sched_interact_score(kg) < SCHED_INTERACT_THRESH)
183
184/*
185 * Cpu percentage computation macros and defines.
186 *
187 * SCHED_CPU_TIME:	Number of seconds to average the cpu usage across.
188 * SCHED_CPU_TICKS:	Number of hz ticks to average the cpu usage across.
189 */
190
191#define	SCHED_CPU_TIME	60
192#define	SCHED_CPU_TICKS	(hz * SCHED_CPU_TIME)
193
194/*
195 * kseq - pair of runqs per processor
196 */
197
198struct kseq {
199	struct runq	ksq_runqs[2];
200	struct runq	*ksq_curr;
201	struct runq	*ksq_next;
202	int		ksq_load;	/* Total runnable */
203#ifdef SMP
204	unsigned int	ksq_rslices;	/* Slices on run queue */
205	unsigned int	ksq_bload;	/* Threads waiting on IO */
206#endif
207};
208
209/*
210 * One kse queue per processor.
211 */
212#ifdef SMP
213struct kseq	kseq_cpu[MAXCPU];
214#define	KSEQ_SELF()	(&kseq_cpu[PCPU_GET(cpuid)])
215#define	KSEQ_CPU(x)	(&kseq_cpu[(x)])
216#else
217struct kseq	kseq_cpu;
218#define	KSEQ_SELF()	(&kseq_cpu)
219#define	KSEQ_CPU(x)	(&kseq_cpu)
220#endif
221
222static int sched_slice(struct ksegrp *kg);
223static int sched_priority(struct ksegrp *kg);
224static int sched_interact_score(struct ksegrp *kg);
225void sched_pctcpu_update(struct kse *ke);
226int sched_pickcpu(void);
227
228/* Operations on per processor queues */
229static struct kse * kseq_choose(struct kseq *kseq);
230static void kseq_setup(struct kseq *kseq);
231static __inline void kseq_add(struct kseq *kseq, struct kse *ke);
232static __inline void kseq_rem(struct kseq *kseq, struct kse *ke);
233#ifdef SMP
234static __inline void kseq_sleep(struct kseq *kseq, struct kse *ke);
235static __inline void kseq_wakeup(struct kseq *kseq, struct kse *ke);
236struct kseq * kseq_load_highest(void);
237#endif
238
239static __inline void
240kseq_add(struct kseq *kseq, struct kse *ke)
241{
242	runq_add(ke->ke_runq, ke);
243	kseq->ksq_load++;
244#ifdef SMP
245	kseq->ksq_rslices += ke->ke_slice;
246#endif
247}
248static __inline void
249kseq_rem(struct kseq *kseq, struct kse *ke)
250{
251	kseq->ksq_load--;
252	runq_remove(ke->ke_runq, ke);
253#ifdef SMP
254	kseq->ksq_rslices -= ke->ke_slice;
255#endif
256}
257
258#ifdef SMP
259static __inline void
260kseq_sleep(struct kseq *kseq, struct kse *ke)
261{
262	kseq->ksq_bload++;
263}
264
265static __inline void
266kseq_wakeup(struct kseq *kseq, struct kse *ke)
267{
268	kseq->ksq_bload--;
269}
270
271struct kseq *
272kseq_load_highest(void)
273{
274	struct kseq *kseq;
275	int load;
276	int cpu;
277	int i;
278
279	cpu = 0;
280	load = 0;
281
282	for (i = 0; i < mp_maxid; i++) {
283		if (CPU_ABSENT(i))
284			continue;
285		kseq = KSEQ_CPU(i);
286		if (kseq->ksq_load > load) {
287			load = kseq->ksq_load;
288			cpu = i;
289		}
290	}
291	if (load)
292		return (KSEQ_CPU(cpu));
293
294	return (NULL);
295}
296#endif
297
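/*
 * Pick the next kse to run from a kseq.  KSEs are taken from the current
 * queue; once it is empty, the current and next queues are swapped, so
 * everything that was queued on the next queue gets a turn before anything
 * requeued behind it, which bounds how long a runnable kse can wait.
 */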
298struct kse *
299kseq_choose(struct kseq *kseq)
300{
301	struct kse *ke;
302	struct runq *swap;
303
304	if ((ke = runq_choose(kseq->ksq_curr)) == NULL) {
305		swap = kseq->ksq_curr;
306		kseq->ksq_curr = kseq->ksq_next;
307		kseq->ksq_next = swap;
308		ke = runq_choose(kseq->ksq_curr);
309	}
310
311	return (ke);
312}
313
314
315static void
316kseq_setup(struct kseq *kseq)
317{
318	kseq->ksq_curr = &kseq->ksq_runqs[0];
319	kseq->ksq_next = &kseq->ksq_runqs[1];
320	runq_init(kseq->ksq_curr);
321	runq_init(kseq->ksq_next);
322	kseq->ksq_load = 0;
323#ifdef SMP
324	kseq->ksq_rslices = 0;
325	kseq->ksq_bload = 0;
326#endif
327}
328
329static void
330sched_setup(void *dummy)
331{
332	int i;
333
334	realstathz = stathz ? stathz : hz;
335
336	mtx_lock_spin(&sched_lock);
337	/* init kseqs */
338	for (i = 0; i < MAXCPU; i++)
339		kseq_setup(KSEQ_CPU(i));
340	mtx_unlock_spin(&sched_lock);
341}
342
343/*
344 * Scale the scheduling priority according to the "interactivity" of this
345 * process.
346 */
347static int
348sched_priority(struct ksegrp *kg)
349{
350	int pri;
351
352	if (kg->kg_pri_class != PRI_TIMESHARE)
353		return (kg->kg_user_pri);
354
355	pri = sched_interact_score(kg) * SCHED_DYN_RANGE / SCHED_INTERACT_RANGE;
356	pri += SCHED_PRI_BASE;
357	pri += kg->kg_nice;
358
359	if (pri > PRI_MAX_TIMESHARE)
360		pri = PRI_MAX_TIMESHARE;
361	else if (pri < PRI_MIN_TIMESHARE)
362		pri = PRI_MIN_TIMESHARE;
363
364	kg->kg_user_pri = pri;
365
366	return (kg->kg_user_pri);
367}
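
/*
 * With the example figures used above (SCHED_DYN_RANGE 24, SCHED_PRI_BASE
 * 180), sched_priority() roughly works out to:
 *
 *	score  5, nice  0:	pri = (5 * 24) / 100 + 180 + 0  = 181
 *	score 55, nice  0:	pri = (55 * 24) / 100 + 180 + 0 = 193
 *	score 55, nice 20:	pri = 13 + 180 + 20             = 213
 *
 * with the result clamped to the timeshare range in every case.
 */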
368
369/*
370 * Calculate a time slice based on the process priority.
371 */
372static int
373sched_slice(struct ksegrp *kg)
374{
375	int pslice;
376	int islice;
377	int slice;
378	int pri;
379
380	pri = kg->kg_user_pri;
381	pri -= PRI_MIN_TIMESHARE;
382	pslice = SCHED_PRI_TOSLICE(pri);
383	islice = SCHED_INTERACT_TOSLICE(sched_interact_score(kg));
384	slice = SCHED_INTERACT_COMP(islice) + SCHED_PRI_COMP(pslice);
385
386	if (slice < SCHED_SLICE_MIN)
387		slice = SCHED_SLICE_MIN;
388	else if (slice > SCHED_SLICE_MAX)
389		slice = SCHED_SLICE_MAX;
390
391	/*
392	 * Every time we grant a new slice, check to see if we need to scale
393	 * back the slp and run time in the kg.  This will cause us to forget
394	 * old interactivity while maintaining the current ratio.
395	 */
396	if ((kg->kg_runtime + kg->kg_slptime) >  SCHED_SLP_RUN_MAX) {
397		kg->kg_runtime /= SCHED_SLP_RUN_THROTTLE;
398		kg->kg_slptime /= SCHED_SLP_RUN_THROTTLE;
399	}
400
401	return (slice);
402}
403
404static int
405sched_interact_score(struct ksegrp *kg)
406{
407	int big;
408	int small;
409	int base;
410
411	if (kg->kg_runtime > kg->kg_slptime) {
412		big = kg->kg_runtime;
413		small = kg->kg_slptime;
414		base = SCHED_INTERACT_HALF;
415	} else {
416		big = kg->kg_slptime;
417		small = kg->kg_runtime;
418		base = 0;
419	}
420
421	big /= SCHED_INTERACT_HALF;
422	if (big != 0)
423		small /= big;
424	else
425		small = 0;
426
427	small += base;
428	/* XXX Factor in nice */
429	return (small);
430}
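
/*
 * Two rough examples of the score, assuming hz = 100 and recalling that
 * kg_slptime and kg_runtime accumulate as ticks << 10:
 *
 * A kseg that slept ~9s and ran ~1s (slptime 921600, runtime 102400):
 *	big = 921600 / 50 = 18432, small = 102400 / 18432 = 5, base = 0,
 *	so the score is 5.  That is below SCHED_INTERACT_THRESH, so
 *	SCHED_CURR() is true and the kse is queued on the current runq.
 *
 * A kseg that ran ~9s and slept ~1s (runtime 921600, slptime 102400):
 *	big = 18432, small = 5, base = SCHED_INTERACT_HALF, so the score
 *	is 55 and the kse lands on the next runq instead.
 */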
431
432int
433sched_rr_interval(void)
434{
435	return (SCHED_SLICE_MAX);
436}
437
438void
439sched_pctcpu_update(struct kse *ke)
440{
441	/*
442	 * Adjust counters and watermark for pctcpu calc.
443	 */
444	/*
445	 * Shift the tick count out so that the divide doesn't round away
446	 * our results.
447	 */
448	ke->ke_ticks <<= 10;
449	ke->ke_ticks = (ke->ke_ticks / (ke->ke_ltick - ke->ke_ftick)) *
450		    SCHED_CPU_TICKS;
451	ke->ke_ticks >>= 10;
452	ke->ke_ltick = ticks;
453	ke->ke_ftick = ke->ke_ltick - SCHED_CPU_TICKS;
454}
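
/*
 * A rough example of the rescaling above, assuming hz = 100 so that
 * SCHED_CPU_TICKS is 6000:  a kse with ke_ticks = 1525 across a 6100 tick
 * window (ke_ltick - ke_ftick) becomes
 *
 *	(((1525 << 10) / 6100) * 6000) >> 10 = (256 * 6000) >> 10 = 1500
 *
 * i.e. the same ~25% cpu share expressed over a fresh 6000 tick window,
 * after which the window is slid forward to end at the current tick.
 */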
455
456#ifdef SMP
457/* XXX Should be changed to kseq_load_lowest() */
458int
459sched_pickcpu(void)
460{
461	struct kseq *kseq;
462	int load;
463	int cpu;
464	int i;
465
466	if (!smp_started)
467		return (0);
468
469	load = 0;
470	cpu = 0;
471
472	for (i = 0; i < mp_maxid; i++) {
473		if (CPU_ABSENT(i))
474			continue;
475		kseq = KSEQ_CPU(i);
476		if (kseq->ksq_load < load) {
477			cpu = i;
478			load = kseq->ksq_load;
479		}
480	}
481
482	CTR1(KTR_RUNQ, "sched_pickcpu: %d", cpu);
483	return (cpu);
484}
485#else
486int
487sched_pickcpu(void)
488{
489	return (0);
490}
491#endif
492
493void
494sched_prio(struct thread *td, u_char prio)
495{
496	struct kse *ke;
497	struct runq *rq;
498
499	mtx_assert(&sched_lock, MA_OWNED);
500	ke = td->td_kse;
501	td->td_priority = prio;
502
503	if (TD_ON_RUNQ(td)) {
504		rq = ke->ke_runq;
505
506		runq_remove(rq, ke);
507		runq_add(rq, ke);
508	}
509}
510
511void
512sched_switchout(struct thread *td)
513{
514	struct kse *ke;
515
516	mtx_assert(&sched_lock, MA_OWNED);
517
518	ke = td->td_kse;
519
520	td->td_last_kse = ke;
521	td->td_lastcpu = ke->ke_oncpu;
522	ke->ke_oncpu = NOCPU;
523	td->td_flags &= ~TDF_NEEDRESCHED;
524
525	if (TD_IS_RUNNING(td)) {
526		setrunqueue(td);
527		return;
528	}
529	td->td_kse->ke_runq = NULL;
530
531	/*
532	 * We will not be on the run queue. So we must be
533	 * sleeping or similar.
534	 */
535	if (td->td_proc->p_flag & P_THREADED)
536		kse_reassign(ke);
537}
538
539void
540sched_switchin(struct thread *td)
541{
542	/* struct kse *ke = td->td_kse; */
543	mtx_assert(&sched_lock, MA_OWNED);
544
545	td->td_kse->ke_oncpu = PCPU_GET(cpuid);
546#if SCHED_STRICT_RESCHED
547	if (td->td_ksegrp->kg_pri_class == PRI_TIMESHARE &&
548	    td->td_priority != td->td_ksegrp->kg_user_pri)
549		curthread->td_flags |= TDF_NEEDRESCHED;
550#endif
551}
552
553void
554sched_nice(struct ksegrp *kg, int nice)
555{
556	struct thread *td;
557
558	kg->kg_nice = nice;
559	sched_priority(kg);
560	FOREACH_THREAD_IN_GROUP(kg, td) {
561		td->td_flags |= TDF_NEEDRESCHED;
562	}
563}
564
565void
566sched_sleep(struct thread *td, u_char prio)
567{
568	mtx_assert(&sched_lock, MA_OWNED);
569
570	td->td_slptime = ticks;
571	td->td_priority = prio;
572
573#ifdef SMP
574	if (td->td_priority < PZERO) {
575		kseq_sleep(KSEQ_CPU(td->td_kse->ke_cpu), td->td_kse);
576		td->td_schedflag |= TD_SCHED_BLOAD;
577	}
578#endif
579}
580
581void
582sched_wakeup(struct thread *td)
583{
584	mtx_assert(&sched_lock, MA_OWNED);
585
586	/*
587	 * Let the kseg know how long we slept for.  This is because process
588	 * interactivity behavior is modeled in the kseg.
589	 */
590	if (td->td_slptime) {
591		struct ksegrp *kg;
592
593		kg = td->td_ksegrp;
594		kg->kg_slptime += (ticks - td->td_slptime) << 10;
595		sched_priority(kg);
596		td->td_slptime = 0;
597	}
598#ifdef SMP
599	if (td->td_priority < PZERO && td->td_schedflag & TD_SCHED_BLOAD) {
600		kseq_wakeup(KSEQ_CPU(td->td_kse->ke_cpu), td->td_kse);
601		td->td_schedflag &= ~TD_SCHED_BLOAD;
602	}
603#endif
604	setrunqueue(td);
605#if SCHED_STRICT_RESCHED
606	if (td->td_priority < curthread->td_priority)
607		curthread->td_flags |= TDF_NEEDRESCHED;
608#endif
609}
610
611/*
612 * Penalize the parent for creating a new child and initialize the child's
613 * priority.
614 */
615void
616sched_fork(struct ksegrp *kg, struct ksegrp *child)
617{
618	struct kse *ckse;
619	struct kse *pkse;
620
621	mtx_assert(&sched_lock, MA_OWNED);
622	ckse = FIRST_KSE_IN_KSEGRP(child);
623	pkse = FIRST_KSE_IN_KSEGRP(kg);
624
625	/* XXX Need something better here */
626	if (kg->kg_slptime > kg->kg_runtime) {
627		child->kg_slptime = SCHED_DYN_RANGE;
628		child->kg_runtime = kg->kg_slptime / SCHED_DYN_RANGE;
629	} else {
630		child->kg_runtime = SCHED_DYN_RANGE;
631		child->kg_slptime = kg->kg_runtime / SCHED_DYN_RANGE;
632	}
633#if 0
634	child->kg_slptime = kg->kg_slptime;
635	child->kg_runtime = kg->kg_runtime;
636#endif
637	child->kg_user_pri = kg->kg_user_pri;
638
639#if 0
640	if (pkse->ke_cpu != PCPU_GET(cpuid)) {
641		printf("pkse->ke_cpu = %d\n", pkse->ke_cpu);
642		printf("cpuid = %d", PCPU_GET(cpuid));
643		Debugger("stop");
644	}
645#endif
646
647	ckse->ke_slice = pkse->ke_slice;
648	ckse->ke_cpu = pkse->ke_cpu; /* sched_pickcpu(); */
649	ckse->ke_runq = NULL;
650	/*
651	 * Claim that we've been running for one second for statistical
652	 * purposes.
653	 */
654	ckse->ke_ticks = 0;
655	ckse->ke_ltick = ticks;
656	ckse->ke_ftick = ticks - hz;
657}
658
659/*
660 * Return some of the child's priority and interactivity to the parent.
661 */
662void
663sched_exit(struct ksegrp *kg, struct ksegrp *child)
664{
665	/* XXX Need something better here */
666	mtx_assert(&sched_lock, MA_OWNED);
667	kg->kg_slptime = child->kg_slptime;
668	kg->kg_runtime = child->kg_runtime;
669	sched_priority(kg);
670}
671
672void
673sched_clock(struct thread *td)
674{
675	struct kse *ke;
676#if SCHED_STRICT_RESCHED
677	struct kse *nke;
678	struct kseq *kseq;
679#endif
680	struct ksegrp *kg;
681
682
683	ke = td->td_kse;
684	kg = td->td_ksegrp;
685
686	mtx_assert(&sched_lock, MA_OWNED);
687	KASSERT((td != NULL), ("schedclock: null thread pointer"));
688
689	/* Adjust ticks for pctcpu */
690	ke->ke_ticks++;
691	ke->ke_ltick = ticks;
692	/* Go up to one second beyond our max and then trim back down */
693	if (ke->ke_ftick + SCHED_CPU_TICKS + hz < ke->ke_ltick)
694		sched_pctcpu_update(ke);
695
696	if (td->td_kse->ke_flags & KEF_IDLEKSE)
697		return;
698
699	/*
700	 * Check for a higher priority task on the run queue.  This can happen
701	 * on SMP if another processor woke up a process on our runq.
702	 */
703#if SCHED_STRICT_RESCHED
704	kseq = KSEQ_SELF();
705	nke = runq_choose(kseq->ksq_curr);
706
707	if (nke && nke->ke_thread &&
708	    nke->ke_thread->td_priority < td->td_priority)
709		td->td_flags |= TDF_NEEDRESCHED;
710#endif
711	/*
712	 * We used a tick; charge it to the ksegrp so that we can compute our
713	 * "interactivity".
714	 */
715	kg->kg_runtime += 1 << 10;
716
717	/*
718	 * We used up one time slice.
719	 */
720	ke->ke_slice--;
721	/*
722	 * We're out of time; recompute priorities and requeue.
723	 */
724	if (ke->ke_slice <= 0) {
725		sched_priority(kg);
726		ke->ke_slice = sched_slice(kg);
727		td->td_flags |= TDF_NEEDRESCHED;
728		ke->ke_runq = NULL;
729	}
730}
731
732int
733sched_runnable(void)
734{
735	struct kseq *kseq;
736
737	kseq = KSEQ_SELF();
738
739	if (kseq->ksq_load)
740		return (1);
741#ifdef SMP
742	/*
743	 * For SMP we may steal another processor's KSEs.  Just search until we
744	 * verify that at least one other cpu has a runnable task.
745	 */
746	if (smp_started) {
747		int i;
748
749#if 0
750		if (kseq->ksq_bload)
751			return (0);
752#endif
753
754		for (i = 0; i < mp_maxid; i++) {
755			if (CPU_ABSENT(i))
756				continue;
757			kseq = KSEQ_CPU(i);
758			if (kseq->ksq_load)
759				return (1);
760		}
761	}
762#endif
763	return (0);
764}
765
766void
767sched_userret(struct thread *td)
768{
769	struct ksegrp *kg;
770
771	kg = td->td_ksegrp;
772
773	if (td->td_priority != kg->kg_user_pri) {
774		mtx_lock_spin(&sched_lock);
775		td->td_priority = kg->kg_user_pri;
776		mtx_unlock_spin(&sched_lock);
777	}
778}
779
780struct kse *
781sched_choose(void)
782{
783	struct kseq *kseq;
784	struct kse *ke;
785
786	kseq = KSEQ_SELF();
787	ke = kseq_choose(kseq);
788
789	if (ke) {
790		ke->ke_state = KES_THREAD;
791		kseq_rem(kseq, ke);
792	}
793
794#ifdef SMP
795	if (ke == NULL && smp_started) {
796#if 0
797		if (kseq->ksq_bload)
798			return (NULL);
799#endif
800		/*
801		 * Find the cpu with the highest load and steal one proc.
802		 */
803		kseq = kseq_load_highest();
804		if (kseq == NULL)
805			return (NULL);
806		ke = kseq_choose(kseq);
807		kseq_rem(kseq, ke);
808
809		ke->ke_state = KES_THREAD;
810		ke->ke_runq = NULL;
811		ke->ke_cpu = PCPU_GET(cpuid);
812	}
813#endif
814	return (ke);
815}
816
817void
818sched_add(struct kse *ke)
819{
820	struct kseq *kseq;
821
822	mtx_assert(&sched_lock, MA_OWNED);
823	KASSERT((ke->ke_thread != NULL), ("sched_add: No thread on KSE"));
824	KASSERT((ke->ke_thread->td_kse != NULL),
825	    ("sched_add: No KSE on thread"));
826	KASSERT(ke->ke_state != KES_ONRUNQ,
827	    ("sched_add: kse %p (%s) already in run queue", ke,
828	    ke->ke_proc->p_comm));
829	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
830	    ("sched_add: process swapped out"));
831
832	/*
833	 * Timeshare threads get placed on the appropriate queue on their
834	 * bound cpu.
835	 */
836	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) {
837		kseq = KSEQ_CPU(ke->ke_cpu);
838
839		if (ke->ke_runq == NULL) {
840			if (SCHED_CURR(ke->ke_ksegrp))
841				ke->ke_runq = kseq->ksq_curr;
842			else
843				ke->ke_runq = kseq->ksq_next;
844		}
845	/*
846	 * If we're a real-time or interrupt thread, place us on the curr
847	 * queue for the current processor.  Hopefully this will yield the
848	 * lowest latency response.
849	 */
850	} else {
851		kseq = KSEQ_SELF();
852		ke->ke_runq = kseq->ksq_curr;
853	}
854	ke->ke_ksegrp->kg_runq_kses++;
855	ke->ke_state = KES_ONRUNQ;
856
857	kseq_add(kseq, ke);
858}
859
860void
861sched_rem(struct kse *ke)
862{
863	mtx_assert(&sched_lock, MA_OWNED);
864	/* KASSERT((ke->ke_state == KES_ONRUNQ), ("KSE not on run queue")); */
865
866	ke->ke_runq = NULL;
867	ke->ke_state = KES_THREAD;
868	ke->ke_ksegrp->kg_runq_kses--;
869
870	kseq_rem(KSEQ_CPU(ke->ke_cpu), ke);
871}
872
873fixpt_t
874sched_pctcpu(struct kse *ke)
875{
876	fixpt_t pctcpu;
877	int realstathz;
878
879	pctcpu = 0;
880	realstathz = stathz ? stathz : hz;
881
882	if (ke->ke_ticks) {
883		int rtick;
884
885		/* Update to account for time potentially spent sleeping */
886		ke->ke_ltick = ticks;
887		sched_pctcpu_update(ke);
888
889		/* How many rticks per second? */
890		rtick = ke->ke_ticks / SCHED_CPU_TIME;
891		pctcpu = (FSCALE * ((FSCALE * rtick)/realstathz)) >> FSHIFT;
892	}
893
894	ke->ke_proc->p_swtime = ke->ke_ltick - ke->ke_ftick;
895
896	return (pctcpu);
897}
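
/*
 * Continuing the example above, and assuming the usual FSHIFT of 11
 * (FSCALE 2048) with realstathz = 100:  ke_ticks = 1500 over the 60
 * second averaging window gives rtick = 1500 / 60 = 25, and
 *
 *	pctcpu = (2048 * ((2048 * 25) / 100)) >> 11 = 512
 *
 * which is 512 / 2048 = 25% expressed as a fixpt_t, matching the share
 * implied by sched_pctcpu_update() above.
 */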
898
899int
900sched_sizeof_kse(void)
901{
902	return (sizeof(struct kse) + sizeof(struct ke_sched));
903}
904
905int
906sched_sizeof_ksegrp(void)
907{
908	return (sizeof(struct ksegrp) + sizeof(struct kg_sched));
909}
910
911int
912sched_sizeof_proc(void)
913{
914	return (sizeof(struct proc));
915}
916
917int
918sched_sizeof_thread(void)
919{
920	return (sizeof(struct thread) + sizeof(struct td_sched));
921}
922