1/*-
2 * Copyright (c) 2002-2003, Jeffrey Roberson <jeff@freebsd.org>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice unmodified, this list of conditions, and the following
10 *    disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
19 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27#include <sys/cdefs.h>
28__FBSDID("$FreeBSD: head/sys/kern/sched_ule.c 121868 2003-11-02 03:36:33Z jeff $");
29
30#include <sys/param.h>
31#include <sys/systm.h>
32#include <sys/kernel.h>
33#include <sys/ktr.h>
34#include <sys/lock.h>
35#include <sys/mutex.h>
36#include <sys/proc.h>
37#include <sys/resource.h>
38#include <sys/sched.h>
39#include <sys/smp.h>
40#include <sys/sx.h>
41#include <sys/sysctl.h>
42#include <sys/sysproto.h>
43#include <sys/vmmeter.h>
44#ifdef DDB
45#include <ddb/ddb.h>
46#endif
47#ifdef KTRACE
48#include <sys/uio.h>
49#include <sys/ktrace.h>
50#endif
51
52#include <machine/cpu.h>
53#include <machine/smp.h>
54
55#define KTR_ULE         KTR_NFS
56
57/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
58/* XXX This is bogus compatibility crap for ps */
59static fixpt_t  ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
60SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
61
62static void sched_setup(void *dummy);
63SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL)
64
65static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "SCHED");
66
67static int sched_strict;
68SYSCTL_INT(_kern_sched, OID_AUTO, strict, CTLFLAG_RD, &sched_strict, 0, "");
69
70static int slice_min = 1;
71SYSCTL_INT(_kern_sched, OID_AUTO, slice_min, CTLFLAG_RW, &slice_min, 0, "");
72
73static int slice_max = 10;
74SYSCTL_INT(_kern_sched, OID_AUTO, slice_max, CTLFLAG_RW, &slice_max, 0, "");
75
76int realstathz;
77int tickincr = 1;
78
79#ifdef SMP
80/* Callout to handle load balancing SMP systems. */
81static struct callout kseq_lb_callout;
82#endif
83
84/*
85 * These data structures are allocated within their parent structure but
86 * are scheduler specific.
87 */
88
89struct ke_sched {
90	int		ske_slice;
91	struct runq	*ske_runq;
92	/* The following variables are only used for pctcpu calculation */
93	int		ske_ltick;	/* Last tick that we were running on */
94	int		ske_ftick;	/* First tick that we were running on */
95	int		ske_ticks;	/* Tick count */
96	/* CPU that we have affinity for. */
97	u_char		ske_cpu;
98};
99#define	ke_slice	ke_sched->ske_slice
100#define	ke_runq		ke_sched->ske_runq
101#define	ke_ltick	ke_sched->ske_ltick
102#define	ke_ftick	ke_sched->ske_ftick
103#define	ke_ticks	ke_sched->ske_ticks
104#define	ke_cpu		ke_sched->ske_cpu
105#define	ke_assign	ke_procq.tqe_next
106
107#define	KEF_ASSIGNED	KEF_SCHED0	/* KSE is being migrated. */
108
109struct kg_sched {
110	int	skg_slptime;		/* Number of ticks we vol. slept */
111	int	skg_runtime;		/* Number of ticks we were running */
112};
113#define	kg_slptime	kg_sched->skg_slptime
114#define	kg_runtime	kg_sched->skg_runtime
115
116struct td_sched {
117	int	std_slptime;
118};
119#define	td_slptime	td_sched->std_slptime
120
121struct td_sched td_sched;
122struct ke_sched ke_sched;
123struct kg_sched kg_sched;
124
125struct ke_sched *kse0_sched = &ke_sched;
126struct kg_sched *ksegrp0_sched = &kg_sched;
127struct p_sched *proc0_sched = NULL;
128struct td_sched *thread0_sched = &td_sched;
129
130/*
131 * The priority is primarily determined by the interactivity score.  Thus, we
132 * give lower(better) priorities to kse groups that use less CPU.  The nice
133 * value is then directly added to this to allow nice to have some effect
134 * on latency.
135 *
136 * PRI_RANGE:	Total priority range for timeshare threads.
137 * PRI_NRESV:	Number of nice values.
138 * PRI_BASE:	The start of the dynamic range.
139 */
140#define	SCHED_PRI_RANGE		(PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1)
141#define	SCHED_PRI_NRESV		PRIO_TOTAL
142#define	SCHED_PRI_NHALF		(PRIO_TOTAL / 2)
143#define	SCHED_PRI_NTHRESH	(SCHED_PRI_NHALF - 1)
144#define	SCHED_PRI_BASE		(PRI_MIN_TIMESHARE)
145#define	SCHED_PRI_INTERACT(score)					\
146    ((score) * SCHED_PRI_RANGE / SCHED_INTERACT_MAX)
147
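/*
 * Illustrative sketch (not part of the scheduler, kept out of the build):
 * how an interactivity score and nice value become a user priority, in the
 * same way sched_priority() does below.  The figures in the comments assume
 * the stock 64-entry timeshare range purely as an example.
 */
#if 0
static int
example_user_pri(int score, int nice)
{
	int pri;

	pri = SCHED_PRI_INTERACT(score);	/* e.g. score 30 -> 30 * 64 / 100 = 19 */
	pri += SCHED_PRI_BASE;			/* shift into the timeshare range */
	pri += nice;				/* nice directly biases latency */
	if (pri > PRI_MAX_TIMESHARE)
		pri = PRI_MAX_TIMESHARE;
	else if (pri < PRI_MIN_TIMESHARE)
		pri = PRI_MIN_TIMESHARE;
	return (pri);
}
#endif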
148/*
149 * These determine the interactivity of a process.
150 *
151 * SLP_RUN_MAX:	Maximum amount of sleep time + run time we'll accumulate
152 *		before throttling back.
153 * SLP_RUN_FORK:	Maximum slp+run time to inherit at fork time.
154 * INTERACT_MAX:	Maximum interactivity value.  Smaller is better.
155 * INTERACT_THRESH:	Threshold for placement on the current runq.
156 */
157#define	SCHED_SLP_RUN_MAX	((hz * 5) << 10)
158#define	SCHED_SLP_RUN_FORK	((hz / 2) << 10)
159#define	SCHED_INTERACT_MAX	(100)
160#define	SCHED_INTERACT_HALF	(SCHED_INTERACT_MAX / 2)
161#define	SCHED_INTERACT_THRESH	(30)
162
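/*
 * Example of the units involved (hz = 1000 assumed for illustration):
 * sleep and run time are kept in ticks shifted left by 10 bits as a crude
 * fixed point, so SCHED_SLP_RUN_MAX works out to (1000 * 5) << 10 =
 * 5,120,000, i.e. five seconds worth of history, and SCHED_SLP_RUN_FORK
 * to half a second worth.
 */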
163/*
164 * These parameters and macros determine the size of the time slice that is
165 * granted to each thread.
166 *
167 * SLICE_MIN:	Minimum time slice granted, in units of ticks.
168 * SLICE_MAX:	Maximum time slice granted.
169 * SLICE_RANGE:	Range of available time slices scaled by hz.
170 * SLICE_SCALE:	The number of slices granted per val in the range of [0, max].
171 * SLICE_NICE:  Determines the slice granted for a given scaled nice value.
172 */
173#define	SCHED_SLICE_MIN			(slice_min)
174#define	SCHED_SLICE_MAX			(slice_max)
175#define	SCHED_SLICE_RANGE		(SCHED_SLICE_MAX - SCHED_SLICE_MIN + 1)
176#define	SCHED_SLICE_SCALE(val, max)	(((val) * SCHED_SLICE_RANGE) / (max))
177#define	SCHED_SLICE_NICE(nice)						\
178    (SCHED_SLICE_MAX - SCHED_SLICE_SCALE((nice), SCHED_PRI_NTHRESH))
179
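/*
 * Illustrative sketch (not compiled): how the macros above turn a kse's
 * nice distance from the least nice kse into a slice, mirroring the
 * non-interactive path of sched_slice() below.  The numbers in the
 * comments assume hz = 100 (slice_min = 1, slice_max = 14) and a
 * SCHED_PRI_NTHRESH of 19.
 */
#if 0
static int
example_slice(int nice, int nicemin)
{
	int dist;

	dist = nice - nicemin;			/* distance from the least nice kse */
	if (dist <= 0)
		return (SCHED_SLICE_MAX);	/* least nice: maximum slice */
	if (dist <= SCHED_PRI_NTHRESH)
		return (SCHED_SLICE_NICE(dist)); /* e.g. dist 5 -> 14 - 5 * 14 / 19 = 11 */
	return (0);				/* outside the window: no slice */
}
#endif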
180/*
181 * This macro determines whether or not the kse belongs on the current or
182 * next run queue.
183 *
184 * XXX nice value should affect how interactive a kg is.
185 */
186#define	SCHED_INTERACTIVE(kg)						\
187    (sched_interact_score(kg) < SCHED_INTERACT_THRESH)
188#define	SCHED_CURR(kg, ke)						\
189    (ke->ke_thread->td_priority != kg->kg_user_pri ||			\
190    SCHED_INTERACTIVE(kg))
191
192/*
193 * Cpu percentage computation macros and defines.
194 *
195 * SCHED_CPU_TIME:	Number of seconds to average the cpu usage across.
196 * SCHED_CPU_TICKS:	Number of hz ticks to average the cpu usage across.
197 */
198
199#define	SCHED_CPU_TIME	10
200#define	SCHED_CPU_TICKS	(hz * SCHED_CPU_TIME)
201
202/*
203 * kseq - per processor runqs and statistics.
204 */
205
206#define	KSEQ_NCLASS	(PRI_IDLE + 1)	/* Number of run classes. */
207
208struct kseq {
209	struct runq	ksq_idle;		/* Queue of IDLE threads. */
210	struct runq	ksq_timeshare[2];	/* Run queues for !IDLE. */
211	struct runq	*ksq_next;		/* Next timeshare queue. */
212	struct runq	*ksq_curr;		/* Current queue. */
213	int		ksq_loads[KSEQ_NCLASS];	/* Load for each class */
214	int		ksq_load;		/* Aggregate load. */
215	short		ksq_nice[PRIO_TOTAL + 1]; /* KSEs in each nice bin. */
216	short		ksq_nicemin;		/* Least nice. */
217#ifdef SMP
218	unsigned int	ksq_rslices;	/* Slices on run queue */
219	int		ksq_cpus;	/* Count of CPUs in this kseq. */
220	struct kse 	*ksq_assigned;	/* KSEs assigned by another CPU. */
221#endif
222};
223
224/*
225 * One kse queue per processor.
226 */
227#ifdef SMP
228static int kseq_idle;
229static struct kseq	kseq_cpu[MAXCPU];
230static struct kseq	*kseq_idmap[MAXCPU];
231#define	KSEQ_SELF()	(kseq_idmap[PCPU_GET(cpuid)])
232#define	KSEQ_CPU(x)	(kseq_idmap[(x)])
233#else
234static struct kseq	kseq_cpu;
235#define	KSEQ_SELF()	(&kseq_cpu)
236#define	KSEQ_CPU(x)	(&kseq_cpu)
237#endif
238
239static void sched_slice(struct kse *ke);
240static void sched_priority(struct ksegrp *kg);
241static int sched_interact_score(struct ksegrp *kg);
242static void sched_interact_update(struct ksegrp *kg);
243static void sched_interact_fork(struct ksegrp *kg);
244static void sched_pctcpu_update(struct kse *ke);
245
246/* Operations on per processor queues */
247static struct kse * kseq_choose(struct kseq *kseq);
248static void kseq_setup(struct kseq *kseq);
249static void kseq_add(struct kseq *kseq, struct kse *ke);
250static void kseq_rem(struct kseq *kseq, struct kse *ke);
251static void kseq_nice_add(struct kseq *kseq, int nice);
252static void kseq_nice_rem(struct kseq *kseq, int nice);
253void kseq_print(int cpu);
254#ifdef SMP
255#if 0
256static int sched_pickcpu(void);
257#endif
258static struct kse *runq_steal(struct runq *rq);
259static struct kseq *kseq_load_highest(void);
260static void kseq_balance(void *arg);
261static void kseq_move(struct kseq *from, int cpu);
262static int kseq_find(void);
263static void kseq_notify(struct kse *ke, int cpu);
264static void kseq_assign(struct kseq *);
265static struct kse *kseq_steal(struct kseq *kseq);
266#endif
267
268void
269kseq_print(int cpu)
270{
271	struct kseq *kseq;
272	int i;
273
274	kseq = KSEQ_CPU(cpu);
275
276	printf("kseq:\n");
277	printf("\tload:           %d\n", kseq->ksq_load);
278	printf("\tload ITHD:      %d\n", kseq->ksq_loads[PRI_ITHD]);
279	printf("\tload REALTIME:  %d\n", kseq->ksq_loads[PRI_REALTIME]);
280	printf("\tload TIMESHARE: %d\n", kseq->ksq_loads[PRI_TIMESHARE]);
281	printf("\tload IDLE:      %d\n", kseq->ksq_loads[PRI_IDLE]);
282	printf("\tnicemin:\t%d\n", kseq->ksq_nicemin);
283	printf("\tnice counts:\n");
284	for (i = 0; i < PRIO_TOTAL + 1; i++)
285		if (kseq->ksq_nice[i])
286			printf("\t\t%d = %d\n",
287			    i - SCHED_PRI_NHALF, kseq->ksq_nice[i]);
288}
289
290static void
291kseq_add(struct kseq *kseq, struct kse *ke)
292{
293	mtx_assert(&sched_lock, MA_OWNED);
294	kseq->ksq_loads[PRI_BASE(ke->ke_ksegrp->kg_pri_class)]++;
295	kseq->ksq_load++;
296	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
297	CTR6(KTR_ULE, "Add kse %p to %p (slice: %d, pri: %d, nice: %d(%d))",
298	    ke, ke->ke_runq, ke->ke_slice, ke->ke_thread->td_priority,
299	    ke->ke_ksegrp->kg_nice, kseq->ksq_nicemin);
300	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
301		kseq_nice_add(kseq, ke->ke_ksegrp->kg_nice);
302#ifdef SMP
303	kseq->ksq_rslices += ke->ke_slice;
304#endif
305}
306
307static void
308kseq_rem(struct kseq *kseq, struct kse *ke)
309{
310	mtx_assert(&sched_lock, MA_OWNED);
311	kseq->ksq_loads[PRI_BASE(ke->ke_ksegrp->kg_pri_class)]--;
312	kseq->ksq_load--;
313	ke->ke_runq = NULL;
314	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
315		kseq_nice_rem(kseq, ke->ke_ksegrp->kg_nice);
316#ifdef SMP
317	kseq->ksq_rslices -= ke->ke_slice;
318#endif
319}
320
321static void
322kseq_nice_add(struct kseq *kseq, int nice)
323{
324	mtx_assert(&sched_lock, MA_OWNED);
325	/* Normalize to zero. */
326	kseq->ksq_nice[nice + SCHED_PRI_NHALF]++;
327	if (nice < kseq->ksq_nicemin || kseq->ksq_loads[PRI_TIMESHARE] == 1)
328		kseq->ksq_nicemin = nice;
329}
330
331static void
332kseq_nice_rem(struct kseq *kseq, int nice)
333{
334	int n;
335
336	mtx_assert(&sched_lock, MA_OWNED);
337	/* Normalize to zero. */
338	n = nice + SCHED_PRI_NHALF;
339	kseq->ksq_nice[n]--;
340	KASSERT(kseq->ksq_nice[n] >= 0, ("Negative nice count."));
341
342	/*
343	 * If this wasn't the smallest nice value or there are more in
344	 * this bucket we can just return.  Otherwise we have to recalculate
345	 * the smallest nice.
346	 */
347	if (nice != kseq->ksq_nicemin ||
348	    kseq->ksq_nice[n] != 0 ||
349	    kseq->ksq_loads[PRI_TIMESHARE] == 0)
350		return;
351
352	for (; n < SCHED_PRI_NRESV + 1; n++)
353		if (kseq->ksq_nice[n]) {
354			kseq->ksq_nicemin = n - SCHED_PRI_NHALF;
355			return;
356		}
357}
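/*
 * Example of the bookkeeping above (assuming the usual nice range of
 * -20..20, so SCHED_PRI_NHALF is 20): nice values are normalized into
 * ksq_nice[] by adding SCHED_PRI_NHALF, so a kse at nice -7 is counted in
 * bin 13, and ksq_nicemin always tracks the lowest nice value with a
 * non-zero count.
 */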
358
359#ifdef SMP
360/*
361 * kseq_balance is a simple CPU load balancing algorithm.  It operates by
362 * finding the least loaded and most loaded cpu and equalizing their load
363 * by migrating some processes.
364 *
365 * Dealing only with two CPUs at a time has two advantages.  Firstly, most
366 * installations will only have 2 cpus.  Secondly, load balancing too much at
367 * once can have an unpleasant effect on the system.  The scheduler rarely has
368 * enough information to make perfect decisions.  So this algorithm chooses
369 * algorithm simplicity and more gradual effects on load in larger systems.
370 *
371 * It could be improved by considering the priorities and slices assigned to
372 * each task prior to balancing them.  There are many pathological cases with
373 * any approach and so the semi random algorithm below may work as well as any.
374 *
375 */
376static void
377kseq_balance(void *arg)
378{
379	struct kseq *kseq;
380	int high_load;
381	int low_load;
382	int high_cpu;
383	int low_cpu;
384	int move;
385	int diff;
386	int i;
387
388	high_cpu = 0;
389	low_cpu = 0;
390	high_load = 0;
391	low_load = -1;
392
393	mtx_lock_spin(&sched_lock);
394	if (smp_started == 0)
395		goto out;
396
397	for (i = 0; i < mp_maxid; i++) {
398		if (CPU_ABSENT(i) || ((1 << i) & stopped_cpus) != 0)
399			continue;
400		kseq = KSEQ_CPU(i);
401		if (kseq->ksq_load > high_load) {
402			high_load = kseq->ksq_load;
403			high_cpu = i;
404		}
405		if (low_load == -1 || kseq->ksq_load < low_load) {
406			low_load = kseq->ksq_load;
407			low_cpu = i;
408		}
409	}
410
411	kseq = KSEQ_CPU(high_cpu);
412
413	high_load = kseq->ksq_loads[PRI_IDLE] + kseq->ksq_loads[PRI_TIMESHARE] +
414	    kseq->ksq_loads[PRI_REALTIME];
415	/*
416	 * Nothing to do.
417	 */
418	if (high_load < kseq->ksq_cpus + 1)
419		goto out;
420
421	high_load -= kseq->ksq_cpus;
422
423	if (low_load >= high_load)
424		goto out;
425
426	diff = high_load - low_load;
427	move = diff / 2;
428	if (diff & 0x1)
429		move++;
430
431	for (i = 0; i < move; i++)
432		kseq_move(kseq, low_cpu);
433
434out:
435	mtx_unlock_spin(&sched_lock);
436	callout_reset(&kseq_lb_callout, hz, kseq_balance, NULL);
437
438	return;
439}
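/*
 * Example of the arithmetic above (hypothetical loads): with high_load = 5
 * and low_load = 2 after the per-cpu adjustment, diff = 3 and move rounds
 * up to 2, so two kses are migrated toward the least loaded cpu rather
 * than leaving the pair unevenly balanced.
 */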
440
441static struct kseq *
442kseq_load_highest(void)
443{
444	struct kseq *kseq;
445	int load;
446	int cpu;
447	int i;
448
449	mtx_assert(&sched_lock, MA_OWNED);
450	cpu = 0;
451	load = 0;
452
453	for (i = 0; i < mp_maxid; i++) {
454		if (CPU_ABSENT(i) || ((1 << i) & stopped_cpus) != 0)
455			continue;
456		kseq = KSEQ_CPU(i);
457		if (kseq->ksq_load > load) {
458			load = kseq->ksq_load;
459			cpu = i;
460		}
461	}
462	kseq = KSEQ_CPU(cpu);
463
464	if ((kseq->ksq_loads[PRI_IDLE] + kseq->ksq_loads[PRI_TIMESHARE] +
465	    kseq->ksq_loads[PRI_REALTIME]) > kseq->ksq_cpus)
466		return (kseq);
467
468	return (NULL);
469}
470
471static void
472kseq_move(struct kseq *from, int cpu)
473{
474	struct kse *ke;
475
476	ke = kseq_steal(from);
477	runq_remove(ke->ke_runq, ke);
478	ke->ke_state = KES_THREAD;
479	kseq_rem(from, ke);
480
481	ke->ke_cpu = cpu;
482	sched_add(ke->ke_thread);
483}
484
485static int
486kseq_find(void)
487{
488	struct kseq *high;
489
490	if (!smp_started)
491		return (0);
492	if (kseq_idle & PCPU_GET(cpumask))
493		return (0);
494	/*
495	 * Find the cpu with the highest load and steal one proc.
496	 */
497	if ((high = kseq_load_highest()) == NULL ||
498	    high == KSEQ_SELF()) {
499		/*
500		 * If we couldn't find one, set ourselves in the
501		 * idle map.
502		 */
503		atomic_set_int(&kseq_idle, PCPU_GET(cpumask));
504		return (0);
505	}
506	/*
507	 * Remove this kse from this kseq and runq and then requeue
508	 * on the current processor.  We now have a load of one!
509	 */
510	kseq_move(high, PCPU_GET(cpuid));
511
512	return (1);
513}
514
515static void
516kseq_assign(struct kseq *kseq)
517{
518	struct kse *nke;
519	struct kse *ke;
520
521	do {
522		ke = kseq->ksq_assigned;
523	} while(!atomic_cmpset_ptr(&kseq->ksq_assigned, ke, NULL));
524	for (; ke != NULL; ke = nke) {
525		nke = ke->ke_assign;
526		ke->ke_flags &= ~KEF_ASSIGNED;
527		sched_add(ke->ke_thread);
528	}
529}
530
531static void
532kseq_notify(struct kse *ke, int cpu)
533{
534	struct kseq *kseq;
535	struct thread *td;
536	struct pcpu *pcpu;
537
538	ke->ke_flags |= KEF_ASSIGNED;
539
540	kseq = KSEQ_CPU(cpu);
541
542	/*
543	 * Place a KSE on another cpu's queue and force a resched.
544	 */
545	do {
546		ke->ke_assign = kseq->ksq_assigned;
547	} while(!atomic_cmpset_ptr(&kseq->ksq_assigned, ke->ke_assign, ke));
548	pcpu = pcpu_find(cpu);
549	td = pcpu->pc_curthread;
550	if (ke->ke_thread->td_priority < td->td_priority ||
551	    td == pcpu->pc_idlethread) {
552		td->td_flags |= TDF_NEEDRESCHED;
553		ipi_selected(1 << cpu, IPI_AST);
554	}
555}
556
557static struct kse *
558runq_steal(struct runq *rq)
559{
560	struct rqhead *rqh;
561	struct rqbits *rqb;
562	struct kse *ke;
563	int word;
564	int bit;
565
566	mtx_assert(&sched_lock, MA_OWNED);
567	rqb = &rq->rq_status;
568	for (word = 0; word < RQB_LEN; word++) {
569		if (rqb->rqb_bits[word] == 0)
570			continue;
571		for (bit = 0; bit < RQB_BPW; bit++) {
572			if ((rqb->rqb_bits[word] & (1 << bit)) == 0)
573				continue;
574			rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)];
575			TAILQ_FOREACH(ke, rqh, ke_procq) {
576				if (PRI_BASE(ke->ke_ksegrp->kg_pri_class) !=
577				    PRI_ITHD)
578					return (ke);
579			}
580		}
581	}
582	return (NULL);
583}
584
585static struct kse *
586kseq_steal(struct kseq *kseq)
587{
588	struct kse *ke;
589
590	if ((ke = runq_steal(kseq->ksq_curr)) != NULL)
591		return (ke);
592	if ((ke = runq_steal(kseq->ksq_next)) != NULL)
593		return (ke);
594	return (runq_steal(&kseq->ksq_idle));
595}
596#endif	/* SMP */
597
598/*
599 * Pick the highest priority task we have and return it.
600 */
601
602static struct kse *
603kseq_choose(struct kseq *kseq)
604{
605	struct kse *ke;
606	struct runq *swap;
607
608	mtx_assert(&sched_lock, MA_OWNED);
609	swap = NULL;
610
611	for (;;) {
612		ke = runq_choose(kseq->ksq_curr);
613		if (ke == NULL) {
614			/*
615			 * We already swapped once and didn't get anywhere.
616			 */
617			if (swap)
618				break;
619			swap = kseq->ksq_curr;
620			kseq->ksq_curr = kseq->ksq_next;
621			kseq->ksq_next = swap;
622			continue;
623		}
624		/*
625		 * If we encounter a slice of 0 the kse is in a
626		 * TIMESHARE kse group and its nice was too far out
627		 * of the range that receives slices.
628		 */
629		if (ke->ke_slice == 0) {
630			runq_remove(ke->ke_runq, ke);
631			sched_slice(ke);
632			ke->ke_runq = kseq->ksq_next;
633			runq_add(ke->ke_runq, ke);
634			continue;
635		}
636		return (ke);
637	}
638
639	return (runq_choose(&kseq->ksq_idle));
640}
641
642static void
643kseq_setup(struct kseq *kseq)
644{
645	runq_init(&kseq->ksq_timeshare[0]);
646	runq_init(&kseq->ksq_timeshare[1]);
647	runq_init(&kseq->ksq_idle);
648
649	kseq->ksq_curr = &kseq->ksq_timeshare[0];
650	kseq->ksq_next = &kseq->ksq_timeshare[1];
651
652	kseq->ksq_loads[PRI_ITHD] = 0;
653	kseq->ksq_loads[PRI_REALTIME] = 0;
654	kseq->ksq_loads[PRI_TIMESHARE] = 0;
655	kseq->ksq_loads[PRI_IDLE] = 0;
656	kseq->ksq_load = 0;
657#ifdef SMP
658	kseq->ksq_rslices = 0;
659	kseq->ksq_assigned = NULL;
660#endif
661}
662
663static void
664sched_setup(void *dummy)
665{
666#ifdef SMP
667	int i;
668#endif
669
670	slice_min = (hz/100);	/* 10ms */
671	slice_max = (hz/7);	/* ~140ms */
672
673#ifdef SMP
674	/* init kseqs */
675	/* Create the idmap. */
676#ifdef ULE_HTT_EXPERIMENTAL
677	if (smp_topology == NULL) {
678#else
679	if (1) {
680#endif
681		for (i = 0; i < MAXCPU; i++) {
682			kseq_setup(&kseq_cpu[i]);
683			kseq_idmap[i] = &kseq_cpu[i];
684			kseq_cpu[i].ksq_cpus = 1;
685		}
686	} else {
687		int j;
688
689		for (i = 0; i < smp_topology->ct_count; i++) {
690			struct cpu_group *cg;
691
692			cg = &smp_topology->ct_group[i];
693			kseq_setup(&kseq_cpu[i]);
694
695			for (j = 0; j < MAXCPU; j++)
696				if ((cg->cg_mask & (1 << j)) != 0)
697					kseq_idmap[j] = &kseq_cpu[i];
698			kseq_cpu[i].ksq_cpus = cg->cg_count;
699		}
700	}
701	callout_init(&kseq_lb_callout, CALLOUT_MPSAFE);
702	kseq_balance(NULL);
703#else
704	kseq_setup(KSEQ_SELF());
705#endif
706	mtx_lock_spin(&sched_lock);
707	kseq_add(KSEQ_SELF(), &kse0);
708	mtx_unlock_spin(&sched_lock);
709}
710
711/*
712 * Scale the scheduling priority according to the "interactivity" of this
713 * process.
714 */
715static void
716sched_priority(struct ksegrp *kg)
717{
718	int pri;
719
720	if (kg->kg_pri_class != PRI_TIMESHARE)
721		return;
722
723	pri = SCHED_PRI_INTERACT(sched_interact_score(kg));
724	pri += SCHED_PRI_BASE;
725	pri += kg->kg_nice;
726
727	if (pri > PRI_MAX_TIMESHARE)
728		pri = PRI_MAX_TIMESHARE;
729	else if (pri < PRI_MIN_TIMESHARE)
730		pri = PRI_MIN_TIMESHARE;
731
732	kg->kg_user_pri = pri;
733
734	return;
735}
736
737/*
738 * Calculate a time slice based on the properties of the kseg and the runq
739 * that we're on.  This is only for PRI_TIMESHARE ksegrps.
740 */
741static void
742sched_slice(struct kse *ke)
743{
744	struct kseq *kseq;
745	struct ksegrp *kg;
746
747	kg = ke->ke_ksegrp;
748	kseq = KSEQ_CPU(ke->ke_cpu);
749
750	/*
751	 * Rationale:
752	 * KSEs in interactive ksegs get the minimum slice so that we
753	 * quickly notice if it abuses its advantage.
754	 *
755	 * KSEs in non-interactive ksegs are assigned a slice that is
756	 * based on the ksegs nice value relative to the least nice kseg
757	 * on the run queue for this cpu.
758	 *
759	 * If the KSE is less nice than all others it gets the maximum
760	 * slice and other KSEs will adjust their slice relative to
761	 * this when they first expire.
762	 *
763	 * There is a 20 point window that starts relative to the least
764	 * nice kse on the run queue.  Slice size is determined by the
765	 * kse's distance from the least nice ksegrp.
766	 *
767	 * If you are outside of the window you will get no slice and
768	 * you will be reevaluated each time you are selected on the
769	 * run queue.
770	 *
771	 */
772
773	if (!SCHED_INTERACTIVE(kg)) {
774		int nice;
775
776		nice = kg->kg_nice + (0 - kseq->ksq_nicemin);
777		if (kseq->ksq_loads[PRI_TIMESHARE] == 0 ||
778		    kg->kg_nice < kseq->ksq_nicemin)
779			ke->ke_slice = SCHED_SLICE_MAX;
780		else if (nice <= SCHED_PRI_NTHRESH)
781			ke->ke_slice = SCHED_SLICE_NICE(nice);
782		else
783			ke->ke_slice = 0;
784	} else
785		ke->ke_slice = SCHED_SLICE_MIN;
786
787	CTR6(KTR_ULE,
788	    "Sliced %p(%d) (nice: %d, nicemin: %d, load: %d, interactive: %d)",
789	    ke, ke->ke_slice, kg->kg_nice, kseq->ksq_nicemin,
790	    kseq->ksq_loads[PRI_TIMESHARE], SCHED_INTERACTIVE(kg));
791
792	return;
793}
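/*
 * Worked example of the window described above (illustrative values,
 * SCHED_PRI_NTHRESH taken as 19): if the least nice timeshare kseg on this
 * cpu is at nice -5, a kseg at nice 0 is 5 points away and receives
 * SCHED_SLICE_NICE(5), while a kseg at nice +15 is 20 points away, beyond
 * the window, so it gets a zero slice and is re-evaluated by kseq_choose()
 * each time it is selected.
 */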
794
795/*
796 * This routine enforces a maximum limit on the amount of scheduling history
797 * kept.  It is called after either the slptime or runtime is adjusted.
798 * This routine will not operate correctly when slp or run times have been
799 * adjusted to more than double their maximum.
800 */
801static void
802sched_interact_update(struct ksegrp *kg)
803{
804	int sum;
805
806	sum = kg->kg_runtime + kg->kg_slptime;
807	if (sum < SCHED_SLP_RUN_MAX)
808		return;
809	/*
810	 * If we have exceeded by more than 1/5th then the algorithm below
811	 * will not bring us back into range.  Dividing by two here forces
812	 * us into the range of [3/5 * SCHED_SLP_RUN_MAX, SCHED_SLP_RUN_MAX].
813	 */
814	if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) {
815		kg->kg_runtime /= 2;
816		kg->kg_slptime /= 2;
817		return;
818	}
819	kg->kg_runtime = (kg->kg_runtime / 5) * 4;
820	kg->kg_slptime = (kg->kg_slptime / 5) * 4;
821}
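/*
 * Example of the decay above (illustrative numbers, SCHED_SLP_RUN_MAX =
 * 5,120,000 with hz = 1000): a kseg with kg_runtime = 4,000,000 and
 * kg_slptime = 1,200,000 has sum = 5,200,000, over the limit but under 6/5
 * of it, so both values are scaled by 4/5 to 3,200,000 and 960,000; the
 * ratio between them, and hence the interactivity score, is preserved.
 */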
822
823static void
824sched_interact_fork(struct ksegrp *kg)
825{
826	int ratio;
827	int sum;
828
829	sum = kg->kg_runtime + kg->kg_slptime;
830	if (sum > SCHED_SLP_RUN_FORK) {
831		ratio = sum / SCHED_SLP_RUN_FORK;
832		kg->kg_runtime /= ratio;
833		kg->kg_slptime /= ratio;
834	}
835}
836
837static int
838sched_interact_score(struct ksegrp *kg)
839{
840	int div;
841
842	if (kg->kg_runtime > kg->kg_slptime) {
843		div = max(1, kg->kg_runtime / SCHED_INTERACT_HALF);
844		return (SCHED_INTERACT_HALF +
845		    (SCHED_INTERACT_HALF - (kg->kg_slptime / div)));
846	} else if (kg->kg_slptime > kg->kg_runtime) {
847		div = max(1, kg->kg_slptime / SCHED_INTERACT_HALF);
848		return (kg->kg_runtime / div);
849	}
850
851	/*
852	 * This can happen if slptime and runtime are 0.
853	 */
854	return (0);
855
856}
857
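/*
 * Worked example of the score (illustrative numbers): with kg_slptime =
 * 4,000,000 and kg_runtime = 1,000,000, div = 4,000,000 / 50 = 80,000 and
 * the score is 1,000,000 / 80,000 = 12, well under SCHED_INTERACT_THRESH,
 * so the kseg is treated as interactive.  Swap the two values and the
 * score becomes 50 + (50 - 12) = 88, i.e. firmly cpu bound.
 */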
858/*
859 * This is only somewhat accurate since, given many processes of the same
860 * priority, they will switch when their slices run out, which will be
861 * at most SCHED_SLICE_MAX.
862 */
863int
864sched_rr_interval(void)
865{
866	return (SCHED_SLICE_MAX);
867}
868
869static void
870sched_pctcpu_update(struct kse *ke)
871{
872	/*
873	 * Adjust counters and watermark for pctcpu calc.
874	 */
875	if (ke->ke_ltick > ticks - SCHED_CPU_TICKS) {
876		/*
877		 * Shift the tick count out so that the divide doesn't
878		 * round away our results.
879		 */
880		ke->ke_ticks <<= 10;
881		ke->ke_ticks = (ke->ke_ticks / (ticks - ke->ke_ftick)) *
882			    SCHED_CPU_TICKS;
883		ke->ke_ticks >>= 10;
884	} else
885		ke->ke_ticks = 0;
886	ke->ke_ltick = ticks;
887	ke->ke_ftick = ke->ke_ltick - SCHED_CPU_TICKS;
888}
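/*
 * Example of the rescale above (hz = 100 assumed, so SCHED_CPU_TICKS =
 * 1000): a kse that accumulated 600 ticks over the last 1,200 ticks is
 * scaled to 600 * 1000 / 1200 = 500 ticks, and the window is slid so that
 * ke_ftick again sits exactly SCHED_CPU_TICKS behind ke_ltick.  The shift
 * by 10 around the divide only preserves precision.
 */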
889
890#if 0
891/* XXX Should be changed to kseq_load_lowest() */
892int
893sched_pickcpu(void)
894{
895	struct kseq *kseq;
896	int load;
897	int cpu;
898	int i;
899
900	mtx_assert(&sched_lock, MA_OWNED);
901	if (!smp_started)
902		return (0);
903
904	load = 0;
905	cpu = 0;
906
907	for (i = 0; i < mp_maxid; i++) {
908		if (CPU_ABSENT(i) || ((1 << i) & stopped_cpus) != 0)
909			continue;
910		kseq = KSEQ_CPU(i);
911		if (kseq->ksq_load < load) {
912			cpu = i;
913			load = kseq->ksq_load;
914		}
915	}
916
917	CTR1(KTR_ULE, "sched_pickcpu: %d", cpu);
918	return (cpu);
919}
920#endif
921
922void
923sched_prio(struct thread *td, u_char prio)
924{
925	struct kse *ke;
926
927	ke = td->td_kse;
928	mtx_assert(&sched_lock, MA_OWNED);
929	if (TD_ON_RUNQ(td)) {
930		/*
931		 * If the priority has been elevated due to priority
932		 * propagation, we may have to move ourselves to a new
933		 * queue.  We still call adjustrunqueue below in case kse
934		 * needs to fix things up.
935		 */
936		if (ke && (ke->ke_flags & KEF_ASSIGNED) == 0 &&
937		    ke->ke_runq != KSEQ_CPU(ke->ke_cpu)->ksq_curr) {
938			runq_remove(ke->ke_runq, ke);
939			ke->ke_runq = KSEQ_CPU(ke->ke_cpu)->ksq_curr;
940			runq_add(ke->ke_runq, ke);
941		}
942		adjustrunqueue(td, prio);
943	} else
944		td->td_priority = prio;
945}
946
947void
948sched_switch(struct thread *td)
949{
950	struct thread *newtd;
951	struct kse *ke;
952
953	mtx_assert(&sched_lock, MA_OWNED);
954
955	ke = td->td_kse;
956
957	td->td_last_kse = ke;
958        td->td_lastcpu = td->td_oncpu;
959	td->td_oncpu = NOCPU;
960        td->td_flags &= ~TDF_NEEDRESCHED;
961
962	if (TD_IS_RUNNING(td)) {
963		if (td->td_proc->p_flag & P_SA) {
964			kseq_rem(KSEQ_CPU(ke->ke_cpu), ke);
965			setrunqueue(td);
966		} else {
967			/*
968			 * This queue is always correct except for idle threads
969			 * which have a higher priority due to priority
970			 * propagation.
971			 */
972			if (ke->ke_ksegrp->kg_pri_class == PRI_IDLE) {
973				if (td->td_priority < PRI_MIN_IDLE)
974					ke->ke_runq = KSEQ_SELF()->ksq_curr;
975				else
976					ke->ke_runq = &KSEQ_SELF()->ksq_idle;
977			}
978			runq_add(ke->ke_runq, ke);
979			/* setrunqueue(td); */
980		}
981	} else {
982		if (ke->ke_runq)
983			kseq_rem(KSEQ_CPU(ke->ke_cpu), ke);
984		/*
985		 * We will not be on the run queue. So we must be
986		 * sleeping or similar.
987		 */
988		if (td->td_proc->p_flag & P_SA)
989			kse_reassign(ke);
990	}
991	newtd = choosethread();
992	if (td != newtd)
993		cpu_switch(td, newtd);
994	sched_lock.mtx_lock = (uintptr_t)td;
995
996	td->td_oncpu = PCPU_GET(cpuid);
997}
998
999void
1000sched_nice(struct ksegrp *kg, int nice)
1001{
1002	struct kse *ke;
1003	struct thread *td;
1004	struct kseq *kseq;
1005
1006	PROC_LOCK_ASSERT(kg->kg_proc, MA_OWNED);
1007	mtx_assert(&sched_lock, MA_OWNED);
1008	/*
1009	 * We need to adjust the nice counts for running KSEs.
1010	 */
1011	if (kg->kg_pri_class == PRI_TIMESHARE)
1012		FOREACH_KSE_IN_GROUP(kg, ke) {
1013			if (ke->ke_runq == NULL)
1014				continue;
1015			kseq = KSEQ_CPU(ke->ke_cpu);
1016			kseq_nice_rem(kseq, kg->kg_nice);
1017			kseq_nice_add(kseq, nice);
1018		}
1019	kg->kg_nice = nice;
1020	sched_priority(kg);
1021	FOREACH_THREAD_IN_GROUP(kg, td)
1022		td->td_flags |= TDF_NEEDRESCHED;
1023}
1024
1025void
1026sched_sleep(struct thread *td, u_char prio)
1027{
1028	mtx_assert(&sched_lock, MA_OWNED);
1029
1030	td->td_slptime = ticks;
1031	td->td_priority = prio;
1032
1033	CTR2(KTR_ULE, "sleep kse %p (tick: %d)",
1034	    td->td_kse, td->td_slptime);
1035}
1036
1037void
1038sched_wakeup(struct thread *td)
1039{
1040	mtx_assert(&sched_lock, MA_OWNED);
1041
1042	/*
1043	 * Let the kseg know how long we slept for.  This is because process
1044	 * interactivity behavior is modeled in the kseg.
1045	 */
1046	if (td->td_slptime) {
1047		struct ksegrp *kg;
1048		int hzticks;
1049
1050		kg = td->td_ksegrp;
1051		hzticks = (ticks - td->td_slptime) << 10;
1052		if (hzticks >= SCHED_SLP_RUN_MAX) {
1053			kg->kg_slptime = SCHED_SLP_RUN_MAX;
1054			kg->kg_runtime = 1;
1055		} else {
1056			kg->kg_slptime += hzticks;
1057			sched_interact_update(kg);
1058		}
1059		sched_priority(kg);
1060		if (td->td_kse)
1061			sched_slice(td->td_kse);
1062		CTR2(KTR_ULE, "wakeup kse %p (%d ticks)",
1063		    td->td_kse, hzticks);
1064		td->td_slptime = 0;
1065	}
1066	setrunqueue(td);
1067}
1068
1069/*
1070 * Penalize the parent for creating a new child and initialize the child's
1071 * priority.
1072 */
1073void
1074sched_fork(struct proc *p, struct proc *p1)
1075{
1076
1077	mtx_assert(&sched_lock, MA_OWNED);
1078
1079	sched_fork_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(p1));
1080	sched_fork_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(p1));
1081	sched_fork_thread(FIRST_THREAD_IN_PROC(p), FIRST_THREAD_IN_PROC(p1));
1082}
1083
1084void
1085sched_fork_kse(struct kse *ke, struct kse *child)
1086{
1087
1088	child->ke_slice = 1;	/* Attempt to quickly learn interactivity. */
1089	child->ke_cpu = ke->ke_cpu; /* sched_pickcpu(); */
1090	child->ke_runq = NULL;
1091
1092	/* Grab our parents cpu estimation information. */
1093	child->ke_ticks = ke->ke_ticks;
1094	child->ke_ltick = ke->ke_ltick;
1095	child->ke_ftick = ke->ke_ftick;
1096}
1097
1098void
1099sched_fork_ksegrp(struct ksegrp *kg, struct ksegrp *child)
1100{
1101	PROC_LOCK_ASSERT(child->kg_proc, MA_OWNED);
1102
1103	child->kg_slptime = kg->kg_slptime;
1104	child->kg_runtime = kg->kg_runtime;
1105	child->kg_user_pri = kg->kg_user_pri;
1106	child->kg_nice = kg->kg_nice;
1107	sched_interact_fork(child);
1108	kg->kg_runtime += tickincr << 10;
1109	sched_interact_update(kg);
1110
1111	CTR6(KTR_ULE, "sched_fork_ksegrp: %d(%d, %d) - %d(%d, %d)",
1112	    kg->kg_proc->p_pid, kg->kg_slptime, kg->kg_runtime,
1113	    child->kg_proc->p_pid, child->kg_slptime, child->kg_runtime);
1114}
1115
1116void
1117sched_fork_thread(struct thread *td, struct thread *child)
1118{
1119}
1120
1121void
1122sched_class(struct ksegrp *kg, int class)
1123{
1124	struct kseq *kseq;
1125	struct kse *ke;
1126
1127	mtx_assert(&sched_lock, MA_OWNED);
1128	if (kg->kg_pri_class == class)
1129		return;
1130
1131	FOREACH_KSE_IN_GROUP(kg, ke) {
1132		if (ke->ke_state != KES_ONRUNQ &&
1133		    ke->ke_state != KES_THREAD)
1134			continue;
1135		kseq = KSEQ_CPU(ke->ke_cpu);
1136
1137		kseq->ksq_loads[PRI_BASE(kg->kg_pri_class)]--;
1138		kseq->ksq_loads[PRI_BASE(class)]++;
1139
1140		if (kg->kg_pri_class == PRI_TIMESHARE)
1141			kseq_nice_rem(kseq, kg->kg_nice);
1142		else if (class == PRI_TIMESHARE)
1143			kseq_nice_add(kseq, kg->kg_nice);
1144	}
1145
1146	kg->kg_pri_class = class;
1147}
1148
1149/*
1150 * Return some of the child's priority and interactivity to the parent.
1151 */
1152void
1153sched_exit(struct proc *p, struct proc *child)
1154{
1155	mtx_assert(&sched_lock, MA_OWNED);
1156	sched_exit_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(child));
1157	sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(child));
1158}
1159
1160void
1161sched_exit_kse(struct kse *ke, struct kse *child)
1162{
1163	kseq_rem(KSEQ_CPU(child->ke_cpu), child);
1164}
1165
1166void
1167sched_exit_ksegrp(struct ksegrp *kg, struct ksegrp *child)
1168{
1169	/* kg->kg_slptime += child->kg_slptime; */
1170	kg->kg_runtime += child->kg_runtime;
1171	sched_interact_update(kg);
1172}
1173
1174void
1175sched_exit_thread(struct thread *td, struct thread *child)
1176{
1177}
1178
1179void
1180sched_clock(struct thread *td)
1181{
1182	struct kseq *kseq;
1183	struct ksegrp *kg;
1184	struct kse *ke;
1185
1186	/*
1187	 * sched_setup() apparently happens prior to stathz being set.  We
1188	 * need to resolve the timers earlier in the boot so we can avoid
1189	 * calculating this here.
1190	 */
1191	if (realstathz == 0) {
1192		realstathz = stathz ? stathz : hz;
1193		tickincr = hz / realstathz;
1194		/*
1195		 * XXX This does not work for values of stathz that are much
1196		 * larger than hz.
1197		 */
1198		if (tickincr == 0)
1199			tickincr = 1;
1200	}
1201
1202	ke = td->td_kse;
1203	kg = ke->ke_ksegrp;
1204
1205	mtx_assert(&sched_lock, MA_OWNED);
1206	KASSERT((td != NULL), ("schedclock: null thread pointer"));
1207
1208	/* Adjust ticks for pctcpu */
1209	ke->ke_ticks++;
1210	ke->ke_ltick = ticks;
1211
1212	/* Go up to one second beyond our max and then trim back down */
1213	if (ke->ke_ftick + SCHED_CPU_TICKS + hz < ke->ke_ltick)
1214		sched_pctcpu_update(ke);
1215
1216	if (td->td_flags & TDF_IDLETD)
1217		return;
1218
1219	CTR4(KTR_ULE, "Tick kse %p (slice: %d, slptime: %d, runtime: %d)",
1220	    ke, ke->ke_slice, kg->kg_slptime >> 10, kg->kg_runtime >> 10);
1221	/*
1222	 * We only do slicing code for TIMESHARE ksegrps.
1223	 */
1224	if (kg->kg_pri_class != PRI_TIMESHARE)
1225		return;
1226	/*
1227	 * We used a tick; charge it to the ksegrp so that we can compute our
1228	 * interactivity.
1229	 */
1230	kg->kg_runtime += tickincr << 10;
1231	sched_interact_update(kg);
1232
1233	/*
1234	 * We used up one time slice.
1235	 */
1236	ke->ke_slice--;
1237	kseq = KSEQ_SELF();
1238#ifdef SMP
1239	kseq->ksq_rslices--;
1240#endif
1241
1242	if (ke->ke_slice > 0)
1243		return;
1244	/*
1245	 * We're out of time, recompute priorities and requeue.
1246	 */
1247	kseq_rem(kseq, ke);
1248	sched_priority(kg);
1249	sched_slice(ke);
1250	if (SCHED_CURR(kg, ke))
1251		ke->ke_runq = kseq->ksq_curr;
1252	else
1253		ke->ke_runq = kseq->ksq_next;
1254	kseq_add(kseq, ke);
1255	td->td_flags |= TDF_NEEDRESCHED;
1256}
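/*
 * Example of the per-tick accounting above (hz = 1000 and stathz = 128
 * assumed): tickincr = 7, so each stat clock tick adds 7 << 10 = 7168 to
 * kg_runtime, and once ke_slice reaches zero the kse is removed, given a
 * fresh priority and slice, and requeued on ksq_curr or ksq_next depending
 * on SCHED_CURR().
 */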
1257
1258int
1259sched_runnable(void)
1260{
1261	struct kseq *kseq;
1262	int load;
1263
1264	load = 1;
1265
1266	mtx_lock_spin(&sched_lock);
1267	kseq = KSEQ_SELF();
1268#ifdef SMP
1269	if (kseq->ksq_assigned)
1270		kseq_assign(kseq);
1271#endif
1272	if ((curthread->td_flags & TDF_IDLETD) != 0) {
1273		if (kseq->ksq_load > 0)
1274			goto out;
1275	} else
1276		if (kseq->ksq_load - 1 > 0)
1277			goto out;
1278	load = 0;
1279out:
1280	mtx_unlock_spin(&sched_lock);
1281	return (load);
1282}
1283
1284void
1285sched_userret(struct thread *td)
1286{
1287	struct ksegrp *kg;
1288
1289	kg = td->td_ksegrp;
1290
1291	if (td->td_priority != kg->kg_user_pri) {
1292		mtx_lock_spin(&sched_lock);
1293		td->td_priority = kg->kg_user_pri;
1294		mtx_unlock_spin(&sched_lock);
1295	}
1296}
1297
1298struct kse *
1299sched_choose(void)
1300{
1301	struct kseq *kseq;
1302	struct kse *ke;
1303
1304	mtx_assert(&sched_lock, MA_OWNED);
1305	kseq = KSEQ_SELF();
1306#ifdef SMP
1307retry:
1308	if (kseq->ksq_assigned)
1309		kseq_assign(kseq);
1310#endif
1311	ke = kseq_choose(kseq);
1312	if (ke) {
1313#ifdef SMP
1314		if (ke->ke_ksegrp->kg_pri_class == PRI_IDLE)
1315			if (kseq_find())
1316				goto retry;
1317#endif
1318		runq_remove(ke->ke_runq, ke);
1319		ke->ke_state = KES_THREAD;
1320
1321		if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) {
1322			CTR4(KTR_ULE, "Run kse %p from %p (slice: %d, pri: %d)",
1323			    ke, ke->ke_runq, ke->ke_slice,
1324			    ke->ke_thread->td_priority);
1325		}
1326		return (ke);
1327	}
1328#ifdef SMP
1329	if (kseq_find())
1330		goto retry;
1331#endif
1332
1333	return (NULL);
1334}
1335
1336void
1337sched_add(struct thread *td)
1338{
1339	struct kseq *kseq;
1340	struct ksegrp *kg;
1341	struct kse *ke;
1342	int class;
1343
1344	mtx_assert(&sched_lock, MA_OWNED);
1345	ke = td->td_kse;
1346	kg = td->td_ksegrp;
1347	if (ke->ke_flags & KEF_ASSIGNED)
1348		return;
1349	kseq = KSEQ_SELF();
1350	KASSERT((ke->ke_thread != NULL), ("sched_add: No thread on KSE"));
1351	KASSERT((ke->ke_thread->td_kse != NULL),
1352	    ("sched_add: No KSE on thread"));
1353	KASSERT(ke->ke_state != KES_ONRUNQ,
1354	    ("sched_add: kse %p (%s) already in run queue", ke,
1355	    ke->ke_proc->p_comm));
1356	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
1357	    ("sched_add: process swapped out"));
1358	KASSERT(ke->ke_runq == NULL,
1359	    ("sched_add: KSE %p is still assigned to a run queue", ke));
1360
1361	class = PRI_BASE(kg->kg_pri_class);
1362	switch (class) {
1363	case PRI_ITHD:
1364	case PRI_REALTIME:
1365		ke->ke_runq = kseq->ksq_curr;
1366		ke->ke_slice = SCHED_SLICE_MAX;
1367		ke->ke_cpu = PCPU_GET(cpuid);
1368		break;
1369	case PRI_TIMESHARE:
1370#ifdef SMP
1371		if (ke->ke_cpu != PCPU_GET(cpuid)) {
1372			kseq_notify(ke, ke->ke_cpu);
1373			return;
1374		}
1375#endif
1376		if (SCHED_CURR(kg, ke))
1377			ke->ke_runq = kseq->ksq_curr;
1378		else
1379			ke->ke_runq = kseq->ksq_next;
1380		break;
1381	case PRI_IDLE:
1382#ifdef SMP
1383		if (ke->ke_cpu != PCPU_GET(cpuid)) {
1384			kseq_notify(ke, ke->ke_cpu);
1385			return;
1386		}
1387#endif
1388		/*
1389		 * This is for priority prop.
1390		 */
1391		if (ke->ke_thread->td_priority < PRI_MIN_IDLE)
1392			ke->ke_runq = kseq->ksq_curr;
1393		else
1394			ke->ke_runq = &kseq->ksq_idle;
1395		ke->ke_slice = SCHED_SLICE_MIN;
1396		break;
1397	default:
1398		panic("Unknown pri class.");
1399		break;
1400	}
1401#ifdef SMP
1402	/*
1403	 * If there are any idle processors, give them our extra load.
1404	 */
1405	if (kseq_idle && class != PRI_ITHD &&
1406	    (kseq->ksq_loads[PRI_IDLE] + kseq->ksq_loads[PRI_TIMESHARE] +
1407	    kseq->ksq_loads[PRI_REALTIME]) >= kseq->ksq_cpus) {
1408		int cpu;
1409
1410		/*
1411		 * Multiple cpus could find this bit simultaneously but the
1412		 * race shouldn't be terrible.
1413		 */
1414		cpu = ffs(kseq_idle);
1415		if (cpu) {
1416			cpu--;
1417			atomic_clear_int(&kseq_idle, 1 << cpu);
1418			ke->ke_cpu = cpu;
1419			ke->ke_runq = NULL;
1420			kseq_notify(ke, cpu);
1421			return;
1422		}
1423	}
1424	if (class == PRI_TIMESHARE || class == PRI_REALTIME)
1425		atomic_clear_int(&kseq_idle, PCPU_GET(cpumask));
1426#endif
1427        if (td->td_priority < curthread->td_priority)
1428                curthread->td_flags |= TDF_NEEDRESCHED;
1429
1430	ke->ke_ksegrp->kg_runq_kses++;
1431	ke->ke_state = KES_ONRUNQ;
1432
1433	runq_add(ke->ke_runq, ke);
1434	kseq_add(kseq, ke);
1435}
1436
1437void
1438sched_rem(struct thread *td)
1439{
1440	struct kseq *kseq;
1441	struct kse *ke;
1442
1443	ke = td->td_kse;
1444	/*
1445	 * It is safe to just return here because sched_rem() is only ever
1446	 * used in places where we're immediately going to add the
1447	 * kse back on again.  In that case it'll be added with the correct
1448	 * thread and priority when the caller drops the sched_lock.
1449	 */
1450	if (ke->ke_flags & KEF_ASSIGNED)
1451		return;
1452	mtx_assert(&sched_lock, MA_OWNED);
1453	KASSERT((ke->ke_state == KES_ONRUNQ), ("KSE not on run queue"));
1454
1455	ke->ke_state = KES_THREAD;
1456	ke->ke_ksegrp->kg_runq_kses--;
1457	kseq = KSEQ_CPU(ke->ke_cpu);
1458	runq_remove(ke->ke_runq, ke);
1459	kseq_rem(kseq, ke);
1460}
1461
1462fixpt_t
1463sched_pctcpu(struct thread *td)
1464{
1465	fixpt_t pctcpu;
1466	struct kse *ke;
1467
1468	pctcpu = 0;
1469	ke = td->td_kse;
1470	if (ke == NULL)
1471		return (0);
1472
1473	mtx_lock_spin(&sched_lock);
1474	if (ke->ke_ticks) {
1475		int rtick;
1476
1477		/*
1478		 * Don't update more frequently than twice a second.  Allowing
1479		 * this causes the cpu usage to decay away too quickly due to
1480		 * rounding errors.
1481		 */
1482		if (ke->ke_ltick < (ticks - (hz / 2)))
1483			sched_pctcpu_update(ke);
1484		/* How many rtick per second ? */
1485		rtick = min(ke->ke_ticks / SCHED_CPU_TIME, SCHED_CPU_TICKS);
1486		pctcpu = (FSCALE * ((FSCALE * rtick)/realstathz)) >> FSHIFT;
1487	}
1488
1489	ke->ke_proc->p_swtime = ke->ke_ltick - ke->ke_ftick;
1490	mtx_unlock_spin(&sched_lock);
1491
1492	return (pctcpu);
1493}
1494
1495int
1496sched_sizeof_kse(void)
1497{
1498	return (sizeof(struct kse) + sizeof(struct ke_sched));
1499}
1500
1501int
1502sched_sizeof_ksegrp(void)
1503{
1504	return (sizeof(struct ksegrp) + sizeof(struct kg_sched));
1505}
1506
1507int
1508sched_sizeof_proc(void)
1509{
1510	return (sizeof(struct proc));
1511}
1512
1513int
1514sched_sizeof_thread(void)
1515{
1516	return (sizeof(struct thread) + sizeof(struct td_sched));
1517}
1518