sched_ule.c revision 163709
1/*-
2 * Copyright (c) 2002-2005, Jeffrey Roberson <jeff@freebsd.org>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice unmodified, this list of conditions, and the following
10 *    disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
19 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27#include <sys/cdefs.h>
28__FBSDID("$FreeBSD: head/sys/kern/sched_ule.c 163709 2006-10-26 21:42:22Z jb $");
29
30#include "opt_hwpmc_hooks.h"
31#include "opt_sched.h"
32
33#define kse td_sched
34
35#include <sys/param.h>
36#include <sys/systm.h>
37#include <sys/kdb.h>
38#include <sys/kernel.h>
39#include <sys/ktr.h>
40#include <sys/lock.h>
41#include <sys/mutex.h>
42#include <sys/proc.h>
43#include <sys/resource.h>
44#include <sys/resourcevar.h>
45#include <sys/sched.h>
46#include <sys/smp.h>
47#include <sys/sx.h>
48#include <sys/sysctl.h>
49#include <sys/sysproto.h>
50#include <sys/turnstile.h>
51#include <sys/umtx.h>
52#include <sys/vmmeter.h>
53#ifdef KTRACE
54#include <sys/uio.h>
55#include <sys/ktrace.h>
56#endif
57
58#ifdef HWPMC_HOOKS
59#include <sys/pmckern.h>
60#endif
61
62#include <machine/cpu.h>
63#include <machine/smp.h>
64
65/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
66/* XXX This is bogus compatibility crap for ps */
67static fixpt_t  ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
68SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
69
70static void sched_setup(void *dummy);
71SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL)
72
73static void sched_initticks(void *dummy);
74SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks, NULL)
75
76static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "Scheduler");
77
78SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ule", 0,
79    "Scheduler name");
80
81static int slice_min = 1;
82SYSCTL_INT(_kern_sched, OID_AUTO, slice_min, CTLFLAG_RW, &slice_min, 0, "");
83
84static int slice_max = 10;
85SYSCTL_INT(_kern_sched, OID_AUTO, slice_max, CTLFLAG_RW, &slice_max, 0, "");
86
87int realstathz;
88int tickincr = 1 << 10;
89
90/*
91 * The following datastructures are allocated within their parent structure
92 * but are scheduler specific.
93 */
94/*
95 * The schedulable entity that can be given a context to run.  A process may
96 * have several of these.
97 */
98struct td_sched {	/* really kse */
99	TAILQ_ENTRY(kse) ke_procq;	/* (j/z) Run queue. */
100	int		ke_flags;	/* (j) KEF_* flags. */
101	struct thread	*ke_thread;	/* (*) Active associated thread. */
102	fixpt_t		ke_pctcpu;	/* (j) %cpu during p_swtime. */
103	u_char		ke_rqindex;	/* (j) Run queue index. */
104	enum {
105		KES_THREAD = 0x0,	/* slaved to thread state */
106		KES_ONRUNQ
107	} ke_state;			/* (j) thread sched specific status. */
108	int		ke_slptime;
109	int		ke_slice;
110	struct runq	*ke_runq;
111	u_char		ke_cpu;		/* CPU that we have affinity for. */
112	/* The following variables are only used for pctcpu calculation */
113	int		ke_ltick;	/* Last tick that we were running on */
114	int		ke_ftick;	/* First tick that we were running on */
115	int		ke_ticks;	/* Tick count */
116
117	/* originally from kg_sched */
118	int	skg_slptime;		/* Number of ticks we vol. slept */
119	int	skg_runtime;		/* Number of ticks we were running */
120};
121#define	td_kse			td_sched
122#define	ke_assign		ke_procq.tqe_next
123/* flags kept in ke_flags */
124#define	KEF_ASSIGNED	0x0001		/* Thread is being migrated. */
125#define	KEF_BOUND	0x0002		/* Thread can not migrate. */
126#define	KEF_XFERABLE	0x0004		/* Thread was added as transferable. */
127#define	KEF_HOLD	0x0008		/* Thread is temporarily bound. */
128#define	KEF_REMOVED	0x0010		/* Thread was removed while ASSIGNED */
129#define	KEF_INTERNAL	0x0020		/* Thread added due to migration. */
130#define	KEF_PREEMPTED	0x0040		/* Thread was preempted */
131#define	KEF_DIDRUN	0x02000		/* Thread actually ran. */
132#define	KEF_EXIT	0x04000		/* Thread is being killed. */
133
134static struct kse kse0;
135
136/*
137 * The priority is primarily determined by the interactivity score.  Thus, we
138 * give lower(better) priorities to kse groups that use less CPU.  The nice
139 * value is then directly added to this to allow nice to have some effect
140 * on latency.
141 *
142 * PRI_RANGE:	Total priority range for timeshare threads.
143 * PRI_NRESV:	Number of nice values.
144 * PRI_BASE:	The start of the dynamic range.
145 */
146#define	SCHED_PRI_RANGE		(PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1)
147#define	SCHED_PRI_NRESV		((PRIO_MAX - PRIO_MIN) + 1)
148#define	SCHED_PRI_NHALF		(SCHED_PRI_NRESV / 2)
149#define	SCHED_PRI_BASE		(PRI_MIN_TIMESHARE)
150#define	SCHED_PRI_INTERACT(score)					\
151    ((score) * SCHED_PRI_RANGE / SCHED_INTERACT_MAX)
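/*
 * Worked example (illustrative sketch, assuming the stock priority ranges
 * of this era: PRI_MIN_TIMESHARE = 160, PRI_MAX_TIMESHARE = 223, so
 * SCHED_PRI_RANGE = 64): an interactivity score of 50 maps to
 * SCHED_PRI_INTERACT(50) = 50 * 64 / 100 = 32, and sched_priority() below
 * then computes 160 + 32 + p_nice before clamping to the timeshare range.
 */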
152
153/*
154 * These determine the interactivity of a process.
155 *
156 * SLP_RUN_MAX:	Maximum amount of sleep time + run time we'll accumulate
157 *		before throttling back.
158 * SLP_RUN_FORK:	Maximum slp+run time to inherit at fork time.
159 * INTERACT_MAX:	Maximum interactivity value.  Smaller is better.
160 * INTERACT_THRESH:	Threshold for placement on the current runq.
161 */
162#define	SCHED_SLP_RUN_MAX	((hz * 5) << 10)
163#define	SCHED_SLP_RUN_FORK	((hz / 2) << 10)
164#define	SCHED_INTERACT_MAX	(100)
165#define	SCHED_INTERACT_HALF	(SCHED_INTERACT_MAX / 2)
166#define	SCHED_INTERACT_THRESH	(30)
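/*
 * Scale sketch (illustrative, assuming hz = 1000): the sleep/run history is
 * kept in ticks << 10, so SCHED_SLP_RUN_MAX = (1000 * 5) << 10 = 5120000,
 * i.e. five seconds of combined sleep and run time, and SCHED_SLP_RUN_FORK
 * caps a child at roughly half a second of inherited history.  A score
 * below SCHED_INTERACT_THRESH (30) marks the thread as interactive.
 */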
167
168/*
169 * These parameters and macros determine the size of the time slice that is
170 * granted to each thread.
171 *
172 * SLICE_MIN:	Minimum time slice granted, in units of ticks.
173 * SLICE_MAX:	Maximum time slice granted.
174 * SLICE_RANGE:	Range of available time slices scaled by hz.
175 * SLICE_SCALE:	The number of slices granted per val in the range of [0, max].
176 * SLICE_NICE:  Determine the amount of slice granted to a scaled nice.
177 * SLICE_NTHRESH:	The nice cutoff point for slice assignment.
178 */
179#define	SCHED_SLICE_MIN			(slice_min)
180#define	SCHED_SLICE_MAX			(slice_max)
181#define	SCHED_SLICE_INTERACTIVE		(slice_max)
182#define	SCHED_SLICE_NTHRESH	(SCHED_PRI_NHALF - 1)
183#define	SCHED_SLICE_RANGE		(SCHED_SLICE_MAX - SCHED_SLICE_MIN + 1)
184#define	SCHED_SLICE_SCALE(val, max)	(((val) * SCHED_SLICE_RANGE) / (max))
185#define	SCHED_SLICE_NICE(nice)						\
186    (SCHED_SLICE_MAX - SCHED_SLICE_SCALE((nice), SCHED_SLICE_NTHRESH))
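/*
 * Illustrative sketch (never compiled): dump the slice table produced by
 * SCHED_SLICE_NICE() using the static defaults slice_min = 1 and
 * slice_max = 10; sched_setup()/sched_initticks() rescale these at boot, so
 * the absolute numbers are only an example.  With these defaults the least
 * nice thread gets 10 ticks and the slice shrinks to 0 at a nice distance
 * of SCHED_SLICE_NTHRESH (19).
 */
#if 0
static void
sched_slice_table_sketch(void)
{
	int nice;

	for (nice = 0; nice <= SCHED_SLICE_NTHRESH; nice++)
		printf("nice distance %2d -> slice %d\n",
		    nice, SCHED_SLICE_NICE(nice));
}
#endif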
187
188/*
189 * This macro determines whether or not the thread belongs on the current or
190 * next run queue.
191 */
192#define	SCHED_INTERACTIVE(td)						\
193    (sched_interact_score(td) < SCHED_INTERACT_THRESH)
194#define	SCHED_CURR(td, ke)						\
195    ((ke->ke_thread->td_flags & TDF_BORROWING) ||			\
196     (ke->ke_flags & KEF_PREEMPTED) || SCHED_INTERACTIVE(td))
197
198/*
199 * Cpu percentage computation macros and defines.
200 *
201 * SCHED_CPU_TIME:	Number of seconds to average the cpu usage across.
202 * SCHED_CPU_TICKS:	Number of hz ticks to average the cpu usage across.
203 */
204
205#define	SCHED_CPU_TIME	10
206#define	SCHED_CPU_TICKS	(hz * SCHED_CPU_TIME)
207
208/*
209 * kseq - per processor runqs and statistics.
210 */
211struct kseq {
212	struct runq	ksq_idle;		/* Queue of IDLE threads. */
213	struct runq	ksq_timeshare[2];	/* Run queues for !IDLE. */
214	struct runq	*ksq_next;		/* Next timeshare queue. */
215	struct runq	*ksq_curr;		/* Current queue. */
216	int		ksq_load_timeshare;	/* Load for timeshare. */
217	int		ksq_load;		/* Aggregate load. */
218	short		ksq_nice[SCHED_PRI_NRESV]; /* KSEs in each nice bin. */
219	short		ksq_nicemin;		/* Least nice. */
220#ifdef SMP
221	int			ksq_transferable;
222	LIST_ENTRY(kseq)	ksq_siblings;	/* Next in kseq group. */
223	struct kseq_group	*ksq_group;	/* Our processor group. */
224	volatile struct kse	*ksq_assigned;	/* assigned by another CPU. */
225#else
226	int		ksq_sysload;		/* For loadavg, !ITHD load. */
227#endif
228};
229
230#ifdef SMP
231/*
232 * kseq groups are groups of processors which can cheaply share threads.  When
233 * one processor in the group goes idle it will check the runqs of the other
234 * processors in its group prior to halting and waiting for an interrupt.
235 * These groups are suitable for SMT (Simultaneous Multi-Threading) and not NUMA.
236 * In a NUMA environment we'd want an idle bitmap per group and a two-tiered
237 * load balancer.
238 */
239struct kseq_group {
240	int	ksg_cpus;		/* Count of CPUs in this kseq group. */
241	cpumask_t ksg_cpumask;		/* Mask of cpus in this group. */
242	cpumask_t ksg_idlemask;		/* Idle cpus in this group. */
243	cpumask_t ksg_mask;		/* Bit mask for first cpu. */
244	int	ksg_load;		/* Total load of this group. */
245	int	ksg_transferable;	/* Transferable load of this group. */
246	LIST_HEAD(, kseq)	ksg_members; /* Linked list of all members. */
247};
248#endif
249
250/*
251 * One kse queue per processor.
252 */
253#ifdef SMP
254static cpumask_t kseq_idle;
255static int ksg_maxid;
256static struct kseq	kseq_cpu[MAXCPU];
257static struct kseq_group kseq_groups[MAXCPU];
258static int bal_tick;
259static int gbal_tick;
260static int balance_groups;
261
262#define	KSEQ_SELF()	(&kseq_cpu[PCPU_GET(cpuid)])
263#define	KSEQ_CPU(x)	(&kseq_cpu[(x)])
264#define	KSEQ_ID(x)	((x) - kseq_cpu)
265#define	KSEQ_GROUP(x)	(&kseq_groups[(x)])
266#else	/* !SMP */
267static struct kseq	kseq_cpu;
268
269#define	KSEQ_SELF()	(&kseq_cpu)
270#define	KSEQ_CPU(x)	(&kseq_cpu)
271#endif
272
273static struct kse *sched_choose(void);		/* XXX Should be thread * */
274static void sched_slice(struct kse *);
275static void sched_priority(struct thread *);
276static void sched_thread_priority(struct thread *, u_char);
277static int sched_interact_score(struct thread *);
278static void sched_interact_update(struct thread *);
279static void sched_interact_fork(struct thread *);
280static void sched_pctcpu_update(struct kse *);
281
282/* Operations on per processor queues */
283static struct kse * kseq_choose(struct kseq *);
284static void kseq_setup(struct kseq *);
285static void kseq_load_add(struct kseq *, struct kse *);
286static void kseq_load_rem(struct kseq *, struct kse *);
287static __inline void kseq_runq_add(struct kseq *, struct kse *, int);
288static __inline void kseq_runq_rem(struct kseq *, struct kse *);
289static void kseq_nice_add(struct kseq *, int);
290static void kseq_nice_rem(struct kseq *, int);
291void kseq_print(int cpu);
292#ifdef SMP
293static int kseq_transfer(struct kseq *, struct kse *, int);
294static struct kse *runq_steal(struct runq *);
295static void sched_balance(void);
296static void sched_balance_groups(void);
297static void sched_balance_group(struct kseq_group *);
298static void sched_balance_pair(struct kseq *, struct kseq *);
299static void kseq_move(struct kseq *, int);
300static int kseq_idled(struct kseq *);
301static void kseq_notify(struct kse *, int);
302static void kseq_assign(struct kseq *);
303static struct kse *kseq_steal(struct kseq *, int);
304#define	KSE_CAN_MIGRATE(ke)						\
305    ((ke)->ke_thread->td_pinned == 0 && ((ke)->ke_flags & KEF_BOUND) == 0)
306#endif
307
308void
309kseq_print(int cpu)
310{
311	struct kseq *kseq;
312	int i;
313
314	kseq = KSEQ_CPU(cpu);
315
316	printf("kseq:\n");
317	printf("\tload:           %d\n", kseq->ksq_load);
318	printf("\tload TIMESHARE: %d\n", kseq->ksq_load_timeshare);
319#ifdef SMP
320	printf("\tload transferable: %d\n", kseq->ksq_transferable);
321#endif
322	printf("\tnicemin:\t%d\n", kseq->ksq_nicemin);
323	printf("\tnice counts:\n");
324	for (i = 0; i < SCHED_PRI_NRESV; i++)
325		if (kseq->ksq_nice[i])
326			printf("\t\t%d = %d\n",
327			    i - SCHED_PRI_NHALF, kseq->ksq_nice[i]);
328}
329
330static __inline void
331kseq_runq_add(struct kseq *kseq, struct kse *ke, int flags)
332{
333#ifdef SMP
334	if (KSE_CAN_MIGRATE(ke)) {
335		kseq->ksq_transferable++;
336		kseq->ksq_group->ksg_transferable++;
337		ke->ke_flags |= KEF_XFERABLE;
338	}
339#endif
340	if (ke->ke_flags & KEF_PREEMPTED)
341		flags |= SRQ_PREEMPTED;
342	runq_add(ke->ke_runq, ke, flags);
343}
344
345static __inline void
346kseq_runq_rem(struct kseq *kseq, struct kse *ke)
347{
348#ifdef SMP
349	if (ke->ke_flags & KEF_XFERABLE) {
350		kseq->ksq_transferable--;
351		kseq->ksq_group->ksg_transferable--;
352		ke->ke_flags &= ~KEF_XFERABLE;
353	}
354#endif
355	runq_remove(ke->ke_runq, ke);
356}
357
358static void
359kseq_load_add(struct kseq *kseq, struct kse *ke)
360{
361	int class;
362	mtx_assert(&sched_lock, MA_OWNED);
363	class = PRI_BASE(ke->ke_thread->td_pri_class);
364	if (class == PRI_TIMESHARE)
365		kseq->ksq_load_timeshare++;
366	kseq->ksq_load++;
367	CTR1(KTR_SCHED, "load: %d", kseq->ksq_load);
368	if (class != PRI_ITHD && (ke->ke_thread->td_proc->p_flag & P_NOLOAD) == 0)
369#ifdef SMP
370		kseq->ksq_group->ksg_load++;
371#else
372		kseq->ksq_sysload++;
373#endif
374	if (ke->ke_thread->td_pri_class == PRI_TIMESHARE)
375		kseq_nice_add(kseq, ke->ke_thread->td_proc->p_nice);
376}
377
378static void
379kseq_load_rem(struct kseq *kseq, struct kse *ke)
380{
381	int class;
382	mtx_assert(&sched_lock, MA_OWNED);
383	class = PRI_BASE(ke->ke_thread->td_pri_class);
384	if (class == PRI_TIMESHARE)
385		kseq->ksq_load_timeshare--;
386	if (class != PRI_ITHD  && (ke->ke_thread->td_proc->p_flag & P_NOLOAD) == 0)
387#ifdef SMP
388		kseq->ksq_group->ksg_load--;
389#else
390		kseq->ksq_sysload--;
391#endif
392	kseq->ksq_load--;
393	CTR1(KTR_SCHED, "load: %d", kseq->ksq_load);
394	ke->ke_runq = NULL;
395	if (ke->ke_thread->td_pri_class == PRI_TIMESHARE)
396		kseq_nice_rem(kseq, ke->ke_thread->td_proc->p_nice);
397}
398
399static void
400kseq_nice_add(struct kseq *kseq, int nice)
401{
402	mtx_assert(&sched_lock, MA_OWNED);
403	/* Normalize to zero. */
404	kseq->ksq_nice[nice + SCHED_PRI_NHALF]++;
405	if (nice < kseq->ksq_nicemin || kseq->ksq_load_timeshare == 1)
406		kseq->ksq_nicemin = nice;
407}
408
409static void
410kseq_nice_rem(struct kseq *kseq, int nice)
411{
412	int n;
413
414	mtx_assert(&sched_lock, MA_OWNED);
415	/* Normalize to zero. */
416	n = nice + SCHED_PRI_NHALF;
417	kseq->ksq_nice[n]--;
418	KASSERT(kseq->ksq_nice[n] >= 0, ("Negative nice count."));
419
420	/*
421	 * If this wasn't the smallest nice value or there are more in
422	 * this bucket we can just return.  Otherwise we have to recalculate
423	 * the smallest nice.
424	 */
425	if (nice != kseq->ksq_nicemin ||
426	    kseq->ksq_nice[n] != 0 ||
427	    kseq->ksq_load_timeshare == 0)
428		return;
429
430	for (; n < SCHED_PRI_NRESV; n++)
431		if (kseq->ksq_nice[n]) {
432			kseq->ksq_nicemin = n - SCHED_PRI_NHALF;
433			return;
434		}
435}
436
437#ifdef SMP
438/*
439 * sched_balance is a simple CPU load balancing algorithm.  It operates by
440 * finding the least loaded and most loaded cpu and equalizing their load
441 * by migrating some processes.
442 *
443 * Dealing only with two CPUs at a time has two advantages.  Firstly, most
444 * installations will only have 2 cpus.  Secondly, load balancing too much at
445 * once can have an unpleasant effect on the system.  The scheduler rarely has
446 * enough information to make perfect decisions.  So this algorithm chooses
447 * simplicity and more gradual effects on load in larger systems.
448 *
449 * It could be improved by considering the priorities and slices assigned to
450 * each task prior to balancing them.  There are many pathological cases with
451 * any approach and so the semi random algorithm below may work as well as any.
452 *
453 */
454static void
455sched_balance(void)
456{
457	struct kseq_group *high;
458	struct kseq_group *low;
459	struct kseq_group *ksg;
460	int cnt;
461	int i;
462
463	bal_tick = ticks + (random() % (hz * 2));
464	if (smp_started == 0)
465		return;
466	low = high = NULL;
467	i = random() % (ksg_maxid + 1);
468	for (cnt = 0; cnt <= ksg_maxid; cnt++) {
469		ksg = KSEQ_GROUP(i);
470		/*
471		 * Find the CPU with the highest load that has some
472		 * threads to transfer.
473		 */
474		if ((high == NULL || ksg->ksg_load > high->ksg_load)
475		    && ksg->ksg_transferable)
476			high = ksg;
477		if (low == NULL || ksg->ksg_load < low->ksg_load)
478			low = ksg;
479		if (++i > ksg_maxid)
480			i = 0;
481	}
482	if (low != NULL && high != NULL && high != low)
483		sched_balance_pair(LIST_FIRST(&high->ksg_members),
484		    LIST_FIRST(&low->ksg_members));
485}
486
487static void
488sched_balance_groups(void)
489{
490	int i;
491
492	gbal_tick = ticks + (random() % (hz * 2));
493	mtx_assert(&sched_lock, MA_OWNED);
494	if (smp_started)
495		for (i = 0; i <= ksg_maxid; i++)
496			sched_balance_group(KSEQ_GROUP(i));
497}
498
499static void
500sched_balance_group(struct kseq_group *ksg)
501{
502	struct kseq *kseq;
503	struct kseq *high;
504	struct kseq *low;
505	int load;
506
507	if (ksg->ksg_transferable == 0)
508		return;
509	low = NULL;
510	high = NULL;
511	LIST_FOREACH(kseq, &ksg->ksg_members, ksq_siblings) {
512		load = kseq->ksq_load;
513		if (high == NULL || load > high->ksq_load)
514			high = kseq;
515		if (low == NULL || load < low->ksq_load)
516			low = kseq;
517	}
518	if (high != NULL && low != NULL && high != low)
519		sched_balance_pair(high, low);
520}
521
522static void
523sched_balance_pair(struct kseq *high, struct kseq *low)
524{
525	int transferable;
526	int high_load;
527	int low_load;
528	int move;
529	int diff;
530	int i;
531
532	/*
533	 * If we're transferring within a group we have to use this specific
534	 * kseq's transferable count, otherwise we can steal from other members
535	 * of the group.
536	 */
537	if (high->ksq_group == low->ksq_group) {
538		transferable = high->ksq_transferable;
539		high_load = high->ksq_load;
540		low_load = low->ksq_load;
541	} else {
542		transferable = high->ksq_group->ksg_transferable;
543		high_load = high->ksq_group->ksg_load;
544		low_load = low->ksq_group->ksg_load;
545	}
546	if (transferable == 0)
547		return;
548	/*
549	 * Determine what the imbalance is and then adjust that to how many
550	 * kses we actually have to give up (transferable).
551	 */
552	diff = high_load - low_load;
553	move = diff / 2;
554	if (diff & 0x1)
555		move++;
556	move = min(move, transferable);
557	for (i = 0; i < move; i++)
558		kseq_move(high, KSEQ_ID(low));
559	return;
560}
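/*
 * Worked example: with high_load = 7 and low_load = 2 the imbalance is 5,
 * so move = 5 / 2 rounded up = 3; the transfer is then capped by the
 * transferable count, so at most 3 KSEs are pushed toward the low queue.
 */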
561
562static void
563kseq_move(struct kseq *from, int cpu)
564{
565	struct kseq *kseq;
566	struct kseq *to;
567	struct kse *ke;
568
569	kseq = from;
570	to = KSEQ_CPU(cpu);
571	ke = kseq_steal(kseq, 1);
572	if (ke == NULL) {
573		struct kseq_group *ksg;
574
575		ksg = kseq->ksq_group;
576		LIST_FOREACH(kseq, &ksg->ksg_members, ksq_siblings) {
577			if (kseq == from || kseq->ksq_transferable == 0)
578				continue;
579			ke = kseq_steal(kseq, 1);
580			break;
581		}
582		if (ke == NULL)
583			panic("kseq_move: No KSEs available with a "
584			    "transferable count of %d\n",
585			    ksg->ksg_transferable);
586	}
587	if (kseq == to)
588		return;
589	ke->ke_state = KES_THREAD;
590	kseq_runq_rem(kseq, ke);
591	kseq_load_rem(kseq, ke);
592	kseq_notify(ke, cpu);
593}
594
595static int
596kseq_idled(struct kseq *kseq)
597{
598	struct kseq_group *ksg;
599	struct kseq *steal;
600	struct kse *ke;
601
602	ksg = kseq->ksq_group;
603	/*
604	 * If we're in a cpu group, try and steal kses from another cpu in
605	 * the group before idling.
606	 */
607	if (ksg->ksg_cpus > 1 && ksg->ksg_transferable) {
608		LIST_FOREACH(steal, &ksg->ksg_members, ksq_siblings) {
609			if (steal == kseq || steal->ksq_transferable == 0)
610				continue;
611			ke = kseq_steal(steal, 0);
612			if (ke == NULL)
613				continue;
614			ke->ke_state = KES_THREAD;
615			kseq_runq_rem(steal, ke);
616			kseq_load_rem(steal, ke);
617			ke->ke_cpu = PCPU_GET(cpuid);
618			ke->ke_flags |= KEF_INTERNAL | KEF_HOLD;
619			sched_add(ke->ke_thread, SRQ_YIELDING);
620			return (0);
621		}
622	}
623	/*
624	 * We only set the idled bit when all of the cpus in the group are
625	 * idle.  Otherwise we could get into a situation where a KSE bounces
626	 * back and forth between two idle cores on separate physical CPUs.
627	 */
628	ksg->ksg_idlemask |= PCPU_GET(cpumask);
629	if (ksg->ksg_idlemask != ksg->ksg_cpumask)
630		return (1);
631	atomic_set_int(&kseq_idle, ksg->ksg_mask);
632	return (1);
633}
634
635static void
636kseq_assign(struct kseq *kseq)
637{
638	struct kse *nke;
639	struct kse *ke;
640
641	do {
642		*(volatile struct kse **)&ke = kseq->ksq_assigned;
643	} while(!atomic_cmpset_ptr((volatile uintptr_t *)&kseq->ksq_assigned,
644		(uintptr_t)ke, (uintptr_t)NULL));
645	for (; ke != NULL; ke = nke) {
646		nke = ke->ke_assign;
647		kseq->ksq_group->ksg_load--;
648		kseq->ksq_load--;
649		ke->ke_flags &= ~KEF_ASSIGNED;
650		if (ke->ke_flags & KEF_REMOVED) {
651			ke->ke_flags &= ~KEF_REMOVED;
652			continue;
653		}
654		ke->ke_flags |= KEF_INTERNAL | KEF_HOLD;
655		sched_add(ke->ke_thread, SRQ_YIELDING);
656	}
657}
658
659static void
660kseq_notify(struct kse *ke, int cpu)
661{
662	struct kseq *kseq;
663	struct thread *td;
664	struct pcpu *pcpu;
665	int class;
666	int prio;
667
668	kseq = KSEQ_CPU(cpu);
669	/* XXX */
670	class = PRI_BASE(ke->ke_thread->td_pri_class);
671	if ((class == PRI_TIMESHARE || class == PRI_REALTIME) &&
672	    (kseq_idle & kseq->ksq_group->ksg_mask))
673		atomic_clear_int(&kseq_idle, kseq->ksq_group->ksg_mask);
674	kseq->ksq_group->ksg_load++;
675	kseq->ksq_load++;
676	ke->ke_cpu = cpu;
677	ke->ke_flags |= KEF_ASSIGNED;
678	prio = ke->ke_thread->td_priority;
679
680	/*
681	 * Place a KSE on another cpu's queue and force a resched.
682	 */
683	do {
684		*(volatile struct kse **)&ke->ke_assign = kseq->ksq_assigned;
685	} while(!atomic_cmpset_ptr((volatile uintptr_t *)&kseq->ksq_assigned,
686		(uintptr_t)ke->ke_assign, (uintptr_t)ke));
687	/*
688	 * Without sched_lock we could lose a race where we set NEEDRESCHED
689	 * on a thread that is switched out before the IPI is delivered.  This
690	 * would lead us to miss the resched.  This will be a problem once
691	 * sched_lock is pushed down.
692	 */
693	pcpu = pcpu_find(cpu);
694	td = pcpu->pc_curthread;
695	if (ke->ke_thread->td_priority < td->td_priority ||
696	    td == pcpu->pc_idlethread) {
697		td->td_flags |= TDF_NEEDRESCHED;
698		ipi_selected(1 << cpu, IPI_AST);
699	}
700}
701
702static struct kse *
703runq_steal(struct runq *rq)
704{
705	struct rqhead *rqh;
706	struct rqbits *rqb;
707	struct kse *ke;
708	int word;
709	int bit;
710
711	mtx_assert(&sched_lock, MA_OWNED);
712	rqb = &rq->rq_status;
713	for (word = 0; word < RQB_LEN; word++) {
714		if (rqb->rqb_bits[word] == 0)
715			continue;
716		for (bit = 0; bit < RQB_BPW; bit++) {
717			if ((rqb->rqb_bits[word] & (1ul << bit)) == 0)
718				continue;
719			rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)];
720			TAILQ_FOREACH(ke, rqh, ke_procq) {
721				if (KSE_CAN_MIGRATE(ke))
722					return (ke);
723			}
724		}
725	}
726	return (NULL);
727}
728
729static struct kse *
730kseq_steal(struct kseq *kseq, int stealidle)
731{
732	struct kse *ke;
733
734	/*
735	 * Steal from next first to try to get a non-interactive task that
736	 * may not have run for a while.
737	 */
738	if ((ke = runq_steal(kseq->ksq_next)) != NULL)
739		return (ke);
740	if ((ke = runq_steal(kseq->ksq_curr)) != NULL)
741		return (ke);
742	if (stealidle)
743		return (runq_steal(&kseq->ksq_idle));
744	return (NULL);
745}
746
747int
748kseq_transfer(struct kseq *kseq, struct kse *ke, int class)
749{
750	struct kseq_group *nksg;
751	struct kseq_group *ksg;
752	struct kseq *old;
753	int cpu;
754	int idx;
755
756	if (smp_started == 0)
757		return (0);
758	cpu = 0;
759	/*
760	 * If our load exceeds a certain threshold we should attempt to
761	 * reassign this thread.  The first candidate is the cpu that
762	 * originally ran the thread.  If it is idle, assign it there,
763	 * otherwise, pick an idle cpu.
764	 *
765	 * The threshold at which we start to reassign kses has a large impact
766	 * on the overall performance of the system.  Tuned too high and
767	 * some CPUs may idle.  Too low and there will be excess migration
768	 * and context switches.
769	 */
770	old = KSEQ_CPU(ke->ke_cpu);
771	nksg = old->ksq_group;
772	ksg = kseq->ksq_group;
773	if (kseq_idle) {
774		if (kseq_idle & nksg->ksg_mask) {
775			cpu = ffs(nksg->ksg_idlemask);
776			if (cpu) {
777				CTR2(KTR_SCHED,
778				    "kseq_transfer: %p found old cpu %X "
779				    "in idlemask.", ke, cpu);
780				goto migrate;
781			}
782		}
783		/*
784		 * Multiple cpus could find this bit simultaneously
785		 * but the race shouldn't be terrible.
786		 */
787		cpu = ffs(kseq_idle);
788		if (cpu) {
789			CTR2(KTR_SCHED, "kseq_transfer: %p found %X "
790			    "in idlemask.", ke, cpu);
791			goto migrate;
792		}
793	}
794	idx = 0;
795#if 0
796	if (old->ksq_load < kseq->ksq_load) {
797		cpu = ke->ke_cpu + 1;
798		CTR2(KTR_SCHED, "kseq_transfer: %p old cpu %X "
799		    "load less than ours.", ke, cpu);
800		goto migrate;
801	}
802	/*
803	 * No new CPU was found, look for one with less load.
804	 */
805	for (idx = 0; idx <= ksg_maxid; idx++) {
806		nksg = KSEQ_GROUP(idx);
807		if (nksg->ksg_load /*+ (nksg->ksg_cpus  * 2)*/ < ksg->ksg_load) {
808			cpu = ffs(nksg->ksg_cpumask);
809			CTR2(KTR_SCHED, "kseq_transfer: %p cpu %X load less "
810			    "than ours.", ke, cpu);
811			goto migrate;
812		}
813	}
814#endif
815	/*
816	 * If another cpu in this group has idled, assign a thread over
817	 * to them after checking to see if there are idled groups.
818	 */
819	if (ksg->ksg_idlemask) {
820		cpu = ffs(ksg->ksg_idlemask);
821		if (cpu) {
822			CTR2(KTR_SCHED, "kseq_transfer: %p cpu %X idle in "
823			    "group.", ke, cpu);
824			goto migrate;
825		}
826	}
827	return (0);
828migrate:
829	/*
830	 * Now that we've found an idle CPU, migrate the thread.
831	 */
832	cpu--;
833	ke->ke_runq = NULL;
834	kseq_notify(ke, cpu);
835
836	return (1);
837}
838
839#endif	/* SMP */
840
841/*
842 * Pick the highest priority task we have and return it.
843 */
844
845static struct kse *
846kseq_choose(struct kseq *kseq)
847{
848	struct runq *swap;
849	struct kse *ke;
850	int nice;
851
852	mtx_assert(&sched_lock, MA_OWNED);
853	swap = NULL;
854
855	for (;;) {
856		ke = runq_choose(kseq->ksq_curr);
857		if (ke == NULL) {
858			/*
859			 * We already swapped once and didn't get anywhere.
860			 */
861			if (swap)
862				break;
863			swap = kseq->ksq_curr;
864			kseq->ksq_curr = kseq->ksq_next;
865			kseq->ksq_next = swap;
866			continue;
867		}
868		/*
869		 * If we encounter a slice of 0 the kse is in a
870		 * TIMESHARE kse group and its nice was too far out
871		 * of the range that receives slices.
872		 */
873		nice = ke->ke_thread->td_proc->p_nice + (0 - kseq->ksq_nicemin);
874#if 0
875		if (ke->ke_slice == 0 || (nice > SCHED_SLICE_NTHRESH &&
876		    ke->ke_thread->td_proc->p_nice != 0)) {
877			runq_remove(ke->ke_runq, ke);
878			sched_slice(ke);
879			ke->ke_runq = kseq->ksq_next;
880			runq_add(ke->ke_runq, ke, 0);
881			continue;
882		}
883#endif
884		return (ke);
885	}
886
887	return (runq_choose(&kseq->ksq_idle));
888}
889
890static void
891kseq_setup(struct kseq *kseq)
892{
893	runq_init(&kseq->ksq_timeshare[0]);
894	runq_init(&kseq->ksq_timeshare[1]);
895	runq_init(&kseq->ksq_idle);
896	kseq->ksq_curr = &kseq->ksq_timeshare[0];
897	kseq->ksq_next = &kseq->ksq_timeshare[1];
898	kseq->ksq_load = 0;
899	kseq->ksq_load_timeshare = 0;
900}
901
902static void
903sched_setup(void *dummy)
904{
905#ifdef SMP
906	int i;
907#endif
908
909	/*
910	 * To avoid divide-by-zero, set realstathz to a dummy value in case
911	 * sched_clock() is called before sched_initticks().
912	 */
913	realstathz = hz;
914	slice_min = (hz/100);	/* 10ms */
915	slice_max = (hz/7);	/* ~140ms */
916
917#ifdef SMP
918	balance_groups = 0;
919	/*
920	 * Initialize the kseqs.
921	 */
922	for (i = 0; i < MAXCPU; i++) {
923		struct kseq *ksq;
924
925		ksq = &kseq_cpu[i];
926		ksq->ksq_assigned = NULL;
927		kseq_setup(&kseq_cpu[i]);
928	}
929	if (smp_topology == NULL) {
930		struct kseq_group *ksg;
931		struct kseq *ksq;
932		int cpus;
933
934		for (cpus = 0, i = 0; i < MAXCPU; i++) {
935			if (CPU_ABSENT(i))
936				continue;
937			ksq = &kseq_cpu[i];
938			ksg = &kseq_groups[cpus];
939			/*
940			 * Setup a kseq group with one member.
941			 */
942			ksq->ksq_transferable = 0;
943			ksq->ksq_group = ksg;
944			ksg->ksg_cpus = 1;
945			ksg->ksg_idlemask = 0;
946			ksg->ksg_cpumask = ksg->ksg_mask = 1 << i;
947			ksg->ksg_load = 0;
948			ksg->ksg_transferable = 0;
949			LIST_INIT(&ksg->ksg_members);
950			LIST_INSERT_HEAD(&ksg->ksg_members, ksq, ksq_siblings);
951			cpus++;
952		}
953		ksg_maxid = cpus - 1;
954	} else {
955		struct kseq_group *ksg;
956		struct cpu_group *cg;
957		int j;
958
959		for (i = 0; i < smp_topology->ct_count; i++) {
960			cg = &smp_topology->ct_group[i];
961			ksg = &kseq_groups[i];
962			/*
963			 * Initialize the group.
964			 */
965			ksg->ksg_idlemask = 0;
966			ksg->ksg_load = 0;
967			ksg->ksg_transferable = 0;
968			ksg->ksg_cpus = cg->cg_count;
969			ksg->ksg_cpumask = cg->cg_mask;
970			LIST_INIT(&ksg->ksg_members);
971			/*
972			 * Find all of the group members and add them.
973			 */
974			for (j = 0; j < MAXCPU; j++) {
975				if ((cg->cg_mask & (1 << j)) != 0) {
976					if (ksg->ksg_mask == 0)
977						ksg->ksg_mask = 1 << j;
978					kseq_cpu[j].ksq_transferable = 0;
979					kseq_cpu[j].ksq_group = ksg;
980					LIST_INSERT_HEAD(&ksg->ksg_members,
981					    &kseq_cpu[j], ksq_siblings);
982				}
983			}
984			if (ksg->ksg_cpus > 1)
985				balance_groups = 1;
986		}
987		ksg_maxid = smp_topology->ct_count - 1;
988	}
989	/*
990	 * Stagger the group and global load balancer so they do not
991	 * interfere with each other.
992	 */
993	bal_tick = ticks + hz;
994	if (balance_groups)
995		gbal_tick = ticks + (hz / 2);
996#else
997	kseq_setup(KSEQ_SELF());
998#endif
999	mtx_lock_spin(&sched_lock);
1000	kseq_load_add(KSEQ_SELF(), &kse0);
1001	mtx_unlock_spin(&sched_lock);
1002}
1003
1004/* ARGSUSED */
1005static void
1006sched_initticks(void *dummy)
1007{
1008	mtx_lock_spin(&sched_lock);
1009	realstathz = stathz ? stathz : hz;
1010	slice_min = (realstathz/100);	/* 10ms */
1011	slice_max = (realstathz/7);	/* ~140ms */
1012
1013	tickincr = (hz << 10) / realstathz;
1014	/*
1015	 * XXX This does not work for values of stathz that are much
1016	 * larger than hz.
1017	 */
1018	if (tickincr == 0)
1019		tickincr = 1;
1020	mtx_unlock_spin(&sched_lock);
1021}
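/*
 * Worked example (assuming hz = 1000 and stathz = 128): realstathz = 128,
 * so slice_min = 1, slice_max = 18 and tickincr = (1000 << 10) / 128 = 8000.
 * Each sched_clock() call therefore charges about hz/stathz = 7.8 ticks of
 * runtime in the ticks << 10 fixed point used by SCHED_SLP_RUN_MAX.
 */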
1022
1023
1024/*
1025 * Scale the scheduling priority according to the "interactivity" of this
1026 * process.
1027 */
1028static void
1029sched_priority(struct thread *td)
1030{
1031	int pri;
1032
1033	if (td->td_pri_class != PRI_TIMESHARE)
1034		return;
1035
1036	pri = SCHED_PRI_INTERACT(sched_interact_score(td));
1037	pri += SCHED_PRI_BASE;
1038	pri += td->td_proc->p_nice;
1039
1040	if (pri > PRI_MAX_TIMESHARE)
1041		pri = PRI_MAX_TIMESHARE;
1042	else if (pri < PRI_MIN_TIMESHARE)
1043		pri = PRI_MIN_TIMESHARE;
1044
1045#ifdef KSE
1046	sched_user_prio(td->td_ksegrp, pri);
1047#else
1048	sched_user_prio(td, pri);
1049#endif
1050
1051	return;
1052}
1053
1054/*
1055 * Calculate a time slice based on the properties of the kseg and the runq
1056 * that we're on.  This is only for PRI_TIMESHARE threads.
1057 */
1058static void
1059sched_slice(struct kse *ke)
1060{
1061	struct kseq *kseq;
1062	struct thread *td;
1063
1064	td = ke->ke_thread;
1065	kseq = KSEQ_CPU(ke->ke_cpu);
1066
1067	if (td->td_flags & TDF_BORROWING) {
1068		ke->ke_slice = SCHED_SLICE_MIN;
1069		return;
1070	}
1071
1072	/*
1073	 * Rationale:
1074	 * KSEs in interactive ksegs get a minimal slice so that we
1075	 * quickly notice if it abuses its advantage.
1076	 *
1077	 * KSEs in non-interactive ksegs are assigned a slice that is
1078	 * based on the ksegs nice value relative to the least nice kseg
1079	 * on the run queue for this cpu.
1080	 *
1081	 * If the KSE is less nice than all others it gets the maximum
1082	 * slice and other KSEs will adjust their slice relative to
1083	 * this when they first expire.
1084	 *
1085	 * There is a 20 point window that starts relative to the least
1086	 * nice kse on the run queue.  Slice size is determined by the
1087	 * kse's distance from the least nice thread.
1088	 *
1089	 * If the kse is outside of the window it will get no slice
1090	 * and will be reevaluated each time it is selected on the
1091	 * run queue.  The exception to this is nice 0 ksegs when
1092	 * a nice -20 is running.  They are always granted a minimum
1093	 * slice.
1094	 */
1095	if (!SCHED_INTERACTIVE(td)) {
1096		int nice;
1097
1098		nice = td->td_proc->p_nice + (0 - kseq->ksq_nicemin);
1099		if (kseq->ksq_load_timeshare == 0 ||
1100		    td->td_proc->p_nice < kseq->ksq_nicemin)
1101			ke->ke_slice = SCHED_SLICE_MAX;
1102		else if (nice <= SCHED_SLICE_NTHRESH)
1103			ke->ke_slice = SCHED_SLICE_NICE(nice);
1104		else if (td->td_proc->p_nice == 0)
1105			ke->ke_slice = SCHED_SLICE_MIN;
1106		else
1107			ke->ke_slice = SCHED_SLICE_MIN; /* 0 */
1108	} else
1109		ke->ke_slice = SCHED_SLICE_INTERACTIVE;
1110
1111	return;
1112}
1113
1114/*
1115 * This routine enforces a maximum limit on the amount of scheduling history
1116 * kept.  It is called after either the slptime or runtime is adjusted.
1117 * This routine will not operate correctly when slp or run times have been
1118 * adjusted to more than double their maximum.
1119 */
1120static void
1121sched_interact_update(struct thread *td)
1122{
1123	int sum;
1124
1125	sum = td->td_sched->skg_runtime + td->td_sched->skg_slptime;
1126	if (sum < SCHED_SLP_RUN_MAX)
1127		return;
1128	/*
1129	 * If we have exceeded by more than 1/5th then the algorithm below
1130	 * will not bring us back into range.  Dividing by two here forces
1131	 * us into the range of [4/5 * SCHED_SLP_RUN_MAX, SCHED_SLP_RUN_MAX]
1132	 */
1133	if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) {
1134		td->td_sched->skg_runtime /= 2;
1135		td->td_sched->skg_slptime /= 2;
1136		return;
1137	}
1138	td->td_sched->skg_runtime = (td->td_sched->skg_runtime / 5) * 4;
1139	td->td_sched->skg_slptime = (td->td_sched->skg_slptime / 5) * 4;
1140}
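/*
 * Worked example (assuming hz = 1000, so SCHED_SLP_RUN_MAX = 5120000):
 * with skg_runtime = 4000000 and skg_slptime = 1500000 the sum (5500000)
 * exceeds the cap but stays under 6/5 of it (6144000), so both values are
 * scaled by 4/5 to 3200000 and 1200000.  Had the sum been above 6144000,
 * both would simply have been halved.
 */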
1141
1142static void
1143sched_interact_fork(struct thread *td)
1144{
1145	int ratio;
1146	int sum;
1147
1148	sum = td->td_sched->skg_runtime + td->td_sched->skg_slptime;
1149	if (sum > SCHED_SLP_RUN_FORK) {
1150		ratio = sum / SCHED_SLP_RUN_FORK;
1151		td->td_sched->skg_runtime /= ratio;
1152		td->td_sched->skg_slptime /= ratio;
1153	}
1154}
1155
1156static int
1157sched_interact_score(struct thread *td)
1158{
1159	int div;
1160
1161	if (td->td_sched->skg_runtime > td->td_sched->skg_slptime) {
1162		div = max(1, td->td_sched->skg_runtime / SCHED_INTERACT_HALF);
1163		return (SCHED_INTERACT_HALF +
1164		    (SCHED_INTERACT_HALF - (td->td_sched->skg_slptime / div)));
1165	} else if (td->td_sched->skg_slptime > td->td_sched->skg_runtime) {
1166		div = max(1, td->td_sched->skg_slptime / SCHED_INTERACT_HALF);
1167		return (td->td_sched->skg_runtime / div);
1168	}
1169
1170	/*
1171	 * This can happen if slptime and runtime are 0.
1172	 */
1173	return (0);
1174
1175}
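/*
 * Worked example (assuming hz = 1000, so the values below are ticks << 10):
 * a thread with skg_slptime = 3072000 (~3s asleep) and skg_runtime =
 * 1024000 (~1s running) takes the second branch: div = 3072000 / 50 =
 * 61440 and the score is 1024000 / 61440 = 16, comfortably interactive.
 * Swap the two values and the first branch yields 50 + (50 - 16) = 84,
 * well past SCHED_INTERACT_THRESH, so the thread is treated as a CPU hog.
 */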
1176
1177/*
1178 * Very early in the boot some setup of scheduler-specific
1179 * parts of proc0 and of some scheduler resources needs to be done.
1180 * Called from:
1181 *  proc0_init()
1182 */
1183void
1184schedinit(void)
1185{
1186	/*
1187	 * Set up the scheduler specific parts of proc0.
1188	 */
1189	proc0.p_sched = NULL; /* XXX */
1190	thread0.td_sched = &kse0;
1191	kse0.ke_thread = &thread0;
1192	kse0.ke_state = KES_THREAD;
1193}
1194
1195/*
1196 * This is only somewhat accurate since given many processes of the same
1197 * priority they will switch when their slices run out, which will be
1198 * at most SCHED_SLICE_MAX.
1199 */
1200int
1201sched_rr_interval(void)
1202{
1203	return (SCHED_SLICE_MAX);
1204}
1205
1206static void
1207sched_pctcpu_update(struct kse *ke)
1208{
1209	/*
1210	 * Adjust counters and watermark for pctcpu calc.
1211	 */
1212	if (ke->ke_ltick > ticks - SCHED_CPU_TICKS) {
1213		/*
1214		 * Shift the tick count out so that the divide doesn't
1215		 * round away our results.
1216		 */
1217		ke->ke_ticks <<= 10;
1218		ke->ke_ticks = (ke->ke_ticks / (ticks - ke->ke_ftick)) *
1219			    SCHED_CPU_TICKS;
1220		ke->ke_ticks >>= 10;
1221	} else
1222		ke->ke_ticks = 0;
1223	ke->ke_ltick = ticks;
1224	ke->ke_ftick = ke->ke_ltick - SCHED_CPU_TICKS;
1225}
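/*
 * Worked example (assuming hz = 1000, so SCHED_CPU_TICKS = 10000): a kse
 * that was charged ke_ticks = 5000 since ke_ftick, 20000 ticks ago, and ran
 * recently gets rescaled to (5000 << 10) / 20000 * 10000 >> 10 = 2500, i.e.
 * the same 25% duty cycle projected onto a fresh 10 second window, with
 * ke_ftick pulled up to ke_ltick - 10000.
 */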
1226
1227void
1228sched_thread_priority(struct thread *td, u_char prio)
1229{
1230	struct kse *ke;
1231
1232	CTR6(KTR_SCHED, "sched_prio: %p(%s) prio %d newprio %d by %p(%s)",
1233	    td, td->td_proc->p_comm, td->td_priority, prio, curthread,
1234	    curthread->td_proc->p_comm);
1235	ke = td->td_kse;
1236	mtx_assert(&sched_lock, MA_OWNED);
1237	if (td->td_priority == prio)
1238		return;
1239	if (TD_ON_RUNQ(td)) {
1240		/*
1241		 * If the priority has been elevated due to priority
1242		 * propagation, we may have to move ourselves to a new
1243		 * queue.  We still call adjustrunqueue below in case kse
1244		 * needs to fix things up.
1245		 */
1246		if (prio < td->td_priority && ke->ke_runq != NULL &&
1247		    (ke->ke_flags & KEF_ASSIGNED) == 0 &&
1248		    ke->ke_runq != KSEQ_CPU(ke->ke_cpu)->ksq_curr) {
1249			runq_remove(ke->ke_runq, ke);
1250			ke->ke_runq = KSEQ_CPU(ke->ke_cpu)->ksq_curr;
1251			runq_add(ke->ke_runq, ke, 0);
1252		}
1253		/*
1254		 * Hold this kse on this cpu so that sched_prio() doesn't
1255		 * cause excessive migration.  We only want migration to
1256		 * happen as the result of a wakeup.
1257		 */
1258		ke->ke_flags |= KEF_HOLD;
1259		adjustrunqueue(td, prio);
1260		ke->ke_flags &= ~KEF_HOLD;
1261	} else
1262		td->td_priority = prio;
1263}
1264
1265/*
1266 * Update a thread's priority when it is lent another thread's
1267 * priority.
1268 */
1269void
1270sched_lend_prio(struct thread *td, u_char prio)
1271{
1272
1273	td->td_flags |= TDF_BORROWING;
1274	sched_thread_priority(td, prio);
1275}
1276
1277/*
1278 * Restore a thread's priority when priority propagation is
1279 * over.  The prio argument is the minimum priority the thread
1280 * needs to have to satisfy other possible priority lending
1281 * requests.  If the thread's regular priority is less
1282 * important than prio, the thread will keep a priority boost
1283 * of prio.
1284 */
1285void
1286sched_unlend_prio(struct thread *td, u_char prio)
1287{
1288	u_char base_pri;
1289
1290	if (td->td_base_pri >= PRI_MIN_TIMESHARE &&
1291	    td->td_base_pri <= PRI_MAX_TIMESHARE)
1292		base_pri = td->td_user_pri;
1293	else
1294		base_pri = td->td_base_pri;
1295	if (prio >= base_pri) {
1296		td->td_flags &= ~TDF_BORROWING;
1297		sched_thread_priority(td, base_pri);
1298	} else
1299		sched_lend_prio(td, prio);
1300}
1301
1302void
1303sched_prio(struct thread *td, u_char prio)
1304{
1305	u_char oldprio;
1306
1307	/* First, update the base priority. */
1308	td->td_base_pri = prio;
1309
1310	/*
1311	 * If the thread is borrowing another thread's priority, don't
1312	 * ever lower the priority.
1313	 */
1314	if (td->td_flags & TDF_BORROWING && td->td_priority < prio)
1315		return;
1316
1317	/* Change the real priority. */
1318	oldprio = td->td_priority;
1319	sched_thread_priority(td, prio);
1320
1321	/*
1322	 * If the thread is on a turnstile, then let the turnstile update
1323	 * its state.
1324	 */
1325	if (TD_ON_LOCK(td) && oldprio != prio)
1326		turnstile_adjust(td, oldprio);
1327}
1328
1329void
1330#ifdef KSE
1331sched_user_prio(struct ksegrp *kg, u_char prio)
1332#else
1333sched_user_prio(struct thread *td, u_char prio)
1334#endif
1335{
1336#ifdef KSE
1337	struct thread *td;
1338#endif
1339	u_char oldprio;
1340
1341#ifdef KSE
1342	kg->kg_base_user_pri = prio;
1343
1344	/* XXXKSE only for 1:1 */
1345
1346	td = TAILQ_FIRST(&kg->kg_threads);
1347	if (td == NULL) {
1348		kg->kg_user_pri = prio;
1349		return;
1350	}
1351
1352	if (td->td_flags & TDF_UBORROWING && kg->kg_user_pri <= prio)
1353		return;
1354
1355	oldprio = kg->kg_user_pri;
1356	kg->kg_user_pri = prio;
1357#else
1358	td->td_base_user_pri = prio;
1359
1360	oldprio = td->td_user_pri;
1361	td->td_user_pri = prio;
1362#endif
1363
1364	if (TD_ON_UPILOCK(td) && oldprio != prio)
1365		umtx_pi_adjust(td, oldprio);
1366}
1367
1368void
1369sched_lend_user_prio(struct thread *td, u_char prio)
1370{
1371	u_char oldprio;
1372
1373	td->td_flags |= TDF_UBORROWING;
1374
1375#ifdef KSE
1376	oldprio = td->td_ksegrp->kg_user_pri;
1377	td->td_ksegrp->kg_user_pri = prio;
1378#else
1379	oldprio = td->td_user_pri;
1380	td->td_user_pri = prio;
1381#endif
1382
1383	if (TD_ON_UPILOCK(td) && oldprio != prio)
1384		umtx_pi_adjust(td, oldprio);
1385}
1386
1387void
1388sched_unlend_user_prio(struct thread *td, u_char prio)
1389{
1390#ifdef KSE
1391	struct ksegrp *kg = td->td_ksegrp;
1392#endif
1393	u_char base_pri;
1394
1395#ifdef KSE
1396	base_pri = kg->kg_base_user_pri;
1397#else
1398	base_pri = td->td_base_user_pri;
1399#endif
1400	if (prio >= base_pri) {
1401		td->td_flags &= ~TDF_UBORROWING;
1402#ifdef KSE
1403		sched_user_prio(kg, base_pri);
1404#else
1405		sched_user_prio(td, base_pri);
1406#endif
1407	} else
1408		sched_lend_user_prio(td, prio);
1409}
1410
1411void
1412sched_switch(struct thread *td, struct thread *newtd, int flags)
1413{
1414	struct kseq *ksq;
1415	struct kse *ke;
1416
1417	mtx_assert(&sched_lock, MA_OWNED);
1418
1419	ke = td->td_kse;
1420	ksq = KSEQ_SELF();
1421
1422	td->td_lastcpu = td->td_oncpu;
1423	td->td_oncpu = NOCPU;
1424	td->td_flags &= ~TDF_NEEDRESCHED;
1425	td->td_owepreempt = 0;
1426
1427	/*
1428	 * If the KSE has been assigned it may be in the process of switching
1429	 * to the new cpu.  This is the case in sched_bind().
1430	 */
1431	if (td == PCPU_GET(idlethread)) {
1432		TD_SET_CAN_RUN(td);
1433	} else if ((ke->ke_flags & KEF_ASSIGNED) == 0) {
1434		/* We are ending our run so make our slot available again */
1435		kseq_load_rem(ksq, ke);
1436		if (TD_IS_RUNNING(td)) {
1437			/*
1438			 * Don't allow the thread to migrate
1439			 * from a preemption.
1440			 */
1441			ke->ke_flags |= KEF_HOLD;
1442			setrunqueue(td, (flags & SW_PREEMPT) ?
1443			    SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
1444			    SRQ_OURSELF|SRQ_YIELDING);
1445			ke->ke_flags &= ~KEF_HOLD;
1446		}
1447	}
1448	if (newtd != NULL) {
1449		/*
1450		 * If we bring in a thread, account for it as if it had been
1451		 * added to the run queue and then chosen.
1452		 */
1453		newtd->td_kse->ke_flags |= KEF_DIDRUN;
1454		newtd->td_kse->ke_runq = ksq->ksq_curr;
1455		TD_SET_RUNNING(newtd);
1456		kseq_load_add(KSEQ_SELF(), newtd->td_kse);
1457	} else
1458		newtd = choosethread();
1459	if (td != newtd) {
1460#ifdef	HWPMC_HOOKS
1461		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
1462			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
1463#endif
1464
1465		cpu_switch(td, newtd);
1466#ifdef	HWPMC_HOOKS
1467		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
1468			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
1469#endif
1470	}
1471
1472	sched_lock.mtx_lock = (uintptr_t)td;
1473
1474	td->td_oncpu = PCPU_GET(cpuid);
1475}
1476
1477void
1478sched_nice(struct proc *p, int nice)
1479{
1480	struct kse *ke;
1481	struct thread *td;
1482	struct kseq *kseq;
1483
1484	PROC_LOCK_ASSERT(p, MA_OWNED);
1485	mtx_assert(&sched_lock, MA_OWNED);
1486	/*
1487	 * We need to adjust the nice counts for running KSEs.
1488	 */
1489	FOREACH_THREAD_IN_PROC(p, td) {
1490		if (td->td_pri_class == PRI_TIMESHARE) {
1491			ke = td->td_kse;
1492			if (ke->ke_runq == NULL)
1493				continue;
1494			kseq = KSEQ_CPU(ke->ke_cpu);
1495			kseq_nice_rem(kseq, p->p_nice);
1496			kseq_nice_add(kseq, nice);
1497		}
1498	}
1499	p->p_nice = nice;
1500	FOREACH_THREAD_IN_PROC(p, td) {
1501		sched_priority(td);
1502		td->td_flags |= TDF_NEEDRESCHED;
1503	}
1504}
1505
1506void
1507sched_sleep(struct thread *td)
1508{
1509	mtx_assert(&sched_lock, MA_OWNED);
1510
1511	td->td_kse->ke_slptime = ticks;
1512}
1513
1514void
1515sched_wakeup(struct thread *td)
1516{
1517	mtx_assert(&sched_lock, MA_OWNED);
1518
1519	/*
1520	 * Let the kseg know how long we slept for.  This is because process
1521	 * interactivity behavior is modeled in the kseg.
1522	 */
1523	if (td->td_kse->ke_slptime) {
1524		int hzticks;
1525
1526		hzticks = (ticks - td->td_kse->ke_slptime) << 10;
1527		if (hzticks >= SCHED_SLP_RUN_MAX) {
1528			td->td_sched->skg_slptime = SCHED_SLP_RUN_MAX;
1529			td->td_sched->skg_runtime = 1;
1530		} else {
1531			td->td_sched->skg_slptime += hzticks;
1532			sched_interact_update(td);
1533		}
1534		sched_priority(td);
1535		sched_slice(td->td_kse);
1536		td->td_kse->ke_slptime = 0;
1537	}
1538	setrunqueue(td, SRQ_BORING);
1539}
1540
1541/*
1542 * Penalize the parent for creating a new child and initialize the child's
1543 * priority.
1544 */
1545void
1546sched_fork(struct thread *td, struct thread *child)
1547{
1548	struct kse *ke;
1549	struct kse *ke2;
1550
1551	mtx_assert(&sched_lock, MA_OWNED);
1552
1553	child->td_sched->skg_slptime = td->td_sched->skg_slptime;
1554	child->td_sched->skg_runtime = td->td_sched->skg_runtime;
1555	child->td_user_pri = td->td_user_pri;
1556	child->td_base_user_pri = td->td_base_user_pri;
1557	sched_interact_fork(child);
1558	td->td_sched->skg_runtime += tickincr;
1559	sched_interact_update(td);
1560
1561	sched_newthread(child);
1562
1563	ke = td->td_kse;
1564	ke2 = child->td_kse;
1565	ke2->ke_slice = 1;	/* Attempt to quickly learn interactivity. */
1566	ke2->ke_cpu = ke->ke_cpu;
1567	ke2->ke_runq = NULL;
1568
1569	/* Grab our parents cpu estimation information. */
1570	ke2->ke_ticks = ke->ke_ticks;
1571	ke2->ke_ltick = ke->ke_ltick;
1572	ke2->ke_ftick = ke->ke_ftick;
1573}
1574
1575void
1576sched_class(struct thread *td, int class)
1577{
1578	struct kseq *kseq;
1579	struct kse *ke;
1580	int nclass;
1581	int oclass;
1582
1583	mtx_assert(&sched_lock, MA_OWNED);
1584	if (td->td_pri_class == class)
1585		return;
1586
1587	nclass = PRI_BASE(class);
1588	oclass = PRI_BASE(td->td_pri_class);
1589	ke = td->td_kse;
1590	if ((ke->ke_state != KES_ONRUNQ &&
1591	    ke->ke_state != KES_THREAD) || ke->ke_runq == NULL) {
1592		td->td_pri_class = class;
		return;
	}
1593	kseq = KSEQ_CPU(ke->ke_cpu);
1594
1595#ifdef SMP
1596	/*
1597	 * count because we could be changing to or from an interrupt
1598	 * count because could be changing to or from an interrupt
1599	 * class.
1600	 */
1601	if (ke->ke_state == KES_ONRUNQ) {
1602		if (KSE_CAN_MIGRATE(ke)) {
1603			kseq->ksq_transferable--;
1604			kseq->ksq_group->ksg_transferable--;
1605		}
1606		if (KSE_CAN_MIGRATE(ke)) {
1607			kseq->ksq_transferable++;
1608			kseq->ksq_group->ksg_transferable++;
1609		}
1610	}
1611#endif
1612	if (oclass == PRI_TIMESHARE) {
1613		kseq->ksq_load_timeshare--;
1614		kseq_nice_rem(kseq, td->td_proc->p_nice);
1615	}
1616	if (nclass == PRI_TIMESHARE) {
1617		kseq->ksq_load_timeshare++;
1618		kseq_nice_add(kseq, td->td_proc->p_nice);
1619	}
1620
1621	td->td_pri_class = class;
1622}
1623
1624/*
1625 * Return some of the child's priority and interactivity to the parent.
1626 */
1627void
1628sched_exit(struct proc *p, struct thread *childtd)
1629{
1630	struct thread *parent = FIRST_THREAD_IN_PROC(p);
1631	mtx_assert(&sched_lock, MA_OWNED);
1632
1633	CTR3(KTR_SCHED, "sched_exit: %p(%s) prio %d",
1634	    childtd, childtd->td_proc->p_comm, childtd->td_priority);
1635
1636	/* parent->td_sched->skg_slptime += childtd->td_sched->skg_slptime; */
1637	parent->td_sched->skg_runtime += childtd->td_sched->skg_runtime;
1638	sched_interact_update(parent);
1639
1640	kseq_load_rem(KSEQ_CPU(childtd->td_kse->ke_cpu), childtd->td_kse);
1641}
1642
1643void
1644sched_clock(struct thread *td)
1645{
1646	struct kseq *kseq;
1647	struct kse *ke;
1648
1649	mtx_assert(&sched_lock, MA_OWNED);
1650	kseq = KSEQ_SELF();
1651#ifdef SMP
1652	if (ticks >= bal_tick)
1653		sched_balance();
1654	if (ticks >= gbal_tick && balance_groups)
1655		sched_balance_groups();
1656	/*
1657	 * We could have been assigned a non real-time thread without an
1658	 * IPI.
1659	 */
1660	if (kseq->ksq_assigned)
1661		kseq_assign(kseq);	/* Potentially sets NEEDRESCHED */
1662#endif
1663	ke = td->td_kse;
1664
1665	/* Adjust ticks for pctcpu */
1666	ke->ke_ticks++;
1667	ke->ke_ltick = ticks;
1668
1669	/* Go up to one second beyond our max and then trim back down */
1670	if (ke->ke_ftick + SCHED_CPU_TICKS + hz < ke->ke_ltick)
1671		sched_pctcpu_update(ke);
1672
1673	if (td->td_flags & TDF_IDLETD)
1674		return;
1675	/*
1676	 * We only do slicing code for TIMESHARE threads.
1677	 */
1678	if (td->td_pri_class != PRI_TIMESHARE)
1679		return;
1680	/*
1681	 * We used a tick; charge it to the thread so that we can compute our
1682	 * interactivity.
1683	 */
1684	td->td_sched->skg_runtime += tickincr;
1685	sched_interact_update(td);
1686
1687	/*
1688	 * We used up one time slice.
1689	 */
1690	if (--ke->ke_slice > 0)
1691		return;
1692	/*
1693	 * We're out of time, recompute priorities and requeue.
1694	 */
1695	kseq_load_rem(kseq, ke);
1696	sched_priority(td);
1697	sched_slice(ke);
1698	if (SCHED_CURR(td, ke))
1699		ke->ke_runq = kseq->ksq_curr;
1700	else
1701		ke->ke_runq = kseq->ksq_next;
1702	kseq_load_add(kseq, ke);
1703	td->td_flags |= TDF_NEEDRESCHED;
1704}
1705
1706int
1707sched_runnable(void)
1708{
1709	struct kseq *kseq;
1710	int load;
1711
1712	load = 1;
1713
1714	kseq = KSEQ_SELF();
1715#ifdef SMP
1716	if (kseq->ksq_assigned) {
1717		mtx_lock_spin(&sched_lock);
1718		kseq_assign(kseq);
1719		mtx_unlock_spin(&sched_lock);
1720	}
1721#endif
1722	if ((curthread->td_flags & TDF_IDLETD) != 0) {
1723		if (kseq->ksq_load > 0)
1724			goto out;
1725	} else
1726		if (kseq->ksq_load - 1 > 0)
1727			goto out;
1728	load = 0;
1729out:
1730	return (load);
1731}
1732
1733struct kse *
1734sched_choose(void)
1735{
1736	struct kseq *kseq;
1737	struct kse *ke;
1738
1739	mtx_assert(&sched_lock, MA_OWNED);
1740	kseq = KSEQ_SELF();
1741#ifdef SMP
1742restart:
1743	if (kseq->ksq_assigned)
1744		kseq_assign(kseq);
1745#endif
1746	ke = kseq_choose(kseq);
1747	if (ke) {
1748#ifdef SMP
1749		if (ke->ke_thread->td_pri_class == PRI_IDLE)
1750			if (kseq_idled(kseq) == 0)
1751				goto restart;
1752#endif
1753		kseq_runq_rem(kseq, ke);
1754		ke->ke_state = KES_THREAD;
1755		ke->ke_flags &= ~KEF_PREEMPTED;
1756		return (ke);
1757	}
1758#ifdef SMP
1759	if (kseq_idled(kseq) == 0)
1760		goto restart;
1761#endif
1762	return (NULL);
1763}
1764
1765void
1766sched_add(struct thread *td, int flags)
1767{
1768	struct kseq *kseq;
1769	struct kse *ke;
1770	int preemptive;
1771	int canmigrate;
1772	int class;
1773
1774	CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)",
1775	    td, td->td_proc->p_comm, td->td_priority, curthread,
1776	    curthread->td_proc->p_comm);
1777	mtx_assert(&sched_lock, MA_OWNED);
1778	ke = td->td_kse;
1779	canmigrate = 1;
1780	preemptive = !(flags & SRQ_YIELDING);
1781	class = PRI_BASE(td->td_pri_class);
1782	kseq = KSEQ_SELF();
1783	ke->ke_flags &= ~KEF_INTERNAL;
1784#ifdef SMP
1785	if (ke->ke_flags & KEF_ASSIGNED) {
1786		if (ke->ke_flags & KEF_REMOVED)
1787			ke->ke_flags &= ~KEF_REMOVED;
1788		return;
1789	}
1790	canmigrate = KSE_CAN_MIGRATE(ke);
1791	/*
1792	 * Don't migrate running threads here.  Force the long term balancer
1793	 * to do it.
1794	 */
1795	if (ke->ke_flags & KEF_HOLD) {
1796		ke->ke_flags &= ~KEF_HOLD;
1797		canmigrate = 0;
1798	}
1799#endif
1800	KASSERT(ke->ke_state != KES_ONRUNQ,
1801	    ("sched_add: kse %p (%s) already in run queue", ke,
1802	    td->td_proc->p_comm));
1803	KASSERT(td->td_proc->p_sflag & PS_INMEM,
1804	    ("sched_add: process swapped out"));
1805	KASSERT(ke->ke_runq == NULL,
1806	    ("sched_add: KSE %p is still assigned to a run queue", ke));
1807	if (flags & SRQ_PREEMPTED)
1808		ke->ke_flags |= KEF_PREEMPTED;
1809	switch (class) {
1810	case PRI_ITHD:
1811	case PRI_REALTIME:
1812		ke->ke_runq = kseq->ksq_curr;
1813		ke->ke_slice = SCHED_SLICE_MAX;
1814		if (canmigrate)
1815			ke->ke_cpu = PCPU_GET(cpuid);
1816		break;
1817	case PRI_TIMESHARE:
1818		if (SCHED_CURR(td, ke))
1819			ke->ke_runq = kseq->ksq_curr;
1820		else
1821			ke->ke_runq = kseq->ksq_next;
1822		break;
1823	case PRI_IDLE:
1824		/*
1825		 * This is for priority prop.
1826		 */
1827		if (ke->ke_thread->td_priority < PRI_MIN_IDLE)
1828			ke->ke_runq = kseq->ksq_curr;
1829		else
1830			ke->ke_runq = &kseq->ksq_idle;
1831		ke->ke_slice = SCHED_SLICE_MIN;
1832		break;
1833	default:
1834		panic("Unknown pri class.");
1835		break;
1836	}
1837#ifdef SMP
1838	/*
1839	 * If this thread is pinned or bound, notify the target cpu.
1840	 */
1841	if (!canmigrate && ke->ke_cpu != PCPU_GET(cpuid) ) {
1842		ke->ke_runq = NULL;
1843		kseq_notify(ke, ke->ke_cpu);
1844		return;
1845	}
1846	/*
1847	 * If we had been idle, clear our bit in the group and potentially
1848	 * the global bitmap.  If not, see if we should transfer this thread.
1849	 */
1850	if ((class == PRI_TIMESHARE || class == PRI_REALTIME) &&
1851	    (kseq->ksq_group->ksg_idlemask & PCPU_GET(cpumask)) != 0) {
1852		/*
1853		 * Check to see if our group is unidling, and if so, remove it
1854		 * from the global idle mask.
1855		 */
1856		if (kseq->ksq_group->ksg_idlemask ==
1857		    kseq->ksq_group->ksg_cpumask)
1858			atomic_clear_int(&kseq_idle, kseq->ksq_group->ksg_mask);
1859		/*
1860		 * Now remove ourselves from the group specific idle mask.
1861		 */
1862		kseq->ksq_group->ksg_idlemask &= ~PCPU_GET(cpumask);
1863	} else if (canmigrate && kseq->ksq_load > 1 && class != PRI_ITHD)
1864		if (kseq_transfer(kseq, ke, class))
1865			return;
1866	ke->ke_cpu = PCPU_GET(cpuid);
1867#endif
1868	if (td->td_priority < curthread->td_priority &&
1869	    ke->ke_runq == kseq->ksq_curr)
1870		curthread->td_flags |= TDF_NEEDRESCHED;
1871	if (preemptive && maybe_preempt(td))
1872		return;
1873	ke->ke_state = KES_ONRUNQ;
1874
1875	kseq_runq_add(kseq, ke, flags);
1876	kseq_load_add(kseq, ke);
1877}
1878
1879void
1880sched_rem(struct thread *td)
1881{
1882	struct kseq *kseq;
1883	struct kse *ke;
1884
1885	CTR5(KTR_SCHED, "sched_rem: %p(%s) prio %d by %p(%s)",
1886	    td, td->td_proc->p_comm, td->td_priority, curthread,
1887	    curthread->td_proc->p_comm);
1888	mtx_assert(&sched_lock, MA_OWNED);
1889	ke = td->td_kse;
1890	ke->ke_flags &= ~KEF_PREEMPTED;
1891	if (ke->ke_flags & KEF_ASSIGNED) {
1892		ke->ke_flags |= KEF_REMOVED;
1893		return;
1894	}
1895	KASSERT((ke->ke_state == KES_ONRUNQ),
1896	    ("sched_rem: KSE not on run queue"));
1897
1898	ke->ke_state = KES_THREAD;
1899	kseq = KSEQ_CPU(ke->ke_cpu);
1900	kseq_runq_rem(kseq, ke);
1901	kseq_load_rem(kseq, ke);
1902}
1903
1904fixpt_t
1905sched_pctcpu(struct thread *td)
1906{
1907	fixpt_t pctcpu;
1908	struct kse *ke;
1909
1910	pctcpu = 0;
1911	ke = td->td_kse;
1912	if (ke == NULL)
1913		return (0);
1914
1915	mtx_lock_spin(&sched_lock);
1916	if (ke->ke_ticks) {
1917		int rtick;
1918
1919		/*
1920		 * Don't update more frequently than twice a second.  Allowing
1921		 * this causes the cpu usage to decay away too quickly due to
1922		 * rounding errors.
1923		 */
1924		if (ke->ke_ftick + SCHED_CPU_TICKS < ke->ke_ltick ||
1925		    ke->ke_ltick < (ticks - (hz / 2)))
1926			sched_pctcpu_update(ke);
1927		/* How many rtick per second ? */
1928		rtick = min(ke->ke_ticks / SCHED_CPU_TIME, SCHED_CPU_TICKS);
1929		pctcpu = (FSCALE * ((FSCALE * rtick)/realstathz)) >> FSHIFT;
1930	}
1931
1932	td->td_proc->p_swtime = ke->ke_ltick - ke->ke_ftick;
1933	mtx_unlock_spin(&sched_lock);
1934
1935	return (pctcpu);
1936}
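/*
 * Worked example (assuming realstathz = 128): a thread that accumulated
 * ke_ticks = 640 over the 10 second window yields rtick = 640 /
 * SCHED_CPU_TIME = 64 stat ticks per second, so pctcpu =
 * (FSCALE * ((FSCALE * 64) / 128)) >> FSHIFT = FSCALE / 2, the fixed-point
 * encoding of 50% CPU.
 */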
1937
1938void
1939sched_bind(struct thread *td, int cpu)
1940{
1941	struct kse *ke;
1942
1943	mtx_assert(&sched_lock, MA_OWNED);
1944	ke = td->td_kse;
1945	ke->ke_flags |= KEF_BOUND;
1946#ifdef SMP
1947	if (PCPU_GET(cpuid) == cpu)
1948		return;
1949	/* sched_rem without the runq_remove */
1950	ke->ke_state = KES_THREAD;
1951	kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke);
1952	kseq_notify(ke, cpu);
1953	/* When we return from mi_switch we'll be on the correct cpu. */
1954	mi_switch(SW_VOL, NULL);
1955#endif
1956}
1957
1958void
1959sched_unbind(struct thread *td)
1960{
1961	mtx_assert(&sched_lock, MA_OWNED);
1962	td->td_kse->ke_flags &= ~KEF_BOUND;
1963}
1964
1965int
1966sched_is_bound(struct thread *td)
1967{
1968	mtx_assert(&sched_lock, MA_OWNED);
1969	return (td->td_kse->ke_flags & KEF_BOUND);
1970}
1971
1972void
1973sched_relinquish(struct thread *td)
1974{
1975#ifdef KSE
1976	struct ksegrp *kg;
1977
1978	kg = td->td_ksegrp;
1979#endif
1980	mtx_lock_spin(&sched_lock);
1981#ifdef KSE
1982	if (kg->kg_pri_class == PRI_TIMESHARE)
1983#else
1984	if (td->td_pri_class == PRI_TIMESHARE)
1985#endif
1986		sched_prio(td, PRI_MAX_TIMESHARE);
1987	mi_switch(SW_VOL, NULL);
1988	mtx_unlock_spin(&sched_lock);
1989}
1990
1991int
1992sched_load(void)
1993{
1994#ifdef SMP
1995	int total;
1996	int i;
1997
1998	total = 0;
1999	for (i = 0; i <= ksg_maxid; i++)
2000		total += KSEQ_GROUP(i)->ksg_load;
2001	return (total);
2002#else
2003	return (KSEQ_SELF()->ksq_sysload);
2004#endif
2005}
2006
2007int
2008sched_sizeof_proc(void)
2009{
2010	return (sizeof(struct proc));
2011}
2012
2013int
2014sched_sizeof_thread(void)
2015{
2016	return (sizeof(struct thread) + sizeof(struct td_sched));
2017}
2018
2019void
2020sched_tick(void)
2021{
2022}
2023#define KERN_SWITCH_INCLUDE 1
2024#include "kern/kern_switch.c"
2025