sched_ule.c revision 164939
1/*-
2 * Copyright (c) 2002-2005, Jeffrey Roberson <jeff@freebsd.org>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice unmodified, this list of conditions, and the following
10 *    disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
19 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27#include <sys/cdefs.h>
28__FBSDID("$FreeBSD: head/sys/kern/sched_ule.c 164939 2006-12-06 06:55:59Z julian $");
29
30#include "opt_hwpmc_hooks.h"
31#include "opt_sched.h"
32
33#include <sys/param.h>
34#include <sys/systm.h>
35#include <sys/kdb.h>
36#include <sys/kernel.h>
37#include <sys/ktr.h>
38#include <sys/lock.h>
39#include <sys/mutex.h>
40#include <sys/proc.h>
41#include <sys/resource.h>
42#include <sys/resourcevar.h>
43#include <sys/sched.h>
44#include <sys/smp.h>
45#include <sys/sx.h>
46#include <sys/sysctl.h>
47#include <sys/sysproto.h>
48#include <sys/turnstile.h>
49#include <sys/umtx.h>
50#include <sys/vmmeter.h>
51#ifdef KTRACE
52#include <sys/uio.h>
53#include <sys/ktrace.h>
54#endif
55
56#ifdef HWPMC_HOOKS
57#include <sys/pmckern.h>
58#endif
59
60#include <machine/cpu.h>
61#include <machine/smp.h>
62
63/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
64/* XXX This is bogus compatability crap for ps */
65static fixpt_t  ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
66SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
67
68static void sched_setup(void *dummy);
69SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL)
70
71static void sched_initticks(void *dummy);
72SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks, NULL)
73
74static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "Scheduler");
75
76SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ule", 0,
77    "Scheduler name");
78
79static int slice_min = 1;
80SYSCTL_INT(_kern_sched, OID_AUTO, slice_min, CTLFLAG_RW, &slice_min, 0, "");
81
82static int slice_max = 10;
83SYSCTL_INT(_kern_sched, OID_AUTO, slice_max, CTLFLAG_RW, &slice_max, 0, "");
84
85int realstathz;
86int tickincr = 1 << 10;
87
88/*
89 * The following datastructures are allocated within their parent structure
90 * but are scheduler specific.
91 */
92/*
93 * Thread scheduler specific section.
94 * fields int he thread structure that are specific to this scheduler.
95 */
96struct td_sched {
97	TAILQ_ENTRY(td_sched) ts_procq;	/* (j/z) Run queue. */
98	int		ts_flags;	/* (j) TSF_* flags. */
99	struct thread	*ts_thread;	/* (*) Active associated thread. */
100	fixpt_t		ts_pctcpu;	/* (j) %cpu during p_swtime. */
101	u_char		ts_rqindex;	/* (j) Run queue index. */
102	enum {
103		TSS_THREAD = 0x0,	/* slaved to thread state */
104		TSS_ONRUNQ
105	} ts_state;			/* (j) thread sched specific status. */
106	int		ts_slptime;
107	int		ts_slice;
108	struct runq	*ts_runq;
109	u_char		ts_cpu;		/* CPU that we have affinity for. */
110	/* The following variables are only used for pctcpu calculation */
111	int		ts_ltick;	/* Last tick that we were running on */
112	int		ts_ftick;	/* First tick that we were running on */
113	int		ts_ticks;	/* Tick count */
114
115	/* originally from kg_sched */
116	int	skg_slptime;		/* Number of ticks we vol. slept */
117	int	skg_runtime;		/* Number of ticks we were running */
118};
119#define	ts_assign		ts_procq.tqe_next
120/* flags kept in ts_flags */
121#define	TSF_ASSIGNED	0x0001		/* Thread is being migrated. */
122#define	TSF_BOUND	0x0002		/* Thread can not migrate. */
123#define	TSF_XFERABLE	0x0004		/* Thread was added as transferable. */
124#define	TSF_HOLD	0x0008		/* Thread is temporarily bound. */
125#define	TSF_REMOVED	0x0010		/* Thread was removed while ASSIGNED */
126#define	TSF_INTERNAL	0x0020		/* Thread added due to migration. */
127#define	TSF_PREEMPTED	0x0040		/* Thread was preempted */
128#define	TSF_DIDRUN	0x02000		/* Thread actually ran. */
129#define	TSF_EXIT	0x04000		/* Thread is being killed. */
130
131static struct td_sched td_sched0;
132
133/*
134 * The priority is primarily determined by the interactivity score.  Thus, we
135 * give lower(better) priorities to kse groups that use less CPU.  The nice
136 * value is then directly added to this to allow nice to have some effect
137 * on latency.
138 *
139 * PRI_RANGE:	Total priority range for timeshare threads.
140 * PRI_NRESV:	Number of nice values.
141 * PRI_BASE:	The start of the dynamic range.
142 */
143#define	SCHED_PRI_RANGE		(PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1)
144#define	SCHED_PRI_NRESV		((PRIO_MAX - PRIO_MIN) + 1)
145#define	SCHED_PRI_NHALF		(SCHED_PRI_NRESV / 2)
146#define	SCHED_PRI_BASE		(PRI_MIN_TIMESHARE)
147#define	SCHED_PRI_INTERACT(score)					\
148    ((score) * SCHED_PRI_RANGE / SCHED_INTERACT_MAX)
149
150/*
151 * These determine the interactivity of a process.
152 *
153 * SLP_RUN_MAX:	Maximum amount of sleep time + run time we'll accumulate
154 *		before throttling back.
155 * SLP_RUN_FORK:	Maximum slp+run time to inherit at fork time.
156 * INTERACT_MAX:	Maximum interactivity value.  Smaller is better.
157 * INTERACT_THRESH:	Threshhold for placement on the current runq.
158 */
159#define	SCHED_SLP_RUN_MAX	((hz * 5) << 10)
160#define	SCHED_SLP_RUN_FORK	((hz / 2) << 10)
161#define	SCHED_INTERACT_MAX	(100)
162#define	SCHED_INTERACT_HALF	(SCHED_INTERACT_MAX / 2)
163#define	SCHED_INTERACT_THRESH	(30)
164
165/*
166 * These parameters and macros determine the size of the time slice that is
167 * granted to each thread.
168 *
169 * SLICE_MIN:	Minimum time slice granted, in units of ticks.
170 * SLICE_MAX:	Maximum time slice granted.
171 * SLICE_RANGE:	Range of available time slices scaled by hz.
172 * SLICE_SCALE:	The number slices granted per val in the range of [0, max].
173 * SLICE_NICE:  Determine the amount of slice granted to a scaled nice.
174 * SLICE_NTHRESH:	The nice cutoff point for slice assignment.
175 */
176#define	SCHED_SLICE_MIN			(slice_min)
177#define	SCHED_SLICE_MAX			(slice_max)
178#define	SCHED_SLICE_INTERACTIVE		(slice_max)
179#define	SCHED_SLICE_NTHRESH	(SCHED_PRI_NHALF - 1)
180#define	SCHED_SLICE_RANGE		(SCHED_SLICE_MAX - SCHED_SLICE_MIN + 1)
181#define	SCHED_SLICE_SCALE(val, max)	(((val) * SCHED_SLICE_RANGE) / (max))
182#define	SCHED_SLICE_NICE(nice)						\
183    (SCHED_SLICE_MAX - SCHED_SLICE_SCALE((nice), SCHED_SLICE_NTHRESH))
184
185/*
186 * This macro determines whether or not the thread belongs on the current or
187 * next run queue.
188 */
189#define	SCHED_INTERACTIVE(td)						\
190    (sched_interact_score(td) < SCHED_INTERACT_THRESH)
191#define	SCHED_CURR(td, ts)						\
192    ((ts->ts_thread->td_flags & TDF_BORROWING) ||			\
193     (ts->ts_flags & TSF_PREEMPTED) || SCHED_INTERACTIVE(td))
194
195/*
196 * Cpu percentage computation macros and defines.
197 *
198 * SCHED_CPU_TIME:	Number of seconds to average the cpu usage across.
199 * SCHED_CPU_TICKS:	Number of hz ticks to average the cpu usage across.
200 */
201
202#define	SCHED_CPU_TIME	10
203#define	SCHED_CPU_TICKS	(hz * SCHED_CPU_TIME)
204
205/*
206 * tdq - per processor runqs and statistics.
207 */
208struct tdq {
209	struct runq	ksq_idle;		/* Queue of IDLE threads. */
210	struct runq	ksq_timeshare[2];	/* Run queues for !IDLE. */
211	struct runq	*ksq_next;		/* Next timeshare queue. */
212	struct runq	*ksq_curr;		/* Current queue. */
213	int		ksq_load_timeshare;	/* Load for timeshare. */
214	int		ksq_load;		/* Aggregate load. */
215	short		ksq_nice[SCHED_PRI_NRESV]; /* threadss in each nice bin. */
216	short		ksq_nicemin;		/* Least nice. */
217#ifdef SMP
218	int			ksq_transferable;
219	LIST_ENTRY(tdq)	ksq_siblings;	/* Next in tdq group. */
220	struct tdq_group	*ksq_group;	/* Our processor group. */
221	volatile struct td_sched *ksq_assigned;	/* assigned by another CPU. */
222#else
223	int		ksq_sysload;		/* For loadavg, !ITHD load. */
224#endif
225};
226
227#ifdef SMP
228/*
229 * tdq groups are groups of processors which can cheaply share threads.  When
230 * one processor in the group goes idle it will check the runqs of the other
231 * processors in its group prior to halting and waiting for an interrupt.
232 * These groups are suitable for SMT (Symetric Multi-Threading) and not NUMA.
233 * In a numa environment we'd want an idle bitmap per group and a two tiered
234 * load balancer.
235 */
236struct tdq_group {
237	int	ksg_cpus;		/* Count of CPUs in this tdq group. */
238	cpumask_t ksg_cpumask;		/* Mask of cpus in this group. */
239	cpumask_t ksg_idlemask;		/* Idle cpus in this group. */
240	cpumask_t ksg_mask;		/* Bit mask for first cpu. */
241	int	ksg_load;		/* Total load of this group. */
242	int	ksg_transferable;	/* Transferable load of this group. */
243	LIST_HEAD(, tdq)	ksg_members; /* Linked list of all members. */
244};
245#endif
246
247/*
248 * One kse queue per processor.
249 */
250#ifdef SMP
251static cpumask_t tdq_idle;
252static int ksg_maxid;
253static struct tdq	tdq_cpu[MAXCPU];
254static struct tdq_group tdq_groups[MAXCPU];
255static int bal_tick;
256static int gbal_tick;
257static int balance_groups;
258
259#define	TDQ_SELF()	(&tdq_cpu[PCPU_GET(cpuid)])
260#define	TDQ_CPU(x)	(&tdq_cpu[(x)])
261#define	TDQ_ID(x)	((x) - tdq_cpu)
262#define	TDQ_GROUP(x)	(&tdq_groups[(x)])
263#else	/* !SMP */
264static struct tdq	tdq_cpu;
265
266#define	TDQ_SELF()	(&tdq_cpu)
267#define	TDQ_CPU(x)	(&tdq_cpu)
268#endif
269
270static struct td_sched *sched_choose(void);		/* XXX Should be thread * */
271static void sched_slice(struct td_sched *);
272static void sched_priority(struct thread *);
273static void sched_thread_priority(struct thread *, u_char);
274static int sched_interact_score(struct thread *);
275static void sched_interact_update(struct thread *);
276static void sched_interact_fork(struct thread *);
277static void sched_pctcpu_update(struct td_sched *);
278
279/* Operations on per processor queues */
280static struct td_sched * tdq_choose(struct tdq *);
281static void tdq_setup(struct tdq *);
282static void tdq_load_add(struct tdq *, struct td_sched *);
283static void tdq_load_rem(struct tdq *, struct td_sched *);
284static __inline void tdq_runq_add(struct tdq *, struct td_sched *, int);
285static __inline void tdq_runq_rem(struct tdq *, struct td_sched *);
286static void tdq_nice_add(struct tdq *, int);
287static void tdq_nice_rem(struct tdq *, int);
288void tdq_print(int cpu);
289#ifdef SMP
290static int tdq_transfer(struct tdq *, struct td_sched *, int);
291static struct td_sched *runq_steal(struct runq *);
292static void sched_balance(void);
293static void sched_balance_groups(void);
294static void sched_balance_group(struct tdq_group *);
295static void sched_balance_pair(struct tdq *, struct tdq *);
296static void tdq_move(struct tdq *, int);
297static int tdq_idled(struct tdq *);
298static void tdq_notify(struct td_sched *, int);
299static void tdq_assign(struct tdq *);
300static struct td_sched *tdq_steal(struct tdq *, int);
301#define	THREAD_CAN_MIGRATE(ts)						\
302    ((ts)->ts_thread->td_pinned == 0 && ((ts)->ts_flags & TSF_BOUND) == 0)
303#endif
304
305void
306tdq_print(int cpu)
307{
308	struct tdq *tdq;
309	int i;
310
311	tdq = TDQ_CPU(cpu);
312
313	printf("tdq:\n");
314	printf("\tload:           %d\n", tdq->ksq_load);
315	printf("\tload TIMESHARE: %d\n", tdq->ksq_load_timeshare);
316#ifdef SMP
317	printf("\tload transferable: %d\n", tdq->ksq_transferable);
318#endif
319	printf("\tnicemin:\t%d\n", tdq->ksq_nicemin);
320	printf("\tnice counts:\n");
321	for (i = 0; i < SCHED_PRI_NRESV; i++)
322		if (tdq->ksq_nice[i])
323			printf("\t\t%d = %d\n",
324			    i - SCHED_PRI_NHALF, tdq->ksq_nice[i]);
325}
326
327static __inline void
328tdq_runq_add(struct tdq *tdq, struct td_sched *ts, int flags)
329{
330#ifdef SMP
331	if (THREAD_CAN_MIGRATE(ts)) {
332		tdq->ksq_transferable++;
333		tdq->ksq_group->ksg_transferable++;
334		ts->ts_flags |= TSF_XFERABLE;
335	}
336#endif
337	if (ts->ts_flags & TSF_PREEMPTED)
338		flags |= SRQ_PREEMPTED;
339	runq_add(ts->ts_runq, ts, flags);
340}
341
342static __inline void
343tdq_runq_rem(struct tdq *tdq, struct td_sched *ts)
344{
345#ifdef SMP
346	if (ts->ts_flags & TSF_XFERABLE) {
347		tdq->ksq_transferable--;
348		tdq->ksq_group->ksg_transferable--;
349		ts->ts_flags &= ~TSF_XFERABLE;
350	}
351#endif
352	runq_remove(ts->ts_runq, ts);
353}
354
355static void
356tdq_load_add(struct tdq *tdq, struct td_sched *ts)
357{
358	int class;
359	mtx_assert(&sched_lock, MA_OWNED);
360	class = PRI_BASE(ts->ts_thread->td_pri_class);
361	if (class == PRI_TIMESHARE)
362		tdq->ksq_load_timeshare++;
363	tdq->ksq_load++;
364	CTR1(KTR_SCHED, "load: %d", tdq->ksq_load);
365	if (class != PRI_ITHD && (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0)
366#ifdef SMP
367		tdq->ksq_group->ksg_load++;
368#else
369		tdq->ksq_sysload++;
370#endif
371	if (ts->ts_thread->td_pri_class == PRI_TIMESHARE)
372		tdq_nice_add(tdq, ts->ts_thread->td_proc->p_nice);
373}
374
375static void
376tdq_load_rem(struct tdq *tdq, struct td_sched *ts)
377{
378	int class;
379	mtx_assert(&sched_lock, MA_OWNED);
380	class = PRI_BASE(ts->ts_thread->td_pri_class);
381	if (class == PRI_TIMESHARE)
382		tdq->ksq_load_timeshare--;
383	if (class != PRI_ITHD  && (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0)
384#ifdef SMP
385		tdq->ksq_group->ksg_load--;
386#else
387		tdq->ksq_sysload--;
388#endif
389	tdq->ksq_load--;
390	CTR1(KTR_SCHED, "load: %d", tdq->ksq_load);
391	ts->ts_runq = NULL;
392	if (ts->ts_thread->td_pri_class == PRI_TIMESHARE)
393		tdq_nice_rem(tdq, ts->ts_thread->td_proc->p_nice);
394}
395
396static void
397tdq_nice_add(struct tdq *tdq, int nice)
398{
399	mtx_assert(&sched_lock, MA_OWNED);
400	/* Normalize to zero. */
401	tdq->ksq_nice[nice + SCHED_PRI_NHALF]++;
402	if (nice < tdq->ksq_nicemin || tdq->ksq_load_timeshare == 1)
403		tdq->ksq_nicemin = nice;
404}
405
406static void
407tdq_nice_rem(struct tdq *tdq, int nice)
408{
409	int n;
410
411	mtx_assert(&sched_lock, MA_OWNED);
412	/* Normalize to zero. */
413	n = nice + SCHED_PRI_NHALF;
414	tdq->ksq_nice[n]--;
415	KASSERT(tdq->ksq_nice[n] >= 0, ("Negative nice count."));
416
417	/*
418	 * If this wasn't the smallest nice value or there are more in
419	 * this bucket we can just return.  Otherwise we have to recalculate
420	 * the smallest nice.
421	 */
422	if (nice != tdq->ksq_nicemin ||
423	    tdq->ksq_nice[n] != 0 ||
424	    tdq->ksq_load_timeshare == 0)
425		return;
426
427	for (; n < SCHED_PRI_NRESV; n++)
428		if (tdq->ksq_nice[n]) {
429			tdq->ksq_nicemin = n - SCHED_PRI_NHALF;
430			return;
431		}
432}
433
434#ifdef SMP
435/*
436 * sched_balance is a simple CPU load balancing algorithm.  It operates by
437 * finding the least loaded and most loaded cpu and equalizing their load
438 * by migrating some processes.
439 *
440 * Dealing only with two CPUs at a time has two advantages.  Firstly, most
441 * installations will only have 2 cpus.  Secondly, load balancing too much at
442 * once can have an unpleasant effect on the system.  The scheduler rarely has
443 * enough information to make perfect decisions.  So this algorithm chooses
444 * algorithm simplicity and more gradual effects on load in larger systems.
445 *
446 * It could be improved by considering the priorities and slices assigned to
447 * each task prior to balancing them.  There are many pathological cases with
448 * any approach and so the semi random algorithm below may work as well as any.
449 *
450 */
451static void
452sched_balance(void)
453{
454	struct tdq_group *high;
455	struct tdq_group *low;
456	struct tdq_group *ksg;
457	int cnt;
458	int i;
459
460	bal_tick = ticks + (random() % (hz * 2));
461	if (smp_started == 0)
462		return;
463	low = high = NULL;
464	i = random() % (ksg_maxid + 1);
465	for (cnt = 0; cnt <= ksg_maxid; cnt++) {
466		ksg = TDQ_GROUP(i);
467		/*
468		 * Find the CPU with the highest load that has some
469		 * threads to transfer.
470		 */
471		if ((high == NULL || ksg->ksg_load > high->ksg_load)
472		    && ksg->ksg_transferable)
473			high = ksg;
474		if (low == NULL || ksg->ksg_load < low->ksg_load)
475			low = ksg;
476		if (++i > ksg_maxid)
477			i = 0;
478	}
479	if (low != NULL && high != NULL && high != low)
480		sched_balance_pair(LIST_FIRST(&high->ksg_members),
481		    LIST_FIRST(&low->ksg_members));
482}
483
484static void
485sched_balance_groups(void)
486{
487	int i;
488
489	gbal_tick = ticks + (random() % (hz * 2));
490	mtx_assert(&sched_lock, MA_OWNED);
491	if (smp_started)
492		for (i = 0; i <= ksg_maxid; i++)
493			sched_balance_group(TDQ_GROUP(i));
494}
495
496static void
497sched_balance_group(struct tdq_group *ksg)
498{
499	struct tdq *tdq;
500	struct tdq *high;
501	struct tdq *low;
502	int load;
503
504	if (ksg->ksg_transferable == 0)
505		return;
506	low = NULL;
507	high = NULL;
508	LIST_FOREACH(tdq, &ksg->ksg_members, ksq_siblings) {
509		load = tdq->ksq_load;
510		if (high == NULL || load > high->ksq_load)
511			high = tdq;
512		if (low == NULL || load < low->ksq_load)
513			low = tdq;
514	}
515	if (high != NULL && low != NULL && high != low)
516		sched_balance_pair(high, low);
517}
518
519static void
520sched_balance_pair(struct tdq *high, struct tdq *low)
521{
522	int transferable;
523	int high_load;
524	int low_load;
525	int move;
526	int diff;
527	int i;
528
529	/*
530	 * If we're transfering within a group we have to use this specific
531	 * tdq's transferable count, otherwise we can steal from other members
532	 * of the group.
533	 */
534	if (high->ksq_group == low->ksq_group) {
535		transferable = high->ksq_transferable;
536		high_load = high->ksq_load;
537		low_load = low->ksq_load;
538	} else {
539		transferable = high->ksq_group->ksg_transferable;
540		high_load = high->ksq_group->ksg_load;
541		low_load = low->ksq_group->ksg_load;
542	}
543	if (transferable == 0)
544		return;
545	/*
546	 * Determine what the imbalance is and then adjust that to how many
547	 * kses we actually have to give up (transferable).
548	 */
549	diff = high_load - low_load;
550	move = diff / 2;
551	if (diff & 0x1)
552		move++;
553	move = min(move, transferable);
554	for (i = 0; i < move; i++)
555		tdq_move(high, TDQ_ID(low));
556	return;
557}
558
559static void
560tdq_move(struct tdq *from, int cpu)
561{
562	struct tdq *tdq;
563	struct tdq *to;
564	struct td_sched *ts;
565
566	tdq = from;
567	to = TDQ_CPU(cpu);
568	ts = tdq_steal(tdq, 1);
569	if (ts == NULL) {
570		struct tdq_group *ksg;
571
572		ksg = tdq->ksq_group;
573		LIST_FOREACH(tdq, &ksg->ksg_members, ksq_siblings) {
574			if (tdq == from || tdq->ksq_transferable == 0)
575				continue;
576			ts = tdq_steal(tdq, 1);
577			break;
578		}
579		if (ts == NULL)
580			panic("tdq_move: No threads available with a "
581			    "transferable count of %d\n",
582			    ksg->ksg_transferable);
583	}
584	if (tdq == to)
585		return;
586	ts->ts_state = TSS_THREAD;
587	tdq_runq_rem(tdq, ts);
588	tdq_load_rem(tdq, ts);
589	tdq_notify(ts, cpu);
590}
591
592static int
593tdq_idled(struct tdq *tdq)
594{
595	struct tdq_group *ksg;
596	struct tdq *steal;
597	struct td_sched *ts;
598
599	ksg = tdq->ksq_group;
600	/*
601	 * If we're in a cpu group, try and steal kses from another cpu in
602	 * the group before idling.
603	 */
604	if (ksg->ksg_cpus > 1 && ksg->ksg_transferable) {
605		LIST_FOREACH(steal, &ksg->ksg_members, ksq_siblings) {
606			if (steal == tdq || steal->ksq_transferable == 0)
607				continue;
608			ts = tdq_steal(steal, 0);
609			if (ts == NULL)
610				continue;
611			ts->ts_state = TSS_THREAD;
612			tdq_runq_rem(steal, ts);
613			tdq_load_rem(steal, ts);
614			ts->ts_cpu = PCPU_GET(cpuid);
615			ts->ts_flags |= TSF_INTERNAL | TSF_HOLD;
616			sched_add(ts->ts_thread, SRQ_YIELDING);
617			return (0);
618		}
619	}
620	/*
621	 * We only set the idled bit when all of the cpus in the group are
622	 * idle.  Otherwise we could get into a situation where a thread bounces
623	 * back and forth between two idle cores on seperate physical CPUs.
624	 */
625	ksg->ksg_idlemask |= PCPU_GET(cpumask);
626	if (ksg->ksg_idlemask != ksg->ksg_cpumask)
627		return (1);
628	atomic_set_int(&tdq_idle, ksg->ksg_mask);
629	return (1);
630}
631
632static void
633tdq_assign(struct tdq *tdq)
634{
635	struct td_sched *nts;
636	struct td_sched *ts;
637
638	do {
639		*(volatile struct td_sched **)&ts = tdq->ksq_assigned;
640	} while(!atomic_cmpset_ptr((volatile uintptr_t *)&tdq->ksq_assigned,
641		(uintptr_t)ts, (uintptr_t)NULL));
642	for (; ts != NULL; ts = nts) {
643		nts = ts->ts_assign;
644		tdq->ksq_group->ksg_load--;
645		tdq->ksq_load--;
646		ts->ts_flags &= ~TSF_ASSIGNED;
647		if (ts->ts_flags & TSF_REMOVED) {
648			ts->ts_flags &= ~TSF_REMOVED;
649			continue;
650		}
651		ts->ts_flags |= TSF_INTERNAL | TSF_HOLD;
652		sched_add(ts->ts_thread, SRQ_YIELDING);
653	}
654}
655
656static void
657tdq_notify(struct td_sched *ts, int cpu)
658{
659	struct tdq *tdq;
660	struct thread *td;
661	struct pcpu *pcpu;
662	int class;
663	int prio;
664
665	tdq = TDQ_CPU(cpu);
666	/* XXX */
667	class = PRI_BASE(ts->ts_thread->td_pri_class);
668	if ((class == PRI_TIMESHARE || class == PRI_REALTIME) &&
669	    (tdq_idle & tdq->ksq_group->ksg_mask))
670		atomic_clear_int(&tdq_idle, tdq->ksq_group->ksg_mask);
671	tdq->ksq_group->ksg_load++;
672	tdq->ksq_load++;
673	ts->ts_cpu = cpu;
674	ts->ts_flags |= TSF_ASSIGNED;
675	prio = ts->ts_thread->td_priority;
676
677	/*
678	 * Place a thread on another cpu's queue and force a resched.
679	 */
680	do {
681		*(volatile struct td_sched **)&ts->ts_assign = tdq->ksq_assigned;
682	} while(!atomic_cmpset_ptr((volatile uintptr_t *)&tdq->ksq_assigned,
683		(uintptr_t)ts->ts_assign, (uintptr_t)ts));
684	/*
685	 * Without sched_lock we could lose a race where we set NEEDRESCHED
686	 * on a thread that is switched out before the IPI is delivered.  This
687	 * would lead us to miss the resched.  This will be a problem once
688	 * sched_lock is pushed down.
689	 */
690	pcpu = pcpu_find(cpu);
691	td = pcpu->pc_curthread;
692	if (ts->ts_thread->td_priority < td->td_priority ||
693	    td == pcpu->pc_idlethread) {
694		td->td_flags |= TDF_NEEDRESCHED;
695		ipi_selected(1 << cpu, IPI_AST);
696	}
697}
698
699static struct td_sched *
700runq_steal(struct runq *rq)
701{
702	struct rqhead *rqh;
703	struct rqbits *rqb;
704	struct td_sched *ts;
705	int word;
706	int bit;
707
708	mtx_assert(&sched_lock, MA_OWNED);
709	rqb = &rq->rq_status;
710	for (word = 0; word < RQB_LEN; word++) {
711		if (rqb->rqb_bits[word] == 0)
712			continue;
713		for (bit = 0; bit < RQB_BPW; bit++) {
714			if ((rqb->rqb_bits[word] & (1ul << bit)) == 0)
715				continue;
716			rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)];
717			TAILQ_FOREACH(ts, rqh, ts_procq) {
718				if (THREAD_CAN_MIGRATE(ts))
719					return (ts);
720			}
721		}
722	}
723	return (NULL);
724}
725
726static struct td_sched *
727tdq_steal(struct tdq *tdq, int stealidle)
728{
729	struct td_sched *ts;
730
731	/*
732	 * Steal from next first to try to get a non-interactive task that
733	 * may not have run for a while.
734	 */
735	if ((ts = runq_steal(tdq->ksq_next)) != NULL)
736		return (ts);
737	if ((ts = runq_steal(tdq->ksq_curr)) != NULL)
738		return (ts);
739	if (stealidle)
740		return (runq_steal(&tdq->ksq_idle));
741	return (NULL);
742}
743
744int
745tdq_transfer(struct tdq *tdq, struct td_sched *ts, int class)
746{
747	struct tdq_group *nksg;
748	struct tdq_group *ksg;
749	struct tdq *old;
750	int cpu;
751	int idx;
752
753	if (smp_started == 0)
754		return (0);
755	cpu = 0;
756	/*
757	 * If our load exceeds a certain threshold we should attempt to
758	 * reassign this thread.  The first candidate is the cpu that
759	 * originally ran the thread.  If it is idle, assign it there,
760	 * otherwise, pick an idle cpu.
761	 *
762	 * The threshold at which we start to reassign kses has a large impact
763	 * on the overall performance of the system.  Tuned too high and
764	 * some CPUs may idle.  Too low and there will be excess migration
765	 * and context switches.
766	 */
767	old = TDQ_CPU(ts->ts_cpu);
768	nksg = old->ksq_group;
769	ksg = tdq->ksq_group;
770	if (tdq_idle) {
771		if (tdq_idle & nksg->ksg_mask) {
772			cpu = ffs(nksg->ksg_idlemask);
773			if (cpu) {
774				CTR2(KTR_SCHED,
775				    "tdq_transfer: %p found old cpu %X "
776				    "in idlemask.", ts, cpu);
777				goto migrate;
778			}
779		}
780		/*
781		 * Multiple cpus could find this bit simultaneously
782		 * but the race shouldn't be terrible.
783		 */
784		cpu = ffs(tdq_idle);
785		if (cpu) {
786			CTR2(KTR_SCHED, "tdq_transfer: %p found %X "
787			    "in idlemask.", ts, cpu);
788			goto migrate;
789		}
790	}
791	idx = 0;
792#if 0
793	if (old->ksq_load < tdq->ksq_load) {
794		cpu = ts->ts_cpu + 1;
795		CTR2(KTR_SCHED, "tdq_transfer: %p old cpu %X "
796		    "load less than ours.", ts, cpu);
797		goto migrate;
798	}
799	/*
800	 * No new CPU was found, look for one with less load.
801	 */
802	for (idx = 0; idx <= ksg_maxid; idx++) {
803		nksg = TDQ_GROUP(idx);
804		if (nksg->ksg_load /*+ (nksg->ksg_cpus  * 2)*/ < ksg->ksg_load) {
805			cpu = ffs(nksg->ksg_cpumask);
806			CTR2(KTR_SCHED, "tdq_transfer: %p cpu %X load less "
807			    "than ours.", ts, cpu);
808			goto migrate;
809		}
810	}
811#endif
812	/*
813	 * If another cpu in this group has idled, assign a thread over
814	 * to them after checking to see if there are idled groups.
815	 */
816	if (ksg->ksg_idlemask) {
817		cpu = ffs(ksg->ksg_idlemask);
818		if (cpu) {
819			CTR2(KTR_SCHED, "tdq_transfer: %p cpu %X idle in "
820			    "group.", ts, cpu);
821			goto migrate;
822		}
823	}
824	return (0);
825migrate:
826	/*
827	 * Now that we've found an idle CPU, migrate the thread.
828	 */
829	cpu--;
830	ts->ts_runq = NULL;
831	tdq_notify(ts, cpu);
832
833	return (1);
834}
835
836#endif	/* SMP */
837
838/*
839 * Pick the highest priority task we have and return it.
840 */
841
842static struct td_sched *
843tdq_choose(struct tdq *tdq)
844{
845	struct runq *swap;
846	struct td_sched *ts;
847	int nice;
848
849	mtx_assert(&sched_lock, MA_OWNED);
850	swap = NULL;
851
852	for (;;) {
853		ts = runq_choose(tdq->ksq_curr);
854		if (ts == NULL) {
855			/*
856			 * We already swapped once and didn't get anywhere.
857			 */
858			if (swap)
859				break;
860			swap = tdq->ksq_curr;
861			tdq->ksq_curr = tdq->ksq_next;
862			tdq->ksq_next = swap;
863			continue;
864		}
865		/*
866		 * If we encounter a slice of 0 the td_sched is in a
867		 * TIMESHARE td_sched group and its nice was too far out
868		 * of the range that receives slices.
869		 */
870		nice = ts->ts_thread->td_proc->p_nice + (0 - tdq->ksq_nicemin);
871#if 0
872		if (ts->ts_slice == 0 || (nice > SCHED_SLICE_NTHRESH &&
873		    ts->ts_thread->td_proc->p_nice != 0)) {
874			runq_remove(ts->ts_runq, ts);
875			sched_slice(ts);
876			ts->ts_runq = tdq->ksq_next;
877			runq_add(ts->ts_runq, ts, 0);
878			continue;
879		}
880#endif
881		return (ts);
882	}
883
884	return (runq_choose(&tdq->ksq_idle));
885}
886
887static void
888tdq_setup(struct tdq *tdq)
889{
890	runq_init(&tdq->ksq_timeshare[0]);
891	runq_init(&tdq->ksq_timeshare[1]);
892	runq_init(&tdq->ksq_idle);
893	tdq->ksq_curr = &tdq->ksq_timeshare[0];
894	tdq->ksq_next = &tdq->ksq_timeshare[1];
895	tdq->ksq_load = 0;
896	tdq->ksq_load_timeshare = 0;
897}
898
899static void
900sched_setup(void *dummy)
901{
902#ifdef SMP
903	int i;
904#endif
905
906	/*
907	 * To avoid divide-by-zero, we set realstathz a dummy value
908	 * in case which sched_clock() called before sched_initticks().
909	 */
910	realstathz = hz;
911	slice_min = (hz/100);	/* 10ms */
912	slice_max = (hz/7);	/* ~140ms */
913
914#ifdef SMP
915	balance_groups = 0;
916	/*
917	 * Initialize the tdqs.
918	 */
919	for (i = 0; i < MAXCPU; i++) {
920		struct tdq *ksq;
921
922		ksq = &tdq_cpu[i];
923		ksq->ksq_assigned = NULL;
924		tdq_setup(&tdq_cpu[i]);
925	}
926	if (smp_topology == NULL) {
927		struct tdq_group *ksg;
928		struct tdq *ksq;
929		int cpus;
930
931		for (cpus = 0, i = 0; i < MAXCPU; i++) {
932			if (CPU_ABSENT(i))
933				continue;
934			ksq = &tdq_cpu[i];
935			ksg = &tdq_groups[cpus];
936			/*
937			 * Setup a tdq group with one member.
938			 */
939			ksq->ksq_transferable = 0;
940			ksq->ksq_group = ksg;
941			ksg->ksg_cpus = 1;
942			ksg->ksg_idlemask = 0;
943			ksg->ksg_cpumask = ksg->ksg_mask = 1 << i;
944			ksg->ksg_load = 0;
945			ksg->ksg_transferable = 0;
946			LIST_INIT(&ksg->ksg_members);
947			LIST_INSERT_HEAD(&ksg->ksg_members, ksq, ksq_siblings);
948			cpus++;
949		}
950		ksg_maxid = cpus - 1;
951	} else {
952		struct tdq_group *ksg;
953		struct cpu_group *cg;
954		int j;
955
956		for (i = 0; i < smp_topology->ct_count; i++) {
957			cg = &smp_topology->ct_group[i];
958			ksg = &tdq_groups[i];
959			/*
960			 * Initialize the group.
961			 */
962			ksg->ksg_idlemask = 0;
963			ksg->ksg_load = 0;
964			ksg->ksg_transferable = 0;
965			ksg->ksg_cpus = cg->cg_count;
966			ksg->ksg_cpumask = cg->cg_mask;
967			LIST_INIT(&ksg->ksg_members);
968			/*
969			 * Find all of the group members and add them.
970			 */
971			for (j = 0; j < MAXCPU; j++) {
972				if ((cg->cg_mask & (1 << j)) != 0) {
973					if (ksg->ksg_mask == 0)
974						ksg->ksg_mask = 1 << j;
975					tdq_cpu[j].ksq_transferable = 0;
976					tdq_cpu[j].ksq_group = ksg;
977					LIST_INSERT_HEAD(&ksg->ksg_members,
978					    &tdq_cpu[j], ksq_siblings);
979				}
980			}
981			if (ksg->ksg_cpus > 1)
982				balance_groups = 1;
983		}
984		ksg_maxid = smp_topology->ct_count - 1;
985	}
986	/*
987	 * Stagger the group and global load balancer so they do not
988	 * interfere with each other.
989	 */
990	bal_tick = ticks + hz;
991	if (balance_groups)
992		gbal_tick = ticks + (hz / 2);
993#else
994	tdq_setup(TDQ_SELF());
995#endif
996	mtx_lock_spin(&sched_lock);
997	tdq_load_add(TDQ_SELF(), &td_sched0);
998	mtx_unlock_spin(&sched_lock);
999}
1000
1001/* ARGSUSED */
1002static void
1003sched_initticks(void *dummy)
1004{
1005	mtx_lock_spin(&sched_lock);
1006	realstathz = stathz ? stathz : hz;
1007	slice_min = (realstathz/100);	/* 10ms */
1008	slice_max = (realstathz/7);	/* ~140ms */
1009
1010	tickincr = (hz << 10) / realstathz;
1011	/*
1012	 * XXX This does not work for values of stathz that are much
1013	 * larger than hz.
1014	 */
1015	if (tickincr == 0)
1016		tickincr = 1;
1017	mtx_unlock_spin(&sched_lock);
1018}
1019
1020
1021/*
1022 * Scale the scheduling priority according to the "interactivity" of this
1023 * process.
1024 */
1025static void
1026sched_priority(struct thread *td)
1027{
1028	int pri;
1029
1030	if (td->td_pri_class != PRI_TIMESHARE)
1031		return;
1032
1033	pri = SCHED_PRI_INTERACT(sched_interact_score(td));
1034	pri += SCHED_PRI_BASE;
1035	pri += td->td_proc->p_nice;
1036
1037	if (pri > PRI_MAX_TIMESHARE)
1038		pri = PRI_MAX_TIMESHARE;
1039	else if (pri < PRI_MIN_TIMESHARE)
1040		pri = PRI_MIN_TIMESHARE;
1041
1042	sched_user_prio(td, pri);
1043
1044	return;
1045}
1046
1047/*
1048 * Calculate a time slice based on the properties of the process
1049 * and the runq that we're on.  This is only for PRI_TIMESHARE threads.
1050 */
1051static void
1052sched_slice(struct td_sched *ts)
1053{
1054	struct tdq *tdq;
1055	struct thread *td;
1056
1057	td = ts->ts_thread;
1058	tdq = TDQ_CPU(ts->ts_cpu);
1059
1060	if (td->td_flags & TDF_BORROWING) {
1061		ts->ts_slice = SCHED_SLICE_MIN;
1062		return;
1063	}
1064
1065	/*
1066	 * Rationale:
1067	 * Threads in interactive procs get a minimal slice so that we
1068	 * quickly notice if it abuses its advantage.
1069	 *
1070	 * Threads in non-interactive procs are assigned a slice that is
1071	 * based on the procs nice value relative to the least nice procs
1072	 * on the run queue for this cpu.
1073	 *
1074	 * If the thread is less nice than all others it gets the maximum
1075	 * slice and other threads will adjust their slice relative to
1076	 * this when they first expire.
1077	 *
1078	 * There is 20 point window that starts relative to the least
1079	 * nice td_sched on the run queue.  Slice size is determined by
1080	 * the td_sched distance from the last nice thread.
1081	 *
1082	 * If the td_sched is outside of the window it will get no slice
1083	 * and will be reevaluated each time it is selected on the
1084	 * run queue.  The exception to this is nice 0 procs when
1085	 * a nice -20 is running.  They are always granted a minimum
1086	 * slice.
1087	 */
1088	if (!SCHED_INTERACTIVE(td)) {
1089		int nice;
1090
1091		nice = td->td_proc->p_nice + (0 - tdq->ksq_nicemin);
1092		if (tdq->ksq_load_timeshare == 0 ||
1093		    td->td_proc->p_nice < tdq->ksq_nicemin)
1094			ts->ts_slice = SCHED_SLICE_MAX;
1095		else if (nice <= SCHED_SLICE_NTHRESH)
1096			ts->ts_slice = SCHED_SLICE_NICE(nice);
1097		else if (td->td_proc->p_nice == 0)
1098			ts->ts_slice = SCHED_SLICE_MIN;
1099		else
1100			ts->ts_slice = SCHED_SLICE_MIN; /* 0 */
1101	} else
1102		ts->ts_slice = SCHED_SLICE_INTERACTIVE;
1103
1104	return;
1105}
1106
1107/*
1108 * This routine enforces a maximum limit on the amount of scheduling history
1109 * kept.  It is called after either the slptime or runtime is adjusted.
1110 * This routine will not operate correctly when slp or run times have been
1111 * adjusted to more than double their maximum.
1112 */
1113static void
1114sched_interact_update(struct thread *td)
1115{
1116	int sum;
1117
1118	sum = td->td_sched->skg_runtime + td->td_sched->skg_slptime;
1119	if (sum < SCHED_SLP_RUN_MAX)
1120		return;
1121	/*
1122	 * If we have exceeded by more than 1/5th then the algorithm below
1123	 * will not bring us back into range.  Dividing by two here forces
1124	 * us into the range of [4/5 * SCHED_INTERACT_MAX, SCHED_INTERACT_MAX]
1125	 */
1126	if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) {
1127		td->td_sched->skg_runtime /= 2;
1128		td->td_sched->skg_slptime /= 2;
1129		return;
1130	}
1131	td->td_sched->skg_runtime = (td->td_sched->skg_runtime / 5) * 4;
1132	td->td_sched->skg_slptime = (td->td_sched->skg_slptime / 5) * 4;
1133}
1134
1135static void
1136sched_interact_fork(struct thread *td)
1137{
1138	int ratio;
1139	int sum;
1140
1141	sum = td->td_sched->skg_runtime + td->td_sched->skg_slptime;
1142	if (sum > SCHED_SLP_RUN_FORK) {
1143		ratio = sum / SCHED_SLP_RUN_FORK;
1144		td->td_sched->skg_runtime /= ratio;
1145		td->td_sched->skg_slptime /= ratio;
1146	}
1147}
1148
1149static int
1150sched_interact_score(struct thread *td)
1151{
1152	int div;
1153
1154	if (td->td_sched->skg_runtime > td->td_sched->skg_slptime) {
1155		div = max(1, td->td_sched->skg_runtime / SCHED_INTERACT_HALF);
1156		return (SCHED_INTERACT_HALF +
1157		    (SCHED_INTERACT_HALF - (td->td_sched->skg_slptime / div)));
1158	} if (td->td_sched->skg_slptime > td->td_sched->skg_runtime) {
1159		div = max(1, td->td_sched->skg_slptime / SCHED_INTERACT_HALF);
1160		return (td->td_sched->skg_runtime / div);
1161	}
1162
1163	/*
1164	 * This can happen if slptime and runtime are 0.
1165	 */
1166	return (0);
1167
1168}
1169
1170/*
1171 * Very early in the boot some setup of scheduler-specific
1172 * parts of proc0 and of soem scheduler resources needs to be done.
1173 * Called from:
1174 *  proc0_init()
1175 */
1176void
1177schedinit(void)
1178{
1179	/*
1180	 * Set up the scheduler specific parts of proc0.
1181	 */
1182	proc0.p_sched = NULL; /* XXX */
1183	thread0.td_sched = &td_sched0;
1184	td_sched0.ts_thread = &thread0;
1185	td_sched0.ts_state = TSS_THREAD;
1186}
1187
1188/*
1189 * This is only somewhat accurate since given many processes of the same
1190 * priority they will switch when their slices run out, which will be
1191 * at most SCHED_SLICE_MAX.
1192 */
1193int
1194sched_rr_interval(void)
1195{
1196	return (SCHED_SLICE_MAX);
1197}
1198
1199static void
1200sched_pctcpu_update(struct td_sched *ts)
1201{
1202	/*
1203	 * Adjust counters and watermark for pctcpu calc.
1204	 */
1205	if (ts->ts_ltick > ticks - SCHED_CPU_TICKS) {
1206		/*
1207		 * Shift the tick count out so that the divide doesn't
1208		 * round away our results.
1209		 */
1210		ts->ts_ticks <<= 10;
1211		ts->ts_ticks = (ts->ts_ticks / (ticks - ts->ts_ftick)) *
1212			    SCHED_CPU_TICKS;
1213		ts->ts_ticks >>= 10;
1214	} else
1215		ts->ts_ticks = 0;
1216	ts->ts_ltick = ticks;
1217	ts->ts_ftick = ts->ts_ltick - SCHED_CPU_TICKS;
1218}
1219
1220void
1221sched_thread_priority(struct thread *td, u_char prio)
1222{
1223	struct td_sched *ts;
1224
1225	CTR6(KTR_SCHED, "sched_prio: %p(%s) prio %d newprio %d by %p(%s)",
1226	    td, td->td_proc->p_comm, td->td_priority, prio, curthread,
1227	    curthread->td_proc->p_comm);
1228	ts = td->td_sched;
1229	mtx_assert(&sched_lock, MA_OWNED);
1230	if (td->td_priority == prio)
1231		return;
1232	if (TD_ON_RUNQ(td)) {
1233		/*
1234		 * If the priority has been elevated due to priority
1235		 * propagation, we may have to move ourselves to a new
1236		 * queue.  We still call adjustrunqueue below in case kse
1237		 * needs to fix things up.
1238		 */
1239		if (prio < td->td_priority && ts->ts_runq != NULL &&
1240		    (ts->ts_flags & TSF_ASSIGNED) == 0 &&
1241		    ts->ts_runq != TDQ_CPU(ts->ts_cpu)->ksq_curr) {
1242			runq_remove(ts->ts_runq, ts);
1243			ts->ts_runq = TDQ_CPU(ts->ts_cpu)->ksq_curr;
1244			runq_add(ts->ts_runq, ts, 0);
1245		}
1246		/*
1247		 * Hold this td_sched on this cpu so that sched_prio() doesn't
1248		 * cause excessive migration.  We only want migration to
1249		 * happen as the result of a wakeup.
1250		 */
1251		ts->ts_flags |= TSF_HOLD;
1252		adjustrunqueue(td, prio);
1253		ts->ts_flags &= ~TSF_HOLD;
1254	} else
1255		td->td_priority = prio;
1256}
1257
1258/*
1259 * Update a thread's priority when it is lent another thread's
1260 * priority.
1261 */
1262void
1263sched_lend_prio(struct thread *td, u_char prio)
1264{
1265
1266	td->td_flags |= TDF_BORROWING;
1267	sched_thread_priority(td, prio);
1268}
1269
1270/*
1271 * Restore a thread's priority when priority propagation is
1272 * over.  The prio argument is the minimum priority the thread
1273 * needs to have to satisfy other possible priority lending
1274 * requests.  If the thread's regular priority is less
1275 * important than prio, the thread will keep a priority boost
1276 * of prio.
1277 */
1278void
1279sched_unlend_prio(struct thread *td, u_char prio)
1280{
1281	u_char base_pri;
1282
1283	if (td->td_base_pri >= PRI_MIN_TIMESHARE &&
1284	    td->td_base_pri <= PRI_MAX_TIMESHARE)
1285		base_pri = td->td_user_pri;
1286	else
1287		base_pri = td->td_base_pri;
1288	if (prio >= base_pri) {
1289		td->td_flags &= ~TDF_BORROWING;
1290		sched_thread_priority(td, base_pri);
1291	} else
1292		sched_lend_prio(td, prio);
1293}
1294
1295void
1296sched_prio(struct thread *td, u_char prio)
1297{
1298	u_char oldprio;
1299
1300	/* First, update the base priority. */
1301	td->td_base_pri = prio;
1302
1303	/*
1304	 * If the thread is borrowing another thread's priority, don't
1305	 * ever lower the priority.
1306	 */
1307	if (td->td_flags & TDF_BORROWING && td->td_priority < prio)
1308		return;
1309
1310	/* Change the real priority. */
1311	oldprio = td->td_priority;
1312	sched_thread_priority(td, prio);
1313
1314	/*
1315	 * If the thread is on a turnstile, then let the turnstile update
1316	 * its state.
1317	 */
1318	if (TD_ON_LOCK(td) && oldprio != prio)
1319		turnstile_adjust(td, oldprio);
1320}
1321
1322void
1323sched_user_prio(struct thread *td, u_char prio)
1324{
1325	u_char oldprio;
1326
1327	td->td_base_user_pri = prio;
1328	if (td->td_flags & TDF_UBORROWING && td->td_user_pri <= prio)
1329                return;
1330	oldprio = td->td_user_pri;
1331	td->td_user_pri = prio;
1332
1333	if (TD_ON_UPILOCK(td) && oldprio != prio)
1334		umtx_pi_adjust(td, oldprio);
1335}
1336
1337void
1338sched_lend_user_prio(struct thread *td, u_char prio)
1339{
1340	u_char oldprio;
1341
1342	td->td_flags |= TDF_UBORROWING;
1343
1344	oldprio = td->td_user_pri;
1345	td->td_user_pri = prio;
1346
1347	if (TD_ON_UPILOCK(td) && oldprio != prio)
1348		umtx_pi_adjust(td, oldprio);
1349}
1350
1351void
1352sched_unlend_user_prio(struct thread *td, u_char prio)
1353{
1354	u_char base_pri;
1355
1356	base_pri = td->td_base_user_pri;
1357	if (prio >= base_pri) {
1358		td->td_flags &= ~TDF_UBORROWING;
1359		sched_user_prio(td, base_pri);
1360	} else
1361		sched_lend_user_prio(td, prio);
1362}
1363
1364void
1365sched_switch(struct thread *td, struct thread *newtd, int flags)
1366{
1367	struct tdq *ksq;
1368	struct td_sched *ts;
1369
1370	mtx_assert(&sched_lock, MA_OWNED);
1371
1372	ts = td->td_sched;
1373	ksq = TDQ_SELF();
1374
1375	td->td_lastcpu = td->td_oncpu;
1376	td->td_oncpu = NOCPU;
1377	td->td_flags &= ~TDF_NEEDRESCHED;
1378	td->td_owepreempt = 0;
1379
1380	/*
1381	 * If the thread has been assigned it may be in the process of switching
1382	 * to the new cpu.  This is the case in sched_bind().
1383	 */
1384	if (td == PCPU_GET(idlethread)) {
1385		TD_SET_CAN_RUN(td);
1386	} else if ((ts->ts_flags & TSF_ASSIGNED) == 0) {
1387		/* We are ending our run so make our slot available again */
1388		tdq_load_rem(ksq, ts);
1389		if (TD_IS_RUNNING(td)) {
1390			/*
1391			 * Don't allow the thread to migrate
1392			 * from a preemption.
1393			 */
1394			ts->ts_flags |= TSF_HOLD;
1395			setrunqueue(td, (flags & SW_PREEMPT) ?
1396			    SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
1397			    SRQ_OURSELF|SRQ_YIELDING);
1398			ts->ts_flags &= ~TSF_HOLD;
1399		}
1400	}
1401	if (newtd != NULL) {
1402		/*
1403		 * If we bring in a thread account for it as if it had been
1404		 * added to the run queue and then chosen.
1405		 */
1406		newtd->td_sched->ts_flags |= TSF_DIDRUN;
1407		newtd->td_sched->ts_runq = ksq->ksq_curr;
1408		TD_SET_RUNNING(newtd);
1409		tdq_load_add(TDQ_SELF(), newtd->td_sched);
1410	} else
1411		newtd = choosethread();
1412	if (td != newtd) {
1413#ifdef	HWPMC_HOOKS
1414		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
1415			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
1416#endif
1417
1418		cpu_switch(td, newtd);
1419#ifdef	HWPMC_HOOKS
1420		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
1421			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
1422#endif
1423	}
1424
1425	sched_lock.mtx_lock = (uintptr_t)td;
1426
1427	td->td_oncpu = PCPU_GET(cpuid);
1428}
1429
1430void
1431sched_nice(struct proc *p, int nice)
1432{
1433	struct td_sched *ts;
1434	struct thread *td;
1435	struct tdq *tdq;
1436
1437	PROC_LOCK_ASSERT(p, MA_OWNED);
1438	mtx_assert(&sched_lock, MA_OWNED);
1439	/*
1440	 * We need to adjust the nice counts for running threads.
1441	 */
1442	FOREACH_THREAD_IN_PROC(p, td) {
1443		if (td->td_pri_class == PRI_TIMESHARE) {
1444			ts = td->td_sched;
1445			if (ts->ts_runq == NULL)
1446				continue;
1447			tdq = TDQ_CPU(ts->ts_cpu);
1448			tdq_nice_rem(tdq, p->p_nice);
1449			tdq_nice_add(tdq, nice);
1450		}
1451	}
1452	p->p_nice = nice;
1453	FOREACH_THREAD_IN_PROC(p, td) {
1454		sched_priority(td);
1455		td->td_flags |= TDF_NEEDRESCHED;
1456	}
1457}
1458
1459void
1460sched_sleep(struct thread *td)
1461{
1462	mtx_assert(&sched_lock, MA_OWNED);
1463
1464	td->td_sched->ts_slptime = ticks;
1465}
1466
1467void
1468sched_wakeup(struct thread *td)
1469{
1470	mtx_assert(&sched_lock, MA_OWNED);
1471
1472	/*
1473	 * Let the procs know how long we slept for.  This is because process
1474	 * interactivity behavior is modeled in the procs.
1475	 */
1476	if (td->td_sched->ts_slptime) {
1477		int hzticks;
1478
1479		hzticks = (ticks - td->td_sched->ts_slptime) << 10;
1480		if (hzticks >= SCHED_SLP_RUN_MAX) {
1481			td->td_sched->skg_slptime = SCHED_SLP_RUN_MAX;
1482			td->td_sched->skg_runtime = 1;
1483		} else {
1484			td->td_sched->skg_slptime += hzticks;
1485			sched_interact_update(td);
1486		}
1487		sched_priority(td);
1488		sched_slice(td->td_sched);
1489		td->td_sched->ts_slptime = 0;
1490	}
1491	setrunqueue(td, SRQ_BORING);
1492}
1493
1494/*
1495 * Penalize the parent for creating a new child and initialize the child's
1496 * priority.
1497 */
1498void
1499sched_fork(struct thread *td, struct thread *child)
1500{
1501	mtx_assert(&sched_lock, MA_OWNED);
1502	sched_fork_thread(td, child);
1503}
1504
1505void
1506sched_fork_thread(struct thread *td, struct thread *child)
1507{
1508	struct td_sched *ts;
1509	struct td_sched *ts2;
1510
1511	child->td_sched->skg_slptime = td->td_sched->skg_slptime;
1512	child->td_sched->skg_runtime = td->td_sched->skg_runtime;
1513	child->td_user_pri = td->td_user_pri;
1514	child->td_base_user_pri = td->td_base_user_pri;
1515	sched_interact_fork(child);
1516	td->td_sched->skg_runtime += tickincr;
1517	sched_interact_update(td);
1518
1519	sched_newthread(child);
1520
1521	ts = td->td_sched;
1522	ts2 = child->td_sched;
1523	ts2->ts_slice = 1;	/* Attempt to quickly learn interactivity. */
1524	ts2->ts_cpu = ts->ts_cpu;
1525	ts2->ts_runq = NULL;
1526
1527	/* Grab our parents cpu estimation information. */
1528	ts2->ts_ticks = ts->ts_ticks;
1529	ts2->ts_ltick = ts->ts_ltick;
1530	ts2->ts_ftick = ts->ts_ftick;
1531}
1532
1533void
1534sched_class(struct thread *td, int class)
1535{
1536	struct tdq *tdq;
1537	struct td_sched *ts;
1538	int nclass;
1539	int oclass;
1540
1541	mtx_assert(&sched_lock, MA_OWNED);
1542	if (td->td_pri_class == class)
1543		return;
1544
1545	nclass = PRI_BASE(class);
1546	oclass = PRI_BASE(td->td_pri_class);
1547	ts = td->td_sched;
1548	if (!((ts->ts_state != TSS_ONRUNQ &&
1549	    ts->ts_state != TSS_THREAD) || ts->ts_runq == NULL)) {
1550		tdq = TDQ_CPU(ts->ts_cpu);
1551
1552#ifdef SMP
1553		/*
1554		 * On SMP if we're on the RUNQ we must adjust the transferable
1555		 * count because could be changing to or from an interrupt
1556		 * class.
1557		 */
1558		if (ts->ts_state == TSS_ONRUNQ) {
1559			if (THREAD_CAN_MIGRATE(ts)) {
1560				tdq->ksq_transferable--;
1561				tdq->ksq_group->ksg_transferable--;
1562			}
1563			if (THREAD_CAN_MIGRATE(ts)) {
1564				tdq->ksq_transferable++;
1565				tdq->ksq_group->ksg_transferable++;
1566			}
1567		}
1568#endif
1569		if (oclass == PRI_TIMESHARE) {
1570			tdq->ksq_load_timeshare--;
1571			tdq_nice_rem(tdq, td->td_proc->p_nice);
1572		}
1573		if (nclass == PRI_TIMESHARE) {
1574			tdq->ksq_load_timeshare++;
1575			tdq_nice_add(tdq, td->td_proc->p_nice);
1576		}
1577	}
1578
1579	td->td_pri_class = class;
1580}
1581
1582/*
1583 * Return some of the child's priority and interactivity to the parent.
1584 */
1585void
1586sched_exit(struct proc *p, struct thread *child)
1587{
1588
1589	CTR3(KTR_SCHED, "sched_exit: %p(%s) prio %d",
1590	    child, child->td_proc->p_comm, child->td_priority);
1591
1592	sched_exit_thread(FIRST_THREAD_IN_PROC(p), child);
1593}
1594
1595void
1596sched_exit_thread(struct thread *td, struct thread *child)
1597{
1598	CTR3(KTR_SCHED, "sched_exit_thread: %p(%s) prio %d",
1599	    child, childproc->p_comm, child->td_priority);
1600
1601	td->td_sched->skg_runtime += child->td_sched->skg_runtime;
1602	sched_interact_update(td);
1603	tdq_load_rem(TDQ_CPU(child->td_sched->ts_cpu), child->td_sched);
1604}
1605
1606void
1607sched_userret(struct thread *td)
1608{
1609	/*
1610	 * XXX we cheat slightly on the locking here to avoid locking in
1611	 * the usual case.  Setting td_priority here is essentially an
1612	 * incomplete workaround for not setting it properly elsewhere.
1613	 * Now that some interrupt handlers are threads, not setting it
1614	 * properly elsewhere can clobber it in the window between setting
1615	 * it here and returning to user mode, so don't waste time setting
1616	 * it perfectly here.
1617	 */
1618	KASSERT((td->td_flags & TDF_BORROWING) == 0,
1619	    ("thread with borrowed priority returning to userland"));
1620	if (td->td_priority != td->td_user_pri) {
1621		mtx_lock_spin(&sched_lock);
1622		td->td_priority = td->td_user_pri;
1623		td->td_base_pri = td->td_user_pri;
1624		mtx_unlock_spin(&sched_lock);
1625        }
1626}
1627
1628void
1629sched_clock(struct thread *td)
1630{
1631	struct tdq *tdq;
1632	struct td_sched *ts;
1633
1634	mtx_assert(&sched_lock, MA_OWNED);
1635	tdq = TDQ_SELF();
1636#ifdef SMP
1637	if (ticks >= bal_tick)
1638		sched_balance();
1639	if (ticks >= gbal_tick && balance_groups)
1640		sched_balance_groups();
1641	/*
1642	 * We could have been assigned a non real-time thread without an
1643	 * IPI.
1644	 */
1645	if (tdq->ksq_assigned)
1646		tdq_assign(tdq);	/* Potentially sets NEEDRESCHED */
1647#endif
1648	ts = td->td_sched;
1649
1650	/* Adjust ticks for pctcpu */
1651	ts->ts_ticks++;
1652	ts->ts_ltick = ticks;
1653
1654	/* Go up to one second beyond our max and then trim back down */
1655	if (ts->ts_ftick + SCHED_CPU_TICKS + hz < ts->ts_ltick)
1656		sched_pctcpu_update(ts);
1657
1658	if (td->td_flags & TDF_IDLETD)
1659		return;
1660	/*
1661	 * We only do slicing code for TIMESHARE threads.
1662	 */
1663	if (td->td_pri_class != PRI_TIMESHARE)
1664		return;
1665	/*
1666	 * We used a tick charge it to the thread so that we can compute our
1667	 * interactivity.
1668	 */
1669	td->td_sched->skg_runtime += tickincr;
1670	sched_interact_update(td);
1671
1672	/*
1673	 * We used up one time slice.
1674	 */
1675	if (--ts->ts_slice > 0)
1676		return;
1677	/*
1678	 * We're out of time, recompute priorities and requeue.
1679	 */
1680	tdq_load_rem(tdq, ts);
1681	sched_priority(td);
1682	sched_slice(ts);
1683	if (SCHED_CURR(td, ts))
1684		ts->ts_runq = tdq->ksq_curr;
1685	else
1686		ts->ts_runq = tdq->ksq_next;
1687	tdq_load_add(tdq, ts);
1688	td->td_flags |= TDF_NEEDRESCHED;
1689}
1690
1691int
1692sched_runnable(void)
1693{
1694	struct tdq *tdq;
1695	int load;
1696
1697	load = 1;
1698
1699	tdq = TDQ_SELF();
1700#ifdef SMP
1701	if (tdq->ksq_assigned) {
1702		mtx_lock_spin(&sched_lock);
1703		tdq_assign(tdq);
1704		mtx_unlock_spin(&sched_lock);
1705	}
1706#endif
1707	if ((curthread->td_flags & TDF_IDLETD) != 0) {
1708		if (tdq->ksq_load > 0)
1709			goto out;
1710	} else
1711		if (tdq->ksq_load - 1 > 0)
1712			goto out;
1713	load = 0;
1714out:
1715	return (load);
1716}
1717
1718struct td_sched *
1719sched_choose(void)
1720{
1721	struct tdq *tdq;
1722	struct td_sched *ts;
1723
1724	mtx_assert(&sched_lock, MA_OWNED);
1725	tdq = TDQ_SELF();
1726#ifdef SMP
1727restart:
1728	if (tdq->ksq_assigned)
1729		tdq_assign(tdq);
1730#endif
1731	ts = tdq_choose(tdq);
1732	if (ts) {
1733#ifdef SMP
1734		if (ts->ts_thread->td_pri_class == PRI_IDLE)
1735			if (tdq_idled(tdq) == 0)
1736				goto restart;
1737#endif
1738		tdq_runq_rem(tdq, ts);
1739		ts->ts_state = TSS_THREAD;
1740		ts->ts_flags &= ~TSF_PREEMPTED;
1741		return (ts);
1742	}
1743#ifdef SMP
1744	if (tdq_idled(tdq) == 0)
1745		goto restart;
1746#endif
1747	return (NULL);
1748}
1749
1750void
1751sched_add(struct thread *td, int flags)
1752{
1753	struct tdq *tdq;
1754	struct td_sched *ts;
1755	int preemptive;
1756	int canmigrate;
1757	int class;
1758
1759	CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)",
1760	    td, td->td_proc->p_comm, td->td_priority, curthread,
1761	    curthread->td_proc->p_comm);
1762	mtx_assert(&sched_lock, MA_OWNED);
1763	ts = td->td_sched;
1764	canmigrate = 1;
1765	preemptive = !(flags & SRQ_YIELDING);
1766	class = PRI_BASE(td->td_pri_class);
1767	tdq = TDQ_SELF();
1768	ts->ts_flags &= ~TSF_INTERNAL;
1769#ifdef SMP
1770	if (ts->ts_flags & TSF_ASSIGNED) {
1771		if (ts->ts_flags & TSF_REMOVED)
1772			ts->ts_flags &= ~TSF_REMOVED;
1773		return;
1774	}
1775	canmigrate = THREAD_CAN_MIGRATE(ts);
1776	/*
1777	 * Don't migrate running threads here.  Force the long term balancer
1778	 * to do it.
1779	 */
1780	if (ts->ts_flags & TSF_HOLD) {
1781		ts->ts_flags &= ~TSF_HOLD;
1782		canmigrate = 0;
1783	}
1784#endif
1785	KASSERT(ts->ts_state != TSS_ONRUNQ,
1786	    ("sched_add: thread %p (%s) already in run queue", td,
1787	    td->td_proc->p_comm));
1788	KASSERT(td->td_proc->p_sflag & PS_INMEM,
1789	    ("sched_add: process swapped out"));
1790	KASSERT(ts->ts_runq == NULL,
1791	    ("sched_add: thread %p is still assigned to a run queue", td));
1792	if (flags & SRQ_PREEMPTED)
1793		ts->ts_flags |= TSF_PREEMPTED;
1794	switch (class) {
1795	case PRI_ITHD:
1796	case PRI_REALTIME:
1797		ts->ts_runq = tdq->ksq_curr;
1798		ts->ts_slice = SCHED_SLICE_MAX;
1799		if (canmigrate)
1800			ts->ts_cpu = PCPU_GET(cpuid);
1801		break;
1802	case PRI_TIMESHARE:
1803		if (SCHED_CURR(td, ts))
1804			ts->ts_runq = tdq->ksq_curr;
1805		else
1806			ts->ts_runq = tdq->ksq_next;
1807		break;
1808	case PRI_IDLE:
1809		/*
1810		 * This is for priority prop.
1811		 */
1812		if (ts->ts_thread->td_priority < PRI_MIN_IDLE)
1813			ts->ts_runq = tdq->ksq_curr;
1814		else
1815			ts->ts_runq = &tdq->ksq_idle;
1816		ts->ts_slice = SCHED_SLICE_MIN;
1817		break;
1818	default:
1819		panic("Unknown pri class.");
1820		break;
1821	}
1822#ifdef SMP
1823	/*
1824	 * If this thread is pinned or bound, notify the target cpu.
1825	 */
1826	if (!canmigrate && ts->ts_cpu != PCPU_GET(cpuid) ) {
1827		ts->ts_runq = NULL;
1828		tdq_notify(ts, ts->ts_cpu);
1829		return;
1830	}
1831	/*
1832	 * If we had been idle, clear our bit in the group and potentially
1833	 * the global bitmap.  If not, see if we should transfer this thread.
1834	 */
1835	if ((class == PRI_TIMESHARE || class == PRI_REALTIME) &&
1836	    (tdq->ksq_group->ksg_idlemask & PCPU_GET(cpumask)) != 0) {
1837		/*
1838		 * Check to see if our group is unidling, and if so, remove it
1839		 * from the global idle mask.
1840		 */
1841		if (tdq->ksq_group->ksg_idlemask ==
1842		    tdq->ksq_group->ksg_cpumask)
1843			atomic_clear_int(&tdq_idle, tdq->ksq_group->ksg_mask);
1844		/*
1845		 * Now remove ourselves from the group specific idle mask.
1846		 */
1847		tdq->ksq_group->ksg_idlemask &= ~PCPU_GET(cpumask);
1848	} else if (canmigrate && tdq->ksq_load > 1 && class != PRI_ITHD)
1849		if (tdq_transfer(tdq, ts, class))
1850			return;
1851	ts->ts_cpu = PCPU_GET(cpuid);
1852#endif
1853	if (td->td_priority < curthread->td_priority &&
1854	    ts->ts_runq == tdq->ksq_curr)
1855		curthread->td_flags |= TDF_NEEDRESCHED;
1856	if (preemptive && maybe_preempt(td))
1857		return;
1858	ts->ts_state = TSS_ONRUNQ;
1859
1860	tdq_runq_add(tdq, ts, flags);
1861	tdq_load_add(tdq, ts);
1862}
1863
1864void
1865sched_rem(struct thread *td)
1866{
1867	struct tdq *tdq;
1868	struct td_sched *ts;
1869
1870	CTR5(KTR_SCHED, "sched_rem: %p(%s) prio %d by %p(%s)",
1871	    td, td->td_proc->p_comm, td->td_priority, curthread,
1872	    curthread->td_proc->p_comm);
1873	mtx_assert(&sched_lock, MA_OWNED);
1874	ts = td->td_sched;
1875	ts->ts_flags &= ~TSF_PREEMPTED;
1876	if (ts->ts_flags & TSF_ASSIGNED) {
1877		ts->ts_flags |= TSF_REMOVED;
1878		return;
1879	}
1880	KASSERT((ts->ts_state == TSS_ONRUNQ),
1881	    ("sched_rem: thread not on run queue"));
1882
1883	ts->ts_state = TSS_THREAD;
1884	tdq = TDQ_CPU(ts->ts_cpu);
1885	tdq_runq_rem(tdq, ts);
1886	tdq_load_rem(tdq, ts);
1887}
1888
1889fixpt_t
1890sched_pctcpu(struct thread *td)
1891{
1892	fixpt_t pctcpu;
1893	struct td_sched *ts;
1894
1895	pctcpu = 0;
1896	ts = td->td_sched;
1897	if (ts == NULL)
1898		return (0);
1899
1900	mtx_lock_spin(&sched_lock);
1901	if (ts->ts_ticks) {
1902		int rtick;
1903
1904		/*
1905		 * Don't update more frequently than twice a second.  Allowing
1906		 * this causes the cpu usage to decay away too quickly due to
1907		 * rounding errors.
1908		 */
1909		if (ts->ts_ftick + SCHED_CPU_TICKS < ts->ts_ltick ||
1910		    ts->ts_ltick < (ticks - (hz / 2)))
1911			sched_pctcpu_update(ts);
1912		/* How many rtick per second ? */
1913		rtick = min(ts->ts_ticks / SCHED_CPU_TIME, SCHED_CPU_TICKS);
1914		pctcpu = (FSCALE * ((FSCALE * rtick)/realstathz)) >> FSHIFT;
1915	}
1916
1917	td->td_proc->p_swtime = ts->ts_ltick - ts->ts_ftick;
1918	mtx_unlock_spin(&sched_lock);
1919
1920	return (pctcpu);
1921}
1922
1923void
1924sched_bind(struct thread *td, int cpu)
1925{
1926	struct td_sched *ts;
1927
1928	mtx_assert(&sched_lock, MA_OWNED);
1929	ts = td->td_sched;
1930	ts->ts_flags |= TSF_BOUND;
1931#ifdef SMP
1932	if (PCPU_GET(cpuid) == cpu)
1933		return;
1934	/* sched_rem without the runq_remove */
1935	ts->ts_state = TSS_THREAD;
1936	tdq_load_rem(TDQ_CPU(ts->ts_cpu), ts);
1937	tdq_notify(ts, cpu);
1938	/* When we return from mi_switch we'll be on the correct cpu. */
1939	mi_switch(SW_VOL, NULL);
1940#endif
1941}
1942
1943void
1944sched_unbind(struct thread *td)
1945{
1946	mtx_assert(&sched_lock, MA_OWNED);
1947	td->td_sched->ts_flags &= ~TSF_BOUND;
1948}
1949
1950int
1951sched_is_bound(struct thread *td)
1952{
1953	mtx_assert(&sched_lock, MA_OWNED);
1954	return (td->td_sched->ts_flags & TSF_BOUND);
1955}
1956
1957void
1958sched_relinquish(struct thread *td)
1959{
1960	mtx_lock_spin(&sched_lock);
1961	if (td->td_pri_class == PRI_TIMESHARE)
1962		sched_prio(td, PRI_MAX_TIMESHARE);
1963	mi_switch(SW_VOL, NULL);
1964	mtx_unlock_spin(&sched_lock);
1965}
1966
1967int
1968sched_load(void)
1969{
1970#ifdef SMP
1971	int total;
1972	int i;
1973
1974	total = 0;
1975	for (i = 0; i <= ksg_maxid; i++)
1976		total += TDQ_GROUP(i)->ksg_load;
1977	return (total);
1978#else
1979	return (TDQ_SELF()->ksq_sysload);
1980#endif
1981}
1982
1983int
1984sched_sizeof_proc(void)
1985{
1986	return (sizeof(struct proc));
1987}
1988
1989int
1990sched_sizeof_thread(void)
1991{
1992	return (sizeof(struct thread) + sizeof(struct td_sched));
1993}
1994
1995void
1996sched_tick(void)
1997{
1998}
1999#define KERN_SWITCH_INCLUDE 1
2000#include "kern/kern_switch.c"
2001