sched_ule.c revision 165830
1/*-
2 * Copyright (c) 2002-2007, Jeffrey Roberson <jeff@freebsd.org>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice unmodified, this list of conditions, and the following
10 *    disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
19 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27#include <sys/cdefs.h>
28__FBSDID("$FreeBSD: head/sys/kern/sched_ule.c 165830 2007-01-06 12:33:43Z jeff $");
29
30#include "opt_hwpmc_hooks.h"
31#include "opt_sched.h"
32
33#include <sys/param.h>
34#include <sys/systm.h>
35#include <sys/kdb.h>
36#include <sys/kernel.h>
37#include <sys/ktr.h>
38#include <sys/lock.h>
39#include <sys/mutex.h>
40#include <sys/proc.h>
41#include <sys/resource.h>
42#include <sys/resourcevar.h>
43#include <sys/sched.h>
44#include <sys/smp.h>
45#include <sys/sx.h>
46#include <sys/sysctl.h>
47#include <sys/sysproto.h>
48#include <sys/turnstile.h>
49#include <sys/umtx.h>
50#include <sys/vmmeter.h>
51#ifdef KTRACE
52#include <sys/uio.h>
53#include <sys/ktrace.h>
54#endif
55
56#ifdef HWPMC_HOOKS
57#include <sys/pmckern.h>
58#endif
59
60#include <machine/cpu.h>
61#include <machine/smp.h>
62
63/*
64 * Thread scheduler specific section.
65 */
66struct td_sched {
67	TAILQ_ENTRY(td_sched) ts_procq;	/* (j/z) Run queue. */
68	int		ts_flags;	/* (j) TSF_* flags. */
69	struct thread	*ts_thread;	/* (*) Active associated thread. */
70	u_char		ts_rqindex;	/* (j) Run queue index. */
71	enum {
72		TSS_THREAD,
73		TSS_ONRUNQ
74	} ts_state;			/* (j) thread sched specific status. */
75	int		ts_slptime;
76	int		ts_slice;
77	struct runq	*ts_runq;
78	u_char		ts_cpu;		/* CPU that we have affinity for. */
79	/* The following variables are only used for pctcpu calculation */
80	int		ts_ltick;	/* Last tick that we were running on */
81	int		ts_ftick;	/* First tick that we were running on */
82	int		ts_ticks;	/* Tick count */
83
84	/* originally from kg_sched */
85	int	skg_slptime;		/* Number of ticks we vol. slept */
86	int	skg_runtime;		/* Number of ticks we were running */
87};
88#define	ts_assign		ts_procq.tqe_next
89/* flags kept in ts_flags */
90#define	TSF_ASSIGNED	0x0001		/* Thread is being migrated. */
91#define	TSF_BOUND	0x0002		/* Thread can not migrate. */
92#define	TSF_XFERABLE	0x0004		/* Thread was added as transferable. */
93#define	TSF_REMOVED	0x0008		/* Thread was removed while ASSIGNED */
94#define	TSF_DIDRUN	0x2000		/* Thread actually ran. */
95
96static struct td_sched td_sched0;
97
98/*
99 * Cpu percentage computation macros and defines.
100 *
101 * SCHED_TICK_SECS:	Number of seconds to average the cpu usage across.
102 * SCHED_TICK_TARG:	Number of hz ticks to average the cpu usage across.
103 * SCHED_TICK_MAX:	Maximum number of ticks before scaling back.
104 * SCHED_TICK_SHIFT:	Shift factor to avoid rounding away results.
105 * SCHED_TICK_HZ:	Compute the number of hz ticks for a given ticks count.
106 * SCHED_TICK_TOTAL:	Gives the amount of time we've been recording ticks.
107 */
108#define	SCHED_TICK_SECS		10
109#define	SCHED_TICK_TARG		(hz * SCHED_TICK_SECS)
110#define	SCHED_TICK_MAX		(SCHED_TICK_TARG + hz)
111#define	SCHED_TICK_SHIFT	10
112#define	SCHED_TICK_HZ(ts)	((ts)->ts_ticks >> SCHED_TICK_SHIFT)
113#define	SCHED_TICK_TOTAL(ts)	(max((ts)->ts_ltick - (ts)->ts_ftick, hz))
114
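/*
 * A minimal standalone sketch of the tick accounting above: sched_clock()
 * accrues hz ticks scaled by SCHED_TICK_SHIFT into ts_ticks,
 * SCHED_TICK_HZ() shifts them back down, and SCHED_TICK_TOTAL() is the
 * length of the averaging window.  The hz and stathz values below are
 * illustrative assumptions, not values taken from this file.
 */
#if 0	/* standalone userland sketch; not built with the kernel */
#include <stdio.h>

#define	SKETCH_HZ		1000	/* assumed hz */
#define	SKETCH_STATHZ		128	/* assumed stathz */
#define	SKETCH_TICK_SHIFT	10	/* mirrors SCHED_TICK_SHIFT */

int
main(void)
{
	int tickincr, ts_ticks, ts_ftick, ts_ltick;
	int hz_ticks, window;

	/* tickincr converts one stathz tick into shifted hz ticks. */
	tickincr = (SKETCH_HZ << SKETCH_TICK_SHIFT) / SKETCH_STATHZ;

	/* Pretend the thread ran for 3 of the last 10 seconds. */
	ts_ticks = 3 * SKETCH_STATHZ * tickincr;
	ts_ftick = 0;
	ts_ltick = 10 * SKETCH_HZ;

	hz_ticks = ts_ticks >> SKETCH_TICK_SHIFT;	/* SCHED_TICK_HZ() */
	window = ts_ltick - ts_ftick;			/* SCHED_TICK_TOTAL() */
	printf("ran %d of %d hz ticks (%d%% cpu over the window)\n",
	    hz_ticks, window, 100 * hz_ticks / window);
	return (0);
}
#endif
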
115/*
116 * These macros determine priorities for non-interactive threads.  They are
117 * assigned a priority based on their recent cpu utilization as expressed
118 * by the ratio of ticks to the tick total.  NHALF priorities at the start
119 * and end of the MIN to MAX timeshare range are only reachable with negative
120 * or positive nice respectively.
121 *
122 * PRI_RANGE:	Priority range for utilization dependent priorities.
123 * PRI_NRESV:	Number of nice values.
124 * PRI_TICKS:	Compute a priority in PRI_RANGE from the ticks count and total.
125 * PRI_NICE:	Determines the part of the priority inherited from nice.
126 */
127#define	SCHED_PRI_NRESV		(PRIO_MAX - PRIO_MIN)
128#define	SCHED_PRI_NHALF		(SCHED_PRI_NRESV / 2)
129#define	SCHED_PRI_MIN		(PRI_MIN_TIMESHARE + SCHED_PRI_NHALF)
130#define	SCHED_PRI_MAX		(PRI_MAX_TIMESHARE - SCHED_PRI_NHALF)
131#define	SCHED_PRI_RANGE		(SCHED_PRI_MAX - SCHED_PRI_MIN + 1)
132#define	SCHED_PRI_TICKS(ts)						\
133    (SCHED_TICK_HZ((ts)) /						\
134    (roundup(SCHED_TICK_TOTAL((ts)), SCHED_PRI_RANGE) / SCHED_PRI_RANGE))
135#define	SCHED_PRI_NICE(nice)	(nice)
136
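/*
 * A small standalone sketch of how SCHED_PRI_TICKS() maps accumulated cpu
 * ticks onto the utilization-dependent priority band: the window is split
 * into SCHED_PRI_RANGE equal buckets and the tick count selects a bucket.
 * The band width below is an illustrative stand-in for SCHED_PRI_RANGE.
 */
#if 0	/* standalone userland sketch; not built with the kernel */
#include <stdio.h>

#define	SKETCH_ROUNDUP(x, y)	((((x) + ((y) - 1)) / (y)) * (y))
#define	SKETCH_PRI_RANGE	24	/* illustrative band width */

/* Mirrors SCHED_PRI_TICKS(): hz_ticks run out of a window of 'total'. */
static int
pri_ticks(int hz_ticks, int total)
{
	return (hz_ticks /
	    (SKETCH_ROUNDUP(total, SKETCH_PRI_RANGE) / SKETCH_PRI_RANGE));
}

int
main(void)
{
	int total = 10000;	/* a 10 second window at an assumed hz of 1000 */
	int use;

	for (use = 0; use <= 100; use += 25)
		printf("%3d%% cpu -> +%d priority\n",
		    use, pri_ticks(total * use / 100, total));
	return (0);
}
#endif
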
137/*
138 * These determine the interactivity of a process.  Interactivity differs from
139 * cpu utilization in that it expresses the voluntary time slept vs time ran
140 * while cpu utilization includes all time not running.  This more accurately
141 * models the intent of the thread.
142 *
143 * SLP_RUN_MAX:	Maximum amount of sleep time + run time we'll accumulate
144 *		before throttling back.
145 * SLP_RUN_FORK:	Maximum slp+run time to inherit at fork time.
146 * INTERACT_MAX:	Maximum interactivity value.  Smaller is better.
147 * INTERACT_THRESH:	Threshold for placement on the current runq.
148 */
149#define	SCHED_SLP_RUN_MAX	((hz * 5) << SCHED_TICK_SHIFT)
150#define	SCHED_SLP_RUN_FORK	((hz / 2) << SCHED_TICK_SHIFT)
151#define	SCHED_INTERACT_MAX	(100)
152#define	SCHED_INTERACT_HALF	(SCHED_INTERACT_MAX / 2)
153#define	SCHED_INTERACT_THRESH	(30)
154
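/*
 * A standalone sketch of the interactivity score, mirroring
 * sched_interact_score() below: threads that sleep more than they run
 * score under SCHED_INTERACT_HALF, cpu hogs score above it, and only
 * scores under SCHED_INTERACT_THRESH count as interactive.  The kernel
 * feeds in shifted tick counts; plain integers are used here.
 */
#if 0	/* standalone userland sketch; not built with the kernel */
#include <stdio.h>

#define	SKETCH_INTERACT_MAX	100
#define	SKETCH_INTERACT_HALF	(SKETCH_INTERACT_MAX / 2)
#define	SKETCH_INTERACT_THRESH	30
#define	SKETCH_MAX(a, b)	((a) > (b) ? (a) : (b))

static int
interact_score(int runtime, int slptime)
{
	int div;

	if (runtime > slptime) {
		div = SKETCH_MAX(1, runtime / SKETCH_INTERACT_HALF);
		return (SKETCH_INTERACT_HALF +
		    (SKETCH_INTERACT_HALF - slptime / div));
	}
	if (slptime > runtime) {
		div = SKETCH_MAX(1, slptime / SKETCH_INTERACT_HALF);
		return (runtime / div);
	}
	return (0);
}

int
main(void)
{
	/* A thread that mostly sleeps scores low and is interactive. */
	printf("run 10/sleep 90 -> score %d\n", interact_score(10, 90));
	/* A cpu hog scores high and stays on the timeshare queue. */
	printf("run 90/sleep 10 -> score %d\n", interact_score(90, 10));
	printf("interactive threshold is %d\n", SKETCH_INTERACT_THRESH);
	return (0);
}
#endif
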
155/*
156 * tickincr:		Converts a stathz tick into a hz domain scaled by
157 *			the shift factor.  Without the shift the error rate
158 *			due to rounding would be unacceptably high.
159 * realstathz:		stathz is sometimes 0; when it is we run off of hz.
160 * sched_slice:		Runtime of each thread before rescheduling.
161 */
162static int sched_interact = SCHED_INTERACT_THRESH;
163static int realstathz;
164static int tickincr;
165static int sched_slice;
166static int sched_rebalance = 1;
167
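/*
 * A small standalone sketch of the tickincr and sched_slice arithmetic
 * performed in sched_initticks(), using assumed example clock rates.
 */
#if 0	/* standalone userland sketch; not built with the kernel */
#include <stdio.h>

#define	SKETCH_TICK_SHIFT	10	/* mirrors SCHED_TICK_SHIFT */

int
main(void)
{
	int hz = 1000, stathz = 128;	/* assumed clock rates */
	int tickincr, slice;

	tickincr = (hz << SKETCH_TICK_SHIFT) / stathz;
	if (tickincr == 0)
		tickincr = 1;
	slice = stathz / 7;

	printf("tickincr = %d shifted hz ticks per stathz tick\n", tickincr);
	printf("sched_slice = %d stathz ticks (~%d ms)\n",
	    slice, slice * 1000 / stathz);
	return (0);
}
#endif
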
168/*
169 * tdq - per processor runqs and statistics.
170 */
171struct tdq {
172	struct runq	tdq_idle;		/* Queue of IDLE threads. */
173	struct runq	tdq_timeshare;		/* timeshare run queue. */
174	struct runq	tdq_realtime;		/* real-time run queue. */
175	int		tdq_idx;		/* Current insert index. */
176	int		tdq_ridx;		/* Current removal index. */
177	int		tdq_load;		/* Aggregate load. */
178#ifdef SMP
179	int		tdq_transferable;
180	LIST_ENTRY(tdq)	tdq_siblings;		/* Next in tdq group. */
181	struct tdq_group *tdq_group;		/* Our processor group. */
182	volatile struct td_sched *tdq_assigned;	/* assigned by another CPU. */
183#else
184	int		tdq_sysload;		/* For loadavg, !ITHD load. */
185#endif
186};
187
188#ifdef SMP
189/*
190 * tdq groups are groups of processors which can cheaply share threads.  When
191 * one processor in the group goes idle it will check the runqs of the other
192 * processors in its group prior to halting and waiting for an interrupt.
193 * These groups are suitable for SMT (Simultaneous Multi-Threading) and not NUMA.
194 * In a NUMA environment we'd want an idle bitmap per group and a two tiered
195 * load balancer.
196 */
197struct tdq_group {
198	int	tdg_cpus;		/* Count of CPUs in this tdq group. */
199	cpumask_t tdg_cpumask;		/* Mask of cpus in this group. */
200	cpumask_t tdg_idlemask;		/* Idle cpus in this group. */
201	cpumask_t tdg_mask;		/* Bit mask for first cpu. */
202	int	tdg_load;		/* Total load of this group. */
203	int	tdg_transferable;	/* Transferable load of this group. */
204	LIST_HEAD(, tdq) tdg_members;	/* Linked list of all members. */
205};
206#endif
207
208/*
209 * One thread queue per processor.
210 */
211#ifdef SMP
212static cpumask_t tdq_idle;
213static int tdg_maxid;
214static struct tdq	tdq_cpu[MAXCPU];
215static struct tdq_group tdq_groups[MAXCPU];
216static int bal_tick;
217static int gbal_tick;
218static int balance_groups;
219
220#define	TDQ_SELF()	(&tdq_cpu[PCPU_GET(cpuid)])
221#define	TDQ_CPU(x)	(&tdq_cpu[(x)])
222#define	TDQ_ID(x)	((x) - tdq_cpu)
223#define	TDQ_GROUP(x)	(&tdq_groups[(x)])
224#else	/* !SMP */
225static struct tdq	tdq_cpu;
226
227#define	TDQ_SELF()	(&tdq_cpu)
228#define	TDQ_CPU(x)	(&tdq_cpu)
229#endif
230
231static struct td_sched *sched_choose(void);	/* XXX Should be thread * */
232static void sched_priority(struct thread *);
233static void sched_thread_priority(struct thread *, u_char);
234static int sched_interact_score(struct thread *);
235static void sched_interact_update(struct thread *);
236static void sched_interact_fork(struct thread *);
237static void sched_pctcpu_update(struct td_sched *);
238static inline void sched_pin_td(struct thread *td);
239static inline void sched_unpin_td(struct thread *td);
240
241/* Operations on per processor queues */
242static struct td_sched * tdq_choose(struct tdq *);
243static void tdq_setup(struct tdq *);
244static void tdq_load_add(struct tdq *, struct td_sched *);
245static void tdq_load_rem(struct tdq *, struct td_sched *);
246static __inline void tdq_runq_add(struct tdq *, struct td_sched *, int);
247static __inline void tdq_runq_rem(struct tdq *, struct td_sched *);
248void tdq_print(int cpu);
249static void runq_print(struct runq *rq);
250#ifdef SMP
251static int tdq_transfer(struct tdq *, struct td_sched *, int);
252static struct td_sched *runq_steal(struct runq *);
253static void sched_balance(void);
254static void sched_balance_groups(void);
255static void sched_balance_group(struct tdq_group *);
256static void sched_balance_pair(struct tdq *, struct tdq *);
257static void sched_smp_tick(void);
258static void tdq_move(struct tdq *, int);
259static int tdq_idled(struct tdq *);
260static void tdq_notify(struct td_sched *, int);
261static void tdq_assign(struct tdq *);
262static struct td_sched *tdq_steal(struct tdq *, int);
263
264#define	THREAD_CAN_MIGRATE(td)						\
265    ((td)->td_pinned == 0 && (td)->td_pri_class != PRI_ITHD)
266#endif
267
268static void sched_setup(void *dummy);
269SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL)
270
271static void sched_initticks(void *dummy);
272SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks, NULL)
273
274static inline void
275sched_pin_td(struct thread *td)
276{
277	td->td_pinned++;
278}
279
280static inline void
281sched_unpin_td(struct thread *td)
282{
283	td->td_pinned--;
284}
285
286static void
287runq_print(struct runq *rq)
288{
289	struct rqhead *rqh;
290	struct td_sched *ts;
291	int pri;
292	int j;
293	int i;
294
295	for (i = 0; i < RQB_LEN; i++) {
296		printf("\t\trunq bits %d 0x%zx\n",
297		    i, rq->rq_status.rqb_bits[i]);
298		for (j = 0; j < RQB_BPW; j++)
299			if (rq->rq_status.rqb_bits[i] & (1ul << j)) {
300				pri = j + (i << RQB_L2BPW);
301				rqh = &rq->rq_queues[pri];
302				TAILQ_FOREACH(ts, rqh, ts_procq) {
303					printf("\t\t\ttd %p(%s) priority %d rqindex %d pri %d\n",
304					    ts->ts_thread, ts->ts_thread->td_proc->p_comm, ts->ts_thread->td_priority, ts->ts_rqindex, pri);
305				}
306			}
307	}
308}
309
310void
311tdq_print(int cpu)
312{
313	struct tdq *tdq;
314
315	tdq = TDQ_CPU(cpu);
316
317	printf("tdq:\n");
318	printf("\tload:           %d\n", tdq->tdq_load);
319	printf("\ttimeshare idx: %d\n", tdq->tdq_idx);
320	printf("\ttimeshare ridx: %d\n", tdq->tdq_ridx);
321	printf("\trealtime runq:\n");
322	runq_print(&tdq->tdq_realtime);
323	printf("\ttimeshare runq:\n");
324	runq_print(&tdq->tdq_timeshare);
325	printf("\tidle runq:\n");
326	runq_print(&tdq->tdq_idle);
327#ifdef SMP
328	printf("\tload transferable: %d\n", tdq->tdq_transferable);
329#endif
330}
331
332static __inline void
333tdq_runq_add(struct tdq *tdq, struct td_sched *ts, int flags)
334{
335#ifdef SMP
336	if (THREAD_CAN_MIGRATE(ts->ts_thread)) {
337		tdq->tdq_transferable++;
338		tdq->tdq_group->tdg_transferable++;
339		ts->ts_flags |= TSF_XFERABLE;
340	}
341#endif
342	if (ts->ts_runq == &tdq->tdq_timeshare) {
343		int pri;
344
345		pri = ts->ts_thread->td_priority;
346		KASSERT(pri <= PRI_MAX_TIMESHARE && pri >= PRI_MIN_TIMESHARE,
347			("Invalid priority %d on timeshare runq", pri));
348		/*
349		 * This queue contains only priorities between MIN and MAX
350		 * realtime.  Use the whole queue to represent these values.
351		 */
352#define	TS_RQ_PPQ	(((PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE) + 1) / RQ_NQS)
353		if ((flags & SRQ_BORROWING) == 0) {
354			pri = (pri - PRI_MIN_TIMESHARE) / TS_RQ_PPQ;
355			pri = (pri + tdq->tdq_idx) % RQ_NQS;
356			/*
357			 * This effectively shortens the queue by one so we
358			 * can have a one slot difference between idx and
359			 * ridx while we wait for threads to drain.
360			 */
361			if (tdq->tdq_ridx != tdq->tdq_idx &&
362			    pri == tdq->tdq_ridx)
363				pri = (pri - 1) % RQ_NQS;
364		} else
365			pri = tdq->tdq_ridx;
366		runq_add_pri(ts->ts_runq, ts, pri, flags);
367	} else
368		runq_add(ts->ts_runq, ts, flags);
369}
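
/*
 * A standalone sketch of the circular timeshare index computed in
 * tdq_runq_add() above: relpri is the priority offset within the
 * timeshare range, the slot rotates with tdq_idx, and the slot still
 * draining at tdq_ridx is skipped.  The queue count and
 * priorities-per-queue values are illustrative stand-ins for RQ_NQS and
 * TS_RQ_PPQ.
 */
#if 0	/* standalone userland sketch; not built with the kernel */
#include <stdio.h>

#define	SKETCH_NQS	64	/* illustrative run queue count */
#define	SKETCH_PPQ	1	/* illustrative priorities per queue */

static int
timeshare_slot(int relpri, int idx, int ridx)
{
	int pri;

	pri = (relpri / SKETCH_PPQ + idx) % SKETCH_NQS;
	if (ridx != idx && pri == ridx)
		pri = (pri - 1) % SKETCH_NQS;
	return (pri);
}

int
main(void)
{
	/* Insert head at slot 10 while slot 8 is still draining. */
	printf("relpri 0  -> slot %d\n", timeshare_slot(0, 10, 8));
	printf("relpri 40 -> slot %d\n", timeshare_slot(40, 10, 8));
	printf("relpri 62 -> slot %d (collided with ridx, backed off)\n",
	    timeshare_slot(62, 10, 8));
	return (0);
}
#endif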
370
371static __inline void
372tdq_runq_rem(struct tdq *tdq, struct td_sched *ts)
373{
374#ifdef SMP
375	if (ts->ts_flags & TSF_XFERABLE) {
376		tdq->tdq_transferable--;
377		tdq->tdq_group->tdg_transferable--;
378		ts->ts_flags &= ~TSF_XFERABLE;
379	}
380#endif
381	if (ts->ts_runq == &tdq->tdq_timeshare) {
382		if (tdq->tdq_idx != tdq->tdq_ridx)
383			runq_remove_idx(ts->ts_runq, ts, &tdq->tdq_ridx);
384		else
385			runq_remove_idx(ts->ts_runq, ts, NULL);
386		/*
387		 * For timeshare threads we update the priority here so
388		 * the priority reflects the time we've been sleeping.
389		 */
390		ts->ts_ltick = ticks;
391		sched_pctcpu_update(ts);
392		sched_priority(ts->ts_thread);
393	} else
394		runq_remove(ts->ts_runq, ts);
395}
396
397static void
398tdq_load_add(struct tdq *tdq, struct td_sched *ts)
399{
400	int class;
401	mtx_assert(&sched_lock, MA_OWNED);
402	class = PRI_BASE(ts->ts_thread->td_pri_class);
403	tdq->tdq_load++;
404	CTR1(KTR_SCHED, "load: %d", tdq->tdq_load);
405	if (class != PRI_ITHD && (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0)
406#ifdef SMP
407		tdq->tdq_group->tdg_load++;
408#else
409		tdq->tdq_sysload++;
410#endif
411}
412
413static void
414tdq_load_rem(struct tdq *tdq, struct td_sched *ts)
415{
416	int class;
417	mtx_assert(&sched_lock, MA_OWNED);
418	class = PRI_BASE(ts->ts_thread->td_pri_class);
419	if (class != PRI_ITHD  && (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0)
420#ifdef SMP
421		tdq->tdq_group->tdg_load--;
422#else
423		tdq->tdq_sysload--;
424#endif
425	tdq->tdq_load--;
426	CTR1(KTR_SCHED, "load: %d", tdq->tdq_load);
427	ts->ts_runq = NULL;
428}
429
430#ifdef SMP
431static void
432sched_smp_tick(void)
433{
434	struct tdq *tdq;
435
436	tdq = TDQ_SELF();
437	if (sched_rebalance) {
438		if (ticks >= bal_tick)
439			sched_balance();
440		if (ticks >= gbal_tick && balance_groups)
441			sched_balance_groups();
442	}
443	/*
444	 * We could have been assigned a non real-time thread without an
445	 * IPI.
446	 */
447	if (tdq->tdq_assigned)
448		tdq_assign(tdq);	/* Potentially sets NEEDRESCHED */
449}
450
451/*
452 * sched_balance is a simple CPU load balancing algorithm.  It operates by
453 * finding the least loaded and most loaded cpu and equalizing their load
454 * by migrating some processes.
455 *
456 * Dealing only with two CPUs at a time has two advantages.  Firstly, most
457 * installations will only have 2 cpus.  Secondly, load balancing too much at
458 * once can have an unpleasant effect on the system.  The scheduler rarely has
459 * enough information to make perfect decisions.  So this algorithm chooses
460 * simplicity and more gradual effects on load in larger systems.
461 *
462 * It could be improved by considering the priorities and slices assigned to
463 * each task prior to balancing them.  There are many pathological cases with
464 * any approach and so the semi random algorithm below may work as well as any.
465 *
466 */
467static void
468sched_balance(void)
469{
470	struct tdq_group *high;
471	struct tdq_group *low;
472	struct tdq_group *tdg;
473	int cnt;
474	int i;
475
476	bal_tick = ticks + (random() % (hz * 2));
477	if (smp_started == 0)
478		return;
479	low = high = NULL;
480	i = random() % (tdg_maxid + 1);
481	for (cnt = 0; cnt <= tdg_maxid; cnt++) {
482		tdg = TDQ_GROUP(i);
483		/*
484		 * Find the CPU with the highest load that has some
485		 * threads to transfer.
486		 */
487		if ((high == NULL || tdg->tdg_load > high->tdg_load)
488		    && tdg->tdg_transferable)
489			high = tdg;
490		if (low == NULL || tdg->tdg_load < low->tdg_load)
491			low = tdg;
492		if (++i > tdg_maxid)
493			i = 0;
494	}
495	if (low != NULL && high != NULL && high != low)
496		sched_balance_pair(LIST_FIRST(&high->tdg_members),
497		    LIST_FIRST(&low->tdg_members));
498}
499
500static void
501sched_balance_groups(void)
502{
503	int i;
504
505	gbal_tick = ticks + (random() % (hz * 2));
506	mtx_assert(&sched_lock, MA_OWNED);
507	if (smp_started)
508		for (i = 0; i <= tdg_maxid; i++)
509			sched_balance_group(TDQ_GROUP(i));
510}
511
512static void
513sched_balance_group(struct tdq_group *tdg)
514{
515	struct tdq *tdq;
516	struct tdq *high;
517	struct tdq *low;
518	int load;
519
520	if (tdg->tdg_transferable == 0)
521		return;
522	low = NULL;
523	high = NULL;
524	LIST_FOREACH(tdq, &tdg->tdg_members, tdq_siblings) {
525		load = tdq->tdq_load;
526		if (high == NULL || load > high->tdq_load)
527			high = tdq;
528		if (low == NULL || load < low->tdq_load)
529			low = tdq;
530	}
531	if (high != NULL && low != NULL && high != low)
532		sched_balance_pair(high, low);
533}
534
535static void
536sched_balance_pair(struct tdq *high, struct tdq *low)
537{
538	int transferable;
539	int high_load;
540	int low_load;
541	int move;
542	int diff;
543	int i;
544
545	/*
546	 * If we're transferring within a group we have to use this specific
547	 * tdq's transferable count, otherwise we can steal from other members
548	 * of the group.
549	 */
550	if (high->tdq_group == low->tdq_group) {
551		transferable = high->tdq_transferable;
552		high_load = high->tdq_load;
553		low_load = low->tdq_load;
554	} else {
555		transferable = high->tdq_group->tdg_transferable;
556		high_load = high->tdq_group->tdg_load;
557		low_load = low->tdq_group->tdg_load;
558	}
559	if (transferable == 0)
560		return;
561	/*
562	 * Determine what the imbalance is and then adjust that to how many
563	 * threads we actually have to give up (transferable).
564	 */
565	diff = high_load - low_load;
566	move = diff / 2;
567	if (diff & 0x1)
568		move++;
569	move = min(move, transferable);
570	for (i = 0; i < move; i++)
571		tdq_move(high, TDQ_ID(low));
572	return;
573}
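
/*
 * A minimal standalone sketch of the equalization arithmetic in
 * sched_balance_pair() above: move half of the load difference, rounded
 * up, but never more threads than are transferable.
 */
#if 0	/* standalone userland sketch; not built with the kernel */
#include <stdio.h>

static int
balance_move(int high_load, int low_load, int transferable)
{
	int diff, move;

	diff = high_load - low_load;
	move = diff / 2;
	if (diff & 0x1)
		move++;
	if (move > transferable)
		move = transferable;
	return (move);
}

int
main(void)
{
	printf("loads 7/2, 4 transferable -> move %d\n", balance_move(7, 2, 4));
	printf("loads 7/2, 1 transferable -> move %d\n", balance_move(7, 2, 1));
	return (0);
}
#endif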
574
575static void
576tdq_move(struct tdq *from, int cpu)
577{
578	struct tdq *tdq;
579	struct tdq *to;
580	struct td_sched *ts;
581
582	tdq = from;
583	to = TDQ_CPU(cpu);
584	ts = tdq_steal(tdq, 1);
585	if (ts == NULL) {
586		struct tdq_group *tdg;
587
588		tdg = tdq->tdq_group;
589		LIST_FOREACH(tdq, &tdg->tdg_members, tdq_siblings) {
590			if (tdq == from || tdq->tdq_transferable == 0)
591				continue;
592			ts = tdq_steal(tdq, 1);
593			break;
594		}
595		if (ts == NULL)
596			panic("tdq_move: No threads available with a "
597			    "transferable count of %d\n",
598			    tdg->tdg_transferable);
599	}
600	if (tdq == to)
601		return;
602	ts->ts_state = TSS_THREAD;
603	tdq_runq_rem(tdq, ts);
604	tdq_load_rem(tdq, ts);
605	tdq_notify(ts, cpu);
606}
607
608static int
609tdq_idled(struct tdq *tdq)
610{
611	struct tdq_group *tdg;
612	struct tdq *steal;
613	struct td_sched *ts;
614
615	tdg = tdq->tdq_group;
616	/*
617	 * If we're in a cpu group, try and steal threads from another cpu in
618	 * the group before idling.
619	 */
620	if (tdg->tdg_cpus > 1 && tdg->tdg_transferable) {
621		LIST_FOREACH(steal, &tdg->tdg_members, tdq_siblings) {
622			if (steal == tdq || steal->tdq_transferable == 0)
623				continue;
624			ts = tdq_steal(steal, 0);
625			if (ts == NULL)
626				continue;
627			ts->ts_state = TSS_THREAD;
628			tdq_runq_rem(steal, ts);
629			tdq_load_rem(steal, ts);
630			ts->ts_cpu = PCPU_GET(cpuid);
631			sched_pin_td(ts->ts_thread);
632			sched_add(ts->ts_thread, SRQ_YIELDING);
633			sched_unpin_td(ts->ts_thread);
634			return (0);
635		}
636	}
637	/*
638	 * We only set the idled bit when all of the cpus in the group are
639	 * idle.  Otherwise we could get into a situation where a thread bounces
640	 * back and forth between two idle cores on separate physical CPUs.
641	 */
642	tdg->tdg_idlemask |= PCPU_GET(cpumask);
643	if (tdg->tdg_idlemask != tdg->tdg_cpumask)
644		return (1);
645	atomic_set_int(&tdq_idle, tdg->tdg_mask);
646	return (1);
647}
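
/*
 * A standalone sketch of the idle bookkeeping at the end of tdq_idled():
 * a cpu sets its own bit in the group idle mask, and the group's bit is
 * only set in the global idle mask once every sibling is idle too.  The
 * mask type and bit values below are illustrative.
 */
#if 0	/* standalone userland sketch; not built with the kernel */
#include <stdio.h>

typedef unsigned int sketch_cpumask_t;	/* stand-in for cpumask_t */

static void
group_idle(sketch_cpumask_t self, sketch_cpumask_t group_cpus,
    sketch_cpumask_t *group_idlemask, sketch_cpumask_t *global_idle,
    sketch_cpumask_t group_bit)
{
	*group_idlemask |= self;
	if (*group_idlemask == group_cpus)
		*global_idle |= group_bit;
}

int
main(void)
{
	sketch_cpumask_t gidle = 0, global = 0;
	sketch_cpumask_t cpus = 0x3;	/* a two-way SMT pair: cpus 0 and 1 */

	group_idle(0x1, cpus, &gidle, &global, 0x1);
	printf("cpu0 idle: group 0x%x global 0x%x\n", gidle, global);
	group_idle(0x2, cpus, &gidle, &global, 0x1);
	printf("cpu1 idle: group 0x%x global 0x%x\n", gidle, global);
	return (0);
}
#endif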
648
649static void
650tdq_assign(struct tdq *tdq)
651{
652	struct td_sched *nts;
653	struct td_sched *ts;
654
655	do {
656		*(volatile struct td_sched **)&ts = tdq->tdq_assigned;
657	} while(!atomic_cmpset_ptr((volatile uintptr_t *)&tdq->tdq_assigned,
658		(uintptr_t)ts, (uintptr_t)NULL));
659	for (; ts != NULL; ts = nts) {
660		nts = ts->ts_assign;
661		tdq->tdq_group->tdg_load--;
662		tdq->tdq_load--;
663		ts->ts_flags &= ~TSF_ASSIGNED;
664		if (ts->ts_flags & TSF_REMOVED) {
665			ts->ts_flags &= ~TSF_REMOVED;
666			continue;
667		}
668		sched_pin_td(ts->ts_thread);
669		sched_add(ts->ts_thread, SRQ_YIELDING);
670		sched_unpin_td(ts->ts_thread);
671	}
672}
673
674static void
675tdq_notify(struct td_sched *ts, int cpu)
676{
677	struct tdq *tdq;
678	struct thread *td;
679	struct pcpu *pcpu;
680	int class;
681	int prio;
682
683	tdq = TDQ_CPU(cpu);
684	class = PRI_BASE(ts->ts_thread->td_pri_class);
685	if ((class != PRI_IDLE && class != PRI_ITHD)
686	    && (tdq_idle & tdq->tdq_group->tdg_mask))
687		atomic_clear_int(&tdq_idle, tdq->tdq_group->tdg_mask);
688	tdq->tdq_group->tdg_load++;
689	tdq->tdq_load++;
690	ts->ts_cpu = cpu;
691	ts->ts_flags |= TSF_ASSIGNED;
692	prio = ts->ts_thread->td_priority;
693
694	/*
695	 * Place a thread on another cpu's queue and force a resched.
696	 */
697	do {
698		*(volatile struct td_sched **)&ts->ts_assign = tdq->tdq_assigned;
699	} while(!atomic_cmpset_ptr((volatile uintptr_t *)&tdq->tdq_assigned,
700		(uintptr_t)ts->ts_assign, (uintptr_t)ts));
701	/* Only ipi for realtime/ithd priorities */
702	if (ts->ts_thread->td_priority > PRI_MIN_KERN)
703		return;
704	/*
705	 * Without sched_lock we could lose a race where we set NEEDRESCHED
706	 * on a thread that is switched out before the IPI is delivered.  This
707	 * would lead us to miss the resched.  This will be a problem once
708	 * sched_lock is pushed down.
709	 */
710	pcpu = pcpu_find(cpu);
711	td = pcpu->pc_curthread;
712	if (ts->ts_thread->td_priority < td->td_priority) {
713		td->td_flags |= TDF_NEEDRESCHED;
714		ipi_selected(1 << cpu, IPI_AST);
715	}
716}
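
/*
 * A standalone sketch of the lock-free hand-off used by tdq_notify() and
 * tdq_assign() above: producers push onto a singly linked list head with
 * a compare-and-swap loop and the consumer swaps the whole list out at
 * once.  C11 atomics stand in for the kernel's atomic_cmpset_ptr().
 */
#if 0	/* standalone userland sketch; not built with the kernel */
#include <stdatomic.h>
#include <stdio.h>

struct node {
	struct node	*next;
	int		id;
};

static _Atomic(struct node *) assigned;

static void
push(struct node *n)
{
	/* On failure the CAS reloads the observed head into n->next. */
	do {
		n->next = atomic_load(&assigned);
	} while (!atomic_compare_exchange_weak(&assigned, &n->next, n));
}

static void
drain(void)
{
	struct node *n;

	/* Take the whole list in one atomic swap, then walk it. */
	for (n = atomic_exchange(&assigned, NULL); n != NULL; n = n->next)
		printf("draining node %d\n", n->id);
}

int
main(void)
{
	struct node a = { NULL, 1 }, b = { NULL, 2 };

	push(&a);
	push(&b);
	drain();
	return (0);
}
#endif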
717
718static struct td_sched *
719runq_steal(struct runq *rq)
720{
721	struct rqhead *rqh;
722	struct rqbits *rqb;
723	struct td_sched *ts;
724	int word;
725	int bit;
726
727	mtx_assert(&sched_lock, MA_OWNED);
728	rqb = &rq->rq_status;
729	for (word = 0; word < RQB_LEN; word++) {
730		if (rqb->rqb_bits[word] == 0)
731			continue;
732		for (bit = 0; bit < RQB_BPW; bit++) {
733			if ((rqb->rqb_bits[word] & (1ul << bit)) == 0)
734				continue;
735			rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)];
736			TAILQ_FOREACH(ts, rqh, ts_procq) {
737				if (THREAD_CAN_MIGRATE(ts->ts_thread))
738					return (ts);
739			}
740		}
741	}
742	return (NULL);
743}
744
745static struct td_sched *
746tdq_steal(struct tdq *tdq, int stealidle)
747{
748	struct td_sched *ts;
749
750	/*
751	 * Steal from next first to try to get a non-interactive task that
752	 * may not have run for a while.
753	 * XXX Need to effect steal order for timeshare threads.
754	 */
755	if ((ts = runq_steal(&tdq->tdq_realtime)) != NULL)
756		return (ts);
757	if ((ts = runq_steal(&tdq->tdq_timeshare)) != NULL)
758		return (ts);
759	if (stealidle)
760		return (runq_steal(&tdq->tdq_idle));
761	return (NULL);
762}
763
764int
765tdq_transfer(struct tdq *tdq, struct td_sched *ts, int class)
766{
767	struct tdq_group *ntdg;
768	struct tdq_group *tdg;
769	struct tdq *old;
770	int cpu;
771	int idx;
772
773	if (smp_started == 0)
774		return (0);
775	cpu = 0;
776	/*
777	 * If our load exceeds a certain threshold we should attempt to
778	 * reassign this thread.  The first candidate is the cpu that
779	 * originally ran the thread.  If it is idle, assign it there,
780	 * otherwise, pick an idle cpu.
781	 *
782	 * The threshold at which we start to reassign has a large impact
783	 * on the overall performance of the system.  Tuned too high and
784	 * some CPUs may idle.  Too low and there will be excess migration
785	 * and context switches.
786	 */
787	old = TDQ_CPU(ts->ts_cpu);
788	ntdg = old->tdq_group;
789	tdg = tdq->tdq_group;
790	if (tdq_idle) {
791		if (tdq_idle & ntdg->tdg_mask) {
792			cpu = ffs(ntdg->tdg_idlemask);
793			if (cpu) {
794				CTR2(KTR_SCHED,
795				    "tdq_transfer: %p found old cpu %X "
796				    "in idlemask.", ts, cpu);
797				goto migrate;
798			}
799		}
800		/*
801		 * Multiple cpus could find this bit simultaneously
802		 * but the race shouldn't be terrible.
803		 */
804		cpu = ffs(tdq_idle);
805		if (cpu) {
806			CTR2(KTR_SCHED, "tdq_transfer: %p found %X "
807			    "in idlemask.", ts, cpu);
808			goto migrate;
809		}
810	}
811	idx = 0;
812#if 0
813	if (old->tdq_load < tdq->tdq_load) {
814		cpu = ts->ts_cpu + 1;
815		CTR2(KTR_SCHED, "tdq_transfer: %p old cpu %X "
816		    "load less than ours.", ts, cpu);
817		goto migrate;
818	}
819	/*
820	 * No new CPU was found, look for one with less load.
821	 */
822	for (idx = 0; idx <= tdg_maxid; idx++) {
823		ntdg = TDQ_GROUP(idx);
824		if (ntdg->tdg_load /*+ (ntdg->tdg_cpus  * 2)*/ < tdg->tdg_load) {
825			cpu = ffs(ntdg->tdg_cpumask);
826			CTR2(KTR_SCHED, "tdq_transfer: %p cpu %X load less "
827			    "than ours.", ts, cpu);
828			goto migrate;
829		}
830	}
831#endif
832	/*
833	 * If another cpu in this group has idled, assign a thread over
834	 * to them after checking to see if there are idled groups.
835	 */
836	if (tdg->tdg_idlemask) {
837		cpu = ffs(tdg->tdg_idlemask);
838		if (cpu) {
839			CTR2(KTR_SCHED, "tdq_transfer: %p cpu %X idle in "
840			    "group.", ts, cpu);
841			goto migrate;
842		}
843	}
844	return (0);
845migrate:
846	/*
847	 * Now that we've found an idle CPU, migrate the thread.
848	 */
849	cpu--;
850	ts->ts_runq = NULL;
851	tdq_notify(ts, cpu);
852
853	return (1);
854}
855
856#endif	/* SMP */
857
858/*
859 * Pick the highest priority task we have and return it.
860 */
861
862static struct td_sched *
863tdq_choose(struct tdq *tdq)
864{
865	struct td_sched *ts;
866
867	mtx_assert(&sched_lock, MA_OWNED);
868
869	ts = runq_choose(&tdq->tdq_realtime);
870	if (ts != NULL) {
871		KASSERT(ts->ts_thread->td_priority <= PRI_MAX_REALTIME,
872		    ("tdq_choose: Invalid priority on realtime queue %d",
873		    ts->ts_thread->td_priority));
874		return (ts);
875	}
876	ts = runq_choose_from(&tdq->tdq_timeshare, tdq->tdq_ridx);
877	if (ts != NULL) {
878		KASSERT(ts->ts_thread->td_priority <= PRI_MAX_TIMESHARE &&
879		    ts->ts_thread->td_priority >= PRI_MIN_TIMESHARE,
880		    ("tdq_choose: Invalid priority on timeshare queue %d",
881		    ts->ts_thread->td_priority));
882		return (ts);
883	}
884
885	ts = runq_choose(&tdq->tdq_idle);
886	if (ts != NULL) {
887		KASSERT(ts->ts_thread->td_priority >= PRI_MIN_IDLE,
888		    ("tdq_choose: Invalid priority on idle queue %d",
889		    ts->ts_thread->td_priority));
890		return (ts);
891	}
892
893	return (NULL);
894}
895
896static void
897tdq_setup(struct tdq *tdq)
898{
899	runq_init(&tdq->tdq_realtime);
900	runq_init(&tdq->tdq_timeshare);
901	runq_init(&tdq->tdq_idle);
902	tdq->tdq_load = 0;
903}
904
905static void
906sched_setup(void *dummy)
907{
908#ifdef SMP
909	int i;
910#endif
911
912	/*
913	 * To avoid divide-by-zero, we set realstathz to a dummy value
914	 * in case sched_clock() is called before sched_initticks().
915	 */
916	realstathz = hz;
917	sched_slice = (realstathz/7);	/* 140ms */
918	tickincr = 1 << SCHED_TICK_SHIFT;
919
920#ifdef SMP
921	balance_groups = 0;
922	/*
923	 * Initialize the tdqs.
924	 */
925	for (i = 0; i < MAXCPU; i++) {
926		struct tdq *tdq;
927
928		tdq = &tdq_cpu[i];
929		tdq->tdq_assigned = NULL;
930		tdq_setup(&tdq_cpu[i]);
931	}
932	if (smp_topology == NULL) {
933		struct tdq_group *tdg;
934		struct tdq *tdq;
935		int cpus;
936
937		for (cpus = 0, i = 0; i < MAXCPU; i++) {
938			if (CPU_ABSENT(i))
939				continue;
940			tdq = &tdq_cpu[i];
941			tdg = &tdq_groups[cpus];
942			/*
943			 * Setup a tdq group with one member.
944			 */
945			tdq->tdq_transferable = 0;
946			tdq->tdq_group = tdg;
947			tdg->tdg_cpus = 1;
948			tdg->tdg_idlemask = 0;
949			tdg->tdg_cpumask = tdg->tdg_mask = 1 << i;
950			tdg->tdg_load = 0;
951			tdg->tdg_transferable = 0;
952			LIST_INIT(&tdg->tdg_members);
953			LIST_INSERT_HEAD(&tdg->tdg_members, tdq, tdq_siblings);
954			cpus++;
955		}
956		tdg_maxid = cpus - 1;
957	} else {
958		struct tdq_group *tdg;
959		struct cpu_group *cg;
960		int j;
961
962		for (i = 0; i < smp_topology->ct_count; i++) {
963			cg = &smp_topology->ct_group[i];
964			tdg = &tdq_groups[i];
965			/*
966			 * Initialize the group.
967			 */
968			tdg->tdg_idlemask = 0;
969			tdg->tdg_load = 0;
970			tdg->tdg_transferable = 0;
971			tdg->tdg_cpus = cg->cg_count;
972			tdg->tdg_cpumask = cg->cg_mask;
973			LIST_INIT(&tdg->tdg_members);
974			/*
975			 * Find all of the group members and add them.
976			 */
977			for (j = 0; j < MAXCPU; j++) {
978				if ((cg->cg_mask & (1 << j)) != 0) {
979					if (tdg->tdg_mask == 0)
980						tdg->tdg_mask = 1 << j;
981					tdq_cpu[j].tdq_transferable = 0;
982					tdq_cpu[j].tdq_group = tdg;
983					LIST_INSERT_HEAD(&tdg->tdg_members,
984					    &tdq_cpu[j], tdq_siblings);
985				}
986			}
987			if (tdg->tdg_cpus > 1)
988				balance_groups = 1;
989		}
990		tdg_maxid = smp_topology->ct_count - 1;
991	}
992	/*
993	 * Stagger the group and global load balancer so they do not
994	 * interfere with each other.
995	 */
996	bal_tick = ticks + hz;
997	if (balance_groups)
998		gbal_tick = ticks + (hz / 2);
999#else
1000	tdq_setup(TDQ_SELF());
1001#endif
1002	mtx_lock_spin(&sched_lock);
1003	tdq_load_add(TDQ_SELF(), &td_sched0);
1004	mtx_unlock_spin(&sched_lock);
1005}
1006
1007/* ARGSUSED */
1008static void
1009sched_initticks(void *dummy)
1010{
1011	mtx_lock_spin(&sched_lock);
1012	realstathz = stathz ? stathz : hz;
1013	sched_slice = (realstathz/7);	/* ~140ms */
1014
1015	/*
1016	 * tickincr is shifted out by 10 to avoid rounding errors due to
1017	 * hz not being evenly divisible by stathz on all platforms.
1018	 */
1019	tickincr = (hz << SCHED_TICK_SHIFT) / realstathz;
1020	/*
1021	 * This does not work for values of stathz that are more than
1022	 * (1 << SCHED_TICK_SHIFT) * hz.  In practice this does not happen.
1023	 */
1024	if (tickincr == 0)
1025		tickincr = 1;
1026	mtx_unlock_spin(&sched_lock);
1027}
1028
1029
1030/*
1031 * Scale the scheduling priority according to the "interactivity" of this
1032 * process.
1033 */
1034static void
1035sched_priority(struct thread *td)
1036{
1037	int score;
1038	int pri;
1039
1040	if (td->td_pri_class != PRI_TIMESHARE)
1041		return;
1042	/*
1043	 * If the score is interactive we place the thread in the realtime
1044	 * queue with a priority that is less than kernel and interrupt
1045	 * priorities.  These threads are not subject to nice restrictions.
1046	 *
1047	 * Scores greater than this are placed on the normal realtime queue
1048	 * where the priority is partially decided by the most recent cpu
1049	 * utilization and the rest is decided by nice value.
1050	 */
1051	score = sched_interact_score(td);
1052	if (score < sched_interact) {
1053		pri = PRI_MIN_REALTIME;
1054		pri += ((PRI_MAX_REALTIME - PRI_MIN_REALTIME) / sched_interact)
1055		    * score;
1056		KASSERT(pri >= PRI_MIN_REALTIME && pri <= PRI_MAX_REALTIME,
1057		    ("sched_priority: invalid interactive priority %d", pri));
1058	} else {
1059		pri = SCHED_PRI_MIN;
1060		if (td->td_sched->ts_ticks)
1061			pri += SCHED_PRI_TICKS(td->td_sched);
1062		pri += SCHED_PRI_NICE(td->td_proc->p_nice);
1063		if (!(pri >= PRI_MIN_TIMESHARE && pri <= PRI_MAX_TIMESHARE)) {
1064			static int once = 1;
1065			if (once) {
1066				printf("sched_priority: invalid priority %d",
1067				    pri);
1068				printf("nice %d, ticks %d ftick %d ltick %d tick pri %d\n",
1069				    td->td_proc->p_nice,
1070				    td->td_sched->ts_ticks,
1071				    td->td_sched->ts_ftick,
1072				    td->td_sched->ts_ltick,
1073				    SCHED_PRI_TICKS(td->td_sched));
1074				once = 0;
1075			}
1076			pri = min(max(pri, PRI_MIN_TIMESHARE),
1077			    PRI_MAX_TIMESHARE);
1078		}
1079	}
1080	sched_user_prio(td, pri);
1081
1082	return;
1083}
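
/*
 * A standalone sketch of the two priority paths in sched_priority()
 * above: interactive scores map linearly onto the realtime band, while
 * everything else starts at SCHED_PRI_MIN and is pushed down by cpu
 * usage and nice.  The band boundaries below are assumed example values,
 * not taken from <sys/priority.h>.
 */
#if 0	/* standalone userland sketch; not built with the kernel */
#include <stdio.h>

#define	SK_PRI_MIN_REALTIME	128	/* assumed */
#define	SK_PRI_MAX_REALTIME	159	/* assumed */
#define	SK_PRI_MIN_TIMESHARE	160	/* assumed */
#define	SK_PRI_MAX_TIMESHARE	223	/* assumed */
#define	SK_PRI_NHALF		20	/* half the nice range */
#define	SK_INTERACT_THRESH	30

static int
user_priority(int score, int pri_ticks, int nice)
{
	int pri;

	if (score < SK_INTERACT_THRESH)
		return (SK_PRI_MIN_REALTIME +
		    ((SK_PRI_MAX_REALTIME - SK_PRI_MIN_REALTIME) /
		    SK_INTERACT_THRESH) * score);
	pri = SK_PRI_MIN_TIMESHARE + SK_PRI_NHALF + pri_ticks + nice;
	if (pri < SK_PRI_MIN_TIMESHARE)
		pri = SK_PRI_MIN_TIMESHARE;
	if (pri > SK_PRI_MAX_TIMESHARE)
		pri = SK_PRI_MAX_TIMESHARE;
	return (pri);
}

int
main(void)
{
	printf("score 5, idle shell     -> pri %d\n", user_priority(5, 0, 0));
	printf("score 80, busy, nice 0  -> pri %d\n", user_priority(80, 11, 0));
	printf("score 80, busy, nice 20 -> pri %d\n", user_priority(80, 11, 20));
	return (0);
}
#endif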
1084
1085/*
1086 * This routine enforces a maximum limit on the amount of scheduling history
1087 * kept.  It is called after either the slptime or runtime is adjusted.
1088 */
1089static void
1090sched_interact_update(struct thread *td)
1091{
1092	struct td_sched *ts;
1093	int sum;
1094
1095	ts = td->td_sched;
1096	sum = ts->skg_runtime + ts->skg_slptime;
1097	if (sum < SCHED_SLP_RUN_MAX)
1098		return;
1099	/*
1100	 * This only happens from two places:
1101	 * 1) We have added an unusual amount of run time from fork_exit.
1102	 * 2) We have added an unusual amount of sleep time from sched_sleep().
1103	 */
1104	if (sum > SCHED_SLP_RUN_MAX * 2) {
1105		if (ts->skg_runtime > ts->skg_slptime) {
1106			ts->skg_runtime = SCHED_SLP_RUN_MAX;
1107			ts->skg_slptime = 1;
1108		} else {
1109			ts->skg_slptime = SCHED_SLP_RUN_MAX;
1110			ts->skg_runtime = 1;
1111		}
1112		return;
1113	}
1114	/*
1115	 * If we have exceeded by more than 1/5th then the algorithm below
1116	 * will not bring us back into range.  Dividing by two here forces
1117	 * us into the range of [4/5 * SCHED_SLP_RUN_MAX, SCHED_SLP_RUN_MAX]
1118	 */
1119	if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) {
1120		ts->skg_runtime /= 2;
1121		ts->skg_slptime /= 2;
1122		return;
1123	}
1124	ts->skg_runtime = (ts->skg_runtime / 5) * 4;
1125	ts->skg_slptime = (ts->skg_slptime / 5) * 4;
1126}
1127
1128static void
1129sched_interact_fork(struct thread *td)
1130{
1131	int ratio;
1132	int sum;
1133
1134	sum = td->td_sched->skg_runtime + td->td_sched->skg_slptime;
1135	if (sum > SCHED_SLP_RUN_FORK) {
1136		ratio = sum / SCHED_SLP_RUN_FORK;
1137		td->td_sched->skg_runtime /= ratio;
1138		td->td_sched->skg_slptime /= ratio;
1139	}
1140}
1141
1142static int
1143sched_interact_score(struct thread *td)
1144{
1145	int div;
1146
1147	if (td->td_sched->skg_runtime > td->td_sched->skg_slptime) {
1148		div = max(1, td->td_sched->skg_runtime / SCHED_INTERACT_HALF);
1149		return (SCHED_INTERACT_HALF +
1150		    (SCHED_INTERACT_HALF - (td->td_sched->skg_slptime / div)));
1151	} if (td->td_sched->skg_slptime > td->td_sched->skg_runtime) {
1152		div = max(1, td->td_sched->skg_slptime / SCHED_INTERACT_HALF);
1153		return (td->td_sched->skg_runtime / div);
1154	}
1155
1156	/*
1157	 * This can happen if slptime and runtime are 0.
1158	 */
1159	return (0);
1160
1161}
1162
1163/*
1164 * Called from proc0_init() to bootstrap the scheduler.
1165 */
1166void
1167schedinit(void)
1168{
1169
1170	/*
1171	 * Set up the scheduler specific parts of proc0.
1172	 */
1173	proc0.p_sched = NULL; /* XXX */
1174	thread0.td_sched = &td_sched0;
1175	td_sched0.ts_ltick = ticks;
1176	td_sched0.ts_ftick = ticks;
1177	td_sched0.ts_thread = &thread0;
1178	td_sched0.ts_state = TSS_THREAD;
1179}
1180
1181/*
1182 * This is only somewhat accurate since, given many processes of the same
1183 * priority, they will switch when their slices run out, which will be
1184 * at most sched_slice stathz ticks.
1185 */
1186int
1187sched_rr_interval(void)
1188{
1189
1190	/* Convert sched_slice to hz */
1191	return (hz/(realstathz/sched_slice));
1192}
1193
1194static void
1195sched_pctcpu_update(struct td_sched *ts)
1196{
1197
1198	if (ts->ts_ticks == 0)
1199		return;
1200	if (ticks - (hz / 10) < ts->ts_ltick &&
1201	    SCHED_TICK_TOTAL(ts) < SCHED_TICK_MAX)
1202		return;
1203	/*
1204	 * Adjust counters and watermark for pctcpu calc.
1205	 */
1206	if (ts->ts_ltick > ticks - SCHED_TICK_TARG)
1207		ts->ts_ticks = (ts->ts_ticks / (ticks - ts->ts_ftick)) *
1208			    SCHED_TICK_TARG;
1209	else
1210		ts->ts_ticks = 0;
1211	ts->ts_ltick = ticks;
1212	ts->ts_ftick = ts->ts_ltick - SCHED_TICK_TARG;
1213}
1214
1215static void
1216sched_thread_priority(struct thread *td, u_char prio)
1217{
1218	struct td_sched *ts;
1219
1220	CTR6(KTR_SCHED, "sched_prio: %p(%s) prio %d newprio %d by %p(%s)",
1221	    td, td->td_proc->p_comm, td->td_priority, prio, curthread,
1222	    curthread->td_proc->p_comm);
1223	ts = td->td_sched;
1224	mtx_assert(&sched_lock, MA_OWNED);
1225	if (td->td_priority == prio)
1226		return;
1227
1228	if (TD_ON_RUNQ(td) && prio < td->td_priority) {
1229		/*
1230		 * If the priority has been elevated due to priority
1231		 * propagation, we may have to move ourselves to a new
1232		 * queue.  This could be optimized to not re-add in some
1233		 * cases.
1234		 *
1235		 * Hold this td_sched on this cpu so that sched_prio() doesn't
1236		 * cause excessive migration.  We only want migration to
1237		 * happen as the result of a wakeup.
1238		 */
1239		sched_pin_td(td);
1240		sched_rem(td);
1241		td->td_priority = prio;
1242		sched_add(td, SRQ_BORROWING);
1243		sched_unpin_td(td);
1244	} else
1245		td->td_priority = prio;
1246}
1247
1248/*
1249 * Update a thread's priority when it is lent another thread's
1250 * priority.
1251 */
1252void
1253sched_lend_prio(struct thread *td, u_char prio)
1254{
1255
1256	td->td_flags |= TDF_BORROWING;
1257	sched_thread_priority(td, prio);
1258}
1259
1260/*
1261 * Restore a thread's priority when priority propagation is
1262 * over.  The prio argument is the minimum priority the thread
1263 * needs to have to satisfy other possible priority lending
1264 * requests.  If the thread's regular priority is less
1265 * important than prio, the thread will keep a priority boost
1266 * of prio.
1267 */
1268void
1269sched_unlend_prio(struct thread *td, u_char prio)
1270{
1271	u_char base_pri;
1272
1273	if (td->td_base_pri >= PRI_MIN_TIMESHARE &&
1274	    td->td_base_pri <= PRI_MAX_TIMESHARE)
1275		base_pri = td->td_user_pri;
1276	else
1277		base_pri = td->td_base_pri;
1278	if (prio >= base_pri) {
1279		td->td_flags &= ~TDF_BORROWING;
1280		sched_thread_priority(td, base_pri);
1281	} else
1282		sched_lend_prio(td, prio);
1283}
1284
1285void
1286sched_prio(struct thread *td, u_char prio)
1287{
1288	u_char oldprio;
1289
1290	/* First, update the base priority. */
1291	td->td_base_pri = prio;
1292
1293	/*
1294	 * If the thread is borrowing another thread's priority, don't
1295	 * ever lower the priority.
1296	 */
1297	if (td->td_flags & TDF_BORROWING && td->td_priority < prio)
1298		return;
1299
1300	/* Change the real priority. */
1301	oldprio = td->td_priority;
1302	sched_thread_priority(td, prio);
1303
1304	/*
1305	 * If the thread is on a turnstile, then let the turnstile update
1306	 * its state.
1307	 */
1308	if (TD_ON_LOCK(td) && oldprio != prio)
1309		turnstile_adjust(td, oldprio);
1310}
1311
1312void
1313sched_user_prio(struct thread *td, u_char prio)
1314{
1315	u_char oldprio;
1316
1317	td->td_base_user_pri = prio;
1318	if (td->td_flags & TDF_UBORROWING && td->td_user_pri <= prio)
1319                return;
1320	oldprio = td->td_user_pri;
1321	td->td_user_pri = prio;
1322
1323	if (TD_ON_UPILOCK(td) && oldprio != prio)
1324		umtx_pi_adjust(td, oldprio);
1325}
1326
1327void
1328sched_lend_user_prio(struct thread *td, u_char prio)
1329{
1330	u_char oldprio;
1331
1332	td->td_flags |= TDF_UBORROWING;
1333
1334	oldprio = td->td_user_pri;
1335	td->td_user_pri = prio;
1336
1337	if (TD_ON_UPILOCK(td) && oldprio != prio)
1338		umtx_pi_adjust(td, oldprio);
1339}
1340
1341void
1342sched_unlend_user_prio(struct thread *td, u_char prio)
1343{
1344	u_char base_pri;
1345
1346	base_pri = td->td_base_user_pri;
1347	if (prio >= base_pri) {
1348		td->td_flags &= ~TDF_UBORROWING;
1349		sched_user_prio(td, base_pri);
1350	} else
1351		sched_lend_user_prio(td, prio);
1352}
1353
1354void
1355sched_switch(struct thread *td, struct thread *newtd, int flags)
1356{
1357	struct tdq *tdq;
1358	struct td_sched *ts;
1359
1360	mtx_assert(&sched_lock, MA_OWNED);
1361
1362	tdq = TDQ_SELF();
1363	ts = td->td_sched;
1364	td->td_lastcpu = td->td_oncpu;
1365	td->td_oncpu = NOCPU;
1366	td->td_flags &= ~TDF_NEEDRESCHED;
1367	td->td_owepreempt = 0;
1368	/*
1369	 * If the thread has been assigned it may be in the process of switching
1370	 * to the new cpu.  This is the case in sched_bind().
1371	 */
1372	if (td == PCPU_GET(idlethread)) {
1373		TD_SET_CAN_RUN(td);
1374	} else if ((ts->ts_flags & TSF_ASSIGNED) == 0) {
1375		/* We are ending our run so make our slot available again */
1376		tdq_load_rem(tdq, ts);
1377		if (TD_IS_RUNNING(td)) {
1378			/*
1379			 * Don't allow the thread to migrate
1380			 * from a preemption.
1381			 */
1382			sched_pin_td(td);
1383			setrunqueue(td, (flags & SW_PREEMPT) ?
1384			    SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
1385			    SRQ_OURSELF|SRQ_YIELDING);
1386			sched_unpin_td(td);
1387		}
1388	}
1389	if (newtd != NULL) {
1390		/*
1391		 * If we bring in a thread account for it as if it had been
1392		 * added to the run queue and then chosen.
1393		 */
1394		newtd->td_sched->ts_flags |= TSF_DIDRUN;
1395		TD_SET_RUNNING(newtd);
1396		tdq_load_add(TDQ_SELF(), newtd->td_sched);
1397	} else
1398		newtd = choosethread();
1399	if (td != newtd) {
1400#ifdef	HWPMC_HOOKS
1401		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
1402			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
1403#endif
1404
1405		cpu_switch(td, newtd);
1406#ifdef	HWPMC_HOOKS
1407		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
1408			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
1409#endif
1410	}
1411	sched_lock.mtx_lock = (uintptr_t)td;
1412	td->td_oncpu = PCPU_GET(cpuid);
1413}
1414
1415void
1416sched_nice(struct proc *p, int nice)
1417{
1418	struct thread *td;
1419
1420	PROC_LOCK_ASSERT(p, MA_OWNED);
1421	mtx_assert(&sched_lock, MA_OWNED);
1422
1423	p->p_nice = nice;
1424	FOREACH_THREAD_IN_PROC(p, td) {
1425		sched_priority(td);
1426		sched_prio(td, td->td_base_user_pri);
1427	}
1428}
1429
1430void
1431sched_sleep(struct thread *td)
1432{
1433
1434	mtx_assert(&sched_lock, MA_OWNED);
1435
1436	td->td_sched->ts_slptime = ticks;
1437}
1438
1439void
1440sched_wakeup(struct thread *td)
1441{
1442	int slptime;
1443
1444	mtx_assert(&sched_lock, MA_OWNED);
1445
1446	/*
1447	 * If we slept for more than a tick update our interactivity and
1448	 * priority.
1449	 */
1450	slptime = td->td_sched->ts_slptime;
1451	td->td_sched->ts_slptime = 0;
1452	if (slptime && slptime != ticks) {
1453		int hzticks;
1454
1455		hzticks = (ticks - slptime) << SCHED_TICK_SHIFT;
1456		td->td_sched->skg_slptime += hzticks;
1457		sched_interact_update(td);
1458		sched_pctcpu_update(td->td_sched);
1459		sched_priority(td);
1460	}
1461	setrunqueue(td, SRQ_BORING);
1462}
1463
1464/*
1465 * Penalize the parent for creating a new child and initialize the child's
1466 * priority.
1467 */
1468void
1469sched_fork(struct thread *td, struct thread *child)
1470{
1471	mtx_assert(&sched_lock, MA_OWNED);
1472	sched_fork_thread(td, child);
1473	/*
1474	 * Penalize the parent and child for forking.
1475	 */
1476	sched_interact_fork(child);
1477	sched_priority(child);
1478	td->td_sched->skg_runtime += tickincr;
1479	sched_interact_update(td);
1480	sched_priority(td);
1481}
1482
1483void
1484sched_fork_thread(struct thread *td, struct thread *child)
1485{
1486	struct td_sched *ts;
1487	struct td_sched *ts2;
1488
1489	/*
1490	 * Initialize child.
1491	 */
1492	sched_newthread(child);
1493	ts = td->td_sched;
1494	ts2 = child->td_sched;
1495	ts2->ts_cpu = ts->ts_cpu;
1496	ts2->ts_runq = NULL;
1497	/*
1498	 * Grab our parents cpu estimation information and priority.
1499	 */
1500	ts2->ts_ticks = ts->ts_ticks;
1501	ts2->ts_ltick = ts->ts_ltick;
1502	ts2->ts_ftick = ts->ts_ftick;
1503	child->td_user_pri = td->td_user_pri;
1504	child->td_base_user_pri = td->td_base_user_pri;
1505	/*
1506	 * And update interactivity score.
1507	 */
1508	ts2->skg_slptime = ts->skg_slptime;
1509	ts2->skg_runtime = ts->skg_runtime;
1510	ts2->ts_slice = 1;	/* Attempt to quickly learn interactivity. */
1511}
1512
1513void
1514sched_class(struct thread *td, int class)
1515{
1516
1517	mtx_assert(&sched_lock, MA_OWNED);
1518	if (td->td_pri_class == class)
1519		return;
1520
1521#ifdef SMP
1522	/*
1523	 * On SMP if we're on the RUNQ we must adjust the transferable
1524	 * count because we could be changing to or from an interrupt
1525	 * class.
1526	 */
1527	if (td->td_sched->ts_state == TSS_ONRUNQ) {
1528		struct tdq *tdq;
1529
1530		tdq = TDQ_CPU(td->td_sched->ts_cpu);
1531		if (THREAD_CAN_MIGRATE(td)) {
1532			tdq->tdq_transferable--;
1533			tdq->tdq_group->tdg_transferable--;
1534		}
1535		td->td_pri_class = class;
1536		if (THREAD_CAN_MIGRATE(td)) {
1537			tdq->tdq_transferable++;
1538			tdq->tdq_group->tdg_transferable++;
1539		}
1540	}
1541#endif
1542	td->td_pri_class = class;
1543}
1544
1545/*
1546 * Return some of the child's priority and interactivity to the parent.
1547 */
1548void
1549sched_exit(struct proc *p, struct thread *child)
1550{
1551	struct thread *td;
1552
1553	CTR3(KTR_SCHED, "sched_exit: %p(%s) prio %d",
1554	    child, child->td_proc->p_comm, child->td_priority);
1555
1556	td = FIRST_THREAD_IN_PROC(p);
1557	sched_exit_thread(td, child);
1558}
1559
1560void
1561sched_exit_thread(struct thread *td, struct thread *child)
1562{
1563
1564	CTR3(KTR_SCHED, "sched_exit_thread: %p(%s) prio %d",
1565	    child, child->td_proc->p_comm, child->td_priority);
1566
1567	tdq_load_rem(TDQ_CPU(child->td_sched->ts_cpu), child->td_sched);
1568#ifdef KSE
1569	/*
1570	 * KSE forks and exits so often that this penalty causes short-lived
1571	 * threads to always be non-interactive.  This causes mozilla to
1572	 * crawl under load.
1573	 */
1574	if ((td->td_pflags & TDP_SA) && td->td_proc == child->td_proc)
1575		return;
1576#endif
1577	/*
1578	 * Give the child's runtime to the parent without returning the
1579	 * sleep time as a penalty to the parent.  This causes shells that
1580	 * launch expensive things to mark their children as expensive.
1581	 */
1582	td->td_sched->skg_runtime += child->td_sched->skg_runtime;
1583	sched_interact_update(td);
1584	sched_priority(td);
1585}
1586
1587void
1588sched_userret(struct thread *td)
1589{
1590	/*
1591	 * XXX we cheat slightly on the locking here to avoid locking in
1592	 * the usual case.  Setting td_priority here is essentially an
1593	 * incomplete workaround for not setting it properly elsewhere.
1594	 * Now that some interrupt handlers are threads, not setting it
1595	 * properly elsewhere can clobber it in the window between setting
1596	 * it here and returning to user mode, so don't waste time setting
1597	 * it perfectly here.
1598	 */
1599	KASSERT((td->td_flags & TDF_BORROWING) == 0,
1600	    ("thread with borrowed priority returning to userland"));
1601	if (td->td_priority != td->td_user_pri) {
1602		mtx_lock_spin(&sched_lock);
1603		td->td_priority = td->td_user_pri;
1604		td->td_base_pri = td->td_user_pri;
1605		mtx_unlock_spin(&sched_lock);
1606        }
1607}
1608
1609void
1610sched_clock(struct thread *td)
1611{
1612	struct tdq *tdq;
1613	struct td_sched *ts;
1614
1615	mtx_assert(&sched_lock, MA_OWNED);
1616#ifdef SMP
1617	sched_smp_tick();
1618#endif
1619	tdq = TDQ_SELF();
1620	/*
1621	 * Advance the insert index once for each tick to ensure that all
1622	 * threads get a chance to run.
1623	 */
1624	if (tdq->tdq_idx == tdq->tdq_ridx) {
1625		tdq->tdq_idx = (tdq->tdq_idx + 1) % RQ_NQS;
1626		if (TAILQ_EMPTY(&tdq->tdq_timeshare.rq_queues[tdq->tdq_ridx]))
1627			tdq->tdq_ridx = tdq->tdq_idx;
1628	}
1629	/* Adjust ticks for pctcpu */
1630	ts = td->td_sched;
1631	ts->ts_ticks += tickincr;
1632	ts->ts_ltick = ticks;
1633	/*
1634	 * Update if we've exceeded our desired tick threshold by over one
1635	 * second.
1636	 */
1637	if (ts->ts_ftick + SCHED_TICK_MAX < ts->ts_ltick)
1638		sched_pctcpu_update(ts);
1639	/*
1640	 * We only do slicing code for TIMESHARE threads.
1641	 */
1642	if (td->td_pri_class != PRI_TIMESHARE)
1643		return;
1644	/*
1645	 * We used a tick; charge it to the thread so that we can compute our
1646	 * interactivity.
1647	 */
1648	td->td_sched->skg_runtime += tickincr;
1649	sched_interact_update(td);
1650	/*
1651	 * We used up one time slice.
1652	 */
1653	if (--ts->ts_slice > 0)
1654		return;
1655	/*
1656	 * We're out of time, recompute priorities and requeue.
1657	 */
1658	sched_priority(td);
1659	tdq_load_rem(tdq, ts);
1660	ts->ts_slice = sched_slice;
1661	tdq_load_add(tdq, ts);
1662	td->td_flags |= TDF_NEEDRESCHED;
1663}
1664
1665int
1666sched_runnable(void)
1667{
1668	struct tdq *tdq;
1669	int load;
1670
1671	load = 1;
1672
1673	tdq = TDQ_SELF();
1674#ifdef SMP
1675	if (tdq->tdq_assigned) {
1676		mtx_lock_spin(&sched_lock);
1677		tdq_assign(tdq);
1678		mtx_unlock_spin(&sched_lock);
1679	}
1680#endif
1681	if ((curthread->td_flags & TDF_IDLETD) != 0) {
1682		if (tdq->tdq_load > 0)
1683			goto out;
1684	} else
1685		if (tdq->tdq_load - 1 > 0)
1686			goto out;
1687	load = 0;
1688out:
1689	return (load);
1690}
1691
1692struct td_sched *
1693sched_choose(void)
1694{
1695	struct tdq *tdq;
1696	struct td_sched *ts;
1697
1698	mtx_assert(&sched_lock, MA_OWNED);
1699	tdq = TDQ_SELF();
1700#ifdef SMP
1701restart:
1702	if (tdq->tdq_assigned)
1703		tdq_assign(tdq);
1704#endif
1705	ts = tdq_choose(tdq);
1706	if (ts) {
1707#ifdef SMP
1708		if (ts->ts_thread->td_priority > PRI_MIN_IDLE)
1709			if (tdq_idled(tdq) == 0)
1710				goto restart;
1711#endif
1712		tdq_runq_rem(tdq, ts);
1713		ts->ts_state = TSS_THREAD;
1714		return (ts);
1715	}
1716#ifdef SMP
1717	if (tdq_idled(tdq) == 0)
1718		goto restart;
1719#endif
1720	return (NULL);
1721}
1722
1723void
1724sched_add(struct thread *td, int flags)
1725{
1726	struct tdq *tdq;
1727	struct td_sched *ts;
1728	int preemptive;
1729	int canmigrate;
1730	int class;
1731
1732	CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)",
1733	    td, td->td_proc->p_comm, td->td_priority, curthread,
1734	    curthread->td_proc->p_comm);
1735	mtx_assert(&sched_lock, MA_OWNED);
1736	tdq = TDQ_SELF();
1737	ts = td->td_sched;
1738	class = PRI_BASE(td->td_pri_class);
1739	preemptive = !(flags & SRQ_YIELDING);
1740	canmigrate = 1;
1741#ifdef SMP
1742	if (ts->ts_flags & TSF_ASSIGNED) {
1743		if (ts->ts_flags & TSF_REMOVED)
1744			ts->ts_flags &= ~TSF_REMOVED;
1745		return;
1746	}
1747	canmigrate = THREAD_CAN_MIGRATE(td);
1748#endif
1749	KASSERT(ts->ts_state != TSS_ONRUNQ,
1750	    ("sched_add: thread %p (%s) already in run queue", td,
1751	    td->td_proc->p_comm));
1752	KASSERT(td->td_proc->p_sflag & PS_INMEM,
1753	    ("sched_add: process swapped out"));
1754	KASSERT(ts->ts_runq == NULL,
1755	    ("sched_add: thread %p is still assigned to a run queue", td));
1756	/*
1757	 * Set the slice and pick the run queue.
1758	 */
1759	if (ts->ts_slice == 0)
1760		ts->ts_slice = sched_slice;
1761	if (class == PRI_TIMESHARE)
1762		sched_priority(td);
1763	if (td->td_priority <= PRI_MAX_REALTIME) {
1764		ts->ts_runq = &tdq->tdq_realtime;
1765		/*
1766		 * If the thread is not artificially pinned and it's in
1767		 * the realtime queue we directly dispatch it on this cpu
1768		 * for minimum latency.  Interrupt handlers may also have
1769		 * to complete on the cpu that dispatched them.
1770		 */
1771		if (td->td_pinned == 0 && class == PRI_ITHD)
1772			ts->ts_cpu = PCPU_GET(cpuid);
1773	} else if (td->td_priority <= PRI_MAX_TIMESHARE)
1774		ts->ts_runq = &tdq->tdq_timeshare;
1775	else
1776		ts->ts_runq = &tdq->tdq_idle;
1777
1778#ifdef SMP
1779	/*
1780	 * If this thread is pinned or bound, notify the target cpu.
1781	 */
1782	if (!canmigrate && ts->ts_cpu != PCPU_GET(cpuid) ) {
1783		ts->ts_runq = NULL;
1784		tdq_notify(ts, ts->ts_cpu);
1785		return;
1786	}
1787	/*
1788	 * If we had been idle, clear our bit in the group and potentially
1789	 * the global bitmap.  If not, see if we should transfer this thread.
1790	 */
1791	if ((class != PRI_IDLE && class != PRI_ITHD) &&
1792	    (tdq->tdq_group->tdg_idlemask & PCPU_GET(cpumask)) != 0) {
1793		/*
1794		 * Check to see if our group is unidling, and if so, remove it
1795		 * from the global idle mask.
1796		 */
1797		if (tdq->tdq_group->tdg_idlemask ==
1798		    tdq->tdq_group->tdg_cpumask)
1799			atomic_clear_int(&tdq_idle, tdq->tdq_group->tdg_mask);
1800		/*
1801		 * Now remove ourselves from the group specific idle mask.
1802		 */
1803		tdq->tdq_group->tdg_idlemask &= ~PCPU_GET(cpumask);
1804	} else if (canmigrate && tdq->tdq_load > 1)
1805		if (tdq_transfer(tdq, ts, class))
1806			return;
1807	ts->ts_cpu = PCPU_GET(cpuid);
1808#endif
1809	if (td->td_priority < curthread->td_priority)
1810		curthread->td_flags |= TDF_NEEDRESCHED;
1811	if (preemptive && maybe_preempt(td))
1812		return;
1813	ts->ts_state = TSS_ONRUNQ;
1814
1815	tdq_runq_add(tdq, ts, flags);
1816	tdq_load_add(tdq, ts);
1817}
1818
1819void
1820sched_rem(struct thread *td)
1821{
1822	struct tdq *tdq;
1823	struct td_sched *ts;
1824
1825	CTR5(KTR_SCHED, "sched_rem: %p(%s) prio %d by %p(%s)",
1826	    td, td->td_proc->p_comm, td->td_priority, curthread,
1827	    curthread->td_proc->p_comm);
1828	mtx_assert(&sched_lock, MA_OWNED);
1829	ts = td->td_sched;
1830	if (ts->ts_flags & TSF_ASSIGNED) {
1831		ts->ts_flags |= TSF_REMOVED;
1832		return;
1833	}
1834	KASSERT((ts->ts_state == TSS_ONRUNQ),
1835	    ("sched_rem: thread not on run queue"));
1836
1837	ts->ts_state = TSS_THREAD;
1838	tdq = TDQ_CPU(ts->ts_cpu);
1839	tdq_runq_rem(tdq, ts);
1840	tdq_load_rem(tdq, ts);
1841}
1842
1843fixpt_t
1844sched_pctcpu(struct thread *td)
1845{
1846	fixpt_t pctcpu;
1847	struct td_sched *ts;
1848
1849	pctcpu = 0;
1850	ts = td->td_sched;
1851	if (ts == NULL)
1852		return (0);
1853
1854	mtx_lock_spin(&sched_lock);
1855	if (ts->ts_ticks) {
1856		int rtick;
1857
1858		sched_pctcpu_update(ts);
1859		/* How many rticks per second? */
1860		rtick = min(SCHED_TICK_HZ(ts) / SCHED_TICK_SECS, hz);
1861		pctcpu = (FSCALE * ((FSCALE * rtick)/hz)) >> FSHIFT;
1862	}
1863	td->td_proc->p_swtime = ts->ts_ltick - ts->ts_ftick;
1864	mtx_unlock_spin(&sched_lock);
1865
1866	return (pctcpu);
1867}
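
/*
 * A minimal standalone sketch of the fixed-point %cpu computation in
 * sched_pctcpu() above: rtick is the number of hz ticks run per second
 * of the averaging window, and pctcpu expresses rtick/hz in FSCALE fixed
 * point.  The FSHIFT, hz and window values below are assumptions.
 */
#if 0	/* standalone userland sketch; not built with the kernel */
#include <stdio.h>

#define	SK_FSHIFT	11		/* assumed FSHIFT */
#define	SK_FSCALE	(1 << SK_FSHIFT)
#define	SK_HZ		1000		/* assumed hz */
#define	SK_TICK_SECS	10		/* mirrors SCHED_TICK_SECS */

static int
pctcpu(int hz_ticks_in_window)
{
	int rtick;

	rtick = hz_ticks_in_window / SK_TICK_SECS;
	if (rtick > SK_HZ)
		rtick = SK_HZ;
	return ((SK_FSCALE * ((SK_FSCALE * rtick) / SK_HZ)) >> SK_FSHIFT);
}

int
main(void)
{
	/* About 30% of a cpu: 3000 hz ticks over a 10 second window. */
	printf("pctcpu = %d/%d\n", pctcpu(3000), SK_FSCALE);
	return (0);
}
#endif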
1868
1869void
1870sched_bind(struct thread *td, int cpu)
1871{
1872	struct td_sched *ts;
1873
1874	mtx_assert(&sched_lock, MA_OWNED);
1875	ts = td->td_sched;
1876	KASSERT((ts->ts_flags & TSF_BOUND) == 0,
1877	    ("sched_bind: thread %p already bound.", td));
1878	ts->ts_flags |= TSF_BOUND;
1879#ifdef SMP
1880	if (PCPU_GET(cpuid) == cpu)
1881		return;
1882	/* sched_rem without the runq_remove */
1883	ts->ts_state = TSS_THREAD;
1884	tdq_load_rem(TDQ_CPU(ts->ts_cpu), ts);
1885	tdq_notify(ts, cpu);
1886	/* When we return from mi_switch we'll be on the correct cpu. */
1887	mi_switch(SW_VOL, NULL);
1888	sched_pin();
1889#endif
1890}
1891
1892void
1893sched_unbind(struct thread *td)
1894{
1895	struct td_sched *ts;
1896
1897	mtx_assert(&sched_lock, MA_OWNED);
1898	ts = td->td_sched;
1899	KASSERT(ts->ts_flags & TSF_BOUND,
1900	    ("sched_unbind: thread %p not bound.", td));
1901	mtx_assert(&sched_lock, MA_OWNED);
1902	ts->ts_flags &= ~TSF_BOUND;
1903#ifdef SMP
1904	sched_unpin();
1905#endif
1906}
1907
1908int
1909sched_is_bound(struct thread *td)
1910{
1911	mtx_assert(&sched_lock, MA_OWNED);
1912	return (td->td_sched->ts_flags & TSF_BOUND);
1913}
1914
1915void
1916sched_relinquish(struct thread *td)
1917{
1918	mtx_lock_spin(&sched_lock);
1919	if (td->td_pri_class == PRI_TIMESHARE)
1920		sched_prio(td, PRI_MAX_TIMESHARE);
1921	mi_switch(SW_VOL, NULL);
1922	mtx_unlock_spin(&sched_lock);
1923}
1924
1925int
1926sched_load(void)
1927{
1928#ifdef SMP
1929	int total;
1930	int i;
1931
1932	total = 0;
1933	for (i = 0; i <= tdg_maxid; i++)
1934		total += TDQ_GROUP(i)->tdg_load;
1935	return (total);
1936#else
1937	return (TDQ_SELF()->tdq_sysload);
1938#endif
1939}
1940
1941int
1942sched_sizeof_proc(void)
1943{
1944	return (sizeof(struct proc));
1945}
1946
1947int
1948sched_sizeof_thread(void)
1949{
1950	return (sizeof(struct thread) + sizeof(struct td_sched));
1951}
1952
1953void
1954sched_tick(void)
1955{
1956}
1957
1958static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "Scheduler");
1959SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ule", 0,
1960    "Scheduler name");
1961SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0, "");
1962SYSCTL_INT(_kern_sched, OID_AUTO, interact, CTLFLAG_RW, &sched_interact, 0, "");
1963SYSCTL_INT(_kern_sched, OID_AUTO, tickincr, CTLFLAG_RD, &tickincr, 0, "");
1964SYSCTL_INT(_kern_sched, OID_AUTO, realstathz, CTLFLAG_RD, &realstathz, 0, "");
1965SYSCTL_INT(_kern_sched, OID_AUTO, balance, CTLFLAG_RW, &sched_rebalance, 0, "");
1966
1967/* ps compat */
1968static fixpt_t  ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
1969SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
1970
1971
1972#define KERN_SWITCH_INCLUDE 1
1973#include "kern/kern_switch.c"
1974