sched_ule.c revision 165766
1/*-
2 * Copyright (c) 2002-2007, Jeffrey Roberson <jeff@freebsd.org>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice unmodified, this list of conditions, and the following
10 *    disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
19 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27#include <sys/cdefs.h>
28__FBSDID("$FreeBSD: head/sys/kern/sched_ule.c 165766 2007-01-04 12:16:19Z jeff $");
29
30#include "opt_hwpmc_hooks.h"
31#include "opt_sched.h"
32
33#include <sys/param.h>
34#include <sys/systm.h>
35#include <sys/kdb.h>
36#include <sys/kernel.h>
37#include <sys/ktr.h>
38#include <sys/lock.h>
39#include <sys/mutex.h>
40#include <sys/proc.h>
41#include <sys/resource.h>
42#include <sys/resourcevar.h>
43#include <sys/sched.h>
44#include <sys/smp.h>
45#include <sys/sx.h>
46#include <sys/sysctl.h>
47#include <sys/sysproto.h>
48#include <sys/turnstile.h>
49#include <sys/umtx.h>
50#include <sys/vmmeter.h>
51#ifdef KTRACE
52#include <sys/uio.h>
53#include <sys/ktrace.h>
54#endif
55
56#ifdef HWPMC_HOOKS
57#include <sys/pmckern.h>
58#endif
59
60#include <machine/cpu.h>
61#include <machine/smp.h>
62
63/*
64 * Thread scheduler specific section.
65 */
66struct td_sched {
67	TAILQ_ENTRY(td_sched) ts_procq;	/* (j/z) Run queue. */
68	int		ts_flags;	/* (j) TSF_* flags. */
69	struct thread	*ts_thread;	/* (*) Active associated thread. */
70	fixpt_t		ts_pctcpu;	/* (j) %cpu during p_swtime. */
71	u_char		ts_rqindex;	/* (j) Run queue index. */
72	enum {
73		TSS_THREAD,
74		TSS_ONRUNQ
75	} ts_state;			/* (j) thread sched specific status. */
76	int		ts_slptime;
77	int		ts_slice;
78	struct runq	*ts_runq;
79	u_char		ts_cpu;		/* CPU that we have affinity for. */
80	/* The following variables are only used for pctcpu calculation */
81	int		ts_ltick;	/* Last tick that we were running on */
82	int		ts_ftick;	/* First tick that we were running on */
83	int		ts_ticks;	/* Tick count */
84
85	/* originally from kg_sched */
86	int	skg_slptime;		/* Number of ticks we vol. slept */
87	int	skg_runtime;		/* Number of ticks we were running */
88};
89#define	ts_assign		ts_procq.tqe_next
90/* flags kept in ts_flags */
91#define	TSF_ASSIGNED	0x0001		/* Thread is being migrated. */
92#define	TSF_BOUND	0x0002		/* Thread can not migrate. */
93#define	TSF_XFERABLE	0x0004		/* Thread was added as transferable. */
94#define	TSF_HOLD	0x0008		/* Thread is temporarily bound. */
95#define	TSF_REMOVED	0x0010		/* Thread was removed while ASSIGNED */
96#define	TSF_INTERNAL	0x0020		/* Thread added due to migration. */
97#define	TSF_DIDRUN	0x2000		/* Thread actually ran. */
98#define	TSF_EXIT	0x4000		/* Thread is being killed. */
99
100static struct td_sched td_sched0;
101
102/*
103 * Cpu percentage computation macros and defines.
104 *
105 * SCHED_TICK_SECS:	Number of seconds to average the cpu usage across.
106 * SCHED_TICK_TARG:	Number of hz ticks to average the cpu usage across.
107 * SCHED_TICK_SHIFT:	Shift factor to avoid rounding away results.
108 * SCHED_TICK_HZ:	Compute the number of hz ticks for a given ticks count.
109 * SCHED_TICK_TOTAL:	Gives the amount of time we've been recording ticks.
110 */
111#define	SCHED_TICK_SECS		10
112#define	SCHED_TICK_TARG		(hz * SCHED_TICK_SECS)
113#define	SCHED_TICK_SHIFT	10
114#define	SCHED_TICK_HZ(ts)	((ts)->ts_ticks >> SCHED_TICK_SHIFT)
115#define	SCHED_TICK_TOTAL(ts)	((ts)->ts_ltick - (ts)->ts_ftick)
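
/*
 * Worked example, assuming hz = 1000 (the value is platform dependent):
 * the history window SCHED_TICK_TARG is 10000 hz ticks (10 seconds).  A
 * thread that accumulated ts_ticks of roughly 2000 << SCHED_TICK_SHIFT
 * over a full window has SCHED_TICK_HZ(ts) ~= 2000 and
 * SCHED_TICK_TOTAL(ts) ~= 10000, i.e. about 20% cpu over the window.
 */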
116
117/*
118 * These macros determine priorities for non-interactive threads.  They are
119 * assigned a priority based on their recent cpu utilization as expressed
120 * by the ratio of ticks to the tick total.  NHALF priorities at the start
121 * and end of the MIN to MAX timeshare range are only reachable with negative
122 * or positive nice respectively.
123 *
124 * PRI_RANGE:	Priority range for utilization dependent priorities.
125 * PRI_NRESV:	Number of nice values.
126 * PRI_TICKS:	Compute a priority in PRI_RANGE from the ticks count and total.
127 * PRI_NICE:	Determines the part of the priority inherited from nice.
128 */
129#define	SCHED_PRI_NRESV		(PRIO_MAX - PRIO_MIN)
130#define	SCHED_PRI_NHALF		(SCHED_PRI_NRESV / 2)
131#define	SCHED_PRI_MIN		(PRI_MIN_TIMESHARE + SCHED_PRI_NHALF)
132#define	SCHED_PRI_MAX		(PRI_MAX_TIMESHARE - SCHED_PRI_NHALF)
133#define	SCHED_PRI_RANGE		(SCHED_PRI_MAX - SCHED_PRI_MIN + 1)
134#define	SCHED_PRI_TICKS(ts)						\
135    (SCHED_TICK_HZ((ts)) /						\
136    (max(SCHED_TICK_TOTAL((ts)), SCHED_PRI_RANGE) / SCHED_PRI_RANGE))
137#define	SCHED_PRI_NICE(nice)	(nice)
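
/*
 * Worked example, continuing the 20% cpu thread above: the divisor in
 * SCHED_PRI_TICKS is max(10000, SCHED_PRI_RANGE) / SCHED_PRI_RANGE, so
 * SCHED_PRI_TICKS(ts) ~= 2000 / (10000 / SCHED_PRI_RANGE), which lands
 * roughly 20% of the way into the utilization dependent range.
 * SCHED_PRI_NICE then adds the raw nice value (-20 to 20) on top.
 */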
138
139/*
140 * These determine the interactivity of a process.  Interactivity differs from
141 * cpu utilization in that it expresses the voluntary time slept vs time ran
142 * while cpu utilization includes all time not running.  This more accurately
143 * models the intent of the thread.
144 *
145 * SLP_RUN_MAX:	Maximum amount of sleep time + run time we'll accumulate
146 *		before throttling back.
147 * SLP_RUN_FORK:	Maximum slp+run time to inherit at fork time.
148 * INTERACT_MAX:	Maximum interactivity value.  Smaller is better.
149 * INTERACT_THRESH:	Threshold for placement on the current runq.
150 */
151#define	SCHED_SLP_RUN_MAX	((hz * 5) << SCHED_TICK_SHIFT)
152#define	SCHED_SLP_RUN_FORK	((hz / 2) << SCHED_TICK_SHIFT)
153#define	SCHED_INTERACT_MAX	(100)
154#define	SCHED_INTERACT_HALF	(SCHED_INTERACT_MAX / 2)
155#define	SCHED_INTERACT_THRESH	(30)
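
/*
 * Worked example, assuming hz = 1000: SCHED_SLP_RUN_MAX corresponds to 5
 * seconds of combined sleep + run history (stored shifted left by
 * SCHED_TICK_SHIFT) and SCHED_SLP_RUN_FORK to half a second.  A thread is
 * treated as interactive when its 0..100 score falls below
 * SCHED_INTERACT_THRESH (30 by default).
 */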
156
157/*
158 * tickincr:		Converts a stathz tick into a hz domain scaled by
159 *			the shift factor.  Without the shift the error rate
160 *			due to rounding would be unacceptably high.
161 * realstathz:		stathz is sometimes 0; we run off of hz in that case.
162 * sched_slice:		Runtime of each thread before rescheduling.
163 */
164static int sched_interact = SCHED_INTERACT_THRESH;
165static int realstathz;
166static int tickincr;
167static int sched_slice;
168
169/*
170 * tdq - per processor runqs and statistics.
171 */
172struct tdq {
173	struct runq	tdq_idle;		/* Queue of IDLE threads. */
174	struct runq	tdq_timeshare;		/* timeshare run queue. */
175	struct runq	tdq_realtime;		/* real-time run queue. */
176	int		tdq_idx;		/* Current insert index. */
177	int		tdq_ridx;		/* Current removal index. */
178	int		tdq_load_timeshare;	/* Load for timeshare. */
179	int		tdq_load;		/* Aggregate load. */
180#ifdef SMP
181	int		tdq_transferable;
182	LIST_ENTRY(tdq)	tdq_siblings;		/* Next in tdq group. */
183	struct tdq_group *tdq_group;		/* Our processor group. */
184	volatile struct td_sched *tdq_assigned;	/* assigned by another CPU. */
185#else
186	int		tdq_sysload;		/* For loadavg, !ITHD load. */
187#endif
188};
189
190#ifdef SMP
191/*
192 * tdq groups are groups of processors which can cheaply share threads.  When
193 * one processor in the group goes idle it will check the runqs of the other
194 * processors in its group prior to halting and waiting for an interrupt.
195 * These groups are suitable for SMT (Simultaneous Multi-Threading) and not NUMA.
196 * In a NUMA environment we'd want an idle bitmap per group and a two-tiered
197 * load balancer.
198 */
199struct tdq_group {
200	int	tdg_cpus;		/* Count of CPUs in this tdq group. */
201	cpumask_t tdg_cpumask;		/* Mask of cpus in this group. */
202	cpumask_t tdg_idlemask;		/* Idle cpus in this group. */
203	cpumask_t tdg_mask;		/* Bit mask for first cpu. */
204	int	tdg_load;		/* Total load of this group. */
205	int	tdg_transferable;	/* Transferable load of this group. */
206	LIST_HEAD(, tdq) tdg_members;	/* Linked list of all members. */
207};
208#endif
209
210/*
211 * One thread queue per processor.
212 */
213#ifdef SMP
214static cpumask_t tdq_idle;
215static int tdg_maxid;
216static struct tdq	tdq_cpu[MAXCPU];
217static struct tdq_group tdq_groups[MAXCPU];
218static int bal_tick;
219static int gbal_tick;
220static int balance_groups;
221
222#define	TDQ_SELF()	(&tdq_cpu[PCPU_GET(cpuid)])
223#define	TDQ_CPU(x)	(&tdq_cpu[(x)])
224#define	TDQ_ID(x)	((x) - tdq_cpu)
225#define	TDQ_GROUP(x)	(&tdq_groups[(x)])
226#else	/* !SMP */
227static struct tdq	tdq_cpu;
228
229#define	TDQ_SELF()	(&tdq_cpu)
230#define	TDQ_CPU(x)	(&tdq_cpu)
231#endif
232
233static struct td_sched *sched_choose(void);	/* XXX Should be thread * */
234static void sched_priority(struct thread *);
235static void sched_thread_priority(struct thread *, u_char);
236static int sched_interact_score(struct thread *);
237static void sched_interact_update(struct thread *);
238static void sched_interact_fork(struct thread *);
239static void sched_pctcpu_update(struct td_sched *);
240
241/* Operations on per processor queues */
242static struct td_sched * tdq_choose(struct tdq *);
243static void tdq_setup(struct tdq *);
244static void tdq_load_add(struct tdq *, struct td_sched *);
245static void tdq_load_rem(struct tdq *, struct td_sched *);
246static __inline void tdq_runq_add(struct tdq *, struct td_sched *, int);
247static __inline void tdq_runq_rem(struct tdq *, struct td_sched *);
248void tdq_print(int cpu);
249static void runq_print(struct runq *rq);
250#ifdef SMP
251static int tdq_transfer(struct tdq *, struct td_sched *, int);
252static struct td_sched *runq_steal(struct runq *);
253static void sched_balance(void);
254static void sched_balance_groups(void);
255static void sched_balance_group(struct tdq_group *);
256static void sched_balance_pair(struct tdq *, struct tdq *);
257static void sched_smp_tick(void);
258static void tdq_move(struct tdq *, int);
259static int tdq_idled(struct tdq *);
260static void tdq_notify(struct td_sched *, int);
261static void tdq_assign(struct tdq *);
262static struct td_sched *tdq_steal(struct tdq *, int);
263#define	THREAD_CAN_MIGRATE(td)						\
264    ((td)->td_pinned == 0 && (td)->td_pri_class != PRI_ITHD)
265#endif
266
267static void sched_setup(void *dummy);
268SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL)
269
270static void sched_initticks(void *dummy);
271SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks, NULL)
272
273static void
274runq_print(struct runq *rq)
275{
276	struct rqhead *rqh;
277	struct td_sched *ts;
278	int pri;
279	int j;
280	int i;
281
282	for (i = 0; i < RQB_LEN; i++) {
283		printf("\t\trunq bits %d 0x%zx\n",
284		    i, rq->rq_status.rqb_bits[i]);
285		for (j = 0; j < RQB_BPW; j++)
286			if (rq->rq_status.rqb_bits[i] & (1ul << j)) {
287				pri = j + (i << RQB_L2BPW);
288				rqh = &rq->rq_queues[pri];
289				TAILQ_FOREACH(ts, rqh, ts_procq) {
290					printf("\t\t\ttd %p(%s) priority %d rqindex %d pri %d\n",
291					    ts->ts_thread, ts->ts_thread->td_proc->p_comm, ts->ts_thread->td_priority, ts->ts_rqindex, pri);
292				}
293			}
294	}
295}
296
297void
298tdq_print(int cpu)
299{
300	struct tdq *tdq;
301
302	tdq = TDQ_CPU(cpu);
303
304	printf("tdq:\n");
305	printf("\tload:           %d\n", tdq->tdq_load);
306	printf("\tload TIMESHARE: %d\n", tdq->tdq_load_timeshare);
307	printf("\ttimeshare idx: %d\n", tdq->tdq_idx);
308	printf("\ttimeshare ridx: %d\n", tdq->tdq_ridx);
309	printf("\trealtime runq:\n");
310	runq_print(&tdq->tdq_realtime);
311	printf("\ttimeshare runq:\n");
312	runq_print(&tdq->tdq_timeshare);
313	printf("\tidle runq:\n");
314	runq_print(&tdq->tdq_idle);
315#ifdef SMP
316	printf("\tload transferable: %d\n", tdq->tdq_transferable);
317#endif
318}
319
320static __inline void
321tdq_runq_add(struct tdq *tdq, struct td_sched *ts, int flags)
322{
323#ifdef SMP
324	if (THREAD_CAN_MIGRATE(ts->ts_thread)) {
325		tdq->tdq_transferable++;
326		tdq->tdq_group->tdg_transferable++;
327		ts->ts_flags |= TSF_XFERABLE;
328	}
329#endif
330	if (ts->ts_runq == &tdq->tdq_timeshare) {
331		int pri;
332
333		pri = ts->ts_thread->td_priority;
334		KASSERT(pri <= PRI_MAX_TIMESHARE && pri >= PRI_MIN_TIMESHARE,
335			("Invalid priority %d on timeshare runq", pri));
336		/*
337		 * This queue contains only priorities between MIN and MAX
338		 * timeshare.  Use the whole queue to represent these values.
339		 */
340#define	TS_RQ_PPQ	(((PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE) + 1) / RQ_NQS)
341		if ((flags & SRQ_BORROWING) == 0) {
342			pri = (pri - PRI_MIN_TIMESHARE) / TS_RQ_PPQ;
343			pri = (pri + tdq->tdq_idx) % RQ_NQS;
344			/*
345			 * This effectively shortens the queue by one so we
346			 * can have a one slot difference between idx and
347			 * ridx while we wait for threads to drain.
348			 */
349			if (tdq->tdq_ridx != tdq->tdq_idx &&
350			    pri == tdq->tdq_ridx)
351				pri = (pri - 1) % RQ_NQS;
352		} else
353			pri = tdq->tdq_ridx;
354		runq_add_pri(ts->ts_runq, ts, pri, flags);
355	} else
356		runq_add(ts->ts_runq, ts, flags);
357}
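
/*
 * Worked example for the timeshare insertion above, assuming RQ_NQS == 64
 * and a 64 priority timeshare span (so TS_RQ_PPQ == 1); the exact
 * constants depend on <sys/runq.h> and <sys/priority.h>.  With
 * tdq_idx == 10, the best timeshare priority maps to queue 10 and a
 * priority 5 steps worse maps to queue 15.  If the computed slot would
 * collide with tdq_ridx while it is still draining, the thread is pushed
 * back one slot so the removal index can catch up.
 */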
358
359static __inline void
360tdq_runq_rem(struct tdq *tdq, struct td_sched *ts)
361{
362#ifdef SMP
363	if (ts->ts_flags & TSF_XFERABLE) {
364		tdq->tdq_transferable--;
365		tdq->tdq_group->tdg_transferable--;
366		ts->ts_flags &= ~TSF_XFERABLE;
367	}
368#endif
369	if (ts->ts_runq == &tdq->tdq_timeshare) {
370		if (tdq->tdq_idx != tdq->tdq_ridx)
371			runq_remove_idx(ts->ts_runq, ts, &tdq->tdq_ridx);
372		else
373			runq_remove_idx(ts->ts_runq, ts, NULL);
374	} else
375		runq_remove(ts->ts_runq, ts);
376}
377
378static void
379tdq_load_add(struct tdq *tdq, struct td_sched *ts)
380{
381	int class;
382	mtx_assert(&sched_lock, MA_OWNED);
383	class = PRI_BASE(ts->ts_thread->td_pri_class);
384	if (class == PRI_TIMESHARE)
385		tdq->tdq_load_timeshare++;
386	tdq->tdq_load++;
387	CTR1(KTR_SCHED, "load: %d", tdq->tdq_load);
388	if (class != PRI_ITHD && (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0)
389#ifdef SMP
390		tdq->tdq_group->tdg_load++;
391#else
392		tdq->tdq_sysload++;
393#endif
394}
395
396static void
397tdq_load_rem(struct tdq *tdq, struct td_sched *ts)
398{
399	int class;
400	mtx_assert(&sched_lock, MA_OWNED);
401	class = PRI_BASE(ts->ts_thread->td_pri_class);
402	if (class == PRI_TIMESHARE)
403		tdq->tdq_load_timeshare--;
404	if (class != PRI_ITHD  && (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0)
405#ifdef SMP
406		tdq->tdq_group->tdg_load--;
407#else
408		tdq->tdq_sysload--;
409#endif
410	tdq->tdq_load--;
411	CTR1(KTR_SCHED, "load: %d", tdq->tdq_load);
412	ts->ts_runq = NULL;
413}
414
415#ifdef SMP
416static void
417sched_smp_tick(void)
418{
419	struct tdq *tdq;
420
421	tdq = TDQ_SELF();
422	if (ticks >= bal_tick)
423		sched_balance();
424	if (ticks >= gbal_tick && balance_groups)
425		sched_balance_groups();
426	/*
427	 * We could have been assigned a non real-time thread without an
428	 * IPI.
429	 */
430	if (tdq->tdq_assigned)
431		tdq_assign(tdq);	/* Potentially sets NEEDRESCHED */
432}
433
434/*
435 * sched_balance is a simple CPU load balancing algorithm.  It operates by
436 * finding the least loaded and most loaded cpu and equalizing their load
437 * by migrating some processes.
438 *
439 * Dealing only with two CPUs at a time has two advantages.  Firstly, most
440 * installations will only have 2 cpus.  Secondly, load balancing too much at
441 * once can have an unpleasant effect on the system.  The scheduler rarely has
442 * enough information to make perfect decisions.  So this algorithm chooses
443 * simplicity and more gradual effects on load in larger systems.
444 *
445 * It could be improved by considering the priorities and slices assigned to
446 * each task prior to balancing them.  There are many pathological cases with
447 * any approach and so the semi-random algorithm below may work as well as any.
448 *
449 */
450static void
451sched_balance(void)
452{
453	struct tdq_group *high;
454	struct tdq_group *low;
455	struct tdq_group *tdg;
456	int cnt;
457	int i;
458
459	bal_tick = ticks + (random() % (hz * 2));
460	if (smp_started == 0)
461		return;
462	low = high = NULL;
463	i = random() % (tdg_maxid + 1);
464	for (cnt = 0; cnt <= tdg_maxid; cnt++) {
465		tdg = TDQ_GROUP(i);
466		/*
467		 * Find the CPU with the highest load that has some
468		 * threads to transfer.
469		 */
470		if ((high == NULL || tdg->tdg_load > high->tdg_load)
471		    && tdg->tdg_transferable)
472			high = tdg;
473		if (low == NULL || tdg->tdg_load < low->tdg_load)
474			low = tdg;
475		if (++i > tdg_maxid)
476			i = 0;
477	}
478	if (low != NULL && high != NULL && high != low)
479		sched_balance_pair(LIST_FIRST(&high->tdg_members),
480		    LIST_FIRST(&low->tdg_members));
481}
482
483static void
484sched_balance_groups(void)
485{
486	int i;
487
488	gbal_tick = ticks + (random() % (hz * 2));
489	mtx_assert(&sched_lock, MA_OWNED);
490	if (smp_started)
491		for (i = 0; i <= tdg_maxid; i++)
492			sched_balance_group(TDQ_GROUP(i));
493}
494
495static void
496sched_balance_group(struct tdq_group *tdg)
497{
498	struct tdq *tdq;
499	struct tdq *high;
500	struct tdq *low;
501	int load;
502
503	if (tdg->tdg_transferable == 0)
504		return;
505	low = NULL;
506	high = NULL;
507	LIST_FOREACH(tdq, &tdg->tdg_members, tdq_siblings) {
508		load = tdq->tdq_load;
509		if (high == NULL || load > high->tdq_load)
510			high = tdq;
511		if (low == NULL || load < low->tdq_load)
512			low = tdq;
513	}
514	if (high != NULL && low != NULL && high != low)
515		sched_balance_pair(high, low);
516}
517
518static void
519sched_balance_pair(struct tdq *high, struct tdq *low)
520{
521	int transferable;
522	int high_load;
523	int low_load;
524	int move;
525	int diff;
526	int i;
527
528	/*
529	 * If we're transferring within a group we have to use this specific
530	 * tdq's transferable count, otherwise we can steal from other members
531	 * of the group.
532	 */
533	if (high->tdq_group == low->tdq_group) {
534		transferable = high->tdq_transferable;
535		high_load = high->tdq_load;
536		low_load = low->tdq_load;
537	} else {
538		transferable = high->tdq_group->tdg_transferable;
539		high_load = high->tdq_group->tdg_load;
540		low_load = low->tdq_group->tdg_load;
541	}
542	if (transferable == 0)
543		return;
544	/*
545	 * Determine what the imbalance is and then adjust that to how many
546	 * threads we actually have to give up (transferable).
547	 */
548	diff = high_load - low_load;
549	move = diff / 2;
550	if (diff & 0x1)
551		move++;
552	move = min(move, transferable);
553	for (i = 0; i < move; i++)
554		tdq_move(high, TDQ_ID(low));
555	return;
556}
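
/*
 * The arithmetic above rounds the load difference up and caps it at the
 * transferable count.  A minimal sketch of just that calculation follows;
 * it is illustrative only (hence not compiled), the helper name is
 * hypothetical and the example loads are made up.
 */
#if 0
static int
example_balance_count(int high_load, int low_load, int transferable)
{
	int diff;
	int move;

	diff = high_load - low_load;
	move = diff / 2;
	if (diff & 0x1)		/* Round an odd difference up. */
		move++;
	/* E.g. loads of 7 and 2 with 4 transferable threads move 3. */
	return (min(move, transferable));
}
#endif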
557
558static void
559tdq_move(struct tdq *from, int cpu)
560{
561	struct tdq *tdq;
562	struct tdq *to;
563	struct td_sched *ts;
564
565	tdq = from;
566	to = TDQ_CPU(cpu);
567	ts = tdq_steal(tdq, 1);
568	if (ts == NULL) {
569		struct tdq_group *tdg;
570
571		tdg = tdq->tdq_group;
572		LIST_FOREACH(tdq, &tdg->tdg_members, tdq_siblings) {
573			if (tdq == from || tdq->tdq_transferable == 0)
574				continue;
575			ts = tdq_steal(tdq, 1);
576			break;
577		}
578		if (ts == NULL)
579			panic("tdq_move: No threads available with a "
580			    "transferable count of %d\n",
581			    tdg->tdg_transferable);
582	}
583	if (tdq == to)
584		return;
585	ts->ts_state = TSS_THREAD;
586	tdq_runq_rem(tdq, ts);
587	tdq_load_rem(tdq, ts);
588	tdq_notify(ts, cpu);
589}
590
591static int
592tdq_idled(struct tdq *tdq)
593{
594	struct tdq_group *tdg;
595	struct tdq *steal;
596	struct td_sched *ts;
597
598	tdg = tdq->tdq_group;
599	/*
600	 * If we're in a cpu group, try and steal threads from another cpu in
601	 * the group before idling.
602	 */
603	if (tdg->tdg_cpus > 1 && tdg->tdg_transferable) {
604		LIST_FOREACH(steal, &tdg->tdg_members, tdq_siblings) {
605			if (steal == tdq || steal->tdq_transferable == 0)
606				continue;
607			ts = tdq_steal(steal, 0);
608			if (ts == NULL)
609				continue;
610			ts->ts_state = TSS_THREAD;
611			tdq_runq_rem(steal, ts);
612			tdq_load_rem(steal, ts);
613			ts->ts_cpu = PCPU_GET(cpuid);
614			ts->ts_flags |= TSF_INTERNAL | TSF_HOLD;
615			sched_add(ts->ts_thread, SRQ_YIELDING);
616			return (0);
617		}
618	}
619	/*
620	 * We only set the idled bit when all of the cpus in the group are
621	 * idle.  Otherwise we could get into a situation where a thread bounces
622	 * back and forth between two idle cores on separate physical CPUs.
623	 */
624	tdg->tdg_idlemask |= PCPU_GET(cpumask);
625	if (tdg->tdg_idlemask != tdg->tdg_cpumask)
626		return (1);
627	atomic_set_int(&tdq_idle, tdg->tdg_mask);
628	return (1);
629}
630
631static void
632tdq_assign(struct tdq *tdq)
633{
634	struct td_sched *nts;
635	struct td_sched *ts;
636
637	do {
638		*(volatile struct td_sched **)&ts = tdq->tdq_assigned;
639	} while(!atomic_cmpset_ptr((volatile uintptr_t *)&tdq->tdq_assigned,
640		(uintptr_t)ts, (uintptr_t)NULL));
641	for (; ts != NULL; ts = nts) {
642		nts = ts->ts_assign;
643		tdq->tdq_group->tdg_load--;
644		tdq->tdq_load--;
645		ts->ts_flags &= ~TSF_ASSIGNED;
646		if (ts->ts_flags & TSF_REMOVED) {
647			ts->ts_flags &= ~TSF_REMOVED;
648			continue;
649		}
650		ts->ts_flags |= TSF_INTERNAL | TSF_HOLD;
651		sched_add(ts->ts_thread, SRQ_YIELDING);
652	}
653}
654
655static void
656tdq_notify(struct td_sched *ts, int cpu)
657{
658	struct tdq *tdq;
659	struct thread *td;
660	struct pcpu *pcpu;
661	int class;
662	int prio;
663
664	tdq = TDQ_CPU(cpu);
665	class = PRI_BASE(ts->ts_thread->td_pri_class);
666	if ((class != PRI_IDLE && class != PRI_ITHD)
667	    && (tdq_idle & tdq->tdq_group->tdg_mask))
668		atomic_clear_int(&tdq_idle, tdq->tdq_group->tdg_mask);
669	tdq->tdq_group->tdg_load++;
670	tdq->tdq_load++;
671	ts->ts_cpu = cpu;
672	ts->ts_flags |= TSF_ASSIGNED;
673	prio = ts->ts_thread->td_priority;
674
675	/*
676	 * Place a thread on another cpu's queue and force a resched.
677	 */
678	do {
679		*(volatile struct td_sched **)&ts->ts_assign = tdq->tdq_assigned;
680	} while(!atomic_cmpset_ptr((volatile uintptr_t *)&tdq->tdq_assigned,
681		(uintptr_t)ts->ts_assign, (uintptr_t)ts));
682	/*
683	 * Without sched_lock we could lose a race where we set NEEDRESCHED
684	 * on a thread that is switched out before the IPI is delivered.  This
685	 * would lead us to miss the resched.  This will be a problem once
686	 * sched_lock is pushed down.
687	 */
688	pcpu = pcpu_find(cpu);
689	td = pcpu->pc_curthread;
690	if (ts->ts_thread->td_priority < td->td_priority ||
691	    td == pcpu->pc_idlethread) {
692		td->td_flags |= TDF_NEEDRESCHED;
693		ipi_selected(1 << cpu, IPI_AST);
694	}
695}
696
697static struct td_sched *
698runq_steal(struct runq *rq)
699{
700	struct rqhead *rqh;
701	struct rqbits *rqb;
702	struct td_sched *ts;
703	int word;
704	int bit;
705
706	mtx_assert(&sched_lock, MA_OWNED);
707	rqb = &rq->rq_status;
708	for (word = 0; word < RQB_LEN; word++) {
709		if (rqb->rqb_bits[word] == 0)
710			continue;
711		for (bit = 0; bit < RQB_BPW; bit++) {
712			if ((rqb->rqb_bits[word] & (1ul << bit)) == 0)
713				continue;
714			rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)];
715			TAILQ_FOREACH(ts, rqh, ts_procq) {
716				if (THREAD_CAN_MIGRATE(ts->ts_thread))
717					return (ts);
718			}
719		}
720	}
721	return (NULL);
722}
723
724static struct td_sched *
725tdq_steal(struct tdq *tdq, int stealidle)
726{
727	struct td_sched *ts;
728
729	/*
730	 * Steal from next first to try to get a non-interactive task that
731	 * may not have run for a while.
732	 * XXX Need to effect steal order for timeshare threads.
733	 */
734	if ((ts = runq_steal(&tdq->tdq_realtime)) != NULL)
735		return (ts);
736	if ((ts = runq_steal(&tdq->tdq_timeshare)) != NULL)
737		return (ts);
738	if (stealidle)
739		return (runq_steal(&tdq->tdq_idle));
740	return (NULL);
741}
742
743int
744tdq_transfer(struct tdq *tdq, struct td_sched *ts, int class)
745{
746	struct tdq_group *ntdg;
747	struct tdq_group *tdg;
748	struct tdq *old;
749	int cpu;
750	int idx;
751
752	if (smp_started == 0)
753		return (0);
754	cpu = 0;
755	/*
756	 * If our load exceeds a certain threshold we should attempt to
757	 * reassign this thread.  The first candidate is the cpu that
758	 * originally ran the thread.  If it is idle, assign it there,
759	 * otherwise, pick an idle cpu.
760	 *
761	 * The threshold at which we start to reassign has a large impact
762	 * on the overall performance of the system.  Tuned too high and
763	 * some CPUs may idle.  Too low and there will be excess migration
764	 * and context switches.
765	 */
766	old = TDQ_CPU(ts->ts_cpu);
767	ntdg = old->tdq_group;
768	tdg = tdq->tdq_group;
769	if (tdq_idle) {
770		if (tdq_idle & ntdg->tdg_mask) {
771			cpu = ffs(ntdg->tdg_idlemask);
772			if (cpu) {
773				CTR2(KTR_SCHED,
774				    "tdq_transfer: %p found old cpu %X "
775				    "in idlemask.", ts, cpu);
776				goto migrate;
777			}
778		}
779		/*
780		 * Multiple cpus could find this bit simultaneously
781		 * but the race shouldn't be terrible.
782		 */
783		cpu = ffs(tdq_idle);
784		if (cpu) {
785			CTR2(KTR_SCHED, "tdq_transfer: %p found %X "
786			    "in idlemask.", ts, cpu);
787			goto migrate;
788		}
789	}
790	idx = 0;
791#if 0
792	if (old->tdq_load < tdq->tdq_load) {
793		cpu = ts->ts_cpu + 1;
794		CTR2(KTR_SCHED, "tdq_transfer: %p old cpu %X "
795		    "load less than ours.", ts, cpu);
796		goto migrate;
797	}
798	/*
799	 * No new CPU was found, look for one with less load.
800	 */
801	for (idx = 0; idx <= tdg_maxid; idx++) {
802		ntdg = TDQ_GROUP(idx);
803		if (ntdg->tdg_load /*+ (ntdg->tdg_cpus  * 2)*/ < tdg->tdg_load) {
804			cpu = ffs(ntdg->tdg_cpumask);
805			CTR2(KTR_SCHED, "tdq_transfer: %p cpu %X load less "
806			    "than ours.", ts, cpu);
807			goto migrate;
808		}
809	}
810#endif
811	/*
812	 * If another cpu in this group has idled, assign a thread over
813	 * to them after checking to see if there are idled groups.
814	 */
815	if (tdg->tdg_idlemask) {
816		cpu = ffs(tdg->tdg_idlemask);
817		if (cpu) {
818			CTR2(KTR_SCHED, "tdq_transfer: %p cpu %X idle in "
819			    "group.", ts, cpu);
820			goto migrate;
821		}
822	}
823	return (0);
824migrate:
825	/*
826	 * Now that we've found an idle CPU, migrate the thread.
827	 */
828	cpu--;
829	ts->ts_runq = NULL;
830	tdq_notify(ts, cpu);
831
832	return (1);
833}
834
835#endif	/* SMP */
836
837/*
838 * Pick the highest priority task we have and return it.
839 */
840
841static struct td_sched *
842tdq_choose(struct tdq *tdq)
843{
844	struct td_sched *ts;
845
846	mtx_assert(&sched_lock, MA_OWNED);
847
848	ts = runq_choose(&tdq->tdq_realtime);
849	if (ts != NULL) {
850		KASSERT(ts->ts_thread->td_priority <= PRI_MAX_REALTIME,
851		    ("tdq_choose: Invalid priority on realtime queue %d",
852		    ts->ts_thread->td_priority));
853		return (ts);
854	}
855	ts = runq_choose_from(&tdq->tdq_timeshare, tdq->tdq_ridx);
856	if (ts != NULL) {
857		KASSERT(ts->ts_thread->td_priority <= PRI_MAX_TIMESHARE &&
858		    ts->ts_thread->td_priority >= PRI_MIN_TIMESHARE,
859		    ("tdq_choose: Invalid priority on timeshare queue %d",
860		    ts->ts_thread->td_priority));
861		return (ts);
862	}
863
864	ts = runq_choose(&tdq->tdq_idle);
865	if (ts != NULL) {
866		KASSERT(ts->ts_thread->td_priority >= PRI_MIN_IDLE,
867		    ("tdq_choose: Invalid priority on idle queue %d",
868		    ts->ts_thread->td_priority));
869		return (ts);
870	}
871
872	return (NULL);
873}
874
875static void
876tdq_setup(struct tdq *tdq)
877{
878	runq_init(&tdq->tdq_realtime);
879	runq_init(&tdq->tdq_timeshare);
880	runq_init(&tdq->tdq_idle);
881	tdq->tdq_load = 0;
882	tdq->tdq_load_timeshare = 0;
883}
884
885static void
886sched_setup(void *dummy)
887{
888#ifdef SMP
889	int i;
890#endif
891
892	/*
893	 * To avoid divide-by-zero, we set realstathz to a dummy value
894	 * in case sched_clock() is called before sched_initticks().
895	 */
896	realstathz = hz;
897	sched_slice = (realstathz/7);	/* 140ms */
898	tickincr = 1 << SCHED_TICK_SHIFT;
899
900#ifdef SMP
901	balance_groups = 0;
902	/*
903	 * Initialize the tdqs.
904	 */
905	for (i = 0; i < MAXCPU; i++) {
906		struct tdq *tdq;
907
908		tdq = &tdq_cpu[i];
909		tdq->tdq_assigned = NULL;
910		tdq_setup(&tdq_cpu[i]);
911	}
912	if (smp_topology == NULL) {
913		struct tdq_group *tdg;
914		struct tdq *tdq;
915		int cpus;
916
917		for (cpus = 0, i = 0; i < MAXCPU; i++) {
918			if (CPU_ABSENT(i))
919				continue;
920			tdq = &tdq_cpu[i];
921			tdg = &tdq_groups[cpus];
922			/*
923			 * Setup a tdq group with one member.
924			 */
925			tdq->tdq_transferable = 0;
926			tdq->tdq_group = tdg;
927			tdg->tdg_cpus = 1;
928			tdg->tdg_idlemask = 0;
929			tdg->tdg_cpumask = tdg->tdg_mask = 1 << i;
930			tdg->tdg_load = 0;
931			tdg->tdg_transferable = 0;
932			LIST_INIT(&tdg->tdg_members);
933			LIST_INSERT_HEAD(&tdg->tdg_members, tdq, tdq_siblings);
934			cpus++;
935		}
936		tdg_maxid = cpus - 1;
937	} else {
938		struct tdq_group *tdg;
939		struct cpu_group *cg;
940		int j;
941
942		for (i = 0; i < smp_topology->ct_count; i++) {
943			cg = &smp_topology->ct_group[i];
944			tdg = &tdq_groups[i];
945			/*
946			 * Initialize the group.
947			 */
948			tdg->tdg_idlemask = 0;
949			tdg->tdg_load = 0;
950			tdg->tdg_transferable = 0;
951			tdg->tdg_cpus = cg->cg_count;
952			tdg->tdg_cpumask = cg->cg_mask;
953			LIST_INIT(&tdg->tdg_members);
954			/*
955			 * Find all of the group members and add them.
956			 */
957			for (j = 0; j < MAXCPU; j++) {
958				if ((cg->cg_mask & (1 << j)) != 0) {
959					if (tdg->tdg_mask == 0)
960						tdg->tdg_mask = 1 << j;
961					tdq_cpu[j].tdq_transferable = 0;
962					tdq_cpu[j].tdq_group = tdg;
963					LIST_INSERT_HEAD(&tdg->tdg_members,
964					    &tdq_cpu[j], tdq_siblings);
965				}
966			}
967			if (tdg->tdg_cpus > 1)
968				balance_groups = 1;
969		}
970		tdg_maxid = smp_topology->ct_count - 1;
971	}
972	/*
973	 * Stagger the group and global load balancer so they do not
974	 * interfere with each other.
975	 */
976	bal_tick = ticks + hz;
977	if (balance_groups)
978		gbal_tick = ticks + (hz / 2);
979#else
980	tdq_setup(TDQ_SELF());
981#endif
982	mtx_lock_spin(&sched_lock);
983	tdq_load_add(TDQ_SELF(), &td_sched0);
984	mtx_unlock_spin(&sched_lock);
985}
986
987/* ARGSUSED */
988static void
989sched_initticks(void *dummy)
990{
991	mtx_lock_spin(&sched_lock);
992	realstathz = stathz ? stathz : hz;
993	sched_slice = (realstathz/7);	/* ~140ms */
994
995	/*
996	 * tickincr is shifted left by SCHED_TICK_SHIFT to avoid rounding errors
997	 * due to hz not being evenly divisible by stathz on all platforms.
998	 */
999	tickincr = (hz << SCHED_TICK_SHIFT) / realstathz;
1000	/*
1001	 * This does not work for values of stathz that are more than
1002	 * (1 << SCHED_TICK_SHIFT) * hz.  In practice this does not happen.
1003	 */
1004	if (tickincr == 0)
1005		tickincr = 1;
1006	mtx_unlock_spin(&sched_lock);
1007}
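
/*
 * A small sketch of the tickincr arithmetic above, using assumed example
 * values (hz = 1000, stathz = 128); real values depend on the platform
 * and kernel configuration, and the helper below is a hypothetical,
 * non-compiled illustration.
 */
#if 0
static void
example_tickincr(void)
{
	int ex_hz = 1000;
	int ex_stathz = 128;
	int ex_tickincr;

	ex_tickincr = (ex_hz << SCHED_TICK_SHIFT) / ex_stathz;	/* 8000 */
	/*
	 * One second on cpu is ex_stathz stat clock ticks, so ts_ticks
	 * grows by ex_stathz * ex_tickincr = 1024000, and SCHED_TICK_HZ()
	 * shifts that back down to roughly 1000 hz domain ticks.
	 */
	printf("tickincr = %d\n", ex_tickincr);
}
#endif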
1008
1009
1010/*
1011 * Scale the scheduling priority according to the "interactivity" of this
1012 * process.
1013 */
1014static void
1015sched_priority(struct thread *td)
1016{
1017	int score;
1018	int pri;
1019
1020	if (td->td_pri_class != PRI_TIMESHARE)
1021		return;
1022	/*
1023	 * If the score is interactive we place the thread in the realtime
1024	 * queue with a priority that is less than kernel and interrupt
1025	 * priorities.  These threads are not subject to nice restrictions.
1026	 *
1027	 * Scores greater than this are placed on the normal timeshare queue
1028	 * where the priority is partially decided by the most recent cpu
1029	 * utilization and the rest is decided by nice value.
1030	 */
1031	score = sched_interact_score(td);
1032	if (score < sched_interact) {
1033		pri = PRI_MIN_REALTIME;
1034		pri += ((PRI_MAX_REALTIME - PRI_MIN_REALTIME) / sched_interact)
1035		    * score;
1036		KASSERT(pri >= PRI_MIN_REALTIME && pri <= PRI_MAX_REALTIME,
1037		    ("sched_priority: invalid interactive priority %d", pri));
1038	} else {
1039		pri = SCHED_PRI_MIN;
1040		if (td->td_sched->ts_ticks)
1041			pri += SCHED_PRI_TICKS(td->td_sched);
1042		pri += SCHED_PRI_NICE(td->td_proc->p_nice);
1043		KASSERT(pri >= PRI_MIN_TIMESHARE && pri <= PRI_MAX_TIMESHARE,
1044		    ("sched_priority: invalid priority %d", pri));
1045	}
1046	sched_user_prio(td, pri);
1047
1048	return;
1049}
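
/*
 * Worked example: with the default sched_interact of 30, a score of 15
 * yields an interactive priority half way through the realtime user
 * range.  A non-interactive thread using about 20% cpu at nice 0 gets
 * SCHED_PRI_MIN plus roughly a fifth of SCHED_PRI_RANGE; a positive nice
 * value pushes it further toward PRI_MAX_TIMESHARE.
 */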
1050
1051/*
1052 * This routine enforces a maximum limit on the amount of scheduling history
1053 * kept.  It is called after either the slptime or runtime is adjusted.
1054 * This routine will not operate correctly when slp or run times have been
1055 * adjusted to more than double their maximum.
1056 */
1057static void
1058sched_interact_update(struct thread *td)
1059{
1060	int sum;
1061
1062	sum = td->td_sched->skg_runtime + td->td_sched->skg_slptime;
1063	if (sum < SCHED_SLP_RUN_MAX)
1064		return;
1065	/*
1066	 * If we have exceeded by more than 1/5th then the algorithm below
1067	 * will not bring us back into range.  Dividing by two here forces
1068	 * us into the range of [4/5 * SCHED_SLP_RUN_MAX, SCHED_SLP_RUN_MAX].
1069	 */
1070	if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) {
1071		td->td_sched->skg_runtime /= 2;
1072		td->td_sched->skg_slptime /= 2;
1073		return;
1074	}
1075	td->td_sched->skg_runtime = (td->td_sched->skg_runtime / 5) * 4;
1076	td->td_sched->skg_slptime = (td->td_sched->skg_slptime / 5) * 4;
1077}
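
/*
 * Worked example, assuming hz = 1000: SCHED_SLP_RUN_MAX is 5 seconds of
 * history.  When the sum first crosses that limit, the 4/5 scaling above
 * trims it back to roughly 4 seconds while preserving the run to sleep
 * ratio, and therefore the interactivity score.  Only when a single
 * large update pushes the sum past 6/5 of the limit are both halved.
 */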
1078
1079static void
1080sched_interact_fork(struct thread *td)
1081{
1082	int ratio;
1083	int sum;
1084
1085	sum = td->td_sched->skg_runtime + td->td_sched->skg_slptime;
1086	if (sum > SCHED_SLP_RUN_FORK) {
1087		ratio = sum / SCHED_SLP_RUN_FORK;
1088		td->td_sched->skg_runtime /= ratio;
1089		td->td_sched->skg_slptime /= ratio;
1090	}
1091}
1092
1093static int
1094sched_interact_score(struct thread *td)
1095{
1096	int div;
1097
1098	if (td->td_sched->skg_runtime > td->td_sched->skg_slptime) {
1099		div = max(1, td->td_sched->skg_runtime / SCHED_INTERACT_HALF);
1100		return (SCHED_INTERACT_HALF +
1101		    (SCHED_INTERACT_HALF - (td->td_sched->skg_slptime / div)));
1102	} if (td->td_sched->skg_slptime > td->td_sched->skg_runtime) {
1103		div = max(1, td->td_sched->skg_slptime / SCHED_INTERACT_HALF);
1104		return (td->td_sched->skg_runtime / div);
1105	}
1106
1107	/*
1108	 * This can happen if slptime and runtime are 0.
1109	 */
1110	return (0);
1111
1112}
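
/*
 * Worked examples for the score above: a thread that has slept three
 * times as long as it has run scores about 50 * run / slp ~= 16, well
 * under the default interactivity threshold of 30.  A thread that has
 * run three times as long as it has slept scores about
 * 100 - 50 * slp / run ~= 83 and is treated as non-interactive.
 */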
1113
1114/*
1115 * Called from proc0_init() to bootstrap the scheduler.
1116 */
1117void
1118schedinit(void)
1119{
1120
1121	/*
1122	 * Set up the scheduler specific parts of proc0.
1123	 */
1124	proc0.p_sched = NULL; /* XXX */
1125	thread0.td_sched = &td_sched0;
1126	td_sched0.ts_ltick = ticks;
1127	td_sched0.ts_ftick = ticks - 1;
1128	td_sched0.ts_thread = &thread0;
1129	td_sched0.ts_state = TSS_THREAD;
1130}
1131
1132/*
1133 * This is only somewhat accurate since, given many processes of the same
1134 * priority, they will switch when their slices run out, which will be
1135 * at most sched_slice stathz ticks.
1136 */
1137int
1138sched_rr_interval(void)
1139{
1140
1141	/* Convert sched_slice to hz */
1142	return (hz/(realstathz/sched_slice));
1143}
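
/*
 * Worked example, assuming hz = 1000 and realstathz = 128 (both platform
 * dependent): sched_slice is 128 / 7 = 18 stathz ticks, so the value
 * returned here is 1000 / (128 / 18) = 1000 / 7 = 142 hz ticks, or about
 * 142ms.
 */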
1144
1145static void
1146sched_pctcpu_update(struct td_sched *ts)
1147{
1148
1149	if (ts->ts_ticks == 0)
1150		return;
1151	/*
1152	 * Adjust counters and watermark for pctcpu calc.
1153	 */
1154	if (ts->ts_ltick > ticks - SCHED_TICK_TARG)
1155		ts->ts_ticks = (ts->ts_ticks / (ticks - ts->ts_ftick)) *
1156			    SCHED_TICK_TARG;
1157	else
1158		ts->ts_ticks = 0;
1159	ts->ts_ltick = ticks;
1160	ts->ts_ftick = ts->ts_ltick - SCHED_TICK_TARG;
1161}
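
/*
 * Worked example, assuming hz = 1000: a thread whose window currently
 * spans 5000 hz ticks is rescaled so that the same rate is expressed
 * over the full 10000 tick target, e.g. 2000 hz ticks worth of ts_ticks
 * over 5 seconds becomes roughly 4000 over 10, keeping the computed
 * percentage (40%) stable while the watermarks are reset.
 */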
1162
1163static void
1164sched_thread_priority(struct thread *td, u_char prio)
1165{
1166	struct td_sched *ts;
1167
1168	CTR6(KTR_SCHED, "sched_prio: %p(%s) prio %d newprio %d by %p(%s)",
1169	    td, td->td_proc->p_comm, td->td_priority, prio, curthread,
1170	    curthread->td_proc->p_comm);
1171	ts = td->td_sched;
1172	mtx_assert(&sched_lock, MA_OWNED);
1173	if (td->td_priority == prio)
1174		return;
1175
1176	if (TD_ON_RUNQ(td) && prio < td->td_priority) {
1177		/*
1178		 * If the priority has been elevated due to priority
1179		 * propagation, we may have to move ourselves to a new
1180		 * queue.  This could be optimized to not re-add in some
1181		 * cases.
1182		 *
1183		 * Hold this td_sched on this cpu so that sched_prio() doesn't
1184		 * cause excessive migration.  We only want migration to
1185		 * happen as the result of a wakeup.
1186		 */
1187		ts->ts_flags |= TSF_HOLD;
1188		sched_rem(td);
1189		td->td_priority = prio;
1190		sched_add(td, SRQ_BORROWING);
1191		ts->ts_flags &= ~TSF_HOLD;
1192	} else
1193		td->td_priority = prio;
1194}
1195
1196/*
1197 * Update a thread's priority when it is lent another thread's
1198 * priority.
1199 */
1200void
1201sched_lend_prio(struct thread *td, u_char prio)
1202{
1203
1204	td->td_flags |= TDF_BORROWING;
1205	sched_thread_priority(td, prio);
1206}
1207
1208/*
1209 * Restore a thread's priority when priority propagation is
1210 * over.  The prio argument is the minimum priority the thread
1211 * needs to have to satisfy other possible priority lending
1212 * requests.  If the thread's regular priority is less
1213 * important than prio, the thread will keep a priority boost
1214 * of prio.
1215 */
1216void
1217sched_unlend_prio(struct thread *td, u_char prio)
1218{
1219	u_char base_pri;
1220
1221	if (td->td_base_pri >= PRI_MIN_TIMESHARE &&
1222	    td->td_base_pri <= PRI_MAX_TIMESHARE)
1223		base_pri = td->td_user_pri;
1224	else
1225		base_pri = td->td_base_pri;
1226	if (prio >= base_pri) {
1227		td->td_flags &= ~TDF_BORROWING;
1228		sched_thread_priority(td, base_pri);
1229	} else
1230		sched_lend_prio(td, prio);
1231}
1232
1233void
1234sched_prio(struct thread *td, u_char prio)
1235{
1236	u_char oldprio;
1237
1238	/* First, update the base priority. */
1239	td->td_base_pri = prio;
1240
1241	/*
1242	 * If the thread is borrowing another thread's priority, don't
1243	 * ever lower the priority.
1244	 */
1245	if (td->td_flags & TDF_BORROWING && td->td_priority < prio)
1246		return;
1247
1248	/* Change the real priority. */
1249	oldprio = td->td_priority;
1250	sched_thread_priority(td, prio);
1251
1252	/*
1253	 * If the thread is on a turnstile, then let the turnstile update
1254	 * its state.
1255	 */
1256	if (TD_ON_LOCK(td) && oldprio != prio)
1257		turnstile_adjust(td, oldprio);
1258}
1259
1260void
1261sched_user_prio(struct thread *td, u_char prio)
1262{
1263	u_char oldprio;
1264
1265	td->td_base_user_pri = prio;
1266	if (td->td_flags & TDF_UBORROWING && td->td_user_pri <= prio)
1267		return;
1268	oldprio = td->td_user_pri;
1269	td->td_user_pri = prio;
1270
1271	if (TD_ON_UPILOCK(td) && oldprio != prio)
1272		umtx_pi_adjust(td, oldprio);
1273}
1274
1275void
1276sched_lend_user_prio(struct thread *td, u_char prio)
1277{
1278	u_char oldprio;
1279
1280	td->td_flags |= TDF_UBORROWING;
1281
1282	oldprio = td->td_user_pri;
1283	td->td_user_pri = prio;
1284
1285	if (TD_ON_UPILOCK(td) && oldprio != prio)
1286		umtx_pi_adjust(td, oldprio);
1287}
1288
1289void
1290sched_unlend_user_prio(struct thread *td, u_char prio)
1291{
1292	u_char base_pri;
1293
1294	base_pri = td->td_base_user_pri;
1295	if (prio >= base_pri) {
1296		td->td_flags &= ~TDF_UBORROWING;
1297		sched_user_prio(td, base_pri);
1298	} else
1299		sched_lend_user_prio(td, prio);
1300}
1301
1302void
1303sched_switch(struct thread *td, struct thread *newtd, int flags)
1304{
1305	struct tdq *tdq;
1306	struct td_sched *ts;
1307
1308	mtx_assert(&sched_lock, MA_OWNED);
1309
1310	tdq = TDQ_SELF();
1311	ts = td->td_sched;
1312	td->td_lastcpu = td->td_oncpu;
1313	td->td_oncpu = NOCPU;
1314	td->td_flags &= ~TDF_NEEDRESCHED;
1315	td->td_owepreempt = 0;
1316	/*
1317	 * If the thread has been assigned it may be in the process of switching
1318	 * to the new cpu.  This is the case in sched_bind().
1319	 */
1320	if (td == PCPU_GET(idlethread)) {
1321		TD_SET_CAN_RUN(td);
1322	} else if ((ts->ts_flags & TSF_ASSIGNED) == 0) {
1323		/* We are ending our run so make our slot available again */
1324		tdq_load_rem(tdq, ts);
1325		if (TD_IS_RUNNING(td)) {
1326			/*
1327			 * Don't allow the thread to migrate
1328			 * from a preemption.
1329			 */
1330			ts->ts_flags |= TSF_HOLD;
1331			setrunqueue(td, (flags & SW_PREEMPT) ?
1332			    SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
1333			    SRQ_OURSELF|SRQ_YIELDING);
1334			ts->ts_flags &= ~TSF_HOLD;
1335		}
1336	}
1337	if (newtd != NULL) {
1338		/*
1339		 * If we bring in a thread account for it as if it had been
1340		 * added to the run queue and then chosen.
1341		 */
1342		newtd->td_sched->ts_flags |= TSF_DIDRUN;
1343		TD_SET_RUNNING(newtd);
1344		tdq_load_add(TDQ_SELF(), newtd->td_sched);
1345	} else
1346		newtd = choosethread();
1347	if (td != newtd) {
1348#ifdef	HWPMC_HOOKS
1349		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
1350			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
1351#endif
1352
1353		cpu_switch(td, newtd);
1354#ifdef	HWPMC_HOOKS
1355		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
1356			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
1357#endif
1358	}
1359	sched_lock.mtx_lock = (uintptr_t)td;
1360	td->td_oncpu = PCPU_GET(cpuid);
1361}
1362
1363void
1364sched_nice(struct proc *p, int nice)
1365{
1366	struct thread *td;
1367
1368	PROC_LOCK_ASSERT(p, MA_OWNED);
1369	mtx_assert(&sched_lock, MA_OWNED);
1370
1371	p->p_nice = nice;
1372	FOREACH_THREAD_IN_PROC(p, td) {
1373		sched_priority(td);
1374		sched_prio(td, td->td_base_user_pri);
1375	}
1376}
1377
1378void
1379sched_sleep(struct thread *td)
1380{
1381
1382	mtx_assert(&sched_lock, MA_OWNED);
1383
1384	td->td_sched->ts_slptime = ticks;
1385}
1386
1387void
1388sched_wakeup(struct thread *td)
1389{
1390	int slptime;
1391
1392	mtx_assert(&sched_lock, MA_OWNED);
1393
1394	/*
1395	 * If we slept for more than a tick update our interactivity and
1396	 * priority.
1397	 */
1398	slptime = td->td_sched->ts_slptime;
1399	td->td_sched->ts_slptime = 0;
1400	if (slptime && slptime != ticks) {
1401		int hzticks;
1402
1403		hzticks = (ticks - slptime) << SCHED_TICK_SHIFT;
1404		if (hzticks >= SCHED_SLP_RUN_MAX) {
1405			td->td_sched->skg_slptime = SCHED_SLP_RUN_MAX;
1406			td->td_sched->skg_runtime = 1;
1407		} else {
1408			td->td_sched->skg_slptime += hzticks;
1409			sched_interact_update(td);
1410		}
1411		if (ticks - (hz / 10) > td->td_sched->ts_ltick)
1412			sched_pctcpu_update(td->td_sched);
1413		sched_priority(td);
1414	}
1415	setrunqueue(td, SRQ_BORING);
1416}
1417
1418/*
1419 * Penalize the parent for creating a new child and initialize the child's
1420 * priority.
1421 */
1422void
1423sched_fork(struct thread *td, struct thread *child)
1424{
1425	mtx_assert(&sched_lock, MA_OWNED);
1426	sched_fork_thread(td, child);
1427	/*
1428	 * Penalize the parent and child for forking.
1429	 */
1430	sched_interact_fork(child);
1431	sched_priority(child);
1432	td->td_sched->skg_runtime += tickincr;
1433	sched_interact_update(td);
1434	sched_priority(td);
1435}
1436
1437void
1438sched_fork_thread(struct thread *td, struct thread *child)
1439{
1440	struct td_sched *ts;
1441	struct td_sched *ts2;
1442
1443	/*
1444	 * Initialize child.
1445	 */
1446	sched_newthread(child);
1447	ts = td->td_sched;
1448	ts2 = child->td_sched;
1449	ts2->ts_cpu = ts->ts_cpu;
1450	ts2->ts_runq = NULL;
1451	/*
1452	 * Grab our parent's cpu estimation information and priority.
1453	 */
1454	ts2->ts_ticks = ts->ts_ticks;
1455	ts2->ts_ltick = ts->ts_ltick;
1456	ts2->ts_ftick = ts->ts_ftick;
1457	child->td_user_pri = td->td_user_pri;
1458	child->td_base_user_pri = td->td_base_user_pri;
1459	/*
1460	 * And update interactivity score.
1461	 */
1462	ts2->skg_slptime = ts->skg_slptime;
1463	ts2->skg_runtime = ts->skg_runtime;
1464	ts2->ts_slice = 1;	/* Attempt to quickly learn interactivity. */
1465}
1466
1467void
1468sched_class(struct thread *td, int class)
1469{
1470	struct tdq *tdq;
1471	struct td_sched *ts;
1472	int nclass;
1473	int oclass;
1474
1475	mtx_assert(&sched_lock, MA_OWNED);
1476	if (td->td_pri_class == class)
1477		return;
1478
1479	nclass = PRI_BASE(class);
1480	oclass = PRI_BASE(td->td_pri_class);
1481	ts = td->td_sched;
1482	if (ts->ts_state == TSS_ONRUNQ || td->td_state == TDS_RUNNING) {
1483		tdq = TDQ_CPU(ts->ts_cpu);
1484#ifdef SMP
1485		/*
1486		 * On SMP if we're on the RUNQ we must adjust the transferable
1487		 * count because we could be changing to or from an interrupt
1488		 * class.
1489		 */
1490		if (ts->ts_state == TSS_ONRUNQ) {
1491			if (THREAD_CAN_MIGRATE(ts->ts_thread)) {
1492				tdq->tdq_transferable--;
1493				tdq->tdq_group->tdg_transferable--;
1494			}
1495			if (THREAD_CAN_MIGRATE(ts->ts_thread)) {
1496				tdq->tdq_transferable++;
1497				tdq->tdq_group->tdg_transferable++;
1498			}
1499		}
1500#endif
1501		if (oclass == PRI_TIMESHARE)
1502			tdq->tdq_load_timeshare--;
1503		if (nclass == PRI_TIMESHARE)
1504			tdq->tdq_load_timeshare++;
1505	}
1506
1507	td->td_pri_class = class;
1508}
1509
1510/*
1511 * Return some of the child's priority and interactivity to the parent.
1512 */
1513void
1514sched_exit(struct proc *p, struct thread *child)
1515{
1516	struct thread *td;
1517
1518	CTR3(KTR_SCHED, "sched_exit: %p(%s) prio %d",
1519	    child, child->td_proc->p_comm, child->td_priority);
1520
1521	td = FIRST_THREAD_IN_PROC(p);
1522	sched_exit_thread(td, child);
1523}
1524
1525void
1526sched_exit_thread(struct thread *td, struct thread *child)
1527{
1528
1529	CTR3(KTR_SCHED, "sched_exit_thread: %p(%s) prio %d",
1530	    child, child->td_proc->p_comm, child->td_priority);
1531
1532	tdq_load_rem(TDQ_CPU(child->td_sched->ts_cpu), child->td_sched);
1533#ifdef KSE
1534	/*
1535	 * KSE forks and exits so often that this penalty causes short-lived
1536	 * threads to always be non-interactive.  This causes mozilla to
1537	 * crawl under load.
1538	 */
1539	if ((td->td_pflags & TDP_SA) && td->td_proc == child->td_proc)
1540		return;
1541#endif
1542	/*
1543	 * Give the child's runtime to the parent without returning the
1544	 * sleep time as a penalty to the parent.  This causes shells that
1545	 * launch expensive things to mark their children as expensive.
1546	 */
1547	td->td_sched->skg_runtime += child->td_sched->skg_runtime;
1548	sched_interact_update(td);
1549	sched_priority(td);
1550}
1551
1552void
1553sched_userret(struct thread *td)
1554{
1555	/*
1556	 * XXX we cheat slightly on the locking here to avoid locking in
1557	 * the usual case.  Setting td_priority here is essentially an
1558	 * incomplete workaround for not setting it properly elsewhere.
1559	 * Now that some interrupt handlers are threads, not setting it
1560	 * properly elsewhere can clobber it in the window between setting
1561	 * it here and returning to user mode, so don't waste time setting
1562	 * it perfectly here.
1563	 */
1564	KASSERT((td->td_flags & TDF_BORROWING) == 0,
1565	    ("thread with borrowed priority returning to userland"));
1566	if (td->td_priority != td->td_user_pri) {
1567		mtx_lock_spin(&sched_lock);
1568		td->td_priority = td->td_user_pri;
1569		td->td_base_pri = td->td_user_pri;
1570		mtx_unlock_spin(&sched_lock);
1571	}
1572}
1573
1574void
1575sched_clock(struct thread *td)
1576{
1577	struct tdq *tdq;
1578	struct td_sched *ts;
1579
1580	mtx_assert(&sched_lock, MA_OWNED);
1581#ifdef SMP
1582	sched_smp_tick();
1583#endif
1584	tdq = TDQ_SELF();
1585	/*
1586	 * Advance the insert index once for each tick to ensure that all
1587	 * threads get a chance to run.
1588	 */
1589	if (tdq->tdq_idx == tdq->tdq_ridx) {
1590		tdq->tdq_idx = (tdq->tdq_idx + 1) % RQ_NQS;
1591		if (TAILQ_EMPTY(&tdq->tdq_timeshare.rq_queues[tdq->tdq_ridx]))
1592			tdq->tdq_ridx = tdq->tdq_idx;
1593	}
1594	/* Adjust ticks for pctcpu */
1595	ts = td->td_sched;
1596	ts->ts_ticks += tickincr;
1597	ts->ts_ltick = ticks;
1598	/*
1599	 * Update if we've exceeded our desired tick threshold by over one
1600	 * second.
1601	 */
1602	if (ts->ts_ftick + SCHED_TICK_TARG + hz < ts->ts_ltick)
1603		sched_pctcpu_update(ts);
1604	/*
1605	 * We only do slicing code for TIMESHARE threads.
1606	 */
1607	if (td->td_pri_class != PRI_TIMESHARE)
1608		return;
1609	/*
1610	 * We used a tick; charge it to the thread so that we can compute our
1611	 * interactivity.
1612	 */
1613	td->td_sched->skg_runtime += tickincr;
1614	sched_interact_update(td);
1615	/*
1616	 * We used up one time slice.
1617	 */
1618	if (--ts->ts_slice > 0)
1619		return;
1620	/*
1621	 * We're out of time, recompute priorities and requeue.
1622	 */
1623	tdq_load_rem(tdq, ts);
1624	sched_priority(td);
1625	ts->ts_slice = sched_slice;
1626	tdq_load_add(tdq, ts);
1627	td->td_flags |= TDF_NEEDRESCHED;
1628}
1629
1630int
1631sched_runnable(void)
1632{
1633	struct tdq *tdq;
1634	int load;
1635
1636	load = 1;
1637
1638	tdq = TDQ_SELF();
1639#ifdef SMP
1640	if (tdq->tdq_assigned) {
1641		mtx_lock_spin(&sched_lock);
1642		tdq_assign(tdq);
1643		mtx_unlock_spin(&sched_lock);
1644	}
1645#endif
1646	if ((curthread->td_flags & TDF_IDLETD) != 0) {
1647		if (tdq->tdq_load > 0)
1648			goto out;
1649	} else
1650		if (tdq->tdq_load - 1 > 0)
1651			goto out;
1652	load = 0;
1653out:
1654	return (load);
1655}
1656
1657struct td_sched *
1658sched_choose(void)
1659{
1660	struct tdq *tdq;
1661	struct td_sched *ts;
1662
1663	mtx_assert(&sched_lock, MA_OWNED);
1664	tdq = TDQ_SELF();
1665#ifdef SMP
1666restart:
1667	if (tdq->tdq_assigned)
1668		tdq_assign(tdq);
1669#endif
1670	ts = tdq_choose(tdq);
1671	if (ts) {
1672#ifdef SMP
1673		if (ts->ts_thread->td_priority <= PRI_MIN_IDLE)
1674			if (tdq_idled(tdq) == 0)
1675				goto restart;
1676#endif
1677		tdq_runq_rem(tdq, ts);
1678		ts->ts_state = TSS_THREAD;
1679		return (ts);
1680	}
1681#ifdef SMP
1682	if (tdq_idled(tdq) == 0)
1683		goto restart;
1684#endif
1685	return (NULL);
1686}
1687
1688void
1689sched_add(struct thread *td, int flags)
1690{
1691	struct tdq *tdq;
1692	struct td_sched *ts;
1693	int preemptive;
1694	int canmigrate;
1695	int class;
1696
1697	CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)",
1698	    td, td->td_proc->p_comm, td->td_priority, curthread,
1699	    curthread->td_proc->p_comm);
1700	mtx_assert(&sched_lock, MA_OWNED);
1701	tdq = TDQ_SELF();
1702	ts = td->td_sched;
1703	ts->ts_flags &= ~TSF_INTERNAL;
1704	class = PRI_BASE(td->td_pri_class);
1705	preemptive = !(flags & SRQ_YIELDING);
1706	canmigrate = 1;
1707#ifdef SMP
1708	if (ts->ts_flags & TSF_ASSIGNED) {
1709		if (ts->ts_flags & TSF_REMOVED)
1710			ts->ts_flags &= ~TSF_REMOVED;
1711		return;
1712	}
1713	canmigrate = THREAD_CAN_MIGRATE(td);
1714	/*
1715	 * Don't migrate running threads here.  Force the long term balancer
1716	 * to do it.
1717	 */
1718	if (ts->ts_flags & TSF_HOLD) {
1719		ts->ts_flags &= ~TSF_HOLD;
1720		canmigrate = 0;
1721	}
1722#endif
1723	KASSERT(ts->ts_state != TSS_ONRUNQ,
1724	    ("sched_add: thread %p (%s) already in run queue", td,
1725	    td->td_proc->p_comm));
1726	KASSERT(td->td_proc->p_sflag & PS_INMEM,
1727	    ("sched_add: process swapped out"));
1728	KASSERT(ts->ts_runq == NULL,
1729	    ("sched_add: thread %p is still assigned to a run queue", td));
1730	/*
1731	 * Set the slice and pick the run queue.
1732	 */
1733	if (ts->ts_slice == 0)
1734		ts->ts_slice = sched_slice;
1735	if (td->td_priority <= PRI_MAX_REALTIME) {
1736		ts->ts_runq = &tdq->tdq_realtime;
1737		/*
1738		 * If the thread is not artificially pinned and it's in
1739		 * the realtime queue we directly dispatch it on this cpu
1740		 * for minimum latency.  Interrupt handlers may also have
1741		 * to complete on the cpu that dispatched them.
1742		 */
1743		if (td->td_pinned == 0)
1744			ts->ts_cpu = PCPU_GET(cpuid);
1745	} else if (td->td_priority <= PRI_MAX_TIMESHARE)
1746		ts->ts_runq = &tdq->tdq_timeshare;
1747	else
1748		ts->ts_runq = &tdq->tdq_idle;
1749
1750#ifdef SMP
1751	/*
1752	 * If this thread is pinned or bound, notify the target cpu.
1753	 */
1754	if (!canmigrate && ts->ts_cpu != PCPU_GET(cpuid)) {
1755		ts->ts_runq = NULL;
1756		tdq_notify(ts, ts->ts_cpu);
1757		return;
1758	}
1759	/*
1760	 * If we had been idle, clear our bit in the group and potentially
1761	 * the global bitmap.  If not, see if we should transfer this thread.
1762	 */
1763	if ((class != PRI_IDLE && class != PRI_ITHD) &&
1764	    (tdq->tdq_group->tdg_idlemask & PCPU_GET(cpumask)) != 0) {
1765		/*
1766		 * Check to see if our group is unidling, and if so, remove it
1767		 * from the global idle mask.
1768		 */
1769		if (tdq->tdq_group->tdg_idlemask ==
1770		    tdq->tdq_group->tdg_cpumask)
1771			atomic_clear_int(&tdq_idle, tdq->tdq_group->tdg_mask);
1772		/*
1773		 * Now remove ourselves from the group specific idle mask.
1774		 */
1775		tdq->tdq_group->tdg_idlemask &= ~PCPU_GET(cpumask);
1776	} else if (canmigrate && tdq->tdq_load > 1)
1777		if (tdq_transfer(tdq, ts, class))
1778			return;
1779	ts->ts_cpu = PCPU_GET(cpuid);
1780#endif
1781	if (td->td_priority < curthread->td_priority)
1782		curthread->td_flags |= TDF_NEEDRESCHED;
1783	if (preemptive && maybe_preempt(td))
1784		return;
1785	ts->ts_state = TSS_ONRUNQ;
1786
1787	tdq_runq_add(tdq, ts, flags);
1788	tdq_load_add(tdq, ts);
1789}
1790
1791void
1792sched_rem(struct thread *td)
1793{
1794	struct tdq *tdq;
1795	struct td_sched *ts;
1796
1797	CTR5(KTR_SCHED, "sched_rem: %p(%s) prio %d by %p(%s)",
1798	    td, td->td_proc->p_comm, td->td_priority, curthread,
1799	    curthread->td_proc->p_comm);
1800	mtx_assert(&sched_lock, MA_OWNED);
1801	ts = td->td_sched;
1802	if (ts->ts_flags & TSF_ASSIGNED) {
1803		ts->ts_flags |= TSF_REMOVED;
1804		return;
1805	}
1806	KASSERT((ts->ts_state == TSS_ONRUNQ),
1807	    ("sched_rem: thread not on run queue"));
1808
1809	ts->ts_state = TSS_THREAD;
1810	tdq = TDQ_CPU(ts->ts_cpu);
1811	tdq_runq_rem(tdq, ts);
1812	tdq_load_rem(tdq, ts);
1813}
1814
1815fixpt_t
1816sched_pctcpu(struct thread *td)
1817{
1818	fixpt_t pctcpu;
1819	struct td_sched *ts;
1820
1821	pctcpu = 0;
1822	ts = td->td_sched;
1823	if (ts == NULL)
1824		return (0);
1825
1826	mtx_lock_spin(&sched_lock);
1827	if (ts->ts_ticks) {
1828		int rtick;
1829
1830		if (ticks - (hz / 10) > td->td_sched->ts_ltick)
1831			sched_pctcpu_update(ts);
1832		/* How many rticks per second? */
1833		rtick = min(SCHED_TICK_HZ(ts) / SCHED_TICK_SECS, hz);
1834		pctcpu = (FSCALE * ((FSCALE * rtick)/hz)) >> FSHIFT;
1835	}
1836	td->td_proc->p_swtime = ts->ts_ltick - ts->ts_ftick;
1837	mtx_unlock_spin(&sched_lock);
1838
1839	return (pctcpu);
1840}
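
/*
 * Worked example, continuing the 20% thread from the comments above:
 * SCHED_TICK_HZ(ts) ~= 2000 over a 10 second window gives rtick ~= 200
 * hz ticks per second, so with hz = 1000 the fixed point result is about
 * FSCALE / 5, which userland reporting tools scale back to roughly 20%.
 */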
1841
1842void
1843sched_bind(struct thread *td, int cpu)
1844{
1845	struct td_sched *ts;
1846
1847	mtx_assert(&sched_lock, MA_OWNED);
1848	ts = td->td_sched;
1849	KASSERT((ts->ts_flags & TSF_BOUND) == 0,
1850	    ("sched_bind: thread %p already bound.", td));
1851	ts->ts_flags |= TSF_BOUND;
1852#ifdef SMP
1853	if (PCPU_GET(cpuid) == cpu)
1854		return;
1855	/* sched_rem without the runq_remove */
1856	ts->ts_state = TSS_THREAD;
1857	tdq_load_rem(TDQ_CPU(ts->ts_cpu), ts);
1858	tdq_notify(ts, cpu);
1859	/* When we return from mi_switch we'll be on the correct cpu. */
1860	mi_switch(SW_VOL, NULL);
1861	sched_pin();
1862#endif
1863}
1864
1865void
1866sched_unbind(struct thread *td)
1867{
1868	struct td_sched *ts;
1869
1870	mtx_assert(&sched_lock, MA_OWNED);
1871	ts = td->td_sched;
1872	KASSERT(ts->ts_flags & TSF_BOUND,
1873	    ("sched_unbind: thread %p not bound.", td));
1874	mtx_assert(&sched_lock, MA_OWNED);
1875	ts->ts_flags &= ~TSF_BOUND;
1876#ifdef SMP
1877	sched_unpin();
1878#endif
1879}
1880
1881int
1882sched_is_bound(struct thread *td)
1883{
1884	mtx_assert(&sched_lock, MA_OWNED);
1885	return (td->td_sched->ts_flags & TSF_BOUND);
1886}
1887
1888void
1889sched_relinquish(struct thread *td)
1890{
1891	mtx_lock_spin(&sched_lock);
1892	if (td->td_pri_class == PRI_TIMESHARE)
1893		sched_prio(td, PRI_MAX_TIMESHARE);
1894	mi_switch(SW_VOL, NULL);
1895	mtx_unlock_spin(&sched_lock);
1896}
1897
1898int
1899sched_load(void)
1900{
1901#ifdef SMP
1902	int total;
1903	int i;
1904
1905	total = 0;
1906	for (i = 0; i <= tdg_maxid; i++)
1907		total += TDQ_GROUP(i)->tdg_load;
1908	return (total);
1909#else
1910	return (TDQ_SELF()->tdq_sysload);
1911#endif
1912}
1913
1914int
1915sched_sizeof_proc(void)
1916{
1917	return (sizeof(struct proc));
1918}
1919
1920int
1921sched_sizeof_thread(void)
1922{
1923	return (sizeof(struct thread) + sizeof(struct td_sched));
1924}
1925
1926void
1927sched_tick(void)
1928{
1929}
1930
1931static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "Scheduler");
1932SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ule", 0,
1933    "Scheduler name");
1934SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0, "");
1935SYSCTL_INT(_kern_sched, OID_AUTO, interact, CTLFLAG_RW, &sched_interact, 0, "");
1936SYSCTL_INT(_kern_sched, OID_AUTO, tickincr, CTLFLAG_RD, &tickincr, 0, "");
1937SYSCTL_INT(_kern_sched, OID_AUTO, realstathz, CTLFLAG_RD, &realstathz, 0, "");
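
/*
 * The knobs above appear under the kern.sched sysctl tree at runtime,
 * e.g. "sysctl kern.sched.slice" to inspect the slice length in stathz
 * ticks or "sysctl kern.sched.interact=30" to adjust the interactivity
 * threshold; tickincr and realstathz are read-only and informational.
 * The example invocations assume the standard sysctl(8) utility.
 */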
1938
1939/* ps compat */
1940static fixpt_t  ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
1941SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
1942
1943
1944#define KERN_SWITCH_INCLUDE 1
1945#include "kern/kern_switch.c"
1946