sched_4bsd.c revision 135051
1104964Sjeff/*-
2104964Sjeff * Copyright (c) 1982, 1986, 1990, 1991, 1993
3104964Sjeff *	The Regents of the University of California.  All rights reserved.
4104964Sjeff * (c) UNIX System Laboratories, Inc.
5104964Sjeff * All or some portions of this file are derived from material licensed
6104964Sjeff * to the University of California by American Telephone and Telegraph
7104964Sjeff * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8104964Sjeff * the permission of UNIX System Laboratories, Inc.
9104964Sjeff *
10104964Sjeff * Redistribution and use in source and binary forms, with or without
11104964Sjeff * modification, are permitted provided that the following conditions
12104964Sjeff * are met:
13104964Sjeff * 1. Redistributions of source code must retain the above copyright
14104964Sjeff *    notice, this list of conditions and the following disclaimer.
15104964Sjeff * 2. Redistributions in binary form must reproduce the above copyright
16104964Sjeff *    notice, this list of conditions and the following disclaimer in the
17104964Sjeff *    documentation and/or other materials provided with the distribution.
18104964Sjeff * 4. Neither the name of the University nor the names of its contributors
19104964Sjeff *    may be used to endorse or promote products derived from this software
20104964Sjeff *    without specific prior written permission.
21104964Sjeff *
22104964Sjeff * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23104964Sjeff * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24104964Sjeff * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25104964Sjeff * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26104964Sjeff * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27104964Sjeff * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28104964Sjeff * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29104964Sjeff * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30104964Sjeff * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31104964Sjeff * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32104964Sjeff * SUCH DAMAGE.
33104964Sjeff */
34104964Sjeff
35116182Sobrien#include <sys/cdefs.h>
36116182Sobrien__FBSDID("$FreeBSD: head/sys/kern/sched_4bsd.c 135051 2004-09-10 21:04:38Z julian $");
37116182Sobrien
38134791Sjulian#define kse td_sched
39134791Sjulian
40104964Sjeff#include <sys/param.h>
41104964Sjeff#include <sys/systm.h>
42104964Sjeff#include <sys/kernel.h>
43104964Sjeff#include <sys/ktr.h>
44104964Sjeff#include <sys/lock.h>
45123871Sjhb#include <sys/kthread.h>
46104964Sjeff#include <sys/mutex.h>
47104964Sjeff#include <sys/proc.h>
48104964Sjeff#include <sys/resourcevar.h>
49104964Sjeff#include <sys/sched.h>
50104964Sjeff#include <sys/smp.h>
51104964Sjeff#include <sys/sysctl.h>
52104964Sjeff#include <sys/sx.h>
53134689Sjulian#include <machine/smp.h>
54104964Sjeff
55107135Sjeff/*
56107135Sjeff * INVERSE_ESTCPU_WEIGHT is only suitable for statclock() frequencies in
57107135Sjeff * the range 100-256 Hz (approximately).
58107135Sjeff */
59107135Sjeff#define	ESTCPULIM(e) \
60107135Sjeff    min((e), INVERSE_ESTCPU_WEIGHT * (NICE_WEIGHT * (PRIO_MAX - PRIO_MIN) - \
61107135Sjeff    RQ_PPQ) + INVERSE_ESTCPU_WEIGHT - 1)
62122355Sbde#ifdef SMP
63122355Sbde#define	INVERSE_ESTCPU_WEIGHT	(8 * smp_cpus)
64122355Sbde#else
65107135Sjeff#define	INVERSE_ESTCPU_WEIGHT	8	/* 1 / (priorities per estcpu level). */
66122355Sbde#endif
67107135Sjeff#define	NICE_WEIGHT		1	/* Priorities per nice level. */
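/*
 * For example, with the stock values PRIO_MIN = -20, PRIO_MAX = 20 and
 * RQ_PPQ = 4 on a UP kernel, ESTCPULIM() clamps kg_estcpu at
 * 8 * (40 - 4) + 8 - 1 = 295, keeping the estcpu term that feeds
 * resetpriority() within the timeshare priority range.
 */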
68107135Sjeff
69134791Sjulian/*
70134791Sjulian * The schedulable entity that can be given a context to run.
71134791Sjulian * A process may have several of these. Probably one per processor
72134791Sjulian * but possibly a few more. In this universe they are grouped
73134791Sjulian * with a KSEG that contains the priority and niceness
74134791Sjulian * for the group.
75134791Sjulian */
76134791Sjulianstruct kse {
77134791Sjulian	TAILQ_ENTRY(kse) ke_kglist;	/* (*) Queue of KSEs in ke_ksegrp. */
78134791Sjulian	TAILQ_ENTRY(kse) ke_kgrlist;	/* (*) Queue of KSEs in this state. */
79134791Sjulian	TAILQ_ENTRY(kse) ke_procq;	/* (j/z) Run queue. */
80134791Sjulian	struct thread	*ke_thread;	/* (*) Active associated thread. */
81134791Sjulian	fixpt_t		ke_pctcpu;	/* (j) %cpu during p_swtime. */
82134791Sjulian	u_char		ke_oncpu;	/* (j) Which cpu we are on. */
83134791Sjulian	char		ke_rqindex;	/* (j) Run queue index. */
84134791Sjulian	enum {
85134791Sjulian		KES_THREAD = 0x0,	/* slaved to thread state */
86134791Sjulian		KES_ONRUNQ
87134791Sjulian	} ke_state;			/* (j) KSE status. */
88134791Sjulian	int		ke_cpticks;	/* (j) Ticks of cpu time. */
89134791Sjulian	struct runq	*ke_runq;	/* runq the kse is currently on */
90134791Sjulian	int		ke_pinned;	/* nested count of pinned to a cpu */
91109145Sjeff};
92109145Sjeff
93134791Sjulian#define ke_proc		ke_thread->td_proc
94134791Sjulian#define ke_ksegrp	ke_thread->td_ksegrp
95134791Sjulian
96134791Sjulian#define td_kse td_sched
97134791Sjulian
98134791Sjulian/* flags kept in td_flags */
99134791Sjulian#define TDF_DIDRUN	TDF_SCHED0	/* KSE actually ran. */
100134791Sjulian#define TDF_EXIT	TDF_SCHED1	/* KSE is being killed. */
101134791Sjulian#define TDF_BOUND	TDF_SCHED2
102134791Sjulian
103134791Sjulian#define ke_flags	ke_thread->td_flags
104134791Sjulian#define KEF_DIDRUN	TDF_DIDRUN /* KSE actually ran. */
105134791Sjulian#define KEF_EXIT	TDF_EXIT /* KSE is being killed. */
106134791Sjulian#define KEF_BOUND	TDF_BOUND /* stuck to one CPU */
107134791Sjulian
108124955Sjeff#define SKE_RUNQ_PCPU(ke)						\
109124955Sjeff    ((ke)->ke_runq != 0 && (ke)->ke_runq != &runq)
110124955Sjeff
111134791Sjulianstruct kg_sched {
112134791Sjulian	struct thread	*skg_last_assigned; /* (j) Last thread assigned to */
113134791Sjulian					   /* the system scheduler. */
114134791Sjulian	int	skg_avail_opennings;	/* (j) Num unfilled slots in group. */
115134791Sjulian	int	skg_concurrency;	/* (j) Num KSEs requested in group. */
116134791Sjulian	int	skg_runq_kses;		/* (j) Num KSEs on runq. */
117134791Sjulian};
118134791Sjulian#define kg_last_assigned	kg_sched->skg_last_assigned
119134791Sjulian#define kg_avail_opennings	kg_sched->skg_avail_opennings
120134791Sjulian#define kg_concurrency		kg_sched->skg_concurrency
121134791Sjulian#define kg_runq_kses		kg_sched->skg_runq_kses
122134791Sjulian
123124955Sjeff/*
124124955Sjeff * KSE_CAN_MIGRATE macro returns true if the kse can migrate between
125125295Sjeff * cpus.
126124955Sjeff */
127124955Sjeff#define KSE_CAN_MIGRATE(ke)						\
128134791Sjulian    ((ke)->ke_pinned == 0 && ((ke)->ke_flags & KEF_BOUND) == 0)
129109145Sjeff
130134791Sjulianstatic struct kse kse0;
131134791Sjulianstatic struct kg_sched kg_sched0;
132104964Sjeff
133125288Sjeffstatic int	sched_tdcnt;	/* Total runnable threads in the system. */
134104964Sjeffstatic int	sched_quantum;	/* Roundrobin scheduling quantum in ticks. */
135112535Smux#define	SCHED_QUANTUM	(hz / 10)	/* Default sched quantum */
136104964Sjeff
137104964Sjeffstatic struct callout roundrobin_callout;
138104964Sjeff
139134791Sjulianstatic void	slot_fill(struct ksegrp *kg);
140134791Sjulianstatic struct kse *sched_choose(void);		/* XXX Should be thread * */
141134791Sjulian
142124955Sjeffstatic void	setup_runqs(void);
143104964Sjeffstatic void	roundrobin(void *arg);
144123871Sjhbstatic void	schedcpu(void);
145124955Sjeffstatic void	schedcpu_thread(void);
146104964Sjeffstatic void	sched_setup(void *dummy);
147104964Sjeffstatic void	maybe_resched(struct thread *td);
148104964Sjeffstatic void	updatepri(struct ksegrp *kg);
149104964Sjeffstatic void	resetpriority(struct ksegrp *kg);
150134694Sjulian#ifdef SMP
151134688Sjulianstatic int	forward_wakeup(int  cpunum);
152134694Sjulian#endif
153104964Sjeff
154124955Sjeffstatic struct kproc_desc sched_kp = {
155124955Sjeff        "schedcpu",
156124955Sjeff        schedcpu_thread,
157124955Sjeff        NULL
158124955Sjeff};
159124955SjeffSYSINIT(schedcpu, SI_SUB_RUN_SCHEDULER, SI_ORDER_FIRST, kproc_start, &sched_kp)
160124955SjeffSYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL)
161104964Sjeff
162104964Sjeff/*
163104964Sjeff * Global run queue.
164104964Sjeff */
165104964Sjeffstatic struct runq runq;
166104964Sjeff
167124955Sjeff#ifdef SMP
168124955Sjeff/*
169124955Sjeff * Per-CPU run queues
170124955Sjeff */
171124955Sjeffstatic struct runq runq_pcpu[MAXCPU];
172124955Sjeff#endif
173124955Sjeff
174124955Sjeffstatic void
175124955Sjeffsetup_runqs(void)
176124955Sjeff{
177124955Sjeff#ifdef SMP
178124955Sjeff	int i;
179124955Sjeff
180124955Sjeff	for (i = 0; i < MAXCPU; ++i)
181124955Sjeff		runq_init(&runq_pcpu[i]);
182124955Sjeff#endif
183124955Sjeff
184124955Sjeff	runq_init(&runq);
185124955Sjeff}
186124955Sjeff
187104964Sjeffstatic int
188104964Sjeffsysctl_kern_quantum(SYSCTL_HANDLER_ARGS)
189104964Sjeff{
190104964Sjeff	int error, new_val;
191104964Sjeff
192104964Sjeff	new_val = sched_quantum * tick;
193104964Sjeff	error = sysctl_handle_int(oidp, &new_val, 0, req);
194104964Sjeff        if (error != 0 || req->newptr == NULL)
195104964Sjeff		return (error);
196104964Sjeff	if (new_val < tick)
197104964Sjeff		return (EINVAL);
198104964Sjeff	sched_quantum = new_val / tick;
199104964Sjeff	hogticks = 2 * sched_quantum;
200104964Sjeff	return (0);
201104964Sjeff}
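/*
 * For example, with hz = 100 (tick = 10000us) the default quantum of
 * hz / 10 = 10 ticks reads back as 100000us; writing 50000 to the sysctl
 * sets sched_quantum to 5 ticks and hogticks to 10.
 */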
202104964Sjeff
203132589SscottlSYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RD, 0, "Scheduler");
204130881Sscottl
205132589SscottlSYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "4BSD", 0,
206132589Sscottl    "Scheduler name");
207130881Sscottl
208132589SscottlSYSCTL_PROC(_kern_sched, OID_AUTO, quantum, CTLTYPE_INT | CTLFLAG_RW,
209132589Sscottl    0, sizeof sched_quantum, sysctl_kern_quantum, "I",
210132589Sscottl    "Roundrobin scheduling quantum in microseconds");
211104964Sjeff
212134693Sjulian#ifdef SMP
213134688Sjulian/* Enable forwarding of wakeups to all other cpus */
214134688SjulianSYSCTL_NODE(_kern_sched, OID_AUTO, ipiwakeup, CTLFLAG_RD, NULL, "Kernel SMP");
215134688Sjulian
216134792Sjulianstatic int forward_wakeup_enabled = 1;
217134688SjulianSYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, enabled, CTLFLAG_RW,
218134688Sjulian	   &forward_wakeup_enabled, 0,
219134688Sjulian	   "Forwarding of wakeup to idle CPUs");
220134688Sjulian
221134688Sjulianstatic int forward_wakeups_requested = 0;
222134688SjulianSYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, requested, CTLFLAG_RD,
223134688Sjulian	   &forward_wakeups_requested, 0,
224134688Sjulian	   "Requests for Forwarding of wakeup to idle CPUs");
225134688Sjulian
226134688Sjulianstatic int forward_wakeups_delivered = 0;
227134688SjulianSYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, delivered, CTLFLAG_RD,
228134688Sjulian	   &forward_wakeups_delivered, 0,
229134688Sjulian	   "Completed Forwarding of wakeup to idle CPUs");
230134688Sjulian
231134792Sjulianstatic int forward_wakeup_use_mask = 1;
232134688SjulianSYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, usemask, CTLFLAG_RW,
233134688Sjulian	   &forward_wakeup_use_mask, 0,
234134688Sjulian	   "Use the mask of idle cpus");
235134688Sjulian
236134688Sjulianstatic int forward_wakeup_use_loop = 0;
237134688SjulianSYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, useloop, CTLFLAG_RW,
238134688Sjulian	   &forward_wakeup_use_loop, 0,
239134688Sjulian	   "Use a loop to find idle cpus");
240134688Sjulian
241134688Sjulianstatic int forward_wakeup_use_single = 0;
242134688SjulianSYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, onecpu, CTLFLAG_RW,
243134688Sjulian	   &forward_wakeup_use_single, 0,
244134688Sjulian	   "Only signal one idle cpu");
245134688Sjulian
246134688Sjulianstatic int forward_wakeup_use_htt = 0;
247134688SjulianSYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, htt2, CTLFLAG_RW,
248134688Sjulian	   &forward_wakeup_use_htt, 0,
249134688Sjulian	   "account for htt");
250135051Sjulian
251134693Sjulian#endif
252135051Sjulianstatic int sched_followon = 0;
253135051SjulianSYSCTL_INT(_kern_sched, OID_AUTO, followon, CTLFLAG_RW,
254135051Sjulian	   &sched_followon, 0,
255135051Sjulian	   "allow threads to share a quantum");
256134688Sjulian
257135051Sjulianstatic int sched_pfollowons = 0;
258135051SjulianSYSCTL_INT(_kern_sched, OID_AUTO, pfollowons, CTLFLAG_RD,
259135051Sjulian	   &sched_pfollowons, 0,
260135051Sjulian	   "number of followons done to a different ksegrp");
261135051Sjulian
262135051Sjulianstatic int sched_kgfollowons = 0;
263135051SjulianSYSCTL_INT(_kern_sched, OID_AUTO, kgfollowons, CTLFLAG_RD,
264135051Sjulian	   &sched_kgfollowons, 0,
265135051Sjulian	   "number of followons done in a ksegrp");
266135051Sjulian
267104964Sjeff/*
268104964Sjeff * Arrange to reschedule if necessary, taking the priorities and
269104964Sjeff * schedulers into account.
270104964Sjeff */
271104964Sjeffstatic void
272104964Sjeffmaybe_resched(struct thread *td)
273104964Sjeff{
274104964Sjeff
275104964Sjeff	mtx_assert(&sched_lock, MA_OWNED);
276134791Sjulian	if (td->td_priority < curthread->td_priority)
277111032Sjulian		curthread->td_flags |= TDF_NEEDRESCHED;
278104964Sjeff}
279104964Sjeff
280104964Sjeff/*
281104964Sjeff * Force switch among equal priority processes every 100ms.
282104964Sjeff * We don't actually need to force a context switch of the current process.
283104964Sjeff * The act of firing the event triggers a context switch to softclock() and
284104964Sjeff * then switching back out again which is equivalent to a preemption, thus
285104964Sjeff * no further work is needed on the local CPU.
286104964Sjeff */
287104964Sjeff/* ARGSUSED */
288104964Sjeffstatic void
289104964Sjeffroundrobin(void *arg)
290104964Sjeff{
291104964Sjeff
292104964Sjeff#ifdef SMP
293104964Sjeff	mtx_lock_spin(&sched_lock);
294104964Sjeff	forward_roundrobin();
295104964Sjeff	mtx_unlock_spin(&sched_lock);
296104964Sjeff#endif
297104964Sjeff
298104964Sjeff	callout_reset(&roundrobin_callout, sched_quantum, roundrobin, NULL);
299104964Sjeff}
300104964Sjeff
301104964Sjeff/*
302104964Sjeff * Constants for digital decay and forget:
303118972Sjhb *	90% of (kg_estcpu) usage in 5 * loadav time
304118972Sjhb *	95% of (ke_pctcpu) usage in 60 seconds (load insensitive)
305104964Sjeff *          Note that, as ps(1) mentions, this can let percentages
306104964Sjeff *          total over 100% (I've seen 137.9% for 3 processes).
307104964Sjeff *
308118972Sjhb * Note that schedclock() updates kg_estcpu and p_cpticks asynchronously.
309104964Sjeff *
310118972Sjhb * We wish to decay away 90% of kg_estcpu in (5 * loadavg) seconds.
311104964Sjeff * That is, the system wants to compute a value of decay such
312104964Sjeff * that the following for loop:
313104964Sjeff * 	for (i = 0; i < (5 * loadavg); i++)
314118972Sjhb * 		kg_estcpu *= decay;
315104964Sjeff * will compute
316118972Sjhb * 	kg_estcpu *= 0.1;
317104964Sjeff * for all values of loadavg:
318104964Sjeff *
319104964Sjeff * Mathematically this loop can be expressed by saying:
320104964Sjeff * 	decay ** (5 * loadavg) ~= .1
321104964Sjeff *
322104964Sjeff * The system computes decay as:
323104964Sjeff * 	decay = (2 * loadavg) / (2 * loadavg + 1)
324104964Sjeff *
325104964Sjeff * We wish to prove that the system's computation of decay
326104964Sjeff * will always fulfill the equation:
327104964Sjeff * 	decay ** (5 * loadavg) ~= .1
328104964Sjeff *
329104964Sjeff * If we compute b as:
330104964Sjeff * 	b = 2 * loadavg
331104964Sjeff * then
332104964Sjeff * 	decay = b / (b + 1)
333104964Sjeff *
334104964Sjeff * We now need to prove two things:
335104964Sjeff *	1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
336104964Sjeff *	2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
337104964Sjeff *
338104964Sjeff * Facts:
339104964Sjeff *         For x close to zero, exp(x) =~ 1 + x, since
340104964Sjeff *              exp(x) = 0! + x**1/1! + x**2/2! + ... .
341104964Sjeff *              therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
342104964Sjeff *         For x close to zero, ln(1+x) =~ x, since
343104964Sjeff *              ln(1+x) = x - x**2/2 + x**3/3 - ...     -1 < x < 1
344104964Sjeff *              therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
345104964Sjeff *         ln(.1) =~ -2.30
346104964Sjeff *
347104964Sjeff * Proof of (1):
348104964Sjeff *    Solve (factor)**(power) =~ .1 given power (5*loadav):
349104964Sjeff *	solving for factor,
350104964Sjeff *      ln(factor) =~ (-2.30/5*loadav), or
351104964Sjeff *      factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
352104964Sjeff *          exp(-1/b) =~ (b-1)/b =~ b/(b+1).                    QED
353104964Sjeff *
354104964Sjeff * Proof of (2):
355104964Sjeff *    Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
356104964Sjeff *	solving for power,
357104964Sjeff *      power*ln(b/(b+1)) =~ -2.30, or
358104964Sjeff *      power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.  QED
359104964Sjeff *
360104964Sjeff * Actual power values for the implemented algorithm are as follows:
361104964Sjeff *      loadav: 1       2       3       4
362104964Sjeff *      power:  5.68    10.32   14.94   19.55
363104964Sjeff */
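/*
 * For example, with loadav = 2 the computed decay is 4/5, and
 * (4/5)**10 =~ 0.107, which matches the goal of forgetting 90% of
 * kg_estcpu in 5 * loadav = 10 seconds (the table above lists the
 * exact power, 10.32, needed to reach 0.1).
 */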
364104964Sjeff
365104964Sjeff/* calculations for digital decay to forget 90% of usage in 5*loadav sec */
366104964Sjeff#define	loadfactor(loadav)	(2 * (loadav))
367104964Sjeff#define	decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))
368104964Sjeff
369118972Sjhb/* decay 95% of `ke_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
370104964Sjeffstatic fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;	/* exp(-1/20) */
371104964SjeffSYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
372104964Sjeff
373104964Sjeff/*
374104964Sjeff * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
375104964Sjeff * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
376104964Sjeff * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
377104964Sjeff *
378104964Sjeff * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
379104964Sjeff *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
380104964Sjeff *
381104964Sjeff * If you don't want to bother with the faster/more-accurate formula, you
382104964Sjeff * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
383104964Sjeff * (more general) method of calculating the %age of CPU used by a process.
384104964Sjeff */
385104964Sjeff#define	CCPU_SHIFT	11
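/*
 * As a check on the "95% in 60 seconds" claim above: schedcpu() multiplies
 * ke_pctcpu by ccpu once a second, so after 60 idle seconds a kse retains
 * exp(-60/20) = exp(-3) =~ 5% of its old ke_pctcpu.
 */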
386104964Sjeff
387104964Sjeff/*
388104964Sjeff * Recompute process priorities, every hz ticks.
389104964Sjeff * MP-safe, called without the Giant mutex.
390104964Sjeff */
391104964Sjeff/* ARGSUSED */
392104964Sjeffstatic void
393123871Sjhbschedcpu(void)
394104964Sjeff{
395104964Sjeff	register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
396104964Sjeff	struct thread *td;
397104964Sjeff	struct proc *p;
398104964Sjeff	struct kse *ke;
399104964Sjeff	struct ksegrp *kg;
400118972Sjhb	int awake, realstathz;
401104964Sjeff
402104964Sjeff	realstathz = stathz ? stathz : hz;
403104964Sjeff	sx_slock(&allproc_lock);
404104964Sjeff	FOREACH_PROC_IN_SYSTEM(p) {
405118972Sjhb		/*
406118972Sjhb		 * Prevent state changes and protect run queue.
407118972Sjhb		 */
408104964Sjeff		mtx_lock_spin(&sched_lock);
409118972Sjhb		/*
410118972Sjhb		 * Increment time in/out of memory.  We ignore overflow; with
411118972Sjhb		 * 16-bit int's (remember them?) overflow takes 45 days.
412118972Sjhb		 */
413104964Sjeff		p->p_swtime++;
414104964Sjeff		FOREACH_KSEGRP_IN_PROC(p, kg) {
415104964Sjeff			awake = 0;
416134791Sjulian			FOREACH_THREAD_IN_GROUP(kg, td) {
417134791Sjulian				ke = td->td_kse;
418104964Sjeff				/*
419118972Sjhb				 * Increment sleep time (if sleeping).  We
420118972Sjhb				 * ignore overflow, as above.
421104964Sjeff				 */
422104964Sjeff				/*
423104964Sjeff				 * The kse slptimes are not touched in wakeup
424104964Sjeff				 * because the thread may not HAVE a KSE.
425104964Sjeff				 */
426104964Sjeff				if (ke->ke_state == KES_ONRUNQ) {
427104964Sjeff					awake = 1;
428104964Sjeff					ke->ke_flags &= ~KEF_DIDRUN;
429104964Sjeff				} else if ((ke->ke_state == KES_THREAD) &&
430134791Sjulian				    (TD_IS_RUNNING(td))) {
431104964Sjeff					awake = 1;
432104964Sjeff					/* Do not clear KEF_DIDRUN */
433104964Sjeff				} else if (ke->ke_flags & KEF_DIDRUN) {
434104964Sjeff					awake = 1;
435104964Sjeff					ke->ke_flags &= ~KEF_DIDRUN;
436104964Sjeff				}
437104964Sjeff
438104964Sjeff				/*
439118972Sjhb				 * ke_pctcpu is only for ps and ttyinfo().
440118972Sjhb				 * Do it per kse, and add them up at the end?
441104964Sjeff				 * XXXKSE
442104964Sjeff				 */
443118972Sjhb				ke->ke_pctcpu = (ke->ke_pctcpu * ccpu) >>
444109145Sjeff				    FSHIFT;
445104964Sjeff				/*
446104964Sjeff				 * If the kse has been idle the entire second,
447104964Sjeff				 * stop recalculating its priority until
448104964Sjeff				 * it wakes up.
449104964Sjeff				 */
450134145Sjulian				if (ke->ke_cpticks == 0)
451104964Sjeff					continue;
452104964Sjeff#if	(FSHIFT >= CCPU_SHIFT)
453109157Sjeff				ke->ke_pctcpu += (realstathz == 100)
454134145Sjulian				    ? ((fixpt_t) ke->ke_cpticks) <<
455104964Sjeff				    (FSHIFT - CCPU_SHIFT) :
456134145Sjulian				    100 * (((fixpt_t) ke->ke_cpticks)
457109145Sjeff				    << (FSHIFT - CCPU_SHIFT)) / realstathz;
458104964Sjeff#else
459109157Sjeff				ke->ke_pctcpu += ((FSCALE - ccpu) *
460134145Sjulian				    (ke->ke_cpticks *
461109145Sjeff				    FSCALE / realstathz)) >> FSHIFT;
462104964Sjeff#endif
463134145Sjulian				ke->ke_cpticks = 0;
464104964Sjeff			} /* end of kse loop */
465104964Sjeff			/*
466104964Sjeff			 * If there are ANY running threads in this KSEGRP,
467104964Sjeff			 * then don't count it as sleeping.
468104964Sjeff			 */
469104964Sjeff			if (awake) {
470104964Sjeff				if (kg->kg_slptime > 1) {
471104964Sjeff					/*
472104964Sjeff					 * In an ideal world, this should not
473104964Sjeff					 * happen, because whoever woke us
474104964Sjeff					 * up from the long sleep should have
475104964Sjeff					 * unwound the slptime and reset our
476104964Sjeff					 * priority before we run at the stale
477104964Sjeff					 * priority.  Should KASSERT at some
478104964Sjeff					 * point when all the cases are fixed.
479104964Sjeff					 */
480104964Sjeff					updatepri(kg);
481104964Sjeff				}
482104964Sjeff				kg->kg_slptime = 0;
483118972Sjhb			} else
484104964Sjeff				kg->kg_slptime++;
485104964Sjeff			if (kg->kg_slptime > 1)
486104964Sjeff				continue;
487104964Sjeff			kg->kg_estcpu = decay_cpu(loadfac, kg->kg_estcpu);
488104964Sjeff		      	resetpriority(kg);
489104964Sjeff			FOREACH_THREAD_IN_GROUP(kg, td) {
490104964Sjeff				if (td->td_priority >= PUSER) {
491105127Sjulian					sched_prio(td, kg->kg_user_pri);
492104964Sjeff				}
493104964Sjeff			}
494104964Sjeff		} /* end of ksegrp loop */
495104964Sjeff		mtx_unlock_spin(&sched_lock);
496104964Sjeff	} /* end of process loop */
497104964Sjeff	sx_sunlock(&allproc_lock);
498104964Sjeff}
499104964Sjeff
500104964Sjeff/*
501123871Sjhb * Main loop for a kthread that executes schedcpu once a second.
502123871Sjhb */
503123871Sjhbstatic void
504124955Sjeffschedcpu_thread(void)
505123871Sjhb{
506123871Sjhb	int nowake;
507123871Sjhb
508123871Sjhb	for (;;) {
509123871Sjhb		schedcpu();
510123871Sjhb		tsleep(&nowake, curthread->td_priority, "-", hz);
511123871Sjhb	}
512123871Sjhb}
513123871Sjhb
514123871Sjhb/*
515104964Sjeff * Recalculate the priority of a process after it has slept for a while.
516118972Sjhb * For all load averages >= 1 and max kg_estcpu of 255, sleeping for at
517118972Sjhb * least six times the loadfactor will decay kg_estcpu to zero.
518104964Sjeff */
519104964Sjeffstatic void
520104964Sjeffupdatepri(struct ksegrp *kg)
521104964Sjeff{
522118972Sjhb	register fixpt_t loadfac;
523104964Sjeff	register unsigned int newcpu;
524104964Sjeff
525118972Sjhb	loadfac = loadfactor(averunnable.ldavg[0]);
526104964Sjeff	if (kg->kg_slptime > 5 * loadfac)
527104964Sjeff		kg->kg_estcpu = 0;
528104964Sjeff	else {
529118972Sjhb		newcpu = kg->kg_estcpu;
530118972Sjhb		kg->kg_slptime--;	/* was incremented in schedcpu() */
531104964Sjeff		while (newcpu && --kg->kg_slptime)
532104964Sjeff			newcpu = decay_cpu(loadfac, newcpu);
533104964Sjeff		kg->kg_estcpu = newcpu;
534104964Sjeff	}
535104964Sjeff	resetpriority(kg);
536104964Sjeff}
537104964Sjeff
538104964Sjeff/*
539104964Sjeff * Compute the priority of a process when running in user mode.
540104964Sjeff * Arrange to reschedule if the resulting priority is better
541104964Sjeff * than that of the current process.
542104964Sjeff */
543104964Sjeffstatic void
544104964Sjeffresetpriority(struct ksegrp *kg)
545104964Sjeff{
546104964Sjeff	register unsigned int newpriority;
547104964Sjeff	struct thread *td;
548104964Sjeff
549104964Sjeff	if (kg->kg_pri_class == PRI_TIMESHARE) {
550104964Sjeff		newpriority = PUSER + kg->kg_estcpu / INVERSE_ESTCPU_WEIGHT +
551130551Sjulian		    NICE_WEIGHT * (kg->kg_proc->p_nice - PRIO_MIN);
552104964Sjeff		newpriority = min(max(newpriority, PRI_MIN_TIMESHARE),
553104964Sjeff		    PRI_MAX_TIMESHARE);
554104964Sjeff		kg->kg_user_pri = newpriority;
555104964Sjeff	}
556104964Sjeff	FOREACH_THREAD_IN_GROUP(kg, td) {
557104964Sjeff		maybe_resched(td);			/* XXXKSE silly */
558104964Sjeff	}
559104964Sjeff}
560104964Sjeff
561104964Sjeff/* ARGSUSED */
562104964Sjeffstatic void
563104964Sjeffsched_setup(void *dummy)
564104964Sjeff{
565124955Sjeff	setup_runqs();
566118972Sjhb
567104964Sjeff	if (sched_quantum == 0)
568104964Sjeff		sched_quantum = SCHED_QUANTUM;
569104964Sjeff	hogticks = 2 * sched_quantum;
570104964Sjeff
571126665Srwatson	callout_init(&roundrobin_callout, CALLOUT_MPSAFE);
572104964Sjeff
573104964Sjeff	/* Kick off timeout driven events by calling first time. */
574104964Sjeff	roundrobin(NULL);
575125288Sjeff
576125288Sjeff	/* Account for thread0. */
577125288Sjeff	sched_tdcnt++;
578104964Sjeff}
579104964Sjeff
580104964Sjeff/* External interfaces start here */
581134791Sjulian/*
582134791Sjulian * Very early in the boot, some setup of scheduler-specific
583134791Sjulian * parts of proc0 and of some scheduler resources needs to be done.
584134791Sjulian * Called from:
585134791Sjulian *  proc0_init()
586134791Sjulian */
587134791Sjulianvoid
588134791Sjulianschedinit(void)
589134791Sjulian{
590134791Sjulian	/*
591134791Sjulian	 * Set up the scheduler specific parts of proc0.
592134791Sjulian	 */
593134791Sjulian	proc0.p_sched = NULL; /* XXX */
594134791Sjulian	ksegrp0.kg_sched = &kg_sched0;
595134791Sjulian	thread0.td_sched = &kse0;
596134791Sjulian	kse0.ke_thread = &thread0;
597134791Sjulian	kse0.ke_oncpu = NOCPU; /* wrong.. can we use PCPU(cpuid) yet? */
598134791Sjulian	kse0.ke_state = KES_THREAD;
599134791Sjulian	kg_sched0.skg_concurrency = 1;
600134791Sjulian	kg_sched0.skg_avail_opennings = 0; /* we are already running */
601134791Sjulian}
602134791Sjulian
603104964Sjeffint
604104964Sjeffsched_runnable(void)
605104964Sjeff{
606124955Sjeff#ifdef SMP
607124955Sjeff	return runq_check(&runq) + runq_check(&runq_pcpu[PCPU_GET(cpuid)]);
608124955Sjeff#else
609124955Sjeff	return runq_check(&runq);
610124955Sjeff#endif
611104964Sjeff}
612104964Sjeff
613104964Sjeffint
614104964Sjeffsched_rr_interval(void)
615104964Sjeff{
616104964Sjeff	if (sched_quantum == 0)
617104964Sjeff		sched_quantum = SCHED_QUANTUM;
618104964Sjeff	return (sched_quantum);
619104964Sjeff}
620104964Sjeff
621104964Sjeff/*
622104964Sjeff * We adjust the priority of the current process.  The priority of
623104964Sjeff * a process gets worse as it accumulates CPU time.  The cpu usage
624118972Sjhb * estimator (kg_estcpu) is increased here.  resetpriority() will
625118972Sjhb * compute a different priority each time kg_estcpu increases by
626104964Sjeff * INVERSE_ESTCPU_WEIGHT
627104964Sjeff * (until MAXPRI is reached).  The cpu usage estimator ramps up
628104964Sjeff * quite quickly when the process is running (linearly), and decays
629104964Sjeff * away exponentially, at a rate which is proportionally slower when
630104964Sjeff * the system is busy.  The basic principle is that the system will
631104964Sjeff * 90% forget that the process used a lot of CPU time in 5 * loadav
632104964Sjeff * seconds.  This causes the system to favor processes which haven't
633104964Sjeff * run much recently, and to round-robin among other processes.
634104964Sjeff */
635104964Sjeffvoid
636121127Sjeffsched_clock(struct thread *td)
637104964Sjeff{
638104964Sjeff	struct ksegrp *kg;
639121127Sjeff	struct kse *ke;
640104964Sjeff
641113923Sjhb	mtx_assert(&sched_lock, MA_OWNED);
642121127Sjeff	kg = td->td_ksegrp;
643121127Sjeff	ke = td->td_kse;
644113356Sjeff
645134145Sjulian	ke->ke_cpticks++;
646104964Sjeff	kg->kg_estcpu = ESTCPULIM(kg->kg_estcpu + 1);
647104964Sjeff	if ((kg->kg_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) {
648104964Sjeff		resetpriority(kg);
649104964Sjeff		if (td->td_priority >= PUSER)
650104964Sjeff			td->td_priority = kg->kg_user_pri;
651104964Sjeff	}
652104964Sjeff}
653118972Sjhb
654104964Sjeff/*
655104964Sjeff * Charge the child's scheduling CPU usage to the parent.
656104964Sjeff *
657104964Sjeff * XXXKSE assume only one thread & kse & ksegrp keep estcpu in each ksegrp.
658104964Sjeff * Charge it to the ksegrp that did the wait, since process estcpu is the sum of
659104964Sjeff * all ksegrps; this is strictly as expected.  Assume that the child process
660104964Sjeff * aggregated all the estcpu into the 'built-in' ksegrp.
661104964Sjeff */
662104964Sjeffvoid
663132372Sjuliansched_exit(struct proc *p, struct thread *td)
664104964Sjeff{
665132372Sjulian	sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), td);
666132372Sjulian	sched_exit_thread(FIRST_THREAD_IN_PROC(p), td);
667113356Sjeff}
668113356Sjeff
669113356Sjeffvoid
670132372Sjuliansched_exit_ksegrp(struct ksegrp *kg, struct thread *childtd)
671113356Sjeff{
672113923Sjhb
673113923Sjhb	mtx_assert(&sched_lock, MA_OWNED);
674132372Sjulian	kg->kg_estcpu = ESTCPULIM(kg->kg_estcpu + childtd->td_ksegrp->kg_estcpu);
675104964Sjeff}
676104964Sjeff
677104964Sjeffvoid
678113356Sjeffsched_exit_thread(struct thread *td, struct thread *child)
679104964Sjeff{
680127894Sdfr	if ((child->td_proc->p_flag & P_NOLOAD) == 0)
681125288Sjeff		sched_tdcnt--;
682113356Sjeff}
683109145Sjeff
684113356Sjeffvoid
685134791Sjuliansched_fork(struct thread *td, struct thread *childtd)
686113356Sjeff{
687134791Sjulian	sched_fork_ksegrp(td, childtd->td_ksegrp);
688134791Sjulian	sched_fork_thread(td, childtd);
689113356Sjeff}
690113356Sjeff
691113356Sjeffvoid
692132372Sjuliansched_fork_ksegrp(struct thread *td, struct ksegrp *child)
693113356Sjeff{
694113923Sjhb	mtx_assert(&sched_lock, MA_OWNED);
695132372Sjulian	child->kg_estcpu = td->td_ksegrp->kg_estcpu;
696113356Sjeff}
697109145Sjeff
698113356Sjeffvoid
699134791Sjuliansched_fork_thread(struct thread *td, struct thread *childtd)
700113356Sjeff{
701134791Sjulian	sched_newthread(childtd);
702104964Sjeff}
703104964Sjeff
704104964Sjeffvoid
705130551Sjuliansched_nice(struct proc *p, int nice)
706104964Sjeff{
707130551Sjulian	struct ksegrp *kg;
708113873Sjhb
709130551Sjulian	PROC_LOCK_ASSERT(p, MA_OWNED);
710113873Sjhb	mtx_assert(&sched_lock, MA_OWNED);
711130551Sjulian	p->p_nice = nice;
712130551Sjulian	FOREACH_KSEGRP_IN_PROC(p, kg) {
713130551Sjulian		resetpriority(kg);
714130551Sjulian	}
715104964Sjeff}
716104964Sjeff
717113356Sjeffvoid
718113356Sjeffsched_class(struct ksegrp *kg, int class)
719113356Sjeff{
720113923Sjhb	mtx_assert(&sched_lock, MA_OWNED);
721113356Sjeff	kg->kg_pri_class = class;
722113356Sjeff}
723113356Sjeff
724105127Sjulian/*
725105127Sjulian * Adjust the priority of a thread.
726105127Sjulian * This may include moving the thread within the KSEGRP,
727105127Sjulian * changing the assignment of a kse to the thread,
728105127Sjulian * and moving a KSE in the system run queue.
729105127Sjulian */
730104964Sjeffvoid
731104964Sjeffsched_prio(struct thread *td, u_char prio)
732104964Sjeff{
733104964Sjeff
734113923Sjhb	mtx_assert(&sched_lock, MA_OWNED);
735104964Sjeff	if (TD_ON_RUNQ(td)) {
736105127Sjulian		adjustrunqueue(td, prio);
737105127Sjulian	} else {
738105127Sjulian		td->td_priority = prio;
739104964Sjeff	}
740104964Sjeff}
741104964Sjeff
742104964Sjeffvoid
743126326Sjhbsched_sleep(struct thread *td)
744104964Sjeff{
745113923Sjhb
746113923Sjhb	mtx_assert(&sched_lock, MA_OWNED);
747104964Sjeff	td->td_ksegrp->kg_slptime = 0;
748126326Sjhb	td->td_base_pri = td->td_priority;
749104964Sjeff}
750104964Sjeff
751135051Sjulianstatic void remrunqueue(struct thread *td);
752135051Sjulian
753104964Sjeffvoid
754135051Sjuliansched_switch(struct thread *td, struct thread *newtd, int flags)
755104964Sjeff{
756104964Sjeff	struct kse *ke;
757135051Sjulian	struct ksegrp *kg;
758104964Sjeff	struct proc *p;
759104964Sjeff
760104964Sjeff	ke = td->td_kse;
761104964Sjeff	p = td->td_proc;
762104964Sjeff
763113923Sjhb	mtx_assert(&sched_lock, MA_OWNED);
764104964Sjeff
765125295Sjeff	if ((p->p_flag & P_NOLOAD) == 0)
766125288Sjeff		sched_tdcnt--;
767135051Sjulian
768134791Sjulian	/*
769135051Sjulian	 * We are volunteering to switch out, so we get to nominate
770135051Sjulian	 * a successor for the rest of our quantum.
771135051Sjulian	 * First try another thread in our ksegrp, and then look for
772135051Sjulian	 * other ksegrps in our process.
773135051Sjulian	 */
774135051Sjulian	if (sched_followon &&
775135051Sjulian	    (p->p_flag & P_HADTHREADS) &&
776135051Sjulian	    (flags & SW_VOL) &&
777135051Sjulian	    newtd == NULL) {
778135051Sjulian		/* Let's schedule another thread from this process. */
779135051Sjulian		 kg = td->td_ksegrp;
780135051Sjulian		 if ((newtd = TAILQ_FIRST(&kg->kg_runq))) {
781135051Sjulian			remrunqueue(newtd);
782135051Sjulian			sched_kgfollowons++;
783135051Sjulian		 } else {
784135051Sjulian			FOREACH_KSEGRP_IN_PROC(p, kg) {
785135051Sjulian				if ((newtd = TAILQ_FIRST(&kg->kg_runq))) {
786135051Sjulian					sched_pfollowons++;
787135051Sjulian					remrunqueue(newtd);
788135051Sjulian					break;
789135051Sjulian				}
790135051Sjulian			}
791135051Sjulian		}
792135051Sjulian	}
793135051Sjulian
794135051Sjulian	/*
795134791Sjulian	 * The thread we are about to run needs to be counted as if it had been
796134791Sjulian	 * added to the run queue and selected.
797134791Sjulian	 */
798134791Sjulian	if (newtd) {
799134791Sjulian		newtd->td_ksegrp->kg_avail_opennings--;
800134791Sjulian		newtd->td_kse->ke_flags |= KEF_DIDRUN;
801134791Sjulian        	TD_SET_RUNNING(newtd);
802134832Sjulian		if ((newtd->td_proc->p_flag & P_NOLOAD) == 0)
803134832Sjulian			sched_tdcnt++;
804134791Sjulian	}
805135051Sjulian
806113339Sjulian	td->td_lastcpu = td->td_oncpu;
807132266Sjhb	td->td_flags &= ~TDF_NEEDRESCHED;
808132266Sjhb	td->td_pflags &= ~TDP_OWEPREEMPT;
809113339Sjulian	td->td_oncpu = NOCPU;
810104964Sjeff	/*
811104964Sjeff	 * At the last moment, if this thread is still marked RUNNING,
812104964Sjeff	 * then put it back on the run queue as it has not been suspended
813131473Sjhb	 * or stopped or anything else similar.  We never put the idle
814131473Sjhb	 * threads on the run queue, however.
815104964Sjeff	 */
816131473Sjhb	if (td == PCPU_GET(idlethread))
817131473Sjhb		TD_SET_CAN_RUN(td);
818134791Sjulian	else {
819134791Sjulian		td->td_ksegrp->kg_avail_opennings++;
820134791Sjulian		if (TD_IS_RUNNING(td)) {
821134791Sjulian			/* Put us back on the run queue (kse and all). */
822134791Sjulian			setrunqueue(td, SRQ_OURSELF|SRQ_YIELDING);
823134791Sjulian		} else if (p->p_flag & P_HADTHREADS) {
824134791Sjulian			/*
825134791Sjulian			 * We will not be on the run queue. So we must be
826134791Sjulian			 * sleeping or similar. As it's available,
827134791Sjulian			 * someone else can use the KSE if they need it.
828134791Sjulian			 */
829134791Sjulian			slot_fill(td->td_ksegrp);
830134791Sjulian		}
831104964Sjeff	}
832131473Sjhb	if (newtd == NULL)
833131473Sjhb		newtd = choosethread();
834121128Sjeff	if (td != newtd)
835121128Sjeff		cpu_switch(td, newtd);
836121128Sjeff	sched_lock.mtx_lock = (uintptr_t)td;
837121128Sjeff	td->td_oncpu = PCPU_GET(cpuid);
838104964Sjeff}
839104964Sjeff
840104964Sjeffvoid
841104964Sjeffsched_wakeup(struct thread *td)
842104964Sjeff{
843104964Sjeff	struct ksegrp *kg;
844104964Sjeff
845113923Sjhb	mtx_assert(&sched_lock, MA_OWNED);
846104964Sjeff	kg = td->td_ksegrp;
847104964Sjeff	if (kg->kg_slptime > 1)
848104964Sjeff		updatepri(kg);
849104964Sjeff	kg->kg_slptime = 0;
850134586Sjulian	setrunqueue(td, SRQ_BORING);
851104964Sjeff}
852104964Sjeff
853134693Sjulian#ifdef SMP
854134688Sjulian/* Enable HTT_2 if you have a 2-way HTT cpu. */
855134688Sjulianstatic int
856134688Sjulianforward_wakeup(int  cpunum)
857134688Sjulian{
858134688Sjulian	cpumask_t map, me, dontuse;
859134688Sjulian	cpumask_t map2;
860134688Sjulian	struct pcpu *pc;
861134688Sjulian	cpumask_t id, map3;
862134688Sjulian
863134688Sjulian	mtx_assert(&sched_lock, MA_OWNED);
864134688Sjulian
865134791Sjulian	CTR0(KTR_RUNQ, "forward_wakeup()");
866134688Sjulian
867134688Sjulian	if ((!forward_wakeup_enabled) ||
868134688Sjulian	     (forward_wakeup_use_mask == 0 && forward_wakeup_use_loop == 0))
869134688Sjulian		return (0);
870134688Sjulian	if (!smp_started || cold || panicstr)
871134688Sjulian		return (0);
872134688Sjulian
873134688Sjulian	forward_wakeups_requested++;
874134688Sjulian
875134688Sjulian/*
876134688Sjulian * check the idle mask we received against what we calculated before
877134688Sjulian * in the old version.
878134688Sjulian */
879134688Sjulian	me = PCPU_GET(cpumask);
880134688Sjulian	/*
881134688Sjulian	 * Don't bother if we should be doing it ourselves.
882134688Sjulian	 */
883134688Sjulian	if ((me & idle_cpus_mask) && (cpunum == NOCPU || me == (1 << cpunum)))
884134688Sjulian		return (0);
885134688Sjulian
886134688Sjulian	dontuse = me | stopped_cpus | hlt_cpus_mask;
887134688Sjulian	map3 = 0;
888134688Sjulian	if (forward_wakeup_use_loop) {
889134688Sjulian		SLIST_FOREACH(pc, &cpuhead, pc_allcpu) {
890134688Sjulian			id = pc->pc_cpumask;
891134688Sjulian			if ( (id & dontuse) == 0 &&
892134688Sjulian			    pc->pc_curthread == pc->pc_idlethread) {
893134688Sjulian				map3 |= id;
894134688Sjulian			}
895134688Sjulian		}
896134688Sjulian	}
897134688Sjulian
898134688Sjulian	if (forward_wakeup_use_mask) {
899134688Sjulian		map = 0;
900134688Sjulian		map = idle_cpus_mask & ~dontuse;
901134688Sjulian
902134688Sjulian		/* If they are both on, compare and use loop if different */
903134688Sjulian		if (forward_wakeup_use_loop) {
904134688Sjulian			if (map != map3) {
905134688Sjulian				printf("map (%02X) != map3 (%02X)\n",
906134688Sjulian						map, map3);
907134688Sjulian				map = map3;
908134688Sjulian			}
909134688Sjulian		}
910134688Sjulian	} else {
911134688Sjulian		map = map3;
912134688Sjulian	}
913134688Sjulian	/* If we only allow a specific CPU, then mask off all the others */
914134688Sjulian	if (cpunum != NOCPU) {
915134688Sjulian		KASSERT((cpunum <= mp_maxcpus),("forward_wakeup: bad cpunum."));
916134688Sjulian		map &= (1 << cpunum);
917134688Sjulian	} else {
918134688Sjulian		/* Try to choose an idle die. */
919134688Sjulian		if (forward_wakeup_use_htt) {
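			/*
			 * map & (map >> 1) leaves a bit set only where two
			 * consecutively numbered CPUs are both idle; masking
			 * with 0x5555 keeps the even bit of each such pair,
			 * i.e. one whole idle die on a 2-way HTT system.
			 */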
920134688Sjulian			map2 =  (map & (map >> 1)) & 0x5555;
921134688Sjulian			if (map2) {
922134688Sjulian				map = map2;
923134688Sjulian			}
924134688Sjulian		}
925134688Sjulian
926134688Sjulian		/* set only one bit */
927134688Sjulian		if (forward_wakeup_use_single) {
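			/*
			 * x & (~x + 1) isolates the lowest set bit, so only
			 * the lowest-numbered idle CPU gets the IPI.
			 */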
928134688Sjulian			map = map & ((~map) + 1);
929134688Sjulian		}
930134688Sjulian	}
931134688Sjulian	if (map) {
932134688Sjulian		forward_wakeups_delivered++;
933134688Sjulian		ipi_selected(map, IPI_AST);
934134688Sjulian		return (1);
935134688Sjulian	}
936134688Sjulian	if (cpunum == NOCPU)
937134688Sjulian		printf("forward_wakeup: Idle processor not found\n");
938134688Sjulian	return (0);
939134688Sjulian}
940134693Sjulian#endif
941134688Sjulian
942104964Sjeffvoid
943134586Sjuliansched_add(struct thread *td, int flags)
944104964Sjeff{
945121127Sjeff	struct kse *ke;
946134591Sjulian#ifdef SMP
947134591Sjulian	int forwarded = 0;
948134591Sjulian	int cpu;
949134591Sjulian#endif
950121127Sjeff
951121127Sjeff	ke = td->td_kse;
952104964Sjeff	mtx_assert(&sched_lock, MA_OWNED);
953104964Sjeff	KASSERT(ke->ke_state != KES_ONRUNQ,
954124957Sjeff	    ("sched_add: kse %p (%s) already in run queue", ke,
955104964Sjeff	    ke->ke_proc->p_comm));
956104964Sjeff	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
957124957Sjeff	    ("sched_add: process swapped out"));
958131481Sjhb
959131481Sjhb#ifdef SMP
960124955Sjeff	if (KSE_CAN_MIGRATE(ke)) {
961134591Sjulian		CTR2(KTR_RUNQ,
962134591Sjulian		    "sched_add: adding kse:%p (td:%p) to gbl runq", ke, td);
963134591Sjulian		cpu = NOCPU;
964124955Sjeff		ke->ke_runq = &runq;
965124955Sjeff	} else {
966124955Sjeff		if (!SKE_RUNQ_PCPU(ke))
967134591Sjulian			ke->ke_runq = &runq_pcpu[(cpu = PCPU_GET(cpuid))];
968134591Sjulian		else
969134591Sjulian			cpu = td->td_lastcpu;
970134591Sjulian		CTR3(KTR_RUNQ,
971134591Sjulian		    "sched_add: Put kse:%p(td:%p) on cpu%d runq", ke, td, cpu);
972124955Sjeff	}
973124955Sjeff#else
974133396Sjulian	CTR2(KTR_RUNQ, "sched_add: adding kse:%p (td:%p) to runq", ke, td);
975124955Sjeff	ke->ke_runq = &runq;
976134591Sjulian
977124955Sjeff#endif
978134591Sjulian	/*
979134591Sjulian	 * If we are yielding (on the way out anyhow)
980134591Sjulian	 * or the thread being saved is US,
981134591Sjulian	 * then don't try to be smart about preemption
982134591Sjulian	 * or kicking off another CPU,
983134591Sjulian	 * as it won't help and may hinder.
984134591Sjulian	 * In the YIELDING case, we are about to run whoever is
985134591Sjulian	 * being put in the queue anyhow, and in the
986134591Sjulian	 * OURSELF case, we are putting ourselves on the run queue
987134591Sjulian	 * which also only happens when we are about to yield.
988134591Sjulian	 */
989134591Sjulian	if ((flags & SRQ_YIELDING) == 0) {
990134591Sjulian#ifdef SMP
991134591Sjulian		cpumask_t me = PCPU_GET(cpumask);
992134591Sjulian		int idle = idle_cpus_mask & me;
993134591Sjulian		/*
994134591Sjulian		 * Only try to kick off another CPU if
995134591Sjulian		 * the thread is unpinned
996134591Sjulian		 * or pinned to another cpu,
997134591Sjulian		 * and there are other available and idle CPUs.
998134837Sjulian		 * if we are idle, or it's an interrupt,
999134837Sjulian		 * then skip straight to preemption.
1000134591Sjulian		 */
1001134837Sjulian		if ( (! idle) && ((flags & SRQ_INTR) == 0) &&
1002134591Sjulian		    (idle_cpus_mask & ~(hlt_cpus_mask | me)) &&
1003134591Sjulian		    ( KSE_CAN_MIGRATE(ke) ||
1004134591Sjulian		      ke->ke_runq != &runq_pcpu[PCPU_GET(cpuid)])) {
1005134591Sjulian			forwarded = forward_wakeup(cpu);
1006134591Sjulian		}
1007134591Sjulian		/*
1008134591Sjulian		 * If we failed to kick off another cpu, then look to
1009134591Sjulian		 * see if we should preempt this CPU. Only allow this
1010134591Sjulian		 * If we are the idle thread, we also try to preempt,
1011134591Sjulian		 * as it will be quicker and, being idle, we won't
1012134591Sjulian		 * lose by doing so.
1013134591Sjulian		 * lose in doing so..
1014134591Sjulian		 */
1015134591Sjulian		if ((!forwarded) &&
1016134591Sjulian		    (ke->ke_runq == &runq ||
1017134591Sjulian		     ke->ke_runq == &runq_pcpu[PCPU_GET(cpuid)]))
1018134591Sjulian#endif
1019134591Sjulian
1020134591Sjulian		{
1021134591Sjulian			if (maybe_preempt(td))
1022134591Sjulian				return;
1023134591Sjulian		}
1024134591Sjulian	}
1025125295Sjeff	if ((td->td_proc->p_flag & P_NOLOAD) == 0)
1026125288Sjeff		sched_tdcnt++;
1027124955Sjeff	runq_add(ke->ke_runq, ke);
1028133520Sjulian	ke->ke_ksegrp->kg_runq_kses++;
1029133520Sjulian	ke->ke_state = KES_ONRUNQ;
1030132118Sjhb	maybe_resched(td);
1031104964Sjeff}
1032104964Sjeff
1033104964Sjeffvoid
1034121127Sjeffsched_rem(struct thread *td)
1035104964Sjeff{
1036121127Sjeff	struct kse *ke;
1037121127Sjeff
1038121127Sjeff	ke = td->td_kse;
1039104964Sjeff	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
1040124957Sjeff	    ("sched_rem: process swapped out"));
1041124957Sjeff	KASSERT((ke->ke_state == KES_ONRUNQ),
1042124957Sjeff	    ("sched_rem: KSE not on run queue"));
1043104964Sjeff	mtx_assert(&sched_lock, MA_OWNED);
1044104964Sjeff
1045125295Sjeff	if ((td->td_proc->p_flag & P_NOLOAD) == 0)
1046125288Sjeff		sched_tdcnt--;
1047134145Sjulian	runq_remove(ke->ke_runq, ke);
1048124955Sjeff
1049104964Sjeff	ke->ke_state = KES_THREAD;
1050104964Sjeff	ke->ke_ksegrp->kg_runq_kses--;
1051104964Sjeff}
1052104964Sjeff
1053104964Sjeffstruct kse *
1054104964Sjeffsched_choose(void)
1055104964Sjeff{
1056104964Sjeff	struct kse *ke;
1057124955Sjeff	struct runq *rq;
1058104964Sjeff
1059124955Sjeff#ifdef SMP
1060124955Sjeff	struct kse *kecpu;
1061124955Sjeff
1062124955Sjeff	rq = &runq;
1063104964Sjeff	ke = runq_choose(&runq);
1064124955Sjeff	kecpu = runq_choose(&runq_pcpu[PCPU_GET(cpuid)]);
1065104964Sjeff
1066124955Sjeff	if (ke == NULL ||
1067124955Sjeff	    (kecpu != NULL &&
1068124955Sjeff	     kecpu->ke_thread->td_priority < ke->ke_thread->td_priority)) {
1069133396Sjulian		CTR2(KTR_RUNQ, "choosing kse %p from pcpu runq %d", kecpu,
1070124955Sjeff		     PCPU_GET(cpuid));
1071124955Sjeff		ke = kecpu;
1072124955Sjeff		rq = &runq_pcpu[PCPU_GET(cpuid)];
1073124955Sjeff	} else {
1074133396Sjulian		CTR1(KTR_RUNQ, "choosing kse %p from main runq", ke);
1075124955Sjeff	}
1076124955Sjeff
1077124955Sjeff#else
1078124955Sjeff	rq = &runq;
1079124955Sjeff	ke = runq_choose(&runq);
1080124955Sjeff#endif
1081124955Sjeff
1082104964Sjeff	if (ke != NULL) {
1083124955Sjeff		runq_remove(rq, ke);
1084104964Sjeff		ke->ke_state = KES_THREAD;
1085133520Sjulian		ke->ke_ksegrp->kg_runq_kses--;
1086104964Sjeff
1087104964Sjeff		KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
1088124957Sjeff		    ("sched_choose: process swapped out"));
1089104964Sjeff	}
1090104964Sjeff	return (ke);
1091104964Sjeff}
1092104964Sjeff
1093104964Sjeffvoid
1094104964Sjeffsched_userret(struct thread *td)
1095104964Sjeff{
1096104964Sjeff	struct ksegrp *kg;
1097104964Sjeff	/*
1098104964Sjeff	 * XXX we cheat slightly on the locking here to avoid locking in
1099104964Sjeff	 * the usual case.  Setting td_priority here is essentially an
1100104964Sjeff	 * incomplete workaround for not setting it properly elsewhere.
1101104964Sjeff	 * Now that some interrupt handlers are threads, not setting it
1102104964Sjeff	 * properly elsewhere can clobber it in the window between setting
1103104964Sjeff	 * it here and returning to user mode, so don't waste time setting
1104104964Sjeff	 * it perfectly here.
1105104964Sjeff	 */
1106104964Sjeff	kg = td->td_ksegrp;
1107104964Sjeff	if (td->td_priority != kg->kg_user_pri) {
1108104964Sjeff		mtx_lock_spin(&sched_lock);
1109104964Sjeff		td->td_priority = kg->kg_user_pri;
1110104964Sjeff		mtx_unlock_spin(&sched_lock);
1111104964Sjeff	}
1112104964Sjeff}
1113107126Sjeff
1114124955Sjeffvoid
1115124955Sjeffsched_bind(struct thread *td, int cpu)
1116124955Sjeff{
1117124955Sjeff	struct kse *ke;
1118124955Sjeff
1119124955Sjeff	mtx_assert(&sched_lock, MA_OWNED);
1120124955Sjeff	KASSERT(TD_IS_RUNNING(td),
1121124955Sjeff	    ("sched_bind: cannot bind non-running thread"));
1122124955Sjeff
1123124955Sjeff	ke = td->td_kse;
1124124955Sjeff
1125124955Sjeff	ke->ke_flags |= KEF_BOUND;
1126124955Sjeff#ifdef SMP
1127124955Sjeff	ke->ke_runq = &runq_pcpu[cpu];
1128124955Sjeff	if (PCPU_GET(cpuid) == cpu)
1129124955Sjeff		return;
1130124955Sjeff
1131124955Sjeff	ke->ke_state = KES_THREAD;
1132124955Sjeff
1133131473Sjhb	mi_switch(SW_VOL, NULL);
1134124955Sjeff#endif
1135124955Sjeff}
1136124955Sjeff
1137124955Sjeffvoid
1138124955Sjeffsched_unbind(struct thread* td)
1139124955Sjeff{
1140124955Sjeff	mtx_assert(&sched_lock, MA_OWNED);
1141124955Sjeff	td->td_kse->ke_flags &= ~KEF_BOUND;
1142124955Sjeff}
1143124955Sjeff
1144107126Sjeffint
1145125288Sjeffsched_load(void)
1146125288Sjeff{
1147125288Sjeff	return (sched_tdcnt);
1148125288Sjeff}
1149125288Sjeff
1150125288Sjeffint
1151107126Sjeffsched_sizeof_ksegrp(void)
1152107126Sjeff{
1153134791Sjulian	return (sizeof(struct ksegrp) + sizeof(struct kg_sched));
1154107126Sjeff}
1155107126Sjeffint
1156107126Sjeffsched_sizeof_proc(void)
1157107126Sjeff{
1158107126Sjeff	return (sizeof(struct proc));
1159107126Sjeff}
1160107126Sjeffint
1161107126Sjeffsched_sizeof_thread(void)
1162107126Sjeff{
1163134791Sjulian	return (sizeof(struct thread) + sizeof(struct kse));
1164107126Sjeff}
1165107137Sjeff
1166107137Sjefffixpt_t
1167121127Sjeffsched_pctcpu(struct thread *td)
1168107137Sjeff{
1169121147Sjeff	struct kse *ke;
1170121147Sjeff
1171121147Sjeff	ke = td->td_kse;
1172134791Sjulian	return (ke->ke_pctcpu);
1175107137Sjeff}
1176134791Sjulian#define KERN_SWITCH_INCLUDE 1
1177134791Sjulian#include "kern/kern_switch.c"