sched_4bsd.c revision 135295
1104964Sjeff/*-
2104964Sjeff * Copyright (c) 1982, 1986, 1990, 1991, 1993
3104964Sjeff *	The Regents of the University of California.  All rights reserved.
4104964Sjeff * (c) UNIX System Laboratories, Inc.
5104964Sjeff * All or some portions of this file are derived from material licensed
6104964Sjeff * to the University of California by American Telephone and Telegraph
7104964Sjeff * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8104964Sjeff * the permission of UNIX System Laboratories, Inc.
9104964Sjeff *
10104964Sjeff * Redistribution and use in source and binary forms, with or without
11104964Sjeff * modification, are permitted provided that the following conditions
12104964Sjeff * are met:
13104964Sjeff * 1. Redistributions of source code must retain the above copyright
14104964Sjeff *    notice, this list of conditions and the following disclaimer.
15104964Sjeff * 2. Redistributions in binary form must reproduce the above copyright
16104964Sjeff *    notice, this list of conditions and the following disclaimer in the
17104964Sjeff *    documentation and/or other materials provided with the distribution.
18104964Sjeff * 4. Neither the name of the University nor the names of its contributors
19104964Sjeff *    may be used to endorse or promote products derived from this software
20104964Sjeff *    without specific prior written permission.
21104964Sjeff *
22104964Sjeff * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23104964Sjeff * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24104964Sjeff * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25104964Sjeff * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26104964Sjeff * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27104964Sjeff * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28104964Sjeff * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29104964Sjeff * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30104964Sjeff * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31104964Sjeff * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32104964Sjeff * SUCH DAMAGE.
33104964Sjeff */
34104964Sjeff
35116182Sobrien#include <sys/cdefs.h>
36116182Sobrien__FBSDID("$FreeBSD: head/sys/kern/sched_4bsd.c 135295 2004-09-16 07:12:59Z julian $");
37116182Sobrien
38134791Sjulian#define kse td_sched
39134791Sjulian
40104964Sjeff#include <sys/param.h>
41104964Sjeff#include <sys/systm.h>
42104964Sjeff#include <sys/kernel.h>
43104964Sjeff#include <sys/ktr.h>
44104964Sjeff#include <sys/lock.h>
45123871Sjhb#include <sys/kthread.h>
46104964Sjeff#include <sys/mutex.h>
47104964Sjeff#include <sys/proc.h>
48104964Sjeff#include <sys/resourcevar.h>
49104964Sjeff#include <sys/sched.h>
50104964Sjeff#include <sys/smp.h>
51104964Sjeff#include <sys/sysctl.h>
52104964Sjeff#include <sys/sx.h>
53134689Sjulian#include <machine/smp.h>
54104964Sjeff
55107135Sjeff/*
56107135Sjeff * INVERSE_ESTCPU_WEIGHT is only suitable for statclock() frequencies in
57107135Sjeff * the range 100-256 Hz (approximately).
58107135Sjeff */
59107135Sjeff#define	ESTCPULIM(e) \
60107135Sjeff    min((e), INVERSE_ESTCPU_WEIGHT * (NICE_WEIGHT * (PRIO_MAX - PRIO_MIN) - \
61107135Sjeff    RQ_PPQ) + INVERSE_ESTCPU_WEIGHT - 1)
62122355Sbde#ifdef SMP
63122355Sbde#define	INVERSE_ESTCPU_WEIGHT	(8 * smp_cpus)
64122355Sbde#else
65107135Sjeff#define	INVERSE_ESTCPU_WEIGHT	8	/* 1 / (priorities per estcpu level). */
66122355Sbde#endif
67107135Sjeff#define	NICE_WEIGHT		1	/* Priorities per nice level. */
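
/*
 * For example (a rough check, assuming RQ_PPQ == 4 from <sys/runq.h>):
 * on a UP kernel the limit above works out to
 * 8 * (1 * (PRIO_MAX - PRIO_MIN) - RQ_PPQ) + 8 - 1 == 8 * 36 + 7 == 295,
 * so kg_estcpu saturates after contributing 36 priority steps.
 */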
68107135Sjeff
69134791Sjulian/*
70134791Sjulian * The schedulable entity that can be given a context to run.
71134791Sjulian * A process may have several of these. Probably one per processor
72134791Sjulian * but possibly a few more. In this universe they are grouped
73134791Sjulian * with a KSEGRP that contains the priority and niceness
74134791Sjulian * for the group.
75134791Sjulian */
76134791Sjulianstruct kse {
77134791Sjulian	TAILQ_ENTRY(kse) ke_kglist;	/* (*) Queue of KSEs in ke_ksegrp. */
78134791Sjulian	TAILQ_ENTRY(kse) ke_kgrlist;	/* (*) Queue of KSEs in this state. */
79134791Sjulian	TAILQ_ENTRY(kse) ke_procq;	/* (j/z) Run queue. */
80134791Sjulian	struct thread	*ke_thread;	/* (*) Active associated thread. */
81134791Sjulian	fixpt_t		ke_pctcpu;	/* (j) %cpu during p_swtime. */
82134791Sjulian	u_char		ke_oncpu;	/* (j) Which cpu we are on. */
83134791Sjulian	char		ke_rqindex;	/* (j) Run queue index. */
84134791Sjulian	enum {
85134791Sjulian		KES_THREAD = 0x0,	/* slaved to thread state */
86134791Sjulian		KES_ONRUNQ
87134791Sjulian	} ke_state;			/* (j) KSE status. */
88134791Sjulian	int		ke_cpticks;	/* (j) Ticks of cpu time. */
89134791Sjulian	struct runq	*ke_runq;	/* runq the kse is currently on */
90109145Sjeff};
91109145Sjeff
92134791Sjulian#define ke_proc		ke_thread->td_proc
93134791Sjulian#define ke_ksegrp	ke_thread->td_ksegrp
94134791Sjulian
95134791Sjulian#define td_kse td_sched
96134791Sjulian
97134791Sjulian/* flags kept in td_flags */
98134791Sjulian#define TDF_DIDRUN	TDF_SCHED0	/* KSE actually ran. */
99134791Sjulian#define TDF_EXIT	TDF_SCHED1	/* KSE is being killed. */
100134791Sjulian#define TDF_BOUND	TDF_SCHED2
101134791Sjulian
102134791Sjulian#define ke_flags	ke_thread->td_flags
103134791Sjulian#define KEF_DIDRUN	TDF_DIDRUN /* KSE actually ran. */
104134791Sjulian#define KEF_EXIT	TDF_EXIT /* KSE is being killed. */
105134791Sjulian#define KEF_BOUND	TDF_BOUND /* stuck to one CPU */
106134791Sjulian
107124955Sjeff#define SKE_RUNQ_PCPU(ke)						\
108124955Sjeff    ((ke)->ke_runq != 0 && (ke)->ke_runq != &runq)
109124955Sjeff
110134791Sjulianstruct kg_sched {
111134791Sjulian	struct thread	*skg_last_assigned; /* (j) Last thread assigned to */
112134791Sjulian					   /* the system scheduler. */
113134791Sjulian	int	skg_avail_opennings;	/* (j) Num unfilled slots in group. */
114134791Sjulian	int	skg_concurrency;	/* (j) Num threads requested in group. */
115134791Sjulian	int	skg_runq_kses;		/* (j) Num KSEs on runq. */
116134791Sjulian};
117134791Sjulian#define kg_last_assigned	kg_sched->skg_last_assigned
118134791Sjulian#define kg_avail_opennings	kg_sched->skg_avail_opennings
119134791Sjulian#define kg_concurrency		kg_sched->skg_concurrency
120134791Sjulian#define kg_runq_kses		kg_sched->skg_runq_kses
121134791Sjulian
122124955Sjeff/*
123124955Sjeff * KSE_CAN_MIGRATE macro returns true if the kse can migrate between
124125295Sjeff * cpus.
125124955Sjeff */
126124955Sjeff#define KSE_CAN_MIGRATE(ke)						\
127135076Sscottl    ((ke)->ke_thread->td_pinned == 0 && ((ke)->ke_flags & KEF_BOUND) == 0)
128109145Sjeff
129134791Sjulianstatic struct kse kse0;
130134791Sjulianstatic struct kg_sched kg_sched0;
131104964Sjeff
132125288Sjeffstatic int	sched_tdcnt;	/* Total runnable threads in the system. */
133104964Sjeffstatic int	sched_quantum;	/* Roundrobin scheduling quantum in ticks. */
134112535Smux#define	SCHED_QUANTUM	(hz / 10)	/* Default sched quantum */
135104964Sjeff
136104964Sjeffstatic struct callout roundrobin_callout;
137104964Sjeff
138134791Sjulianstatic void	slot_fill(struct ksegrp *kg);
139134791Sjulianstatic struct kse *sched_choose(void);		/* XXX Should be thread * */
140134791Sjulian
141124955Sjeffstatic void	setup_runqs(void);
142104964Sjeffstatic void	roundrobin(void *arg);
143123871Sjhbstatic void	schedcpu(void);
144124955Sjeffstatic void	schedcpu_thread(void);
145104964Sjeffstatic void	sched_setup(void *dummy);
146104964Sjeffstatic void	maybe_resched(struct thread *td);
147104964Sjeffstatic void	updatepri(struct ksegrp *kg);
148104964Sjeffstatic void	resetpriority(struct ksegrp *kg);
149134694Sjulian#ifdef SMP
150134688Sjulianstatic int	forward_wakeup(int  cpunum);
151134694Sjulian#endif
152104964Sjeff
153124955Sjeffstatic struct kproc_desc sched_kp = {
154124955Sjeff        "schedcpu",
155124955Sjeff        schedcpu_thread,
156124955Sjeff        NULL
157124955Sjeff};
158124955SjeffSYSINIT(schedcpu, SI_SUB_RUN_SCHEDULER, SI_ORDER_FIRST, kproc_start, &sched_kp)
159124955SjeffSYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL)
160104964Sjeff
161104964Sjeff/*
162104964Sjeff * Global run queue.
163104964Sjeff */
164104964Sjeffstatic struct runq runq;
165104964Sjeff
166124955Sjeff#ifdef SMP
167124955Sjeff/*
168124955Sjeff * Per-CPU run queues
169124955Sjeff */
170124955Sjeffstatic struct runq runq_pcpu[MAXCPU];
171124955Sjeff#endif
172124955Sjeff
173124955Sjeffstatic void
174124955Sjeffsetup_runqs(void)
175124955Sjeff{
176124955Sjeff#ifdef SMP
177124955Sjeff	int i;
178124955Sjeff
179124955Sjeff	for (i = 0; i < MAXCPU; ++i)
180124955Sjeff		runq_init(&runq_pcpu[i]);
181124955Sjeff#endif
182124955Sjeff
183124955Sjeff	runq_init(&runq);
184124955Sjeff}
185124955Sjeff
186104964Sjeffstatic int
187104964Sjeffsysctl_kern_quantum(SYSCTL_HANDLER_ARGS)
188104964Sjeff{
189104964Sjeff	int error, new_val;
190104964Sjeff
191104964Sjeff	new_val = sched_quantum * tick;
192104964Sjeff	error = sysctl_handle_int(oidp, &new_val, 0, req);
193104964Sjeff        if (error != 0 || req->newptr == NULL)
194104964Sjeff		return (error);
195104964Sjeff	if (new_val < tick)
196104964Sjeff		return (EINVAL);
197104964Sjeff	sched_quantum = new_val / tick;
198104964Sjeff	hogticks = 2 * sched_quantum;
199104964Sjeff	return (0);
200104964Sjeff}
201104964Sjeff
202132589SscottlSYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RD, 0, "Scheduler");
203130881Sscottl
204132589SscottlSYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "4BSD", 0,
205132589Sscottl    "Scheduler name");
206130881Sscottl
207132589SscottlSYSCTL_PROC(_kern_sched, OID_AUTO, quantum, CTLTYPE_INT | CTLFLAG_RW,
208132589Sscottl    0, sizeof sched_quantum, sysctl_kern_quantum, "I",
209132589Sscottl    "Roundrobin scheduling quantum in microseconds");
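
/*
 * The value is exported in microseconds: e.g. with hz == 100 (tick == 10000)
 * the default quantum of hz / 10 ticks reads back as 100000 (100ms), and
 * "sysctl kern.sched.quantum=20000" would select a two-tick (20ms) quantum.
 */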
210104964Sjeff
211134693Sjulian#ifdef SMP
212134688Sjulian/* Enable forwarding of wakeups to all other cpus */
213134688SjulianSYSCTL_NODE(_kern_sched, OID_AUTO, ipiwakeup, CTLFLAG_RD, NULL, "Kernel SMP");
214134688Sjulian
215134792Sjulianstatic int forward_wakeup_enabled = 1;
216134688SjulianSYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, enabled, CTLFLAG_RW,
217134688Sjulian	   &forward_wakeup_enabled, 0,
218134688Sjulian	   "Forwarding of wakeup to idle CPUs");
219134688Sjulian
220134688Sjulianstatic int forward_wakeups_requested = 0;
221134688SjulianSYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, requested, CTLFLAG_RD,
222134688Sjulian	   &forward_wakeups_requested, 0,
223134688Sjulian	   "Requests for Forwarding of wakeup to idle CPUs");
224134688Sjulian
225134688Sjulianstatic int forward_wakeups_delivered = 0;
226134688SjulianSYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, delivered, CTLFLAG_RD,
227134688Sjulian	   &forward_wakeups_delivered, 0,
228134688Sjulian	   "Completed Forwarding of wakeup to idle CPUs");
229134688Sjulian
230134792Sjulianstatic int forward_wakeup_use_mask = 1;
231134688SjulianSYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, usemask, CTLFLAG_RW,
232134688Sjulian	   &forward_wakeup_use_mask, 0,
233134688Sjulian	   "Use the mask of idle cpus");
234134688Sjulian
235134688Sjulianstatic int forward_wakeup_use_loop = 0;
236134688SjulianSYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, useloop, CTLFLAG_RW,
237134688Sjulian	   &forward_wakeup_use_loop, 0,
238134688Sjulian	   "Use a loop to find idle cpus");
239134688Sjulian
240134688Sjulianstatic int forward_wakeup_use_single = 0;
241134688SjulianSYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, onecpu, CTLFLAG_RW,
242134688Sjulian	   &forward_wakeup_use_single, 0,
243134688Sjulian	   "Only signal one idle cpu");
244134688Sjulian
245134688Sjulianstatic int forward_wakeup_use_htt = 0;
246134688SjulianSYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, htt2, CTLFLAG_RW,
247134688Sjulian	   &forward_wakeup_use_htt, 0,
248134688Sjulian	   "account for htt");
249135051Sjulian
250134693Sjulian#endif
251135051Sjulianstatic int sched_followon = 0;
252135051SjulianSYSCTL_INT(_kern_sched, OID_AUTO, followon, CTLFLAG_RW,
253135051Sjulian	   &sched_followon, 0,
254135051Sjulian	   "allow threads to share a quantum");
255134688Sjulian
256135051Sjulianstatic int sched_pfollowons = 0;
257135051SjulianSYSCTL_INT(_kern_sched, OID_AUTO, pfollowons, CTLFLAG_RD,
258135051Sjulian	   &sched_pfollowons, 0,
259135051Sjulian	   "number of followons done to a different ksegrp");
260135051Sjulian
261135051Sjulianstatic int sched_kgfollowons = 0;
262135051SjulianSYSCTL_INT(_kern_sched, OID_AUTO, kgfollowons, CTLFLAG_RD,
263135051Sjulian	   &sched_kgfollowons, 0,
264135051Sjulian	   "number of followons done in a ksegrp");
265135051Sjulian
266104964Sjeff/*
267104964Sjeff * Arrange to reschedule if necessary, taking the priorities and
268104964Sjeff * schedulers into account.
269104964Sjeff */
270104964Sjeffstatic void
271104964Sjeffmaybe_resched(struct thread *td)
272104964Sjeff{
273104964Sjeff
274104964Sjeff	mtx_assert(&sched_lock, MA_OWNED);
275134791Sjulian	if (td->td_priority < curthread->td_priority)
276111032Sjulian		curthread->td_flags |= TDF_NEEDRESCHED;
277104964Sjeff}
278104964Sjeff
279104964Sjeff/*
280104964Sjeff * Force switch among equal priority processes every 100ms.
281104964Sjeff * We don't actually need to force a context switch of the current process.
282104964Sjeff * The act of firing the event triggers a context switch to softclock() and
283104964Sjeff * then switching back out again which is equivalent to a preemption, thus
284104964Sjeff * no further work is needed on the local CPU.
285104964Sjeff */
286104964Sjeff/* ARGSUSED */
287104964Sjeffstatic void
288104964Sjeffroundrobin(void *arg)
289104964Sjeff{
290104964Sjeff
291104964Sjeff#ifdef SMP
292104964Sjeff	mtx_lock_spin(&sched_lock);
293104964Sjeff	forward_roundrobin();
294104964Sjeff	mtx_unlock_spin(&sched_lock);
295104964Sjeff#endif
296104964Sjeff
297104964Sjeff	callout_reset(&roundrobin_callout, sched_quantum, roundrobin, NULL);
298104964Sjeff}
299104964Sjeff
300104964Sjeff/*
301104964Sjeff * Constants for digital decay and forget:
302118972Sjhb *	90% of (kg_estcpu) usage in 5 * loadav time
303118972Sjhb *	95% of (ke_pctcpu) usage in 60 seconds (load insensitive)
304104964Sjeff *          Note that, as ps(1) mentions, this can let percentages
305104964Sjeff *          total over 100% (I've seen 137.9% for 3 processes).
306104964Sjeff *
307118972Sjhb * Note that schedclock() updates kg_estcpu and p_cpticks asynchronously.
308104964Sjeff *
309118972Sjhb * We wish to decay away 90% of kg_estcpu in (5 * loadavg) seconds.
310104964Sjeff * That is, the system wants to compute a value of decay such
311104964Sjeff * that the following for loop:
312104964Sjeff * 	for (i = 0; i < (5 * loadavg); i++)
313118972Sjhb * 		kg_estcpu *= decay;
314104964Sjeff * will compute
315118972Sjhb * 	kg_estcpu *= 0.1;
316104964Sjeff * for all values of loadavg:
317104964Sjeff *
318104964Sjeff * Mathematically this loop can be expressed by saying:
319104964Sjeff * 	decay ** (5 * loadavg) ~= .1
320104964Sjeff *
321104964Sjeff * The system computes decay as:
322104964Sjeff * 	decay = (2 * loadavg) / (2 * loadavg + 1)
323104964Sjeff *
324104964Sjeff * We wish to prove that the system's computation of decay
325104964Sjeff * will always fulfill the equation:
326104964Sjeff * 	decay ** (5 * loadavg) ~= .1
327104964Sjeff *
328104964Sjeff * If we compute b as:
329104964Sjeff * 	b = 2 * loadavg
330104964Sjeff * then
331104964Sjeff * 	decay = b / (b + 1)
332104964Sjeff *
333104964Sjeff * We now need to prove two things:
334104964Sjeff *	1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
335104964Sjeff *	2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
336104964Sjeff *
337104964Sjeff * Facts:
338104964Sjeff *         For x close to zero, exp(x) =~ 1 + x, since
339104964Sjeff *              exp(x) = 0! + x**1/1! + x**2/2! + ... .
340104964Sjeff *              therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
341104964Sjeff *         For x close to zero, ln(1+x) =~ x, since
342104964Sjeff *              ln(1+x) = x - x**2/2 + x**3/3 - ...     -1 < x < 1
343104964Sjeff *              therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
344104964Sjeff *         ln(.1) =~ -2.30
345104964Sjeff *
346104964Sjeff * Proof of (1):
347104964Sjeff *    Solve (factor)**(power) =~ .1 given power (5*loadav):
348104964Sjeff *	solving for factor,
349104964Sjeff *      ln(factor) =~ (-2.30/5*loadav), or
350104964Sjeff *      factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
351104964Sjeff *          exp(-1/b) =~ (b-1)/b =~ b/(b+1).                    QED
352104964Sjeff *
353104964Sjeff * Proof of (2):
354104964Sjeff *    Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
355104964Sjeff *	solving for power,
356104964Sjeff *      power*ln(b/(b+1)) =~ -2.30, or
357104964Sjeff *      power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.  QED
358104964Sjeff *
359104964Sjeff * Actual power values for the implemented algorithm are as follows:
360104964Sjeff *      loadav: 1       2       3       4
361104964Sjeff *      power:  5.68    10.32   14.94   19.55
362104964Sjeff */
363104964Sjeff
364104964Sjeff/* calculations for digital decay to forget 90% of usage in 5*loadav sec */
365104964Sjeff#define	loadfactor(loadav)	(2 * (loadav))
366104964Sjeff#define	decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))
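
/*
 * For example, with a load average of 2, loadfactor() is 4 (in FSCALE units)
 * and decay_cpu() scales by 4/5 per second; (4/5)**10 =~ 0.107, so after
 * 5 * loadavg == 10 seconds roughly 90% of kg_estcpu has been forgotten,
 * matching the power of 10.32 tabulated above.
 */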
367104964Sjeff
368118972Sjhb/* decay 95% of `ke_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
369104964Sjeffstatic fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;	/* exp(-1/20) */
370104964SjeffSYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
371104964Sjeff
372104964Sjeff/*
373104964Sjeff * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
374104964Sjeff * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
375104964Sjeff * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
376104964Sjeff *
377104964Sjeff * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
378104964Sjeff *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
379104964Sjeff *
380104964Sjeff * If you don't want to bother with the faster/more-accurate formula, you
381104964Sjeff * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
382104964Sjeff * (more general) method of calculating the %age of CPU used by a process.
383104964Sjeff */
384104964Sjeff#define	CCPU_SHIFT	11
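
/*
 * For example: schedcpu() multiplies ke_pctcpu by ccpu == exp(-1/20) once a
 * second, so after 60 seconds an idle kse retains exp(-3) =~ 5% of its old
 * ke_pctcpu, i.e. 95% has decayed as stated above.
 */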
385104964Sjeff
386104964Sjeff/*
387104964Sjeff * Recompute process priorities, every hz ticks.
388104964Sjeff * MP-safe, called without the Giant mutex.
389104964Sjeff */
390104964Sjeff/* ARGSUSED */
391104964Sjeffstatic void
392123871Sjhbschedcpu(void)
393104964Sjeff{
394104964Sjeff	register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
395104964Sjeff	struct thread *td;
396104964Sjeff	struct proc *p;
397104964Sjeff	struct kse *ke;
398104964Sjeff	struct ksegrp *kg;
399118972Sjhb	int awake, realstathz;
400104964Sjeff
401104964Sjeff	realstathz = stathz ? stathz : hz;
402104964Sjeff	sx_slock(&allproc_lock);
403104964Sjeff	FOREACH_PROC_IN_SYSTEM(p) {
404118972Sjhb		/*
405118972Sjhb		 * Prevent state changes and protect run queue.
406118972Sjhb		 */
407104964Sjeff		mtx_lock_spin(&sched_lock);
408118972Sjhb		/*
409118972Sjhb		 * Increment time in/out of memory.  We ignore overflow; with
410118972Sjhb		 * 16-bit int's (remember them?) overflow takes 45 days.
411118972Sjhb		 */
412104964Sjeff		p->p_swtime++;
413104964Sjeff		FOREACH_KSEGRP_IN_PROC(p, kg) {
414104964Sjeff			awake = 0;
415134791Sjulian			FOREACH_THREAD_IN_GROUP(kg, td) {
416134791Sjulian				ke = td->td_kse;
417104964Sjeff				/*
418118972Sjhb				 * Increment sleep time (if sleeping).  We
419118972Sjhb				 * ignore overflow, as above.
420104964Sjeff				 */
421104964Sjeff				/*
422104964Sjeff				 * The kse slptimes are not touched in wakeup
423104964Sjeff				 * because the thread may not HAVE a KSE.
424104964Sjeff				 */
425104964Sjeff				if (ke->ke_state == KES_ONRUNQ) {
426104964Sjeff					awake = 1;
427104964Sjeff					ke->ke_flags &= ~KEF_DIDRUN;
428104964Sjeff				} else if ((ke->ke_state == KES_THREAD) &&
429134791Sjulian				    (TD_IS_RUNNING(td))) {
430104964Sjeff					awake = 1;
431104964Sjeff					/* Do not clear KEF_DIDRUN */
432104964Sjeff				} else if (ke->ke_flags & KEF_DIDRUN) {
433104964Sjeff					awake = 1;
434104964Sjeff					ke->ke_flags &= ~KEF_DIDRUN;
435104964Sjeff				}
436104964Sjeff
437104964Sjeff				/*
438118972Sjhb				 * ke_pctcpu is only for ps and ttyinfo().
439118972Sjhb				 * Do it per kse, and add them up at the end?
440104964Sjeff				 * XXXKSE
441104964Sjeff				 */
442118972Sjhb				ke->ke_pctcpu = (ke->ke_pctcpu * ccpu) >>
443109145Sjeff				    FSHIFT;
444104964Sjeff				/*
445104964Sjeff				 * If the kse has been idle the entire second,
446104964Sjeff				 * stop recalculating its priority until
447104964Sjeff				 * it wakes up.
448104964Sjeff				 */
449134145Sjulian				if (ke->ke_cpticks == 0)
450104964Sjeff					continue;
451104964Sjeff#if	(FSHIFT >= CCPU_SHIFT)
452109157Sjeff				ke->ke_pctcpu += (realstathz == 100)
453134145Sjulian				    ? ((fixpt_t) ke->ke_cpticks) <<
454104964Sjeff				    (FSHIFT - CCPU_SHIFT) :
455134145Sjulian				    100 * (((fixpt_t) ke->ke_cpticks)
456109145Sjeff				    << (FSHIFT - CCPU_SHIFT)) / realstathz;
457104964Sjeff#else
458109157Sjeff				ke->ke_pctcpu += ((FSCALE - ccpu) *
459134145Sjulian				    (ke->ke_cpticks *
460109145Sjeff				    FSCALE / realstathz)) >> FSHIFT;
461104964Sjeff#endif
462134145Sjulian				ke->ke_cpticks = 0;
463104964Sjeff			} /* end of kse loop */
464104964Sjeff			/*
465104964Sjeff			 * If there are ANY running threads in this KSEGRP,
466104964Sjeff			 * then don't count it as sleeping.
467104964Sjeff			 */
468104964Sjeff			if (awake) {
469104964Sjeff				if (kg->kg_slptime > 1) {
470104964Sjeff					/*
471104964Sjeff					 * In an ideal world, this should not
472104964Sjeff					 * happen, because whoever woke us
473104964Sjeff					 * up from the long sleep should have
474104964Sjeff					 * unwound the slptime and reset our
475104964Sjeff					 * priority before we run at the stale
476104964Sjeff					 * priority.  Should KASSERT at some
477104964Sjeff					 * point when all the cases are fixed.
478104964Sjeff					 */
479104964Sjeff					updatepri(kg);
480104964Sjeff				}
481104964Sjeff				kg->kg_slptime = 0;
482118972Sjhb			} else
483104964Sjeff				kg->kg_slptime++;
484104964Sjeff			if (kg->kg_slptime > 1)
485104964Sjeff				continue;
486104964Sjeff			kg->kg_estcpu = decay_cpu(loadfac, kg->kg_estcpu);
487104964Sjeff		      	resetpriority(kg);
488104964Sjeff			FOREACH_THREAD_IN_GROUP(kg, td) {
489104964Sjeff				if (td->td_priority >= PUSER) {
490105127Sjulian					sched_prio(td, kg->kg_user_pri);
491104964Sjeff				}
492104964Sjeff			}
493104964Sjeff		} /* end of ksegrp loop */
494104964Sjeff		mtx_unlock_spin(&sched_lock);
495104964Sjeff	} /* end of process loop */
496104964Sjeff	sx_sunlock(&allproc_lock);
497104964Sjeff}
498104964Sjeff
499104964Sjeff/*
500123871Sjhb * Main loop for a kthread that executes schedcpu once a second.
501123871Sjhb */
502123871Sjhbstatic void
503124955Sjeffschedcpu_thread(void)
504123871Sjhb{
505123871Sjhb	int nowake;
506123871Sjhb
507123871Sjhb	for (;;) {
508123871Sjhb		schedcpu();
509123871Sjhb		tsleep(&nowake, curthread->td_priority, "-", hz);
510123871Sjhb	}
511123871Sjhb}
512123871Sjhb
513123871Sjhb/*
514104964Sjeff * Recalculate the priority of a process after it has slept for a while.
515118972Sjhb * For all load averages >= 1 and max kg_estcpu of 255, sleeping for at
516118972Sjhb * least six times the loadfactor will decay kg_estcpu to zero.
517104964Sjeff */
518104964Sjeffstatic void
519104964Sjeffupdatepri(struct ksegrp *kg)
520104964Sjeff{
521118972Sjhb	register fixpt_t loadfac;
522104964Sjeff	register unsigned int newcpu;
523104964Sjeff
524118972Sjhb	loadfac = loadfactor(averunnable.ldavg[0]);
525104964Sjeff	if (kg->kg_slptime > 5 * loadfac)
526104964Sjeff		kg->kg_estcpu = 0;
527104964Sjeff	else {
528118972Sjhb		newcpu = kg->kg_estcpu;
529118972Sjhb		kg->kg_slptime--;	/* was incremented in schedcpu() */
530104964Sjeff		while (newcpu && --kg->kg_slptime)
531104964Sjeff			newcpu = decay_cpu(loadfac, newcpu);
532104964Sjeff		kg->kg_estcpu = newcpu;
533104964Sjeff	}
534104964Sjeff	resetpriority(kg);
535104964Sjeff}
536104964Sjeff
537104964Sjeff/*
538104964Sjeff * Compute the priority of a process when running in user mode.
539104964Sjeff * Arrange to reschedule if the resulting priority is better
540104964Sjeff * than that of the current process.
541104964Sjeff */
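/*
 * For example, on a UP kernel (INVERSE_ESTCPU_WEIGHT == 8) a timesharing
 * ksegrp with kg_estcpu == 40 and p_nice == 0 gets
 * PUSER + 40 / 8 + 1 * (0 - PRIO_MIN) == PUSER + 25 (PRIO_MIN is -20),
 * clamped to the PRI_MIN_TIMESHARE..PRI_MAX_TIMESHARE range; every extra
 * 8 points of estcpu or one step of nice costs one priority level.
 */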
542104964Sjeffstatic void
543104964Sjeffresetpriority(struct ksegrp *kg)
544104964Sjeff{
545104964Sjeff	register unsigned int newpriority;
546104964Sjeff	struct thread *td;
547104964Sjeff
548104964Sjeff	if (kg->kg_pri_class == PRI_TIMESHARE) {
549104964Sjeff		newpriority = PUSER + kg->kg_estcpu / INVERSE_ESTCPU_WEIGHT +
550130551Sjulian		    NICE_WEIGHT * (kg->kg_proc->p_nice - PRIO_MIN);
551104964Sjeff		newpriority = min(max(newpriority, PRI_MIN_TIMESHARE),
552104964Sjeff		    PRI_MAX_TIMESHARE);
553104964Sjeff		kg->kg_user_pri = newpriority;
554104964Sjeff	}
555104964Sjeff	FOREACH_THREAD_IN_GROUP(kg, td) {
556104964Sjeff		maybe_resched(td);			/* XXXKSE silly */
557104964Sjeff	}
558104964Sjeff}
559104964Sjeff
560104964Sjeff/* ARGSUSED */
561104964Sjeffstatic void
562104964Sjeffsched_setup(void *dummy)
563104964Sjeff{
564124955Sjeff	setup_runqs();
565118972Sjhb
566104964Sjeff	if (sched_quantum == 0)
567104964Sjeff		sched_quantum = SCHED_QUANTUM;
568104964Sjeff	hogticks = 2 * sched_quantum;
569104964Sjeff
570126665Srwatson	callout_init(&roundrobin_callout, CALLOUT_MPSAFE);
571104964Sjeff
572104964Sjeff	/* Kick off timeout driven events by calling first time. */
573104964Sjeff	roundrobin(NULL);
574125288Sjeff
575125288Sjeff	/* Account for thread0. */
576125288Sjeff	sched_tdcnt++;
577104964Sjeff}
578104964Sjeff
579104964Sjeff/* External interfaces start here */
580134791Sjulian/*
581134791Sjulian * Very early in the boot some setup of scheduler-specific
582134791Sjulian * parts of proc0 and of some scheduler resources needs to be done.
583134791Sjulian * Called from:
584134791Sjulian *  proc0_init()
585134791Sjulian */
586134791Sjulianvoid
587134791Sjulianschedinit(void)
588134791Sjulian{
589134791Sjulian	/*
590134791Sjulian	 * Set up the scheduler specific parts of proc0.
591134791Sjulian	 */
592134791Sjulian	proc0.p_sched = NULL; /* XXX */
593134791Sjulian	ksegrp0.kg_sched = &kg_sched0;
594134791Sjulian	thread0.td_sched = &kse0;
595134791Sjulian	kse0.ke_thread = &thread0;
596134791Sjulian	kse0.ke_oncpu = NOCPU; /* wrong.. can we use PCPU(cpuid) yet? */
597134791Sjulian	kse0.ke_state = KES_THREAD;
598134791Sjulian	kg_sched0.skg_concurrency = 1;
599134791Sjulian	kg_sched0.skg_avail_opennings = 0; /* we are already running */
600134791Sjulian}
601134791Sjulian
602104964Sjeffint
603104964Sjeffsched_runnable(void)
604104964Sjeff{
605124955Sjeff#ifdef SMP
606124955Sjeff	return runq_check(&runq) + runq_check(&runq_pcpu[PCPU_GET(cpuid)]);
607124955Sjeff#else
608124955Sjeff	return runq_check(&runq);
609124955Sjeff#endif
610104964Sjeff}
611104964Sjeff
612104964Sjeffint
613104964Sjeffsched_rr_interval(void)
614104964Sjeff{
615104964Sjeff	if (sched_quantum == 0)
616104964Sjeff		sched_quantum = SCHED_QUANTUM;
617104964Sjeff	return (sched_quantum);
618104964Sjeff}
619104964Sjeff
620104964Sjeff/*
621104964Sjeff * We adjust the priority of the current process.  The priority of
622104964Sjeff * a process gets worse as it accumulates CPU time.  The cpu usage
623118972Sjhb * estimator (kg_estcpu) is increased here.  resetpriority() will
624118972Sjhb * compute a different priority each time kg_estcpu increases by
625104964Sjeff * INVERSE_ESTCPU_WEIGHT
626104964Sjeff * (until MAXPRI is reached).  The cpu usage estimator ramps up
627104964Sjeff * quite quickly when the process is running (linearly), and decays
628104964Sjeff * away exponentially, at a rate which is proportionally slower when
629104964Sjeff * the system is busy.  The basic principle is that the system will
630104964Sjeff * 90% forget that the process used a lot of CPU time in 5 * loadav
631104964Sjeff * seconds.  This causes the system to favor processes which haven't
632104964Sjeff * run much recently, and to round-robin among other processes.
633104964Sjeff */
634104964Sjeffvoid
635121127Sjeffsched_clock(struct thread *td)
636104964Sjeff{
637104964Sjeff	struct ksegrp *kg;
638121127Sjeff	struct kse *ke;
639104964Sjeff
640113923Sjhb	mtx_assert(&sched_lock, MA_OWNED);
641121127Sjeff	kg = td->td_ksegrp;
642121127Sjeff	ke = td->td_kse;
643113356Sjeff
644134145Sjulian	ke->ke_cpticks++;
645104964Sjeff	kg->kg_estcpu = ESTCPULIM(kg->kg_estcpu + 1);
646104964Sjeff	if ((kg->kg_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) {
647104964Sjeff		resetpriority(kg);
648104964Sjeff		if (td->td_priority >= PUSER)
649104964Sjeff			td->td_priority = kg->kg_user_pri;
650104964Sjeff	}
651104964Sjeff}
652118972Sjhb
653104964Sjeff/*
654104964Sjeff * Charge the child's scheduling cpu usage to the parent.
655104964Sjeff *
656104964Sjeff * XXXKSE assume only one thread & kse & ksegrp keep estcpu in each ksegrp.
657104964Sjeff * Charge it to the ksegrp that did the wait; since process estcpu is the sum
658104964Sjeff * of all ksegrps, this is strictly as expected.  Assume that the child process
659104964Sjeff * aggregated all the estcpu into the 'built-in' ksegrp.
660104964Sjeff */
661104964Sjeffvoid
662132372Sjuliansched_exit(struct proc *p, struct thread *td)
663104964Sjeff{
664132372Sjulian	sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), td);
665132372Sjulian	sched_exit_thread(FIRST_THREAD_IN_PROC(p), td);
666113356Sjeff}
667113356Sjeff
668113356Sjeffvoid
669132372Sjuliansched_exit_ksegrp(struct ksegrp *kg, struct thread *childtd)
670113356Sjeff{
671113923Sjhb
672113923Sjhb	mtx_assert(&sched_lock, MA_OWNED);
673132372Sjulian	kg->kg_estcpu = ESTCPULIM(kg->kg_estcpu + childtd->td_ksegrp->kg_estcpu);
674104964Sjeff}
675104964Sjeff
676104964Sjeffvoid
677113356Sjeffsched_exit_thread(struct thread *td, struct thread *child)
678104964Sjeff{
679127894Sdfr	if ((child->td_proc->p_flag & P_NOLOAD) == 0)
680125288Sjeff		sched_tdcnt--;
681113356Sjeff}
682109145Sjeff
683113356Sjeffvoid
684134791Sjuliansched_fork(struct thread *td, struct thread *childtd)
685113356Sjeff{
686134791Sjulian	sched_fork_ksegrp(td, childtd->td_ksegrp);
687134791Sjulian	sched_fork_thread(td, childtd);
688113356Sjeff}
689113356Sjeff
690113356Sjeffvoid
691132372Sjuliansched_fork_ksegrp(struct thread *td, struct ksegrp *child)
692113356Sjeff{
693113923Sjhb	mtx_assert(&sched_lock, MA_OWNED);
694132372Sjulian	child->kg_estcpu = td->td_ksegrp->kg_estcpu;
695113356Sjeff}
696109145Sjeff
697113356Sjeffvoid
698134791Sjuliansched_fork_thread(struct thread *td, struct thread *childtd)
699113356Sjeff{
700134791Sjulian	sched_newthread(childtd);
701104964Sjeff}
702104964Sjeff
703104964Sjeffvoid
704130551Sjuliansched_nice(struct proc *p, int nice)
705104964Sjeff{
706130551Sjulian	struct ksegrp *kg;
707113873Sjhb
708130551Sjulian	PROC_LOCK_ASSERT(p, MA_OWNED);
709113873Sjhb	mtx_assert(&sched_lock, MA_OWNED);
710130551Sjulian	p->p_nice = nice;
711130551Sjulian	FOREACH_KSEGRP_IN_PROC(p, kg) {
712130551Sjulian		resetpriority(kg);
713130551Sjulian	}
714104964Sjeff}
715104964Sjeff
716113356Sjeffvoid
717113356Sjeffsched_class(struct ksegrp *kg, int class)
718113356Sjeff{
719113923Sjhb	mtx_assert(&sched_lock, MA_OWNED);
720113356Sjeff	kg->kg_pri_class = class;
721113356Sjeff}
722113356Sjeff
723105127Sjulian/*
724105127Sjulian * Adjust the priority of a thread.
725105127Sjulian * This may include moving the thread within the KSEGRP,
726105127Sjulian * changing the assignment of a kse to the thread,
727105127Sjulian * and moving a KSE in the system run queue.
728105127Sjulian */
729104964Sjeffvoid
730104964Sjeffsched_prio(struct thread *td, u_char prio)
731104964Sjeff{
732104964Sjeff
733113923Sjhb	mtx_assert(&sched_lock, MA_OWNED);
734104964Sjeff	if (TD_ON_RUNQ(td)) {
735105127Sjulian		adjustrunqueue(td, prio);
736105127Sjulian	} else {
737105127Sjulian		td->td_priority = prio;
738104964Sjeff	}
739104964Sjeff}
740104964Sjeff
741104964Sjeffvoid
742126326Sjhbsched_sleep(struct thread *td)
743104964Sjeff{
744113923Sjhb
745113923Sjhb	mtx_assert(&sched_lock, MA_OWNED);
746104964Sjeff	td->td_ksegrp->kg_slptime = 0;
747126326Sjhb	td->td_base_pri = td->td_priority;
748104964Sjeff}
749104964Sjeff
750135051Sjulianstatic void remrunqueue(struct thread *td);
751135051Sjulian
752104964Sjeffvoid
753135051Sjuliansched_switch(struct thread *td, struct thread *newtd, int flags)
754104964Sjeff{
755104964Sjeff	struct kse *ke;
756135051Sjulian	struct ksegrp *kg;
757104964Sjeff	struct proc *p;
758104964Sjeff
759104964Sjeff	ke = td->td_kse;
760104964Sjeff	p = td->td_proc;
761104964Sjeff
762113923Sjhb	mtx_assert(&sched_lock, MA_OWNED);
763104964Sjeff
764125295Sjeff	if ((p->p_flag & P_NOLOAD) == 0)
765125288Sjeff		sched_tdcnt--;
766134791Sjulian	/*
767135051Sjulian	 * We are volunteering to switch out so we get to nominate
768135051Sjulian	 * a successor for the rest of our quantum.
769135051Sjulian	 * First try another thread in our ksegrp, and then look for
770135051Sjulian	 * other ksegrps in our process.
771135051Sjulian	 */
772135051Sjulian	if (sched_followon &&
773135051Sjulian	    (p->p_flag & P_HADTHREADS) &&
774135051Sjulian	    (flags & SW_VOL) &&
775135051Sjulian	    newtd == NULL) {
776135051Sjulian		/* let's schedule another thread from this process */
777135051Sjulian		 kg = td->td_ksegrp;
778135051Sjulian		 if ((newtd = TAILQ_FIRST(&kg->kg_runq))) {
779135051Sjulian			remrunqueue(newtd);
780135051Sjulian			sched_kgfollowons++;
781135051Sjulian		 } else {
782135051Sjulian			FOREACH_KSEGRP_IN_PROC(p, kg) {
783135051Sjulian				if ((newtd = TAILQ_FIRST(&kg->kg_runq))) {
784135051Sjulian					sched_pfollowons++;
785135051Sjulian					remrunqueue(newtd);
786135051Sjulian					break;
787135051Sjulian				}
788135051Sjulian			}
789135051Sjulian		}
790135051Sjulian	}
791135051Sjulian
792135051Sjulian	/*
793134791Sjulian	 * The thread we are about to run needs to be counted as if it had been
794134791Sjulian	 * added to the run queue and selected.
795135295Sjulian	 * It came from:
796135295Sjulian	 * A preemption
797135295Sjulian	 * An upcall
798135295Sjulian	 * A followon
799135295Sjulian	 * Do this before saving curthread so that the slot count
800135295Sjulian	 * doesn't give an overly optimistic view when that happens.
801134791Sjulian	 */
802134791Sjulian	if (newtd) {
803135181Sjulian		KASSERT((newtd->td_inhibitors == 0),
804135181Sjulian			("trying to run inhibitted thread"));
805134791Sjulian		newtd->td_ksegrp->kg_avail_opennings--;
806134791Sjulian		newtd->td_kse->ke_flags |= KEF_DIDRUN;
807134791Sjulian        	TD_SET_RUNNING(newtd);
808134832Sjulian		if ((newtd->td_proc->p_flag & P_NOLOAD) == 0)
809134832Sjulian			sched_tdcnt++;
810134791Sjulian	}
811135051Sjulian
812113339Sjulian	td->td_lastcpu = td->td_oncpu;
813132266Sjhb	td->td_flags &= ~TDF_NEEDRESCHED;
814132266Sjhb	td->td_pflags &= ~TDP_OWEPREEMPT;
815113339Sjulian	td->td_oncpu = NOCPU;
816104964Sjeff	/*
817104964Sjeff	 * At the last moment, if this thread is still marked RUNNING,
818104964Sjeff	 * then put it back on the run queue as it has not been suspended
819131473Sjhb	 * or stopped or any thing else similar.  We never put the idle
820131473Sjhb	 * threads on the run queue, however.
821104964Sjeff	 */
822131473Sjhb	if (td == PCPU_GET(idlethread))
823131473Sjhb		TD_SET_CAN_RUN(td);
824134791Sjulian	else {
825134791Sjulian		td->td_ksegrp->kg_avail_opennings++;
826134791Sjulian		if (TD_IS_RUNNING(td)) {
827134791Sjulian			/* Put us back on the run queue (kse and all). */
828134791Sjulian			setrunqueue(td, SRQ_OURSELF|SRQ_YIELDING);
829134791Sjulian		} else if (p->p_flag & P_HADTHREADS) {
830134791Sjulian			/*
831134791Sjulian			 * We will not be on the run queue. So we must be
832134791Sjulian			 * sleeping or similar. As it's available,
833134791Sjulian			 * someone else can use the KSE if they need it.
834134791Sjulian			 */
835134791Sjulian			slot_fill(td->td_ksegrp);
836134791Sjulian		}
837104964Sjeff	}
838131473Sjhb	if (newtd == NULL)
839131473Sjhb		newtd = choosethread();
840121128Sjeff	if (td != newtd)
841121128Sjeff		cpu_switch(td, newtd);
842121128Sjeff	sched_lock.mtx_lock = (uintptr_t)td;
843121128Sjeff	td->td_oncpu = PCPU_GET(cpuid);
844104964Sjeff}
845104964Sjeff
846104964Sjeffvoid
847104964Sjeffsched_wakeup(struct thread *td)
848104964Sjeff{
849104964Sjeff	struct ksegrp *kg;
850104964Sjeff
851113923Sjhb	mtx_assert(&sched_lock, MA_OWNED);
852104964Sjeff	kg = td->td_ksegrp;
853104964Sjeff	if (kg->kg_slptime > 1)
854104964Sjeff		updatepri(kg);
855104964Sjeff	kg->kg_slptime = 0;
856134586Sjulian	setrunqueue(td, SRQ_BORING);
857104964Sjeff}
858104964Sjeff
859134693Sjulian#ifdef SMP
860134688Sjulian/* enable HTT_2 if you have a 2-way HTT cpu.*/
861134688Sjulianstatic int
862134688Sjulianforward_wakeup(int  cpunum)
863134688Sjulian{
864134688Sjulian	cpumask_t map, me, dontuse;
865134688Sjulian	cpumask_t map2;
866134688Sjulian	struct pcpu *pc;
867134688Sjulian	cpumask_t id, map3;
868134688Sjulian
869134688Sjulian	mtx_assert(&sched_lock, MA_OWNED);
870134688Sjulian
871134791Sjulian	CTR0(KTR_RUNQ, "forward_wakeup()");
872134688Sjulian
873134688Sjulian	if ((!forward_wakeup_enabled) ||
874134688Sjulian	     (forward_wakeup_use_mask == 0 && forward_wakeup_use_loop == 0))
875134688Sjulian		return (0);
876134688Sjulian	if (!smp_started || cold || panicstr)
877134688Sjulian		return (0);
878134688Sjulian
879134688Sjulian	forward_wakeups_requested++;
880134688Sjulian
881134688Sjulian/*
882134688Sjulian * check the idle mask we received against what we calculated before
883134688Sjulian * in the old version.
884134688Sjulian */
885134688Sjulian	me = PCPU_GET(cpumask);
886134688Sjulian	/*
887134688Sjulian	 * don't bother if we should be doing it ourself..
888134688Sjulian	 */
889134688Sjulian	if ((me & idle_cpus_mask) && (cpunum == NOCPU || me == (1 << cpunum)))
890134688Sjulian		return (0);
891134688Sjulian
892134688Sjulian	dontuse = me | stopped_cpus | hlt_cpus_mask;
893134688Sjulian	map3 = 0;
894134688Sjulian	if (forward_wakeup_use_loop) {
895134688Sjulian		SLIST_FOREACH(pc, &cpuhead, pc_allcpu) {
896134688Sjulian			id = pc->pc_cpumask;
897134688Sjulian			if ( (id & dontuse) == 0 &&
898134688Sjulian			    pc->pc_curthread == pc->pc_idlethread) {
899134688Sjulian				map3 |= id;
900134688Sjulian			}
901134688Sjulian		}
902134688Sjulian	}
903134688Sjulian
904134688Sjulian	if (forward_wakeup_use_mask) {
905134688Sjulian		map = 0;
906134688Sjulian		map = idle_cpus_mask & ~dontuse;
907134688Sjulian
908134688Sjulian		/* If they are both on, compare and use loop if different */
909134688Sjulian		if (forward_wakeup_use_loop) {
910134688Sjulian			if (map != map3) {
911134688Sjulian				printf("map (%02X) != map3 (%02X)\n",
912134688Sjulian						map, map3);
913134688Sjulian				map = map3;
914134688Sjulian			}
915134688Sjulian		}
916134688Sjulian	} else {
917134688Sjulian		map = map3;
918134688Sjulian	}
919134688Sjulian	/* If we only allow a specific CPU, then mask off all the others */
920134688Sjulian	if (cpunum != NOCPU) {
921134688Sjulian		KASSERT((cpunum <= mp_maxcpus),("forward_wakeup: bad cpunum."));
922134688Sjulian		map &= (1 << cpunum);
923134688Sjulian	} else {
924134688Sjulian		/* Try to choose an idle die. */
925134688Sjulian		if (forward_wakeup_use_htt) {
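			/*
			 * (map & (map >> 1)) & 0x5555 keeps a bit only where
			 * both members of an (even, odd) logical-CPU pair are
			 * idle, reported by the even member; this assumes HTT
			 * siblings are numbered consecutively.
			 */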
926134688Sjulian			map2 =  (map & (map >> 1)) & 0x5555;
927134688Sjulian			if (map2) {
928134688Sjulian				map = map2;
929134688Sjulian			}
930134688Sjulian		}
931134688Sjulian
932134688Sjulian		/* set only one bit */
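		/*
		 * (~map) + 1 is -map in two's complement, so the AND below
		 * isolates the lowest set bit: the lowest-numbered idle CPU
		 * left in the map.
		 */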
933134688Sjulian		if (forward_wakeup_use_single) {
934134688Sjulian			map = map & ((~map) + 1);
935134688Sjulian		}
936134688Sjulian	}
937134688Sjulian	if (map) {
938134688Sjulian		forward_wakeups_delivered++;
939134688Sjulian		ipi_selected(map, IPI_AST);
940134688Sjulian		return (1);
941134688Sjulian	}
942134688Sjulian	if (cpunum == NOCPU)
943134688Sjulian		printf("forward_wakeup: Idle processor not found\n");
944134688Sjulian	return (0);
945134688Sjulian}
946134693Sjulian#endif
947134688Sjulian
948104964Sjeffvoid
949134586Sjuliansched_add(struct thread *td, int flags)
950104964Sjeff{
951121127Sjeff	struct kse *ke;
952134591Sjulian#ifdef SMP
953134591Sjulian	int forwarded = 0;
954134591Sjulian	int cpu;
955134591Sjulian#endif
956121127Sjeff
957121127Sjeff	ke = td->td_kse;
958104964Sjeff	mtx_assert(&sched_lock, MA_OWNED);
959104964Sjeff	KASSERT(ke->ke_state != KES_ONRUNQ,
960124957Sjeff	    ("sched_add: kse %p (%s) already in run queue", ke,
961104964Sjeff	    ke->ke_proc->p_comm));
962104964Sjeff	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
963124957Sjeff	    ("sched_add: process swapped out"));
964131481Sjhb
965131481Sjhb#ifdef SMP
966124955Sjeff	if (KSE_CAN_MIGRATE(ke)) {
967134591Sjulian		CTR2(KTR_RUNQ,
968134591Sjulian		    "sched_add: adding kse:%p (td:%p) to gbl runq", ke, td);
969134591Sjulian		cpu = NOCPU;
970124955Sjeff		ke->ke_runq = &runq;
971124955Sjeff	} else {
972124955Sjeff		if (!SKE_RUNQ_PCPU(ke))
973134591Sjulian			ke->ke_runq = &runq_pcpu[(cpu = PCPU_GET(cpuid))];
974134591Sjulian		else
975134591Sjulian			cpu = td->td_lastcpu;
976134591Sjulian		CTR3(KTR_RUNQ,
977134591Sjulian		    "sched_add: Put kse:%p(td:%p) on cpu%d runq", ke, td, cpu);
978124955Sjeff	}
979124955Sjeff#else
980133396Sjulian	CTR2(KTR_RUNQ, "sched_add: adding kse:%p (td:%p) to runq", ke, td);
981124955Sjeff	ke->ke_runq = &runq;
982134591Sjulian
983124955Sjeff#endif
984134591Sjulian	/*
985134591Sjulian	 * If we are yielding (on the way out anyhow)
986134591Sjulian	 * or the thread being saved is US,
987134591Sjulian	 * then don't try to be smart about preemption
988134591Sjulian	 * or kicking off another CPU
989134591Sjulian	 * as it won't help and may hinder.
990134591Sjulian	 * In the YIELDING case, we are about to run whoever is
991134591Sjulian	 * being put in the queue anyhow, and in the
992134591Sjulian	 * OURSELF case, we are putting ourselves on the run queue
993134591Sjulian	 * which also only happens when we are about to yield.
994134591Sjulian	 */
995134591Sjulian	if ((flags & SRQ_YIELDING) == 0) {
996134591Sjulian#ifdef SMP
997134591Sjulian		cpumask_t me = PCPU_GET(cpumask);
998134591Sjulian		int idle = idle_cpus_mask & me;
999134591Sjulian		/*
1000134591Sjulian		 * Only try to kick off another CPU if
1001134591Sjulian		 * the thread is unpinned
1002134591Sjulian		 * or pinned to another cpu,
1003134591Sjulian		 * and there are other available and idle CPUs.
1004134837Sjulian		 * If we are idle, or it's an interrupt,
1005134837Sjulian		 * then skip straight to preemption.
1006134591Sjulian		 */
1007134837Sjulian		if ( (! idle) && ((flags & SRQ_INTR) == 0) &&
1008134591Sjulian		    (idle_cpus_mask & ~(hlt_cpus_mask | me)) &&
1009134591Sjulian		    ( KSE_CAN_MIGRATE(ke) ||
1010134591Sjulian		      ke->ke_runq != &runq_pcpu[PCPU_GET(cpuid)])) {
1011134591Sjulian			forwarded = forward_wakeup(cpu);
1012134591Sjulian		}
1013134591Sjulian		/*
1014134591Sjulian		 * If we failed to kick off another cpu, then look to
1015134591Sjulian		 * see if we should preempt this CPU. Only allow this
1016134591Sjulian		 * if it is not pinned or IS pinned to this CPU.
1017134591Sjulian		 * If we are the idle thread, we also try to preempt,
1018134591Sjulian		 * as it will be quicker and, being idle, we won't
1019134591Sjulian		 * lose by doing so.
1020134591Sjulian		 */
1021134591Sjulian		if ((!forwarded) &&
1022134591Sjulian		    (ke->ke_runq == &runq ||
1023134591Sjulian		     ke->ke_runq == &runq_pcpu[PCPU_GET(cpuid)]))
1024134591Sjulian#endif
1025134591Sjulian
1026134591Sjulian		{
1027134591Sjulian			if (maybe_preempt(td))
1028134591Sjulian				return;
1029134591Sjulian		}
1030134591Sjulian	}
1031125295Sjeff	if ((td->td_proc->p_flag & P_NOLOAD) == 0)
1032125288Sjeff		sched_tdcnt++;
1033135295Sjulian	td->td_ksegrp->kg_avail_opennings--;
1034124955Sjeff	runq_add(ke->ke_runq, ke);
1035133520Sjulian	ke->ke_ksegrp->kg_runq_kses++;
1036133520Sjulian	ke->ke_state = KES_ONRUNQ;
1037132118Sjhb	maybe_resched(td);
1038104964Sjeff}
1039104964Sjeff
1040104964Sjeffvoid
1041121127Sjeffsched_rem(struct thread *td)
1042104964Sjeff{
1043121127Sjeff	struct kse *ke;
1044121127Sjeff
1045121127Sjeff	ke = td->td_kse;
1046104964Sjeff	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
1047124957Sjeff	    ("sched_rem: process swapped out"));
1048124957Sjeff	KASSERT((ke->ke_state == KES_ONRUNQ),
1049124957Sjeff	    ("sched_rem: KSE not on run queue"));
1050104964Sjeff	mtx_assert(&sched_lock, MA_OWNED);
1051104964Sjeff
1052125295Sjeff	if ((td->td_proc->p_flag & P_NOLOAD) == 0)
1053125288Sjeff		sched_tdcnt--;
1054135295Sjulian	td->td_ksegrp->kg_avail_opennings++;
1055134145Sjulian	runq_remove(ke->ke_runq, ke);
1056124955Sjeff
1057104964Sjeff	ke->ke_state = KES_THREAD;
1058135295Sjulian	td->td_ksegrp->kg_runq_kses--;
1059104964Sjeff}
1060104964Sjeff
1061135295Sjulian/*
1062135295Sjulian * Select threads to run.
1063135295Sjulian * Notice that the running threads still consume a slot.
1064135295Sjulian */
1065104964Sjeffstruct kse *
1066104964Sjeffsched_choose(void)
1067104964Sjeff{
1068104964Sjeff	struct kse *ke;
1069124955Sjeff	struct runq *rq;
1070104964Sjeff
1071124955Sjeff#ifdef SMP
1072124955Sjeff	struct kse *kecpu;
1073124955Sjeff
1074124955Sjeff	rq = &runq;
1075104964Sjeff	ke = runq_choose(&runq);
1076124955Sjeff	kecpu = runq_choose(&runq_pcpu[PCPU_GET(cpuid)]);
1077104964Sjeff
1078124955Sjeff	if (ke == NULL ||
1079124955Sjeff	    (kecpu != NULL &&
1080124955Sjeff	     kecpu->ke_thread->td_priority < ke->ke_thread->td_priority)) {
1081133396Sjulian		CTR2(KTR_RUNQ, "choosing kse %p from pcpu runq %d", kecpu,
1082124955Sjeff		     PCPU_GET(cpuid));
1083124955Sjeff		ke = kecpu;
1084124955Sjeff		rq = &runq_pcpu[PCPU_GET(cpuid)];
1085124955Sjeff	} else {
1086133396Sjulian		CTR1(KTR_RUNQ, "choosing kse %p from main runq", ke);
1087124955Sjeff	}
1088124955Sjeff
1089124955Sjeff#else
1090124955Sjeff	rq = &runq;
1091124955Sjeff	ke = runq_choose(&runq);
1092124955Sjeff#endif
1093124955Sjeff
1094104964Sjeff	if (ke != NULL) {
1095124955Sjeff		runq_remove(rq, ke);
1096104964Sjeff		ke->ke_state = KES_THREAD;
1097133520Sjulian		ke->ke_ksegrp->kg_runq_kses--;
1098104964Sjeff
1099104964Sjeff		KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
1100124957Sjeff		    ("sched_choose: process swapped out"));
1101104964Sjeff	}
1102104964Sjeff	return (ke);
1103104964Sjeff}
1104104964Sjeff
1105104964Sjeffvoid
1106104964Sjeffsched_userret(struct thread *td)
1107104964Sjeff{
1108104964Sjeff	struct ksegrp *kg;
1109104964Sjeff	/*
1110104964Sjeff	 * XXX we cheat slightly on the locking here to avoid locking in
1111104964Sjeff	 * the usual case.  Setting td_priority here is essentially an
1112104964Sjeff	 * incomplete workaround for not setting it properly elsewhere.
1113104964Sjeff	 * Now that some interrupt handlers are threads, not setting it
1114104964Sjeff	 * properly elsewhere can clobber it in the window between setting
1115104964Sjeff	 * it here and returning to user mode, so don't waste time setting
1116104964Sjeff	 * it perfectly here.
1117104964Sjeff	 */
1118104964Sjeff	kg = td->td_ksegrp;
1119104964Sjeff	if (td->td_priority != kg->kg_user_pri) {
1120104964Sjeff		mtx_lock_spin(&sched_lock);
1121104964Sjeff		td->td_priority = kg->kg_user_pri;
1122104964Sjeff		mtx_unlock_spin(&sched_lock);
1123104964Sjeff	}
1124104964Sjeff}
1125107126Sjeff
1126124955Sjeffvoid
1127124955Sjeffsched_bind(struct thread *td, int cpu)
1128124955Sjeff{
1129124955Sjeff	struct kse *ke;
1130124955Sjeff
1131124955Sjeff	mtx_assert(&sched_lock, MA_OWNED);
1132124955Sjeff	KASSERT(TD_IS_RUNNING(td),
1133124955Sjeff	    ("sched_bind: cannot bind non-running thread"));
1134124955Sjeff
1135124955Sjeff	ke = td->td_kse;
1136124955Sjeff
1137124955Sjeff	ke->ke_flags |= KEF_BOUND;
1138124955Sjeff#ifdef SMP
1139124955Sjeff	ke->ke_runq = &runq_pcpu[cpu];
1140124955Sjeff	if (PCPU_GET(cpuid) == cpu)
1141124955Sjeff		return;
1142124955Sjeff
1143124955Sjeff	ke->ke_state = KES_THREAD;
1144124955Sjeff
1145131473Sjhb	mi_switch(SW_VOL, NULL);
1146124955Sjeff#endif
1147124955Sjeff}
1148124955Sjeff
1149124955Sjeffvoid
1150124955Sjeffsched_unbind(struct thread* td)
1151124955Sjeff{
1152124955Sjeff	mtx_assert(&sched_lock, MA_OWNED);
1153124955Sjeff	td->td_kse->ke_flags &= ~KEF_BOUND;
1154124955Sjeff}
1155124955Sjeff
1156107126Sjeffint
1157125288Sjeffsched_load(void)
1158125288Sjeff{
1159125288Sjeff	return (sched_tdcnt);
1160125288Sjeff}
1161125288Sjeff
1162125288Sjeffint
1163107126Sjeffsched_sizeof_ksegrp(void)
1164107126Sjeff{
1165134791Sjulian	return (sizeof(struct ksegrp) + sizeof(struct kg_sched));
1166107126Sjeff}
1167107126Sjeffint
1168107126Sjeffsched_sizeof_proc(void)
1169107126Sjeff{
1170107126Sjeff	return (sizeof(struct proc));
1171107126Sjeff}
1172107126Sjeffint
1173107126Sjeffsched_sizeof_thread(void)
1174107126Sjeff{
1175134791Sjulian	return (sizeof(struct thread) + sizeof(struct kse));
1176107126Sjeff}
1177107137Sjeff
1178107137Sjefffixpt_t
1179121127Sjeffsched_pctcpu(struct thread *td)
1180107137Sjeff{
1181121147Sjeff	struct kse *ke;
1182121147Sjeff
1183121147Sjeff	ke = td->td_kse;
1184134791Sjulian	return (ke->ke_pctcpu);
1187107137Sjeff}
1188134791Sjulian#define KERN_SWITCH_INCLUDE 1
1189134791Sjulian#include "kern/kern_switch.c"