/* kern_synch.c revision 82096 */
/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
 * $FreeBSD: head/sys/kern/kern_synch.c 82096 2001-08-21 20:09:05Z jhb $
 */
41298948Sadrian
42298948Sadrian#include "opt_ddb.h"
43298948Sadrian#include "opt_ktrace.h"
44298948Sadrian
45298948Sadrian#include <sys/param.h>
46298948Sadrian#include <sys/systm.h>
47298948Sadrian#include <sys/condvar.h>
48298948Sadrian#include <sys/kernel.h>
49298948Sadrian#include <sys/ktr.h>
50298948Sadrian#include <sys/lock.h>
51298948Sadrian#include <sys/mutex.h>
52298948Sadrian#include <sys/proc.h>
53298948Sadrian#include <sys/resourcevar.h>
54298948Sadrian#include <sys/signalvar.h>
55298948Sadrian#include <sys/smp.h>
56298948Sadrian#include <sys/sx.h>
57298948Sadrian#include <sys/sysctl.h>
58298948Sadrian#include <sys/sysproto.h>
59298948Sadrian#include <sys/vmmeter.h>
60298948Sadrian#include <vm/vm.h>
61298948Sadrian#include <vm/vm_extern.h>
62298948Sadrian#ifdef DDB
63298948Sadrian#include <ddb/ddb.h>
64298948Sadrian#endif
65298948Sadrian#ifdef KTRACE
66298948Sadrian#include <sys/uio.h>
67298948Sadrian#include <sys/ktrace.h>
68298948Sadrian#endif
69298948Sadrian
70298948Sadrian#include <machine/cpu.h>
71298948Sadrian
/* Forward declaration so the SYSINIT below can reference the setup hook. */
static void sched_setup __P((void *dummy));
SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL)

int	hogticks;		/* Kept at 2 * sched_quantum (see sleepinit()). */
int	lbolt;			/* Wakeup channel hit once a second by schedcpu(). */
int	sched_quantum;		/* Roundrobin scheduling quantum in ticks. */

/* Callouts driving the periodic schedcpu() and roundrobin() events. */
static struct callout schedcpu_callout;
static struct callout roundrobin_callout;

static void	endtsleep __P((void *));
static void	roundrobin __P((void *arg));
static void	schedcpu __P((void *arg));
85298948Sadrian
86298948Sadrianstatic int
87298948Sadriansysctl_kern_quantum(SYSCTL_HANDLER_ARGS)
88298948Sadrian{
89298948Sadrian	int error, new_val;
90298948Sadrian
91298948Sadrian	new_val = sched_quantum * tick;
92298948Sadrian	error = sysctl_handle_int(oidp, &new_val, 0, req);
93298948Sadrian        if (error != 0 || req->newptr == NULL)
94298948Sadrian		return (error);
95298948Sadrian	if (new_val < tick)
96298948Sadrian		return (EINVAL);
97298948Sadrian	sched_quantum = new_val / tick;
98298948Sadrian	hogticks = 2 * sched_quantum;
99298948Sadrian	return (0);
100298948Sadrian}
101298948Sadrian
102298948SadrianSYSCTL_PROC(_kern, OID_AUTO, quantum, CTLTYPE_INT|CTLFLAG_RW,
103298948Sadrian	0, sizeof sched_quantum, sysctl_kern_quantum, "I", "");
104298948Sadrian
105298948Sadrian/*
106298948Sadrian * Arrange to reschedule if necessary, taking the priorities and
107298948Sadrian * schedulers into account.
108298948Sadrian */
109298948Sadrianvoid
110298948Sadrianmaybe_resched(p)
111298948Sadrian	struct proc *p;
112298948Sadrian{
113298948Sadrian
114298948Sadrian	mtx_assert(&sched_lock, MA_OWNED);
115298948Sadrian	if (p->p_pri.pri_level < curproc->p_pri.pri_level)
116298948Sadrian		curproc->p_sflag |= PS_NEEDRESCHED;
117298948Sadrian}
118298948Sadrian
119298948Sadrianint
120298948Sadrianroundrobin_interval(void)
121298948Sadrian{
122298948Sadrian	return (sched_quantum);
123298948Sadrian}
124298948Sadrian
/*
 * Force switch among equal priority processes every 100ms.
 * We don't actually need to force a context switch of the current process.
 * The act of firing the event triggers a context switch to softclock() and
 * then switching back out again which is equivalent to a preemption, thus
 * no further work is needed on the local CPU.
 */
/* ARGSUSED */
static void
roundrobin(arg)
	void *arg;
{

#ifdef SMP
	/* Other CPUs must be asked explicitly to reschedule. */
	mtx_lock_spin(&sched_lock);
	forward_roundrobin();
	mtx_unlock_spin(&sched_lock);
#endif

	/* Rearm ourselves one quantum from now. */
	callout_reset(&roundrobin_callout, sched_quantum, roundrobin, NULL);
}
146298948Sadrian
147298948Sadrian/*
148298948Sadrian * Constants for digital decay and forget:
149298948Sadrian *	90% of (p_estcpu) usage in 5 * loadav time
150298948Sadrian *	95% of (p_pctcpu) usage in 60 seconds (load insensitive)
151298948Sadrian *          Note that, as ps(1) mentions, this can let percentages
152298948Sadrian *          total over 100% (I've seen 137.9% for 3 processes).
153298948Sadrian *
154298948Sadrian * Note that schedclock() updates p_estcpu and p_cpticks asynchronously.
155298948Sadrian *
156298948Sadrian * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds.
157298948Sadrian * That is, the system wants to compute a value of decay such
158298948Sadrian * that the following for loop:
159298948Sadrian * 	for (i = 0; i < (5 * loadavg); i++)
160298948Sadrian * 		p_estcpu *= decay;
161298948Sadrian * will compute
162298948Sadrian * 	p_estcpu *= 0.1;
163298948Sadrian * for all values of loadavg:
164298948Sadrian *
165298948Sadrian * Mathematically this loop can be expressed by saying:
166298948Sadrian * 	decay ** (5 * loadavg) ~= .1
167298948Sadrian *
168298948Sadrian * The system computes decay as:
169298948Sadrian * 	decay = (2 * loadavg) / (2 * loadavg + 1)
170298948Sadrian *
171298948Sadrian * We wish to prove that the system's computation of decay
172298948Sadrian * will always fulfill the equation:
173298948Sadrian * 	decay ** (5 * loadavg) ~= .1
174298948Sadrian *
175298948Sadrian * If we compute b as:
176298948Sadrian * 	b = 2 * loadavg
177298948Sadrian * then
178298948Sadrian * 	decay = b / (b + 1)
179298948Sadrian *
180298948Sadrian * We now need to prove two things:
181298948Sadrian *	1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
182298948Sadrian *	2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
183298948Sadrian *
184298948Sadrian * Facts:
185298948Sadrian *         For x close to zero, exp(x) =~ 1 + x, since
186298948Sadrian *              exp(x) = 0! + x**1/1! + x**2/2! + ... .
187298948Sadrian *              therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
188298948Sadrian *         For x close to zero, ln(1+x) =~ x, since
189298948Sadrian *              ln(1+x) = x - x**2/2 + x**3/3 - ...     -1 < x < 1
190298948Sadrian *              therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
191298948Sadrian *         ln(.1) =~ -2.30
192298948Sadrian *
193298948Sadrian * Proof of (1):
194298948Sadrian *    Solve (factor)**(power) =~ .1 given power (5*loadav):
195298948Sadrian *	solving for factor,
196298948Sadrian *      ln(factor) =~ (-2.30/5*loadav), or
197298948Sadrian *      factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
198298948Sadrian *          exp(-1/b) =~ (b-1)/b =~ b/(b+1).                    QED
199298948Sadrian *
200298948Sadrian * Proof of (2):
201298948Sadrian *    Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
202298948Sadrian *	solving for power,
203298948Sadrian *      power*ln(b/(b+1)) =~ -2.30, or
204298948Sadrian *      power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.  QED
205298948Sadrian *
206298948Sadrian * Actual power values for the implemented algorithm are as follows:
207298948Sadrian *      loadav: 1       2       3       4
208298948Sadrian *      power:  5.68    10.32   14.94   19.55
209298948Sadrian */
210298948Sadrian
/* calculations for digital decay to forget 90% of usage in 5*loadav sec */
#define	loadfactor(loadav)	(2 * (loadav))
/* One fixed-point decay step: cpu scaled by loadfac/(loadfac + FSCALE). */
#define	decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))

/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
static fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;	/* exp(-1/20) */
SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");

/* kernel uses `FSCALE', userland (SHOULD) use kern.fscale */
/* Read-only sysctl; the variable itself is otherwise unused (__unused). */
static int	fscale __unused = FSCALE;
SYSCTL_INT(_kern, OID_AUTO, fscale, CTLFLAG_RD, 0, FSCALE, "");

/*
 * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
 * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
 * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
 *
 * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
 *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
 *
 * If you don't want to bother with the faster/more-accurate formula, you
 * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
 * (more general) method of calculating the %age of CPU used by a process.
 */
#define	CCPU_SHIFT	11
236298948Sadrian
/*
 * Recompute process priorities, every hz ticks.
 * MP-safe, called without the Giant mutex.
 */
/* ARGSUSED */
static void
schedcpu(arg)
	void *arg;
{
	register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
	register struct proc *p;
	register int realstathz;

	/* Fall back to hz if no separate statistics clock is running. */
	realstathz = stathz ? stathz : hz;
	sx_slock(&allproc_lock);
	LIST_FOREACH(p, &allproc, p_list) {
		/*
		 * Increment time in/out of memory and sleep time
		 * (if sleeping).  We ignore overflow; with 16-bit int's
		 * (remember them?) overflow takes 45 days.
		 */
		mtx_lock_spin(&sched_lock);
		p->p_swtime++;
		if (p->p_stat == SSLEEP || p->p_stat == SSTOP)
			p->p_slptime++;
		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
		/*
		 * If the process has slept the entire second,
		 * stop recalculating its priority until it wakes up.
		 */
		if (p->p_slptime > 1) {
			mtx_unlock_spin(&sched_lock);
			continue;
		}

		/*
		 * p_pctcpu is only for ps.
		 */
#if	(FSHIFT >= CCPU_SHIFT)
		p->p_pctcpu += (realstathz == 100)?
			((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT):
                	100 * (((fixpt_t) p->p_cpticks)
				<< (FSHIFT - CCPU_SHIFT)) / realstathz;
#else
		p->p_pctcpu += ((FSCALE - ccpu) *
			(p->p_cpticks * FSCALE / realstathz)) >> FSHIFT;
#endif
		p->p_cpticks = 0;
		p->p_estcpu = decay_cpu(loadfac, p->p_estcpu);
		resetpriority(p);
		/*
		 * Propagate the recomputed user priority into pri_level
		 * for processes at user priority; requeue only when the
		 * change crosses a run-queue boundary and the process is
		 * idle, runnable and in memory.
		 */
		if (p->p_pri.pri_level >= PUSER) {
			if (p->p_oncpu == NOCPU && 	/* idle */
			    p->p_stat == SRUN &&
			    (p->p_sflag & PS_INMEM) &&
			    (p->p_pri.pri_level / RQ_PPQ) !=
			    (p->p_pri.pri_user / RQ_PPQ)) {
				remrunqueue(p);
				p->p_pri.pri_level = p->p_pri.pri_user;
				setrunqueue(p);
			} else
				p->p_pri.pri_level = p->p_pri.pri_user;
		}
		mtx_unlock_spin(&sched_lock);
	}
	sx_sunlock(&allproc_lock);
	vmmeter();
	/* Anyone sleeping on lbolt gets their once-a-second wakeup here. */
	wakeup((caddr_t)&lbolt);
	/* Rearm ourselves for one second (hz ticks) from now. */
	callout_reset(&schedcpu_callout, hz, schedcpu, NULL);
}
306298948Sadrian
/*
 * Recalculate the priority of a process after it has slept for a while.
 * For all load averages >= 1 and max p_estcpu of 255, sleeping for at
 * least six times the loadfactor will decay p_estcpu to zero.
 */
void
updatepri(p)
	register struct proc *p;
{
	register unsigned int newcpu = p->p_estcpu;
	register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);

	if (p->p_slptime > 5 * loadfac)
		p->p_estcpu = 0;	/* slept long enough: decay fully */
	else {
		p->p_slptime--;	/* the first time was done in schedcpu */
		/* Apply one decay step per remaining second of sleep. */
		while (newcpu && --p->p_slptime)
			newcpu = decay_cpu(loadfac, newcpu);
		p->p_estcpu = newcpu;
	}
	resetpriority(p);
}
329298948Sadrian
/*
 * We're only looking at 7 bits of the address; everything is
 * aligned to 4, lots of things are aligned to greater powers
 * of 2.  Shift right by 8, i.e. drop the bottom 256 worth.
 */
#define TABLESIZE	128
/* Hash table of sleep queues, indexed by LOOKUP(wait channel). */
static TAILQ_HEAD(slpquehead, proc) slpque[TABLESIZE];
#define LOOKUP(x)	(((intptr_t)(x) >> 8) & (TABLESIZE - 1))
338298948Sadrian
339298948Sadrianvoid
340298948Sadriansleepinit(void)
341298948Sadrian{
342298948Sadrian	int i;
343298948Sadrian
344298948Sadrian	sched_quantum = hz/10;
345298948Sadrian	hogticks = 2 * sched_quantum;
346298948Sadrian	for (i = 0; i < TABLESIZE; i++)
347298948Sadrian		TAILQ_INIT(&slpque[i]);
348298948Sadrian}
349298948Sadrian
/*
 * General sleep call.  Suspends the current process until a wakeup is
 * performed on the specified identifier.  The process will then be made
 * runnable with the specified priority.  Sleeps at most timo/hz seconds
 * (0 means no timeout).  If pri includes PCATCH flag, signals are checked
 * before and after sleeping, else signals are not checked.  Returns 0 if
 * awakened, EWOULDBLOCK if the timeout expires.  If PCATCH is set and a
 * signal needs to be delivered, ERESTART is returned if the current system
 * call should be restarted if possible, and EINTR is returned if the system
 * call should be interrupted by the signal (return EINTR).
 *
 * The mutex argument is exited before the caller is suspended, and
 * entered before msleep returns.  If priority includes the PDROP
 * flag the mutex is not entered before returning.
 */
int
msleep(ident, mtx, priority, wmesg, timo)
	void *ident;
	struct mtx *mtx;
	int priority, timo;
	const char *wmesg;
{
	struct proc *p = curproc;
	int sig, catch = priority & PCATCH;
	int rval = 0;
	WITNESS_SAVE_DECL(mtx);

#ifdef KTRACE
	if (p && KTRPOINT(p, KTR_CSW))
		ktrcsw(p->p_tracep, 1, 0);
#endif
	WITNESS_SLEEP(0, &mtx->mtx_object);
	KASSERT(timo != 0 || mtx_owned(&Giant) || mtx != NULL,
	    ("sleeping without a mutex"));
	mtx_lock_spin(&sched_lock);
	if (cold || panicstr) {
		/*
		 * After a panic, or during autoconfiguration,
		 * just give interrupts a chance, then just return;
		 * don't run any other procs or panic below,
		 * in case this is the idle process and already asleep.
		 */
		if (mtx != NULL && priority & PDROP)
			mtx_unlock_flags(mtx, MTX_NOSWITCH);
		mtx_unlock_spin(&sched_lock);
		return (0);
	}

	DROP_GIANT_NOSWITCH();

	/* Release the interlock; forget it entirely when PDROP is set. */
	if (mtx != NULL) {
		mtx_assert(mtx, MA_OWNED | MA_NOTRECURSED);
		WITNESS_SAVE(&mtx->mtx_object, mtx);
		mtx_unlock_flags(mtx, MTX_NOSWITCH);
		if (priority & PDROP)
			mtx = NULL;
	}

	KASSERT(p != NULL, ("msleep1"));
	KASSERT(ident != NULL && p->p_stat == SRUN, ("msleep"));

	/* Record the wait channel and queue ourselves on its hash chain. */
	p->p_wchan = ident;
	p->p_wmesg = wmesg;
	p->p_slptime = 0;
	p->p_pri.pri_level = priority & PRIMASK;
	CTR5(KTR_PROC, "msleep: proc %p (pid %d, %s) on %s (%p)", p, p->p_pid,
	    p->p_comm, wmesg, ident);
	TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], p, p_slpq);
	if (timo)
		callout_reset(&p->p_slpcallout, timo, endtsleep, p);
	/*
	 * We put ourselves on the sleep queue and start our timeout
	 * before calling CURSIG, as we could stop there, and a wakeup
	 * or a SIGCONT (or both) could occur while we were stopped.
	 * A SIGCONT would cause us to be marked as SSLEEP
	 * without resuming us, thus we must be ready for sleep
	 * when CURSIG is called.  If the wakeup happens while we're
	 * stopped, p->p_wchan will be 0 upon return from CURSIG.
	 */
	if (catch) {
		CTR3(KTR_PROC, "msleep caught: proc %p (pid %d, %s)", p,
		    p->p_pid, p->p_comm);
		p->p_sflag |= PS_SINTR;
		mtx_unlock_spin(&sched_lock);
		PROC_LOCK(p);
		sig = CURSIG(p);
		mtx_lock_spin(&sched_lock);
		PROC_UNLOCK_NOSWITCH(p);
		if (sig != 0) {
			/* A signal is pending: abandon the sleep. */
			if (p->p_wchan != NULL)
				unsleep(p);
		} else if (p->p_wchan == NULL)
			/* Already woken while checking signals. */
			catch = 0;
	} else
		sig = 0;
	/* Still on the sleep queue: actually go to sleep now. */
	if (p->p_wchan != NULL) {
		p->p_stat = SSLEEP;
		p->p_stats->p_ru.ru_nvcsw++;
		mi_switch();
	}
	CTR3(KTR_PROC, "msleep resume: proc %p (pid %d, %s)", p, p->p_pid,
	    p->p_comm);
	KASSERT(p->p_stat == SRUN, ("running but not SRUN"));
	p->p_sflag &= ~PS_SINTR;
	if (p->p_sflag & PS_TIMEOUT) {
		/* endtsleep() fired and woke us: report the timeout. */
		p->p_sflag &= ~PS_TIMEOUT;
		if (sig == 0)
			rval = EWOULDBLOCK;
	} else if (p->p_sflag & PS_TIMOFAIL)
		/* The callout already ran after we woke; nothing to stop. */
		p->p_sflag &= ~PS_TIMOFAIL;
	else if (timo && callout_stop(&p->p_slpcallout) == 0) {
		/*
		 * This isn't supposed to be pretty.  If we are here, then
		 * the endtsleep() callout is currently executing on another
		 * CPU and is either spinning on the sched_lock or will be
		 * soon.  If we don't synchronize here, there is a chance
		 * that this process may msleep() again before the callout
		 * has a chance to run and the callout may end up waking up
		 * the wrong msleep().  Yuck.
		 */
		p->p_sflag |= PS_TIMEOUT;
		p->p_stats->p_ru.ru_nivcsw++;
		mi_switch();
	}
	mtx_unlock_spin(&sched_lock);

	if (rval == 0 && catch) {
		PROC_LOCK(p);
		/* XXX: shouldn't we always be calling CURSIG() */
		if (sig != 0 || (sig = CURSIG(p))) {
			if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig))
				rval = EINTR;
			else
				rval = ERESTART;
		}
		PROC_UNLOCK(p);
	}
	PICKUP_GIANT();
#ifdef KTRACE
	mtx_lock(&Giant);
	if (KTRPOINT(p, KTR_CSW))
		ktrcsw(p->p_tracep, 0, 0);
	mtx_unlock(&Giant);
#endif
	/* Re-acquire the caller's mutex unless PDROP consumed it above. */
	if (mtx != NULL) {
		mtx_lock(mtx);
		WITNESS_RESTORE(&mtx->mtx_object, mtx);
	}
	return (rval);
}
500298948Sadrian
/*
 * Implement timeout for msleep()
 *
 * If process hasn't been awakened (wchan non-zero),
 * set timeout flag and undo the sleep.  If proc
 * is stopped, just unsleep so it will remain stopped.
 * MP-safe, called without the Giant mutex.
 */
static void
endtsleep(arg)
	void *arg;
{
	register struct proc *p;

	p = (struct proc *)arg;
	CTR3(KTR_PROC, "endtsleep: proc %p (pid %d, %s)", p, p->p_pid,
	    p->p_comm);
	mtx_lock_spin(&sched_lock);
	/*
	 * This is the other half of the synchronization with msleep()
	 * described above.  If the PS_TIMEOUT flag is set, we lost the
	 * race and just need to put the process back on the runqueue.
	 */
	if ((p->p_sflag & PS_TIMEOUT) != 0) {
		p->p_sflag &= ~PS_TIMEOUT;
		setrunqueue(p);
	} else if (p->p_wchan != NULL) {
		/* Still waiting: cancel the sleep and flag the timeout. */
		if (p->p_stat == SSLEEP)
			setrunnable(p);
		else
			unsleep(p);
		p->p_sflag |= PS_TIMEOUT;
	} else
		/*
		 * Already woken by someone else; PS_TIMOFAIL tells
		 * msleep() the callout has run so it need not stop it.
		 */
		p->p_sflag |= PS_TIMOFAIL;
	mtx_unlock_spin(&sched_lock);
}
537298948Sadrian
538298948Sadrian/*
539298948Sadrian * Remove a process from its wait queue
540298948Sadrian */
541298948Sadrianvoid
542298948Sadrianunsleep(p)
543298948Sadrian	register struct proc *p;
544298948Sadrian{
545298948Sadrian
546298948Sadrian	mtx_lock_spin(&sched_lock);
547298948Sadrian	if (p->p_wchan != NULL) {
548298948Sadrian		TAILQ_REMOVE(&slpque[LOOKUP(p->p_wchan)], p, p_slpq);
549298948Sadrian		p->p_wchan = NULL;
550298948Sadrian	}
551298948Sadrian	mtx_unlock_spin(&sched_lock);
552298948Sadrian}
553298948Sadrian
/*
 * Make all processes sleeping on the specified identifier runnable.
 */
void
wakeup(ident)
	register void *ident;
{
	register struct slpquehead *qp;
	register struct proc *p;

	mtx_lock_spin(&sched_lock);
	qp = &slpque[LOOKUP(ident)];
restart:
	TAILQ_FOREACH(p, qp, p_slpq) {
		/* Chains are hashed; skip sleepers on other channels. */
		if (p->p_wchan == ident) {
			TAILQ_REMOVE(qp, p, p_slpq);
			p->p_wchan = NULL;
			if (p->p_stat == SSLEEP) {
				/* OPTIMIZED EXPANSION OF setrunnable(p); */
				CTR3(KTR_PROC, "wakeup: proc %p (pid %d, %s)",
				    p, p->p_pid, p->p_comm);
				if (p->p_slptime > 1)
					updatepri(p);
				p->p_slptime = 0;
				p->p_stat = SRUN;
				if (p->p_sflag & PS_INMEM) {
					setrunqueue(p);
					maybe_resched(p);
				} else {
					/* Swapped out: ask the swapper. */
					p->p_sflag |= PS_SWAPINREQ;
					wakeup((caddr_t)&proc0);
				}
				/* END INLINE EXPANSION */
				/*
				 * p was removed from the chain above, so
				 * its list links are stale; rescan from
				 * the head of the queue.
				 */
				goto restart;
			}
		}
	}
	mtx_unlock_spin(&sched_lock);
}
593298948Sadrian
/*
 * Make a process sleeping on the specified identifier runnable.
 * May wake more than one process if a target process is currently
 * swapped out.
 */
void
wakeup_one(ident)
	register void *ident;
{
	register struct slpquehead *qp;
	register struct proc *p;

	mtx_lock_spin(&sched_lock);
	qp = &slpque[LOOKUP(ident)];

	TAILQ_FOREACH(p, qp, p_slpq) {
		/* Chains are hashed; skip sleepers on other channels. */
		if (p->p_wchan == ident) {
			TAILQ_REMOVE(qp, p, p_slpq);
			p->p_wchan = NULL;
			if (p->p_stat == SSLEEP) {
				/* OPTIMIZED EXPANSION OF setrunnable(p); */
				CTR3(KTR_PROC, "wakeup1: proc %p (pid %d, %s)",
				    p, p->p_pid, p->p_comm);
				if (p->p_slptime > 1)
					updatepri(p);
				p->p_slptime = 0;
				p->p_stat = SRUN;
				if (p->p_sflag & PS_INMEM) {
					setrunqueue(p);
					maybe_resched(p);
					/* Woke one runnable process: done. */
					break;
				} else {
					/*
					 * Swapped out: request a swap-in
					 * and keep scanning -- this is how
					 * more than one process can wake.
					 */
					p->p_sflag |= PS_SWAPINREQ;
					wakeup((caddr_t)&proc0);
				}
				/* END INLINE EXPANSION */
			}
		}
	}
	mtx_unlock_spin(&sched_lock);
}
635298948Sadrian
/*
 * The machine independent parts of mi_switch().
 * Called with sched_lock held exactly once; cpu_switch() hands the CPU
 * to another process and returns here when this one is rescheduled.
 */
void
mi_switch()
{
	struct timeval new_switchtime;
	register struct proc *p = curproc;	/* XXX */
#if 0
	register struct rlimit *rlim;
#endif
	critical_t sched_crit;
	u_int sched_nest;

	mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED);

	/*
	 * Compute the amount of time during which the current
	 * process was running, and add that to its total so far.
	 */
	microuptime(&new_switchtime);
	if (timevalcmp(&new_switchtime, PCPU_PTR(switchtime), <)) {
#if 0
		/* XXX: This doesn't play well with sched_lock right now. */
		printf("microuptime() went backwards (%ld.%06ld -> %ld.%06ld)\n",
		    PCPU_GET(switchtime.tv_sec), PCPU_GET(switchtime.tv_usec),
		    new_switchtime.tv_sec, new_switchtime.tv_usec);
#endif
		/* Time went backwards; charge nothing rather than negative. */
		new_switchtime = PCPU_GET(switchtime);
	} else {
		p->p_runtime += (new_switchtime.tv_usec - PCPU_GET(switchtime.tv_usec)) +
		    (new_switchtime.tv_sec - PCPU_GET(switchtime.tv_sec)) *
		    (int64_t)1000000;
	}

#ifdef DDB
	/*
	 * Don't perform context switches from the debugger.
	 */
	if (db_active)
		db_error("Context switches not allowed in the debugger.");
#endif

#if 0
	/*
	 * Check if the process exceeds its cpu resource allocation.
	 * If over max, kill it.
	 *
	 * XXX drop sched_lock, pickup Giant
	 */
	if (p->p_stat != SZOMB && p->p_limit->p_cpulimit != RLIM_INFINITY &&
	    p->p_runtime > p->p_limit->p_cpulimit) {
		rlim = &p->p_rlimit[RLIMIT_CPU];
		if (p->p_runtime / (rlim_t)1000000 >= rlim->rlim_max) {
			mtx_unlock_spin(&sched_lock);
			PROC_LOCK(p);
			killproc(p, "exceeded maximum CPU limit");
			mtx_lock_spin(&sched_lock);
			PROC_UNLOCK_NOSWITCH(p);
		} else {
			mtx_unlock_spin(&sched_lock);
			PROC_LOCK(p);
			psignal(p, SIGXCPU);
			mtx_lock_spin(&sched_lock);
			PROC_UNLOCK_NOSWITCH(p);
			if (rlim->rlim_cur < rlim->rlim_max) {
				/* XXX: we should make a private copy */
				rlim->rlim_cur += 5;
			}
		}
	}
#endif

	/*
	 * Pick a new current process and record its start time.
	 */
	cnt.v_swtch++;
	PCPU_SET(switchtime, new_switchtime);
	CTR3(KTR_PROC, "mi_switch: old proc %p (pid %d, %s)", p, p->p_pid,
	    p->p_comm);
	/*
	 * Save sched_lock's critical-section and recursion state across
	 * the switch; it is restored below once cpu_switch() returns
	 * control to this process (possibly on a different CPU).
	 */
	sched_crit = sched_lock.mtx_savecrit;
	sched_nest = sched_lock.mtx_recurse;
	p->p_lastcpu = p->p_oncpu;
	p->p_oncpu = NOCPU;
	p->p_sflag &= ~PS_NEEDRESCHED;
	cpu_switch();
	/* We are running again: reclaim the CPU and sched_lock state. */
	p->p_oncpu = PCPU_GET(cpuid);
	sched_lock.mtx_savecrit = sched_crit;
	sched_lock.mtx_recurse = sched_nest;
	sched_lock.mtx_lock = (uintptr_t)p;
	CTR3(KTR_PROC, "mi_switch: new proc %p (pid %d, %s)", p, p->p_pid,
	    p->p_comm);
	if (PCPU_GET(switchtime.tv_sec) == 0)
		microuptime(PCPU_PTR(switchtime));
	PCPU_SET(switchticks, ticks);
}
732298948Sadrian
733298948Sadrian/*
734298948Sadrian * Change process state to be runnable,
735298948Sadrian * placing it on the run queue if it is in memory,
736298948Sadrian * and awakening the swapper if it isn't in memory.
737298948Sadrian */
738298948Sadrianvoid
739298948Sadriansetrunnable(p)
740298948Sadrian	register struct proc *p;
741298948Sadrian{
742298948Sadrian
743298948Sadrian	mtx_lock_spin(&sched_lock);
744298948Sadrian	switch (p->p_stat) {
745298948Sadrian	case 0:
746298948Sadrian	case SRUN:
747298948Sadrian	case SZOMB:
748298948Sadrian	case SWAIT:
749298948Sadrian	default:
750298948Sadrian		panic("setrunnable");
751298948Sadrian	case SSTOP:
752298948Sadrian	case SSLEEP:			/* e.g. when sending signals */
753298948Sadrian		if (p->p_sflag & PS_CVWAITQ)
754298948Sadrian			cv_waitq_remove(p);
755298948Sadrian		else
756298948Sadrian			unsleep(p);
757298948Sadrian		break;
758298948Sadrian
759298948Sadrian	case SIDL:
760298948Sadrian		break;
761298948Sadrian	}
762298948Sadrian	p->p_stat = SRUN;
763298948Sadrian	if (p->p_slptime > 1)
764298948Sadrian		updatepri(p);
765298948Sadrian	p->p_slptime = 0;
766298948Sadrian	if ((p->p_sflag & PS_INMEM) == 0) {
767298948Sadrian		p->p_sflag |= PS_SWAPINREQ;
768298948Sadrian		wakeup((caddr_t)&proc0);
769298948Sadrian	} else {
770298948Sadrian		setrunqueue(p);
771298948Sadrian		maybe_resched(p);
772298948Sadrian	}
773298948Sadrian	mtx_unlock_spin(&sched_lock);
774298948Sadrian}
775298948Sadrian
776298948Sadrian/*
777298948Sadrian * Compute the priority of a process when running in user mode.
778298948Sadrian * Arrange to reschedule if the resulting priority is better
779298948Sadrian * than that of the current process.
780298948Sadrian */
781298948Sadrianvoid
782298948Sadrianresetpriority(p)
783298948Sadrian	register struct proc *p;
784298948Sadrian{
785298948Sadrian	register unsigned int newpriority;
786298948Sadrian
787298948Sadrian	mtx_lock_spin(&sched_lock);
788298948Sadrian	if (p->p_pri.pri_class == PRI_TIMESHARE) {
789298948Sadrian		newpriority = PUSER + p->p_estcpu / INVERSE_ESTCPU_WEIGHT +
790298948Sadrian		    NICE_WEIGHT * (p->p_nice - PRIO_MIN);
791298948Sadrian		newpriority = min(max(newpriority, PRI_MIN_TIMESHARE),
792298948Sadrian		    PRI_MAX_TIMESHARE);
793298948Sadrian		p->p_pri.pri_user = newpriority;
794298948Sadrian	}
795298948Sadrian	maybe_resched(p);
796298948Sadrian	mtx_unlock_spin(&sched_lock);
797298948Sadrian}
798298948Sadrian
799298948Sadrian/* ARGSUSED */
800298948Sadrianstatic void
801298948Sadriansched_setup(dummy)
802298948Sadrian	void *dummy;
803298948Sadrian{
804298948Sadrian
805298948Sadrian	callout_init(&schedcpu_callout, 1);
806298948Sadrian	callout_init(&roundrobin_callout, 0);
807298948Sadrian
808298948Sadrian	/* Kick off timeout driven events by calling first time. */
809298948Sadrian	roundrobin(NULL);
810298948Sadrian	schedcpu(NULL);
811298948Sadrian}
812298948Sadrian
813298948Sadrian/*
814298948Sadrian * We adjust the priority of the current process.  The priority of
815298948Sadrian * a process gets worse as it accumulates CPU time.  The cpu usage
816298948Sadrian * estimator (p_estcpu) is increased here.  resetpriority() will
817298948Sadrian * compute a different priority each time p_estcpu increases by
818298948Sadrian * INVERSE_ESTCPU_WEIGHT
819298948Sadrian * (until MAXPRI is reached).  The cpu usage estimator ramps up
820298948Sadrian * quite quickly when the process is running (linearly), and decays
821298948Sadrian * away exponentially, at a rate which is proportionally slower when
822298948Sadrian * the system is busy.  The basic principle is that the system will
823298948Sadrian * 90% forget that the process used a lot of CPU time in 5 * loadav
824298948Sadrian * seconds.  This causes the system to favor processes which haven't
825298948Sadrian * run much recently, and to round-robin among other processes.
826298948Sadrian */
827298948Sadrianvoid
828298948Sadrianschedclock(p)
829298948Sadrian	struct proc *p;
830298948Sadrian{
831298948Sadrian
832298948Sadrian	p->p_cpticks++;
833298948Sadrian	p->p_estcpu = ESTCPULIM(p->p_estcpu + 1);
834298948Sadrian	if ((p->p_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) {
835298948Sadrian		resetpriority(p);
836298948Sadrian		if (p->p_pri.pri_level >= PUSER)
837298948Sadrian			p->p_pri.pri_level = p->p_pri.pri_user;
838298948Sadrian	}
839298948Sadrian}
840298948Sadrian
841298948Sadrian/*
842298948Sadrian * General purpose yield system call
843298948Sadrian */
int
yield(struct proc *p, struct yield_args *uap)
{

	/* The syscall always succeeds; uap carries no arguments used here. */
	p->p_retval[0] = 0;

	mtx_lock_spin(&sched_lock);
	/*
	 * NOTE(review): DROP_GIANT_NOSWITCH presumably releases Giant
	 * without triggering a context switch, so sched_lock alone is
	 * held across mi_switch() -- confirm against the mutex macros.
	 */
	DROP_GIANT_NOSWITCH();
	/*
	 * Demote to the weakest timesharing priority and requeue, so
	 * every other runnable timesharing process gets to run first.
	 */
	p->p_pri.pri_level = PRI_MAX_TIMESHARE;
	setrunqueue(p);
	p->p_stats->p_ru.ru_nvcsw++;	/* count as a voluntary switch */
	mi_switch();
	mtx_unlock_spin(&sched_lock);
	PICKUP_GIANT();			/* reacquire Giant once running again */

	return (0);
}
861298948Sadrian