/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
 * $FreeBSD: head/sys/kern/kern_synch.c 62573 2000-07-04 11:25:35Z phk $
 */

#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/signalvar.h>
#include <sys/resourcevar.h>
#include <sys/vmmeter.h>
#include <sys/sysctl.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
#ifdef KTRACE
#include <sys/uio.h>
#include <sys/ktrace.h>
#endif

#include <machine/cpu.h>
#include <machine/ipl.h>
#include <machine/smp.h>

static void sched_setup __P((void *dummy));
SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL)

u_char	curpriority;
int	hogticks;
int	lbolt;
int	sched_quantum;		/* Roundrobin scheduling quantum in ticks. */

static int	curpriority_cmp __P((struct proc *p));
static void	endtsleep __P((void *));
static void	maybe_resched __P((struct proc *chk));
static void	roundrobin __P((void *arg));
static void	schedcpu __P((void *arg));
static void	updatepri __P((struct proc *p));

static int
sysctl_kern_quantum(SYSCTL_HANDLER_ARGS)
{
	int error, new_val;

	new_val = sched_quantum * tick;
	error = sysctl_handle_int(oidp, &new_val, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (new_val < tick)
		return (EINVAL);
	sched_quantum = new_val / tick;
	hogticks = 2 * sched_quantum;
	return (0);
}

SYSCTL_PROC(_kern, OID_AUTO, quantum, CTLTYPE_INT|CTLFLAG_RW,
	0, sizeof sched_quantum, sysctl_kern_quantum, "I", "");
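
/*
 * Illustrative sketch (not part of the original file): how a value written
 * to the kern.quantum sysctl, expressed in microseconds, maps onto
 * sched_quantum, expressed in clock ticks.  The numbers assume the common
 * hz == 100, so tick == 1000000 / hz == 10000 microseconds.
 *
 *	new_val = 50000;		(request a 50ms quantum via sysctl)
 *	sched_quantum = new_val / tick;	(50000 / 10000 == 5 ticks)
 *	hogticks = 2 * sched_quantum;	(10 ticks, i.e. 100ms)
 *
 * A request smaller than one tick is rejected with EINVAL, since the
 * quantum cannot be finer than the clock resolution; a read reports the
 * current quantum as sched_quantum * tick microseconds.
 */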

/*-
 * Compare priorities.  Return:
 *     <0: priority of p < current priority
 *      0: priority of p == current priority
 *     >0: priority of p > current priority
 * The priorities are the normal priorities or the normal realtime priorities
 * if p is on the same scheduler as curproc.  Otherwise the process on the
 * more realtimeish scheduler has lowest priority.  As usual, a higher
 * priority really means a lower priority.
 */
static int
curpriority_cmp(p)
	struct proc *p;
{
	int c_class, p_class;

	c_class = RTP_PRIO_BASE(curproc->p_rtprio.type);
	p_class = RTP_PRIO_BASE(p->p_rtprio.type);
	if (p_class != c_class)
		return (p_class - c_class);
	if (p_class == RTP_PRIO_NORMAL)
		return (((int)p->p_priority - (int)curpriority) / PPQ);
	return ((int)p->p_rtprio.prio - (int)curproc->p_rtprio.prio);
}

/*
 * Arrange to reschedule if necessary, taking the priorities and
 * schedulers into account.
 */
static void
maybe_resched(chk)
	struct proc *chk;
{
	struct proc *p = curproc; /* XXX */

	/*
	 * XXX idle scheduler still broken because the process stays on the
	 * idle scheduler during waits (such as when getting FS locks).  If
	 * a standard process becomes runaway cpu-bound, the system can lock
	 * up because idle-scheduler processes in wakeup never get any cpu.
	 */
	if (p == NULL) {
#if 0
		need_resched();
#endif
	} else if (chk == p) {
		/* We may need to yield if our priority has been raised. */
		if (curpriority_cmp(chk) > 0)
			need_resched();
	} else if (curpriority_cmp(chk) < 0)
		need_resched();
}

int
roundrobin_interval(void)
{
	return (sched_quantum);
}

/*
 * Force switch among equal priority processes every 100ms.
 */
/* ARGSUSED */
static void
roundrobin(arg)
	void *arg;
{
#ifndef SMP
	struct proc *p = curproc; /* XXX */
#endif

#ifdef SMP
	need_resched();
	forward_roundrobin();
#else
	if (p == 0 || RTP_PRIO_NEED_RR(p->p_rtprio.type))
		need_resched();
#endif

	timeout(roundrobin, NULL, sched_quantum);
}

/*
 * Constants for digital decay and forget:
 *	90% of (p_estcpu) usage in 5 * loadav time
 *	95% of (p_pctcpu) usage in 60 seconds (load insensitive)
 *          Note that, as ps(1) mentions, this can let percentages
 *          total over 100% (I've seen 137.9% for 3 processes).
 *
 * Note that schedclock() updates p_estcpu and p_cpticks asynchronously.
 *
 * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds.
 * That is, the system wants to compute a value of decay such
 * that the following for loop:
 * 	for (i = 0; i < (5 * loadavg); i++)
 * 		p_estcpu *= decay;
 * will compute
 * 	p_estcpu *= 0.1;
 * for all values of loadavg:
 *
 * Mathematically this loop can be expressed by saying:
 * 	decay ** (5 * loadavg) ~= .1
 *
 * The system computes decay as:
 * 	decay = (2 * loadavg) / (2 * loadavg + 1)
 *
 * We wish to prove that the system's computation of decay
 * will always fulfill the equation:
 * 	decay ** (5 * loadavg) ~= .1
 *
 * If we compute b as:
 * 	b = 2 * loadavg
 * then
 * 	decay = b / (b + 1)
 *
 * We now need to prove two things:
 *	1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
 *	2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
 *
 * Facts:
 *         For x close to zero, exp(x) =~ 1 + x, since
 *              exp(x) = 0! + x**1/1! + x**2/2! + ... .
 *              therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
 *         For x close to zero, ln(1+x) =~ x, since
 *              ln(1+x) = x - x**2/2 + x**3/3 - ...     -1 < x < 1
 *              therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
 *         ln(.1) =~ -2.30
 *
 * Proof of (1):
 *    Solve (factor)**(power) =~ .1 given power (5*loadav):
 *	solving for factor,
 *      ln(factor) =~ (-2.30/5*loadav), or
 *      factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
 *          exp(-1/b) =~ (b-1)/b =~ b/(b+1).                    QED
 *
 * Proof of (2):
 *    Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
 *	solving for power,
 *      power*ln(b/(b+1)) =~ -2.30, or
 *      power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.  QED
 *
 * Actual power values for the implemented algorithm are as follows:
 *      loadav: 1       2       3       4
 *      power:  5.68    10.32   14.94   19.55
 */

/* calculations for digital decay to forget 90% of usage in 5*loadav sec */
#define	loadfactor(loadav)	(2 * (loadav))
#define	decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))
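
/*
 * Worked example (illustrative only, assuming the usual FSHIFT of 11, so
 * FSCALE == 2048): with a load average of 2.00, averunnable.ldavg[0] is
 * 4096 and loadfactor() gives loadfac == 8192, so
 *
 *	decay_cpu(8192, p_estcpu) == (8192 * p_estcpu) / 10240
 *
 * which scales p_estcpu by 2*loadav / (2*loadav + 1) == 0.8 per schedcpu()
 * pass.  Applied once a second for 5 * loadav == 10 seconds, a p_estcpu of
 * 100 decays to roughly 10, which is the 90% forgetting promised above.
 */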

/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
static fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;	/* exp(-1/20) */
SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");

/* kernel uses `FSCALE', userland (SHOULD) use kern.fscale */
static int	fscale __unused = FSCALE;
SYSCTL_INT(_kern, OID_AUTO, fscale, CTLFLAG_RD, 0, FSCALE, "");

/*
 * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
 * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
 * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
 *
 * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
 *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
 *
 * If you don't want to bother with the faster/more-accurate formula, you
 * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
 * (more general) method of calculating the %age of CPU used by a process.
 */
#define	CCPU_SHIFT	11

/*
 * Recompute process priorities, every hz ticks.
 */
/* ARGSUSED */
static void
schedcpu(arg)
	void *arg;
{
	register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
	register struct proc *p;
	register int realstathz, s;

	realstathz = stathz ? stathz : hz;
	LIST_FOREACH(p, &allproc, p_list) {
		/*
		 * Increment time in/out of memory and sleep time
		 * (if sleeping).  We ignore overflow; with 16-bit int's
		 * (remember them?) overflow takes 45 days.
		 */
		p->p_swtime++;
		if (p->p_stat == SSLEEP || p->p_stat == SSTOP)
			p->p_slptime++;
		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
		/*
		 * If the process has slept the entire second,
		 * stop recalculating its priority until it wakes up.
		 */
		if (p->p_slptime > 1)
			continue;
		s = splhigh();	/* prevent state changes and protect run queue */
		/*
		 * p_pctcpu is only for ps.
		 */
#if	(FSHIFT >= CCPU_SHIFT)
		p->p_pctcpu += (realstathz == 100)?
			((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT):
			100 * (((fixpt_t) p->p_cpticks)
				<< (FSHIFT - CCPU_SHIFT)) / realstathz;
#else
		p->p_pctcpu += ((FSCALE - ccpu) *
			(p->p_cpticks * FSCALE / realstathz)) >> FSHIFT;
#endif
		p->p_cpticks = 0;
		p->p_estcpu = decay_cpu(loadfac, p->p_estcpu);
		resetpriority(p);
		if (p->p_priority >= PUSER) {
			if ((p != curproc) &&
#ifdef SMP
			    p->p_oncpu == 0xff && 	/* idle */
#endif
			    p->p_stat == SRUN &&
			    (p->p_flag & P_INMEM) &&
			    (p->p_priority / PPQ) != (p->p_usrpri / PPQ)) {
				remrunqueue(p);
				p->p_priority = p->p_usrpri;
				setrunqueue(p);
			} else
				p->p_priority = p->p_usrpri;
		}
		splx(s);
	}
	vmmeter();
	wakeup((caddr_t)&lbolt);
	timeout(schedcpu, (void *)0, hz);
}

/*
 * Recalculate the priority of a process after it has slept for a while.
 * For all load averages >= 1 and max p_estcpu of 255, sleeping for at
 * least six times the loadfactor will decay p_estcpu to zero.
 */
static void
updatepri(p)
	register struct proc *p;
{
	register unsigned int newcpu = p->p_estcpu;
	register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);

	if (p->p_slptime > 5 * loadfac)
		p->p_estcpu = 0;
	else {
		p->p_slptime--;	/* the first time was done in schedcpu */
		while (newcpu && --p->p_slptime)
			newcpu = decay_cpu(loadfac, newcpu);
		p->p_estcpu = newcpu;
	}
	resetpriority(p);
}

/*
 * We're only looking at 7 bits of the address; everything is
 * aligned to 4, lots of things are aligned to greater powers
 * of 2.  Shift right by 8, i.e. drop the bottom 256 worth.
 */
#define TABLESIZE	128
static TAILQ_HEAD(slpquehead, proc) slpque[TABLESIZE];
#define LOOKUP(x)	(((intptr_t)(x) >> 8) & (TABLESIZE - 1))
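
/*
 * Illustrative sketch (the address below is made up): how LOOKUP() buckets
 * a wait channel.
 *
 *	void *ident = (void *)0xc0e43280;
 *	int bucket = ((intptr_t)ident >> 8) & (TABLESIZE - 1);
 *	(0xc0e43280 >> 8) == 0xc0e432, 0xc0e432 & 0x7f == 0x32 == bucket 50
 *
 * Dropping the low 8 bits ignores the alignment of the objects being slept
 * on, and masking with TABLESIZE - 1 keeps the index inside the 128 slpque
 * buckets.
 */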

/*
 * During autoconfiguration or after a panic, a sleep will simply
 * lower the priority briefly to allow interrupts, then return.
 * The priority to be used (safepri) is machine-dependent, thus this
 * value is initialized and maintained in the machine-dependent layers.
 * This priority will typically be 0, or the lowest priority
 * that is safe for use on the interrupt stack; it can be made
 * higher to block network software interrupts after panics.
 */
int safepri;

void
sleepinit(void)
{
	int i;

	sched_quantum = hz/10;
	hogticks = 2 * sched_quantum;
	for (i = 0; i < TABLESIZE; i++)
		TAILQ_INIT(&slpque[i]);
}

/*
 * General sleep call.  Suspends the current process until a wakeup is
 * performed on the specified identifier.  The process will then be made
 * runnable with the specified priority.  Sleeps at most timo/hz seconds
 * (0 means no timeout).  If the priority argument includes the PCATCH
 * flag, signals are checked before and after sleeping; otherwise signals
 * are not checked.  Returns 0 if awakened and EWOULDBLOCK if the timeout
 * expires.  If PCATCH is set and a signal needs to be delivered, ERESTART
 * is returned if the current system call should be restarted if possible,
 * and EINTR is returned if the system call should be interrupted by the
 * signal.
 */
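
/*
 * Illustrative sketch (hypothetical driver code, not part of this file):
 * a typical pairing of tsleep() and wakeup() on a shared address.  The
 * softc structure and field names are invented.
 *
 *	while (sc->sc_count == 0) {
 *		error = tsleep(&sc->sc_count, PRIBIO | PCATCH, "foowt", hz);
 *		if (error == EWOULDBLOCK)
 *			continue;	(timed out after hz ticks, poll again)
 *		if (error)
 *			return (error);	(EINTR or ERESTART from a signal)
 *	}
 *
 * and, on the producing side, once sc_count has been bumped:
 *
 *	wakeup(&sc->sc_count);
 */
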
int
tsleep(ident, priority, wmesg, timo)
	void *ident;
	int priority, timo;
	const char *wmesg;
{
	struct proc *p = curproc;
	int s, sig, catch = priority & PCATCH;
	struct callout_handle thandle;

#ifdef KTRACE
	if (p && KTRPOINT(p, KTR_CSW))
		ktrcsw(p->p_tracep, 1, 0);
#endif
	s = splhigh();
	if (cold || panicstr) {
		/*
		 * After a panic, or during autoconfiguration,
		 * just give interrupts a chance, then just return;
		 * don't run any other procs or panic below,
		 * in case this is the idle process and already asleep.
		 */
		splx(safepri);
		splx(s);
		return (0);
	}
	KASSERT(p != NULL, ("tsleep1"));
	KASSERT(ident != NULL && p->p_stat == SRUN, ("tsleep"));
	/*
	 * Process may be sitting on a slpque if asleep() was called, remove
	 * it before re-adding.
	 */
	if (p->p_wchan != NULL)
		unsleep(p);

	p->p_wchan = ident;
	p->p_wmesg = wmesg;
	p->p_slptime = 0;
	p->p_priority = priority & PRIMASK;
	TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], p, p_procq);
	if (timo)
		thandle = timeout(endtsleep, (void *)p, timo);
	/*
	 * We put ourselves on the sleep queue and start our timeout
	 * before calling CURSIG, as we could stop there, and a wakeup
	 * or a SIGCONT (or both) could occur while we were stopped.
	 * A SIGCONT would cause us to be marked as SSLEEP
	 * without resuming us, thus we must be ready for sleep
	 * when CURSIG is called.  If the wakeup happens while we're
	 * stopped, p->p_wchan will be 0 upon return from CURSIG.
	 */
	if (catch) {
		p->p_flag |= P_SINTR;
		if ((sig = CURSIG(p))) {
			if (p->p_wchan)
				unsleep(p);
			p->p_stat = SRUN;
			goto resume;
		}
		if (p->p_wchan == 0) {
			catch = 0;
			goto resume;
		}
	} else
		sig = 0;
	p->p_stat = SSLEEP;
	p->p_stats->p_ru.ru_nvcsw++;
	mi_switch();
resume:
	curpriority = p->p_usrpri;
	splx(s);
	p->p_flag &= ~P_SINTR;
	if (p->p_flag & P_TIMEOUT) {
		p->p_flag &= ~P_TIMEOUT;
		if (sig == 0) {
#ifdef KTRACE
			if (KTRPOINT(p, KTR_CSW))
				ktrcsw(p->p_tracep, 0, 0);
#endif
			return (EWOULDBLOCK);
		}
	} else if (timo)
		untimeout(endtsleep, (void *)p, thandle);
	if (catch && (sig != 0 || (sig = CURSIG(p)))) {
#ifdef KTRACE
		if (KTRPOINT(p, KTR_CSW))
			ktrcsw(p->p_tracep, 0, 0);
#endif
		if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig))
			return (EINTR);
		return (ERESTART);
	}
#ifdef KTRACE
	if (KTRPOINT(p, KTR_CSW))
		ktrcsw(p->p_tracep, 0, 0);
#endif
	return (0);
}

/*
 * asleep() - async sleep call.  Place the process on the wait queue and
 * return immediately without blocking.  The process stays runnable until
 * await() is called.  If ident is NULL, remove the process from the wait
 * queue if it is still on one.
 *
 * Only the most recent sleep condition is effective when making successive
 * calls to asleep() or when calling tsleep().
 *
 * The timeout, if any, is not initiated until await() is called.  The sleep
 * priority, signal, and timeout are specified in the asleep() call but may
 * be overridden in the await() call.
 *
 * <<<<<<<< EXPERIMENTAL, UNTESTED >>>>>>>>>>
 */

int
asleep(void *ident, int priority, const char *wmesg, int timo)
{
	struct proc *p = curproc;
	int s;

	/*
	 * splhigh() while manipulating sleep structures and slpque.
	 *
	 * Remove preexisting wait condition (if any) and place process
	 * on appropriate slpque, but do not put process to sleep.
	 */

	s = splhigh();

	if (p->p_wchan != NULL)
		unsleep(p);

	if (ident) {
		p->p_wchan = ident;
		p->p_wmesg = wmesg;
		p->p_slptime = 0;
		p->p_asleep.as_priority = priority;
		p->p_asleep.as_timo = timo;
		TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], p, p_procq);
	}

	splx(s);

	return(0);
}

/*
 * await() - wait for the async condition to occur.  The process blocks until
 * wakeup() is called on the most recent asleep() address.  If wakeup is
 * called prior to await(), await() winds up being a NOP.
 *
 * If await() is called more than once (without an intervening asleep() call),
 * await() is still effectively a NOP but it calls mi_switch() to give other
 * processes some cpu before returning.  The process is left runnable.
 *
 * <<<<<<<< EXPERIMENTAL, UNTESTED >>>>>>>>>>
 */
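
/*
 * Illustrative sketch (hypothetical, mirrors the tsleep() example above):
 * register interest in a condition with asleep() before releasing a
 * resource, then block in await() only if the condition has not fired in
 * the meantime.  The names are invented.
 *
 *	asleep(&sc->sc_done, PRIBIO, "foodn", hz);
 *	release_shared_resource(sc);	(may reach wakeup(&sc->sc_done))
 *	error = await(-1, -1);		(use priority/timo from the asleep())
 *
 * If the wakeup runs between the asleep() and the await(), the await()
 * returns immediately instead of blocking.
 */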

int
await(int priority, int timo)
{
	struct proc *p = curproc;
	int s;

	s = splhigh();

	if (p->p_wchan != NULL) {
		struct callout_handle thandle;
		int sig;
		int catch;

		/*
		 * The call to await() can override defaults specified in
		 * the original asleep().
		 */
		if (priority < 0)
			priority = p->p_asleep.as_priority;
		if (timo < 0)
			timo = p->p_asleep.as_timo;

		/*
		 * Install timeout
		 */

		if (timo)
			thandle = timeout(endtsleep, (void *)p, timo);

		sig = 0;
		catch = priority & PCATCH;

		if (catch) {
			p->p_flag |= P_SINTR;
			if ((sig = CURSIG(p))) {
				if (p->p_wchan)
					unsleep(p);
				p->p_stat = SRUN;
				goto resume;
			}
			if (p->p_wchan == NULL) {
				catch = 0;
				goto resume;
			}
		}
		p->p_stat = SSLEEP;
		p->p_stats->p_ru.ru_nvcsw++;
		mi_switch();
resume:
		curpriority = p->p_usrpri;

		splx(s);
		p->p_flag &= ~P_SINTR;
		if (p->p_flag & P_TIMEOUT) {
			p->p_flag &= ~P_TIMEOUT;
			if (sig == 0) {
#ifdef KTRACE
				if (KTRPOINT(p, KTR_CSW))
					ktrcsw(p->p_tracep, 0, 0);
#endif
				return (EWOULDBLOCK);
			}
		} else if (timo)
			untimeout(endtsleep, (void *)p, thandle);
		if (catch && (sig != 0 || (sig = CURSIG(p)))) {
#ifdef KTRACE
			if (KTRPOINT(p, KTR_CSW))
				ktrcsw(p->p_tracep, 0, 0);
#endif
			if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig))
				return (EINTR);
			return (ERESTART);
		}
#ifdef KTRACE
		if (KTRPOINT(p, KTR_CSW))
			ktrcsw(p->p_tracep, 0, 0);
#endif
	} else {
		/*
		 * If as_priority is 0, await() has been called without an
		 * intervening asleep().  We are still effectively a NOP,
		 * but we call mi_switch() for safety.
		 */

		if (p->p_asleep.as_priority == 0) {
			p->p_stats->p_ru.ru_nvcsw++;
			mi_switch();
		}
		splx(s);
	}

	/*
	 * clear p_asleep.as_priority as an indication that await() has been
	 * called.  If await() is called again without an intervening asleep(),
	 * await() is still effectively a NOP but the above mi_switch() code
	 * is triggered as a safety.
	 */
	p->p_asleep.as_priority = 0;

	return (0);
}

/*
 * Implement timeout for tsleep or asleep()/await()
 *
 * If process hasn't been awakened (wchan non-zero),
 * set timeout flag and undo the sleep.  If proc
 * is stopped, just unsleep so it will remain stopped.
 */
static void
endtsleep(arg)
	void *arg;
{
	register struct proc *p;
	int s;

	p = (struct proc *)arg;
	s = splhigh();
	if (p->p_wchan) {
		if (p->p_stat == SSLEEP)
			setrunnable(p);
		else
			unsleep(p);
		p->p_flag |= P_TIMEOUT;
	}
	splx(s);
}

/*
 * Remove a process from its wait queue
 */
void
unsleep(p)
	register struct proc *p;
{
	int s;

	s = splhigh();
	if (p->p_wchan) {
		TAILQ_REMOVE(&slpque[LOOKUP(p->p_wchan)], p, p_procq);
		p->p_wchan = 0;
	}
	splx(s);
}

/*
 * Make all processes sleeping on the specified identifier runnable.
 */
void
wakeup(ident)
	register void *ident;
{
	register struct slpquehead *qp;
	register struct proc *p;
	int s;

	s = splhigh();
	qp = &slpque[LOOKUP(ident)];
restart:
	TAILQ_FOREACH(p, qp, p_procq) {
		if (p->p_wchan == ident) {
			TAILQ_REMOVE(qp, p, p_procq);
			p->p_wchan = 0;
			if (p->p_stat == SSLEEP) {
				/* OPTIMIZED EXPANSION OF setrunnable(p); */
				if (p->p_slptime > 1)
					updatepri(p);
				p->p_slptime = 0;
				p->p_stat = SRUN;
				if (p->p_flag & P_INMEM) {
					setrunqueue(p);
					maybe_resched(p);
				} else {
					p->p_flag |= P_SWAPINREQ;
					wakeup((caddr_t)&proc0);
				}
				/* END INLINE EXPANSION */
				goto restart;
			}
		}
	}
	splx(s);
}

/*
 * Make a process sleeping on the specified identifier runnable.
 * May wake more than one process if a target process is currently
 * swapped out.
 */
void
wakeup_one(ident)
	register void *ident;
{
	register struct slpquehead *qp;
	register struct proc *p;
	int s;

	s = splhigh();
	qp = &slpque[LOOKUP(ident)];

	TAILQ_FOREACH(p, qp, p_procq) {
		if (p->p_wchan == ident) {
			TAILQ_REMOVE(qp, p, p_procq);
			p->p_wchan = 0;
			if (p->p_stat == SSLEEP) {
				/* OPTIMIZED EXPANSION OF setrunnable(p); */
				if (p->p_slptime > 1)
					updatepri(p);
				p->p_slptime = 0;
				p->p_stat = SRUN;
				if (p->p_flag & P_INMEM) {
					setrunqueue(p);
					maybe_resched(p);
					break;
				} else {
					p->p_flag |= P_SWAPINREQ;
					wakeup((caddr_t)&proc0);
				}
				/* END INLINE EXPANSION */
			}
		}
	}
	splx(s);
}

/*
 * The machine independent parts of mi_switch().
 * Must be called at splstatclock() or higher.
 */
void
mi_switch()
{
	struct timeval new_switchtime;
	register struct proc *p = curproc;	/* XXX */
	register struct rlimit *rlim;
	int x;

	/*
	 * XXX this spl is almost unnecessary.  It is partly to allow for
	 * sloppy callers that don't do it (issignal() via CURSIG() is the
	 * main offender).  It is partly to work around a bug in the i386
	 * cpu_switch() (the ipl is not preserved).  We ran for years
	 * without it.  I think there was only an interrupt latency problem.
	 * The main caller, tsleep(), does an splx() a couple of instructions
	 * after calling here.  The buggy caller, issignal(), usually calls
	 * here at spl0() and sometimes returns at splhigh().  The process
	 * then runs for a little too long at splhigh().  The ipl gets fixed
	 * when the process returns to user mode (or earlier).
	 *
	 * It would probably be better to always call here at spl0().  Callers
	 * are prepared to give up control to another process, so they must
	 * be prepared to be interrupted.  The clock stuff here may not
	 * actually need splstatclock().
	 */
	x = splstatclock();

#ifdef SIMPLELOCK_DEBUG
	if (p->p_simple_locks)
		printf("sleep: holding simple lock\n");
#endif
	/*
	 * Compute the amount of time during which the current
	 * process was running, and add that to its total so far.
	 */
	microuptime(&new_switchtime);
	if (timevalcmp(&new_switchtime, &switchtime, <)) {
		printf("microuptime() went backwards (%ld.%06ld -> %ld.%06ld)\n",
		    switchtime.tv_sec, switchtime.tv_usec,
		    new_switchtime.tv_sec, new_switchtime.tv_usec);
		new_switchtime = switchtime;
	} else {
		p->p_runtime += (new_switchtime.tv_usec - switchtime.tv_usec) +
		    (new_switchtime.tv_sec - switchtime.tv_sec) * (int64_t)1000000;
	}

	/*
	 * Check if the process exceeds its cpu resource allocation.
	 * If over max, kill it.
	 */
	if (p->p_stat != SZOMB && p->p_limit->p_cpulimit != RLIM_INFINITY &&
	    p->p_runtime > p->p_limit->p_cpulimit) {
		rlim = &p->p_rlimit[RLIMIT_CPU];
		if (p->p_runtime / (rlim_t)1000000 >= rlim->rlim_max) {
			killproc(p, "exceeded maximum CPU limit");
		} else {
			psignal(p, SIGXCPU);
			if (rlim->rlim_cur < rlim->rlim_max) {
				/* XXX: we should make a private copy */
				rlim->rlim_cur += 5;
			}
		}
	}

	/*
	 * Pick a new current process and record its start time.
	 */
	cnt.v_swtch++;
	switchtime = new_switchtime;
	cpu_switch(p);
	if (switchtime.tv_sec == 0)
		microuptime(&switchtime);
	switchticks = ticks;

	splx(x);
}

/*
 * Change process state to be runnable,
 * placing it on the run queue if it is in memory,
 * and awakening the swapper if it isn't in memory.
 */
void
setrunnable(p)
	register struct proc *p;
{
	register int s;

	s = splhigh();
	switch (p->p_stat) {
	case 0:
	case SRUN:
	case SZOMB:
	default:
		panic("setrunnable");
	case SSTOP:
	case SSLEEP:
		unsleep(p);		/* e.g. when sending signals */
		break;

	case SIDL:
		break;
	}
	p->p_stat = SRUN;
	if (p->p_flag & P_INMEM)
		setrunqueue(p);
	splx(s);
	if (p->p_slptime > 1)
		updatepri(p);
	p->p_slptime = 0;
	if ((p->p_flag & P_INMEM) == 0) {
		p->p_flag |= P_SWAPINREQ;
		wakeup((caddr_t)&proc0);
	}
	else
		maybe_resched(p);
}

/*
 * Compute the priority of a process when running in user mode.
 * Arrange to reschedule if the resulting priority is better
 * than that of the current process.
 */
void
resetpriority(p)
	register struct proc *p;
{
	register unsigned int newpriority;

	if (p->p_rtprio.type == RTP_PRIO_NORMAL) {
		newpriority = PUSER + p->p_estcpu / INVERSE_ESTCPU_WEIGHT +
		    NICE_WEIGHT * (p->p_nice - PRIO_MIN);
		newpriority = min(newpriority, MAXPRI);
		p->p_usrpri = newpriority;
	}
	maybe_resched(p);
}
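
/*
 * Worked example (illustrative, assuming the conventional PUSER == 50,
 * INVERSE_ESTCPU_WEIGHT == 8, NICE_WEIGHT == 2 and PRIO_MIN == -20): a
 * process with p_estcpu == 80 and p_nice == 0 gets
 *
 *	newpriority = 50 + 80 / 8 + 2 * (0 - (-20)) = 50 + 10 + 40 = 100
 *
 * while the same process reniced to -20 gets 50 + 10 + 0 == 60.  Larger
 * numbers are weaker priorities, so accumulated CPU time and a positive
 * nice value both push a process toward the back of the user run queues.
 */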

/* ARGSUSED */
static void
sched_setup(dummy)
	void *dummy;
{
	/* Kick off timeout driven events by calling first time. */
	roundrobin(NULL);
	schedcpu(NULL);
}

/*
 * We adjust the priority of the current process.  The priority of
 * a process gets worse as it accumulates CPU time.  The cpu usage
 * estimator (p_estcpu) is increased here.  resetpriority() will
 * compute a different priority each time p_estcpu increases by
 * INVERSE_ESTCPU_WEIGHT (until MAXPRI is reached).  The cpu usage
 * estimator ramps up quite quickly when the process is running
 * (linearly), and decays away exponentially, at a rate which is
 * proportionally slower when the system is busy.  The basic principle
 * is that the system will 90% forget that the process used a lot of
 * CPU time in 5 * loadav seconds.  This causes the system to favor
 * processes which haven't run much recently, and to round-robin
 * among other processes.
 */
void
schedclock(p)
	struct proc *p;
{

	p->p_cpticks++;
	p->p_estcpu = ESTCPULIM(p->p_estcpu + 1);
	if ((p->p_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) {
		resetpriority(p);
		if (p->p_priority >= PUSER)
			p->p_priority = p->p_usrpri;
	}
}
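
/*
 * Worked example (illustrative, assuming stathz == 128 and
 * INVERSE_ESTCPU_WEIGHT == 8): a process that runs flat out for one second
 * accumulates roughly 128 statclock ticks, so p_estcpu grows by about 128
 * and resetpriority() is re-run 128 / 8 == 16 times, each call pushing the
 * user priority one step further back.  The decay applied by schedcpu()
 * (see decay_cpu() above) is what eventually wins that ground back once
 * the process stops hogging the CPU.
 */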
963