kern_synch.c revision 66716
1139749Simp/*-
2113584Ssimokawa * Copyright (c) 1982, 1986, 1990, 1991, 1993
3103285Sikob *	The Regents of the University of California.  All rights reserved.
4103285Sikob * (c) UNIX System Laboratories, Inc.
5103285Sikob * All or some portions of this file are derived from material licensed
6103285Sikob * to the University of California by American Telephone and Telegraph
7103285Sikob * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8103285Sikob * the permission of UNIX System Laboratories, Inc.
9103285Sikob *
10103285Sikob * Redistribution and use in source and binary forms, with or without
11103285Sikob * modification, are permitted provided that the following conditions
12103285Sikob * are met:
13103285Sikob * 1. Redistributions of source code must retain the above copyright
14103285Sikob *    notice, this list of conditions and the following disclaimer.
15103285Sikob * 2. Redistributions in binary form must reproduce the above copyright
16103285Sikob *    notice, this list of conditions and the following disclaimer in the
17103285Sikob *    documentation and/or other materials provided with the distribution.
18103285Sikob * 3. All advertising materials mentioning features or use of this software
19103285Sikob *    must display the following acknowledgement:
20103285Sikob *	This product includes software developed by the University of
21103285Sikob *	California, Berkeley and its contributors.
22103285Sikob * 4. Neither the name of the University nor the names of its contributors
23103285Sikob *    may be used to endorse or promote products derived from this software
24103285Sikob *    without specific prior written permission.
25103285Sikob *
26103285Sikob * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27103285Sikob * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28103285Sikob * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29103285Sikob * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30103285Sikob * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31103285Sikob * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32103285Sikob * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33103285Sikob * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34103285Sikob * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35227843Smarius * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36227843Smarius * SUCH DAMAGE.
37227843Smarius *
38103285Sikob *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
39103285Sikob * $FreeBSD: head/sys/kern/kern_synch.c 66716 2000-10-06 02:20:21Z jhb $
40103285Sikob */
41103285Sikob
42193066Sjamie#include "opt_ktrace.h"
43103285Sikob
44129879Sphk#include <sys/param.h>
45103285Sikob#include <sys/systm.h>
46103285Sikob#include <sys/proc.h>
47103285Sikob#include <sys/kernel.h>
48169806Ssimokawa#include <sys/ktr.h>
49103285Sikob#include <sys/signalvar.h>
50170374Ssimokawa#include <sys/resourcevar.h>
51170374Ssimokawa#include <sys/vmmeter.h>
52127468Ssimokawa#include <sys/sysctl.h>
53117067Ssimokawa#include <vm/vm.h>
54117067Ssimokawa#include <vm/vm_extern.h>
55103285Sikob#ifdef KTRACE
56103285Sikob#include <sys/uio.h>
57113584Ssimokawa#include <sys/ktrace.h>
58103285Sikob#endif
59127468Ssimokawa
60127468Ssimokawa#include <machine/cpu.h>
61127468Ssimokawa#include <machine/ipl.h>
62127468Ssimokawa#include <machine/smp.h>
63127468Ssimokawa#include <machine/mutex.h>
64127468Ssimokawa
65127468Ssimokawastatic void sched_setup __P((void *dummy));
66103285SikobSYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL)
67103285Sikob
68110072Ssimokawau_char	curpriority;
69103285Sikobint	hogticks;
70103285Sikobint	lbolt;
71127468Ssimokawaint	sched_quantum;		/* Roundrobin scheduling quantum in ticks. */
72103285Sikob
73116376Ssimokawastatic int	curpriority_cmp __P((struct proc *p));
74116376Ssimokawastatic void	endtsleep __P((void *));
75116376Ssimokawastatic void	maybe_resched __P((struct proc *chk));
76116376Ssimokawastatic void	roundrobin __P((void *arg));
77116376Ssimokawastatic void	schedcpu __P((void *arg));
78116376Ssimokawastatic void	updatepri __P((struct proc *p));
79116376Ssimokawa
80188704Ssbrunostatic int
81103285Sikobsysctl_kern_quantum(SYSCTL_HANDLER_ARGS)
82108281Ssimokawa{
83109736Ssimokawa	int error, new_val;
84109736Ssimokawa
85109736Ssimokawa	new_val = sched_quantum * tick;
86120850Ssimokawa	error = sysctl_handle_int(oidp, &new_val, 0, req);
87120850Ssimokawa        if (error != 0 || req->newptr == NULL)
88103285Sikob		return (error);
89110195Ssimokawa	if (new_val < tick)
90110269Ssimokawa		return (EINVAL);
91110195Ssimokawa	sched_quantum = new_val / tick;
92103285Sikob	hogticks = 2 * sched_quantum;
93103285Sikob	return (0);
94103285Sikob}
95103285Sikob
96125238SsimokawaSYSCTL_PROC(_kern, OID_AUTO, quantum, CTLTYPE_INT|CTLFLAG_RW,
97125238Ssimokawa	0, sizeof sched_quantum, sysctl_kern_quantum, "I", "");
98124169Ssimokawa
99124169Ssimokawa/*-
100124169Ssimokawa * Compare priorities.  Return:
101170374Ssimokawa *     <0: priority of p < current priority
102103285Sikob *      0: priority of p == current priority
103124169Ssimokawa *     >0: priority of p > current priority
104103285Sikob * The priorities are the normal priorities or the normal realtime priorities
105212413Savg * if p is on the same scheduler as curproc.  Otherwise the process on the
106124169Ssimokawa * more realtimeish scheduler has lowest priority.  As usual, a higher
107124169Ssimokawa * priority really means a lower priority.
108124169Ssimokawa */
109124169Ssimokawastatic int
110124169Ssimokawacurpriority_cmp(p)
111124169Ssimokawa	struct proc *p;
112169806Ssimokawa{
113106543Ssimokawa	int c_class, p_class;
114124169Ssimokawa
115106543Ssimokawa	c_class = RTP_PRIO_BASE(curproc->p_rtprio.type);
116124169Ssimokawa	p_class = RTP_PRIO_BASE(p->p_rtprio.type);
117170374Ssimokawa	if (p_class != c_class)
118103285Sikob		return (p_class - c_class);
119103285Sikob	if (p_class == RTP_PRIO_NORMAL)
120103285Sikob		return (((int)p->p_priority - (int)curpriority) / PPQ);
121125238Ssimokawa	return ((int)p->p_rtprio.prio - (int)curproc->p_rtprio.prio);
122125238Ssimokawa}
123103285Sikob
124103285Sikob/*
125108642Ssimokawa * Arrange to reschedule if necessary, taking the priorities and
126116978Ssimokawa * schedulers into account.
127103285Sikob */
128103285Sikobstatic void
129103285Sikobmaybe_resched(chk)
130103285Sikob	struct proc *chk;
131103285Sikob{
132227843Smarius	struct proc *p = curproc; /* XXX */
133103285Sikob
134124251Ssimokawa	/*
135124251Ssimokawa	 * XXX idle scheduler still broken because proccess stays on idle
136124251Ssimokawa	 * scheduler during waits (such as when getting FS locks).  If a
137124251Ssimokawa	 * standard process becomes runaway cpu-bound, the system can lockup
138103285Sikob	 * due to idle-scheduler processes in wakeup never getting any cpu.
139124251Ssimokawa	 */
140124251Ssimokawa	if (p == idleproc) {
141124251Ssimokawa#if 0
142124251Ssimokawa		need_resched();
143124251Ssimokawa#endif
144124251Ssimokawa	} else if (chk == p) {
145124251Ssimokawa		/* We may need to yield if our priority has been raised. */
146114909Ssimokawa		if (curpriority_cmp(chk) > 0)
147114909Ssimokawa			need_resched();
148114909Ssimokawa	} else if (curpriority_cmp(chk) < 0)
149114909Ssimokawa		need_resched();
150106813Ssimokawa}
151103285Sikob
/*
 * Return the roundrobin scheduling quantum, in ticks.
 */
int
roundrobin_interval(void)
{
	return (sched_quantum);
}
157103285Sikob
158110072Ssimokawa/*
159103285Sikob * Force switch among equal priority processes every 100ms.
160106810Ssimokawa */
161110072Ssimokawa/* ARGSUSED */
162103285Sikobstatic void
163103285Sikobroundrobin(arg)
164110072Ssimokawa	void *arg;
165110072Ssimokawa{
166110072Ssimokawa#ifndef SMP
167110193Ssimokawa 	struct proc *p = curproc; /* XXX */
168120660Ssimokawa#endif
169103285Sikob
170110072Ssimokawa#ifdef SMP
171110072Ssimokawa	need_resched();
172106810Ssimokawa	forward_roundrobin();
173103285Sikob#else
174106813Ssimokawa 	if (p == idleproc || RTP_PRIO_NEED_RR(p->p_rtprio.type))
175103285Sikob 		need_resched();
176110072Ssimokawa#endif
177110072Ssimokawa
178110072Ssimokawa 	timeout(roundrobin, NULL, sched_quantum);
179110582Ssimokawa}
180110072Ssimokawa
181110072Ssimokawa/*
182110072Ssimokawa * Constants for digital decay and forget:
183110072Ssimokawa *	90% of (p_estcpu) usage in 5 * loadav time
184110072Ssimokawa *	95% of (p_pctcpu) usage in 60 seconds (load insensitive)
185170374Ssimokawa *          Note that, as ps(1) mentions, this can let percentages
186110193Ssimokawa *          total over 100% (I've seen 137.9% for 3 processes).
187110582Ssimokawa *
188110072Ssimokawa * Note that schedclock() updates p_estcpu and p_cpticks asynchronously.
189170374Ssimokawa *
190110072Ssimokawa * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds.
191110072Ssimokawa * That is, the system wants to compute a value of decay such
192110072Ssimokawa * that the following for loop:
193110072Ssimokawa * 	for (i = 0; i < (5 * loadavg); i++)
194110072Ssimokawa * 		p_estcpu *= decay;
195110072Ssimokawa * will compute
196110072Ssimokawa * 	p_estcpu *= 0.1;
197110072Ssimokawa * for all values of loadavg:
198103285Sikob *
199103285Sikob * Mathematically this loop can be expressed by saying:
200103285Sikob * 	decay ** (5 * loadavg) ~= .1
201103285Sikob *
202103285Sikob * The system computes decay as:
203103285Sikob * 	decay = (2 * loadavg) / (2 * loadavg + 1)
204103285Sikob *
205170374Ssimokawa * We wish to prove that the system's computation of decay
206103285Sikob * will always fulfill the equation:
207103285Sikob * 	decay ** (5 * loadavg) ~= .1
208103285Sikob *
209103285Sikob * If we compute b as:
210103285Sikob * 	b = 2 * loadavg
211167632Ssimokawa * then
212167632Ssimokawa * 	decay = b / (b + 1)
213103285Sikob *
214103285Sikob * We now need to prove two things:
215120660Ssimokawa *	1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
216103285Sikob *	2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
217103285Sikob *
218103285Sikob * Facts:
219103285Sikob *         For x close to zero, exp(x) =~ 1 + x, since
220124251Ssimokawa *              exp(x) = 0! + x**1/1! + x**2/2! + ... .
221103285Sikob *              therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
222103285Sikob *         For x close to zero, ln(1+x) =~ x, since
223170425Ssimokawa *              ln(1+x) = x - x**2/2 + x**3/3 - ...     -1 < x < 1
224170425Ssimokawa *              therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
225170425Ssimokawa *         ln(.1) =~ -2.30
226170425Ssimokawa *
227170425Ssimokawa * Proof of (1):
228170425Ssimokawa *    Solve (factor)**(power) =~ .1 given power (5*loadav):
229170425Ssimokawa *	solving for factor,
230170425Ssimokawa *      ln(factor) =~ (-2.30/5*loadav), or
231170425Ssimokawa *      factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
232170425Ssimokawa *          exp(-1/b) =~ (b-1)/b =~ b/(b+1).                    QED
233170425Ssimokawa *
234103285Sikob * Proof of (2):
235103285Sikob *    Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
236103285Sikob *	solving for power,
237103285Sikob *      power*ln(b/(b+1)) =~ -2.30, or
238103285Sikob *      power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.  QED
239120660Ssimokawa *
240120660Ssimokawa * Actual power values for the implemented algorithm are as follows:
241120660Ssimokawa *      loadav: 1       2       3       4
242120660Ssimokawa *      power:  5.68    10.32   14.94   19.55
243103285Sikob */
244120660Ssimokawa
245103285Sikob/* calculations for digital decay to forget 90% of usage in 5*loadav sec */
246120660Ssimokawa#define	loadfactor(loadav)	(2 * (loadav))
247120660Ssimokawa#define	decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))
248120660Ssimokawa
249120660Ssimokawa/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
250124251Ssimokawastatic fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;	/* exp(-1/20) */
251124251SsimokawaSYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
252103285Sikob
253103285Sikob/* kernel uses `FSCALE', userland (SHOULD) use kern.fscale */
254106790Ssimokawastatic int	fscale __unused = FSCALE;
255103285SikobSYSCTL_INT(_kern, OID_AUTO, fscale, CTLFLAG_RD, 0, FSCALE, "");
256103285Sikob
257103285Sikob/*
258103285Sikob * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
259103285Sikob * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
260108655Ssimokawa * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
261108655Ssimokawa *
262170374Ssimokawa * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
263103285Sikob *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
264103285Sikob *
265170374Ssimokawa * If you don't want to bother with the faster/more-accurate formula, you
266103285Sikob * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
267170374Ssimokawa * (more general) method of calculating the %age of CPU used by a process.
268130460Sdfr */
269103285Sikob#define	CCPU_SHIFT	11
270103285Sikob
271103285Sikob/*
272103285Sikob * Recompute process priorities, every hz ticks.
273103285Sikob */
274103285Sikob/* ARGSUSED */
static void
schedcpu(arg)
	void *arg;
{
	register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
	register struct proc *p;
	register int realstathz, s;

	/* stathz may be 0 before the clocks are fully set up; fall back to hz. */
	realstathz = stathz ? stathz : hz;
	LIST_FOREACH(p, &allproc, p_list) {
		/*
		 * Increment time in/out of memory and sleep time
		 * (if sleeping).  We ignore overflow; with 16-bit int's
		 * (remember them?) overflow takes 45 days.
		if (p->p_stat == SWAIT)
			continue;
		 */
		/* sched_lock protects p_stat, the cpu estimates and run queues. */
		mtx_enter(&sched_lock, MTX_SPIN);
		p->p_swtime++;
		if (p->p_stat == SSLEEP || p->p_stat == SSTOP)
			p->p_slptime++;
		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
		/*
		 * If the process has slept the entire second,
		 * stop recalculating its priority until it wakes up.
		 */
		if (p->p_slptime > 1) {
			mtx_exit(&sched_lock, MTX_SPIN);
			continue;
		}

		/*
		 * prevent state changes and protect run queue
		 */
		s = splhigh();

		/*
		 * p_pctcpu is only for ps.
		 */
#if	(FSHIFT >= CCPU_SHIFT)
		p->p_pctcpu += (realstathz == 100)?
			((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT):
                	100 * (((fixpt_t) p->p_cpticks)
				<< (FSHIFT - CCPU_SHIFT)) / realstathz;
#else
		p->p_pctcpu += ((FSCALE - ccpu) *
			(p->p_cpticks * FSCALE / realstathz)) >> FSHIFT;
#endif
		p->p_cpticks = 0;
		p->p_estcpu = decay_cpu(loadfac, p->p_estcpu);
		resetpriority(p);
		/*
		 * Only user-level priorities are adjusted here.  A runnable,
		 * in-memory process that is not currently running and whose
		 * queue (priority / PPQ) would change must be moved between
		 * run queues; otherwise just store the new priority.
		 */
		if (p->p_priority >= PUSER) {
			if ((p != curproc) &&
#ifdef SMP
			    p->p_oncpu == 0xff && 	/* idle */
#endif
			    p->p_stat == SRUN &&
			    (p->p_flag & P_INMEM) &&
			    (p->p_priority / PPQ) != (p->p_usrpri / PPQ)) {
				remrunqueue(p);
				p->p_priority = p->p_usrpri;
				setrunqueue(p);
			} else
				p->p_priority = p->p_usrpri;
		}
		mtx_exit(&sched_lock, MTX_SPIN);
		splx(s);
	}
	vmmeter();
	/* lbolt sleepers are woken once per schedcpu() pass (about 1/sec). */
	wakeup((caddr_t)&lbolt);
	timeout(schedcpu, (void *)0, hz);
}
347110577Ssimokawa
348170374Ssimokawa/*
349170374Ssimokawa * Recalculate the priority of a process after it has slept for a while.
350110577Ssimokawa * For all load averages >= 1 and max p_estcpu of 255, sleeping for at
351110577Ssimokawa * least six times the loadfactor will decay p_estcpu to zero.
352170374Ssimokawa */
353111040Ssimokawastatic void
354110577Ssimokawaupdatepri(p)
355120660Ssimokawa	register struct proc *p;
356120660Ssimokawa{
357110577Ssimokawa	register unsigned int newcpu = p->p_estcpu;
358110577Ssimokawa	register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
359110577Ssimokawa
360170374Ssimokawa	if (p->p_slptime > 5 * loadfac)
361110577Ssimokawa		p->p_estcpu = 0;
362111040Ssimokawa	else {
363171513Ssimokawa		p->p_slptime--;	/* the first time was done in schedcpu */
364110577Ssimokawa		while (newcpu && --p->p_slptime)
365169119Ssimokawa			newcpu = decay_cpu(loadfac, newcpu);
366170427Ssimokawa		p->p_estcpu = newcpu;
367170427Ssimokawa	}
368170427Ssimokawa	resetpriority(p);
369110577Ssimokawa}
370110577Ssimokawa
371110577Ssimokawa/*
372110577Ssimokawa * We're only looking at 7 bits of the address; everything is
373170374Ssimokawa * aligned to 4, lots of things are aligned to greater powers
374170374Ssimokawa * of 2.  Shift right by 8, i.e. drop the bottom 256 worth.
375170374Ssimokawa */
376110577Ssimokawa#define TABLESIZE	128
377249291Swillstatic TAILQ_HEAD(slpquehead, proc) slpque[TABLESIZE];
378170374Ssimokawa#define LOOKUP(x)	(((intptr_t)(x) >> 8) & (TABLESIZE - 1))
379170374Ssimokawa
380110577Ssimokawa#if 0
381110577Ssimokawa/*
382171513Ssimokawa * During autoconfiguration or after a panic, a sleep will simply
383111040Ssimokawa * lower the priority briefly to allow interrupts, then return.
384170374Ssimokawa * The priority to be used (safepri) is machine-dependent, thus this
385170374Ssimokawa * value is initialized and maintained in the machine-dependent layers.
386170374Ssimokawa * This priority will typically be 0, or the lowest priority
387170374Ssimokawa * that is safe for use on the interrupt stack; it can be made
388110577Ssimokawa * higher to block network software interrupts after panics.
389110577Ssimokawa */
390170374Ssimokawaint safepri;
391110577Ssimokawa#endif
392110577Ssimokawa
393110577Ssimokawavoid
394110577Ssimokawasleepinit(void)
395170374Ssimokawa{
396110577Ssimokawa	int i;
397110577Ssimokawa
398121463Ssimokawa	sched_quantum = hz/10;
399121463Ssimokawa	hogticks = 2 * sched_quantum;
400121463Ssimokawa	for (i = 0; i < TABLESIZE; i++)
401121463Ssimokawa		TAILQ_INIT(&slpque[i]);
402121463Ssimokawa}
403121463Ssimokawa
404170374Ssimokawa/*
405170374Ssimokawa * General sleep call.  Suspends the current process until a wakeup is
406170374Ssimokawa * performed on the specified identifier.  The process will then be made
407170374Ssimokawa * runnable with the specified priority.  Sleeps at most timo/hz seconds
408121463Ssimokawa * (0 means no timeout).  If pri includes PCATCH flag, signals are checked
409170374Ssimokawa * before and after sleeping, else signals are not checked.  Returns 0 if
410110577Ssimokawa * awakened, EWOULDBLOCK if the timeout expires.  If PCATCH is set and a
411110577Ssimokawa * signal needs to be delivered, ERESTART is returned if the current system
412110577Ssimokawa * call should be restarted if possible, and EINTR is returned if the system
413103285Sikob * call should be interrupted by the signal (return EINTR).
414103285Sikob *
415103285Sikob * The mutex argument is exited before the caller is suspended, and
416103285Sikob * entered before msleep returns.  If priority includes the PDROP
417118455Ssimokawa * flag the mutex is not entered before returning.
418103285Sikob */
int
msleep(ident, mtx, priority, wmesg, timo)
	void *ident;
	struct mtx *mtx;
	int priority, timo;
	const char *wmesg;
{
	struct proc *p = curproc;
	int s, sig, catch = priority & PCATCH;
	struct callout_handle thandle;
	int rval = 0;
	WITNESS_SAVE_DECL(mtx);

#ifdef KTRACE
	/* Record the voluntary context switch for ktrace before blocking. */
	if (p && KTRPOINT(p, KTR_CSW))
		ktrcsw(p->p_tracep, 1, 0);
#endif
	WITNESS_SLEEP(0, mtx);
	mtx_enter(&sched_lock, MTX_SPIN);

	/*
	 * Release the caller's interlock (it may not be recursed).  With
	 * PDROP, forget it entirely so it is not reacquired on the way out.
	 */
	if (mtx != NULL) {
		KASSERT(mtx->mtx_recurse == 0,
		    ("sleeping on recursed mutex %s", mtx->mtx_description));
		WITNESS_SAVE(mtx, mtx);
		mtx_exit(mtx, MTX_DEF | MTX_NOSWITCH);
		if (priority & PDROP)
			mtx = NULL;
	}

	s = splhigh();
	if (cold || panicstr) {
		/*
		 * After a panic, or during autoconfiguration,
		 * just give interrupts a chance, then just return;
		 * don't run any other procs or panic below,
		 * in case this is the idle process and already asleep.
		 */
		mtx_exit(&sched_lock, MTX_SPIN);
		splx(s);
		return (0);
	}

	KASSERT(p != NULL, ("tsleep1"));
	KASSERT(ident != NULL && p->p_stat == SRUN, ("tsleep"));
	/*
	 * Process may be sitting on a slpque if asleep() was called, remove
	 * it before re-adding.
	 */
	if (p->p_wchan != NULL)
		unsleep(p);

	p->p_wchan = ident;
	p->p_wmesg = wmesg;
	p->p_slptime = 0;
	p->p_priority = priority & PRIMASK;
	p->p_nativepri = p->p_priority;
	CTR4(KTR_PROC, "tsleep: proc %p (pid %d, %s), schedlock %p",
		p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock);
	TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], p, p_procq);
	if (timo)
		thandle = timeout(endtsleep, (void *)p, timo);
	/*
	 * We put ourselves on the sleep queue and start our timeout
	 * before calling CURSIG, as we could stop there, and a wakeup
	 * or a SIGCONT (or both) could occur while we were stopped.
	 * A SIGCONT would cause us to be marked as SSLEEP
	 * without resuming us, thus we must be ready for sleep
	 * when CURSIG is called.  If the wakeup happens while we're
	 * stopped, p->p_wchan will be 0 upon return from CURSIG.
	 */
	if (catch) {
		CTR4(KTR_PROC,
		        "tsleep caught: proc %p (pid %d, %s), schedlock %p",
			p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock);
		p->p_flag |= P_SINTR;
		if ((sig = CURSIG(p))) {
			if (p->p_wchan)
				unsleep(p);
			p->p_stat = SRUN;
			goto resume;
		}
		if (p->p_wchan == 0) {
			catch = 0;
			goto resume;
		}
	} else
		sig = 0;
	p->p_stat = SSLEEP;
	p->p_stats->p_ru.ru_nvcsw++;
	mi_switch();
	CTR4(KTR_PROC,
	        "tsleep resume: proc %p (pid %d, %s), schedlock %p",
		p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock);
resume:
	curpriority = p->p_usrpri;
	splx(s);
	p->p_flag &= ~P_SINTR;
	/* P_TIMEOUT is set by endtsleep() when the timeout fired first. */
	if (p->p_flag & P_TIMEOUT) {
		p->p_flag &= ~P_TIMEOUT;
		if (sig == 0) {
#ifdef KTRACE
			if (KTRPOINT(p, KTR_CSW))
				ktrcsw(p->p_tracep, 0, 0);
#endif
			rval = EWOULDBLOCK;
			goto out;
		}
	} else if (timo)
		untimeout(endtsleep, (void *)p, thandle);
	if (catch && (sig != 0 || (sig = CURSIG(p)))) {
#ifdef KTRACE
		if (KTRPOINT(p, KTR_CSW))
			ktrcsw(p->p_tracep, 0, 0);
#endif
		/* ps_sigintr decides interrupt (EINTR) vs. restart (ERESTART). */
		if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig))
			rval = EINTR;
		else
			rval = ERESTART;
		goto out;
	}
out:
	mtx_exit(&sched_lock, MTX_SPIN);
#ifdef KTRACE
	if (KTRPOINT(p, KTR_CSW))
		ktrcsw(p->p_tracep, 0, 0);
#endif
	/* Reacquire the caller's interlock unless PDROP cleared it above. */
	if (mtx != NULL) {
		mtx_enter(mtx, MTX_DEF);
		WITNESS_RESTORE(mtx, mtx);
	}
	return (rval);
}
551111078Ssimokawa
552178915Ssimokawa/*
553169806Ssimokawa * asleep() - async sleep call.  Place process on wait queue and return
554111078Ssimokawa * immediately without blocking.  The process stays runnable until await()
555111078Ssimokawa * is called.  If ident is NULL, remove process from wait queue if it is still
556111078Ssimokawa * on one.
557111078Ssimokawa *
558169806Ssimokawa * Only the most recent sleep condition is effective when making successive
559169806Ssimokawa * calls to asleep() or when calling tsleep().
560169806Ssimokawa *
561169806Ssimokawa * The timeout, if any, is not initiated until await() is called.  The sleep
 * priority, signal, and timeout are specified in the asleep() call but may be
 * overridden in the await() call.
564103285Sikob *
565103285Sikob * <<<<<<<< EXPERIMENTAL, UNTESTED >>>>>>>>>>
566103285Sikob */
567103285Sikob
568103285Sikobint
569103285Sikobasleep(void *ident, int priority, const char *wmesg, int timo)
570103285Sikob{
571103285Sikob	struct proc *p = curproc;
572103285Sikob	int s;
573106790Ssimokawa
574110577Ssimokawa	/*
575110577Ssimokawa	 * obtain sched_lock while manipulating sleep structures and slpque.
576110798Ssimokawa	 *
577110577Ssimokawa	 * Remove preexisting wait condition (if any) and place process
578110577Ssimokawa	 * on appropriate slpque, but do not put process to sleep.
579110577Ssimokawa	 */
580110577Ssimokawa
581110577Ssimokawa	s = splhigh();
582170374Ssimokawa	mtx_enter(&sched_lock, MTX_SPIN);
583111942Ssimokawa
584170374Ssimokawa	if (p->p_wchan != NULL)
585110577Ssimokawa		unsleep(p);
586170374Ssimokawa
587113584Ssimokawa	if (ident) {
588110577Ssimokawa		p->p_wchan = ident;
589110577Ssimokawa		p->p_wmesg = wmesg;
590110577Ssimokawa		p->p_slptime = 0;
591110798Ssimokawa		p->p_asleep.as_priority = priority;
592110798Ssimokawa		p->p_asleep.as_timo = timo;
593110798Ssimokawa		TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], p, p_procq);
594170374Ssimokawa	}
595170374Ssimokawa
596110798Ssimokawa	mtx_exit(&sched_lock, MTX_SPIN);
597110798Ssimokawa	splx(s);
598170374Ssimokawa
599170374Ssimokawa	return(0);
600170374Ssimokawa}
601110798Ssimokawa
602110798Ssimokawa/*
603110798Ssimokawa * await() - wait for async condition to occur.   The process blocks until
604110798Ssimokawa * wakeup() is called on the most recent asleep() address.  If wakeup is called
 * prior to await(), await() winds up being a NOP.
606170374Ssimokawa *
 * If await() is called more than once (without an intervening asleep() call),
608170374Ssimokawa * await() is still effectively a NOP but it calls mi_switch() to give other
609170374Ssimokawa * processes some cpu before returning.  The process is left runnable.
610170374Ssimokawa *
611170374Ssimokawa * <<<<<<<< EXPERIMENTAL, UNTESTED >>>>>>>>>>
612249291Swill */
613170374Ssimokawa
int
await(int priority, int timo)
{
	struct proc *p = curproc;
	int rval = 0;
	int s;

	/*
	 * NOTE(review): sched_lock is entered before splhigh() here, the
	 * reverse of the order used in asleep() -- confirm intended.
	 */
	mtx_enter(&sched_lock, MTX_SPIN);

	s = splhigh();

	/* A non-NULL p_wchan means asleep() armed a wait that has not fired. */
	if (p->p_wchan != NULL) {
		struct callout_handle thandle;
		int sig;
		int catch;

		/*
		 * The call to await() can override defaults specified in
		 * the original asleep().
		 */
		if (priority < 0)
			priority = p->p_asleep.as_priority;
		if (timo < 0)
			timo = p->p_asleep.as_timo;

		/*
		 * Install timeout
		 */

		if (timo)
			thandle = timeout(endtsleep, (void *)p, timo);

		sig = 0;
		catch = priority & PCATCH;

		/* Same signal-before-sleep dance as msleep(): CURSIG may stop us. */
		if (catch) {
			p->p_flag |= P_SINTR;
			if ((sig = CURSIG(p))) {
				if (p->p_wchan)
					unsleep(p);
				p->p_stat = SRUN;
				goto resume;
			}
			if (p->p_wchan == NULL) {
				catch = 0;
				goto resume;
			}
		}
		p->p_stat = SSLEEP;
		p->p_stats->p_ru.ru_nvcsw++;
		mi_switch();
resume:
		curpriority = p->p_usrpri;

		splx(s);
		p->p_flag &= ~P_SINTR;
		/* P_TIMEOUT set by endtsleep() means the timeout fired first. */
		if (p->p_flag & P_TIMEOUT) {
			p->p_flag &= ~P_TIMEOUT;
			if (sig == 0) {
#ifdef KTRACE
				if (KTRPOINT(p, KTR_CSW))
					ktrcsw(p->p_tracep, 0, 0);
#endif
				rval = EWOULDBLOCK;
				goto out;
			}
		} else if (timo)
			untimeout(endtsleep, (void *)p, thandle);
		if (catch && (sig != 0 || (sig = CURSIG(p)))) {
#ifdef KTRACE
			if (KTRPOINT(p, KTR_CSW))
				ktrcsw(p->p_tracep, 0, 0);
#endif
			if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig))
				rval = EINTR;
			else
				rval = ERESTART;
			goto out;
		}
#ifdef KTRACE
		if (KTRPOINT(p, KTR_CSW))
			ktrcsw(p->p_tracep, 0, 0);
#endif
	} else {
		/*
		 * If as_priority is 0, await() has been called without an
		 * intervening asleep().  We are still effectively a NOP,
		 * but we call mi_switch() for safety.
		 */

		if (p->p_asleep.as_priority == 0) {
			p->p_stats->p_ru.ru_nvcsw++;
			mi_switch();
		}
		splx(s);
	}

	/*
	 * clear p_asleep.as_priority as an indication that await() has been
	 * called.  If await() is called again without an intervening asleep(),
	 * await() is still effectively a NOP but the above mi_switch() code
	 * is triggered as a safety.
	 */
	/*
	 * NOTE(review): the error paths above jump straight to "out", so
	 * as_priority is only cleared on the success paths -- confirm this
	 * is the intended semantics for this experimental interface.
	 */
	p->p_asleep.as_priority = 0;

out:
	mtx_exit(&sched_lock, MTX_SPIN);

	return (rval);
}
724116376Ssimokawa
/*
 * Implement timeout for tsleep or asleep()/await()
 *
 * If process hasn't been awakened (wchan non-zero),
 * set timeout flag and undo the sleep.  If proc
 * is stopped, just unsleep so it will remain stopped.
 *
 * Called from the callout wheel with the target proc as the argument.
 * P_TIMEOUT tells the sleeper (see the P_TIMEOUT check after resume in
 * tsleep/await) that the wakeup was due to timer expiry, yielding
 * EWOULDBLOCK rather than a normal wakeup.
 */
static void
endtsleep(arg)
	void *arg;
{
	register struct proc *p;
	int s;

	p = (struct proc *)arg;
	CTR4(KTR_PROC,
	        "endtsleep: proc %p (pid %d, %s), schedlock %p",
		p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock);
	/* Block interrupts and take the scheduler lock before touching p. */
	s = splhigh();
	mtx_enter(&sched_lock, MTX_SPIN);
	if (p->p_wchan) {
		/* Still asleep on its channel: cancel the sleep. */
		if (p->p_stat == SSLEEP)
			setrunnable(p);
		else
			/* Stopped (or otherwise not SSLEEP): leave state
			 * alone, just remove from the sleep queue. */
			unsleep(p);
		p->p_flag |= P_TIMEOUT;
	}
	mtx_exit(&sched_lock, MTX_SPIN);
	splx(s);
}
755116376Ssimokawa
756113584Ssimokawa/*
757113584Ssimokawa * Remove a process from its wait queue
758113584Ssimokawa */
759113584Ssimokawavoid
760113584Ssimokawaunsleep(p)
761113584Ssimokawa	register struct proc *p;
762113584Ssimokawa{
763113584Ssimokawa	int s;
764113584Ssimokawa
765116376Ssimokawa	s = splhigh();
766117350Ssimokawa	mtx_enter(&sched_lock, MTX_SPIN);
767189928Ssbruno	if (p->p_wchan) {
768189928Ssbruno		TAILQ_REMOVE(&slpque[LOOKUP(p->p_wchan)], p, p_procq);
769189928Ssbruno		p->p_wchan = 0;
770189928Ssbruno	}
771189928Ssbruno	mtx_exit(&sched_lock, MTX_SPIN);
772189928Ssbruno	splx(s);
773189928Ssbruno}
774189928Ssbruno
775189928Ssbruno/*
776189928Ssbruno * Make all processes sleeping on the specified identifier runnable.
777189928Ssbruno */
778189928Ssbrunovoid
779189928Ssbrunowakeup(ident)
780189928Ssbruno	register void *ident;
781189928Ssbruno{
782189928Ssbruno	register struct slpquehead *qp;
783189928Ssbruno	register struct proc *p;
784189928Ssbruno	int s;
785189928Ssbruno
786189928Ssbruno	s = splhigh();
787189928Ssbruno	mtx_enter(&sched_lock, MTX_SPIN);
788189928Ssbruno	qp = &slpque[LOOKUP(ident)];
789189928Ssbrunorestart:
790189928Ssbruno	TAILQ_FOREACH(p, qp, p_procq) {
791189928Ssbruno		if (p->p_wchan == ident) {
792103285Sikob			TAILQ_REMOVE(qp, p, p_procq);
793106790Ssimokawa			p->p_wchan = 0;
794103285Sikob			if (p->p_stat == SSLEEP) {
795106790Ssimokawa				/* OPTIMIZED EXPANSION OF setrunnable(p); */
796103285Sikob				CTR4(KTR_PROC,
797103285Sikob				        "wakeup: proc %p (pid %d, %s), schedlock %p",
798106543Ssimokawa					p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock);
799103285Sikob				if (p->p_slptime > 1)
800103285Sikob					updatepri(p);
801106543Ssimokawa				p->p_slptime = 0;
802103285Sikob				p->p_stat = SRUN;
803103285Sikob				if (p->p_flag & P_INMEM) {
804103285Sikob					setrunqueue(p);
805103285Sikob					maybe_resched(p);
806103285Sikob				} else {
807103285Sikob					p->p_flag |= P_SWAPINREQ;
808103285Sikob					wakeup((caddr_t)&proc0);
809103285Sikob				}
810103285Sikob				/* END INLINE EXPANSION */
811103285Sikob				goto restart;
812103285Sikob			}
813113584Ssimokawa		}
814113584Ssimokawa	}
815113584Ssimokawa	mtx_exit(&sched_lock, MTX_SPIN);
816113584Ssimokawa	splx(s);
817103285Sikob}
818103285Sikob
819103285Sikob/*
820103285Sikob * Make a process sleeping on the specified identifier runnable.
821103285Sikob * May wake more than one process if a target process is currently
822103285Sikob * swapped out.
823103285Sikob */
824103285Sikobvoid
825103285Sikobwakeup_one(ident)
826103285Sikob	register void *ident;
827103285Sikob{
828103285Sikob	register struct slpquehead *qp;
829103285Sikob	register struct proc *p;
830103285Sikob	int s;
831103285Sikob
832103285Sikob	s = splhigh();
833103285Sikob	mtx_enter(&sched_lock, MTX_SPIN);
834103285Sikob	qp = &slpque[LOOKUP(ident)];
835103285Sikob
836103285Sikob	TAILQ_FOREACH(p, qp, p_procq) {
837103285Sikob		if (p->p_wchan == ident) {
838103285Sikob			TAILQ_REMOVE(qp, p, p_procq);
839103285Sikob			p->p_wchan = 0;
840103285Sikob			if (p->p_stat == SSLEEP) {
841103285Sikob				/* OPTIMIZED EXPANSION OF setrunnable(p); */
842103285Sikob				CTR4(KTR_PROC,
843103285Sikob				        "wakeup1: proc %p (pid %d, %s), schedlock %p",
844103285Sikob					p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock);
845103285Sikob				if (p->p_slptime > 1)
846103285Sikob					updatepri(p);
847187993Ssbruno				p->p_slptime = 0;
848103285Sikob				p->p_stat = SRUN;
849103285Sikob				if (p->p_flag & P_INMEM) {
850103285Sikob					setrunqueue(p);
851103285Sikob					maybe_resched(p);
852103285Sikob					break;
853110193Ssimokawa				} else {
854103285Sikob					p->p_flag |= P_SWAPINREQ;
855103285Sikob					wakeup((caddr_t)&proc0);
856103285Sikob				}
857103285Sikob				/* END INLINE EXPANSION */
858103285Sikob			}
859103285Sikob		}
860103285Sikob	}
861103285Sikob	mtx_exit(&sched_lock, MTX_SPIN);
862103285Sikob	splx(s);
863103285Sikob}
864103285Sikob
/*
 * The machine independent parts of mi_switch().
 * Must be called at splstatclock() or higher.
 *
 * Charges the outgoing process for its CPU time, enforces RLIMIT_CPU,
 * then hands off to the MD cpu_switch().  Giant is released (however
 * deeply recursed) across the switch and re-acquired afterwards.
 */
void
mi_switch()
{
	struct timeval new_switchtime;
	register struct proc *p = curproc;	/* XXX */
	register struct rlimit *rlim;
	int giantreleased;	/* recursion depth of Giant we dropped */
	int x;			/* saved spl to restore on return */
	WITNESS_SAVE_DECL(Giant);

	/*
	 * XXX this spl is almost unnecessary.  It is partly to allow for
	 * sloppy callers that don't do it (issignal() via CURSIG() is the
	 * main offender).  It is partly to work around a bug in the i386
	 * cpu_switch() (the ipl is not preserved).  We ran for years
	 * without it.  I think there was only a interrupt latency problem.
	 * The main caller, tsleep(), does an splx() a couple of instructions
	 * after calling here.  The buggy caller, issignal(), usually calls
	 * here at spl0() and sometimes returns at splhigh().  The process
	 * then runs for a little too long at splhigh().  The ipl gets fixed
	 * when the process returns to user mode (or earlier).
	 *
	 * It would probably be better to always call here at spl0(). Callers
	 * are prepared to give up control to another process, so they must
	 * be prepared to be interrupted.  The clock stuff here may not
	 * actually need splstatclock().
	 */
	x = splstatclock();

	CTR4(KTR_PROC, "mi_switch: old proc %p (pid %d, %s), schedlock %p",
		p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock);
	mtx_enter(&sched_lock, MTX_SPIN | MTX_RLIKELY);

	/*
	 * Fully release Giant (it may be recursively held); remember the
	 * depth so it can be re-acquired the same number of times below.
	 */
	WITNESS_SAVE(&Giant, Giant);
	for (giantreleased = 0; mtx_owned(&Giant); giantreleased++)
		mtx_exit(&Giant, MTX_DEF | MTX_NOSWITCH);

#ifdef SIMPLELOCK_DEBUG
	if (p->p_simple_locks)
		printf("sleep: holding simple lock\n");
#endif
	/*
	 * Compute the amount of time during which the current
	 * process was running, and add that to its total so far.
	 */
	microuptime(&new_switchtime);
	if (timevalcmp(&new_switchtime, &switchtime, <)) {
		/* Clock went backwards: complain and don't charge time. */
		printf("microuptime() went backwards (%ld.%06ld -> %ld.%06ld)\n",
		    switchtime.tv_sec, switchtime.tv_usec,
		    new_switchtime.tv_sec, new_switchtime.tv_usec);
		new_switchtime = switchtime;
	} else {
		/* p_runtime is accumulated in microseconds. */
		p->p_runtime += (new_switchtime.tv_usec - switchtime.tv_usec) +
		    (new_switchtime.tv_sec - switchtime.tv_sec) * (int64_t)1000000;
	}

	/*
	 * Check if the process exceeds its cpu resource allocation.
	 * If over max, kill it.
	 *
	 * XXX drop sched_lock, pickup Giant
	 */
	if (p->p_stat != SZOMB && p->p_limit->p_cpulimit != RLIM_INFINITY &&
	    p->p_runtime > p->p_limit->p_cpulimit) {
		rlim = &p->p_rlimit[RLIMIT_CPU];
		if (p->p_runtime / (rlim_t)1000000 >= rlim->rlim_max) {
			/* Hard limit exceeded: kill the process. */
			killproc(p, "exceeded maximum CPU limit");
		} else {
			/* Soft limit: deliver SIGXCPU and back the soft
			 * limit off so the signal isn't re-sent instantly. */
			psignal(p, SIGXCPU);
			if (rlim->rlim_cur < rlim->rlim_max) {
				/* XXX: we should make a private copy */
				rlim->rlim_cur += 5;
			}
		}
	}

	/*
	 * Pick a new current process and record its start time.
	 */
	cnt.v_swtch++;
	switchtime = new_switchtime;
	/* NOTE(review): this trace says "old proc" both before and after
	 * cpu_switch(); the second looks like it should read "new proc"
	 * like the one below -- left as-is (trace text only). */
	CTR4(KTR_PROC, "mi_switch: old proc %p (pid %d, %s), schedlock %p",
		p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock);
	cpu_switch();
	/* We resume here, possibly much later, when rescheduled. */
	CTR4(KTR_PROC, "mi_switch: new proc %p (pid %d, %s), schedlock %p",
		p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock);
	if (switchtime.tv_sec == 0)
		microuptime(&switchtime);
	switchticks = ticks;
	mtx_exit(&sched_lock, MTX_SPIN);
	/* Re-acquire Giant to the recursion depth we released above. */
	while (giantreleased--)
		mtx_enter(&Giant, MTX_DEF);
	WITNESS_RESTORE(&Giant, Giant);

	splx(x);
}
965103285Sikob
/*
 * Change process state to be runnable,
 * placing it on the run queue if it is in memory,
 * and awakening the swapper if it isn't in memory.
 */
void
setrunnable(p)
	register struct proc *p;
{
	register int s;

	s = splhigh();
	mtx_enter(&sched_lock, MTX_SPIN);
	switch (p->p_stat) {
	case 0:
	case SRUN:
	case SZOMB:
	case SWAIT:
	default:
		/* Already runnable, dead, or in a bogus state. */
		panic("setrunnable");
	case SSTOP:
	case SSLEEP:
		unsleep(p);		/* e.g. when sending signals */
		break;

	case SIDL:
		/* Being created; nothing to undo. */
		break;
	}
	p->p_stat = SRUN;
	if (p->p_flag & P_INMEM)
		setrunqueue(p);
	/* NOTE(review): spl is restored here while sched_lock is still
	 * held and more fields are touched below -- verify this early
	 * splx() is intentional and not meant to follow mtx_exit(). */
	splx(s);
	if (p->p_slptime > 1)
		updatepri(p);
	p->p_slptime = 0;
	if ((p->p_flag & P_INMEM) == 0) {
		/* Swapped out: ask the swapper (proc0) to bring it in. */
		p->p_flag |= P_SWAPINREQ;
		wakeup((caddr_t)&proc0);
	}
	else
		maybe_resched(p);
	mtx_exit(&sched_lock, MTX_SPIN);
}
1009169130Ssimokawa
1010169130Ssimokawa/*
1011169130Ssimokawa * Compute the priority of a process when running in user mode.
1012169130Ssimokawa * Arrange to reschedule if the resulting priority is better
1013169130Ssimokawa * than that of the current process.
1014169130Ssimokawa */
1015169130Ssimokawavoid
1016169130Ssimokawaresetpriority(p)
1017169130Ssimokawa	register struct proc *p;
1018169130Ssimokawa{
1019169130Ssimokawa	register unsigned int newpriority;
1020169130Ssimokawa
1021169130Ssimokawa	mtx_enter(&sched_lock, MTX_SPIN);
1022169130Ssimokawa	if (p->p_rtprio.type == RTP_PRIO_NORMAL) {
1023169130Ssimokawa		newpriority = PUSER + p->p_estcpu / INVERSE_ESTCPU_WEIGHT +
1024169130Ssimokawa		    NICE_WEIGHT * (p->p_nice - PRIO_MIN);
1025169130Ssimokawa		newpriority = min(newpriority, MAXPRI);
1026170374Ssimokawa		p->p_usrpri = newpriority;
1027170374Ssimokawa	}
1028170374Ssimokawa	maybe_resched(p);
1029170374Ssimokawa	mtx_exit(&sched_lock, MTX_SPIN);
1030170374Ssimokawa}
1031170374Ssimokawa
/* ARGSUSED */
static void
sched_setup(dummy)
	void *dummy;
{
	/* Kick off timeout driven events by calling first time. */
	roundrobin(NULL);
	schedcpu(NULL);
}
1041103285Sikob
1042103285Sikob/*
1043106790Ssimokawa * We adjust the priority of the current process.  The priority of
1044106790Ssimokawa * a process gets worse as it accumulates CPU time.  The cpu usage
1045103285Sikob * estimator (p_estcpu) is increased here.  resetpriority() will
1046169119Ssimokawa * compute a different priority each time p_estcpu increases by
1047169119Ssimokawa * INVERSE_ESTCPU_WEIGHT
1048103285Sikob * (until MAXPRI is reached).  The cpu usage estimator ramps up
1049169119Ssimokawa * quite quickly when the process is running (linearly), and decays
1050171513Ssimokawa * away exponentially, at a rate which is proportionally slower when
1051249291Swill * the system is busy.  The basic principle is that the system will
1052249291Swill * 90% forget that the process used a lot of CPU time in 5 * loadav
1053249291Swill * seconds.  This causes the system to favor processes which haven't
1054249291Swill * run much recently, and to round-robin among other processes.
1055169119Ssimokawa */
1056169119Ssimokawavoid
1057169119Ssimokawaschedclock(p)
1058169119Ssimokawa	struct proc *p;
1059169119Ssimokawa{
1060170374Ssimokawa
1061170374Ssimokawa	p->p_cpticks++;
1062170374Ssimokawa	p->p_estcpu = ESTCPULIM(p->p_estcpu + 1);
1063170374Ssimokawa	if ((p->p_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) {
1064170374Ssimokawa		resetpriority(p);
1065170374Ssimokawa		if (p->p_priority >= PUSER)
1066171513Ssimokawa			p->p_priority = p->p_usrpri;
1067169119Ssimokawa	}
1068169119Ssimokawa}
1069103285Sikob