kern_synch.c revision 71288
1165138Syongari/*-
2165138Syongari * Copyright (c) 1982, 1986, 1990, 1991, 1993
3165138Syongari *	The Regents of the University of California.  All rights reserved.
4165138Syongari * (c) UNIX System Laboratories, Inc.
5165138Syongari * All or some portions of this file are derived from material licensed
6165138Syongari * to the University of California by American Telephone and Telegraph
7165138Syongari * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8165138Syongari * the permission of UNIX System Laboratories, Inc.
9165138Syongari *
10165138Syongari * Redistribution and use in source and binary forms, with or without
11165138Syongari * modification, are permitted provided that the following conditions
12165138Syongari * are met:
13165138Syongari * 1. Redistributions of source code must retain the above copyright
14165138Syongari *    notice, this list of conditions and the following disclaimer.
15165138Syongari * 2. Redistributions in binary form must reproduce the above copyright
16165138Syongari *    notice, this list of conditions and the following disclaimer in the
17165138Syongari *    documentation and/or other materials provided with the distribution.
18165138Syongari * 3. All advertising materials mentioning features or use of this software
19165138Syongari *    must display the following acknowledgement:
20165138Syongari *	This product includes software developed by the University of
21165138Syongari *	California, Berkeley and its contributors.
22165138Syongari * 4. Neither the name of the University nor the names of its contributors
23165138Syongari *    may be used to endorse or promote products derived from this software
24165138Syongari *    without specific prior written permission.
25165138Syongari *
26165138Syongari * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27165138Syongari * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28165138Syongari * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29165138Syongari * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30165138Syongari * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31165138Syongari * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32165138Syongari * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33165138Syongari * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34165138Syongari * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35165138Syongari * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36165138Syongari * SUCH DAMAGE.
37165138Syongari *
38165138Syongari *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
39165138Syongari * $FreeBSD: head/sys/kern/kern_synch.c 71288 2001-01-20 02:57:59Z jhb $
40165138Syongari */
41165138Syongari
42165138Syongari#include "opt_ktrace.h"
43165138Syongari
44165138Syongari#include <sys/param.h>
45165138Syongari#include <sys/systm.h>
46165138Syongari#include <sys/proc.h>
47165138Syongari#include <sys/ipl.h>
48165138Syongari#include <sys/kernel.h>
49165138Syongari#include <sys/ktr.h>
50165138Syongari#include <sys/condvar.h>
51165138Syongari#include <sys/lock.h>
52165138Syongari#include <sys/mutex.h>
53165138Syongari#include <sys/signalvar.h>
54165138Syongari#include <sys/resourcevar.h>
55165138Syongari#include <sys/vmmeter.h>
56165138Syongari#include <sys/sysctl.h>
57165138Syongari#include <sys/sysproto.h>
58165138Syongari#include <vm/vm.h>
59165138Syongari#include <vm/vm_extern.h>
60165138Syongari#ifdef KTRACE
61165138Syongari#include <sys/uio.h>
62165138Syongari#include <sys/ktrace.h>
63165138Syongari#endif
64165138Syongari
65165138Syongari#include <machine/cpu.h>
66165138Syongari#include <machine/smp.h>
67165138Syongari
static void sched_setup __P((void *dummy));
SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL)

/* Priority of the currently running process (updated on msleep resume). */
u_char	curpriority;
/* Cpu-hog threshold; kept at 2 * sched_quantum by sleepinit()/sysctl. */
int	hogticks;
/* Dummy wait channel; schedcpu() does a wakeup on it once per second. */
int	lbolt;
int	sched_quantum;		/* Roundrobin scheduling quantum in ticks. */

/* Self-rearming callouts driving schedcpu() and roundrobin(). */
static struct callout schedcpu_callout;
static struct callout roundrobin_callout;

static int	curpriority_cmp __P((struct proc *p));
static void	endtsleep __P((void *));
static void	roundrobin __P((void *arg));
static void	schedcpu __P((void *arg));
83165138Syongari
84165138Syongaristatic int
85165138Syongarisysctl_kern_quantum(SYSCTL_HANDLER_ARGS)
86165138Syongari{
87165138Syongari	int error, new_val;
88165138Syongari
89165138Syongari	new_val = sched_quantum * tick;
90165138Syongari	error = sysctl_handle_int(oidp, &new_val, 0, req);
91165138Syongari        if (error != 0 || req->newptr == NULL)
92165138Syongari		return (error);
93165138Syongari	if (new_val < tick)
94165138Syongari		return (EINVAL);
95165138Syongari	sched_quantum = new_val / tick;
96165138Syongari	hogticks = 2 * sched_quantum;
97165138Syongari	return (0);
98165138Syongari}
99165138Syongari
100165138SyongariSYSCTL_PROC(_kern, OID_AUTO, quantum, CTLTYPE_INT|CTLFLAG_RW,
101165138Syongari	0, sizeof sched_quantum, sysctl_kern_quantum, "I", "");
102165138Syongari
103165138Syongari/*-
104165138Syongari * Compare priorities.  Return:
105165138Syongari *     <0: priority of p < current priority
106165138Syongari *      0: priority of p == current priority
107165138Syongari *     >0: priority of p > current priority
108165138Syongari * The priorities are the normal priorities or the normal realtime priorities
109165138Syongari * if p is on the same scheduler as curproc.  Otherwise the process on the
110165138Syongari * more realtimeish scheduler has lowest priority.  As usual, a higher
111165138Syongari * priority really means a lower priority.
112165138Syongari */
113165138Syongaristatic int
114165138Syongaricurpriority_cmp(p)
115165138Syongari	struct proc *p;
116165138Syongari{
117165138Syongari	int c_class, p_class;
118165138Syongari
119165138Syongari	c_class = RTP_PRIO_BASE(curproc->p_rtprio.type);
120165138Syongari	p_class = RTP_PRIO_BASE(p->p_rtprio.type);
121165138Syongari	if (p_class != c_class)
122165138Syongari		return (p_class - c_class);
123165138Syongari	if (p_class == RTP_PRIO_NORMAL)
124165138Syongari		return (((int)p->p_priority - (int)curpriority) / PPQ);
125165138Syongari	return ((int)p->p_rtprio.prio - (int)curproc->p_rtprio.prio);
126165138Syongari}
127165138Syongari
128165138Syongari/*
129165138Syongari * Arrange to reschedule if necessary, taking the priorities and
130165138Syongari * schedulers into account.
131165138Syongari */
132165138Syongarivoid
133192736Syongarimaybe_resched(chk)
134192736Syongari	struct proc *chk;
135192736Syongari{
136198475Slulf	struct proc *p = curproc; /* XXX */
137192736Syongari
138165138Syongari	/*
139165138Syongari	 * XXX idle scheduler still broken because proccess stays on idle
140165138Syongari	 * scheduler during waits (such as when getting FS locks).  If a
141165138Syongari	 * standard process becomes runaway cpu-bound, the system can lockup
142165138Syongari	 * due to idle-scheduler processes in wakeup never getting any cpu.
143193299Syongari	 */
144173775Syongari	if (p == PCPU_GET(idleproc)) {
145193299Syongari#if 0
146193299Syongari		need_resched();
147199012Syongari#endif
148165138Syongari	} else if (chk == p) {
149165138Syongari		/* We may need to yield if our priority has been raised. */
150165138Syongari		if (curpriority_cmp(chk) > 0)
151165138Syongari			need_resched();
152165138Syongari	} else if (curpriority_cmp(chk) < 0)
153197592Syongari		need_resched();
154165138Syongari}
155165138Syongari
156165138Syongariint
157165138Syongariroundrobin_interval(void)
158165138Syongari{
159165138Syongari	return (sched_quantum);
160165138Syongari}
161165138Syongari
162165138Syongari/*
163165138Syongari * Force switch among equal priority processes every 100ms.
164165138Syongari */
165165138Syongari/* ARGSUSED */
166165138Syongaristatic void
167165138Syongariroundrobin(arg)
168165138Syongari	void *arg;
169165138Syongari{
170165138Syongari#ifndef SMP
171165138Syongari 	struct proc *p = curproc; /* XXX */
172165138Syongari#endif
173165138Syongari
174165138Syongari#ifdef SMP
175165138Syongari	need_resched();
176165138Syongari	forward_roundrobin();
177165138Syongari#else
178165138Syongari 	if (p == PCPU_GET(idleproc) || RTP_PRIO_NEED_RR(p->p_rtprio.type))
179165138Syongari 		need_resched();
180165138Syongari#endif
181165138Syongari
182165138Syongari	callout_reset(&roundrobin_callout, sched_quantum, roundrobin, NULL);
183165138Syongari}
184165138Syongari
185165138Syongari/*
186165138Syongari * Constants for digital decay and forget:
187165138Syongari *	90% of (p_estcpu) usage in 5 * loadav time
188165138Syongari *	95% of (p_pctcpu) usage in 60 seconds (load insensitive)
189165138Syongari *          Note that, as ps(1) mentions, this can let percentages
190165138Syongari *          total over 100% (I've seen 137.9% for 3 processes).
191165138Syongari *
192165138Syongari * Note that schedclock() updates p_estcpu and p_cpticks asynchronously.
193165138Syongari *
194165138Syongari * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds.
195165138Syongari * That is, the system wants to compute a value of decay such
196165138Syongari * that the following for loop:
197165138Syongari * 	for (i = 0; i < (5 * loadavg); i++)
198165138Syongari * 		p_estcpu *= decay;
199165138Syongari * will compute
200165138Syongari * 	p_estcpu *= 0.1;
201165138Syongari * for all values of loadavg:
202165138Syongari *
203165138Syongari * Mathematically this loop can be expressed by saying:
204165138Syongari * 	decay ** (5 * loadavg) ~= .1
205165138Syongari *
206165138Syongari * The system computes decay as:
207165138Syongari * 	decay = (2 * loadavg) / (2 * loadavg + 1)
208165138Syongari *
209165138Syongari * We wish to prove that the system's computation of decay
210165138Syongari * will always fulfill the equation:
211165138Syongari * 	decay ** (5 * loadavg) ~= .1
212165138Syongari *
213165138Syongari * If we compute b as:
214165138Syongari * 	b = 2 * loadavg
215165138Syongari * then
216165138Syongari * 	decay = b / (b + 1)
217165138Syongari *
218165138Syongari * We now need to prove two things:
219165138Syongari *	1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
220165138Syongari *	2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
221165138Syongari *
222165138Syongari * Facts:
223165138Syongari *         For x close to zero, exp(x) =~ 1 + x, since
224165138Syongari *              exp(x) = 0! + x**1/1! + x**2/2! + ... .
225165138Syongari *              therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
226165138Syongari *         For x close to zero, ln(1+x) =~ x, since
227165138Syongari *              ln(1+x) = x - x**2/2 + x**3/3 - ...     -1 < x < 1
228165138Syongari *              therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
229165138Syongari *         ln(.1) =~ -2.30
230165138Syongari *
231165138Syongari * Proof of (1):
232165138Syongari *    Solve (factor)**(power) =~ .1 given power (5*loadav):
233193293Syongari *	solving for factor,
234193293Syongari *      ln(factor) =~ (-2.30/5*loadav), or
235165138Syongari *      factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
236165138Syongari *          exp(-1/b) =~ (b-1)/b =~ b/(b+1).                    QED
237165138Syongari *
238165138Syongari * Proof of (2):
239165138Syongari *    Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
240165138Syongari *	solving for power,
241165138Syongari *      power*ln(b/(b+1)) =~ -2.30, or
242165138Syongari *      power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.  QED
243165138Syongari *
244165138Syongari * Actual power values for the implemented algorithm are as follows:
245165138Syongari *      loadav: 1       2       3       4
246165138Syongari *      power:  5.68    10.32   14.94   19.55
247165138Syongari */
248165138Syongari
/* calculations for digital decay to forget 90% of usage in 5*loadav sec */
#define	loadfactor(loadav)	(2 * (loadav))
#define	decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))

/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
/*
 * NOTE(review): ccpu is a fixpt_t but is exported below through
 * SYSCTL_INT; this assumes fixpt_t is int-sized -- confirm.
 */
static fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;	/* exp(-1/20) */
SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");

/* kernel uses `FSCALE', userland (SHOULD) use kern.fscale */
static int	fscale __unused = FSCALE;
SYSCTL_INT(_kern, OID_AUTO, fscale, CTLFLAG_RD, 0, FSCALE, "");
260165138Syongari
261165138Syongari/*
262165138Syongari * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
263165138Syongari * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
264165138Syongari * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
265165138Syongari *
266165138Syongari * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
267165138Syongari *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
268165138Syongari *
269165138Syongari * If you don't want to bother with the faster/more-accurate formula, you
270165138Syongari * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
271165138Syongari * (more general) method of calculating the %age of CPU used by a process.
272165138Syongari */
273165138Syongari#define	CCPU_SHIFT	11
274165138Syongari
275165138Syongari/*
276165138Syongari * Recompute process priorities, every hz ticks.
277165138Syongari * MP-safe, called without the Giant mutex.
278165138Syongari */
279165138Syongari/* ARGSUSED */
static void
schedcpu(arg)
	void *arg;
{
	register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
	register struct proc *p;
	register int realstathz, s;

	/* stathz may be zero early in boot; fall back to hz. */
	realstathz = stathz ? stathz : hz;
	ALLPROC_LOCK(AP_SHARED);
	LIST_FOREACH(p, &allproc, p_list) {
		/*
		 * Increment time in/out of memory and sleep time
		 * (if sleeping).  We ignore overflow; with 16-bit int's
		 * (remember them?) overflow takes 45 days.
		if (p->p_stat == SWAIT)
			continue;
		 */
		mtx_enter(&sched_lock, MTX_SPIN);
		p->p_swtime++;
		if (p->p_stat == SSLEEP || p->p_stat == SSTOP)
			p->p_slptime++;
		/* Geometric decay of %cpu (by ccpu = exp(-1/20) per tick). */
		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
		/*
		 * If the process has slept the entire second,
		 * stop recalculating its priority until it wakes up.
		 */
		if (p->p_slptime > 1) {
			mtx_exit(&sched_lock, MTX_SPIN);
			continue;
		}

		/*
		 * prevent state changes and protect run queue
		 */
		s = splhigh();

		/*
		 * p_pctcpu is only for ps.
		 */
#if	(FSHIFT >= CCPU_SHIFT)
		p->p_pctcpu += (realstathz == 100)?
			((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT):
                	100 * (((fixpt_t) p->p_cpticks)
				<< (FSHIFT - CCPU_SHIFT)) / realstathz;
#else
		p->p_pctcpu += ((FSCALE - ccpu) *
			(p->p_cpticks * FSCALE / realstathz)) >> FSHIFT;
#endif
		/* Fold this second's ticks into estcpu and rederive priority. */
		p->p_cpticks = 0;
		p->p_estcpu = decay_cpu(loadfac, p->p_estcpu);
		resetpriority(p);
		if (p->p_priority >= PUSER) {
			/*
			 * If the new user priority lands in a different run
			 * queue, the process must be moved; otherwise just
			 * store the new priority.
			 */
			if ((p != curproc) &&
#ifdef SMP
			    p->p_oncpu == 0xff && 	/* idle */
#endif
			    p->p_stat == SRUN &&
			    (p->p_flag & P_INMEM) &&
			    (p->p_priority / PPQ) != (p->p_usrpri / PPQ)) {
				remrunqueue(p);
				p->p_priority = p->p_usrpri;
				setrunqueue(p);
			} else
				p->p_priority = p->p_usrpri;
		}
		mtx_exit(&sched_lock, MTX_SPIN);
		splx(s);
	}
	ALLPROC_LOCK(AP_RELEASE);
	vmmeter();
	/* Once-per-second heartbeat for anyone sleeping on lbolt. */
	wakeup((caddr_t)&lbolt);
	callout_reset(&schedcpu_callout, hz, schedcpu, NULL);
}
354193293Syongari
355193293Syongari/*
356193293Syongari * Recalculate the priority of a process after it has slept for a while.
357193293Syongari * For all load averages >= 1 and max p_estcpu of 255, sleeping for at
358193293Syongari * least six times the loadfactor will decay p_estcpu to zero.
359193293Syongari */
360193293Syongarivoid
361193293Syongariupdatepri(p)
362193293Syongari	register struct proc *p;
363193293Syongari{
364193293Syongari	register unsigned int newcpu = p->p_estcpu;
365193293Syongari	register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
366193293Syongari
367193293Syongari	if (p->p_slptime > 5 * loadfac)
368193293Syongari		p->p_estcpu = 0;
369193293Syongari	else {
370193293Syongari		p->p_slptime--;	/* the first time was done in schedcpu */
371193293Syongari		while (newcpu && --p->p_slptime)
372193293Syongari			newcpu = decay_cpu(loadfac, newcpu);
373193293Syongari		p->p_estcpu = newcpu;
374193293Syongari	}
375193293Syongari	resetpriority(p);
376193293Syongari}
377193293Syongari
378193293Syongari/*
379193293Syongari * We're only looking at 7 bits of the address; everything is
380193293Syongari * aligned to 4, lots of things are aligned to greater powers
381193293Syongari * of 2.  Shift right by 8, i.e. drop the bottom 256 worth.
382193293Syongari */
/* Hash-table size; must stay a power of two for the LOOKUP() mask. */
#define TABLESIZE	128
/* Sleep queues, hashed by wait channel address. */
static TAILQ_HEAD(slpquehead, proc) slpque[TABLESIZE];
#define LOOKUP(x)	(((intptr_t)(x) >> 8) & (TABLESIZE - 1))
386165138Syongari
387165138Syongarivoid
388165138Syongarisleepinit(void)
389165138Syongari{
390165138Syongari	int i;
391165138Syongari
392165138Syongari	sched_quantum = hz/10;
393165138Syongari	hogticks = 2 * sched_quantum;
394165138Syongari	for (i = 0; i < TABLESIZE; i++)
395165138Syongari		TAILQ_INIT(&slpque[i]);
396165138Syongari}
397165138Syongari
398165138Syongari/*
399165138Syongari * General sleep call.  Suspends the current process until a wakeup is
400165138Syongari * performed on the specified identifier.  The process will then be made
401165138Syongari * runnable with the specified priority.  Sleeps at most timo/hz seconds
402165138Syongari * (0 means no timeout).  If pri includes PCATCH flag, signals are checked
403165138Syongari * before and after sleeping, else signals are not checked.  Returns 0 if
404165138Syongari * awakened, EWOULDBLOCK if the timeout expires.  If PCATCH is set and a
405165138Syongari * signal needs to be delivered, ERESTART is returned if the current system
406165138Syongari * call should be restarted if possible, and EINTR is returned if the system
407165138Syongari * call should be interrupted by the signal (return EINTR).
408165138Syongari *
409165138Syongari * The mutex argument is exited before the caller is suspended, and
410165138Syongari * entered before msleep returns.  If priority includes the PDROP
411165138Syongari * flag the mutex is not entered before returning.
412165138Syongari */
int
msleep(ident, mtx, priority, wmesg, timo)
	void *ident;
	struct mtx *mtx;
	int priority, timo;
	const char *wmesg;
{
	struct proc *p = curproc;
	int s, sig, catch = priority & PCATCH;
	int rval = 0;
	WITNESS_SAVE_DECL(mtx);

#ifdef KTRACE
	if (p && KTRPOINT(p, KTR_CSW))
		ktrcsw(p->p_tracep, 1, 0);
#endif
	WITNESS_SLEEP(0, mtx);
	mtx_enter(&sched_lock, MTX_SPIN);
	s = splhigh();
	if (cold || panicstr) {
		/*
		 * After a panic, or during autoconfiguration,
		 * just give interrupts a chance, then just return;
		 * don't run any other procs or panic below,
		 * in case this is the idle process and already asleep.
		 */
		/* PDROP is still honored even on this early-out path. */
		if (mtx != NULL && priority & PDROP)
			mtx_exit(mtx, MTX_DEF | MTX_NOSWITCH);
		mtx_exit(&sched_lock, MTX_SPIN);
		splx(s);
		return (0);
	}

	/* Release Giant without switching; PICKUP_GIANT() reacquires it. */
	DROP_GIANT_NOSWITCH();

	if (mtx != NULL) {
		mtx_assert(mtx, MA_OWNED | MA_NOTRECURSED);
		WITNESS_SAVE(mtx, mtx);
		mtx_exit(mtx, MTX_DEF | MTX_NOSWITCH);
		/* NULL mtx below means "do not reacquire before return". */
		if (priority & PDROP)
			mtx = NULL;
	}

	KASSERT(p != NULL, ("msleep1"));
	KASSERT(ident != NULL && p->p_stat == SRUN, ("msleep"));
	/*
	 * Process may be sitting on a slpque if asleep() was called, remove
	 * it before re-adding.
	 */
	if (p->p_wchan != NULL)
		unsleep(p);

	p->p_wchan = ident;
	p->p_wmesg = wmesg;
	p->p_slptime = 0;
	p->p_priority = priority & PRIMASK;
	CTR4(KTR_PROC, "msleep: proc %p (pid %d, %s), schedlock %p",
		p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock);
	TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], p, p_slpq);
	if (timo)
		callout_reset(&p->p_slpcallout, timo, endtsleep, p);
	/*
	 * We put ourselves on the sleep queue and start our timeout
	 * before calling CURSIG, as we could stop there, and a wakeup
	 * or a SIGCONT (or both) could occur while we were stopped.
	 * A SIGCONT would cause us to be marked as SSLEEP
	 * without resuming us, thus we must be ready for sleep
	 * when CURSIG is called.  If the wakeup happens while we're
	 * stopped, p->p_wchan will be 0 upon return from CURSIG.
	 */
	if (catch) {
		CTR4(KTR_PROC,
		        "msleep caught: proc %p (pid %d, %s), schedlock %p",
			p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock);
		p->p_flag |= P_SINTR;
		mtx_exit(&sched_lock, MTX_SPIN);
		/* A pending signal aborts the sleep before it starts. */
		if ((sig = CURSIG(p))) {
			mtx_enter(&sched_lock, MTX_SPIN);
			if (p->p_wchan)
				unsleep(p);
			p->p_stat = SRUN;
			goto resume;
		}
		mtx_enter(&sched_lock, MTX_SPIN);
		/* Woken up while checking signals; nothing left to wait on. */
		if (p->p_wchan == 0) {
			catch = 0;
			goto resume;
		}
	} else
		sig = 0;
	p->p_stat = SSLEEP;
	p->p_stats->p_ru.ru_nvcsw++;
	mi_switch();
	CTR4(KTR_PROC,
	        "msleep resume: proc %p (pid %d, %s), schedlock %p",
		p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock);
resume:
	curpriority = p->p_usrpri;
	splx(s);
	p->p_flag &= ~P_SINTR;
	/* P_TIMEOUT still set means endtsleep() fired and woke us. */
	if (p->p_flag & P_TIMEOUT) {
		p->p_flag &= ~P_TIMEOUT;
		if (sig == 0) {
#ifdef KTRACE
			if (KTRPOINT(p, KTR_CSW))
				ktrcsw(p->p_tracep, 0, 0);
#endif
			rval = EWOULDBLOCK;
			mtx_exit(&sched_lock, MTX_SPIN);
			goto out;
		}
	} else if (timo)
		callout_stop(&p->p_slpcallout);
	mtx_exit(&sched_lock, MTX_SPIN);

	/* Re-check for signals that arrived while we were asleep. */
	if (catch && (sig != 0 || (sig = CURSIG(p)))) {
#ifdef KTRACE
		if (KTRPOINT(p, KTR_CSW))
			ktrcsw(p->p_tracep, 0, 0);
#endif
		if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig))
			rval = EINTR;
		else
			rval = ERESTART;
		goto out;
	}
out:
#ifdef KTRACE
	if (KTRPOINT(p, KTR_CSW))
		ktrcsw(p->p_tracep, 0, 0);
#endif
	PICKUP_GIANT();
	/* Reacquire the caller's mutex unless PDROP nulled it above. */
	if (mtx != NULL) {
		mtx_enter(mtx, MTX_DEF);
		WITNESS_RESTORE(mtx, mtx);
	}
	return (rval);
}
551165138Syongari
552165138Syongari/*
553165138Syongari * asleep() - async sleep call.  Place process on wait queue and return
554165138Syongari * immediately without blocking.  The process stays runnable until mawait()
555165138Syongari * is called.  If ident is NULL, remove process from wait queue if it is still
556165138Syongari * on one.
557165138Syongari *
558165138Syongari * Only the most recent sleep condition is effective when making successive
559165138Syongari * calls to asleep() or when calling msleep().
560165138Syongari *
561165138Syongari * The timeout, if any, is not initiated until mawait() is called.  The sleep
562165138Syongari * priority, signal, and timeout is specified in the asleep() call but may be
563165138Syongari * overriden in the mawait() call.
564165138Syongari *
565165138Syongari * <<<<<<<< EXPERIMENTAL, UNTESTED >>>>>>>>>>
566165138Syongari */
567165138Syongari
568165138Syongariint
569165138Syongariasleep(void *ident, int priority, const char *wmesg, int timo)
570165138Syongari{
571165138Syongari	struct proc *p = curproc;
572165138Syongari	int s;
573165138Syongari
574165138Syongari	/*
575165138Syongari	 * obtain sched_lock while manipulating sleep structures and slpque.
576165138Syongari	 *
577165138Syongari	 * Remove preexisting wait condition (if any) and place process
578165138Syongari	 * on appropriate slpque, but do not put process to sleep.
579165138Syongari	 */
580165138Syongari
581165138Syongari	s = splhigh();
582165138Syongari	mtx_enter(&sched_lock, MTX_SPIN);
583165138Syongari
584165138Syongari	if (p->p_wchan != NULL)
585165138Syongari		unsleep(p);
586165138Syongari
587165138Syongari	if (ident) {
588165138Syongari		p->p_wchan = ident;
589165138Syongari		p->p_wmesg = wmesg;
590165138Syongari		p->p_slptime = 0;
591165138Syongari		p->p_asleep.as_priority = priority;
592165138Syongari		p->p_asleep.as_timo = timo;
593165138Syongari		TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], p, p_slpq);
594165138Syongari	}
595165138Syongari
596165138Syongari	mtx_exit(&sched_lock, MTX_SPIN);
597165138Syongari	splx(s);
598165138Syongari
599165138Syongari	return(0);
600165138Syongari}
601165138Syongari
602165138Syongari/*
603165138Syongari * mawait() - wait for async condition to occur.   The process blocks until
604165138Syongari * wakeup() is called on the most recent asleep() address.  If wakeup is called
605165138Syongari * prior to mawait(), mawait() winds up being a NOP.
606165138Syongari *
607165138Syongari * If mawait() is called more then once (without an intervening asleep() call),
608165138Syongari * mawait() is still effectively a NOP but it calls mi_switch() to give other
609165138Syongari * processes some cpu before returning.  The process is left runnable.
610165138Syongari *
611165138Syongari * <<<<<<<< EXPERIMENTAL, UNTESTED >>>>>>>>>>
612165138Syongari */
613165138Syongari
int
mawait(struct mtx *mtx, int priority, int timo)
{
	struct proc *p = curproc;
	int rval = 0;
	int s;
	WITNESS_SAVE_DECL(mtx);

	WITNESS_SLEEP(0, mtx);
	mtx_enter(&sched_lock, MTX_SPIN);
	/* Release Giant without switching; PICKUP_GIANT() reacquires it. */
	DROP_GIANT_NOSWITCH();
	if (mtx != NULL) {
		mtx_assert(mtx, MA_OWNED | MA_NOTRECURSED);
		WITNESS_SAVE(mtx, mtx);
		mtx_exit(mtx, MTX_DEF | MTX_NOSWITCH);
		/* NULL mtx below means "do not reacquire before return". */
		if (priority & PDROP)
			mtx = NULL;
	}

	s = splhigh();

	/* Non-NULL wchan: asleep() armed a wait and no wakeup beat us here. */
	if (p->p_wchan != NULL) {
		int sig;
		int catch;

		/*
		 * The call to mawait() can override defaults specified in
		 * the original asleep().
		 */
		if (priority < 0)
			priority = p->p_asleep.as_priority;
		if (timo < 0)
			timo = p->p_asleep.as_timo;

		/*
		 * Install timeout
		 */

		if (timo)
			callout_reset(&p->p_slpcallout, timo, endtsleep, p);

		sig = 0;
		catch = priority & PCATCH;

		if (catch) {
			p->p_flag |= P_SINTR;
			mtx_exit(&sched_lock, MTX_SPIN);
			/* A pending signal aborts the sleep before it starts. */
			if ((sig = CURSIG(p))) {
				mtx_enter(&sched_lock, MTX_SPIN);
				if (p->p_wchan)
					unsleep(p);
				p->p_stat = SRUN;
				goto resume;
			}
			mtx_enter(&sched_lock, MTX_SPIN);
			/* Wakeup raced in during the signal check. */
			if (p->p_wchan == NULL) {
				catch = 0;
				goto resume;
			}
		}
		p->p_stat = SSLEEP;
		p->p_stats->p_ru.ru_nvcsw++;
		mi_switch();
resume:
		curpriority = p->p_usrpri;

		splx(s);
		p->p_flag &= ~P_SINTR;
		/* P_TIMEOUT still set means endtsleep() fired and woke us. */
		if (p->p_flag & P_TIMEOUT) {
			p->p_flag &= ~P_TIMEOUT;
			if (sig == 0) {
#ifdef KTRACE
				if (KTRPOINT(p, KTR_CSW))
					ktrcsw(p->p_tracep, 0, 0);
#endif
				rval = EWOULDBLOCK;
				mtx_exit(&sched_lock, MTX_SPIN);
				goto out;
			}
		} else if (timo)
			callout_stop(&p->p_slpcallout);
		mtx_exit(&sched_lock, MTX_SPIN);

		/* Re-check for signals that arrived while we were asleep. */
		if (catch && (sig != 0 || (sig = CURSIG(p)))) {
#ifdef KTRACE
			if (KTRPOINT(p, KTR_CSW))
				ktrcsw(p->p_tracep, 0, 0);
#endif
			if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig))
				rval = EINTR;
			else
				rval = ERESTART;
			goto out;
		}
#ifdef KTRACE
		if (KTRPOINT(p, KTR_CSW))
			ktrcsw(p->p_tracep, 0, 0);
#endif
	} else {
		/*
		 * If as_priority is 0, mawait() has been called without an
		 * intervening asleep().  We are still effectively a NOP,
		 * but we call mi_switch() for safety.
		 */

		if (p->p_asleep.as_priority == 0) {
			p->p_stats->p_ru.ru_nvcsw++;
			mi_switch();
		}
		mtx_exit(&sched_lock, MTX_SPIN);
		splx(s);
	}

	/*
	 * clear p_asleep.as_priority as an indication that mawait() has been
	 * called.  If mawait() is called again without an intervening asleep(),
	 * mawait() is still effectively a NOP but the above mi_switch() code
	 * is triggered as a safety.
	 *
	 * NOTE(review): the EWOULDBLOCK/EINTR/ERESTART paths jump straight
	 * to "out" and skip this clear -- presumably intentional; confirm.
	 */
	p->p_asleep.as_priority = 0;

out:
	PICKUP_GIANT();
	/* Reacquire the caller's mutex unless PDROP nulled it above. */
	if (mtx != NULL) {
		mtx_enter(mtx, MTX_DEF);
		WITNESS_RESTORE(mtx, mtx);
	}
	return (rval);
}
743165138Syongari
744165138Syongari/*
745165138Syongari * Implement timeout for msleep or asleep()/mawait()
746165138Syongari *
747165138Syongari * If process hasn't been awakened (wchan non-zero),
748165138Syongari * set timeout flag and undo the sleep.  If proc
749165138Syongari * is stopped, just unsleep so it will remain stopped.
750165138Syongari * MP-safe, called without the Giant mutex.
751165138Syongari */
752165138Syongaristatic void
753165138Syongariendtsleep(arg)
754165138Syongari	void *arg;
755165138Syongari{
756165138Syongari	register struct proc *p;
757165138Syongari	int s;
758165138Syongari
759165138Syongari	p = (struct proc *)arg;
760165138Syongari	CTR4(KTR_PROC,
761165138Syongari	        "endtsleep: proc %p (pid %d, %s), schedlock %p",
762165138Syongari		p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock);
763165138Syongari	s = splhigh();
764165138Syongari	mtx_enter(&sched_lock, MTX_SPIN);
765165138Syongari	if (p->p_wchan) {
766165138Syongari		if (p->p_stat == SSLEEP)
767165138Syongari			setrunnable(p);
768165138Syongari		else
769165138Syongari			unsleep(p);
770165138Syongari		p->p_flag |= P_TIMEOUT;
771165138Syongari	}
772165138Syongari	mtx_exit(&sched_lock, MTX_SPIN);
773165138Syongari	splx(s);
774165138Syongari}
775165138Syongari
776165138Syongari/*
777165138Syongari * Remove a process from its wait queue
778165138Syongari */
779165138Syongarivoid
780165138Syongariunsleep(p)
781165138Syongari	register struct proc *p;
782165138Syongari{
783165138Syongari	int s;
784165138Syongari
785165138Syongari	s = splhigh();
786165138Syongari	mtx_enter(&sched_lock, MTX_SPIN);
787165138Syongari	if (p->p_wchan) {
788165138Syongari		TAILQ_REMOVE(&slpque[LOOKUP(p->p_wchan)], p, p_slpq);
789165138Syongari		p->p_wchan = 0;
790165138Syongari	}
791165138Syongari	mtx_exit(&sched_lock, MTX_SPIN);
792165138Syongari	splx(s);
793165138Syongari}
794165138Syongari
795165138Syongari/*
796165138Syongari * Make all processes sleeping on the specified identifier runnable.
797165138Syongari */
void
wakeup(ident)
	register void *ident;
{
	register struct slpquehead *qp;
	register struct proc *p;
	int s;

	s = splhigh();
	mtx_enter(&sched_lock, MTX_SPIN);
	/* Sleep queues are hashed by channel; scan only this bucket. */
	qp = &slpque[LOOKUP(ident)];
restart:
	TAILQ_FOREACH(p, qp, p_slpq) {
		if (p->p_wchan == ident) {
			/*
			 * Dequeue the process regardless of its state.
			 * NOTE(review): when p_stat != SSLEEP (e.g. stopped)
			 * the loop keeps iterating from the removed node —
			 * this relies on TAILQ_REMOVE leaving the victim's
			 * forward link intact; confirm before restructuring.
			 */
			TAILQ_REMOVE(qp, p, p_slpq);
			p->p_wchan = 0;
			if (p->p_stat == SSLEEP) {
				/* OPTIMIZED EXPANSION OF setrunnable(p); */
				CTR4(KTR_PROC,
				        "wakeup: proc %p (pid %d, %s), schedlock %p",
					p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock);
				/* Re-boost priority after a long sleep. */
				if (p->p_slptime > 1)
					updatepri(p);
				p->p_slptime = 0;
				p->p_stat = SRUN;
				if (p->p_flag & P_INMEM) {
					setrunqueue(p);
					maybe_resched(p);
				} else {
					/* Swapped out: ask the swapper to bring it in. */
					p->p_flag |= P_SWAPINREQ;
					wakeup((caddr_t)&proc0);
				}
				/* END INLINE EXPANSION */
				/*
				 * Restart the scan: updatepri()/setrunqueue()
				 * may have run with the queue in flux, and the
				 * current node is no longer linked.
				 */
				goto restart;
			}
		}
	}
	mtx_exit(&sched_lock, MTX_SPIN);
	splx(s);
}
838165138Syongari
839165138Syongari/*
840165138Syongari * Make a process sleeping on the specified identifier runnable.
841165138Syongari * May wake more than one process if a target process is currently
842165138Syongari * swapped out.
843165138Syongari */
void
wakeup_one(ident)
	register void *ident;
{
	register struct slpquehead *qp;
	register struct proc *p;
	int s;

	s = splhigh();
	mtx_enter(&sched_lock, MTX_SPIN);
	/* Sleep queues are hashed by channel; scan only this bucket. */
	qp = &slpque[LOOKUP(ident)];

	TAILQ_FOREACH(p, qp, p_slpq) {
		if (p->p_wchan == ident) {
			TAILQ_REMOVE(qp, p, p_slpq);
			p->p_wchan = 0;
			if (p->p_stat == SSLEEP) {
				/* OPTIMIZED EXPANSION OF setrunnable(p); */
				CTR4(KTR_PROC,
				        "wakeup1: proc %p (pid %d, %s), schedlock %p",
					p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock);
				/* Re-boost priority after a long sleep. */
				if (p->p_slptime > 1)
					updatepri(p);
				p->p_slptime = 0;
				p->p_stat = SRUN;
				if (p->p_flag & P_INMEM) {
					setrunqueue(p);
					maybe_resched(p);
					/* One runnable process woken: done. */
					break;
				} else {
					/*
					 * Target is swapped out: request a
					 * swap-in and keep scanning, so more
					 * than one process may be woken (see
					 * the function header comment).
					 */
					p->p_flag |= P_SWAPINREQ;
					wakeup((caddr_t)&proc0);
				}
				/* END INLINE EXPANSION */
			}
		}
	}
	mtx_exit(&sched_lock, MTX_SPIN);
	splx(s);
}
884165138Syongari
885165138Syongari/*
886165138Syongari * The machine independent parts of mi_switch().
887165138Syongari * Must be called at splstatclock() or higher.
888165138Syongari */
889165138Syongarivoid
890165138Syongarimi_switch()
891193293Syongari{
892165138Syongari	struct timeval new_switchtime;
893165138Syongari	register struct proc *p = curproc;	/* XXX */
894192734Syongari	register struct rlimit *rlim;
895199012Syongari	int x;
896199012Syongari
897165138Syongari	/*
898165138Syongari	 * XXX this spl is almost unnecessary.  It is partly to allow for
899165138Syongari	 * sloppy callers that don't do it (issignal() via CURSIG() is the
900165138Syongari	 * main offender).  It is partly to work around a bug in the i386
901165138Syongari	 * cpu_switch() (the ipl is not preserved).  We ran for years
902165138Syongari	 * without it.  I think there was only a interrupt latency problem.
903165138Syongari	 * The main caller, msleep(), does an splx() a couple of instructions
904165138Syongari	 * after calling here.  The buggy caller, issignal(), usually calls
905165138Syongari	 * here at spl0() and sometimes returns at splhigh().  The process
906165138Syongari	 * then runs for a little too long at splhigh().  The ipl gets fixed
907173769Syongari	 * when the process returns to user mode (or earlier).
908173769Syongari	 *
909165138Syongari	 * It would probably be better to always call here at spl0(). Callers
910192734Syongari	 * are prepared to give up control to another process, so they must
911192734Syongari	 * be prepared to be interrupted.  The clock stuff here may not
912193293Syongari	 * actually need splstatclock().
913193293Syongari	 */
914193293Syongari	x = splstatclock();
915165138Syongari
916165138Syongari	mtx_assert(&sched_lock, MA_OWNED);
917165138Syongari
918165138Syongari#ifdef SIMPLELOCK_DEBUG
919165138Syongari	if (p->p_simple_locks)
920165138Syongari		printf("sleep: holding simple lock\n");
921165138Syongari#endif
922165138Syongari	/*
923165138Syongari	 * Compute the amount of time during which the current
924165138Syongari	 * process was running, and add that to its total so far.
925165138Syongari	 */
926165138Syongari	microuptime(&new_switchtime);
927165138Syongari	if (timevalcmp(&new_switchtime, PCPU_PTR(switchtime), <)) {
928165138Syongari#if 0
929165138Syongari		/* XXX: This doesn't play well with sched_lock right now. */
930165138Syongari		printf("microuptime() went backwards (%ld.%06ld -> %ld.%06ld)\n",
931165138Syongari		    PCPU_GET(switchtime.tv_sec), PCPU_GET(switchtime.tv_usec),
932165138Syongari		    new_switchtime.tv_sec, new_switchtime.tv_usec);
933165138Syongari#endif
934165138Syongari		new_switchtime = PCPU_GET(switchtime);
935165138Syongari	} else {
936165138Syongari		p->p_runtime += (new_switchtime.tv_usec - PCPU_GET(switchtime.tv_usec)) +
937165138Syongari		    (new_switchtime.tv_sec - PCPU_GET(switchtime.tv_sec)) *
938165138Syongari		    (int64_t)1000000;
939165138Syongari	}
940165138Syongari
941165138Syongari#if 0
942165138Syongari	/*
943165138Syongari	 * Check if the process exceeds its cpu resource allocation.
944165138Syongari	 * If over max, kill it.
945165138Syongari	 *
946165138Syongari	 * XXX drop sched_lock, pickup Giant
947165138Syongari	 */
948165138Syongari	if (p->p_stat != SZOMB && p->p_limit->p_cpulimit != RLIM_INFINITY &&
949165138Syongari	    p->p_runtime > p->p_limit->p_cpulimit) {
950165138Syongari		rlim = &p->p_rlimit[RLIMIT_CPU];
951165138Syongari		if (p->p_runtime / (rlim_t)1000000 >= rlim->rlim_max) {
952165138Syongari			killproc(p, "exceeded maximum CPU limit");
953165138Syongari		} else {
954165138Syongari			psignal(p, SIGXCPU);
955165138Syongari			if (rlim->rlim_cur < rlim->rlim_max) {
956165138Syongari				/* XXX: we should make a private copy */
957165138Syongari				rlim->rlim_cur += 5;
958165138Syongari			}
959165138Syongari		}
960165138Syongari	}
961165138Syongari#endif
962165138Syongari
963165138Syongari	/*
964165138Syongari	 * Pick a new current process and record its start time.
965165138Syongari	 */
966165138Syongari	cnt.v_swtch++;
967165138Syongari	PCPU_SET(switchtime, new_switchtime);
968165138Syongari	CTR4(KTR_PROC, "mi_switch: old proc %p (pid %d, %s), schedlock %p",
969165138Syongari		p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock);
970165138Syongari	cpu_switch();
971165138Syongari	CTR4(KTR_PROC, "mi_switch: new proc %p (pid %d, %s), schedlock %p",
972165138Syongari		p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock);
973165138Syongari	if (PCPU_GET(switchtime.tv_sec) == 0)
974165138Syongari		microuptime(PCPU_PTR(switchtime));
975165138Syongari	PCPU_SET(switchticks, ticks);
976165138Syongari	splx(x);
977165138Syongari}
978165138Syongari
979193293Syongari/*
980193293Syongari * Change process state to be runnable,
981193293Syongari * placing it on the run queue if it is in memory,
982193293Syongari * and awakening the swapper if it isn't in memory.
983193293Syongari */
void
setrunnable(p)
	register struct proc *p;
{
	register int s;

	s = splhigh();
	mtx_enter(&sched_lock, MTX_SPIN);
	switch (p->p_stat) {
	case 0:
	case SRUN:
	case SZOMB:
	case SWAIT:
	default:
		/* Already runnable, dead, or invalid: caller bug. */
		panic("setrunnable");
	case SSTOP:
	case SSLEEP:			/* e.g. when sending signals */
		/* Remove from whichever wait queue the process is on. */
		if (p->p_flag & P_CVWAITQ)
			cv_waitq_remove(p);
		else
			unsleep(p);
		break;

	case SIDL:
		break;
	}
	p->p_stat = SRUN;
	if (p->p_flag & P_INMEM)
		setrunqueue(p);
	/*
	 * NOTE(review): the spl is dropped here while sched_lock is still
	 * held until the end of the function — asymmetric with the
	 * acquisition order above; confirm this ordering is intentional.
	 */
	splx(s);
	/* Long sleepers get their priority recomputed. */
	if (p->p_slptime > 1)
		updatepri(p);
	p->p_slptime = 0;
	if ((p->p_flag & P_INMEM) == 0) {
		/* Swapped out: ask the swapper (proc0) to bring it in. */
		p->p_flag |= P_SWAPINREQ;
		wakeup((caddr_t)&proc0);
	}
	else
		maybe_resched(p);
	mtx_exit(&sched_lock, MTX_SPIN);
}
1025165138Syongari
1026165138Syongari/*
1027165138Syongari * Compute the priority of a process when running in user mode.
1028165138Syongari * Arrange to reschedule if the resulting priority is better
1029165138Syongari * than that of the current process.
1030165138Syongari */
1031165138Syongarivoid
1032165138Syongariresetpriority(p)
1033165138Syongari	register struct proc *p;
1034165138Syongari{
1035165138Syongari	register unsigned int newpriority;
1036165138Syongari
1037165138Syongari	mtx_enter(&sched_lock, MTX_SPIN);
1038165138Syongari	if (p->p_rtprio.type == RTP_PRIO_NORMAL) {
1039165138Syongari		newpriority = PUSER + p->p_estcpu / INVERSE_ESTCPU_WEIGHT +
1040165138Syongari		    NICE_WEIGHT * (p->p_nice - PRIO_MIN);
1041165138Syongari		newpriority = min(newpriority, MAXPRI);
1042165138Syongari		p->p_usrpri = newpriority;
1043165138Syongari	}
1044165138Syongari	maybe_resched(p);
1045165138Syongari	mtx_exit(&sched_lock, MTX_SPIN);
1046165138Syongari}
1047165138Syongari
1048165138Syongari/* ARGSUSED */
1049165138Syongaristatic void
1050165138Syongarisched_setup(dummy)
1051165138Syongari	void *dummy;
1052165138Syongari{
1053165138Syongari
1054165138Syongari	callout_init(&schedcpu_callout, 1);
1055165138Syongari	callout_init(&roundrobin_callout, 0);
1056165138Syongari
1057165138Syongari	/* Kick off timeout driven events by calling first time. */
1058165138Syongari	roundrobin(NULL);
1059165138Syongari	schedcpu(NULL);
1060165138Syongari}
1061165138Syongari
1062165138Syongari/*
1063165138Syongari * We adjust the priority of the current process.  The priority of
1064165138Syongari * a process gets worse as it accumulates CPU time.  The cpu usage
1065165138Syongari * estimator (p_estcpu) is increased here.  resetpriority() will
1066165138Syongari * compute a different priority each time p_estcpu increases by
1067165138Syongari * INVERSE_ESTCPU_WEIGHT
1068165138Syongari * (until MAXPRI is reached).  The cpu usage estimator ramps up
1069165138Syongari * quite quickly when the process is running (linearly), and decays
1070165138Syongari * away exponentially, at a rate which is proportionally slower when
1071165138Syongari * the system is busy.  The basic principle is that the system will
1072165138Syongari * 90% forget that the process used a lot of CPU time in 5 * loadav
1073165138Syongari * seconds.  This causes the system to favor processes which haven't
1074165138Syongari * run much recently, and to round-robin among other processes.
1075165138Syongari */
1076165138Syongarivoid
1077165138Syongarischedclock(p)
1078165138Syongari	struct proc *p;
1079165138Syongari{
1080165138Syongari
1081165138Syongari	p->p_cpticks++;
1082165138Syongari	p->p_estcpu = ESTCPULIM(p->p_estcpu + 1);
1083165138Syongari	if ((p->p_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) {
1084165138Syongari		resetpriority(p);
1085165138Syongari		if (p->p_priority >= PUSER)
1086165138Syongari			p->p_priority = p->p_usrpri;
1087165138Syongari	}
1088165138Syongari}
1089165138Syongari
1090165138Syongari/*
1091165138Syongari * General purpose yield system call
1092165138Syongari */
int
yield(struct proc *p, struct yield_args *uap)
{
	int s;

	/* Syscall return value: always 0. */
	p->p_retval[0] = 0;

	s = splhigh();
	mtx_enter(&sched_lock, MTX_SPIN);
	/* Release Giant without switching; sched_lock covers the switch. */
	DROP_GIANT_NOSWITCH();
	/* Demote to the worst priority so everyone else runs first. */
	p->p_priority = MAXPRI;
	/* Requeue ourselves before switching so we run again eventually. */
	setrunqueue(p);
	p->p_stats->p_ru.ru_nvcsw++;
	mi_switch();
	mtx_exit(&sched_lock, MTX_SPIN);
	PICKUP_GIANT();
	splx(s);

	return (0);
}
1113193293Syongari