/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
 * $FreeBSD: head/sys/kern/kern_synch.c 60938 2000-05-26 02:09:24Z jake $
 */

#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/signalvar.h>
#include <sys/resourcevar.h>
#include <sys/vmmeter.h>
#include <sys/sysctl.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
#ifdef KTRACE
#include <sys/uio.h>
#include <sys/ktrace.h>
#endif

#include <machine/cpu.h>
#include <machine/ipl.h>
#include <machine/smp.h>

static void sched_setup __P((void *dummy));
SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL)

u_char	curpriority;
int	hogticks;
int	lbolt;
int	sched_quantum;		/* Roundrobin scheduling quantum in ticks. */

static int	curpriority_cmp __P((struct proc *p));
static void	endtsleep __P((void *));
static void	maybe_resched __P((struct proc *chk));
static void	roundrobin __P((void *arg));
static void	schedcpu __P((void *arg));
static void	updatepri __P((struct proc *p));

static int
sysctl_kern_quantum SYSCTL_HANDLER_ARGS
{
	int error, new_val;

	new_val = sched_quantum * tick;
	error = sysctl_handle_int(oidp, &new_val, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (new_val < tick)
		return (EINVAL);
	sched_quantum = new_val / tick;
	hogticks = 2 * sched_quantum;
	return (0);
}

SYSCTL_PROC(_kern, OID_AUTO, quantum, CTLTYPE_INT|CTLFLAG_RW,
	0, sizeof sched_quantum, sysctl_kern_quantum, "I", "");
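
/*
 * Editor's note: a worked example of the kern.quantum knob above, assuming
 * the common hz=100 configuration (so `tick' is 10000 microseconds); the
 * exact numbers depend on the kernel's clock settings.
 *
 * sleepinit() below sets sched_quantum = hz / 10 = 10 ticks, so the sysctl
 * reports 10 * 10000 = 100000 us.  A hypothetical
 *
 *	sysctl -w kern.quantum=50000
 *
 * stores 50000 / tick = 5 ticks in sched_quantum and sets hogticks to 10;
 * any value below one tick is rejected with EINVAL.
 */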

/*-
 * Compare priorities.  Return:
 *     <0: priority of p < current priority
 *      0: priority of p == current priority
 *     >0: priority of p > current priority
 * The priorities are the normal priorities or the normal realtime priorities
 * if p is on the same scheduler as curproc.  Otherwise the process on the
 * more realtimeish scheduler has lowest priority.  As usual, a higher
 * priority really means a lower priority.
 */
static int
curpriority_cmp(p)
	struct proc *p;
{
	int c_class, p_class;

	c_class = RTP_PRIO_BASE(curproc->p_rtprio.type);
	p_class = RTP_PRIO_BASE(p->p_rtprio.type);
	if (p_class != c_class)
		return (p_class - c_class);
	if (p_class == RTP_PRIO_NORMAL)
		return (((int)p->p_priority - (int)curpriority) / PPQ);
	return ((int)p->p_rtprio.prio - (int)curproc->p_rtprio.prio);
}
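
/*
 * Editor's note: a small worked example of curpriority_cmp(), assuming PPQ
 * (priorities per run queue) has its historical value of 4; check the
 * headers for the value actually in effect.  With both processes in the
 * RTP_PRIO_NORMAL class, p->p_priority == 60 and curpriority == 70 make
 * (60 - 70) / PPQ negative, meaning p sits in a better (lower-numbered)
 * queue, so maybe_resched() below will call need_resched().
 */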

/*
 * Arrange to reschedule if necessary, taking the priorities and
 * schedulers into account.
 */
static void
maybe_resched(chk)
	struct proc *chk;
{
	struct proc *p = curproc; /* XXX */

	/*
	 * XXX idle scheduler still broken because the process stays on the
	 * idle scheduler during waits (such as when getting FS locks).  If a
	 * standard process becomes runaway cpu-bound, the system can lock up
	 * due to idle-scheduler processes in wakeup never getting any cpu.
	 */
	if (p == NULL) {
#if 0
		need_resched();
#endif
	} else if (chk == p) {
		/* We may need to yield if our priority has been raised. */
		if (curpriority_cmp(chk) > 0)
			need_resched();
	} else if (curpriority_cmp(chk) < 0)
		need_resched();
}

int
roundrobin_interval(void)
{
	return (sched_quantum);
}

/*
 * Force switch among equal priority processes every 100ms.
 */
/* ARGSUSED */
static void
roundrobin(arg)
	void *arg;
{
#ifndef SMP
	struct proc *p = curproc; /* XXX */
#endif

#ifdef SMP
	need_resched();
	forward_roundrobin();
#else
	if (p == 0 || RTP_PRIO_NEED_RR(p->p_rtprio.type))
		need_resched();
#endif

	timeout(roundrobin, NULL, sched_quantum);
}

/*
 * Constants for digital decay and forget:
 *	90% of (p_estcpu) usage in 5 * loadav time
 *	95% of (p_pctcpu) usage in 60 seconds (load insensitive)
 *          Note that, as ps(1) mentions, this can let percentages
 *          total over 100% (I've seen 137.9% for 3 processes).
 *
 * Note that schedclock() updates p_estcpu and p_cpticks asynchronously.
 *
 * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds.
 * That is, the system wants to compute a value of decay such
 * that the following for loop:
 * 	for (i = 0; i < (5 * loadavg); i++)
 * 		p_estcpu *= decay;
 * will compute
 * 	p_estcpu *= 0.1;
 * for all values of loadavg:
 *
 * Mathematically this loop can be expressed by saying:
 * 	decay ** (5 * loadavg) ~= .1
 *
 * The system computes decay as:
 * 	decay = (2 * loadavg) / (2 * loadavg + 1)
 *
 * We wish to prove that the system's computation of decay
 * will always fulfill the equation:
 * 	decay ** (5 * loadavg) ~= .1
 *
 * If we compute b as:
 * 	b = 2 * loadavg
 * then
 * 	decay = b / (b + 1)
 *
 * We now need to prove two things:
 *	1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
 *	2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
 *
 * Facts:
 *         For x close to zero, exp(x) =~ 1 + x, since
 *              exp(x) = 0! + x**1/1! + x**2/2! + ... .
 *              therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
 *         For x close to zero, ln(1+x) =~ x, since
 *              ln(1+x) = x - x**2/2 + x**3/3 - ...     -1 < x < 1
 *              therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
 *         ln(.1) =~ -2.30
 *
 * Proof of (1):
 *    Solve (factor)**(power) =~ .1 given power (5*loadav):
 *	solving for factor,
 *      ln(factor) =~ (-2.30/5*loadav), or
 *      factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
 *          exp(-1/b) =~ (b-1)/b =~ b/(b+1).                    QED
 *
 * Proof of (2):
 *    Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
 *	solving for power,
 *      power*ln(b/(b+1)) =~ -2.30, or
 *      power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.  QED
 *
 * Actual power values for the implemented algorithm are as follows:
 *      loadav: 1       2       3       4
 *      power:  5.68    10.32   14.94   19.55
 */

/* calculations for digital decay to forget 90% of usage in 5*loadav sec */
#define	loadfactor(loadav)	(2 * (loadav))
#define	decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))
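
/*
 * Editor's note: a numeric check of the decay machinery above, assuming the
 * usual FSCALE of 2048 (FSHIFT == 11).  The load average is kept in fixed
 * point, so a load of 1.00 has ldavg[0] == 2048 and loadfactor() yields
 * 4096.  Then
 *
 *	decay_cpu(4096, cpu) == (4096 * cpu) / (4096 + 2048) == (2/3) * cpu
 *
 * which matches decay = (2 * loadavg) / (2 * loadavg + 1) = 2/3 for
 * loadavg 1.  Applying that factor 5.68 times (the table entry above)
 * multiplies p_estcpu by (2/3)^5.68 =~ exp(-2.30) =~ 0.1, i.e. 90% of the
 * estimated cpu usage is forgotten.
 */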

/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
static fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;	/* exp(-1/20) */
SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");

/* kernel uses `FSCALE', userland (SHOULD) use kern.fscale */
static int	fscale __unused = FSCALE;
SYSCTL_INT(_kern, OID_AUTO, fscale, CTLFLAG_RD, 0, FSCALE, "");

/*
 * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
 * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
 * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
 *
 * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
 *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
 *
 * If you don't want to bother with the faster/more-accurate formula, you
 * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
 * (more general) method of calculating the %age of CPU used by a process.
 */
#define	CCPU_SHIFT	11
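
/*
 * Editor's note: how the CCPU_SHIFT estimate above plays out, assuming
 * FSHIFT == 11 (FSCALE == 2048) and realstathz == 100.  The exact
 * per-second update in schedcpu() would be
 *
 *	p_pctcpu += ((FSCALE - ccpu) * (p_cpticks * FSCALE / 100)) >> FSHIFT
 *		 =~ 0.0488 * (p_cpticks * 2048 / 100)
 *		 =~ p_cpticks
 *
 * because 1 - exp(-1/20) =~ 0.0488 =~ 100 / 2^11.  That is why the fast
 * path simply adds p_cpticks << (FSHIFT - CCPU_SHIFT), which is a plain
 * p_cpticks when FSHIFT == CCPU_SHIFT == 11.
 */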

/*
 * Recompute process priorities, every hz ticks.
 */
/* ARGSUSED */
static void
schedcpu(arg)
	void *arg;
{
	register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
	register struct proc *p;
	register int realstathz, s;

	realstathz = stathz ? stathz : hz;
	LIST_FOREACH(p, &allproc, p_list) {
		/*
		 * Increment time in/out of memory and sleep time
		 * (if sleeping).  We ignore overflow; with 16-bit int's
		 * (remember them?) overflow takes 45 days.
		 */
		p->p_swtime++;
		if (p->p_stat == SSLEEP || p->p_stat == SSTOP)
			p->p_slptime++;
		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
		/*
		 * If the process has slept the entire second,
		 * stop recalculating its priority until it wakes up.
		 */
		if (p->p_slptime > 1)
			continue;
		s = splhigh();	/* prevent state changes and protect run queue */
		/*
		 * p_pctcpu is only for ps.
		 */
#if	(FSHIFT >= CCPU_SHIFT)
		p->p_pctcpu += (realstathz == 100)?
			((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT):
			100 * (((fixpt_t) p->p_cpticks)
				<< (FSHIFT - CCPU_SHIFT)) / realstathz;
#else
		p->p_pctcpu += ((FSCALE - ccpu) *
			(p->p_cpticks * FSCALE / realstathz)) >> FSHIFT;
#endif
		p->p_cpticks = 0;
		p->p_estcpu = decay_cpu(loadfac, p->p_estcpu);
		resetpriority(p);
		if (p->p_priority >= PUSER) {
			if ((p != curproc) &&
#ifdef SMP
			    p->p_oncpu == 0xff &&	/* idle */
#endif
			    p->p_stat == SRUN &&
			    (p->p_flag & P_INMEM) &&
			    (p->p_priority / PPQ) != (p->p_usrpri / PPQ)) {
				remrunqueue(p);
				p->p_priority = p->p_usrpri;
				setrunqueue(p);
			} else
				p->p_priority = p->p_usrpri;
		}
		splx(s);
	}
	vmmeter();
	wakeup((caddr_t)&lbolt);
	timeout(schedcpu, (void *)0, hz);
}
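
/*
 * Editor's note: a rough steady-state example for the p_pctcpu filter in
 * schedcpu() above, assuming stathz == 100 and FSCALE == 2048.  A process
 * that is on cpu for an entire second accumulates p_cpticks == 100, so
 * p_pctcpu converges on 100 / (1 - exp(-1/20)) =~ 2050, and ps(1), which
 * reports 100 * p_pctcpu / FSCALE, shows roughly 100%.  A process that got
 * half the cpu converges on about 1025, i.e. roughly 50%.
 */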

/*
 * Recalculate the priority of a process after it has slept for a while.
 * For all load averages >= 1 and max p_estcpu of 255, sleeping for at
 * least six times the loadfactor will decay p_estcpu to zero.
 */
static void
updatepri(p)
	register struct proc *p;
{
	register unsigned int newcpu = p->p_estcpu;
	register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);

	if (p->p_slptime > 5 * loadfac)
		p->p_estcpu = 0;
	else {
		p->p_slptime--;	/* the first time was done in schedcpu */
		while (newcpu && --p->p_slptime)
			newcpu = decay_cpu(loadfac, newcpu);
		p->p_estcpu = newcpu;
	}
	resetpriority(p);
}

/*
 * We're only looking at 7 bits of the address; everything is
 * aligned to 4, lots of things are aligned to greater powers
 * of 2.  Shift right by 8, i.e. drop the bottom 256 worth.
 */
#define TABLESIZE	128
static TAILQ_HEAD(slpquehead, proc) slpque[TABLESIZE];
#define LOOKUP(x)	(((intptr_t)(x) >> 8) & (TABLESIZE - 1))
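
/*
 * Editor's note: a sample hash, using a made-up wait channel address.  For
 * ident == (void *)0xc0f5e8a0, LOOKUP() keeps bits 8..14 of the address:
 * (0xc0f5e8a0 >> 8) & 127 == 104, so that sleeper lands on slpque[104].
 * The low eight bits are discarded because alignment tends to force them
 * toward zero, which would make them poor hash input.
 */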

/*
 * During autoconfiguration or after a panic, a sleep will simply
 * lower the priority briefly to allow interrupts, then return.
 * The priority to be used (safepri) is machine-dependent, thus this
 * value is initialized and maintained in the machine-dependent layers.
 * This priority will typically be 0, or the lowest priority
 * that is safe for use on the interrupt stack; it can be made
 * higher to block network software interrupts after panics.
 */
int safepri;

void
sleepinit(void)
{
	int i;

	sched_quantum = hz/10;
	hogticks = 2 * sched_quantum;
	for (i = 0; i < TABLESIZE; i++)
		TAILQ_INIT(&slpque[i]);
}

/*
 * General sleep call.  Suspends the current process until a wakeup is
 * performed on the specified identifier.  The process will then be made
 * runnable with the specified priority.  Sleeps at most timo/hz seconds
 * (0 means no timeout).  If pri includes the PCATCH flag, signals are
 * checked before and after sleeping, otherwise signals are not checked.
 * Returns 0 if awakened, EWOULDBLOCK if the timeout expires.  If PCATCH
 * is set and a signal needs to be delivered, ERESTART is returned if the
 * current system call should be restarted if possible, and EINTR is
 * returned if the system call should be interrupted by the signal.
 */
int
tsleep(ident, priority, wmesg, timo)
	void *ident;
	int priority, timo;
	const char *wmesg;
{
	struct proc *p = curproc;
	int s, sig, catch = priority & PCATCH;
	struct callout_handle thandle;

#ifdef KTRACE
	if (p && KTRPOINT(p, KTR_CSW))
		ktrcsw(p->p_tracep, 1, 0);
#endif
	s = splhigh();
	if (cold || panicstr) {
		/*
		 * After a panic, or during autoconfiguration,
		 * just give interrupts a chance, then just return;
		 * don't run any other procs or panic below,
		 * in case this is the idle process and already asleep.
		 */
		splx(safepri);
		splx(s);
		return (0);
	}
	KASSERT(p != NULL, ("tsleep1"));
	KASSERT(ident != NULL && p->p_stat == SRUN, ("tsleep"));
	/*
	 * Process may be sitting on a slpque if asleep() was called, remove
	 * it before re-adding.
	 */
	if (p->p_wchan != NULL)
		unsleep(p);

	p->p_wchan = ident;
	p->p_wmesg = wmesg;
	p->p_slptime = 0;
	p->p_priority = priority & PRIMASK;
	TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], p, p_procq);
	if (timo)
		thandle = timeout(endtsleep, (void *)p, timo);
	/*
	 * We put ourselves on the sleep queue and start our timeout
	 * before calling CURSIG, as we could stop there, and a wakeup
	 * or a SIGCONT (or both) could occur while we were stopped.
	 * A SIGCONT would cause us to be marked as SSLEEP
	 * without resuming us, thus we must be ready for sleep
	 * when CURSIG is called.  If the wakeup happens while we're
	 * stopped, p->p_wchan will be 0 upon return from CURSIG.
	 */
	if (catch) {
		p->p_flag |= P_SINTR;
		if ((sig = CURSIG(p))) {
			if (p->p_wchan)
				unsleep(p);
			p->p_stat = SRUN;
			goto resume;
		}
		if (p->p_wchan == 0) {
			catch = 0;
			goto resume;
		}
	} else
		sig = 0;
	p->p_stat = SSLEEP;
	p->p_stats->p_ru.ru_nvcsw++;
	mi_switch();
resume:
	curpriority = p->p_usrpri;
	splx(s);
	p->p_flag &= ~P_SINTR;
	if (p->p_flag & P_TIMEOUT) {
		p->p_flag &= ~P_TIMEOUT;
		if (sig == 0) {
#ifdef KTRACE
			if (KTRPOINT(p, KTR_CSW))
				ktrcsw(p->p_tracep, 0, 0);
#endif
			return (EWOULDBLOCK);
		}
	} else if (timo)
		untimeout(endtsleep, (void *)p, thandle);
	if (catch && (sig != 0 || (sig = CURSIG(p)))) {
#ifdef KTRACE
		if (KTRPOINT(p, KTR_CSW))
			ktrcsw(p->p_tracep, 0, 0);
#endif
		if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig))
			return (EINTR);
		return (ERESTART);
	}
#ifdef KTRACE
	if (KTRPOINT(p, KTR_CSW))
		ktrcsw(p->p_tracep, 0, 0);
#endif
	return (0);
}
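
/*
 * Editor's note: a minimal usage sketch for tsleep()/wakeup(), not taken
 * from this file; `xp', `xp->flags' and XP_BUSY are hypothetical.  A waiter
 * typically loops, because a wakeup only means the condition should be
 * re-checked:
 *
 *	s = splhigh();
 *	while (xp->flags & XP_BUSY) {
 *		error = tsleep(xp, PWAIT | PCATCH, "xpbusy", 0);
 *		if (error)
 *			break;		(EINTR or ERESTART from a signal)
 *	}
 *	splx(s);
 *
 * and the owner clears XP_BUSY and calls wakeup(xp) when it is done.
 */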

/*
 * asleep() - async sleep call.  Place process on wait queue and return
 * immediately without blocking.  The process stays runnable until await()
 * is called.  If ident is NULL, remove process from wait queue if it is still
 * on one.
 *
 * Only the most recent sleep condition is effective when making successive
 * calls to asleep() or when calling tsleep().
 *
 * The timeout, if any, is not initiated until await() is called.  The sleep
 * priority, signal, and timeout are specified in the asleep() call but may
 * be overridden in the await() call.
 *
 * <<<<<<<< EXPERIMENTAL, UNTESTED >>>>>>>>>>
 */

int
asleep(void *ident, int priority, const char *wmesg, int timo)
{
	struct proc *p = curproc;
	int s;

	/*
	 * splhigh() while manipulating sleep structures and slpque.
	 *
	 * Remove preexisting wait condition (if any) and place process
	 * on appropriate slpque, but do not put process to sleep.
	 */

	s = splhigh();

	if (p->p_wchan != NULL)
		unsleep(p);

	if (ident) {
		p->p_wchan = ident;
		p->p_wmesg = wmesg;
		p->p_slptime = 0;
		p->p_asleep.as_priority = priority;
		p->p_asleep.as_timo = timo;
		TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], p, p_procq);
	}

	splx(s);

	return(0);
}

/*
 * await() - wait for async condition to occur.  The process blocks until
 * wakeup() is called on the most recent asleep() address.  If wakeup() is
 * called prior to await(), await() winds up being a NOP.
 *
 * If await() is called more than once (without an intervening asleep() call),
 * await() is still effectively a NOP but it calls mi_switch() to give other
 * processes some cpu before returning.  The process is left runnable.
 *
 * <<<<<<<< EXPERIMENTAL, UNTESTED >>>>>>>>>>
 */

int
await(int priority, int timo)
{
	struct proc *p = curproc;
	int s;

	s = splhigh();

	if (p->p_wchan != NULL) {
		struct callout_handle thandle;
		int sig;
		int catch;

		/*
		 * The call to await() can override defaults specified in
		 * the original asleep().
		 */
		if (priority < 0)
			priority = p->p_asleep.as_priority;
		if (timo < 0)
			timo = p->p_asleep.as_timo;

		/*
		 * Install timeout
		 */

		if (timo)
			thandle = timeout(endtsleep, (void *)p, timo);

		sig = 0;
		catch = priority & PCATCH;

		if (catch) {
			p->p_flag |= P_SINTR;
			if ((sig = CURSIG(p))) {
				if (p->p_wchan)
					unsleep(p);
				p->p_stat = SRUN;
				goto resume;
			}
			if (p->p_wchan == NULL) {
				catch = 0;
				goto resume;
			}
		}
		p->p_stat = SSLEEP;
		p->p_stats->p_ru.ru_nvcsw++;
		mi_switch();
resume:
		curpriority = p->p_usrpri;

		splx(s);
		p->p_flag &= ~P_SINTR;
		if (p->p_flag & P_TIMEOUT) {
			p->p_flag &= ~P_TIMEOUT;
			if (sig == 0) {
#ifdef KTRACE
				if (KTRPOINT(p, KTR_CSW))
					ktrcsw(p->p_tracep, 0, 0);
#endif
				return (EWOULDBLOCK);
			}
		} else if (timo)
			untimeout(endtsleep, (void *)p, thandle);
		if (catch && (sig != 0 || (sig = CURSIG(p)))) {
#ifdef KTRACE
			if (KTRPOINT(p, KTR_CSW))
				ktrcsw(p->p_tracep, 0, 0);
#endif
			if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig))
				return (EINTR);
			return (ERESTART);
		}
#ifdef KTRACE
		if (KTRPOINT(p, KTR_CSW))
			ktrcsw(p->p_tracep, 0, 0);
#endif
	} else {
		/*
		 * If as_priority is 0, await() has been called without an
		 * intervening asleep().  We are still effectively a NOP,
		 * but we call mi_switch() for safety.
		 */

		if (p->p_asleep.as_priority == 0) {
			p->p_stats->p_ru.ru_nvcsw++;
			mi_switch();
		}
		splx(s);
	}

	/*
	 * clear p_asleep.as_priority as an indication that await() has been
	 * called.  If await() is called again without an intervening asleep(),
	 * await() is still effectively a NOP but the above mi_switch() code
	 * is triggered as a safety.
	 */
	p->p_asleep.as_priority = 0;

	return (0);
}
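
/*
 * Editor's note: a hypothetical asleep()/await() sequence, mirroring the
 * (experimental, untested) description above; `obj' and "objwt" are made-up
 * names.  The wait condition is registered first, other work is done while
 * the process remains runnable, and the block happens only in await():
 *
 *	asleep(obj, PWAIT, "objwt", hz);	(record wait channel + defaults)
 *	... start the operation that will eventually wakeup(obj) ...
 *	error = await(-1, -1);			(-1, -1: keep asleep() values)
 *
 * If wakeup(obj) arrives before await(), p_wchan has already been cleared
 * and await() returns without sleeping, as documented above.
 */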

/*
 * Implement timeout for tsleep or asleep()/await()
 *
 * If process hasn't been awakened (wchan non-zero),
 * set timeout flag and undo the sleep.  If proc
 * is stopped, just unsleep so it will remain stopped.
 */
static void
endtsleep(arg)
	void *arg;
{
	register struct proc *p;
	int s;

	p = (struct proc *)arg;
	s = splhigh();
	if (p->p_wchan) {
		if (p->p_stat == SSLEEP)
			setrunnable(p);
		else
			unsleep(p);
		p->p_flag |= P_TIMEOUT;
	}
	splx(s);
}

/*
 * Remove a process from its wait queue
 */
void
unsleep(p)
	register struct proc *p;
{
	int s;

	s = splhigh();
	if (p->p_wchan) {
		TAILQ_REMOVE(&slpque[LOOKUP(p->p_wchan)], p, p_procq);
		p->p_wchan = 0;
	}
	splx(s);
}

/*
 * Make all processes sleeping on the specified identifier runnable.
 */
void
wakeup(ident)
	register void *ident;
{
	register struct slpquehead *qp;
	register struct proc *p;
	int s;

	s = splhigh();
	qp = &slpque[LOOKUP(ident)];
restart:
	TAILQ_FOREACH(p, qp, p_procq) {
		if (p->p_wchan == ident) {
			TAILQ_REMOVE(qp, p, p_procq);
			p->p_wchan = 0;
			if (p->p_stat == SSLEEP) {
				/* OPTIMIZED EXPANSION OF setrunnable(p); */
				if (p->p_slptime > 1)
					updatepri(p);
				p->p_slptime = 0;
				p->p_stat = SRUN;
				if (p->p_flag & P_INMEM) {
					setrunqueue(p);
					maybe_resched(p);
				} else {
					p->p_flag |= P_SWAPINREQ;
					wakeup((caddr_t)&proc0);
				}
				/* END INLINE EXPANSION */
				goto restart;
			}
		}
	}
	splx(s);
}

/*
 * Make a process sleeping on the specified identifier runnable.
 * May wake more than one process if a target process is currently
 * swapped out.
 */
void
wakeup_one(ident)
	register void *ident;
{
	register struct slpquehead *qp;
	register struct proc *p;
	int s;

	s = splhigh();
	qp = &slpque[LOOKUP(ident)];

	TAILQ_FOREACH(p, qp, p_procq) {
		if (p->p_wchan == ident) {
			TAILQ_REMOVE(qp, p, p_procq);
			p->p_wchan = 0;
			if (p->p_stat == SSLEEP) {
				/* OPTIMIZED EXPANSION OF setrunnable(p); */
				if (p->p_slptime > 1)
					updatepri(p);
				p->p_slptime = 0;
				p->p_stat = SRUN;
				if (p->p_flag & P_INMEM) {
					setrunqueue(p);
					maybe_resched(p);
					break;
				} else {
					p->p_flag |= P_SWAPINREQ;
					wakeup((caddr_t)&proc0);
				}
				/* END INLINE EXPANSION */
			}
		}
	}
	splx(s);
}

/*
 * The machine independent parts of mi_switch().
 * Must be called at splstatclock() or higher.
 */
void
mi_switch()
{
	struct timeval new_switchtime;
	register struct proc *p = curproc;	/* XXX */
	register struct rlimit *rlim;
	int x;

	/*
	 * XXX this spl is almost unnecessary.  It is partly to allow for
	 * sloppy callers that don't do it (issignal() via CURSIG() is the
	 * main offender).  It is partly to work around a bug in the i386
	 * cpu_switch() (the ipl is not preserved).  We ran for years
	 * without it.  I think there was only an interrupt latency problem.
	 * The main caller, tsleep(), does an splx() a couple of instructions
	 * after calling here.  The buggy caller, issignal(), usually calls
	 * here at spl0() and sometimes returns at splhigh().  The process
	 * then runs for a little too long at splhigh().  The ipl gets fixed
	 * when the process returns to user mode (or earlier).
	 *
	 * It would probably be better to always call here at spl0(). Callers
	 * are prepared to give up control to another process, so they must
	 * be prepared to be interrupted.  The clock stuff here may not
	 * actually need splstatclock().
	 */
	x = splstatclock();

#ifdef SIMPLELOCK_DEBUG
	if (p->p_simple_locks)
		printf("sleep: holding simple lock\n");
#endif
	/*
	 * Compute the amount of time during which the current
	 * process was running, and add that to its total so far.
	 */
	microuptime(&new_switchtime);
	if (timevalcmp(&new_switchtime, &switchtime, <)) {
		printf("microuptime() went backwards (%ld.%06ld -> %ld.%06ld)\n",
		    switchtime.tv_sec, switchtime.tv_usec,
		    new_switchtime.tv_sec, new_switchtime.tv_usec);
		new_switchtime = switchtime;
	} else {
		p->p_runtime += (new_switchtime.tv_usec - switchtime.tv_usec) +
		    (new_switchtime.tv_sec - switchtime.tv_sec) * (int64_t)1000000;
	}

	/*
	 * Check if the process exceeds its cpu resource allocation.
	 * If over max, kill it.
	 */
	if (p->p_stat != SZOMB && p->p_limit->p_cpulimit != RLIM_INFINITY &&
	    p->p_runtime > p->p_limit->p_cpulimit) {
		rlim = &p->p_rlimit[RLIMIT_CPU];
		if (p->p_runtime / (rlim_t)1000000 >= rlim->rlim_max) {
			killproc(p, "exceeded maximum CPU limit");
		} else {
			psignal(p, SIGXCPU);
			if (rlim->rlim_cur < rlim->rlim_max) {
				/* XXX: we should make a private copy */
				rlim->rlim_cur += 5;
			}
		}
	}

	/*
	 * Pick a new current process and record its start time.
	 */
	cnt.v_swtch++;
	switchtime = new_switchtime;
	cpu_switch(p);
	if (switchtime.tv_sec == 0)
		microuptime(&switchtime);
	switchticks = ticks;

	splx(x);
}

/*
 * Change process state to be runnable,
 * placing it on the run queue if it is in memory,
 * and awakening the swapper if it isn't in memory.
 */
void
setrunnable(p)
	register struct proc *p;
{
	register int s;

	s = splhigh();
	switch (p->p_stat) {
	case 0:
	case SRUN:
	case SZOMB:
	default:
		panic("setrunnable");
	case SSTOP:
	case SSLEEP:
		unsleep(p);		/* e.g. when sending signals */
		break;

	case SIDL:
		break;
	}
	p->p_stat = SRUN;
	if (p->p_flag & P_INMEM)
		setrunqueue(p);
	splx(s);
	if (p->p_slptime > 1)
		updatepri(p);
	p->p_slptime = 0;
	if ((p->p_flag & P_INMEM) == 0) {
		p->p_flag |= P_SWAPINREQ;
		wakeup((caddr_t)&proc0);
	}
	else
		maybe_resched(p);
}

/*
 * Compute the priority of a process when running in user mode.
 * Arrange to reschedule if the resulting priority is better
 * than that of the current process.
 */
void
resetpriority(p)
	register struct proc *p;
{
	register unsigned int newpriority;

	if (p->p_rtprio.type == RTP_PRIO_NORMAL) {
		newpriority = PUSER + p->p_estcpu / INVERSE_ESTCPU_WEIGHT +
		    NICE_WEIGHT * (p->p_nice - PRIO_MIN);
		newpriority = min(newpriority, MAXPRI);
		p->p_usrpri = newpriority;
	}
	maybe_resched(p);
}
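
/*
 * Editor's note: a worked instance of the formula in resetpriority(),
 * assuming the era's usual constants (PUSER 50, INVERSE_ESTCPU_WEIGHT 8,
 * NICE_WEIGHT 2, PRIO_MIN -20, MAXPRI 127); check the headers for the
 * values actually in effect.  A nice-0 process with p_estcpu == 40 gets
 *
 *	newpriority = 50 + 40 / 8 + 2 * (0 - (-20)) = 95
 *
 * while the same process reniced to -20 gets 50 + 5 + 0 = 55.  Larger
 * numbers are worse, and min() clamps the result at MAXPRI.
 */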

/* ARGSUSED */
static void
sched_setup(dummy)
	void *dummy;
{
	/* Kick off timeout driven events by calling first time. */
	roundrobin(NULL);
	schedcpu(NULL);
}

/*
 * We adjust the priority of the current process.  The priority of
 * a process gets worse as it accumulates CPU time.  The cpu usage
 * estimator (p_estcpu) is increased here.  resetpriority() will
 * compute a different priority each time p_estcpu increases by
 * INVERSE_ESTCPU_WEIGHT (until MAXPRI is reached).  The cpu usage
 * estimator ramps up quite quickly when the process is running
 * (linearly), and decays away exponentially, at a rate which is
 * proportionally slower when the system is busy.  The basic principle
 * is that the system will 90% forget that the process used a lot of
 * CPU time in 5 * loadav seconds.  This causes the system to favor
 * processes which haven't run much recently, and to round-robin
 * among other processes.
 */
void
schedclock(p)
	struct proc *p;
{

	p->p_cpticks++;
	p->p_estcpu = ESTCPULIM(p->p_estcpu + 1);
	if ((p->p_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) {
		resetpriority(p);
		if (p->p_priority >= PUSER)
			p->p_priority = p->p_usrpri;
	}
}
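
/*
 * Editor's note: what the ramp described above looks like in practice,
 * assuming stathz is 128 (a common setting) and INVERSE_ESTCPU_WEIGHT is 8.
 * schedclock() runs once per statclock tick, so a cpu-bound process gains
 * roughly 128 p_estcpu per second and has its user priority recomputed
 * about 16 times per second, each recomputation making p_usrpri one step
 * worse until ESTCPULIM and MAXPRI cap it; schedcpu() above decays p_estcpu
 * once per second, so the estimate falls off exponentially once the process
 * stops running.
 */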