/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
 * $FreeBSD: head/sys/kern/kern_synch.c 53944 1999-11-30 09:01:46Z peter $
 */

#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/signalvar.h>
#include <sys/resourcevar.h>
#include <sys/vmmeter.h>
#include <sys/sysctl.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
#ifdef KTRACE
#include <sys/uio.h>
#include <sys/ktrace.h>
#endif

#include <machine/cpu.h>
#ifdef SMP
#include <machine/smp.h>
#endif

static void sched_setup __P((void *dummy));
SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL)

u_char	curpriority;
int	hogticks;
int	lbolt;
int	sched_quantum;		/* Roundrobin scheduling quantum in ticks. */

static void	endtsleep __P((void *));
static void	roundrobin __P((void *arg));
static void	schedcpu __P((void *arg));
static void	updatepri __P((struct proc *p));

static int
sysctl_kern_quantum SYSCTL_HANDLER_ARGS
{
	int error, new_val;

	new_val = sched_quantum * tick;
	error = sysctl_handle_int(oidp, &new_val, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (new_val < tick)
		return (EINVAL);
	sched_quantum = new_val / tick;
	hogticks = 2 * sched_quantum;
	return (0);
}

SYSCTL_PROC(_kern, OID_AUTO, quantum, CTLTYPE_INT|CTLFLAG_RW,
	0, sizeof sched_quantum, sysctl_kern_quantum, "I", "");
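
/*
 * Editor-added sketch of how the kern.quantum knob above is consumed from
 * userland.  The handler exports the quantum in microseconds
 * (sched_quantum * tick) and converts a new value back into ticks, rejecting
 * anything smaller than one tick.  A minimal sketch, assuming a standard
 * FreeBSD userland with sysctlbyname(3); it is not part of the kernel and is
 * excluded from compilation.
 */
#if 0	/* userland example only */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int quantum_us;
	size_t len = sizeof(quantum_us);

	/* Read the current scheduling quantum, reported in microseconds. */
	if (sysctlbyname("kern.quantum", &quantum_us, &len, NULL, 0) == -1) {
		perror("sysctlbyname(kern.quantum)");
		return (1);
	}
	printf("scheduling quantum: %d us\n", quantum_us);

	/*
	 * Writing a new value back is symmetric; the kernel handler divides
	 * by `tick' to recompute sched_quantum and refreshes hogticks.
	 */
	return (0);
}
#endif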

/*
 * Decide whether we need to reschedule, taking the priorities and
 * schedulers of the current and the checked process into account.
 */
static void maybe_resched(struct proc *chk)
{
	struct proc *p = curproc; /* XXX */

	/*
	 * Compare priorities if the new process is on the same scheduler,
	 * otherwise the one on the more realtimeish scheduler wins.
	 *
	 * XXX idle scheduler still broken because a process stays on the
	 * idle scheduler during waits (such as when getting FS locks).  If a
	 * standard process becomes runaway cpu-bound, the system can lock up
	 * due to idle-scheduler processes in wakeup never getting any cpu.
	 */
	if (p == 0 ||
		(chk->p_priority < curpriority && RTP_PRIO_BASE(p->p_rtprio.type) == RTP_PRIO_BASE(chk->p_rtprio.type)) ||
		RTP_PRIO_BASE(chk->p_rtprio.type) < RTP_PRIO_BASE(p->p_rtprio.type)
	) {
		need_resched();
	}
}

int
roundrobin_interval(void)
{
	return (sched_quantum);
}

/*
 * Force switch among equal priority processes every 100ms.
 */
/* ARGSUSED */
static void
roundrobin(arg)
	void *arg;
{
#ifndef SMP
	struct proc *p = curproc; /* XXX */
#endif

#ifdef SMP
	need_resched();
	forward_roundrobin();
#else
	if (p == 0 || RTP_PRIO_NEED_RR(p->p_rtprio.type))
		need_resched();
#endif

	timeout(roundrobin, NULL, sched_quantum);
}

/*
 * Constants for digital decay and forget:
 *	90% of (p_estcpu) usage in 5 * loadav time
 *	95% of (p_pctcpu) usage in 60 seconds (load insensitive)
 *          Note that, as ps(1) mentions, this can let percentages
 *          total over 100% (I've seen 137.9% for 3 processes).
 *
 * Note that schedclock() updates p_estcpu and p_cpticks asynchronously.
 *
 * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds.
 * That is, the system wants to compute a value of decay such
 * that the following for loop:
 * 	for (i = 0; i < (5 * loadavg); i++)
 * 		p_estcpu *= decay;
 * will compute
 * 	p_estcpu *= 0.1;
 * for all values of loadavg:
 *
 * Mathematically this loop can be expressed by saying:
 * 	decay ** (5 * loadavg) ~= .1
 *
 * The system computes decay as:
 * 	decay = (2 * loadavg) / (2 * loadavg + 1)
 *
 * We wish to prove that the system's computation of decay
 * will always fulfill the equation:
 * 	decay ** (5 * loadavg) ~= .1
 *
 * If we compute b as:
 * 	b = 2 * loadavg
 * then
 * 	decay = b / (b + 1)
 *
 * We now need to prove two things:
 *	1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
 *	2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
 *
 * Facts:
 *         For x close to zero, exp(x) =~ 1 + x, since
 *              exp(x) = 0! + x**1/1! + x**2/2! + ... .
 *              therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
 *         For x close to zero, ln(1+x) =~ x, since
 *              ln(1+x) = x - x**2/2 + x**3/3 - ...     -1 < x < 1
 *              therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
 *         ln(.1) =~ -2.30
 *
 * Proof of (1):
 *    Solve (factor)**(power) =~ .1 given power (5*loadav):
 *	solving for factor,
 *      ln(factor) =~ (-2.30/5*loadav), or
 *      factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
 *          exp(-1/b) =~ (b-1)/b =~ b/(b+1).                    QED
 *
 * Proof of (2):
 *    Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
 *	solving for power,
 *      power*ln(b/(b+1)) =~ -2.30, or
 *      power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.  QED
 *
 * Actual power values for the implemented algorithm are as follows:
 *      loadav: 1       2       3       4
 *      power:  5.68    10.32   14.94   19.55
 */
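
/*
 * Editor-added check of the derivation above.  A minimal userland sketch
 * (not compiled into the kernel) that recomputes the "actual power" table:
 * for each load average it forms decay = b/(b+1) with b = 2*loadav and
 * solves decay**power = 0.1 for power, which should come out near 5*loadav.
 */
#if 0	/* userland example only */
#include <math.h>
#include <stdio.h>

int
main(void)
{
	int loadav;

	for (loadav = 1; loadav <= 4; loadav++) {
		double b = 2.0 * loadav;
		double decay = b / (b + 1.0);
		/* decay**power = 0.1  =>  power = ln(0.1) / ln(decay) */
		double power = log(0.1) / log(decay);

		printf("loadav %d: decay %.4f power %.2f (5*loadav = %d)\n",
		    loadav, decay, power, 5 * loadav);
	}
	return (0);
}
/* Expected output matches the table above: 5.68, 10.32, 14.94, 19.55. */
#endif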

/* calculations for digital decay to forget 90% of usage in 5*loadav sec */
#define	loadfactor(loadav)	(2 * (loadav))
#define	decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))

/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
static fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;	/* exp(-1/20) */
SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");

/* kernel uses `FSCALE', userland (SHOULD) use kern.fscale */
static int	fscale __unused = FSCALE;
SYSCTL_INT(_kern, OID_AUTO, fscale, CTLFLAG_RD, 0, FSCALE, "");

/*
 * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
 * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
 * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
 *
 * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
 *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
 *
 * If you don't want to bother with the faster/more-accurate formula, you
 * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
 * (more general) method of calculating the %age of CPU used by a process.
 */
#define	CCPU_SHIFT	11
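
/*
 * Editor-added numeric check (assumption: stathz == 100, the rate the shift
 * was tuned for).  The fast path in schedcpu() below replaces the multiply
 * by (FSCALE - ccpu)/FSCALE and the divide by stathz with a single right
 * shift of CCPU_SHIFT bits, because (1 - exp(-1/20)) / 100 is within a small
 * fraction of a percent of 2**-11.  A standalone sketch, not compiled in.
 */
#if 0	/* userland example only */
#include <math.h>
#include <stdio.h>

int
main(void)
{
	double exact = (1.0 - exp(-1.0 / 20.0)) / 100.0;	/* general formula */
	double shifted = 1.0 / (1 << 11);			/* >> CCPU_SHIFT */

	printf("exact %.8f  shifted %.8f  error %.3f%%\n",
	    exact, shifted, 100.0 * (shifted - exact) / exact);
	return (0);
}
/* Prints roughly: exact 0.00048771  shifted 0.00048828  error 0.117% */
#endif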

/*
 * Recompute process priorities, every hz ticks.
 */
/* ARGSUSED */
static void
schedcpu(arg)
	void *arg;
{
	register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
	register struct proc *p;
	register int realstathz, s;

	realstathz = stathz ? stathz : hz;
	LIST_FOREACH(p, &allproc, p_list) {
		/*
		 * Increment time in/out of memory and sleep time
		 * (if sleeping).  We ignore overflow; with 16-bit int's
		 * (remember them?) overflow takes 45 days.
		 */
		p->p_swtime++;
		if (p->p_stat == SSLEEP || p->p_stat == SSTOP)
			p->p_slptime++;
		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
		/*
		 * If the process has slept the entire second,
		 * stop recalculating its priority until it wakes up.
		 */
		if (p->p_slptime > 1)
			continue;
		s = splhigh();	/* prevent state changes and protect run queue */
		/*
		 * p_pctcpu is only for ps.
		 */
#if	(FSHIFT >= CCPU_SHIFT)
		p->p_pctcpu += (realstathz == 100)?
			((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT):
			100 * (((fixpt_t) p->p_cpticks)
				<< (FSHIFT - CCPU_SHIFT)) / realstathz;
#else
		p->p_pctcpu += ((FSCALE - ccpu) *
			(p->p_cpticks * FSCALE / realstathz)) >> FSHIFT;
#endif
		p->p_cpticks = 0;
		p->p_estcpu = decay_cpu(loadfac, p->p_estcpu);
		resetpriority(p);
		if (p->p_priority >= PUSER) {
			if ((p != curproc) &&
#ifdef SMP
			    p->p_oncpu == 0xff &&	/* idle */
#endif
			    p->p_stat == SRUN &&
			    (p->p_flag & P_INMEM) &&
			    (p->p_priority / PPQ) != (p->p_usrpri / PPQ)) {
				remrunqueue(p);
				p->p_priority = p->p_usrpri;
				setrunqueue(p);
			} else
				p->p_priority = p->p_usrpri;
		}
		splx(s);
	}
	vmmeter();
	wakeup((caddr_t)&lbolt);
	timeout(schedcpu, (void *)0, hz);
}

/*
 * Recalculate the priority of a process after it has slept for a while.
 * For all load averages >= 1 and max p_estcpu of 255, sleeping for at
 * least six times the loadfactor will decay p_estcpu to zero.
 */
static void
updatepri(p)
	register struct proc *p;
{
	register unsigned int newcpu = p->p_estcpu;
	register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);

	if (p->p_slptime > 5 * loadfac)
		p->p_estcpu = 0;
	else {
		p->p_slptime--;	/* the first time was done in schedcpu */
		while (newcpu && --p->p_slptime)
			newcpu = decay_cpu(loadfac, newcpu);
		p->p_estcpu = newcpu;
	}
	resetpriority(p);
}

/*
 * We're only looking at 7 bits of the address; everything is
 * aligned to 4, lots of things are aligned to greater powers
 * of 2.  Shift right by 8, i.e. drop the bottom 256 worth.
 */
#define TABLESIZE	128
static TAILQ_HEAD(slpquehead, proc) slpque[TABLESIZE];
#define LOOKUP(x)	(((intptr_t)(x) >> 8) & (TABLESIZE - 1))
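
/*
 * Editor-added illustration of the LOOKUP() hash above.  Wait channels are
 * kernel addresses, typically at least 4-byte aligned, so the low bits carry
 * little information; dropping the bottom 8 bits and keeping the next 7
 * selects one of the TABLESIZE (128) sleep queues.  A standalone sketch with
 * a made-up address, not compiled into the kernel.
 */
#if 0	/* userland example only */
#include <stdint.h>
#include <stdio.h>

#define	EX_TABLESIZE	128
#define	EX_LOOKUP(x)	(((intptr_t)(x) >> 8) & (EX_TABLESIZE - 1))

int
main(void)
{
	int some_object;		/* stand-in for a wait channel */
	void *ident = &some_object;

	printf("ident %p hashes to slpque[%ld]\n",
	    ident, (long)EX_LOOKUP(ident));
	return (0);
}
#endif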

/*
 * During autoconfiguration or after a panic, a sleep will simply
 * lower the priority briefly to allow interrupts, then return.
 * The priority to be used (safepri) is machine-dependent, thus this
 * value is initialized and maintained in the machine-dependent layers.
 * This priority will typically be 0, or the lowest priority
 * that is safe for use on the interrupt stack; it can be made
 * higher to block network software interrupts after panics.
 */
int safepri;

void
sleepinit(void)
{
	int i;

	sched_quantum = hz/10;
	hogticks = 2 * sched_quantum;
	for (i = 0; i < TABLESIZE; i++)
		TAILQ_INIT(&slpque[i]);
}

/*
 * General sleep call.  Suspends the current process until a wakeup is
 * performed on the specified identifier.  The process will then be made
 * runnable with the specified priority.  Sleeps at most timo/hz seconds
 * (0 means no timeout).  If pri includes the PCATCH flag, signals are
 * checked before and after sleeping, else signals are not checked.
 * Returns 0 if awakened, EWOULDBLOCK if the timeout expires.  If PCATCH
 * is set and a signal needs to be delivered, ERESTART is returned if the
 * current system call should be restarted if possible, and EINTR is
 * returned if the system call should be interrupted by the signal.
 */
int
tsleep(ident, priority, wmesg, timo)
	void *ident;
	int priority, timo;
	const char *wmesg;
{
	struct proc *p = curproc;
	int s, sig, catch = priority & PCATCH;
	struct callout_handle thandle;

#ifdef KTRACE
	if (p && KTRPOINT(p, KTR_CSW))
		ktrcsw(p->p_tracep, 1, 0);
#endif
	s = splhigh();
	if (cold || panicstr) {
		/*
		 * After a panic, or during autoconfiguration,
		 * just give interrupts a chance, then just return;
		 * don't run any other procs or panic below,
		 * in case this is the idle process and already asleep.
		 */
		splx(safepri);
		splx(s);
		return (0);
	}
	KASSERT(p != NULL, ("tsleep1"));
	KASSERT(ident != NULL && p->p_stat == SRUN, ("tsleep"));
	/*
	 * Process may be sitting on a slpque if asleep() was called, remove
	 * it before re-adding.
	 */
	if (p->p_wchan != NULL)
		unsleep(p);

	p->p_wchan = ident;
	p->p_wmesg = wmesg;
	p->p_slptime = 0;
	p->p_priority = priority & PRIMASK;
	TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], p, p_procq);
	if (timo)
		thandle = timeout(endtsleep, (void *)p, timo);
	/*
	 * We put ourselves on the sleep queue and start our timeout
	 * before calling CURSIG, as we could stop there, and a wakeup
	 * or a SIGCONT (or both) could occur while we were stopped.
	 * A SIGCONT would cause us to be marked as SSLEEP
	 * without resuming us, thus we must be ready for sleep
	 * when CURSIG is called.  If the wakeup happens while we're
	 * stopped, p->p_wchan will be 0 upon return from CURSIG.
	 */
	if (catch) {
		p->p_flag |= P_SINTR;
		if ((sig = CURSIG(p))) {
			if (p->p_wchan)
				unsleep(p);
			p->p_stat = SRUN;
			goto resume;
		}
		if (p->p_wchan == 0) {
			catch = 0;
			goto resume;
		}
	} else
		sig = 0;
	p->p_stat = SSLEEP;
	p->p_stats->p_ru.ru_nvcsw++;
	mi_switch();
resume:
	curpriority = p->p_usrpri;
	splx(s);
	p->p_flag &= ~P_SINTR;
	if (p->p_flag & P_TIMEOUT) {
		p->p_flag &= ~P_TIMEOUT;
		if (sig == 0) {
#ifdef KTRACE
			if (KTRPOINT(p, KTR_CSW))
				ktrcsw(p->p_tracep, 0, 0);
#endif
			return (EWOULDBLOCK);
		}
	} else if (timo)
		untimeout(endtsleep, (void *)p, thandle);
	if (catch && (sig != 0 || (sig = CURSIG(p)))) {
#ifdef KTRACE
		if (KTRPOINT(p, KTR_CSW))
			ktrcsw(p->p_tracep, 0, 0);
#endif
		if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig))
			return (EINTR);
		return (ERESTART);
	}
#ifdef KTRACE
	if (KTRPOINT(p, KTR_CSW))
		ktrcsw(p->p_tracep, 0, 0);
#endif
	return (0);
}
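
/*
 * Editor-added usage sketch for the tsleep()/wakeup() pair.  A waiter sleeps
 * on the address of the condition it cares about; the producer stores the
 * data and calls wakeup() on the same address.  The structure, field names,
 * and wait message below are made up for illustration; PCATCH makes the
 * sleep interruptible by signals and hz/2 bounds the wait to half a second.
 */
#if 0	/* illustrative fragment only */
struct ex_mailbox {
	int	mb_full;	/* hypothetical "data ready" flag */
	int	mb_data;
};

static int
ex_receive(struct ex_mailbox *mb, int *res)
{
	int error;

	while (mb->mb_full == 0) {
		/* Sleep on the mailbox address until a sender wakes us. */
		error = tsleep(mb, PWAIT | PCATCH, "exmbox", hz / 2);
		if (error)		/* EWOULDBLOCK, EINTR or ERESTART */
			return (error);
	}
	*res = mb->mb_data;
	mb->mb_full = 0;
	return (0);
}

static void
ex_send(struct ex_mailbox *mb, int value)
{
	mb->mb_data = value;
	mb->mb_full = 1;
	wakeup(mb);		/* wake every process sleeping on mb */
}
#endif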

/*
 * asleep() - async sleep call.  Place process on wait queue and return
 * immediately without blocking.  The process stays runnable until await()
 * is called.  If ident is NULL, remove process from wait queue if it is still
 * on one.
 *
 * Only the most recent sleep condition is effective when making successive
 * calls to asleep() or when calling tsleep().
 *
 * The timeout, if any, is not initiated until await() is called.  The sleep
 * priority, signal, and timeout are specified in the asleep() call but may
 * be overridden in the await() call.
 *
 * <<<<<<<< EXPERIMENTAL, UNTESTED >>>>>>>>>>
 */

int
asleep(void *ident, int priority, const char *wmesg, int timo)
{
	struct proc *p = curproc;
	int s;

	/*
	 * splhigh() while manipulating sleep structures and slpque.
	 *
	 * Remove preexisting wait condition (if any) and place process
	 * on appropriate slpque, but do not put process to sleep.
	 */

	s = splhigh();

	if (p->p_wchan != NULL)
		unsleep(p);

	if (ident) {
		p->p_wchan = ident;
		p->p_wmesg = wmesg;
		p->p_slptime = 0;
		p->p_asleep.as_priority = priority;
		p->p_asleep.as_timo = timo;
		TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], p, p_procq);
	}

	splx(s);

	return (0);
}

/*
 * await() - wait for async condition to occur.  The process blocks until
 * wakeup() is called on the most recent asleep() address.  If wakeup is
 * called prior to await(), await() winds up being a NOP.
 *
 * If await() is called more than once (without an intervening asleep()
 * call), await() is still effectively a NOP but it calls mi_switch() to give
 * other processes some cpu before returning.  The process is left runnable.
 *
 * <<<<<<<< EXPERIMENTAL, UNTESTED >>>>>>>>>>
 */

int
await(int priority, int timo)
{
	struct proc *p = curproc;
	int s;

	s = splhigh();

	if (p->p_wchan != NULL) {
		struct callout_handle thandle;
		int sig;
		int catch;

		/*
		 * The call to await() can override defaults specified in
		 * the original asleep().
		 */
		if (priority < 0)
			priority = p->p_asleep.as_priority;
		if (timo < 0)
			timo = p->p_asleep.as_timo;

		/*
		 * Install timeout
		 */

		if (timo)
			thandle = timeout(endtsleep, (void *)p, timo);

		sig = 0;
		catch = priority & PCATCH;

		if (catch) {
			p->p_flag |= P_SINTR;
			if ((sig = CURSIG(p))) {
				if (p->p_wchan)
					unsleep(p);
				p->p_stat = SRUN;
				goto resume;
			}
			if (p->p_wchan == NULL) {
				catch = 0;
				goto resume;
			}
		}
		p->p_stat = SSLEEP;
		p->p_stats->p_ru.ru_nvcsw++;
		mi_switch();
resume:
		curpriority = p->p_usrpri;

		splx(s);
		p->p_flag &= ~P_SINTR;
		if (p->p_flag & P_TIMEOUT) {
			p->p_flag &= ~P_TIMEOUT;
			if (sig == 0) {
#ifdef KTRACE
				if (KTRPOINT(p, KTR_CSW))
					ktrcsw(p->p_tracep, 0, 0);
#endif
				return (EWOULDBLOCK);
			}
		} else if (timo)
			untimeout(endtsleep, (void *)p, thandle);
		if (catch && (sig != 0 || (sig = CURSIG(p)))) {
#ifdef KTRACE
			if (KTRPOINT(p, KTR_CSW))
				ktrcsw(p->p_tracep, 0, 0);
#endif
			if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig))
				return (EINTR);
			return (ERESTART);
		}
#ifdef KTRACE
		if (KTRPOINT(p, KTR_CSW))
			ktrcsw(p->p_tracep, 0, 0);
#endif
	} else {
		/*
		 * If as_priority is 0, await() has been called without an
		 * intervening asleep().  We are still effectively a NOP,
		 * but we call mi_switch() for safety.
		 */

		if (p->p_asleep.as_priority == 0) {
			p->p_stats->p_ru.ru_nvcsw++;
			mi_switch();
		}
		splx(s);
	}

	/*
	 * Clear p_asleep.as_priority as an indication that await() has been
	 * called.  If await() is called again without an intervening
	 * asleep(), await() is still effectively a NOP but the above
	 * mi_switch() code is triggered as a safety.
	 */
	p->p_asleep.as_priority = 0;

	return (0);
}
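
/*
 * Editor-added usage sketch for the asleep()/await() pair above.  The point
 * of the split is to register the wait condition before issuing the request,
 * so a wakeup that arrives in between is not lost; the caller keeps running
 * until it actually needs the answer.  The request structure and the
 * ex_hardware_start() function are made up for illustration.
 */
#if 0	/* illustrative fragment only */
struct ex_request {
	int	rq_done;	/* hypothetical completion flag */
	int	rq_status;
};

static int
ex_issue_and_wait(struct ex_request *rq)
{
	int error;

	/* Register the wait condition first; we stay runnable. */
	asleep(rq, PWAIT | PCATCH, "exreq", hz);

	ex_hardware_start(rq);		/* hypothetical; may complete at once */

	/*
	 * Block until the completion side calls wakeup(rq).  If that already
	 * happened, our p_wchan was cleared and await() returns immediately.
	 * Negative arguments keep the priority and timeout from asleep().
	 */
	error = await(-1, -1);
	return (error ? error : rq->rq_status);
}
#endif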

/*
 * Implement timeout for tsleep() or asleep()/await().
 *
 * If process hasn't been awakened (wchan non-zero),
 * set timeout flag and undo the sleep.  If proc
 * is stopped, just unsleep so it will remain stopped.
 */
static void
endtsleep(arg)
	void *arg;
{
	register struct proc *p;
	int s;

	p = (struct proc *)arg;
	s = splhigh();
	if (p->p_wchan) {
		if (p->p_stat == SSLEEP)
			setrunnable(p);
		else
			unsleep(p);
		p->p_flag |= P_TIMEOUT;
	}
	splx(s);
}

/*
 * Remove a process from its wait queue
 */
void
unsleep(p)
	register struct proc *p;
{
	int s;

	s = splhigh();
	if (p->p_wchan) {
		TAILQ_REMOVE(&slpque[LOOKUP(p->p_wchan)], p, p_procq);
		p->p_wchan = 0;
	}
	splx(s);
}

/*
 * Make all processes sleeping on the specified identifier runnable.
 */
void
wakeup(ident)
	register void *ident;
{
	register struct slpquehead *qp;
	register struct proc *p;
	int s;

	s = splhigh();
	qp = &slpque[LOOKUP(ident)];
restart:
	TAILQ_FOREACH(p, qp, p_procq) {
		if (p->p_wchan == ident) {
			TAILQ_REMOVE(qp, p, p_procq);
			p->p_wchan = 0;
			if (p->p_stat == SSLEEP) {
				/* OPTIMIZED EXPANSION OF setrunnable(p); */
				if (p->p_slptime > 1)
					updatepri(p);
				p->p_slptime = 0;
				p->p_stat = SRUN;
				if (p->p_flag & P_INMEM) {
					setrunqueue(p);
					maybe_resched(p);
				} else {
					p->p_flag |= P_SWAPINREQ;
					wakeup((caddr_t)&proc0);
				}
				/* END INLINE EXPANSION */
				goto restart;
			}
		}
	}
	splx(s);
}

/*
 * Make a process sleeping on the specified identifier runnable.
 * May wake more than one process if a target process is currently
 * swapped out.
 */
void
wakeup_one(ident)
	register void *ident;
{
	register struct slpquehead *qp;
	register struct proc *p;
	int s;

	s = splhigh();
	qp = &slpque[LOOKUP(ident)];

	TAILQ_FOREACH(p, qp, p_procq) {
		if (p->p_wchan == ident) {
			TAILQ_REMOVE(qp, p, p_procq);
			p->p_wchan = 0;
			if (p->p_stat == SSLEEP) {
				/* OPTIMIZED EXPANSION OF setrunnable(p); */
				if (p->p_slptime > 1)
					updatepri(p);
				p->p_slptime = 0;
				p->p_stat = SRUN;
				if (p->p_flag & P_INMEM) {
					setrunqueue(p);
					maybe_resched(p);
					break;
				} else {
					p->p_flag |= P_SWAPINREQ;
					wakeup((caddr_t)&proc0);
				}
				/* END INLINE EXPANSION */
			}
		}
	}
	splx(s);
}

/*
 * The machine-independent parts of mi_switch().
 * Must be called at splstatclock() or higher.
 */
void
mi_switch()
{
	struct timeval new_switchtime;
	register struct proc *p = curproc;	/* XXX */
	register struct rlimit *rlim;
	int x;

	/*
	 * XXX this spl is almost unnecessary.  It is partly to allow for
	 * sloppy callers that don't do it (issignal() via CURSIG() is the
	 * main offender).  It is partly to work around a bug in the i386
	 * cpu_switch() (the ipl is not preserved).  We ran for years
	 * without it.  I think there was only an interrupt latency problem.
	 * The main caller, tsleep(), does an splx() a couple of instructions
	 * after calling here.  The buggy caller, issignal(), usually calls
	 * here at spl0() and sometimes returns at splhigh().  The process
	 * then runs for a little too long at splhigh().  The ipl gets fixed
	 * when the process returns to user mode (or earlier).
	 *
	 * It would probably be better to always call here at spl0(). Callers
	 * are prepared to give up control to another process, so they must
	 * be prepared to be interrupted.  The clock stuff here may not
	 * actually need splstatclock().
	 */
	x = splstatclock();

#ifdef SIMPLELOCK_DEBUG
	if (p->p_simple_locks)
		printf("sleep: holding simple lock\n");
#endif
	/*
	 * Compute the amount of time during which the current
	 * process was running, and add that to its total so far.
	 */
	microuptime(&new_switchtime);
	if (timevalcmp(&new_switchtime, &switchtime, <)) {
		printf("microuptime() went backwards (%ld.%06ld -> %ld.%06ld)\n",
		    switchtime.tv_sec, switchtime.tv_usec,
		    new_switchtime.tv_sec, new_switchtime.tv_usec);
		new_switchtime = switchtime;
	} else {
		p->p_runtime += (new_switchtime.tv_usec - switchtime.tv_usec) +
		    (new_switchtime.tv_sec - switchtime.tv_sec) * (int64_t)1000000;
	}

	/*
	 * Check if the process exceeds its cpu resource allocation.
	 * If over max, kill it.
	 */
	if (p->p_stat != SZOMB && p->p_limit->p_cpulimit != RLIM_INFINITY &&
	    p->p_runtime > p->p_limit->p_cpulimit) {
		rlim = &p->p_rlimit[RLIMIT_CPU];
		if (p->p_runtime / (rlim_t)1000000 >= rlim->rlim_max) {
			killproc(p, "exceeded maximum CPU limit");
		} else {
			psignal(p, SIGXCPU);
			if (rlim->rlim_cur < rlim->rlim_max) {
				/* XXX: we should make a private copy */
				rlim->rlim_cur += 5;
			}
		}
	}

	/*
	 * Pick a new current process and record its start time.
	 */
	cnt.v_swtch++;
	switchtime = new_switchtime;
	cpu_switch(p);
	if (switchtime.tv_sec == 0)
		microuptime(&switchtime);
	switchticks = ticks;

	splx(x);
}

/*
 * Change process state to be runnable,
 * placing it on the run queue if it is in memory,
 * and awakening the swapper if it isn't in memory.
 */
void
setrunnable(p)
	register struct proc *p;
{
	register int s;

	s = splhigh();
	switch (p->p_stat) {
	case 0:
	case SRUN:
	case SZOMB:
	default:
		panic("setrunnable");
	case SSTOP:
	case SSLEEP:
		unsleep(p);		/* e.g. when sending signals */
		break;

	case SIDL:
		break;
	}
	p->p_stat = SRUN;
	if (p->p_flag & P_INMEM)
		setrunqueue(p);
	splx(s);
	if (p->p_slptime > 1)
		updatepri(p);
	p->p_slptime = 0;
	if ((p->p_flag & P_INMEM) == 0) {
		p->p_flag |= P_SWAPINREQ;
		wakeup((caddr_t)&proc0);
	} else
		maybe_resched(p);
}

/*
 * Compute the priority of a process when running in user mode.
 * Arrange to reschedule if the resulting priority is better
 * than that of the current process.
 */
void
resetpriority(p)
	register struct proc *p;
{
	register unsigned int newpriority;

	if (p->p_rtprio.type == RTP_PRIO_NORMAL) {
		newpriority = PUSER + p->p_estcpu / INVERSE_ESTCPU_WEIGHT +
		    NICE_WEIGHT * p->p_nice;
		newpriority = min(newpriority, MAXPRI);
		p->p_usrpri = newpriority;
	}
	maybe_resched(p);
}

/* ARGSUSED */
static void
sched_setup(dummy)
	void *dummy;
{
	/* Kick off timeout driven events by calling first time. */
	roundrobin(NULL);
	schedcpu(NULL);
}

/*
 * We adjust the priority of the current process.  The priority of
 * a process gets worse as it accumulates CPU time.  The cpu usage
 * estimator (p_estcpu) is increased here.  resetpriority() will
 * compute a different priority each time p_estcpu increases by
 * INVERSE_ESTCPU_WEIGHT (until MAXPRI is reached).  The cpu usage
 * estimator ramps up quite quickly when the process is running
 * (linearly), and decays away exponentially, at a rate which is
 * proportionally slower when the system is busy.  The basic principle
 * is that the system will 90% forget that the process used a lot of
 * CPU time in 5 * loadav seconds.  This causes the system to favor
 * processes which haven't run much recently, and to round-robin
 * among other processes.
 */
void
schedclock(p)
	struct proc *p;
{

	p->p_cpticks++;
	p->p_estcpu = ESTCPULIM(p->p_estcpu + 1);
	if ((p->p_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) {
		resetpriority(p);
		if (p->p_priority >= PUSER)
			p->p_priority = p->p_usrpri;
	}
}
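
/*
 * Editor-added sketch of the feedback described above schedclock().  Each
 * statclock tick charges the running process one unit of p_estcpu, and
 * resetpriority() turns that estimate into a user priority.  The constants
 * below (PUSER 50, INVERSE_ESTCPU_WEIGHT 8, NICE_WEIGHT 2, MAXPRI 127) are
 * assumptions copied here only so the sketch is self-contained; the kernel's
 * authoritative values live in <sys/param.h> and <sys/proc.h>.
 */
#if 0	/* userland example only */
#include <stdio.h>

#define	EX_PUSER			50
#define	EX_INVERSE_ESTCPU_WEIGHT	8
#define	EX_NICE_WEIGHT			2
#define	EX_MAXPRI			127

static unsigned int
ex_user_priority(unsigned int estcpu, int nice)
{
	unsigned int pri;

	pri = EX_PUSER + estcpu / EX_INVERSE_ESTCPU_WEIGHT +
	    EX_NICE_WEIGHT * nice;
	return (pri < EX_MAXPRI ? pri : EX_MAXPRI);
}

int
main(void)
{
	unsigned int estcpu;

	/* A CPU hog's priority number climbs (gets worse) as estcpu grows. */
	for (estcpu = 0; estcpu <= 64; estcpu += 16)
		printf("estcpu %3u -> priority %3u\n",
		    estcpu, ex_user_priority(estcpu, 0));
	return (0);
}
#endif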
933