/* kern_synch.c revision 2441 */
/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_synch.c	8.6 (Berkeley) 1/21/94
 * $Id: kern_synch.c,v 1.3 1994/08/02 07:42:17 davidg Exp $
 */
41
42#include <sys/param.h>
43#include <sys/systm.h>
44#include <sys/proc.h>
45#include <sys/kernel.h>
46#include <sys/buf.h>
47#include <sys/signalvar.h>
48#include <sys/resourcevar.h>
49#include <sys/vmmeter.h>
50#ifdef KTRACE
51#include <sys/ktrace.h>
52#endif
53
54#include <machine/cpu.h>
55
/* Scheduling globals shared with the machine-dependent layer. */
u_char	curpriority;		/* usrpri of curproc; cached for need_resched tests */
int	lbolt;			/* once a second sleep address; awakened by schedcpu() */
58
59/*
60 * Force switch among equal priority processes every 100ms.
61 */
62/* ARGSUSED */
63void
64roundrobin(arg)
65	void *arg;
66{
67
68	need_resched();
69	timeout(roundrobin, NULL, hz / 10);
70}
71
72/*
73 * Constants for digital decay and forget:
74 *	90% of (p_estcpu) usage in 5 * loadav time
75 *	95% of (p_pctcpu) usage in 60 seconds (load insensitive)
76 *          Note that, as ps(1) mentions, this can let percentages
77 *          total over 100% (I've seen 137.9% for 3 processes).
78 *
79 * Note that hardclock updates p_estcpu and p_cpticks independently.
80 *
81 * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds.
82 * That is, the system wants to compute a value of decay such
83 * that the following for loop:
84 * 	for (i = 0; i < (5 * loadavg); i++)
85 * 		p_estcpu *= decay;
86 * will compute
87 * 	p_estcpu *= 0.1;
88 * for all values of loadavg:
89 *
90 * Mathematically this loop can be expressed by saying:
91 * 	decay ** (5 * loadavg) ~= .1
92 *
93 * The system computes decay as:
94 * 	decay = (2 * loadavg) / (2 * loadavg + 1)
95 *
96 * We wish to prove that the system's computation of decay
97 * will always fulfill the equation:
98 * 	decay ** (5 * loadavg) ~= .1
99 *
100 * If we compute b as:
101 * 	b = 2 * loadavg
102 * then
103 * 	decay = b / (b + 1)
104 *
105 * We now need to prove two things:
106 *	1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
107 *	2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
108 *
109 * Facts:
110 *         For x close to zero, exp(x) =~ 1 + x, since
111 *              exp(x) = 0! + x**1/1! + x**2/2! + ... .
112 *              therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
113 *         For x close to zero, ln(1+x) =~ x, since
114 *              ln(1+x) = x - x**2/2 + x**3/3 - ...     -1 < x < 1
115 *              therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
116 *         ln(.1) =~ -2.30
117 *
118 * Proof of (1):
119 *    Solve (factor)**(power) =~ .1 given power (5*loadav):
120 *	solving for factor,
121 *      ln(factor) =~ (-2.30/5*loadav), or
122 *      factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
123 *          exp(-1/b) =~ (b-1)/b =~ b/(b+1).                    QED
124 *
125 * Proof of (2):
126 *    Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
127 *	solving for power,
128 *      power*ln(b/(b+1)) =~ -2.30, or
129 *      power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.  QED
130 *
131 * Actual power values for the implemented algorithm are as follows:
132 *      loadav: 1       2       3       4
133 *      power:  5.68    10.32   14.94   19.55
134 */
135
/* calculations for digital decay to forget 90% of usage in 5*loadav sec */
#define	loadfactor(loadav)	(2 * (loadav))
#define	decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))

/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;		/* exp(-1/20) */

/*
 * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
 * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
 * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
 *
 * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
 *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
 *
 * If you dont want to bother with the faster/more-accurate formula, you
 * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
 * (more general) method of calculating the %age of CPU used by a process.
 */
#define	CCPU_SHIFT	11
156
/*
 * Recompute process priorities, every hz ticks (i.e. once per second).
 * Ages swap/sleep times, decays p_pctcpu and p_estcpu for each process,
 * and moves a process to a different run queue when its recomputed
 * priority lands in a new priority band.  Re-arms itself via timeout().
 */
/* ARGSUSED */
void
schedcpu(arg)
	void *arg;
{
	register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
	register struct proc *p;
	register int s;
	register unsigned int newcpu;

	/* lbolt sleepers are awakened once a second, from here. */
	wakeup((caddr_t)&lbolt);
	for (p = (struct proc *)allproc; p != NULL; p = p->p_next) {
		/*
		 * Increment time in/out of memory and sleep time
		 * (if sleeping).  We ignore overflow; with 16-bit int's
		 * (remember them?) overflow takes 45 days.
		 */
		p->p_swtime++;
		if (p->p_stat == SSLEEP || p->p_stat == SSTOP)
			p->p_slptime++;
		/* Age the %cpu figure (for ps) by ccpu == exp(-1/20). */
		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
		/*
		 * If the process has slept the entire second,
		 * stop recalculating its priority until it wakes up
		 * (updatepri() applies the missed decay at wakeup).
		 */
		if (p->p_slptime > 1)
			continue;
		s = splstatclock();	/* prevent state changes */
		/*
		 * p_pctcpu is only for ps.
		 */
#if	(FSHIFT >= CCPU_SHIFT)
		/* Shift-based form; the hz == 100 case needs no scaling. */
		p->p_pctcpu += (hz == 100)?
			((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT):
			100 * (((fixpt_t) p->p_cpticks)
				<< (FSHIFT - CCPU_SHIFT)) / hz;
#else
		/* General (slower) fixed-point form; see CCPU_SHIFT above. */
		p->p_pctcpu += ((FSCALE - ccpu) *
			(p->p_cpticks * FSCALE / hz)) >> FSHIFT;
#endif
		p->p_cpticks = 0;
		/* Digital decay of the scheduling estimate, biased by nice. */
		newcpu = (u_int) decay_cpu(loadfac, p->p_estcpu) + p->p_nice;
		p->p_estcpu = min(newcpu, UCHAR_MAX);
		resetpriority(p);
		if (p->p_priority >= PUSER) {
#define	PPQ	(128 / NQS)		/* priorities per queue */
			/*
			 * Requeue only when the new priority falls in a
			 * different run queue; otherwise just record it.
			 */
			if ((p != curproc) &&
			    p->p_stat == SRUN &&
			    (p->p_flag & P_INMEM) &&
			    (p->p_priority / PPQ) != (p->p_usrpri / PPQ)) {
				remrq(p);
				p->p_priority = p->p_usrpri;
				setrunqueue(p);
			} else
				p->p_priority = p->p_usrpri;
		}
		splx(s);
	}
	vmmeter();
	/* Wake the pageout daemon if there are cleaned buffers to reclaim. */
	if (bclnlist != NULL)
		wakeup((caddr_t)pageproc);
	timeout(schedcpu, (void *)0, hz);
}
223
224/*
225 * Recalculate the priority of a process after it has slept for a while.
226 * For all load averages >= 1 and max p_estcpu of 255, sleeping for at
227 * least six times the loadfactor will decay p_estcpu to zero.
228 */
229void
230updatepri(p)
231	register struct proc *p;
232{
233	register unsigned int newcpu = p->p_estcpu;
234	register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
235
236	if (p->p_slptime > 5 * loadfac)
237		p->p_estcpu = 0;
238	else {
239		p->p_slptime--;	/* the first time was done in schedcpu */
240		while (newcpu && --p->p_slptime)
241			newcpu = (int) decay_cpu(loadfac, newcpu);
242		p->p_estcpu = min(newcpu, UCHAR_MAX);
243	}
244	resetpriority(p);
245}
246
/*
 * Sleep-queue hash.
 * We're only looking at 7 bits of the address; everything is
 * aligned to 4, lots of things are aligned to greater powers
 * of 2.  Shift right by 8, i.e. drop the bottom 256 worth.
 */
#define TABLESIZE	128
#define LOOKUP(x)	(((int)(x) >> 8) & (TABLESIZE - 1))
/* Singly-linked queue per bucket; tail pointer gives O(1) append. */
struct slpque {
	struct proc *sq_head;	/* first sleeper, 0 when bucket is empty */
	struct proc **sq_tailp;	/* address of the last p_forw link */
} slpque[TABLESIZE];
258
/*
 * During autoconfiguration or after a panic, a sleep will simply
 * lower the priority briefly to allow interrupts, then return.
 * The priority to be used (safepri) is machine-dependent, thus this
 * value is initialized and maintained in the machine-dependent layers.
 * This priority will typically be 0, or the lowest priority
 * that is safe for use on the interrupt stack; it can be made
 * higher to block network software interrupts after panics.
 */
int safepri;	/* spl value used by tsleep()/sleep() when cold || panicstr */
269
270/*
271 * General sleep call.  Suspends the current process until a wakeup is
272 * performed on the specified identifier.  The process will then be made
273 * runnable with the specified priority.  Sleeps at most timo/hz seconds
274 * (0 means no timeout).  If pri includes PCATCH flag, signals are checked
275 * before and after sleeping, else signals are not checked.  Returns 0 if
276 * awakened, EWOULDBLOCK if the timeout expires.  If PCATCH is set and a
277 * signal needs to be delivered, ERESTART is returned if the current system
278 * call should be restarted if possible, and EINTR is returned if the system
279 * call should be interrupted by the signal (return EINTR).
280 */
281int
282tsleep(ident, priority, wmesg, timo)
283	void *ident;
284	int priority, timo;
285	char *wmesg;
286{
287	register struct proc *p = curproc;
288	register struct slpque *qp;
289	register s;
290	int sig, catch = priority & PCATCH;
291	extern int cold;
292	void endtsleep __P((void *));
293
294#ifdef KTRACE
295	if (KTRPOINT(p, KTR_CSW))
296		ktrcsw(p->p_tracep, 1, 0);
297#endif
298	s = splhigh();
299	if (cold || panicstr) {
300		/*
301		 * After a panic, or during autoconfiguration,
302		 * just give interrupts a chance, then just return;
303		 * don't run any other procs or panic below,
304		 * in case this is the idle process and already asleep.
305		 */
306		splx(safepri);
307		splx(s);
308		return (0);
309	}
310#ifdef DIAGNOSTIC
311	if (ident == NULL || p->p_stat != SRUN || p->p_back)
312		panic("tsleep");
313#endif
314	p->p_wchan = ident;
315	p->p_wmesg = wmesg;
316	p->p_slptime = 0;
317	p->p_priority = priority & PRIMASK;
318	qp = &slpque[LOOKUP(ident)];
319	if (qp->sq_head == 0)
320		qp->sq_head = p;
321	else
322		*qp->sq_tailp = p;
323	*(qp->sq_tailp = &p->p_forw) = 0;
324	if (timo)
325		timeout(endtsleep, (void *)p, timo);
326	/*
327	 * We put ourselves on the sleep queue and start our timeout
328	 * before calling CURSIG, as we could stop there, and a wakeup
329	 * or a SIGCONT (or both) could occur while we were stopped.
330	 * A SIGCONT would cause us to be marked as SSLEEP
331	 * without resuming us, thus we must be ready for sleep
332	 * when CURSIG is called.  If the wakeup happens while we're
333	 * stopped, p->p_wchan will be 0 upon return from CURSIG.
334	 */
335	if (catch) {
336		p->p_flag |= P_SINTR;
337		if (sig = CURSIG(p)) {
338			if (p->p_wchan)
339				unsleep(p);
340			p->p_stat = SRUN;
341			goto resume;
342		}
343		if (p->p_wchan == 0) {
344			catch = 0;
345			goto resume;
346		}
347	} else
348		sig = 0;
349	p->p_stat = SSLEEP;
350	p->p_stats->p_ru.ru_nvcsw++;
351	mi_switch();
352resume:
353	curpriority = p->p_usrpri;
354	splx(s);
355	p->p_flag &= ~P_SINTR;
356	if (p->p_flag & P_TIMEOUT) {
357		p->p_flag &= ~P_TIMEOUT;
358		if (sig == 0) {
359#ifdef KTRACE
360			if (KTRPOINT(p, KTR_CSW))
361				ktrcsw(p->p_tracep, 0, 0);
362#endif
363			return (EWOULDBLOCK);
364		}
365	} else if (timo)
366		untimeout(endtsleep, (void *)p);
367	if (catch && (sig != 0 || (sig = CURSIG(p)))) {
368#ifdef KTRACE
369		if (KTRPOINT(p, KTR_CSW))
370			ktrcsw(p->p_tracep, 0, 0);
371#endif
372		if (p->p_sigacts->ps_sigintr & sigmask(sig))
373			return (EINTR);
374		return (ERESTART);
375	}
376#ifdef KTRACE
377	if (KTRPOINT(p, KTR_CSW))
378		ktrcsw(p->p_tracep, 0, 0);
379#endif
380	return (0);
381}
382
383/*
384 * Implement timeout for tsleep.
385 * If process hasn't been awakened (wchan non-zero),
386 * set timeout flag and undo the sleep.  If proc
387 * is stopped, just unsleep so it will remain stopped.
388 */
389void
390endtsleep(arg)
391	void *arg;
392{
393	register struct proc *p;
394	int s;
395
396	p = (struct proc *)arg;
397	s = splhigh();
398	if (p->p_wchan) {
399		if (p->p_stat == SSLEEP)
400			setrunnable(p);
401		else
402			unsleep(p);
403		p->p_flag |= P_TIMEOUT;
404	}
405	splx(s);
406}
407
408/*
409 * Short-term, non-interruptable sleep.
410 */
411void
412sleep(ident, priority)
413	void *ident;
414	int priority;
415{
416	register struct proc *p = curproc;
417	register struct slpque *qp;
418	register s;
419	extern int cold;
420
421#ifdef DIAGNOSTIC
422	if (priority > PZERO) {
423		printf("sleep called with priority %d > PZERO, wchan: %x\n",
424		    priority, ident);
425		panic("old sleep");
426	}
427#endif
428	s = splhigh();
429	if (cold || panicstr) {
430		/*
431		 * After a panic, or during autoconfiguration,
432		 * just give interrupts a chance, then just return;
433		 * don't run any other procs or panic below,
434		 * in case this is the idle process and already asleep.
435		 */
436		splx(safepri);
437		splx(s);
438		return;
439	}
440#ifdef DIAGNOSTIC
441	if (ident == NULL || p->p_stat != SRUN || p->p_back)
442		panic("sleep");
443#endif
444	p->p_wchan = ident;
445	p->p_wmesg = NULL;
446	p->p_slptime = 0;
447	p->p_priority = priority;
448	qp = &slpque[LOOKUP(ident)];
449	if (qp->sq_head == 0)
450		qp->sq_head = p;
451	else
452		*qp->sq_tailp = p;
453	*(qp->sq_tailp = &p->p_forw) = 0;
454	p->p_stat = SSLEEP;
455	p->p_stats->p_ru.ru_nvcsw++;
456#ifdef KTRACE
457	if (KTRPOINT(p, KTR_CSW))
458		ktrcsw(p->p_tracep, 1, 0);
459#endif
460	mi_switch();
461#ifdef KTRACE
462	if (KTRPOINT(p, KTR_CSW))
463		ktrcsw(p->p_tracep, 0, 0);
464#endif
465	curpriority = p->p_usrpri;
466	splx(s);
467}
468
469/*
470 * Remove a process from its wait queue
471 */
472void
473unsleep(p)
474	register struct proc *p;
475{
476	register struct slpque *qp;
477	register struct proc **hp;
478	int s;
479
480	s = splhigh();
481	if (p->p_wchan) {
482		hp = &(qp = &slpque[LOOKUP(p->p_wchan)])->sq_head;
483		while (*hp != p)
484			hp = &(*hp)->p_forw;
485		*hp = p->p_forw;
486		if (qp->sq_tailp == &p->p_forw)
487			qp->sq_tailp = hp;
488		p->p_wchan = 0;
489	}
490	splx(s);
491}
492
/*
 * Make all processes sleeping on the specified identifier runnable.
 * Scans the hash bucket for `ident'; note that the bucket may also hold
 * sleepers on other channels that hash to the same slot.
 */
void
wakeup(ident)
	register void *ident;
{
	register struct slpque *qp;
	register struct proc *p, **q;
	int s;

	s = splhigh();
	qp = &slpque[LOOKUP(ident)];
restart:
	for (q = &qp->sq_head; p = *q; ) {
#ifdef DIAGNOSTIC
		if (p->p_back || p->p_stat != SSLEEP && p->p_stat != SSTOP)
			panic("wakeup");
#endif
		if (p->p_wchan == ident) {
			/* Unlink from the queue, fixing the tail pointer. */
			p->p_wchan = 0;
			*q = p->p_forw;
			if (qp->sq_tailp == &p->p_forw)
				qp->sq_tailp = q;
			if (p->p_stat == SSLEEP) {
				/* OPTIMIZED EXPANSION OF setrunnable(p); */
				if (p->p_slptime > 1)
					updatepri(p);
				p->p_slptime = 0;
				p->p_stat = SRUN;
				if (p->p_flag & P_INMEM)
					setrunqueue(p);
				/*
				 * Since curpriority is a user priority,
				 * p->p_priority is always better than
				 * curpriority.
				 */
				if ((p->p_flag & P_INMEM) == 0)
					wakeup((caddr_t)&proc0);
				else
					need_resched();
				/* END INLINE EXPANSION */
				/*
				 * updatepri()/wakeup() above may have
				 * modified this bucket: rescan from the top.
				 */
				goto restart;
			}
			/* SSTOP: leave it stopped, just dequeued. */
		} else
			q = &p->p_forw;
	}
	splx(s);
}
542
/*
 * The machine independent parts of mi_switch().
 * Must be called at splstatclock() or higher.
 * Charges the outgoing process for its run time, enforces RLIMIT_CPU,
 * degrades long-running niced-at-zero user processes, then hands off
 * to the machine-dependent cpu_switch().
 */
void
mi_switch()
{
	register struct proc *p = curproc;	/* XXX */
	register struct rlimit *rlim;
	register long s, u;
	struct timeval tv;

	/*
	 * Compute the amount of time during which the current
	 * process was running, and add that to its total so far.
	 * `runtime' holds the time this process last went on cpu.
	 */
	microtime(&tv);
	u = p->p_rtime.tv_usec + (tv.tv_usec - runtime.tv_usec);
	s = p->p_rtime.tv_sec + (tv.tv_sec - runtime.tv_sec);
	/* Normalize the microseconds field into [0, 1000000). */
	if (u < 0) {
		u += 1000000;
		s--;
	} else if (u >= 1000000) {
		u -= 1000000;
		s++;
	}
	p->p_rtime.tv_usec = u;
	p->p_rtime.tv_sec = s;

	/*
	 * Check if the process exceeds its cpu resource allocation.
	 * If over max, kill it.  In any case, if it has run for more
	 * than 10 minutes, reduce priority to give others a chance.
	 */
	rlim = &p->p_rlimit[RLIMIT_CPU];
	if (s >= rlim->rlim_cur) {
		if (s >= rlim->rlim_max)
			psignal(p, SIGKILL);
		else {
			psignal(p, SIGXCPU);
			/* Raise the soft limit so SIGXCPU repeats later. */
			if (rlim->rlim_cur < rlim->rlim_max)
				rlim->rlim_cur += 5;
		}
	}
	/* Non-root, un-niced processes get niced after 10 cpu minutes. */
	if (s > 10 * 60 && p->p_ucred->cr_uid && p->p_nice == NZERO) {
		p->p_nice = NZERO + 4;
		resetpriority(p);
	}

	/*
	 * Pick a new current process and record its start time.
	 */
	cnt.v_swtch++;
	cpu_switch(p);
	microtime(&runtime);
}
599
600/*
601 * Initialize the (doubly-linked) run queues
602 * to be empty.
603 */
604void
605rqinit()
606{
607	register int i;
608
609	for (i = 0; i < NQS; i++) {
610		qs[i].ph_link = qs[i].ph_rlink = (struct proc *)&qs[i];
611		rtqs[i].ph_link = rtqs[i].ph_rlink = (struct proc *)&rtqs[i];
612	}
613}
614
615/*
616 * Change process state to be runnable,
617 * placing it on the run queue if it is in memory,
618 * and awakening the swapper if it isn't in memory.
619 */
620void
621setrunnable(p)
622	register struct proc *p;
623{
624	register int s;
625
626	s = splhigh();
627	switch (p->p_stat) {
628	case 0:
629	case SRUN:
630	case SZOMB:
631	default:
632		panic("setrunnable");
633	case SSTOP:
634	case SSLEEP:
635		unsleep(p);		/* e.g. when sending signals */
636		break;
637
638	case SIDL:
639		break;
640	}
641	p->p_stat = SRUN;
642	if (p->p_flag & P_INMEM)
643		setrunqueue(p);
644	splx(s);
645	if (p->p_slptime > 1)
646		updatepri(p);
647	p->p_slptime = 0;
648	if ((p->p_flag & P_INMEM) == 0)
649		wakeup((caddr_t)&proc0);
650	else if (p->p_priority < curpriority)
651		need_resched();
652}
653
654/*
655 * Compute the priority of a process when running in user mode.
656 * Arrange to reschedule if the resulting priority is better
657 * than that of the current process.
658 */
659void
660resetpriority(p)
661	register struct proc *p;
662{
663	register unsigned int newpriority;
664
665	if (p->p_rtprio == RTPRIO_RTOFF) {
666		newpriority = PUSER + p->p_estcpu / 4 + 2 * p->p_nice;
667		newpriority = min(newpriority, MAXPRI);
668		p->p_usrpri = newpriority;
669		if (newpriority < curpriority)
670			need_resched();
671	} else {
672		need_resched();
673	}
674}
675