kern_timeout.c revision 2320
/*-
 * Copyright (c) 1982, 1986, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_clock.c	8.5 (Berkeley) 1/21/94
 * $Id: kern_clock.c,v 1.4 1994/08/18 22:34:58 wollman Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/dkstat.h>
#include <sys/callout.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <vm/vm.h>

#include <machine/cpu.h>

#ifdef GPROF
#include <sys/gmon.h>
#endif

/* Does anybody else really care about these? */
struct callout *callfree, *callout, calltodo;
int ncallout;

/* Some of these don't belong here, but it's easiest to concentrate them. */
long cp_time[CPUSTATES];
long dk_seek[DK_NDRIVE];
long dk_time[DK_NDRIVE];
long dk_wds[DK_NDRIVE];
long dk_wpms[DK_NDRIVE];
long dk_xfer[DK_NDRIVE];

int dk_busy;
int dk_ndrive = DK_NDRIVE;

long tk_cancc;
long tk_nin;
long tk_nout;
long tk_rawcc;

/*
 * Clock handling routines.
 *
 * This code is written to operate with two timers that run independently of
 * each other.  The main clock, running hz times per second, is used to keep
 * track of real time.  The second timer handles kernel and user profiling,
 * and does resource use estimation.  If the second timer is programmable,
 * it is randomized to avoid aliasing between the two clocks.  For example,
 * the randomization prevents an adversary from always giving up the cpu
 * just before its quantum expires.  Otherwise, it would never accumulate
 * cpu ticks.  The mean frequency of the second timer is stathz.
 *
 * If no second timer exists, stathz will be zero; in this case we drive
 * profiling and statistics off the main clock.  This WILL NOT be accurate;
 * do not do it unless absolutely necessary.
 *
 * The statistics clock may (or may not) be run at a higher rate while
 * profiling.  This profile clock runs at profhz.  We require that profhz
 * be an integral multiple of stathz.
 *
 * If the statistics clock is running fast, it must be divided by the ratio
 * profhz/stathz for statistics.  (For profiling, every tick counts.)
 */
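
/*
 * Illustrative numbers (not requirements): with stathz = 128 and
 * profhz = 1024, psratio is 8, so while profiling the statistics
 * code in statclock() runs on only every 8th profile tick (see the
 * psdiv/pscnt divider below).
 */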

/*
 * TODO:
 *	allocate more timeout table slots when table overflows.
 */

/*
 * Bump a timeval by a small number of usec's.
 */
#define BUMPTIME(t, usec) { \
	register volatile struct timeval *tp = (t); \
	register long us; \
 \
	tp->tv_usec = us = tp->tv_usec + (usec); \
	if (us >= 1000000) { \
		tp->tv_usec = us - 1000000; \
		tp->tv_sec++; \
	} \
}
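
/*
 * For example, bumping { tv_sec = 1, tv_usec = 999900 } by 200 usec
 * yields { tv_sec = 2, tv_usec = 100 }.  The macro assumes the bump
 * is small enough that tv_usec overflows by less than one second.
 */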

int	stathz;
int	profhz;
int	profprocs;
int	ticks;
static int psdiv, pscnt;	/* prof => stat divider */
int	psratio;		/* ratio: prof / stat */

volatile struct	timeval time;
volatile struct	timeval mono_time;

/*
 * Initialize clock frequencies and start both clocks running.
 */
void
initclocks()
{
	register int i;

	/*
	 * Set divisors to 1 (normal case) and let the machine-specific
	 * code do its bit.
	 */
	psdiv = pscnt = 1;
	cpu_initclocks();

	/*
	 * Compute profhz/stathz, and fix profhz if needed.
	 */
	i = stathz ? stathz : hz;
	if (profhz == 0)
		profhz = i;
	psratio = profhz / i;
}

/*
 * The real-time timer, interrupting hz times per second.
 */
void
hardclock(frame)
	register struct clockframe *frame;
{
	register struct callout *p1;
	register struct proc *p;
	register int delta, needsoft;
	extern int tickdelta;
	extern long timedelta;

	/*
	 * Update real-time timeout queue.
	 * At the front of the queue are some number of events which are
	 * ``due''.  The time to these is <= 0 and if negative represents
	 * the number of ticks which have passed since they were supposed
	 * to happen.  The rest of the queue elements (times > 0) are
	 * events yet to happen, where the time for each is given as a
	 * delta from the previous.  Decrementing just the first of these
	 * serves to decrement the time to all events.
	 */
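	/*
	 * Illustration (hypothetical numbers): events due 3, 5, and 9
	 * ticks from now are queued with c_time deltas 3, 2, 4.  One
	 * hardclock() tick decrements only the head, giving 2, 2, 4,
	 * i.e. the events are now 2, 4, and 8 ticks away.
	 */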
	needsoft = 0;
	for (p1 = calltodo.c_next; p1 != NULL; p1 = p1->c_next) {
		if (--p1->c_time > 0)
			break;
		needsoft = 1;
		if (p1->c_time == 0)
			break;
	}

	p = curproc;
	if (p) {
		register struct pstats *pstats;

		/*
		 * Run current process's virtual and profile time, as needed.
		 */
		pstats = p->p_stats;
		if (CLKF_USERMODE(frame) &&
		    timerisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value) &&
		    itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0)
			psignal(p, SIGVTALRM);
		if (timerisset(&pstats->p_timer[ITIMER_PROF].it_value) &&
		    itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0)
			psignal(p, SIGPROF);
	}

	/*
	 * If no separate statistics clock is available, run it from here.
	 */
	if (stathz == 0)
		statclock(frame);

	/*
	 * Increment the time-of-day.  The increment is just ``tick'' unless
	 * we are still adjusting the clock; see adjtime().
	 */
	ticks++;
	if (timedelta == 0)
		delta = tick;
	else {
		delta = tick + tickdelta;
		timedelta -= tickdelta;
	}
	BUMPTIME(&time, delta);
	BUMPTIME(&mono_time, delta);

	/*
	 * Process callouts at a very low cpu priority, so we don't keep the
	 * relatively high clock interrupt priority any longer than necessary.
	 */
	if (needsoft) {
		if (CLKF_BASEPRI(frame)) {
			/*
			 * Save the overhead of a software interrupt;
			 * it will happen as soon as we return, so do it now.
			 */
			(void)splsoftclock();
			softclock();
		} else
			setsoftclock();
	}
}

/*
 * Software (low priority) clock interrupt.
 * Run periodic events from timeout queue.
 */
/*ARGSUSED*/
void
softclock()
{
	register struct callout *c;
	register void *arg;
	register void (*func) __P((void *));
	register int s;

	s = splhigh();
	while ((c = calltodo.c_next) != NULL && c->c_time <= 0) {
		/* Unlink the expired entry and return it to the free list. */
		func = c->c_func;
		arg = c->c_arg;
		calltodo.c_next = c->c_next;
		c->c_next = callfree;
		callfree = c;
		/* Drop to the caller's priority while the handler runs. */
		splx(s);
		(*func)(arg);
		(void) splhigh();
	}
	splx(s);
}

/*
 * timeout --
 *	Execute a function after a specified length of time.
 *
 * untimeout --
 *	Cancel previous timeout function call.
 *
 *	See AT&T BCI Driver Reference Manual for specification.  This
 *	implementation differs from that one in that no identification
 *	value is returned from timeout; rather, the original arguments
 *	to timeout are used to identify entries for untimeout.
 */
void
timeout(ftn, arg, ticks)
	timeout_t ftn;
	void *arg;
	register int ticks;
{
	register struct callout *new, *p, *t;
	register int s;

	if (ticks <= 0)
		ticks = 1;

	/* Lock out the clock. */
	s = splhigh();

	/* Fill in the next free callout structure. */
	if (callfree == NULL)
		panic("timeout table full");
	new = callfree;
	callfree = new->c_next;
	new->c_arg = arg;
	new->c_func = ftn;

	/*
	 * The time for each event is stored as a difference from the time
	 * of the previous event on the queue.  Walk the queue, correcting
	 * the ticks argument for queue entries passed.  Correct the ticks
	 * value for the queue entry immediately after the insertion point
	 * as well.  Watch out for negative c_time values; these represent
	 * overdue events.
	 */
	for (p = &calltodo;
	    (t = p->c_next) != NULL && ticks > t->c_time; p = t)
		if (t->c_time > 0)
			ticks -= t->c_time;
	new->c_time = ticks;
	if (t != NULL)
		t->c_time -= ticks;

	/* Insert the new entry into the queue. */
	p->c_next = new;
	new->c_next = t;
	splx(s);
}

void
untimeout(ftn, arg)
	timeout_t ftn;
	void *arg;
{
	register struct callout *p, *t;
	register int s;

	s = splhigh();
	for (p = &calltodo; (t = p->c_next) != NULL; p = t)
		if (t->c_func == ftn && t->c_arg == arg) {
			/* Increment next entry's tick count. */
			if (t->c_next && t->c_time > 0)
				t->c_next->c_time += t->c_time;

			/* Move entry from callout queue to callfree queue. */
			p->c_next = t->c_next;
			t->c_next = callfree;
			callfree = t;
			break;
		}
	splx(s);
}
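
/*
 * Example usage (an illustrative sketch; ``xx_watchdog'' and ``sc''
 * are hypothetical driver names, not defined in this file):
 *
 *	timeout(xx_watchdog, (void *)sc, hz);	-- fire in about 1 second
 *	...
 *	untimeout(xx_watchdog, (void *)sc);	-- cancel if still pending
 *
 * Since no identifier is returned, the (function, argument) pair passed
 * to untimeout() must match the pair given to timeout() exactly.
 */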

/*
 * Compute number of hz until specified time.  Used to
 * compute third argument to timeout() from an absolute time.
 */
int
hzto(tv)
	struct timeval *tv;
{
	register long ticks, sec;
	int s;

	/*
	 * If number of milliseconds will fit in 32 bit arithmetic,
	 * then compute number of milliseconds to time and scale to
	 * ticks.  Otherwise just compute number of hz in time, rounding
	 * times greater than representable to maximum value.
	 *
	 * Delta times less than 25 days can be computed ``exactly''.
	 * Maximum value for any timeout in 10ms ticks is 250 days.
	 */
	s = splhigh();
	sec = tv->tv_sec - time.tv_sec;
	if (sec <= 0x7fffffff / 1000 - 1000)
		ticks = ((tv->tv_sec - time.tv_sec) * 1000 +
			(tv->tv_usec - time.tv_usec) / 1000) / (tick / 1000);
	else if (sec <= 0x7fffffff / hz)
		ticks = sec * hz;
	else
		ticks = 0x7fffffff;
	splx(s);
	return (ticks);
}
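
/*
 * Worked example (assuming hz = 100, so tick = 10000 usec): a tv that
 * lies 1.5 seconds in the future gives (1500 ms) / (10 ms) = 150 ticks,
 * the value to hand to timeout() above.
 */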

/*
 * Start profiling on a process.
 *
 * Kernel profiling passes proc0 which never exits and hence
 * keeps the profile clock running constantly.
 */
void
startprofclock(p)
	register struct proc *p;
{
	int s;

	if ((p->p_flag & P_PROFIL) == 0) {
		p->p_flag |= P_PROFIL;
		if (++profprocs == 1 && stathz != 0) {
			s = splstatclock();
			psdiv = pscnt = psratio;
			setstatclockrate(profhz);
			splx(s);
		}
	}
}

/*
 * Stop profiling on a process.
 */
void
stopprofclock(p)
	register struct proc *p;
{
	int s;

	if (p->p_flag & P_PROFIL) {
		p->p_flag &= ~P_PROFIL;
		if (--profprocs == 0 && stathz != 0) {
			s = splstatclock();
			psdiv = pscnt = 1;
			setstatclockrate(stathz);
			splx(s);
		}
	}
}

/*
 * Statistics clock.  Grab profile sample, and if divider reaches 0,
 * do process and kernel statistics.
 */
void
statclock(frame)
	register struct clockframe *frame;
{
#ifdef GPROF
	register struct gmonparam *g;
#endif
	register struct proc *p = curproc;
	register int i;

	if (p) {
		struct pstats *pstats;
		struct rusage *ru;
		struct vmspace *vm;

		/* Charge integral memory usage to the process, in kilobytes. */
		if ((pstats = p->p_stats) && (ru = &pstats->p_ru) &&
		    (vm = p->p_vmspace)) {
			ru->ru_ixrss += vm->vm_tsize * PAGE_SIZE / 1024;
			ru->ru_idrss += vm->vm_dsize * PAGE_SIZE / 1024;
			ru->ru_isrss += vm->vm_ssize * PAGE_SIZE / 1024;
			if ((vm->vm_pmap.pm_stats.resident_count * PAGE_SIZE /
			    1024) > ru->ru_maxrss) {
				ru->ru_maxrss =
				    vm->vm_pmap.pm_stats.resident_count *
				    PAGE_SIZE / 1024;
			}
		}
	}

	if (CLKF_USERMODE(frame)) {
		if (p->p_flag & P_PROFIL)
			addupc_intr(p, CLKF_PC(frame), 1);
		if (--pscnt > 0)
			return;
		/*
		 * Came from user mode; CPU was in user state.
		 * If this process is being profiled, record the tick.
		 */
		p->p_uticks++;
		if (p->p_nice > NZERO)
			cp_time[CP_NICE]++;
		else
			cp_time[CP_USER]++;
	} else {
#ifdef GPROF
		/*
		 * Kernel statistics are just like addupc_intr, only easier.
		 */
		g = &_gmonparam;
		if (g->state == GMON_PROF_ON) {
			i = CLKF_PC(frame) - g->lowpc;
			if (i < g->textsize) {
				i /= HISTFRACTION * sizeof(*g->kcount);
				g->kcount[i]++;
			}
		}
#endif
		if (--pscnt > 0)
			return;
		/*
		 * Came from kernel mode, so we were:
		 * - handling an interrupt,
		 * - doing syscall or trap work on behalf of the current
		 *   user process, or
		 * - spinning in the idle loop.
		 * Whichever it is, charge the time as appropriate.
		 * Note that we charge interrupts to the current process,
		 * regardless of whether they are ``for'' that process,
		 * so that we know how much of its real time was spent
		 * in ``non-process'' (i.e., interrupt) work.
		 */
		if (CLKF_INTR(frame)) {
			if (p != NULL)
				p->p_iticks++;
			cp_time[CP_INTR]++;
		} else if (p != NULL) {
			p->p_sticks++;
			cp_time[CP_SYS]++;
		} else
			cp_time[CP_IDLE]++;
	}
	pscnt = psdiv;

	/*
	 * We maintain statistics shown by user-level statistics
	 * programs:  the amount of time in each cpu state, and
	 * the amount of time each of DK_NDRIVE ``drives'' is busy.
	 *
	 * XXX	should either run linked list of drives, or (better)
	 *	grab timestamps in the start & done code.
	 */
	for (i = 0; i < DK_NDRIVE; i++)
		if (dk_busy & (1 << i))
			dk_time[i]++;

	/*
	 * We adjust the priority of the current process.  The priority of
	 * a process gets worse as it accumulates CPU time.  The cpu usage
	 * estimator (p_estcpu) is increased here.  The formula for computing
	 * priorities (in kern_synch.c) will compute a different value each
	 * time p_estcpu increases by 4.  The cpu usage estimator ramps up
	 * quite quickly when the process is running (linearly), and decays
	 * away exponentially, at a rate which is proportionally slower when
	 * the system is busy.  The basic principle is that the system will
	 * 90% forget that the process used a lot of CPU time in 5 * loadav
	 * seconds.  This causes the system to favor processes which haven't
	 * run much recently, and to round-robin among other processes.
	 */
	if (p != NULL) {
		p->p_cpticks++;
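		/*
		 * Saturating increment: if p_estcpu wraps around to 0,
		 * pull it back to the maximum value.
		 */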
		if (++p->p_estcpu == 0)
			p->p_estcpu--;
		if ((p->p_estcpu & 3) == 0) {
			resetpriority(p);
			if (p->p_priority >= PUSER)
				p->p_priority = p->p_usrpri;
		}
	}
}

/*
 * Return information about system clocks.
 */
int
sysctl_clockrate(where, sizep)
	register char *where;
	size_t *sizep;
{
	struct clockinfo clkinfo;

	/*
	 * Construct clockinfo structure.
	 */
	clkinfo.hz = hz;
	clkinfo.tick = tick;
	clkinfo.profhz = profhz;
	clkinfo.stathz = stathz ? stathz : hz;
	return (sysctl_rdstruct(where, sizep, NULL, &clkinfo, sizeof(clkinfo)));
}
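
/*
 * Example (an illustrative sketch, not part of this file): userland
 * reads these values through sysctl(3) as kern.clockrate:
 *
 *	struct clockinfo ci;
 *	int mib[2] = { CTL_KERN, KERN_CLOCKRATE };
 *	size_t len = sizeof(ci);
 *
 *	if (sysctl(mib, 2, &ci, &len, NULL, 0) == 0)
 *		printf("hz=%d tick=%d stathz=%d profhz=%d\n",
 *		    ci.hz, ci.tick, ci.stathz, ci.profhz);
 */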