kern_timeout.c revision 2112
/*-
 * Copyright (c) 1982, 1986, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_clock.c	8.5 (Berkeley) 1/21/94
 * $Id: kern_clock.c,v 1.3 1994/08/02 07:41:54 davidg Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/dkstat.h>
#include <sys/callout.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>

#include <machine/cpu.h>

#ifdef GPROF
#include <sys/gmon.h>
#endif

/* Does anybody else really care about these? */
struct callout *callfree, *callout, calltodo;
int ncallout;

/* Some of these don't belong here, but it's easiest to concentrate them. */
long cp_time[CPUSTATES];
long dk_seek[DK_NDRIVE];
long dk_time[DK_NDRIVE];
long dk_wds[DK_NDRIVE];
long dk_wpms[DK_NDRIVE];
long dk_xfer[DK_NDRIVE];

int dk_busy;
int dk_ndrive = DK_NDRIVE;

long tk_cancc;
long tk_nin;
long tk_nout;
long tk_rawcc;

/*
 * Clock handling routines.
 *
 * This code is written to operate with two timers that run independently of
 * each other.  The main clock, running hz times per second, is used to keep
 * track of real time.  The second timer handles kernel and user profiling,
 * and does resource use estimation.  If the second timer is programmable,
 * it is randomized to avoid aliasing between the two clocks.  For example,
 * the randomization prevents an adversary from always giving up the cpu
 * just before its quantum expires.  Otherwise, it would never accumulate
 * cpu ticks.  The mean frequency of the second timer is stathz.
 *
 * If no second timer exists, stathz will be zero; in this case we drive
 * profiling and statistics off the main clock.  This WILL NOT be accurate;
 * do not do it unless absolutely necessary.
 *
 * The statistics clock may (or may not) be run at a higher rate while
 * profiling.  This profile clock runs at profhz.  We require that profhz
 * be an integral multiple of stathz.
 *
 * If the statistics clock is running fast, it must be divided by the ratio
 * profhz/stathz for statistics.  (For profiling, every tick counts.)
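 *
 * For example (illustrative numbers, not a requirement): with stathz =
 * 128 and profhz = 1024, psratio is 8, so while profiling only every
 * 8th tick of the fast clock is counted for statistics.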
 */

/*
 * TODO:
 *	allocate more timeout table slots when table overflows.
 */

/*
 * Bump a timeval by a small number of usec's.
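 * (The usec increment must be less than 1000000, since the macro
 * performs at most one carry into tv_sec.)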
 */
#define BUMPTIME(t, usec) { \
	register volatile struct timeval *tp = (t); \
	register long us; \
 \
	tp->tv_usec = us = tp->tv_usec + (usec); \
	if (us >= 1000000) { \
		tp->tv_usec = us - 1000000; \
		tp->tv_sec++; \
	} \
}

int	stathz;
int	profhz;
int	profprocs;
int	ticks;
static int psdiv, pscnt;	/* prof => stat divider */
int	psratio;		/* ratio: prof / stat */

volatile struct	timeval time;
volatile struct	timeval mono_time;

/*
 * Initialize clock frequencies and start both clocks running.
 */
void
initclocks()
{
	register int i;

	/*
	 * Set divisors to 1 (normal case) and let the machine-specific
	 * code do its bit.
	 */
	psdiv = pscnt = 1;
	cpu_initclocks();

	/*
	 * Compute profhz/stathz, and fix profhz if needed.
	 */
	i = stathz ? stathz : hz;
	if (profhz == 0)
		profhz = i;
	psratio = profhz / i;
}

/*
 * The real-time timer, interrupting hz times per second.
 */
void
hardclock(frame)
	register struct clockframe *frame;
{
	register struct callout *p1;
	register struct proc *p;
	register int delta, needsoft;
	extern int tickdelta;
	extern long timedelta;

	/*
	 * Update real-time timeout queue.
	 * At front of queue are some number of events which are ``due''.
	 * The time to these is <= 0 and if negative represents the
	 * number of ticks which have passed since it was supposed to happen.
	 * The rest of the q elements (times > 0) are events yet to happen,
	 * where the time for each is given as a delta from the previous.
	 * Decrementing just the first of these serves to decrement the time
	 * to all events.
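	 *
	 * For example, events due in 3, 4, and 9 ticks are queued with
	 * c_time deltas of 3, 1, and 5 respectively.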
	 */
	needsoft = 0;
	for (p1 = calltodo.c_next; p1 != NULL; p1 = p1->c_next) {
		if (--p1->c_time > 0)
			break;
		needsoft = 1;
		if (p1->c_time == 0)
			break;
	}

	p = curproc;
	if (p) {
		register struct pstats *pstats;

		/*
		 * Run current process's virtual and profile time, as needed.
		 */
		pstats = p->p_stats;
		if (CLKF_USERMODE(frame) &&
		    timerisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value) &&
		    itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0)
			psignal(p, SIGVTALRM);
		if (timerisset(&pstats->p_timer[ITIMER_PROF].it_value) &&
		    itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0)
			psignal(p, SIGPROF);
	}

	/*
	 * If no separate statistics clock is available, run it from here.
	 */
	if (stathz == 0)
		statclock(frame);

	/*
	 * Increment the time-of-day.  The increment is just ``tick'' unless
	 * we are still adjusting the clock; see adjtime().
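	 *
	 * For example (hypothetical values), if adjtime() left timedelta
	 * at 1000 usec with tickdelta at 5 usec, each hardclock adds
	 * tick + 5 usec until the 1000 usec have been consumed, 200
	 * interrupts later.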
	 */
	ticks++;
	if (timedelta == 0)
		delta = tick;
	else {
		delta = tick + tickdelta;
		timedelta -= tickdelta;
	}
	BUMPTIME(&time, delta);
	BUMPTIME(&mono_time, delta);

	/*
	 * Process callouts at a very low cpu priority, so we don't keep the
	 * relatively high clock interrupt priority any longer than necessary.
	 */
	if (needsoft) {
		if (CLKF_BASEPRI(frame)) {
			/*
			 * Save the overhead of a software interrupt;
			 * it will happen as soon as we return, so do it now.
			 */
			(void)splsoftclock();
			softclock();
		} else
			setsoftclock();
	}
}

/*
 * Software (low priority) clock interrupt.
 * Run periodic events from timeout queue.
 */
/*ARGSUSED*/
void
softclock()
{
	register struct callout *c;
	register void *arg;
	register void (*func) __P((void *));
	register int s;

	s = splhigh();
	while ((c = calltodo.c_next) != NULL && c->c_time <= 0) {
		func = c->c_func;
		arg = c->c_arg;
		calltodo.c_next = c->c_next;
		c->c_next = callfree;
		callfree = c;
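		/*
		 * Restore the entry priority while the handler runs, so
		 * the clock interrupt is not held off; the handler may
		 * itself call timeout() to reschedule.
		 */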
		splx(s);
		(*func)(arg);
		(void) splhigh();
	}
	splx(s);
}

/*
 * timeout --
 *	Execute a function after a specified length of time.
 *
 * untimeout --
 *	Cancel previous timeout function call.
 *
 *	See AT&T BCI Driver Reference Manual for specification.  This
 *	implementation differs from that one in that no identification
 *	value is returned from timeout; rather, the original arguments
 *	to timeout are used to identify entries for untimeout.
 */
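
/*
 * Example usage (hypothetical driver code; mydrv_tick and sc are
 * illustrative names, not part of this file):
 *
 *	timeout(mydrv_tick, (void *)sc, hz);	call back in ~1 second
 *	...
 *	untimeout(mydrv_tick, (void *)sc);	cancel if still pending
 */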
void
timeout(ftn, arg, ticks)
	timeout_t ftn;
	void *arg;
	register int ticks;
{
	register struct callout *new, *p, *t;
	register int s;

	if (ticks <= 0)
		ticks = 1;

	/* Lock out the clock. */
	s = splhigh();

	/* Fill in the next free callout structure. */
	if (callfree == NULL)
		panic("timeout table full");
	new = callfree;
	callfree = new->c_next;
	new->c_arg = arg;
	new->c_func = ftn;

	/*
	 * The time for each event is stored as a difference from the time
	 * of the previous event on the queue.  Walk the queue, correcting
	 * the ticks argument for queue entries passed.  Correct the ticks
	 * value for the queue entry immediately after the insertion point
	 * as well.  Watch out for negative c_time values; these represent
	 * overdue events.
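	 *
	 * For example, inserting a 5-tick timeout into a queue holding
	 * deltas 3 and 1 (events 3 and 4 ticks out) walks past both
	 * entries, leaving ticks = 1, and appends the new entry with a
	 * c_time of 1.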
	 */
	for (p = &calltodo;
	    (t = p->c_next) != NULL && ticks > t->c_time; p = t)
		if (t->c_time > 0)
			ticks -= t->c_time;
	new->c_time = ticks;
	if (t != NULL)
		t->c_time -= ticks;

	/* Insert the new entry into the queue. */
	p->c_next = new;
	new->c_next = t;
	splx(s);
}

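/*
 * Note that untimeout removes only the first queue entry matching the
 * given function and argument, and, if that entry has time remaining,
 * credits it to the following entry to keep the delta encoding intact.
 */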
void
untimeout(ftn, arg)
	timeout_t ftn;
	void *arg;
{
	register struct callout *p, *t;
	register int s;

	s = splhigh();
	for (p = &calltodo; (t = p->c_next) != NULL; p = t)
		if (t->c_func == ftn && t->c_arg == arg) {
			/* Increment next entry's tick count. */
			if (t->c_next && t->c_time > 0)
				t->c_next->c_time += t->c_time;

			/* Move entry from callout queue to callfree queue. */
			p->c_next = t->c_next;
			t->c_next = callfree;
			callfree = t;
			break;
		}
	splx(s);
}

/*
 * Compute number of hz until specified time.  Used to
 * compute third argument to timeout() from an absolute time.
 */
int
hzto(tv)
	struct timeval *tv;
{
	register long ticks, sec;
	int s;

	/*
	 * If the number of milliseconds will fit in 32-bit arithmetic,
	 * compute the number of milliseconds to the target time and
	 * scale it to ticks.  Otherwise just compute the number of hz
	 * in the time, clamping times greater than representable to
	 * the maximum value.
	 *
	 * Delta times less than 25 days can be computed ``exactly''.
	 * The maximum value for any timeout in 10ms ticks is 250 days.
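	 *
	 * Worked example: with hz = 100 (tick = 10000 usec), a target
	 * 1.5 seconds away yields (1 * 1000 + 500000 / 1000) /
	 * (10000 / 1000) = 1500 / 10 = 150 ticks.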
	 */
	s = splhigh();
	sec = tv->tv_sec - time.tv_sec;
	if (sec <= 0x7fffffff / 1000 - 1000)
		ticks = ((tv->tv_sec - time.tv_sec) * 1000 +
			(tv->tv_usec - time.tv_usec) / 1000) / (tick / 1000);
	else if (sec <= 0x7fffffff / hz)
		ticks = sec * hz;
	else
		ticks = 0x7fffffff;
	splx(s);
	return (ticks);
}

/*
 * Start profiling on a process.
 *
 * Kernel profiling passes proc0 which never exits and hence
 * keeps the profile clock running constantly.
 */
void
startprofclock(p)
	register struct proc *p;
{
	int s;

	if ((p->p_flag & P_PROFIL) == 0) {
		p->p_flag |= P_PROFIL;
		if (++profprocs == 1 && stathz != 0) {
			s = splstatclock();
			psdiv = pscnt = psratio;
			setstatclockrate(profhz);
			splx(s);
		}
	}
}

/*
 * Stop profiling on a process.
 */
void
stopprofclock(p)
	register struct proc *p;
{
	int s;

	if (p->p_flag & P_PROFIL) {
		p->p_flag &= ~P_PROFIL;
		if (--profprocs == 0 && stathz != 0) {
			s = splstatclock();
			psdiv = pscnt = 1;
			setstatclockrate(stathz);
			splx(s);
		}
	}
}

/*
 * Statistics clock.  Grab profile sample, and if divider reaches 0,
 * do process and kernel statistics.
 */
void
statclock(frame)
	register struct clockframe *frame;
{
#ifdef GPROF
	register struct gmonparam *g;
#endif
	register struct proc *p;
	register int i;

	if (CLKF_USERMODE(frame)) {
		p = curproc;
		if (p->p_flag & P_PROFIL)
			addupc_intr(p, CLKF_PC(frame), 1);
		if (--pscnt > 0)
			return;
		/*
		 * Came from user mode; CPU was in user state.
		 * If this process is being profiled, record the tick.
		 */
		p->p_uticks++;
		if (p->p_nice > NZERO)
			cp_time[CP_NICE]++;
		else
			cp_time[CP_USER]++;
	} else {
#ifdef GPROF
		/*
		 * Kernel statistics are just like addupc_intr, only easier.
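		 * The interrupted pc's offset into the profiled kernel
		 * text is scaled down by HISTFRACTION * sizeof(*g->kcount)
		 * to index the kcount histogram bucket for that range.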
		 */
		g = &_gmonparam;
		if (g->state == GMON_PROF_ON) {
			i = CLKF_PC(frame) - g->lowpc;
			if (i < g->textsize) {
				i /= HISTFRACTION * sizeof(*g->kcount);
				g->kcount[i]++;
			}
		}
#endif
		if (--pscnt > 0)
			return;
		/*
		 * Came from kernel mode, so we were:
		 * - handling an interrupt,
		 * - doing syscall or trap work on behalf of the current
		 *   user process, or
		 * - spinning in the idle loop.
		 * Whichever it is, charge the time as appropriate.
		 * Note that we charge interrupts to the current process,
		 * regardless of whether they are ``for'' that process,
		 * so that we know how much of its real time was spent
		 * in ``non-process'' (i.e., interrupt) work.
		 */
		p = curproc;
		if (CLKF_INTR(frame)) {
			if (p != NULL)
				p->p_iticks++;
			cp_time[CP_INTR]++;
		} else if (p != NULL) {
			p->p_sticks++;
			cp_time[CP_SYS]++;
		} else
			cp_time[CP_IDLE]++;
	}
	pscnt = psdiv;

	/*
	 * We maintain statistics shown by user-level statistics
	 * programs:  the amount of time in each cpu state, and
	 * the amount of time each of DK_NDRIVE ``drives'' is busy.
	 *
	 * XXX	should either run linked list of drives, or (better)
	 *	grab timestamps in the start & done code.
	 */
	for (i = 0; i < DK_NDRIVE; i++)
		if (dk_busy & (1 << i))
			dk_time[i]++;

	/*
	 * We adjust the priority of the current process.  The priority of
	 * a process gets worse as it accumulates CPU time.  The cpu usage
	 * estimator (p_estcpu) is increased here.  The formula for computing
	 * priorities (in kern_synch.c) will compute a different value each
	 * time p_estcpu increases by 4.  The cpu usage estimator ramps up
	 * quite quickly when the process is running (linearly), and decays
	 * away exponentially, at a rate which is proportionally slower when
	 * the system is busy.  The basic principle is that the system will
	 * 90% forget that the process used a lot of CPU time in 5 * loadav
	 * seconds.  This causes the system to favor processes which haven't
	 * run much recently, and to round-robin among other processes.
	 */
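	/* Note: resetpriority() thus runs on every 4th tick charged here. */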
	if (p != NULL) {
		p->p_cpticks++;
		if (++p->p_estcpu == 0)
			p->p_estcpu--;
		if ((p->p_estcpu & 3) == 0) {
			resetpriority(p);
			if (p->p_priority >= PUSER)
				p->p_priority = p->p_usrpri;
		}
	}
}

/*
 * Return information about system clocks.
 */
int
sysctl_clockrate(where, sizep)
	register char *where;
	size_t *sizep;
{
	struct clockinfo clkinfo;

	/*
	 * Construct clockinfo structure.
	 */
	clkinfo.hz = hz;
	clkinfo.tick = tick;
	clkinfo.profhz = profhz;
	clkinfo.stathz = stathz ? stathz : hz;
	return (sysctl_rdstruct(where, sizep, NULL, &clkinfo, sizeof(clkinfo)));
}