kern_tc.c revision 1817
/*-
 * Copyright (c) 1982, 1986, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_clock.c	8.5 (Berkeley) 1/21/94
 * $Id$
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/dkstat.h>
#include <sys/callout.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>

#include <machine/cpu.h>

#ifdef GPROF
#include <sys/gmon.h>
#endif

/*
 * Clock handling routines.
 *
 * This code is written to operate with two timers that run independently of
 * each other.  The main clock, running hz times per second, is used to keep
 * track of real time.  The second timer handles kernel and user profiling,
 * and does resource use estimation.  If the second timer is programmable,
 * it is randomized to avoid aliasing between the two clocks.  For example,
 * the randomization prevents an adversary from always giving up the cpu
 * just before its quantum expires.  Otherwise, it would never accumulate
 * cpu ticks.  The mean frequency of the second timer is stathz.
 *
 * If no second timer exists, stathz will be zero; in this case we drive
 * profiling and statistics off the main clock.  This WILL NOT be accurate;
 * do not do it unless absolutely necessary.
 *
 * The statistics clock may (or may not) be run at a higher rate while
 * profiling.  This profile clock runs at profhz.  We require that profhz
 * be an integral multiple of stathz.
 *
 * If the statistics clock is running fast, it must be divided by the ratio
 * profhz/stathz for statistics.  (For profiling, every tick counts.)
 */
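
/*
 * For example (typical values, not mandated by this file): with stathz at
 * 128 and profhz at 1024, psratio is 8, so while profiling the single
 * statistics/profile clock interrupts 1024 times per second but only every
 * 8th tick is charged to the cpu-state and per-process statistics.
 */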

/*
 * TODO:
 *	allocate more timeout table slots when table overflows.
 */

/*
 * Bump a timeval by a small number of usec's.
 */
#define BUMPTIME(t, usec) { \
	register volatile struct timeval *tp = (t); \
	register long us; \
 \
	tp->tv_usec = us = tp->tv_usec + (usec); \
	if (us >= 1000000) { \
		tp->tv_usec = us - 1000000; \
		tp->tv_sec++; \
	} \
}
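
/*
 * Example (as used in hardclock() below): BUMPTIME(&time, delta) adds delta
 * microseconds to the wall clock, carrying into tv_sec when tv_usec reaches
 * one million.  The increment must stay below 1000000 usec for the single
 * carry to suffice.
 */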

int	stathz;
int	profhz;
int	profprocs;
int	ticks;
static int psdiv, pscnt;	/* prof => stat divider */
int	psratio;		/* ratio: prof / stat */

volatile struct	timeval time;
volatile struct	timeval mono_time;

/*
 * Initialize clock frequencies and start both clocks running.
 */
void
initclocks()
{
	register int i;

	/*
	 * Set divisors to 1 (normal case) and let the machine-specific
	 * code do its bit.
	 */
	psdiv = pscnt = 1;
	cpu_initclocks();

	/*
	 * Compute profhz/stathz, and fix profhz if needed.
	 */
	i = stathz ? stathz : hz;
	if (profhz == 0)
		profhz = i;
	psratio = profhz / i;
}

/*
 * The real-time timer, interrupting hz times per second.
 */
void
hardclock(frame)
	register struct clockframe *frame;
{
	register struct callout *p1;
	register struct proc *p;
	register int delta, needsoft;
	extern int tickdelta;
	extern long timedelta;

	/*
	 * Update real-time timeout queue.
	 * At front of queue are some number of events which are ``due''.
	 * The time to these is <= 0 and if negative represents the
	 * number of ticks which have passed since it was supposed to happen.
	 * The rest of the q elements (times > 0) are events yet to happen,
	 * where the time for each is given as a delta from the previous.
	 * Decrementing just the first of these serves to decrement the time
	 * to all events.
	 */
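	/*
	 * Worked example (illustrative values): events due 2, 5 and 9 ticks
	 * from now are stored with c_time deltas 2, 3 and 4.  Decrementing
	 * only the head's delta on each hardclock ages every event at once.
	 */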
	needsoft = 0;
	for (p1 = calltodo.c_next; p1 != NULL; p1 = p1->c_next) {
		if (--p1->c_time > 0)
			break;
		needsoft = 1;
		if (p1->c_time == 0)
			break;
	}

	p = curproc;
	if (p) {
		register struct pstats *pstats;

		/*
		 * Run current process's virtual and profile time, as needed.
		 */
		pstats = p->p_stats;
		if (CLKF_USERMODE(frame) &&
		    timerisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value) &&
		    itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0)
			psignal(p, SIGVTALRM);
		if (timerisset(&pstats->p_timer[ITIMER_PROF].it_value) &&
		    itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0)
			psignal(p, SIGPROF);
	}

	/*
	 * If no separate statistics clock is available, run it from here.
	 */
	if (stathz == 0)
		statclock(frame);

	/*
	 * Increment the time-of-day.  The increment is just ``tick'' unless
	 * we are still adjusting the clock; see adjtime().
	 */
	ticks++;
	if (timedelta == 0)
		delta = tick;
	else {
		delta = tick + tickdelta;
		timedelta -= tickdelta;
	}
	BUMPTIME(&time, delta);
	BUMPTIME(&mono_time, delta);
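	/*
	 * Note: while an adjtime() adjustment is pending, tickdelta extra
	 * microseconds are added to each tick until the remaining timedelta
	 * has been consumed, so the clock is slewed gradually rather than
	 * stepped.
	 */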

	/*
	 * Process callouts at a very low cpu priority, so we don't keep the
	 * relatively high clock interrupt priority any longer than necessary.
	 */
	if (needsoft) {
		if (CLKF_BASEPRI(frame)) {
			/*
			 * Save the overhead of a software interrupt;
			 * it will happen as soon as we return, so do it now.
			 */
			(void)splsoftclock();
			softclock();
		} else
			setsoftclock();
	}
}

/*
 * Software (low priority) clock interrupt.
 * Run periodic events from timeout queue.
 */
/*ARGSUSED*/
void
softclock()
{
	register struct callout *c;
	register void *arg;
	register void (*func) __P((void *));
	register int s;

	s = splhigh();
	while ((c = calltodo.c_next) != NULL && c->c_time <= 0) {
		func = c->c_func;
		arg = c->c_arg;
		calltodo.c_next = c->c_next;
		c->c_next = callfree;
		callfree = c;
		splx(s);
		(*func)(arg);
		(void) splhigh();
	}
	splx(s);
}
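
/*
 * Each expired callout is unlinked and returned to the free list at
 * splhigh, but the saved priority is restored around the handler call
 * itself, so handlers do not run with the clock blocked; the queue is
 * re-locked before the next entry is examined.
 */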

/*
 * timeout --
 *	Execute a function after a specified length of time.
 *
 * untimeout --
 *	Cancel previous timeout function call.
 *
 *	See AT&T BCI Driver Reference Manual for specification.  This
 *	implementation differs from that one in that no identification
 *	value is returned from timeout, rather, the original arguments
 *	to timeout are used to identify entries for untimeout.
 */
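
/*
 * Illustrative (hypothetical) usage: a driver that calls
 *	timeout(mydriver_tick, sc, hz);
 * has mydriver_tick(sc) invoked by softclock() roughly one second later,
 * and a matching
 *	untimeout(mydriver_tick, sc);
 * issued before then cancels the call.  mydriver_tick and sc are
 * placeholders, not names defined in this file.
 */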
void
timeout(ftn, arg, ticks)
	void (*ftn) __P((void *));
	void *arg;
	register int ticks;
{
	register struct callout *new, *p, *t;
	register int s;

	if (ticks <= 0)
		ticks = 1;

	/* Lock out the clock. */
	s = splhigh();

	/* Fill in the next free callout structure. */
	if (callfree == NULL)
		panic("timeout table full");
	new = callfree;
	callfree = new->c_next;
	new->c_arg = arg;
	new->c_func = ftn;

	/*
	 * The time for each event is stored as a difference from the time
	 * of the previous event on the queue.  Walk the queue, correcting
	 * the ticks argument for queue entries passed.  Correct the ticks
	 * value for the queue entry immediately after the insertion point
	 * as well.  Watch out for negative c_time values; these represent
	 * overdue events.
	 */
	for (p = &calltodo;
	    (t = p->c_next) != NULL && ticks > t->c_time; p = t)
		if (t->c_time > 0)
			ticks -= t->c_time;
	new->c_time = ticks;
	if (t != NULL)
		t->c_time -= ticks;

	/* Insert the new entry into the queue. */
	p->c_next = new;
	new->c_next = t;
	splx(s);
}
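
/*
 * Continuing the worked example above: inserting a 7-tick timeout into a
 * queue holding deltas 2, 3, 4 (events at +2, +5, +9) walks past the first
 * two entries, reducing ticks to 2; the new entry is linked in with c_time
 * 2, and the following entry's delta shrinks from 4 to 2, preserving its
 * absolute expiry at +9.
 */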

void
untimeout(ftn, arg)
	void (*ftn) __P((void *));
	void *arg;
{
	register struct callout *p, *t;
	register int s;

	s = splhigh();
	for (p = &calltodo; (t = p->c_next) != NULL; p = t)
		if (t->c_func == ftn && t->c_arg == arg) {
			/* Increment next entry's tick count. */
			if (t->c_next && t->c_time > 0)
				t->c_next->c_time += t->c_time;

			/* Move entry from callout queue to callfree queue. */
			p->c_next = t->c_next;
			t->c_next = callfree;
			callfree = t;
			break;
		}
	splx(s);
}

/*
 * Compute the number of clock ticks until the specified time.  Used to
 * compute the third argument to timeout() from an absolute time.
 */
int
hzto(tv)
	struct timeval *tv;
{
	register long ticks, sec;
	int s;

	/*
	 * If the number of milliseconds will fit in 32 bit arithmetic,
	 * then compute the number of milliseconds to the target time and
	 * scale to ticks.  Otherwise just compute the number of ticks in
	 * the interval, rounding times greater than representable to the
	 * maximum value.
	 *
	 * Delta times less than 25 days can be computed ``exactly''.
	 * Maximum value for any timeout in 10ms ticks is 250 days.
	 */
	s = splhigh();
	sec = tv->tv_sec - time.tv_sec;
	if (sec <= 0x7fffffff / 1000 - 1000)
		ticks = ((tv->tv_sec - time.tv_sec) * 1000 +
			(tv->tv_usec - time.tv_usec) / 1000) / (tick / 1000);
	else if (sec <= 0x7fffffff / hz)
		ticks = sec * hz;
	else
		ticks = 0x7fffffff;
	splx(s);
	return (ticks);
}
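
/*
 * Illustrative use (names hypothetical): to arm a callout for an absolute
 * struct timeval deadline, a caller could do
 *	timeout(myhandler, arg, hzto(&deadline));
 * With hz = 100 (10ms ticks) the millisecond path above is ``exact'' for
 * deltas up to roughly 24 days; longer deltas fall back to sec * hz, and
 * anything beyond that saturates at 0x7fffffff ticks.
 */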

/*
 * Start profiling on a process.
 *
 * Kernel profiling passes proc0 which never exits and hence
 * keeps the profile clock running constantly.
 */
void
startprofclock(p)
	register struct proc *p;
{
	int s;

	if ((p->p_flag & P_PROFIL) == 0) {
		p->p_flag |= P_PROFIL;
		if (++profprocs == 1 && stathz != 0) {
			s = splstatclock();
			psdiv = pscnt = psratio;
			setstatclockrate(profhz);
			splx(s);
		}
	}
}

/*
 * Stop profiling on a process.
 */
void
stopprofclock(p)
	register struct proc *p;
{
	int s;

	if (p->p_flag & P_PROFIL) {
		p->p_flag &= ~P_PROFIL;
		if (--profprocs == 0 && stathz != 0) {
			s = splstatclock();
			psdiv = pscnt = 1;
			setstatclockrate(stathz);
			splx(s);
		}
	}
}
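
/*
 * Between them, startprofclock() and stopprofclock() keep the statistics
 * clock at profhz whenever at least one process has P_PROFIL set, and drop
 * it back to stathz (with the psdiv divider reset to 1) once the last
 * profiled process stops profiling.
 */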

int	dk_ndrive = DK_NDRIVE;

/*
 * Statistics clock.  Grab profile sample, and if divider reaches 0,
 * do process and kernel statistics.
 */
void
statclock(frame)
	register struct clockframe *frame;
{
#ifdef GPROF
	register struct gmonparam *g;
#endif
	register struct proc *p;
	register int i;

	if (CLKF_USERMODE(frame)) {
		p = curproc;
		if (p->p_flag & P_PROFIL)
			addupc_intr(p, CLKF_PC(frame), 1);
		if (--pscnt > 0)
			return;
		/*
		 * Came from user mode; CPU was in user state.
		 * If this process is being profiled record the tick.
		 */
		p->p_uticks++;
		if (p->p_nice > NZERO)
			cp_time[CP_NICE]++;
		else
			cp_time[CP_USER]++;
	} else {
#ifdef GPROF
		/*
		 * Kernel statistics are just like addupc_intr, only easier.
		 */
		g = &_gmonparam;
		if (g->state == GMON_PROF_ON) {
			i = CLKF_PC(frame) - g->lowpc;
			if (i < g->textsize) {
				i /= HISTFRACTION * sizeof(*g->kcount);
				g->kcount[i]++;
			}
		}
#endif
		if (--pscnt > 0)
			return;
		/*
		 * Came from kernel mode, so we were:
		 * - handling an interrupt,
		 * - doing syscall or trap work on behalf of the current
		 *   user process, or
		 * - spinning in the idle loop.
		 * Whichever it is, charge the time as appropriate.
		 * Note that we charge interrupts to the current process,
		 * regardless of whether they are ``for'' that process,
		 * so that we know how much of its real time was spent
		 * in ``non-process'' (i.e., interrupt) work.
		 */
		p = curproc;
		if (CLKF_INTR(frame)) {
			if (p != NULL)
				p->p_iticks++;
			cp_time[CP_INTR]++;
		} else if (p != NULL) {
			p->p_sticks++;
			cp_time[CP_SYS]++;
		} else
			cp_time[CP_IDLE]++;
	}
	pscnt = psdiv;

	/*
	 * We maintain statistics shown by user-level statistics
	 * programs:  the amount of time in each cpu state, and
	 * the amount of time each of DK_NDRIVE ``drives'' is busy.
	 *
	 * XXX	should either run linked list of drives, or (better)
	 *	grab timestamps in the start & done code.
	 */
	for (i = 0; i < DK_NDRIVE; i++)
		if (dk_busy & (1 << i))
			dk_time[i]++;

	/*
	 * We adjust the priority of the current process.  The priority of
	 * a process gets worse as it accumulates CPU time.  The cpu usage
	 * estimator (p_estcpu) is increased here.  The formula for computing
	 * priorities (in kern_synch.c) will compute a different value each
	 * time p_estcpu increases by 4.  The cpu usage estimator ramps up
	 * quite quickly when the process is running (linearly), and decays
	 * away exponentially, at a rate which is proportionally slower when
	 * the system is busy.  The basic principle is that the system will
	 * 90% forget that the process used a lot of CPU time in 5 * loadav
	 * seconds.  This causes the system to favor processes which haven't
	 * run much recently, and to round-robin among other processes.
	 */
	if (p != NULL) {
		p->p_cpticks++;
		if (++p->p_estcpu == 0)
			p->p_estcpu--;
		if ((p->p_estcpu & 3) == 0) {
			resetpriority(p);
			if (p->p_priority >= PUSER)
				p->p_priority = p->p_usrpri;
		}
	}
}
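
/*
 * The pscnt/psdiv divider above means that while any process is being
 * profiled (psdiv == psratio) only every psratio-th statclock tick falls
 * through to the statistics and scheduling code; the profile sample, by
 * contrast, is taken on every tick.
 */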

/*
 * Return information about system clocks.
 */
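/*
 * From userland this is normally reached through the kern.clockrate sysctl
 * (KERN_CLOCKRATE), which returns the struct clockinfo filled in below.
 */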
int
sysctl_clockrate(where, sizep)
	register char *where;
	size_t *sizep;
{
	struct clockinfo clkinfo;

	/*
	 * Construct clockinfo structure.
	 */
	clkinfo.hz = hz;
	clkinfo.tick = tick;
	clkinfo.profhz = profhz;
	clkinfo.stathz = stathz ? stathz : hz;
	return (sysctl_rdstruct(where, sizep, NULL, &clkinfo, sizeof(clkinfo)));
}