/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_clock.c	8.5 (Berkeley) 1/21/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/11/sys/kern/kern_clock.c 330897 2018-03-14 03:19:51Z eadler $");

#include "opt_kdb.h"
#include "opt_device_polling.h"
#include "opt_hwpmc_hooks.h"
#include "opt_ntp.h"
#include "opt_watchdog.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/callout.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/sdt.h>
#include <sys/signalvar.h>
#include <sys/sleepqueue.h>
#include <sys/smp.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <sys/sysctl.h>
#include <sys/bus.h>
#include <sys/interrupt.h>
#include <sys/limits.h>
#include <sys/timetc.h>

#ifdef GPROF
#include <sys/gmon.h>
#endif

#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
PMC_SOFT_DEFINE( , , clock, hard);
PMC_SOFT_DEFINE( , , clock, stat);
PMC_SOFT_DEFINE_EX( , , clock, prof, \
    cpu_startprofclock, cpu_stopprofclock);
#endif

#ifdef DEVICE_POLLING
extern void hardclock_device_poll(void);
#endif /* DEVICE_POLLING */

static void initclocks(void *dummy);
SYSINIT(clocks, SI_SUB_CLOCKS, SI_ORDER_FIRST, initclocks, NULL);

/* Spin-lock protecting profiling statistics. */
static struct mtx time_lock;

SDT_PROVIDER_DECLARE(sched);
SDT_PROBE_DEFINE2(sched, , , tick, "struct thread *", "struct proc *");

static int
sysctl_kern_cp_time(SYSCTL_HANDLER_ARGS)
{
	int error;
	long cp_time[CPUSTATES];
#ifdef SCTL_MASK32
	int i;
	unsigned int cp_time32[CPUSTATES];
#endif

	read_cpu_time(cp_time);
#ifdef SCTL_MASK32
	if (req->flags & SCTL_MASK32) {
		if (!req->oldptr)
			return SYSCTL_OUT(req, 0, sizeof(cp_time32));
		for (i = 0; i < CPUSTATES; i++)
			cp_time32[i] = (unsigned int)cp_time[i];
		error = SYSCTL_OUT(req, cp_time32, sizeof(cp_time32));
	} else
#endif
	{
		if (!req->oldptr)
			return SYSCTL_OUT(req, 0, sizeof(cp_time));
		error = SYSCTL_OUT(req, cp_time, sizeof(cp_time));
	}
	return error;
}

SYSCTL_PROC(_kern, OID_AUTO, cp_time, CTLTYPE_LONG|CTLFLAG_RD|CTLFLAG_MPSAFE,
    0,0, sysctl_kern_cp_time, "LU", "CPU time statistics");

static long empty[CPUSTATES];

static int
sysctl_kern_cp_times(SYSCTL_HANDLER_ARGS)
{
	struct pcpu *pcpu;
	int error;
	int c;
	long *cp_time;
#ifdef SCTL_MASK32
	unsigned int cp_time32[CPUSTATES];
	int i;
#endif

	if (!req->oldptr) {
#ifdef SCTL_MASK32
		if (req->flags & SCTL_MASK32)
			return SYSCTL_OUT(req, 0, sizeof(cp_time32) * (mp_maxid + 1));
		else
#endif
			return SYSCTL_OUT(req, 0, sizeof(long) * CPUSTATES * (mp_maxid + 1));
	}
	for (error = 0, c = 0; error == 0 && c <= mp_maxid; c++) {
		if (!CPU_ABSENT(c)) {
			pcpu = pcpu_find(c);
			cp_time = pcpu->pc_cp_time;
		} else {
			cp_time = empty;
		}
#ifdef SCTL_MASK32
		if (req->flags & SCTL_MASK32) {
			for (i = 0; i < CPUSTATES; i++)
				cp_time32[i] = (unsigned int)cp_time[i];
			error = SYSCTL_OUT(req, cp_time32, sizeof(cp_time32));
		} else
#endif
			error = SYSCTL_OUT(req, cp_time, sizeof(long) * CPUSTATES);
	}
	return error;
}

SYSCTL_PROC(_kern, OID_AUTO, cp_times, CTLTYPE_LONG|CTLFLAG_RD|CTLFLAG_MPSAFE,
    0,0, sysctl_kern_cp_times, "LU", "per-CPU time statistics");
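
/*
 * Illustrative only (not part of the kernel build): a userland program would
 * typically read the aggregate statistics exported above via sysctlbyname(3).
 * The state indices (CP_USER, CP_NICE, CP_SYS, CP_INTR, CP_IDLE) come from
 * <sys/resource.h>; the sketch assumes the native (non-SCTL_MASK32) layout of
 * one long per state and omits error handling.
 *
 *	long cp_time[CPUSTATES];
 *	size_t len = sizeof(cp_time);
 *
 *	if (sysctlbyname("kern.cp_time", cp_time, &len, NULL, 0) == 0)
 *		printf("user %ld sys %ld idle %ld\n",
 *		    cp_time[CP_USER], cp_time[CP_SYS], cp_time[CP_IDLE]);
 */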

#ifdef DEADLKRES
static const char *blessed[] = {
	"getblk",
	"so_snd_sx",
	"so_rcv_sx",
	NULL
};
static int slptime_threshold = 1800;
static int blktime_threshold = 900;
static int sleepfreq = 3;

static void
deadlkres(void)
{
	struct proc *p;
	struct thread *td;
	void *wchan;
	int blkticks, i, slpticks, slptype, tryl, tticks;

	tryl = 0;
	for (;;) {
		blkticks = blktime_threshold * hz;
		slpticks = slptime_threshold * hz;

		/*
		 * Avoid sleeping on the sx_lock in order to avoid a possible
		 * priority inversion problem leading to starvation.
		 * If the lock can't be acquired after 100 tries, panic.
		 */
		if (!sx_try_slock(&allproc_lock)) {
			if (tryl > 100)
		panic("%s: possible deadlock detected on allproc_lock\n",
				    __func__);
			tryl++;
			pause("allproc", sleepfreq * hz);
			continue;
		}
		tryl = 0;
		FOREACH_PROC_IN_SYSTEM(p) {
			PROC_LOCK(p);
			if (p->p_state == PRS_NEW) {
				PROC_UNLOCK(p);
				continue;
			}
			FOREACH_THREAD_IN_PROC(p, td) {

				thread_lock(td);
				if (TD_ON_LOCK(td)) {

					/*
					 * The thread should be blocked on a
					 * turnstile; simply check whether the
					 * turnstile channel is in good state.
					 */
					MPASS(td->td_blocked != NULL);

					tticks = ticks - td->td_blktick;
					thread_unlock(td);
					if (tticks > blkticks) {

						/*
						 * According to the provided
						 * thresholds, this thread has
						 * been stuck for too long on
						 * a turnstile.
						 */
						PROC_UNLOCK(p);
						sx_sunlock(&allproc_lock);
	panic("%s: possible deadlock detected for %p, blocked for %d ticks\n",
						    __func__, td, tticks);
					}
				} else if (TD_IS_SLEEPING(td) &&
				    TD_ON_SLEEPQ(td)) {

					/*
					 * Check whether the thread is sleeping
					 * on a lock; otherwise skip the check.
					 * Drop the thread lock in order to
					 * avoid a LOR with the sleepqueue
					 * spinlock.
					 */
					wchan = td->td_wchan;
					tticks = ticks - td->td_slptick;
					thread_unlock(td);
					slptype = sleepq_type(wchan);
					if ((slptype == SLEEPQ_SX ||
					    slptype == SLEEPQ_LK) &&
					    tticks > slpticks) {

						/*
						 * According to the provided
						 * thresholds, this thread has
						 * been stuck for too long on
						 * a sleepqueue.
						 * However, being on a
						 * sleepqueue, we might still
						 * need to check the blessed
						 * list.
						 */
						tryl = 0;
						for (i = 0; blessed[i] != NULL;
						    i++) {
							if (!strcmp(blessed[i],
							    td->td_wmesg)) {
								tryl = 1;
								break;
							}
						}
						if (tryl != 0) {
							tryl = 0;
							continue;
						}
						PROC_UNLOCK(p);
						sx_sunlock(&allproc_lock);
	panic("%s: possible deadlock detected for %p, blocked for %d ticks\n",
						    __func__, td, tticks);
					}
				} else
					thread_unlock(td);
			}
			PROC_UNLOCK(p);
		}
		sx_sunlock(&allproc_lock);

		/* Sleep for sleepfreq seconds. */
		pause("-", sleepfreq * hz);
	}
}

static struct kthread_desc deadlkres_kd = {
	"deadlkres",
	deadlkres,
	(struct thread **)NULL
};

SYSINIT(deadlkres, SI_SUB_CLOCKS, SI_ORDER_ANY, kthread_start, &deadlkres_kd);

static SYSCTL_NODE(_debug, OID_AUTO, deadlkres, CTLFLAG_RW, 0,
    "Deadlock resolver");
SYSCTL_INT(_debug_deadlkres, OID_AUTO, slptime_threshold, CTLFLAG_RW,
    &slptime_threshold, 0,
    "Number of seconds within which it is valid to sleep on a sleepqueue");
SYSCTL_INT(_debug_deadlkres, OID_AUTO, blktime_threshold, CTLFLAG_RW,
    &blktime_threshold, 0,
    "Number of seconds within which it is valid to block on a turnstile");
SYSCTL_INT(_debug_deadlkres, OID_AUTO, sleepfreq, CTLFLAG_RW, &sleepfreq, 0,
    "Number of seconds between deadlock resolver thread runs");
#endif	/* DEADLKRES */

void
read_cpu_time(long *cp_time)
{
	struct pcpu *pc;
	int i, j;

	/* Sum up global cp_time[]. */
	bzero(cp_time, sizeof(long) * CPUSTATES);
	CPU_FOREACH(i) {
		pc = pcpu_find(i);
		for (j = 0; j < CPUSTATES; j++)
			cp_time[j] += pc->pc_cp_time[j];
	}
}

#include <sys/watchdog.h>

static int watchdog_ticks;
static int watchdog_enabled;
static void watchdog_fire(void);
static void watchdog_config(void *, u_int, int *);

static void
watchdog_attach(void)
{
	EVENTHANDLER_REGISTER(watchdog_list, watchdog_config, NULL, 0);
}

/*
 * Clock handling routines.
 *
 * This code is written to operate with two timers that run independently of
 * each other.
 *
 * The main timer, running hz times per second, is used to trigger interval
 * timers, timeouts and rescheduling as needed.
 *
 * The second timer handles kernel and user profiling,
 * and does resource use estimation.  If the second timer is programmable,
 * it is randomized to avoid aliasing between the two clocks.  For example,
 * the randomization prevents an adversary from always giving up the cpu
 * just before its quantum expires.  Otherwise, it would never accumulate
 * cpu ticks.  The mean frequency of the second timer is stathz.
 *
 * If no second timer exists, stathz will be zero; in this case we drive
 * profiling and statistics off the main clock.  This WILL NOT be accurate;
 * do not do it unless absolutely necessary.
 *
 * The statistics clock may (or may not) be run at a higher rate while
 * profiling.  This profile clock runs at profhz.  We require that profhz
 * be an integral multiple of stathz.
 *
 * If the statistics clock is running fast, it must be divided by the ratio
 * profhz/stathz for statistics.  (For profiling, every tick counts.)
 *
 * Time-of-day is maintained using a "timecounter", which may or may
 * not be related to the hardware generating the above mentioned
 * interrupts.
 */
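
/*
 * Worked example (the numbers are illustrative, not defaults): with
 * stathz = 128 and profhz = 1024, profhz is an integral multiple of stathz
 * and psratio = profhz / stathz = 8, so statistics gathered while the clock
 * is running at profhz must be divided by 8.
 */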

int	stathz;
int	profhz;
int	profprocs;
volatile int	ticks;
int	psratio;

static DPCPU_DEFINE(int, pcputicks);	/* Per-CPU version of ticks. */
#ifdef DEVICE_POLLING
static int devpoll_run = 0;
#endif

/*
 * Initialize clock frequencies and start both clocks running.
 */
/* ARGSUSED*/
static void
initclocks(dummy)
	void *dummy;
{
	register int i;

	/*
	 * Set divisors to 1 (normal case) and let the machine-specific
	 * code do its bit.
	 */
	mtx_init(&time_lock, "time lock", NULL, MTX_DEF);
	cpu_initclocks();

	/*
	 * Compute profhz/stathz, and fix profhz if needed.
	 */
	i = stathz ? stathz : hz;
	if (profhz == 0)
		profhz = i;
	psratio = profhz / i;

#ifdef SW_WATCHDOG
	/* Enable hardclock watchdog now, even if a hardware watchdog exists. */
	watchdog_attach();
#else
	/* Volunteer to run a software watchdog. */
	if (wdog_software_attach == NULL)
		wdog_software_attach = watchdog_attach;
#endif
}

/*
 * Each time the real-time timer fires, this function is called on all CPUs.
 * Note that hardclock() calls hardclock_cpu() for the boot CPU, so only
 * the other CPUs in the system need to call this function.
 */
void
hardclock_cpu(int usermode)
{
	struct pstats *pstats;
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	int flags;

	/*
	 * Run current process's virtual and profile time, as needed.
	 */
	pstats = p->p_stats;
	flags = 0;
	if (usermode &&
	    timevalisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value)) {
		PROC_ITIMLOCK(p);
		if (itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0)
			flags |= TDF_ALRMPEND | TDF_ASTPENDING;
		PROC_ITIMUNLOCK(p);
	}
	if (timevalisset(&pstats->p_timer[ITIMER_PROF].it_value)) {
		PROC_ITIMLOCK(p);
		if (itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0)
			flags |= TDF_PROFPEND | TDF_ASTPENDING;
		PROC_ITIMUNLOCK(p);
	}
	thread_lock(td);
	td->td_flags |= flags;
	thread_unlock(td);

#ifdef HWPMC_HOOKS
	if (PMC_CPU_HAS_SAMPLES(PCPU_GET(cpuid)))
		PMC_CALL_HOOK_UNLOCKED(curthread, PMC_FN_DO_SAMPLES, NULL);
	if (td->td_intr_frame != NULL)
		PMC_SOFT_CALL_TF( , , clock, hard, td->td_intr_frame);
#endif
	callout_process(sbinuptime());
}
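
/*
 * Illustrative sketch only (the real callers live in machine-dependent and
 * event-timer code, not in this file): per the comment above, a periodic
 * clock-interrupt handler would be expected to dispatch roughly as follows.
 * "cpu_is_boot_cpu", "usermode" and "pc" are placeholder names for values
 * taken from the trap frame and CPU identity.
 *
 *	if (cpu_is_boot_cpu)
 *		hardclock(usermode, pc);	// global duties + per-CPU work
 *	else
 *		hardclock_cpu(usermode);	// per-CPU work only
 */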

/*
 * The real-time timer, interrupting hz times per second.
 */
void
hardclock(int usermode, uintfptr_t pc)
{

	atomic_add_int(&ticks, 1);
	hardclock_cpu(usermode);
	tc_ticktock(1);
	cpu_tick_calibration();
	/*
	 * If no separate statistics clock is available, run it from here.
	 *
	 * XXX: this only works for UP
	 */
	if (stathz == 0) {
		profclock(usermode, pc);
		statclock(usermode);
	}
#ifdef DEVICE_POLLING
	hardclock_device_poll();	/* this is very short and quick */
#endif /* DEVICE_POLLING */
	if (watchdog_enabled > 0 && --watchdog_ticks <= 0)
		watchdog_fire();
}

void
hardclock_cnt(int cnt, int usermode)
{
	struct pstats *pstats;
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	int *t = DPCPU_PTR(pcputicks);
	int flags, global, newticks;
	int i;

	/*
	 * Update per-CPU and possibly global ticks values.
	 */
	*t += cnt;
	do {
		global = ticks;
		newticks = *t - global;
		if (newticks <= 0) {
			if (newticks < -1)
				*t = global - 1;
			newticks = 0;
			break;
		}
	} while (!atomic_cmpset_int(&ticks, global, *t));

	/*
	 * Run current process's virtual and profile time, as needed.
	 */
	pstats = p->p_stats;
	flags = 0;
	if (usermode &&
	    timevalisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value)) {
		PROC_ITIMLOCK(p);
		if (itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL],
		    tick * cnt) == 0)
			flags |= TDF_ALRMPEND | TDF_ASTPENDING;
		PROC_ITIMUNLOCK(p);
	}
	if (timevalisset(&pstats->p_timer[ITIMER_PROF].it_value)) {
		PROC_ITIMLOCK(p);
		if (itimerdecr(&pstats->p_timer[ITIMER_PROF],
		    tick * cnt) == 0)
			flags |= TDF_PROFPEND | TDF_ASTPENDING;
		PROC_ITIMUNLOCK(p);
	}
	if (flags != 0) {
		thread_lock(td);
		td->td_flags |= flags;
		thread_unlock(td);
	}

#ifdef	HWPMC_HOOKS
	if (PMC_CPU_HAS_SAMPLES(PCPU_GET(cpuid)))
		PMC_CALL_HOOK_UNLOCKED(curthread, PMC_FN_DO_SAMPLES, NULL);
	if (td->td_intr_frame != NULL)
		PMC_SOFT_CALL_TF( , , clock, hard, td->td_intr_frame);
#endif
	/* We are in charge of handling this tick duty. */
	if (newticks > 0) {
		tc_ticktock(newticks);
#ifdef DEVICE_POLLING
		/* Dangerous and unnecessary to call these things concurrently. */
		if (atomic_cmpset_acq_int(&devpoll_run, 0, 1)) {
			/* This is very short and quick. */
			hardclock_device_poll();
			atomic_store_rel_int(&devpoll_run, 0);
		}
#endif /* DEVICE_POLLING */
		if (watchdog_enabled > 0) {
			i = atomic_fetchadd_int(&watchdog_ticks, -newticks);
			if (i > 0 && i <= newticks)
				watchdog_fire();
		}
	}
	if (curcpu == CPU_FIRST())
		cpu_tick_calibration();
}

void
hardclock_sync(int cpu)
{
	int	*t = DPCPU_ID_PTR(cpu, pcputicks);

	*t = ticks;
}
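
/*
 * Illustrative sketch only (the real callers live in the event-timer code,
 * not in this file): hardclock_cnt() lets a caller account for several
 * accumulated ticks in one call, and hardclock_sync() reseeds a CPU's
 * private tick counter from the global ticks before that CPU resumes
 * calling hardclock_cnt().  "cpu", "missed" and "usermode" below are
 * placeholder names.
 *
 *	hardclock_sync(cpu);		// pcputicks = ticks
 *	...
 *	hardclock_cnt(missed, usermode);
 */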

/*
 * Compute number of ticks in the specified amount of time.
 */
int
tvtohz(tv)
	struct timeval *tv;
{
	register unsigned long ticks;
	register long sec, usec;

	/*
	 * If the number of usecs in the whole seconds part of the time
	 * difference fits in a long, then the total number of usecs will
	 * fit in an unsigned long.  Compute the total and convert it to
	 * ticks, rounding up and adding 1 to allow for the current tick
	 * to expire.  Rounding also depends on unsigned long arithmetic
	 * to avoid overflow.
	 *
	 * Otherwise, if the number of ticks in the whole seconds part of
	 * the time difference fits in a long, then convert the parts to
	 * ticks separately and add, using similar rounding methods and
	 * overflow avoidance.  This method would work in the previous
	 * case but it is slightly slower and assumes that hz is integral.
	 *
	 * Otherwise, round the time difference down to the maximum
	 * representable value.
	 *
	 * If ints have 32 bits, then the maximum value for any timeout in
	 * 10ms ticks is 248 days.
	 */
	sec = tv->tv_sec;
	usec = tv->tv_usec;
	if (usec < 0) {
		sec--;
		usec += 1000000;
	}
	if (sec < 0) {
#ifdef DIAGNOSTIC
		if (usec > 0) {
			sec++;
			usec -= 1000000;
		}
		printf("tvtohz: negative time difference %ld sec %ld usec\n",
		       sec, usec);
#endif
		ticks = 1;
	} else if (sec <= LONG_MAX / 1000000)
		ticks = howmany(sec * 1000000 + (unsigned long)usec, tick) + 1;
	else if (sec <= LONG_MAX / hz)
		ticks = sec * hz
			+ howmany((unsigned long)usec, tick) + 1;
	else
		ticks = LONG_MAX;
	if (ticks > INT_MAX)
		ticks = INT_MAX;
	return ((int)ticks);
}
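
/*
 * Worked example (assuming hz = 1000, i.e. tick = 1000 usec): for
 * tv = { .tv_sec = 2, .tv_usec = 500 } the first branch applies and
 * tvtohz() returns howmany(2 * 1000000 + 500, 1000) + 1
 * = howmany(2000500, 1000) + 1 = 2001 + 1 = 2002 ticks; the extra tick
 * allows for the partially elapsed current tick.
 */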

/*
 * Start profiling on a process.
 *
 * Kernel profiling passes proc0 which never exits and hence
 * keeps the profile clock running constantly.
 */
void
startprofclock(p)
	register struct proc *p;
{

	PROC_LOCK_ASSERT(p, MA_OWNED);
	if (p->p_flag & P_STOPPROF)
		return;
	if ((p->p_flag & P_PROFIL) == 0) {
		p->p_flag |= P_PROFIL;
		mtx_lock(&time_lock);
		if (++profprocs == 1)
			cpu_startprofclock();
		mtx_unlock(&time_lock);
	}
}

/*
 * Stop profiling on a process.
 */
void
stopprofclock(p)
	register struct proc *p;
{

	PROC_LOCK_ASSERT(p, MA_OWNED);
	if (p->p_flag & P_PROFIL) {
		if (p->p_profthreads != 0) {
			while (p->p_profthreads != 0) {
				p->p_flag |= P_STOPPROF;
				msleep(&p->p_profthreads, &p->p_mtx, PPAUSE,
				    "stopprof", 0);
			}
		}
		if ((p->p_flag & P_PROFIL) == 0)
			return;
		p->p_flag &= ~P_PROFIL;
		mtx_lock(&time_lock);
		if (--profprocs == 0)
			cpu_stopprofclock();
		mtx_unlock(&time_lock);
	}
}
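
/*
 * Illustrative usage sketch: both routines assert that the caller holds the
 * proc lock, and it is the first/last profiled process that actually starts
 * or stops the CPU profile clock (via the profprocs count above).  A
 * hypothetical caller would therefore look like:
 *
 *	PROC_LOCK(p);
 *	startprofclock(p);	// ++profprocs == 1 starts the profile clock
 *	PROC_UNLOCK(p);
 */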

/*
 * Statistics clock.  Updates rusage information and calls the scheduler
 * to adjust priorities of the active thread.
 *
 * This should be called by all active processors.
 */
void
statclock(int usermode)
{

	statclock_cnt(1, usermode);
}

void
statclock_cnt(int cnt, int usermode)
{
	struct rusage *ru;
	struct vmspace *vm;
	struct thread *td;
	struct proc *p;
	long rss;
	long *cp_time;

	td = curthread;
	p = td->td_proc;

	cp_time = (long *)PCPU_PTR(cp_time);
	if (usermode) {
		/*
		 * Charge the time as appropriate.
		 */
		td->td_uticks += cnt;
		if (p->p_nice > NZERO)
			cp_time[CP_NICE] += cnt;
		else
			cp_time[CP_USER] += cnt;
	} else {
		/*
		 * Came from kernel mode, so we were:
		 * - handling an interrupt,
		 * - doing syscall or trap work on behalf of the current
		 *   user process, or
		 * - spinning in the idle loop.
		 * Whichever it is, charge the time as appropriate.
		 * Note that we charge interrupts to the current process,
		 * regardless of whether they are ``for'' that process,
		 * so that we know how much of its real time was spent
		 * in ``non-process'' (i.e., interrupt) work.
		 */
		if ((td->td_pflags & TDP_ITHREAD) ||
		    td->td_intr_nesting_level >= 2) {
			td->td_iticks += cnt;
			cp_time[CP_INTR] += cnt;
		} else {
			td->td_pticks += cnt;
			td->td_sticks += cnt;
			if (!TD_IS_IDLETHREAD(td))
				cp_time[CP_SYS] += cnt;
			else
				cp_time[CP_IDLE] += cnt;
		}
	}

	/* Update resource usage integrals and maximums. */
	MPASS(p->p_vmspace != NULL);
	vm = p->p_vmspace;
	ru = &td->td_ru;
	ru->ru_ixrss += pgtok(vm->vm_tsize) * cnt;
	ru->ru_idrss += pgtok(vm->vm_dsize) * cnt;
	ru->ru_isrss += pgtok(vm->vm_ssize) * cnt;
	rss = pgtok(vmspace_resident_count(vm));
	if (ru->ru_maxrss < rss)
		ru->ru_maxrss = rss;
	KTR_POINT2(KTR_SCHED, "thread", sched_tdname(td), "statclock",
	    "prio:%d", td->td_priority, "stathz:%d", (stathz)?stathz:hz);
	SDT_PROBE2(sched, , , tick, td, td->td_proc);
	thread_lock_flags(td, MTX_QUIET);
	for ( ; cnt > 0; cnt--)
		sched_clock(td);
	thread_unlock(td);
#ifdef HWPMC_HOOKS
	if (td->td_intr_frame != NULL)
		PMC_SOFT_CALL_TF( , , clock, stat, td->td_intr_frame);
#endif
}

void
profclock(int usermode, uintfptr_t pc)
{

	profclock_cnt(1, usermode, pc);
}

void
profclock_cnt(int cnt, int usermode, uintfptr_t pc)
{
	struct thread *td;
#ifdef GPROF
	struct gmonparam *g;
	uintfptr_t i;
#endif

	td = curthread;
	if (usermode) {
		/*
		 * Came from user mode; CPU was in user state.
		 * If this process is being profiled, record the tick.
		 * If there is no related user location yet, don't
		 * bother trying to count it.
		 */
		if (td->td_proc->p_flag & P_PROFIL)
			addupc_intr(td, pc, cnt);
	}
#ifdef GPROF
	else {
		/*
		 * Kernel statistics are just like addupc_intr, only easier.
		 */
		g = &_gmonparam;
		if (g->state == GMON_PROF_ON && pc >= g->lowpc) {
			i = PC_TO_I(g, pc);
			if (i < g->textsize) {
				KCOUNT(g, i) += cnt;
			}
		}
	}
#endif
#ifdef HWPMC_HOOKS
	if (td->td_intr_frame != NULL)
		PMC_SOFT_CALL_TF( , , clock, prof, td->td_intr_frame);
#endif
}

/*
 * Return information about system clocks.
 */
static int
sysctl_kern_clockrate(SYSCTL_HANDLER_ARGS)
{
	struct clockinfo clkinfo;
	/*
	 * Construct clockinfo structure.
	 */
	bzero(&clkinfo, sizeof(clkinfo));
	clkinfo.hz = hz;
	clkinfo.tick = tick;
	clkinfo.profhz = profhz;
	clkinfo.stathz = stathz ? stathz : hz;
	return (sysctl_handle_opaque(oidp, &clkinfo, sizeof clkinfo, req));
}

SYSCTL_PROC(_kern, KERN_CLOCKRATE, clockrate,
	CTLTYPE_STRUCT|CTLFLAG_RD|CTLFLAG_MPSAFE,
	0, 0, sysctl_kern_clockrate, "S,clockinfo",
	"Rate and period of various kernel clocks");
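
/*
 * Illustrative only (not part of the kernel build): from userland the
 * structure above is typically retrieved with sysctlbyname(3), using
 * struct clockinfo from <sys/time.h>.  Error handling is omitted.
 *
 *	struct clockinfo ci;
 *	size_t len = sizeof(ci);
 *
 *	if (sysctlbyname("kern.clockrate", &ci, &len, NULL, 0) == 0)
 *		printf("hz %d stathz %d profhz %d tick %d\n",
 *		    ci.hz, ci.stathz, ci.profhz, ci.tick);
 */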

static void
watchdog_config(void *unused __unused, u_int cmd, int *error)
{
	u_int u;

	u = cmd & WD_INTERVAL;
	if (u >= WD_TO_1SEC) {
		watchdog_ticks = (1 << (u - WD_TO_1SEC)) * hz;
		watchdog_enabled = 1;
		*error = 0;
	} else {
		watchdog_enabled = 0;
	}
}
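
/*
 * Worked example (illustrative): u - WD_TO_1SEC is how many doublings above
 * (roughly) one second the requested timeout is, so for u = WD_TO_1SEC + 2
 * the code above arms watchdog_ticks = (1 << 2) * hz = 4 * hz hardclock
 * ticks, i.e. roughly four seconds.
 */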

/*
 * Handle a watchdog timeout by dumping interrupt information and
 * then either dropping to DDB or panicking.
 */
static void
watchdog_fire(void)
{
	int nintr;
	uint64_t inttotal;
	u_long *curintr;
	char *curname;

	curintr = intrcnt;
	curname = intrnames;
	inttotal = 0;
	nintr = sintrcnt / sizeof(u_long);

	printf("interrupt                   total\n");
	while (--nintr >= 0) {
		if (*curintr)
			printf("%-12s %20lu\n", curname, *curintr);
		curname += strlen(curname) + 1;
		inttotal += *curintr++;
	}
	printf("Total        %20ju\n", (uintmax_t)inttotal);

#if defined(KDB) && !defined(KDB_UNATTENDED)
	kdb_backtrace();
	kdb_enter(KDB_WHY_WATCHDOG, "watchdog timeout");
#else
	panic("watchdog timeout");
#endif
}