/* kern_clock.c revision 170174 */
1/*- 2 * Copyright (c) 1982, 1986, 1991, 1993 3 * The Regents of the University of California. All rights reserved. 4 * (c) UNIX System Laboratories, Inc. 5 * All or some portions of this file are derived from material licensed 6 * to the University of California by American Telephone and Telegraph 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 8 * the permission of UNIX System Laboratories, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 4. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 
33 * 34 * @(#)kern_clock.c 8.5 (Berkeley) 1/21/94 35 */ 36 37#include <sys/cdefs.h> 38__FBSDID("$FreeBSD: head/sys/kern/kern_clock.c 170174 2007-06-01 01:12:45Z jeff $"); 39 40#include "opt_kdb.h" 41#include "opt_device_polling.h" 42#include "opt_hwpmc_hooks.h" 43#include "opt_ntp.h" 44#include "opt_watchdog.h" 45 46#include <sys/param.h> 47#include <sys/systm.h> 48#include <sys/callout.h> 49#include <sys/kdb.h> 50#include <sys/kernel.h> 51#include <sys/lock.h> 52#include <sys/ktr.h> 53#include <sys/mutex.h> 54#include <sys/proc.h> 55#include <sys/resource.h> 56#include <sys/resourcevar.h> 57#include <sys/sched.h> 58#include <sys/signalvar.h> 59#include <sys/smp.h> 60#include <vm/vm.h> 61#include <vm/pmap.h> 62#include <vm/vm_map.h> 63#include <sys/sysctl.h> 64#include <sys/bus.h> 65#include <sys/interrupt.h> 66#include <sys/limits.h> 67#include <sys/timetc.h> 68 69#ifdef GPROF 70#include <sys/gmon.h> 71#endif 72 73#ifdef HWPMC_HOOKS 74#include <sys/pmckern.h> 75#endif 76 77#ifdef DEVICE_POLLING 78extern void hardclock_device_poll(void); 79#endif /* DEVICE_POLLING */ 80 81static void initclocks(void *dummy); 82SYSINIT(clocks, SI_SUB_CLOCKS, SI_ORDER_FIRST, initclocks, NULL) 83 84/* Some of these don't belong here, but it's easiest to concentrate them. */ 85long cp_time[CPUSTATES]; 86 87/* Spin-lock protecting profiling statistics. 
*/ 88struct mtx time_lock; 89 90static int 91sysctl_kern_cp_time(SYSCTL_HANDLER_ARGS) 92{ 93 int error; 94#ifdef SCTL_MASK32 95 int i; 96 unsigned int cp_time32[CPUSTATES]; 97 98 if (req->flags & SCTL_MASK32) { 99 if (!req->oldptr) 100 return SYSCTL_OUT(req, 0, sizeof(cp_time32)); 101 for (i = 0; i < CPUSTATES; i++) 102 cp_time32[i] = (unsigned int)cp_time[i]; 103 error = SYSCTL_OUT(req, cp_time32, sizeof(cp_time32)); 104 } else 105#endif 106 { 107 if (!req->oldptr) 108 return SYSCTL_OUT(req, 0, sizeof(cp_time)); 109 error = SYSCTL_OUT(req, cp_time, sizeof(cp_time)); 110 } 111 return error; 112} 113 114SYSCTL_PROC(_kern, OID_AUTO, cp_time, CTLTYPE_LONG|CTLFLAG_RD, 115 0,0, sysctl_kern_cp_time, "LU", "CPU time statistics"); 116 117#ifdef SW_WATCHDOG 118#include <sys/watchdog.h> 119 120static int watchdog_ticks; 121static int watchdog_enabled; 122static void watchdog_fire(void); 123static void watchdog_config(void *, u_int, int *); 124#endif /* SW_WATCHDOG */ 125 126/* 127 * Clock handling routines. 128 * 129 * This code is written to operate with two timers that run independently of 130 * each other. 131 * 132 * The main timer, running hz times per second, is used to trigger interval 133 * timers, timeouts and rescheduling as needed. 134 * 135 * The second timer handles kernel and user profiling, 136 * and does resource use estimation. If the second timer is programmable, 137 * it is randomized to avoid aliasing between the two clocks. For example, 138 * the randomization prevents an adversary from always giving up the cpu 139 * just before its quantum expires. Otherwise, it would never accumulate 140 * cpu ticks. The mean frequency of the second timer is stathz. 141 * 142 * If no second timer exists, stathz will be zero; in this case we drive 143 * profiling and statistics off the main clock. This WILL NOT be accurate; 144 * do not do it unless absolutely necessary. 145 * 146 * The statistics clock may (or may not) be run at a higher rate while 147 * profiling. 
This profile clock runs at profhz. We require that profhz 148 * be an integral multiple of stathz. 149 * 150 * If the statistics clock is running fast, it must be divided by the ratio 151 * profhz/stathz for statistics. (For profiling, every tick counts.) 152 * 153 * Time-of-day is maintained using a "timecounter", which may or may 154 * not be related to the hardware generating the above mentioned 155 * interrupts. 156 */ 157 158int stathz; 159int profhz; 160int profprocs; 161int ticks; 162int psratio; 163 164/* 165 * Initialize clock frequencies and start both clocks running. 166 */ 167/* ARGSUSED*/ 168static void 169initclocks(dummy) 170 void *dummy; 171{ 172 register int i; 173 174 /* 175 * Set divisors to 1 (normal case) and let the machine-specific 176 * code do its bit. 177 */ 178 mtx_init(&time_lock, "time lock", NULL, MTX_SPIN); 179 cpu_initclocks(); 180 181 /* 182 * Compute profhz/stathz, and fix profhz if needed. 183 */ 184 i = stathz ? stathz : hz; 185 if (profhz == 0) 186 profhz = i; 187 psratio = profhz / i; 188#ifdef SW_WATCHDOG 189 EVENTHANDLER_REGISTER(watchdog_list, watchdog_config, NULL, 0); 190#endif 191} 192 193/* 194 * Each time the real-time timer fires, this function is called on all CPUs. 195 * Note that hardclock() calls hardclock_cpu() for the boot CPU, so only 196 * the other CPUs in the system need to call this function. 197 */ 198void 199hardclock_cpu(int usermode) 200{ 201 struct pstats *pstats; 202 struct thread *td = curthread; 203 struct proc *p = td->td_proc; 204 205 /* 206 * Run current process's virtual and profile time, as needed. 207 */ 208 mtx_lock_spin_flags(&sched_lock, MTX_QUIET); 209 sched_tick(); 210#ifdef KSE 211#if 0 /* for now do nothing */ 212 if (p->p_flag & P_SA) { 213 /* XXXKSE What to do? Should do more. 
*/ 214 } 215#endif 216#endif 217 pstats = p->p_stats; 218 if (usermode && 219 timevalisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value) && 220 itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0) { 221 p->p_sflag |= PS_ALRMPEND; 222 td->td_flags |= TDF_ASTPENDING; 223 } 224 if (timevalisset(&pstats->p_timer[ITIMER_PROF].it_value) && 225 itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0) { 226 p->p_sflag |= PS_PROFPEND; 227 td->td_flags |= TDF_ASTPENDING; 228 } 229 mtx_unlock_spin_flags(&sched_lock, MTX_QUIET); 230 231#ifdef HWPMC_HOOKS 232 if (PMC_CPU_HAS_SAMPLES(PCPU_GET(cpuid))) 233 PMC_CALL_HOOK_UNLOCKED(curthread, PMC_FN_DO_SAMPLES, NULL); 234#endif 235} 236 237/* 238 * The real-time timer, interrupting hz times per second. 239 */ 240void 241hardclock(int usermode, uintfptr_t pc) 242{ 243 int need_softclock = 0; 244 245 hardclock_cpu(usermode); 246 247 tc_ticktock(); 248 /* 249 * If no separate statistics clock is available, run it from here. 250 * 251 * XXX: this only works for UP 252 */ 253 if (stathz == 0) { 254 profclock(usermode, pc); 255 statclock(usermode); 256 } 257 258#ifdef DEVICE_POLLING 259 hardclock_device_poll(); /* this is very short and quick */ 260#endif /* DEVICE_POLLING */ 261 262 /* 263 * Process callouts at a very low cpu priority, so we don't keep the 264 * relatively high clock interrupt priority any longer than necessary. 265 */ 266 mtx_lock_spin_flags(&callout_lock, MTX_QUIET); 267 ticks++; 268 if (!TAILQ_EMPTY(&callwheel[ticks & callwheelmask])) { 269 need_softclock = 1; 270 } else if (softticks + 1 == ticks) 271 ++softticks; 272 mtx_unlock_spin_flags(&callout_lock, MTX_QUIET); 273 274 /* 275 * swi_sched acquires sched_lock, so we don't want to call it with 276 * callout_lock held; incorrect locking order. 
277 */ 278 if (need_softclock) 279 swi_sched(softclock_ih, 0); 280 281#ifdef SW_WATCHDOG 282 if (watchdog_enabled > 0 && --watchdog_ticks <= 0) 283 watchdog_fire(); 284#endif /* SW_WATCHDOG */ 285} 286 287/* 288 * Compute number of ticks in the specified amount of time. 289 */ 290int 291tvtohz(tv) 292 struct timeval *tv; 293{ 294 register unsigned long ticks; 295 register long sec, usec; 296 297 /* 298 * If the number of usecs in the whole seconds part of the time 299 * difference fits in a long, then the total number of usecs will 300 * fit in an unsigned long. Compute the total and convert it to 301 * ticks, rounding up and adding 1 to allow for the current tick 302 * to expire. Rounding also depends on unsigned long arithmetic 303 * to avoid overflow. 304 * 305 * Otherwise, if the number of ticks in the whole seconds part of 306 * the time difference fits in a long, then convert the parts to 307 * ticks separately and add, using similar rounding methods and 308 * overflow avoidance. This method would work in the previous 309 * case but it is slightly slower and assumes that hz is integral. 310 * 311 * Otherwise, round the time difference down to the maximum 312 * representable value. 313 * 314 * If ints have 32 bits, then the maximum value for any timeout in 315 * 10ms ticks is 248 days. 
316 */ 317 sec = tv->tv_sec; 318 usec = tv->tv_usec; 319 if (usec < 0) { 320 sec--; 321 usec += 1000000; 322 } 323 if (sec < 0) { 324#ifdef DIAGNOSTIC 325 if (usec > 0) { 326 sec++; 327 usec -= 1000000; 328 } 329 printf("tvotohz: negative time difference %ld sec %ld usec\n", 330 sec, usec); 331#endif 332 ticks = 1; 333 } else if (sec <= LONG_MAX / 1000000) 334 ticks = (sec * 1000000 + (unsigned long)usec + (tick - 1)) 335 / tick + 1; 336 else if (sec <= LONG_MAX / hz) 337 ticks = sec * hz 338 + ((unsigned long)usec + (tick - 1)) / tick + 1; 339 else 340 ticks = LONG_MAX; 341 if (ticks > INT_MAX) 342 ticks = INT_MAX; 343 return ((int)ticks); 344} 345 346/* 347 * Start profiling on a process. 348 * 349 * Kernel profiling passes proc0 which never exits and hence 350 * keeps the profile clock running constantly. 351 */ 352void 353startprofclock(p) 354 register struct proc *p; 355{ 356 357 PROC_LOCK_ASSERT(p, MA_OWNED); 358 if (p->p_flag & P_STOPPROF) 359 return; 360 if ((p->p_flag & P_PROFIL) == 0) { 361 p->p_flag |= P_PROFIL; 362 mtx_lock_spin(&time_lock); 363 if (++profprocs == 1) 364 cpu_startprofclock(); 365 mtx_unlock_spin(&time_lock); 366 } 367} 368 369/* 370 * Stop profiling on a process. 371 */ 372void 373stopprofclock(p) 374 register struct proc *p; 375{ 376 377 PROC_LOCK_ASSERT(p, MA_OWNED); 378 if (p->p_flag & P_PROFIL) { 379 if (p->p_profthreads != 0) { 380 p->p_flag |= P_STOPPROF; 381 while (p->p_profthreads != 0) 382 msleep(&p->p_profthreads, &p->p_mtx, PPAUSE, 383 "stopprof", 0); 384 p->p_flag &= ~P_STOPPROF; 385 } 386 if ((p->p_flag & P_PROFIL) == 0) 387 return; 388 p->p_flag &= ~P_PROFIL; 389 mtx_lock_spin(&time_lock); 390 if (--profprocs == 0) 391 cpu_stopprofclock(); 392 mtx_unlock_spin(&time_lock); 393 } 394} 395 396/* 397 * Statistics clock. Updates rusage information and calls the scheduler 398 * to adjust priorities of the active thread. 399 * 400 * This should be called by all active processors. 
401 */ 402void 403statclock(int usermode) 404{ 405 struct rusage *ru; 406 struct vmspace *vm; 407 struct thread *td; 408 struct proc *p; 409 long rss; 410 411 td = curthread; 412 p = td->td_proc; 413 414 if (usermode) { 415 /* 416 * Charge the time as appropriate. 417 */ 418#ifdef KSE 419 if (p->p_flag & P_SA) 420 thread_statclock(1); 421#endif 422 td->td_uticks++; 423 mtx_lock_spin_flags(&time_lock, MTX_QUIET); 424 if (p->p_nice > NZERO) 425 cp_time[CP_NICE]++; 426 else 427 cp_time[CP_USER]++; 428 } else { 429 /* 430 * Came from kernel mode, so we were: 431 * - handling an interrupt, 432 * - doing syscall or trap work on behalf of the current 433 * user process, or 434 * - spinning in the idle loop. 435 * Whichever it is, charge the time as appropriate. 436 * Note that we charge interrupts to the current process, 437 * regardless of whether they are ``for'' that process, 438 * so that we know how much of its real time was spent 439 * in ``non-process'' (i.e., interrupt) work. 440 */ 441 if ((td->td_pflags & TDP_ITHREAD) || 442 td->td_intr_nesting_level >= 2) { 443 td->td_iticks++; 444 mtx_lock_spin_flags(&time_lock, MTX_QUIET); 445 cp_time[CP_INTR]++; 446 } else { 447#ifdef KSE 448 if (p->p_flag & P_SA) 449 thread_statclock(0); 450#endif 451 td->td_pticks++; 452 td->td_sticks++; 453 mtx_lock_spin_flags(&time_lock, MTX_QUIET); 454 if (!TD_IS_IDLETHREAD(td)) 455 cp_time[CP_SYS]++; 456 else 457 cp_time[CP_IDLE]++; 458 } 459 } 460 mtx_unlock_spin_flags(&time_lock, MTX_QUIET); 461 CTR4(KTR_SCHED, "statclock: %p(%s) prio %d stathz %d", 462 td, td->td_proc->p_comm, td->td_priority, (stathz)?stathz:hz); 463 464 mtx_lock_spin_flags(&sched_lock, MTX_QUIET); 465 sched_clock(td); 466 467 /* Update resource usage integrals and maximums. 
*/ 468 MPASS(p->p_vmspace != NULL); 469 vm = p->p_vmspace; 470 ru = &td->td_ru; 471 ru->ru_ixrss += pgtok(vm->vm_tsize); 472 ru->ru_idrss += pgtok(vm->vm_dsize); 473 ru->ru_isrss += pgtok(vm->vm_ssize); 474 rss = pgtok(vmspace_resident_count(vm)); 475 if (ru->ru_maxrss < rss) 476 ru->ru_maxrss = rss; 477 mtx_unlock_spin_flags(&sched_lock, MTX_QUIET); 478} 479 480void 481profclock(int usermode, uintfptr_t pc) 482{ 483 struct thread *td; 484#ifdef GPROF 485 struct gmonparam *g; 486 uintfptr_t i; 487#endif 488 489 td = curthread; 490 if (usermode) { 491 /* 492 * Came from user mode; CPU was in user state. 493 * If this process is being profiled, record the tick. 494 * if there is no related user location yet, don't 495 * bother trying to count it. 496 */ 497 if (td->td_proc->p_flag & P_PROFIL) 498 addupc_intr(td, pc, 1); 499 } 500#ifdef GPROF 501 else { 502 /* 503 * Kernel statistics are just like addupc_intr, only easier. 504 */ 505 g = &_gmonparam; 506 if (g->state == GMON_PROF_ON && pc >= g->lowpc) { 507 i = PC_TO_I(g, pc); 508 if (i < g->textsize) { 509 KCOUNT(g, i)++; 510 } 511 } 512 } 513#endif 514} 515 516/* 517 * Return information about system clocks. 518 */ 519static int 520sysctl_kern_clockrate(SYSCTL_HANDLER_ARGS) 521{ 522 struct clockinfo clkinfo; 523 /* 524 * Construct clockinfo structure. 525 */ 526 bzero(&clkinfo, sizeof(clkinfo)); 527 clkinfo.hz = hz; 528 clkinfo.tick = tick; 529 clkinfo.profhz = profhz; 530 clkinfo.stathz = stathz ? 
stathz : hz; 531 return (sysctl_handle_opaque(oidp, &clkinfo, sizeof clkinfo, req)); 532} 533 534SYSCTL_PROC(_kern, KERN_CLOCKRATE, clockrate, CTLTYPE_STRUCT|CTLFLAG_RD, 535 0, 0, sysctl_kern_clockrate, "S,clockinfo", 536 "Rate and period of various kernel clocks"); 537 538#ifdef SW_WATCHDOG 539 540static void 541watchdog_config(void *unused __unused, u_int cmd, int *error) 542{ 543 u_int u; 544 545 u = cmd & WD_INTERVAL; 546 if (u >= WD_TO_1SEC) { 547 watchdog_ticks = (1 << (u - WD_TO_1SEC)) * hz; 548 watchdog_enabled = 1; 549 *error = 0; 550 } else { 551 watchdog_enabled = 0; 552 } 553} 554 555/* 556 * Handle a watchdog timeout by dumping interrupt information and 557 * then either dropping to DDB or panicking. 558 */ 559static void 560watchdog_fire(void) 561{ 562 int nintr; 563 u_int64_t inttotal; 564 u_long *curintr; 565 char *curname; 566 567 curintr = intrcnt; 568 curname = intrnames; 569 inttotal = 0; 570 nintr = eintrcnt - intrcnt; 571 572 printf("interrupt total\n"); 573 while (--nintr >= 0) { 574 if (*curintr) 575 printf("%-12s %20lu\n", curname, *curintr); 576 curname += strlen(curname) + 1; 577 inttotal += *curintr++; 578 } 579 printf("Total %20ju\n", (uintmax_t)inttotal); 580 581#if defined(KDB) && !defined(KDB_UNATTENDED) 582 kdb_backtrace(); 583 kdb_enter("watchdog timeout"); 584#else 585 panic("watchdog timeout"); 586#endif 587} 588 589#endif /* SW_WATCHDOG */ 590