1/* $NetBSD: kern_heartbeat.c,v 1.13 2024/03/08 23:34:03 riastradh Exp $ */ 2 3/*- 4 * Copyright (c) 2023 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 26 * POSSIBILITY OF SUCH DAMAGE. 27 */ 28 29/* 30 * heartbeat(9) -- periodic checks to ensure CPUs are making progress 31 * 32 * Manual tests to run when changing this file. Magic numbers are for 33 * evbarm; adjust for other platforms. Tests involving cpuctl 34 * online/offline assume a 2-CPU system -- for full testing on a >2-CPU 35 * system, offline all but one CPU. 36 * 37 * 1. cpuctl offline 0 38 * sleep 20 39 * cpuctl online 0 40 * 41 * 2. cpuctl offline 1 42 * sleep 20 43 * cpuctl online 1 44 * 45 * 3. cpuctl offline 0 46 * sysctl -w kern.heartbeat.max_period=5 47 * sleep 10 48 * sysctl -w kern.heartbeat.max_period=0 49 * sleep 10 50 * sysctl -w kern.heartbeat.max_period=5 51 * sleep 10 52 * cpuctl online 0 53 * 54 * 4. sysctl -w debug.crashme_enable=1 55 * sysctl -w debug.crashme.spl_spinout=1 # IPL_SOFTCLOCK 56 * # verify system panics after 15sec, with a stack trace through 57 * # crashme_spl_spinout 58 * 59 * 5. sysctl -w debug.crashme_enable=1 60 * sysctl -w debug.crashme.spl_spinout=6 # IPL_SCHED 61 * # verify system panics after 15sec, with a stack trace through 62 * # crashme_spl_spinout 63 * 64 * 6. cpuctl offline 0 65 * sysctl -w debug.crashme_enable=1 66 * sysctl -w debug.crashme.spl_spinout=1 # IPL_SOFTCLOCK 67 * # verify system panics after 15sec, with a stack trace through 68 * # crashme_spl_spinout 69 * 70 * 7. cpuctl offline 0 71 * sysctl -w debug.crashme_enable=1 72 * sysctl -w debug.crashme.spl_spinout=5 # IPL_VM 73 * # verify system panics after 15sec, with a stack trace through 74 * # crashme_spl_spinout 75 * 76 * # Not this -- IPL_SCHED and IPL_HIGH spinout on a single CPU 77 * # require a hardware watchdog timer. 78 * #cpuctl offline 0 79 * #sysctl -w debug.crashme_enable 80 * #sysctl -w debug.crashme.spl_spinout=6 # IPL_SCHED 81 * # hope watchdog timer kicks in 82 */ 83 84#include <sys/cdefs.h> 85__KERNEL_RCSID(0, "$NetBSD: kern_heartbeat.c,v 1.13 2024/03/08 23:34:03 riastradh Exp $"); 86 87#ifdef _KERNEL_OPT 88#include "opt_ddb.h" 89#include "opt_heartbeat.h" 90#endif 91 92#include "heartbeat.h" 93 94#include <sys/param.h> 95#include <sys/types.h> 96 97#include <sys/atomic.h> 98#include <sys/cpu.h> 99#include <sys/errno.h> 100#include <sys/heartbeat.h> 101#include <sys/ipi.h> 102#include <sys/kernel.h> 103#include <sys/mutex.h> 104#include <sys/sysctl.h> 105#include <sys/systm.h> 106#include <sys/xcall.h> 107 108#ifdef DDB 109#include <ddb/ddb.h> 110#endif 111 112/* 113 * Global state. 114 * 115 * heartbeat_lock serializes access to heartbeat_max_period_secs 116 * and heartbeat_max_period_ticks. Two separate variables so we 117 * can avoid multiplication or division in the heartbeat routine. 118 * 119 * heartbeat_sih is stable after initialization in 120 * heartbeat_start. 121 */ 122kmutex_t heartbeat_lock __cacheline_aligned; 123unsigned heartbeat_max_period_secs __read_mostly; 124unsigned heartbeat_max_period_ticks __read_mostly; 125 126void *heartbeat_sih __read_mostly; 127 128/* 129 * heartbeat_suspend() 130 * 131 * Suspend heartbeat monitoring of the current CPU. 132 * 133 * Called after the current CPU has been marked offline but before 134 * it has stopped running, or after IPL has been raised for 135 * polling-mode console input. Nestable (but only 2^32 times, so 136 * don't do this in a loop). Reversed by heartbeat_resume. 137 * 138 * Caller must be bound to the CPU, i.e., curcpu_stable() must be 139 * true. This function does not assert curcpu_stable() since it 140 * is used in the ddb entry path, where any assertions risk 141 * infinite regress into undebuggable chaos, so callers must be 142 * careful. 143 */ 144void 145heartbeat_suspend(void) 146{ 147 unsigned *p; 148 149 p = &curcpu()->ci_heartbeat_suspend; 150 atomic_store_relaxed(p, *p + 1); 151} 152 153/* 154 * heartbeat_resume_cpu(ci) 155 * 156 * Resume heartbeat monitoring of ci. 157 * 158 * Called at startup while cold, and whenever heartbeat monitoring 159 * is re-enabled after being disabled or the period is changed. 160 * When not cold, ci must be the current CPU. 161 * 162 * Must be run at splsched. 163 */ 164static void 165heartbeat_resume_cpu(struct cpu_info *ci) 166{ 167 168 KASSERT(__predict_false(cold) || curcpu_stable()); 169 KASSERT(__predict_false(cold) || ci == curcpu()); 170 /* XXX KASSERT IPL_SCHED */ 171 172 ci->ci_heartbeat_count = 0; 173 ci->ci_heartbeat_uptime_cache = time_uptime; 174 ci->ci_heartbeat_uptime_stamp = 0; 175} 176 177/* 178 * heartbeat_resume() 179 * 180 * Resume heartbeat monitoring of the current CPU. 181 * 182 * Called after the current CPU has started running but before it 183 * has been marked online, or when ending polling-mode input 184 * before IPL is restored. Reverses heartbeat_suspend. 185 * 186 * Caller must be bound to the CPU, i.e., curcpu_stable() must be 187 * true. 188 */ 189void 190heartbeat_resume(void) 191{ 192 struct cpu_info *ci = curcpu(); 193 unsigned *p; 194 int s; 195 196 KASSERT(curcpu_stable()); 197 198 /* 199 * Reset the state so nobody spuriously thinks we had a heart 200 * attack as soon as the heartbeat checks resume. 201 */ 202 s = splsched(); 203 heartbeat_resume_cpu(ci); 204 splx(s); 205 206 p = &ci->ci_heartbeat_suspend; 207 atomic_store_relaxed(p, *p - 1); 208} 209 210/* 211 * heartbeat_timecounter_suspended() 212 * 213 * True if timecounter heartbeat checks are suspended because the 214 * timecounter may not be advancing, false if heartbeat checks 215 * should check for timecounter progress. 216 */ 217static bool 218heartbeat_timecounter_suspended(void) 219{ 220 CPU_INFO_ITERATOR cii; 221 struct cpu_info *ci; 222 223 /* 224 * The timecounter ticks only on the primary CPU. Check 225 * whether it's suspended. 226 * 227 * XXX Would be nice if we could find the primary CPU without 228 * iterating over all CPUs. 229 */ 230 for (CPU_INFO_FOREACH(cii, ci)) { 231 if (CPU_IS_PRIMARY(ci)) 232 return atomic_load_relaxed(&ci->ci_heartbeat_suspend); 233 } 234 235 /* 236 * This should be unreachable -- there had better be a primary 237 * CPU in the system! If not, the timecounter will be busted 238 * anyway. 239 */ 240 panic("no primary CPU"); 241} 242 243/* 244 * heartbeat_reset_xc(a, b) 245 * 246 * Cross-call handler to reset heartbeat state just prior to 247 * enabling heartbeat checks. 248 */ 249static void 250heartbeat_reset_xc(void *a, void *b) 251{ 252 int s; 253 254 s = splsched(); 255 heartbeat_resume_cpu(curcpu()); 256 splx(s); 257} 258 259/* 260 * set_max_period(max_period) 261 * 262 * Set the maximum period, in seconds, for heartbeat checks. 263 * 264 * - If max_period is zero, disable them. 265 * 266 * - If the max period was zero and max_period is nonzero, ensure 267 * all CPUs' heartbeat uptime caches are up-to-date before 268 * re-enabling them. 269 * 270 * max_period must be below UINT_MAX/4/hz to avoid arithmetic 271 * overflow and give room for slop. 272 * 273 * Caller must hold heartbeat_lock. 274 */ 275static void 276set_max_period(unsigned max_period) 277{ 278 279 KASSERTMSG(max_period <= UINT_MAX/4/hz, 280 "max_period=%u must not exceed UINT_MAX/4/hz=%u (hz=%u)", 281 max_period, UINT_MAX/4/hz, hz); 282 KASSERT(mutex_owned(&heartbeat_lock)); 283 284 /* 285 * If we're enabling heartbeat checks, make sure we have a 286 * reasonably up-to-date time_uptime cache on all CPUs so we 287 * don't think we had an instant heart attack. 288 */ 289 if (heartbeat_max_period_secs == 0 && max_period != 0) { 290 if (cold) { 291 CPU_INFO_ITERATOR cii; 292 struct cpu_info *ci; 293 294 for (CPU_INFO_FOREACH(cii, ci)) 295 heartbeat_resume_cpu(ci); 296 } else { 297 const uint64_t ticket = 298 xc_broadcast(0, &heartbeat_reset_xc, NULL, NULL); 299 xc_wait(ticket); 300 } 301 } 302 303 /* 304 * Once the heartbeat state has been updated on all (online) 305 * CPUs, set the period. At this point, heartbeat checks can 306 * begin. 307 */ 308 atomic_store_relaxed(&heartbeat_max_period_secs, max_period); 309 atomic_store_relaxed(&heartbeat_max_period_ticks, max_period*hz); 310} 311 312/* 313 * heartbeat_max_period_ticks(SYSCTLFN_ARGS) 314 * 315 * Sysctl handler for sysctl kern.heartbeat.max_period. Verifies 316 * it lies within a reasonable interval and sets it. 317 */ 318static int 319heartbeat_max_period_sysctl(SYSCTLFN_ARGS) 320{ 321 struct sysctlnode node; 322 unsigned max_period; 323 int error; 324 325 mutex_enter(&heartbeat_lock); 326 327 max_period = heartbeat_max_period_secs; 328 node = *rnode; 329 node.sysctl_data = &max_period; 330 error = sysctl_lookup(SYSCTLFN_CALL(&node)); 331 if (error || newp == NULL) 332 goto out; 333 334 /* 335 * Ensure there's plenty of slop between heartbeats. 336 */ 337 if (max_period > UINT_MAX/4/hz) { 338 error = EOVERFLOW; 339 goto out; 340 } 341 342 /* 343 * Success! Set the period. This enables heartbeat checks if 344 * we went from zero period to nonzero period, or disables them 345 * if the other way around. 346 */ 347 set_max_period(max_period); 348 error = 0; 349 350out: mutex_exit(&heartbeat_lock); 351 return error; 352} 353 354/* 355 * sysctl_heartbeat_setup() 356 * 357 * Set up the kern.heartbeat.* sysctl subtree. 358 */ 359SYSCTL_SETUP(sysctl_heartbeat_setup, "sysctl kern.heartbeat setup") 360{ 361 const struct sysctlnode *rnode; 362 int error; 363 364 mutex_init(&heartbeat_lock, MUTEX_DEFAULT, IPL_NONE); 365 366 /* kern.heartbeat */ 367 error = sysctl_createv(NULL, 0, NULL, &rnode, 368 CTLFLAG_PERMANENT, 369 CTLTYPE_NODE, "heartbeat", 370 SYSCTL_DESCR("Kernel heartbeat parameters"), 371 NULL, 0, NULL, 0, 372 CTL_KERN, CTL_CREATE, CTL_EOL); 373 if (error) { 374 printf("%s: failed to create kern.heartbeat: %d\n", 375 __func__, error); 376 return; 377 } 378 379 /* kern.heartbeat.max_period */ 380 error = sysctl_createv(NULL, 0, &rnode, NULL, 381 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 382 CTLTYPE_INT, "max_period", 383 SYSCTL_DESCR("Max seconds between heartbeats before panic"), 384 &heartbeat_max_period_sysctl, 0, NULL, 0, 385 CTL_CREATE, CTL_EOL); 386 if (error) { 387 printf("%s: failed to create kern.heartbeat.max_period: %d\n", 388 __func__, error); 389 return; 390 } 391} 392 393/* 394 * heartbeat_intr(cookie) 395 * 396 * Soft interrupt handler to update the local CPU's view of the 397 * system uptime. This runs at the same priority level as 398 * callouts, so if callouts are stuck on this CPU, it won't run, 399 * and eventually another CPU will notice that this one is stuck. 400 * 401 * Don't do spl* here -- keep it to a minimum so if anything goes 402 * wrong we don't end up with hard interrupts blocked and unable 403 * to detect a missed heartbeat. 404 */ 405static void 406heartbeat_intr(void *cookie) 407{ 408 unsigned count = atomic_load_relaxed(&curcpu()->ci_heartbeat_count); 409 unsigned uptime = time_uptime; 410 411 atomic_store_relaxed(&curcpu()->ci_heartbeat_uptime_stamp, count); 412 atomic_store_relaxed(&curcpu()->ci_heartbeat_uptime_cache, uptime); 413} 414 415/* 416 * heartbeat_start() 417 * 418 * Start system heartbeat monitoring. 419 */ 420void 421heartbeat_start(void) 422{ 423 const unsigned max_period = HEARTBEAT_MAX_PERIOD_DEFAULT; 424 425 /* 426 * Establish a softint so we can schedule it once ready. This 427 * should be at the lowest softint priority level so that we 428 * ensure all softint priorities are making progress. 429 */ 430 heartbeat_sih = softint_establish(SOFTINT_CLOCK|SOFTINT_MPSAFE, 431 &heartbeat_intr, NULL); 432 433 /* 434 * Now that the softint is established, kick off heartbeat 435 * monitoring with the default period. This will initialize 436 * the per-CPU state to an up-to-date cache of time_uptime. 437 */ 438 mutex_enter(&heartbeat_lock); 439 set_max_period(max_period); 440 mutex_exit(&heartbeat_lock); 441} 442 443/* 444 * defibrillator(cookie) 445 * 446 * IPI handler for defibrillation. If the CPU's heart has stopped 447 * beating normally, but the CPU can still execute things, 448 * acknowledge the IPI to the doctor and then panic so we at least 449 * get a stack trace from whatever the current CPU is stuck doing, 450 * if not a core dump. 451 * 452 * (This metaphor is a little stretched, since defibrillation is 453 * usually administered when the heart is beating errattically but 454 * hasn't stopped, and causes the heart to stop temporarily, and 455 * one hopes it is not fatal. But we're (software) engineers, so 456 * we can stretch metaphors like silly putty in a blender.) 457 */ 458static void 459defibrillator(void *cookie) 460{ 461 bool *ack = cookie; 462 463 /* 464 * Acknowledge the interrupt so the doctor CPU won't trigger a 465 * new panic for defibrillation timeout. 466 */ 467 atomic_store_relaxed(ack, true); 468 469 /* 470 * If a panic is already in progress, we may have interrupted 471 * the logic that prints a stack trace on this CPU -- so let's 472 * not make it worse by giving the misapprehension of a 473 * recursive panic. 474 */ 475 if (atomic_load_relaxed(&panicstr) != NULL) 476 return; 477 478 panic("%s[%d %s]: heart stopped beating", cpu_name(curcpu()), 479 curlwp->l_lid, 480 curlwp->l_name ? curlwp->l_name : curproc->p_comm); 481} 482 483/* 484 * defibrillate(ci, unsigned d) 485 * 486 * The patient CPU ci's heart has stopped beating after d seconds. 487 * Force the patient CPU ci to panic, or panic on this CPU if the 488 * patient CPU doesn't respond within 1sec. 489 */ 490static void __noinline 491defibrillate(struct cpu_info *ci, unsigned d) 492{ 493 bool ack = false; 494 ipi_msg_t msg = { 495 .func = &defibrillator, 496 .arg = &ack, 497 }; 498 unsigned countdown = 1000; /* 1sec */ 499 500 KASSERT(curcpu_stable()); 501 502 /* 503 * First notify the console that the patient CPU's heart seems 504 * to have stopped beating. 505 */ 506 printf("%s: found %s heart stopped beating after %u seconds\n", 507 cpu_name(curcpu()), cpu_name(ci), d); 508 509 /* 510 * Next, give the patient CPU a chance to panic, so we get a 511 * stack trace on that CPU even if we don't get a crash dump. 512 */ 513 ipi_unicast(&msg, ci); 514 515 /* 516 * Busy-wait up to 1sec for the patient CPU to print a stack 517 * trace and panic. If the patient CPU acknowledges the IPI, 518 * just give up and stop here -- the system is coming down soon 519 * and we should avoid getting in the way. 520 */ 521 while (countdown --> 0) { 522 if (atomic_load_relaxed(&ack)) 523 return; 524 DELAY(1000); /* 1ms */ 525 } 526 527 /* 528 * The patient CPU failed to acknowledge the panic request. 529 * Panic now; with any luck, we'll get a crash dump. 530 */ 531 panic("%s: found %s heart stopped beating and unresponsive", 532 cpu_name(curcpu()), cpu_name(ci)); 533} 534 535/* 536 * select_patient() 537 * 538 * Select another CPU to check the heartbeat of. Returns NULL if 539 * there are no other online CPUs. Never returns curcpu(). 540 * Caller must have kpreemption disabled. 541 */ 542static struct cpu_info * 543select_patient(void) 544{ 545 CPU_INFO_ITERATOR cii; 546 struct cpu_info *first = NULL, *patient = NULL, *ci; 547 bool passedcur = false; 548 549 KASSERT(curcpu_stable()); 550 551 /* 552 * In the iteration order of all CPUs, find the next online CPU 553 * after curcpu(), or the first online one if curcpu() is last 554 * in the iteration order. 555 */ 556 for (CPU_INFO_FOREACH(cii, ci)) { 557 if (atomic_load_relaxed(&ci->ci_heartbeat_suspend)) 558 continue; 559 if (passedcur) { 560 /* 561 * (...|curcpu()|ci|...) 562 * 563 * Found the patient right after curcpu(). 564 */ 565 KASSERT(patient != ci); 566 patient = ci; 567 break; 568 } 569 if (ci == curcpu()) { 570 /* 571 * (...|prev|ci=curcpu()|next|...) 572 * 573 * Note that we want next (or first, if there's 574 * nothing after curcpu()). 575 */ 576 passedcur = true; 577 continue; 578 } 579 if (first == NULL) { 580 /* 581 * (ci|...|curcpu()|...) 582 * 583 * Record ci as first in case there's nothing 584 * after curcpu(). 585 */ 586 first = ci; 587 continue; 588 } 589 } 590 591 /* 592 * If we hit the end, wrap around to the beginning. 593 */ 594 if (patient == NULL) { 595 KASSERT(passedcur); 596 patient = first; 597 } 598 599 return patient; 600} 601 602/* 603 * heartbeat() 604 * 605 * 1. Count a heartbeat on the local CPU. 606 * 607 * 2. Panic if the system uptime doesn't seem to have advanced in 608 * a while. 609 * 610 * 3. Panic if the soft interrupt on this CPU hasn't advanced the 611 * local view of the system uptime. 612 * 613 * 4. Schedule the soft interrupt to advance the local view of the 614 * system uptime. 615 * 616 * 5. Select another CPU to check the heartbeat of. 617 * 618 * 6. Panic if the other CPU hasn't advanced its view of the 619 * system uptime in a while. 620 */ 621void 622heartbeat(void) 623{ 624 unsigned period_ticks, period_secs; 625 unsigned count, uptime, cache, stamp, d; 626 struct cpu_info *patient; 627 628 KASSERT(curcpu_stable()); 629 630 /* 631 * If heartbeat checks are disabled globally, or if they are 632 * suspended locally, or if we're already panicking so it's not 633 * helpful to trigger more panics for more reasons, do nothing. 634 */ 635 period_ticks = atomic_load_relaxed(&heartbeat_max_period_ticks); 636 period_secs = atomic_load_relaxed(&heartbeat_max_period_secs); 637 if (__predict_false(period_ticks == 0) || 638 __predict_false(period_secs == 0) || 639 __predict_false(curcpu()->ci_heartbeat_suspend) || 640 __predict_false(panicstr != NULL)) 641 return; 642 643 /* 644 * Count a heartbeat on this CPU. 645 */ 646 count = curcpu()->ci_heartbeat_count++; 647 648 /* 649 * If the uptime hasn't changed, make sure that we haven't 650 * counted too many of our own heartbeats since the uptime last 651 * changed, and stop here -- we only do the cross-CPU work once 652 * per second. 653 */ 654 uptime = time_uptime; 655 cache = atomic_load_relaxed(&curcpu()->ci_heartbeat_uptime_cache); 656 if (__predict_true(cache == uptime)) { 657 /* 658 * Timecounter hasn't advanced by more than a second. 659 * Make sure the timecounter isn't stuck according to 660 * our heartbeats -- unless timecounter heartbeats are 661 * suspended too. 662 * 663 * Our own heartbeat count can't roll back, and 664 * time_uptime should be updated before it wraps 665 * around, so d should never go negative; hence no 666 * check for d < UINT_MAX/2. 667 */ 668 stamp = 669 atomic_load_relaxed(&curcpu()->ci_heartbeat_uptime_stamp); 670 d = count - stamp; 671 if (__predict_false(d > period_ticks) && 672 !heartbeat_timecounter_suspended()) { 673 panic("%s: time has not advanced in %u heartbeats", 674 cpu_name(curcpu()), d); 675 } 676 return; 677 } 678 679 /* 680 * If the uptime has changed, make sure that it hasn't changed 681 * so much that softints must be stuck on this CPU. Since 682 * time_uptime is monotonic, this can't go negative, hence no 683 * check for d < UINT_MAX/2. 684 * 685 * This uses the hard timer interrupt handler on the current 686 * CPU to ensure soft interrupts at all priority levels have 687 * made progress. 688 */ 689 d = uptime - cache; 690 if (__predict_false(d > period_secs)) { 691 panic("%s: softints stuck for %u seconds", 692 cpu_name(curcpu()), d); 693 } 694 695 /* 696 * Schedule a softint to update our cache of the system uptime 697 * so the next call to heartbeat, on this or another CPU, can 698 * detect progress on this one. 699 */ 700 softint_schedule(heartbeat_sih); 701 702 /* 703 * Select a patient to check the heartbeat of. If there's no 704 * other online CPU, nothing to do. 705 */ 706 patient = select_patient(); 707 if (patient == NULL) 708 return; 709 710 /* 711 * Verify that time is advancing on the patient CPU. If the 712 * delta exceeds UINT_MAX/2, that means it is already ahead by 713 * a little on the other CPU, and the subtraction went 714 * negative, which is OK. If the CPU's heartbeats have been 715 * suspended since we selected it, no worries. 716 * 717 * This uses the current CPU to ensure the other CPU has made 718 * progress, even if the other CPU's hard timer interrupt 719 * handler is stuck for some reason. 720 * 721 * XXX Maybe confirm it hasn't gone negative by more than 722 * max_period? 723 */ 724 d = uptime - atomic_load_relaxed(&patient->ci_heartbeat_uptime_cache); 725 if (__predict_false(d > period_secs) && 726 __predict_false(d < UINT_MAX/2) && 727 atomic_load_relaxed(&patient->ci_heartbeat_suspend) == 0) 728 defibrillate(patient, d); 729} 730 731/* 732 * heartbeat_dump() 733 * 734 * Print the heartbeat data of all CPUs. Can be called from ddb. 735 */ 736#ifdef DDB 737static unsigned 738db_read_unsigned(const volatile unsigned *p) 739{ 740 unsigned x; 741 742 db_read_bytes((db_addr_t)(uintptr_t)p, sizeof(x), (char *)&x); 743 744 return x; 745} 746 747void 748heartbeat_dump(void) 749{ 750 struct cpu_info *ci; 751 752 db_printf("Heartbeats:\n"); 753 for (ci = db_cpu_first(); ci != NULL; ci = db_cpu_next(ci)) { 754 db_printf("cpu%u: count %u uptime %u stamp %u suspend %u\n", 755 db_read_unsigned(&ci->ci_index), 756 db_read_unsigned(&ci->ci_heartbeat_count), 757 db_read_unsigned(&ci->ci_heartbeat_uptime_cache), 758 db_read_unsigned(&ci->ci_heartbeat_uptime_stamp), 759 db_read_unsigned(&ci->ci_heartbeat_suspend)); 760 } 761} 762#endif 763