1/*	$NetBSD: kern_heartbeat.c,v 1.13 2024/03/08 23:34:03 riastradh Exp $	*/
2
3/*-
4 * Copyright (c) 2023 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29/*
30 * heartbeat(9) -- periodic checks to ensure CPUs are making progress
31 *
32 * Manual tests to run when changing this file.  Magic numbers are for
33 * evbarm; adjust for other platforms.  Tests involving cpuctl
34 * online/offline assume a 2-CPU system -- for full testing on a >2-CPU
35 * system, offline all but one CPU.
36 *
37 * 1.	cpuctl offline 0
38 *	sleep 20
39 *	cpuctl online 0
40 *
41 * 2.	cpuctl offline 1
42 *	sleep 20
43 *	cpuctl online 1
44 *
45 * 3.	cpuctl offline 0
46 *	sysctl -w kern.heartbeat.max_period=5
47 *	sleep 10
48 *	sysctl -w kern.heartbeat.max_period=0
49 *	sleep 10
50 *	sysctl -w kern.heartbeat.max_period=5
51 *	sleep 10
52 *	cpuctl online 0
53 *
54 * 4.	sysctl -w debug.crashme_enable=1
55 *	sysctl -w debug.crashme.spl_spinout=1   # IPL_SOFTCLOCK
56 *	# verify system panics after 15sec, with a stack trace through
57 *	# crashme_spl_spinout
58 *
59 * 5.	sysctl -w debug.crashme_enable=1
60 *	sysctl -w debug.crashme.spl_spinout=6   # IPL_SCHED
61 *	# verify system panics after 15sec, with a stack trace through
62 *	# crashme_spl_spinout
63 *
64 * 6.	cpuctl offline 0
65 *	sysctl -w debug.crashme_enable=1
66 *	sysctl -w debug.crashme.spl_spinout=1   # IPL_SOFTCLOCK
67 *	# verify system panics after 15sec, with a stack trace through
68 *	# crashme_spl_spinout
69 *
70 * 7.	cpuctl offline 0
71 *	sysctl -w debug.crashme_enable=1
72 *	sysctl -w debug.crashme.spl_spinout=5   # IPL_VM
73 *	# verify system panics after 15sec, with a stack trace through
74 *	# crashme_spl_spinout
75 *
76 *	# Not this -- IPL_SCHED and IPL_HIGH spinout on a single CPU
77 *	# require a hardware watchdog timer.
78 *	#cpuctl offline 0
 *	#sysctl -w debug.crashme_enable=1
80 *	#sysctl -w debug.crashme.spl_spinout=6   # IPL_SCHED
81 *	# hope watchdog timer kicks in
82 */
83
84#include <sys/cdefs.h>
85__KERNEL_RCSID(0, "$NetBSD: kern_heartbeat.c,v 1.13 2024/03/08 23:34:03 riastradh Exp $");
86
87#ifdef _KERNEL_OPT
88#include "opt_ddb.h"
89#include "opt_heartbeat.h"
90#endif
91
92#include "heartbeat.h"
93
94#include <sys/param.h>
95#include <sys/types.h>
96
97#include <sys/atomic.h>
98#include <sys/cpu.h>
99#include <sys/errno.h>
100#include <sys/heartbeat.h>
101#include <sys/ipi.h>
102#include <sys/kernel.h>
103#include <sys/mutex.h>
104#include <sys/sysctl.h>
105#include <sys/systm.h>
106#include <sys/xcall.h>
107
108#ifdef DDB
109#include <ddb/ddb.h>
110#endif
111
112/*
113 * Global state.
114 *
115 *	heartbeat_lock serializes access to heartbeat_max_period_secs
116 *	and heartbeat_max_period_ticks.  Two separate variables so we
117 *	can avoid multiplication or division in the heartbeat routine.
118 *
119 *	heartbeat_sih is stable after initialization in
120 *	heartbeat_start.
121 */
/* Serializes updates to the two max-period variables below. */
kmutex_t heartbeat_lock			__cacheline_aligned;
/* Maximum heartbeat period in seconds; zero means checks are disabled. */
unsigned heartbeat_max_period_secs	__read_mostly;
/* The same period in ticks (seconds * hz), stored by set_max_period. */
unsigned heartbeat_max_period_ticks	__read_mostly;

/* Soft interrupt handle returned by softint_establish in heartbeat_start. */
void *heartbeat_sih			__read_mostly;
127
128/*
129 * heartbeat_suspend()
130 *
131 *	Suspend heartbeat monitoring of the current CPU.
132 *
133 *	Called after the current CPU has been marked offline but before
134 *	it has stopped running, or after IPL has been raised for
135 *	polling-mode console input.  Nestable (but only 2^32 times, so
136 *	don't do this in a loop).  Reversed by heartbeat_resume.
137 *
138 *	Caller must be bound to the CPU, i.e., curcpu_stable() must be
139 *	true.  This function does not assert curcpu_stable() since it
140 *	is used in the ddb entry path, where any assertions risk
141 *	infinite regress into undebuggable chaos, so callers must be
142 *	careful.
143 */
144void
145heartbeat_suspend(void)
146{
147	unsigned *p;
148
149	p = &curcpu()->ci_heartbeat_suspend;
150	atomic_store_relaxed(p, *p + 1);
151}
152
153/*
154 * heartbeat_resume_cpu(ci)
155 *
156 *	Resume heartbeat monitoring of ci.
157 *
158 *	Called at startup while cold, and whenever heartbeat monitoring
159 *	is re-enabled after being disabled or the period is changed.
160 *	When not cold, ci must be the current CPU.
161 *
162 *	Must be run at splsched.
163 */
164static void
165heartbeat_resume_cpu(struct cpu_info *ci)
166{
167
168	KASSERT(__predict_false(cold) || curcpu_stable());
169	KASSERT(__predict_false(cold) || ci == curcpu());
170	/* XXX KASSERT IPL_SCHED */
171
172	ci->ci_heartbeat_count = 0;
173	ci->ci_heartbeat_uptime_cache = time_uptime;
174	ci->ci_heartbeat_uptime_stamp = 0;
175}
176
177/*
178 * heartbeat_resume()
179 *
180 *	Resume heartbeat monitoring of the current CPU.
181 *
182 *	Called after the current CPU has started running but before it
183 *	has been marked online, or when ending polling-mode input
184 *	before IPL is restored.  Reverses heartbeat_suspend.
185 *
186 *	Caller must be bound to the CPU, i.e., curcpu_stable() must be
187 *	true.
188 */
189void
190heartbeat_resume(void)
191{
192	struct cpu_info *ci = curcpu();
193	unsigned *p;
194	int s;
195
196	KASSERT(curcpu_stable());
197
198	/*
199	 * Reset the state so nobody spuriously thinks we had a heart
200	 * attack as soon as the heartbeat checks resume.
201	 */
202	s = splsched();
203	heartbeat_resume_cpu(ci);
204	splx(s);
205
206	p = &ci->ci_heartbeat_suspend;
207	atomic_store_relaxed(p, *p - 1);
208}
209
210/*
211 * heartbeat_timecounter_suspended()
212 *
213 *	True if timecounter heartbeat checks are suspended because the
214 *	timecounter may not be advancing, false if heartbeat checks
215 *	should check for timecounter progress.
216 */
217static bool
218heartbeat_timecounter_suspended(void)
219{
220	CPU_INFO_ITERATOR cii;
221	struct cpu_info *ci;
222
223	/*
224	 * The timecounter ticks only on the primary CPU.  Check
225	 * whether it's suspended.
226	 *
227	 * XXX Would be nice if we could find the primary CPU without
228	 * iterating over all CPUs.
229	 */
230	for (CPU_INFO_FOREACH(cii, ci)) {
231		if (CPU_IS_PRIMARY(ci))
232			return atomic_load_relaxed(&ci->ci_heartbeat_suspend);
233	}
234
235	/*
236	 * This should be unreachable -- there had better be a primary
237	 * CPU in the system!  If not, the timecounter will be busted
238	 * anyway.
239	 */
240	panic("no primary CPU");
241}
242
243/*
244 * heartbeat_reset_xc(a, b)
245 *
246 *	Cross-call handler to reset heartbeat state just prior to
247 *	enabling heartbeat checks.
248 */
static void
heartbeat_reset_xc(void *a, void *b)
{
	/* Neither cross-call argument is used. */
	const int ipl = splsched();

	heartbeat_resume_cpu(curcpu());
	splx(ipl);
}
258
259/*
260 * set_max_period(max_period)
261 *
262 *	Set the maximum period, in seconds, for heartbeat checks.
263 *
264 *	- If max_period is zero, disable them.
265 *
266 *	- If the max period was zero and max_period is nonzero, ensure
267 *	  all CPUs' heartbeat uptime caches are up-to-date before
268 *	  re-enabling them.
269 *
 *	max_period must not exceed UINT_MAX/4/hz to avoid arithmetic
 *	overflow and give room for slop.
272 *
273 *	Caller must hold heartbeat_lock.
274 */
275static void
276set_max_period(unsigned max_period)
277{
278
279	KASSERTMSG(max_period <= UINT_MAX/4/hz,
280	    "max_period=%u must not exceed UINT_MAX/4/hz=%u (hz=%u)",
281	    max_period, UINT_MAX/4/hz, hz);
282	KASSERT(mutex_owned(&heartbeat_lock));
283
284	/*
285	 * If we're enabling heartbeat checks, make sure we have a
286	 * reasonably up-to-date time_uptime cache on all CPUs so we
287	 * don't think we had an instant heart attack.
288	 */
289	if (heartbeat_max_period_secs == 0 && max_period != 0) {
290		if (cold) {
291			CPU_INFO_ITERATOR cii;
292			struct cpu_info *ci;
293
294			for (CPU_INFO_FOREACH(cii, ci))
295				heartbeat_resume_cpu(ci);
296		} else {
297			const uint64_t ticket =
298			    xc_broadcast(0, &heartbeat_reset_xc, NULL, NULL);
299			xc_wait(ticket);
300		}
301	}
302
303	/*
304	 * Once the heartbeat state has been updated on all (online)
305	 * CPUs, set the period.  At this point, heartbeat checks can
306	 * begin.
307	 */
308	atomic_store_relaxed(&heartbeat_max_period_secs, max_period);
309	atomic_store_relaxed(&heartbeat_max_period_ticks, max_period*hz);
310}
311
312/*
 * heartbeat_max_period_sysctl(SYSCTLFN_ARGS)
314 *
315 *	Sysctl handler for sysctl kern.heartbeat.max_period.  Verifies
316 *	it lies within a reasonable interval and sets it.
317 */
318static int
319heartbeat_max_period_sysctl(SYSCTLFN_ARGS)
320{
321	struct sysctlnode node;
322	unsigned max_period;
323	int error;
324
325	mutex_enter(&heartbeat_lock);
326
327	max_period = heartbeat_max_period_secs;
328	node = *rnode;
329	node.sysctl_data = &max_period;
330	error = sysctl_lookup(SYSCTLFN_CALL(&node));
331	if (error || newp == NULL)
332		goto out;
333
334	/*
335	 * Ensure there's plenty of slop between heartbeats.
336	 */
337	if (max_period > UINT_MAX/4/hz) {
338		error = EOVERFLOW;
339		goto out;
340	}
341
342	/*
343	 * Success!  Set the period.  This enables heartbeat checks if
344	 * we went from zero period to nonzero period, or disables them
345	 * if the other way around.
346	 */
347	set_max_period(max_period);
348	error = 0;
349
350out:	mutex_exit(&heartbeat_lock);
351	return error;
352}
353
354/*
355 * sysctl_heartbeat_setup()
356 *
357 *	Set up the kern.heartbeat.* sysctl subtree.
358 */
359SYSCTL_SETUP(sysctl_heartbeat_setup, "sysctl kern.heartbeat setup")
360{
361	const struct sysctlnode *rnode;
362	int error;
363
364	mutex_init(&heartbeat_lock, MUTEX_DEFAULT, IPL_NONE);
365
366	/* kern.heartbeat */
367	error = sysctl_createv(NULL, 0, NULL, &rnode,
368	    CTLFLAG_PERMANENT,
369	    CTLTYPE_NODE, "heartbeat",
370	    SYSCTL_DESCR("Kernel heartbeat parameters"),
371	    NULL, 0, NULL, 0,
372	    CTL_KERN, CTL_CREATE, CTL_EOL);
373	if (error) {
374		printf("%s: failed to create kern.heartbeat: %d\n",
375		    __func__, error);
376		return;
377	}
378
379	/* kern.heartbeat.max_period */
380	error = sysctl_createv(NULL, 0, &rnode, NULL,
381	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
382	    CTLTYPE_INT, "max_period",
383	    SYSCTL_DESCR("Max seconds between heartbeats before panic"),
384	    &heartbeat_max_period_sysctl, 0, NULL, 0,
385	    CTL_CREATE, CTL_EOL);
386	if (error) {
387		printf("%s: failed to create kern.heartbeat.max_period: %d\n",
388		    __func__, error);
389		return;
390	}
391}
392
393/*
394 * heartbeat_intr(cookie)
395 *
396 *	Soft interrupt handler to update the local CPU's view of the
397 *	system uptime.  This runs at the same priority level as
398 *	callouts, so if callouts are stuck on this CPU, it won't run,
399 *	and eventually another CPU will notice that this one is stuck.
400 *
401 *	Don't do spl* here -- keep it to a minimum so if anything goes
402 *	wrong we don't end up with hard interrupts blocked and unable
403 *	to detect a missed heartbeat.
404 */
405static void
406heartbeat_intr(void *cookie)
407{
408	unsigned count = atomic_load_relaxed(&curcpu()->ci_heartbeat_count);
409	unsigned uptime = time_uptime;
410
411	atomic_store_relaxed(&curcpu()->ci_heartbeat_uptime_stamp, count);
412	atomic_store_relaxed(&curcpu()->ci_heartbeat_uptime_cache, uptime);
413}
414
415/*
416 * heartbeat_start()
417 *
418 *	Start system heartbeat monitoring.
419 */
420void
421heartbeat_start(void)
422{
423	const unsigned max_period = HEARTBEAT_MAX_PERIOD_DEFAULT;
424
425	/*
426	 * Establish a softint so we can schedule it once ready.  This
427	 * should be at the lowest softint priority level so that we
428	 * ensure all softint priorities are making progress.
429	 */
430	heartbeat_sih = softint_establish(SOFTINT_CLOCK|SOFTINT_MPSAFE,
431	    &heartbeat_intr, NULL);
432
433	/*
434	 * Now that the softint is established, kick off heartbeat
435	 * monitoring with the default period.  This will initialize
436	 * the per-CPU state to an up-to-date cache of time_uptime.
437	 */
438	mutex_enter(&heartbeat_lock);
439	set_max_period(max_period);
440	mutex_exit(&heartbeat_lock);
441}
442
443/*
444 * defibrillator(cookie)
445 *
446 *	IPI handler for defibrillation.  If the CPU's heart has stopped
447 *	beating normally, but the CPU can still execute things,
448 *	acknowledge the IPI to the doctor and then panic so we at least
449 *	get a stack trace from whatever the current CPU is stuck doing,
450 *	if not a core dump.
451 *
452 *	(This metaphor is a little stretched, since defibrillation is
 *	usually administered when the heart is beating erratically but
454 *	hasn't stopped, and causes the heart to stop temporarily, and
455 *	one hopes it is not fatal.  But we're (software) engineers, so
456 *	we can stretch metaphors like silly putty in a blender.)
457 */
458static void
459defibrillator(void *cookie)
460{
461	bool *ack = cookie;
462
463	/*
464	 * Acknowledge the interrupt so the doctor CPU won't trigger a
465	 * new panic for defibrillation timeout.
466	 */
467	atomic_store_relaxed(ack, true);
468
469	/*
470	 * If a panic is already in progress, we may have interrupted
471	 * the logic that prints a stack trace on this CPU -- so let's
472	 * not make it worse by giving the misapprehension of a
473	 * recursive panic.
474	 */
475	if (atomic_load_relaxed(&panicstr) != NULL)
476		return;
477
478	panic("%s[%d %s]: heart stopped beating", cpu_name(curcpu()),
479	    curlwp->l_lid,
480	    curlwp->l_name ? curlwp->l_name : curproc->p_comm);
481}
482
483/*
484 * defibrillate(ci, unsigned d)
485 *
486 *	The patient CPU ci's heart has stopped beating after d seconds.
487 *	Force the patient CPU ci to panic, or panic on this CPU if the
488 *	patient CPU doesn't respond within 1sec.
489 */
490static void __noinline
491defibrillate(struct cpu_info *ci, unsigned d)
492{
493	bool ack = false;
494	ipi_msg_t msg = {
495		.func = &defibrillator,
496		.arg = &ack,
497	};
498	unsigned countdown = 1000; /* 1sec */
499
500	KASSERT(curcpu_stable());
501
502	/*
503	 * First notify the console that the patient CPU's heart seems
504	 * to have stopped beating.
505	 */
506	printf("%s: found %s heart stopped beating after %u seconds\n",
507	    cpu_name(curcpu()), cpu_name(ci), d);
508
509	/*
510	 * Next, give the patient CPU a chance to panic, so we get a
511	 * stack trace on that CPU even if we don't get a crash dump.
512	 */
513	ipi_unicast(&msg, ci);
514
515	/*
516	 * Busy-wait up to 1sec for the patient CPU to print a stack
517	 * trace and panic.  If the patient CPU acknowledges the IPI,
518	 * just give up and stop here -- the system is coming down soon
519	 * and we should avoid getting in the way.
520	 */
521	while (countdown --> 0) {
522		if (atomic_load_relaxed(&ack))
523			return;
524		DELAY(1000);	/* 1ms */
525	}
526
527	/*
528	 * The patient CPU failed to acknowledge the panic request.
529	 * Panic now; with any luck, we'll get a crash dump.
530	 */
531	panic("%s: found %s heart stopped beating and unresponsive",
532	    cpu_name(curcpu()), cpu_name(ci));
533}
534
535/*
536 * select_patient()
537 *
538 *	Select another CPU to check the heartbeat of.  Returns NULL if
539 *	there are no other online CPUs.  Never returns curcpu().
540 *	Caller must have kpreemption disabled.
541 */
542static struct cpu_info *
543select_patient(void)
544{
545	CPU_INFO_ITERATOR cii;
546	struct cpu_info *first = NULL, *patient = NULL, *ci;
547	bool passedcur = false;
548
549	KASSERT(curcpu_stable());
550
551	/*
552	 * In the iteration order of all CPUs, find the next online CPU
553	 * after curcpu(), or the first online one if curcpu() is last
554	 * in the iteration order.
555	 */
556	for (CPU_INFO_FOREACH(cii, ci)) {
557		if (atomic_load_relaxed(&ci->ci_heartbeat_suspend))
558			continue;
559		if (passedcur) {
560			/*
561			 * (...|curcpu()|ci|...)
562			 *
563			 * Found the patient right after curcpu().
564			 */
565			KASSERT(patient != ci);
566			patient = ci;
567			break;
568		}
569		if (ci == curcpu()) {
570			/*
571			 * (...|prev|ci=curcpu()|next|...)
572			 *
573			 * Note that we want next (or first, if there's
574			 * nothing after curcpu()).
575			 */
576			passedcur = true;
577			continue;
578		}
579		if (first == NULL) {
580			/*
581			 * (ci|...|curcpu()|...)
582			 *
583			 * Record ci as first in case there's nothing
584			 * after curcpu().
585			 */
586			first = ci;
587			continue;
588		}
589	}
590
591	/*
592	 * If we hit the end, wrap around to the beginning.
593	 */
594	if (patient == NULL) {
595		KASSERT(passedcur);
596		patient = first;
597	}
598
599	return patient;
600}
601
602/*
603 * heartbeat()
604 *
605 *	1. Count a heartbeat on the local CPU.
606 *
607 *	2. Panic if the system uptime doesn't seem to have advanced in
608 *	   a while.
609 *
610 *	3. Panic if the soft interrupt on this CPU hasn't advanced the
611 *	   local view of the system uptime.
612 *
613 *	4. Schedule the soft interrupt to advance the local view of the
614 *	   system uptime.
615 *
616 *	5. Select another CPU to check the heartbeat of.
617 *
618 *	6. Panic if the other CPU hasn't advanced its view of the
619 *	   system uptime in a while.
620 */
621void
622heartbeat(void)
623{
624	unsigned period_ticks, period_secs;
625	unsigned count, uptime, cache, stamp, d;
626	struct cpu_info *patient;
627
628	KASSERT(curcpu_stable());
629
630	/*
631	 * If heartbeat checks are disabled globally, or if they are
632	 * suspended locally, or if we're already panicking so it's not
633	 * helpful to trigger more panics for more reasons, do nothing.
634	 */
635	period_ticks = atomic_load_relaxed(&heartbeat_max_period_ticks);
636	period_secs = atomic_load_relaxed(&heartbeat_max_period_secs);
637	if (__predict_false(period_ticks == 0) ||
638	    __predict_false(period_secs == 0) ||
639	    __predict_false(curcpu()->ci_heartbeat_suspend) ||
640	    __predict_false(panicstr != NULL))
641		return;
642
643	/*
644	 * Count a heartbeat on this CPU.
645	 */
646	count = curcpu()->ci_heartbeat_count++;
647
648	/*
649	 * If the uptime hasn't changed, make sure that we haven't
650	 * counted too many of our own heartbeats since the uptime last
651	 * changed, and stop here -- we only do the cross-CPU work once
652	 * per second.
653	 */
654	uptime = time_uptime;
655	cache = atomic_load_relaxed(&curcpu()->ci_heartbeat_uptime_cache);
656	if (__predict_true(cache == uptime)) {
657		/*
658		 * Timecounter hasn't advanced by more than a second.
659		 * Make sure the timecounter isn't stuck according to
660		 * our heartbeats -- unless timecounter heartbeats are
661		 * suspended too.
662		 *
663		 * Our own heartbeat count can't roll back, and
664		 * time_uptime should be updated before it wraps
665		 * around, so d should never go negative; hence no
666		 * check for d < UINT_MAX/2.
667		 */
668		stamp =
669		    atomic_load_relaxed(&curcpu()->ci_heartbeat_uptime_stamp);
670		d = count - stamp;
671		if (__predict_false(d > period_ticks) &&
672		    !heartbeat_timecounter_suspended()) {
673			panic("%s: time has not advanced in %u heartbeats",
674			    cpu_name(curcpu()), d);
675		}
676		return;
677	}
678
679	/*
680	 * If the uptime has changed, make sure that it hasn't changed
681	 * so much that softints must be stuck on this CPU.  Since
682	 * time_uptime is monotonic, this can't go negative, hence no
683	 * check for d < UINT_MAX/2.
684	 *
685	 * This uses the hard timer interrupt handler on the current
686	 * CPU to ensure soft interrupts at all priority levels have
687	 * made progress.
688	 */
689	d = uptime - cache;
690	if (__predict_false(d > period_secs)) {
691		panic("%s: softints stuck for %u seconds",
692		    cpu_name(curcpu()), d);
693	}
694
695	/*
696	 * Schedule a softint to update our cache of the system uptime
697	 * so the next call to heartbeat, on this or another CPU, can
698	 * detect progress on this one.
699	 */
700	softint_schedule(heartbeat_sih);
701
702	/*
703	 * Select a patient to check the heartbeat of.  If there's no
704	 * other online CPU, nothing to do.
705	 */
706	patient = select_patient();
707	if (patient == NULL)
708		return;
709
710	/*
711	 * Verify that time is advancing on the patient CPU.  If the
712	 * delta exceeds UINT_MAX/2, that means it is already ahead by
713	 * a little on the other CPU, and the subtraction went
714	 * negative, which is OK.  If the CPU's heartbeats have been
715	 * suspended since we selected it, no worries.
716	 *
717	 * This uses the current CPU to ensure the other CPU has made
718	 * progress, even if the other CPU's hard timer interrupt
719	 * handler is stuck for some reason.
720	 *
721	 * XXX Maybe confirm it hasn't gone negative by more than
722	 * max_period?
723	 */
724	d = uptime - atomic_load_relaxed(&patient->ci_heartbeat_uptime_cache);
725	if (__predict_false(d > period_secs) &&
726	    __predict_false(d < UINT_MAX/2) &&
727	    atomic_load_relaxed(&patient->ci_heartbeat_suspend) == 0)
728		defibrillate(patient, d);
729}
730
731/*
732 * heartbeat_dump()
733 *
734 *	Print the heartbeat data of all CPUs.  Can be called from ddb.
735 */
736#ifdef DDB
737static unsigned
738db_read_unsigned(const volatile unsigned *p)
739{
740	unsigned x;
741
742	db_read_bytes((db_addr_t)(uintptr_t)p, sizeof(x), (char *)&x);
743
744	return x;
745}
746
747void
748heartbeat_dump(void)
749{
750	struct cpu_info *ci;
751
752	db_printf("Heartbeats:\n");
753	for (ci = db_cpu_first(); ci != NULL; ci = db_cpu_next(ci)) {
754		db_printf("cpu%u: count %u uptime %u stamp %u suspend %u\n",
755		    db_read_unsigned(&ci->ci_index),
756		    db_read_unsigned(&ci->ci_heartbeat_count),
757		    db_read_unsigned(&ci->ci_heartbeat_uptime_cache),
758		    db_read_unsigned(&ci->ci_heartbeat_uptime_stamp),
759		    db_read_unsigned(&ci->ci_heartbeat_suspend));
760	}
761}
762#endif
763