// SPDX-License-Identifier: GPL-2.0
/*
 * This file contains functions which emulate a local clock-event
 * device via a broadcast event source.
 *
 * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
 * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
 * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
 */
#include <linux/cpu.h>
#include <linux/err.h>
#include <linux/hrtimer.h>
#include <linux/interrupt.h>
#include <linux/percpu.h>
#include <linux/profile.h>
#include <linux/sched.h>
#include <linux/smp.h>
#include <linux/module.h>

#include "tick-internal.h"

/*
 * Broadcast support for broken x86 hardware, where the local apic
 * timer stops in C3 state.
 */

static struct tick_device tick_broadcast_device;
static cpumask_var_t tick_broadcast_mask __cpumask_var_read_mostly;
static cpumask_var_t tick_broadcast_on __cpumask_var_read_mostly;
static cpumask_var_t tmpmask __cpumask_var_read_mostly;
static int tick_broadcast_forced;

static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(tick_broadcast_lock);

#ifdef CONFIG_TICK_ONESHOT
static DEFINE_PER_CPU(struct clock_event_device *, tick_oneshot_wakeup_device);

static void tick_broadcast_setup_oneshot(struct clock_event_device *bc, bool from_periodic);
static void tick_broadcast_clear_oneshot(int cpu);
static void tick_resume_broadcast_oneshot(struct clock_event_device *bc);
# ifdef CONFIG_HOTPLUG_CPU
static void tick_broadcast_oneshot_offline(unsigned int cpu);
# endif
#else
static inline void
tick_broadcast_setup_oneshot(struct clock_event_device *bc, bool from_periodic) { BUG(); }
static inline void tick_broadcast_clear_oneshot(int cpu) { }
static inline void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { }
# ifdef CONFIG_HOTPLUG_CPU
static inline void tick_broadcast_oneshot_offline(unsigned int cpu) { }
# endif
#endif

/*
 * Debugging: see timer_list.c
 */
struct tick_device *tick_get_broadcast_device(void)
{
	return &tick_broadcast_device;
}

struct cpumask *tick_get_broadcast_mask(void)
{
	return tick_broadcast_mask;
}

static struct clock_event_device *tick_get_oneshot_wakeup_device(int cpu);

const struct clock_event_device *tick_get_wakeup_device(int cpu)
{
	return tick_get_oneshot_wakeup_device(cpu);
}

/*
 * Start the device in periodic mode
 */
static void tick_broadcast_start_periodic(struct clock_event_device *bc)
{
	if (bc)
		tick_setup_periodic(bc, 1);
}

/*
 * Check, if the device can be utilized as broadcast device:
 */
static bool tick_check_broadcast_device(struct clock_event_device *curdev,
					struct clock_event_device *newdev)
{
	if ((newdev->features & CLOCK_EVT_FEAT_DUMMY) ||
	    (newdev->features & CLOCK_EVT_FEAT_PERCPU) ||
	    (newdev->features & CLOCK_EVT_FEAT_C3STOP))
		return false;

	if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT &&
	    !(newdev->features & CLOCK_EVT_FEAT_ONESHOT))
		return false;

	return !curdev || newdev->rating > curdev->rating;
}

#ifdef CONFIG_TICK_ONESHOT
static struct clock_event_device *tick_get_oneshot_wakeup_device(int cpu)
{
	return per_cpu(tick_oneshot_wakeup_device, cpu);
}

static void tick_oneshot_wakeup_handler(struct clock_event_device *wd)
{
	/*
	 * If we woke up early and the tick was reprogrammed in the
	 * meantime then this may be spurious but harmless.
	 */
	tick_receive_broadcast();
}

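/*
 * Conditionally install @newdev as the per-CPU oneshot wakeup device for
 * @cpu. The device must be per-CPU, oneshot capable, not affected by deep
 * power states (C3STOP) and better rated than the current one. Passing
 * NULL removes the current wakeup device.
 */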
static bool tick_set_oneshot_wakeup_device(struct clock_event_device *newdev,
					   int cpu)
{
	struct clock_event_device *curdev = tick_get_oneshot_wakeup_device(cpu);

	if (!newdev)
		goto set_device;

	if ((newdev->features & CLOCK_EVT_FEAT_DUMMY) ||
	    (newdev->features & CLOCK_EVT_FEAT_C3STOP))
		return false;

	if (!(newdev->features & CLOCK_EVT_FEAT_PERCPU) ||
	    !(newdev->features & CLOCK_EVT_FEAT_ONESHOT))
		return false;

	if (!cpumask_equal(newdev->cpumask, cpumask_of(cpu)))
		return false;

	if (curdev && newdev->rating <= curdev->rating)
		return false;

	if (!try_module_get(newdev->owner))
		return false;

	newdev->event_handler = tick_oneshot_wakeup_handler;
set_device:
	clockevents_exchange_device(curdev, newdev);
	per_cpu(tick_oneshot_wakeup_device, cpu) = newdev;
	return true;
}
#else
static struct clock_event_device *tick_get_oneshot_wakeup_device(int cpu)
{
	return NULL;
}

static bool tick_set_oneshot_wakeup_device(struct clock_event_device *newdev,
					   int cpu)
{
	return false;
}
#endif

/*
 * Conditionally install/replace broadcast device
 */
void tick_install_broadcast_device(struct clock_event_device *dev, int cpu)
{
	struct clock_event_device *cur = tick_broadcast_device.evtdev;

	if (tick_set_oneshot_wakeup_device(dev, cpu))
		return;

	if (!tick_check_broadcast_device(cur, dev))
		return;

	if (!try_module_get(dev->owner))
		return;

	clockevents_exchange_device(cur, dev);
	if (cur)
		cur->event_handler = clockevents_handle_noop;
	tick_broadcast_device.evtdev = dev;
	if (!cpumask_empty(tick_broadcast_mask))
		tick_broadcast_start_periodic(dev);

	if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT))
		return;

	/*
	 * If the system already runs in oneshot mode, switch the newly
	 * registered broadcast device to oneshot mode explicitly.
	 */
	if (tick_broadcast_oneshot_active()) {
		tick_broadcast_switch_to_oneshot();
		return;
	}

	/*
	 * Inform all CPUs about this. We might be in a situation
	 * where we did not switch to oneshot mode because the per-CPU
	 * devices are affected by CLOCK_EVT_FEAT_C3STOP and the lack
	 * of a oneshot capable broadcast device. Without that
	 * notification the system stays stuck in periodic mode
	 * forever.
	 */
	tick_clock_notify();
}

/*
 * Check, if the device is the broadcast device
 */
int tick_is_broadcast_device(struct clock_event_device *dev)
{
	return (dev && tick_broadcast_device.evtdev == dev);
}

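/*
 * Update the frequency of the broadcast device. Fails with -ENODEV when
 * @dev is not the installed broadcast device.
 */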
int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq)
{
	int ret = -ENODEV;

	if (tick_is_broadcast_device(dev)) {
		raw_spin_lock(&tick_broadcast_lock);
		ret = __clockevents_update_freq(dev, freq);
		raw_spin_unlock(&tick_broadcast_lock);
	}
	return ret;
}

static void err_broadcast(const struct cpumask *mask)
{
	pr_crit_once("Failed to broadcast timer tick. Some CPUs may be unresponsive.\n");
}

static void tick_device_setup_broadcast_func(struct clock_event_device *dev)
{
	if (!dev->broadcast)
		dev->broadcast = tick_broadcast;
	if (!dev->broadcast) {
		pr_warn_once("%s depends on broadcast, but no broadcast function available\n",
			     dev->name);
		dev->broadcast = err_broadcast;
	}
}

/*
 * Check, if the device is dysfunctional and a placeholder, which
 * needs to be handled by the broadcast device.
 */
int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
{
	struct clock_event_device *bc = tick_broadcast_device.evtdev;
	unsigned long flags;
	int ret = 0;

	raw_spin_lock_irqsave(&tick_broadcast_lock, flags);

	/*
	 * Devices might be registered with both periodic and oneshot
	 * mode disabled. This signals, that the device needs to be
	 * operated from the broadcast device and is a placeholder for
	 * the cpu local device.
	 */
	if (!tick_device_is_functional(dev)) {
		dev->event_handler = tick_handle_periodic;
		tick_device_setup_broadcast_func(dev);
		cpumask_set_cpu(cpu, tick_broadcast_mask);
		if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
			tick_broadcast_start_periodic(bc);
		else
			tick_broadcast_setup_oneshot(bc, false);
		ret = 1;
	} else {
		/*
		 * Clear the broadcast bit for this cpu if the
		 * device is not power state affected.
		 */
		if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
			cpumask_clear_cpu(cpu, tick_broadcast_mask);
		else
			tick_device_setup_broadcast_func(dev);

		/*
		 * Clear the broadcast bit if the CPU is not in
		 * periodic broadcast on state.
		 */
		if (!cpumask_test_cpu(cpu, tick_broadcast_on))
			cpumask_clear_cpu(cpu, tick_broadcast_mask);

		switch (tick_broadcast_device.mode) {
		case TICKDEV_MODE_ONESHOT:
			/*
			 * If the system is in oneshot mode we can
			 * unconditionally clear the oneshot mask bit,
			 * because the CPU is running and therefore
			 * not in an idle state which causes the power
			 * state affected device to stop. Let the
			 * caller initialize the device.
			 */
			tick_broadcast_clear_oneshot(cpu);
			ret = 0;
			break;

		case TICKDEV_MODE_PERIODIC:
			/*
			 * If the system is in periodic mode, check
			 * whether the broadcast device can be
			 * switched off now.
			 */
			if (cpumask_empty(tick_broadcast_mask) && bc)
				clockevents_shutdown(bc);
			/*
			 * If we kept the cpu in the broadcast mask,
			 * tell the caller to leave the per cpu device
			 * in shutdown state. The periodic interrupt
			 * is delivered by the broadcast device, if
			 * the broadcast device exists and is not
			 * hrtimer based.
			 */
			if (bc && !(bc->features & CLOCK_EVT_FEAT_HRTIMER))
				ret = cpumask_test_cpu(cpu, tick_broadcast_mask);
			break;
		default:
			break;
		}
	}
	raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
	return ret;
}

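/*
 * Called from the broadcast IPI or wakeup device handler to invoke the
 * event handler of the CPU local clock event device.
 */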
int tick_receive_broadcast(void)
{
	struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
	struct clock_event_device *evt = td->evtdev;

	if (!evt)
		return -ENODEV;

	if (!evt->event_handler)
		return -EINVAL;

	evt->event_handler(evt);
	return 0;
}

/*
 * Broadcast the event to the cpus, which are set in the mask (mangled).
 */
static bool tick_do_broadcast(struct cpumask *mask)
{
	int cpu = smp_processor_id();
	struct tick_device *td;
	bool local = false;

	/*
	 * Check, if the current cpu is in the mask
	 */
	if (cpumask_test_cpu(cpu, mask)) {
		struct clock_event_device *bc = tick_broadcast_device.evtdev;

		cpumask_clear_cpu(cpu, mask);
		/*
		 * We only run the local handler, if the broadcast
		 * device is not hrtimer based. Otherwise we run into
		 * a hrtimer recursion.
		 *
		 * local timer_interrupt()
		 *   local_handler()
		 *     expire_hrtimers()
		 *       bc_handler()
		 *         local_handler()
		 *           expire_hrtimers()
		 */
		local = !(bc->features & CLOCK_EVT_FEAT_HRTIMER);
	}

	if (!cpumask_empty(mask)) {
		/*
		 * It might be necessary to actually check whether the devices
		 * have different broadcast functions. For now, just use the
		 * one of the first device. This works as long as we have this
		 * misfeature only on x86 (lapic)
		 */
		td = &per_cpu(tick_cpu_device, cpumask_first(mask));
		td->evtdev->broadcast(mask);
	}
	return local;
}

/*
 * Periodic broadcast:
 * - invoke the broadcast handlers
 */
static bool tick_do_periodic_broadcast(void)
{
	cpumask_and(tmpmask, cpu_online_mask, tick_broadcast_mask);
	return tick_do_broadcast(tmpmask);
}

/*
 * Event handler for periodic broadcast ticks
 */
static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
{
	struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
	bool bc_local;

	raw_spin_lock(&tick_broadcast_lock);

	/* Handle spurious interrupts gracefully */
	if (clockevent_state_shutdown(tick_broadcast_device.evtdev)) {
		raw_spin_unlock(&tick_broadcast_lock);
		return;
	}

	bc_local = tick_do_periodic_broadcast();

	if (clockevent_state_oneshot(dev)) {
		ktime_t next = ktime_add_ns(dev->next_event, TICK_NSEC);

		clockevents_program_event(dev, next, true);
	}
	raw_spin_unlock(&tick_broadcast_lock);

	/*
	 * We run the handler of the local cpu after dropping
	 * tick_broadcast_lock because the handler might deadlock when
	 * trying to switch to oneshot mode.
	 */
	if (bc_local)
		td->evtdev->event_handler(td->evtdev);
}

/**
 * tick_broadcast_control - Enable/disable or force broadcast mode
 * @mode:	The selected broadcast mode
 *
 * Called when the system enters a state where affected tick devices
 * might stop. Note: TICK_BROADCAST_FORCE cannot be undone.
 */
void tick_broadcast_control(enum tick_broadcast_mode mode)
{
	struct clock_event_device *bc, *dev;
	struct tick_device *td;
	int cpu, bc_stopped;
	unsigned long flags;

	/* Protects also the local clockevent device. */
	raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
	td = this_cpu_ptr(&tick_cpu_device);
	dev = td->evtdev;

	/*
	 * Is the device not affected by the powerstate ?
	 */
	if (!dev || !(dev->features & CLOCK_EVT_FEAT_C3STOP))
		goto out;

	if (!tick_device_is_functional(dev))
		goto out;

	cpu = smp_processor_id();
	bc = tick_broadcast_device.evtdev;
	bc_stopped = cpumask_empty(tick_broadcast_mask);

	switch (mode) {
	case TICK_BROADCAST_FORCE:
		tick_broadcast_forced = 1;
		fallthrough;
	case TICK_BROADCAST_ON:
		cpumask_set_cpu(cpu, tick_broadcast_on);
		if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) {
			/*
			 * Only shutdown the cpu local device, if:
			 *
			 * - the broadcast device exists
			 * - the broadcast device is not a hrtimer based one
			 * - the broadcast device is in periodic mode to
			 *   avoid a hiccup during switch to oneshot mode
			 */
			if (bc && !(bc->features & CLOCK_EVT_FEAT_HRTIMER) &&
			    tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
				clockevents_shutdown(dev);
		}
		break;

	case TICK_BROADCAST_OFF:
		if (tick_broadcast_forced)
			break;
		cpumask_clear_cpu(cpu, tick_broadcast_on);
		if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_mask)) {
			if (tick_broadcast_device.mode ==
			    TICKDEV_MODE_PERIODIC)
				tick_setup_periodic(dev, 0);
		}
		break;
	}

	if (bc) {
		if (cpumask_empty(tick_broadcast_mask)) {
			if (!bc_stopped)
				clockevents_shutdown(bc);
		} else if (bc_stopped) {
			if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
				tick_broadcast_start_periodic(bc);
			else
				tick_broadcast_setup_oneshot(bc, false);
		}
	}
out:
	raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
}
EXPORT_SYMBOL_GPL(tick_broadcast_control);

/*
 * Set the periodic handler depending on broadcast on/off
 */
void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast)
{
	if (!broadcast)
		dev->event_handler = tick_handle_periodic;
	else
		dev->event_handler = tick_handle_periodic_broadcast;
}

#ifdef CONFIG_HOTPLUG_CPU
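/*
 * Shut down the broadcast device if it is in periodic mode and no CPU
 * needs it anymore.
 */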
static void tick_shutdown_broadcast(void)
{
	struct clock_event_device *bc = tick_broadcast_device.evtdev;

	if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) {
		if (bc && cpumask_empty(tick_broadcast_mask))
			clockevents_shutdown(bc);
	}
}

/*
 * Remove a CPU from broadcasting
 */
void tick_broadcast_offline(unsigned int cpu)
{
	raw_spin_lock(&tick_broadcast_lock);
	cpumask_clear_cpu(cpu, tick_broadcast_mask);
	cpumask_clear_cpu(cpu, tick_broadcast_on);
	tick_broadcast_oneshot_offline(cpu);
	tick_shutdown_broadcast();
	raw_spin_unlock(&tick_broadcast_lock);
}

#endif

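/*
 * Shut down the broadcast device on system suspend.
 */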
void tick_suspend_broadcast(void)
{
	struct clock_event_device *bc;
	unsigned long flags;

	raw_spin_lock_irqsave(&tick_broadcast_lock, flags);

	bc = tick_broadcast_device.evtdev;
	if (bc)
		clockevents_shutdown(bc);

	raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
}

/*
 * This is called from tick_resume_local() on a resuming CPU. That's
 * called from the core resume function, tick_unfreeze() and the magic XEN
 * resume hackery.
 *
 * In none of these cases the broadcast device mode can change and the
 * bit of the resuming CPU in the broadcast mask is safe as well.
 */
bool tick_resume_check_broadcast(void)
{
	if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT)
		return false;
	else
		return cpumask_test_cpu(smp_processor_id(), tick_broadcast_mask);
}

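/*
 * Resume the broadcast device on system resume and restore its previous
 * operating mode.
 */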
void tick_resume_broadcast(void)
{
	struct clock_event_device *bc;
	unsigned long flags;

	raw_spin_lock_irqsave(&tick_broadcast_lock, flags);

	bc = tick_broadcast_device.evtdev;

	if (bc) {
		clockevents_tick_resume(bc);

		switch (tick_broadcast_device.mode) {
		case TICKDEV_MODE_PERIODIC:
			if (!cpumask_empty(tick_broadcast_mask))
				tick_broadcast_start_periodic(bc);
			break;
		case TICKDEV_MODE_ONESHOT:
			if (!cpumask_empty(tick_broadcast_mask))
				tick_resume_broadcast_oneshot(bc);
			break;
		}
	}
	raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
}

#ifdef CONFIG_TICK_ONESHOT

static cpumask_var_t tick_broadcast_oneshot_mask __cpumask_var_read_mostly;
static cpumask_var_t tick_broadcast_pending_mask __cpumask_var_read_mostly;
static cpumask_var_t tick_broadcast_force_mask __cpumask_var_read_mostly;

/*
 * Exposed for debugging: see timer_list.c
 */
struct cpumask *tick_get_broadcast_oneshot_mask(void)
{
	return tick_broadcast_oneshot_mask;
}

/*
 * Called before going idle with interrupts disabled. Checks whether a
 * broadcast event from the other core is about to happen. We detected
 * that in tick_broadcast_oneshot_control(). The callsite can use this
 * to avoid a deep idle transition as we are about to get the
 * broadcast IPI right away.
 */
noinstr int tick_check_broadcast_expired(void)
{
#ifdef _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H
	return arch_test_bit(smp_processor_id(), cpumask_bits(tick_broadcast_force_mask));
#else
	return cpumask_test_cpu(smp_processor_id(), tick_broadcast_force_mask);
#endif
}

/*
 * Set broadcast interrupt affinity
 */
static void tick_broadcast_set_affinity(struct clock_event_device *bc,
					const struct cpumask *cpumask)
{
	if (!(bc->features & CLOCK_EVT_FEAT_DYNIRQ))
		return;

	if (cpumask_equal(bc->cpumask, cpumask))
		return;

	bc->cpumask = cpumask;
	irq_set_affinity(bc->irq, bc->cpumask);
}

static void tick_broadcast_set_event(struct clock_event_device *bc, int cpu,
				     ktime_t expires)
{
	if (!clockevent_state_oneshot(bc))
		clockevents_switch_state(bc, CLOCK_EVT_STATE_ONESHOT);

	clockevents_program_event(bc, expires, 1);
	tick_broadcast_set_affinity(bc, cpumask_of(cpu));
}

static void tick_resume_broadcast_oneshot(struct clock_event_device *bc)
{
	clockevents_switch_state(bc, CLOCK_EVT_STATE_ONESHOT);
}

/*
 * Called from irq_enter() when idle was interrupted to reenable the
 * per cpu device.
 */
void tick_check_oneshot_broadcast_this_cpu(void)
{
	if (cpumask_test_cpu(smp_processor_id(), tick_broadcast_oneshot_mask)) {
		struct tick_device *td = this_cpu_ptr(&tick_cpu_device);

		/*
		 * We might be in the middle of switching over from
		 * periodic to oneshot. If the CPU has not yet
		 * switched over, leave the device alone.
		 */
		if (td->mode == TICKDEV_MODE_ONESHOT) {
			clockevents_switch_state(td->evtdev,
					      CLOCK_EVT_STATE_ONESHOT);
		}
	}
}

/*
 * Handle oneshot mode broadcasting
 */
static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
{
	struct tick_device *td;
	ktime_t now, next_event;
	int cpu, next_cpu = 0;
	bool bc_local;

	raw_spin_lock(&tick_broadcast_lock);
	dev->next_event = KTIME_MAX;
	next_event = KTIME_MAX;
	cpumask_clear(tmpmask);
	now = ktime_get();
	/* Find all expired events */
	for_each_cpu(cpu, tick_broadcast_oneshot_mask) {
		/*
		 * Required for !SMP because for_each_cpu() reports
		 * unconditionally CPU0 as set on UP kernels.
		 */
		if (!IS_ENABLED(CONFIG_SMP) &&
		    cpumask_empty(tick_broadcast_oneshot_mask))
			break;

		td = &per_cpu(tick_cpu_device, cpu);
		if (td->evtdev->next_event <= now) {
			cpumask_set_cpu(cpu, tmpmask);
			/*
			 * Mark the remote cpu in the pending mask, so
			 * it can avoid reprogramming the cpu local
			 * timer in tick_broadcast_oneshot_control().
			 */
			cpumask_set_cpu(cpu, tick_broadcast_pending_mask);
		} else if (td->evtdev->next_event < next_event) {
			next_event = td->evtdev->next_event;
			next_cpu = cpu;
		}
	}

	/*
	 * Remove the current cpu from the pending mask. The event is
	 * delivered immediately in tick_do_broadcast() !
	 */
	cpumask_clear_cpu(smp_processor_id(), tick_broadcast_pending_mask);

	/* Take care of enforced broadcast requests */
	cpumask_or(tmpmask, tmpmask, tick_broadcast_force_mask);
	cpumask_clear(tick_broadcast_force_mask);

	/*
	 * Sanity check. Catch the case where we try to broadcast to
	 * offline cpus.
	 */
	if (WARN_ON_ONCE(!cpumask_subset(tmpmask, cpu_online_mask)))
		cpumask_and(tmpmask, tmpmask, cpu_online_mask);

	/*
	 * Wakeup the cpus which have an expired event.
	 */
	bc_local = tick_do_broadcast(tmpmask);

	/*
	 * Two reasons for reprogram:
	 *
	 * - The global event did not expire any CPU local
	 * events. This happens in dyntick mode, as the maximum PIT
	 * delta is quite small.
	 *
	 * - There are pending events on sleeping CPUs which were not
	 * in the event mask
	 */
	if (next_event != KTIME_MAX)
		tick_broadcast_set_event(dev, next_cpu, next_event);

	raw_spin_unlock(&tick_broadcast_lock);

	if (bc_local) {
		td = this_cpu_ptr(&tick_cpu_device);
		td->evtdev->event_handler(td->evtdev);
	}
}

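/*
 * Return -EBUSY if the hrtimer based broadcast device is armed and bound to
 * @cpu. That CPU has to stay out of deep idle to service the broadcast.
 */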
static int broadcast_needs_cpu(struct clock_event_device *bc, int cpu)
{
	if (!(bc->features & CLOCK_EVT_FEAT_HRTIMER))
		return 0;
	if (bc->next_event == KTIME_MAX)
		return 0;
	return bc->bound_on == cpu ? -EBUSY : 0;
}

static void broadcast_shutdown_local(struct clock_event_device *bc,
				     struct clock_event_device *dev)
{
	/*
	 * For hrtimer based broadcasting we cannot shutdown the cpu
	 * local device if our own event is the first one to expire or
	 * if we own the broadcast timer.
	 */
	if (bc->features & CLOCK_EVT_FEAT_HRTIMER) {
		if (broadcast_needs_cpu(bc, smp_processor_id()))
			return;
		if (dev->next_event < bc->next_event)
			return;
	}
	clockevents_switch_state(dev, CLOCK_EVT_STATE_SHUTDOWN);
}

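/*
 * Handle the idle enter/exit transitions for a CPU whose local clock event
 * device stops in deep power states by delegating the wakeup to the
 * broadcast device.
 */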
static int ___tick_broadcast_oneshot_control(enum tick_broadcast_state state,
					     struct tick_device *td,
					     int cpu)
{
	struct clock_event_device *bc, *dev = td->evtdev;
	int ret = 0;
	ktime_t now;

	raw_spin_lock(&tick_broadcast_lock);
	bc = tick_broadcast_device.evtdev;

	if (state == TICK_BROADCAST_ENTER) {
		/*
		 * If the current CPU owns the hrtimer broadcast
		 * mechanism, it cannot go deep idle and we do not add
		 * the CPU to the broadcast mask. We don't have to go
		 * through the EXIT path as the local timer is not
		 * shutdown.
		 */
		ret = broadcast_needs_cpu(bc, cpu);
		if (ret)
			goto out;

		/*
		 * If the broadcast device is in periodic mode, we
		 * return.
		 */
		if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) {
			/* If it is a hrtimer based broadcast, return busy */
			if (bc->features & CLOCK_EVT_FEAT_HRTIMER)
				ret = -EBUSY;
			goto out;
		}

		if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) {
			WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask));

			/* Conditionally shut down the local timer. */
			broadcast_shutdown_local(bc, dev);

			/*
			 * We only reprogram the broadcast timer if we
			 * did not mark ourself in the force mask and
			 * if the cpu local event is earlier than the
			 * broadcast event. If the current CPU is in
			 * the force mask, then we are going to be
			 * woken by the IPI right away; we return
			 * busy, so the CPU does not try to go deep
			 * idle.
			 */
			if (cpumask_test_cpu(cpu, tick_broadcast_force_mask)) {
				ret = -EBUSY;
			} else if (dev->next_event < bc->next_event) {
				tick_broadcast_set_event(bc, cpu, dev->next_event);
				/*
				 * In case of hrtimer broadcasts the
				 * programming might have moved the
				 * timer to this cpu. If yes, remove
				 * us from the broadcast mask and
				 * return busy.
				 */
				ret = broadcast_needs_cpu(bc, cpu);
				if (ret) {
					cpumask_clear_cpu(cpu,
						tick_broadcast_oneshot_mask);
				}
			}
		}
	} else {
		if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) {
			clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
			/*
			 * The cpu which was handling the broadcast
			 * timer marked this cpu in the broadcast
			 * pending mask and fired the broadcast
			 * IPI. So we are going to handle the expired
			 * event anyway via the broadcast IPI
			 * handler. No need to reprogram the timer
			 * with an already expired event.
			 */
			if (cpumask_test_and_clear_cpu(cpu,
				       tick_broadcast_pending_mask))
				goto out;

			/*
			 * Bail out if there is no next event.
			 */
			if (dev->next_event == KTIME_MAX)
				goto out;
			/*
			 * If the pending bit is not set, then we are
			 * either the CPU handling the broadcast
			 * interrupt or we got woken by something else.
			 *
			 * We are no longer in the broadcast mask, so
			 * if the cpu local expiry time is already
			 * reached, we would reprogram the cpu local
			 * timer with an already expired event.
			 *
			 * This can lead to a ping-pong when we return
			 * to idle and therefore rearm the broadcast
			 * timer before the cpu local timer was able
			 * to fire. This happens because the forced
			 * reprogramming makes sure that the event
			 * will happen in the future and depending on
			 * the min_delta setting this might be far
			 * enough out that the ping-pong starts.
			 *
			 * If the cpu local next_event has expired
			 * then we know that the broadcast timer
			 * next_event has expired as well and
			 * broadcast is about to be handled. So we
			 * avoid reprogramming and enforce that the
			 * broadcast handler, which did not run yet,
			 * will invoke the cpu local handler.
			 *
			 * We cannot call the handler directly from
			 * here, because we might be in a NOHZ phase
			 * and we did not go through the irq_enter()
			 * nohz fixups.
			 */
			now = ktime_get();
			if (dev->next_event <= now) {
				cpumask_set_cpu(cpu, tick_broadcast_force_mask);
				goto out;
			}
			/*
			 * We got woken by something else. Reprogram
			 * the cpu local timer device.
			 */
			tick_program_event(dev->next_event, 1);
		}
	}
out:
	raw_spin_unlock(&tick_broadcast_lock);
	return ret;
}

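/*
 * Use the per-CPU oneshot wakeup device, if one is installed, instead of
 * the global broadcast device to handle the idle enter/exit transition.
 */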
static int tick_oneshot_wakeup_control(enum tick_broadcast_state state,
				       struct tick_device *td,
				       int cpu)
{
	struct clock_event_device *dev, *wd;

	dev = td->evtdev;
	if (td->mode != TICKDEV_MODE_ONESHOT)
		return -EINVAL;

	wd = tick_get_oneshot_wakeup_device(cpu);
	if (!wd)
		return -ENODEV;

	switch (state) {
	case TICK_BROADCAST_ENTER:
		clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT_STOPPED);
		clockevents_switch_state(wd, CLOCK_EVT_STATE_ONESHOT);
		clockevents_program_event(wd, dev->next_event, 1);
		break;
	case TICK_BROADCAST_EXIT:
		/* We may have transitioned to oneshot mode while idle */
		if (clockevent_get_state(wd) != CLOCK_EVT_STATE_ONESHOT)
			return -ENODEV;
	}

	return 0;
}

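/*
 * Called from the idle code with interrupts disabled. Prefer the per-CPU
 * wakeup device and fall back to the broadcast device.
 */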
int __tick_broadcast_oneshot_control(enum tick_broadcast_state state)
{
	struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
	int cpu = smp_processor_id();

	if (!tick_oneshot_wakeup_control(state, td, cpu))
		return 0;

	if (tick_broadcast_device.evtdev)
		return ___tick_broadcast_oneshot_control(state, td, cpu);

	/*
	 * If there is no broadcast or wakeup device, tell the caller not
	 * to go into deep idle.
	 */
	return -EBUSY;
}

/*
 * Reset the one shot broadcast for a cpu
 *
 * Called with tick_broadcast_lock held
 */
static void tick_broadcast_clear_oneshot(int cpu)
{
	cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask);
	cpumask_clear_cpu(cpu, tick_broadcast_pending_mask);
}

static void tick_broadcast_init_next_event(struct cpumask *mask,
					   ktime_t expires)
{
	struct tick_device *td;
	int cpu;

	for_each_cpu(cpu, mask) {
		td = &per_cpu(tick_cpu_device, cpu);
		if (td->evtdev)
			td->evtdev->next_event = expires;
	}
}

static inline ktime_t tick_get_next_period(void)
{
	ktime_t next;

	/*
	 * Protect against concurrent updates (store/load tearing on
	 * 32bit). It does not matter if the time is already in the
	 * past. The broadcast device which is about to be programmed will
	 * fire in any case.
	 */
	raw_spin_lock(&jiffies_lock);
	next = tick_next_period;
	raw_spin_unlock(&jiffies_lock);
	return next;
}

/**
 * tick_broadcast_setup_oneshot - setup the broadcast device
 * @bc:		the broadcast clock event device
 * @from_periodic:	true when called during the periodic to oneshot switch
 */
static void tick_broadcast_setup_oneshot(struct clock_event_device *bc,
					 bool from_periodic)
{
	int cpu = smp_processor_id();
	ktime_t nexttick = 0;

	if (!bc)
		return;

	/*
	 * When the broadcast device was switched to oneshot by the first
	 * CPU handling the NOHZ change, the other CPUs will reach this
	 * code via hrtimer_run_queues() -> tick_check_oneshot_change()
	 * too. Set up the broadcast device only once!
	 */
	if (bc->event_handler == tick_handle_oneshot_broadcast) {
		/*
		 * The CPU which switched from periodic to oneshot mode
		 * set the broadcast oneshot bit for all other CPUs which
		 * are in the general (periodic) broadcast mask to ensure
		 * that CPUs which wait for the periodic broadcast are
		 * woken up.
		 *
		 * Clear the bit for the local CPU as the set bit would
		 * prevent the first tick_broadcast_enter() after this CPU
		 * switched to oneshot state to program the broadcast
		 * device.
		 *
		 * This code can also be reached via tick_broadcast_control(),
		 * but this cannot avoid the tick_broadcast_clear_oneshot()
		 * as that would break the periodic to oneshot transition of
		 * secondary CPUs. But that's harmless as the below only
		 * clears already cleared bits.
		 */
		tick_broadcast_clear_oneshot(cpu);
		return;
	}

	bc->event_handler = tick_handle_oneshot_broadcast;
	bc->next_event = KTIME_MAX;

	/*
	 * When the tick mode is switched from periodic to oneshot it must
	 * be ensured that CPUs which are waiting for periodic broadcast
	 * get their wake-up at the next tick.  This is achieved by ORing
	 * tick_broadcast_mask into tick_broadcast_oneshot_mask.
	 *
	 * For other callers, e.g. broadcast device replacement,
	 * tick_broadcast_oneshot_mask must not be touched as this would
	 * set bits for CPUs which are already NOHZ, but not idle. Their
	 * next tick_broadcast_enter() would observe the bit set and fail
	 * to update the expiry time and the broadcast event device.
	 */
	if (from_periodic) {
		cpumask_copy(tmpmask, tick_broadcast_mask);
		/* Remove the local CPU as it is obviously not idle */
		cpumask_clear_cpu(cpu, tmpmask);
		cpumask_or(tick_broadcast_oneshot_mask, tick_broadcast_oneshot_mask, tmpmask);

		/*
		 * Ensure that the oneshot broadcast handler will wake the
		 * CPUs which are still waiting for periodic broadcast.
		 */
		nexttick = tick_get_next_period();
		tick_broadcast_init_next_event(tmpmask, nexttick);

		/*
		 * If the underlying broadcast clock event device is
		 * already in oneshot state, then there is nothing to do.
		 * The device was already armed for the next tick
		 * in tick_handle_broadcast_periodic()
		 */
		if (clockevent_state_oneshot(bc))
			return;
	}

	/*
	 * When switching from periodic to oneshot mode arm the broadcast
	 * device for the next tick.
	 *
	 * If the broadcast device has been replaced in oneshot mode and
	 * the oneshot broadcast mask is not empty, then arm it to expire
	 * immediately in order to reevaluate the next expiring timer.
	 * @nexttick is 0 and therefore in the past which will cause the
	 * clockevent code to force an event.
	 *
	 * For both cases the programming can be avoided when the oneshot
	 * broadcast mask is empty.
	 *
	 * tick_broadcast_set_event() implicitly switches the broadcast
	 * device to oneshot state.
	 */
	if (!cpumask_empty(tick_broadcast_oneshot_mask))
		tick_broadcast_set_event(bc, cpu, nexttick);
}

/*
 * Select oneshot operating mode for the broadcast device
 */
void tick_broadcast_switch_to_oneshot(void)
{
	struct clock_event_device *bc;
	enum tick_device_mode oldmode;
	unsigned long flags;

	raw_spin_lock_irqsave(&tick_broadcast_lock, flags);

	oldmode = tick_broadcast_device.mode;
	tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT;
	bc = tick_broadcast_device.evtdev;
	if (bc)
		tick_broadcast_setup_oneshot(bc, oldmode == TICKDEV_MODE_PERIODIC);

	raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
}

#ifdef CONFIG_HOTPLUG_CPU
void hotplug_cpu__broadcast_tick_pull(int deadcpu)
{
	struct clock_event_device *bc;
	unsigned long flags;

	raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
	bc = tick_broadcast_device.evtdev;

	if (bc && broadcast_needs_cpu(bc, deadcpu)) {
		/* This moves the broadcast assignment to this CPU: */
		clockevents_program_event(bc, bc->next_event, 1);
	}
	raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
}

/*
 * Remove a dying CPU from broadcasting
 */
static void tick_broadcast_oneshot_offline(unsigned int cpu)
{
	if (tick_get_oneshot_wakeup_device(cpu))
		tick_set_oneshot_wakeup_device(NULL, cpu);

	/*
	 * Clear the broadcast masks for the dead cpu, but do not stop
	 * the broadcast device!
	 */
	cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask);
	cpumask_clear_cpu(cpu, tick_broadcast_pending_mask);
	cpumask_clear_cpu(cpu, tick_broadcast_force_mask);
}
#endif

/*
 * Check, whether the broadcast device is in one shot mode
 */
int tick_broadcast_oneshot_active(void)
{
	return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT;
}

/*
 * Check whether the broadcast device supports oneshot.
 */
bool tick_broadcast_oneshot_available(void)
{
	struct clock_event_device *bc = tick_broadcast_device.evtdev;

	return bc ? bc->features & CLOCK_EVT_FEAT_ONESHOT : false;
}

#else
int __tick_broadcast_oneshot_control(enum tick_broadcast_state state)
{
	struct clock_event_device *bc = tick_broadcast_device.evtdev;

	if (!bc || (bc->features & CLOCK_EVT_FEAT_HRTIMER))
		return -EBUSY;

	return 0;
}
#endif

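/*
 * Allocate the cpumasks used by the broadcast code. Called early in boot.
 */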
void __init tick_broadcast_init(void)
{
	zalloc_cpumask_var(&tick_broadcast_mask, GFP_NOWAIT);
	zalloc_cpumask_var(&tick_broadcast_on, GFP_NOWAIT);
	zalloc_cpumask_var(&tmpmask, GFP_NOWAIT);
#ifdef CONFIG_TICK_ONESHOT
	zalloc_cpumask_var(&tick_broadcast_oneshot_mask, GFP_NOWAIT);
	zalloc_cpumask_var(&tick_broadcast_pending_mask, GFP_NOWAIT);
	zalloc_cpumask_var(&tick_broadcast_force_mask, GFP_NOWAIT);
#endif
}