1/* CPU control.
2 * (C) 2001, 2002, 2003, 2004 Rusty Russell
3 *
 * This code is licensed under the GPL.
5 */
6#include <linux/sched/mm.h>
7#include <linux/proc_fs.h>
8#include <linux/smp.h>
9#include <linux/init.h>
10#include <linux/notifier.h>
11#include <linux/sched/signal.h>
12#include <linux/sched/hotplug.h>
13#include <linux/sched/isolation.h>
14#include <linux/sched/task.h>
15#include <linux/sched/smt.h>
16#include <linux/unistd.h>
17#include <linux/cpu.h>
18#include <linux/oom.h>
19#include <linux/rcupdate.h>
20#include <linux/export.h>
21#include <linux/bug.h>
22#include <linux/kthread.h>
23#include <linux/stop_machine.h>
24#include <linux/mutex.h>
25#include <linux/gfp.h>
26#include <linux/suspend.h>
27#include <linux/lockdep.h>
28#include <linux/tick.h>
29#include <linux/irq.h>
30#include <linux/nmi.h>
31#include <linux/smpboot.h>
32#include <linux/relay.h>
33#include <linux/slab.h>
34#include <linux/scs.h>
35#include <linux/percpu-rwsem.h>
36#include <linux/cpuset.h>
37#include <linux/random.h>
38#include <linux/cc_platform.h>
39
40#include <trace/events/power.h>
41#define CREATE_TRACE_POINTS
42#include <trace/events/cpuhp.h>
43
44#include "smpboot.h"
45
46/**
47 * struct cpuhp_cpu_state - Per cpu hotplug state storage
48 * @state:	The current cpu state
49 * @target:	The target state
50 * @fail:	Current CPU hotplug callback state
51 * @thread:	Pointer to the hotplug thread
52 * @should_run:	Thread should execute
53 * @rollback:	Perform a rollback
54 * @single:	Single callback invocation
55 * @bringup:	Single callback bringup or teardown selector
56 * @cpu:	CPU number
57 * @node:	Remote CPU node; for multi-instance, do a
58 *		single entry callback for install/remove
59 * @last:	For multi-instance rollback, remember how far we got
60 * @cb_state:	The state for a single callback (install/uninstall)
61 * @result:	Result of the operation
62 * @done_up:	Signal completion to the issuer of the task for cpu-up
63 * @done_down:	Signal completion to the issuer of the task for cpu-down
64 */
65struct cpuhp_cpu_state {
66	enum cpuhp_state	state;
67	enum cpuhp_state	target;
68	enum cpuhp_state	fail;
69#ifdef CONFIG_SMP
70	struct task_struct	*thread;
71	bool			should_run;
72	bool			rollback;
73	bool			single;
74	bool			bringup;
75	struct hlist_node	*node;
76	struct hlist_node	*last;
77	enum cpuhp_state	cb_state;
78	int			result;
79	struct completion	done_up;
80	struct completion	done_down;
81#endif
82};
83
84static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state) = {
85	.fail = CPUHP_INVALID,
86};
87
88#ifdef CONFIG_SMP
89cpumask_t cpus_booted_once_mask;
90#endif
91
92#if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP)
93static struct lockdep_map cpuhp_state_up_map =
94	STATIC_LOCKDEP_MAP_INIT("cpuhp_state-up", &cpuhp_state_up_map);
95static struct lockdep_map cpuhp_state_down_map =
96	STATIC_LOCKDEP_MAP_INIT("cpuhp_state-down", &cpuhp_state_down_map);
97
98
99static inline void cpuhp_lock_acquire(bool bringup)
100{
101	lock_map_acquire(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map);
102}
103
104static inline void cpuhp_lock_release(bool bringup)
105{
106	lock_map_release(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map);
107}
108#else
109
110static inline void cpuhp_lock_acquire(bool bringup) { }
111static inline void cpuhp_lock_release(bool bringup) { }
112
113#endif
114
115/**
116 * struct cpuhp_step - Hotplug state machine step
117 * @name:	Name of the step
118 * @startup:	Startup function of the step
119 * @teardown:	Teardown function of the step
120 * @cant_stop:	Bringup/teardown can't be stopped at this step
121 * @multi_instance:	State has multiple instances which get added afterwards
122 */
123struct cpuhp_step {
124	const char		*name;
125	union {
126		int		(*single)(unsigned int cpu);
127		int		(*multi)(unsigned int cpu,
128					 struct hlist_node *node);
129	} startup;
130	union {
131		int		(*single)(unsigned int cpu);
132		int		(*multi)(unsigned int cpu,
133					 struct hlist_node *node);
134	} teardown;
135	/* private: */
136	struct hlist_head	list;
137	/* public: */
138	bool			cant_stop;
139	bool			multi_instance;
140};
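
/*
 * Illustrative sketch, not a definition: a typical PREPARE step in the
 * cpuhp_hp_states[] table below only fills in the single-CPU callbacks,
 * e.g.:
 *
 *	[CPUHP_WORKQUEUE_PREP] = {
 *		.name			= "workqueue:prepare",
 *		.startup.single		= workqueue_prepare_cpu,
 *		.teardown.single	= NULL,
 *	},
 *
 * Dynamically registered states have the same fields filled in via
 * cpuhp_store_callbacks().
 */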
141
142static DEFINE_MUTEX(cpuhp_state_mutex);
143static struct cpuhp_step cpuhp_hp_states[];
144
145static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state)
146{
147	return cpuhp_hp_states + state;
148}
149
150static bool cpuhp_step_empty(bool bringup, struct cpuhp_step *step)
151{
152	return bringup ? !step->startup.single : !step->teardown.single;
153}
154
155/**
156 * cpuhp_invoke_callback - Invoke the callbacks for a given state
157 * @cpu:	The cpu for which the callback should be invoked
158 * @state:	The state to do callbacks for
159 * @bringup:	True if the bringup callback should be invoked
160 * @node:	For multi-instance, do a single entry callback for install/remove
161 * @lastp:	For multi-instance rollback, remember how far we got
162 *
163 * Called from cpu hotplug and from the state register machinery.
164 *
165 * Return: %0 on success or a negative errno code
166 */
167static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
168				 bool bringup, struct hlist_node *node,
169				 struct hlist_node **lastp)
170{
171	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
172	struct cpuhp_step *step = cpuhp_get_step(state);
173	int (*cbm)(unsigned int cpu, struct hlist_node *node);
174	int (*cb)(unsigned int cpu);
175	int ret, cnt;
176
177	if (st->fail == state) {
178		st->fail = CPUHP_INVALID;
179		return -EAGAIN;
180	}
181
182	if (cpuhp_step_empty(bringup, step)) {
183		WARN_ON_ONCE(1);
184		return 0;
185	}
186
187	if (!step->multi_instance) {
188		WARN_ON_ONCE(lastp && *lastp);
189		cb = bringup ? step->startup.single : step->teardown.single;
190
191		trace_cpuhp_enter(cpu, st->target, state, cb);
192		ret = cb(cpu);
193		trace_cpuhp_exit(cpu, st->state, state, ret);
194		return ret;
195	}
196	cbm = bringup ? step->startup.multi : step->teardown.multi;
197
198	/* Single invocation for instance add/remove */
199	if (node) {
200		WARN_ON_ONCE(lastp && *lastp);
201		trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
202		ret = cbm(cpu, node);
203		trace_cpuhp_exit(cpu, st->state, state, ret);
204		return ret;
205	}
206
207	/* State transition. Invoke on all instances */
208	cnt = 0;
209	hlist_for_each(node, &step->list) {
210		if (lastp && node == *lastp)
211			break;
212
213		trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
214		ret = cbm(cpu, node);
215		trace_cpuhp_exit(cpu, st->state, state, ret);
216		if (ret) {
217			if (!lastp)
218				goto err;
219
220			*lastp = node;
221			return ret;
222		}
223		cnt++;
224	}
225	if (lastp)
226		*lastp = NULL;
227	return 0;
228err:
229	/* Rollback the instances if one failed */
230	cbm = !bringup ? step->startup.multi : step->teardown.multi;
231	if (!cbm)
232		return ret;
233
234	hlist_for_each(node, &step->list) {
235		if (!cnt--)
236			break;
237
238		trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
239		ret = cbm(cpu, node);
240		trace_cpuhp_exit(cpu, st->state, state, ret);
241		/*
		 * Rollback must not fail.
243		 */
244		WARN_ON_ONCE(ret);
245	}
246	return ret;
247}
248
249#ifdef CONFIG_SMP
250static bool cpuhp_is_ap_state(enum cpuhp_state state)
251{
252	/*
253	 * The extra check for CPUHP_TEARDOWN_CPU is only for documentation
254	 * purposes as that state is handled explicitly in cpu_down.
255	 */
256	return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU;
257}
258
259static inline void wait_for_ap_thread(struct cpuhp_cpu_state *st, bool bringup)
260{
261	struct completion *done = bringup ? &st->done_up : &st->done_down;
262	wait_for_completion(done);
263}
264
265static inline void complete_ap_thread(struct cpuhp_cpu_state *st, bool bringup)
266{
267	struct completion *done = bringup ? &st->done_up : &st->done_down;
268	complete(done);
269}
270
271/*
 * The former STARTING/DYING states run with IRQs disabled and must not fail.
273 */
274static bool cpuhp_is_atomic_state(enum cpuhp_state state)
275{
276	return CPUHP_AP_IDLE_DEAD <= state && state < CPUHP_AP_ONLINE;
277}
278
279/* Serializes the updates to cpu_online_mask, cpu_present_mask */
280static DEFINE_MUTEX(cpu_add_remove_lock);
281bool cpuhp_tasks_frozen;
282EXPORT_SYMBOL_GPL(cpuhp_tasks_frozen);
283
284/*
285 * The following two APIs (cpu_maps_update_begin/done) must be used when
286 * attempting to serialize the updates to cpu_online_mask & cpu_present_mask.
287 */
288void cpu_maps_update_begin(void)
289{
290	mutex_lock(&cpu_add_remove_lock);
291}
292
293void cpu_maps_update_done(void)
294{
295	mutex_unlock(&cpu_add_remove_lock);
296}
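
/*
 * Usage sketch: callers bracket map updates with the begin/done pair, as
 * cpu_hotplug_disable() below does:
 *
 *	cpu_maps_update_begin();
 *	cpu_hotplug_disabled++;
 *	cpu_maps_update_done();
 */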
297
298/*
299 * If set, cpu_up and cpu_down will return -EBUSY and do nothing.
300 * Should always be manipulated under cpu_add_remove_lock
301 */
302static int cpu_hotplug_disabled;
303
304#ifdef CONFIG_HOTPLUG_CPU
305
306DEFINE_STATIC_PERCPU_RWSEM(cpu_hotplug_lock);
307
308void cpus_read_lock(void)
309{
310	percpu_down_read(&cpu_hotplug_lock);
311}
312EXPORT_SYMBOL_GPL(cpus_read_lock);
313
314int cpus_read_trylock(void)
315{
316	return percpu_down_read_trylock(&cpu_hotplug_lock);
317}
318EXPORT_SYMBOL_GPL(cpus_read_trylock);
319
320void cpus_read_unlock(void)
321{
322	percpu_up_read(&cpu_hotplug_lock);
323}
324EXPORT_SYMBOL_GPL(cpus_read_unlock);
325
326void cpus_write_lock(void)
327{
328	percpu_down_write(&cpu_hotplug_lock);
329}
330
331void cpus_write_unlock(void)
332{
333	percpu_up_write(&cpu_hotplug_lock);
334}
335
336void lockdep_assert_cpus_held(void)
337{
338	/*
339	 * We can't have hotplug operations before userspace starts running,
340	 * and some init codepaths will knowingly not take the hotplug lock.
341	 * This is all valid, so mute lockdep until it makes sense to report
342	 * unheld locks.
343	 */
344	if (system_state < SYSTEM_RUNNING)
345		return;
346
347	percpu_rwsem_assert_held(&cpu_hotplug_lock);
348}
349
350#ifdef CONFIG_LOCKDEP
351int lockdep_is_cpus_held(void)
352{
353	return percpu_rwsem_is_held(&cpu_hotplug_lock);
354}
355#endif
356
357static void lockdep_acquire_cpus_lock(void)
358{
359	rwsem_acquire(&cpu_hotplug_lock.dep_map, 0, 0, _THIS_IP_);
360}
361
362static void lockdep_release_cpus_lock(void)
363{
364	rwsem_release(&cpu_hotplug_lock.dep_map, _THIS_IP_);
365}
366
367/*
368 * Wait for currently running CPU hotplug operations to complete (if any) and
369 * disable future CPU hotplug (from sysfs). The 'cpu_add_remove_lock' protects
370 * the 'cpu_hotplug_disabled' flag. The same lock is also acquired by the
371 * hotplug path before performing hotplug operations. So acquiring that lock
372 * guarantees mutual exclusion from any currently running hotplug operations.
373 */
374void cpu_hotplug_disable(void)
375{
376	cpu_maps_update_begin();
377	cpu_hotplug_disabled++;
378	cpu_maps_update_done();
379}
380EXPORT_SYMBOL_GPL(cpu_hotplug_disable);
381
382static void __cpu_hotplug_enable(void)
383{
384	if (WARN_ONCE(!cpu_hotplug_disabled, "Unbalanced cpu hotplug enable\n"))
385		return;
386	cpu_hotplug_disabled--;
387}
388
389void cpu_hotplug_enable(void)
390{
391	cpu_maps_update_begin();
392	__cpu_hotplug_enable();
393	cpu_maps_update_done();
394}
395EXPORT_SYMBOL_GPL(cpu_hotplug_enable);
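
/*
 * Usage sketch (hypothetical caller): code that must keep the set of online
 * CPUs stable wraps the critical region in a disable/enable pair, mirroring
 * cpu_hotplug_pm_callback() further down:
 *
 *	cpu_hotplug_disable();
 *	... work that must not race with CPU hotplug ...
 *	cpu_hotplug_enable();
 */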
396
397#else
398
399static void lockdep_acquire_cpus_lock(void)
400{
401}
402
403static void lockdep_release_cpus_lock(void)
404{
405}
406
407#endif	/* CONFIG_HOTPLUG_CPU */
408
409/*
410 * Architectures that need SMT-specific errata handling during SMT hotplug
411 * should override this.
412 */
413void __weak arch_smt_update(void) { }
414
415#ifdef CONFIG_HOTPLUG_SMT
416enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED;
417
418void __init cpu_smt_disable(bool force)
419{
420	if (!cpu_smt_possible())
421		return;
422
423	if (force) {
424		pr_info("SMT: Force disabled\n");
425		cpu_smt_control = CPU_SMT_FORCE_DISABLED;
426	} else {
427		pr_info("SMT: disabled\n");
428		cpu_smt_control = CPU_SMT_DISABLED;
429	}
430}
431
432/*
 * The decision whether SMT is supported can only be made after full
 * CPU identification. Called from architecture code.
435 */
436void __init cpu_smt_check_topology(void)
437{
438	if (!topology_smt_supported())
439		cpu_smt_control = CPU_SMT_NOT_SUPPORTED;
440}
441
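/*
 * Command line handling: "nosmt" disables SMT, "nosmt=force" disables it
 * irreversibly.
 */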
442static int __init smt_cmdline_disable(char *str)
443{
444	cpu_smt_disable(str && !strcmp(str, "force"));
445	return 0;
446}
447early_param("nosmt", smt_cmdline_disable);
448
449static inline bool cpu_smt_allowed(unsigned int cpu)
450{
451	if (cpu_smt_control == CPU_SMT_ENABLED)
452		return true;
453
454	if (topology_is_primary_thread(cpu))
455		return true;
456
457	/*
458	 * On x86 it's required to boot all logical CPUs at least once so
459	 * that the init code can get a chance to set CR4.MCE on each
	 * CPU. Otherwise, a broadcast MCE observing CR4.MCE=0b on any
	 * core will shut down the machine.
462	 */
463	return !cpumask_test_cpu(cpu, &cpus_booted_once_mask);
464}
465
/* Returns true if SMT is not supported or forcefully (irreversibly) disabled */
467bool cpu_smt_possible(void)
468{
469	return cpu_smt_control != CPU_SMT_FORCE_DISABLED &&
470		cpu_smt_control != CPU_SMT_NOT_SUPPORTED;
471}
472EXPORT_SYMBOL_GPL(cpu_smt_possible);
473#else
474static inline bool cpu_smt_allowed(unsigned int cpu) { return true; }
475#endif
476
477static inline enum cpuhp_state
478cpuhp_set_state(int cpu, struct cpuhp_cpu_state *st, enum cpuhp_state target)
479{
480	enum cpuhp_state prev_state = st->state;
481	bool bringup = st->state < target;
482
483	st->rollback = false;
484	st->last = NULL;
485
486	st->target = target;
487	st->single = false;
488	st->bringup = bringup;
489	if (cpu_dying(cpu) != !bringup)
490		set_cpu_dying(cpu, !bringup);
491
492	return prev_state;
493}
494
495static inline void
496cpuhp_reset_state(int cpu, struct cpuhp_cpu_state *st,
497		  enum cpuhp_state prev_state)
498{
499	bool bringup = !st->bringup;
500
501	st->target = prev_state;
502
503	/*
	 * Already rolling back. No need to invert the bringup value or to change
505	 * the current state.
506	 */
507	if (st->rollback)
508		return;
509
510	st->rollback = true;
511
512	/*
513	 * If we have st->last we need to undo partial multi_instance of this
514	 * state first. Otherwise start undo at the previous state.
515	 */
516	if (!st->last) {
517		if (st->bringup)
518			st->state--;
519		else
520			st->state++;
521	}
522
523	st->bringup = bringup;
524	if (cpu_dying(cpu) != !bringup)
525		set_cpu_dying(cpu, !bringup);
526}
527
528/* Regular hotplug invocation of the AP hotplug thread */
529static void __cpuhp_kick_ap(struct cpuhp_cpu_state *st)
530{
531	if (!st->single && st->state == st->target)
532		return;
533
534	st->result = 0;
535	/*
536	 * Make sure the above stores are visible before should_run becomes
	 * true. Paired with the smp_mb() in cpuhp_thread_fun().
538	 */
539	smp_mb();
540	st->should_run = true;
541	wake_up_process(st->thread);
542	wait_for_ap_thread(st, st->bringup);
543}
544
545static int cpuhp_kick_ap(int cpu, struct cpuhp_cpu_state *st,
546			 enum cpuhp_state target)
547{
548	enum cpuhp_state prev_state;
549	int ret;
550
551	prev_state = cpuhp_set_state(cpu, st, target);
552	__cpuhp_kick_ap(st);
553	if ((ret = st->result)) {
554		cpuhp_reset_state(cpu, st, prev_state);
555		__cpuhp_kick_ap(st);
556	}
557
558	return ret;
559}
560
561static int bringup_wait_for_ap(unsigned int cpu)
562{
563	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
564
565	/* Wait for the CPU to reach CPUHP_AP_ONLINE_IDLE */
566	wait_for_ap_thread(st, true);
567	if (WARN_ON_ONCE((!cpu_online(cpu))))
568		return -ECANCELED;
569
570	/* Unpark the hotplug thread of the target cpu */
571	kthread_unpark(st->thread);
572
573	/*
	 * SMT soft disabling on X86 requires bringing the CPU out of the
575	 * BIOS 'wait for SIPI' state in order to set the CR4.MCE bit.  The
576	 * CPU marked itself as booted_once in notify_cpu_starting() so the
577	 * cpu_smt_allowed() check will now return false if this is not the
578	 * primary sibling.
579	 */
580	if (!cpu_smt_allowed(cpu))
581		return -ECANCELED;
582
583	if (st->target <= CPUHP_AP_ONLINE_IDLE)
584		return 0;
585
586	return cpuhp_kick_ap(cpu, st, st->target);
587}
588
589static int bringup_cpu(unsigned int cpu)
590{
591	struct task_struct *idle = idle_thread_get(cpu);
592	int ret;
593
594	/*
595	 * Reset stale stack state from the last time this CPU was online.
596	 */
597	scs_task_reset(idle);
598	kasan_unpoison_task_stack(idle);
599
600	/*
601	 * Some architectures have to walk the irq descriptors to
	 * set up the vector space for the cpu which comes online.
603	 * Prevent irq alloc/free across the bringup.
604	 */
605	irq_lock_sparse();
606
607	/* Arch-specific enabling code. */
608	ret = __cpu_up(cpu, idle);
609	irq_unlock_sparse();
610	if (ret)
611		return ret;
612	return bringup_wait_for_ap(cpu);
613}
614
615static int finish_cpu(unsigned int cpu)
616{
617	struct task_struct *idle = idle_thread_get(cpu);
618	struct mm_struct *mm = idle->active_mm;
619
620	/*
621	 * idle_task_exit() will have switched to &init_mm, now
622	 * clean up any remaining active_mm state.
623	 */
624	if (mm != &init_mm)
625		idle->active_mm = &init_mm;
626	mmdrop(mm);
627	return 0;
628}
629
630/*
631 * Hotplug state machine related functions
632 */
633
634/*
635 * Get the next state to run. Empty ones will be skipped. Returns true if a
636 * state must be run.
637 *
638 * st->state will be modified ahead of time, to match state_to_run, as if it
 * has already run.
640 */
641static bool cpuhp_next_state(bool bringup,
642			     enum cpuhp_state *state_to_run,
643			     struct cpuhp_cpu_state *st,
644			     enum cpuhp_state target)
645{
646	do {
647		if (bringup) {
648			if (st->state >= target)
649				return false;
650
651			*state_to_run = ++st->state;
652		} else {
653			if (st->state <= target)
654				return false;
655
656			*state_to_run = st->state--;
657		}
658
659		if (!cpuhp_step_empty(bringup, cpuhp_get_step(*state_to_run)))
660			break;
661	} while (true);
662
663	return true;
664}
665
666static int cpuhp_invoke_callback_range(bool bringup,
667				       unsigned int cpu,
668				       struct cpuhp_cpu_state *st,
669				       enum cpuhp_state target)
670{
671	enum cpuhp_state state;
672	int err = 0;
673
674	while (cpuhp_next_state(bringup, &state, st, target)) {
675		err = cpuhp_invoke_callback(cpu, state, bringup, NULL, NULL);
676		if (err)
677			break;
678	}
679
680	return err;
681}
682
683static inline bool can_rollback_cpu(struct cpuhp_cpu_state *st)
684{
685	if (IS_ENABLED(CONFIG_HOTPLUG_CPU))
686		return true;
687	/*
	 * When CPU hotplug is disabled, taking the CPU down is not
689	 * possible because takedown_cpu() and the architecture and
690	 * subsystem specific mechanisms are not available. So the CPU
691	 * which would be completely unplugged again needs to stay around
692	 * in the current state.
693	 */
694	return st->state <= CPUHP_BRINGUP_CPU;
695}
696
697static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
698			      enum cpuhp_state target)
699{
700	enum cpuhp_state prev_state = st->state;
701	int ret = 0;
702
703	ret = cpuhp_invoke_callback_range(true, cpu, st, target);
704	if (ret) {
705		pr_debug("CPU UP failed (%d) CPU %u state %s (%d)\n",
706			 ret, cpu, cpuhp_get_step(st->state)->name,
707			 st->state);
708
709		cpuhp_reset_state(cpu, st, prev_state);
710		if (can_rollback_cpu(st))
711			WARN_ON(cpuhp_invoke_callback_range(false, cpu, st,
712							    prev_state));
713	}
714	return ret;
715}
716
717/*
718 * The cpu hotplug threads manage the bringup and teardown of the cpus
719 */
720static int cpuhp_should_run(unsigned int cpu)
721{
722	struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
723
724	return st->should_run;
725}
726
727/*
728 * Execute teardown/startup callbacks on the plugged cpu. Also used to invoke
729 * callbacks when a state gets [un]installed at runtime.
730 *
731 * Each invocation of this function by the smpboot thread does a single AP
732 * state callback.
733 *
734 * It has 3 modes of operation:
735 *  - single: runs st->cb_state
736 *  - up:     runs ++st->state, while st->state < st->target
737 *  - down:   runs st->state--, while st->state > st->target
738 *
739 * When complete or on error, should_run is cleared and the completion is fired.
740 */
741static void cpuhp_thread_fun(unsigned int cpu)
742{
743	struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
744	bool bringup = st->bringup;
745	enum cpuhp_state state;
746
747	if (WARN_ON_ONCE(!st->should_run))
748		return;
749
750	/*
751	 * ACQUIRE for the cpuhp_should_run() load of ->should_run. Ensures
752	 * that if we see ->should_run we also see the rest of the state.
753	 */
754	smp_mb();
755
756	/*
757	 * The BP holds the hotplug lock, but we're now running on the AP,
	 * ensure that anybody asserting that the lock is held will actually
	 * find it so.
760	 */
761	lockdep_acquire_cpus_lock();
762	cpuhp_lock_acquire(bringup);
763
764	if (st->single) {
765		state = st->cb_state;
766		st->should_run = false;
767	} else {
768		st->should_run = cpuhp_next_state(bringup, &state, st, st->target);
769		if (!st->should_run)
770			goto end;
771	}
772
773	WARN_ON_ONCE(!cpuhp_is_ap_state(state));
774
775	if (cpuhp_is_atomic_state(state)) {
776		local_irq_disable();
777		st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last);
778		local_irq_enable();
779
780		/*
781		 * STARTING/DYING must not fail!
782		 */
783		WARN_ON_ONCE(st->result);
784	} else {
785		st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last);
786	}
787
788	if (st->result) {
789		/*
		 * If we fail on a rollback, we're up a creek without a
		 * paddle: no way forward, no way back. We lose, thanks for
		 * playing.
793		 */
794		WARN_ON_ONCE(st->rollback);
795		st->should_run = false;
796	}
797
798end:
799	cpuhp_lock_release(bringup);
800	lockdep_release_cpus_lock();
801
802	if (!st->should_run)
803		complete_ap_thread(st, bringup);
804}
805
806/* Invoke a single callback on a remote cpu */
807static int
808cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup,
809			 struct hlist_node *node)
810{
811	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
812	int ret;
813
814	if (!cpu_online(cpu))
815		return 0;
816
817	cpuhp_lock_acquire(false);
818	cpuhp_lock_release(false);
819
820	cpuhp_lock_acquire(true);
821	cpuhp_lock_release(true);
822
823	/*
824	 * If we are up and running, use the hotplug thread. For early calls
825	 * we invoke the thread function directly.
826	 */
827	if (!st->thread)
828		return cpuhp_invoke_callback(cpu, state, bringup, node, NULL);
829
830	st->rollback = false;
831	st->last = NULL;
832
833	st->node = node;
834	st->bringup = bringup;
835	st->cb_state = state;
836	st->single = true;
837
838	__cpuhp_kick_ap(st);
839
840	/*
841	 * If we failed and did a partial, do a rollback.
842	 */
843	if ((ret = st->result) && st->last) {
844		st->rollback = true;
845		st->bringup = !bringup;
846
847		__cpuhp_kick_ap(st);
848	}
849
850	/*
	 * Clean up the leftovers so the next hotplug operation won't use stale
852	 * data.
853	 */
854	st->node = st->last = NULL;
855	return ret;
856}
857
858static int cpuhp_kick_ap_work(unsigned int cpu)
859{
860	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
861	enum cpuhp_state prev_state = st->state;
862	int ret;
863
864	cpuhp_lock_acquire(false);
865	cpuhp_lock_release(false);
866
867	cpuhp_lock_acquire(true);
868	cpuhp_lock_release(true);
869
870	trace_cpuhp_enter(cpu, st->target, prev_state, cpuhp_kick_ap_work);
871	ret = cpuhp_kick_ap(cpu, st, st->target);
872	trace_cpuhp_exit(cpu, st->state, prev_state, ret);
873
874	return ret;
875}
876
877static struct smp_hotplug_thread cpuhp_threads = {
878	.store			= &cpuhp_state.thread,
879	.thread_should_run	= cpuhp_should_run,
880	.thread_fn		= cpuhp_thread_fun,
881	.thread_comm		= "cpuhp/%u",
882	.selfparking		= true,
883};
884
885static __init void cpuhp_init_state(void)
886{
887	struct cpuhp_cpu_state *st;
888	int cpu;
889
890	for_each_possible_cpu(cpu) {
891		st = per_cpu_ptr(&cpuhp_state, cpu);
892		init_completion(&st->done_up);
893		init_completion(&st->done_down);
894	}
895}
896
897void __init cpuhp_threads_init(void)
898{
899	cpuhp_init_state();
900	BUG_ON(smpboot_register_percpu_thread(&cpuhp_threads));
901	kthread_unpark(this_cpu_read(cpuhp_state.thread));
902}
903
904/*
905 *
906 * Serialize hotplug trainwrecks outside of the cpu_hotplug_lock
907 * protected region.
908 *
909 * The operation is still serialized against concurrent CPU hotplug via
910 * cpu_add_remove_lock, i.e. CPU map protection.  But it is _not_
911 * serialized against other hotplug related activity like adding or
912 * removing of state callbacks and state instances, which invoke either the
913 * startup or the teardown callback of the affected state.
914 *
915 * This is required for subsystems which are unfixable vs. CPU hotplug and
916 * evade lock inversion problems by scheduling work which has to be
917 * completed _before_ cpu_up()/_cpu_down() returns.
918 *
919 * Don't even think about adding anything to this for any new code or even
 * drivers. Its only purpose is to keep existing lock order trainwrecks
921 * working.
922 *
923 * For cpu_down() there might be valid reasons to finish cleanups which are
924 * not required to be done under cpu_hotplug_lock, but that's a different
 * story and would not be invoked via this.
926 */
927static void cpu_up_down_serialize_trainwrecks(bool tasks_frozen)
928{
929	/*
930	 * cpusets delegate hotplug operations to a worker to "solve" the
931	 * lock order problems. Wait for the worker, but only if tasks are
932	 * _not_ frozen (suspend, hibernate) as that would wait forever.
933	 *
934	 * The wait is required because otherwise the hotplug operation
935	 * returns with inconsistent state, which could even be observed in
936	 * user space when a new CPU is brought up. The CPU plug uevent
937	 * would be delivered and user space reacting on it would fail to
938	 * move tasks to the newly plugged CPU up to the point where the
939	 * work has finished because up to that point the newly plugged CPU
940	 * is not assignable in cpusets/cgroups. On unplug that's not
941	 * necessarily a visible issue, but it is still inconsistent state,
942	 * which is the real problem which needs to be "fixed". This can't
943	 * prevent the transient state between scheduling the work and
944	 * returning from waiting for it.
945	 */
946	if (!tasks_frozen)
947		cpuset_wait_for_hotplug();
948}
949
950#ifdef CONFIG_HOTPLUG_CPU
951#ifndef arch_clear_mm_cpumask_cpu
952#define arch_clear_mm_cpumask_cpu(cpu, mm) cpumask_clear_cpu(cpu, mm_cpumask(mm))
953#endif
954
955/**
956 * clear_tasks_mm_cpumask - Safely clear tasks' mm_cpumask for a CPU
957 * @cpu: a CPU id
958 *
959 * This function walks all processes, finds a valid mm struct for each one and
960 * then clears a corresponding bit in mm's cpumask.  While this all sounds
961 * trivial, there are various non-obvious corner cases, which this function
962 * tries to solve in a safe manner.
963 *
964 * Also note that the function uses a somewhat relaxed locking scheme, so it may
965 * be called only for an already offlined CPU.
966 */
967void clear_tasks_mm_cpumask(int cpu)
968{
969	struct task_struct *p;
970
971	/*
972	 * This function is called after the cpu is taken down and marked
	 * offline, so it's not like new tasks will ever get this cpu set in
974	 * their mm mask. -- Peter Zijlstra
975	 * Thus, we may use rcu_read_lock() here, instead of grabbing
976	 * full-fledged tasklist_lock.
977	 */
978	WARN_ON(cpu_online(cpu));
979	rcu_read_lock();
980	for_each_process(p) {
981		struct task_struct *t;
982
983		/*
984		 * Main thread might exit, but other threads may still have
985		 * a valid mm. Find one.
986		 */
987		t = find_lock_task_mm(p);
988		if (!t)
989			continue;
990		arch_clear_mm_cpumask_cpu(cpu, t->mm);
991		task_unlock(t);
992	}
993	rcu_read_unlock();
994}
995
996/* Take this CPU down. */
997static int take_cpu_down(void *_param)
998{
999	struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
1000	enum cpuhp_state target = max((int)st->target, CPUHP_AP_OFFLINE);
1001	int err, cpu = smp_processor_id();
1002	int ret;
1003
1004	/* Ensure this CPU doesn't handle any more interrupts. */
1005	err = __cpu_disable();
1006	if (err < 0)
1007		return err;
1008
1009	/*
1010	 * Must be called from CPUHP_TEARDOWN_CPU, which means, as we are going
1011	 * down, that the current state is CPUHP_TEARDOWN_CPU - 1.
1012	 */
1013	WARN_ON(st->state != (CPUHP_TEARDOWN_CPU - 1));
1014
1015	/* Invoke the former CPU_DYING callbacks */
1016	ret = cpuhp_invoke_callback_range(false, cpu, st, target);
1017
1018	/*
1019	 * DYING must not fail!
1020	 */
1021	WARN_ON_ONCE(ret);
1022
1023	/* Give up timekeeping duties */
1024	tick_handover_do_timer();
1025	/* Remove CPU from timer broadcasting */
1026	tick_offline_cpu(cpu);
1027	/* Park the stopper thread */
1028	stop_machine_park(cpu);
1029	return 0;
1030}
1031
1032static int takedown_cpu(unsigned int cpu)
1033{
1034	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
1035	int err;
1036
1037	/* Park the smpboot threads */
1038	kthread_park(st->thread);
1039
1040	/*
1041	 * Prevent irq alloc/free while the dying cpu reorganizes the
1042	 * interrupt affinities.
1043	 */
1044	irq_lock_sparse();
1045
1046	/*
1047	 * So now all preempt/rcu users must observe !cpu_active().
1048	 */
1049	err = stop_machine_cpuslocked(take_cpu_down, NULL, cpumask_of(cpu));
1050	if (err) {
1051		/* CPU refused to die */
1052		irq_unlock_sparse();
1053		/* Unpark the hotplug thread so we can rollback there */
1054		kthread_unpark(st->thread);
1055		return err;
1056	}
1057	BUG_ON(cpu_online(cpu));
1058
1059	/*
1060	 * The teardown callback for CPUHP_AP_SCHED_STARTING will have removed
1061	 * all runnable tasks from the CPU, there's only the idle task left now
1062	 * that the migration thread is done doing the stop_machine thing.
1063	 *
1064	 * Wait for the stop thread to go away.
1065	 */
1066	wait_for_ap_thread(st, false);
1067	BUG_ON(st->state != CPUHP_AP_IDLE_DEAD);
1068
1069	/* Interrupts are moved away from the dying cpu, reenable alloc/free */
1070	irq_unlock_sparse();
1071
1072	hotplug_cpu__broadcast_tick_pull(cpu);
1073	/* This actually kills the CPU. */
1074	__cpu_die(cpu);
1075
1076	tick_cleanup_dead_cpu(cpu);
1077	rcutree_migrate_callbacks(cpu);
1078	return 0;
1079}
1080
1081static void cpuhp_complete_idle_dead(void *arg)
1082{
1083	struct cpuhp_cpu_state *st = arg;
1084
1085	complete_ap_thread(st, false);
1086}
1087
1088void cpuhp_report_idle_dead(void)
1089{
1090	struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
1091
1092	BUG_ON(st->state != CPUHP_AP_OFFLINE);
1093	rcu_report_dead(smp_processor_id());
1094	st->state = CPUHP_AP_IDLE_DEAD;
1095	/*
1096	 * We cannot call complete after rcu_report_dead() so we delegate it
1097	 * to an online cpu.
1098	 */
1099	smp_call_function_single(cpumask_first(cpu_online_mask),
1100				 cpuhp_complete_idle_dead, st, 0);
1101}
1102
1103static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
1104				enum cpuhp_state target)
1105{
1106	enum cpuhp_state prev_state = st->state;
1107	int ret = 0;
1108
1109	ret = cpuhp_invoke_callback_range(false, cpu, st, target);
1110	if (ret) {
1111		pr_debug("CPU DOWN failed (%d) CPU %u state %s (%d)\n",
1112			 ret, cpu, cpuhp_get_step(st->state)->name,
1113			 st->state);
1114
1115		cpuhp_reset_state(cpu, st, prev_state);
1116
1117		if (st->state < prev_state)
1118			WARN_ON(cpuhp_invoke_callback_range(true, cpu, st,
1119							    prev_state));
1120	}
1121
1122	return ret;
1123}
1124
1125/* Requires cpu_add_remove_lock to be held */
1126static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
1127			   enum cpuhp_state target)
1128{
1129	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
1130	int prev_state, ret = 0;
1131
1132	if (num_online_cpus() == 1)
1133		return -EBUSY;
1134
1135	if (!cpu_present(cpu))
1136		return -EINVAL;
1137
1138	cpus_write_lock();
1139
1140	cpuhp_tasks_frozen = tasks_frozen;
1141
1142	prev_state = cpuhp_set_state(cpu, st, target);
1143	/*
1144	 * If the current CPU state is in the range of the AP hotplug thread,
1145	 * then we need to kick the thread.
1146	 */
1147	if (st->state > CPUHP_TEARDOWN_CPU) {
1148		st->target = max((int)target, CPUHP_TEARDOWN_CPU);
1149		ret = cpuhp_kick_ap_work(cpu);
1150		/*
1151		 * The AP side has done the error rollback already. Just
		 * return the error code.
1153		 */
1154		if (ret)
1155			goto out;
1156
1157		/*
1158		 * We might have stopped still in the range of the AP hotplug
1159		 * thread. Nothing to do anymore.
1160		 */
1161		if (st->state > CPUHP_TEARDOWN_CPU)
1162			goto out;
1163
1164		st->target = target;
1165	}
1166	/*
1167	 * The AP brought itself down to CPUHP_TEARDOWN_CPU. So we need
1168	 * to do the further cleanups.
1169	 */
1170	ret = cpuhp_down_callbacks(cpu, st, target);
1171	if (ret && st->state < prev_state) {
1172		if (st->state == CPUHP_TEARDOWN_CPU) {
1173			cpuhp_reset_state(cpu, st, prev_state);
1174			__cpuhp_kick_ap(st);
1175		} else {
1176			WARN(1, "DEAD callback error for CPU%d", cpu);
1177		}
1178	}
1179
1180out:
1181	cpus_write_unlock();
1182	/*
1183	 * Do post unplug cleanup. This is still protected against
1184	 * concurrent CPU hotplug via cpu_add_remove_lock.
1185	 */
1186	lockup_detector_cleanup();
1187	arch_smt_update();
1188	cpu_up_down_serialize_trainwrecks(tasks_frozen);
1189	return ret;
1190}
1191
1192static int cpu_down_maps_locked(unsigned int cpu, enum cpuhp_state target)
1193{
1194	/*
1195	 * If the platform does not support hotplug, report it explicitly to
1196	 * differentiate it from a transient offlining failure.
1197	 */
1198	if (cc_platform_has(CC_ATTR_HOTPLUG_DISABLED))
1199		return -EOPNOTSUPP;
1200	if (cpu_hotplug_disabled)
1201		return -EBUSY;
1202	return _cpu_down(cpu, 0, target);
1203}
1204
1205static int cpu_down(unsigned int cpu, enum cpuhp_state target)
1206{
1207	int err;
1208
1209	cpu_maps_update_begin();
1210	err = cpu_down_maps_locked(cpu, target);
1211	cpu_maps_update_done();
1212	return err;
1213}
1214
1215/**
1216 * cpu_device_down - Bring down a cpu device
1217 * @dev: Pointer to the cpu device to offline
1218 *
1219 * This function is meant to be used by device core cpu subsystem only.
1220 *
1221 * Other subsystems should use remove_cpu() instead.
1222 *
1223 * Return: %0 on success or a negative errno code
1224 */
1225int cpu_device_down(struct device *dev)
1226{
1227	return cpu_down(dev->id, CPUHP_OFFLINE);
1228}
1229
1230int remove_cpu(unsigned int cpu)
1231{
1232	int ret;
1233
1234	lock_device_hotplug();
1235	ret = device_offline(get_cpu_device(cpu));
1236	unlock_device_hotplug();
1237
1238	return ret;
1239}
1240EXPORT_SYMBOL_GPL(remove_cpu);
1241
1242void smp_shutdown_nonboot_cpus(unsigned int primary_cpu)
1243{
1244	unsigned int cpu;
1245	int error;
1246
1247	cpu_maps_update_begin();
1248
1249	/*
1250	 * Make certain the cpu I'm about to reboot on is online.
1251	 *
	 * This is in line with what migrate_to_reboot_cpu() already does.
1253	 */
1254	if (!cpu_online(primary_cpu))
1255		primary_cpu = cpumask_first(cpu_online_mask);
1256
1257	for_each_online_cpu(cpu) {
1258		if (cpu == primary_cpu)
1259			continue;
1260
1261		error = cpu_down_maps_locked(cpu, CPUHP_OFFLINE);
1262		if (error) {
1263			pr_err("Failed to offline CPU%d - error=%d",
1264				cpu, error);
1265			break;
1266		}
1267	}
1268
1269	/*
1270	 * Ensure all but the reboot CPU are offline.
1271	 */
1272	BUG_ON(num_online_cpus() > 1);
1273
1274	/*
1275	 * Make sure the CPUs won't be enabled by someone else after this
1276	 * point. Kexec will reboot to a new kernel shortly resetting
1277	 * everything along the way.
1278	 */
1279	cpu_hotplug_disabled++;
1280
1281	cpu_maps_update_done();
1282}
1283
1284#else
1285#define takedown_cpu		NULL
1286#endif /*CONFIG_HOTPLUG_CPU*/
1287
1288/**
1289 * notify_cpu_starting(cpu) - Invoke the callbacks on the starting CPU
1290 * @cpu: cpu that just started
1291 *
1292 * It must be called by the arch code on the new cpu, before the new cpu
1293 * enables interrupts and before the "boot" cpu returns from __cpu_up().
1294 */
1295void notify_cpu_starting(unsigned int cpu)
1296{
1297	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
1298	enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE);
1299	int ret;
1300
1301	rcu_cpu_starting(cpu);	/* Enables RCU usage on this CPU. */
1302	cpumask_set_cpu(cpu, &cpus_booted_once_mask);
1303	ret = cpuhp_invoke_callback_range(true, cpu, st, target);
1304
1305	/*
1306	 * STARTING must not fail!
1307	 */
1308	WARN_ON_ONCE(ret);
1309}
1310
1311/*
1312 * Called from the idle task. Wake up the controlling task which brings the
1313 * hotplug thread of the upcoming CPU up and then delegates the rest of the
1314 * online bringup to the hotplug thread.
1315 */
1316void cpuhp_online_idle(enum cpuhp_state state)
1317{
1318	struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
1319
1320	/* Happens for the boot cpu */
1321	if (state != CPUHP_AP_ONLINE_IDLE)
1322		return;
1323
1324	/*
	 * Unpark the stopper thread before we start the idle loop (and start
1326	 * scheduling); this ensures the stopper task is always available.
1327	 */
1328	stop_machine_unpark(smp_processor_id());
1329
1330	st->state = CPUHP_AP_ONLINE_IDLE;
1331	complete_ap_thread(st, true);
1332}
1333
1334/* Requires cpu_add_remove_lock to be held */
1335static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
1336{
1337	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
1338	struct task_struct *idle;
1339	int ret = 0;
1340
1341	cpus_write_lock();
1342
1343	if (!cpu_present(cpu)) {
1344		ret = -EINVAL;
1345		goto out;
1346	}
1347
1348	/*
1349	 * The caller of cpu_up() might have raced with another
1350	 * caller. Nothing to do.
1351	 */
1352	if (st->state >= target)
1353		goto out;
1354
1355	if (st->state == CPUHP_OFFLINE) {
1356		/* Let it fail before we try to bring the cpu up */
1357		idle = idle_thread_get(cpu);
1358		if (IS_ERR(idle)) {
1359			ret = PTR_ERR(idle);
1360			goto out;
1361		}
1362	}
1363
1364	cpuhp_tasks_frozen = tasks_frozen;
1365
1366	cpuhp_set_state(cpu, st, target);
1367	/*
1368	 * If the current CPU state is in the range of the AP hotplug thread,
1369	 * then we need to kick the thread once more.
1370	 */
1371	if (st->state > CPUHP_BRINGUP_CPU) {
1372		ret = cpuhp_kick_ap_work(cpu);
1373		/*
1374		 * The AP side has done the error rollback already. Just
		 * return the error code.
1376		 */
1377		if (ret)
1378			goto out;
1379	}
1380
1381	/*
1382	 * Try to reach the target state. We max out on the BP at
1383	 * CPUHP_BRINGUP_CPU. After that the AP hotplug thread is
1384	 * responsible for bringing it up to the target state.
1385	 */
1386	target = min((int)target, CPUHP_BRINGUP_CPU);
1387	ret = cpuhp_up_callbacks(cpu, st, target);
1388out:
1389	cpus_write_unlock();
1390	arch_smt_update();
1391	cpu_up_down_serialize_trainwrecks(tasks_frozen);
1392	return ret;
1393}
1394
1395static int cpu_up(unsigned int cpu, enum cpuhp_state target)
1396{
1397	int err = 0;
1398
1399	if (!cpu_possible(cpu)) {
1400		pr_err("can't online cpu %d because it is not configured as may-hotadd at boot time\n",
1401		       cpu);
1402#if defined(CONFIG_IA64)
1403		pr_err("please check additional_cpus= boot parameter\n");
1404#endif
1405		return -EINVAL;
1406	}
1407
1408	err = try_online_node(cpu_to_node(cpu));
1409	if (err)
1410		return err;
1411
1412	cpu_maps_update_begin();
1413
1414	if (cpu_hotplug_disabled) {
1415		err = -EBUSY;
1416		goto out;
1417	}
1418	if (!cpu_smt_allowed(cpu)) {
1419		err = -EPERM;
1420		goto out;
1421	}
1422
1423	err = _cpu_up(cpu, 0, target);
1424out:
1425	cpu_maps_update_done();
1426	return err;
1427}
1428
1429/**
1430 * cpu_device_up - Bring up a cpu device
1431 * @dev: Pointer to the cpu device to online
1432 *
1433 * This function is meant to be used by device core cpu subsystem only.
1434 *
1435 * Other subsystems should use add_cpu() instead.
1436 *
1437 * Return: %0 on success or a negative errno code
1438 */
1439int cpu_device_up(struct device *dev)
1440{
1441	return cpu_up(dev->id, CPUHP_ONLINE);
1442}
1443
1444int add_cpu(unsigned int cpu)
1445{
1446	int ret;
1447
1448	lock_device_hotplug();
1449	ret = device_online(get_cpu_device(cpu));
1450	unlock_device_hotplug();
1451
1452	return ret;
1453}
1454EXPORT_SYMBOL_GPL(add_cpu);
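
/*
 * Usage sketch (hypothetical caller): subsystems outside the cpu device core
 * cycle a CPU through offline/online with these wrappers rather than calling
 * cpu_down()/cpu_up() directly:
 *
 *	ret = remove_cpu(cpu);
 *	if (!ret)
 *		ret = add_cpu(cpu);
 */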
1455
1456/**
1457 * bringup_hibernate_cpu - Bring up the CPU that we hibernated on
1458 * @sleep_cpu: The cpu we hibernated on and should be brought up.
1459 *
1460 * On some architectures like arm64, we can hibernate on any CPU, but on
1461 * wake up the CPU we hibernated on might be offline as a side effect of
1462 * using maxcpus= for example.
1463 *
1464 * Return: %0 on success or a negative errno code
1465 */
1466int bringup_hibernate_cpu(unsigned int sleep_cpu)
1467{
1468	int ret;
1469
1470	if (!cpu_online(sleep_cpu)) {
1471		pr_info("Hibernated on a CPU that is offline! Bringing CPU up.\n");
1472		ret = cpu_up(sleep_cpu, CPUHP_ONLINE);
1473		if (ret) {
1474			pr_err("Failed to bring hibernate-CPU up!\n");
1475			return ret;
1476		}
1477	}
1478	return 0;
1479}
1480
1481void bringup_nonboot_cpus(unsigned int setup_max_cpus)
1482{
1483	unsigned int cpu;
1484
1485	for_each_present_cpu(cpu) {
1486		if (num_online_cpus() >= setup_max_cpus)
1487			break;
1488		if (!cpu_online(cpu))
1489			cpu_up(cpu, CPUHP_ONLINE);
1490	}
1491}
1492
1493#ifdef CONFIG_PM_SLEEP_SMP
1494static cpumask_var_t frozen_cpus;
1495
1496int freeze_secondary_cpus(int primary)
1497{
1498	int cpu, error = 0;
1499
1500	cpu_maps_update_begin();
1501	if (primary == -1) {
1502		primary = cpumask_first(cpu_online_mask);
1503		if (!housekeeping_cpu(primary, HK_TYPE_TIMER))
1504			primary = housekeeping_any_cpu(HK_TYPE_TIMER);
1505	} else {
1506		if (!cpu_online(primary))
1507			primary = cpumask_first(cpu_online_mask);
1508	}
1509
1510	/*
1511	 * We take down all of the non-boot CPUs in one shot to avoid races
	 * with userspace trying to use CPU hotplug at the same time.
1513	 */
1514	cpumask_clear(frozen_cpus);
1515
1516	pr_info("Disabling non-boot CPUs ...\n");
1517	for_each_online_cpu(cpu) {
1518		if (cpu == primary)
1519			continue;
1520
1521		if (pm_wakeup_pending()) {
1522			pr_info("Wakeup pending. Abort CPU freeze\n");
1523			error = -EBUSY;
1524			break;
1525		}
1526
1527		trace_suspend_resume(TPS("CPU_OFF"), cpu, true);
1528		error = _cpu_down(cpu, 1, CPUHP_OFFLINE);
1529		trace_suspend_resume(TPS("CPU_OFF"), cpu, false);
1530		if (!error)
1531			cpumask_set_cpu(cpu, frozen_cpus);
1532		else {
1533			pr_err("Error taking CPU%d down: %d\n", cpu, error);
1534			break;
1535		}
1536	}
1537
1538	if (!error)
1539		BUG_ON(num_online_cpus() > 1);
1540	else
1541		pr_err("Non-boot CPUs are not disabled\n");
1542
1543	/*
1544	 * Make sure the CPUs won't be enabled by someone else. We need to do
1545	 * this even in case of failure as all freeze_secondary_cpus() users are
1546	 * supposed to do thaw_secondary_cpus() on the failure path.
1547	 */
1548	cpu_hotplug_disabled++;
1549
1550	cpu_maps_update_done();
1551	return error;
1552}
1553
1554void __weak arch_thaw_secondary_cpus_begin(void)
1555{
1556}
1557
1558void __weak arch_thaw_secondary_cpus_end(void)
1559{
1560}
1561
1562void thaw_secondary_cpus(void)
1563{
1564	int cpu, error;
1565
1566	/* Allow everyone to use the CPU hotplug again */
1567	cpu_maps_update_begin();
1568	__cpu_hotplug_enable();
1569	if (cpumask_empty(frozen_cpus))
1570		goto out;
1571
1572	pr_info("Enabling non-boot CPUs ...\n");
1573
1574	arch_thaw_secondary_cpus_begin();
1575
1576	for_each_cpu(cpu, frozen_cpus) {
1577		trace_suspend_resume(TPS("CPU_ON"), cpu, true);
1578		error = _cpu_up(cpu, 1, CPUHP_ONLINE);
1579		trace_suspend_resume(TPS("CPU_ON"), cpu, false);
1580		if (!error) {
1581			pr_info("CPU%d is up\n", cpu);
1582			continue;
1583		}
1584		pr_warn("Error taking CPU%d up: %d\n", cpu, error);
1585	}
1586
1587	arch_thaw_secondary_cpus_end();
1588
1589	cpumask_clear(frozen_cpus);
1590out:
1591	cpu_maps_update_done();
1592}
1593
1594static int __init alloc_frozen_cpus(void)
1595{
1596	if (!alloc_cpumask_var(&frozen_cpus, GFP_KERNEL|__GFP_ZERO))
1597		return -ENOMEM;
1598	return 0;
1599}
1600core_initcall(alloc_frozen_cpus);
1601
1602/*
1603 * When callbacks for CPU hotplug notifications are being executed, we must
1604 * ensure that the state of the system with respect to the tasks being frozen
1605 * or not, as reported by the notification, remains unchanged *throughout the
1606 * duration* of the execution of the callbacks.
1607 * Hence we need to prevent the freezer from racing with regular CPU hotplug.
1608 *
1609 * This synchronization is implemented by mutually excluding regular CPU
1610 * hotplug and Suspend/Hibernate call paths by hooking onto the Suspend/
1611 * Hibernate notifications.
1612 */
1613static int
1614cpu_hotplug_pm_callback(struct notifier_block *nb,
1615			unsigned long action, void *ptr)
1616{
1617	switch (action) {
1618
1619	case PM_SUSPEND_PREPARE:
1620	case PM_HIBERNATION_PREPARE:
1621		cpu_hotplug_disable();
1622		break;
1623
1624	case PM_POST_SUSPEND:
1625	case PM_POST_HIBERNATION:
1626		cpu_hotplug_enable();
1627		break;
1628
1629	default:
1630		return NOTIFY_DONE;
1631	}
1632
1633	return NOTIFY_OK;
1634}
1635
1636
1637static int __init cpu_hotplug_pm_sync_init(void)
1638{
1639	/*
	 * cpu_hotplug_pm_callback has higher priority than the x86
	 * bsp_pm_callback, which depends on cpu_hotplug_pm_callback
	 * disabling cpu hotplug to avoid a cpu hotplug race.
1643	 */
1644	pm_notifier(cpu_hotplug_pm_callback, 0);
1645	return 0;
1646}
1647core_initcall(cpu_hotplug_pm_sync_init);
1648
1649#endif /* CONFIG_PM_SLEEP_SMP */
1650
1651int __boot_cpu_id;
1652
1653#endif /* CONFIG_SMP */
1654
1655/* Boot processor state steps */
1656static struct cpuhp_step cpuhp_hp_states[] = {
1657	[CPUHP_OFFLINE] = {
1658		.name			= "offline",
1659		.startup.single		= NULL,
1660		.teardown.single	= NULL,
1661	},
1662#ifdef CONFIG_SMP
	[CPUHP_CREATE_THREADS] = {
1664		.name			= "threads:prepare",
1665		.startup.single		= smpboot_create_threads,
1666		.teardown.single	= NULL,
1667		.cant_stop		= true,
1668	},
1669	[CPUHP_PERF_PREPARE] = {
1670		.name			= "perf:prepare",
1671		.startup.single		= perf_event_init_cpu,
1672		.teardown.single	= perf_event_exit_cpu,
1673	},
1674	[CPUHP_RANDOM_PREPARE] = {
1675		.name			= "random:prepare",
1676		.startup.single		= random_prepare_cpu,
1677		.teardown.single	= NULL,
1678	},
1679	[CPUHP_WORKQUEUE_PREP] = {
1680		.name			= "workqueue:prepare",
1681		.startup.single		= workqueue_prepare_cpu,
1682		.teardown.single	= NULL,
1683	},
1684	[CPUHP_HRTIMERS_PREPARE] = {
1685		.name			= "hrtimers:prepare",
1686		.startup.single		= hrtimers_prepare_cpu,
1687		.teardown.single	= hrtimers_dead_cpu,
1688	},
1689	[CPUHP_SMPCFD_PREPARE] = {
1690		.name			= "smpcfd:prepare",
1691		.startup.single		= smpcfd_prepare_cpu,
1692		.teardown.single	= smpcfd_dead_cpu,
1693	},
1694	[CPUHP_RELAY_PREPARE] = {
1695		.name			= "relay:prepare",
1696		.startup.single		= relay_prepare_cpu,
1697		.teardown.single	= NULL,
1698	},
1699	[CPUHP_SLAB_PREPARE] = {
1700		.name			= "slab:prepare",
1701		.startup.single		= slab_prepare_cpu,
1702		.teardown.single	= slab_dead_cpu,
1703	},
1704	[CPUHP_RCUTREE_PREP] = {
1705		.name			= "RCU/tree:prepare",
1706		.startup.single		= rcutree_prepare_cpu,
1707		.teardown.single	= rcutree_dead_cpu,
1708	},
1709	/*
1710	 * On the tear-down path, timers_dead_cpu() must be invoked
1711	 * before blk_mq_queue_reinit_notify() from notify_dead(),
	 * otherwise an RCU stall occurs.
1713	 */
1714	[CPUHP_TIMERS_PREPARE] = {
1715		.name			= "timers:prepare",
1716		.startup.single		= timers_prepare_cpu,
1717		.teardown.single	= timers_dead_cpu,
1718	},
1719	/* Kicks the plugged cpu into life */
1720	[CPUHP_BRINGUP_CPU] = {
1721		.name			= "cpu:bringup",
1722		.startup.single		= bringup_cpu,
1723		.teardown.single	= finish_cpu,
1724		.cant_stop		= true,
1725	},
1726	/* Final state before CPU kills itself */
1727	[CPUHP_AP_IDLE_DEAD] = {
1728		.name			= "idle:dead",
1729	},
1730	/*
1731	 * Last state before CPU enters the idle loop to die. Transient state
1732	 * for synchronization.
1733	 */
1734	[CPUHP_AP_OFFLINE] = {
1735		.name			= "ap:offline",
1736		.cant_stop		= true,
1737	},
1738	/* First state is scheduler control. Interrupts are disabled */
1739	[CPUHP_AP_SCHED_STARTING] = {
1740		.name			= "sched:starting",
1741		.startup.single		= sched_cpu_starting,
1742		.teardown.single	= sched_cpu_dying,
1743	},
1744	[CPUHP_AP_RCUTREE_DYING] = {
1745		.name			= "RCU/tree:dying",
1746		.startup.single		= NULL,
1747		.teardown.single	= rcutree_dying_cpu,
1748	},
1749	[CPUHP_AP_SMPCFD_DYING] = {
1750		.name			= "smpcfd:dying",
1751		.startup.single		= NULL,
1752		.teardown.single	= smpcfd_dying_cpu,
1753	},
	/*
	 * Entry state on starting. Interrupts enabled from here on. Transient
	 * state for synchronization.
	 */
1756	[CPUHP_AP_ONLINE] = {
1757		.name			= "ap:online",
1758	},
1759	/*
1760	 * Handled on control processor until the plugged processor manages
1761	 * this itself.
1762	 */
1763	[CPUHP_TEARDOWN_CPU] = {
1764		.name			= "cpu:teardown",
1765		.startup.single		= NULL,
1766		.teardown.single	= takedown_cpu,
1767		.cant_stop		= true,
1768	},
1769
1770	[CPUHP_AP_SCHED_WAIT_EMPTY] = {
1771		.name			= "sched:waitempty",
1772		.startup.single		= NULL,
1773		.teardown.single	= sched_cpu_wait_empty,
1774	},
1775
1776	/* Handle smpboot threads park/unpark */
1777	[CPUHP_AP_SMPBOOT_THREADS] = {
1778		.name			= "smpboot/threads:online",
1779		.startup.single		= smpboot_unpark_threads,
1780		.teardown.single	= smpboot_park_threads,
1781	},
1782	[CPUHP_AP_IRQ_AFFINITY_ONLINE] = {
1783		.name			= "irq/affinity:online",
1784		.startup.single		= irq_affinity_online_cpu,
1785		.teardown.single	= NULL,
1786	},
1787	[CPUHP_AP_PERF_ONLINE] = {
1788		.name			= "perf:online",
1789		.startup.single		= perf_event_init_cpu,
1790		.teardown.single	= perf_event_exit_cpu,
1791	},
1792	[CPUHP_AP_WATCHDOG_ONLINE] = {
1793		.name			= "lockup_detector:online",
1794		.startup.single		= lockup_detector_online_cpu,
1795		.teardown.single	= lockup_detector_offline_cpu,
1796	},
1797	[CPUHP_AP_WORKQUEUE_ONLINE] = {
1798		.name			= "workqueue:online",
1799		.startup.single		= workqueue_online_cpu,
1800		.teardown.single	= workqueue_offline_cpu,
1801	},
1802	[CPUHP_AP_RANDOM_ONLINE] = {
1803		.name			= "random:online",
1804		.startup.single		= random_online_cpu,
1805		.teardown.single	= NULL,
1806	},
1807	[CPUHP_AP_RCUTREE_ONLINE] = {
1808		.name			= "RCU/tree:online",
1809		.startup.single		= rcutree_online_cpu,
1810		.teardown.single	= rcutree_offline_cpu,
1811	},
1812#endif
1813	/*
1814	 * The dynamically registered state space is here
1815	 */
1816
1817#ifdef CONFIG_SMP
1818	/* Last state is scheduler control setting the cpu active */
1819	[CPUHP_AP_ACTIVE] = {
1820		.name			= "sched:active",
1821		.startup.single		= sched_cpu_activate,
1822		.teardown.single	= sched_cpu_deactivate,
1823	},
1824#endif
1825
1826	/* CPU is fully up and running. */
1827	[CPUHP_ONLINE] = {
1828		.name			= "online",
1829		.startup.single		= NULL,
1830		.teardown.single	= NULL,
1831	},
1832};
1833
1834/* Sanity check for callbacks */
1835static int cpuhp_cb_check(enum cpuhp_state state)
1836{
1837	if (state <= CPUHP_OFFLINE || state >= CPUHP_ONLINE)
1838		return -EINVAL;
1839	return 0;
1840}
1841
1842/*
 * Returns a free slot for dynamic state assignment. The states
1844 * are protected by the cpuhp_slot_states mutex and an empty slot is identified
1845 * by having no name assigned.
1846 */
1847static int cpuhp_reserve_state(enum cpuhp_state state)
1848{
1849	enum cpuhp_state i, end;
1850	struct cpuhp_step *step;
1851
1852	switch (state) {
1853	case CPUHP_AP_ONLINE_DYN:
1854		step = cpuhp_hp_states + CPUHP_AP_ONLINE_DYN;
1855		end = CPUHP_AP_ONLINE_DYN_END;
1856		break;
1857	case CPUHP_BP_PREPARE_DYN:
1858		step = cpuhp_hp_states + CPUHP_BP_PREPARE_DYN;
1859		end = CPUHP_BP_PREPARE_DYN_END;
1860		break;
1861	default:
1862		return -EINVAL;
1863	}
1864
1865	for (i = state; i <= end; i++, step++) {
1866		if (!step->name)
1867			return i;
1868	}
1869	WARN(1, "No more dynamic states available for CPU hotplug\n");
1870	return -ENOSPC;
1871}
1872
1873static int cpuhp_store_callbacks(enum cpuhp_state state, const char *name,
1874				 int (*startup)(unsigned int cpu),
1875				 int (*teardown)(unsigned int cpu),
1876				 bool multi_instance)
1877{
1878	/* (Un)Install the callbacks for further cpu hotplug operations */
1879	struct cpuhp_step *sp;
1880	int ret = 0;
1881
1882	/*
1883	 * If name is NULL, then the state gets removed.
1884	 *
1885	 * CPUHP_AP_ONLINE_DYN and CPUHP_BP_PREPARE_DYN are handed out on
1886	 * the first allocation from these dynamic ranges, so the removal
1887	 * would trigger a new allocation and clear the wrong (already
1888	 * empty) state, leaving the callbacks of the to be cleared state
1889	 * dangling, which causes wreckage on the next hotplug operation.
1890	 */
1891	if (name && (state == CPUHP_AP_ONLINE_DYN ||
1892		     state == CPUHP_BP_PREPARE_DYN)) {
1893		ret = cpuhp_reserve_state(state);
1894		if (ret < 0)
1895			return ret;
1896		state = ret;
1897	}
1898	sp = cpuhp_get_step(state);
1899	if (name && sp->name)
1900		return -EBUSY;
1901
1902	sp->startup.single = startup;
1903	sp->teardown.single = teardown;
1904	sp->name = name;
1905	sp->multi_instance = multi_instance;
1906	INIT_HLIST_HEAD(&sp->list);
1907	return ret;
1908}
1909
1910static void *cpuhp_get_teardown_cb(enum cpuhp_state state)
1911{
1912	return cpuhp_get_step(state)->teardown.single;
1913}
1914
1915/*
1916 * Call the startup/teardown function for a step either on the AP or
1917 * on the current CPU.
1918 */
1919static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup,
1920			    struct hlist_node *node)
1921{
1922	struct cpuhp_step *sp = cpuhp_get_step(state);
1923	int ret;
1924
1925	/*
	 * If there's nothing to do, we're done.
1927	 * Relies on the union for multi_instance.
1928	 */
1929	if (cpuhp_step_empty(bringup, sp))
1930		return 0;
1931	/*
	 * The non-AP-bound callbacks can fail on bringup. On teardown,
	 * e.g. module removal, we crash for now.
1934	 */
1935#ifdef CONFIG_SMP
1936	if (cpuhp_is_ap_state(state))
1937		ret = cpuhp_invoke_ap_callback(cpu, state, bringup, node);
1938	else
1939		ret = cpuhp_invoke_callback(cpu, state, bringup, node, NULL);
1940#else
1941	ret = cpuhp_invoke_callback(cpu, state, bringup, node, NULL);
1942#endif
1943	BUG_ON(ret && !bringup);
1944	return ret;
1945}
1946
1947/*
1948 * Called from __cpuhp_setup_state on a recoverable failure.
1949 *
1950 * Note: The teardown callbacks for rollback are not allowed to fail!
1951 */
1952static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state,
1953				   struct hlist_node *node)
1954{
1955	int cpu;
1956
1957	/* Roll back the already executed steps on the other cpus */
1958	for_each_present_cpu(cpu) {
1959		struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
1960		int cpustate = st->state;
1961
1962		if (cpu >= failedcpu)
1963			break;
1964
1965		/* Did we invoke the startup call on that cpu ? */
1966		if (cpustate >= state)
1967			cpuhp_issue_call(cpu, state, false, node);
1968	}
1969}
1970
1971int __cpuhp_state_add_instance_cpuslocked(enum cpuhp_state state,
1972					  struct hlist_node *node,
1973					  bool invoke)
1974{
1975	struct cpuhp_step *sp;
1976	int cpu;
1977	int ret;
1978
1979	lockdep_assert_cpus_held();
1980
1981	sp = cpuhp_get_step(state);
1982	if (sp->multi_instance == false)
1983		return -EINVAL;
1984
1985	mutex_lock(&cpuhp_state_mutex);
1986
1987	if (!invoke || !sp->startup.multi)
1988		goto add_node;
1989
1990	/*
1991	 * Try to call the startup callback for each present cpu
1992	 * depending on the hotplug state of the cpu.
1993	 */
1994	for_each_present_cpu(cpu) {
1995		struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
1996		int cpustate = st->state;
1997
1998		if (cpustate < state)
1999			continue;
2000
2001		ret = cpuhp_issue_call(cpu, state, true, node);
2002		if (ret) {
2003			if (sp->teardown.multi)
2004				cpuhp_rollback_install(cpu, state, node);
2005			goto unlock;
2006		}
2007	}
2008add_node:
2009	ret = 0;
2010	hlist_add_head(node, &sp->list);
2011unlock:
2012	mutex_unlock(&cpuhp_state_mutex);
2013	return ret;
2014}
2015
2016int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node,
2017			       bool invoke)
2018{
2019	int ret;
2020
2021	cpus_read_lock();
2022	ret = __cpuhp_state_add_instance_cpuslocked(state, node, invoke);
2023	cpus_read_unlock();
2024	return ret;
2025}
2026EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance);
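
/*
 * Illustrative multi-instance usage sketch; not part of this file. The
 * "foo" names below are hypothetical, and the wrapper helpers referenced
 * (cpuhp_setup_state_multi(), cpuhp_state_add_instance(),
 * cpuhp_state_remove_instance()) are the usual <linux/cpuhotplug.h>
 * front-ends for the functions above. Each object embeds a hlist_node
 * and is hooked into the state's instance list, which causes the
 * callbacks to run for it on every present CPU at or above the state.
 *
 *	struct foo {
 *		struct hlist_node node;
 *	};
 *
 *	static enum cpuhp_state foo_hp_state;
 *
 *	static int foo_cpu_online(unsigned int cpu, struct hlist_node *node)
 *	{
 *		struct foo *f = hlist_entry(node, struct foo, node);
 *
 *		// set up the per-CPU part of *f; may fail
 *		return 0;
 *	}
 *
 *	static int foo_cpu_offline(unsigned int cpu, struct hlist_node *node)
 *	{
 *		// tear down the per-CPU part; must not fail
 *		return 0;
 *	}
 *
 *	// once, e.g. at driver init (returns a dynamic state number):
 *	//	foo_hp_state = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
 *	//					       "foo:online",
 *	//					       foo_cpu_online,
 *	//					       foo_cpu_offline);
 *
 *	// per object, add and later remove an instance:
 *	//	cpuhp_state_add_instance(foo_hp_state, &f->node);
 *	//	cpuhp_state_remove_instance(foo_hp_state, &f->node);
 */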
2027
2028/**
2029 * __cpuhp_setup_state_cpuslocked - Set up the callbacks for a hotplug machine state
2030 * @state:		The state to setup
2031 * @name:		Name of the step
2032 * @invoke:		If true, the startup function is invoked for cpus where
2033 *			cpu state >= @state
2034 * @startup:		startup callback function
2035 * @teardown:		teardown callback function
2036 * @multi_instance:	State is set up for multiple instances which get
2037 *			added afterwards.
2038 *
2039 * The caller needs to hold cpus read locked while calling this function.
2040 * Return:
2041 *   On success:
2042 *      Positive state number if @state is CPUHP_AP_ONLINE_DYN;
2043 *      0 for all other states
2044 *   On failure: proper (negative) error code
2045 */
2046int __cpuhp_setup_state_cpuslocked(enum cpuhp_state state,
2047				   const char *name, bool invoke,
2048				   int (*startup)(unsigned int cpu),
2049				   int (*teardown)(unsigned int cpu),
2050				   bool multi_instance)
2051{
2052	int cpu, ret = 0;
2053	bool dynstate;
2054
2055	lockdep_assert_cpus_held();
2056
2057	if (cpuhp_cb_check(state) || !name)
2058		return -EINVAL;
2059
2060	mutex_lock(&cpuhp_state_mutex);
2061
2062	ret = cpuhp_store_callbacks(state, name, startup, teardown,
2063				    multi_instance);
2064
2065	dynstate = state == CPUHP_AP_ONLINE_DYN;
2066	if (ret > 0 && dynstate) {
2067		state = ret;
2068		ret = 0;
2069	}
2070
2071	if (ret || !invoke || !startup)
2072		goto out;
2073
2074	/*
2075	 * Try to call the startup callback for each present cpu
2076	 * depending on the hotplug state of the cpu.
2077	 */
2078	for_each_present_cpu(cpu) {
2079		struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
2080		int cpustate = st->state;
2081
2082		if (cpustate < state)
2083			continue;
2084
2085		ret = cpuhp_issue_call(cpu, state, true, NULL);
2086		if (ret) {
2087			if (teardown)
2088				cpuhp_rollback_install(cpu, state, NULL);
2089			cpuhp_store_callbacks(state, NULL, NULL, NULL, false);
2090			goto out;
2091		}
2092	}
2093out:
2094	mutex_unlock(&cpuhp_state_mutex);
2095	/*
2096	 * If the requested state is CPUHP_AP_ONLINE_DYN, return the
2097	 * dynamically allocated state in case of success.
2098	 */
2099	if (!ret && dynstate)
2100		return state;
2101	return ret;
2102}
2103EXPORT_SYMBOL(__cpuhp_setup_state_cpuslocked);
2104
2105int __cpuhp_setup_state(enum cpuhp_state state,
2106			const char *name, bool invoke,
2107			int (*startup)(unsigned int cpu),
2108			int (*teardown)(unsigned int cpu),
2109			bool multi_instance)
2110{
2111	int ret;
2112
2113	cpus_read_lock();
2114	ret = __cpuhp_setup_state_cpuslocked(state, name, invoke, startup,
2115					     teardown, multi_instance);
2116	cpus_read_unlock();
2117	return ret;
2118}
2119EXPORT_SYMBOL(__cpuhp_setup_state);
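
/*
 * Illustrative usage sketch; not part of this file. The "foo" names are
 * hypothetical; cpuhp_setup_state() and cpuhp_remove_state() are the
 * <linux/cpuhotplug.h> wrappers around the functions above. A typical
 * caller registers a dynamic online state once; for CPUHP_AP_ONLINE_DYN
 * the allocated state number is returned and must be kept for removal.
 *
 *	static enum cpuhp_state foo_hp_state;
 *
 *	static int foo_cpu_online(unsigned int cpu)
 *	{
 *		// bring up per-CPU resources; may fail
 *		return 0;
 *	}
 *
 *	static int foo_cpu_offline(unsigned int cpu)
 *	{
 *		// quiesce and release per-CPU resources
 *		return 0;
 *	}
 *
 *	static int __init foo_init(void)
 *	{
 *		int ret;
 *
 *		ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "foo:online",
 *					foo_cpu_online, foo_cpu_offline);
 *		if (ret < 0)
 *			return ret;
 *		foo_hp_state = ret;
 *		return 0;
 *	}
 *
 *	static void __exit foo_exit(void)
 *	{
 *		cpuhp_remove_state(foo_hp_state);
 *	}
 */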
2120
2121int __cpuhp_state_remove_instance(enum cpuhp_state state,
2122				  struct hlist_node *node, bool invoke)
2123{
2124	struct cpuhp_step *sp = cpuhp_get_step(state);
2125	int cpu;
2126
2127	BUG_ON(cpuhp_cb_check(state));
2128
2129	if (!sp->multi_instance)
2130		return -EINVAL;
2131
2132	cpus_read_lock();
2133	mutex_lock(&cpuhp_state_mutex);
2134
2135	if (!invoke || !cpuhp_get_teardown_cb(state))
2136		goto remove;
2137	/*
2138	 * Call the teardown callback for each present cpu depending
2139	 * on the hotplug state of the cpu. This function is not
2140	 * allowed to fail currently!
2141	 */
2142	for_each_present_cpu(cpu) {
2143		struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
2144		int cpustate = st->state;
2145
2146		if (cpustate >= state)
2147			cpuhp_issue_call(cpu, state, false, node);
2148	}
2149
2150remove:
2151	hlist_del(node);
2152	mutex_unlock(&cpuhp_state_mutex);
2153	cpus_read_unlock();
2154
2155	return 0;
2156}
2157EXPORT_SYMBOL_GPL(__cpuhp_state_remove_instance);
2158
2159/**
2160 * __cpuhp_remove_state_cpuslocked - Remove the callbacks for a hotplug machine state
2161 * @state:	The state to remove
2162 * @invoke:	If true, the teardown function is invoked for cpus where
2163 *		cpu state >= @state
2164 *
2165 * The caller needs to hold cpus read locked while calling this function.
2166 * The teardown callback is currently not allowed to fail. Think
2167 * about module removal!
2168 */
2169void __cpuhp_remove_state_cpuslocked(enum cpuhp_state state, bool invoke)
2170{
2171	struct cpuhp_step *sp = cpuhp_get_step(state);
2172	int cpu;
2173
2174	BUG_ON(cpuhp_cb_check(state));
2175
2176	lockdep_assert_cpus_held();
2177
2178	mutex_lock(&cpuhp_state_mutex);
2179	if (sp->multi_instance) {
2180		WARN(!hlist_empty(&sp->list),
2181		     "Error: Removing state %d which has instances left.\n",
2182		     state);
2183		goto remove;
2184	}
2185
2186	if (!invoke || !cpuhp_get_teardown_cb(state))
2187		goto remove;
2188
2189	/*
2190	 * Call the teardown callback for each present cpu depending
2191	 * on the hotplug state of the cpu. This function is not
2192	 * allowed to fail currently!
2193	 */
2194	for_each_present_cpu(cpu) {
2195		struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
2196		int cpustate = st->state;
2197
2198		if (cpustate >= state)
2199			cpuhp_issue_call(cpu, state, false, NULL);
2200	}
2201remove:
2202	cpuhp_store_callbacks(state, NULL, NULL, NULL, false);
2203	mutex_unlock(&cpuhp_state_mutex);
2204}
2205EXPORT_SYMBOL(__cpuhp_remove_state_cpuslocked);
2206
2207void __cpuhp_remove_state(enum cpuhp_state state, bool invoke)
2208{
2209	cpus_read_lock();
2210	__cpuhp_remove_state_cpuslocked(state, invoke);
2211	cpus_read_unlock();
2212}
2213EXPORT_SYMBOL(__cpuhp_remove_state);
2214
2215#ifdef CONFIG_HOTPLUG_SMT
2216static void cpuhp_offline_cpu_device(unsigned int cpu)
2217{
2218	struct device *dev = get_cpu_device(cpu);
2219
2220	dev->offline = true;
2221	/* Tell user space about the state change */
2222	kobject_uevent(&dev->kobj, KOBJ_OFFLINE);
2223}
2224
2225static void cpuhp_online_cpu_device(unsigned int cpu)
2226{
2227	struct device *dev = get_cpu_device(cpu);
2228
2229	dev->offline = false;
2230	/* Tell user space about the state change */
2231	kobject_uevent(&dev->kobj, KOBJ_ONLINE);
2232}
2233
2234int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)
2235{
2236	int cpu, ret = 0;
2237
2238	cpu_maps_update_begin();
2239	for_each_online_cpu(cpu) {
2240		if (topology_is_primary_thread(cpu))
2241			continue;
2242		ret = cpu_down_maps_locked(cpu, CPUHP_OFFLINE);
2243		if (ret)
2244			break;
2245		/*
2246		 * As this needs to hold the cpu maps lock, it's impossible
2247		 * to call device_offline() because that ends up calling
2248		 * cpu_down(), which takes the cpu maps lock. The cpu maps
2249		 * lock needs to be held as this might race against in-kernel
2250		 * abusers of the hotplug machinery (thermal management).
2251		 *
2252		 * So nothing would update the device's offline state. That
2253		 * would leave the sysfs entry stale and prevent onlining
2254		 * after smt control has been changed to 'off' again. This
2255		 * is called under the sysfs hotplug lock, so it is properly
2256		 * serialized against the regular offline usage.
2257		 */
2258		cpuhp_offline_cpu_device(cpu);
2259	}
2260	if (!ret)
2261		cpu_smt_control = ctrlval;
2262	cpu_maps_update_done();
2263	return ret;
2264}
2265
2266int cpuhp_smt_enable(void)
2267{
2268	int cpu, ret = 0;
2269
2270	cpu_maps_update_begin();
2271	cpu_smt_control = CPU_SMT_ENABLED;
2272	for_each_present_cpu(cpu) {
2273		/* Skip online CPUs and CPUs on offline nodes */
2274		if (cpu_online(cpu) || !node_online(cpu_to_node(cpu)))
2275			continue;
2276		ret = _cpu_up(cpu, 0, CPUHP_ONLINE);
2277		if (ret)
2278			break;
2279		/* See comment in cpuhp_smt_disable() */
2280		cpuhp_online_cpu_device(cpu);
2281	}
2282	cpu_maps_update_done();
2283	return ret;
2284}
2285#endif
2286
2287#if defined(CONFIG_SYSFS) && defined(CONFIG_HOTPLUG_CPU)
2288static ssize_t state_show(struct device *dev,
2289			  struct device_attribute *attr, char *buf)
2290{
2291	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
2292
2293	return sprintf(buf, "%d\n", st->state);
2294}
2295static DEVICE_ATTR_RO(state);
2296
2297static ssize_t target_store(struct device *dev, struct device_attribute *attr,
2298			    const char *buf, size_t count)
2299{
2300	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
2301	struct cpuhp_step *sp;
2302	int target, ret;
2303
2304	ret = kstrtoint(buf, 10, &target);
2305	if (ret)
2306		return ret;
2307
2308#ifdef CONFIG_CPU_HOTPLUG_STATE_CONTROL
2309	if (target < CPUHP_OFFLINE || target > CPUHP_ONLINE)
2310		return -EINVAL;
2311#else
2312	if (target != CPUHP_OFFLINE && target != CPUHP_ONLINE)
2313		return -EINVAL;
2314#endif
2315
2316	ret = lock_device_hotplug_sysfs();
2317	if (ret)
2318		return ret;
2319
2320	mutex_lock(&cpuhp_state_mutex);
2321	sp = cpuhp_get_step(target);
2322	ret = !sp->name || sp->cant_stop ? -EINVAL : 0;
2323	mutex_unlock(&cpuhp_state_mutex);
2324	if (ret)
2325		goto out;
2326
2327	if (st->state < target)
2328		ret = cpu_up(dev->id, target);
2329	else
2330		ret = cpu_down(dev->id, target);
2331out:
2332	unlock_device_hotplug();
2333	return ret ? ret : count;
2334}
2335
2336static ssize_t target_show(struct device *dev,
2337			   struct device_attribute *attr, char *buf)
2338{
2339	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
2340
2341	return sprintf(buf, "%d\n", st->target);
2342}
2343static DEVICE_ATTR_RW(target);
2344
2345static ssize_t fail_store(struct device *dev, struct device_attribute *attr,
2346			  const char *buf, size_t count)
2347{
2348	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
2349	struct cpuhp_step *sp;
2350	int fail, ret;
2351
2352	ret = kstrtoint(buf, 10, &fail);
2353	if (ret)
2354		return ret;
2355
2356	if (fail == CPUHP_INVALID) {
2357		st->fail = fail;
2358		return count;
2359	}
2360
2361	if (fail < CPUHP_OFFLINE || fail > CPUHP_ONLINE)
2362		return -EINVAL;
2363
2364	/*
2365	 * Cannot fail STARTING/DYING callbacks.
2366	 */
2367	if (cpuhp_is_atomic_state(fail))
2368		return -EINVAL;
2369
2370	/*
2371	 * DEAD callbacks cannot fail...
2372	 * ... neither can CPUHP_BRINGUP_CPU during hotunplug. Since the
2373	 * latter triggers the STARTING callbacks, a failure in this state
2374	 * would hinder rollback.
2375	 */
2376	if (fail <= CPUHP_BRINGUP_CPU && st->state > CPUHP_BRINGUP_CPU)
2377		return -EINVAL;
2378
2379	/*
2380	 * Cannot fail anything that doesn't have callbacks.
2381	 */
2382	mutex_lock(&cpuhp_state_mutex);
2383	sp = cpuhp_get_step(fail);
2384	if (!sp->startup.single && !sp->teardown.single)
2385		ret = -EINVAL;
2386	mutex_unlock(&cpuhp_state_mutex);
2387	if (ret)
2388		return ret;
2389
2390	st->fail = fail;
2391
2392	return count;
2393}
2394
2395static ssize_t fail_show(struct device *dev,
2396			 struct device_attribute *attr, char *buf)
2397{
2398	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
2399
2400	return sprintf(buf, "%d\n", st->fail);
2401}
2402
2403static DEVICE_ATTR_RW(fail);
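
/*
 * The attributes above are exposed per CPU under
 * /sys/devices/system/cpu/cpuN/hotplug/, and the global 'states' list
 * below under /sys/devices/system/cpu/hotplug/. A rough fault-injection
 * recipe (shell; cpu1 and the state number are just examples, and the
 * chosen state must pass the checks in fail_store() above):
 *
 *	# cat /sys/devices/system/cpu/hotplug/states
 *	# echo <state-nr> > /sys/devices/system/cpu/cpu1/hotplug/fail
 *	# echo 0 > /sys/devices/system/cpu/cpu1/hotplug/target
 *
 * The next transition through the chosen state then fails and exercises
 * the rollback path; writing -1 (CPUHP_INVALID) to 'fail' resets the
 * injection, as handled explicitly in fail_store().
 */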
2404
2405static struct attribute *cpuhp_cpu_attrs[] = {
2406	&dev_attr_state.attr,
2407	&dev_attr_target.attr,
2408	&dev_attr_fail.attr,
2409	NULL
2410};
2411
2412static const struct attribute_group cpuhp_cpu_attr_group = {
2413	.attrs = cpuhp_cpu_attrs,
2414	.name = "hotplug",
2416};
2417
2418static ssize_t states_show(struct device *dev,
2419				 struct device_attribute *attr, char *buf)
2420{
2421	ssize_t cur, res = 0;
2422	int i;
2423
2424	mutex_lock(&cpuhp_state_mutex);
2425	for (i = CPUHP_OFFLINE; i <= CPUHP_ONLINE; i++) {
2426		struct cpuhp_step *sp = cpuhp_get_step(i);
2427
2428		if (sp->name) {
2429			cur = sprintf(buf, "%3d: %s\n", i, sp->name);
2430			buf += cur;
2431			res += cur;
2432		}
2433	}
2434	mutex_unlock(&cpuhp_state_mutex);
2435	return res;
2436}
2437static DEVICE_ATTR_RO(states);
2438
2439static struct attribute *cpuhp_cpu_root_attrs[] = {
2440	&dev_attr_states.attr,
2441	NULL
2442};
2443
2444static const struct attribute_group cpuhp_cpu_root_attr_group = {
2445	.attrs = cpuhp_cpu_root_attrs,
2446	.name = "hotplug",
2448};
2449
2450#ifdef CONFIG_HOTPLUG_SMT
2451
2452static ssize_t
2453__store_smt_control(struct device *dev, struct device_attribute *attr,
2454		    const char *buf, size_t count)
2455{
2456	int ctrlval, ret;
2457
2458	if (sysfs_streq(buf, "on"))
2459		ctrlval = CPU_SMT_ENABLED;
2460	else if (sysfs_streq(buf, "off"))
2461		ctrlval = CPU_SMT_DISABLED;
2462	else if (sysfs_streq(buf, "forceoff"))
2463		ctrlval = CPU_SMT_FORCE_DISABLED;
2464	else
2465		return -EINVAL;
2466
2467	if (cpu_smt_control == CPU_SMT_FORCE_DISABLED)
2468		return -EPERM;
2469
2470	if (cpu_smt_control == CPU_SMT_NOT_SUPPORTED)
2471		return -ENODEV;
2472
2473	ret = lock_device_hotplug_sysfs();
2474	if (ret)
2475		return ret;
2476
2477	if (ctrlval != cpu_smt_control) {
2478		switch (ctrlval) {
2479		case CPU_SMT_ENABLED:
2480			ret = cpuhp_smt_enable();
2481			break;
2482		case CPU_SMT_DISABLED:
2483		case CPU_SMT_FORCE_DISABLED:
2484			ret = cpuhp_smt_disable(ctrlval);
2485			break;
2486		}
2487	}
2488
2489	unlock_device_hotplug();
2490	return ret ? ret : count;
2491}
2492
2493#else /* !CONFIG_HOTPLUG_SMT */
2494static ssize_t
2495__store_smt_control(struct device *dev, struct device_attribute *attr,
2496		    const char *buf, size_t count)
2497{
2498	return -ENODEV;
2499}
2500#endif /* CONFIG_HOTPLUG_SMT */
2501
2502static const char *smt_states[] = {
2503	[CPU_SMT_ENABLED]		= "on",
2504	[CPU_SMT_DISABLED]		= "off",
2505	[CPU_SMT_FORCE_DISABLED]	= "forceoff",
2506	[CPU_SMT_NOT_SUPPORTED]		= "notsupported",
2507	[CPU_SMT_NOT_IMPLEMENTED]	= "notimplemented",
2508};
2509
2510static ssize_t control_show(struct device *dev,
2511			    struct device_attribute *attr, char *buf)
2512{
2513	const char *state = smt_states[cpu_smt_control];
2514
2515	return snprintf(buf, PAGE_SIZE - 2, "%s\n", state);
2516}
2517
2518static ssize_t control_store(struct device *dev, struct device_attribute *attr,
2519			     const char *buf, size_t count)
2520{
2521	return __store_smt_control(dev, attr, buf, count);
2522}
2523static DEVICE_ATTR_RW(control);
2524
2525static ssize_t active_show(struct device *dev,
2526			   struct device_attribute *attr, char *buf)
2527{
2528	return snprintf(buf, PAGE_SIZE - 2, "%d\n", sched_smt_active());
2529}
2530static DEVICE_ATTR_RO(active);
2531
2532static struct attribute *cpuhp_smt_attrs[] = {
2533	&dev_attr_control.attr,
2534	&dev_attr_active.attr,
2535	NULL
2536};
2537
2538static const struct attribute_group cpuhp_smt_attr_group = {
2539	.attrs = cpuhp_smt_attrs,
2540	.name = "smt",
2542};
2543
2544static int __init cpu_smt_sysfs_init(void)
2545{
2546	return sysfs_create_group(&cpu_subsys.dev_root->kobj,
2547				  &cpuhp_smt_attr_group);
2548}
2549
2550static int __init cpuhp_sysfs_init(void)
2551{
2552	int cpu, ret;
2553
2554	ret = cpu_smt_sysfs_init();
2555	if (ret)
2556		return ret;
2557
2558	ret = sysfs_create_group(&cpu_subsys.dev_root->kobj,
2559				 &cpuhp_cpu_root_attr_group);
2560	if (ret)
2561		return ret;
2562
2563	for_each_possible_cpu(cpu) {
2564		struct device *dev = get_cpu_device(cpu);
2565
2566		if (!dev)
2567			continue;
2568		ret = sysfs_create_group(&dev->kobj, &cpuhp_cpu_attr_group);
2569		if (ret)
2570			return ret;
2571	}
2572	return 0;
2573}
2574device_initcall(cpuhp_sysfs_init);
2575#endif /* CONFIG_SYSFS && CONFIG_HOTPLUG_CPU */
2576
2577/*
2578 * cpu_bit_bitmap[] is a special, "compressed" data structure that
2579 * represents the single-bit mask 1<<nr for every nr below NR_CPUS.
2580 *
2581 * It is used by cpumask_of() to get a constant address to a CPU
2582 * mask value that has a single bit set only.
2583 */
2584
2585/* cpu_bit_bitmap[0] is empty - so we can back into it */
2586#define MASK_DECLARE_1(x)	[x+1][0] = (1UL << (x))
2587#define MASK_DECLARE_2(x)	MASK_DECLARE_1(x), MASK_DECLARE_1(x+1)
2588#define MASK_DECLARE_4(x)	MASK_DECLARE_2(x), MASK_DECLARE_2(x+2)
2589#define MASK_DECLARE_8(x)	MASK_DECLARE_4(x), MASK_DECLARE_4(x+4)
2590
2591const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)] = {
2592
2593	MASK_DECLARE_8(0),	MASK_DECLARE_8(8),
2594	MASK_DECLARE_8(16),	MASK_DECLARE_8(24),
2595#if BITS_PER_LONG > 32
2596	MASK_DECLARE_8(32),	MASK_DECLARE_8(40),
2597	MASK_DECLARE_8(48),	MASK_DECLARE_8(56),
2598#endif
2599};
2600EXPORT_SYMBOL_GPL(cpu_bit_bitmap);
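
/*
 * Sketch of how cpumask_of() consumes this table (see get_cpu_mask() in
 * <linux/cpumask.h>): for a given cpu it takes the row for bit
 * (cpu % BITS_PER_LONG), whose first word has exactly that bit set, and
 * steps the pointer back by (cpu / BITS_PER_LONG) longs. The single set
 * bit thus lands in the correct word of the returned mask, and the
 * all-zero row 0 provides the slack that makes stepping back safe.
 */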
2601
2602const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL;
2603EXPORT_SYMBOL(cpu_all_bits);
2604
2605#ifdef CONFIG_INIT_ALL_POSSIBLE
2606struct cpumask __cpu_possible_mask __read_mostly
2607	= {CPU_BITS_ALL};
2608#else
2609struct cpumask __cpu_possible_mask __read_mostly;
2610#endif
2611EXPORT_SYMBOL(__cpu_possible_mask);
2612
2613struct cpumask __cpu_online_mask __read_mostly;
2614EXPORT_SYMBOL(__cpu_online_mask);
2615
2616struct cpumask __cpu_present_mask __read_mostly;
2617EXPORT_SYMBOL(__cpu_present_mask);
2618
2619struct cpumask __cpu_active_mask __read_mostly;
2620EXPORT_SYMBOL(__cpu_active_mask);
2621
2622struct cpumask __cpu_dying_mask __read_mostly;
2623EXPORT_SYMBOL(__cpu_dying_mask);
2624
2625atomic_t __num_online_cpus __read_mostly;
2626EXPORT_SYMBOL(__num_online_cpus);
2627
2628void init_cpu_present(const struct cpumask *src)
2629{
2630	cpumask_copy(&__cpu_present_mask, src);
2631}
2632
2633void init_cpu_possible(const struct cpumask *src)
2634{
2635	cpumask_copy(&__cpu_possible_mask, src);
2636}
2637
2638void init_cpu_online(const struct cpumask *src)
2639{
2640	cpumask_copy(&__cpu_online_mask, src);
2641}
2642
2643void set_cpu_online(unsigned int cpu, bool online)
2644{
2645	/*
2646	 * atomic_inc/dec() is required to handle the horrid abuse of this
2647	 * function by the reboot and kexec code which invoke it from
2648	 * IPI/NMI broadcasts when shutting down CPUs. Invocation from
2649	 * regular CPU hotplug is properly serialized.
2650	 *
2651	 * Note that the fact that __num_online_cpus is of type atomic_t
2652	 * does not protect readers which are not serialized against
2653	 * concurrent hotplug operations.
2654	 */
2655	if (online) {
2656		if (!cpumask_test_and_set_cpu(cpu, &__cpu_online_mask))
2657			atomic_inc(&__num_online_cpus);
2658	} else {
2659		if (cpumask_test_and_clear_cpu(cpu, &__cpu_online_mask))
2660			atomic_dec(&__num_online_cpus);
2661	}
2662}
2663
2664/*
2665 * Activate the first processor.
2666 */
2667void __init boot_cpu_init(void)
2668{
2669	int cpu = smp_processor_id();
2670
2671	/* Mark the boot cpu "present", "online" etc for SMP and UP case */
2672	set_cpu_online(cpu, true);
2673	set_cpu_active(cpu, true);
2674	set_cpu_present(cpu, true);
2675	set_cpu_possible(cpu, true);
2676
2677#ifdef CONFIG_SMP
2678	__boot_cpu_id = cpu;
2679#endif
2680}
2681
2682/*
2683 * Must be called _AFTER_ setting up the per_cpu areas
2684 */
2685void __init boot_cpu_hotplug_init(void)
2686{
2687#ifdef CONFIG_SMP
2688	cpumask_set_cpu(smp_processor_id(), &cpus_booted_once_mask);
2689#endif
2690	this_cpu_write(cpuhp_state.state, CPUHP_ONLINE);
2691}
2692
2693/*
2694 * These are used for a global "mitigations=" cmdline option for toggling
2695 * optional CPU mitigations.
2696 */
2697enum cpu_mitigations {
2698	CPU_MITIGATIONS_OFF,
2699	CPU_MITIGATIONS_AUTO,
2700	CPU_MITIGATIONS_AUTO_NOSMT,
2701};
2702
2703static enum cpu_mitigations cpu_mitigations __ro_after_init =
2704	CPU_MITIGATIONS_AUTO;
2705
2706static int __init mitigations_parse_cmdline(char *arg)
2707{
2708	if (!strcmp(arg, "off"))
2709		cpu_mitigations = CPU_MITIGATIONS_OFF;
2710	else if (!strcmp(arg, "auto"))
2711		cpu_mitigations = CPU_MITIGATIONS_AUTO;
2712	else if (!strcmp(arg, "auto,nosmt"))
2713		cpu_mitigations = CPU_MITIGATIONS_AUTO_NOSMT;
2714	else
2715		pr_crit("Unsupported mitigations=%s, system may still be vulnerable\n",
2716			arg);
2717
2718	return 0;
2719}
2720early_param("mitigations", mitigations_parse_cmdline);
2721
2722/* mitigations=off */
2723bool cpu_mitigations_off(void)
2724{
2725	return cpu_mitigations == CPU_MITIGATIONS_OFF;
2726}
2727EXPORT_SYMBOL_GPL(cpu_mitigations_off);
2728
2729/* mitigations=auto,nosmt */
2730bool cpu_mitigations_auto_nosmt(void)
2731{
2732	return cpu_mitigations == CPU_MITIGATIONS_AUTO_NOSMT;
2733}
2734EXPORT_SYMBOL_GPL(cpu_mitigations_auto_nosmt);
2735