sysctrl_quiesce.c revision 1341:6d7c4f090a72
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * This workaround inhibits prom_printf after the cpus are grabbed.
 * This can be removed when 4154263 is corrected.
 */
#define	Bug_4154263

/*
 * A CPR derivative specifically for sunfire
 */

#include <sys/types.h>
#include <sys/systm.h>
#include <sys/machparam.h>
#include <sys/machsystm.h>
#include <sys/ddi.h>
#define	SUNDDI_IMPL
#include <sys/sunddi.h>
#include <sys/time.h>
#include <sys/kmem.h>
#include <nfs/lm.h>
#include <sys/ddi_impldefs.h>
#include <sys/obpdefs.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/errno.h>
#include <sys/callb.h>
#include <sys/clock.h>
#include <sys/x_call.h>
#include <sys/cpuvar.h>
#include <sys/epm.h>
#include <sys/vfs.h>
#include <sys/fhc.h>
#include <sys/sysctrl.h>
#include <sys/promif.h>
#include <sys/conf.h>
#include <sys/modctl.h>
#include <sys/cyclic.h>
#include <sys/sunndi.h>

static enum sysctrl_suspend_state {
	SYSC_STATE_BEGIN = 0,
	SYSC_STATE_USER,
	SYSC_STATE_DAEMON,
	SYSC_STATE_DRIVER,
	SYSC_STATE_FULL } suspend_state;

static int	pstate_save;
static uint_t	sysctrl_gate[NCPU];
int	sysctrl_quiesce_debug = FALSE;
static int	sysctrl_skip_kernel_threads = TRUE;

/*
 * sysctrl_skip_user_threads is used to control if user threads should
 * be suspended.  If sysctrl_skip_user_threads is true, the rest of the
 * flags are not used; if it is false, sysctrl_check_user_stop_result
 * will be used to control whether or not we need to check the suspend
 * result, and sysctrl_allow_blocked_threads will be used to control
 * whether or not we allow suspend to continue if there are blocked
 * threads.  We allow all combinations of sysctrl_check_user_stop_result
 * and sysctrl_allow_blocked_threads, even though it might not make much
 * sense to disallow blocked threads when we don't even check the stop
 * result.
 */
static int	sysctrl_skip_user_threads = 0;		/* default to FALSE */
static int	sysctrl_check_user_stop_result = 1;	/* default to TRUE */
static int	sysctrl_allow_blocked_threads = 1;	/* default to TRUE */

static int	sysc_watchdog_suspended;

extern int	sysctrl_enable_detach_suspend;
static int	sysc_lastval;

#define	DEBUGP(p) { if (sysctrl_quiesce_debug) p; }
#define	errp	prom_printf

#define	SYSC_CPU_LOOP_MSEC	1000

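/*
 * CPU quiesce handshake (summary of the code below): the boss CPU
 * cross-traps sysctrl_freeze() to every other ready CPU.  Each target
 * is expected to post its own slot in sysctrl_gate[] and then spin,
 * interrupts off, until the boss's gate slot goes nonzero in
 * sysctrl_release_cpus().  The boss polls the gate array under a
 * per-CPU tick deadline and panics if a CPU fails to check in.
 */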
static void
sysctrl_grab_cpus(void)
{
	int		i;
	cpuset_t	others;
	extern cpuset_t	cpu_ready_set;
	extern void	sysctrl_freeze(void);
	uint64_t	sysc_tick_limit;
	uint64_t	sysc_current_tick;
	uint64_t	sysc_tick_deadline;

	extern u_longlong_t	gettick(void);

	for (i = 0; i < NCPU; i++)
		sysctrl_gate[i] = 0;

	/* tell other cpus to go quiet and wait for continue signal */
	others = cpu_ready_set;
	CPUSET_DEL(others, CPU->cpu_id);
	xt_some(others, (xcfunc_t *)sysctrl_freeze, (uint64_t)sysctrl_gate,
		(uint64_t)(&sysctrl_gate[CPU->cpu_id]));

	sysc_tick_limit =
		((uint64_t)sys_tick_freq * SYSC_CPU_LOOP_MSEC) / 1000;

	/* wait for each cpu to check in */
	for (i = 0; i < NCPU; i++) {
		if (!CPU_IN_SET(others, i))
			continue;

		/*
		 * Get current tick value and calculate the deadline tick
		 */
		sysc_current_tick = gettick();
		sysc_tick_deadline = sysc_current_tick + sysc_tick_limit;

		while (sysctrl_gate[i] == 0) {
			/* If in panic, we just return */
			if (panicstr)
				break;

			/* Panic if the cpu has not responded by deadline */
			sysc_current_tick = gettick();
			if (sysc_current_tick >= sysc_tick_deadline) {
				cmn_err(CE_PANIC, "sysctrl: cpu %d not "
				    "responding to quiesce command", i);
			}
		}
	}

	/* now even our interrupts are disabled -- really quiet now */
	pstate_save = disable_vec_intr();
}

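/*
 * Release the non-boss CPUs.  Writing the boss CPU's own gate slot is
 * presumed to be what the frozen CPUs are spinning on (it is the value
 * passed as the second cross-trap argument above); our own vector
 * interrupts are restored afterwards.
 */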
static void
sysctrl_release_cpus(void)
{
	/* let the other cpus go */
	sysctrl_gate[CPU->cpu_id] = 1;

	/* restore our interrupts too */
	enable_vec_intr(pstate_save);
}

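/*
 * sysctrl_stop_intr()/sysctrl_enable_intr() bracket the quiesced
 * window.  cpu_lock is held across the window since the cyclic
 * subsystem expects it for cyclic_suspend()/cyclic_resume(), and
 * preemption stays disabled until the matching enable call.
 */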
static void
sysctrl_stop_intr(void)
{
	mutex_enter(&cpu_lock);
	kpreempt_disable();
	cyclic_suspend();
}

static void
sysctrl_enable_intr(void)
{
	cyclic_resume();
	(void) spl0();
	kpreempt_enable();
	mutex_exit(&cpu_lock);
}

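/*
 * A node is treated as a "real" device if a driver is bound to it and
 * it has a "reg" property (i.e. it represents hardware that must be
 * suspended and resumed), unless its PM flags explicitly force the
 * decision one way or the other.
 */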
static int
sysctrl_is_real_device(dev_info_t *dip)
{
	struct regspec *regbuf;
	int length;
	int rc;

	if (ddi_get_driver(dip) == NULL)
		return (FALSE);

	if (DEVI(dip)->devi_pm_flags & (PMC_NEEDS_SR|PMC_PARENTAL_SR))
		return (TRUE);
	if (DEVI(dip)->devi_pm_flags & PMC_NO_SR)
		return (FALSE);

	/*
	 * now the general case
	 */
	rc = ddi_getlongprop(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, "reg",
		(caddr_t)&regbuf, &length);
	ASSERT(rc != DDI_PROP_NO_MEMORY);
	if (rc != DDI_PROP_SUCCESS) {
		return (FALSE);
	} else {
		kmem_free(regbuf, length);
		return (TRUE);
	}
}

static dev_info_t *failed_driver;
static char device_path[MAXPATHLEN];

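/*
 * Suspend the device tree depth-first: children are suspended before
 * their parent.  On the first failure, the offending dip is recorded
 * in failed_driver (with a hold on it) so that sysctrl_resume_devices()
 * can tell which devices were actually suspended and which were never
 * reached.
 */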
static int
sysctrl_suspend_devices(dev_info_t *dip, sysc_cfga_pkt_t *pkt)
{
	int circ;

	ASSERT(dip == NULL || ddi_get_parent(dip) == NULL ||
	    DEVI_BUSY_OWNED(ddi_get_parent(dip)));

	failed_driver = NULL;
	for (; dip != NULL; dip = ddi_get_next_sibling(dip)) {
		/*
		 * Hold parent busy while walking child list
		 */
		ndi_devi_enter(dip, &circ);
		if (sysctrl_suspend_devices(ddi_get_child(dip), pkt)) {
			ndi_devi_exit(dip, circ);
			return (ENXIO);
		}
		ndi_devi_exit(dip, circ);

		if (!sysctrl_is_real_device(dip))
			continue;

		/*
		 * Safe to call ddi_pathname() as parent is held busy
		 */
		(void) ddi_pathname(dip, device_path);
		DEBUGP(errp(" suspending device %s\n", device_path));
		if (devi_detach(dip, DDI_SUSPEND) != DDI_SUCCESS) {
			DEBUGP(errp("  unable to suspend device %s\n",
				device_path));

			(void) strncpy(pkt->errbuf, device_path,
				SYSC_OUTPUT_LEN);
			SYSC_ERR_SET(pkt, SYSC_ERR_SUSPEND);
			ndi_hold_devi(dip);
			failed_driver = dip;
			return (ENXIO);
		}
	}

	return (DDI_SUCCESS);
}

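/*
 * Resume is the mirror image of suspend: siblings are walked in
 * reverse order (by repeatedly rescanning for the last not-yet-resumed
 * sibling, which is quadratic, but device sibling lists are short) and
 * a parent is attached before its children.  Because resume order is
 * the reverse of suspend order, devices encountered before the
 * recorded failed_driver were never suspended and are skipped; once
 * failed_driver is reached and released, the rest are resumed.
 */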
static void
sysctrl_resume_devices(dev_info_t *start, sysc_cfga_pkt_t *pkt)
{
	int		circ;
	dev_info_t	*dip, *next, *last = NULL;

	ASSERT(start == NULL || ddi_get_parent(start) == NULL ||
	    DEVI_BUSY_OWNED(ddi_get_parent(start)));

	/* attach in reverse device tree order */
	while (last != start) {
		dip = start;
		next = ddi_get_next_sibling(dip);
		while (next != last && dip != failed_driver) {
			dip = next;
			next = ddi_get_next_sibling(dip);
		}
		if (dip == failed_driver) {
			failed_driver = NULL;
			ndi_rele_devi(dip);
		} else if (sysctrl_is_real_device(dip) &&
		    failed_driver == NULL) {
			/*
			 * Parent dip is held busy, so ddi_pathname() can
			 * be safely called.
			 */
			(void) ddi_pathname(dip, device_path);
			DEBUGP(errp(" resuming device %s\n", device_path));
			if (devi_attach(dip, DDI_RESUME) != DDI_SUCCESS) {
				/*
				 * XXX - if in the future we decide not to
				 * panic the system, we need to set the error
				 * SYSC_ERR_RESUME here and also change the
				 * cfgadm platform library.
				 */
				cmn_err(CE_PANIC, "Unable to resume device %s",
					device_path);
			}
		}
		ndi_devi_enter(dip, &circ);
		sysctrl_resume_devices(ddi_get_child(dip), pkt);
		ndi_devi_exit(dip, circ);

		last = dip;
	}
}

/*
 * True if thread is virtually stopped.  Similar to CPR_VSTOPPED
 * but from the DR point of view.  These user threads are waiting in
 * the kernel.  Once they complete in the kernel, they will process
 * the stop signal and stop.
 */
#define	SYSCTRL_VSTOPPED(t)		\
	((t)->t_state == TS_SLEEP &&	\
	(t)->t_wchan != NULL &&		\
	(t)->t_astflag &&		\
	((t)->t_proc_flag & TP_CHKPT))

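/*
 * Stopping user threads is a multi-pass protocol: each pass marks
 * every user thread with TP_CHKPT, posts an AST, kicks wakeable
 * sleepers back onto the run queue, and pokes CPUs running such
 * threads, then waits (with a quadratically growing timeout) and
 * re-scans.  A thread counts as stopped if it is CPR_ISTOPPED, or
 * merely SYSCTRL_VSTOPPED when sysctrl_allow_blocked_threads is set.
 */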
static int
sysctrl_stop_user_threads(sysc_cfga_pkt_t *pkt)
{
	int		count;
	char		cache_psargs[PSARGSZ];
	kthread_id_t	cache_tp;
	uint_t		cache_t_state;
	int		bailout;
	pid_t		pid;

	extern void add_one_utstop();
	extern void utstop_timedwait(clock_t);
	extern void utstop_init(void);

#define	SYSCTRL_UTSTOP_RETRY	4
#define	SYSCTRL_UTSTOP_WAIT	hz

	if (sysctrl_skip_user_threads)
		return (DDI_SUCCESS);

	utstop_init();

	/* we need to try a few times to get past fork, etc. */
	for (count = 0; count < SYSCTRL_UTSTOP_RETRY; count++) {
		kthread_id_t tp;

		/* walk the entire threadlist */
		mutex_enter(&pidlock);
		for (tp = curthread->t_next; tp != curthread; tp = tp->t_next) {
			proc_t *p = ttoproc(tp);

			/* handle kernel threads separately */
			if (p->p_as == &kas || p->p_stat == SZOMB)
				continue;

			mutex_enter(&p->p_lock);
			thread_lock(tp);

			if (tp->t_state == TS_STOPPED) {
				/* add another reason to stop this thread */
				tp->t_schedflag &= ~TS_RESUME;
			} else {
				tp->t_proc_flag |= TP_CHKPT;

				thread_unlock(tp);
				mutex_exit(&p->p_lock);
				add_one_utstop();
				mutex_enter(&p->p_lock);
				thread_lock(tp);

				aston(tp);

				if (tp->t_state == TS_SLEEP &&
				    (tp->t_flag & T_WAKEABLE)) {
					setrun_locked(tp);
				}
			}

			/* grab thread if needed */
			if (tp->t_state == TS_ONPROC && tp->t_cpu != CPU)
				poke_cpu(tp->t_cpu->cpu_id);

			thread_unlock(tp);
			mutex_exit(&p->p_lock);
		}
		mutex_exit(&pidlock);

		/* let everything catch up */
		utstop_timedwait(count * count * SYSCTRL_UTSTOP_WAIT);

		/* now, walk the threadlist again to see if we are done */
		mutex_enter(&pidlock);
		for (tp = curthread->t_next, bailout = 0;
		    bailout == 0 && tp != curthread; tp = tp->t_next) {
			proc_t *p = ttoproc(tp);

			/* handle kernel threads separately */
			if (p->p_as == &kas || p->p_stat == SZOMB)
				continue;

			/*
			 * If this thread didn't stop, and we don't allow
			 * unstopped blocked threads, bail.
			 */
			thread_lock(tp);
			if (!CPR_ISTOPPED(tp) &&
			    !(sysctrl_allow_blocked_threads &&
			    SYSCTRL_VSTOPPED(tp))) {
				/* nope, cache the details for later */
				bcopy(p->p_user.u_psargs, cache_psargs,
				    sizeof (cache_psargs));
				cache_tp = tp;
				cache_t_state = tp->t_state;
				bailout = 1;
				pid = p->p_pidp->pid_id;
			}
			thread_unlock(tp);
		}
		mutex_exit(&pidlock);

		/* were all the threads stopped? */
		if (!bailout)
			break;
	}

	/* were we unable to stop all threads after a few tries? */
	if (bailout) {
		(void) sprintf(pkt->errbuf, "process: %s id: %d state: %x"
		    " thread descriptor: %p", cache_psargs, (int)pid,
		    cache_t_state, (void *)cache_tp);

		SYSC_ERR_SET(pkt, SYSC_ERR_UTHREAD);

		return (ESRCH);
	}

	return (DDI_SUCCESS);
}

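/*
 * Kernel daemons are stopped through the CPR callback mechanism:
 * callb_execute_class(CB_CL_CPR_DAEMON, CB_CODE_CPR_CHKPT) asks each
 * registered daemon to park itself and returns the name of the first
 * callback that refuses, which is propagated back in pkt->errbuf.  A
 * walk of the kernel thread list then verifies that every non-interrupt
 * kernel thread is accounted for.
 */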
static int
sysctrl_stop_kernel_threads(sysc_cfga_pkt_t *pkt)
{
	caddr_t		name;
	kthread_id_t	tp;

	if (sysctrl_skip_kernel_threads) {
		return (DDI_SUCCESS);
	}

	/*
	 * Note: we unlock the table in resume.
	 * We only need to lock the callback table if we are actually
	 * suspending kernel threads.
	 */
	callb_lock_table();
	if ((name = callb_execute_class(CB_CL_CPR_DAEMON,
	    CB_CODE_CPR_CHKPT)) != (caddr_t)NULL) {
		(void) strncpy(pkt->errbuf, name, SYSC_OUTPUT_LEN);
		SYSC_ERR_SET(pkt, SYSC_ERR_KTHREAD);
		return (EBUSY);
	}

	/*
	 * Verify that all threads are accounted for
	 */
	mutex_enter(&pidlock);
	for (tp = curthread->t_next; tp != curthread; tp = tp->t_next) {
		proc_t	*p = ttoproc(tp);

		if (p->p_as != &kas)
			continue;

		if (tp->t_flag & T_INTR_THREAD)
			continue;

		if (!callb_is_stopped(tp, &name)) {
			mutex_exit(&pidlock);
			(void) strncpy(pkt->errbuf, name, SYSC_OUTPUT_LEN);
			SYSC_ERR_SET(pkt, SYSC_ERR_KTHREAD);
			return (EBUSY);
		}
	}

	mutex_exit(&pidlock);
	return (DDI_SUCCESS);
}

static void
sysctrl_start_user_threads(void)
{
	kthread_id_t tp;

	mutex_enter(&pidlock);

	/* walk all threads and release them */
	for (tp = curthread->t_next; tp != curthread; tp = tp->t_next) {
		proc_t *p = ttoproc(tp);

		/* skip kernel threads */
		if (p->p_as == &kas)
			continue;

		mutex_enter(&p->p_lock);
		tp->t_proc_flag &= ~TP_CHKPT;
		mutex_exit(&p->p_lock);

		thread_lock(tp);
		if (CPR_ISTOPPED(tp)) {
			/* back on the runq */
			tp->t_schedflag |= TS_RESUME;
			setrun_locked(tp);
		}
		thread_unlock(tp);
	}

	mutex_exit(&pidlock);
}

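/*
 * Broadcast a signal to every user process except init, zombies, and
 * the caller's own process.  Used on resume to deliver SIGTHAW so that
 * interested applications can react to the system coming back.
 */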
static void
sysctrl_signal_user(int sig)
{
	struct proc *p;

	mutex_enter(&pidlock);

	for (p = practive; p != NULL; p = p->p_next) {
		/* only user threads */
		if (p->p_exec == NULL || p->p_stat == SZOMB ||
		    p == proc_init || p == ttoproc(curthread))
			continue;

		mutex_enter(&p->p_lock);
		sigtoproc(p, NULL, sig);
		mutex_exit(&p->p_lock);
	}

	mutex_exit(&pidlock);

	/* add a bit of delay */
	delay(hz);
}

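/*
 * Undo a suspend, starting from whatever state the suspend reached.
 * The switch below deliberately falls through from the deepest state
 * (SYSC_STATE_FULL) back toward SYSC_STATE_BEGIN, so a partially
 * completed suspend is unwound in exactly the reverse order.
 */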
void
sysctrl_resume(sysc_cfga_pkt_t *pkt)
{
#ifndef Bug_4154263
	DEBUGP(errp("resume system...\n"));
#endif
	switch (suspend_state) {
	case SYSC_STATE_FULL:
		/*
		 * release all the other cpus
		 */
#ifndef	Bug_4154263
		DEBUGP(errp("release cpus..."));
#endif
		sysctrl_release_cpus();
		DEBUGP(errp("cpus resumed...\n"));

		/*
		 * If we suspended the hw watchdog at suspend time,
		 * re-enable it now.
		 */
		if (sysc_watchdog_suspended) {
			mutex_enter(&tod_lock);
			(void) tod_ops.tod_set_watchdog_timer(
			    watchdog_timeout_seconds);
			mutex_exit(&tod_lock);
		}

		/*
		 * resume callout
		 */
		(void) callb_execute_class(CB_CL_CPR_RPC, CB_CODE_CPR_RESUME);
		(void) callb_execute_class(CB_CL_CPR_CALLOUT,
		    CB_CODE_CPR_RESUME);
		sysctrl_enable_intr();
		/* FALLTHROUGH */

	case SYSC_STATE_DRIVER:
		/*
		 * resume drivers
		 */
		DEBUGP(errp("resume drivers..."));
		sysctrl_resume_devices(ddi_root_node(), pkt);
		DEBUGP(errp("done\n"));

		/*
		 * resume the lock manager
		 */
		lm_cprresume();

		/* FALLTHROUGH */

	case SYSC_STATE_DAEMON:
		/*
		 * resume kernel daemons
		 */
		if (!sysctrl_skip_kernel_threads) {
			DEBUGP(errp("starting kernel daemons..."));
			(void) callb_execute_class(CB_CL_CPR_DAEMON,
			    CB_CODE_CPR_RESUME);
			callb_unlock_table();
		}
		DEBUGP(errp("done\n"));

		/* FALLTHROUGH */

	case SYSC_STATE_USER:
		/*
		 * finally, resume user threads
		 */
		if (!sysctrl_skip_user_threads) {
			DEBUGP(errp("starting user threads..."));
			sysctrl_start_user_threads();
			DEBUGP(errp("done\n"));
		}
		/* FALLTHROUGH */

	case SYSC_STATE_BEGIN:
	default:
		/*
		 * let those who care know that we've just resumed
		 */
		DEBUGP(errp("sending SIGTHAW..."));
		sysctrl_signal_user(SIGTHAW);
		DEBUGP(errp("done\n"));
		break;
	}

	/* Restore sysctrl detach/suspend to its original value */
	sysctrl_enable_detach_suspend = sysc_lastval;

	DEBUGP(errp("system state restored\n"));
}

void
sysctrl_suspend_prepare(void)
{
	/*
	 * We use a function, lm_cprsuspend(), in the suspend flow that
	 * is redirected to a module through the modstubs mechanism.
	 * If the module is currently not loaded, modstubs attempts the
	 * modload.  The context in which this would happen during the
	 * suspend flow below causes the module load to block forever,
	 * so this function must be called in normal system call context
	 * ahead of time.
	 */
	(void) modload("misc", "klmmod");
}

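/*
 * Quiesce the system in stages, recording progress in suspend_state so
 * that sysctrl_resume() can unwind a partial suspend: stop user
 * threads, stop kernel daemons, flush and sync file systems, suspend
 * drivers, checkpoint callouts and the watchdog, and finally freeze
 * all other cpus.  Any failure resumes whatever was already done and
 * returns the error.
 */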
int
sysctrl_suspend(sysc_cfga_pkt_t *pkt)
{
	int rc = DDI_SUCCESS;

	/* enable sysctrl detach/suspend function */
	sysc_lastval = sysctrl_enable_detach_suspend;
	sysctrl_enable_detach_suspend = 1;

	/*
	 * first, stop all user threads
	 */
	DEBUGP(errp("\nstopping user threads..."));
	suspend_state = SYSC_STATE_USER;
	if (((rc = sysctrl_stop_user_threads(pkt)) != DDI_SUCCESS) &&
	    sysctrl_check_user_stop_result) {
		sysctrl_resume(pkt);
		return (rc);
	}
	DEBUGP(errp("done\n"));

	/*
	 * now stop daemon activities
	 */
	DEBUGP(errp("stopping kernel daemons..."));
	suspend_state = SYSC_STATE_DAEMON;
	if ((rc = sysctrl_stop_kernel_threads(pkt)) != DDI_SUCCESS) {
		sysctrl_resume(pkt);
		return (rc);
	}
	DEBUGP(errp("done\n"));

	/*
	 * This sync swaps out all user pages
	 */
	vfs_sync(SYNC_ALL);

	/*
	 * special treatment for lock manager
	 */
	lm_cprsuspend();

	/*
	 * sync the file system in case we never make it back
	 */
	sync();

	/*
	 * now suspend drivers
	 */
	DEBUGP(errp("suspending drivers..."));
	suspend_state = SYSC_STATE_DRIVER;
	if ((rc = sysctrl_suspend_devices(ddi_root_node(), pkt)) !=
	    DDI_SUCCESS) {
		sysctrl_resume(pkt);
		return (rc);
	}
	DEBUGP(errp("done\n"));

	/*
	 * handle the callout table
	 */
	sysctrl_stop_intr();

	(void) callb_execute_class(CB_CL_CPR_CALLOUT, CB_CODE_CPR_CHKPT);

	/*
	 * if watchdog was activated, disable it
	 */
	if (watchdog_activated) {
		mutex_enter(&tod_lock);
		(void) tod_ops.tod_clear_watchdog_timer();
		mutex_exit(&tod_lock);
		sysc_watchdog_suspended = 1;
	} else {
		sysc_watchdog_suspended = 0;
	}

	/*
	 * finally, grab all cpus
	 */
	DEBUGP(errp("freezing all cpus...\n"));
	suspend_state = SYSC_STATE_FULL;
	sysctrl_grab_cpus();
#ifndef	Bug_4154263
	DEBUGP(errp("done\n"));

	DEBUGP(errp("system is quiesced\n"));
#endif

	return (rc);
}
752