uadmin.c revision 7656:2621e50fdf4a
1178479Sjb/*
2178479Sjb * CDDL HEADER START
3178479Sjb *
4178479Sjb * The contents of this file are subject to the terms of the
5178479Sjb * Common Development and Distribution License (the "License").
6178479Sjb * You may not use this file except in compliance with the License.
7178479Sjb *
8178479Sjb * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9178479Sjb * or http://www.opensolaris.org/os/licensing.
10178479Sjb * See the License for the specific language governing permissions
11178479Sjb * and limitations under the License.
12178479Sjb *
13178479Sjb * When distributing Covered Code, include this CDDL HEADER in each
14178479Sjb * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15178479Sjb * If applicable, add the following below this CDDL HEADER, with the
16178479Sjb * fields enclosed by brackets "[]" replaced with your own identifying
17178479Sjb * information: Portions Copyright [yyyy] [name of copyright owner]
18178479Sjb *
19178479Sjb * CDDL HEADER END
20178479Sjb */
21178479Sjb
22178479Sjb/*
23178573Sjb * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24178479Sjb * Use is subject to license terms.
25178479Sjb */
26178479Sjb
27178479Sjb
28178479Sjb#include <sys/param.h>
29178479Sjb#include <sys/types.h>
30178479Sjb#include <sys/sysmacros.h>
31178479Sjb#include <sys/systm.h>
32178479Sjb#include <sys/errno.h>
33178573Sjb#include <sys/vfs.h>
34178479Sjb#include <sys/vnode.h>
35178573Sjb#include <sys/swap.h>
36178573Sjb#include <sys/file.h>
37178573Sjb#include <sys/proc.h>
38178479Sjb#include <sys/var.h>
39178479Sjb#include <sys/uadmin.h>
40178479Sjb#include <sys/signal.h>
41178573Sjb#include <sys/time.h>
42178479Sjb#include <vm/seg_kmem.h>
43178573Sjb#include <sys/modctl.h>
44178479Sjb#include <sys/callb.h>
45178479Sjb#include <sys/dumphdr.h>
46178479Sjb#include <sys/debug.h>
47178479Sjb#include <sys/ftrace.h>
48178479Sjb#include <sys/cmn_err.h>
49178479Sjb#include <sys/panic.h>
50178573Sjb#include <sys/ddi.h>
51178479Sjb#include <sys/sunddi.h>
52178573Sjb#include <sys/policy.h>
53178573Sjb#include <sys/zone.h>
54211554Srpaulo#include <sys/condvar.h>
55211554Srpaulo#include <sys/thread.h>
56211554Srpaulo#include <sys/sdt.h>
57178573Sjb
58178479Sjb/*
59178479Sjb * Administrivia system call.  We provide this in two flavors: one for calling
60178479Sjb * from the system call path (uadmin), and the other for calling from elsewhere
61178479Sjb * within the kernel (kadmin).  Callers must beware that certain uadmin cmd
62178479Sjb * values (specifically A_SWAPCTL) are only supported by uadmin and not kadmin.
63178479Sjb */
64178479Sjb
65178479Sjbextern ksema_t fsflush_sema;
66178479Sjbkmutex_t ualock;
67178479Sjbkcondvar_t uacond;
68178479Sjbkthread_t *ua_shutdown_thread = NULL;
69178479Sjb
70178479Sjbint sys_shutdown = 0;
71178479Sjbvolatile int fastreboot_dryrun = 0;
72178479Sjb
73178479Sjb/*
74178479Sjb * Kill all user processes in said zone.  A special argument of ALL_ZONES is
75178479Sjb * passed in when the system as a whole is shutting down.  The lack of per-zone
76178479Sjb * process lists is likely to make the following a performance bottleneck on a
77178479Sjb * system with many zones.
78178479Sjb */
79178479Sjbvoid
80178479Sjbkillall(zoneid_t zoneid)
81178479Sjb{
82178479Sjb	proc_t *p;
83178479Sjb
84178479Sjb	ASSERT(zoneid != GLOBAL_ZONEID);
85178479Sjb	/*
86178479Sjb	 * Kill all processes except kernel daemons and ourself.
87178479Sjb	 * Make a first pass to stop all processes so they won't
88178479Sjb	 * be trying to restart children as we kill them.
89178479Sjb	 */
90178479Sjb	mutex_enter(&pidlock);
91178479Sjb	for (p = practive; p != NULL; p = p->p_next) {
92178479Sjb		if ((zoneid == ALL_ZONES || p->p_zone->zone_id == zoneid) &&
93178479Sjb		    p->p_exec != NULLVP &&	/* kernel daemons */
94178479Sjb		    p->p_as != &kas &&
95178479Sjb		    p->p_stat != SZOMB) {
96178479Sjb			mutex_enter(&p->p_lock);
97178479Sjb			p->p_flag |= SNOWAIT;
98178479Sjb			sigtoproc(p, NULL, SIGSTOP);
99178479Sjb			mutex_exit(&p->p_lock);
100178479Sjb		}
101178479Sjb	}
102178479Sjb	p = practive;
103178479Sjb	while (p != NULL) {
104178479Sjb		if ((zoneid == ALL_ZONES || p->p_zone->zone_id == zoneid) &&
105178479Sjb		    p->p_exec != NULLVP &&	/* kernel daemons */
106178479Sjb		    p->p_as != &kas &&
107178479Sjb		    p->p_stat != SIDL &&
108178479Sjb		    p->p_stat != SZOMB) {
109178479Sjb			mutex_enter(&p->p_lock);
110178479Sjb			if (sigismember(&p->p_sig, SIGKILL)) {
111178479Sjb				mutex_exit(&p->p_lock);
112178479Sjb				p = p->p_next;
113178479Sjb			} else {
114178479Sjb				sigtoproc(p, NULL, SIGKILL);
115178479Sjb				mutex_exit(&p->p_lock);
116178479Sjb				(void) cv_timedwait(&p->p_srwchan_cv, &pidlock,
117178479Sjb				    lbolt + hz);
118178479Sjb				p = practive;
119178479Sjb			}
120178479Sjb		} else {
121178479Sjb			p = p->p_next;
122178479Sjb		}
123178479Sjb	}
124178479Sjb	mutex_exit(&pidlock);
125178479Sjb}
126178479Sjb
127178479Sjbint
128178479Sjbkadmin(int cmd, int fcn, void *mdep, cred_t *credp)
129178479Sjb{
130178479Sjb	int error = 0;
131178479Sjb	char *buf;
132178479Sjb	size_t buflen = 0;
133178479Sjb	boolean_t invoke_cb = B_FALSE;
134178479Sjb
135178479Sjb	/*
136178479Sjb	 * We might be called directly by the kernel's fault-handling code, so
137178479Sjb	 * we can't assert that the caller is in the global zone.
138178479Sjb	 */
139178479Sjb
140178479Sjb	/*
141178479Sjb	 * Make sure that cmd is one of the valid <sys/uadmin.h> command codes
142178479Sjb	 * and that we have appropriate privileges for this action.
143178479Sjb	 */
144178479Sjb	switch (cmd) {
145178479Sjb	case A_FTRACE:
146178479Sjb	case A_SHUTDOWN:
147178479Sjb	case A_REBOOT:
148178479Sjb	case A_REMOUNT:
149178479Sjb	case A_FREEZE:
150178479Sjb	case A_DUMP:
151178479Sjb	case A_SDTTEST:
152178479Sjb		if (secpolicy_sys_config(credp, B_FALSE) != 0)
153178479Sjb			return (EPERM);
154178479Sjb		break;
155178479Sjb
156178479Sjb	default:
157178479Sjb		return (EINVAL);
158178479Sjb	}
159178479Sjb
160178479Sjb	/*
161178479Sjb	 * Serialize these operations on ualock.  If it is held, the
162178479Sjb	 * system should shutdown, reboot, or remount shortly, unless there is
163178479Sjb	 * an error.  We need a cv rather than just a mutex because proper
164178479Sjb	 * functioning of A_REBOOT relies on being able to interrupt blocked
165178479Sjb	 * userland callers.
166178479Sjb	 *
167178479Sjb	 * We only clear ua_shutdown_thread after A_REMOUNT, because A_SHUTDOWN
168178479Sjb	 * and A_REBOOT should never return.
169178479Sjb	 */
170178479Sjb	if (cmd == A_SHUTDOWN || cmd == A_REBOOT || cmd == A_REMOUNT) {
171178479Sjb		mutex_enter(&ualock);
172178479Sjb		while (ua_shutdown_thread != NULL) {
173178479Sjb			if (cv_wait_sig(&uacond, &ualock) == 0) {
174178479Sjb				/*
175178479Sjb				 * If we were interrupted, leave, and handle
176178479Sjb				 * the signal (or exit, depending on what
177178479Sjb				 * happened)
178178479Sjb				 */
179178479Sjb				mutex_exit(&ualock);
180178479Sjb				return (EINTR);
181178479Sjb			}
182178479Sjb		}
183178479Sjb		ua_shutdown_thread = curthread;
184178479Sjb		mutex_exit(&ualock);
185178479Sjb	}
186178479Sjb
187178479Sjb	switch (cmd) {
188178479Sjb	case A_SHUTDOWN:
189178479Sjb	{
190178479Sjb		proc_t *p = ttoproc(curthread);
191178479Sjb
192178479Sjb		/*
193178479Sjb		 * Release (almost) all of our own resources if we are called
194178479Sjb		 * from a user context, however if we are calling kadmin() from
195178479Sjb		 * a kernel context then we do not release these resources.
196178479Sjb		 */
197178479Sjb		if (p != &p0) {
198178479Sjb			proc_is_exiting(p);
199178479Sjb			if ((error = exitlwps(0)) != 0) {
200178479Sjb				/*
201178479Sjb				 * Another thread in this process also called
202178479Sjb				 * exitlwps().
203178479Sjb				 */
204178479Sjb				mutex_enter(&ualock);
205178479Sjb				ua_shutdown_thread = NULL;
206178479Sjb				cv_signal(&uacond);
207178479Sjb				mutex_exit(&ualock);
208178479Sjb				return (error);
209178479Sjb			}
210178479Sjb			mutex_enter(&p->p_lock);
211178479Sjb			p->p_flag |= SNOWAIT;
212178479Sjb			sigfillset(&p->p_ignore);
213178479Sjb			curthread->t_lwp->lwp_cursig = 0;
214178479Sjb			curthread->t_lwp->lwp_extsig = 0;
215178479Sjb			if (p->p_exec) {
216178479Sjb				vnode_t *exec_vp = p->p_exec;
217178479Sjb				p->p_exec = NULLVP;
218178479Sjb				mutex_exit(&p->p_lock);
219178479Sjb				VN_RELE(exec_vp);
220178479Sjb			} else {
221178479Sjb				mutex_exit(&p->p_lock);
222178479Sjb			}
223178479Sjb
224178479Sjb			pollcleanup();
225178479Sjb			closeall(P_FINFO(curproc));
226178479Sjb			relvm();
227178479Sjb
228178479Sjb		} else {
229178479Sjb			/*
230178573Sjb			 * Reset t_cred if not set because much of the
231178573Sjb			 * filesystem code depends on CRED() being valid.
232178573Sjb			 */
233178573Sjb			if (curthread->t_cred == NULL)
234178573Sjb				curthread->t_cred = kcred;
235178573Sjb		}
236178573Sjb
237178479Sjb		/* indicate shutdown in progress */
238178479Sjb		sys_shutdown = 1;
239178479Sjb
240178479Sjb		/*
241178573Sjb		 * Communcate that init shouldn't be restarted.
242178573Sjb		 */
243178573Sjb		zone_shutdown_global();
244178573Sjb
245178573Sjb		killall(ALL_ZONES);
246178573Sjb		/*
247178479Sjb		 * If we are calling kadmin() from a kernel context then we
248178479Sjb		 * do not release these resources.
249178479Sjb		 */
250178479Sjb		if (ttoproc(curthread) != &p0) {
251178479Sjb			VN_RELE(PTOU(curproc)->u_cdir);
252178479Sjb			if (PTOU(curproc)->u_rdir)
253178479Sjb				VN_RELE(PTOU(curproc)->u_rdir);
254178479Sjb			if (PTOU(curproc)->u_cwd)
255178479Sjb				refstr_rele(PTOU(curproc)->u_cwd);
256178479Sjb
257178479Sjb			PTOU(curproc)->u_cdir = rootdir;
258178479Sjb			PTOU(curproc)->u_rdir = NULL;
259178479Sjb			PTOU(curproc)->u_cwd = NULL;
260178479Sjb		}
261178479Sjb
262178479Sjb		/*
263178479Sjb		 * Allow the reboot/halt/poweroff code a chance to do
264178479Sjb		 * anything it needs to whilst we still have filesystems
265178479Sjb		 * mounted, like loading any modules necessary for later
266178479Sjb		 * performing the actual poweroff.
267178479Sjb		 */
268178479Sjb		if ((mdep != NULL) && (*(char *)mdep == '/')) {
269178479Sjb			buf = i_convert_boot_device_name(mdep, NULL, &buflen);
270178479Sjb			mdpreboot(cmd, fcn, buf);
271178479Sjb		} else
272178479Sjb			mdpreboot(cmd, fcn, mdep);
273178479Sjb
274178479Sjb		/*
275178479Sjb		 * Allow fsflush to finish running and then prevent it
276178479Sjb		 * from ever running again so that vfs_unmountall() and
277178479Sjb		 * vfs_syncall() can acquire the vfs locks they need.
278178479Sjb		 */
279178479Sjb		sema_p(&fsflush_sema);
280178479Sjb		(void) callb_execute_class(CB_CL_UADMIN_PRE_VFS, NULL);
281178479Sjb
282178479Sjb		vfs_unmountall();
283178479Sjb		(void) VFS_MOUNTROOT(rootvfs, ROOT_UNMOUNT);
284178479Sjb		vfs_syncall();
285178479Sjb
286178479Sjb		dump_ereports();
287178479Sjb		dump_messages();
288178479Sjb
289178479Sjb		invoke_cb = B_TRUE;
290178479Sjb
291178479Sjb		/* FALLTHROUGH */
292178479Sjb	}
293178479Sjb
294178479Sjb	case A_REBOOT:
295178479Sjb		if ((mdep != NULL) && (*(char *)mdep == '/')) {
296178479Sjb			buf = i_convert_boot_device_name(mdep, NULL, &buflen);
297178479Sjb			mdboot(cmd, fcn, buf, invoke_cb);
298178479Sjb		} else
299178479Sjb			mdboot(cmd, fcn, mdep, invoke_cb);
300178479Sjb		/* no return expected */
301178479Sjb		break;
302178479Sjb
303178479Sjb	case A_REMOUNT:
304178479Sjb		(void) VFS_MOUNTROOT(rootvfs, ROOT_REMOUNT);
305178479Sjb		/* Let other threads enter the shutdown path now */
306178479Sjb		mutex_enter(&ualock);
307178479Sjb		ua_shutdown_thread = NULL;
308178479Sjb		cv_signal(&uacond);
309178479Sjb		mutex_exit(&ualock);
310178479Sjb		break;
311178479Sjb
312178479Sjb	case A_FREEZE:
313178479Sjb	{
314178479Sjb		/*
315178479Sjb		 * This is the entrypoint for all suspend/resume actions.
316178479Sjb		 */
317178479Sjb		extern int cpr(int, void *);
318178479Sjb
319178479Sjb		if (modload("misc", "cpr") == -1)
320178479Sjb			return (ENOTSUP);
321178479Sjb		/* Let the CPR module decide what to do with mdep */
322178479Sjb		error = cpr(fcn, mdep);
323178479Sjb		break;
324178479Sjb	}
325178479Sjb
326178479Sjb	case A_FTRACE:
327178479Sjb	{
328178479Sjb		switch (fcn) {
329178479Sjb		case AD_FTRACE_START:
330178479Sjb			(void) FTRACE_START();
331178479Sjb			break;
332178479Sjb		case AD_FTRACE_STOP:
333178479Sjb			(void) FTRACE_STOP();
334178479Sjb			break;
335178479Sjb		default:
336178479Sjb			error = EINVAL;
337178479Sjb		}
338178479Sjb		break;
339178479Sjb	}
340178479Sjb
341178479Sjb	case A_DUMP:
342178479Sjb	{
343178479Sjb		if (fcn == AD_NOSYNC) {
344178479Sjb			in_sync = 1;
345178479Sjb			break;
346178479Sjb		}
347178479Sjb
348178479Sjb		panic_bootfcn = fcn;
349178479Sjb		panic_forced = 1;
350178479Sjb
351178479Sjb		if ((mdep != NULL) && (*(char *)mdep == '/')) {
352178479Sjb			panic_bootstr = i_convert_boot_device_name(mdep,
353178479Sjb			    NULL, &buflen);
354178479Sjb		} else
355178479Sjb			panic_bootstr = mdep;
356178479Sjb
357178479Sjb		panic("forced crash dump initiated at user request");
358178479Sjb		/*NOTREACHED*/
359178479Sjb	}
360178479Sjb
361178479Sjb	case A_SDTTEST:
362178479Sjb	{
363178479Sjb		DTRACE_PROBE7(test, int, 1, int, 2, int, 3, int, 4, int, 5,
364178479Sjb		    int, 6, int, 7);
365178479Sjb		break;
366178479Sjb	}
367178479Sjb
368178479Sjb	default:
369178479Sjb		error = EINVAL;
370178479Sjb	}
371178479Sjb
372178479Sjb	return (error);
373178479Sjb}
374178479Sjb
375178479Sjbint
376178479Sjbuadmin(int cmd, int fcn, uintptr_t mdep)
377178479Sjb{
378178479Sjb	int error = 0, rv = 0;
379178479Sjb	size_t nbytes = 0;
380178479Sjb	cred_t *credp = CRED();
381178479Sjb	char *bootargs = NULL;
382178479Sjb	int reset_status = 0;
383178479Sjb
384178479Sjb	if (cmd == A_SHUTDOWN && fcn == AD_FASTREBOOT_DRYRUN) {
385178479Sjb		ddi_walk_devs(ddi_root_node(), check_driver_quiesce,
386178479Sjb		    &reset_status);
387178479Sjb		if (reset_status != 0)
388178479Sjb			return (EIO);
389178479Sjb		else
390178479Sjb			return (0);
391178479Sjb	}
392178479Sjb
393178479Sjb	/*
394178479Sjb	 * The swapctl system call doesn't have its own entry point: it uses
395178479Sjb	 * uadmin as a wrapper so we just call it directly from here.
396178479Sjb	 */
397178479Sjb	if (cmd == A_SWAPCTL) {
398178479Sjb		if (get_udatamodel() == DATAMODEL_NATIVE)
399178479Sjb			error = swapctl(fcn, (void *)mdep, &rv);
400178479Sjb#if defined(_SYSCALL32_IMPL)
401178479Sjb		else
402178479Sjb			error = swapctl32(fcn, (void *)mdep, &rv);
403178479Sjb#endif /* _SYSCALL32_IMPL */
404178479Sjb		return (error ? set_errno(error) : rv);
405178479Sjb	}
406178479Sjb
407178479Sjb	/*
408178479Sjb	 * Certain subcommands intepret a non-NULL mdep value as a pointer to
409178479Sjb	 * a boot string.  We pull that in as bootargs, if applicable.
410178479Sjb	 */
411178479Sjb	if (mdep != NULL &&
412178479Sjb	    (cmd == A_SHUTDOWN || cmd == A_REBOOT || cmd == A_DUMP ||
413178479Sjb	    cmd == A_FREEZE)) {
414178479Sjb		bootargs = kmem_zalloc(BOOTARGS_MAX, KM_SLEEP);
415178479Sjb		if ((error = copyinstr((const char *)mdep, bootargs,
416178479Sjb		    BOOTARGS_MAX, &nbytes)) != 0) {
417178479Sjb			kmem_free(bootargs, BOOTARGS_MAX);
418178573Sjb			return (set_errno(error));
419178573Sjb		}
420178573Sjb	}
421178573Sjb
422178573Sjb	/*
423178573Sjb	 * Invoke the appropriate kadmin() routine.
424178573Sjb	 */
425178573Sjb	if (getzoneid() != GLOBAL_ZONEID)
426178573Sjb		error = zone_kadmin(cmd, fcn, bootargs, credp);
427178573Sjb	else
428178479Sjb		error = kadmin(cmd, fcn, bootargs, credp);
429178479Sjb
430178479Sjb	if (bootargs != NULL)
431178479Sjb		kmem_free(bootargs, BOOTARGS_MAX);
432178479Sjb	return (error ? set_errno(error) : 0);
433178479Sjb}
434178479Sjb