/* uadmin.c revision 7656:2621e50fdf4a */
1178479Sjb/* 2178479Sjb * CDDL HEADER START 3178479Sjb * 4178479Sjb * The contents of this file are subject to the terms of the 5178479Sjb * Common Development and Distribution License (the "License"). 6178479Sjb * You may not use this file except in compliance with the License. 7178479Sjb * 8178479Sjb * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9178479Sjb * or http://www.opensolaris.org/os/licensing. 10178479Sjb * See the License for the specific language governing permissions 11178479Sjb * and limitations under the License. 12178479Sjb * 13178479Sjb * When distributing Covered Code, include this CDDL HEADER in each 14178479Sjb * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15178479Sjb * If applicable, add the following below this CDDL HEADER, with the 16178479Sjb * fields enclosed by brackets "[]" replaced with your own identifying 17178479Sjb * information: Portions Copyright [yyyy] [name of copyright owner] 18178479Sjb * 19178479Sjb * CDDL HEADER END 20178479Sjb */ 21178479Sjb 22178479Sjb/* 23178573Sjb * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24178479Sjb * Use is subject to license terms. 
25178479Sjb */ 26178479Sjb 27178479Sjb 28178479Sjb#include <sys/param.h> 29178479Sjb#include <sys/types.h> 30178479Sjb#include <sys/sysmacros.h> 31178479Sjb#include <sys/systm.h> 32178479Sjb#include <sys/errno.h> 33178573Sjb#include <sys/vfs.h> 34178479Sjb#include <sys/vnode.h> 35178573Sjb#include <sys/swap.h> 36178573Sjb#include <sys/file.h> 37178573Sjb#include <sys/proc.h> 38178479Sjb#include <sys/var.h> 39178479Sjb#include <sys/uadmin.h> 40178479Sjb#include <sys/signal.h> 41178573Sjb#include <sys/time.h> 42178479Sjb#include <vm/seg_kmem.h> 43178573Sjb#include <sys/modctl.h> 44178479Sjb#include <sys/callb.h> 45178479Sjb#include <sys/dumphdr.h> 46178479Sjb#include <sys/debug.h> 47178479Sjb#include <sys/ftrace.h> 48178479Sjb#include <sys/cmn_err.h> 49178479Sjb#include <sys/panic.h> 50178573Sjb#include <sys/ddi.h> 51178479Sjb#include <sys/sunddi.h> 52178573Sjb#include <sys/policy.h> 53178573Sjb#include <sys/zone.h> 54211554Srpaulo#include <sys/condvar.h> 55211554Srpaulo#include <sys/thread.h> 56211554Srpaulo#include <sys/sdt.h> 57178573Sjb 58178479Sjb/* 59178479Sjb * Administrivia system call. We provide this in two flavors: one for calling 60178479Sjb * from the system call path (uadmin), and the other for calling from elsewhere 61178479Sjb * within the kernel (kadmin). Callers must beware that certain uadmin cmd 62178479Sjb * values (specifically A_SWAPCTL) are only supported by uadmin and not kadmin. 63178479Sjb */ 64178479Sjb 65178479Sjbextern ksema_t fsflush_sema; 66178479Sjbkmutex_t ualock; 67178479Sjbkcondvar_t uacond; 68178479Sjbkthread_t *ua_shutdown_thread = NULL; 69178479Sjb 70178479Sjbint sys_shutdown = 0; 71178479Sjbvolatile int fastreboot_dryrun = 0; 72178479Sjb 73178479Sjb/* 74178479Sjb * Kill all user processes in said zone. A special argument of ALL_ZONES is 75178479Sjb * passed in when the system as a whole is shutting down. 
The lack of per-zone 76178479Sjb * process lists is likely to make the following a performance bottleneck on a 77178479Sjb * system with many zones. 78178479Sjb */ 79178479Sjbvoid 80178479Sjbkillall(zoneid_t zoneid) 81178479Sjb{ 82178479Sjb proc_t *p; 83178479Sjb 84178479Sjb ASSERT(zoneid != GLOBAL_ZONEID); 85178479Sjb /* 86178479Sjb * Kill all processes except kernel daemons and ourself. 87178479Sjb * Make a first pass to stop all processes so they won't 88178479Sjb * be trying to restart children as we kill them. 89178479Sjb */ 90178479Sjb mutex_enter(&pidlock); 91178479Sjb for (p = practive; p != NULL; p = p->p_next) { 92178479Sjb if ((zoneid == ALL_ZONES || p->p_zone->zone_id == zoneid) && 93178479Sjb p->p_exec != NULLVP && /* kernel daemons */ 94178479Sjb p->p_as != &kas && 95178479Sjb p->p_stat != SZOMB) { 96178479Sjb mutex_enter(&p->p_lock); 97178479Sjb p->p_flag |= SNOWAIT; 98178479Sjb sigtoproc(p, NULL, SIGSTOP); 99178479Sjb mutex_exit(&p->p_lock); 100178479Sjb } 101178479Sjb } 102178479Sjb p = practive; 103178479Sjb while (p != NULL) { 104178479Sjb if ((zoneid == ALL_ZONES || p->p_zone->zone_id == zoneid) && 105178479Sjb p->p_exec != NULLVP && /* kernel daemons */ 106178479Sjb p->p_as != &kas && 107178479Sjb p->p_stat != SIDL && 108178479Sjb p->p_stat != SZOMB) { 109178479Sjb mutex_enter(&p->p_lock); 110178479Sjb if (sigismember(&p->p_sig, SIGKILL)) { 111178479Sjb mutex_exit(&p->p_lock); 112178479Sjb p = p->p_next; 113178479Sjb } else { 114178479Sjb sigtoproc(p, NULL, SIGKILL); 115178479Sjb mutex_exit(&p->p_lock); 116178479Sjb (void) cv_timedwait(&p->p_srwchan_cv, &pidlock, 117178479Sjb lbolt + hz); 118178479Sjb p = practive; 119178479Sjb } 120178479Sjb } else { 121178479Sjb p = p->p_next; 122178479Sjb } 123178479Sjb } 124178479Sjb mutex_exit(&pidlock); 125178479Sjb} 126178479Sjb 127178479Sjbint 128178479Sjbkadmin(int cmd, int fcn, void *mdep, cred_t *credp) 129178479Sjb{ 130178479Sjb int error = 0; 131178479Sjb char *buf; 132178479Sjb size_t buflen = 0; 
133178479Sjb boolean_t invoke_cb = B_FALSE; 134178479Sjb 135178479Sjb /* 136178479Sjb * We might be called directly by the kernel's fault-handling code, so 137178479Sjb * we can't assert that the caller is in the global zone. 138178479Sjb */ 139178479Sjb 140178479Sjb /* 141178479Sjb * Make sure that cmd is one of the valid <sys/uadmin.h> command codes 142178479Sjb * and that we have appropriate privileges for this action. 143178479Sjb */ 144178479Sjb switch (cmd) { 145178479Sjb case A_FTRACE: 146178479Sjb case A_SHUTDOWN: 147178479Sjb case A_REBOOT: 148178479Sjb case A_REMOUNT: 149178479Sjb case A_FREEZE: 150178479Sjb case A_DUMP: 151178479Sjb case A_SDTTEST: 152178479Sjb if (secpolicy_sys_config(credp, B_FALSE) != 0) 153178479Sjb return (EPERM); 154178479Sjb break; 155178479Sjb 156178479Sjb default: 157178479Sjb return (EINVAL); 158178479Sjb } 159178479Sjb 160178479Sjb /* 161178479Sjb * Serialize these operations on ualock. If it is held, the 162178479Sjb * system should shutdown, reboot, or remount shortly, unless there is 163178479Sjb * an error. We need a cv rather than just a mutex because proper 164178479Sjb * functioning of A_REBOOT relies on being able to interrupt blocked 165178479Sjb * userland callers. 166178479Sjb * 167178479Sjb * We only clear ua_shutdown_thread after A_REMOUNT, because A_SHUTDOWN 168178479Sjb * and A_REBOOT should never return. 
169178479Sjb */ 170178479Sjb if (cmd == A_SHUTDOWN || cmd == A_REBOOT || cmd == A_REMOUNT) { 171178479Sjb mutex_enter(&ualock); 172178479Sjb while (ua_shutdown_thread != NULL) { 173178479Sjb if (cv_wait_sig(&uacond, &ualock) == 0) { 174178479Sjb /* 175178479Sjb * If we were interrupted, leave, and handle 176178479Sjb * the signal (or exit, depending on what 177178479Sjb * happened) 178178479Sjb */ 179178479Sjb mutex_exit(&ualock); 180178479Sjb return (EINTR); 181178479Sjb } 182178479Sjb } 183178479Sjb ua_shutdown_thread = curthread; 184178479Sjb mutex_exit(&ualock); 185178479Sjb } 186178479Sjb 187178479Sjb switch (cmd) { 188178479Sjb case A_SHUTDOWN: 189178479Sjb { 190178479Sjb proc_t *p = ttoproc(curthread); 191178479Sjb 192178479Sjb /* 193178479Sjb * Release (almost) all of our own resources if we are called 194178479Sjb * from a user context, however if we are calling kadmin() from 195178479Sjb * a kernel context then we do not release these resources. 196178479Sjb */ 197178479Sjb if (p != &p0) { 198178479Sjb proc_is_exiting(p); 199178479Sjb if ((error = exitlwps(0)) != 0) { 200178479Sjb /* 201178479Sjb * Another thread in this process also called 202178479Sjb * exitlwps(). 
203178479Sjb */ 204178479Sjb mutex_enter(&ualock); 205178479Sjb ua_shutdown_thread = NULL; 206178479Sjb cv_signal(&uacond); 207178479Sjb mutex_exit(&ualock); 208178479Sjb return (error); 209178479Sjb } 210178479Sjb mutex_enter(&p->p_lock); 211178479Sjb p->p_flag |= SNOWAIT; 212178479Sjb sigfillset(&p->p_ignore); 213178479Sjb curthread->t_lwp->lwp_cursig = 0; 214178479Sjb curthread->t_lwp->lwp_extsig = 0; 215178479Sjb if (p->p_exec) { 216178479Sjb vnode_t *exec_vp = p->p_exec; 217178479Sjb p->p_exec = NULLVP; 218178479Sjb mutex_exit(&p->p_lock); 219178479Sjb VN_RELE(exec_vp); 220178479Sjb } else { 221178479Sjb mutex_exit(&p->p_lock); 222178479Sjb } 223178479Sjb 224178479Sjb pollcleanup(); 225178479Sjb closeall(P_FINFO(curproc)); 226178479Sjb relvm(); 227178479Sjb 228178479Sjb } else { 229178479Sjb /* 230178573Sjb * Reset t_cred if not set because much of the 231178573Sjb * filesystem code depends on CRED() being valid. 232178573Sjb */ 233178573Sjb if (curthread->t_cred == NULL) 234178573Sjb curthread->t_cred = kcred; 235178573Sjb } 236178573Sjb 237178479Sjb /* indicate shutdown in progress */ 238178479Sjb sys_shutdown = 1; 239178479Sjb 240178479Sjb /* 241178573Sjb * Communcate that init shouldn't be restarted. 242178573Sjb */ 243178573Sjb zone_shutdown_global(); 244178573Sjb 245178573Sjb killall(ALL_ZONES); 246178573Sjb /* 247178479Sjb * If we are calling kadmin() from a kernel context then we 248178479Sjb * do not release these resources. 
249178479Sjb */ 250178479Sjb if (ttoproc(curthread) != &p0) { 251178479Sjb VN_RELE(PTOU(curproc)->u_cdir); 252178479Sjb if (PTOU(curproc)->u_rdir) 253178479Sjb VN_RELE(PTOU(curproc)->u_rdir); 254178479Sjb if (PTOU(curproc)->u_cwd) 255178479Sjb refstr_rele(PTOU(curproc)->u_cwd); 256178479Sjb 257178479Sjb PTOU(curproc)->u_cdir = rootdir; 258178479Sjb PTOU(curproc)->u_rdir = NULL; 259178479Sjb PTOU(curproc)->u_cwd = NULL; 260178479Sjb } 261178479Sjb 262178479Sjb /* 263178479Sjb * Allow the reboot/halt/poweroff code a chance to do 264178479Sjb * anything it needs to whilst we still have filesystems 265178479Sjb * mounted, like loading any modules necessary for later 266178479Sjb * performing the actual poweroff. 267178479Sjb */ 268178479Sjb if ((mdep != NULL) && (*(char *)mdep == '/')) { 269178479Sjb buf = i_convert_boot_device_name(mdep, NULL, &buflen); 270178479Sjb mdpreboot(cmd, fcn, buf); 271178479Sjb } else 272178479Sjb mdpreboot(cmd, fcn, mdep); 273178479Sjb 274178479Sjb /* 275178479Sjb * Allow fsflush to finish running and then prevent it 276178479Sjb * from ever running again so that vfs_unmountall() and 277178479Sjb * vfs_syncall() can acquire the vfs locks they need. 
278178479Sjb */ 279178479Sjb sema_p(&fsflush_sema); 280178479Sjb (void) callb_execute_class(CB_CL_UADMIN_PRE_VFS, NULL); 281178479Sjb 282178479Sjb vfs_unmountall(); 283178479Sjb (void) VFS_MOUNTROOT(rootvfs, ROOT_UNMOUNT); 284178479Sjb vfs_syncall(); 285178479Sjb 286178479Sjb dump_ereports(); 287178479Sjb dump_messages(); 288178479Sjb 289178479Sjb invoke_cb = B_TRUE; 290178479Sjb 291178479Sjb /* FALLTHROUGH */ 292178479Sjb } 293178479Sjb 294178479Sjb case A_REBOOT: 295178479Sjb if ((mdep != NULL) && (*(char *)mdep == '/')) { 296178479Sjb buf = i_convert_boot_device_name(mdep, NULL, &buflen); 297178479Sjb mdboot(cmd, fcn, buf, invoke_cb); 298178479Sjb } else 299178479Sjb mdboot(cmd, fcn, mdep, invoke_cb); 300178479Sjb /* no return expected */ 301178479Sjb break; 302178479Sjb 303178479Sjb case A_REMOUNT: 304178479Sjb (void) VFS_MOUNTROOT(rootvfs, ROOT_REMOUNT); 305178479Sjb /* Let other threads enter the shutdown path now */ 306178479Sjb mutex_enter(&ualock); 307178479Sjb ua_shutdown_thread = NULL; 308178479Sjb cv_signal(&uacond); 309178479Sjb mutex_exit(&ualock); 310178479Sjb break; 311178479Sjb 312178479Sjb case A_FREEZE: 313178479Sjb { 314178479Sjb /* 315178479Sjb * This is the entrypoint for all suspend/resume actions. 
316178479Sjb */ 317178479Sjb extern int cpr(int, void *); 318178479Sjb 319178479Sjb if (modload("misc", "cpr") == -1) 320178479Sjb return (ENOTSUP); 321178479Sjb /* Let the CPR module decide what to do with mdep */ 322178479Sjb error = cpr(fcn, mdep); 323178479Sjb break; 324178479Sjb } 325178479Sjb 326178479Sjb case A_FTRACE: 327178479Sjb { 328178479Sjb switch (fcn) { 329178479Sjb case AD_FTRACE_START: 330178479Sjb (void) FTRACE_START(); 331178479Sjb break; 332178479Sjb case AD_FTRACE_STOP: 333178479Sjb (void) FTRACE_STOP(); 334178479Sjb break; 335178479Sjb default: 336178479Sjb error = EINVAL; 337178479Sjb } 338178479Sjb break; 339178479Sjb } 340178479Sjb 341178479Sjb case A_DUMP: 342178479Sjb { 343178479Sjb if (fcn == AD_NOSYNC) { 344178479Sjb in_sync = 1; 345178479Sjb break; 346178479Sjb } 347178479Sjb 348178479Sjb panic_bootfcn = fcn; 349178479Sjb panic_forced = 1; 350178479Sjb 351178479Sjb if ((mdep != NULL) && (*(char *)mdep == '/')) { 352178479Sjb panic_bootstr = i_convert_boot_device_name(mdep, 353178479Sjb NULL, &buflen); 354178479Sjb } else 355178479Sjb panic_bootstr = mdep; 356178479Sjb 357178479Sjb panic("forced crash dump initiated at user request"); 358178479Sjb /*NOTREACHED*/ 359178479Sjb } 360178479Sjb 361178479Sjb case A_SDTTEST: 362178479Sjb { 363178479Sjb DTRACE_PROBE7(test, int, 1, int, 2, int, 3, int, 4, int, 5, 364178479Sjb int, 6, int, 7); 365178479Sjb break; 366178479Sjb } 367178479Sjb 368178479Sjb default: 369178479Sjb error = EINVAL; 370178479Sjb } 371178479Sjb 372178479Sjb return (error); 373178479Sjb} 374178479Sjb 375178479Sjbint 376178479Sjbuadmin(int cmd, int fcn, uintptr_t mdep) 377178479Sjb{ 378178479Sjb int error = 0, rv = 0; 379178479Sjb size_t nbytes = 0; 380178479Sjb cred_t *credp = CRED(); 381178479Sjb char *bootargs = NULL; 382178479Sjb int reset_status = 0; 383178479Sjb 384178479Sjb if (cmd == A_SHUTDOWN && fcn == AD_FASTREBOOT_DRYRUN) { 385178479Sjb ddi_walk_devs(ddi_root_node(), check_driver_quiesce, 386178479Sjb 
&reset_status); 387178479Sjb if (reset_status != 0) 388178479Sjb return (EIO); 389178479Sjb else 390178479Sjb return (0); 391178479Sjb } 392178479Sjb 393178479Sjb /* 394178479Sjb * The swapctl system call doesn't have its own entry point: it uses 395178479Sjb * uadmin as a wrapper so we just call it directly from here. 396178479Sjb */ 397178479Sjb if (cmd == A_SWAPCTL) { 398178479Sjb if (get_udatamodel() == DATAMODEL_NATIVE) 399178479Sjb error = swapctl(fcn, (void *)mdep, &rv); 400178479Sjb#if defined(_SYSCALL32_IMPL) 401178479Sjb else 402178479Sjb error = swapctl32(fcn, (void *)mdep, &rv); 403178479Sjb#endif /* _SYSCALL32_IMPL */ 404178479Sjb return (error ? set_errno(error) : rv); 405178479Sjb } 406178479Sjb 407178479Sjb /* 408178479Sjb * Certain subcommands intepret a non-NULL mdep value as a pointer to 409178479Sjb * a boot string. We pull that in as bootargs, if applicable. 410178479Sjb */ 411178479Sjb if (mdep != NULL && 412178479Sjb (cmd == A_SHUTDOWN || cmd == A_REBOOT || cmd == A_DUMP || 413178479Sjb cmd == A_FREEZE)) { 414178479Sjb bootargs = kmem_zalloc(BOOTARGS_MAX, KM_SLEEP); 415178479Sjb if ((error = copyinstr((const char *)mdep, bootargs, 416178479Sjb BOOTARGS_MAX, &nbytes)) != 0) { 417178479Sjb kmem_free(bootargs, BOOTARGS_MAX); 418178573Sjb return (set_errno(error)); 419178573Sjb } 420178573Sjb } 421178573Sjb 422178573Sjb /* 423178573Sjb * Invoke the appropriate kadmin() routine. 424178573Sjb */ 425178573Sjb if (getzoneid() != GLOBAL_ZONEID) 426178573Sjb error = zone_kadmin(cmd, fcn, bootargs, credp); 427178573Sjb else 428178479Sjb error = kadmin(cmd, fcn, bootargs, credp); 429178479Sjb 430178479Sjb if (bootargs != NULL) 431178479Sjb kmem_free(bootargs, BOOTARGS_MAX); 432178479Sjb return (error ? set_errno(error) : 0); 433178479Sjb} 434178479Sjb