exit.c revision 3813:c7c433a53b1a
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
28
29#pragma ident	"%Z%%M%	%I%	%E% SMI"	/* from SVr4.0 1.74 */
30
31#include <sys/types.h>
32#include <sys/param.h>
33#include <sys/sysmacros.h>
34#include <sys/systm.h>
35#include <sys/cred.h>
36#include <sys/user.h>
37#include <sys/errno.h>
38#include <sys/proc.h>
39#include <sys/ucontext.h>
40#include <sys/procfs.h>
41#include <sys/vnode.h>
42#include <sys/acct.h>
43#include <sys/var.h>
44#include <sys/cmn_err.h>
45#include <sys/debug.h>
46#include <sys/wait.h>
47#include <sys/siginfo.h>
48#include <sys/procset.h>
49#include <sys/class.h>
50#include <sys/file.h>
51#include <sys/session.h>
52#include <sys/kmem.h>
53#include <sys/vtrace.h>
54#include <sys/prsystm.h>
55#include <sys/ipc.h>
56#include <sys/sem_impl.h>
57#include <c2/audit.h>
58#include <sys/aio_impl.h>
59#include <vm/as.h>
60#include <sys/poll.h>
61#include <sys/door.h>
62#include <sys/lwpchan_impl.h>
63#include <sys/utrap.h>
64#include <sys/task.h>
65#include <sys/exacct.h>
66#include <sys/cyclic.h>
67#include <sys/schedctl.h>
68#include <sys/rctl.h>
69#include <sys/contract_impl.h>
70#include <sys/contract/process_impl.h>
71#include <sys/list.h>
72#include <sys/dtrace.h>
73#include <sys/pool.h>
74#include <sys/sdt.h>
75#include <sys/corectl.h>
76#include <sys/brand.h>
77#include <sys/libc_kernel.h>
78
79/*
80 * convert code/data pair into old style wait status
81 */
82int
83wstat(int code, int data)
84{
85	int stat = (data & 0377);
86
87	switch (code) {
88	case CLD_EXITED:
89		stat <<= 8;
90		break;
91	case CLD_DUMPED:
92		stat |= WCOREFLG;
93		break;
94	case CLD_KILLED:
95		break;
96	case CLD_TRAPPED:
97	case CLD_STOPPED:
98		stat <<= 8;
99		stat |= WSTOPFLG;
100		break;
101	case CLD_CONTINUED:
102		stat = WCONTFLG;
103		break;
104	default:
105		cmn_err(CE_PANIC, "wstat: bad code");
106		/* NOTREACHED */
107	}
108	return (stat);
109}
110
/*
 * Format a short, human-readable description of why a process went away
 * into buf (never writing more than bufsz bytes) and return buf so that
 * the call can be used directly as a printf-style argument.
 *
 * "why" selects the wording (CLD_EXITED, CLD_KILLED or CLD_DUMPED) and
 * "what" is the accompanying number printed in the message.  Any other
 * "why" value produces a generic message that shows both numbers.
 */
static char *
exit_reason(char *buf, size_t bufsz, int what, int why)
{
	if (why == CLD_EXITED) {
		(void) snprintf(buf, bufsz, "exited with status %d", what);
	} else if (why == CLD_KILLED) {
		(void) snprintf(buf, bufsz, "exited on fatal signal %d", what);
	} else if (why == CLD_DUMPED) {
		(void) snprintf(buf, bufsz, "core dumped on signal %d", what);
	} else {
		(void) snprintf(buf, bufsz, "encountered unknown error "
		    "(%d, %d)", why, what);
	}

	return (buf);
}
132
/*
 * exit system call: pass back caller's arg.
 *
 * Thin wrapper around exit(): the reason is always CLD_EXITED and
 * "rval" is forwarded unchanged as the accompanying value.
 */
void
rexit(int rval)
{
	exit(CLD_EXITED, rval);
}
141
142/*
143 * Called by proc_exit() when a zone's init exits, presumably because
144 * it failed.  As long as the given zone is still in the "running"
145 * state, we will re-exec() init, but first we need to reset things
146 * which are usually inherited across exec() but will break init's
147 * assumption that it is being exec()'d from a virgin process.  Most
148 * importantly this includes closing all file descriptors (exec only
149 * closes those marked close-on-exec) and resetting signals (exec only
150 * resets handled signals, and we need to clear any signals which
151 * killed init).  Anything else that exec(2) says would be inherited,
152 * but would affect the execution of init, needs to be reset.
153 */
154static int
155restart_init(int what, int why)
156{
157	kthread_t *t = curthread;
158	klwp_t *lwp = ttolwp(t);
159	proc_t *p = ttoproc(t);
160	user_t *up = PTOU(p);
161
162	vnode_t *oldcd, *oldrd;
163	int i, err;
164	char reason_buf[64];
165
166	/*
167	 * Let zone admin (and global zone admin if this is for a non-global
168	 * zone) know that init has failed and will be restarted.
169	 */
170	zcmn_err(p->p_zone->zone_id, CE_WARN,
171	    "init(1M) %s: restarting automatically",
172	    exit_reason(reason_buf, sizeof (reason_buf), what, why));
173
174	if (!INGLOBALZONE(p)) {
175		cmn_err(CE_WARN, "init(1M) for zone %s (pid %d) %s: "
176		    "restarting automatically",
177		    p->p_zone->zone_name, p->p_pid, reason_buf);
178	}
179
180	/*
181	 * Remove any fpollinfo_t's for this (last) thread from our file
182	 * descriptors so closeall() can ASSERT() that they're all gone.
183	 * Then close all open file descriptors in the process.
184	 */
185	pollcleanup();
186	closeall(P_FINFO(p));
187
188	/*
189	 * Grab p_lock and begin clearing miscellaneous global process
190	 * state that needs to be reset before we exec the new init(1M).
191	 */
192
193	mutex_enter(&p->p_lock);
194	prbarrier(p);
195
196	p->p_flag &= ~(SKILLED | SEXTKILLED | SEXITING | SDOCORE);
197	up->u_cmask = CMASK;
198
199	sigemptyset(&t->t_hold);
200	sigemptyset(&t->t_sig);
201	sigemptyset(&t->t_extsig);
202
203	sigemptyset(&p->p_sig);
204	sigemptyset(&p->p_extsig);
205
206	sigdelq(p, t, 0);
207	sigdelq(p, NULL, 0);
208
209	if (p->p_killsqp) {
210		siginfofree(p->p_killsqp);
211		p->p_killsqp = NULL;
212	}
213
214	/*
215	 * Reset any signals that are ignored back to the default disposition.
216	 * Other u_signal members will be cleared when exec calls sigdefault().
217	 */
218	for (i = 1; i < NSIG; i++) {
219		if (up->u_signal[i - 1] == SIG_IGN) {
220			up->u_signal[i - 1] = SIG_DFL;
221			sigemptyset(&up->u_sigmask[i - 1]);
222		}
223	}
224
225	/*
226	 * Clear the current signal, any signal info associated with it, and
227	 * any signal information from contracts and/or contract templates.
228	 */
229	lwp->lwp_cursig = 0;
230	lwp->lwp_extsig = 0;
231	if (lwp->lwp_curinfo != NULL) {
232		siginfofree(lwp->lwp_curinfo);
233		lwp->lwp_curinfo = NULL;
234	}
235	lwp_ctmpl_clear(lwp);
236
237	/*
238	 * Reset both the process root directory and the current working
239	 * directory to the root of the zone just as we do during boot.
240	 */
241	VN_HOLD(p->p_zone->zone_rootvp);
242	oldrd = up->u_rdir;
243	up->u_rdir = p->p_zone->zone_rootvp;
244
245	VN_HOLD(p->p_zone->zone_rootvp);
246	oldcd = up->u_cdir;
247	up->u_cdir = p->p_zone->zone_rootvp;
248
249	if (up->u_cwd != NULL) {
250		refstr_rele(up->u_cwd);
251		up->u_cwd = NULL;
252	}
253
254	mutex_exit(&p->p_lock);
255
256	if (oldrd != NULL)
257		VN_RELE(oldrd);
258	if (oldcd != NULL)
259		VN_RELE(oldcd);
260
261	/* Free the controlling tty.  (freectty() always assumes curproc.) */
262	ASSERT(p == curproc);
263	(void) freectty(B_TRUE);
264
265	/*
266	 * Now exec() the new init(1M) on top of the current process.  If we
267	 * succeed, the caller will treat this like a successful system call.
268	 * If we fail, we issue messages and the caller will proceed with exit.
269	 */
270	err = exec_init(p->p_zone->zone_initname, NULL);
271
272	if (err == 0)
273		return (0);
274
275	zcmn_err(p->p_zone->zone_id, CE_WARN,
276	    "failed to restart init(1M) (err=%d): system reboot required", err);
277
278	if (!INGLOBALZONE(p)) {
279		cmn_err(CE_WARN, "failed to restart init(1M) for zone %s "
280		    "(pid %d, err=%d): zoneadm(1M) boot required",
281		    p->p_zone->zone_name, p->p_pid, err);
282	}
283
284	return (-1);
285}
286
287/*
288 * Release resources.
289 * Enter zombie state.
290 * Wake up parent and init processes,
291 * and dispose of children.
292 */
293void
294exit(int why, int what)
295{
296	/*
297	 * If proc_exit() fails, then some other lwp in the process
298	 * got there first.  We just have to call lwp_exit() to allow
299	 * the other lwp to finish exiting the process.  Otherwise we're
300	 * restarting init, and should return.
301	 */
302	if (proc_exit(why, what) != 0) {
303		mutex_enter(&curproc->p_lock);
304		ASSERT(curproc->p_flag & SEXITLWPS);
305		lwp_exit();
306		/* NOTREACHED */
307	}
308}
309
/*
 * Set the SEXITING flag on the process, after making sure /proc does
 * not have it locked.  This is done in more places than proc_exit(),
 * so it is a separate function.
 *
 * p_lock is taken and dropped here; prbarrier() is called with it held,
 * before the flag is set.
 */
void
proc_is_exiting(proc_t *p)
{
	mutex_enter(&p->p_lock);
	prbarrier(p);
	p->p_flag |= SEXITING;
	mutex_exit(&p->p_lock);
}
323
/*
 * Do the real work of exiting the current process on behalf of the
 * calling lwp.
 *
 * "why" is a CLD_* code and "what" the value that goes with it; the two
 * are turned into a wait status by wstat() and are also stored in
 * p_wcode/p_wdata for the benefit of the parent.
 *
 * Return value:
 *   1 - exitlwps() failed, call (or continue) lwp_exit()
 *   0 - restarting init.  Return through system call path
 *
 * In every other case this function does not return: it ends by calling
 * thread_exit().
 */
int
proc_exit(int why, int what)
{
	kthread_t *t = curthread;
	klwp_t *lwp = ttolwp(t);
	proc_t *p = ttoproc(t);
	zone_t *z = p->p_zone;
	timeout_id_t tmp_id;
	int rv;
	proc_t *q;
	task_t *tk;
	vnode_t *exec_vp, *execdir_vp, *cdir, *rdir;
	sigqueue_t *sqp;
	lwpdir_t *lwpdir;
	uint_t lwpdir_sz;
	lwpdir_t **tidhash;
	uint_t tidhash_sz;
	refstr_t *cwd;
	hrtime_t hrutime, hrstime;
	int evaporate;

	/*
	 * Stop and discard the process's lwps except for the current one,
	 * unless some other lwp beat us to it.  If exitlwps() fails then
	 * return and the calling lwp will call (or continue in) lwp_exit().
	 */
	proc_is_exiting(p);
	if (exitlwps(0) != 0)
		return (1);

	DTRACE_PROC(lwp__exit);
	DTRACE_PROC1(exit, int, why);

	/*
	 * Will perform any brand specific proc exit processing, since this
	 * is always the last lwp, will also perform lwp_exit and free brand
	 * data
	 */
	if (PROC_IS_BRANDED(p))
		BROP(p)->b_proc_exit(p, lwp);

	/*
	 * Don't let init exit unless zone_start_init() failed its exec, or
	 * we are shutting down the zone or the machine.
	 *
	 * Since we are single threaded, we don't need to lock the
	 * following accesses to zone_proc_initpid.
	 */
	if (p->p_pid == z->zone_proc_initpid) {
		if (z->zone_boot_err == 0 &&
		    zone_status_get(z) < ZONE_IS_SHUTTING_DOWN &&
		    zone_status_get(global_zone) < ZONE_IS_SHUTTING_DOWN &&
		    z->zone_restart_init == B_TRUE &&
		    restart_init(what, why) == 0)
			return (0);
		/*
		 * Since we didn't or couldn't restart init, we clear
		 * the zone's init state and proceed with exit
		 * processing.
		 */
		z->zone_proc_initpid = -1;
	}

	/*
	 * Allocate a sigqueue now, before we grab locks.
	 * It will be given to sigcld(), below.
	 * Special case:  If we will be making the process disappear
	 * without a trace (for the benefit of posix_spawn() in libc)
	 * don't bother to allocate a useless sigqueue.
	 *
	 * Note that sqp is left unset in the evaporate case; it is only
	 * read again under the same !evaporate test near the end.
	 */
	evaporate = ((p->p_flag & SVFORK) &&
	    why == CLD_EXITED && what == _EVAPORATE);
	if (!evaporate)
		sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP);

	/*
	 * revoke any doors created by the process.
	 */
	if (p->p_door_list)
		door_exit();

	/*
	 * Release schedctl data structures.
	 */
	if (p->p_pagep)
		schedctl_proc_cleanup();

	/*
	 * make sure all pending kaio has completed.
	 */
	if (p->p_aio)
		aio_cleanup_exit();

	/*
	 * discard the lwpchan cache.
	 */
	if (p->p_lcp != NULL)
		lwpchan_destroy_cache(0);

	/*
	 * Clean up any DTrace helper actions or probes for the process.
	 */
	if (p->p_dtrace_helpers != NULL) {
		ASSERT(dtrace_helpers_cleanup != NULL);
		(*dtrace_helpers_cleanup)();
	}

	/* untimeout the realtime timers */
	if (p->p_itimer != NULL)
		timer_exit();

	/* cancel the pending alarm timeout, if any */
	if ((tmp_id = p->p_alarmid) != 0) {
		p->p_alarmid = 0;
		(void) untimeout(tmp_id);
	}

	/*
	 * Remove any fpollinfo_t's for this (last) thread from our file
	 * descriptors so closeall() can ASSERT() that they're all gone.
	 */
	pollcleanup();

	/* remove the p_rprof_cyclic cyclic, if one is installed */
	if (p->p_rprof_cyclic != CYCLIC_NONE) {
		mutex_enter(&cpu_lock);
		cyclic_remove(p->p_rprof_cyclic);
		mutex_exit(&cpu_lock);
	}

	mutex_enter(&p->p_lock);

	/*
	 * Clean up any DTrace probes associated with this process.
	 */
	if (p->p_dtrace_probes) {
		ASSERT(dtrace_fasttrap_exit_ptr != NULL);
		dtrace_fasttrap_exit_ptr(p);
	}

	/*
	 * p_lock is dropped around each untimeout() call, so p_itimerid is
	 * re-examined after the lock has been reacquired.
	 */
	while ((tmp_id = p->p_itimerid) != 0) {
		p->p_itimerid = 0;
		mutex_exit(&p->p_lock);
		(void) untimeout(tmp_id);
		mutex_enter(&p->p_lock);
	}

	lwp_cleanup();

	/*
	 * We are about to exit; prevent our resource associations from
	 * being changed.
	 */
	pool_barrier_enter();

	/*
	 * Block the process against /proc now that we have really
	 * acquired p->p_lock (to manipulate p_tlist at least).
	 */
	prbarrier(p);

	/*
	 * NOTE(review): the conditional block below refers to "code", which
	 * is not a parameter or local of this function; it looks like it
	 * would not compile if SUN_SRC_COMPAT were defined -- confirm.
	 */
#ifdef	SUN_SRC_COMPAT
	if (code == CLD_KILLED)
		u.u_acflag |= AXSIG;
#endif
	/*
	 * Ignore every signal from here on, and throw away everything that
	 * is pending or queued for the process and for this lwp.
	 */
	sigfillset(&p->p_ignore);
	sigemptyset(&p->p_siginfo);
	sigemptyset(&p->p_sig);
	sigemptyset(&p->p_extsig);
	sigemptyset(&t->t_sig);
	sigemptyset(&t->t_extsig);
	sigemptyset(&p->p_sigmask);
	sigdelq(p, t, 0);
	lwp->lwp_cursig = 0;
	lwp->lwp_extsig = 0;
	p->p_flag &= ~(SKILLED | SEXTKILLED);
	if (lwp->lwp_curinfo) {
		siginfofree(lwp->lwp_curinfo);
		lwp->lwp_curinfo = NULL;
	}

	t->t_proc_flag |= TP_LWPEXIT;
	ASSERT(p->p_lwpcnt == 1 && p->p_zombcnt == 0);
	prlwpexit(t);		/* notify /proc */
	lwp_hash_out(p, t->t_tid);
	prexit(p);

	p->p_lwpcnt = 0;
	p->p_tlist = NULL;
	sigqfree(p);
	term_mstate(t);
	p->p_mterm = gethrtime();

	/*
	 * Detach the exec vnodes from the process while p_lock is held;
	 * they are released only after the lock has been dropped.
	 */
	exec_vp = p->p_exec;
	execdir_vp = p->p_execdir;
	p->p_exec = NULLVP;
	p->p_execdir = NULLVP;
	mutex_exit(&p->p_lock);
	if (exec_vp)
		VN_RELE(exec_vp);
	if (execdir_vp)
		VN_RELE(execdir_vp);

	pr_free_watched_pages(p);

	closeall(P_FINFO(p));

	/* Free the controlling tty.  (freectty() always assumes curproc.) */
	ASSERT(p == curproc);
	(void) freectty(B_TRUE);

#if defined(__sparc)
	if (p->p_utraps != NULL)
		utrap_free(p);
#endif
	if (p->p_semacct)			/* IPC semaphore exit */
		semexit(p);
	rv = wstat(why, what);

	/* acct() is given only the low byte of the wait status */
	acct(rv & 0xff);
	exacct_commit_proc(p, rv);

	/*
	 * Release any resources associated with C2 auditing
	 */
#ifdef C2_AUDIT
	if (audit_active) {
		/*
		 * audit exit system call
		 */
		audit_exit(why, what);
	}
#endif

	/*
	 * Free address space.
	 */
	relvm();

	/*
	 * Release held contracts.
	 */
	contract_exit(p);

	/*
	 * Depart our encapsulating process contract.
	 */
	if ((p->p_flag & SSYS) == 0) {
		ASSERT(p->p_ct_process);
		contract_process_exit(p->p_ct_process, p, rv);
	}

	/*
	 * Remove pool association, and block if requested by pool_do_bind.
	 */
	mutex_enter(&p->p_lock);
	ASSERT(p->p_pool->pool_ref > 0);
	atomic_add_32(&p->p_pool->pool_ref, -1);
	p->p_pool = pool_default;
	/*
	 * Now that our address space has been freed and all other threads
	 * in this process have exited, set the PEXITED pool flag.  This
	 * tells the pools subsystems to ignore this process if it was
	 * requested to rebind this process to a new pool.
	 */
	p->p_poolflag |= PEXITED;
	pool_barrier_exit();
	mutex_exit(&p->p_lock);

	/*
	 * pidlock is held from here until after sigcld()/freeproc() below.
	 */
	mutex_enter(&pidlock);

	/*
	 * Delete this process from the newstate list of its parent. We
	 * will put it in the right place in the sigcld in the end.
	 */
	delete_ns(p->p_parent, p);

	/*
	 * Reassign the orphans to the next of kin.
	 * Don't rearrange init's orphanage.
	 */
	if ((q = p->p_orphan) != NULL && p != proc_init) {

		proc_t *nokp = p->p_nextofkin;

		/*
		 * Point every orphan at the new next of kin; the loop
		 * leaves q on the last orphan so that our whole list can
		 * be spliced onto the front of nokp's list.
		 */
		for (;;) {
			q->p_nextofkin = nokp;
			if (q->p_nextorph == NULL)
				break;
			q = q->p_nextorph;
		}
		q->p_nextorph = nokp->p_orphan;
		nokp->p_orphan = p->p_orphan;
		p->p_orphan = NULL;
	}

	/*
	 * Reassign the children to init.
	 * Don't try to assign init's children to init.
	 */
	if ((q = p->p_child) != NULL && p != proc_init) {
		struct proc	*np;
		struct proc	*initp = proc_init;
		boolean_t	setzonetop = B_FALSE;

		if (!INGLOBALZONE(curproc))
			setzonetop = B_TRUE;

		pgdetach(p);

		do {
			/* remember the next sibling before relinking q */
			np = q->p_sibling;
			/*
			 * Delete it from its current parent new state
			 * list and add it to init new state list
			 */
			delete_ns(q->p_parent, q);

			q->p_ppid = 1;
			q->p_pidflag &= ~(CLDNOSIGCHLD | CLDWAITPID);
			if (setzonetop) {
				mutex_enter(&q->p_lock);
				q->p_flag |= SZONETOP;
				mutex_exit(&q->p_lock);
			}
			q->p_parent = initp;

			/*
			 * Since q will be the first child,
			 * it will not have a previous sibling.
			 */
			q->p_psibling = NULL;
			if (initp->p_child) {
				initp->p_child->p_psibling = q;
			}
			q->p_sibling = initp->p_child;
			initp->p_child = q;
			/* a child marked P_PR_PTRACE is sent SIGKILL */
			if (q->p_proc_flag & P_PR_PTRACE) {
				mutex_enter(&q->p_lock);
				sigtoproc(q, NULL, SIGKILL);
				mutex_exit(&q->p_lock);
			}
			/*
			 * sigcld() will add the child to parents
			 * newstate list.
			 */
			if (q->p_stat == SZOMB)
				sigcld(q, NULL);
		} while ((q = np) != NULL);

		p->p_child = NULL;
		ASSERT(p->p_child_ns == NULL);
	}

	TRACE_1(TR_FAC_PROC, TR_PROC_EXIT, "proc_exit: %p", p);

	mutex_enter(&p->p_lock);
	CL_EXIT(curthread); /* tell the scheduler that curthread is exiting */

	/*
	 * Fold the accumulated child totals (p_cutime/p_cstime, p_cacct,
	 * p_cru) into the process's own totals.
	 */
	hrutime = mstate_aggr_state(p, LMS_USER);
	hrstime = mstate_aggr_state(p, LMS_SYSTEM);
	p->p_utime = (clock_t)NSEC_TO_TICK(hrutime) + p->p_cutime;
	p->p_stime = (clock_t)NSEC_TO_TICK(hrstime) + p->p_cstime;

	p->p_acct[LMS_USER]	+= p->p_cacct[LMS_USER];
	p->p_acct[LMS_SYSTEM]	+= p->p_cacct[LMS_SYSTEM];
	p->p_acct[LMS_TRAP]	+= p->p_cacct[LMS_TRAP];
	p->p_acct[LMS_TFAULT]	+= p->p_cacct[LMS_TFAULT];
	p->p_acct[LMS_DFAULT]	+= p->p_cacct[LMS_DFAULT];
	p->p_acct[LMS_KFAULT]	+= p->p_cacct[LMS_KFAULT];
	p->p_acct[LMS_USER_LOCK] += p->p_cacct[LMS_USER_LOCK];
	p->p_acct[LMS_SLEEP]	+= p->p_cacct[LMS_SLEEP];
	p->p_acct[LMS_WAIT_CPU]	+= p->p_cacct[LMS_WAIT_CPU];
	p->p_acct[LMS_STOPPED]	+= p->p_cacct[LMS_STOPPED];

	p->p_ru.minflt	+= p->p_cru.minflt;
	p->p_ru.majflt	+= p->p_cru.majflt;
	p->p_ru.nswap	+= p->p_cru.nswap;
	p->p_ru.inblock	+= p->p_cru.inblock;
	p->p_ru.oublock	+= p->p_cru.oublock;
	p->p_ru.msgsnd	+= p->p_cru.msgsnd;
	p->p_ru.msgrcv	+= p->p_cru.msgrcv;
	p->p_ru.nsignals += p->p_cru.nsignals;
	p->p_ru.nvcsw	+= p->p_cru.nvcsw;
	p->p_ru.nivcsw	+= p->p_cru.nivcsw;
	p->p_ru.sysc	+= p->p_cru.sysc;
	p->p_ru.ioch	+= p->p_cru.ioch;

	/* The process becomes a zombie; record why and what for winfo(). */
	p->p_stat = SZOMB;
	p->p_proc_flag &= ~P_PR_PTRACE;
	p->p_wdata = what;
	p->p_wcode = (char)why;

	/* saved now, released below once pidlock has been dropped */
	cdir = PTOU(p)->u_cdir;
	rdir = PTOU(p)->u_rdir;
	cwd = PTOU(p)->u_cwd;

	/*
	 * Release resource controls, as they are no longer enforceable.
	 */
	rctl_set_free(p->p_rctls);

	/*
	 * Give up task and project memberships.  Decrement tk_nlwps counter
	 * for our task.max-lwps resource control.  An extended accounting
	 * record, if that facility is active, is scheduled to be written.
	 * Zombie processes are false members of task0 for the remainder of
	 * their lifetime; no accounting information is recorded for them.
	 */
	tk = p->p_task;

	mutex_enter(&p->p_zone->zone_nlwps_lock);
	tk->tk_nlwps--;
	tk->tk_proj->kpj_nlwps--;
	p->p_zone->zone_nlwps--;
	mutex_exit(&p->p_zone->zone_nlwps_lock);
	task_detach(p);
	p->p_task = task0p;

	/*
	 * Clear the lwp directory and the lwpid hash table
	 * now that /proc can't bother us any more.
	 * We free the memory below, after dropping p->p_lock.
	 */
	lwpdir = p->p_lwpdir;
	lwpdir_sz = p->p_lwpdir_sz;
	tidhash = p->p_tidhash;
	tidhash_sz = p->p_tidhash_sz;
	p->p_lwpdir = NULL;
	p->p_lwpfree = NULL;
	p->p_lwpdir_sz = 0;
	p->p_tidhash = NULL;
	p->p_tidhash_sz = 0;

	/*
	 * If the process has context ops installed, call the exit routine
	 * on behalf of this last remaining thread. Normally exitpctx() is
	 * called during thread_exit() or lwp_exit(), but because this is the
	 * last thread in the process, we must call it here. By the time
	 * thread_exit() is called (below), the association with the relevant
	 * process has been lost.
	 *
	 * We also free the context here.
	 */
	if (p->p_pctx) {
		kpreempt_disable();
		exitpctx(p);
		kpreempt_enable();

		freepctx(p, 0);
	}

	/*
	 * curthread's proc pointer is changed to point to the 'sched'
	 * process for the corresponding zone, except in the case when
	 * the exiting process is in fact a zsched instance, in which
	 * case the proc pointer is set to p0.  We do so, so that the
	 * process still points at the right zone when we call the VN_RELE()
	 * below.
	 *
	 * This is because curthread's original proc pointer can be freed as
	 * soon as the child sends a SIGCLD to its parent.  We use zsched so
	 * that for user processes, even in the final moments of death, the
	 * process is still associated with its zone.
	 */
	if (p != t->t_procp->p_zone->zone_zsched)
		t->t_procp = t->t_procp->p_zone->zone_zsched;
	else
		t->t_procp = &p0;

	mutex_exit(&p->p_lock);
	if (!evaporate) {
		p->p_pidflag &= ~CLDPEND;
		sigcld(p, sqp);
	} else {
		/*
		 * Do what sigcld() would do if the disposition
		 * of the SIGCHLD signal were set to be ignored.
		 */
		cv_broadcast(&p->p_srwchan_cv);
		freeproc(p);
	}
	mutex_exit(&pidlock);

	/*
	 * We don't release u_cdir and u_rdir until SZOMB is set.
	 * This protects us against dofusers().
	 */
	VN_RELE(cdir);
	if (rdir)
		VN_RELE(rdir);
	if (cwd)
		refstr_rele(cwd);

	/*
	 * task_rele() may ultimately cause the zone to go away (or
	 * may cause the last user process in a zone to go away, which
	 * signals zsched to go away).  So prior to this call, we must
	 * no longer point at zsched.
	 */
	t->t_procp = &p0;
	task_rele(tk);

	/* free the lwp directory and tid hash table detached above */
	kmem_free(lwpdir, lwpdir_sz * sizeof (lwpdir_t));
	kmem_free(tidhash, tidhash_sz * sizeof (lwpdir_t *));

	lwp_pcb_exit();

	thread_exit();
	/* NOTREACHED */
}
838
839/*
840 * Format siginfo structure for wait system calls.
841 */
842void
843winfo(proc_t *pp, k_siginfo_t *ip, int waitflag)
844{
845	ASSERT(MUTEX_HELD(&pidlock));
846
847	bzero(ip, sizeof (k_siginfo_t));
848	ip->si_signo = SIGCLD;
849	ip->si_code = pp->p_wcode;
850	ip->si_pid = pp->p_pid;
851	ip->si_ctid = PRCTID(pp);
852	ip->si_zoneid = pp->p_zone->zone_id;
853	ip->si_status = pp->p_wdata;
854	ip->si_stime = pp->p_stime;
855	ip->si_utime = pp->p_utime;
856
857	if (waitflag) {
858		pp->p_wcode = 0;
859		pp->p_wdata = 0;
860		pp->p_pidflag &= ~CLDPEND;
861	}
862}
863
/*
 * Wait system call.
 * Search for a terminated (zombie) child,
 * finally lay it to rest, and collect its status.
 * Look also for stopped children,
 * and pass back status from them.
 *
 * idtype/id select the children of interest: P_ALL (id unused), P_PID
 * (process id) or P_PGID (process group id).  options is a mask of W*
 * flags; after the obsolete _WNOCHLD bit is stripped it must be non-zero
 * and must not contain bits outside WOPTMASK.  The result is written
 * to *ip.
 *
 * Returns 0 on success (with *ip zeroed if WNOHANG was given and nothing
 * was ready), or an error number: EINVAL for bad arguments, ECHILD if
 * there is nothing to wait for, EINTR if cv_wait_sig_swap() reported an
 * interruption.
 */
int
waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options)
{
	int found;
	proc_t *cp, *pp;
	int proc_gone;
	int waitflag = !(options & WNOWAIT);

	/*
	 * Obsolete flag, defined here only for binary compatibility
	 * with old statically linked executables.  Delete this when
	 * we no longer care about these old and broken applications.
	 */
#define	_WNOCHLD	0400
	options &= ~_WNOCHLD;

	if (options == 0 || (options & ~WOPTMASK))
		return (EINVAL);

	switch (idtype) {
	case P_PID:
	case P_PGID:
		if (id < 0 || id >= maxpid)
			return (EINVAL);
		/* FALLTHROUGH */
	case P_ALL:
		break;
	default:
		return (EINVAL);
	}

	pp = ttoproc(curthread);

	/*
	 * lock parent mutex so that sibling chain can be searched.
	 */
	mutex_enter(&pidlock);

	/*
	 * if we are only looking for exited processes and child_ns list
	 * is empty no reason to look at all children.
	 */
	if (idtype == P_ALL &&
	    (options & ~WNOWAIT) == (WNOHANG | WEXITED) &&
	    pp->p_child_ns == NULL) {
		if (pp->p_child) {
			mutex_exit(&pidlock);
			bzero(ip, sizeof (k_siginfo_t));
			return (0);
		}
		mutex_exit(&pidlock);
		return (ECHILD);
	}

	while (pp->p_child != NULL) {

		proc_gone = 0;

		/*
		 * First pass: the newstate list.  Only exited, dumped or
		 * killed children are expected here; any other wait code
		 * panics.
		 */
		for (cp = pp->p_child_ns; cp != NULL; cp = cp->p_sibling_ns) {
			if (idtype != P_PID && (cp->p_pidflag & CLDWAITPID))
				continue;
			if (idtype == P_PID && id != cp->p_pid)
				continue;
			if (idtype == P_PGID && id != cp->p_pgrp)
				continue;

			switch (cp->p_wcode) {

			case CLD_TRAPPED:
			case CLD_STOPPED:
			case CLD_CONTINUED:
				cmn_err(CE_PANIC,
				    "waitid: wrong state %d on the p_newstate"
				    " list", cp->p_wcode);
				break;

			case CLD_EXITED:
			case CLD_DUMPED:
			case CLD_KILLED:
				if (!(options & WEXITED)) {
					/*
					 * Count how many are already gone
					 * for good.
					 */
					proc_gone++;
					break;
				}
				/*
				 * With WNOWAIT the status is only reported;
				 * otherwise it is consumed and the zombie
				 * is freed.
				 */
				if (!waitflag) {
					winfo(cp, ip, 0);
				} else {
					winfo(cp, ip, 1);
					freeproc(cp);
				}
				mutex_exit(&pidlock);
				if (waitflag) {		/* accept SIGCLD */
					sigcld_delete(ip);
					sigcld_repost();
				}
				return (0);
			}

			if (idtype == P_PID)
				break;
		}

		/*
		 * Wow! None of the threads on the p_sibling_ns list were
		 * interesting threads. Check all the kids!
		 */
		found = 0;
		for (cp = pp->p_child; cp != NULL; cp = cp->p_sibling) {
			if (idtype == P_PID && id != cp->p_pid)
				continue;
			if (idtype == P_PGID && id != cp->p_pgrp)
				continue;

			switch (cp->p_wcode) {
			case CLD_TRAPPED:
				if (!(options & WTRAPPED))
					break;
				winfo(cp, ip, waitflag);
				mutex_exit(&pidlock);
				if (waitflag) {		/* accept SIGCLD */
					sigcld_delete(ip);
					sigcld_repost();
				}
				return (0);

			case CLD_STOPPED:
				if (!(options & WSTOPPED))
					break;
				/* Is it still stopped? */
				mutex_enter(&cp->p_lock);
				if (!jobstopped(cp)) {
					mutex_exit(&cp->p_lock);
					break;
				}
				mutex_exit(&cp->p_lock);
				winfo(cp, ip, waitflag);
				mutex_exit(&pidlock);
				if (waitflag) {		/* accept SIGCLD */
					sigcld_delete(ip);
					sigcld_repost();
				}
				return (0);

			case CLD_CONTINUED:
				if (!(options & WCONTINUED))
					break;
				winfo(cp, ip, waitflag);
				mutex_exit(&pidlock);
				if (waitflag) {		/* accept SIGCLD */
					sigcld_delete(ip);
					sigcld_repost();
				}
				return (0);

			case CLD_EXITED:
			case CLD_DUMPED:
			case CLD_KILLED:
				if (idtype != P_PID &&
				    (cp->p_pidflag & CLDWAITPID))
					continue;
				/*
				 * Don't complain if a process was found in
				 * the first loop but we broke out of the loop
				 * because of the arguments passed to us.
				 */
				if (proc_gone == 0) {
					cmn_err(CE_PANIC,
					    "waitid: wrong state on the"
					    " p_child list");
				} else {
					break;
				}
			}

			/* cp matched the id selection but had nothing to report */
			found++;

			if (idtype == P_PID)
				break;
		}

		/*
		 * If we found no interesting processes at all,
		 * break out and return ECHILD.
		 */
		if (found + proc_gone == 0)
			break;

		if (options & WNOHANG) {
			mutex_exit(&pidlock);
			bzero(ip, sizeof (k_siginfo_t));
			/*
			 * We should set ip->si_signo = SIGCLD,
			 * but there is an SVVS test that expects
			 * ip->si_signo to be zero in this case.
			 */
			return (0);
		}

		/*
		 * If we found no processes of interest that could
		 * change state while we wait, we don't wait at all.
		 * Get out with ECHILD according to SVID.
		 */
		if (found == proc_gone)
			break;

		/*
		 * Sleep on the parent's p_cv with pidlock as the
		 * associated mutex, then rescan from the top.
		 */
		if (!cv_wait_sig_swap(&pp->p_cv, &pidlock)) {
			mutex_exit(&pidlock);
			return (EINTR);
		}
	}
	mutex_exit(&pidlock);
	return (ECHILD);
}
1088
1089/*
1090 * The wait() system call trap is no longer invoked by libc.
1091 * It is retained only for the benefit of statically linked applications.
1092 * Delete this when we no longer care about these old and broken applications.
1093 */
1094int64_t
1095wait(void)
1096{
1097	int error;
1098	k_siginfo_t info;
1099	rval_t	r;
1100
1101	if (error =  waitid(P_ALL, (id_t)0, &info, WEXITED|WTRAPPED))
1102		return (set_errno(error));
1103	r.r_val1 = info.si_pid;
1104	r.r_val2 = wstat(info.si_code, info.si_status);
1105	return (r.r_vals);
1106}
1107
1108int
1109waitsys(idtype_t idtype, id_t id, siginfo_t *infop, int options)
1110{
1111	int error;
1112	k_siginfo_t info;
1113
1114	if (error = waitid(idtype, id, &info, options))
1115		return (set_errno(error));
1116	if (copyout(&info, infop, sizeof (k_siginfo_t)))
1117		return (set_errno(EFAULT));
1118	return (0);
1119}
1120
1121#ifdef _SYSCALL32_IMPL
1122
1123int
1124waitsys32(idtype_t idtype, id_t id, siginfo_t *infop, int options)
1125{
1126	int error;
1127	k_siginfo_t info;
1128	siginfo32_t info32;
1129
1130	if (error = waitid(idtype, id, &info, options))
1131		return (set_errno(error));
1132	siginfo_kto32(&info, &info32);
1133	if (copyout(&info32, infop, sizeof (info32)))
1134		return (set_errno(EFAULT));
1135	return (0);
1136}
1137
1138#endif	/* _SYSCALL32_IMPL */
1139
1140void
1141proc_detach(proc_t *p)
1142{
1143	proc_t *q;
1144
1145	ASSERT(MUTEX_HELD(&pidlock));
1146
1147	q = p->p_parent;
1148	ASSERT(q != NULL);
1149
1150	/*
1151	 * Take it off the newstate list of its parent
1152	 */
1153	delete_ns(q, p);
1154
1155	if (q->p_child == p) {
1156		q->p_child = p->p_sibling;
1157		/*
1158		 * If the parent has no children, it better not
1159		 * have any with new states either!
1160		 */
1161		ASSERT(q->p_child ? 1 : q->p_child_ns == NULL);
1162	}
1163
1164	if (p->p_sibling) {
1165		p->p_sibling->p_psibling = p->p_psibling;
1166	}
1167
1168	if (p->p_psibling) {
1169		p->p_psibling->p_sibling = p->p_sibling;
1170	}
1171}
1172
1173/*
1174 * Remove zombie children from the process table.
1175 */
1176void
1177freeproc(proc_t *p)
1178{
1179	proc_t *q;
1180
1181	ASSERT(p->p_stat == SZOMB);
1182	ASSERT(p->p_tlist == NULL);
1183	ASSERT(MUTEX_HELD(&pidlock));
1184
1185	sigdelq(p, NULL, 0);
1186	if (p->p_killsqp) {
1187		siginfofree(p->p_killsqp);
1188		p->p_killsqp = NULL;
1189	}
1190
1191	prfree(p);	/* inform /proc */
1192
1193	/*
1194	 * Don't free the init processes.
1195	 * Other dying processes will access it.
1196	 */
1197	if (p == proc_init)
1198		return;
1199
1200
1201	/*
1202	 * We wait until now to free the cred structure because a
1203	 * zombie process's credentials may be examined by /proc.
1204	 * No cred locking needed because there are no threads at this point.
1205	 */
1206	upcount_dec(crgetruid(p->p_cred), crgetzoneid(p->p_cred));
1207	crfree(p->p_cred);
1208	if (p->p_corefile != NULL) {
1209		corectl_path_rele(p->p_corefile);
1210		p->p_corefile = NULL;
1211	}
1212	if (p->p_content != NULL) {
1213		corectl_content_rele(p->p_content);
1214		p->p_content = NULL;
1215	}
1216
1217	if (p->p_nextofkin && !((p->p_nextofkin->p_flag & SNOWAIT) ||
1218	    (PTOU(p->p_nextofkin)->u_signal[SIGCLD - 1] == SIG_IGN))) {
1219		/*
1220		 * This should still do the right thing since p_utime/stime
1221		 * get set to the correct value on process exit, so it
1222		 * should get properly updated
1223		 */
1224		p->p_nextofkin->p_cutime += p->p_utime;
1225		p->p_nextofkin->p_cstime += p->p_stime;
1226
1227		p->p_nextofkin->p_cacct[LMS_USER] += p->p_acct[LMS_USER];
1228		p->p_nextofkin->p_cacct[LMS_SYSTEM] += p->p_acct[LMS_SYSTEM];
1229		p->p_nextofkin->p_cacct[LMS_TRAP] += p->p_acct[LMS_TRAP];
1230		p->p_nextofkin->p_cacct[LMS_TFAULT] += p->p_acct[LMS_TFAULT];
1231		p->p_nextofkin->p_cacct[LMS_DFAULT] += p->p_acct[LMS_DFAULT];
1232		p->p_nextofkin->p_cacct[LMS_KFAULT] += p->p_acct[LMS_KFAULT];
1233		p->p_nextofkin->p_cacct[LMS_USER_LOCK]
1234		    += p->p_acct[LMS_USER_LOCK];
1235		p->p_nextofkin->p_cacct[LMS_SLEEP] += p->p_acct[LMS_SLEEP];
1236		p->p_nextofkin->p_cacct[LMS_WAIT_CPU]
1237		    += p->p_acct[LMS_WAIT_CPU];
1238		p->p_nextofkin->p_cacct[LMS_STOPPED] += p->p_acct[LMS_STOPPED];
1239
1240		p->p_nextofkin->p_cru.minflt	+= p->p_ru.minflt;
1241		p->p_nextofkin->p_cru.majflt	+= p->p_ru.majflt;
1242		p->p_nextofkin->p_cru.nswap	+= p->p_ru.nswap;
1243		p->p_nextofkin->p_cru.inblock	+= p->p_ru.inblock;
1244		p->p_nextofkin->p_cru.oublock	+= p->p_ru.oublock;
1245		p->p_nextofkin->p_cru.msgsnd	+= p->p_ru.msgsnd;
1246		p->p_nextofkin->p_cru.msgrcv	+= p->p_ru.msgrcv;
1247		p->p_nextofkin->p_cru.nsignals	+= p->p_ru.nsignals;
1248		p->p_nextofkin->p_cru.nvcsw	+= p->p_ru.nvcsw;
1249		p->p_nextofkin->p_cru.nivcsw	+= p->p_ru.nivcsw;
1250		p->p_nextofkin->p_cru.sysc	+= p->p_ru.sysc;
1251		p->p_nextofkin->p_cru.ioch	+= p->p_ru.ioch;
1252
1253	}
1254
1255	q = p->p_nextofkin;
1256	if (q && q->p_orphan == p)
1257		q->p_orphan = p->p_nextorph;
1258	else if (q) {
1259		for (q = q->p_orphan; q; q = q->p_nextorph)
1260			if (q->p_nextorph == p)
1261				break;
1262		ASSERT(q && q->p_nextorph == p);
1263		q->p_nextorph = p->p_nextorph;
1264	}
1265
1266	proc_detach(p);
1267	pid_exit(p);	/* frees pid and proc structure */
1268}
1269
1270/*
1271 * Delete process "child" from the newstate list of process "parent"
1272 */
1273void
1274delete_ns(proc_t *parent, proc_t *child)
1275{
1276	proc_t **ns;
1277
1278	ASSERT(MUTEX_HELD(&pidlock));
1279	ASSERT(child->p_parent == parent);
1280	for (ns = &parent->p_child_ns; *ns != NULL; ns = &(*ns)->p_sibling_ns) {
1281		if (*ns == child) {
1282
1283			ASSERT((*ns)->p_parent == parent);
1284
1285			*ns = child->p_sibling_ns;
1286			child->p_sibling_ns = NULL;
1287			return;
1288		}
1289	}
1290}
1291
1292/*
1293 * Add process "child" to the new state list of process "parent"
1294 */
1295void
1296add_ns(proc_t *parent, proc_t *child)
1297{
1298	ASSERT(child->p_sibling_ns == NULL);
1299	child->p_sibling_ns = parent->p_child_ns;
1300	parent->p_child_ns = child;
1301}
1302