1178479Sjb/*
2178479Sjb * CDDL HEADER START
3178479Sjb *
4178479Sjb * The contents of this file are subject to the terms of the
5178479Sjb * Common Development and Distribution License (the "License").
6178479Sjb * You may not use this file except in compliance with the License.
7178479Sjb *
8178479Sjb * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9178479Sjb * or http://www.opensolaris.org/os/licensing.
10178479Sjb * See the License for the specific language governing permissions
11178479Sjb * and limitations under the License.
12178479Sjb *
13178479Sjb * When distributing Covered Code, include this CDDL HEADER in each
14178479Sjb * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15178479Sjb * If applicable, add the following below this CDDL HEADER, with the
16178479Sjb * fields enclosed by brackets "[]" replaced with your own identifying
17178479Sjb * information: Portions Copyright [yyyy] [name of copyright owner]
18178479Sjb *
19178479Sjb * CDDL HEADER END
20178479Sjb */
21178479Sjb
22178479Sjb/*
23210767Srpaulo * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24178479Sjb * Use is subject to license terms.
25178479Sjb */
26178479Sjb
27178479Sjb/*
28178479Sjb * DTrace Process Control
29178479Sjb *
30178479Sjb * This file provides a set of routines that permit libdtrace and its clients
31178479Sjb * to create and grab process handles using libproc, and to share these handles
32178479Sjb * between library mechanisms that need libproc access, such as ustack(), and
33178479Sjb * client mechanisms that need libproc access, such as dtrace(1M) -c and -p.
34178479Sjb * The library provides several mechanisms in the libproc control layer:
35178479Sjb *
36178479Sjb * Reference Counting: The library code and client code can independently grab
37178479Sjb * the same process handles without interfering with one another.  Only when
38178479Sjb * the reference count drops to zero and the handle is not being cached (see
39178479Sjb * below for more information on caching) will Prelease() be called on it.
40178479Sjb *
41178479Sjb * Handle Caching: If a handle is grabbed PGRAB_RDONLY (e.g. by ustack()) and
42178479Sjb * the reference count drops to zero, the handle is not immediately released.
43178479Sjb * Instead, libproc handles are maintained on dph_lrulist in order from most-
44178479Sjb * recently accessed to least-recently accessed.  Idle handles are maintained
45178479Sjb * until a pre-defined LRU cache limit is exceeded, permitting repeated calls
46178479Sjb * to ustack() to avoid the overhead of releasing and re-grabbing processes.
47178479Sjb *
48178479Sjb * Process Control: For processes that are grabbed for control (~PGRAB_RDONLY)
49178479Sjb * or created by dt_proc_create(), a control thread is created to provide
50178479Sjb * callbacks on process exit and symbol table caching on dlopen()s.
51178479Sjb *
52178479Sjb * MT-Safety: Libproc is not MT-Safe, so dt_proc_lock() and dt_proc_unlock()
53178479Sjb * are provided to synchronize access to the libproc handle between libdtrace
54178479Sjb * code and client code and the control thread's use of the ps_prochandle.
55178479Sjb *
56178479Sjb * NOTE: MT-Safety is NOT provided for libdtrace itself, or for use of the
57178479Sjb * dtrace_proc_grab/dtrace_proc_create mechanisms.  Like all exported libdtrace
58178479Sjb * calls, these are assumed to be MT-Unsafe.  MT-Safety is ONLY provided for
59178479Sjb * synchronization between libdtrace control threads and the client thread.
60178479Sjb *
61178479Sjb * The ps_prochandles themselves are maintained along with a dt_proc_t struct
62178479Sjb * in a hash table indexed by PID.  This provides basic locking and reference
63178479Sjb * counting.  The dt_proc_t is also maintained in LRU order on dph_lrulist.
64178479Sjb * The dph_lrucnt and dph_lrulim count the number of cacheable processes and
65178479Sjb * the current limit on the number of actively cached entries.
66178479Sjb *
67178479Sjb * The control thread for a process establishes breakpoints at the rtld_db
68178479Sjb * locations of interest, updates mappings and symbol tables at these points,
69178479Sjb * and handles exec and fork (by always following the parent).  The control
70178479Sjb * thread automatically exits when the process dies or control is lost.
71178479Sjb *
72178479Sjb * A simple notification mechanism is provided for libdtrace clients using
73178479Sjb * dtrace_handle_proc() for notification of PS_UNDEAD or PS_LOST events.  If
74178479Sjb * such an event occurs, the dt_proc_t itself is enqueued on a notification
75178479Sjb * list and the control thread broadcasts to dph_cv.  dtrace_sleep() will wake
76178479Sjb * up using this condition and will then call the client handler as necessary.
77178479Sjb */
78178479Sjb
79178479Sjb#include <sys/wait.h>
80297077Smav#ifdef illumos
81178479Sjb#include <sys/lwp.h>
82178565Sjb#endif
83178479Sjb#include <strings.h>
84178479Sjb#include <signal.h>
85178479Sjb#include <assert.h>
86178479Sjb#include <errno.h>
87178479Sjb
88178479Sjb#include <dt_proc.h>
89178479Sjb#include <dt_pid.h>
90178479Sjb#include <dt_impl.h>
91178479Sjb
92297077Smav#ifndef illumos
93211554Srpaulo#include <sys/syscall.h>
94211554Srpaulo#include <libproc_compat.h>
95211554Srpaulo#define	SYS_forksys SYS_fork
96211554Srpaulo#endif
97211554Srpaulo
/*
 * Classify the system call number carried by a stop event.  The argument is
 * parenthesized so that an expression argument is compared as a single unit.
 * Note that IS_SYS_FORK() evaluates its argument twice.
 */
#define	IS_SYS_EXEC(w)	((w) == SYS_execve)
#define	IS_SYS_FORK(w)	((w) == SYS_vfork || (w) == SYS_forksys)
100178479Sjb
101178479Sjbstatic dt_bkpt_t *
102178479Sjbdt_proc_bpcreate(dt_proc_t *dpr, uintptr_t addr, dt_bkpt_f *func, void *data)
103178479Sjb{
104178479Sjb	struct ps_prochandle *P = dpr->dpr_proc;
105178479Sjb	dt_bkpt_t *dbp;
106178479Sjb
107210775Srpaulo	assert(DT_MUTEX_HELD(&dpr->dpr_lock));
108178479Sjb
109178479Sjb	if ((dbp = dt_zalloc(dpr->dpr_hdl, sizeof (dt_bkpt_t))) != NULL) {
110178479Sjb		dbp->dbp_func = func;
111178479Sjb		dbp->dbp_data = data;
112178479Sjb		dbp->dbp_addr = addr;
113178479Sjb
114178479Sjb		if (Psetbkpt(P, dbp->dbp_addr, &dbp->dbp_instr) == 0)
115178479Sjb			dbp->dbp_active = B_TRUE;
116178479Sjb
117178479Sjb		dt_list_append(&dpr->dpr_bps, dbp);
118178479Sjb	}
119178479Sjb
120178479Sjb	return (dbp);
121178479Sjb}
122178479Sjb
123178479Sjbstatic void
124178479Sjbdt_proc_bpdestroy(dt_proc_t *dpr, int delbkpts)
125178479Sjb{
126178479Sjb	int state = Pstate(dpr->dpr_proc);
127178479Sjb	dt_bkpt_t *dbp, *nbp;
128178479Sjb
129210775Srpaulo	assert(DT_MUTEX_HELD(&dpr->dpr_lock));
130178479Sjb
131178479Sjb	for (dbp = dt_list_next(&dpr->dpr_bps); dbp != NULL; dbp = nbp) {
132178479Sjb		if (delbkpts && dbp->dbp_active &&
133178479Sjb		    state != PS_LOST && state != PS_UNDEAD) {
134178479Sjb			(void) Pdelbkpt(dpr->dpr_proc,
135178479Sjb			    dbp->dbp_addr, dbp->dbp_instr);
136178479Sjb		}
137178479Sjb		nbp = dt_list_next(dbp);
138178479Sjb		dt_list_delete(&dpr->dpr_bps, dbp);
139178479Sjb		dt_free(dpr->dpr_hdl, dbp);
140178479Sjb	}
141178479Sjb}
142178479Sjb
143178479Sjbstatic void
144178479Sjbdt_proc_bpmatch(dtrace_hdl_t *dtp, dt_proc_t *dpr)
145178479Sjb{
146297077Smav#ifdef illumos
147178479Sjb	const lwpstatus_t *psp = &Pstatus(dpr->dpr_proc)->pr_lwp;
148211554Srpaulo#else
149211554Srpaulo	unsigned long pc;
150211554Srpaulo#endif
151178479Sjb	dt_bkpt_t *dbp;
152178479Sjb
153210775Srpaulo	assert(DT_MUTEX_HELD(&dpr->dpr_lock));
154178479Sjb
155297077Smav#ifndef illumos
156211554Srpaulo	proc_regget(dpr->dpr_proc, REG_PC, &pc);
157211554Srpaulo	proc_bkptregadj(&pc);
158211554Srpaulo#endif
159211554Srpaulo
160178479Sjb	for (dbp = dt_list_next(&dpr->dpr_bps);
161178479Sjb	    dbp != NULL; dbp = dt_list_next(dbp)) {
162297077Smav#ifdef illumos
163178479Sjb		if (psp->pr_reg[R_PC] == dbp->dbp_addr)
164178479Sjb			break;
165211554Srpaulo#else
166211554Srpaulo		if (pc == dbp->dbp_addr)
167211554Srpaulo			break;
168211554Srpaulo#endif
169178479Sjb	}
170178479Sjb
171178479Sjb	if (dbp == NULL) {
172178479Sjb		dt_dprintf("pid %d: spurious breakpoint wakeup for %lx\n",
173297077Smav#ifdef illumos
174178479Sjb		    (int)dpr->dpr_pid, (ulong_t)psp->pr_reg[R_PC]);
175211554Srpaulo#else
176211554Srpaulo		    (int)dpr->dpr_pid, pc);
177211554Srpaulo#endif
178178479Sjb		return;
179178479Sjb	}
180178479Sjb
181178479Sjb	dt_dprintf("pid %d: hit breakpoint at %lx (%lu)\n",
182178479Sjb	    (int)dpr->dpr_pid, (ulong_t)dbp->dbp_addr, ++dbp->dbp_hits);
183178479Sjb
184178479Sjb	dbp->dbp_func(dtp, dpr, dbp->dbp_data);
185178479Sjb	(void) Pxecbkpt(dpr->dpr_proc, dbp->dbp_instr);
186178479Sjb}
187178479Sjb
188178479Sjbstatic void
189178479Sjbdt_proc_bpenable(dt_proc_t *dpr)
190178479Sjb{
191178479Sjb	dt_bkpt_t *dbp;
192178479Sjb
193210775Srpaulo	assert(DT_MUTEX_HELD(&dpr->dpr_lock));
194178479Sjb
195178479Sjb	for (dbp = dt_list_next(&dpr->dpr_bps);
196178479Sjb	    dbp != NULL; dbp = dt_list_next(dbp)) {
197178479Sjb		if (!dbp->dbp_active && Psetbkpt(dpr->dpr_proc,
198178479Sjb		    dbp->dbp_addr, &dbp->dbp_instr) == 0)
199178479Sjb			dbp->dbp_active = B_TRUE;
200178479Sjb	}
201178479Sjb
202178479Sjb	dt_dprintf("breakpoints enabled\n");
203178479Sjb}
204178479Sjb
205178479Sjbstatic void
206178479Sjbdt_proc_bpdisable(dt_proc_t *dpr)
207178479Sjb{
208178479Sjb	dt_bkpt_t *dbp;
209178479Sjb
210210775Srpaulo	assert(DT_MUTEX_HELD(&dpr->dpr_lock));
211178479Sjb
212178479Sjb	for (dbp = dt_list_next(&dpr->dpr_bps);
213178479Sjb	    dbp != NULL; dbp = dt_list_next(dbp)) {
214178479Sjb		if (dbp->dbp_active && Pdelbkpt(dpr->dpr_proc,
215178479Sjb		    dbp->dbp_addr, dbp->dbp_instr) == 0)
216178479Sjb			dbp->dbp_active = B_FALSE;
217178479Sjb	}
218178479Sjb
219178479Sjb	dt_dprintf("breakpoints disabled\n");
220178479Sjb}
221178479Sjb
222178479Sjbstatic void
223178479Sjbdt_proc_notify(dtrace_hdl_t *dtp, dt_proc_hash_t *dph, dt_proc_t *dpr,
224178479Sjb    const char *msg)
225178479Sjb{
226178479Sjb	dt_proc_notify_t *dprn = dt_alloc(dtp, sizeof (dt_proc_notify_t));
227178479Sjb
228178479Sjb	if (dprn == NULL) {
229178479Sjb		dt_dprintf("failed to allocate notification for %d %s\n",
230178479Sjb		    (int)dpr->dpr_pid, msg);
231178479Sjb	} else {
232178479Sjb		dprn->dprn_dpr = dpr;
233178479Sjb		if (msg == NULL)
234178479Sjb			dprn->dprn_errmsg[0] = '\0';
235178479Sjb		else
236178479Sjb			(void) strlcpy(dprn->dprn_errmsg, msg,
237178479Sjb			    sizeof (dprn->dprn_errmsg));
238178479Sjb
239178479Sjb		(void) pthread_mutex_lock(&dph->dph_lock);
240178479Sjb
241178479Sjb		dprn->dprn_next = dph->dph_notify;
242178479Sjb		dph->dph_notify = dprn;
243178479Sjb
244178479Sjb		(void) pthread_cond_broadcast(&dph->dph_cv);
245178479Sjb		(void) pthread_mutex_unlock(&dph->dph_lock);
246178479Sjb	}
247178479Sjb}
248178479Sjb
249178479Sjb/*
250178479Sjb * Check to see if the control thread was requested to stop when the victim
251178479Sjb * process reached a particular event (why) rather than continuing the victim.
252178479Sjb * If 'why' is set in the stop mask, we wait on dpr_cv for dt_proc_continue().
253178479Sjb * If 'why' is not set, this function returns immediately and does nothing.
254178479Sjb */
255178479Sjbstatic void
256178479Sjbdt_proc_stop(dt_proc_t *dpr, uint8_t why)
257178479Sjb{
258210775Srpaulo	assert(DT_MUTEX_HELD(&dpr->dpr_lock));
259178479Sjb	assert(why != DT_PROC_STOP_IDLE);
260178479Sjb
261178479Sjb	if (dpr->dpr_stop & why) {
262178479Sjb		dpr->dpr_stop |= DT_PROC_STOP_IDLE;
263178479Sjb		dpr->dpr_stop &= ~why;
264178479Sjb
265178479Sjb		(void) pthread_cond_broadcast(&dpr->dpr_cv);
266178479Sjb
267178479Sjb		/*
268178479Sjb		 * We disable breakpoints while stopped to preserve the
269178479Sjb		 * integrity of the program text for both our own disassembly
270178479Sjb		 * and that of the kernel.
271178479Sjb		 */
272178479Sjb		dt_proc_bpdisable(dpr);
273178479Sjb
274178479Sjb		while (dpr->dpr_stop & DT_PROC_STOP_IDLE)
275178479Sjb			(void) pthread_cond_wait(&dpr->dpr_cv, &dpr->dpr_lock);
276178479Sjb
277178479Sjb		dt_proc_bpenable(dpr);
278178479Sjb	}
279178479Sjb}
280178479Sjb
281178479Sjb/*ARGSUSED*/
282178479Sjbstatic void
283178479Sjbdt_proc_bpmain(dtrace_hdl_t *dtp, dt_proc_t *dpr, const char *fname)
284178479Sjb{
285178479Sjb	dt_dprintf("pid %d: breakpoint at %s()\n", (int)dpr->dpr_pid, fname);
286178479Sjb	dt_proc_stop(dpr, DT_PROC_STOP_MAIN);
287178479Sjb}
288178479Sjb
289178479Sjbstatic void
290178479Sjbdt_proc_rdevent(dtrace_hdl_t *dtp, dt_proc_t *dpr, const char *evname)
291178479Sjb{
292178479Sjb	rd_event_msg_t rdm;
293178479Sjb	rd_err_e err;
294178479Sjb
295178479Sjb	if ((err = rd_event_getmsg(dpr->dpr_rtld, &rdm)) != RD_OK) {
296178479Sjb		dt_dprintf("pid %d: failed to get %s event message: %s\n",
297178479Sjb		    (int)dpr->dpr_pid, evname, rd_errstr(err));
298178479Sjb		return;
299178479Sjb	}
300178479Sjb
301178479Sjb	dt_dprintf("pid %d: rtld event %s type=%d state %d\n",
302178479Sjb	    (int)dpr->dpr_pid, evname, rdm.type, rdm.u.state);
303178479Sjb
304178479Sjb	switch (rdm.type) {
305178479Sjb	case RD_DLACTIVITY:
306178479Sjb		if (rdm.u.state != RD_CONSISTENT)
307178479Sjb			break;
308178479Sjb
309178479Sjb		Pupdate_syms(dpr->dpr_proc);
310178479Sjb		if (dt_pid_create_probes_module(dtp, dpr) != 0)
311178479Sjb			dt_proc_notify(dtp, dtp->dt_procs, dpr,
312178479Sjb			    dpr->dpr_errmsg);
313178479Sjb
314178479Sjb		break;
315178479Sjb	case RD_PREINIT:
316178479Sjb		Pupdate_syms(dpr->dpr_proc);
317178479Sjb		dt_proc_stop(dpr, DT_PROC_STOP_PREINIT);
318178479Sjb		break;
319178479Sjb	case RD_POSTINIT:
320178479Sjb		Pupdate_syms(dpr->dpr_proc);
321178479Sjb		dt_proc_stop(dpr, DT_PROC_STOP_POSTINIT);
322178479Sjb		break;
323178479Sjb	}
324178479Sjb}
325178479Sjb
326178479Sjbstatic void
327178479Sjbdt_proc_rdwatch(dt_proc_t *dpr, rd_event_e event, const char *evname)
328178479Sjb{
329178479Sjb	rd_notify_t rdn;
330178479Sjb	rd_err_e err;
331178479Sjb
332178479Sjb	if ((err = rd_event_addr(dpr->dpr_rtld, event, &rdn)) != RD_OK) {
333178479Sjb		dt_dprintf("pid %d: failed to get event address for %s: %s\n",
334178479Sjb		    (int)dpr->dpr_pid, evname, rd_errstr(err));
335178479Sjb		return;
336178479Sjb	}
337178479Sjb
338178479Sjb	if (rdn.type != RD_NOTIFY_BPT) {
339178479Sjb		dt_dprintf("pid %d: event %s has unexpected type %d\n",
340178479Sjb		    (int)dpr->dpr_pid, evname, rdn.type);
341178479Sjb		return;
342178479Sjb	}
343178479Sjb
344178479Sjb	(void) dt_proc_bpcreate(dpr, rdn.u.bptaddr,
345297077Smav#ifdef illumos
346178479Sjb	    (dt_bkpt_f *)dt_proc_rdevent, (void *)evname);
347211554Srpaulo#else
348211554Srpaulo	    /* XXX ugly */
349211554Srpaulo	    (dt_bkpt_f *)dt_proc_rdevent, __DECONST(void *, evname));
350211554Srpaulo#endif
351178479Sjb}
352178479Sjb
353178479Sjb/*
354178479Sjb * Common code for enabling events associated with the run-time linker after
355178479Sjb * attaching to a process or after a victim process completes an exec(2).
356178479Sjb */
357178479Sjbstatic void
358178479Sjbdt_proc_attach(dt_proc_t *dpr, int exec)
359178479Sjb{
360297077Smav#ifdef illumos
361178479Sjb	const pstatus_t *psp = Pstatus(dpr->dpr_proc);
362211554Srpaulo#endif
363178479Sjb	rd_err_e err;
364178479Sjb	GElf_Sym sym;
365178479Sjb
366210775Srpaulo	assert(DT_MUTEX_HELD(&dpr->dpr_lock));
367178479Sjb
368178479Sjb	if (exec) {
369297077Smav#ifdef illumos
370178479Sjb		if (psp->pr_lwp.pr_errno != 0)
371178479Sjb			return; /* exec failed: nothing needs to be done */
372211554Srpaulo#endif
373178479Sjb
374178479Sjb		dt_proc_bpdestroy(dpr, B_FALSE);
375297077Smav#ifdef illumos
376178479Sjb		Preset_maps(dpr->dpr_proc);
377211554Srpaulo#endif
378178479Sjb	}
379178479Sjb	if ((dpr->dpr_rtld = Prd_agent(dpr->dpr_proc)) != NULL &&
380178479Sjb	    (err = rd_event_enable(dpr->dpr_rtld, B_TRUE)) == RD_OK) {
381297077Smav#ifdef illumos
382178479Sjb		dt_proc_rdwatch(dpr, RD_PREINIT, "RD_PREINIT");
383211554Srpaulo#endif
384178479Sjb		dt_proc_rdwatch(dpr, RD_POSTINIT, "RD_POSTINIT");
385297077Smav#ifdef illumos
386178479Sjb		dt_proc_rdwatch(dpr, RD_DLACTIVITY, "RD_DLACTIVITY");
387211554Srpaulo#endif
388178479Sjb	} else {
389178479Sjb		dt_dprintf("pid %d: failed to enable rtld events: %s\n",
390178479Sjb		    (int)dpr->dpr_pid, dpr->dpr_rtld ? rd_errstr(err) :
391178479Sjb		    "rtld_db agent initialization failed");
392178479Sjb	}
393178479Sjb
394178479Sjb	Pupdate_maps(dpr->dpr_proc);
395178479Sjb
396178479Sjb	if (Pxlookup_by_name(dpr->dpr_proc, LM_ID_BASE,
397178479Sjb	    "a.out", "main", &sym, NULL) == 0) {
398178479Sjb		(void) dt_proc_bpcreate(dpr, (uintptr_t)sym.st_value,
399178479Sjb		    (dt_bkpt_f *)dt_proc_bpmain, "a.out`main");
400178479Sjb	} else {
401178479Sjb		dt_dprintf("pid %d: failed to find a.out`main: %s\n",
402178479Sjb		    (int)dpr->dpr_pid, strerror(errno));
403178479Sjb	}
404178479Sjb}
405178479Sjb
406178479Sjb/*
407178479Sjb * Wait for a stopped process to be set running again by some other debugger.
408178479Sjb * This is typically not required by /proc-based debuggers, since the usual
409178479Sjb * model is that one debugger controls one victim.  But DTrace, as usual, has
410178479Sjb * its own needs: the stop() action assumes that prun(1) or some other tool
411178479Sjb * will be applied to resume the victim process.  This could be solved by
412178479Sjb * adding a PCWRUN directive to /proc, but that seems like overkill unless
413178479Sjb * other debuggers end up needing this functionality, so we implement a cheap
414178479Sjb * equivalent to PCWRUN using the set of existing kernel mechanisms.
415178479Sjb *
416178479Sjb * Our intent is really not just to wait for the victim to run, but rather to
417178479Sjb * wait for it to run and then stop again for a reason other than the current
418178479Sjb * PR_REQUESTED stop.  Since PCWSTOP/Pstopstatus() can be applied repeatedly
419178479Sjb * to a stopped process and will return the same result without affecting the
420178479Sjb * victim, we can just perform these operations repeatedly until Pstate()
421178479Sjb * changes, the representative LWP ID changes, or the stop timestamp advances.
422178479Sjb * dt_proc_control() will then rediscover the new state and continue as usual.
423178479Sjb * When the process is still stopped in the same exact state, we sleep for a
424178479Sjb * brief interval before waiting again so as not to spin consuming CPU cycles.
425178479Sjb */
426178479Sjbstatic void
427178479Sjbdt_proc_waitrun(dt_proc_t *dpr)
428178479Sjb{
429211554Srpauloprintf("%s:%s(%d): DOODAD\n",__FUNCTION__,__FILE__,__LINE__);
430211554Srpaulo#ifdef DOODAD
431178479Sjb	struct ps_prochandle *P = dpr->dpr_proc;
432178479Sjb	const lwpstatus_t *psp = &Pstatus(P)->pr_lwp;
433178479Sjb
434178479Sjb	int krflag = psp->pr_flags & (PR_KLC | PR_RLC);
435178479Sjb	timestruc_t tstamp = psp->pr_tstamp;
436178479Sjb	lwpid_t lwpid = psp->pr_lwpid;
437178479Sjb
438178479Sjb	const long wstop = PCWSTOP;
439178479Sjb	int pfd = Pctlfd(P);
440178479Sjb
441210775Srpaulo	assert(DT_MUTEX_HELD(&dpr->dpr_lock));
442178479Sjb	assert(psp->pr_flags & PR_STOPPED);
443178479Sjb	assert(Pstate(P) == PS_STOP);
444178479Sjb
445178479Sjb	/*
446178479Sjb	 * While we are waiting for the victim to run, clear PR_KLC and PR_RLC
447178479Sjb	 * so that if the libdtrace client is killed, the victim stays stopped.
448178479Sjb	 * dt_proc_destroy() will also observe this and perform PRELEASE_HANG.
449178479Sjb	 */
450178479Sjb	(void) Punsetflags(P, krflag);
451178479Sjb	Psync(P);
452178479Sjb
453178479Sjb	(void) pthread_mutex_unlock(&dpr->dpr_lock);
454178479Sjb
455178479Sjb	while (!dpr->dpr_quit) {
456178479Sjb		if (write(pfd, &wstop, sizeof (wstop)) == -1 && errno == EINTR)
457178479Sjb			continue; /* check dpr_quit and continue waiting */
458178479Sjb
459178479Sjb		(void) pthread_mutex_lock(&dpr->dpr_lock);
460178479Sjb		(void) Pstopstatus(P, PCNULL, 0);
461178479Sjb		psp = &Pstatus(P)->pr_lwp;
462178479Sjb
463178479Sjb		/*
464178479Sjb		 * If we've reached a new state, found a new representative, or
465178479Sjb		 * the stop timestamp has changed, restore PR_KLC/PR_RLC to its
466178479Sjb		 * original setting and then return with dpr_lock held.
467178479Sjb		 */
468178479Sjb		if (Pstate(P) != PS_STOP || psp->pr_lwpid != lwpid ||
469178479Sjb		    bcmp(&psp->pr_tstamp, &tstamp, sizeof (tstamp)) != 0) {
470178479Sjb			(void) Psetflags(P, krflag);
471178479Sjb			Psync(P);
472178479Sjb			return;
473178479Sjb		}
474178479Sjb
475178479Sjb		(void) pthread_mutex_unlock(&dpr->dpr_lock);
476178479Sjb		(void) poll(NULL, 0, MILLISEC / 2);
477178479Sjb	}
478178479Sjb
479178479Sjb	(void) pthread_mutex_lock(&dpr->dpr_lock);
480211554Srpaulo#endif
481178479Sjb}
482178479Sjb
483178479Sjbtypedef struct dt_proc_control_data {
484178479Sjb	dtrace_hdl_t *dpcd_hdl;			/* DTrace handle */
485178479Sjb	dt_proc_t *dpcd_proc;			/* proccess to control */
486178479Sjb} dt_proc_control_data_t;
487178479Sjb
488178479Sjb/*
489178479Sjb * Main loop for all victim process control threads.  We initialize all the
490178479Sjb * appropriate /proc control mechanisms, and then enter a loop waiting for
491178479Sjb * the process to stop on an event or die.  We process any events by calling
492178479Sjb * appropriate subroutines, and exit when the victim dies or we lose control.
493178479Sjb *
494178479Sjb * The control thread synchronizes the use of dpr_proc with other libdtrace
495178479Sjb * threads using dpr_lock.  We hold the lock for all of our operations except
496178479Sjb * waiting while the process is running: this is accomplished by writing a
497178479Sjb * PCWSTOP directive directly to the underlying /proc/<pid>/ctl file.  If the
498178479Sjb * libdtrace client wishes to exit or abort our wait, SIGCANCEL can be used.
499178479Sjb */
500178479Sjbstatic void *
501178479Sjbdt_proc_control(void *arg)
502178479Sjb{
503178479Sjb	dt_proc_control_data_t *datap = arg;
504178479Sjb	dtrace_hdl_t *dtp = datap->dpcd_hdl;
505178479Sjb	dt_proc_t *dpr = datap->dpcd_proc;
506249573Spfg	dt_proc_hash_t *dph = dpr->dpr_hdl->dt_procs;
507178479Sjb	struct ps_prochandle *P = dpr->dpr_proc;
508178565Sjb	int pid = dpr->dpr_pid;
509178479Sjb
510297077Smav#ifdef illumos
511178479Sjb	int pfd = Pctlfd(P);
512178479Sjb
513178479Sjb	const long wstop = PCWSTOP;
514178565Sjb#endif
515178479Sjb	int notify = B_FALSE;
516178479Sjb
517178479Sjb	/*
518178479Sjb	 * We disable the POSIX thread cancellation mechanism so that the
519178479Sjb	 * client program using libdtrace can't accidentally cancel our thread.
520178479Sjb	 * dt_proc_destroy() uses SIGCANCEL explicitly to simply poke us out
521178479Sjb	 * of PCWSTOP with EINTR, at which point we will see dpr_quit and exit.
522178479Sjb	 */
523178479Sjb	(void) pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL);
524178479Sjb
525178479Sjb	/*
526178479Sjb	 * Set up the corresponding process for tracing by libdtrace.  We want
527178479Sjb	 * to be able to catch breakpoints and efficiently single-step over
528178479Sjb	 * them, and we need to enable librtld_db to watch libdl activity.
529178479Sjb	 */
530178479Sjb	(void) pthread_mutex_lock(&dpr->dpr_lock);
531178479Sjb
532297077Smav#ifdef illumos
533178479Sjb	(void) Punsetflags(P, PR_ASYNC);	/* require synchronous mode */
534178479Sjb	(void) Psetflags(P, PR_BPTADJ);		/* always adjust eip on x86 */
535178479Sjb	(void) Punsetflags(P, PR_FORK);		/* do not inherit on fork */
536178479Sjb
537178479Sjb	(void) Pfault(P, FLTBPT, B_TRUE);	/* always trace breakpoints */
538178479Sjb	(void) Pfault(P, FLTTRACE, B_TRUE);	/* always trace single-step */
539178479Sjb
540178479Sjb	/*
541178479Sjb	 * We must trace exit from exec() system calls so that if the exec is
542178479Sjb	 * successful, we can reset our breakpoints and re-initialize libproc.
543178479Sjb	 */
544178479Sjb	(void) Psysexit(P, SYS_execve, B_TRUE);
545178479Sjb
546178479Sjb	/*
547178479Sjb	 * We must trace entry and exit for fork() system calls in order to
548178479Sjb	 * disable our breakpoints temporarily during the fork.  We do not set
549178479Sjb	 * the PR_FORK flag, so if fork succeeds the child begins executing and
550178479Sjb	 * does not inherit any other tracing behaviors or a control thread.
551178479Sjb	 */
552178479Sjb	(void) Psysentry(P, SYS_vfork, B_TRUE);
553178479Sjb	(void) Psysexit(P, SYS_vfork, B_TRUE);
554178479Sjb	(void) Psysentry(P, SYS_forksys, B_TRUE);
555178479Sjb	(void) Psysexit(P, SYS_forksys, B_TRUE);
556178479Sjb
557178479Sjb	Psync(P);				/* enable all /proc changes */
558211554Srpaulo#endif
559178479Sjb	dt_proc_attach(dpr, B_FALSE);		/* enable rtld breakpoints */
560178479Sjb
561178479Sjb	/*
562178479Sjb	 * If PR_KLC is set, we created the process; otherwise we grabbed it.
563178479Sjb	 * Check for an appropriate stop request and wait for dt_proc_continue.
564178479Sjb	 */
565297077Smav#ifdef illumos
566178479Sjb	if (Pstatus(P)->pr_flags & PR_KLC)
567211554Srpaulo#else
568211554Srpaulo	if (proc_getflags(P) & PR_KLC)
569211554Srpaulo#endif
570178479Sjb		dt_proc_stop(dpr, DT_PROC_STOP_CREATE);
571178479Sjb	else
572178479Sjb		dt_proc_stop(dpr, DT_PROC_STOP_GRAB);
573178479Sjb
574178479Sjb	if (Psetrun(P, 0, 0) == -1) {
575178479Sjb		dt_dprintf("pid %d: failed to set running: %s\n",
576178479Sjb		    (int)dpr->dpr_pid, strerror(errno));
577178479Sjb	}
578178479Sjb
579178479Sjb	(void) pthread_mutex_unlock(&dpr->dpr_lock);
580178479Sjb
581178479Sjb	/*
582178479Sjb	 * Wait for the process corresponding to this control thread to stop,
583178479Sjb	 * process the event, and then set it running again.  We want to sleep
584178479Sjb	 * with dpr_lock *unheld* so that other parts of libdtrace can use the
585178479Sjb	 * ps_prochandle in the meantime (e.g. ustack()).  To do this, we write
586178479Sjb	 * a PCWSTOP directive directly to the underlying /proc/<pid>/ctl file.
587178479Sjb	 * Once the process stops, we wake up, grab dpr_lock, and then call
588178479Sjb	 * Pwait() (which will return immediately) and do our processing.
589178479Sjb	 */
590178479Sjb	while (!dpr->dpr_quit) {
591178479Sjb		const lwpstatus_t *psp;
592178479Sjb
593297077Smav#ifdef illumos
594178479Sjb		if (write(pfd, &wstop, sizeof (wstop)) == -1 && errno == EINTR)
595178479Sjb			continue; /* check dpr_quit and continue waiting */
596178565Sjb#else
597178565Sjb		/* Wait for the process to report status. */
598210695Srpaulo		proc_wstatus(P);
599211554Srpaulo		if (errno == EINTR)
600211554Srpaulo			continue; /* check dpr_quit and continue waiting */
601178565Sjb#endif
602178479Sjb
603178479Sjb		(void) pthread_mutex_lock(&dpr->dpr_lock);
604178565Sjb
605297077Smav#ifdef illumos
606178479Sjbpwait_locked:
607178479Sjb		if (Pstopstatus(P, PCNULL, 0) == -1 && errno == EINTR) {
608178479Sjb			(void) pthread_mutex_unlock(&dpr->dpr_lock);
609178479Sjb			continue; /* check dpr_quit and continue waiting */
610178479Sjb		}
611178565Sjb#endif
612178479Sjb
613211554Srpaulo		switch (Pstate(P)) {
614211554Srpaulo		case PS_STOP:
615297077Smav#ifdef illumos
616211554Srpaulo			psp = &Pstatus(P)->pr_lwp;
617178565Sjb#else
618211554Srpaulo			psp = proc_getlwpstatus(P);
619178565Sjb#endif
620178479Sjb
621178479Sjb			dt_dprintf("pid %d: proc stopped showing %d/%d\n",
622178479Sjb			    pid, psp->pr_why, psp->pr_what);
623178479Sjb
624178479Sjb			/*
625178479Sjb			 * If the process stops showing PR_REQUESTED, then the
626178479Sjb			 * DTrace stop() action was applied to it or another
627178479Sjb			 * debugging utility (e.g. pstop(1)) asked it to stop.
628178479Sjb			 * In either case, the user's intention is for the
629178479Sjb			 * process to remain stopped until another external
630178479Sjb			 * mechanism (e.g. prun(1)) is applied.  So instead of
631178479Sjb			 * setting the process running ourself, we wait for
632178479Sjb			 * someone else to do so.  Once that happens, we return
633178479Sjb			 * to our normal loop waiting for an event of interest.
634178479Sjb			 */
635178479Sjb			if (psp->pr_why == PR_REQUESTED) {
636178479Sjb				dt_proc_waitrun(dpr);
637178479Sjb				(void) pthread_mutex_unlock(&dpr->dpr_lock);
638178479Sjb				continue;
639178479Sjb			}
640178479Sjb
641178479Sjb			/*
642178479Sjb			 * If the process stops showing one of the events that
643178479Sjb			 * we are tracing, perform the appropriate response.
644178479Sjb			 * Note that we ignore PR_SUSPENDED, PR_CHECKPOINT, and
645178479Sjb			 * PR_JOBCONTROL by design: if one of these conditions
646178479Sjb			 * occurs, we will fall through to Psetrun() but the
647178479Sjb			 * process will remain stopped in the kernel by the
648178479Sjb			 * corresponding mechanism (e.g. job control stop).
649178479Sjb			 */
650178479Sjb			if (psp->pr_why == PR_FAULTED && psp->pr_what == FLTBPT)
651178479Sjb				dt_proc_bpmatch(dtp, dpr);
652178479Sjb			else if (psp->pr_why == PR_SYSENTRY &&
653178479Sjb			    IS_SYS_FORK(psp->pr_what))
654178479Sjb				dt_proc_bpdisable(dpr);
655178479Sjb			else if (psp->pr_why == PR_SYSEXIT &&
656178479Sjb			    IS_SYS_FORK(psp->pr_what))
657178479Sjb				dt_proc_bpenable(dpr);
658178479Sjb			else if (psp->pr_why == PR_SYSEXIT &&
659178479Sjb			    IS_SYS_EXEC(psp->pr_what))
660178479Sjb				dt_proc_attach(dpr, B_TRUE);
661178479Sjb			break;
662178479Sjb
663178479Sjb		case PS_LOST:
664297077Smav#ifdef illumos
665178479Sjb			if (Preopen(P) == 0)
666178479Sjb				goto pwait_locked;
667178565Sjb#endif
668178479Sjb
669178479Sjb			dt_dprintf("pid %d: proc lost: %s\n",
670178479Sjb			    pid, strerror(errno));
671178479Sjb
672178479Sjb			dpr->dpr_quit = B_TRUE;
673178479Sjb			notify = B_TRUE;
674178479Sjb			break;
675178479Sjb
676178479Sjb		case PS_UNDEAD:
677178479Sjb			dt_dprintf("pid %d: proc died\n", pid);
678178479Sjb			dpr->dpr_quit = B_TRUE;
679178479Sjb			notify = B_TRUE;
680178479Sjb			break;
681178479Sjb		}
682178479Sjb
683178479Sjb		if (Pstate(P) != PS_UNDEAD && Psetrun(P, 0, 0) == -1) {
684178479Sjb			dt_dprintf("pid %d: failed to set running: %s\n",
685178479Sjb			    (int)dpr->dpr_pid, strerror(errno));
686178479Sjb		}
687178479Sjb
688178479Sjb		(void) pthread_mutex_unlock(&dpr->dpr_lock);
689178479Sjb	}
690178479Sjb
691178479Sjb	/*
692178479Sjb	 * If the control thread detected PS_UNDEAD or PS_LOST, then enqueue
693178479Sjb	 * the dt_proc_t structure on the dt_proc_hash_t notification list.
694178479Sjb	 */
695178479Sjb	if (notify)
696178479Sjb		dt_proc_notify(dtp, dph, dpr, NULL);
697178479Sjb
698178479Sjb	/*
699178479Sjb	 * Destroy and remove any remaining breakpoints, set dpr_done and clear
700178479Sjb	 * dpr_tid to indicate the control thread has exited, and notify any
701178479Sjb	 * waiting thread in dt_proc_destroy() that we have succesfully exited.
702178479Sjb	 */
703178479Sjb	(void) pthread_mutex_lock(&dpr->dpr_lock);
704178479Sjb
705178479Sjb	dt_proc_bpdestroy(dpr, B_TRUE);
706178479Sjb	dpr->dpr_done = B_TRUE;
707178479Sjb	dpr->dpr_tid = 0;
708178479Sjb
709178479Sjb	(void) pthread_cond_broadcast(&dpr->dpr_cv);
710178479Sjb	(void) pthread_mutex_unlock(&dpr->dpr_lock);
711178479Sjb
712178479Sjb	return (NULL);
713178479Sjb}
714178479Sjb
715178479Sjb/*PRINTFLIKE3*/
716178479Sjbstatic struct ps_prochandle *
717178479Sjbdt_proc_error(dtrace_hdl_t *dtp, dt_proc_t *dpr, const char *format, ...)
718178479Sjb{
719178479Sjb	va_list ap;
720178479Sjb
721178479Sjb	va_start(ap, format);
722178479Sjb	dt_set_errmsg(dtp, NULL, NULL, NULL, 0, format, ap);
723178479Sjb	va_end(ap);
724178479Sjb
725178479Sjb	if (dpr->dpr_proc != NULL)
726178479Sjb		Prelease(dpr->dpr_proc, 0);
727178479Sjb
728178479Sjb	dt_free(dtp, dpr);
729178479Sjb	(void) dt_set_errno(dtp, EDT_COMPILER);
730178479Sjb	return (NULL);
731178479Sjb}
732178479Sjb
733178479Sjbdt_proc_t *
734178479Sjbdt_proc_lookup(dtrace_hdl_t *dtp, struct ps_prochandle *P, int remove)
735178479Sjb{
736178479Sjb	dt_proc_hash_t *dph = dtp->dt_procs;
737297077Smav#ifdef illumos
738178479Sjb	pid_t pid = Pstatus(P)->pr_pid;
739178565Sjb#else
740178565Sjb	pid_t pid = proc_getpid(P);
741178565Sjb#endif
742178479Sjb	dt_proc_t *dpr, **dpp = &dph->dph_hash[pid & (dph->dph_hashlen - 1)];
743178479Sjb
744178479Sjb	for (dpr = *dpp; dpr != NULL; dpr = dpr->dpr_hash) {
745178479Sjb		if (dpr->dpr_pid == pid)
746178479Sjb			break;
747178479Sjb		else
748178479Sjb			dpp = &dpr->dpr_hash;
749178479Sjb	}
750178479Sjb
751178479Sjb	assert(dpr != NULL);
752178479Sjb	assert(dpr->dpr_proc == P);
753178479Sjb
754178479Sjb	if (remove)
755178479Sjb		*dpp = dpr->dpr_hash; /* remove from pid hash chain */
756178479Sjb
757178479Sjb	return (dpr);
758178479Sjb}
759178479Sjb
760178479Sjbstatic void
761178479Sjbdt_proc_destroy(dtrace_hdl_t *dtp, struct ps_prochandle *P)
762178479Sjb{
763178479Sjb	dt_proc_t *dpr = dt_proc_lookup(dtp, P, B_FALSE);
764178479Sjb	dt_proc_hash_t *dph = dtp->dt_procs;
765178479Sjb	dt_proc_notify_t *npr, **npp;
766178479Sjb	int rflag;
767178479Sjb
768178479Sjb	assert(dpr != NULL);
769178479Sjb
770178479Sjb	/*
771178479Sjb	 * If neither PR_KLC nor PR_RLC is set, then the process is stopped by
772178479Sjb	 * an external debugger and we were waiting in dt_proc_waitrun().
773178479Sjb	 * Leave the process in this condition using PRELEASE_HANG.
774178479Sjb	 */
775297077Smav#ifdef illumos
776178479Sjb	if (!(Pstatus(dpr->dpr_proc)->pr_flags & (PR_KLC | PR_RLC))) {
777178565Sjb#else
778178565Sjb	if (!(proc_getflags(dpr->dpr_proc) & (PR_KLC | PR_RLC))) {
779178565Sjb#endif
780178479Sjb		dt_dprintf("abandoning pid %d\n", (int)dpr->dpr_pid);
781210767Srpaulo		rflag = PRELEASE_HANG;
782297077Smav#ifdef illumos
783210767Srpaulo	} else if (Pstatus(dpr->dpr_proc)->pr_flags & PR_KLC) {
784178565Sjb#else
785210767Srpaulo	} else if (proc_getflags(dpr->dpr_proc) & PR_KLC) {
786178565Sjb#endif
787210767Srpaulo		dt_dprintf("killing pid %d\n", (int)dpr->dpr_pid);
788210767Srpaulo		rflag = PRELEASE_KILL; /* apply kill-on-last-close */
789178479Sjb	} else {
790178479Sjb		dt_dprintf("releasing pid %d\n", (int)dpr->dpr_pid);
791210767Srpaulo		rflag = 0; /* apply run-on-last-close */
792178479Sjb	}
793178479Sjb
794178479Sjb	if (dpr->dpr_tid) {
795178479Sjb		/*
796178479Sjb		 * Set the dpr_quit flag to tell the daemon thread to exit.  We
797178479Sjb		 * send it a SIGCANCEL to poke it out of PCWSTOP or any other
798178479Sjb		 * long-term /proc system call.  Our daemon threads have POSIX
799178479Sjb		 * cancellation disabled, so EINTR will be the only effect.  We
800178479Sjb		 * then wait for dpr_done to indicate the thread has exited.
801178479Sjb		 *
802178479Sjb		 * We can't use pthread_kill() to send SIGCANCEL because the
803178479Sjb		 * interface forbids it and we can't use pthread_cancel()
804178479Sjb		 * because with cancellation disabled it won't actually
805178479Sjb		 * send SIGCANCEL to the target thread, so we use _lwp_kill()
806178479Sjb		 * to do the job.  This is all built on evil knowledge of
807178479Sjb		 * the details of the cancellation mechanism in libc.
808178479Sjb		 */
809178479Sjb		(void) pthread_mutex_lock(&dpr->dpr_lock);
810178479Sjb		dpr->dpr_quit = B_TRUE;
811297077Smav#ifdef illumos
812178479Sjb		(void) _lwp_kill(dpr->dpr_tid, SIGCANCEL);
813178565Sjb#else
814234234Sgnn		pthread_kill(dpr->dpr_tid, SIGTHR);
815178565Sjb#endif
816178479Sjb
817178479Sjb		/*
818178479Sjb		 * If the process is currently idling in dt_proc_stop(), re-
819178479Sjb		 * enable breakpoints and poke it into running again.
820178479Sjb		 */
821178479Sjb		if (dpr->dpr_stop & DT_PROC_STOP_IDLE) {
822178479Sjb			dt_proc_bpenable(dpr);
823178479Sjb			dpr->dpr_stop &= ~DT_PROC_STOP_IDLE;
824178479Sjb			(void) pthread_cond_broadcast(&dpr->dpr_cv);
825178479Sjb		}
826178479Sjb
827178479Sjb		while (!dpr->dpr_done)
828178479Sjb			(void) pthread_cond_wait(&dpr->dpr_cv, &dpr->dpr_lock);
829178479Sjb
830178479Sjb		(void) pthread_mutex_unlock(&dpr->dpr_lock);
831178479Sjb	}
832178479Sjb
833178479Sjb	/*
834178479Sjb	 * Before we free the process structure, remove this dt_proc_t from the
835178479Sjb	 * lookup hash, and then walk the dt_proc_hash_t's notification list
836178479Sjb	 * and remove this dt_proc_t if it is enqueued.
837178479Sjb	 */
838178479Sjb	(void) pthread_mutex_lock(&dph->dph_lock);
839178479Sjb	(void) dt_proc_lookup(dtp, P, B_TRUE);
840178479Sjb	npp = &dph->dph_notify;
841178479Sjb
842178479Sjb	while ((npr = *npp) != NULL) {
843178479Sjb		if (npr->dprn_dpr == dpr) {
844178479Sjb			*npp = npr->dprn_next;
845178479Sjb			dt_free(dtp, npr);
846178479Sjb		} else {
847178479Sjb			npp = &npr->dprn_next;
848178479Sjb		}
849178479Sjb	}
850178479Sjb
851178479Sjb	(void) pthread_mutex_unlock(&dph->dph_lock);
852178479Sjb
853178479Sjb	/*
854178479Sjb	 * Remove the dt_proc_list from the LRU list, release the underlying
855178479Sjb	 * libproc handle, and free our dt_proc_t data structure.
856178479Sjb	 */
857178479Sjb	if (dpr->dpr_cacheable) {
858178479Sjb		assert(dph->dph_lrucnt != 0);
859178479Sjb		dph->dph_lrucnt--;
860178479Sjb	}
861178479Sjb
862178479Sjb	dt_list_delete(&dph->dph_lrulist, dpr);
863178479Sjb	Prelease(dpr->dpr_proc, rflag);
864178479Sjb	dt_free(dtp, dpr);
865178479Sjb}
866178479Sjb
867178479Sjbstatic int
868178479Sjbdt_proc_create_thread(dtrace_hdl_t *dtp, dt_proc_t *dpr, uint_t stop)
869178479Sjb{
870178479Sjb	dt_proc_control_data_t data;
871178479Sjb	sigset_t nset, oset;
872178479Sjb	pthread_attr_t a;
873178479Sjb	int err;
874178479Sjb
875178479Sjb	(void) pthread_mutex_lock(&dpr->dpr_lock);
876178479Sjb	dpr->dpr_stop |= stop; /* set bit for initial rendezvous */
877178479Sjb
878178479Sjb	(void) pthread_attr_init(&a);
879178479Sjb	(void) pthread_attr_setdetachstate(&a, PTHREAD_CREATE_DETACHED);
880178479Sjb
881178479Sjb	(void) sigfillset(&nset);
882178479Sjb	(void) sigdelset(&nset, SIGABRT);	/* unblocked for assert() */
883297077Smav#ifdef illumos
884178479Sjb	(void) sigdelset(&nset, SIGCANCEL);	/* see dt_proc_destroy() */
885178565Sjb#else
886178565Sjb	(void) sigdelset(&nset, SIGUSR1);	/* see dt_proc_destroy() */
887178565Sjb#endif
888178479Sjb
889178479Sjb	data.dpcd_hdl = dtp;
890178479Sjb	data.dpcd_proc = dpr;
891178479Sjb
892178479Sjb	(void) pthread_sigmask(SIG_SETMASK, &nset, &oset);
893178479Sjb	err = pthread_create(&dpr->dpr_tid, &a, dt_proc_control, &data);
894178479Sjb	(void) pthread_sigmask(SIG_SETMASK, &oset, NULL);
895178479Sjb
896178479Sjb	/*
897178479Sjb	 * If the control thread was created, then wait on dpr_cv for either
898178479Sjb	 * dpr_done to be set (the victim died or the control thread failed)
899178479Sjb	 * or DT_PROC_STOP_IDLE to be set, indicating that the victim is now
900178479Sjb	 * stopped by /proc and the control thread is at the rendezvous event.
901178479Sjb	 * On success, we return with the process and control thread stopped:
902178479Sjb	 * the caller can then apply dt_proc_continue() to resume both.
903178479Sjb	 */
904178479Sjb	if (err == 0) {
905178479Sjb		while (!dpr->dpr_done && !(dpr->dpr_stop & DT_PROC_STOP_IDLE))
906178479Sjb			(void) pthread_cond_wait(&dpr->dpr_cv, &dpr->dpr_lock);
907178479Sjb
908178479Sjb		/*
909178479Sjb		 * If dpr_done is set, the control thread aborted before it
910178479Sjb		 * reached the rendezvous event.  This is either due to PS_LOST
911178479Sjb		 * or PS_UNDEAD (i.e. the process died).  We try to provide a
912178479Sjb		 * small amount of useful information to help figure it out.
913178479Sjb		 */
914178479Sjb		if (dpr->dpr_done) {
915297077Smav#ifdef illumos
916178479Sjb			const psinfo_t *prp = Ppsinfo(dpr->dpr_proc);
917178479Sjb			int stat = prp ? prp->pr_wstat : 0;
918178479Sjb			int pid = dpr->dpr_pid;
919178565Sjb#else
920211554Srpaulo			int stat = proc_getwstat(dpr->dpr_proc);
921211554Srpaulo			int pid = proc_getpid(dpr->dpr_proc);
922211554Srpaulo#endif
923178565Sjb			if (proc_state(dpr->dpr_proc) == PS_LOST) {
924178479Sjb				(void) dt_proc_error(dpr->dpr_hdl, dpr,
925178479Sjb				    "failed to control pid %d: process exec'd "
926178479Sjb				    "set-id or unobservable program\n", pid);
927178479Sjb			} else if (WIFSIGNALED(stat)) {
928178479Sjb				(void) dt_proc_error(dpr->dpr_hdl, dpr,
929178479Sjb				    "failed to control pid %d: process died "
930178479Sjb				    "from signal %d\n", pid, WTERMSIG(stat));
931178479Sjb			} else {
932178479Sjb				(void) dt_proc_error(dpr->dpr_hdl, dpr,
933178479Sjb				    "failed to control pid %d: process exited "
934178479Sjb				    "with status %d\n", pid, WEXITSTATUS(stat));
935178479Sjb			}
936178479Sjb
937178479Sjb			err = ESRCH; /* cause grab() or create() to fail */
938178479Sjb		}
939178479Sjb	} else {
940178479Sjb		(void) dt_proc_error(dpr->dpr_hdl, dpr,
941178479Sjb		    "failed to create control thread for process-id %d: %s\n",
942178479Sjb		    (int)dpr->dpr_pid, strerror(err));
943178479Sjb	}
944178479Sjb
945238979Sgnn	if (err == 0)
946238979Sgnn		(void) pthread_mutex_unlock(&dpr->dpr_lock);
947178479Sjb	(void) pthread_attr_destroy(&a);
948178479Sjb
949178479Sjb	return (err);
950178479Sjb}
951178479Sjb
952178479Sjbstruct ps_prochandle *
953184696Srodrigcdt_proc_create(dtrace_hdl_t *dtp, const char *file, char *const *argv,
954184696Srodrigc    proc_child_func *pcf, void *child_arg)
955178479Sjb{
956178479Sjb	dt_proc_hash_t *dph = dtp->dt_procs;
957178479Sjb	dt_proc_t *dpr;
958178479Sjb	int err;
959178479Sjb
960178479Sjb	if ((dpr = dt_zalloc(dtp, sizeof (dt_proc_t))) == NULL)
961178479Sjb		return (NULL); /* errno is set for us */
962178479Sjb
963178479Sjb	(void) pthread_mutex_init(&dpr->dpr_lock, NULL);
964178479Sjb	(void) pthread_cond_init(&dpr->dpr_cv, NULL);
965178479Sjb
966297077Smav#ifdef illumos
967249573Spfg	if ((dpr->dpr_proc = Pcreate(file, argv, &err, NULL, 0)) == NULL) {
968211554Srpaulo#else
969211554Srpaulo	if ((err = proc_create(file, argv, pcf, child_arg,
970211554Srpaulo	    &dpr->dpr_proc)) != 0) {
971211554Srpaulo#endif
972178479Sjb		return (dt_proc_error(dtp, dpr,
973178479Sjb		    "failed to execute %s: %s\n", file, Pcreate_error(err)));
974178479Sjb	}
975178479Sjb
976178479Sjb	dpr->dpr_hdl = dtp;
977297077Smav#ifdef illumos
978178479Sjb	dpr->dpr_pid = Pstatus(dpr->dpr_proc)->pr_pid;
979178565Sjb#else
980178565Sjb	dpr->dpr_pid = proc_getpid(dpr->dpr_proc);
981178565Sjb#endif
982178479Sjb
983211554Srpaulo	(void) Punsetflags(dpr->dpr_proc, PR_RLC);
984211554Srpaulo	(void) Psetflags(dpr->dpr_proc, PR_KLC);
985211554Srpaulo
986178479Sjb	if (dt_proc_create_thread(dtp, dpr, dtp->dt_prcmode) != 0)
987178479Sjb		return (NULL); /* dt_proc_error() has been called for us */
988178479Sjb
989178479Sjb	dpr->dpr_hash = dph->dph_hash[dpr->dpr_pid & (dph->dph_hashlen - 1)];
990178479Sjb	dph->dph_hash[dpr->dpr_pid & (dph->dph_hashlen - 1)] = dpr;
991178479Sjb	dt_list_prepend(&dph->dph_lrulist, dpr);
992178479Sjb
993178479Sjb	dt_dprintf("created pid %d\n", (int)dpr->dpr_pid);
994178479Sjb	dpr->dpr_refs++;
995178479Sjb
996178479Sjb	return (dpr->dpr_proc);
997178479Sjb}
998178479Sjb
999178479Sjbstruct ps_prochandle *
1000178479Sjbdt_proc_grab(dtrace_hdl_t *dtp, pid_t pid, int flags, int nomonitor)
1001178479Sjb{
1002178479Sjb	dt_proc_hash_t *dph = dtp->dt_procs;
1003178479Sjb	uint_t h = pid & (dph->dph_hashlen - 1);
1004178479Sjb	dt_proc_t *dpr, *opr;
1005178479Sjb	int err;
1006178479Sjb
1007178479Sjb	/*
1008178479Sjb	 * Search the hash table for the pid.  If it is already grabbed or
1009178479Sjb	 * created, move the handle to the front of the lrulist, increment
1010178479Sjb	 * the reference count, and return the existing ps_prochandle.
1011178479Sjb	 */
1012178479Sjb	for (dpr = dph->dph_hash[h]; dpr != NULL; dpr = dpr->dpr_hash) {
1013178479Sjb		if (dpr->dpr_pid == pid && !dpr->dpr_stale) {
1014178479Sjb			/*
1015178479Sjb			 * If the cached handle was opened read-only and
1016178479Sjb			 * this request is for a writeable handle, mark
1017178479Sjb			 * the cached handle as stale and open a new handle.
1018178479Sjb			 * Since it's stale, unmark it as cacheable.
1019178479Sjb			 */
1020178479Sjb			if (dpr->dpr_rdonly && !(flags & PGRAB_RDONLY)) {
1021178479Sjb				dt_dprintf("upgrading pid %d\n", (int)pid);
1022178479Sjb				dpr->dpr_stale = B_TRUE;
1023178479Sjb				dpr->dpr_cacheable = B_FALSE;
1024178479Sjb				dph->dph_lrucnt--;
1025178479Sjb				break;
1026178479Sjb			}
1027178479Sjb
1028178479Sjb			dt_dprintf("grabbed pid %d (cached)\n", (int)pid);
1029178479Sjb			dt_list_delete(&dph->dph_lrulist, dpr);
1030178479Sjb			dt_list_prepend(&dph->dph_lrulist, dpr);
1031178479Sjb			dpr->dpr_refs++;
1032178479Sjb			return (dpr->dpr_proc);
1033178479Sjb		}
1034178479Sjb	}
1035178479Sjb
1036178479Sjb	if ((dpr = dt_zalloc(dtp, sizeof (dt_proc_t))) == NULL)
1037178479Sjb		return (NULL); /* errno is set for us */
1038178479Sjb
1039178479Sjb	(void) pthread_mutex_init(&dpr->dpr_lock, NULL);
1040178479Sjb	(void) pthread_cond_init(&dpr->dpr_cv, NULL);
1041178479Sjb
1042297077Smav#ifdef illumos
1043178479Sjb	if ((dpr->dpr_proc = Pgrab(pid, flags, &err)) == NULL) {
1044211554Srpaulo#else
1045211554Srpaulo	if ((err = proc_attach(pid, flags, &dpr->dpr_proc)) != 0) {
1046211554Srpaulo#endif
1047178479Sjb		return (dt_proc_error(dtp, dpr,
1048178479Sjb		    "failed to grab pid %d: %s\n", (int)pid, Pgrab_error(err)));
1049178479Sjb	}
1050178479Sjb
1051178479Sjb	dpr->dpr_hdl = dtp;
1052178479Sjb	dpr->dpr_pid = pid;
1053178479Sjb
1054178479Sjb	(void) Punsetflags(dpr->dpr_proc, PR_KLC);
1055178479Sjb	(void) Psetflags(dpr->dpr_proc, PR_RLC);
1056178479Sjb
1057178479Sjb	/*
1058178479Sjb	 * If we are attempting to grab the process without a monitor
1059178479Sjb	 * thread, then mark the process cacheable only if it's being
1060178479Sjb	 * grabbed read-only.  If we're currently caching more process
1061178479Sjb	 * handles than dph_lrulim permits, attempt to find the
1062178479Sjb	 * least-recently-used handle that is currently unreferenced and
1063178479Sjb	 * release it from the cache.  Otherwise we are grabbing the process
1064178479Sjb	 * for control: create a control thread for this process and store
1065178479Sjb	 * its ID in dpr->dpr_tid.
1066178479Sjb	 */
1067178479Sjb	if (nomonitor || (flags & PGRAB_RDONLY)) {
1068178479Sjb		if (dph->dph_lrucnt >= dph->dph_lrulim) {
1069178479Sjb			for (opr = dt_list_prev(&dph->dph_lrulist);
1070178479Sjb			    opr != NULL; opr = dt_list_prev(opr)) {
1071178479Sjb				if (opr->dpr_cacheable && opr->dpr_refs == 0) {
1072178479Sjb					dt_proc_destroy(dtp, opr->dpr_proc);
1073178479Sjb					break;
1074178479Sjb				}
1075178479Sjb			}
1076178479Sjb		}
1077178479Sjb
1078178479Sjb		if (flags & PGRAB_RDONLY) {
1079178479Sjb			dpr->dpr_cacheable = B_TRUE;
1080178479Sjb			dpr->dpr_rdonly = B_TRUE;
1081178479Sjb			dph->dph_lrucnt++;
1082178479Sjb		}
1083178479Sjb
1084178479Sjb	} else if (dt_proc_create_thread(dtp, dpr, DT_PROC_STOP_GRAB) != 0)
1085178479Sjb		return (NULL); /* dt_proc_error() has been called for us */
1086178479Sjb
1087178479Sjb	dpr->dpr_hash = dph->dph_hash[h];
1088178479Sjb	dph->dph_hash[h] = dpr;
1089178479Sjb	dt_list_prepend(&dph->dph_lrulist, dpr);
1090178479Sjb
1091178479Sjb	dt_dprintf("grabbed pid %d\n", (int)pid);
1092178479Sjb	dpr->dpr_refs++;
1093178479Sjb
1094178479Sjb	return (dpr->dpr_proc);
1095178479Sjb}
1096178479Sjb
1097178479Sjbvoid
1098178479Sjbdt_proc_release(dtrace_hdl_t *dtp, struct ps_prochandle *P)
1099178479Sjb{
1100178479Sjb	dt_proc_t *dpr = dt_proc_lookup(dtp, P, B_FALSE);
1101178479Sjb	dt_proc_hash_t *dph = dtp->dt_procs;
1102178479Sjb
1103178479Sjb	assert(dpr != NULL);
1104178479Sjb	assert(dpr->dpr_refs != 0);
1105178479Sjb
1106178479Sjb	if (--dpr->dpr_refs == 0 &&
1107178479Sjb	    (!dpr->dpr_cacheable || dph->dph_lrucnt > dph->dph_lrulim))
1108178479Sjb		dt_proc_destroy(dtp, P);
1109178479Sjb}
1110178479Sjb
1111178479Sjbvoid
1112178479Sjbdt_proc_continue(dtrace_hdl_t *dtp, struct ps_prochandle *P)
1113178479Sjb{
1114178479Sjb	dt_proc_t *dpr = dt_proc_lookup(dtp, P, B_FALSE);
1115178479Sjb
1116178479Sjb	(void) pthread_mutex_lock(&dpr->dpr_lock);
1117178479Sjb
1118178479Sjb	if (dpr->dpr_stop & DT_PROC_STOP_IDLE) {
1119178479Sjb		dpr->dpr_stop &= ~DT_PROC_STOP_IDLE;
1120178479Sjb		(void) pthread_cond_broadcast(&dpr->dpr_cv);
1121178479Sjb	}
1122178479Sjb
1123178479Sjb	(void) pthread_mutex_unlock(&dpr->dpr_lock);
1124178479Sjb}
1125178479Sjb
1126178479Sjbvoid
1127178479Sjbdt_proc_lock(dtrace_hdl_t *dtp, struct ps_prochandle *P)
1128178479Sjb{
1129178479Sjb	dt_proc_t *dpr = dt_proc_lookup(dtp, P, B_FALSE);
1130178479Sjb	int err = pthread_mutex_lock(&dpr->dpr_lock);
1131178479Sjb	assert(err == 0); /* check for recursion */
1132178479Sjb}
1133178479Sjb
1134178479Sjbvoid
1135178479Sjbdt_proc_unlock(dtrace_hdl_t *dtp, struct ps_prochandle *P)
1136178479Sjb{
1137178479Sjb	dt_proc_t *dpr = dt_proc_lookup(dtp, P, B_FALSE);
1138178479Sjb	int err = pthread_mutex_unlock(&dpr->dpr_lock);
1139178479Sjb	assert(err == 0); /* check for unheld lock */
1140178479Sjb}
1141178479Sjb
1142178479Sjbvoid
1143249573Spfgdt_proc_hash_create(dtrace_hdl_t *dtp)
1144178479Sjb{
1145178479Sjb	if ((dtp->dt_procs = dt_zalloc(dtp, sizeof (dt_proc_hash_t) +
1146249573Spfg	    sizeof (dt_proc_t *) * _dtrace_pidbuckets - 1)) != NULL) {
1147178479Sjb
1148249573Spfg		(void) pthread_mutex_init(&dtp->dt_procs->dph_lock, NULL);
1149249573Spfg		(void) pthread_cond_init(&dtp->dt_procs->dph_cv, NULL);
1150178479Sjb
1151249573Spfg		dtp->dt_procs->dph_hashlen = _dtrace_pidbuckets;
1152249573Spfg		dtp->dt_procs->dph_lrulim = _dtrace_pidlrulim;
1153178479Sjb	}
1154178479Sjb}
1155178479Sjb
1156178479Sjbvoid
1157249573Spfgdt_proc_hash_destroy(dtrace_hdl_t *dtp)
1158178479Sjb{
1159178479Sjb	dt_proc_hash_t *dph = dtp->dt_procs;
1160178479Sjb	dt_proc_t *dpr;
1161178479Sjb
1162178479Sjb	while ((dpr = dt_list_next(&dph->dph_lrulist)) != NULL)
1163178479Sjb		dt_proc_destroy(dtp, dpr->dpr_proc);
1164178479Sjb
1165178479Sjb	dtp->dt_procs = NULL;
1166178479Sjb	dt_free(dtp, dph);
1167178479Sjb}
1168178479Sjb
1169178479Sjbstruct ps_prochandle *
1170184696Srodrigcdtrace_proc_create(dtrace_hdl_t *dtp, const char *file, char *const *argv,
1171184696Srodrigc    proc_child_func *pcf, void *child_arg)
1172178479Sjb{
1173178479Sjb	dt_ident_t *idp = dt_idhash_lookup(dtp->dt_macros, "target");
1174184696Srodrigc	struct ps_prochandle *P = dt_proc_create(dtp, file, argv, pcf, child_arg);
1175178479Sjb
1176211554Srpaulo	if (P != NULL && idp != NULL && idp->di_id == 0) {
1177297077Smav#ifdef illumos
1178178479Sjb		idp->di_id = Pstatus(P)->pr_pid; /* $target = created pid */
1179178565Sjb#else
1180178565Sjb		idp->di_id = proc_getpid(P); /* $target = created pid */
1181178565Sjb#endif
1182211554Srpaulo	}
1183178479Sjb
1184178479Sjb	return (P);
1185178479Sjb}
1186178479Sjb
1187178479Sjbstruct ps_prochandle *
1188178479Sjbdtrace_proc_grab(dtrace_hdl_t *dtp, pid_t pid, int flags)
1189178479Sjb{
1190178479Sjb	dt_ident_t *idp = dt_idhash_lookup(dtp->dt_macros, "target");
1191178479Sjb	struct ps_prochandle *P = dt_proc_grab(dtp, pid, flags, 0);
1192178479Sjb
1193178479Sjb	if (P != NULL && idp != NULL && idp->di_id == 0)
1194178479Sjb		idp->di_id = pid; /* $target = grabbed pid */
1195178479Sjb
1196178479Sjb	return (P);
1197178479Sjb}
1198178479Sjb
1199178479Sjbvoid
1200178479Sjbdtrace_proc_release(dtrace_hdl_t *dtp, struct ps_prochandle *P)
1201178479Sjb{
1202178479Sjb	dt_proc_release(dtp, P);
1203178479Sjb}
1204178479Sjb
1205178479Sjbvoid
1206178479Sjbdtrace_proc_continue(dtrace_hdl_t *dtp, struct ps_prochandle *P)
1207178479Sjb{
1208178479Sjb	dt_proc_continue(dtp, P);
1209178479Sjb}
1210