1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27/*
28 * Copyright (c) 2012 by Delphix. All rights reserved.
29 */
30
31/*
32 * DTrace Process Control
33 *
34 * This file provides a set of routines that permit libdtrace and its clients
35 * to create and grab process handles using libproc, and to share these handles
36 * between library mechanisms that need libproc access, such as ustack(), and
37 * client mechanisms that need libproc access, such as dtrace(1M) -c and -p.
38 * The library provides several mechanisms in the libproc control layer:
39 *
40 * Reference Counting: The library code and client code can independently grab
41 * the same process handles without interfering with one another.  Only when
42 * the reference count drops to zero and the handle is not being cached (see
43 * below for more information on caching) will Prelease() be called on it.
44 *
45 * Handle Caching: If a handle is grabbed PGRAB_RDONLY (e.g. by ustack()) and
46 * the reference count drops to zero, the handle is not immediately released.
47 * Instead, libproc handles are maintained on dph_lrulist in order from most-
48 * recently accessed to least-recently accessed.  Idle handles are maintained
49 * until a pre-defined LRU cache limit is exceeded, permitting repeated calls
50 * to ustack() to avoid the overhead of releasing and re-grabbing processes.
51 *
52 * Process Control: For processes that are grabbed for control (~PGRAB_RDONLY)
53 * or created by dt_proc_create(), a control thread is created to provide
54 * callbacks on process exit and symbol table caching on dlopen()s.
55 *
56 * MT-Safety: Libproc is not MT-Safe, so dt_proc_lock() and dt_proc_unlock()
57 * are provided to synchronize access to the libproc handle between libdtrace
58 * code and client code and the control thread's use of the ps_prochandle.
59 *
60 * NOTE: MT-Safety is NOT provided for libdtrace itself, or for use of the
61 * dtrace_proc_grab/dtrace_proc_create mechanisms.  Like all exported libdtrace
62 * calls, these are assumed to be MT-Unsafe.  MT-Safety is ONLY provided for
63 * synchronization between libdtrace control threads and the client thread.
64 *
65 * The ps_prochandles themselves are maintained along with a dt_proc_t struct
66 * in a hash table indexed by PID.  This provides basic locking and reference
67 * counting.  The dt_proc_t is also maintained in LRU order on dph_lrulist.
68 * The dph_lrucnt and dph_lrulim count the number of cacheable processes and
69 * the current limit on the number of actively cached entries.
70 *
71 * The control thread for a process establishes breakpoints at the rtld_db
72 * locations of interest, updates mappings and symbol tables at these points,
73 * and handles exec and fork (by always following the parent).  The control
74 * thread automatically exits when the process dies or control is lost.
75 *
76 * A simple notification mechanism is provided for libdtrace clients using
77 * dtrace_handle_proc() for notification of PS_UNDEAD or PS_LOST events.  If
78 * such an event occurs, the dt_proc_t itself is enqueued on a notification
79 * list and the control thread broadcasts to dph_cv.  dtrace_sleep() will wake
80 * up using this condition and will then call the client handler as necessary.
81 */
82
83#include <sys/syscall.h>
84#include <sys/wait.h>
85#include <strings.h>
86#include <signal.h>
87#include <assert.h>
88#include <errno.h>
89
90#include <dt_proc.h>
91#include <dt_pid.h>
92#include <dt_impl.h>
93
94#include <libproc_compat.h>
95
/*
 * Classify a system call number reported in pr_what.  The argument is
 * parenthesized so that callers may pass an arbitrary expression (for
 * example a masked value) without it re-associating with the == operator.
 */
#define	IS_SYS_EXEC(w)	((w) == SYS_execve)
#define	IS_SYS_FORK(w)	((w) == SYS_vfork || (w) == SYS_fork)
98
99static dt_bkpt_t *
100dt_proc_bpcreate(dt_proc_t *dpr, uintptr_t addr, dt_bkpt_f *func, void *data)
101{
102	struct ps_prochandle *P = dpr->dpr_proc;
103	dt_bkpt_t *dbp;
104
105	assert(DT_MUTEX_HELD(&dpr->dpr_lock));
106
107	if ((dbp = dt_zalloc(dpr->dpr_hdl, sizeof (dt_bkpt_t))) != NULL) {
108		dbp->dbp_func = func;
109		dbp->dbp_data = data;
110		dbp->dbp_addr = addr;
111
112		if (Psetbkpt(P, dbp->dbp_addr, &dbp->dbp_instr) == 0)
113			dbp->dbp_active = B_TRUE;
114
115		dt_list_append(&dpr->dpr_bps, dbp);
116	}
117
118	return (dbp);
119}
120
121static void
122dt_proc_bpdestroy(dt_proc_t *dpr, int delbkpts)
123{
124	int state = Pstate(dpr->dpr_proc);
125	dt_bkpt_t *dbp, *nbp;
126
127	assert(DT_MUTEX_HELD(&dpr->dpr_lock));
128
129	for (dbp = dt_list_next(&dpr->dpr_bps); dbp != NULL; dbp = nbp) {
130		if (delbkpts && dbp->dbp_active &&
131		    state != PS_LOST && state != PS_UNDEAD) {
132			(void) Pdelbkpt(dpr->dpr_proc,
133			    dbp->dbp_addr, dbp->dbp_instr);
134		}
135		nbp = dt_list_next(dbp);
136		dt_list_delete(&dpr->dpr_bps, dbp);
137		dt_free(dpr->dpr_hdl, dbp);
138	}
139}
140
141static void
142dt_proc_bpmatch(dtrace_hdl_t *dtp, dt_proc_t *dpr)
143{
144	unsigned long pc;
145	dt_bkpt_t *dbp;
146
147	assert(DT_MUTEX_HELD(&dpr->dpr_lock));
148
149	proc_regget(dpr->dpr_proc, REG_PC, &pc);
150	proc_bkptregadj(&pc);
151
152	for (dbp = dt_list_next(&dpr->dpr_bps);
153	    dbp != NULL; dbp = dt_list_next(dbp)) {
154		if (pc == dbp->dbp_addr)
155			break;
156	}
157
158	if (dbp == NULL) {
159		dt_dprintf("pid %d: spurious breakpoint wakeup for %lx\n",
160		    (int)dpr->dpr_pid, pc);
161		return;
162	}
163
164	dt_dprintf("pid %d: hit breakpoint at %lx (%lu)\n",
165	    (int)dpr->dpr_pid, (ulong_t)dbp->dbp_addr, ++dbp->dbp_hits);
166
167	dbp->dbp_func(dtp, dpr, dbp->dbp_data);
168	(void) Pxecbkpt(dpr->dpr_proc, dbp->dbp_instr);
169}
170
171static void
172dt_proc_bpenable(dt_proc_t *dpr)
173{
174	dt_bkpt_t *dbp;
175
176	assert(DT_MUTEX_HELD(&dpr->dpr_lock));
177
178	for (dbp = dt_list_next(&dpr->dpr_bps);
179	    dbp != NULL; dbp = dt_list_next(dbp)) {
180		if (!dbp->dbp_active && Psetbkpt(dpr->dpr_proc,
181		    dbp->dbp_addr, &dbp->dbp_instr) == 0)
182			dbp->dbp_active = B_TRUE;
183	}
184
185	dt_dprintf("breakpoints enabled\n");
186}
187
188static void
189dt_proc_bpdisable(dt_proc_t *dpr)
190{
191	dt_bkpt_t *dbp;
192
193	assert(DT_MUTEX_HELD(&dpr->dpr_lock));
194
195	for (dbp = dt_list_next(&dpr->dpr_bps);
196	    dbp != NULL; dbp = dt_list_next(dbp)) {
197		if (dbp->dbp_active && Pdelbkpt(dpr->dpr_proc,
198		    dbp->dbp_addr, dbp->dbp_instr) == 0)
199			dbp->dbp_active = B_FALSE;
200	}
201
202	dt_dprintf("breakpoints disabled\n");
203}
204
205static void
206dt_proc_notify(dtrace_hdl_t *dtp, dt_proc_hash_t *dph, dt_proc_t *dpr,
207    const char *msg)
208{
209	dt_proc_notify_t *dprn = dt_alloc(dtp, sizeof (dt_proc_notify_t));
210
211	if (dprn == NULL) {
212		dt_dprintf("failed to allocate notification for %d %s\n",
213		    (int)dpr->dpr_pid, msg);
214	} else {
215		dprn->dprn_dpr = dpr;
216		if (msg == NULL)
217			dprn->dprn_errmsg[0] = '\0';
218		else
219			(void) strlcpy(dprn->dprn_errmsg, msg,
220			    sizeof (dprn->dprn_errmsg));
221
222		(void) pthread_mutex_lock(&dph->dph_lock);
223
224		dprn->dprn_next = dph->dph_notify;
225		dph->dph_notify = dprn;
226
227		(void) pthread_cond_broadcast(&dph->dph_cv);
228		(void) pthread_mutex_unlock(&dph->dph_lock);
229	}
230}
231
232/*
233 * Check to see if the control thread was requested to stop when the victim
234 * process reached a particular event (why) rather than continuing the victim.
235 * If 'why' is set in the stop mask, we wait on dpr_cv for dt_proc_continue().
236 * If 'why' is not set, this function returns immediately and does nothing.
237 */
238static void
239dt_proc_stop(dt_proc_t *dpr, uint8_t why)
240{
241	assert(DT_MUTEX_HELD(&dpr->dpr_lock));
242	assert(why != DT_PROC_STOP_IDLE);
243
244	if (dpr->dpr_stop & why) {
245		dpr->dpr_stop |= DT_PROC_STOP_IDLE;
246		dpr->dpr_stop &= ~why;
247
248		(void) pthread_cond_broadcast(&dpr->dpr_cv);
249
250		/*
251		 * We disable breakpoints while stopped to preserve the
252		 * integrity of the program text for both our own disassembly
253		 * and that of the kernel.
254		 */
255		dt_proc_bpdisable(dpr);
256
257		while (dpr->dpr_stop & DT_PROC_STOP_IDLE)
258			(void) pthread_cond_wait(&dpr->dpr_cv, &dpr->dpr_lock);
259
260		dt_proc_bpenable(dpr);
261	}
262}
263
/*
 * Breakpoint handler installed by dt_proc_attach() on a.out`main.  Logs the
 * hit (fname is the breakpoint's data pointer, a descriptive string) and
 * then idles in dt_proc_stop() if DT_PROC_STOP_MAIN was requested.  'dtp'
 * is unused; it is present to match how dt_proc_bpmatch() calls handlers.
 */
/*ARGSUSED*/
static void
dt_proc_bpmain(dtrace_hdl_t *dtp, dt_proc_t *dpr, const char *fname)
{
	dt_dprintf("pid %d: breakpoint at %s()\n", (int)dpr->dpr_pid, fname);
	dt_proc_stop(dpr, DT_PROC_STOP_MAIN);
}
271
272static void
273dt_proc_rdevent(dtrace_hdl_t *dtp, dt_proc_t *dpr, const char *evname)
274{
275	rd_event_msg_t rdm;
276	rd_err_e err;
277
278	if ((err = rd_event_getmsg(dpr->dpr_rtld, &rdm)) != RD_OK) {
279		dt_dprintf("pid %d: failed to get %s event message: %s\n",
280		    (int)dpr->dpr_pid, evname, rd_errstr(err));
281		return;
282	}
283
284	dt_dprintf("pid %d: rtld event %s type=%d state %d\n",
285	    (int)dpr->dpr_pid, evname, rdm.type, rdm.u.state);
286
287	switch (rdm.type) {
288	case RD_DLACTIVITY:
289		if (rdm.u.state != RD_CONSISTENT)
290			break;
291
292		Pupdate_syms(dpr->dpr_proc);
293		if (dt_pid_create_probes_module(dtp, dpr) != 0)
294			dt_proc_notify(dtp, dtp->dt_procs, dpr,
295			    dpr->dpr_errmsg);
296
297		break;
298	case RD_PREINIT:
299		Pupdate_syms(dpr->dpr_proc);
300		dt_proc_stop(dpr, DT_PROC_STOP_PREINIT);
301		break;
302	case RD_POSTINIT:
303		Pupdate_syms(dpr->dpr_proc);
304		dt_proc_stop(dpr, DT_PROC_STOP_POSTINIT);
305		break;
306	}
307}
308
309static void
310dt_proc_rdwatch(dt_proc_t *dpr, rd_event_e event, const char *evname)
311{
312	rd_notify_t rdn;
313	rd_err_e err;
314
315	if ((err = rd_event_addr(dpr->dpr_rtld, event, &rdn)) != RD_OK) {
316		dt_dprintf("pid %d: failed to get event address for %s: %s\n",
317		    (int)dpr->dpr_pid, evname, rd_errstr(err));
318		return;
319	}
320
321	if (rdn.type != RD_NOTIFY_BPT) {
322		dt_dprintf("pid %d: event %s has unexpected type %d\n",
323		    (int)dpr->dpr_pid, evname, rdn.type);
324		return;
325	}
326
327	(void) dt_proc_bpcreate(dpr, rdn.u.bptaddr,
328	    /* XXX ugly */
329	    (dt_bkpt_f *)dt_proc_rdevent, __DECONST(void *, evname));
330}
331
332/*
333 * Common code for enabling events associated with the run-time linker after
334 * attaching to a process or after a victim process completes an exec(2).
335 */
336static void
337dt_proc_attach(dt_proc_t *dpr, int exec)
338{
339	rd_err_e err;
340	GElf_Sym sym;
341
342	assert(DT_MUTEX_HELD(&dpr->dpr_lock));
343
344	if (exec) {
345
346		dt_proc_bpdestroy(dpr, B_FALSE);
347	}
348	if ((dpr->dpr_rtld = Prd_agent(dpr->dpr_proc)) != NULL &&
349	    (err = rd_event_enable(dpr->dpr_rtld, B_TRUE)) == RD_OK) {
350		dt_proc_rdwatch(dpr, RD_POSTINIT, "RD_POSTINIT");
351	} else {
352		dt_dprintf("pid %d: failed to enable rtld events: %s\n",
353		    (int)dpr->dpr_pid, dpr->dpr_rtld ? rd_errstr(err) :
354		    "rtld_db agent initialization failed");
355	}
356
357	Pupdate_maps(dpr->dpr_proc);
358
359	if (Pxlookup_by_name(dpr->dpr_proc, LM_ID_BASE,
360	    "a.out", "main", &sym, NULL) == 0) {
361		(void) dt_proc_bpcreate(dpr, (uintptr_t)sym.st_value,
362		    (dt_bkpt_f *)dt_proc_bpmain, "a.out`main");
363	} else {
364		dt_dprintf("pid %d: failed to find a.out`main: %s\n",
365		    (int)dpr->dpr_pid, strerror(errno));
366	}
367}
368
/*
 * Argument bundle handed to dt_proc_control() through pthread_create().
 */
typedef struct dt_proc_control_data {
	dtrace_hdl_t *dpcd_hdl;			/* DTrace handle */
	dt_proc_t *dpcd_proc;			/* process to control */
} dt_proc_control_data_t;
373
/*
 * Main loop for all victim process control threads.  We initialize the
 * breakpoints and rtld_db event watching for the victim, and then enter a
 * loop waiting for the process to stop on an event or die.  We process any
 * events by calling appropriate subroutines, and exit when the victim dies,
 * we lose control, or dpr_quit is set by dt_proc_destroy().
 *
 * The control thread synchronizes the use of dpr_proc with other libdtrace
 * threads using dpr_lock.  We hold the lock for all of our operations except
 * waiting for the process to report status, which is done in proc_wstatus()
 * with dpr_lock dropped.  dt_proc_destroy() sets dpr_quit and signals this
 * thread so that an interrupted wait (EINTR) re-checks dpr_quit.
 *
 * 'arg' points to a dt_proc_control_data_t; its fields are copied into
 * locals before anything else is done.  Always returns NULL.
 */
static void *
dt_proc_control(void *arg)
{
	dt_proc_control_data_t *datap = arg;
	dtrace_hdl_t *dtp = datap->dpcd_hdl;
	dt_proc_t *dpr = datap->dpcd_proc;
	dt_proc_hash_t *dph = dtp->dt_procs;
	struct ps_prochandle *P = dpr->dpr_proc;
	int pid = dpr->dpr_pid;
	int notify = B_FALSE;

	/*
	 * We disable the POSIX thread cancellation mechanism so that the
	 * client program using libdtrace can't accidentally cancel our thread.
	 * dt_proc_destroy() signals this thread explicitly to poke us out of
	 * the wait below with EINTR, at which point we will see dpr_quit and
	 * exit.
	 */
	(void) pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL);

	/*
	 * Set up the corresponding process for tracing by libdtrace.  We want
	 * to be able to catch breakpoints and efficiently single-step over
	 * them, and we need to enable librtld_db to watch libdl activity.
	 */
	(void) pthread_mutex_lock(&dpr->dpr_lock);

	dt_proc_attach(dpr, B_FALSE);		/* enable rtld breakpoints */

	/*
	 * If DT_CLOSE_KILL is set, we created the process; otherwise we
	 * grabbed it.  Check for an appropriate stop request and wait for
	 * dt_proc_continue.
	 */
	if (dpr->dpr_close == DT_CLOSE_KILL)
		dt_proc_stop(dpr, DT_PROC_STOP_CREATE);
	else
		dt_proc_stop(dpr, DT_PROC_STOP_GRAB);

	/* A failure to resume the victim is logged but not treated as fatal. */
	if (Psetrun(P, 0, 0) == -1) {
		dt_dprintf("pid %d: failed to set running: %s\n",
		    (int)dpr->dpr_pid, strerror(errno));
	}

	(void) pthread_mutex_unlock(&dpr->dpr_lock);

	/*
	 * Wait for the process corresponding to this control thread to stop,
	 * process the event, and then set it running again.  We want to sleep
	 * with dpr_lock *unheld* so that other parts of libdtrace can use the
	 * ps_prochandle in the meantime (e.g. ustack()).  Once the process
	 * reports status, we grab dpr_lock, examine Pstate(), and do our
	 * processing.
	 */
	while (!dpr->dpr_quit) {
		const lwpstatus_t *psp;

		/*
		 * Wait for the process to report status.
		 *
		 * NOTE(review): errno is not cleared before this call, so the
		 * EINTR test below presumes proc_wstatus() sets errno whenever
		 * it returns -- confirm, otherwise a stale EINTR could cause
		 * this loop to spin.
		 */
		proc_wstatus(P);
		if (errno == EINTR)
			continue; /* check dpr_quit and continue waiting */

		(void) pthread_mutex_lock(&dpr->dpr_lock);

		switch (Pstate(P)) {
		case PS_STOP:
			psp = proc_getlwpstatus(P);

			dt_dprintf("pid %d: proc stopped showing %d/%d\n",
			    pid, psp->pr_why, psp->pr_what);

			/*
			 * If the process stops showing one of the events that
			 * we are tracing, perform the appropriate response.
			 * Note that we ignore PR_SUSPENDED, PR_CHECKPOINT, and
			 * PR_JOBCONTROL by design: if one of these conditions
			 * occurs, we will fall through to Psetrun() but the
			 * process will remain stopped in the kernel by the
			 * corresponding mechanism (e.g. job control stop).
			 *
			 * Breakpoints are removed across fork so they are
			 * disabled on entry and restored on exit; a completed
			 * exec re-runs the attach logic for the new image.
			 */
			if (psp->pr_why == PR_FAULTED && psp->pr_what == FLTBPT)
				dt_proc_bpmatch(dtp, dpr);
			else if (psp->pr_why == PR_SYSENTRY &&
			    IS_SYS_FORK(psp->pr_what))
				dt_proc_bpdisable(dpr);
			else if (psp->pr_why == PR_SYSEXIT &&
			    IS_SYS_FORK(psp->pr_what))
				dt_proc_bpenable(dpr);
			else if (psp->pr_why == PR_SYSEXIT &&
			    IS_SYS_EXEC(psp->pr_what))
				dt_proc_attach(dpr, B_TRUE);
			break;

		case PS_LOST:
			dt_dprintf("pid %d: proc lost: %s\n",
			    pid, strerror(errno));

			dpr->dpr_quit = B_TRUE;
			notify = B_TRUE;
			break;

		case PS_UNDEAD:
			dt_dprintf("pid %d: proc died\n", pid);
			dpr->dpr_quit = B_TRUE;
			notify = B_TRUE;
			break;
		}

		if (Pstate(P) != PS_UNDEAD) {
			if (dpr->dpr_quit && dpr->dpr_close == DT_CLOSE_KILL) {
				/*
				 * We're about to kill the child, so don't
				 * bother resuming it.  In some cases, such as
				 * an initialization error, we shouldn't have
				 * started it in the first place, so letting it
				 * run could be harmful.
				 */
			} else if (Psetrun(P, 0, 0) == -1) {
				dt_dprintf("pid %d: failed to set running: "
				    "%s\n", (int)dpr->dpr_pid, strerror(errno));
			}
		}

		(void) pthread_mutex_unlock(&dpr->dpr_lock);
	}

	/*
	 * If the control thread detected PS_UNDEAD or PS_LOST, then enqueue
	 * the dt_proc_t structure on the dt_proc_hash_t notification list.
	 * This is done without dpr_lock held; dt_proc_notify() takes
	 * dph_lock itself.
	 */
	if (notify)
		dt_proc_notify(dtp, dph, dpr, NULL);

	/*
	 * Destroy and remove any remaining breakpoints, set dpr_done and clear
	 * dpr_tid to indicate the control thread has exited, and notify any
	 * waiting thread in dt_proc_destroy() that we have successfully exited.
	 */
	(void) pthread_mutex_lock(&dpr->dpr_lock);

	dt_proc_bpdestroy(dpr, B_TRUE);
	dpr->dpr_done = B_TRUE;
	dpr->dpr_tid = 0;

	(void) pthread_cond_broadcast(&dpr->dpr_cv);
	(void) pthread_mutex_unlock(&dpr->dpr_lock);

	return (NULL);
}
535
536/*PRINTFLIKE3*/
537static struct ps_prochandle *
538dt_proc_error(dtrace_hdl_t *dtp, dt_proc_t *dpr, const char *format, ...)
539{
540	va_list ap;
541
542	va_start(ap, format);
543	dt_set_errmsg(dtp, NULL, NULL, NULL, 0, format, ap);
544	va_end(ap);
545
546	if (dpr->dpr_proc != NULL)
547		Prelease(dpr->dpr_proc, 0);
548
549	dt_free(dtp, dpr);
550	(void) dt_set_errno(dtp, EDT_COMPILER);
551	return (NULL);
552}
553
554dt_proc_t *
555dt_proc_lookup(dtrace_hdl_t *dtp, struct ps_prochandle *P, int remove)
556{
557	dt_proc_hash_t *dph = dtp->dt_procs;
558	pid_t pid = proc_getpid(P);
559	dt_proc_t *dpr, **dpp = &dph->dph_hash[pid & (dph->dph_hashlen - 1)];
560
561	for (dpr = *dpp; dpr != NULL; dpr = dpr->dpr_hash) {
562		if (dpr->dpr_pid == pid)
563			break;
564		else
565			dpp = &dpr->dpr_hash;
566	}
567
568	assert(dpr != NULL);
569	assert(dpr->dpr_proc == P);
570
571	if (remove)
572		*dpp = dpr->dpr_hash; /* remove from pid hash chain */
573
574	return (dpr);
575}
576
577static void
578dt_proc_destroy(dtrace_hdl_t *dtp, struct ps_prochandle *P)
579{
580	dt_proc_t *dpr = dt_proc_lookup(dtp, P, B_FALSE);
581	dt_proc_hash_t *dph = dtp->dt_procs;
582	dt_proc_notify_t *npr, **npp;
583	int rflag;
584
585	assert(dpr != NULL);
586
587	switch (dpr->dpr_close) {
588	case DT_CLOSE_KILL:
589		dt_dprintf("killing pid %d\n", (int)dpr->dpr_pid);
590		rflag = PRELEASE_KILL;
591		break;
592	case DT_CLOSE_RUN:
593		dt_dprintf("releasing pid %d\n", (int)dpr->dpr_pid);
594		rflag = 0;
595		break;
596	}
597
598	if (dpr->dpr_tid) {
599		/*
600		 * Set the dpr_quit flag to tell the daemon thread to exit.  We
601		 * send it a SIGCANCEL to poke it out of PCWSTOP or any other
602		 * long-term /proc system call.  Our daemon threads have POSIX
603		 * cancellation disabled, so EINTR will be the only effect.  We
604		 * then wait for dpr_done to indicate the thread has exited.
605		 *
606		 * We can't use pthread_kill() to send SIGCANCEL because the
607		 * interface forbids it and we can't use pthread_cancel()
608		 * because with cancellation disabled it won't actually
609		 * send SIGCANCEL to the target thread, so we use _lwp_kill()
610		 * to do the job.  This is all built on evil knowledge of
611		 * the details of the cancellation mechanism in libc.
612		 */
613		(void) pthread_mutex_lock(&dpr->dpr_lock);
614		dpr->dpr_quit = B_TRUE;
615		pthread_kill(dpr->dpr_tid, SIGTHR);
616
617		/*
618		 * If the process is currently idling in dt_proc_stop(), re-
619		 * enable breakpoints and poke it into running again.
620		 */
621		if (dpr->dpr_stop & DT_PROC_STOP_IDLE) {
622			dt_proc_bpenable(dpr);
623			dpr->dpr_stop &= ~DT_PROC_STOP_IDLE;
624			(void) pthread_cond_broadcast(&dpr->dpr_cv);
625		}
626
627		while (!dpr->dpr_done)
628			(void) pthread_cond_wait(&dpr->dpr_cv, &dpr->dpr_lock);
629
630		(void) pthread_mutex_unlock(&dpr->dpr_lock);
631	}
632
633	/*
634	 * Before we free the process structure, remove this dt_proc_t from the
635	 * lookup hash, and then walk the dt_proc_hash_t's notification list
636	 * and remove this dt_proc_t if it is enqueued.
637	 */
638	(void) pthread_mutex_lock(&dph->dph_lock);
639	(void) dt_proc_lookup(dtp, P, B_TRUE);
640	npp = &dph->dph_notify;
641
642	while ((npr = *npp) != NULL) {
643		if (npr->dprn_dpr == dpr) {
644			*npp = npr->dprn_next;
645			dt_free(dtp, npr);
646		} else {
647			npp = &npr->dprn_next;
648		}
649	}
650
651	(void) pthread_mutex_unlock(&dph->dph_lock);
652
653	/*
654	 * Remove the dt_proc_list from the LRU list, release the underlying
655	 * libproc handle, and free our dt_proc_t data structure.
656	 */
657	if (dpr->dpr_cacheable) {
658		assert(dph->dph_lrucnt != 0);
659		dph->dph_lrucnt--;
660	}
661
662	dt_list_delete(&dph->dph_lrulist, dpr);
663	Prelease(dpr->dpr_proc, rflag);
664	dt_free(dtp, dpr);
665}
666
/*
 * Create the detached control thread (dt_proc_control()) for 'dpr' and wait
 * for it to reach its initial rendezvous.  'stop' is OR-ed into dpr_stop so
 * the new thread idles in dt_proc_stop() at the corresponding event.
 *
 * Returns 0 with the control thread idle and dpr_lock released.  On failure
 * returns the pthread_create() error, or ESRCH if the control thread
 * finished before reaching the rendezvous; in both failure cases
 * dt_proc_error() has already been called, which frees 'dpr', so the caller
 * must not touch it again.
 */
static int
dt_proc_create_thread(dtrace_hdl_t *dtp, dt_proc_t *dpr, uint_t stop)
{
	/*
	 * 'data' lives on this stack frame and is passed to the new thread;
	 * dt_proc_control() copies its fields out on entry, and this function
	 * does not return until that thread has signalled dpr_cv.
	 */
	dt_proc_control_data_t data;
	sigset_t nset, oset;
	pthread_attr_t a;
	int err;

	(void) pthread_mutex_lock(&dpr->dpr_lock);
	dpr->dpr_stop |= stop; /* set bit for initial rendezvous */

	(void) pthread_attr_init(&a);
	(void) pthread_attr_setdetachstate(&a, PTHREAD_CREATE_DETACHED);

	/*
	 * Build the signal mask the control thread will inherit: everything
	 * blocked except the signals deleted below.
	 *
	 * NOTE(review): SIGUSR1 is left unblocked here, but dt_proc_destroy()
	 * pokes the thread with SIGTHR -- confirm which signal is intended.
	 */
	(void) sigfillset(&nset);
	(void) sigdelset(&nset, SIGABRT);	/* unblocked for assert() */
	(void) sigdelset(&nset, SIGUSR1);	/* see dt_proc_destroy() */

	data.dpcd_hdl = dtp;
	data.dpcd_proc = dpr;

	/* Install the mask only around pthread_create(), then restore ours. */
	(void) pthread_sigmask(SIG_SETMASK, &nset, &oset);
	err = pthread_create(&dpr->dpr_tid, &a, dt_proc_control, &data);
	(void) pthread_sigmask(SIG_SETMASK, &oset, NULL);

	/*
	 * If the control thread was created, then wait on dpr_cv for either
	 * dpr_done to be set (the victim died or the control thread failed)
	 * or DT_PROC_STOP_IDLE to be set, indicating that the victim is now
	 * stopped by /proc and the control thread is at the rendezvous event.
	 * On success, we return with the process and control thread stopped:
	 * the caller can then apply dt_proc_continue() to resume both.
	 */
	if (err == 0) {
		while (!dpr->dpr_done && !(dpr->dpr_stop & DT_PROC_STOP_IDLE))
			(void) pthread_cond_wait(&dpr->dpr_cv, &dpr->dpr_lock);

		/*
		 * If dpr_done is set, the control thread aborted before it
		 * reached the rendezvous event.  This is either due to PS_LOST
		 * or PS_UNDEAD (i.e. the process died).  We try to provide a
		 * small amount of useful information to help figure it out.
		 */
		if (dpr->dpr_done) {
			int stat = proc_getwstat(dpr->dpr_proc);
			int pid = proc_getpid(dpr->dpr_proc);
			if (proc_state(dpr->dpr_proc) == PS_LOST) {
				(void) dt_proc_error(dpr->dpr_hdl, dpr,
				    "failed to control pid %d: process exec'd "
				    "set-id or unobservable program\n", pid);
			} else if (WIFSIGNALED(stat)) {
				(void) dt_proc_error(dpr->dpr_hdl, dpr,
				    "failed to control pid %d: process died "
				    "from signal %d\n", pid, WTERMSIG(stat));
			} else {
				(void) dt_proc_error(dpr->dpr_hdl, dpr,
				    "failed to control pid %d: process exited "
				    "with status %d\n", pid, WEXITSTATUS(stat));
			}

			err = ESRCH; /* cause grab() or create() to fail */
		}
	} else {
		(void) dt_proc_error(dpr->dpr_hdl, dpr,
		    "failed to create control thread for process-id %d: %s\n",
		    (int)dpr->dpr_pid, strerror(err));
	}

	/*
	 * Only unlock on success: on every failure path dt_proc_error() has
	 * already freed 'dpr', so dpr_lock must not be touched again.
	 */
	if (err == 0)
		(void) pthread_mutex_unlock(&dpr->dpr_lock);
	(void) pthread_attr_destroy(&a);

	return (err);
}
741
742struct ps_prochandle *
743dt_proc_create(dtrace_hdl_t *dtp, const char *file, char *const *argv,
744    proc_child_func *pcf, void *child_arg)
745{
746	dt_proc_hash_t *dph = dtp->dt_procs;
747	dt_proc_t *dpr;
748	int err;
749
750	if ((dpr = dt_zalloc(dtp, sizeof (dt_proc_t))) == NULL)
751		return (NULL); /* errno is set for us */
752
753	(void) pthread_mutex_init(&dpr->dpr_lock, NULL);
754	(void) pthread_cond_init(&dpr->dpr_cv, NULL);
755
756	if ((err = proc_create(file, argv, dtp->dt_proc_env, pcf, child_arg,
757	    &dpr->dpr_proc)) != 0) {
758		return (dt_proc_error(dtp, dpr,
759		    "failed to execute %s: %s\n", file, Pcreate_error(err)));
760	}
761
762	dpr->dpr_hdl = dtp;
763	dpr->dpr_pid = proc_getpid(dpr->dpr_proc);
764	dpr->dpr_close = DT_CLOSE_KILL;
765
766	if (dt_proc_create_thread(dtp, dpr, dtp->dt_prcmode) != 0)
767		return (NULL); /* dt_proc_error() has been called for us */
768
769	dpr->dpr_hash = dph->dph_hash[dpr->dpr_pid & (dph->dph_hashlen - 1)];
770	dph->dph_hash[dpr->dpr_pid & (dph->dph_hashlen - 1)] = dpr;
771	dt_list_prepend(&dph->dph_lrulist, dpr);
772
773	dt_dprintf("created pid %d\n", (int)dpr->dpr_pid);
774	dpr->dpr_refs++;
775
776	return (dpr->dpr_proc);
777}
778
779struct ps_prochandle *
780dt_proc_grab(dtrace_hdl_t *dtp, pid_t pid, int flags, int nomonitor)
781{
782	dt_proc_hash_t *dph = dtp->dt_procs;
783	uint_t h = pid & (dph->dph_hashlen - 1);
784	dt_proc_t *dpr, *opr;
785	int err;
786
787	/*
788	 * Search the hash table for the pid.  If it is already grabbed or
789	 * created, move the handle to the front of the lrulist, increment
790	 * the reference count, and return the existing ps_prochandle.
791	 */
792	for (dpr = dph->dph_hash[h]; dpr != NULL; dpr = dpr->dpr_hash) {
793		if (dpr->dpr_pid == pid && !dpr->dpr_stale) {
794			/*
795			 * If the cached handle was opened read-only and
796			 * this request is for a writeable handle, mark
797			 * the cached handle as stale and open a new handle.
798			 * Since it's stale, unmark it as cacheable.
799			 */
800			if (dpr->dpr_rdonly && !(flags & PGRAB_RDONLY)) {
801				dt_dprintf("upgrading pid %d\n", (int)pid);
802				dpr->dpr_stale = B_TRUE;
803				dpr->dpr_cacheable = B_FALSE;
804				dph->dph_lrucnt--;
805				break;
806			}
807
808			dt_dprintf("grabbed pid %d (cached)\n", (int)pid);
809			dt_list_delete(&dph->dph_lrulist, dpr);
810			dt_list_prepend(&dph->dph_lrulist, dpr);
811			dpr->dpr_refs++;
812			return (dpr->dpr_proc);
813		}
814	}
815
816	if ((dpr = dt_zalloc(dtp, sizeof (dt_proc_t))) == NULL)
817		return (NULL); /* errno is set for us */
818
819	(void) pthread_mutex_init(&dpr->dpr_lock, NULL);
820	(void) pthread_cond_init(&dpr->dpr_cv, NULL);
821
822	if ((err = proc_attach(pid, flags, &dpr->dpr_proc)) != 0) {
823		return (dt_proc_error(dtp, dpr,
824		    "failed to grab pid %d: %s\n", (int)pid, Pgrab_error(err)));
825	}
826
827	dpr->dpr_hdl = dtp;
828	dpr->dpr_pid = pid;
829	dpr->dpr_close = DT_CLOSE_RUN;
830
831	/*
832	 * If we are attempting to grab the process without a monitor
833	 * thread, then mark the process cacheable only if it's being
834	 * grabbed read-only.  If we're currently caching more process
835	 * handles than dph_lrulim permits, attempt to find the
836	 * least-recently-used handle that is currently unreferenced and
837	 * release it from the cache.  Otherwise we are grabbing the process
838	 * for control: create a control thread for this process and store
839	 * its ID in dpr->dpr_tid.
840	 */
841	if (nomonitor || (flags & PGRAB_RDONLY)) {
842		if (dph->dph_lrucnt >= dph->dph_lrulim) {
843			for (opr = dt_list_prev(&dph->dph_lrulist);
844			    opr != NULL; opr = dt_list_prev(opr)) {
845				if (opr->dpr_cacheable && opr->dpr_refs == 0) {
846					dt_proc_destroy(dtp, opr->dpr_proc);
847					break;
848				}
849			}
850		}
851
852		if (flags & PGRAB_RDONLY) {
853			dpr->dpr_cacheable = B_TRUE;
854			dpr->dpr_rdonly = B_TRUE;
855			dph->dph_lrucnt++;
856		}
857
858	} else if (dt_proc_create_thread(dtp, dpr, DT_PROC_STOP_GRAB) != 0)
859		return (NULL); /* dt_proc_error() has been called for us */
860
861	dpr->dpr_hash = dph->dph_hash[h];
862	dph->dph_hash[h] = dpr;
863	dt_list_prepend(&dph->dph_lrulist, dpr);
864
865	dt_dprintf("grabbed pid %d\n", (int)pid);
866	dpr->dpr_refs++;
867
868	return (dpr->dpr_proc);
869}
870
871void
872dt_proc_release(dtrace_hdl_t *dtp, struct ps_prochandle *P)
873{
874	dt_proc_t *dpr = dt_proc_lookup(dtp, P, B_FALSE);
875	dt_proc_hash_t *dph = dtp->dt_procs;
876
877	assert(dpr != NULL);
878	assert(dpr->dpr_refs != 0);
879
880	if (--dpr->dpr_refs == 0 &&
881	    (!dpr->dpr_cacheable || dph->dph_lrucnt > dph->dph_lrulim))
882		dt_proc_destroy(dtp, P);
883}
884
885void
886dt_proc_continue(dtrace_hdl_t *dtp, struct ps_prochandle *P)
887{
888	dt_proc_t *dpr = dt_proc_lookup(dtp, P, B_FALSE);
889
890	(void) pthread_mutex_lock(&dpr->dpr_lock);
891
892	if (dpr->dpr_stop & DT_PROC_STOP_IDLE) {
893		dpr->dpr_stop &= ~DT_PROC_STOP_IDLE;
894		(void) pthread_cond_broadcast(&dpr->dpr_cv);
895	}
896
897	(void) pthread_mutex_unlock(&dpr->dpr_lock);
898}
899
900void
901dt_proc_lock(dtrace_hdl_t *dtp, struct ps_prochandle *P)
902{
903	dt_proc_t *dpr = dt_proc_lookup(dtp, P, B_FALSE);
904	int err = pthread_mutex_lock(&dpr->dpr_lock);
905	assert(err == 0); /* check for recursion */
906}
907
908void
909dt_proc_unlock(dtrace_hdl_t *dtp, struct ps_prochandle *P)
910{
911	dt_proc_t *dpr = dt_proc_lookup(dtp, P, B_FALSE);
912	int err = pthread_mutex_unlock(&dpr->dpr_lock);
913	assert(err == 0); /* check for unheld lock */
914}
915
916void
917dt_proc_init(dtrace_hdl_t *dtp)
918{
919	extern char **environ;
920	static char *envdef[] = {
921		"LD_NOLAZYLOAD=1",	/* linker lazy loading hides funcs */
922		NULL
923	};
924	char **p;
925	int i;
926
927	if ((dtp->dt_procs = dt_zalloc(dtp, sizeof (dt_proc_hash_t) +
928	    sizeof (dt_proc_t *) * _dtrace_pidbuckets - 1)) == NULL)
929		return;
930
931	(void) pthread_mutex_init(&dtp->dt_procs->dph_lock, NULL);
932	(void) pthread_cond_init(&dtp->dt_procs->dph_cv, NULL);
933
934	dtp->dt_procs->dph_hashlen = _dtrace_pidbuckets;
935	dtp->dt_procs->dph_lrulim = _dtrace_pidlrulim;
936
937	/*
938	 * Count how big our environment needs to be.
939	 */
940	for (i = 1, p = environ; *p != NULL; i++, p++)
941		continue;
942	for (p = envdef; *p != NULL; i++, p++)
943		continue;
944
945	if ((dtp->dt_proc_env = dt_zalloc(dtp, sizeof (char *) * i)) == NULL)
946		return;
947
948	for (i = 0, p = environ; *p != NULL; i++, p++) {
949		if ((dtp->dt_proc_env[i] = strdup(*p)) == NULL)
950			goto err;
951	}
952	for (p = envdef; *p != NULL; i++, p++) {
953		if ((dtp->dt_proc_env[i] = strdup(*p)) == NULL)
954			goto err;
955	}
956
957	return;
958
959err:
960	while (--i != 0) {
961		dt_free(dtp, dtp->dt_proc_env[i]);
962	}
963	dt_free(dtp, dtp->dt_proc_env);
964	dtp->dt_proc_env = NULL;
965}
966
967void
968dt_proc_fini(dtrace_hdl_t *dtp)
969{
970	dt_proc_hash_t *dph = dtp->dt_procs;
971	dt_proc_t *dpr;
972	char **p;
973
974	while ((dpr = dt_list_next(&dph->dph_lrulist)) != NULL)
975		dt_proc_destroy(dtp, dpr->dpr_proc);
976
977	dtp->dt_procs = NULL;
978	dt_free(dtp, dph);
979
980	for (p = dtp->dt_proc_env; *p != NULL; p++)
981		dt_free(dtp, *p);
982
983	dt_free(dtp, dtp->dt_proc_env);
984	dtp->dt_proc_env = NULL;
985}
986
987struct ps_prochandle *
988dtrace_proc_create(dtrace_hdl_t *dtp, const char *file, char *const *argv,
989    proc_child_func *pcf, void *child_arg)
990{
991	dt_ident_t *idp = dt_idhash_lookup(dtp->dt_macros, "target");
992	struct ps_prochandle *P = dt_proc_create(dtp, file, argv, pcf, child_arg);
993
994	if (P != NULL && idp != NULL && idp->di_id == 0) {
995		idp->di_id = proc_getpid(P); /* $target = created pid */
996	}
997
998	return (P);
999}
1000
1001struct ps_prochandle *
1002dtrace_proc_grab(dtrace_hdl_t *dtp, pid_t pid, int flags)
1003{
1004	dt_ident_t *idp = dt_idhash_lookup(dtp->dt_macros, "target");
1005	struct ps_prochandle *P = dt_proc_grab(dtp, pid, flags, 0);
1006
1007	if (P != NULL && idp != NULL && idp->di_id == 0)
1008		idp->di_id = pid; /* $target = grabbed pid */
1009
1010	return (P);
1011}
1012
/*
 * Public wrapper around dt_proc_release(): drop one reference on 'P'.
 */
void
dtrace_proc_release(dtrace_hdl_t *dtp, struct ps_prochandle *P)
{
	dt_proc_release(dtp, P);
}
1018
/*
 * Public wrapper around dt_proc_continue(): resume a control thread that is
 * idling at a stop request for 'P'.
 */
void
dtrace_proc_continue(dtrace_hdl_t *dtp, struct ps_prochandle *P)
{
	dt_proc_continue(dtp, P);
}
1024