1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#pragma ident	"@(#)dt_proc.c	1.15	07/05/18 SMI"
28
29/*
30 * DTrace Process Control
31 *
32 * This file provides a set of routines that permit libdtrace and its clients
33 * to create and grab process handles using libproc, and to share these handles
34 * between library mechanisms that need libproc access, such as ustack(), and
35 * client mechanisms that need libproc access, such as dtrace(1M) -c and -p.
36 * The library provides several mechanisms in the libproc control layer:
37 *
38 * Reference Counting: The library code and client code can independently grab
39 * the same process handles without interfering with one another.  Only when
40 * the reference count drops to zero and the handle is not being cached (see
41 * below for more information on caching) will Prelease() be called on it.
42 *
43 * Handle Caching: If a handle is grabbed PGRAB_RDONLY (e.g. by ustack()) and
44 * the reference count drops to zero, the handle is not immediately released.
45 * Instead, libproc handles are maintained on dph_lrulist in order from most-
46 * recently accessed to least-recently accessed.  Idle handles are maintained
47 * until a pre-defined LRU cache limit is exceeded, permitting repeated calls
48 * to ustack() to avoid the overhead of releasing and re-grabbing processes.
49 *
50 * Process Control: For processes that are grabbed for control (~PGRAB_RDONLY)
51 * or created by dt_proc_create(), a control thread is created to provide
52 * callbacks on process exit and symbol table caching on dlopen()s.
53 *
54 * MT-Safety: Libproc is not MT-Safe, so dt_proc_lock() and dt_proc_unlock()
55 * are provided to synchronize access to the libproc handle between libdtrace
56 * code and client code and the control thread's use of the ps_prochandle.
57 *
58 * NOTE: MT-Safety is NOT provided for libdtrace itself, or for use of the
59 * dtrace_proc_grab/dtrace_proc_create mechanisms.  Like all exported libdtrace
60 * calls, these are assumed to be MT-Unsafe.  MT-Safety is ONLY provided for
61 * synchronization between libdtrace control threads and the client thread.
62 *
63 * The ps_prochandles themselves are maintained along with a dt_proc_t struct
64 * in a hash table indexed by PID.  This provides basic locking and reference
65 * counting.  The dt_proc_t is also maintained in LRU order on dph_lrulist.
66 * The dph_lrucnt and dph_lrulim count the number of cacheable processes and
67 * the current limit on the number of actively cached entries.
68 *
69 * The control thread for a process establishes breakpoints at the rtld_db
70 * locations of interest, updates mappings and symbol tables at these points,
71 * and handles exec and fork (by always following the parent).  The control
72 * thread automatically exits when the process dies or control is lost.
73 *
74 * A simple notification mechanism is provided for libdtrace clients using
75 * dtrace_handle_proc() for notification of PS_UNDEAD or PS_LOST events.  If
76 * such an event occurs, the dt_proc_t itself is enqueued on a notification
77 * list and the control thread broadcasts to dph_cv.  dtrace_sleep() will wake
78 * up using this condition and will then call the client handler as necessary.
79 */
80
81#if !defined(__APPLE__)
82#include <sys/wait.h>
83#include <sys/lwp.h>
84#include <strings.h>
85#include <signal.h>
86#include <assert.h>
87#include <errno.h>
88
89#include <dt_proc.h>
90#include <dt_pid.h>
91#include <dt_impl.h>
92
/*
 * Classify a system call number as one of the exec or fork variants that the
 * control thread traces.  The argument is parenthesized at every use so that
 * an expression argument (e.g. "a | b") is compared as a whole; note that it
 * is still evaluated more than once, so it must not have side effects.
 */
#define	IS_SYS_EXEC(w)	((w) == SYS_exec || (w) == SYS_execve)
#define	IS_SYS_FORK(w)	((w) == SYS_vfork || (w) == SYS_fork1 ||	\
			(w) == SYS_forkall || (w) == SYS_forksys)
96
97#else /* is Apple Mac OS X */
98#include <sys/wait.h>
99#include <strings.h>
100#include <signal.h>
101#include <assert.h>
102#include <errno.h>
103
104#include "dt_impl.h" /* lifted up from just below */
105#include <dt_proc.h>
106#include <dt_pid.h>
107
108extern void dt_proc_rdwatch(dt_proc_t *, rd_event_e, const char *);
109extern void *dt_proc_control(void *arg);
110
111#endif /* __APPLE__ */
112
113dt_bkpt_t *
114dt_proc_bpcreate(dt_proc_t *dpr, uintptr_t addr, dt_bkpt_f *func, void *data)
115{
116	struct ps_prochandle *P = dpr->dpr_proc;
117	dt_bkpt_t *dbp;
118
119	assert(DT_MUTEX_HELD(&dpr->dpr_lock));
120
121	if ((dbp = dt_zalloc(dpr->dpr_hdl, sizeof (dt_bkpt_t))) != NULL) {
122		dbp->dbp_func = func;
123		dbp->dbp_data = data;
124		dbp->dbp_addr = addr;
125
126		if (Psetbkpt(P, dbp->dbp_addr, &dbp->dbp_instr) == 0)
127			dbp->dbp_active = B_TRUE;
128
129		dt_list_append(&dpr->dpr_bps, dbp);
130	}
131
132	return (dbp);
133}
134
135void
136dt_proc_bpdestroy(dt_proc_t *dpr, int delbkpts)
137{
138	int state = Pstate(dpr->dpr_proc);
139	dt_bkpt_t *dbp, *nbp;
140
141	assert(DT_MUTEX_HELD(&dpr->dpr_lock));
142
143	for (dbp = dt_list_next(&dpr->dpr_bps); dbp != NULL; dbp = nbp) {
144		if (delbkpts && dbp->dbp_active &&
145		    state != PS_LOST && state != PS_UNDEAD) {
146			(void) Pdelbkpt(dpr->dpr_proc,
147			    dbp->dbp_addr, dbp->dbp_instr);
148		}
149		nbp = dt_list_next(dbp);
150		dt_list_delete(&dpr->dpr_bps, dbp);
151		dt_free(dpr->dpr_hdl, dbp);
152	}
153}
154
155#if !defined(__APPLE__)
156static void
157dt_proc_bpmatch(dtrace_hdl_t *dtp, dt_proc_t *dpr)
158{
159	const lwpstatus_t *psp = &Pstatus(dpr->dpr_proc)->pr_lwp;
160	dt_bkpt_t *dbp;
161
162	assert(DT_MUTEX_HELD(&dpr->dpr_lock));
163
164	for (dbp = dt_list_next(&dpr->dpr_bps);
165	    dbp != NULL; dbp = dt_list_next(dbp)) {
166		if (psp->pr_reg[R_PC] == dbp->dbp_addr)
167			break;
168	}
169
170	if (dbp == NULL) {
171		dt_dprintf("pid %d: spurious breakpoint wakeup for %lx\n",
172		    (int)dpr->dpr_pid, (ulong_t)psp->pr_reg[R_PC]);
173		return;
174	}
175
176	dt_dprintf("pid %d: hit breakpoint at %lx (%lu)\n",
177	    (int)dpr->dpr_pid, (ulong_t)dbp->dbp_addr, ++dbp->dbp_hits);
178
179	dbp->dbp_func(dtp, dpr, dbp->dbp_data);
180	(void) Pxecbkpt(dpr->dpr_proc, dbp->dbp_instr);
181}
182#else
183// Moved dt_proc_bpmatch() to dt_proc_apple.m
184#endif /* !__APPLE__ */
185
186/*
187 * APPLE NOTE:
188 *
189 * We do not support any of the following breakpoint functionality...
190 */
191
192void
193dt_proc_bpenable(dt_proc_t *dpr)
194{
195#if !defined(__APPLE__)
196	dt_bkpt_t *dbp;
197
198	assert(DT_MUTEX_HELD(&dpr->dpr_lock));
199
200	for (dbp = dt_list_next(&dpr->dpr_bps);
201	    dbp != NULL; dbp = dt_list_next(dbp)) {
202		if (!dbp->dbp_active && Psetbkpt(dpr->dpr_proc,
203		    dbp->dbp_addr, &dbp->dbp_instr) == 0)
204			dbp->dbp_active = B_TRUE;
205	}
206
207	dt_dprintf("breakpoints enabled\n");
208#endif
209}
210
211void
212dt_proc_bpdisable(dt_proc_t *dpr)
213{
214#if !defined(__APPLE__)
215	dt_bkpt_t *dbp;
216
217	assert(DT_MUTEX_HELD(&dpr->dpr_lock));
218
219	for (dbp = dt_list_next(&dpr->dpr_bps);
220	    dbp != NULL; dbp = dt_list_next(dbp)) {
221		if (dbp->dbp_active && Pdelbkpt(dpr->dpr_proc,
222		    dbp->dbp_addr, dbp->dbp_instr) == 0)
223			dbp->dbp_active = B_FALSE;
224	}
225
226	dt_dprintf("breakpoints disabled\n");
227#endif
228}
229
230void
231dt_proc_notify(dtrace_hdl_t *dtp, dt_proc_hash_t *dph, dt_proc_t *dpr,
232    const char *msg)
233{
234	dt_proc_notify_t *dprn = dt_alloc(dtp, sizeof (dt_proc_notify_t));
235
236	if (dprn == NULL) {
237		dt_dprintf("failed to allocate notification for %d %s\n",
238		    (int)dpr->dpr_pid, msg);
239	} else {
240		dprn->dprn_dpr = dpr;
241		if (msg == NULL)
242			dprn->dprn_errmsg[0] = '\0';
243		else
244			(void) strlcpy(dprn->dprn_errmsg, msg,
245			    sizeof (dprn->dprn_errmsg));
246
247		(void) pthread_mutex_lock(&dph->dph_lock);
248
249		dprn->dprn_next = dph->dph_notify;
250		dph->dph_notify = dprn;
251
252		(void) pthread_cond_broadcast(&dph->dph_cv);
253		(void) pthread_mutex_unlock(&dph->dph_lock);
254	}
255}
256
257/*
258 * Check to see if the control thread was requested to stop when the victim
259 * process reached a particular event (why) rather than continuing the victim.
260 * If 'why' is set in the stop mask, we wait on dpr_cv for dt_proc_continue().
261 * If 'why' is not set, this function returns immediately and does nothing.
262 */
263void
264dt_proc_stop(dt_proc_t *dpr, uint8_t why)
265{
266	assert(DT_MUTEX_HELD(&dpr->dpr_lock));
267	assert(why != DT_PROC_STOP_IDLE);
268
269	if (dpr->dpr_stop & why) {
270		dpr->dpr_stop |= DT_PROC_STOP_IDLE;
271		dpr->dpr_stop &= ~why;
272
273		(void) pthread_cond_broadcast(&dpr->dpr_cv);
274
275		/*
276		 * We disable breakpoints while stopped to preserve the
277		 * integrity of the program text for both our own disassembly
278		 * and that of the kernel.
279		 */
280		dt_proc_bpdisable(dpr);
281
282		while (dpr->dpr_stop & DT_PROC_STOP_IDLE)
283			(void) pthread_cond_wait(&dpr->dpr_cv, &dpr->dpr_lock);
284
285		dt_proc_bpenable(dpr);
286	}
287}
288
289void
290dt_proc_bpmain(dtrace_hdl_t *dtp, dt_proc_t *dpr, const char *fname)
291{
292	dt_dprintf("pid %d: breakpoint at %s()\n", (int)dpr->dpr_pid, fname);
293	dt_proc_stop(dpr, DT_PROC_STOP_MAIN);
294}
295
296static void
297dt_proc_rdevent(dtrace_hdl_t *dtp, dt_proc_t *dpr, const char *evname)
298{
299	rd_event_msg_t rdm;
300	rd_err_e err;
301
302	if ((err = rd_event_getmsg(dpr->dpr_rtld, &rdm)) != RD_OK) {
303		dt_dprintf("pid %d: failed to get %s event message: %s\n",
304		    (int)dpr->dpr_pid, evname, rd_errstr(err));
305		return;
306	}
307
308	dt_dprintf("pid %d: rtld event %s type=%d state %d\n",
309	    (int)dpr->dpr_pid, evname, rdm.type, rdm.u.state);
310
311	switch (rdm.type) {
312	case RD_DLACTIVITY:
313		if (rdm.u.state != RD_CONSISTENT)
314			break;
315
316		Pupdate_syms(dpr->dpr_proc);
317		if (dt_pid_create_probes_module(dtp, dpr) != 0)
318			dt_proc_notify(dtp, dtp->dt_procs, dpr,
319			    dpr->dpr_errmsg);
320
321		break;
322	case RD_PREINIT:
323		Pupdate_syms(dpr->dpr_proc);
324		dt_proc_stop(dpr, DT_PROC_STOP_PREINIT);
325		break;
326	case RD_POSTINIT:
327		Pupdate_syms(dpr->dpr_proc);
328		dt_proc_stop(dpr, DT_PROC_STOP_POSTINIT);
329		break;
330	}
331
332#if defined(__APPLE__)
333	// Take note of symbol owners (i.e. modules) already processed. */
334	if (!(dpr->dpr_stop & ~DT_PROC_STOP_IDLE))
335		Pcheckpoint_syms(dpr->dpr_proc);
336#endif /* __APPLE__ */
337}
338
339void
340dt_proc_rdwatch(dt_proc_t *dpr, rd_event_e event, const char *evname)
341{
342	rd_notify_t rdn;
343	rd_err_e err;
344
345	if ((err = rd_event_addr(dpr->dpr_rtld, event, &rdn)) != RD_OK) {
346		dt_dprintf("pid %d: failed to get event address for %s: %s\n",
347		    (int)dpr->dpr_pid, evname, rd_errstr(err));
348		return;
349	}
350
351	if (rdn.type != RD_NOTIFY_BPT) {
352		dt_dprintf("pid %d: event %s has unexpected type %d\n",
353		    (int)dpr->dpr_pid, evname, rdn.type);
354		return;
355	}
356
357	(void) dt_proc_bpcreate(dpr, rdn.u.bptaddr,
358	    (dt_bkpt_f *)dt_proc_rdevent, (void *)evname);
359}
360
361#if !defined(__APPLE__)
362/*
363 * Common code for enabling events associated with the run-time linker after
364 * attaching to a process or after a victim process completes an exec(2).
365 */
366static void
367dt_proc_attach(dt_proc_t *dpr, int exec)
368{
369	const pstatus_t *psp = Pstatus(dpr->dpr_proc);
370	rd_err_e err;
371	GElf_Sym sym;
372
373	assert(DT_MUTEX_HELD(&dpr->dpr_lock));
374
375	if (exec) {
376		if (psp->pr_lwp.pr_errno != 0)
377			return; /* exec failed: nothing needs to be done */
378
379		dt_proc_bpdestroy(dpr, B_FALSE);
380		Preset_maps(dpr->dpr_proc);
381	}
382
383	if ((dpr->dpr_rtld = Prd_agent(dpr->dpr_proc)) != NULL &&
384	    (err = rd_event_enable(dpr->dpr_rtld, B_TRUE)) == RD_OK) {
385		dt_proc_rdwatch(dpr, RD_PREINIT, "RD_PREINIT");
386		dt_proc_rdwatch(dpr, RD_POSTINIT, "RD_POSTINIT");
387		dt_proc_rdwatch(dpr, RD_DLACTIVITY, "RD_DLACTIVITY");
388	} else {
389		dt_dprintf("pid %d: failed to enable rtld events: %s\n",
390		    (int)dpr->dpr_pid, dpr->dpr_rtld ? rd_errstr(err) :
391		    "rtld_db agent initialization failed");
392	}
393
394	Pupdate_maps(dpr->dpr_proc);
395
396	if (Pxlookup_by_name(dpr->dpr_proc, LM_ID_BASE,
397	    "a.out", "main", &sym, NULL) == 0) {
398		(void) dt_proc_bpcreate(dpr, (uintptr_t)sym.st_value,
399		    (dt_bkpt_f *)dt_proc_bpmain, "a.out`main");
400	} else {
401		dt_dprintf("pid %d: failed to find a.out`main: %s\n",
402		    (int)dpr->dpr_pid, strerror(errno));
403	}
404}
405
406/*
407 * Wait for a stopped process to be set running again by some other debugger.
408 * This is typically not required by /proc-based debuggers, since the usual
409 * model is that one debugger controls one victim.  But DTrace, as usual, has
410 * its own needs: the stop() action assumes that prun(1) or some other tool
411 * will be applied to resume the victim process.  This could be solved by
412 * adding a PCWRUN directive to /proc, but that seems like overkill unless
413 * other debuggers end up needing this functionality, so we implement a cheap
414 * equivalent to PCWRUN using the set of existing kernel mechanisms.
415 *
416 * Our intent is really not just to wait for the victim to run, but rather to
417 * wait for it to run and then stop again for a reason other than the current
418 * PR_REQUESTED stop.  Since PCWSTOP/Pstopstatus() can be applied repeatedly
419 * to a stopped process and will return the same result without affecting the
420 * victim, we can just perform these operations repeatedly until Pstate()
421 * changes, the representative LWP ID changes, or the stop timestamp advances.
422 * dt_proc_control() will then rediscover the new state and continue as usual.
423 * When the process is still stopped in the same exact state, we sleep for a
424 * brief interval before waiting again so as not to spin consuming CPU cycles.
425 */
426static void
427dt_proc_waitrun(dt_proc_t *dpr)
428{
429	struct ps_prochandle *P = dpr->dpr_proc;
430	const lwpstatus_t *psp = &Pstatus(P)->pr_lwp;
431
432	int krflag = psp->pr_flags & (PR_KLC | PR_RLC);
433	timestruc_t tstamp = psp->pr_tstamp;
434	lwpid_t lwpid = psp->pr_lwpid;
435
436	const long wstop = PCWSTOP;
437	int pfd = Pctlfd(P);
438
439	assert(DT_MUTEX_HELD(&dpr->dpr_lock));
440	assert(psp->pr_flags & PR_STOPPED);
441	assert(Pstate(P) == PS_STOP);
442
443	/*
444	 * While we are waiting for the victim to run, clear PR_KLC and PR_RLC
445	 * so that if the libdtrace client is killed, the victim stays stopped.
446	 * dt_proc_destroy() will also observe this and perform PRELEASE_HANG.
447	 */
448	(void) Punsetflags(P, krflag);
449	Psync(P);
450
451	(void) pthread_mutex_unlock(&dpr->dpr_lock);
452
453	while (!dpr->dpr_quit) {
454		if (write(pfd, &wstop, sizeof (wstop)) == -1 && errno == EINTR)
455			continue; /* check dpr_quit and continue waiting */
456
457		(void) pthread_mutex_lock(&dpr->dpr_lock);
458		(void) Pstopstatus(P, PCNULL, 0);
459		psp = &Pstatus(P)->pr_lwp;
460
461		/*
462		 * If we've reached a new state, found a new representative, or
463		 * the stop timestamp has changed, restore PR_KLC/PR_RLC to its
464		 * original setting and then return with dpr_lock held.
465		 */
466		if (Pstate(P) != PS_STOP || psp->pr_lwpid != lwpid ||
467		    bcmp(&psp->pr_tstamp, &tstamp, sizeof (tstamp)) != 0) {
468			(void) Psetflags(P, krflag);
469			Psync(P);
470			return;
471		}
472
473		(void) pthread_mutex_unlock(&dpr->dpr_lock);
474		(void) poll(NULL, 0, MILLISEC / 2);
475	}
476
477	(void) pthread_mutex_lock(&dpr->dpr_lock);
478}
479#else
480// Moved dt_proc_attach() to dt_proc_apple.m
481#endif /* !__APPLE__ */
482
/*
 * Argument bundle handed to the control thread's start routine: it is filled
 * in by dt_proc_create_thread() and passed to pthread_create().
 */
typedef struct dt_proc_control_data {
	dtrace_hdl_t *dpcd_hdl;			/* DTrace handle */
	dt_proc_t *dpcd_proc;			/* process to control */
} dt_proc_control_data_t;
487
488/*
489 * Main loop for all victim process control threads.  We initialize all the
490 * appropriate /proc control mechanisms, and then enter a loop waiting for
491 * the process to stop on an event or die.  We process any events by calling
492 * appropriate subroutines, and exit when the victim dies or we lose control.
493 *
494 * The control thread synchronizes the use of dpr_proc with other libdtrace
495 * threads using dpr_lock.  We hold the lock for all of our operations except
496 * waiting while the process is running: this is accomplished by writing a
497 * PCWSTOP directive directly to the underlying /proc/<pid>/ctl file.  If the
498 * libdtrace client wishes to exit or abort our wait, SIGCANCEL can be used.
499 */
500#if !defined(__APPLE__)
/*
 * Control thread start routine.  'arg' points to a dt_proc_control_data_t
 * naming the DTrace handle and the dt_proc_t to control; both fields are read
 * into locals at entry.  Always returns NULL.
 */
static void *
dt_proc_control(void *arg)
{
	dt_proc_control_data_t *datap = arg;
	dtrace_hdl_t *dtp = datap->dpcd_hdl;
	dt_proc_t *dpr = datap->dpcd_proc;
	dt_proc_hash_t *dph = dpr->dpr_hdl->dt_procs;
	struct ps_prochandle *P = dpr->dpr_proc;

	int pfd = Pctlfd(P);
	int pid = dpr->dpr_pid;

	const long wstop = PCWSTOP;
	int notify = B_FALSE;	/* set when the victim is lost or has died */

	/*
	 * We disable the POSIX thread cancellation mechanism so that the
	 * client program using libdtrace can't accidentally cancel our thread.
	 * dt_proc_destroy() uses SIGCANCEL explicitly to simply poke us out
	 * of PCWSTOP with EINTR, at which point we will see dpr_quit and exit.
	 */
	(void) pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL);

	/*
	 * Set up the corresponding process for tracing by libdtrace.  We want
	 * to be able to catch breakpoints and efficiently single-step over
	 * them, and we need to enable librtld_db to watch libdl activity.
	 */
	(void) pthread_mutex_lock(&dpr->dpr_lock);

	(void) Punsetflags(P, PR_ASYNC);	/* require synchronous mode */
	(void) Psetflags(P, PR_BPTADJ);		/* always adjust eip on x86 */
	(void) Punsetflags(P, PR_FORK);		/* do not inherit on fork */

	(void) Pfault(P, FLTBPT, B_TRUE);	/* always trace breakpoints */
	(void) Pfault(P, FLTTRACE, B_TRUE);	/* always trace single-step */

	/*
	 * We must trace exit from exec() system calls so that if the exec is
	 * successful, we can reset our breakpoints and re-initialize libproc.
	 */
	(void) Psysexit(P, SYS_exec, B_TRUE);
	(void) Psysexit(P, SYS_execve, B_TRUE);

	/*
	 * We must trace entry and exit for fork() system calls in order to
	 * disable our breakpoints temporarily during the fork.  We do not set
	 * the PR_FORK flag, so if fork succeeds the child begins executing and
	 * does not inherit any other tracing behaviors or a control thread.
	 */
	(void) Psysentry(P, SYS_vfork, B_TRUE);
	(void) Psysexit(P, SYS_vfork, B_TRUE);
	(void) Psysentry(P, SYS_fork1, B_TRUE);
	(void) Psysexit(P, SYS_fork1, B_TRUE);
	(void) Psysentry(P, SYS_forkall, B_TRUE);
	(void) Psysexit(P, SYS_forkall, B_TRUE);
	(void) Psysentry(P, SYS_forksys, B_TRUE);
	(void) Psysexit(P, SYS_forksys, B_TRUE);

	Psync(P);				/* enable all /proc changes */
	dt_proc_attach(dpr, B_FALSE);		/* enable rtld breakpoints */

	/*
	 * If PR_KLC is set, we created the process; otherwise we grabbed it.
	 * Check for an appropriate stop request and wait for dt_proc_continue.
	 */
	if (Pstatus(P)->pr_flags & PR_KLC)
		dt_proc_stop(dpr, DT_PROC_STOP_CREATE);
	else
		dt_proc_stop(dpr, DT_PROC_STOP_GRAB);

	/* A failure to resume the victim is logged but is not fatal here. */
	if (Psetrun(P, 0, 0) == -1) {
		dt_dprintf("pid %d: failed to set running: %s\n",
		    (int)dpr->dpr_pid, strerror(errno));
	}

	(void) pthread_mutex_unlock(&dpr->dpr_lock);

	/*
	 * Wait for the process corresponding to this control thread to stop,
	 * process the event, and then set it running again.  We want to sleep
	 * with dpr_lock *unheld* so that other parts of libdtrace can use the
	 * ps_prochandle in the meantime (e.g. ustack()).  To do this, we write
	 * a PCWSTOP directive directly to the underlying /proc/<pid>/ctl file.
	 * Once the process stops, we wake up, grab dpr_lock, and then call
	 * Pwait() (which will return immediately) and do our processing.
	 */
	while (!dpr->dpr_quit) {
		const lwpstatus_t *psp;

		if (write(pfd, &wstop, sizeof (wstop)) == -1 && errno == EINTR)
			continue; /* check dpr_quit and continue waiting */

		(void) pthread_mutex_lock(&dpr->dpr_lock);
		/*
		 * pwait_locked is also reached from the PS_LOST case below
		 * after a successful Preopen(); dpr_lock is held on entry.
		 */
pwait_locked:
		if (Pstopstatus(P, PCNULL, 0) == -1 && errno == EINTR) {
			(void) pthread_mutex_unlock(&dpr->dpr_lock);
			continue; /* check dpr_quit and continue waiting */
		}

		switch (Pstate(P)) {
		case PS_STOP:
			psp = &Pstatus(P)->pr_lwp;

			dt_dprintf("pid %d: proc stopped showing %d/%d\n",
			    pid, psp->pr_why, psp->pr_what);

			/*
			 * If the process stops showing PR_REQUESTED, then the
			 * DTrace stop() action was applied to it or another
			 * debugging utility (e.g. pstop(1)) asked it to stop.
			 * In either case, the user's intention is for the
			 * process to remain stopped until another external
			 * mechanism (e.g. prun(1)) is applied.  So instead of
			 * setting the process running ourself, we wait for
			 * someone else to do so.  Once that happens, we return
			 * to our normal loop waiting for an event of interest.
			 * dt_proc_waitrun() returns with dpr_lock held, so we
			 * drop it again before going back to the top.
			 */
			if (psp->pr_why == PR_REQUESTED) {
				dt_proc_waitrun(dpr);
				(void) pthread_mutex_unlock(&dpr->dpr_lock);
				continue;
			}

			/*
			 * If the process stops showing one of the events that
			 * we are tracing, perform the appropriate response.
			 * Note that we ignore PR_SUSPENDED, PR_CHECKPOINT, and
			 * PR_JOBCONTROL by design: if one of these conditions
			 * occurs, we will fall through to Psetrun() but the
			 * process will remain stopped in the kernel by the
			 * corresponding mechanism (e.g. job control stop).
			 */
			if (psp->pr_why == PR_FAULTED && psp->pr_what == FLTBPT)
				dt_proc_bpmatch(dtp, dpr);
			else if (psp->pr_why == PR_SYSENTRY &&
			    IS_SYS_FORK(psp->pr_what))
				dt_proc_bpdisable(dpr);
			else if (psp->pr_why == PR_SYSEXIT &&
			    IS_SYS_FORK(psp->pr_what))
				dt_proc_bpenable(dpr);
			else if (psp->pr_why == PR_SYSEXIT &&
			    IS_SYS_EXEC(psp->pr_what))
				dt_proc_attach(dpr, B_TRUE);
			break;

		case PS_LOST:
			/* Try to regain control before giving up. */
			if (Preopen(P) == 0)
				goto pwait_locked;

			dt_dprintf("pid %d: proc lost: %s\n",
			    pid, strerror(errno));

			dpr->dpr_quit = B_TRUE;
			notify = B_TRUE;
			break;

		case PS_UNDEAD:
			dt_dprintf("pid %d: proc died\n", pid);
			dpr->dpr_quit = B_TRUE;
			notify = B_TRUE;
			break;
		}

		/* Resume the victim unless it has died. */
		if (Pstate(P) != PS_UNDEAD && Psetrun(P, 0, 0) == -1) {
			dt_dprintf("pid %d: failed to set running: %s\n",
			    (int)dpr->dpr_pid, strerror(errno));
		}

		(void) pthread_mutex_unlock(&dpr->dpr_lock);
	}

	/*
	 * If the control thread detected PS_UNDEAD or PS_LOST, then enqueue
	 * the dt_proc_t structure on the dt_proc_hash_t notification list.
	 */
	if (notify)
		dt_proc_notify(dtp, dph, dpr, NULL);

	/*
	 * Destroy and remove any remaining breakpoints, set dpr_done and clear
	 * dpr_tid to indicate the control thread has exited, and notify any
	 * waiting thread in dt_proc_destroy() that we have successfully exited.
	 */
	(void) pthread_mutex_lock(&dpr->dpr_lock);

	dt_proc_bpdestroy(dpr, B_TRUE);
	dpr->dpr_done = B_TRUE;
	dpr->dpr_tid = 0;

	(void) pthread_cond_broadcast(&dpr->dpr_cv);
	(void) pthread_mutex_unlock(&dpr->dpr_lock);

	return (NULL);
}
696#else
697// Moved dt_proc_control() to dt_proc_apple.m
698#endif /* __APPLE__ */
699
700/*PRINTFLIKE3*/
701static struct ps_prochandle *
702dt_proc_error(dtrace_hdl_t *dtp, dt_proc_t *dpr, const char *format, ...)
703{
704	va_list ap;
705
706	va_start(ap, format);
707	dt_set_errmsg(dtp, NULL, NULL, NULL, 0, format, ap);
708	va_end(ap);
709
710	if (dpr->dpr_proc != NULL)
711		Prelease(dpr->dpr_proc, 0);
712
713	dt_free(dtp, dpr);
714	(void) dt_set_errno(dtp, EDT_COMPILER);
715	return (NULL);
716}
717
718dt_proc_t *
719dt_proc_lookup(dtrace_hdl_t *dtp, struct ps_prochandle *P, int remove)
720{
721	dt_proc_hash_t *dph = dtp->dt_procs;
722	pid_t pid = Pstatus(P)->pr_pid;
723	dt_proc_t *dpr, **dpp = &dph->dph_hash[pid & (dph->dph_hashlen - 1)];
724
725	for (dpr = *dpp; dpr != NULL; dpr = dpr->dpr_hash) {
726		if (dpr->dpr_pid == pid)
727			break;
728		else
729			dpp = &dpr->dpr_hash;
730	}
731
732	assert(dpr != NULL);
733	assert(dpr->dpr_proc == P);
734
735	if (remove)
736		*dpp = dpr->dpr_hash; /* remove from pid hash chain */
737
738	return (dpr);
739}
740
741static void
742dt_proc_destroy(dtrace_hdl_t *dtp, struct ps_prochandle *P)
743{
744	dt_proc_t *dpr = dt_proc_lookup(dtp, P, B_FALSE);
745	dt_proc_hash_t *dph = dtp->dt_procs;
746	dt_proc_notify_t *npr, **npp;
747	int rflag;
748
749	assert(dpr != NULL);
750
751	/*
752	 * If neither PR_KLC nor PR_RLC is set, then the process is stopped by
753	 * an external debugger and we were waiting in dt_proc_waitrun().
754	 * Leave the process in this condition using PRELEASE_HANG.
755	 */
756	if (!(Pstatus(dpr->dpr_proc)->pr_flags & (PR_KLC | PR_RLC))) {
757		dt_dprintf("abandoning pid %d\n", (int)dpr->dpr_pid);
758		rflag = PRELEASE_HANG;
759	} else {
760		dt_dprintf("releasing pid %d\n", (int)dpr->dpr_pid);
761		rflag = 0; /* apply kill or run-on-last-close */
762	}
763
764	if (dpr->dpr_tid) {
765		/*
766		 * Set the dpr_quit flag to tell the daemon thread to exit.  We
767		 * send it a SIGCANCEL to poke it out of PCWSTOP or any other
768		 * long-term /proc system call.  Our daemon threads have POSIX
769		 * cancellation disabled, so EINTR will be the only effect.  We
770		 * then wait for dpr_done to indicate the thread has exited.
771		 *
772		 * We can't use pthread_kill() to send SIGCANCEL because the
773		 * interface forbids it and we can't use pthread_cancel()
774		 * because with cancellation disabled it won't actually
775		 * send SIGCANCEL to the target thread, so we use _lwp_kill()
776		 * to do the job.  This is all built on evil knowledge of
777		 * the details of the cancellation mechanism in libc.
778		 */
779		(void) pthread_mutex_lock(&dpr->dpr_lock);
780		dpr->dpr_quit = B_TRUE;
781#if !defined(__APPLE__)
782		(void) _lwp_kill(dpr->dpr_tid, SIGCANCEL);
783#else
784		Pcreate_async_proc_activity(dpr->dpr_proc, RD_NONE);
785#endif /* __APPLE__ */
786
787		/*
788		 * If the process is currently idling in dt_proc_stop(), re-
789		 * enable breakpoints and poke it into running again.
790		 */
791		if (dpr->dpr_stop & DT_PROC_STOP_IDLE) {
792			dt_proc_bpenable(dpr);
793			dpr->dpr_stop &= ~DT_PROC_STOP_IDLE;
794			(void) pthread_cond_broadcast(&dpr->dpr_cv);
795		}
796
797		while (!dpr->dpr_done)
798			(void) pthread_cond_wait(&dpr->dpr_cv, &dpr->dpr_lock);
799
800		(void) pthread_mutex_unlock(&dpr->dpr_lock);
801	}
802
803	/*
804	 * Before we free the process structure, remove this dt_proc_t from the
805	 * lookup hash, and then walk the dt_proc_hash_t's notification list
806	 * and remove this dt_proc_t if it is enqueued.
807	 */
808	(void) pthread_mutex_lock(&dph->dph_lock);
809	(void) dt_proc_lookup(dtp, P, B_TRUE);
810	npp = &dph->dph_notify;
811
812	while ((npr = *npp) != NULL) {
813		if (npr->dprn_dpr == dpr) {
814			*npp = npr->dprn_next;
815			dt_free(dtp, npr);
816		} else {
817			npp = &npr->dprn_next;
818		}
819	}
820
821	(void) pthread_mutex_unlock(&dph->dph_lock);
822
823	/*
824	 * Remove the dt_proc_list from the LRU list, release the underlying
825	 * libproc handle, and free our dt_proc_t data structure.
826	 */
827	if (dpr->dpr_cacheable) {
828		assert(dph->dph_lrucnt != 0);
829		dph->dph_lrucnt--;
830	}
831
832	dt_list_delete(&dph->dph_lrulist, dpr);
833	Prelease(dpr->dpr_proc, rflag);
834	dt_free(dtp, dpr);
835}
836
837static int
838dt_proc_create_thread(dtrace_hdl_t *dtp, dt_proc_t *dpr, uint_t stop)
839{
840	dt_proc_control_data_t data;
841	sigset_t nset, oset;
842	pthread_attr_t a;
843	int err;
844
845	(void) pthread_mutex_lock(&dpr->dpr_lock);
846	dpr->dpr_stop |= stop; /* set bit for initial rendezvous */
847
848	(void) pthread_attr_init(&a);
849	(void) pthread_attr_setdetachstate(&a, PTHREAD_CREATE_DETACHED);
850
851	(void) sigfillset(&nset);
852	(void) sigdelset(&nset, SIGABRT);	/* unblocked for assert() */
853#if !defined(__APPLE__)
854	(void) sigdelset(&nset, SIGCANCEL);	/* see dt_proc_destroy() */
855#endif /* __APPLE__ */
856
857	data.dpcd_hdl = dtp;
858	data.dpcd_proc = dpr;
859
860	(void) pthread_sigmask(SIG_SETMASK, &nset, &oset);
861	err = pthread_create(&dpr->dpr_tid, &a, dt_proc_control, &data);
862	(void) pthread_sigmask(SIG_SETMASK, &oset, NULL);
863
864	/*
865	 * If the control thread was created, then wait on dpr_cv for either
866	 * dpr_done to be set (the victim died or the control thread failed)
867	 * or DT_PROC_STOP_IDLE to be set, indicating that the victim is now
868	 * stopped by /proc and the control thread is at the rendezvous event.
869	 * On success, we return with the process and control thread stopped:
870	 * the caller can then apply dt_proc_continue() to resume both.
871	 */
872	if (err == 0) {
873		while (!dpr->dpr_done && !(dpr->dpr_stop & DT_PROC_STOP_IDLE))
874			(void) pthread_cond_wait(&dpr->dpr_cv, &dpr->dpr_lock);
875
876		/*
877		 * If dpr_done is set, the control thread aborted before it
878		 * reached the rendezvous event.  This is either due to PS_LOST
879		 * or PS_UNDEAD (i.e. the process died).  We try to provide a
880		 * small amount of useful information to help figure it out.
881		 */
882		if (dpr->dpr_done) {
883#if !defined(__APPLE__)
884			const psinfo_t *prp = Ppsinfo(dpr->dpr_proc);
885			int stat = prp ? prp->pr_wstat : 0;
886#else
887			int stat = 0;
888#endif /* __APPLE__ */
889			int pid = dpr->dpr_pid;
890
891			if (Pstate(dpr->dpr_proc) == PS_LOST) {
892				(void) dt_proc_error(dpr->dpr_hdl, dpr,
893				    "failed to control pid %d: process exec'd "
894				    "set-id or unobservable program\n", pid);
895			} else if (WIFSIGNALED(stat)) {
896				(void) dt_proc_error(dpr->dpr_hdl, dpr,
897				    "failed to control pid %d: process died "
898				    "from signal %d\n", pid, WTERMSIG(stat));
899			} else {
900				(void) dt_proc_error(dpr->dpr_hdl, dpr,
901				    "failed to control pid %d: process exited "
902				    "with status %d\n", pid, WEXITSTATUS(stat));
903			}
904
905			err = ESRCH; /* cause grab() or create() to fail */
906		}
907	} else {
908		(void) dt_proc_error(dpr->dpr_hdl, dpr,
909		    "failed to create control thread for process-id %d: %s\n",
910		    (int)dpr->dpr_pid, strerror(err));
911	}
912
913	(void) pthread_mutex_unlock(&dpr->dpr_lock);
914	(void) pthread_attr_destroy(&a);
915
916	return (err);
917}
918
919struct ps_prochandle *
920dt_proc_create(dtrace_hdl_t *dtp, const char *file, char *const *argv)
921{
922	dt_proc_hash_t *dph = dtp->dt_procs;
923	dt_proc_t *dpr;
924	int err;
925
926	if ((dpr = dt_zalloc(dtp, sizeof (dt_proc_t))) == NULL)
927		return (NULL); /* errno is set for us */
928
929	(void) pthread_mutex_init(&dpr->dpr_lock, NULL);
930	(void) pthread_cond_init(&dpr->dpr_cv, NULL);
931#if !defined(__APPLE__)
932	if ((dpr->dpr_proc = Pcreate(file, argv, &err, NULL, 0)) == NULL) {
933#else
934	if ((dpr->dpr_proc = Pcreate(file, argv, &err, NULL, 0, dtp->dt_arch)) == NULL) {
935#endif
936		return (dt_proc_error(dtp, dpr,
937		    "failed to execute %s: %s\n", file, Pcreate_error(err)));
938	}
939
940	dpr->dpr_hdl = dtp;
941	dpr->dpr_pid = Pstatus(dpr->dpr_proc)->pr_pid;
942
943	(void) Punsetflags(dpr->dpr_proc, PR_RLC);
944	(void) Psetflags(dpr->dpr_proc, PR_KLC);
945
946	if (dt_proc_create_thread(dtp, dpr, dtp->dt_prcmode) != 0)
947		return (NULL); /* dt_proc_error() has been called for us */
948
949	dpr->dpr_hash = dph->dph_hash[dpr->dpr_pid & (dph->dph_hashlen - 1)];
950	dph->dph_hash[dpr->dpr_pid & (dph->dph_hashlen - 1)] = dpr;
951	dt_list_prepend(&dph->dph_lrulist, dpr);
952
953	dt_dprintf("created pid %d\n", (int)dpr->dpr_pid);
954	dpr->dpr_refs++;
955
956	return (dpr->dpr_proc);
957}
958
959struct ps_prochandle *
960dt_proc_grab(dtrace_hdl_t *dtp, pid_t pid, int flags, int nomonitor)
961{
962	dt_proc_hash_t *dph = dtp->dt_procs;
963	uint_t h = pid & (dph->dph_hashlen - 1);
964	dt_proc_t *dpr, *opr;
965	int err;
966
967	/*
968	 * Search the hash table for the pid.  If it is already grabbed or
969	 * created, move the handle to the front of the lrulist, increment
970	 * the reference count, and return the existing ps_prochandle.
971	 */
972	for (dpr = dph->dph_hash[h]; dpr != NULL; dpr = dpr->dpr_hash) {
973		if (dpr->dpr_pid == pid && !dpr->dpr_stale) {
974			/*
975			 * If the cached handle was opened read-only and
976			 * this request is for a writeable handle, mark
977			 * the cached handle as stale and open a new handle.
978			 * Since it's stale, unmark it as cacheable.
979			 */
980			if (dpr->dpr_rdonly && !(flags & PGRAB_RDONLY)) {
981				dt_dprintf("upgrading pid %d\n", (int)pid);
982				dpr->dpr_stale = B_TRUE;
983				dpr->dpr_cacheable = B_FALSE;
984				dph->dph_lrucnt--;
985				break;
986			}
987
988			dt_dprintf("grabbed pid %d (cached)\n", (int)pid);
989			dt_list_delete(&dph->dph_lrulist, dpr);
990			dt_list_prepend(&dph->dph_lrulist, dpr);
991			dpr->dpr_refs++;
992			return (dpr->dpr_proc);
993		}
994	}
995
996	if ((dpr = dt_zalloc(dtp, sizeof (dt_proc_t))) == NULL)
997		return (NULL); /* errno is set for us */
998
999	(void) pthread_mutex_init(&dpr->dpr_lock, NULL);
1000	(void) pthread_cond_init(&dpr->dpr_cv, NULL);
1001
1002	if ((dpr->dpr_proc = Pgrab(pid, flags, &err)) == NULL) {
1003		return (dt_proc_error(dtp, dpr,
1004		    "failed to grab pid %d: %s\n", (int)pid, Pgrab_error(err)));
1005	}
1006
1007	dpr->dpr_hdl = dtp;
1008	dpr->dpr_pid = pid;
1009
1010	(void) Punsetflags(dpr->dpr_proc, PR_KLC);
1011	(void) Psetflags(dpr->dpr_proc, PR_RLC);
1012
1013	/*
1014	 * If we are attempting to grab the process without a monitor
1015	 * thread, then mark the process cacheable only if it's being
1016	 * grabbed read-only.  If we're currently caching more process
1017	 * handles than dph_lrulim permits, attempt to find the
1018	 * least-recently-used handle that is currently unreferenced and
1019	 * release it from the cache.  Otherwise we are grabbing the process
1020	 * for control: create a control thread for this process and store
1021	 * its ID in dpr->dpr_tid.
1022	 */
1023	if (nomonitor || (flags & PGRAB_RDONLY)) {
1024		if (dph->dph_lrucnt >= dph->dph_lrulim) {
1025			for (opr = dt_list_prev(&dph->dph_lrulist);
1026			    opr != NULL; opr = dt_list_prev(opr)) {
1027				if (opr->dpr_cacheable && opr->dpr_refs == 0) {
1028					dt_proc_destroy(dtp, opr->dpr_proc);
1029					break;
1030				}
1031			}
1032		}
1033
1034		if (flags & PGRAB_RDONLY) {
1035			dpr->dpr_cacheable = B_TRUE;
1036			dpr->dpr_rdonly = B_TRUE;
1037			dph->dph_lrucnt++;
1038		}
1039
1040	} else if (dt_proc_create_thread(dtp, dpr, DT_PROC_STOP_GRAB) != 0)
1041		return (NULL); /* dt_proc_error() has been called for us */
1042
1043	dpr->dpr_hash = dph->dph_hash[h];
1044	dph->dph_hash[h] = dpr;
1045	dt_list_prepend(&dph->dph_lrulist, dpr);
1046
1047	dt_dprintf("grabbed pid %d\n", (int)pid);
1048	dpr->dpr_refs++;
1049
1050	return (dpr->dpr_proc);
1051}
1052
1053void
1054dt_proc_release(dtrace_hdl_t *dtp, struct ps_prochandle *P)
1055{
1056	dt_proc_t *dpr = dt_proc_lookup(dtp, P, B_FALSE);
1057	dt_proc_hash_t *dph = dtp->dt_procs;
1058
1059	assert(dpr != NULL);
1060	assert(dpr->dpr_refs != 0);
1061
1062	if (--dpr->dpr_refs == 0 &&
1063	    (!dpr->dpr_cacheable || dph->dph_lrucnt > dph->dph_lrulim))
1064		dt_proc_destroy(dtp, P);
1065}
1066
1067void
1068dt_proc_continue(dtrace_hdl_t *dtp, struct ps_prochandle *P)
1069{
1070	dt_proc_t *dpr = dt_proc_lookup(dtp, P, B_FALSE);
1071
1072	(void) pthread_mutex_lock(&dpr->dpr_lock);
1073
1074	if (dpr->dpr_stop & DT_PROC_STOP_IDLE) {
1075		dpr->dpr_stop &= ~DT_PROC_STOP_IDLE;
1076		(void) pthread_cond_broadcast(&dpr->dpr_cv);
1077	}
1078
1079	(void) pthread_mutex_unlock(&dpr->dpr_lock);
1080}
1081
1082void
1083dt_proc_lock(dtrace_hdl_t *dtp, struct ps_prochandle *P)
1084{
1085	dt_proc_t *dpr = dt_proc_lookup(dtp, P, B_FALSE);
1086	int err = pthread_mutex_lock(&dpr->dpr_lock);
1087	assert(err == 0); /* check for recursion */
1088}
1089
1090void
1091dt_proc_unlock(dtrace_hdl_t *dtp, struct ps_prochandle *P)
1092{
1093	dt_proc_t *dpr = dt_proc_lookup(dtp, P, B_FALSE);
1094	int err = pthread_mutex_unlock(&dpr->dpr_lock);
1095	assert(err == 0); /* check for unheld lock */
1096}
1097
1098void
1099dt_proc_hash_create(dtrace_hdl_t *dtp)
1100{
1101	if ((dtp->dt_procs = dt_zalloc(dtp, sizeof (dt_proc_hash_t) +
1102	    sizeof (dt_proc_t *) * _dtrace_pidbuckets - 1)) != NULL) {
1103
1104		(void) pthread_mutex_init(&dtp->dt_procs->dph_lock, NULL);
1105		(void) pthread_cond_init(&dtp->dt_procs->dph_cv, NULL);
1106
1107		dtp->dt_procs->dph_hashlen = _dtrace_pidbuckets;
1108		dtp->dt_procs->dph_lrulim = _dtrace_pidlrulim;
1109	}
1110}
1111
1112void
1113dt_proc_hash_destroy(dtrace_hdl_t *dtp)
1114{
1115	dt_proc_hash_t *dph = dtp->dt_procs;
1116	dt_proc_t *dpr;
1117
1118	while ((dpr = dt_list_next(&dph->dph_lrulist)) != NULL)
1119		dt_proc_destroy(dtp, dpr->dpr_proc);
1120
1121	dtp->dt_procs = NULL;
1122	dt_free(dtp, dph);
1123}
1124
1125struct ps_prochandle *
1126dtrace_proc_create(dtrace_hdl_t *dtp, const char *file, char *const *argv)
1127{
1128	dt_ident_t *idp = dt_idhash_lookup(dtp->dt_macros, "target");
1129	struct ps_prochandle *P = dt_proc_create(dtp, file, argv);
1130
1131	if (P != NULL && idp != NULL && idp->di_id == 0)
1132		idp->di_id = Pstatus(P)->pr_pid; /* $target = created pid */
1133
1134	return (P);
1135}
1136
1137struct ps_prochandle *
1138dtrace_proc_grab(dtrace_hdl_t *dtp, pid_t pid, int flags)
1139{
1140	dt_ident_t *idp = dt_idhash_lookup(dtp->dt_macros, "target");
1141	struct ps_prochandle *P = dt_proc_grab(dtp, pid, flags, 0);
1142
1143	if (P != NULL && idp != NULL && idp->di_id == 0)
1144		idp->di_id = pid; /* $target = grabbed pid */
1145
1146	return (P);
1147}
1148
1149void
1150dtrace_proc_release(dtrace_hdl_t *dtp, struct ps_prochandle *P)
1151{
1152	dt_proc_release(dtp, P);
1153}
1154
1155void
1156dtrace_proc_continue(dtrace_hdl_t *dtp, struct ps_prochandle *P)
1157{
1158	dt_proc_continue(dtp, P);
1159}
1160