1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26/* #pragma ident	"@(#)systrace.c	1.6	06/09/19 SMI" */
27
28#if !defined(__APPLE__)
29#include <sys/dtrace.h>
30#include <sys/systrace.h>
31#include <sys/stat.h>
32#include <sys/systm.h>
33#include <sys/conf.h>
34#include <sys/ddi.h>
35#include <sys/sunddi.h>
36#include <sys/atomic.h>
37#define	SYSTRACE_ARTIFICIAL_FRAMES	1
38#else
39
40#ifdef KERNEL
41#ifndef _KERNEL
42#define _KERNEL /* Solaris vs. Darwin */
43#endif
44#endif
45
46#include <kern/thread.h>
47#include <mach/thread_status.h>
48
49/* XXX All of these should really be derived from syscall_sw.h */
50#if defined (__x86_64__)
51#define SYSCALL_CLASS_SHIFT 24
52#define SYSCALL_CLASS_MASK  (0xFF << SYSCALL_CLASS_SHIFT)
53#define SYSCALL_NUMBER_MASK (~SYSCALL_CLASS_MASK)
54#define I386_SYSCALL_NUMBER_MASK (0xFFFF)
55#endif
56
57#include <sys/param.h>
58#include <sys/systm.h>
59#include <sys/proc.h>
60#include <sys/errno.h>
61#include <sys/ioctl.h>
62#include <sys/conf.h>
63#include <sys/fcntl.h>
64#include <miscfs/devfs/devfs.h>
65
66#include <sys/dtrace.h>
67#include <sys/dtrace_impl.h>
68#include "systrace.h"
69#include <sys/stat.h>
70#include <sys/systm.h>
71#include <sys/conf.h>
72#include <sys/user.h>
73
74#include <machine/pal_routines.h>
75
76#if defined (__x86_64__)
77#define	SYSTRACE_ARTIFICIAL_FRAMES	2
78#define MACHTRACE_ARTIFICIAL_FRAMES 3
79#else
80#error Unknown Architecture
81#endif
82
83#include <sys/sysent.h>
84#define sy_callc sy_call /* Map Solaris slot name to Darwin's */
85#define NSYSCALL nsysent /* and is less than 500 or so */
86
87extern const char *syscallnames[];
88
89#include <sys/dtrace_glue.h>
90#define casptr dtrace_casptr
91#define membar_enter dtrace_membar_producer
92
93#define LOADABLE_SYSCALL(a) 0 /* Not pertinent to Darwin. */
94#define LOADED_SYSCALL(a) 1 /* Not pertinent to Darwin. */
95
96extern lck_attr_t* dtrace_lck_attr;
97extern lck_grp_t* dtrace_lck_grp;
98static lck_mtx_t	dtrace_systrace_lock;		/* probe state lock */
99
100systrace_sysent_t *systrace_sysent = NULL;
101void (*systrace_probe)(dtrace_id_t, uint64_t, uint64_t, uint64_t, uint64_t, uint64_t);
102
103static uint64_t systrace_getarg(void *, dtrace_id_t, void *, int, int);
104
105void
106systrace_stub(dtrace_id_t id, uint64_t arg0, uint64_t arg1,
107    uint64_t arg2, uint64_t arg3, uint64_t arg4)
108{
109#pragma unused(id,arg0,arg1,arg2,arg3,arg4)
110}
111
112int32_t
113dtrace_systrace_syscall(struct proc *pp, void *uap, int *rv)
114{
115	unsigned short      code;	/* The system call number */
116
117	systrace_sysent_t *sy;
118	dtrace_id_t id;
119	int32_t rval;
120#if 0 /* XXX */
121	proc_t *p;
122#endif
123	syscall_arg_t *ip = (syscall_arg_t *)uap;
124
125#if defined (__x86_64__)
126	{
127		pal_register_cache_state(current_thread(), VALID);
128		x86_saved_state_t   *tagged_regs = (x86_saved_state_t *)find_user_regs(current_thread());
129
130		if (is_saved_state64(tagged_regs)) {
131			x86_saved_state64_t *regs = saved_state64(tagged_regs);
132			code = regs->rax & SYSCALL_NUMBER_MASK;
133			/*
134			 * Check for indirect system call... system call number
135			 * passed as 'arg0'
136			 */
137			if (code == 0) {
138				code = regs->rdi;
139			}
140		} else {
141			code = saved_state32(tagged_regs)->eax & I386_SYSCALL_NUMBER_MASK;
142
143			if (code == 0) {
144				vm_offset_t params = (vm_offset_t) (saved_state32(tagged_regs)->uesp + sizeof (int));
145				code = fuword(params);
146			}
147		}
148	}
149#else
150#error Unknown Architecture
151#endif
152
153	// Bounds "check" the value of code a la unix_syscall
154	sy = (code >= NUM_SYSENT) ? &systrace_sysent[63] : &systrace_sysent[code];
155
156	if ((id = sy->stsy_entry) != DTRACE_IDNONE) {
157		uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());
158		if (uthread)
159			uthread->t_dtrace_syscall_args = (void *)ip;
160
161		if (ip)
162			(*systrace_probe)(id, *ip, *(ip+1), *(ip+2), *(ip+3), *(ip+4));
163		else
164			(*systrace_probe)(id, 0, 0, 0, 0, 0);
165
166		if (uthread)
167			uthread->t_dtrace_syscall_args = (void *)0;
168	}
169
170#if 0 /* XXX */
171	/*
172	 * We want to explicitly allow DTrace consumers to stop a process
173	 * before it actually executes the meat of the syscall.
174	 */
175	p = ttoproc(curthread);
176	mutex_enter(&p->p_lock);
177	if (curthread->t_dtrace_stop && !curthread->t_lwp->lwp_nostop) {
178		curthread->t_dtrace_stop = 0;
179		stop(PR_REQUESTED, 0);
180	}
181	mutex_exit(&p->p_lock);
182#endif
183
184	rval = (*sy->stsy_underlying)(pp, uap, rv);
185
186	if ((id = sy->stsy_return) != DTRACE_IDNONE) {
187		uint64_t munged_rv0, munged_rv1;
188    	uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());
189
190		if (uthread)
191			uthread->t_dtrace_errno = rval; /* Establish t_dtrace_errno now in case this enabling refers to it. */
192
193		/*
194	 	 * "Decode" rv for use in the call to dtrace_probe()
195	 	 */
196		if (rval == ERESTART) {
197			munged_rv0 = -1LL; /* System call will be reissued in user mode. Make DTrace report a -1 return. */
198			munged_rv1 = -1LL;
199		} else if (rval != EJUSTRETURN) {
200			if (rval) {
201				munged_rv0 = -1LL; /* Mimic what libc will do. */
202				munged_rv1 = -1LL;
203			} else {
204				switch (sy->stsy_return_type) {
205				case _SYSCALL_RET_INT_T:
206					munged_rv0 = rv[0];
207					munged_rv1 = rv[1];
208					break;
209				case _SYSCALL_RET_UINT_T:
210					munged_rv0 = ((u_int)rv[0]);
211					munged_rv1 = ((u_int)rv[1]);
212					break;
213				case _SYSCALL_RET_OFF_T:
214				case _SYSCALL_RET_UINT64_T:
215					munged_rv0 = *(u_int64_t *)rv;
216					munged_rv1 = 0LL;
217					break;
218				case _SYSCALL_RET_ADDR_T:
219				case _SYSCALL_RET_SIZE_T:
220				case _SYSCALL_RET_SSIZE_T:
221					munged_rv0 = *(user_addr_t *)rv;
222					munged_rv1 = 0LL;
223					break;
224				case _SYSCALL_RET_NONE:
225					munged_rv0 = 0LL;
226					munged_rv1 = 0LL;
227					break;
228				default:
229					munged_rv0 = 0LL;
230					munged_rv1 = 0LL;
231					break;
232				}
233			}
234		} else {
235			munged_rv0 = 0LL;
236			munged_rv1 = 0LL;
237		}
238
239		/*
240		 * <http://mail.opensolaris.org/pipermail/dtrace-discuss/2007-January/003276.html> says:
241		 *
242		 * "This is a bit of an historical artifact. At first, the syscall provider just
243		 * had its return value in arg0, and the fbt and pid providers had their return
244		 * values in arg1 (so that we could use arg0 for the offset of the return site).
245		 *
246		 * We inevitably started writing scripts where we wanted to see the return
247		 * values from probes in all three providers, and we made this script easier
248		 * to write by replicating the syscall return values in arg1 to match fbt and
249		 * pid. We debated briefly about removing the return value from arg0, but
250		 * decided that it would be less confusing to have the same data in two places
251		 * than to have some non-helpful, non-intuitive value in arg0.
252		 *
253		 * This change was made 4/23/2003 according to the DTrace project's putback log."
254		 */
255		(*systrace_probe)(id, munged_rv0, munged_rv0, munged_rv1, (uint64_t)rval, 0);
256	}
257
258	return (rval);
259}
260
261void
262dtrace_systrace_syscall_return(unsigned short code, int rval, int *rv)
263{
264	systrace_sysent_t *sy;
265	dtrace_id_t id;
266
267	// Bounds "check" the value of code a la unix_syscall_return
268	sy = (code >= NUM_SYSENT) ? &systrace_sysent[63] : &systrace_sysent[code];
269
270	if ((id = sy->stsy_return) != DTRACE_IDNONE) {
271		uint64_t munged_rv0, munged_rv1;
272    	uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());
273
274		if (uthread)
275			uthread->t_dtrace_errno = rval; /* Establish t_dtrace_errno now in case this enabling refers to it. */
276
277		/*
278	 	 * "Decode" rv for use in the call to dtrace_probe()
279	 	 */
280		if (rval == ERESTART) {
281			munged_rv0 = -1LL; /* System call will be reissued in user mode. Make DTrace report a -1 return. */
282			munged_rv1 = -1LL;
283		} else if (rval != EJUSTRETURN) {
284			if (rval) {
285				munged_rv0 = -1LL; /* Mimic what libc will do. */
286				munged_rv1 = -1LL;
287			} else {
288				switch (sy->stsy_return_type) {
289				case _SYSCALL_RET_INT_T:
290					munged_rv0 = rv[0];
291					munged_rv1 = rv[1];
292					break;
293				case _SYSCALL_RET_UINT_T:
294					munged_rv0 = ((u_int)rv[0]);
295					munged_rv1 = ((u_int)rv[1]);
296					break;
297				case _SYSCALL_RET_OFF_T:
298				case _SYSCALL_RET_UINT64_T:
299					munged_rv0 = *(u_int64_t *)rv;
300					munged_rv1 = 0LL;
301					break;
302				case _SYSCALL_RET_ADDR_T:
303				case _SYSCALL_RET_SIZE_T:
304				case _SYSCALL_RET_SSIZE_T:
305					munged_rv0 = *(user_addr_t *)rv;
306					munged_rv1 = 0LL;
307					break;
308				case _SYSCALL_RET_NONE:
309					munged_rv0 = 0LL;
310					munged_rv1 = 0LL;
311					break;
312				default:
313					munged_rv0 = 0LL;
314					munged_rv1 = 0LL;
315					break;
316				}
317			}
318		} else {
319			munged_rv0 = 0LL;
320			munged_rv1 = 0LL;
321		}
322
323		(*systrace_probe)(id, munged_rv0, munged_rv0, munged_rv1, (uint64_t)rval, 0);
324	}
325}
326#endif /* __APPLE__ */
327
/*
 * Encoding of a probe's private argument (parg): the low SYSTRACE_SHIFT
 * bits hold the system call number, and the bit just above them is set for
 * an "entry" probe and clear for a "return" probe.
 */
#define	SYSTRACE_SHIFT			16
/* Non-zero when the encoded value denotes an entry probe. */
#define	SYSTRACE_ISENTRY(x)		((int)(x) >> SYSTRACE_SHIFT)
/* System call number stored in the low bits of the encoded value. */
#define	SYSTRACE_SYSNUM(x)		((int)(x) & ((1 << SYSTRACE_SHIFT) - 1))
/* Encode an entry probe for system call number `id`. */
#define	SYSTRACE_ENTRY(id)		((1 << SYSTRACE_SHIFT) | (id))
/* Encode a return probe for system call number `id`. */
#define	SYSTRACE_RETURN(id)		(id)

/*
 * NOTE(review): where NSYSCALL expands to something that is not a
 * preprocessor constant (e.g. a variable), #if evaluates it as 0 and this
 * check can never fire -- confirm what NSYSCALL is on each platform.
 */
#if ((1 << SYSTRACE_SHIFT) <= NSYSCALL)
#error 1 << SYSTRACE_SHIFT must exceed number of system calls
#endif
337
338static dev_info_t *systrace_devi;
339static dtrace_provider_id_t systrace_id;
340
341#if !defined (__APPLE__)
342static void
343systrace_init(struct sysent *actual, systrace_sysent_t **interposed)
344{
345	systrace_sysent_t *sysent = *interposed;
346	int i;
347
348	if (sysent == NULL) {
349		*interposed = sysent = kmem_zalloc(sizeof (systrace_sysent_t) *
350		    NSYSCALL, KM_SLEEP);
351	}
352
353	for (i = 0; i < NSYSCALL; i++) {
354		struct sysent *a = &actual[i];
355		systrace_sysent_t *s = &sysent[i];
356
357		if (LOADABLE_SYSCALL(a) && !LOADED_SYSCALL(a))
358			continue;
359
360		if (a->sy_callc == dtrace_systrace_syscall)
361			continue;
362
363#ifdef _SYSCALL32_IMPL
364		if (a->sy_callc == dtrace_systrace_syscall32)
365			continue;
366#endif
367
368		s->stsy_underlying = a->sy_callc;
369	}
370}
371#else
372#define systrace_init _systrace_init /* Avoid name clash with Darwin automagic conf symbol */
373static void
374systrace_init(struct sysent *actual, systrace_sysent_t **interposed)
375{
376
377	systrace_sysent_t *ssysent = *interposed;  /* Avoid sysent shadow warning
378							   from bsd/sys/sysent.h */
379	int i;
380
381	if (ssysent == NULL) {
382		*interposed = ssysent = kmem_zalloc(sizeof (systrace_sysent_t) *
383		    NSYSCALL, KM_SLEEP);
384	}
385
386	for (i = 0; i < NSYSCALL; i++) {
387		struct sysent *a = &actual[i];
388		systrace_sysent_t *s = &ssysent[i];
389
390		if (LOADABLE_SYSCALL(a) && !LOADED_SYSCALL(a))
391			continue;
392
393		if (a->sy_callc == dtrace_systrace_syscall)
394			continue;
395
396#ifdef _SYSCALL32_IMPL
397		if (a->sy_callc == dtrace_systrace_syscall32)
398			continue;
399#endif
400
401		s->stsy_underlying = a->sy_callc;
402		s->stsy_return_type = a->sy_return_type;
403	}
404	lck_mtx_init(&dtrace_systrace_lock, dtrace_lck_grp, dtrace_lck_attr);
405}
406
407#endif /* __APPLE__ */
408
409/*ARGSUSED*/
410static void
411systrace_provide(void *arg, const dtrace_probedesc_t *desc)
412{
413#pragma unused(arg) /* __APPLE__ */
414	int i;
415
416	if (desc != NULL)
417		return;
418
419	systrace_init(sysent, &systrace_sysent);
420#ifdef _SYSCALL32_IMPL
421	systrace_init(sysent32, &systrace_sysent32);
422#endif
423
424	for (i = 0; i < NSYSCALL; i++) {
425		if (systrace_sysent[i].stsy_underlying == NULL)
426			continue;
427
428		if (dtrace_probe_lookup(systrace_id, NULL,
429		    syscallnames[i], "entry") != 0)
430			continue;
431
432		(void) dtrace_probe_create(systrace_id, NULL, syscallnames[i],
433		    "entry", SYSTRACE_ARTIFICIAL_FRAMES,
434		    (void *)((uintptr_t)SYSTRACE_ENTRY(i)));
435		(void) dtrace_probe_create(systrace_id, NULL, syscallnames[i],
436		    "return", SYSTRACE_ARTIFICIAL_FRAMES,
437		    (void *)((uintptr_t)SYSTRACE_RETURN(i)));
438
439		systrace_sysent[i].stsy_entry = DTRACE_IDNONE;
440		systrace_sysent[i].stsy_return = DTRACE_IDNONE;
441#ifdef _SYSCALL32_IMPL
442		systrace_sysent32[i].stsy_entry = DTRACE_IDNONE;
443		systrace_sysent32[i].stsy_return = DTRACE_IDNONE;
444#endif
445	}
446}
447#if defined(__APPLE__)
448#undef systrace_init
449#endif
450
451/*ARGSUSED*/
452static void
453systrace_destroy(void *arg, dtrace_id_t id, void *parg)
454{
455#pragma unused(arg,id) /* __APPLE__ */
456
457	int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg);
458
459#pragma unused(sysnum)  /* __APPLE__ */
460	/*
461	 * There's nothing to do here but assert that we have actually been
462	 * disabled.
463	 */
464	if (SYSTRACE_ISENTRY((uintptr_t)parg)) {
465		ASSERT(systrace_sysent[sysnum].stsy_entry == DTRACE_IDNONE);
466#ifdef _SYSCALL32_IMPL
467		ASSERT(systrace_sysent32[sysnum].stsy_entry == DTRACE_IDNONE);
468#endif
469	} else {
470		ASSERT(systrace_sysent[sysnum].stsy_return == DTRACE_IDNONE);
471#ifdef _SYSCALL32_IMPL
472		ASSERT(systrace_sysent32[sysnum].stsy_return == DTRACE_IDNONE);
473#endif
474	}
475}
476
477/*ARGSUSED*/
478static int
479systrace_enable(void *arg, dtrace_id_t id, void *parg)
480{
481#pragma unused(arg) /* __APPLE__ */
482
483	int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg);
484	int enabled = (systrace_sysent[sysnum].stsy_entry != DTRACE_IDNONE ||
485	    systrace_sysent[sysnum].stsy_return != DTRACE_IDNONE);
486
487	if (SYSTRACE_ISENTRY((uintptr_t)parg)) {
488		systrace_sysent[sysnum].stsy_entry = id;
489#ifdef _SYSCALL32_IMPL
490		systrace_sysent32[sysnum].stsy_entry = id;
491#endif
492	} else {
493		systrace_sysent[sysnum].stsy_return = id;
494#ifdef _SYSCALL32_IMPL
495		systrace_sysent32[sysnum].stsy_return = id;
496#endif
497	}
498
499	if (enabled) {
500		ASSERT(sysent[sysnum].sy_callc == dtrace_systrace_syscall);
501		return(0);
502	}
503#ifdef _SYSCALL32_IMPL
504	(void) casptr(&sysent32[sysnum].sy_callc,
505	    (void *)systrace_sysent32[sysnum].stsy_underlying,
506	    (void *)dtrace_systrace_syscall32);
507#endif
508
509	lck_mtx_lock(&dtrace_systrace_lock);
510	if (sysent[sysnum].sy_callc == systrace_sysent[sysnum].stsy_underlying) {
511		vm_offset_t dss = (vm_offset_t)&dtrace_systrace_syscall;
512		ml_nofault_copy((vm_offset_t)&dss, (vm_offset_t)&sysent[sysnum].sy_callc, sizeof(vm_offset_t));
513	}
514	lck_mtx_unlock(&dtrace_systrace_lock);
515	return (0);
516}
517
518/*ARGSUSED*/
519static void
520systrace_disable(void *arg, dtrace_id_t id, void *parg)
521{
522#pragma unused(arg,id) /* __APPLE__ */
523
524	int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg);
525	int disable = (systrace_sysent[sysnum].stsy_entry == DTRACE_IDNONE ||
526	    systrace_sysent[sysnum].stsy_return == DTRACE_IDNONE);
527
528	if (disable) {
529		lck_mtx_lock(&dtrace_systrace_lock);
530		if (sysent[sysnum].sy_callc == dtrace_systrace_syscall)
531			ml_nofault_copy((vm_offset_t)&systrace_sysent[sysnum].stsy_underlying, (vm_offset_t)&sysent[sysnum].sy_callc, sizeof(systrace_sysent[sysnum].stsy_underlying));
532		lck_mtx_unlock(&dtrace_systrace_lock);
533
534#ifdef _SYSCALL32_IMPL
535		(void) casptr(&sysent32[sysnum].sy_callc,
536		    (void *)dtrace_systrace_syscall32,
537		    (void *)systrace_sysent32[sysnum].stsy_underlying);
538#endif
539	}
540
541	if (SYSTRACE_ISENTRY((uintptr_t)parg)) {
542		systrace_sysent[sysnum].stsy_entry = DTRACE_IDNONE;
543#ifdef _SYSCALL32_IMPL
544		systrace_sysent32[sysnum].stsy_entry = DTRACE_IDNONE;
545#endif
546	} else {
547		systrace_sysent[sysnum].stsy_return = DTRACE_IDNONE;
548#ifdef _SYSCALL32_IMPL
549		systrace_sysent32[sysnum].stsy_return = DTRACE_IDNONE;
550#endif
551	}
552}
553
554static dtrace_pattr_t systrace_attr = {
555{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
556{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
557{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
558{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
559{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
560};
561
562static dtrace_pops_t systrace_pops = {
563	systrace_provide,
564	NULL,
565	systrace_enable,
566	systrace_disable,
567	NULL,
568	NULL,
569	NULL,
570	systrace_getarg,
571	NULL,
572	systrace_destroy
573};
574
575static int
576systrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
577{
578	switch (cmd) {
579	case DDI_ATTACH:
580		break;
581	case DDI_RESUME:
582		return (DDI_SUCCESS);
583	default:
584		return (DDI_FAILURE);
585	}
586
587#if !defined(__APPLE__)
588	systrace_probe = (void (*)())dtrace_probe;
589	membar_enter();
590
591	if (ddi_create_minor_node(devi, "systrace", S_IFCHR, 0,
592	    DDI_PSEUDO, NULL) == DDI_FAILURE ||
593	    dtrace_register("syscall", &systrace_attr, DTRACE_PRIV_USER, NULL,
594	    &systrace_pops, NULL, &systrace_id) != 0) {
595		systrace_probe = systrace_stub;
596		ddi_remove_minor_node(devi, NULL);
597		return (DDI_FAILURE);
598	}
599#else
600	systrace_probe = (void(*))&dtrace_probe;
601	membar_enter();
602
603	if (ddi_create_minor_node(devi, "systrace", S_IFCHR, 0,
604	    DDI_PSEUDO, 0) == DDI_FAILURE ||
605	    dtrace_register("syscall", &systrace_attr, DTRACE_PRIV_USER, NULL,
606	    &systrace_pops, NULL, &systrace_id) != 0) {
607		systrace_probe = systrace_stub;
608		ddi_remove_minor_node(devi, NULL);
609		return (DDI_FAILURE);
610	}
611#endif /* __APPLE__ */
612
613	ddi_report_dev(devi);
614	systrace_devi = devi;
615
616	return (DDI_SUCCESS);
617}
618
619#if !defined(__APPLE__)
620static int
621systrace_detach(dev_info_t *devi, ddi_detach_cmd_t cmd)
622{
623	switch (cmd) {
624	case DDI_DETACH:
625		break;
626	case DDI_SUSPEND:
627		return (DDI_SUCCESS);
628	default:
629		return (DDI_FAILURE);
630	}
631
632	if (dtrace_unregister(systrace_id) != 0)
633		return (DDI_FAILURE);
634
635	ddi_remove_minor_node(devi, NULL);
636	systrace_probe = systrace_stub;
637	return (DDI_SUCCESS);
638}
639
640/*ARGSUSED*/
641static int
642systrace_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
643{
644	int error;
645
646	switch (infocmd) {
647	case DDI_INFO_DEVT2DEVINFO:
648		*result = (void *)systrace_devi;
649		error = DDI_SUCCESS;
650		break;
651	case DDI_INFO_DEVT2INSTANCE:
652		*result = (void *)0;
653		error = DDI_SUCCESS;
654		break;
655	default:
656		error = DDI_FAILURE;
657	}
658	return (error);
659}
660
661/*ARGSUSED*/
662static int
663systrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
664{
665	return (0);
666}
667
668static struct cb_ops systrace_cb_ops = {
669	systrace_open,		/* open */
670	nodev,			/* close */
671	nulldev,		/* strategy */
672	nulldev,		/* print */
673	nodev,			/* dump */
674	nodev,			/* read */
675	nodev,			/* write */
676	nodev,			/* ioctl */
677	nodev,			/* devmap */
678	nodev,			/* mmap */
679	nodev,			/* segmap */
680	nochpoll,		/* poll */
681	ddi_prop_op,		/* cb_prop_op */
682	0,			/* streamtab  */
683	D_NEW | D_MP		/* Driver compatibility flag */
684};
685
686static struct dev_ops systrace_ops = {
687	DEVO_REV,		/* devo_rev, */
688	0,			/* refcnt  */
689	systrace_info,		/* get_dev_info */
690	nulldev,		/* identify */
691	nulldev,		/* probe */
692	systrace_attach,	/* attach */
693	systrace_detach,	/* detach */
694	nodev,			/* reset */
695	&systrace_cb_ops,	/* driver operations */
696	NULL,			/* bus operations */
697	nodev			/* dev power */
698};
699
700/*
701 * Module linkage information for the kernel.
702 */
703static struct modldrv modldrv = {
704	&mod_driverops,		/* module type (this is a pseudo driver) */
705	"System Call Tracing",	/* name of module */
706	&systrace_ops,		/* driver ops */
707};
708
709static struct modlinkage modlinkage = {
710	MODREV_1,
711	(void *)&modldrv,
712	NULL
713};
714
715int
716_init(void)
717{
718	return (mod_install(&modlinkage));
719}
720
721int
722_info(struct modinfo *modinfop)
723{
724	return (mod_info(&modlinkage, modinfop));
725}
726
727int
728_fini(void)
729{
730	return (mod_remove(&modlinkage));
731}
732#else
733typedef kern_return_t (*mach_call_t)(void *);
734
735/* XXX From #include <kern/syscall_sw.h> which may be changed for 64 bit! */
736typedef void    mach_munge_t(const void *, void *);
737
738typedef struct {
739	int			mach_trap_arg_count;
740	kern_return_t		(*mach_trap_function)(void *);
741#if defined(__x86_64__)
742	mach_munge_t		*mach_trap_arg_munge32; /* system call arguments for 32-bit */
743#endif
744	int			mach_trap_u32_words;
745#if	MACH_ASSERT
746	const char*		mach_trap_name;
747#endif /* MACH_ASSERT */
748} mach_trap_t;
749
750extern const mach_trap_t              mach_trap_table[]; /* syscall_sw.h now declares this as const */
751extern int                      mach_trap_count;
752
753extern const char *mach_syscall_name_table[];
754
755/* XXX From osfmk/i386/bsd_i386.c */
756struct mach_call_args {
757        syscall_arg_t arg1;
758        syscall_arg_t arg2;
759        syscall_arg_t arg3;
760        syscall_arg_t arg4;
761        syscall_arg_t arg5;
762        syscall_arg_t arg6;
763        syscall_arg_t arg7;
764        syscall_arg_t arg8;
765        syscall_arg_t arg9;
766};
767
768#undef NSYSCALL
769#define NSYSCALL mach_trap_count
770
771#if ((1 << SYSTRACE_SHIFT) <= NSYSCALL)
772#error 1 << SYSTRACE_SHIFT must exceed number of Mach traps
773#endif
774
775typedef struct machtrace_sysent {
776	dtrace_id_t	stsy_entry;
777	dtrace_id_t	stsy_return;
778	kern_return_t	(*stsy_underlying)(void *);
779	int32_t		stsy_return_type;
780} machtrace_sysent_t;
781
782static machtrace_sysent_t *machtrace_sysent = NULL;
783
784void (*machtrace_probe)(dtrace_id_t, uint64_t, uint64_t,
785    uint64_t, uint64_t, uint64_t);
786
787static uint64_t machtrace_getarg(void *, dtrace_id_t, void *, int, int);
788
789static dev_info_t *machtrace_devi;
790static dtrace_provider_id_t machtrace_id;
791
792static kern_return_t
793dtrace_machtrace_syscall(struct mach_call_args *args)
794{
795	int code;	/* The mach call number */
796
797	machtrace_sysent_t *sy;
798	dtrace_id_t id;
799	kern_return_t rval;
800#if 0 /* XXX */
801	proc_t *p;
802#endif
803	syscall_arg_t *ip = (syscall_arg_t *)args;
804	mach_call_t mach_call;
805
806#if defined (__x86_64__)
807	{
808		pal_register_cache_state(current_thread(), VALID);
809		x86_saved_state_t   *tagged_regs = (x86_saved_state_t *)find_user_regs(current_thread());
810
811		if (is_saved_state64(tagged_regs)) {
812			code = saved_state64(tagged_regs)->rax & SYSCALL_NUMBER_MASK;
813		} else {
814			code = -saved_state32(tagged_regs)->eax;
815		}
816	}
817#else
818#error Unknown Architecture
819#endif
820
821	sy = &machtrace_sysent[code];
822
823	if ((id = sy->stsy_entry) != DTRACE_IDNONE) {
824		uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());
825
826		if (uthread)
827			uthread->t_dtrace_syscall_args = (void *)ip;
828
829		(*machtrace_probe)(id, *ip, *(ip+1), *(ip+2), *(ip+3), *(ip+4));
830
831		if (uthread)
832			uthread->t_dtrace_syscall_args = (void *)0;
833	}
834
835#if 0 /* XXX */
836	/*
837	 * We want to explicitly allow DTrace consumers to stop a process
838	 * before it actually executes the meat of the syscall.
839	 */
840	p = ttoproc(curthread);
841	mutex_enter(&p->p_lock);
842	if (curthread->t_dtrace_stop && !curthread->t_lwp->lwp_nostop) {
843		curthread->t_dtrace_stop = 0;
844		stop(PR_REQUESTED, 0);
845	}
846	mutex_exit(&p->p_lock);
847#endif
848
849	mach_call = (mach_call_t)(*sy->stsy_underlying);
850	rval = mach_call(args);
851
852	if ((id = sy->stsy_return) != DTRACE_IDNONE)
853		(*machtrace_probe)(id, (uint64_t)rval, 0, 0, 0, 0);
854
855	return (rval);
856}
857
858static void
859machtrace_init(const mach_trap_t *actual, machtrace_sysent_t **interposed)
860{
861	machtrace_sysent_t *msysent = *interposed;
862	int i;
863
864	if (msysent == NULL) {
865		*interposed = msysent = kmem_zalloc(sizeof (machtrace_sysent_t) *
866				NSYSCALL, KM_SLEEP);
867	}
868
869	for (i = 0; i < NSYSCALL; i++) {
870		const mach_trap_t *a = &actual[i];
871		machtrace_sysent_t *s = &msysent[i];
872
873		if (LOADABLE_SYSCALL(a) && !LOADED_SYSCALL(a))
874			continue;
875
876		if (a->mach_trap_function == (mach_call_t)(dtrace_machtrace_syscall))
877			continue;
878
879		s->stsy_underlying = a->mach_trap_function;
880	}
881}
882
883/*ARGSUSED*/
884static void
885machtrace_provide(void *arg, const dtrace_probedesc_t *desc)
886{
887#pragma unused(arg) /* __APPLE__ */
888
889	int i;
890
891	if (desc != NULL)
892		return;
893
894	machtrace_init(mach_trap_table, &machtrace_sysent);
895
896	for (i = 0; i < NSYSCALL; i++) {
897
898		if (machtrace_sysent[i].stsy_underlying == NULL)
899			continue;
900
901		if (dtrace_probe_lookup(machtrace_id, NULL,
902					mach_syscall_name_table[i], "entry") != 0)
903			continue;
904
905		(void) dtrace_probe_create(machtrace_id, NULL, mach_syscall_name_table[i],
906					   "entry", MACHTRACE_ARTIFICIAL_FRAMES,
907					   (void *)((uintptr_t)SYSTRACE_ENTRY(i)));
908		(void) dtrace_probe_create(machtrace_id, NULL, mach_syscall_name_table[i],
909					   "return", MACHTRACE_ARTIFICIAL_FRAMES,
910					   (void *)((uintptr_t)SYSTRACE_RETURN(i)));
911
912		machtrace_sysent[i].stsy_entry = DTRACE_IDNONE;
913		machtrace_sysent[i].stsy_return = DTRACE_IDNONE;
914	}
915}
916
917/*ARGSUSED*/
918static void
919machtrace_destroy(void *arg, dtrace_id_t id, void *parg)
920{
921#pragma unused(arg,id) /* __APPLE__ */
922	int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg);
923
924#pragma unused(sysnum) /* __APPLE__ */
925
926	/*
927	 * There's nothing to do here but assert that we have actually been
928	 * disabled.
929	 */
930	if (SYSTRACE_ISENTRY((uintptr_t)parg)) {
931		ASSERT(machtrace_sysent[sysnum].stsy_entry == DTRACE_IDNONE);
932	} else {
933		ASSERT(machtrace_sysent[sysnum].stsy_return == DTRACE_IDNONE);
934	}
935}
936
937/*ARGSUSED*/
938static int
939machtrace_enable(void *arg, dtrace_id_t id, void *parg)
940{
941#pragma unused(arg) /* __APPLE__ */
942
943	int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg);
944	int enabled = (machtrace_sysent[sysnum].stsy_entry != DTRACE_IDNONE ||
945			machtrace_sysent[sysnum].stsy_return != DTRACE_IDNONE);
946
947	if (SYSTRACE_ISENTRY((uintptr_t)parg)) {
948		machtrace_sysent[sysnum].stsy_entry = id;
949	} else {
950		machtrace_sysent[sysnum].stsy_return = id;
951	}
952
953	if (enabled) {
954	    ASSERT(mach_trap_table[sysnum].mach_trap_function == (void *)dtrace_machtrace_syscall);
955	    return(0);
956	}
957
958	lck_mtx_lock(&dtrace_systrace_lock);
959
960	if (mach_trap_table[sysnum].mach_trap_function == machtrace_sysent[sysnum].stsy_underlying) {
961		vm_offset_t dss = (vm_offset_t)&dtrace_machtrace_syscall;
962		ml_nofault_copy((vm_offset_t)&dss, (vm_offset_t)&mach_trap_table[sysnum].mach_trap_function, sizeof(vm_offset_t));
963	}
964
965	lck_mtx_unlock(&dtrace_systrace_lock);
966
967	return(0);
968}
969
970/*ARGSUSED*/
971static void
972machtrace_disable(void *arg, dtrace_id_t id, void *parg)
973{
974#pragma unused(arg,id) /* __APPLE__ */
975
976	int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg);
977	int disable = (machtrace_sysent[sysnum].stsy_entry == DTRACE_IDNONE ||
978			machtrace_sysent[sysnum].stsy_return == DTRACE_IDNONE);
979
980	if (disable) {
981
982		lck_mtx_lock(&dtrace_systrace_lock);
983
984		if (mach_trap_table[sysnum].mach_trap_function == (mach_call_t)dtrace_machtrace_syscall) {
985			ml_nofault_copy((vm_offset_t)&machtrace_sysent[sysnum].stsy_underlying, (vm_offset_t)&mach_trap_table[sysnum].mach_trap_function, sizeof(vm_offset_t));
986		}
987		lck_mtx_unlock(&dtrace_systrace_lock);
988	}
989
990	if (SYSTRACE_ISENTRY((uintptr_t)parg)) {
991		machtrace_sysent[sysnum].stsy_entry = DTRACE_IDNONE;
992	} else {
993		machtrace_sysent[sysnum].stsy_return = DTRACE_IDNONE;
994	}
995}
996
997static dtrace_pattr_t machtrace_attr = {
998{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
999{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
1000{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
1001{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
1002{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
1003};
1004
1005static dtrace_pops_t machtrace_pops = {
1006	machtrace_provide,
1007	NULL,
1008	machtrace_enable,
1009	machtrace_disable,
1010	NULL,
1011	NULL,
1012	NULL,
1013	machtrace_getarg,
1014	NULL,
1015	machtrace_destroy
1016};
1017
1018static int
1019machtrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
1020{
1021	switch (cmd) {
1022		case DDI_ATTACH:
1023			break;
1024		case DDI_RESUME:
1025			return (DDI_SUCCESS);
1026		default:
1027			return (DDI_FAILURE);
1028	}
1029
1030#if !defined(__APPLE__)
1031	machtrace_probe = (void (*)())dtrace_probe;
1032	membar_enter();
1033
1034	if (ddi_create_minor_node(devi, "machtrace", S_IFCHR, 0,
1035				DDI_PSEUDO, NULL) == DDI_FAILURE ||
1036			dtrace_register("mach_trap", &machtrace_attr, DTRACE_PRIV_USER, NULL,
1037				&machtrace_pops, NULL, &machtrace_id) != 0) {
1038		machtrace_probe = systrace_stub;
1039#else
1040	machtrace_probe = dtrace_probe;
1041	membar_enter();
1042
1043	if (ddi_create_minor_node(devi, "machtrace", S_IFCHR, 0,
1044				DDI_PSEUDO, 0) == DDI_FAILURE ||
1045			dtrace_register("mach_trap", &machtrace_attr, DTRACE_PRIV_USER, NULL,
1046				&machtrace_pops, NULL, &machtrace_id) != 0) {
1047                machtrace_probe = (void (*))&systrace_stub;
1048#endif /* __APPLE__ */
1049		ddi_remove_minor_node(devi, NULL);
1050		return (DDI_FAILURE);
1051	}
1052
1053	ddi_report_dev(devi);
1054	machtrace_devi = devi;
1055
1056	return (DDI_SUCCESS);
1057}
1058
1059d_open_t _systrace_open;
1060
1061int _systrace_open(dev_t dev, int flags, int devtype, struct proc *p)
1062{
1063#pragma unused(dev,flags,devtype,p)
1064	return 0;
1065}
1066
1067#define SYSTRACE_MAJOR  -24 /* let the kernel pick the device number */
1068
1069/*
1070 * A struct describing which functions will get invoked for certain
1071 * actions.
1072 */
1073static struct cdevsw systrace_cdevsw =
1074{
1075	_systrace_open,		/* open */
1076	eno_opcl,		/* close */
1077	eno_rdwrt,			/* read */
1078	eno_rdwrt,			/* write */
1079	eno_ioctl,		/* ioctl */
1080	(stop_fcn_t *)nulldev, /* stop */
1081	(reset_fcn_t *)nulldev, /* reset */
1082	NULL,				/* tty's */
1083	eno_select,			/* select */
1084	eno_mmap,			/* mmap */
1085	eno_strat,			/* strategy */
1086	eno_getc,			/* getc */
1087	eno_putc,			/* putc */
1088	0					/* type */
1089};
1090
1091static int gSysTraceInited = 0;
1092
1093void systrace_init( void );
1094
1095void systrace_init( void )
1096{
1097	if (0 == gSysTraceInited) {
1098		int majdevno = cdevsw_add(SYSTRACE_MAJOR, &systrace_cdevsw);
1099
1100		if (majdevno < 0) {
1101			printf("systrace_init: failed to allocate a major number!\n");
1102			gSysTraceInited = 0;
1103			return;
1104		}
1105
1106		systrace_attach( (dev_info_t	*)(uintptr_t)majdevno, DDI_ATTACH );
1107		machtrace_attach( (dev_info_t	*)(uintptr_t)majdevno, DDI_ATTACH );
1108
1109		gSysTraceInited = 1;
1110	} else
1111		panic("systrace_init: called twice!\n");
1112}
1113#undef SYSTRACE_MAJOR
1114#endif /* __APPLE__ */
1115
1116static uint64_t
1117systrace_getarg(void *arg, dtrace_id_t id, void *parg, int argno, int aframes)
1118{
1119#pragma unused(arg,id,parg,aframes)     /* __APPLE__ */
1120	uint64_t val = 0;
1121	syscall_arg_t *stack = (syscall_arg_t *)NULL;
1122
1123	uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());
1124
1125	if (uthread)
1126		stack = (syscall_arg_t *)uthread->t_dtrace_syscall_args;
1127
1128	if (!stack)
1129		return(0);
1130
1131	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
1132	/* dtrace_probe arguments arg0 .. arg4 are 64bits wide */
1133	val = (uint64_t)*(stack+argno);
1134	DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
1135	return (val);
1136}
1137
1138
1139static uint64_t
1140machtrace_getarg(void *arg, dtrace_id_t id, void *parg, int argno, int aframes)
1141{
1142#pragma unused(arg,id,parg,aframes)     /* __APPLE__ */
1143	uint64_t val = 0;
1144	syscall_arg_t *stack = (syscall_arg_t *)NULL;
1145
1146	uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());
1147
1148	if (uthread)
1149		stack = (syscall_arg_t *)uthread->t_dtrace_syscall_args;
1150
1151	if (!stack)
1152		return(0);
1153
1154	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
1155	/* dtrace_probe arguments arg0 .. arg4 are 64bits wide */
1156	val = (uint64_t)*(stack+argno);
1157	DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
1158	return (val);
1159}
1160
1161