1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26/* #pragma ident	"@(#)systrace.c	1.6	06/09/19 SMI" */
27
28#if !defined(__APPLE__)
29#include <sys/dtrace.h>
30#include <sys/systrace.h>
31#include <sys/stat.h>
32#include <sys/systm.h>
33#include <sys/conf.h>
34#include <sys/ddi.h>
35#include <sys/sunddi.h>
36#include <sys/atomic.h>
37#define	SYSTRACE_ARTIFICIAL_FRAMES	1
38#else
39
40#ifdef KERNEL
41#ifndef _KERNEL
42#define _KERNEL /* Solaris vs. Darwin */
43#endif
44#endif
45
46#define MACH__POSIX_C_SOURCE_PRIVATE 1 /* pulls in suitable savearea from mach/ppc/thread_status.h */
47#include <kern/thread.h>
48#include <mach/thread_status.h>
49/* XXX All of these should really be derived from syscall_sw.h */
50#if defined(__i386__) || defined (__x86_64__)
51#define SYSCALL_CLASS_SHIFT 24
52#define SYSCALL_CLASS_MASK  (0xFF << SYSCALL_CLASS_SHIFT)
53#define SYSCALL_NUMBER_MASK (~SYSCALL_CLASS_MASK)
54#define I386_SYSCALL_NUMBER_MASK (0xFFFF)
55
56typedef x86_saved_state_t savearea_t;
57#endif
58
59#include <sys/param.h>
60#include <sys/systm.h>
61#include <sys/proc.h>
62#include <sys/errno.h>
63#include <sys/ioctl.h>
64#include <sys/conf.h>
65#include <sys/fcntl.h>
66#include <miscfs/devfs/devfs.h>
67
68#include <sys/dtrace.h>
69#include <sys/dtrace_impl.h>
70#include "systrace.h"
71#include <sys/stat.h>
72#include <sys/systm.h>
73#include <sys/conf.h>
74#include <sys/user.h>
75
76#include <machine/pal_routines.h>
77
78#if defined(__i386__) || defined (__x86_64__)
79#define	SYSTRACE_ARTIFICIAL_FRAMES	2
80#define MACHTRACE_ARTIFICIAL_FRAMES 3
81#else
82#error Unknown Architecture
83#endif
84
85#include <sys/sysent.h>
86#define sy_callc sy_call /* Map Solaris slot name to Darwin's */
87#define NSYSCALL nsysent /* and is less than 500 or so */
88
89extern const char *syscallnames[];
90
91#include <sys/dtrace_glue.h>
92#define casptr dtrace_casptr
93#define membar_enter dtrace_membar_producer
94
95#define LOADABLE_SYSCALL(a) 0 /* Not pertinent to Darwin. */
96#define LOADED_SYSCALL(a) 1 /* Not pertinent to Darwin. */
97
98extern lck_attr_t* dtrace_lck_attr;
99extern lck_grp_t* dtrace_lck_grp;
100static lck_mtx_t	dtrace_systrace_lock;		/* probe state lock */
101
102systrace_sysent_t *systrace_sysent = NULL;
103void (*systrace_probe)(dtrace_id_t, uint64_t, uint64_t, uint64_t, uint64_t, uint64_t);
104
105static uint64_t systrace_getarg(void *, dtrace_id_t, void *, int, int);
106
/*
 * No-op probe routine with the same signature as the systrace_probe
 * function pointer. The attach routines in this file point their probe
 * pointer here when provider registration fails.
 */
void
systrace_stub(dtrace_id_t id, uint64_t arg0, uint64_t arg1,
    uint64_t arg2, uint64_t arg3, uint64_t arg4)
{
#pragma unused(id,arg0,arg1,arg2,arg3,arg4)
}
113
/*
 * Interposed BSD system call handler. systrace_enable() stores the address
 * of this routine in sysent[n].sy_callc while a probe of system call n is
 * enabled. The routine:
 *
 *   1. recovers the system call number from the saved user register state;
 *   2. fires the "entry" probe, if enabled, with the first five arguments;
 *   3. calls the original handler saved in stsy_underlying;
 *   4. fires the "return" probe, if enabled, with the decoded return value.
 *
 * pp, uap and rv are passed through unchanged to the underlying handler and
 * its result is returned to the caller.
 */
int32_t
dtrace_systrace_syscall(struct proc *pp, void *uap, int *rv)
{
	boolean_t           flavor;
	unsigned short      code;

	systrace_sysent_t *sy;
	dtrace_id_t id;
	int32_t rval;
#if 0 /* XXX */
	proc_t *p;
#endif
	syscall_arg_t *ip = (syscall_arg_t *)uap;

#if defined(__i386__) || defined (__x86_64__)
#pragma unused(flavor)
	{
		pal_register_cache_state(current_thread(), VALID);
		x86_saved_state_t   *tagged_regs = (x86_saved_state_t *)find_user_regs(current_thread());

		if (is_saved_state64(tagged_regs)) {
			/* 64-bit caller: number is in rax, with the class bits masked off. */
			x86_saved_state64_t *regs = saved_state64(tagged_regs);
			code = regs->rax & SYSCALL_NUMBER_MASK;
			/*
			 * Check for indirect system call... system call number
			 * passed as 'arg0'
			 */
			if (code == 0) {
				code = regs->rdi;
			}
		} else {
			/* 32-bit caller: number is in the low 16 bits of eax. */
			code = saved_state32(tagged_regs)->eax & I386_SYSCALL_NUMBER_MASK;

			if (code == 0) {
				/* Indirect call: fetch the real number from the user stack. */
				vm_offset_t params = (vm_offset_t) (saved_state32(tagged_regs)->uesp + sizeof (int));
				code = fuword(params);
			}
		}
	}
#else
#error Unknown Architecture
#endif

	// Bounds "check" the value of code a la unix_syscall
	sy = (code >= NUM_SYSENT) ? &systrace_sysent[63] : &systrace_sysent[code];

	if ((id = sy->stsy_entry) != DTRACE_IDNONE) {
		uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());
		/*
		 * Publish the argument vector for the duration of the probe so
		 * systrace_getarg() can fetch arguments beyond the five passed here.
		 */
		if (uthread)
			uthread->t_dtrace_syscall_args = (void *)ip;

		if (ip)
			(*systrace_probe)(id, *ip, *(ip+1), *(ip+2), *(ip+3), *(ip+4));
		else
			(*systrace_probe)(id, 0, 0, 0, 0, 0);

		if (uthread)
			uthread->t_dtrace_syscall_args = (void *)0;
	}

#if 0 /* XXX */
	/*
	 * We want to explicitly allow DTrace consumers to stop a process
	 * before it actually executes the meat of the syscall.
	 */
	p = ttoproc(curthread);
	mutex_enter(&p->p_lock);
	if (curthread->t_dtrace_stop && !curthread->t_lwp->lwp_nostop) {
		curthread->t_dtrace_stop = 0;
		stop(PR_REQUESTED, 0);
	}
	mutex_exit(&p->p_lock);
#endif

	/* Run the real system call. */
	rval = (*sy->stsy_underlying)(pp, uap, rv);

	if ((id = sy->stsy_return) != DTRACE_IDNONE) {
		uint64_t munged_rv0, munged_rv1;
    	uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());

		if (uthread)
			uthread->t_dtrace_errno = rval; /* Establish t_dtrace_errno now in case this enabling refers to it. */

		/*
	 	 * "Decode" rv for use in the call to dtrace_probe()
	 	 */
		if (rval == ERESTART) {
			munged_rv0 = -1LL; /* System call will be reissued in user mode. Make DTrace report a -1 return. */
			munged_rv1 = -1LL;
		} else if (rval != EJUSTRETURN) {
			if (rval) {
				munged_rv0 = -1LL; /* Mimic what libc will do. */
				munged_rv1 = -1LL;
			} else {
				/* Success: widen rv[] according to the declared return type. */
				switch (sy->stsy_return_type) {
				case _SYSCALL_RET_INT_T:
					munged_rv0 = rv[0];
					munged_rv1 = rv[1];
					break;
				case _SYSCALL_RET_UINT_T:
					munged_rv0 = ((u_int)rv[0]);
					munged_rv1 = ((u_int)rv[1]);
					break;
				case _SYSCALL_RET_OFF_T:
				case _SYSCALL_RET_UINT64_T:
					munged_rv0 = *(u_int64_t *)rv;
					munged_rv1 = 0LL;
					break;
				case _SYSCALL_RET_ADDR_T:
				case _SYSCALL_RET_SIZE_T:
				case _SYSCALL_RET_SSIZE_T:
					munged_rv0 = *(user_addr_t *)rv;
					munged_rv1 = 0LL;
					break;
				case _SYSCALL_RET_NONE:
					munged_rv0 = 0LL;
					munged_rv1 = 0LL;
					break;
				default:
					munged_rv0 = 0LL;
					munged_rv1 = 0LL;
					break;
				}
			}
		} else {
			/* EJUSTRETURN: report zeros. */
			munged_rv0 = 0LL;
			munged_rv1 = 0LL;
		}

		/*
		 * <http://mail.opensolaris.org/pipermail/dtrace-discuss/2007-January/003276.html> says:
		 *
		 * "This is a bit of an historical artifact. At first, the syscall provider just
		 * had its return value in arg0, and the fbt and pid providers had their return
		 * values in arg1 (so that we could use arg0 for the offset of the return site).
		 *
		 * We inevitably started writing scripts where we wanted to see the return
		 * values from probes in all three providers, and we made this script easier
		 * to write by replicating the syscall return values in arg1 to match fbt and
		 * pid. We debated briefly about removing the return value from arg0, but
		 * decided that it would be less confusing to have the same data in two places
		 * than to have some non-helpful, non-intuitive value in arg0.
		 *
		 * This change was made 4/23/2003 according to the DTrace project's putback log."
		 */
		(*systrace_probe)(id, munged_rv0, munged_rv0, munged_rv1, (uint64_t)rval, 0);
	}

	return (rval);
}
264
265void
266dtrace_systrace_syscall_return(unsigned short code, int rval, int *rv)
267{
268	systrace_sysent_t *sy;
269	dtrace_id_t id;
270
271	// Bounds "check" the value of code a la unix_syscall_return
272	sy = (code >= NUM_SYSENT) ? &systrace_sysent[63] : &systrace_sysent[code];
273
274	if ((id = sy->stsy_return) != DTRACE_IDNONE) {
275		uint64_t munged_rv0, munged_rv1;
276    	uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());
277
278		if (uthread)
279			uthread->t_dtrace_errno = rval; /* Establish t_dtrace_errno now in case this enabling refers to it. */
280
281		/*
282	 	 * "Decode" rv for use in the call to dtrace_probe()
283	 	 */
284		if (rval == ERESTART) {
285			munged_rv0 = -1LL; /* System call will be reissued in user mode. Make DTrace report a -1 return. */
286			munged_rv1 = -1LL;
287		} else if (rval != EJUSTRETURN) {
288			if (rval) {
289				munged_rv0 = -1LL; /* Mimic what libc will do. */
290				munged_rv1 = -1LL;
291			} else {
292				switch (sy->stsy_return_type) {
293				case _SYSCALL_RET_INT_T:
294					munged_rv0 = rv[0];
295					munged_rv1 = rv[1];
296					break;
297				case _SYSCALL_RET_UINT_T:
298					munged_rv0 = ((u_int)rv[0]);
299					munged_rv1 = ((u_int)rv[1]);
300					break;
301				case _SYSCALL_RET_OFF_T:
302				case _SYSCALL_RET_UINT64_T:
303					munged_rv0 = *(u_int64_t *)rv;
304					munged_rv1 = 0LL;
305					break;
306				case _SYSCALL_RET_ADDR_T:
307				case _SYSCALL_RET_SIZE_T:
308				case _SYSCALL_RET_SSIZE_T:
309					munged_rv0 = *(user_addr_t *)rv;
310					munged_rv1 = 0LL;
311					break;
312				case _SYSCALL_RET_NONE:
313					munged_rv0 = 0LL;
314					munged_rv1 = 0LL;
315					break;
316				default:
317					munged_rv0 = 0LL;
318					munged_rv1 = 0LL;
319					break;
320				}
321			}
322		} else {
323			munged_rv0 = 0LL;
324			munged_rv1 = 0LL;
325		}
326
327		(*systrace_probe)(id, munged_rv0, munged_rv0, munged_rv1, (uint64_t)rval, 0);
328	}
329}
330#endif /* __APPLE__ */
331
332#define	SYSTRACE_SHIFT			16
333#define	SYSTRACE_ISENTRY(x)		((int)(x) >> SYSTRACE_SHIFT)
334#define	SYSTRACE_SYSNUM(x)		((int)(x) & ((1 << SYSTRACE_SHIFT) - 1))
335#define	SYSTRACE_ENTRY(id)		((1 << SYSTRACE_SHIFT) | (id))
336#define	SYSTRACE_RETURN(id)		(id)
337
338#if ((1 << SYSTRACE_SHIFT) <= NSYSCALL)
339#error 1 << SYSTRACE_SHIFT must exceed number of system calls
340#endif
341
342static dev_info_t *systrace_devi;
343static dtrace_provider_id_t systrace_id;
344
345#if !defined (__APPLE__)
346static void
347systrace_init(struct sysent *actual, systrace_sysent_t **interposed)
348{
349	systrace_sysent_t *sysent = *interposed;
350	int i;
351
352	if (sysent == NULL) {
353		*interposed = sysent = kmem_zalloc(sizeof (systrace_sysent_t) *
354		    NSYSCALL, KM_SLEEP);
355	}
356
357	for (i = 0; i < NSYSCALL; i++) {
358		struct sysent *a = &actual[i];
359		systrace_sysent_t *s = &sysent[i];
360
361		if (LOADABLE_SYSCALL(a) && !LOADED_SYSCALL(a))
362			continue;
363
364		if (a->sy_callc == dtrace_systrace_syscall)
365			continue;
366
367#ifdef _SYSCALL32_IMPL
368		if (a->sy_callc == dtrace_systrace_syscall32)
369			continue;
370#endif
371
372		s->stsy_underlying = a->sy_callc;
373	}
374}
375#else
376#define systrace_init _systrace_init /* Avoid name clash with Darwin automagic conf symbol */
377static void
378systrace_init(struct sysent *actual, systrace_sysent_t **interposed)
379{
380
381	systrace_sysent_t *ssysent = *interposed;  /* Avoid sysent shadow warning
382							   from bsd/sys/sysent.h */
383	int i;
384
385	if (ssysent == NULL) {
386		*interposed = ssysent = kmem_zalloc(sizeof (systrace_sysent_t) *
387		    NSYSCALL, KM_SLEEP);
388	}
389
390	for (i = 0; i < NSYSCALL; i++) {
391		struct sysent *a = &actual[i];
392		systrace_sysent_t *s = &ssysent[i];
393
394		if (LOADABLE_SYSCALL(a) && !LOADED_SYSCALL(a))
395			continue;
396
397		if (a->sy_callc == dtrace_systrace_syscall)
398			continue;
399
400#ifdef _SYSCALL32_IMPL
401		if (a->sy_callc == dtrace_systrace_syscall32)
402			continue;
403#endif
404
405		s->stsy_underlying = a->sy_callc;
406		s->stsy_return_type = a->sy_return_type;
407	}
408	lck_mtx_init(&dtrace_systrace_lock, dtrace_lck_grp, dtrace_lck_attr);
409}
410
411#endif /* __APPLE__ */
412
413/*ARGSUSED*/
414static void
415systrace_provide(void *arg, const dtrace_probedesc_t *desc)
416{
417#pragma unused(arg) /* __APPLE__ */
418	int i;
419
420	if (desc != NULL)
421		return;
422
423	systrace_init(sysent, &systrace_sysent);
424#ifdef _SYSCALL32_IMPL
425	systrace_init(sysent32, &systrace_sysent32);
426#endif
427
428	for (i = 0; i < NSYSCALL; i++) {
429		if (systrace_sysent[i].stsy_underlying == NULL)
430			continue;
431
432		if (dtrace_probe_lookup(systrace_id, NULL,
433		    syscallnames[i], "entry") != 0)
434			continue;
435
436		(void) dtrace_probe_create(systrace_id, NULL, syscallnames[i],
437		    "entry", SYSTRACE_ARTIFICIAL_FRAMES,
438		    (void *)((uintptr_t)SYSTRACE_ENTRY(i)));
439		(void) dtrace_probe_create(systrace_id, NULL, syscallnames[i],
440		    "return", SYSTRACE_ARTIFICIAL_FRAMES,
441		    (void *)((uintptr_t)SYSTRACE_RETURN(i)));
442
443		systrace_sysent[i].stsy_entry = DTRACE_IDNONE;
444		systrace_sysent[i].stsy_return = DTRACE_IDNONE;
445#ifdef _SYSCALL32_IMPL
446		systrace_sysent32[i].stsy_entry = DTRACE_IDNONE;
447		systrace_sysent32[i].stsy_return = DTRACE_IDNONE;
448#endif
449	}
450}
451#if defined(__APPLE__)
452#undef systrace_init
453#endif
454
455/*ARGSUSED*/
456static void
457systrace_destroy(void *arg, dtrace_id_t id, void *parg)
458{
459#pragma unused(arg,id) /* __APPLE__ */
460
461	int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg);
462
463#pragma unused(sysnum)  /* __APPLE__ */
464	/*
465	 * There's nothing to do here but assert that we have actually been
466	 * disabled.
467	 */
468	if (SYSTRACE_ISENTRY((uintptr_t)parg)) {
469		ASSERT(systrace_sysent[sysnum].stsy_entry == DTRACE_IDNONE);
470#ifdef _SYSCALL32_IMPL
471		ASSERT(systrace_sysent32[sysnum].stsy_entry == DTRACE_IDNONE);
472#endif
473	} else {
474		ASSERT(systrace_sysent[sysnum].stsy_return == DTRACE_IDNONE);
475#ifdef _SYSCALL32_IMPL
476		ASSERT(systrace_sysent32[sysnum].stsy_return == DTRACE_IDNONE);
477#endif
478	}
479}
480
/*
 * Provider "enable" callback for the syscall provider. Records probe id `id`
 * for the entry or return probe encoded in parg and, when this is the first
 * probe enabled for the system call, redirects sysent[sysnum].sy_callc to
 * dtrace_systrace_syscall(). Always returns 0.
 */
/*ARGSUSED*/
static int
systrace_enable(void *arg, dtrace_id_t id, void *parg)
{
#pragma unused(arg) /* __APPLE__ */

	int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg);
	/*
	 * Sampled before the id is stored below: non-zero means another probe
	 * of this system call was already enabled.
	 */
	int enabled = (systrace_sysent[sysnum].stsy_entry != DTRACE_IDNONE ||
	    systrace_sysent[sysnum].stsy_return != DTRACE_IDNONE);

	if (SYSTRACE_ISENTRY((uintptr_t)parg)) {
		systrace_sysent[sysnum].stsy_entry = id;
#ifdef _SYSCALL32_IMPL
		systrace_sysent32[sysnum].stsy_entry = id;
#endif
	} else {
		systrace_sysent[sysnum].stsy_return = id;
#ifdef _SYSCALL32_IMPL
		systrace_sysent32[sysnum].stsy_return = id;
#endif
	}

	if (enabled) {
		/* The table slot is expected to be redirected already. */
		ASSERT(sysent[sysnum].sy_callc == dtrace_systrace_syscall);
		return(0);
	}
#ifdef _SYSCALL32_IMPL
	(void) casptr(&sysent32[sysnum].sy_callc,
	    (void *)systrace_sysent32[sysnum].stsy_underlying,
	    (void *)dtrace_systrace_syscall32);
#endif

	/*
	 * Swap in the interposer only if the slot still holds the handler
	 * recorded by systrace_init().
	 * NOTE(review): ml_nofault_copy() is presumably used instead of a
	 * plain store because sysent may not be directly writable -- confirm.
	 */
	lck_mtx_lock(&dtrace_systrace_lock);
	if (sysent[sysnum].sy_callc == systrace_sysent[sysnum].stsy_underlying) {
		vm_offset_t dss = (vm_offset_t)&dtrace_systrace_syscall;
		ml_nofault_copy((vm_offset_t)&dss, (vm_offset_t)&sysent[sysnum].sy_callc, sizeof(vm_offset_t));
	}
	lck_mtx_unlock(&dtrace_systrace_lock);
	return (0);
}
521
/*
 * Provider "disable" callback for the syscall provider. Clears the probe id
 * of the entry or return probe encoded in parg. If the other probe of the
 * same system call is not enabled either, the original handler is first put
 * back into sysent[sysnum].sy_callc.
 */
/*ARGSUSED*/
static void
systrace_disable(void *arg, dtrace_id_t id, void *parg)
{
#pragma unused(arg,id) /* __APPLE__ */

	int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg);
	/*
	 * Sampled before the id is cleared below: if either id is already
	 * DTRACE_IDNONE, no probe of this system call remains enabled once
	 * this one is disabled.
	 */
	int disable = (systrace_sysent[sysnum].stsy_entry == DTRACE_IDNONE ||
	    systrace_sysent[sysnum].stsy_return == DTRACE_IDNONE);

	if (disable) {
		/* Restore the saved handler only if the slot is still redirected. */
		lck_mtx_lock(&dtrace_systrace_lock);
		if (sysent[sysnum].sy_callc == dtrace_systrace_syscall)
			ml_nofault_copy((vm_offset_t)&systrace_sysent[sysnum].stsy_underlying, (vm_offset_t)&sysent[sysnum].sy_callc, sizeof(systrace_sysent[sysnum].stsy_underlying));
		lck_mtx_unlock(&dtrace_systrace_lock);

#ifdef _SYSCALL32_IMPL
		(void) casptr(&sysent32[sysnum].sy_callc,
		    (void *)dtrace_systrace_syscall32,
		    (void *)systrace_sysent32[sysnum].stsy_underlying);
#endif
	}

	if (SYSTRACE_ISENTRY((uintptr_t)parg)) {
		systrace_sysent[sysnum].stsy_entry = DTRACE_IDNONE;
#ifdef _SYSCALL32_IMPL
		systrace_sysent32[sysnum].stsy_entry = DTRACE_IDNONE;
#endif
	} else {
		systrace_sysent[sysnum].stsy_return = DTRACE_IDNONE;
#ifdef _SYSCALL32_IMPL
		systrace_sysent32[sysnum].stsy_return = DTRACE_IDNONE;
#endif
	}
}
557
/*
 * Stability attributes passed to dtrace_register() for the "syscall"
 * provider.
 * NOTE(review): the five rows presumably follow dtrace_pattr_t's
 * provider/module/function/name/arguments order -- confirm in <sys/dtrace.h>.
 */
static dtrace_pattr_t systrace_attr = {
{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
};
565
/*
 * Provider operations vector passed to dtrace_register() for the "syscall"
 * provider. NULL slots are operations this provider does not supply.
 * NOTE(review): slot meaning is positional and follows dtrace_pops_t's field
 * order -- confirm in <sys/dtrace.h> before adding or moving entries.
 */
static dtrace_pops_t systrace_pops = {
	systrace_provide,
	NULL,
	systrace_enable,
	systrace_disable,
	NULL,
	NULL,
	NULL,
	systrace_getarg,
	NULL,
	systrace_destroy
};
578
/*
 * Attach the syscall provider.
 *
 * For DDI_ATTACH: points systrace_probe at dtrace_probe, creates the
 * "systrace" minor node and registers the "syscall" provider. If either
 * step fails, systrace_probe is pointed back at systrace_stub, the minor
 * node is removed and DDI_FAILURE is returned. On success the device is
 * reported, devi is remembered in systrace_devi and DDI_SUCCESS is returned.
 *
 * DDI_RESUME returns DDI_SUCCESS without doing anything; any other command
 * returns DDI_FAILURE.
 */
static int
systrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
{
	switch (cmd) {
	case DDI_ATTACH:
		break;
	case DDI_RESUME:
		return (DDI_SUCCESS);
	default:
		return (DDI_FAILURE);
	}

#if !defined(__APPLE__)
	systrace_probe = (void (*)())dtrace_probe;
	membar_enter();

	if (ddi_create_minor_node(devi, "systrace", S_IFCHR, 0,
	    DDI_PSEUDO, NULL) == DDI_FAILURE ||
	    dtrace_register("syscall", &systrace_attr, DTRACE_PRIV_USER, NULL,
	    &systrace_pops, NULL, &systrace_id) != 0) {
		systrace_probe = systrace_stub;
		ddi_remove_minor_node(devi, NULL);
		return (DDI_FAILURE);
	}
#else
	/* Publish the real probe routine before the provider is registered. */
	systrace_probe = (void(*))&dtrace_probe;
	membar_enter();

	if (ddi_create_minor_node(devi, "systrace", S_IFCHR, 0,
	    DDI_PSEUDO, 0) == DDI_FAILURE ||
	    dtrace_register("syscall", &systrace_attr, DTRACE_PRIV_USER, NULL,
	    &systrace_pops, NULL, &systrace_id) != 0) {
		systrace_probe = systrace_stub;
		ddi_remove_minor_node(devi, NULL);
		return (DDI_FAILURE);
	}
#endif /* __APPLE__ */

	ddi_report_dev(devi);
	systrace_devi = devi;

	return (DDI_SUCCESS);
}
622
623#if !defined(__APPLE__)
624static int
625systrace_detach(dev_info_t *devi, ddi_detach_cmd_t cmd)
626{
627	switch (cmd) {
628	case DDI_DETACH:
629		break;
630	case DDI_SUSPEND:
631		return (DDI_SUCCESS);
632	default:
633		return (DDI_FAILURE);
634	}
635
636	if (dtrace_unregister(systrace_id) != 0)
637		return (DDI_FAILURE);
638
639	ddi_remove_minor_node(devi, NULL);
640	systrace_probe = systrace_stub;
641	return (DDI_SUCCESS);
642}
643
644/*ARGSUSED*/
645static int
646systrace_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
647{
648	int error;
649
650	switch (infocmd) {
651	case DDI_INFO_DEVT2DEVINFO:
652		*result = (void *)systrace_devi;
653		error = DDI_SUCCESS;
654		break;
655	case DDI_INFO_DEVT2INSTANCE:
656		*result = (void *)0;
657		error = DDI_SUCCESS;
658		break;
659	default:
660		error = DDI_FAILURE;
661	}
662	return (error);
663}
664
/*
 * open entry point of the systrace pseudo device: always succeeds and
 * ignores all of its arguments.
 */
/*ARGSUSED*/
static int
systrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
{
	return (0);
}
671
/*
 * Character/block entry points of the systrace pseudo device. Only open is
 * backed by a routine of this file (systrace_open); the remaining slots use
 * the generic nodev/nulldev/nochpoll/ddi_prop_op routines.
 */
static struct cb_ops systrace_cb_ops = {
	systrace_open,		/* open */
	nodev,			/* close */
	nulldev,		/* strategy */
	nulldev,		/* print */
	nodev,			/* dump */
	nodev,			/* read */
	nodev,			/* write */
	nodev,			/* ioctl */
	nodev,			/* devmap */
	nodev,			/* mmap */
	nodev,			/* segmap */
	nochpoll,		/* poll */
	ddi_prop_op,		/* cb_prop_op */
	0,			/* streamtab  */
	D_NEW | D_MP		/* Driver compatibility flag */
};
689
/*
 * Device operations of the systrace pseudo driver: wires up
 * systrace_info, systrace_attach and systrace_detach and points at
 * systrace_cb_ops for the character device entry points.
 */
static struct dev_ops systrace_ops = {
	DEVO_REV,		/* devo_rev, */
	0,			/* refcnt  */
	systrace_info,		/* get_dev_info */
	nulldev,		/* identify */
	nulldev,		/* probe */
	systrace_attach,	/* attach */
	systrace_detach,	/* detach */
	nodev,			/* reset */
	&systrace_cb_ops,	/* driver operations */
	NULL,			/* bus operations */
	nodev			/* dev power */
};
703
/*
 * Module linkage information for the kernel: describes this module as a
 * driver named "System Call Tracing" whose operations are systrace_ops.
 */
static struct modldrv modldrv = {
	&mod_driverops,		/* module type (this is a pseudo driver) */
	"System Call Tracing",	/* name of module */
	&systrace_ops,		/* driver ops */
};
712
/*
 * Linkage handle passed to mod_install(), mod_info() and mod_remove() by
 * _init(), _info() and _fini(); its only linkage element is modldrv.
 */
static struct modlinkage modlinkage = {
	MODREV_1,
	(void *)&modldrv,
	NULL
};
718
719int
720_init(void)
721{
722	return (mod_install(&modlinkage));
723}
724
725int
726_info(struct modinfo *modinfop)
727{
728	return (mod_info(&modlinkage, modinfop));
729}
730
731int
732_fini(void)
733{
734	return (mod_remove(&modlinkage));
735}
736#else
737typedef kern_return_t (*mach_call_t)(void *);
738
739/* XXX From #include <kern/syscall_sw.h> which may be changed for 64 bit! */
740typedef void    mach_munge_t(const void *, void *);
741
/*
 * Local declaration of one Mach trap table entry, matching the extern
 * mach_trap_table[] declared below. Per the XXX note above, this mirrors
 * <kern/syscall_sw.h> and must be kept in step with it.
 */
typedef struct {
	int			mach_trap_arg_count;	/* argument count of the trap */
	kern_return_t		(*mach_trap_function)(void *);	/* handler; patched by machtrace_enable()/machtrace_disable() */
#if 0 /* no active architectures use mungers for mach traps */
	mach_munge_t		*mach_trap_arg_munge32; /* system call arguments for 32-bit */
	mach_munge_t		*mach_trap_arg_munge64; /* system call arguments for 64-bit */
#endif
#if	MACH_ASSERT
	const char*		mach_trap_name;
#endif /* MACH_ASSERT */
} mach_trap_t;
753
754extern mach_trap_t              mach_trap_table[];
755extern int                      mach_trap_count;
756
757extern const char *mach_syscall_name_table[];
758
/*
 * XXX From osfmk/i386/bsd_i386.c
 *
 * Argument block handed to a Mach trap handler: nine consecutive
 * syscall_arg_t slots. dtrace_machtrace_syscall() treats a pointer to this
 * structure as an array of syscall_arg_t.
 */
struct mach_call_args {
        syscall_arg_t arg1;
        syscall_arg_t arg2;
        syscall_arg_t arg3;
        syscall_arg_t arg4;
        syscall_arg_t arg5;
        syscall_arg_t arg6;
        syscall_arg_t arg7;
        syscall_arg_t arg8;
        syscall_arg_t arg9;
};
771
772#undef NSYSCALL
773#define NSYSCALL mach_trap_count
774
775#if ((1 << SYSTRACE_SHIFT) <= NSYSCALL)
776#error 1 << SYSTRACE_SHIFT must exceed number of Mach traps
777#endif
778
/*
 * Shadow entry kept per Mach trap by the mach_trap provider.
 */
typedef struct machtrace_sysent {
	dtrace_id_t	stsy_entry;	/* id of the enabled "entry" probe, or DTRACE_IDNONE */
	dtrace_id_t	stsy_return;	/* id of the enabled "return" probe, or DTRACE_IDNONE */
	kern_return_t	(*stsy_underlying)(void *);	/* original trap handler saved by machtrace_init() */
	int32_t		stsy_return_type;	/* not assigned by machtrace_init(); stays zero from kmem_zalloc() */
} machtrace_sysent_t;
785
786static machtrace_sysent_t *machtrace_sysent = NULL;
787
788void (*machtrace_probe)(dtrace_id_t, uint64_t, uint64_t,
789    uint64_t, uint64_t, uint64_t);
790
791static uint64_t machtrace_getarg(void *, dtrace_id_t, void *, int, int);
792
793static dev_info_t *machtrace_devi;
794static dtrace_provider_id_t machtrace_id;
795
/*
 * Interposed Mach trap handler. machtrace_enable() stores the address of
 * this routine in mach_trap_table[n].mach_trap_function while a probe of
 * trap n is enabled. The routine recovers the trap number from the saved
 * user register state, fires the "entry" probe (if enabled) with the first
 * five arguments, calls the original handler saved in stsy_underlying, fires
 * the "return" probe (if enabled) with the handler's result in arg0, and
 * returns that result.
 */
static kern_return_t
dtrace_machtrace_syscall(struct mach_call_args *args)
{
	boolean_t           flavor;
	unsigned short      code;

	machtrace_sysent_t *sy;
	dtrace_id_t id;
	kern_return_t rval;
#if 0 /* XXX */
	proc_t *p;
#endif
	syscall_arg_t *ip = (syscall_arg_t *)args;
	mach_call_t mach_call;

#if defined(__i386__) || defined (__x86_64__)
#pragma unused(flavor)
	{
		pal_register_cache_state(current_thread(), VALID);
		x86_saved_state_t   *tagged_regs = (x86_saved_state_t *)find_user_regs(current_thread());

		if (is_saved_state64(tagged_regs)) {
			/* 64-bit caller: number is in rax, with the class bits masked off. */
			code = saved_state64(tagged_regs)->rax & SYSCALL_NUMBER_MASK;
		} else {
			/* 32-bit caller: eax holds the negated trap number. */
			code = -saved_state32(tagged_regs)->eax;
		}
	}
#else
#error Unknown Architecture
#endif

	/*
	 * NOTE(review): code is not range-checked here; this presumably relies
	 * on the trap dispatcher having validated it before calling through
	 * mach_trap_table -- confirm.
	 */
	sy = &machtrace_sysent[code];

	if ((id = sy->stsy_entry) != DTRACE_IDNONE) {
		uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());

		/*
		 * Publish the argument vector for the duration of the probe so
		 * machtrace_getarg() can fetch arguments beyond the five passed here.
		 */
		if (uthread)
			uthread->t_dtrace_syscall_args = (void *)ip;

		(*machtrace_probe)(id, *ip, *(ip+1), *(ip+2), *(ip+3), *(ip+4));

		if (uthread)
			uthread->t_dtrace_syscall_args = (void *)0;
	}

#if 0 /* XXX */
	/*
	 * We want to explicitly allow DTrace consumers to stop a process
	 * before it actually executes the meat of the syscall.
	 */
	p = ttoproc(curthread);
	mutex_enter(&p->p_lock);
	if (curthread->t_dtrace_stop && !curthread->t_lwp->lwp_nostop) {
		curthread->t_dtrace_stop = 0;
		stop(PR_REQUESTED, 0);
	}
	mutex_exit(&p->p_lock);
#endif

	/* Run the real trap handler. */
	mach_call = (mach_call_t)(*sy->stsy_underlying);
	rval = mach_call(args);

	if ((id = sy->stsy_return) != DTRACE_IDNONE)
		(*machtrace_probe)(id, (uint64_t)rval, 0, 0, 0, 0);

	return (rval);
}
863
864static void
865machtrace_init(mach_trap_t *actual, machtrace_sysent_t **interposed)
866{
867	machtrace_sysent_t *msysent = *interposed;
868	int i;
869
870	if (msysent == NULL) {
871		*interposed = msysent = kmem_zalloc(sizeof (machtrace_sysent_t) *
872				NSYSCALL, KM_SLEEP);
873	}
874
875	for (i = 0; i < NSYSCALL; i++) {
876		mach_trap_t *a = &actual[i];
877		machtrace_sysent_t *s = &msysent[i];
878
879		if (LOADABLE_SYSCALL(a) && !LOADED_SYSCALL(a))
880			continue;
881
882		if (a->mach_trap_function == (mach_call_t)(dtrace_machtrace_syscall))
883			continue;
884
885		s->stsy_underlying = a->mach_trap_function;
886	}
887}
888
889/*ARGSUSED*/
890static void
891machtrace_provide(void *arg, const dtrace_probedesc_t *desc)
892{
893#pragma unused(arg) /* __APPLE__ */
894
895	int i;
896
897	if (desc != NULL)
898		return;
899
900	machtrace_init(mach_trap_table, &machtrace_sysent);
901
902	for (i = 0; i < NSYSCALL; i++) {
903
904		if (machtrace_sysent[i].stsy_underlying == NULL)
905			continue;
906
907		if (dtrace_probe_lookup(machtrace_id, NULL,
908					mach_syscall_name_table[i], "entry") != 0)
909			continue;
910
911		(void) dtrace_probe_create(machtrace_id, NULL, mach_syscall_name_table[i],
912					   "entry", MACHTRACE_ARTIFICIAL_FRAMES,
913					   (void *)((uintptr_t)SYSTRACE_ENTRY(i)));
914		(void) dtrace_probe_create(machtrace_id, NULL, mach_syscall_name_table[i],
915					   "return", MACHTRACE_ARTIFICIAL_FRAMES,
916					   (void *)((uintptr_t)SYSTRACE_RETURN(i)));
917
918		machtrace_sysent[i].stsy_entry = DTRACE_IDNONE;
919		machtrace_sysent[i].stsy_return = DTRACE_IDNONE;
920	}
921}
922
923/*ARGSUSED*/
924static void
925machtrace_destroy(void *arg, dtrace_id_t id, void *parg)
926{
927#pragma unused(arg,id) /* __APPLE__ */
928	int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg);
929
930#pragma unused(sysnum) /* __APPLE__ */
931
932	/*
933	 * There's nothing to do here but assert that we have actually been
934	 * disabled.
935	 */
936	if (SYSTRACE_ISENTRY((uintptr_t)parg)) {
937		ASSERT(machtrace_sysent[sysnum].stsy_entry == DTRACE_IDNONE);
938	} else {
939		ASSERT(machtrace_sysent[sysnum].stsy_return == DTRACE_IDNONE);
940	}
941}
942
/*
 * Provider "enable" callback for the mach_trap provider. Records probe id
 * `id` for the entry or return probe encoded in parg and, when this is the
 * first probe enabled for the trap, redirects
 * mach_trap_table[sysnum].mach_trap_function to dtrace_machtrace_syscall().
 * Always returns 0.
 */
/*ARGSUSED*/
static int
machtrace_enable(void *arg, dtrace_id_t id, void *parg)
{
#pragma unused(arg) /* __APPLE__ */

	int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg);
	/*
	 * Sampled before the id is stored below: non-zero means another probe
	 * of this trap was already enabled.
	 */
	int enabled = (machtrace_sysent[sysnum].stsy_entry != DTRACE_IDNONE ||
			machtrace_sysent[sysnum].stsy_return != DTRACE_IDNONE);

	if (SYSTRACE_ISENTRY((uintptr_t)parg)) {
		machtrace_sysent[sysnum].stsy_entry = id;
	} else {
		machtrace_sysent[sysnum].stsy_return = id;
	}

	if (enabled) {
	    /* The table slot is expected to be redirected already. */
	    ASSERT(mach_trap_table[sysnum].mach_trap_function == (void *)dtrace_machtrace_syscall);
	    return(0);
	}

	/* Shares dtrace_systrace_lock with the syscall provider. */
	lck_mtx_lock(&dtrace_systrace_lock);

	/* Swap in the interposer only if the slot still holds the saved handler. */
	if (mach_trap_table[sysnum].mach_trap_function == machtrace_sysent[sysnum].stsy_underlying) {
		vm_offset_t dss = (vm_offset_t)&dtrace_machtrace_syscall;
		ml_nofault_copy((vm_offset_t)&dss, (vm_offset_t)&mach_trap_table[sysnum].mach_trap_function, sizeof(vm_offset_t));
	}

	lck_mtx_unlock(&dtrace_systrace_lock);

	return(0);
}
975
/*
 * Provider "disable" callback for the mach_trap provider. Clears the probe
 * id of the entry or return probe encoded in parg. If the other probe of the
 * same trap is not enabled either, the original handler is first put back
 * into mach_trap_table[sysnum].mach_trap_function.
 */
/*ARGSUSED*/
static void
machtrace_disable(void *arg, dtrace_id_t id, void *parg)
{
#pragma unused(arg,id) /* __APPLE__ */

	int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg);
	/*
	 * Sampled before the id is cleared below: if either id is already
	 * DTRACE_IDNONE, no probe of this trap remains enabled once this one
	 * is disabled.
	 */
	int disable = (machtrace_sysent[sysnum].stsy_entry == DTRACE_IDNONE ||
			machtrace_sysent[sysnum].stsy_return == DTRACE_IDNONE);

	if (disable) {

		lck_mtx_lock(&dtrace_systrace_lock);

		/* Restore the saved handler only if the slot is still redirected. */
		if (mach_trap_table[sysnum].mach_trap_function == (mach_call_t)dtrace_machtrace_syscall) {
			ml_nofault_copy((vm_offset_t)&machtrace_sysent[sysnum].stsy_underlying, (vm_offset_t)&mach_trap_table[sysnum].mach_trap_function, sizeof(vm_offset_t));
		}
		lck_mtx_unlock(&dtrace_systrace_lock);
	}

	if (SYSTRACE_ISENTRY((uintptr_t)parg)) {
		machtrace_sysent[sysnum].stsy_entry = DTRACE_IDNONE;
	} else {
		machtrace_sysent[sysnum].stsy_return = DTRACE_IDNONE;
	}
}
1002
/*
 * Stability attributes passed to dtrace_register() for the "mach_trap"
 * provider; the values are the same as in systrace_attr.
 * NOTE(review): the five rows presumably follow dtrace_pattr_t's
 * provider/module/function/name/arguments order -- confirm in <sys/dtrace.h>.
 */
static dtrace_pattr_t machtrace_attr = {
{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
};
1010
/*
 * Provider operations vector passed to dtrace_register() for the
 * "mach_trap" provider. NULL slots are operations this provider does not
 * supply.
 * NOTE(review): slot meaning is positional and follows dtrace_pops_t's field
 * order -- confirm in <sys/dtrace.h> before adding or moving entries.
 */
static dtrace_pops_t machtrace_pops = {
	machtrace_provide,
	NULL,
	machtrace_enable,
	machtrace_disable,
	NULL,
	NULL,
	NULL,
	machtrace_getarg,
	NULL,
	machtrace_destroy
};
1023
/*
 * Attach the mach_trap provider.
 *
 * For DDI_ATTACH: points machtrace_probe at dtrace_probe, creates the
 * "machtrace" minor node and registers the "mach_trap" provider. If either
 * step fails, machtrace_probe is pointed at systrace_stub, the minor node is
 * removed and DDI_FAILURE is returned. On success the device is reported,
 * devi is remembered in machtrace_devi and DDI_SUCCESS is returned.
 *
 * DDI_RESUME returns DDI_SUCCESS without doing anything; any other command
 * returns DDI_FAILURE.
 *
 * Note that the "if (...) {" opened inside each preprocessor branch below is
 * closed by the single brace that follows the #endif.
 */
static int
machtrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
{
	switch (cmd) {
		case DDI_ATTACH:
			break;
		case DDI_RESUME:
			return (DDI_SUCCESS);
		default:
			return (DDI_FAILURE);
	}

#if !defined(__APPLE__)
	machtrace_probe = (void (*)())dtrace_probe;
	membar_enter();

	if (ddi_create_minor_node(devi, "machtrace", S_IFCHR, 0,
				DDI_PSEUDO, NULL) == DDI_FAILURE ||
			dtrace_register("mach_trap", &machtrace_attr, DTRACE_PRIV_USER, NULL,
				&machtrace_pops, NULL, &machtrace_id) != 0) {
		machtrace_probe = systrace_stub;
#else
	/* Publish the real probe routine before the provider is registered. */
	machtrace_probe = dtrace_probe;
	membar_enter();

	if (ddi_create_minor_node(devi, "machtrace", S_IFCHR, 0,
				DDI_PSEUDO, 0) == DDI_FAILURE ||
			dtrace_register("mach_trap", &machtrace_attr, DTRACE_PRIV_USER, NULL,
				&machtrace_pops, NULL, &machtrace_id) != 0) {
                machtrace_probe = (void (*))&systrace_stub;
#endif /* __APPLE__ */
		ddi_remove_minor_node(devi, NULL);
		return (DDI_FAILURE);
	}

	ddi_report_dev(devi);
	machtrace_devi = devi;

	return (DDI_SUCCESS);
}
1064
1065d_open_t _systrace_open;
1066
/*
 * open entry point of the systrace character device (see systrace_cdevsw):
 * always succeeds and ignores all of its arguments.
 */
int _systrace_open(dev_t dev, int flags, int devtype, struct proc *p)
{
#pragma unused(dev,flags,devtype,p)
	return 0;
}
1072
1073#define SYSTRACE_MAJOR  -24 /* let the kernel pick the device number */
1074
/*
 * Character device switch entry registered by systrace_init() through
 * cdevsw_add(). Only open is backed by a routine of this file
 * (_systrace_open); the remaining slots use the generic eno_* and nulldev
 * routines.
 */
static struct cdevsw systrace_cdevsw =
{
	_systrace_open,		/* open */
	eno_opcl,		/* close */
	eno_rdwrt,			/* read */
	eno_rdwrt,			/* write */
	eno_ioctl,		/* ioctl */
	(stop_fcn_t *)nulldev, /* stop */
	(reset_fcn_t *)nulldev, /* reset */
	NULL,				/* tty's */
	eno_select,			/* select */
	eno_mmap,			/* mmap */
	eno_strat,			/* strategy */
	eno_getc,			/* getc */
	eno_putc,			/* putc */
	0					/* type */
};
1096
1097static int gSysTraceInited = 0;
1098
1099void systrace_init( void );
1100
1101void systrace_init( void )
1102{
1103	if (0 == gSysTraceInited) {
1104		int majdevno = cdevsw_add(SYSTRACE_MAJOR, &systrace_cdevsw);
1105
1106		if (majdevno < 0) {
1107			printf("systrace_init: failed to allocate a major number!\n");
1108			gSysTraceInited = 0;
1109			return;
1110		}
1111
1112		systrace_attach( (dev_info_t	*)(uintptr_t)majdevno, DDI_ATTACH );
1113		machtrace_attach( (dev_info_t	*)(uintptr_t)majdevno, DDI_ATTACH );
1114
1115		gSysTraceInited = 1;
1116	} else
1117		panic("systrace_init: called twice!\n");
1118}
1119#undef SYSTRACE_MAJOR
1120#endif /* __APPLE__ */
1121
1122static uint64_t
1123systrace_getarg(void *arg, dtrace_id_t id, void *parg, int argno, int aframes)
1124{
1125#pragma unused(arg,id,parg,aframes)     /* __APPLE__ */
1126	uint64_t val = 0;
1127	syscall_arg_t *stack = (syscall_arg_t *)NULL;
1128
1129	uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());
1130
1131	if (uthread)
1132		stack = (syscall_arg_t *)uthread->t_dtrace_syscall_args;
1133
1134	if (!stack)
1135		return(0);
1136
1137	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
1138	/* dtrace_probe arguments arg0 .. arg4 are 64bits wide */
1139	val = (uint64_t)*(stack+argno);
1140	DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
1141	return (val);
1142}
1143
1144
1145static uint64_t
1146machtrace_getarg(void *arg, dtrace_id_t id, void *parg, int argno, int aframes)
1147{
1148#pragma unused(arg,id,parg,aframes)     /* __APPLE__ */
1149	uint64_t val = 0;
1150	syscall_arg_t *stack = (syscall_arg_t *)NULL;
1151
1152	uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());
1153
1154	if (uthread)
1155		stack = (syscall_arg_t *)uthread->t_dtrace_syscall_args;
1156
1157	if (!stack)
1158		return(0);
1159
1160	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
1161	/* dtrace_probe arguments arg0 .. arg4 are 64bits wide */
1162	val = (uint64_t)*(stack+argno);
1163	DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
1164	return (val);
1165}
1166
1167