1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26/* #pragma ident	"@(#)systrace.c	1.6	06/09/19 SMI" */
27
28#ifdef KERNEL
29#ifndef _KERNEL
30#define _KERNEL /* Solaris vs. Darwin */
31#endif
32#endif
33
34#include <kern/thread.h>
35#include <mach/thread_status.h>
36
37/* XXX All of these should really be derived from syscall_sw.h */
38#if defined (__x86_64__)
39#define SYSCALL_CLASS_SHIFT 24
40#define SYSCALL_CLASS_MASK  (0xFF << SYSCALL_CLASS_SHIFT)
41#define SYSCALL_NUMBER_MASK (~SYSCALL_CLASS_MASK)
42#define I386_SYSCALL_NUMBER_MASK (0xFFFF)
43#endif
44
45#include <sys/param.h>
46#include <sys/systm.h>
47#include <sys/proc.h>
48#include <sys/errno.h>
49#include <sys/ioctl.h>
50#include <sys/conf.h>
51#include <sys/fcntl.h>
52#include <miscfs/devfs/devfs.h>
53
54#include <sys/dtrace.h>
55#include <sys/dtrace_impl.h>
56#include "systrace.h"
57#include <sys/stat.h>
58#include <sys/systm.h>
59#include <sys/conf.h>
60#include <sys/user.h>
61
62#include <machine/pal_routines.h>
63
64#if defined (__x86_64__)
65#define	SYSTRACE_ARTIFICIAL_FRAMES	2
66#define MACHTRACE_ARTIFICIAL_FRAMES 3
67#else
68#error Unknown Architecture
69#endif
70
71#include <sys/sysent.h>
72#define sy_callc sy_call /* Map Solaris slot name to Darwin's */
73#define NSYSCALL nsysent /* and is less than 500 or so */
74
75extern const char *syscallnames[];
76
77#include <sys/dtrace_glue.h>
78#define casptr dtrace_casptr
79#define membar_enter dtrace_membar_producer
80
81#define LOADABLE_SYSCALL(a) 0 /* Not pertinent to Darwin. */
82#define LOADED_SYSCALL(a) 1 /* Not pertinent to Darwin. */
83
84extern lck_attr_t* dtrace_lck_attr;
85extern lck_grp_t* dtrace_lck_grp;
86static lck_mtx_t	dtrace_systrace_lock;		/* probe state lock */
87
88systrace_sysent_t *systrace_sysent = NULL;
89void (*systrace_probe)(dtrace_id_t, uint64_t, uint64_t, uint64_t, uint64_t, uint64_t);
90
91static uint64_t systrace_getarg(void *, dtrace_id_t, void *, int, int);
92
93void
94systrace_stub(dtrace_id_t id, uint64_t arg0, uint64_t arg1,
95    uint64_t arg2, uint64_t arg3, uint64_t arg4)
96{
97#pragma unused(id,arg0,arg1,arg2,arg3,arg4)
98}
99
100int32_t
101dtrace_systrace_syscall(struct proc *pp, void *uap, int *rv)
102{
103	unsigned short      code;	/* The system call number */
104
105	systrace_sysent_t *sy;
106	dtrace_id_t id;
107	int32_t rval;
108#if 0 /* XXX */
109	proc_t *p;
110#endif
111	syscall_arg_t *ip = (syscall_arg_t *)uap;
112
113#if defined (__x86_64__)
114	{
115		pal_register_cache_state(current_thread(), VALID);
116		x86_saved_state_t   *tagged_regs = (x86_saved_state_t *)find_user_regs(current_thread());
117
118		if (is_saved_state64(tagged_regs)) {
119			x86_saved_state64_t *regs = saved_state64(tagged_regs);
120			code = regs->rax & SYSCALL_NUMBER_MASK;
121			/*
122			 * Check for indirect system call... system call number
123			 * passed as 'arg0'
124			 */
125			if (code == 0) {
126				code = regs->rdi;
127			}
128		} else {
129			code = saved_state32(tagged_regs)->eax & I386_SYSCALL_NUMBER_MASK;
130
131			if (code == 0) {
132				vm_offset_t params = (vm_offset_t) (saved_state32(tagged_regs)->uesp + sizeof (int));
133				code = fuword(params);
134			}
135		}
136	}
137#else
138#error Unknown Architecture
139#endif
140
141	// Bounds "check" the value of code a la unix_syscall
142	sy = (code >= NUM_SYSENT) ? &systrace_sysent[63] : &systrace_sysent[code];
143
144	if ((id = sy->stsy_entry) != DTRACE_IDNONE) {
145		uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());
146		if (uthread)
147			uthread->t_dtrace_syscall_args = (void *)ip;
148
149		if (ip)
150			(*systrace_probe)(id, *ip, *(ip+1), *(ip+2), *(ip+3), *(ip+4));
151		else
152			(*systrace_probe)(id, 0, 0, 0, 0, 0);
153
154		if (uthread)
155			uthread->t_dtrace_syscall_args = (void *)0;
156	}
157
158#if 0 /* XXX */
159	/*
160	 * APPLE NOTE: Not implemented.
161	 * We want to explicitly allow DTrace consumers to stop a process
162	 * before it actually executes the meat of the syscall.
163	 */
164	p = ttoproc(curthread);
165	mutex_enter(&p->p_lock);
166	if (curthread->t_dtrace_stop && !curthread->t_lwp->lwp_nostop) {
167		curthread->t_dtrace_stop = 0;
168		stop(PR_REQUESTED, 0);
169	}
170	mutex_exit(&p->p_lock);
171#endif
172
173	rval = (*sy->stsy_underlying)(pp, uap, rv);
174
175	if ((id = sy->stsy_return) != DTRACE_IDNONE) {
176		uint64_t munged_rv0, munged_rv1;
177    	uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());
178
179		if (uthread)
180			uthread->t_dtrace_errno = rval; /* Establish t_dtrace_errno now in case this enabling refers to it. */
181
182		/*
183	 	 * "Decode" rv for use in the call to dtrace_probe()
184	 	 */
185		if (rval == ERESTART) {
186			munged_rv0 = -1LL; /* System call will be reissued in user mode. Make DTrace report a -1 return. */
187			munged_rv1 = -1LL;
188		} else if (rval != EJUSTRETURN) {
189			if (rval) {
190				munged_rv0 = -1LL; /* Mimic what libc will do. */
191				munged_rv1 = -1LL;
192			} else {
193				switch (sy->stsy_return_type) {
194				case _SYSCALL_RET_INT_T:
195					munged_rv0 = rv[0];
196					munged_rv1 = rv[1];
197					break;
198				case _SYSCALL_RET_UINT_T:
199					munged_rv0 = ((u_int)rv[0]);
200					munged_rv1 = ((u_int)rv[1]);
201					break;
202				case _SYSCALL_RET_OFF_T:
203				case _SYSCALL_RET_UINT64_T:
204					munged_rv0 = *(u_int64_t *)rv;
205					munged_rv1 = 0LL;
206					break;
207				case _SYSCALL_RET_ADDR_T:
208				case _SYSCALL_RET_SIZE_T:
209				case _SYSCALL_RET_SSIZE_T:
210					munged_rv0 = *(user_addr_t *)rv;
211					munged_rv1 = 0LL;
212					break;
213				case _SYSCALL_RET_NONE:
214					munged_rv0 = 0LL;
215					munged_rv1 = 0LL;
216					break;
217				default:
218					munged_rv0 = 0LL;
219					munged_rv1 = 0LL;
220					break;
221				}
222			}
223		} else {
224			munged_rv0 = 0LL;
225			munged_rv1 = 0LL;
226		}
227
228		/*
229		 * <http://mail.opensolaris.org/pipermail/dtrace-discuss/2007-January/003276.html> says:
230		 *
231		 * "This is a bit of an historical artifact. At first, the syscall provider just
232		 * had its return value in arg0, and the fbt and pid providers had their return
233		 * values in arg1 (so that we could use arg0 for the offset of the return site).
234		 *
235		 * We inevitably started writing scripts where we wanted to see the return
236		 * values from probes in all three providers, and we made this script easier
237		 * to write by replicating the syscall return values in arg1 to match fbt and
238		 * pid. We debated briefly about removing the return value from arg0, but
239		 * decided that it would be less confusing to have the same data in two places
240		 * than to have some non-helpful, non-intuitive value in arg0.
241		 *
242		 * This change was made 4/23/2003 according to the DTrace project's putback log."
243		 */
244		(*systrace_probe)(id, munged_rv0, munged_rv0, munged_rv1, (uint64_t)rval, 0);
245	}
246
247	return (rval);
248}
249
250void
251dtrace_systrace_syscall_return(unsigned short code, int rval, int *rv)
252{
253	systrace_sysent_t *sy;
254	dtrace_id_t id;
255
256	// Bounds "check" the value of code a la unix_syscall_return
257	sy = (code >= NUM_SYSENT) ? &systrace_sysent[63] : &systrace_sysent[code];
258
259	if ((id = sy->stsy_return) != DTRACE_IDNONE) {
260		uint64_t munged_rv0, munged_rv1;
261    	uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());
262
263		if (uthread)
264			uthread->t_dtrace_errno = rval; /* Establish t_dtrace_errno now in case this enabling refers to it. */
265
266		/*
267	 	 * "Decode" rv for use in the call to dtrace_probe()
268	 	 */
269		if (rval == ERESTART) {
270			munged_rv0 = -1LL; /* System call will be reissued in user mode. Make DTrace report a -1 return. */
271			munged_rv1 = -1LL;
272		} else if (rval != EJUSTRETURN) {
273			if (rval) {
274				munged_rv0 = -1LL; /* Mimic what libc will do. */
275				munged_rv1 = -1LL;
276			} else {
277				switch (sy->stsy_return_type) {
278				case _SYSCALL_RET_INT_T:
279					munged_rv0 = rv[0];
280					munged_rv1 = rv[1];
281					break;
282				case _SYSCALL_RET_UINT_T:
283					munged_rv0 = ((u_int)rv[0]);
284					munged_rv1 = ((u_int)rv[1]);
285					break;
286				case _SYSCALL_RET_OFF_T:
287				case _SYSCALL_RET_UINT64_T:
288					munged_rv0 = *(u_int64_t *)rv;
289					munged_rv1 = 0LL;
290					break;
291				case _SYSCALL_RET_ADDR_T:
292				case _SYSCALL_RET_SIZE_T:
293				case _SYSCALL_RET_SSIZE_T:
294					munged_rv0 = *(user_addr_t *)rv;
295					munged_rv1 = 0LL;
296					break;
297				case _SYSCALL_RET_NONE:
298					munged_rv0 = 0LL;
299					munged_rv1 = 0LL;
300					break;
301				default:
302					munged_rv0 = 0LL;
303					munged_rv1 = 0LL;
304					break;
305				}
306			}
307		} else {
308			munged_rv0 = 0LL;
309			munged_rv1 = 0LL;
310		}
311
312		(*systrace_probe)(id, munged_rv0, munged_rv0, munged_rv1, (uint64_t)rval, 0);
313	}
314}
315
/*
 * Encoding of the per-probe private argument handed to dtrace_probe_create():
 * the low SYSTRACE_SHIFT bits hold the table index, and bit SYSTRACE_SHIFT is
 * set for an "entry" probe and clear for a "return" probe.
 */
#define	SYSTRACE_SHIFT			16
#define	SYSTRACE_ISENTRY(x)		((int)(x) >> SYSTRACE_SHIFT)
#define	SYSTRACE_SYSNUM(x)		((int)(x) & ((1 << SYSTRACE_SHIFT) - 1))
#define	SYSTRACE_ENTRY(id)		((1 << SYSTRACE_SHIFT) | (id))
#define	SYSTRACE_RETURN(id)		(id)

/*
 * NOTE(review): NSYSCALL is defined earlier in this file as nsysent.  Unless
 * nsysent is itself a preprocessor constant (not visible here), #if treats
 * the identifier as 0 and this check can never fire -- confirm.
 */
#if ((1 << SYSTRACE_SHIFT) <= NSYSCALL)
#error 1 << SYSTRACE_SHIFT must exceed number of system calls
#endif
325
326static dev_info_t *systrace_devi;
327static dtrace_provider_id_t systrace_id;
328
329/*
330 * APPLE NOTE: Avoid name clash with Darwin automagic conf symbol.
331 * See balanced undef below.
332 */
333#define systrace_init _systrace_init
334
335static void
336systrace_init(struct sysent *actual, systrace_sysent_t **interposed)
337{
338
339	systrace_sysent_t *ssysent = *interposed;  /* Avoid sysent shadow warning
340							   from bsd/sys/sysent.h */
341	int i;
342
343	if (ssysent == NULL) {
344		*interposed = ssysent = kmem_zalloc(sizeof (systrace_sysent_t) *
345		    NSYSCALL, KM_SLEEP);
346	}
347
348	for (i = 0; i < NSYSCALL; i++) {
349		struct sysent *a = &actual[i];
350		systrace_sysent_t *s = &ssysent[i];
351
352		if (LOADABLE_SYSCALL(a) && !LOADED_SYSCALL(a))
353			continue;
354
355		if (a->sy_callc == dtrace_systrace_syscall)
356			continue;
357
358#ifdef _SYSCALL32_IMPL
359		if (a->sy_callc == dtrace_systrace_syscall32)
360			continue;
361#endif
362
363		s->stsy_underlying = a->sy_callc;
364		s->stsy_return_type = a->sy_return_type;
365	}
366	lck_mtx_init(&dtrace_systrace_lock, dtrace_lck_grp, dtrace_lck_attr);
367}
368
369
370/*ARGSUSED*/
371static void
372systrace_provide(void *arg, const dtrace_probedesc_t *desc)
373{
374#pragma unused(arg) /* __APPLE__ */
375	int i;
376
377	if (desc != NULL)
378		return;
379
380	systrace_init(sysent, &systrace_sysent);
381#ifdef _SYSCALL32_IMPL
382	systrace_init(sysent32, &systrace_sysent32);
383#endif
384
385	for (i = 0; i < NSYSCALL; i++) {
386		if (systrace_sysent[i].stsy_underlying == NULL)
387			continue;
388
389		if (dtrace_probe_lookup(systrace_id, NULL,
390		    syscallnames[i], "entry") != 0)
391			continue;
392
393		(void) dtrace_probe_create(systrace_id, NULL, syscallnames[i],
394		    "entry", SYSTRACE_ARTIFICIAL_FRAMES,
395		    (void *)((uintptr_t)SYSTRACE_ENTRY(i)));
396		(void) dtrace_probe_create(systrace_id, NULL, syscallnames[i],
397		    "return", SYSTRACE_ARTIFICIAL_FRAMES,
398		    (void *)((uintptr_t)SYSTRACE_RETURN(i)));
399
400		systrace_sysent[i].stsy_entry = DTRACE_IDNONE;
401		systrace_sysent[i].stsy_return = DTRACE_IDNONE;
402#ifdef _SYSCALL32_IMPL
403		systrace_sysent32[i].stsy_entry = DTRACE_IDNONE;
404		systrace_sysent32[i].stsy_return = DTRACE_IDNONE;
405#endif
406	}
407}
408#undef systrace_init
409
410/*ARGSUSED*/
411static void
412systrace_destroy(void *arg, dtrace_id_t id, void *parg)
413{
414#pragma unused(arg,id) /* __APPLE__ */
415
416	int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg);
417
418#pragma unused(sysnum)  /* __APPLE__ */
419	/*
420	 * There's nothing to do here but assert that we have actually been
421	 * disabled.
422	 */
423	if (SYSTRACE_ISENTRY((uintptr_t)parg)) {
424		ASSERT(systrace_sysent[sysnum].stsy_entry == DTRACE_IDNONE);
425#ifdef _SYSCALL32_IMPL
426		ASSERT(systrace_sysent32[sysnum].stsy_entry == DTRACE_IDNONE);
427#endif
428	} else {
429		ASSERT(systrace_sysent[sysnum].stsy_return == DTRACE_IDNONE);
430#ifdef _SYSCALL32_IMPL
431		ASSERT(systrace_sysent32[sysnum].stsy_return == DTRACE_IDNONE);
432#endif
433	}
434}
435
436/*ARGSUSED*/
437static int
438systrace_enable(void *arg, dtrace_id_t id, void *parg)
439{
440#pragma unused(arg) /* __APPLE__ */
441
442	int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg);
443	int enabled = (systrace_sysent[sysnum].stsy_entry != DTRACE_IDNONE ||
444	    systrace_sysent[sysnum].stsy_return != DTRACE_IDNONE);
445
446	if (SYSTRACE_ISENTRY((uintptr_t)parg)) {
447		systrace_sysent[sysnum].stsy_entry = id;
448#ifdef _SYSCALL32_IMPL
449		systrace_sysent32[sysnum].stsy_entry = id;
450#endif
451	} else {
452		systrace_sysent[sysnum].stsy_return = id;
453#ifdef _SYSCALL32_IMPL
454		systrace_sysent32[sysnum].stsy_return = id;
455#endif
456	}
457
458	if (enabled) {
459		ASSERT(sysent[sysnum].sy_callc == dtrace_systrace_syscall);
460		return(0);
461	}
462#ifdef _SYSCALL32_IMPL
463	(void) casptr(&sysent32[sysnum].sy_callc,
464	    (void *)systrace_sysent32[sysnum].stsy_underlying,
465	    (void *)dtrace_systrace_syscall32);
466#endif
467
468	lck_mtx_lock(&dtrace_systrace_lock);
469	if (sysent[sysnum].sy_callc == systrace_sysent[sysnum].stsy_underlying) {
470		vm_offset_t dss = (vm_offset_t)&dtrace_systrace_syscall;
471		ml_nofault_copy((vm_offset_t)&dss, (vm_offset_t)&sysent[sysnum].sy_callc, sizeof(vm_offset_t));
472	}
473	lck_mtx_unlock(&dtrace_systrace_lock);
474	return (0);
475}
476
477/*ARGSUSED*/
478static void
479systrace_disable(void *arg, dtrace_id_t id, void *parg)
480{
481#pragma unused(arg,id) /* __APPLE__ */
482
483	int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg);
484	int disable = (systrace_sysent[sysnum].stsy_entry == DTRACE_IDNONE ||
485	    systrace_sysent[sysnum].stsy_return == DTRACE_IDNONE);
486
487	if (disable) {
488		lck_mtx_lock(&dtrace_systrace_lock);
489		if (sysent[sysnum].sy_callc == dtrace_systrace_syscall)
490			ml_nofault_copy((vm_offset_t)&systrace_sysent[sysnum].stsy_underlying, (vm_offset_t)&sysent[sysnum].sy_callc, sizeof(systrace_sysent[sysnum].stsy_underlying));
491		lck_mtx_unlock(&dtrace_systrace_lock);
492
493#ifdef _SYSCALL32_IMPL
494		(void) casptr(&sysent32[sysnum].sy_callc,
495		    (void *)dtrace_systrace_syscall32,
496		    (void *)systrace_sysent32[sysnum].stsy_underlying);
497#endif
498	}
499
500	if (SYSTRACE_ISENTRY((uintptr_t)parg)) {
501		systrace_sysent[sysnum].stsy_entry = DTRACE_IDNONE;
502#ifdef _SYSCALL32_IMPL
503		systrace_sysent32[sysnum].stsy_entry = DTRACE_IDNONE;
504#endif
505	} else {
506		systrace_sysent[sysnum].stsy_return = DTRACE_IDNONE;
507#ifdef _SYSCALL32_IMPL
508		systrace_sysent32[sysnum].stsy_return = DTRACE_IDNONE;
509#endif
510	}
511}
512
/*
 * Stability attributes passed to dtrace_register() for the "syscall"
 * provider.  The initialiser is positional: five {name stability, data
 * stability, class} triples.
 * NOTE(review): the rows presumably follow dtrace_pattr_t's member order
 * (provider, module, function, name, args) -- confirm against sys/dtrace.h.
 */
static dtrace_pattr_t systrace_attr = {
{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
};

/*
 * Provider operations vector passed to dtrace_register() for the "syscall"
 * provider.  The initialiser is positional, so the order of the entries
 * (including the NULL placeholders) must match dtrace_pops_t exactly.
 */
static dtrace_pops_t systrace_pops = {
	systrace_provide,	/* create probes */
	NULL,			/* slot not implemented by this provider */
	systrace_enable,	/* enable a probe */
	systrace_disable,	/* disable a probe */
	NULL,			/* slot not implemented by this provider */
	NULL,			/* slot not implemented by this provider */
	NULL,			/* slot not implemented by this provider */
	systrace_getarg,	/* fetch a probe argument by index */
	NULL,			/* slot not implemented by this provider */
	systrace_destroy	/* destroy a probe */
};
533
534static int
535systrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
536{
537	switch (cmd) {
538	case DDI_ATTACH:
539		break;
540	case DDI_RESUME:
541		return (DDI_SUCCESS);
542	default:
543		return (DDI_FAILURE);
544	}
545
546	systrace_probe = (void(*))&dtrace_probe;
547	membar_enter();
548
549	if (ddi_create_minor_node(devi, "systrace", S_IFCHR, 0,
550	    DDI_PSEUDO, 0) == DDI_FAILURE ||
551	    dtrace_register("syscall", &systrace_attr, DTRACE_PRIV_USER, NULL,
552	    &systrace_pops, NULL, &systrace_id) != 0) {
553		systrace_probe = systrace_stub;
554		ddi_remove_minor_node(devi, NULL);
555		return (DDI_FAILURE);
556	}
557
558	ddi_report_dev(devi);
559	systrace_devi = devi;
560
561	return (DDI_SUCCESS);
562}
563
564
565/*
566 * APPLE NOTE:  systrace_detach not implemented
567 */
568#if !defined(__APPLE__)
569static int
570systrace_detach(dev_info_t *devi, ddi_detach_cmd_t cmd)
571{
572	switch (cmd) {
573	case DDI_DETACH:
574		break;
575	case DDI_SUSPEND:
576		return (DDI_SUCCESS);
577	default:
578		return (DDI_FAILURE);
579	}
580
581	if (dtrace_unregister(systrace_id) != 0)
582		return (DDI_FAILURE);
583
584	ddi_remove_minor_node(devi, NULL);
585	systrace_probe = systrace_stub;
586	return (DDI_SUCCESS);
587}
588#endif /* __APPLE__ */
589
590
591typedef kern_return_t (*mach_call_t)(void *);
592
593/* APPLE NOTE: From #include <kern/syscall_sw.h> which may be changed for 64 bit! */
594typedef void    mach_munge_t(void *);
595
/*
 * Local declaration of a Mach trap table entry.
 * NOTE(review): per the comment above this mirrors a definition from
 * <kern/syscall_sw.h>; the member layout presumably has to stay in step with
 * that header -- confirm whenever the header changes.
 */
typedef struct {
	int			mach_trap_arg_count;
	kern_return_t		(*mach_trap_function)(void *); /* handler; the slot machtrace_enable()/machtrace_disable() rewrite */
#if defined(__arm64__) || defined(__x86_64__)
	mach_munge_t		*mach_trap_arg_munge32; /* system call arguments for 32-bit */
#endif
	int			mach_trap_u32_words;
#if	MACH_ASSERT
	const char*		mach_trap_name;
#endif /* MACH_ASSERT */
} mach_trap_t;
607
608extern const mach_trap_t              mach_trap_table[]; /* syscall_sw.h now declares this as const */
609extern int                      mach_trap_count;
610
611extern const char *mach_syscall_name_table[];
612
613/* XXX From osfmk/i386/bsd_i386.c */
/*
 * Argument block handed to a Mach trap handler: nine consecutive
 * syscall_arg_t slots.  dtrace_machtrace_syscall() views the block as an
 * array of syscall_arg_t when it fires the entry probe.
 */
struct mach_call_args {
        syscall_arg_t arg1;
        syscall_arg_t arg2;
        syscall_arg_t arg3;
        syscall_arg_t arg4;
        syscall_arg_t arg5;
        syscall_arg_t arg6;
        syscall_arg_t arg7;
        syscall_arg_t arg8;
        syscall_arg_t arg9;
};
625
/*
 * From here on NSYSCALL means the number of Mach traps rather than the
 * number of BSD system calls.
 */
#undef NSYSCALL
#define NSYSCALL mach_trap_count

/*
 * NOTE(review): mach_trap_count is declared above as an extern int, not a
 * preprocessor constant, so #if evaluates the identifier as 0.  The
 * comparison is therefore always false and this #error can never trigger.
 */
#if ((1 << SYSTRACE_SHIFT) <= NSYSCALL)
#error 1 << SYSTRACE_SHIFT must exceed number of Mach traps
#endif
632
/*
 * Per-trap interposition record for the mach_trap provider; the
 * machtrace_sysent table holds one per Mach trap table slot.
 */
typedef struct machtrace_sysent {
	dtrace_id_t	stsy_entry;	/* id of the enabled "entry" probe, or DTRACE_IDNONE */
	dtrace_id_t	stsy_return;	/* id of the enabled "return" probe, or DTRACE_IDNONE */
	kern_return_t	(*stsy_underlying)(void *);	/* original trap handler saved by machtrace_init() */
	int32_t		stsy_return_type;	/* not written by any code in this file */
} machtrace_sysent_t;
639
640static machtrace_sysent_t *machtrace_sysent = NULL;
641
642void (*machtrace_probe)(dtrace_id_t, uint64_t, uint64_t,
643    uint64_t, uint64_t, uint64_t);
644
645static uint64_t machtrace_getarg(void *, dtrace_id_t, void *, int, int);
646
647static dev_info_t *machtrace_devi;
648static dtrace_provider_id_t machtrace_id;
649
650static kern_return_t
651dtrace_machtrace_syscall(struct mach_call_args *args)
652{
653	int code;	/* The mach call number */
654
655	machtrace_sysent_t *sy;
656	dtrace_id_t id;
657	kern_return_t rval;
658#if 0 /* XXX */
659	proc_t *p;
660#endif
661	syscall_arg_t *ip = (syscall_arg_t *)args;
662	mach_call_t mach_call;
663
664#if defined (__x86_64__)
665	{
666		pal_register_cache_state(current_thread(), VALID);
667		x86_saved_state_t   *tagged_regs = (x86_saved_state_t *)find_user_regs(current_thread());
668
669		if (is_saved_state64(tagged_regs)) {
670			code = saved_state64(tagged_regs)->rax & SYSCALL_NUMBER_MASK;
671		} else {
672			code = -saved_state32(tagged_regs)->eax;
673		}
674	}
675#else
676#error Unknown Architecture
677#endif
678
679	sy = &machtrace_sysent[code];
680
681	if ((id = sy->stsy_entry) != DTRACE_IDNONE) {
682		uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());
683
684		if (uthread)
685			uthread->t_dtrace_syscall_args = (void *)ip;
686
687		(*machtrace_probe)(id, *ip, *(ip+1), *(ip+2), *(ip+3), *(ip+4));
688
689		if (uthread)
690			uthread->t_dtrace_syscall_args = (void *)0;
691	}
692
693#if 0 /* XXX */
694	/*
695	 * APPLE NOTE:  Not implemented.
696	 * We want to explicitly allow DTrace consumers to stop a process
697	 * before it actually executes the meat of the syscall.
698	 */
699	p = ttoproc(curthread);
700	mutex_enter(&p->p_lock);
701	if (curthread->t_dtrace_stop && !curthread->t_lwp->lwp_nostop) {
702		curthread->t_dtrace_stop = 0;
703		stop(PR_REQUESTED, 0);
704	}
705	mutex_exit(&p->p_lock);
706#endif
707
708	mach_call = (mach_call_t)(*sy->stsy_underlying);
709	rval = mach_call(args);
710
711	if ((id = sy->stsy_return) != DTRACE_IDNONE)
712		(*machtrace_probe)(id, (uint64_t)rval, 0, 0, 0, 0);
713
714	return (rval);
715}
716
717static void
718machtrace_init(const mach_trap_t *actual, machtrace_sysent_t **interposed)
719{
720	machtrace_sysent_t *msysent = *interposed;
721	int i;
722
723	if (msysent == NULL) {
724		*interposed = msysent = kmem_zalloc(sizeof (machtrace_sysent_t) *
725				NSYSCALL, KM_SLEEP);
726	}
727
728	for (i = 0; i < NSYSCALL; i++) {
729		const mach_trap_t *a = &actual[i];
730		machtrace_sysent_t *s = &msysent[i];
731
732		if (LOADABLE_SYSCALL(a) && !LOADED_SYSCALL(a))
733			continue;
734
735		if (a->mach_trap_function == (mach_call_t)(dtrace_machtrace_syscall))
736			continue;
737
738		s->stsy_underlying = a->mach_trap_function;
739	}
740}
741
742/*ARGSUSED*/
743static void
744machtrace_provide(void *arg, const dtrace_probedesc_t *desc)
745{
746#pragma unused(arg) /* __APPLE__ */
747
748	int i;
749
750	if (desc != NULL)
751		return;
752
753	machtrace_init(mach_trap_table, &machtrace_sysent);
754
755	for (i = 0; i < NSYSCALL; i++) {
756
757		if (machtrace_sysent[i].stsy_underlying == NULL)
758			continue;
759
760		if (dtrace_probe_lookup(machtrace_id, NULL,
761					mach_syscall_name_table[i], "entry") != 0)
762			continue;
763
764		(void) dtrace_probe_create(machtrace_id, NULL, mach_syscall_name_table[i],
765					   "entry", MACHTRACE_ARTIFICIAL_FRAMES,
766					   (void *)((uintptr_t)SYSTRACE_ENTRY(i)));
767		(void) dtrace_probe_create(machtrace_id, NULL, mach_syscall_name_table[i],
768					   "return", MACHTRACE_ARTIFICIAL_FRAMES,
769					   (void *)((uintptr_t)SYSTRACE_RETURN(i)));
770
771		machtrace_sysent[i].stsy_entry = DTRACE_IDNONE;
772		machtrace_sysent[i].stsy_return = DTRACE_IDNONE;
773	}
774}
775
776/*ARGSUSED*/
777static void
778machtrace_destroy(void *arg, dtrace_id_t id, void *parg)
779{
780#pragma unused(arg,id) /* __APPLE__ */
781	int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg);
782
783#pragma unused(sysnum) /* __APPLE__ */
784
785	/*
786	 * There's nothing to do here but assert that we have actually been
787	 * disabled.
788	 */
789	if (SYSTRACE_ISENTRY((uintptr_t)parg)) {
790		ASSERT(machtrace_sysent[sysnum].stsy_entry == DTRACE_IDNONE);
791	} else {
792		ASSERT(machtrace_sysent[sysnum].stsy_return == DTRACE_IDNONE);
793	}
794}
795
796/*ARGSUSED*/
797static int
798machtrace_enable(void *arg, dtrace_id_t id, void *parg)
799{
800#pragma unused(arg) /* __APPLE__ */
801
802	int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg);
803	int enabled = (machtrace_sysent[sysnum].stsy_entry != DTRACE_IDNONE ||
804			machtrace_sysent[sysnum].stsy_return != DTRACE_IDNONE);
805
806	if (SYSTRACE_ISENTRY((uintptr_t)parg)) {
807		machtrace_sysent[sysnum].stsy_entry = id;
808	} else {
809		machtrace_sysent[sysnum].stsy_return = id;
810	}
811
812	if (enabled) {
813	    ASSERT(mach_trap_table[sysnum].mach_trap_function == (void *)dtrace_machtrace_syscall);
814	    return(0);
815	}
816
817	lck_mtx_lock(&dtrace_systrace_lock);
818
819	if (mach_trap_table[sysnum].mach_trap_function == machtrace_sysent[sysnum].stsy_underlying) {
820		vm_offset_t dss = (vm_offset_t)&dtrace_machtrace_syscall;
821		ml_nofault_copy((vm_offset_t)&dss, (vm_offset_t)&mach_trap_table[sysnum].mach_trap_function, sizeof(vm_offset_t));
822	}
823
824	lck_mtx_unlock(&dtrace_systrace_lock);
825
826	return(0);
827}
828
829/*ARGSUSED*/
830static void
831machtrace_disable(void *arg, dtrace_id_t id, void *parg)
832{
833#pragma unused(arg,id) /* __APPLE__ */
834
835	int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg);
836	int disable = (machtrace_sysent[sysnum].stsy_entry == DTRACE_IDNONE ||
837			machtrace_sysent[sysnum].stsy_return == DTRACE_IDNONE);
838
839	if (disable) {
840
841		lck_mtx_lock(&dtrace_systrace_lock);
842
843		if (mach_trap_table[sysnum].mach_trap_function == (mach_call_t)dtrace_machtrace_syscall) {
844			ml_nofault_copy((vm_offset_t)&machtrace_sysent[sysnum].stsy_underlying, (vm_offset_t)&mach_trap_table[sysnum].mach_trap_function, sizeof(vm_offset_t));
845		}
846		lck_mtx_unlock(&dtrace_systrace_lock);
847	}
848
849	if (SYSTRACE_ISENTRY((uintptr_t)parg)) {
850		machtrace_sysent[sysnum].stsy_entry = DTRACE_IDNONE;
851	} else {
852		machtrace_sysent[sysnum].stsy_return = DTRACE_IDNONE;
853	}
854}
855
/*
 * Stability attributes passed to dtrace_register() for the "mach_trap"
 * provider.  The initialiser is positional: five {name stability, data
 * stability, class} triples.
 * NOTE(review): the rows presumably follow dtrace_pattr_t's member order
 * (provider, module, function, name, args) -- confirm against sys/dtrace.h.
 */
static dtrace_pattr_t machtrace_attr = {
{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
};

/*
 * Provider operations vector passed to dtrace_register() for the "mach_trap"
 * provider.  The initialiser is positional, so the order of the entries
 * (including the NULL placeholders) must match dtrace_pops_t exactly.
 */
static dtrace_pops_t machtrace_pops = {
	machtrace_provide,	/* create probes */
	NULL,			/* slot not implemented by this provider */
	machtrace_enable,	/* enable a probe */
	machtrace_disable,	/* disable a probe */
	NULL,			/* slot not implemented by this provider */
	NULL,			/* slot not implemented by this provider */
	NULL,			/* slot not implemented by this provider */
	machtrace_getarg,	/* fetch a probe argument by index */
	NULL,			/* slot not implemented by this provider */
	machtrace_destroy	/* destroy a probe */
};
876
877static int
878machtrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
879{
880	switch (cmd) {
881		case DDI_ATTACH:
882			break;
883		case DDI_RESUME:
884			return (DDI_SUCCESS);
885		default:
886			return (DDI_FAILURE);
887	}
888
889	machtrace_probe = dtrace_probe;
890	membar_enter();
891
892	if (ddi_create_minor_node(devi, "machtrace", S_IFCHR, 0,
893				DDI_PSEUDO, 0) == DDI_FAILURE ||
894			dtrace_register("mach_trap", &machtrace_attr, DTRACE_PRIV_USER, NULL,
895				&machtrace_pops, NULL, &machtrace_id) != 0) {
896                machtrace_probe = (void (*))&systrace_stub;
897		ddi_remove_minor_node(devi, NULL);
898		return (DDI_FAILURE);
899	}
900
901	ddi_report_dev(devi);
902	machtrace_devi = devi;
903
904	return (DDI_SUCCESS);
905}
906
907d_open_t _systrace_open;
908
909int _systrace_open(dev_t dev, int flags, int devtype, struct proc *p)
910{
911#pragma unused(dev,flags,devtype,p)
912	return 0;
913}
914
915#define SYSTRACE_MAJOR  -24 /* let the kernel pick the device number */
916
/*
 * A struct describing which functions will get invoked for certain
 * actions.
 *
 * Only open has a local implementation (_systrace_open, which returns 0);
 * every other slot is filled with an eno_* or nulldev entry.
 * NOTE(review): those are presumably the kernel's stock "not supported" /
 * no-op handlers -- confirm against sys/conf.h.
 */
static struct cdevsw systrace_cdevsw =
{
	_systrace_open,		/* open */
	eno_opcl,		/* close */
	eno_rdwrt,			/* read */
	eno_rdwrt,			/* write */
	eno_ioctl,		/* ioctl */
	(stop_fcn_t *)nulldev, /* stop */
	(reset_fcn_t *)nulldev, /* reset */
	NULL,				/* tty's */
	eno_select,			/* select */
	eno_mmap,			/* mmap */
	eno_strat,			/* strategy */
	eno_getc,			/* getc */
	eno_putc,			/* putc */
	0					/* type */
};
938
939static int gSysTraceInited = 0;
940
941void systrace_init( void );
942
943void systrace_init( void )
944{
945	if (0 == gSysTraceInited) {
946		int majdevno = cdevsw_add(SYSTRACE_MAJOR, &systrace_cdevsw);
947
948		if (majdevno < 0) {
949			printf("systrace_init: failed to allocate a major number!\n");
950			gSysTraceInited = 0;
951			return;
952		}
953
954		systrace_attach( (dev_info_t	*)(uintptr_t)majdevno, DDI_ATTACH );
955		machtrace_attach( (dev_info_t	*)(uintptr_t)majdevno, DDI_ATTACH );
956
957		gSysTraceInited = 1;
958	} else
959		panic("systrace_init: called twice!\n");
960}
961#undef SYSTRACE_MAJOR
962
963static uint64_t
964systrace_getarg(void *arg, dtrace_id_t id, void *parg, int argno, int aframes)
965{
966#pragma unused(arg,id,parg,aframes)     /* __APPLE__ */
967	uint64_t val = 0;
968	syscall_arg_t *stack = (syscall_arg_t *)NULL;
969
970	uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());
971
972	if (uthread)
973		stack = (syscall_arg_t *)uthread->t_dtrace_syscall_args;
974
975	if (!stack)
976		return(0);
977
978	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
979	/* dtrace_probe arguments arg0 .. arg4 are 64bits wide */
980	val = (uint64_t)*(stack+argno);
981	DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
982	return (val);
983}
984
985
986static uint64_t
987machtrace_getarg(void *arg, dtrace_id_t id, void *parg, int argno, int aframes)
988{
989#pragma unused(arg,id,parg,aframes)     /* __APPLE__ */
990	uint64_t val = 0;
991	syscall_arg_t *stack = (syscall_arg_t *)NULL;
992
993	uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());
994
995	if (uthread)
996		stack = (syscall_arg_t *)uthread->t_dtrace_syscall_args;
997
998	if (!stack)
999		return(0);
1000
1001	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
1002	/* dtrace_probe arguments arg0 .. arg4 are 64bits wide */
1003	val = (uint64_t)*(stack+argno);
1004	DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
1005	return (val);
1006}
1007
1008