systrace.c revision 227441
1179237Sjb/*
2179237Sjb * CDDL HEADER START
3179237Sjb *
4179237Sjb * The contents of this file are subject to the terms of the
5179237Sjb * Common Development and Distribution License (the "License").
6179237Sjb * You may not use this file except in compliance with the License.
7179237Sjb *
8179237Sjb * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9179237Sjb * or http://www.opensolaris.org/os/licensing.
10179237Sjb * See the License for the specific language governing permissions
11179237Sjb * and limitations under the License.
12179237Sjb *
13179237Sjb * When distributing Covered Code, include this CDDL HEADER in each
14179237Sjb * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15179237Sjb * If applicable, add the following below this CDDL HEADER, with the
16179237Sjb * fields enclosed by brackets "[]" replaced with your own identifying
17179237Sjb * information: Portions Copyright [yyyy] [name of copyright owner]
18179237Sjb *
19179237Sjb * CDDL HEADER END
20179237Sjb *
21179237Sjb * Portions Copyright 2006-2008 John Birrell jb@freebsd.org
22179237Sjb *
23179237Sjb * $FreeBSD: head/sys/cddl/dev/systrace/systrace.c 227441 2011-11-11 03:49:42Z rstone $
24179237Sjb *
25179237Sjb */
26179237Sjb
27179237Sjb/*
28179237Sjb * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
29179237Sjb * Use is subject to license terms.
30179237Sjb */
31179237Sjb
32179237Sjb#include <sys/cdefs.h>
33179237Sjb#include <sys/param.h>
34179237Sjb#include <sys/systm.h>
35179237Sjb#include <sys/conf.h>
36179237Sjb#include <sys/cpuvar.h>
37179237Sjb#include <sys/fcntl.h>
38179237Sjb#include <sys/filio.h>
39179237Sjb#include <sys/kdb.h>
40179237Sjb#include <sys/kernel.h>
41179237Sjb#include <sys/kmem.h>
42179237Sjb#include <sys/kthread.h>
43179237Sjb#include <sys/limits.h>
44179237Sjb#include <sys/linker.h>
45179237Sjb#include <sys/lock.h>
46179237Sjb#include <sys/malloc.h>
47179237Sjb#include <sys/module.h>
48179237Sjb#include <sys/mutex.h>
49179237Sjb#include <sys/poll.h>
50179237Sjb#include <sys/proc.h>
51179237Sjb#include <sys/selinfo.h>
52179237Sjb#include <sys/smp.h>
53184698Srodrigc#include <sys/sysproto.h>
54179237Sjb#include <sys/sysent.h>
55179237Sjb#include <sys/uio.h>
56179237Sjb#include <sys/unistd.h>
57179237Sjb#include <machine/stdarg.h>
58179237Sjb
59179237Sjb#include <sys/dtrace.h>
60179237Sjb
61184698Srodrigc#ifdef LINUX_SYSTRACE
62219561Savg#if defined(__amd64__)
63219561Savg#include <amd64/linux32/linux.h>
64219561Savg#include <amd64/linux32/linux32_proto.h>
65219561Savg#include <amd64/linux32/linux32_syscalls.c>
66219561Savg#include <amd64/linux32/linux32_systrace_args.c>
67220437Sart#define	MODNAME		"linux32"
68219561Savg#elif defined(__i386__)
69219561Savg#include <i386/linux/linux.h>
70219561Savg#include <i386/linux/linux_proto.h>
71219561Savg#include <i386/linux/linux_syscalls.c>
72219561Savg#include <i386/linux/linux_systrace_args.c>
73220437Sart#define	MODNAME		"linux"
74219561Savg#else
75219561Savg#error Only i386 and amd64 are supported.
76219561Savg#endif
77184698Srodrigcextern struct sysent linux_sysent[];
78184698Srodrigc#define	MAXSYSCALL	LINUX_SYS_MAXSYSCALL
79184698Srodrigc#define	SYSCALLNAMES	linux_syscallnames
80184698Srodrigc#define	SYSENT		linux_sysent
81219561Savg#elif defined(FREEBSD32_SYSTRACE)
82219561Savg/*
83219561Savg * The syscall arguments are processed into a DTrace argument array
84219561Savg * using a generated function. See sys/kern/makesyscalls.sh.
85219561Savg */
86219561Savg#include <compat/freebsd32/freebsd32_proto.h>
87219561Savg#include <compat/freebsd32/freebsd32_util.h>
88219561Savg#include <compat/freebsd32/freebsd32_syscall.h>
89219561Savg#include <compat/freebsd32/freebsd32_systrace_args.c>
90219561Savgextern const char *freebsd32_syscallnames[];
91219561Savg#define	MODNAME		"freebsd32"
92219561Savg#define	MAXSYSCALL	FREEBSD32_SYS_MAXSYSCALL
93219561Savg#define	SYSCALLNAMES	freebsd32_syscallnames
94219561Savg#define	SYSENT		freebsd32_sysent
95184698Srodrigc#else
96184698Srodrigc/*
97184698Srodrigc * The syscall arguments are processed into a DTrace argument array
98184698Srodrigc * using a generated function. See sys/kern/makesyscalls.sh.
99184698Srodrigc */
100184698Srodrigc#include <sys/syscall.h>
101184698Srodrigc#include <kern/systrace_args.c>
102219561Savg#define	MODNAME		"freebsd"
103184698Srodrigc#define	MAXSYSCALL	SYS_MAXSYSCALL
104184698Srodrigc#define	SYSCALLNAMES	syscallnames
105184698Srodrigc#define	SYSENT		sysent
106184698Srodrigc#endif
107184698Srodrigc
/*
 * PROVNAME is the provider name handed to dtrace_register(); DEVNAME is the
 * device node path handed to make_dev(), suffixed with the per-ABI MODNAME
 * selected above.
 */
108219561Savg#define	PROVNAME	"syscall"
109219561Savg#define	DEVNAME	        "dtrace/systrace/" MODNAME
110219561Savg
/* Passed as the artificial-frames argument of dtrace_probe_create(). */
111179237Sjb#define	SYSTRACE_ARTIFICIAL_FRAMES	1
112179237Sjb
/*
 * Encoding of the per-probe cookie (the "parg" pointer given to the provider
 * callbacks):
 *   - the low SYSTRACE_SHIFT bits hold the syscall number;
 *   - bit SYSTRACE_SHIFT is set for an "entry" probe and clear for a
 *     "return" probe.
 * SYSTRACE_ENTRY()/SYSTRACE_RETURN() build a cookie from a syscall number;
 * SYSTRACE_ISENTRY()/SYSTRACE_SYSNUM() take it apart again.
 */
113179237Sjb#define	SYSTRACE_SHIFT			16
114179237Sjb#define	SYSTRACE_ISENTRY(x)		((int)(x) >> SYSTRACE_SHIFT)
115179237Sjb#define	SYSTRACE_SYSNUM(x)		((int)(x) & ((1 << SYSTRACE_SHIFT) - 1))
116179237Sjb#define	SYSTRACE_ENTRY(id)		((1 << SYSTRACE_SHIFT) | (id))
117179237Sjb#define	SYSTRACE_RETURN(id)		(id)
118179237Sjb
/*
 * Compile-time guard: every syscall number must fit below the entry flag bit,
 * otherwise the cookie encoding above would be ambiguous.
 */
119184698Srodrigc#if ((1 << SYSTRACE_SHIFT) <= MAXSYSCALL)
120179237Sjb#error 1 << SYSTRACE_SHIFT must exceed number of system calls
121179237Sjb#endif
122179237Sjb
123179237Sjbstatic d_open_t	systrace_open;
124179237Sjbstatic int	systrace_unload(void);
125179237Sjbstatic void	systrace_getargdesc(void *, dtrace_id_t, void *, dtrace_argdesc_t *);
126179237Sjbstatic void	systrace_provide(void *, dtrace_probedesc_t *);
127179237Sjbstatic void	systrace_destroy(void *, dtrace_id_t, void *);
128179237Sjbstatic void	systrace_enable(void *, dtrace_id_t, void *);
129179237Sjbstatic void	systrace_disable(void *, dtrace_id_t, void *);
130179237Sjbstatic void	systrace_load(void *);
131179237Sjb
132179237Sjbstatic struct cdevsw systrace_cdevsw = {
133179237Sjb	.d_version	= D_VERSION,
134179237Sjb	.d_open		= systrace_open,
135184698Srodrigc#ifdef LINUX_SYSTRACE
136220437Sart	.d_name		= "systrace_" MODNAME,
137184698Srodrigc#else
138179237Sjb	.d_name		= "systrace",
139184698Srodrigc#endif
140179237Sjb};
141179237Sjb
/*
 * Two views of the same syscall-name table.  The union is initialized through
 * its first member (the const-qualified view) with SYSCALLNAMES, and
 * systrace_provide() reads it back through pp_syscallnames so the names can
 * be passed as plain "char *".
 * NOTE(review): presumably needed because the DTrace probe lookup/create
 * functions take non-const strings -- confirm against sys/dtrace.h.
 */
142184698Srodrigcstatic union	{
143184698Srodrigc	const char	**p_constnames;
144184698Srodrigc	char		**pp_syscallnames;
145184698Srodrigc} uglyhack = { SYSCALLNAMES };
146184698Srodrigc
147179237Sjbstatic dtrace_pattr_t systrace_attr = {
148179237Sjb{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
149179237Sjb{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
150179237Sjb{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
151179237Sjb{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
152179237Sjb{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
153179237Sjb};
154179237Sjb
155179237Sjbstatic dtrace_pops_t systrace_pops = {
156179237Sjb	systrace_provide,
157179237Sjb	NULL,
158179237Sjb	systrace_enable,
159179237Sjb	systrace_disable,
160179237Sjb	NULL,
161179237Sjb	NULL,
162179237Sjb	systrace_getargdesc,
163179237Sjb	NULL,
164179237Sjb	NULL,
165179237Sjb	systrace_destroy
166179237Sjb};
167179237Sjb
168179237Sjbstatic struct cdev		*systrace_cdev;
169179237Sjbstatic dtrace_provider_id_t	systrace_id;
170179237Sjb
171184698Srodrigc#if !defined(LINUX_SYSTRACE)
172179237Sjb/*
173184698Srodrigc * Probe callback function.
174184698Srodrigc *
175184698Srodrigc * Note: This function is called for _all_ syscalls, regardless of which sysent
176184698Srodrigc *       array the syscall comes from. It could be a standard syscall or a
177184698Srodrigc *       compat syscall from something like Linux.
178179237Sjb */
179179237Sjbstatic void
180211608Srpaulosystrace_probe(u_int32_t id, int sysnum, struct sysent *sysent, void *params,
181211608Srpaulo    int ret)
182179237Sjb{
183179237Sjb	int		n_args	= 0;
184179237Sjb	u_int64_t	uargs[8];
185179237Sjb
186211608Srpaulo	memset(uargs, 0, sizeof(uargs));
187179237Sjb	/*
188184698Srodrigc	 * Check if this syscall has an argument conversion function
189184698Srodrigc	 * registered.
190179237Sjb	 */
191211608Srpaulo	if (params && sysent->sy_systrace_args_func != NULL) {
192179237Sjb		/*
193179237Sjb		 * Convert the syscall parameters using the registered
194179237Sjb		 * function.
195179237Sjb		 */
196184698Srodrigc		(*sysent->sy_systrace_args_func)(sysnum, params, uargs, &n_args);
197211608Srpaulo	} else if (params) {
198179237Sjb		/*
199179237Sjb		 * Use the built-in system call argument conversion
200179237Sjb		 * function to translate the syscall structure fields
201184698Srodrigc		 * into the array of 64-bit values that DTrace
202179237Sjb		 * expects.
203179237Sjb		 */
204179237Sjb		systrace_args(sysnum, params, uargs, &n_args);
205211608Srpaulo	} else {
206211608Srpaulo		/*
207211608Srpaulo		 * Since params is NULL, this is a 'return' probe.
208211608Srpaulo		 * Set arg0 and arg1 as the return value of this syscall.
209211608Srpaulo		 */
210211608Srpaulo		uargs[0] = uargs[1] = ret;
211211608Srpaulo	}
212179237Sjb
213179237Sjb	/* Process the probe using the converted argments. */
214179237Sjb	dtrace_probe(id, uargs[0], uargs[1], uargs[2], uargs[3], uargs[4]);
215179237Sjb}
216227441Srstone
217184698Srodrigc#endif
218179237Sjb
219179237Sjbstatic void
220179237Sjbsystrace_getargdesc(void *arg, dtrace_id_t id, void *parg, dtrace_argdesc_t *desc)
221179237Sjb{
222179237Sjb	int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg);
223179237Sjb
224227441Srstone	if (SYSTRACE_ISENTRY((uintptr_t)parg))
225227441Srstone		systrace_entry_setargdesc(sysnum, desc->dtargd_ndx,
226227441Srstone		    desc->dtargd_native, sizeof(desc->dtargd_native));
227227441Srstone	else
228227441Srstone		systrace_return_setargdesc(sysnum, desc->dtargd_ndx,
229227441Srstone		    desc->dtargd_native, sizeof(desc->dtargd_native));
230179237Sjb
231179237Sjb	if (desc->dtargd_native[0] == '\0')
232179237Sjb		desc->dtargd_ndx = DTRACE_ARGNONE;
233179237Sjb
234179237Sjb	return;
235179237Sjb}
236179237Sjb
237179237Sjbstatic void
238179237Sjbsystrace_provide(void *arg, dtrace_probedesc_t *desc)
239179237Sjb{
240179237Sjb	int i;
241179237Sjb
242179237Sjb	if (desc != NULL)
243179237Sjb		return;
244179237Sjb
245184698Srodrigc	for (i = 0; i < MAXSYSCALL; i++) {
246219561Savg		if (dtrace_probe_lookup(systrace_id, MODNAME,
247184698Srodrigc		    uglyhack.pp_syscallnames[i], "entry") != 0)
248179237Sjb			continue;
249179237Sjb
250219561Savg		(void) dtrace_probe_create(systrace_id, MODNAME, uglyhack.pp_syscallnames[i],
251179237Sjb		    "entry", SYSTRACE_ARTIFICIAL_FRAMES,
252179237Sjb		    (void *)((uintptr_t)SYSTRACE_ENTRY(i)));
253219561Savg		(void) dtrace_probe_create(systrace_id, MODNAME, uglyhack.pp_syscallnames[i],
254179237Sjb		    "return", SYSTRACE_ARTIFICIAL_FRAMES,
255179237Sjb		    (void *)((uintptr_t)SYSTRACE_RETURN(i)));
256179237Sjb	}
257179237Sjb}
258179237Sjb
259179237Sjbstatic void
260179237Sjbsystrace_destroy(void *arg, dtrace_id_t id, void *parg)
261179237Sjb{
262179237Sjb#ifdef DEBUG
263179237Sjb	int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg);
264179237Sjb
265179237Sjb	/*
266179237Sjb	 * There's nothing to do here but assert that we have actually been
267179237Sjb	 * disabled.
268179237Sjb	 */
269179237Sjb	if (SYSTRACE_ISENTRY((uintptr_t)parg)) {
270179237Sjb		ASSERT(sysent[sysnum].sy_entry == 0);
271179237Sjb	} else {
272179237Sjb		ASSERT(sysent[sysnum].sy_return == 0);
273179237Sjb	}
274179237Sjb#endif
275179237Sjb}
276179237Sjb
277179237Sjbstatic void
278179237Sjbsystrace_enable(void *arg, dtrace_id_t id, void *parg)
279179237Sjb{
280179237Sjb	int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg);
281179237Sjb
282184698Srodrigc	if (SYSENT[sysnum].sy_systrace_args_func == NULL)
283184698Srodrigc		SYSENT[sysnum].sy_systrace_args_func = systrace_args;
284184698Srodrigc
285179237Sjb	if (SYSTRACE_ISENTRY((uintptr_t)parg))
286184698Srodrigc		SYSENT[sysnum].sy_entry = id;
287179237Sjb	else
288184698Srodrigc		SYSENT[sysnum].sy_return = id;
289179237Sjb}
290179237Sjb
291179237Sjbstatic void
292179237Sjbsystrace_disable(void *arg, dtrace_id_t id, void *parg)
293179237Sjb{
294179237Sjb	int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg);
295179237Sjb
296184698Srodrigc	SYSENT[sysnum].sy_entry = 0;
297184698Srodrigc	SYSENT[sysnum].sy_return = 0;
298179237Sjb}
299179237Sjb
300179237Sjbstatic void
301179237Sjbsystrace_load(void *dummy)
302179237Sjb{
303179237Sjb	/* Create the /dev/dtrace/systrace entry. */
304179237Sjb	systrace_cdev = make_dev(&systrace_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600,
305184698Srodrigc	   DEVNAME);
306179237Sjb
307184698Srodrigc	if (dtrace_register(PROVNAME, &systrace_attr, DTRACE_PRIV_USER,
308179237Sjb	    NULL, &systrace_pops, NULL, &systrace_id) != 0)
309179237Sjb		return;
310179237Sjb
311184698Srodrigc#if !defined(LINUX_SYSTRACE)
312179237Sjb	systrace_probe_func = systrace_probe;
313184698Srodrigc#endif
314179237Sjb}
315179237Sjb
316179237Sjb
317179237Sjbstatic int
318179237Sjbsystrace_unload()
319179237Sjb{
320179237Sjb	int error = 0;
321179237Sjb
322179237Sjb	if ((error = dtrace_unregister(systrace_id)) != 0)
323179237Sjb		return (error);
324179237Sjb
325184698Srodrigc#if !defined(LINUX_SYSTRACE)
326179237Sjb	systrace_probe_func = NULL;
327184698Srodrigc#endif
328179237Sjb
329179237Sjb	destroy_dev(systrace_cdev);
330179237Sjb
331179237Sjb	return (error);
332179237Sjb}
333179237Sjb
334179237Sjbstatic int
335179237Sjbsystrace_modevent(module_t mod __unused, int type, void *data __unused)
336179237Sjb{
337179237Sjb	int error = 0;
338179237Sjb
339179237Sjb	switch (type) {
340179237Sjb	case MOD_LOAD:
341179237Sjb		break;
342179237Sjb
343179237Sjb	case MOD_UNLOAD:
344179237Sjb		break;
345179237Sjb
346179237Sjb	case MOD_SHUTDOWN:
347179237Sjb		break;
348179237Sjb
349179237Sjb	default:
350179237Sjb		error = EOPNOTSUPP;
351179237Sjb		break;
352179237Sjb
353179237Sjb	}
354179237Sjb	return (error);
355179237Sjb}
356179237Sjb
357179237Sjbstatic int
358179237Sjbsystrace_open(struct cdev *dev __unused, int oflags __unused, int devtype __unused, struct thread *td __unused)
359179237Sjb{
360179237Sjb	return (0);
361179237Sjb}
362179237Sjb
/*
 * The real load/unload work is hooked in here, at SI_SUB_DTRACE_PROVIDER,
 * rather than in systrace_modevent(), whose MOD_LOAD/MOD_UNLOAD cases are
 * empty.
 */
363179237SjbSYSINIT(systrace_load, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY, systrace_load, NULL);
364179237SjbSYSUNINIT(systrace_unload, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY, systrace_unload, NULL);
365179237Sjb
/*
 * Module declaration: one variant per provider flavour.  Every variant
 * depends on the dtrace and opensolaris modules; the Linux variant also
 * depends on the linux module.
 * NOTE(review): the Linux variant is named systrace_linux32 even when built
 * for i386, where MODNAME is "linux" -- confirm this is intended.
 */
366184698Srodrigc#ifdef LINUX_SYSTRACE
367219561SavgDEV_MODULE(systrace_linux32, systrace_modevent, NULL);
368219561SavgMODULE_VERSION(systrace_linux32, 1);
369219561SavgMODULE_DEPEND(systrace_linux32, linux, 1, 1, 1);
370219561SavgMODULE_DEPEND(systrace_linux32, dtrace, 1, 1, 1);
371219561SavgMODULE_DEPEND(systrace_linux32, opensolaris, 1, 1, 1);
372219561Savg#elif defined(FREEBSD32_SYSTRACE)
373219561SavgDEV_MODULE(systrace_freebsd32, systrace_modevent, NULL);
374219561SavgMODULE_VERSION(systrace_freebsd32, 1);
375219561SavgMODULE_DEPEND(systrace_freebsd32, dtrace, 1, 1, 1);
376219561SavgMODULE_DEPEND(systrace_freebsd32, opensolaris, 1, 1, 1);
377184698Srodrigc#else
378179237SjbDEV_MODULE(systrace, systrace_modevent, NULL);
379179237SjbMODULE_VERSION(systrace, 1);
380179237SjbMODULE_DEPEND(systrace, dtrace, 1, 1, 1);
381179237SjbMODULE_DEPEND(systrace, opensolaris, 1, 1, 1);
382184698Srodrigc#endif
383