profile.c revision 179237
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 *
21 * Portions Copyright 2006-2008 John Birrell jb@freebsd.org
22 *
23 * $FreeBSD: head/sys/cddl/dev/profile/profile.c 179237 2008-05-23 05:59:42Z jb $
24 *
25 */
26
27/*
28 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
29 * Use is subject to license terms.
30 */
31
32#include <sys/cdefs.h>
33#include <sys/param.h>
34#include <sys/systm.h>
35#include <sys/conf.h>
36#include <sys/cpuvar.h>
37#include <sys/fcntl.h>
38#include <sys/filio.h>
39#include <sys/kdb.h>
40#include <sys/kernel.h>
41#include <sys/kmem.h>
42#include <sys/kthread.h>
43#include <sys/limits.h>
44#include <sys/linker.h>
45#include <sys/lock.h>
46#include <sys/malloc.h>
47#include <sys/module.h>
48#include <sys/mutex.h>
49#include <sys/poll.h>
50#include <sys/proc.h>
51#include <sys/selinfo.h>
52#include <sys/smp.h>
53#include <sys/uio.h>
54#include <sys/unistd.h>
55#include <machine/stdarg.h>
56
57#include <sys/cyclic.h>
58#include <sys/dtrace.h>
59#include <sys/dtrace_bsd.h>
60
/* Size, in bytes, of the probe-name buffers used in this file. */
#define	PROF_NAMELEN		15

/* Probe kinds, each paired with the probe-name prefix that selects it. */
#define	PROF_PROFILE		0
#define	PROF_PREFIX_PROFILE	"profile-"
#define	PROF_TICK		1
#define	PROF_PREFIX_TICK	"tick-"
67
/*
 * Regardless of platform, there are five artificial frames in the case of the
 * profile provider:
 *
 *	profile_fire
 *	cyclic_expire
 *	cyclic_fire
 *	[ cbe ]
 *	[ locore ]
 *
 * On amd64, there are two frames associated with locore:  one in locore, and
 * another in common interrupt dispatch code.  (i386 has not been modified to
 * use this common layer.)  Further, on i386, the interrupted instruction
 * appears as its own stack frame.  All of this means that we need to add one
 * frame for amd64, and then take one away for both amd64 and i386.
 *
 * On SPARC, the picture is further complicated because the compiler
 * optimizes away tail-calls -- so the following frames are optimized away:
 *
 *	profile_fire
 *	cyclic_expire
 *
 * This gives three frames.  However, on DEBUG kernels, the cyclic_expire
 * frame cannot be tail-call eliminated, yielding four frames in this case.
 *
 * All of the above constraints lead to the mess below.  Yes, the profile
 * provider should ideally figure this out on-the-fly by hitting one of its
 * own probes and then walking its own stack trace.  This is complicated,
 * however, and the static definition doesn't seem to be overly brittle.
 * Still, we allow for a manual override in case we get it completely wrong.
 *
 * On any architecture not listed below the macro is left undefined.
 */
#if defined(__amd64)
#define	PROF_ARTIFICIAL_FRAMES	7
#elif defined(__i386)
#define	PROF_ARTIFICIAL_FRAMES	6
#elif defined(__sparc) && defined(DEBUG)
#define	PROF_ARTIFICIAL_FRAMES	4
#elif defined(__sparc)
#define	PROF_ARTIFICIAL_FRAMES	3
#endif
114
115typedef struct profile_probe {
116	char		prof_name[PROF_NAMELEN];
117	dtrace_id_t	prof_id;
118	int		prof_kind;
119	hrtime_t	prof_interval;
120	cyclic_id_t	prof_cyclic;
121} profile_probe_t;
122
123typedef struct profile_probe_percpu {
124	hrtime_t	profc_expected;
125	hrtime_t	profc_interval;
126	profile_probe_t	*profc_probe;
127} profile_probe_percpu_t;
128
129static d_open_t	profile_open;
130static int	profile_unload(void);
131static void	profile_create(hrtime_t, char *, int);
132static void	profile_destroy(void *, dtrace_id_t, void *);
133static void	profile_enable(void *, dtrace_id_t, void *);
134static void	profile_disable(void *, dtrace_id_t, void *);
135static void	profile_load(void *);
136static void	profile_provide(void *, dtrace_probedesc_t *);
137
/*
 * Rates, in firings per second, of the profile-<rate> probes that
 * profile_provide() creates when it is given no probe description.  The
 * table has 20 slots; a slot holding zero is unused and is skipped.
 */
static int profile_rates[20] = {
	97, 199, 499, 997, 1999, 4001, 4999
};
144
/*
 * Rates, in firings per second, of the tick-<rate> probes that
 * profile_provide() creates when it is given no probe description.  The
 * table has 15 slots; a slot holding zero is unused and is skipped.
 */
static int profile_ticks[15] = {
	1, 10, 100, 500, 1000, 5000
};
150
/*
 * profile_max defines the upper bound on the number of profile probes that
 * can exist (this is to prevent malicious or clumsy users from exhausting
 * system resources by creating a slew of profile probes).  The value is meant
 * to come from PROFILE_MAX_DEFAULT or, when present, from profile-max-probes
 * in the profile.conf file.
 *
 * NOTE(review): nothing visible in this file reads profile.conf; the only
 * assignment to profile_max here is the static initializer below.
 */
#define	PROFILE_MAX_DEFAULT	1000	/* default max. number of probes */

/* Maximum number of profile probes. */
static uint32_t profile_max = PROFILE_MAX_DEFAULT;

/* Current number of profile probes. */
static uint32_t profile_total;
162
163static struct cdevsw profile_cdevsw = {
164	.d_version	= D_VERSION,
165	.d_open		= profile_open,
166	.d_name		= "profile",
167};
168
169static dtrace_pattr_t profile_attr = {
170{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
171{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
172{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
173{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
174{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
175};
176
177static dtrace_pops_t profile_pops = {
178	profile_provide,
179	NULL,
180	profile_enable,
181	profile_disable,
182	NULL,
183	NULL,
184	NULL,
185	NULL,
186	NULL,
187	profile_destroy
188};
189
190static struct cdev		*profile_cdev;
191static dtrace_provider_id_t	profile_id;
192static hrtime_t			profile_interval_min = NANOSEC / 5000;	/* 5000 hz */
193static int			profile_aframes = 0;			/* override */
194
195static void
196profile_fire(void *arg)
197{
198	profile_probe_percpu_t *pcpu = arg;
199	profile_probe_t *prof = pcpu->profc_probe;
200	hrtime_t late;
201	solaris_cpu_t *c = &solaris_cpu[curcpu];
202
203	late = gethrtime() - pcpu->profc_expected;
204	pcpu->profc_expected += pcpu->profc_interval;
205
206	dtrace_probe(prof->prof_id, c->cpu_profile_pc,
207	    c->cpu_profile_upc, late, 0, 0);
208}
209
210static void
211profile_tick(void *arg)
212{
213	profile_probe_t *prof = arg;
214	solaris_cpu_t *c = &solaris_cpu[curcpu];
215
216	dtrace_probe(prof->prof_id, c->cpu_profile_pc,
217	    c->cpu_profile_upc, 0, 0, 0);
218}
219
220static void
221profile_create(hrtime_t interval, char *name, int kind)
222{
223	profile_probe_t *prof;
224
225	if (interval < profile_interval_min)
226		return;
227
228	if (dtrace_probe_lookup(profile_id, NULL, NULL, name) != 0)
229		return;
230
231	atomic_add_32(&profile_total, 1);
232	if (profile_total > profile_max) {
233		atomic_add_32(&profile_total, -1);
234		return;
235	}
236
237	prof = kmem_zalloc(sizeof (profile_probe_t), KM_SLEEP);
238	(void) strcpy(prof->prof_name, name);
239	prof->prof_interval = interval;
240	prof->prof_cyclic = CYCLIC_NONE;
241	prof->prof_kind = kind;
242	prof->prof_id = dtrace_probe_create(profile_id,
243	    NULL, NULL, name,
244	    profile_aframes ? profile_aframes : PROF_ARTIFICIAL_FRAMES, prof);
245}
246
247/*ARGSUSED*/
248static void
249profile_provide(void *arg, dtrace_probedesc_t *desc)
250{
251	int i, j, rate, kind;
252	hrtime_t val = 0, mult = 1, len = 0;
253	char *name, *suffix = NULL;
254
255	const struct {
256		char *prefix;
257		int kind;
258	} types[] = {
259		{ PROF_PREFIX_PROFILE, PROF_PROFILE },
260		{ PROF_PREFIX_TICK, PROF_TICK },
261		{ 0, 0 }
262	};
263
264	const struct {
265		char *name;
266		hrtime_t mult;
267	} suffixes[] = {
268		{ "ns", 	NANOSEC / NANOSEC },
269		{ "nsec",	NANOSEC / NANOSEC },
270		{ "us",		NANOSEC / MICROSEC },
271		{ "usec",	NANOSEC / MICROSEC },
272		{ "ms",		NANOSEC / MILLISEC },
273		{ "msec",	NANOSEC / MILLISEC },
274		{ "s",		NANOSEC / SEC },
275		{ "sec",	NANOSEC / SEC },
276		{ "m",		NANOSEC * (hrtime_t)60 },
277		{ "min",	NANOSEC * (hrtime_t)60 },
278		{ "h",		NANOSEC * (hrtime_t)(60 * 60) },
279		{ "hour",	NANOSEC * (hrtime_t)(60 * 60) },
280		{ "d",		NANOSEC * (hrtime_t)(24 * 60 * 60) },
281		{ "day",	NANOSEC * (hrtime_t)(24 * 60 * 60) },
282		{ "hz",		0 },
283		{ NULL }
284	};
285
286	if (desc == NULL) {
287		char n[PROF_NAMELEN];
288
289		/*
290		 * If no description was provided, provide all of our probes.
291		 */
292		for (i = 0; i < sizeof (profile_rates) / sizeof (int); i++) {
293			if ((rate = profile_rates[i]) == 0)
294				continue;
295
296			(void) snprintf(n, PROF_NAMELEN, "%s%d",
297			    PROF_PREFIX_PROFILE, rate);
298			profile_create(NANOSEC / rate, n, PROF_PROFILE);
299		}
300
301		for (i = 0; i < sizeof (profile_ticks) / sizeof (int); i++) {
302			if ((rate = profile_ticks[i]) == 0)
303				continue;
304
305			(void) snprintf(n, PROF_NAMELEN, "%s%d",
306			    PROF_PREFIX_TICK, rate);
307			profile_create(NANOSEC / rate, n, PROF_TICK);
308		}
309
310		return;
311	}
312
313	name = desc->dtpd_name;
314
315	for (i = 0; types[i].prefix != NULL; i++) {
316		len = strlen(types[i].prefix);
317
318		if (strncmp(name, types[i].prefix, len) != 0)
319			continue;
320		break;
321	}
322
323	if (types[i].prefix == NULL)
324		return;
325
326	kind = types[i].kind;
327	j = strlen(name) - len;
328
329	/*
330	 * We need to start before any time suffix.
331	 */
332	for (j = strlen(name); j >= len; j--) {
333		if (name[j] >= '0' && name[j] <= '9')
334			break;
335		suffix = &name[j];
336	}
337
338	ASSERT(suffix != NULL);
339
340	/*
341	 * Now determine the numerical value present in the probe name.
342	 */
343	for (; j >= len; j--) {
344		if (name[j] < '0' || name[j] > '9')
345			return;
346
347		val += (name[j] - '0') * mult;
348		mult *= (hrtime_t)10;
349	}
350
351	if (val == 0)
352		return;
353
354	/*
355	 * Look-up the suffix to determine the multiplier.
356	 */
357	for (i = 0, mult = 0; suffixes[i].name != NULL; i++) {
358		if (strcasecmp(suffixes[i].name, suffix) == 0) {
359			mult = suffixes[i].mult;
360			break;
361		}
362	}
363
364	if (suffixes[i].name == NULL && *suffix != '\0')
365		return;
366
367	if (mult == 0) {
368		/*
369		 * The default is frequency-per-second.
370		 */
371		val = NANOSEC / val;
372	} else {
373		val *= mult;
374	}
375
376	profile_create(val, name, kind);
377}
378
379/* ARGSUSED */
380static void
381profile_destroy(void *arg, dtrace_id_t id, void *parg)
382{
383	profile_probe_t *prof = parg;
384
385	ASSERT(prof->prof_cyclic == CYCLIC_NONE);
386	kmem_free(prof, sizeof (profile_probe_t));
387
388	ASSERT(profile_total >= 1);
389	atomic_add_32(&profile_total, -1);
390}
391
392/*ARGSUSED*/
393static void
394profile_online(void *arg, cpu_t *cpu, cyc_handler_t *hdlr, cyc_time_t *when)
395{
396	profile_probe_t *prof = arg;
397	profile_probe_percpu_t *pcpu;
398
399	pcpu = kmem_zalloc(sizeof (profile_probe_percpu_t), KM_SLEEP);
400	pcpu->profc_probe = prof;
401
402	hdlr->cyh_func = profile_fire;
403	hdlr->cyh_arg = pcpu;
404
405	when->cyt_interval = prof->prof_interval;
406	when->cyt_when = gethrtime() + when->cyt_interval;
407
408	pcpu->profc_expected = when->cyt_when;
409	pcpu->profc_interval = when->cyt_interval;
410}
411
412/*ARGSUSED*/
413static void
414profile_offline(void *arg, cpu_t *cpu, void *oarg)
415{
416	profile_probe_percpu_t *pcpu = oarg;
417
418	ASSERT(pcpu->profc_probe == arg);
419	kmem_free(pcpu, sizeof (profile_probe_percpu_t));
420}
421
422/* ARGSUSED */
423static void
424profile_enable(void *arg, dtrace_id_t id, void *parg)
425{
426	profile_probe_t *prof = parg;
427	cyc_omni_handler_t omni;
428	cyc_handler_t hdlr;
429	cyc_time_t when;
430
431	ASSERT(prof->prof_interval != 0);
432	ASSERT(MUTEX_HELD(&cpu_lock));
433
434	if (prof->prof_kind == PROF_TICK) {
435		hdlr.cyh_func = profile_tick;
436		hdlr.cyh_arg = prof;
437
438		when.cyt_interval = prof->prof_interval;
439		when.cyt_when = gethrtime() + when.cyt_interval;
440	} else {
441		ASSERT(prof->prof_kind == PROF_PROFILE);
442		omni.cyo_online = profile_online;
443		omni.cyo_offline = profile_offline;
444		omni.cyo_arg = prof;
445	}
446
447	if (prof->prof_kind == PROF_TICK) {
448		prof->prof_cyclic = cyclic_add(&hdlr, &when);
449	} else {
450		prof->prof_cyclic = cyclic_add_omni(&omni);
451	}
452}
453
454/* ARGSUSED */
455static void
456profile_disable(void *arg, dtrace_id_t id, void *parg)
457{
458	profile_probe_t *prof = parg;
459
460	ASSERT(prof->prof_cyclic != CYCLIC_NONE);
461	ASSERT(MUTEX_HELD(&cpu_lock));
462
463	cyclic_remove(prof->prof_cyclic);
464	prof->prof_cyclic = CYCLIC_NONE;
465}
466
467static void
468profile_load(void *dummy)
469{
470	/* Create the /dev/dtrace/profile entry. */
471	profile_cdev = make_dev(&profile_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600,
472	    "dtrace/profile");
473
474	if (dtrace_register("profile", &profile_attr, DTRACE_PRIV_USER,
475	    NULL, &profile_pops, NULL, &profile_id) != 0)
476		return;
477}
478
479
480static int
481profile_unload()
482{
483	int error = 0;
484
485	if ((error = dtrace_unregister(profile_id)) != 0)
486		return (error);
487
488	destroy_dev(profile_cdev);
489
490	return (error);
491}
492
493/* ARGSUSED */
494static int
495profile_modevent(module_t mod __unused, int type, void *data __unused)
496{
497	int error = 0;
498
499	switch (type) {
500	case MOD_LOAD:
501		break;
502
503	case MOD_UNLOAD:
504		break;
505
506	case MOD_SHUTDOWN:
507		break;
508
509	default:
510		error = EOPNOTSUPP;
511		break;
512
513	}
514	return (error);
515}
516
517/* ARGSUSED */
518static int
519profile_open(struct cdev *dev __unused, int oflags __unused, int devtype __unused, struct thread *td __unused)
520{
521	return (0);
522}
523
524SYSINIT(profile_load, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY, profile_load, NULL);
525SYSUNINIT(profile_unload, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY, profile_unload, NULL);
526
527DEV_MODULE(profile, profile_modevent, NULL);
528MODULE_VERSION(profile, 1);
529MODULE_DEPEND(profile, dtrace, 1, 1, 1);
530MODULE_DEPEND(profile, cyclic, 1, 1, 1);
531MODULE_DEPEND(profile, opensolaris, 1, 1, 1);
532