profile.c revision 233409
1193323Sed/*
2193323Sed * CDDL HEADER START
3193323Sed *
4193323Sed * The contents of this file are subject to the terms of the
5193323Sed * Common Development and Distribution License (the "License").
6193323Sed * You may not use this file except in compliance with the License.
7193323Sed *
8193323Sed * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9193323Sed * or http://www.opensolaris.org/os/licensing.
10193323Sed * See the License for the specific language governing permissions
11193323Sed * and limitations under the License.
12193323Sed *
13193323Sed * When distributing Covered Code, include this CDDL HEADER in each
14202375Srdivacky * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15202375Srdivacky * If applicable, add the following below this CDDL HEADER, with the
16193323Sed * fields enclosed by brackets "[]" replaced with your own identifying
17193323Sed * information: Portions Copyright [yyyy] [name of copyright owner]
18193323Sed *
19202375Srdivacky * CDDL HEADER END
20202375Srdivacky *
21202375Srdivacky * Portions Copyright 2006-2008 John Birrell jb@freebsd.org
22202375Srdivacky *
23202375Srdivacky * $FreeBSD: head/sys/cddl/dev/profile/profile.c 233409 2012-03-24 05:14:37Z gonzo $
24202375Srdivacky *
25202375Srdivacky */
26202375Srdivacky
27202375Srdivacky/*
28202375Srdivacky * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
29202375Srdivacky * Use is subject to license terms.
30202375Srdivacky */
31202375Srdivacky
32193323Sed#include <sys/cdefs.h>
33193323Sed#include <sys/param.h>
34193323Sed#include <sys/systm.h>
35193323Sed#include <sys/conf.h>
36202375Srdivacky#include <sys/cpuvar.h>
37202375Srdivacky#include <sys/fcntl.h>
38202375Srdivacky#include <sys/filio.h>
39202375Srdivacky#include <sys/kdb.h>
40193323Sed#include <sys/kernel.h>
41202375Srdivacky#include <sys/kmem.h>
42193323Sed#include <sys/kthread.h>
43202375Srdivacky#include <sys/limits.h>
44202375Srdivacky#include <sys/linker.h>
45193323Sed#include <sys/lock.h>
46202878Srdivacky#include <sys/malloc.h>
47193323Sed#include <sys/module.h>
48193323Sed#include <sys/mutex.h>
49193323Sed#include <sys/poll.h>
50193323Sed#include <sys/proc.h>
51202375Srdivacky#include <sys/selinfo.h>
52202375Srdivacky#include <sys/smp.h>
53202375Srdivacky#include <sys/uio.h>
54226633Sdim#include <sys/unistd.h>
55226633Sdim#include <machine/stdarg.h>
56226633Sdim
57226633Sdim#include <sys/cyclic.h>
58193323Sed#include <sys/dtrace.h>
59193323Sed#include <sys/dtrace_bsd.h>
60
/* Size, in bytes, of the name buffer embedded in each profile_probe_t. */
#define	PROF_NAMELEN		15

/* "profile-" probes: kind value and probe-name prefix. */
#define	PROF_PROFILE		0
#define	PROF_PREFIX_PROFILE	"profile-"

/* "tick-" probes: kind value and probe-name prefix. */
#define	PROF_TICK		1
#define	PROF_PREFIX_TICK	"tick-"
67
/*
 * Regardless of platform, there are five artificial frames in the case of the
 * profile provider:
 *
 *	profile_fire
 *	cyclic_expire
 *	cyclic_fire
 *	[ cbe ]
 *	[ locore ]
 *
 * On amd64, there are two frames associated with locore:  one in locore, and
 * another in common interrupt dispatch code.  (i386 has not been modified to
 * use this common layer.)  Further, on i386, the interrupted instruction
 * appears as its own stack frame.  All of this means that we need to add one
 * frame for amd64, and then take one away for both amd64 and i386.
 *
 * On SPARC, the picture is further complicated because the compiler
 * optimizes away tail-calls -- so the following frames are optimized away:
 *
 * 	profile_fire
 *	cyclic_expire
 *
 * This gives three frames.  However, on DEBUG kernels, the cyclic_expire
 * frame cannot be tail-call eliminated, yielding four frames in this case.
 *
 * All of the above constraints lead to the mess below.  Yes, the profile
 * provider should ideally figure this out on-the-fly by hitting one of its own
 * probes and then walking its own stack trace.  This is complicated, however,
 * and the static definition doesn't seem to be overly brittle.  Still, we
 * allow for a manual override in case we get it completely wrong.
 */
/*
 * Number of artificial stack frames to skip, selected per architecture.
 */
#if defined(__amd64)
#define	PROF_ARTIFICIAL_FRAMES	7
#elif defined(__i386)
#define	PROF_ARTIFICIAL_FRAMES	6
#elif defined(__sparc)
#if defined(DEBUG)
#define	PROF_ARTIFICIAL_FRAMES	4
#else
#define	PROF_ARTIFICIAL_FRAMES	3
#endif
#endif

#if defined(__mips)
/*
 * This value is bogus just to make module compilable on mips
 */
#define	PROF_ARTIFICIAL_FRAMES	3
#endif
121
122typedef struct profile_probe {
123	char		prof_name[PROF_NAMELEN];
124	dtrace_id_t	prof_id;
125	int		prof_kind;
126	hrtime_t	prof_interval;
127	cyclic_id_t	prof_cyclic;
128} profile_probe_t;
129
130typedef struct profile_probe_percpu {
131	hrtime_t	profc_expected;
132	hrtime_t	profc_interval;
133	profile_probe_t	*profc_probe;
134} profile_probe_percpu_t;
135
136static d_open_t	profile_open;
137static int	profile_unload(void);
138static void	profile_create(hrtime_t, char *, int);
139static void	profile_destroy(void *, dtrace_id_t, void *);
140static void	profile_enable(void *, dtrace_id_t, void *);
141static void	profile_disable(void *, dtrace_id_t, void *);
142static void	profile_load(void *);
143static void	profile_provide(void *, dtrace_probedesc_t *);
144
/*
 * Rates (firings per second) of the profile-<rate> probes that
 * profile_provide() creates by default.  The table has 20 slots; slots left
 * at zero are skipped.
 */
static int profile_rates[20] = {
	97, 199, 499, 997, 1999, 4001, 4999
	/* remaining slots are implicitly zero */
};
151
/*
 * Rates (firings per second) of the tick-<rate> probes that
 * profile_provide() creates by default.  The table has 15 slots; slots left
 * at zero are skipped.
 */
static int profile_ticks[15] = {
	1, 10, 100, 500, 1000, 5000
	/* remaining slots are implicitly zero */
};
157
/*
 * profile_max defines the upper bound on the number of profile probes that
 * can exist (this is to prevent malicious or clumsy users from exhausting
 * system resources by creating a slew of profile probes). At mod load time,
 * this gets its value from PROFILE_MAX_DEFAULT or profile-max-probes if it's
 * present in the profile.conf file.
 */
/* Default value of profile_max. */
#define	PROFILE_MAX_DEFAULT	1000

/* Maximum number of profile probes that may exist at once. */
static uint32_t profile_max = PROFILE_MAX_DEFAULT;

/* Current number of profile probes. */
static uint32_t profile_total;
169
170static struct cdevsw profile_cdevsw = {
171	.d_version	= D_VERSION,
172	.d_open		= profile_open,
173	.d_name		= "profile",
174};
175
176static dtrace_pattr_t profile_attr = {
177{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
178{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
179{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
180{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
181{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
182};
183
184static dtrace_pops_t profile_pops = {
185	profile_provide,
186	NULL,
187	profile_enable,
188	profile_disable,
189	NULL,
190	NULL,
191	NULL,
192	NULL,
193	NULL,
194	profile_destroy
195};
196
197static struct cdev		*profile_cdev;
198static dtrace_provider_id_t	profile_id;
199static hrtime_t			profile_interval_min = NANOSEC / 5000;	/* 5000 hz */
200static int			profile_aframes = 0;			/* override */
201
202static void
203profile_fire(void *arg)
204{
205	profile_probe_percpu_t *pcpu = arg;
206	profile_probe_t *prof = pcpu->profc_probe;
207	hrtime_t late;
208	solaris_cpu_t *c = &solaris_cpu[curcpu];
209
210	late = gethrtime() - pcpu->profc_expected;
211	pcpu->profc_expected += pcpu->profc_interval;
212
213	dtrace_probe(prof->prof_id, c->cpu_profile_pc,
214	    c->cpu_profile_upc, late, 0, 0);
215}
216
217static void
218profile_tick(void *arg)
219{
220	profile_probe_t *prof = arg;
221	solaris_cpu_t *c = &solaris_cpu[curcpu];
222
223	dtrace_probe(prof->prof_id, c->cpu_profile_pc,
224	    c->cpu_profile_upc, 0, 0, 0);
225}
226
227static void
228profile_create(hrtime_t interval, char *name, int kind)
229{
230	profile_probe_t *prof;
231
232	if (interval < profile_interval_min)
233		return;
234
235	if (dtrace_probe_lookup(profile_id, NULL, NULL, name) != 0)
236		return;
237
238	atomic_add_32(&profile_total, 1);
239	if (profile_total > profile_max) {
240		atomic_add_32(&profile_total, -1);
241		return;
242	}
243
244	prof = kmem_zalloc(sizeof (profile_probe_t), KM_SLEEP);
245	(void) strcpy(prof->prof_name, name);
246	prof->prof_interval = interval;
247	prof->prof_cyclic = CYCLIC_NONE;
248	prof->prof_kind = kind;
249	prof->prof_id = dtrace_probe_create(profile_id,
250	    NULL, NULL, name,
251	    profile_aframes ? profile_aframes : PROF_ARTIFICIAL_FRAMES, prof);
252}
253
254/*ARGSUSED*/
255static void
256profile_provide(void *arg, dtrace_probedesc_t *desc)
257{
258	int i, j, rate, kind;
259	hrtime_t val = 0, mult = 1, len = 0;
260	char *name, *suffix = NULL;
261
262	const struct {
263		char *prefix;
264		int kind;
265	} types[] = {
266		{ PROF_PREFIX_PROFILE, PROF_PROFILE },
267		{ PROF_PREFIX_TICK, PROF_TICK },
268		{ 0, 0 }
269	};
270
271	const struct {
272		char *name;
273		hrtime_t mult;
274	} suffixes[] = {
275		{ "ns", 	NANOSEC / NANOSEC },
276		{ "nsec",	NANOSEC / NANOSEC },
277		{ "us",		NANOSEC / MICROSEC },
278		{ "usec",	NANOSEC / MICROSEC },
279		{ "ms",		NANOSEC / MILLISEC },
280		{ "msec",	NANOSEC / MILLISEC },
281		{ "s",		NANOSEC / SEC },
282		{ "sec",	NANOSEC / SEC },
283		{ "m",		NANOSEC * (hrtime_t)60 },
284		{ "min",	NANOSEC * (hrtime_t)60 },
285		{ "h",		NANOSEC * (hrtime_t)(60 * 60) },
286		{ "hour",	NANOSEC * (hrtime_t)(60 * 60) },
287		{ "d",		NANOSEC * (hrtime_t)(24 * 60 * 60) },
288		{ "day",	NANOSEC * (hrtime_t)(24 * 60 * 60) },
289		{ "hz",		0 },
290		{ NULL }
291	};
292
293	if (desc == NULL) {
294		char n[PROF_NAMELEN];
295
296		/*
297		 * If no description was provided, provide all of our probes.
298		 */
299		for (i = 0; i < sizeof (profile_rates) / sizeof (int); i++) {
300			if ((rate = profile_rates[i]) == 0)
301				continue;
302
303			(void) snprintf(n, PROF_NAMELEN, "%s%d",
304			    PROF_PREFIX_PROFILE, rate);
305			profile_create(NANOSEC / rate, n, PROF_PROFILE);
306		}
307
308		for (i = 0; i < sizeof (profile_ticks) / sizeof (int); i++) {
309			if ((rate = profile_ticks[i]) == 0)
310				continue;
311
312			(void) snprintf(n, PROF_NAMELEN, "%s%d",
313			    PROF_PREFIX_TICK, rate);
314			profile_create(NANOSEC / rate, n, PROF_TICK);
315		}
316
317		return;
318	}
319
320	name = desc->dtpd_name;
321
322	for (i = 0; types[i].prefix != NULL; i++) {
323		len = strlen(types[i].prefix);
324
325		if (strncmp(name, types[i].prefix, len) != 0)
326			continue;
327		break;
328	}
329
330	if (types[i].prefix == NULL)
331		return;
332
333	kind = types[i].kind;
334	j = strlen(name) - len;
335
336	/*
337	 * We need to start before any time suffix.
338	 */
339	for (j = strlen(name); j >= len; j--) {
340		if (name[j] >= '0' && name[j] <= '9')
341			break;
342		suffix = &name[j];
343	}
344
345	ASSERT(suffix != NULL);
346
347	/*
348	 * Now determine the numerical value present in the probe name.
349	 */
350	for (; j >= len; j--) {
351		if (name[j] < '0' || name[j] > '9')
352			return;
353
354		val += (name[j] - '0') * mult;
355		mult *= (hrtime_t)10;
356	}
357
358	if (val == 0)
359		return;
360
361	/*
362	 * Look-up the suffix to determine the multiplier.
363	 */
364	for (i = 0, mult = 0; suffixes[i].name != NULL; i++) {
365		if (strcasecmp(suffixes[i].name, suffix) == 0) {
366			mult = suffixes[i].mult;
367			break;
368		}
369	}
370
371	if (suffixes[i].name == NULL && *suffix != '\0')
372		return;
373
374	if (mult == 0) {
375		/*
376		 * The default is frequency-per-second.
377		 */
378		val = NANOSEC / val;
379	} else {
380		val *= mult;
381	}
382
383	profile_create(val, name, kind);
384}
385
386/* ARGSUSED */
387static void
388profile_destroy(void *arg, dtrace_id_t id, void *parg)
389{
390	profile_probe_t *prof = parg;
391
392	ASSERT(prof->prof_cyclic == CYCLIC_NONE);
393	kmem_free(prof, sizeof (profile_probe_t));
394
395	ASSERT(profile_total >= 1);
396	atomic_add_32(&profile_total, -1);
397}
398
399/*ARGSUSED*/
400static void
401profile_online(void *arg, cpu_t *cpu, cyc_handler_t *hdlr, cyc_time_t *when)
402{
403	profile_probe_t *prof = arg;
404	profile_probe_percpu_t *pcpu;
405
406	pcpu = kmem_zalloc(sizeof (profile_probe_percpu_t), KM_SLEEP);
407	pcpu->profc_probe = prof;
408
409	hdlr->cyh_func = profile_fire;
410	hdlr->cyh_arg = pcpu;
411
412	when->cyt_interval = prof->prof_interval;
413	when->cyt_when = gethrtime() + when->cyt_interval;
414
415	pcpu->profc_expected = when->cyt_when;
416	pcpu->profc_interval = when->cyt_interval;
417}
418
419/*ARGSUSED*/
420static void
421profile_offline(void *arg, cpu_t *cpu, void *oarg)
422{
423	profile_probe_percpu_t *pcpu = oarg;
424
425	ASSERT(pcpu->profc_probe == arg);
426	kmem_free(pcpu, sizeof (profile_probe_percpu_t));
427}
428
429/* ARGSUSED */
430static void
431profile_enable(void *arg, dtrace_id_t id, void *parg)
432{
433	profile_probe_t *prof = parg;
434	cyc_omni_handler_t omni;
435	cyc_handler_t hdlr;
436	cyc_time_t when;
437
438	ASSERT(prof->prof_interval != 0);
439	ASSERT(MUTEX_HELD(&cpu_lock));
440
441	if (prof->prof_kind == PROF_TICK) {
442		hdlr.cyh_func = profile_tick;
443		hdlr.cyh_arg = prof;
444
445		when.cyt_interval = prof->prof_interval;
446		when.cyt_when = gethrtime() + when.cyt_interval;
447	} else {
448		ASSERT(prof->prof_kind == PROF_PROFILE);
449		omni.cyo_online = profile_online;
450		omni.cyo_offline = profile_offline;
451		omni.cyo_arg = prof;
452	}
453
454	if (prof->prof_kind == PROF_TICK) {
455		prof->prof_cyclic = cyclic_add(&hdlr, &when);
456	} else {
457		prof->prof_cyclic = cyclic_add_omni(&omni);
458	}
459}
460
461/* ARGSUSED */
462static void
463profile_disable(void *arg, dtrace_id_t id, void *parg)
464{
465	profile_probe_t *prof = parg;
466
467	ASSERT(prof->prof_cyclic != CYCLIC_NONE);
468	ASSERT(MUTEX_HELD(&cpu_lock));
469
470	cyclic_remove(prof->prof_cyclic);
471	prof->prof_cyclic = CYCLIC_NONE;
472}
473
474static void
475profile_load(void *dummy)
476{
477	/* Create the /dev/dtrace/profile entry. */
478	profile_cdev = make_dev(&profile_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600,
479	    "dtrace/profile");
480
481	if (dtrace_register("profile", &profile_attr, DTRACE_PRIV_USER,
482	    NULL, &profile_pops, NULL, &profile_id) != 0)
483		return;
484}
485
486
487static int
488profile_unload()
489{
490	int error = 0;
491
492	if ((error = dtrace_unregister(profile_id)) != 0)
493		return (error);
494
495	destroy_dev(profile_cdev);
496
497	return (error);
498}
499
500/* ARGSUSED */
501static int
502profile_modevent(module_t mod __unused, int type, void *data __unused)
503{
504	int error = 0;
505
506	switch (type) {
507	case MOD_LOAD:
508		break;
509
510	case MOD_UNLOAD:
511		break;
512
513	case MOD_SHUTDOWN:
514		break;
515
516	default:
517		error = EOPNOTSUPP;
518		break;
519
520	}
521	return (error);
522}
523
524/* ARGSUSED */
525static int
526profile_open(struct cdev *dev __unused, int oflags __unused, int devtype __unused, struct thread *td __unused)
527{
528	return (0);
529}
530
531SYSINIT(profile_load, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY, profile_load, NULL);
532SYSUNINIT(profile_unload, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY, profile_unload, NULL);
533
534DEV_MODULE(profile, profile_modevent, NULL);
535MODULE_VERSION(profile, 1);
536MODULE_DEPEND(profile, dtrace, 1, 1, 1);
537MODULE_DEPEND(profile, cyclic, 1, 1, 1);
538MODULE_DEPEND(profile, opensolaris, 1, 1, 1);
539