profile.c revision 291855
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 *
21 * Portions Copyright 2006-2008 John Birrell jb@freebsd.org
22 *
23 * $FreeBSD: head/sys/cddl/dev/profile/profile.c 291855 2015-12-05 10:00:01Z andrew $
24 *
25 */
26
27/*
28 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
29 * Use is subject to license terms.
30 */
31
32#include <sys/cdefs.h>
33#include <sys/param.h>
34#include <sys/systm.h>
35#include <sys/conf.h>
36#include <sys/cpuvar.h>
37#include <sys/fcntl.h>
38#include <sys/filio.h>
39#include <sys/kdb.h>
40#include <sys/kernel.h>
41#include <sys/kmem.h>
42#include <sys/kthread.h>
43#include <sys/limits.h>
44#include <sys/linker.h>
45#include <sys/lock.h>
46#include <sys/malloc.h>
47#include <sys/module.h>
48#include <sys/mutex.h>
49#include <sys/poll.h>
50#include <sys/proc.h>
51#include <sys/selinfo.h>
52#include <sys/smp.h>
53#include <sys/sysctl.h>
54#include <sys/uio.h>
55#include <sys/unistd.h>
56#include <machine/cpu.h>
57#include <machine/stdarg.h>
58
59#include <sys/dtrace.h>
60#include <sys/dtrace_bsd.h>
61
62#define	PROF_NAMELEN		15
63
64#define	PROF_PROFILE		0
65#define	PROF_TICK		1
66#define	PROF_PREFIX_PROFILE	"profile-"
67#define	PROF_PREFIX_TICK	"tick-"
68
69/*
70 * Regardless of platform, there are five artificial frames in the case of the
71 * profile provider:
72 *
73 *	profile_fire
74 *	cyclic_expire
75 *	cyclic_fire
76 *	[ cbe ]
77 *	[ locore ]
78 *
79 * On amd64, there are two frames associated with locore:  one in locore, and
80 * another in common interrupt dispatch code.  (i386 has not been modified to
81 * use this common layer.)  Further, on i386, the interrupted instruction
82 * appears as its own stack frame.  All of this means that we need to add one
83 * frame for amd64, and then take one away for both amd64 and i386.
84 *
85 * On SPARC, the picture is further complicated because the compiler
86 * optimizes away tail-calls -- so the following frames are optimized away:
87 *
88 * 	profile_fire
89 *	cyclic_expire
90 *
91 * This gives three frames.  However, on DEBUG kernels, the cyclic_expire
92 * frame cannot be tail-call eliminated, yielding four frames in this case.
93 *
 * All of the above constraints lead to the mess below.  Yes, the profile
 * provider should ideally figure this out on-the-fly by hitting one of its own
 * probes and then walking its own stack trace.  This is complicated, however,
 * and the static definition doesn't seem to be overly brittle.  Still, we
 * allow for a manual override in case we get it completely wrong.
99 */
/*
 * Per-architecture count of artificial stack frames to skip.  The mips and
 * powerpc values are bogus and exist only to make the module compilable on
 * those platforms; the aarch64 value still needs to be verified.
 */
#if defined(__amd64)
#define	PROF_ARTIFICIAL_FRAMES	10
#elif defined(__i386)
#define	PROF_ARTIFICIAL_FRAMES	6
#elif defined(__sparc)
#ifdef DEBUG
#define	PROF_ARTIFICIAL_FRAMES	4
#else
#define	PROF_ARTIFICIAL_FRAMES	3
#endif
#elif defined(__mips)
#define	PROF_ARTIFICIAL_FRAMES	3
#elif defined(__powerpc__)
#define	PROF_ARTIFICIAL_FRAMES	3
#elif defined(__arm__)
#define	PROF_ARTIFICIAL_FRAMES	3
#elif defined(__aarch64__)
#define	PROF_ARTIFICIAL_FRAMES	10
#endif

/* Forward declaration; profile_probe_t holds pointers to this type. */
struct profile_probe_percpu;
145
/*
 * Per-probe state.  Allocated in profile_create() and released in
 * profile_destroy().
 */
typedef struct profile_probe {
	char		prof_name[PROF_NAMELEN];	/* copy of the probe name */
	dtrace_id_t	prof_id;	/* id from dtrace_probe_create() */
	int		prof_kind;	/* PROF_PROFILE or PROF_TICK */
#ifdef illumos
	hrtime_t	prof_interval;	/* firing interval in nanoseconds */
	cyclic_id_t	prof_cyclic;	/* CYCLIC_NONE while disabled */
#else
	sbintime_t	prof_interval;	/* firing interval, via nsec_to_sbt() */
	struct callout	prof_cyclic;	/* callout that drives a tick probe */
	sbintime_t	prof_expected;	/* next absolute expiry of a tick probe */
	/* Per-CPU state of a profile probe, indexed by CPU id; NULL if disabled. */
	struct profile_probe_percpu **prof_pcpus;
#endif
} profile_probe_t;
160
/*
 * Per-CPU state of an enabled profile-* probe; this is the argument handed
 * to profile_fire().  On FreeBSD the two time fields hold sbintime values
 * (they are filled from sbinuptime() and prof_interval) despite being
 * declared as hrtime_t.
 */
typedef struct profile_probe_percpu {
	hrtime_t	profc_expected;	/* next absolute expiry time */
	hrtime_t	profc_interval;	/* firing interval */
	profile_probe_t	*profc_probe;	/* owning probe */
#ifdef __FreeBSD__
	struct callout	profc_cyclic;	/* callout bound to this CPU */
#endif
} profile_probe_percpu_t;
169
/*
 * Forward declarations for the device switch entry, the module load/unload
 * hooks and the provider operations referenced by the tables below.
 */
static d_open_t	profile_open;
static int	profile_unload(void);
static void	profile_create(hrtime_t, char *, int);
static void	profile_destroy(void *, dtrace_id_t, void *);
static void	profile_enable(void *, dtrace_id_t, void *);
static void	profile_disable(void *, dtrace_id_t, void *);
static void	profile_load(void *);
static void	profile_provide(void *, dtrace_probedesc_t *);
178
/*
 * Rates for the profile-<rate> probes that profile_provide() creates when
 * it is called without a probe description.  Each rate is turned into an
 * interval of NANOSEC / rate; zero entries are skipped.
 */
static int profile_rates[] = {
    97, 199, 499, 997, 1999,
    4001, 4999, 0, 0, 0,
    0, 0, 0, 0, 0,
    0, 0, 0, 0, 0
};

/*
 * Rates for the tick-<rate> probes created by default, handled the same way
 * as profile_rates; zero entries are skipped.
 */
static int profile_ticks[] = {
    1, 10, 100, 500, 1000,
    5000, 0, 0, 0, 0,
    0, 0, 0, 0, 0
};
191
/*
 * profile_max defines the upper bound on the number of profile probes that
 * can exist (this is to prevent malicious or clumsy users from exhausting
 * system resources by creating a slew of profile probes). At mod load time,
 * this gets its value from PROFILE_MAX_DEFAULT or profile-max-probes if it's
 * present in the profile.conf file.
 *
 * NOTE(review): nothing visible in this file reads profile.conf; here
 * profile_max simply keeps its static initializer -- confirm.
 */
#define	PROFILE_MAX_DEFAULT	1000	/* default max. number of probes */
static uint32_t profile_max = PROFILE_MAX_DEFAULT;
					/* maximum number of profile probes */
static uint32_t profile_total;		/* current number of profile probes */
203
/*
 * Character device switch for the node created in profile_load(); the only
 * entry point supplied is profile_open().
 */
static struct cdevsw profile_cdevsw = {
	.d_version	= D_VERSION,
	.d_open		= profile_open,
	.d_name		= "profile",
};
209
/*
 * Stability attributes handed to dtrace_register().
 * NOTE(review): the five rows presumably describe provider, module,
 * function, name and arguments in that order -- confirm against the
 * dtrace_pattr_t definition in <sys/dtrace.h>.
 */
static dtrace_pattr_t profile_attr = {
{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
};
217
/*
 * Provider operations vector handed to dtrace_register().  The initializer
 * is positional, so the order must match the member order of dtrace_pops_t.
 * Only the provide, enable, disable and destroy callbacks are supplied; the
 * remaining slots are left NULL.
 */
static dtrace_pops_t profile_pops = {
	profile_provide,
	NULL,
	profile_enable,
	profile_disable,
	NULL,
	NULL,
	NULL,
	NULL,
	NULL,
	profile_destroy
};
230
/* Device node created in profile_load() and destroyed in profile_unload(). */
static struct cdev		*profile_cdev;
/* Provider id filled in by dtrace_register(). */
static dtrace_provider_id_t	profile_id;
/* profile_create() refuses intervals shorter than this. */
static hrtime_t			profile_interval_min = NANOSEC / 5000;	/* 5000 hz */
/* Artificial frame count passed to dtrace_probe_create(); tunable below. */
static int			profile_aframes = PROF_ARTIFICIAL_FRAMES;

/* Exposes profile_aframes as the read/write sysctl kern.dtrace.profile.aframes. */
SYSCTL_DECL(_kern_dtrace);
SYSCTL_NODE(_kern_dtrace, OID_AUTO, profile, CTLFLAG_RD, 0, "DTrace profile parameters");
SYSCTL_INT(_kern_dtrace_profile, OID_AUTO, aframes, CTLFLAG_RW, &profile_aframes,
    0, "Skipped frames for profile provider");
240
241static sbintime_t
242nsec_to_sbt(hrtime_t nsec)
243{
244	time_t sec;
245
246	/*
247	 * We need to calculate nsec * 2^32 / 10^9
248	 * Seconds and nanoseconds are split to avoid overflow.
249	 */
250	sec = nsec / NANOSEC;
251	nsec = nsec % NANOSEC;
252	return (((sbintime_t)sec << 32) | ((sbintime_t)nsec << 32) / NANOSEC);
253}
254
255static hrtime_t
256sbt_to_nsec(sbintime_t sbt)
257{
258
259	return ((sbt >> 32) * NANOSEC +
260	    (((uint32_t)sbt * (hrtime_t)NANOSEC) >> 32));
261}
262
263static void
264profile_fire(void *arg)
265{
266	profile_probe_percpu_t *pcpu = arg;
267	profile_probe_t *prof = pcpu->profc_probe;
268	hrtime_t late;
269	struct trapframe *frame;
270	uintfptr_t pc, upc;
271
272#ifdef illumos
273	late = gethrtime() - pcpu->profc_expected;
274#else
275	late = sbt_to_nsec(sbinuptime() - pcpu->profc_expected);
276#endif
277
278	pc = 0;
279	upc = 0;
280
281	/*
282	 * td_intr_frame can be unset if this is a catch up event
283	 * after waking up from idle sleep.
284	 * This can only happen on a CPU idle thread.
285	 */
286	frame = curthread->td_intr_frame;
287	if (frame != NULL) {
288		if (TRAPF_USERMODE(frame))
289			upc = TRAPF_PC(frame);
290		else
291			pc = TRAPF_PC(frame);
292	}
293	dtrace_probe(prof->prof_id, pc, upc, late, 0, 0);
294
295	pcpu->profc_expected += pcpu->profc_interval;
296	callout_schedule_sbt_curcpu(&pcpu->profc_cyclic,
297	    pcpu->profc_expected, 0, C_DIRECT_EXEC | C_ABSOLUTE);
298}
299
300static void
301profile_tick(void *arg)
302{
303	profile_probe_t *prof = arg;
304	struct trapframe *frame;
305	uintfptr_t pc, upc;
306
307	pc = 0;
308	upc = 0;
309
310	/*
311	 * td_intr_frame can be unset if this is a catch up event
312	 * after waking up from idle sleep.
313	 * This can only happen on a CPU idle thread.
314	 */
315	frame = curthread->td_intr_frame;
316	if (frame != NULL) {
317		if (TRAPF_USERMODE(frame))
318			upc = TRAPF_PC(frame);
319		else
320			pc = TRAPF_PC(frame);
321	}
322	dtrace_probe(prof->prof_id, pc, upc, 0, 0, 0);
323
324	prof->prof_expected += prof->prof_interval;
325	callout_schedule_sbt(&prof->prof_cyclic,
326	    prof->prof_expected, 0, C_DIRECT_EXEC | C_ABSOLUTE);
327}
328
329static void
330profile_create(hrtime_t interval, char *name, int kind)
331{
332	profile_probe_t *prof;
333
334	if (interval < profile_interval_min)
335		return;
336
337	if (dtrace_probe_lookup(profile_id, NULL, NULL, name) != 0)
338		return;
339
340	atomic_add_32(&profile_total, 1);
341	if (profile_total > profile_max) {
342		atomic_add_32(&profile_total, -1);
343		return;
344	}
345
346	prof = kmem_zalloc(sizeof (profile_probe_t), KM_SLEEP);
347	(void) strcpy(prof->prof_name, name);
348#ifdef illumos
349	prof->prof_interval = interval;
350	prof->prof_cyclic = CYCLIC_NONE;
351#else
352	prof->prof_interval = nsec_to_sbt(interval);
353	callout_init(&prof->prof_cyclic, 1);
354#endif
355	prof->prof_kind = kind;
356	prof->prof_id = dtrace_probe_create(profile_id,
357	    NULL, NULL, name,
358	    profile_aframes, prof);
359}
360
361/*ARGSUSED*/
362static void
363profile_provide(void *arg, dtrace_probedesc_t *desc)
364{
365	int i, j, rate, kind;
366	hrtime_t val = 0, mult = 1, len = 0;
367	char *name, *suffix = NULL;
368
369	const struct {
370		char *prefix;
371		int kind;
372	} types[] = {
373		{ PROF_PREFIX_PROFILE, PROF_PROFILE },
374		{ PROF_PREFIX_TICK, PROF_TICK },
375		{ 0, 0 }
376	};
377
378	const struct {
379		char *name;
380		hrtime_t mult;
381	} suffixes[] = {
382		{ "ns", 	NANOSEC / NANOSEC },
383		{ "nsec",	NANOSEC / NANOSEC },
384		{ "us",		NANOSEC / MICROSEC },
385		{ "usec",	NANOSEC / MICROSEC },
386		{ "ms",		NANOSEC / MILLISEC },
387		{ "msec",	NANOSEC / MILLISEC },
388		{ "s",		NANOSEC / SEC },
389		{ "sec",	NANOSEC / SEC },
390		{ "m",		NANOSEC * (hrtime_t)60 },
391		{ "min",	NANOSEC * (hrtime_t)60 },
392		{ "h",		NANOSEC * (hrtime_t)(60 * 60) },
393		{ "hour",	NANOSEC * (hrtime_t)(60 * 60) },
394		{ "d",		NANOSEC * (hrtime_t)(24 * 60 * 60) },
395		{ "day",	NANOSEC * (hrtime_t)(24 * 60 * 60) },
396		{ "hz",		0 },
397		{ NULL }
398	};
399
400	if (desc == NULL) {
401		char n[PROF_NAMELEN];
402
403		/*
404		 * If no description was provided, provide all of our probes.
405		 */
406		for (i = 0; i < sizeof (profile_rates) / sizeof (int); i++) {
407			if ((rate = profile_rates[i]) == 0)
408				continue;
409
410			(void) snprintf(n, PROF_NAMELEN, "%s%d",
411			    PROF_PREFIX_PROFILE, rate);
412			profile_create(NANOSEC / rate, n, PROF_PROFILE);
413		}
414
415		for (i = 0; i < sizeof (profile_ticks) / sizeof (int); i++) {
416			if ((rate = profile_ticks[i]) == 0)
417				continue;
418
419			(void) snprintf(n, PROF_NAMELEN, "%s%d",
420			    PROF_PREFIX_TICK, rate);
421			profile_create(NANOSEC / rate, n, PROF_TICK);
422		}
423
424		return;
425	}
426
427	name = desc->dtpd_name;
428
429	for (i = 0; types[i].prefix != NULL; i++) {
430		len = strlen(types[i].prefix);
431
432		if (strncmp(name, types[i].prefix, len) != 0)
433			continue;
434		break;
435	}
436
437	if (types[i].prefix == NULL)
438		return;
439
440	kind = types[i].kind;
441	j = strlen(name) - len;
442
443	/*
444	 * We need to start before any time suffix.
445	 */
446	for (j = strlen(name); j >= len; j--) {
447		if (name[j] >= '0' && name[j] <= '9')
448			break;
449		suffix = &name[j];
450	}
451
452	ASSERT(suffix != NULL);
453
454	/*
455	 * Now determine the numerical value present in the probe name.
456	 */
457	for (; j >= len; j--) {
458		if (name[j] < '0' || name[j] > '9')
459			return;
460
461		val += (name[j] - '0') * mult;
462		mult *= (hrtime_t)10;
463	}
464
465	if (val == 0)
466		return;
467
468	/*
469	 * Look-up the suffix to determine the multiplier.
470	 */
471	for (i = 0, mult = 0; suffixes[i].name != NULL; i++) {
472		if (strcasecmp(suffixes[i].name, suffix) == 0) {
473			mult = suffixes[i].mult;
474			break;
475		}
476	}
477
478	if (suffixes[i].name == NULL && *suffix != '\0')
479		return;
480
481	if (mult == 0) {
482		/*
483		 * The default is frequency-per-second.
484		 */
485		val = NANOSEC / val;
486	} else {
487		val *= mult;
488	}
489
490	profile_create(val, name, kind);
491}
492
493/* ARGSUSED */
494static void
495profile_destroy(void *arg, dtrace_id_t id, void *parg)
496{
497	profile_probe_t *prof = parg;
498
499#ifdef illumos
500	ASSERT(prof->prof_cyclic == CYCLIC_NONE);
501#else
502	ASSERT(!callout_active(&prof->prof_cyclic) && prof->prof_pcpus == NULL);
503#endif
504	kmem_free(prof, sizeof (profile_probe_t));
505
506	ASSERT(profile_total >= 1);
507	atomic_add_32(&profile_total, -1);
508}
509
510#ifdef illumos
511/*ARGSUSED*/
512static void
513profile_online(void *arg, cpu_t *cpu, cyc_handler_t *hdlr, cyc_time_t *when)
514{
515	profile_probe_t *prof = arg;
516	profile_probe_percpu_t *pcpu;
517
518	pcpu = kmem_zalloc(sizeof (profile_probe_percpu_t), KM_SLEEP);
519	pcpu->profc_probe = prof;
520
521	hdlr->cyh_func = profile_fire;
522	hdlr->cyh_arg = pcpu;
523
524	when->cyt_interval = prof->prof_interval;
525	when->cyt_when = gethrtime() + when->cyt_interval;
526
527	pcpu->profc_expected = when->cyt_when;
528	pcpu->profc_interval = when->cyt_interval;
529}
530
531/*ARGSUSED*/
532static void
533profile_offline(void *arg, cpu_t *cpu, void *oarg)
534{
535	profile_probe_percpu_t *pcpu = oarg;
536
537	ASSERT(pcpu->profc_probe == arg);
538	kmem_free(pcpu, sizeof (profile_probe_percpu_t));
539}
540
541/* ARGSUSED */
542static void
543profile_enable(void *arg, dtrace_id_t id, void *parg)
544{
545	profile_probe_t *prof = parg;
546	cyc_omni_handler_t omni;
547	cyc_handler_t hdlr;
548	cyc_time_t when;
549
550	ASSERT(prof->prof_interval != 0);
551	ASSERT(MUTEX_HELD(&cpu_lock));
552
553	if (prof->prof_kind == PROF_TICK) {
554		hdlr.cyh_func = profile_tick;
555		hdlr.cyh_arg = prof;
556
557		when.cyt_interval = prof->prof_interval;
558		when.cyt_when = gethrtime() + when.cyt_interval;
559	} else {
560		ASSERT(prof->prof_kind == PROF_PROFILE);
561		omni.cyo_online = profile_online;
562		omni.cyo_offline = profile_offline;
563		omni.cyo_arg = prof;
564	}
565
566	if (prof->prof_kind == PROF_TICK) {
567		prof->prof_cyclic = cyclic_add(&hdlr, &when);
568	} else {
569		prof->prof_cyclic = cyclic_add_omni(&omni);
570	}
571}
572
573/* ARGSUSED */
574static void
575profile_disable(void *arg, dtrace_id_t id, void *parg)
576{
577	profile_probe_t *prof = parg;
578
579	ASSERT(prof->prof_cyclic != CYCLIC_NONE);
580	ASSERT(MUTEX_HELD(&cpu_lock));
581
582	cyclic_remove(prof->prof_cyclic);
583	prof->prof_cyclic = CYCLIC_NONE;
584}
585
586#else
587
588static void
589profile_enable_omni(profile_probe_t *prof)
590{
591	profile_probe_percpu_t *pcpu;
592	int cpu;
593
594	prof->prof_pcpus = kmem_zalloc((mp_maxid + 1) * sizeof(pcpu), KM_SLEEP);
595	CPU_FOREACH(cpu) {
596		pcpu = kmem_zalloc(sizeof(profile_probe_percpu_t), KM_SLEEP);
597		prof->prof_pcpus[cpu] = pcpu;
598		pcpu->profc_probe = prof;
599		pcpu->profc_expected = sbinuptime() + prof->prof_interval;
600		pcpu->profc_interval = prof->prof_interval;
601		callout_init(&pcpu->profc_cyclic, 1);
602		callout_reset_sbt_on(&pcpu->profc_cyclic,
603		    pcpu->profc_expected, 0, profile_fire, pcpu,
604		    cpu, C_DIRECT_EXEC | C_ABSOLUTE);
605	}
606}
607
608static void
609profile_disable_omni(profile_probe_t *prof)
610{
611	profile_probe_percpu_t *pcpu;
612	int cpu;
613
614	ASSERT(prof->prof_pcpus != NULL);
615	CPU_FOREACH(cpu) {
616		pcpu = prof->prof_pcpus[cpu];
617		ASSERT(pcpu->profc_probe == prof);
618		ASSERT(callout_active(&pcpu->profc_cyclic));
619		callout_stop(&pcpu->profc_cyclic);
620		callout_drain(&pcpu->profc_cyclic);
621		kmem_free(pcpu, sizeof(profile_probe_percpu_t));
622	}
623	kmem_free(prof->prof_pcpus, (mp_maxid + 1) * sizeof(pcpu));
624	prof->prof_pcpus = NULL;
625}
626
627/* ARGSUSED */
628static void
629profile_enable(void *arg, dtrace_id_t id, void *parg)
630{
631	profile_probe_t *prof = parg;
632
633	if (prof->prof_kind == PROF_TICK) {
634		prof->prof_expected = sbinuptime() + prof->prof_interval;
635		callout_reset_sbt(&prof->prof_cyclic,
636		    prof->prof_expected, 0, profile_tick, prof,
637		    C_DIRECT_EXEC | C_ABSOLUTE);
638	} else {
639		ASSERT(prof->prof_kind == PROF_PROFILE);
640		profile_enable_omni(prof);
641	}
642}
643
644/* ARGSUSED */
645static void
646profile_disable(void *arg, dtrace_id_t id, void *parg)
647{
648	profile_probe_t *prof = parg;
649
650	if (prof->prof_kind == PROF_TICK) {
651		ASSERT(callout_active(&prof->prof_cyclic));
652		callout_stop(&prof->prof_cyclic);
653		callout_drain(&prof->prof_cyclic);
654	} else {
655		ASSERT(prof->prof_kind == PROF_PROFILE);
656		profile_disable_omni(prof);
657	}
658}
659#endif
660
661static void
662profile_load(void *dummy)
663{
664	/* Create the /dev/dtrace/profile entry. */
665	profile_cdev = make_dev(&profile_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600,
666	    "dtrace/profile");
667
668	if (dtrace_register("profile", &profile_attr, DTRACE_PRIV_USER,
669	    NULL, &profile_pops, NULL, &profile_id) != 0)
670		return;
671}
672
673
674static int
675profile_unload()
676{
677	int error = 0;
678
679	if ((error = dtrace_unregister(profile_id)) != 0)
680		return (error);
681
682	destroy_dev(profile_cdev);
683
684	return (error);
685}
686
687/* ARGSUSED */
688static int
689profile_modevent(module_t mod __unused, int type, void *data __unused)
690{
691	int error = 0;
692
693	switch (type) {
694	case MOD_LOAD:
695		break;
696
697	case MOD_UNLOAD:
698		break;
699
700	case MOD_SHUTDOWN:
701		break;
702
703	default:
704		error = EOPNOTSUPP;
705		break;
706
707	}
708	return (error);
709}
710
711/* ARGSUSED */
712static int
713profile_open(struct cdev *dev __unused, int oflags __unused, int devtype __unused, struct thread *td __unused)
714{
715	return (0);
716}
717
/*
 * The provider is registered and unregistered through SYSINIT/SYSUNINIT at
 * SI_SUB_DTRACE_PROVIDER; profile_modevent() itself does no work.
 */
SYSINIT(profile_load, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY, profile_load, NULL);
SYSUNINIT(profile_unload, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY, profile_unload, NULL);

/* Module declaration, version and dependencies. */
DEV_MODULE(profile, profile_modevent, NULL);
MODULE_VERSION(profile, 1);
MODULE_DEPEND(profile, dtrace, 1, 1, 1);
MODULE_DEPEND(profile, opensolaris, 1, 1, 1);
725