profile.c revision 285009
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 *
21 * Portions Copyright 2006-2008 John Birrell jb@freebsd.org
22 *
23 * $FreeBSD: head/sys/cddl/dev/profile/profile.c 285009 2015-07-01 15:51:11Z br $
24 *
25 */
26
27/*
28 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
29 * Use is subject to license terms.
30 */
31
32#include <sys/cdefs.h>
33#include <sys/param.h>
34#include <sys/systm.h>
35#include <sys/conf.h>
36#include <sys/cpuvar.h>
37#include <sys/fcntl.h>
38#include <sys/filio.h>
39#include <sys/kdb.h>
40#include <sys/kernel.h>
41#include <sys/kmem.h>
42#include <sys/kthread.h>
43#include <sys/limits.h>
44#include <sys/linker.h>
45#include <sys/lock.h>
46#include <sys/malloc.h>
47#include <sys/module.h>
48#include <sys/mutex.h>
49#include <sys/poll.h>
50#include <sys/proc.h>
51#include <sys/selinfo.h>
52#include <sys/smp.h>
53#include <sys/uio.h>
54#include <sys/unistd.h>
55#include <machine/cpu.h>
56#include <machine/stdarg.h>
57
58#include <sys/dtrace.h>
59#include <sys/dtrace_bsd.h>
60
/* Size of the buffers that hold a probe name (prof_name, and the scratch
 * buffer used when the default probes are generated). */
#define	PROF_NAMELEN		15

/* Probe kinds, stored in profile_probe_t.prof_kind. */
enum {
	PROF_PROFILE = 0,	/* "profile-" probes: armed on every CPU */
	PROF_TICK = 1		/* "tick-" probes: a single callout */
};

/* Probe-name prefixes that select the kind above. */
#define	PROF_PREFIX_PROFILE	"profile-"
#define	PROF_PREFIX_TICK	"tick-"
67
68/*
69 * Regardless of platform, there are five artificial frames in the case of the
70 * profile provider:
71 *
72 *	profile_fire
73 *	cyclic_expire
74 *	cyclic_fire
75 *	[ cbe ]
76 *	[ locore ]
77 *
78 * On amd64, there are two frames associated with locore:  one in locore, and
79 * another in common interrupt dispatch code.  (i386 has not been modified to
80 * use this common layer.)  Further, on i386, the interrupted instruction
81 * appears as its own stack frame.  All of this means that we need to add one
82 * frame for amd64, and then take one away for both amd64 and i386.
83 *
84 * On SPARC, the picture is further complicated because the compiler
85 * optimizes away tail-calls -- so the following frames are optimized away:
86 *
87 * 	profile_fire
88 *	cyclic_expire
89 *
90 * This gives three frames.  However, on DEBUG kernels, the cyclic_expire
91 * frame cannot be tail-call eliminated, yielding four frames in this case.
92 *
93 * All of the above constraints lead to the mess below.  Yes, the profile
94 * provider should ideally figure this out on-the-fly by hitting one of its own
95 * probes and then walking its own stack trace.  This is complicated, however,
96 * and the static definition doesn't seem to be overly brittle.  Still, we
97 * allow for a manual override in case we get it completely wrong.
98 */
/*
 * Number of artificial stack frames to skip for this architecture.  Exactly
 * one branch is selected; the value can still be overridden at run time
 * through profile_aframes.
 */
#if defined(__amd64)
#define	PROF_ARTIFICIAL_FRAMES	10
#elif defined(__i386)
#define	PROF_ARTIFICIAL_FRAMES	6
#elif defined(__sparc)
#ifdef DEBUG
#define	PROF_ARTIFICIAL_FRAMES	4
#else
#define	PROF_ARTIFICIAL_FRAMES	3
#endif
#elif defined(__mips)
/*
 * This value is bogus just to make module compilable on mips
 */
#define	PROF_ARTIFICIAL_FRAMES	3
#elif defined(__powerpc__)
/*
 * This value is bogus just to make module compilable on powerpc
 */
#define	PROF_ARTIFICIAL_FRAMES	3
#elif defined(__arm__)
/*
 * At least on ARMv7, this appears to work quite well.
 */
#define	PROF_ARTIFICIAL_FRAMES	10
#elif defined(__aarch64__)
/* TODO: verify */
#define	PROF_ARTIFICIAL_FRAMES	10
#endif

/*
 * Fail with a clear message instead of an "undeclared identifier" error
 * further down when the architecture is not covered above.
 */
#ifndef PROF_ARTIFICIAL_FRAMES
#error "PROF_ARTIFICIAL_FRAMES is not defined for this architecture"
#endif

/* Forward declaration; profile_probe_t points at the per-CPU state. */
struct profile_probe_percpu;
147
148typedef struct profile_probe {
149	char		prof_name[PROF_NAMELEN];
150	dtrace_id_t	prof_id;
151	int		prof_kind;
152#ifdef illumos
153	hrtime_t	prof_interval;
154	cyclic_id_t	prof_cyclic;
155#else
156	sbintime_t	prof_interval;
157	struct callout	prof_cyclic;
158	sbintime_t	prof_expected;
159	struct profile_probe_percpu **prof_pcpus;
160#endif
161} profile_probe_t;
162
163typedef struct profile_probe_percpu {
164	hrtime_t	profc_expected;
165	hrtime_t	profc_interval;
166	profile_probe_t	*profc_probe;
167#ifdef __FreeBSD__
168	struct callout	profc_cyclic;
169#endif
170} profile_probe_percpu_t;
171
172static d_open_t	profile_open;
173static int	profile_unload(void);
174static void	profile_create(hrtime_t, char *, int);
175static void	profile_destroy(void *, dtrace_id_t, void *);
176static void	profile_enable(void *, dtrace_id_t, void *);
177static void	profile_disable(void *, dtrace_id_t, void *);
178static void	profile_load(void *);
179static void	profile_provide(void *, dtrace_probedesc_t *);
180
/*
 * Rates (per second) of the profile-N probes that are provided by default.
 * The table has 20 slots; slots left at zero are skipped by
 * profile_provide().
 */
static int profile_rates[20] = {
	97, 199, 499, 997, 1999, 4001, 4999
};

/*
 * Rates (per second) of the tick-N probes that are provided by default.
 * The table has 15 slots; slots left at zero are skipped by
 * profile_provide().
 */
static int profile_ticks[15] = {
	1, 10, 100, 500, 1000, 5000
};
193
194/*
195 * profile_max defines the upper bound on the number of profile probes that
196 * can exist (this is to prevent malicious or clumsy users from exhausting
197 * system resources by creating a slew of profile probes). At mod load time,
198 * this gets its value from PROFILE_MAX_DEFAULT or profile-max-probes if it's
199 * present in the profile.conf file.
200 */
#define	PROFILE_MAX_DEFAULT	1000	/* default max. number of probes */

/* Maximum number of profile probes that may exist at the same time. */
static uint32_t profile_max = PROFILE_MAX_DEFAULT;

/* Current number of profile probes; adjusted with atomic_add_32(). */
static uint32_t profile_total = 0;
205
206static struct cdevsw profile_cdevsw = {
207	.d_version	= D_VERSION,
208	.d_open		= profile_open,
209	.d_name		= "profile",
210};
211
212static dtrace_pattr_t profile_attr = {
213{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
214{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
215{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
216{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
217{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
218};
219
220static dtrace_pops_t profile_pops = {
221	profile_provide,
222	NULL,
223	profile_enable,
224	profile_disable,
225	NULL,
226	NULL,
227	NULL,
228	NULL,
229	NULL,
230	profile_destroy
231};
232
233static struct cdev		*profile_cdev;
234static dtrace_provider_id_t	profile_id;
235static hrtime_t			profile_interval_min = NANOSEC / 5000;	/* 5000 hz */
236static int			profile_aframes = 0;			/* override */
237
238static sbintime_t
239nsec_to_sbt(hrtime_t nsec)
240{
241	time_t sec;
242
243	/*
244	 * We need to calculate nsec * 2^32 / 10^9
245	 * Seconds and nanoseconds are split to avoid overflow.
246	 */
247	sec = nsec / NANOSEC;
248	nsec = nsec % NANOSEC;
249	return (((sbintime_t)sec << 32) | ((sbintime_t)nsec << 32) / NANOSEC);
250}
251
252static hrtime_t
253sbt_to_nsec(sbintime_t sbt)
254{
255
256	return ((sbt >> 32) * NANOSEC +
257	    (((uint32_t)sbt * (hrtime_t)NANOSEC) >> 32));
258}
259
260static void
261profile_fire(void *arg)
262{
263	profile_probe_percpu_t *pcpu = arg;
264	profile_probe_t *prof = pcpu->profc_probe;
265	hrtime_t late;
266	struct trapframe *frame;
267	uintfptr_t pc, upc;
268
269#ifdef illumos
270	late = gethrtime() - pcpu->profc_expected;
271#else
272	late = sbt_to_nsec(sbinuptime() - pcpu->profc_expected);
273#endif
274
275	pc = 0;
276	upc = 0;
277
278	/*
279	 * td_intr_frame can be unset if this is a catch up event
280	 * after waking up from idle sleep.
281	 * This can only happen on a CPU idle thread.
282	 */
283	frame = curthread->td_intr_frame;
284	if (frame != NULL) {
285		if (TRAPF_USERMODE(frame))
286			upc = TRAPF_PC(frame);
287		else
288			pc = TRAPF_PC(frame);
289	}
290	dtrace_probe(prof->prof_id, pc, upc, late, 0, 0);
291
292	pcpu->profc_expected += pcpu->profc_interval;
293	callout_schedule_sbt_curcpu(&pcpu->profc_cyclic,
294	    pcpu->profc_expected, 0, C_DIRECT_EXEC | C_ABSOLUTE);
295}
296
297static void
298profile_tick(void *arg)
299{
300	profile_probe_t *prof = arg;
301	struct trapframe *frame;
302	uintfptr_t pc, upc;
303
304	pc = 0;
305	upc = 0;
306
307	/*
308	 * td_intr_frame can be unset if this is a catch up event
309	 * after waking up from idle sleep.
310	 * This can only happen on a CPU idle thread.
311	 */
312	frame = curthread->td_intr_frame;
313	if (frame != NULL) {
314		if (TRAPF_USERMODE(frame))
315			upc = TRAPF_PC(frame);
316		else
317			pc = TRAPF_PC(frame);
318	}
319	dtrace_probe(prof->prof_id, pc, upc, 0, 0, 0);
320
321	prof->prof_expected += prof->prof_interval;
322	callout_schedule_sbt(&prof->prof_cyclic,
323	    prof->prof_expected, 0, C_DIRECT_EXEC | C_ABSOLUTE);
324}
325
326static void
327profile_create(hrtime_t interval, char *name, int kind)
328{
329	profile_probe_t *prof;
330
331	if (interval < profile_interval_min)
332		return;
333
334	if (dtrace_probe_lookup(profile_id, NULL, NULL, name) != 0)
335		return;
336
337	atomic_add_32(&profile_total, 1);
338	if (profile_total > profile_max) {
339		atomic_add_32(&profile_total, -1);
340		return;
341	}
342
343	prof = kmem_zalloc(sizeof (profile_probe_t), KM_SLEEP);
344	(void) strcpy(prof->prof_name, name);
345#ifdef illumos
346	prof->prof_interval = interval;
347	prof->prof_cyclic = CYCLIC_NONE;
348#else
349	prof->prof_interval = nsec_to_sbt(interval);
350	callout_init(&prof->prof_cyclic, 1);
351#endif
352	prof->prof_kind = kind;
353	prof->prof_id = dtrace_probe_create(profile_id,
354	    NULL, NULL, name,
355	    profile_aframes ? profile_aframes : PROF_ARTIFICIAL_FRAMES, prof);
356}
357
358/*ARGSUSED*/
359static void
360profile_provide(void *arg, dtrace_probedesc_t *desc)
361{
362	int i, j, rate, kind;
363	hrtime_t val = 0, mult = 1, len = 0;
364	char *name, *suffix = NULL;
365
366	const struct {
367		char *prefix;
368		int kind;
369	} types[] = {
370		{ PROF_PREFIX_PROFILE, PROF_PROFILE },
371		{ PROF_PREFIX_TICK, PROF_TICK },
372		{ 0, 0 }
373	};
374
375	const struct {
376		char *name;
377		hrtime_t mult;
378	} suffixes[] = {
379		{ "ns", 	NANOSEC / NANOSEC },
380		{ "nsec",	NANOSEC / NANOSEC },
381		{ "us",		NANOSEC / MICROSEC },
382		{ "usec",	NANOSEC / MICROSEC },
383		{ "ms",		NANOSEC / MILLISEC },
384		{ "msec",	NANOSEC / MILLISEC },
385		{ "s",		NANOSEC / SEC },
386		{ "sec",	NANOSEC / SEC },
387		{ "m",		NANOSEC * (hrtime_t)60 },
388		{ "min",	NANOSEC * (hrtime_t)60 },
389		{ "h",		NANOSEC * (hrtime_t)(60 * 60) },
390		{ "hour",	NANOSEC * (hrtime_t)(60 * 60) },
391		{ "d",		NANOSEC * (hrtime_t)(24 * 60 * 60) },
392		{ "day",	NANOSEC * (hrtime_t)(24 * 60 * 60) },
393		{ "hz",		0 },
394		{ NULL }
395	};
396
397	if (desc == NULL) {
398		char n[PROF_NAMELEN];
399
400		/*
401		 * If no description was provided, provide all of our probes.
402		 */
403		for (i = 0; i < sizeof (profile_rates) / sizeof (int); i++) {
404			if ((rate = profile_rates[i]) == 0)
405				continue;
406
407			(void) snprintf(n, PROF_NAMELEN, "%s%d",
408			    PROF_PREFIX_PROFILE, rate);
409			profile_create(NANOSEC / rate, n, PROF_PROFILE);
410		}
411
412		for (i = 0; i < sizeof (profile_ticks) / sizeof (int); i++) {
413			if ((rate = profile_ticks[i]) == 0)
414				continue;
415
416			(void) snprintf(n, PROF_NAMELEN, "%s%d",
417			    PROF_PREFIX_TICK, rate);
418			profile_create(NANOSEC / rate, n, PROF_TICK);
419		}
420
421		return;
422	}
423
424	name = desc->dtpd_name;
425
426	for (i = 0; types[i].prefix != NULL; i++) {
427		len = strlen(types[i].prefix);
428
429		if (strncmp(name, types[i].prefix, len) != 0)
430			continue;
431		break;
432	}
433
434	if (types[i].prefix == NULL)
435		return;
436
437	kind = types[i].kind;
438	j = strlen(name) - len;
439
440	/*
441	 * We need to start before any time suffix.
442	 */
443	for (j = strlen(name); j >= len; j--) {
444		if (name[j] >= '0' && name[j] <= '9')
445			break;
446		suffix = &name[j];
447	}
448
449	ASSERT(suffix != NULL);
450
451	/*
452	 * Now determine the numerical value present in the probe name.
453	 */
454	for (; j >= len; j--) {
455		if (name[j] < '0' || name[j] > '9')
456			return;
457
458		val += (name[j] - '0') * mult;
459		mult *= (hrtime_t)10;
460	}
461
462	if (val == 0)
463		return;
464
465	/*
466	 * Look-up the suffix to determine the multiplier.
467	 */
468	for (i = 0, mult = 0; suffixes[i].name != NULL; i++) {
469		if (strcasecmp(suffixes[i].name, suffix) == 0) {
470			mult = suffixes[i].mult;
471			break;
472		}
473	}
474
475	if (suffixes[i].name == NULL && *suffix != '\0')
476		return;
477
478	if (mult == 0) {
479		/*
480		 * The default is frequency-per-second.
481		 */
482		val = NANOSEC / val;
483	} else {
484		val *= mult;
485	}
486
487	profile_create(val, name, kind);
488}
489
490/* ARGSUSED */
491static void
492profile_destroy(void *arg, dtrace_id_t id, void *parg)
493{
494	profile_probe_t *prof = parg;
495
496#ifdef illumos
497	ASSERT(prof->prof_cyclic == CYCLIC_NONE);
498#else
499	ASSERT(!callout_active(&prof->prof_cyclic) && prof->prof_pcpus == NULL);
500#endif
501	kmem_free(prof, sizeof (profile_probe_t));
502
503	ASSERT(profile_total >= 1);
504	atomic_add_32(&profile_total, -1);
505}
506
507#ifdef illumos
508/*ARGSUSED*/
509static void
510profile_online(void *arg, cpu_t *cpu, cyc_handler_t *hdlr, cyc_time_t *when)
511{
512	profile_probe_t *prof = arg;
513	profile_probe_percpu_t *pcpu;
514
515	pcpu = kmem_zalloc(sizeof (profile_probe_percpu_t), KM_SLEEP);
516	pcpu->profc_probe = prof;
517
518	hdlr->cyh_func = profile_fire;
519	hdlr->cyh_arg = pcpu;
520
521	when->cyt_interval = prof->prof_interval;
522	when->cyt_when = gethrtime() + when->cyt_interval;
523
524	pcpu->profc_expected = when->cyt_when;
525	pcpu->profc_interval = when->cyt_interval;
526}
527
528/*ARGSUSED*/
529static void
530profile_offline(void *arg, cpu_t *cpu, void *oarg)
531{
532	profile_probe_percpu_t *pcpu = oarg;
533
534	ASSERT(pcpu->profc_probe == arg);
535	kmem_free(pcpu, sizeof (profile_probe_percpu_t));
536}
537
538/* ARGSUSED */
539static void
540profile_enable(void *arg, dtrace_id_t id, void *parg)
541{
542	profile_probe_t *prof = parg;
543	cyc_omni_handler_t omni;
544	cyc_handler_t hdlr;
545	cyc_time_t when;
546
547	ASSERT(prof->prof_interval != 0);
548	ASSERT(MUTEX_HELD(&cpu_lock));
549
550	if (prof->prof_kind == PROF_TICK) {
551		hdlr.cyh_func = profile_tick;
552		hdlr.cyh_arg = prof;
553
554		when.cyt_interval = prof->prof_interval;
555		when.cyt_when = gethrtime() + when.cyt_interval;
556	} else {
557		ASSERT(prof->prof_kind == PROF_PROFILE);
558		omni.cyo_online = profile_online;
559		omni.cyo_offline = profile_offline;
560		omni.cyo_arg = prof;
561	}
562
563	if (prof->prof_kind == PROF_TICK) {
564		prof->prof_cyclic = cyclic_add(&hdlr, &when);
565	} else {
566		prof->prof_cyclic = cyclic_add_omni(&omni);
567	}
568}
569
570/* ARGSUSED */
571static void
572profile_disable(void *arg, dtrace_id_t id, void *parg)
573{
574	profile_probe_t *prof = parg;
575
576	ASSERT(prof->prof_cyclic != CYCLIC_NONE);
577	ASSERT(MUTEX_HELD(&cpu_lock));
578
579	cyclic_remove(prof->prof_cyclic);
580	prof->prof_cyclic = CYCLIC_NONE;
581}
582
583#else
584
585static void
586profile_enable_omni(profile_probe_t *prof)
587{
588	profile_probe_percpu_t *pcpu;
589	int cpu;
590
591	prof->prof_pcpus = kmem_zalloc((mp_maxid + 1) * sizeof(pcpu), KM_SLEEP);
592	CPU_FOREACH(cpu) {
593		pcpu = kmem_zalloc(sizeof(profile_probe_percpu_t), KM_SLEEP);
594		prof->prof_pcpus[cpu] = pcpu;
595		pcpu->profc_probe = prof;
596		pcpu->profc_expected = sbinuptime() + prof->prof_interval;
597		pcpu->profc_interval = prof->prof_interval;
598		callout_init(&pcpu->profc_cyclic, 1);
599		callout_reset_sbt_on(&pcpu->profc_cyclic,
600		    pcpu->profc_expected, 0, profile_fire, pcpu,
601		    cpu, C_DIRECT_EXEC | C_ABSOLUTE);
602	}
603}
604
605static void
606profile_disable_omni(profile_probe_t *prof)
607{
608	profile_probe_percpu_t *pcpu;
609	int cpu;
610
611	ASSERT(prof->prof_pcpus != NULL);
612	CPU_FOREACH(cpu) {
613		pcpu = prof->prof_pcpus[cpu];
614		ASSERT(pcpu->profc_probe == prof);
615		ASSERT(callout_active(&pcpu->profc_cyclic));
616		callout_stop(&pcpu->profc_cyclic);
617		callout_drain(&pcpu->profc_cyclic);
618		kmem_free(pcpu, sizeof(profile_probe_percpu_t));
619	}
620	kmem_free(prof->prof_pcpus, (mp_maxid + 1) * sizeof(pcpu));
621	prof->prof_pcpus = NULL;
622}
623
624/* ARGSUSED */
625static void
626profile_enable(void *arg, dtrace_id_t id, void *parg)
627{
628	profile_probe_t *prof = parg;
629
630	if (prof->prof_kind == PROF_TICK) {
631		prof->prof_expected = sbinuptime() + prof->prof_interval;
632		callout_reset_sbt(&prof->prof_cyclic,
633		    prof->prof_expected, 0, profile_tick, prof,
634		    C_DIRECT_EXEC | C_ABSOLUTE);
635	} else {
636		ASSERT(prof->prof_kind == PROF_PROFILE);
637		profile_enable_omni(prof);
638	}
639}
640
641/* ARGSUSED */
642static void
643profile_disable(void *arg, dtrace_id_t id, void *parg)
644{
645	profile_probe_t *prof = parg;
646
647	if (prof->prof_kind == PROF_TICK) {
648		ASSERT(callout_active(&prof->prof_cyclic));
649		callout_stop(&prof->prof_cyclic);
650		callout_drain(&prof->prof_cyclic);
651	} else {
652		ASSERT(prof->prof_kind == PROF_PROFILE);
653		profile_disable_omni(prof);
654	}
655}
656#endif
657
658static void
659profile_load(void *dummy)
660{
661	/* Create the /dev/dtrace/profile entry. */
662	profile_cdev = make_dev(&profile_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600,
663	    "dtrace/profile");
664
665	if (dtrace_register("profile", &profile_attr, DTRACE_PRIV_USER,
666	    NULL, &profile_pops, NULL, &profile_id) != 0)
667		return;
668}
669
670
671static int
672profile_unload()
673{
674	int error = 0;
675
676	if ((error = dtrace_unregister(profile_id)) != 0)
677		return (error);
678
679	destroy_dev(profile_cdev);
680
681	return (error);
682}
683
684/* ARGSUSED */
685static int
686profile_modevent(module_t mod __unused, int type, void *data __unused)
687{
688	int error = 0;
689
690	switch (type) {
691	case MOD_LOAD:
692		break;
693
694	case MOD_UNLOAD:
695		break;
696
697	case MOD_SHUTDOWN:
698		break;
699
700	default:
701		error = EOPNOTSUPP;
702		break;
703
704	}
705	return (error);
706}
707
708/* ARGSUSED */
709static int
710profile_open(struct cdev *dev __unused, int oflags __unused, int devtype __unused, struct thread *td __unused)
711{
712	return (0);
713}
714
715SYSINIT(profile_load, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY, profile_load, NULL);
716SYSUNINIT(profile_unload, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY, profile_unload, NULL);
717
718DEV_MODULE(profile, profile_modevent, NULL);
719MODULE_VERSION(profile, 1);
720MODULE_DEPEND(profile, dtrace, 1, 1, 1);
721MODULE_DEPEND(profile, opensolaris, 1, 1, 1);
722