1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26/* #pragma ident	"@(#)profile.c	1.7	07/01/10 SMI" */
27
28#ifdef KERNEL
29#ifndef _KERNEL
30#define _KERNEL /* Solaris vs. Darwin */
31#endif
32#endif
33
34#include <kern/cpu_data.h>
35#include <kern/thread.h>
36#include <kern/assert.h>
37#include <mach/thread_status.h>
38
39#include <sys/param.h>
40#include <sys/systm.h>
41#include <sys/errno.h>
42#include <sys/stat.h>
43#include <sys/ioctl.h>
44#include <sys/conf.h>
45#include <sys/fcntl.h>
46#include <miscfs/devfs/devfs.h>
47
48#include <sys/dtrace.h>
49#include <sys/dtrace_impl.h>
50
51#include <sys/dtrace_glue.h>
52
53#include <machine/pal_routines.h>
54
/*
 * find_kern_regs() is defined outside this file.  profile_fire() and
 * profile_tick() treat a non-NULL result as "the timer interrupted kernel
 * code" and read the interrupted RIP from it.  Only x86_64 is supported here.
 */
#if defined(__x86_64__)
extern x86_saved_state_t *find_kern_regs(thread_t);
#else
#error Unknown architecture
#endif
60
/*
 * Discard any earlier ASSERT definition and replace it with a no-op: every
 * ASSERT() in this file expands to an empty statement and its argument is
 * never evaluated.
 */
#undef ASSERT
#define ASSERT(x) do {} while(0)

/* One-time initialization entry point, defined at the bottom of this file. */
extern void profile_init(void);

static dev_info_t *profile_devi;	/* devi recorded by profile_attach() */
static dtrace_provider_id_t profile_id;	/* provider id written by dtrace_register() */
68
69/*
70 * Regardless of platform, the stack frames look like this in the case of the
71 * profile provider:
72 *
73 *	profile_fire
74 *	cyclic_expire
75 *	cyclic_fire
76 *	[ cbe ]
77 *	[ interrupt code ]
78 *
79 * On x86, there are five frames from the generic interrupt code; further, the
80 * interrupted instruction appears as its own stack frame, giving us a total of
81 * 10.
82 *
83 * On SPARC, the picture is further complicated because the compiler
84 * optimizes away tail-calls -- so the following frames are optimized away:
85 *
86 * 	profile_fire
87 *	cyclic_expire
88 *
89 * This gives three frames.  However, on DEBUG kernels, the cyclic_expire
90 * frame cannot be tail-call eliminated, yielding four frames in this case.
91 *
92 * All of the above constraints lead to the mess below.  Yes, the profile
93 * provider should ideally figure this out on-the-fly by hitting one of its own
94 * probes and then walking its own stack trace.  This is complicated, however,
95 * and the static definition doesn't seem to be overly brittle.  Still, we
96 * allow for a manual override in case we get it completely wrong.
97 */
98
/*
 * Default artificial-frame count that profile_create() hands to
 * dtrace_probe_create(); a non-zero profile_aframes takes precedence.
 * NOTE(review): the block comment above arrives at 10 frames for x86 while
 * the value used here is 9 -- confirm which of the two is current.
 */
#if defined(__x86_64__)
#define PROF_ARTIFICIAL_FRAMES  9
#else
#error Unknown architecture
#endif

/* Size of profile_probe_t.prof_name and of the name buffer in profile_provide(). */
#define	PROF_NAMELEN		15

/* Values of profile_probe_t.prof_kind and the probe-name prefix for each. */
#define	PROF_PROFILE		0	/* omni cyclic, fired through profile_fire() */
#define	PROF_TICK		1	/* single cyclic timer, fired through profile_tick() */
#define	PROF_PREFIX_PROFILE	"profile-"
#define	PROF_PREFIX_TICK	"tick-"
111
/*
 * Per-probe state; the pointer is handed to dtrace_probe_create() as the
 * probe's private argument and comes back as parg in the provider callbacks.
 * For PROF_PROFILE probes, profile_create() allocates NCPU
 * profile_probe_percpu_t slots immediately after this structure.
 */
typedef struct profile_probe {
	char		prof_name[PROF_NAMELEN];	/* probe name, truncated to fit */
	dtrace_id_t	prof_id;	/* id returned by dtrace_probe_create() */
	int		prof_kind;	/* PROF_PROFILE or PROF_TICK */
	hrtime_t	prof_interval;	/* firing period in nanoseconds */
	cyclic_id_t	prof_cyclic;	/* cyclic while enabled, else CYCLIC_NONE */
} profile_probe_t;
119
/*
 * Per-CPU bookkeeping for a PROF_PROFILE probe.  profile_online() fills one
 * slot in and installs it as the cyclic handler argument; profile_fire()
 * uses it to compute how late each firing is.
 */
typedef struct profile_probe_percpu {
	hrtime_t	profc_expected;	/* dtrace_gethrtime() value expected at the next firing */
	hrtime_t	profc_interval;	/* period added to profc_expected on every firing */
	profile_probe_t	*profc_probe;	/* owning probe */
} profile_probe_percpu_t;
125
/* Shortest interval (in nanoseconds) profile_create() accepts; shorter requests are ignored. */
hrtime_t	profile_interval_min = NANOSEC / 5000;		/* 5000 hz */
/* When non-zero, replaces PROF_ARTIFICIAL_FRAMES for newly created probes. */
int		profile_aframes = 0;				/* override */
128
/*
 * Rates, in firings per second, of the "profile-<rate>" probes created when
 * profile_provide() is called without a description.  Zero entries are
 * skipped by that loop.
 */
static int profile_rates[] = {
    97, 199, 499, 997, 1999,
    4001, 4999, 0, 0, 0,
    0, 0, 0, 0, 0,
    0, 0, 0, 0, 0
};
135
/*
 * Rates, in firings per second, of the "tick-<rate>" probes created when
 * profile_provide() is called without a description.  Zero entries are
 * skipped by that loop.
 */
static int profile_ticks[] = {
    1, 10, 100, 500, 1000,
    5000, 0, 0, 0, 0,
    0, 0, 0, 0, 0
};
141
/*
 * profile_max defines the upper bound on the number of profile probes that
 * can exist (this is to prevent malicious or clumsy users from exhausting
 * system resources by creating a slew of profile probes). At mod load time,
 * this gets its value from PROFILE_MAX_DEFAULT or profile-max-probes if it's
 * present in the profile.conf file.  (In this file, profile_attach() always
 * assigns PROFILE_MAX_DEFAULT; no profile.conf lookup is performed.)
 */
#define	PROFILE_MAX_DEFAULT	1000	/* default max. number of probes */
/* Zero until profile_attach() runs; profile_create() refuses to exceed it. */
static uint32_t profile_max;		/* maximum number of profile probes */
/* Adjusted with atomic_add_32() in profile_create() and profile_destroy(). */
static uint32_t profile_total;	/* current number of profile probes */
152
153static void
154profile_fire(void *arg)
155{
156	profile_probe_percpu_t *pcpu = arg;
157	profile_probe_t *prof = pcpu->profc_probe;
158	hrtime_t late;
159
160	late = dtrace_gethrtime() - pcpu->profc_expected;
161	pcpu->profc_expected += pcpu->profc_interval;
162
163#if defined(__x86_64__)
164	x86_saved_state_t *kern_regs = find_kern_regs(current_thread());
165
166	if (NULL != kern_regs) {
167		/* Kernel was interrupted. */
168		dtrace_probe(prof->prof_id, saved_state64(kern_regs)->isf.rip,  0x0, late, 0, 0);
169
170	} else {
171		pal_register_cache_state(current_thread(), VALID);
172		/* Possibly a user interrupt */
173		x86_saved_state_t   *tagged_regs = (x86_saved_state_t *)find_user_regs(current_thread());
174
175		if (NULL == tagged_regs) {
176			/* Too bad, so sad, no useful interrupt state. */
177			dtrace_probe(prof->prof_id, 0xcafebabe,
178	    		0x0, late, 0, 0); /* XXX_BOGUS also see profile_usermode() below. */
179		} else if (is_saved_state64(tagged_regs)) {
180			x86_saved_state64_t *regs = saved_state64(tagged_regs);
181
182			dtrace_probe(prof->prof_id, 0x0, regs->isf.rip, late, 0, 0);
183		} else {
184			x86_saved_state32_t *regs = saved_state32(tagged_regs);
185
186			dtrace_probe(prof->prof_id, 0x0, regs->eip, late, 0, 0);
187		}
188	}
189#else
190#error Unknown architecture
191#endif
192}
193
194static void
195profile_tick(void *arg)
196{
197	profile_probe_t *prof = arg;
198
199#if defined(__x86_64__)
200	x86_saved_state_t *kern_regs = find_kern_regs(current_thread());
201
202	if (NULL != kern_regs) {
203		/* Kernel was interrupted. */
204		dtrace_probe(prof->prof_id, saved_state64(kern_regs)->isf.rip,  0x0, 0, 0, 0);
205	} else {
206		pal_register_cache_state(current_thread(), VALID);
207		/* Possibly a user interrupt */
208		x86_saved_state_t   *tagged_regs = (x86_saved_state_t *)find_user_regs(current_thread());
209
210		if (NULL == tagged_regs) {
211			/* Too bad, so sad, no useful interrupt state. */
212			dtrace_probe(prof->prof_id, 0xcafebabe,
213	    		0x0, 0, 0, 0); /* XXX_BOGUS also see profile_usermode() below. */
214		} else if (is_saved_state64(tagged_regs)) {
215			x86_saved_state64_t *regs = saved_state64(tagged_regs);
216
217			dtrace_probe(prof->prof_id, 0x0, regs->isf.rip, 0, 0, 0);
218		} else {
219			x86_saved_state32_t *regs = saved_state32(tagged_regs);
220
221			dtrace_probe(prof->prof_id, 0x0, regs->eip, 0, 0, 0);
222		}
223	}
224#else
225#error Unknown architecture
226#endif
227}
228
229static void
230profile_create(hrtime_t interval, const char *name, int kind)
231{
232	profile_probe_t *prof;
233
234	if (interval < profile_interval_min)
235		return;
236
237	if (dtrace_probe_lookup(profile_id, NULL, NULL, name) != 0)
238		return;
239
240	atomic_add_32(&profile_total, 1);
241	if (profile_total > profile_max) {
242		atomic_add_32(&profile_total, -1);
243		return;
244	}
245
246	if (PROF_TICK == kind)
247		prof = kmem_zalloc(sizeof (profile_probe_t), KM_SLEEP);
248	else
249		prof = kmem_zalloc(sizeof (profile_probe_t) + NCPU*sizeof(profile_probe_percpu_t), KM_SLEEP);
250
251	(void) strlcpy(prof->prof_name, name, sizeof(prof->prof_name));
252	prof->prof_interval = interval;
253	prof->prof_cyclic = CYCLIC_NONE;
254	prof->prof_kind = kind;
255	prof->prof_id = dtrace_probe_create(profile_id,
256	    NULL, NULL, name,
257	    profile_aframes ? profile_aframes : PROF_ARTIFICIAL_FRAMES, prof);
258}
259
260/*ARGSUSED*/
261static void
262profile_provide(void *arg, const dtrace_probedesc_t *desc)
263{
264#pragma unused(arg) /* __APPLE__ */
265	int i, j, rate, kind;
266	hrtime_t val = 0, mult = 1, len;
267	const char *name, *suffix = NULL;
268
269	const struct {
270		const char *prefix;
271		int kind;
272	} types[] = {
273		{ PROF_PREFIX_PROFILE, PROF_PROFILE },
274		{ PROF_PREFIX_TICK, PROF_TICK },
275		{ NULL, 0 }
276	};
277
278	const struct {
279		const char *name;
280		hrtime_t mult;
281	} suffixes[] = {
282		{ "ns", 	NANOSEC / NANOSEC },
283		{ "nsec",	NANOSEC / NANOSEC },
284		{ "us",		NANOSEC / MICROSEC },
285		{ "usec",	NANOSEC / MICROSEC },
286		{ "ms",		NANOSEC / MILLISEC },
287		{ "msec",	NANOSEC / MILLISEC },
288		{ "s",		NANOSEC / SEC },
289		{ "sec",	NANOSEC / SEC },
290		{ "m",		NANOSEC * (hrtime_t)60 },
291		{ "min",	NANOSEC * (hrtime_t)60 },
292		{ "h",		NANOSEC * (hrtime_t)(60 * 60) },
293		{ "hour",	NANOSEC * (hrtime_t)(60 * 60) },
294		{ "d",		NANOSEC * (hrtime_t)(24 * 60 * 60) },
295		{ "day",	NANOSEC * (hrtime_t)(24 * 60 * 60) },
296		{ "hz",		0 },
297		{ NULL, 0 }
298	};
299
300	if (desc == NULL) {
301		char n[PROF_NAMELEN];
302
303		/*
304		 * If no description was provided, provide all of our probes.
305		 */
306		for (i = 0; i < (int)(sizeof (profile_rates) / sizeof (int)); i++) {
307			if ((rate = profile_rates[i]) == 0)
308				continue;
309
310			(void) snprintf(n, PROF_NAMELEN, "%s%d",
311			    PROF_PREFIX_PROFILE, rate);
312			profile_create(NANOSEC / rate, n, PROF_PROFILE);
313		}
314
315		for (i = 0; i < (int)(sizeof (profile_ticks) / sizeof (int)); i++) {
316			if ((rate = profile_ticks[i]) == 0)
317				continue;
318
319			(void) snprintf(n, PROF_NAMELEN, "%s%d",
320			    PROF_PREFIX_TICK, rate);
321			profile_create(NANOSEC / rate, n, PROF_TICK);
322		}
323
324		return;
325	}
326
327	name = desc->dtpd_name;
328
329	for (i = 0; types[i].prefix != NULL; i++) {
330		len = strlen(types[i].prefix);
331
332		if (strncmp(name, types[i].prefix, len) != 0)
333			continue;
334		break;
335	}
336
337	if (types[i].prefix == NULL)
338		return;
339
340	kind = types[i].kind;
341	j = strlen(name) - len;
342
343	/*
344	 * We need to start before any time suffix.
345	 */
346	for (j = strlen(name); j >= len; j--) {
347		if (name[j] >= '0' && name[j] <= '9')
348			break;
349		suffix = &name[j];
350	}
351
352	ASSERT(suffix != NULL);
353
354	/*
355	 * Now determine the numerical value present in the probe name.
356	 */
357	for (; j >= len; j--) {
358		if (name[j] < '0' || name[j] > '9')
359			return;
360
361		val += (name[j] - '0') * mult;
362		mult *= (hrtime_t)10;
363	}
364
365	if (val == 0)
366		return;
367
368	/*
369	 * Look-up the suffix to determine the multiplier.
370	 */
371	for (i = 0, mult = 0; suffixes[i].name != NULL; i++) {
372		/* APPLE NOTE: Darwin employs size bounded string operations */
373		if (strncasecmp(suffixes[i].name, suffix, strlen(suffixes[i].name) + 1) == 0) {
374			mult = suffixes[i].mult;
375			break;
376		}
377	}
378
379	if (suffixes[i].name == NULL && *suffix != '\0')
380		return;
381
382	if (mult == 0) {
383		/*
384		 * The default is frequency-per-second.
385		 */
386		val = NANOSEC / val;
387	} else {
388		val *= mult;
389	}
390
391	profile_create(val, name, kind);
392}
393
394/*ARGSUSED*/
395static void
396profile_destroy(void *arg, dtrace_id_t id, void *parg)
397{
398#pragma unused(arg,id) /* __APPLE__ */
399	profile_probe_t *prof = parg;
400
401	ASSERT(prof->prof_cyclic == CYCLIC_NONE);
402
403	if (prof->prof_kind == PROF_TICK)
404		kmem_free(prof, sizeof (profile_probe_t));
405	else
406		kmem_free(prof, sizeof (profile_probe_t) + NCPU*sizeof(profile_probe_percpu_t));
407
408	ASSERT(profile_total >= 1);
409	atomic_add_32(&profile_total, -1);
410}
411
412/*ARGSUSED*/
413static void
414profile_online(void *arg, dtrace_cpu_t *cpu, cyc_handler_t *hdlr, cyc_time_t *when)
415{
416#pragma unused(cpu) /* __APPLE__ */
417	profile_probe_t *prof = arg;
418	profile_probe_percpu_t *pcpu;
419
420	pcpu = ((profile_probe_percpu_t *)(&(prof[1]))) + cpu_number();
421	pcpu->profc_probe = prof;
422
423	hdlr->cyh_func = profile_fire;
424	hdlr->cyh_arg = pcpu;
425	hdlr->cyh_level = CY_HIGH_LEVEL;
426
427	when->cyt_interval = prof->prof_interval;
428	when->cyt_when = dtrace_gethrtime() + when->cyt_interval;
429
430	pcpu->profc_expected = when->cyt_when;
431	pcpu->profc_interval = when->cyt_interval;
432}
433
/*
 * Omni-cyclic "offline" callback for PROF_PROFILE probes (installed by
 * profile_enable() as cyo_offline).  oarg is the per-CPU slot that
 * profile_online() handed out.  Nothing needs to be undone here: the slots
 * are part of the probe's own allocation.
 */
/*ARGSUSED*/
static void
profile_offline(void *arg, dtrace_cpu_t *cpu, void *oarg)
{
	profile_probe_percpu_t *pcpu = oarg;

	/* ASSERT is a no-op in this file, so this only documents the invariant. */
	ASSERT(pcpu->profc_probe == arg);
	/* Placed after the ASSERT so the names are not marked unused before it. */
#pragma unused(pcpu,arg,cpu) /* __APPLE__ */
}
443
444/*ARGSUSED*/
445static int
446profile_enable(void *arg, dtrace_id_t id, void *parg)
447{
448#pragma unused(arg,id) /* __APPLE__ */
449	profile_probe_t *prof = parg;
450	cyc_omni_handler_t omni;
451	cyc_handler_t hdlr;
452	cyc_time_t when;
453
454	ASSERT(prof->prof_interval != 0);
455	ASSERT(MUTEX_HELD(&cpu_lock));
456
457	if (prof->prof_kind == PROF_TICK) {
458		hdlr.cyh_func = profile_tick;
459		hdlr.cyh_arg = prof;
460		hdlr.cyh_level = CY_HIGH_LEVEL;
461
462		when.cyt_interval = prof->prof_interval;
463#if !defined(__APPLE__)
464		when.cyt_when = dtrace_gethrtime() + when.cyt_interval;
465#else
466		when.cyt_when = 0;
467#endif /* __APPLE__ */
468	} else {
469		ASSERT(prof->prof_kind == PROF_PROFILE);
470		omni.cyo_online = profile_online;
471		omni.cyo_offline = profile_offline;
472		omni.cyo_arg = prof;
473	}
474
475	if (prof->prof_kind == PROF_TICK) {
476		prof->prof_cyclic = cyclic_timer_add(&hdlr, &when);
477	} else {
478		prof->prof_cyclic = (cyclic_id_t)cyclic_add_omni(&omni); /* cast puns cyclic_id_list_t with cyclic_id_t */
479	}
480
481	return(0);
482}
483
484/*ARGSUSED*/
485static void
486profile_disable(void *arg, dtrace_id_t id, void *parg)
487{
488	profile_probe_t *prof = parg;
489
490	ASSERT(prof->prof_cyclic != CYCLIC_NONE);
491	ASSERT(MUTEX_HELD(&cpu_lock));
492
493#pragma unused(arg,id)
494	if (prof->prof_kind == PROF_TICK) {
495		cyclic_timer_remove(prof->prof_cyclic);
496	} else {
497		cyclic_remove_omni((cyclic_id_list_t)prof->prof_cyclic); /* cast puns cyclic_id_list_t with cyclic_id_t */
498	}
499	prof->prof_cyclic = CYCLIC_NONE;
500}
501
/*
 * APPLE NOTE:  profile_usermode call not supported.
 *
 * Stub installed in profile_pops: ignores all of its arguments and always
 * returns 1.  NOTE(review): the value is marked XXX_BOGUS by the original
 * author; what callers make of it is not visible in this file.
 */
static int
profile_usermode(void *arg, dtrace_id_t id, void *parg)
{
#pragma unused(arg,id,parg)
	return 1; /* XXX_BOGUS */
}
511
/*
 * Stability attributes passed to dtrace_register() in profile_attach().
 * Row labels are presumed from the usual dtrace_pattr_t member order
 * (provider, module, function, name, arguments) -- TODO confirm against
 * <sys/dtrace.h>.
 */
static dtrace_pattr_t profile_attr = {
{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },	/* provider */
{ DTRACE_STABILITY_UNSTABLE, DTRACE_STABILITY_UNSTABLE, DTRACE_CLASS_UNKNOWN },	/* module */
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },	/* function */
{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },	/* name */
{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },	/* arguments */
};
519
/*
 * Provider operations vector passed to dtrace_register() in
 * profile_attach().  Slot labels are presumed from the usual dtrace_pops_t
 * member order -- TODO confirm against <sys/dtrace.h> in this tree.
 */
static dtrace_pops_t profile_pops = {
	profile_provide,	/* provide */
	NULL,			/* provide_module */
	profile_enable,		/* enable */
	profile_disable,	/* disable */
	NULL,			/* suspend */
	NULL,			/* resume */
	NULL,			/* getargdesc */
	NULL,			/* getargval */
	profile_usermode,	/* usermode */
	profile_destroy		/* destroy */
};
532
533static int
534profile_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
535{
536	switch (cmd) {
537	case DDI_ATTACH:
538		break;
539	case DDI_RESUME:
540		return (DDI_SUCCESS);
541	default:
542		return (DDI_FAILURE);
543	}
544
545	if (ddi_create_minor_node(devi, "profile", S_IFCHR, 0,
546	    DDI_PSEUDO, 0) == DDI_FAILURE ||
547	    dtrace_register("profile", &profile_attr,
548	    DTRACE_PRIV_KERNEL | DTRACE_PRIV_USER, NULL,
549	    &profile_pops, NULL, &profile_id) != 0) {
550		ddi_remove_minor_node(devi, NULL);
551		return (DDI_FAILURE);
552	}
553
554	profile_max = PROFILE_MAX_DEFAULT;
555
556	ddi_report_dev(devi);
557	profile_devi = devi;
558	return (DDI_SUCCESS);
559}
560
561/*
562 * APPLE NOTE:  profile_detach not implemented
563 */
564#if !defined(__APPLE__)
565static int
566profile_detach(dev_info_t *devi, ddi_detach_cmd_t cmd)
567{
568	switch (cmd) {
569	case DDI_DETACH:
570		break;
571	case DDI_SUSPEND:
572		return (DDI_SUCCESS);
573	default:
574		return (DDI_FAILURE);
575	}
576
577	if (dtrace_unregister(profile_id) != 0)
578		return (DDI_FAILURE);
579
580	ddi_remove_minor_node(devi, NULL);
581	return (DDI_SUCCESS);
582}
583#endif /* __APPLE__ */
584
/* Declared with the d_open_t type so the definition is checked against the cdevsw open signature. */
d_open_t _profile_open;

/*
 * Open handler of the profile character device (first entry of
 * profile_cdevsw).  Ignores all arguments and always returns 0.
 */
int _profile_open(dev_t dev, int flags, int devtype, struct proc *p)
{
#pragma unused(dev,flags,devtype,p)
	return 0;
}
592
/* Value handed to cdevsw_add() by profile_init(); #undef'd again right after it. */
#define PROFILE_MAJOR  -24 /* let the kernel pick the device number */

/*
 * A struct describing which functions will get invoked for certain
 * actions.  Only open is implemented (by _profile_open); the remaining
 * slots hold the eno_* / nulldev placeholders listed below.
 */
static struct cdevsw profile_cdevsw =
{
	_profile_open,		/* open */
	eno_opcl,			/* close */
	eno_rdwrt,			/* read */
	eno_rdwrt,			/* write */
	eno_ioctl,			/* ioctl */
	(stop_fcn_t *)nulldev, /* stop */
	(reset_fcn_t *)nulldev, /* reset */
	NULL,				/* tty's */
	eno_select,			/* select */
	eno_mmap,			/* mmap */
	eno_strat,			/* strategy */
	eno_getc,			/* getc */
	eno_putc,			/* putc */
	0					/* type */
};
616
617static int gProfileInited = 0;
618
619void profile_init( void )
620{
621	if (0 == gProfileInited)
622	{
623		int majdevno = cdevsw_add(PROFILE_MAJOR, &profile_cdevsw);
624
625		if (majdevno < 0) {
626			printf("profile_init: failed to allocate a major number!\n");
627			gProfileInited = 0;
628			return;
629		}
630
631		profile_attach( (dev_info_t	*)(uintptr_t)majdevno, DDI_ATTACH );
632
633		gProfileInited = 1;
634	} else
635		panic("profile_init: called twice!\n");
636}
637#undef PROFILE_MAJOR
638