1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 *
21 * Portions Copyright 2006-2008 John Birrell jb@freebsd.org
22 *
23 */
24
25/*
26 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
27 * Use is subject to license terms.
28 */
29
30#include <sys/param.h>
31#include <sys/systm.h>
32#include <sys/conf.h>
33#include <sys/cpuvar.h>
34#include <sys/endian.h>
35#include <sys/fcntl.h>
36#include <sys/filio.h>
37#include <sys/kdb.h>
38#include <sys/kernel.h>
39#include <sys/kmem.h>
40#include <sys/kthread.h>
41#include <sys/limits.h>
42#include <sys/linker.h>
43#include <sys/lock.h>
44#include <sys/malloc.h>
45#include <sys/module.h>
46#include <sys/mutex.h>
47#include <sys/poll.h>
48#include <sys/proc.h>
49#include <sys/selinfo.h>
50#include <sys/smp.h>
51#include <sys/sysctl.h>
52#include <sys/uio.h>
53#include <sys/unistd.h>
54#include <machine/cpu.h>
55#include <machine/stdarg.h>
56
57#include <sys/dtrace.h>
58#include <sys/dtrace_bsd.h>
59
60#include <cddl/dev/dtrace/dtrace_cddl.h>
61
/*
 * Size, in bytes, of the probe-name buffers used by this provider
 * (profile_probe_t.prof_name and the scratch buffer in profile_provide()).
 */
#define	PROF_NAMELEN		15

/*
 * Probe kinds, and the probe-name prefixes that select each kind.
 */
#define	PROF_PROFILE		0
#define	PROF_TICK		1
#define	PROF_PREFIX_PROFILE	"profile-"
#define	PROF_PREFIX_TICK	"tick-"
68
69/*
70 * Regardless of platform, there are five artificial frames in the case of the
71 * profile provider:
72 *
73 *	profile_fire
74 *	cyclic_expire
75 *	cyclic_fire
76 *	[ cbe ]
77 *	[ locore ]
78 *
79 * On amd64, there are two frames associated with locore:  one in locore, and
80 * another in common interrupt dispatch code.  (i386 has not been modified to
81 * use this common layer.)  Further, on i386, the interrupted instruction
82 * appears as its own stack frame.  All of this means that we need to add one
83 * frame for amd64, and then take one away for both amd64 and i386.
84 *
85 * All of the above constraints lead to the mess below.  Yes, the profile
 * provider should ideally figure this out on-the-fly by hitting one of its own
87 * probes and then walking its own stack trace.  This is complicated, however,
88 * and the static definition doesn't seem to be overly brittle.  Still, we
89 * allow for a manual override in case we get it completely wrong.
90 */
/*
 * Per-architecture default for the number of artificial frames.  The value
 * selected here is only the initial value of profile_aframes, which is what
 * is actually handed to dtrace_probe_create().  No fallback is defined for
 * an architecture that is not listed below.
 */
#ifdef __amd64
#define	PROF_ARTIFICIAL_FRAMES	10
#else
#ifdef __i386
#define	PROF_ARTIFICIAL_FRAMES	6
#endif
#endif

#ifdef __powerpc__
/*
 * This value is bogus just to make module compilable on powerpc
 */
#define	PROF_ARTIFICIAL_FRAMES	8
#endif

/*
 * Forward declaration: profile_probe_t refers to this type by pointer before
 * the full definition appears.
 */
struct profile_probe_percpu;

#ifdef __arm__
#define	PROF_ARTIFICIAL_FRAMES	3
#endif

#ifdef __aarch64__
#define	PROF_ARTIFICIAL_FRAMES	12
#endif

#ifdef __riscv
#define	PROF_ARTIFICIAL_FRAMES	12
#endif
119
/*
 * Per-probe state.  One of these is allocated by profile_create() for every
 * probe and released by profile_destroy().
 */
typedef struct profile_probe {
	char		prof_name[PROF_NAMELEN];	/* copy of the probe name */
	dtrace_id_t	prof_id;	/* id from dtrace_probe_create() */
	int		prof_kind;	/* PROF_PROFILE or PROF_TICK */
#ifdef illumos
	hrtime_t	prof_interval;	/* firing interval */
	cyclic_id_t	prof_cyclic;	/* CYCLIC_NONE while disabled */
#else
	sbintime_t	prof_interval;	/* firing interval, via nsec_to_sbt() */
	struct callout	prof_cyclic;	/* callout driving a tick- probe */
	sbintime_t	prof_expected;	/* time of the next tick- firing */
	/*
	 * Array indexed by CPU id holding the per-CPU state of an enabled
	 * profile- probe; NULL while the probe is not enabled.
	 */
	struct profile_probe_percpu **prof_pcpus;
#endif
} profile_probe_t;
134
/*
 * Per-CPU state of an enabled profile- probe.
 *
 * NOTE(review): although the time fields are declared hrtime_t, the
 * non-illumos code in this file stores sbintime_t values in them
 * (sbinuptime() plus prof_interval).
 */
typedef struct profile_probe_percpu {
	hrtime_t	profc_expected;	/* time the next firing is due */
	hrtime_t	profc_interval;	/* firing interval */
	profile_probe_t	*profc_probe;	/* owning probe */
#ifdef __FreeBSD__
	struct callout	profc_cyclic;	/* callout that runs profile_fire() */
#endif
} profile_probe_percpu_t;
143
/*
 * Forward declarations for the provider entry points and module hooks
 * defined later in this file.
 */
static int	profile_unload(void);
static void	profile_create(hrtime_t, char *, int);
static void	profile_destroy(void *, dtrace_id_t, void *);
static void	profile_enable(void *, dtrace_id_t, void *);
static void	profile_disable(void *, dtrace_id_t, void *);
static void	profile_load(void *);
static void	profile_provide(void *, dtrace_probedesc_t *);
151
/*
 * Rates (firings per second) of the profile-<rate> probes that
 * profile_provide() creates when it is called without a description.
 * Zero entries are skipped.
 */
static int profile_rates[] = {
    97, 199, 499, 997, 1999,
    4001, 4999, 0, 0, 0,
    0, 0, 0, 0, 0,
    0, 0, 0, 0, 0
};
158
/*
 * Rates (firings per second) of the tick-<rate> probes that
 * profile_provide() creates when it is called without a description.
 * Zero entries are skipped.
 */
static int profile_ticks[] = {
    1, 10, 100, 500, 1000,
    5000, 0, 0, 0, 0,
    0, 0, 0, 0, 0
};
164
165/*
166 * profile_max defines the upper bound on the number of profile probes that
 * can exist (this is to prevent malicious or clumsy users from exhausting
168 * system resources by creating a slew of profile probes). At mod load time,
169 * this gets its value from PROFILE_MAX_DEFAULT or profile-max-probes if it's
170 * present in the profile.conf file.
171 */
#define	PROFILE_MAX_DEFAULT	1000	/* default max. number of probes */
static uint32_t profile_max = PROFILE_MAX_DEFAULT;
					/* maximum number of profile probes */
/*
 * profile_total is incremented in profile_create() and decremented in
 * profile_destroy() (and in profile_create() itself when the limit is hit).
 */
static uint32_t profile_total;		/* current number of profile probes */
176
/*
 * Stability/class attribute table handed to dtrace_register() in
 * profile_load().  The meaning of each row is defined by dtrace_pattr_t
 * (see <sys/dtrace.h>).
 */
static dtrace_pattr_t profile_attr = {
{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
};
184
/*
 * Provider operations vector handed to dtrace_register() in profile_load().
 * Only provide, enable, disable and destroy are implemented; every other
 * operation is left NULL.
 */
static dtrace_pops_t profile_pops = {
	.dtps_provide =		profile_provide,
	.dtps_provide_module =	NULL,
	.dtps_enable =		profile_enable,
	.dtps_disable =		profile_disable,
	.dtps_suspend =		NULL,
	.dtps_resume =		NULL,
	.dtps_getargdesc =	NULL,
	.dtps_getargval =	NULL,
	.dtps_usermode =	NULL,
	.dtps_destroy =		profile_destroy
};
197
/* Provider id filled in by dtrace_register() in profile_load(). */
static dtrace_provider_id_t	profile_id;
/* Shortest interval, in nanoseconds, that profile_create() will accept. */
static hrtime_t			profile_interval_min = NANOSEC / 5000;	/* 5000 hz */
/*
 * Number of artificial frames passed to dtrace_probe_create().  It is read
 * when a probe is created, so changing it through the sysctl below affects
 * only probes created afterwards.
 */
static int			profile_aframes = PROF_ARTIFICIAL_FRAMES;

/*
 * Expose profile_aframes as a read/write "aframes" integer under the
 * "profile" node of _kern_dtrace; this is the manual override for the
 * artificial frame count.
 */
SYSCTL_DECL(_kern_dtrace);
SYSCTL_NODE(_kern_dtrace, OID_AUTO, profile, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "DTrace profile parameters");
SYSCTL_INT(_kern_dtrace_profile, OID_AUTO, aframes, CTLFLAG_RW, &profile_aframes,
    0, "Skipped frames for profile provider");
207
208static sbintime_t
209nsec_to_sbt(hrtime_t nsec)
210{
211	time_t sec;
212
213	/*
214	 * We need to calculate nsec * 2^32 / 10^9
215	 * Seconds and nanoseconds are split to avoid overflow.
216	 */
217	sec = nsec / NANOSEC;
218	nsec = nsec % NANOSEC;
219	return (((sbintime_t)sec << 32) | ((sbintime_t)nsec << 32) / NANOSEC);
220}
221
222static hrtime_t
223sbt_to_nsec(sbintime_t sbt)
224{
225
226	return ((sbt >> 32) * NANOSEC +
227	    (((uint32_t)sbt * (hrtime_t)NANOSEC) >> 32));
228}
229
230static void
231profile_probe(profile_probe_t *prof, hrtime_t late)
232{
233	struct thread *td;
234	struct trapframe *frame;
235	uintfptr_t pc, upc;
236
237	td = curthread;
238	pc = upc = 0;
239
240	/*
241	 * td_intr_frame can be unset if this is a catch-up event upon waking up
242	 * from idle sleep. This can only happen on a CPU idle thread. Use a
243	 * representative arg0 value in this case so that one of the probe
244	 * arguments is non-zero.
245	 */
246	frame = td->td_intr_frame;
247	if (frame != NULL) {
248		if (TRAPF_USERMODE(frame))
249			upc = TRAPF_PC(frame);
250		else {
251			pc = TRAPF_PC(frame);
252			td->t_dtrace_trapframe = frame;
253		}
254	} else if (TD_IS_IDLETHREAD(td))
255		pc = (uintfptr_t)&cpu_idle;
256
257	dtrace_probe(prof->prof_id, pc, upc, late, 0, 0);
258	td->t_dtrace_trapframe = NULL;
259}
260
261static void
262profile_fire(void *arg)
263{
264	profile_probe_percpu_t *pcpu = arg;
265	profile_probe_t *prof = pcpu->profc_probe;
266	hrtime_t late;
267
268	late = sbt_to_nsec(sbinuptime() - pcpu->profc_expected);
269
270	profile_probe(prof, late);
271	pcpu->profc_expected += pcpu->profc_interval;
272	callout_schedule_sbt_curcpu(&pcpu->profc_cyclic,
273	    pcpu->profc_expected, 0, C_DIRECT_EXEC | C_ABSOLUTE);
274}
275
276static void
277profile_tick(void *arg)
278{
279	profile_probe_t *prof = arg;
280
281	profile_probe(prof, 0);
282	prof->prof_expected += prof->prof_interval;
283	callout_schedule_sbt(&prof->prof_cyclic,
284	    prof->prof_expected, 0, C_DIRECT_EXEC | C_ABSOLUTE);
285}
286
287static void
288profile_create(hrtime_t interval, char *name, int kind)
289{
290	profile_probe_t *prof;
291
292	if (interval < profile_interval_min)
293		return;
294
295	if (dtrace_probe_lookup(profile_id, NULL, NULL, name) != 0)
296		return;
297
298	atomic_add_32(&profile_total, 1);
299	if (profile_total > profile_max) {
300		atomic_add_32(&profile_total, -1);
301		return;
302	}
303
304	prof = kmem_zalloc(sizeof (profile_probe_t), KM_SLEEP);
305	(void) strcpy(prof->prof_name, name);
306#ifdef illumos
307	prof->prof_interval = interval;
308	prof->prof_cyclic = CYCLIC_NONE;
309#else
310	prof->prof_interval = nsec_to_sbt(interval);
311	callout_init(&prof->prof_cyclic, 1);
312#endif
313	prof->prof_kind = kind;
314	prof->prof_id = dtrace_probe_create(profile_id,
315	    NULL, NULL, name,
316	    profile_aframes, prof);
317}
318
319/*ARGSUSED*/
320static void
321profile_provide(void *arg, dtrace_probedesc_t *desc)
322{
323	int i, j, rate, kind;
324	hrtime_t val = 0, mult = 1, len = 0;
325	char *name, *suffix = NULL;
326
327	const struct {
328		char *prefix;
329		int kind;
330	} types[] = {
331		{ PROF_PREFIX_PROFILE, PROF_PROFILE },
332		{ PROF_PREFIX_TICK, PROF_TICK },
333		{ 0, 0 }
334	};
335
336	const struct {
337		char *name;
338		hrtime_t mult;
339	} suffixes[] = {
340		{ "ns", 	NANOSEC / NANOSEC },
341		{ "nsec",	NANOSEC / NANOSEC },
342		{ "us",		NANOSEC / MICROSEC },
343		{ "usec",	NANOSEC / MICROSEC },
344		{ "ms",		NANOSEC / MILLISEC },
345		{ "msec",	NANOSEC / MILLISEC },
346		{ "s",		NANOSEC / SEC },
347		{ "sec",	NANOSEC / SEC },
348		{ "m",		NANOSEC * (hrtime_t)60 },
349		{ "min",	NANOSEC * (hrtime_t)60 },
350		{ "h",		NANOSEC * (hrtime_t)(60 * 60) },
351		{ "hour",	NANOSEC * (hrtime_t)(60 * 60) },
352		{ "d",		NANOSEC * (hrtime_t)(24 * 60 * 60) },
353		{ "day",	NANOSEC * (hrtime_t)(24 * 60 * 60) },
354		{ "hz",		0 },
355		{ NULL }
356	};
357
358	if (desc == NULL) {
359		char n[PROF_NAMELEN];
360
361		/*
362		 * If no description was provided, provide all of our probes.
363		 */
364		for (i = 0; i < sizeof (profile_rates) / sizeof (int); i++) {
365			if ((rate = profile_rates[i]) == 0)
366				continue;
367
368			(void) snprintf(n, PROF_NAMELEN, "%s%d",
369			    PROF_PREFIX_PROFILE, rate);
370			profile_create(NANOSEC / rate, n, PROF_PROFILE);
371		}
372
373		for (i = 0; i < sizeof (profile_ticks) / sizeof (int); i++) {
374			if ((rate = profile_ticks[i]) == 0)
375				continue;
376
377			(void) snprintf(n, PROF_NAMELEN, "%s%d",
378			    PROF_PREFIX_TICK, rate);
379			profile_create(NANOSEC / rate, n, PROF_TICK);
380		}
381
382		return;
383	}
384
385	name = desc->dtpd_name;
386
387	for (i = 0; types[i].prefix != NULL; i++) {
388		len = strlen(types[i].prefix);
389
390		if (strncmp(name, types[i].prefix, len) != 0)
391			continue;
392		break;
393	}
394
395	if (types[i].prefix == NULL)
396		return;
397
398	kind = types[i].kind;
399	j = strlen(name) - len;
400
401	/*
402	 * We need to start before any time suffix.
403	 */
404	for (j = strlen(name); j >= len; j--) {
405		if (name[j] >= '0' && name[j] <= '9')
406			break;
407		suffix = &name[j];
408	}
409
410	ASSERT(suffix != NULL);
411
412	/*
413	 * Now determine the numerical value present in the probe name.
414	 */
415	for (; j >= len; j--) {
416		if (name[j] < '0' || name[j] > '9')
417			return;
418
419		val += (name[j] - '0') * mult;
420		mult *= (hrtime_t)10;
421	}
422
423	if (val == 0)
424		return;
425
426	/*
427	 * Look-up the suffix to determine the multiplier.
428	 */
429	for (i = 0, mult = 0; suffixes[i].name != NULL; i++) {
430		if (strcasecmp(suffixes[i].name, suffix) == 0) {
431			mult = suffixes[i].mult;
432			break;
433		}
434	}
435
436	if (suffixes[i].name == NULL && *suffix != '\0')
437		return;
438
439	if (mult == 0) {
440		/*
441		 * The default is frequency-per-second.
442		 */
443		val = NANOSEC / val;
444	} else {
445		val *= mult;
446	}
447
448	profile_create(val, name, kind);
449}
450
451/* ARGSUSED */
452static void
453profile_destroy(void *arg, dtrace_id_t id, void *parg)
454{
455	profile_probe_t *prof = parg;
456
457#ifdef illumos
458	ASSERT(prof->prof_cyclic == CYCLIC_NONE);
459#else
460	ASSERT(!callout_active(&prof->prof_cyclic) && prof->prof_pcpus == NULL);
461#endif
462	kmem_free(prof, sizeof (profile_probe_t));
463
464	ASSERT(profile_total >= 1);
465	atomic_add_32(&profile_total, -1);
466}
467
468#ifdef illumos
469/*ARGSUSED*/
470static void
471profile_online(void *arg, cpu_t *cpu, cyc_handler_t *hdlr, cyc_time_t *when)
472{
473	profile_probe_t *prof = arg;
474	profile_probe_percpu_t *pcpu;
475
476	pcpu = kmem_zalloc(sizeof (profile_probe_percpu_t), KM_SLEEP);
477	pcpu->profc_probe = prof;
478
479	hdlr->cyh_func = profile_fire;
480	hdlr->cyh_arg = pcpu;
481
482	when->cyt_interval = prof->prof_interval;
483	when->cyt_when = gethrtime() + when->cyt_interval;
484
485	pcpu->profc_expected = when->cyt_when;
486	pcpu->profc_interval = when->cyt_interval;
487}
488
/*
 * Omni-cyclic "offline" callback (illumos only), installed by
 * profile_enable().  'oarg' is the per-CPU state that profile_online()
 * allocated; it is freed here.
 */
/*ARGSUSED*/
static void
profile_offline(void *arg, cpu_t *cpu, void *oarg)
{
	profile_probe_percpu_t *pcpu = oarg;

	/* The per-CPU state must belong to the probe being taken offline. */
	ASSERT(pcpu->profc_probe == arg);
	kmem_free(pcpu, sizeof (profile_probe_percpu_t));
}
498
499/* ARGSUSED */
500static void
501profile_enable(void *arg, dtrace_id_t id, void *parg)
502{
503	profile_probe_t *prof = parg;
504	cyc_omni_handler_t omni;
505	cyc_handler_t hdlr;
506	cyc_time_t when;
507
508	ASSERT(prof->prof_interval != 0);
509	ASSERT(MUTEX_HELD(&cpu_lock));
510
511	if (prof->prof_kind == PROF_TICK) {
512		hdlr.cyh_func = profile_tick;
513		hdlr.cyh_arg = prof;
514
515		when.cyt_interval = prof->prof_interval;
516		when.cyt_when = gethrtime() + when.cyt_interval;
517	} else {
518		ASSERT(prof->prof_kind == PROF_PROFILE);
519		omni.cyo_online = profile_online;
520		omni.cyo_offline = profile_offline;
521		omni.cyo_arg = prof;
522	}
523
524	if (prof->prof_kind == PROF_TICK) {
525		prof->prof_cyclic = cyclic_add(&hdlr, &when);
526	} else {
527		prof->prof_cyclic = cyclic_add_omni(&omni);
528	}
529}
530
/*
 * DTrace "disable" entry point (illumos variant).  Removes the cyclic that
 * profile_enable() installed and marks the probe as having none.  The
 * caller must hold cpu_lock.
 */
/* ARGSUSED */
static void
profile_disable(void *arg, dtrace_id_t id, void *parg)
{
	profile_probe_t *prof = parg;

	ASSERT(prof->prof_cyclic != CYCLIC_NONE);
	ASSERT(MUTEX_HELD(&cpu_lock));

	cyclic_remove(prof->prof_cyclic);
	/* profile_destroy() asserts that the probe is back in this state. */
	prof->prof_cyclic = CYCLIC_NONE;
}
543
544#else
545
546static void
547profile_enable_omni(profile_probe_t *prof)
548{
549	profile_probe_percpu_t *pcpu;
550	int cpu;
551
552	prof->prof_pcpus = kmem_zalloc((mp_maxid + 1) * sizeof(pcpu), KM_SLEEP);
553	CPU_FOREACH(cpu) {
554		pcpu = kmem_zalloc(sizeof(profile_probe_percpu_t), KM_SLEEP);
555		prof->prof_pcpus[cpu] = pcpu;
556		pcpu->profc_probe = prof;
557		pcpu->profc_expected = sbinuptime() + prof->prof_interval;
558		pcpu->profc_interval = prof->prof_interval;
559		callout_init(&pcpu->profc_cyclic, 1);
560		callout_reset_sbt_on(&pcpu->profc_cyclic,
561		    pcpu->profc_expected, 0, profile_fire, pcpu,
562		    cpu, C_DIRECT_EXEC | C_ABSOLUTE);
563	}
564}
565
566static void
567profile_disable_omni(profile_probe_t *prof)
568{
569	profile_probe_percpu_t *pcpu;
570	int cpu;
571
572	ASSERT(prof->prof_pcpus != NULL);
573	CPU_FOREACH(cpu) {
574		pcpu = prof->prof_pcpus[cpu];
575		ASSERT(pcpu->profc_probe == prof);
576		ASSERT(callout_active(&pcpu->profc_cyclic));
577		callout_stop(&pcpu->profc_cyclic);
578		callout_drain(&pcpu->profc_cyclic);
579		kmem_free(pcpu, sizeof(profile_probe_percpu_t));
580	}
581	kmem_free(prof->prof_pcpus, (mp_maxid + 1) * sizeof(pcpu));
582	prof->prof_pcpus = NULL;
583}
584
585/* ARGSUSED */
586static void
587profile_enable(void *arg, dtrace_id_t id, void *parg)
588{
589	profile_probe_t *prof = parg;
590
591	if (prof->prof_kind == PROF_TICK) {
592		prof->prof_expected = sbinuptime() + prof->prof_interval;
593		callout_reset_sbt(&prof->prof_cyclic,
594		    prof->prof_expected, 0, profile_tick, prof,
595		    C_DIRECT_EXEC | C_ABSOLUTE);
596	} else {
597		ASSERT(prof->prof_kind == PROF_PROFILE);
598		profile_enable_omni(prof);
599	}
600}
601
602/* ARGSUSED */
603static void
604profile_disable(void *arg, dtrace_id_t id, void *parg)
605{
606	profile_probe_t *prof = parg;
607
608	if (prof->prof_kind == PROF_TICK) {
609		ASSERT(callout_active(&prof->prof_cyclic));
610		callout_stop(&prof->prof_cyclic);
611		callout_drain(&prof->prof_cyclic);
612	} else {
613		ASSERT(prof->prof_kind == PROF_PROFILE);
614		profile_disable_omni(prof);
615	}
616}
617#endif
618
619static void
620profile_load(void *dummy)
621{
622	if (dtrace_register("profile", &profile_attr, DTRACE_PRIV_USER,
623	    NULL, &profile_pops, NULL, &profile_id) != 0)
624		return;
625}
626
627
628static int
629profile_unload(void)
630{
631	int error = 0;
632
633	if ((error = dtrace_unregister(profile_id)) != 0)
634		return (error);
635
636	return (error);
637}
638
639/* ARGSUSED */
640static int
641profile_modevent(module_t mod __unused, int type, void *data __unused)
642{
643	int error = 0;
644
645	switch (type) {
646	case MOD_LOAD:
647		break;
648
649	case MOD_UNLOAD:
650		break;
651
652	case MOD_SHUTDOWN:
653		break;
654
655	default:
656		error = EOPNOTSUPP;
657		break;
658
659	}
660	return (error);
661}
662
/*
 * Provider registration and removal are hooked in through SYSINIT/SYSUNINIT
 * at SI_SUB_DTRACE_PROVIDER rather than through profile_modevent(), which
 * does no work of its own.
 */
SYSINIT(profile_load, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY, profile_load, NULL);
SYSUNINIT(profile_unload, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY, profile_unload, NULL);

/* Module declaration, version, and dependencies on dtrace and opensolaris. */
DEV_MODULE(profile, profile_modevent, NULL);
MODULE_VERSION(profile, 1);
MODULE_DEPEND(profile, dtrace, 1, 1, 1);
MODULE_DEPEND(profile, opensolaris, 1, 1, 1);
670