1179237Sjb/*
2179237Sjb * CDDL HEADER START
3179237Sjb *
4179237Sjb * The contents of this file are subject to the terms of the
5179237Sjb * Common Development and Distribution License (the "License").
6179237Sjb * You may not use this file except in compliance with the License.
7179237Sjb *
8179237Sjb * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9179237Sjb * or http://www.opensolaris.org/os/licensing.
10179237Sjb * See the License for the specific language governing permissions
11179237Sjb * and limitations under the License.
12179237Sjb *
13179237Sjb * When distributing Covered Code, include this CDDL HEADER in each
14179237Sjb * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15179237Sjb * If applicable, add the following below this CDDL HEADER, with the
16179237Sjb * fields enclosed by brackets "[]" replaced with your own identifying
17179237Sjb * information: Portions Copyright [yyyy] [name of copyright owner]
18179237Sjb *
19179237Sjb * CDDL HEADER END
20179237Sjb *
21179237Sjb * Portions Copyright 2006-2008 John Birrell jb@freebsd.org
22179237Sjb *
23179237Sjb * $FreeBSD: stable/11/sys/cddl/dev/profile/profile.c 324282 2017-10-04 15:47:16Z markj $
24179237Sjb *
25179237Sjb */
26179237Sjb
27179237Sjb/*
28179237Sjb * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
29179237Sjb * Use is subject to license terms.
30179237Sjb */
31179237Sjb
32179237Sjb#include <sys/cdefs.h>
33179237Sjb#include <sys/param.h>
34179237Sjb#include <sys/systm.h>
35179237Sjb#include <sys/conf.h>
36179237Sjb#include <sys/cpuvar.h>
37179237Sjb#include <sys/fcntl.h>
38179237Sjb#include <sys/filio.h>
39179237Sjb#include <sys/kdb.h>
40179237Sjb#include <sys/kernel.h>
41179237Sjb#include <sys/kmem.h>
42179237Sjb#include <sys/kthread.h>
43179237Sjb#include <sys/limits.h>
44179237Sjb#include <sys/linker.h>
45179237Sjb#include <sys/lock.h>
46179237Sjb#include <sys/malloc.h>
47179237Sjb#include <sys/module.h>
48179237Sjb#include <sys/mutex.h>
49179237Sjb#include <sys/poll.h>
50179237Sjb#include <sys/proc.h>
51179237Sjb#include <sys/selinfo.h>
52179237Sjb#include <sys/smp.h>
53291855Sandrew#include <sys/sysctl.h>
54179237Sjb#include <sys/uio.h>
55179237Sjb#include <sys/unistd.h>
56275576Savg#include <machine/cpu.h>
57179237Sjb#include <machine/stdarg.h>
58179237Sjb
59179237Sjb#include <sys/dtrace.h>
60179237Sjb#include <sys/dtrace_bsd.h>
61179237Sjb
/* Size, in bytes, of the buffers that hold a profile probe name. */
#define	PROF_NAMELEN		15

/* Probe kinds, stored in profile_probe_t.prof_kind. */
#define	PROF_PROFILE		0
#define	PROF_TICK		1

/* Probe-name prefixes that select the corresponding probe kind. */
#define	PROF_PREFIX_PROFILE	"profile-"
#define	PROF_PREFIX_TICK	"tick-"
68179237Sjb
69179237Sjb/*
70179237Sjb * Regardless of platform, there are five artificial frames in the case of the
71179237Sjb * profile provider:
72179237Sjb *
73179237Sjb *	profile_fire
74179237Sjb *	cyclic_expire
75179237Sjb *	cyclic_fire
76179237Sjb *	[ cbe ]
77179237Sjb *	[ locore ]
78179237Sjb *
79179237Sjb * On amd64, there are two frames associated with locore:  one in locore, and
80179237Sjb * another in common interrupt dispatch code.  (i386 has not been modified to
81179237Sjb * use this common layer.)  Further, on i386, the interrupted instruction
82179237Sjb * appears as its own stack frame.  All of this means that we need to add one
83179237Sjb * frame for amd64, and then take one away for both amd64 and i386.
84179237Sjb *
85179237Sjb * On SPARC, the picture is further complicated because the compiler
86179237Sjb * optimizes away tail-calls -- so the following frames are optimized away:
87179237Sjb *
88179237Sjb * 	profile_fire
89179237Sjb *	cyclic_expire
90179237Sjb *
91179237Sjb * This gives three frames.  However, on DEBUG kernels, the cyclic_expire
92179237Sjb * frame cannot be tail-call eliminated, yielding four frames in this case.
93179237Sjb *
94179237Sjb * All of the above constraints lead to the mess below.  Yes, the profile
95179237Sjb * provider should ideally figure this out on-the-fly by hitting one of its own
96179237Sjb * probes and then walking its own stack trace.  This is complicated, however,
97179237Sjb * and the static definition doesn't seem to be overly brittle.  Still, we
98179237Sjb * allow for a manual override in case we get it completely wrong.
99179237Sjb */
/*
 * Default number of artificial stack frames for the profile provider,
 * selected per architecture.  It seeds profile_aframes, which is handed to
 * dtrace_probe_create().  Exactly one branch is taken; the values marked
 * "bogus" or "TODO" have not been verified on that platform.
 */
#if defined(__amd64)
#define	PROF_ARTIFICIAL_FRAMES	10
#elif defined(__i386)
#define	PROF_ARTIFICIAL_FRAMES	6
#elif defined(__sparc)
#ifdef DEBUG
#define	PROF_ARTIFICIAL_FRAMES	4
#else
#define	PROF_ARTIFICIAL_FRAMES	3
#endif
#elif defined(__mips)
/*
 * This value is bogus just to make module compilable on mips
 */
#define	PROF_ARTIFICIAL_FRAMES	3
#elif defined(__powerpc__)
/*
 * This value is bogus just to make module compilable on powerpc
 */
#define	PROF_ARTIFICIAL_FRAMES	3
#elif defined(__arm__)
#define	PROF_ARTIFICIAL_FRAMES	3
#elif defined(__aarch64__)
/* TODO: verify */
#define	PROF_ARTIFICIAL_FRAMES	10
#elif defined(__riscv__)
/* TODO: verify */
#define	PROF_ARTIFICIAL_FRAMES	10
#endif

/*
 * Fail with a clear message here instead of an "undeclared identifier"
 * error where the macro is first used.
 */
#ifndef PROF_ARTIFICIAL_FRAMES
#error "PROF_ARTIFICIAL_FRAMES is not defined for this architecture"
#endif

/* Forward declaration; profile_probe_t points at the per-CPU state. */
struct profile_probe_percpu;
150300618Sbr
151179237Sjbtypedef struct profile_probe {
152179237Sjb	char		prof_name[PROF_NAMELEN];
153179237Sjb	dtrace_id_t	prof_id;
154179237Sjb	int		prof_kind;
155275576Savg#ifdef illumos
156179237Sjb	hrtime_t	prof_interval;
157179237Sjb	cyclic_id_t	prof_cyclic;
158275576Savg#else
159275576Savg	sbintime_t	prof_interval;
160275576Savg	struct callout	prof_cyclic;
161275576Savg	sbintime_t	prof_expected;
162275576Savg	struct profile_probe_percpu **prof_pcpus;
163275576Savg#endif
164179237Sjb} profile_probe_t;
165179237Sjb
166179237Sjbtypedef struct profile_probe_percpu {
167179237Sjb	hrtime_t	profc_expected;
168179237Sjb	hrtime_t	profc_interval;
169179237Sjb	profile_probe_t	*profc_probe;
170275576Savg#ifdef __FreeBSD__
171275576Savg	struct callout	profc_cyclic;
172275576Savg#endif
173179237Sjb} profile_probe_percpu_t;
174179237Sjb
175179237Sjbstatic d_open_t	profile_open;
176179237Sjbstatic int	profile_unload(void);
177179237Sjbstatic void	profile_create(hrtime_t, char *, int);
178179237Sjbstatic void	profile_destroy(void *, dtrace_id_t, void *);
179179237Sjbstatic void	profile_enable(void *, dtrace_id_t, void *);
180179237Sjbstatic void	profile_disable(void *, dtrace_id_t, void *);
181179237Sjbstatic void	profile_load(void *);
182179237Sjbstatic void	profile_provide(void *, dtrace_probedesc_t *);
183179237Sjb
/*
 * Rates (in hz) of the profile-<rate> probes created when the provider is
 * asked for all of its probes.  A zero entry is skipped; the array keeps
 * its 20 slots, the trailing ones being zero-filled.
 */
static int profile_rates[20] = {
	97, 199, 499, 997, 1999, 4001, 4999
};

/*
 * Rates (in hz) of the default tick-<rate> probes.  A zero entry is
 * skipped; the array keeps its 15 slots.
 */
static int profile_ticks[15] = {
	1, 10, 100, 500, 1000, 5000
};
196179237Sjb
197179237Sjb/*
198179237Sjb * profile_max defines the upper bound on the number of profile probes that
199179237Sjb * can exist (this is to prevent malicious or clumsy users from exhausting
200179237Sjb * system resources by creating a slew of profile probes). At mod load time,
201179237Sjb * this gets its value from PROFILE_MAX_DEFAULT or profile-max-probes if it's
202179237Sjb * present in the profile.conf file.
203179237Sjb */
#define	PROFILE_MAX_DEFAULT	1000	/* default max. number of probes */

/* Maximum number of profile probes; checked in profile_create(). */
static uint32_t profile_max = PROFILE_MAX_DEFAULT;

/* Current number of profile probes; updated with atomic_add_32(). */
static uint32_t profile_total;
208179237Sjb
209179237Sjbstatic struct cdevsw profile_cdevsw = {
210179237Sjb	.d_version	= D_VERSION,
211179237Sjb	.d_open		= profile_open,
212179237Sjb	.d_name		= "profile",
213179237Sjb};
214179237Sjb
215179237Sjbstatic dtrace_pattr_t profile_attr = {
216179237Sjb{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
217179237Sjb{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
218179237Sjb{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
219179237Sjb{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
220179237Sjb{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
221179237Sjb};
222179237Sjb
223179237Sjbstatic dtrace_pops_t profile_pops = {
224324282Smarkj	.dtps_provide =		profile_provide,
225324282Smarkj	.dtps_provide_module =	NULL,
226324282Smarkj	.dtps_enable =		profile_enable,
227324282Smarkj	.dtps_disable =		profile_disable,
228324282Smarkj	.dtps_suspend =		NULL,
229324282Smarkj	.dtps_resume =		NULL,
230324282Smarkj	.dtps_getargdesc =	NULL,
231324282Smarkj	.dtps_getargval =	NULL,
232324282Smarkj	.dtps_usermode =	NULL,
233324282Smarkj	.dtps_destroy =		profile_destroy
234179237Sjb};
235179237Sjb
236179237Sjbstatic struct cdev		*profile_cdev;
237179237Sjbstatic dtrace_provider_id_t	profile_id;
238179237Sjbstatic hrtime_t			profile_interval_min = NANOSEC / 5000;	/* 5000 hz */
239291855Sandrewstatic int			profile_aframes = PROF_ARTIFICIAL_FRAMES;
240179237Sjb
241291855SandrewSYSCTL_DECL(_kern_dtrace);
242291855SandrewSYSCTL_NODE(_kern_dtrace, OID_AUTO, profile, CTLFLAG_RD, 0, "DTrace profile parameters");
243291855SandrewSYSCTL_INT(_kern_dtrace_profile, OID_AUTO, aframes, CTLFLAG_RW, &profile_aframes,
244291855Sandrew    0, "Skipped frames for profile provider");
245291855Sandrew
246275576Savgstatic sbintime_t
247275576Savgnsec_to_sbt(hrtime_t nsec)
248275576Savg{
249275576Savg	time_t sec;
250275576Savg
251275576Savg	/*
252275576Savg	 * We need to calculate nsec * 2^32 / 10^9
253275576Savg	 * Seconds and nanoseconds are split to avoid overflow.
254275576Savg	 */
255275576Savg	sec = nsec / NANOSEC;
256275576Savg	nsec = nsec % NANOSEC;
257275576Savg	return (((sbintime_t)sec << 32) | ((sbintime_t)nsec << 32) / NANOSEC);
258275576Savg}
259275576Savg
260275576Savgstatic hrtime_t
261275576Savgsbt_to_nsec(sbintime_t sbt)
262275576Savg{
263275576Savg
264275576Savg	return ((sbt >> 32) * NANOSEC +
265275576Savg	    (((uint32_t)sbt * (hrtime_t)NANOSEC) >> 32));
266275576Savg}
267275576Savg
268179237Sjbstatic void
269179237Sjbprofile_fire(void *arg)
270179237Sjb{
271179237Sjb	profile_probe_percpu_t *pcpu = arg;
272179237Sjb	profile_probe_t *prof = pcpu->profc_probe;
273179237Sjb	hrtime_t late;
274275576Savg	struct trapframe *frame;
275275576Savg	uintfptr_t pc, upc;
276179237Sjb
277275576Savg#ifdef illumos
278179237Sjb	late = gethrtime() - pcpu->profc_expected;
279275576Savg#else
280275576Savg	late = sbt_to_nsec(sbinuptime() - pcpu->profc_expected);
281275576Savg#endif
282275576Savg
283275576Savg	pc = 0;
284275576Savg	upc = 0;
285275576Savg
286275576Savg	/*
287275576Savg	 * td_intr_frame can be unset if this is a catch up event
288275576Savg	 * after waking up from idle sleep.
289275576Savg	 * This can only happen on a CPU idle thread.
290275576Savg	 */
291275576Savg	frame = curthread->td_intr_frame;
292275576Savg	if (frame != NULL) {
293275576Savg		if (TRAPF_USERMODE(frame))
294275576Savg			upc = TRAPF_PC(frame);
295275576Savg		else
296275576Savg			pc = TRAPF_PC(frame);
297275576Savg	}
298275576Savg	dtrace_probe(prof->prof_id, pc, upc, late, 0, 0);
299275576Savg
300179237Sjb	pcpu->profc_expected += pcpu->profc_interval;
301275576Savg	callout_schedule_sbt_curcpu(&pcpu->profc_cyclic,
302275576Savg	    pcpu->profc_expected, 0, C_DIRECT_EXEC | C_ABSOLUTE);
303179237Sjb}
304179237Sjb
305179237Sjbstatic void
306179237Sjbprofile_tick(void *arg)
307179237Sjb{
308179237Sjb	profile_probe_t *prof = arg;
309275576Savg	struct trapframe *frame;
310275576Savg	uintfptr_t pc, upc;
311179237Sjb
312275576Savg	pc = 0;
313275576Savg	upc = 0;
314275576Savg
315275576Savg	/*
316275576Savg	 * td_intr_frame can be unset if this is a catch up event
317275576Savg	 * after waking up from idle sleep.
318275576Savg	 * This can only happen on a CPU idle thread.
319275576Savg	 */
320275576Savg	frame = curthread->td_intr_frame;
321275576Savg	if (frame != NULL) {
322275576Savg		if (TRAPF_USERMODE(frame))
323275576Savg			upc = TRAPF_PC(frame);
324275576Savg		else
325275576Savg			pc = TRAPF_PC(frame);
326275576Savg	}
327275576Savg	dtrace_probe(prof->prof_id, pc, upc, 0, 0, 0);
328275576Savg
329275576Savg	prof->prof_expected += prof->prof_interval;
330275576Savg	callout_schedule_sbt(&prof->prof_cyclic,
331275576Savg	    prof->prof_expected, 0, C_DIRECT_EXEC | C_ABSOLUTE);
332179237Sjb}
333179237Sjb
334179237Sjbstatic void
335179237Sjbprofile_create(hrtime_t interval, char *name, int kind)
336179237Sjb{
337179237Sjb	profile_probe_t *prof;
338179237Sjb
339179237Sjb	if (interval < profile_interval_min)
340179237Sjb		return;
341179237Sjb
342179237Sjb	if (dtrace_probe_lookup(profile_id, NULL, NULL, name) != 0)
343179237Sjb		return;
344179237Sjb
345179237Sjb	atomic_add_32(&profile_total, 1);
346179237Sjb	if (profile_total > profile_max) {
347179237Sjb		atomic_add_32(&profile_total, -1);
348179237Sjb		return;
349179237Sjb	}
350179237Sjb
351179237Sjb	prof = kmem_zalloc(sizeof (profile_probe_t), KM_SLEEP);
352179237Sjb	(void) strcpy(prof->prof_name, name);
353275576Savg#ifdef illumos
354179237Sjb	prof->prof_interval = interval;
355179237Sjb	prof->prof_cyclic = CYCLIC_NONE;
356275576Savg#else
357275576Savg	prof->prof_interval = nsec_to_sbt(interval);
358283291Sjkim	callout_init(&prof->prof_cyclic, 1);
359275576Savg#endif
360179237Sjb	prof->prof_kind = kind;
361179237Sjb	prof->prof_id = dtrace_probe_create(profile_id,
362179237Sjb	    NULL, NULL, name,
363291855Sandrew	    profile_aframes, prof);
364179237Sjb}
365179237Sjb
366179237Sjb/*ARGSUSED*/
367179237Sjbstatic void
368179237Sjbprofile_provide(void *arg, dtrace_probedesc_t *desc)
369179237Sjb{
370179237Sjb	int i, j, rate, kind;
371179237Sjb	hrtime_t val = 0, mult = 1, len = 0;
372179237Sjb	char *name, *suffix = NULL;
373179237Sjb
374179237Sjb	const struct {
375179237Sjb		char *prefix;
376179237Sjb		int kind;
377179237Sjb	} types[] = {
378179237Sjb		{ PROF_PREFIX_PROFILE, PROF_PROFILE },
379179237Sjb		{ PROF_PREFIX_TICK, PROF_TICK },
380179237Sjb		{ 0, 0 }
381179237Sjb	};
382179237Sjb
383179237Sjb	const struct {
384179237Sjb		char *name;
385179237Sjb		hrtime_t mult;
386179237Sjb	} suffixes[] = {
387179237Sjb		{ "ns", 	NANOSEC / NANOSEC },
388179237Sjb		{ "nsec",	NANOSEC / NANOSEC },
389179237Sjb		{ "us",		NANOSEC / MICROSEC },
390179237Sjb		{ "usec",	NANOSEC / MICROSEC },
391179237Sjb		{ "ms",		NANOSEC / MILLISEC },
392179237Sjb		{ "msec",	NANOSEC / MILLISEC },
393179237Sjb		{ "s",		NANOSEC / SEC },
394179237Sjb		{ "sec",	NANOSEC / SEC },
395179237Sjb		{ "m",		NANOSEC * (hrtime_t)60 },
396179237Sjb		{ "min",	NANOSEC * (hrtime_t)60 },
397179237Sjb		{ "h",		NANOSEC * (hrtime_t)(60 * 60) },
398179237Sjb		{ "hour",	NANOSEC * (hrtime_t)(60 * 60) },
399179237Sjb		{ "d",		NANOSEC * (hrtime_t)(24 * 60 * 60) },
400179237Sjb		{ "day",	NANOSEC * (hrtime_t)(24 * 60 * 60) },
401179237Sjb		{ "hz",		0 },
402179237Sjb		{ NULL }
403179237Sjb	};
404179237Sjb
405179237Sjb	if (desc == NULL) {
406179237Sjb		char n[PROF_NAMELEN];
407179237Sjb
408179237Sjb		/*
409179237Sjb		 * If no description was provided, provide all of our probes.
410179237Sjb		 */
411179237Sjb		for (i = 0; i < sizeof (profile_rates) / sizeof (int); i++) {
412179237Sjb			if ((rate = profile_rates[i]) == 0)
413179237Sjb				continue;
414179237Sjb
415179237Sjb			(void) snprintf(n, PROF_NAMELEN, "%s%d",
416179237Sjb			    PROF_PREFIX_PROFILE, rate);
417179237Sjb			profile_create(NANOSEC / rate, n, PROF_PROFILE);
418179237Sjb		}
419179237Sjb
420179237Sjb		for (i = 0; i < sizeof (profile_ticks) / sizeof (int); i++) {
421179237Sjb			if ((rate = profile_ticks[i]) == 0)
422179237Sjb				continue;
423179237Sjb
424179237Sjb			(void) snprintf(n, PROF_NAMELEN, "%s%d",
425179237Sjb			    PROF_PREFIX_TICK, rate);
426179237Sjb			profile_create(NANOSEC / rate, n, PROF_TICK);
427179237Sjb		}
428179237Sjb
429179237Sjb		return;
430179237Sjb	}
431179237Sjb
432179237Sjb	name = desc->dtpd_name;
433179237Sjb
434179237Sjb	for (i = 0; types[i].prefix != NULL; i++) {
435179237Sjb		len = strlen(types[i].prefix);
436179237Sjb
437179237Sjb		if (strncmp(name, types[i].prefix, len) != 0)
438179237Sjb			continue;
439179237Sjb		break;
440179237Sjb	}
441179237Sjb
442179237Sjb	if (types[i].prefix == NULL)
443179237Sjb		return;
444179237Sjb
445179237Sjb	kind = types[i].kind;
446179237Sjb	j = strlen(name) - len;
447179237Sjb
448179237Sjb	/*
449179237Sjb	 * We need to start before any time suffix.
450179237Sjb	 */
451179237Sjb	for (j = strlen(name); j >= len; j--) {
452179237Sjb		if (name[j] >= '0' && name[j] <= '9')
453179237Sjb			break;
454179237Sjb		suffix = &name[j];
455179237Sjb	}
456179237Sjb
457179237Sjb	ASSERT(suffix != NULL);
458179237Sjb
459179237Sjb	/*
460179237Sjb	 * Now determine the numerical value present in the probe name.
461179237Sjb	 */
462179237Sjb	for (; j >= len; j--) {
463179237Sjb		if (name[j] < '0' || name[j] > '9')
464179237Sjb			return;
465179237Sjb
466179237Sjb		val += (name[j] - '0') * mult;
467179237Sjb		mult *= (hrtime_t)10;
468179237Sjb	}
469179237Sjb
470179237Sjb	if (val == 0)
471179237Sjb		return;
472179237Sjb
473179237Sjb	/*
474179237Sjb	 * Look-up the suffix to determine the multiplier.
475179237Sjb	 */
476179237Sjb	for (i = 0, mult = 0; suffixes[i].name != NULL; i++) {
477179237Sjb		if (strcasecmp(suffixes[i].name, suffix) == 0) {
478179237Sjb			mult = suffixes[i].mult;
479179237Sjb			break;
480179237Sjb		}
481179237Sjb	}
482179237Sjb
483179237Sjb	if (suffixes[i].name == NULL && *suffix != '\0')
484179237Sjb		return;
485179237Sjb
486179237Sjb	if (mult == 0) {
487179237Sjb		/*
488179237Sjb		 * The default is frequency-per-second.
489179237Sjb		 */
490179237Sjb		val = NANOSEC / val;
491179237Sjb	} else {
492179237Sjb		val *= mult;
493179237Sjb	}
494179237Sjb
495179237Sjb	profile_create(val, name, kind);
496179237Sjb}
497179237Sjb
498179237Sjb/* ARGSUSED */
499179237Sjbstatic void
500179237Sjbprofile_destroy(void *arg, dtrace_id_t id, void *parg)
501179237Sjb{
502179237Sjb	profile_probe_t *prof = parg;
503179237Sjb
504275576Savg#ifdef illumos
505179237Sjb	ASSERT(prof->prof_cyclic == CYCLIC_NONE);
506275576Savg#else
507275576Savg	ASSERT(!callout_active(&prof->prof_cyclic) && prof->prof_pcpus == NULL);
508275576Savg#endif
509179237Sjb	kmem_free(prof, sizeof (profile_probe_t));
510179237Sjb
511179237Sjb	ASSERT(profile_total >= 1);
512179237Sjb	atomic_add_32(&profile_total, -1);
513179237Sjb}
514179237Sjb
515275576Savg#ifdef illumos
516179237Sjb/*ARGSUSED*/
517179237Sjbstatic void
518179237Sjbprofile_online(void *arg, cpu_t *cpu, cyc_handler_t *hdlr, cyc_time_t *when)
519179237Sjb{
520179237Sjb	profile_probe_t *prof = arg;
521179237Sjb	profile_probe_percpu_t *pcpu;
522179237Sjb
523179237Sjb	pcpu = kmem_zalloc(sizeof (profile_probe_percpu_t), KM_SLEEP);
524179237Sjb	pcpu->profc_probe = prof;
525179237Sjb
526179237Sjb	hdlr->cyh_func = profile_fire;
527179237Sjb	hdlr->cyh_arg = pcpu;
528179237Sjb
529179237Sjb	when->cyt_interval = prof->prof_interval;
530179237Sjb	when->cyt_when = gethrtime() + when->cyt_interval;
531179237Sjb
532179237Sjb	pcpu->profc_expected = when->cyt_when;
533179237Sjb	pcpu->profc_interval = when->cyt_interval;
534179237Sjb}
535179237Sjb
536179237Sjb/*ARGSUSED*/
537179237Sjbstatic void
538179237Sjbprofile_offline(void *arg, cpu_t *cpu, void *oarg)
539179237Sjb{
540179237Sjb	profile_probe_percpu_t *pcpu = oarg;
541179237Sjb
542179237Sjb	ASSERT(pcpu->profc_probe == arg);
543179237Sjb	kmem_free(pcpu, sizeof (profile_probe_percpu_t));
544179237Sjb}
545179237Sjb
546179237Sjb/* ARGSUSED */
547179237Sjbstatic void
548179237Sjbprofile_enable(void *arg, dtrace_id_t id, void *parg)
549179237Sjb{
550179237Sjb	profile_probe_t *prof = parg;
551179237Sjb	cyc_omni_handler_t omni;
552179237Sjb	cyc_handler_t hdlr;
553179237Sjb	cyc_time_t when;
554179237Sjb
555179237Sjb	ASSERT(prof->prof_interval != 0);
556179237Sjb	ASSERT(MUTEX_HELD(&cpu_lock));
557179237Sjb
558179237Sjb	if (prof->prof_kind == PROF_TICK) {
559179237Sjb		hdlr.cyh_func = profile_tick;
560179237Sjb		hdlr.cyh_arg = prof;
561179237Sjb
562179237Sjb		when.cyt_interval = prof->prof_interval;
563179237Sjb		when.cyt_when = gethrtime() + when.cyt_interval;
564179237Sjb	} else {
565179237Sjb		ASSERT(prof->prof_kind == PROF_PROFILE);
566179237Sjb		omni.cyo_online = profile_online;
567179237Sjb		omni.cyo_offline = profile_offline;
568179237Sjb		omni.cyo_arg = prof;
569179237Sjb	}
570179237Sjb
571179237Sjb	if (prof->prof_kind == PROF_TICK) {
572179237Sjb		prof->prof_cyclic = cyclic_add(&hdlr, &when);
573179237Sjb	} else {
574179237Sjb		prof->prof_cyclic = cyclic_add_omni(&omni);
575179237Sjb	}
576179237Sjb}
577179237Sjb
578179237Sjb/* ARGSUSED */
579179237Sjbstatic void
580179237Sjbprofile_disable(void *arg, dtrace_id_t id, void *parg)
581179237Sjb{
582179237Sjb	profile_probe_t *prof = parg;
583179237Sjb
584179237Sjb	ASSERT(prof->prof_cyclic != CYCLIC_NONE);
585179237Sjb	ASSERT(MUTEX_HELD(&cpu_lock));
586179237Sjb
587179237Sjb	cyclic_remove(prof->prof_cyclic);
588179237Sjb	prof->prof_cyclic = CYCLIC_NONE;
589179237Sjb}
590179237Sjb
591275576Savg#else
592275576Savg
593179237Sjbstatic void
594275576Savgprofile_enable_omni(profile_probe_t *prof)
595275576Savg{
596275576Savg	profile_probe_percpu_t *pcpu;
597275576Savg	int cpu;
598275576Savg
599275576Savg	prof->prof_pcpus = kmem_zalloc((mp_maxid + 1) * sizeof(pcpu), KM_SLEEP);
600275576Savg	CPU_FOREACH(cpu) {
601275576Savg		pcpu = kmem_zalloc(sizeof(profile_probe_percpu_t), KM_SLEEP);
602275576Savg		prof->prof_pcpus[cpu] = pcpu;
603275576Savg		pcpu->profc_probe = prof;
604275576Savg		pcpu->profc_expected = sbinuptime() + prof->prof_interval;
605275576Savg		pcpu->profc_interval = prof->prof_interval;
606283291Sjkim		callout_init(&pcpu->profc_cyclic, 1);
607275576Savg		callout_reset_sbt_on(&pcpu->profc_cyclic,
608275576Savg		    pcpu->profc_expected, 0, profile_fire, pcpu,
609275576Savg		    cpu, C_DIRECT_EXEC | C_ABSOLUTE);
610275576Savg	}
611275576Savg}
612275576Savg
613275576Savgstatic void
614275576Savgprofile_disable_omni(profile_probe_t *prof)
615275576Savg{
616275576Savg	profile_probe_percpu_t *pcpu;
617275576Savg	int cpu;
618275576Savg
619275576Savg	ASSERT(prof->prof_pcpus != NULL);
620275576Savg	CPU_FOREACH(cpu) {
621275576Savg		pcpu = prof->prof_pcpus[cpu];
622275576Savg		ASSERT(pcpu->profc_probe == prof);
623275576Savg		ASSERT(callout_active(&pcpu->profc_cyclic));
624275576Savg		callout_stop(&pcpu->profc_cyclic);
625275576Savg		callout_drain(&pcpu->profc_cyclic);
626275576Savg		kmem_free(pcpu, sizeof(profile_probe_percpu_t));
627275576Savg	}
628275576Savg	kmem_free(prof->prof_pcpus, (mp_maxid + 1) * sizeof(pcpu));
629275576Savg	prof->prof_pcpus = NULL;
630275576Savg}
631275576Savg
632275576Savg/* ARGSUSED */
633275576Savgstatic void
634275576Savgprofile_enable(void *arg, dtrace_id_t id, void *parg)
635275576Savg{
636275576Savg	profile_probe_t *prof = parg;
637275576Savg
638275576Savg	if (prof->prof_kind == PROF_TICK) {
639275576Savg		prof->prof_expected = sbinuptime() + prof->prof_interval;
640275576Savg		callout_reset_sbt(&prof->prof_cyclic,
641275576Savg		    prof->prof_expected, 0, profile_tick, prof,
642275576Savg		    C_DIRECT_EXEC | C_ABSOLUTE);
643275576Savg	} else {
644275576Savg		ASSERT(prof->prof_kind == PROF_PROFILE);
645275576Savg		profile_enable_omni(prof);
646275576Savg	}
647275576Savg}
648275576Savg
649275576Savg/* ARGSUSED */
650275576Savgstatic void
651275576Savgprofile_disable(void *arg, dtrace_id_t id, void *parg)
652275576Savg{
653275576Savg	profile_probe_t *prof = parg;
654275576Savg
655275576Savg	if (prof->prof_kind == PROF_TICK) {
656275576Savg		ASSERT(callout_active(&prof->prof_cyclic));
657275576Savg		callout_stop(&prof->prof_cyclic);
658275576Savg		callout_drain(&prof->prof_cyclic);
659275576Savg	} else {
660275576Savg		ASSERT(prof->prof_kind == PROF_PROFILE);
661275576Savg		profile_disable_omni(prof);
662275576Savg	}
663275576Savg}
664275576Savg#endif
665275576Savg
666275576Savgstatic void
667179237Sjbprofile_load(void *dummy)
668179237Sjb{
669179237Sjb	/* Create the /dev/dtrace/profile entry. */
670179237Sjb	profile_cdev = make_dev(&profile_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600,
671179237Sjb	    "dtrace/profile");
672179237Sjb
673179237Sjb	if (dtrace_register("profile", &profile_attr, DTRACE_PRIV_USER,
674179237Sjb	    NULL, &profile_pops, NULL, &profile_id) != 0)
675179237Sjb		return;
676179237Sjb}
677179237Sjb
678179237Sjb
679179237Sjbstatic int
680179237Sjbprofile_unload()
681179237Sjb{
682179237Sjb	int error = 0;
683179237Sjb
684179237Sjb	if ((error = dtrace_unregister(profile_id)) != 0)
685179237Sjb		return (error);
686179237Sjb
687179237Sjb	destroy_dev(profile_cdev);
688179237Sjb
689179237Sjb	return (error);
690179237Sjb}
691179237Sjb
692179237Sjb/* ARGSUSED */
693179237Sjbstatic int
694179237Sjbprofile_modevent(module_t mod __unused, int type, void *data __unused)
695179237Sjb{
696179237Sjb	int error = 0;
697179237Sjb
698179237Sjb	switch (type) {
699179237Sjb	case MOD_LOAD:
700179237Sjb		break;
701179237Sjb
702179237Sjb	case MOD_UNLOAD:
703179237Sjb		break;
704179237Sjb
705179237Sjb	case MOD_SHUTDOWN:
706179237Sjb		break;
707179237Sjb
708179237Sjb	default:
709179237Sjb		error = EOPNOTSUPP;
710179237Sjb		break;
711179237Sjb
712179237Sjb	}
713179237Sjb	return (error);
714179237Sjb}
715179237Sjb
716179237Sjb/* ARGSUSED */
717179237Sjbstatic int
718179237Sjbprofile_open(struct cdev *dev __unused, int oflags __unused, int devtype __unused, struct thread *td __unused)
719179237Sjb{
720179237Sjb	return (0);
721179237Sjb}
722179237Sjb
723179237SjbSYSINIT(profile_load, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY, profile_load, NULL);
724179237SjbSYSUNINIT(profile_unload, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY, profile_unload, NULL);
725179237Sjb
726179237SjbDEV_MODULE(profile, profile_modevent, NULL);
727179237SjbMODULE_VERSION(profile, 1);
728179237SjbMODULE_DEPEND(profile, dtrace, 1, 1, 1);
729179237SjbMODULE_DEPEND(profile, opensolaris, 1, 1, 1);
730