dtrace.c revision 179193
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * DTrace - Dynamic Tracing for Solaris
 *
 * This is the implementation of the Solaris Dynamic Tracing framework
 * (DTrace).  The user-visible interface to DTrace is described at length in
 * the "Solaris Dynamic Tracing Guide".  The interfaces between the libdtrace
 * library, the in-kernel DTrace framework, and the DTrace providers are
 * described in the block comments in the <sys/dtrace.h> header file.  The
 * internal architecture of DTrace is described in the block comments in the
 * <sys/dtrace_impl.h> header file.  The comments contained within the DTrace
 * implementation very much assume mastery of all of these sources; if one has
 * an unanswered question about the implementation, one should consult them
 * first.
 *
 * The functions here are ordered roughly as follows:
 *
 *   - Probe context functions
 *   - Probe hashing functions
 *   - Non-probe context utility functions
 *   - Matching functions
 *   - Provider-to-Framework API functions
 *   - Probe management functions
 *   - DIF object functions
 *   - Format functions
 *   - Predicate functions
 *   - ECB functions
 *   - Buffer functions
 *   - Enabling functions
 *   - DOF functions
 *   - Anonymous enabling functions
 *   - Consumer state functions
 *   - Helper functions
 *   - Hook functions
 *   - Driver cookbook functions
 *
 * Each group of functions begins with a block comment labelled the "DTrace
 * [Group] Functions", allowing one to find each block by searching forward
 * on capital-f functions.
 */
#include <sys/errno.h>
#include <sys/stat.h>
#include <sys/modctl.h>
#include <sys/conf.h>
#include <sys/systm.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/cpuvar.h>
#include <sys/kmem.h>
#include <sys/strsubr.h>
#include <sys/sysmacros.h>
#include <sys/dtrace_impl.h>
#include <sys/atomic.h>
#include <sys/cmn_err.h>
#include <sys/mutex_impl.h>
#include <sys/rwlock_impl.h>
#include <sys/ctf_api.h>
#include <sys/panic.h>
#include <sys/priv_impl.h>
#include <sys/policy.h>
#include <sys/cred_impl.h>
#include <sys/procfs_isa.h>
#include <sys/taskq.h>
#include <sys/mkdev.h>
#include <sys/kdi.h>
#include <sys/zone.h>
#include <sys/socket.h>
#include <netinet/in.h>

/*
 * DTrace Tunable Variables
 *
 * The following variables may be tuned by adding a line to /etc/system that
 * includes both the name of the DTrace module ("dtrace") and the name of the
 * variable.  For example:
 *
 *   set dtrace:dtrace_destructive_disallow = 1
 *
 * In general, the only variables that one should be tuning this way are those
 * that affect system-wide DTrace behavior, and for which the default behavior
 * is undesirable.  Most of these variables are tunable on a per-consumer
 * basis using DTrace options, and need not be tuned on a system-wide basis.
 * When tuning these variables, avoid pathological values; while some attempt
 * is made to verify the integrity of these variables, they are not considered
 * part of the supported interface to DTrace, and they are therefore not
 * checked comprehensively.  Further, these variables should not be tuned
 * dynamically via "mdb -kw" or other means; they should only be tuned via
 * /etc/system.
 */
int		dtrace_destructive_disallow = 0;
dtrace_optval_t	dtrace_nonroot_maxsize = (16 * 1024 * 1024);
size_t		dtrace_difo_maxsize = (256 * 1024);
dtrace_optval_t	dtrace_dof_maxsize = (256 * 1024);
size_t		dtrace_global_maxsize = (16 * 1024);
size_t		dtrace_actions_max = (16 * 1024);
size_t		dtrace_retain_max = 1024;
dtrace_optval_t	dtrace_helper_actions_max = 32;
dtrace_optval_t	dtrace_helper_providers_max = 32;
dtrace_optval_t	dtrace_dstate_defsize = (1 * 1024 * 1024);
size_t		dtrace_strsize_default = 256;
dtrace_optval_t	dtrace_cleanrate_default = 9900990;		/* 101 hz */
dtrace_optval_t	dtrace_cleanrate_min = 200000;			/* 5000 hz */
dtrace_optval_t	dtrace_cleanrate_max = (uint64_t)60 * NANOSEC;	/* 1/minute */
dtrace_optval_t	dtrace_aggrate_default = NANOSEC;		/* 1 hz */
dtrace_optval_t	dtrace_statusrate_default = NANOSEC;		/* 1 hz */
dtrace_optval_t dtrace_statusrate_max = (hrtime_t)10 * NANOSEC;	 /* 6/minute */
dtrace_optval_t	dtrace_switchrate_default = NANOSEC;		/* 1 hz */
dtrace_optval_t	dtrace_nspec_default = 1;
dtrace_optval_t	dtrace_specsize_default = 32 * 1024;
dtrace_optval_t dtrace_stackframes_default = 20;
dtrace_optval_t dtrace_ustackframes_default = 20;
dtrace_optval_t dtrace_jstackframes_default = 50;
dtrace_optval_t dtrace_jstackstrsize_default = 512;
int		dtrace_msgdsize_max = 128;
hrtime_t	dtrace_chill_max = 500 * (NANOSEC / MILLISEC);	/* 500 ms */
hrtime_t	dtrace_chill_interval = NANOSEC;		/* 1000 ms */
int		dtrace_devdepth_max = 32;
int		dtrace_err_verbose;
hrtime_t	dtrace_deadman_interval = NANOSEC;
hrtime_t	dtrace_deadman_timeout = (hrtime_t)10 * NANOSEC;
hrtime_t	dtrace_deadman_user = (hrtime_t)30 * NANOSEC;

/*
 * DTrace External Variables
 *
 * As dtrace(7D) is a kernel module, any DTrace variables are obviously
 * available to DTrace consumers via the backtick (`) syntax.  One of these,
 * dtrace_zero, is made deliberately so:  it is provided as a source of
 * well-known, zero-filled memory.  While this variable is not documented,
 * it is used by some translators as an implementation detail.
 */
const char	dtrace_zero[256] = { 0 };	/* zero-filled memory */

/*
 * DTrace Internal Variables
 */
static dev_info_t	*dtrace_devi;		/* device info */
static vmem_t		*dtrace_arena;		/* probe ID arena */
static vmem_t		*dtrace_minor;		/* minor number arena */
static taskq_t		*dtrace_taskq;		/* task queue */
static dtrace_probe_t	**dtrace_probes;	/* array of all probes */
static int		dtrace_nprobes;		/* number of probes */
static dtrace_provider_t *dtrace_provider;	/* provider list */
static dtrace_meta_t	*dtrace_meta_pid;	/* user-land meta provider */
static int		dtrace_opens;		/* number of opens */
static int		dtrace_helpers;		/* number of helpers */
static void		*dtrace_softstate;	/* softstate pointer */
static dtrace_hash_t	*dtrace_bymod;		/* probes hashed by module */
static dtrace_hash_t	*dtrace_byfunc;		/* probes hashed by function */
static dtrace_hash_t	*dtrace_byname;		/* probes hashed by name */
static dtrace_toxrange_t *dtrace_toxrange;	/* toxic range array */
static int		dtrace_toxranges;	/* number of toxic ranges */
static int		dtrace_toxranges_max;	/* size of toxic range array */
static dtrace_anon_t	dtrace_anon;		/* anonymous enabling */
static kmem_cache_t	*dtrace_state_cache;	/* cache for dynamic state */
static uint64_t		dtrace_vtime_references; /* number of vtimestamp refs */
static kthread_t	*dtrace_panicked;	/* panicking thread */
static dtrace_ecb_t	*dtrace_ecb_create_cache; /* cached created ECB */
static dtrace_genid_t	dtrace_probegen;	/* current probe generation */
static dtrace_helpers_t *dtrace_deferred_pid;	/* deferred helper list */
static dtrace_enabling_t *dtrace_retained;	/* list of retained enablings */
static dtrace_dynvar_t	dtrace_dynhash_sink;	/* end of dynamic hash chains */

/*
 * DTrace Locking
 * DTrace is protected by three (relatively coarse-grained) locks:
 *
 * (1) dtrace_lock is required to manipulate essentially any DTrace state,
 *     including enabling state, probes, ECBs, consumer state, helper state,
 *     etc.  Importantly, dtrace_lock is _not_ required when in probe context;
 *     probe context is lock-free -- synchronization is handled via the
 *     dtrace_sync() cross call mechanism.
 *
 * (2) dtrace_provider_lock is required when manipulating provider state, or
 *     when provider state must be held constant.
 *
 * (3) dtrace_meta_lock is required when manipulating meta provider state, or
 *     when meta provider state must be held constant.
 *
 * The lock ordering between these three locks is dtrace_meta_lock before
 * dtrace_provider_lock before dtrace_lock.  (In particular, there are
 * several places where dtrace_provider_lock is held by the framework as it
 * calls into the providers -- which then call back into the framework,
 * grabbing dtrace_lock.)
 *
 * There are two other locks in the mix:  mod_lock and cpu_lock.  With respect
 * to dtrace_provider_lock and dtrace_lock, cpu_lock continues its historical
 * role as a coarse-grained lock; it is acquired before both of these locks.
 * With respect to dtrace_meta_lock, its behavior is stranger:  cpu_lock must
 * be acquired _between_ dtrace_meta_lock and any other DTrace locks.
 * mod_lock is similar with respect to dtrace_provider_lock in that it must be
 * acquired _between_ dtrace_provider_lock and dtrace_lock.
 */
static kmutex_t		dtrace_lock;		/* probe state lock */
static kmutex_t		dtrace_provider_lock;	/* provider state lock */
static kmutex_t		dtrace_meta_lock;	/* meta-provider state lock */
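
/*
 * Taken together, the rules above imply a single global acquisition order.
 * As a sketch (for illustration only; no single code path actually takes
 * all five locks), a path that needed every lock would acquire and release
 * them as follows:
 *
 *	mutex_enter(&dtrace_meta_lock);
 *	mutex_enter(&cpu_lock);
 *	mutex_enter(&dtrace_provider_lock);
 *	mutex_enter(&mod_lock);
 *	mutex_enter(&dtrace_lock);
 *	...
 *	mutex_exit(&dtrace_lock);
 *	mutex_exit(&mod_lock);
 *	mutex_exit(&dtrace_provider_lock);
 *	mutex_exit(&cpu_lock);
 *	mutex_exit(&dtrace_meta_lock);
 */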

/*
 * DTrace Provider Variables
 *
 * These are the variables relating to DTrace as a provider (that is, the
 * provider of the BEGIN, END, and ERROR probes).
 */
static dtrace_pattr_t	dtrace_provider_attr = {
{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
};

static void
dtrace_nullop(void)
{}

static dtrace_pops_t	dtrace_provider_ops = {
	(void (*)(void *, const dtrace_probedesc_t *))dtrace_nullop,
	(void (*)(void *, struct modctl *))dtrace_nullop,
	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
	NULL,
	NULL,
	NULL,
	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop
};

static dtrace_id_t	dtrace_probeid_begin;	/* special BEGIN probe */
static dtrace_id_t	dtrace_probeid_end;	/* special END probe */
dtrace_id_t		dtrace_probeid_error;	/* special ERROR probe */

/*
 * DTrace Helper Tracing Variables
 */
uint32_t dtrace_helptrace_next = 0;
uint32_t dtrace_helptrace_nlocals;
char	*dtrace_helptrace_buffer;
int	dtrace_helptrace_bufsize = 512 * 1024;

#ifdef DEBUG
int	dtrace_helptrace_enabled = 1;
#else
int	dtrace_helptrace_enabled = 0;
#endif

/*
 * DTrace Error Hashing
 *
 * On DEBUG kernels, DTrace will track the errors it has seen in a hash
 * table.  This is very useful for checking coverage of tests that are
 * expected to induce DIF or DOF processing errors, and may be useful for
 * debugging problems in the DIF code generator or in DOF generation.  The
 * error hash may be examined with the ::dtrace_errhash MDB dcmd.
 */
#ifdef DEBUG
static dtrace_errhash_t	dtrace_errhash[DTRACE_ERRHASHSZ];
static const char *dtrace_errlast;
static kthread_t *dtrace_errthread;
static kmutex_t dtrace_errlock;
#endif

/*
 * DTrace Macros and Constants
 *
 * These are various macros that are useful in various spots in the
 * implementation, along with a few random constants that have no meaning
 * outside of the implementation.  There is no real structure to this cpp
 * mishmash -- but is there ever?
 */
#define	DTRACE_HASHSTR(hash, probe)	\
	dtrace_hash_str(*((char **)((uintptr_t)(probe) + (hash)->dth_stroffs)))

#define	DTRACE_HASHNEXT(hash, probe)	\
	(dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_nextoffs)

#define	DTRACE_HASHPREV(hash, probe)	\
	(dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_prevoffs)

#define	DTRACE_HASHEQ(hash, lhs, rhs)	\
	(strcmp(*((char **)((uintptr_t)(lhs) + (hash)->dth_stroffs)), \
	    *((char **)((uintptr_t)(rhs) + (hash)->dth_stroffs))) == 0)

#define	DTRACE_AGGHASHSIZE_SLEW		17

#define	DTRACE_V4MAPPED_OFFSET		(sizeof (uint32_t) * 3)

/*
 * The key for a thread-local variable consists of the lower 61 bits of the
 * t_did, plus the 3 bits of the highest active interrupt above LOCK_LEVEL.
 * We add DIF_VARIABLE_MAX to t_did to assure that the thread key is never
 * equal to a variable identifier.  This is necessary (but not sufficient) to
 * assure that global associative arrays never collide with thread-local
 * variables.  To guarantee that they cannot collide, we must also define the
 * order for keying dynamic variables.  That order is:
 *
 *   [ key0 ] ... [ keyn ] [ variable-key ] [ tls-key ]
 *
 * Because the variable-key and the tls-key are in orthogonal spaces, there is
 * no way for a global variable key signature to match a thread-local key
 * signature.
 */
#define	DTRACE_TLS_THRKEY(where) { \
	uint_t intr = 0; \
	uint_t actv = CPU->cpu_intr_actv >> (LOCK_LEVEL + 1); \
	for (; actv; actv >>= 1) \
		intr++; \
	ASSERT(intr < (1 << 3)); \
	(where) = ((curthread->t_did + DIF_VARIABLE_MAX) & \
	    (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
}
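
/*
 * For example (with illustrative values):  for a thread with t_did 0x1000
 * running at base (non-interrupt) level, actv is 0, so intr is 0 and the
 * key is simply (0x1000 + DIF_VARIABLE_MAX) with the top three bits clear.
 * If instead the highest active interrupt were two levels above LOCK_LEVEL,
 * intr would be 2 and the key would carry 2 in bits 61-63 -- giving the
 * interrupt context a thread key distinct from that of the interrupted
 * thread.
 */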

#define	DT_BSWAP_8(x)	((x) & 0xff)
#define	DT_BSWAP_16(x)	((DT_BSWAP_8(x) << 8) | DT_BSWAP_8((x) >> 8))
#define	DT_BSWAP_32(x)	((DT_BSWAP_16(x) << 16) | DT_BSWAP_16((x) >> 16))
#define	DT_BSWAP_64(x)	((DT_BSWAP_32(x) << 32) | DT_BSWAP_32((x) >> 32))

#define	DT_MASK_LO 0x00000000FFFFFFFFULL

#define	DTRACE_STORE(type, tomax, offset, what) \
	*((type *)((uintptr_t)(tomax) + (uintptr_t)offset)) = (type)(what);

#ifndef __i386
#define	DTRACE_ALIGNCHECK(addr, size, flags)				\
	if (addr & (size - 1)) {					\
		*flags |= CPU_DTRACE_BADALIGN;				\
		cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr;	\
		return (0);						\
	}
#else
#define	DTRACE_ALIGNCHECK(addr, size, flags)
#endif

/*
 * Test whether a range of memory starting at testaddr of size testsz falls
 * within the range of memory described by addr, sz.  We take care to avoid
 * problems with overflow and underflow of the unsigned quantities, and
 * disallow all negative sizes.  Ranges of size 0 are allowed.
 */
#define	DTRACE_INRANGE(testaddr, testsz, baseaddr, basesz) \
	((testaddr) - (baseaddr) < (basesz) && \
	(testaddr) + (testsz) - (baseaddr) <= (basesz) && \
	(testaddr) + (testsz) >= (testaddr))
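
/*
 * For illustration, a naive check such as
 *
 *	(testaddr >= baseaddr && testaddr + testsz <= baseaddr + basesz)
 *
 * is defeated by wraparound:  with 32-bit addresses, testaddr = 0xfffffff0
 * and testsz = 0x100 make testaddr + testsz wrap around to 0xf0, satisfying
 * the upper-bound comparison for nearly any base region.  DTRACE_INRANGE
 * instead compares offsets relative to baseaddr, and separately requires
 * that (testaddr + testsz) not wrap below testaddr.
 */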

/*
 * Test whether alloc_sz bytes will fit in the scratch region.  We isolate
 * alloc_sz on the righthand side of the comparison in order to avoid overflow
 * or underflow in the comparison with it.  This is simpler than the INRANGE
 * check above, because we know that the dtms_scratch_ptr is valid in the
 * range.  Allocations of size zero are allowed.
 */
#define	DTRACE_INSCRATCH(mstate, alloc_sz) \
	((mstate)->dtms_scratch_base + (mstate)->dtms_scratch_size - \
	(mstate)->dtms_scratch_ptr >= (alloc_sz))

#define	DTRACE_LOADFUNC(bits)						\
/*CSTYLED*/								\
uint##bits##_t								\
dtrace_load##bits(uintptr_t addr)					\
{									\
	size_t size = bits / NBBY;					\
	/*CSTYLED*/							\
	uint##bits##_t rval;						\
	int i;								\
	volatile uint16_t *flags = (volatile uint16_t *)		\
	    &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;			\
									\
	DTRACE_ALIGNCHECK(addr, size, flags);				\
									\
	for (i = 0; i < dtrace_toxranges; i++) {			\
		if (addr >= dtrace_toxrange[i].dtt_limit)		\
			continue;					\
									\
		if (addr + size <= dtrace_toxrange[i].dtt_base)		\
			continue;					\
									\
		/*							\
		 * This address falls within a toxic region; return 0.	\
		 */							\
		*flags |= CPU_DTRACE_BADADDR;				\
		cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr;	\
		return (0);						\
	}								\
									\
	*flags |= CPU_DTRACE_NOFAULT;					\
	/*CSTYLED*/							\
	rval = *((volatile uint##bits##_t *)addr);			\
	*flags &= ~CPU_DTRACE_NOFAULT;					\
									\
	return (!(*flags & CPU_DTRACE_FAULT) ? rval : 0);		\
}
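
/*
 * As an illustration of the macro above:  the DTRACE_LOADFUNC(16)
 * instantiation below expands to the definition of
 *
 *	uint16_t dtrace_load16(uintptr_t addr);
 *
 * which checks alignment, refuses addresses overlapping any toxic range
 * (setting CPU_DTRACE_BADADDR), and performs the load with
 * CPU_DTRACE_NOFAULT set -- so a fault yields a zero return value and a
 * CPU_DTRACE_FAULT flag rather than a panic.
 */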

#ifdef _LP64
#define	dtrace_loadptr	dtrace_load64
#else
#define	dtrace_loadptr	dtrace_load32
#endif

#define	DTRACE_DYNHASH_FREE	0
#define	DTRACE_DYNHASH_SINK	1
#define	DTRACE_DYNHASH_VALID	2

#define	DTRACE_MATCH_NEXT	0
#define	DTRACE_MATCH_DONE	1
#define	DTRACE_ANCHORED(probe)	((probe)->dtpr_func[0] != '\0')
#define	DTRACE_STATE_ALIGN	64

#define	DTRACE_FLAGS2FLT(flags)						\
	(((flags) & CPU_DTRACE_BADADDR) ? DTRACEFLT_BADADDR :		\
	((flags) & CPU_DTRACE_ILLOP) ? DTRACEFLT_ILLOP :		\
	((flags) & CPU_DTRACE_DIVZERO) ? DTRACEFLT_DIVZERO :		\
	((flags) & CPU_DTRACE_KPRIV) ? DTRACEFLT_KPRIV :		\
	((flags) & CPU_DTRACE_UPRIV) ? DTRACEFLT_UPRIV :		\
	((flags) & CPU_DTRACE_TUPOFLOW) ?  DTRACEFLT_TUPOFLOW :		\
	((flags) & CPU_DTRACE_BADALIGN) ?  DTRACEFLT_BADALIGN :		\
	((flags) & CPU_DTRACE_NOSCRATCH) ?  DTRACEFLT_NOSCRATCH :	\
	((flags) & CPU_DTRACE_BADSTACK) ?  DTRACEFLT_BADSTACK :		\
	DTRACEFLT_UNKNOWN)

#define	DTRACEACT_ISSTRING(act)						\
	((act)->dta_kind == DTRACEACT_DIFEXPR &&			\
	(act)->dta_difo->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING)

static size_t dtrace_strlen(const char *, size_t);
static dtrace_probe_t *dtrace_probe_lookup_id(dtrace_id_t id);
static void dtrace_enabling_provide(dtrace_provider_t *);
static int dtrace_enabling_match(dtrace_enabling_t *, int *);
static void dtrace_enabling_matchall(void);
static dtrace_state_t *dtrace_anon_grab(void);
static uint64_t dtrace_helper(int, dtrace_mstate_t *,
    dtrace_state_t *, uint64_t, uint64_t);
static dtrace_helpers_t *dtrace_helpers_create(proc_t *);
static void dtrace_buffer_drop(dtrace_buffer_t *);
static intptr_t dtrace_buffer_reserve(dtrace_buffer_t *, size_t, size_t,
    dtrace_state_t *, dtrace_mstate_t *);
static int dtrace_state_option(dtrace_state_t *, dtrace_optid_t,
    dtrace_optval_t);
static int dtrace_ecb_create_enable(dtrace_probe_t *, void *);
static void dtrace_helper_provider_destroy(dtrace_helper_provider_t *);

/*
 * DTrace Probe Context Functions
 *
 * These functions are called from probe context.  Because probe context is
 * any context in which C may be called, arbitrary locks may be held,
 * interrupts may be disabled, we may be in arbitrary dispatched state, etc.
 * As a result, functions called from probe context may only call other DTrace
 * support functions -- they may not interact at all with the system at large.
 * (Note that the ASSERT macro is made probe-context safe by redefining it in
 * terms of dtrace_assfail(), a probe-context safe function.) If arbitrary
 * loads are to be performed from probe context, they _must_ be in terms of
 * the safe dtrace_load*() variants.
 *
 * Some functions in this block are not actually called from probe context;
 * for these functions, there will be a comment above the function reading
 * "Note:  not called from probe context."
 */
void
dtrace_panic(const char *format, ...)
{
	va_list alist;

	va_start(alist, format);
	dtrace_vpanic(format, alist);
	va_end(alist);
}

int
dtrace_assfail(const char *a, const char *f, int l)
{
	dtrace_panic("assertion failed: %s, file: %s, line: %d", a, f, l);

	/*
	 * We just need something here that even the most clever compiler
	 * cannot optimize away.
	 */
	return (a[(uintptr_t)f]);
}

/*
 * Atomically increment a specified error counter from probe context.
 */
static void
dtrace_error(uint32_t *counter)
{
	/*
	 * Most counters stored to in probe context are per-CPU counters.
	 * However, there are some error conditions that are sufficiently
	 * arcane that they don't merit per-CPU storage.  If these counters
	 * are incremented concurrently on different CPUs, scalability will be
	 * adversely affected -- but we don't expect them to be white-hot in a
	 * correctly constructed enabling...
	 */
	uint32_t oval, nval;

	do {
		oval = *counter;

		if ((nval = oval + 1) == 0) {
			/*
			 * If the counter would wrap, set it to 1 -- assuring
			 * that the counter is never zero when we have seen
			 * errors.  (The counter must be 32-bits because we
			 * aren't guaranteed a 64-bit compare&swap operation.)
			 * To save this code both the infamy of being fingered
			 * by a priggish news story and the indignity of being
			 * the target of a neo-puritan witch trial, we're
			 * carefully avoiding any colorful description of the
			 * likelihood of this condition -- but suffice it to
			 * say that it is only slightly more likely than the
			 * overflow of predicate cache IDs, as discussed in
			 * dtrace_predicate_create().
			 */
			nval = 1;
		}
	} while (dtrace_cas32(counter, oval, nval) != oval);
}

/*
 * Use the DTRACE_LOADFUNC macro to define functions for each of loading a
 * uint8_t, a uint16_t, a uint32_t and a uint64_t.
 */
DTRACE_LOADFUNC(8)
DTRACE_LOADFUNC(16)
DTRACE_LOADFUNC(32)
DTRACE_LOADFUNC(64)

static int
dtrace_inscratch(uintptr_t dest, size_t size, dtrace_mstate_t *mstate)
{
	if (dest < mstate->dtms_scratch_base)
		return (0);

	if (dest + size < dest)
		return (0);

	if (dest + size > mstate->dtms_scratch_ptr)
		return (0);

	return (1);
}

static int
dtrace_canstore_statvar(uint64_t addr, size_t sz,
    dtrace_statvar_t **svars, int nsvars)
{
	int i;

	for (i = 0; i < nsvars; i++) {
		dtrace_statvar_t *svar = svars[i];

		if (svar == NULL || svar->dtsv_size == 0)
			continue;

		if (DTRACE_INRANGE(addr, sz, svar->dtsv_data, svar->dtsv_size))
			return (1);
	}

	return (0);
}

/*
 * Check to see if the address is within a memory region to which a store may
 * be issued.  This includes the DTrace scratch areas, and any DTrace variable
 * region.  The caller of dtrace_canstore() is responsible for performing any
 * alignment checks that are needed before stores are actually executed.
 */
static int
dtrace_canstore(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
    dtrace_vstate_t *vstate)
{
	/*
	 * First, check to see if the address is in scratch space...
	 */
	if (DTRACE_INRANGE(addr, sz, mstate->dtms_scratch_base,
	    mstate->dtms_scratch_size))
		return (1);

	/*
	 * Now check to see if it's a dynamic variable.  This check will pick
	 * up both thread-local variables and any global dynamically-allocated
	 * variables.
	 */
	if (DTRACE_INRANGE(addr, sz, (uintptr_t)vstate->dtvs_dynvars.dtds_base,
	    vstate->dtvs_dynvars.dtds_size)) {
		dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
		uintptr_t base = (uintptr_t)dstate->dtds_base +
		    (dstate->dtds_hashsize * sizeof (dtrace_dynhash_t));
		uintptr_t chunkoffs;

		/*
		 * Before we assume that we can store here, we need to make
		 * sure that it isn't in our metadata -- storing to our
		 * dynamic variable metadata would corrupt our state.  For
		 * the range to not include any dynamic variable metadata,
		 * it must:
		 *
		 *	(1) Start above the hash table that is at the base of
		 *	the dynamic variable space
		 *
		 *	(2) Have a starting chunk offset that is beyond the
		 *	dtrace_dynvar_t that is at the base of every chunk
		 *
		 *	(3) Not span a chunk boundary
		 *
		 */
		if (addr < base)
			return (0);

		chunkoffs = (addr - base) % dstate->dtds_chunksize;

		if (chunkoffs < sizeof (dtrace_dynvar_t))
			return (0);

		if (chunkoffs + sz > dstate->dtds_chunksize)
			return (0);

		return (1);
	}

	/*
	 * Finally, check the static local and global variables.  These checks
	 * take the longest, so we perform them last.
	 */
	if (dtrace_canstore_statvar(addr, sz,
	    vstate->dtvs_locals, vstate->dtvs_nlocals))
		return (1);

	if (dtrace_canstore_statvar(addr, sz,
	    vstate->dtvs_globals, vstate->dtvs_nglobals))
		return (1);

	return (0);
}


/*
 * Convenience routine to check to see if the address is within a memory
 * region in which a load may be issued given the user's privilege level;
 * if not, it sets the appropriate error flags and loads 'addr' into the
 * illegal value slot.
 *
 * DTrace subroutines (DIF_SUBR_*) should use this helper to implement
 * appropriate memory access protection.
 */
static int
dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
    dtrace_vstate_t *vstate)
{
	volatile uintptr_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;

	/*
	 * If we hold the privilege to read from kernel memory, then
	 * everything is readable.
	 */
	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
		return (1);

	/*
	 * You can obviously read that which you can store.
	 */
	if (dtrace_canstore(addr, sz, mstate, vstate))
		return (1);

	/*
	 * We're allowed to read from our own string table.
	 */
	if (DTRACE_INRANGE(addr, sz, (uintptr_t)mstate->dtms_difo->dtdo_strtab,
	    mstate->dtms_difo->dtdo_strlen))
		return (1);

	DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV);
	*illval = addr;
	return (0);
}

/*
 * Convenience routine to check to see if a given string is within a memory
 * region in which a load may be issued given the user's privilege level;
 * this exists so that we don't need to issue unnecessary dtrace_strlen()
 * calls in the event that the user has all privileges.
 */
static int
dtrace_strcanload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
    dtrace_vstate_t *vstate)
{
	size_t strsz;

	/*
	 * If we hold the privilege to read from kernel memory, then
	 * everything is readable.
	 */
	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
		return (1);

	strsz = 1 + dtrace_strlen((char *)(uintptr_t)addr, sz);
	if (dtrace_canload(addr, strsz, mstate, vstate))
		return (1);

	return (0);
}

/*
 * Convenience routine to check to see if a given variable is within a memory
 * region in which a load may be issued given the user's privilege level.
 */
static int
dtrace_vcanload(void *src, dtrace_diftype_t *type, dtrace_mstate_t *mstate,
    dtrace_vstate_t *vstate)
{
	size_t sz;
	ASSERT(type->dtdt_flags & DIF_TF_BYREF);

	/*
	 * If we hold the privilege to read from kernel memory, then
	 * everything is readable.
	 */
	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
		return (1);

	if (type->dtdt_kind == DIF_TYPE_STRING)
		sz = dtrace_strlen(src,
		    vstate->dtvs_state->dts_options[DTRACEOPT_STRSIZE]) + 1;
	else
		sz = type->dtdt_size;

	return (dtrace_canload((uintptr_t)src, sz, mstate, vstate));
}

/*
 * Compare two strings using safe loads.
 */
static int
dtrace_strncmp(char *s1, char *s2, size_t limit)
{
	uint8_t c1, c2;
	volatile uint16_t *flags;

	if (s1 == s2 || limit == 0)
		return (0);

	flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;

	do {
		if (s1 == NULL) {
			c1 = '\0';
		} else {
			c1 = dtrace_load8((uintptr_t)s1++);
		}

		if (s2 == NULL) {
			c2 = '\0';
		} else {
			c2 = dtrace_load8((uintptr_t)s2++);
		}

		if (c1 != c2)
			return (c1 - c2);
	} while (--limit && c1 != '\0' && !(*flags & CPU_DTRACE_FAULT));

	return (0);
}

/*
 * Compute strlen(s) for a string using safe memory accesses.  The additional
 * len parameter is used to specify a maximum length to ensure completion.
 */
static size_t
dtrace_strlen(const char *s, size_t lim)
{
	uint_t len;

	for (len = 0; len != lim; len++) {
		if (dtrace_load8((uintptr_t)s++) == '\0')
			break;
	}

	return (len);
}

/*
 * Check if an address falls within a toxic region.
 */
static int
dtrace_istoxic(uintptr_t kaddr, size_t size)
{
	uintptr_t taddr, tsize;
	int i;

	for (i = 0; i < dtrace_toxranges; i++) {
		taddr = dtrace_toxrange[i].dtt_base;
		tsize = dtrace_toxrange[i].dtt_limit - taddr;

		if (kaddr - taddr < tsize) {
			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
			cpu_core[CPU->cpu_id].cpuc_dtrace_illval = kaddr;
			return (1);
		}

		if (taddr - kaddr < size) {
			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
			cpu_core[CPU->cpu_id].cpuc_dtrace_illval = taddr;
			return (1);
		}
	}

	return (0);
}

/*
 * Copy src to dst using safe memory accesses.  The src is assumed to be unsafe
 * memory specified by the DIF program.  The dst is assumed to be safe memory
 * that we can store to directly because it is managed by DTrace.  As with
 * standard bcopy, overlapping copies are handled properly.
 */
static void
dtrace_bcopy(const void *src, void *dst, size_t len)
{
	if (len != 0) {
		uint8_t *s1 = dst;
		const uint8_t *s2 = src;

		if (s1 <= s2) {
			do {
				*s1++ = dtrace_load8((uintptr_t)s2++);
			} while (--len != 0);
		} else {
			s2 += len;
			s1 += len;

			do {
				*--s1 = dtrace_load8((uintptr_t)--s2);
			} while (--len != 0);
		}
	}
}

/*
 * Copy src to dst using safe memory accesses, up to either the specified
 * length, or the point that a nul byte is encountered.  The src is assumed to
 * be unsafe memory specified by the DIF program.  The dst is assumed to be
 * safe memory that we can store to directly because it is managed by DTrace.
 * Unlike dtrace_bcopy(), overlapping regions are not handled.
 */
static void
dtrace_strcpy(const void *src, void *dst, size_t len)
{
	if (len != 0) {
		uint8_t *s1 = dst, c;
		const uint8_t *s2 = src;

		do {
			*s1++ = c = dtrace_load8((uintptr_t)s2++);
		} while (--len != 0 && c != '\0');
	}
}

/*
 * Copy src to dst, deriving the size and type from the specified (BYREF)
 * variable type.  The src is assumed to be unsafe memory specified by the DIF
 * program.  The dst is assumed to be DTrace variable memory that is of the
 * specified type; we assume that we can store to directly.
 */
static void
dtrace_vcopy(void *src, void *dst, dtrace_diftype_t *type)
{
	ASSERT(type->dtdt_flags & DIF_TF_BYREF);

	if (type->dtdt_kind == DIF_TYPE_STRING) {
		dtrace_strcpy(src, dst, type->dtdt_size);
	} else {
		dtrace_bcopy(src, dst, type->dtdt_size);
	}
}

/*
 * Compare s1 to s2 using safe memory accesses.  The s1 data is assumed to be
 * unsafe memory specified by the DIF program.  The s2 data is assumed to be
 * safe memory that we can access directly because it is managed by DTrace.
 */
static int
dtrace_bcmp(const void *s1, const void *s2, size_t len)
{
	volatile uint16_t *flags;

	flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;

	if (s1 == s2)
		return (0);

	if (s1 == NULL || s2 == NULL)
		return (1);

	if (s1 != s2 && len != 0) {
		const uint8_t *ps1 = s1;
		const uint8_t *ps2 = s2;

		do {
			if (dtrace_load8((uintptr_t)ps1++) != *ps2++)
				return (1);
		} while (--len != 0 && !(*flags & CPU_DTRACE_FAULT));
	}
	return (0);
}

/*
 * Zero the specified region using a simple byte-by-byte loop.  Note that this
 * is for safe DTrace-managed memory only.
 */
static void
dtrace_bzero(void *dst, size_t len)
{
	uchar_t *cp;

	for (cp = dst; len != 0; len--)
		*cp++ = 0;
}

static void
dtrace_add_128(uint64_t *addend1, uint64_t *addend2, uint64_t *sum)
{
	uint64_t result[2];

	result[0] = addend1[0] + addend2[0];
	result[1] = addend1[1] + addend2[1] +
	    (result[0] < addend1[0] || result[0] < addend2[0] ? 1 : 0);

	sum[0] = result[0];
	sum[1] = result[1];
}
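
/*
 * To see the carry detection above at work, consider adding the 128-bit
 * values { 0xffffffffffffffff, 0 } and { 1, 0 } (low word first):  the
 * low-word sum wraps to 0, which is less than either addend, so a carry
 * of 1 is folded into the high word, yielding { 0, 1 } -- that is, 2^64.
 */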

/*
 * Shift the 128-bit value in a by b. If b is positive, shift left.
 * If b is negative, shift right.
 */
static void
dtrace_shift_128(uint64_t *a, int b)
{
	uint64_t mask;

	if (b == 0)
		return;

	if (b < 0) {
		b = -b;
		if (b >= 64) {
			a[0] = a[1] >> (b - 64);
			a[1] = 0;
		} else {
			a[0] >>= b;
			mask = 1LL << (64 - b);
			mask -= 1;
			a[0] |= ((a[1] & mask) << (64 - b));
			a[1] >>= b;
		}
	} else {
		if (b >= 64) {
			a[1] = a[0] << (b - 64);
			a[0] = 0;
		} else {
			a[1] <<= b;
			mask = a[0] >> (64 - b);
			a[1] |= mask;
			a[0] <<= b;
		}
	}
}

/*
 * The basic idea is to break the 2 64-bit values into 4 32-bit values,
 * use native multiplication on those, and then re-combine into the
 * resulting 128-bit value.
 *
 * (hi1 << 32 + lo1) * (hi2 << 32 + lo2) =
 *     hi1 * hi2 << 64 +
 *     hi1 * lo2 << 32 +
 *     hi2 * lo1 << 32 +
 *     lo1 * lo2
 */
static void
dtrace_multiply_128(uint64_t factor1, uint64_t factor2, uint64_t *product)
{
	uint64_t hi1, hi2, lo1, lo2;
	uint64_t tmp[2];

	hi1 = factor1 >> 32;
	hi2 = factor2 >> 32;

	lo1 = factor1 & DT_MASK_LO;
	lo2 = factor2 & DT_MASK_LO;

	product[0] = lo1 * lo2;
	product[1] = hi1 * hi2;

	tmp[0] = hi1 * lo2;
	tmp[1] = 0;
	dtrace_shift_128(tmp, 32);
	dtrace_add_128(product, tmp, product);

	tmp[0] = hi2 * lo1;
	tmp[1] = 0;
	dtrace_shift_128(tmp, 32);
	dtrace_add_128(product, tmp, product);
}
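
/*
 * A worked example of the decomposition above (illustrative values):
 * multiplying 0x100000002 by 0x300000004 gives hi1 = 1, lo1 = 2, hi2 = 3
 * and lo2 = 4.  lo1 * lo2 = 8 and hi1 * hi2 = 3 seed the low and high
 * words; the cross terms hi1 * lo2 = 4 and hi2 * lo1 = 6 are each shifted
 * left 32 bits and accumulated, giving the 128-bit product
 * { 0xa00000008, 3 } (low word first), i.e. 3 * 2^64 + 0xa00000008.
 */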

/*
 * This privilege check should be used by actions and subroutines to
 * verify that the user credentials of the process that enabled the
 * invoking ECB match the target credentials.
 */
static int
dtrace_priv_proc_common_user(dtrace_state_t *state)
{
	cred_t *cr, *s_cr = state->dts_cred.dcr_cred;

	/*
	 * We should always have a non-NULL state cred here, since if cred
	 * is null (anonymous tracing), we fast-path bypass this routine.
	 */
	ASSERT(s_cr != NULL);

	if ((cr = CRED()) != NULL &&
	    s_cr->cr_uid == cr->cr_uid &&
	    s_cr->cr_uid == cr->cr_ruid &&
	    s_cr->cr_uid == cr->cr_suid &&
	    s_cr->cr_gid == cr->cr_gid &&
	    s_cr->cr_gid == cr->cr_rgid &&
	    s_cr->cr_gid == cr->cr_sgid)
		return (1);

	return (0);
}

/*
 * This privilege check should be used by actions and subroutines to
 * verify that the zone of the process that enabled the invoking ECB
 * matches the target credentials.
 */
static int
dtrace_priv_proc_common_zone(dtrace_state_t *state)
{
	cred_t *cr, *s_cr = state->dts_cred.dcr_cred;

	/*
	 * We should always have a non-NULL state cred here, since if cred
	 * is null (anonymous tracing), we fast-path bypass this routine.
	 */
	ASSERT(s_cr != NULL);

	if ((cr = CRED()) != NULL &&
	    s_cr->cr_zone == cr->cr_zone)
		return (1);

	return (0);
}

/*
 * This privilege check should be used by actions and subroutines to
 * verify that the process has not performed a setuid or otherwise
 * changed credentials.
 */
static int
dtrace_priv_proc_common_nocd()
{
	proc_t *proc;

	if ((proc = ttoproc(curthread)) != NULL &&
	    !(proc->p_flag & SNOCD))
		return (1);

	return (0);
}

static int
dtrace_priv_proc_destructive(dtrace_state_t *state)
{
	int action = state->dts_cred.dcr_action;

	if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE) == 0) &&
	    dtrace_priv_proc_common_zone(state) == 0)
		goto bad;

	if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER) == 0) &&
	    dtrace_priv_proc_common_user(state) == 0)
		goto bad;

	if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG) == 0) &&
	    dtrace_priv_proc_common_nocd() == 0)
		goto bad;

	return (1);

bad:
	cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;

	return (0);
}

static int
dtrace_priv_proc_control(dtrace_state_t *state)
{
	if (state->dts_cred.dcr_action & DTRACE_CRA_PROC_CONTROL)
		return (1);

	if (dtrace_priv_proc_common_zone(state) &&
	    dtrace_priv_proc_common_user(state) &&
	    dtrace_priv_proc_common_nocd())
		return (1);

	cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;

	return (0);
}

static int
dtrace_priv_proc(dtrace_state_t *state)
{
	if (state->dts_cred.dcr_action & DTRACE_CRA_PROC)
		return (1);

	cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;

	return (0);
}

static int
dtrace_priv_kernel(dtrace_state_t *state)
{
	if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL)
		return (1);

	cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;

	return (0);
}

static int
dtrace_priv_kernel_destructive(dtrace_state_t *state)
{
	if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL_DESTRUCTIVE)
		return (1);

	cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;

	return (0);
}

/*
 * Note:  not called from probe context.  This function is called
 * asynchronously (and at a regular interval) from outside of probe context to
 * clean the dirty dynamic variable lists on all CPUs.  Dynamic variable
 * cleaning is explained in detail in <sys/dtrace_impl.h>.
 */
void
dtrace_dynvar_clean(dtrace_dstate_t *dstate)
{
	dtrace_dynvar_t *dirty;
	dtrace_dstate_percpu_t *dcpu;
	int i, work = 0;

	for (i = 0; i < NCPU; i++) {
		dcpu = &dstate->dtds_percpu[i];

		ASSERT(dcpu->dtdsc_rinsing == NULL);

		/*
		 * If the dirty list is NULL, there is no dirty work to do.
		 */
		if (dcpu->dtdsc_dirty == NULL)
			continue;

		/*
		 * If the clean list is non-NULL, then we're not going to do
		 * any work for this CPU -- it means that there has not been
		 * a dtrace_dynvar() allocation on this CPU (or from this CPU)
		 * since the last time we cleaned house.
		 */
		if (dcpu->dtdsc_clean != NULL)
			continue;

		work = 1;

		/*
		 * Atomically move the dirty list aside.
		 */
		do {
			dirty = dcpu->dtdsc_dirty;

			/*
			 * Before we zap the dirty list, set the rinsing list.
			 * (This allows for a potential assertion in
			 * dtrace_dynvar():  if a free dynamic variable appears
			 * on a hash chain, either the dirty list or the
			 * rinsing list for some CPU must be non-NULL.)
			 */
			dcpu->dtdsc_rinsing = dirty;
			dtrace_membar_producer();
		} while (dtrace_casptr(&dcpu->dtdsc_dirty,
		    dirty, NULL) != dirty);
	}

	if (!work) {
		/*
		 * We have no work to do; we can simply return.
		 */
		return;
	}

	dtrace_sync();

	for (i = 0; i < NCPU; i++) {
		dcpu = &dstate->dtds_percpu[i];

		if (dcpu->dtdsc_rinsing == NULL)
			continue;

		/*
		 * We are now guaranteed that no hash chain contains a pointer
		 * into this dirty list; we can make it clean.
		 */
		ASSERT(dcpu->dtdsc_clean == NULL);
		dcpu->dtdsc_clean = dcpu->dtdsc_rinsing;
		dcpu->dtdsc_rinsing = NULL;
	}

	/*
	 * Before we actually set the state to be DTRACE_DSTATE_CLEAN, make
	 * sure that all CPUs have seen all of the dtdsc_clean pointers.
	 * This prevents a race whereby a CPU incorrectly decides that
	 * the state should be something other than DTRACE_DSTATE_CLEAN
	 * after dtrace_dynvar_clean() has completed.
	 */
	dtrace_sync();

	dstate->dtds_state = DTRACE_DSTATE_CLEAN;
}

/*
 * Depending on the value of the op parameter, this function looks up,
 * allocates, or deallocates an arbitrarily-keyed dynamic variable.  If an
 * allocation is requested, this function will return a pointer to a
 * dtrace_dynvar_t corresponding to the allocated variable -- or NULL if no
 * variable can be allocated.  If NULL is returned, the appropriate counter
 * will be incremented.
 */
1272dtrace_dynvar_t *
1273dtrace_dynvar(dtrace_dstate_t *dstate, uint_t nkeys,
1274    dtrace_key_t *key, size_t dsize, dtrace_dynvar_op_t op,
1275    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
1276{
1277	uint64_t hashval = DTRACE_DYNHASH_VALID;
1278	dtrace_dynhash_t *hash = dstate->dtds_hash;
1279	dtrace_dynvar_t *free, *new_free, *next, *dvar, *start, *prev = NULL;
1280	processorid_t me = CPU->cpu_id, cpu = me;
1281	dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[me];
1282	size_t bucket, ksize;
1283	size_t chunksize = dstate->dtds_chunksize;
1284	uintptr_t kdata, lock, nstate;
1285	uint_t i;
1286
1287	ASSERT(nkeys != 0);
1288
1289	/*
1290	 * Hash the key.  As with aggregations, we use Jenkins' "One-at-a-time"
1291	 * algorithm.  For the by-value portions, we perform the algorithm in
1292	 * 16-bit chunks (as opposed to 8-bit chunks).  This speeds things up a
1293	 * bit, and seems to have only a minute effect on distribution.  For
1294	 * the by-reference data, we perform "One-at-a-time" iterating (safely)
1295	 * over each referenced byte.  It's painful to do this, but it's much
1296	 * better than pathological hash distribution.  The efficacy of the
1297	 * hashing algorithm (and a comparison with other algorithms) may be
1298	 * found by running the ::dtrace_dynstat MDB dcmd.
1299	 */
1300	for (i = 0; i < nkeys; i++) {
1301		if (key[i].dttk_size == 0) {
1302			uint64_t val = key[i].dttk_value;
1303
1304			hashval += (val >> 48) & 0xffff;
1305			hashval += (hashval << 10);
1306			hashval ^= (hashval >> 6);
1307
1308			hashval += (val >> 32) & 0xffff;
1309			hashval += (hashval << 10);
1310			hashval ^= (hashval >> 6);
1311
1312			hashval += (val >> 16) & 0xffff;
1313			hashval += (hashval << 10);
1314			hashval ^= (hashval >> 6);
1315
1316			hashval += val & 0xffff;
1317			hashval += (hashval << 10);
1318			hashval ^= (hashval >> 6);
1319		} else {
1320			/*
1321			 * This is incredibly painful, but it beats the hell
1322			 * out of the alternative.
1323			 */
1324			uint64_t j, size = key[i].dttk_size;
1325			uintptr_t base = (uintptr_t)key[i].dttk_value;
1326
1327			if (!dtrace_canload(base, size, mstate, vstate))
1328				break;
1329
1330			for (j = 0; j < size; j++) {
1331				hashval += dtrace_load8(base + j);
1332				hashval += (hashval << 10);
1333				hashval ^= (hashval >> 6);
1334			}
1335		}
1336	}
1337
1338	if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_FAULT))
1339		return (NULL);
1340
1341	hashval += (hashval << 3);
1342	hashval ^= (hashval >> 11);
1343	hashval += (hashval << 15);
1344
1345	/*
1346	 * There is a remote chance (ideally, 1 in 2^31) that our hashval
1347	 * comes out to be one of our two sentinel hash values.  If this
1348	 * actually happens, we set the hashval to be a value known to be a
1349	 * non-sentinel value.
1350	 */
1351	if (hashval == DTRACE_DYNHASH_FREE || hashval == DTRACE_DYNHASH_SINK)
1352		hashval = DTRACE_DYNHASH_VALID;
1353
1354	/*
1355	 * Yes, it's painful to do a divide here.  If the cycle count becomes
1356	 * important here, tricks can be pulled to reduce it.  (However, it's
1357	 * critical that hash collisions be kept to an absolute minimum;
1358	 * they're much more painful than a divide.)  It's better to have a
1359	 * solution that generates few collisions and still keeps things
1360	 * relatively simple.
1361	 */
1362	bucket = hashval % dstate->dtds_hashsize;
1363
1364	if (op == DTRACE_DYNVAR_DEALLOC) {
1365		volatile uintptr_t *lockp = &hash[bucket].dtdh_lock;
1366
1367		for (;;) {
1368			while ((lock = *lockp) & 1)
1369				continue;
1370
1371			if (dtrace_casptr((void *)lockp,
1372			    (void *)lock, (void *)(lock + 1)) == (void *)lock)
1373				break;
1374		}
1375
1376		dtrace_membar_producer();
1377	}
1378
1379top:
1380	prev = NULL;
1381	lock = hash[bucket].dtdh_lock;
1382
1383	dtrace_membar_consumer();
1384
1385	start = hash[bucket].dtdh_chain;
1386	ASSERT(start != NULL && (start->dtdv_hashval == DTRACE_DYNHASH_SINK ||
1387	    start->dtdv_hashval != DTRACE_DYNHASH_FREE ||
1388	    op != DTRACE_DYNVAR_DEALLOC));
1389
1390	for (dvar = start; dvar != NULL; dvar = dvar->dtdv_next) {
1391		dtrace_tuple_t *dtuple = &dvar->dtdv_tuple;
1392		dtrace_key_t *dkey = &dtuple->dtt_key[0];
1393
1394		if (dvar->dtdv_hashval != hashval) {
1395			if (dvar->dtdv_hashval == DTRACE_DYNHASH_SINK) {
1396				/*
1397				 * We've reached the sink, and therefore the
1398				 * end of the hash chain; we can kick out of
1399				 * the loop knowing that we have seen a valid
1400				 * snapshot of state.
1401				 */
1402				ASSERT(dvar->dtdv_next == NULL);
1403				ASSERT(dvar == &dtrace_dynhash_sink);
1404				break;
1405			}
1406
1407			if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE) {
1408				/*
1409				 * We've gone off the rails:  somewhere along
1410				 * the line, one of the members of this hash
1411				 * chain was deleted.  Note that we could also
1412				 * detect this by simply letting this loop run
1413				 * to completion, as we would eventually hit
1414				 * the end of the dirty list.  However, we
1415				 * want to avoid running the length of the
1416				 * dirty list unnecessarily (it might be quite
1417				 * long), so we catch this as early as
1418				 * possible by detecting the hash marker.  In
1419				 * this case, we simply set dvar to NULL and
1420				 * break; the conditional after the loop will
1421				 * send us back to top.
1422				 */
1423				dvar = NULL;
1424				break;
1425			}
1426
1427			goto next;
1428		}
1429
1430		if (dtuple->dtt_nkeys != nkeys)
1431			goto next;
1432
1433		for (i = 0; i < nkeys; i++, dkey++) {
1434			if (dkey->dttk_size != key[i].dttk_size)
1435				goto next; /* size or type mismatch */
1436
1437			if (dkey->dttk_size != 0) {
1438				if (dtrace_bcmp(
1439				    (void *)(uintptr_t)key[i].dttk_value,
1440				    (void *)(uintptr_t)dkey->dttk_value,
1441				    dkey->dttk_size))
1442					goto next;
1443			} else {
1444				if (dkey->dttk_value != key[i].dttk_value)
1445					goto next;
1446			}
1447		}
1448
1449		if (op != DTRACE_DYNVAR_DEALLOC)
1450			return (dvar);
1451
1452		ASSERT(dvar->dtdv_next == NULL ||
1453		    dvar->dtdv_next->dtdv_hashval != DTRACE_DYNHASH_FREE);
1454
1455		if (prev != NULL) {
1456			ASSERT(hash[bucket].dtdh_chain != dvar);
1457			ASSERT(start != dvar);
1458			ASSERT(prev->dtdv_next == dvar);
1459			prev->dtdv_next = dvar->dtdv_next;
1460		} else {
1461			if (dtrace_casptr(&hash[bucket].dtdh_chain,
1462			    start, dvar->dtdv_next) != start) {
1463				/*
1464				 * We have failed to atomically swing the
1465				 * hash table head pointer, presumably because
1466				 * of a conflicting allocation on another CPU.
1467				 * We need to reread the hash chain and try
1468				 * again.
1469				 */
1470				goto top;
1471			}
1472		}
1473
1474		dtrace_membar_producer();
1475
1476		/*
1477		 * Now set the hash value to indicate that it's free.
1478		 */
1479		ASSERT(hash[bucket].dtdh_chain != dvar);
1480		dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
1481
1482		dtrace_membar_producer();
1483
1484		/*
1485		 * Set the next pointer to point at the dirty list, and
1486		 * atomically swing the dirty pointer to the newly freed dvar.
1487		 */
1488		do {
1489			next = dcpu->dtdsc_dirty;
1490			dvar->dtdv_next = next;
1491		} while (dtrace_casptr(&dcpu->dtdsc_dirty, next, dvar) != next);
1492
1493		/*
1494		 * Finally, unlock this hash bucket.
1495		 */
1496		ASSERT(hash[bucket].dtdh_lock == lock);
1497		ASSERT(lock & 1);
1498		hash[bucket].dtdh_lock++;
1499
1500		return (NULL);
1501next:
1502		prev = dvar;
1503		continue;
1504	}
1505
1506	if (dvar == NULL) {
1507		/*
1508		 * If dvar is NULL, it is because we went off the rails:
1509		 * one of the elements that we traversed in the hash chain
1510		 * was deleted while we were traversing it.  In this case,
1511		 * we assert that we aren't doing a dealloc (deallocs lock
1512		 * the hash bucket to prevent themselves from racing with
1513		 * one another), and retry the hash chain traversal.
1514		 */
1515		ASSERT(op != DTRACE_DYNVAR_DEALLOC);
1516		goto top;
1517	}
1518
1519	if (op != DTRACE_DYNVAR_ALLOC) {
1520		/*
1521		 * If we are not to allocate a new variable, we want to
1522		 * return NULL now.  Before we return, check that the value
1523		 * of the lock word hasn't changed.  If it has, we may have
1524		 * seen an inconsistent snapshot.
1525		 */
1526		if (op == DTRACE_DYNVAR_NOALLOC) {
1527			if (hash[bucket].dtdh_lock != lock)
1528				goto top;
1529		} else {
1530			ASSERT(op == DTRACE_DYNVAR_DEALLOC);
1531			ASSERT(hash[bucket].dtdh_lock == lock);
1532			ASSERT(lock & 1);
1533			hash[bucket].dtdh_lock++;
1534		}
1535
1536		return (NULL);
1537	}
1538
1539	/*
1540	 * We need to allocate a new dynamic variable.  The size we need is the
1541	 * size of dtrace_dynvar plus the size of nkeys dtrace_key_t's plus the
1542	 * size of any auxiliary key data (rounded up to 8-byte alignment) plus
1543	 * the size of any referred-to data (dsize).  We then round the final
1544	 * size up to the chunksize for allocation.
1545	 */
1546	for (ksize = 0, i = 0; i < nkeys; i++)
1547		ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
1548
1549	/*
1550	 * This should be pretty much impossible, but could happen if, say,
1551	 * strange DIF specified the tuple.  Ideally, this should be an
1552	 * assertion and not an error condition -- but that requires that the
1553	 * chunksize calculation in dtrace_difo_chunksize() be absolutely
1554	 * bullet-proof.  (That is, it must not be able to be fooled by
1555	 * malicious DIF.)  Given the lack of backwards branches in DIF,
1556	 * solving this would presumably not amount to solving the Halting
1557	 * Problem -- but it still seems awfully hard.
1558	 */
1559	if (sizeof (dtrace_dynvar_t) + sizeof (dtrace_key_t) * (nkeys - 1) +
1560	    ksize + dsize > chunksize) {
1561		dcpu->dtdsc_drops++;
1562		return (NULL);
1563	}
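
	/*
	 * (Illustrative:  for a hypothetical D assignment such as
	 * a[pid, execname] = 1, the chunk must hold the two-element tuple,
	 * the rounded-up bytes of the string key, and the 8-byte value --
	 * all within the fixed chunksize that dtrace_difo_chunksize()
	 * computed when this DIF was enabled.)
	 */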
1564
1565	nstate = DTRACE_DSTATE_EMPTY;
1566
1567	do {
1568retry:
1569		free = dcpu->dtdsc_free;
1570
1571		if (free == NULL) {
1572			dtrace_dynvar_t *clean = dcpu->dtdsc_clean;
1573			void *rval;
1574
1575			if (clean == NULL) {
1576				/*
1577				 * We're out of dynamic variable space on
1578				 * this CPU.  Unless we have tried all CPUs,
1579				 * we'll try to allocate from a different
1580				 * CPU.
1581				 */
1582				switch (dstate->dtds_state) {
1583				case DTRACE_DSTATE_CLEAN: {
1584					void *sp = &dstate->dtds_state;
1585
1586					if (++cpu >= NCPU)
1587						cpu = 0;
1588
1589					if (dcpu->dtdsc_dirty != NULL &&
1590					    nstate == DTRACE_DSTATE_EMPTY)
1591						nstate = DTRACE_DSTATE_DIRTY;
1592
1593					if (dcpu->dtdsc_rinsing != NULL)
1594						nstate = DTRACE_DSTATE_RINSING;
1595
1596					dcpu = &dstate->dtds_percpu[cpu];
1597
1598					if (cpu != me)
1599						goto retry;
1600
1601					(void) dtrace_cas32(sp,
1602					    DTRACE_DSTATE_CLEAN, nstate);
1603
1604					/*
1605					 * To increment the correct bean
1606					 * counter, take another lap.
1607					 */
1608					goto retry;
1609				}
1610
1611				case DTRACE_DSTATE_DIRTY:
1612					dcpu->dtdsc_dirty_drops++;
1613					break;
1614
1615				case DTRACE_DSTATE_RINSING:
1616					dcpu->dtdsc_rinsing_drops++;
1617					break;
1618
1619				case DTRACE_DSTATE_EMPTY:
1620					dcpu->dtdsc_drops++;
1621					break;
1622				}
1623
1624				DTRACE_CPUFLAG_SET(CPU_DTRACE_DROP);
1625				return (NULL);
1626			}
1627
1628			/*
1629			 * The clean list appears to be non-empty.  We want to
1630			 * move the clean list to the free list; we start by
1631			 * moving the clean pointer aside.
1632			 */
1633			if (dtrace_casptr(&dcpu->dtdsc_clean,
1634			    clean, NULL) != clean) {
1635				/*
1636				 * We are in one of two situations:
1637				 *
1638				 *  (a)	The clean list was switched to the
1639				 *	free list by another CPU.
1640				 *
1641				 *  (b)	The clean list was added to by the
1642				 *	cleansing cyclic.
1643				 *
1644				 * In either of these situations, we can
1645				 * just reattempt the free list allocation.
1646				 */
1647				goto retry;
1648			}
1649
1650			ASSERT(clean->dtdv_hashval == DTRACE_DYNHASH_FREE);
1651
1652			/*
1653			 * Now we'll move the clean list to the free list.
1654			 * It's impossible for this to fail:  the only way
1655			 * the free list can be updated is through this
1656			 * code path, and only one CPU can own the clean list.
1657			 * Thus, it would only be possible for this to fail if
1658			 * this code were racing with dtrace_dynvar_clean().
1659			 * (That is, if dtrace_dynvar_clean() updated the clean
1660			 * list, and we ended up racing to update the free
1661			 * list.)  This race is prevented by the dtrace_sync()
1662			 * in dtrace_dynvar_clean() -- which flushes the
1663			 * owners of the clean lists out before resetting
1664			 * the clean lists.
1665			 */
1666			rval = dtrace_casptr(&dcpu->dtdsc_free, NULL, clean);
1667			ASSERT(rval == NULL);
1668			goto retry;
1669		}
1670
1671		dvar = free;
1672		new_free = dvar->dtdv_next;
1673	} while (dtrace_casptr(&dcpu->dtdsc_free, free, new_free) != free);
1674
1675	/*
1676	 * We have now allocated a new chunk.  We copy the tuple keys into the
1677	 * tuple array and copy any referenced key data into the data space
1678	 * following the tuple array.  As we do this, we relocate dttk_value
1679	 * in the final tuple to point to the key data address in the chunk.
1680	 */
1681	kdata = (uintptr_t)&dvar->dtdv_tuple.dtt_key[nkeys];
1682	dvar->dtdv_data = (void *)(kdata + ksize);
1683	dvar->dtdv_tuple.dtt_nkeys = nkeys;
1684
1685	for (i = 0; i < nkeys; i++) {
1686		dtrace_key_t *dkey = &dvar->dtdv_tuple.dtt_key[i];
1687		size_t kesize = key[i].dttk_size;
1688
1689		if (kesize != 0) {
1690			dtrace_bcopy(
1691			    (const void *)(uintptr_t)key[i].dttk_value,
1692			    (void *)kdata, kesize);
1693			dkey->dttk_value = kdata;
1694			kdata += P2ROUNDUP(kesize, sizeof (uint64_t));
1695		} else {
1696			dkey->dttk_value = key[i].dttk_value;
1697		}
1698
1699		dkey->dttk_size = kesize;
1700	}
1701
1702	ASSERT(dvar->dtdv_hashval == DTRACE_DYNHASH_FREE);
1703	dvar->dtdv_hashval = hashval;
1704	dvar->dtdv_next = start;
1705
1706	if (dtrace_casptr(&hash[bucket].dtdh_chain, start, dvar) == start)
1707		return (dvar);
1708
1709	/*
1710	 * The cas has failed.  Either another CPU is adding an element to
1711	 * this hash chain, or another CPU is deleting an element from this
1712	 * hash chain.  The simplest way to deal with both of these cases
1713	 * (though not necessarily the most efficient) is to free our
1714	 * allocated block and tail-call ourselves.  Note that the free is
1715	 * to the dirty list and _not_ to the free list.  This is to prevent
1716	 * races with allocators, above.
1717	 */
1718	dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
1719
1720	dtrace_membar_producer();
1721
1722	do {
1723		free = dcpu->dtdsc_dirty;
1724		dvar->dtdv_next = free;
1725	} while (dtrace_casptr(&dcpu->dtdsc_dirty, free, dvar) != free);
1726
1727	return (dtrace_dynvar(dstate, nkeys, key, dsize, op, mstate, vstate));
1728}
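
/*
 * A rough sketch of the chunk lifecycle that dtrace_dynvar() implements
 * (the authoritative description lives in <sys/dtrace_impl.h>):
 *
 *	free --(ALLOC)--> hash chain --(DEALLOC)--> dirty
 *	dirty --(dtrace_dynvar_clean())--> rinsing --> clean --> free
 *
 * Note that deallocated chunks always go to the dirty list, never directly
 * back to the free list:  a racing walker on another CPU may still be
 * traversing the chunk, and only the dtrace_sync() performed by the cleaning
 * cyclic makes it safe to put the chunk back within the allocator's reach.
 */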
1729
1730/*ARGSUSED*/
1731static void
1732dtrace_aggregate_min(uint64_t *oval, uint64_t nval, uint64_t arg)
1733{
1734	if ((int64_t)nval < (int64_t)*oval)
1735		*oval = nval;
1736}
1737
1738/*ARGSUSED*/
1739static void
1740dtrace_aggregate_max(uint64_t *oval, uint64_t nval, uint64_t arg)
1741{
1742	if ((int64_t)nval > (int64_t)*oval)
1743		*oval = nval;
1744}
1745
1746static void
1747dtrace_aggregate_quantize(uint64_t *quanta, uint64_t nval, uint64_t incr)
1748{
1749	int i, zero = DTRACE_QUANTIZE_ZEROBUCKET;
1750	int64_t val = (int64_t)nval;
1751
1752	if (val < 0) {
1753		for (i = 0; i < zero; i++) {
1754			if (val <= DTRACE_QUANTIZE_BUCKETVAL(i)) {
1755				quanta[i] += incr;
1756				return;
1757			}
1758		}
1759	} else {
1760		for (i = zero + 1; i < DTRACE_QUANTIZE_NBUCKETS; i++) {
1761			if (val < DTRACE_QUANTIZE_BUCKETVAL(i)) {
1762				quanta[i - 1] += incr;
1763				return;
1764			}
1765		}
1766
1767		quanta[DTRACE_QUANTIZE_NBUCKETS - 1] += incr;
1768		return;
1769	}
1770
1771	ASSERT(0);
1772}
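
/*
 * To make the bucketing above concrete:  given the usual power-of-two bucket
 * values (..., -2, -1, 0, 1, 2, 4, 8, ...), aggregating the value 6 walks
 * the non-negative buckets until it finds the first
 * DTRACE_QUANTIZE_BUCKETVAL() strictly greater than 6 (namely 8) and
 * increments the preceding bucket -- the one labelled 4.  Each bucket
 * labelled 2^n thus counts values v with 2^n <= v < 2^(n+1), which is
 * precisely the histogram that quantize() renders.
 */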
1773
1774static void
1775dtrace_aggregate_lquantize(uint64_t *lquanta, uint64_t nval, uint64_t incr)
1776{
1777	uint64_t arg = *lquanta++;
1778	int32_t base = DTRACE_LQUANTIZE_BASE(arg);
1779	uint16_t step = DTRACE_LQUANTIZE_STEP(arg);
1780	uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg);
1781	int32_t val = (int32_t)nval, level;
1782
1783	ASSERT(step != 0);
1784	ASSERT(levels != 0);
1785
1786	if (val < base) {
1787		/*
1788		 * This is an underflow.
1789		 */
1790		lquanta[0] += incr;
1791		return;
1792	}
1793
1794	level = (val - base) / step;
1795
1796	if (level < levels) {
1797		lquanta[level + 1] += incr;
1798		return;
1799	}
1800
1801	/*
1802	 * This is an overflow.
1803	 */
1804	lquanta[levels + 1] += incr;
1805}
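
/*
 * The lquanta[] layout assumed above is:  lquanta[0] counts underflow
 * (values below base); lquanta[n] for 1 <= n <= levels counts values in
 * [base + (n - 1) * step, base + n * step); and lquanta[levels + 1] counts
 * overflow.  As a hypothetical example, an lquantize() with base 0, step 10
 * and 10 levels aggregating the value 37 computes level = 3 and increments
 * lquanta[4] -- the bucket covering [30, 40).
 */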
1806
1807/*ARGSUSED*/
1808static void
1809dtrace_aggregate_avg(uint64_t *data, uint64_t nval, uint64_t arg)
1810{
1811	data[0]++;
1812	data[1] += nval;
1813}
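
/*
 * For avg(), then, data[0] accumulates the count and data[1] the running
 * sum; the actual division is deferred to the consumer at snapshot time,
 * keeping it out of probe context.
 */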
1814
1815/*ARGSUSED*/
1816static void
1817dtrace_aggregate_stddev(uint64_t *data, uint64_t nval, uint64_t arg)
1818{
1819	int64_t snval = (int64_t)nval;
1820	uint64_t tmp[2];
1821
1822	data[0]++;
1823	data[1] += nval;
1824
1825	/*
1826	 * What we want to say here is:
1827	 *
1828	 * data[2] += nval * nval;
1829	 *
1830	 * But given that nval is 64-bit, we could easily overflow, so
1831	 * we do this as 128-bit arithmetic.
1832	 */
1833	if (snval < 0)
1834		snval = -snval;
1835
1836	dtrace_multiply_128((uint64_t)snval, (uint64_t)snval, tmp);
1837	dtrace_add_128(data + 2, tmp, data + 2);
1838}
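
/*
 * The stddev() state is thus four words:  data[0] is the count, data[1] the
 * sum, and data[2]..data[3] the 128-bit sum of squares maintained by
 * dtrace_multiply_128()/dtrace_add_128().  The consumer can later derive the
 * standard deviation as sqrt(avg(x^2) - avg(x)^2), again deferring all
 * expensive arithmetic to snapshot time.
 */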
1839
1840/*ARGSUSED*/
1841static void
1842dtrace_aggregate_count(uint64_t *oval, uint64_t nval, uint64_t arg)
1843{
1844	*oval = *oval + 1;
1845}
1846
1847/*ARGSUSED*/
1848static void
1849dtrace_aggregate_sum(uint64_t *oval, uint64_t nval, uint64_t arg)
1850{
1851	*oval += nval;
1852}
1853
1854/*
1855 * Aggregate given the tuple in the principal data buffer, and the aggregating
1856 * action denoted by the specified dtrace_aggregation_t.  The aggregation
1857 * buffer is specified as the buf parameter.  This routine does not return
1858 * failure; if there is no space in the aggregation buffer, the data will be
1859 * dropped, and a corresponding counter incremented.
1860 */
1861static void
1862dtrace_aggregate(dtrace_aggregation_t *agg, dtrace_buffer_t *dbuf,
1863    intptr_t offset, dtrace_buffer_t *buf, uint64_t expr, uint64_t arg)
1864{
1865	dtrace_recdesc_t *rec = &agg->dtag_action.dta_rec;
1866	uint32_t i, ndx, size, fsize;
1867	uint32_t align = sizeof (uint64_t) - 1;
1868	dtrace_aggbuffer_t *agb;
1869	dtrace_aggkey_t *key;
1870	uint32_t hashval = 0, limit, isstr;
1871	caddr_t tomax, data, kdata;
1872	dtrace_actkind_t action;
1873	dtrace_action_t *act;
1874	uintptr_t offs;
1875
1876	if (buf == NULL)
1877		return;
1878
1879	if (!agg->dtag_hasarg) {
1880		/*
1881		 * Currently, only quantize() and lquantize() take additional
1882		 * arguments, and they have the same semantics:  an increment
1883		 * value that defaults to 1 when not present.  If additional
1884		 * aggregating actions take arguments, the setting of the
1885		 * default argument value will presumably have to become more
1886		 * sophisticated...
1887		 */
1888		arg = 1;
1889	}
1890
1891	action = agg->dtag_action.dta_kind - DTRACEACT_AGGREGATION;
1892	size = rec->dtrd_offset - agg->dtag_base;
1893	fsize = size + rec->dtrd_size;
1894
1895	ASSERT(dbuf->dtb_tomax != NULL);
1896	data = dbuf->dtb_tomax + offset + agg->dtag_base;
1897
1898	if ((tomax = buf->dtb_tomax) == NULL) {
1899		dtrace_buffer_drop(buf);
1900		return;
1901	}
1902
1903	/*
	 * The metastructure always lives at the very end (the highest
	 * addresses) of the buffer.
1905	 */
1906	agb = (dtrace_aggbuffer_t *)(tomax + buf->dtb_size -
1907	    sizeof (dtrace_aggbuffer_t));
1908
1909	if (buf->dtb_offset == 0) {
1910		/*
1911		 * We just kludge up approximately 1/8th of the size to be
1912		 * buckets.  If this guess ends up being routinely
1913		 * off-the-mark, we may need to dynamically readjust this
1914		 * based on past performance.
1915		 */
1916		uintptr_t hashsize = (buf->dtb_size >> 3) / sizeof (uintptr_t);
1917
1918		if ((uintptr_t)agb - hashsize * sizeof (dtrace_aggkey_t *) <
1919		    (uintptr_t)tomax || hashsize == 0) {
1920			/*
1921			 * We've been given a ludicrously small buffer;
1922			 * increment our drop count and leave.
1923			 */
1924			dtrace_buffer_drop(buf);
1925			return;
1926		}
1927
1928		/*
		 * And now, a pathetic attempt to get an odd (or perchance, a
		 * prime) hash size for better hash distribution.
1931		 */
1932		if (hashsize > (DTRACE_AGGHASHSIZE_SLEW << 3))
1933			hashsize -= DTRACE_AGGHASHSIZE_SLEW;
1934
1935		agb->dtagb_hashsize = hashsize;
1936		agb->dtagb_hash = (dtrace_aggkey_t **)((uintptr_t)agb -
1937		    agb->dtagb_hashsize * sizeof (dtrace_aggkey_t *));
1938		agb->dtagb_free = (uintptr_t)agb->dtagb_hash;
1939
1940		for (i = 0; i < agb->dtagb_hashsize; i++)
1941			agb->dtagb_hash[i] = NULL;
1942	}
1943
1944	ASSERT(agg->dtag_first != NULL);
1945	ASSERT(agg->dtag_first->dta_intuple);
1946
1947	/*
1948	 * Calculate the hash value based on the key.  Note that we _don't_
1949	 * include the aggid in the hashing (but we will store it as part of
1950	 * the key).  The hashing algorithm is Bob Jenkins' "One-at-a-time"
1951	 * algorithm: a simple, quick algorithm that has no known funnels, and
1952	 * gets good distribution in practice.  The efficacy of the hashing
1953	 * algorithm (and a comparison with other algorithms) may be found by
1954	 * running the ::dtrace_aggstat MDB dcmd.
1955	 */
1956	for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
1957		i = act->dta_rec.dtrd_offset - agg->dtag_base;
1958		limit = i + act->dta_rec.dtrd_size;
1959		ASSERT(limit <= size);
1960		isstr = DTRACEACT_ISSTRING(act);
1961
1962		for (; i < limit; i++) {
1963			hashval += data[i];
1964			hashval += (hashval << 10);
1965			hashval ^= (hashval >> 6);
1966
1967			if (isstr && data[i] == '\0')
1968				break;
1969		}
1970	}
1971
1972	hashval += (hashval << 3);
1973	hashval ^= (hashval >> 11);
1974	hashval += (hashval << 15);
1975
1976	/*
1977	 * Yes, the divide here is expensive -- but it's generally the least
1978	 * of the performance issues given the amount of data that we iterate
1979	 * over to compute hash values, compare data, etc.
1980	 */
1981	ndx = hashval % agb->dtagb_hashsize;
1982
1983	for (key = agb->dtagb_hash[ndx]; key != NULL; key = key->dtak_next) {
1984		ASSERT((caddr_t)key >= tomax);
1985		ASSERT((caddr_t)key < tomax + buf->dtb_size);
1986
1987		if (hashval != key->dtak_hashval || key->dtak_size != size)
1988			continue;
1989
1990		kdata = key->dtak_data;
1991		ASSERT(kdata >= tomax && kdata < tomax + buf->dtb_size);
1992
1993		for (act = agg->dtag_first; act->dta_intuple;
1994		    act = act->dta_next) {
1995			i = act->dta_rec.dtrd_offset - agg->dtag_base;
1996			limit = i + act->dta_rec.dtrd_size;
1997			ASSERT(limit <= size);
1998			isstr = DTRACEACT_ISSTRING(act);
1999
2000			for (; i < limit; i++) {
2001				if (kdata[i] != data[i])
2002					goto next;
2003
2004				if (isstr && data[i] == '\0')
2005					break;
2006			}
2007		}
2008
2009		if (action != key->dtak_action) {
2010			/*
2011			 * We are aggregating on the same value in the same
2012			 * aggregation with two different aggregating actions.
2013			 * (This should have been picked up in the compiler,
2014			 * so we may be dealing with errant or devious DIF.)
2015			 * This is an error condition; we indicate as much,
2016			 * and return.
2017			 */
2018			DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
2019			return;
2020		}
2021
2022		/*
2023		 * This is a hit:  we need to apply the aggregator to
2024		 * the value at this key.
2025		 */
2026		agg->dtag_aggregate((uint64_t *)(kdata + size), expr, arg);
2027		return;
2028next:
2029		continue;
2030	}
2031
2032	/*
2033	 * We didn't find it.  We need to allocate some zero-filled space,
2034	 * link it into the hash table appropriately, and apply the aggregator
2035	 * to the (zero-filled) value.
2036	 */
2037	offs = buf->dtb_offset;
2038	while (offs & (align - 1))
2039		offs += sizeof (uint32_t);
2040
2041	/*
2042	 * If we don't have enough room to both allocate a new key _and_
2043	 * its associated data, increment the drop count and return.
2044	 */
2045	if ((uintptr_t)tomax + offs + fsize >
2046	    agb->dtagb_free - sizeof (dtrace_aggkey_t)) {
2047		dtrace_buffer_drop(buf);
2048		return;
2049	}
2050
2051	/*CONSTCOND*/
2052	ASSERT(!(sizeof (dtrace_aggkey_t) & (sizeof (uintptr_t) - 1)));
2053	key = (dtrace_aggkey_t *)(agb->dtagb_free - sizeof (dtrace_aggkey_t));
2054	agb->dtagb_free -= sizeof (dtrace_aggkey_t);
2055
2056	key->dtak_data = kdata = tomax + offs;
2057	buf->dtb_offset = offs + fsize;
2058
2059	/*
2060	 * Now copy the data across.
2061	 */
2062	*((dtrace_aggid_t *)kdata) = agg->dtag_id;
2063
2064	for (i = sizeof (dtrace_aggid_t); i < size; i++)
2065		kdata[i] = data[i];
2066
2067	/*
2068	 * Because strings are not zeroed out by default, we need to iterate
2069	 * looking for actions that store strings, and we need to explicitly
2070	 * pad these strings out with zeroes.
2071	 */
2072	for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2073		int nul;
2074
2075		if (!DTRACEACT_ISSTRING(act))
2076			continue;
2077
2078		i = act->dta_rec.dtrd_offset - agg->dtag_base;
2079		limit = i + act->dta_rec.dtrd_size;
2080		ASSERT(limit <= size);
2081
2082		for (nul = 0; i < limit; i++) {
2083			if (nul) {
2084				kdata[i] = '\0';
2085				continue;
2086			}
2087
2088			if (data[i] != '\0')
2089				continue;
2090
2091			nul = 1;
2092		}
2093	}
2094
2095	for (i = size; i < fsize; i++)
2096		kdata[i] = 0;
2097
2098	key->dtak_hashval = hashval;
2099	key->dtak_size = size;
2100	key->dtak_action = action;
2101	key->dtak_next = agb->dtagb_hash[ndx];
2102	agb->dtagb_hash[ndx] = key;
2103
2104	/*
2105	 * Finally, apply the aggregator.
2106	 */
2107	*((uint64_t *)(key->dtak_data + size)) = agg->dtag_initial;
2108	agg->dtag_aggregate((uint64_t *)(key->dtak_data + size), expr, arg);
2109}
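
/*
 * A rough picture of the aggregation buffer geometry that dtrace_aggregate()
 * maintains (proportions illustrative):
 *
 *	tomax                                           tomax + dtb_size
 *	+-------------------+ ...free... +------+---------+--------------+
 *	| key data records  |            | keys | dtagb_  | dtrace_agg-  |
 *	| (dtb_offset grows |            | grow | hash[]  | buffer_t     |
 *	| upward)           |            | down |         |              |
 *	+-------------------+ ...free... +------+---------+--------------+
 *	                                 ^ dtagb_free
 *
 * Key data is allocated from the front of the buffer while dtrace_aggkey_t
 * structures are carved off the back; the buffer is exhausted when the two
 * would meet, at which point the drop count is bumped via
 * dtrace_buffer_drop().
 */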
2110
2111/*
2112 * Given consumer state, this routine finds a speculation in the INACTIVE
2113 * state and transitions it into the ACTIVE state.  If there is no speculation
2114 * in the INACTIVE state, 0 is returned.  In this case, no error counter is
2115 * incremented -- it is up to the caller to take appropriate action.
2116 */
2117static int
2118dtrace_speculation(dtrace_state_t *state)
2119{
2120	int i = 0;
2121	dtrace_speculation_state_t current;
2122	uint32_t *stat = &state->dts_speculations_unavail, count;
2123
2124	while (i < state->dts_nspeculations) {
2125		dtrace_speculation_t *spec = &state->dts_speculations[i];
2126
2127		current = spec->dtsp_state;
2128
2129		if (current != DTRACESPEC_INACTIVE) {
2130			if (current == DTRACESPEC_COMMITTINGMANY ||
2131			    current == DTRACESPEC_COMMITTING ||
2132			    current == DTRACESPEC_DISCARDING)
2133				stat = &state->dts_speculations_busy;
2134			i++;
2135			continue;
2136		}
2137
2138		if (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2139		    current, DTRACESPEC_ACTIVE) == current)
2140			return (i + 1);
2141	}
2142
2143	/*
2144	 * We couldn't find a speculation.  If we found as much as a single
2145	 * busy speculation buffer, we'll attribute this failure as "busy"
2146	 * instead of "unavail".
2147	 */
2148	do {
2149		count = *stat;
2150	} while (dtrace_cas32(stat, count, count + 1) != count);
2151
2152	return (0);
2153}
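
/*
 * From D, the speculation life cycle implemented by dtrace_speculation()
 * above and the commit/discard/clean routines below looks roughly like the
 * following (an illustrative script; the probes and predicates are
 * hypothetical):
 *
 *	syscall::open:entry
 *	{
 *		self->spec = speculation();
 *		speculate(self->spec);
 *		printf("opening %s", copyinstr(arg0));
 *	}
 *
 *	syscall::open:return
 *	/self->spec && arg1 == -1/
 *	{
 *		commit(self->spec);
 *		self->spec = 0;
 *	}
 *
 *	syscall::open:return
 *	/self->spec/
 *	{
 *		discard(self->spec);
 *		self->spec = 0;
 *	}
 *
 * speculation() lands in dtrace_speculation() above; speculate() obtains its
 * buffer via dtrace_speculation_buffer(); and commit()/discard() map to
 * dtrace_speculation_commit()/dtrace_speculation_discard() below.
 */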
2154
2155/*
2156 * This routine commits an active speculation.  If the specified speculation
2157 * is not in a valid state to perform a commit(), this routine will silently do
2158 * nothing.  The state of the specified speculation is transitioned according
 * to the state transition diagram outlined in <sys/dtrace_impl.h>.
2160 */
2161static void
2162dtrace_speculation_commit(dtrace_state_t *state, processorid_t cpu,
2163    dtrace_specid_t which)
2164{
2165	dtrace_speculation_t *spec;
2166	dtrace_buffer_t *src, *dest;
2167	uintptr_t daddr, saddr, dlimit;
2168	dtrace_speculation_state_t current, new;
2169	intptr_t offs;
2170
2171	if (which == 0)
2172		return;
2173
2174	if (which > state->dts_nspeculations) {
2175		cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2176		return;
2177	}
2178
2179	spec = &state->dts_speculations[which - 1];
2180	src = &spec->dtsp_buffer[cpu];
2181	dest = &state->dts_buffer[cpu];
2182
2183	do {
2184		current = spec->dtsp_state;
2185
2186		if (current == DTRACESPEC_COMMITTINGMANY)
2187			break;
2188
2189		switch (current) {
2190		case DTRACESPEC_INACTIVE:
2191		case DTRACESPEC_DISCARDING:
2192			return;
2193
2194		case DTRACESPEC_COMMITTING:
2195			/*
2196			 * This is only possible if we are (a) commit()'ing
2197			 * without having done a prior speculate() on this CPU
2198			 * and (b) racing with another commit() on a different
2199			 * CPU.  There's nothing to do -- we just assert that
2200			 * our offset is 0.
2201			 */
2202			ASSERT(src->dtb_offset == 0);
2203			return;
2204
2205		case DTRACESPEC_ACTIVE:
2206			new = DTRACESPEC_COMMITTING;
2207			break;
2208
2209		case DTRACESPEC_ACTIVEONE:
2210			/*
2211			 * This speculation is active on one CPU.  If our
2212			 * buffer offset is non-zero, we know that the one CPU
2213			 * must be us.  Otherwise, we are committing on a
2214			 * different CPU from the speculate(), and we must
2215			 * rely on being asynchronously cleaned.
2216			 */
2217			if (src->dtb_offset != 0) {
2218				new = DTRACESPEC_COMMITTING;
2219				break;
2220			}
2221			/*FALLTHROUGH*/
2222
2223		case DTRACESPEC_ACTIVEMANY:
2224			new = DTRACESPEC_COMMITTINGMANY;
2225			break;
2226
2227		default:
2228			ASSERT(0);
2229		}
2230	} while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2231	    current, new) != current);
2232
2233	/*
2234	 * We have set the state to indicate that we are committing this
2235	 * speculation.  Now reserve the necessary space in the destination
2236	 * buffer.
2237	 */
2238	if ((offs = dtrace_buffer_reserve(dest, src->dtb_offset,
2239	    sizeof (uint64_t), state, NULL)) < 0) {
2240		dtrace_buffer_drop(dest);
2241		goto out;
2242	}
2243
2244	/*
2245	 * We have the space; copy the buffer across.  (Note that this is a
 * highly suboptimal bcopy(); in the unlikely event that this becomes
2247	 * a serious performance issue, a high-performance DTrace-specific
2248	 * bcopy() should obviously be invented.)
2249	 */
2250	daddr = (uintptr_t)dest->dtb_tomax + offs;
2251	dlimit = daddr + src->dtb_offset;
2252	saddr = (uintptr_t)src->dtb_tomax;
2253
2254	/*
2255	 * First, the aligned portion.
2256	 */
2257	while (dlimit - daddr >= sizeof (uint64_t)) {
2258		*((uint64_t *)daddr) = *((uint64_t *)saddr);
2259
2260		daddr += sizeof (uint64_t);
2261		saddr += sizeof (uint64_t);
2262	}
2263
2264	/*
2265	 * Now any left-over bit...
2266	 */
2267	while (dlimit - daddr)
2268		*((uint8_t *)daddr++) = *((uint8_t *)saddr++);
2269
2270	/*
2271	 * Finally, commit the reserved space in the destination buffer.
2272	 */
2273	dest->dtb_offset = offs + src->dtb_offset;
2274
2275out:
2276	/*
2277	 * If we're lucky enough to be the only active CPU on this speculation
2278	 * buffer, we can just set the state back to DTRACESPEC_INACTIVE.
2279	 */
2280	if (current == DTRACESPEC_ACTIVE ||
2281	    (current == DTRACESPEC_ACTIVEONE && new == DTRACESPEC_COMMITTING)) {
2282		uint32_t rval = dtrace_cas32((uint32_t *)&spec->dtsp_state,
2283		    DTRACESPEC_COMMITTING, DTRACESPEC_INACTIVE);
2284
2285		ASSERT(rval == DTRACESPEC_COMMITTING);
2286	}
2287
2288	src->dtb_offset = 0;
2289	src->dtb_xamot_drops += src->dtb_drops;
2290	src->dtb_drops = 0;
2291}
2292
2293/*
2294 * This routine discards an active speculation.  If the specified speculation
2295 * is not in a valid state to perform a discard(), this routine will silently
2296 * do nothing.  The state of the specified speculation is transitioned
 * according to the state transition diagram outlined in <sys/dtrace_impl.h>.
2298 */
2299static void
2300dtrace_speculation_discard(dtrace_state_t *state, processorid_t cpu,
2301    dtrace_specid_t which)
2302{
2303	dtrace_speculation_t *spec;
2304	dtrace_speculation_state_t current, new;
2305	dtrace_buffer_t *buf;
2306
2307	if (which == 0)
2308		return;
2309
2310	if (which > state->dts_nspeculations) {
2311		cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2312		return;
2313	}
2314
2315	spec = &state->dts_speculations[which - 1];
2316	buf = &spec->dtsp_buffer[cpu];
2317
2318	do {
2319		current = spec->dtsp_state;
2320
2321		switch (current) {
2322		case DTRACESPEC_INACTIVE:
2323		case DTRACESPEC_COMMITTINGMANY:
2324		case DTRACESPEC_COMMITTING:
2325		case DTRACESPEC_DISCARDING:
2326			return;
2327
2328		case DTRACESPEC_ACTIVE:
2329		case DTRACESPEC_ACTIVEMANY:
2330			new = DTRACESPEC_DISCARDING;
2331			break;
2332
2333		case DTRACESPEC_ACTIVEONE:
2334			if (buf->dtb_offset != 0) {
2335				new = DTRACESPEC_INACTIVE;
2336			} else {
2337				new = DTRACESPEC_DISCARDING;
2338			}
2339			break;
2340
2341		default:
2342			ASSERT(0);
2343		}
2344	} while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2345	    current, new) != current);
2346
2347	buf->dtb_offset = 0;
2348	buf->dtb_drops = 0;
2349}
2350
2351/*
2352 * Note:  not called from probe context.  This function is called
2353 * asynchronously from cross call context to clean any speculations that are
2354 * in the COMMITTINGMANY or DISCARDING states.  These speculations may not be
2355 * transitioned back to the INACTIVE state until all CPUs have cleaned the
2356 * speculation.
2357 */
2358static void
2359dtrace_speculation_clean_here(dtrace_state_t *state)
2360{
2361	dtrace_icookie_t cookie;
2362	processorid_t cpu = CPU->cpu_id;
2363	dtrace_buffer_t *dest = &state->dts_buffer[cpu];
2364	dtrace_specid_t i;
2365
2366	cookie = dtrace_interrupt_disable();
2367
2368	if (dest->dtb_tomax == NULL) {
2369		dtrace_interrupt_enable(cookie);
2370		return;
2371	}
2372
2373	for (i = 0; i < state->dts_nspeculations; i++) {
2374		dtrace_speculation_t *spec = &state->dts_speculations[i];
2375		dtrace_buffer_t *src = &spec->dtsp_buffer[cpu];
2376
2377		if (src->dtb_tomax == NULL)
2378			continue;
2379
2380		if (spec->dtsp_state == DTRACESPEC_DISCARDING) {
2381			src->dtb_offset = 0;
2382			continue;
2383		}
2384
2385		if (spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
2386			continue;
2387
2388		if (src->dtb_offset == 0)
2389			continue;
2390
2391		dtrace_speculation_commit(state, cpu, i + 1);
2392	}
2393
2394	dtrace_interrupt_enable(cookie);
2395}
2396
2397/*
2398 * Note:  not called from probe context.  This function is called
2399 * asynchronously (and at a regular interval) to clean any speculations that
2400 * are in the COMMITTINGMANY or DISCARDING states.  If it discovers that there
2401 * is work to be done, it cross calls all CPUs to perform that work;
 * COMMITTINGMANY and DISCARDING speculations may not be transitioned back
 * to the INACTIVE state until they have been cleaned by all CPUs.
2404 */
2405static void
2406dtrace_speculation_clean(dtrace_state_t *state)
2407{
2408	int work = 0, rv;
2409	dtrace_specid_t i;
2410
2411	for (i = 0; i < state->dts_nspeculations; i++) {
2412		dtrace_speculation_t *spec = &state->dts_speculations[i];
2413
2414		ASSERT(!spec->dtsp_cleaning);
2415
2416		if (spec->dtsp_state != DTRACESPEC_DISCARDING &&
2417		    spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
2418			continue;
2419
2420		work++;
2421		spec->dtsp_cleaning = 1;
2422	}
2423
2424	if (!work)
2425		return;
2426
2427	dtrace_xcall(DTRACE_CPUALL,
2428	    (dtrace_xcall_t)dtrace_speculation_clean_here, state);
2429
2430	/*
2431	 * We now know that all CPUs have committed or discarded their
2432	 * speculation buffers, as appropriate.  We can now set the state
2433	 * to inactive.
2434	 */
2435	for (i = 0; i < state->dts_nspeculations; i++) {
2436		dtrace_speculation_t *spec = &state->dts_speculations[i];
2437		dtrace_speculation_state_t current, new;
2438
2439		if (!spec->dtsp_cleaning)
2440			continue;
2441
2442		current = spec->dtsp_state;
2443		ASSERT(current == DTRACESPEC_DISCARDING ||
2444		    current == DTRACESPEC_COMMITTINGMANY);
2445
2446		new = DTRACESPEC_INACTIVE;
2447
2448		rv = dtrace_cas32((uint32_t *)&spec->dtsp_state, current, new);
2449		ASSERT(rv == current);
2450		spec->dtsp_cleaning = 0;
2451	}
2452}
2453
2454/*
2455 * Called as part of a speculate() to get the speculative buffer associated
2456 * with a given speculation.  Returns NULL if the specified speculation is not
2457 * in an ACTIVE state.  If the speculation is in the ACTIVEONE state -- and
2458 * the active CPU is not the specified CPU -- the speculation will be
2459 * atomically transitioned into the ACTIVEMANY state.
2460 */
2461static dtrace_buffer_t *
2462dtrace_speculation_buffer(dtrace_state_t *state, processorid_t cpuid,
2463    dtrace_specid_t which)
2464{
2465	dtrace_speculation_t *spec;
2466	dtrace_speculation_state_t current, new;
2467	dtrace_buffer_t *buf;
2468
2469	if (which == 0)
2470		return (NULL);
2471
2472	if (which > state->dts_nspeculations) {
2473		cpu_core[cpuid].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2474		return (NULL);
2475	}
2476
2477	spec = &state->dts_speculations[which - 1];
2478	buf = &spec->dtsp_buffer[cpuid];
2479
2480	do {
2481		current = spec->dtsp_state;
2482
2483		switch (current) {
2484		case DTRACESPEC_INACTIVE:
2485		case DTRACESPEC_COMMITTINGMANY:
2486		case DTRACESPEC_DISCARDING:
2487			return (NULL);
2488
2489		case DTRACESPEC_COMMITTING:
2490			ASSERT(buf->dtb_offset == 0);
2491			return (NULL);
2492
2493		case DTRACESPEC_ACTIVEONE:
2494			/*
2495			 * This speculation is currently active on one CPU.
2496			 * Check the offset in the buffer; if it's non-zero,
2497			 * that CPU must be us (and we leave the state alone).
2498			 * If it's zero, assume that we're starting on a new
2499			 * CPU -- and change the state to indicate that the
2500			 * speculation is active on more than one CPU.
2501			 */
2502			if (buf->dtb_offset != 0)
2503				return (buf);
2504
2505			new = DTRACESPEC_ACTIVEMANY;
2506			break;
2507
2508		case DTRACESPEC_ACTIVEMANY:
2509			return (buf);
2510
2511		case DTRACESPEC_ACTIVE:
2512			new = DTRACESPEC_ACTIVEONE;
2513			break;
2514
2515		default:
2516			ASSERT(0);
2517		}
2518	} while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2519	    current, new) != current);
2520
2521	ASSERT(new == DTRACESPEC_ACTIVEONE || new == DTRACESPEC_ACTIVEMANY);
2522	return (buf);
2523}
2524
2525/*
2526 * Return a string.  In the event that the user lacks the privilege to access
2527 * arbitrary kernel memory, we copy the string out to scratch memory so that we
2528 * don't fail access checking.
2529 *
2530 * dtrace_dif_variable() uses this routine as a helper for various
2531 * builtin values such as 'execname' and 'probefunc.'
2532 */
2533uintptr_t
2534dtrace_dif_varstr(uintptr_t addr, dtrace_state_t *state,
2535    dtrace_mstate_t *mstate)
2536{
2537	uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
2538	uintptr_t ret;
2539	size_t strsz;
2540
2541	/*
2542	 * The easy case: this probe is allowed to read all of memory, so
2543	 * we can just return this as a vanilla pointer.
2544	 */
2545	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
2546		return (addr);
2547
2548	/*
2549	 * This is the tougher case: we copy the string in question from
2550	 * kernel memory into scratch memory and return it that way: this
2551	 * ensures that we won't trip up when access checking tests the
2552	 * BYREF return value.
2553	 */
2554	strsz = dtrace_strlen((char *)addr, size) + 1;
2555
2556	if (mstate->dtms_scratch_ptr + strsz >
2557	    mstate->dtms_scratch_base + mstate->dtms_scratch_size) {
2558		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
2559		return (NULL);
2560	}
2561
2562	dtrace_strcpy((const void *)addr, (void *)mstate->dtms_scratch_ptr,
2563	    strsz);
2564	ret = mstate->dtms_scratch_ptr;
2565	mstate->dtms_scratch_ptr += strsz;
2566	return (ret);
2567}
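
/*
 * (For instance, with the default 256-byte strsize, handing back a short
 * string like "sched" to an unprivileged consumer copies only strlen + 1
 * bytes into scratch and advances dtms_scratch_ptr by that amount -- not by
 * the full strsize -- so short strings don't exhaust scratch space
 * prematurely.)
 */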
2568
2569/*
2570 * This function implements the DIF emulator's variable lookups.  The emulator
2571 * passes a reserved variable identifier and optional built-in array index.
2572 */
2573static uint64_t
2574dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v,
2575    uint64_t ndx)
2576{
2577	/*
2578	 * If we're accessing one of the uncached arguments, we'll turn this
2579	 * into a reference in the args array.
2580	 */
2581	if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9) {
2582		ndx = v - DIF_VAR_ARG0;
2583		v = DIF_VAR_ARGS;
2584	}
2585
2586	switch (v) {
2587	case DIF_VAR_ARGS:
2588		ASSERT(mstate->dtms_present & DTRACE_MSTATE_ARGS);
2589		if (ndx >= sizeof (mstate->dtms_arg) /
2590		    sizeof (mstate->dtms_arg[0])) {
2591			int aframes = mstate->dtms_probe->dtpr_aframes + 2;
2592			dtrace_provider_t *pv;
2593			uint64_t val;
2594
2595			pv = mstate->dtms_probe->dtpr_provider;
2596			if (pv->dtpv_pops.dtps_getargval != NULL)
2597				val = pv->dtpv_pops.dtps_getargval(pv->dtpv_arg,
2598				    mstate->dtms_probe->dtpr_id,
2599				    mstate->dtms_probe->dtpr_arg, ndx, aframes);
2600			else
2601				val = dtrace_getarg(ndx, aframes);
2602
2603			/*
2604			 * This is regrettably required to keep the compiler
2605			 * from tail-optimizing the call to dtrace_getarg().
2606			 * The condition always evaluates to true, but the
2607			 * compiler has no way of figuring that out a priori.
2608			 * (None of this would be necessary if the compiler
2609			 * could be relied upon to _always_ tail-optimize
2610			 * the call to dtrace_getarg() -- but it can't.)
2611			 */
2612			if (mstate->dtms_probe != NULL)
2613				return (val);
2614
2615			ASSERT(0);
2616		}
2617
2618		return (mstate->dtms_arg[ndx]);
2619
2620	case DIF_VAR_UREGS: {
2621		klwp_t *lwp;
2622
2623		if (!dtrace_priv_proc(state))
2624			return (0);
2625
2626		if ((lwp = curthread->t_lwp) == NULL) {
2627			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
2628			cpu_core[CPU->cpu_id].cpuc_dtrace_illval = NULL;
2629			return (0);
2630		}
2631
2632		return (dtrace_getreg(lwp->lwp_regs, ndx));
2633	}
2634
2635	case DIF_VAR_CURTHREAD:
2636		if (!dtrace_priv_kernel(state))
2637			return (0);
2638		return ((uint64_t)(uintptr_t)curthread);
2639
2640	case DIF_VAR_TIMESTAMP:
2641		if (!(mstate->dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
2642			mstate->dtms_timestamp = dtrace_gethrtime();
2643			mstate->dtms_present |= DTRACE_MSTATE_TIMESTAMP;
2644		}
2645		return (mstate->dtms_timestamp);
2646
2647	case DIF_VAR_VTIMESTAMP:
2648		ASSERT(dtrace_vtime_references != 0);
2649		return (curthread->t_dtrace_vtime);
2650
2651	case DIF_VAR_WALLTIMESTAMP:
2652		if (!(mstate->dtms_present & DTRACE_MSTATE_WALLTIMESTAMP)) {
2653			mstate->dtms_walltimestamp = dtrace_gethrestime();
2654			mstate->dtms_present |= DTRACE_MSTATE_WALLTIMESTAMP;
2655		}
2656		return (mstate->dtms_walltimestamp);
2657
2658	case DIF_VAR_IPL:
2659		if (!dtrace_priv_kernel(state))
2660			return (0);
2661		if (!(mstate->dtms_present & DTRACE_MSTATE_IPL)) {
2662			mstate->dtms_ipl = dtrace_getipl();
2663			mstate->dtms_present |= DTRACE_MSTATE_IPL;
2664		}
2665		return (mstate->dtms_ipl);
2666
2667	case DIF_VAR_EPID:
2668		ASSERT(mstate->dtms_present & DTRACE_MSTATE_EPID);
2669		return (mstate->dtms_epid);
2670
2671	case DIF_VAR_ID:
2672		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
2673		return (mstate->dtms_probe->dtpr_id);
2674
2675	case DIF_VAR_STACKDEPTH:
2676		if (!dtrace_priv_kernel(state))
2677			return (0);
2678		if (!(mstate->dtms_present & DTRACE_MSTATE_STACKDEPTH)) {
2679			int aframes = mstate->dtms_probe->dtpr_aframes + 2;
2680
2681			mstate->dtms_stackdepth = dtrace_getstackdepth(aframes);
2682			mstate->dtms_present |= DTRACE_MSTATE_STACKDEPTH;
2683		}
2684		return (mstate->dtms_stackdepth);
2685
2686	case DIF_VAR_USTACKDEPTH:
2687		if (!dtrace_priv_proc(state))
2688			return (0);
2689		if (!(mstate->dtms_present & DTRACE_MSTATE_USTACKDEPTH)) {
2690			/*
2691			 * See comment in DIF_VAR_PID.
2692			 */
2693			if (DTRACE_ANCHORED(mstate->dtms_probe) &&
2694			    CPU_ON_INTR(CPU)) {
2695				mstate->dtms_ustackdepth = 0;
2696			} else {
2697				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
2698				mstate->dtms_ustackdepth =
2699				    dtrace_getustackdepth();
2700				DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
2701			}
2702			mstate->dtms_present |= DTRACE_MSTATE_USTACKDEPTH;
2703		}
2704		return (mstate->dtms_ustackdepth);
2705
2706	case DIF_VAR_CALLER:
2707		if (!dtrace_priv_kernel(state))
2708			return (0);
2709		if (!(mstate->dtms_present & DTRACE_MSTATE_CALLER)) {
2710			int aframes = mstate->dtms_probe->dtpr_aframes + 2;
2711
2712			if (!DTRACE_ANCHORED(mstate->dtms_probe)) {
2713				/*
2714				 * If this is an unanchored probe, we are
2715				 * required to go through the slow path:
2716				 * dtrace_caller() only guarantees correct
2717				 * results for anchored probes.
2718				 */
2719				pc_t caller[2];
2720
2721				dtrace_getpcstack(caller, 2, aframes,
2722				    (uint32_t *)(uintptr_t)mstate->dtms_arg[0]);
2723				mstate->dtms_caller = caller[1];
2724			} else if ((mstate->dtms_caller =
2725			    dtrace_caller(aframes)) == -1) {
2726				/*
2727				 * We have failed to do this the quick way;
2728				 * we must resort to the slower approach of
2729				 * calling dtrace_getpcstack().
2730				 */
2731				pc_t caller;
2732
2733				dtrace_getpcstack(&caller, 1, aframes, NULL);
2734				mstate->dtms_caller = caller;
2735			}
2736
2737			mstate->dtms_present |= DTRACE_MSTATE_CALLER;
2738		}
2739		return (mstate->dtms_caller);
2740
2741	case DIF_VAR_UCALLER:
2742		if (!dtrace_priv_proc(state))
2743			return (0);
2744
2745		if (!(mstate->dtms_present & DTRACE_MSTATE_UCALLER)) {
2746			uint64_t ustack[3];
2747
2748			/*
2749			 * dtrace_getupcstack() fills in the first uint64_t
2750			 * with the current PID.  The second uint64_t will
2751			 * be the program counter at user-level.  The third
2752			 * uint64_t will contain the caller, which is what
2753			 * we're after.
2754			 */
2755			ustack[2] = NULL;
2756			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
2757			dtrace_getupcstack(ustack, 3);
2758			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
2759			mstate->dtms_ucaller = ustack[2];
2760			mstate->dtms_present |= DTRACE_MSTATE_UCALLER;
2761		}
2762
2763		return (mstate->dtms_ucaller);
2764
2765	case DIF_VAR_PROBEPROV:
2766		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
2767		return (dtrace_dif_varstr(
2768		    (uintptr_t)mstate->dtms_probe->dtpr_provider->dtpv_name,
2769		    state, mstate));
2770
2771	case DIF_VAR_PROBEMOD:
2772		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
2773		return (dtrace_dif_varstr(
2774		    (uintptr_t)mstate->dtms_probe->dtpr_mod,
2775		    state, mstate));
2776
2777	case DIF_VAR_PROBEFUNC:
2778		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
2779		return (dtrace_dif_varstr(
2780		    (uintptr_t)mstate->dtms_probe->dtpr_func,
2781		    state, mstate));
2782
2783	case DIF_VAR_PROBENAME:
2784		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
2785		return (dtrace_dif_varstr(
2786		    (uintptr_t)mstate->dtms_probe->dtpr_name,
2787		    state, mstate));
2788
2789	case DIF_VAR_PID:
2790		if (!dtrace_priv_proc(state))
2791			return (0);
2792
2793		/*
2794		 * Note that we are assuming that an unanchored probe is
2795		 * always due to a high-level interrupt.  (And we're assuming
		 * that there is only a single high-level interrupt.)
2797		 */
2798		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
2799			return (pid0.pid_id);
2800
2801		/*
2802		 * It is always safe to dereference one's own t_procp pointer:
2803		 * it always points to a valid, allocated proc structure.
2804		 * Further, it is always safe to dereference the p_pidp member
		 * of one's own proc structure.  (These are truisms because
2806		 * threads and processes don't clean up their own state --
2807		 * they leave that task to whomever reaps them.)
2808		 */
2809		return ((uint64_t)curthread->t_procp->p_pidp->pid_id);
2810
2811	case DIF_VAR_PPID:
2812		if (!dtrace_priv_proc(state))
2813			return (0);
2814
2815		/*
2816		 * See comment in DIF_VAR_PID.
2817		 */
2818		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
2819			return (pid0.pid_id);
2820
2821		/*
2822		 * It is always safe to dereference one's own t_procp pointer:
2823		 * it always points to a valid, allocated proc structure.
2824		 * (This is true because threads don't clean up their own
2825		 * state -- they leave that task to whomever reaps them.)
2826		 */
2827		return ((uint64_t)curthread->t_procp->p_ppid);
2828
2829	case DIF_VAR_TID:
2830		/*
2831		 * See comment in DIF_VAR_PID.
2832		 */
2833		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
2834			return (0);
2835
2836		return ((uint64_t)curthread->t_tid);
2837
2838	case DIF_VAR_EXECNAME:
2839		if (!dtrace_priv_proc(state))
2840			return (0);
2841
2842		/*
2843		 * See comment in DIF_VAR_PID.
2844		 */
2845		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
2846			return ((uint64_t)(uintptr_t)p0.p_user.u_comm);
2847
2848		/*
2849		 * It is always safe to dereference one's own t_procp pointer:
2850		 * it always points to a valid, allocated proc structure.
2851		 * (This is true because threads don't clean up their own
2852		 * state -- they leave that task to whomever reaps them.)
2853		 */
2854		return (dtrace_dif_varstr(
2855		    (uintptr_t)curthread->t_procp->p_user.u_comm,
2856		    state, mstate));
2857
2858	case DIF_VAR_ZONENAME:
2859		if (!dtrace_priv_proc(state))
2860			return (0);
2861
2862		/*
2863		 * See comment in DIF_VAR_PID.
2864		 */
2865		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
2866			return ((uint64_t)(uintptr_t)p0.p_zone->zone_name);
2867
2868		/*
2869		 * It is always safe to dereference one's own t_procp pointer:
2870		 * it always points to a valid, allocated proc structure.
2871		 * (This is true because threads don't clean up their own
2872		 * state -- they leave that task to whomever reaps them.)
2873		 */
2874		return (dtrace_dif_varstr(
2875		    (uintptr_t)curthread->t_procp->p_zone->zone_name,
2876		    state, mstate));
2877
2878	case DIF_VAR_UID:
2879		if (!dtrace_priv_proc(state))
2880			return (0);
2881
2882		/*
2883		 * See comment in DIF_VAR_PID.
2884		 */
2885		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
2886			return ((uint64_t)p0.p_cred->cr_uid);
2887
2888		/*
2889		 * It is always safe to dereference one's own t_procp pointer:
2890		 * it always points to a valid, allocated proc structure.
2891		 * (This is true because threads don't clean up their own
2892		 * state -- they leave that task to whomever reaps them.)
2893		 *
2894		 * Additionally, it is safe to dereference one's own process
2895		 * credential, since this is never NULL after process birth.
2896		 */
2897		return ((uint64_t)curthread->t_procp->p_cred->cr_uid);
2898
2899	case DIF_VAR_GID:
2900		if (!dtrace_priv_proc(state))
2901			return (0);
2902
2903		/*
2904		 * See comment in DIF_VAR_PID.
2905		 */
2906		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
2907			return ((uint64_t)p0.p_cred->cr_gid);
2908
2909		/*
2910		 * It is always safe to dereference one's own t_procp pointer:
2911		 * it always points to a valid, allocated proc structure.
2912		 * (This is true because threads don't clean up their own
2913		 * state -- they leave that task to whomever reaps them.)
2914		 *
2915		 * Additionally, it is safe to dereference one's own process
2916		 * credential, since this is never NULL after process birth.
2917		 */
2918		return ((uint64_t)curthread->t_procp->p_cred->cr_gid);
2919
2920	case DIF_VAR_ERRNO: {
2921		klwp_t *lwp;
2922		if (!dtrace_priv_proc(state))
2923			return (0);
2924
2925		/*
2926		 * See comment in DIF_VAR_PID.
2927		 */
2928		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
2929			return (0);
2930
2931		/*
2932		 * It is always safe to dereference one's own t_lwp pointer in
2933		 * the event that this pointer is non-NULL.  (This is true
2934		 * because threads and lwps don't clean up their own state --
2935		 * they leave that task to whomever reaps them.)
2936		 */
2937		if ((lwp = curthread->t_lwp) == NULL)
2938			return (0);
2939
2940		return ((uint64_t)lwp->lwp_errno);
2941	}
2942	default:
2943		DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
2944		return (0);
2945	}
2946}
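
/*
 * As a rough illustration of how control reaches dtrace_dif_variable():  a
 * D action such as
 *
 *	trace(execname);
 *
 * compiles to DIF that loads the reserved global-variable identifier
 * DIF_VAR_EXECNAME, and the emulator resolves that load by calling
 * dtrace_dif_variable(mstate, state, DIF_VAR_EXECNAME, 0).
 */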
2947
2948/*
 * Emulate the execution of DIF subroutines invoked by the call opcode.
 * Notice that we don't bother validating the proper number of arguments or
 * their types in the tuple stack.  This isn't needed:  all argument
 * interpretation is safe because of our load safety -- the worst that can
 * happen is that a bogus program obtains bogus results.
2954 */
2955static void
2956dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs,
2957    dtrace_key_t *tupregs, int nargs,
2958    dtrace_mstate_t *mstate, dtrace_state_t *state)
2959{
2960	volatile uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
2961	volatile uintptr_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
2962	dtrace_vstate_t *vstate = &state->dts_vstate;
2963
2964	union {
2965		mutex_impl_t mi;
2966		uint64_t mx;
2967	} m;
2968
2969	union {
2970		krwlock_t ri;
2971		uintptr_t rw;
2972	} r;
2973
2974	switch (subr) {
2975	case DIF_SUBR_RAND:
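		/*
		 * This is a quick linear-congruential-style scramble of the
		 * high-resolution timestamp:  cheap and safe in probe
		 * context, but with no pretense of statistical (much less
		 * cryptographic) quality.
		 */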
2976		regs[rd] = (dtrace_gethrtime() * 2416 + 374441) % 1771875;
2977		break;
2978
2979	case DIF_SUBR_MUTEX_OWNED:
2980		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
2981		    mstate, vstate)) {
2982			regs[rd] = NULL;
2983			break;
2984		}
2985
2986		m.mx = dtrace_load64(tupregs[0].dttk_value);
2987		if (MUTEX_TYPE_ADAPTIVE(&m.mi))
2988			regs[rd] = MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER;
2989		else
2990			regs[rd] = LOCK_HELD(&m.mi.m_spin.m_spinlock);
2991		break;
2992
2993	case DIF_SUBR_MUTEX_OWNER:
2994		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
2995		    mstate, vstate)) {
2996			regs[rd] = NULL;
2997			break;
2998		}
2999
3000		m.mx = dtrace_load64(tupregs[0].dttk_value);
3001		if (MUTEX_TYPE_ADAPTIVE(&m.mi) &&
3002		    MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER)
3003			regs[rd] = (uintptr_t)MUTEX_OWNER(&m.mi);
3004		else
3005			regs[rd] = 0;
3006		break;
3007
3008	case DIF_SUBR_MUTEX_TYPE_ADAPTIVE:
3009		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
3010		    mstate, vstate)) {
3011			regs[rd] = NULL;
3012			break;
3013		}
3014
3015		m.mx = dtrace_load64(tupregs[0].dttk_value);
3016		regs[rd] = MUTEX_TYPE_ADAPTIVE(&m.mi);
3017		break;
3018
3019	case DIF_SUBR_MUTEX_TYPE_SPIN:
3020		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
3021		    mstate, vstate)) {
3022			regs[rd] = NULL;
3023			break;
3024		}
3025
3026		m.mx = dtrace_load64(tupregs[0].dttk_value);
3027		regs[rd] = MUTEX_TYPE_SPIN(&m.mi);
3028		break;
3029
3030	case DIF_SUBR_RW_READ_HELD: {
3031		uintptr_t tmp;
3032
3033		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
3034		    mstate, vstate)) {
3035			regs[rd] = NULL;
3036			break;
3037		}
3038
3039		r.rw = dtrace_loadptr(tupregs[0].dttk_value);
3040		regs[rd] = _RW_READ_HELD(&r.ri, tmp);
3041		break;
3042	}
3043
3044	case DIF_SUBR_RW_WRITE_HELD:
3045		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
3046		    mstate, vstate)) {
3047			regs[rd] = NULL;
3048			break;
3049		}
3050
3051		r.rw = dtrace_loadptr(tupregs[0].dttk_value);
3052		regs[rd] = _RW_WRITE_HELD(&r.ri);
3053		break;
3054
3055	case DIF_SUBR_RW_ISWRITER:
3056		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
3057		    mstate, vstate)) {
3058			regs[rd] = NULL;
3059			break;
3060		}
3061
3062		r.rw = dtrace_loadptr(tupregs[0].dttk_value);
3063		regs[rd] = _RW_ISWRITER(&r.ri);
3064		break;
3065
3066	case DIF_SUBR_BCOPY: {
3067		/*
3068		 * We need to be sure that the destination is in the scratch
3069		 * region -- no other region is allowed.
3070		 */
3071		uintptr_t src = tupregs[0].dttk_value;
3072		uintptr_t dest = tupregs[1].dttk_value;
3073		size_t size = tupregs[2].dttk_value;
3074
3075		if (!dtrace_inscratch(dest, size, mstate)) {
3076			*flags |= CPU_DTRACE_BADADDR;
3077			*illval = regs[rd];
3078			break;
3079		}
3080
3081		if (!dtrace_canload(src, size, mstate, vstate)) {
3082			regs[rd] = NULL;
3083			break;
3084		}
3085
3086		dtrace_bcopy((void *)src, (void *)dest, size);
3087		break;
3088	}
3089
3090	case DIF_SUBR_ALLOCA:
3091	case DIF_SUBR_COPYIN: {
3092		uintptr_t dest = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
3093		uint64_t size =
3094		    tupregs[subr == DIF_SUBR_ALLOCA ? 0 : 1].dttk_value;
3095		size_t scratch_size = (dest - mstate->dtms_scratch_ptr) + size;
3096
3097		/*
3098		 * This action doesn't require any credential checks since
3099		 * probes will not activate in user contexts to which the
3100		 * enabling user does not have permissions.
3101		 */
3102
3103		/*
3104		 * Rounding up the user allocation size could have overflowed
3105		 * a large, bogus allocation (like -1ULL) to 0.
3106		 */
3107		if (scratch_size < size ||
3108		    !DTRACE_INSCRATCH(mstate, scratch_size)) {
3109			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3110			regs[rd] = NULL;
3111			break;
3112		}
3113
3114		if (subr == DIF_SUBR_COPYIN) {
3115			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3116			dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
3117			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3118		}
3119
3120		mstate->dtms_scratch_ptr += scratch_size;
3121		regs[rd] = dest;
3122		break;
3123	}
3124
3125	case DIF_SUBR_COPYINTO: {
3126		uint64_t size = tupregs[1].dttk_value;
3127		uintptr_t dest = tupregs[2].dttk_value;
3128
3129		/*
3130		 * This action doesn't require any credential checks since
3131		 * probes will not activate in user contexts to which the
3132		 * enabling user does not have permissions.
3133		 */
3134		if (!dtrace_inscratch(dest, size, mstate)) {
3135			*flags |= CPU_DTRACE_BADADDR;
3136			*illval = regs[rd];
3137			break;
3138		}
3139
3140		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3141		dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
3142		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3143		break;
3144	}
3145
3146	case DIF_SUBR_COPYINSTR: {
3147		uintptr_t dest = mstate->dtms_scratch_ptr;
3148		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3149
3150		if (nargs > 1 && tupregs[1].dttk_value < size)
3151			size = tupregs[1].dttk_value + 1;
3152
3153		/*
3154		 * This action doesn't require any credential checks since
3155		 * probes will not activate in user contexts to which the
3156		 * enabling user does not have permissions.
3157		 */
3158		if (!DTRACE_INSCRATCH(mstate, size)) {
3159			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3160			regs[rd] = NULL;
3161			break;
3162		}
3163
3164		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3165		dtrace_copyinstr(tupregs[0].dttk_value, dest, size, flags);
3166		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3167
3168		((char *)dest)[size - 1] = '\0';
3169		mstate->dtms_scratch_ptr += size;
3170		regs[rd] = dest;
3171		break;
3172	}
3173
3174	case DIF_SUBR_MSGSIZE:
3175	case DIF_SUBR_MSGDSIZE: {
3176		uintptr_t baddr = tupregs[0].dttk_value, daddr;
3177		uintptr_t wptr, rptr;
3178		size_t count = 0;
3179		int cont = 0;
3180
3181		while (baddr != NULL && !(*flags & CPU_DTRACE_FAULT)) {
3182
3183			if (!dtrace_canload(baddr, sizeof (mblk_t), mstate,
3184			    vstate)) {
3185				regs[rd] = NULL;
3186				break;
3187			}
3188
3189			wptr = dtrace_loadptr(baddr +
3190			    offsetof(mblk_t, b_wptr));
3191
3192			rptr = dtrace_loadptr(baddr +
3193			    offsetof(mblk_t, b_rptr));
3194
3195			if (wptr < rptr) {
3196				*flags |= CPU_DTRACE_BADADDR;
3197				*illval = tupregs[0].dttk_value;
3198				break;
3199			}
3200
3201			daddr = dtrace_loadptr(baddr +
3202			    offsetof(mblk_t, b_datap));
3203
3204			baddr = dtrace_loadptr(baddr +
3205			    offsetof(mblk_t, b_cont));
3206
3207			/*
			 * We want to guard against denial-of-service here,
3209			 * so we're only going to search the list for
3210			 * dtrace_msgdsize_max mblks.
3211			 */
3212			if (cont++ > dtrace_msgdsize_max) {
3213				*flags |= CPU_DTRACE_ILLOP;
3214				break;
3215			}
3216
3217			if (subr == DIF_SUBR_MSGDSIZE) {
3218				if (dtrace_load8(daddr +
3219				    offsetof(dblk_t, db_type)) != M_DATA)
3220					continue;
3221			}
3222
3223			count += wptr - rptr;
3224		}
3225
3226		if (!(*flags & CPU_DTRACE_FAULT))
3227			regs[rd] = count;
3228
3229		break;
3230	}
3231
3232	case DIF_SUBR_PROGENYOF: {
3233		pid_t pid = tupregs[0].dttk_value;
3234		proc_t *p;
3235		int rval = 0;
3236
3237		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3238
3239		for (p = curthread->t_procp; p != NULL; p = p->p_parent) {
3240			if (p->p_pidp->pid_id == pid) {
3241				rval = 1;
3242				break;
3243			}
3244		}
3245
3246		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3247
3248		regs[rd] = rval;
3249		break;
3250	}
3251
3252	case DIF_SUBR_SPECULATION:
3253		regs[rd] = dtrace_speculation(state);
3254		break;
3255
3256	case DIF_SUBR_COPYOUT: {
3257		uintptr_t kaddr = tupregs[0].dttk_value;
3258		uintptr_t uaddr = tupregs[1].dttk_value;
3259		uint64_t size = tupregs[2].dttk_value;
3260
3261		if (!dtrace_destructive_disallow &&
3262		    dtrace_priv_proc_control(state) &&
3263		    !dtrace_istoxic(kaddr, size)) {
3264			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3265			dtrace_copyout(kaddr, uaddr, size, flags);
3266			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3267		}
3268		break;
3269	}
3270
3271	case DIF_SUBR_COPYOUTSTR: {
3272		uintptr_t kaddr = tupregs[0].dttk_value;
3273		uintptr_t uaddr = tupregs[1].dttk_value;
3274		uint64_t size = tupregs[2].dttk_value;
3275
3276		if (!dtrace_destructive_disallow &&
3277		    dtrace_priv_proc_control(state) &&
3278		    !dtrace_istoxic(kaddr, size)) {
3279			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3280			dtrace_copyoutstr(kaddr, uaddr, size, flags);
3281			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3282		}
3283		break;
3284	}
3285
3286	case DIF_SUBR_STRLEN: {
3287		size_t sz;
3288		uintptr_t addr = (uintptr_t)tupregs[0].dttk_value;
3289		sz = dtrace_strlen((char *)addr,
3290		    state->dts_options[DTRACEOPT_STRSIZE]);
3291
3292		if (!dtrace_canload(addr, sz + 1, mstate, vstate)) {
3293			regs[rd] = NULL;
3294			break;
3295		}
3296
3297		regs[rd] = sz;
3298
3299		break;
3300	}
3301
3302	case DIF_SUBR_STRCHR:
3303	case DIF_SUBR_STRRCHR: {
3304		/*
3305		 * We're going to iterate over the string looking for the
3306		 * specified character.  We will iterate until we have reached
3307		 * the string length or we have found the character.  If this
3308		 * is DIF_SUBR_STRRCHR, we will look for the last occurrence
3309		 * of the specified character instead of the first.
3310		 */
3311		uintptr_t saddr = tupregs[0].dttk_value;
3312		uintptr_t addr = tupregs[0].dttk_value;
3313		uintptr_t limit = addr + state->dts_options[DTRACEOPT_STRSIZE];
3314		char c, target = (char)tupregs[1].dttk_value;
3315
3316		for (regs[rd] = NULL; addr < limit; addr++) {
3317			if ((c = dtrace_load8(addr)) == target) {
3318				regs[rd] = addr;
3319
3320				if (subr == DIF_SUBR_STRCHR)
3321					break;
3322			}
3323
3324			if (c == '\0')
3325				break;
3326		}
3327
3328		if (!dtrace_canload(saddr, addr - saddr, mstate, vstate)) {
3329			regs[rd] = NULL;
3330			break;
3331		}
3332
3333		break;
3334	}
3335
3336	case DIF_SUBR_STRSTR:
3337	case DIF_SUBR_INDEX:
3338	case DIF_SUBR_RINDEX: {
3339		/*
3340		 * We're going to iterate over the string looking for the
3341		 * specified string.  We will iterate until we have reached
3342		 * the string length or we have found the string.  (Yes, this
3343		 * is done in the most naive way possible -- but considering
3344		 * that the string we're searching for is likely to be
3345		 * relatively short, the complexity of Rabin-Karp or similar
3346		 * hardly seems merited.)
3347		 */
3348		char *addr = (char *)(uintptr_t)tupregs[0].dttk_value;
3349		char *substr = (char *)(uintptr_t)tupregs[1].dttk_value;
3350		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3351		size_t len = dtrace_strlen(addr, size);
3352		size_t sublen = dtrace_strlen(substr, size);
3353		char *limit = addr + len, *orig = addr;
3354		int notfound = subr == DIF_SUBR_STRSTR ? 0 : -1;
3355		int inc = 1;
3356
3357		regs[rd] = notfound;
3358
3359		if (!dtrace_canload((uintptr_t)addr, len + 1, mstate, vstate)) {
3360			regs[rd] = NULL;
3361			break;
3362		}
3363
3364		if (!dtrace_canload((uintptr_t)substr, sublen + 1, mstate,
3365		    vstate)) {
3366			regs[rd] = NULL;
3367			break;
3368		}
3369
3370		/*
3371		 * strstr() and index()/rindex() have similar semantics if
3372		 * both strings are the empty string: strstr() returns a
3373		 * pointer to the (empty) string, and index() and rindex()
3374		 * both return index 0 (regardless of any position argument).
3375		 */
3376		if (sublen == 0 && len == 0) {
3377			if (subr == DIF_SUBR_STRSTR)
3378				regs[rd] = (uintptr_t)addr;
3379			else
3380				regs[rd] = 0;
3381			break;
3382		}
3383
3384		if (subr != DIF_SUBR_STRSTR) {
3385			if (subr == DIF_SUBR_RINDEX) {
3386				limit = orig - 1;
3387				addr += len;
3388				inc = -1;
3389			}
3390
3391			/*
3392			 * Both index() and rindex() take an optional position
3393			 * argument that denotes the starting position.
3394			 */
3395			if (nargs == 3) {
3396				int64_t pos = (int64_t)tupregs[2].dttk_value;
3397
3398				/*
3399				 * If the position argument to index() is
3400				 * negative, Perl implicitly clamps it at
3401				 * zero.  This semantic is a little surprising
3402				 * given the special meaning of negative
3403				 * positions to similar Perl functions like
3404				 * substr(), but it appears to reflect a
3405				 * notion that index() can start from a
3406				 * negative index and increment its way up to
3407				 * the string.  Given this notion, Perl's
3408				 * rindex() is at least self-consistent in
3409				 * that it implicitly clamps positions greater
3410				 * than the string length to be the string
3411				 * length.  Where Perl completely loses
3412				 * coherence, however, is when the specified
3413				 * substring is the empty string ("").  In
3414				 * this case, even if the position is
3415				 * negative, rindex() returns 0 -- and even if
3416				 * the position is greater than the length,
3417				 * index() returns the string length.  These
3418				 * semantics violate the notion that index()
3419				 * should never return a value less than the
3420				 * specified position and that rindex() should
3421				 * never return a value greater than the
3422				 * specified position.  (One assumes that
3423				 * these semantics are artifacts of Perl's
3424				 * implementation and not the results of
3425				 * deliberate design -- it beggars belief that
3426				 * even Larry Wall could desire such oddness.)
3427				 * While in the abstract one would wish for
3428				 * consistent position semantics across
3429				 * substr(), index() and rindex() -- or at the
3430				 * very least self-consistent position
3431				 * semantics for index() and rindex() -- we
3432				 * instead opt to keep with the extant Perl
3433				 * semantics, in all their broken glory.  (Do
3434				 * we have more desire to maintain Perl's
3435				 * semantics than Perl does?  Probably.)
3436				 */
3437				if (subr == DIF_SUBR_RINDEX) {
3438					if (pos < 0) {
3439						if (sublen == 0)
3440							regs[rd] = 0;
3441						break;
3442					}
3443
3444					if (pos > len)
3445						pos = len;
3446				} else {
3447					if (pos < 0)
3448						pos = 0;
3449
3450					if (pos >= len) {
3451						if (sublen == 0)
3452							regs[rd] = len;
3453						break;
3454					}
3455				}
3456
3457				addr = orig + pos;
3458			}
3459		}
3460
3461		for (regs[rd] = notfound; addr != limit; addr += inc) {
3462			if (dtrace_strncmp(addr, substr, sublen) == 0) {
3463				if (subr != DIF_SUBR_STRSTR) {
3464					/*
3465					 * As D index() and rindex() are
3466					 * modeled on Perl (and not on awk),
3467					 * we return a zero-based (and not a
3468					 * one-based) index.  (For you Perl
3469					 * weenies: no, we're not going to add
3470					 * $[ -- and shouldn't you be at a con
3471					 * or something?)
3472					 */
3473					regs[rd] = (uintptr_t)(addr - orig);
3474					break;
3475				}
3476
3477				ASSERT(subr == DIF_SUBR_STRSTR);
3478				regs[rd] = (uintptr_t)addr;
3479				break;
3480			}
3481		}
3482
3483		break;
3484	}
3485
3486	case DIF_SUBR_STRTOK: {
3487		uintptr_t addr = tupregs[0].dttk_value;
3488		uintptr_t tokaddr = tupregs[1].dttk_value;
3489		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3490		uintptr_t limit, toklimit = tokaddr + size;
3491		uint8_t c, tokmap[32];	 /* 256 / 8 */
3492		char *dest = (char *)mstate->dtms_scratch_ptr;
3493		int i;
3494
3495		/*
3496		 * Check both the token buffer and (later) the input buffer,
3497		 * since both could be non-scratch addresses.
3498		 */
3499		if (!dtrace_strcanload(tokaddr, size, mstate, vstate)) {
3500			regs[rd] = NULL;
3501			break;
3502		}
3503
3504		if (!DTRACE_INSCRATCH(mstate, size)) {
3505			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3506			regs[rd] = NULL;
3507			break;
3508		}
3509
3510		if (addr == NULL) {
3511			/*
3512			 * If the address specified is NULL, we use our saved
3513			 * strtok pointer from the mstate.  Note that this
3514			 * means that the saved strtok pointer is _only_
3515			 * valid within multiple enablings of the same probe --
3516			 * it behaves like an implicit clause-local variable.
3517			 */
3518			addr = mstate->dtms_strtok;
3519		} else {
3520			/*
3521			 * If the user-specified address is non-NULL we must
3522			 * access check it.  This is the only time we have
3523			 * a chance to do so, since this address may reside
3524		 * in the string table of this clause -- future calls
3525			 * (when we fetch addr from mstate->dtms_strtok)
3526			 * would fail this access check.
3527			 */
3528			if (!dtrace_strcanload(addr, size, mstate, vstate)) {
3529				regs[rd] = NULL;
3530				break;
3531			}
3532		}
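
		/*
		 * Illustrative D usage (a sketch): the first call passes
		 * the string, and subsequent calls pass NULL to continue
		 * tokenizing it:
		 *
		 *	this->tok = strtok(this->str, "/");
		 *	this->tok = strtok(NULL, "/");
		 */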
3533
3534		/*
3535		 * First, zero the token map, and then process the token
3536		 * string -- setting a bit in the map for every character
3537		 * found in the token string.
3538		 */
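		/*
		 * For example: for c == 'a' (0x61), the bit set is bit
		 * (0x61 & 0x7) == 1 of tokmap[0x61 >> 3], i.e. tokmap[12].
		 */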
3539		for (i = 0; i < sizeof (tokmap); i++)
3540			tokmap[i] = 0;
3541
3542		for (; tokaddr < toklimit; tokaddr++) {
3543			if ((c = dtrace_load8(tokaddr)) == '\0')
3544				break;
3545
3546			ASSERT((c >> 3) < sizeof (tokmap));
3547			tokmap[c >> 3] |= (1 << (c & 0x7));
3548		}
3549
3550		for (limit = addr + size; addr < limit; addr++) {
3551			/*
3552			 * We're looking for a character that is _not_ contained
3553			 * in the token string.
3554			 */
3555			if ((c = dtrace_load8(addr)) == '\0')
3556				break;
3557
3558			if (!(tokmap[c >> 3] & (1 << (c & 0x7))))
3559				break;
3560		}
3561
3562		if (c == '\0') {
3563			/*
3564			 * We reached the end of the string without finding
3565			 * any character that was not in the token string.
3566			 * We return NULL in this case, and we set the saved
3567			 * address to NULL as well.
3568			 */
3569			regs[rd] = NULL;
3570			mstate->dtms_strtok = NULL;
3571			break;
3572		}
3573
3574		/*
3575		 * From here on, we're copying into the destination string.
3576		 */
3577		for (i = 0; addr < limit && i < size - 1; addr++) {
3578			if ((c = dtrace_load8(addr)) == '\0')
3579				break;
3580
3581			if (tokmap[c >> 3] & (1 << (c & 0x7)))
3582				break;
3583
3584			ASSERT(i < size);
3585			dest[i++] = c;
3586		}
3587
3588		ASSERT(i < size);
3589		dest[i] = '\0';
3590		regs[rd] = (uintptr_t)dest;
3591		mstate->dtms_scratch_ptr += size;
3592		mstate->dtms_strtok = addr;
3593		break;
3594	}
3595
3596	case DIF_SUBR_SUBSTR: {
3597		uintptr_t s = tupregs[0].dttk_value;
3598		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3599		char *d = (char *)mstate->dtms_scratch_ptr;
3600		int64_t index = (int64_t)tupregs[1].dttk_value;
3601		int64_t remaining = (int64_t)tupregs[2].dttk_value;
3602		size_t len = dtrace_strlen((char *)s, size);
3603		int64_t i = 0;
3604
3605		if (!dtrace_canload(s, len + 1, mstate, vstate)) {
3606			regs[rd] = NULL;
3607			break;
3608		}
3609
3610		if (nargs <= 2)
3611			remaining = (int64_t)size;
3612
3613		if (!DTRACE_INSCRATCH(mstate, size)) {
3614			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3615			regs[rd] = NULL;
3616			break;
3617		}
3618
3619		if (index < 0) {
3620			index += len;
3621
3622			if (index < 0 && index + remaining > 0) {
3623				remaining += index;
3624				index = 0;
3625			}
3626		}
3627
3628		if (index >= len || index < 0)
3629			index = len;
3630
3631		for (d[0] = '\0'; remaining > 0; remaining--) {
3632			if ((d[i++] = dtrace_load8(s++ + index)) == '\0')
3633				break;
3634
3635			if (i == size) {
3636				d[i - 1] = '\0';
3637				break;
3638			}
3639		}
3640
3641		mstate->dtms_scratch_ptr += size;
3642		regs[rd] = (uintptr_t)d;
3643		break;
3644	}
3645
3646	case DIF_SUBR_GETMAJOR:
3647#ifdef _LP64
3648		regs[rd] = (tupregs[0].dttk_value >> NBITSMINOR64) & MAXMAJ64;
3649#else
3650		regs[rd] = (tupregs[0].dttk_value >> NBITSMINOR) & MAXMAJ;
3651#endif
3652		break;
3653
3654	case DIF_SUBR_GETMINOR:
3655#ifdef _LP64
3656		regs[rd] = tupregs[0].dttk_value & MAXMIN64;
3657#else
3658		regs[rd] = tupregs[0].dttk_value & MAXMIN;
3659#endif
3660		break;
3661
3662	case DIF_SUBR_DDI_PATHNAME: {
3663		/*
3664		 * This one is a galactic mess.  We are going to roughly
3665		 * emulate ddi_pathname(), but it's made more complicated
3666		 * by the fact that we (a) want to include the minor name and
3667		 * (b) must proceed iteratively instead of recursively.
3668		 */
3669		uintptr_t dest = mstate->dtms_scratch_ptr;
3670		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3671		char *start = (char *)dest, *end = start + size - 1;
3672		uintptr_t daddr = tupregs[0].dttk_value;
3673		int64_t minor = (int64_t)tupregs[1].dttk_value;
3674		char *s;
3675		int i, len, depth = 0;
3676
3677		/*
3678		 * Due to all the pointer jumping we do and context we must
3679		 * rely upon, we just mandate that the user must have kernel
3680		 * read privileges to use this routine.
3681		 */
3682		if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) == 0) {
3683			*flags |= CPU_DTRACE_KPRIV;
3684			*illval = daddr;
3685			regs[rd] = NULL;
3686		}
3687
3688		if (!DTRACE_INSCRATCH(mstate, size)) {
3689			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3690			regs[rd] = NULL;
3691			break;
3692		}
3693
3694		*end = '\0';
3695
3696		/*
3697		 * We want to have a name for the minor.  In order to do this,
3698		 * we need to walk the minor list from the devinfo.  We want
3699		 * to be sure that we don't infinitely walk a circular list,
3700		 * so we check for circularity by sending a scout pointer
3701		 * ahead two elements for every element that we iterate over;
3702		 * if the list is circular, these will ultimately point to the
3703		 * same element.  You may recognize this little trick as the
3704		 * answer to a stupid interview question -- one that always
3705		 * seems to be asked by those who had to have it laboriously
3706		 * explained to them, and who can't even concisely describe
3707		 * the conditions under which one would be forced to resort to
3708		 * this technique.  Needless to say, those conditions are
3709		 * found here -- and probably only here.  Is this the only
3710		 * use of this infamous trick in shipping, production code?
3711		 * If it isn't, it probably should be...
3712		 */
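		/*
		 * In generic form (a sketch, not the loop below): with slow
		 * at the head and fast one element ahead, advance slow by
		 * one and fast by two per step; the list is circular iff
		 * the two pointers ever meet.
		 */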
3713		if (minor != -1) {
3714			uintptr_t maddr = dtrace_loadptr(daddr +
3715			    offsetof(struct dev_info, devi_minor));
3716
3717			uintptr_t next = offsetof(struct ddi_minor_data, next);
3718			uintptr_t name = offsetof(struct ddi_minor_data,
3719			    d_minor) + offsetof(struct ddi_minor, name);
3720			uintptr_t dev = offsetof(struct ddi_minor_data,
3721			    d_minor) + offsetof(struct ddi_minor, dev);
3722			uintptr_t scout;
3723
3724			if (maddr != NULL)
3725				scout = dtrace_loadptr(maddr + next);
3726
3727			while (maddr != NULL && !(*flags & CPU_DTRACE_FAULT)) {
3728				uint64_t m;
3729#ifdef _LP64
3730				m = dtrace_load64(maddr + dev) & MAXMIN64;
3731#else
3732				m = dtrace_load32(maddr + dev) & MAXMIN;
3733#endif
3734				if (m != minor) {
3735					maddr = dtrace_loadptr(maddr + next);
3736
3737					if (scout == NULL)
3738						continue;
3739
3740					scout = dtrace_loadptr(scout + next);
3741
3742					if (scout == NULL)
3743						continue;
3744
3745					scout = dtrace_loadptr(scout + next);
3746
3747					if (scout == NULL)
3748						continue;
3749
3750					if (scout == maddr) {
3751						*flags |= CPU_DTRACE_ILLOP;
3752						break;
3753					}
3754
3755					continue;
3756				}
3757
3758				/*
3759				 * We have the minor data.  Now we need to
3760				 * copy the minor's name into the end of the
3761				 * pathname.
3762				 */
3763				s = (char *)dtrace_loadptr(maddr + name);
3764				len = dtrace_strlen(s, size);
3765
3766				if (*flags & CPU_DTRACE_FAULT)
3767					break;
3768
3769				if (len != 0) {
3770					if ((end -= (len + 1)) < start)
3771						break;
3772
3773					*end = ':';
3774				}
3775
3776				for (i = 1; i <= len; i++)
3777					end[i] = dtrace_load8((uintptr_t)s++);
3778				break;
3779			}
3780		}
3781
3782		while (daddr != NULL && !(*flags & CPU_DTRACE_FAULT)) {
3783			ddi_node_state_t devi_state;
3784
3785			devi_state = dtrace_load32(daddr +
3786			    offsetof(struct dev_info, devi_node_state));
3787
3788			if (*flags & CPU_DTRACE_FAULT)
3789				break;
3790
3791			if (devi_state >= DS_INITIALIZED) {
3792				s = (char *)dtrace_loadptr(daddr +
3793				    offsetof(struct dev_info, devi_addr));
3794				len = dtrace_strlen(s, size);
3795
3796				if (*flags & CPU_DTRACE_FAULT)
3797					break;
3798
3799				if (len != 0) {
3800					if ((end -= (len + 1)) < start)
3801						break;
3802
3803					*end = '@';
3804				}
3805
3806				for (i = 1; i <= len; i++)
3807					end[i] = dtrace_load8((uintptr_t)s++);
3808			}
3809
3810			/*
3811			 * Now for the node name...
3812			 */
3813			s = (char *)dtrace_loadptr(daddr +
3814			    offsetof(struct dev_info, devi_node_name));
3815
3816			daddr = dtrace_loadptr(daddr +
3817			    offsetof(struct dev_info, devi_parent));
3818
3819			/*
3820			 * If our parent is NULL (that is, if we're the root
3821			 * node), we're going to use the special path
3822			 * "devices".
3823			 */
3824			if (daddr == NULL)
3825				s = "devices";
3826
3827			len = dtrace_strlen(s, size);
3828			if (*flags & CPU_DTRACE_FAULT)
3829				break;
3830
3831			if ((end -= (len + 1)) < start)
3832				break;
3833
3834			for (i = 1; i <= len; i++)
3835				end[i] = dtrace_load8((uintptr_t)s++);
3836			*end = '/';
3837
3838			if (depth++ > dtrace_devdepth_max) {
3839				*flags |= CPU_DTRACE_ILLOP;
3840				break;
3841			}
3842		}
3843
3844		if (end < start)
3845			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3846
3847		if (daddr == NULL) {
3848			regs[rd] = (uintptr_t)end;
3849			mstate->dtms_scratch_ptr += size;
3850		}
3851
3852		break;
3853	}
3854
3855	case DIF_SUBR_STRJOIN: {
3856		char *d = (char *)mstate->dtms_scratch_ptr;
3857		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3858		uintptr_t s1 = tupregs[0].dttk_value;
3859		uintptr_t s2 = tupregs[1].dttk_value;
3860		int i = 0;
3861
3862		if (!dtrace_strcanload(s1, size, mstate, vstate) ||
3863		    !dtrace_strcanload(s2, size, mstate, vstate)) {
3864			regs[rd] = NULL;
3865			break;
3866		}
3867
3868		if (!DTRACE_INSCRATCH(mstate, size)) {
3869			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3870			regs[rd] = NULL;
3871			break;
3872		}
3873
3874		for (;;) {
3875			if (i >= size) {
3876				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3877				regs[rd] = NULL;
3878				break;
3879			}
3880
3881			if ((d[i++] = dtrace_load8(s1++)) == '\0') {
3882				i--;
3883				break;
3884			}
3885		}
3886
3887		for (;;) {
3888			if (i >= size) {
3889				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3890				regs[rd] = NULL;
3891				break;
3892			}
3893
3894			if ((d[i++] = dtrace_load8(s2++)) == '\0')
3895				break;
3896		}
3897
3898		if (i < size) {
3899			mstate->dtms_scratch_ptr += i;
3900			regs[rd] = (uintptr_t)d;
3901		}
3902
3903		break;
3904	}
3905
3906	case DIF_SUBR_LLTOSTR: {
3907		int64_t i = (int64_t)tupregs[0].dttk_value;
3908		int64_t val = i < 0 ? i * -1 : i;
3909		uint64_t size = 22;	/* enough room for 2^64 in decimal */
3910		char *end = (char *)mstate->dtms_scratch_ptr + size - 1;
3911
3912		if (!DTRACE_INSCRATCH(mstate, size)) {
3913			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3914			regs[rd] = NULL;
3915			break;
3916		}
3917
3918		for (*end-- = '\0'; val; val /= 10)
3919			*end-- = '0' + (val % 10);
3920
3921		if (i == 0)
3922			*end-- = '0';
3923
3924		if (i < 0)
3925			*end-- = '-';
3926
3927		regs[rd] = (uintptr_t)end + 1;
3928		mstate->dtms_scratch_ptr += size;
3929		break;
3930	}
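
	/*
	 * For example: lltostr(-42) fills the scratch buffer from the end
	 * ('\0', then '2', '4', '-') and returns a pointer to the '-'.
	 */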
3931
3932	case DIF_SUBR_HTONS:
3933	case DIF_SUBR_NTOHS:
3934#ifdef _BIG_ENDIAN
3935		regs[rd] = (uint16_t)tupregs[0].dttk_value;
3936#else
3937		regs[rd] = DT_BSWAP_16((uint16_t)tupregs[0].dttk_value);
3938#endif
3939		break;
3940
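	/*
	 * For example: on a little-endian machine, htons(0x1234) yields
	 * 0x3412; big-endian machines pass the value through unchanged.
	 * (ntohs() is the identical operation.)
	 */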
3941
3942	case DIF_SUBR_HTONL:
3943	case DIF_SUBR_NTOHL:
3944#ifdef _BIG_ENDIAN
3945		regs[rd] = (uint32_t)tupregs[0].dttk_value;
3946#else
3947		regs[rd] = DT_BSWAP_32((uint32_t)tupregs[0].dttk_value);
3948#endif
3949		break;
3950
3951
3952	case DIF_SUBR_HTONLL:
3953	case DIF_SUBR_NTOHLL:
3954#ifdef _BIG_ENDIAN
3955		regs[rd] = (uint64_t)tupregs[0].dttk_value;
3956#else
3957		regs[rd] = DT_BSWAP_64((uint64_t)tupregs[0].dttk_value);
3958#endif
3959		break;
3960
3961
3962	case DIF_SUBR_DIRNAME:
3963	case DIF_SUBR_BASENAME: {
3964		char *dest = (char *)mstate->dtms_scratch_ptr;
3965		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3966		uintptr_t src = tupregs[0].dttk_value;
3967		int i, j, len = dtrace_strlen((char *)src, size);
3968		int lastbase = -1, firstbase = -1, lastdir = -1;
3969		int start, end;
3970
3971		if (!dtrace_canload(src, len + 1, mstate, vstate)) {
3972			regs[rd] = NULL;
3973			break;
3974		}
3975
3976		if (!DTRACE_INSCRATCH(mstate, size)) {
3977			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3978			regs[rd] = NULL;
3979			break;
3980		}
3981
3982		/*
3983		 * The basename and dirname of a zero-length string are
3984		 * defined to be ".".
3985		 */
3986		if (len == 0) {
3987			len = 1;
3988			src = (uintptr_t)".";
3989		}
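
		/*
		 * For example: for "/usr/lib/", the basename is "lib" and
		 * the dirname is "/usr"; trailing slashes are ignored.
		 */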
3990
3991		/*
3992		 * Start from the back of the string, moving back toward the
3993		 * front until we see a character that isn't a slash.  That
3994		 * character is the last character in the basename.
3995		 */
3996		for (i = len - 1; i >= 0; i--) {
3997			if (dtrace_load8(src + i) != '/')
3998				break;
3999		}
4000
4001		if (i >= 0)
4002			lastbase = i;
4003
4004		/*
4005		 * Starting from the last character in the basename, move
4006		 * towards the front until we find a slash.  The character
4007		 * that we processed immediately before that is the first
4008		 * character in the basename.
4009		 */
4010		for (; i >= 0; i--) {
4011			if (dtrace_load8(src + i) == '/')
4012				break;
4013		}
4014
4015		if (i >= 0)
4016			firstbase = i + 1;
4017
4018		/*
4019		 * Now keep going until we find a non-slash character.  That
4020		 * character is the last character in the dirname.
4021		 */
4022		for (; i >= 0; i--) {
4023			if (dtrace_load8(src + i) != '/')
4024				break;
4025		}
4026
4027		if (i >= 0)
4028			lastdir = i;
4029
4030		ASSERT(!(lastbase == -1 && firstbase != -1));
4031		ASSERT(!(firstbase == -1 && lastdir != -1));
4032
4033		if (lastbase == -1) {
4034			/*
4035			 * We didn't find a non-slash character.  We know that
4036			 * the length is non-zero, so the whole string must be
4037			 * slashes.  In either the dirname or the basename
4038			 * case, we return '/'.
4039			 */
4040			ASSERT(firstbase == -1);
4041			firstbase = lastbase = lastdir = 0;
4042		}
4043
4044		if (firstbase == -1) {
4045			/*
4046			 * The entire string consists only of a basename
4047			 * component.  If we're looking for dirname, we need
4048			 * to change our string to be just "."; if we're
4049			 * looking for a basename, we'll just set the first
4050			 * character of the basename to be 0.
4051			 */
4052			if (subr == DIF_SUBR_DIRNAME) {
4053				ASSERT(lastdir == -1);
4054				src = (uintptr_t)".";
4055				lastdir = 0;
4056			} else {
4057				firstbase = 0;
4058			}
4059		}
4060
4061		if (subr == DIF_SUBR_DIRNAME) {
4062			if (lastdir == -1) {
4063				/*
4064				 * We know that we have a slash in the name --
4065				 * or lastdir would be set to 0, above.  And
4066				 * because lastdir is -1, we know that this
4067				 * slash must be the first character.  (That
4068				 * is, the full string must be of the form
4069				 * "/basename".)  In this case, the last
4070				 * character of the directory name is 0.
4071				 */
4072				lastdir = 0;
4073			}
4074
4075			start = 0;
4076			end = lastdir;
4077		} else {
4078			ASSERT(subr == DIF_SUBR_BASENAME);
4079			ASSERT(firstbase != -1 && lastbase != -1);
4080			start = firstbase;
4081			end = lastbase;
4082		}
4083
4084		for (i = start, j = 0; i <= end && j < size - 1; i++, j++)
4085			dest[j] = dtrace_load8(src + i);
4086
4087		dest[j] = '\0';
4088		regs[rd] = (uintptr_t)dest;
4089		mstate->dtms_scratch_ptr += size;
4090		break;
4091	}
4092
4093	case DIF_SUBR_CLEANPATH: {
4094		char *dest = (char *)mstate->dtms_scratch_ptr, c;
4095		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4096		uintptr_t src = tupregs[0].dttk_value;
4097		int i = 0, j = 0;
4098
4099		if (!dtrace_strcanload(src, size, mstate, vstate)) {
4100			regs[rd] = NULL;
4101			break;
4102		}
4103
4104		if (!DTRACE_INSCRATCH(mstate, size)) {
4105			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4106			regs[rd] = NULL;
4107			break;
4108		}
4109
4110		/*
4111		 * Move forward, loading each character.
4112		 */
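		/*
		 * For example: cleanpath("/foo/bar/../baz") yields
		 * "/foo/baz"; "//" and "/./" components are simply elided.
		 */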
4113		do {
4114			c = dtrace_load8(src + i++);
4115next:
4116			if (j + 5 >= size)	/* 5 = sizeof ("/..c") */
4117				break;
4118
4119			if (c != '/') {
4120				dest[j++] = c;
4121				continue;
4122			}
4123
4124			c = dtrace_load8(src + i++);
4125
4126			if (c == '/') {
4127				/*
4128				 * We have two slashes -- we can just advance
4129				 * to the next character.
4130				 */
4131				goto next;
4132			}
4133
4134			if (c != '.') {
4135				/*
4136				 * This is not "." and it's not ".." -- we can
4137				 * just store the "/" and this character and
4138				 * drive on.
4139				 */
4140				dest[j++] = '/';
4141				dest[j++] = c;
4142				continue;
4143			}
4144
4145			c = dtrace_load8(src + i++);
4146
4147			if (c == '/') {
4148				/*
4149				 * This is a "/./" component.  We're not going
4150				 * to store anything in the destination buffer;
4151				 * we're just going to go to the next component.
4152				 */
4153				goto next;
4154			}
4155
4156			if (c != '.') {
4157				/*
4158				 * This is not ".." -- we can just store the
4159				 * "/." and this character and continue
4160				 * processing.
4161				 */
4162				dest[j++] = '/';
4163				dest[j++] = '.';
4164				dest[j++] = c;
4165				continue;
4166			}
4167
4168			c = dtrace_load8(src + i++);
4169
4170			if (c != '/' && c != '\0') {
4171				/*
4172				 * This is not ".." -- it's "..[mumble]".
4173				 * We'll store the "/.." and this character
4174				 * and continue processing.
4175				 */
4176				dest[j++] = '/';
4177				dest[j++] = '.';
4178				dest[j++] = '.';
4179				dest[j++] = c;
4180				continue;
4181			}
4182
4183			/*
4184			 * This is "/../" or "/..\0".  We need to back up
4185			 * our destination pointer until we find a "/".
4186			 */
4187			i--;
4188			while (j != 0 && dest[--j] != '/')
4189				continue;
4190
4191			if (c == '\0')
4192				dest[++j] = '/';
4193		} while (c != '\0');
4194
4195		dest[j] = '\0';
4196		regs[rd] = (uintptr_t)dest;
4197		mstate->dtms_scratch_ptr += size;
4198		break;
4199	}
4200
4201	case DIF_SUBR_INET_NTOA:
4202	case DIF_SUBR_INET_NTOA6:
4203	case DIF_SUBR_INET_NTOP: {
4204		size_t size;
4205		int af, argi, i;
4206		char *base, *end;
4207
4208		if (subr == DIF_SUBR_INET_NTOP) {
4209			af = (int)tupregs[0].dttk_value;
4210			argi = 1;
4211		} else {
4212			af = subr == DIF_SUBR_INET_NTOA ? AF_INET: AF_INET6;
4213			argi = 0;
4214		}
4215
4216		if (af == AF_INET) {
4217			ipaddr_t ip4;
4218			uint8_t *ptr8, val;
4219
4220			/*
4221			 * Safely load the IPv4 address.
4222			 */
4223			ip4 = dtrace_load32(tupregs[argi].dttk_value);
4224
4225			/*
4226			 * Check that an IPv4 string will fit in scratch.
4227			 */
4228			size = INET_ADDRSTRLEN;
4229			if (!DTRACE_INSCRATCH(mstate, size)) {
4230				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4231				regs[rd] = NULL;
4232				break;
4233			}
4234			base = (char *)mstate->dtms_scratch_ptr;
4235			end = (char *)mstate->dtms_scratch_ptr + size - 1;
4236
4237			/*
4238			 * Stringify as a dotted decimal quad.
4239			 */
4240			*end-- = '\0';
4241			ptr8 = (uint8_t *)&ip4;
4242			for (i = 3; i >= 0; i--) {
4243				val = ptr8[i];
4244
4245				if (val == 0) {
4246					*end-- = '0';
4247				} else {
4248					for (; val; val /= 10) {
4249						*end-- = '0' + (val % 10);
4250					}
4251				}
4252
4253				if (i > 0)
4254					*end-- = '.';
4255			}
4256			ASSERT(end + 1 >= base);
4257
4258		} else if (af == AF_INET6) {
4259			struct in6_addr ip6;
4260			int firstzero, tryzero, numzero, v6end;
4261			uint16_t val;
4262			const char digits[] = "0123456789abcdef";
4263
4264			/*
4265			 * Stringify using RFC 1884 convention 2: 16-bit
4266			 * hexadecimal values with zero-run compression and
4267			 * lower-case hexadecimal digits --
4268			 * 	e.g., fe80::214:4fff:fe0b:76c8.
4269			 * The IPv4-embedded form is returned for inet_ntop;
4270			 * just the IPv4 string is returned for inet_ntoa6.
4271			 */
4272
4273			/*
4274			 * Safely load the IPv6 address.
4275			 */
4276			dtrace_bcopy(
4277			    (void *)(uintptr_t)tupregs[argi].dttk_value,
4278			    (void *)(uintptr_t)&ip6, sizeof (struct in6_addr));
4279
4280			/*
4281			 * Check that an IPv6 string will fit in scratch.
4282			 */
4283			size = INET6_ADDRSTRLEN;
4284			if (!DTRACE_INSCRATCH(mstate, size)) {
4285				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4286				regs[rd] = NULL;
4287				break;
4288			}
4289			base = (char *)mstate->dtms_scratch_ptr;
4290			end = (char *)mstate->dtms_scratch_ptr + size - 1;
4291			*end-- = '\0';
4292
4293			/*
4294			 * Find the longest run of 16 bit zero values
4295			 * for the single allowed zero compression - "::".
4296			 */
4297			firstzero = -1;
4298			tryzero = -1;
4299			numzero = 1;
4300			for (i = 0; i < sizeof (struct in6_addr); i++) {
4301				if (ip6._S6_un._S6_u8[i] == 0 &&
4302				    tryzero == -1 && i % 2 == 0) {
4303					tryzero = i;
4304					continue;
4305				}
4306
4307				if (tryzero != -1 &&
4308				    (ip6._S6_un._S6_u8[i] != 0 ||
4309				    i == sizeof (struct in6_addr) - 1)) {
4310
4311					if (i - tryzero <= numzero) {
4312						tryzero = -1;
4313						continue;
4314					}
4315
4316					firstzero = tryzero;
4317					numzero = i - i % 2 - tryzero;
4318					tryzero = -1;
4319
4320					if (ip6._S6_un._S6_u8[i] == 0 &&
4321					    i == sizeof (struct in6_addr) - 1)
4322						numzero += 2;
4323				}
4324			}
4325			ASSERT(firstzero + numzero <= sizeof (struct in6_addr));
4326
4327			/*
4328			 * Check for an IPv4 embedded address.
4329			 */
4330			v6end = sizeof (struct in6_addr) - 2;
4331			if (IN6_IS_ADDR_V4MAPPED(&ip6) ||
4332			    IN6_IS_ADDR_V4COMPAT(&ip6)) {
4333				for (i = sizeof (struct in6_addr) - 1;
4334				    i >= DTRACE_V4MAPPED_OFFSET; i--) {
4335					ASSERT(end >= base);
4336
4337					val = ip6._S6_un._S6_u8[i];
4338
4339					if (val == 0) {
4340						*end-- = '0';
4341					} else {
4342						for (; val; val /= 10) {
4343							*end-- = '0' + val % 10;
4344						}
4345					}
4346
4347					if (i > DTRACE_V4MAPPED_OFFSET)
4348						*end-- = '.';
4349				}
4350
4351				if (subr == DIF_SUBR_INET_NTOA6)
4352					goto inetout;
4353
4354				/*
4355				 * Set v6end to skip the IPv4 address that
4356				 * we have already stringified.
4357				 */
4358				v6end = 10;
4359			}
4360
4361			/*
4362			 * Build the IPv6 string by working through the
4363			 * address in reverse.
4364			 */
4365			for (i = v6end; i >= 0; i -= 2) {
4366				ASSERT(end >= base);
4367
4368				if (i == firstzero + numzero - 2) {
4369					*end-- = ':';
4370					*end-- = ':';
4371					i -= numzero - 2;
4372					continue;
4373				}
4374
4375				if (i < 14 && i != firstzero - 2)
4376					*end-- = ':';
4377
4378				val = (ip6._S6_un._S6_u8[i] << 8) +
4379				    ip6._S6_un._S6_u8[i + 1];
4380
4381				if (val == 0) {
4382					*end-- = '0';
4383				} else {
4384					for (; val; val /= 16) {
4385						*end-- = digits[val % 16];
4386					}
4387				}
4388			}
4389			ASSERT(end + 1 >= base);
4390
4391		} else {
4392			/*
4393			 * The user didn't use AF_INET or AF_INET6.
4394			 */
4395			DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
4396			regs[rd] = NULL;
4397			break;
4398		}
4399
4400inetout:	regs[rd] = (uintptr_t)end + 1;
4401		mstate->dtms_scratch_ptr += size;
4402		break;
4403	}
4404
4405	}
4406}
4407
4408/*
4409 * Emulate the execution of DTrace IR instructions specified by the given
4410 * DIF object.  This function is deliberately void of assertions as all of
4411 * the necessary checks are handled by a call to dtrace_difo_validate().
4412 */
4413static uint64_t
4414dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate,
4415    dtrace_vstate_t *vstate, dtrace_state_t *state)
4416{
4417	const dif_instr_t *text = difo->dtdo_buf;
4418	const uint_t textlen = difo->dtdo_len;
4419	const char *strtab = difo->dtdo_strtab;
4420	const uint64_t *inttab = difo->dtdo_inttab;
4421
4422	uint64_t rval = 0;
4423	dtrace_statvar_t *svar;
4424	dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
4425	dtrace_difv_t *v;
4426	volatile uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
4427	volatile uintptr_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
4428
4429	dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
4430	uint64_t regs[DIF_DIR_NREGS];
4431	uint64_t *tmp;
4432
4433	uint8_t cc_n = 0, cc_z = 0, cc_v = 0, cc_c = 0;
4434	int64_t cc_r;
4435	uint_t pc = 0, id, opc;
4436	uint8_t ttop = 0;
4437	dif_instr_t instr;
4438	uint_t r1, r2, rd;
4439
4440	/*
4441	 * We stash the current DIF object into the machine state: we need it
4442	 * for subsequent access checking.
4443	 */
4444	mstate->dtms_difo = difo;
4445
4446	regs[DIF_REG_R0] = 0; 		/* %r0 is fixed at zero */
4447
4448	while (pc < textlen && !(*flags & CPU_DTRACE_FAULT)) {
4449		opc = pc;
4450
4451		instr = text[pc++];
4452		r1 = DIF_INSTR_R1(instr);
4453		r2 = DIF_INSTR_R2(instr);
4454		rd = DIF_INSTR_RD(instr);
4455
4456		switch (DIF_INSTR_OP(instr)) {
4457		case DIF_OP_OR:
4458			regs[rd] = regs[r1] | regs[r2];
4459			break;
4460		case DIF_OP_XOR:
4461			regs[rd] = regs[r1] ^ regs[r2];
4462			break;
4463		case DIF_OP_AND:
4464			regs[rd] = regs[r1] & regs[r2];
4465			break;
4466		case DIF_OP_SLL:
4467			regs[rd] = regs[r1] << regs[r2];
4468			break;
4469		case DIF_OP_SRL:
4470			regs[rd] = regs[r1] >> regs[r2];
4471			break;
4472		case DIF_OP_SUB:
4473			regs[rd] = regs[r1] - regs[r2];
4474			break;
4475		case DIF_OP_ADD:
4476			regs[rd] = regs[r1] + regs[r2];
4477			break;
4478		case DIF_OP_MUL:
4479			regs[rd] = regs[r1] * regs[r2];
4480			break;
4481		case DIF_OP_SDIV:
4482			if (regs[r2] == 0) {
4483				regs[rd] = 0;
4484				*flags |= CPU_DTRACE_DIVZERO;
4485			} else {
4486				regs[rd] = (int64_t)regs[r1] /
4487				    (int64_t)regs[r2];
4488			}
4489			break;
4490
4491		case DIF_OP_UDIV:
4492			if (regs[r2] == 0) {
4493				regs[rd] = 0;
4494				*flags |= CPU_DTRACE_DIVZERO;
4495			} else {
4496				regs[rd] = regs[r1] / regs[r2];
4497			}
4498			break;
4499
4500		case DIF_OP_SREM:
4501			if (regs[r2] == 0) {
4502				regs[rd] = 0;
4503				*flags |= CPU_DTRACE_DIVZERO;
4504			} else {
4505				regs[rd] = (int64_t)regs[r1] %
4506				    (int64_t)regs[r2];
4507			}
4508			break;
4509
4510		case DIF_OP_UREM:
4511			if (regs[r2] == 0) {
4512				regs[rd] = 0;
4513				*flags |= CPU_DTRACE_DIVZERO;
4514			} else {
4515				regs[rd] = regs[r1] % regs[r2];
4516			}
4517			break;
4518
4519		case DIF_OP_NOT:
4520			regs[rd] = ~regs[r1];
4521			break;
4522		case DIF_OP_MOV:
4523			regs[rd] = regs[r1];
4524			break;
4525		case DIF_OP_CMP:
4526			cc_r = regs[r1] - regs[r2];
4527			cc_n = cc_r < 0;
4528			cc_z = cc_r == 0;
4529			cc_v = 0;
4530			cc_c = regs[r1] < regs[r2];
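			/*
			 * For example: comparing r1 == 1 with r2 == 2 sets
			 * cc_n and cc_c, so a following BL (signed) or BLU
			 * (unsigned) branch would be taken.
			 */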
4531			break;
4532		case DIF_OP_TST:
4533			cc_n = cc_v = cc_c = 0;
4534			cc_z = regs[r1] == 0;
4535			break;
4536		case DIF_OP_BA:
4537			pc = DIF_INSTR_LABEL(instr);
4538			break;
4539		case DIF_OP_BE:
4540			if (cc_z)
4541				pc = DIF_INSTR_LABEL(instr);
4542			break;
4543		case DIF_OP_BNE:
4544			if (cc_z == 0)
4545				pc = DIF_INSTR_LABEL(instr);
4546			break;
4547		case DIF_OP_BG:
4548			if ((cc_z | (cc_n ^ cc_v)) == 0)
4549				pc = DIF_INSTR_LABEL(instr);
4550			break;
4551		case DIF_OP_BGU:
4552			if ((cc_c | cc_z) == 0)
4553				pc = DIF_INSTR_LABEL(instr);
4554			break;
4555		case DIF_OP_BGE:
4556			if ((cc_n ^ cc_v) == 0)
4557				pc = DIF_INSTR_LABEL(instr);
4558			break;
4559		case DIF_OP_BGEU:
4560			if (cc_c == 0)
4561				pc = DIF_INSTR_LABEL(instr);
4562			break;
4563		case DIF_OP_BL:
4564			if (cc_n ^ cc_v)
4565				pc = DIF_INSTR_LABEL(instr);
4566			break;
4567		case DIF_OP_BLU:
4568			if (cc_c)
4569				pc = DIF_INSTR_LABEL(instr);
4570			break;
4571		case DIF_OP_BLE:
4572			if (cc_z | (cc_n ^ cc_v))
4573				pc = DIF_INSTR_LABEL(instr);
4574			break;
4575		case DIF_OP_BLEU:
4576			if (cc_c | cc_z)
4577				pc = DIF_INSTR_LABEL(instr);
4578			break;
4579		case DIF_OP_RLDSB:
4580			if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) {
4581				*flags |= CPU_DTRACE_KPRIV;
4582				*illval = regs[r1];
4583				break;
4584			}
4585			/*FALLTHROUGH*/
4586		case DIF_OP_LDSB:
4587			regs[rd] = (int8_t)dtrace_load8(regs[r1]);
4588			break;
4589		case DIF_OP_RLDSH:
4590			if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) {
4591				*flags |= CPU_DTRACE_KPRIV;
4592				*illval = regs[r1];
4593				break;
4594			}
4595			/*FALLTHROUGH*/
4596		case DIF_OP_LDSH:
4597			regs[rd] = (int16_t)dtrace_load16(regs[r1]);
4598			break;
4599		case DIF_OP_RLDSW:
4600			if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) {
4601				*flags |= CPU_DTRACE_KPRIV;
4602				*illval = regs[r1];
4603				break;
4604			}
4605			/*FALLTHROUGH*/
4606		case DIF_OP_LDSW:
4607			regs[rd] = (int32_t)dtrace_load32(regs[r1]);
4608			break;
4609		case DIF_OP_RLDUB:
4610			if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) {
4611				*flags |= CPU_DTRACE_KPRIV;
4612				*illval = regs[r1];
4613				break;
4614			}
4615			/*FALLTHROUGH*/
4616		case DIF_OP_LDUB:
4617			regs[rd] = dtrace_load8(regs[r1]);
4618			break;
4619		case DIF_OP_RLDUH:
4620			if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) {
4621				*flags |= CPU_DTRACE_KPRIV;
4622				*illval = regs[r1];
4623				break;
4624			}
4625			/*FALLTHROUGH*/
4626		case DIF_OP_LDUH:
4627			regs[rd] = dtrace_load16(regs[r1]);
4628			break;
4629		case DIF_OP_RLDUW:
4630			if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) {
4631				*flags |= CPU_DTRACE_KPRIV;
4632				*illval = regs[r1];
4633				break;
4634			}
4635			/*FALLTHROUGH*/
4636		case DIF_OP_LDUW:
4637			regs[rd] = dtrace_load32(regs[r1]);
4638			break;
4639		case DIF_OP_RLDX:
4640			if (!dtrace_canstore(regs[r1], 8, mstate, vstate)) {
4641				*flags |= CPU_DTRACE_KPRIV;
4642				*illval = regs[r1];
4643				break;
4644			}
4645			/*FALLTHROUGH*/
4646		case DIF_OP_LDX:
4647			regs[rd] = dtrace_load64(regs[r1]);
4648			break;
4649		case DIF_OP_ULDSB:
4650			regs[rd] = (int8_t)
4651			    dtrace_fuword8((void *)(uintptr_t)regs[r1]);
4652			break;
4653		case DIF_OP_ULDSH:
4654			regs[rd] = (int16_t)
4655			    dtrace_fuword16((void *)(uintptr_t)regs[r1]);
4656			break;
4657		case DIF_OP_ULDSW:
4658			regs[rd] = (int32_t)
4659			    dtrace_fuword32((void *)(uintptr_t)regs[r1]);
4660			break;
4661		case DIF_OP_ULDUB:
4662			regs[rd] =
4663			    dtrace_fuword8((void *)(uintptr_t)regs[r1]);
4664			break;
4665		case DIF_OP_ULDUH:
4666			regs[rd] =
4667			    dtrace_fuword16((void *)(uintptr_t)regs[r1]);
4668			break;
4669		case DIF_OP_ULDUW:
4670			regs[rd] =
4671			    dtrace_fuword32((void *)(uintptr_t)regs[r1]);
4672			break;
4673		case DIF_OP_ULDX:
4674			regs[rd] =
4675			    dtrace_fuword64((void *)(uintptr_t)regs[r1]);
4676			break;
4677		case DIF_OP_RET:
4678			rval = regs[rd];
4679			pc = textlen;
4680			break;
4681		case DIF_OP_NOP:
4682			break;
4683		case DIF_OP_SETX:
4684			regs[rd] = inttab[DIF_INSTR_INTEGER(instr)];
4685			break;
4686		case DIF_OP_SETS:
4687			regs[rd] = (uint64_t)(uintptr_t)
4688			    (strtab + DIF_INSTR_STRING(instr));
4689			break;
4690		case DIF_OP_SCMP: {
4691			size_t sz = state->dts_options[DTRACEOPT_STRSIZE];
4692			uintptr_t s1 = regs[r1];
4693			uintptr_t s2 = regs[r2];
4694
4695			if (s1 != NULL &&
4696			    !dtrace_strcanload(s1, sz, mstate, vstate))
4697				break;
4698			if (s2 != NULL &&
4699			    !dtrace_strcanload(s2, sz, mstate, vstate))
4700				break;
4701
4702			cc_r = dtrace_strncmp((char *)s1, (char *)s2, sz);
4703
4704			cc_n = cc_r < 0;
4705			cc_z = cc_r == 0;
4706			cc_v = cc_c = 0;
4707			break;
4708		}
4709		case DIF_OP_LDGA:
4710			regs[rd] = dtrace_dif_variable(mstate, state,
4711			    r1, regs[r2]);
4712			break;
4713		case DIF_OP_LDGS:
4714			id = DIF_INSTR_VAR(instr);
4715
4716			if (id >= DIF_VAR_OTHER_UBASE) {
4717				uintptr_t a;
4718
4719				id -= DIF_VAR_OTHER_UBASE;
4720				svar = vstate->dtvs_globals[id];
4721				ASSERT(svar != NULL);
4722				v = &svar->dtsv_var;
4723
4724				if (!(v->dtdv_type.dtdt_flags & DIF_TF_BYREF)) {
4725					regs[rd] = svar->dtsv_data;
4726					break;
4727				}
4728
4729				a = (uintptr_t)svar->dtsv_data;
4730
4731				if (*(uint8_t *)a == UINT8_MAX) {
4732					/*
4733					 * If the 0th byte is set to UINT8_MAX
4734					 * then this is to be treated as a
4735					 * reference to a NULL variable.
4736					 */
4737					regs[rd] = NULL;
4738				} else {
4739					regs[rd] = a + sizeof (uint64_t);
4740				}
4741
4742				break;
4743			}
4744
4745			regs[rd] = dtrace_dif_variable(mstate, state, id, 0);
4746			break;
4747
4748		case DIF_OP_STGS:
4749			id = DIF_INSTR_VAR(instr);
4750
4751			ASSERT(id >= DIF_VAR_OTHER_UBASE);
4752			id -= DIF_VAR_OTHER_UBASE;
4753
4754			svar = vstate->dtvs_globals[id];
4755			ASSERT(svar != NULL);
4756			v = &svar->dtsv_var;
4757
4758			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
4759				uintptr_t a = (uintptr_t)svar->dtsv_data;
4760
4761				ASSERT(a != NULL);
4762				ASSERT(svar->dtsv_size != 0);
4763
4764				if (regs[rd] == NULL) {
4765					*(uint8_t *)a = UINT8_MAX;
4766					break;
4767				} else {
4768					*(uint8_t *)a = 0;
4769					a += sizeof (uint64_t);
4770				}
4771				if (!dtrace_vcanload(
4772				    (void *)(uintptr_t)regs[rd], &v->dtdv_type,
4773				    mstate, vstate))
4774					break;
4775
4776				dtrace_vcopy((void *)(uintptr_t)regs[rd],
4777				    (void *)a, &v->dtdv_type);
4778				break;
4779			}
4780
4781			svar->dtsv_data = regs[rd];
4782			break;
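
		/*
		 * Backing-store layout for a by-ref global (a sketch): the
		 * first uint64_t is a sentinel word -- its 0th byte set to
		 * UINT8_MAX denotes a NULL value -- and the variable's data
		 * immediately follows it.
		 */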
4783
4784		case DIF_OP_LDTA:
4785			/*
4786			 * There are no DTrace built-in thread-local arrays at
4787			 * present.  This opcode is saved for future work.
4788			 */
4789			*flags |= CPU_DTRACE_ILLOP;
4790			regs[rd] = 0;
4791			break;
4792
4793		case DIF_OP_LDLS:
4794			id = DIF_INSTR_VAR(instr);
4795
4796			if (id < DIF_VAR_OTHER_UBASE) {
4797				/*
4798				 * For now, this has no meaning.
4799				 */
4800				regs[rd] = 0;
4801				break;
4802			}
4803
4804			id -= DIF_VAR_OTHER_UBASE;
4805
4806			ASSERT(id < vstate->dtvs_nlocals);
4807			ASSERT(vstate->dtvs_locals != NULL);
4808
4809			svar = vstate->dtvs_locals[id];
4810			ASSERT(svar != NULL);
4811			v = &svar->dtsv_var;
4812
4813			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
4814				uintptr_t a = (uintptr_t)svar->dtsv_data;
4815				size_t sz = v->dtdv_type.dtdt_size;
4816
4817				sz += sizeof (uint64_t);
4818				ASSERT(svar->dtsv_size == NCPU * sz);
4819				a += CPU->cpu_id * sz;
4820
4821				if (*(uint8_t *)a == UINT8_MAX) {
4822					/*
4823					 * If the 0th byte is set to UINT8_MAX
4824					 * then this is to be treated as a
4825					 * reference to a NULL variable.
4826					 */
4827					regs[rd] = NULL;
4828				} else {
4829					regs[rd] = a + sizeof (uint64_t);
4830				}
4831
4832				break;
4833			}
4834
4835			ASSERT(svar->dtsv_size == NCPU * sizeof (uint64_t));
4836			tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
4837			regs[rd] = tmp[CPU->cpu_id];
4838			break;
4839
4840		case DIF_OP_STLS:
4841			id = DIF_INSTR_VAR(instr);
4842
4843			ASSERT(id >= DIF_VAR_OTHER_UBASE);
4844			id -= DIF_VAR_OTHER_UBASE;
4845			ASSERT(id < vstate->dtvs_nlocals);
4846
4847			ASSERT(vstate->dtvs_locals != NULL);
4848			svar = vstate->dtvs_locals[id];
4849			ASSERT(svar != NULL);
4850			v = &svar->dtsv_var;
4851
4852			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
4853				uintptr_t a = (uintptr_t)svar->dtsv_data;
4854				size_t sz = v->dtdv_type.dtdt_size;
4855
4856				sz += sizeof (uint64_t);
4857				ASSERT(svar->dtsv_size == NCPU * sz);
4858				a += CPU->cpu_id * sz;
4859
4860				if (regs[rd] == NULL) {
4861					*(uint8_t *)a = UINT8_MAX;
4862					break;
4863				} else {
4864					*(uint8_t *)a = 0;
4865					a += sizeof (uint64_t);
4866				}
4867
4868				if (!dtrace_vcanload(
4869				    (void *)(uintptr_t)regs[rd], &v->dtdv_type,
4870				    mstate, vstate))
4871					break;
4872
4873				dtrace_vcopy((void *)(uintptr_t)regs[rd],
4874				    (void *)a, &v->dtdv_type);
4875				break;
4876			}
4877
4878			ASSERT(svar->dtsv_size == NCPU * sizeof (uint64_t));
4879			tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
4880			tmp[CPU->cpu_id] = regs[rd];
4881			break;
4882
4883		case DIF_OP_LDTS: {
4884			dtrace_dynvar_t *dvar;
4885			dtrace_key_t *key;
4886
4887			id = DIF_INSTR_VAR(instr);
4888			ASSERT(id >= DIF_VAR_OTHER_UBASE);
4889			id -= DIF_VAR_OTHER_UBASE;
4890			v = &vstate->dtvs_tlocals[id];
4891
4892			key = &tupregs[DIF_DTR_NREGS];
4893			key[0].dttk_value = (uint64_t)id;
4894			key[0].dttk_size = 0;
4895			DTRACE_TLS_THRKEY(key[1].dttk_value);
4896			key[1].dttk_size = 0;
4897
4898			dvar = dtrace_dynvar(dstate, 2, key,
4899			    sizeof (uint64_t), DTRACE_DYNVAR_NOALLOC,
4900			    mstate, vstate);
4901
4902			if (dvar == NULL) {
4903				regs[rd] = 0;
4904				break;
4905			}
4906
4907			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
4908				regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
4909			} else {
4910				regs[rd] = *((uint64_t *)dvar->dtdv_data);
4911			}
4912
4913			break;
4914		}
4915
4916		case DIF_OP_STTS: {
4917			dtrace_dynvar_t *dvar;
4918			dtrace_key_t *key;
4919
4920			id = DIF_INSTR_VAR(instr);
4921			ASSERT(id >= DIF_VAR_OTHER_UBASE);
4922			id -= DIF_VAR_OTHER_UBASE;
4923
4924			key = &tupregs[DIF_DTR_NREGS];
4925			key[0].dttk_value = (uint64_t)id;
4926			key[0].dttk_size = 0;
4927			DTRACE_TLS_THRKEY(key[1].dttk_value);
4928			key[1].dttk_size = 0;
4929			v = &vstate->dtvs_tlocals[id];
4930
4931			dvar = dtrace_dynvar(dstate, 2, key,
4932			    v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
4933			    v->dtdv_type.dtdt_size : sizeof (uint64_t),
4934			    regs[rd] ? DTRACE_DYNVAR_ALLOC :
4935			    DTRACE_DYNVAR_DEALLOC, mstate, vstate);
4936
4937			/*
4938			 * Given that we're storing to thread-local data,
4939			 * we need to flush our predicate cache.
4940			 */
4941			curthread->t_predcache = NULL;
4942
4943			if (dvar == NULL)
4944				break;
4945
4946			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
4947				if (!dtrace_vcanload(
4948				    (void *)(uintptr_t)regs[rd],
4949				    &v->dtdv_type, mstate, vstate))
4950					break;
4951
4952				dtrace_vcopy((void *)(uintptr_t)regs[rd],
4953				    dvar->dtdv_data, &v->dtdv_type);
4954			} else {
4955				*((uint64_t *)dvar->dtdv_data) = regs[rd];
4956			}
4957
4958			break;
4959		}
4960
4961		case DIF_OP_SRA:
4962			regs[rd] = (int64_t)regs[r1] >> regs[r2];
4963			break;
4964
4965		case DIF_OP_CALL:
4966			dtrace_dif_subr(DIF_INSTR_SUBR(instr), rd,
4967			    regs, tupregs, ttop, mstate, state);
4968			break;
4969
4970		case DIF_OP_PUSHTR:
4971			if (ttop == DIF_DTR_NREGS) {
4972				*flags |= CPU_DTRACE_TUPOFLOW;
4973				break;
4974			}
4975
4976			if (r1 == DIF_TYPE_STRING) {
4977				/*
4978				 * If this is a string type and the size is 0,
4979				 * we'll use the system-wide default string
4980				 * size.  Note that we are _not_ looking at
4981				 * the value of the DTRACEOPT_STRSIZE option;
4982				 * had this been set, we would expect to have
4983				 * a non-zero size value in the "pushtr".
4984				 */
4985				tupregs[ttop].dttk_size =
4986				    dtrace_strlen((char *)(uintptr_t)regs[rd],
4987				    regs[r2] ? regs[r2] :
4988				    dtrace_strsize_default) + 1;
4989			} else {
4990				tupregs[ttop].dttk_size = regs[r2];
4991			}
4992
4993			tupregs[ttop++].dttk_value = regs[rd];
4994			break;
4995
4996		case DIF_OP_PUSHTV:
4997			if (ttop == DIF_DTR_NREGS) {
4998				*flags |= CPU_DTRACE_TUPOFLOW;
4999				break;
5000			}
5001
5002			tupregs[ttop].dttk_value = regs[rd];
5003			tupregs[ttop++].dttk_size = 0;
5004			break;
5005
5006		case DIF_OP_POPTS:
5007			if (ttop != 0)
5008				ttop--;
5009			break;
5010
5011		case DIF_OP_FLUSHTS:
5012			ttop = 0;
5013			break;
5014
5015		case DIF_OP_LDGAA:
5016		case DIF_OP_LDTAA: {
5017			dtrace_dynvar_t *dvar;
5018			dtrace_key_t *key = tupregs;
5019			uint_t nkeys = ttop;
5020
5021			id = DIF_INSTR_VAR(instr);
5022			ASSERT(id >= DIF_VAR_OTHER_UBASE);
5023			id -= DIF_VAR_OTHER_UBASE;
5024
5025			key[nkeys].dttk_value = (uint64_t)id;
5026			key[nkeys++].dttk_size = 0;
5027
5028			if (DIF_INSTR_OP(instr) == DIF_OP_LDTAA) {
5029				DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
5030				key[nkeys++].dttk_size = 0;
5031				v = &vstate->dtvs_tlocals[id];
5032			} else {
5033				v = &vstate->dtvs_globals[id]->dtsv_var;
5034			}
5035
5036			dvar = dtrace_dynvar(dstate, nkeys, key,
5037			    v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
5038			    v->dtdv_type.dtdt_size : sizeof (uint64_t),
5039			    DTRACE_DYNVAR_NOALLOC, mstate, vstate);
5040
5041			if (dvar == NULL) {
5042				regs[rd] = 0;
5043				break;
5044			}
5045
5046			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5047				regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
5048			} else {
5049				regs[rd] = *((uint64_t *)dvar->dtdv_data);
5050			}
5051
5052			break;
5053		}
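
		/*
		 * Key-vector sketch for the load above: a D access such as
		 * a[pid, "x"] leaves its two keys in the tuple registers;
		 * the variable id is appended here, and for the
		 * thread-local case a per-thread key is appended as well.
		 */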
5054
5055		case DIF_OP_STGAA:
5056		case DIF_OP_STTAA: {
5057			dtrace_dynvar_t *dvar;
5058			dtrace_key_t *key = tupregs;
5059			uint_t nkeys = ttop;
5060
5061			id = DIF_INSTR_VAR(instr);
5062			ASSERT(id >= DIF_VAR_OTHER_UBASE);
5063			id -= DIF_VAR_OTHER_UBASE;
5064
5065			key[nkeys].dttk_value = (uint64_t)id;
5066			key[nkeys++].dttk_size = 0;
5067
5068			if (DIF_INSTR_OP(instr) == DIF_OP_STTAA) {
5069				DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
5070				key[nkeys++].dttk_size = 0;
5071				v = &vstate->dtvs_tlocals[id];
5072			} else {
5073				v = &vstate->dtvs_globals[id]->dtsv_var;
5074			}
5075
5076			dvar = dtrace_dynvar(dstate, nkeys, key,
5077			    v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
5078			    v->dtdv_type.dtdt_size : sizeof (uint64_t),
5079			    regs[rd] ? DTRACE_DYNVAR_ALLOC :
5080			    DTRACE_DYNVAR_DEALLOC, mstate, vstate);
5081
5082			if (dvar == NULL)
5083				break;
5084
5085			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5086				if (!dtrace_vcanload(
5087				    (void *)(uintptr_t)regs[rd], &v->dtdv_type,
5088				    mstate, vstate))
5089					break;
5090
5091				dtrace_vcopy((void *)(uintptr_t)regs[rd],
5092				    dvar->dtdv_data, &v->dtdv_type);
5093			} else {
5094				*((uint64_t *)dvar->dtdv_data) = regs[rd];
5095			}
5096
5097			break;
5098		}
5099
5100		case DIF_OP_ALLOCS: {
5101			uintptr_t ptr = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
5102			size_t size = ptr - mstate->dtms_scratch_ptr + regs[r1];
5103
5104			/*
5105			 * Rounding up the user allocation size could cause
5106			 * a large, bogus allocation (like -1ULL) to wrap
5107			 * around to a small value.
5108			 */
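			/*
			 * For example: with a scratch pointer ending in ...4
			 * and regs[r1] == -1ULL, size wraps around to 3,
			 * which the "size < regs[r1]" test below rejects.
			 */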
5109			if (size < regs[r1] ||
5110			    !DTRACE_INSCRATCH(mstate, size)) {
5111				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5112				regs[rd] = NULL;
5113				break;
5114			}
5115
5116			dtrace_bzero((void *) mstate->dtms_scratch_ptr, size);
5117			mstate->dtms_scratch_ptr += size;
5118			regs[rd] = ptr;
5119			break;
5120		}
5121
5122		case DIF_OP_COPYS:
5123			if (!dtrace_canstore(regs[rd], regs[r2],
5124			    mstate, vstate)) {
5125				*flags |= CPU_DTRACE_BADADDR;
5126				*illval = regs[rd];
5127				break;
5128			}
5129
5130			if (!dtrace_canload(regs[r1], regs[r2], mstate, vstate))
5131				break;
5132
5133			dtrace_bcopy((void *)(uintptr_t)regs[r1],
5134			    (void *)(uintptr_t)regs[rd], (size_t)regs[r2]);
5135			break;
5136
5137		case DIF_OP_STB:
5138			if (!dtrace_canstore(regs[rd], 1, mstate, vstate)) {
5139				*flags |= CPU_DTRACE_BADADDR;
5140				*illval = regs[rd];
5141				break;
5142			}
5143			*((uint8_t *)(uintptr_t)regs[rd]) = (uint8_t)regs[r1];
5144			break;
5145
5146		case DIF_OP_STH:
5147			if (!dtrace_canstore(regs[rd], 2, mstate, vstate)) {
5148				*flags |= CPU_DTRACE_BADADDR;
5149				*illval = regs[rd];
5150				break;
5151			}
5152			if (regs[rd] & 1) {
5153				*flags |= CPU_DTRACE_BADALIGN;
5154				*illval = regs[rd];
5155				break;
5156			}
5157			*((uint16_t *)(uintptr_t)regs[rd]) = (uint16_t)regs[r1];
5158			break;
5159
5160		case DIF_OP_STW:
5161			if (!dtrace_canstore(regs[rd], 4, mstate, vstate)) {
5162				*flags |= CPU_DTRACE_BADADDR;
5163				*illval = regs[rd];
5164				break;
5165			}
5166			if (regs[rd] & 3) {
5167				*flags |= CPU_DTRACE_BADALIGN;
5168				*illval = regs[rd];
5169				break;
5170			}
5171			*((uint32_t *)(uintptr_t)regs[rd]) = (uint32_t)regs[r1];
5172			break;
5173
5174		case DIF_OP_STX:
5175			if (!dtrace_canstore(regs[rd], 8, mstate, vstate)) {
5176				*flags |= CPU_DTRACE_BADADDR;
5177				*illval = regs[rd];
5178				break;
5179			}
5180			if (regs[rd] & 7) {
5181				*flags |= CPU_DTRACE_BADALIGN;
5182				*illval = regs[rd];
5183				break;
5184			}
5185			*((uint64_t *)(uintptr_t)regs[rd]) = regs[r1];
5186			break;
5187		}
5188	}
5189
5190	if (!(*flags & CPU_DTRACE_FAULT))
5191		return (rval);
5192
5193	mstate->dtms_fltoffs = opc * sizeof (dif_instr_t);
5194	mstate->dtms_present |= DTRACE_MSTATE_FLTOFFS;
5195
5196	return (0);
5197}
5198
5199static void
5200dtrace_action_breakpoint(dtrace_ecb_t *ecb)
5201{
5202	dtrace_probe_t *probe = ecb->dte_probe;
5203	dtrace_provider_t *prov = probe->dtpr_provider;
5204	char c[DTRACE_FULLNAMELEN + 80], *str;
5205	char *msg = "dtrace: breakpoint action at probe ";
5206	char *ecbmsg = " (ecb ";
5207	uintptr_t mask = (0xf << (sizeof (uintptr_t) * NBBY / 4));
5208	uintptr_t val = (uintptr_t)ecb;
5209	int shift = (sizeof (uintptr_t) * NBBY) - 4, i = 0;
5210
5211	if (dtrace_destructive_disallow)
5212		return;
5213
5214	/*
5215	 * It's impossible to be taking action on the NULL probe.
5216	 */
5217	ASSERT(probe != NULL);
5218
5219	/*
5220	 * This is a poor man's (destitute man's?) sprintf():  we want to
5221	 * print the provider name, module name, function name and name of
5222	 * the probe, along with the hex address of the ECB with the breakpoint
5223	 * action -- all of which we must place in the character buffer by
5224	 * hand.
5225	 */
5226	while (*msg != '\0')
5227		c[i++] = *msg++;
5228
5229	for (str = prov->dtpv_name; *str != '\0'; str++)
5230		c[i++] = *str;
5231	c[i++] = ':';
5232
5233	for (str = probe->dtpr_mod; *str != '\0'; str++)
5234		c[i++] = *str;
5235	c[i++] = ':';
5236
5237	for (str = probe->dtpr_func; *str != '\0'; str++)
5238		c[i++] = *str;
5239	c[i++] = ':';
5240
5241	for (str = probe->dtpr_name; *str != '\0'; str++)
5242		c[i++] = *str;
5243
5244	while (*ecbmsg != '\0')
5245		c[i++] = *ecbmsg++;
5246
5247	while (shift >= 0) {
5248		mask = (uintptr_t)0xf << shift;
5249
5250		if (val >= ((uintptr_t)1 << shift))
5251			c[i++] = "0123456789abcdef"[(val & mask) >> shift];
5252		shift -= 4;
5253	}
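
	/*
	 * For example: val == 0x1f2 renders as "1f2" -- the
	 * "val >= (1 << shift)" test skips the leading zero nibbles.
	 */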
5254
5255	c[i++] = ')';
5256	c[i] = '\0';
5257
5258	debug_enter(c);
5259}
5260
5261static void
5262dtrace_action_panic(dtrace_ecb_t *ecb)
5263{
5264	dtrace_probe_t *probe = ecb->dte_probe;
5265
5266	/*
5267	 * It's impossible to be taking action on the NULL probe.
5268	 */
5269	ASSERT(probe != NULL);
5270
5271	if (dtrace_destructive_disallow)
5272		return;
5273
5274	if (dtrace_panicked != NULL)
5275		return;
5276
5277	if (dtrace_casptr(&dtrace_panicked, NULL, curthread) != NULL)
5278		return;
5279
5280	/*
5281	 * We won the right to panic.  (We want to be sure that only one
5282	 * thread calls panic() from dtrace_probe(), and that panic() is
5283	 * called exactly once.)
5284	 */
5285	dtrace_panic("dtrace: panic action at probe %s:%s:%s:%s (ecb %p)",
5286	    probe->dtpr_provider->dtpv_name, probe->dtpr_mod,
5287	    probe->dtpr_func, probe->dtpr_name, (void *)ecb);
5288}
5289
5290static void
5291dtrace_action_raise(uint64_t sig)
5292{
5293	if (dtrace_destructive_disallow)
5294		return;
5295
5296	if (sig >= NSIG) {
5297		DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5298		return;
5299	}
5300
5301	/*
5302	 * raise() has a queue depth of 1 -- we ignore all subsequent
5303	 * invocations of the raise() action.
5304	 */
5305	if (curthread->t_dtrace_sig == 0)
5306		curthread->t_dtrace_sig = (uint8_t)sig;
5307
5308	curthread->t_sig_check = 1;
5309	aston(curthread);
5310}
5311
5312static void
5313dtrace_action_stop(void)
5314{
5315	if (dtrace_destructive_disallow)
5316		return;
5317
5318	if (!curthread->t_dtrace_stop) {
5319		curthread->t_dtrace_stop = 1;
5320		curthread->t_sig_check = 1;
5321		aston(curthread);
5322	}
5323}
5324
5325static void
5326dtrace_action_chill(dtrace_mstate_t *mstate, hrtime_t val)
5327{
5328	hrtime_t now;
5329	volatile uint16_t *flags;
5330	cpu_t *cpu = CPU;
5331
5332	if (dtrace_destructive_disallow)
5333		return;
5334
5335	flags = (volatile uint16_t *)&cpu_core[cpu->cpu_id].cpuc_dtrace_flags;
5336
5337	now = dtrace_gethrtime();
5338
5339	if (now - cpu->cpu_dtrace_chillmark > dtrace_chill_interval) {
5340		/*
5341		 * We need to advance the mark to the current time.
5342		 */
5343		cpu->cpu_dtrace_chillmark = now;
5344		cpu->cpu_dtrace_chilled = 0;
5345	}
5346
5347	/*
5348	 * Now check to see if the requested chill time would take us over
5349	 * the maximum amount of time allowed in the chill interval.  (Or
5350	 * worse, if the calculation itself induces overflow.)
5351	 */
5352	if (cpu->cpu_dtrace_chilled + val > dtrace_chill_max ||
5353	    cpu->cpu_dtrace_chilled + val < cpu->cpu_dtrace_chilled) {
5354		*flags |= CPU_DTRACE_ILLOP;
5355		return;
5356	}
5357
5358	while (dtrace_gethrtime() - now < val)
5359		continue;
5360
5361	/*
5362	 * Normally, we assure that the value of the variable "timestamp" does
5363	 * not change within an ECB.  The presence of chill() represents an
5364	 * exception to this rule, however.
5365	 */
5366	mstate->dtms_present &= ~DTRACE_MSTATE_TIMESTAMP;
5367	cpu->cpu_dtrace_chilled += val;
5368}
5369
5370static void
5371dtrace_action_ustack(dtrace_mstate_t *mstate, dtrace_state_t *state,
5372    uint64_t *buf, uint64_t arg)
5373{
5374	int nframes = DTRACE_USTACK_NFRAMES(arg);
5375	int strsize = DTRACE_USTACK_STRSIZE(arg);
5376	uint64_t *pcs = &buf[1], *fps;
5377	char *str = (char *)&pcs[nframes];
5378	int size, offs = 0, i, j;
5379	uintptr_t old = mstate->dtms_scratch_ptr, saved;
5380	uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
5381	char *sym;
5382
5383	/*
5384	 * A faster path should have been taken if no string space was
5385	 * allocated.
5386	 */
5387	ASSERT(strsize != 0);
5388
5389	/*
5390	 * We will first allocate some temporary space for the frame pointers.
5391	 */
5392	fps = (uint64_t *)P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
5393	size = (uintptr_t)fps - mstate->dtms_scratch_ptr +
5394	    (nframes * sizeof (uint64_t));
5395
5396	if (!DTRACE_INSCRATCH(mstate, size)) {
5397		/*
5398		 * Not enough room for our frame pointers -- need to indicate
5399		 * that we ran out of scratch space.
5400		 */
5401		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5402		return;
5403	}
5404
5405	mstate->dtms_scratch_ptr += size;
5406	saved = mstate->dtms_scratch_ptr;
5407
5408	/*
5409	 * Now get a stack with both program counters and frame pointers.
5410	 */
5411	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
5412	dtrace_getufpstack(buf, fps, nframes + 1);
5413	DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
5414
5415	/*
5416	 * If that faulted, we're cooked.
5417	 */
5418	if (*flags & CPU_DTRACE_FAULT)
5419		goto out;
5420
5421	/*
5422	 * Now we want to walk up the stack, calling the USTACK helper.  For
5423	 * each iteration, we restore the scratch pointer.
5424	 */
5425	for (i = 0; i < nframes; i++) {
5426		mstate->dtms_scratch_ptr = saved;
5427
5428		if (offs >= strsize)
5429			break;
5430
5431		sym = (char *)(uintptr_t)dtrace_helper(
5432		    DTRACE_HELPER_ACTION_USTACK,
5433		    mstate, state, pcs[i], fps[i]);
5434
5435		/*
5436		 * If we faulted while running the helper, we're going to
5437		 * clear the fault and null out the corresponding string.
5438		 */
5439		if (*flags & CPU_DTRACE_FAULT) {
5440			*flags &= ~CPU_DTRACE_FAULT;
5441			str[offs++] = '\0';
5442			continue;
5443		}
5444
5445		if (sym == NULL) {
5446			str[offs++] = '\0';
5447			continue;
5448		}
5449
5450		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
5451
5452		/*
5453		 * Now copy in the string that the helper returned to us.
5454		 */
5455		for (j = 0; offs + j < strsize; j++) {
5456			if ((str[offs + j] = sym[j]) == '\0')
5457				break;
5458		}
5459
5460		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
5461
5462		offs += j + 1;
5463	}
5464
5465	if (offs >= strsize) {
5466		/*
5467		 * If we didn't have room for all of the strings, we don't
5468		 * abort processing -- this needn't be a fatal error -- but we
5469		 * still want to increment a counter (dts_stkstroverflows) to
5470		 * allow this condition to be warned about.  (If this is from
5471		 * a jstack() action, it is easily tuned via jstackstrsize.)
5472		 */
5473		dtrace_error(&state->dts_stkstroverflows);
5474	}
5475
5476	while (offs < strsize)
5477		str[offs++] = '\0';
5478
5479out:
5480	mstate->dtms_scratch_ptr = old;
5481}
5482
5483/*
5484 * If you're looking for the epicenter of DTrace, you just found it.  This
5485 * is the function called by the provider to fire a probe -- from which all
5486 * subsequent probe-context DTrace activity emanates.
5487 */
5488void
5489dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1,
5490    uintptr_t arg2, uintptr_t arg3, uintptr_t arg4)
5491{
5492	processorid_t cpuid;
5493	dtrace_icookie_t cookie;
5494	dtrace_probe_t *probe;
5495	dtrace_mstate_t mstate;
5496	dtrace_ecb_t *ecb;
5497	dtrace_action_t *act;
5498	intptr_t offs;
5499	size_t size;
5500	int vtime, onintr;
5501	volatile uint16_t *flags;
5502	hrtime_t now;
5503
5504	/*
5505	 * Kick out immediately if this CPU is still being born (in which case
5506	 * curthread will be set to -1) or the current thread can't allow
5507	 * probes in its current context.
5508	 */
5509	if (((uintptr_t)curthread & 1) || (curthread->t_flag & T_DONTDTRACE))
5510		return;
5511
5512	cookie = dtrace_interrupt_disable();
5513	probe = dtrace_probes[id - 1];
5514	cpuid = CPU->cpu_id;
5515	onintr = CPU_ON_INTR(CPU);
5516
5517	if (!onintr && probe->dtpr_predcache != DTRACE_CACHEIDNONE &&
5518	    probe->dtpr_predcache == curthread->t_predcache) {
5519		/*
5520		 * We have hit in the predicate cache; we know that
5521		 * this predicate would evaluate to false.
5522		 */
5523		dtrace_interrupt_enable(cookie);
5524		return;
5525	}
5526
5527	if (panic_quiesce) {
5528		/*
5529		 * We don't trace anything if we're panicking.
5530		 */
5531		dtrace_interrupt_enable(cookie);
5532		return;
5533	}
5534
5535	now = dtrace_gethrtime();
5536	vtime = dtrace_vtime_references != 0;
5537
5538	if (vtime && curthread->t_dtrace_start)
5539		curthread->t_dtrace_vtime += now - curthread->t_dtrace_start;
5540
5541	mstate.dtms_difo = NULL;
5542	mstate.dtms_probe = probe;
5543	mstate.dtms_strtok = NULL;
5544	mstate.dtms_arg[0] = arg0;
5545	mstate.dtms_arg[1] = arg1;
5546	mstate.dtms_arg[2] = arg2;
5547	mstate.dtms_arg[3] = arg3;
5548	mstate.dtms_arg[4] = arg4;
5549
5550	flags = (volatile uint16_t *)&cpu_core[cpuid].cpuc_dtrace_flags;
5551
5552	for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
5553		dtrace_predicate_t *pred = ecb->dte_predicate;
5554		dtrace_state_t *state = ecb->dte_state;
5555		dtrace_buffer_t *buf = &state->dts_buffer[cpuid];
5556		dtrace_buffer_t *aggbuf = &state->dts_aggbuffer[cpuid];
5557		dtrace_vstate_t *vstate = &state->dts_vstate;
5558		dtrace_provider_t *prov = probe->dtpr_provider;
5559		int committed = 0;
5560		caddr_t tomax;
5561
5562		/*
5563		 * A little subtlety with the following (seemingly innocuous)
5564		 * declaration of the automatic 'val':  by looking at the
5565		 * code, you might think that it could be declared in the
5566		 * action processing loop, below.  (That is, it's only used in
5567		 * the action processing loop.)  However, it must be declared
5568		 * out of that scope because in the case of DIF expression
5569		 * arguments to aggregating actions, one iteration of the
5570		 * action loop will use the last iteration's value.
5571		 */
5572#ifdef lint
5573		uint64_t val = 0;
5574#else
5575		uint64_t val;
5576#endif
5577
5578		mstate.dtms_present = DTRACE_MSTATE_ARGS | DTRACE_MSTATE_PROBE;
5579		*flags &= ~CPU_DTRACE_ERROR;
5580
5581		if (prov == dtrace_provider) {
5582			/*
5583			 * If dtrace itself is the provider of this probe,
5584			 * we're only going to continue processing the ECB if
5585			 * arg0 (the dtrace_state_t) is equal to the ECB's
5586			 * creating state.  (This prevents disjoint consumers
5587			 * from seeing one another's metaprobes.)
5588			 */
5589			if (arg0 != (uint64_t)(uintptr_t)state)
5590				continue;
5591		}
5592
5593		if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE) {
5594			/*
5595			 * We're not currently active.  If our provider isn't
5596			 * the dtrace pseudo provider, we're not interested.
5597			 */
5598			if (prov != dtrace_provider)
5599				continue;
5600
5601			/*
5602			 * Now we must further check if we are in the BEGIN
5603			 * probe.  If we are, we will only continue processing
5604			 * if we're still in WARMUP -- if one BEGIN enabling
5605			 * has invoked the exit() action, we don't want to
5606			 * evaluate subsequent BEGIN enablings.
5607			 */
5608			if (probe->dtpr_id == dtrace_probeid_begin &&
5609			    state->dts_activity != DTRACE_ACTIVITY_WARMUP) {
5610				ASSERT(state->dts_activity ==
5611				    DTRACE_ACTIVITY_DRAINING);
5612				continue;
5613			}
5614		}
5615
5616		if (ecb->dte_cond) {
5617			/*
5618			 * If the dte_cond bits indicate that this
5619			 * consumer is only allowed to see user-mode firings
5620			 * of this probe, call the provider's dtps_usermode()
5621			 * entry point to check that the probe was fired
5622			 * while in a user context. Skip this ECB if that's
5623			 * not the case.
5624			 */
5625			if ((ecb->dte_cond & DTRACE_COND_USERMODE) &&
5626			    prov->dtpv_pops.dtps_usermode(prov->dtpv_arg,
5627			    probe->dtpr_id, probe->dtpr_arg) == 0)
5628				continue;
5629
5630			/*
5631			 * This is more subtle than it looks. We have to be
5632			 * absolutely certain that CRED() isn't going to
5633			 * change out from under us, so it's only legitimate
5634			 * to examine that structure in constrained
5635			 * situations. Currently, the only time we'll make
5636			 * this check is when a non-super-user has enabled
5637			 * the profile or syscall providers -- providers that
5638			 * allow visibility of all processes. For the
5639			 * profile case, the check above will ensure that
5640			 * we're examining a user context.
5641			 */
5642			if (ecb->dte_cond & DTRACE_COND_OWNER) {
5643				cred_t *cr;
5644				cred_t *s_cr =
5645				    ecb->dte_state->dts_cred.dcr_cred;
5646				proc_t *proc;
5647
5648				ASSERT(s_cr != NULL);
5649
5650				if ((cr = CRED()) == NULL ||
5651				    s_cr->cr_uid != cr->cr_uid ||
5652				    s_cr->cr_uid != cr->cr_ruid ||
5653				    s_cr->cr_uid != cr->cr_suid ||
5654				    s_cr->cr_gid != cr->cr_gid ||
5655				    s_cr->cr_gid != cr->cr_rgid ||
5656				    s_cr->cr_gid != cr->cr_sgid ||
5657				    (proc = ttoproc(curthread)) == NULL ||
5658				    (proc->p_flag & SNOCD))
5659					continue;
5660			}
5661
5662			if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) {
5663				cred_t *cr;
5664				cred_t *s_cr =
5665				    ecb->dte_state->dts_cred.dcr_cred;
5666
5667				ASSERT(s_cr != NULL);
5668
5669				if ((cr = CRED()) == NULL ||
5670				    s_cr->cr_zone->zone_id !=
5671				    cr->cr_zone->zone_id)
5672					continue;
5673			}
5674		}
5675
5676		if (now - state->dts_alive > dtrace_deadman_timeout) {
5677			/*
5678			 * We seem to be dead.  Unless we (a) have kernel
5679			 * destructive permissions, (b) have explicitly enabled
5680			 * destructive actions, and (c) destructive actions have
5681			 * not been disabled, we're going to transition into
5682			 * the KILLED state, from which no further processing
5683			 * on this state will be performed.
5684			 */
5685			if (!dtrace_priv_kernel_destructive(state) ||
5686			    !state->dts_cred.dcr_destructive ||
5687			    dtrace_destructive_disallow) {
5688				void *activity = &state->dts_activity;
5689				dtrace_activity_t current;
5690
5691				do {
5692					current = state->dts_activity;
5693				} while (dtrace_cas32(activity, current,
5694				    DTRACE_ACTIVITY_KILLED) != current);
5695
5696				continue;
5697			}
5698		}
5699
5700		if ((offs = dtrace_buffer_reserve(buf, ecb->dte_needed,
5701		    ecb->dte_alignment, state, &mstate)) < 0)
5702			continue;
5703
5704		tomax = buf->dtb_tomax;
5705		ASSERT(tomax != NULL);
5706
5707		if (ecb->dte_size != 0)
5708			DTRACE_STORE(uint32_t, tomax, offs, ecb->dte_epid);
5709
5710		mstate.dtms_epid = ecb->dte_epid;
5711		mstate.dtms_present |= DTRACE_MSTATE_EPID;
5712
5713		if (state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)
5714			mstate.dtms_access = DTRACE_ACCESS_KERNEL;
5715		else
5716			mstate.dtms_access = 0;
5717
5718		if (pred != NULL) {
5719			dtrace_difo_t *dp = pred->dtp_difo;
5720			int rval;
5721
5722			rval = dtrace_dif_emulate(dp, &mstate, vstate, state);
5723
5724			if (!(*flags & CPU_DTRACE_ERROR) && !rval) {
5725				dtrace_cacheid_t cid = probe->dtpr_predcache;
5726
5727				if (cid != DTRACE_CACHEIDNONE && !onintr) {
5728					/*
5729					 * Update the predicate cache...
5730					 */
5731					ASSERT(cid == pred->dtp_cacheid);
5732					curthread->t_predcache = cid;
5733				}
5734
5735				continue;
5736			}
5737		}
5738
5739		for (act = ecb->dte_action; !(*flags & CPU_DTRACE_ERROR) &&
5740		    act != NULL; act = act->dta_next) {
5741			size_t valoffs;
5742			dtrace_difo_t *dp;
5743			dtrace_recdesc_t *rec = &act->dta_rec;
5744
5745			size = rec->dtrd_size;
5746			valoffs = offs + rec->dtrd_offset;
5747
5748			if (DTRACEACT_ISAGG(act->dta_kind)) {
5749				uint64_t v = 0xbad;
5750				dtrace_aggregation_t *agg;
5751
5752				agg = (dtrace_aggregation_t *)act;
5753
5754				if ((dp = act->dta_difo) != NULL)
5755					v = dtrace_dif_emulate(dp,
5756					    &mstate, vstate, state);
5757
5758				if (*flags & CPU_DTRACE_ERROR)
5759					continue;
5760
5761				/*
5762				 * Note that we always pass the expression
5763				 * value from the previous iteration of the
5764				 * action loop.  This value will only be used
5765				 * if there is an expression argument to the
5766				 * aggregating action, denoted by the
5767				 * dtag_hasarg field.
5768				 */
5769				dtrace_aggregate(agg, buf,
5770				    offs, aggbuf, v, val);
5771				continue;
5772			}
5773
5774			switch (act->dta_kind) {
5775			case DTRACEACT_STOP:
5776				if (dtrace_priv_proc_destructive(state))
5777					dtrace_action_stop();
5778				continue;
5779
5780			case DTRACEACT_BREAKPOINT:
5781				if (dtrace_priv_kernel_destructive(state))
5782					dtrace_action_breakpoint(ecb);
5783				continue;
5784
5785			case DTRACEACT_PANIC:
5786				if (dtrace_priv_kernel_destructive(state))
5787					dtrace_action_panic(ecb);
5788				continue;
5789
5790			case DTRACEACT_STACK:
5791				if (!dtrace_priv_kernel(state))
5792					continue;
5793
5794				dtrace_getpcstack((pc_t *)(tomax + valoffs),
5795				    size / sizeof (pc_t), probe->dtpr_aframes,
5796				    DTRACE_ANCHORED(probe) ? NULL :
5797				    (uint32_t *)arg0);
5798
5799				continue;
5800
5801			case DTRACEACT_JSTACK:
5802			case DTRACEACT_USTACK:
5803				if (!dtrace_priv_proc(state))
5804					continue;
5805
5806				/*
5807				 * See comment in DIF_VAR_PID.
5808				 */
5809				if (DTRACE_ANCHORED(mstate.dtms_probe) &&
5810				    CPU_ON_INTR(CPU)) {
5811					int depth = DTRACE_USTACK_NFRAMES(
5812					    rec->dtrd_arg) + 1;
5813
5814					dtrace_bzero((void *)(tomax + valoffs),
5815					    DTRACE_USTACK_STRSIZE(rec->dtrd_arg)
5816					    + depth * sizeof (uint64_t));
5817
5818					continue;
5819				}
5820
5821				if (DTRACE_USTACK_STRSIZE(rec->dtrd_arg) != 0 &&
5822				    curproc->p_dtrace_helpers != NULL) {
5823					/*
5824					 * This is the slow path -- we have
5825					 * allocated string space, and we're
5826					 * getting the stack of a process that
5827					 * has helpers.  Call into a separate
5828					 * routine to perform this processing.
5829					 */
5830					dtrace_action_ustack(&mstate, state,
5831					    (uint64_t *)(tomax + valoffs),
5832					    rec->dtrd_arg);
5833					continue;
5834				}
5835
5836				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
5837				dtrace_getupcstack((uint64_t *)
5838				    (tomax + valoffs),
5839				    DTRACE_USTACK_NFRAMES(rec->dtrd_arg) + 1);
5840				DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
5841				continue;
5842
5843			default:
5844				break;
5845			}
5846
5847			dp = act->dta_difo;
5848			ASSERT(dp != NULL);
5849
5850			val = dtrace_dif_emulate(dp, &mstate, vstate, state);
5851
5852			if (*flags & CPU_DTRACE_ERROR)
5853				continue;
5854
5855			switch (act->dta_kind) {
5856			case DTRACEACT_SPECULATE:
5857				ASSERT(buf == &state->dts_buffer[cpuid]);
5858				buf = dtrace_speculation_buffer(state,
5859				    cpuid, val);
5860
5861				if (buf == NULL) {
5862					*flags |= CPU_DTRACE_DROP;
5863					continue;
5864				}
5865
5866				offs = dtrace_buffer_reserve(buf,
5867				    ecb->dte_needed, ecb->dte_alignment,
5868				    state, NULL);
5869
5870				if (offs < 0) {
5871					*flags |= CPU_DTRACE_DROP;
5872					continue;
5873				}
5874
5875				tomax = buf->dtb_tomax;
5876				ASSERT(tomax != NULL);
5877
5878				if (ecb->dte_size != 0)
5879					DTRACE_STORE(uint32_t, tomax, offs,
5880					    ecb->dte_epid);
5881				continue;
5882
5883			case DTRACEACT_CHILL:
5884				if (dtrace_priv_kernel_destructive(state))
5885					dtrace_action_chill(&mstate, val);
5886				continue;
5887
5888			case DTRACEACT_RAISE:
5889				if (dtrace_priv_proc_destructive(state))
5890					dtrace_action_raise(val);
5891				continue;
5892
5893			case DTRACEACT_COMMIT:
5894				ASSERT(!committed);
5895
5896				/*
5897				 * We need to commit our buffer state.
5898				 */
5899				if (ecb->dte_size)
5900					buf->dtb_offset = offs + ecb->dte_size;
5901				buf = &state->dts_buffer[cpuid];
5902				dtrace_speculation_commit(state, cpuid, val);
5903				committed = 1;
5904				continue;
5905
5906			case DTRACEACT_DISCARD:
5907				dtrace_speculation_discard(state, cpuid, val);
5908				continue;
5909
5910			case DTRACEACT_DIFEXPR:
5911			case DTRACEACT_LIBACT:
5912			case DTRACEACT_PRINTF:
5913			case DTRACEACT_PRINTA:
5914			case DTRACEACT_SYSTEM:
5915			case DTRACEACT_FREOPEN:
5916				break;
5917
5918			case DTRACEACT_SYM:
5919			case DTRACEACT_MOD:
5920				if (!dtrace_priv_kernel(state))
5921					continue;
5922				break;
5923
5924			case DTRACEACT_USYM:
5925			case DTRACEACT_UMOD:
5926			case DTRACEACT_UADDR: {
5927				struct pid *pid = curthread->t_procp->p_pidp;
5928
5929				if (!dtrace_priv_proc(state))
5930					continue;
5931
5932				DTRACE_STORE(uint64_t, tomax,
5933				    valoffs, (uint64_t)pid->pid_id);
5934				DTRACE_STORE(uint64_t, tomax,
5935				    valoffs + sizeof (uint64_t), val);
5936
5937				continue;
5938			}
5939
5940			case DTRACEACT_EXIT: {
5941				/*
5942				 * For the exit action, we are going to attempt
5943				 * to atomically set our activity to be
5944				 * draining.  If this fails (either because
5945				 * another CPU has beat us to the exit action,
5946				 * or because our current activity is something
5947				 * other than ACTIVE or WARMUP), we will
5948				 * continue.  This assures that the exit action
5949				 * can be successfully recorded at most once
5950				 * when we're in the ACTIVE state.  If we're
5951				 * encountering the exit() action while in
5952				 * COOLDOWN, however, we want to honor the new
5953				 * status code.  (We know that we're the only
5954				 * thread in COOLDOWN, so there is no race.)
5955				 */
5956				void *activity = &state->dts_activity;
5957				dtrace_activity_t current = state->dts_activity;
5958
5959				if (current == DTRACE_ACTIVITY_COOLDOWN)
5960					break;
5961
5962				if (current != DTRACE_ACTIVITY_WARMUP)
5963					current = DTRACE_ACTIVITY_ACTIVE;
5964
5965				if (dtrace_cas32(activity, current,
5966				    DTRACE_ACTIVITY_DRAINING) != current) {
5967					*flags |= CPU_DTRACE_DROP;
5968					continue;
5969				}
5970
5971				break;
5972			}
5973
5974			default:
5975				ASSERT(0);
5976			}
5977
5978			if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF) {
5979				uintptr_t end = valoffs + size;
5980
5981				if (!dtrace_vcanload((void *)(uintptr_t)val,
5982				    &dp->dtdo_rtype, &mstate, vstate))
5983					continue;
5984
5985				/*
5986				 * If this is a string, we're going to only
5987				 * load until we find the zero byte -- after
5988				 * which we'll store zero bytes.
5989				 */
5990				if (dp->dtdo_rtype.dtdt_kind ==
5991				    DIF_TYPE_STRING) {
5992					char c = '\0' + 1; /* non-NUL: primes the first load */
5993					int intuple = act->dta_intuple;
5994					size_t s;
5995
5996					for (s = 0; s < size; s++) {
5997						if (c != '\0')
5998							c = dtrace_load8(val++);
5999
6000						DTRACE_STORE(uint8_t, tomax,
6001						    valoffs++, c);
6002
6003						if (c == '\0' && intuple)
6004							break;
6005					}
6006
6007					continue;
6008				}
6009
6010				while (valoffs < end) {
6011					DTRACE_STORE(uint8_t, tomax, valoffs++,
6012					    dtrace_load8(val++));
6013				}
6014
6015				continue;
6016			}
6017
6018			switch (size) {
6019			case 0:
6020				break;
6021
6022			case sizeof (uint8_t):
6023				DTRACE_STORE(uint8_t, tomax, valoffs, val);
6024				break;
6025			case sizeof (uint16_t):
6026				DTRACE_STORE(uint16_t, tomax, valoffs, val);
6027				break;
6028			case sizeof (uint32_t):
6029				DTRACE_STORE(uint32_t, tomax, valoffs, val);
6030				break;
6031			case sizeof (uint64_t):
6032				DTRACE_STORE(uint64_t, tomax, valoffs, val);
6033				break;
6034			default:
6035				/*
6036				 * Any other size should have been returned by
6037				 * reference, not by value.
6038				 */
6039				ASSERT(0);
6040				break;
6041			}
6042		}
6043
6044		if (*flags & CPU_DTRACE_DROP)
6045			continue;
6046
6047		if (*flags & CPU_DTRACE_FAULT) {
6048			int ndx;
6049			dtrace_action_t *err;
6050
6051			buf->dtb_errors++;
6052
6053			if (probe->dtpr_id == dtrace_probeid_error) {
6054				/*
6055				 * There's nothing we can do -- we had an
6056				 * error on the error probe.  We bump an
6057				 * error counter to at least indicate that
6058				 * this condition happened.
6059				 */
6060				dtrace_error(&state->dts_dblerrors);
6061				continue;
6062			}
6063
6064			if (vtime) {
6065				/*
6066				 * Before recursing on dtrace_probe(), we
6067				 * need to explicitly clear out our start
6068				 * time to prevent it from being accumulated
6069				 * into t_dtrace_vtime.
6070				 */
6071				curthread->t_dtrace_start = 0;
6072			}
6073
6074			/*
6075			 * Iterate over the actions to figure out which action
6076			 * we were processing when we experienced the error.
6077			 * Note that act points _past_ the faulting action; if
6078			 * act is ecb->dte_action, the fault was in the
6079			 * predicate; if it's ecb->dte_action->dta_next, it's
6080			 * in action #1, and so on.
6081			 */
6082			for (err = ecb->dte_action, ndx = 0;
6083			    err != act; err = err->dta_next, ndx++)
6084				continue;
6085
6086			dtrace_probe_error(state, ecb->dte_epid, ndx,
6087			    (mstate.dtms_present & DTRACE_MSTATE_FLTOFFS) ?
6088			    mstate.dtms_fltoffs : -1, DTRACE_FLAGS2FLT(*flags),
6089			    cpu_core[cpuid].cpuc_dtrace_illval);
6090
6091			continue;
6092		}
6093
6094		if (!committed)
6095			buf->dtb_offset = offs + ecb->dte_size;
6096	}
6097
6098	if (vtime)
6099		curthread->t_dtrace_start = dtrace_gethrtime();
6100
6101	dtrace_interrupt_enable(cookie);
6102}
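
/*
 * An illustrative sketch -- not part of the build -- of how a provider
 * typically invokes dtrace_probe() above: the provider maps its own notion
 * of a tracepoint to the dtrace_id_t returned by dtrace_probe_create(), and
 * passes up to five probe arguments.  The names 'my_tracepoint_t', 'tp' and
 * 'mtp_id' here are hypothetical:
 *
 *	static void
 *	my_provider_fire(my_tracepoint_t *tp, uintptr_t a0, uintptr_t a1)
 *	{
 *		dtrace_probe(tp->mtp_id, a0, a1, 0, 0, 0);
 *	}
 */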
6103
6104/*
6105 * DTrace Probe Hashing Functions
6106 *
6107 * The functions in this section (and indeed, the functions in remaining
6108 * sections) are not _called_ from probe context.  (Any exceptions to this are
6109 * marked with a "Note:".)  Rather, they are called from elsewhere in the
6110 * DTrace framework to look up probes in, add probes to, and remove probes from
6111 * the DTrace probe hashes.  (Each probe is hashed by each element of the
6112 * probe tuple -- allowing for fast lookups, regardless of what was
6113 * specified.)
6114 */
6115static uint_t
6116dtrace_hash_str(char *p)
6117{
6118	unsigned int g;
6119	uint_t hval = 0;
6120
6121	while (*p) {
6122		hval = (hval << 4) + *p++;
6123		if ((g = (hval & 0xf0000000)) != 0)
6124			hval ^= g >> 24;
6125		hval &= ~g;
6126	}
6127	return (hval);
6128}
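
/*
 * A worked example of the hash above (a variant of the classic ELF/PJW
 * string hash): for the input "fbt", no shift ever pushes bits into the
 * top nibble, so the fold (the test on 'g') never fires:
 *
 *	hval = 'f'                = 0x66
 *	hval = (0x66 << 4) + 'b'  = 0x6c2
 *	hval = (0x6c2 << 4) + 't' = 0x6c94
 *
 * For longer strings, any bits that reach the top nibble are XORed back
 * into bits 4..7 and then cleared, keeping the hash confined to 28 bits.
 */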
6129
6130static dtrace_hash_t *
6131dtrace_hash_create(uintptr_t stroffs, uintptr_t nextoffs, uintptr_t prevoffs)
6132{
6133	dtrace_hash_t *hash = kmem_zalloc(sizeof (dtrace_hash_t), KM_SLEEP);
6134
6135	hash->dth_stroffs = stroffs;
6136	hash->dth_nextoffs = nextoffs;
6137	hash->dth_prevoffs = prevoffs;
6138
6139	hash->dth_size = 1;
6140	hash->dth_mask = hash->dth_size - 1;
6141
6142	hash->dth_tab = kmem_zalloc(hash->dth_size *
6143	    sizeof (dtrace_hashbucket_t *), KM_SLEEP);
6144
6145	return (hash);
6146}
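
/*
 * The offsets passed to dtrace_hash_create() are byte offsets into
 * dtrace_probe_t, which is what lets one hash implementation key on any of
 * the probe's string members.  A sketch of how the framework builds its
 * by-module hash (the by-function and by-name hashes are analogous):
 *
 *	dtrace_bymod = dtrace_hash_create(
 *	    offsetof(dtrace_probe_t, dtpr_mod),
 *	    offsetof(dtrace_probe_t, dtpr_nextmod),
 *	    offsetof(dtrace_probe_t, dtpr_prevmod));
 */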
6147
6148static void
6149dtrace_hash_destroy(dtrace_hash_t *hash)
6150{
6151#ifdef DEBUG
6152	int i;
6153
6154	for (i = 0; i < hash->dth_size; i++)
6155		ASSERT(hash->dth_tab[i] == NULL);
6156#endif
6157
6158	kmem_free(hash->dth_tab,
6159	    hash->dth_size * sizeof (dtrace_hashbucket_t *));
6160	kmem_free(hash, sizeof (dtrace_hash_t));
6161}
6162
6163static void
6164dtrace_hash_resize(dtrace_hash_t *hash)
6165{
6166	int size = hash->dth_size, i, ndx;
6167	int new_size = hash->dth_size << 1;
6168	int new_mask = new_size - 1;
6169	dtrace_hashbucket_t **new_tab, *bucket, *next;
6170
6171	ASSERT((new_size & new_mask) == 0);
6172
6173	new_tab = kmem_zalloc(new_size * sizeof (void *), KM_SLEEP);
6174
6175	for (i = 0; i < size; i++) {
6176		for (bucket = hash->dth_tab[i]; bucket != NULL; bucket = next) {
6177			dtrace_probe_t *probe = bucket->dthb_chain;
6178
6179			ASSERT(probe != NULL);
6180			ndx = DTRACE_HASHSTR(hash, probe) & new_mask;
6181
6182			next = bucket->dthb_next;
6183			bucket->dthb_next = new_tab[ndx];
6184			new_tab[ndx] = bucket;
6185		}
6186	}
6187
6188	kmem_free(hash->dth_tab, hash->dth_size * sizeof (void *));
6189	hash->dth_tab = new_tab;
6190	hash->dth_size = new_size;
6191	hash->dth_mask = new_mask;
6192}
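
/*
 * Because dth_size is always a power of two, masking with dth_mask
 * (dth_size - 1) is equivalent to taking hashval % dth_size.  With
 * dth_size == 8, for instance, a hashval of 0x6c94 lands in bucket
 * 0x6c94 & 7 == 4.
 */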
6193
6194static void
6195dtrace_hash_add(dtrace_hash_t *hash, dtrace_probe_t *new)
6196{
6197	int hashval = DTRACE_HASHSTR(hash, new);
6198	int ndx = hashval & hash->dth_mask;
6199	dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
6200	dtrace_probe_t **nextp, **prevp;
6201
6202	for (; bucket != NULL; bucket = bucket->dthb_next) {
6203		if (DTRACE_HASHEQ(hash, bucket->dthb_chain, new))
6204			goto add;
6205	}
6206
6207	if ((hash->dth_nbuckets >> 1) > hash->dth_size) {
6208		dtrace_hash_resize(hash);
6209		dtrace_hash_add(hash, new);
6210		return;
6211	}
6212
6213	bucket = kmem_zalloc(sizeof (dtrace_hashbucket_t), KM_SLEEP);
6214	bucket->dthb_next = hash->dth_tab[ndx];
6215	hash->dth_tab[ndx] = bucket;
6216	hash->dth_nbuckets++;
6217
6218add:
6219	nextp = DTRACE_HASHNEXT(hash, new);
6220	ASSERT(*nextp == NULL && *(DTRACE_HASHPREV(hash, new)) == NULL);
6221	*nextp = bucket->dthb_chain;
6222
6223	if (bucket->dthb_chain != NULL) {
6224		prevp = DTRACE_HASHPREV(hash, bucket->dthb_chain);
6225		ASSERT(*prevp == NULL);
6226		*prevp = new;
6227	}
6228
6229	bucket->dthb_chain = new;
6230	bucket->dthb_len++;
6231}
6232
6233static dtrace_probe_t *
6234dtrace_hash_lookup(dtrace_hash_t *hash, dtrace_probe_t *template)
6235{
6236	int hashval = DTRACE_HASHSTR(hash, template);
6237	int ndx = hashval & hash->dth_mask;
6238	dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
6239
6240	for (; bucket != NULL; bucket = bucket->dthb_next) {
6241		if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
6242			return (bucket->dthb_chain);
6243	}
6244
6245	return (NULL);
6246}
6247
6248static int
6249dtrace_hash_collisions(dtrace_hash_t *hash, dtrace_probe_t *template)
6250{
6251	int hashval = DTRACE_HASHSTR(hash, template);
6252	int ndx = hashval & hash->dth_mask;
6253	dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
6254
6255	for (; bucket != NULL; bucket = bucket->dthb_next) {
6256		if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
6257			return (bucket->dthb_len);
6258	}
6259
6260	return (0);
6261}
6262
6263static void
6264dtrace_hash_remove(dtrace_hash_t *hash, dtrace_probe_t *probe)
6265{
6266	int ndx = DTRACE_HASHSTR(hash, probe) & hash->dth_mask;
6267	dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
6268
6269	dtrace_probe_t **prevp = DTRACE_HASHPREV(hash, probe);
6270	dtrace_probe_t **nextp = DTRACE_HASHNEXT(hash, probe);
6271
6272	/*
6273	 * Find the bucket that we're removing this probe from.
6274	 */
6275	for (; bucket != NULL; bucket = bucket->dthb_next) {
6276		if (DTRACE_HASHEQ(hash, bucket->dthb_chain, probe))
6277			break;
6278	}
6279
6280	ASSERT(bucket != NULL);
6281
6282	if (*prevp == NULL) {
6283		if (*nextp == NULL) {
6284			/*
6285			 * The removed probe was the only probe on this
6286			 * bucket; we need to remove the bucket.
6287			 */
6288			dtrace_hashbucket_t *b = hash->dth_tab[ndx];
6289
6290			ASSERT(bucket->dthb_chain == probe);
6291			ASSERT(b != NULL);
6292
6293			if (b == bucket) {
6294				hash->dth_tab[ndx] = bucket->dthb_next;
6295			} else {
6296				while (b->dthb_next != bucket)
6297					b = b->dthb_next;
6298				b->dthb_next = bucket->dthb_next;
6299			}
6300
6301			ASSERT(hash->dth_nbuckets > 0);
6302			hash->dth_nbuckets--;
6303			kmem_free(bucket, sizeof (dtrace_hashbucket_t));
6304			return;
6305		}
6306
6307		bucket->dthb_chain = *nextp;
6308	} else {
6309		*(DTRACE_HASHNEXT(hash, *prevp)) = *nextp;
6310	}
6311
6312	if (*nextp != NULL)
6313		*(DTRACE_HASHPREV(hash, *nextp)) = *prevp;
6314}
6315
6316/*
6317 * DTrace Utility Functions
6318 *
6319 * These are random utility functions that are _not_ called from probe context.
6320 */
6321static int
6322dtrace_badattr(const dtrace_attribute_t *a)
6323{
6324	return (a->dtat_name > DTRACE_STABILITY_MAX ||
6325	    a->dtat_data > DTRACE_STABILITY_MAX ||
6326	    a->dtat_class > DTRACE_CLASS_MAX);
6327}
6328
6329/*
6330 * Return a duplicate copy of a string.  If the specified string is NULL,
6331 * this function returns a zero-length string.
6332 */
6333static char *
6334dtrace_strdup(const char *str)
6335{
6336	char *new = kmem_zalloc((str != NULL ? strlen(str) : 0) + 1, KM_SLEEP);
6337
6338	if (str != NULL)
6339		(void) strcpy(new, str);
6340
6341	return (new);
6342}
6343
6344#define	DTRACE_ISALPHA(c)	\
6345	(((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z'))
6346
6347static int
6348dtrace_badname(const char *s)
6349{
6350	char c;
6351
6352	if (s == NULL || (c = *s++) == '\0')
6353		return (0);
6354
6355	if (!DTRACE_ISALPHA(c) && c != '-' && c != '_' && c != '.')
6356		return (1);
6357
6358	while ((c = *s++) != '\0') {
6359		if (!DTRACE_ISALPHA(c) && (c < '0' || c > '9') &&
6360		    c != '-' && c != '_' && c != '.' && c != '`')
6361			return (1);
6362	}
6363
6364	return (0);
6365}
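
/*
 * Illustrative results of the check above: "fbt", "my-provider" and "p_0.q"
 * pass (return 0), while "0bad" and "bad name" fail (return 1) -- a name
 * must begin with a letter, '-', '_' or '.', and subsequent characters must
 * come from [A-Za-z0-9], '-', '_', '.' or '`'.
 */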
6366
6367static void
6368dtrace_cred2priv(cred_t *cr, uint32_t *privp, uid_t *uidp, zoneid_t *zoneidp)
6369{
6370	uint32_t priv;
6371
6372	if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
6373		/*
6374		 * For DTRACE_PRIV_ALL, the uid and zoneid don't matter.
6375		 */
6376		priv = DTRACE_PRIV_ALL;
6377	} else {
6378		*uidp = crgetuid(cr);
6379		*zoneidp = crgetzoneid(cr);
6380
6381		priv = 0;
6382		if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE))
6383			priv |= DTRACE_PRIV_KERNEL | DTRACE_PRIV_USER;
6384		else if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE))
6385			priv |= DTRACE_PRIV_USER;
6386		if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE))
6387			priv |= DTRACE_PRIV_PROC;
6388		if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
6389			priv |= DTRACE_PRIV_OWNER;
6390		if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
6391			priv |= DTRACE_PRIV_ZONEOWNER;
6392	}
6393
6394	*privp = priv;
6395}
6396
6397#ifdef DTRACE_ERRDEBUG
6398static void
6399dtrace_errdebug(const char *str)
6400{
6401	int hval = dtrace_hash_str((char *)str) % DTRACE_ERRHASHSZ;
6402	int occupied = 0;
6403
6404	mutex_enter(&dtrace_errlock);
6405	dtrace_errlast = str;
6406	dtrace_errthread = curthread;
6407
6408	while (occupied++ < DTRACE_ERRHASHSZ) {
6409		if (dtrace_errhash[hval].dter_msg == str) {
6410			dtrace_errhash[hval].dter_count++;
6411			goto out;
6412		}
6413
6414		if (dtrace_errhash[hval].dter_msg != NULL) {
6415			hval = (hval + 1) % DTRACE_ERRHASHSZ;
6416			continue;
6417		}
6418
6419		dtrace_errhash[hval].dter_msg = str;
6420		dtrace_errhash[hval].dter_count = 1;
6421		goto out;
6422	}
6423
6424	panic("dtrace: undersized error hash");
6425out:
6426	mutex_exit(&dtrace_errlock);
6427}
6428#endif
6429
6430/*
6431 * DTrace Matching Functions
6432 *
6433 * These functions are used to match groups of probes, given some elements of
6434 * a probe tuple, or some globbed expressions for elements of a probe tuple.
6435 */
6436static int
6437dtrace_match_priv(const dtrace_probe_t *prp, uint32_t priv, uid_t uid,
6438    zoneid_t zoneid)
6439{
6440	if (priv != DTRACE_PRIV_ALL) {
6441		uint32_t ppriv = prp->dtpr_provider->dtpv_priv.dtpp_flags;
6442		uint32_t match = priv & ppriv;
6443
6444		/*
6445		 * No PRIV_DTRACE_* privileges...
6446		 */
6447		if ((priv & (DTRACE_PRIV_PROC | DTRACE_PRIV_USER |
6448		    DTRACE_PRIV_KERNEL)) == 0)
6449			return (0);
6450
6451		/*
6452		 * No matching bits, but there were bits to match...
6453		 */
6454		if (match == 0 && ppriv != 0)
6455			return (0);
6456
6457		/*
6458		 * Need to have permissions to the process, but don't...
6459		 */
6460		if (((ppriv & ~match) & DTRACE_PRIV_OWNER) != 0 &&
6461		    uid != prp->dtpr_provider->dtpv_priv.dtpp_uid) {
6462			return (0);
6463		}
6464
6465		/*
6466		 * Need to be in the same zone unless we possess the
6467		 * privilege to examine all zones.
6468		 */
6469		if (((ppriv & ~match) & DTRACE_PRIV_ZONEOWNER) != 0 &&
6470		    zoneid != prp->dtpr_provider->dtpv_priv.dtpp_zoneid) {
6471			return (0);
6472		}
6473	}
6474
6475	return (1);
6476}
6477
6478/*
6479 * dtrace_match_probe compares a dtrace_probe_t to a pre-compiled key, which
6480 * consists of input pattern strings and an ops-vector to evaluate them.
6481 * This function returns >0 for match, 0 for no match, and <0 for error.
6482 */
6483static int
6484dtrace_match_probe(const dtrace_probe_t *prp, const dtrace_probekey_t *pkp,
6485    uint32_t priv, uid_t uid, zoneid_t zoneid)
6486{
6487	dtrace_provider_t *pvp = prp->dtpr_provider;
6488	int rv;
6489
6490	if (pvp->dtpv_defunct)
6491		return (0);
6492
6493	if ((rv = pkp->dtpk_pmatch(pvp->dtpv_name, pkp->dtpk_prov, 0)) <= 0)
6494		return (rv);
6495
6496	if ((rv = pkp->dtpk_mmatch(prp->dtpr_mod, pkp->dtpk_mod, 0)) <= 0)
6497		return (rv);
6498
6499	if ((rv = pkp->dtpk_fmatch(prp->dtpr_func, pkp->dtpk_func, 0)) <= 0)
6500		return (rv);
6501
6502	if ((rv = pkp->dtpk_nmatch(prp->dtpr_name, pkp->dtpk_name, 0)) <= 0)
6503		return (rv);
6504
6505	if (dtrace_match_priv(prp, priv, uid, zoneid) == 0)
6506		return (0);
6507
6508	return (rv);
6509}
6510
6511/*
6512 * dtrace_match_glob() is a safe kernel implementation of the gmatch(3GEN)
6513 * interface for matching a glob pattern 'p' to an input string 's'.  Unlike
6514 * libc's version, the kernel version only applies to 8-bit ASCII strings.
6515 * In addition, all of the recursion cases except for '*' matching have been
6516 * unwound.  For '*', we still implement recursive evaluation, but a depth
6517 * counter is maintained and matching is aborted if we recurse too deep.
6518 * The function returns 0 if no match, >0 if match, and <0 if recursion error.
6519 */
6520static int
6521dtrace_match_glob(const char *s, const char *p, int depth)
6522{
6523	const char *olds;
6524	char s1, c;
6525	int gs;
6526
6527	if (depth > DTRACE_PROBEKEY_MAXDEPTH)
6528		return (-1);
6529
6530	if (s == NULL)
6531		s = ""; /* treat NULL as empty string */
6532
6533top:
6534	olds = s;
6535	s1 = *s++;
6536
6537	if (p == NULL)
6538		return (0);
6539
6540	if ((c = *p++) == '\0')
6541		return (s1 == '\0');
6542
6543	switch (c) {
6544	case '[': {
6545		int ok = 0, notflag = 0;
6546		char lc = '\0';
6547
6548		if (s1 == '\0')
6549			return (0);
6550
6551		if (*p == '!') {
6552			notflag = 1;
6553			p++;
6554		}
6555
6556		if ((c = *p++) == '\0')
6557			return (0);
6558
6559		do {
6560			if (c == '-' && lc != '\0' && *p != ']') {
6561				if ((c = *p++) == '\0')
6562					return (0);
6563				if (c == '\\' && (c = *p++) == '\0')
6564					return (0);
6565
6566				if (notflag) {
6567					if (s1 < lc || s1 > c)
6568						ok++;
6569					else
6570						return (0);
6571				} else if (lc <= s1 && s1 <= c)
6572					ok++;
6573
6574			} else if (c == '\\' && (c = *p++) == '\0')
6575				return (0);
6576
6577			lc = c; /* save left-hand 'c' for next iteration */
6578
6579			if (notflag) {
6580				if (s1 != c)
6581					ok++;
6582				else
6583					return (0);
6584			} else if (s1 == c)
6585				ok++;
6586
6587			if ((c = *p++) == '\0')
6588				return (0);
6589
6590		} while (c != ']');
6591
6592		if (ok)
6593			goto top;
6594
6595		return (0);
6596	}
6597
6598	case '\\':
6599		if ((c = *p++) == '\0')
6600			return (0);
6601		/*FALLTHRU*/
6602
6603	default:
6604		if (c != s1)
6605			return (0);
6606		/*FALLTHRU*/
6607
6608	case '?':
6609		if (s1 != '\0')
6610			goto top;
6611		return (0);
6612
6613	case '*':
6614		while (*p == '*')
6615			p++; /* consecutive *'s are identical to a single one */
6616
6617		if (*p == '\0')
6618			return (1);
6619
6620		for (s = olds; *s != '\0'; s++) {
6621			if ((gs = dtrace_match_glob(s, p, depth + 1)) != 0)
6622				return (gs);
6623		}
6624
6625		return (0);
6626	}
6627}
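
/*
 * A few illustrative evaluations of the matcher above (called with a depth
 * of 0):
 *
 *	dtrace_match_glob("read", "read", 0)	== 1	exact match
 *	dtrace_match_glob("readv", "read*", 0)	== 1	'*' matches "v"
 *	dtrace_match_glob("read", "r??d", 0)	== 1	'?' matches one char
 *	dtrace_match_glob("read", "[a-r]*", 0)	== 1	class matches 'r'
 *	dtrace_match_glob("write", "read*", 0)	== 0	no match
 */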
6628
6629/*ARGSUSED*/
6630static int
6631dtrace_match_string(const char *s, const char *p, int depth)
6632{
6633	return (s != NULL && strcmp(s, p) == 0);
6634}
6635
6636/*ARGSUSED*/
6637static int
6638dtrace_match_nul(const char *s, const char *p, int depth)
6639{
6640	return (1); /* always match the empty pattern */
6641}
6642
6643/*ARGSUSED*/
6644static int
6645dtrace_match_nonzero(const char *s, const char *p, int depth)
6646{
6647	return (s != NULL && s[0] != '\0');
6648}
6649
6650static int
6651dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid,
6652    zoneid_t zoneid, int (*matched)(dtrace_probe_t *, void *), void *arg)
6653{
6654	dtrace_probe_t template, *probe;
6655	dtrace_hash_t *hash = NULL;
6656	int len, best = INT_MAX, nmatched = 0;
6657	dtrace_id_t i;
6658
6659	ASSERT(MUTEX_HELD(&dtrace_lock));
6660
6661	/*
6662	 * If the probe ID is specified in the key, just look up by ID and
6663	 * invoke the match callback once if a matching probe is found.
6664	 */
6665	if (pkp->dtpk_id != DTRACE_IDNONE) {
6666		if ((probe = dtrace_probe_lookup_id(pkp->dtpk_id)) != NULL &&
6667		    dtrace_match_probe(probe, pkp, priv, uid, zoneid) > 0) {
6668			(void) (*matched)(probe, arg);
6669			nmatched++;
6670		}
6671		return (nmatched);
6672	}
6673
6674	template.dtpr_mod = (char *)pkp->dtpk_mod;
6675	template.dtpr_func = (char *)pkp->dtpk_func;
6676	template.dtpr_name = (char *)pkp->dtpk_name;
6677
6678	/*
6679	 * We want to use the most selective of the module name, function
6680	 * name, and probe name.  So for each one that is not a glob pattern or
6681	 * empty string, we perform a lookup in the corresponding hash and
6682	 * use the hash table with the fewest collisions to do our search.
6683	 */
6684	if (pkp->dtpk_mmatch == &dtrace_match_string &&
6685	    (len = dtrace_hash_collisions(dtrace_bymod, &template)) < best) {
6686		best = len;
6687		hash = dtrace_bymod;
6688	}
6689
6690	if (pkp->dtpk_fmatch == &dtrace_match_string &&
6691	    (len = dtrace_hash_collisions(dtrace_byfunc, &template)) < best) {
6692		best = len;
6693		hash = dtrace_byfunc;
6694	}
6695
6696	if (pkp->dtpk_nmatch == &dtrace_match_string &&
6697	    (len = dtrace_hash_collisions(dtrace_byname, &template)) < best) {
6698		best = len;
6699		hash = dtrace_byname;
6700	}
6701
6702	/*
6703	 * If we did not select a hash table, iterate over every probe and
6704	 * invoke our callback for each one that matches our input probe key.
6705	 */
6706	if (hash == NULL) {
6707		for (i = 0; i < dtrace_nprobes; i++) {
6708			if ((probe = dtrace_probes[i]) == NULL ||
6709			    dtrace_match_probe(probe, pkp, priv, uid,
6710			    zoneid) <= 0)
6711				continue;
6712
6713			nmatched++;
6714
6715			if ((*matched)(probe, arg) != DTRACE_MATCH_NEXT)
6716				break;
6717		}
6718
6719		return (nmatched);
6720	}
6721
6722	/*
6723	 * If we selected a hash table, iterate over each probe of the same key
6724	 * name and invoke the callback for every probe that matches the other
6725	 * attributes of our input probe key.
6726	 */
6727	for (probe = dtrace_hash_lookup(hash, &template); probe != NULL;
6728	    probe = *(DTRACE_HASHNEXT(hash, probe))) {
6729
6730		if (dtrace_match_probe(probe, pkp, priv, uid, zoneid) <= 0)
6731			continue;
6732
6733		nmatched++;
6734
6735		if ((*matched)(probe, arg) != DTRACE_MATCH_NEXT)
6736			break;
6737	}
6738
6739	return (nmatched);
6740}
6741
6742/*
6743 * Return the function pointer that dtrace_match_probe() should use to compare
6744 * the specified pattern with a string.  For NULL or empty patterns, we select
6745 * dtrace_match_nul().  For glob pattern strings, we use dtrace_match_glob().
6746 * For non-empty non-glob strings, we use dtrace_match_string().
6747 */
6748static dtrace_probekey_f *
6749dtrace_probekey_func(const char *p)
6750{
6751	char c;
6752
6753	if (p == NULL || *p == '\0')
6754		return (&dtrace_match_nul);
6755
6756	while ((c = *p++) != '\0') {
6757		if (c == '[' || c == '?' || c == '*' || c == '\\')
6758			return (&dtrace_match_glob);
6759	}
6760
6761	return (&dtrace_match_string);
6762}
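
/*
 * For example: dtrace_probekey_func(NULL) and dtrace_probekey_func("") both
 * return dtrace_match_nul; dtrace_probekey_func("ent*") returns
 * dtrace_match_glob (the pattern contains '*'); and
 * dtrace_probekey_func("entry") returns dtrace_match_string.
 */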
6763
6764/*
6765 * Build a probe comparison key for use with dtrace_match_probe() from the
6766 * given probe description.  By convention, a null key only matches anchored
6767 * probes: if each field is the empty string, reset dtpk_fmatch to
6768 * dtrace_match_nonzero().
6769 */
6770static void
6771dtrace_probekey(const dtrace_probedesc_t *pdp, dtrace_probekey_t *pkp)
6772{
6773	pkp->dtpk_prov = pdp->dtpd_provider;
6774	pkp->dtpk_pmatch = dtrace_probekey_func(pdp->dtpd_provider);
6775
6776	pkp->dtpk_mod = pdp->dtpd_mod;
6777	pkp->dtpk_mmatch = dtrace_probekey_func(pdp->dtpd_mod);
6778
6779	pkp->dtpk_func = pdp->dtpd_func;
6780	pkp->dtpk_fmatch = dtrace_probekey_func(pdp->dtpd_func);
6781
6782	pkp->dtpk_name = pdp->dtpd_name;
6783	pkp->dtpk_nmatch = dtrace_probekey_func(pdp->dtpd_name);
6784
6785	pkp->dtpk_id = pdp->dtpd_id;
6786
6787	if (pkp->dtpk_id == DTRACE_IDNONE &&
6788	    pkp->dtpk_pmatch == &dtrace_match_nul &&
6789	    pkp->dtpk_mmatch == &dtrace_match_nul &&
6790	    pkp->dtpk_fmatch == &dtrace_match_nul &&
6791	    pkp->dtpk_nmatch == &dtrace_match_nul)
6792		pkp->dtpk_fmatch = &dtrace_match_nonzero;
6793}
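
/*
 * For example, the completely empty description ":::" yields
 * dtrace_match_nul for all four matchers, so dtpk_fmatch is reset to
 * dtrace_match_nonzero and the key matches exactly those probes with a
 * non-empty function name -- the anchored probes.
 */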
6794
6795/*
6796 * DTrace Provider-to-Framework API Functions
6797 *
6798 * These functions implement much of the Provider-to-Framework API, as
6799 * described in <sys/dtrace.h>.  The parts of the API not in this section are
6800 * the functions in the API for probe management (found below), and
6801 * dtrace_probe() itself (found above).
6802 */
6803
6804/*
6805 * Register the calling provider with the DTrace framework.  This should
6806 * generally be called by DTrace providers in their attach(9E) entry point.
6807 */
6808int
6809dtrace_register(const char *name, const dtrace_pattr_t *pap, uint32_t priv,
6810    cred_t *cr, const dtrace_pops_t *pops, void *arg, dtrace_provider_id_t *idp)
6811{
6812	dtrace_provider_t *provider;
6813
6814	if (name == NULL || pap == NULL || pops == NULL || idp == NULL) {
6815		cmn_err(CE_WARN, "failed to register provider '%s': invalid "
6816		    "arguments", name ? name : "<NULL>");
6817		return (EINVAL);
6818	}
6819
6820	if (name[0] == '\0' || dtrace_badname(name)) {
6821		cmn_err(CE_WARN, "failed to register provider '%s': invalid "
6822		    "provider name", name);
6823		return (EINVAL);
6824	}
6825
6826	if ((pops->dtps_provide == NULL && pops->dtps_provide_module == NULL) ||
6827	    pops->dtps_enable == NULL || pops->dtps_disable == NULL ||
6828	    pops->dtps_destroy == NULL ||
6829	    ((pops->dtps_resume == NULL) != (pops->dtps_suspend == NULL))) {
6830		cmn_err(CE_WARN, "failed to register provider '%s': invalid "
6831		    "provider ops", name);
6832		return (EINVAL);
6833	}
6834
6835	if (dtrace_badattr(&pap->dtpa_provider) ||
6836	    dtrace_badattr(&pap->dtpa_mod) ||
6837	    dtrace_badattr(&pap->dtpa_func) ||
6838	    dtrace_badattr(&pap->dtpa_name) ||
6839	    dtrace_badattr(&pap->dtpa_args)) {
6840		cmn_err(CE_WARN, "failed to register provider '%s': invalid "
6841		    "provider attributes", name);
6842		return (EINVAL);
6843	}
6844
6845	if (priv & ~DTRACE_PRIV_ALL) {
6846		cmn_err(CE_WARN, "failed to register provider '%s': invalid "
6847		    "privilege attributes", name);
6848		return (EINVAL);
6849	}
6850
6851	if ((priv & DTRACE_PRIV_KERNEL) &&
6852	    (priv & (DTRACE_PRIV_USER | DTRACE_PRIV_OWNER)) &&
6853	    pops->dtps_usermode == NULL) {
6854		cmn_err(CE_WARN, "failed to register provider '%s': need "
6855		    "dtps_usermode() op for given privilege attributes", name);
6856		return (EINVAL);
6857	}
6858
6859	provider = kmem_zalloc(sizeof (dtrace_provider_t), KM_SLEEP);
6860	provider->dtpv_name = kmem_alloc(strlen(name) + 1, KM_SLEEP);
6861	(void) strcpy(provider->dtpv_name, name);
6862
6863	provider->dtpv_attr = *pap;
6864	provider->dtpv_priv.dtpp_flags = priv;
6865	if (cr != NULL) {
6866		provider->dtpv_priv.dtpp_uid = crgetuid(cr);
6867		provider->dtpv_priv.dtpp_zoneid = crgetzoneid(cr);
6868	}
6869	provider->dtpv_pops = *pops;
6870
6871	if (pops->dtps_provide == NULL) {
6872		ASSERT(pops->dtps_provide_module != NULL);
6873		provider->dtpv_pops.dtps_provide =
6874		    (void (*)(void *, const dtrace_probedesc_t *))dtrace_nullop;
6875	}
6876
6877	if (pops->dtps_provide_module == NULL) {
6878		ASSERT(pops->dtps_provide != NULL);
6879		provider->dtpv_pops.dtps_provide_module =
6880		    (void (*)(void *, struct modctl *))dtrace_nullop;
6881	}
6882
6883	if (pops->dtps_suspend == NULL) {
6884		ASSERT(pops->dtps_resume == NULL);
6885		provider->dtpv_pops.dtps_suspend =
6886		    (void (*)(void *, dtrace_id_t, void *))dtrace_nullop;
6887		provider->dtpv_pops.dtps_resume =
6888		    (void (*)(void *, dtrace_id_t, void *))dtrace_nullop;
6889	}
6890
6891	provider->dtpv_arg = arg;
6892	*idp = (dtrace_provider_id_t)provider;
6893
6894	if (pops == &dtrace_provider_ops) {
6895		ASSERT(MUTEX_HELD(&dtrace_provider_lock));
6896		ASSERT(MUTEX_HELD(&dtrace_lock));
6897		ASSERT(dtrace_anon.dta_enabling == NULL);
6898
6899		/*
6900		 * We make sure that the DTrace provider is at the head of
6901		 * the provider chain.
6902		 */
6903		provider->dtpv_next = dtrace_provider;
6904		dtrace_provider = provider;
6905		return (0);
6906	}
6907
6908	mutex_enter(&dtrace_provider_lock);
6909	mutex_enter(&dtrace_lock);
6910
6911	/*
6912	 * If there is at least one provider registered, we'll add this
6913	 * provider after the first provider.
6914	 */
6915	if (dtrace_provider != NULL) {
6916		provider->dtpv_next = dtrace_provider->dtpv_next;
6917		dtrace_provider->dtpv_next = provider;
6918	} else {
6919		dtrace_provider = provider;
6920	}
6921
6922	if (dtrace_retained != NULL) {
6923		dtrace_enabling_provide(provider);
6924
6925		/*
6926		 * Now we need to call dtrace_enabling_matchall() -- which
6927		 * will acquire cpu_lock and dtrace_lock.  We therefore need
6928		 * to drop all of our locks before calling into it...
6929		 */
6930		mutex_exit(&dtrace_lock);
6931		mutex_exit(&dtrace_provider_lock);
6932		dtrace_enabling_matchall();
6933
6934		return (0);
6935	}
6936
6937	mutex_exit(&dtrace_lock);
6938	mutex_exit(&dtrace_provider_lock);
6939
6940	return (0);
6941}
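
/*
 * A minimal sketch -- not part of the build -- of a provider registering
 * itself from its attach(9E) routine.  The names 'my_attr', 'my_pops' and
 * 'my_id' are hypothetical; per the checks above, the ops vector must
 * supply at least dtps_provide (or dtps_provide_module), dtps_enable,
 * dtps_disable and dtps_destroy:
 *
 *	static dtrace_provider_id_t my_id;
 *
 *	if (dtrace_register("myprov", &my_attr, DTRACE_PRIV_KERNEL,
 *	    NULL, &my_pops, NULL, &my_id) != 0)
 *		return (DDI_FAILURE);
 *
 * The matching teardown in detach(9E) is dtrace_unregister(my_id), which
 * fails with EBUSY for as long as any of the provider's probes is enabled.
 */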
6942
6943/*
6944 * Unregister the specified provider from the DTrace framework.  This should
6945 * generally be called by DTrace providers in their detach(9E) entry point.
6946 */
6947int
6948dtrace_unregister(dtrace_provider_id_t id)
6949{
6950	dtrace_provider_t *old = (dtrace_provider_t *)id;
6951	dtrace_provider_t *prev = NULL;
6952	int i, self = 0;
6953	dtrace_probe_t *probe, *first = NULL;
6954
6955	if (old->dtpv_pops.dtps_enable ==
6956	    (void (*)(void *, dtrace_id_t, void *))dtrace_nullop) {
6957		/*
6958		 * If DTrace itself is the provider, we're called with locks
6959		 * already held.
6960		 */
6961		ASSERT(old == dtrace_provider);
6962		ASSERT(dtrace_devi != NULL);
6963		ASSERT(MUTEX_HELD(&dtrace_provider_lock));
6964		ASSERT(MUTEX_HELD(&dtrace_lock));
6965		self = 1;
6966
6967		if (dtrace_provider->dtpv_next != NULL) {
6968			/*
6969			 * There's another provider here; return failure.
6970			 */
6971			return (EBUSY);
6972		}
6973	} else {
6974		mutex_enter(&dtrace_provider_lock);
6975		mutex_enter(&mod_lock);
6976		mutex_enter(&dtrace_lock);
6977	}
6978
6979	/*
6980	 * If anyone has /dev/dtrace open, or if there are anonymous enabled
6981	 * probes, we refuse to let providers slither away, unless this
6982	 * provider has already been explicitly invalidated.
6983	 */
6984	if (!old->dtpv_defunct &&
6985	    (dtrace_opens || (dtrace_anon.dta_state != NULL &&
6986	    dtrace_anon.dta_state->dts_necbs > 0))) {
6987		if (!self) {
6988			mutex_exit(&dtrace_lock);
6989			mutex_exit(&mod_lock);
6990			mutex_exit(&dtrace_provider_lock);
6991		}
6992		return (EBUSY);
6993	}
6994
6995	/*
6996	 * Attempt to destroy the probes associated with this provider.
6997	 */
6998	for (i = 0; i < dtrace_nprobes; i++) {
6999		if ((probe = dtrace_probes[i]) == NULL)
7000			continue;
7001
7002		if (probe->dtpr_provider != old)
7003			continue;
7004
7005		if (probe->dtpr_ecb == NULL)
7006			continue;
7007
7008		/*
7009		 * We have at least one ECB; we can't remove this provider.
7010		 */
7011		if (!self) {
7012			mutex_exit(&dtrace_lock);
7013			mutex_exit(&mod_lock);
7014			mutex_exit(&dtrace_provider_lock);
7015		}
7016		return (EBUSY);
7017	}
7018
7019	/*
7020	 * All of the probes for this provider are disabled; we can safely
7021	 * remove all of them from their hash chains and from the probe array.
7022	 */
7023	for (i = 0; i < dtrace_nprobes; i++) {
7024		if ((probe = dtrace_probes[i]) == NULL)
7025			continue;
7026
7027		if (probe->dtpr_provider != old)
7028			continue;
7029
7030		dtrace_probes[i] = NULL;
7031
7032		dtrace_hash_remove(dtrace_bymod, probe);
7033		dtrace_hash_remove(dtrace_byfunc, probe);
7034		dtrace_hash_remove(dtrace_byname, probe);
7035
7036		if (first == NULL) {
7037			first = probe;
7038			probe->dtpr_nextmod = NULL;
7039		} else {
7040			probe->dtpr_nextmod = first;
7041			first = probe;
7042		}
7043	}
7044
7045	/*
7046	 * The provider's probes have been removed from the hash chains and
7047	 * from the probe array.  Now issue a dtrace_sync() to be sure that
7048	 * everyone has cleared out from any probe array processing.
7049	 */
7050	dtrace_sync();
7051
7052	for (probe = first; probe != NULL; probe = first) {
7053		first = probe->dtpr_nextmod;
7054
7055		old->dtpv_pops.dtps_destroy(old->dtpv_arg, probe->dtpr_id,
7056		    probe->dtpr_arg);
7057		kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
7058		kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
7059		kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
7060		vmem_free(dtrace_arena, (void *)(uintptr_t)(probe->dtpr_id), 1);
7061		kmem_free(probe, sizeof (dtrace_probe_t));
7062	}
7063
7064	if ((prev = dtrace_provider) == old) {
7065		ASSERT(self || dtrace_devi == NULL);
7066		ASSERT(old->dtpv_next == NULL || dtrace_devi == NULL);
7067		dtrace_provider = old->dtpv_next;
7068	} else {
7069		while (prev != NULL && prev->dtpv_next != old)
7070			prev = prev->dtpv_next;
7071
7072		if (prev == NULL) {
7073			panic("attempt to unregister non-existent "
7074			    "dtrace provider %p\n", (void *)id);
7075		}
7076
7077		prev->dtpv_next = old->dtpv_next;
7078	}
7079
7080	if (!self) {
7081		mutex_exit(&dtrace_lock);
7082		mutex_exit(&mod_lock);
7083		mutex_exit(&dtrace_provider_lock);
7084	}
7085
7086	kmem_free(old->dtpv_name, strlen(old->dtpv_name) + 1);
7087	kmem_free(old, sizeof (dtrace_provider_t));
7088
7089	return (0);
7090}
7091
7092/*
7093 * Invalidate the specified provider.  All subsequent probe lookups for the
7094 * specified provider will fail, but its probes will not be removed.
7095 */
7096void
7097dtrace_invalidate(dtrace_provider_id_t id)
7098{
7099	dtrace_provider_t *pvp = (dtrace_provider_t *)id;
7100
7101	ASSERT(pvp->dtpv_pops.dtps_enable !=
7102	    (void (*)(void *, dtrace_id_t, void *))dtrace_nullop);
7103
7104	mutex_enter(&dtrace_provider_lock);
7105	mutex_enter(&dtrace_lock);
7106
7107	pvp->dtpv_defunct = 1;
7108
7109	mutex_exit(&dtrace_lock);
7110	mutex_exit(&dtrace_provider_lock);
7111}
7112
7113/*
7114 * Indicate whether or not DTrace has attached.
7115 */
7116int
7117dtrace_attached(void)
7118{
7119	/*
7120	 * dtrace_provider will be non-NULL iff the DTrace driver has
7121	 * attached.  (It's non-NULL because DTrace is always itself a
7122	 * provider.)
7123	 */
7124	return (dtrace_provider != NULL);
7125}
7126
7127/*
7128 * Remove all the unenabled probes for the given provider.  This function is
7129 * not unlike dtrace_unregister(), except that it doesn't remove the provider
7130 * -- just as many of its associated probes as it can.
7131 */
7132int
7133dtrace_condense(dtrace_provider_id_t id)
7134{
7135	dtrace_provider_t *prov = (dtrace_provider_t *)id;
7136	int i;
7137	dtrace_probe_t *probe;
7138
7139	/*
7140	 * Make sure this isn't the dtrace provider itself.
7141	 */
7142	ASSERT(prov->dtpv_pops.dtps_enable !=
7143	    (void (*)(void *, dtrace_id_t, void *))dtrace_nullop);
7144
7145	mutex_enter(&dtrace_provider_lock);
7146	mutex_enter(&dtrace_lock);
7147
7148	/*
7149	 * Attempt to destroy the probes associated with this provider.
7150	 */
7151	for (i = 0; i < dtrace_nprobes; i++) {
7152		if ((probe = dtrace_probes[i]) == NULL)
7153			continue;
7154
7155		if (probe->dtpr_provider != prov)
7156			continue;
7157
7158		if (probe->dtpr_ecb != NULL)
7159			continue;
7160
7161		dtrace_probes[i] = NULL;
7162
7163		dtrace_hash_remove(dtrace_bymod, probe);
7164		dtrace_hash_remove(dtrace_byfunc, probe);
7165		dtrace_hash_remove(dtrace_byname, probe);
7166
7167		prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, i + 1,
7168		    probe->dtpr_arg);
7169		kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
7170		kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
7171		kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
7172		kmem_free(probe, sizeof (dtrace_probe_t));
7173		vmem_free(dtrace_arena, (void *)((uintptr_t)i + 1), 1);
7174	}
7175
7176	mutex_exit(&dtrace_lock);
7177	mutex_exit(&dtrace_provider_lock);
7178
7179	return (0);
7180}
7181
7182/*
7183 * DTrace Probe Management Functions
7184 *
7185 * The functions in this section perform the DTrace probe management,
7186 * including functions to create probes, look up probes, and call into the
7187 * providers to request that probes be provided.  Some of these functions are
7188 * in the Provider-to-Framework API; these functions can be identified by the
7189 * fact that they are not declared "static".
7190 */
7191
7192/*
7193 * Create a probe with the specified module name, function name, and name.
7194 */
7195dtrace_id_t
7196dtrace_probe_create(dtrace_provider_id_t prov, const char *mod,
7197    const char *func, const char *name, int aframes, void *arg)
7198{
7199	dtrace_probe_t *probe, **probes;
7200	dtrace_provider_t *provider = (dtrace_provider_t *)prov;
7201	dtrace_id_t id;
7202
7203	if (provider == dtrace_provider) {
7204		ASSERT(MUTEX_HELD(&dtrace_lock));
7205	} else {
7206		mutex_enter(&dtrace_lock);
7207	}
7208
7209	id = (dtrace_id_t)(uintptr_t)vmem_alloc(dtrace_arena, 1,
7210	    VM_BESTFIT | VM_SLEEP);
7211	probe = kmem_zalloc(sizeof (dtrace_probe_t), KM_SLEEP);
7212
7213	probe->dtpr_id = id;
7214	probe->dtpr_gen = dtrace_probegen++;
7215	probe->dtpr_mod = dtrace_strdup(mod);
7216	probe->dtpr_func = dtrace_strdup(func);
7217	probe->dtpr_name = dtrace_strdup(name);
7218	probe->dtpr_arg = arg;
7219	probe->dtpr_aframes = aframes;
7220	probe->dtpr_provider = provider;
7221
7222	dtrace_hash_add(dtrace_bymod, probe);
7223	dtrace_hash_add(dtrace_byfunc, probe);
7224	dtrace_hash_add(dtrace_byname, probe);
7225
7226	if (id - 1 >= dtrace_nprobes) {
7227		size_t osize = dtrace_nprobes * sizeof (dtrace_probe_t *);
7228		size_t nsize = osize << 1;
7229
7230		if (nsize == 0) {
7231			ASSERT(osize == 0);
7232			ASSERT(dtrace_probes == NULL);
7233			nsize = sizeof (dtrace_probe_t *);
7234		}
7235
7236		probes = kmem_zalloc(nsize, KM_SLEEP);
7237
7238		if (dtrace_probes == NULL) {
7239			ASSERT(osize == 0);
7240			dtrace_probes = probes;
7241			dtrace_nprobes = 1;
7242		} else {
7243			dtrace_probe_t **oprobes = dtrace_probes;
7244
7245			bcopy(oprobes, probes, osize);
7246			dtrace_membar_producer();
7247			dtrace_probes = probes;
7248
7249			dtrace_sync();
7250
7251			/*
7252			 * All CPUs are now seeing the new probes array; we can
7253			 * safely free the old array.
7254			 */
7255			kmem_free(oprobes, osize);
7256			dtrace_nprobes <<= 1;
7257		}
7258
7259		ASSERT(id - 1 < dtrace_nprobes);
7260	}
7261
7262	ASSERT(dtrace_probes[id - 1] == NULL);
7263	dtrace_probes[id - 1] = probe;
7264
7265	if (provider != dtrace_provider)
7266		mutex_exit(&dtrace_lock);
7267
7268	return (id);
7269}
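
/*
 * For example (names hypothetical), a provider registered as 'my_id' might
 * create an entry probe for a traced function, stashing its own per-probe
 * state in the 'arg' parameter:
 *
 *	dtrace_id_t id = dtrace_probe_create(my_id, "mymodule",
 *	    "myfunction", "entry", 0, my_probe_state);
 *
 * The returned id is what the provider later passes to dtrace_probe(), and
 * 'my_probe_state' is handed back as the final argument to the provider's
 * dtps_enable(), dtps_disable() and dtps_destroy() entry points.
 */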
7270
7271static dtrace_probe_t *
7272dtrace_probe_lookup_id(dtrace_id_t id)
7273{
7274	ASSERT(MUTEX_HELD(&dtrace_lock));
7275
7276	if (id == 0 || id > dtrace_nprobes)
7277		return (NULL);
7278
7279	return (dtrace_probes[id - 1]);
7280}
7281
7282static int
7283dtrace_probe_lookup_match(dtrace_probe_t *probe, void *arg)
7284{
7285	*((dtrace_id_t *)arg) = probe->dtpr_id;
7286
7287	return (DTRACE_MATCH_DONE);
7288}
7289
7290/*
7291 * Look up a probe based on provider and one or more of module name, function
7292 * name and probe name.
7293 */
7294dtrace_id_t
7295dtrace_probe_lookup(dtrace_provider_id_t prid, const char *mod,
7296    const char *func, const char *name)
7297{
7298	dtrace_probekey_t pkey;
7299	dtrace_id_t id;
7300	int match;
7301
7302	pkey.dtpk_prov = ((dtrace_provider_t *)prid)->dtpv_name;
7303	pkey.dtpk_pmatch = &dtrace_match_string;
7304	pkey.dtpk_mod = mod;
7305	pkey.dtpk_mmatch = mod ? &dtrace_match_string : &dtrace_match_nul;
7306	pkey.dtpk_func = func;
7307	pkey.dtpk_fmatch = func ? &dtrace_match_string : &dtrace_match_nul;
7308	pkey.dtpk_name = name;
7309	pkey.dtpk_nmatch = name ? &dtrace_match_string : &dtrace_match_nul;
7310	pkey.dtpk_id = DTRACE_IDNONE;
7311
7312	mutex_enter(&dtrace_lock);
7313	match = dtrace_match(&pkey, DTRACE_PRIV_ALL, 0, 0,
7314	    dtrace_probe_lookup_match, &id);
7315	mutex_exit(&dtrace_lock);
7316
7317	ASSERT(match == 1 || match == 0);
7318	return (match ? id : 0);
7319}
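
/*
 * For example (names hypothetical), a provider can check whether it has
 * already created a given probe before creating it anew:
 *
 *	if (dtrace_probe_lookup(my_id, "mymodule", "myfunction",
 *	    "entry") == 0)
 *		(void) dtrace_probe_create(my_id, "mymodule",
 *		    "myfunction", "entry", 0, my_probe_state);
 *
 * A return value of 0 (DTRACE_IDNONE) indicates that no matching probe
 * exists.
 */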
7320
7321/*
7322 * Returns the probe argument associated with the specified probe.
7323 */
7324void *
7325dtrace_probe_arg(dtrace_provider_id_t id, dtrace_id_t pid)
7326{
7327	dtrace_probe_t *probe;
7328	void *rval = NULL;
7329
7330	mutex_enter(&dtrace_lock);
7331
7332	if ((probe = dtrace_probe_lookup_id(pid)) != NULL &&
7333	    probe->dtpr_provider == (dtrace_provider_t *)id)
7334		rval = probe->dtpr_arg;
7335
7336	mutex_exit(&dtrace_lock);
7337
7338	return (rval);
7339}
7340
7341/*
7342 * Copy a probe into a probe description.
7343 */
7344static void
7345dtrace_probe_description(const dtrace_probe_t *prp, dtrace_probedesc_t *pdp)
7346{
7347	bzero(pdp, sizeof (dtrace_probedesc_t));
7348	pdp->dtpd_id = prp->dtpr_id;
7349
7350	(void) strncpy(pdp->dtpd_provider,
7351	    prp->dtpr_provider->dtpv_name, DTRACE_PROVNAMELEN - 1);
7352
7353	(void) strncpy(pdp->dtpd_mod, prp->dtpr_mod, DTRACE_MODNAMELEN - 1);
7354	(void) strncpy(pdp->dtpd_func, prp->dtpr_func, DTRACE_FUNCNAMELEN - 1);
7355	(void) strncpy(pdp->dtpd_name, prp->dtpr_name, DTRACE_NAMELEN - 1);
7356}
7357
7358/*
7359 * Called to indicate that a probe -- or probes -- should be provided by a
7360 * specified provider.  If the specified description is NULL, the provider will
7361 * be told to provide all of its probes.  (This is done whenever a new
7362 * consumer comes along, or whenever a retained enabling is to be matched.) If
7363 * the specified description is non-NULL, the provider is given the
7364 * opportunity to dynamically provide the specified probe, allowing providers
7365 * to support the creation of probes on-the-fly.  (So-called _autocreated_
7366 * probes.)  If the provider is NULL, the operations will be applied to all
7367 * providers; if the provider is non-NULL the operations will only be applied
7368 * to the specified provider.  The dtrace_provider_lock must be held, and the
7369 * dtrace_lock must _not_ be held -- the provider's dtps_provide() operation
7370 * will need to grab the dtrace_lock when it reenters the framework through
7371 * dtrace_probe_lookup(), dtrace_probe_create(), etc.
7372 */
7373static void
7374dtrace_probe_provide(dtrace_probedesc_t *desc, dtrace_provider_t *prv)
7375{
7376	struct modctl *ctl;
7377	int all = 0;
7378
7379	ASSERT(MUTEX_HELD(&dtrace_provider_lock));
7380
7381	if (prv == NULL) {
7382		all = 1;
7383		prv = dtrace_provider;
7384	}
7385
7386	do {
7387		/*
7388		 * First, call the blanket provide operation.
7389		 */
7390		prv->dtpv_pops.dtps_provide(prv->dtpv_arg, desc);
7391
7392		/*
7393		 * Now call the per-module provide operation.  We will grab
7394		 * mod_lock to prevent the list from being modified.  Note
7395		 * that this also prevents the mod_busy bits from changing.
7396		 * (mod_busy can only be changed with mod_lock held.)
7397		 */
7398		mutex_enter(&mod_lock);
7399
7400		ctl = &modules;
7401		do {
7402			if (ctl->mod_busy || ctl->mod_mp == NULL)
7403				continue;
7404
7405			prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
7406
7407		} while ((ctl = ctl->mod_next) != &modules);
7408
7409		mutex_exit(&mod_lock);
7410	} while (all && (prv = prv->dtpv_next) != NULL);
7411}

/*
 * Iterate over each probe, and call the Framework-to-Provider API function
 * denoted by offs.
 */
static void
dtrace_probe_foreach(uintptr_t offs)
{
	dtrace_provider_t *prov;
	void (*func)(void *, dtrace_id_t, void *);
	dtrace_probe_t *probe;
	dtrace_icookie_t cookie;
	int i;

	/*
	 * We disable interrupts to walk through the probe array.  This is
	 * safe -- the dtrace_sync() in dtrace_unregister() assures that we
	 * won't see stale data.
	 */
	cookie = dtrace_interrupt_disable();

	for (i = 0; i < dtrace_nprobes; i++) {
		if ((probe = dtrace_probes[i]) == NULL)
			continue;

		if (probe->dtpr_ecb == NULL) {
			/*
			 * This probe isn't enabled -- don't call the function.
			 */
			continue;
		}

		prov = probe->dtpr_provider;
		func = *((void(**)(void *, dtrace_id_t, void *))
		    ((uintptr_t)&prov->dtpv_pops + offs));

		func(prov->dtpv_arg, i + 1, probe->dtpr_arg);
	}

	dtrace_interrupt_enable(cookie);
}
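
/*
 * Callers pass the byte offset of the desired entry point within the
 * dtrace_pops_t operations vector; dtrace_suspend() and dtrace_resume()
 * later in this file are the callers:
 *
 *	dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_suspend));
 *	dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_resume));
 */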

static int
dtrace_probe_enable(const dtrace_probedesc_t *desc, dtrace_enabling_t *enab)
{
	dtrace_probekey_t pkey;
	uint32_t priv;
	uid_t uid;
	zoneid_t zoneid;

	ASSERT(MUTEX_HELD(&dtrace_lock));
	dtrace_ecb_create_cache = NULL;

	if (desc == NULL) {
		/*
		 * If we're passed a NULL description, we're being asked to
		 * create an ECB with a NULL probe.
		 */
		(void) dtrace_ecb_create_enable(NULL, enab);
		return (0);
	}

	dtrace_probekey(desc, &pkey);
	dtrace_cred2priv(enab->dten_vstate->dtvs_state->dts_cred.dcr_cred,
	    &priv, &uid, &zoneid);

	return (dtrace_match(&pkey, priv, uid, zoneid, dtrace_ecb_create_enable,
	    enab));
}

/*
 * DTrace Helper Provider Functions
 */
static void
dtrace_dofattr2attr(dtrace_attribute_t *attr, const dof_attr_t dofattr)
{
	attr->dtat_name = DOF_ATTR_NAME(dofattr);
	attr->dtat_data = DOF_ATTR_DATA(dofattr);
	attr->dtat_class = DOF_ATTR_CLASS(dofattr);
}
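
/*
 * A dof_attr_t packs the three stability components into a single 32-bit
 * word; the DOF_ATTR_* accessors used above simply unpack it.  A sketch of
 * the assumed encoding (see <sys/dtrace.h> for the authoritative macros):
 *
 *	name  = (dofattr >> 24) & 0xff;
 *	data  = (dofattr >> 16) & 0xff;
 *	class = (dofattr >> 8) & 0xff;
 */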

static void
dtrace_dofprov2hprov(dtrace_helper_provdesc_t *hprov,
    const dof_provider_t *dofprov, char *strtab)
{
	hprov->dthpv_provname = strtab + dofprov->dofpv_name;
	dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_provider,
	    dofprov->dofpv_provattr);
	dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_mod,
	    dofprov->dofpv_modattr);
	dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_func,
	    dofprov->dofpv_funcattr);
	dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_name,
	    dofprov->dofpv_nameattr);
	dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_args,
	    dofprov->dofpv_argsattr);
}

static void
dtrace_helper_provide_one(dof_helper_t *dhp, dof_sec_t *sec, pid_t pid)
{
	uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
	dof_hdr_t *dof = (dof_hdr_t *)daddr;
	dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
	dof_provider_t *provider;
	dof_probe_t *probe;
	uint32_t *off, *enoff;
	uint8_t *arg;
	char *strtab;
	uint_t i, nprobes;
	dtrace_helper_provdesc_t dhpv;
	dtrace_helper_probedesc_t dhpb;
	dtrace_meta_t *meta = dtrace_meta_pid;
	dtrace_mops_t *mops = &meta->dtm_mops;
	void *parg;

	provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
	str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
	    provider->dofpv_strtab * dof->dofh_secsize);
	prb_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
	    provider->dofpv_probes * dof->dofh_secsize);
	arg_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
	    provider->dofpv_prargs * dof->dofh_secsize);
	off_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
	    provider->dofpv_proffs * dof->dofh_secsize);

	strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
	off = (uint32_t *)(uintptr_t)(daddr + off_sec->dofs_offset);
	arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
	enoff = NULL;

	/*
	 * See dtrace_helper_provider_validate().
	 */
	if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
	    provider->dofpv_prenoffs != DOF_SECT_NONE) {
		enoff_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
		    provider->dofpv_prenoffs * dof->dofh_secsize);
		enoff = (uint32_t *)(uintptr_t)(daddr + enoff_sec->dofs_offset);
	}

	nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;

	/*
	 * Create the provider.
	 */
	dtrace_dofprov2hprov(&dhpv, provider, strtab);

	if ((parg = mops->dtms_provide_pid(meta->dtm_arg, &dhpv, pid)) == NULL)
		return;

	meta->dtm_count++;

	/*
	 * Create the probes.
	 */
	for (i = 0; i < nprobes; i++) {
		probe = (dof_probe_t *)(uintptr_t)(daddr +
		    prb_sec->dofs_offset + i * prb_sec->dofs_entsize);

		dhpb.dthpb_mod = dhp->dofhp_mod;
		dhpb.dthpb_func = strtab + probe->dofpr_func;
		dhpb.dthpb_name = strtab + probe->dofpr_name;
		dhpb.dthpb_base = probe->dofpr_addr;
		dhpb.dthpb_offs = off + probe->dofpr_offidx;
		dhpb.dthpb_noffs = probe->dofpr_noffs;
		if (enoff != NULL) {
			dhpb.dthpb_enoffs = enoff + probe->dofpr_enoffidx;
			dhpb.dthpb_nenoffs = probe->dofpr_nenoffs;
		} else {
			dhpb.dthpb_enoffs = NULL;
			dhpb.dthpb_nenoffs = 0;
		}
		dhpb.dthpb_args = arg + probe->dofpr_argidx;
		dhpb.dthpb_nargc = probe->dofpr_nargc;
		dhpb.dthpb_xargc = probe->dofpr_xargc;
		dhpb.dthpb_ntypes = strtab + probe->dofpr_nargv;
		dhpb.dthpb_xtypes = strtab + probe->dofpr_xargv;

		mops->dtms_create_probe(meta->dtm_arg, parg, &dhpb);
	}
}

static void
dtrace_helper_provide(dof_helper_t *dhp, pid_t pid)
{
	uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
	dof_hdr_t *dof = (dof_hdr_t *)daddr;
	int i;

	ASSERT(MUTEX_HELD(&dtrace_meta_lock));

	for (i = 0; i < dof->dofh_secnum; i++) {
		dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
		    dof->dofh_secoff + i * dof->dofh_secsize);

		if (sec->dofs_type != DOF_SECT_PROVIDER)
			continue;

		dtrace_helper_provide_one(dhp, sec, pid);
	}

	/*
	 * We may have just created probes, so we must now rematch against
	 * any retained enablings.  Note that this call will acquire both
	 * cpu_lock and dtrace_lock; the fact that we are holding
	 * dtrace_meta_lock now is what defines the ordering with respect to
	 * these three locks.
	 */
	dtrace_enabling_matchall();
}

static void
dtrace_helper_provider_remove_one(dof_helper_t *dhp, dof_sec_t *sec, pid_t pid)
{
	uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
	dof_hdr_t *dof = (dof_hdr_t *)daddr;
	dof_sec_t *str_sec;
	dof_provider_t *provider;
	char *strtab;
	dtrace_helper_provdesc_t dhpv;
	dtrace_meta_t *meta = dtrace_meta_pid;
	dtrace_mops_t *mops = &meta->dtm_mops;

	provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
	str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
	    provider->dofpv_strtab * dof->dofh_secsize);

	strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);

	/*
	 * Create the provider.
	 */
	dtrace_dofprov2hprov(&dhpv, provider, strtab);

	mops->dtms_remove_pid(meta->dtm_arg, &dhpv, pid);

	meta->dtm_count--;
}

static void
dtrace_helper_provider_remove(dof_helper_t *dhp, pid_t pid)
{
	uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
	dof_hdr_t *dof = (dof_hdr_t *)daddr;
	int i;

	ASSERT(MUTEX_HELD(&dtrace_meta_lock));

	for (i = 0; i < dof->dofh_secnum; i++) {
		dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
		    dof->dofh_secoff + i * dof->dofh_secsize);

		if (sec->dofs_type != DOF_SECT_PROVIDER)
			continue;

		dtrace_helper_provider_remove_one(dhp, sec, pid);
	}
}

/*
 * DTrace Meta Provider-to-Framework API Functions
 *
 * These functions implement the Meta Provider-to-Framework API, as described
 * in <sys/dtrace.h>.
 */
int
dtrace_meta_register(const char *name, const dtrace_mops_t *mops, void *arg,
    dtrace_meta_provider_id_t *idp)
{
	dtrace_meta_t *meta;
	dtrace_helpers_t *help, *next;
	int i;

	*idp = DTRACE_METAPROVNONE;

	/*
	 * We strictly don't need the name, but we hold onto it for
	 * debuggability. All hail error queues!
	 */
	if (name == NULL) {
		cmn_err(CE_WARN, "failed to register meta-provider: "
		    "invalid name");
		return (EINVAL);
	}

	if (mops == NULL ||
	    mops->dtms_create_probe == NULL ||
	    mops->dtms_provide_pid == NULL ||
	    mops->dtms_remove_pid == NULL) {
		cmn_err(CE_WARN, "failed to register meta-provider %s: "
		    "invalid ops", name);
		return (EINVAL);
	}

	meta = kmem_zalloc(sizeof (dtrace_meta_t), KM_SLEEP);
	meta->dtm_mops = *mops;
	meta->dtm_name = kmem_alloc(strlen(name) + 1, KM_SLEEP);
	(void) strcpy(meta->dtm_name, name);
	meta->dtm_arg = arg;

	mutex_enter(&dtrace_meta_lock);
	mutex_enter(&dtrace_lock);

	if (dtrace_meta_pid != NULL) {
		mutex_exit(&dtrace_lock);
		mutex_exit(&dtrace_meta_lock);
		cmn_err(CE_WARN, "failed to register meta-provider %s: "
		    "user-land meta-provider exists", name);
		kmem_free(meta->dtm_name, strlen(meta->dtm_name) + 1);
		kmem_free(meta, sizeof (dtrace_meta_t));
		return (EINVAL);
	}

	dtrace_meta_pid = meta;
	*idp = (dtrace_meta_provider_id_t)meta;

	/*
	 * If there are providers and probes ready to go, pass them
	 * off to the new meta provider now.
	 */

	help = dtrace_deferred_pid;
	dtrace_deferred_pid = NULL;

	mutex_exit(&dtrace_lock);

	while (help != NULL) {
		for (i = 0; i < help->dthps_nprovs; i++) {
			dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
			    help->dthps_pid);
		}

		next = help->dthps_next;
		help->dthps_next = NULL;
		help->dthps_prev = NULL;
		help->dthps_deferred = 0;
		help = next;
	}

	mutex_exit(&dtrace_meta_lock);

	return (0);
}
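
/*
 * For example, a user-land meta-provider such as the pid provider's
 * fasttrap driver registers itself with something along these lines (a
 * sketch; fasttrap_mops and fasttrap_meta_id are assumed names):
 *
 *	static dtrace_meta_provider_id_t fasttrap_meta_id;
 *
 *	(void) dtrace_meta_register("pid", &fasttrap_mops, NULL,
 *	    &fasttrap_meta_id);
 */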

int
dtrace_meta_unregister(dtrace_meta_provider_id_t id)
{
	dtrace_meta_t **pp, *old = (dtrace_meta_t *)id;

	mutex_enter(&dtrace_meta_lock);
	mutex_enter(&dtrace_lock);

	if (old == dtrace_meta_pid) {
		pp = &dtrace_meta_pid;
	} else {
		panic("attempt to unregister non-existent "
		    "dtrace meta-provider %p\n", (void *)old);
	}

	if (old->dtm_count != 0) {
		mutex_exit(&dtrace_lock);
		mutex_exit(&dtrace_meta_lock);
		return (EBUSY);
	}

	*pp = NULL;

	mutex_exit(&dtrace_lock);
	mutex_exit(&dtrace_meta_lock);

	kmem_free(old->dtm_name, strlen(old->dtm_name) + 1);
	kmem_free(old, sizeof (dtrace_meta_t));

	return (0);
}


/*
 * DTrace DIF Object Functions
 */
static int
dtrace_difo_err(uint_t pc, const char *format, ...)
{
	if (dtrace_err_verbose) {
		va_list alist;

		(void) uprintf("dtrace DIF object error: [%u]: ", pc);
		va_start(alist, format);
		(void) vuprintf(format, alist);
		va_end(alist);
	}

#ifdef DTRACE_ERRDEBUG
	dtrace_errdebug(format);
#endif
	return (1);
}

/*
 * Validate a DTrace DIF object by checking the IR instructions.  The following
 * rules are currently enforced by dtrace_difo_validate():
 *
 * 1. Each instruction must have a valid opcode
 * 2. Each register, string, variable, or subroutine reference must be valid
 * 3. No instruction can modify register %r0 (must be zero)
 * 4. All instruction reserved bits must be set to zero
 * 5. The last instruction must be a "ret" instruction
 * 6. All branch targets must reference a valid instruction _after_ the branch
 */
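
/*
 * For reference, a DIF instruction is a single 32-bit word.  A sketch of
 * the decoding assumed by the checks below (see <sys/dtrace.h> for the
 * authoritative DIF_INSTR_* macros):
 *
 *	op = (instr >> 24) & 0xff;
 *	r1 = (instr >> 16) & 0xff;
 *	r2 = (instr >> 8) & 0xff;
 *	rd = instr & 0xff;
 *
 * For branch instructions, the 24 low-order bits instead hold the target
 * label.
 */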
static int
dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs,
    cred_t *cr)
{
	int err = 0, i;
	int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
	int kcheckload;
	uint_t pc;

	kcheckload = cr == NULL ||
	    (vstate->dtvs_state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) == 0;

	dp->dtdo_destructive = 0;

	for (pc = 0; pc < dp->dtdo_len && err == 0; pc++) {
		dif_instr_t instr = dp->dtdo_buf[pc];

		uint_t r1 = DIF_INSTR_R1(instr);
		uint_t r2 = DIF_INSTR_R2(instr);
		uint_t rd = DIF_INSTR_RD(instr);
		uint_t rs = DIF_INSTR_RS(instr);
		uint_t label = DIF_INSTR_LABEL(instr);
		uint_t v = DIF_INSTR_VAR(instr);
		uint_t subr = DIF_INSTR_SUBR(instr);
		uint_t type = DIF_INSTR_TYPE(instr);
		uint_t op = DIF_INSTR_OP(instr);

		switch (op) {
		case DIF_OP_OR:
		case DIF_OP_XOR:
		case DIF_OP_AND:
		case DIF_OP_SLL:
		case DIF_OP_SRL:
		case DIF_OP_SRA:
		case DIF_OP_SUB:
		case DIF_OP_ADD:
		case DIF_OP_MUL:
		case DIF_OP_SDIV:
		case DIF_OP_UDIV:
		case DIF_OP_SREM:
		case DIF_OP_UREM:
		case DIF_OP_COPYS:
			if (r1 >= nregs)
				err += efunc(pc, "invalid register %u\n", r1);
			if (r2 >= nregs)
				err += efunc(pc, "invalid register %u\n", r2);
			if (rd >= nregs)
				err += efunc(pc, "invalid register %u\n", rd);
			if (rd == 0)
				err += efunc(pc, "cannot write to %r0\n");
			break;
		case DIF_OP_NOT:
		case DIF_OP_MOV:
		case DIF_OP_ALLOCS:
			if (r1 >= nregs)
				err += efunc(pc, "invalid register %u\n", r1);
			if (r2 != 0)
				err += efunc(pc, "non-zero reserved bits\n");
			if (rd >= nregs)
				err += efunc(pc, "invalid register %u\n", rd);
			if (rd == 0)
				err += efunc(pc, "cannot write to %r0\n");
			break;
		case DIF_OP_LDSB:
		case DIF_OP_LDSH:
		case DIF_OP_LDSW:
		case DIF_OP_LDUB:
		case DIF_OP_LDUH:
		case DIF_OP_LDUW:
		case DIF_OP_LDX:
			if (r1 >= nregs)
				err += efunc(pc, "invalid register %u\n", r1);
			if (r2 != 0)
				err += efunc(pc, "non-zero reserved bits\n");
			if (rd >= nregs)
				err += efunc(pc, "invalid register %u\n", rd);
			if (rd == 0)
				err += efunc(pc, "cannot write to %r0\n");
			if (kcheckload)
				dp->dtdo_buf[pc] = DIF_INSTR_LOAD(op +
				    DIF_OP_RLDSB - DIF_OP_LDSB, r1, rd);
			break;
		case DIF_OP_RLDSB:
		case DIF_OP_RLDSH:
		case DIF_OP_RLDSW:
		case DIF_OP_RLDUB:
		case DIF_OP_RLDUH:
		case DIF_OP_RLDUW:
		case DIF_OP_RLDX:
			if (r1 >= nregs)
				err += efunc(pc, "invalid register %u\n", r1);
			if (r2 != 0)
				err += efunc(pc, "non-zero reserved bits\n");
			if (rd >= nregs)
				err += efunc(pc, "invalid register %u\n", rd);
			if (rd == 0)
				err += efunc(pc, "cannot write to %r0\n");
			break;
		case DIF_OP_ULDSB:
		case DIF_OP_ULDSH:
		case DIF_OP_ULDSW:
		case DIF_OP_ULDUB:
		case DIF_OP_ULDUH:
		case DIF_OP_ULDUW:
		case DIF_OP_ULDX:
			if (r1 >= nregs)
				err += efunc(pc, "invalid register %u\n", r1);
			if (r2 != 0)
				err += efunc(pc, "non-zero reserved bits\n");
			if (rd >= nregs)
				err += efunc(pc, "invalid register %u\n", rd);
			if (rd == 0)
				err += efunc(pc, "cannot write to %r0\n");
			break;
		case DIF_OP_STB:
		case DIF_OP_STH:
		case DIF_OP_STW:
		case DIF_OP_STX:
			if (r1 >= nregs)
				err += efunc(pc, "invalid register %u\n", r1);
			if (r2 != 0)
				err += efunc(pc, "non-zero reserved bits\n");
			if (rd >= nregs)
				err += efunc(pc, "invalid register %u\n", rd);
			if (rd == 0)
				err += efunc(pc, "cannot write to 0 address\n");
			break;
		case DIF_OP_CMP:
		case DIF_OP_SCMP:
			if (r1 >= nregs)
				err += efunc(pc, "invalid register %u\n", r1);
			if (r2 >= nregs)
				err += efunc(pc, "invalid register %u\n", r2);
			if (rd != 0)
				err += efunc(pc, "non-zero reserved bits\n");
			break;
		case DIF_OP_TST:
			if (r1 >= nregs)
				err += efunc(pc, "invalid register %u\n", r1);
			if (r2 != 0 || rd != 0)
				err += efunc(pc, "non-zero reserved bits\n");
			break;
		case DIF_OP_BA:
		case DIF_OP_BE:
		case DIF_OP_BNE:
		case DIF_OP_BG:
		case DIF_OP_BGU:
		case DIF_OP_BGE:
		case DIF_OP_BGEU:
		case DIF_OP_BL:
		case DIF_OP_BLU:
		case DIF_OP_BLE:
		case DIF_OP_BLEU:
			if (label >= dp->dtdo_len) {
				err += efunc(pc, "invalid branch target %u\n",
				    label);
			}
			if (label <= pc) {
				err += efunc(pc, "backward branch to %u\n",
				    label);
			}
			break;
		case DIF_OP_RET:
			if (r1 != 0 || r2 != 0)
				err += efunc(pc, "non-zero reserved bits\n");
			if (rd >= nregs)
				err += efunc(pc, "invalid register %u\n", rd);
			break;
		case DIF_OP_NOP:
		case DIF_OP_POPTS:
		case DIF_OP_FLUSHTS:
			if (r1 != 0 || r2 != 0 || rd != 0)
				err += efunc(pc, "non-zero reserved bits\n");
			break;
		case DIF_OP_SETX:
			if (DIF_INSTR_INTEGER(instr) >= dp->dtdo_intlen) {
				err += efunc(pc, "invalid integer ref %u\n",
				    DIF_INSTR_INTEGER(instr));
			}
			if (rd >= nregs)
				err += efunc(pc, "invalid register %u\n", rd);
			if (rd == 0)
				err += efunc(pc, "cannot write to %r0\n");
			break;
		case DIF_OP_SETS:
			if (DIF_INSTR_STRING(instr) >= dp->dtdo_strlen) {
				err += efunc(pc, "invalid string ref %u\n",
				    DIF_INSTR_STRING(instr));
			}
			if (rd >= nregs)
				err += efunc(pc, "invalid register %u\n", rd);
			if (rd == 0)
				err += efunc(pc, "cannot write to %r0\n");
			break;
		case DIF_OP_LDGA:
		case DIF_OP_LDTA:
			if (r1 > DIF_VAR_ARRAY_MAX)
				err += efunc(pc, "invalid array %u\n", r1);
			if (r2 >= nregs)
				err += efunc(pc, "invalid register %u\n", r2);
			if (rd >= nregs)
				err += efunc(pc, "invalid register %u\n", rd);
			if (rd == 0)
				err += efunc(pc, "cannot write to %r0\n");
			break;
		case DIF_OP_LDGS:
		case DIF_OP_LDTS:
		case DIF_OP_LDLS:
		case DIF_OP_LDGAA:
		case DIF_OP_LDTAA:
			if (v < DIF_VAR_OTHER_MIN || v > DIF_VAR_OTHER_MAX)
				err += efunc(pc, "invalid variable %u\n", v);
			if (rd >= nregs)
				err += efunc(pc, "invalid register %u\n", rd);
			if (rd == 0)
				err += efunc(pc, "cannot write to %r0\n");
			break;
		case DIF_OP_STGS:
		case DIF_OP_STTS:
		case DIF_OP_STLS:
		case DIF_OP_STGAA:
		case DIF_OP_STTAA:
			if (v < DIF_VAR_OTHER_UBASE || v > DIF_VAR_OTHER_MAX)
				err += efunc(pc, "invalid variable %u\n", v);
8047				err += efunc(pc, "invalid register %u\n", rd);
8048			break;
8049		case DIF_OP_CALL:
8050			if (subr > DIF_SUBR_MAX)
8051				err += efunc(pc, "invalid subr %u\n", subr);
8052			if (rd >= nregs)
8053				err += efunc(pc, "invalid register %u\n", rd);
8054			if (rd == 0)
8055				err += efunc(pc, "cannot write to %r0\n");
8056
8057			if (subr == DIF_SUBR_COPYOUT ||
8058			    subr == DIF_SUBR_COPYOUTSTR) {
8059				dp->dtdo_destructive = 1;
8060			}
8061			break;
8062		case DIF_OP_PUSHTR:
8063			if (type != DIF_TYPE_STRING && type != DIF_TYPE_CTF)
8064				err += efunc(pc, "invalid ref type %u\n", type);
8065			if (r2 >= nregs)
8066				err += efunc(pc, "invalid register %u\n", r2);
8067			if (rs >= nregs)
8068				err += efunc(pc, "invalid register %u\n", rs);
8069			break;
8070		case DIF_OP_PUSHTV:
8071			if (type != DIF_TYPE_CTF)
8072				err += efunc(pc, "invalid val type %u\n", type);
8073			if (r2 >= nregs)
8074				err += efunc(pc, "invalid register %u\n", r2);
8075			if (rs >= nregs)
8076				err += efunc(pc, "invalid register %u\n", rs);
8077			break;
8078		default:
8079			err += efunc(pc, "invalid opcode %u\n",
8080			    DIF_INSTR_OP(instr));
8081		}
8082	}
8083
8084	if (dp->dtdo_len != 0 &&
8085	    DIF_INSTR_OP(dp->dtdo_buf[dp->dtdo_len - 1]) != DIF_OP_RET) {
8086		err += efunc(dp->dtdo_len - 1,
8087		    "expected 'ret' as last DIF instruction\n");
8088	}
8089
8090	if (!(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF)) {
8091		/*
8092		 * If we're not returning by reference, the size must be either
8093		 * 0 or the size of one of the base types.
8094		 */
8095		switch (dp->dtdo_rtype.dtdt_size) {
8096		case 0:
8097		case sizeof (uint8_t):
8098		case sizeof (uint16_t):
8099		case sizeof (uint32_t):
8100		case sizeof (uint64_t):
8101			break;
8102
8103		default:
8104			err += efunc(dp->dtdo_len - 1, "bad return size");
8105		}
8106	}
8107
8108	for (i = 0; i < dp->dtdo_varlen && err == 0; i++) {
8109		dtrace_difv_t *v = &dp->dtdo_vartab[i], *existing = NULL;
8110		dtrace_diftype_t *vt, *et;
8111		uint_t id, ndx;
8112
8113		if (v->dtdv_scope != DIFV_SCOPE_GLOBAL &&
8114		    v->dtdv_scope != DIFV_SCOPE_THREAD &&
8115		    v->dtdv_scope != DIFV_SCOPE_LOCAL) {
8116			err += efunc(i, "unrecognized variable scope %d\n",
8117			    v->dtdv_scope);
8118			break;
8119		}
8120
8121		if (v->dtdv_kind != DIFV_KIND_ARRAY &&
8122		    v->dtdv_kind != DIFV_KIND_SCALAR) {
8123			err += efunc(i, "unrecognized variable type %d\n",
8124			    v->dtdv_kind);
8125			break;
8126		}
8127
8128		if ((id = v->dtdv_id) > DIF_VARIABLE_MAX) {
8129			err += efunc(i, "%d exceeds variable id limit\n", id);
8130			break;
8131		}
8132
8133		if (id < DIF_VAR_OTHER_UBASE)
8134			continue;
8135
8136		/*
8137		 * For user-defined variables, we need to check that this
8138		 * definition is identical to any previous definition that we
8139		 * encountered.
8140		 */
8141		ndx = id - DIF_VAR_OTHER_UBASE;
8142
8143		switch (v->dtdv_scope) {
8144		case DIFV_SCOPE_GLOBAL:
8145			if (ndx < vstate->dtvs_nglobals) {
8146				dtrace_statvar_t *svar;
8147
8148				if ((svar = vstate->dtvs_globals[ndx]) != NULL)
8149					existing = &svar->dtsv_var;
8150			}
8151
8152			break;
8153
8154		case DIFV_SCOPE_THREAD:
8155			if (ndx < vstate->dtvs_ntlocals)
8156				existing = &vstate->dtvs_tlocals[ndx];
8157			break;
8158
8159		case DIFV_SCOPE_LOCAL:
8160			if (ndx < vstate->dtvs_nlocals) {
8161				dtrace_statvar_t *svar;
8162
8163				if ((svar = vstate->dtvs_locals[ndx]) != NULL)
8164					existing = &svar->dtsv_var;
8165			}
8166
8167			break;
8168		}
8169
8170		vt = &v->dtdv_type;
8171
8172		if (vt->dtdt_flags & DIF_TF_BYREF) {
8173			if (vt->dtdt_size == 0) {
8174				err += efunc(i, "zero-sized variable\n");
8175				break;
8176			}
8177
8178			if (v->dtdv_scope == DIFV_SCOPE_GLOBAL &&
8179			    vt->dtdt_size > dtrace_global_maxsize) {
8180				err += efunc(i, "oversized by-ref global\n");
8181				break;
8182			}
8183		}
8184
8185		if (existing == NULL || existing->dtdv_id == 0)
8186			continue;
8187
8188		ASSERT(existing->dtdv_id == v->dtdv_id);
8189		ASSERT(existing->dtdv_scope == v->dtdv_scope);
8190
8191		if (existing->dtdv_kind != v->dtdv_kind)
8192			err += efunc(i, "%d changed variable kind\n", id);
8193
8194		et = &existing->dtdv_type;
8195
8196		if (vt->dtdt_flags != et->dtdt_flags) {
8197			err += efunc(i, "%d changed variable type flags\n", id);
8198			break;
8199		}
8200
8201		if (vt->dtdt_size != 0 && vt->dtdt_size != et->dtdt_size) {
8202			err += efunc(i, "%d changed variable type size\n", id);
8203			break;
8204		}
8205	}
8206
8207	return (err);
8208}
8209
/*
 * Validate a DTrace DIF object that is to be used as a helper.  Helpers
 * are much more constrained than normal DIFOs.  Specifically, they may
 * not:
 *
 * 1. Make calls to subroutines other than copyin(), copyinstr() or
 *    miscellaneous string routines
 * 2. Access DTrace variables other than the args[] array, and the
 *    curthread, pid, ppid, tid, execname, zonename, uid and gid variables.
 * 3. Have thread-local variables.
 * 4. Have dynamic variables.
 */
static int
dtrace_difo_validate_helper(dtrace_difo_t *dp)
{
	int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
	int err = 0;
	uint_t pc;

	for (pc = 0; pc < dp->dtdo_len; pc++) {
		dif_instr_t instr = dp->dtdo_buf[pc];

		uint_t v = DIF_INSTR_VAR(instr);
		uint_t subr = DIF_INSTR_SUBR(instr);
		uint_t op = DIF_INSTR_OP(instr);

		switch (op) {
		case DIF_OP_OR:
		case DIF_OP_XOR:
		case DIF_OP_AND:
		case DIF_OP_SLL:
		case DIF_OP_SRL:
		case DIF_OP_SRA:
		case DIF_OP_SUB:
		case DIF_OP_ADD:
		case DIF_OP_MUL:
		case DIF_OP_SDIV:
		case DIF_OP_UDIV:
		case DIF_OP_SREM:
		case DIF_OP_UREM:
		case DIF_OP_COPYS:
		case DIF_OP_NOT:
		case DIF_OP_MOV:
		case DIF_OP_RLDSB:
		case DIF_OP_RLDSH:
		case DIF_OP_RLDSW:
		case DIF_OP_RLDUB:
		case DIF_OP_RLDUH:
		case DIF_OP_RLDUW:
		case DIF_OP_RLDX:
		case DIF_OP_ULDSB:
		case DIF_OP_ULDSH:
		case DIF_OP_ULDSW:
		case DIF_OP_ULDUB:
		case DIF_OP_ULDUH:
		case DIF_OP_ULDUW:
		case DIF_OP_ULDX:
		case DIF_OP_STB:
		case DIF_OP_STH:
		case DIF_OP_STW:
		case DIF_OP_STX:
		case DIF_OP_ALLOCS:
		case DIF_OP_CMP:
		case DIF_OP_SCMP:
		case DIF_OP_TST:
		case DIF_OP_BA:
		case DIF_OP_BE:
		case DIF_OP_BNE:
		case DIF_OP_BG:
		case DIF_OP_BGU:
		case DIF_OP_BGE:
		case DIF_OP_BGEU:
		case DIF_OP_BL:
		case DIF_OP_BLU:
		case DIF_OP_BLE:
		case DIF_OP_BLEU:
		case DIF_OP_RET:
		case DIF_OP_NOP:
		case DIF_OP_POPTS:
		case DIF_OP_FLUSHTS:
		case DIF_OP_SETX:
		case DIF_OP_SETS:
		case DIF_OP_LDGA:
		case DIF_OP_LDLS:
		case DIF_OP_STGS:
		case DIF_OP_STLS:
		case DIF_OP_PUSHTR:
		case DIF_OP_PUSHTV:
			break;

		case DIF_OP_LDGS:
			if (v >= DIF_VAR_OTHER_UBASE)
				break;

			if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9)
				break;

			if (v == DIF_VAR_CURTHREAD || v == DIF_VAR_PID ||
			    v == DIF_VAR_PPID || v == DIF_VAR_TID ||
			    v == DIF_VAR_EXECNAME || v == DIF_VAR_ZONENAME ||
			    v == DIF_VAR_UID || v == DIF_VAR_GID)
				break;

			err += efunc(pc, "illegal variable %u\n", v);
			break;

		case DIF_OP_LDTA:
		case DIF_OP_LDTS:
		case DIF_OP_LDGAA:
		case DIF_OP_LDTAA:
			err += efunc(pc, "illegal dynamic variable load\n");
			break;

		case DIF_OP_STTS:
		case DIF_OP_STGAA:
		case DIF_OP_STTAA:
			err += efunc(pc, "illegal dynamic variable store\n");
			break;

		case DIF_OP_CALL:
			if (subr == DIF_SUBR_ALLOCA ||
			    subr == DIF_SUBR_BCOPY ||
			    subr == DIF_SUBR_COPYIN ||
			    subr == DIF_SUBR_COPYINTO ||
			    subr == DIF_SUBR_COPYINSTR ||
			    subr == DIF_SUBR_INDEX ||
			    subr == DIF_SUBR_INET_NTOA ||
			    subr == DIF_SUBR_INET_NTOA6 ||
			    subr == DIF_SUBR_INET_NTOP ||
			    subr == DIF_SUBR_LLTOSTR ||
			    subr == DIF_SUBR_RINDEX ||
			    subr == DIF_SUBR_STRCHR ||
			    subr == DIF_SUBR_STRJOIN ||
			    subr == DIF_SUBR_STRRCHR ||
			    subr == DIF_SUBR_STRSTR ||
			    subr == DIF_SUBR_HTONS ||
			    subr == DIF_SUBR_HTONL ||
			    subr == DIF_SUBR_HTONLL ||
			    subr == DIF_SUBR_NTOHS ||
			    subr == DIF_SUBR_NTOHL ||
			    subr == DIF_SUBR_NTOHLL)
				break;

			err += efunc(pc, "invalid subr %u\n", subr);
			break;

		default:
			err += efunc(pc, "invalid opcode %u\n",
			    DIF_INSTR_OP(instr));
		}
	}

	return (err);
}

/*
 * Returns 1 if the expression in the DIF object can be cached on a per-thread
 * basis; 0 if not.
 */
static int
dtrace_difo_cacheable(dtrace_difo_t *dp)
{
	int i;

	if (dp == NULL)
		return (0);

	for (i = 0; i < dp->dtdo_varlen; i++) {
		dtrace_difv_t *v = &dp->dtdo_vartab[i];

		if (v->dtdv_scope != DIFV_SCOPE_GLOBAL)
			continue;

		switch (v->dtdv_id) {
		case DIF_VAR_CURTHREAD:
		case DIF_VAR_PID:
		case DIF_VAR_TID:
		case DIF_VAR_EXECNAME:
		case DIF_VAR_ZONENAME:
			break;

		default:
			return (0);
		}
	}

	/*
	 * This DIF object may be cacheable.  Now we need to look for any
	 * array loading instructions, any memory loading instructions, or
	 * any stores to thread-local variables.
	 */
	for (i = 0; i < dp->dtdo_len; i++) {
		uint_t op = DIF_INSTR_OP(dp->dtdo_buf[i]);

		if ((op >= DIF_OP_LDSB && op <= DIF_OP_LDX) ||
		    (op >= DIF_OP_ULDSB && op <= DIF_OP_ULDX) ||
		    (op >= DIF_OP_RLDSB && op <= DIF_OP_RLDX) ||
		    op == DIF_OP_LDGA || op == DIF_OP_STTS)
			return (0);
	}

	return (1);
}
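
/*
 * For example, a predicate such as /pid == 42/ compiles down to a load of
 * DIF_VAR_PID and a comparison against a constant, so its result cannot
 * change for the life of a thread and may be cached; any expression that
 * loads memory or indexes an array can yield a different result on the
 * next firing, and so may not.
 */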

static void
dtrace_difo_hold(dtrace_difo_t *dp)
{
	int i;

	ASSERT(MUTEX_HELD(&dtrace_lock));

	dp->dtdo_refcnt++;
	ASSERT(dp->dtdo_refcnt != 0);

	/*
	 * We need to check this DIF object for references to the variable
	 * DIF_VAR_VTIMESTAMP.
	 */
	for (i = 0; i < dp->dtdo_varlen; i++) {
		dtrace_difv_t *v = &dp->dtdo_vartab[i];

		if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
			continue;

		if (dtrace_vtime_references++ == 0)
			dtrace_vtime_enable();
	}
}

/*
 * This routine calculates the dynamic variable chunksize for a given DIF
 * object.  The calculation is not fool-proof, and can probably be tricked by
 * malicious DIF -- but it works for all compiler-generated DIF.  Because this
 * calculation is likely imperfect, dtrace_dynvar() is able to gracefully fail
 * if a dynamic variable size exceeds the chunksize.
 */
static void
dtrace_difo_chunksize(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
{
	uint64_t sval;
	dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
	const dif_instr_t *text = dp->dtdo_buf;
	uint_t pc, srd = 0;
	uint_t ttop = 0;
	size_t size, ksize;
	uint_t id, i;

	for (pc = 0; pc < dp->dtdo_len; pc++) {
		dif_instr_t instr = text[pc];
		uint_t op = DIF_INSTR_OP(instr);
		uint_t rd = DIF_INSTR_RD(instr);
		uint_t r1 = DIF_INSTR_R1(instr);
		uint_t nkeys = 0;
		uchar_t scope;

		dtrace_key_t *key = tupregs;

		switch (op) {
		case DIF_OP_SETX:
			sval = dp->dtdo_inttab[DIF_INSTR_INTEGER(instr)];
			srd = rd;
			continue;

		case DIF_OP_STTS:
			key = &tupregs[DIF_DTR_NREGS];
			key[0].dttk_size = 0;
			key[1].dttk_size = 0;
			nkeys = 2;
			scope = DIFV_SCOPE_THREAD;
			break;

		case DIF_OP_STGAA:
		case DIF_OP_STTAA:
			nkeys = ttop;

			if (DIF_INSTR_OP(instr) == DIF_OP_STTAA)
				key[nkeys++].dttk_size = 0;

			key[nkeys++].dttk_size = 0;

			if (op == DIF_OP_STTAA) {
				scope = DIFV_SCOPE_THREAD;
			} else {
				scope = DIFV_SCOPE_GLOBAL;
			}

			break;

		case DIF_OP_PUSHTR:
			if (ttop == DIF_DTR_NREGS)
				return;

			if ((srd == 0 || sval == 0) && r1 == DIF_TYPE_STRING) {
				/*
				 * If the register for the size of the "pushtr"
				 * is %r0 (or the value is 0) and the type is
				 * a string, we'll use the system-wide default
				 * string size.
				 */
				tupregs[ttop++].dttk_size =
				    dtrace_strsize_default;
			} else {
				if (srd == 0)
					return;

				tupregs[ttop++].dttk_size = sval;
			}

			break;

		case DIF_OP_PUSHTV:
			if (ttop == DIF_DTR_NREGS)
				return;

			tupregs[ttop++].dttk_size = 0;
			break;

		case DIF_OP_FLUSHTS:
			ttop = 0;
			break;

		case DIF_OP_POPTS:
			if (ttop != 0)
				ttop--;
			break;
		}

		sval = 0;
		srd = 0;

		if (nkeys == 0)
			continue;

		/*
		 * We have a dynamic variable allocation; calculate its size.
		 */
		for (ksize = 0, i = 0; i < nkeys; i++)
			ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));

		size = sizeof (dtrace_dynvar_t);
		size += sizeof (dtrace_key_t) * (nkeys - 1);
		size += ksize;

		/*
		 * Now we need to determine the size of the stored data.
		 */
		id = DIF_INSTR_VAR(instr);

		for (i = 0; i < dp->dtdo_varlen; i++) {
			dtrace_difv_t *v = &dp->dtdo_vartab[i];

			if (v->dtdv_id == id && v->dtdv_scope == scope) {
				size += v->dtdv_type.dtdt_size;
				break;
			}
		}

		if (i == dp->dtdo_varlen)
			return;

		/*
		 * We have the size.  If this is larger than the chunk size
		 * for our dynamic variable state, reset the chunk size.
		 */
		size = P2ROUNDUP(size, sizeof (uint64_t));

		if (size > vstate->dtvs_dynvars.dtds_chunksize)
			vstate->dtvs_dynvars.dtds_chunksize = size;
	}
}
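
/*
 * A note on the arithmetic above: P2ROUNDUP(x, align) rounds x up to the
 * next multiple of a power-of-two alignment, so, for instance,
 * P2ROUNDUP(13, sizeof (uint64_t)) == 16.  A chunk thus comprises the fixed
 * dtrace_dynvar_t header, one additional dtrace_key_t per key beyond the
 * first, the key data itself (each key rounded up to eight bytes), and the
 * stored value, with the total rounded up to an eight-byte boundary.
 */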

static void
dtrace_difo_init(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
{
	int i, oldsvars, osz, nsz, otlocals, ntlocals;
	uint_t id;

	ASSERT(MUTEX_HELD(&dtrace_lock));
	ASSERT(dp->dtdo_buf != NULL && dp->dtdo_len != 0);

	for (i = 0; i < dp->dtdo_varlen; i++) {
		dtrace_difv_t *v = &dp->dtdo_vartab[i];
		dtrace_statvar_t *svar, ***svarp;
		size_t dsize = 0;
		uint8_t scope = v->dtdv_scope;
		int *np;

		if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
			continue;

		id -= DIF_VAR_OTHER_UBASE;

		switch (scope) {
		case DIFV_SCOPE_THREAD:
			while (id >= (otlocals = vstate->dtvs_ntlocals)) {
				dtrace_difv_t *tlocals;

				if ((ntlocals = (otlocals << 1)) == 0)
					ntlocals = 1;

				osz = otlocals * sizeof (dtrace_difv_t);
				nsz = ntlocals * sizeof (dtrace_difv_t);

				tlocals = kmem_zalloc(nsz, KM_SLEEP);

				if (osz != 0) {
					bcopy(vstate->dtvs_tlocals,
					    tlocals, osz);
					kmem_free(vstate->dtvs_tlocals, osz);
				}

				vstate->dtvs_tlocals = tlocals;
				vstate->dtvs_ntlocals = ntlocals;
			}

			vstate->dtvs_tlocals[id] = *v;
			continue;

		case DIFV_SCOPE_LOCAL:
			np = &vstate->dtvs_nlocals;
			svarp = &vstate->dtvs_locals;

			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
				dsize = NCPU * (v->dtdv_type.dtdt_size +
				    sizeof (uint64_t));
			else
				dsize = NCPU * sizeof (uint64_t);

			break;

		case DIFV_SCOPE_GLOBAL:
			np = &vstate->dtvs_nglobals;
			svarp = &vstate->dtvs_globals;

			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
				dsize = v->dtdv_type.dtdt_size +
				    sizeof (uint64_t);

			break;

		default:
			ASSERT(0);
		}

		while (id >= (oldsvars = *np)) {
			dtrace_statvar_t **statics;
			int newsvars, oldsize, newsize;

			if ((newsvars = (oldsvars << 1)) == 0)
				newsvars = 1;

			oldsize = oldsvars * sizeof (dtrace_statvar_t *);
			newsize = newsvars * sizeof (dtrace_statvar_t *);

			statics = kmem_zalloc(newsize, KM_SLEEP);

			if (oldsize != 0) {
				bcopy(*svarp, statics, oldsize);
				kmem_free(*svarp, oldsize);
			}

			*svarp = statics;
			*np = newsvars;
		}

		if ((svar = (*svarp)[id]) == NULL) {
			svar = kmem_zalloc(sizeof (dtrace_statvar_t), KM_SLEEP);
			svar->dtsv_var = *v;

			if ((svar->dtsv_size = dsize) != 0) {
				svar->dtsv_data = (uint64_t)(uintptr_t)
				    kmem_zalloc(dsize, KM_SLEEP);
			}

			(*svarp)[id] = svar;
		}

		svar->dtsv_refcnt++;
	}

	dtrace_difo_chunksize(dp, vstate);
	dtrace_difo_hold(dp);
}

static dtrace_difo_t *
dtrace_difo_duplicate(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
{
	dtrace_difo_t *new;
	size_t sz;

	ASSERT(dp->dtdo_buf != NULL);
	ASSERT(dp->dtdo_refcnt != 0);

	new = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);

	ASSERT(dp->dtdo_buf != NULL);
	sz = dp->dtdo_len * sizeof (dif_instr_t);
	new->dtdo_buf = kmem_alloc(sz, KM_SLEEP);
	bcopy(dp->dtdo_buf, new->dtdo_buf, sz);
	new->dtdo_len = dp->dtdo_len;

	if (dp->dtdo_strtab != NULL) {
		ASSERT(dp->dtdo_strlen != 0);
		new->dtdo_strtab = kmem_alloc(dp->dtdo_strlen, KM_SLEEP);
		bcopy(dp->dtdo_strtab, new->dtdo_strtab, dp->dtdo_strlen);
		new->dtdo_strlen = dp->dtdo_strlen;
	}

	if (dp->dtdo_inttab != NULL) {
		ASSERT(dp->dtdo_intlen != 0);
		sz = dp->dtdo_intlen * sizeof (uint64_t);
		new->dtdo_inttab = kmem_alloc(sz, KM_SLEEP);
		bcopy(dp->dtdo_inttab, new->dtdo_inttab, sz);
		new->dtdo_intlen = dp->dtdo_intlen;
	}

	if (dp->dtdo_vartab != NULL) {
		ASSERT(dp->dtdo_varlen != 0);
		sz = dp->dtdo_varlen * sizeof (dtrace_difv_t);
		new->dtdo_vartab = kmem_alloc(sz, KM_SLEEP);
		bcopy(dp->dtdo_vartab, new->dtdo_vartab, sz);
		new->dtdo_varlen = dp->dtdo_varlen;
	}

	dtrace_difo_init(new, vstate);
	return (new);
}

static void
dtrace_difo_destroy(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
{
	int i;

	ASSERT(dp->dtdo_refcnt == 0);

	for (i = 0; i < dp->dtdo_varlen; i++) {
		dtrace_difv_t *v = &dp->dtdo_vartab[i];
		dtrace_statvar_t *svar, **svarp;
		uint_t id;
		uint8_t scope = v->dtdv_scope;
		int *np;

		switch (scope) {
		case DIFV_SCOPE_THREAD:
			continue;

		case DIFV_SCOPE_LOCAL:
			np = &vstate->dtvs_nlocals;
			svarp = vstate->dtvs_locals;
			break;

		case DIFV_SCOPE_GLOBAL:
			np = &vstate->dtvs_nglobals;
			svarp = vstate->dtvs_globals;
			break;

		default:
			ASSERT(0);
		}

		if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
			continue;

		id -= DIF_VAR_OTHER_UBASE;
		ASSERT(id < *np);

		svar = svarp[id];
		ASSERT(svar != NULL);
		ASSERT(svar->dtsv_refcnt > 0);

		if (--svar->dtsv_refcnt > 0)
			continue;

		if (svar->dtsv_size != 0) {
			ASSERT(svar->dtsv_data != NULL);
			kmem_free((void *)(uintptr_t)svar->dtsv_data,
			    svar->dtsv_size);
		}

		kmem_free(svar, sizeof (dtrace_statvar_t));
		svarp[id] = NULL;
	}

	kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
	kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
	kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
	kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));

	kmem_free(dp, sizeof (dtrace_difo_t));
}

static void
dtrace_difo_release(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
{
	int i;

	ASSERT(MUTEX_HELD(&dtrace_lock));
	ASSERT(dp->dtdo_refcnt != 0);

	for (i = 0; i < dp->dtdo_varlen; i++) {
		dtrace_difv_t *v = &dp->dtdo_vartab[i];

		if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
			continue;

		ASSERT(dtrace_vtime_references > 0);
		if (--dtrace_vtime_references == 0)
			dtrace_vtime_disable();
	}

	if (--dp->dtdo_refcnt == 0)
		dtrace_difo_destroy(dp, vstate);
}

/*
 * DTrace Format Functions
 */
static uint16_t
dtrace_format_add(dtrace_state_t *state, char *str)
{
	char *fmt, **new;
	uint16_t ndx, len = strlen(str) + 1;

	fmt = kmem_zalloc(len, KM_SLEEP);
	bcopy(str, fmt, len);

	for (ndx = 0; ndx < state->dts_nformats; ndx++) {
		if (state->dts_formats[ndx] == NULL) {
			state->dts_formats[ndx] = fmt;
			return (ndx + 1);
		}
	}

	if (state->dts_nformats == USHRT_MAX) {
		/*
		 * This is only likely if a denial-of-service attack is being
		 * attempted.  As such, it's okay to fail silently here.
		 */
		kmem_free(fmt, len);
		return (0);
	}

	/*
	 * For simplicity, we always resize the formats array to be exactly the
	 * number of formats.
	 */
	ndx = state->dts_nformats++;
	new = kmem_alloc((ndx + 1) * sizeof (char *), KM_SLEEP);

	if (state->dts_formats != NULL) {
		ASSERT(ndx != 0);
		bcopy(state->dts_formats, new, ndx * sizeof (char *));
		kmem_free(state->dts_formats, ndx * sizeof (char *));
	}

	state->dts_formats = new;
	state->dts_formats[ndx] = fmt;

	return (ndx + 1);
}
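
/*
 * Format indices handed out above are one-based, with zero reserved to
 * indicate failure -- a minimal usage sketch:
 *
 *	uint16_t fmt = dtrace_format_add(state, str);
 *
 *	if (fmt != 0)
 *		dtrace_format_remove(state, fmt);
 */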

static void
dtrace_format_remove(dtrace_state_t *state, uint16_t format)
{
	char *fmt;

	ASSERT(state->dts_formats != NULL);
	ASSERT(format <= state->dts_nformats);
	ASSERT(state->dts_formats[format - 1] != NULL);

	fmt = state->dts_formats[format - 1];
	kmem_free(fmt, strlen(fmt) + 1);
	state->dts_formats[format - 1] = NULL;
}

static void
dtrace_format_destroy(dtrace_state_t *state)
{
	int i;

	if (state->dts_nformats == 0) {
		ASSERT(state->dts_formats == NULL);
		return;
	}

	ASSERT(state->dts_formats != NULL);

	for (i = 0; i < state->dts_nformats; i++) {
		char *fmt = state->dts_formats[i];

		if (fmt == NULL)
			continue;

		kmem_free(fmt, strlen(fmt) + 1);
	}

	kmem_free(state->dts_formats, state->dts_nformats * sizeof (char *));
	state->dts_nformats = 0;
	state->dts_formats = NULL;
}

/*
 * DTrace Predicate Functions
 */
static dtrace_predicate_t *
dtrace_predicate_create(dtrace_difo_t *dp)
{
	dtrace_predicate_t *pred;

	ASSERT(MUTEX_HELD(&dtrace_lock));
	ASSERT(dp->dtdo_refcnt != 0);

	pred = kmem_zalloc(sizeof (dtrace_predicate_t), KM_SLEEP);
	pred->dtp_difo = dp;
	pred->dtp_refcnt = 1;

	if (!dtrace_difo_cacheable(dp))
		return (pred);

	if (dtrace_predcache_id == DTRACE_CACHEIDNONE) {
		/*
		 * This is only theoretically possible -- we have had 2^32
		 * cacheable predicates on this machine.  We cannot allow any
		 * more predicates to become cacheable:  as unlikely as it is,
		 * there may be a thread caching a (now stale) predicate cache
		 * ID. (N.B.: the temptation is being successfully resisted to
		 * have this cmn_err() "Holy shit -- we executed this code!")
		 */
		return (pred);
	}

	pred->dtp_cacheid = dtrace_predcache_id++;

	return (pred);
}

static void
dtrace_predicate_hold(dtrace_predicate_t *pred)
{
	ASSERT(MUTEX_HELD(&dtrace_lock));
	ASSERT(pred->dtp_difo != NULL && pred->dtp_difo->dtdo_refcnt != 0);
	ASSERT(pred->dtp_refcnt > 0);

	pred->dtp_refcnt++;
}

static void
dtrace_predicate_release(dtrace_predicate_t *pred, dtrace_vstate_t *vstate)
{
	dtrace_difo_t *dp = pred->dtp_difo;

	ASSERT(MUTEX_HELD(&dtrace_lock));
	ASSERT(dp != NULL && dp->dtdo_refcnt != 0);
	ASSERT(pred->dtp_refcnt > 0);

	if (--pred->dtp_refcnt == 0) {
		dtrace_difo_release(pred->dtp_difo, vstate);
		kmem_free(pred, sizeof (dtrace_predicate_t));
	}
}

/*
 * DTrace Action Description Functions
 */
static dtrace_actdesc_t *
dtrace_actdesc_create(dtrace_actkind_t kind, uint32_t ntuple,
    uint64_t uarg, uint64_t arg)
{
	dtrace_actdesc_t *act;

8981	    arg >= KERNELBASE) || (arg == NULL && kind == DTRACEACT_PRINTA));
8982
	act = kmem_zalloc(sizeof (dtrace_actdesc_t), KM_SLEEP);
	act->dtad_kind = kind;
	act->dtad_ntuple = ntuple;
	act->dtad_uarg = uarg;
	act->dtad_arg = arg;
	act->dtad_refcnt = 1;

	return (act);
}

static void
dtrace_actdesc_hold(dtrace_actdesc_t *act)
{
	ASSERT(act->dtad_refcnt >= 1);
	act->dtad_refcnt++;
}

static void
dtrace_actdesc_release(dtrace_actdesc_t *act, dtrace_vstate_t *vstate)
{
	dtrace_actkind_t kind = act->dtad_kind;
	dtrace_difo_t *dp;

	ASSERT(act->dtad_refcnt >= 1);

	if (--act->dtad_refcnt != 0)
		return;

	if ((dp = act->dtad_difo) != NULL)
		dtrace_difo_release(dp, vstate);

	if (DTRACEACT_ISPRINTFLIKE(kind)) {
		char *str = (char *)(uintptr_t)act->dtad_arg;

		ASSERT((str != NULL && (uintptr_t)str >= KERNELBASE) ||
		    (str == NULL && act->dtad_kind == DTRACEACT_PRINTA));

		if (str != NULL)
			kmem_free(str, strlen(str) + 1);
	}

	kmem_free(act, sizeof (dtrace_actdesc_t));
}

/*
 * DTrace ECB Functions
 */
static dtrace_ecb_t *
dtrace_ecb_add(dtrace_state_t *state, dtrace_probe_t *probe)
{
	dtrace_ecb_t *ecb;
	dtrace_epid_t epid;

	ASSERT(MUTEX_HELD(&dtrace_lock));

	ecb = kmem_zalloc(sizeof (dtrace_ecb_t), KM_SLEEP);
	ecb->dte_predicate = NULL;
	ecb->dte_probe = probe;

	/*
	 * The default size is the size of the default action: recording
	 * the epid.
	 */
	ecb->dte_size = ecb->dte_needed = sizeof (dtrace_epid_t);
	ecb->dte_alignment = sizeof (dtrace_epid_t);

	epid = state->dts_epid++;

	if (epid - 1 >= state->dts_necbs) {
		dtrace_ecb_t **oecbs = state->dts_ecbs, **ecbs;
		int necbs = state->dts_necbs << 1;

		ASSERT(epid == state->dts_necbs + 1);

		if (necbs == 0) {
			ASSERT(oecbs == NULL);
			necbs = 1;
		}

		ecbs = kmem_zalloc(necbs * sizeof (*ecbs), KM_SLEEP);

		if (oecbs != NULL)
			bcopy(oecbs, ecbs, state->dts_necbs * sizeof (*ecbs));

		dtrace_membar_producer();
		state->dts_ecbs = ecbs;

		if (oecbs != NULL) {
			/*
			 * If this state is active, we must dtrace_sync()
			 * before we can free the old dts_ecbs array:  we're
			 * coming in hot, and there may be active ring
			 * buffer processing (which indexes into the dts_ecbs
			 * array) on another CPU.
			 */
			if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
				dtrace_sync();

			kmem_free(oecbs, state->dts_necbs * sizeof (*ecbs));
		}

		dtrace_membar_producer();
		state->dts_necbs = necbs;
	}

	ecb->dte_state = state;

	ASSERT(state->dts_ecbs[epid - 1] == NULL);
	dtrace_membar_producer();
	state->dts_ecbs[(ecb->dte_epid = epid) - 1] = ecb;

	return (ecb);
}

static void
dtrace_ecb_enable(dtrace_ecb_t *ecb)
{
	dtrace_probe_t *probe = ecb->dte_probe;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(MUTEX_HELD(&dtrace_lock));
	ASSERT(ecb->dte_next == NULL);

	if (probe == NULL) {
		/*
		 * This is the NULL probe -- there's nothing to do.
		 */
		return;
	}

	if (probe->dtpr_ecb == NULL) {
		dtrace_provider_t *prov = probe->dtpr_provider;

		/*
		 * We're the first ECB on this probe.
		 */
		probe->dtpr_ecb = probe->dtpr_ecb_last = ecb;

		if (ecb->dte_predicate != NULL)
			probe->dtpr_predcache = ecb->dte_predicate->dtp_cacheid;

		prov->dtpv_pops.dtps_enable(prov->dtpv_arg,
		    probe->dtpr_id, probe->dtpr_arg);
	} else {
		/*
		 * This probe is already active.  Swing the last pointer to
		 * point to the new ECB, and issue a dtrace_sync() to assure
		 * that all CPUs have seen the change.
		 */
		ASSERT(probe->dtpr_ecb_last != NULL);
		probe->dtpr_ecb_last->dte_next = ecb;
		probe->dtpr_ecb_last = ecb;
		probe->dtpr_predcache = 0;

		dtrace_sync();
	}
}

static void
dtrace_ecb_resize(dtrace_ecb_t *ecb)
{
	uint32_t maxalign = sizeof (dtrace_epid_t);
	uint32_t align = sizeof (uint8_t), offs, diff;
	dtrace_action_t *act;
	int wastuple = 0;
	uint32_t aggbase = UINT32_MAX;
	dtrace_state_t *state = ecb->dte_state;

	/*
	 * If we record anything, we always record the epid.  (And we always
	 * record it first.)
	 */
	offs = sizeof (dtrace_epid_t);
	ecb->dte_size = ecb->dte_needed = sizeof (dtrace_epid_t);

	for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
		dtrace_recdesc_t *rec = &act->dta_rec;

		if ((align = rec->dtrd_alignment) > maxalign)
			maxalign = align;

		if (!wastuple && act->dta_intuple) {
			/*
			 * This is the first record in a tuple.  Align the
			 * offset to be at offset 4 in an 8-byte aligned
			 * block.
			 */
			diff = offs + sizeof (dtrace_aggid_t);

			if (diff = (diff & (sizeof (uint64_t) - 1)))
				offs += sizeof (uint64_t) - diff;

			aggbase = offs - sizeof (dtrace_aggid_t);
			ASSERT(!(aggbase & (sizeof (uint64_t) - 1)));
		}

		/*LINTED*/
		if (rec->dtrd_size != 0 && (diff = (offs & (align - 1)))) {
			/*
			 * The current offset is not properly aligned; align it.
			 */
			offs += align - diff;
		}

		rec->dtrd_offset = offs;

		if (offs + rec->dtrd_size > ecb->dte_needed) {
			ecb->dte_needed = offs + rec->dtrd_size;

			if (ecb->dte_needed > state->dts_needed)
				state->dts_needed = ecb->dte_needed;
		}

		if (DTRACEACT_ISAGG(act->dta_kind)) {
			dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
			dtrace_action_t *first = agg->dtag_first, *prev;

			ASSERT(rec->dtrd_size != 0 && first != NULL);
			ASSERT(wastuple);
			ASSERT(aggbase != UINT32_MAX);

			agg->dtag_base = aggbase;

			while ((prev = first->dta_prev) != NULL &&
			    DTRACEACT_ISAGG(prev->dta_kind)) {
				agg = (dtrace_aggregation_t *)prev;
				first = agg->dtag_first;
			}

			if (prev != NULL) {
				offs = prev->dta_rec.dtrd_offset +
				    prev->dta_rec.dtrd_size;
			} else {
				offs = sizeof (dtrace_epid_t);
			}
			wastuple = 0;
		} else {
			if (!act->dta_intuple)
				ecb->dte_size = offs + rec->dtrd_size;

			offs += rec->dtrd_size;
		}

		wastuple = act->dta_intuple;
	}

	if ((act = ecb->dte_action) != NULL &&
	    !(act->dta_kind == DTRACEACT_SPECULATE && act->dta_next == NULL) &&
	    ecb->dte_size == sizeof (dtrace_epid_t)) {
		/*
		 * If the size is still sizeof (dtrace_epid_t), then all
		 * actions store no data; set the size to 0.
		 */
		ecb->dte_alignment = maxalign;
		ecb->dte_size = 0;

		/*
		 * If the needed space is still sizeof (dtrace_epid_t), then
		 * all actions need no additional space; set the needed
		 * size to 0.
		 */
		if (ecb->dte_needed == sizeof (dtrace_epid_t))
			ecb->dte_needed = 0;

		return;
	}

	/*
	 * Set our alignment, and make sure that the dte_size and dte_needed
	 * are aligned to the size of an EPID.
	 */
	ecb->dte_alignment = maxalign;
	ecb->dte_size = (ecb->dte_size + (sizeof (dtrace_epid_t) - 1)) &
	    ~(sizeof (dtrace_epid_t) - 1);
	ecb->dte_needed = (ecb->dte_needed + (sizeof (dtrace_epid_t) - 1)) &
	    ~(sizeof (dtrace_epid_t) - 1);
	ASSERT(ecb->dte_size <= ecb->dte_needed);
}
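
/*
 * The rounding at the bottom of dtrace_ecb_resize() is the usual
 * power-of-two trick: (size + (align - 1)) & ~(align - 1).  With a 4-byte
 * dtrace_epid_t, a 13-byte payload is thus padded out to 16 bytes so that
 * the next record begins on an EPID-aligned boundary.
 */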

static dtrace_action_t *
dtrace_ecb_aggregation_create(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
{
	dtrace_aggregation_t *agg;
	size_t size = sizeof (uint64_t);
	int ntuple = desc->dtad_ntuple;
	dtrace_action_t *act;
	dtrace_recdesc_t *frec;
	dtrace_aggid_t aggid;
	dtrace_state_t *state = ecb->dte_state;

	agg = kmem_zalloc(sizeof (dtrace_aggregation_t), KM_SLEEP);
	agg->dtag_ecb = ecb;

	ASSERT(DTRACEACT_ISAGG(desc->dtad_kind));

	switch (desc->dtad_kind) {
	case DTRACEAGG_MIN:
		agg->dtag_initial = INT64_MAX;
		agg->dtag_aggregate = dtrace_aggregate_min;
		break;

	case DTRACEAGG_MAX:
		agg->dtag_initial = INT64_MIN;
		agg->dtag_aggregate = dtrace_aggregate_max;
		break;

	case DTRACEAGG_COUNT:
		agg->dtag_aggregate = dtrace_aggregate_count;
		break;

	case DTRACEAGG_QUANTIZE:
		agg->dtag_aggregate = dtrace_aggregate_quantize;
		size = (((sizeof (uint64_t) * NBBY) - 1) * 2 + 1) *
		    sizeof (uint64_t);
		break;

	case DTRACEAGG_LQUANTIZE: {
		uint16_t step = DTRACE_LQUANTIZE_STEP(desc->dtad_arg);
		uint16_t levels = DTRACE_LQUANTIZE_LEVELS(desc->dtad_arg);

		agg->dtag_initial = desc->dtad_arg;
		agg->dtag_aggregate = dtrace_aggregate_lquantize;

		if (step == 0 || levels == 0)
			goto err;

		size = levels * sizeof (uint64_t) + 3 * sizeof (uint64_t);
		break;
	}

	case DTRACEAGG_AVG:
		agg->dtag_aggregate = dtrace_aggregate_avg;
		size = sizeof (uint64_t) * 2;
		break;

	case DTRACEAGG_STDDEV:
		agg->dtag_aggregate = dtrace_aggregate_stddev;
		size = sizeof (uint64_t) * 4;
		break;

	case DTRACEAGG_SUM:
		agg->dtag_aggregate = dtrace_aggregate_sum;
		break;

	default:
		goto err;
	}

	agg->dtag_action.dta_rec.dtrd_size = size;

	if (ntuple == 0)
		goto err;

	/*
	 * We must make sure that we have enough actions for the n-tuple.
	 */
	for (act = ecb->dte_action_last; act != NULL; act = act->dta_prev) {
		if (DTRACEACT_ISAGG(act->dta_kind))
			break;

		if (--ntuple == 0) {
			/*
			 * This is the action with which our n-tuple begins.
			 */
			agg->dtag_first = act;
			goto success;
		}
	}

	/*
	 * This n-tuple is short by ntuple elements.  Return failure.
	 */
	ASSERT(ntuple != 0);
err:
	kmem_free(agg, sizeof (dtrace_aggregation_t));
	return (NULL);

success:
	/*
	 * If the last action in the tuple has a size of zero, it's actually
	 * an expression argument for the aggregating action.
	 */
	ASSERT(ecb->dte_action_last != NULL);
	act = ecb->dte_action_last;

	if (act->dta_kind == DTRACEACT_DIFEXPR) {
		ASSERT(act->dta_difo != NULL);

		if (act->dta_difo->dtdo_rtype.dtdt_size == 0)
			agg->dtag_hasarg = 1;
	}

	/*
	 * We need to allocate an id for this aggregation.
	 */
	aggid = (dtrace_aggid_t)(uintptr_t)vmem_alloc(state->dts_aggid_arena, 1,
	    VM_BESTFIT | VM_SLEEP);

	if (aggid - 1 >= state->dts_naggregations) {
		dtrace_aggregation_t **oaggs = state->dts_aggregations;
		dtrace_aggregation_t **aggs;
		int naggs = state->dts_naggregations << 1;
		int onaggs = state->dts_naggregations;

		ASSERT(aggid == state->dts_naggregations + 1);

		if (naggs == 0) {
			ASSERT(oaggs == NULL);
			naggs = 1;
		}

		aggs = kmem_zalloc(naggs * sizeof (*aggs), KM_SLEEP);

		if (oaggs != NULL) {
			bcopy(oaggs, aggs, onaggs * sizeof (*aggs));
			kmem_free(oaggs, onaggs * sizeof (*aggs));
		}

		state->dts_aggregations = aggs;
		state->dts_naggregations = naggs;
	}

	ASSERT(state->dts_aggregations[aggid - 1] == NULL);
	state->dts_aggregations[(agg->dtag_id = aggid) - 1] = agg;

	frec = &agg->dtag_first->dta_rec;
	if (frec->dtrd_alignment < sizeof (dtrace_aggid_t))
		frec->dtrd_alignment = sizeof (dtrace_aggid_t);

	for (act = agg->dtag_first; act != NULL; act = act->dta_next) {
		ASSERT(!act->dta_intuple);
		act->dta_intuple = 1;
	}

	return (&agg->dtag_action);
}
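
/*
 * To make the DTRACEAGG_QUANTIZE sizing above concrete: the data size works
 * out to (((sizeof (uint64_t) * NBBY) - 1) * 2 + 1) * sizeof (uint64_t) --
 * one bucket for zero, 63 for the positive power-of-two ranges and 63 for
 * the negative ones, or 127 64-bit counters in all.
 */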

static void
dtrace_ecb_aggregation_destroy(dtrace_ecb_t *ecb, dtrace_action_t *act)
{
	dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
	dtrace_state_t *state = ecb->dte_state;
	dtrace_aggid_t aggid = agg->dtag_id;

	ASSERT(DTRACEACT_ISAGG(act->dta_kind));
	vmem_free(state->dts_aggid_arena, (void *)(uintptr_t)aggid, 1);

	ASSERT(state->dts_aggregations[aggid - 1] == agg);
	state->dts_aggregations[aggid - 1] = NULL;

	kmem_free(agg, sizeof (dtrace_aggregation_t));
}

static int
dtrace_ecb_action_add(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
{
	dtrace_action_t *action, *last;
	dtrace_difo_t *dp = desc->dtad_difo;
	uint32_t size = 0, align = sizeof (uint8_t), mask;
	uint16_t format = 0;
	dtrace_recdesc_t *rec;
	dtrace_state_t *state = ecb->dte_state;
	dtrace_optval_t *opt = state->dts_options, nframes, strsize;
	uint64_t arg = desc->dtad_arg;

	ASSERT(MUTEX_HELD(&dtrace_lock));
	ASSERT(ecb->dte_action == NULL || ecb->dte_action->dta_refcnt == 1);

	if (DTRACEACT_ISAGG(desc->dtad_kind)) {
		/*
		 * If this is an aggregating action, there must be neither
		 * a speculate nor a commit on the action chain.
		 */
		dtrace_action_t *act;

		for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
			if (act->dta_kind == DTRACEACT_COMMIT)
				return (EINVAL);

			if (act->dta_kind == DTRACEACT_SPECULATE)
				return (EINVAL);
		}

		action = dtrace_ecb_aggregation_create(ecb, desc);

		if (action == NULL)
			return (EINVAL);
	} else {
		if (DTRACEACT_ISDESTRUCTIVE(desc->dtad_kind) ||
		    (desc->dtad_kind == DTRACEACT_DIFEXPR &&
		    dp != NULL && dp->dtdo_destructive)) {
			state->dts_destructive = 1;
		}

		switch (desc->dtad_kind) {
		case DTRACEACT_PRINTF:
		case DTRACEACT_PRINTA:
		case DTRACEACT_SYSTEM:
		case DTRACEACT_FREOPEN:
			/*
			 * We know that our arg is a string -- turn it into a
			 * format.
			 */
9486			if (arg == 0) {
9487				ASSERT(desc->dtad_kind == DTRACEACT_PRINTA);
9488				format = 0;
9489			} else {
9490				ASSERT(arg != 0);
9491				ASSERT(arg > KERNELBASE);
9492				format = dtrace_format_add(state,
9493				    (char *)(uintptr_t)arg);
9494			}
9495
9496			/*FALLTHROUGH*/
9497		case DTRACEACT_LIBACT:
9498		case DTRACEACT_DIFEXPR:
9499			if (dp == NULL)
9500				return (EINVAL);
9501
9502			if ((size = dp->dtdo_rtype.dtdt_size) != 0)
9503				break;
9504
9505			if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) {
9506				if (!(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
9507					return (EINVAL);
9508
9509				size = opt[DTRACEOPT_STRSIZE];
9510			}
9511
9512			break;
9513
9514		case DTRACEACT_STACK:
9515			if ((nframes = arg) == 0) {
9516				nframes = opt[DTRACEOPT_STACKFRAMES];
9517				ASSERT(nframes > 0);
9518				arg = nframes;
9519			}
9520
9521			size = nframes * sizeof (pc_t);
9522			break;
9523
9524		case DTRACEACT_JSTACK:
9525			if ((strsize = DTRACE_USTACK_STRSIZE(arg)) == 0)
9526				strsize = opt[DTRACEOPT_JSTACKSTRSIZE];
9527
9528			if ((nframes = DTRACE_USTACK_NFRAMES(arg)) == 0)
9529				nframes = opt[DTRACEOPT_JSTACKFRAMES];
9530
9531			arg = DTRACE_USTACK_ARG(nframes, strsize);
9532
9533			/*FALLTHROUGH*/
9534		case DTRACEACT_USTACK:
9535			if (desc->dtad_kind != DTRACEACT_JSTACK &&
9536			    (nframes = DTRACE_USTACK_NFRAMES(arg)) == 0) {
9537				strsize = DTRACE_USTACK_STRSIZE(arg);
9538				nframes = opt[DTRACEOPT_USTACKFRAMES];
9539				ASSERT(nframes > 0);
9540				arg = DTRACE_USTACK_ARG(nframes, strsize);
9541			}
9542
9543			/*
9544			 * Save a slot for the pid.
9545			 */
9546			size = (nframes + 1) * sizeof (uint64_t);
9547			size += DTRACE_USTACK_STRSIZE(arg);
9548			size = P2ROUNDUP(size, (uint32_t)(sizeof (uintptr_t)));
9549
9550			break;
9551
9552		case DTRACEACT_SYM:
9553		case DTRACEACT_MOD:
9554			if (dp == NULL || ((size = dp->dtdo_rtype.dtdt_size) !=
9555			    sizeof (uint64_t)) ||
9556			    (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
9557				return (EINVAL);
9558			break;
9559
9560		case DTRACEACT_USYM:
9561		case DTRACEACT_UMOD:
9562		case DTRACEACT_UADDR:
9563			if (dp == NULL ||
9564			    (dp->dtdo_rtype.dtdt_size != sizeof (uint64_t)) ||
9565			    (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
9566				return (EINVAL);
9567
9568			/*
9569			 * We have a slot for the pid, plus a slot for the
9570			 * argument.  To keep things simple (aligned with
9571			 * bitness-neutral sizing), we store each as a 64-bit
9572			 * quantity.
9573			 */
9574			size = 2 * sizeof (uint64_t);
9575			break;
9576
9577		case DTRACEACT_STOP:
9578		case DTRACEACT_BREAKPOINT:
9579		case DTRACEACT_PANIC:
9580			break;
9581
9582		case DTRACEACT_CHILL:
9583		case DTRACEACT_DISCARD:
9584		case DTRACEACT_RAISE:
9585			if (dp == NULL)
9586				return (EINVAL);
9587			break;
9588
9589		case DTRACEACT_EXIT:
9590			if (dp == NULL ||
9591			    (size = dp->dtdo_rtype.dtdt_size) != sizeof (int) ||
9592			    (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
9593				return (EINVAL);
9594			break;
9595
9596		case DTRACEACT_SPECULATE:
9597			if (ecb->dte_size > sizeof (dtrace_epid_t))
9598				return (EINVAL);
9599
9600			if (dp == NULL)
9601				return (EINVAL);
9602
9603			state->dts_speculates = 1;
9604			break;
9605
9606		case DTRACEACT_COMMIT: {
9607			dtrace_action_t *act = ecb->dte_action;
9608
9609			for (; act != NULL; act = act->dta_next) {
9610				if (act->dta_kind == DTRACEACT_COMMIT)
9611					return (EINVAL);
9612			}
9613
9614			if (dp == NULL)
9615				return (EINVAL);
9616			break;
9617		}
9618
9619		default:
9620			return (EINVAL);
9621		}
9622
9623		if (size != 0 || desc->dtad_kind == DTRACEACT_SPECULATE) {
9624			/*
9625			 * If this is a data-storing action or a speculate,
9626			 * we must be sure that there isn't a commit on the
9627			 * action chain.
9628			 */
9629			dtrace_action_t *act = ecb->dte_action;
9630
9631			for (; act != NULL; act = act->dta_next) {
9632				if (act->dta_kind == DTRACEACT_COMMIT)
9633					return (EINVAL);
9634			}
9635		}
9636
9637		action = kmem_zalloc(sizeof (dtrace_action_t), KM_SLEEP);
9638		action->dta_rec.dtrd_size = size;
9639	}
9640
9641	action->dta_refcnt = 1;
9642	rec = &action->dta_rec;
9643	size = rec->dtrd_size;
9644
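	/*
	 * Derive the record's alignment from its size:  we choose the
	 * largest power of two (capped at the eight-byte maximum) that
	 * evenly divides the size.  For example, a 16-byte record is
	 * eight-byte aligned (16 & 7 == 0), a 12-byte record is four-byte
	 * aligned (12 & 3 == 0), and a zero-length record simply retains
	 * the initial byte alignment.
	 */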
9645	for (mask = sizeof (uint64_t) - 1; size != 0 && mask > 0; mask >>= 1) {
9646		if (!(size & mask)) {
9647			align = mask + 1;
9648			break;
9649		}
9650	}
9651
9652	action->dta_kind = desc->dtad_kind;
9653
9654	if ((action->dta_difo = dp) != NULL)
9655		dtrace_difo_hold(dp);
9656
9657	rec->dtrd_action = action->dta_kind;
9658	rec->dtrd_arg = arg;
9659	rec->dtrd_uarg = desc->dtad_uarg;
9660	rec->dtrd_alignment = (uint16_t)align;
9661	rec->dtrd_format = format;
9662
9663	if ((last = ecb->dte_action_last) != NULL) {
9664		ASSERT(ecb->dte_action != NULL);
9665		action->dta_prev = last;
9666		last->dta_next = action;
9667	} else {
9668		ASSERT(ecb->dte_action == NULL);
9669		ecb->dte_action = action;
9670	}
9671
9672	ecb->dte_action_last = action;
9673
9674	return (0);
9675}
9676
9677static void
9678dtrace_ecb_action_remove(dtrace_ecb_t *ecb)
9679{
9680	dtrace_action_t *act = ecb->dte_action, *next;
9681	dtrace_vstate_t *vstate = &ecb->dte_state->dts_vstate;
9682	dtrace_difo_t *dp;
9683	uint16_t format;
9684
9685	if (act != NULL && act->dta_refcnt > 1) {
9686		ASSERT(act->dta_next == NULL || act->dta_next->dta_refcnt == 1);
9687		act->dta_refcnt--;
9688	} else {
9689		for (; act != NULL; act = next) {
9690			next = act->dta_next;
9691			ASSERT(next != NULL || act == ecb->dte_action_last);
9692			ASSERT(act->dta_refcnt == 1);
9693
9694			if ((format = act->dta_rec.dtrd_format) != 0)
9695				dtrace_format_remove(ecb->dte_state, format);
9696
9697			if ((dp = act->dta_difo) != NULL)
9698				dtrace_difo_release(dp, vstate);
9699
9700			if (DTRACEACT_ISAGG(act->dta_kind)) {
9701				dtrace_ecb_aggregation_destroy(ecb, act);
9702			} else {
9703				kmem_free(act, sizeof (dtrace_action_t));
9704			}
9705		}
9706	}
9707
9708	ecb->dte_action = NULL;
9709	ecb->dte_action_last = NULL;
9710	ecb->dte_size = sizeof (dtrace_epid_t);
9711}
9712
9713static void
9714dtrace_ecb_disable(dtrace_ecb_t *ecb)
9715{
9716	/*
9717	 * We disable the ECB by removing it from its probe.
9718	 */
9719	dtrace_ecb_t *pecb, *prev = NULL;
9720	dtrace_probe_t *probe = ecb->dte_probe;
9721
9722	ASSERT(MUTEX_HELD(&dtrace_lock));
9723
9724	if (probe == NULL) {
9725		/*
9726		 * This is the NULL probe; there is nothing to disable.
9727		 */
9728		return;
9729	}
9730
9731	for (pecb = probe->dtpr_ecb; pecb != NULL; pecb = pecb->dte_next) {
9732		if (pecb == ecb)
9733			break;
9734		prev = pecb;
9735	}
9736
9737	ASSERT(pecb != NULL);
9738
9739	if (prev == NULL) {
9740		probe->dtpr_ecb = ecb->dte_next;
9741	} else {
9742		prev->dte_next = ecb->dte_next;
9743	}
9744
9745	if (ecb == probe->dtpr_ecb_last) {
9746		ASSERT(ecb->dte_next == NULL);
9747		probe->dtpr_ecb_last = prev;
9748	}
9749
9750	/*
9751	 * The ECB has been disconnected from the probe; now sync to assure
9752	 * that all CPUs have seen the change before returning.
9753	 */
9754	dtrace_sync();
9755
9756	if (probe->dtpr_ecb == NULL) {
9757		/*
9758		 * That was the last ECB on the probe; clear the predicate
9759		 * cache ID for the probe, disable it and sync one more time
9760		 * to assure that we'll never hit it again.
9761		 */
9762		dtrace_provider_t *prov = probe->dtpr_provider;
9763
9764		ASSERT(ecb->dte_next == NULL);
9765		ASSERT(probe->dtpr_ecb_last == NULL);
9766		probe->dtpr_predcache = DTRACE_CACHEIDNONE;
9767		prov->dtpv_pops.dtps_disable(prov->dtpv_arg,
9768		    probe->dtpr_id, probe->dtpr_arg);
9769		dtrace_sync();
9770	} else {
9771		/*
9772		 * There is at least one ECB remaining on the probe.  If there
9773		 * is _exactly_ one, set the probe's predicate cache ID to be
9774		 * the predicate cache ID of the remaining ECB.
9775		 */
9776		ASSERT(probe->dtpr_ecb_last != NULL);
9777		ASSERT(probe->dtpr_predcache == DTRACE_CACHEIDNONE);
9778
9779		if (probe->dtpr_ecb == probe->dtpr_ecb_last) {
9780			dtrace_predicate_t *p = probe->dtpr_ecb->dte_predicate;
9781
9782			ASSERT(probe->dtpr_ecb->dte_next == NULL);
9783
9784			if (p != NULL)
9785				probe->dtpr_predcache = p->dtp_cacheid;
9786		}
9787
9788		ecb->dte_next = NULL;
9789	}
9790}
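
/*
 * Note the discipline above:  dtrace_ecb_disable() never frees the ECB --
 * it merely unlinks it and uses dtrace_sync() to wait until no CPU can be
 * executing dtrace_probe() with a stale view of the ECB chain.  Only after
 * that quiescence (in dtrace_ecb_destroy(), below) is it safe to actually
 * tear the ECB down.
 */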
9791
9792static void
9793dtrace_ecb_destroy(dtrace_ecb_t *ecb)
9794{
9795	dtrace_state_t *state = ecb->dte_state;
9796	dtrace_vstate_t *vstate = &state->dts_vstate;
9797	dtrace_predicate_t *pred;
9798	dtrace_epid_t epid = ecb->dte_epid;
9799
9800	ASSERT(MUTEX_HELD(&dtrace_lock));
9801	ASSERT(ecb->dte_next == NULL);
9802	ASSERT(ecb->dte_probe == NULL || ecb->dte_probe->dtpr_ecb != ecb);
9803
9804	if ((pred = ecb->dte_predicate) != NULL)
9805		dtrace_predicate_release(pred, vstate);
9806
9807	dtrace_ecb_action_remove(ecb);
9808
9809	ASSERT(state->dts_ecbs[epid - 1] == ecb);
9810	state->dts_ecbs[epid - 1] = NULL;
9811
9812	kmem_free(ecb, sizeof (dtrace_ecb_t));
9813}
9814
9815static dtrace_ecb_t *
9816dtrace_ecb_create(dtrace_state_t *state, dtrace_probe_t *probe,
9817    dtrace_enabling_t *enab)
9818{
9819	dtrace_ecb_t *ecb;
9820	dtrace_predicate_t *pred;
9821	dtrace_actdesc_t *act;
9822	dtrace_provider_t *prov;
9823	dtrace_ecbdesc_t *desc = enab->dten_current;
9824
9825	ASSERT(MUTEX_HELD(&dtrace_lock));
9826	ASSERT(state != NULL);
9827
9828	ecb = dtrace_ecb_add(state, probe);
9829	ecb->dte_uarg = desc->dted_uarg;
9830
9831	if ((pred = desc->dted_pred.dtpdd_predicate) != NULL) {
9832		dtrace_predicate_hold(pred);
9833		ecb->dte_predicate = pred;
9834	}
9835
9836	if (probe != NULL) {
9837		/*
9838		 * If the provider shows more leg than the consumer is old
9839		 * enough to see, we need to enable the appropriate implicit
9840		 * predicate bits to prevent the ecb from activating at
9841		 * revealing times.
9842		 *
9843		 * Providers specifying DTRACE_PRIV_USER at register time
9844		 * are stating that they need the /proc-style privilege
9845		 * model to be enforced, and this is what DTRACE_COND_OWNER
9846		 * and DTRACE_COND_ZONEOWNER will then do at probe time.
9847		 */
9848		prov = probe->dtpr_provider;
9849		if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLPROC) &&
9850		    (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
9851			ecb->dte_cond |= DTRACE_COND_OWNER;
9852
9853		if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLZONE) &&
9854		    (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
9855			ecb->dte_cond |= DTRACE_COND_ZONEOWNER;
9856
9857		/*
9858		 * If the provider shows us kernel innards and the user
9859		 * is lacking sufficient privilege, enable the
9860		 * DTRACE_COND_USERMODE implicit predicate.
9861		 */
9862		if (!(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) &&
9863		    (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_KERNEL))
9864			ecb->dte_cond |= DTRACE_COND_USERMODE;
9865	}
9866
9867	if (dtrace_ecb_create_cache != NULL) {
9868		/*
9869		 * If we have a cached ecb, we'll use its action list instead
9870		 * of creating our own (saving both time and space).
9871		 */
9872		dtrace_ecb_t *cached = dtrace_ecb_create_cache;
9873		dtrace_action_t *act = cached->dte_action;
9874
9875		if (act != NULL) {
9876			ASSERT(act->dta_refcnt > 0);
9877			act->dta_refcnt++;
9878			ecb->dte_action = act;
9879			ecb->dte_action_last = cached->dte_action_last;
9880			ecb->dte_needed = cached->dte_needed;
9881			ecb->dte_size = cached->dte_size;
9882			ecb->dte_alignment = cached->dte_alignment;
9883		}
9884
9885		return (ecb);
9886	}
9887
9888	for (act = desc->dted_action; act != NULL; act = act->dtad_next) {
9889		if ((enab->dten_error = dtrace_ecb_action_add(ecb, act)) != 0) {
9890			dtrace_ecb_destroy(ecb);
9891			return (NULL);
9892		}
9893	}
9894
9895	dtrace_ecb_resize(ecb);
9896
9897	return (dtrace_ecb_create_cache = ecb);
9898}
9899
9900static int
9901dtrace_ecb_create_enable(dtrace_probe_t *probe, void *arg)
9902{
9903	dtrace_ecb_t *ecb;
9904	dtrace_enabling_t *enab = arg;
9905	dtrace_state_t *state = enab->dten_vstate->dtvs_state;
9906
9907	ASSERT(state != NULL);
9908
9909	if (probe != NULL && probe->dtpr_gen < enab->dten_probegen) {
9910		/*
9911		 * This probe was created in a generation for which this
9912		 * enabling has previously created ECBs; we don't want to
9913		 * enable it again, so just kick out.
9914		 */
9915		return (DTRACE_MATCH_NEXT);
9916	}
9917
9918	if ((ecb = dtrace_ecb_create(state, probe, enab)) == NULL)
9919		return (DTRACE_MATCH_DONE);
9920
9921	dtrace_ecb_enable(ecb);
9922	return (DTRACE_MATCH_NEXT);
9923}
9924
9925static dtrace_ecb_t *
9926dtrace_epid2ecb(dtrace_state_t *state, dtrace_epid_t id)
9927{
9928	dtrace_ecb_t *ecb;
9929
9930	ASSERT(MUTEX_HELD(&dtrace_lock));
9931
9932	if (id == 0 || id > state->dts_necbs)
9933		return (NULL);
9934
9935	ASSERT(state->dts_necbs > 0 && state->dts_ecbs != NULL);
9936	ASSERT((ecb = state->dts_ecbs[id - 1]) == NULL || ecb->dte_epid == id);
9937
9938	return (state->dts_ecbs[id - 1]);
9939}
9940
9941static dtrace_aggregation_t *
9942dtrace_aggid2agg(dtrace_state_t *state, dtrace_aggid_t id)
9943{
9944	dtrace_aggregation_t *agg;
9945
9946	ASSERT(MUTEX_HELD(&dtrace_lock));
9947
9948	if (id == 0 || id > state->dts_naggregations)
9949		return (NULL);
9950
9951	ASSERT(state->dts_naggregations > 0 && state->dts_aggregations != NULL);
9952	ASSERT((agg = state->dts_aggregations[id - 1]) == NULL ||
9953	    agg->dtag_id == id);
9954
9955	return (state->dts_aggregations[id - 1]);
9956}
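
/*
 * In both of the lookup functions above, ID zero is reserved as the "none"
 * value (cf. DTRACE_EPIDNONE, stored as buffer padding), so valid IDs are
 * one-based and the backing arrays are indexed by (id - 1).
 */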
9957
9958/*
9959 * DTrace Buffer Functions
9960 *
9961 * The following functions manipulate DTrace buffers.  Most of these functions
9962 * are called in the context of establishing or processing consumer state;
9963 * exceptions are explicitly noted.
9964 */
9965
9966/*
9967 * Note:  called from cross call context.  This function switches the two
9968 * buffers on a given CPU.  The atomicity of this operation is assured by
9969 * disabling interrupts while the actual switch takes place; the disabling of
9970 * interrupts serializes the execution with any execution of dtrace_probe() on
9971 * the same CPU.
9972 */
9973static void
9974dtrace_buffer_switch(dtrace_buffer_t *buf)
9975{
9976	caddr_t tomax = buf->dtb_tomax;
9977	caddr_t xamot = buf->dtb_xamot;
9978	dtrace_icookie_t cookie;
9979
9980	ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
9981	ASSERT(!(buf->dtb_flags & DTRACEBUF_RING));
9982
9983	cookie = dtrace_interrupt_disable();
9984	buf->dtb_tomax = xamot;
9985	buf->dtb_xamot = tomax;
9986	buf->dtb_xamot_drops = buf->dtb_drops;
9987	buf->dtb_xamot_offset = buf->dtb_offset;
9988	buf->dtb_xamot_errors = buf->dtb_errors;
9989	buf->dtb_xamot_flags = buf->dtb_flags;
9990	buf->dtb_offset = 0;
9991	buf->dtb_drops = 0;
9992	buf->dtb_errors = 0;
9993	buf->dtb_flags &= ~(DTRACEBUF_ERROR | DTRACEBUF_DROPPED);
9994	dtrace_interrupt_enable(cookie);
9995}
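
/*
 * As a sketch of the consumer side (found elsewhere in this file):  to
 * snapshot a switching buffer, the ioctl path directs the switch at the
 * owning CPU with a cross call, along the lines of:
 *
 *	dtrace_xcall(cpu, (dtrace_xcall_t)dtrace_buffer_switch, buf);
 *
 * which runs dtrace_buffer_switch() on that CPU with the buffer as its
 * argument.
 */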
9996
9997/*
9998 * Note:  called from cross call context.  This function activates a buffer
9999 * on a CPU.  As with dtrace_buffer_switch(), the atomicity of the operation
10000 * is guaranteed by the disabling of interrupts.
10001 */
10002static void
10003dtrace_buffer_activate(dtrace_state_t *state)
10004{
10005	dtrace_buffer_t *buf;
10006	dtrace_icookie_t cookie = dtrace_interrupt_disable();
10007
10008	buf = &state->dts_buffer[CPU->cpu_id];
10009
10010	if (buf->dtb_tomax != NULL) {
10011		/*
10012		 * We might like to assert that the buffer is marked inactive,
10013		 * but this isn't necessarily true:  the buffer for the CPU
10014		 * that processes the BEGIN probe is activated manually.  In
10015		 * this case, we take the (harmless) action of re-clearing
10016		 * the INACTIVE bit.
10017		 */
10018		buf->dtb_flags &= ~DTRACEBUF_INACTIVE;
10019	}
10020
10021	dtrace_interrupt_enable(cookie);
10022}
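
/*
 * Activation is likewise driven by cross call:  when tracing starts, the
 * framework broadcasts to all CPUs -- roughly
 *
 *	dtrace_xcall(DTRACE_CPUALL, (dtrace_xcall_t)dtrace_buffer_activate,
 *	    state);
 *
 * -- so that each CPU clears the INACTIVE bit on its own buffer.
 */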
10023
10024static int
10025dtrace_buffer_alloc(dtrace_buffer_t *bufs, size_t size, int flags,
10026    processorid_t cpu)
10027{
10028	cpu_t *cp;
10029	dtrace_buffer_t *buf;
10030
10031	ASSERT(MUTEX_HELD(&cpu_lock));
10032	ASSERT(MUTEX_HELD(&dtrace_lock));
10033
10034	if (size > dtrace_nonroot_maxsize &&
10035	    !PRIV_POLICY_CHOICE(CRED(), PRIV_ALL, B_FALSE))
10036		return (EFBIG);
10037
10038	cp = cpu_list;
10039
10040	do {
10041		if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
10042			continue;
10043
10044		buf = &bufs[cp->cpu_id];
10045
10046		/*
10047		 * If there is already a buffer allocated for this CPU, it
10048		 * is only possible that this is a DR event.  In this case,
10049		 * the buffer size must match our specified size.
10050		 */
10051		if (buf->dtb_tomax != NULL) {
10052			ASSERT(buf->dtb_size == size);
10053			continue;
10054		}
10055
10056		ASSERT(buf->dtb_xamot == NULL);
10057
10058		if ((buf->dtb_tomax = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
10059			goto err;
10060
10061		buf->dtb_size = size;
10062		buf->dtb_flags = flags;
10063		buf->dtb_offset = 0;
10064		buf->dtb_drops = 0;
10065
10066		if (flags & DTRACEBUF_NOSWITCH)
10067			continue;
10068
10069		if ((buf->dtb_xamot = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
10070			goto err;
10071	} while ((cp = cp->cpu_next) != cpu_list);
10072
10073	return (0);
10074
10075err:
10076	cp = cpu_list;
10077
10078	do {
10079		if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
10080			continue;
10081
10082		buf = &bufs[cp->cpu_id];
10083
10084		if (buf->dtb_xamot != NULL) {
10085			ASSERT(buf->dtb_tomax != NULL);
10086			ASSERT(buf->dtb_size == size);
10087			kmem_free(buf->dtb_xamot, size);
10088		}
10089
10090		if (buf->dtb_tomax != NULL) {
10091			ASSERT(buf->dtb_size == size);
10092			kmem_free(buf->dtb_tomax, size);
10093		}
10094
10095		buf->dtb_tomax = NULL;
10096		buf->dtb_xamot = NULL;
10097		buf->dtb_size = 0;
10098	} while ((cp = cp->cpu_next) != cpu_list);
10099
10100	return (ENOMEM);
10101}
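
/*
 * Note that the allocation above is all-or-nothing across the targeted
 * CPUs:  if any kmem_zalloc() fails, the err path walks the same CPU set,
 * frees whatever had been allocated, resets the per-CPU buffer state and
 * returns ENOMEM.
 */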
10102
10103/*
10104 * Note:  called from probe context.  This function just increments the drop
10105 * count on a buffer.  It has been made a function to allow for the
10106 * possibility of understanding the source of mysterious drop counts.  (A
10107 * problem for which one may be particularly disappointed that DTrace cannot
10108 * be used to understand DTrace.)
10109 */
10110static void
10111dtrace_buffer_drop(dtrace_buffer_t *buf)
10112{
10113	buf->dtb_drops++;
10114}
10115
10116/*
10117 * Note:  called from probe context.  This function is called to reserve space
10118 * in a buffer.  If mstate is non-NULL, sets the scratch base and size in the
10119 * mstate.  Returns the new offset in the buffer, or a negative value if an
10120 * error has occurred.
10121 */
10122static intptr_t
10123dtrace_buffer_reserve(dtrace_buffer_t *buf, size_t needed, size_t align,
10124    dtrace_state_t *state, dtrace_mstate_t *mstate)
10125{
10126	intptr_t offs = buf->dtb_offset, soffs;
10127	intptr_t woffs;
10128	caddr_t tomax;
10129	size_t total;
10130
10131	if (buf->dtb_flags & DTRACEBUF_INACTIVE)
10132		return (-1);
10133
10134	if ((tomax = buf->dtb_tomax) == NULL) {
10135		dtrace_buffer_drop(buf);
10136		return (-1);
10137	}
10138
10139	if (!(buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL))) {
10140		while (offs & (align - 1)) {
10141			/*
10142			 * Assert that our alignment is off by a number which
10143			 * is itself sizeof (uint32_t) aligned.
10144			 */
10145			ASSERT(!((align - (offs & (align - 1))) &
10146			    (sizeof (uint32_t) - 1)));
10147			DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
10148			offs += sizeof (uint32_t);
10149		}
10150
10151		if ((soffs = offs + needed) > buf->dtb_size) {
10152			dtrace_buffer_drop(buf);
10153			return (-1);
10154		}
10155
10156		if (mstate == NULL)
10157			return (offs);
10158
10159		mstate->dtms_scratch_base = (uintptr_t)tomax + soffs;
10160		mstate->dtms_scratch_size = buf->dtb_size - soffs;
10161		mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
10162
10163		return (offs);
10164	}
10165
10166	if (buf->dtb_flags & DTRACEBUF_FILL) {
10167		if (state->dts_activity != DTRACE_ACTIVITY_COOLDOWN &&
10168		    (buf->dtb_flags & DTRACEBUF_FULL))
10169			return (-1);
10170		goto out;
10171	}
10172
10173	total = needed + (offs & (align - 1));
10174
10175	/*
10176	 * For a ring buffer, life is quite a bit more complicated.  Before
10177	 * we can store any padding, we need to adjust our wrapping offset.
10178	 * (If we've never before wrapped or we're not about to, no adjustment
10179	 * is required.)
10180	 */
10181	if ((buf->dtb_flags & DTRACEBUF_WRAPPED) ||
10182	    offs + total > buf->dtb_size) {
10183		woffs = buf->dtb_xamot_offset;
10184
10185		if (offs + total > buf->dtb_size) {
10186			/*
10187			 * We can't fit in the end of the buffer.  First, a
10188			 * sanity check that we can fit in the buffer at all.
10189			 */
10190			if (total > buf->dtb_size) {
10191				dtrace_buffer_drop(buf);
10192				return (-1);
10193			}
10194
10195			/*
10196			 * We're going to be storing at the top of the buffer,
10197			 * so now we need to deal with the wrapped offset.  We
10198			 * only reset our wrapped offset to 0 if it is
10199			 * currently greater than the current offset.  If it
10200			 * is less than the current offset, it is because a
10201			 * previous allocation induced a wrap -- but the
10202			 * allocation didn't subsequently take the space due
10203			 * to an error or false predicate evaluation.  In this
10204			 * case, we'll just leave the wrapped offset alone: if
10205			 * the wrapped offset hasn't been advanced far enough
10206			 * for this allocation, it will be adjusted in the
10207			 * lower loop.
10208			 */
10209			if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
10210				if (woffs >= offs)
10211					woffs = 0;
10212			} else {
10213				woffs = 0;
10214			}
10215
10216			/*
10217			 * Now we know that we're going to be storing to the
10218			 * top of the buffer and that there is room for us
10219			 * there.  We need to clear the buffer from the current
10220			 * offset to the end (there may be old gunk there).
10221			 */
10222			while (offs < buf->dtb_size)
10223				tomax[offs++] = 0;
10224
10225			/*
10226			 * We need to set our offset to zero.  And because we
10227			 * are wrapping, we need to set the bit indicating as
10228			 * much.  We can also adjust our needed space back
10229			 * down to the space required by the ECB -- we know
10230			 * that the top of the buffer is aligned.
10231			 */
10232			offs = 0;
10233			total = needed;
10234			buf->dtb_flags |= DTRACEBUF_WRAPPED;
10235		} else {
10236			/*
10237			 * There is room for us in the buffer, so we simply
10238			 * need to check the wrapped offset.
10239			 */
10240			if (woffs < offs) {
10241				/*
10242				 * The wrapped offset is less than the offset.
10243				 * This can happen if we allocated buffer space
10244				 * that induced a wrap, but then we didn't
10245				 * subsequently take the space due to an error
10246				 * or false predicate evaluation.  This is
10247				 * okay; we know that _this_ allocation isn't
10248				 * going to induce a wrap.  We still can't
10249				 * reset the wrapped offset to be zero,
10250				 * however: the space may have been trashed in
10251				 * the previous failed probe attempt.  But at
10252				 * least the wrapped offset doesn't need to
10253				 * be adjusted at all...
10254				 */
10255				goto out;
10256			}
10257		}
10258
10259		while (offs + total > woffs) {
10260			dtrace_epid_t epid = *(uint32_t *)(tomax + woffs);
10261			size_t size;
10262
10263			if (epid == DTRACE_EPIDNONE) {
10264				size = sizeof (uint32_t);
10265			} else {
10266				ASSERT(epid <= state->dts_necbs);
10267				ASSERT(state->dts_ecbs[epid - 1] != NULL);
10268
10269				size = state->dts_ecbs[epid - 1]->dte_size;
10270			}
10271
10272			ASSERT(woffs + size <= buf->dtb_size);
10273			ASSERT(size != 0);
10274
10275			if (woffs + size == buf->dtb_size) {
10276				/*
10277				 * We've reached the end of the buffer; we want
10278				 * to set the wrapped offset to 0 and break
10279				 * out.  However, if the offs is 0, then we're
10280				 * in a strange edge-condition:  the amount of
10281				 * space that we want to reserve plus the size
10282				 * of the record that we're overwriting is
10283				 * greater than the size of the buffer.  This
10284				 * is problematic because if we reserve the
10285				 * space but subsequently don't consume it (due
10286				 * to a failed predicate or error) the wrapped
10287				 * offset will be 0 -- yet the EPID at offset 0
10288				 * will not be committed.  This situation is
10289				 * relatively easy to deal with:  if we're in
10290				 * this case, the buffer is indistinguishable
10291				 * from one that hasn't wrapped; we need only
10292				 * finish the job by clearing the wrapped bit,
10293				 * explicitly setting the offset to be 0, and
10294				 * zero'ing out the old data in the buffer.
10295				 */
10296				if (offs == 0) {
10297					buf->dtb_flags &= ~DTRACEBUF_WRAPPED;
10298					buf->dtb_offset = 0;
10299					woffs = total;
10300
10301					while (woffs < buf->dtb_size)
10302						tomax[woffs++] = 0;
10303				}
10304
10305				woffs = 0;
10306				break;
10307			}
10308
10309			woffs += size;
10310		}
10311
10312		/*
10313		 * We have a wrapped offset.  It may be that the wrapped offset
10314		 * has become zero -- that's okay.
10315		 */
10316		buf->dtb_xamot_offset = woffs;
10317	}
10318
10319out:
10320	/*
10321	 * Now we can plow the buffer with any necessary padding.
10322	 */
10323	while (offs & (align - 1)) {
10324		/*
10325		 * Assert that our alignment is off by a number which
10326		 * is itself sizeof (uint32_t) aligned.
10327		 */
10328		ASSERT(!((align - (offs & (align - 1))) &
10329		    (sizeof (uint32_t) - 1)));
10330		DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
10331		offs += sizeof (uint32_t);
10332	}
10333
10334	if (buf->dtb_flags & DTRACEBUF_FILL) {
10335		if (offs + needed > buf->dtb_size - state->dts_reserve) {
10336			buf->dtb_flags |= DTRACEBUF_FULL;
10337			return (-1);
10338		}
10339	}
10340
10341	if (mstate == NULL)
10342		return (offs);
10343
10344	/*
10345	 * For ring buffers and fill buffers, the scratch space is always
10346	 * the inactive buffer.
10347	 */
10348	mstate->dtms_scratch_base = (uintptr_t)buf->dtb_xamot;
10349	mstate->dtms_scratch_size = buf->dtb_size;
10350	mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
10351
10352	return (offs);
10353}
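
/*
 * A worked example of the padding performed above:  with a current offset
 * of 12 and a record alignment of 8, a single 32-bit DTRACE_EPIDNONE
 * filler is stored at offset 12, advancing the offset to 16 -- which is
 * suitably aligned.  The ASSERT()s in the padding loops verify the
 * invariant that makes this work:  the gap to be filled is always itself
 * a multiple of sizeof (uint32_t), and can therefore be tiled exactly
 * with EPIDNONE words.
 */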
10354
10355static void
10356dtrace_buffer_polish(dtrace_buffer_t *buf)
10357{
10358	ASSERT(buf->dtb_flags & DTRACEBUF_RING);
10359	ASSERT(MUTEX_HELD(&dtrace_lock));
10360
10361	if (!(buf->dtb_flags & DTRACEBUF_WRAPPED))
10362		return;
10363
10364	/*
10365	 * We need to polish the ring buffer.  There are three cases:
10366	 *
10367	 * - The first (and presumably most common) is that there is no gap
10368	 *   between the buffer offset and the wrapped offset.  In this case,
10369	 *   there is nothing in the buffer that isn't valid data; we can
10370	 *   mark the buffer as polished and return.
10371	 *
10372	 * - The second (less common than the first but still more common
10373	 *   than the third) is that there is a gap between the buffer offset
10374	 *   and the wrapped offset, and the wrapped offset is larger than the
10375	 *   buffer offset.  This can happen because of an alignment issue, or
10376	 *   can happen because of a call to dtrace_buffer_reserve() that
10377	 *   didn't subsequently consume the buffer space.  In this case,
10378	 *   we need to zero the data from the buffer offset to the wrapped
10379	 *   offset.
10380	 *
10381	 * - The third (and least common) is that there is a gap between the
10382	 *   buffer offset and the wrapped offset, but the wrapped offset is
10383	 *   _less_ than the buffer offset.  This can only happen because a
10384	 *   call to dtrace_buffer_reserve() induced a wrap, but the space
10385	 *   was not subsequently consumed.  In this case, we need to zero the
10386	 *   space from the offset to the end of the buffer _and_ from the
10387	 *   top of the buffer to the wrapped offset.
10388	 */
10389	if (buf->dtb_offset < buf->dtb_xamot_offset) {
10390		bzero(buf->dtb_tomax + buf->dtb_offset,
10391		    buf->dtb_xamot_offset - buf->dtb_offset);
10392	}
10393
10394	if (buf->dtb_offset > buf->dtb_xamot_offset) {
10395		bzero(buf->dtb_tomax + buf->dtb_offset,
10396		    buf->dtb_size - buf->dtb_offset);
10397		bzero(buf->dtb_tomax, buf->dtb_xamot_offset);
10398	}
10399}
10400
10401static void
10402dtrace_buffer_free(dtrace_buffer_t *bufs)
10403{
10404	int i;
10405
10406	for (i = 0; i < NCPU; i++) {
10407		dtrace_buffer_t *buf = &bufs[i];
10408
10409		if (buf->dtb_tomax == NULL) {
10410			ASSERT(buf->dtb_xamot == NULL);
10411			ASSERT(buf->dtb_size == 0);
10412			continue;
10413		}
10414
10415		if (buf->dtb_xamot != NULL) {
10416			ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
10417			kmem_free(buf->dtb_xamot, buf->dtb_size);
10418		}
10419
10420		kmem_free(buf->dtb_tomax, buf->dtb_size);
10421		buf->dtb_size = 0;
10422		buf->dtb_tomax = NULL;
10423		buf->dtb_xamot = NULL;
10424	}
10425}
10426
10427/*
10428 * DTrace Enabling Functions
10429 */
10430static dtrace_enabling_t *
10431dtrace_enabling_create(dtrace_vstate_t *vstate)
10432{
10433	dtrace_enabling_t *enab;
10434
10435	enab = kmem_zalloc(sizeof (dtrace_enabling_t), KM_SLEEP);
10436	enab->dten_vstate = vstate;
10437
10438	return (enab);
10439}
10440
10441static void
10442dtrace_enabling_add(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb)
10443{
10444	dtrace_ecbdesc_t **ndesc;
10445	size_t osize, nsize;
10446
10447	/*
10448	 * We can't add to enablings after we've enabled them, or after we've
10449	 * retained them.
10450	 */
10451	ASSERT(enab->dten_probegen == 0);
10452	ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
10453
10454	if (enab->dten_ndesc < enab->dten_maxdesc) {
10455		enab->dten_desc[enab->dten_ndesc++] = ecb;
10456		return;
10457	}
10458
10459	osize = enab->dten_maxdesc * sizeof (dtrace_ecbdesc_t *);
10460
10461	if (enab->dten_maxdesc == 0) {
10462		enab->dten_maxdesc = 1;
10463	} else {
10464		enab->dten_maxdesc <<= 1;
10465	}
10466
10467	ASSERT(enab->dten_ndesc < enab->dten_maxdesc);
10468
10469	nsize = enab->dten_maxdesc * sizeof (dtrace_ecbdesc_t *);
10470	ndesc = kmem_zalloc(nsize, KM_SLEEP);
10471	bcopy(enab->dten_desc, ndesc, osize);
10472	kmem_free(enab->dten_desc, osize);
10473
10474	enab->dten_desc = ndesc;
10475	enab->dten_desc[enab->dten_ndesc++] = ecb;
10476}
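
/*
 * The descriptor array above grows by doubling (1, 2, 4, ...), so adding
 * n ECB descriptions to an enabling incurs O(log n) reallocations and
 * amortized constant copying per addition.
 */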
10477
10478static void
10479dtrace_enabling_addlike(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb,
10480    dtrace_probedesc_t *pd)
10481{
10482	dtrace_ecbdesc_t *new;
10483	dtrace_predicate_t *pred;
10484	dtrace_actdesc_t *act;
10485
10486	/*
10487	 * We're going to create a new ECB description that matches the
10488	 * specified ECB in every way, but has the specified probe description.
10489	 */
10490	new = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
10491
10492	if ((pred = ecb->dted_pred.dtpdd_predicate) != NULL)
10493		dtrace_predicate_hold(pred);
10494
10495	for (act = ecb->dted_action; act != NULL; act = act->dtad_next)
10496		dtrace_actdesc_hold(act);
10497
10498	new->dted_action = ecb->dted_action;
10499	new->dted_pred = ecb->dted_pred;
10500	new->dted_probe = *pd;
10501	new->dted_uarg = ecb->dted_uarg;
10502
10503	dtrace_enabling_add(enab, new);
10504}
10505
10506static void
10507dtrace_enabling_dump(dtrace_enabling_t *enab)
10508{
10509	int i;
10510
10511	for (i = 0; i < enab->dten_ndesc; i++) {
10512		dtrace_probedesc_t *desc = &enab->dten_desc[i]->dted_probe;
10513
10514		cmn_err(CE_NOTE, "enabling probe %d (%s:%s:%s:%s)", i,
10515		    desc->dtpd_provider, desc->dtpd_mod,
10516		    desc->dtpd_func, desc->dtpd_name);
10517	}
10518}
10519
10520static void
10521dtrace_enabling_destroy(dtrace_enabling_t *enab)
10522{
10523	int i;
10524	dtrace_ecbdesc_t *ep;
10525	dtrace_vstate_t *vstate = enab->dten_vstate;
10526
10527	ASSERT(MUTEX_HELD(&dtrace_lock));
10528
10529	for (i = 0; i < enab->dten_ndesc; i++) {
10530		dtrace_actdesc_t *act, *next;
10531		dtrace_predicate_t *pred;
10532
10533		ep = enab->dten_desc[i];
10534
10535		if ((pred = ep->dted_pred.dtpdd_predicate) != NULL)
10536			dtrace_predicate_release(pred, vstate);
10537
10538		for (act = ep->dted_action; act != NULL; act = next) {
10539			next = act->dtad_next;
10540			dtrace_actdesc_release(act, vstate);
10541		}
10542
10543		kmem_free(ep, sizeof (dtrace_ecbdesc_t));
10544	}
10545
10546	kmem_free(enab->dten_desc,
10547	    enab->dten_maxdesc * sizeof (dtrace_ecbdesc_t *));
10548
10549	/*
10550	 * If this was a retained enabling, decrement the dts_nretained count
10551	 * and take it off of the dtrace_retained list.
10552	 */
10553	if (enab->dten_prev != NULL || enab->dten_next != NULL ||
10554	    dtrace_retained == enab) {
10555		ASSERT(enab->dten_vstate->dtvs_state != NULL);
10556		ASSERT(enab->dten_vstate->dtvs_state->dts_nretained > 0);
10557		enab->dten_vstate->dtvs_state->dts_nretained--;
10558	}
10559
10560	if (enab->dten_prev == NULL) {
10561		if (dtrace_retained == enab) {
10562			dtrace_retained = enab->dten_next;
10563
10564			if (dtrace_retained != NULL)
10565				dtrace_retained->dten_prev = NULL;
10566		}
10567	} else {
10568		ASSERT(enab != dtrace_retained);
10569		ASSERT(dtrace_retained != NULL);
10570		enab->dten_prev->dten_next = enab->dten_next;
10571	}
10572
10573	if (enab->dten_next != NULL) {
10574		ASSERT(dtrace_retained != NULL);
10575		enab->dten_next->dten_prev = enab->dten_prev;
10576	}
10577
10578	kmem_free(enab, sizeof (dtrace_enabling_t));
10579}
10580
10581static int
10582dtrace_enabling_retain(dtrace_enabling_t *enab)
10583{
10584	dtrace_state_t *state;
10585
10586	ASSERT(MUTEX_HELD(&dtrace_lock));
10587	ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
10588	ASSERT(enab->dten_vstate != NULL);
10589
10590	state = enab->dten_vstate->dtvs_state;
10591	ASSERT(state != NULL);
10592
10593	/*
10594	 * We only allow each state to retain dtrace_retain_max enablings.
10595	 */
10596	if (state->dts_nretained >= dtrace_retain_max)
10597		return (ENOSPC);
10598
10599	state->dts_nretained++;
10600
10601	if (dtrace_retained == NULL) {
10602		dtrace_retained = enab;
10603		return (0);
10604	}
10605
10606	enab->dten_next = dtrace_retained;
10607	dtrace_retained->dten_prev = enab;
10608	dtrace_retained = enab;
10609
10610	return (0);
10611}
10612
10613static int
10614dtrace_enabling_replicate(dtrace_state_t *state, dtrace_probedesc_t *match,
10615    dtrace_probedesc_t *create)
10616{
10617	dtrace_enabling_t *new, *enab;
10618	int found = 0, err = ENOENT;
10619
10620	ASSERT(MUTEX_HELD(&dtrace_lock));
10621	ASSERT(strlen(match->dtpd_provider) < DTRACE_PROVNAMELEN);
10622	ASSERT(strlen(match->dtpd_mod) < DTRACE_MODNAMELEN);
10623	ASSERT(strlen(match->dtpd_func) < DTRACE_FUNCNAMELEN);
10624	ASSERT(strlen(match->dtpd_name) < DTRACE_NAMELEN);
10625
10626	new = dtrace_enabling_create(&state->dts_vstate);
10627
10628	/*
10629	 * Iterate over all retained enablings, looking for enablings that
10630	 * match the specified state.
10631	 */
10632	for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
10633		int i;
10634
10635		/*
10636		 * dtvs_state can only be NULL for helper enablings -- and
10637		 * helper enablings can't be retained.
10638		 */
10639		ASSERT(enab->dten_vstate->dtvs_state != NULL);
10640
10641		if (enab->dten_vstate->dtvs_state != state)
10642			continue;
10643
10644		/*
10645		 * Now iterate over each probe description; we're looking for
10646		 * an exact match to the specified probe description.
10647		 */
10648		for (i = 0; i < enab->dten_ndesc; i++) {
10649			dtrace_ecbdesc_t *ep = enab->dten_desc[i];
10650			dtrace_probedesc_t *pd = &ep->dted_probe;
10651
10652			if (strcmp(pd->dtpd_provider, match->dtpd_provider))
10653				continue;
10654
10655			if (strcmp(pd->dtpd_mod, match->dtpd_mod))
10656				continue;
10657
10658			if (strcmp(pd->dtpd_func, match->dtpd_func))
10659				continue;
10660
10661			if (strcmp(pd->dtpd_name, match->dtpd_name))
10662				continue;
10663
10664			/*
10665			 * We have a winning probe!  Add it to our growing
10666			 * enabling.
10667			 */
10668			found = 1;
10669			dtrace_enabling_addlike(new, ep, create);
10670		}
10671	}
10672
10673	if (!found || (err = dtrace_enabling_retain(new)) != 0) {
10674		dtrace_enabling_destroy(new);
10675		return (err);
10676	}
10677
10678	return (0);
10679}
10680
10681static void
10682dtrace_enabling_retract(dtrace_state_t *state)
10683{
10684	dtrace_enabling_t *enab, *next;
10685
10686	ASSERT(MUTEX_HELD(&dtrace_lock));
10687
10688	/*
10689	 * Iterate over all retained enablings, destroying the enablings
10690	 * retained for the specified state.
10691	 */
10692	for (enab = dtrace_retained; enab != NULL; enab = next) {
10693		next = enab->dten_next;
10694
10695		/*
10696		 * dtvs_state can only be NULL for helper enablings -- and
10697		 * helper enablings can't be retained.
10698		 */
10699		ASSERT(enab->dten_vstate->dtvs_state != NULL);
10700
10701		if (enab->dten_vstate->dtvs_state == state) {
10702			ASSERT(state->dts_nretained > 0);
10703			dtrace_enabling_destroy(enab);
10704		}
10705	}
10706
10707	ASSERT(state->dts_nretained == 0);
10708}
10709
10710static int
10711dtrace_enabling_match(dtrace_enabling_t *enab, int *nmatched)
10712{
10713	int i = 0;
10714	int matched = 0;
10715
10716	ASSERT(MUTEX_HELD(&cpu_lock));
10717	ASSERT(MUTEX_HELD(&dtrace_lock));
10718
10719	for (i = 0; i < enab->dten_ndesc; i++) {
10720		dtrace_ecbdesc_t *ep = enab->dten_desc[i];
10721
10722		enab->dten_current = ep;
10723		enab->dten_error = 0;
10724
10725		matched += dtrace_probe_enable(&ep->dted_probe, enab);
10726
10727		if (enab->dten_error != 0) {
10728			/*
10729			 * If we get an error half-way through enabling the
10730			 * probes, we kick out -- perhaps with some number of
10731			 * them enabled.  Leaving enabled probes enabled may
10732			 * be slightly confusing for user-level, but we expect
10733			 * that no one will attempt to actually drive on in
10734			 * the face of such errors.  If this is an anonymous
10735			 * enabling (indicated with a NULL nmatched pointer),
10736			 * we cmn_err() a message.  We aren't expecting to
10737			 * get such an error -- to the extent that such an
10738			 * error can exist at all, it would be a result of
10739			 * corrupted DOF in the driver properties.
10740			 */
10741			if (nmatched == NULL) {
10742				cmn_err(CE_WARN, "dtrace_enabling_match() "
10743				    "error on %p: %d", (void *)ep,
10744				    enab->dten_error);
10745			}
10746
10747			return (enab->dten_error);
10748		}
10749	}
10750
10751	enab->dten_probegen = dtrace_probegen;
10752	if (nmatched != NULL)
10753		*nmatched = matched;
10754
10755	return (0);
10756}
10757
10758static void
10759dtrace_enabling_matchall(void)
10760{
10761	dtrace_enabling_t *enab;
10762
10763	mutex_enter(&cpu_lock);
10764	mutex_enter(&dtrace_lock);
10765
10766	/*
10767	 * Because we can be called after dtrace_detach() has been called, we
10768	 * cannot assert that there are retained enablings.  We can safely
10769	 * load from dtrace_retained, however:  the taskq_destroy() at the
10770	 * end of dtrace_detach() will block pending our completion.
10771	 */
10772	for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next)
10773		(void) dtrace_enabling_match(enab, NULL);
10774
10775	mutex_exit(&dtrace_lock);
10776	mutex_exit(&cpu_lock);
10777}
10778
10779static int
10780dtrace_enabling_matchstate(dtrace_state_t *state, int *nmatched)
10781{
10782	dtrace_enabling_t *enab;
10783	int matched, total = 0, err;
10784
10785	ASSERT(MUTEX_HELD(&cpu_lock));
10786	ASSERT(MUTEX_HELD(&dtrace_lock));
10787
10788	for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
10789		ASSERT(enab->dten_vstate->dtvs_state != NULL);
10790
10791		if (enab->dten_vstate->dtvs_state != state)
10792			continue;
10793
10794		if ((err = dtrace_enabling_match(enab, &matched)) != 0)
10795			return (err);
10796
10797		total += matched;
10798	}
10799
10800	if (nmatched != NULL)
10801		*nmatched = total;
10802
10803	return (0);
10804}
10805
10806/*
10807 * If an enabling is to be enabled without having matched probes (that is, if
10808 * dtrace_state_go() is to be called on the underlying dtrace_state_t), the
10809 * enabling must be _primed_ by creating an ECB for every ECB description.
10810 * This must be done to assure that we know the number of speculations, the
10811 * number of aggregations, the minimum buffer size needed, etc. before we
10812 * transition out of DTRACE_ACTIVITY_INACTIVE.  To do this without actually
10813 * enabling any probes, we create ECBs for every ECB description, but with a
10814 * NULL probe -- which is exactly what this function does.
10815 */
10816static void
10817dtrace_enabling_prime(dtrace_state_t *state)
10818{
10819	dtrace_enabling_t *enab;
10820	int i;
10821
10822	for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
10823		ASSERT(enab->dten_vstate->dtvs_state != NULL);
10824
10825		if (enab->dten_vstate->dtvs_state != state)
10826			continue;
10827
10828		/*
10829		 * We don't want to prime an enabling more than once, lest
10830		 * we allow a malicious user to induce resource exhaustion.
10831		 * (The ECBs that result from priming an enabling aren't
10832		 * leaked -- but they also aren't deallocated until the
10833		 * consumer state is destroyed.)
10834		 */
10835		if (enab->dten_primed)
10836			continue;
10837
10838		for (i = 0; i < enab->dten_ndesc; i++) {
10839			enab->dten_current = enab->dten_desc[i];
10840			(void) dtrace_probe_enable(NULL, enab);
10841		}
10842
10843		enab->dten_primed = 1;
10844	}
10845}
10846
10847/*
10848 * Called to indicate that probes should be provided due to retained
10849 * enablings.  This is implemented in terms of dtrace_probe_provide(), but it
10850 * must take an initial lap through the enabling, calling the dtps_provide()
10851 * entry point explicitly to allow for autocreated probes.
10852 */
10853static void
10854dtrace_enabling_provide(dtrace_provider_t *prv)
10855{
10856	int i, all = 0;
10857	dtrace_probedesc_t desc;
10858
10859	ASSERT(MUTEX_HELD(&dtrace_lock));
10860	ASSERT(MUTEX_HELD(&dtrace_provider_lock));
10861
10862	if (prv == NULL) {
10863		all = 1;
10864		prv = dtrace_provider;
10865	}
10866
10867	do {
10868		dtrace_enabling_t *enab = dtrace_retained;
10869		void *parg = prv->dtpv_arg;
10870
10871		for (; enab != NULL; enab = enab->dten_next) {
10872			for (i = 0; i < enab->dten_ndesc; i++) {
10873				desc = enab->dten_desc[i]->dted_probe;
10874				mutex_exit(&dtrace_lock);
10875				prv->dtpv_pops.dtps_provide(parg, &desc);
10876				mutex_enter(&dtrace_lock);
10877			}
10878		}
10879	} while (all && (prv = prv->dtpv_next) != NULL);
10880
10881	mutex_exit(&dtrace_lock);
10882	dtrace_probe_provide(NULL, all ? NULL : prv);
10883	mutex_enter(&dtrace_lock);
10884}
10885
10886/*
10887 * DTrace DOF Functions
10888 */
10889/*ARGSUSED*/
10890static void
10891dtrace_dof_error(dof_hdr_t *dof, const char *str)
10892{
10893	if (dtrace_err_verbose)
10894		cmn_err(CE_WARN, "failed to process DOF: %s", str);
10895
10896#ifdef DTRACE_ERRDEBUG
10897	dtrace_errdebug(str);
10898#endif
10899}
10900
10901/*
10902 * Create DOF out of a currently enabled state.  Right now, we only create
10903 * DOF containing the run-time options -- but this could be expanded to create
10904 * complete DOF representing the enabled state.
10905 */
10906static dof_hdr_t *
10907dtrace_dof_create(dtrace_state_t *state)
10908{
10909	dof_hdr_t *dof;
10910	dof_sec_t *sec;
10911	dof_optdesc_t *opt;
10912	int i, len = sizeof (dof_hdr_t) +
10913	    roundup(sizeof (dof_sec_t), sizeof (uint64_t)) +
10914	    sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
10915
10916	ASSERT(MUTEX_HELD(&dtrace_lock));
10917
10918	dof = kmem_zalloc(len, KM_SLEEP);
10919	dof->dofh_ident[DOF_ID_MAG0] = DOF_MAG_MAG0;
10920	dof->dofh_ident[DOF_ID_MAG1] = DOF_MAG_MAG1;
10921	dof->dofh_ident[DOF_ID_MAG2] = DOF_MAG_MAG2;
10922	dof->dofh_ident[DOF_ID_MAG3] = DOF_MAG_MAG3;
10923
10924	dof->dofh_ident[DOF_ID_MODEL] = DOF_MODEL_NATIVE;
10925	dof->dofh_ident[DOF_ID_ENCODING] = DOF_ENCODE_NATIVE;
10926	dof->dofh_ident[DOF_ID_VERSION] = DOF_VERSION;
10927	dof->dofh_ident[DOF_ID_DIFVERS] = DIF_VERSION;
10928	dof->dofh_ident[DOF_ID_DIFIREG] = DIF_DIR_NREGS;
10929	dof->dofh_ident[DOF_ID_DIFTREG] = DIF_DTR_NREGS;
10930
10931	dof->dofh_flags = 0;
10932	dof->dofh_hdrsize = sizeof (dof_hdr_t);
10933	dof->dofh_secsize = sizeof (dof_sec_t);
10934	dof->dofh_secnum = 1;	/* only DOF_SECT_OPTDESC */
10935	dof->dofh_secoff = sizeof (dof_hdr_t);
10936	dof->dofh_loadsz = len;
10937	dof->dofh_filesz = len;
10938	dof->dofh_pad = 0;
10939
10940	/*
10941	 * Fill in the option section header...
10942	 */
10943	sec = (dof_sec_t *)((uintptr_t)dof + sizeof (dof_hdr_t));
10944	sec->dofs_type = DOF_SECT_OPTDESC;
10945	sec->dofs_align = sizeof (uint64_t);
10946	sec->dofs_flags = DOF_SECF_LOAD;
10947	sec->dofs_entsize = sizeof (dof_optdesc_t);
10948
10949	opt = (dof_optdesc_t *)((uintptr_t)sec +
10950	    roundup(sizeof (dof_sec_t), sizeof (uint64_t)));
10951
10952	sec->dofs_offset = (uintptr_t)opt - (uintptr_t)dof;
10953	sec->dofs_size = sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
10954
10955	for (i = 0; i < DTRACEOPT_MAX; i++) {
10956		opt[i].dofo_option = i;
10957		opt[i].dofo_strtab = DOF_SECIDX_NONE;
10958		opt[i].dofo_value = state->dts_options[i];
10959	}
10960
10961	return (dof);
10962}
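
/*
 * The DOF image constructed above is laid out as follows (offsets from
 * the start of the image):
 *
 *	0					the dof_hdr_t itself
 *	sizeof (dof_hdr_t)			the dof_sec_t describing
 *						DOF_SECT_OPTDESC
 *	sizeof (dof_hdr_t) +
 *	    roundup(sizeof (dof_sec_t), 8)	DTRACEOPT_MAX dof_optdesc_t
 *						entries, one per option
 *
 * which together account for exactly the len computed at allocation time.
 */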
10963
10964static dof_hdr_t *
10965dtrace_dof_copyin(uintptr_t uarg, int *errp)
10966{
10967	dof_hdr_t hdr, *dof;
10968
10969	ASSERT(!MUTEX_HELD(&dtrace_lock));
10970
10971	/*
10972	 * First, we're going to copyin() the sizeof (dof_hdr_t).
10973	 */
10974	if (copyin((void *)uarg, &hdr, sizeof (hdr)) != 0) {
10975		dtrace_dof_error(NULL, "failed to copyin DOF header");
10976		*errp = EFAULT;
10977		return (NULL);
10978	}
10979
10980	/*
10981	 * Now we'll allocate the entire DOF and copy it in -- provided
10982	 * that the length isn't outrageous.
10983	 */
10984	if (hdr.dofh_loadsz >= dtrace_dof_maxsize) {
10985		dtrace_dof_error(&hdr, "load size exceeds maximum");
10986		*errp = E2BIG;
10987		return (NULL);
10988	}
10989
10990	if (hdr.dofh_loadsz < sizeof (hdr)) {
10991		dtrace_dof_error(&hdr, "invalid load size");
10992		*errp = EINVAL;
10993		return (NULL);
10994	}
10995
10996	dof = kmem_alloc(hdr.dofh_loadsz, KM_SLEEP);
10997
10998	if (copyin((void *)uarg, dof, hdr.dofh_loadsz) != 0 ||
	    dof->dofh_loadsz != hdr.dofh_loadsz) {
		/*
		 * Either the copyin() itself failed, or the load size
		 * changed between our two copyin()s; in the latter case,
		 * trusting dof->dofh_loadsz would let dtrace_dof_destroy()
		 * free with a size other than the one we allocated.
		 */
10999		kmem_free(dof, hdr.dofh_loadsz);
11000		*errp = EFAULT;
11001		return (NULL);
11002	}
11003
11004	return (dof);
11005}
11006
11007static dof_hdr_t *
11008dtrace_dof_property(const char *name)
11009{
11010	uchar_t *buf;
11011	uint64_t loadsz;
11012	unsigned int len, i;
11013	dof_hdr_t *dof;
11014
11015	/*
11016	 * Unfortunately, arrays of values in .conf files are always (and
11017	 * only) interpreted to be integer arrays.  We must read our DOF
11018	 * as an integer array, and then squeeze it into a byte array.
11019	 */
11020	if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dtrace_devi, 0,
11021	    (char *)name, (int **)&buf, &len) != DDI_PROP_SUCCESS)
11022		return (NULL);
11023
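	/*
	 * The squeeze below is safe to perform in place:  iteration i
	 * reads the 32-bit value at byte offset 4 * i and writes a single
	 * byte at offset i, and since i <= 4 * i, no write can clobber a
	 * value that has yet to be read.  For example, a DOF image
	 * beginning with the magic bytes 0x7f 'D' 'O' 'F' arrives as the
	 * ints 0x7f, 0x44, 0x4f and 0x46, and collapses back to those
	 * four bytes.
	 */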
11024	for (i = 0; i < len; i++)
11025		buf[i] = (uchar_t)(((int *)buf)[i]);
11026
11027	if (len < sizeof (dof_hdr_t)) {
11028		ddi_prop_free(buf);
11029		dtrace_dof_error(NULL, "truncated header");
11030		return (NULL);
11031	}
11032
11033	if (len < (loadsz = ((dof_hdr_t *)buf)->dofh_loadsz)) {
11034		ddi_prop_free(buf);
11035		dtrace_dof_error(NULL, "truncated DOF");
11036		return (NULL);
11037	}
11038
11039	if (loadsz >= dtrace_dof_maxsize) {
11040		ddi_prop_free(buf);
11041		dtrace_dof_error(NULL, "oversized DOF");
11042		return (NULL);
11043	}
11044
11045	dof = kmem_alloc(loadsz, KM_SLEEP);
11046	bcopy(buf, dof, loadsz);
11047	ddi_prop_free(buf);
11048
11049	return (dof);
11050}
11051
11052static void
11053dtrace_dof_destroy(dof_hdr_t *dof)
11054{
11055	kmem_free(dof, dof->dofh_loadsz);
11056}
11057
11058/*
11059 * Return the dof_sec_t pointer corresponding to a given section index.  If the
11060 * index is not valid, dtrace_dof_error() is called and NULL is returned.  If
11061 * a type other than DOF_SECT_NONE is specified, the header is checked against
11062 * this type and NULL is returned if the types do not match.
11063 */
11064static dof_sec_t *
11065dtrace_dof_sect(dof_hdr_t *dof, uint32_t type, dof_secidx_t i)
11066{
11067	dof_sec_t *sec = (dof_sec_t *)(uintptr_t)
11068	    ((uintptr_t)dof + dof->dofh_secoff + i * dof->dofh_secsize);
11069
11070	if (i >= dof->dofh_secnum) {
11071		dtrace_dof_error(dof, "referenced section index is invalid");
11072		return (NULL);
11073	}
11074
11075	if (!(sec->dofs_flags & DOF_SECF_LOAD)) {
11076		dtrace_dof_error(dof, "referenced section is not loadable");
11077		return (NULL);
11078	}
11079
11080	if (type != DOF_SECT_NONE && type != sec->dofs_type) {
11081		dtrace_dof_error(dof, "referenced section is the wrong type");
11082		return (NULL);
11083	}
11084
11085	return (sec);
11086}
11087
11088static dtrace_probedesc_t *
11089dtrace_dof_probedesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_probedesc_t *desc)
11090{
11091	dof_probedesc_t *probe;
11092	dof_sec_t *strtab;
11093	uintptr_t daddr = (uintptr_t)dof;
11094	uintptr_t str;
11095	size_t size;
11096
11097	if (sec->dofs_type != DOF_SECT_PROBEDESC) {
11098		dtrace_dof_error(dof, "invalid probe section");
11099		return (NULL);
11100	}
11101
11102	if (sec->dofs_align != sizeof (dof_secidx_t)) {
11103		dtrace_dof_error(dof, "bad alignment in probe description");
11104		return (NULL);
11105	}
11106
11107	if (sec->dofs_offset + sizeof (dof_probedesc_t) > dof->dofh_loadsz) {
11108		dtrace_dof_error(dof, "truncated probe description");
11109		return (NULL);
11110	}
11111
11112	probe = (dof_probedesc_t *)(uintptr_t)(daddr + sec->dofs_offset);
11113	strtab = dtrace_dof_sect(dof, DOF_SECT_STRTAB, probe->dofp_strtab);
11114
11115	if (strtab == NULL)
11116		return (NULL);
11117
11118	str = daddr + strtab->dofs_offset;
11119	size = strtab->dofs_size;
11120
11121	if (probe->dofp_provider >= strtab->dofs_size) {
11122		dtrace_dof_error(dof, "corrupt probe provider");
11123		return (NULL);
11124	}
11125
11126	(void) strncpy(desc->dtpd_provider,
11127	    (char *)(str + probe->dofp_provider),
11128	    MIN(DTRACE_PROVNAMELEN - 1, size - probe->dofp_provider));
11129
11130	if (probe->dofp_mod >= strtab->dofs_size) {
11131		dtrace_dof_error(dof, "corrupt probe module");
11132		return (NULL);
11133	}
11134
11135	(void) strncpy(desc->dtpd_mod, (char *)(str + probe->dofp_mod),
11136	    MIN(DTRACE_MODNAMELEN - 1, size - probe->dofp_mod));
11137
11138	if (probe->dofp_func >= strtab->dofs_size) {
11139		dtrace_dof_error(dof, "corrupt probe function");
11140		return (NULL);
11141	}
11142
11143	(void) strncpy(desc->dtpd_func, (char *)(str + probe->dofp_func),
11144	    MIN(DTRACE_FUNCNAMELEN - 1, size - probe->dofp_func));
11145
11146	if (probe->dofp_name >= strtab->dofs_size) {
11147		dtrace_dof_error(dof, "corrupt probe name");
11148		return (NULL);
11149	}
11150
11151	(void) strncpy(desc->dtpd_name, (char *)(str + probe->dofp_name),
11152	    MIN(DTRACE_NAMELEN - 1, size - probe->dofp_name));
11153
11154	return (desc);
11155}
11156
11157static dtrace_difo_t *
11158dtrace_dof_difo(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
11159    cred_t *cr)
11160{
11161	dtrace_difo_t *dp;
11162	size_t ttl = 0;
11163	dof_difohdr_t *dofd;
11164	uintptr_t daddr = (uintptr_t)dof;
11165	size_t max = dtrace_difo_maxsize;
11166	int i, l, n;
11167
11168	static const struct {
11169		int section;
11170		int bufoffs;
11171		int lenoffs;
11172		int entsize;
11173		int align;
11174		const char *msg;
11175	} difo[] = {
11176		{ DOF_SECT_DIF, offsetof(dtrace_difo_t, dtdo_buf),
11177		offsetof(dtrace_difo_t, dtdo_len), sizeof (dif_instr_t),
11178		sizeof (dif_instr_t), "multiple DIF sections" },
11179
11180		{ DOF_SECT_INTTAB, offsetof(dtrace_difo_t, dtdo_inttab),
11181		offsetof(dtrace_difo_t, dtdo_intlen), sizeof (uint64_t),
11182		sizeof (uint64_t), "multiple integer tables" },
11183
11184		{ DOF_SECT_STRTAB, offsetof(dtrace_difo_t, dtdo_strtab),
11185		offsetof(dtrace_difo_t, dtdo_strlen), 0,
11186		sizeof (char), "multiple string tables" },
11187
11188		{ DOF_SECT_VARTAB, offsetof(dtrace_difo_t, dtdo_vartab),
11189		offsetof(dtrace_difo_t, dtdo_varlen), sizeof (dtrace_difv_t),
11190		sizeof (uint_t), "multiple variable tables" },
11191
11192		{ DOF_SECT_NONE, 0, 0, 0, 0, NULL }
11193	};
11194
11195	if (sec->dofs_type != DOF_SECT_DIFOHDR) {
11196		dtrace_dof_error(dof, "invalid DIFO header section");
11197		return (NULL);
11198	}
11199
11200	if (sec->dofs_align != sizeof (dof_secidx_t)) {
11201		dtrace_dof_error(dof, "bad alignment in DIFO header");
11202		return (NULL);
11203	}
11204
11205	if (sec->dofs_size < sizeof (dof_difohdr_t) ||
11206	    sec->dofs_size % sizeof (dof_secidx_t)) {
11207		dtrace_dof_error(dof, "bad size in DIFO header");
11208		return (NULL);
11209	}
11210
11211	dofd = (dof_difohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
11212	n = (sec->dofs_size - sizeof (*dofd)) / sizeof (dof_secidx_t) + 1;
11213
11214	dp = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
11215	dp->dtdo_rtype = dofd->dofd_rtype;
11216
11217	for (l = 0; l < n; l++) {
11218		dof_sec_t *subsec;
11219		void **bufp;
11220		uint32_t *lenp;
11221
11222		if ((subsec = dtrace_dof_sect(dof, DOF_SECT_NONE,
11223		    dofd->dofd_links[l])) == NULL)
11224			goto err; /* invalid section link */
11225
11226		if (ttl + subsec->dofs_size > max) {
11227			dtrace_dof_error(dof, "exceeds maximum size");
11228			goto err;
11229		}
11230
11231		ttl += subsec->dofs_size;
11232
11233		for (i = 0; difo[i].section != DOF_SECT_NONE; i++) {
11234			if (subsec->dofs_type != difo[i].section)
11235				continue;
11236
11237			if (!(subsec->dofs_flags & DOF_SECF_LOAD)) {
11238				dtrace_dof_error(dof, "section not loaded");
11239				goto err;
11240			}
11241
11242			if (subsec->dofs_align != difo[i].align) {
11243				dtrace_dof_error(dof, "bad alignment");
11244				goto err;
11245			}
11246
11247			bufp = (void **)((uintptr_t)dp + difo[i].bufoffs);
11248			lenp = (uint32_t *)((uintptr_t)dp + difo[i].lenoffs);
11249
11250			if (*bufp != NULL) {
11251				dtrace_dof_error(dof, difo[i].msg);
11252				goto err;
11253			}
11254
11255			if (difo[i].entsize != subsec->dofs_entsize) {
11256				dtrace_dof_error(dof, "entry size mismatch");
11257				goto err;
11258			}
11259
11260			if (subsec->dofs_entsize != 0 &&
11261			    (subsec->dofs_size % subsec->dofs_entsize) != 0) {
11262				dtrace_dof_error(dof, "corrupt entry size");
11263				goto err;
11264			}
11265
11266			*lenp = subsec->dofs_size;
11267			*bufp = kmem_alloc(subsec->dofs_size, KM_SLEEP);
11268			bcopy((char *)(uintptr_t)(daddr + subsec->dofs_offset),
11269			    *bufp, subsec->dofs_size);
11270
11271			if (subsec->dofs_entsize != 0)
11272				*lenp /= subsec->dofs_entsize;
11273
11274			break;
11275		}
11276
11277		/*
11278		 * If we encounter a loadable DIFO sub-section that is not
11279		 * known to us, assume this is a broken program and fail.
11280		 */
11281		if (difo[i].section == DOF_SECT_NONE &&
11282		    (subsec->dofs_flags & DOF_SECF_LOAD)) {
11283			dtrace_dof_error(dof, "unrecognized DIFO subsection");
11284			goto err;
11285		}
11286	}
11287
11288	if (dp->dtdo_buf == NULL) {
11289		/*
11290		 * We can't have a DIF object without DIF text.
11291		 */
11292		dtrace_dof_error(dof, "missing DIF text");
11293		goto err;
11294	}
11295
11296	/*
11297	 * Before we validate the DIF object, run through the variable table
11298	 * looking for the strings -- if any of their sizes are zero, we'll set
11299	 * their size to be the system-wide default string size.  Note that
11300	 * this should _not_ happen if the "strsize" option has been set --
11301	 * in this case, the compiler should have set the size to reflect the
11302	 * setting of the option.
11303	 */
11304	for (i = 0; i < dp->dtdo_varlen; i++) {
11305		dtrace_difv_t *v = &dp->dtdo_vartab[i];
11306		dtrace_diftype_t *t = &v->dtdv_type;
11307
11308		if (v->dtdv_id < DIF_VAR_OTHER_UBASE)
11309			continue;
11310
11311		if (t->dtdt_kind == DIF_TYPE_STRING && t->dtdt_size == 0)
11312			t->dtdt_size = dtrace_strsize_default;
11313	}
11314
11315	if (dtrace_difo_validate(dp, vstate, DIF_DIR_NREGS, cr) != 0)
11316		goto err;
11317
11318	dtrace_difo_init(dp, vstate);
11319	return (dp);
11320
11321err:
11322	kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
11323	kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
11324	kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
11325	kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
11326
11327	kmem_free(dp, sizeof (dtrace_difo_t));
11328	return (NULL);
11329}
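
/*
 * (The static 'difo' table consulted above maps each loadable DIFO
 * sub-section type -- DIF text, integer table, string table and variable
 * table -- to its required alignment, entry size, and destination buffer
 * and length offsets within dtrace_difo_t; that is what lets a single
 * generic loop load all of them.)
 */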
11330
11331static dtrace_predicate_t *
11332dtrace_dof_predicate(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
11333    cred_t *cr)
11334{
11335	dtrace_difo_t *dp;
11336
11337	if ((dp = dtrace_dof_difo(dof, sec, vstate, cr)) == NULL)
11338		return (NULL);
11339
11340	return (dtrace_predicate_create(dp));
11341}
11342
11343static dtrace_actdesc_t *
11344dtrace_dof_actdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
11345    cred_t *cr)
11346{
11347	dtrace_actdesc_t *act, *first = NULL, *last = NULL, *next;
11348	dof_actdesc_t *desc;
11349	dof_sec_t *difosec;
11350	size_t offs;
11351	uintptr_t daddr = (uintptr_t)dof;
11352	uint64_t arg;
11353	dtrace_actkind_t kind;
11354
11355	if (sec->dofs_type != DOF_SECT_ACTDESC) {
11356		dtrace_dof_error(dof, "invalid action section");
11357		return (NULL);
11358	}
11359
11360	if (sec->dofs_offset + sizeof (dof_actdesc_t) > dof->dofh_loadsz) {
11361		dtrace_dof_error(dof, "truncated action description");
11362		return (NULL);
11363	}
11364
11365	if (sec->dofs_align != sizeof (uint64_t)) {
11366		dtrace_dof_error(dof, "bad alignment in action description");
11367		return (NULL);
11368	}
11369
11370	if (sec->dofs_size < sec->dofs_entsize) {
11371		dtrace_dof_error(dof, "section entry size exceeds total size");
11372		return (NULL);
11373	}
11374
11375	if (sec->dofs_entsize != sizeof (dof_actdesc_t)) {
11376		dtrace_dof_error(dof, "bad entry size in action description");
11377		return (NULL);
11378	}
11379
11380	if (sec->dofs_size / sec->dofs_entsize > dtrace_actions_max) {
11381		dtrace_dof_error(dof, "actions exceed dtrace_actions_max");
11382		return (NULL);
11383	}
11384
11385	for (offs = 0; offs < sec->dofs_size; offs += sec->dofs_entsize) {
11386		desc = (dof_actdesc_t *)(daddr +
11387		    (uintptr_t)sec->dofs_offset + offs);
11388		kind = (dtrace_actkind_t)desc->dofa_kind;
11389
11390		if (DTRACEACT_ISPRINTFLIKE(kind) &&
11391		    (kind != DTRACEACT_PRINTA ||
11392		    desc->dofa_strtab != DOF_SECIDX_NONE)) {
11393			dof_sec_t *strtab;
11394			char *str, *fmt;
11395			uint64_t i;
11396
11397			/*
11398			 * printf()-like actions must have a format string.
11399			 */
11400			if ((strtab = dtrace_dof_sect(dof,
11401			    DOF_SECT_STRTAB, desc->dofa_strtab)) == NULL)
11402				goto err;
11403
11404			str = (char *)((uintptr_t)dof +
11405			    (uintptr_t)strtab->dofs_offset);
11406
11407			for (i = desc->dofa_arg; i < strtab->dofs_size; i++) {
11408				if (str[i] == '\0')
11409					break;
11410			}
11411
11412			if (i >= strtab->dofs_size) {
11413				dtrace_dof_error(dof, "bogus format string");
11414				goto err;
11415			}
11416
11417			if (i == desc->dofa_arg) {
11418				dtrace_dof_error(dof, "empty format string");
11419				goto err;
11420			}
11421
11422			i -= desc->dofa_arg;
11423			fmt = kmem_alloc(i + 1, KM_SLEEP);
11424			bcopy(&str[desc->dofa_arg], fmt, i + 1);
11425			arg = (uint64_t)(uintptr_t)fmt;
11426		} else {
11427			if (kind == DTRACEACT_PRINTA) {
11428				ASSERT(desc->dofa_strtab == DOF_SECIDX_NONE);
11429				arg = 0;
11430			} else {
11431				arg = desc->dofa_arg;
11432			}
11433		}
11434
11435		act = dtrace_actdesc_create(kind, desc->dofa_ntuple,
11436		    desc->dofa_uarg, arg);
11437
11438		if (last != NULL) {
11439			last->dtad_next = act;
11440		} else {
11441			first = act;
11442		}
11443
11444		last = act;
11445
11446		if (desc->dofa_difo == DOF_SECIDX_NONE)
11447			continue;
11448
11449		if ((difosec = dtrace_dof_sect(dof,
11450		    DOF_SECT_DIFOHDR, desc->dofa_difo)) == NULL)
11451			goto err;
11452
11453		act->dtad_difo = dtrace_dof_difo(dof, difosec, vstate, cr);
11454
11455		if (act->dtad_difo == NULL)
11456			goto err;
11457	}
11458
11459	ASSERT(first != NULL);
11460	return (first);
11461
11462err:
11463	for (act = first; act != NULL; act = next) {
11464		next = act->dtad_next;
11465		dtrace_actdesc_release(act, vstate);
11466	}
11467
11468	return (NULL);
11469}
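
/*
 * (Note on ownership:  on success, the returned action chain holds its own
 * references to any DIFOs it names; on failure, the err path above releases
 * every partially constructed action, so callers never see a partial chain.)
 */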
11470
11471static dtrace_ecbdesc_t *
11472dtrace_dof_ecbdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
11473    cred_t *cr)
11474{
11475	dtrace_ecbdesc_t *ep;
11476	dof_ecbdesc_t *ecb;
11477	dtrace_probedesc_t *desc;
11478	dtrace_predicate_t *pred = NULL;
11479
11480	if (sec->dofs_size < sizeof (dof_ecbdesc_t)) {
11481		dtrace_dof_error(dof, "truncated ECB description");
11482		return (NULL);
11483	}
11484
11485	if (sec->dofs_align != sizeof (uint64_t)) {
11486		dtrace_dof_error(dof, "bad alignment in ECB description");
11487		return (NULL);
11488	}
11489
11490	ecb = (dof_ecbdesc_t *)((uintptr_t)dof + (uintptr_t)sec->dofs_offset);
11491	sec = dtrace_dof_sect(dof, DOF_SECT_PROBEDESC, ecb->dofe_probes);
11492
11493	if (sec == NULL)
11494		return (NULL);
11495
11496	ep = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
11497	ep->dted_uarg = ecb->dofe_uarg;
11498	desc = &ep->dted_probe;
11499
11500	if (dtrace_dof_probedesc(dof, sec, desc) == NULL)
11501		goto err;
11502
11503	if (ecb->dofe_pred != DOF_SECIDX_NONE) {
11504		if ((sec = dtrace_dof_sect(dof,
11505		    DOF_SECT_DIFOHDR, ecb->dofe_pred)) == NULL)
11506			goto err;
11507
11508		if ((pred = dtrace_dof_predicate(dof, sec, vstate, cr)) == NULL)
11509			goto err;
11510
11511		ep->dted_pred.dtpdd_predicate = pred;
11512	}
11513
11514	if (ecb->dofe_actions != DOF_SECIDX_NONE) {
11515		if ((sec = dtrace_dof_sect(dof,
11516		    DOF_SECT_ACTDESC, ecb->dofe_actions)) == NULL)
11517			goto err;
11518
11519		ep->dted_action = dtrace_dof_actdesc(dof, sec, vstate, cr);
11520
11521		if (ep->dted_action == NULL)
11522			goto err;
11523	}
11524
11525	return (ep);
11526
11527err:
11528	if (pred != NULL)
11529		dtrace_predicate_release(pred, vstate);
11530	kmem_free(ep, sizeof (dtrace_ecbdesc_t));
11531	return (NULL);
11532}
11533
11534/*
11535 * Apply the relocations from the specified 'sec' (a DOF_SECT_URELHDR) to the
11536 * specified DOF.  At present, this amounts to simply adding 'ubase' to the
11537 * site of any user SETX relocations to account for load object base address.
11538 * In the future, if we need other relocations, this function can be extended.
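 *
 * Illustratively, for a target section whose data begins at 'ts_addr' (a
 * name used only in this sketch), the effect of one SETX record 'r'
 * reduces to:
 *
 *	*(uint64_t *)(ts_addr + r->dofr_offset) += ubase;
 *
 * which is exactly what the loop below performs after validating the
 * offset and its alignment.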
11539 */
11540static int
11541dtrace_dof_relocate(dof_hdr_t *dof, dof_sec_t *sec, uint64_t ubase)
11542{
11543	uintptr_t daddr = (uintptr_t)dof;
11544	dof_relohdr_t *dofr =
11545	    (dof_relohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
11546	dof_sec_t *ss, *rs, *ts;
11547	dof_relodesc_t *r;
11548	uint_t i, n;
11549
11550	if (sec->dofs_size < sizeof (dof_relohdr_t) ||
11551	    sec->dofs_align != sizeof (dof_secidx_t)) {
11552		dtrace_dof_error(dof, "invalid relocation header");
11553		return (-1);
11554	}
11555
11556	ss = dtrace_dof_sect(dof, DOF_SECT_STRTAB, dofr->dofr_strtab);
11557	rs = dtrace_dof_sect(dof, DOF_SECT_RELTAB, dofr->dofr_relsec);
11558	ts = dtrace_dof_sect(dof, DOF_SECT_NONE, dofr->dofr_tgtsec);
11559
11560	if (ss == NULL || rs == NULL || ts == NULL)
11561		return (-1); /* dtrace_dof_error() has been called already */
11562
11563	if (rs->dofs_entsize < sizeof (dof_relodesc_t) ||
11564	    rs->dofs_align != sizeof (uint64_t)) {
11565		dtrace_dof_error(dof, "invalid relocation section");
11566		return (-1);
11567	}
11568
11569	r = (dof_relodesc_t *)(uintptr_t)(daddr + rs->dofs_offset);
11570	n = rs->dofs_size / rs->dofs_entsize;
11571
11572	for (i = 0; i < n; i++) {
11573		uintptr_t taddr = daddr + ts->dofs_offset + r->dofr_offset;
11574
11575		switch (r->dofr_type) {
11576		case DOF_RELO_NONE:
11577			break;
11578		case DOF_RELO_SETX:
11579			if (r->dofr_offset >= ts->dofs_size || r->dofr_offset +
11580			    sizeof (uint64_t) > ts->dofs_size) {
11581				dtrace_dof_error(dof, "bad relocation offset");
11582				return (-1);
11583			}
11584
11585			if (!IS_P2ALIGNED(taddr, sizeof (uint64_t))) {
11586				dtrace_dof_error(dof, "misaligned setx relo");
11587				return (-1);
11588			}
11589
11590			*(uint64_t *)taddr += ubase;
11591			break;
11592		default:
11593			dtrace_dof_error(dof, "invalid relocation type");
11594			return (-1);
11595		}
11596
11597		r = (dof_relodesc_t *)((uintptr_t)r + rs->dofs_entsize);
11598	}
11599
11600	return (0);
11601}
11602
11603/*
11604 * The dof_hdr_t passed to dtrace_dof_slurp() should be a partially validated
11605 * header:  it should be at the front of a memory region that is at least
11606 * sizeof (dof_hdr_t) in size -- and then at least dof_hdr.dofh_loadsz in
11607 * size.  It need not be validated in any other way.
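 *
 * A sketch of a typical call (cf. dtrace_anon_property(), below), made
 * with dtrace_lock held on a header obtained from dtrace_dof_copyin() or
 * dtrace_dof_property():
 *
 *	rv = dtrace_dof_slurp(dof, &state->dts_vstate, CRED(),
 *	    &enab, 0, B_TRUE);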
11608 */
11609static int
11610dtrace_dof_slurp(dof_hdr_t *dof, dtrace_vstate_t *vstate, cred_t *cr,
11611    dtrace_enabling_t **enabp, uint64_t ubase, int noprobes)
11612{
11613	uint64_t len = dof->dofh_loadsz, seclen;
11614	uintptr_t daddr = (uintptr_t)dof;
11615	dtrace_ecbdesc_t *ep;
11616	dtrace_enabling_t *enab;
11617	uint_t i;
11618
11619	ASSERT(MUTEX_HELD(&dtrace_lock));
11620	ASSERT(dof->dofh_loadsz >= sizeof (dof_hdr_t));
11621
11622	/*
11623	 * Check the DOF header identification bytes.  In addition to checking
11624	 * valid settings, we also verify that unused bits/bytes are zeroed so
11625	 * we can use them later without fear of regressing existing binaries.
11626	 */
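	/*
	 * (Layout of dofh_ident as consumed below:  bytes 0 through 3 hold
	 * the magic string; DOF_ID_MODEL, DOF_ID_ENCODING, DOF_ID_VERSION,
	 * DOF_ID_DIFVERS, DOF_ID_DIFIREG and DOF_ID_DIFTREG follow; and
	 * everything from DOF_ID_PAD up to DOF_ID_SIZE must be zero.)
	 */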
11627	if (bcmp(&dof->dofh_ident[DOF_ID_MAG0],
11628	    DOF_MAG_STRING, DOF_MAG_STRLEN) != 0) {
11629		dtrace_dof_error(dof, "DOF magic string mismatch");
11630		return (-1);
11631	}
11632
11633	if (dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_ILP32 &&
11634	    dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_LP64) {
11635		dtrace_dof_error(dof, "DOF has invalid data model");
11636		return (-1);
11637	}
11638
11639	if (dof->dofh_ident[DOF_ID_ENCODING] != DOF_ENCODE_NATIVE) {
11640		dtrace_dof_error(dof, "DOF encoding mismatch");
11641		return (-1);
11642	}
11643
11644	if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
11645	    dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_2) {
11646		dtrace_dof_error(dof, "DOF version mismatch");
11647		return (-1);
11648	}
11649
11650	if (dof->dofh_ident[DOF_ID_DIFVERS] != DIF_VERSION_2) {
11651		dtrace_dof_error(dof, "DOF uses unsupported instruction set");
11652		return (-1);
11653	}
11654
11655	if (dof->dofh_ident[DOF_ID_DIFIREG] > DIF_DIR_NREGS) {
11656		dtrace_dof_error(dof, "DOF uses too many integer registers");
11657		return (-1);
11658	}
11659
11660	if (dof->dofh_ident[DOF_ID_DIFTREG] > DIF_DTR_NREGS) {
11661		dtrace_dof_error(dof, "DOF uses too many tuple registers");
11662		return (-1);
11663	}
11664
11665	for (i = DOF_ID_PAD; i < DOF_ID_SIZE; i++) {
11666		if (dof->dofh_ident[i] != 0) {
11667			dtrace_dof_error(dof, "DOF has invalid ident byte set");
11668			return (-1);
11669		}
11670	}
11671
11672	if (dof->dofh_flags & ~DOF_FL_VALID) {
11673		dtrace_dof_error(dof, "DOF has invalid flag bits set");
11674		return (-1);
11675	}
11676
11677	if (dof->dofh_secsize == 0) {
11678		dtrace_dof_error(dof, "zero section header size");
11679		return (-1);
11680	}
11681
11682	/*
11683	 * Check that the section headers don't exceed the amount of DOF
11684	 * data.  Note that we cast the section size and number of sections
11685	 * to uint64_t's to prevent possible overflow in the multiplication.
11686	 */
11687	seclen = (uint64_t)dof->dofh_secnum * (uint64_t)dof->dofh_secsize;
11688
11689	if (dof->dofh_secoff > len || seclen > len ||
11690	    dof->dofh_secoff + seclen > len) {
11691		dtrace_dof_error(dof, "truncated section headers");
11692		return (-1);
11693	}
11694
11695	if (!IS_P2ALIGNED(dof->dofh_secoff, sizeof (uint64_t))) {
11696		dtrace_dof_error(dof, "misaligned section headers");
11697		return (-1);
11698	}
11699
11700	if (!IS_P2ALIGNED(dof->dofh_secsize, sizeof (uint64_t))) {
11701		dtrace_dof_error(dof, "misaligned section size");
11702		return (-1);
11703	}
11704
11705	/*
11706	 * Take an initial pass through the section headers to be sure that
11707	 * the headers don't have stray offsets.  If the 'noprobes' flag is
11708	 * set, do not permit sections relating to providers, probes, or args.
11709	 */
11710	for (i = 0; i < dof->dofh_secnum; i++) {
11711		dof_sec_t *sec = (dof_sec_t *)(daddr +
11712		    (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
11713
11714		if (noprobes) {
11715			switch (sec->dofs_type) {
11716			case DOF_SECT_PROVIDER:
11717			case DOF_SECT_PROBES:
11718			case DOF_SECT_PRARGS:
11719			case DOF_SECT_PROFFS:
11720				dtrace_dof_error(dof, "illegal sections "
11721				    "for enabling");
11722				return (-1);
11723			}
11724		}
11725
11726		if (!(sec->dofs_flags & DOF_SECF_LOAD))
11727			continue; /* just ignore non-loadable sections */
11728
11729		if (sec->dofs_align & (sec->dofs_align - 1)) {
11730			dtrace_dof_error(dof, "bad section alignment");
11731			return (-1);
11732		}
11733
11734		if (sec->dofs_offset & (sec->dofs_align - 1)) {
11735			dtrace_dof_error(dof, "misaligned section");
11736			return (-1);
11737		}
11738
11739		if (sec->dofs_offset > len || sec->dofs_size > len ||
11740		    sec->dofs_offset + sec->dofs_size > len) {
11741			dtrace_dof_error(dof, "corrupt section header");
11742			return (-1);
11743		}
11744
11745		if (sec->dofs_type == DOF_SECT_STRTAB && *((char *)daddr +
11746		    sec->dofs_offset + sec->dofs_size - 1) != '\0') {
11747			dtrace_dof_error(dof, "non-terminating string table");
11748			return (-1);
11749		}
11750	}
11751
11752	/*
11753	 * Take a second pass through the sections and locate and perform any
11754	 * relocations that are present.  We do this after the first pass to
11755	 * be sure that all sections have had their headers validated.
11756	 */
11757	for (i = 0; i < dof->dofh_secnum; i++) {
11758		dof_sec_t *sec = (dof_sec_t *)(daddr +
11759		    (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
11760
11761		if (!(sec->dofs_flags & DOF_SECF_LOAD))
11762			continue; /* skip sections that are not loadable */
11763
11764		switch (sec->dofs_type) {
11765		case DOF_SECT_URELHDR:
11766			if (dtrace_dof_relocate(dof, sec, ubase) != 0)
11767				return (-1);
11768			break;
11769		}
11770	}
11771
11772	if ((enab = *enabp) == NULL)
11773		enab = *enabp = dtrace_enabling_create(vstate);
11774
11775	for (i = 0; i < dof->dofh_secnum; i++) {
11776		dof_sec_t *sec = (dof_sec_t *)(daddr +
11777		    (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
11778
11779		if (sec->dofs_type != DOF_SECT_ECBDESC)
11780			continue;
11781
11782		if ((ep = dtrace_dof_ecbdesc(dof, sec, vstate, cr)) == NULL) {
11783			dtrace_enabling_destroy(enab);
11784			*enabp = NULL;
11785			return (-1);
11786		}
11787
11788		dtrace_enabling_add(enab, ep);
11789	}
11790
11791	return (0);
11792}
11793
11794/*
11795 * Process DOF for any options.  This routine assumes that the DOF has been
11796 * at least processed by dtrace_dof_slurp().
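 *
 * Each DOF_SECT_OPTDESC entry is a dof_optdesc_t of the form
 *
 *	{ dofo_option, dofo_strtab = DOF_SECIDX_NONE, dofo_value }
 *
 * String-valued options must already have been resolved to numeric values
 * by the consumer; entries with a non-NONE dofo_strtab are rejected below.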
11797 */
11798static int
11799dtrace_dof_options(dof_hdr_t *dof, dtrace_state_t *state)
11800{
11801	int i, rval;
11802	uint32_t entsize;
11803	size_t offs;
11804	dof_optdesc_t *desc;
11805
11806	for (i = 0; i < dof->dofh_secnum; i++) {
11807		dof_sec_t *sec = (dof_sec_t *)((uintptr_t)dof +
11808		    (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
11809
11810		if (sec->dofs_type != DOF_SECT_OPTDESC)
11811			continue;
11812
11813		if (sec->dofs_align != sizeof (uint64_t)) {
11814			dtrace_dof_error(dof, "bad alignment in "
11815			    "option description");
11816			return (EINVAL);
11817		}
11818
11819		if ((entsize = sec->dofs_entsize) == 0) {
11820			dtrace_dof_error(dof, "zeroed option entry size");
11821			return (EINVAL);
11822		}
11823
11824		if (entsize < sizeof (dof_optdesc_t)) {
11825			dtrace_dof_error(dof, "bad option entry size");
11826			return (EINVAL);
11827		}
11828
11829		for (offs = 0; offs < sec->dofs_size; offs += entsize) {
11830			desc = (dof_optdesc_t *)((uintptr_t)dof +
11831			    (uintptr_t)sec->dofs_offset + offs);
11832
11833			if (desc->dofo_strtab != DOF_SECIDX_NONE) {
11834				dtrace_dof_error(dof, "non-zero option string");
11835				return (EINVAL);
11836			}
11837
11838			if (desc->dofo_value == DTRACEOPT_UNSET) {
11839				dtrace_dof_error(dof, "unset option");
11840				return (EINVAL);
11841			}
11842
11843			if ((rval = dtrace_state_option(state,
11844			    desc->dofo_option, desc->dofo_value)) != 0) {
11845				dtrace_dof_error(dof, "rejected option");
11846				return (rval);
11847			}
11848		}
11849	}
11850
11851	return (0);
11852}
11853
11854/*
11855 * DTrace Consumer State Functions
11856 */
11857int
11858dtrace_dstate_init(dtrace_dstate_t *dstate, size_t size)
11859{
11860	size_t hashsize, maxper, min, chunksize = dstate->dtds_chunksize;
11861	void *base;
11862	uintptr_t limit;
11863	dtrace_dynvar_t *dvar, *next, *start;
11864	int i;
11865
11866	ASSERT(MUTEX_HELD(&dtrace_lock));
11867	ASSERT(dstate->dtds_base == NULL && dstate->dtds_percpu == NULL);
11868
11869	bzero(dstate, sizeof (dtrace_dstate_t));
11870
11871	if ((dstate->dtds_chunksize = chunksize) == 0)
11872		dstate->dtds_chunksize = DTRACE_DYNVAR_CHUNKSIZE;
11873
11874	if (size < (min = dstate->dtds_chunksize + sizeof (dtrace_dynhash_t)))
11875		size = min;
11876
11877	if ((base = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
11878		return (ENOMEM);
11879
11880	dstate->dtds_size = size;
11881	dstate->dtds_base = base;
11882	dstate->dtds_percpu = kmem_cache_alloc(dtrace_state_cache, KM_SLEEP);
11883	bzero(dstate->dtds_percpu, NCPU * sizeof (dtrace_dstate_percpu_t));
11884
11885	hashsize = size / (dstate->dtds_chunksize + sizeof (dtrace_dynhash_t));
11886
11887	if (hashsize != 1 && (hashsize & 1))
11888		hashsize--;
11889
11890	dstate->dtds_hashsize = hashsize;
11891	dstate->dtds_hash = dstate->dtds_base;
11892
11893	/*
11894	 * Set all of our hash buckets to point to the single sink, and (if
11895	 * it hasn't already been set), set the sink's hash value to be the
11896	 * sink sentinel value.  The sink is needed for dynamic variable
11897	 * lookups to know that they have iterated over an entire, valid hash
11898	 * chain.
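	 *
	 * Illustratively, a lookup (see dtrace_dynvar(), earlier in this
	 * file) can walk a chain without any NULL check:
	 *
	 *	for (dvar = hash[bucket].dtdh_chain;
	 *	    dvar->dtdv_hashval != DTRACE_DYNHASH_SINK;
	 *	    dvar = dvar->dtdv_next)
	 *		...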
11899	 */
11900	for (i = 0; i < hashsize; i++)
11901		dstate->dtds_hash[i].dtdh_chain = &dtrace_dynhash_sink;
11902
11903	if (dtrace_dynhash_sink.dtdv_hashval != DTRACE_DYNHASH_SINK)
11904		dtrace_dynhash_sink.dtdv_hashval = DTRACE_DYNHASH_SINK;
11905
11906	/*
11907	 * Carve the space beyond the hash table into per-CPU free lists of
11908	 * dynamic variable chunks, divided evenly among all NCPU CPUs.
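	 *
	 * Worked example (assumed numbers):  with 1MB remaining after the
	 * hash table, a 256-byte chunksize and NCPU = 4, each CPU receives
	 * a 262144-byte run of chunks; 'maxper' is rounded down to a
	 * chunksize multiple so that no chunk straddles a per-CPU boundary.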
11909	 */
11910	start = (dtrace_dynvar_t *)
11911	    ((uintptr_t)base + hashsize * sizeof (dtrace_dynhash_t));
11912	limit = (uintptr_t)base + size;
11913
11914	maxper = (limit - (uintptr_t)start) / NCPU;
11915	maxper = (maxper / dstate->dtds_chunksize) * dstate->dtds_chunksize;
11916
11917	for (i = 0; i < NCPU; i++) {
11918		dstate->dtds_percpu[i].dtdsc_free = dvar = start;
11919
11920		/*
11921		 * If we don't even have enough chunks to make it once through
11922		 * NCPUs, we're just going to allocate everything to the first
11923		 * CPU.  And if we're on the last CPU, we're going to allocate
11924		 * whatever is left over.  In either case, we set the limit to
11925		 * be the limit of the dynamic variable space.
11926		 */
11927		if (maxper == 0 || i == NCPU - 1) {
11928			limit = (uintptr_t)base + size;
11929			start = NULL;
11930		} else {
11931			limit = (uintptr_t)start + maxper;
11932			start = (dtrace_dynvar_t *)limit;
11933		}
11934
11935		ASSERT(limit <= (uintptr_t)base + size);
11936
11937		for (;;) {
11938			next = (dtrace_dynvar_t *)((uintptr_t)dvar +
11939			    dstate->dtds_chunksize);
11940
11941			if ((uintptr_t)next + dstate->dtds_chunksize >= limit)
11942				break;
11943
11944			dvar->dtdv_next = next;
11945			dvar = next;
11946		}
11947
11948		if (maxper == 0)
11949			break;
11950	}
11951
11952	return (0);
11953}
11954
11955void
11956dtrace_dstate_fini(dtrace_dstate_t *dstate)
11957{
11958	ASSERT(MUTEX_HELD(&cpu_lock));
11959
11960	if (dstate->dtds_base == NULL)
11961		return;
11962
11963	kmem_free(dstate->dtds_base, dstate->dtds_size);
11964	kmem_cache_free(dtrace_state_cache, dstate->dtds_percpu);
11965}
11966
11967static void
11968dtrace_vstate_fini(dtrace_vstate_t *vstate)
11969{
11970	/*
11971	 * Logical XOR, where are you?
11972	 */
11973	ASSERT((vstate->dtvs_nglobals == 0) ^ (vstate->dtvs_globals != NULL));
11974
11975	if (vstate->dtvs_nglobals > 0) {
11976		kmem_free(vstate->dtvs_globals, vstate->dtvs_nglobals *
11977		    sizeof (dtrace_statvar_t *));
11978	}
11979
11980	if (vstate->dtvs_ntlocals > 0) {
11981		kmem_free(vstate->dtvs_tlocals, vstate->dtvs_ntlocals *
11982		    sizeof (dtrace_difv_t));
11983	}
11984
11985	ASSERT((vstate->dtvs_nlocals == 0) ^ (vstate->dtvs_locals != NULL));
11986
11987	if (vstate->dtvs_nlocals > 0) {
11988		kmem_free(vstate->dtvs_locals, vstate->dtvs_nlocals *
11989		    sizeof (dtrace_statvar_t *));
11990	}
11991}
11992
11993static void
11994dtrace_state_clean(dtrace_state_t *state)
11995{
11996	if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE)
11997		return;
11998
11999	dtrace_dynvar_clean(&state->dts_vstate.dtvs_dynvars);
12000	dtrace_speculation_clean(state);
12001}
12002
12003static void
12004dtrace_state_deadman(dtrace_state_t *state)
12005{
12006	hrtime_t now;
12007
12008	dtrace_sync();
12009
12010	now = dtrace_gethrtime();
12011
12012	if (state != dtrace_anon.dta_state &&
12013	    now - state->dts_laststatus >= dtrace_deadman_user)
12014		return;
12015
12016	/*
12017	 * We must be sure that dts_alive never appears to be less than the
12018	 * value upon entry to dtrace_state_deadman(), and because we lack a
12019	 * dtrace_cas64(), we cannot store to it atomically.  We thus instead
12020	 * store INT64_MAX to it, followed by a memory barrier, followed by
12021	 * the new value.  This assures that dts_alive never appears to be
12022	 * less than its true value, regardless of the order in which the
12023	 * stores to the underlying storage are issued.
12024	 */
12025	state->dts_alive = INT64_MAX;
12026	dtrace_membar_producer();
12027	state->dts_alive = now;
12028}
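
/*
 * (The reader of dts_alive is the deadman check in dtrace_probe():  because
 * INT64_MAX is published before the new timestamp, a concurrent reader sees
 * either the old value, INT64_MAX, or the new value -- never a torn value
 * below the value at entry -- and thus never falsely declares the session
 * dead.)
 */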
12029
12030dtrace_state_t *
12031dtrace_state_create(dev_t *devp, cred_t *cr)
12032{
12033	minor_t minor;
12034	major_t major;
12035	char c[30];
12036	dtrace_state_t *state;
12037	dtrace_optval_t *opt;
12038	int bufsize = NCPU * sizeof (dtrace_buffer_t), i;
12039
12040	ASSERT(MUTEX_HELD(&dtrace_lock));
12041	ASSERT(MUTEX_HELD(&cpu_lock));
12042
12043	minor = (minor_t)(uintptr_t)vmem_alloc(dtrace_minor, 1,
12044	    VM_BESTFIT | VM_SLEEP);
12045
12046	if (ddi_soft_state_zalloc(dtrace_softstate, minor) != DDI_SUCCESS) {
12047		vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
12048		return (NULL);
12049	}
12050
12051	state = ddi_get_soft_state(dtrace_softstate, minor);
12052	state->dts_epid = DTRACE_EPIDNONE + 1;
12053
12054	(void) snprintf(c, sizeof (c), "dtrace_aggid_%d", minor);
12055	state->dts_aggid_arena = vmem_create(c, (void *)1, UINT32_MAX, 1,
12056	    NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
12057
12058	if (devp != NULL) {
12059		major = getemajor(*devp);
12060	} else {
12061		major = ddi_driver_major(dtrace_devi);
12062	}
12063
12064	state->dts_dev = makedevice(major, minor);
12065
12066	if (devp != NULL)
12067		*devp = state->dts_dev;
12068
12069	/*
12070	 * We allocate NCPU buffers.  On the one hand, this can be quite
12071	 * a bit of memory per instance (nearly 36K on a Starcat).  On the
12072	 * other hand, it saves an additional memory reference in the probe
12073	 * path.
12074	 */
12075	state->dts_buffer = kmem_zalloc(bufsize, KM_SLEEP);
12076	state->dts_aggbuffer = kmem_zalloc(bufsize, KM_SLEEP);
12077	state->dts_cleaner = CYCLIC_NONE;
12078	state->dts_deadman = CYCLIC_NONE;
12079	state->dts_vstate.dtvs_state = state;
12080
12081	for (i = 0; i < DTRACEOPT_MAX; i++)
12082		state->dts_options[i] = DTRACEOPT_UNSET;
12083
12084	/*
12085	 * Set the default options.
12086	 */
12087	opt = state->dts_options;
12088	opt[DTRACEOPT_BUFPOLICY] = DTRACEOPT_BUFPOLICY_SWITCH;
12089	opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_AUTO;
12090	opt[DTRACEOPT_NSPEC] = dtrace_nspec_default;
12091	opt[DTRACEOPT_SPECSIZE] = dtrace_specsize_default;
12092	opt[DTRACEOPT_CPU] = (dtrace_optval_t)DTRACE_CPUALL;
12093	opt[DTRACEOPT_STRSIZE] = dtrace_strsize_default;
12094	opt[DTRACEOPT_STACKFRAMES] = dtrace_stackframes_default;
12095	opt[DTRACEOPT_USTACKFRAMES] = dtrace_ustackframes_default;
12096	opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_default;
12097	opt[DTRACEOPT_AGGRATE] = dtrace_aggrate_default;
12098	opt[DTRACEOPT_SWITCHRATE] = dtrace_switchrate_default;
12099	opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_default;
12100	opt[DTRACEOPT_JSTACKFRAMES] = dtrace_jstackframes_default;
12101	opt[DTRACEOPT_JSTACKSTRSIZE] = dtrace_jstackstrsize_default;
12102
12103	state->dts_activity = DTRACE_ACTIVITY_INACTIVE;
12104
12105	/*
12106	 * Depending on the user credentials, we set flag bits which alter probe
12107	 * visibility or the amount of destructiveness allowed.  In the case of
12108	 * actual anonymous tracing, or the possession of all privileges, all of
12109	 * the normal checks are bypassed.
12110	 */
12111	if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
12112		state->dts_cred.dcr_visible = DTRACE_CRV_ALL;
12113		state->dts_cred.dcr_action = DTRACE_CRA_ALL;
12114	} else {
12115		/*
12116		 * Set up the credentials for this instantiation.  We take a
12117		 * hold on the credential to prevent it from disappearing on
12118		 * us; this in turn prevents the zone_t referenced by this
12119		 * credential from disappearing.  This means that we can
12120		 * examine the credential and the zone from probe context.
12121		 */
12122		crhold(cr);
12123		state->dts_cred.dcr_cred = cr;
12124
12125		/*
12126		 * CRA_PROC means "we have *some* privilege for dtrace" and
12127		 * unlocks the use of variables like pid, zonename, etc.
12128		 */
12129		if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE) ||
12130		    PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
12131			state->dts_cred.dcr_action |= DTRACE_CRA_PROC;
12132		}
12133
12134		/*
12135		 * dtrace_user allows use of syscall and profile providers.
12136		 * If the user also has proc_owner and/or proc_zone, we
12137		 * extend the scope to include additional visibility and
12138		 * destructive power.
12139		 */
12140		if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE)) {
12141			if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE)) {
12142				state->dts_cred.dcr_visible |=
12143				    DTRACE_CRV_ALLPROC;
12144
12145				state->dts_cred.dcr_action |=
12146				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
12147			}
12148
12149			if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE)) {
12150				state->dts_cred.dcr_visible |=
12151				    DTRACE_CRV_ALLZONE;
12152
12153				state->dts_cred.dcr_action |=
12154				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
12155			}
12156
12157			/*
12158			 * If we have all privs in whatever zone this is,
12159			 * we can do destructive things to processes which
12160			 * have altered credentials.
12161			 */
12162			if (priv_isequalset(priv_getset(cr, PRIV_EFFECTIVE),
12163			    cr->cr_zone->zone_privset)) {
12164				state->dts_cred.dcr_action |=
12165				    DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
12166			}
12167		}
12168
12169		/*
12170		 * Holding the dtrace_kernel privilege also implies that
12171		 * the user has the dtrace_user privilege from a visibility
12172		 * perspective.  But without further privileges, some
12173		 * destructive actions are not available.
12174		 */
12175		if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE)) {
12176			/*
12177			 * Make all probes in all zones visible.  However,
12178			 * this doesn't mean that all actions become available
12179			 * to all zones.
12180			 */
12181			state->dts_cred.dcr_visible |= DTRACE_CRV_KERNEL |
12182			    DTRACE_CRV_ALLPROC | DTRACE_CRV_ALLZONE;
12183
12184			state->dts_cred.dcr_action |= DTRACE_CRA_KERNEL |
12185			    DTRACE_CRA_PROC;
12186			/*
12187			 * Holding proc_owner means that destructive actions
12188			 * for *this* zone are allowed.
12189			 */
12190			if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
12191				state->dts_cred.dcr_action |=
12192				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
12193
12194			/*
12195			 * Holding proc_zone means that destructive actions
12196			 * for this user/group ID in all zones are allowed.
12197			 */
12198			if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
12199				state->dts_cred.dcr_action |=
12200				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
12201
12202			/*
12203			 * If we have all privs in whatever zone this is,
12204			 * we can do destructive things to processes which
12205			 * have altered credentials.
12206			 */
12207			if (priv_isequalset(priv_getset(cr, PRIV_EFFECTIVE),
12208			    cr->cr_zone->zone_privset)) {
12209				state->dts_cred.dcr_action |=
12210				    DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
12211			}
12212		}
12213
12214		/*
12215		 * Holding the dtrace_proc privilege gives control over fasttrap
12216		 * and pid providers.  We need to grant wider destructive
12217		 * privileges in the event that the user has proc_owner and/or
12218		 * proc_zone.
12219		 */
12220		if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
12221			if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
12222				state->dts_cred.dcr_action |=
12223				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
12224
12225			if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
12226				state->dts_cred.dcr_action |=
12227				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
12228		}
12229	}
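
	/*
	 * (Summary of the mapping established above, derived from the
	 * checks in this function; see the DTRACE_CRV_ and DTRACE_CRA_
	 * definitions in <sys/dtrace.h>:  all privileges or anonymous
	 * tracing yields full visibility and action rights; dtrace_proc
	 * or dtrace_user yields DTRACE_CRA_PROC; dtrace_user plus
	 * proc_owner or proc_zone widens visibility and destructive scope
	 * to all users or all zones respectively; and dtrace_kernel yields
	 * kernel visibility plus DTRACE_CRA_KERNEL.)
	 */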
12230
12231	return (state);
12232}
12233
12234static int
12235dtrace_state_buffer(dtrace_state_t *state, dtrace_buffer_t *buf, int which)
12236{
12237	dtrace_optval_t *opt = state->dts_options, size;
12238	processorid_t cpu = 0;
12239	int flags = 0, rval;
12240
12241	ASSERT(MUTEX_HELD(&dtrace_lock));
12242	ASSERT(MUTEX_HELD(&cpu_lock));
12243	ASSERT(which < DTRACEOPT_MAX);
12244	ASSERT(state->dts_activity == DTRACE_ACTIVITY_INACTIVE ||
12245	    (state == dtrace_anon.dta_state &&
12246	    state->dts_activity == DTRACE_ACTIVITY_ACTIVE));
12247
12248	if (opt[which] == DTRACEOPT_UNSET || opt[which] == 0)
12249		return (0);
12250
12251	if (opt[DTRACEOPT_CPU] != DTRACEOPT_UNSET)
12252		cpu = opt[DTRACEOPT_CPU];
12253
12254	if (which == DTRACEOPT_SPECSIZE)
12255		flags |= DTRACEBUF_NOSWITCH;
12256
12257	if (which == DTRACEOPT_BUFSIZE) {
12258		if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_RING)
12259			flags |= DTRACEBUF_RING;
12260
12261		if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_FILL)
12262			flags |= DTRACEBUF_FILL;
12263
12264		if (state != dtrace_anon.dta_state ||
12265		    state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
12266			flags |= DTRACEBUF_INACTIVE;
12267	}
12268
12269	for (size = opt[which]; size >= sizeof (uint64_t); size >>= 1) {
12270		/*
12271		 * The size must be 8-byte aligned.  If the size is not 8-byte
12272		 * aligned, drop it down by the difference.
12273		 */
12274		if (size & (sizeof (uint64_t) - 1))
12275			size -= size & (sizeof (uint64_t) - 1);
12276
12277		if (size < state->dts_reserve) {
12278			/*
12279			 * Buffers must always be large enough to accommodate
12280			 * their prereserved space.  We return E2BIG instead
12281			 * of ENOMEM in this case to allow for user-level
12282			 * software to differentiate the cases.
12283			 */
12284			return (E2BIG);
12285		}
12286
12287		rval = dtrace_buffer_alloc(buf, size, flags, cpu);
12288
12289		if (rval != ENOMEM) {
12290			opt[which] = size;
12291			return (rval);
12292		}
12293
12294		if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
12295			return (rval);
12296	}
12297
12298	return (ENOMEM);
12299}
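
/*
 * (Example of the auto-resize policy above, with assumed numbers:  a 4MB
 * request that cannot be satisfied is retried at 2MB, 1MB, and so on down
 * to sizeof (uint64_t), succeeding at the first size that fits; under
 * DTRACEOPT_BUFRESIZE_MANUAL, the first allocation failure is returned
 * instead.)
 */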
12300
12301static int
12302dtrace_state_buffers(dtrace_state_t *state)
12303{
12304	dtrace_speculation_t *spec = state->dts_speculations;
12305	int rval, i;
12306
12307	if ((rval = dtrace_state_buffer(state, state->dts_buffer,
12308	    DTRACEOPT_BUFSIZE)) != 0)
12309		return (rval);
12310
12311	if ((rval = dtrace_state_buffer(state, state->dts_aggbuffer,
12312	    DTRACEOPT_AGGSIZE)) != 0)
12313		return (rval);
12314
12315	for (i = 0; i < state->dts_nspeculations; i++) {
12316		if ((rval = dtrace_state_buffer(state,
12317		    spec[i].dtsp_buffer, DTRACEOPT_SPECSIZE)) != 0)
12318			return (rval);
12319	}
12320
12321	return (0);
12322}
12323
12324static void
12325dtrace_state_prereserve(dtrace_state_t *state)
12326{
12327	dtrace_ecb_t *ecb;
12328	dtrace_probe_t *probe;
12329
12330	state->dts_reserve = 0;
12331
12332	if (state->dts_options[DTRACEOPT_BUFPOLICY] != DTRACEOPT_BUFPOLICY_FILL)
12333		return;
12334
12335	/*
12336	 * If our buffer policy is a "fill" buffer policy, we need to set the
12337	 * prereserved space to be the space required by the END probes.
12338	 */
12339	probe = dtrace_probes[dtrace_probeid_end - 1];
12340	ASSERT(probe != NULL);
12341
12342	for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
12343		if (ecb->dte_state != state)
12344			continue;
12345
12346		state->dts_reserve += ecb->dte_needed + ecb->dte_alignment;
12347	}
12348}
12349
12350static int
12351dtrace_state_go(dtrace_state_t *state, processorid_t *cpu)
12352{
12353	dtrace_optval_t *opt = state->dts_options, sz, nspec;
12354	dtrace_speculation_t *spec;
12355	dtrace_buffer_t *buf;
12356	cyc_handler_t hdlr;
12357	cyc_time_t when;
12358	int rval = 0, i, bufsize = NCPU * sizeof (dtrace_buffer_t);
12359	dtrace_icookie_t cookie;
12360
12361	mutex_enter(&cpu_lock);
12362	mutex_enter(&dtrace_lock);
12363
12364	if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
12365		rval = EBUSY;
12366		goto out;
12367	}
12368
12369	/*
12370	 * Before we can perform any checks, we must prime all of the
12371	 * retained enablings that correspond to this state.
12372	 */
12373	dtrace_enabling_prime(state);
12374
12375	if (state->dts_destructive && !state->dts_cred.dcr_destructive) {
12376		rval = EACCES;
12377		goto out;
12378	}
12379
12380	dtrace_state_prereserve(state);
12381
12382	/*
12383	 * What we want to do now is try to allocate our speculations.
12384	 * We do not automatically resize the number of speculations; if
12385	 * this fails, we will fail the operation.
12386	 */
12387	nspec = opt[DTRACEOPT_NSPEC];
12388	ASSERT(nspec != DTRACEOPT_UNSET);
12389
12390	if (nspec > INT_MAX) {
12391		rval = ENOMEM;
12392		goto out;
12393	}
12394
12395	spec = kmem_zalloc(nspec * sizeof (dtrace_speculation_t), KM_NOSLEEP);
12396
12397	if (spec == NULL) {
12398		rval = ENOMEM;
12399		goto out;
12400	}
12401
12402	state->dts_speculations = spec;
12403	state->dts_nspeculations = (int)nspec;
12404
12405	for (i = 0; i < nspec; i++) {
12406		if ((buf = kmem_zalloc(bufsize, KM_NOSLEEP)) == NULL) {
12407			rval = ENOMEM;
12408			goto err;
12409		}
12410
12411		spec[i].dtsp_buffer = buf;
12412	}
12413
12414	if (opt[DTRACEOPT_GRABANON] != DTRACEOPT_UNSET) {
12415		if (dtrace_anon.dta_state == NULL) {
12416			rval = ENOENT;
12417			goto out;
12418		}
12419
12420		if (state->dts_necbs != 0) {
12421			rval = EALREADY;
12422			goto out;
12423		}
12424
12425		state->dts_anon = dtrace_anon_grab();
12426		ASSERT(state->dts_anon != NULL);
12427		state = state->dts_anon;
12428
12429		/*
12430		 * We want "grabanon" to be set in the grabbed state, so we'll
12431		 * copy that option value from the grabbing state into the
12432		 * grabbed state.
12433		 */
12434		state->dts_options[DTRACEOPT_GRABANON] =
12435		    opt[DTRACEOPT_GRABANON];
12436
12437		*cpu = dtrace_anon.dta_beganon;
12438
12439		/*
12440		 * If the anonymous state is active (as it almost certainly
12441		 * is if the anonymous enabling ultimately matched anything),
12442		 * we don't allow any further option processing -- but we
12443		 * don't return failure.
12444		 */
12445		if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
12446			goto out;
12447	}
12448
12449	if (opt[DTRACEOPT_AGGSIZE] != DTRACEOPT_UNSET &&
12450	    opt[DTRACEOPT_AGGSIZE] != 0) {
12451		if (state->dts_aggregations == NULL) {
12452			/*
12453			 * We're not going to create an aggregation buffer
12454			 * because we don't have any ECBs that contain
12455			 * aggregations -- set this option to 0.
12456			 */
12457			opt[DTRACEOPT_AGGSIZE] = 0;
12458		} else {
12459			/*
12460			 * If we have an aggregation buffer, we must also have
12461			 * a buffer to use as scratch.
12462			 */
12463			if (opt[DTRACEOPT_BUFSIZE] == DTRACEOPT_UNSET ||
12464			    opt[DTRACEOPT_BUFSIZE] < state->dts_needed) {
12465				opt[DTRACEOPT_BUFSIZE] = state->dts_needed;
12466			}
12467		}
12468	}
12469
12470	if (opt[DTRACEOPT_SPECSIZE] != DTRACEOPT_UNSET &&
12471	    opt[DTRACEOPT_SPECSIZE] != 0) {
12472		if (!state->dts_speculates) {
12473			/*
12474			 * We're not going to create speculation buffers
12475			 * because we don't have any ECBs that actually
12476			 * speculate -- set the speculation size to 0.
12477			 */
12478			opt[DTRACEOPT_SPECSIZE] = 0;
12479		}
12480	}
12481
12482	/*
12483	 * The bare minimum size for any buffer that we're actually going to
12484	 * do anything to is sizeof (uint64_t).
12485	 */
12486	sz = sizeof (uint64_t);
12487
12488	if ((state->dts_needed != 0 && opt[DTRACEOPT_BUFSIZE] < sz) ||
12489	    (state->dts_speculates && opt[DTRACEOPT_SPECSIZE] < sz) ||
12490	    (state->dts_aggregations != NULL && opt[DTRACEOPT_AGGSIZE] < sz)) {
12491		/*
12492		 * A buffer size has been explicitly set to 0 (or to a size
12493		 * that will be adjusted to 0) and we need the space -- we
12494		 * need to return failure.  We return ENOSPC to differentiate
12495		 * it from failing to allocate a buffer due to failure to meet
12496		 * the reserve (for which we return E2BIG).
12497		 */
12498		rval = ENOSPC;
12499		goto out;
12500	}
12501
12502	if ((rval = dtrace_state_buffers(state)) != 0)
12503		goto err;
12504
12505	if ((sz = opt[DTRACEOPT_DYNVARSIZE]) == DTRACEOPT_UNSET)
12506		sz = dtrace_dstate_defsize;
12507
12508	do {
12509		rval = dtrace_dstate_init(&state->dts_vstate.dtvs_dynvars, sz);
12510
12511		if (rval == 0)
12512			break;
12513
12514		if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
12515			goto err;
12516	} while (sz >>= 1);
12517
12518	opt[DTRACEOPT_DYNVARSIZE] = sz;
12519
12520	if (rval != 0)
12521		goto err;
12522
12523	if (opt[DTRACEOPT_STATUSRATE] > dtrace_statusrate_max)
12524		opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_max;
12525
12526	if (opt[DTRACEOPT_CLEANRATE] == 0)
12527		opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
12528
12529	if (opt[DTRACEOPT_CLEANRATE] < dtrace_cleanrate_min)
12530		opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_min;
12531
12532	if (opt[DTRACEOPT_CLEANRATE] > dtrace_cleanrate_max)
12533		opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
12534
12535	hdlr.cyh_func = (cyc_func_t)dtrace_state_clean;
12536	hdlr.cyh_arg = state;
12537	hdlr.cyh_level = CY_LOW_LEVEL;
12538
12539	when.cyt_when = 0;
12540	when.cyt_interval = opt[DTRACEOPT_CLEANRATE];
12541
12542	state->dts_cleaner = cyclic_add(&hdlr, &when);
12543
12544	hdlr.cyh_func = (cyc_func_t)dtrace_state_deadman;
12545	hdlr.cyh_arg = state;
12546	hdlr.cyh_level = CY_LOW_LEVEL;
12547
12548	when.cyt_when = 0;
12549	when.cyt_interval = dtrace_deadman_interval;
12550
12551	state->dts_alive = state->dts_laststatus = dtrace_gethrtime();
12552	state->dts_deadman = cyclic_add(&hdlr, &when);
12553
12554	state->dts_activity = DTRACE_ACTIVITY_WARMUP;
12555
12556	/*
12557	 * Now it's time to actually fire the BEGIN probe.  We need to disable
12558	 * interrupts here both to record the CPU on which we fired the BEGIN
12559	 * probe (the data from this CPU will be processed first at user
12560	 * level) and to manually activate the buffer for this CPU.
12561	 */
12562	cookie = dtrace_interrupt_disable();
12563	*cpu = CPU->cpu_id;
12564	ASSERT(state->dts_buffer[*cpu].dtb_flags & DTRACEBUF_INACTIVE);
12565	state->dts_buffer[*cpu].dtb_flags &= ~DTRACEBUF_INACTIVE;
12566
12567	dtrace_probe(dtrace_probeid_begin,
12568	    (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
12569	dtrace_interrupt_enable(cookie);
12570	/*
12571	 * We may have had an exit action from a BEGIN probe; only change our
12572	 * state to ACTIVE if we're still in WARMUP.
12573	 */
12574	ASSERT(state->dts_activity == DTRACE_ACTIVITY_WARMUP ||
12575	    state->dts_activity == DTRACE_ACTIVITY_DRAINING);
12576
12577	if (state->dts_activity == DTRACE_ACTIVITY_WARMUP)
12578		state->dts_activity = DTRACE_ACTIVITY_ACTIVE;
12579
12580	/*
12581	 * Regardless of whether we're now in ACTIVE or DRAINING, we want
12582	 * each CPU to transition its principal buffer out of the
12583	 * INACTIVE state.  Doing this assures that no CPU will suddenly begin
12584	 * processing an ECB halfway down a probe's ECB chain; all CPUs will
12585	 * atomically transition from processing none of a state's ECBs to
12586	 * processing all of them.
12587	 */
12588	dtrace_xcall(DTRACE_CPUALL,
12589	    (dtrace_xcall_t)dtrace_buffer_activate, state);
12590	goto out;
12591
12592err:
12593	dtrace_buffer_free(state->dts_buffer);
12594	dtrace_buffer_free(state->dts_aggbuffer);
12595
12596	if ((nspec = state->dts_nspeculations) == 0) {
12597		ASSERT(state->dts_speculations == NULL);
12598		goto out;
12599	}
12600
12601	spec = state->dts_speculations;
12602	ASSERT(spec != NULL);
12603
12604	for (i = 0; i < state->dts_nspeculations; i++) {
12605		if ((buf = spec[i].dtsp_buffer) == NULL)
12606			break;
12607
12608		dtrace_buffer_free(buf);
12609		kmem_free(buf, bufsize);
12610	}
12611
12612	kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
12613	state->dts_nspeculations = 0;
12614	state->dts_speculations = NULL;
12615
12616out:
12617	mutex_exit(&dtrace_lock);
12618	mutex_exit(&cpu_lock);
12619
12620	return (rval);
12621}
12622
12623static int
12624dtrace_state_stop(dtrace_state_t *state, processorid_t *cpu)
12625{
12626	dtrace_icookie_t cookie;
12627
12628	ASSERT(MUTEX_HELD(&dtrace_lock));
12629
12630	if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE &&
12631	    state->dts_activity != DTRACE_ACTIVITY_DRAINING)
12632		return (EINVAL);
12633
12634	/*
12635	 * We'll set the activity to DTRACE_ACTIVITY_DRAINING, and issue a sync
12636	 * to be sure that every CPU has seen it.  See below for the details
12637	 * on why this is done.
12638	 */
12639	state->dts_activity = DTRACE_ACTIVITY_DRAINING;
12640	dtrace_sync();
12641
12642	/*
12643	 * By this point, it is impossible for any CPU to be still processing
12644	 * with DTRACE_ACTIVITY_ACTIVE.  We can thus set our activity to
12645	 * DTRACE_ACTIVITY_COOLDOWN and know that we're not racing with any
12646	 * other CPU in dtrace_buffer_reserve().  This allows dtrace_probe()
12647	 * and callees to know that the activity is DTRACE_ACTIVITY_COOLDOWN
12648	 * iff we're in the END probe.
12649	 */
12650	state->dts_activity = DTRACE_ACTIVITY_COOLDOWN;
12651	dtrace_sync();
12652	ASSERT(state->dts_activity == DTRACE_ACTIVITY_COOLDOWN);
12653
12654	/*
12655	 * Finally, we can release the reserve and call the END probe.  We
12656	 * disable interrupts across calling the END probe to allow us to
12657	 * return the CPU on which we actually called the END probe.  This
12658	 * allows user-land to be sure that this CPU's principal buffer is
12659	 * processed last.
12660	 */
12661	state->dts_reserve = 0;
12662
12663	cookie = dtrace_interrupt_disable();
12664	*cpu = CPU->cpu_id;
12665	dtrace_probe(dtrace_probeid_end,
12666	    (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
12667	dtrace_interrupt_enable(cookie);
12668
12669	state->dts_activity = DTRACE_ACTIVITY_STOPPED;
12670	dtrace_sync();
12671
12672	return (0);
12673}
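
/*
 * (Taken together, dtrace_state_go() and dtrace_state_stop() drive the
 * activity state machine -- with a dtrace_sync() between transitions:
 *
 *	INACTIVE -> WARMUP -> ACTIVE -> DRAINING -> COOLDOWN -> STOPPED
 *
 * where DRAINING may also be entered early by an exit() action fired from
 * probe context.)
 */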
12674
12675static int
12676dtrace_state_option(dtrace_state_t *state, dtrace_optid_t option,
12677    dtrace_optval_t val)
12678{
12679	ASSERT(MUTEX_HELD(&dtrace_lock));
12680
12681	if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
12682		return (EBUSY);
12683
12684	if (option >= DTRACEOPT_MAX)
12685		return (EINVAL);
12686
12687	if (option != DTRACEOPT_CPU && val < 0)
12688		return (EINVAL);
12689
12690	switch (option) {
12691	case DTRACEOPT_DESTRUCTIVE:
12692		if (dtrace_destructive_disallow)
12693			return (EACCES);
12694
12695		state->dts_cred.dcr_destructive = 1;
12696		break;
12697
12698	case DTRACEOPT_BUFSIZE:
12699	case DTRACEOPT_DYNVARSIZE:
12700	case DTRACEOPT_AGGSIZE:
12701	case DTRACEOPT_SPECSIZE:
12702	case DTRACEOPT_STRSIZE:
12703		if (val < 0)
12704			return (EINVAL);
12705
12706		if (val >= LONG_MAX) {
12707			/*
12708			 * If this is an otherwise negative value, set it to
12709			 * the highest multiple of 128m less than LONG_MAX.
12710			 * Technically, we're adjusting the size without
12711			 * regard to the buffer resizing policy, but in fact,
12712			 * this has no effect -- if we set the buffer size to
12713			 * ~LONG_MAX and the buffer policy is ultimately set to
12714			 * be "manual", the buffer allocation is guaranteed to
12715			 * fail, if only because the allocation requires two
12716			 * buffers.  (We set the size to the highest
12717			 * multiple of 128m because it ensures that the size
12718			 * will remain a multiple of a megabyte when
12719			 * repeatedly halved -- all the way down to 15m.)
12720			 */
12721			val = LONG_MAX - (1 << 27) + 1;
12722		}
12723	}
12724
12725	state->dts_options[option] = val;
12726
12727	return (0);
12728}
12729
12730static void
12731dtrace_state_destroy(dtrace_state_t *state)
12732{
12733	dtrace_ecb_t *ecb;
12734	dtrace_vstate_t *vstate = &state->dts_vstate;
12735	minor_t minor = getminor(state->dts_dev);
12736	int i, bufsize = NCPU * sizeof (dtrace_buffer_t);
12737	dtrace_speculation_t *spec = state->dts_speculations;
12738	int nspec = state->dts_nspeculations;
12739	uint32_t match;
12740
12741	ASSERT(MUTEX_HELD(&dtrace_lock));
12742	ASSERT(MUTEX_HELD(&cpu_lock));
12743
12744	/*
12745	 * First, retract any retained enablings for this state.
12746	 */
12747	dtrace_enabling_retract(state);
12748	ASSERT(state->dts_nretained == 0);
12749
12750	if (state->dts_activity == DTRACE_ACTIVITY_ACTIVE ||
12751	    state->dts_activity == DTRACE_ACTIVITY_DRAINING) {
12752		/*
12753		 * We have managed to come into dtrace_state_destroy() on a
12754		 * hot enabling -- almost certainly because of a disorderly
12755		 * shutdown of a consumer.  (That is, a consumer that is
12756		 * exiting without having called dtrace_stop().) In this case,
12757		 * we're going to set our activity to be KILLED, and then
12758		 * issue a sync to be sure that everyone is out of probe
12759		 * context before we start blowing away ECBs.
12760		 */
12761		state->dts_activity = DTRACE_ACTIVITY_KILLED;
12762		dtrace_sync();
12763	}
12764
12765	/*
12766	 * Release the credential hold we took in dtrace_state_create().
12767	 */
12768	if (state->dts_cred.dcr_cred != NULL)
12769		crfree(state->dts_cred.dcr_cred);
12770
12771	/*
12772	 * Now we can safely disable and destroy any enabled probes.  Because
12773	 * any DTRACE_PRIV_KERNEL probes may actually be slowing our progress
12774	 * (especially if they're all enabled), we take two passes through the
12775	 * ECBs:  in the first, we disable just DTRACE_PRIV_KERNEL probes, and
12776	 * in the second we disable whatever is left over.
12777	 */
12778	for (match = DTRACE_PRIV_KERNEL; ; match = 0) {
12779		for (i = 0; i < state->dts_necbs; i++) {
12780			if ((ecb = state->dts_ecbs[i]) == NULL)
12781				continue;
12782
12783			if (match && ecb->dte_probe != NULL) {
12784				dtrace_probe_t *probe = ecb->dte_probe;
12785				dtrace_provider_t *prov = probe->dtpr_provider;
12786
12787				if (!(prov->dtpv_priv.dtpp_flags & match))
12788					continue;
12789			}
12790
12791			dtrace_ecb_disable(ecb);
12792			dtrace_ecb_destroy(ecb);
12793		}
12794
12795		if (!match)
12796			break;
12797	}
12798
12799	/*
12800	 * Before we free the buffers, perform one more sync to assure that
12801	 * every CPU is out of probe context.
12802	 */
12803	dtrace_sync();
12804
12805	dtrace_buffer_free(state->dts_buffer);
12806	dtrace_buffer_free(state->dts_aggbuffer);
12807
12808	for (i = 0; i < nspec; i++)
12809		dtrace_buffer_free(spec[i].dtsp_buffer);
12810
12811	if (state->dts_cleaner != CYCLIC_NONE)
12812		cyclic_remove(state->dts_cleaner);
12813
12814	if (state->dts_deadman != CYCLIC_NONE)
12815		cyclic_remove(state->dts_deadman);
12816
12817	dtrace_dstate_fini(&vstate->dtvs_dynvars);
12818	dtrace_vstate_fini(vstate);
12819	kmem_free(state->dts_ecbs, state->dts_necbs * sizeof (dtrace_ecb_t *));
12820
12821	if (state->dts_aggregations != NULL) {
12822#ifdef DEBUG
12823		for (i = 0; i < state->dts_naggregations; i++)
12824			ASSERT(state->dts_aggregations[i] == NULL);
12825#endif
12826		ASSERT(state->dts_naggregations > 0);
12827		kmem_free(state->dts_aggregations,
12828		    state->dts_naggregations * sizeof (dtrace_aggregation_t *));
12829	}
12830
12831	kmem_free(state->dts_buffer, bufsize);
12832	kmem_free(state->dts_aggbuffer, bufsize);
12833
12834	for (i = 0; i < nspec; i++)
12835		kmem_free(spec[i].dtsp_buffer, bufsize);
12836
12837	kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
12838
12839	dtrace_format_destroy(state);
12840
12841	vmem_destroy(state->dts_aggid_arena);
12842	ddi_soft_state_free(dtrace_softstate, minor);
12843	vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
12844}
12845
12846/*
12847 * DTrace Anonymous Enabling Functions
12848 */
12849static dtrace_state_t *
12850dtrace_anon_grab(void)
12851{
12852	dtrace_state_t *state;
12853
12854	ASSERT(MUTEX_HELD(&dtrace_lock));
12855
12856	if ((state = dtrace_anon.dta_state) == NULL) {
12857		ASSERT(dtrace_anon.dta_enabling == NULL);
12858		return (NULL);
12859	}
12860
12861	ASSERT(dtrace_anon.dta_enabling != NULL);
12862	ASSERT(dtrace_retained != NULL);
12863
12864	dtrace_enabling_destroy(dtrace_anon.dta_enabling);
12865	dtrace_anon.dta_enabling = NULL;
12866	dtrace_anon.dta_state = NULL;
12867
12868	return (state);
12869}
12870
12871static void
12872dtrace_anon_property(void)
12873{
12874	int i, rv;
12875	dtrace_state_t *state;
12876	dof_hdr_t *dof;
12877	char c[32];		/* enough for "dof-data-" + digits */
12878
12879	ASSERT(MUTEX_HELD(&dtrace_lock));
12880	ASSERT(MUTEX_HELD(&cpu_lock));
12881
12882	for (i = 0; ; i++) {
12883		(void) snprintf(c, sizeof (c), "dof-data-%d", i);
12884
12885		dtrace_err_verbose = 1;
12886
12887		if ((dof = dtrace_dof_property(c)) == NULL) {
12888			dtrace_err_verbose = 0;
12889			break;
12890		}
12891
12892		/*
12893		 * We want to create anonymous state, so we need to transition
12894		 * the kernel debugger to indicate that DTrace is active.  If
12895		 * this fails (e.g. because the debugger has modified text in
12896		 * some way), we won't continue with the processing.
12897		 */
12898		if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
12899			cmn_err(CE_NOTE, "kernel debugger active; anonymous "
12900			    "enabling ignored.");
12901			dtrace_dof_destroy(dof);
12902			break;
12903		}
12904
12905		/*
12906		 * If we haven't allocated an anonymous state, we'll do so now.
12907		 */
12908		if ((state = dtrace_anon.dta_state) == NULL) {
12909			state = dtrace_state_create(NULL, NULL);
12910			dtrace_anon.dta_state = state;
12911
12912			if (state == NULL) {
12913				/*
12914				 * This basically shouldn't happen:  the only
12915				 * failure mode from dtrace_state_create() is a
12916				 * failure of ddi_soft_state_zalloc() that
12917				 * itself should never happen.  Still, the
12918				 * interface allows for a failure mode, and
12919				 * we want to fail as gracefully as possible:
12920				 * we'll emit an error message and cease
12921				 * processing anonymous state in this case.
12922				 */
12923				cmn_err(CE_WARN, "failed to create "
12924				    "anonymous state");
12925				dtrace_dof_destroy(dof);
12926				break;
12927			}
12928		}
12929
12930		rv = dtrace_dof_slurp(dof, &state->dts_vstate, CRED(),
12931		    &dtrace_anon.dta_enabling, 0, B_TRUE);
12932
12933		if (rv == 0)
12934			rv = dtrace_dof_options(dof, state);
12935
12936		dtrace_err_verbose = 0;
12937		dtrace_dof_destroy(dof);
12938
12939		if (rv != 0) {
12940			/*
12941			 * This is malformed DOF; chuck any anonymous state
12942			 * that we created.
12943			 */
12944			ASSERT(dtrace_anon.dta_enabling == NULL);
12945			dtrace_state_destroy(state);
12946			dtrace_anon.dta_state = NULL;
12947			break;
12948		}
12949
12950		ASSERT(dtrace_anon.dta_enabling != NULL);
12951	}
12952
12953	if (dtrace_anon.dta_enabling != NULL) {
12954		int rval;
12955
12956		/*
12957		 * dtrace_enabling_retain() can only fail because we are
12958		 * trying to retain more enablings than are allowed -- but
12959		 * we only have one anonymous enabling, and we are guaranteed
12960		 * to be allowed at least one retained enabling; we assert
12961		 * that dtrace_enabling_retain() returns success.
12962		 */
12963		rval = dtrace_enabling_retain(dtrace_anon.dta_enabling);
12964		ASSERT(rval == 0);
12965
12966		dtrace_enabling_dump(dtrace_anon.dta_enabling);
12967	}
12968}
12969
12970/*
12971 * DTrace Helper Functions
12972 */
12973static void
12974dtrace_helper_trace(dtrace_helper_action_t *helper,
12975    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate, int where)
12976{
12977	uint32_t size, next, nnext, i;
12978	dtrace_helptrace_t *ent;
12979	uint16_t flags = cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
12980
12981	if (!dtrace_helptrace_enabled)
12982		return;
12983
12984	ASSERT(vstate->dtvs_nlocals <= dtrace_helptrace_nlocals);
12985
12986	/*
12987	 * What would a tracing framework be without its own tracing
12988	 * framework?  (Well, a hell of a lot simpler, for starters...)
12989	 */
12990	size = sizeof (dtrace_helptrace_t) + dtrace_helptrace_nlocals *
12991	    sizeof (uint64_t) - sizeof (uint64_t);
12992
12993	/*
12994	 * Iterate until we can allocate a slot in the trace buffer.
12995	 */
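	/*
	 * (This is a lock-free bump allocator over a circular buffer:  each
	 * racing CPU proposes next + size, wrapping back to the buffer's
	 * start when the record would not fit, and only the winner of the
	 * dtrace_cas32() proceeds to fill in its slot.)
	 */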
12996	do {
12997		next = dtrace_helptrace_next;
12998
12999		if (next + size < dtrace_helptrace_bufsize) {
13000			nnext = next + size;
13001		} else {
13002			nnext = size;
13003		}
13004	} while (dtrace_cas32(&dtrace_helptrace_next, next, nnext) != next);
13005
13006	/*
13007	 * We have our slot; fill it in.
13008	 */
13009	if (nnext == size)
13010		next = 0;
13011
13012	ent = (dtrace_helptrace_t *)&dtrace_helptrace_buffer[next];
13013	ent->dtht_helper = helper;
13014	ent->dtht_where = where;
13015	ent->dtht_nlocals = vstate->dtvs_nlocals;
13016
13017	ent->dtht_fltoffs = (mstate->dtms_present & DTRACE_MSTATE_FLTOFFS) ?
13018	    mstate->dtms_fltoffs : -1;
13019	ent->dtht_fault = DTRACE_FLAGS2FLT(flags);
13020	ent->dtht_illval = cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
13021
13022	for (i = 0; i < vstate->dtvs_nlocals; i++) {
13023		dtrace_statvar_t *svar;
13024
13025		if ((svar = vstate->dtvs_locals[i]) == NULL)
13026			continue;
13027
13028		ASSERT(svar->dtsv_size >= NCPU * sizeof (uint64_t));
13029		ent->dtht_locals[i] =
13030		    ((uint64_t *)(uintptr_t)svar->dtsv_data)[CPU->cpu_id];
13031	}
13032}
13033
13034static uint64_t
13035dtrace_helper(int which, dtrace_mstate_t *mstate,
13036    dtrace_state_t *state, uint64_t arg0, uint64_t arg1)
13037{
13038	uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
13039	uint64_t sarg0 = mstate->dtms_arg[0];
13040	uint64_t sarg1 = mstate->dtms_arg[1];
13041	uint64_t rval;
13042	dtrace_helpers_t *helpers = curproc->p_dtrace_helpers;
13043	dtrace_helper_action_t *helper;
13044	dtrace_vstate_t *vstate;
13045	dtrace_difo_t *pred;
13046	int i, trace = dtrace_helptrace_enabled;
13047
13048	ASSERT(which >= 0 && which < DTRACE_NHELPER_ACTIONS);
13049
13050	if (helpers == NULL)
13051		return (0);
13052
13053	if ((helper = helpers->dthps_actions[which]) == NULL)
13054		return (0);
13055
13056	vstate = &helpers->dthps_vstate;
13057	mstate->dtms_arg[0] = arg0;
13058	mstate->dtms_arg[1] = arg1;
13059
13060	/*
13061	 * Now iterate over each helper.  If its predicate evaluates to 'true',
13062	 * we'll call the corresponding actions.  Note that the below calls
13063	 * to dtrace_dif_emulate() may set faults in machine state.  This is
13064	 * okay:  our caller (the outer dtrace_dif_emulate()) will simply plow
13065	 * the stored DIF offset with its own (which is the desired behavior).
13066	 * over the stored DIF offset with its own (which is the desired behavior).
13067	 * from machine state; this is okay, too.
13068	 */
13069	for (; helper != NULL; helper = helper->dtha_next) {
13070		if ((pred = helper->dtha_predicate) != NULL) {
13071			if (trace)
13072				dtrace_helper_trace(helper, mstate, vstate, 0);
13073
13074			if (!dtrace_dif_emulate(pred, mstate, vstate, state))
13075				goto next;
13076
13077			if (*flags & CPU_DTRACE_FAULT)
13078				goto err;
13079		}
13080
13081		for (i = 0; i < helper->dtha_nactions; i++) {
13082			if (trace)
13083				dtrace_helper_trace(helper,
13084				    mstate, vstate, i + 1);
13085
13086			rval = dtrace_dif_emulate(helper->dtha_actions[i],
13087			    mstate, vstate, state);
13088
13089			if (*flags & CPU_DTRACE_FAULT)
13090				goto err;
13091		}
13092
13093next:
13094		if (trace)
13095			dtrace_helper_trace(helper, mstate, vstate,
13096			    DTRACE_HELPTRACE_NEXT);
13097	}
13098
13099	if (trace)
13100		dtrace_helper_trace(helper, mstate, vstate,
13101		    DTRACE_HELPTRACE_DONE);
13102
13103	/*
13104	 * Restore the arg0 that we saved upon entry.
13105	 */
13106	mstate->dtms_arg[0] = sarg0;
13107	mstate->dtms_arg[1] = sarg1;
13108
13109	return (rval);
13110
13111err:
13112	if (trace)
13113		dtrace_helper_trace(helper, mstate, vstate,
13114		    DTRACE_HELPTRACE_ERR);
13115
13116	/*
13117	 * Restore the arg0 that we saved upon entry.
13118	 */
13119	mstate->dtms_arg[0] = sarg0;
13120	mstate->dtms_arg[1] = sarg1;
13121
13122	return (0);
13123}
13124
13125static void
13126dtrace_helper_action_destroy(dtrace_helper_action_t *helper,
13127    dtrace_vstate_t *vstate)
13128{
13129	int i;
13130
13131	if (helper->dtha_predicate != NULL)
13132		dtrace_difo_release(helper->dtha_predicate, vstate);
13133
13134	for (i = 0; i < helper->dtha_nactions; i++) {
13135		ASSERT(helper->dtha_actions[i] != NULL);
13136		dtrace_difo_release(helper->dtha_actions[i], vstate);
13137	}
13138
13139	kmem_free(helper->dtha_actions,
13140	    helper->dtha_nactions * sizeof (dtrace_difo_t *));
13141	kmem_free(helper, sizeof (dtrace_helper_action_t));
13142}
13143
13144static int
13145dtrace_helper_destroygen(int gen)
13146{
13147	proc_t *p = curproc;
13148	dtrace_helpers_t *help = p->p_dtrace_helpers;
13149	dtrace_vstate_t *vstate;
13150	int i;
13151
13152	ASSERT(MUTEX_HELD(&dtrace_lock));
13153
13154	if (help == NULL || gen > help->dthps_generation)
13155		return (EINVAL);
13156
13157	vstate = &help->dthps_vstate;
13158
13159	for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
13160		dtrace_helper_action_t *last = NULL, *h, *next;
13161
13162		for (h = help->dthps_actions[i]; h != NULL; h = next) {
13163			next = h->dtha_next;
13164
13165			if (h->dtha_generation == gen) {
13166				if (last != NULL) {
13167					last->dtha_next = next;
13168				} else {
13169					help->dthps_actions[i] = next;
13170				}
13171
13172				dtrace_helper_action_destroy(h, vstate);
13173			} else {
13174				last = h;
13175			}
13176		}
13177	}
13178
13179	/*
13180	 * Iterate until we've cleared out all helper providers with the
13181	 * given generation number.
13182	 */
13183	for (;;) {
13184		dtrace_helper_provider_t *prov;
13185
13186		/*
13187		 * Look for a helper provider with the right generation. We
13188		 * have to start back at the beginning of the list each time
13189		 * because we drop dtrace_lock. It's unlikely that we'll make
13190		 * more than two passes.
13191		 */
13192		for (i = 0; i < help->dthps_nprovs; i++) {
13193			prov = help->dthps_provs[i];
13194
13195			if (prov->dthp_generation == gen)
13196				break;
13197		}
13198
13199		/*
13200		 * If there were no matches, we're done.
13201		 */
13202		if (i == help->dthps_nprovs)
13203			break;
13204
13205		/*
13206		 * Move the last helper provider into this slot.
13207		 */
13208		help->dthps_nprovs--;
13209		help->dthps_provs[i] = help->dthps_provs[help->dthps_nprovs];
13210		help->dthps_provs[help->dthps_nprovs] = NULL;
13211
13212		mutex_exit(&dtrace_lock);
13213
13214		/*
13215		 * If we have a meta provider, remove this helper provider.
13216		 */
13217		mutex_enter(&dtrace_meta_lock);
13218		if (dtrace_meta_pid != NULL) {
13219			ASSERT(dtrace_deferred_pid == NULL);
13220			dtrace_helper_provider_remove(&prov->dthp_prov,
13221			    p->p_pid);
13222		}
13223		mutex_exit(&dtrace_meta_lock);
13224
13225		dtrace_helper_provider_destroy(prov);
13226
13227		mutex_enter(&dtrace_lock);
13228	}
13229
13230	return (0);
13231}
13232
13233static int
13234dtrace_helper_validate(dtrace_helper_action_t *helper)
13235{
13236	int err = 0, i;
13237	dtrace_difo_t *dp;
13238
13239	if ((dp = helper->dtha_predicate) != NULL)
13240		err += dtrace_difo_validate_helper(dp);
13241
13242	for (i = 0; i < helper->dtha_nactions; i++)
13243		err += dtrace_difo_validate_helper(helper->dtha_actions[i]);
13244
13245	return (err == 0);
13246}
13247
13248static int
13249dtrace_helper_action_add(int which, dtrace_ecbdesc_t *ep)
13250{
13251	dtrace_helpers_t *help;
13252	dtrace_helper_action_t *helper, *last;
13253	dtrace_actdesc_t *act;
13254	dtrace_vstate_t *vstate;
13255	dtrace_predicate_t *pred;
13256	int count = 0, nactions = 0, i;
13257
13258	if (which < 0 || which >= DTRACE_NHELPER_ACTIONS)
13259		return (EINVAL);
13260
13261	help = curproc->p_dtrace_helpers;
13262	last = help->dthps_actions[which];
13263	vstate = &help->dthps_vstate;
13264
13265	for (count = 0; last != NULL; last = last->dtha_next) {
13266		count++;
13267		if (last->dtha_next == NULL)
13268			break;
13269	}
13270
13271	/*
13272	 * If we already have dtrace_helper_actions_max helper actions for this
13273	 * helper action type, we'll refuse to add a new one.
13274	 */
13275	if (count >= dtrace_helper_actions_max)
13276		return (ENOSPC);
13277
13278	helper = kmem_zalloc(sizeof (dtrace_helper_action_t), KM_SLEEP);
13279	helper->dtha_generation = help->dthps_generation;
13280
13281	if ((pred = ep->dted_pred.dtpdd_predicate) != NULL) {
13282		ASSERT(pred->dtp_difo != NULL);
13283		dtrace_difo_hold(pred->dtp_difo);
13284		helper->dtha_predicate = pred->dtp_difo;
13285	}
13286
13287	for (act = ep->dted_action; act != NULL; act = act->dtad_next) {
13288		if (act->dtad_kind != DTRACEACT_DIFEXPR)
13289			goto err;
13290
13291		if (act->dtad_difo == NULL)
13292			goto err;
13293
13294		nactions++;
13295	}
13296
13297	helper->dtha_actions = kmem_zalloc(sizeof (dtrace_difo_t *) *
13298	    (helper->dtha_nactions = nactions), KM_SLEEP);
13299
13300	for (act = ep->dted_action, i = 0; act != NULL; act = act->dtad_next) {
13301		dtrace_difo_hold(act->dtad_difo);
13302		helper->dtha_actions[i++] = act->dtad_difo;
13303	}
13304
13305	if (!dtrace_helper_validate(helper))
13306		goto err;
13307
13308	if (last == NULL) {
13309		help->dthps_actions[which] = helper;
13310	} else {
13311		last->dtha_next = helper;
13312	}
13313
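	/*
	 * If this helper grew the number of local variables, the size of
	 * each subsequent helper trace entry grows with it (see the size
	 * computation in dtrace_helper_trace()); the cursor is reset so
	 * tracing restarts cleanly at the top of the buffer.
	 */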
13314	if (vstate->dtvs_nlocals > dtrace_helptrace_nlocals) {
13315		dtrace_helptrace_nlocals = vstate->dtvs_nlocals;
13316		dtrace_helptrace_next = 0;
13317	}
13318
13319	return (0);
13320err:
13321	dtrace_helper_action_destroy(helper, vstate);
13322	return (EINVAL);
13323}
13324
13325static void
13326dtrace_helper_provider_register(proc_t *p, dtrace_helpers_t *help,
13327    dof_helper_t *dofhp)
13328{
13329	ASSERT(MUTEX_NOT_HELD(&dtrace_lock));
13330
13331	mutex_enter(&dtrace_meta_lock);
13332	mutex_enter(&dtrace_lock);
13333
13334	if (!dtrace_attached() || dtrace_meta_pid == NULL) {
13335		/*
13336		 * If the dtrace module is loaded but not attached, or if
13337		 * there isn't a meta provider registered to deal with
13338		 * these provider descriptions, we need to postpone creating
13339		 * the actual providers until later.
13340		 */
13341
13342		if (help->dthps_next == NULL && help->dthps_prev == NULL &&
13343		    dtrace_deferred_pid != help) {
13344			help->dthps_deferred = 1;
13345			help->dthps_pid = p->p_pid;
13346			help->dthps_next = dtrace_deferred_pid;
13347			help->dthps_prev = NULL;
13348			if (dtrace_deferred_pid != NULL)
13349				dtrace_deferred_pid->dthps_prev = help;
13350			dtrace_deferred_pid = help;
13351		}
13352
13353		mutex_exit(&dtrace_lock);
13354
13355	} else if (dofhp != NULL) {
13356		/*
13357		 * If the dtrace module is loaded and we have a particular
13358		 * helper provider description, pass that off to the
13359		 * meta provider.
13360		 */
13361
13362		mutex_exit(&dtrace_lock);
13363
13364		dtrace_helper_provide(dofhp, p->p_pid);
13365
13366	} else {
13367		/*
13368		 * Otherwise, just pass all the helper provider descriptions
13369		 * off to the meta provider.
13370		 */
13371
13372		int i;
13373		mutex_exit(&dtrace_lock);
13374
13375		for (i = 0; i < help->dthps_nprovs; i++) {
13376			dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
13377			    p->p_pid);
13378		}
13379	}
13380
13381	mutex_exit(&dtrace_meta_lock);
13382}
13383
13384static int
13385dtrace_helper_provider_add(dof_helper_t *dofhp, int gen)
13386{
13387	dtrace_helpers_t *help;
13388	dtrace_helper_provider_t *hprov, **tmp_provs;
13389	uint_t tmp_maxprovs, i;
13390
13391	ASSERT(MUTEX_HELD(&dtrace_lock));
13392
13393	help = curproc->p_dtrace_helpers;
13394	ASSERT(help != NULL);
13395
13396	/*
13397	 * If we already have dtrace_helper_providers_max helper providers,
13398	 * we refuse to add a new one.
13399	 */
13400	if (help->dthps_nprovs >= dtrace_helper_providers_max)
13401		return (ENOSPC);
13402
13403	/*
13404	 * Check to make sure this isn't a duplicate.
13405	 */
13406	for (i = 0; i < help->dthps_nprovs; i++) {
13407		if (dofhp->dofhp_addr ==
13408		    help->dthps_provs[i]->dthp_prov.dofhp_addr)
13409			return (EALREADY);
13410	}
13411
13412	hprov = kmem_zalloc(sizeof (dtrace_helper_provider_t), KM_SLEEP);
13413	hprov->dthp_prov = *dofhp;
13414	hprov->dthp_ref = 1;
13415	hprov->dthp_generation = gen;
13416
13417	/*
13418	 * Allocate a bigger table for helper providers if it's already full.
13419	 */
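	/*
	 * (The table grows geometrically -- 2, 4, 8, ... -- and is
	 * clamped at dtrace_helper_providers_max; together with the
	 * ENOSPC check above, this guarantees room for at least one
	 * more entry.)
	 */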
13420	if (help->dthps_maxprovs == help->dthps_nprovs) {
13421		tmp_maxprovs = help->dthps_maxprovs;
13422		tmp_provs = help->dthps_provs;
13423
13424		if (help->dthps_maxprovs == 0)
13425			help->dthps_maxprovs = 2;
13426		else
13427			help->dthps_maxprovs *= 2;
13428		if (help->dthps_maxprovs > dtrace_helper_providers_max)
13429			help->dthps_maxprovs = dtrace_helper_providers_max;
13430
13431		ASSERT(tmp_maxprovs < help->dthps_maxprovs);
13432
13433		help->dthps_provs = kmem_zalloc(help->dthps_maxprovs *
13434		    sizeof (dtrace_helper_provider_t *), KM_SLEEP);
13435
13436		if (tmp_provs != NULL) {
13437			bcopy(tmp_provs, help->dthps_provs, tmp_maxprovs *
13438			    sizeof (dtrace_helper_provider_t *));
13439			kmem_free(tmp_provs, tmp_maxprovs *
13440			    sizeof (dtrace_helper_provider_t *));
13441		}
13442	}
13443
13444	help->dthps_provs[help->dthps_nprovs] = hprov;
13445	help->dthps_nprovs++;
13446
13447	return (0);
13448}
13449
13450static void
13451dtrace_helper_provider_destroy(dtrace_helper_provider_t *hprov)
13452{
13453	mutex_enter(&dtrace_lock);
13454
13455	if (--hprov->dthp_ref == 0) {
13456		dof_hdr_t *dof;
13457		mutex_exit(&dtrace_lock);
13458		dof = (dof_hdr_t *)(uintptr_t)hprov->dthp_prov.dofhp_dof;
13459		dtrace_dof_destroy(dof);
13460		kmem_free(hprov, sizeof (dtrace_helper_provider_t));
13461	} else {
13462		mutex_exit(&dtrace_lock);
13463	}
13464}
13465
13466static int
13467dtrace_helper_provider_validate(dof_hdr_t *dof, dof_sec_t *sec)
13468{
13469	uintptr_t daddr = (uintptr_t)dof;
13470	dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
13471	dof_provider_t *provider;
13472	dof_probe_t *probe;
13473	uint8_t *arg;
13474	char *strtab, *typestr;
13475	dof_stridx_t typeidx;
13476	size_t typesz;
13477	uint_t nprobes, j, k;
13478
13479	ASSERT(sec->dofs_type == DOF_SECT_PROVIDER);
13480
13481	if (sec->dofs_offset & (sizeof (uint_t) - 1)) {
13482		dtrace_dof_error(dof, "misaligned section offset");
13483		return (-1);
13484	}
13485
13486	/*
13487	 * The section needs to be large enough to contain the DOF provider
13488	 * structure appropriate for the given version.
13489	 */
13490	if (sec->dofs_size <
13491	    ((dof->dofh_ident[DOF_ID_VERSION] == DOF_VERSION_1) ?
13492	    offsetof(dof_provider_t, dofpv_prenoffs) :
13493	    sizeof (dof_provider_t))) {
13494		dtrace_dof_error(dof, "provider section too small");
13495		return (-1);
13496	}
13497
13498	provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
13499	str_sec = dtrace_dof_sect(dof, DOF_SECT_STRTAB, provider->dofpv_strtab);
13500	prb_sec = dtrace_dof_sect(dof, DOF_SECT_PROBES, provider->dofpv_probes);
13501	arg_sec = dtrace_dof_sect(dof, DOF_SECT_PRARGS, provider->dofpv_prargs);
13502	off_sec = dtrace_dof_sect(dof, DOF_SECT_PROFFS, provider->dofpv_proffs);
13503
13504	if (str_sec == NULL || prb_sec == NULL ||
13505	    arg_sec == NULL || off_sec == NULL)
13506		return (-1);
13507
13508	enoff_sec = NULL;
13509
13510	if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
13511	    provider->dofpv_prenoffs != DOF_SECT_NONE &&
13512	    (enoff_sec = dtrace_dof_sect(dof, DOF_SECT_PRENOFFS,
13513	    provider->dofpv_prenoffs)) == NULL)
13514		return (-1);
13515
13516	strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
13517
13518	if (provider->dofpv_name >= str_sec->dofs_size ||
13519	    strlen(strtab + provider->dofpv_name) >= DTRACE_PROVNAMELEN) {
13520		dtrace_dof_error(dof, "invalid provider name");
13521		return (-1);
13522	}
13523
13524	if (prb_sec->dofs_entsize == 0 ||
13525	    prb_sec->dofs_entsize > prb_sec->dofs_size) {
13526		dtrace_dof_error(dof, "invalid entry size");
13527		return (-1);
13528	}
13529
13530	if (prb_sec->dofs_entsize & (sizeof (uintptr_t) - 1)) {
13531		dtrace_dof_error(dof, "misaligned entry size");
13532		return (-1);
13533	}
13534
13535	if (off_sec->dofs_entsize != sizeof (uint32_t)) {
13536		dtrace_dof_error(dof, "invalid entry size");
13537		return (-1);
13538	}
13539
13540	if (off_sec->dofs_offset & (sizeof (uint32_t) - 1)) {
13541		dtrace_dof_error(dof, "misaligned section offset");
13542		return (-1);
13543	}
13544
13545	if (arg_sec->dofs_entsize != sizeof (uint8_t)) {
13546		dtrace_dof_error(dof, "invalid entry size");
13547		return (-1);
13548	}
13549
13550	arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
13551
13552	nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
13553
13554	/*
13555	 * Take a pass through the probes to check for errors.
13556	 */
13557	for (j = 0; j < nprobes; j++) {
13558		probe = (dof_probe_t *)(uintptr_t)(daddr +
13559		    prb_sec->dofs_offset + j * prb_sec->dofs_entsize);
13560
13561		if (probe->dofpr_func >= str_sec->dofs_size) {
13562			dtrace_dof_error(dof, "invalid function name");
13563			return (-1);
13564		}
13565
13566		if (strlen(strtab + probe->dofpr_func) >= DTRACE_FUNCNAMELEN) {
13567			dtrace_dof_error(dof, "function name too long");
13568			return (-1);
13569		}
13570
13571		if (probe->dofpr_name >= str_sec->dofs_size ||
13572		    strlen(strtab + probe->dofpr_name) >= DTRACE_NAMELEN) {
13573			dtrace_dof_error(dof, "invalid probe name");
13574			return (-1);
13575		}
13576
13577		/*
13578		 * The offset count must not wrap the index, and the offsets
13579		 * must also not overflow the section's data.
13580		 */
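		/*
		 * (The first clause below is the usual unsigned overflow
		 * test: if offidx + noffs wraps around, the sum comes out
		 * smaller than offidx itself.)
		 */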
13581		if (probe->dofpr_offidx + probe->dofpr_noffs <
13582		    probe->dofpr_offidx ||
13583		    (probe->dofpr_offidx + probe->dofpr_noffs) *
13584		    off_sec->dofs_entsize > off_sec->dofs_size) {
13585			dtrace_dof_error(dof, "invalid probe offset");
13586			return (-1);
13587		}
13588
13589		if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1) {
13590			/*
13591			 * If there's no is-enabled offset section, make sure
13592			 * there aren't any is-enabled offsets. Otherwise
13593			 * perform the same checks as for probe offsets
13594			 * (immediately above).
13595			 */
13596			if (enoff_sec == NULL) {
13597				if (probe->dofpr_enoffidx != 0 ||
13598				    probe->dofpr_nenoffs != 0) {
13599					dtrace_dof_error(dof, "is-enabled "
13600					    "offsets with null section");
13601					return (-1);
13602				}
13603			} else if (probe->dofpr_enoffidx +
13604			    probe->dofpr_nenoffs < probe->dofpr_enoffidx ||
13605			    (probe->dofpr_enoffidx + probe->dofpr_nenoffs) *
13606			    enoff_sec->dofs_entsize > enoff_sec->dofs_size) {
13607				dtrace_dof_error(dof, "invalid is-enabled "
13608				    "offset");
13609				return (-1);
13610			}
13611
13612			if (probe->dofpr_noffs + probe->dofpr_nenoffs == 0) {
13613				dtrace_dof_error(dof, "zero probe and "
13614				    "is-enabled offsets");
13615				return (-1);
13616			}
13617		} else if (probe->dofpr_noffs == 0) {
13618			dtrace_dof_error(dof, "zero probe offsets");
13619			return (-1);
13620		}
13621
13622		if (probe->dofpr_argidx + probe->dofpr_xargc <
13623		    probe->dofpr_argidx ||
13624		    (probe->dofpr_argidx + probe->dofpr_xargc) *
13625		    arg_sec->dofs_entsize > arg_sec->dofs_size) {
13626			dtrace_dof_error(dof, "invalid args");
13627			return (-1);
13628		}
13629
13630		typeidx = probe->dofpr_nargv;
13631		typestr = strtab + probe->dofpr_nargv;
13632		for (k = 0; k < probe->dofpr_nargc; k++) {
13633			if (typeidx >= str_sec->dofs_size) {
13634				dtrace_dof_error(dof, "bad "
13635				    "native argument type");
13636				return (-1);
13637			}
13638
13639			typesz = strlen(typestr) + 1;
13640			if (typesz > DTRACE_ARGTYPELEN) {
13641				dtrace_dof_error(dof, "native "
13642				    "argument type too long");
13643				return (-1);
13644			}
13645			typeidx += typesz;
13646			typestr += typesz;
13647		}
13648
13649		typeidx = probe->dofpr_xargv;
13650		typestr = strtab + probe->dofpr_xargv;
13651		for (k = 0; k < probe->dofpr_xargc; k++) {
13652			if (arg[probe->dofpr_argidx + k] > probe->dofpr_nargc) {
13653				dtrace_dof_error(dof, "bad "
13654				    "native argument index");
13655				return (-1);
13656			}
13657
13658			if (typeidx >= str_sec->dofs_size) {
13659				dtrace_dof_error(dof, "bad "
13660				    "translated argument type");
13661				return (-1);
13662			}
13663
13664			typesz = strlen(typestr) + 1;
13665			if (typesz > DTRACE_ARGTYPELEN) {
13666				dtrace_dof_error(dof, "translated argument "
13667				    "type too long");
13668				return (-1);
13669			}
13670
13671			typeidx += typesz;
13672			typestr += typesz;
13673		}
13674	}
13675
13676	return (0);
13677}
13678
13679static int
13680dtrace_helper_slurp(dof_hdr_t *dof, dof_helper_t *dhp)
13681{
13682	dtrace_helpers_t *help;
13683	dtrace_vstate_t *vstate;
13684	dtrace_enabling_t *enab = NULL;
13685	int i, gen, rv, nhelpers = 0, nprovs = 0, destroy = 1;
13686	uintptr_t daddr = (uintptr_t)dof;
13687
13688	ASSERT(MUTEX_HELD(&dtrace_lock));
13689
13690	if ((help = curproc->p_dtrace_helpers) == NULL)
13691		help = dtrace_helpers_create(curproc);
13692
13693	vstate = &help->dthps_vstate;
13694
13695	if ((rv = dtrace_dof_slurp(dof, vstate, NULL, &enab,
13696	    dhp != NULL ? dhp->dofhp_addr : 0, B_FALSE)) != 0) {
13697		dtrace_dof_destroy(dof);
13698		return (rv);
13699	}
13700
13701	/*
13702	 * Look for helper providers and validate their descriptions.
13703	 */
13704	if (dhp != NULL) {
13705		for (i = 0; i < dof->dofh_secnum; i++) {
13706			dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
13707			    dof->dofh_secoff + i * dof->dofh_secsize);
13708
13709			if (sec->dofs_type != DOF_SECT_PROVIDER)
13710				continue;
13711
13712			if (dtrace_helper_provider_validate(dof, sec) != 0) {
13713				dtrace_enabling_destroy(enab);
13714				dtrace_dof_destroy(dof);
13715				return (-1);
13716			}
13717
13718			nprovs++;
13719		}
13720	}
13721
13722	/*
13723	 * Now we need to walk through the ECB descriptions in the enabling.
13724	 */
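	/*
	 * Only dtrace:helper:ustack enablings are accepted here; anything
	 * else in the DOF leaves nhelpers short of dten_ndesc and draws
	 * the "unmatched helpers" complaint below.
	 */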
13725	for (i = 0; i < enab->dten_ndesc; i++) {
13726		dtrace_ecbdesc_t *ep = enab->dten_desc[i];
13727		dtrace_probedesc_t *desc = &ep->dted_probe;
13728
13729		if (strcmp(desc->dtpd_provider, "dtrace") != 0)
13730			continue;
13731
13732		if (strcmp(desc->dtpd_mod, "helper") != 0)
13733			continue;
13734
13735		if (strcmp(desc->dtpd_func, "ustack") != 0)
13736			continue;
13737
13738		if ((rv = dtrace_helper_action_add(DTRACE_HELPER_ACTION_USTACK,
13739		    ep)) != 0) {
13740			/*
13741			 * Adding this helper action failed -- we are now going
13742			 * to rip out the entire generation and return failure.
13743			 */
13744			(void) dtrace_helper_destroygen(help->dthps_generation);
13745			dtrace_enabling_destroy(enab);
13746			dtrace_dof_destroy(dof);
13747			return (-1);
13748		}
13749
13750		nhelpers++;
13751	}
13752
13753	if (nhelpers < enab->dten_ndesc)
13754		dtrace_dof_error(dof, "unmatched helpers");
13755
13756	gen = help->dthps_generation++;
13757	dtrace_enabling_destroy(enab);
13758
13759	if (dhp != NULL && nprovs > 0) {
13760		dhp->dofhp_dof = (uint64_t)(uintptr_t)dof;
13761		if (dtrace_helper_provider_add(dhp, gen) == 0) {
13762			mutex_exit(&dtrace_lock);
13763			dtrace_helper_provider_register(curproc, help, dhp);
13764			mutex_enter(&dtrace_lock);
13765
13766			destroy = 0;
13767		}
13768	}
13769
13770	if (destroy)
13771		dtrace_dof_destroy(dof);
13772
13773	return (gen);
13774}
13775
13776static dtrace_helpers_t *
13777dtrace_helpers_create(proc_t *p)
13778{
13779	dtrace_helpers_t *help;
13780
13781	ASSERT(MUTEX_HELD(&dtrace_lock));
13782	ASSERT(p->p_dtrace_helpers == NULL);
13783
13784	help = kmem_zalloc(sizeof (dtrace_helpers_t), KM_SLEEP);
13785	help->dthps_actions = kmem_zalloc(sizeof (dtrace_helper_action_t *) *
13786	    DTRACE_NHELPER_ACTIONS, KM_SLEEP);
13787
13788	p->p_dtrace_helpers = help;
13789	dtrace_helpers++;
13790
13791	return (help);
13792}
13793
13794static void
13795dtrace_helpers_destroy(void)
13796{
13797	dtrace_helpers_t *help;
13798	dtrace_vstate_t *vstate;
13799	proc_t *p = curproc;
13800	int i;
13801
13802	mutex_enter(&dtrace_lock);
13803
13804	ASSERT(p->p_dtrace_helpers != NULL);
13805	ASSERT(dtrace_helpers > 0);
13806
13807	help = p->p_dtrace_helpers;
13808	vstate = &help->dthps_vstate;
13809
13810	/*
13811	 * We're now going to lose the help from this process.
13812	 */
13813	p->p_dtrace_helpers = NULL;
13814	dtrace_sync();
13815
13816	/*
13817	 * Destroy the helper actions.
13818	 */
13819	for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
13820		dtrace_helper_action_t *h, *next;
13821
13822		for (h = help->dthps_actions[i]; h != NULL; h = next) {
13823			next = h->dtha_next;
13824			dtrace_helper_action_destroy(h, vstate);
13826		}
13827	}
13828
13829	mutex_exit(&dtrace_lock);
13830
13831	/*
13832	 * Destroy the helper providers.
13833	 */
13834	if (help->dthps_maxprovs > 0) {
13835		mutex_enter(&dtrace_meta_lock);
13836		if (dtrace_meta_pid != NULL) {
13837			ASSERT(dtrace_deferred_pid == NULL);
13838
13839			for (i = 0; i < help->dthps_nprovs; i++) {
13840				dtrace_helper_provider_remove(
13841				    &help->dthps_provs[i]->dthp_prov, p->p_pid);
13842			}
13843		} else {
13844			mutex_enter(&dtrace_lock);
13845			ASSERT(help->dthps_deferred == 0 ||
13846			    help->dthps_next != NULL ||
13847			    help->dthps_prev != NULL ||
13848			    help == dtrace_deferred_pid);
13849
13850			/*
13851			 * Remove the helper from the deferred list.
13852			 */
13853			if (help->dthps_next != NULL)
13854				help->dthps_next->dthps_prev = help->dthps_prev;
13855			if (help->dthps_prev != NULL)
13856				help->dthps_prev->dthps_next = help->dthps_next;
13857			if (dtrace_deferred_pid == help) {
13858				dtrace_deferred_pid = help->dthps_next;
13859				ASSERT(help->dthps_prev == NULL);
13860			}
13861
13862			mutex_exit(&dtrace_lock);
13863		}
13864
13865		mutex_exit(&dtrace_meta_lock);
13866
13867		for (i = 0; i < help->dthps_nprovs; i++) {
13868			dtrace_helper_provider_destroy(help->dthps_provs[i]);
13869		}
13870
13871		kmem_free(help->dthps_provs, help->dthps_maxprovs *
13872		    sizeof (dtrace_helper_provider_t *));
13873	}
13874
13875	mutex_enter(&dtrace_lock);
13876
13877	dtrace_vstate_fini(&help->dthps_vstate);
13878	kmem_free(help->dthps_actions,
13879	    sizeof (dtrace_helper_action_t *) * DTRACE_NHELPER_ACTIONS);
13880	kmem_free(help, sizeof (dtrace_helpers_t));
13881
13882	--dtrace_helpers;
13883	mutex_exit(&dtrace_lock);
13884}
13885
13886static void
13887dtrace_helpers_duplicate(proc_t *from, proc_t *to)
13888{
13889	dtrace_helpers_t *help, *newhelp;
13890	dtrace_helper_action_t *helper, *new, *last;
13891	dtrace_difo_t *dp;
13892	dtrace_vstate_t *vstate;
13893	int i, j, sz, hasprovs = 0;
13894
13895	mutex_enter(&dtrace_lock);
13896	ASSERT(from->p_dtrace_helpers != NULL);
13897	ASSERT(dtrace_helpers > 0);
13898
13899	help = from->p_dtrace_helpers;
13900	newhelp = dtrace_helpers_create(to);
13901	ASSERT(to->p_dtrace_helpers != NULL);
13902
13903	newhelp->dthps_generation = help->dthps_generation;
13904	vstate = &newhelp->dthps_vstate;
13905
13906	/*
13907	 * Duplicate the helper actions.
13908	 */
13909	for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
13910		if ((helper = help->dthps_actions[i]) == NULL)
13911			continue;
13912
13913		for (last = NULL; helper != NULL; helper = helper->dtha_next) {
13914			new = kmem_zalloc(sizeof (dtrace_helper_action_t),
13915			    KM_SLEEP);
13916			new->dtha_generation = helper->dtha_generation;
13917
13918			if ((dp = helper->dtha_predicate) != NULL) {
13919				dp = dtrace_difo_duplicate(dp, vstate);
13920				new->dtha_predicate = dp;
13921			}
13922
13923			new->dtha_nactions = helper->dtha_nactions;
13924			sz = sizeof (dtrace_difo_t *) * new->dtha_nactions;
13925			new->dtha_actions = kmem_alloc(sz, KM_SLEEP);
13926
13927			for (j = 0; j < new->dtha_nactions; j++) {
13928				dtrace_difo_t *dp = helper->dtha_actions[j];
13929
13930				ASSERT(dp != NULL);
13931				dp = dtrace_difo_duplicate(dp, vstate);
13932				new->dtha_actions[j] = dp;
13933			}
13934
13935			if (last != NULL) {
13936				last->dtha_next = new;
13937			} else {
13938				newhelp->dthps_actions[i] = new;
13939			}
13940
13941			last = new;
13942		}
13943	}
13944
13945	/*
13946	 * Duplicate the helper providers and register them with the
13947	 * DTrace framework.
13948	 */
13949	if (help->dthps_nprovs > 0) {
13950		newhelp->dthps_nprovs = help->dthps_nprovs;
13951		newhelp->dthps_maxprovs = help->dthps_nprovs;
13952		newhelp->dthps_provs = kmem_alloc(newhelp->dthps_nprovs *
13953		    sizeof (dtrace_helper_provider_t *), KM_SLEEP);
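		/*
		 * Note that the providers themselves are shared with the
		 * parent by reference rather than deep-copied; dthp_ref
		 * ensures that the backing DOF is freed only when the
		 * last process referring to it is torn down.
		 */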
13954		for (i = 0; i < newhelp->dthps_nprovs; i++) {
13955			newhelp->dthps_provs[i] = help->dthps_provs[i];
13956			newhelp->dthps_provs[i]->dthp_ref++;
13957		}
13958
13959		hasprovs = 1;
13960	}
13961
13962	mutex_exit(&dtrace_lock);
13963
13964	if (hasprovs)
13965		dtrace_helper_provider_register(to, newhelp, NULL);
13966}
13967
13968/*
13969 * DTrace Hook Functions
13970 */
13971static void
13972dtrace_module_loaded(struct modctl *ctl)
13973{
13974	dtrace_provider_t *prv;
13975
13976	mutex_enter(&dtrace_provider_lock);
13977	mutex_enter(&mod_lock);
13978
13979	ASSERT(ctl->mod_busy);
13980
13981	/*
13982	 * We're going to call each provider's per-module provide operation
13983	 * specifying only this module.
13984	 */
13985	for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next)
13986		prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
13987
13988	mutex_exit(&mod_lock);
13989	mutex_exit(&dtrace_provider_lock);
13990
13991	/*
13992	 * If we have any retained enablings, we need to match against them.
13993	 * Enabling probes requires that cpu_lock be held, and we cannot hold
13994	 * cpu_lock here -- it is legal for cpu_lock to be held when loading a
13995	 * module.  (In particular, this happens when loading scheduling
13996	 * classes.)  So if we have any retained enablings, we need to dispatch
13997	 * our task queue to do the match for us.
13998	 */
13999	mutex_enter(&dtrace_lock);
14000
14001	if (dtrace_retained == NULL) {
14002		mutex_exit(&dtrace_lock);
14003		return;
14004	}
14005
14006	(void) taskq_dispatch(dtrace_taskq,
14007	    (task_func_t *)dtrace_enabling_matchall, NULL, TQ_SLEEP);
14008
14009	mutex_exit(&dtrace_lock);
14010
14011	/*
14012	 * And now, for a little heuristic sleaze:  in general, we want to
14013	 * match modules as soon as they load.  However, we cannot guarantee
14014	 * this, because it would lead us to the lock ordering violation
14015	 * outlined above.  The common case, of course, is that cpu_lock is
14016	 * _not_ held -- so we delay here for a clock tick, hoping that that's
14017	 * long enough for the task queue to do its work.  If it's not, it's
14018	 * not a serious problem -- it just means that the module that we
14019	 * just loaded may not be immediately instrumentable.
14020	 */
14021	delay(1);
14022}
14023
14024static void
14025dtrace_module_unloaded(struct modctl *ctl)
14026{
14027	dtrace_probe_t template, *probe, *first, *next;
14028	dtrace_provider_t *prov;
14029
14030	template.dtpr_mod = ctl->mod_modname;
14031
14032	mutex_enter(&dtrace_provider_lock);
14033	mutex_enter(&mod_lock);
14034	mutex_enter(&dtrace_lock);
14035
14036	if (dtrace_bymod == NULL) {
14037		/*
14038		 * The DTrace module is loaded (obviously) but not attached;
14039		 * we don't have any work to do.
14040		 */
14041		mutex_exit(&dtrace_provider_lock);
14042		mutex_exit(&mod_lock);
14043		mutex_exit(&dtrace_lock);
14044		return;
14045	}
14046
14047	for (probe = first = dtrace_hash_lookup(dtrace_bymod, &template);
14048	    probe != NULL; probe = probe->dtpr_nextmod) {
14049		if (probe->dtpr_ecb != NULL) {
14050			mutex_exit(&dtrace_provider_lock);
14051			mutex_exit(&mod_lock);
14052			mutex_exit(&dtrace_lock);
14053
14054			/*
14055			 * This shouldn't _actually_ be possible -- we're
14056			 * unloading a module that has an enabled probe in it.
14057			 * (It's normally up to the provider to make sure that
14058			 * this can't happen.)  However, because dtps_enable()
14059			 * doesn't have a failure mode, there can be an
14060			 * enable/unload race.  Upshot:  we don't want to
14061			 * assert, but we're not going to disable the
14062			 * probe, either.
14063			 */
14064			if (dtrace_err_verbose) {
14065				cmn_err(CE_WARN, "unloaded module '%s' had "
14066				    "enabled probes", ctl->mod_modname);
14067			}
14068
14069			return;
14070		}
14071	}
14072
14073	probe = first;
14074
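	/*
	 * Walk the module's probes again, pulling each one out of the
	 * probe array and the hash chains while threading it onto a
	 * private list (reusing dtpr_nextmod) for destruction after the
	 * dtrace_sync() below.
	 */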
14075	for (first = NULL; probe != NULL; probe = next) {
14076		ASSERT(dtrace_probes[probe->dtpr_id - 1] == probe);
14077
14078		dtrace_probes[probe->dtpr_id - 1] = NULL;
14079
14080		next = probe->dtpr_nextmod;
14081		dtrace_hash_remove(dtrace_bymod, probe);
14082		dtrace_hash_remove(dtrace_byfunc, probe);
14083		dtrace_hash_remove(dtrace_byname, probe);
14084
14085		if (first == NULL) {
14086			first = probe;
14087			probe->dtpr_nextmod = NULL;
14088		} else {
14089			probe->dtpr_nextmod = first;
14090			first = probe;
14091		}
14092	}
14093
14094	/*
14095	 * We've removed all of the module's probes from the hash chains and
14096	 * from the probe array.  Now issue a dtrace_sync() to be sure that
14097	 * everyone has cleared out from any probe array processing.
14098	 */
14099	dtrace_sync();
14100
14101	for (probe = first; probe != NULL; probe = first) {
14102		first = probe->dtpr_nextmod;
14103		prov = probe->dtpr_provider;
14104		prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, probe->dtpr_id,
14105		    probe->dtpr_arg);
14106		kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
14107		kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
14108		kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
14109		vmem_free(dtrace_arena, (void *)(uintptr_t)probe->dtpr_id, 1);
14110		kmem_free(probe, sizeof (dtrace_probe_t));
14111	}
14112
14113	mutex_exit(&dtrace_lock);
14114	mutex_exit(&mod_lock);
14115	mutex_exit(&dtrace_provider_lock);
14116}
14117
14118void
14119dtrace_suspend(void)
14120{
14121	dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_suspend));
14122}
14123
14124void
14125dtrace_resume(void)
14126{
14127	dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_resume));
14128}
14129
14130static int
14131dtrace_cpu_setup(cpu_setup_t what, processorid_t cpu)
14132{
14133	ASSERT(MUTEX_HELD(&cpu_lock));
14134	mutex_enter(&dtrace_lock);
14135
14136	switch (what) {
14137	case CPU_CONFIG: {
14138		dtrace_state_t *state;
14139		dtrace_optval_t *opt, rs, c;
14140
14141		/*
14142		 * For now, we only allocate a new buffer for anonymous state.
14143		 */
14144		if ((state = dtrace_anon.dta_state) == NULL)
14145			break;
14146
14147		if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
14148			break;
14149
14150		opt = state->dts_options;
14151		c = opt[DTRACEOPT_CPU];
14152
14153		if (c != DTRACE_CPUALL && c != DTRACEOPT_UNSET && c != cpu)
14154			break;
14155
14156		/*
14157		 * Regardless of what the actual policy is, we're going to
14158		 * temporarily set our resize policy to be manual.  We're
14159		 * also going to temporarily set our CPU option to denote
14160		 * the newly configured CPU.
14161		 */
14162		rs = opt[DTRACEOPT_BUFRESIZE];
14163		opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_MANUAL;
14164		opt[DTRACEOPT_CPU] = (dtrace_optval_t)cpu;
14165
14166		(void) dtrace_state_buffers(state);
14167
14168		opt[DTRACEOPT_BUFRESIZE] = rs;
14169		opt[DTRACEOPT_CPU] = c;
14170
14171		break;
14172	}
14173
14174	case CPU_UNCONFIG:
14175		/*
14176		 * We don't free the buffer in the CPU_UNCONFIG case.  (The
14177		 * buffer will be freed when the consumer exits.)
14178		 */
14179		break;
14180
14181	default:
14182		break;
14183	}
14184
14185	mutex_exit(&dtrace_lock);
14186	return (0);
14187}
14188
14189static void
14190dtrace_cpu_setup_initial(processorid_t cpu)
14191{
14192	(void) dtrace_cpu_setup(CPU_CONFIG, cpu);
14193}
14194
14195static void
14196dtrace_toxrange_add(uintptr_t base, uintptr_t limit)
14197{
14198	if (dtrace_toxranges >= dtrace_toxranges_max) {
14199		int osize, nsize;
14200		dtrace_toxrange_t *range;
14201
14202		osize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
14203
14204		if (osize == 0) {
14205			ASSERT(dtrace_toxrange == NULL);
14206			ASSERT(dtrace_toxranges_max == 0);
14207			dtrace_toxranges_max = 1;
14208		} else {
14209			dtrace_toxranges_max <<= 1;
14210		}
14211
14212		nsize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
14213		range = kmem_zalloc(nsize, KM_SLEEP);
14214
14215		if (dtrace_toxrange != NULL) {
14216			ASSERT(osize != 0);
14217			bcopy(dtrace_toxrange, range, osize);
14218			kmem_free(dtrace_toxrange, osize);
14219		}
14220
14221		dtrace_toxrange = range;
14222	}
14223
14224	ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_base == 0);
14225	ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_limit == 0);
14226
14227	dtrace_toxrange[dtrace_toxranges].dtt_base = base;
14228	dtrace_toxrange[dtrace_toxranges].dtt_limit = limit;
14229	dtrace_toxranges++;
14230}
14231
14232/*
14233 * DTrace Driver Cookbook Functions
14234 */
14235/*ARGSUSED*/
14236static int
14237dtrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
14238{
14239	dtrace_provider_id_t id;
14240	dtrace_state_t *state = NULL;
14241	dtrace_enabling_t *enab;
14242
14243	mutex_enter(&cpu_lock);
14244	mutex_enter(&dtrace_provider_lock);
14245	mutex_enter(&dtrace_lock);
14246
14247	if (ddi_soft_state_init(&dtrace_softstate,
14248	    sizeof (dtrace_state_t), 0) != 0) {
14249		cmn_err(CE_NOTE, "/dev/dtrace failed to initialize soft state");
14250		mutex_exit(&cpu_lock);
14251		mutex_exit(&dtrace_provider_lock);
14252		mutex_exit(&dtrace_lock);
14253		return (DDI_FAILURE);
14254	}
14255
14256	if (ddi_create_minor_node(devi, DTRACEMNR_DTRACE, S_IFCHR,
14257	    DTRACEMNRN_DTRACE, DDI_PSEUDO, NULL) == DDI_FAILURE ||
14258	    ddi_create_minor_node(devi, DTRACEMNR_HELPER, S_IFCHR,
14259	    DTRACEMNRN_HELPER, DDI_PSEUDO, NULL) == DDI_FAILURE) {
14260		cmn_err(CE_NOTE, "/dev/dtrace couldn't create minor nodes");
14261		ddi_remove_minor_node(devi, NULL);
14262		ddi_soft_state_fini(&dtrace_softstate);
14263		mutex_exit(&cpu_lock);
14264		mutex_exit(&dtrace_provider_lock);
14265		mutex_exit(&dtrace_lock);
14266		return (DDI_FAILURE);
14267	}
14268
14269	ddi_report_dev(devi);
14270	dtrace_devi = devi;
14271
14272	dtrace_modload = dtrace_module_loaded;
14273	dtrace_modunload = dtrace_module_unloaded;
14274	dtrace_cpu_init = dtrace_cpu_setup_initial;
14275	dtrace_helpers_cleanup = dtrace_helpers_destroy;
14276	dtrace_helpers_fork = dtrace_helpers_duplicate;
14277	dtrace_cpustart_init = dtrace_suspend;
14278	dtrace_cpustart_fini = dtrace_resume;
14279	dtrace_debugger_init = dtrace_suspend;
14280	dtrace_debugger_fini = dtrace_resume;
14281
14282	register_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
14283
14284	ASSERT(MUTEX_HELD(&cpu_lock));
14285
14286	dtrace_arena = vmem_create("dtrace", (void *)1, UINT32_MAX, 1,
14287	    NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
14288	dtrace_minor = vmem_create("dtrace_minor", (void *)DTRACEMNRN_CLONE,
14289	    UINT32_MAX - DTRACEMNRN_CLONE, 1, NULL, NULL, NULL, 0,
14290	    VM_SLEEP | VMC_IDENTIFIER);
14291	dtrace_taskq = taskq_create("dtrace_taskq", 1, maxclsyspri,
14292	    1, INT_MAX, 0);
14293
14294	dtrace_state_cache = kmem_cache_create("dtrace_state_cache",
14295	    sizeof (dtrace_dstate_percpu_t) * NCPU, DTRACE_STATE_ALIGN,
14296	    NULL, NULL, NULL, NULL, NULL, 0);
14297
14298	ASSERT(MUTEX_HELD(&cpu_lock));
14299	dtrace_bymod = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_mod),
14300	    offsetof(dtrace_probe_t, dtpr_nextmod),
14301	    offsetof(dtrace_probe_t, dtpr_prevmod));
14302
14303	dtrace_byfunc = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_func),
14304	    offsetof(dtrace_probe_t, dtpr_nextfunc),
14305	    offsetof(dtrace_probe_t, dtpr_prevfunc));
14306
14307	dtrace_byname = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_name),
14308	    offsetof(dtrace_probe_t, dtpr_nextname),
14309	    offsetof(dtrace_probe_t, dtpr_prevname));
14310
14311	if (dtrace_retain_max < 1) {
14312		cmn_err(CE_WARN, "illegal value (%lu) for dtrace_retain_max; "
14313		    "setting to 1", dtrace_retain_max);
14314		dtrace_retain_max = 1;
14315	}
14316
14317	/*
14318	 * Now discover our toxic ranges.
14319	 */
14320	dtrace_toxic_ranges(dtrace_toxrange_add);
14321
14322	/*
14323	 * Before we register ourselves as a provider to our own framework,
14324	 * we would like to assert that dtrace_provider is NULL -- but that's
14325	 * not true if we were loaded as a dependency of a DTrace provider.
14326	 * Once we've registered, we can assert that dtrace_provider is our
14327	 * pseudo provider.
14328	 */
14329	(void) dtrace_register("dtrace", &dtrace_provider_attr,
14330	    DTRACE_PRIV_NONE, 0, &dtrace_provider_ops, NULL, &id);
14331
14332	ASSERT(dtrace_provider != NULL);
14333	ASSERT((dtrace_provider_id_t)dtrace_provider == id);
14334
14335	dtrace_probeid_begin = dtrace_probe_create((dtrace_provider_id_t)
14336	    dtrace_provider, NULL, NULL, "BEGIN", 0, NULL);
14337	dtrace_probeid_end = dtrace_probe_create((dtrace_provider_id_t)
14338	    dtrace_provider, NULL, NULL, "END", 0, NULL);
14339	dtrace_probeid_error = dtrace_probe_create((dtrace_provider_id_t)
14340	    dtrace_provider, NULL, NULL, "ERROR", 1, NULL);
14341
14342	dtrace_anon_property();
14343	mutex_exit(&cpu_lock);
14344
14345	/*
14346	 * If DTrace helper tracing is enabled, we need to allocate the
14347	 * trace buffer and initialize the values.
14348	 */
14349	if (dtrace_helptrace_enabled) {
14350		ASSERT(dtrace_helptrace_buffer == NULL);
14351		dtrace_helptrace_buffer =
14352		    kmem_zalloc(dtrace_helptrace_bufsize, KM_SLEEP);
14353		dtrace_helptrace_next = 0;
14354	}
14355
14356	/*
14357	 * If there are already providers, we must ask them to provide their
14358	 * probes, and then match any anonymous enabling against them.  Note
14359	 * that there should be no other retained enablings at this time:
14360	 * the only retained enabling should be the anonymous one.
14362	 */
14363	if (dtrace_anon.dta_enabling != NULL) {
14364		ASSERT(dtrace_retained == dtrace_anon.dta_enabling);
14365
14366		dtrace_enabling_provide(NULL);
14367		state = dtrace_anon.dta_state;
14368
14369		/*
14370		 * We couldn't hold cpu_lock across the above call to
14371		 * dtrace_enabling_provide(), but we must hold it to actually
14372		 * enable the probes.  We have to drop all of our locks, pick
14373		 * up cpu_lock, and regain our locks before matching the
14374		 * retained anonymous enabling.
14375		 */
14376		mutex_exit(&dtrace_lock);
14377		mutex_exit(&dtrace_provider_lock);
14378
14379		mutex_enter(&cpu_lock);
14380		mutex_enter(&dtrace_provider_lock);
14381		mutex_enter(&dtrace_lock);
14382
14383		if ((enab = dtrace_anon.dta_enabling) != NULL)
14384			(void) dtrace_enabling_match(enab, NULL);
14385
14386		mutex_exit(&cpu_lock);
14387	}
14388
14389	mutex_exit(&dtrace_lock);
14390	mutex_exit(&dtrace_provider_lock);
14391
14392	if (state != NULL) {
14393		/*
14394		 * If we created any anonymous state, set it going now.
14395		 */
14396		(void) dtrace_state_go(state, &dtrace_anon.dta_beganon);
14397	}
14398
14399	return (DDI_SUCCESS);
14400}
14401
14402/*ARGSUSED*/
14403static int
14404dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
14405{
14406	dtrace_state_t *state;
14407	uint32_t priv;
14408	uid_t uid;
14409	zoneid_t zoneid;
14410
14411	if (getminor(*devp) == DTRACEMNRN_HELPER)
14412		return (0);
14413
14414	/*
14415	 * If this wasn't an open with the "helper" minor, then it must be
14416	 * the "dtrace" minor.
14417	 */
14418	ASSERT(getminor(*devp) == DTRACEMNRN_DTRACE);
14419
14420	/*
14421	 * If no DTRACE_PRIV_* bits are set in the credential, then the
14422	 * caller lacks sufficient permission to do anything with DTrace.
14423	 */
14424	dtrace_cred2priv(cred_p, &priv, &uid, &zoneid);
14425	if (priv == DTRACE_PRIV_NONE)
14426		return (EACCES);
14427
14428	/*
14429	 * Ask all providers to provide all their probes.
14430	 */
14431	mutex_enter(&dtrace_provider_lock);
14432	dtrace_probe_provide(NULL, NULL);
14433	mutex_exit(&dtrace_provider_lock);
14434
14435	mutex_enter(&cpu_lock);
14436	mutex_enter(&dtrace_lock);
14437	dtrace_opens++;
14438	dtrace_membar_producer();
14439
14440	/*
14441	 * If the kernel debugger is active (that is, if the kernel debugger
14442	 * modified text in some way), we won't allow the open.
14443	 */
14444	if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
14445		dtrace_opens--;
14446		mutex_exit(&cpu_lock);
14447		mutex_exit(&dtrace_lock);
14448		return (EBUSY);
14449	}
14450
14451	state = dtrace_state_create(devp, cred_p);
14452	mutex_exit(&cpu_lock);
14453
14454	if (state == NULL) {
14455		if (--dtrace_opens == 0)
14456			(void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
14457		mutex_exit(&dtrace_lock);
14458		return (EAGAIN);
14459	}
14460
14461	mutex_exit(&dtrace_lock);
14462
14463	return (0);
14464}
14465
14466/*ARGSUSED*/
14467static int
14468dtrace_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
14469{
14470	minor_t minor = getminor(dev);
14471	dtrace_state_t *state;
14472
14473	if (minor == DTRACEMNRN_HELPER)
14474		return (0);
14475
14476	state = ddi_get_soft_state(dtrace_softstate, minor);
14477
14478	mutex_enter(&cpu_lock);
14479	mutex_enter(&dtrace_lock);
14480
14481	if (state->dts_anon) {
14482		/*
14483		 * There is anonymous state. Destroy that first.
14484		 */
14485		ASSERT(dtrace_anon.dta_state == NULL);
14486		dtrace_state_destroy(state->dts_anon);
14487	}
14488
14489	dtrace_state_destroy(state);
14490	ASSERT(dtrace_opens > 0);
14491	if (--dtrace_opens == 0)
14492		(void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
14493
14494	mutex_exit(&dtrace_lock);
14495	mutex_exit(&cpu_lock);
14496
14497	return (0);
14498}
14499
14500/*ARGSUSED*/
14501static int
14502dtrace_ioctl_helper(int cmd, intptr_t arg, int *rv)
14503{
14504	int rval;
14505	dof_helper_t help, *dhp = NULL;
14506
14507	switch (cmd) {
14508	case DTRACEHIOC_ADDDOF:
14509		if (copyin((void *)arg, &help, sizeof (help)) != 0) {
14510			dtrace_dof_error(NULL, "failed to copyin DOF helper");
14511			return (EFAULT);
14512		}
14513
14514		dhp = &help;
14515		arg = (intptr_t)help.dofhp_dof;
14516		/*FALLTHROUGH*/
14517
14518	case DTRACEHIOC_ADD: {
14519		dof_hdr_t *dof = dtrace_dof_copyin(arg, &rval);
14520
14521		if (dof == NULL)
14522			return (rval);
14523
14524		mutex_enter(&dtrace_lock);
14525
14526		/*
14527		 * dtrace_helper_slurp() takes responsibility for the dof --
14528		 * it may free it now or it may save it and free it later.
14529		 */
14530		if ((rval = dtrace_helper_slurp(dof, dhp)) != -1) {
14531			*rv = rval;
14532			rval = 0;
14533		} else {
14534			rval = EINVAL;
14535		}
14536
14537		mutex_exit(&dtrace_lock);
14538		return (rval);
14539	}
14540
14541	case DTRACEHIOC_REMOVE: {
14542		mutex_enter(&dtrace_lock);
14543		rval = dtrace_helper_destroygen(arg);
14544		mutex_exit(&dtrace_lock);
14545
14546		return (rval);
14547	}
14548
14549	default:
14550		break;
14551	}
14552
14553	return (ENOTTY);
14554}
14555
14556/*ARGSUSED*/
14557static int
14558dtrace_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv)
14559{
14560	minor_t minor = getminor(dev);
14561	dtrace_state_t *state;
14562	int rval;
14563
14564	if (minor == DTRACEMNRN_HELPER)
14565		return (dtrace_ioctl_helper(cmd, arg, rv));
14566
14567	state = ddi_get_soft_state(dtrace_softstate, minor);
14568
14569	if (state->dts_anon) {
14570		ASSERT(dtrace_anon.dta_state == NULL);
14571		state = state->dts_anon;
14572	}
14573
14574	switch (cmd) {
14575	case DTRACEIOC_PROVIDER: {
14576		dtrace_providerdesc_t pvd;
14577		dtrace_provider_t *pvp;
14578
14579		if (copyin((void *)arg, &pvd, sizeof (pvd)) != 0)
14580			return (EFAULT);
14581
14582		pvd.dtvd_name[DTRACE_PROVNAMELEN - 1] = '\0';
14583		mutex_enter(&dtrace_provider_lock);
14584
14585		for (pvp = dtrace_provider; pvp != NULL; pvp = pvp->dtpv_next) {
14586			if (strcmp(pvp->dtpv_name, pvd.dtvd_name) == 0)
14587				break;
14588		}
14589
14590		mutex_exit(&dtrace_provider_lock);
14591
14592		if (pvp == NULL)
14593			return (ESRCH);
14594
14595		bcopy(&pvp->dtpv_priv, &pvd.dtvd_priv, sizeof (dtrace_ppriv_t));
14596		bcopy(&pvp->dtpv_attr, &pvd.dtvd_attr, sizeof (dtrace_pattr_t));
14597		if (copyout(&pvd, (void *)arg, sizeof (pvd)) != 0)
14598			return (EFAULT);
14599
14600		return (0);
14601	}
14602
14603	case DTRACEIOC_EPROBE: {
14604		dtrace_eprobedesc_t epdesc;
14605		dtrace_ecb_t *ecb;
14606		dtrace_action_t *act;
14607		void *buf;
14608		size_t size;
14609		uintptr_t dest;
14610		int nrecs;
14611
14612		if (copyin((void *)arg, &epdesc, sizeof (epdesc)) != 0)
14613			return (EFAULT);
14614
14615		mutex_enter(&dtrace_lock);
14616
14617		if ((ecb = dtrace_epid2ecb(state, epdesc.dtepd_epid)) == NULL) {
14618			mutex_exit(&dtrace_lock);
14619			return (EINVAL);
14620		}
14621
14622		if (ecb->dte_probe == NULL) {
14623			mutex_exit(&dtrace_lock);
14624			return (EINVAL);
14625		}
14626
14627		epdesc.dtepd_probeid = ecb->dte_probe->dtpr_id;
14628		epdesc.dtepd_uarg = ecb->dte_uarg;
14629		epdesc.dtepd_size = ecb->dte_size;
14630
14631		nrecs = epdesc.dtepd_nrecs;
14632		epdesc.dtepd_nrecs = 0;
14633		for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
14634			if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
14635				continue;
14636
14637			epdesc.dtepd_nrecs++;
14638		}
14639
14640		/*
14641		 * Now that we have the size, we need to allocate a temporary
14642		 * buffer in which to store the complete description.  We need
14643		 * the temporary buffer to be able to drop dtrace_lock
14644		 * across the copyout(), below.
14645		 */
14646		size = sizeof (dtrace_eprobedesc_t) +
14647		    (epdesc.dtepd_nrecs * sizeof (dtrace_recdesc_t));
14648
14649		buf = kmem_alloc(size, KM_SLEEP);
14650		dest = (uintptr_t)buf;
14651
14652		bcopy(&epdesc, (void *)dest, sizeof (epdesc));
14653		dest += offsetof(dtrace_eprobedesc_t, dtepd_rec[0]);
14654
14655		for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
14656			if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
14657				continue;
14658
14659			if (nrecs-- == 0)
14660				break;
14661
14662			bcopy(&act->dta_rec, (void *)dest,
14663			    sizeof (dtrace_recdesc_t));
14664			dest += sizeof (dtrace_recdesc_t);
14665		}
14666
14667		mutex_exit(&dtrace_lock);
14668
14669		if (copyout(buf, (void *)arg, dest - (uintptr_t)buf) != 0) {
14670			kmem_free(buf, size);
14671			return (EFAULT);
14672		}
14673
14674		kmem_free(buf, size);
14675		return (0);
14676	}
14677
14678	case DTRACEIOC_AGGDESC: {
14679		dtrace_aggdesc_t aggdesc;
14680		dtrace_action_t *act;
14681		dtrace_aggregation_t *agg;
14682		int nrecs;
14683		uint32_t offs;
14684		dtrace_recdesc_t *lrec;
14685		void *buf;
14686		size_t size;
14687		uintptr_t dest;
14688
14689		if (copyin((void *)arg, &aggdesc, sizeof (aggdesc)) != 0)
14690			return (EFAULT);
14691
14692		mutex_enter(&dtrace_lock);
14693
14694		if ((agg = dtrace_aggid2agg(state, aggdesc.dtagd_id)) == NULL) {
14695			mutex_exit(&dtrace_lock);
14696			return (EINVAL);
14697		}
14698
14699		aggdesc.dtagd_epid = agg->dtag_ecb->dte_epid;
14700
14701		nrecs = aggdesc.dtagd_nrecs;
14702		aggdesc.dtagd_nrecs = 0;
14703
14704		offs = agg->dtag_base;
14705		lrec = &agg->dtag_action.dta_rec;
14706		aggdesc.dtagd_size = lrec->dtrd_offset + lrec->dtrd_size - offs;
14707
14708		for (act = agg->dtag_first; ; act = act->dta_next) {
14709			ASSERT(act->dta_intuple ||
14710			    DTRACEACT_ISAGG(act->dta_kind));
14711
14712			/*
14713			 * If this action has a record size of zero, it
14714			 * denotes an argument to the aggregating action.
14715			 * Because the presence of this record doesn't (or
14716			 * shouldn't) affect the way the data is interpreted,
14717			 * we don't copy it out to save user-level the
14718			 * confusion of dealing with a zero-length record.
14719			 */
14720			if (act->dta_rec.dtrd_size == 0) {
14721				ASSERT(agg->dtag_hasarg);
14722				continue;
14723			}
14724
14725			aggdesc.dtagd_nrecs++;
14726
14727			if (act == &agg->dtag_action)
14728				break;
14729		}
14730
14731		/*
14732		 * Now that we have the size, we need to allocate a temporary
14733		 * buffer in which to store the complete description.  We need
14734		 * the temporary buffer to be able to drop dtrace_lock
14735		 * across the copyout(), below.
14736		 */
14737		size = sizeof (dtrace_aggdesc_t) +
14738		    (aggdesc.dtagd_nrecs * sizeof (dtrace_recdesc_t));
14739
14740		buf = kmem_alloc(size, KM_SLEEP);
14741		dest = (uintptr_t)buf;
14742
14743		bcopy(&aggdesc, (void *)dest, sizeof (aggdesc));
14744		dest += offsetof(dtrace_aggdesc_t, dtagd_rec[0]);
14745
14746		for (act = agg->dtag_first; ; act = act->dta_next) {
14747			dtrace_recdesc_t rec = act->dta_rec;
14748
14749			/*
14750			 * See the comment in the above loop for why we pass
14751			 * over zero-length records.
14752			 */
14753			if (rec.dtrd_size == 0) {
14754				ASSERT(agg->dtag_hasarg);
14755				continue;
14756			}
14757
14758			if (nrecs-- == 0)
14759				break;
14760
14761			rec.dtrd_offset -= offs;
14762			bcopy(&rec, (void *)dest, sizeof (rec));
14763			dest += sizeof (dtrace_recdesc_t);
14764
14765			if (act == &agg->dtag_action)
14766				break;
14767		}
14768
14769		mutex_exit(&dtrace_lock);
14770
14771		if (copyout(buf, (void *)arg, dest - (uintptr_t)buf) != 0) {
14772			kmem_free(buf, size);
14773			return (EFAULT);
14774		}
14775
14776		kmem_free(buf, size);
14777		return (0);
14778	}
14779
14780	case DTRACEIOC_ENABLE: {
14781		dof_hdr_t *dof;
14782		dtrace_enabling_t *enab = NULL;
14783		dtrace_vstate_t *vstate;
14784		int err = 0;
14785
14786		*rv = 0;
14787
14788		/*
14789		 * If a NULL argument has been passed, we take this as our
14790		 * cue to reevaluate our enablings.
14791		 */
14792		if (arg == NULL) {
14793			mutex_enter(&cpu_lock);
14794			mutex_enter(&dtrace_lock);
14795			err = dtrace_enabling_matchstate(state, rv);
14796			mutex_exit(&dtrace_lock);
14797			mutex_exit(&cpu_lock);
14798
14799			return (err);
14800		}
14801
14802		if ((dof = dtrace_dof_copyin(arg, &rval)) == NULL)
14803			return (rval);
14804
14805		mutex_enter(&cpu_lock);
14806		mutex_enter(&dtrace_lock);
14807		vstate = &state->dts_vstate;
14808
14809		if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
14810			mutex_exit(&dtrace_lock);
14811			mutex_exit(&cpu_lock);
14812			dtrace_dof_destroy(dof);
14813			return (EBUSY);
14814		}
14815
14816		if (dtrace_dof_slurp(dof, vstate, cr, &enab, 0, B_TRUE) != 0) {
14817			mutex_exit(&dtrace_lock);
14818			mutex_exit(&cpu_lock);
14819			dtrace_dof_destroy(dof);
14820			return (EINVAL);
14821		}
14822
14823		if ((rval = dtrace_dof_options(dof, state)) != 0) {
14824			dtrace_enabling_destroy(enab);
14825			mutex_exit(&dtrace_lock);
14826			mutex_exit(&cpu_lock);
14827			dtrace_dof_destroy(dof);
14828			return (rval);
14829		}
14830
14831		if ((err = dtrace_enabling_match(enab, rv)) == 0) {
14832			err = dtrace_enabling_retain(enab);
14833		} else {
14834			dtrace_enabling_destroy(enab);
14835		}
14836
14837		mutex_exit(&cpu_lock);
14838		mutex_exit(&dtrace_lock);
14839		dtrace_dof_destroy(dof);
14840
14841		return (err);
14842	}
14843
14844	case DTRACEIOC_REPLICATE: {
14845		dtrace_repldesc_t desc;
14846		dtrace_probedesc_t *match = &desc.dtrpd_match;
14847		dtrace_probedesc_t *create = &desc.dtrpd_create;
14848		int err;
14849
14850		if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
14851			return (EFAULT);
14852
14853		match->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
14854		match->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
14855		match->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
14856		match->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
14857
14858		create->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
14859		create->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
14860		create->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
14861		create->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
14862
14863		mutex_enter(&dtrace_lock);
14864		err = dtrace_enabling_replicate(state, match, create);
14865		mutex_exit(&dtrace_lock);
14866
14867		return (err);
14868	}
14869
14870	case DTRACEIOC_PROBEMATCH:
14871	case DTRACEIOC_PROBES: {
14872		dtrace_probe_t *probe = NULL;
14873		dtrace_probedesc_t desc;
14874		dtrace_probekey_t pkey;
14875		dtrace_id_t i;
14876		int m = 0;
14877		uint32_t priv;
14878		uid_t uid;
14879		zoneid_t zoneid;
14880
14881		if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
14882			return (EFAULT);
14883
14884		desc.dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
14885		desc.dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
14886		desc.dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
14887		desc.dtpd_name[DTRACE_NAMELEN - 1] = '\0';
14888
14889		/*
14890		 * Before we attempt to match this probe, we want to give
14891		 * all providers the opportunity to provide it.
14892		 */
14893		if (desc.dtpd_id == DTRACE_IDNONE) {
14894			mutex_enter(&dtrace_provider_lock);
14895			dtrace_probe_provide(&desc, NULL);
14896			mutex_exit(&dtrace_provider_lock);
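			/*
			 * DTRACE_IDNONE is zero; bumping the id to one
			 * makes the searches below begin at the first
			 * valid probe identifier.
			 */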
14897			desc.dtpd_id++;
14898		}
14899
14900		if (cmd == DTRACEIOC_PROBEMATCH) {
14901			dtrace_probekey(&desc, &pkey);
14902			pkey.dtpk_id = DTRACE_IDNONE;
14903		}
14904
14905		dtrace_cred2priv(cr, &priv, &uid, &zoneid);
14906
14907		mutex_enter(&dtrace_lock);
14908
		if (cmd == DTRACEIOC_PROBEMATCH) {
			for (i = desc.dtpd_id; i <= dtrace_nprobes; i++) {
				if ((probe = dtrace_probes[i - 1]) != NULL &&
				    (m = dtrace_match_probe(probe, &pkey,
				    priv, uid, zoneid)) != 0)
					break;
			}

			if (m < 0) {
				mutex_exit(&dtrace_lock);
				return (EINVAL);
			}

		} else {
			for (i = desc.dtpd_id; i <= dtrace_nprobes; i++) {
				if ((probe = dtrace_probes[i - 1]) != NULL &&
				    dtrace_match_priv(probe, priv, uid, zoneid))
					break;
			}
		}

		if (probe == NULL) {
			mutex_exit(&dtrace_lock);
			return (ESRCH);
		}

		dtrace_probe_description(probe, &desc);
		mutex_exit(&dtrace_lock);

		if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
			return (EFAULT);

		return (0);
	}

	case DTRACEIOC_PROBEARG: {
		dtrace_argdesc_t desc;
		dtrace_probe_t *probe;
		dtrace_provider_t *prov;

		if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
			return (EFAULT);

		if (desc.dtargd_id == DTRACE_IDNONE)
			return (EINVAL);

		if (desc.dtargd_ndx == DTRACE_ARGNONE)
			return (EINVAL);

		mutex_enter(&dtrace_provider_lock);
		mutex_enter(&mod_lock);
		mutex_enter(&dtrace_lock);

		if (desc.dtargd_id > dtrace_nprobes) {
			mutex_exit(&dtrace_lock);
			mutex_exit(&mod_lock);
			mutex_exit(&dtrace_provider_lock);
			return (EINVAL);
		}

		if ((probe = dtrace_probes[desc.dtargd_id - 1]) == NULL) {
			mutex_exit(&dtrace_lock);
			mutex_exit(&mod_lock);
			mutex_exit(&dtrace_provider_lock);
			return (EINVAL);
		}

		mutex_exit(&dtrace_lock);

		prov = probe->dtpr_provider;

		if (prov->dtpv_pops.dtps_getargdesc == NULL) {
			/*
			 * There isn't any typed information for this probe.
			 * Set the argument number to DTRACE_ARGNONE.
			 */
			desc.dtargd_ndx = DTRACE_ARGNONE;
		} else {
			desc.dtargd_native[0] = '\0';
			desc.dtargd_xlate[0] = '\0';
			desc.dtargd_mapping = desc.dtargd_ndx;

			prov->dtpv_pops.dtps_getargdesc(prov->dtpv_arg,
			    probe->dtpr_id, probe->dtpr_arg, &desc);
		}

		mutex_exit(&mod_lock);
		mutex_exit(&dtrace_provider_lock);

		if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
			return (EFAULT);

		return (0);
	}

	case DTRACEIOC_GO: {
		processorid_t cpuid;
		rval = dtrace_state_go(state, &cpuid);

		if (rval != 0)
			return (rval);

		if (copyout(&cpuid, (void *)arg, sizeof (cpuid)) != 0)
			return (EFAULT);

		return (0);
	}

	case DTRACEIOC_STOP: {
		processorid_t cpuid;

		mutex_enter(&dtrace_lock);
		rval = dtrace_state_stop(state, &cpuid);
		mutex_exit(&dtrace_lock);

		if (rval != 0)
			return (rval);

		if (copyout(&cpuid, (void *)arg, sizeof (cpuid)) != 0)
			return (EFAULT);

		return (0);
	}

	case DTRACEIOC_DOFGET: {
		dof_hdr_t hdr, *dof;
		uint64_t len;

		if (copyin((void *)arg, &hdr, sizeof (hdr)) != 0)
			return (EFAULT);

		mutex_enter(&dtrace_lock);
		dof = dtrace_dof_create(state);
		mutex_exit(&dtrace_lock);

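		/*
		 * Copy out no more than the smaller of the consumer's
		 * stated buffer size and the actual load size of the
		 * generated DOF.
		 */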
		len = MIN(hdr.dofh_loadsz, dof->dofh_loadsz);
		rval = copyout(dof, (void *)arg, len);
		dtrace_dof_destroy(dof);

		return (rval == 0 ? 0 : EFAULT);
	}

	case DTRACEIOC_AGGSNAP:
	case DTRACEIOC_BUFSNAP: {
		dtrace_bufdesc_t desc;
		caddr_t cached;
		dtrace_buffer_t *buf;

		if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
			return (EFAULT);

		if (desc.dtbd_cpu < 0 || desc.dtbd_cpu >= NCPU)
			return (EINVAL);

		mutex_enter(&dtrace_lock);

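		/*
		 * Snapshot either the principal buffer or the aggregation
		 * buffer for the requested CPU, depending on the ioctl.
		 */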
		if (cmd == DTRACEIOC_BUFSNAP) {
			buf = &state->dts_buffer[desc.dtbd_cpu];
		} else {
			buf = &state->dts_aggbuffer[desc.dtbd_cpu];
		}

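		/*
		 * Ring and fill buffers are not switched, so they may only
		 * be snapshotted once tracing has stopped; an attempt to
		 * snapshot them while tracing is active fails with EBUSY.
		 */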
		if (buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL)) {
			size_t sz = buf->dtb_offset;

			if (state->dts_activity != DTRACE_ACTIVITY_STOPPED) {
				mutex_exit(&dtrace_lock);
				return (EBUSY);
			}

			/*
			 * If this buffer has already been consumed, we're
			 * going to indicate that there's nothing left here
			 * to consume.
			 */
			if (buf->dtb_flags & DTRACEBUF_CONSUMED) {
				mutex_exit(&dtrace_lock);

				desc.dtbd_size = 0;
				desc.dtbd_drops = 0;
				desc.dtbd_errors = 0;
				desc.dtbd_oldest = 0;
				sz = sizeof (desc);

				if (copyout(&desc, (void *)arg, sz) != 0)
					return (EFAULT);

				return (0);
			}

			/*
			 * If this is a ring buffer that has wrapped, we want
			 * to copy the whole thing out.
			 */
			if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
				dtrace_buffer_polish(buf);
				sz = buf->dtb_size;
			}

			if (copyout(buf->dtb_tomax, desc.dtbd_data, sz) != 0) {
				mutex_exit(&dtrace_lock);
				return (EFAULT);
			}

			desc.dtbd_size = sz;
			desc.dtbd_drops = buf->dtb_drops;
			desc.dtbd_errors = buf->dtb_errors;
			desc.dtbd_oldest = buf->dtb_xamot_offset;

			mutex_exit(&dtrace_lock);

			if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
				return (EFAULT);

			buf->dtb_flags |= DTRACEBUF_CONSUMED;

			return (0);
		}

		if (buf->dtb_tomax == NULL) {
			ASSERT(buf->dtb_xamot == NULL);
			mutex_exit(&dtrace_lock);
			return (ENOENT);
		}

		cached = buf->dtb_tomax;
		ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));

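		/*
		 * Switch the active (dtb_tomax) and inactive (dtb_xamot)
		 * buffers on the target CPU via a cross call; the previously
		 * active buffer can then be safely copied out from the
		 * inactive side.
		 */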
		dtrace_xcall(desc.dtbd_cpu,
		    (dtrace_xcall_t)dtrace_buffer_switch, buf);

		state->dts_errors += buf->dtb_xamot_errors;

		/*
		 * If the buffers did not actually switch, then the cross call
		 * did not take place -- presumably because the given CPU is
		 * not in the ready set.  If this is the case, we'll return
		 * ENOENT.
		 */
		if (buf->dtb_tomax == cached) {
			ASSERT(buf->dtb_xamot != cached);
			mutex_exit(&dtrace_lock);
			return (ENOENT);
		}

		ASSERT(cached == buf->dtb_xamot);

		/*
		 * We have our snapshot; now copy it out.
		 */
		if (copyout(buf->dtb_xamot, desc.dtbd_data,
		    buf->dtb_xamot_offset) != 0) {
			mutex_exit(&dtrace_lock);
			return (EFAULT);
		}

		desc.dtbd_size = buf->dtb_xamot_offset;
		desc.dtbd_drops = buf->dtb_xamot_drops;
		desc.dtbd_errors = buf->dtb_xamot_errors;
		desc.dtbd_oldest = 0;

		mutex_exit(&dtrace_lock);

		/*
		 * Finally, copy out the buffer description.
		 */
		if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
			return (EFAULT);

		return (0);
	}

	case DTRACEIOC_CONF: {
		dtrace_conf_t conf;

		bzero(&conf, sizeof (conf));
		conf.dtc_difversion = DIF_VERSION;
		conf.dtc_difintregs = DIF_DIR_NREGS;
		conf.dtc_diftupregs = DIF_DTR_NREGS;
		conf.dtc_ctfmodel = CTF_MODEL_NATIVE;

		if (copyout(&conf, (void *)arg, sizeof (conf)) != 0)
			return (EFAULT);

		return (0);
	}

	case DTRACEIOC_STATUS: {
		dtrace_status_t stat;
		dtrace_dstate_t *dstate;
		int i, j;
		uint64_t nerrs;

		/*
		 * See the comment in dtrace_state_deadman() for the reason
		 * for setting dts_laststatus to INT64_MAX before setting
		 * it to the correct value.
		 */
		state->dts_laststatus = INT64_MAX;
		dtrace_membar_producer();
		state->dts_laststatus = dtrace_gethrtime();

		bzero(&stat, sizeof (stat));

		mutex_enter(&dtrace_lock);

		if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE) {
			mutex_exit(&dtrace_lock);
			return (ENOENT);
		}

		if (state->dts_activity == DTRACE_ACTIVITY_DRAINING)
			stat.dtst_exiting = 1;

		nerrs = state->dts_errors;
		dstate = &state->dts_vstate.dtvs_dynvars;

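		/*
		 * Sum the dynamic variable drops, buffer errors, and
		 * speculative drops across all CPUs, and count any
		 * principal buffers that have filled.
		 */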
		for (i = 0; i < NCPU; i++) {
			dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[i];

			stat.dtst_dyndrops += dcpu->dtdsc_drops;
			stat.dtst_dyndrops_dirty += dcpu->dtdsc_dirty_drops;
			stat.dtst_dyndrops_rinsing += dcpu->dtdsc_rinsing_drops;

			if (state->dts_buffer[i].dtb_flags & DTRACEBUF_FULL)
				stat.dtst_filled++;

			nerrs += state->dts_buffer[i].dtb_errors;

			for (j = 0; j < state->dts_nspeculations; j++) {
				dtrace_speculation_t *spec;
				dtrace_buffer_t *buf;

				spec = &state->dts_speculations[j];
				buf = &spec->dtsp_buffer[i];
				stat.dtst_specdrops += buf->dtb_xamot_drops;
			}
		}

		stat.dtst_specdrops_busy = state->dts_speculations_busy;
		stat.dtst_specdrops_unavail = state->dts_speculations_unavail;
		stat.dtst_stkstroverflows = state->dts_stkstroverflows;
		stat.dtst_dblerrors = state->dts_dblerrors;
		stat.dtst_killed =
		    (state->dts_activity == DTRACE_ACTIVITY_KILLED);
		stat.dtst_errors = nerrs;

		mutex_exit(&dtrace_lock);

		if (copyout(&stat, (void *)arg, sizeof (stat)) != 0)
			return (EFAULT);

		return (0);
	}

	case DTRACEIOC_FORMAT: {
		dtrace_fmtdesc_t fmt;
		char *str;
		int len;

		if (copyin((void *)arg, &fmt, sizeof (fmt)) != 0)
			return (EFAULT);

		mutex_enter(&dtrace_lock);

		if (fmt.dtfd_format == 0 ||
		    fmt.dtfd_format > state->dts_nformats) {
			mutex_exit(&dtrace_lock);
			return (EINVAL);
		}

		/*
		 * Format strings are allocated contiguously and they are
		 * never freed; if a format index is less than the number
		 * of formats, we can assert that the format map is non-NULL
		 * and that the format for the specified index is non-NULL.
		 */
		ASSERT(state->dts_formats != NULL);
		str = state->dts_formats[fmt.dtfd_format - 1];
		ASSERT(str != NULL);

		len = strlen(str) + 1;

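		/*
		 * If the format string won't fit in the consumer's buffer,
		 * pass the required length back so that the consumer can
		 * reallocate and retry.
		 */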
		if (len > fmt.dtfd_length) {
			fmt.dtfd_length = len;

			if (copyout(&fmt, (void *)arg, sizeof (fmt)) != 0) {
				mutex_exit(&dtrace_lock);
				return (EINVAL);
			}
		} else {
			if (copyout(str, fmt.dtfd_string, len) != 0) {
				mutex_exit(&dtrace_lock);
				return (EINVAL);
			}
		}

		mutex_exit(&dtrace_lock);
		return (0);
	}

	default:
		break;
	}

	return (ENOTTY);
}

/*ARGSUSED*/
static int
dtrace_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	dtrace_state_t *state;

	switch (cmd) {
	case DDI_DETACH:
		break;

	case DDI_SUSPEND:
		return (DDI_SUCCESS);

	default:
		return (DDI_FAILURE);
	}

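	/*
	 * Take cpu_lock, dtrace_provider_lock and dtrace_lock -- in that
	 * order -- before dismantling the framework's global state.
	 */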
	mutex_enter(&cpu_lock);
	mutex_enter(&dtrace_provider_lock);
	mutex_enter(&dtrace_lock);

	ASSERT(dtrace_opens == 0);

	if (dtrace_helpers > 0) {
		mutex_exit(&dtrace_provider_lock);
		mutex_exit(&dtrace_lock);
		mutex_exit(&cpu_lock);
		return (DDI_FAILURE);
	}

	if (dtrace_unregister((dtrace_provider_id_t)dtrace_provider) != 0) {
		mutex_exit(&dtrace_provider_lock);
		mutex_exit(&dtrace_lock);
		mutex_exit(&cpu_lock);
		return (DDI_FAILURE);
	}

	dtrace_provider = NULL;

	if ((state = dtrace_anon_grab()) != NULL) {
		/*
		 * If there were ECBs on this state, the provider should
		 * not have been allowed to detach; assert that there are
		 * none.
		 */
		ASSERT(state->dts_necbs == 0);
		dtrace_state_destroy(state);

		/*
		 * If we're being detached with anonymous state, we need to
		 * indicate to the kernel debugger that DTrace is now inactive.
		 */
		(void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
	}

	bzero(&dtrace_anon, sizeof (dtrace_anon_t));
	unregister_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
	dtrace_cpu_init = NULL;
	dtrace_helpers_cleanup = NULL;
	dtrace_helpers_fork = NULL;
	dtrace_cpustart_init = NULL;
	dtrace_cpustart_fini = NULL;
	dtrace_debugger_init = NULL;
	dtrace_debugger_fini = NULL;
	dtrace_modload = NULL;
	dtrace_modunload = NULL;

	mutex_exit(&cpu_lock);

	if (dtrace_helptrace_enabled) {
		kmem_free(dtrace_helptrace_buffer, dtrace_helptrace_bufsize);
		dtrace_helptrace_buffer = NULL;
	}

	kmem_free(dtrace_probes, dtrace_nprobes * sizeof (dtrace_probe_t *));
	dtrace_probes = NULL;
	dtrace_nprobes = 0;

	dtrace_hash_destroy(dtrace_bymod);
	dtrace_hash_destroy(dtrace_byfunc);
	dtrace_hash_destroy(dtrace_byname);
	dtrace_bymod = NULL;
	dtrace_byfunc = NULL;
	dtrace_byname = NULL;

	kmem_cache_destroy(dtrace_state_cache);
	vmem_destroy(dtrace_minor);
	vmem_destroy(dtrace_arena);

	if (dtrace_toxrange != NULL) {
		kmem_free(dtrace_toxrange,
		    dtrace_toxranges_max * sizeof (dtrace_toxrange_t));
		dtrace_toxrange = NULL;
		dtrace_toxranges = 0;
		dtrace_toxranges_max = 0;
	}

	ddi_remove_minor_node(dtrace_devi, NULL);
	dtrace_devi = NULL;

	ddi_soft_state_fini(&dtrace_softstate);

	ASSERT(dtrace_vtime_references == 0);
	ASSERT(dtrace_opens == 0);
	ASSERT(dtrace_retained == NULL);

	mutex_exit(&dtrace_lock);
	mutex_exit(&dtrace_provider_lock);

	/*
	 * We don't destroy the task queue until after we have dropped our
	 * locks (taskq_destroy() may block on running tasks).  To prevent
	 * attempting to do work after we have effectively detached but before
	 * the task queue has been destroyed, all tasks dispatched via the
	 * task queue must check that DTrace is still attached before
	 * performing any operation.
	 */
	taskq_destroy(dtrace_taskq);
	dtrace_taskq = NULL;

	return (DDI_SUCCESS);
}

/*ARGSUSED*/
static int
dtrace_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
	int error;

	switch (infocmd) {
	case DDI_INFO_DEVT2DEVINFO:
		*result = (void *)dtrace_devi;
		error = DDI_SUCCESS;
		break;
	case DDI_INFO_DEVT2INSTANCE:
		*result = (void *)0;
		error = DDI_SUCCESS;
		break;
	default:
		error = DDI_FAILURE;
	}
	return (error);
}

static struct cb_ops dtrace_cb_ops = {
	dtrace_open,		/* open */
	dtrace_close,		/* close */
	nulldev,		/* strategy */
	nulldev,		/* print */
	nodev,			/* dump */
	nodev,			/* read */
	nodev,			/* write */
	dtrace_ioctl,		/* ioctl */
	nodev,			/* devmap */
	nodev,			/* mmap */
	nodev,			/* segmap */
	nochpoll,		/* poll */
	ddi_prop_op,		/* cb_prop_op */
	0,			/* streamtab */
	D_NEW | D_MP		/* Driver compatibility flag */
};

static struct dev_ops dtrace_ops = {
	DEVO_REV,		/* devo_rev */
	0,			/* refcnt */
	dtrace_info,		/* get_dev_info */
	nulldev,		/* identify */
	nulldev,		/* probe */
	dtrace_attach,		/* attach */
	dtrace_detach,		/* detach */
	nodev,			/* reset */
	&dtrace_cb_ops,		/* driver operations */
	NULL,			/* bus operations */
	nodev			/* dev power */
};

static struct modldrv modldrv = {
	&mod_driverops,		/* module type (this is a pseudo driver) */
	"Dynamic Tracing",	/* name of module */
	&dtrace_ops,		/* driver ops */
};

static struct modlinkage modlinkage = {
	MODREV_1,
	(void *)&modldrv,
	NULL
};

int
_init(void)
{
	return (mod_install(&modlinkage));
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

int
_fini(void)
{
	return (mod_remove(&modlinkage));
}