dtrace.c revision 7656:2621e50fdf4a
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27/*
28 * DTrace - Dynamic Tracing for Solaris
29 *
30 * This is the implementation of the Solaris Dynamic Tracing framework
31 * (DTrace).  The user-visible interface to DTrace is described at length in
32 * the "Solaris Dynamic Tracing Guide".  The interfaces between the libdtrace
33 * library, the in-kernel DTrace framework, and the DTrace providers are
34 * described in the block comments in the <sys/dtrace.h> header file.  The
35 * internal architecture of DTrace is described in the block comments in the
36 * <sys/dtrace_impl.h> header file.  The comments contained within the DTrace
37 * implementation very much assume mastery of all of these sources; if one has
38 * an unanswered question about the implementation, one should consult them
39 * first.
40 *
41 * The functions here are ordered roughly as follows:
42 *
43 *   - Probe context functions
44 *   - Probe hashing functions
45 *   - Non-probe context utility functions
46 *   - Matching functions
47 *   - Provider-to-Framework API functions
48 *   - Probe management functions
49 *   - DIF object functions
50 *   - Format functions
51 *   - Predicate functions
52 *   - ECB functions
53 *   - Buffer functions
54 *   - Enabling functions
55 *   - DOF functions
56 *   - Anonymous enabling functions
57 *   - Consumer state functions
58 *   - Helper functions
59 *   - Hook functions
60 *   - Driver cookbook functions
61 *
62 * Each group of functions begins with a block comment labelled the "DTrace
63 * [Group] Functions", allowing one to find each block by searching forward
64 * on capital-f functions.
65 */
66#include <sys/errno.h>
67#include <sys/stat.h>
68#include <sys/modctl.h>
69#include <sys/conf.h>
70#include <sys/systm.h>
71#include <sys/ddi.h>
72#include <sys/sunddi.h>
73#include <sys/cpuvar.h>
74#include <sys/kmem.h>
75#include <sys/strsubr.h>
76#include <sys/sysmacros.h>
77#include <sys/dtrace_impl.h>
78#include <sys/atomic.h>
79#include <sys/cmn_err.h>
80#include <sys/mutex_impl.h>
81#include <sys/rwlock_impl.h>
82#include <sys/ctf_api.h>
83#include <sys/panic.h>
84#include <sys/priv_impl.h>
85#include <sys/policy.h>
86#include <sys/cred_impl.h>
87#include <sys/procfs_isa.h>
88#include <sys/taskq.h>
89#include <sys/mkdev.h>
90#include <sys/kdi.h>
91#include <sys/zone.h>
92#include <sys/socket.h>
93#include <netinet/in.h>
94
95/*
96 * DTrace Tunable Variables
97 *
98 * The following variables may be tuned by adding a line to /etc/system that
99 * includes both the name of the DTrace module ("dtrace") and the name of the
100 * variable.  For example:
101 *
102 *   set dtrace:dtrace_destructive_disallow = 1
103 *
104 * In general, the only variables that one should be tuning this way are those
105 * that affect system-wide DTrace behavior, and for which the default behavior
106 * is undesirable.  Most of these variables are tunable on a per-consumer
107 * basis using DTrace options, and need not be tuned on a system-wide basis.
108 * When tuning these variables, avoid pathological values; while some attempt
109 * is made to verify the integrity of these variables, they are not considered
110 * part of the supported interface to DTrace, and they are therefore not
111 * checked comprehensively.  Further, these variables should not be tuned
112 * dynamically via "mdb -kw" or other means; they should only be tuned via
113 * /etc/system.
114 */
115int		dtrace_destructive_disallow = 0;
116dtrace_optval_t	dtrace_nonroot_maxsize = (16 * 1024 * 1024);
117size_t		dtrace_difo_maxsize = (256 * 1024);
118dtrace_optval_t	dtrace_dof_maxsize = (256 * 1024);
119size_t		dtrace_global_maxsize = (16 * 1024);
120size_t		dtrace_actions_max = (16 * 1024);
121size_t		dtrace_retain_max = 1024;
122dtrace_optval_t	dtrace_helper_actions_max = 32;
123dtrace_optval_t	dtrace_helper_providers_max = 32;
124dtrace_optval_t	dtrace_dstate_defsize = (1 * 1024 * 1024);
125size_t		dtrace_strsize_default = 256;
126dtrace_optval_t	dtrace_cleanrate_default = 9900990;		/* 101 hz */
127dtrace_optval_t	dtrace_cleanrate_min = 200000;			/* 5000 hz */
128dtrace_optval_t	dtrace_cleanrate_max = (uint64_t)60 * NANOSEC;	/* 1/minute */
129dtrace_optval_t	dtrace_aggrate_default = NANOSEC;		/* 1 hz */
130dtrace_optval_t	dtrace_statusrate_default = NANOSEC;		/* 1 hz */
131dtrace_optval_t dtrace_statusrate_max = (hrtime_t)10 * NANOSEC;	 /* 6/minute */
132dtrace_optval_t	dtrace_switchrate_default = NANOSEC;		/* 1 hz */
133dtrace_optval_t	dtrace_nspec_default = 1;
134dtrace_optval_t	dtrace_specsize_default = 32 * 1024;
135dtrace_optval_t dtrace_stackframes_default = 20;
136dtrace_optval_t dtrace_ustackframes_default = 20;
137dtrace_optval_t dtrace_jstackframes_default = 50;
138dtrace_optval_t dtrace_jstackstrsize_default = 512;
139int		dtrace_msgdsize_max = 128;
140hrtime_t	dtrace_chill_max = 500 * (NANOSEC / MILLISEC);	/* 500 ms */
141hrtime_t	dtrace_chill_interval = NANOSEC;		/* 1000 ms */
142int		dtrace_devdepth_max = 32;
143int		dtrace_err_verbose;
144hrtime_t	dtrace_deadman_interval = NANOSEC;
145hrtime_t	dtrace_deadman_timeout = (hrtime_t)10 * NANOSEC;
146hrtime_t	dtrace_deadman_user = (hrtime_t)30 * NANOSEC;
147
148/*
149 * DTrace External Variables
150 *
151 * As dtrace(7D) is a kernel module, any DTrace variables are obviously
152 * available to DTrace consumers via the backtick (`) syntax.  One of these,
153 * dtrace_zero, is made deliberately so:  it is provided as a source of
154 * well-known, zero-filled memory.  While this variable is not documented,
155 * it is used by some translators as an implementation detail.
156 */
157const char	dtrace_zero[256] = { 0 };	/* zero-filled memory */
158
159/*
160 * DTrace Internal Variables
161 */
162static dev_info_t	*dtrace_devi;		/* device info */
163static vmem_t		*dtrace_arena;		/* probe ID arena */
164static vmem_t		*dtrace_minor;		/* minor number arena */
165static taskq_t		*dtrace_taskq;		/* task queue */
166static dtrace_probe_t	**dtrace_probes;	/* array of all probes */
167static int		dtrace_nprobes;		/* number of probes */
168static dtrace_provider_t *dtrace_provider;	/* provider list */
169static dtrace_meta_t	*dtrace_meta_pid;	/* user-land meta provider */
170static int		dtrace_opens;		/* number of opens */
171static int		dtrace_helpers;		/* number of helpers */
172static void		*dtrace_softstate;	/* softstate pointer */
173static dtrace_hash_t	*dtrace_bymod;		/* probes hashed by module */
174static dtrace_hash_t	*dtrace_byfunc;		/* probes hashed by function */
175static dtrace_hash_t	*dtrace_byname;		/* probes hashed by name */
176static dtrace_toxrange_t *dtrace_toxrange;	/* toxic range array */
177static int		dtrace_toxranges;	/* number of toxic ranges */
178static int		dtrace_toxranges_max;	/* size of toxic range array */
179static dtrace_anon_t	dtrace_anon;		/* anonymous enabling */
180static kmem_cache_t	*dtrace_state_cache;	/* cache for dynamic state */
181static uint64_t		dtrace_vtime_references; /* number of vtimestamp refs */
182static kthread_t	*dtrace_panicked;	/* panicking thread */
183static dtrace_ecb_t	*dtrace_ecb_create_cache; /* cached created ECB */
184static dtrace_genid_t	dtrace_probegen;	/* current probe generation */
185static dtrace_helpers_t *dtrace_deferred_pid;	/* deferred helper list */
186static dtrace_enabling_t *dtrace_retained;	/* list of retained enablings */
187static dtrace_genid_t	dtrace_retained_gen;	/* current retained enab gen */
188static dtrace_dynvar_t	dtrace_dynhash_sink;	/* end of dynamic hash chains */
189
190/*
191 * DTrace Locking
192 * DTrace is protected by three (relatively coarse-grained) locks:
193 *
194 * (1) dtrace_lock is required to manipulate essentially any DTrace state,
195 *     including enabling state, probes, ECBs, consumer state, helper state,
196 *     etc.  Importantly, dtrace_lock is _not_ required when in probe context;
197 *     probe context is lock-free -- synchronization is handled via the
198 *     dtrace_sync() cross call mechanism.
199 *
200 * (2) dtrace_provider_lock is required when manipulating provider state, or
201 *     when provider state must be held constant.
202 *
203 * (3) dtrace_meta_lock is required when manipulating meta provider state, or
204 *     when meta provider state must be held constant.
205 *
206 * The lock ordering between these three locks is dtrace_meta_lock before
207 * dtrace_provider_lock before dtrace_lock.  (In particular, there are
208 * several places where dtrace_provider_lock is held by the framework as it
209 * calls into the providers -- which then call back into the framework,
210 * grabbing dtrace_lock.)
211 *
212 * There are two other locks in the mix:  mod_lock and cpu_lock.  With respect
213 * to dtrace_provider_lock and dtrace_lock, cpu_lock continues its historical
214 * role as a coarse-grained lock; it is acquired before both of these locks.
215 * With respect to dtrace_meta_lock, its behavior is stranger:  cpu_lock must
216 * be acquired _between_ dtrace_meta_lock and any other DTrace locks.
217 * mod_lock is similar with respect to dtrace_provider_lock in that it must be
218 * acquired _between_ dtrace_provider_lock and dtrace_lock.
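 *
 * Taken together, the complete acquisition order when all five locks are
 * involved is therefore:  dtrace_meta_lock, then cpu_lock, then
 * dtrace_provider_lock, then mod_lock, and finally dtrace_lock.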
219 */
220static kmutex_t		dtrace_lock;		/* probe state lock */
221static kmutex_t		dtrace_provider_lock;	/* provider state lock */
222static kmutex_t		dtrace_meta_lock;	/* meta-provider state lock */
223
224/*
225 * DTrace Provider Variables
226 *
227 * These are the variables relating to DTrace as a provider (that is, the
228 * provider of the BEGIN, END, and ERROR probes).
229 */
230static dtrace_pattr_t	dtrace_provider_attr = {
231{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
232{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
233{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
234{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
235{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
236};
237
238static void
239dtrace_nullop(void)
240{}
241
242static dtrace_pops_t	dtrace_provider_ops = {
243	(void (*)(void *, const dtrace_probedesc_t *))dtrace_nullop,
244	(void (*)(void *, struct modctl *))dtrace_nullop,
245	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
246	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
247	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
248	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
249	NULL,
250	NULL,
251	NULL,
252	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop
253};
254
255static dtrace_id_t	dtrace_probeid_begin;	/* special BEGIN probe */
256static dtrace_id_t	dtrace_probeid_end;	/* special END probe */
257dtrace_id_t		dtrace_probeid_error;	/* special ERROR probe */
258
259/*
260 * DTrace Helper Tracing Variables
261 */
262uint32_t dtrace_helptrace_next = 0;
263uint32_t dtrace_helptrace_nlocals;
264char	*dtrace_helptrace_buffer;
265int	dtrace_helptrace_bufsize = 512 * 1024;
266
267#ifdef DEBUG
268int	dtrace_helptrace_enabled = 1;
269#else
270int	dtrace_helptrace_enabled = 0;
271#endif
272
273/*
274 * DTrace Error Hashing
275 *
276 * On DEBUG kernels, DTrace will track the errors that it has seen in a hash
277 * table.  This is very useful for checking coverage of tests that are
278 * expected to induce DIF or DOF processing errors, and may be useful for
279 * debugging problems in the DIF code generator or in DOF generation.  The
280 * error hash may be examined with the ::dtrace_errhash MDB dcmd.
281 */
282#ifdef DEBUG
283static dtrace_errhash_t	dtrace_errhash[DTRACE_ERRHASHSZ];
284static const char *dtrace_errlast;
285static kthread_t *dtrace_errthread;
286static kmutex_t dtrace_errlock;
287#endif
288
289/*
290 * DTrace Macros and Constants
291 *
292 * These are various macros that are useful in various spots in the
293 * implementation, along with a few random constants that have no meaning
294 * outside of the implementation.  There is no real structure to this cpp
295 * mishmash -- but is there ever?
296 */
297#define	DTRACE_HASHSTR(hash, probe)	\
298	dtrace_hash_str(*((char **)((uintptr_t)(probe) + (hash)->dth_stroffs)))
299
300#define	DTRACE_HASHNEXT(hash, probe)	\
301	(dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_nextoffs)
302
303#define	DTRACE_HASHPREV(hash, probe)	\
304	(dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_prevoffs)
305
306#define	DTRACE_HASHEQ(hash, lhs, rhs)	\
307	(strcmp(*((char **)((uintptr_t)(lhs) + (hash)->dth_stroffs)), \
308	    *((char **)((uintptr_t)(rhs) + (hash)->dth_stroffs))) == 0)
309
310#define	DTRACE_AGGHASHSIZE_SLEW		17
311
312#define	DTRACE_V4MAPPED_OFFSET		(sizeof (uint32_t) * 3)
313
314/*
315 * The key for a thread-local variable consists of the lower 61 bits of the
316 * t_did, plus the 3 bits of the highest active interrupt above LOCK_LEVEL.
317 * We add DIF_VARIABLE_MAX to t_did to assure that the thread key is never
318 * equal to a variable identifier.  This is necessary (but not sufficient) to
319 * assure that global associative arrays never collide with thread-local
320 * variables.  To guarantee that they cannot collide, we must also define the
321 * order for keying dynamic variables.  That order is:
322 *
323 *   [ key0 ] ... [ keyn ] [ variable-key ] [ tls-key ]
324 *
325 * Because the variable-key and the tls-key are in orthogonal spaces, there is
326 * no way for a global variable key signature to match a thread-local key
327 * signature.
328 */
329#define	DTRACE_TLS_THRKEY(where) { \
330	uint_t intr = 0; \
331	uint_t actv = CPU->cpu_intr_actv >> (LOCK_LEVEL + 1); \
332	for (; actv; actv >>= 1) \
333		intr++; \
334	ASSERT(intr < (1 << 3)); \
335	(where) = ((curthread->t_did + DIF_VARIABLE_MAX) & \
336	    (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
337}
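/*
 * For example, in ordinary thread context with no interrupt active above
 * LOCK_LEVEL, the loop above leaves intr at zero, and the thread key
 * reduces to (curthread->t_did + DIF_VARIABLE_MAX) masked to its low 61
 * bits.
 */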
338
339#define	DT_BSWAP_8(x)	((x) & 0xff)
340#define	DT_BSWAP_16(x)	((DT_BSWAP_8(x) << 8) | DT_BSWAP_8((x) >> 8))
341#define	DT_BSWAP_32(x)	((DT_BSWAP_16(x) << 16) | DT_BSWAP_16((x) >> 16))
342#define	DT_BSWAP_64(x)	((DT_BSWAP_32(x) << 32) | DT_BSWAP_32((x) >> 32))
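/*
 * As a worked example, DT_BSWAP_32(0x12345678) evaluates to 0x78563412.
 * Note that DT_BSWAP_64() assumes an operand of unsigned 64-bit type, so
 * that the 32-bit left shift of DT_BSWAP_32(x) is well-defined.
 */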
343
344#define	DT_MASK_LO 0x00000000FFFFFFFFULL
345
346#define	DTRACE_STORE(type, tomax, offset, what) \
347	*((type *)((uintptr_t)(tomax) + (uintptr_t)offset)) = (type)(what);
348
349#ifndef __i386
350#define	DTRACE_ALIGNCHECK(addr, size, flags)				\
351	if (addr & (size - 1)) {					\
352		*flags |= CPU_DTRACE_BADALIGN;				\
353		cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr;	\
354		return (0);						\
355	}
356#else
357#define	DTRACE_ALIGNCHECK(addr, size, flags)
358#endif
359
360/*
361 * Test whether a range of memory starting at testaddr of size testsz falls
362 * within the range of memory described by addr, sz.  We take care to avoid
363 * problems with overflow and underflow of the unsigned quantities, and
364 * disallow all negative sizes.  Ranges of size 0 are allowed.
365 */
366#define	DTRACE_INRANGE(testaddr, testsz, baseaddr, basesz) \
367	((testaddr) - (baseaddr) < (basesz) && \
368	(testaddr) + (testsz) - (baseaddr) <= (basesz) && \
369	(testaddr) + (testsz) >= (testaddr))
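/*
 * Each clause above earns its keep:  the first rejects a testaddr below
 * baseaddr (the unsigned subtraction wraps to a huge value), the second
 * rejects ranges extending beyond baseaddr + basesz, and the third rejects
 * ranges for which testaddr + testsz itself wraps around the top of the
 * address space.  For example, with baseaddr 0x1000 and basesz 0x100, a
 * testaddr of 0x10f0 with testsz 0x20 fails the second clause.
 */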
370
371/*
372 * Test whether alloc_sz bytes will fit in the scratch region.  We isolate
373 * alloc_sz on the righthand side of the comparison in order to avoid overflow
374 * or underflow in the comparison with it.  This is simpler than the INRANGE
375 * check above, because we know that the dtms_scratch_ptr is valid in the
376 * range.  Allocations of size zero are allowed.
377 */
378#define	DTRACE_INSCRATCH(mstate, alloc_sz) \
379	((mstate)->dtms_scratch_base + (mstate)->dtms_scratch_size - \
380	(mstate)->dtms_scratch_ptr >= (alloc_sz))
381
382#define	DTRACE_LOADFUNC(bits)						\
383/*CSTYLED*/								\
384uint##bits##_t								\
385dtrace_load##bits(uintptr_t addr)					\
386{									\
387	size_t size = bits / NBBY;					\
388	/*CSTYLED*/							\
389	uint##bits##_t rval;						\
390	int i;								\
391	volatile uint16_t *flags = (volatile uint16_t *)		\
392	    &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;			\
393									\
394	DTRACE_ALIGNCHECK(addr, size, flags);				\
395									\
396	for (i = 0; i < dtrace_toxranges; i++) {			\
397		if (addr >= dtrace_toxrange[i].dtt_limit)		\
398			continue;					\
399									\
400		if (addr + size <= dtrace_toxrange[i].dtt_base)		\
401			continue;					\
402									\
403		/*							\
404		 * This address falls within a toxic region; return 0.	\
405		 */							\
406		*flags |= CPU_DTRACE_BADADDR;				\
407		cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr;	\
408		return (0);						\
409	}								\
410									\
411	*flags |= CPU_DTRACE_NOFAULT;					\
412	/*CSTYLED*/							\
413	rval = *((volatile uint##bits##_t *)addr);			\
414	*flags &= ~CPU_DTRACE_NOFAULT;					\
415									\
416	return (!(*flags & CPU_DTRACE_FAULT) ? rval : 0);		\
417}
418
419#ifdef _LP64
420#define	dtrace_loadptr	dtrace_load64
421#else
422#define	dtrace_loadptr	dtrace_load32
423#endif
424
425#define	DTRACE_DYNHASH_FREE	0
426#define	DTRACE_DYNHASH_SINK	1
427#define	DTRACE_DYNHASH_VALID	2
428
429#define	DTRACE_MATCH_NEXT	0
430#define	DTRACE_MATCH_DONE	1
431#define	DTRACE_ANCHORED(probe)	((probe)->dtpr_func[0] != '\0')
432#define	DTRACE_STATE_ALIGN	64
433
434#define	DTRACE_FLAGS2FLT(flags)						\
435	(((flags) & CPU_DTRACE_BADADDR) ? DTRACEFLT_BADADDR :		\
436	((flags) & CPU_DTRACE_ILLOP) ? DTRACEFLT_ILLOP :		\
437	((flags) & CPU_DTRACE_DIVZERO) ? DTRACEFLT_DIVZERO :		\
438	((flags) & CPU_DTRACE_KPRIV) ? DTRACEFLT_KPRIV :		\
439	((flags) & CPU_DTRACE_UPRIV) ? DTRACEFLT_UPRIV :		\
440	((flags) & CPU_DTRACE_TUPOFLOW) ?  DTRACEFLT_TUPOFLOW :		\
441	((flags) & CPU_DTRACE_BADALIGN) ?  DTRACEFLT_BADALIGN :		\
442	((flags) & CPU_DTRACE_NOSCRATCH) ?  DTRACEFLT_NOSCRATCH :	\
443	((flags) & CPU_DTRACE_BADSTACK) ?  DTRACEFLT_BADSTACK :		\
444	DTRACEFLT_UNKNOWN)
445
446#define	DTRACEACT_ISSTRING(act)						\
447	((act)->dta_kind == DTRACEACT_DIFEXPR &&			\
448	(act)->dta_difo->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING)
449
450static size_t dtrace_strlen(const char *, size_t);
451static dtrace_probe_t *dtrace_probe_lookup_id(dtrace_id_t id);
452static void dtrace_enabling_provide(dtrace_provider_t *);
453static int dtrace_enabling_match(dtrace_enabling_t *, int *);
454static void dtrace_enabling_matchall(void);
455static dtrace_state_t *dtrace_anon_grab(void);
456static uint64_t dtrace_helper(int, dtrace_mstate_t *,
457    dtrace_state_t *, uint64_t, uint64_t);
458static dtrace_helpers_t *dtrace_helpers_create(proc_t *);
459static void dtrace_buffer_drop(dtrace_buffer_t *);
460static intptr_t dtrace_buffer_reserve(dtrace_buffer_t *, size_t, size_t,
461    dtrace_state_t *, dtrace_mstate_t *);
462static int dtrace_state_option(dtrace_state_t *, dtrace_optid_t,
463    dtrace_optval_t);
464static int dtrace_ecb_create_enable(dtrace_probe_t *, void *);
465static void dtrace_helper_provider_destroy(dtrace_helper_provider_t *);
466
467/*
468 * DTrace Probe Context Functions
469 *
470 * These functions are called from probe context.  Because probe context is
471 * any context in which C may be called, arbitrary locks may be held,
472 * interrupts may be disabled, we may be in arbitrary dispatched state, etc.
473 * As a result, functions called from probe context may only call other DTrace
474 * support functions -- they may not interact at all with the system at large.
475 * (Note that the ASSERT macro is made probe-context safe by redefining it in
476 * terms of dtrace_assfail(), a probe-context safe function.) If arbitrary
477 * loads are to be performed from probe context, they _must_ be in terms of
478 * the safe dtrace_load*() variants.
479 *
480 * Some functions in this block are not actually called from probe context;
481 * for these functions, there will be a comment above the function reading
482 * "Note:  not called from probe context."
483 */
484void
485dtrace_panic(const char *format, ...)
486{
487	va_list alist;
488
489	va_start(alist, format);
490	dtrace_vpanic(format, alist);
491	va_end(alist);
492}
493
494int
495dtrace_assfail(const char *a, const char *f, int l)
496{
497	dtrace_panic("assertion failed: %s, file: %s, line: %d", a, f, l);
498
499	/*
500	 * We just need something here that even the most clever compiler
501	 * cannot optimize away.
502	 */
503	return (a[(uintptr_t)f]);
504}
505
506/*
507 * Atomically increment a specified error counter from probe context.
508 */
509static void
510dtrace_error(uint32_t *counter)
511{
512	/*
513	 * Most counters stored to in probe context are per-CPU counters.
514	 * However, there are some error conditions that are sufficiently
515	 * arcane that they don't merit per-CPU storage.  If these counters
516	 * are incremented concurrently on different CPUs, scalability will be
517	 * adversely affected -- but we don't expect them to be white-hot in a
518	 * correctly constructed enabling...
519	 */
520	uint32_t oval, nval;
521
522	do {
523		oval = *counter;
524
525		if ((nval = oval + 1) == 0) {
526			/*
527			 * If the counter would wrap, set it to 1 -- assuring
528			 * that the counter is never zero when we have seen
529			 * errors.  (The counter must be 32-bits because we
530			 * aren't guaranteed a 64-bit compare&swap operation.)
531			 * To save this code both the infamy of being fingered
532			 * by a priggish news story and the indignity of being
533			 * the target of a neo-puritan witch trial, we're
534			 * carefully avoiding any colorful description of the
535			 * likelihood of this condition -- but suffice it to
536			 * say that it is only slightly more likely than the
537			 * overflow of predicate cache IDs, as discussed in
538			 * dtrace_predicate_create().
539			 */
540			nval = 1;
541		}
542	} while (dtrace_cas32(counter, oval, nval) != oval);
543}
544
545/*
546 * Use the DTRACE_LOADFUNC macro to define functions for each of loading a
547 * uint8_t, a uint16_t, a uint32_t and a uint64_t.
548 */
549DTRACE_LOADFUNC(8)
550DTRACE_LOADFUNC(16)
551DTRACE_LOADFUNC(32)
552DTRACE_LOADFUNC(64)
553
554static int
555dtrace_inscratch(uintptr_t dest, size_t size, dtrace_mstate_t *mstate)
556{
557	if (dest < mstate->dtms_scratch_base)
558		return (0);
559
560	if (dest + size < dest)
561		return (0);
562
563	if (dest + size > mstate->dtms_scratch_ptr)
564		return (0);
565
566	return (1);
567}
568
569static int
570dtrace_canstore_statvar(uint64_t addr, size_t sz,
571    dtrace_statvar_t **svars, int nsvars)
572{
573	int i;
574
575	for (i = 0; i < nsvars; i++) {
576		dtrace_statvar_t *svar = svars[i];
577
578		if (svar == NULL || svar->dtsv_size == 0)
579			continue;
580
581		if (DTRACE_INRANGE(addr, sz, svar->dtsv_data, svar->dtsv_size))
582			return (1);
583	}
584
585	return (0);
586}
587
588/*
589 * Check to see if the address is within a memory region to which a store may
590 * be issued.  This includes the DTrace scratch areas, and any DTrace variable
591 * region.  The caller of dtrace_canstore() is responsible for performing any
592 * alignment checks that are needed before stores are actually executed.
593 */
594static int
595dtrace_canstore(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
596    dtrace_vstate_t *vstate)
597{
598	/*
599	 * First, check to see if the address is in scratch space...
600	 */
601	if (DTRACE_INRANGE(addr, sz, mstate->dtms_scratch_base,
602	    mstate->dtms_scratch_size))
603		return (1);
604
605	/*
606	 * Now check to see if it's a dynamic variable.  This check will pick
607	 * up both thread-local variables and any global dynamically-allocated
608	 * variables.
609	 */
610	if (DTRACE_INRANGE(addr, sz, (uintptr_t)vstate->dtvs_dynvars.dtds_base,
611	    vstate->dtvs_dynvars.dtds_size)) {
612		dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
613		uintptr_t base = (uintptr_t)dstate->dtds_base +
614		    (dstate->dtds_hashsize * sizeof (dtrace_dynhash_t));
615		uintptr_t chunkoffs;
616
617		/*
618		 * Before we assume that we can store here, we need to make
619		 * sure that it isn't in our metadata -- storing to our
620		 * dynamic variable metadata would corrupt our state.  For
621		 * the range to not include any dynamic variable metadata,
622		 * it must:
623		 *
624		 *	(1) Start above the hash table that is at the base of
625		 *	the dynamic variable space
626		 *
627		 *	(2) Have a starting chunk offset that is beyond the
628		 *	dtrace_dynvar_t that is at the base of every chunk
629		 *
630		 *	(3) Not span a chunk boundary
631		 *
632		 */
633		if (addr < base)
634			return (0);
635
636		chunkoffs = (addr - base) % dstate->dtds_chunksize;
637
638		if (chunkoffs < sizeof (dtrace_dynvar_t))
639			return (0);
640
641		if (chunkoffs + sz > dstate->dtds_chunksize)
642			return (0);
643
644		return (1);
645	}
646
647	/*
648	 * Finally, check the static local and global variables.  These checks
649	 * take the longest, so we perform them last.
650	 */
651	if (dtrace_canstore_statvar(addr, sz,
652	    vstate->dtvs_locals, vstate->dtvs_nlocals))
653		return (1);
654
655	if (dtrace_canstore_statvar(addr, sz,
656	    vstate->dtvs_globals, vstate->dtvs_nglobals))
657		return (1);
658
659	return (0);
660}
661
662
663/*
664 * Convenience routine to check to see if the address is within a memory
665 * region in which a load may be issued given the user's privilege level;
666 * if not, it sets the appropriate error flags and loads 'addr' into the
667 * illegal value slot.
668 *
669 * DTrace subroutines (DIF_SUBR_*) should use this helper to implement
670 * appropriate memory access protection.
671 */
672static int
673dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
674    dtrace_vstate_t *vstate)
675{
676	volatile uintptr_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
677
678	/*
679	 * If we hold the privilege to read from kernel memory, then
680	 * everything is readable.
681	 */
682	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
683		return (1);
684
685	/*
686	 * You can obviously read that which you can store.
687	 */
688	if (dtrace_canstore(addr, sz, mstate, vstate))
689		return (1);
690
691	/*
692	 * We're allowed to read from our own string table.
693	 */
694	if (DTRACE_INRANGE(addr, sz, (uintptr_t)mstate->dtms_difo->dtdo_strtab,
695	    mstate->dtms_difo->dtdo_strlen))
696		return (1);
697
698	DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV);
699	*illval = addr;
700	return (0);
701}
702
703/*
704 * Convenience routine to check to see if a given string is within a memory
705 * region in which a load may be issued given the user's privilege level;
706 * this exists so that we don't need to issue unnecessary dtrace_strlen()
707 * calls in the event that the user has all privileges.
708 */
709static int
710dtrace_strcanload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
711    dtrace_vstate_t *vstate)
712{
713	size_t strsz;
714
715	/*
716	 * If we hold the privilege to read from kernel memory, then
717	 * everything is readable.
718	 */
719	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
720		return (1);
721
722	strsz = 1 + dtrace_strlen((char *)(uintptr_t)addr, sz);
723	if (dtrace_canload(addr, strsz, mstate, vstate))
724		return (1);
725
726	return (0);
727}
728
729/*
730 * Convenience routine to check to see if a given variable is within a memory
731 * region in which a load may be issued given the user's privilege level.
732 */
733static int
734dtrace_vcanload(void *src, dtrace_diftype_t *type, dtrace_mstate_t *mstate,
735    dtrace_vstate_t *vstate)
736{
737	size_t sz;
738	ASSERT(type->dtdt_flags & DIF_TF_BYREF);
739
740	/*
741	 * If we hold the privilege to read from kernel memory, then
742	 * everything is readable.
743	 */
744	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
745		return (1);
746
747	if (type->dtdt_kind == DIF_TYPE_STRING)
748		sz = dtrace_strlen(src,
749		    vstate->dtvs_state->dts_options[DTRACEOPT_STRSIZE]) + 1;
750	else
751		sz = type->dtdt_size;
752
753	return (dtrace_canload((uintptr_t)src, sz, mstate, vstate));
754}
755
756/*
757 * Compare two strings using safe loads.
758 */
759static int
760dtrace_strncmp(char *s1, char *s2, size_t limit)
761{
762	uint8_t c1, c2;
763	volatile uint16_t *flags;
764
765	if (s1 == s2 || limit == 0)
766		return (0);
767
768	flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
769
770	do {
771		if (s1 == NULL) {
772			c1 = '\0';
773		} else {
774			c1 = dtrace_load8((uintptr_t)s1++);
775		}
776
777		if (s2 == NULL) {
778			c2 = '\0';
779		} else {
780			c2 = dtrace_load8((uintptr_t)s2++);
781		}
782
783		if (c1 != c2)
784			return (c1 - c2);
785	} while (--limit && c1 != '\0' && !(*flags & CPU_DTRACE_FAULT));
786
787	return (0);
788}
789
790/*
791 * Compute strlen(s) for a string using safe memory accesses.  The additional
792 * lim parameter is used to specify a maximum length to ensure completion.
793 */
794static size_t
795dtrace_strlen(const char *s, size_t lim)
796{
797	uint_t len;
798
799	for (len = 0; len != lim; len++) {
800		if (dtrace_load8((uintptr_t)s++) == '\0')
801			break;
802	}
803
804	return (len);
805}
806
807/*
808 * Check if an address falls within a toxic region.
809 */
810static int
811dtrace_istoxic(uintptr_t kaddr, size_t size)
812{
813	uintptr_t taddr, tsize;
814	int i;
815
816	for (i = 0; i < dtrace_toxranges; i++) {
817		taddr = dtrace_toxrange[i].dtt_base;
818		tsize = dtrace_toxrange[i].dtt_limit - taddr;
819
820		if (kaddr - taddr < tsize) {
821			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
822			cpu_core[CPU->cpu_id].cpuc_dtrace_illval = kaddr;
823			return (1);
824		}
825
826		if (taddr - kaddr < size) {
827			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
828			cpu_core[CPU->cpu_id].cpuc_dtrace_illval = taddr;
829			return (1);
830		}
831	}
832
833	return (0);
834}
835
836/*
837 * Copy src to dst using safe memory accesses.  The src is assumed to be unsafe
838 * memory specified by the DIF program.  The dst is assumed to be safe memory
839 * that we can store to directly because it is managed by DTrace.  As with
840 * standard bcopy, overlapping copies are handled properly.
841 */
842static void
843dtrace_bcopy(const void *src, void *dst, size_t len)
844{
845	if (len != 0) {
846		uint8_t *s1 = dst;
847		const uint8_t *s2 = src;
848
849		if (s1 <= s2) {
850			do {
851				*s1++ = dtrace_load8((uintptr_t)s2++);
852			} while (--len != 0);
853		} else {
854			s2 += len;
855			s1 += len;
856
857			do {
858				*--s1 = dtrace_load8((uintptr_t)--s2);
859			} while (--len != 0);
860		}
861	}
862}
863
864/*
865 * Copy src to dst using safe memory accesses, up to either the specified
866 * length, or the point that a nul byte is encountered.  The src is assumed to
867 * be unsafe memory specified by the DIF program.  The dst is assumed to be
868 * safe memory that we can store to directly because it is managed by DTrace.
869 * Unlike dtrace_bcopy(), overlapping regions are not handled.
870 */
871static void
872dtrace_strcpy(const void *src, void *dst, size_t len)
873{
874	if (len != 0) {
875		uint8_t *s1 = dst, c;
876		const uint8_t *s2 = src;
877
878		do {
879			*s1++ = c = dtrace_load8((uintptr_t)s2++);
880		} while (--len != 0 && c != '\0');
881	}
882}
883
884/*
885 * Copy src to dst, deriving the size and type from the specified (BYREF)
886 * variable type.  The src is assumed to be unsafe memory specified by the DIF
887 * program.  The dst is assumed to be DTrace variable memory that is of the
888 * specified type; we assume that we can store to it directly.
889 */
890static void
891dtrace_vcopy(void *src, void *dst, dtrace_diftype_t *type)
892{
893	ASSERT(type->dtdt_flags & DIF_TF_BYREF);
894
895	if (type->dtdt_kind == DIF_TYPE_STRING) {
896		dtrace_strcpy(src, dst, type->dtdt_size);
897	} else {
898		dtrace_bcopy(src, dst, type->dtdt_size);
899	}
900}
901
902/*
903 * Compare s1 to s2 using safe memory accesses.  The s1 data is assumed to be
904 * unsafe memory specified by the DIF program.  The s2 data is assumed to be
905 * safe memory that we can access directly because it is managed by DTrace.
906 */
907static int
908dtrace_bcmp(const void *s1, const void *s2, size_t len)
909{
910	volatile uint16_t *flags;
911
912	flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
913
914	if (s1 == s2)
915		return (0);
916
917	if (s1 == NULL || s2 == NULL)
918		return (1);
919
920	if (s1 != s2 && len != 0) {
921		const uint8_t *ps1 = s1;
922		const uint8_t *ps2 = s2;
923
924		do {
925			if (dtrace_load8((uintptr_t)ps1++) != *ps2++)
926				return (1);
927		} while (--len != 0 && !(*flags & CPU_DTRACE_FAULT));
928	}
929	return (0);
930}
931
932/*
933 * Zero the specified region using a simple byte-by-byte loop.  Note that this
934 * is for safe DTrace-managed memory only.
935 */
936static void
937dtrace_bzero(void *dst, size_t len)
938{
939	uchar_t *cp;
940
941	for (cp = dst; len != 0; len--)
942		*cp++ = 0;
943}
944
945static void
946dtrace_add_128(uint64_t *addend1, uint64_t *addend2, uint64_t *sum)
947{
948	uint64_t result[2];
949
950	result[0] = addend1[0] + addend2[0];
951	result[1] = addend1[1] + addend2[1] +
952	    (result[0] < addend1[0] || result[0] < addend2[0] ? 1 : 0);
953
954	sum[0] = result[0];
955	sum[1] = result[1];
956}
957
958/*
959 * Shift the 128-bit value in a by b. If b is positive, shift left.
960 * If b is negative, shift right.
961 */
962static void
963dtrace_shift_128(uint64_t *a, int b)
964{
965	uint64_t mask;
966
967	if (b == 0)
968		return;
969
970	if (b < 0) {
971		b = -b;
972		if (b >= 64) {
973			a[0] = a[1] >> (b - 64);
974			a[1] = 0;
975		} else {
976			a[0] >>= b;
977			mask = 1LL << (64 - b);
978			mask -= 1;
979			a[0] |= ((a[1] & mask) << (64 - b));
980			a[1] >>= b;
981		}
982	} else {
983		if (b >= 64) {
984			a[1] = a[0] << (b - 64);
985			a[0] = 0;
986		} else {
987			a[1] <<= b;
988			mask = a[0] >> (64 - b);
989			a[1] |= mask;
990			a[0] <<= b;
991		}
992	}
993}
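/*
 * For example, shifting the 128-bit value { a[0] = 0, a[1] = 1 } (that is,
 * 2^64) by -32 yields { a[0] = 1ULL << 32, a[1] = 0 } (that is, 2^32).
 */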
994
995/*
996 * The basic idea is to break the 2 64-bit values into 4 32-bit values,
997 * use native multiplication on those, and then re-combine into the
998 * resulting 128-bit value.
999 *
1000 * (hi1 << 32 + lo1) * (hi2 << 32 + lo2) =
1001 *     hi1 * hi2 << 64 +
1002 *     hi1 * lo2 << 32 +
1003 *     hi2 * lo1 << 32 +
1004 *     lo1 * lo2
1005 */
1006static void
1007dtrace_multiply_128(uint64_t factor1, uint64_t factor2, uint64_t *product)
1008{
1009	uint64_t hi1, hi2, lo1, lo2;
1010	uint64_t tmp[2];
1011
1012	hi1 = factor1 >> 32;
1013	hi2 = factor2 >> 32;
1014
1015	lo1 = factor1 & DT_MASK_LO;
1016	lo2 = factor2 & DT_MASK_LO;
1017
1018	product[0] = lo1 * lo2;
1019	product[1] = hi1 * hi2;
1020
1021	tmp[0] = hi1 * lo2;
1022	tmp[1] = 0;
1023	dtrace_shift_128(tmp, 32);
1024	dtrace_add_128(product, tmp, product);
1025
1026	tmp[0] = hi2 * lo1;
1027	tmp[1] = 0;
1028	dtrace_shift_128(tmp, 32);
1029	dtrace_add_128(product, tmp, product);
1030}
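/*
 * As a quick sanity check of the 128-bit helpers (an illustrative sketch
 * guarded by a hypothetical DTRACE_128_EXAMPLE define, not part of the
 * driver proper):  squaring UINT64_MAX must yield the 128-bit value
 * 0xfffffffffffffffe0000000000000001 -- a high word of 0xfffffffffffffffe
 * and a low word of 1.
 */
#ifdef DTRACE_128_EXAMPLE
static void
dtrace_multiply_128_check(void)
{
	uint64_t product[2];

	/*
	 * product[0] receives the low 64 bits; product[1] the high 64 bits.
	 */
	dtrace_multiply_128(0xffffffffffffffffULL, 0xffffffffffffffffULL,
	    product);
	ASSERT(product[0] == 1ULL);
	ASSERT(product[1] == 0xfffffffffffffffeULL);
}
#endif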
1031
1032/*
1033 * This privilege check should be used by actions and subroutines to
1034 * verify that the user credentials of the process that enabled the
1035 * invoking ECB match the target credentials.
1036 */
1037static int
1038dtrace_priv_proc_common_user(dtrace_state_t *state)
1039{
1040	cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
1041
1042	/*
1043	 * We should always have a non-NULL state cred here, since if cred
1044	 * is null (anonymous tracing), we fast-path bypass this routine.
1045	 */
1046	ASSERT(s_cr != NULL);
1047
1048	if ((cr = CRED()) != NULL &&
1049	    s_cr->cr_uid == cr->cr_uid &&
1050	    s_cr->cr_uid == cr->cr_ruid &&
1051	    s_cr->cr_uid == cr->cr_suid &&
1052	    s_cr->cr_gid == cr->cr_gid &&
1053	    s_cr->cr_gid == cr->cr_rgid &&
1054	    s_cr->cr_gid == cr->cr_sgid)
1055		return (1);
1056
1057	return (0);
1058}
1059
1060/*
1061 * This privilege check should be used by actions and subroutines to
1062 * verify that the zone of the process that enabled the invoking ECB
1063 * matches the target credentials.
1064 */
1065static int
1066dtrace_priv_proc_common_zone(dtrace_state_t *state)
1067{
1068	cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
1069
1070	/*
1071	 * We should always have a non-NULL state cred here, since if cred
1072	 * is null (anonymous tracing), we fast-path bypass this routine.
1073	 */
1074	ASSERT(s_cr != NULL);
1075
1076	if ((cr = CRED()) != NULL &&
1077	    s_cr->cr_zone == cr->cr_zone)
1078		return (1);
1079
1080	return (0);
1081}
1082
1083/*
1084 * This privilege check should be used by actions and subroutines to
1085 * verify that the process has not setuid or otherwise changed credentials.
1086 */
1087static int
1088dtrace_priv_proc_common_nocd()
1089{
1090	proc_t *proc;
1091
1092	if ((proc = ttoproc(curthread)) != NULL &&
1093	    !(proc->p_flag & SNOCD))
1094		return (1);
1095
1096	return (0);
1097}
1098
1099static int
1100dtrace_priv_proc_destructive(dtrace_state_t *state)
1101{
1102	int action = state->dts_cred.dcr_action;
1103
1104	if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE) == 0) &&
1105	    dtrace_priv_proc_common_zone(state) == 0)
1106		goto bad;
1107
1108	if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER) == 0) &&
1109	    dtrace_priv_proc_common_user(state) == 0)
1110		goto bad;
1111
1112	if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG) == 0) &&
1113	    dtrace_priv_proc_common_nocd() == 0)
1114		goto bad;
1115
1116	return (1);
1117
1118bad:
1119	cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1120
1121	return (0);
1122}
1123
1124static int
1125dtrace_priv_proc_control(dtrace_state_t *state)
1126{
1127	if (state->dts_cred.dcr_action & DTRACE_CRA_PROC_CONTROL)
1128		return (1);
1129
1130	if (dtrace_priv_proc_common_zone(state) &&
1131	    dtrace_priv_proc_common_user(state) &&
1132	    dtrace_priv_proc_common_nocd())
1133		return (1);
1134
1135	cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1136
1137	return (0);
1138}
1139
1140static int
1141dtrace_priv_proc(dtrace_state_t *state)
1142{
1143	if (state->dts_cred.dcr_action & DTRACE_CRA_PROC)
1144		return (1);
1145
1146	cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1147
1148	return (0);
1149}
1150
1151static int
1152dtrace_priv_kernel(dtrace_state_t *state)
1153{
1154	if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL)
1155		return (1);
1156
1157	cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
1158
1159	return (0);
1160}
1161
1162static int
1163dtrace_priv_kernel_destructive(dtrace_state_t *state)
1164{
1165	if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL_DESTRUCTIVE)
1166		return (1);
1167
1168	cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
1169
1170	return (0);
1171}
1172
1173/*
1174 * Note:  not called from probe context.  This function is called
1175 * asynchronously (and at a regular interval) from outside of probe context to
1176 * clean the dirty dynamic variable lists on all CPUs.  Dynamic variable
1177 * cleaning is explained in detail in <sys/dtrace_impl.h>.
1178 */
1179void
1180dtrace_dynvar_clean(dtrace_dstate_t *dstate)
1181{
1182	dtrace_dynvar_t *dirty;
1183	dtrace_dstate_percpu_t *dcpu;
1184	int i, work = 0;
1185
1186	for (i = 0; i < NCPU; i++) {
1187		dcpu = &dstate->dtds_percpu[i];
1188
1189		ASSERT(dcpu->dtdsc_rinsing == NULL);
1190
1191		/*
1192		 * If the dirty list is NULL, there is no dirty work to do.
1193		 */
1194		if (dcpu->dtdsc_dirty == NULL)
1195			continue;
1196
1197		/*
1198		 * If the clean list is non-NULL, then we're not going to do
1199		 * any work for this CPU -- it means that there has not been
1200		 * a dtrace_dynvar() allocation on this CPU (or from this CPU)
1201		 * since the last time we cleaned house.
1202		 */
1203		if (dcpu->dtdsc_clean != NULL)
1204			continue;
1205
1206		work = 1;
1207
1208		/*
1209		 * Atomically move the dirty list aside.
1210		 */
1211		do {
1212			dirty = dcpu->dtdsc_dirty;
1213
1214			/*
1215			 * Before we zap the dirty list, set the rinsing list.
1216			 * (This allows for a potential assertion in
1217			 * dtrace_dynvar():  if a free dynamic variable appears
1218			 * on a hash chain, either the dirty list or the
1219			 * rinsing list for some CPU must be non-NULL.)
1220			 */
1221			dcpu->dtdsc_rinsing = dirty;
1222			dtrace_membar_producer();
1223		} while (dtrace_casptr(&dcpu->dtdsc_dirty,
1224		    dirty, NULL) != dirty);
1225	}
1226
1227	if (!work) {
1228		/*
1229		 * We have no work to do; we can simply return.
1230		 */
1231		return;
1232	}
1233
1234	dtrace_sync();
1235
1236	for (i = 0; i < NCPU; i++) {
1237		dcpu = &dstate->dtds_percpu[i];
1238
1239		if (dcpu->dtdsc_rinsing == NULL)
1240			continue;
1241
1242		/*
1243		 * We are now guaranteed that no hash chain contains a pointer
1244		 * into this dirty list; we can make it clean.
1245		 */
1246		ASSERT(dcpu->dtdsc_clean == NULL);
1247		dcpu->dtdsc_clean = dcpu->dtdsc_rinsing;
1248		dcpu->dtdsc_rinsing = NULL;
1249	}
1250
1251	/*
1252	 * Before we actually set the state to be DTRACE_DSTATE_CLEAN, make
1253	 * sure that all CPUs have seen all of the dtdsc_clean pointers.
1254	 * This prevents a race whereby a CPU incorrectly decides that
1255	 * the state should be something other than DTRACE_DSTATE_CLEAN
1256	 * after dtrace_dynvar_clean() has completed.
1257	 */
1258	dtrace_sync();
1259
1260	dstate->dtds_state = DTRACE_DSTATE_CLEAN;
1261}
1262
1263/*
1264 * Depending on the value of the op parameter, this function looks up,
1265 * allocates or deallocates an arbitrarily-keyed dynamic variable.  If an
1266 * allocation is requested, this function will return a pointer to a
1267 * dtrace_dynvar_t corresponding to the allocated variable -- or NULL if no
1268 * variable can be allocated.  If NULL is returned, the appropriate counter
1269 * will be incremented.
1270 */
1271dtrace_dynvar_t *
1272dtrace_dynvar(dtrace_dstate_t *dstate, uint_t nkeys,
1273    dtrace_key_t *key, size_t dsize, dtrace_dynvar_op_t op,
1274    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
1275{
1276	uint64_t hashval = DTRACE_DYNHASH_VALID;
1277	dtrace_dynhash_t *hash = dstate->dtds_hash;
1278	dtrace_dynvar_t *free, *new_free, *next, *dvar, *start, *prev = NULL;
1279	processorid_t me = CPU->cpu_id, cpu = me;
1280	dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[me];
1281	size_t bucket, ksize;
1282	size_t chunksize = dstate->dtds_chunksize;
1283	uintptr_t kdata, lock, nstate;
1284	uint_t i;
1285
1286	ASSERT(nkeys != 0);
1287
1288	/*
1289	 * Hash the key.  As with aggregations, we use Jenkins' "One-at-a-time"
1290	 * algorithm.  For the by-value portions, we perform the algorithm in
1291	 * 16-bit chunks (as opposed to 8-bit chunks).  This speeds things up a
1292	 * bit, and seems to have only a minute effect on distribution.  For
1293	 * the by-reference data, we perform "One-at-a-time" iterating (safely)
1294	 * over each referenced byte.  It's painful to do this, but it's much
1295	 * better than pathological hash distribution.  The efficacy of the
1296	 * hashing algorithm (and a comparison with other algorithms) may be
1297	 * found by running the ::dtrace_dynstat MDB dcmd.
1298	 */
1299	for (i = 0; i < nkeys; i++) {
1300		if (key[i].dttk_size == 0) {
1301			uint64_t val = key[i].dttk_value;
1302
1303			hashval += (val >> 48) & 0xffff;
1304			hashval += (hashval << 10);
1305			hashval ^= (hashval >> 6);
1306
1307			hashval += (val >> 32) & 0xffff;
1308			hashval += (hashval << 10);
1309			hashval ^= (hashval >> 6);
1310
1311			hashval += (val >> 16) & 0xffff;
1312			hashval += (hashval << 10);
1313			hashval ^= (hashval >> 6);
1314
1315			hashval += val & 0xffff;
1316			hashval += (hashval << 10);
1317			hashval ^= (hashval >> 6);
1318		} else {
1319			/*
1320			 * This is incredibly painful, but it beats the hell
1321			 * out of the alternative.
1322			 */
1323			uint64_t j, size = key[i].dttk_size;
1324			uintptr_t base = (uintptr_t)key[i].dttk_value;
1325
1326			if (!dtrace_canload(base, size, mstate, vstate))
1327				break;
1328
1329			for (j = 0; j < size; j++) {
1330				hashval += dtrace_load8(base + j);
1331				hashval += (hashval << 10);
1332				hashval ^= (hashval >> 6);
1333			}
1334		}
1335	}
1336
1337	if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_FAULT))
1338		return (NULL);
1339
1340	hashval += (hashval << 3);
1341	hashval ^= (hashval >> 11);
1342	hashval += (hashval << 15);
1343
1344	/*
1345	 * There is a remote chance (ideally, 1 in 2^31) that our hashval
1346	 * comes out to be one of our two sentinel hash values.  If this
1347	 * actually happens, we set the hashval to be a value known to be a
1348	 * non-sentinel value.
1349	 */
1350	if (hashval == DTRACE_DYNHASH_FREE || hashval == DTRACE_DYNHASH_SINK)
1351		hashval = DTRACE_DYNHASH_VALID;
1352
1353	/*
1354	 * Yes, it's painful to do a divide here.  If the cycle count becomes
1355	 * important here, tricks can be pulled to reduce it.  (However, it's
1356	 * critical that hash collisions be kept to an absolute minimum;
1357	 * they're much more painful than a divide.)  It's better to have a
1358	 * solution that generates few collisions and still keeps things
1359	 * relatively simple.
1360	 */
1361	bucket = hashval % dstate->dtds_hashsize;
1362
1363	if (op == DTRACE_DYNVAR_DEALLOC) {
1364		volatile uintptr_t *lockp = &hash[bucket].dtdh_lock;
1365
1366		for (;;) {
1367			while ((lock = *lockp) & 1)
1368				continue;
1369
1370			if (dtrace_casptr((void *)lockp,
1371			    (void *)lock, (void *)(lock + 1)) == (void *)lock)
1372				break;
1373		}
1374
1375		dtrace_membar_producer();
1376	}
1377
1378top:
1379	prev = NULL;
1380	lock = hash[bucket].dtdh_lock;
1381
1382	dtrace_membar_consumer();
1383
1384	start = hash[bucket].dtdh_chain;
1385	ASSERT(start != NULL && (start->dtdv_hashval == DTRACE_DYNHASH_SINK ||
1386	    start->dtdv_hashval != DTRACE_DYNHASH_FREE ||
1387	    op != DTRACE_DYNVAR_DEALLOC));
1388
1389	for (dvar = start; dvar != NULL; dvar = dvar->dtdv_next) {
1390		dtrace_tuple_t *dtuple = &dvar->dtdv_tuple;
1391		dtrace_key_t *dkey = &dtuple->dtt_key[0];
1392
1393		if (dvar->dtdv_hashval != hashval) {
1394			if (dvar->dtdv_hashval == DTRACE_DYNHASH_SINK) {
1395				/*
1396				 * We've reached the sink, and therefore the
1397				 * end of the hash chain; we can kick out of
1398				 * the loop knowing that we have seen a valid
1399				 * snapshot of state.
1400				 */
1401				ASSERT(dvar->dtdv_next == NULL);
1402				ASSERT(dvar == &dtrace_dynhash_sink);
1403				break;
1404			}
1405
1406			if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE) {
1407				/*
1408				 * We've gone off the rails:  somewhere along
1409				 * the line, one of the members of this hash
1410				 * chain was deleted.  Note that we could also
1411				 * detect this by simply letting this loop run
1412				 * to completion, as we would eventually hit
1413				 * the end of the dirty list.  However, we
1414				 * want to avoid running the length of the
1415				 * dirty list unnecessarily (it might be quite
1416				 * long), so we catch this as early as
1417				 * possible by detecting the hash marker.  In
1418				 * this case, we simply set dvar to NULL and
1419				 * break; the conditional after the loop will
1420				 * send us back to top.
1421				 */
1422				dvar = NULL;
1423				break;
1424			}
1425
1426			goto next;
1427		}
1428
1429		if (dtuple->dtt_nkeys != nkeys)
1430			goto next;
1431
1432		for (i = 0; i < nkeys; i++, dkey++) {
1433			if (dkey->dttk_size != key[i].dttk_size)
1434				goto next; /* size or type mismatch */
1435
1436			if (dkey->dttk_size != 0) {
1437				if (dtrace_bcmp(
1438				    (void *)(uintptr_t)key[i].dttk_value,
1439				    (void *)(uintptr_t)dkey->dttk_value,
1440				    dkey->dttk_size))
1441					goto next;
1442			} else {
1443				if (dkey->dttk_value != key[i].dttk_value)
1444					goto next;
1445			}
1446		}
1447
1448		if (op != DTRACE_DYNVAR_DEALLOC)
1449			return (dvar);
1450
1451		ASSERT(dvar->dtdv_next == NULL ||
1452		    dvar->dtdv_next->dtdv_hashval != DTRACE_DYNHASH_FREE);
1453
1454		if (prev != NULL) {
1455			ASSERT(hash[bucket].dtdh_chain != dvar);
1456			ASSERT(start != dvar);
1457			ASSERT(prev->dtdv_next == dvar);
1458			prev->dtdv_next = dvar->dtdv_next;
1459		} else {
1460			if (dtrace_casptr(&hash[bucket].dtdh_chain,
1461			    start, dvar->dtdv_next) != start) {
1462				/*
1463				 * We have failed to atomically swing the
1464				 * hash table head pointer, presumably because
1465				 * of a conflicting allocation on another CPU.
1466				 * We need to reread the hash chain and try
1467				 * again.
1468				 */
1469				goto top;
1470			}
1471		}
1472
1473		dtrace_membar_producer();
1474
1475		/*
1476		 * Now set the hash value to indicate that it's free.
1477		 */
1478		ASSERT(hash[bucket].dtdh_chain != dvar);
1479		dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
1480
1481		dtrace_membar_producer();
1482
1483		/*
1484		 * Set the next pointer to point at the dirty list, and
1485		 * atomically swing the dirty pointer to the newly freed dvar.
1486		 */
1487		do {
1488			next = dcpu->dtdsc_dirty;
1489			dvar->dtdv_next = next;
1490		} while (dtrace_casptr(&dcpu->dtdsc_dirty, next, dvar) != next);
1491
1492		/*
1493		 * Finally, unlock this hash bucket.
1494		 */
1495		ASSERT(hash[bucket].dtdh_lock == lock);
1496		ASSERT(lock & 1);
1497		hash[bucket].dtdh_lock++;
1498
1499		return (NULL);
1500next:
1501		prev = dvar;
1502		continue;
1503	}
1504
1505	if (dvar == NULL) {
1506		/*
1507		 * If dvar is NULL, it is because we went off the rails:
1508		 * one of the elements that we traversed in the hash chain
1509		 * was deleted while we were traversing it.  In this case,
1510		 * we assert that we aren't doing a dealloc (deallocs lock
1511		 * the hash bucket to prevent themselves from racing with
1512		 * one another), and retry the hash chain traversal.
1513		 */
1514		ASSERT(op != DTRACE_DYNVAR_DEALLOC);
1515		goto top;
1516	}
1517
1518	if (op != DTRACE_DYNVAR_ALLOC) {
1519		/*
1520		 * If we are not to allocate a new variable, we want to
1521		 * return NULL now.  Before we return, check that the value
1522		 * of the lock word hasn't changed.  If it has, we may have
1523		 * seen an inconsistent snapshot.
1524		 */
1525		if (op == DTRACE_DYNVAR_NOALLOC) {
1526			if (hash[bucket].dtdh_lock != lock)
1527				goto top;
1528		} else {
1529			ASSERT(op == DTRACE_DYNVAR_DEALLOC);
1530			ASSERT(hash[bucket].dtdh_lock == lock);
1531			ASSERT(lock & 1);
1532			hash[bucket].dtdh_lock++;
1533		}
1534
1535		return (NULL);
1536	}
1537
1538	/*
1539	 * We need to allocate a new dynamic variable.  The size we need is the
1540	 * size of dtrace_dynvar plus the size of nkeys dtrace_key_t's plus the
1541	 * size of any auxiliary key data (rounded up to 8-byte alignment) plus
1542	 * the size of any referred-to data (dsize).  We then round the final
1543	 * size up to the chunksize for allocation.
1544	 */
1545	for (ksize = 0, i = 0; i < nkeys; i++)
1546		ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
1547
1548	/*
1549	 * This should be pretty much impossible, but could happen if, say,
1550	 * strange DIF specified the tuple.  Ideally, this should be an
1551	 * assertion and not an error condition -- but that requires that the
1552	 * chunksize calculation in dtrace_difo_chunksize() be absolutely
1553	 * bullet-proof.  (That is, it must not be able to be fooled by
1554	 * malicious DIF.)  Given the lack of backwards branches in DIF,
1555	 * solving this would presumably not amount to solving the Halting
1556	 * Problem -- but it still seems awfully hard.
1557	 */
1558	if (sizeof (dtrace_dynvar_t) + sizeof (dtrace_key_t) * (nkeys - 1) +
1559	    ksize + dsize > chunksize) {
1560		dcpu->dtdsc_drops++;
1561		return (NULL);
1562	}
1563
1564	nstate = DTRACE_DSTATE_EMPTY;
1565
1566	do {
1567retry:
1568		free = dcpu->dtdsc_free;
1569
1570		if (free == NULL) {
1571			dtrace_dynvar_t *clean = dcpu->dtdsc_clean;
1572			void *rval;
1573
1574			if (clean == NULL) {
1575				/*
1576				 * We're out of dynamic variable space on
1577				 * this CPU.  Unless we have tried all CPUs,
1578				 * we'll try to allocate from a different
1579				 * CPU.
1580				 */
1581				switch (dstate->dtds_state) {
1582				case DTRACE_DSTATE_CLEAN: {
1583					void *sp = &dstate->dtds_state;
1584
1585					if (++cpu >= NCPU)
1586						cpu = 0;
1587
1588					if (dcpu->dtdsc_dirty != NULL &&
1589					    nstate == DTRACE_DSTATE_EMPTY)
1590						nstate = DTRACE_DSTATE_DIRTY;
1591
1592					if (dcpu->dtdsc_rinsing != NULL)
1593						nstate = DTRACE_DSTATE_RINSING;
1594
1595					dcpu = &dstate->dtds_percpu[cpu];
1596
1597					if (cpu != me)
1598						goto retry;
1599
1600					(void) dtrace_cas32(sp,
1601					    DTRACE_DSTATE_CLEAN, nstate);
1602
1603					/*
1604					 * To increment the correct bean
1605					 * counter, take another lap.
1606					 */
1607					goto retry;
1608				}
1609
1610				case DTRACE_DSTATE_DIRTY:
1611					dcpu->dtdsc_dirty_drops++;
1612					break;
1613
1614				case DTRACE_DSTATE_RINSING:
1615					dcpu->dtdsc_rinsing_drops++;
1616					break;
1617
1618				case DTRACE_DSTATE_EMPTY:
1619					dcpu->dtdsc_drops++;
1620					break;
1621				}
1622
1623				DTRACE_CPUFLAG_SET(CPU_DTRACE_DROP);
1624				return (NULL);
1625			}
1626
1627			/*
1628			 * The clean list appears to be non-empty.  We want to
1629			 * move the clean list to the free list; we start by
1630			 * moving the clean pointer aside.
1631			 */
1632			if (dtrace_casptr(&dcpu->dtdsc_clean,
1633			    clean, NULL) != clean) {
1634				/*
1635				 * We are in one of two situations:
1636				 *
1637				 *  (a)	The clean list was switched to the
1638				 *	free list by another CPU.
1639				 *
1640				 *  (b)	The clean list was added to by the
1641				 *	cleansing cyclic.
1642				 *
1643				 * In either of these situations, we can
1644				 * just reattempt the free list allocation.
1645				 */
1646				goto retry;
1647			}
1648
1649			ASSERT(clean->dtdv_hashval == DTRACE_DYNHASH_FREE);
1650
1651			/*
1652			 * Now we'll move the clean list to the free list.
1653			 * It's impossible for this to fail:  the only way
1654			 * the free list can be updated is through this
1655			 * code path, and only one CPU can own the clean list.
1656			 * Thus, it would only be possible for this to fail if
1657			 * this code were racing with dtrace_dynvar_clean().
1658			 * (That is, if dtrace_dynvar_clean() updated the clean
1659			 * list, and we ended up racing to update the free
1660			 * list.)  This race is prevented by the dtrace_sync()
1661			 * in dtrace_dynvar_clean() -- which flushes the
1662			 * owners of the clean lists out before resetting
1663			 * the clean lists.
1664			 */
1665			rval = dtrace_casptr(&dcpu->dtdsc_free, NULL, clean);
1666			ASSERT(rval == NULL);
1667			goto retry;
1668		}
1669
1670		dvar = free;
1671		new_free = dvar->dtdv_next;
1672	} while (dtrace_casptr(&dcpu->dtdsc_free, free, new_free) != free);
1673
1674	/*
1675	 * We have now allocated a new chunk.  We copy the tuple keys into the
1676	 * tuple array and copy any referenced key data into the data space
1677	 * following the tuple array.  As we do this, we relocate dttk_value
1678	 * in the final tuple to point to the key data address in the chunk.
1679	 */
1680	kdata = (uintptr_t)&dvar->dtdv_tuple.dtt_key[nkeys];
1681	dvar->dtdv_data = (void *)(kdata + ksize);
1682	dvar->dtdv_tuple.dtt_nkeys = nkeys;
1683
1684	for (i = 0; i < nkeys; i++) {
1685		dtrace_key_t *dkey = &dvar->dtdv_tuple.dtt_key[i];
1686		size_t kesize = key[i].dttk_size;
1687
1688		if (kesize != 0) {
1689			dtrace_bcopy(
1690			    (const void *)(uintptr_t)key[i].dttk_value,
1691			    (void *)kdata, kesize);
1692			dkey->dttk_value = kdata;
1693			kdata += P2ROUNDUP(kesize, sizeof (uint64_t));
1694		} else {
1695			dkey->dttk_value = key[i].dttk_value;
1696		}
1697
1698		dkey->dttk_size = kesize;
1699	}
1700
1701	ASSERT(dvar->dtdv_hashval == DTRACE_DYNHASH_FREE);
1702	dvar->dtdv_hashval = hashval;
1703	dvar->dtdv_next = start;
1704
1705	if (dtrace_casptr(&hash[bucket].dtdh_chain, start, dvar) == start)
1706		return (dvar);
1707
1708	/*
1709	 * The cas has failed.  Either another CPU is adding an element to
1710	 * this hash chain, or another CPU is deleting an element from this
1711	 * hash chain.  The simplest way to deal with both of these cases
1712	 * (though not necessarily the most efficient) is to free our
1713	 * allocated block and tail-call ourselves.  Note that the free is
1714	 * to the dirty list and _not_ to the free list.  This is to prevent
1715	 * races with allocators, above.
1716	 */
1717	dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
1718
1719	dtrace_membar_producer();
1720
1721	do {
1722		free = dcpu->dtdsc_dirty;
1723		dvar->dtdv_next = free;
1724	} while (dtrace_casptr(&dcpu->dtdsc_dirty, free, dvar) != free);
1725
1726	return (dtrace_dynvar(dstate, nkeys, key, dsize, op, mstate, vstate));
1727}
1728
1729/*ARGSUSED*/
1730static void
1731dtrace_aggregate_min(uint64_t *oval, uint64_t nval, uint64_t arg)
1732{
1733	if ((int64_t)nval < (int64_t)*oval)
1734		*oval = nval;
1735}
1736
1737/*ARGSUSED*/
1738static void
1739dtrace_aggregate_max(uint64_t *oval, uint64_t nval, uint64_t arg)
1740{
1741	if ((int64_t)nval > (int64_t)*oval)
1742		*oval = nval;
1743}
1744
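/*
 * The quantize() buckets are bounded by successive powers of two (..., -4,
 * -2, -1, 0, 1, 2, 4, 8, ...).  For example, a value of 6 falls below the
 * first bucket value that exceeds it (8) and is therefore tallied in the
 * bucket for 4 -- that is, the range [4, 8).  Negative values are handled
 * symmetrically on the other side of the zero bucket.
 */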
1745static void
1746dtrace_aggregate_quantize(uint64_t *quanta, uint64_t nval, uint64_t incr)
1747{
1748	int i, zero = DTRACE_QUANTIZE_ZEROBUCKET;
1749	int64_t val = (int64_t)nval;
1750
1751	if (val < 0) {
1752		for (i = 0; i < zero; i++) {
1753			if (val <= DTRACE_QUANTIZE_BUCKETVAL(i)) {
1754				quanta[i] += incr;
1755				return;
1756			}
1757		}
1758	} else {
1759		for (i = zero + 1; i < DTRACE_QUANTIZE_NBUCKETS; i++) {
1760			if (val < DTRACE_QUANTIZE_BUCKETVAL(i)) {
1761				quanta[i - 1] += incr;
1762				return;
1763			}
1764		}
1765
1766		quanta[DTRACE_QUANTIZE_NBUCKETS - 1] += incr;
1767		return;
1768	}
1769
1770	ASSERT(0);
1771}
1772
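/*
 * The first uint64_t of an lquantize() buffer encodes the base, step and
 * level count; the words that follow are the underflow bucket, the "levels"
 * linear buckets, and the overflow bucket, in that order.  For example, with
 * a base of 0, a step of 10 and 10 levels, a value of 37 yields level
 * (37 - 0) / 10 = 3 and is tallied in lquanta[4] -- the bucket for [30, 40).
 */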
1773static void
1774dtrace_aggregate_lquantize(uint64_t *lquanta, uint64_t nval, uint64_t incr)
1775{
1776	uint64_t arg = *lquanta++;
1777	int32_t base = DTRACE_LQUANTIZE_BASE(arg);
1778	uint16_t step = DTRACE_LQUANTIZE_STEP(arg);
1779	uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg);
1780	int32_t val = (int32_t)nval, level;
1781
1782	ASSERT(step != 0);
1783	ASSERT(levels != 0);
1784
1785	if (val < base) {
1786		/*
1787		 * This is an underflow.
1788		 */
1789		lquanta[0] += incr;
1790		return;
1791	}
1792
1793	level = (val - base) / step;
1794
1795	if (level < levels) {
1796		lquanta[level + 1] += incr;
1797		return;
1798	}
1799
1800	/*
1801	 * This is an overflow.
1802	 */
1803	lquanta[levels + 1] += incr;
1804}
1805
1806/*ARGSUSED*/
1807static void
1808dtrace_aggregate_avg(uint64_t *data, uint64_t nval, uint64_t arg)
1809{
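	/*
	 * data[0] accumulates the count and data[1] the running sum; the
	 * consumer derives the average as data[1] / data[0].
	 */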
1810	data[0]++;
1811	data[1] += nval;
1812}
1813
1814/*ARGSUSED*/
1815static void
1816dtrace_aggregate_stddev(uint64_t *data, uint64_t nval, uint64_t arg)
1817{
1818	int64_t snval = (int64_t)nval;
1819	uint64_t tmp[2];
1820
1821	data[0]++;
1822	data[1] += nval;
1823
1824	/*
1825	 * What we want to say here is:
1826	 *
1827	 * data[2] += nval * nval;
1828	 *
1829	 * But given that nval is 64-bit, we could easily overflow, so
1830	 * we do this as 128-bit arithmetic.
1831	 */
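	/*
	 * (data[2] and data[3] together accumulate the 128-bit sum of
	 * squares from which the consumer derives the standard deviation.
	 * The absolute value below is taken only because the 128-bit
	 * multiply is unsigned; squaring renders the sign irrelevant.)
	 */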
1832	if (snval < 0)
1833		snval = -snval;
1834
1835	dtrace_multiply_128((uint64_t)snval, (uint64_t)snval, tmp);
1836	dtrace_add_128(data + 2, tmp, data + 2);
1837}
1838
1839/*ARGSUSED*/
1840static void
1841dtrace_aggregate_count(uint64_t *oval, uint64_t nval, uint64_t arg)
1842{
1843	*oval = *oval + 1;
1844}
1845
1846/*ARGSUSED*/
1847static void
1848dtrace_aggregate_sum(uint64_t *oval, uint64_t nval, uint64_t arg)
1849{
1850	*oval += nval;
1851}
1852
1853/*
1854 * Aggregate given the tuple in the principal data buffer, and the aggregating
1855 * action denoted by the specified dtrace_aggregation_t.  The aggregation
1856 * buffer is specified as the buf parameter.  This routine does not return
1857 * failure; if there is no space in the aggregation buffer, the data will be
1858 * dropped, and a corresponding counter incremented.
1859 */
1860static void
1861dtrace_aggregate(dtrace_aggregation_t *agg, dtrace_buffer_t *dbuf,
1862    intptr_t offset, dtrace_buffer_t *buf, uint64_t expr, uint64_t arg)
1863{
1864	dtrace_recdesc_t *rec = &agg->dtag_action.dta_rec;
1865	uint32_t i, ndx, size, fsize;
1866	uint32_t align = sizeof (uint64_t) - 1;
1867	dtrace_aggbuffer_t *agb;
1868	dtrace_aggkey_t *key;
1869	uint32_t hashval = 0, limit, isstr;
1870	caddr_t tomax, data, kdata;
1871	dtrace_actkind_t action;
1872	dtrace_action_t *act;
1873	uintptr_t offs;
1874
1875	if (buf == NULL)
1876		return;
1877
1878	if (!agg->dtag_hasarg) {
1879		/*
1880		 * Currently, only quantize() and lquantize() take additional
1881		 * arguments, and they have the same semantics:  an increment
1882		 * value that defaults to 1 when not present.  If additional
1883		 * aggregating actions take arguments, the setting of the
1884		 * default argument value will presumably have to become more
1885		 * sophisticated...
1886		 */
1887		arg = 1;
1888	}
1889
1890	action = agg->dtag_action.dta_kind - DTRACEACT_AGGREGATION;
1891	size = rec->dtrd_offset - agg->dtag_base;
1892	fsize = size + rec->dtrd_size;
1893
1894	ASSERT(dbuf->dtb_tomax != NULL);
1895	data = dbuf->dtb_tomax + offset + agg->dtag_base;
1896
1897	if ((tomax = buf->dtb_tomax) == NULL) {
1898		dtrace_buffer_drop(buf);
1899		return;
1900	}
1901
1902	/*
1903	 * The metastructure is always at the bottom of the buffer.
1904	 */
1905	agb = (dtrace_aggbuffer_t *)(tomax + buf->dtb_size -
1906	    sizeof (dtrace_aggbuffer_t));
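	/*
	 * From low addresses to high, the buffer is thus laid out as key
	 * data (growing up from the buffer base, tracked by dtb_offset),
	 * free space, dtrace_aggkey_t structures (carved downward from
	 * dtagb_free), the hash bucket array, and finally the
	 * dtrace_aggbuffer_t itself at the very top.
	 */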
1907
1908	if (buf->dtb_offset == 0) {
1909		/*
1910		 * We just kludge up approximately 1/8th of the size to be
1911		 * buckets.  If this guess ends up being routinely
1912		 * off-the-mark, we may need to dynamically readjust this
1913		 * based on past performance.
1914		 */
1915		uintptr_t hashsize = (buf->dtb_size >> 3) / sizeof (uintptr_t);
1916
1917		if ((uintptr_t)agb - hashsize * sizeof (dtrace_aggkey_t *) <
1918		    (uintptr_t)tomax || hashsize == 0) {
1919			/*
1920			 * We've been given a ludicrously small buffer;
1921			 * increment our drop count and leave.
1922			 */
1923			dtrace_buffer_drop(buf);
1924			return;
1925		}
1926
1927		/*
1928		 * And now, a pathetic attempt to get an odd (or
1929		 * perchance, a prime) hash size for better hash distribution.
1930		 */
1931		if (hashsize > (DTRACE_AGGHASHSIZE_SLEW << 3))
1932			hashsize -= DTRACE_AGGHASHSIZE_SLEW;
1933
1934		agb->dtagb_hashsize = hashsize;
1935		agb->dtagb_hash = (dtrace_aggkey_t **)((uintptr_t)agb -
1936		    agb->dtagb_hashsize * sizeof (dtrace_aggkey_t *));
1937		agb->dtagb_free = (uintptr_t)agb->dtagb_hash;
1938
1939		for (i = 0; i < agb->dtagb_hashsize; i++)
1940			agb->dtagb_hash[i] = NULL;
1941	}
1942
1943	ASSERT(agg->dtag_first != NULL);
1944	ASSERT(agg->dtag_first->dta_intuple);
1945
1946	/*
1947	 * Calculate the hash value based on the key.  Note that we _don't_
1948	 * include the aggid in the hashing (but we will store it as part of
1949	 * the key).  The hashing algorithm is Bob Jenkins' "One-at-a-time"
1950	 * algorithm: a simple, quick algorithm that has no known funnels, and
1951	 * gets good distribution in practice.  The efficacy of the hashing
1952	 * algorithm (and a comparison with other algorithms) may be found by
1953	 * running the ::dtrace_aggstat MDB dcmd.
1954	 */
1955	for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
1956		i = act->dta_rec.dtrd_offset - agg->dtag_base;
1957		limit = i + act->dta_rec.dtrd_size;
1958		ASSERT(limit <= size);
1959		isstr = DTRACEACT_ISSTRING(act);
1960
1961		for (; i < limit; i++) {
1962			hashval += data[i];
1963			hashval += (hashval << 10);
1964			hashval ^= (hashval >> 6);
1965
1966			if (isstr && data[i] == '\0')
1967				break;
1968		}
1969	}
1970
1971	hashval += (hashval << 3);
1972	hashval ^= (hashval >> 11);
1973	hashval += (hashval << 15);
1974
1975	/*
1976	 * Yes, the divide here is expensive -- but it's generally the least
1977	 * of the performance issues given the amount of data that we iterate
1978	 * over to compute hash values, compare data, etc.
1979	 */
1980	ndx = hashval % agb->dtagb_hashsize;
1981
1982	for (key = agb->dtagb_hash[ndx]; key != NULL; key = key->dtak_next) {
1983		ASSERT((caddr_t)key >= tomax);
1984		ASSERT((caddr_t)key < tomax + buf->dtb_size);
1985
1986		if (hashval != key->dtak_hashval || key->dtak_size != size)
1987			continue;
1988
1989		kdata = key->dtak_data;
1990		ASSERT(kdata >= tomax && kdata < tomax + buf->dtb_size);
1991
1992		for (act = agg->dtag_first; act->dta_intuple;
1993		    act = act->dta_next) {
1994			i = act->dta_rec.dtrd_offset - agg->dtag_base;
1995			limit = i + act->dta_rec.dtrd_size;
1996			ASSERT(limit <= size);
1997			isstr = DTRACEACT_ISSTRING(act);
1998
1999			for (; i < limit; i++) {
2000				if (kdata[i] != data[i])
2001					goto next;
2002
2003				if (isstr && data[i] == '\0')
2004					break;
2005			}
2006		}
2007
2008		if (action != key->dtak_action) {
2009			/*
2010			 * We are aggregating on the same value in the same
2011			 * aggregation with two different aggregating actions.
2012			 * (This should have been picked up in the compiler,
2013			 * so we may be dealing with errant or devious DIF.)
2014			 * This is an error condition; we indicate as much,
2015			 * and return.
2016			 */
2017			DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
2018			return;
2019		}
2020
2021		/*
2022		 * This is a hit:  we need to apply the aggregator to
2023		 * the value at this key.
2024		 */
2025		agg->dtag_aggregate((uint64_t *)(kdata + size), expr, arg);
2026		return;
2027next:
2028		continue;
2029	}
2030
2031	/*
2032	 * We didn't find it.  We need to allocate some zero-filled space,
2033	 * link it into the hash table appropriately, and apply the aggregator
2034	 * to the (zero-filled) value.
2035	 */
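	/*
	 * Round the offset up to a uint64_t boundary.  (Offsets are assumed
	 * to be at least uint32_t-aligned here; stepping in uint32_t
	 * increments therefore suffices to terminate.)
	 */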
2036	offs = buf->dtb_offset;
2037	while (offs & (align - 1))
2038		offs += sizeof (uint32_t);
2039
2040	/*
2041	 * If we don't have enough room to both allocate a new key _and_
2042	 * its associated data, increment the drop count and return.
2043	 */
2044	if ((uintptr_t)tomax + offs + fsize >
2045	    agb->dtagb_free - sizeof (dtrace_aggkey_t)) {
2046		dtrace_buffer_drop(buf);
2047		return;
2048	}
2049
2050	/*CONSTCOND*/
2051	ASSERT(!(sizeof (dtrace_aggkey_t) & (sizeof (uintptr_t) - 1)));
2052	key = (dtrace_aggkey_t *)(agb->dtagb_free - sizeof (dtrace_aggkey_t));
2053	agb->dtagb_free -= sizeof (dtrace_aggkey_t);
2054
2055	key->dtak_data = kdata = tomax + offs;
2056	buf->dtb_offset = offs + fsize;
2057
2058	/*
2059	 * Now copy the data across.
2060	 */
2061	*((dtrace_aggid_t *)kdata) = agg->dtag_id;
2062
2063	for (i = sizeof (dtrace_aggid_t); i < size; i++)
2064		kdata[i] = data[i];
2065
2066	/*
2067	 * Because strings are not zeroed out by default, we need to iterate
2068	 * looking for actions that store strings, and we need to explicitly
2069	 * pad these strings out with zeroes.
2070	 */
2071	for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2072		int nul;
2073
2074		if (!DTRACEACT_ISSTRING(act))
2075			continue;
2076
2077		i = act->dta_rec.dtrd_offset - agg->dtag_base;
2078		limit = i + act->dta_rec.dtrd_size;
2079		ASSERT(limit <= size);
2080
2081		for (nul = 0; i < limit; i++) {
2082			if (nul) {
2083				kdata[i] = '\0';
2084				continue;
2085			}
2086
2087			if (data[i] != '\0')
2088				continue;
2089
2090			nul = 1;
2091		}
2092	}
2093
2094	for (i = size; i < fsize; i++)
2095		kdata[i] = 0;
2096
2097	key->dtak_hashval = hashval;
2098	key->dtak_size = size;
2099	key->dtak_action = action;
2100	key->dtak_next = agb->dtagb_hash[ndx];
2101	agb->dtagb_hash[ndx] = key;
2102
2103	/*
2104	 * Finally, apply the aggregator.
2105	 */
2106	*((uint64_t *)(key->dtak_data + size)) = agg->dtag_initial;
2107	agg->dtag_aggregate((uint64_t *)(key->dtak_data + size), expr, arg);
2108}
2109
2110/*
2111 * Given consumer state, this routine finds a speculation in the INACTIVE
2112 * state and transitions it into the ACTIVE state.  If there is no speculation
2113 * in the INACTIVE state, 0 is returned; the "busy" or "unavail" counter
2114 * is bumped, as appropriate, and it is up to the caller to take action.
2115 */
2116static int
2117dtrace_speculation(dtrace_state_t *state)
2118{
2119	int i = 0;
2120	dtrace_speculation_state_t current;
2121	uint32_t *stat = &state->dts_speculations_unavail, count;
2122
2123	while (i < state->dts_nspeculations) {
2124		dtrace_speculation_t *spec = &state->dts_speculations[i];
2125
2126		current = spec->dtsp_state;
2127
2128		if (current != DTRACESPEC_INACTIVE) {
2129			if (current == DTRACESPEC_COMMITTINGMANY ||
2130			    current == DTRACESPEC_COMMITTING ||
2131			    current == DTRACESPEC_DISCARDING)
2132				stat = &state->dts_speculations_busy;
2133			i++;
2134			continue;
2135		}
2136
2137		if (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2138		    current, DTRACESPEC_ACTIVE) == current)
2139			return (i + 1);
2140	}
2141
2142	/*
2143	 * We couldn't find a speculation.  If we found as much as a single
2144 * busy speculation buffer, we'll attribute this failure to "busy"
2145	 * instead of "unavail".
2146	 */
2147	do {
2148		count = *stat;
2149	} while (dtrace_cas32(stat, count, count + 1) != count);
2150
2151	return (0);
2152}
2153
2154/*
2155 * This routine commits an active speculation.  If the specified speculation
2156 * is not in a valid state to perform a commit(), this routine will silently do
2157 * nothing.  The state of the specified speculation is transitioned according
2158 * to the state transition diagram outlined in <sys/dtrace_impl.h>.
2159 */
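/*
 * In outline (the diagram in <sys/dtrace_impl.h> remains authoritative):  an
 * ACTIVE or same-CPU ACTIVEONE commit transitions to COMMITTING and then
 * directly back to INACTIVE below; a cross-CPU ACTIVEONE or an ACTIVEMANY
 * commit transitions to COMMITTINGMANY and awaits the asynchronous cleaner.
 */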
2160static void
2161dtrace_speculation_commit(dtrace_state_t *state, processorid_t cpu,
2162    dtrace_specid_t which)
2163{
2164	dtrace_speculation_t *spec;
2165	dtrace_buffer_t *src, *dest;
2166	uintptr_t daddr, saddr, dlimit;
2167	dtrace_speculation_state_t current, new;
2168	intptr_t offs;
2169
2170	if (which == 0)
2171		return;
2172
2173	if (which > state->dts_nspeculations) {
2174		cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2175		return;
2176	}
2177
2178	spec = &state->dts_speculations[which - 1];
2179	src = &spec->dtsp_buffer[cpu];
2180	dest = &state->dts_buffer[cpu];
2181
2182	do {
2183		current = spec->dtsp_state;
2184
2185		if (current == DTRACESPEC_COMMITTINGMANY)
2186			break;
2187
2188		switch (current) {
2189		case DTRACESPEC_INACTIVE:
2190		case DTRACESPEC_DISCARDING:
2191			return;
2192
2193		case DTRACESPEC_COMMITTING:
2194			/*
2195			 * This is only possible if we are (a) commit()'ing
2196			 * without having done a prior speculate() on this CPU
2197			 * and (b) racing with another commit() on a different
2198			 * CPU.  There's nothing to do -- we just assert that
2199			 * our offset is 0.
2200			 */
2201			ASSERT(src->dtb_offset == 0);
2202			return;
2203
2204		case DTRACESPEC_ACTIVE:
2205			new = DTRACESPEC_COMMITTING;
2206			break;
2207
2208		case DTRACESPEC_ACTIVEONE:
2209			/*
2210			 * This speculation is active on one CPU.  If our
2211			 * buffer offset is non-zero, we know that the one CPU
2212			 * must be us.  Otherwise, we are committing on a
2213			 * different CPU from the speculate(), and we must
2214			 * rely on being asynchronously cleaned.
2215			 */
2216			if (src->dtb_offset != 0) {
2217				new = DTRACESPEC_COMMITTING;
2218				break;
2219			}
2220			/*FALLTHROUGH*/
2221
2222		case DTRACESPEC_ACTIVEMANY:
2223			new = DTRACESPEC_COMMITTINGMANY;
2224			break;
2225
2226		default:
2227			ASSERT(0);
2228		}
2229	} while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2230	    current, new) != current);
2231
2232	/*
2233	 * We have set the state to indicate that we are committing this
2234	 * speculation.  Now reserve the necessary space in the destination
2235	 * buffer.
2236	 */
2237	if ((offs = dtrace_buffer_reserve(dest, src->dtb_offset,
2238	    sizeof (uint64_t), state, NULL)) < 0) {
2239		dtrace_buffer_drop(dest);
2240		goto out;
2241	}
2242
2243	/*
2244	 * We have the space; copy the buffer across.  (Note that this is a
2245	 * highly suboptimal bcopy(); in the unlikely event that this becomes
2246	 * a serious performance issue, a high-performance DTrace-specific
2247	 * bcopy() should obviously be invented.)
2248	 */
2249	daddr = (uintptr_t)dest->dtb_tomax + offs;
2250	dlimit = daddr + src->dtb_offset;
2251	saddr = (uintptr_t)src->dtb_tomax;
2252
2253	/*
2254	 * First, the aligned portion.
2255	 */
2256	while (dlimit - daddr >= sizeof (uint64_t)) {
2257		*((uint64_t *)daddr) = *((uint64_t *)saddr);
2258
2259		daddr += sizeof (uint64_t);
2260		saddr += sizeof (uint64_t);
2261	}
2262
2263	/*
2264	 * Now any left-over bit...
2265	 */
2266	while (dlimit - daddr)
2267		*((uint8_t *)daddr++) = *((uint8_t *)saddr++);
2268
2269	/*
2270	 * Finally, commit the reserved space in the destination buffer.
2271	 */
2272	dest->dtb_offset = offs + src->dtb_offset;
2273
2274out:
2275	/*
2276	 * If we're lucky enough to be the only active CPU on this speculation
2277	 * buffer, we can just set the state back to DTRACESPEC_INACTIVE.
2278	 */
2279	if (current == DTRACESPEC_ACTIVE ||
2280	    (current == DTRACESPEC_ACTIVEONE && new == DTRACESPEC_COMMITTING)) {
2281		uint32_t rval = dtrace_cas32((uint32_t *)&spec->dtsp_state,
2282		    DTRACESPEC_COMMITTING, DTRACESPEC_INACTIVE);
2283
2284		ASSERT(rval == DTRACESPEC_COMMITTING);
2285	}
2286
2287	src->dtb_offset = 0;
2288	src->dtb_xamot_drops += src->dtb_drops;
2289	src->dtb_drops = 0;
2290}
2291
2292/*
2293 * This routine discards an active speculation.  If the specified speculation
2294 * is not in a valid state to perform a discard(), this routine will silently
2295 * do nothing.  The state of the specified speculation is transitioned
2296 * according to the state transition diagram outlined in <sys/dtrace_impl.h>.
2297 */
2298static void
2299dtrace_speculation_discard(dtrace_state_t *state, processorid_t cpu,
2300    dtrace_specid_t which)
2301{
2302	dtrace_speculation_t *spec;
2303	dtrace_speculation_state_t current, new;
2304	dtrace_buffer_t *buf;
2305
2306	if (which == 0)
2307		return;
2308
2309	if (which > state->dts_nspeculations) {
2310		cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2311		return;
2312	}
2313
2314	spec = &state->dts_speculations[which - 1];
2315	buf = &spec->dtsp_buffer[cpu];
2316
2317	do {
2318		current = spec->dtsp_state;
2319
2320		switch (current) {
2321		case DTRACESPEC_INACTIVE:
2322		case DTRACESPEC_COMMITTINGMANY:
2323		case DTRACESPEC_COMMITTING:
2324		case DTRACESPEC_DISCARDING:
2325			return;
2326
2327		case DTRACESPEC_ACTIVE:
2328		case DTRACESPEC_ACTIVEMANY:
2329			new = DTRACESPEC_DISCARDING;
2330			break;
2331
2332		case DTRACESPEC_ACTIVEONE:
2333			if (buf->dtb_offset != 0) {
2334				new = DTRACESPEC_INACTIVE;
2335			} else {
2336				new = DTRACESPEC_DISCARDING;
2337			}
2338			break;
2339
2340		default:
2341			ASSERT(0);
2342		}
2343	} while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2344	    current, new) != current);
2345
2346	buf->dtb_offset = 0;
2347	buf->dtb_drops = 0;
2348}
2349
2350/*
2351 * Note:  not called from probe context.  This function is called
2352 * asynchronously from cross call context to clean any speculations that are
2353 * in the COMMITTINGMANY or DISCARDING states.  These speculations may not be
2354 * transitioned back to the INACTIVE state until all CPUs have cleaned the
2355 * speculation.
2356 */
2357static void
2358dtrace_speculation_clean_here(dtrace_state_t *state)
2359{
2360	dtrace_icookie_t cookie;
2361	processorid_t cpu = CPU->cpu_id;
2362	dtrace_buffer_t *dest = &state->dts_buffer[cpu];
2363	dtrace_specid_t i;
2364
2365	cookie = dtrace_interrupt_disable();
2366
2367	if (dest->dtb_tomax == NULL) {
2368		dtrace_interrupt_enable(cookie);
2369		return;
2370	}
2371
2372	for (i = 0; i < state->dts_nspeculations; i++) {
2373		dtrace_speculation_t *spec = &state->dts_speculations[i];
2374		dtrace_buffer_t *src = &spec->dtsp_buffer[cpu];
2375
2376		if (src->dtb_tomax == NULL)
2377			continue;
2378
2379		if (spec->dtsp_state == DTRACESPEC_DISCARDING) {
2380			src->dtb_offset = 0;
2381			continue;
2382		}
2383
2384		if (spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
2385			continue;
2386
2387		if (src->dtb_offset == 0)
2388			continue;
2389
2390		dtrace_speculation_commit(state, cpu, i + 1);
2391	}
2392
2393	dtrace_interrupt_enable(cookie);
2394}
2395
2396/*
2397 * Note:  not called from probe context.  This function is called
2398 * asynchronously (and at a regular interval) to clean any speculations that
2399 * are in the COMMITTINGMANY or DISCARDING states.  If it discovers that there
2400 * is work to be done, it cross calls all CPUs to perform that work;
2401 * COMMITTINGMANY and DISCARDING speculations may not be transitioned back to
2402 * INACTIVE state until they have been cleaned by all CPUs.
2403 */
2404static void
2405dtrace_speculation_clean(dtrace_state_t *state)
2406{
2407	int work = 0, rv;
2408	dtrace_specid_t i;
2409
2410	for (i = 0; i < state->dts_nspeculations; i++) {
2411		dtrace_speculation_t *spec = &state->dts_speculations[i];
2412
2413		ASSERT(!spec->dtsp_cleaning);
2414
2415		if (spec->dtsp_state != DTRACESPEC_DISCARDING &&
2416		    spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
2417			continue;
2418
2419		work++;
2420		spec->dtsp_cleaning = 1;
2421	}
2422
2423	if (!work)
2424		return;
2425
2426	dtrace_xcall(DTRACE_CPUALL,
2427	    (dtrace_xcall_t)dtrace_speculation_clean_here, state);
2428
2429	/*
2430	 * We now know that all CPUs have committed or discarded their
2431	 * speculation buffers, as appropriate.  We can now set the state
2432	 * to inactive.
2433	 */
2434	for (i = 0; i < state->dts_nspeculations; i++) {
2435		dtrace_speculation_t *spec = &state->dts_speculations[i];
2436		dtrace_speculation_state_t current, new;
2437
2438		if (!spec->dtsp_cleaning)
2439			continue;
2440
2441		current = spec->dtsp_state;
2442		ASSERT(current == DTRACESPEC_DISCARDING ||
2443		    current == DTRACESPEC_COMMITTINGMANY);
2444
2445		new = DTRACESPEC_INACTIVE;
2446
2447		rv = dtrace_cas32((uint32_t *)&spec->dtsp_state, current, new);
2448		ASSERT(rv == current);
2449		spec->dtsp_cleaning = 0;
2450	}
2451}
2452
2453/*
2454 * Called as part of a speculate() to get the speculative buffer associated
2455 * with a given speculation.  Returns NULL if the specified speculation is not
2456 * in an ACTIVE state.  If the speculation is in the ACTIVEONE state -- and
2457 * the active CPU is not the specified CPU -- the speculation will be
2458 * atomically transitioned into the ACTIVEMANY state.
2459 */
2460static dtrace_buffer_t *
2461dtrace_speculation_buffer(dtrace_state_t *state, processorid_t cpuid,
2462    dtrace_specid_t which)
2463{
2464	dtrace_speculation_t *spec;
2465	dtrace_speculation_state_t current, new;
2466	dtrace_buffer_t *buf;
2467
2468	if (which == 0)
2469		return (NULL);
2470
2471	if (which > state->dts_nspeculations) {
2472		cpu_core[cpuid].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2473		return (NULL);
2474	}
2475
2476	spec = &state->dts_speculations[which - 1];
2477	buf = &spec->dtsp_buffer[cpuid];
2478
2479	do {
2480		current = spec->dtsp_state;
2481
2482		switch (current) {
2483		case DTRACESPEC_INACTIVE:
2484		case DTRACESPEC_COMMITTINGMANY:
2485		case DTRACESPEC_DISCARDING:
2486			return (NULL);
2487
2488		case DTRACESPEC_COMMITTING:
2489			ASSERT(buf->dtb_offset == 0);
2490			return (NULL);
2491
2492		case DTRACESPEC_ACTIVEONE:
2493			/*
2494			 * This speculation is currently active on one CPU.
2495			 * Check the offset in the buffer; if it's non-zero,
2496			 * that CPU must be us (and we leave the state alone).
2497			 * If it's zero, assume that we're starting on a new
2498			 * CPU -- and change the state to indicate that the
2499			 * speculation is active on more than one CPU.
2500			 */
2501			if (buf->dtb_offset != 0)
2502				return (buf);
2503
2504			new = DTRACESPEC_ACTIVEMANY;
2505			break;
2506
2507		case DTRACESPEC_ACTIVEMANY:
2508			return (buf);
2509
2510		case DTRACESPEC_ACTIVE:
2511			new = DTRACESPEC_ACTIVEONE;
2512			break;
2513
2514		default:
2515			ASSERT(0);
2516		}
2517	} while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2518	    current, new) != current);
2519
2520	ASSERT(new == DTRACESPEC_ACTIVEONE || new == DTRACESPEC_ACTIVEMANY);
2521	return (buf);
2522}
2523
2524/*
2525 * Return a string.  In the event that the user lacks the privilege to access
2526 * arbitrary kernel memory, we copy the string out to scratch memory so that we
2527 * don't fail access checking.
2528 *
2529 * dtrace_dif_variable() uses this routine as a helper for various
2530 * builtin values such as 'execname' and 'probefunc.'
2531 */
2532uintptr_t
2533dtrace_dif_varstr(uintptr_t addr, dtrace_state_t *state,
2534    dtrace_mstate_t *mstate)
2535{
2536	uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
2537	uintptr_t ret;
2538	size_t strsz;
2539
2540	/*
2541	 * The easy case: this probe is allowed to read all of memory, so
2542	 * we can just return this as a vanilla pointer.
2543	 */
2544	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
2545		return (addr);
2546
2547	/*
2548	 * This is the tougher case: we copy the string in question from
2549	 * kernel memory into scratch memory and return it that way: this
2550	 * ensures that we won't trip up when access checking tests the
2551	 * BYREF return value.
2552	 */
2553	strsz = dtrace_strlen((char *)addr, size) + 1;
2554
2555	if (mstate->dtms_scratch_ptr + strsz >
2556	    mstate->dtms_scratch_base + mstate->dtms_scratch_size) {
2557		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
2558		return (NULL);
2559	}
2560
2561	dtrace_strcpy((const void *)addr, (void *)mstate->dtms_scratch_ptr,
2562	    strsz);
2563	ret = mstate->dtms_scratch_ptr;
2564	mstate->dtms_scratch_ptr += strsz;
2565	return (ret);
2566}
2567
2568/*
2569 * This function implements the DIF emulator's variable lookups.  The emulator
2570 * passes a reserved variable identifier and optional built-in array index.
2571 */
2572static uint64_t
2573dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v,
2574    uint64_t ndx)
2575{
2576	/*
2577	 * If we're accessing one of the uncached arguments, we'll turn this
2578	 * into a reference in the args array.
2579	 */
2580	if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9) {
2581		ndx = v - DIF_VAR_ARG0;
2582		v = DIF_VAR_ARGS;
2583	}
2584
2585	switch (v) {
2586	case DIF_VAR_ARGS:
2587		ASSERT(mstate->dtms_present & DTRACE_MSTATE_ARGS);
2588		if (ndx >= sizeof (mstate->dtms_arg) /
2589		    sizeof (mstate->dtms_arg[0])) {
2590			int aframes = mstate->dtms_probe->dtpr_aframes + 2;
2591			dtrace_provider_t *pv;
2592			uint64_t val;
2593
2594			pv = mstate->dtms_probe->dtpr_provider;
2595			if (pv->dtpv_pops.dtps_getargval != NULL)
2596				val = pv->dtpv_pops.dtps_getargval(pv->dtpv_arg,
2597				    mstate->dtms_probe->dtpr_id,
2598				    mstate->dtms_probe->dtpr_arg, ndx, aframes);
2599			else
2600				val = dtrace_getarg(ndx, aframes);
2601
2602			/*
2603			 * This is regrettably required to keep the compiler
2604			 * from tail-optimizing the call to dtrace_getarg().
2605			 * The condition always evaluates to true, but the
2606			 * compiler has no way of figuring that out a priori.
2607			 * (None of this would be necessary if the compiler
2608			 * could be relied upon to _always_ tail-optimize
2609			 * the call to dtrace_getarg() -- but it can't.)
2610			 */
2611			if (mstate->dtms_probe != NULL)
2612				return (val);
2613
2614			ASSERT(0);
2615		}
2616
2617		return (mstate->dtms_arg[ndx]);
2618
2619	case DIF_VAR_UREGS: {
2620		klwp_t *lwp;
2621
2622		if (!dtrace_priv_proc(state))
2623			return (0);
2624
2625		if ((lwp = curthread->t_lwp) == NULL) {
2626			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
2627			cpu_core[CPU->cpu_id].cpuc_dtrace_illval = NULL;
2628			return (0);
2629		}
2630
2631		return (dtrace_getreg(lwp->lwp_regs, ndx));
2632	}
2633
2634	case DIF_VAR_CURTHREAD:
2635		if (!dtrace_priv_kernel(state))
2636			return (0);
2637		return ((uint64_t)(uintptr_t)curthread);
2638
2639	case DIF_VAR_TIMESTAMP:
2640		if (!(mstate->dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
2641			mstate->dtms_timestamp = dtrace_gethrtime();
2642			mstate->dtms_present |= DTRACE_MSTATE_TIMESTAMP;
2643		}
2644		return (mstate->dtms_timestamp);
2645
2646	case DIF_VAR_VTIMESTAMP:
2647		ASSERT(dtrace_vtime_references != 0);
2648		return (curthread->t_dtrace_vtime);
2649
2650	case DIF_VAR_WALLTIMESTAMP:
2651		if (!(mstate->dtms_present & DTRACE_MSTATE_WALLTIMESTAMP)) {
2652			mstate->dtms_walltimestamp = dtrace_gethrestime();
2653			mstate->dtms_present |= DTRACE_MSTATE_WALLTIMESTAMP;
2654		}
2655		return (mstate->dtms_walltimestamp);
2656
2657	case DIF_VAR_IPL:
2658		if (!dtrace_priv_kernel(state))
2659			return (0);
2660		if (!(mstate->dtms_present & DTRACE_MSTATE_IPL)) {
2661			mstate->dtms_ipl = dtrace_getipl();
2662			mstate->dtms_present |= DTRACE_MSTATE_IPL;
2663		}
2664		return (mstate->dtms_ipl);
2665
2666	case DIF_VAR_EPID:
2667		ASSERT(mstate->dtms_present & DTRACE_MSTATE_EPID);
2668		return (mstate->dtms_epid);
2669
2670	case DIF_VAR_ID:
2671		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
2672		return (mstate->dtms_probe->dtpr_id);
2673
2674	case DIF_VAR_STACKDEPTH:
2675		if (!dtrace_priv_kernel(state))
2676			return (0);
2677		if (!(mstate->dtms_present & DTRACE_MSTATE_STACKDEPTH)) {
2678			int aframes = mstate->dtms_probe->dtpr_aframes + 2;
2679
2680			mstate->dtms_stackdepth = dtrace_getstackdepth(aframes);
2681			mstate->dtms_present |= DTRACE_MSTATE_STACKDEPTH;
2682		}
2683		return (mstate->dtms_stackdepth);
2684
2685	case DIF_VAR_USTACKDEPTH:
2686		if (!dtrace_priv_proc(state))
2687			return (0);
2688		if (!(mstate->dtms_present & DTRACE_MSTATE_USTACKDEPTH)) {
2689			/*
2690			 * See comment in DIF_VAR_PID.
2691			 */
2692			if (DTRACE_ANCHORED(mstate->dtms_probe) &&
2693			    CPU_ON_INTR(CPU)) {
2694				mstate->dtms_ustackdepth = 0;
2695			} else {
2696				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
2697				mstate->dtms_ustackdepth =
2698				    dtrace_getustackdepth();
2699				DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
2700			}
2701			mstate->dtms_present |= DTRACE_MSTATE_USTACKDEPTH;
2702		}
2703		return (mstate->dtms_ustackdepth);
2704
2705	case DIF_VAR_CALLER:
2706		if (!dtrace_priv_kernel(state))
2707			return (0);
2708		if (!(mstate->dtms_present & DTRACE_MSTATE_CALLER)) {
2709			int aframes = mstate->dtms_probe->dtpr_aframes + 2;
2710
2711			if (!DTRACE_ANCHORED(mstate->dtms_probe)) {
2712				/*
2713				 * If this is an unanchored probe, we are
2714				 * required to go through the slow path:
2715				 * dtrace_caller() only guarantees correct
2716				 * results for anchored probes.
2717				 */
2718				pc_t caller[2];
2719
2720				dtrace_getpcstack(caller, 2, aframes,
2721				    (uint32_t *)(uintptr_t)mstate->dtms_arg[0]);
2722				mstate->dtms_caller = caller[1];
2723			} else if ((mstate->dtms_caller =
2724			    dtrace_caller(aframes)) == -1) {
2725				/*
2726				 * We have failed to do this the quick way;
2727				 * we must resort to the slower approach of
2728				 * calling dtrace_getpcstack().
2729				 */
2730				pc_t caller;
2731
2732				dtrace_getpcstack(&caller, 1, aframes, NULL);
2733				mstate->dtms_caller = caller;
2734			}
2735
2736			mstate->dtms_present |= DTRACE_MSTATE_CALLER;
2737		}
2738		return (mstate->dtms_caller);
2739
2740	case DIF_VAR_UCALLER:
2741		if (!dtrace_priv_proc(state))
2742			return (0);
2743
2744		if (!(mstate->dtms_present & DTRACE_MSTATE_UCALLER)) {
2745			uint64_t ustack[3];
2746
2747			/*
2748			 * dtrace_getupcstack() fills in the first uint64_t
2749			 * with the current PID.  The second uint64_t will
2750			 * be the program counter at user-level.  The third
2751			 * uint64_t will contain the caller, which is what
2752			 * we're after.
2753			 */
2754			ustack[2] = NULL;
2755			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
2756			dtrace_getupcstack(ustack, 3);
2757			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
2758			mstate->dtms_ucaller = ustack[2];
2759			mstate->dtms_present |= DTRACE_MSTATE_UCALLER;
2760		}
2761
2762		return (mstate->dtms_ucaller);
2763
2764	case DIF_VAR_PROBEPROV:
2765		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
2766		return (dtrace_dif_varstr(
2767		    (uintptr_t)mstate->dtms_probe->dtpr_provider->dtpv_name,
2768		    state, mstate));
2769
2770	case DIF_VAR_PROBEMOD:
2771		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
2772		return (dtrace_dif_varstr(
2773		    (uintptr_t)mstate->dtms_probe->dtpr_mod,
2774		    state, mstate));
2775
2776	case DIF_VAR_PROBEFUNC:
2777		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
2778		return (dtrace_dif_varstr(
2779		    (uintptr_t)mstate->dtms_probe->dtpr_func,
2780		    state, mstate));
2781
2782	case DIF_VAR_PROBENAME:
2783		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
2784		return (dtrace_dif_varstr(
2785		    (uintptr_t)mstate->dtms_probe->dtpr_name,
2786		    state, mstate));
2787
2788	case DIF_VAR_PID:
2789		if (!dtrace_priv_proc(state))
2790			return (0);
2791
2792		/*
2793		 * Note that we are assuming that an unanchored probe is
2794		 * always due to a high-level interrupt.  (And we're assuming
2795		 * that there is only a single high-level interrupt.)
2796		 */
2797		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
2798			return (pid0.pid_id);
2799
2800		/*
2801		 * It is always safe to dereference one's own t_procp pointer:
2802		 * it always points to a valid, allocated proc structure.
2803		 * Further, it is always safe to dereference the p_pidp member
2804		 * of one's own proc structure.  (These are truisms because
2805		 * threads and processes don't clean up their own state --
2806		 * they leave that task to whoever reaps them.)
2807		 */
2808		return ((uint64_t)curthread->t_procp->p_pidp->pid_id);
2809
2810	case DIF_VAR_PPID:
2811		if (!dtrace_priv_proc(state))
2812			return (0);
2813
2814		/*
2815		 * See comment in DIF_VAR_PID.
2816		 */
2817		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
2818			return (pid0.pid_id);
2819
2820		/*
2821		 * It is always safe to dereference one's own t_procp pointer:
2822		 * it always points to a valid, allocated proc structure.
2823		 * (This is true because threads don't clean up their own
2824		 * state -- they leave that task to whoever reaps them.)
2825		 */
2826		return ((uint64_t)curthread->t_procp->p_ppid);
2827
2828	case DIF_VAR_TID:
2829		/*
2830		 * See comment in DIF_VAR_PID.
2831		 */
2832		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
2833			return (0);
2834
2835		return ((uint64_t)curthread->t_tid);
2836
2837	case DIF_VAR_EXECNAME:
2838		if (!dtrace_priv_proc(state))
2839			return (0);
2840
2841		/*
2842		 * See comment in DIF_VAR_PID.
2843		 */
2844		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
2845			return ((uint64_t)(uintptr_t)p0.p_user.u_comm);
2846
2847		/*
2848		 * It is always safe to dereference one's own t_procp pointer:
2849		 * it always points to a valid, allocated proc structure.
2850		 * (This is true because threads don't clean up their own
2851		 * state -- they leave that task to whoever reaps them.)
2852		 */
2853		return (dtrace_dif_varstr(
2854		    (uintptr_t)curthread->t_procp->p_user.u_comm,
2855		    state, mstate));
2856
2857	case DIF_VAR_ZONENAME:
2858		if (!dtrace_priv_proc(state))
2859			return (0);
2860
2861		/*
2862		 * See comment in DIF_VAR_PID.
2863		 */
2864		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
2865			return ((uint64_t)(uintptr_t)p0.p_zone->zone_name);
2866
2867		/*
2868		 * It is always safe to dereference one's own t_procp pointer:
2869		 * it always points to a valid, allocated proc structure.
2870		 * (This is true because threads don't clean up their own
2871		 * state -- they leave that task to whoever reaps them.)
2872		 */
2873		return (dtrace_dif_varstr(
2874		    (uintptr_t)curthread->t_procp->p_zone->zone_name,
2875		    state, mstate));
2876
2877	case DIF_VAR_UID:
2878		if (!dtrace_priv_proc(state))
2879			return (0);
2880
2881		/*
2882		 * See comment in DIF_VAR_PID.
2883		 */
2884		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
2885			return ((uint64_t)p0.p_cred->cr_uid);
2886
2887		/*
2888		 * It is always safe to dereference one's own t_procp pointer:
2889		 * it always points to a valid, allocated proc structure.
2890		 * (This is true because threads don't clean up their own
2891		 * state -- they leave that task to whoever reaps them.)
2892		 *
2893		 * Additionally, it is safe to dereference one's own process
2894		 * credential, since this is never NULL after process birth.
2895		 */
2896		return ((uint64_t)curthread->t_procp->p_cred->cr_uid);
2897
2898	case DIF_VAR_GID:
2899		if (!dtrace_priv_proc(state))
2900			return (0);
2901
2902		/*
2903		 * See comment in DIF_VAR_PID.
2904		 */
2905		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
2906			return ((uint64_t)p0.p_cred->cr_gid);
2907
2908		/*
2909		 * It is always safe to dereference one's own t_procp pointer:
2910		 * it always points to a valid, allocated proc structure.
2911		 * (This is true because threads don't clean up their own
2912		 * state -- they leave that task to whoever reaps them.)
2913		 *
2914		 * Additionally, it is safe to dereference one's own process
2915		 * credential, since this is never NULL after process birth.
2916		 */
2917		return ((uint64_t)curthread->t_procp->p_cred->cr_gid);
2918
2919	case DIF_VAR_ERRNO: {
2920		klwp_t *lwp;
2921		if (!dtrace_priv_proc(state))
2922			return (0);
2923
2924		/*
2925		 * See comment in DIF_VAR_PID.
2926		 */
2927		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
2928			return (0);
2929
2930		/*
2931		 * It is always safe to dereference one's own t_lwp pointer in
2932		 * the event that this pointer is non-NULL.  (This is true
2933		 * because threads and lwps don't clean up their own state --
2934		 * they leave that task to whoever reaps them.)
2935		 */
2936		if ((lwp = curthread->t_lwp) == NULL)
2937			return (0);
2938
2939		return ((uint64_t)lwp->lwp_errno);
2940	}
2941	default:
2942		DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
2943		return (0);
2944	}
2945}
2946
2947/*
2948 * Emulate the execution of DTrace ID subroutines invoked by the call opcode.
2949 * Notice that we don't bother validating the proper number of arguments or
2950 * their types in the tuple stack.  This isn't needed:  our load safety
2951 * makes all argument interpretation safe, so the worst that can happen
2952 * is that a bogus program obtains bogus results.
2953 */
2954static void
2955dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs,
2956    dtrace_key_t *tupregs, int nargs,
2957    dtrace_mstate_t *mstate, dtrace_state_t *state)
2958{
2959	volatile uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
2960	volatile uintptr_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
2961	dtrace_vstate_t *vstate = &state->dts_vstate;
2962
2963	union {
2964		mutex_impl_t mi;
2965		uint64_t mx;
2966	} m;
2967
2968	union {
2969		krwlock_t ri;
2970		uintptr_t rw;
2971	} r;
2972
2973	switch (subr) {
2974	case DIF_SUBR_RAND:
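		/*
		 * This is a simple linear-congruential generator seeded from
		 * the high-resolution timer; it provides only the weak
		 * pseudo-randomness that rand() promises, and nothing more.
		 */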
2975		regs[rd] = (dtrace_gethrtime() * 2416 + 374441) % 1771875;
2976		break;
2977
2978	case DIF_SUBR_MUTEX_OWNED:
2979		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
2980		    mstate, vstate)) {
2981			regs[rd] = NULL;
2982			break;
2983		}
2984
2985		m.mx = dtrace_load64(tupregs[0].dttk_value);
2986		if (MUTEX_TYPE_ADAPTIVE(&m.mi))
2987			regs[rd] = MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER;
2988		else
2989			regs[rd] = LOCK_HELD(&m.mi.m_spin.m_spinlock);
2990		break;
2991
2992	case DIF_SUBR_MUTEX_OWNER:
2993		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
2994		    mstate, vstate)) {
2995			regs[rd] = NULL;
2996			break;
2997		}
2998
2999		m.mx = dtrace_load64(tupregs[0].dttk_value);
3000		if (MUTEX_TYPE_ADAPTIVE(&m.mi) &&
3001		    MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER)
3002			regs[rd] = (uintptr_t)MUTEX_OWNER(&m.mi);
3003		else
3004			regs[rd] = 0;
3005		break;
3006
3007	case DIF_SUBR_MUTEX_TYPE_ADAPTIVE:
3008		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
3009		    mstate, vstate)) {
3010			regs[rd] = NULL;
3011			break;
3012		}
3013
3014		m.mx = dtrace_load64(tupregs[0].dttk_value);
3015		regs[rd] = MUTEX_TYPE_ADAPTIVE(&m.mi);
3016		break;
3017
3018	case DIF_SUBR_MUTEX_TYPE_SPIN:
3019		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
3020		    mstate, vstate)) {
3021			regs[rd] = NULL;
3022			break;
3023		}
3024
3025		m.mx = dtrace_load64(tupregs[0].dttk_value);
3026		regs[rd] = MUTEX_TYPE_SPIN(&m.mi);
3027		break;
3028
3029	case DIF_SUBR_RW_READ_HELD: {
3030		uintptr_t tmp;
3031
3032		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
3033		    mstate, vstate)) {
3034			regs[rd] = NULL;
3035			break;
3036		}
3037
3038		r.rw = dtrace_loadptr(tupregs[0].dttk_value);
3039		regs[rd] = _RW_READ_HELD(&r.ri, tmp);
3040		break;
3041	}
3042
3043	case DIF_SUBR_RW_WRITE_HELD:
3044		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
3045		    mstate, vstate)) {
3046			regs[rd] = NULL;
3047			break;
3048		}
3049
3050		r.rw = dtrace_loadptr(tupregs[0].dttk_value);
3051		regs[rd] = _RW_WRITE_HELD(&r.ri);
3052		break;
3053
3054	case DIF_SUBR_RW_ISWRITER:
3055		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
3056		    mstate, vstate)) {
3057			regs[rd] = NULL;
3058			break;
3059		}
3060
3061		r.rw = dtrace_loadptr(tupregs[0].dttk_value);
3062		regs[rd] = _RW_ISWRITER(&r.ri);
3063		break;
3064
3065	case DIF_SUBR_BCOPY: {
3066		/*
3067		 * We need to be sure that the destination is in the scratch
3068		 * region -- no other region is allowed.
3069		 */
3070		uintptr_t src = tupregs[0].dttk_value;
3071		uintptr_t dest = tupregs[1].dttk_value;
3072		size_t size = tupregs[2].dttk_value;
3073
3074		if (!dtrace_inscratch(dest, size, mstate)) {
3075			*flags |= CPU_DTRACE_BADADDR;
3076			*illval = regs[rd];
3077			break;
3078		}
3079
3080		if (!dtrace_canload(src, size, mstate, vstate)) {
3081			regs[rd] = NULL;
3082			break;
3083		}
3084
3085		dtrace_bcopy((void *)src, (void *)dest, size);
3086		break;
3087	}
3088
3089	case DIF_SUBR_ALLOCA:
3090	case DIF_SUBR_COPYIN: {
3091		uintptr_t dest = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
3092		uint64_t size =
3093		    tupregs[subr == DIF_SUBR_ALLOCA ? 0 : 1].dttk_value;
3094		size_t scratch_size = (dest - mstate->dtms_scratch_ptr) + size;
3095
3096		/*
3097		 * This action doesn't require any credential checks since
3098		 * probes will not activate in user contexts to which the
3099		 * enabling user does not have permissions.
3100		 */
3101
3102		/*
3103		 * Rounding up a large, bogus allocation size (like -1ULL)
3104		 * could have wrapped scratch_size around to a small value.
3105		 */
3106		if (scratch_size < size ||
3107		    !DTRACE_INSCRATCH(mstate, scratch_size)) {
3108			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3109			regs[rd] = NULL;
3110			break;
3111		}
3112
3113		if (subr == DIF_SUBR_COPYIN) {
3114			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3115			dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
3116			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3117		}
3118
3119		mstate->dtms_scratch_ptr += scratch_size;
3120		regs[rd] = dest;
3121		break;
3122	}
3123
3124	case DIF_SUBR_COPYINTO: {
3125		uint64_t size = tupregs[1].dttk_value;
3126		uintptr_t dest = tupregs[2].dttk_value;
3127
3128		/*
3129		 * This action doesn't require any credential checks since
3130		 * probes will not activate in user contexts to which the
3131		 * enabling user does not have permissions.
3132		 */
3133		if (!dtrace_inscratch(dest, size, mstate)) {
3134			*flags |= CPU_DTRACE_BADADDR;
3135			*illval = regs[rd];
3136			break;
3137		}
3138
3139		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3140		dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
3141		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3142		break;
3143	}
3144
3145	case DIF_SUBR_COPYINSTR: {
3146		uintptr_t dest = mstate->dtms_scratch_ptr;
3147		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3148
3149		if (nargs > 1 && tupregs[1].dttk_value < size)
3150			size = tupregs[1].dttk_value + 1;
3151
3152		/*
3153		 * This action doesn't require any credential checks since
3154		 * probes will not activate in user contexts to which the
3155		 * enabling user does not have permissions.
3156		 */
3157		if (!DTRACE_INSCRATCH(mstate, size)) {
3158			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3159			regs[rd] = NULL;
3160			break;
3161		}
3162
3163		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3164		dtrace_copyinstr(tupregs[0].dttk_value, dest, size, flags);
3165		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3166
3167		((char *)dest)[size - 1] = '\0';
3168		mstate->dtms_scratch_ptr += size;
3169		regs[rd] = dest;
3170		break;
3171	}
3172
3173	case DIF_SUBR_MSGSIZE:
3174	case DIF_SUBR_MSGDSIZE: {
3175		uintptr_t baddr = tupregs[0].dttk_value, daddr;
3176		uintptr_t wptr, rptr;
3177		size_t count = 0;
3178		int cont = 0;
3179
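		/*
		 * In the manner of msgsize(9F) and msgdsize(9F), we walk the
		 * message's b_cont chain, summing (b_wptr - b_rptr) for each
		 * mblk; msgdsize() differs only in restricting the sum to
		 * M_DATA blocks.
		 */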
3180		while (baddr != NULL && !(*flags & CPU_DTRACE_FAULT)) {
3181
3182			if (!dtrace_canload(baddr, sizeof (mblk_t), mstate,
3183			    vstate)) {
3184				regs[rd] = NULL;
3185				break;
3186			}
3187
3188			wptr = dtrace_loadptr(baddr +
3189			    offsetof(mblk_t, b_wptr));
3190
3191			rptr = dtrace_loadptr(baddr +
3192			    offsetof(mblk_t, b_rptr));
3193
3194			if (wptr < rptr) {
3195				*flags |= CPU_DTRACE_BADADDR;
3196				*illval = tupregs[0].dttk_value;
3197				break;
3198			}
3199
3200			daddr = dtrace_loadptr(baddr +
3201			    offsetof(mblk_t, b_datap));
3202
3203			baddr = dtrace_loadptr(baddr +
3204			    offsetof(mblk_t, b_cont));
3205
3206			/*
3207			 * We want to guard against denial-of-service here,
3208			 * so we're only going to search the list for
3209			 * dtrace_msgdsize_max mblks.
3210			 */
3211			if (cont++ > dtrace_msgdsize_max) {
3212				*flags |= CPU_DTRACE_ILLOP;
3213				break;
3214			}
3215
3216			if (subr == DIF_SUBR_MSGDSIZE) {
3217				if (dtrace_load8(daddr +
3218				    offsetof(dblk_t, db_type)) != M_DATA)
3219					continue;
3220			}
3221
3222			count += wptr - rptr;
3223		}
3224
3225		if (!(*flags & CPU_DTRACE_FAULT))
3226			regs[rd] = count;
3227
3228		break;
3229	}
3230
3231	case DIF_SUBR_PROGENYOF: {
3232		pid_t pid = tupregs[0].dttk_value;
3233		proc_t *p;
3234		int rval = 0;
3235
3236		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3237
3238		for (p = curthread->t_procp; p != NULL; p = p->p_parent) {
3239			if (p->p_pidp->pid_id == pid) {
3240				rval = 1;
3241				break;
3242			}
3243		}
3244
3245		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3246
3247		regs[rd] = rval;
3248		break;
3249	}
3250
3251	case DIF_SUBR_SPECULATION:
3252		regs[rd] = dtrace_speculation(state);
3253		break;
3254
3255	case DIF_SUBR_COPYOUT: {
3256		uintptr_t kaddr = tupregs[0].dttk_value;
3257		uintptr_t uaddr = tupregs[1].dttk_value;
3258		uint64_t size = tupregs[2].dttk_value;
3259
3260		if (!dtrace_destructive_disallow &&
3261		    dtrace_priv_proc_control(state) &&
3262		    !dtrace_istoxic(kaddr, size)) {
3263			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3264			dtrace_copyout(kaddr, uaddr, size, flags);
3265			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3266		}
3267		break;
3268	}
3269
3270	case DIF_SUBR_COPYOUTSTR: {
3271		uintptr_t kaddr = tupregs[0].dttk_value;
3272		uintptr_t uaddr = tupregs[1].dttk_value;
3273		uint64_t size = tupregs[2].dttk_value;
3274
3275		if (!dtrace_destructive_disallow &&
3276		    dtrace_priv_proc_control(state) &&
3277		    !dtrace_istoxic(kaddr, size)) {
3278			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3279			dtrace_copyoutstr(kaddr, uaddr, size, flags);
3280			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3281		}
3282		break;
3283	}
3284
3285	case DIF_SUBR_STRLEN: {
3286		size_t sz;
3287		uintptr_t addr = (uintptr_t)tupregs[0].dttk_value;
3288		sz = dtrace_strlen((char *)addr,
3289		    state->dts_options[DTRACEOPT_STRSIZE]);
3290
3291		if (!dtrace_canload(addr, sz + 1, mstate, vstate)) {
3292			regs[rd] = NULL;
3293			break;
3294		}
3295
3296		regs[rd] = sz;
3297
3298		break;
3299	}
3300
3301	case DIF_SUBR_STRCHR:
3302	case DIF_SUBR_STRRCHR: {
3303		/*
3304		 * We're going to iterate over the string looking for the
3305		 * specified character.  We will iterate until we have reached
3306		 * the string length or we have found the character.  If this
3307		 * is DIF_SUBR_STRRCHR, we will look for the last occurrence
3308		 * of the specified character instead of the first.
3309		 */
3310		uintptr_t saddr = tupregs[0].dttk_value;
3311		uintptr_t addr = tupregs[0].dttk_value;
3312		uintptr_t limit = addr + state->dts_options[DTRACEOPT_STRSIZE];
3313		char c, target = (char)tupregs[1].dttk_value;
3314
3315		for (regs[rd] = NULL; addr < limit; addr++) {
3316			if ((c = dtrace_load8(addr)) == target) {
3317				regs[rd] = addr;
3318
3319				if (subr == DIF_SUBR_STRCHR)
3320					break;
3321			}
3322
3323			if (c == '\0')
3324				break;
3325		}
3326
3327		if (!dtrace_canload(saddr, addr - saddr, mstate, vstate)) {
3328			regs[rd] = NULL;
3329			break;
3330		}
3331
3332		break;
3333	}
3334
3335	case DIF_SUBR_STRSTR:
3336	case DIF_SUBR_INDEX:
3337	case DIF_SUBR_RINDEX: {
3338		/*
3339		 * We're going to iterate over the string looking for the
3340		 * specified string.  We will iterate until we have reached
3341		 * the string length or we have found the string.  (Yes, this
3342		 * is done in the most naive way possible -- but considering
3343		 * that the string we're searching for is likely to be
3344		 * relatively short, the complexity of Rabin-Karp or similar
3345		 * hardly seems merited.)
3346		 */
3347		char *addr = (char *)(uintptr_t)tupregs[0].dttk_value;
3348		char *substr = (char *)(uintptr_t)tupregs[1].dttk_value;
3349		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3350		size_t len = dtrace_strlen(addr, size);
3351		size_t sublen = dtrace_strlen(substr, size);
3352		char *limit = addr + len, *orig = addr;
3353		int notfound = subr == DIF_SUBR_STRSTR ? 0 : -1;
3354		int inc = 1;
3355
3356		regs[rd] = notfound;
3357
3358		if (!dtrace_canload((uintptr_t)addr, len + 1, mstate, vstate)) {
3359			regs[rd] = NULL;
3360			break;
3361		}
3362
3363		if (!dtrace_canload((uintptr_t)substr, sublen + 1, mstate,
3364		    vstate)) {
3365			regs[rd] = NULL;
3366			break;
3367		}
3368
3369		/*
3370		 * strstr() and index()/rindex() have similar semantics if
3371		 * both strings are the empty string: strstr() returns a
3372		 * pointer to the (empty) string, and index() and rindex()
3373		 * both return index 0 (regardless of any position argument).
3374		 */
3375		if (sublen == 0 && len == 0) {
3376			if (subr == DIF_SUBR_STRSTR)
3377				regs[rd] = (uintptr_t)addr;
3378			else
3379				regs[rd] = 0;
3380			break;
3381		}
3382
3383		if (subr != DIF_SUBR_STRSTR) {
3384			if (subr == DIF_SUBR_RINDEX) {
3385				limit = orig - 1;
3386				addr += len;
3387				inc = -1;
3388			}
3389
3390			/*
3391			 * Both index() and rindex() take an optional position
3392			 * argument that denotes the starting position.
3393			 */
3394			if (nargs == 3) {
3395				int64_t pos = (int64_t)tupregs[2].dttk_value;
3396
3397				/*
3398				 * If the position argument to index() is
3399				 * negative, Perl implicitly clamps it at
3400				 * zero.  This semantic is a little surprising
3401				 * given the special meaning of negative
3402				 * positions to similar Perl functions like
3403				 * substr(), but it appears to reflect a
3404				 * notion that index() can start from a
3405				 * negative index and increment its way up to
3406				 * the string.  Given this notion, Perl's
3407				 * rindex() is at least self-consistent in
3408				 * that it implicitly clamps positions greater
3409				 * than the string length to be the string
3410				 * length.  Where Perl completely loses
3411				 * coherence, however, is when the specified
3412				 * substring is the empty string ("").  In
3413				 * this case, even if the position is
3414				 * negative, rindex() returns 0 -- and even if
3415				 * the position is greater than the length,
3416				 * index() returns the string length.  These
3417				 * semantics violate the notion that index()
3418				 * should never return a value less than the
3419				 * specified position and that rindex() should
3420				 * never return a value greater than the
3421				 * specified position.  (One assumes that
3422				 * these semantics are artifacts of Perl's
3423				 * implementation and not the results of
3424				 * deliberate design -- it beggars belief that
3425				 * even Larry Wall could desire such oddness.)
3426				 * While in the abstract one would wish for
3427				 * consistent position semantics across
3428				 * substr(), index() and rindex() -- or at the
3429				 * very least self-consistent position
3430				 * semantics for index() and rindex() -- we
3431				 * instead opt to keep with the extant Perl
3432				 * semantics, in all their broken glory.  (Do
3433				 * we have more desire to maintain Perl's
3434				 * semantics than Perl does?  Probably.)
3435				 */
3436				if (subr == DIF_SUBR_RINDEX) {
3437					if (pos < 0) {
3438						if (sublen == 0)
3439							regs[rd] = 0;
3440						break;
3441					}
3442
3443					if (pos > len)
3444						pos = len;
3445				} else {
3446					if (pos < 0)
3447						pos = 0;
3448
3449					if (pos >= len) {
3450						if (sublen == 0)
3451							regs[rd] = len;
3452						break;
3453					}
3454				}
3455
3456				addr = orig + pos;
3457			}
3458		}
3459
3460		for (regs[rd] = notfound; addr != limit; addr += inc) {
3461			if (dtrace_strncmp(addr, substr, sublen) == 0) {
3462				if (subr != DIF_SUBR_STRSTR) {
3463					/*
3464					 * As D index() and rindex() are
3465					 * modeled on Perl (and not on awk),
3466					 * we return a zero-based (and not a
3467					 * one-based) index.  (For you Perl
3468					 * weenies: no, we're not going to add
3469					 * $[ -- and shouldn't you be at a con
3470					 * or something?)
3471					 */
3472					regs[rd] = (uintptr_t)(addr - orig);
3473					break;
3474				}
3475
3476				ASSERT(subr == DIF_SUBR_STRSTR);
3477				regs[rd] = (uintptr_t)addr;
3478				break;
3479			}
3480		}
3481
3482		break;
3483	}
3484
3485	case DIF_SUBR_STRTOK: {
3486		uintptr_t addr = tupregs[0].dttk_value;
3487		uintptr_t tokaddr = tupregs[1].dttk_value;
3488		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3489		uintptr_t limit, toklimit = tokaddr + size;
3490		uint8_t c, tokmap[32];	 /* 256 / 8 */
3491		char *dest = (char *)mstate->dtms_scratch_ptr;
3492		int i;
3493
3494		/*
3495		 * Check both the token buffer and (later) the input buffer,
3496		 * since both could be non-scratch addresses.
3497		 */
3498		if (!dtrace_strcanload(tokaddr, size, mstate, vstate)) {
3499			regs[rd] = NULL;
3500			break;
3501		}
3502
3503		if (!DTRACE_INSCRATCH(mstate, size)) {
3504			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3505			regs[rd] = NULL;
3506			break;
3507		}
3508
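		/*
		 * As with strtok(3C), a NULL address continues tokenizing
		 * from where the previous call left off:  for example,
		 * strtok("/dev/dsk", "/") returns "dev", and a subsequent
		 * strtok(NULL, "/") returns "dsk".
		 */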
3509		if (addr == NULL) {
3510			/*
3511			 * If the address specified is NULL, we use our saved
3512			 * strtok pointer from the mstate.  Note that this
3513			 * means that the saved strtok pointer is _only_
3514			 * valid within multiple enablings of the same probe --
3515			 * it behaves like an implicit clause-local variable.
3516			 */
3517			addr = mstate->dtms_strtok;
3518		} else {
3519			/*
3520			 * If the user-specified address is non-NULL we must
3521			 * access check it.  This is the only time we have
3522			 * a chance to do so, since this address may reside
3523			 * in the string table of this clause -- future calls
3524			 * (when we fetch addr from mstate->dtms_strtok)
3525			 * would fail this access check.
3526			 */
3527			if (!dtrace_strcanload(addr, size, mstate, vstate)) {
3528				regs[rd] = NULL;
3529				break;
3530			}
3531		}
3532
3533		/*
3534		 * First, zero the token map, and then process the token
3535		 * string -- setting a bit in the map for every character
3536		 * found in the token string.
3537		 */
3538		for (i = 0; i < sizeof (tokmap); i++)
3539			tokmap[i] = 0;
3540
3541		for (; tokaddr < toklimit; tokaddr++) {
3542			if ((c = dtrace_load8(tokaddr)) == '\0')
3543				break;
3544
3545			ASSERT((c >> 3) < sizeof (tokmap));
3546			tokmap[c >> 3] |= (1 << (c & 0x7));
3547		}
3548
3549		for (limit = addr + size; addr < limit; addr++) {
3550			/*
3551			 * We're looking for a character that is _not_ contained
3552			 * in the token string.
3553			 */
3554			if ((c = dtrace_load8(addr)) == '\0')
3555				break;
3556
3557			if (!(tokmap[c >> 3] & (1 << (c & 0x7))))
3558				break;
3559		}
3560
3561		if (c == '\0') {
3562			/*
3563			 * We reached the end of the string without finding
3564			 * any character that was not in the token string.
3565			 * We return NULL in this case, and we set the saved
3566			 * address to NULL as well.
3567			 */
3568			regs[rd] = NULL;
3569			mstate->dtms_strtok = NULL;
3570			break;
3571		}
3572
3573		/*
3574		 * From here on, we're copying into the destination string.
3575		 */
3576		for (i = 0; addr < limit && i < size - 1; addr++) {
3577			if ((c = dtrace_load8(addr)) == '\0')
3578				break;
3579
3580			if (tokmap[c >> 3] & (1 << (c & 0x7)))
3581				break;
3582
3583			ASSERT(i < size);
3584			dest[i++] = c;
3585		}
3586
3587		ASSERT(i < size);
3588		dest[i] = '\0';
3589		regs[rd] = (uintptr_t)dest;
3590		mstate->dtms_scratch_ptr += size;
3591		mstate->dtms_strtok = addr;
3592		break;
3593	}
3594
3595	case DIF_SUBR_SUBSTR: {
3596		uintptr_t s = tupregs[0].dttk_value;
3597		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3598		char *d = (char *)mstate->dtms_scratch_ptr;
3599		int64_t index = (int64_t)tupregs[1].dttk_value;
3600		int64_t remaining = (int64_t)tupregs[2].dttk_value;
3601		size_t len = dtrace_strlen((char *)s, size);
3602		int64_t i = 0;
3603
3604		if (!dtrace_canload(s, len + 1, mstate, vstate)) {
3605			regs[rd] = NULL;
3606			break;
3607		}
3608
3609		if (!DTRACE_INSCRATCH(mstate, size)) {
3610			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3611			regs[rd] = NULL;
3612			break;
3613		}
3614
3615		if (nargs <= 2)
3616			remaining = (int64_t)size;
3617
3618		if (index < 0) {
3619			index += len;
3620
3621			if (index < 0 && index + remaining > 0) {
3622				remaining += index;
3623				index = 0;
3624			}
3625		}
3626
3627		if (index >= len || index < 0) {
3628			remaining = 0;
3629		} else if (remaining < 0) {
3630			remaining += len - index;
3631		} else if (index + remaining > size) {
3632			remaining = size - index;
3633		}
3634
		/*
		 * Bound the copy at size - 1 bytes so that the terminating
		 * NUL below can never stray past our scratch allocation.
		 */
		for (i = 0; i < remaining && i < size - 1; i++) {
			if ((d[i] = dtrace_load8(s + index + i)) == '\0')
				break;
		}
3639
3640		d[i] = '\0';
3641
3642		mstate->dtms_scratch_ptr += size;
3643		regs[rd] = (uintptr_t)d;
3644		break;
3645	}
3646
3647	case DIF_SUBR_GETMAJOR:
3648#ifdef _LP64
3649		regs[rd] = (tupregs[0].dttk_value >> NBITSMINOR64) & MAXMAJ64;
3650#else
3651		regs[rd] = (tupregs[0].dttk_value >> NBITSMINOR) & MAXMAJ;
3652#endif
3653		break;
3654
3655	case DIF_SUBR_GETMINOR:
3656#ifdef _LP64
3657		regs[rd] = tupregs[0].dttk_value & MAXMIN64;
3658#else
3659		regs[rd] = tupregs[0].dttk_value & MAXMIN;
3660#endif
3661		break;
3662
3663	case DIF_SUBR_DDI_PATHNAME: {
3664		/*
3665		 * This one is a galactic mess.  We are going to roughly
3666		 * emulate ddi_pathname(), but it's made more complicated
3667		 * by the fact that we (a) want to include the minor name and
3668		 * (b) must proceed iteratively instead of recursively.
3669		 */
3670		uintptr_t dest = mstate->dtms_scratch_ptr;
3671		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3672		char *start = (char *)dest, *end = start + size - 1;
3673		uintptr_t daddr = tupregs[0].dttk_value;
3674		int64_t minor = (int64_t)tupregs[1].dttk_value;
3675		char *s;
3676		int i, len, depth = 0;
3677
3678		/*
		 * Due to all the pointer jumping we do -- and the context we
		 * must rely upon -- we simply mandate that the user have
		 * kernel read privileges to use this routine.
3682		 */
		if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) == 0) {
			*flags |= CPU_DTRACE_KPRIV;
			*illval = daddr;
			regs[rd] = NULL;
			break;
		}
3688
3689		if (!DTRACE_INSCRATCH(mstate, size)) {
3690			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3691			regs[rd] = NULL;
3692			break;
3693		}
3694
3695		*end = '\0';
3696
3697		/*
3698		 * We want to have a name for the minor.  In order to do this,
3699		 * we need to walk the minor list from the devinfo.  We want
3700		 * to be sure that we don't infinitely walk a circular list,
3701		 * so we check for circularity by sending a scout pointer
3702		 * ahead two elements for every element that we iterate over;
3703		 * if the list is circular, these will ultimately point to the
3704		 * same element.  You may recognize this little trick as the
3705		 * answer to a stupid interview question -- one that always
3706		 * seems to be asked by those who had to have it laboriously
3707		 * explained to them, and who can't even concisely describe
3708		 * the conditions under which one would be forced to resort to
3709		 * this technique.  Needless to say, those conditions are
3710		 * found here -- and probably only here.  Is this the only use
3711		 * of this infamous trick in shipping, production code?  If it
3712		 * isn't, it probably should be...
3713		 */
3714		if (minor != -1) {
3715			uintptr_t maddr = dtrace_loadptr(daddr +
3716			    offsetof(struct dev_info, devi_minor));
3717
3718			uintptr_t next = offsetof(struct ddi_minor_data, next);
3719			uintptr_t name = offsetof(struct ddi_minor_data,
3720			    d_minor) + offsetof(struct ddi_minor, name);
3721			uintptr_t dev = offsetof(struct ddi_minor_data,
3722			    d_minor) + offsetof(struct ddi_minor, dev);
3723			uintptr_t scout;
3724
3725			if (maddr != NULL)
3726				scout = dtrace_loadptr(maddr + next);
3727
3728			while (maddr != NULL && !(*flags & CPU_DTRACE_FAULT)) {
3729				uint64_t m;
3730#ifdef _LP64
3731				m = dtrace_load64(maddr + dev) & MAXMIN64;
3732#else
3733				m = dtrace_load32(maddr + dev) & MAXMIN;
3734#endif
3735				if (m != minor) {
3736					maddr = dtrace_loadptr(maddr + next);
3737
3738					if (scout == NULL)
3739						continue;
3740
3741					scout = dtrace_loadptr(scout + next);
3742
3743					if (scout == NULL)
3744						continue;
3745
3746					scout = dtrace_loadptr(scout + next);
3747
3748					if (scout == NULL)
3749						continue;
3750
3751					if (scout == maddr) {
3752						*flags |= CPU_DTRACE_ILLOP;
3753						break;
3754					}
3755
3756					continue;
3757				}
3758
3759				/*
3760				 * We have the minor data.  Now we need to
3761				 * copy the minor's name into the end of the
3762				 * pathname.
3763				 */
3764				s = (char *)dtrace_loadptr(maddr + name);
3765				len = dtrace_strlen(s, size);
3766
3767				if (*flags & CPU_DTRACE_FAULT)
3768					break;
3769
3770				if (len != 0) {
3771					if ((end -= (len + 1)) < start)
3772						break;
3773
3774					*end = ':';
3775				}
3776
3777				for (i = 1; i <= len; i++)
3778					end[i] = dtrace_load8((uintptr_t)s++);
3779				break;
3780			}
3781		}
3782
3783		while (daddr != NULL && !(*flags & CPU_DTRACE_FAULT)) {
3784			ddi_node_state_t devi_state;
3785
3786			devi_state = dtrace_load32(daddr +
3787			    offsetof(struct dev_info, devi_node_state));
3788
3789			if (*flags & CPU_DTRACE_FAULT)
3790				break;
3791
3792			if (devi_state >= DS_INITIALIZED) {
3793				s = (char *)dtrace_loadptr(daddr +
3794				    offsetof(struct dev_info, devi_addr));
3795				len = dtrace_strlen(s, size);
3796
3797				if (*flags & CPU_DTRACE_FAULT)
3798					break;
3799
3800				if (len != 0) {
3801					if ((end -= (len + 1)) < start)
3802						break;
3803
3804					*end = '@';
3805				}
3806
3807				for (i = 1; i <= len; i++)
3808					end[i] = dtrace_load8((uintptr_t)s++);
3809			}
3810
3811			/*
3812			 * Now for the node name...
3813			 */
3814			s = (char *)dtrace_loadptr(daddr +
3815			    offsetof(struct dev_info, devi_node_name));
3816
3817			daddr = dtrace_loadptr(daddr +
3818			    offsetof(struct dev_info, devi_parent));
3819
3820			/*
3821			 * If our parent is NULL (that is, if we're the root
3822			 * node), we're going to use the special path
3823			 * "devices".
3824			 */
3825			if (daddr == NULL)
3826				s = "devices";
3827
3828			len = dtrace_strlen(s, size);
3829			if (*flags & CPU_DTRACE_FAULT)
3830				break;
3831
3832			if ((end -= (len + 1)) < start)
3833				break;
3834
3835			for (i = 1; i <= len; i++)
3836				end[i] = dtrace_load8((uintptr_t)s++);
3837			*end = '/';
3838
3839			if (depth++ > dtrace_devdepth_max) {
3840				*flags |= CPU_DTRACE_ILLOP;
3841				break;
3842			}
3843		}
3844
3845		if (end < start)
3846			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3847
3848		if (daddr == NULL) {
3849			regs[rd] = (uintptr_t)end;
3850			mstate->dtms_scratch_ptr += size;
3851		}
3852
3853		break;
3854	}
3855
3856	case DIF_SUBR_STRJOIN: {
3857		char *d = (char *)mstate->dtms_scratch_ptr;
3858		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3859		uintptr_t s1 = tupregs[0].dttk_value;
3860		uintptr_t s2 = tupregs[1].dttk_value;
3861		int i = 0;
3862
3863		if (!dtrace_strcanload(s1, size, mstate, vstate) ||
3864		    !dtrace_strcanload(s2, size, mstate, vstate)) {
3865			regs[rd] = NULL;
3866			break;
3867		}
3868
3869		if (!DTRACE_INSCRATCH(mstate, size)) {
3870			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3871			regs[rd] = NULL;
3872			break;
3873		}
3874
3875		for (;;) {
3876			if (i >= size) {
3877				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3878				regs[rd] = NULL;
3879				break;
3880			}
3881
3882			if ((d[i++] = dtrace_load8(s1++)) == '\0') {
3883				i--;
3884				break;
3885			}
3886		}
3887
3888		for (;;) {
3889			if (i >= size) {
3890				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3891				regs[rd] = NULL;
3892				break;
3893			}
3894
3895			if ((d[i++] = dtrace_load8(s2++)) == '\0')
3896				break;
3897		}
3898
		/*
		 * Note that i may equal size when the joined string --
		 * including its terminating NUL -- fills the allocation
		 * exactly; that still fits, and the overflow paths above
		 * have already returned NULL with CPU_DTRACE_NOSCRATCH set.
		 */
		if (i < size || !DTRACE_CPUFLAG_ISSET(CPU_DTRACE_NOSCRATCH)) {
			mstate->dtms_scratch_ptr += i;
			regs[rd] = (uintptr_t)d;
		}
3903
3904		break;
3905	}
3906
3907	case DIF_SUBR_LLTOSTR: {
3908		int64_t i = (int64_t)tupregs[0].dttk_value;
		/*
		 * Take the absolute value in unsigned arithmetic:  negating
		 * INT64_MIN as a signed quantity would overflow.
		 */
		uint64_t val = i < 0 ? -(uint64_t)i : (uint64_t)i;
3910		uint64_t size = 22;	/* enough room for 2^64 in decimal */
3911		char *end = (char *)mstate->dtms_scratch_ptr + size - 1;
3912
3913		if (!DTRACE_INSCRATCH(mstate, size)) {
3914			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3915			regs[rd] = NULL;
3916			break;
3917		}
3918
3919		for (*end-- = '\0'; val; val /= 10)
3920			*end-- = '0' + (val % 10);
3921
3922		if (i == 0)
3923			*end-- = '0';
3924
3925		if (i < 0)
3926			*end-- = '-';
3927
3928		regs[rd] = (uintptr_t)end + 1;
3929		mstate->dtms_scratch_ptr += size;
3930		break;
3931	}
3932
3933	case DIF_SUBR_HTONS:
3934	case DIF_SUBR_NTOHS:
3935#ifdef _BIG_ENDIAN
3936		regs[rd] = (uint16_t)tupregs[0].dttk_value;
3937#else
3938		regs[rd] = DT_BSWAP_16((uint16_t)tupregs[0].dttk_value);
3939#endif
3940		break;
3941
3942
3943	case DIF_SUBR_HTONL:
3944	case DIF_SUBR_NTOHL:
3945#ifdef _BIG_ENDIAN
3946		regs[rd] = (uint32_t)tupregs[0].dttk_value;
3947#else
3948		regs[rd] = DT_BSWAP_32((uint32_t)tupregs[0].dttk_value);
3949#endif
3950		break;
3951
3952
3953	case DIF_SUBR_HTONLL:
3954	case DIF_SUBR_NTOHLL:
3955#ifdef _BIG_ENDIAN
3956		regs[rd] = (uint64_t)tupregs[0].dttk_value;
3957#else
3958		regs[rd] = DT_BSWAP_64((uint64_t)tupregs[0].dttk_value);
3959#endif
3960		break;
3961
3962
3963	case DIF_SUBR_DIRNAME:
3964	case DIF_SUBR_BASENAME: {
3965		char *dest = (char *)mstate->dtms_scratch_ptr;
3966		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3967		uintptr_t src = tupregs[0].dttk_value;
3968		int i, j, len = dtrace_strlen((char *)src, size);
3969		int lastbase = -1, firstbase = -1, lastdir = -1;
3970		int start, end;
3971
3972		if (!dtrace_canload(src, len + 1, mstate, vstate)) {
3973			regs[rd] = NULL;
3974			break;
3975		}
3976
3977		if (!DTRACE_INSCRATCH(mstate, size)) {
3978			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3979			regs[rd] = NULL;
3980			break;
3981		}
3982
3983		/*
		 * The basename and dirname of a zero-length string are both
		 * defined to be ".".
3986		 */
3987		if (len == 0) {
3988			len = 1;
3989			src = (uintptr_t)".";
3990		}
3991
3992		/*
3993		 * Start from the back of the string, moving back toward the
3994		 * front until we see a character that isn't a slash.  That
3995		 * character is the last character in the basename.
3996		 */
3997		for (i = len - 1; i >= 0; i--) {
3998			if (dtrace_load8(src + i) != '/')
3999				break;
4000		}
4001
4002		if (i >= 0)
4003			lastbase = i;
4004
4005		/*
4006		 * Starting from the last character in the basename, move
4007		 * towards the front until we find a slash.  The character
4008		 * that we processed immediately before that is the first
4009		 * character in the basename.
4010		 */
4011		for (; i >= 0; i--) {
4012			if (dtrace_load8(src + i) == '/')
4013				break;
4014		}
4015
4016		if (i >= 0)
4017			firstbase = i + 1;
4018
4019		/*
4020		 * Now keep going until we find a non-slash character.  That
4021		 * character is the last character in the dirname.
4022		 */
4023		for (; i >= 0; i--) {
4024			if (dtrace_load8(src + i) != '/')
4025				break;
4026		}
4027
4028		if (i >= 0)
4029			lastdir = i;
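
		/*
		 * To illustrate with "/usr/lib/":  lastbase indexes the 'b'
		 * (offset 7), firstbase the 'l' (offset 5) and lastdir the
		 * 'r' (offset 3) -- so basename() will yield "lib" and
		 * dirname() "/usr".
		 */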
4030
4031		ASSERT(!(lastbase == -1 && firstbase != -1));
4032		ASSERT(!(firstbase == -1 && lastdir != -1));
4033
4034		if (lastbase == -1) {
4035			/*
4036			 * We didn't find a non-slash character.  We know that
4037			 * the length is non-zero, so the whole string must be
4038			 * slashes.  In either the dirname or the basename
4039			 * case, we return '/'.
4040			 */
4041			ASSERT(firstbase == -1);
4042			firstbase = lastbase = lastdir = 0;
4043		}
4044
4045		if (firstbase == -1) {
4046			/*
4047			 * The entire string consists only of a basename
4048			 * component.  If we're looking for dirname, we need
4049			 * to change our string to be just "."; if we're
4050			 * looking for a basename, we'll just set the first
4051			 * character of the basename to be 0.
4052			 */
4053			if (subr == DIF_SUBR_DIRNAME) {
4054				ASSERT(lastdir == -1);
4055				src = (uintptr_t)".";
4056				lastdir = 0;
4057			} else {
4058				firstbase = 0;
4059			}
4060		}
4061
4062		if (subr == DIF_SUBR_DIRNAME) {
4063			if (lastdir == -1) {
4064				/*
4065				 * We know that we have a slash in the name --
4066				 * or lastdir would be set to 0, above.  And
4067				 * because lastdir is -1, we know that this
4068				 * slash must be the first character.  (That
4069				 * is, the full string must be of the form
4070				 * "/basename".)  In this case, the last
4071				 * character of the directory name is 0.
4072				 */
4073				lastdir = 0;
4074			}
4075
4076			start = 0;
4077			end = lastdir;
4078		} else {
4079			ASSERT(subr == DIF_SUBR_BASENAME);
4080			ASSERT(firstbase != -1 && lastbase != -1);
4081			start = firstbase;
4082			end = lastbase;
4083		}
4084
4085		for (i = start, j = 0; i <= end && j < size - 1; i++, j++)
4086			dest[j] = dtrace_load8(src + i);
4087
4088		dest[j] = '\0';
4089		regs[rd] = (uintptr_t)dest;
4090		mstate->dtms_scratch_ptr += size;
4091		break;
4092	}
4093
4094	case DIF_SUBR_CLEANPATH: {
4095		char *dest = (char *)mstate->dtms_scratch_ptr, c;
4096		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4097		uintptr_t src = tupregs[0].dttk_value;
4098		int i = 0, j = 0;
4099
4100		if (!dtrace_strcanload(src, size, mstate, vstate)) {
4101			regs[rd] = NULL;
4102			break;
4103		}
4104
4105		if (!DTRACE_INSCRATCH(mstate, size)) {
4106			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4107			regs[rd] = NULL;
4108			break;
4109		}
4110
4111		/*
4112		 * Move forward, loading each character.
4113		 */
4114		do {
4115			c = dtrace_load8(src + i++);
4116next:
			if (j + 5 >= size)	/* 5 = sizeof ("/..c") */
4118				break;
4119
4120			if (c != '/') {
4121				dest[j++] = c;
4122				continue;
4123			}
4124
4125			c = dtrace_load8(src + i++);
4126
4127			if (c == '/') {
4128				/*
4129				 * We have two slashes -- we can just advance
4130				 * to the next character.
4131				 */
4132				goto next;
4133			}
4134
4135			if (c != '.') {
4136				/*
4137				 * This is not "." and it's not ".." -- we can
4138				 * just store the "/" and this character and
4139				 * drive on.
4140				 */
4141				dest[j++] = '/';
4142				dest[j++] = c;
4143				continue;
4144			}
4145
4146			c = dtrace_load8(src + i++);
4147
4148			if (c == '/') {
4149				/*
4150				 * This is a "/./" component.  We're not going
4151				 * to store anything in the destination buffer;
4152				 * we're just going to go to the next component.
4153				 */
4154				goto next;
4155			}
4156
4157			if (c != '.') {
4158				/*
4159				 * This is not ".." -- we can just store the
4160				 * "/." and this character and continue
4161				 * processing.
4162				 */
4163				dest[j++] = '/';
4164				dest[j++] = '.';
4165				dest[j++] = c;
4166				continue;
4167			}
4168
4169			c = dtrace_load8(src + i++);
4170
4171			if (c != '/' && c != '\0') {
4172				/*
4173				 * This is not ".." -- it's "..[mumble]".
4174				 * We'll store the "/.." and this character
4175				 * and continue processing.
4176				 */
4177				dest[j++] = '/';
4178				dest[j++] = '.';
4179				dest[j++] = '.';
4180				dest[j++] = c;
4181				continue;
4182			}
4183
4184			/*
4185			 * This is "/../" or "/..\0".  We need to back up
4186			 * our destination pointer until we find a "/".
4187			 */
4188			i--;
4189			while (j != 0 && dest[--j] != '/')
4190				continue;
4191
4192			if (c == '\0')
4193				dest[++j] = '/';
4194		} while (c != '\0');
4195
4196		dest[j] = '\0';
4197		regs[rd] = (uintptr_t)dest;
4198		mstate->dtms_scratch_ptr += size;
4199		break;
4200	}
4201
4202	case DIF_SUBR_INET_NTOA:
4203	case DIF_SUBR_INET_NTOA6:
4204	case DIF_SUBR_INET_NTOP: {
4205		size_t size;
4206		int af, argi, i;
4207		char *base, *end;
4208
4209		if (subr == DIF_SUBR_INET_NTOP) {
4210			af = (int)tupregs[0].dttk_value;
4211			argi = 1;
4212		} else {
			af = subr == DIF_SUBR_INET_NTOA ? AF_INET : AF_INET6;
4214			argi = 0;
4215		}
4216
4217		if (af == AF_INET) {
4218			ipaddr_t ip4;
4219			uint8_t *ptr8, val;
4220
4221			/*
4222			 * Safely load the IPv4 address.
4223			 */
4224			ip4 = dtrace_load32(tupregs[argi].dttk_value);
4225
4226			/*
4227			 * Check an IPv4 string will fit in scratch.
4228			 */
4229			size = INET_ADDRSTRLEN;
4230			if (!DTRACE_INSCRATCH(mstate, size)) {
4231				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4232				regs[rd] = NULL;
4233				break;
4234			}
4235			base = (char *)mstate->dtms_scratch_ptr;
4236			end = (char *)mstate->dtms_scratch_ptr + size - 1;
4237
4238			/*
4239			 * Stringify as a dotted decimal quad.
4240			 */
4241			*end-- = '\0';
4242			ptr8 = (uint8_t *)&ip4;
4243			for (i = 3; i >= 0; i--) {
4244				val = ptr8[i];
4245
4246				if (val == 0) {
4247					*end-- = '0';
4248				} else {
4249					for (; val; val /= 10) {
4250						*end-- = '0' + (val % 10);
4251					}
4252				}
4253
4254				if (i > 0)
4255					*end-- = '.';
4256			}
4257			ASSERT(end + 1 >= base);
4258
4259		} else if (af == AF_INET6) {
4260			struct in6_addr ip6;
4261			int firstzero, tryzero, numzero, v6end;
4262			uint16_t val;
4263			const char digits[] = "0123456789abcdef";
4264
4265			/*
4266			 * Stringify using RFC 1884 convention 2 - 16 bit
4267			 * hexadecimal values with a zero-run compression.
4268			 * Lower case hexadecimal digits are used.
4269			 * 	eg, fe80::214:4fff:fe0b:76c8.
4270			 * The IPv4 embedded form is returned for inet_ntop,
4271			 * just the IPv4 string is returned for inet_ntoa6.
4272			 */
4273
4274			/*
4275			 * Safely load the IPv6 address.
4276			 */
4277			dtrace_bcopy(
4278			    (void *)(uintptr_t)tupregs[argi].dttk_value,
4279			    (void *)(uintptr_t)&ip6, sizeof (struct in6_addr));
4280
4281			/*
4282			 * Check an IPv6 string will fit in scratch.
4283			 */
4284			size = INET6_ADDRSTRLEN;
4285			if (!DTRACE_INSCRATCH(mstate, size)) {
4286				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4287				regs[rd] = NULL;
4288				break;
4289			}
4290			base = (char *)mstate->dtms_scratch_ptr;
4291			end = (char *)mstate->dtms_scratch_ptr + size - 1;
4292			*end-- = '\0';
4293
4294			/*
4295			 * Find the longest run of 16 bit zero values
4296			 * for the single allowed zero compression - "::".
4297			 */
4298			firstzero = -1;
4299			tryzero = -1;
4300			numzero = 1;
4301			for (i = 0; i < sizeof (struct in6_addr); i++) {
4302				if (ip6._S6_un._S6_u8[i] == 0 &&
4303				    tryzero == -1 && i % 2 == 0) {
4304					tryzero = i;
4305					continue;
4306				}
4307
4308				if (tryzero != -1 &&
4309				    (ip6._S6_un._S6_u8[i] != 0 ||
4310				    i == sizeof (struct in6_addr) - 1)) {
4311
4312					if (i - tryzero <= numzero) {
4313						tryzero = -1;
4314						continue;
4315					}
4316
4317					firstzero = tryzero;
4318					numzero = i - i % 2 - tryzero;
4319					tryzero = -1;
4320
4321					if (ip6._S6_un._S6_u8[i] == 0 &&
4322					    i == sizeof (struct in6_addr) - 1)
4323						numzero += 2;
4324				}
4325			}
4326			ASSERT(firstzero + numzero <= sizeof (struct in6_addr));
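
			/*
			 * For example, for ::1 the loop above leaves
			 * firstzero = 0 and numzero = 14:  the first
			 * fourteen bytes are elided as "::" and only the
			 * final 16-bit group is rendered, giving "::1".
			 */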
4327
4328			/*
4329			 * Check for an IPv4 embedded address.
4330			 */
4331			v6end = sizeof (struct in6_addr) - 2;
4332			if (IN6_IS_ADDR_V4MAPPED(&ip6) ||
4333			    IN6_IS_ADDR_V4COMPAT(&ip6)) {
4334				for (i = sizeof (struct in6_addr) - 1;
4335				    i >= DTRACE_V4MAPPED_OFFSET; i--) {
4336					ASSERT(end >= base);
4337
4338					val = ip6._S6_un._S6_u8[i];
4339
4340					if (val == 0) {
4341						*end-- = '0';
4342					} else {
4343						for (; val; val /= 10) {
4344							*end-- = '0' + val % 10;
4345						}
4346					}
4347
4348					if (i > DTRACE_V4MAPPED_OFFSET)
4349						*end-- = '.';
4350				}
4351
4352				if (subr == DIF_SUBR_INET_NTOA6)
4353					goto inetout;
4354
4355				/*
4356				 * Set v6end to skip the IPv4 address that
4357				 * we have already stringified.
4358				 */
4359				v6end = 10;
4360			}
4361
4362			/*
4363			 * Build the IPv6 string by working through the
4364			 * address in reverse.
4365			 */
4366			for (i = v6end; i >= 0; i -= 2) {
4367				ASSERT(end >= base);
4368
4369				if (i == firstzero + numzero - 2) {
4370					*end-- = ':';
4371					*end-- = ':';
4372					i -= numzero - 2;
4373					continue;
4374				}
4375
4376				if (i < 14 && i != firstzero - 2)
4377					*end-- = ':';
4378
4379				val = (ip6._S6_un._S6_u8[i] << 8) +
4380				    ip6._S6_un._S6_u8[i + 1];
4381
4382				if (val == 0) {
4383					*end-- = '0';
4384				} else {
4385					for (; val; val /= 16) {
4386						*end-- = digits[val % 16];
4387					}
4388				}
4389			}
4390			ASSERT(end + 1 >= base);
4391
4392		} else {
4393			/*
			 * The user didn't use AF_INET or AF_INET6.
4395			 */
4396			DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
4397			regs[rd] = NULL;
4398			break;
4399		}
4400
4401inetout:	regs[rd] = (uintptr_t)end + 1;
4402		mstate->dtms_scratch_ptr += size;
4403		break;
4404	}
4405
4406	}
4407}
4408
4409/*
4410 * Emulate the execution of DTrace IR instructions specified by the given
4411 * DIF object.  This function is deliberately void of assertions as all of
4412 * the necessary checks are handled by a call to dtrace_difo_validate().
4413 */
4414static uint64_t
4415dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate,
4416    dtrace_vstate_t *vstate, dtrace_state_t *state)
4417{
4418	const dif_instr_t *text = difo->dtdo_buf;
4419	const uint_t textlen = difo->dtdo_len;
4420	const char *strtab = difo->dtdo_strtab;
4421	const uint64_t *inttab = difo->dtdo_inttab;
4422
4423	uint64_t rval = 0;
4424	dtrace_statvar_t *svar;
4425	dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
4426	dtrace_difv_t *v;
4427	volatile uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
4428	volatile uintptr_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
4429
4430	dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
4431	uint64_t regs[DIF_DIR_NREGS];
4432	uint64_t *tmp;
4433
4434	uint8_t cc_n = 0, cc_z = 0, cc_v = 0, cc_c = 0;
4435	int64_t cc_r;
4436	uint_t pc = 0, id, opc;
4437	uint8_t ttop = 0;
4438	dif_instr_t instr;
4439	uint_t r1, r2, rd;
4440
4441	/*
4442	 * We stash the current DIF object into the machine state: we need it
4443	 * for subsequent access checking.
4444	 */
4445	mstate->dtms_difo = difo;
4446
4447	regs[DIF_REG_R0] = 0; 		/* %r0 is fixed at zero */
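
	/*
	 * The emulator models a small condition-code register in the classic
	 * RISC style:  DIF_OP_CMP and DIF_OP_TST set the negative, zero,
	 * overflow and carry bits (cc_n, cc_z, cc_v, cc_c), and the
	 * conditional branches consume them -- e.g., DIF_OP_BGU (branch if
	 * greater, unsigned) is taken only when both carry and zero are
	 * clear.
	 */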
4448
4449	while (pc < textlen && !(*flags & CPU_DTRACE_FAULT)) {
4450		opc = pc;
4451
4452		instr = text[pc++];
4453		r1 = DIF_INSTR_R1(instr);
4454		r2 = DIF_INSTR_R2(instr);
4455		rd = DIF_INSTR_RD(instr);
4456
4457		switch (DIF_INSTR_OP(instr)) {
4458		case DIF_OP_OR:
4459			regs[rd] = regs[r1] | regs[r2];
4460			break;
4461		case DIF_OP_XOR:
4462			regs[rd] = regs[r1] ^ regs[r2];
4463			break;
4464		case DIF_OP_AND:
4465			regs[rd] = regs[r1] & regs[r2];
4466			break;
4467		case DIF_OP_SLL:
4468			regs[rd] = regs[r1] << regs[r2];
4469			break;
4470		case DIF_OP_SRL:
4471			regs[rd] = regs[r1] >> regs[r2];
4472			break;
4473		case DIF_OP_SUB:
4474			regs[rd] = regs[r1] - regs[r2];
4475			break;
4476		case DIF_OP_ADD:
4477			regs[rd] = regs[r1] + regs[r2];
4478			break;
4479		case DIF_OP_MUL:
4480			regs[rd] = regs[r1] * regs[r2];
4481			break;
4482		case DIF_OP_SDIV:
4483			if (regs[r2] == 0) {
4484				regs[rd] = 0;
4485				*flags |= CPU_DTRACE_DIVZERO;
4486			} else {
4487				regs[rd] = (int64_t)regs[r1] /
4488				    (int64_t)regs[r2];
4489			}
4490			break;
4491
4492		case DIF_OP_UDIV:
4493			if (regs[r2] == 0) {
4494				regs[rd] = 0;
4495				*flags |= CPU_DTRACE_DIVZERO;
4496			} else {
4497				regs[rd] = regs[r1] / regs[r2];
4498			}
4499			break;
4500
4501		case DIF_OP_SREM:
4502			if (regs[r2] == 0) {
4503				regs[rd] = 0;
4504				*flags |= CPU_DTRACE_DIVZERO;
4505			} else {
4506				regs[rd] = (int64_t)regs[r1] %
4507				    (int64_t)regs[r2];
4508			}
4509			break;
4510
4511		case DIF_OP_UREM:
4512			if (regs[r2] == 0) {
4513				regs[rd] = 0;
4514				*flags |= CPU_DTRACE_DIVZERO;
4515			} else {
4516				regs[rd] = regs[r1] % regs[r2];
4517			}
4518			break;
4519
4520		case DIF_OP_NOT:
4521			regs[rd] = ~regs[r1];
4522			break;
4523		case DIF_OP_MOV:
4524			regs[rd] = regs[r1];
4525			break;
4526		case DIF_OP_CMP:
4527			cc_r = regs[r1] - regs[r2];
4528			cc_n = cc_r < 0;
4529			cc_z = cc_r == 0;
4530			cc_v = 0;
4531			cc_c = regs[r1] < regs[r2];
4532			break;
4533		case DIF_OP_TST:
4534			cc_n = cc_v = cc_c = 0;
4535			cc_z = regs[r1] == 0;
4536			break;
4537		case DIF_OP_BA:
4538			pc = DIF_INSTR_LABEL(instr);
4539			break;
4540		case DIF_OP_BE:
4541			if (cc_z)
4542				pc = DIF_INSTR_LABEL(instr);
4543			break;
4544		case DIF_OP_BNE:
4545			if (cc_z == 0)
4546				pc = DIF_INSTR_LABEL(instr);
4547			break;
4548		case DIF_OP_BG:
4549			if ((cc_z | (cc_n ^ cc_v)) == 0)
4550				pc = DIF_INSTR_LABEL(instr);
4551			break;
4552		case DIF_OP_BGU:
4553			if ((cc_c | cc_z) == 0)
4554				pc = DIF_INSTR_LABEL(instr);
4555			break;
4556		case DIF_OP_BGE:
4557			if ((cc_n ^ cc_v) == 0)
4558				pc = DIF_INSTR_LABEL(instr);
4559			break;
4560		case DIF_OP_BGEU:
4561			if (cc_c == 0)
4562				pc = DIF_INSTR_LABEL(instr);
4563			break;
4564		case DIF_OP_BL:
4565			if (cc_n ^ cc_v)
4566				pc = DIF_INSTR_LABEL(instr);
4567			break;
4568		case DIF_OP_BLU:
4569			if (cc_c)
4570				pc = DIF_INSTR_LABEL(instr);
4571			break;
4572		case DIF_OP_BLE:
4573			if (cc_z | (cc_n ^ cc_v))
4574				pc = DIF_INSTR_LABEL(instr);
4575			break;
4576		case DIF_OP_BLEU:
4577			if (cc_c | cc_z)
4578				pc = DIF_INSTR_LABEL(instr);
4579			break;
4580		case DIF_OP_RLDSB:
4581			if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) {
4582				*flags |= CPU_DTRACE_KPRIV;
4583				*illval = regs[r1];
4584				break;
4585			}
4586			/*FALLTHROUGH*/
4587		case DIF_OP_LDSB:
4588			regs[rd] = (int8_t)dtrace_load8(regs[r1]);
4589			break;
4590		case DIF_OP_RLDSH:
4591			if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) {
4592				*flags |= CPU_DTRACE_KPRIV;
4593				*illval = regs[r1];
4594				break;
4595			}
4596			/*FALLTHROUGH*/
4597		case DIF_OP_LDSH:
4598			regs[rd] = (int16_t)dtrace_load16(regs[r1]);
4599			break;
4600		case DIF_OP_RLDSW:
4601			if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) {
4602				*flags |= CPU_DTRACE_KPRIV;
4603				*illval = regs[r1];
4604				break;
4605			}
4606			/*FALLTHROUGH*/
4607		case DIF_OP_LDSW:
4608			regs[rd] = (int32_t)dtrace_load32(regs[r1]);
4609			break;
4610		case DIF_OP_RLDUB:
4611			if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) {
4612				*flags |= CPU_DTRACE_KPRIV;
4613				*illval = regs[r1];
4614				break;
4615			}
4616			/*FALLTHROUGH*/
4617		case DIF_OP_LDUB:
4618			regs[rd] = dtrace_load8(regs[r1]);
4619			break;
4620		case DIF_OP_RLDUH:
4621			if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) {
4622				*flags |= CPU_DTRACE_KPRIV;
4623				*illval = regs[r1];
4624				break;
4625			}
4626			/*FALLTHROUGH*/
4627		case DIF_OP_LDUH:
4628			regs[rd] = dtrace_load16(regs[r1]);
4629			break;
4630		case DIF_OP_RLDUW:
4631			if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) {
4632				*flags |= CPU_DTRACE_KPRIV;
4633				*illval = regs[r1];
4634				break;
4635			}
4636			/*FALLTHROUGH*/
4637		case DIF_OP_LDUW:
4638			regs[rd] = dtrace_load32(regs[r1]);
4639			break;
4640		case DIF_OP_RLDX:
4641			if (!dtrace_canstore(regs[r1], 8, mstate, vstate)) {
4642				*flags |= CPU_DTRACE_KPRIV;
4643				*illval = regs[r1];
4644				break;
4645			}
4646			/*FALLTHROUGH*/
4647		case DIF_OP_LDX:
4648			regs[rd] = dtrace_load64(regs[r1]);
4649			break;
4650		case DIF_OP_ULDSB:
4651			regs[rd] = (int8_t)
4652			    dtrace_fuword8((void *)(uintptr_t)regs[r1]);
4653			break;
4654		case DIF_OP_ULDSH:
4655			regs[rd] = (int16_t)
4656			    dtrace_fuword16((void *)(uintptr_t)regs[r1]);
4657			break;
4658		case DIF_OP_ULDSW:
4659			regs[rd] = (int32_t)
4660			    dtrace_fuword32((void *)(uintptr_t)regs[r1]);
4661			break;
4662		case DIF_OP_ULDUB:
4663			regs[rd] =
4664			    dtrace_fuword8((void *)(uintptr_t)regs[r1]);
4665			break;
4666		case DIF_OP_ULDUH:
4667			regs[rd] =
4668			    dtrace_fuword16((void *)(uintptr_t)regs[r1]);
4669			break;
4670		case DIF_OP_ULDUW:
4671			regs[rd] =
4672			    dtrace_fuword32((void *)(uintptr_t)regs[r1]);
4673			break;
4674		case DIF_OP_ULDX:
4675			regs[rd] =
4676			    dtrace_fuword64((void *)(uintptr_t)regs[r1]);
4677			break;
4678		case DIF_OP_RET:
4679			rval = regs[rd];
4680			pc = textlen;
4681			break;
4682		case DIF_OP_NOP:
4683			break;
4684		case DIF_OP_SETX:
4685			regs[rd] = inttab[DIF_INSTR_INTEGER(instr)];
4686			break;
4687		case DIF_OP_SETS:
4688			regs[rd] = (uint64_t)(uintptr_t)
4689			    (strtab + DIF_INSTR_STRING(instr));
4690			break;
4691		case DIF_OP_SCMP: {
4692			size_t sz = state->dts_options[DTRACEOPT_STRSIZE];
4693			uintptr_t s1 = regs[r1];
4694			uintptr_t s2 = regs[r2];
4695
4696			if (s1 != NULL &&
4697			    !dtrace_strcanload(s1, sz, mstate, vstate))
4698				break;
4699			if (s2 != NULL &&
4700			    !dtrace_strcanload(s2, sz, mstate, vstate))
4701				break;
4702
4703			cc_r = dtrace_strncmp((char *)s1, (char *)s2, sz);
4704
4705			cc_n = cc_r < 0;
4706			cc_z = cc_r == 0;
4707			cc_v = cc_c = 0;
4708			break;
4709		}
4710		case DIF_OP_LDGA:
4711			regs[rd] = dtrace_dif_variable(mstate, state,
4712			    r1, regs[r2]);
4713			break;
4714		case DIF_OP_LDGS:
4715			id = DIF_INSTR_VAR(instr);
4716
4717			if (id >= DIF_VAR_OTHER_UBASE) {
4718				uintptr_t a;
4719
4720				id -= DIF_VAR_OTHER_UBASE;
4721				svar = vstate->dtvs_globals[id];
4722				ASSERT(svar != NULL);
4723				v = &svar->dtsv_var;
4724
4725				if (!(v->dtdv_type.dtdt_flags & DIF_TF_BYREF)) {
4726					regs[rd] = svar->dtsv_data;
4727					break;
4728				}
4729
4730				a = (uintptr_t)svar->dtsv_data;
4731
4732				if (*(uint8_t *)a == UINT8_MAX) {
4733					/*
4734					 * If the 0th byte is set to UINT8_MAX
4735					 * then this is to be treated as a
4736					 * reference to a NULL variable.
4737					 */
4738					regs[rd] = NULL;
4739				} else {
4740					regs[rd] = a + sizeof (uint64_t);
4741				}
4742
4743				break;
4744			}
4745
4746			regs[rd] = dtrace_dif_variable(mstate, state, id, 0);
4747			break;
4748
4749		case DIF_OP_STGS:
4750			id = DIF_INSTR_VAR(instr);
4751
4752			ASSERT(id >= DIF_VAR_OTHER_UBASE);
4753			id -= DIF_VAR_OTHER_UBASE;
4754
4755			svar = vstate->dtvs_globals[id];
4756			ASSERT(svar != NULL);
4757			v = &svar->dtsv_var;
4758
4759			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
4760				uintptr_t a = (uintptr_t)svar->dtsv_data;
4761
4762				ASSERT(a != NULL);
4763				ASSERT(svar->dtsv_size != 0);
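
				/*
				 * By-ref statics reserve a leading flag word:
				 * storing NULL sets the 0th byte to UINT8_MAX
				 * (the convention DIF_OP_LDGS checks for
				 * above); otherwise the flag is cleared and
				 * the value is copied in just past it.
				 */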
4764
4765				if (regs[rd] == NULL) {
4766					*(uint8_t *)a = UINT8_MAX;
4767					break;
4768				} else {
4769					*(uint8_t *)a = 0;
4770					a += sizeof (uint64_t);
4771				}
4772				if (!dtrace_vcanload(
4773				    (void *)(uintptr_t)regs[rd], &v->dtdv_type,
4774				    mstate, vstate))
4775					break;
4776
4777				dtrace_vcopy((void *)(uintptr_t)regs[rd],
4778				    (void *)a, &v->dtdv_type);
4779				break;
4780			}
4781
4782			svar->dtsv_data = regs[rd];
4783			break;
4784
4785		case DIF_OP_LDTA:
4786			/*
4787			 * There are no DTrace built-in thread-local arrays at
4788			 * present.  This opcode is saved for future work.
4789			 */
4790			*flags |= CPU_DTRACE_ILLOP;
4791			regs[rd] = 0;
4792			break;
4793
4794		case DIF_OP_LDLS:
4795			id = DIF_INSTR_VAR(instr);
4796
4797			if (id < DIF_VAR_OTHER_UBASE) {
4798				/*
4799				 * For now, this has no meaning.
4800				 */
4801				regs[rd] = 0;
4802				break;
4803			}
4804
4805			id -= DIF_VAR_OTHER_UBASE;
4806
4807			ASSERT(id < vstate->dtvs_nlocals);
4808			ASSERT(vstate->dtvs_locals != NULL);
4809
4810			svar = vstate->dtvs_locals[id];
4811			ASSERT(svar != NULL);
4812			v = &svar->dtsv_var;
4813
4814			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
4815				uintptr_t a = (uintptr_t)svar->dtsv_data;
4816				size_t sz = v->dtdv_type.dtdt_size;
4817
4818				sz += sizeof (uint64_t);
4819				ASSERT(svar->dtsv_size == NCPU * sz);
4820				a += CPU->cpu_id * sz;
4821
4822				if (*(uint8_t *)a == UINT8_MAX) {
4823					/*
4824					 * If the 0th byte is set to UINT8_MAX
4825					 * then this is to be treated as a
4826					 * reference to a NULL variable.
4827					 */
4828					regs[rd] = NULL;
4829				} else {
4830					regs[rd] = a + sizeof (uint64_t);
4831				}
4832
4833				break;
4834			}
4835
4836			ASSERT(svar->dtsv_size == NCPU * sizeof (uint64_t));
4837			tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
4838			regs[rd] = tmp[CPU->cpu_id];
4839			break;
4840
4841		case DIF_OP_STLS:
4842			id = DIF_INSTR_VAR(instr);
4843
4844			ASSERT(id >= DIF_VAR_OTHER_UBASE);
4845			id -= DIF_VAR_OTHER_UBASE;
4846			ASSERT(id < vstate->dtvs_nlocals);
4847
4848			ASSERT(vstate->dtvs_locals != NULL);
4849			svar = vstate->dtvs_locals[id];
4850			ASSERT(svar != NULL);
4851			v = &svar->dtsv_var;
4852
4853			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
4854				uintptr_t a = (uintptr_t)svar->dtsv_data;
4855				size_t sz = v->dtdv_type.dtdt_size;
4856
4857				sz += sizeof (uint64_t);
4858				ASSERT(svar->dtsv_size == NCPU * sz);
4859				a += CPU->cpu_id * sz;
4860
4861				if (regs[rd] == NULL) {
4862					*(uint8_t *)a = UINT8_MAX;
4863					break;
4864				} else {
4865					*(uint8_t *)a = 0;
4866					a += sizeof (uint64_t);
4867				}
4868
4869				if (!dtrace_vcanload(
4870				    (void *)(uintptr_t)regs[rd], &v->dtdv_type,
4871				    mstate, vstate))
4872					break;
4873
4874				dtrace_vcopy((void *)(uintptr_t)regs[rd],
4875				    (void *)a, &v->dtdv_type);
4876				break;
4877			}
4878
4879			ASSERT(svar->dtsv_size == NCPU * sizeof (uint64_t));
4880			tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
4881			tmp[CPU->cpu_id] = regs[rd];
4882			break;
4883
4884		case DIF_OP_LDTS: {
4885			dtrace_dynvar_t *dvar;
4886			dtrace_key_t *key;
4887
4888			id = DIF_INSTR_VAR(instr);
4889			ASSERT(id >= DIF_VAR_OTHER_UBASE);
4890			id -= DIF_VAR_OTHER_UBASE;
4891			v = &vstate->dtvs_tlocals[id];
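
			/*
			 * A thread-local variable is simply a dynamic
			 * variable keyed by a two-element tuple:  the
			 * variable id and a per-thread key derived by
			 * DTRACE_TLS_THRKEY().  Loads never allocate
			 * (DTRACE_DYNVAR_NOALLOC); allocation happens only
			 * on store, in DIF_OP_STTS below.
			 */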
4892
4893			key = &tupregs[DIF_DTR_NREGS];
4894			key[0].dttk_value = (uint64_t)id;
4895			key[0].dttk_size = 0;
4896			DTRACE_TLS_THRKEY(key[1].dttk_value);
4897			key[1].dttk_size = 0;
4898
4899			dvar = dtrace_dynvar(dstate, 2, key,
4900			    sizeof (uint64_t), DTRACE_DYNVAR_NOALLOC,
4901			    mstate, vstate);
4902
4903			if (dvar == NULL) {
4904				regs[rd] = 0;
4905				break;
4906			}
4907
4908			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
4909				regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
4910			} else {
4911				regs[rd] = *((uint64_t *)dvar->dtdv_data);
4912			}
4913
4914			break;
4915		}
4916
4917		case DIF_OP_STTS: {
4918			dtrace_dynvar_t *dvar;
4919			dtrace_key_t *key;
4920
4921			id = DIF_INSTR_VAR(instr);
4922			ASSERT(id >= DIF_VAR_OTHER_UBASE);
4923			id -= DIF_VAR_OTHER_UBASE;
4924
4925			key = &tupregs[DIF_DTR_NREGS];
4926			key[0].dttk_value = (uint64_t)id;
4927			key[0].dttk_size = 0;
4928			DTRACE_TLS_THRKEY(key[1].dttk_value);
4929			key[1].dttk_size = 0;
4930			v = &vstate->dtvs_tlocals[id];
4931
4932			dvar = dtrace_dynvar(dstate, 2, key,
4933			    v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
4934			    v->dtdv_type.dtdt_size : sizeof (uint64_t),
4935			    regs[rd] ? DTRACE_DYNVAR_ALLOC :
4936			    DTRACE_DYNVAR_DEALLOC, mstate, vstate);
4937
4938			/*
4939			 * Given that we're storing to thread-local data,
4940			 * we need to flush our predicate cache.
4941			 */
4942			curthread->t_predcache = NULL;
4943
4944			if (dvar == NULL)
4945				break;
4946
4947			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
4948				if (!dtrace_vcanload(
4949				    (void *)(uintptr_t)regs[rd],
4950				    &v->dtdv_type, mstate, vstate))
4951					break;
4952
4953				dtrace_vcopy((void *)(uintptr_t)regs[rd],
4954				    dvar->dtdv_data, &v->dtdv_type);
4955			} else {
4956				*((uint64_t *)dvar->dtdv_data) = regs[rd];
4957			}
4958
4959			break;
4960		}
4961
4962		case DIF_OP_SRA:
4963			regs[rd] = (int64_t)regs[r1] >> regs[r2];
4964			break;
4965
4966		case DIF_OP_CALL:
4967			dtrace_dif_subr(DIF_INSTR_SUBR(instr), rd,
4968			    regs, tupregs, ttop, mstate, state);
4969			break;
4970
4971		case DIF_OP_PUSHTR:
4972			if (ttop == DIF_DTR_NREGS) {
4973				*flags |= CPU_DTRACE_TUPOFLOW;
4974				break;
4975			}
4976
4977			if (r1 == DIF_TYPE_STRING) {
4978				/*
4979				 * If this is a string type and the size is 0,
4980				 * we'll use the system-wide default string
4981				 * size.  Note that we are _not_ looking at
4982				 * the value of the DTRACEOPT_STRSIZE option;
4983				 * had this been set, we would expect to have
4984				 * a non-zero size value in the "pushtr".
4985				 */
4986				tupregs[ttop].dttk_size =
4987				    dtrace_strlen((char *)(uintptr_t)regs[rd],
4988				    regs[r2] ? regs[r2] :
4989				    dtrace_strsize_default) + 1;
4990			} else {
4991				tupregs[ttop].dttk_size = regs[r2];
4992			}
4993
4994			tupregs[ttop++].dttk_value = regs[rd];
4995			break;
4996
4997		case DIF_OP_PUSHTV:
4998			if (ttop == DIF_DTR_NREGS) {
4999				*flags |= CPU_DTRACE_TUPOFLOW;
5000				break;
5001			}
5002
5003			tupregs[ttop].dttk_value = regs[rd];
5004			tupregs[ttop++].dttk_size = 0;
5005			break;
5006
5007		case DIF_OP_POPTS:
5008			if (ttop != 0)
5009				ttop--;
5010			break;
5011
5012		case DIF_OP_FLUSHTS:
5013			ttop = 0;
5014			break;
5015
5016		case DIF_OP_LDGAA:
5017		case DIF_OP_LDTAA: {
5018			dtrace_dynvar_t *dvar;
5019			dtrace_key_t *key = tupregs;
5020			uint_t nkeys = ttop;
5021
5022			id = DIF_INSTR_VAR(instr);
5023			ASSERT(id >= DIF_VAR_OTHER_UBASE);
5024			id -= DIF_VAR_OTHER_UBASE;
5025
5026			key[nkeys].dttk_value = (uint64_t)id;
5027			key[nkeys++].dttk_size = 0;
5028
5029			if (DIF_INSTR_OP(instr) == DIF_OP_LDTAA) {
5030				DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
5031				key[nkeys++].dttk_size = 0;
5032				v = &vstate->dtvs_tlocals[id];
5033			} else {
5034				v = &vstate->dtvs_globals[id]->dtsv_var;
5035			}
5036
5037			dvar = dtrace_dynvar(dstate, nkeys, key,
5038			    v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
5039			    v->dtdv_type.dtdt_size : sizeof (uint64_t),
5040			    DTRACE_DYNVAR_NOALLOC, mstate, vstate);
5041
5042			if (dvar == NULL) {
5043				regs[rd] = 0;
5044				break;
5045			}
5046
5047			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5048				regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
5049			} else {
5050				regs[rd] = *((uint64_t *)dvar->dtdv_data);
5051			}
5052
5053			break;
5054		}
5055
5056		case DIF_OP_STGAA:
5057		case DIF_OP_STTAA: {
5058			dtrace_dynvar_t *dvar;
5059			dtrace_key_t *key = tupregs;
5060			uint_t nkeys = ttop;
5061
5062			id = DIF_INSTR_VAR(instr);
5063			ASSERT(id >= DIF_VAR_OTHER_UBASE);
5064			id -= DIF_VAR_OTHER_UBASE;
5065
5066			key[nkeys].dttk_value = (uint64_t)id;
5067			key[nkeys++].dttk_size = 0;
5068
5069			if (DIF_INSTR_OP(instr) == DIF_OP_STTAA) {
5070				DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
5071				key[nkeys++].dttk_size = 0;
5072				v = &vstate->dtvs_tlocals[id];
5073			} else {
5074				v = &vstate->dtvs_globals[id]->dtsv_var;
5075			}
5076
5077			dvar = dtrace_dynvar(dstate, nkeys, key,
5078			    v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
5079			    v->dtdv_type.dtdt_size : sizeof (uint64_t),
5080			    regs[rd] ? DTRACE_DYNVAR_ALLOC :
5081			    DTRACE_DYNVAR_DEALLOC, mstate, vstate);
5082
5083			if (dvar == NULL)
5084				break;
5085
5086			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5087				if (!dtrace_vcanload(
5088				    (void *)(uintptr_t)regs[rd], &v->dtdv_type,
5089				    mstate, vstate))
5090					break;
5091
5092				dtrace_vcopy((void *)(uintptr_t)regs[rd],
5093				    dvar->dtdv_data, &v->dtdv_type);
5094			} else {
5095				*((uint64_t *)dvar->dtdv_data) = regs[rd];
5096			}
5097
5098			break;
5099		}
5100
5101		case DIF_OP_ALLOCS: {
5102			uintptr_t ptr = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
5103			size_t size = ptr - mstate->dtms_scratch_ptr + regs[r1];
5104
5105			/*
5106			 * Rounding up the user allocation size could have
5107			 * overflowed large, bogus allocations (like -1ULL) to
5108			 * 0.
5109			 */
5110			if (size < regs[r1] ||
5111			    !DTRACE_INSCRATCH(mstate, size)) {
5112				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5113				regs[rd] = NULL;
5114				break;
5115			}
5116
5117			dtrace_bzero((void *) mstate->dtms_scratch_ptr, size);
5118			mstate->dtms_scratch_ptr += size;
5119			regs[rd] = ptr;
5120			break;
5121		}
5122
5123		case DIF_OP_COPYS:
5124			if (!dtrace_canstore(regs[rd], regs[r2],
5125			    mstate, vstate)) {
5126				*flags |= CPU_DTRACE_BADADDR;
5127				*illval = regs[rd];
5128				break;
5129			}
5130
5131			if (!dtrace_canload(regs[r1], regs[r2], mstate, vstate))
5132				break;
5133
5134			dtrace_bcopy((void *)(uintptr_t)regs[r1],
5135			    (void *)(uintptr_t)regs[rd], (size_t)regs[r2]);
5136			break;
5137
5138		case DIF_OP_STB:
5139			if (!dtrace_canstore(regs[rd], 1, mstate, vstate)) {
5140				*flags |= CPU_DTRACE_BADADDR;
5141				*illval = regs[rd];
5142				break;
5143			}
5144			*((uint8_t *)(uintptr_t)regs[rd]) = (uint8_t)regs[r1];
5145			break;
5146
5147		case DIF_OP_STH:
5148			if (!dtrace_canstore(regs[rd], 2, mstate, vstate)) {
5149				*flags |= CPU_DTRACE_BADADDR;
5150				*illval = regs[rd];
5151				break;
5152			}
5153			if (regs[rd] & 1) {
5154				*flags |= CPU_DTRACE_BADALIGN;
5155				*illval = regs[rd];
5156				break;
5157			}
5158			*((uint16_t *)(uintptr_t)regs[rd]) = (uint16_t)regs[r1];
5159			break;
5160
5161		case DIF_OP_STW:
5162			if (!dtrace_canstore(regs[rd], 4, mstate, vstate)) {
5163				*flags |= CPU_DTRACE_BADADDR;
5164				*illval = regs[rd];
5165				break;
5166			}
5167			if (regs[rd] & 3) {
5168				*flags |= CPU_DTRACE_BADALIGN;
5169				*illval = regs[rd];
5170				break;
5171			}
5172			*((uint32_t *)(uintptr_t)regs[rd]) = (uint32_t)regs[r1];
5173			break;
5174
5175		case DIF_OP_STX:
5176			if (!dtrace_canstore(regs[rd], 8, mstate, vstate)) {
5177				*flags |= CPU_DTRACE_BADADDR;
5178				*illval = regs[rd];
5179				break;
5180			}
5181			if (regs[rd] & 7) {
5182				*flags |= CPU_DTRACE_BADALIGN;
5183				*illval = regs[rd];
5184				break;
5185			}
5186			*((uint64_t *)(uintptr_t)regs[rd]) = regs[r1];
5187			break;
5188		}
5189	}
5190
5191	if (!(*flags & CPU_DTRACE_FAULT))
5192		return (rval);
5193
5194	mstate->dtms_fltoffs = opc * sizeof (dif_instr_t);
5195	mstate->dtms_present |= DTRACE_MSTATE_FLTOFFS;
5196
5197	return (0);
5198}
5199
5200static void
5201dtrace_action_breakpoint(dtrace_ecb_t *ecb)
5202{
5203	dtrace_probe_t *probe = ecb->dte_probe;
5204	dtrace_provider_t *prov = probe->dtpr_provider;
5205	char c[DTRACE_FULLNAMELEN + 80], *str;
5206	char *msg = "dtrace: breakpoint action at probe ";
5207	char *ecbmsg = " (ecb ";
5208	uintptr_t mask = (0xf << (sizeof (uintptr_t) * NBBY / 4));
5209	uintptr_t val = (uintptr_t)ecb;
5210	int shift = (sizeof (uintptr_t) * NBBY) - 4, i = 0;
5211
5212	if (dtrace_destructive_disallow)
5213		return;
5214
5215	/*
5216	 * It's impossible to be taking action on the NULL probe.
5217	 */
5218	ASSERT(probe != NULL);
5219
5220	/*
5221	 * This is a poor man's (destitute man's?) sprintf():  we want to
5222	 * print the provider name, module name, function name and name of
5223	 * the probe, along with the hex address of the ECB with the breakpoint
5224	 * action -- all of which we must place in the character buffer by
5225	 * hand.
5226	 */
5227	while (*msg != '\0')
5228		c[i++] = *msg++;
5229
5230	for (str = prov->dtpv_name; *str != '\0'; str++)
5231		c[i++] = *str;
5232	c[i++] = ':';
5233
5234	for (str = probe->dtpr_mod; *str != '\0'; str++)
5235		c[i++] = *str;
5236	c[i++] = ':';
5237
5238	for (str = probe->dtpr_func; *str != '\0'; str++)
5239		c[i++] = *str;
5240	c[i++] = ':';
5241
5242	for (str = probe->dtpr_name; *str != '\0'; str++)
5243		c[i++] = *str;
5244
5245	while (*ecbmsg != '\0')
5246		c[i++] = *ecbmsg++;
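
	/*
	 * Render the ECB pointer in hex, most significant nibble first,
	 * suppressing any leading zero nibbles.
	 */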
5247
5248	while (shift >= 0) {
5249		mask = (uintptr_t)0xf << shift;
5250
5251		if (val >= ((uintptr_t)1 << shift))
5252			c[i++] = "0123456789abcdef"[(val & mask) >> shift];
5253		shift -= 4;
5254	}
5255
5256	c[i++] = ')';
5257	c[i] = '\0';
5258
5259	debug_enter(c);
5260}
5261
5262static void
5263dtrace_action_panic(dtrace_ecb_t *ecb)
5264{
5265	dtrace_probe_t *probe = ecb->dte_probe;
5266
5267	/*
5268	 * It's impossible to be taking action on the NULL probe.
5269	 */
5270	ASSERT(probe != NULL);
5271
5272	if (dtrace_destructive_disallow)
5273		return;
5274
5275	if (dtrace_panicked != NULL)
5276		return;
5277
5278	if (dtrace_casptr(&dtrace_panicked, NULL, curthread) != NULL)
5279		return;
5280
5281	/*
5282	 * We won the right to panic.  (We want to be sure that only one
5283	 * thread calls panic() from dtrace_probe(), and that panic() is
5284	 * called exactly once.)
5285	 */
5286	dtrace_panic("dtrace: panic action at probe %s:%s:%s:%s (ecb %p)",
5287	    probe->dtpr_provider->dtpv_name, probe->dtpr_mod,
5288	    probe->dtpr_func, probe->dtpr_name, (void *)ecb);
5289}
5290
5291static void
5292dtrace_action_raise(uint64_t sig)
5293{
5294	if (dtrace_destructive_disallow)
5295		return;
5296
5297	if (sig >= NSIG) {
5298		DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5299		return;
5300	}
5301
5302	/*
5303	 * raise() has a queue depth of 1 -- we ignore all subsequent
5304	 * invocations of the raise() action.
5305	 */
5306	if (curthread->t_dtrace_sig == 0)
5307		curthread->t_dtrace_sig = (uint8_t)sig;
5308
5309	curthread->t_sig_check = 1;
5310	aston(curthread);
5311}
5312
5313static void
5314dtrace_action_stop(void)
5315{
5316	if (dtrace_destructive_disallow)
5317		return;
5318
5319	if (!curthread->t_dtrace_stop) {
5320		curthread->t_dtrace_stop = 1;
5321		curthread->t_sig_check = 1;
5322		aston(curthread);
5323	}
5324}
5325
5326static void
5327dtrace_action_chill(dtrace_mstate_t *mstate, hrtime_t val)
5328{
5329	hrtime_t now;
5330	volatile uint16_t *flags;
5331	cpu_t *cpu = CPU;
5332
5333	if (dtrace_destructive_disallow)
5334		return;
5335
5336	flags = (volatile uint16_t *)&cpu_core[cpu->cpu_id].cpuc_dtrace_flags;
5337
5338	now = dtrace_gethrtime();
5339
5340	if (now - cpu->cpu_dtrace_chillmark > dtrace_chill_interval) {
5341		/*
5342		 * We need to advance the mark to the current time.
5343		 */
5344		cpu->cpu_dtrace_chillmark = now;
5345		cpu->cpu_dtrace_chilled = 0;
5346	}
5347
5348	/*
5349	 * Now check to see if the requested chill time would take us over
5350	 * the maximum amount of time allowed in the chill interval.  (Or
5351	 * worse, if the calculation itself induces overflow.)
5352	 */
5353	if (cpu->cpu_dtrace_chilled + val > dtrace_chill_max ||
5354	    cpu->cpu_dtrace_chilled + val < cpu->cpu_dtrace_chilled) {
5355		*flags |= CPU_DTRACE_ILLOP;
5356		return;
5357	}
5358
5359	while (dtrace_gethrtime() - now < val)
5360		continue;
5361
5362	/*
5363	 * Normally, we assure that the value of the variable "timestamp" does
5364	 * not change within an ECB.  The presence of chill() represents an
5365	 * exception to this rule, however.
5366	 */
5367	mstate->dtms_present &= ~DTRACE_MSTATE_TIMESTAMP;
5368	cpu->cpu_dtrace_chilled += val;
5369}
5370
5371static void
5372dtrace_action_ustack(dtrace_mstate_t *mstate, dtrace_state_t *state,
5373    uint64_t *buf, uint64_t arg)
5374{
5375	int nframes = DTRACE_USTACK_NFRAMES(arg);
5376	int strsize = DTRACE_USTACK_STRSIZE(arg);
5377	uint64_t *pcs = &buf[1], *fps;
5378	char *str = (char *)&pcs[nframes];
5379	int size, offs = 0, i, j;
5380	uintptr_t old = mstate->dtms_scratch_ptr, saved;
5381	uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
5382	char *sym;
5383
5384	/*
5385	 * Should be taking a faster path if string space has not been
5386	 * allocated.
5387	 */
5388	ASSERT(strsize != 0);
5389
5390	/*
5391	 * We will first allocate some temporary space for the frame pointers.
5392	 */
5393	fps = (uint64_t *)P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
5394	size = (uintptr_t)fps - mstate->dtms_scratch_ptr +
5395	    (nframes * sizeof (uint64_t));
5396
5397	if (!DTRACE_INSCRATCH(mstate, size)) {
5398		/*
5399		 * Not enough room for our frame pointers -- need to indicate
5400		 * that we ran out of scratch space.
5401		 */
5402		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5403		return;
5404	}
5405
5406	mstate->dtms_scratch_ptr += size;
5407	saved = mstate->dtms_scratch_ptr;
5408
5409	/*
5410	 * Now get a stack with both program counters and frame pointers.
5411	 */
5412	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
5413	dtrace_getufpstack(buf, fps, nframes + 1);
5414	DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
5415
5416	/*
5417	 * If that faulted, we're cooked.
5418	 */
5419	if (*flags & CPU_DTRACE_FAULT)
5420		goto out;
5421
5422	/*
5423	 * Now we want to walk up the stack, calling the USTACK helper.  For
5424	 * each iteration, we restore the scratch pointer.
5425	 */
5426	for (i = 0; i < nframes; i++) {
5427		mstate->dtms_scratch_ptr = saved;
5428
5429		if (offs >= strsize)
5430			break;
5431
5432		sym = (char *)(uintptr_t)dtrace_helper(
5433		    DTRACE_HELPER_ACTION_USTACK,
5434		    mstate, state, pcs[i], fps[i]);
5435
5436		/*
5437		 * If we faulted while running the helper, we're going to
5438		 * clear the fault and null out the corresponding string.
5439		 */
5440		if (*flags & CPU_DTRACE_FAULT) {
5441			*flags &= ~CPU_DTRACE_FAULT;
5442			str[offs++] = '\0';
5443			continue;
5444		}
5445
5446		if (sym == NULL) {
5447			str[offs++] = '\0';
5448			continue;
5449		}
5450
5451		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
5452
5453		/*
5454		 * Now copy in the string that the helper returned to us.
5455		 */
5456		for (j = 0; offs + j < strsize; j++) {
5457			if ((str[offs + j] = sym[j]) == '\0')
5458				break;
5459		}
5460
5461		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
5462
5463		offs += j + 1;
5464	}
5465
5466	if (offs >= strsize) {
5467		/*
5468		 * If we didn't have room for all of the strings, we don't
5469		 * abort processing -- this needn't be a fatal error -- but we
5470		 * still want to increment a counter (dts_stkstroverflows) to
5471		 * allow this condition to be warned about.  (If this is from
5472		 * a jstack() action, it is easily tuned via jstackstrsize.)
5473		 */
5474		dtrace_error(&state->dts_stkstroverflows);
5475	}
5476
5477	while (offs < strsize)
5478		str[offs++] = '\0';
5479
5480out:
5481	mstate->dtms_scratch_ptr = old;
5482}
5483
5484/*
5485 * If you're looking for the epicenter of DTrace, you just found it.  This
5486 * is the function called by the provider to fire a probe -- from which all
5487 * subsequent probe-context DTrace activity emanates.
5488 */
5489void
5490dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1,
5491    uintptr_t arg2, uintptr_t arg3, uintptr_t arg4)
5492{
5493	processorid_t cpuid;
5494	dtrace_icookie_t cookie;
5495	dtrace_probe_t *probe;
5496	dtrace_mstate_t mstate;
5497	dtrace_ecb_t *ecb;
5498	dtrace_action_t *act;
5499	intptr_t offs;
5500	size_t size;
5501	int vtime, onintr;
5502	volatile uint16_t *flags;
5503	hrtime_t now;
5504
5505	/*
5506	 * Kick out immediately if this CPU is still being born (in which case
5507	 * curthread will be set to -1) or the current thread can't allow
5508	 * probes in its current context.
5509	 */
5510	if (((uintptr_t)curthread & 1) || (curthread->t_flag & T_DONTDTRACE))
5511		return;
5512
5513	cookie = dtrace_interrupt_disable();
5514	probe = dtrace_probes[id - 1];
5515	cpuid = CPU->cpu_id;
5516	onintr = CPU_ON_INTR(CPU);
5517
5518	if (!onintr && probe->dtpr_predcache != DTRACE_CACHEIDNONE &&
5519	    probe->dtpr_predcache == curthread->t_predcache) {
5520		/*
5521		 * We have hit in the predicate cache; we know that
5522		 * this predicate would evaluate to be false.
5523		 */
5524		dtrace_interrupt_enable(cookie);
5525		return;
5526	}
5527
5528	if (panic_quiesce) {
5529		/*
5530		 * We don't trace anything if we're panicking.
5531		 */
5532		dtrace_interrupt_enable(cookie);
5533		return;
5534	}
5535
5536	now = dtrace_gethrtime();
5537	vtime = dtrace_vtime_references != 0;
5538
5539	if (vtime && curthread->t_dtrace_start)
5540		curthread->t_dtrace_vtime += now - curthread->t_dtrace_start;
5541
5542	mstate.dtms_difo = NULL;
5543	mstate.dtms_probe = probe;
5544	mstate.dtms_strtok = NULL;
5545	mstate.dtms_arg[0] = arg0;
5546	mstate.dtms_arg[1] = arg1;
5547	mstate.dtms_arg[2] = arg2;
5548	mstate.dtms_arg[3] = arg3;
5549	mstate.dtms_arg[4] = arg4;
5550
5551	flags = (volatile uint16_t *)&cpu_core[cpuid].cpuc_dtrace_flags;
5552
5553	for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
5554		dtrace_predicate_t *pred = ecb->dte_predicate;
5555		dtrace_state_t *state = ecb->dte_state;
5556		dtrace_buffer_t *buf = &state->dts_buffer[cpuid];
5557		dtrace_buffer_t *aggbuf = &state->dts_aggbuffer[cpuid];
5558		dtrace_vstate_t *vstate = &state->dts_vstate;
5559		dtrace_provider_t *prov = probe->dtpr_provider;
5560		int committed = 0;
5561		caddr_t tomax;
5562
5563		/*
5564		 * A little subtlety with the following (seemingly innocuous)
5565		 * declaration of the automatic 'val':  by looking at the
5566		 * code, you might think that it could be declared in the
5567		 * action processing loop, below.  (That is, it's only used in
5568		 * the action processing loop.)  However, it must be declared
5569		 * out of that scope because in the case of DIF expression
5570		 * arguments to aggregating actions, one iteration of the
5571		 * action loop will use the last iteration's value.
5572		 */
5573#ifdef lint
5574		uint64_t val = 0;
5575#else
5576		uint64_t val;
5577#endif
5578
5579		mstate.dtms_present = DTRACE_MSTATE_ARGS | DTRACE_MSTATE_PROBE;
5580		*flags &= ~CPU_DTRACE_ERROR;
5581
5582		if (prov == dtrace_provider) {
5583			/*
5584			 * If dtrace itself is the provider of this probe,
5585			 * we're only going to continue processing the ECB if
5586			 * arg0 (the dtrace_state_t) is equal to the ECB's
5587			 * creating state.  (This prevents disjoint consumers
5588			 * from seeing one another's metaprobes.)
5589			 */
5590			if (arg0 != (uint64_t)(uintptr_t)state)
5591				continue;
5592		}
5593
5594		if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE) {
5595			/*
5596			 * We're not currently active.  If our provider isn't
5597			 * the dtrace pseudo provider, we're not interested.
5598			 */
5599			if (prov != dtrace_provider)
5600				continue;
5601
5602			/*
5603			 * Now we must further check if we are in the BEGIN
5604			 * probe.  If we are, we will only continue processing
5605			 * if we're still in WARMUP -- if one BEGIN enabling
5606			 * has invoked the exit() action, we don't want to
5607			 * evaluate subsequent BEGIN enablings.
5608			 */
5609			if (probe->dtpr_id == dtrace_probeid_begin &&
5610			    state->dts_activity != DTRACE_ACTIVITY_WARMUP) {
5611				ASSERT(state->dts_activity ==
5612				    DTRACE_ACTIVITY_DRAINING);
5613				continue;
5614			}
5615		}
5616
5617		if (ecb->dte_cond) {
5618			/*
5619			 * If the dte_cond bits indicate that this
5620			 * consumer is only allowed to see user-mode firings
5621			 * of this probe, call the provider's dtps_usermode()
5622			 * entry point to check that the probe was fired
5623			 * while in a user context. Skip this ECB if that's
5624			 * not the case.
5625			 */
5626			if ((ecb->dte_cond & DTRACE_COND_USERMODE) &&
5627			    prov->dtpv_pops.dtps_usermode(prov->dtpv_arg,
5628			    probe->dtpr_id, probe->dtpr_arg) == 0)
5629				continue;
5630
5631			/*
5632			 * This is more subtle than it looks. We have to be
5633			 * absolutely certain that CRED() isn't going to
5634			 * change out from under us so it's only legit to
5635			 * examine that structure if we're in constrained
			 * situations. Currently, the only time we'll do this
			 * check is when a non-super-user has enabled the
5638			 * profile or syscall providers -- providers that
5639			 * allow visibility of all processes. For the
5640			 * profile case, the check above will ensure that
5641			 * we're examining a user context.
5642			 */
5643			if (ecb->dte_cond & DTRACE_COND_OWNER) {
5644				cred_t *cr;
5645				cred_t *s_cr =
5646				    ecb->dte_state->dts_cred.dcr_cred;
5647				proc_t *proc;
5648
5649				ASSERT(s_cr != NULL);
5650
5651				if ((cr = CRED()) == NULL ||
5652				    s_cr->cr_uid != cr->cr_uid ||
5653				    s_cr->cr_uid != cr->cr_ruid ||
5654				    s_cr->cr_uid != cr->cr_suid ||
5655				    s_cr->cr_gid != cr->cr_gid ||
5656				    s_cr->cr_gid != cr->cr_rgid ||
5657				    s_cr->cr_gid != cr->cr_sgid ||
5658				    (proc = ttoproc(curthread)) == NULL ||
5659				    (proc->p_flag & SNOCD))
5660					continue;
5661			}
5662
5663			if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) {
5664				cred_t *cr;
5665				cred_t *s_cr =
5666				    ecb->dte_state->dts_cred.dcr_cred;
5667
5668				ASSERT(s_cr != NULL);
5669
5670				if ((cr = CRED()) == NULL ||
5671				    s_cr->cr_zone->zone_id !=
5672				    cr->cr_zone->zone_id)
5673					continue;
5674			}
5675		}
5676
5677		if (now - state->dts_alive > dtrace_deadman_timeout) {
5678			/*
5679			 * We seem to be dead.  Unless we (a) have kernel
5680			 * destructive permissions, (b) have explicitly enabled
5681			 * destructive actions, and (c) destructive actions have
5682			 * not been disabled, we're going to transition into
5683			 * the KILLED state, from which no further processing
5684			 * on this state will be performed.
5685			 */
5686			if (!dtrace_priv_kernel_destructive(state) ||
5687			    !state->dts_cred.dcr_destructive ||
5688			    dtrace_destructive_disallow) {
5689				void *activity = &state->dts_activity;
5690				dtrace_activity_t current;
5691
5692				do {
5693					current = state->dts_activity;
5694				} while (dtrace_cas32(activity, current,
5695				    DTRACE_ACTIVITY_KILLED) != current);
5696
5697				continue;
5698			}
5699		}
5700
5701		if ((offs = dtrace_buffer_reserve(buf, ecb->dte_needed,
5702		    ecb->dte_alignment, state, &mstate)) < 0)
5703			continue;
5704
5705		tomax = buf->dtb_tomax;
5706		ASSERT(tomax != NULL);
5707
5708		if (ecb->dte_size != 0)
5709			DTRACE_STORE(uint32_t, tomax, offs, ecb->dte_epid);
5710
5711		mstate.dtms_epid = ecb->dte_epid;
5712		mstate.dtms_present |= DTRACE_MSTATE_EPID;
5713
5714		if (state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)
5715			mstate.dtms_access = DTRACE_ACCESS_KERNEL;
5716		else
5717			mstate.dtms_access = 0;
5718
5719		if (pred != NULL) {
5720			dtrace_difo_t *dp = pred->dtp_difo;
5721			int rval;
5722
5723			rval = dtrace_dif_emulate(dp, &mstate, vstate, state);
5724
5725			if (!(*flags & CPU_DTRACE_ERROR) && !rval) {
5726				dtrace_cacheid_t cid = probe->dtpr_predcache;
5727
5728				if (cid != DTRACE_CACHEIDNONE && !onintr) {
5729					/*
5730					 * Update the predicate cache...
5731					 */
5732					ASSERT(cid == pred->dtp_cacheid);
5733					curthread->t_predcache = cid;
5734				}
5735
5736				continue;
5737			}
5738		}
5739
5740		for (act = ecb->dte_action; !(*flags & CPU_DTRACE_ERROR) &&
5741		    act != NULL; act = act->dta_next) {
5742			size_t valoffs;
5743			dtrace_difo_t *dp;
5744			dtrace_recdesc_t *rec = &act->dta_rec;
5745
5746			size = rec->dtrd_size;
5747			valoffs = offs + rec->dtrd_offset;
5748
5749			if (DTRACEACT_ISAGG(act->dta_kind)) {
5750				uint64_t v = 0xbad;
5751				dtrace_aggregation_t *agg;
5752
5753				agg = (dtrace_aggregation_t *)act;
5754
5755				if ((dp = act->dta_difo) != NULL)
5756					v = dtrace_dif_emulate(dp,
5757					    &mstate, vstate, state);
5758
5759				if (*flags & CPU_DTRACE_ERROR)
5760					continue;
5761
5762				/*
5763				 * Note that we always pass the expression
5764				 * value from the previous iteration of the
5765				 * action loop.  This value will only be used
5766				 * if there is an expression argument to the
5767				 * aggregating action, denoted by the
5768				 * dtag_hasarg field.
5769				 */
5770				dtrace_aggregate(agg, buf,
5771				    offs, aggbuf, v, val);
5772				continue;
5773			}
5774
5775			switch (act->dta_kind) {
5776			case DTRACEACT_STOP:
5777				if (dtrace_priv_proc_destructive(state))
5778					dtrace_action_stop();
5779				continue;
5780
5781			case DTRACEACT_BREAKPOINT:
5782				if (dtrace_priv_kernel_destructive(state))
5783					dtrace_action_breakpoint(ecb);
5784				continue;
5785
5786			case DTRACEACT_PANIC:
5787				if (dtrace_priv_kernel_destructive(state))
5788					dtrace_action_panic(ecb);
5789				continue;
5790
5791			case DTRACEACT_STACK:
5792				if (!dtrace_priv_kernel(state))
5793					continue;
5794
5795				dtrace_getpcstack((pc_t *)(tomax + valoffs),
5796				    size / sizeof (pc_t), probe->dtpr_aframes,
5797				    DTRACE_ANCHORED(probe) ? NULL :
5798				    (uint32_t *)arg0);
5799
5800				continue;
5801
5802			case DTRACEACT_JSTACK:
5803			case DTRACEACT_USTACK:
5804				if (!dtrace_priv_proc(state))
5805					continue;
5806
5807				/*
5808				 * See comment in DIF_VAR_PID.
5809				 */
5810				if (DTRACE_ANCHORED(mstate.dtms_probe) &&
5811				    CPU_ON_INTR(CPU)) {
5812					int depth = DTRACE_USTACK_NFRAMES(
5813					    rec->dtrd_arg) + 1;
5814
5815					dtrace_bzero((void *)(tomax + valoffs),
5816					    DTRACE_USTACK_STRSIZE(rec->dtrd_arg)
5817					    + depth * sizeof (uint64_t));
5818
5819					continue;
5820				}
5821
5822				if (DTRACE_USTACK_STRSIZE(rec->dtrd_arg) != 0 &&
5823				    curproc->p_dtrace_helpers != NULL) {
5824					/*
5825					 * This is the slow path -- we have
5826					 * allocated string space, and we're
5827					 * getting the stack of a process that
5828					 * has helpers.  Call into a separate
5829					 * routine to perform this processing.
5830					 */
5831					dtrace_action_ustack(&mstate, state,
5832					    (uint64_t *)(tomax + valoffs),
5833					    rec->dtrd_arg);
5834					continue;
5835				}
5836
5837				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
5838				dtrace_getupcstack((uint64_t *)
5839				    (tomax + valoffs),
5840				    DTRACE_USTACK_NFRAMES(rec->dtrd_arg) + 1);
5841				DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
5842				continue;
5843
5844			default:
5845				break;
5846			}
5847
5848			dp = act->dta_difo;
5849			ASSERT(dp != NULL);
5850
5851			val = dtrace_dif_emulate(dp, &mstate, vstate, state);
5852
5853			if (*flags & CPU_DTRACE_ERROR)
5854				continue;
5855
5856			switch (act->dta_kind) {
5857			case DTRACEACT_SPECULATE:
5858				ASSERT(buf == &state->dts_buffer[cpuid]);
5859				buf = dtrace_speculation_buffer(state,
5860				    cpuid, val);
5861
5862				if (buf == NULL) {
5863					*flags |= CPU_DTRACE_DROP;
5864					continue;
5865				}
5866
5867				offs = dtrace_buffer_reserve(buf,
5868				    ecb->dte_needed, ecb->dte_alignment,
5869				    state, NULL);
5870
5871				if (offs < 0) {
5872					*flags |= CPU_DTRACE_DROP;
5873					continue;
5874				}
5875
5876				tomax = buf->dtb_tomax;
5877				ASSERT(tomax != NULL);
5878
5879				if (ecb->dte_size != 0)
5880					DTRACE_STORE(uint32_t, tomax, offs,
5881					    ecb->dte_epid);
5882				continue;
5883
5884			case DTRACEACT_CHILL:
5885				if (dtrace_priv_kernel_destructive(state))
5886					dtrace_action_chill(&mstate, val);
5887				continue;
5888
5889			case DTRACEACT_RAISE:
5890				if (dtrace_priv_proc_destructive(state))
5891					dtrace_action_raise(val);
5892				continue;
5893
5894			case DTRACEACT_COMMIT:
5895				ASSERT(!committed);
5896
5897				/*
5898				 * We need to commit our buffer state.
5899				 */
5900				if (ecb->dte_size)
5901					buf->dtb_offset = offs + ecb->dte_size;
5902				buf = &state->dts_buffer[cpuid];
5903				dtrace_speculation_commit(state, cpuid, val);
5904				committed = 1;
5905				continue;
5906
5907			case DTRACEACT_DISCARD:
5908				dtrace_speculation_discard(state, cpuid, val);
5909				continue;
5910
5911			case DTRACEACT_DIFEXPR:
5912			case DTRACEACT_LIBACT:
5913			case DTRACEACT_PRINTF:
5914			case DTRACEACT_PRINTA:
5915			case DTRACEACT_SYSTEM:
5916			case DTRACEACT_FREOPEN:
5917				break;
5918
5919			case DTRACEACT_SYM:
5920			case DTRACEACT_MOD:
5921				if (!dtrace_priv_kernel(state))
5922					continue;
5923				break;
5924
5925			case DTRACEACT_USYM:
5926			case DTRACEACT_UMOD:
5927			case DTRACEACT_UADDR: {
5928				struct pid *pid = curthread->t_procp->p_pidp;
5929
5930				if (!dtrace_priv_proc(state))
5931					continue;
5932
5933				DTRACE_STORE(uint64_t, tomax,
5934				    valoffs, (uint64_t)pid->pid_id);
5935				DTRACE_STORE(uint64_t, tomax,
5936				    valoffs + sizeof (uint64_t), val);
5937
5938				continue;
5939			}
5940
5941			case DTRACEACT_EXIT: {
5942				/*
5943				 * For the exit action, we are going to attempt
5944				 * to atomically set our activity to be
5945				 * draining.  If this fails (either because
5946				 * another CPU has beat us to the exit action,
5947				 * or because our current activity is something
5948				 * other than ACTIVE or WARMUP), we will
5949				 * continue.  This assures that the exit action
5950				 * can be successfully recorded at most once
5951				 * when we're in the ACTIVE state.  If we're
5952				 * encountering the exit() action while in
5953				 * COOLDOWN, however, we want to honor the new
5954				 * status code.  (We know that we're the only
5955				 * thread in COOLDOWN, so there is no race.)
5956				 */
5957				void *activity = &state->dts_activity;
5958				dtrace_activity_t current = state->dts_activity;
5959
5960				if (current == DTRACE_ACTIVITY_COOLDOWN)
5961					break;
5962
5963				if (current != DTRACE_ACTIVITY_WARMUP)
5964					current = DTRACE_ACTIVITY_ACTIVE;
5965
5966				if (dtrace_cas32(activity, current,
5967				    DTRACE_ACTIVITY_DRAINING) != current) {
5968					*flags |= CPU_DTRACE_DROP;
5969					continue;
5970				}
5971
5972				break;
5973			}
5974
5975			default:
5976				ASSERT(0);
5977			}
5978
5979			if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF) {
5980				uintptr_t end = valoffs + size;
5981
5982				if (!dtrace_vcanload((void *)(uintptr_t)val,
5983				    &dp->dtdo_rtype, &mstate, vstate))
5984					continue;
5985
5986				/*
5987				 * If this is a string, we're going to only
5988				 * load until we find the zero byte -- after
5989				 * which we'll store zero bytes.
5990				 */
5991				if (dp->dtdo_rtype.dtdt_kind ==
5992				    DIF_TYPE_STRING) {
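					/*
					 * Note that 'c' is deliberately
					 * initialized to a non-zero value so
					 * that the first iteration of the
					 * copy loop below performs a load.
					 */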
5993					char c = '\0' + 1;
5994					int intuple = act->dta_intuple;
5995					size_t s;
5996
5997					for (s = 0; s < size; s++) {
5998						if (c != '\0')
5999							c = dtrace_load8(val++);
6000
6001						DTRACE_STORE(uint8_t, tomax,
6002						    valoffs++, c);
6003
6004						if (c == '\0' && intuple)
6005							break;
6006					}
6007
6008					continue;
6009				}
6010
6011				while (valoffs < end) {
6012					DTRACE_STORE(uint8_t, tomax, valoffs++,
6013					    dtrace_load8(val++));
6014				}
6015
6016				continue;
6017			}
6018
6019			switch (size) {
6020			case 0:
6021				break;
6022
6023			case sizeof (uint8_t):
6024				DTRACE_STORE(uint8_t, tomax, valoffs, val);
6025				break;
6026			case sizeof (uint16_t):
6027				DTRACE_STORE(uint16_t, tomax, valoffs, val);
6028				break;
6029			case sizeof (uint32_t):
6030				DTRACE_STORE(uint32_t, tomax, valoffs, val);
6031				break;
6032			case sizeof (uint64_t):
6033				DTRACE_STORE(uint64_t, tomax, valoffs, val);
6034				break;
6035			default:
6036				/*
6037				 * Any other size should have been returned by
6038				 * reference, not by value.
6039				 */
6040				ASSERT(0);
6041				break;
6042			}
6043		}
6044
6045		if (*flags & CPU_DTRACE_DROP)
6046			continue;
6047
6048		if (*flags & CPU_DTRACE_FAULT) {
6049			int ndx;
6050			dtrace_action_t *err;
6051
6052			buf->dtb_errors++;
6053
6054			if (probe->dtpr_id == dtrace_probeid_error) {
6055				/*
6056				 * There's nothing we can do -- we had an
6057				 * error on the error probe.  We bump an
6058				 * error counter to at least indicate that
6059				 * this condition happened.
6060				 */
6061				dtrace_error(&state->dts_dblerrors);
6062				continue;
6063			}
6064
6065			if (vtime) {
6066				/*
6067				 * Before recursing on dtrace_probe(), we
6068				 * need to explicitly clear out our start
6069				 * time to prevent it from being accumulated
6070				 * into t_dtrace_vtime.
6071				 */
6072				curthread->t_dtrace_start = 0;
6073			}
6074
6075			/*
6076			 * Iterate over the actions to figure out which action
6077			 * we were processing when we experienced the error.
6078			 * Note that act points _past_ the faulting action; if
6079			 * act is ecb->dte_action, the fault was in the
6080			 * predicate; if it's ecb->dte_action->dta_next, it's
6081			 * in action #1, and so on.
6082			 */
6083			for (err = ecb->dte_action, ndx = 0;
6084			    err != act; err = err->dta_next, ndx++)
6085				continue;
6086
6087			dtrace_probe_error(state, ecb->dte_epid, ndx,
6088			    (mstate.dtms_present & DTRACE_MSTATE_FLTOFFS) ?
6089			    mstate.dtms_fltoffs : -1, DTRACE_FLAGS2FLT(*flags),
6090			    cpu_core[cpuid].cpuc_dtrace_illval);
6091
6092			continue;
6093		}
6094
6095		if (!committed)
6096			buf->dtb_offset = offs + ecb->dte_size;
6097	}
6098
6099	if (vtime)
6100		curthread->t_dtrace_start = dtrace_gethrtime();
6101
6102	dtrace_interrupt_enable(cookie);
6103}
6104
6105/*
6106 * DTrace Probe Hashing Functions
6107 *
6108 * The functions in this section (and indeed, the functions in remaining
6109 * sections) are not _called_ from probe context.  (Any exceptions to this are
6110 * marked with a "Note:".)  Rather, they are called from elsewhere in the
6111 * DTrace framework to look-up probes in, add probes to and remove probes from
6112 * DTrace framework to look up probes in, add probes to, and remove probes from
6113 * probe tuple -- allowing for fast lookups, regardless of what was
6114 * specified.)
6115 */
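/*
 * Hash a probe-tuple string.  This is a variant of the classic
 * shift-and-XOR string hash (in the style of the ELF symbol hash):
 * high-order bits are folded back into the low-order bits so that
 * every character of the string perturbs the final value.
 */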
6116static uint_t
6117dtrace_hash_str(char *p)
6118{
6119	unsigned int g;
6120	uint_t hval = 0;
6121
6122	while (*p) {
6123		hval = (hval << 4) + *p++;
6124		if ((g = (hval & 0xf0000000)) != 0)
6125			hval ^= g >> 24;
6126		hval &= ~g;
6127	}
6128	return (hval);
6129}
6130
6131static dtrace_hash_t *
6132dtrace_hash_create(uintptr_t stroffs, uintptr_t nextoffs, uintptr_t prevoffs)
6133{
6134	dtrace_hash_t *hash = kmem_zalloc(sizeof (dtrace_hash_t), KM_SLEEP);
6135
6136	hash->dth_stroffs = stroffs;
6137	hash->dth_nextoffs = nextoffs;
6138	hash->dth_prevoffs = prevoffs;
6139
6140	hash->dth_size = 1;
6141	hash->dth_mask = hash->dth_size - 1;
6142
6143	hash->dth_tab = kmem_zalloc(hash->dth_size *
6144	    sizeof (dtrace_hashbucket_t *), KM_SLEEP);
6145
6146	return (hash);
6147}
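
/*
 * Note that the hash is generic with respect to which element of the
 * probe tuple it operates on:  the offsets passed to
 * dtrace_hash_create() denote the string, next-pointer and
 * prev-pointer members within a dtrace_probe_t.  A by-function hash,
 * for example, would be created with something like the following
 * (a sketch of the offsetof-based idiom):
 *
 *	hash = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_func),
 *	    offsetof(dtrace_probe_t, dtpr_nextfunc),
 *	    offsetof(dtrace_probe_t, dtpr_prevfunc));
 */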
6148
6149static void
6150dtrace_hash_destroy(dtrace_hash_t *hash)
6151{
6152#ifdef DEBUG
6153	int i;
6154
6155	for (i = 0; i < hash->dth_size; i++)
6156		ASSERT(hash->dth_tab[i] == NULL);
6157#endif
6158
6159	kmem_free(hash->dth_tab,
6160	    hash->dth_size * sizeof (dtrace_hashbucket_t *));
6161	kmem_free(hash, sizeof (dtrace_hash_t));
6162}
6163
6164static void
6165dtrace_hash_resize(dtrace_hash_t *hash)
6166{
6167	int size = hash->dth_size, i, ndx;
6168	int new_size = hash->dth_size << 1;
6169	int new_mask = new_size - 1;
6170	dtrace_hashbucket_t **new_tab, *bucket, *next;
6171
6172	ASSERT((new_size & new_mask) == 0);
6173
6174	new_tab = kmem_zalloc(new_size * sizeof (void *), KM_SLEEP);
6175
6176	for (i = 0; i < size; i++) {
6177		for (bucket = hash->dth_tab[i]; bucket != NULL; bucket = next) {
6178			dtrace_probe_t *probe = bucket->dthb_chain;
6179
6180			ASSERT(probe != NULL);
6181			ndx = DTRACE_HASHSTR(hash, probe) & new_mask;
6182
6183			next = bucket->dthb_next;
6184			bucket->dthb_next = new_tab[ndx];
6185			new_tab[ndx] = bucket;
6186		}
6187	}
6188
6189	kmem_free(hash->dth_tab, hash->dth_size * sizeof (void *));
6190	hash->dth_tab = new_tab;
6191	hash->dth_size = new_size;
6192	hash->dth_mask = new_mask;
6193}
6194
6195static void
6196dtrace_hash_add(dtrace_hash_t *hash, dtrace_probe_t *new)
6197{
6198	int hashval = DTRACE_HASHSTR(hash, new);
6199	int ndx = hashval & hash->dth_mask;
6200	dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
6201	dtrace_probe_t **nextp, **prevp;
6202
6203	for (; bucket != NULL; bucket = bucket->dthb_next) {
6204		if (DTRACE_HASHEQ(hash, bucket->dthb_chain, new))
6205			goto add;
6206	}
6207
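	/*
	 * No existing bucket matches this probe's name.  If the number of
	 * distinct names now exceeds twice the number of table slots, grow
	 * the table and retry the add.
	 */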
6208	if ((hash->dth_nbuckets >> 1) > hash->dth_size) {
6209		dtrace_hash_resize(hash);
6210		dtrace_hash_add(hash, new);
6211		return;
6212	}
6213
6214	bucket = kmem_zalloc(sizeof (dtrace_hashbucket_t), KM_SLEEP);
6215	bucket->dthb_next = hash->dth_tab[ndx];
6216	hash->dth_tab[ndx] = bucket;
6217	hash->dth_nbuckets++;
6218
6219add:
6220	nextp = DTRACE_HASHNEXT(hash, new);
6221	ASSERT(*nextp == NULL && *(DTRACE_HASHPREV(hash, new)) == NULL);
6222	*nextp = bucket->dthb_chain;
6223
6224	if (bucket->dthb_chain != NULL) {
6225		prevp = DTRACE_HASHPREV(hash, bucket->dthb_chain);
6226		ASSERT(*prevp == NULL);
6227		*prevp = new;
6228	}
6229
6230	bucket->dthb_chain = new;
6231	bucket->dthb_len++;
6232}
6233
6234static dtrace_probe_t *
6235dtrace_hash_lookup(dtrace_hash_t *hash, dtrace_probe_t *template)
6236{
6237	int hashval = DTRACE_HASHSTR(hash, template);
6238	int ndx = hashval & hash->dth_mask;
6239	dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
6240
6241	for (; bucket != NULL; bucket = bucket->dthb_next) {
6242		if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
6243			return (bucket->dthb_chain);
6244	}
6245
6246	return (NULL);
6247}
6248
6249static int
6250dtrace_hash_collisions(dtrace_hash_t *hash, dtrace_probe_t *template)
6251{
6252	int hashval = DTRACE_HASHSTR(hash, template);
6253	int ndx = hashval & hash->dth_mask;
6254	dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
6255
6256	for (; bucket != NULL; bucket = bucket->dthb_next) {
6257		if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
6258			return (bucket->dthb_len);
6259	}
6260
6261	return (0);
6262}
6263
6264static void
6265dtrace_hash_remove(dtrace_hash_t *hash, dtrace_probe_t *probe)
6266{
6267	int ndx = DTRACE_HASHSTR(hash, probe) & hash->dth_mask;
6268	dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
6269
6270	dtrace_probe_t **prevp = DTRACE_HASHPREV(hash, probe);
6271	dtrace_probe_t **nextp = DTRACE_HASHNEXT(hash, probe);
6272
6273	/*
6274	 * Find the bucket that we're removing this probe from.
6275	 */
6276	for (; bucket != NULL; bucket = bucket->dthb_next) {
6277		if (DTRACE_HASHEQ(hash, bucket->dthb_chain, probe))
6278			break;
6279	}
6280
6281	ASSERT(bucket != NULL);
6282
6283	if (*prevp == NULL) {
6284		if (*nextp == NULL) {
6285			/*
6286			 * The removed probe was the only probe on this
6287			 * bucket; we need to remove the bucket.
6288			 */
6289			dtrace_hashbucket_t *b = hash->dth_tab[ndx];
6290
6291			ASSERT(bucket->dthb_chain == probe);
6292			ASSERT(b != NULL);
6293
6294			if (b == bucket) {
6295				hash->dth_tab[ndx] = bucket->dthb_next;
6296			} else {
6297				while (b->dthb_next != bucket)
6298					b = b->dthb_next;
6299				b->dthb_next = bucket->dthb_next;
6300			}
6301
6302			ASSERT(hash->dth_nbuckets > 0);
6303			hash->dth_nbuckets--;
6304			kmem_free(bucket, sizeof (dtrace_hashbucket_t));
6305			return;
6306		}
6307
6308		bucket->dthb_chain = *nextp;
6309	} else {
6310		*(DTRACE_HASHNEXT(hash, *prevp)) = *nextp;
6311	}
6312
6313	if (*nextp != NULL)
6314		*(DTRACE_HASHPREV(hash, *nextp)) = *prevp;
6315}
6316
6317/*
6318 * DTrace Utility Functions
6319 *
6320 * These are random utility functions that are _not_ called from probe context.
6321 */
6322static int
6323dtrace_badattr(const dtrace_attribute_t *a)
6324{
6325	return (a->dtat_name > DTRACE_STABILITY_MAX ||
6326	    a->dtat_data > DTRACE_STABILITY_MAX ||
6327	    a->dtat_class > DTRACE_CLASS_MAX);
6328}
6329
6330/*
6331 * Return a duplicate copy of a string.  If the specified string is NULL,
6332 * this function returns a zero-length string.
6333 */
6334static char *
6335dtrace_strdup(const char *str)
6336{
6337	char *new = kmem_zalloc((str != NULL ? strlen(str) : 0) + 1, KM_SLEEP);
6338
6339	if (str != NULL)
6340		(void) strcpy(new, str);
6341
6342	return (new);
6343}
6344
6345#define	DTRACE_ISALPHA(c)	\
6346	(((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z'))
6347
6348static int
6349dtrace_badname(const char *s)
6350{
6351	char c;
6352
6353	if (s == NULL || (c = *s++) == '\0')
6354		return (0);
6355
6356	if (!DTRACE_ISALPHA(c) && c != '-' && c != '_' && c != '.')
6357		return (1);
6358
6359	while ((c = *s++) != '\0') {
6360		if (!DTRACE_ISALPHA(c) && (c < '0' || c > '9') &&
6361		    c != '-' && c != '_' && c != '.' && c != '`')
6362			return (1);
6363	}
6364
6365	return (0);
6366}
6367
6368static void
6369dtrace_cred2priv(cred_t *cr, uint32_t *privp, uid_t *uidp, zoneid_t *zoneidp)
6370{
6371	uint32_t priv;
6372
6373	if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
6374		/*
6375		 * For DTRACE_PRIV_ALL, the uid and zoneid don't matter.
6376		 */
6377		priv = DTRACE_PRIV_ALL;
6378	} else {
6379		*uidp = crgetuid(cr);
6380		*zoneidp = crgetzoneid(cr);
6381
6382		priv = 0;
6383		if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE))
6384			priv |= DTRACE_PRIV_KERNEL | DTRACE_PRIV_USER;
6385		else if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE))
6386			priv |= DTRACE_PRIV_USER;
6387		if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE))
6388			priv |= DTRACE_PRIV_PROC;
6389		if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
6390			priv |= DTRACE_PRIV_OWNER;
6391		if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
6392			priv |= DTRACE_PRIV_ZONEOWNER;
6393	}
6394
6395	*privp = priv;
6396}
6397
6398#ifdef DTRACE_ERRDEBUG
6399static void
6400dtrace_errdebug(const char *str)
6401{
6402	int hval = dtrace_hash_str((char *)str) % DTRACE_ERRHASHSZ;
6403	int occupied = 0;
6404
6405	mutex_enter(&dtrace_errlock);
6406	dtrace_errlast = str;
6407	dtrace_errthread = curthread;
6408
6409	while (occupied++ < DTRACE_ERRHASHSZ) {
6410		if (dtrace_errhash[hval].dter_msg == str) {
6411			dtrace_errhash[hval].dter_count++;
6412			goto out;
6413		}
6414
6415		if (dtrace_errhash[hval].dter_msg != NULL) {
6416			hval = (hval + 1) % DTRACE_ERRHASHSZ;
6417			continue;
6418		}
6419
6420		dtrace_errhash[hval].dter_msg = str;
6421		dtrace_errhash[hval].dter_count = 1;
6422		goto out;
6423	}
6424
6425	panic("dtrace: undersized error hash");
6426out:
6427	mutex_exit(&dtrace_errlock);
6428}
6429#endif
6430
6431/*
6432 * DTrace Matching Functions
6433 *
6434 * These functions are used to match groups of probes, given some elements of
6435 * a probe tuple, or some globbed expressions for elements of a probe tuple.
6436 */
6437static int
6438dtrace_match_priv(const dtrace_probe_t *prp, uint32_t priv, uid_t uid,
6439    zoneid_t zoneid)
6440{
6441	if (priv != DTRACE_PRIV_ALL) {
6442		uint32_t ppriv = prp->dtpr_provider->dtpv_priv.dtpp_flags;
6443		uint32_t match = priv & ppriv;
6444
6445		/*
6446		 * No PRIV_DTRACE_* privileges...
6447		 */
6448		if ((priv & (DTRACE_PRIV_PROC | DTRACE_PRIV_USER |
6449		    DTRACE_PRIV_KERNEL)) == 0)
6450			return (0);
6451
6452		/*
6453		 * No matching bits, but there were bits to match...
6454		 */
6455		if (match == 0 && ppriv != 0)
6456			return (0);
6457
6458		/*
6459		 * Need to have permissions to the process, but don't...
6460		 */
6461		if (((ppriv & ~match) & DTRACE_PRIV_OWNER) != 0 &&
6462		    uid != prp->dtpr_provider->dtpv_priv.dtpp_uid) {
6463			return (0);
6464		}
6465
6466		/*
6467		 * Need to be in the same zone unless we possess the
6468		 * privilege to examine all zones.
6469		 */
6470		if (((ppriv & ~match) & DTRACE_PRIV_ZONEOWNER) != 0 &&
6471		    zoneid != prp->dtpr_provider->dtpv_priv.dtpp_zoneid) {
6472			return (0);
6473		}
6474	}
6475
6476	return (1);
6477}
6478
6479/*
6480 * dtrace_match_probe compares a dtrace_probe_t to a pre-compiled key, which
6481 * consists of input pattern strings and an ops-vector to evaluate them.
6482 * This function returns >0 for match, 0 for no match, and <0 for error.
6483 */
6484static int
6485dtrace_match_probe(const dtrace_probe_t *prp, const dtrace_probekey_t *pkp,
6486    uint32_t priv, uid_t uid, zoneid_t zoneid)
6487{
6488	dtrace_provider_t *pvp = prp->dtpr_provider;
6489	int rv;
6490
6491	if (pvp->dtpv_defunct)
6492		return (0);
6493
6494	if ((rv = pkp->dtpk_pmatch(pvp->dtpv_name, pkp->dtpk_prov, 0)) <= 0)
6495		return (rv);
6496
6497	if ((rv = pkp->dtpk_mmatch(prp->dtpr_mod, pkp->dtpk_mod, 0)) <= 0)
6498		return (rv);
6499
6500	if ((rv = pkp->dtpk_fmatch(prp->dtpr_func, pkp->dtpk_func, 0)) <= 0)
6501		return (rv);
6502
6503	if ((rv = pkp->dtpk_nmatch(prp->dtpr_name, pkp->dtpk_name, 0)) <= 0)
6504		return (rv);
6505
6506	if (dtrace_match_priv(prp, priv, uid, zoneid) == 0)
6507		return (0);
6508
6509	return (rv);
6510}
6511
6512/*
6513 * dtrace_match_glob() is a safe kernel implementation of the gmatch(3GEN)
6514 * interface for matching a glob pattern 'p' to an input string 's'.  Unlike
6515 * libc's version, the kernel version only applies to 8-bit ASCII strings.
6516 * In addition, all of the recursion cases except for '*' matching have been
6517 * unwound.  For '*', we still implement recursive evaluation, but a depth
6518 * counter is maintained and matching is aborted if we recurse too deep.
6519 * The function returns 0 if no match, >0 if match, and <0 if recursion error.
6520 */
6521static int
6522dtrace_match_glob(const char *s, const char *p, int depth)
6523{
6524	const char *olds;
6525	char s1, c;
6526	int gs;
6527
6528	if (depth > DTRACE_PROBEKEY_MAXDEPTH)
6529		return (-1);
6530
6531	if (s == NULL)
6532		s = ""; /* treat NULL as empty string */
6533
6534top:
6535	olds = s;
6536	s1 = *s++;
6537
6538	if (p == NULL)
6539		return (0);
6540
6541	if ((c = *p++) == '\0')
6542		return (s1 == '\0');
6543
6544	switch (c) {
6545	case '[': {
6546		int ok = 0, notflag = 0;
6547		char lc = '\0';
6548
6549		if (s1 == '\0')
6550			return (0);
6551
6552		if (*p == '!') {
6553			notflag = 1;
6554			p++;
6555		}
6556
6557		if ((c = *p++) == '\0')
6558			return (0);
6559
6560		do {
6561			if (c == '-' && lc != '\0' && *p != ']') {
6562				if ((c = *p++) == '\0')
6563					return (0);
6564				if (c == '\\' && (c = *p++) == '\0')
6565					return (0);
6566
6567				if (notflag) {
6568					if (s1 < lc || s1 > c)
6569						ok++;
6570					else
6571						return (0);
6572				} else if (lc <= s1 && s1 <= c)
6573					ok++;
6574
6575			} else if (c == '\\' && (c = *p++) == '\0')
6576				return (0);
6577
6578			lc = c; /* save left-hand 'c' for next iteration */
6579
6580			if (notflag) {
6581				if (s1 != c)
6582					ok++;
6583				else
6584					return (0);
6585			} else if (s1 == c)
6586				ok++;
6587
6588			if ((c = *p++) == '\0')
6589				return (0);
6590
6591		} while (c != ']');
6592
6593		if (ok)
6594			goto top;
6595
6596		return (0);
6597	}
6598
6599	case '\\':
6600		if ((c = *p++) == '\0')
6601			return (0);
6602		/*FALLTHRU*/
6603
6604	default:
6605		if (c != s1)
6606			return (0);
6607		/*FALLTHRU*/
6608
6609	case '?':
6610		if (s1 != '\0')
6611			goto top;
6612		return (0);
6613
6614	case '*':
6615		while (*p == '*')
6616			p++; /* consecutive *'s are identical to a single one */
6617
6618		if (*p == '\0')
6619			return (1);
6620
6621		for (s = olds; *s != '\0'; s++) {
6622			if ((gs = dtrace_match_glob(s, p, depth + 1)) != 0)
6623				return (gs);
6624		}
6625
6626		return (0);
6627	}
6628}
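
/*
 * For example, dtrace_match_glob("read", "re*", 0) and
 * dtrace_match_glob("read", "r?a[cd]", 0) each return 1, while
 * dtrace_match_glob("read", "write", 0) returns 0.
 */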
6629
6630/*ARGSUSED*/
6631static int
6632dtrace_match_string(const char *s, const char *p, int depth)
6633{
6634	return (s != NULL && strcmp(s, p) == 0);
6635}
6636
6637/*ARGSUSED*/
6638static int
6639dtrace_match_nul(const char *s, const char *p, int depth)
6640{
6641	return (1); /* always match the empty pattern */
6642}
6643
6644/*ARGSUSED*/
6645static int
6646dtrace_match_nonzero(const char *s, const char *p, int depth)
6647{
6648	return (s != NULL && s[0] != '\0');
6649}
6650
6651static int
6652dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid,
6653    zoneid_t zoneid, int (*matched)(dtrace_probe_t *, void *), void *arg)
6654{
6655	dtrace_probe_t template, *probe;
6656	dtrace_hash_t *hash = NULL;
6657	int len, best = INT_MAX, nmatched = 0;
6658	dtrace_id_t i;
6659
6660	ASSERT(MUTEX_HELD(&dtrace_lock));
6661
6662	/*
6663	 * If the probe ID is specified in the key, just lookup by ID and
6664	 * invoke the match callback once if a matching probe is found.
6665	 */
6666	if (pkp->dtpk_id != DTRACE_IDNONE) {
6667		if ((probe = dtrace_probe_lookup_id(pkp->dtpk_id)) != NULL &&
6668		    dtrace_match_probe(probe, pkp, priv, uid, zoneid) > 0) {
6669			(void) (*matched)(probe, arg);
6670			nmatched++;
6671		}
6672		return (nmatched);
6673	}
6674
6675	template.dtpr_mod = (char *)pkp->dtpk_mod;
6676	template.dtpr_func = (char *)pkp->dtpk_func;
6677	template.dtpr_name = (char *)pkp->dtpk_name;
6678
6679	/*
6680	 * We want to find the most distinct of the module name, function
6681	 * name, and name.  So for each one that is not a glob pattern or
6682	 * empty string, we perform a lookup in the corresponding hash and
6683	 * use the hash table with the fewest collisions to do our search.
6684	 */
6685	if (pkp->dtpk_mmatch == &dtrace_match_string &&
6686	    (len = dtrace_hash_collisions(dtrace_bymod, &template)) < best) {
6687		best = len;
6688		hash = dtrace_bymod;
6689	}
6690
6691	if (pkp->dtpk_fmatch == &dtrace_match_string &&
6692	    (len = dtrace_hash_collisions(dtrace_byfunc, &template)) < best) {
6693		best = len;
6694		hash = dtrace_byfunc;
6695	}
6696
6697	if (pkp->dtpk_nmatch == &dtrace_match_string &&
6698	    (len = dtrace_hash_collisions(dtrace_byname, &template)) < best) {
6699		best = len;
6700		hash = dtrace_byname;
6701	}
6702
6703	/*
6704	 * If we did not select a hash table, iterate over every probe and
6705	 * invoke our callback for each one that matches our input probe key.
6706	 */
6707	if (hash == NULL) {
6708		for (i = 0; i < dtrace_nprobes; i++) {
6709			if ((probe = dtrace_probes[i]) == NULL ||
6710			    dtrace_match_probe(probe, pkp, priv, uid,
6711			    zoneid) <= 0)
6712				continue;
6713
6714			nmatched++;
6715
6716			if ((*matched)(probe, arg) != DTRACE_MATCH_NEXT)
6717				break;
6718		}
6719
6720		return (nmatched);
6721	}
6722
6723	/*
6724	 * If we selected a hash table, iterate over each probe of the same key
6725	 * name and invoke the callback for every probe that matches the other
6726	 * attributes of our input probe key.
6727	 */
6728	for (probe = dtrace_hash_lookup(hash, &template); probe != NULL;
6729	    probe = *(DTRACE_HASHNEXT(hash, probe))) {
6730
6731		if (dtrace_match_probe(probe, pkp, priv, uid, zoneid) <= 0)
6732			continue;
6733
6734		nmatched++;
6735
6736		if ((*matched)(probe, arg) != DTRACE_MATCH_NEXT)
6737			break;
6738	}
6739
6740	return (nmatched);
6741}
6742
6743/*
6744 * Return the function pointer that should be used to compare the specified
6745 * pattern with a string.  For NULL or empty patterns, we select
6746 * dtrace_match_nul().  For glob pattern strings, we use dtrace_match_glob().
6747 * For non-empty non-glob strings, we use dtrace_match_string().
6748 */
6749static dtrace_probekey_f *
6750dtrace_probekey_func(const char *p)
6751{
6752	char c;
6753
6754	if (p == NULL || *p == '\0')
6755		return (&dtrace_match_nul);
6756
6757	while ((c = *p++) != '\0') {
6758		if (c == '[' || c == '?' || c == '*' || c == '\\')
6759			return (&dtrace_match_glob);
6760	}
6761
6762	return (&dtrace_match_string);
6763}
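
/*
 * For example, dtrace_probekey_func(NULL) and dtrace_probekey_func("")
 * both select dtrace_match_nul(); dtrace_probekey_func("sys*") selects
 * dtrace_match_glob(); and dtrace_probekey_func("syscall") selects
 * dtrace_match_string().
 */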
6764
6765/*
6766 * Build a probe comparison key for use with dtrace_match_probe() from the
6767 * given probe description.  By convention, a null key only matches anchored
6768 * probes: if each field is the empty string, reset dtpk_fmatch to
6769 * dtrace_match_nonzero().
6770 */
6771static void
6772dtrace_probekey(const dtrace_probedesc_t *pdp, dtrace_probekey_t *pkp)
6773{
6774	pkp->dtpk_prov = pdp->dtpd_provider;
6775	pkp->dtpk_pmatch = dtrace_probekey_func(pdp->dtpd_provider);
6776
6777	pkp->dtpk_mod = pdp->dtpd_mod;
6778	pkp->dtpk_mmatch = dtrace_probekey_func(pdp->dtpd_mod);
6779
6780	pkp->dtpk_func = pdp->dtpd_func;
6781	pkp->dtpk_fmatch = dtrace_probekey_func(pdp->dtpd_func);
6782
6783	pkp->dtpk_name = pdp->dtpd_name;
6784	pkp->dtpk_nmatch = dtrace_probekey_func(pdp->dtpd_name);
6785
6786	pkp->dtpk_id = pdp->dtpd_id;
6787
6788	if (pkp->dtpk_id == DTRACE_IDNONE &&
6789	    pkp->dtpk_pmatch == &dtrace_match_nul &&
6790	    pkp->dtpk_mmatch == &dtrace_match_nul &&
6791	    pkp->dtpk_fmatch == &dtrace_match_nul &&
6792	    pkp->dtpk_nmatch == &dtrace_match_nul)
6793		pkp->dtpk_fmatch = &dtrace_match_nonzero;
6794}
6795
6796/*
6797 * DTrace Provider-to-Framework API Functions
6798 *
6799 * These functions implement much of the Provider-to-Framework API, as
6800 * described in <sys/dtrace.h>.  The parts of the API not in this section are
6801 * the functions in the API for probe management (found below), and
6802 * dtrace_probe() itself (found above).
6803 */
6804
6805/*
6806 * Register the calling provider with the DTrace framework.  This should
6807 * generally be called by DTrace providers in their attach(9E) entry point.
6808 */
6809int
6810dtrace_register(const char *name, const dtrace_pattr_t *pap, uint32_t priv,
6811    cred_t *cr, const dtrace_pops_t *pops, void *arg, dtrace_provider_id_t *idp)
6812{
6813	dtrace_provider_t *provider;
6814
6815	if (name == NULL || pap == NULL || pops == NULL || idp == NULL) {
6816		cmn_err(CE_WARN, "failed to register provider '%s': invalid "
6817		    "arguments", name ? name : "<NULL>");
6818		return (EINVAL);
6819	}
6820
6821	if (name[0] == '\0' || dtrace_badname(name)) {
6822		cmn_err(CE_WARN, "failed to register provider '%s': invalid "
6823		    "provider name", name);
6824		return (EINVAL);
6825	}
6826
6827	if ((pops->dtps_provide == NULL && pops->dtps_provide_module == NULL) ||
6828	    pops->dtps_enable == NULL || pops->dtps_disable == NULL ||
6829	    pops->dtps_destroy == NULL ||
6830	    ((pops->dtps_resume == NULL) != (pops->dtps_suspend == NULL))) {
6831		cmn_err(CE_WARN, "failed to register provider '%s': invalid "
6832		    "provider ops", name);
6833		return (EINVAL);
6834	}
6835
6836	if (dtrace_badattr(&pap->dtpa_provider) ||
6837	    dtrace_badattr(&pap->dtpa_mod) ||
6838	    dtrace_badattr(&pap->dtpa_func) ||
6839	    dtrace_badattr(&pap->dtpa_name) ||
6840	    dtrace_badattr(&pap->dtpa_args)) {
6841		cmn_err(CE_WARN, "failed to register provider '%s': invalid "
6842		    "provider attributes", name);
6843		return (EINVAL);
6844	}
6845
6846	if (priv & ~DTRACE_PRIV_ALL) {
6847		cmn_err(CE_WARN, "failed to register provider '%s': invalid "
6848		    "privilege attributes", name);
6849		return (EINVAL);
6850	}
6851
6852	if ((priv & DTRACE_PRIV_KERNEL) &&
6853	    (priv & (DTRACE_PRIV_USER | DTRACE_PRIV_OWNER)) &&
6854	    pops->dtps_usermode == NULL) {
6855		cmn_err(CE_WARN, "failed to register provider '%s': need "
6856		    "dtps_usermode() op for given privilege attributes", name);
6857		return (EINVAL);
6858	}
6859
6860	provider = kmem_zalloc(sizeof (dtrace_provider_t), KM_SLEEP);
6861	provider->dtpv_name = kmem_alloc(strlen(name) + 1, KM_SLEEP);
6862	(void) strcpy(provider->dtpv_name, name);
6863
6864	provider->dtpv_attr = *pap;
6865	provider->dtpv_priv.dtpp_flags = priv;
6866	if (cr != NULL) {
6867		provider->dtpv_priv.dtpp_uid = crgetuid(cr);
6868		provider->dtpv_priv.dtpp_zoneid = crgetzoneid(cr);
6869	}
6870	provider->dtpv_pops = *pops;
6871
6872	if (pops->dtps_provide == NULL) {
6873		ASSERT(pops->dtps_provide_module != NULL);
6874		provider->dtpv_pops.dtps_provide =
6875		    (void (*)(void *, const dtrace_probedesc_t *))dtrace_nullop;
6876	}
6877
6878	if (pops->dtps_provide_module == NULL) {
6879		ASSERT(pops->dtps_provide != NULL);
6880		provider->dtpv_pops.dtps_provide_module =
6881		    (void (*)(void *, struct modctl *))dtrace_nullop;
6882	}
6883
6884	if (pops->dtps_suspend == NULL) {
6885		ASSERT(pops->dtps_resume == NULL);
6886		provider->dtpv_pops.dtps_suspend =
6887		    (void (*)(void *, dtrace_id_t, void *))dtrace_nullop;
6888		provider->dtpv_pops.dtps_resume =
6889		    (void (*)(void *, dtrace_id_t, void *))dtrace_nullop;
6890	}
6891
6892	provider->dtpv_arg = arg;
6893	*idp = (dtrace_provider_id_t)provider;
6894
6895	if (pops == &dtrace_provider_ops) {
6896		ASSERT(MUTEX_HELD(&dtrace_provider_lock));
6897		ASSERT(MUTEX_HELD(&dtrace_lock));
6898		ASSERT(dtrace_anon.dta_enabling == NULL);
6899
6900		/*
6901		 * We make sure that the DTrace provider is at the head of
6902		 * the provider chain.
6903		 */
6904		provider->dtpv_next = dtrace_provider;
6905		dtrace_provider = provider;
6906		return (0);
6907	}
6908
6909	mutex_enter(&dtrace_provider_lock);
6910	mutex_enter(&dtrace_lock);
6911
6912	/*
6913	 * If there is at least one provider registered, we'll add this
6914	 * provider after the first provider.
6915	 */
6916	if (dtrace_provider != NULL) {
6917		provider->dtpv_next = dtrace_provider->dtpv_next;
6918		dtrace_provider->dtpv_next = provider;
6919	} else {
6920		dtrace_provider = provider;
6921	}
6922
6923	if (dtrace_retained != NULL) {
6924		dtrace_enabling_provide(provider);
6925
6926		/*
6927		 * Now we need to call dtrace_enabling_matchall() -- which
6928		 * will acquire cpu_lock and dtrace_lock.  We therefore need
6929		 * to drop all of our locks before calling into it...
6930		 */
6931		mutex_exit(&dtrace_lock);
6932		mutex_exit(&dtrace_provider_lock);
6933		dtrace_enabling_matchall();
6934
6935		return (0);
6936	}
6937
6938	mutex_exit(&dtrace_lock);
6939	mutex_exit(&dtrace_provider_lock);
6940
6941	return (0);
6942}
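
/*
 * For example, a hypothetical provider (here called "noprobe"; the
 * name and the attribute/ops vectors are illustrative only) would
 * register itself from its attach(9E) entry point roughly as follows:
 *
 *	static dtrace_pattr_t noprobe_attr = { ... };
 *	static dtrace_pops_t noprobe_pops = { ... };
 *	static dtrace_provider_id_t noprobe_id;
 *
 *	if (dtrace_register("noprobe", &noprobe_attr, DTRACE_PRIV_KERNEL,
 *	    NULL, &noprobe_pops, NULL, &noprobe_id) != 0)
 *		return (DDI_FAILURE);
 */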
6943
6944/*
6945 * Unregister the specified provider from the DTrace framework.  This should
6946 * generally be called by DTrace providers in their detach(9E) entry point.
6947 */
6948int
6949dtrace_unregister(dtrace_provider_id_t id)
6950{
6951	dtrace_provider_t *old = (dtrace_provider_t *)id;
6952	dtrace_provider_t *prev = NULL;
6953	int i, self = 0;
6954	dtrace_probe_t *probe, *first = NULL;
6955
6956	if (old->dtpv_pops.dtps_enable ==
6957	    (void (*)(void *, dtrace_id_t, void *))dtrace_nullop) {
6958		/*
6959		 * If DTrace itself is the provider, we're called with locks
6960		 * already held.
6961		 */
6962		ASSERT(old == dtrace_provider);
6963		ASSERT(dtrace_devi != NULL);
6964		ASSERT(MUTEX_HELD(&dtrace_provider_lock));
6965		ASSERT(MUTEX_HELD(&dtrace_lock));
6966		self = 1;
6967
6968		if (dtrace_provider->dtpv_next != NULL) {
6969			/*
6970			 * There's another provider here; return failure.
6971			 */
6972			return (EBUSY);
6973		}
6974	} else {
6975		mutex_enter(&dtrace_provider_lock);
6976		mutex_enter(&mod_lock);
6977		mutex_enter(&dtrace_lock);
6978	}
6979
6980	/*
6981	 * If anyone has /dev/dtrace open, or if there are anonymous enabled
6982	 * probes, we refuse to let providers slither away, unless this
6983	 * provider has already been explicitly invalidated.
6984	 */
6985	if (!old->dtpv_defunct &&
6986	    (dtrace_opens || (dtrace_anon.dta_state != NULL &&
6987	    dtrace_anon.dta_state->dts_necbs > 0))) {
6988		if (!self) {
6989			mutex_exit(&dtrace_lock);
6990			mutex_exit(&mod_lock);
6991			mutex_exit(&dtrace_provider_lock);
6992		}
6993		return (EBUSY);
6994	}
6995
6996	/*
6997	 * Attempt to destroy the probes associated with this provider.
6998	 */
6999	for (i = 0; i < dtrace_nprobes; i++) {
7000		if ((probe = dtrace_probes[i]) == NULL)
7001			continue;
7002
7003		if (probe->dtpr_provider != old)
7004			continue;
7005
7006		if (probe->dtpr_ecb == NULL)
7007			continue;
7008
7009		/*
7010		 * We have at least one ECB; we can't remove this provider.
7011		 */
7012		if (!self) {
7013			mutex_exit(&dtrace_lock);
7014			mutex_exit(&mod_lock);
7015			mutex_exit(&dtrace_provider_lock);
7016		}
7017		return (EBUSY);
7018	}
7019
7020	/*
7021	 * All of the probes for this provider are disabled; we can safely
7022	 * remove all of them from their hash chains and from the probe array.
7023	 */
7024	for (i = 0; i < dtrace_nprobes; i++) {
7025		if ((probe = dtrace_probes[i]) == NULL)
7026			continue;
7027
7028		if (probe->dtpr_provider != old)
7029			continue;
7030
7031		dtrace_probes[i] = NULL;
7032
7033		dtrace_hash_remove(dtrace_bymod, probe);
7034		dtrace_hash_remove(dtrace_byfunc, probe);
7035		dtrace_hash_remove(dtrace_byname, probe);
7036
7037		if (first == NULL) {
7038			first = probe;
7039			probe->dtpr_nextmod = NULL;
7040		} else {
7041			probe->dtpr_nextmod = first;
7042			first = probe;
7043		}
7044	}
7045
7046	/*
7047	 * The provider's probes have been removed from the hash chains and
7048	 * from the probe array.  Now issue a dtrace_sync() to be sure that
7049	 * everyone has cleared out from any probe array processing.
7050	 */
7051	dtrace_sync();
7052
7053	for (probe = first; probe != NULL; probe = first) {
7054		first = probe->dtpr_nextmod;
7055
7056		old->dtpv_pops.dtps_destroy(old->dtpv_arg, probe->dtpr_id,
7057		    probe->dtpr_arg);
7058		kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
7059		kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
7060		kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
7061		vmem_free(dtrace_arena, (void *)(uintptr_t)(probe->dtpr_id), 1);
7062		kmem_free(probe, sizeof (dtrace_probe_t));
7063	}
7064
7065	if ((prev = dtrace_provider) == old) {
7066		ASSERT(self || dtrace_devi == NULL);
7067		ASSERT(old->dtpv_next == NULL || dtrace_devi == NULL);
7068		dtrace_provider = old->dtpv_next;
7069	} else {
7070		while (prev != NULL && prev->dtpv_next != old)
7071			prev = prev->dtpv_next;
7072
7073		if (prev == NULL) {
7074			panic("attempt to unregister non-existent "
7075			    "dtrace provider %p\n", (void *)id);
7076		}
7077
7078		prev->dtpv_next = old->dtpv_next;
7079	}
7080
7081	if (!self) {
7082		mutex_exit(&dtrace_lock);
7083		mutex_exit(&mod_lock);
7084		mutex_exit(&dtrace_provider_lock);
7085	}
7086
7087	kmem_free(old->dtpv_name, strlen(old->dtpv_name) + 1);
7088	kmem_free(old, sizeof (dtrace_provider_t));
7089
7090	return (0);
7091}
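
/*
 * Continuing the hypothetical "noprobe" sketch above, a provider's
 * detach(9E) entry point would typically refuse to detach if the
 * framework reports that the provider is still in use:
 *
 *	if (dtrace_unregister(noprobe_id) != 0)
 *		return (DDI_FAILURE);
 */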
7092
7093/*
7094 * Invalidate the specified provider.  All subsequent probe lookups for the
7095 * specified provider will fail, but its probes will not be removed.
7096 */
7097void
7098dtrace_invalidate(dtrace_provider_id_t id)
7099{
7100	dtrace_provider_t *pvp = (dtrace_provider_t *)id;
7101
7102	ASSERT(pvp->dtpv_pops.dtps_enable !=
7103	    (void (*)(void *, dtrace_id_t, void *))dtrace_nullop);
7104
7105	mutex_enter(&dtrace_provider_lock);
7106	mutex_enter(&dtrace_lock);
7107
7108	pvp->dtpv_defunct = 1;
7109
7110	mutex_exit(&dtrace_lock);
7111	mutex_exit(&dtrace_provider_lock);
7112}
7113
7114/*
7115 * Indicate whether or not DTrace has attached.
7116 */
7117int
7118dtrace_attached(void)
7119{
7120	/*
7121	 * dtrace_provider will be non-NULL iff the DTrace driver has
7122	 * attached.  (It's non-NULL because DTrace is always itself a
7123	 * provider.)
7124	 */
7125	return (dtrace_provider != NULL);
7126}
7127
7128/*
7129 * Remove all the unenabled probes for the given provider.  This function is
7130 * not unlike dtrace_unregister(), except that it doesn't remove the provider
7131 * -- just as many of its associated probes as it can.
7132 */
7133int
7134dtrace_condense(dtrace_provider_id_t id)
7135{
7136	dtrace_provider_t *prov = (dtrace_provider_t *)id;
7137	int i;
7138	dtrace_probe_t *probe;
7139
7140	/*
7141	 * Make sure this isn't the dtrace provider itself.
7142	 */
7143	ASSERT(prov->dtpv_pops.dtps_enable !=
7144	    (void (*)(void *, dtrace_id_t, void *))dtrace_nullop);
7145
7146	mutex_enter(&dtrace_provider_lock);
7147	mutex_enter(&dtrace_lock);
7148
7149	/*
7150	 * Attempt to destroy the probes associated with this provider.
7151	 */
7152	for (i = 0; i < dtrace_nprobes; i++) {
7153		if ((probe = dtrace_probes[i]) == NULL)
7154			continue;
7155
7156		if (probe->dtpr_provider != prov)
7157			continue;
7158
7159		if (probe->dtpr_ecb != NULL)
7160			continue;
7161
7162		dtrace_probes[i] = NULL;
7163
7164		dtrace_hash_remove(dtrace_bymod, probe);
7165		dtrace_hash_remove(dtrace_byfunc, probe);
7166		dtrace_hash_remove(dtrace_byname, probe);
7167
7168		prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, i + 1,
7169		    probe->dtpr_arg);
7170		kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
7171		kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
7172		kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
7173		kmem_free(probe, sizeof (dtrace_probe_t));
7174		vmem_free(dtrace_arena, (void *)((uintptr_t)i + 1), 1);
7175	}
7176
7177	mutex_exit(&dtrace_lock);
7178	mutex_exit(&dtrace_provider_lock);
7179
7180	return (0);
7181}
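
/*
 * A provider that autocreates probes might call dtrace_condense() on
 * itself to shed probes that were created but never enabled (again
 * in terms of the hypothetical "noprobe" registration from above):
 *
 *	(void) dtrace_condense(noprobe_id);
 */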
7182
7183/*
7184 * DTrace Probe Management Functions
7185 *
7186 * The functions in this section perform the DTrace probe management,
7187 * including functions to create probes, look-up probes, and call into the
7188 * providers to request that probes be provided.  Some of these functions are
7189 * in the Provider-to-Framework API; these functions can be identified by the
7190 * fact that they are not declared "static".
7191 */
7192
7193/*
7194 * Create a probe with the specified module name, function name, and name.
7195 */
7196dtrace_id_t
7197dtrace_probe_create(dtrace_provider_id_t prov, const char *mod,
7198    const char *func, const char *name, int aframes, void *arg)
7199{
7200	dtrace_probe_t *probe, **probes;
7201	dtrace_provider_t *provider = (dtrace_provider_t *)prov;
7202	dtrace_id_t id;
7203
7204	if (provider == dtrace_provider) {
7205		ASSERT(MUTEX_HELD(&dtrace_lock));
7206	} else {
7207		mutex_enter(&dtrace_lock);
7208	}
7209
7210	id = (dtrace_id_t)(uintptr_t)vmem_alloc(dtrace_arena, 1,
7211	    VM_BESTFIT | VM_SLEEP);
7212	probe = kmem_zalloc(sizeof (dtrace_probe_t), KM_SLEEP);
7213
7214	probe->dtpr_id = id;
7215	probe->dtpr_gen = dtrace_probegen++;
7216	probe->dtpr_mod = dtrace_strdup(mod);
7217	probe->dtpr_func = dtrace_strdup(func);
7218	probe->dtpr_name = dtrace_strdup(name);
7219	probe->dtpr_arg = arg;
7220	probe->dtpr_aframes = aframes;
7221	probe->dtpr_provider = provider;
7222
7223	dtrace_hash_add(dtrace_bymod, probe);
7224	dtrace_hash_add(dtrace_byfunc, probe);
7225	dtrace_hash_add(dtrace_byname, probe);
7226
7227	if (id - 1 >= dtrace_nprobes) {
7228		size_t osize = dtrace_nprobes * sizeof (dtrace_probe_t *);
7229		size_t nsize = osize << 1;
7230
7231		if (nsize == 0) {
7232			ASSERT(osize == 0);
7233			ASSERT(dtrace_probes == NULL);
7234			nsize = sizeof (dtrace_probe_t *);
7235		}
7236
7237		probes = kmem_zalloc(nsize, KM_SLEEP);
7238
7239		if (dtrace_probes == NULL) {
7240			ASSERT(osize == 0);
7241			dtrace_probes = probes;
7242			dtrace_nprobes = 1;
7243		} else {
7244			dtrace_probe_t **oprobes = dtrace_probes;
7245
7246			bcopy(oprobes, probes, osize);
7247			dtrace_membar_producer();
7248			dtrace_probes = probes;
7249
7250			dtrace_sync();
7251
7252			/*
7253			 * All CPUs are now seeing the new probes array; we can
7254			 * safely free the old array.
7255			 */
7256			kmem_free(oprobes, osize);
7257			dtrace_nprobes <<= 1;
7258		}
7259
7260		ASSERT(id - 1 < dtrace_nprobes);
7261	}
7262
7263	ASSERT(dtrace_probes[id - 1] == NULL);
7264	dtrace_probes[id - 1] = probe;
7265
7266	if (provider != dtrace_provider)
7267		mutex_exit(&dtrace_lock);
7268
7269	return (id);
7270}
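
/*
 * For example, the hypothetical "noprobe" provider from above might
 * create an unanchored probe named noprobe:::tick (with no artificial
 * frames and no per-probe argument) via:
 *
 *	id = dtrace_probe_create(noprobe_id, NULL, NULL, "tick", 0, NULL);
 */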
7271
7272static dtrace_probe_t *
7273dtrace_probe_lookup_id(dtrace_id_t id)
7274{
7275	ASSERT(MUTEX_HELD(&dtrace_lock));
7276
7277	if (id == 0 || id > dtrace_nprobes)
7278		return (NULL);
7279
7280	return (dtrace_probes[id - 1]);
7281}
7282
7283static int
7284dtrace_probe_lookup_match(dtrace_probe_t *probe, void *arg)
7285{
7286	*((dtrace_id_t *)arg) = probe->dtpr_id;
7287
7288	return (DTRACE_MATCH_DONE);
7289}
7290
7291/*
7292 * Look up a probe based on provider and one or more of module name, function
7293 * name and probe name.
7294 */
7295dtrace_id_t
7296dtrace_probe_lookup(dtrace_provider_id_t prid, const char *mod,
7297    const char *func, const char *name)
7298{
7299	dtrace_probekey_t pkey;
7300	dtrace_id_t id;
7301	int match;
7302
7303	pkey.dtpk_prov = ((dtrace_provider_t *)prid)->dtpv_name;
7304	pkey.dtpk_pmatch = &dtrace_match_string;
7305	pkey.dtpk_mod = mod;
7306	pkey.dtpk_mmatch = mod ? &dtrace_match_string : &dtrace_match_nul;
7307	pkey.dtpk_func = func;
7308	pkey.dtpk_fmatch = func ? &dtrace_match_string : &dtrace_match_nul;
7309	pkey.dtpk_name = name;
7310	pkey.dtpk_nmatch = name ? &dtrace_match_string : &dtrace_match_nul;
7311	pkey.dtpk_id = DTRACE_IDNONE;
7312
7313	mutex_enter(&dtrace_lock);
7314	match = dtrace_match(&pkey, DTRACE_PRIV_ALL, 0, 0,
7315	    dtrace_probe_lookup_match, &id);
7316	mutex_exit(&dtrace_lock);
7317
7318	ASSERT(match == 1 || match == 0);
7319	return (match ? id : 0);
7320}
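
/*
 * A provider typically uses this lookup to avoid creating duplicate
 * probes; a sketch, again in terms of the hypothetical "noprobe"
 * provider:
 *
 *	if (dtrace_probe_lookup(noprobe_id, NULL, NULL, "tick") == 0)
 *		(void) dtrace_probe_create(noprobe_id, NULL, NULL,
 *		    "tick", 0, NULL);
 */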
7321
7322/*
7323 * Returns the probe argument associated with the specified probe.
7324 */
7325void *
7326dtrace_probe_arg(dtrace_provider_id_t id, dtrace_id_t pid)
7327{
7328	dtrace_probe_t *probe;
7329	void *rval = NULL;
7330
7331	mutex_enter(&dtrace_lock);
7332
7333	if ((probe = dtrace_probe_lookup_id(pid)) != NULL &&
7334	    probe->dtpr_provider == (dtrace_provider_t *)id)
7335		rval = probe->dtpr_arg;
7336
7337	mutex_exit(&dtrace_lock);
7338
7339	return (rval);
7340}
7341
7342/*
7343 * Copy a probe into a probe description.
7344 */
7345static void
7346dtrace_probe_description(const dtrace_probe_t *prp, dtrace_probedesc_t *pdp)
7347{
7348	bzero(pdp, sizeof (dtrace_probedesc_t));
7349	pdp->dtpd_id = prp->dtpr_id;
7350
7351	(void) strncpy(pdp->dtpd_provider,
7352	    prp->dtpr_provider->dtpv_name, DTRACE_PROVNAMELEN - 1);
7353
7354	(void) strncpy(pdp->dtpd_mod, prp->dtpr_mod, DTRACE_MODNAMELEN - 1);
7355	(void) strncpy(pdp->dtpd_func, prp->dtpr_func, DTRACE_FUNCNAMELEN - 1);
7356	(void) strncpy(pdp->dtpd_name, prp->dtpr_name, DTRACE_NAMELEN - 1);
7357}
7358
7359/*
7360 * Called to indicate that a probe -- or probes -- should be provided by a
7361 * specified provider.  If the specified description is NULL, the provider will
7362 * be told to provide all of its probes.  (This is done whenever a new
7363 * consumer comes along, or whenever a retained enabling is to be matched.) If
7364 * the specified description is non-NULL, the provider is given the
7365 * opportunity to dynamically provide the specified probe, allowing providers
7366 * to support the creation of probes on-the-fly.  (So-called _autocreated_
7367 * probes.)  If the provider is NULL, the operations will be applied to all
7368 * providers; if the provider is non-NULL the operations will only be applied
7369 * to the specified provider.  The dtrace_provider_lock must be held, and the
7370 * dtrace_lock must _not_ be held -- the provider's dtps_provide() operation
7371 * will need to grab the dtrace_lock when it reenters the framework through
7372 * dtrace_probe_lookup(), dtrace_probe_create(), etc.
7373 */
7374static void
7375dtrace_probe_provide(dtrace_probedesc_t *desc, dtrace_provider_t *prv)
7376{
7377	struct modctl *ctl;
7378	int all = 0;
7379
7380	ASSERT(MUTEX_HELD(&dtrace_provider_lock));
7381
7382	if (prv == NULL) {
7383		all = 1;
7384		prv = dtrace_provider;
7385	}
7386
7387	do {
7388		/*
7389		 * First, call the blanket provide operation.
7390		 */
7391		prv->dtpv_pops.dtps_provide(prv->dtpv_arg, desc);
7392
7393		/*
7394		 * Now call the per-module provide operation.  We will grab
7395		 * mod_lock to prevent the list from being modified.  Note
7396		 * that this also prevents the mod_busy bits from changing.
7397		 * (mod_busy can only be changed with mod_lock held.)
7398		 */
7399		mutex_enter(&mod_lock);
7400
7401		ctl = &modules;
7402		do {
7403			if (ctl->mod_busy || ctl->mod_mp == NULL)
7404				continue;
7405
7406			prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
7407
7408		} while ((ctl = ctl->mod_next) != &modules);
7409
7410		mutex_exit(&mod_lock);
7411	} while (all && (prv = prv->dtpv_next) != NULL);
7412}
7413
7414/*
7415 * Iterate over each probe, and call the Framework-to-Provider API function
7416 * denoted by offs.
7417 */
7418static void
7419dtrace_probe_foreach(uintptr_t offs)
7420{
7421	dtrace_provider_t *prov;
7422	void (*func)(void *, dtrace_id_t, void *);
7423	dtrace_probe_t *probe;
7424	dtrace_icookie_t cookie;
7425	int i;
7426
7427	/*
7428	 * We disable interrupts to walk through the probe array.  This is
7429	 * safe -- the dtrace_sync() in dtrace_unregister() assures that we
7430	 * won't see stale data.
7431	 */
7432	cookie = dtrace_interrupt_disable();
7433
7434	for (i = 0; i < dtrace_nprobes; i++) {
7435		if ((probe = dtrace_probes[i]) == NULL)
7436			continue;
7437
7438		if (probe->dtpr_ecb == NULL) {
7439			/*
7440			 * This probe isn't enabled -- don't call the function.
7441			 */
7442			continue;
7443		}
7444
7445		prov = probe->dtpr_provider;
7446		func = *((void(**)(void *, dtrace_id_t, void *))
7447		    ((uintptr_t)&prov->dtpv_pops + offs));
7448
7449		func(prov->dtpv_arg, i + 1, probe->dtpr_arg);
7450	}
7451
7452	dtrace_interrupt_enable(cookie);
7453}
7454
7455static int
7456dtrace_probe_enable(const dtrace_probedesc_t *desc, dtrace_enabling_t *enab)
7457{
7458	dtrace_probekey_t pkey;
7459	uint32_t priv;
7460	uid_t uid;
7461	zoneid_t zoneid;
7462
7463	ASSERT(MUTEX_HELD(&dtrace_lock));
7464	dtrace_ecb_create_cache = NULL;
7465
7466	if (desc == NULL) {
7467		/*
7468		 * If we're passed a NULL description, we're being asked to
7469		 * create an ECB with a NULL probe.
7470		 */
7471		(void) dtrace_ecb_create_enable(NULL, enab);
7472		return (0);
7473	}
7474
7475	dtrace_probekey(desc, &pkey);
7476	dtrace_cred2priv(enab->dten_vstate->dtvs_state->dts_cred.dcr_cred,
7477	    &priv, &uid, &zoneid);
7478
7479	return (dtrace_match(&pkey, priv, uid, zoneid, dtrace_ecb_create_enable,
7480	    enab));
7481}
7482
7483/*
7484 * DTrace Helper Provider Functions
7485 */
7486static void
7487dtrace_dofattr2attr(dtrace_attribute_t *attr, const dof_attr_t dofattr)
7488{
7489	attr->dtat_name = DOF_ATTR_NAME(dofattr);
7490	attr->dtat_data = DOF_ATTR_DATA(dofattr);
7491	attr->dtat_class = DOF_ATTR_CLASS(dofattr);
7492}
7493
7494static void
7495dtrace_dofprov2hprov(dtrace_helper_provdesc_t *hprov,
7496    const dof_provider_t *dofprov, char *strtab)
7497{
7498	hprov->dthpv_provname = strtab + dofprov->dofpv_name;
7499	dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_provider,
7500	    dofprov->dofpv_provattr);
7501	dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_mod,
7502	    dofprov->dofpv_modattr);
7503	dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_func,
7504	    dofprov->dofpv_funcattr);
7505	dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_name,
7506	    dofprov->dofpv_nameattr);
7507	dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_args,
7508	    dofprov->dofpv_argsattr);
7509}
7510
7511static void
7512dtrace_helper_provide_one(dof_helper_t *dhp, dof_sec_t *sec, pid_t pid)
7513{
7514	uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
7515	dof_hdr_t *dof = (dof_hdr_t *)daddr;
7516	dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
7517	dof_provider_t *provider;
7518	dof_probe_t *probe;
7519	uint32_t *off, *enoff;
7520	uint8_t *arg;
7521	char *strtab;
7522	uint_t i, nprobes;
7523	dtrace_helper_provdesc_t dhpv;
7524	dtrace_helper_probedesc_t dhpb;
7525	dtrace_meta_t *meta = dtrace_meta_pid;
7526	dtrace_mops_t *mops = &meta->dtm_mops;
7527	void *parg;
7528
7529	provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
7530	str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
7531	    provider->dofpv_strtab * dof->dofh_secsize);
7532	prb_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
7533	    provider->dofpv_probes * dof->dofh_secsize);
7534	arg_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
7535	    provider->dofpv_prargs * dof->dofh_secsize);
7536	off_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
7537	    provider->dofpv_proffs * dof->dofh_secsize);
7538
7539	strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
7540	off = (uint32_t *)(uintptr_t)(daddr + off_sec->dofs_offset);
7541	arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
7542	enoff = NULL;
7543
7544	/*
7545	 * See dtrace_helper_provider_validate().
7546	 */
7547	if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
7548	    provider->dofpv_prenoffs != DOF_SECT_NONE) {
7549		enoff_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
7550		    provider->dofpv_prenoffs * dof->dofh_secsize);
7551		enoff = (uint32_t *)(uintptr_t)(daddr + enoff_sec->dofs_offset);
7552	}
7553
7554	nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
7555
7556	/*
7557	 * Create the provider.
7558	 */
7559	dtrace_dofprov2hprov(&dhpv, provider, strtab);
7560
7561	if ((parg = mops->dtms_provide_pid(meta->dtm_arg, &dhpv, pid)) == NULL)
7562		return;
7563
7564	meta->dtm_count++;
7565
7566	/*
7567	 * Create the probes.
7568	 */
7569	for (i = 0; i < nprobes; i++) {
7570		probe = (dof_probe_t *)(uintptr_t)(daddr +
7571		    prb_sec->dofs_offset + i * prb_sec->dofs_entsize);
7572
7573		dhpb.dthpb_mod = dhp->dofhp_mod;
7574		dhpb.dthpb_func = strtab + probe->dofpr_func;
7575		dhpb.dthpb_name = strtab + probe->dofpr_name;
7576		dhpb.dthpb_base = probe->dofpr_addr;
7577		dhpb.dthpb_offs = off + probe->dofpr_offidx;
7578		dhpb.dthpb_noffs = probe->dofpr_noffs;
7579		if (enoff != NULL) {
7580			dhpb.dthpb_enoffs = enoff + probe->dofpr_enoffidx;
7581			dhpb.dthpb_nenoffs = probe->dofpr_nenoffs;
7582		} else {
7583			dhpb.dthpb_enoffs = NULL;
7584			dhpb.dthpb_nenoffs = 0;
7585		}
7586		dhpb.dthpb_args = arg + probe->dofpr_argidx;
7587		dhpb.dthpb_nargc = probe->dofpr_nargc;
7588		dhpb.dthpb_xargc = probe->dofpr_xargc;
7589		dhpb.dthpb_ntypes = strtab + probe->dofpr_nargv;
7590		dhpb.dthpb_xtypes = strtab + probe->dofpr_xargv;
7591
7592		mops->dtms_create_probe(meta->dtm_arg, parg, &dhpb);
7593	}
7594}
7595
7596static void
7597dtrace_helper_provide(dof_helper_t *dhp, pid_t pid)
7598{
7599	uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
7600	dof_hdr_t *dof = (dof_hdr_t *)daddr;
7601	int i;
7602
7603	ASSERT(MUTEX_HELD(&dtrace_meta_lock));
7604
7605	for (i = 0; i < dof->dofh_secnum; i++) {
7606		dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
7607		    dof->dofh_secoff + i * dof->dofh_secsize);
7608
7609		if (sec->dofs_type != DOF_SECT_PROVIDER)
7610			continue;
7611
7612		dtrace_helper_provide_one(dhp, sec, pid);
7613	}
7614
7615	/*
7616	 * We may have just created probes, so we must now rematch against
7617	 * any retained enablings.  Note that this call will acquire both
7618	 * cpu_lock and dtrace_lock; the fact that we are holding
7619	 * dtrace_meta_lock now is what defines the ordering with respect to
7620	 * these three locks.
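	 * (That is, dtrace_meta_lock must be acquired before cpu_lock,
	 * which in turn must be acquired before dtrace_lock.)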
7621	 */
7622	dtrace_enabling_matchall();
7623}
7624
7625static void
7626dtrace_helper_provider_remove_one(dof_helper_t *dhp, dof_sec_t *sec, pid_t pid)
7627{
7628	uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
7629	dof_hdr_t *dof = (dof_hdr_t *)daddr;
7630	dof_sec_t *str_sec;
7631	dof_provider_t *provider;
7632	char *strtab;
7633	dtrace_helper_provdesc_t dhpv;
7634	dtrace_meta_t *meta = dtrace_meta_pid;
7635	dtrace_mops_t *mops = &meta->dtm_mops;
7636
7637	provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
7638	str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
7639	    provider->dofpv_strtab * dof->dofh_secsize);
7640
7641	strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
7642
7643	/*
7644	 * Re-create the provider description to pass to the remove entry point.
7645	 */
7646	dtrace_dofprov2hprov(&dhpv, provider, strtab);
7647
7648	mops->dtms_remove_pid(meta->dtm_arg, &dhpv, pid);
7649
7650	meta->dtm_count--;
7651}
7652
7653static void
7654dtrace_helper_provider_remove(dof_helper_t *dhp, pid_t pid)
7655{
7656	uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
7657	dof_hdr_t *dof = (dof_hdr_t *)daddr;
7658	int i;
7659
7660	ASSERT(MUTEX_HELD(&dtrace_meta_lock));
7661
7662	for (i = 0; i < dof->dofh_secnum; i++) {
7663		dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
7664		    dof->dofh_secoff + i * dof->dofh_secsize);
7665
7666		if (sec->dofs_type != DOF_SECT_PROVIDER)
7667			continue;
7668
7669		dtrace_helper_provider_remove_one(dhp, sec, pid);
7670	}
7671}
7672
7673/*
7674 * DTrace Meta Provider-to-Framework API Functions
7675 *
7676 * These functions implement the Meta Provider-to-Framework API, as described
7677 * in <sys/dtrace.h>.
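 *
 * As a minimal illustrative sketch (the foo_* names below are hypothetical,
 * not taken from any actual meta provider), a meta provider registers by
 * supplying its three required entry points in a dtrace_mops_t:
 *
 *	static dtrace_mops_t foo_mops;
 *	dtrace_meta_provider_id_t foo_id;
 *
 *	foo_mops.dtms_create_probe = foo_create_probe;
 *	foo_mops.dtms_provide_pid = foo_provide_pid;
 *	foo_mops.dtms_remove_pid = foo_remove_pid;
 *
 *	if (dtrace_meta_register("foo", &foo_mops, NULL, &foo_id) != 0)
 *		... fail:  invalid ops, or a meta provider already exists ...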
7678 */
7679int
7680dtrace_meta_register(const char *name, const dtrace_mops_t *mops, void *arg,
7681    dtrace_meta_provider_id_t *idp)
7682{
7683	dtrace_meta_t *meta;
7684	dtrace_helpers_t *help, *next;
7685	int i;
7686
7687	*idp = DTRACE_METAPROVNONE;
7688
7689	/*
7690	 * We strictly don't need the name, but we hold onto it for
7691	 * debuggability. All hail error queues!
7692	 */
7693	if (name == NULL) {
7694		cmn_err(CE_WARN, "failed to register meta-provider: "
7695		    "invalid name");
7696		return (EINVAL);
7697	}
7698
7699	if (mops == NULL ||
7700	    mops->dtms_create_probe == NULL ||
7701	    mops->dtms_provide_pid == NULL ||
7702	    mops->dtms_remove_pid == NULL) {
7703		cmn_err(CE_WARN, "failed to register meta-provider %s: "
7704		    "invalid ops", name);
7705		return (EINVAL);
7706	}
7707
7708	meta = kmem_zalloc(sizeof (dtrace_meta_t), KM_SLEEP);
7709	meta->dtm_mops = *mops;
7710	meta->dtm_name = kmem_alloc(strlen(name) + 1, KM_SLEEP);
7711	(void) strcpy(meta->dtm_name, name);
7712	meta->dtm_arg = arg;
7713
7714	mutex_enter(&dtrace_meta_lock);
7715	mutex_enter(&dtrace_lock);
7716
7717	if (dtrace_meta_pid != NULL) {
7718		mutex_exit(&dtrace_lock);
7719		mutex_exit(&dtrace_meta_lock);
7720		cmn_err(CE_WARN, "failed to register meta-provider %s: "
7721		    "user-land meta-provider exists", name);
7722		kmem_free(meta->dtm_name, strlen(meta->dtm_name) + 1);
7723		kmem_free(meta, sizeof (dtrace_meta_t));
7724		return (EINVAL);
7725	}
7726
7727	dtrace_meta_pid = meta;
7728	*idp = (dtrace_meta_provider_id_t)meta;
7729
7730	/*
7731	 * If there are providers and probes ready to go, pass them
7732	 * off to the new meta provider now.
7733	 */
7734
7735	help = dtrace_deferred_pid;
7736	dtrace_deferred_pid = NULL;
7737
7738	mutex_exit(&dtrace_lock);
7739
7740	while (help != NULL) {
7741		for (i = 0; i < help->dthps_nprovs; i++) {
7742			dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
7743			    help->dthps_pid);
7744		}
7745
7746		next = help->dthps_next;
7747		help->dthps_next = NULL;
7748		help->dthps_prev = NULL;
7749		help->dthps_deferred = 0;
7750		help = next;
7751	}
7752
7753	mutex_exit(&dtrace_meta_lock);
7754
7755	return (0);
7756}
7757
7758int
7759dtrace_meta_unregister(dtrace_meta_provider_id_t id)
7760{
7761	dtrace_meta_t **pp, *old = (dtrace_meta_t *)id;
7762
7763	mutex_enter(&dtrace_meta_lock);
7764	mutex_enter(&dtrace_lock);
7765
7766	if (old == dtrace_meta_pid) {
7767		pp = &dtrace_meta_pid;
7768	} else {
7769		panic("attempt to unregister non-existent "
7770		    "dtrace meta-provider %p\n", (void *)old);
7771	}
7772
7773	if (old->dtm_count != 0) {
7774		mutex_exit(&dtrace_lock);
7775		mutex_exit(&dtrace_meta_lock);
7776		return (EBUSY);
7777	}
7778
7779	*pp = NULL;
7780
7781	mutex_exit(&dtrace_lock);
7782	mutex_exit(&dtrace_meta_lock);
7783
7784	kmem_free(old->dtm_name, strlen(old->dtm_name) + 1);
7785	kmem_free(old, sizeof (dtrace_meta_t));
7786
7787	return (0);
7788}
7789
7790
7791/*
7792 * DTrace DIF Object Functions
7793 */
7794static int
7795dtrace_difo_err(uint_t pc, const char *format, ...)
7796{
7797	if (dtrace_err_verbose) {
7798		va_list alist;
7799
7800		(void) uprintf("dtrace DIF object error: [%u]: ", pc);
7801		va_start(alist, format);
7802		(void) vuprintf(format, alist);
7803		va_end(alist);
7804	}
7805
7806#ifdef DTRACE_ERRDEBUG
7807	dtrace_errdebug(format);
7808#endif
7809	return (1);
7810}
7811
7812/*
7813 * Validate a DTrace DIF object by checking the IR instructions.  The following
7814 * rules are currently enforced by dtrace_difo_validate():
7815 *
7816 * 1. Each instruction must have a valid opcode
7817 * 2. Each register, string, variable, or subroutine reference must be valid
7818 * 3. No instruction can modify register %r0 (must be zero)
7819 * 4. All instruction reserved bits must be set to zero
7820 * 5. The last instruction must be a "ret" instruction
7821 * 6. All branch targets must reference a valid instruction _after_ the branch
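 *
 * As an illustrative example (hand-assembled, not compiler-generated), a
 * two-instruction DIFO consisting of "mov %r1, %r0" followed by "tst %r2"
 * would draw two errors from the checks below:  a write to %r0 (rule 3),
 * and a final instruction that is not "ret" (rule 5).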
7822 */
7823static int
7824dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs,
7825    cred_t *cr)
7826{
7827	int err = 0, i;
7828	int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
7829	int kcheckload;
7830	uint_t pc;
7831
7832	kcheckload = cr == NULL ||
7833	    (vstate->dtvs_state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) == 0;
7834
7835	dp->dtdo_destructive = 0;
7836
7837	for (pc = 0; pc < dp->dtdo_len && err == 0; pc++) {
7838		dif_instr_t instr = dp->dtdo_buf[pc];
7839
7840		uint_t r1 = DIF_INSTR_R1(instr);
7841		uint_t r2 = DIF_INSTR_R2(instr);
7842		uint_t rd = DIF_INSTR_RD(instr);
7843		uint_t rs = DIF_INSTR_RS(instr);
7844		uint_t label = DIF_INSTR_LABEL(instr);
7845		uint_t v = DIF_INSTR_VAR(instr);
7846		uint_t subr = DIF_INSTR_SUBR(instr);
7847		uint_t type = DIF_INSTR_TYPE(instr);
7848		uint_t op = DIF_INSTR_OP(instr);
7849
7850		switch (op) {
7851		case DIF_OP_OR:
7852		case DIF_OP_XOR:
7853		case DIF_OP_AND:
7854		case DIF_OP_SLL:
7855		case DIF_OP_SRL:
7856		case DIF_OP_SRA:
7857		case DIF_OP_SUB:
7858		case DIF_OP_ADD:
7859		case DIF_OP_MUL:
7860		case DIF_OP_SDIV:
7861		case DIF_OP_UDIV:
7862		case DIF_OP_SREM:
7863		case DIF_OP_UREM:
7864		case DIF_OP_COPYS:
7865			if (r1 >= nregs)
7866				err += efunc(pc, "invalid register %u\n", r1);
7867			if (r2 >= nregs)
7868				err += efunc(pc, "invalid register %u\n", r2);
7869			if (rd >= nregs)
7870				err += efunc(pc, "invalid register %u\n", rd);
7871			if (rd == 0)
7872				err += efunc(pc, "cannot write to %r0\n");
7873			break;
7874		case DIF_OP_NOT:
7875		case DIF_OP_MOV:
7876		case DIF_OP_ALLOCS:
7877			if (r1 >= nregs)
7878				err += efunc(pc, "invalid register %u\n", r1);
7879			if (r2 != 0)
7880				err += efunc(pc, "non-zero reserved bits\n");
7881			if (rd >= nregs)
7882				err += efunc(pc, "invalid register %u\n", rd);
7883			if (rd == 0)
7884				err += efunc(pc, "cannot write to %r0\n");
7885			break;
7886		case DIF_OP_LDSB:
7887		case DIF_OP_LDSH:
7888		case DIF_OP_LDSW:
7889		case DIF_OP_LDUB:
7890		case DIF_OP_LDUH:
7891		case DIF_OP_LDUW:
7892		case DIF_OP_LDX:
7893			if (r1 >= nregs)
7894				err += efunc(pc, "invalid register %u\n", r1);
7895			if (r2 != 0)
7896				err += efunc(pc, "non-zero reserved bits\n");
7897			if (rd >= nregs)
7898				err += efunc(pc, "invalid register %u\n", rd);
7899			if (rd == 0)
7900				err += efunc(pc, "cannot write to %r0\n");
7901			if (kcheckload)
7902				dp->dtdo_buf[pc] = DIF_INSTR_LOAD(op +
7903				    DIF_OP_RLDSB - DIF_OP_LDSB, r1, rd);
7904			break;
7905		case DIF_OP_RLDSB:
7906		case DIF_OP_RLDSH:
7907		case DIF_OP_RLDSW:
7908		case DIF_OP_RLDUB:
7909		case DIF_OP_RLDUH:
7910		case DIF_OP_RLDUW:
7911		case DIF_OP_RLDX:
7912			if (r1 >= nregs)
7913				err += efunc(pc, "invalid register %u\n", r1);
7914			if (r2 != 0)
7915				err += efunc(pc, "non-zero reserved bits\n");
7916			if (rd >= nregs)
7917				err += efunc(pc, "invalid register %u\n", rd);
7918			if (rd == 0)
7919				err += efunc(pc, "cannot write to %r0\n");
7920			break;
7921		case DIF_OP_ULDSB:
7922		case DIF_OP_ULDSH:
7923		case DIF_OP_ULDSW:
7924		case DIF_OP_ULDUB:
7925		case DIF_OP_ULDUH:
7926		case DIF_OP_ULDUW:
7927		case DIF_OP_ULDX:
7928			if (r1 >= nregs)
7929				err += efunc(pc, "invalid register %u\n", r1);
7930			if (r2 != 0)
7931				err += efunc(pc, "non-zero reserved bits\n");
7932			if (rd >= nregs)
7933				err += efunc(pc, "invalid register %u\n", rd);
7934			if (rd == 0)
7935				err += efunc(pc, "cannot write to %r0\n");
7936			break;
7937		case DIF_OP_STB:
7938		case DIF_OP_STH:
7939		case DIF_OP_STW:
7940		case DIF_OP_STX:
7941			if (r1 >= nregs)
7942				err += efunc(pc, "invalid register %u\n", r1);
7943			if (r2 != 0)
7944				err += efunc(pc, "non-zero reserved bits\n");
7945			if (rd >= nregs)
7946				err += efunc(pc, "invalid register %u\n", rd);
7947			if (rd == 0)
7948				err += efunc(pc, "cannot write to 0 address\n");
7949			break;
7950		case DIF_OP_CMP:
7951		case DIF_OP_SCMP:
7952			if (r1 >= nregs)
7953				err += efunc(pc, "invalid register %u\n", r1);
7954			if (r2 >= nregs)
7955				err += efunc(pc, "invalid register %u\n", r2);
7956			if (rd != 0)
7957				err += efunc(pc, "non-zero reserved bits\n");
7958			break;
7959		case DIF_OP_TST:
7960			if (r1 >= nregs)
7961				err += efunc(pc, "invalid register %u\n", r1);
7962			if (r2 != 0 || rd != 0)
7963				err += efunc(pc, "non-zero reserved bits\n");
7964			break;
7965		case DIF_OP_BA:
7966		case DIF_OP_BE:
7967		case DIF_OP_BNE:
7968		case DIF_OP_BG:
7969		case DIF_OP_BGU:
7970		case DIF_OP_BGE:
7971		case DIF_OP_BGEU:
7972		case DIF_OP_BL:
7973		case DIF_OP_BLU:
7974		case DIF_OP_BLE:
7975		case DIF_OP_BLEU:
7976			if (label >= dp->dtdo_len) {
7977				err += efunc(pc, "invalid branch target %u\n",
7978				    label);
7979			}
7980			if (label <= pc) {
7981				err += efunc(pc, "backward branch to %u\n",
7982				    label);
7983			}
7984			break;
7985		case DIF_OP_RET:
7986			if (r1 != 0 || r2 != 0)
7987				err += efunc(pc, "non-zero reserved bits\n");
7988			if (rd >= nregs)
7989				err += efunc(pc, "invalid register %u\n", rd);
7990			break;
7991		case DIF_OP_NOP:
7992		case DIF_OP_POPTS:
7993		case DIF_OP_FLUSHTS:
7994			if (r1 != 0 || r2 != 0 || rd != 0)
7995				err += efunc(pc, "non-zero reserved bits\n");
7996			break;
7997		case DIF_OP_SETX:
7998			if (DIF_INSTR_INTEGER(instr) >= dp->dtdo_intlen) {
7999				err += efunc(pc, "invalid integer ref %u\n",
8000				    DIF_INSTR_INTEGER(instr));
8001			}
8002			if (rd >= nregs)
8003				err += efunc(pc, "invalid register %u\n", rd);
8004			if (rd == 0)
8005				err += efunc(pc, "cannot write to %r0\n");
8006			break;
8007		case DIF_OP_SETS:
8008			if (DIF_INSTR_STRING(instr) >= dp->dtdo_strlen) {
8009				err += efunc(pc, "invalid string ref %u\n",
8010				    DIF_INSTR_STRING(instr));
8011			}
8012			if (rd >= nregs)
8013				err += efunc(pc, "invalid register %u\n", rd);
8014			if (rd == 0)
8015				err += efunc(pc, "cannot write to %r0\n");
8016			break;
8017		case DIF_OP_LDGA:
8018		case DIF_OP_LDTA:
8019			if (r1 > DIF_VAR_ARRAY_MAX)
8020				err += efunc(pc, "invalid array %u\n", r1);
8021			if (r2 >= nregs)
8022				err += efunc(pc, "invalid register %u\n", r2);
8023			if (rd >= nregs)
8024				err += efunc(pc, "invalid register %u\n", rd);
8025			if (rd == 0)
8026				err += efunc(pc, "cannot write to %r0\n");
8027			break;
8028		case DIF_OP_LDGS:
8029		case DIF_OP_LDTS:
8030		case DIF_OP_LDLS:
8031		case DIF_OP_LDGAA:
8032		case DIF_OP_LDTAA:
8033			if (v < DIF_VAR_OTHER_MIN || v > DIF_VAR_OTHER_MAX)
8034				err += efunc(pc, "invalid variable %u\n", v);
8035			if (rd >= nregs)
8036				err += efunc(pc, "invalid register %u\n", rd);
8037			if (rd == 0)
8038				err += efunc(pc, "cannot write to %r0\n");
8039			break;
8040		case DIF_OP_STGS:
8041		case DIF_OP_STTS:
8042		case DIF_OP_STLS:
8043		case DIF_OP_STGAA:
8044		case DIF_OP_STTAA:
8045			if (v < DIF_VAR_OTHER_UBASE || v > DIF_VAR_OTHER_MAX)
8046				err += efunc(pc, "invalid variable %u\n", v);
8047			if (rs >= nregs)
8048				err += efunc(pc, "invalid register %u\n", rs);
8049			break;
8050		case DIF_OP_CALL:
8051			if (subr > DIF_SUBR_MAX)
8052				err += efunc(pc, "invalid subr %u\n", subr);
8053			if (rd >= nregs)
8054				err += efunc(pc, "invalid register %u\n", rd);
8055			if (rd == 0)
8056				err += efunc(pc, "cannot write to %r0\n");
8057
8058			if (subr == DIF_SUBR_COPYOUT ||
8059			    subr == DIF_SUBR_COPYOUTSTR) {
8060				dp->dtdo_destructive = 1;
8061			}
8062			break;
8063		case DIF_OP_PUSHTR:
8064			if (type != DIF_TYPE_STRING && type != DIF_TYPE_CTF)
8065				err += efunc(pc, "invalid ref type %u\n", type);
8066			if (r2 >= nregs)
8067				err += efunc(pc, "invalid register %u\n", r2);
8068			if (rs >= nregs)
8069				err += efunc(pc, "invalid register %u\n", rs);
8070			break;
8071		case DIF_OP_PUSHTV:
8072			if (type != DIF_TYPE_CTF)
8073				err += efunc(pc, "invalid val type %u\n", type);
8074			if (r2 >= nregs)
8075				err += efunc(pc, "invalid register %u\n", r2);
8076			if (rs >= nregs)
8077				err += efunc(pc, "invalid register %u\n", rs);
8078			break;
8079		default:
8080			err += efunc(pc, "invalid opcode %u\n",
8081			    DIF_INSTR_OP(instr));
8082		}
8083	}
8084
8085	if (dp->dtdo_len != 0 &&
8086	    DIF_INSTR_OP(dp->dtdo_buf[dp->dtdo_len - 1]) != DIF_OP_RET) {
8087		err += efunc(dp->dtdo_len - 1,
8088		    "expected 'ret' as last DIF instruction\n");
8089	}
8090
8091	if (!(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF)) {
8092		/*
8093		 * If we're not returning by reference, the size must be either
8094		 * 0 or the size of one of the base types.
8095		 */
8096		switch (dp->dtdo_rtype.dtdt_size) {
8097		case 0:
8098		case sizeof (uint8_t):
8099		case sizeof (uint16_t):
8100		case sizeof (uint32_t):
8101		case sizeof (uint64_t):
8102			break;
8103
8104		default:
8105			err += efunc(dp->dtdo_len - 1, "bad return size\n");
8106		}
8107	}
8108
8109	for (i = 0; i < dp->dtdo_varlen && err == 0; i++) {
8110		dtrace_difv_t *v = &dp->dtdo_vartab[i], *existing = NULL;
8111		dtrace_diftype_t *vt, *et;
8112		uint_t id, ndx;
8113
8114		if (v->dtdv_scope != DIFV_SCOPE_GLOBAL &&
8115		    v->dtdv_scope != DIFV_SCOPE_THREAD &&
8116		    v->dtdv_scope != DIFV_SCOPE_LOCAL) {
8117			err += efunc(i, "unrecognized variable scope %d\n",
8118			    v->dtdv_scope);
8119			break;
8120		}
8121
8122		if (v->dtdv_kind != DIFV_KIND_ARRAY &&
8123		    v->dtdv_kind != DIFV_KIND_SCALAR) {
8124			err += efunc(i, "unrecognized variable type %d\n",
8125			    v->dtdv_kind);
8126			break;
8127		}
8128
8129		if ((id = v->dtdv_id) > DIF_VARIABLE_MAX) {
8130			err += efunc(i, "%d exceeds variable id limit\n", id);
8131			break;
8132		}
8133
8134		if (id < DIF_VAR_OTHER_UBASE)
8135			continue;
8136
8137		/*
8138		 * For user-defined variables, we need to check that this
8139		 * definition is identical to any previous definition that we
8140		 * encountered.
8141		 */
8142		ndx = id - DIF_VAR_OTHER_UBASE;
8143
8144		switch (v->dtdv_scope) {
8145		case DIFV_SCOPE_GLOBAL:
8146			if (ndx < vstate->dtvs_nglobals) {
8147				dtrace_statvar_t *svar;
8148
8149				if ((svar = vstate->dtvs_globals[ndx]) != NULL)
8150					existing = &svar->dtsv_var;
8151			}
8152
8153			break;
8154
8155		case DIFV_SCOPE_THREAD:
8156			if (ndx < vstate->dtvs_ntlocals)
8157				existing = &vstate->dtvs_tlocals[ndx];
8158			break;
8159
8160		case DIFV_SCOPE_LOCAL:
8161			if (ndx < vstate->dtvs_nlocals) {
8162				dtrace_statvar_t *svar;
8163
8164				if ((svar = vstate->dtvs_locals[ndx]) != NULL)
8165					existing = &svar->dtsv_var;
8166			}
8167
8168			break;
8169		}
8170
8171		vt = &v->dtdv_type;
8172
8173		if (vt->dtdt_flags & DIF_TF_BYREF) {
8174			if (vt->dtdt_size == 0) {
8175				err += efunc(i, "zero-sized variable\n");
8176				break;
8177			}
8178
8179			if (v->dtdv_scope == DIFV_SCOPE_GLOBAL &&
8180			    vt->dtdt_size > dtrace_global_maxsize) {
8181				err += efunc(i, "oversized by-ref global\n");
8182				break;
8183			}
8184		}
8185
8186		if (existing == NULL || existing->dtdv_id == 0)
8187			continue;
8188
8189		ASSERT(existing->dtdv_id == v->dtdv_id);
8190		ASSERT(existing->dtdv_scope == v->dtdv_scope);
8191
8192		if (existing->dtdv_kind != v->dtdv_kind)
8193			err += efunc(i, "%d changed variable kind\n", id);
8194
8195		et = &existing->dtdv_type;
8196
8197		if (vt->dtdt_flags != et->dtdt_flags) {
8198			err += efunc(i, "%d changed variable type flags\n", id);
8199			break;
8200		}
8201
8202		if (vt->dtdt_size != 0 && vt->dtdt_size != et->dtdt_size) {
8203			err += efunc(i, "%d changed variable type size\n", id);
8204			break;
8205		}
8206	}
8207
8208	return (err);
8209}
8210
8211/*
8212 * Validate a DTrace DIF object that is to be used as a helper.  Helpers
8213 * are much more constrained than normal DIFOs.  Specifically, they may
8214 * not:
8215 *
8216 * 1. Make calls to subroutines other than copyin(), copyinstr() or
8217 *    miscellaneous string routines.
8218 * 2. Access DTrace variables other than the args[] array, and the
8219 *    curthread, pid, ppid, tid, execname, zonename, uid and gid variables.
8220 * 3. Have thread-local variables.
8221 * 4. Have dynamic variables.
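 *
 * For example (illustrative D source, not compiler output), a helper that
 * evaluates copyinstr(arg0) passes these checks, whereas one that stores to
 * a thread-local variable (self->x = 0) or calls a disallowed subroutine
 * (e.g. rand()) fails with "illegal dynamic variable store" or "invalid
 * subr", respectively.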
8222 */
8223static int
8224dtrace_difo_validate_helper(dtrace_difo_t *dp)
8225{
8226	int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
8227	int err = 0;
8228	uint_t pc;
8229
8230	for (pc = 0; pc < dp->dtdo_len; pc++) {
8231		dif_instr_t instr = dp->dtdo_buf[pc];
8232
8233		uint_t v = DIF_INSTR_VAR(instr);
8234		uint_t subr = DIF_INSTR_SUBR(instr);
8235		uint_t op = DIF_INSTR_OP(instr);
8236
8237		switch (op) {
8238		case DIF_OP_OR:
8239		case DIF_OP_XOR:
8240		case DIF_OP_AND:
8241		case DIF_OP_SLL:
8242		case DIF_OP_SRL:
8243		case DIF_OP_SRA:
8244		case DIF_OP_SUB:
8245		case DIF_OP_ADD:
8246		case DIF_OP_MUL:
8247		case DIF_OP_SDIV:
8248		case DIF_OP_UDIV:
8249		case DIF_OP_SREM:
8250		case DIF_OP_UREM:
8251		case DIF_OP_COPYS:
8252		case DIF_OP_NOT:
8253		case DIF_OP_MOV:
8254		case DIF_OP_RLDSB:
8255		case DIF_OP_RLDSH:
8256		case DIF_OP_RLDSW:
8257		case DIF_OP_RLDUB:
8258		case DIF_OP_RLDUH:
8259		case DIF_OP_RLDUW:
8260		case DIF_OP_RLDX:
8261		case DIF_OP_ULDSB:
8262		case DIF_OP_ULDSH:
8263		case DIF_OP_ULDSW:
8264		case DIF_OP_ULDUB:
8265		case DIF_OP_ULDUH:
8266		case DIF_OP_ULDUW:
8267		case DIF_OP_ULDX:
8268		case DIF_OP_STB:
8269		case DIF_OP_STH:
8270		case DIF_OP_STW:
8271		case DIF_OP_STX:
8272		case DIF_OP_ALLOCS:
8273		case DIF_OP_CMP:
8274		case DIF_OP_SCMP:
8275		case DIF_OP_TST:
8276		case DIF_OP_BA:
8277		case DIF_OP_BE:
8278		case DIF_OP_BNE:
8279		case DIF_OP_BG:
8280		case DIF_OP_BGU:
8281		case DIF_OP_BGE:
8282		case DIF_OP_BGEU:
8283		case DIF_OP_BL:
8284		case DIF_OP_BLU:
8285		case DIF_OP_BLE:
8286		case DIF_OP_BLEU:
8287		case DIF_OP_RET:
8288		case DIF_OP_NOP:
8289		case DIF_OP_POPTS:
8290		case DIF_OP_FLUSHTS:
8291		case DIF_OP_SETX:
8292		case DIF_OP_SETS:
8293		case DIF_OP_LDGA:
8294		case DIF_OP_LDLS:
8295		case DIF_OP_STGS:
8296		case DIF_OP_STLS:
8297		case DIF_OP_PUSHTR:
8298		case DIF_OP_PUSHTV:
8299			break;
8300
8301		case DIF_OP_LDGS:
8302			if (v >= DIF_VAR_OTHER_UBASE)
8303				break;
8304
8305			if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9)
8306				break;
8307
8308			if (v == DIF_VAR_CURTHREAD || v == DIF_VAR_PID ||
8309			    v == DIF_VAR_PPID || v == DIF_VAR_TID ||
8310			    v == DIF_VAR_EXECNAME || v == DIF_VAR_ZONENAME ||
8311			    v == DIF_VAR_UID || v == DIF_VAR_GID)
8312				break;
8313
8314			err += efunc(pc, "illegal variable %u\n", v);
8315			break;
8316
8317		case DIF_OP_LDTA:
8318		case DIF_OP_LDTS:
8319		case DIF_OP_LDGAA:
8320		case DIF_OP_LDTAA:
8321			err += efunc(pc, "illegal dynamic variable load\n");
8322			break;
8323
8324		case DIF_OP_STTS:
8325		case DIF_OP_STGAA:
8326		case DIF_OP_STTAA:
8327			err += efunc(pc, "illegal dynamic variable store\n");
8328			break;
8329
8330		case DIF_OP_CALL:
8331			if (subr == DIF_SUBR_ALLOCA ||
8332			    subr == DIF_SUBR_BCOPY ||
8333			    subr == DIF_SUBR_COPYIN ||
8334			    subr == DIF_SUBR_COPYINTO ||
8335			    subr == DIF_SUBR_COPYINSTR ||
8336			    subr == DIF_SUBR_INDEX ||
8337			    subr == DIF_SUBR_INET_NTOA ||
8338			    subr == DIF_SUBR_INET_NTOA6 ||
8339			    subr == DIF_SUBR_INET_NTOP ||
8340			    subr == DIF_SUBR_LLTOSTR ||
8341			    subr == DIF_SUBR_RINDEX ||
8342			    subr == DIF_SUBR_STRCHR ||
8343			    subr == DIF_SUBR_STRJOIN ||
8344			    subr == DIF_SUBR_STRRCHR ||
8345			    subr == DIF_SUBR_STRSTR ||
8346			    subr == DIF_SUBR_HTONS ||
8347			    subr == DIF_SUBR_HTONL ||
8348			    subr == DIF_SUBR_HTONLL ||
8349			    subr == DIF_SUBR_NTOHS ||
8350			    subr == DIF_SUBR_NTOHL ||
8351			    subr == DIF_SUBR_NTOHLL)
8352				break;
8353
8354			err += efunc(pc, "invalid subr %u\n", subr);
8355			break;
8356
8357		default:
8358			err += efunc(pc, "invalid opcode %u\n",
8359			    DIF_INSTR_OP(instr));
8360		}
8361	}
8362
8363	return (err);
8364}
8365
8366/*
8367 * Returns 1 if the expression in the DIF object can be cached on a per-thread
8368 * basis; 0 if not.
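 *
 * For example, a predicate such as /pid == 1234/ refers only to a variable
 * (DIF_VAR_PID) that cannot change over the lifetime of a thread and loads
 * no memory, so it is cacheable; a predicate such as /args[0] == 0/
 * performs an array load (DIF_OP_LDGA) and is not.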
8369 */
8370static int
8371dtrace_difo_cacheable(dtrace_difo_t *dp)
8372{
8373	int i;
8374
8375	if (dp == NULL)
8376		return (0);
8377
8378	for (i = 0; i < dp->dtdo_varlen; i++) {
8379		dtrace_difv_t *v = &dp->dtdo_vartab[i];
8380
8381		if (v->dtdv_scope != DIFV_SCOPE_GLOBAL)
8382			continue;
8383
8384		switch (v->dtdv_id) {
8385		case DIF_VAR_CURTHREAD:
8386		case DIF_VAR_PID:
8387		case DIF_VAR_TID:
8388		case DIF_VAR_EXECNAME:
8389		case DIF_VAR_ZONENAME:
8390			break;
8391
8392		default:
8393			return (0);
8394		}
8395	}
8396
8397	/*
8398	 * This DIF object may be cacheable.  Now we need to look for any
8399	 * array loading instructions, any memory loading instructions, or
8400	 * any stores to thread-local variables.
8401	 */
8402	for (i = 0; i < dp->dtdo_len; i++) {
8403		uint_t op = DIF_INSTR_OP(dp->dtdo_buf[i]);
8404
8405		if ((op >= DIF_OP_LDSB && op <= DIF_OP_LDX) ||
8406		    (op >= DIF_OP_ULDSB && op <= DIF_OP_ULDX) ||
8407		    (op >= DIF_OP_RLDSB && op <= DIF_OP_RLDX) ||
8408		    op == DIF_OP_LDGA || op == DIF_OP_STTS)
8409			return (0);
8410	}
8411
8412	return (1);
8413}
8414
8415static void
8416dtrace_difo_hold(dtrace_difo_t *dp)
8417{
8418	int i;
8419
8420	ASSERT(MUTEX_HELD(&dtrace_lock));
8421
8422	dp->dtdo_refcnt++;
8423	ASSERT(dp->dtdo_refcnt != 0);
8424
8425	/*
8426	 * We need to check this DIF object for references to the variable
8427	 * DIF_VAR_VTIMESTAMP.
8428	 */
8429	for (i = 0; i < dp->dtdo_varlen; i++) {
8430		dtrace_difv_t *v = &dp->dtdo_vartab[i];
8431
8432		if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
8433			continue;
8434
8435		if (dtrace_vtime_references++ == 0)
8436			dtrace_vtime_enable();
8437	}
8438}
8439
8440/*
8441 * This routine calculates the dynamic variable chunksize for a given DIF
8442 * object.  The calculation is not fool-proof, and can probably be tricked by
8443 * malicious DIF -- but it works for all compiler-generated DIF.  Because this
8444 * calculation is likely imperfect, dtrace_dynvar() is able to gracefully fail
8445 * if a dynamic variable size exceeds the chunksize.
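 *
 * As a worked illustration:  a thread-local store such as self->x = 0
 * (where x is an 8-byte scalar) is keyed by the thread pointer and the
 * variable ID, both of zero key size, so the computed size is
 * sizeof (dtrace_dynvar_t) plus one additional dtrace_key_t plus the eight
 * bytes of stored data, rounded up to a multiple of sizeof (uint64_t).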
8446 */
8447static void
8448dtrace_difo_chunksize(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
8449{
8450	uint64_t sval;
8451	dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
8452	const dif_instr_t *text = dp->dtdo_buf;
8453	uint_t pc, srd = 0;
8454	uint_t ttop = 0;
8455	size_t size, ksize;
8456	uint_t id, i;
8457
8458	for (pc = 0; pc < dp->dtdo_len; pc++) {
8459		dif_instr_t instr = text[pc];
8460		uint_t op = DIF_INSTR_OP(instr);
8461		uint_t rd = DIF_INSTR_RD(instr);
8462		uint_t r1 = DIF_INSTR_R1(instr);
8463		uint_t nkeys = 0;
8464		uchar_t scope;
8465
8466		dtrace_key_t *key = tupregs;
8467
8468		switch (op) {
8469		case DIF_OP_SETX:
8470			sval = dp->dtdo_inttab[DIF_INSTR_INTEGER(instr)];
8471			srd = rd;
8472			continue;
8473
8474		case DIF_OP_STTS:
8475			key = &tupregs[DIF_DTR_NREGS];
8476			key[0].dttk_size = 0;
8477			key[1].dttk_size = 0;
8478			nkeys = 2;
8479			scope = DIFV_SCOPE_THREAD;
8480			break;
8481
8482		case DIF_OP_STGAA:
8483		case DIF_OP_STTAA:
8484			nkeys = ttop;
8485
8486			if (DIF_INSTR_OP(instr) == DIF_OP_STTAA)
8487				key[nkeys++].dttk_size = 0;
8488
8489			key[nkeys++].dttk_size = 0;
8490
8491			if (op == DIF_OP_STTAA) {
8492				scope = DIFV_SCOPE_THREAD;
8493			} else {
8494				scope = DIFV_SCOPE_GLOBAL;
8495			}
8496
8497			break;
8498
8499		case DIF_OP_PUSHTR:
8500			if (ttop == DIF_DTR_NREGS)
8501				return;
8502
8503			if ((srd == 0 || sval == 0) && r1 == DIF_TYPE_STRING) {
8504				/*
8505				 * If the register for the size of the "pushtr"
8506				 * is %r0 (or the value is 0) and the type is
8507				 * a string, we'll use the system-wide default
8508				 * string size.
8509				 */
8510				tupregs[ttop++].dttk_size =
8511				    dtrace_strsize_default;
8512			} else {
8513				if (srd == 0)
8514					return;
8515
8516				tupregs[ttop++].dttk_size = sval;
8517			}
8518
8519			break;
8520
8521		case DIF_OP_PUSHTV:
8522			if (ttop == DIF_DTR_NREGS)
8523				return;
8524
8525			tupregs[ttop++].dttk_size = 0;
8526			break;
8527
8528		case DIF_OP_FLUSHTS:
8529			ttop = 0;
8530			break;
8531
8532		case DIF_OP_POPTS:
8533			if (ttop != 0)
8534				ttop--;
8535			break;
8536		}
8537
8538		sval = 0;
8539		srd = 0;
8540
8541		if (nkeys == 0)
8542			continue;
8543
8544		/*
8545		 * We have a dynamic variable allocation; calculate its size.
8546		 */
8547		for (ksize = 0, i = 0; i < nkeys; i++)
8548			ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
8549
8550		size = sizeof (dtrace_dynvar_t);
8551		size += sizeof (dtrace_key_t) * (nkeys - 1);
8552		size += ksize;
8553
8554		/*
8555		 * Now we need to determine the size of the stored data.
8556		 */
8557		id = DIF_INSTR_VAR(instr);
8558
8559		for (i = 0; i < dp->dtdo_varlen; i++) {
8560			dtrace_difv_t *v = &dp->dtdo_vartab[i];
8561
8562			if (v->dtdv_id == id && v->dtdv_scope == scope) {
8563				size += v->dtdv_type.dtdt_size;
8564				break;
8565			}
8566		}
8567
8568		if (i == dp->dtdo_varlen)
8569			return;
8570
8571		/*
8572		 * We have the size.  If this is larger than the chunk size
8573		 * for our dynamic variable state, reset the chunk size.
8574		 */
8575		size = P2ROUNDUP(size, sizeof (uint64_t));
8576
8577		if (size > vstate->dtvs_dynvars.dtds_chunksize)
8578			vstate->dtvs_dynvars.dtds_chunksize = size;
8579	}
8580}
8581
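/*
 * Associate a DIFO with the specified vstate:  grow the per-scope variable
 * arrays (doubling them as needed) until every user-defined variable in the
 * DIFO's variable table has a backing slot, recompute the dynamic variable
 * chunksize, and take a hold on the DIFO.
 */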
8582static void
8583dtrace_difo_init(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
8584{
8585	int i, oldsvars, osz, nsz, otlocals, ntlocals;
8586	uint_t id;
8587
8588	ASSERT(MUTEX_HELD(&dtrace_lock));
8589	ASSERT(dp->dtdo_buf != NULL && dp->dtdo_len != 0);
8590
8591	for (i = 0; i < dp->dtdo_varlen; i++) {
8592		dtrace_difv_t *v = &dp->dtdo_vartab[i];
8593		dtrace_statvar_t *svar, ***svarp;
8594		size_t dsize = 0;
8595		uint8_t scope = v->dtdv_scope;
8596		int *np;
8597
8598		if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
8599			continue;
8600
8601		id -= DIF_VAR_OTHER_UBASE;
8602
8603		switch (scope) {
8604		case DIFV_SCOPE_THREAD:
8605			while (id >= (otlocals = vstate->dtvs_ntlocals)) {
8606				dtrace_difv_t *tlocals;
8607
8608				if ((ntlocals = (otlocals << 1)) == 0)
8609					ntlocals = 1;
8610
8611				osz = otlocals * sizeof (dtrace_difv_t);
8612				nsz = ntlocals * sizeof (dtrace_difv_t);
8613
8614				tlocals = kmem_zalloc(nsz, KM_SLEEP);
8615
8616				if (osz != 0) {
8617					bcopy(vstate->dtvs_tlocals,
8618					    tlocals, osz);
8619					kmem_free(vstate->dtvs_tlocals, osz);
8620				}
8621
8622				vstate->dtvs_tlocals = tlocals;
8623				vstate->dtvs_ntlocals = ntlocals;
8624			}
8625
8626			vstate->dtvs_tlocals[id] = *v;
8627			continue;
8628
8629		case DIFV_SCOPE_LOCAL:
8630			np = &vstate->dtvs_nlocals;
8631			svarp = &vstate->dtvs_locals;
8632
8633			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
8634				dsize = NCPU * (v->dtdv_type.dtdt_size +
8635				    sizeof (uint64_t));
8636			else
8637				dsize = NCPU * sizeof (uint64_t);
8638
8639			break;
8640
8641		case DIFV_SCOPE_GLOBAL:
8642			np = &vstate->dtvs_nglobals;
8643			svarp = &vstate->dtvs_globals;
8644
8645			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
8646				dsize = v->dtdv_type.dtdt_size +
8647				    sizeof (uint64_t);
8648
8649			break;
8650
8651		default:
8652			ASSERT(0);
8653		}
8654
8655		while (id >= (oldsvars = *np)) {
8656			dtrace_statvar_t **statics;
8657			int newsvars, oldsize, newsize;
8658
8659			if ((newsvars = (oldsvars << 1)) == 0)
8660				newsvars = 1;
8661
8662			oldsize = oldsvars * sizeof (dtrace_statvar_t *);
8663			newsize = newsvars * sizeof (dtrace_statvar_t *);
8664
8665			statics = kmem_zalloc(newsize, KM_SLEEP);
8666
8667			if (oldsize != 0) {
8668				bcopy(*svarp, statics, oldsize);
8669				kmem_free(*svarp, oldsize);
8670			}
8671
8672			*svarp = statics;
8673			*np = newsvars;
8674		}
8675
8676		if ((svar = (*svarp)[id]) == NULL) {
8677			svar = kmem_zalloc(sizeof (dtrace_statvar_t), KM_SLEEP);
8678			svar->dtsv_var = *v;
8679
8680			if ((svar->dtsv_size = dsize) != 0) {
8681				svar->dtsv_data = (uint64_t)(uintptr_t)
8682				    kmem_zalloc(dsize, KM_SLEEP);
8683			}
8684
8685			(*svarp)[id] = svar;
8686		}
8687
8688		svar->dtsv_refcnt++;
8689	}
8690
8691	dtrace_difo_chunksize(dp, vstate);
8692	dtrace_difo_hold(dp);
8693}
8694
8695static dtrace_difo_t *
8696dtrace_difo_duplicate(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
8697{
8698	dtrace_difo_t *new;
8699	size_t sz;
8700
8701	ASSERT(dp->dtdo_buf != NULL);
8702	ASSERT(dp->dtdo_refcnt != 0);
8703
8704	new = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
8705
8706	ASSERT(dp->dtdo_buf != NULL);
8707	sz = dp->dtdo_len * sizeof (dif_instr_t);
8708	new->dtdo_buf = kmem_alloc(sz, KM_SLEEP);
8709	bcopy(dp->dtdo_buf, new->dtdo_buf, sz);
8710	new->dtdo_len = dp->dtdo_len;
8711
8712	if (dp->dtdo_strtab != NULL) {
8713		ASSERT(dp->dtdo_strlen != 0);
8714		new->dtdo_strtab = kmem_alloc(dp->dtdo_strlen, KM_SLEEP);
8715		bcopy(dp->dtdo_strtab, new->dtdo_strtab, dp->dtdo_strlen);
8716		new->dtdo_strlen = dp->dtdo_strlen;
8717	}
8718
8719	if (dp->dtdo_inttab != NULL) {
8720		ASSERT(dp->dtdo_intlen != 0);
8721		sz = dp->dtdo_intlen * sizeof (uint64_t);
8722		new->dtdo_inttab = kmem_alloc(sz, KM_SLEEP);
8723		bcopy(dp->dtdo_inttab, new->dtdo_inttab, sz);
8724		new->dtdo_intlen = dp->dtdo_intlen;
8725	}
8726
8727	if (dp->dtdo_vartab != NULL) {
8728		ASSERT(dp->dtdo_varlen != 0);
8729		sz = dp->dtdo_varlen * sizeof (dtrace_difv_t);
8730		new->dtdo_vartab = kmem_alloc(sz, KM_SLEEP);
8731		bcopy(dp->dtdo_vartab, new->dtdo_vartab, sz);
8732		new->dtdo_varlen = dp->dtdo_varlen;
8733	}
8734
8735	dtrace_difo_init(new, vstate);
8736	return (new);
8737}
8738
8739static void
8740dtrace_difo_destroy(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
8741{
8742	int i;
8743
8744	ASSERT(dp->dtdo_refcnt == 0);
8745
8746	for (i = 0; i < dp->dtdo_varlen; i++) {
8747		dtrace_difv_t *v = &dp->dtdo_vartab[i];
8748		dtrace_statvar_t *svar, **svarp;
8749		uint_t id;
8750		uint8_t scope = v->dtdv_scope;
8751		int *np;
8752
8753		switch (scope) {
8754		case DIFV_SCOPE_THREAD:
8755			continue;
8756
8757		case DIFV_SCOPE_LOCAL:
8758			np = &vstate->dtvs_nlocals;
8759			svarp = vstate->dtvs_locals;
8760			break;
8761
8762		case DIFV_SCOPE_GLOBAL:
8763			np = &vstate->dtvs_nglobals;
8764			svarp = vstate->dtvs_globals;
8765			break;
8766
8767		default:
8768			ASSERT(0);
8769		}
8770
8771		if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
8772			continue;
8773
8774		id -= DIF_VAR_OTHER_UBASE;
8775		ASSERT(id < *np);
8776
8777		svar = svarp[id];
8778		ASSERT(svar != NULL);
8779		ASSERT(svar->dtsv_refcnt > 0);
8780
8781		if (--svar->dtsv_refcnt > 0)
8782			continue;
8783
8784		if (svar->dtsv_size != 0) {
8785			ASSERT(svar->dtsv_data != NULL);
8786			kmem_free((void *)(uintptr_t)svar->dtsv_data,
8787			    svar->dtsv_size);
8788		}
8789
8790		kmem_free(svar, sizeof (dtrace_statvar_t));
8791		svarp[id] = NULL;
8792	}
8793
8794	kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
8795	kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
8796	kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
8797	kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
8798
8799	kmem_free(dp, sizeof (dtrace_difo_t));
8800}
8801
8802static void
8803dtrace_difo_release(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
8804{
8805	int i;
8806
8807	ASSERT(MUTEX_HELD(&dtrace_lock));
8808	ASSERT(dp->dtdo_refcnt != 0);
8809
8810	for (i = 0; i < dp->dtdo_varlen; i++) {
8811		dtrace_difv_t *v = &dp->dtdo_vartab[i];
8812
8813		if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
8814			continue;
8815
8816		ASSERT(dtrace_vtime_references > 0);
8817		if (--dtrace_vtime_references == 0)
8818			dtrace_vtime_disable();
8819	}
8820
8821	if (--dp->dtdo_refcnt == 0)
8822		dtrace_difo_destroy(dp, vstate);
8823}
8824
8825/*
8826 * DTrace Format Functions
8827 */
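/*
 * Format handles are 1-based:  dtrace_format_add() returns 0 on failure,
 * and callers pass the non-zero handle back to dtrace_format_remove(),
 * which indexes dts_formats at (format - 1).
 */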
8828static uint16_t
8829dtrace_format_add(dtrace_state_t *state, char *str)
8830{
8831	char *fmt, **new;
8832	uint16_t ndx, len = strlen(str) + 1;
8833
8834	fmt = kmem_zalloc(len, KM_SLEEP);
8835	bcopy(str, fmt, len);
8836
8837	for (ndx = 0; ndx < state->dts_nformats; ndx++) {
8838		if (state->dts_formats[ndx] == NULL) {
8839			state->dts_formats[ndx] = fmt;
8840			return (ndx + 1);
8841		}
8842	}
8843
8844	if (state->dts_nformats == USHRT_MAX) {
8845		/*
8846		 * This is only likely if a denial-of-service attack is being
8847		 * attempted.  As such, it's okay to fail silently here.
8848		 */
8849		kmem_free(fmt, len);
8850		return (0);
8851	}
8852
8853	/*
8854	 * For simplicity, we always resize the formats array to be exactly the
8855	 * number of formats.
8856	 */
8857	ndx = state->dts_nformats++;
8858	new = kmem_alloc((ndx + 1) * sizeof (char *), KM_SLEEP);
8859
8860	if (state->dts_formats != NULL) {
8861		ASSERT(ndx != 0);
8862		bcopy(state->dts_formats, new, ndx * sizeof (char *));
8863		kmem_free(state->dts_formats, ndx * sizeof (char *));
8864	}
8865
8866	state->dts_formats = new;
8867	state->dts_formats[ndx] = fmt;
8868
8869	return (ndx + 1);
8870}
8871
8872static void
8873dtrace_format_remove(dtrace_state_t *state, uint16_t format)
8874{
8875	char *fmt;
8876
8877	ASSERT(state->dts_formats != NULL);
8878	ASSERT(format <= state->dts_nformats);
8879	ASSERT(state->dts_formats[format - 1] != NULL);
8880
8881	fmt = state->dts_formats[format - 1];
8882	kmem_free(fmt, strlen(fmt) + 1);
8883	state->dts_formats[format - 1] = NULL;
8884}
8885
8886static void
8887dtrace_format_destroy(dtrace_state_t *state)
8888{
8889	int i;
8890
8891	if (state->dts_nformats == 0) {
8892		ASSERT(state->dts_formats == NULL);
8893		return;
8894	}
8895
8896	ASSERT(state->dts_formats != NULL);
8897
8898	for (i = 0; i < state->dts_nformats; i++) {
8899		char *fmt = state->dts_formats[i];
8900
8901		if (fmt == NULL)
8902			continue;
8903
8904		kmem_free(fmt, strlen(fmt) + 1);
8905	}
8906
8907	kmem_free(state->dts_formats, state->dts_nformats * sizeof (char *));
8908	state->dts_nformats = 0;
8909	state->dts_formats = NULL;
8910}
8911
8912/*
8913 * DTrace Predicate Functions
8914 */
8915static dtrace_predicate_t *
8916dtrace_predicate_create(dtrace_difo_t *dp)
8917{
8918	dtrace_predicate_t *pred;
8919
8920	ASSERT(MUTEX_HELD(&dtrace_lock));
8921	ASSERT(dp->dtdo_refcnt != 0);
8922
8923	pred = kmem_zalloc(sizeof (dtrace_predicate_t), KM_SLEEP);
8924	pred->dtp_difo = dp;
8925	pred->dtp_refcnt = 1;
8926
8927	if (!dtrace_difo_cacheable(dp))
8928		return (pred);
8929
8930	if (dtrace_predcache_id == DTRACE_CACHEIDNONE) {
8931		/*
8932		 * This is only theoretically possible -- we have had 2^32
8933		 * cacheable predicates on this machine.  We cannot allow any
8934		 * more predicates to become cacheable:  as unlikely as it is,
8935		 * there may be a thread caching a (now stale) predicate cache
8936		 * ID. (N.B.: the temptation is being successfully resisted to
8937		 * have this cmn_err() "Holy shit -- we executed this code!")
8938		 */
8939		return (pred);
8940	}
8941
8942	pred->dtp_cacheid = dtrace_predcache_id++;
8943
8944	return (pred);
8945}
8946
8947static void
8948dtrace_predicate_hold(dtrace_predicate_t *pred)
8949{
8950	ASSERT(MUTEX_HELD(&dtrace_lock));
8951	ASSERT(pred->dtp_difo != NULL && pred->dtp_difo->dtdo_refcnt != 0);
8952	ASSERT(pred->dtp_refcnt > 0);
8953
8954	pred->dtp_refcnt++;
8955}
8956
8957static void
8958dtrace_predicate_release(dtrace_predicate_t *pred, dtrace_vstate_t *vstate)
8959{
8960	dtrace_difo_t *dp = pred->dtp_difo;
8961
8962	ASSERT(MUTEX_HELD(&dtrace_lock));
8963	ASSERT(dp != NULL && dp->dtdo_refcnt != 0);
8964	ASSERT(pred->dtp_refcnt > 0);
8965
8966	if (--pred->dtp_refcnt == 0) {
8967		dtrace_difo_release(pred->dtp_difo, vstate);
8968		kmem_free(pred, sizeof (dtrace_predicate_t));
8969	}
8970}
8971
8972/*
8973 * DTrace Action Description Functions
8974 */
8975static dtrace_actdesc_t *
8976dtrace_actdesc_create(dtrace_actkind_t kind, uint32_t ntuple,
8977    uint64_t uarg, uint64_t arg)
8978{
8979	dtrace_actdesc_t *act;
8980
8981	ASSERT(!DTRACEACT_ISPRINTFLIKE(kind) || (arg != NULL &&
8982	    arg >= KERNELBASE) || (arg == NULL && kind == DTRACEACT_PRINTA));
8983
8984	act = kmem_zalloc(sizeof (dtrace_actdesc_t), KM_SLEEP);
8985	act->dtad_kind = kind;
8986	act->dtad_ntuple = ntuple;
8987	act->dtad_uarg = uarg;
8988	act->dtad_arg = arg;
8989	act->dtad_refcnt = 1;
8990
8991	return (act);
8992}
8993
8994static void
8995dtrace_actdesc_hold(dtrace_actdesc_t *act)
8996{
8997	ASSERT(act->dtad_refcnt >= 1);
8998	act->dtad_refcnt++;
8999}
9000
9001static void
9002dtrace_actdesc_release(dtrace_actdesc_t *act, dtrace_vstate_t *vstate)
9003{
9004	dtrace_actkind_t kind = act->dtad_kind;
9005	dtrace_difo_t *dp;
9006
9007	ASSERT(act->dtad_refcnt >= 1);
9008
9009	if (--act->dtad_refcnt != 0)
9010		return;
9011
9012	if ((dp = act->dtad_difo) != NULL)
9013		dtrace_difo_release(dp, vstate);
9014
9015	if (DTRACEACT_ISPRINTFLIKE(kind)) {
9016		char *str = (char *)(uintptr_t)act->dtad_arg;
9017
9018		ASSERT((str != NULL && (uintptr_t)str >= KERNELBASE) ||
9019		    (str == NULL && act->dtad_kind == DTRACEACT_PRINTA));
9020
9021		if (str != NULL)
9022			kmem_free(str, strlen(str) + 1);
9023	}
9024
9025	kmem_free(act, sizeof (dtrace_actdesc_t));
9026}
9027
9028/*
9029 * DTrace ECB Functions
9030 */
9031static dtrace_ecb_t *
9032dtrace_ecb_add(dtrace_state_t *state, dtrace_probe_t *probe)
9033{
9034	dtrace_ecb_t *ecb;
9035	dtrace_epid_t epid;
9036
9037	ASSERT(MUTEX_HELD(&dtrace_lock));
9038
9039	ecb = kmem_zalloc(sizeof (dtrace_ecb_t), KM_SLEEP);
9040	ecb->dte_predicate = NULL;
9041	ecb->dte_probe = probe;
9042
9043	/*
9044	 * The default size is the size of the default action: recording
9045	 * the epid.
9046	 */
9047	ecb->dte_size = ecb->dte_needed = sizeof (dtrace_epid_t);
9048	ecb->dte_alignment = sizeof (dtrace_epid_t);
9049
9050	epid = state->dts_epid++;
9051
9052	if (epid - 1 >= state->dts_necbs) {
9053		dtrace_ecb_t **oecbs = state->dts_ecbs, **ecbs;
9054		int necbs = state->dts_necbs << 1;
9055
9056		ASSERT(epid == state->dts_necbs + 1);
9057
9058		if (necbs == 0) {
9059			ASSERT(oecbs == NULL);
9060			necbs = 1;
9061		}
9062
9063		ecbs = kmem_zalloc(necbs * sizeof (*ecbs), KM_SLEEP);
9064
9065		if (oecbs != NULL)
9066			bcopy(oecbs, ecbs, state->dts_necbs * sizeof (*ecbs));
9067
9068		dtrace_membar_producer();
9069		state->dts_ecbs = ecbs;
9070
9071		if (oecbs != NULL) {
9072			/*
9073			 * If this state is active, we must dtrace_sync()
9074			 * before we can free the old dts_ecbs array:  we're
9075			 * coming in hot, and there may be active ring
9076			 * buffer processing (which indexes into the dts_ecbs
9077			 * array) on another CPU.
9078			 */
9079			if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
9080				dtrace_sync();
9081
9082			kmem_free(oecbs, state->dts_necbs * sizeof (*ecbs));
9083		}
9084
9085		dtrace_membar_producer();
9086		state->dts_necbs = necbs;
9087	}
9088
9089	ecb->dte_state = state;
9090
9091	ASSERT(state->dts_ecbs[epid - 1] == NULL);
9092	dtrace_membar_producer();
9093	state->dts_ecbs[(ecb->dte_epid = epid) - 1] = ecb;
9094
9095	return (ecb);
9096}
9097
9098static void
9099dtrace_ecb_enable(dtrace_ecb_t *ecb)
9100{
9101	dtrace_probe_t *probe = ecb->dte_probe;
9102
9103	ASSERT(MUTEX_HELD(&cpu_lock));
9104	ASSERT(MUTEX_HELD(&dtrace_lock));
9105	ASSERT(ecb->dte_next == NULL);
9106
9107	if (probe == NULL) {
9108		/*
9109		 * This is the NULL probe -- there's nothing to do.
9110		 */
9111		return;
9112	}
9113
9114	if (probe->dtpr_ecb == NULL) {
9115		dtrace_provider_t *prov = probe->dtpr_provider;
9116
9117		/*
9118		 * We're the first ECB on this probe.
9119		 */
9120		probe->dtpr_ecb = probe->dtpr_ecb_last = ecb;
9121
9122		if (ecb->dte_predicate != NULL)
9123			probe->dtpr_predcache = ecb->dte_predicate->dtp_cacheid;
9124
9125		prov->dtpv_pops.dtps_enable(prov->dtpv_arg,
9126		    probe->dtpr_id, probe->dtpr_arg);
9127	} else {
9128		/*
9129		 * This probe is already active.  Swing the last pointer to
9130		 * point to the new ECB, and issue a dtrace_sync() to assure
9131		 * that all CPUs have seen the change.
9132		 */
9133		ASSERT(probe->dtpr_ecb_last != NULL);
9134		probe->dtpr_ecb_last->dte_next = ecb;
9135		probe->dtpr_ecb_last = ecb;
9136		probe->dtpr_predcache = 0;
9137
9138		dtrace_sync();
9139	}
9140}
9141
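/*
 * Recompute an ECB's record offsets and alignments, along with its total
 * size and needed space, by walking its action list.
 */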
9142static void
9143dtrace_ecb_resize(dtrace_ecb_t *ecb)
9144{
9145	uint32_t maxalign = sizeof (dtrace_epid_t);
9146	uint32_t align = sizeof (uint8_t), offs, diff;
9147	dtrace_action_t *act;
9148	int wastuple = 0;
9149	uint32_t aggbase = UINT32_MAX;
9150	dtrace_state_t *state = ecb->dte_state;
9151
9152	/*
9153	 * If we record anything, we always record the epid.  (And we always
9154	 * record it first.)
9155	 */
9156	offs = sizeof (dtrace_epid_t);
9157	ecb->dte_size = ecb->dte_needed = sizeof (dtrace_epid_t);
9158
9159	for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
9160		dtrace_recdesc_t *rec = &act->dta_rec;
9161
9162		if ((align = rec->dtrd_alignment) > maxalign)
9163			maxalign = align;
9164
9165		if (!wastuple && act->dta_intuple) {
9166			/*
9167			 * This is the first record in a tuple.  Align the
9168			 * offset to be at offset 4 in an 8-byte aligned
9169			 * block.
9170			 */
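			 * (That leaves offs congruent to 4 modulo 8:  the
			 * 8-byte-aligned block at aggbase holds the
			 * aggregation ID in its first four bytes, with the
			 * tuple data following immediately.)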
9171			diff = offs + sizeof (dtrace_aggid_t);
9172
9173			if (diff = (diff & (sizeof (uint64_t) - 1)))
9174				offs += sizeof (uint64_t) - diff;
9175
9176			aggbase = offs - sizeof (dtrace_aggid_t);
9177			ASSERT(!(aggbase & (sizeof (uint64_t) - 1)));
9178		}
9179
9180		/*LINTED*/
9181		if (rec->dtrd_size != 0 && (diff = (offs & (align - 1)))) {
9182			/*
9183			 * The current offset is not properly aligned; align it.
9184			 */
9185			offs += align - diff;
9186		}
9187
9188		rec->dtrd_offset = offs;
9189
9190		if (offs + rec->dtrd_size > ecb->dte_needed) {
9191			ecb->dte_needed = offs + rec->dtrd_size;
9192
9193			if (ecb->dte_needed > state->dts_needed)
9194				state->dts_needed = ecb->dte_needed;
9195		}
9196
9197		if (DTRACEACT_ISAGG(act->dta_kind)) {
9198			dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
9199			dtrace_action_t *first = agg->dtag_first, *prev;
9200
9201			ASSERT(rec->dtrd_size != 0 && first != NULL);
9202			ASSERT(wastuple);
9203			ASSERT(aggbase != UINT32_MAX);
9204
9205			agg->dtag_base = aggbase;
9206
9207			while ((prev = first->dta_prev) != NULL &&
9208			    DTRACEACT_ISAGG(prev->dta_kind)) {
9209				agg = (dtrace_aggregation_t *)prev;
9210				first = agg->dtag_first;
9211			}
9212
9213			if (prev != NULL) {
9214				offs = prev->dta_rec.dtrd_offset +
9215				    prev->dta_rec.dtrd_size;
9216			} else {
9217				offs = sizeof (dtrace_epid_t);
9218			}
9219			wastuple = 0;
9220		} else {
9221			if (!act->dta_intuple)
9222				ecb->dte_size = offs + rec->dtrd_size;
9223
9224			offs += rec->dtrd_size;
9225		}
9226
9227		wastuple = act->dta_intuple;
9228	}
9229
9230	if ((act = ecb->dte_action) != NULL &&
9231	    !(act->dta_kind == DTRACEACT_SPECULATE && act->dta_next == NULL) &&
9232	    ecb->dte_size == sizeof (dtrace_epid_t)) {
9233		/*
9234		 * If the size is still sizeof (dtrace_epid_t), then all
9235		 * actions store no data; set the size to 0.
9236		 */
9237		ecb->dte_alignment = maxalign;
9238		ecb->dte_size = 0;
9239
9240		/*
9241		 * If the needed space is still sizeof (dtrace_epid_t), then
9242		 * all actions need no additional space; set the needed
9243		 * size to 0.
9244		 */
9245		if (ecb->dte_needed == sizeof (dtrace_epid_t))
9246			ecb->dte_needed = 0;
9247
9248		return;
9249	}
9250
9251	/*
9252	 * Set our alignment, and make sure that the dte_size and dte_needed
9253	 * are aligned to the size of an EPID.
9254	 */
9255	ecb->dte_alignment = maxalign;
9256	ecb->dte_size = (ecb->dte_size + (sizeof (dtrace_epid_t) - 1)) &
9257	    ~(sizeof (dtrace_epid_t) - 1);
9258	ecb->dte_needed = (ecb->dte_needed + (sizeof (dtrace_epid_t) - 1)) &
9259	    ~(sizeof (dtrace_epid_t) - 1);
9260	ASSERT(ecb->dte_size <= ecb->dte_needed);
9261}
9262
9263static dtrace_action_t *
9264dtrace_ecb_aggregation_create(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
9265{
9266	dtrace_aggregation_t *agg;
9267	size_t size = sizeof (uint64_t);
9268	int ntuple = desc->dtad_ntuple;
9269	dtrace_action_t *act;
9270	dtrace_recdesc_t *frec;
9271	dtrace_aggid_t aggid;
9272	dtrace_state_t *state = ecb->dte_state;
9273
9274	agg = kmem_zalloc(sizeof (dtrace_aggregation_t), KM_SLEEP);
9275	agg->dtag_ecb = ecb;
9276
9277	ASSERT(DTRACEACT_ISAGG(desc->dtad_kind));
9278
9279	switch (desc->dtad_kind) {
9280	case DTRACEAGG_MIN:
9281		agg->dtag_initial = INT64_MAX;
9282		agg->dtag_aggregate = dtrace_aggregate_min;
9283		break;
9284
9285	case DTRACEAGG_MAX:
9286		agg->dtag_initial = INT64_MIN;
9287		agg->dtag_aggregate = dtrace_aggregate_max;
9288		break;
9289
9290	case DTRACEAGG_COUNT:
9291		agg->dtag_aggregate = dtrace_aggregate_count;
9292		break;
9293
9294	case DTRACEAGG_QUANTIZE:
9295		agg->dtag_aggregate = dtrace_aggregate_quantize;
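		/*
		 * A power-of-two quantization has one bucket for zero plus
		 * one for each positive and each negative power of two
		 * representable in a 64-bit value:  (64 - 1) * 2 + 1 = 127
		 * buckets in all.
		 */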
9296		size = (((sizeof (uint64_t) * NBBY) - 1) * 2 + 1) *
9297		    sizeof (uint64_t);
9298		break;
9299
9300	case DTRACEAGG_LQUANTIZE: {
9301		uint16_t step = DTRACE_LQUANTIZE_STEP(desc->dtad_arg);
9302		uint16_t levels = DTRACE_LQUANTIZE_LEVELS(desc->dtad_arg);
9303
9304		agg->dtag_initial = desc->dtad_arg;
9305		agg->dtag_aggregate = dtrace_aggregate_lquantize;
9306
9307		if (step == 0 || levels == 0)
9308			goto err;
9309
9310		size = levels * sizeof (uint64_t) + 3 * sizeof (uint64_t);
9311		break;
9312	}
9313
9314	case DTRACEAGG_AVG:
9315		agg->dtag_aggregate = dtrace_aggregate_avg;
9316		size = sizeof (uint64_t) * 2;
9317		break;
9318
9319	case DTRACEAGG_STDDEV:
9320		agg->dtag_aggregate = dtrace_aggregate_stddev;
9321		size = sizeof (uint64_t) * 4;
9322		break;
9323
9324	case DTRACEAGG_SUM:
9325		agg->dtag_aggregate = dtrace_aggregate_sum;
9326		break;
9327
9328	default:
9329		goto err;
9330	}
9331
9332	agg->dtag_action.dta_rec.dtrd_size = size;
9333
9334	if (ntuple == 0)
9335		goto err;
9336
9337	/*
9338	 * We must make sure that we have enough actions for the n-tuple.
9339	 */
9340	for (act = ecb->dte_action_last; act != NULL; act = act->dta_prev) {
9341		if (DTRACEACT_ISAGG(act->dta_kind))
9342			break;
9343
9344		if (--ntuple == 0) {
9345			/*
9346			 * This is the action with which our n-tuple begins.
9347			 */
9348			agg->dtag_first = act;
9349			goto success;
9350		}
9351	}
9352
9353	/*
9354	 * This n-tuple is short by ntuple elements.  Return failure.
9355	 */
9356	ASSERT(ntuple != 0);
9357err:
9358	kmem_free(agg, sizeof (dtrace_aggregation_t));
9359	return (NULL);
9360
9361success:
9362	/*
9363	 * If the last action in the tuple has a size of zero, it's actually
9364	 * an expression argument for the aggregating action.
9365	 */
9366	ASSERT(ecb->dte_action_last != NULL);
9367	act = ecb->dte_action_last;
9368
9369	if (act->dta_kind == DTRACEACT_DIFEXPR) {
9370		ASSERT(act->dta_difo != NULL);
9371
9372		if (act->dta_difo->dtdo_rtype.dtdt_size == 0)
9373			agg->dtag_hasarg = 1;
9374	}
9375
9376	/*
9377	 * We need to allocate an id for this aggregation.
9378	 */
9379	aggid = (dtrace_aggid_t)(uintptr_t)vmem_alloc(state->dts_aggid_arena, 1,
9380	    VM_BESTFIT | VM_SLEEP);
9381
9382	if (aggid - 1 >= state->dts_naggregations) {
9383		dtrace_aggregation_t **oaggs = state->dts_aggregations;
9384		dtrace_aggregation_t **aggs;
9385		int naggs = state->dts_naggregations << 1;
9386		int onaggs = state->dts_naggregations;
9387
9388		ASSERT(aggid == state->dts_naggregations + 1);
9389
9390		if (naggs == 0) {
9391			ASSERT(oaggs == NULL);
9392			naggs = 1;
9393		}
9394
9395		aggs = kmem_zalloc(naggs * sizeof (*aggs), KM_SLEEP);
9396
9397		if (oaggs != NULL) {
9398			bcopy(oaggs, aggs, onaggs * sizeof (*aggs));
9399			kmem_free(oaggs, onaggs * sizeof (*aggs));
9400		}
9401
9402		state->dts_aggregations = aggs;
9403		state->dts_naggregations = naggs;
9404	}
9405
9406	ASSERT(state->dts_aggregations[aggid - 1] == NULL);
9407	state->dts_aggregations[(agg->dtag_id = aggid) - 1] = agg;
9408
9409	frec = &agg->dtag_first->dta_rec;
9410	if (frec->dtrd_alignment < sizeof (dtrace_aggid_t))
9411		frec->dtrd_alignment = sizeof (dtrace_aggid_t);
9412
9413	for (act = agg->dtag_first; act != NULL; act = act->dta_next) {
9414		ASSERT(!act->dta_intuple);
9415		act->dta_intuple = 1;
9416	}
9417
9418	return (&agg->dtag_action);
9419}
9420
9421static void
9422dtrace_ecb_aggregation_destroy(dtrace_ecb_t *ecb, dtrace_action_t *act)
9423{
9424	dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
9425	dtrace_state_t *state = ecb->dte_state;
9426	dtrace_aggid_t aggid = agg->dtag_id;
9427
9428	ASSERT(DTRACEACT_ISAGG(act->dta_kind));
9429	vmem_free(state->dts_aggid_arena, (void *)(uintptr_t)aggid, 1);
9430
9431	ASSERT(state->dts_aggregations[aggid - 1] == agg);
9432	state->dts_aggregations[aggid - 1] = NULL;
9433
9434	kmem_free(agg, sizeof (dtrace_aggregation_t));
9435}
9436
9437static int
9438dtrace_ecb_action_add(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
9439{
9440	dtrace_action_t *action, *last;
9441	dtrace_difo_t *dp = desc->dtad_difo;
9442	uint32_t size = 0, align = sizeof (uint8_t), mask;
9443	uint16_t format = 0;
9444	dtrace_recdesc_t *rec;
9445	dtrace_state_t *state = ecb->dte_state;
9446	dtrace_optval_t *opt = state->dts_options, nframes, strsize;
9447	uint64_t arg = desc->dtad_arg;
9448
9449	ASSERT(MUTEX_HELD(&dtrace_lock));
9450	ASSERT(ecb->dte_action == NULL || ecb->dte_action->dta_refcnt == 1);
9451
9452	if (DTRACEACT_ISAGG(desc->dtad_kind)) {
9453		/*
9454		 * If this is an aggregating action, there must be neither
9455		 * a speculate nor a commit on the action chain.
9456		 */
9457		dtrace_action_t *act;
9458
9459		for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
9460			if (act->dta_kind == DTRACEACT_COMMIT)
9461				return (EINVAL);
9462
9463			if (act->dta_kind == DTRACEACT_SPECULATE)
9464				return (EINVAL);
9465		}
9466
9467		action = dtrace_ecb_aggregation_create(ecb, desc);
9468
9469		if (action == NULL)
9470			return (EINVAL);
9471	} else {
9472		if (DTRACEACT_ISDESTRUCTIVE(desc->dtad_kind) ||
9473		    (desc->dtad_kind == DTRACEACT_DIFEXPR &&
9474		    dp != NULL && dp->dtdo_destructive)) {
9475			state->dts_destructive = 1;
9476		}
9477
9478		switch (desc->dtad_kind) {
9479		case DTRACEACT_PRINTF:
9480		case DTRACEACT_PRINTA:
9481		case DTRACEACT_SYSTEM:
9482		case DTRACEACT_FREOPEN:
9483			/*
9484			 * We know that our arg is a string -- turn it into a
9485			 * format.
9486			 */
9487			if (arg == NULL) {
9488				ASSERT(desc->dtad_kind == DTRACEACT_PRINTA);
9489				format = 0;
9490			} else {
9491				ASSERT(arg != NULL);
9492				ASSERT(arg > KERNELBASE);
9493				format = dtrace_format_add(state,
9494				    (char *)(uintptr_t)arg);
9495			}
9496
9497			/*FALLTHROUGH*/
9498		case DTRACEACT_LIBACT:
9499		case DTRACEACT_DIFEXPR:
9500			if (dp == NULL)
9501				return (EINVAL);
9502
9503			if ((size = dp->dtdo_rtype.dtdt_size) != 0)
9504				break;
9505
9506			if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) {
9507				if (!(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
9508					return (EINVAL);
9509
9510				size = opt[DTRACEOPT_STRSIZE];
9511			}
9512
9513			break;
9514
9515		case DTRACEACT_STACK:
9516			if ((nframes = arg) == 0) {
9517				nframes = opt[DTRACEOPT_STACKFRAMES];
9518				ASSERT(nframes > 0);
9519				arg = nframes;
9520			}
9521
9522			size = nframes * sizeof (pc_t);
9523			break;
9524
9525		case DTRACEACT_JSTACK:
9526			if ((strsize = DTRACE_USTACK_STRSIZE(arg)) == 0)
9527				strsize = opt[DTRACEOPT_JSTACKSTRSIZE];
9528
9529			if ((nframes = DTRACE_USTACK_NFRAMES(arg)) == 0)
9530				nframes = opt[DTRACEOPT_JSTACKFRAMES];
9531
9532			arg = DTRACE_USTACK_ARG(nframes, strsize);
9533
9534			/*FALLTHROUGH*/
9535		case DTRACEACT_USTACK:
9536			if (desc->dtad_kind != DTRACEACT_JSTACK &&
9537			    (nframes = DTRACE_USTACK_NFRAMES(arg)) == 0) {
9538				strsize = DTRACE_USTACK_STRSIZE(arg);
9539				nframes = opt[DTRACEOPT_USTACKFRAMES];
9540				ASSERT(nframes > 0);
9541				arg = DTRACE_USTACK_ARG(nframes, strsize);
9542			}
9543
9544			/*
9545			 * Save a slot for the pid.
9546			 */
9547			size = (nframes + 1) * sizeof (uint64_t);
9548			size += DTRACE_USTACK_STRSIZE(arg);
9549			size = P2ROUNDUP(size, (uint32_t)(sizeof (uintptr_t)));
9550
9551			break;
9552
9553		case DTRACEACT_SYM:
9554		case DTRACEACT_MOD:
9555			if (dp == NULL || ((size = dp->dtdo_rtype.dtdt_size) !=
9556			    sizeof (uint64_t)) ||
9557			    (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
9558				return (EINVAL);
9559			break;
9560
9561		case DTRACEACT_USYM:
9562		case DTRACEACT_UMOD:
9563		case DTRACEACT_UADDR:
9564			if (dp == NULL ||
9565			    (dp->dtdo_rtype.dtdt_size != sizeof (uint64_t)) ||
9566			    (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
9567				return (EINVAL);
9568
9569			/*
9570			 * We have a slot for the pid, plus a slot for the
9571			 * argument.  To keep things simple (aligned with
9572			 * bitness-neutral sizing), we store each as a 64-bit
9573			 * quantity.
9574			 */
9575			size = 2 * sizeof (uint64_t);
9576			break;
9577
9578		case DTRACEACT_STOP:
9579		case DTRACEACT_BREAKPOINT:
9580		case DTRACEACT_PANIC:
9581			break;
9582
9583		case DTRACEACT_CHILL:
9584		case DTRACEACT_DISCARD:
9585		case DTRACEACT_RAISE:
9586			if (dp == NULL)
9587				return (EINVAL);
9588			break;
9589
9590		case DTRACEACT_EXIT:
9591			if (dp == NULL ||
9592			    (size = dp->dtdo_rtype.dtdt_size) != sizeof (int) ||
9593			    (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
9594				return (EINVAL);
9595			break;
9596
9597		case DTRACEACT_SPECULATE:
9598			if (ecb->dte_size > sizeof (dtrace_epid_t))
9599				return (EINVAL);
9600
9601			if (dp == NULL)
9602				return (EINVAL);
9603
9604			state->dts_speculates = 1;
9605			break;
9606
9607		case DTRACEACT_COMMIT: {
9608			dtrace_action_t *act = ecb->dte_action;
9609
9610			for (; act != NULL; act = act->dta_next) {
9611				if (act->dta_kind == DTRACEACT_COMMIT)
9612					return (EINVAL);
9613			}
9614
9615			if (dp == NULL)
9616				return (EINVAL);
9617			break;
9618		}
9619
9620		default:
9621			return (EINVAL);
9622		}
9623
9624		if (size != 0 || desc->dtad_kind == DTRACEACT_SPECULATE) {
9625			/*
9626			 * If this is a data-storing action or a speculate,
9627			 * we must be sure that there isn't a commit on the
9628			 * action chain.
9629			 */
9630			dtrace_action_t *act = ecb->dte_action;
9631
9632			for (; act != NULL; act = act->dta_next) {
9633				if (act->dta_kind == DTRACEACT_COMMIT)
9634					return (EINVAL);
9635			}
9636		}
9637
9638		action = kmem_zalloc(sizeof (dtrace_action_t), KM_SLEEP);
9639		action->dta_rec.dtrd_size = size;
9640	}
9641
9642	action->dta_refcnt = 1;
9643	rec = &action->dta_rec;
9644	size = rec->dtrd_size;
9645
9646	for (mask = sizeof (uint64_t) - 1; size != 0 && mask > 0; mask >>= 1) {
9647		if (!(size & mask)) {
9648			align = mask + 1;
9649			break;
9650		}
9651	}
9652
9653	action->dta_kind = desc->dtad_kind;
9654
9655	if ((action->dta_difo = dp) != NULL)
9656		dtrace_difo_hold(dp);
9657
9658	rec->dtrd_action = action->dta_kind;
9659	rec->dtrd_arg = arg;
9660	rec->dtrd_uarg = desc->dtad_uarg;
9661	rec->dtrd_alignment = (uint16_t)align;
9662	rec->dtrd_format = format;
9663
9664	if ((last = ecb->dte_action_last) != NULL) {
9665		ASSERT(ecb->dte_action != NULL);
9666		action->dta_prev = last;
9667		last->dta_next = action;
9668	} else {
9669		ASSERT(ecb->dte_action == NULL);
9670		ecb->dte_action = action;
9671	}
9672
9673	ecb->dte_action_last = action;
9674
9675	return (0);
9676}
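
/*
 * A minimal user-space sketch of the alignment computation above: the loop
 * derives a record's alignment as the largest power of two (capped at
 * sizeof (uint64_t)) that evenly divides its size, defaulting to byte
 * alignment.  The names here (record_align) are hypothetical, not part of
 * DTrace.
 */
#if 0
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

static size_t
record_align(size_t size)
{
	size_t mask, align = 1;		/* default: byte alignment */

	for (mask = sizeof (uint64_t) - 1; size != 0 && mask > 0; mask >>= 1) {
		if (!(size & mask)) {
			align = mask + 1;
			break;
		}
	}

	return (align);
}

int
main(void)
{
	size_t sizes[] = { 1, 2, 4, 6, 8, 12, 16, 24 };
	size_t i;

	for (i = 0; i < sizeof (sizes) / sizeof (sizes[0]); i++)
		printf("size %2zu -> align %zu\n", sizes[i],
		    record_align(sizes[i]));

	return (0);
}
#endif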
9677
9678static void
9679dtrace_ecb_action_remove(dtrace_ecb_t *ecb)
9680{
9681	dtrace_action_t *act = ecb->dte_action, *next;
9682	dtrace_vstate_t *vstate = &ecb->dte_state->dts_vstate;
9683	dtrace_difo_t *dp;
9684	uint16_t format;
9685
9686	if (act != NULL && act->dta_refcnt > 1) {
9687		ASSERT(act->dta_next == NULL || act->dta_next->dta_refcnt == 1);
9688		act->dta_refcnt--;
9689	} else {
9690		for (; act != NULL; act = next) {
9691			next = act->dta_next;
9692			ASSERT(next != NULL || act == ecb->dte_action_last);
9693			ASSERT(act->dta_refcnt == 1);
9694
9695			if ((format = act->dta_rec.dtrd_format) != 0)
9696				dtrace_format_remove(ecb->dte_state, format);
9697
9698			if ((dp = act->dta_difo) != NULL)
9699				dtrace_difo_release(dp, vstate);
9700
9701			if (DTRACEACT_ISAGG(act->dta_kind)) {
9702				dtrace_ecb_aggregation_destroy(ecb, act);
9703			} else {
9704				kmem_free(act, sizeof (dtrace_action_t));
9705			}
9706		}
9707	}
9708
9709	ecb->dte_action = NULL;
9710	ecb->dte_action_last = NULL;
9711	ecb->dte_size = sizeof (dtrace_epid_t);
9712}
9713
9714static void
9715dtrace_ecb_disable(dtrace_ecb_t *ecb)
9716{
9717	/*
9718	 * We disable the ECB by removing it from its probe.
9719	 */
9720	dtrace_ecb_t *pecb, *prev = NULL;
9721	dtrace_probe_t *probe = ecb->dte_probe;
9722
9723	ASSERT(MUTEX_HELD(&dtrace_lock));
9724
9725	if (probe == NULL) {
9726		/*
9727		 * This is the NULL probe; there is nothing to disable.
9728		 */
9729		return;
9730	}
9731
9732	for (pecb = probe->dtpr_ecb; pecb != NULL; pecb = pecb->dte_next) {
9733		if (pecb == ecb)
9734			break;
9735		prev = pecb;
9736	}
9737
9738	ASSERT(pecb != NULL);
9739
9740	if (prev == NULL) {
9741		probe->dtpr_ecb = ecb->dte_next;
9742	} else {
9743		prev->dte_next = ecb->dte_next;
9744	}
9745
9746	if (ecb == probe->dtpr_ecb_last) {
9747		ASSERT(ecb->dte_next == NULL);
9748		probe->dtpr_ecb_last = prev;
9749	}
9750
9751	/*
9752	 * The ECB has been disconnected from the probe; now sync to assure
9753	 * that all CPUs have seen the change before returning.
9754	 */
9755	dtrace_sync();
9756
9757	if (probe->dtpr_ecb == NULL) {
9758		/*
9759		 * That was the last ECB on the probe; clear the predicate
9760		 * cache ID for the probe, disable it and sync one more time
9761		 * to assure that we'll never hit it again.
9762		 */
9763		dtrace_provider_t *prov = probe->dtpr_provider;
9764
9765		ASSERT(ecb->dte_next == NULL);
9766		ASSERT(probe->dtpr_ecb_last == NULL);
9767		probe->dtpr_predcache = DTRACE_CACHEIDNONE;
9768		prov->dtpv_pops.dtps_disable(prov->dtpv_arg,
9769		    probe->dtpr_id, probe->dtpr_arg);
9770		dtrace_sync();
9771	} else {
9772		/*
9773		 * There is at least one ECB remaining on the probe.  If there
9774		 * is _exactly_ one, set the probe's predicate cache ID to be
9775		 * the predicate cache ID of the remaining ECB.
9776		 */
9777		ASSERT(probe->dtpr_ecb_last != NULL);
9778		ASSERT(probe->dtpr_predcache == DTRACE_CACHEIDNONE);
9779
9780		if (probe->dtpr_ecb == probe->dtpr_ecb_last) {
9781			dtrace_predicate_t *p = probe->dtpr_ecb->dte_predicate;
9782
9783			ASSERT(probe->dtpr_ecb->dte_next == NULL);
9784
9785			if (p != NULL)
9786				probe->dtpr_predcache = p->dtp_cacheid;
9787		}
9788
9789		ecb->dte_next = NULL;
9790	}
9791}
9792
9793static void
9794dtrace_ecb_destroy(dtrace_ecb_t *ecb)
9795{
9796	dtrace_state_t *state = ecb->dte_state;
9797	dtrace_vstate_t *vstate = &state->dts_vstate;
9798	dtrace_predicate_t *pred;
9799	dtrace_epid_t epid = ecb->dte_epid;
9800
9801	ASSERT(MUTEX_HELD(&dtrace_lock));
9802	ASSERT(ecb->dte_next == NULL);
9803	ASSERT(ecb->dte_probe == NULL || ecb->dte_probe->dtpr_ecb != ecb);
9804
9805	if ((pred = ecb->dte_predicate) != NULL)
9806		dtrace_predicate_release(pred, vstate);
9807
9808	dtrace_ecb_action_remove(ecb);
9809
9810	ASSERT(state->dts_ecbs[epid - 1] == ecb);
9811	state->dts_ecbs[epid - 1] = NULL;
9812
9813	kmem_free(ecb, sizeof (dtrace_ecb_t));
9814}
9815
9816static dtrace_ecb_t *
9817dtrace_ecb_create(dtrace_state_t *state, dtrace_probe_t *probe,
9818    dtrace_enabling_t *enab)
9819{
9820	dtrace_ecb_t *ecb;
9821	dtrace_predicate_t *pred;
9822	dtrace_actdesc_t *act;
9823	dtrace_provider_t *prov;
9824	dtrace_ecbdesc_t *desc = enab->dten_current;
9825
9826	ASSERT(MUTEX_HELD(&dtrace_lock));
9827	ASSERT(state != NULL);
9828
9829	ecb = dtrace_ecb_add(state, probe);
9830	ecb->dte_uarg = desc->dted_uarg;
9831
9832	if ((pred = desc->dted_pred.dtpdd_predicate) != NULL) {
9833		dtrace_predicate_hold(pred);
9834		ecb->dte_predicate = pred;
9835	}
9836
9837	if (probe != NULL) {
9838		/*
9839		 * If the provider shows more leg than the consumer is old
9840		 * enough to see, we need to enable the appropriate implicit
9841		 * predicate bits to prevent the ecb from activating at
9842		 * revealing times.
9843		 *
9844		 * Providers specifying DTRACE_PRIV_USER at register time
9845		 * are stating that they need the /proc-style privilege
9846		 * model to be enforced, and this is what DTRACE_COND_OWNER
9847		 * and DTRACE_COND_ZONEOWNER will then do at probe time.
9848		 */
9849		prov = probe->dtpr_provider;
9850		if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLPROC) &&
9851		    (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
9852			ecb->dte_cond |= DTRACE_COND_OWNER;
9853
9854		if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLZONE) &&
9855		    (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
9856			ecb->dte_cond |= DTRACE_COND_ZONEOWNER;
9857
9858		/*
9859		 * If the provider shows us kernel innards and the user
9860		 * is lacking sufficient privilege, enable the
9861		 * DTRACE_COND_USERMODE implicit predicate.
9862		 */
9863		if (!(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) &&
9864		    (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_KERNEL))
9865			ecb->dte_cond |= DTRACE_COND_USERMODE;
9866	}
9867
9868	if (dtrace_ecb_create_cache != NULL) {
9869		/*
9870		 * If we have a cached ecb, we'll use its action list instead
9871		 * of creating our own (saving both time and space).
9872		 */
9873		dtrace_ecb_t *cached = dtrace_ecb_create_cache;
9874		dtrace_action_t *act = cached->dte_action;
9875
9876		if (act != NULL) {
9877			ASSERT(act->dta_refcnt > 0);
9878			act->dta_refcnt++;
9879			ecb->dte_action = act;
9880			ecb->dte_action_last = cached->dte_action_last;
9881			ecb->dte_needed = cached->dte_needed;
9882			ecb->dte_size = cached->dte_size;
9883			ecb->dte_alignment = cached->dte_alignment;
9884		}
9885
9886		return (ecb);
9887	}
9888
9889	for (act = desc->dted_action; act != NULL; act = act->dtad_next) {
9890		if ((enab->dten_error = dtrace_ecb_action_add(ecb, act)) != 0) {
9891			dtrace_ecb_destroy(ecb);
9892			return (NULL);
9893		}
9894	}
9895
9896	dtrace_ecb_resize(ecb);
9897
9898	return (dtrace_ecb_create_cache = ecb);
9899}
9900
9901static int
9902dtrace_ecb_create_enable(dtrace_probe_t *probe, void *arg)
9903{
9904	dtrace_ecb_t *ecb;
9905	dtrace_enabling_t *enab = arg;
9906	dtrace_state_t *state = enab->dten_vstate->dtvs_state;
9907
9908	ASSERT(state != NULL);
9909
9910	if (probe != NULL && probe->dtpr_gen < enab->dten_probegen) {
9911		/*
9912		 * This probe was created in a generation for which this
9913		 * enabling has previously created ECBs; we don't want to
9914		 * enable it again, so just kick out.
9915		 */
9916		return (DTRACE_MATCH_NEXT);
9917	}
9918
9919	if ((ecb = dtrace_ecb_create(state, probe, enab)) == NULL)
9920		return (DTRACE_MATCH_DONE);
9921
9922	dtrace_ecb_enable(ecb);
9923	return (DTRACE_MATCH_NEXT);
9924}
9925
9926static dtrace_ecb_t *
9927dtrace_epid2ecb(dtrace_state_t *state, dtrace_epid_t id)
9928{
9929	dtrace_ecb_t *ecb;
9930
9931	ASSERT(MUTEX_HELD(&dtrace_lock));
9932
9933	if (id == 0 || id > state->dts_necbs)
9934		return (NULL);
9935
9936	ASSERT(state->dts_necbs > 0 && state->dts_ecbs != NULL);
9937	ASSERT((ecb = state->dts_ecbs[id - 1]) == NULL || ecb->dte_epid == id);
9938
9939	return (state->dts_ecbs[id - 1]);
9940}
9941
9942static dtrace_aggregation_t *
9943dtrace_aggid2agg(dtrace_state_t *state, dtrace_aggid_t id)
9944{
9945	dtrace_aggregation_t *agg;
9946
9947	ASSERT(MUTEX_HELD(&dtrace_lock));
9948
9949	if (id == 0 || id > state->dts_naggregations)
9950		return (NULL);
9951
9952	ASSERT(state->dts_naggregations > 0 && state->dts_aggregations != NULL);
9953	ASSERT((agg = state->dts_aggregations[id - 1]) == NULL ||
9954	    agg->dtag_id == id);
9955
9956	return (state->dts_aggregations[id - 1]);
9957}
9958
9959/*
9960 * DTrace Buffer Functions
9961 *
9962 * The following functions manipulate DTrace buffers.  Most of these functions
9963 * are called in the context of establishing or processing consumer state;
9964 * exceptions are explicitly noted.
9965 */
9966
9967/*
9968 * Note:  called from cross call context.  This function switches the two
9969 * buffers on a given CPU.  The atomicity of this operation is assured by
9970 * disabling interrupts while the actual switch takes place; the disabling of
9971 * interrupts serializes the execution with any execution of dtrace_probe() on
9972 * the same CPU.
9973 */
9974static void
9975dtrace_buffer_switch(dtrace_buffer_t *buf)
9976{
9977	caddr_t tomax = buf->dtb_tomax;
9978	caddr_t xamot = buf->dtb_xamot;
9979	dtrace_icookie_t cookie;
9980
9981	ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
9982	ASSERT(!(buf->dtb_flags & DTRACEBUF_RING));
9983
9984	cookie = dtrace_interrupt_disable();
9985	buf->dtb_tomax = xamot;
9986	buf->dtb_xamot = tomax;
9987	buf->dtb_xamot_drops = buf->dtb_drops;
9988	buf->dtb_xamot_offset = buf->dtb_offset;
9989	buf->dtb_xamot_errors = buf->dtb_errors;
9990	buf->dtb_xamot_flags = buf->dtb_flags;
9991	buf->dtb_offset = 0;
9992	buf->dtb_drops = 0;
9993	buf->dtb_errors = 0;
9994	buf->dtb_flags &= ~(DTRACEBUF_ERROR | DTRACEBUF_DROPPED);
9995	dtrace_interrupt_enable(cookie);
9996}
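
/*
 * A user-space sketch of the double-buffer switch above, with the interrupt
 * disabling elided: the active and inactive halves are exchanged and the
 * active half's counters are snapshotted so a consumer can read a stable
 * copy while tracing continues.  The struct and names here are hypothetical.
 */
#if 0
#include <stddef.h>

struct dbuf {
	char *active, *inactive;
	size_t offset, snap_offset;
	unsigned long drops, snap_drops;
};

static void
dbuf_switch(struct dbuf *b)
{
	char *tmp = b->active;

	/* In the kernel, this window is protected by disabling interrupts. */
	b->active = b->inactive;
	b->inactive = tmp;
	b->snap_offset = b->offset;	/* snapshot for the consumer */
	b->snap_drops = b->drops;
	b->offset = 0;			/* the fresh buffer starts empty */
	b->drops = 0;
}
#endif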
9997
9998/*
9999 * Note:  called from cross call context.  This function activates a buffer
10000 * on a CPU.  As with dtrace_buffer_switch(), the atomicity of the operation
10001 * is guaranteed by the disabling of interrupts.
10002 */
10003static void
10004dtrace_buffer_activate(dtrace_state_t *state)
10005{
10006	dtrace_buffer_t *buf;
10007	dtrace_icookie_t cookie = dtrace_interrupt_disable();
10008
10009	buf = &state->dts_buffer[CPU->cpu_id];
10010
10011	if (buf->dtb_tomax != NULL) {
10012		/*
10013		 * We might like to assert that the buffer is marked inactive,
10014		 * but this isn't necessarily true:  the CPU that processes
10015		 * the BEGIN probe has its buffer activated manually.  In
10016		 * this case, we take the (harmless) action of re-clearing
10017		 * the INACTIVE bit.
10018		 */
10019		buf->dtb_flags &= ~DTRACEBUF_INACTIVE;
10020	}
10021
10022	dtrace_interrupt_enable(cookie);
10023}
10024
10025static int
10026dtrace_buffer_alloc(dtrace_buffer_t *bufs, size_t size, int flags,
10027    processorid_t cpu)
10028{
10029	cpu_t *cp;
10030	dtrace_buffer_t *buf;
10031
10032	ASSERT(MUTEX_HELD(&cpu_lock));
10033	ASSERT(MUTEX_HELD(&dtrace_lock));
10034
10035	if (size > dtrace_nonroot_maxsize &&
10036	    !PRIV_POLICY_CHOICE(CRED(), PRIV_ALL, B_FALSE))
10037		return (EFBIG);
10038
10039	cp = cpu_list;
10040
10041	do {
10042		if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
10043			continue;
10044
10045		buf = &bufs[cp->cpu_id];
10046
10047		/*
10048		 * If there is already a buffer allocated for this CPU, it
10049		 * is only possible that this is a DR event.  In this case,
10050		 * the buffer size must match our specified size.
10051		 */
10052		if (buf->dtb_tomax != NULL) {
10053			ASSERT(buf->dtb_size == size);
10054			continue;
10055		}
10056
10057		ASSERT(buf->dtb_xamot == NULL);
10058
10059		if ((buf->dtb_tomax = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
10060			goto err;
10061
10062		buf->dtb_size = size;
10063		buf->dtb_flags = flags;
10064		buf->dtb_offset = 0;
10065		buf->dtb_drops = 0;
10066
10067		if (flags & DTRACEBUF_NOSWITCH)
10068			continue;
10069
10070		if ((buf->dtb_xamot = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
10071			goto err;
10072	} while ((cp = cp->cpu_next) != cpu_list);
10073
10074	return (0);
10075
10076err:
10077	cp = cpu_list;
10078
10079	do {
10080		if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
10081			continue;
10082
10083		buf = &bufs[cp->cpu_id];
10084
10085		if (buf->dtb_xamot != NULL) {
10086			ASSERT(buf->dtb_tomax != NULL);
10087			ASSERT(buf->dtb_size == size);
10088			kmem_free(buf->dtb_xamot, size);
10089		}
10090
10091		if (buf->dtb_tomax != NULL) {
10092			ASSERT(buf->dtb_size == size);
10093			kmem_free(buf->dtb_tomax, size);
10094		}
10095
10096		buf->dtb_tomax = NULL;
10097		buf->dtb_xamot = NULL;
10098		buf->dtb_size = 0;
10099	} while ((cp = cp->cpu_next) != cpu_list);
10100
10101	return (ENOMEM);
10102}
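
/*
 * A minimal sketch of the all-or-nothing allocation pattern above: attempt
 * a non-blocking allocation for each slot and, on the first failure, walk
 * back over the slots already filled, releasing each before returning
 * ENOMEM.  The names here (alloc_all, slots) are hypothetical.
 */
#if 0
#include <stdlib.h>
#include <errno.h>

static int
alloc_all(void **slots, int n, size_t size)
{
	int i;

	for (i = 0; i < n; i++) {
		if ((slots[i] = calloc(1, size)) == NULL)
			goto err;
	}

	return (0);

err:
	while (--i >= 0) {		/* roll back everything we got */
		free(slots[i]);
		slots[i] = NULL;
	}

	return (ENOMEM);
}
#endif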
10103
10104/*
10105 * Note:  called from probe context.  This function just increments the drop
10106 * count on a buffer.  It has been made a function to allow for the
10107 * possibility of understanding the source of mysterious drop counts.  (A
10108 * problem for which one may be particularly disappointed that DTrace cannot
10109 * be used to understand DTrace.)
10110 */
10111static void
10112dtrace_buffer_drop(dtrace_buffer_t *buf)
10113{
10114	buf->dtb_drops++;
10115}
10116
10117/*
10118 * Note:  called from probe context.  This function is called to reserve space
10119 * in a buffer.  If mstate is non-NULL, sets the scratch base and size in the
10120 * mstate.  Returns the new offset in the buffer, or a negative value if an
10121 * error has occurred.
10122 */
10123static intptr_t
10124dtrace_buffer_reserve(dtrace_buffer_t *buf, size_t needed, size_t align,
10125    dtrace_state_t *state, dtrace_mstate_t *mstate)
10126{
10127	intptr_t offs = buf->dtb_offset, soffs;
10128	intptr_t woffs;
10129	caddr_t tomax;
10130	size_t total;
10131
10132	if (buf->dtb_flags & DTRACEBUF_INACTIVE)
10133		return (-1);
10134
10135	if ((tomax = buf->dtb_tomax) == NULL) {
10136		dtrace_buffer_drop(buf);
10137		return (-1);
10138	}
10139
10140	if (!(buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL))) {
10141		while (offs & (align - 1)) {
10142			/*
10143			 * Assert that our alignment is off by a number which
10144			 * is itself sizeof (uint32_t) aligned.
10145			 */
10146			ASSERT(!((align - (offs & (align - 1))) &
10147			    (sizeof (uint32_t) - 1)));
10148			DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
10149			offs += sizeof (uint32_t);
10150		}
10151
10152		if ((soffs = offs + needed) > buf->dtb_size) {
10153			dtrace_buffer_drop(buf);
10154			return (-1);
10155		}
10156
10157		if (mstate == NULL)
10158			return (offs);
10159
10160		mstate->dtms_scratch_base = (uintptr_t)tomax + soffs;
10161		mstate->dtms_scratch_size = buf->dtb_size - soffs;
10162		mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
10163
10164		return (offs);
10165	}
10166
10167	if (buf->dtb_flags & DTRACEBUF_FILL) {
10168		if (state->dts_activity != DTRACE_ACTIVITY_COOLDOWN &&
10169		    (buf->dtb_flags & DTRACEBUF_FULL))
10170			return (-1);
10171		goto out;
10172	}
10173
10174	total = needed + (offs & (align - 1));
10175
10176	/*
10177	 * For a ring buffer, life is quite a bit more complicated.  Before
10178	 * we can store any padding, we need to adjust our wrapping offset.
10179	 * (If we've never before wrapped or we're not about to, no adjustment
10180	 * is required.)
10181	 */
10182	if ((buf->dtb_flags & DTRACEBUF_WRAPPED) ||
10183	    offs + total > buf->dtb_size) {
10184		woffs = buf->dtb_xamot_offset;
10185
10186		if (offs + total > buf->dtb_size) {
10187			/*
10188			 * We can't fit in the end of the buffer.  First, a
10189			 * sanity check that we can fit in the buffer at all.
10190			 */
10191			if (total > buf->dtb_size) {
10192				dtrace_buffer_drop(buf);
10193				return (-1);
10194			}
10195
10196			/*
10197			 * We're going to be storing at the top of the buffer,
10198			 * so now we need to deal with the wrapped offset.  We
10199			 * only reset our wrapped offset to 0 if it is
10200			 * currently greater than the current offset.  If it
10201			 * is less than the current offset, it is because a
10202			 * previous allocation induced a wrap -- but the
10203			 * allocation didn't subsequently take the space due
10204			 * to an error or false predicate evaluation.  In this
10205			 * case, we'll just leave the wrapped offset alone: if
10206			 * the wrapped offset hasn't been advanced far enough
10207			 * for this allocation, it will be adjusted in the
10208			 * lower loop.
10209			 */
10210			if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
10211				if (woffs >= offs)
10212					woffs = 0;
10213			} else {
10214				woffs = 0;
10215			}
10216
10217			/*
10218			 * Now we know that we're going to be storing to the
10219			 * top of the buffer and that there is room for us
10220			 * there.  We need to clear the buffer from the current
10221			 * offset to the end (there may be old gunk there).
10222			 */
10223			while (offs < buf->dtb_size)
10224				tomax[offs++] = 0;
10225
10226			/*
10227			 * We need to set our offset to zero.  And because we
10228			 * are wrapping, we need to set the bit indicating as
10229			 * much.  We can also adjust our needed space back
10230			 * down to the space required by the ECB -- we know
10231			 * that the top of the buffer is aligned.
10232			 */
10233			offs = 0;
10234			total = needed;
10235			buf->dtb_flags |= DTRACEBUF_WRAPPED;
10236		} else {
10237			/*
10238			 * There is room for us in the buffer, so we simply
10239			 * need to check the wrapped offset.
10240			 */
10241			if (woffs < offs) {
10242				/*
10243				 * The wrapped offset is less than the offset.
10244				 * This can happen if we allocated buffer space
10245				 * that induced a wrap, but then we didn't
10246				 * subsequently take the space due to an error
10247				 * or false predicate evaluation.  This is
10248				 * okay; we know that _this_ allocation isn't
10249				 * going to induce a wrap.  We still can't
10250				 * reset the wrapped offset to be zero,
10251				 * however: the space may have been trashed in
10252				 * the previous failed probe attempt.  But at
10253				 * least the wrapped offset doesn't need to
10254				 * be adjusted at all...
10255				 */
10256				goto out;
10257			}
10258		}
10259
10260		while (offs + total > woffs) {
10261			dtrace_epid_t epid = *(uint32_t *)(tomax + woffs);
10262			size_t size;
10263
10264			if (epid == DTRACE_EPIDNONE) {
10265				size = sizeof (uint32_t);
10266			} else {
10267				ASSERT(epid <= state->dts_necbs);
10268				ASSERT(state->dts_ecbs[epid - 1] != NULL);
10269
10270				size = state->dts_ecbs[epid - 1]->dte_size;
10271			}
10272
10273			ASSERT(woffs + size <= buf->dtb_size);
10274			ASSERT(size != 0);
10275
10276			if (woffs + size == buf->dtb_size) {
10277				/*
10278				 * We've reached the end of the buffer; we want
10279				 * to set the wrapped offset to 0 and break
10280				 * out.  However, if the offs is 0, then we're
10281				 * in a strange edge-condition:  the amount of
10282				 * space that we want to reserve plus the size
10283				 * of the record that we're overwriting is
10284				 * greater than the size of the buffer.  This
10285				 * is problematic because if we reserve the
10286				 * space but subsequently don't consume it (due
10287				 * to a failed predicate or error) the wrapped
10288				 * offset will be 0 -- yet the EPID at offset 0
10289				 * will not be committed.  This situation is
10290				 * relatively easy to deal with:  if we're in
10291				 * this case, the buffer is indistinguishable
10292				 * from one that hasn't wrapped; we need only
10293				 * finish the job by clearing the wrapped bit,
10294				 * explicitly setting the offset to be 0, and
10295				 * zero'ing out the old data in the buffer.
10296				 */
10297				if (offs == 0) {
10298					buf->dtb_flags &= ~DTRACEBUF_WRAPPED;
10299					buf->dtb_offset = 0;
10300					woffs = total;
10301
10302					while (woffs < buf->dtb_size)
10303						tomax[woffs++] = 0;
10304				}
10305
10306				woffs = 0;
10307				break;
10308			}
10309
10310			woffs += size;
10311		}
10312
10313		/*
10314		 * We have a wrapped offset.  It may be that the wrapped offset
10315		 * has become zero -- that's okay.
10316		 */
10317		buf->dtb_xamot_offset = woffs;
10318	}
10319
10320out:
10321	/*
10322	 * Now we can plow the buffer with any necessary padding.
10323	 */
10324	while (offs & (align - 1)) {
10325		/*
10326		 * Assert that our alignment is off by a number which
10327		 * is itself sizeof (uint32_t) aligned.
10328		 */
10329		ASSERT(!((align - (offs & (align - 1))) &
10330		    (sizeof (uint32_t) - 1)));
10331		DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
10332		offs += sizeof (uint32_t);
10333	}
10334
10335	if (buf->dtb_flags & DTRACEBUF_FILL) {
10336		if (offs + needed > buf->dtb_size - state->dts_reserve) {
10337			buf->dtb_flags |= DTRACEBUF_FULL;
10338			return (-1);
10339		}
10340	}
10341
10342	if (mstate == NULL)
10343		return (offs);
10344
10345	/*
10346	 * For ring buffers and fill buffers, the scratch space is always
10347	 * the inactive buffer.
10348	 */
10349	mstate->dtms_scratch_base = (uintptr_t)buf->dtb_xamot;
10350	mstate->dtms_scratch_size = buf->dtb_size;
10351	mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
10352
10353	return (offs);
10354}
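
/*
 * The padding step above, in isolation: the gap up to the next aligned
 * offset is filled with 32-bit DTRACE_EPIDNONE words, which a consumer can
 * skip unambiguously.  This sketch assumes (as the ASSERTs above enforce)
 * that the gap is a multiple of sizeof (uint32_t); EPID_NONE is a stand-in
 * for DTRACE_EPIDNONE.
 */
#if 0
#include <stdint.h>
#include <string.h>
#include <stddef.h>

#define	EPID_NONE	0		/* stand-in for DTRACE_EPIDNONE */

static size_t
pad_to_align(char *buf, size_t offs, size_t align)
{
	uint32_t none = EPID_NONE;

	while (offs & (align - 1)) {
		memcpy(buf + offs, &none, sizeof (none));
		offs += sizeof (none);
	}

	return (offs);
}
#endif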
10355
10356static void
10357dtrace_buffer_polish(dtrace_buffer_t *buf)
10358{
10359	ASSERT(buf->dtb_flags & DTRACEBUF_RING);
10360	ASSERT(MUTEX_HELD(&dtrace_lock));
10361
10362	if (!(buf->dtb_flags & DTRACEBUF_WRAPPED))
10363		return;
10364
10365	/*
10366	 * We need to polish the ring buffer.  There are three cases:
10367	 *
10368	 * - The first (and presumably most common) is that there is no gap
10369	 *   between the buffer offset and the wrapped offset.  In this case,
10370	 *   there is nothing in the buffer that isn't valid data; we can
10371	 *   mark the buffer as polished and return.
10372	 *
10373	 * - The second (less common than the first but still more common
10374	 *   than the third) is that there is a gap between the buffer offset
10375	 *   and the wrapped offset, and the wrapped offset is larger than the
10376	 *   buffer offset.  This can happen because of an alignment issue, or
10377	 *   can happen because of a call to dtrace_buffer_reserve() that
10378	 *   didn't subsequently consume the buffer space.  In this case,
10379	 *   we need to zero the data from the buffer offset to the wrapped
10380	 *   offset.
10381	 *
10382	 * - The third (and least common) is that there is a gap between the
10383	 *   buffer offset and the wrapped offset, but the wrapped offset is
10384	 *   _less_ than the buffer offset.  This can only happen because a
10385	 *   call to dtrace_buffer_reserve() induced a wrap, but the space
10386	 *   was not subsequently consumed.  In this case, we need to zero the
10387	 *   space from the offset to the end of the buffer _and_ from the
10388	 *   top of the buffer to the wrapped offset.
10389	 */
10390	if (buf->dtb_offset < buf->dtb_xamot_offset) {
10391		bzero(buf->dtb_tomax + buf->dtb_offset,
10392		    buf->dtb_xamot_offset - buf->dtb_offset);
10393	}
10394
10395	if (buf->dtb_offset > buf->dtb_xamot_offset) {
10396		bzero(buf->dtb_tomax + buf->dtb_offset,
10397		    buf->dtb_size - buf->dtb_offset);
10398		bzero(buf->dtb_tomax, buf->dtb_xamot_offset);
10399	}
10400}
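
/*
 * The three polish cases above reduce to two zeroing shapes: a contiguous
 * gap when the wrapped offset leads the buffer offset, and a split gap
 * (tail plus head) when it trails it.  A user-space sketch with memset()
 * standing in for bzero():
 */
#if 0
#include <string.h>
#include <stddef.h>

static void
polish(char *buf, size_t size, size_t offs, size_t woffs)
{
	if (offs < woffs) {
		/* second case: zero the gap between offs and woffs */
		memset(buf + offs, 0, woffs - offs);
	} else if (offs > woffs) {
		/* third case: zero offs..end, then the top up to woffs */
		memset(buf + offs, 0, size - offs);
		memset(buf, 0, woffs);
	}
	/* first case (offs == woffs): nothing to zero */
}
#endif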
10401
10402static void
10403dtrace_buffer_free(dtrace_buffer_t *bufs)
10404{
10405	int i;
10406
10407	for (i = 0; i < NCPU; i++) {
10408		dtrace_buffer_t *buf = &bufs[i];
10409
10410		if (buf->dtb_tomax == NULL) {
10411			ASSERT(buf->dtb_xamot == NULL);
10412			ASSERT(buf->dtb_size == 0);
10413			continue;
10414		}
10415
10416		if (buf->dtb_xamot != NULL) {
10417			ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
10418			kmem_free(buf->dtb_xamot, buf->dtb_size);
10419		}
10420
10421		kmem_free(buf->dtb_tomax, buf->dtb_size);
10422		buf->dtb_size = 0;
10423		buf->dtb_tomax = NULL;
10424		buf->dtb_xamot = NULL;
10425	}
10426}
10427
10428/*
10429 * DTrace Enabling Functions
10430 */
10431static dtrace_enabling_t *
10432dtrace_enabling_create(dtrace_vstate_t *vstate)
10433{
10434	dtrace_enabling_t *enab;
10435
10436	enab = kmem_zalloc(sizeof (dtrace_enabling_t), KM_SLEEP);
10437	enab->dten_vstate = vstate;
10438
10439	return (enab);
10440}
10441
10442static void
10443dtrace_enabling_add(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb)
10444{
10445	dtrace_ecbdesc_t **ndesc;
10446	size_t osize, nsize;
10447
10448	/*
10449	 * We can't add to enablings after we've enabled them, or after we've
10450	 * retained them.
10451	 */
10452	ASSERT(enab->dten_probegen == 0);
10453	ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
10454
10455	if (enab->dten_ndesc < enab->dten_maxdesc) {
10456		enab->dten_desc[enab->dten_ndesc++] = ecb;
10457		return;
10458	}
10459
10460	osize = enab->dten_maxdesc * sizeof (dtrace_ecbdesc_t *);
10461
10462	if (enab->dten_maxdesc == 0) {
10463		enab->dten_maxdesc = 1;
10464	} else {
10465		enab->dten_maxdesc <<= 1;
10466	}
10467
10468	ASSERT(enab->dten_ndesc < enab->dten_maxdesc);
10469
10470	nsize = enab->dten_maxdesc * sizeof (dtrace_ecbdesc_t *);
10471	ndesc = kmem_zalloc(nsize, KM_SLEEP);
10472	bcopy(enab->dten_desc, ndesc, osize);
10473	kmem_free(enab->dten_desc, osize);
10474
10475	enab->dten_desc = ndesc;
10476	enab->dten_desc[enab->dten_ndesc++] = ecb;
10477}
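
/*
 * The growth policy above is the usual grow-by-doubling append (starting
 * from a single slot), giving amortized constant-time insertion.  A sketch
 * of the same shape with hypothetical names:
 */
#if 0
#include <stdlib.h>
#include <string.h>

struct vec {
	void **items;
	int n, max;
};

static int
vec_append(struct vec *v, void *item)
{
	if (v->n == v->max) {
		int nmax = (v->max == 0) ? 1 : (v->max << 1);
		void **nitems = calloc(nmax, sizeof (void *));

		if (nitems == NULL)
			return (-1);

		memcpy(nitems, v->items, v->n * sizeof (void *));
		free(v->items);
		v->items = nitems;
		v->max = nmax;
	}

	v->items[v->n++] = item;
	return (0);
}
#endif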
10478
10479static void
10480dtrace_enabling_addlike(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb,
10481    dtrace_probedesc_t *pd)
10482{
10483	dtrace_ecbdesc_t *new;
10484	dtrace_predicate_t *pred;
10485	dtrace_actdesc_t *act;
10486
10487	/*
10488	 * We're going to create a new ECB description that matches the
10489	 * specified ECB in every way, but has the specified probe description.
10490	 */
10491	new = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
10492
10493	if ((pred = ecb->dted_pred.dtpdd_predicate) != NULL)
10494		dtrace_predicate_hold(pred);
10495
10496	for (act = ecb->dted_action; act != NULL; act = act->dtad_next)
10497		dtrace_actdesc_hold(act);
10498
10499	new->dted_action = ecb->dted_action;
10500	new->dted_pred = ecb->dted_pred;
10501	new->dted_probe = *pd;
10502	new->dted_uarg = ecb->dted_uarg;
10503
10504	dtrace_enabling_add(enab, new);
10505}
10506
10507static void
10508dtrace_enabling_dump(dtrace_enabling_t *enab)
10509{
10510	int i;
10511
10512	for (i = 0; i < enab->dten_ndesc; i++) {
10513		dtrace_probedesc_t *desc = &enab->dten_desc[i]->dted_probe;
10514
10515		cmn_err(CE_NOTE, "enabling probe %d (%s:%s:%s:%s)", i,
10516		    desc->dtpd_provider, desc->dtpd_mod,
10517		    desc->dtpd_func, desc->dtpd_name);
10518	}
10519}
10520
10521static void
10522dtrace_enabling_destroy(dtrace_enabling_t *enab)
10523{
10524	int i;
10525	dtrace_ecbdesc_t *ep;
10526	dtrace_vstate_t *vstate = enab->dten_vstate;
10527
10528	ASSERT(MUTEX_HELD(&dtrace_lock));
10529
10530	for (i = 0; i < enab->dten_ndesc; i++) {
10531		dtrace_actdesc_t *act, *next;
10532		dtrace_predicate_t *pred;
10533
10534		ep = enab->dten_desc[i];
10535
10536		if ((pred = ep->dted_pred.dtpdd_predicate) != NULL)
10537			dtrace_predicate_release(pred, vstate);
10538
10539		for (act = ep->dted_action; act != NULL; act = next) {
10540			next = act->dtad_next;
10541			dtrace_actdesc_release(act, vstate);
10542		}
10543
10544		kmem_free(ep, sizeof (dtrace_ecbdesc_t));
10545	}
10546
10547	kmem_free(enab->dten_desc,
10548	    enab->dten_maxdesc * sizeof (dtrace_ecbdesc_t *));
10549
10550	/*
10551	 * If this was a retained enabling, decrement the dts_nretained count
10552	 * and take it off of the dtrace_retained list.
10553	 */
10554	if (enab->dten_prev != NULL || enab->dten_next != NULL ||
10555	    dtrace_retained == enab) {
10556		ASSERT(enab->dten_vstate->dtvs_state != NULL);
10557		ASSERT(enab->dten_vstate->dtvs_state->dts_nretained > 0);
10558		enab->dten_vstate->dtvs_state->dts_nretained--;
10559		dtrace_retained_gen++;
10560	}
10561
10562	if (enab->dten_prev == NULL) {
10563		if (dtrace_retained == enab) {
10564			dtrace_retained = enab->dten_next;
10565
10566			if (dtrace_retained != NULL)
10567				dtrace_retained->dten_prev = NULL;
10568		}
10569	} else {
10570		ASSERT(enab != dtrace_retained);
10571		ASSERT(dtrace_retained != NULL);
10572		enab->dten_prev->dten_next = enab->dten_next;
10573	}
10574
10575	if (enab->dten_next != NULL) {
10576		ASSERT(dtrace_retained != NULL);
10577		enab->dten_next->dten_prev = enab->dten_prev;
10578	}
10579
10580	kmem_free(enab, sizeof (dtrace_enabling_t));
10581}
10582
10583static int
10584dtrace_enabling_retain(dtrace_enabling_t *enab)
10585{
10586	dtrace_state_t *state;
10587
10588	ASSERT(MUTEX_HELD(&dtrace_lock));
10589	ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
10590	ASSERT(enab->dten_vstate != NULL);
10591
10592	state = enab->dten_vstate->dtvs_state;
10593	ASSERT(state != NULL);
10594
10595	/*
10596	 * We only allow each state to retain dtrace_retain_max enablings.
10597	 */
10598	if (state->dts_nretained >= dtrace_retain_max)
10599		return (ENOSPC);
10600
10601	state->dts_nretained++;
10602	dtrace_retained_gen++;
10603
10604	if (dtrace_retained == NULL) {
10605		dtrace_retained = enab;
10606		return (0);
10607	}
10608
10609	enab->dten_next = dtrace_retained;
10610	dtrace_retained->dten_prev = enab;
10611	dtrace_retained = enab;
10612
10613	return (0);
10614}
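
/*
 * The retention step above is a push onto the front of a doubly-linked
 * list.  A sketch with a hypothetical struct retained standing in for
 * dtrace_enabling_t:
 */
#if 0
#include <stddef.h>

struct retained {
	struct retained *prev, *next;
};

static struct retained *retained_head;

static void
retain_push(struct retained *r)
{
	r->next = retained_head;	/* NULL when the list is empty */
	r->prev = NULL;

	if (retained_head != NULL)
		retained_head->prev = r;

	retained_head = r;
}
#endif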
10615
10616static int
10617dtrace_enabling_replicate(dtrace_state_t *state, dtrace_probedesc_t *match,
10618    dtrace_probedesc_t *create)
10619{
10620	dtrace_enabling_t *new, *enab;
10621	int found = 0, err = ENOENT;
10622
10623	ASSERT(MUTEX_HELD(&dtrace_lock));
10624	ASSERT(strlen(match->dtpd_provider) < DTRACE_PROVNAMELEN);
10625	ASSERT(strlen(match->dtpd_mod) < DTRACE_MODNAMELEN);
10626	ASSERT(strlen(match->dtpd_func) < DTRACE_FUNCNAMELEN);
10627	ASSERT(strlen(match->dtpd_name) < DTRACE_NAMELEN);
10628
10629	new = dtrace_enabling_create(&state->dts_vstate);
10630
10631	/*
10632	 * Iterate over all retained enablings, looking for enablings that
10633	 * match the specified state.
10634	 */
10635	for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
10636		int i;
10637
10638		/*
10639		 * dtvs_state can only be NULL for helper enablings -- and
10640		 * helper enablings can't be retained.
10641		 */
10642		ASSERT(enab->dten_vstate->dtvs_state != NULL);
10643
10644		if (enab->dten_vstate->dtvs_state != state)
10645			continue;
10646
10647		/*
10648		 * Now iterate over each probe description; we're looking for
10649		 * an exact match to the specified probe description.
10650		 */
10651		for (i = 0; i < enab->dten_ndesc; i++) {
10652			dtrace_ecbdesc_t *ep = enab->dten_desc[i];
10653			dtrace_probedesc_t *pd = &ep->dted_probe;
10654
10655			if (strcmp(pd->dtpd_provider, match->dtpd_provider))
10656				continue;
10657
10658			if (strcmp(pd->dtpd_mod, match->dtpd_mod))
10659				continue;
10660
10661			if (strcmp(pd->dtpd_func, match->dtpd_func))
10662				continue;
10663
10664			if (strcmp(pd->dtpd_name, match->dtpd_name))
10665				continue;
10666
10667			/*
10668			 * We have a winning probe!  Add it to our growing
10669			 * enabling.
10670			 */
10671			found = 1;
10672			dtrace_enabling_addlike(new, ep, create);
10673		}
10674	}
10675
10676	if (!found || (err = dtrace_enabling_retain(new)) != 0) {
10677		dtrace_enabling_destroy(new);
10678		return (err);
10679	}
10680
10681	return (0);
10682}
10683
10684static void
10685dtrace_enabling_retract(dtrace_state_t *state)
10686{
10687	dtrace_enabling_t *enab, *next;
10688
10689	ASSERT(MUTEX_HELD(&dtrace_lock));
10690
10691	/*
10692	 * Iterate over all retained enablings, destroying those retained
10693	 * for the specified state.
10694	 */
10695	for (enab = dtrace_retained; enab != NULL; enab = next) {
10696		next = enab->dten_next;
10697
10698		/*
10699		 * dtvs_state can only be NULL for helper enablings -- and
10700		 * helper enablings can't be retained.
10701		 */
10702		ASSERT(enab->dten_vstate->dtvs_state != NULL);
10703
10704		if (enab->dten_vstate->dtvs_state == state) {
10705			ASSERT(state->dts_nretained > 0);
10706			dtrace_enabling_destroy(enab);
10707		}
10708	}
10709
10710	ASSERT(state->dts_nretained == 0);
10711}
10712
10713static int
10714dtrace_enabling_match(dtrace_enabling_t *enab, int *nmatched)
10715{
10716	int i = 0;
10717	int matched = 0;
10718
10719	ASSERT(MUTEX_HELD(&cpu_lock));
10720	ASSERT(MUTEX_HELD(&dtrace_lock));
10721
10722	for (i = 0; i < enab->dten_ndesc; i++) {
10723		dtrace_ecbdesc_t *ep = enab->dten_desc[i];
10724
10725		enab->dten_current = ep;
10726		enab->dten_error = 0;
10727
10728		matched += dtrace_probe_enable(&ep->dted_probe, enab);
10729
10730		if (enab->dten_error != 0) {
10731			/*
10732			 * If we get an error half-way through enabling the
10733			 * probes, we kick out -- perhaps with some number of
10734			 * them enabled.  Leaving enabled probes enabled may
10735			 * be slightly confusing for user-level, but we expect
10736			 * that no one will attempt to actually drive on in
10737			 * the face of such errors.  If this is an anonymous
10738			 * enabling (indicated with a NULL nmatched pointer),
10739			 * we cmn_err() a message.  We aren't expecting to
10740			 * get such an error -- insofar as it can exist at all,
10741			 * it would be a result of corrupted DOF in the driver
10742			 * properties.
10743			 */
10744			if (nmatched == NULL) {
10745				cmn_err(CE_WARN, "dtrace_enabling_match() "
10746				    "error on %p: %d", (void *)ep,
10747				    enab->dten_error);
10748			}
10749
10750			return (enab->dten_error);
10751		}
10752	}
10753
10754	enab->dten_probegen = dtrace_probegen;
10755	if (nmatched != NULL)
10756		*nmatched = matched;
10757
10758	return (0);
10759}
10760
10761static void
10762dtrace_enabling_matchall(void)
10763{
10764	dtrace_enabling_t *enab;
10765
10766	mutex_enter(&cpu_lock);
10767	mutex_enter(&dtrace_lock);
10768
10769	/*
10770	 * Iterate over all retained enablings to see if any probes match
10771	 * against them.  We only perform this operation on enablings for which
10772	 * we have sufficient permissions by virtue of being in the global zone
10773	 * or in the same zone as the DTrace client.  Because we can be called
10774	 * after dtrace_detach() has been called, we cannot assert that there
10775	 * are retained enablings.  We can safely load from dtrace_retained,
10776	 * however:  the taskq_destroy() at the end of dtrace_detach() will
10777	 * block pending our completion.
10778	 */
10779	for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
10780		cred_t *cr = enab->dten_vstate->dtvs_state->dts_cred.dcr_cred;
10781
10782		if (INGLOBALZONE(curproc) ||
10783		    (cr != NULL && getzoneid() == crgetzoneid(cr)))
10784			(void) dtrace_enabling_match(enab, NULL);
10785	}
10786
10787	mutex_exit(&dtrace_lock);
10788	mutex_exit(&cpu_lock);
10789}
10790
10791/*
10792 * If an enabling is to be enabled without having matched probes (that is, if
10793 * dtrace_state_go() is to be called on the underlying dtrace_state_t), the
10794 * enabling must be _primed_ by creating an ECB for every ECB description.
10795 * This must be done to assure that we know the number of speculations, the
10796 * number of aggregations, the minimum buffer size needed, etc. before we
10797 * transition out of DTRACE_ACTIVITY_INACTIVE.  To do this without actually
10798 * enabling any probes, we create ECBs for every ECB description, but with a
10799 * NULL probe -- which is exactly what this function does.
10800 */
10801static void
10802dtrace_enabling_prime(dtrace_state_t *state)
10803{
10804	dtrace_enabling_t *enab;
10805	int i;
10806
10807	for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
10808		ASSERT(enab->dten_vstate->dtvs_state != NULL);
10809
10810		if (enab->dten_vstate->dtvs_state != state)
10811			continue;
10812
10813		/*
10814		 * We don't want to prime an enabling more than once, lest
10815		 * we allow a malicious user to induce resource exhaustion.
10816		 * (The ECBs that result from priming an enabling aren't
10817		 * leaked -- but they also aren't deallocated until the
10818		 * consumer state is destroyed.)
10819		 */
10820		if (enab->dten_primed)
10821			continue;
10822
10823		for (i = 0; i < enab->dten_ndesc; i++) {
10824			enab->dten_current = enab->dten_desc[i];
10825			(void) dtrace_probe_enable(NULL, enab);
10826		}
10827
10828		enab->dten_primed = 1;
10829	}
10830}
10831
10832/*
10833 * Called to indicate that probes should be provided due to retained
10834 * enablings.  This is implemented in terms of dtrace_probe_provide(), but it
10835 * must take an initial lap through the enabling calling the dtps_provide()
10836 * must take an initial lap through the enabling, calling the dtps_provide()
10837 */
10838static void
10839dtrace_enabling_provide(dtrace_provider_t *prv)
10840{
10841	int i, all = 0;
10842	dtrace_probedesc_t desc;
10843	dtrace_genid_t gen;
10844
10845	ASSERT(MUTEX_HELD(&dtrace_lock));
10846	ASSERT(MUTEX_HELD(&dtrace_provider_lock));
10847
10848	if (prv == NULL) {
10849		all = 1;
10850		prv = dtrace_provider;
10851	}
10852
10853	do {
10854		dtrace_enabling_t *enab;
10855		void *parg = prv->dtpv_arg;
10856
10857retry:
10858		gen = dtrace_retained_gen;
10859		for (enab = dtrace_retained; enab != NULL;
10860		    enab = enab->dten_next) {
10861			for (i = 0; i < enab->dten_ndesc; i++) {
10862				desc = enab->dten_desc[i]->dted_probe;
10863				mutex_exit(&dtrace_lock);
10864				prv->dtpv_pops.dtps_provide(parg, &desc);
10865				mutex_enter(&dtrace_lock);
10866				/*
10867				 * Process the retained enablings again if
10868				 * they have changed while we weren't holding
10869				 * dtrace_lock.
10870				 */
10871				if (gen != dtrace_retained_gen)
10872					goto retry;
10873			}
10874		}
10875	} while (all && (prv = prv->dtpv_next) != NULL);
10876
10877	mutex_exit(&dtrace_lock);
10878	dtrace_probe_provide(NULL, all ? NULL : prv);
10879	mutex_enter(&dtrace_lock);
10880}
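
/*
 * The retry loop above is a generation-count pattern: record the list's
 * generation, drop the lock to call out, reacquire it, and restart the walk
 * if the generation moved (any mutation bumps it, so a stale iterator is
 * never followed).  A user-space sketch with pthreads and hypothetical
 * names:
 */
#if 0
#include <pthread.h>
#include <stddef.h>

struct node {
	struct node *next;
};

static struct node *list_head;
static unsigned long list_gen;	/* bumped on every list mutation */
static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

static void
walk_with_callouts(void (*callout)(struct node *))
{
	struct node *n;
	unsigned long gen;

	pthread_mutex_lock(&list_lock);
retry:
	gen = list_gen;

	for (n = list_head; n != NULL; n = n->next) {
		pthread_mutex_unlock(&list_lock);
		callout(n);		/* may add or remove list elements */
		pthread_mutex_lock(&list_lock);

		if (gen != list_gen)	/* list changed; iterator is stale */
			goto retry;
	}

	pthread_mutex_unlock(&list_lock);
}
#endif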
10881
10882/*
10883 * DTrace DOF Functions
10884 */
10885/*ARGSUSED*/
10886static void
10887dtrace_dof_error(dof_hdr_t *dof, const char *str)
10888{
10889	if (dtrace_err_verbose)
10890		cmn_err(CE_WARN, "failed to process DOF: %s", str);
10891
10892#ifdef DTRACE_ERRDEBUG
10893	dtrace_errdebug(str);
10894#endif
10895}
10896
10897/*
10898 * Create DOF out of a currently enabled state.  Right now, we only create
10899 * DOF containing the run-time options -- but this could be expanded to create
10900 * complete DOF representing the enabled state.
10901 */
10902static dof_hdr_t *
10903dtrace_dof_create(dtrace_state_t *state)
10904{
10905	dof_hdr_t *dof;
10906	dof_sec_t *sec;
10907	dof_optdesc_t *opt;
10908	int i, len = sizeof (dof_hdr_t) +
10909	    roundup(sizeof (dof_sec_t), sizeof (uint64_t)) +
10910	    sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
10911
10912	ASSERT(MUTEX_HELD(&dtrace_lock));
10913
10914	dof = kmem_zalloc(len, KM_SLEEP);
10915	dof->dofh_ident[DOF_ID_MAG0] = DOF_MAG_MAG0;
10916	dof->dofh_ident[DOF_ID_MAG1] = DOF_MAG_MAG1;
10917	dof->dofh_ident[DOF_ID_MAG2] = DOF_MAG_MAG2;
10918	dof->dofh_ident[DOF_ID_MAG3] = DOF_MAG_MAG3;
10919
10920	dof->dofh_ident[DOF_ID_MODEL] = DOF_MODEL_NATIVE;
10921	dof->dofh_ident[DOF_ID_ENCODING] = DOF_ENCODE_NATIVE;
10922	dof->dofh_ident[DOF_ID_VERSION] = DOF_VERSION;
10923	dof->dofh_ident[DOF_ID_DIFVERS] = DIF_VERSION;
10924	dof->dofh_ident[DOF_ID_DIFIREG] = DIF_DIR_NREGS;
10925	dof->dofh_ident[DOF_ID_DIFTREG] = DIF_DTR_NREGS;
10926
10927	dof->dofh_flags = 0;
10928	dof->dofh_hdrsize = sizeof (dof_hdr_t);
10929	dof->dofh_secsize = sizeof (dof_sec_t);
10930	dof->dofh_secnum = 1;	/* only DOF_SECT_OPTDESC */
10931	dof->dofh_secoff = sizeof (dof_hdr_t);
10932	dof->dofh_loadsz = len;
10933	dof->dofh_filesz = len;
10934	dof->dofh_pad = 0;
10935
10936	/*
10937	 * Fill in the option section header...
10938	 */
10939	sec = (dof_sec_t *)((uintptr_t)dof + sizeof (dof_hdr_t));
10940	sec->dofs_type = DOF_SECT_OPTDESC;
10941	sec->dofs_align = sizeof (uint64_t);
10942	sec->dofs_flags = DOF_SECF_LOAD;
10943	sec->dofs_entsize = sizeof (dof_optdesc_t);
10944
10945	opt = (dof_optdesc_t *)((uintptr_t)sec +
10946	    roundup(sizeof (dof_sec_t), sizeof (uint64_t)));
10947
10948	sec->dofs_offset = (uintptr_t)opt - (uintptr_t)dof;
10949	sec->dofs_size = sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
10950
10951	for (i = 0; i < DTRACEOPT_MAX; i++) {
10952		opt[i].dofo_option = i;
10953		opt[i].dofo_strtab = DOF_SECIDX_NONE;
10954		opt[i].dofo_value = state->dts_options[i];
10955	}
10956
10957	return (dof);
10958}
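
/*
 * The layout arithmetic above, in isolation: a header, one section header
 * rounded up to 8-byte alignment, then DTRACEOPT_MAX option descriptions.
 * The sizes below are hypothetical stand-ins for the real structures.
 */
#if 0
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

#define	ROUNDUP(x, a)	(((x) + (a) - 1) & ~((size_t)(a) - 1))

int
main(void)
{
	size_t hdrsize = 64;	/* stand-in for sizeof (dof_hdr_t) */
	size_t secsize = 32;	/* stand-in for sizeof (dof_sec_t) */
	size_t optsize = 16;	/* stand-in for sizeof (dof_optdesc_t) */
	size_t nopts = 32;	/* stand-in for DTRACEOPT_MAX */

	size_t secoff = hdrsize;
	size_t optoff = secoff + ROUNDUP(secsize, sizeof (uint64_t));
	size_t len = optoff + nopts * optsize;

	printf("section header at %zu, options at %zu, total %zu\n",
	    secoff, optoff, len);

	return (0);
}
#endif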
10959
10960static dof_hdr_t *
10961dtrace_dof_copyin(uintptr_t uarg, int *errp)
10962{
10963	dof_hdr_t hdr, *dof;
10964
10965	ASSERT(!MUTEX_HELD(&dtrace_lock));
10966
10967	/*
10968	 * First, we're going to copyin() the sizeof (dof_hdr_t).
10969	 */
10970	if (copyin((void *)uarg, &hdr, sizeof (hdr)) != 0) {
10971		dtrace_dof_error(NULL, "failed to copyin DOF header");
10972		*errp = EFAULT;
10973		return (NULL);
10974	}
10975
10976	/*
10977	 * Now we'll allocate the entire DOF and copy it in -- provided
10978	 * that the length isn't outrageous.
10979	 */
10980	if (hdr.dofh_loadsz >= dtrace_dof_maxsize) {
10981		dtrace_dof_error(&hdr, "load size exceeds maximum");
10982		*errp = E2BIG;
10983		return (NULL);
10984	}
10985
10986	if (hdr.dofh_loadsz < sizeof (hdr)) {
10987		dtrace_dof_error(&hdr, "invalid load size");
10988		*errp = EINVAL;
10989		return (NULL);
10990	}
10991
10992	dof = kmem_alloc(hdr.dofh_loadsz, KM_SLEEP);
10993
10994	if (copyin((void *)uarg, dof, hdr.dofh_loadsz) != 0) {
10995		kmem_free(dof, hdr.dofh_loadsz);
10996		*errp = EFAULT;
10997		return (NULL);
10998	}
10999
11000	return (dof);
11001}
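
/*
 * The two-phase pattern above generalizes to any variable-length user
 * object: copy in just the fixed-size header, validate the self-described
 * length against sane bounds, then copy in the whole object.  A user-space
 * sketch with memcpy() standing in for copyin() and hypothetical names:
 */
#if 0
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <stdint.h>

struct blob_hdr {
	uint64_t loadsz;		/* total size, including header */
};

#define	BLOB_MAXSIZE	(256 * 1024)	/* stand-in for dtrace_dof_maxsize */

static struct blob_hdr *
blob_copyin(const void *user, int *errp)
{
	struct blob_hdr hdr, *blob;

	memcpy(&hdr, user, sizeof (hdr));	/* phase one: header only */

	if (hdr.loadsz >= BLOB_MAXSIZE) {	/* validate before trusting */
		*errp = E2BIG;
		return (NULL);
	}

	if (hdr.loadsz < sizeof (hdr)) {
		*errp = EINVAL;
		return (NULL);
	}

	if ((blob = malloc(hdr.loadsz)) == NULL) {
		*errp = ENOMEM;
		return (NULL);
	}

	memcpy(blob, user, hdr.loadsz);		/* phase two: whole object */
	return (blob);
}
#endif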
11002
11003static dof_hdr_t *
11004dtrace_dof_property(const char *name)
11005{
11006	uchar_t *buf;
11007	uint64_t loadsz;
11008	unsigned int len, i;
11009	dof_hdr_t *dof;
11010
11011	/*
11012	 * Unfortunately, arrays of values in .conf files are always (and
11013	 * only) interpreted to be integer arrays.  We must read our DOF
11014	 * as an integer array, and then squeeze it into a byte array.
11015	 */
11016	if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dtrace_devi, 0,
11017	    (char *)name, (int **)&buf, &len) != DDI_PROP_SUCCESS)
11018		return (NULL);
11019
11020	for (i = 0; i < len; i++)
11021		buf[i] = (uchar_t)(((int *)buf)[i]);
11022
11023	if (len < sizeof (dof_hdr_t)) {
11024		ddi_prop_free(buf);
11025		dtrace_dof_error(NULL, "truncated header");
11026		return (NULL);
11027	}
11028
11029	if (len < (loadsz = ((dof_hdr_t *)buf)->dofh_loadsz)) {
11030		ddi_prop_free(buf);
11031		dtrace_dof_error(NULL, "truncated DOF");
11032		return (NULL);
11033	}
11034
11035	if (loadsz >= dtrace_dof_maxsize) {
11036		ddi_prop_free(buf);
11037		dtrace_dof_error(NULL, "oversized DOF");
11038		return (NULL);
11039	}
11040
11041	dof = kmem_alloc(loadsz, KM_SLEEP);
11042	bcopy(buf, dof, loadsz);
11043	ddi_prop_free(buf);
11044
11045	return (dof);
11046}
11047
11048static void
11049dtrace_dof_destroy(dof_hdr_t *dof)
11050{
11051	kmem_free(dof, dof->dofh_loadsz);
11052}
11053
11054/*
11055 * Return the dof_sec_t pointer corresponding to a given section index.  If the
11056 * index is not valid, dtrace_dof_error() is called and NULL is returned.  If
11057 * a type other than DOF_SECT_NONE is specified, the header is checked against
11058 * this type and NULL is returned if the types do not match.
11059 */
11060static dof_sec_t *
11061dtrace_dof_sect(dof_hdr_t *dof, uint32_t type, dof_secidx_t i)
11062{
11063	dof_sec_t *sec = (dof_sec_t *)(uintptr_t)
11064	    ((uintptr_t)dof + dof->dofh_secoff + i * dof->dofh_secsize);
11065
11066	if (i >= dof->dofh_secnum) {
11067		dtrace_dof_error(dof, "referenced section index is invalid");
11068		return (NULL);
11069	}
11070
11071	if (!(sec->dofs_flags & DOF_SECF_LOAD)) {
11072		dtrace_dof_error(dof, "referenced section is not loadable");
11073		return (NULL);
11074	}
11075
11076	if (type != DOF_SECT_NONE && type != sec->dofs_type) {
11077		dtrace_dof_error(dof, "referenced section is the wrong type");
11078		return (NULL);
11079	}
11080
11081	return (sec);
11082}
11083
11084static dtrace_probedesc_t *
11085dtrace_dof_probedesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_probedesc_t *desc)
11086{
11087	dof_probedesc_t *probe;
11088	dof_sec_t *strtab;
11089	uintptr_t daddr = (uintptr_t)dof;
11090	uintptr_t str;
11091	size_t size;
11092
11093	if (sec->dofs_type != DOF_SECT_PROBEDESC) {
11094		dtrace_dof_error(dof, "invalid probe section");
11095		return (NULL);
11096	}
11097
11098	if (sec->dofs_align != sizeof (dof_secidx_t)) {
11099		dtrace_dof_error(dof, "bad alignment in probe description");
11100		return (NULL);
11101	}
11102
11103	if (sec->dofs_offset + sizeof (dof_probedesc_t) > dof->dofh_loadsz) {
11104		dtrace_dof_error(dof, "truncated probe description");
11105		return (NULL);
11106	}
11107
11108	probe = (dof_probedesc_t *)(uintptr_t)(daddr + sec->dofs_offset);
11109	strtab = dtrace_dof_sect(dof, DOF_SECT_STRTAB, probe->dofp_strtab);
11110
11111	if (strtab == NULL)
11112		return (NULL);
11113
11114	str = daddr + strtab->dofs_offset;
11115	size = strtab->dofs_size;
11116
11117	if (probe->dofp_provider >= strtab->dofs_size) {
11118		dtrace_dof_error(dof, "corrupt probe provider");
11119		return (NULL);
11120	}
11121
11122	(void) strncpy(desc->dtpd_provider,
11123	    (char *)(str + probe->dofp_provider),
11124	    MIN(DTRACE_PROVNAMELEN - 1, size - probe->dofp_provider));
11125
11126	if (probe->dofp_mod >= strtab->dofs_size) {
11127		dtrace_dof_error(dof, "corrupt probe module");
11128		return (NULL);
11129	}
11130
11131	(void) strncpy(desc->dtpd_mod, (char *)(str + probe->dofp_mod),
11132	    MIN(DTRACE_MODNAMELEN - 1, size - probe->dofp_mod));
11133
11134	if (probe->dofp_func >= strtab->dofs_size) {
11135		dtrace_dof_error(dof, "corrupt probe function");
11136		return (NULL);
11137	}
11138
11139	(void) strncpy(desc->dtpd_func, (char *)(str + probe->dofp_func),
11140	    MIN(DTRACE_FUNCNAMELEN - 1, size - probe->dofp_func));
11141
11142	if (probe->dofp_name >= strtab->dofs_size) {
11143		dtrace_dof_error(dof, "corrupt probe name");
11144		return (NULL);
11145	}
11146
11147	(void) strncpy(desc->dtpd_name, (char *)(str + probe->dofp_name),
11148	    MIN(DTRACE_NAMELEN - 1, size - probe->dofp_name));
11149
11150	return (desc);
11151}
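
/*
 * Each name above is pulled from the string table only after checking that
 * its offset lands inside the table, with the copy bounded by both the
 * destination size and the space remaining in the table.  A sketch of that
 * pattern with hypothetical names:
 */
#if 0
#include <string.h>
#include <stddef.h>

#define	MIN(a, b)	((a) < (b) ? (a) : (b))

static int
strtab_copy(char *dst, size_t dstlen, const char *strtab, size_t tablen,
    size_t offs)
{
	if (offs >= tablen)		/* offset must land in the table */
		return (-1);

	memset(dst, 0, dstlen);		/* guarantee NUL termination */
	(void) strncpy(dst, strtab + offs, MIN(dstlen - 1, tablen - offs));

	return (0);
}
#endif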
11152
11153static dtrace_difo_t *
11154dtrace_dof_difo(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
11155    cred_t *cr)
11156{
11157	dtrace_difo_t *dp;
11158	size_t ttl = 0;
11159	dof_difohdr_t *dofd;
11160	uintptr_t daddr = (uintptr_t)dof;
11161	size_t max = dtrace_difo_maxsize;
11162	int i, l, n;
11163
11164	static const struct {
11165		int section;
11166		int bufoffs;
11167		int lenoffs;
11168		int entsize;
11169		int align;
11170		const char *msg;
11171	} difo[] = {
11172		{ DOF_SECT_DIF, offsetof(dtrace_difo_t, dtdo_buf),
11173		offsetof(dtrace_difo_t, dtdo_len), sizeof (dif_instr_t),
11174		sizeof (dif_instr_t), "multiple DIF sections" },
11175
11176		{ DOF_SECT_INTTAB, offsetof(dtrace_difo_t, dtdo_inttab),
11177		offsetof(dtrace_difo_t, dtdo_intlen), sizeof (uint64_t),
11178		sizeof (uint64_t), "multiple integer tables" },
11179
11180		{ DOF_SECT_STRTAB, offsetof(dtrace_difo_t, dtdo_strtab),
11181		offsetof(dtrace_difo_t, dtdo_strlen), 0,
11182		sizeof (char), "multiple string tables" },
11183
11184		{ DOF_SECT_VARTAB, offsetof(dtrace_difo_t, dtdo_vartab),
11185		offsetof(dtrace_difo_t, dtdo_varlen), sizeof (dtrace_difv_t),
11186		sizeof (uint_t), "multiple variable tables" },
11187
11188		{ DOF_SECT_NONE, 0, 0, 0, 0, NULL }
11189	};
11190
11191	if (sec->dofs_type != DOF_SECT_DIFOHDR) {
11192		dtrace_dof_error(dof, "invalid DIFO header section");
11193		return (NULL);
11194	}
11195
11196	if (sec->dofs_align != sizeof (dof_secidx_t)) {
11197		dtrace_dof_error(dof, "bad alignment in DIFO header");
11198		return (NULL);
11199	}
11200
11201	if (sec->dofs_size < sizeof (dof_difohdr_t) ||
11202	    sec->dofs_size % sizeof (dof_secidx_t)) {
11203		dtrace_dof_error(dof, "bad size in DIFO header");
11204		return (NULL);
11205	}
11206
11207	dofd = (dof_difohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
11208	n = (sec->dofs_size - sizeof (*dofd)) / sizeof (dof_secidx_t) + 1;
11209
11210	dp = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
11211	dp->dtdo_rtype = dofd->dofd_rtype;
11212
11213	for (l = 0; l < n; l++) {
11214		dof_sec_t *subsec;
11215		void **bufp;
11216		uint32_t *lenp;
11217
11218		if ((subsec = dtrace_dof_sect(dof, DOF_SECT_NONE,
11219		    dofd->dofd_links[l])) == NULL)
11220			goto err; /* invalid section link */
11221
11222		if (ttl + subsec->dofs_size > max) {
11223			dtrace_dof_error(dof, "exceeds maximum size");
11224			goto err;
11225		}
11226
11227		ttl += subsec->dofs_size;
11228
11229		for (i = 0; difo[i].section != DOF_SECT_NONE; i++) {
11230			if (subsec->dofs_type != difo[i].section)
11231				continue;
11232
11233			if (!(subsec->dofs_flags & DOF_SECF_LOAD)) {
11234				dtrace_dof_error(dof, "section not loaded");
11235				goto err;
11236			}
11237
11238			if (subsec->dofs_align != difo[i].align) {
11239				dtrace_dof_error(dof, "bad alignment");
11240				goto err;
11241			}
11242
11243			bufp = (void **)((uintptr_t)dp + difo[i].bufoffs);
11244			lenp = (uint32_t *)((uintptr_t)dp + difo[i].lenoffs);
11245
11246			if (*bufp != NULL) {
11247				dtrace_dof_error(dof, difo[i].msg);
11248				goto err;
11249			}
11250
11251			if (difo[i].entsize != subsec->dofs_entsize) {
11252				dtrace_dof_error(dof, "entry size mismatch");
11253				goto err;
11254			}
11255
11256			if (subsec->dofs_entsize != 0 &&
11257			    (subsec->dofs_size % subsec->dofs_entsize) != 0) {
11258				dtrace_dof_error(dof, "corrupt entry size");
11259				goto err;
11260			}
11261
11262			*lenp = subsec->dofs_size;
11263			*bufp = kmem_alloc(subsec->dofs_size, KM_SLEEP);
11264			bcopy((char *)(uintptr_t)(daddr + subsec->dofs_offset),
11265			    *bufp, subsec->dofs_size);
11266
11267			if (subsec->dofs_entsize != 0)
11268				*lenp /= subsec->dofs_entsize;
11269
11270			break;
11271		}
11272
11273		/*
11274		 * If we encounter a loadable DIFO sub-section that is not
11275		 * known to us, assume this is a broken program and fail.
11276		 */
11277		if (difo[i].section == DOF_SECT_NONE &&
11278		    (subsec->dofs_flags & DOF_SECF_LOAD)) {
11279			dtrace_dof_error(dof, "unrecognized DIFO subsection");
11280			goto err;
11281		}
11282	}
11283
11284	if (dp->dtdo_buf == NULL) {
11285		/*
11286		 * We can't have a DIF object without DIF text.
11287		 */
11288		dtrace_dof_error(dof, "missing DIF text");
11289		goto err;
11290	}
11291
11292	/*
11293	 * Before we validate the DIF object, run through the variable table
11294	 * looking for string variables -- if any have a zero size, we'll set
11295	 * their size to the system-wide default string size.  Note that
11296	 * this should _not_ happen if the "strsize" option has been set --
11297	 * in this case, the compiler should have set the size to reflect the
11298	 * setting of the option.
11299	 */
11300	for (i = 0; i < dp->dtdo_varlen; i++) {
11301		dtrace_difv_t *v = &dp->dtdo_vartab[i];
11302		dtrace_diftype_t *t = &v->dtdv_type;
11303
11304		if (v->dtdv_id < DIF_VAR_OTHER_UBASE)
11305			continue;
11306
11307		if (t->dtdt_kind == DIF_TYPE_STRING && t->dtdt_size == 0)
11308			t->dtdt_size = dtrace_strsize_default;
11309	}
11310
11311	if (dtrace_difo_validate(dp, vstate, DIF_DIR_NREGS, cr) != 0)
11312		goto err;
11313
11314	dtrace_difo_init(dp, vstate);
11315	return (dp);
11316
11317err:
11318	kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
11319	kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
11320	kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
11321	kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
11322
11323	kmem_free(dp, sizeof (dtrace_difo_t));
11324	return (NULL);
11325}
11326
11327static dtrace_predicate_t *
11328dtrace_dof_predicate(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
11329    cred_t *cr)
11330{
11331	dtrace_difo_t *dp;
11332
11333	if ((dp = dtrace_dof_difo(dof, sec, vstate, cr)) == NULL)
11334		return (NULL);
11335
11336	return (dtrace_predicate_create(dp));
11337}
11338
11339static dtrace_actdesc_t *
11340dtrace_dof_actdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
11341    cred_t *cr)
11342{
11343	dtrace_actdesc_t *act, *first = NULL, *last = NULL, *next;
11344	dof_actdesc_t *desc;
11345	dof_sec_t *difosec;
11346	size_t offs;
11347	uintptr_t daddr = (uintptr_t)dof;
11348	uint64_t arg;
11349	dtrace_actkind_t kind;
11350
11351	if (sec->dofs_type != DOF_SECT_ACTDESC) {
11352		dtrace_dof_error(dof, "invalid action section");
11353		return (NULL);
11354	}
11355
11356	if (sec->dofs_offset + sizeof (dof_actdesc_t) > dof->dofh_loadsz) {
11357		dtrace_dof_error(dof, "truncated action description");
11358		return (NULL);
11359	}
11360
11361	if (sec->dofs_align != sizeof (uint64_t)) {
11362		dtrace_dof_error(dof, "bad alignment in action description");
11363		return (NULL);
11364	}
11365
11366	if (sec->dofs_size < sec->dofs_entsize) {
11367		dtrace_dof_error(dof, "section entry size exceeds total size");
11368		return (NULL);
11369	}
11370
11371	if (sec->dofs_entsize != sizeof (dof_actdesc_t)) {
11372		dtrace_dof_error(dof, "bad entry size in action description");
11373		return (NULL);
11374	}
11375
11376	if (sec->dofs_size / sec->dofs_entsize > dtrace_actions_max) {
11377		dtrace_dof_error(dof, "actions exceed dtrace_actions_max");
11378		return (NULL);
11379	}
11380
11381	for (offs = 0; offs < sec->dofs_size; offs += sec->dofs_entsize) {
11382		desc = (dof_actdesc_t *)(daddr +
11383		    (uintptr_t)sec->dofs_offset + offs);
11384		kind = (dtrace_actkind_t)desc->dofa_kind;
11385
11386		if (DTRACEACT_ISPRINTFLIKE(kind) &&
11387		    (kind != DTRACEACT_PRINTA ||
11388		    desc->dofa_strtab != DOF_SECIDX_NONE)) {
11389			dof_sec_t *strtab;
11390			char *str, *fmt;
11391			uint64_t i;
11392
11393			/*
11394			 * printf()-like actions must have a format string.
11395			 */
11396			if ((strtab = dtrace_dof_sect(dof,
11397			    DOF_SECT_STRTAB, desc->dofa_strtab)) == NULL)
11398				goto err;
11399
11400			str = (char *)((uintptr_t)dof +
11401			    (uintptr_t)strtab->dofs_offset);
11402
11403			for (i = desc->dofa_arg; i < strtab->dofs_size; i++) {
11404				if (str[i] == '\0')
11405					break;
11406			}
11407
11408			if (i >= strtab->dofs_size) {
11409				dtrace_dof_error(dof, "bogus format string");
11410				goto err;
11411			}
11412
11413			if (i == desc->dofa_arg) {
11414				dtrace_dof_error(dof, "empty format string");
11415				goto err;
11416			}
11417
11418			i -= desc->dofa_arg;
11419			fmt = kmem_alloc(i + 1, KM_SLEEP);
11420			bcopy(&str[desc->dofa_arg], fmt, i + 1);
11421			arg = (uint64_t)(uintptr_t)fmt;
11422		} else {
11423			if (kind == DTRACEACT_PRINTA) {
11424				ASSERT(desc->dofa_strtab == DOF_SECIDX_NONE);
11425				arg = 0;
11426			} else {
11427				arg = desc->dofa_arg;
11428			}
11429		}
11430
11431		act = dtrace_actdesc_create(kind, desc->dofa_ntuple,
11432		    desc->dofa_uarg, arg);
11433
11434		if (last != NULL) {
11435			last->dtad_next = act;
11436		} else {
11437			first = act;
11438		}
11439
11440		last = act;
11441
11442		if (desc->dofa_difo == DOF_SECIDX_NONE)
11443			continue;
11444
11445		if ((difosec = dtrace_dof_sect(dof,
11446		    DOF_SECT_DIFOHDR, desc->dofa_difo)) == NULL)
11447			goto err;
11448
11449		act->dtad_difo = dtrace_dof_difo(dof, difosec, vstate, cr);
11450
11451		if (act->dtad_difo == NULL)
11452			goto err;
11453	}
11454
11455	ASSERT(first != NULL);
11456	return (first);
11457
11458err:
11459	for (act = first; act != NULL; act = next) {
11460		next = act->dtad_next;
11461		dtrace_actdesc_release(act, vstate);
11462	}
11463
11464	return (NULL);
11465}
11466
11467static dtrace_ecbdesc_t *
11468dtrace_dof_ecbdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
11469    cred_t *cr)
11470{
11471	dtrace_ecbdesc_t *ep;
11472	dof_ecbdesc_t *ecb;
11473	dtrace_probedesc_t *desc;
11474	dtrace_predicate_t *pred = NULL;
11475
11476	if (sec->dofs_size < sizeof (dof_ecbdesc_t)) {
11477		dtrace_dof_error(dof, "truncated ECB description");
11478		return (NULL);
11479	}
11480
11481	if (sec->dofs_align != sizeof (uint64_t)) {
11482		dtrace_dof_error(dof, "bad alignment in ECB description");
11483		return (NULL);
11484	}
11485
11486	ecb = (dof_ecbdesc_t *)((uintptr_t)dof + (uintptr_t)sec->dofs_offset);
11487	sec = dtrace_dof_sect(dof, DOF_SECT_PROBEDESC, ecb->dofe_probes);
11488
11489	if (sec == NULL)
11490		return (NULL);
11491
11492	ep = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
11493	ep->dted_uarg = ecb->dofe_uarg;
11494	desc = &ep->dted_probe;
11495
11496	if (dtrace_dof_probedesc(dof, sec, desc) == NULL)
11497		goto err;
11498
11499	if (ecb->dofe_pred != DOF_SECIDX_NONE) {
11500		if ((sec = dtrace_dof_sect(dof,
11501		    DOF_SECT_DIFOHDR, ecb->dofe_pred)) == NULL)
11502			goto err;
11503
11504		if ((pred = dtrace_dof_predicate(dof, sec, vstate, cr)) == NULL)
11505			goto err;
11506
11507		ep->dted_pred.dtpdd_predicate = pred;
11508	}
11509
11510	if (ecb->dofe_actions != DOF_SECIDX_NONE) {
11511		if ((sec = dtrace_dof_sect(dof,
11512		    DOF_SECT_ACTDESC, ecb->dofe_actions)) == NULL)
11513			goto err;
11514
11515		ep->dted_action = dtrace_dof_actdesc(dof, sec, vstate, cr);
11516
11517		if (ep->dted_action == NULL)
11518			goto err;
11519	}
11520
11521	return (ep);
11522
11523err:
11524	if (pred != NULL)
11525		dtrace_predicate_release(pred, vstate);
11526	kmem_free(ep, sizeof (dtrace_ecbdesc_t));
11527	return (NULL);
11528}
11529
11530/*
11531 * Apply the relocations from the specified 'sec' (a DOF_SECT_URELHDR) to the
11532 * specified DOF.  At present, this amounts to simply adding 'ubase' to the
11533 * site of any user SETX relocations to account for load object base address.
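 * For example (with illustrative values): a SETX site containing 0x400 in
 * an object loaded at ubase 0x10000 would contain 0x10400 after relocation.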
11534 * In the future, if we need other relocations, this function can be extended.
11535 */
11536static int
11537dtrace_dof_relocate(dof_hdr_t *dof, dof_sec_t *sec, uint64_t ubase)
11538{
11539	uintptr_t daddr = (uintptr_t)dof;
11540	dof_relohdr_t *dofr =
11541	    (dof_relohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
11542	dof_sec_t *ss, *rs, *ts;
11543	dof_relodesc_t *r;
11544	uint_t i, n;
11545
11546	if (sec->dofs_size < sizeof (dof_relohdr_t) ||
11547	    sec->dofs_align != sizeof (dof_secidx_t)) {
11548		dtrace_dof_error(dof, "invalid relocation header");
11549		return (-1);
11550	}
11551
11552	ss = dtrace_dof_sect(dof, DOF_SECT_STRTAB, dofr->dofr_strtab);
11553	rs = dtrace_dof_sect(dof, DOF_SECT_RELTAB, dofr->dofr_relsec);
11554	ts = dtrace_dof_sect(dof, DOF_SECT_NONE, dofr->dofr_tgtsec);
11555
11556	if (ss == NULL || rs == NULL || ts == NULL)
11557		return (-1); /* dtrace_dof_error() has been called already */
11558
11559	if (rs->dofs_entsize < sizeof (dof_relodesc_t) ||
11560	    rs->dofs_align != sizeof (uint64_t)) {
11561		dtrace_dof_error(dof, "invalid relocation section");
11562		return (-1);
11563	}
11564
11565	r = (dof_relodesc_t *)(uintptr_t)(daddr + rs->dofs_offset);
11566	n = rs->dofs_size / rs->dofs_entsize;
11567
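	/*
	 * Walk the relocation entries, stepping by the section's entry
	 * size (verified above to be at least sizeof (dof_relodesc_t)).
	 */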
11568	for (i = 0; i < n; i++) {
11569		uintptr_t taddr = daddr + ts->dofs_offset + r->dofr_offset;
11570
11571		switch (r->dofr_type) {
11572		case DOF_RELO_NONE:
11573			break;
11574		case DOF_RELO_SETX:
11575			if (r->dofr_offset >= ts->dofs_size || r->dofr_offset +
11576			    sizeof (uint64_t) > ts->dofs_size) {
11577				dtrace_dof_error(dof, "bad relocation offset");
11578				return (-1);
11579			}
11580
11581			if (!IS_P2ALIGNED(taddr, sizeof (uint64_t))) {
11582				dtrace_dof_error(dof, "misaligned setx relo");
11583				return (-1);
11584			}
11585
11586			*(uint64_t *)taddr += ubase;
11587			break;
11588		default:
11589			dtrace_dof_error(dof, "invalid relocation type");
11590			return (-1);
11591		}
11592
11593		r = (dof_relodesc_t *)((uintptr_t)r + rs->dofs_entsize);
11594	}
11595
11596	return (0);
11597}
11598
11599/*
11600 * The dof_hdr_t passed to dtrace_dof_slurp() should be a partially validated
11601 * header:  it should be at the front of a memory region that is at least
11602 * sizeof (dof_hdr_t) in size -- and then at least dof_hdr.dofh_loadsz in
11603 * size.  It need not be validated in any other way.
11604 */
11605static int
11606dtrace_dof_slurp(dof_hdr_t *dof, dtrace_vstate_t *vstate, cred_t *cr,
11607    dtrace_enabling_t **enabp, uint64_t ubase, int noprobes)
11608{
11609	uint64_t len = dof->dofh_loadsz, seclen;
11610	uintptr_t daddr = (uintptr_t)dof;
11611	dtrace_ecbdesc_t *ep;
11612	dtrace_enabling_t *enab;
11613	uint_t i;
11614
11615	ASSERT(MUTEX_HELD(&dtrace_lock));
11616	ASSERT(dof->dofh_loadsz >= sizeof (dof_hdr_t));
11617
11618	/*
11619	 * Check the DOF header identification bytes.  In addition to checking
11620	 * valid settings, we also verify that unused bits/bytes are zeroed so
11621	 * we can use them later without fear of regressing existing binaries.
11622	 */
11623	if (bcmp(&dof->dofh_ident[DOF_ID_MAG0],
11624	    DOF_MAG_STRING, DOF_MAG_STRLEN) != 0) {
11625		dtrace_dof_error(dof, "DOF magic string mismatch");
11626		return (-1);
11627	}
11628
11629	if (dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_ILP32 &&
11630	    dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_LP64) {
11631		dtrace_dof_error(dof, "DOF has invalid data model");
11632		return (-1);
11633	}
11634
11635	if (dof->dofh_ident[DOF_ID_ENCODING] != DOF_ENCODE_NATIVE) {
11636		dtrace_dof_error(dof, "DOF encoding mismatch");
11637		return (-1);
11638	}
11639
11640	if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
11641	    dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_2) {
11642		dtrace_dof_error(dof, "DOF version mismatch");
11643		return (-1);
11644	}
11645
11646	if (dof->dofh_ident[DOF_ID_DIFVERS] != DIF_VERSION_2) {
11647		dtrace_dof_error(dof, "DOF uses unsupported instruction set");
11648		return (-1);
11649	}
11650
11651	if (dof->dofh_ident[DOF_ID_DIFIREG] > DIF_DIR_NREGS) {
11652		dtrace_dof_error(dof, "DOF uses too many integer registers");
11653		return (-1);
11654	}
11655
11656	if (dof->dofh_ident[DOF_ID_DIFTREG] > DIF_DTR_NREGS) {
11657		dtrace_dof_error(dof, "DOF uses too many tuple registers");
11658		return (-1);
11659	}
11660
11661	for (i = DOF_ID_PAD; i < DOF_ID_SIZE; i++) {
11662		if (dof->dofh_ident[i] != 0) {
11663			dtrace_dof_error(dof, "DOF has invalid ident byte set");
11664			return (-1);
11665		}
11666	}
11667
11668	if (dof->dofh_flags & ~DOF_FL_VALID) {
11669		dtrace_dof_error(dof, "DOF has invalid flag bits set");
11670		return (-1);
11671	}
11672
11673	if (dof->dofh_secsize == 0) {
11674		dtrace_dof_error(dof, "zero section header size");
11675		return (-1);
11676	}
11677
11678	/*
11679	 * Check that the section headers don't exceed the amount of DOF
11680	 * data.  Note that we cast the section size and number of sections
11681	 * to uint64_t's to prevent possible overflow in the multiplication.
11682	 */
11683	seclen = (uint64_t)dof->dofh_secnum * (uint64_t)dof->dofh_secsize;
11684
11685	if (dof->dofh_secoff > len || seclen > len ||
11686	    dof->dofh_secoff + seclen > len) {
11687		dtrace_dof_error(dof, "truncated section headers");
11688		return (-1);
11689	}
11690
11691	if (!IS_P2ALIGNED(dof->dofh_secoff, sizeof (uint64_t))) {
11692		dtrace_dof_error(dof, "misaligned section headers");
11693		return (-1);
11694	}
11695
11696	if (!IS_P2ALIGNED(dof->dofh_secsize, sizeof (uint64_t))) {
11697		dtrace_dof_error(dof, "misaligned section size");
11698		return (-1);
11699	}
11700
11701	/*
11702	 * Take an initial pass through the section headers to be sure that
11703	 * the headers don't have stray offsets.  If the 'noprobes' flag is
11704	 * set, do not permit sections relating to providers, probes, or args.
11705	 */
11706	for (i = 0; i < dof->dofh_secnum; i++) {
11707		dof_sec_t *sec = (dof_sec_t *)(daddr +
11708		    (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
11709
11710		if (noprobes) {
11711			switch (sec->dofs_type) {
11712			case DOF_SECT_PROVIDER:
11713			case DOF_SECT_PROBES:
11714			case DOF_SECT_PRARGS:
11715			case DOF_SECT_PROFFS:
11716				dtrace_dof_error(dof, "illegal sections "
11717				    "for enabling");
11718				return (-1);
11719			}
11720		}
11721
11722		if (!(sec->dofs_flags & DOF_SECF_LOAD))
11723			continue; /* just ignore non-loadable sections */
11724
11725		if (sec->dofs_align & (sec->dofs_align - 1)) {
11726			dtrace_dof_error(dof, "bad section alignment");
11727			return (-1);
11728		}
11729
11730		if (sec->dofs_offset & (sec->dofs_align - 1)) {
11731			dtrace_dof_error(dof, "misaligned section");
11732			return (-1);
11733		}
11734
11735		if (sec->dofs_offset > len || sec->dofs_size > len ||
11736		    sec->dofs_offset + sec->dofs_size > len) {
11737			dtrace_dof_error(dof, "corrupt section header");
11738			return (-1);
11739		}
11740
11741		if (sec->dofs_type == DOF_SECT_STRTAB && *((char *)daddr +
11742		    sec->dofs_offset + sec->dofs_size - 1) != '\0') {
11743			dtrace_dof_error(dof, "non-terminating string table");
11744			return (-1);
11745		}
11746	}
11747
11748	/*
11749	 * Take a second pass through the sections and locate and perform any
11750	 * relocations that are present.  We do this after the first pass to
11751	 * be sure that all sections have had their headers validated.
11752	 */
11753	for (i = 0; i < dof->dofh_secnum; i++) {
11754		dof_sec_t *sec = (dof_sec_t *)(daddr +
11755		    (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
11756
11757		if (!(sec->dofs_flags & DOF_SECF_LOAD))
11758			continue; /* skip sections that are not loadable */
11759
11760		switch (sec->dofs_type) {
11761		case DOF_SECT_URELHDR:
11762			if (dtrace_dof_relocate(dof, sec, ubase) != 0)
11763				return (-1);
11764			break;
11765		}
11766	}
11767
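	/*
	 * Finally, create the enabling (if we weren't handed one) and add
	 * an ECB description for each DOF_SECT_ECBDESC section.
	 */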
11768	if ((enab = *enabp) == NULL)
11769		enab = *enabp = dtrace_enabling_create(vstate);
11770
11771	for (i = 0; i < dof->dofh_secnum; i++) {
11772		dof_sec_t *sec = (dof_sec_t *)(daddr +
11773		    (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
11774
11775		if (sec->dofs_type != DOF_SECT_ECBDESC)
11776			continue;
11777
11778		if ((ep = dtrace_dof_ecbdesc(dof, sec, vstate, cr)) == NULL) {
11779			dtrace_enabling_destroy(enab);
11780			*enabp = NULL;
11781			return (-1);
11782		}
11783
11784		dtrace_enabling_add(enab, ep);
11785	}
11786
11787	return (0);
11788}
11789
11790/*
11791 * Process DOF for any options.  This routine assumes that the DOF has been
11792 * at least processed by dtrace_dof_slurp().
11793 */
11794static int
11795dtrace_dof_options(dof_hdr_t *dof, dtrace_state_t *state)
11796{
11797	int i, rval;
11798	uint32_t entsize;
11799	size_t offs;
11800	dof_optdesc_t *desc;
11801
11802	for (i = 0; i < dof->dofh_secnum; i++) {
11803		dof_sec_t *sec = (dof_sec_t *)((uintptr_t)dof +
11804		    (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
11805
11806		if (sec->dofs_type != DOF_SECT_OPTDESC)
11807			continue;
11808
11809		if (sec->dofs_align != sizeof (uint64_t)) {
11810			dtrace_dof_error(dof, "bad alignment in "
11811			    "option description");
11812			return (EINVAL);
11813		}
11814
11815		if ((entsize = sec->dofs_entsize) == 0) {
11816			dtrace_dof_error(dof, "zeroed option entry size");
11817			return (EINVAL);
11818		}
11819
11820		if (entsize < sizeof (dof_optdesc_t)) {
11821			dtrace_dof_error(dof, "bad option entry size");
11822			return (EINVAL);
11823		}
11824
11825		for (offs = 0; offs < sec->dofs_size; offs += entsize) {
11826			desc = (dof_optdesc_t *)((uintptr_t)dof +
11827			    (uintptr_t)sec->dofs_offset + offs);
11828
11829			if (desc->dofo_strtab != DOF_SECIDX_NONE) {
11830				dtrace_dof_error(dof, "non-zero option string");
11831				return (EINVAL);
11832			}
11833
11834			if (desc->dofo_value == DTRACEOPT_UNSET) {
11835				dtrace_dof_error(dof, "unset option");
11836				return (EINVAL);
11837			}
11838
11839			if ((rval = dtrace_state_option(state,
11840			    desc->dofo_option, desc->dofo_value)) != 0) {
11841				dtrace_dof_error(dof, "rejected option");
11842				return (rval);
11843			}
11844		}
11845	}
11846
11847	return (0);
11848}
11849
11850/*
11851 * DTrace Consumer State Functions
11852 */
11853int
11854dtrace_dstate_init(dtrace_dstate_t *dstate, size_t size)
11855{
11856	size_t hashsize, maxper, min, chunksize = dstate->dtds_chunksize;
11857	void *base;
11858	uintptr_t limit;
11859	dtrace_dynvar_t *dvar, *next, *start;
11860	int i;
11861
11862	ASSERT(MUTEX_HELD(&dtrace_lock));
11863	ASSERT(dstate->dtds_base == NULL && dstate->dtds_percpu == NULL);
11864
11865	bzero(dstate, sizeof (dtrace_dstate_t));
11866
11867	if ((dstate->dtds_chunksize = chunksize) == 0)
11868		dstate->dtds_chunksize = DTRACE_DYNVAR_CHUNKSIZE;
11869
11870	if (size < (min = dstate->dtds_chunksize + sizeof (dtrace_dynhash_t)))
11871		size = min;
11872
11873	if ((base = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
11874		return (ENOMEM);
11875
11876	dstate->dtds_size = size;
11877	dstate->dtds_base = base;
11878	dstate->dtds_percpu = kmem_cache_alloc(dtrace_state_cache, KM_SLEEP);
11879	bzero(dstate->dtds_percpu, NCPU * sizeof (dtrace_dstate_percpu_t));
11880
11881	hashsize = size / (dstate->dtds_chunksize + sizeof (dtrace_dynhash_t));
11882
11883	if (hashsize != 1 && (hashsize & 1))
11884		hashsize--;
11885
11886	dstate->dtds_hashsize = hashsize;
11887	dstate->dtds_hash = dstate->dtds_base;
11888
11889	/*
11890	 * Set all of our hash buckets to point to the single sink, and (if
11891	 * it hasn't already been set), set the sink's hash value to be the
11892	 * sink sentinel value.  The sink is needed for dynamic variable
11893	 * lookups to know that they have iterated over an entire, valid hash
11894	 * chain.
11895	 */
11896	for (i = 0; i < hashsize; i++)
11897		dstate->dtds_hash[i].dtdh_chain = &dtrace_dynhash_sink;
11898
11899	if (dtrace_dynhash_sink.dtdv_hashval != DTRACE_DYNHASH_SINK)
11900		dtrace_dynhash_sink.dtdv_hashval = DTRACE_DYNHASH_SINK;
11901
11902	/*
11903	 * Determine number of active CPUs.  Divide free list evenly among
11904	 * active CPUs.
11905	 */
11906	start = (dtrace_dynvar_t *)
11907	    ((uintptr_t)base + hashsize * sizeof (dtrace_dynhash_t));
11908	limit = (uintptr_t)base + size;
11909
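	/*
	 * Carve everything after the hash table into per-CPU free lists,
	 * rounding each CPU's share down to a whole number of chunks.
	 */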
11910	maxper = (limit - (uintptr_t)start) / NCPU;
11911	maxper = (maxper / dstate->dtds_chunksize) * dstate->dtds_chunksize;
11912
11913	for (i = 0; i < NCPU; i++) {
11914		dstate->dtds_percpu[i].dtdsc_free = dvar = start;
11915
11916		/*
11917		 * If we don't even have enough chunks to make it once through
11918		 * NCPUs, we're just going to allocate everything to the first
11919		 * CPU.  And if we're on the last CPU, we're going to allocate
11920		 * whatever is left over.  In either case, we set the limit to
11921		 * be the limit of the dynamic variable space.
11922		 */
11923		if (maxper == 0 || i == NCPU - 1) {
11924			limit = (uintptr_t)base + size;
11925			start = NULL;
11926		} else {
11927			limit = (uintptr_t)start + maxper;
11928			start = (dtrace_dynvar_t *)limit;
11929		}
11930
11931		ASSERT(limit <= (uintptr_t)base + size);
11932
11933		for (;;) {
11934			next = (dtrace_dynvar_t *)((uintptr_t)dvar +
11935			    dstate->dtds_chunksize);
11936
11937			if ((uintptr_t)next + dstate->dtds_chunksize >= limit)
11938				break;
11939
11940			dvar->dtdv_next = next;
11941			dvar = next;
11942		}
11943
11944		if (maxper == 0)
11945			break;
11946	}
11947
11948	return (0);
11949}
11950
11951void
11952dtrace_dstate_fini(dtrace_dstate_t *dstate)
11953{
11954	ASSERT(MUTEX_HELD(&cpu_lock));
11955
11956	if (dstate->dtds_base == NULL)
11957		return;
11958
11959	kmem_free(dstate->dtds_base, dstate->dtds_size);
11960	kmem_cache_free(dtrace_state_cache, dstate->dtds_percpu);
11961}
11962
11963static void
11964dtrace_vstate_fini(dtrace_vstate_t *vstate)
11965{
11966	/*
11967	 * Logical XOR, where are you?
11968	 */
11969	ASSERT((vstate->dtvs_nglobals == 0) ^ (vstate->dtvs_globals != NULL));
11970
11971	if (vstate->dtvs_nglobals > 0) {
11972		kmem_free(vstate->dtvs_globals, vstate->dtvs_nglobals *
11973		    sizeof (dtrace_statvar_t *));
11974	}
11975
11976	if (vstate->dtvs_ntlocals > 0) {
11977		kmem_free(vstate->dtvs_tlocals, vstate->dtvs_ntlocals *
11978		    sizeof (dtrace_difv_t));
11979	}
11980
11981	ASSERT((vstate->dtvs_nlocals == 0) ^ (vstate->dtvs_locals != NULL));
11982
11983	if (vstate->dtvs_nlocals > 0) {
11984		kmem_free(vstate->dtvs_locals, vstate->dtvs_nlocals *
11985		    sizeof (dtrace_statvar_t *));
11986	}
11987}
11988
11989static void
11990dtrace_state_clean(dtrace_state_t *state)
11991{
11992	if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE)
11993		return;
11994
11995	dtrace_dynvar_clean(&state->dts_vstate.dtvs_dynvars);
11996	dtrace_speculation_clean(state);
11997}
11998
11999static void
12000dtrace_state_deadman(dtrace_state_t *state)
12001{
12002	hrtime_t now;
12003
12004	dtrace_sync();
12005
12006	now = dtrace_gethrtime();
12007
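	/*
	 * For a non-anonymous state, we deliberately decline to refresh
	 * dts_alive if the consumer hasn't performed a status operation
	 * within dtrace_deadman_user -- presumably allowing the deadman
	 * to expire a state whose consumer has wedged or exited uncleanly.
	 */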
12008	if (state != dtrace_anon.dta_state &&
12009	    now - state->dts_laststatus >= dtrace_deadman_user)
12010		return;
12011
12012	/*
12013	 * We must be sure that dts_alive never appears to be less than the
12014	 * value upon entry to dtrace_state_deadman(), and because we lack a
12015	 * dtrace_cas64(), we cannot store to it atomically.  We thus instead
12016	 * store INT64_MAX to it, followed by a memory barrier, followed by
12017	 * the new value.  This assures that dts_alive never appears to be
12018	 * less than its true value, regardless of the order in which the
12019	 * stores to the underlying storage are issued.
12020	 */
12021	state->dts_alive = INT64_MAX;
12022	dtrace_membar_producer();
12023	state->dts_alive = now;
12024}
12025
12026dtrace_state_t *
12027dtrace_state_create(dev_t *devp, cred_t *cr)
12028{
12029	minor_t minor;
12030	major_t major;
12031	char c[30];
12032	dtrace_state_t *state;
12033	dtrace_optval_t *opt;
12034	int bufsize = NCPU * sizeof (dtrace_buffer_t), i;
12035
12036	ASSERT(MUTEX_HELD(&dtrace_lock));
12037	ASSERT(MUTEX_HELD(&cpu_lock));
12038
12039	minor = (minor_t)(uintptr_t)vmem_alloc(dtrace_minor, 1,
12040	    VM_BESTFIT | VM_SLEEP);
12041
12042	if (ddi_soft_state_zalloc(dtrace_softstate, minor) != DDI_SUCCESS) {
12043		vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
12044		return (NULL);
12045	}
12046
12047	state = ddi_get_soft_state(dtrace_softstate, minor);
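	/*
	 * EPIDs are handed out starting just past DTRACE_EPIDNONE, the
	 * reserved "no enabled probe" value.
	 */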
12048	state->dts_epid = DTRACE_EPIDNONE + 1;
12049
12050	(void) snprintf(c, sizeof (c), "dtrace_aggid_%d", minor);
12051	state->dts_aggid_arena = vmem_create(c, (void *)1, UINT32_MAX, 1,
12052	    NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
12053
12054	if (devp != NULL) {
12055		major = getemajor(*devp);
12056	} else {
12057		major = ddi_driver_major(dtrace_devi);
12058	}
12059
12060	state->dts_dev = makedevice(major, minor);
12061
12062	if (devp != NULL)
12063		*devp = state->dts_dev;
12064
12065	/*
12066	 * We allocate NCPU buffers.  On the one hand, this can be quite
12067	 * a bit of memory per instance (nearly 36K on a Starcat).  On the
12068	 * other hand, it saves an additional memory reference in the probe
12069	 * path.
12070	 */
12071	state->dts_buffer = kmem_zalloc(bufsize, KM_SLEEP);
12072	state->dts_aggbuffer = kmem_zalloc(bufsize, KM_SLEEP);
12073	state->dts_cleaner = CYCLIC_NONE;
12074	state->dts_deadman = CYCLIC_NONE;
12075	state->dts_vstate.dtvs_state = state;
12076
12077	for (i = 0; i < DTRACEOPT_MAX; i++)
12078		state->dts_options[i] = DTRACEOPT_UNSET;
12079
12080	/*
12081	 * Set the default options.
12082	 */
12083	opt = state->dts_options;
12084	opt[DTRACEOPT_BUFPOLICY] = DTRACEOPT_BUFPOLICY_SWITCH;
12085	opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_AUTO;
12086	opt[DTRACEOPT_NSPEC] = dtrace_nspec_default;
12087	opt[DTRACEOPT_SPECSIZE] = dtrace_specsize_default;
12088	opt[DTRACEOPT_CPU] = (dtrace_optval_t)DTRACE_CPUALL;
12089	opt[DTRACEOPT_STRSIZE] = dtrace_strsize_default;
12090	opt[DTRACEOPT_STACKFRAMES] = dtrace_stackframes_default;
12091	opt[DTRACEOPT_USTACKFRAMES] = dtrace_ustackframes_default;
12092	opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_default;
12093	opt[DTRACEOPT_AGGRATE] = dtrace_aggrate_default;
12094	opt[DTRACEOPT_SWITCHRATE] = dtrace_switchrate_default;
12095	opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_default;
12096	opt[DTRACEOPT_JSTACKFRAMES] = dtrace_jstackframes_default;
12097	opt[DTRACEOPT_JSTACKSTRSIZE] = dtrace_jstackstrsize_default;
12098
12099	state->dts_activity = DTRACE_ACTIVITY_INACTIVE;
12100
12101	/*
12102	 * Depending on the user credentials, we set flag bits which alter probe
12103	 * visibility or the amount of destructiveness allowed.  In the case of
12104	 * actual anonymous tracing, or the possession of all privileges, all of
12105	 * the normal checks are bypassed.
12106	 */
12107	if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
12108		state->dts_cred.dcr_visible = DTRACE_CRV_ALL;
12109		state->dts_cred.dcr_action = DTRACE_CRA_ALL;
12110	} else {
12111		/*
12112		 * Set up the credentials for this instantiation.  We take a
12113		 * hold on the credential to prevent it from disappearing on
12114		 * us; this in turn prevents the zone_t referenced by this
12115		 * credential from disappearing.  This means that we can
12116		 * examine the credential and the zone from probe context.
12117		 */
12118		crhold(cr);
12119		state->dts_cred.dcr_cred = cr;
12120
12121		/*
12122		 * CRA_PROC means "we have *some* privilege for dtrace" and
12123		 * unlocks the use of variables like pid, zonename, etc.
12124		 */
12125		if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE) ||
12126		    PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
12127			state->dts_cred.dcr_action |= DTRACE_CRA_PROC;
12128		}
12129
12130		/*
12131		 * dtrace_user allows use of syscall and profile providers.
12132		 * If the user also has proc_owner and/or proc_zone, we
12133		 * extend the scope to include additional visibility and
12134		 * destructive power.
12135		 */
12136		if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE)) {
12137			if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE)) {
12138				state->dts_cred.dcr_visible |=
12139				    DTRACE_CRV_ALLPROC;
12140
12141				state->dts_cred.dcr_action |=
12142				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
12143			}
12144
12145			if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE)) {
12146				state->dts_cred.dcr_visible |=
12147				    DTRACE_CRV_ALLZONE;
12148
12149				state->dts_cred.dcr_action |=
12150				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
12151			}
12152
12153			/*
12154			 * If we have all privs in whatever zone this is,
12155			 * we can do destructive things to processes which
12156			 * have altered credentials.
12157			 */
12158			if (priv_isequalset(priv_getset(cr, PRIV_EFFECTIVE),
12159			    cr->cr_zone->zone_privset)) {
12160				state->dts_cred.dcr_action |=
12161				    DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
12162			}
12163		}
12164
12165		/*
12166		 * Holding the dtrace_kernel privilege also implies that
12167		 * the user has the dtrace_user privilege from a visibility
12168		 * perspective.  But without further privileges, some
12169		 * destructive actions are not available.
12170		 */
12171		if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE)) {
12172			/*
12173			 * Make all probes in all zones visible.  However,
12174			 * this doesn't mean that all actions become available
12175			 * to all zones.
12176			 */
12177			state->dts_cred.dcr_visible |= DTRACE_CRV_KERNEL |
12178			    DTRACE_CRV_ALLPROC | DTRACE_CRV_ALLZONE;
12179
12180			state->dts_cred.dcr_action |= DTRACE_CRA_KERNEL |
12181			    DTRACE_CRA_PROC;
12182			/*
12183			 * Holding proc_owner means that destructive actions
12184			 * for *this* zone are allowed.
12185			 */
12186			if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
12187				state->dts_cred.dcr_action |=
12188				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
12189
12190			/*
12191			 * Holding proc_zone means that destructive actions
12192			 * for this user/group ID in all zones are allowed.
12193			 */
12194			if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
12195				state->dts_cred.dcr_action |=
12196				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
12197
12198			/*
12199			 * If we have all privs in whatever zone this is,
12200			 * we can do destructive things to processes which
12201			 * have altered credentials.
12202			 */
12203			if (priv_isequalset(priv_getset(cr, PRIV_EFFECTIVE),
12204			    cr->cr_zone->zone_privset)) {
12205				state->dts_cred.dcr_action |=
12206				    DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
12207			}
12208		}
12209
12210		/*
12211		 * Holding the dtrace_proc privilege gives control over fasttrap
12212		 * and pid providers.  We need to grant wider destructive
12213		 * privileges in the event that the user has proc_owner and/or
12214		 * proc_zone.
12215		 */
12216		if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
12217			if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
12218				state->dts_cred.dcr_action |=
12219				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
12220
12221			if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
12222				state->dts_cred.dcr_action |=
12223				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
12224		}
12225	}
12226
12227	return (state);
12228}
12229
12230static int
12231dtrace_state_buffer(dtrace_state_t *state, dtrace_buffer_t *buf, int which)
12232{
12233	dtrace_optval_t *opt = state->dts_options, size;
12234	processorid_t cpu;
12235	int flags = 0, rval;
12236
12237	ASSERT(MUTEX_HELD(&dtrace_lock));
12238	ASSERT(MUTEX_HELD(&cpu_lock));
12239	ASSERT(which < DTRACEOPT_MAX);
12240	ASSERT(state->dts_activity == DTRACE_ACTIVITY_INACTIVE ||
12241	    (state == dtrace_anon.dta_state &&
12242	    state->dts_activity == DTRACE_ACTIVITY_ACTIVE));
12243
12244	if (opt[which] == DTRACEOPT_UNSET || opt[which] == 0)
12245		return (0);
12246
12247	if (opt[DTRACEOPT_CPU] != DTRACEOPT_UNSET)
12248		cpu = opt[DTRACEOPT_CPU];
12249
12250	if (which == DTRACEOPT_SPECSIZE)
12251		flags |= DTRACEBUF_NOSWITCH;
12252
12253	if (which == DTRACEOPT_BUFSIZE) {
12254		if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_RING)
12255			flags |= DTRACEBUF_RING;
12256
12257		if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_FILL)
12258			flags |= DTRACEBUF_FILL;
12259
12260		if (state != dtrace_anon.dta_state ||
12261		    state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
12262			flags |= DTRACEBUF_INACTIVE;
12263	}
12264
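	/*
	 * Attempt the allocation at the requested size, halving on failure
	 * until we either succeed or fall below the minimum useful size.
	 */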
12265	for (size = opt[which]; size >= sizeof (uint64_t); size >>= 1) {
12266		/*
12267		 * The size must be 8-byte aligned.  If the size is not 8-byte
12268		 * aligned, drop it down by the difference.
12269		 */
12270		if (size & (sizeof (uint64_t) - 1))
12271			size -= size & (sizeof (uint64_t) - 1);
12272
12273		if (size < state->dts_reserve) {
12274			/*
12275			 * Buffers must always be large enough to accommodate
12276			 * their prereserved space.  We return E2BIG instead
12277			 * of ENOMEM in this case so that user-level software
12278			 * can differentiate the two cases.
12279			 */
12280			return (E2BIG);
12281		}
12282
12283		rval = dtrace_buffer_alloc(buf, size, flags, cpu);
12284
12285		if (rval != ENOMEM) {
12286			opt[which] = size;
12287			return (rval);
12288		}
12289
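		/*
		 * Under the default ("auto") resizing policy, we halve the
		 * size and retry; under "manual" we fail immediately.
		 */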
12290		if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
12291			return (rval);
12292	}
12293
12294	return (ENOMEM);
12295}
12296
12297static int
12298dtrace_state_buffers(dtrace_state_t *state)
12299{
12300	dtrace_speculation_t *spec = state->dts_speculations;
12301	int rval, i;
12302
12303	if ((rval = dtrace_state_buffer(state, state->dts_buffer,
12304	    DTRACEOPT_BUFSIZE)) != 0)
12305		return (rval);
12306
12307	if ((rval = dtrace_state_buffer(state, state->dts_aggbuffer,
12308	    DTRACEOPT_AGGSIZE)) != 0)
12309		return (rval);
12310
12311	for (i = 0; i < state->dts_nspeculations; i++) {
12312		if ((rval = dtrace_state_buffer(state,
12313		    spec[i].dtsp_buffer, DTRACEOPT_SPECSIZE)) != 0)
12314			return (rval);
12315	}
12316
12317	return (0);
12318}
12319
12320static void
12321dtrace_state_prereserve(dtrace_state_t *state)
12322{
12323	dtrace_ecb_t *ecb;
12324	dtrace_probe_t *probe;
12325
12326	state->dts_reserve = 0;
12327
12328	if (state->dts_options[DTRACEOPT_BUFPOLICY] != DTRACEOPT_BUFPOLICY_FILL)
12329		return;
12330
12331	/*
12332	 * If our buffer policy is a "fill" buffer policy, we need to set the
12333	 * prereserved space to be the space required by the END probes.
12334	 */
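	/* Probe IDs are 1-based, so the END probe lives at [id - 1]. */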
12335	probe = dtrace_probes[dtrace_probeid_end - 1];
12336	ASSERT(probe != NULL);
12337
12338	for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
12339		if (ecb->dte_state != state)
12340			continue;
12341
12342		state->dts_reserve += ecb->dte_needed + ecb->dte_alignment;
12343	}
12344}
12345
12346static int
12347dtrace_state_go(dtrace_state_t *state, processorid_t *cpu)
12348{
12349	dtrace_optval_t *opt = state->dts_options, sz, nspec;
12350	dtrace_speculation_t *spec;
12351	dtrace_buffer_t *buf;
12352	cyc_handler_t hdlr;
12353	cyc_time_t when;
12354	int rval = 0, i, bufsize = NCPU * sizeof (dtrace_buffer_t);
12355	dtrace_icookie_t cookie;
12356
12357	mutex_enter(&cpu_lock);
12358	mutex_enter(&dtrace_lock);
12359
12360	if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
12361		rval = EBUSY;
12362		goto out;
12363	}
12364
12365	/*
12366	 * Before we can perform any checks, we must prime all of the
12367	 * retained enablings that correspond to this state.
12368	 */
12369	dtrace_enabling_prime(state);
12370
12371	if (state->dts_destructive && !state->dts_cred.dcr_destructive) {
12372		rval = EACCES;
12373		goto out;
12374	}
12375
12376	dtrace_state_prereserve(state);
12377
12378	/*
12379	 * Now we want to try to allocate our speculations.
12380	 * We do not automatically resize the number of speculations; if
12381	 * this fails, we will fail the operation.
12382	 */
12383	nspec = opt[DTRACEOPT_NSPEC];
12384	ASSERT(nspec != DTRACEOPT_UNSET);
12385
12386	if (nspec > INT_MAX) {
12387		rval = ENOMEM;
12388		goto out;
12389	}
12390
12391	spec = kmem_zalloc(nspec * sizeof (dtrace_speculation_t), KM_NOSLEEP);
12392
12393	if (spec == NULL) {
12394		rval = ENOMEM;
12395		goto out;
12396	}
12397
12398	state->dts_speculations = spec;
12399	state->dts_nspeculations = (int)nspec;
12400
12401	for (i = 0; i < nspec; i++) {
12402		if ((buf = kmem_zalloc(bufsize, KM_NOSLEEP)) == NULL) {
12403			rval = ENOMEM;
12404			goto err;
12405		}
12406
12407		spec[i].dtsp_buffer = buf;
12408	}
12409
12410	if (opt[DTRACEOPT_GRABANON] != DTRACEOPT_UNSET) {
12411		if (dtrace_anon.dta_state == NULL) {
12412			rval = ENOENT;
12413			goto out;
12414		}
12415
12416		if (state->dts_necbs != 0) {
12417			rval = EALREADY;
12418			goto out;
12419		}
12420
12421		state->dts_anon = dtrace_anon_grab();
12422		ASSERT(state->dts_anon != NULL);
12423		state = state->dts_anon;
12424
12425		/*
12426		 * We want "grabanon" to be set in the grabbed state, so we'll
12427		 * copy that option value from the grabbing state into the
12428		 * grabbed state.
12429		 */
12430		state->dts_options[DTRACEOPT_GRABANON] =
12431		    opt[DTRACEOPT_GRABANON];
12432
12433		*cpu = dtrace_anon.dta_beganon;
12434
12435		/*
12436		 * If the anonymous state is active (as it almost certainly
12437		 * is if the anonymous enabling ultimately matched anything),
12438		 * we don't allow any further option processing -- but we
12439		 * don't return failure.
12440		 */
12441		if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
12442			goto out;
12443	}
12444
12445	if (opt[DTRACEOPT_AGGSIZE] != DTRACEOPT_UNSET &&
12446	    opt[DTRACEOPT_AGGSIZE] != 0) {
12447		if (state->dts_aggregations == NULL) {
12448			/*
12449			 * We're not going to create an aggregation buffer
12450			 * because we don't have any ECBs that contain
12451			 * aggregations -- set this option to 0.
12452			 */
12453			opt[DTRACEOPT_AGGSIZE] = 0;
12454		} else {
12455			/*
12456			 * If we have an aggregation buffer, we must also have
12457			 * a buffer to use as scratch.
12458			 */
12459			if (opt[DTRACEOPT_BUFSIZE] == DTRACEOPT_UNSET ||
12460			    opt[DTRACEOPT_BUFSIZE] < state->dts_needed) {
12461				opt[DTRACEOPT_BUFSIZE] = state->dts_needed;
12462			}
12463		}
12464	}
12465
12466	if (opt[DTRACEOPT_SPECSIZE] != DTRACEOPT_UNSET &&
12467	    opt[DTRACEOPT_SPECSIZE] != 0) {
12468		if (!state->dts_speculates) {
12469			/*
12470			 * We're not going to create speculation buffers
12471			 * because we don't have any ECBs that actually
12472			 * speculate -- set the speculation size to 0.
12473			 */
12474			opt[DTRACEOPT_SPECSIZE] = 0;
12475		}
12476	}
12477
12478	/*
12479	 * The bare minimum size for any buffer that we're actually going to
12480	 * do anything to is sizeof (uint64_t).
12481	 */
12482	sz = sizeof (uint64_t);
12483
12484	if ((state->dts_needed != 0 && opt[DTRACEOPT_BUFSIZE] < sz) ||
12485	    (state->dts_speculates && opt[DTRACEOPT_SPECSIZE] < sz) ||
12486	    (state->dts_aggregations != NULL && opt[DTRACEOPT_AGGSIZE] < sz)) {
12487		/*
12488		 * A buffer size has been explicitly set to 0 (or to a size
12489		 * that will be adjusted to 0) and we need the space -- we
12490		 * need to return failure.  We return ENOSPC to differentiate
12491		 * it from failing to allocate a buffer due to failure to meet
12492		 * the reserve (for which we return E2BIG).
12493		 */
12494		rval = ENOSPC;
12495		goto out;
12496	}
12497
12498	if ((rval = dtrace_state_buffers(state)) != 0)
12499		goto err;
12500
12501	if ((sz = opt[DTRACEOPT_DYNVARSIZE]) == DTRACEOPT_UNSET)
12502		sz = dtrace_dstate_defsize;
12503
12504	do {
12505		rval = dtrace_dstate_init(&state->dts_vstate.dtvs_dynvars, sz);
12506
12507		if (rval == 0)
12508			break;
12509
12510		if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
12511			goto err;
12512	} while (sz >>= 1);
12513
12514	opt[DTRACEOPT_DYNVARSIZE] = sz;
12515
12516	if (rval != 0)
12517		goto err;
12518
12519	if (opt[DTRACEOPT_STATUSRATE] > dtrace_statusrate_max)
12520		opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_max;
12521
12522	if (opt[DTRACEOPT_CLEANRATE] == 0)
12523		opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
12524
12525	if (opt[DTRACEOPT_CLEANRATE] < dtrace_cleanrate_min)
12526		opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_min;
12527
12528	if (opt[DTRACEOPT_CLEANRATE] > dtrace_cleanrate_max)
12529		opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
12530
12531	hdlr.cyh_func = (cyc_func_t)dtrace_state_clean;
12532	hdlr.cyh_arg = state;
12533	hdlr.cyh_level = CY_LOW_LEVEL;
12534
12535	when.cyt_when = 0;
12536	when.cyt_interval = opt[DTRACEOPT_CLEANRATE];
12537
12538	state->dts_cleaner = cyclic_add(&hdlr, &when);
12539
12540	hdlr.cyh_func = (cyc_func_t)dtrace_state_deadman;
12541	hdlr.cyh_arg = state;
12542	hdlr.cyh_level = CY_LOW_LEVEL;
12543
12544	when.cyt_when = 0;
12545	when.cyt_interval = dtrace_deadman_interval;
12546
12547	state->dts_alive = state->dts_laststatus = dtrace_gethrtime();
12548	state->dts_deadman = cyclic_add(&hdlr, &when);
12549
12550	state->dts_activity = DTRACE_ACTIVITY_WARMUP;
12551
12552	/*
12553	 * Now it's time to actually fire the BEGIN probe.  We need to disable
12554	 * interrupts here both to record the CPU on which we fired the BEGIN
12555	 * probe (the data from this CPU will be processed first at user
12556	 * level) and to manually activate the buffer for this CPU.
12557	 */
12558	cookie = dtrace_interrupt_disable();
12559	*cpu = CPU->cpu_id;
12560	ASSERT(state->dts_buffer[*cpu].dtb_flags & DTRACEBUF_INACTIVE);
12561	state->dts_buffer[*cpu].dtb_flags &= ~DTRACEBUF_INACTIVE;
12562
12563	dtrace_probe(dtrace_probeid_begin,
12564	    (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
12565	dtrace_interrupt_enable(cookie);
12566	/*
12567	 * We may have had an exit action from a BEGIN probe; only change our
12568	 * state to ACTIVE if we're still in WARMUP.
12569	 */
12570	ASSERT(state->dts_activity == DTRACE_ACTIVITY_WARMUP ||
12571	    state->dts_activity == DTRACE_ACTIVITY_DRAINING);
12572
12573	if (state->dts_activity == DTRACE_ACTIVITY_WARMUP)
12574		state->dts_activity = DTRACE_ACTIVITY_ACTIVE;
12575
12576	/*
12577	 * Regardless of whether we're now in ACTIVE or DRAINING, we
12578	 * want each CPU to transition its principal buffer out of the
12579	 * INACTIVE state.  Doing this assures that no CPU will suddenly begin
12580	 * processing an ECB halfway down a probe's ECB chain; all CPUs will
12581	 * atomically transition from processing none of a state's ECBs to
12582	 * processing all of them.
12583	 */
12584	dtrace_xcall(DTRACE_CPUALL,
12585	    (dtrace_xcall_t)dtrace_buffer_activate, state);
12586	goto out;
12587
12588err:
12589	dtrace_buffer_free(state->dts_buffer);
12590	dtrace_buffer_free(state->dts_aggbuffer);
12591
12592	if ((nspec = state->dts_nspeculations) == 0) {
12593		ASSERT(state->dts_speculations == NULL);
12594		goto out;
12595	}
12596
12597	spec = state->dts_speculations;
12598	ASSERT(spec != NULL);
12599
12600	for (i = 0; i < state->dts_nspeculations; i++) {
12601		if ((buf = spec[i].dtsp_buffer) == NULL)
12602			break;
12603
12604		dtrace_buffer_free(buf);
12605		kmem_free(buf, bufsize);
12606	}
12607
12608	kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
12609	state->dts_nspeculations = 0;
12610	state->dts_speculations = NULL;
12611
12612out:
12613	mutex_exit(&dtrace_lock);
12614	mutex_exit(&cpu_lock);
12615
12616	return (rval);
12617}
12618
12619static int
12620dtrace_state_stop(dtrace_state_t *state, processorid_t *cpu)
12621{
12622	dtrace_icookie_t cookie;
12623
12624	ASSERT(MUTEX_HELD(&dtrace_lock));
12625
12626	if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE &&
12627	    state->dts_activity != DTRACE_ACTIVITY_DRAINING)
12628		return (EINVAL);
12629
12630	/*
12631	 * We'll set the activity to DTRACE_ACTIVITY_DRAINING, and issue a sync
12632	 * to be sure that every CPU has seen it.  See below for the details
12633	 * on why this is done.
12634	 */
12635	state->dts_activity = DTRACE_ACTIVITY_DRAINING;
12636	dtrace_sync();
12637
12638	/*
12639	 * By this point, it is impossible for any CPU to be still processing
12640	 * with DTRACE_ACTIVITY_ACTIVE.  We can thus set our activity to
12641	 * DTRACE_ACTIVITY_COOLDOWN and know that we're not racing with any
12642	 * other CPU in dtrace_buffer_reserve().  This allows dtrace_probe()
12643	 * and callees to know that the activity is DTRACE_ACTIVITY_COOLDOWN
12644	 * iff we're in the END probe.
12645	 */
12646	state->dts_activity = DTRACE_ACTIVITY_COOLDOWN;
12647	dtrace_sync();
12648	ASSERT(state->dts_activity == DTRACE_ACTIVITY_COOLDOWN);
12649
12650	/*
12651	 * Finally, we can release the reserve and call the END probe.  We
12652	 * disable interrupts across calling the END probe to allow us to
12653	 * return the CPU on which we actually called the END probe.  This
12654	 * allows user-land to be sure that this CPU's principal buffer is
12655	 * processed last.
12656	 */
12657	state->dts_reserve = 0;
12658
12659	cookie = dtrace_interrupt_disable();
12660	*cpu = CPU->cpu_id;
12661	dtrace_probe(dtrace_probeid_end,
12662	    (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
12663	dtrace_interrupt_enable(cookie);
12664
12665	state->dts_activity = DTRACE_ACTIVITY_STOPPED;
12666	dtrace_sync();
12667
12668	return (0);
12669}
12670
12671static int
12672dtrace_state_option(dtrace_state_t *state, dtrace_optid_t option,
12673    dtrace_optval_t val)
12674{
12675	ASSERT(MUTEX_HELD(&dtrace_lock));
12676
12677	if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
12678		return (EBUSY);
12679
12680	if (option >= DTRACEOPT_MAX)
12681		return (EINVAL);
12682
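	/*
	 * Negative values are rejected for everything but DTRACEOPT_CPU,
	 * where a negative value (DTRACE_CPUALL) denotes "all CPUs".
	 */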
12683	if (option != DTRACEOPT_CPU && val < 0)
12684		return (EINVAL);
12685
12686	switch (option) {
12687	case DTRACEOPT_DESTRUCTIVE:
12688		if (dtrace_destructive_disallow)
12689			return (EACCES);
12690
12691		state->dts_cred.dcr_destructive = 1;
12692		break;
12693
12694	case DTRACEOPT_BUFSIZE:
12695	case DTRACEOPT_DYNVARSIZE:
12696	case DTRACEOPT_AGGSIZE:
12697	case DTRACEOPT_SPECSIZE:
12698	case DTRACEOPT_STRSIZE:
12699		if (val < 0)
12700			return (EINVAL);
12701
12702		if (val >= LONG_MAX) {
12703			/*
12704			 * If this is an otherwise negative value, set it to
12705			 * the highest multiple of 128m less than LONG_MAX.
12706			 * Technically, we're adjusting the size without
12707			 * regard to the buffer resizing policy, but in fact,
12708			 * this has no effect -- if we set the buffer size to
12709			 * ~LONG_MAX and the buffer policy is ultimately set to
12710			 * be "manual", the buffer allocation is guaranteed to
12711			 * fail, if only because the allocation requires two
12712			 * buffers.  (We set the size to the highest
12713			 * multiple of 128m because it ensures that the size
12714			 * will remain a multiple of a megabyte when
12715			 * repeatedly halved -- all the way down to 15m.)
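			 * (On ILP32, for instance, that value is 15 * 2^27
			 * bytes; seven successive halvings yield 15 * 2^20
			 * bytes -- 15m.)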
12716			 */
12717			val = LONG_MAX - (1 << 27) + 1;
12718		}
12719	}
12720
12721	state->dts_options[option] = val;
12722
12723	return (0);
12724}
12725
12726static void
12727dtrace_state_destroy(dtrace_state_t *state)
12728{
12729	dtrace_ecb_t *ecb;
12730	dtrace_vstate_t *vstate = &state->dts_vstate;
12731	minor_t minor = getminor(state->dts_dev);
12732	int i, bufsize = NCPU * sizeof (dtrace_buffer_t);
12733	dtrace_speculation_t *spec = state->dts_speculations;
12734	int nspec = state->dts_nspeculations;
12735	uint32_t match;
12736
12737	ASSERT(MUTEX_HELD(&dtrace_lock));
12738	ASSERT(MUTEX_HELD(&cpu_lock));
12739
12740	/*
12741	 * First, retract any retained enablings for this state.
12742	 */
12743	dtrace_enabling_retract(state);
12744	ASSERT(state->dts_nretained == 0);
12745
12746	if (state->dts_activity == DTRACE_ACTIVITY_ACTIVE ||
12747	    state->dts_activity == DTRACE_ACTIVITY_DRAINING) {
12748		/*
12749		 * We have managed to come into dtrace_state_destroy() on a
12750		 * hot enabling -- almost certainly because of a disorderly
12751		 * shutdown of a consumer.  (That is, a consumer that is
12752		 * exiting without having called dtrace_stop().) In this case,
12753		 * we're going to set our activity to be KILLED, and then
12754		 * issue a sync to be sure that everyone is out of probe
12755		 * context before we start blowing away ECBs.
12756		 */
12757		state->dts_activity = DTRACE_ACTIVITY_KILLED;
12758		dtrace_sync();
12759	}
12760
12761	/*
12762	 * Release the credential hold we took in dtrace_state_create().
12763	 */
12764	if (state->dts_cred.dcr_cred != NULL)
12765		crfree(state->dts_cred.dcr_cred);
12766
12767	/*
12768	 * Now we can safely disable and destroy any enabled probes.  Because
12769	 * any DTRACE_PRIV_KERNEL probes may actually be slowing our progress
12770	 * (especially if they're all enabled), we take two passes through the
12771	 * ECBs:  in the first, we disable just DTRACE_PRIV_KERNEL probes, and
12772	 * in the second we disable whatever is left over.
12773	 */
12774	for (match = DTRACE_PRIV_KERNEL; ; match = 0) {
12775		for (i = 0; i < state->dts_necbs; i++) {
12776			if ((ecb = state->dts_ecbs[i]) == NULL)
12777				continue;
12778
12779			if (match && ecb->dte_probe != NULL) {
12780				dtrace_probe_t *probe = ecb->dte_probe;
12781				dtrace_provider_t *prov = probe->dtpr_provider;
12782
12783				if (!(prov->dtpv_priv.dtpp_flags & match))
12784					continue;
12785			}
12786
12787			dtrace_ecb_disable(ecb);
12788			dtrace_ecb_destroy(ecb);
12789		}
12790
12791		if (!match)
12792			break;
12793	}
12794
12795	/*
12796	 * Before we free the buffers, perform one more sync to assure that
12797	 * every CPU is out of probe context.
12798	 */
12799	dtrace_sync();
12800
12801	dtrace_buffer_free(state->dts_buffer);
12802	dtrace_buffer_free(state->dts_aggbuffer);
12803
12804	for (i = 0; i < nspec; i++)
12805		dtrace_buffer_free(spec[i].dtsp_buffer);
12806
12807	if (state->dts_cleaner != CYCLIC_NONE)
12808		cyclic_remove(state->dts_cleaner);
12809
12810	if (state->dts_deadman != CYCLIC_NONE)
12811		cyclic_remove(state->dts_deadman);
12812
12813	dtrace_dstate_fini(&vstate->dtvs_dynvars);
12814	dtrace_vstate_fini(vstate);
12815	kmem_free(state->dts_ecbs, state->dts_necbs * sizeof (dtrace_ecb_t *));
12816
12817	if (state->dts_aggregations != NULL) {
12818#ifdef DEBUG
12819		for (i = 0; i < state->dts_naggregations; i++)
12820			ASSERT(state->dts_aggregations[i] == NULL);
12821#endif
12822		ASSERT(state->dts_naggregations > 0);
12823		kmem_free(state->dts_aggregations,
12824		    state->dts_naggregations * sizeof (dtrace_aggregation_t *));
12825	}
12826
12827	kmem_free(state->dts_buffer, bufsize);
12828	kmem_free(state->dts_aggbuffer, bufsize);
12829
12830	for (i = 0; i < nspec; i++)
12831		kmem_free(spec[i].dtsp_buffer, bufsize);
12832
12833	kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
12834
12835	dtrace_format_destroy(state);
12836
12837	vmem_destroy(state->dts_aggid_arena);
12838	ddi_soft_state_free(dtrace_softstate, minor);
12839	vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
12840}
12841
12842/*
12843 * DTrace Anonymous Enabling Functions
12844 */
12845static dtrace_state_t *
12846dtrace_anon_grab(void)
12847{
12848	dtrace_state_t *state;
12849
12850	ASSERT(MUTEX_HELD(&dtrace_lock));
12851
12852	if ((state = dtrace_anon.dta_state) == NULL) {
12853		ASSERT(dtrace_anon.dta_enabling == NULL);
12854		return (NULL);
12855	}
12856
12857	ASSERT(dtrace_anon.dta_enabling != NULL);
12858	ASSERT(dtrace_retained != NULL);
12859
12860	dtrace_enabling_destroy(dtrace_anon.dta_enabling);
12861	dtrace_anon.dta_enabling = NULL;
12862	dtrace_anon.dta_state = NULL;
12863
12864	return (state);
12865}
12866
12867static void
12868dtrace_anon_property(void)
12869{
12870	int i, rv;
12871	dtrace_state_t *state;
12872	dof_hdr_t *dof;
12873	char c[32];		/* enough for "dof-data-" + digits */
12874
12875	ASSERT(MUTEX_HELD(&dtrace_lock));
12876	ASSERT(MUTEX_HELD(&cpu_lock));
12877
12878	for (i = 0; ; i++) {
12879		(void) snprintf(c, sizeof (c), "dof-data-%d", i);
12880
12881		dtrace_err_verbose = 1;
12882
12883		if ((dof = dtrace_dof_property(c)) == NULL) {
12884			dtrace_err_verbose = 0;
12885			break;
12886		}
12887
12888		/*
12889		 * We want to create anonymous state, so we need to transition
12890		 * the kernel debugger to indicate that DTrace is active.  If
12891		 * this fails (e.g. because the debugger has modified text in
12892		 * some way), we won't continue with the processing.
12893		 */
12894		if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
12895			cmn_err(CE_NOTE, "kernel debugger active; anonymous "
12896			    "enabling ignored.");
12897			dtrace_dof_destroy(dof);
12898			break;
12899		}
12900
12901		/*
12902		 * If we haven't allocated an anonymous state, we'll do so now.
12903		 */
12904		if ((state = dtrace_anon.dta_state) == NULL) {
12905			state = dtrace_state_create(NULL, NULL);
12906			dtrace_anon.dta_state = state;
12907
12908			if (state == NULL) {
12909				/*
12910				 * This basically shouldn't happen:  the only
12911				 * failure mode from dtrace_state_create() is a
12912				 * failure of ddi_soft_state_zalloc() that
12913				 * itself should never happen.  Still, the
12914				 * interface allows for a failure mode, and
12915				 * we want to fail as gracefully as possible:
12916				 * we'll emit an error message and cease
12917				 * processing anonymous state in this case.
12918				 */
12919				cmn_err(CE_WARN, "failed to create "
12920				    "anonymous state");
12921				dtrace_dof_destroy(dof);
12922				break;
12923			}
12924		}
12925
12926		rv = dtrace_dof_slurp(dof, &state->dts_vstate, CRED(),
12927		    &dtrace_anon.dta_enabling, 0, B_TRUE);
12928
12929		if (rv == 0)
12930			rv = dtrace_dof_options(dof, state);
12931
12932		dtrace_err_verbose = 0;
12933		dtrace_dof_destroy(dof);
12934
12935		if (rv != 0) {
12936			/*
12937			 * This is malformed DOF; chuck any anonymous state
12938			 * that we created.
12939			 */
12940			ASSERT(dtrace_anon.dta_enabling == NULL);
12941			dtrace_state_destroy(state);
12942			dtrace_anon.dta_state = NULL;
12943			break;
12944		}
12945
12946		ASSERT(dtrace_anon.dta_enabling != NULL);
12947	}
12948
12949	if (dtrace_anon.dta_enabling != NULL) {
12950		int rval;
12951
12952		/*
12953		 * dtrace_enabling_retain() can only fail because we are
12954		 * trying to retain more enablings than are allowed -- but
12955		 * we only have one anonymous enabling, and we are guaranteed
12956		 * to be allowed at least one retained enabling; we assert
12957		 * that dtrace_enabling_retain() returns success.
12958		 */
12959		rval = dtrace_enabling_retain(dtrace_anon.dta_enabling);
12960		ASSERT(rval == 0);
12961
12962		dtrace_enabling_dump(dtrace_anon.dta_enabling);
12963	}
12964}
12965
12966/*
12967 * DTrace Helper Functions
12968 */
12969static void
12970dtrace_helper_trace(dtrace_helper_action_t *helper,
12971    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate, int where)
12972{
12973	uint32_t size, next, nnext, i;
12974	dtrace_helptrace_t *ent;
12975	uint16_t flags = cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
12976
12977	if (!dtrace_helptrace_enabled)
12978		return;
12979
12980	ASSERT(vstate->dtvs_nlocals <= dtrace_helptrace_nlocals);
12981
12982	/*
12983	 * What would a tracing framework be without its own tracing
12984	 * framework?  (Well, a hell of a lot simpler, for starters...)
12985	 */
12986	size = sizeof (dtrace_helptrace_t) + dtrace_helptrace_nlocals *
12987	    sizeof (uint64_t) - sizeof (uint64_t);
12988
12989	/*
12990	 * Iterate until we can allocate a slot in the trace buffer.
12991	 */
12992	do {
12993		next = dtrace_helptrace_next;
12994
12995		if (next + size < dtrace_helptrace_bufsize) {
12996			nnext = next + size;
12997		} else {
12998			nnext = size;
12999		}
13000	} while (dtrace_cas32(&dtrace_helptrace_next, next, nnext) != next);
13001
13002	/*
13003	 * We have our slot; fill it in.
13004	 */
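	/* If the allocation wrapped (nnext == size), our slot is at offset 0. */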
13005	if (nnext == size)
13006		next = 0;
13007
13008	ent = (dtrace_helptrace_t *)&dtrace_helptrace_buffer[next];
13009	ent->dtht_helper = helper;
13010	ent->dtht_where = where;
13011	ent->dtht_nlocals = vstate->dtvs_nlocals;
13012
13013	ent->dtht_fltoffs = (mstate->dtms_present & DTRACE_MSTATE_FLTOFFS) ?
13014	    mstate->dtms_fltoffs : -1;
13015	ent->dtht_fault = DTRACE_FLAGS2FLT(flags);
13016	ent->dtht_illval = cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
13017
13018	for (i = 0; i < vstate->dtvs_nlocals; i++) {
13019		dtrace_statvar_t *svar;
13020
13021		if ((svar = vstate->dtvs_locals[i]) == NULL)
13022			continue;
13023
13024		ASSERT(svar->dtsv_size >= NCPU * sizeof (uint64_t));
13025		ent->dtht_locals[i] =
13026		    ((uint64_t *)(uintptr_t)svar->dtsv_data)[CPU->cpu_id];
13027	}
13028}
13029
13030static uint64_t
13031dtrace_helper(int which, dtrace_mstate_t *mstate,
13032    dtrace_state_t *state, uint64_t arg0, uint64_t arg1)
13033{
13034	uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
13035	uint64_t sarg0 = mstate->dtms_arg[0];
13036	uint64_t sarg1 = mstate->dtms_arg[1];
13037	uint64_t rval;
13038	dtrace_helpers_t *helpers = curproc->p_dtrace_helpers;
13039	dtrace_helper_action_t *helper;
13040	dtrace_vstate_t *vstate;
13041	dtrace_difo_t *pred;
13042	int i, trace = dtrace_helptrace_enabled;
13043
13044	ASSERT(which >= 0 && which < DTRACE_NHELPER_ACTIONS);
13045
13046	if (helpers == NULL)
13047		return (0);
13048
13049	if ((helper = helpers->dthps_actions[which]) == NULL)
13050		return (0);
13051
13052	vstate = &helpers->dthps_vstate;
13053	mstate->dtms_arg[0] = arg0;
13054	mstate->dtms_arg[1] = arg1;
13055
13056	/*
13057	 * Now iterate over each helper.  If its predicate evaluates to 'true',
13058	 * we'll call the corresponding actions.  Note that the below calls
13059	 * to dtrace_dif_emulate() may set faults in machine state.  This is
13060	 * okay:  our caller (the outer dtrace_dif_emulate()) will simply plow
13061	 * the stored DIF offset with its own (which is the desired behavior).
13062	 * Also, note the calls to dtrace_dif_emulate() may allocate scratch
13063	 * from machine state; this is okay, too.
13064	 */
13065	for (; helper != NULL; helper = helper->dtha_next) {
13066		if ((pred = helper->dtha_predicate) != NULL) {
13067			if (trace)
13068				dtrace_helper_trace(helper, mstate, vstate, 0);
13069
13070			if (!dtrace_dif_emulate(pred, mstate, vstate, state))
13071				goto next;
13072
13073			if (*flags & CPU_DTRACE_FAULT)
13074				goto err;
13075		}
13076
13077		for (i = 0; i < helper->dtha_nactions; i++) {
13078			if (trace)
13079				dtrace_helper_trace(helper,
13080				    mstate, vstate, i + 1);
13081
13082			rval = dtrace_dif_emulate(helper->dtha_actions[i],
13083			    mstate, vstate, state);
13084
13085			if (*flags & CPU_DTRACE_FAULT)
13086				goto err;
13087		}
13088
13089next:
13090		if (trace)
13091			dtrace_helper_trace(helper, mstate, vstate,
13092			    DTRACE_HELPTRACE_NEXT);
13093	}
13094
13095	if (trace)
13096		dtrace_helper_trace(helper, mstate, vstate,
13097		    DTRACE_HELPTRACE_DONE);
13098
13099	/*
13100	 * Restore the arg0 that we saved upon entry.
13101	 */
13102	mstate->dtms_arg[0] = sarg0;
13103	mstate->dtms_arg[1] = sarg1;
13104
13105	return (rval);
13106
13107err:
13108	if (trace)
13109		dtrace_helper_trace(helper, mstate, vstate,
13110		    DTRACE_HELPTRACE_ERR);
13111
13112	/*
13113	 * Restore the arg0 that we saved upon entry.
13114	 */
13115	mstate->dtms_arg[0] = sarg0;
13116	mstate->dtms_arg[1] = sarg1;
13117
13118	return (0);
13119}
13120
13121static void
13122dtrace_helper_action_destroy(dtrace_helper_action_t *helper,
13123    dtrace_vstate_t *vstate)
13124{
13125	int i;
13126
13127	if (helper->dtha_predicate != NULL)
13128		dtrace_difo_release(helper->dtha_predicate, vstate);
13129
13130	for (i = 0; i < helper->dtha_nactions; i++) {
13131		ASSERT(helper->dtha_actions[i] != NULL);
13132		dtrace_difo_release(helper->dtha_actions[i], vstate);
13133	}
13134
13135	kmem_free(helper->dtha_actions,
13136	    helper->dtha_nactions * sizeof (dtrace_difo_t *));
13137	kmem_free(helper, sizeof (dtrace_helper_action_t));
13138}
13139
13140static int
13141dtrace_helper_destroygen(int gen)
13142{
13143	proc_t *p = curproc;
13144	dtrace_helpers_t *help = p->p_dtrace_helpers;
13145	dtrace_vstate_t *vstate;
13146	int i;
13147
13148	ASSERT(MUTEX_HELD(&dtrace_lock));
13149
13150	if (help == NULL || gen > help->dthps_generation)
13151		return (EINVAL);
13152
13153	vstate = &help->dthps_vstate;
13154
13155	for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
13156		dtrace_helper_action_t *last = NULL, *h, *next;
13157
13158		for (h = help->dthps_actions[i]; h != NULL; h = next) {
13159			next = h->dtha_next;
13160
13161			if (h->dtha_generation == gen) {
13162				if (last != NULL) {
13163					last->dtha_next = next;
13164				} else {
13165					help->dthps_actions[i] = next;
13166				}
13167
13168				dtrace_helper_action_destroy(h, vstate);
13169			} else {
13170				last = h;
13171			}
13172		}
13173	}
13174
13175	/*
13176	 * Iterate until we've cleared out all helper providers with the
13177	 * given generation number.
13178	 */
13179	for (;;) {
13180		dtrace_helper_provider_t *prov;
13181
13182		/*
13183		 * Look for a helper provider with the right generation. We
13184		 * have to start back at the beginning of the list each time
13185		 * because we drop dtrace_lock. It's unlikely that we'll make
13186		 * more than two passes.
13187		 */
13188		for (i = 0; i < help->dthps_nprovs; i++) {
13189			prov = help->dthps_provs[i];
13190
13191			if (prov->dthp_generation == gen)
13192				break;
13193		}
13194
13195		/*
13196		 * If there were no matches, we're done.
13197		 */
13198		if (i == help->dthps_nprovs)
13199			break;
13200
13201		/*
13202		 * Move the last helper provider into this slot.
13203		 */
13204		help->dthps_nprovs--;
13205		help->dthps_provs[i] = help->dthps_provs[help->dthps_nprovs];
13206		help->dthps_provs[help->dthps_nprovs] = NULL;
13207
13208		mutex_exit(&dtrace_lock);
13209
13210		/*
13211		 * If we have a meta provider, remove this helper provider.
13212		 */
13213		mutex_enter(&dtrace_meta_lock);
13214		if (dtrace_meta_pid != NULL) {
13215			ASSERT(dtrace_deferred_pid == NULL);
13216			dtrace_helper_provider_remove(&prov->dthp_prov,
13217			    p->p_pid);
13218		}
13219		mutex_exit(&dtrace_meta_lock);
13220
13221		dtrace_helper_provider_destroy(prov);
13222
13223		mutex_enter(&dtrace_lock);
13224	}
13225
13226	return (0);
13227}
13228
13229static int
13230dtrace_helper_validate(dtrace_helper_action_t *helper)
13231{
13232	int err = 0, i;
13233	dtrace_difo_t *dp;
13234
13235	if ((dp = helper->dtha_predicate) != NULL)
13236		err += dtrace_difo_validate_helper(dp);
13237
13238	for (i = 0; i < helper->dtha_nactions; i++)
13239		err += dtrace_difo_validate_helper(helper->dtha_actions[i]);
13240
13241	return (err == 0);
13242}
13243
13244static int
13245dtrace_helper_action_add(int which, dtrace_ecbdesc_t *ep)
13246{
13247	dtrace_helpers_t *help;
13248	dtrace_helper_action_t *helper, *last;
13249	dtrace_actdesc_t *act;
13250	dtrace_vstate_t *vstate;
13251	dtrace_predicate_t *pred;
13252	int count = 0, nactions = 0, i;
13253
13254	if (which < 0 || which >= DTRACE_NHELPER_ACTIONS)
13255		return (EINVAL);
13256
13257	help = curproc->p_dtrace_helpers;
13258	last = help->dthps_actions[which];
13259	vstate = &help->dthps_vstate;
13260
13261	for (count = 0; last != NULL; last = last->dtha_next) {
13262		count++;
13263		if (last->dtha_next == NULL)
13264			break;
13265	}
13266
13267	/*
13268	 * If we already have dtrace_helper_actions_max helper actions for this
13269	 * helper action type, we'll refuse to add a new one.
13270	 */
13271	if (count >= dtrace_helper_actions_max)
13272		return (ENOSPC);
13273
13274	helper = kmem_zalloc(sizeof (dtrace_helper_action_t), KM_SLEEP);
13275	helper->dtha_generation = help->dthps_generation;
13276
13277	if ((pred = ep->dted_pred.dtpdd_predicate) != NULL) {
13278		ASSERT(pred->dtp_difo != NULL);
13279		dtrace_difo_hold(pred->dtp_difo);
13280		helper->dtha_predicate = pred->dtp_difo;
13281	}
13282
13283	for (act = ep->dted_action; act != NULL; act = act->dtad_next) {
13284		if (act->dtad_kind != DTRACEACT_DIFEXPR)
13285			goto err;
13286
13287		if (act->dtad_difo == NULL)
13288			goto err;
13289
13290		nactions++;
13291	}
13292
13293	helper->dtha_actions = kmem_zalloc(sizeof (dtrace_difo_t *) *
13294	    (helper->dtha_nactions = nactions), KM_SLEEP);
13295
13296	for (act = ep->dted_action, i = 0; act != NULL; act = act->dtad_next) {
13297		dtrace_difo_hold(act->dtad_difo);
13298		helper->dtha_actions[i++] = act->dtad_difo;
13299	}
13300
13301	if (!dtrace_helper_validate(helper))
13302		goto err;
13303
13304	if (last == NULL) {
13305		help->dthps_actions[which] = helper;
13306	} else {
13307		last->dtha_next = helper;
13308	}
13309
13310	if (vstate->dtvs_nlocals > dtrace_helptrace_nlocals) {
13311		dtrace_helptrace_nlocals = vstate->dtvs_nlocals;
13312		dtrace_helptrace_next = 0;
13313	}
13314
13315	return (0);
13316err:
13317	dtrace_helper_action_destroy(helper, vstate);
13318	return (EINVAL);
13319}
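
/*
 * dtrace_helper_action_add() is an instance of the
 * count-validate-allocate-fill shape:  one walk of ep->dted_action to
 * reject bad actions and count good ones, an exact-size allocation,
 * and a second walk to take holds.  In miniature (illustrative names,
 * not from this file):
 *
 *	for (n = 0, a = head; a != NULL; a = a->next) {
 *		if (!valid(a))
 *			return (EINVAL);
 *		n++;
 *	}
 *	arr = kmem_zalloc(n * sizeof (elem_t *), KM_SLEEP);
 *	for (a = head, i = 0; a != NULL; a = a->next)
 *		arr[i++] = hold(a);
 */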
13320
13321static void
13322dtrace_helper_provider_register(proc_t *p, dtrace_helpers_t *help,
13323    dof_helper_t *dofhp)
13324{
13325	ASSERT(MUTEX_NOT_HELD(&dtrace_lock));
13326
13327	mutex_enter(&dtrace_meta_lock);
13328	mutex_enter(&dtrace_lock);
13329
13330	if (!dtrace_attached() || dtrace_meta_pid == NULL) {
13331		/*
13332		 * If the dtrace module is loaded but not attached, or if
13333		 * there isn't a meta provider registered to deal with
13334		 * these provider descriptions, we need to postpone creating
13335		 * the actual providers until later.
13336		 */
13337
13338		if (help->dthps_next == NULL && help->dthps_prev == NULL &&
13339		    dtrace_deferred_pid != help) {
13340			help->dthps_deferred = 1;
13341			help->dthps_pid = p->p_pid;
13342			help->dthps_next = dtrace_deferred_pid;
13343			help->dthps_prev = NULL;
13344			if (dtrace_deferred_pid != NULL)
13345				dtrace_deferred_pid->dthps_prev = help;
13346			dtrace_deferred_pid = help;
13347		}
13348
13349		mutex_exit(&dtrace_lock);
13350
13351	} else if (dofhp != NULL) {
13352		/*
13353		 * If the dtrace module is loaded and we have a particular
13354		 * helper provider description, pass that off to the
13355		 * meta provider.
13356		 */
13357
13358		mutex_exit(&dtrace_lock);
13359
13360		dtrace_helper_provide(dofhp, p->p_pid);
13361
13362	} else {
13363		/*
13364		 * Otherwise, just pass all the helper provider descriptions
13365		 * off to the meta provider.
13366		 */
13367
13368		int i;
13369		mutex_exit(&dtrace_lock);
13370
13371		for (i = 0; i < help->dthps_nprovs; i++) {
13372			dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
13373			    p->p_pid);
13374		}
13375	}
13376
13377	mutex_exit(&dtrace_meta_lock);
13378}
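
/*
 * The deferral above is a plain head insertion into the doubly-linked
 * dtrace_deferred_pid list, allowing a meta provider that registers
 * later to replay these descriptions.  Schematically (generic names):
 *
 *	node->next = head;
 *	node->prev = NULL;
 *	if (head != NULL)
 *		head->prev = node;
 *	head = node;
 */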
13379
13380static int
13381dtrace_helper_provider_add(dof_helper_t *dofhp, int gen)
13382{
13383	dtrace_helpers_t *help;
13384	dtrace_helper_provider_t *hprov, **tmp_provs;
13385	uint_t tmp_maxprovs, i;
13386
13387	ASSERT(MUTEX_HELD(&dtrace_lock));
13388
13389	help = curproc->p_dtrace_helpers;
13390	ASSERT(help != NULL);
13391
13392	/*
13393	 * If we already have dtrace_helper_providers_max helper providers,
13394	 * we'll refuse to add a new one.
13395	 */
13396	if (help->dthps_nprovs >= dtrace_helper_providers_max)
13397		return (ENOSPC);
13398
13399	/*
13400	 * Check to make sure this isn't a duplicate.
13401	 */
13402	for (i = 0; i < help->dthps_nprovs; i++) {
13403		if (dofhp->dofhp_addr ==
13404		    help->dthps_provs[i]->dthp_prov.dofhp_addr)
13405			return (EALREADY);
13406	}
13407
13408	hprov = kmem_zalloc(sizeof (dtrace_helper_provider_t), KM_SLEEP);
13409	hprov->dthp_prov = *dofhp;
13410	hprov->dthp_ref = 1;
13411	hprov->dthp_generation = gen;
13412
13413	/*
13414	 * Allocate a bigger table for helper providers if it's already full.
13415	 */
13416	if (help->dthps_maxprovs == help->dthps_nprovs) {
13417		tmp_maxprovs = help->dthps_maxprovs;
13418		tmp_provs = help->dthps_provs;
13419
13420		if (help->dthps_maxprovs == 0)
13421			help->dthps_maxprovs = 2;
13422		else
13423			help->dthps_maxprovs *= 2;
13424		if (help->dthps_maxprovs > dtrace_helper_providers_max)
13425			help->dthps_maxprovs = dtrace_helper_providers_max;
13426
13427		ASSERT(tmp_maxprovs < help->dthps_maxprovs);
13428
13429		help->dthps_provs = kmem_zalloc(help->dthps_maxprovs *
13430		    sizeof (dtrace_helper_provider_t *), KM_SLEEP);
13431
13432		if (tmp_provs != NULL) {
13433			bcopy(tmp_provs, help->dthps_provs, tmp_maxprovs *
13434			    sizeof (dtrace_helper_provider_t *));
13435			kmem_free(tmp_provs, tmp_maxprovs *
13436			    sizeof (dtrace_helper_provider_t *));
13437		}
13438	}
13439
13440	help->dthps_provs[help->dthps_nprovs] = hprov;
13441	help->dthps_nprovs++;
13442
13443	return (0);
13444}
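
/*
 * A note on the growth policy in dtrace_helper_provider_add():  the
 * table starts at two slots, doubles on each exhaustion, and is
 * clamped at dtrace_helper_providers_max, keeping insertion amortized
 * O(1).  The policy in isolation (illustrative names):
 *
 *	if (nelems == maxelems) {
 *		omax = maxelems;
 *		maxelems = (omax == 0) ? 2 : omax * 2;
 *		if (maxelems > cap)
 *			maxelems = cap;
 *		newtab = kmem_zalloc(maxelems * sizeof (void *), KM_SLEEP);
 *		if (oldtab != NULL) {
 *			bcopy(oldtab, newtab, omax * sizeof (void *));
 *			kmem_free(oldtab, omax * sizeof (void *));
 *		}
 *		table = newtab;
 *	}
 */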
13445
13446static void
13447dtrace_helper_provider_destroy(dtrace_helper_provider_t *hprov)
13448{
13449	mutex_enter(&dtrace_lock);
13450
13451	if (--hprov->dthp_ref == 0) {
13452		dof_hdr_t *dof;
13453		mutex_exit(&dtrace_lock);
13454		dof = (dof_hdr_t *)(uintptr_t)hprov->dthp_prov.dofhp_dof;
13455		dtrace_dof_destroy(dof);
13456		kmem_free(hprov, sizeof (dtrace_helper_provider_t));
13457	} else {
13458		mutex_exit(&dtrace_lock);
13459	}
13460}
13461
13462static int
13463dtrace_helper_provider_validate(dof_hdr_t *dof, dof_sec_t *sec)
13464{
13465	uintptr_t daddr = (uintptr_t)dof;
13466	dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
13467	dof_provider_t *provider;
13468	dof_probe_t *probe;
13469	uint8_t *arg;
13470	char *strtab, *typestr;
13471	dof_stridx_t typeidx;
13472	size_t typesz;
13473	uint_t nprobes, j, k;
13474
13475	ASSERT(sec->dofs_type == DOF_SECT_PROVIDER);
13476
13477	if (sec->dofs_offset & (sizeof (uint_t) - 1)) {
13478		dtrace_dof_error(dof, "misaligned section offset");
13479		return (-1);
13480	}
13481
13482	/*
13483	 * The section needs to be large enough to contain the DOF provider
13484	 * structure appropriate for the given version.
13485	 */
13486	if (sec->dofs_size <
13487	    ((dof->dofh_ident[DOF_ID_VERSION] == DOF_VERSION_1) ?
13488	    offsetof(dof_provider_t, dofpv_prenoffs) :
13489	    sizeof (dof_provider_t))) {
13490		dtrace_dof_error(dof, "provider section too small");
13491		return (-1);
13492	}
13493
13494	provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
13495	str_sec = dtrace_dof_sect(dof, DOF_SECT_STRTAB, provider->dofpv_strtab);
13496	prb_sec = dtrace_dof_sect(dof, DOF_SECT_PROBES, provider->dofpv_probes);
13497	arg_sec = dtrace_dof_sect(dof, DOF_SECT_PRARGS, provider->dofpv_prargs);
13498	off_sec = dtrace_dof_sect(dof, DOF_SECT_PROFFS, provider->dofpv_proffs);
13499
13500	if (str_sec == NULL || prb_sec == NULL ||
13501	    arg_sec == NULL || off_sec == NULL)
13502		return (-1);
13503
13504	enoff_sec = NULL;
13505
13506	if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
13507	    provider->dofpv_prenoffs != DOF_SECT_NONE &&
13508	    (enoff_sec = dtrace_dof_sect(dof, DOF_SECT_PRENOFFS,
13509	    provider->dofpv_prenoffs)) == NULL)
13510		return (-1);
13511
13512	strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
13513
13514	if (provider->dofpv_name >= str_sec->dofs_size ||
13515	    strlen(strtab + provider->dofpv_name) >= DTRACE_PROVNAMELEN) {
13516		dtrace_dof_error(dof, "invalid provider name");
13517		return (-1);
13518	}
13519
13520	if (prb_sec->dofs_entsize == 0 ||
13521	    prb_sec->dofs_entsize > prb_sec->dofs_size) {
13522		dtrace_dof_error(dof, "invalid entry size");
13523		return (-1);
13524	}
13525
13526	if (prb_sec->dofs_entsize & (sizeof (uintptr_t) - 1)) {
13527		dtrace_dof_error(dof, "misaligned entry size");
13528		return (-1);
13529	}
13530
13531	if (off_sec->dofs_entsize != sizeof (uint32_t)) {
13532		dtrace_dof_error(dof, "invalid entry size");
13533		return (-1);
13534	}
13535
13536	if (off_sec->dofs_offset & (sizeof (uint32_t) - 1)) {
13537		dtrace_dof_error(dof, "misaligned section offset");
13538		return (-1);
13539	}
13540
13541	if (arg_sec->dofs_entsize != sizeof (uint8_t)) {
13542		dtrace_dof_error(dof, "invalid entry size");
13543		return (-1);
13544	}
13545
13546	arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
13547
13548	nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
13549
13550	/*
13551	 * Take a pass through the probes to check for errors.
13552	 */
13553	for (j = 0; j < nprobes; j++) {
13554		probe = (dof_probe_t *)(uintptr_t)(daddr +
13555		    prb_sec->dofs_offset + j * prb_sec->dofs_entsize);
13556
13557		if (probe->dofpr_func >= str_sec->dofs_size) {
13558			dtrace_dof_error(dof, "invalid function name");
13559			return (-1);
13560		}
13561
13562		if (strlen(strtab + probe->dofpr_func) >= DTRACE_FUNCNAMELEN) {
13563			dtrace_dof_error(dof, "function name too long");
13564			return (-1);
13565		}
13566
13567		if (probe->dofpr_name >= str_sec->dofs_size ||
13568		    strlen(strtab + probe->dofpr_name) >= DTRACE_NAMELEN) {
13569			dtrace_dof_error(dof, "invalid probe name");
13570			return (-1);
13571		}
13572
13573		/*
13574		 * The offset count must not wrap the index, and the offsets
13575		 * must also not overflow the section's data.
13576		 */
13577		if (probe->dofpr_offidx + probe->dofpr_noffs <
13578		    probe->dofpr_offidx ||
13579		    (probe->dofpr_offidx + probe->dofpr_noffs) *
13580		    off_sec->dofs_entsize > off_sec->dofs_size) {
13581			dtrace_dof_error(dof, "invalid probe offset");
13582			return (-1);
13583		}
13584
13585		if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1) {
13586			/*
13587			 * If there's no is-enabled offset section, make sure
13588			 * there aren't any is-enabled offsets. Otherwise
13589			 * perform the same checks as for probe offsets
13590			 * (immediately above).
13591			 */
13592			if (enoff_sec == NULL) {
13593				if (probe->dofpr_enoffidx != 0 ||
13594				    probe->dofpr_nenoffs != 0) {
13595					dtrace_dof_error(dof, "is-enabled "
13596					    "offsets with null section");
13597					return (-1);
13598				}
13599			} else if (probe->dofpr_enoffidx +
13600			    probe->dofpr_nenoffs < probe->dofpr_enoffidx ||
13601			    (probe->dofpr_enoffidx + probe->dofpr_nenoffs) *
13602			    enoff_sec->dofs_entsize > enoff_sec->dofs_size) {
13603				dtrace_dof_error(dof, "invalid is-enabled "
13604				    "offset");
13605				return (-1);
13606			}
13607
13608			if (probe->dofpr_noffs + probe->dofpr_nenoffs == 0) {
13609				dtrace_dof_error(dof, "zero probe and "
13610				    "is-enabled offsets");
13611				return (-1);
13612			}
13613		} else if (probe->dofpr_noffs == 0) {
13614			dtrace_dof_error(dof, "zero probe offsets");
13615			return (-1);
13616		}
13617
13618		if (probe->dofpr_argidx + probe->dofpr_xargc <
13619		    probe->dofpr_argidx ||
13620		    (probe->dofpr_argidx + probe->dofpr_xargc) *
13621		    arg_sec->dofs_entsize > arg_sec->dofs_size) {
13622			dtrace_dof_error(dof, "invalid args");
13623			return (-1);
13624		}
13625
13626		typeidx = probe->dofpr_nargv;
13627		typestr = strtab + probe->dofpr_nargv;
13628		for (k = 0; k < probe->dofpr_nargc; k++) {
13629			if (typeidx >= str_sec->dofs_size) {
13630				dtrace_dof_error(dof, "bad "
13631				    "native argument type");
13632				return (-1);
13633			}
13634
13635			typesz = strlen(typestr) + 1;
13636			if (typesz > DTRACE_ARGTYPELEN) {
13637				dtrace_dof_error(dof, "native "
13638				    "argument type too long");
13639				return (-1);
13640			}
13641			typeidx += typesz;
13642			typestr += typesz;
13643		}
13644
13645		typeidx = probe->dofpr_xargv;
13646		typestr = strtab + probe->dofpr_xargv;
13647		for (k = 0; k < probe->dofpr_xargc; k++) {
13648			if (arg[probe->dofpr_argidx + k] > probe->dofpr_nargc) {
13649				dtrace_dof_error(dof, "bad "
13650				    "native argument index");
13651				return (-1);
13652			}
13653
13654			if (typeidx >= str_sec->dofs_size) {
13655				dtrace_dof_error(dof, "bad "
13656				    "translated argument type");
13657				return (-1);
13658			}
13659
13660			typesz = strlen(typestr) + 1;
13661			if (typesz > DTRACE_ARGTYPELEN) {
13662				dtrace_dof_error(dof, "translated argument "
13663				    "type too long");
13664				return (-1);
13665			}
13666
13667			typeidx += typesz;
13668			typestr += typesz;
13669		}
13670	}
13671
13672	return (0);
13673}
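
/*
 * The recurring bounds test in dtrace_helper_provider_validate()
 * guards untrusted DOF indices in two steps:  first that idx + n did
 * not wrap, then that the span fits in the section.  As a
 * self-contained sketch (the name and the widened product are
 * illustrative, not from this file):
 *
 *	static int
 *	dof_span_ok(uint32_t idx, uint32_t n, uint32_t entsize,
 *	    uint64_t secsize)
 *	{
 *		if (idx + n < idx)
 *			return (0);	(the addition wrapped)
 *		if ((uint64_t)(idx + n) * entsize > secsize)
 *			return (0);	(span runs past the section)
 *		return (1);
 *	}
 */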
13674
13675static int
13676dtrace_helper_slurp(dof_hdr_t *dof, dof_helper_t *dhp)
13677{
13678	dtrace_helpers_t *help;
13679	dtrace_vstate_t *vstate;
13680	dtrace_enabling_t *enab = NULL;
13681	int i, gen, rv, nhelpers = 0, nprovs = 0, destroy = 1;
13682	uintptr_t daddr = (uintptr_t)dof;
13683
13684	ASSERT(MUTEX_HELD(&dtrace_lock));
13685
13686	if ((help = curproc->p_dtrace_helpers) == NULL)
13687		help = dtrace_helpers_create(curproc);
13688
13689	vstate = &help->dthps_vstate;
13690
13691	if ((rv = dtrace_dof_slurp(dof, vstate, NULL, &enab,
13692	    dhp != NULL ? dhp->dofhp_addr : 0, B_FALSE)) != 0) {
13693		dtrace_dof_destroy(dof);
13694		return (rv);
13695	}
13696
13697	/*
13698	 * Look for helper providers and validate their descriptions.
13699	 */
13700	if (dhp != NULL) {
13701		for (i = 0; i < dof->dofh_secnum; i++) {
13702			dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
13703			    dof->dofh_secoff + i * dof->dofh_secsize);
13704
13705			if (sec->dofs_type != DOF_SECT_PROVIDER)
13706				continue;
13707
13708			if (dtrace_helper_provider_validate(dof, sec) != 0) {
13709				dtrace_enabling_destroy(enab);
13710				dtrace_dof_destroy(dof);
13711				return (-1);
13712			}
13713
13714			nprovs++;
13715		}
13716	}
13717
13718	/*
13719	 * Now we need to walk through the ECB descriptions in the enabling.
13720	 */
13721	for (i = 0; i < enab->dten_ndesc; i++) {
13722		dtrace_ecbdesc_t *ep = enab->dten_desc[i];
13723		dtrace_probedesc_t *desc = &ep->dted_probe;
13724
13725		if (strcmp(desc->dtpd_provider, "dtrace") != 0)
13726			continue;
13727
13728		if (strcmp(desc->dtpd_mod, "helper") != 0)
13729			continue;
13730
13731		if (strcmp(desc->dtpd_func, "ustack") != 0)
13732			continue;
13733
13734		if ((rv = dtrace_helper_action_add(DTRACE_HELPER_ACTION_USTACK,
13735		    ep)) != 0) {
13736			/*
13737			 * Adding this helper action failed -- we are now going
13738			 * to rip out the entire generation and return failure.
13739			 */
13740			(void) dtrace_helper_destroygen(help->dthps_generation);
13741			dtrace_enabling_destroy(enab);
13742			dtrace_dof_destroy(dof);
13743			return (-1);
13744		}
13745
13746		nhelpers++;
13747	}
13748
13749	if (nhelpers < enab->dten_ndesc)
13750		dtrace_dof_error(dof, "unmatched helpers");
13751
13752	gen = help->dthps_generation++;
13753	dtrace_enabling_destroy(enab);
13754
13755	if (dhp != NULL && nprovs > 0) {
13756		dhp->dofhp_dof = (uint64_t)(uintptr_t)dof;
13757		if (dtrace_helper_provider_add(dhp, gen) == 0) {
13758			mutex_exit(&dtrace_lock);
13759			dtrace_helper_provider_register(curproc, help, dhp);
13760			mutex_enter(&dtrace_lock);
13761
13762			destroy = 0;
13763		}
13764	}
13765
13766	if (destroy)
13767		dtrace_dof_destroy(dof);
13768
13769	return (gen);
13770}
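
/*
 * The value returned by dtrace_helper_slurp() is a generation handle:
 * each successful slurp tags its actions and providers with
 * help->dthps_generation and post-increments it, and a later
 * dtrace_helper_destroygen(gen) unwinds exactly that load without
 * disturbing helpers from other generations:
 *
 *	gen = dtrace_helper_slurp(dof, dhp);	(tag everything with gen)
 *	...
 *	(void) dtrace_helper_destroygen(gen);	(remove just that load)
 */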
13771
13772static dtrace_helpers_t *
13773dtrace_helpers_create(proc_t *p)
13774{
13775	dtrace_helpers_t *help;
13776
13777	ASSERT(MUTEX_HELD(&dtrace_lock));
13778	ASSERT(p->p_dtrace_helpers == NULL);
13779
13780	help = kmem_zalloc(sizeof (dtrace_helpers_t), KM_SLEEP);
13781	help->dthps_actions = kmem_zalloc(sizeof (dtrace_helper_action_t *) *
13782	    DTRACE_NHELPER_ACTIONS, KM_SLEEP);
13783
13784	p->p_dtrace_helpers = help;
13785	dtrace_helpers++;
13786
13787	return (help);
13788}
13789
13790static void
13791dtrace_helpers_destroy(void)
13792{
13793	dtrace_helpers_t *help;
13794	dtrace_vstate_t *vstate;
13795	proc_t *p = curproc;
13796	int i;
13797
13798	mutex_enter(&dtrace_lock);
13799
13800	ASSERT(p->p_dtrace_helpers != NULL);
13801	ASSERT(dtrace_helpers > 0);
13802
13803	help = p->p_dtrace_helpers;
13804	vstate = &help->dthps_vstate;
13805
13806	/*
13807	 * We're now going to lose the help from this process.
13808	 */
13809	p->p_dtrace_helpers = NULL;
13810	dtrace_sync();
13811
13812	/*
13813	 * Destroy the helper actions.
13814	 */
13815	for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
13816		dtrace_helper_action_t *h, *next;
13817
13818		for (h = help->dthps_actions[i]; h != NULL; h = next) {
13819			next = h->dtha_next;
13820			dtrace_helper_action_destroy(h, vstate);
13822		}
13823	}
13824
13825	mutex_exit(&dtrace_lock);
13826
13827	/*
13828	 * Destroy the helper providers.
13829	 */
13830	if (help->dthps_maxprovs > 0) {
13831		mutex_enter(&dtrace_meta_lock);
13832		if (dtrace_meta_pid != NULL) {
13833			ASSERT(dtrace_deferred_pid == NULL);
13834
13835			for (i = 0; i < help->dthps_nprovs; i++) {
13836				dtrace_helper_provider_remove(
13837				    &help->dthps_provs[i]->dthp_prov, p->p_pid);
13838			}
13839		} else {
13840			mutex_enter(&dtrace_lock);
13841			ASSERT(help->dthps_deferred == 0 ||
13842			    help->dthps_next != NULL ||
13843			    help->dthps_prev != NULL ||
13844			    help == dtrace_deferred_pid);
13845
13846			/*
13847			 * Remove the helper from the deferred list.
13848			 */
13849			if (help->dthps_next != NULL)
13850				help->dthps_next->dthps_prev = help->dthps_prev;
13851			if (help->dthps_prev != NULL)
13852				help->dthps_prev->dthps_next = help->dthps_next;
13853			if (dtrace_deferred_pid == help) {
13854				dtrace_deferred_pid = help->dthps_next;
13855				ASSERT(help->dthps_prev == NULL);
13856			}
13857
13858			mutex_exit(&dtrace_lock);
13859		}
13860
13861		mutex_exit(&dtrace_meta_lock);
13862
13863		for (i = 0; i < help->dthps_nprovs; i++) {
13864			dtrace_helper_provider_destroy(help->dthps_provs[i]);
13865		}
13866
13867		kmem_free(help->dthps_provs, help->dthps_maxprovs *
13868		    sizeof (dtrace_helper_provider_t *));
13869	}
13870
13871	mutex_enter(&dtrace_lock);
13872
13873	dtrace_vstate_fini(&help->dthps_vstate);
13874	kmem_free(help->dthps_actions,
13875	    sizeof (dtrace_helper_action_t *) * DTRACE_NHELPER_ACTIONS);
13876	kmem_free(help, sizeof (dtrace_helpers_t));
13877
13878	--dtrace_helpers;
13879	mutex_exit(&dtrace_lock);
13880}
13881
13882static void
13883dtrace_helpers_duplicate(proc_t *from, proc_t *to)
13884{
13885	dtrace_helpers_t *help, *newhelp;
13886	dtrace_helper_action_t *helper, *new, *last;
13887	dtrace_difo_t *dp;
13888	dtrace_vstate_t *vstate;
13889	int i, j, sz, hasprovs = 0;
13890
13891	mutex_enter(&dtrace_lock);
13892	ASSERT(from->p_dtrace_helpers != NULL);
13893	ASSERT(dtrace_helpers > 0);
13894
13895	help = from->p_dtrace_helpers;
13896	newhelp = dtrace_helpers_create(to);
13897	ASSERT(to->p_dtrace_helpers != NULL);
13898
13899	newhelp->dthps_generation = help->dthps_generation;
13900	vstate = &newhelp->dthps_vstate;
13901
13902	/*
13903	 * Duplicate the helper actions.
13904	 */
13905	for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
13906		if ((helper = help->dthps_actions[i]) == NULL)
13907			continue;
13908
13909		for (last = NULL; helper != NULL; helper = helper->dtha_next) {
13910			new = kmem_zalloc(sizeof (dtrace_helper_action_t),
13911			    KM_SLEEP);
13912			new->dtha_generation = helper->dtha_generation;
13913
13914			if ((dp = helper->dtha_predicate) != NULL) {
13915				dp = dtrace_difo_duplicate(dp, vstate);
13916				new->dtha_predicate = dp;
13917			}
13918
13919			new->dtha_nactions = helper->dtha_nactions;
13920			sz = sizeof (dtrace_difo_t *) * new->dtha_nactions;
13921			new->dtha_actions = kmem_alloc(sz, KM_SLEEP);
13922
13923			for (j = 0; j < new->dtha_nactions; j++) {
13924				dtrace_difo_t *dp = helper->dtha_actions[j];
13925
13926				ASSERT(dp != NULL);
13927				dp = dtrace_difo_duplicate(dp, vstate);
13928				new->dtha_actions[j] = dp;
13929			}
13930
13931			if (last != NULL) {
13932				last->dtha_next = new;
13933			} else {
13934				newhelp->dthps_actions[i] = new;
13935			}
13936
13937			last = new;
13938		}
13939	}
13940
13941	/*
13942	 * Duplicate the helper providers and register them with the
13943	 * DTrace framework.
13944	 */
13945	if (help->dthps_nprovs > 0) {
13946		newhelp->dthps_nprovs = help->dthps_nprovs;
13947		newhelp->dthps_maxprovs = help->dthps_nprovs;
13948		newhelp->dthps_provs = kmem_alloc(newhelp->dthps_nprovs *
13949		    sizeof (dtrace_helper_provider_t *), KM_SLEEP);
13950		for (i = 0; i < newhelp->dthps_nprovs; i++) {
13951			newhelp->dthps_provs[i] = help->dthps_provs[i];
13952			newhelp->dthps_provs[i]->dthp_ref++;
13953		}
13954
13955		hasprovs = 1;
13956	}
13957
13958	mutex_exit(&dtrace_lock);
13959
13960	if (hasprovs)
13961		dtrace_helper_provider_register(to, newhelp, NULL);
13962}
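
/*
 * The fork path above mixes two duplication strategies:  helper
 * actions are deep-copied (every DIFO is duplicated into the child's
 * vstate), while helper providers are shared by reference -- the child
 * takes a dthp_ref hold, and dtrace_helper_provider_destroy() frees
 * the underlying DOF only when the last hold is dropped.
 * Schematically (illustrative names):
 *
 *	child->provs[i] = parent->provs[i];		(shared)
 *	child->provs[i]->dthp_ref++;
 *
 *	child_action = deep_copy(parent_action);	(private)
 */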
13963
13964/*
13965 * DTrace Hook Functions
13966 */
13967static void
13968dtrace_module_loaded(struct modctl *ctl)
13969{
13970	dtrace_provider_t *prv;
13971
13972	mutex_enter(&dtrace_provider_lock);
13973	mutex_enter(&mod_lock);
13974
13975	ASSERT(ctl->mod_busy);
13976
13977	/*
13978	 * We're going to call each provider's per-module provide operation
13979	 * specifying only this module.
13980	 */
13981	for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next)
13982		prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
13983
13984	mutex_exit(&mod_lock);
13985	mutex_exit(&dtrace_provider_lock);
13986
13987	/*
13988	 * If we have any retained enablings, we need to match against them.
13989	 * Enabling probes requires that cpu_lock be held, and we cannot hold
13990	 * cpu_lock here -- it is legal for cpu_lock to be held when loading a
13991	 * module.  (In particular, this happens when loading scheduling
13992	 * classes.)  So if we have any retained enablings, we need to dispatch
13993	 * our task queue to do the match for us.
13994	 */
13995	mutex_enter(&dtrace_lock);
13996
13997	if (dtrace_retained == NULL) {
13998		mutex_exit(&dtrace_lock);
13999		return;
14000	}
14001
14002	(void) taskq_dispatch(dtrace_taskq,
14003	    (task_func_t *)dtrace_enabling_matchall, NULL, TQ_SLEEP);
14004
14005	mutex_exit(&dtrace_lock);
14006
14007	/*
14008	 * And now, for a little heuristic sleaze:  in general, we want to
14009	 * match modules as soon as they load.  However, we cannot guarantee
14010	 * this, because it would lead us to the lock ordering violation
14011	 * outlined above.  The common case, of course, is that cpu_lock is
14012	 * _not_ held -- so we delay here for a clock tick, hoping that that's
14013	 * long enough for the task queue to do its work.  If it's not, it's
14014	 * not a serious problem -- it just means that the module that we
14015	 * just loaded may not be immediately instrumentable.
14016	 */
14017	delay(1);
14018}
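
/*
 * Condensing the rationale above:  matching retained enablings needs
 * cpu_lock, but the loading thread may already hold it, so matching
 * inline could invert the lock order.  Handing the match to a taskq
 * thread lets that thread take the locks front to back:
 *
 *	(module load thread)	may hold cpu_lock; must not block on it
 *	(taskq thread)		cpu_lock -> dtrace_lock -> match
 *
 * The delay(1) merely gives the taskq a tick to run in the common case
 * in which cpu_lock was never held.
 */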
14019
14020static void
14021dtrace_module_unloaded(struct modctl *ctl)
14022{
14023	dtrace_probe_t template, *probe, *first, *next;
14024	dtrace_provider_t *prov;
14025
14026	template.dtpr_mod = ctl->mod_modname;
14027
14028	mutex_enter(&dtrace_provider_lock);
14029	mutex_enter(&mod_lock);
14030	mutex_enter(&dtrace_lock);
14031
14032	if (dtrace_bymod == NULL) {
14033		/*
14034		 * The DTrace module is loaded (obviously) but not attached;
14035		 * we don't have any work to do.
14036		 */
14037		mutex_exit(&dtrace_provider_lock);
14038		mutex_exit(&mod_lock);
14039		mutex_exit(&dtrace_lock);
14040		return;
14041	}
14042
14043	for (probe = first = dtrace_hash_lookup(dtrace_bymod, &template);
14044	    probe != NULL; probe = probe->dtpr_nextmod) {
14045		if (probe->dtpr_ecb != NULL) {
14046			mutex_exit(&dtrace_provider_lock);
14047			mutex_exit(&mod_lock);
14048			mutex_exit(&dtrace_lock);
14049
14050			/*
14051			 * This shouldn't _actually_ be possible -- we're
14052			 * unloading a module that has an enabled probe in it.
14053			 * (It's normally up to the provider to make sure that
14054			 * this can't happen.)  However, because dtps_enable()
14055			 * doesn't have a failure mode, there can be an
14056			 * enable/unload race.  Upshot:  we don't want to
14057			 * assert, but we're not going to disable the
14058			 * probe, either.
14059			 */
14060			if (dtrace_err_verbose) {
14061				cmn_err(CE_WARN, "unloaded module '%s' had "
14062				    "enabled probes", ctl->mod_modname);
14063			}
14064
14065			return;
14066		}
14067	}
14068
14069	probe = first;
14070
14071	for (first = NULL; probe != NULL; probe = next) {
14072		ASSERT(dtrace_probes[probe->dtpr_id - 1] == probe);
14073
14074		dtrace_probes[probe->dtpr_id - 1] = NULL;
14075
14076		next = probe->dtpr_nextmod;
14077		dtrace_hash_remove(dtrace_bymod, probe);
14078		dtrace_hash_remove(dtrace_byfunc, probe);
14079		dtrace_hash_remove(dtrace_byname, probe);
14080
14081		if (first == NULL) {
14082			first = probe;
14083			probe->dtpr_nextmod = NULL;
14084		} else {
14085			probe->dtpr_nextmod = first;
14086			first = probe;
14087		}
14088	}
14089
14090	/*
14091	 * We've removed all of the module's probes from the hash chains and
14092	 * from the probe array.  Now issue a dtrace_sync() to be sure that
14093	 * everyone has cleared out from any probe array processing.
14094	 */
14095	dtrace_sync();
14096
14097	for (probe = first; probe != NULL; probe = first) {
14098		first = probe->dtpr_nextmod;
14099		prov = probe->dtpr_provider;
14100		prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, probe->dtpr_id,
14101		    probe->dtpr_arg);
14102		kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
14103		kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
14104		kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
14105		vmem_free(dtrace_arena, (void *)(uintptr_t)probe->dtpr_id, 1);
14106		kmem_free(probe, sizeof (dtrace_probe_t));
14107	}
14108
14109	mutex_exit(&dtrace_lock);
14110	mutex_exit(&mod_lock);
14111	mutex_exit(&dtrace_provider_lock);
14112}
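
/*
 * The teardown above is the canonical DTrace quiesce sequence:  make
 * each probe unreachable (hash chains and dtrace_probes[]) before
 * dtrace_sync(), and free only afterward, once no CPU can still be in
 * probe context referencing it:
 *
 *	unlink(probe);		(invisible to new lookups)
 *	dtrace_sync();		(wait out in-flight probe context)
 *	free(probe);		(provably unreferenced)
 */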
14113
14114void
14115dtrace_suspend(void)
14116{
14117	dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_suspend));
14118}
14119
14120void
14121dtrace_resume(void)
14122{
14123	dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_resume));
14124}
14125
14126static int
14127dtrace_cpu_setup(cpu_setup_t what, processorid_t cpu)
14128{
14129	ASSERT(MUTEX_HELD(&cpu_lock));
14130	mutex_enter(&dtrace_lock);
14131
14132	switch (what) {
14133	case CPU_CONFIG: {
14134		dtrace_state_t *state;
14135		dtrace_optval_t *opt, rs, c;
14136
14137		/*
14138		 * For now, we only allocate a new buffer for anonymous state.
14139		 */
14140		if ((state = dtrace_anon.dta_state) == NULL)
14141			break;
14142
14143		if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
14144			break;
14145
14146		opt = state->dts_options;
14147		c = opt[DTRACEOPT_CPU];
14148
14149		if (c != DTRACE_CPUALL && c != DTRACEOPT_UNSET && c != cpu)
14150			break;
14151
14152		/*
14153		 * Regardless of what the actual policy is, we're going to
14154		 * temporarily set our resize policy to be manual.  We're
14155		 * also going to temporarily set our CPU option to denote
14156		 * the newly configured CPU.
14157		 */
14158		rs = opt[DTRACEOPT_BUFRESIZE];
14159		opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_MANUAL;
14160		opt[DTRACEOPT_CPU] = (dtrace_optval_t)cpu;
14161
14162		(void) dtrace_state_buffers(state);
14163
14164		opt[DTRACEOPT_BUFRESIZE] = rs;
14165		opt[DTRACEOPT_CPU] = c;
14166
14167		break;
14168	}
14169
14170	case CPU_UNCONFIG:
14171		/*
14172		 * We don't free the buffer in the CPU_UNCONFIG case.  (The
14173		 * buffer will be freed when the consumer exits.)
14174		 */
14175		break;
14176
14177	default:
14178		break;
14179	}
14180
14181	mutex_exit(&dtrace_lock);
14182	return (0);
14183}
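
/*
 * The CPU_CONFIG arm above is a save/override/restore:  the buffer
 * resize policy and CPU option are temporarily forced so that
 * dtrace_state_buffers() allocates for exactly the new CPU, and the
 * caller-visible options are then restored:
 *
 *	saved = opt[X];
 *	opt[X] = temporary;
 *	(void) do_work(state);
 *	opt[X] = saved;
 */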
14184
14185static void
14186dtrace_cpu_setup_initial(processorid_t cpu)
14187{
14188	(void) dtrace_cpu_setup(CPU_CONFIG, cpu);
14189}
14190
14191static void
14192dtrace_toxrange_add(uintptr_t base, uintptr_t limit)
14193{
14194	if (dtrace_toxranges >= dtrace_toxranges_max) {
14195		int osize, nsize;
14196		dtrace_toxrange_t *range;
14197
14198		osize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
14199
14200		if (osize == 0) {
14201			ASSERT(dtrace_toxrange == NULL);
14202			ASSERT(dtrace_toxranges_max == 0);
14203			dtrace_toxranges_max = 1;
14204		} else {
14205			dtrace_toxranges_max <<= 1;
14206		}
14207
14208		nsize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
14209		range = kmem_zalloc(nsize, KM_SLEEP);
14210
14211		if (dtrace_toxrange != NULL) {
14212			ASSERT(osize != 0);
14213			bcopy(dtrace_toxrange, range, osize);
14214			kmem_free(dtrace_toxrange, osize);
14215		}
14216
14217		dtrace_toxrange = range;
14218	}
14219
14220	ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_base == NULL);
14221	ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_limit == NULL);
14222
14223	dtrace_toxrange[dtrace_toxranges].dtt_base = base;
14224	dtrace_toxrange[dtrace_toxranges].dtt_limit = limit;
14225	dtrace_toxranges++;
14226}
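
/*
 * Probe context consults this table before dereferencing kernel
 * addresses; a load from [addr, addr + size) is refused if it
 * intersects any registered toxic range.  The overlap test, in generic
 * form (a sketch, not the exact probe-context code):
 *
 *	for (i = 0; i < dtrace_toxranges; i++) {
 *		if (addr < dtrace_toxrange[i].dtt_limit &&
 *		    addr + size > dtrace_toxrange[i].dtt_base)
 *			return (0);	(overlaps a toxic range)
 *	}
 *	return (1);
 */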
14227
14228/*
14229 * DTrace Driver Cookbook Functions
14230 */
14231/*ARGSUSED*/
14232static int
14233dtrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
14234{
14235	dtrace_provider_id_t id;
14236	dtrace_state_t *state = NULL;
14237	dtrace_enabling_t *enab;
14238
14239	mutex_enter(&cpu_lock);
14240	mutex_enter(&dtrace_provider_lock);
14241	mutex_enter(&dtrace_lock);
14242
14243	if (ddi_soft_state_init(&dtrace_softstate,
14244	    sizeof (dtrace_state_t), 0) != 0) {
14245		cmn_err(CE_NOTE, "/dev/dtrace failed to initialize soft state");
14246		mutex_exit(&cpu_lock);
14247		mutex_exit(&dtrace_provider_lock);
14248		mutex_exit(&dtrace_lock);
14249		return (DDI_FAILURE);
14250	}
14251
14252	if (ddi_create_minor_node(devi, DTRACEMNR_DTRACE, S_IFCHR,
14253	    DTRACEMNRN_DTRACE, DDI_PSEUDO, NULL) == DDI_FAILURE ||
14254	    ddi_create_minor_node(devi, DTRACEMNR_HELPER, S_IFCHR,
14255	    DTRACEMNRN_HELPER, DDI_PSEUDO, NULL) == DDI_FAILURE) {
14256		cmn_err(CE_NOTE, "/dev/dtrace couldn't create minor nodes");
14257		ddi_remove_minor_node(devi, NULL);
14258		ddi_soft_state_fini(&dtrace_softstate);
14259		mutex_exit(&cpu_lock);
14260		mutex_exit(&dtrace_provider_lock);
14261		mutex_exit(&dtrace_lock);
14262		return (DDI_FAILURE);
14263	}
14264
14265	ddi_report_dev(devi);
14266	dtrace_devi = devi;
14267
14268	dtrace_modload = dtrace_module_loaded;
14269	dtrace_modunload = dtrace_module_unloaded;
14270	dtrace_cpu_init = dtrace_cpu_setup_initial;
14271	dtrace_helpers_cleanup = dtrace_helpers_destroy;
14272	dtrace_helpers_fork = dtrace_helpers_duplicate;
14273	dtrace_cpustart_init = dtrace_suspend;
14274	dtrace_cpustart_fini = dtrace_resume;
14275	dtrace_debugger_init = dtrace_suspend;
14276	dtrace_debugger_fini = dtrace_resume;
14277
14278	register_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
14279
14280	ASSERT(MUTEX_HELD(&cpu_lock));
14281
14282	dtrace_arena = vmem_create("dtrace", (void *)1, UINT32_MAX, 1,
14283	    NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
14284	dtrace_minor = vmem_create("dtrace_minor", (void *)DTRACEMNRN_CLONE,
14285	    UINT32_MAX - DTRACEMNRN_CLONE, 1, NULL, NULL, NULL, 0,
14286	    VM_SLEEP | VMC_IDENTIFIER);
14287	dtrace_taskq = taskq_create("dtrace_taskq", 1, maxclsyspri,
14288	    1, INT_MAX, 0);
14289
14290	dtrace_state_cache = kmem_cache_create("dtrace_state_cache",
14291	    sizeof (dtrace_dstate_percpu_t) * NCPU, DTRACE_STATE_ALIGN,
14292	    NULL, NULL, NULL, NULL, NULL, 0);
14293
14294	ASSERT(MUTEX_HELD(&cpu_lock));
14295	dtrace_bymod = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_mod),
14296	    offsetof(dtrace_probe_t, dtpr_nextmod),
14297	    offsetof(dtrace_probe_t, dtpr_prevmod));
14298
14299	dtrace_byfunc = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_func),
14300	    offsetof(dtrace_probe_t, dtpr_nextfunc),
14301	    offsetof(dtrace_probe_t, dtpr_prevfunc));
14302
14303	dtrace_byname = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_name),
14304	    offsetof(dtrace_probe_t, dtpr_nextname),
14305	    offsetof(dtrace_probe_t, dtpr_prevname));
14306
14307	if (dtrace_retain_max < 1) {
14308		cmn_err(CE_WARN, "illegal value (%lu) for dtrace_retain_max; "
14309		    "setting to 1", dtrace_retain_max);
14310		dtrace_retain_max = 1;
14311	}
14312
14313	/*
14314	 * Now discover our toxic ranges.
14315	 */
14316	dtrace_toxic_ranges(dtrace_toxrange_add);
14317
14318	/*
14319	 * Before we register ourselves as a provider to our own framework,
14320	 * we would like to assert that dtrace_provider is NULL -- but that's
14321	 * not true if we were loaded as a dependency of a DTrace provider.
14322	 * Once we've registered, we can assert that dtrace_provider is our
14323	 * pseudo provider.
14324	 */
14325	(void) dtrace_register("dtrace", &dtrace_provider_attr,
14326	    DTRACE_PRIV_NONE, 0, &dtrace_provider_ops, NULL, &id);
14327
14328	ASSERT(dtrace_provider != NULL);
14329	ASSERT((dtrace_provider_id_t)dtrace_provider == id);
14330
14331	dtrace_probeid_begin = dtrace_probe_create((dtrace_provider_id_t)
14332	    dtrace_provider, NULL, NULL, "BEGIN", 0, NULL);
14333	dtrace_probeid_end = dtrace_probe_create((dtrace_provider_id_t)
14334	    dtrace_provider, NULL, NULL, "END", 0, NULL);
14335	dtrace_probeid_error = dtrace_probe_create((dtrace_provider_id_t)
14336	    dtrace_provider, NULL, NULL, "ERROR", 1, NULL);
14337
14338	dtrace_anon_property();
14339	mutex_exit(&cpu_lock);
14340
14341	/*
14342	 * If DTrace helper tracing is enabled, we need to allocate the
14343	 * trace buffer and initialize the values.
14344	 */
14345	if (dtrace_helptrace_enabled) {
14346		ASSERT(dtrace_helptrace_buffer == NULL);
14347		dtrace_helptrace_buffer =
14348		    kmem_zalloc(dtrace_helptrace_bufsize, KM_SLEEP);
14349		dtrace_helptrace_next = 0;
14350	}
14351
14352	/*
14353	 * If there are already providers, we must ask them to provide their
14354	 * probes, and then match any anonymous enabling against them.  Note
14355	 * that there should be no other retained enablings at this time:
14356	 * the only retained enabling should be the anonymous
14357	 * enabling.
14358	 */
14359	if (dtrace_anon.dta_enabling != NULL) {
14360		ASSERT(dtrace_retained == dtrace_anon.dta_enabling);
14361
14362		dtrace_enabling_provide(NULL);
14363		state = dtrace_anon.dta_state;
14364
14365		/*
14366		 * We couldn't hold cpu_lock across the above call to
14367		 * dtrace_enabling_provide(), but we must hold it to actually
14368		 * enable the probes.  We have to drop all of our locks, pick
14369		 * up cpu_lock, and regain our locks before matching the
14370		 * retained anonymous enabling.
14371		 */
14372		mutex_exit(&dtrace_lock);
14373		mutex_exit(&dtrace_provider_lock);
14374
14375		mutex_enter(&cpu_lock);
14376		mutex_enter(&dtrace_provider_lock);
14377		mutex_enter(&dtrace_lock);
14378
14379		if ((enab = dtrace_anon.dta_enabling) != NULL)
14380			(void) dtrace_enabling_match(enab, NULL);
14381
14382		mutex_exit(&cpu_lock);
14383	}
14384
14385	mutex_exit(&dtrace_lock);
14386	mutex_exit(&dtrace_provider_lock);
14387
14388	if (state != NULL) {
14389		/*
14390		 * If we created any anonymous state, set it going now.
14391		 */
14392		(void) dtrace_state_go(state, &dtrace_anon.dta_beganon);
14393	}
14394
14395	return (DDI_SUCCESS);
14396}
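
/*
 * The lock juggle near the end of dtrace_attach() follows from the
 * required ordering:  cpu_lock before dtrace_provider_lock before
 * dtrace_lock.  Once cpu_lock has been dropped, the only legal way to
 * get it back is to release everything and reacquire front to back:
 *
 *	mutex_exit(&dtrace_lock);
 *	mutex_exit(&dtrace_provider_lock);
 *	mutex_enter(&cpu_lock);
 *	mutex_enter(&dtrace_provider_lock);
 *	mutex_enter(&dtrace_lock);
 *
 * State may change while no locks are held, which is why the anonymous
 * enabling is re-fetched before it is matched.
 */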
14397
14398/*ARGSUSED*/
14399static int
14400dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
14401{
14402	dtrace_state_t *state;
14403	uint32_t priv;
14404	uid_t uid;
14405	zoneid_t zoneid;
14406
14407	if (getminor(*devp) == DTRACEMNRN_HELPER)
14408		return (0);
14409
14410	/*
14411	 * If this wasn't an open with the "helper" minor, then it must be
14412	 * the "dtrace" minor.
14413	 */
14414	if (getminor(*devp) != DTRACEMNRN_DTRACE)
14415		return (ENXIO);
14416
14417	/*
14418	 * If no DTRACE_PRIV_* bits are set in the credential, then the
14419	 * caller lacks sufficient permission to do anything with DTrace.
14420	 */
14421	dtrace_cred2priv(cred_p, &priv, &uid, &zoneid);
14422	if (priv == DTRACE_PRIV_NONE)
14423		return (EACCES);
14424
14425	/*
14426	 * Ask all providers to provide all their probes.
14427	 */
14428	mutex_enter(&dtrace_provider_lock);
14429	dtrace_probe_provide(NULL, NULL);
14430	mutex_exit(&dtrace_provider_lock);
14431
14432	mutex_enter(&cpu_lock);
14433	mutex_enter(&dtrace_lock);
14434	dtrace_opens++;
14435	dtrace_membar_producer();
14436
14437	/*
14438	 * If the kernel debugger is active (that is, if the kernel debugger
14439	 * modified text in some way), we won't allow the open.
14440	 */
14441	if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
14442		dtrace_opens--;
14443		mutex_exit(&cpu_lock);
14444		mutex_exit(&dtrace_lock);
14445		return (EBUSY);
14446	}
14447
14448	state = dtrace_state_create(devp, cred_p);
14449	mutex_exit(&cpu_lock);
14450
14451	if (state == NULL) {
14452		if (--dtrace_opens == 0)
14453			(void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
14454		mutex_exit(&dtrace_lock);
14455		return (EAGAIN);
14456	}
14457
14458	mutex_exit(&dtrace_lock);
14459
14460	return (0);
14461}
14462
14463/*ARGSUSED*/
14464static int
14465dtrace_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
14466{
14467	minor_t minor = getminor(dev);
14468	dtrace_state_t *state;
14469
14470	if (minor == DTRACEMNRN_HELPER)
14471		return (0);
14472
14473	state = ddi_get_soft_state(dtrace_softstate, minor);
14474
14475	mutex_enter(&cpu_lock);
14476	mutex_enter(&dtrace_lock);
14477
14478	if (state->dts_anon) {
14479		/*
14480		 * There is anonymous state. Destroy that first.
14481		 */
14482		ASSERT(dtrace_anon.dta_state == NULL);
14483		dtrace_state_destroy(state->dts_anon);
14484	}
14485
14486	dtrace_state_destroy(state);
14487	ASSERT(dtrace_opens > 0);
14488	if (--dtrace_opens == 0)
14489		(void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
14490
14491	mutex_exit(&dtrace_lock);
14492	mutex_exit(&cpu_lock);
14493
14494	return (0);
14495}
14496
14497/*ARGSUSED*/
14498static int
14499dtrace_ioctl_helper(int cmd, intptr_t arg, int *rv)
14500{
14501	int rval;
14502	dof_helper_t help, *dhp = NULL;
14503
14504	switch (cmd) {
14505	case DTRACEHIOC_ADDDOF:
14506		if (copyin((void *)arg, &help, sizeof (help)) != 0) {
14507			dtrace_dof_error(NULL, "failed to copyin DOF helper");
14508			return (EFAULT);
14509		}
14510
14511		dhp = &help;
14512		arg = (intptr_t)help.dofhp_dof;
14513		/*FALLTHROUGH*/
14514
14515	case DTRACEHIOC_ADD: {
14516		dof_hdr_t *dof = dtrace_dof_copyin(arg, &rval);
14517
14518		if (dof == NULL)
14519			return (rval);
14520
14521		mutex_enter(&dtrace_lock);
14522
14523		/*
14524		 * dtrace_helper_slurp() takes responsibility for the dof --
14525		 * it may free it now or it may save it and free it later.
14526		 */
14527		if ((rval = dtrace_helper_slurp(dof, dhp)) != -1) {
14528			*rv = rval;
14529			rval = 0;
14530		} else {
14531			rval = EINVAL;
14532		}
14533
14534		mutex_exit(&dtrace_lock);
14535		return (rval);
14536	}
14537
14538	case DTRACEHIOC_REMOVE: {
14539		mutex_enter(&dtrace_lock);
14540		rval = dtrace_helper_destroygen(arg);
14541		mutex_exit(&dtrace_lock);
14542
14543		return (rval);
14544	}
14545
14546	default:
14547		break;
14548	}
14549
14550	return (ENOTTY);
14551}
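
/*
 * A sketch of the consumer side of this interface (assumed usage, not
 * code from this file):  a process loads helper DOF through the helper
 * minor and may later remove the generation it got back:
 *
 *	fd = open("/dev/dtrace/helper", O_RDWR);
 *	gen = ioctl(fd, DTRACEHIOC_ADDDOF, &dh);	(dh: a dof_helper_t)
 *	...
 *	(void) ioctl(fd, DTRACEHIOC_REMOVE, gen);
 *
 * In practice the USDT startup code loads helper DOF this way on
 * behalf of processes with USDT providers or ustack helpers.
 */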
14552
14553/*ARGSUSED*/
14554static int
14555dtrace_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv)
14556{
14557	minor_t minor = getminor(dev);
14558	dtrace_state_t *state;
14559	int rval;
14560
14561	if (minor == DTRACEMNRN_HELPER)
14562		return (dtrace_ioctl_helper(cmd, arg, rv));
14563
14564	state = ddi_get_soft_state(dtrace_softstate, minor);
14565
14566	if (state->dts_anon) {
14567		ASSERT(dtrace_anon.dta_state == NULL);
14568		state = state->dts_anon;
14569	}
14570
14571	switch (cmd) {
14572	case DTRACEIOC_PROVIDER: {
14573		dtrace_providerdesc_t pvd;
14574		dtrace_provider_t *pvp;
14575
14576		if (copyin((void *)arg, &pvd, sizeof (pvd)) != 0)
14577			return (EFAULT);
14578
14579		pvd.dtvd_name[DTRACE_PROVNAMELEN - 1] = '\0';
14580		mutex_enter(&dtrace_provider_lock);
14581
14582		for (pvp = dtrace_provider; pvp != NULL; pvp = pvp->dtpv_next) {
14583			if (strcmp(pvp->dtpv_name, pvd.dtvd_name) == 0)
14584				break;
14585		}
14586
14587		mutex_exit(&dtrace_provider_lock);
14588
14589		if (pvp == NULL)
14590			return (ESRCH);
14591
14592		bcopy(&pvp->dtpv_priv, &pvd.dtvd_priv, sizeof (dtrace_ppriv_t));
14593		bcopy(&pvp->dtpv_attr, &pvd.dtvd_attr, sizeof (dtrace_pattr_t));
14594		if (copyout(&pvd, (void *)arg, sizeof (pvd)) != 0)
14595			return (EFAULT);
14596
14597		return (0);
14598	}
14599
14600	case DTRACEIOC_EPROBE: {
14601		dtrace_eprobedesc_t epdesc;
14602		dtrace_ecb_t *ecb;
14603		dtrace_action_t *act;
14604		void *buf;
14605		size_t size;
14606		uintptr_t dest;
14607		int nrecs;
14608
14609		if (copyin((void *)arg, &epdesc, sizeof (epdesc)) != 0)
14610			return (EFAULT);
14611
14612		mutex_enter(&dtrace_lock);
14613
14614		if ((ecb = dtrace_epid2ecb(state, epdesc.dtepd_epid)) == NULL) {
14615			mutex_exit(&dtrace_lock);
14616			return (EINVAL);
14617		}
14618
14619		if (ecb->dte_probe == NULL) {
14620			mutex_exit(&dtrace_lock);
14621			return (EINVAL);
14622		}
14623
14624		epdesc.dtepd_probeid = ecb->dte_probe->dtpr_id;
14625		epdesc.dtepd_uarg = ecb->dte_uarg;
14626		epdesc.dtepd_size = ecb->dte_size;
14627
14628		nrecs = epdesc.dtepd_nrecs;
14629		epdesc.dtepd_nrecs = 0;
14630		for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
14631			if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
14632				continue;
14633
14634			epdesc.dtepd_nrecs++;
14635		}
14636
14637		/*
14638		 * Now that we have the size, we need to allocate a temporary
14639		 * buffer in which to store the complete description.  We need
14640		 * the temporary buffer to be able to drop dtrace_lock
14641		 * across the copyout(), below.
14642		 */
14643		size = sizeof (dtrace_eprobedesc_t) +
14644		    (epdesc.dtepd_nrecs * sizeof (dtrace_recdesc_t));
14645
14646		buf = kmem_alloc(size, KM_SLEEP);
14647		dest = (uintptr_t)buf;
14648
14649		bcopy(&epdesc, (void *)dest, sizeof (epdesc));
14650		dest += offsetof(dtrace_eprobedesc_t, dtepd_rec[0]);
14651
14652		for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
14653			if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
14654				continue;
14655
14656			if (nrecs-- == 0)
14657				break;
14658
14659			bcopy(&act->dta_rec, (void *)dest,
14660			    sizeof (dtrace_recdesc_t));
14661			dest += sizeof (dtrace_recdesc_t);
14662		}
14663
14664		mutex_exit(&dtrace_lock);
14665
14666		if (copyout(buf, (void *)arg, dest - (uintptr_t)buf) != 0) {
14667			kmem_free(buf, size);
14668			return (EFAULT);
14669		}
14670
14671		kmem_free(buf, size);
14672		return (0);
14673	}
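
	/*
	 * This case and DTRACEIOC_AGGDESC below share one shape:  size
	 * and marshal the description into a private buffer under
	 * dtrace_lock, then drop the lock before copyout(), which may
	 * fault and sleep.  Schematically:
	 *
	 *	mutex_enter(&dtrace_lock);
	 *	buf = kmem_alloc(size, KM_SLEEP);
	 *	fill(buf);
	 *	mutex_exit(&dtrace_lock);
	 *	err = copyout(buf, uaddr, size);
	 *	kmem_free(buf, size);
	 */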
14674
14675	case DTRACEIOC_AGGDESC: {
14676		dtrace_aggdesc_t aggdesc;
14677		dtrace_action_t *act;
14678		dtrace_aggregation_t *agg;
14679		int nrecs;
14680		uint32_t offs;
14681		dtrace_recdesc_t *lrec;
14682		void *buf;
14683		size_t size;
14684		uintptr_t dest;
14685
14686		if (copyin((void *)arg, &aggdesc, sizeof (aggdesc)) != 0)
14687			return (EFAULT);
14688
14689		mutex_enter(&dtrace_lock);
14690
14691		if ((agg = dtrace_aggid2agg(state, aggdesc.dtagd_id)) == NULL) {
14692			mutex_exit(&dtrace_lock);
14693			return (EINVAL);
14694		}
14695
14696		aggdesc.dtagd_epid = agg->dtag_ecb->dte_epid;
14697
14698		nrecs = aggdesc.dtagd_nrecs;
14699		aggdesc.dtagd_nrecs = 0;
14700
14701		offs = agg->dtag_base;
14702		lrec = &agg->dtag_action.dta_rec;
14703		aggdesc.dtagd_size = lrec->dtrd_offset + lrec->dtrd_size - offs;
14704
14705		for (act = agg->dtag_first; ; act = act->dta_next) {
14706			ASSERT(act->dta_intuple ||
14707			    DTRACEACT_ISAGG(act->dta_kind));
14708
14709			/*
14710			 * If this action has a record size of zero, it
14711			 * denotes an argument to the aggregating action.
14712			 * Because the presence of this record doesn't (or
14713			 * shouldn't) affect the way the data is interpreted,
14714			 * we don't copy it out to save user-level the
14715			 * confusion of dealing with a zero-length record.
14716			 */
14717			if (act->dta_rec.dtrd_size == 0) {
14718				ASSERT(agg->dtag_hasarg);
14719				continue;
14720			}
14721
14722			aggdesc.dtagd_nrecs++;
14723
14724			if (act == &agg->dtag_action)
14725				break;
14726		}
14727
14728		/*
14729		 * Now that we have the size, we need to allocate a temporary
14730		 * buffer in which to store the complete description.  We need
14731		 * the temporary buffer to be able to drop dtrace_lock
14732		 * across the copyout(), below.
14733		 */
14734		size = sizeof (dtrace_aggdesc_t) +
14735		    (aggdesc.dtagd_nrecs * sizeof (dtrace_recdesc_t));
14736
14737		buf = kmem_alloc(size, KM_SLEEP);
14738		dest = (uintptr_t)buf;
14739
14740		bcopy(&aggdesc, (void *)dest, sizeof (aggdesc));
14741		dest += offsetof(dtrace_aggdesc_t, dtagd_rec[0]);
14742
14743		for (act = agg->dtag_first; ; act = act->dta_next) {
14744			dtrace_recdesc_t rec = act->dta_rec;
14745
14746			/*
14747			 * See the comment in the above loop for why we pass
14748			 * over zero-length records.
14749			 */
14750			if (rec.dtrd_size == 0) {
14751				ASSERT(agg->dtag_hasarg);
14752				continue;
14753			}
14754
14755			if (nrecs-- == 0)
14756				break;
14757
14758			rec.dtrd_offset -= offs;
14759			bcopy(&rec, (void *)dest, sizeof (rec));
14760			dest += sizeof (dtrace_recdesc_t);
14761
14762			if (act == &agg->dtag_action)
14763				break;
14764		}
14765
14766		mutex_exit(&dtrace_lock);
14767
14768		if (copyout(buf, (void *)arg, dest - (uintptr_t)buf) != 0) {
14769			kmem_free(buf, size);
14770			return (EFAULT);
14771		}
14772
14773		kmem_free(buf, size);
14774		return (0);
14775	}
14776
14777	case DTRACEIOC_ENABLE: {
14778		dof_hdr_t *dof;
14779		dtrace_enabling_t *enab = NULL;
14780		dtrace_vstate_t *vstate;
14781		int err = 0;
14782
14783		*rv = 0;
14784
14785		/*
14786		 * If a NULL argument has been passed, we take this as our
14787		 * cue to reevaluate our enablings.
14788		 */
14789		if (arg == NULL) {
14790			dtrace_enabling_matchall();
14791
14792			return (0);
14793		}
14794
14795		if ((dof = dtrace_dof_copyin(arg, &rval)) == NULL)
14796			return (rval);
14797
14798		mutex_enter(&cpu_lock);
14799		mutex_enter(&dtrace_lock);
14800		vstate = &state->dts_vstate;
14801
14802		if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
14803			mutex_exit(&dtrace_lock);
14804			mutex_exit(&cpu_lock);
14805			dtrace_dof_destroy(dof);
14806			return (EBUSY);
14807		}
14808
14809		if (dtrace_dof_slurp(dof, vstate, cr, &enab, 0, B_TRUE) != 0) {
14810			mutex_exit(&dtrace_lock);
14811			mutex_exit(&cpu_lock);
14812			dtrace_dof_destroy(dof);
14813			return (EINVAL);
14814		}
14815
14816		if ((rval = dtrace_dof_options(dof, state)) != 0) {
14817			dtrace_enabling_destroy(enab);
14818			mutex_exit(&dtrace_lock);
14819			mutex_exit(&cpu_lock);
14820			dtrace_dof_destroy(dof);
14821			return (rval);
14822		}
14823
14824		if ((err = dtrace_enabling_match(enab, rv)) == 0) {
14825			err = dtrace_enabling_retain(enab);
14826		} else {
14827			dtrace_enabling_destroy(enab);
14828		}
14829
14830		mutex_exit(&cpu_lock);
14831		mutex_exit(&dtrace_lock);
14832		dtrace_dof_destroy(dof);
14833
14834		return (err);
14835	}
14836
14837	case DTRACEIOC_REPLICATE: {
14838		dtrace_repldesc_t desc;
14839		dtrace_probedesc_t *match = &desc.dtrpd_match;
14840		dtrace_probedesc_t *create = &desc.dtrpd_create;
14841		int err;
14842
14843		if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
14844			return (EFAULT);
14845
14846		match->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
14847		match->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
14848		match->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
14849		match->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
14850
14851		create->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
14852		create->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
14853		create->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
14854		create->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
14855
14856		mutex_enter(&dtrace_lock);
14857		err = dtrace_enabling_replicate(state, match, create);
14858		mutex_exit(&dtrace_lock);
14859
14860		return (err);
14861	}
14862
14863	case DTRACEIOC_PROBEMATCH:
14864	case DTRACEIOC_PROBES: {
14865		dtrace_probe_t *probe = NULL;
14866		dtrace_probedesc_t desc;
14867		dtrace_probekey_t pkey;
14868		dtrace_id_t i;
14869		int m = 0;
14870		uint32_t priv;
14871		uid_t uid;
14872		zoneid_t zoneid;
14873
14874		if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
14875			return (EFAULT);
14876
14877		desc.dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
14878		desc.dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
14879		desc.dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
14880		desc.dtpd_name[DTRACE_NAMELEN - 1] = '\0';
14881
14882		/*
14883		 * Before we attempt to match this probe, we want to give
14884		 * all providers the opportunity to provide it.
14885		 */
14886		if (desc.dtpd_id == DTRACE_IDNONE) {
14887			mutex_enter(&dtrace_provider_lock);
14888			dtrace_probe_provide(&desc, NULL);
14889			mutex_exit(&dtrace_provider_lock);
14890			desc.dtpd_id++;
14891		}
14892
14893		if (cmd == DTRACEIOC_PROBEMATCH)  {
14894			dtrace_probekey(&desc, &pkey);
14895			pkey.dtpk_id = DTRACE_IDNONE;
14896		}
14897
14898		dtrace_cred2priv(cr, &priv, &uid, &zoneid);
14899
14900		mutex_enter(&dtrace_lock);
14901
14902		if (cmd == DTRACEIOC_PROBEMATCH) {
14903			for (i = desc.dtpd_id; i <= dtrace_nprobes; i++) {
14904				if ((probe = dtrace_probes[i - 1]) != NULL &&
14905				    (m = dtrace_match_probe(probe, &pkey,
14906				    priv, uid, zoneid)) != 0)
14907					break;
14908			}
14909
14910			if (m < 0) {
14911				mutex_exit(&dtrace_lock);
14912				return (EINVAL);
14913			}
14914
14915		} else {
14916			for (i = desc.dtpd_id; i <= dtrace_nprobes; i++) {
14917				if ((probe = dtrace_probes[i - 1]) != NULL &&
14918				    dtrace_match_priv(probe, priv, uid, zoneid))
14919					break;
14920			}
14921		}
14922
14923		if (probe == NULL) {
14924			mutex_exit(&dtrace_lock);
14925			return (ESRCH);
14926		}
14927
14928		dtrace_probe_description(probe, &desc);
14929		mutex_exit(&dtrace_lock);
14930
14931		if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
14932			return (EFAULT);
14933
14934		return (0);
14935	}
14936
14937	case DTRACEIOC_PROBEARG: {
14938		dtrace_argdesc_t desc;
14939		dtrace_probe_t *probe;
14940		dtrace_provider_t *prov;
14941
14942		if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
14943			return (EFAULT);
14944
14945		if (desc.dtargd_id == DTRACE_IDNONE)
14946			return (EINVAL);
14947
14948		if (desc.dtargd_ndx == DTRACE_ARGNONE)
14949			return (EINVAL);
14950
14951		mutex_enter(&dtrace_provider_lock);
14952		mutex_enter(&mod_lock);
14953		mutex_enter(&dtrace_lock);
14954
14955		if (desc.dtargd_id > dtrace_nprobes) {
14956			mutex_exit(&dtrace_lock);
14957			mutex_exit(&mod_lock);
14958			mutex_exit(&dtrace_provider_lock);
14959			return (EINVAL);
14960		}
14961
14962		if ((probe = dtrace_probes[desc.dtargd_id - 1]) == NULL) {
14963			mutex_exit(&dtrace_lock);
14964			mutex_exit(&mod_lock);
14965			mutex_exit(&dtrace_provider_lock);
14966			return (EINVAL);
14967		}
14968
14969		mutex_exit(&dtrace_lock);
14970
14971		prov = probe->dtpr_provider;
14972
14973		if (prov->dtpv_pops.dtps_getargdesc == NULL) {
14974			/*
14975			 * There isn't any typed information for this probe.
14976			 * Set the argument number to DTRACE_ARGNONE.
14977			 */
14978			desc.dtargd_ndx = DTRACE_ARGNONE;
14979		} else {
14980			desc.dtargd_native[0] = '\0';
14981			desc.dtargd_xlate[0] = '\0';
14982			desc.dtargd_mapping = desc.dtargd_ndx;
14983
14984			prov->dtpv_pops.dtps_getargdesc(prov->dtpv_arg,
14985			    probe->dtpr_id, probe->dtpr_arg, &desc);
14986		}
14987
14988		mutex_exit(&mod_lock);
14989		mutex_exit(&dtrace_provider_lock);
14990
14991		if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
14992			return (EFAULT);
14993
14994		return (0);
14995	}
14996
14997	case DTRACEIOC_GO: {
14998		processorid_t cpuid;
14999		rval = dtrace_state_go(state, &cpuid);
15000
15001		if (rval != 0)
15002			return (rval);
15003
15004		if (copyout(&cpuid, (void *)arg, sizeof (cpuid)) != 0)
15005			return (EFAULT);
15006
15007		return (0);
15008	}
15009
15010	case DTRACEIOC_STOP: {
15011		processorid_t cpuid;
15012
15013		mutex_enter(&dtrace_lock);
15014		rval = dtrace_state_stop(state, &cpuid);
15015		mutex_exit(&dtrace_lock);
15016
15017		if (rval != 0)
15018			return (rval);
15019
15020		if (copyout(&cpuid, (void *)arg, sizeof (cpuid)) != 0)
15021			return (EFAULT);
15022
15023		return (0);
15024	}
15025
15026	case DTRACEIOC_DOFGET: {
15027		dof_hdr_t hdr, *dof;
15028		uint64_t len;
15029
15030		if (copyin((void *)arg, &hdr, sizeof (hdr)) != 0)
15031			return (EFAULT);
15032
15033		mutex_enter(&dtrace_lock);
15034		dof = dtrace_dof_create(state);
15035		mutex_exit(&dtrace_lock);
15036
15037		len = MIN(hdr.dofh_loadsz, dof->dofh_loadsz);
15038		rval = copyout(dof, (void *)arg, len);
15039		dtrace_dof_destroy(dof);
15040
15041		return (rval == 0 ? 0 : EFAULT);
15042	}
15043
15044	case DTRACEIOC_AGGSNAP:
15045	case DTRACEIOC_BUFSNAP: {
15046		dtrace_bufdesc_t desc;
15047		caddr_t cached;
15048		dtrace_buffer_t *buf;
15049
15050		if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
15051			return (EFAULT);
15052
15053		if (desc.dtbd_cpu < 0 || desc.dtbd_cpu >= NCPU)
15054			return (EINVAL);
15055
15056		mutex_enter(&dtrace_lock);
15057
15058		if (cmd == DTRACEIOC_BUFSNAP) {
15059			buf = &state->dts_buffer[desc.dtbd_cpu];
15060		} else {
15061			buf = &state->dts_aggbuffer[desc.dtbd_cpu];
15062		}
15063
15064		if (buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL)) {
15065			size_t sz = buf->dtb_offset;
15066
15067			if (state->dts_activity != DTRACE_ACTIVITY_STOPPED) {
15068				mutex_exit(&dtrace_lock);
15069				return (EBUSY);
15070			}
15071
15072			/*
15073			 * If this buffer has already been consumed, we're
15074			 * going to indicate that there's nothing left here
15075			 * to consume.
15076			 */
15077			if (buf->dtb_flags & DTRACEBUF_CONSUMED) {
15078				mutex_exit(&dtrace_lock);
15079
15080				desc.dtbd_size = 0;
15081				desc.dtbd_drops = 0;
15082				desc.dtbd_errors = 0;
15083				desc.dtbd_oldest = 0;
15084				sz = sizeof (desc);
15085
15086				if (copyout(&desc, (void *)arg, sz) != 0)
15087					return (EFAULT);
15088
15089				return (0);
15090			}
15091
15092			/*
15093			 * If this is a ring buffer that has wrapped, we want
15094			 * to copy the whole thing out.
15095			 */
15096			if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
15097				dtrace_buffer_polish(buf);
15098				sz = buf->dtb_size;
15099			}
15100
15101			if (copyout(buf->dtb_tomax, desc.dtbd_data, sz) != 0) {
15102				mutex_exit(&dtrace_lock);
15103				return (EFAULT);
15104			}
15105
15106			desc.dtbd_size = sz;
15107			desc.dtbd_drops = buf->dtb_drops;
15108			desc.dtbd_errors = buf->dtb_errors;
15109			desc.dtbd_oldest = buf->dtb_xamot_offset;
15110
15111			mutex_exit(&dtrace_lock);
15112
15113			if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
15114				return (EFAULT);
15115
15116			buf->dtb_flags |= DTRACEBUF_CONSUMED;
15117
15118			return (0);
15119		}
15120
15121		if (buf->dtb_tomax == NULL) {
15122			ASSERT(buf->dtb_xamot == NULL);
15123			mutex_exit(&dtrace_lock);
15124			return (ENOENT);
15125		}
15126
15127		cached = buf->dtb_tomax;
15128		ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
15129
15130		dtrace_xcall(desc.dtbd_cpu,
15131		    (dtrace_xcall_t)dtrace_buffer_switch, buf);
15132
15133		state->dts_errors += buf->dtb_xamot_errors;
15134
15135		/*
15136		 * If the buffers did not actually switch, then the cross call
15137		 * did not take place -- presumably because the given CPU is
15138		 * not in the ready set.  If this is the case, we'll return
15139		 * ENOENT.
15140		 */
15141		if (buf->dtb_tomax == cached) {
15142			ASSERT(buf->dtb_xamot != cached);
15143			mutex_exit(&dtrace_lock);
15144			return (ENOENT);
15145		}
15146
15147		ASSERT(cached == buf->dtb_xamot);
15148
15149		/*
15150		 * We have our snapshot; now copy it out.
15151		 */
15152		if (copyout(buf->dtb_xamot, desc.dtbd_data,
15153		    buf->dtb_xamot_offset) != 0) {
15154			mutex_exit(&dtrace_lock);
15155			return (EFAULT);
15156		}
15157
15158		desc.dtbd_size = buf->dtb_xamot_offset;
15159		desc.dtbd_drops = buf->dtb_xamot_drops;
15160		desc.dtbd_errors = buf->dtb_xamot_errors;
15161		desc.dtbd_oldest = 0;
15162
15163		mutex_exit(&dtrace_lock);
15164
15165		/*
15166		 * Finally, copy out the buffer description.
15167		 */
15168		if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
15169			return (EFAULT);
15170
15171		return (0);
15172	}
15173
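	/*
	 * DTRACEIOC_CONF reports static framework configuration:  the DIF
	 * version, the number of integer and tuple registers, and the
	 * native CTF data model.
	 */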
15174	case DTRACEIOC_CONF: {
15175		dtrace_conf_t conf;
15176
15177		bzero(&conf, sizeof (conf));
15178		conf.dtc_difversion = DIF_VERSION;
15179		conf.dtc_difintregs = DIF_DIR_NREGS;
15180		conf.dtc_diftupregs = DIF_DTR_NREGS;
15181		conf.dtc_ctfmodel = CTF_MODEL_NATIVE;
15182
15183		if (copyout(&conf, (void *)arg, sizeof (conf)) != 0)
15184			return (EFAULT);
15185
15186		return (0);
15187	}
15188
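	/*
	 * DTRACEIOC_STATUS reports per-CPU drop and error counts.  Because
	 * consumers issue it periodically, it also serves as the keep-alive
	 * that prevents the deadman from concluding that the consumer has
	 * died.
	 */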
15189	case DTRACEIOC_STATUS: {
15190		dtrace_status_t stat;
15191		dtrace_dstate_t *dstate;
15192		int i, j;
15193		uint64_t nerrs;
15194
15195		/*
15196		 * See the comment in dtrace_state_deadman() for the reason
15197		 * for setting dts_laststatus to INT64_MAX before setting
15198		 * it to the correct value.
15199		 */
15200		state->dts_laststatus = INT64_MAX;
15201		dtrace_membar_producer();
15202		state->dts_laststatus = dtrace_gethrtime();
15203
15204		bzero(&stat, sizeof (stat));
15205
15206		mutex_enter(&dtrace_lock);
15207
15208		if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE) {
15209			mutex_exit(&dtrace_lock);
15210			return (ENOENT);
15211		}
15212
15213		if (state->dts_activity == DTRACE_ACTIVITY_DRAINING)
15214			stat.dtst_exiting = 1;
15215
15216		nerrs = state->dts_errors;
15217		dstate = &state->dts_vstate.dtvs_dynvars;
15218
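		/*
		 * Sum the dynamic variable drops, principal buffer errors
		 * and speculation drops across all CPUs.
		 */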
15219		for (i = 0; i < NCPU; i++) {
15220			dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[i];
15221
15222			stat.dtst_dyndrops += dcpu->dtdsc_drops;
15223			stat.dtst_dyndrops_dirty += dcpu->dtdsc_dirty_drops;
15224			stat.dtst_dyndrops_rinsing += dcpu->dtdsc_rinsing_drops;
15225
15226			if (state->dts_buffer[i].dtb_flags & DTRACEBUF_FULL)
15227				stat.dtst_filled++;
15228
15229			nerrs += state->dts_buffer[i].dtb_errors;
15230
15231			for (j = 0; j < state->dts_nspeculations; j++) {
15232				dtrace_speculation_t *spec;
15233				dtrace_buffer_t *buf;
15234
15235				spec = &state->dts_speculations[j];
15236				buf = &spec->dtsp_buffer[i];
15237				stat.dtst_specdrops += buf->dtb_xamot_drops;
15238			}
15239		}
15240
15241		stat.dtst_specdrops_busy = state->dts_speculations_busy;
15242		stat.dtst_specdrops_unavail = state->dts_speculations_unavail;
15243		stat.dtst_stkstroverflows = state->dts_stkstroverflows;
15244		stat.dtst_dblerrors = state->dts_dblerrors;
15245		stat.dtst_killed =
15246		    (state->dts_activity == DTRACE_ACTIVITY_KILLED);
15247		stat.dtst_errors = nerrs;
15248
15249		mutex_exit(&dtrace_lock);
15250
15251		if (copyout(&stat, (void *)arg, sizeof (stat)) != 0)
15252			return (EFAULT);
15253
15254		return (0);
15255	}
15256
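	/*
	 * DTRACEIOC_FORMAT maps a format index to its format string.  If
	 * the consumer's buffer is too small, the required length is
	 * passed back in dtfd_length so that the consumer can retry with
	 * a larger buffer.
	 */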
15257	case DTRACEIOC_FORMAT: {
15258		dtrace_fmtdesc_t fmt;
15259		char *str;
15260		int len;
15261
15262		if (copyin((void *)arg, &fmt, sizeof (fmt)) != 0)
15263			return (EFAULT);
15264
15265		mutex_enter(&dtrace_lock);
15266
15267		if (fmt.dtfd_format == 0 ||
15268		    fmt.dtfd_format > state->dts_nformats) {
15269			mutex_exit(&dtrace_lock);
15270			return (EINVAL);
15271		}
15272
15273		/*
15274		 * Format strings are allocated contiguously and they are
15275		 * never freed; if a format index is less than the number
15276		 * of formats, we can assert that the format map is non-NULL
15277		 * and that the format for the specified index is non-NULL.
15278		 */
15279		ASSERT(state->dts_formats != NULL);
15280		str = state->dts_formats[fmt.dtfd_format - 1];
15281		ASSERT(str != NULL);
15282
15283		len = strlen(str) + 1;
15284
15285		if (len > fmt.dtfd_length) {
15286			fmt.dtfd_length = len;
15287
15288			if (copyout(&fmt, (void *)arg, sizeof (fmt)) != 0) {
15289				mutex_exit(&dtrace_lock);
15290				return (EFAULT);
15291			}
15292		} else {
15293			if (copyout(str, fmt.dtfd_string, len) != 0) {
15294				mutex_exit(&dtrace_lock);
15295				return (EINVAL);
15296				return (EFAULT);
15297		}
15298
15299		mutex_exit(&dtrace_lock);
15300		return (0);
15301	}
15302
15303	default:
15304		break;
15305	}
15306
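	/*
	 * Unrecognized ioctls are rejected with ENOTTY, per convention.
	 */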
15307	return (ENOTTY);
15308}
15309
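/*
 * dtrace_detach() undoes the work of dtrace_attach():  it refuses to detach
 * while helpers exist or while the DTrace provider itself cannot be
 * unregistered, and otherwise tears down anonymous state, kernel hooks,
 * the probe hashes and the framework's arenas.
 */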
15310/*ARGSUSED*/
15311static int
15312dtrace_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
15313{
15314	dtrace_state_t *state;
15315
15316	switch (cmd) {
15317	case DDI_DETACH:
15318		break;
15319
15320	case DDI_SUSPEND:
15321		return (DDI_SUCCESS);
15322
15323	default:
15324		return (DDI_FAILURE);
15325	}
15326
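	/*
	 * Acquire cpu_lock before the DTrace locks, consistent with the
	 * lock ordering used throughout the framework.
	 */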
15327	mutex_enter(&cpu_lock);
15328	mutex_enter(&dtrace_provider_lock);
15329	mutex_enter(&dtrace_lock);
15330
15331	ASSERT(dtrace_opens == 0);
15332
15333	if (dtrace_helpers > 0) {
15334		mutex_exit(&dtrace_provider_lock);
15335		mutex_exit(&dtrace_lock);
15336		mutex_exit(&cpu_lock);
15337		return (DDI_FAILURE);
15338	}
15339
15340	if (dtrace_unregister((dtrace_provider_id_t)dtrace_provider) != 0) {
15341		mutex_exit(&dtrace_provider_lock);
15342		mutex_exit(&dtrace_lock);
15343		mutex_exit(&cpu_lock);
15344		return (DDI_FAILURE);
15345	}
15346
15347	dtrace_provider = NULL;
15348
15349	if ((state = dtrace_anon_grab()) != NULL) {
15350		/*
15351		 * If there were ECBs on this state, the provider should
15352		 * not have been allowed to detach; assert that there
15353		 * are none.
15354		 */
15355		ASSERT(state->dts_necbs == 0);
15356		dtrace_state_destroy(state);
15357
15358		/*
15359		 * If we're being detached with anonymous state, we need to
15360		 * indicate to the kernel debugger that DTrace is now inactive.
15361		 */
15362		(void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
15363	}
15364
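	/*
	 * Clear the anonymous enabling state and NULL out the hooks that
	 * were installed at attach time.
	 */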
15365	bzero(&dtrace_anon, sizeof (dtrace_anon_t));
15366	unregister_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
15367	dtrace_cpu_init = NULL;
15368	dtrace_helpers_cleanup = NULL;
15369	dtrace_helpers_fork = NULL;
15370	dtrace_cpustart_init = NULL;
15371	dtrace_cpustart_fini = NULL;
15372	dtrace_debugger_init = NULL;
15373	dtrace_debugger_fini = NULL;
15374	dtrace_modload = NULL;
15375	dtrace_modunload = NULL;
15376
15377	mutex_exit(&cpu_lock);
15378
15379	if (dtrace_helptrace_enabled) {
15380		kmem_free(dtrace_helptrace_buffer, dtrace_helptrace_bufsize);
15381		dtrace_helptrace_buffer = NULL;
15382	}
15383
15384	kmem_free(dtrace_probes, dtrace_nprobes * sizeof (dtrace_probe_t *));
15385	dtrace_probes = NULL;
15386	dtrace_nprobes = 0;
15387
15388	dtrace_hash_destroy(dtrace_bymod);
15389	dtrace_hash_destroy(dtrace_byfunc);
15390	dtrace_hash_destroy(dtrace_byname);
15391	dtrace_bymod = NULL;
15392	dtrace_byfunc = NULL;
15393	dtrace_byname = NULL;
15394
15395	kmem_cache_destroy(dtrace_state_cache);
15396	vmem_destroy(dtrace_minor);
15397	vmem_destroy(dtrace_arena);
15398
15399	if (dtrace_toxrange != NULL) {
15400		kmem_free(dtrace_toxrange,
15401		    dtrace_toxranges_max * sizeof (dtrace_toxrange_t));
15402		dtrace_toxrange = NULL;
15403		dtrace_toxranges = 0;
15404		dtrace_toxranges_max = 0;
15405	}
15406
15407	ddi_remove_minor_node(dtrace_devi, NULL);
15408	dtrace_devi = NULL;
15409
15410	ddi_soft_state_fini(&dtrace_softstate);
15411
15412	ASSERT(dtrace_vtime_references == 0);
15413	ASSERT(dtrace_opens == 0);
15414	ASSERT(dtrace_retained == NULL);
15415
15416	mutex_exit(&dtrace_lock);
15417	mutex_exit(&dtrace_provider_lock);
15418
15419	/*
15420	 * We don't destroy the task queue until after we have dropped our
15421	 * locks (taskq_destroy() may block on running tasks).  To prevent
15422	 * attempting to do work after we have effectively detached but before
15423	 * the task queue has been destroyed, all tasks dispatched via the
15424	 * task queue must check that DTrace is still attached before
15425	 * performing any operation.
15426	 */
15427	taskq_destroy(dtrace_taskq);
15428	dtrace_taskq = NULL;
15429
15430	return (DDI_SUCCESS);
15431}
15432
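/*
 * dtrace_info() is the getinfo(9E) entry point:  it maps a dev_t back to
 * our single dev_info and reports instance 0.
 */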
15433/*ARGSUSED*/
15434static int
15435dtrace_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
15436{
15437	int error;
15438
15439	switch (infocmd) {
15440	case DDI_INFO_DEVT2DEVINFO:
15441		*result = (void *)dtrace_devi;
15442		error = DDI_SUCCESS;
15443		break;
15444	case DDI_INFO_DEVT2INSTANCE:
15445		*result = (void *)0;
15446		error = DDI_SUCCESS;
15447		break;
15448	default:
15449		error = DDI_FAILURE;
15450	}
15451	return (error);
15452}
15453
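/*
 * The cb_ops and dev_ops tables below wire the driver's entry points into
 * the DDI.  Only open, close and ioctl are meaningful for the dtrace
 * pseudo-device; the remaining character device entry points are stubbed
 * out with nulldev/nodev.
 */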
15454static struct cb_ops dtrace_cb_ops = {
15455	dtrace_open,		/* open */
15456	dtrace_close,		/* close */
15457	nulldev,		/* strategy */
15458	nulldev,		/* print */
15459	nodev,			/* dump */
15460	nodev,			/* read */
15461	nodev,			/* write */
15462	dtrace_ioctl,		/* ioctl */
15463	nodev,			/* devmap */
15464	nodev,			/* mmap */
15465	nodev,			/* segmap */
15466	nochpoll,		/* poll */
15467	ddi_prop_op,		/* cb_prop_op */
15468	0,			/* streamtab  */
15469	D_NEW | D_MP		/* Driver compatibility flag */
15470};
15471
15472static struct dev_ops dtrace_ops = {
15473	DEVO_REV,		/* devo_rev */
15474	0,			/* refcnt */
15475	dtrace_info,		/* get_dev_info */
15476	nulldev,		/* identify */
15477	nulldev,		/* probe */
15478	dtrace_attach,		/* attach */
15479	dtrace_detach,		/* detach */
15480	nodev,			/* reset */
15481	&dtrace_cb_ops,		/* driver operations */
15482	NULL,			/* bus operations */
15483	nodev,			/* dev power */
15484	ddi_quiesce_not_needed,		/* quiesce */
15485};
15486
15487static struct modldrv modldrv = {
15488	&mod_driverops,		/* module type (this is a pseudo driver) */
15489	"Dynamic Tracing",	/* name of module */
15490	&dtrace_ops,		/* driver ops */
15491};
15492
15493static struct modlinkage modlinkage = {
15494	MODREV_1,
15495	(void *)&modldrv,
15496	NULL
15497};
15498
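/*
 * _init(), _info() and _fini() are the standard loadable module entry
 * points; the heavy lifting is done in dtrace_attach() and dtrace_detach().
 */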
15499int
15500_init(void)
15501{
15502	return (mod_install(&modlinkage));
15503}
15504
15505int
15506_info(struct modinfo *modinfop)
15507{
15508	return (mod_info(&modlinkage, modinfop));
15509}
15510
15511int
15512_fini(void)
15513{
15514	return (mod_remove(&modlinkage));
15515}
15516