dtrace.c revision 284134
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 *
21 * $FreeBSD: stable/10/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c 284134 2015-06-07 20:10:11Z markj $
22 */
23
24/*
25 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
26 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
27 * Copyright (c) 2012 by Delphix. All rights reserved.
28 */
29
30/*
31 * DTrace - Dynamic Tracing for Solaris
32 *
33 * This is the implementation of the Solaris Dynamic Tracing framework
34 * (DTrace).  The user-visible interface to DTrace is described at length in
35 * the "Solaris Dynamic Tracing Guide".  The interfaces between the libdtrace
36 * library, the in-kernel DTrace framework, and the DTrace providers are
37 * described in the block comments in the <sys/dtrace.h> header file.  The
38 * internal architecture of DTrace is described in the block comments in the
39 * <sys/dtrace_impl.h> header file.  The comments contained within the DTrace
40 * implementation very much assume mastery of all of these sources; if one has
41 * an unanswered question about the implementation, one should consult them
42 * first.
43 *
44 * The functions here are ordered roughly as follows:
45 *
46 *   - Probe context functions
47 *   - Probe hashing functions
48 *   - Non-probe context utility functions
49 *   - Matching functions
50 *   - Provider-to-Framework API functions
51 *   - Probe management functions
52 *   - DIF object functions
53 *   - Format functions
54 *   - Predicate functions
55 *   - ECB functions
56 *   - Buffer functions
57 *   - Enabling functions
58 *   - DOF functions
59 *   - Anonymous enabling functions
60 *   - Consumer state functions
61 *   - Helper functions
62 *   - Hook functions
63 *   - Driver cookbook functions
64 *
65 * Each group of functions begins with a block comment labelled the "DTrace
66 * [Group] Functions", allowing one to find each block by searching forward
67 * on capital-f functions.
68 */
69#include <sys/errno.h>
70#if !defined(sun)
71#include <sys/time.h>
72#endif
73#include <sys/stat.h>
74#include <sys/modctl.h>
75#include <sys/conf.h>
76#include <sys/systm.h>
77#if defined(sun)
78#include <sys/ddi.h>
79#include <sys/sunddi.h>
80#endif
81#include <sys/cpuvar.h>
82#include <sys/kmem.h>
83#if defined(sun)
84#include <sys/strsubr.h>
85#endif
86#include <sys/sysmacros.h>
87#include <sys/dtrace_impl.h>
88#include <sys/atomic.h>
89#include <sys/cmn_err.h>
90#if defined(sun)
91#include <sys/mutex_impl.h>
92#include <sys/rwlock_impl.h>
93#endif
94#include <sys/ctf_api.h>
95#if defined(sun)
96#include <sys/panic.h>
97#include <sys/priv_impl.h>
98#endif
99#include <sys/policy.h>
100#if defined(sun)
101#include <sys/cred_impl.h>
102#include <sys/procfs_isa.h>
103#endif
104#include <sys/taskq.h>
105#if defined(sun)
106#include <sys/mkdev.h>
107#include <sys/kdi.h>
108#endif
109#include <sys/zone.h>
110#include <sys/socket.h>
111#include <netinet/in.h>
112#include "strtolctype.h"
113
114/* FreeBSD includes: */
115#if !defined(sun)
116#include <sys/callout.h>
117#include <sys/ctype.h>
118#include <sys/eventhandler.h>
119#include <sys/limits.h>
120#include <sys/kdb.h>
121#include <sys/kernel.h>
122#include <sys/malloc.h>
123#include <sys/sysctl.h>
124#include <sys/lock.h>
125#include <sys/mutex.h>
126#include <sys/rwlock.h>
127#include <sys/sx.h>
128#include <sys/dtrace_bsd.h>
129#include <netinet/in.h>
130#include "dtrace_cddl.h"
131#include "dtrace_debug.c"
132#endif
133
134/*
135 * DTrace Tunable Variables
136 *
137 * The following variables may be tuned by adding a line to /etc/system that
138 * includes both the name of the DTrace module ("dtrace") and the name of the
139 * variable.  For example:
140 *
141 *   set dtrace:dtrace_destructive_disallow = 1
142 *
143 * In general, the only variables that one should be tuning this way are those
144 * that affect system-wide DTrace behavior, and for which the default behavior
145 * is undesirable.  Most of these variables are tunable on a per-consumer
146 * basis using DTrace options, and need not be tuned on a system-wide basis.
147 * When tuning these variables, avoid pathological values; while some attempt
148 * is made to verify the integrity of these variables, they are not considered
149 * part of the supported interface to DTrace, and they are therefore not
150 * checked comprehensively.  Further, these variables should not be tuned
151 * dynamically via "mdb -kw" or other means; they should only be tuned via
152 * /etc/system.
153 */
int		dtrace_destructive_disallow = 0; /* nonzero: refuse destructive actions */
dtrace_optval_t	dtrace_nonroot_maxsize = (16 * 1024 * 1024); /* buffer cap, non-root */
size_t		dtrace_difo_maxsize = (256 * 1024);	/* max DIF object size */
dtrace_optval_t	dtrace_dof_maxsize = (8 * 1024 * 1024);	/* max DOF size */
size_t		dtrace_global_maxsize = (16 * 1024);	/* max global var size */
size_t		dtrace_actions_max = (16 * 1024);
size_t		dtrace_retain_max = 1024;	/* max retained enablings */
dtrace_optval_t	dtrace_helper_actions_max = 128;
dtrace_optval_t	dtrace_helper_providers_max = 32;
dtrace_optval_t	dtrace_dstate_defsize = (1 * 1024 * 1024); /* dynvar space default */
size_t		dtrace_strsize_default = 256;	/* default string size */
dtrace_optval_t	dtrace_cleanrate_default = 9900990;		/* 101 hz */
dtrace_optval_t	dtrace_cleanrate_min = 200000;			/* 5000 hz */
dtrace_optval_t	dtrace_cleanrate_max = (uint64_t)60 * NANOSEC;	/* 1/minute */
dtrace_optval_t	dtrace_aggrate_default = NANOSEC;		/* 1 hz */
dtrace_optval_t	dtrace_statusrate_default = NANOSEC;		/* 1 hz */
dtrace_optval_t dtrace_statusrate_max = (hrtime_t)10 * NANOSEC;	 /* 6/minute */
dtrace_optval_t	dtrace_switchrate_default = NANOSEC;		/* 1 hz */
dtrace_optval_t	dtrace_nspec_default = 1;	/* speculation buffers */
dtrace_optval_t	dtrace_specsize_default = 32 * 1024;
dtrace_optval_t dtrace_stackframes_default = 20;
dtrace_optval_t dtrace_ustackframes_default = 20;
dtrace_optval_t dtrace_jstackframes_default = 50;
dtrace_optval_t dtrace_jstackstrsize_default = 512;
int		dtrace_msgdsize_max = 128;
hrtime_t	dtrace_chill_max = MSEC2NSEC(500);		/* 500 ms */
hrtime_t	dtrace_chill_interval = NANOSEC;		/* 1000 ms */
int		dtrace_devdepth_max = 32;
int		dtrace_err_verbose;	/* nonzero: verbose error reporting */
hrtime_t	dtrace_deadman_interval = NANOSEC;
hrtime_t	dtrace_deadman_timeout = (hrtime_t)10 * NANOSEC;
hrtime_t	dtrace_deadman_user = (hrtime_t)30 * NANOSEC;
hrtime_t	dtrace_unregister_defunct_reap = (hrtime_t)60 * NANOSEC;
#if !defined(sun)
int		dtrace_memstr_max = 4096;
#endif
190
191/*
192 * DTrace External Variables
193 *
194 * As dtrace(7D) is a kernel module, any DTrace variables are obviously
195 * available to DTrace consumers via the backtick (`) syntax.  One of these,
196 * dtrace_zero, is made deliberately so:  it is provided as a source of
197 * well-known, zero-filled memory.  While this variable is not documented,
198 * it is used by some translators as an implementation detail.
199 */
200const char	dtrace_zero[256] = { 0 };	/* zero-filled memory */
201
202/*
203 * DTrace Internal Variables
204 */
#if defined(sun)
static dev_info_t	*dtrace_devi;		/* device info */
#endif
#if defined(sun)
static vmem_t		*dtrace_arena;		/* probe ID arena */
static vmem_t		*dtrace_minor;		/* minor number arena */
#else
static taskq_t		*dtrace_taskq;		/* task queue */
static struct unrhdr	*dtrace_arena;		/* Probe ID number.     */
#endif
static dtrace_probe_t	**dtrace_probes;	/* array of all probes */
static int		dtrace_nprobes;		/* number of probes */
static dtrace_provider_t *dtrace_provider;	/* provider list */
static dtrace_meta_t	*dtrace_meta_pid;	/* user-land meta provider */
static int		dtrace_opens;		/* number of opens */
static int		dtrace_helpers;		/* number of helpers */
static int		dtrace_getf;		/* number of unpriv getf()s */
#if defined(sun)
static void		*dtrace_softstate;	/* softstate pointer */
#endif
static dtrace_hash_t	*dtrace_bymod;		/* probes hashed by module */
static dtrace_hash_t	*dtrace_byfunc;		/* probes hashed by function */
static dtrace_hash_t	*dtrace_byname;		/* probes hashed by name */
static dtrace_toxrange_t *dtrace_toxrange;	/* toxic range array */
static int		dtrace_toxranges;	/* number of toxic ranges */
static int		dtrace_toxranges_max;	/* size of toxic range array */
static dtrace_anon_t	dtrace_anon;		/* anonymous enabling */
static kmem_cache_t	*dtrace_state_cache;	/* cache for dynamic state */
static uint64_t		dtrace_vtime_references; /* number of vtimestamp refs */
static kthread_t	*dtrace_panicked;	/* panicking thread */
static dtrace_ecb_t	*dtrace_ecb_create_cache; /* cached created ECB */
static dtrace_genid_t	dtrace_probegen;	/* current probe generation */
static dtrace_helpers_t *dtrace_deferred_pid;	/* deferred helper list */
static dtrace_enabling_t *dtrace_retained;	/* list of retained enablings */
static dtrace_genid_t	dtrace_retained_gen;	/* current retained enab gen */
static dtrace_dynvar_t	dtrace_dynhash_sink;	/* end of dynamic hash chains */
static int		dtrace_dynvar_failclean; /* dynvars failed to clean */
#if !defined(sun)
static struct mtx	dtrace_unr_mtx;		/* guards unr ID allocations */
MTX_SYSINIT(dtrace_unr_mtx, &dtrace_unr_mtx, "Unique resource identifier", MTX_DEF);
int		dtrace_in_probe;	/* non-zero if executing a probe */
#if defined(__i386__) || defined(__amd64__) || defined(__mips__) || defined(__powerpc__)
uintptr_t	dtrace_in_probe_addr;	/* Address of invop when already in probe */
#endif
static eventhandler_tag	dtrace_kld_load_tag;	 /* kld load event hook */
static eventhandler_tag	dtrace_kld_unload_try_tag; /* kld unload-try event hook */
#endif
252
253/*
254 * DTrace Locking
255 * DTrace is protected by three (relatively coarse-grained) locks:
256 *
257 * (1) dtrace_lock is required to manipulate essentially any DTrace state,
258 *     including enabling state, probes, ECBs, consumer state, helper state,
259 *     etc.  Importantly, dtrace_lock is _not_ required when in probe context;
260 *     probe context is lock-free -- synchronization is handled via the
261 *     dtrace_sync() cross call mechanism.
262 *
263 * (2) dtrace_provider_lock is required when manipulating provider state, or
264 *     when provider state must be held constant.
265 *
266 * (3) dtrace_meta_lock is required when manipulating meta provider state, or
267 *     when meta provider state must be held constant.
268 *
269 * The lock ordering between these three locks is dtrace_meta_lock before
270 * dtrace_provider_lock before dtrace_lock.  (In particular, there are
271 * several places where dtrace_provider_lock is held by the framework as it
272 * calls into the providers -- which then call back into the framework,
273 * grabbing dtrace_lock.)
274 *
275 * There are two other locks in the mix:  mod_lock and cpu_lock.  With respect
276 * to dtrace_provider_lock and dtrace_lock, cpu_lock continues its historical
277 * role as a coarse-grained lock; it is acquired before both of these locks.
278 * With respect to dtrace_meta_lock, its behavior is stranger:  cpu_lock must
279 * be acquired _between_ dtrace_meta_lock and any other DTrace locks.
280 * mod_lock is similar with respect to dtrace_provider_lock in that it must be
281 * acquired _between_ dtrace_provider_lock and dtrace_lock.
282 */
static kmutex_t		dtrace_lock;		/* probe state lock */
static kmutex_t		dtrace_provider_lock;	/* provider state lock */
static kmutex_t		dtrace_meta_lock;	/* meta-provider state lock */

#if !defined(sun)
/*
 * XXX FreeBSD hacks: map Solaris names used throughout this (shared)
 * source onto their FreeBSD equivalents, or onto harmless stand-in
 * values where FreeBSD has no direct equivalent.
 */
#define cr_suid		cr_svuid
#define cr_sgid		cr_svgid
#define	ipaddr_t	in_addr_t
#define mod_modname	pathname
#define vuprintf	vprintf
#define ttoproc(_a)	((_a)->td_proc)
#define crgetzoneid(_a)	0
#define	NCPU		MAXCPU
#define SNOCD		0
#define CPU_ON_INTR(_a)	0

/* Local stand-ins for the Solaris privilege bits tested in this file. */
#define PRIV_EFFECTIVE		(1 << 0)
#define PRIV_DTRACE_KERNEL	(1 << 1)
#define PRIV_DTRACE_PROC	(1 << 2)
#define PRIV_DTRACE_USER	(1 << 3)
#define PRIV_PROC_OWNER		(1 << 4)
#define PRIV_PROC_ZONE		(1 << 5)
#define PRIV_ALL		~0

/* sysctl trees (debug.dtrace, kern.dtrace) are defined elsewhere. */
SYSCTL_DECL(_debug_dtrace);
SYSCTL_DECL(_kern_dtrace);
#endif

#if defined(sun)
/* Provide the FreeBSD-style curcpu spelling on Solaris. */
#define curcpu	CPU->cpu_id
#endif
315
316
317/*
318 * DTrace Provider Variables
319 *
320 * These are the variables relating to DTrace as a provider (that is, the
321 * provider of the BEGIN, END, and ERROR probes).
322 */
/*
 * Stability attributes for the dtrace provider's probes.  The rows are
 * (name, data, class) stability triples for provider, module, function,
 * name and args respectively -- see dtrace_pattr_t in <sys/dtrace.h>.
 */
static dtrace_pattr_t	dtrace_provider_attr = {
{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
};
330
/*
 * Do-nothing routine, cast to the appropriate function-pointer types to
 * stub out provider operations in dtrace_provider_ops below.
 */
static void
dtrace_nullop(void)
{}
334
/*
 * Provider operations for the dtrace provider itself.  Member order
 * follows dtrace_pops_t in <sys/dtrace.h>; operations this provider has
 * no use for are stubbed with dtrace_nullop or left NULL.
 */
static dtrace_pops_t	dtrace_provider_ops = {
	(void (*)(void *, dtrace_probedesc_t *))dtrace_nullop,	/* dtps_provide */
	(void (*)(void *, modctl_t *))dtrace_nullop,	/* dtps_provide_module */
	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,	/* dtps_enable */
	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,	/* dtps_disable */
	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,	/* dtps_suspend */
	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,	/* dtps_resume */
	NULL,						/* dtps_getargdesc */
	NULL,						/* dtps_getargval */
	NULL,						/* dtps_usermode */
	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop	/* dtps_destroy */
};
347
static dtrace_id_t	dtrace_probeid_begin;	/* special BEGIN probe */
static dtrace_id_t	dtrace_probeid_end;	/* special END probe */
dtrace_id_t		dtrace_probeid_error;	/* special ERROR probe */

/*
 * DTrace Helper Tracing Variables
 */
uint32_t dtrace_helptrace_next = 0;	/* next trace-buffer offset */
uint32_t dtrace_helptrace_nlocals;	/* locals recorded per entry */
char	*dtrace_helptrace_buffer;	/* helper trace buffer */
int	dtrace_helptrace_bufsize = 512 * 1024;	/* buffer size, in bytes */

#ifdef DEBUG
int	dtrace_helptrace_enabled = 1;	/* on by default on DEBUG kernels */
#else
int	dtrace_helptrace_enabled = 0;
#endif
365
366/*
367 * DTrace Error Hashing
368 *
369 * On DEBUG kernels, DTrace will track the errors that has seen in a hash
370 * table.  This is very useful for checking coverage of tests that are
371 * expected to induce DIF or DOF processing errors, and may be useful for
372 * debugging problems in the DIF code generator or in DOF generation .  The
373 * error hash may be examined with the ::dtrace_errhash MDB dcmd.
374 */
#ifdef DEBUG
static dtrace_errhash_t	dtrace_errhash[DTRACE_ERRHASHSZ]; /* seen-error hash */
static const char *dtrace_errlast;	/* most recently seen error */
static kthread_t *dtrace_errthread;	/* thread that saw dtrace_errlast */
static kmutex_t dtrace_errlock;		/* serializes error-hash updates */
#endif
381
382/*
383 * DTrace Macros and Constants
384 *
385 * These are various macros that are useful in various spots in the
386 * implementation, along with a few random constants that have no meaning
387 * outside of the implementation.  There is no real structure to this cpp
388 * mishmash -- but is there ever?
389 */
390#define	DTRACE_HASHSTR(hash, probe)	\
391	dtrace_hash_str(*((char **)((uintptr_t)(probe) + (hash)->dth_stroffs)))
392
393#define	DTRACE_HASHNEXT(hash, probe)	\
394	(dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_nextoffs)
395
396#define	DTRACE_HASHPREV(hash, probe)	\
397	(dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_prevoffs)
398
399#define	DTRACE_HASHEQ(hash, lhs, rhs)	\
400	(strcmp(*((char **)((uintptr_t)(lhs) + (hash)->dth_stroffs)), \
401	    *((char **)((uintptr_t)(rhs) + (hash)->dth_stroffs))) == 0)
402
403#define	DTRACE_AGGHASHSIZE_SLEW		17
404
405#define	DTRACE_V4MAPPED_OFFSET		(sizeof (uint32_t) * 3)
406
407/*
408 * The key for a thread-local variable consists of the lower 61 bits of the
409 * t_did, plus the 3 bits of the highest active interrupt above LOCK_LEVEL.
410 * We add DIF_VARIABLE_MAX to t_did to assure that the thread key is never
411 * equal to a variable identifier.  This is necessary (but not sufficient) to
412 * assure that global associative arrays never collide with thread-local
413 * variables.  To guarantee that they cannot collide, we must also define the
414 * order for keying dynamic variables.  That order is:
415 *
416 *   [ key0 ] ... [ keyn ] [ variable-key ] [ tls-key ]
417 *
418 * Because the variable-key and the tls-key are in orthogonal spaces, there is
419 * no way for a global variable key signature to match a thread-local key
420 * signature.
421 */
#if defined(sun)
/*
 * Solaris: count the active-interrupt bits above LOCK_LEVEL (at most 7,
 * asserted below) and pack them into the top 3 bits of the key, with
 * the biased thread id (t_did + DIF_VARIABLE_MAX) in the low 61 bits.
 */
#define	DTRACE_TLS_THRKEY(where) { \
	uint_t intr = 0; \
	uint_t actv = CPU->cpu_intr_actv >> (LOCK_LEVEL + 1); \
	for (; actv; actv >>= 1) \
		intr++; \
	ASSERT(intr < (1 << 3)); \
	(where) = ((curthread->t_did + DIF_VARIABLE_MAX) & \
	    (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
}
#else
/*
 * FreeBSD: same packing scheme, but the interrupt-active bits come from
 * the emulated solaris_cpu state and the key is based on td_tid.
 */
#define	DTRACE_TLS_THRKEY(where) { \
	solaris_cpu_t *_c = &solaris_cpu[curcpu]; \
	uint_t intr = 0; \
	uint_t actv = _c->cpu_intr_actv; \
	for (; actv; actv >>= 1) \
		intr++; \
	ASSERT(intr < (1 << 3)); \
	(where) = ((curthread->td_tid + DIF_VARIABLE_MAX) & \
	    (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
}
#endif
444
/*
 * Byte-swap helpers, built up from the 8-bit case.  DT_BSWAP_64's 32-bit
 * shifts require the operand to be (at least) 64 bits wide -- narrower
 * arguments would make the shift undefined.
 */
#define	DT_BSWAP_8(x)	((x) & 0xff)
#define	DT_BSWAP_16(x)	((DT_BSWAP_8(x) << 8) | DT_BSWAP_8((x) >> 8))
#define	DT_BSWAP_32(x)	((DT_BSWAP_16(x) << 16) | DT_BSWAP_16((x) >> 16))
#define	DT_BSWAP_64(x)	((DT_BSWAP_32(x) << 32) | DT_BSWAP_32((x) >> 32))

#define	DT_MASK_LO 0x00000000FFFFFFFFULL

/* Store 'what' as 'type' at byte offset 'offset' within buffer 'tomax'. */
#define	DTRACE_STORE(type, tomax, offset, what) \
	*((type *)((uintptr_t)(tomax) + (uintptr_t)offset)) = (type)(what);

#ifndef __x86
/*
 * Flag a misaligned address and return from the enclosing load function;
 * x86 tolerates unaligned accesses, so the check compiles away there.
 */
#define	DTRACE_ALIGNCHECK(addr, size, flags)				\
	if (addr & (size - 1)) {					\
		*flags |= CPU_DTRACE_BADALIGN;				\
		cpu_core[curcpu].cpuc_dtrace_illval = addr;	\
		return (0);						\
	}
#else
#define	DTRACE_ALIGNCHECK(addr, size, flags)
#endif
465
466/*
467 * Test whether a range of memory starting at testaddr of size testsz falls
468 * within the range of memory described by addr, sz.  We take care to avoid
469 * problems with overflow and underflow of the unsigned quantities, and
470 * disallow all negative sizes.  Ranges of size 0 are allowed.
471 */
472#define	DTRACE_INRANGE(testaddr, testsz, baseaddr, basesz) \
473	((testaddr) - (uintptr_t)(baseaddr) < (basesz) && \
474	(testaddr) + (testsz) - (uintptr_t)(baseaddr) <= (basesz) && \
475	(testaddr) + (testsz) >= (testaddr))
476
477/*
478 * Test whether alloc_sz bytes will fit in the scratch region.  We isolate
479 * alloc_sz on the righthand side of the comparison in order to avoid overflow
480 * or underflow in the comparison with it.  This is simpler than the INRANGE
481 * check above, because we know that the dtms_scratch_ptr is valid in the
482 * range.  Allocations of size zero are allowed.
483 */
484#define	DTRACE_INSCRATCH(mstate, alloc_sz) \
485	((mstate)->dtms_scratch_base + (mstate)->dtms_scratch_size - \
486	(mstate)->dtms_scratch_ptr >= (alloc_sz))
487
/*
 * Generate dtrace_load<bits>(), the probe-context-safe load routine for
 * one integer width.  The generated function refuses any address that
 * overlaps a toxic range (flagging CPU_DTRACE_BADADDR and recording the
 * offending address), and performs the actual dereference with
 * CPU_DTRACE_NOFAULT set so a bad access is reported as a fault flag
 * rather than taken; on any fault it returns 0.
 */
#define	DTRACE_LOADFUNC(bits)						\
/*CSTYLED*/								\
uint##bits##_t								\
dtrace_load##bits(uintptr_t addr)					\
{									\
	size_t size = bits / NBBY;					\
	/*CSTYLED*/							\
	uint##bits##_t rval;						\
	int i;								\
	volatile uint16_t *flags = (volatile uint16_t *)		\
	    &cpu_core[curcpu].cpuc_dtrace_flags;			\
									\
	DTRACE_ALIGNCHECK(addr, size, flags);				\
									\
	for (i = 0; i < dtrace_toxranges; i++) {			\
		if (addr >= dtrace_toxrange[i].dtt_limit)		\
			continue;					\
									\
		if (addr + size <= dtrace_toxrange[i].dtt_base)		\
			continue;					\
									\
		/*							\
		 * This address falls within a toxic region; return 0.	\
		 */							\
		*flags |= CPU_DTRACE_BADADDR;				\
		cpu_core[curcpu].cpuc_dtrace_illval = addr;		\
		return (0);						\
	}								\
									\
	*flags |= CPU_DTRACE_NOFAULT;					\
	/*CSTYLED*/							\
	rval = *((volatile uint##bits##_t *)addr);			\
	*flags &= ~CPU_DTRACE_NOFAULT;					\
									\
	return (!(*flags & CPU_DTRACE_FAULT) ? rval : 0);		\
}
524
/* Pointer-width load: 64-bit on _LP64 kernels, 32-bit otherwise. */
#ifdef _LP64
#define	dtrace_loadptr	dtrace_load64
#else
#define	dtrace_loadptr	dtrace_load32
#endif

/* Dynamic-variable hash chain element states. */
#define	DTRACE_DYNHASH_FREE	0
#define	DTRACE_DYNHASH_SINK	1
#define	DTRACE_DYNHASH_VALID	2

/* Return codes for probe-matching callbacks. */
#define	DTRACE_MATCH_NEXT	0
#define	DTRACE_MATCH_DONE	1
/* A probe is "anchored" when its function name is non-empty. */
#define	DTRACE_ANCHORED(probe)	((probe)->dtpr_func[0] != '\0')
#define	DTRACE_STATE_ALIGN	64

/* Map the highest-priority set CPU_DTRACE_* fault flag to a DTRACEFLT_* code. */
#define	DTRACE_FLAGS2FLT(flags)						\
	(((flags) & CPU_DTRACE_BADADDR) ? DTRACEFLT_BADADDR :		\
	((flags) & CPU_DTRACE_ILLOP) ? DTRACEFLT_ILLOP :		\
	((flags) & CPU_DTRACE_DIVZERO) ? DTRACEFLT_DIVZERO :		\
	((flags) & CPU_DTRACE_KPRIV) ? DTRACEFLT_KPRIV :		\
	((flags) & CPU_DTRACE_UPRIV) ? DTRACEFLT_UPRIV :		\
	((flags) & CPU_DTRACE_TUPOFLOW) ?  DTRACEFLT_TUPOFLOW :		\
	((flags) & CPU_DTRACE_BADALIGN) ?  DTRACEFLT_BADALIGN :		\
	((flags) & CPU_DTRACE_NOSCRATCH) ?  DTRACEFLT_NOSCRATCH :	\
	((flags) & CPU_DTRACE_BADSTACK) ?  DTRACEFLT_BADSTACK :		\
	DTRACEFLT_UNKNOWN)

/* True iff the action is a DIF expression whose result type is string. */
#define	DTRACEACT_ISSTRING(act)						\
	((act)->dta_kind == DTRACEACT_DIFEXPR &&			\
	(act)->dta_difo->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING)
555
556/* Function prototype definitions: */
557static size_t dtrace_strlen(const char *, size_t);
558static dtrace_probe_t *dtrace_probe_lookup_id(dtrace_id_t id);
559static void dtrace_enabling_provide(dtrace_provider_t *);
560static int dtrace_enabling_match(dtrace_enabling_t *, int *);
561static void dtrace_enabling_matchall(void);
562static void dtrace_enabling_reap(void);
563static dtrace_state_t *dtrace_anon_grab(void);
564static uint64_t dtrace_helper(int, dtrace_mstate_t *,
565    dtrace_state_t *, uint64_t, uint64_t);
566static dtrace_helpers_t *dtrace_helpers_create(proc_t *);
567static void dtrace_buffer_drop(dtrace_buffer_t *);
568static int dtrace_buffer_consumed(dtrace_buffer_t *, hrtime_t when);
569static intptr_t dtrace_buffer_reserve(dtrace_buffer_t *, size_t, size_t,
570    dtrace_state_t *, dtrace_mstate_t *);
571static int dtrace_state_option(dtrace_state_t *, dtrace_optid_t,
572    dtrace_optval_t);
573static int dtrace_ecb_create_enable(dtrace_probe_t *, void *);
574static void dtrace_helper_provider_destroy(dtrace_helper_provider_t *);
575uint16_t dtrace_load16(uintptr_t);
576uint32_t dtrace_load32(uintptr_t);
577uint64_t dtrace_load64(uintptr_t);
578uint8_t dtrace_load8(uintptr_t);
579void dtrace_dynvar_clean(dtrace_dstate_t *);
580dtrace_dynvar_t *dtrace_dynvar(dtrace_dstate_t *, uint_t, dtrace_key_t *,
581    size_t, dtrace_dynvar_op_t, dtrace_mstate_t *, dtrace_vstate_t *);
582uintptr_t dtrace_dif_varstr(uintptr_t, dtrace_state_t *, dtrace_mstate_t *);
583static int dtrace_priv_proc(dtrace_state_t *);
584static void dtrace_getf_barrier(void);
585
586/*
587 * DTrace Probe Context Functions
588 *
589 * These functions are called from probe context.  Because probe context is
590 * any context in which C may be called, arbitrarily locks may be held,
591 * interrupts may be disabled, we may be in arbitrary dispatched state, etc.
592 * As a result, functions called from probe context may only call other DTrace
593 * support functions -- they may not interact at all with the system at large.
594 * (Note that the ASSERT macro is made probe-context safe by redefining it in
595 * terms of dtrace_assfail(), a probe-context safe function.) If arbitrary
596 * loads are to be performed from probe context, they _must_ be in terms of
597 * the safe dtrace_load*() variants.
598 *
599 * Some functions in this block are not actually called from probe context;
600 * for these functions, there will be a comment above the function reading
601 * "Note:  not called from probe context."
602 */
/*
 * Probe-context-safe panic: forward the format string and its arguments
 * to the platform panic routine.  Does not return.
 */
void
dtrace_panic(const char *format, ...)
{
	va_list ap;

	va_start(ap, format);
#ifdef __FreeBSD__
	vpanic(format, ap);
#else
	dtrace_vpanic(format, ap);
#endif
	va_end(ap);
}
616
int
dtrace_assfail(const char *a, const char *f, int l)
{
	/* Report the failed assertion text, file and line, then panic. */
	dtrace_panic("assertion failed: %s, file: %s, line: %d", a, f, l);

	/*
	 * We just need something here that even the most clever compiler
	 * cannot optimize away.
	 */
	return (a[(uintptr_t)f]);
}
628
629/*
630 * Atomically increment a specified error counter from probe context.
631 */
/*
 * Atomically increment a specified error counter from probe context.
 *
 * Most probe-context counters are per-CPU; the few arcane error
 * conditions that don't merit per-CPU storage share these global
 * counters instead, updated with a compare-and-swap loop.  On wrap the
 * counter is pinned to 1 rather than 0, so a nonzero value always
 * means at least one error has been seen.  (The counter is 32 bits
 * because a 64-bit compare-and-swap isn't guaranteed everywhere.)
 */
static void
dtrace_error(uint32_t *counter)
{
	uint32_t seen, bumped;

	for (;;) {
		seen = *counter;
		bumped = seen + 1;

		if (bumped == 0) {
			/*
			 * Skip zero on wraparound so that the counter can
			 * never read as zero once an error has occurred.
			 */
			bumped = 1;
		}

		if (dtrace_cas32(counter, seen, bumped) == seen)
			break;
	}
}
667
668/*
669 * Use the DTRACE_LOADFUNC macro to define functions for each of loading a
670 * uint8_t, a uint16_t, a uint32_t and a uint64_t.
671 */
DTRACE_LOADFUNC(8)	/* dtrace_load8() */
DTRACE_LOADFUNC(16)	/* dtrace_load16() */
DTRACE_LOADFUNC(32)	/* dtrace_load32() */
DTRACE_LOADFUNC(64)	/* dtrace_load64() */
676
677static int
678dtrace_inscratch(uintptr_t dest, size_t size, dtrace_mstate_t *mstate)
679{
680	if (dest < mstate->dtms_scratch_base)
681		return (0);
682
683	if (dest + size < dest)
684		return (0);
685
686	if (dest + size > mstate->dtms_scratch_ptr)
687		return (0);
688
689	return (1);
690}
691
692static int
693dtrace_canstore_statvar(uint64_t addr, size_t sz,
694    dtrace_statvar_t **svars, int nsvars)
695{
696	int i;
697
698	for (i = 0; i < nsvars; i++) {
699		dtrace_statvar_t *svar = svars[i];
700
701		if (svar == NULL || svar->dtsv_size == 0)
702			continue;
703
704		if (DTRACE_INRANGE(addr, sz, svar->dtsv_data, svar->dtsv_size))
705			return (1);
706	}
707
708	return (0);
709}
710
711/*
712 * Check to see if the address is within a memory region to which a store may
713 * be issued.  This includes the DTrace scratch areas, and any DTrace variable
714 * region.  The caller of dtrace_canstore() is responsible for performing any
715 * alignment checks that are needed before stores are actually executed.
716 */
static int
dtrace_canstore(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
    dtrace_vstate_t *vstate)
{
	/*
	 * First, check to see if the address is in scratch space...
	 * (DTRACE_INRANGE permits empty ranges, so sz == 0 is allowed.)
	 */
	if (DTRACE_INRANGE(addr, sz, mstate->dtms_scratch_base,
	    mstate->dtms_scratch_size))
		return (1);

	/*
	 * Now check to see if it's a dynamic variable.  This check will pick
	 * up both thread-local variables and any global dynamically-allocated
	 * variables.
	 */
	if (DTRACE_INRANGE(addr, sz, vstate->dtvs_dynvars.dtds_base,
	    vstate->dtvs_dynvars.dtds_size)) {
		dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
		/* Chunk data begins just past the hash table at dtds_base. */
		uintptr_t base = (uintptr_t)dstate->dtds_base +
		    (dstate->dtds_hashsize * sizeof (dtrace_dynhash_t));
		uintptr_t chunkoffs;

		/*
		 * Before we assume that we can store here, we need to make
		 * sure that it isn't in our metadata -- storing to our
		 * dynamic variable metadata would corrupt our state.  For
		 * the range to not include any dynamic variable metadata,
		 * it must:
		 *
		 *	(1) Start above the hash table that is at the base of
		 *	the dynamic variable space
		 *
		 *	(2) Have a starting chunk offset that is beyond the
		 *	dtrace_dynvar_t that is at the base of every chunk
		 *
		 *	(3) Not span a chunk boundary
		 *
		 */
		if (addr < base)
			return (0);

		chunkoffs = (addr - base) % dstate->dtds_chunksize;

		/* Condition (2): must not touch the chunk's header. */
		if (chunkoffs < sizeof (dtrace_dynvar_t))
			return (0);

		/* Condition (3): must not cross into the next chunk. */
		if (chunkoffs + sz > dstate->dtds_chunksize)
			return (0);

		return (1);
	}

	/*
	 * Finally, check the static local and global variables.  These checks
	 * take the longest, so we perform them last.
	 */
	if (dtrace_canstore_statvar(addr, sz,
	    vstate->dtvs_locals, vstate->dtvs_nlocals))
		return (1);

	if (dtrace_canstore_statvar(addr, sz,
	    vstate->dtvs_globals, vstate->dtvs_nglobals))
		return (1);

	return (0);
}
784
785
786/*
787 * Convenience routine to check to see if the address is within a memory
788 * region in which a load may be issued given the user's privilege level;
789 * if not, it sets the appropriate error flags and loads 'addr' into the
790 * illegal value slot.
791 *
792 * DTrace subroutines (DIF_SUBR_*) should use this helper to implement
793 * appropriate memory access protection.
794 */
static int
dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
    dtrace_vstate_t *vstate)
{
	/* Per-CPU slot in which the offending address is reported on failure. */
	volatile uintptr_t *illval = &cpu_core[curcpu].cpuc_dtrace_illval;
	file_t *fp;

	/*
	 * If we hold the privilege to read from kernel memory, then
	 * everything is readable.
	 */
	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
		return (1);

	/*
	 * You can obviously read that which you can store.
	 */
	if (dtrace_canstore(addr, sz, mstate, vstate))
		return (1);

	/*
	 * We're allowed to read from our own string table.
	 */
	if (DTRACE_INRANGE(addr, sz, mstate->dtms_difo->dtdo_strtab,
	    mstate->dtms_difo->dtdo_strlen))
		return (1);

	if (vstate->dtvs_state != NULL &&
	    dtrace_priv_proc(vstate->dtvs_state)) {
		proc_t *p;

		/*
		 * When we have privileges to the current process, there are
		 * several context-related kernel structures that are safe to
		 * read, even absent the privilege to read from kernel memory.
		 * These reads are safe because these structures contain only
		 * state that (1) we're permitted to read, (2) is harmless or
		 * (3) contains pointers to additional kernel state that we're
		 * not permitted to read (and as such, do not present an
		 * opportunity for privilege escalation).  Finally (and
		 * critically), because of the nature of their relation with
		 * the current thread context, the memory associated with these
		 * structures cannot change over the duration of probe context,
		 * and it is therefore impossible for this memory to be
		 * deallocated and reallocated as something else while it's
		 * being operated upon.
		 */
		if (DTRACE_INRANGE(addr, sz, curthread, sizeof (kthread_t)))
			return (1);

		if ((p = curthread->t_procp) != NULL && DTRACE_INRANGE(addr,
		    sz, curthread->t_procp, sizeof (proc_t))) {
			return (1);
		}

		if (curthread->t_cred != NULL && DTRACE_INRANGE(addr, sz,
		    curthread->t_cred, sizeof (cred_t))) {
			return (1);
		}

#if defined(sun)
		/*
		 * On illumos/Solaris, the pid structure and the prefix of
		 * the cpu_t up to cpu_pause_thread are likewise stable for
		 * the duration of probe context.
		 */
		if (p != NULL && p->p_pidp != NULL && DTRACE_INRANGE(addr, sz,
		    &(p->p_pidp->pid_id), sizeof (pid_t))) {
			return (1);
		}

		if (curthread->t_cpu != NULL && DTRACE_INRANGE(addr, sz,
		    curthread->t_cpu, offsetof(cpu_t, cpu_pause_thread))) {
			return (1);
		}
#endif
	}

	if ((fp = mstate->dtms_getf) != NULL) {
		uintptr_t psz = sizeof (void *);
		vnode_t *vp;
		vnodeops_t *op;

		/*
		 * When getf() returns a file_t, the enabling is implicitly
		 * granted the (transient) right to read the returned file_t
		 * as well as the v_path and v_op->vnop_name of the underlying
		 * vnode.  These accesses are allowed after a successful
		 * getf() because the members that they refer to cannot change
		 * once set -- and the barrier logic in the kernel's closef()
		 * path assures that the file_t and its referenced vnode_t
		 * cannot themselves be stale (that is, it is impossible for
		 * either dtms_getf itself or its f_vnode member to reference
		 * freed memory).
		 */
		if (DTRACE_INRANGE(addr, sz, fp, sizeof (file_t)))
			return (1);

		if ((vp = fp->f_vnode) != NULL) {
#if defined(sun)
			if (DTRACE_INRANGE(addr, sz, &vp->v_path, psz))
				return (1);
			if (vp->v_path != NULL && DTRACE_INRANGE(addr, sz,
			    vp->v_path, strlen(vp->v_path) + 1)) {
				return (1);
			}
#endif

			if (DTRACE_INRANGE(addr, sz, &vp->v_op, psz))
				return (1);

#if defined(sun)
			if ((op = vp->v_op) != NULL &&
			    DTRACE_INRANGE(addr, sz, &op->vnop_name, psz)) {
				return (1);
			}

			if (op != NULL && op->vnop_name != NULL &&
			    DTRACE_INRANGE(addr, sz, op->vnop_name,
			    strlen(op->vnop_name) + 1)) {
				return (1);
			}
#endif
		}
	}

	/*
	 * The load is not permitted:  flag a kernel-privilege fault and
	 * record the offending address for the consumer.
	 */
	DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV);
	*illval = addr;
	return (0);
}
920
921/*
922 * Convenience routine to check to see if a given string is within a memory
923 * region in which a load may be issued given the user's privilege level;
924 * this exists so that we don't need to issue unnecessary dtrace_strlen()
925 * calls in the event that the user has all privileges.
926 */
927static int
928dtrace_strcanload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
929    dtrace_vstate_t *vstate)
930{
931	size_t strsz;
932
933	/*
934	 * If we hold the privilege to read from kernel memory, then
935	 * everything is readable.
936	 */
937	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
938		return (1);
939
940	strsz = 1 + dtrace_strlen((char *)(uintptr_t)addr, sz);
941	if (dtrace_canload(addr, strsz, mstate, vstate))
942		return (1);
943
944	return (0);
945}
946
947/*
948 * Convenience routine to check to see if a given variable is within a memory
949 * region in which a load may be issued given the user's privilege level.
950 */
951static int
952dtrace_vcanload(void *src, dtrace_diftype_t *type, dtrace_mstate_t *mstate,
953    dtrace_vstate_t *vstate)
954{
955	size_t sz;
956	ASSERT(type->dtdt_flags & DIF_TF_BYREF);
957
958	/*
959	 * If we hold the privilege to read from kernel memory, then
960	 * everything is readable.
961	 */
962	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
963		return (1);
964
965	if (type->dtdt_kind == DIF_TYPE_STRING)
966		sz = dtrace_strlen(src,
967		    vstate->dtvs_state->dts_options[DTRACEOPT_STRSIZE]) + 1;
968	else
969		sz = type->dtdt_size;
970
971	return (dtrace_canload((uintptr_t)src, sz, mstate, vstate));
972}
973
974/*
975 * Convert a string to a signed integer using safe loads.
976 *
977 * NOTE: This function uses various macros from strtolctype.h to manipulate
978 * digit values, etc -- these have all been checked to ensure they make
979 * no additional function calls.
980 */
static int64_t
dtrace_strtoll(char *input, int base, size_t limit)
{
	uintptr_t pos = (uintptr_t)input;
	int64_t val = 0;
	int x;
	boolean_t neg = B_FALSE;
	char c, cc, ccc;
	uintptr_t end = pos + limit;	/* first address past the buffer */

	/*
	 * Consume any whitespace preceding digits.
	 *
	 * NOTE(review): neither this loop nor the sign/prefix handling
	 * below is bounded by 'end'; presumably callers have already
	 * verified that 'limit' bytes at 'input' are loadable -- confirm
	 * before relying on this with other limits.
	 */
	while ((c = dtrace_load8(pos)) == ' ' || c == '\t')
		pos++;

	/*
	 * Handle an explicit sign if one is present.
	 */
	if (c == '-' || c == '+') {
		if (c == '-')
			neg = B_TRUE;
		c = dtrace_load8(++pos);
	}

	/*
	 * Check for an explicit hexadecimal prefix ("0x" or "0X") and skip it
	 * if present.
	 */
	if (base == 16 && c == '0' && ((cc = dtrace_load8(pos + 1)) == 'x' ||
	    cc == 'X') && isxdigit(ccc = dtrace_load8(pos + 2))) {
		pos += 2;
		c = ccc;
	}

	/*
	 * Read in contiguous digits until the first non-digit character.
	 * (DIGIT(c) is only evaluated after lisalnum(c) has accepted c,
	 * so x is never read uninitialized.)
	 */
	for (; pos < end && c != '\0' && lisalnum(c) && (x = DIGIT(c)) < base;
	    c = dtrace_load8(++pos))
		val = val * base + x;

	return (neg ? -val : val);
}
1025
1026/*
1027 * Compare two strings using safe loads.
1028 */
static int
dtrace_strncmp(char *s1, char *s2, size_t limit)
{
	uint8_t c1, c2;
	volatile uint16_t *flags;

	/*
	 * Identical pointers (or a zero limit) trivially compare equal.
	 */
	if (s1 == s2 || limit == 0)
		return (0);

	/* Per-CPU fault flags, set asynchronously if a safe load faults. */
	flags = (volatile uint16_t *)&cpu_core[curcpu].cpuc_dtrace_flags;

	do {
		/*
		 * A NULL string is treated as an endless stream of NULs.
		 */
		if (s1 == NULL) {
			c1 = '\0';
		} else {
			c1 = dtrace_load8((uintptr_t)s1++);
		}

		if (s2 == NULL) {
			c2 = '\0';
		} else {
			c2 = dtrace_load8((uintptr_t)s2++);
		}

		if (c1 != c2)
			return (c1 - c2);

		/*
		 * Stop at the limit, at end-of-string, or as soon as a safe
		 * load has faulted (further loads would be meaningless).
		 */
	} while (--limit && c1 != '\0' && !(*flags & CPU_DTRACE_FAULT));

	return (0);
}
1059
1060/*
1061 * Compute strlen(s) for a string using safe memory accesses.  The additional
1062 * len parameter is used to specify a maximum length to ensure completion.
1063 */
1064static size_t
1065dtrace_strlen(const char *s, size_t lim)
1066{
1067	uint_t len;
1068
1069	for (len = 0; len != lim; len++) {
1070		if (dtrace_load8((uintptr_t)s++) == '\0')
1071			break;
1072	}
1073
1074	return (len);
1075}
1076
1077/*
1078 * Check if an address falls within a toxic region.
1079 */
1080static int
1081dtrace_istoxic(uintptr_t kaddr, size_t size)
1082{
1083	uintptr_t taddr, tsize;
1084	int i;
1085
1086	for (i = 0; i < dtrace_toxranges; i++) {
1087		taddr = dtrace_toxrange[i].dtt_base;
1088		tsize = dtrace_toxrange[i].dtt_limit - taddr;
1089
1090		if (kaddr - taddr < tsize) {
1091			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
1092			cpu_core[curcpu].cpuc_dtrace_illval = kaddr;
1093			return (1);
1094		}
1095
1096		if (taddr - kaddr < size) {
1097			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
1098			cpu_core[curcpu].cpuc_dtrace_illval = taddr;
1099			return (1);
1100		}
1101	}
1102
1103	return (0);
1104}
1105
1106/*
1107 * Copy src to dst using safe memory accesses.  The src is assumed to be unsafe
1108 * memory specified by the DIF program.  The dst is assumed to be safe memory
1109 * that we can store to directly because it is managed by DTrace.  As with
1110 * standard bcopy, overlapping copies are handled properly.
1111 */
static void
dtrace_bcopy(const void *src, void *dst, size_t len)
{
	const uint8_t *from = src;
	uint8_t *to = dst;

	if (len == 0)
		return;

	if (to <= from) {
		/*
		 * Copy forward:  the destination never overruns unread
		 * source bytes in this direction.
		 */
		while (len-- != 0)
			*to++ = dtrace_load8((uintptr_t)from++);
	} else {
		/*
		 * Copy backward from the tails so that a destination that
		 * overlaps the end of the source is handled correctly.
		 */
		from += len;
		to += len;

		while (len-- != 0)
			*--to = dtrace_load8((uintptr_t)--from);
	}
}
1133
1134/*
1135 * Copy src to dst using safe memory accesses, up to either the specified
1136 * length, or the point that a nul byte is encountered.  The src is assumed to
1137 * be unsafe memory specified by the DIF program.  The dst is assumed to be
1138 * safe memory that we can store to directly because it is managed by DTrace.
1139 * Unlike dtrace_bcopy(), overlapping regions are not handled.
1140 */
static void
dtrace_strcpy(const void *src, void *dst, size_t len)
{
	const uint8_t *from = src;
	uint8_t *to = dst;
	uint8_t c;

	if (len == 0)
		return;

	/*
	 * Copy bytes using safe loads until len bytes have been copied or
	 * a NUL has been copied (the NUL itself is stored), whichever
	 * comes first.
	 */
	do {
		c = dtrace_load8((uintptr_t)from++);
		*to++ = c;
	} while (--len != 0 && c != '\0');
}
1153
1154/*
1155 * Copy src to dst, deriving the size and type from the specified (BYREF)
1156 * variable type.  The src is assumed to be unsafe memory specified by the DIF
1157 * program.  The dst is assumed to be DTrace variable memory that is of the
1158 * specified type; we assume that we can store to directly.
1159 */
1160static void
1161dtrace_vcopy(void *src, void *dst, dtrace_diftype_t *type)
1162{
1163	ASSERT(type->dtdt_flags & DIF_TF_BYREF);
1164
1165	if (type->dtdt_kind == DIF_TYPE_STRING) {
1166		dtrace_strcpy(src, dst, type->dtdt_size);
1167	} else {
1168		dtrace_bcopy(src, dst, type->dtdt_size);
1169	}
1170}
1171
1172/*
1173 * Compare s1 to s2 using safe memory accesses.  The s1 data is assumed to be
1174 * unsafe memory specified by the DIF program.  The s2 data is assumed to be
1175 * safe memory that we can access directly because it is managed by DTrace.
1176 */
1177static int
1178dtrace_bcmp(const void *s1, const void *s2, size_t len)
1179{
1180	volatile uint16_t *flags;
1181
1182	flags = (volatile uint16_t *)&cpu_core[curcpu].cpuc_dtrace_flags;
1183
1184	if (s1 == s2)
1185		return (0);
1186
1187	if (s1 == NULL || s2 == NULL)
1188		return (1);
1189
1190	if (s1 != s2 && len != 0) {
1191		const uint8_t *ps1 = s1;
1192		const uint8_t *ps2 = s2;
1193
1194		do {
1195			if (dtrace_load8((uintptr_t)ps1++) != *ps2++)
1196				return (1);
1197		} while (--len != 0 && !(*flags & CPU_DTRACE_FAULT));
1198	}
1199	return (0);
1200}
1201
1202/*
1203 * Zero the specified region using a simple byte-by-byte loop.  Note that this
1204 * is for safe DTrace-managed memory only.
1205 */
static void
dtrace_bzero(void *dst, size_t len)
{
	unsigned char *p = dst;

	/*
	 * Zero the region one byte at a time.  dst is DTrace-managed
	 * (safe) memory, so plain stores are permissible here.
	 */
	while (len-- != 0)
		*p++ = 0;
}
1214
/*
 * 128-bit addition:  add the low words, then add the high words plus the
 * carry out of the low-word addition.  Computing into locals first keeps
 * the result correct when sum aliases either addend.
 */
static void
dtrace_add_128(uint64_t *addend1, uint64_t *addend2, uint64_t *sum)
{
	uint64_t lo, hi;

	lo = addend1[0] + addend2[0];
	hi = addend1[1] + addend2[1];

	/* Unsigned wrap in the low word signals a carry into the high word. */
	if (lo < addend1[0] || lo < addend2[0])
		hi++;

	sum[0] = lo;
	sum[1] = hi;
}
1227
1228/*
1229 * Shift the 128-bit value in a by b. If b is positive, shift left.
1230 * If b is negative, shift right.
1231 */
/*
 * Shift the 128-bit value in a by b bits:  left for positive b, right for
 * negative b.  Shift magnitudes of 64 or more move bits entirely between
 * the two words; smaller shifts carry the spilled bits across the word
 * boundary.  (As with the types involved, |b| must be less than 128.)
 */
static void
dtrace_shift_128(uint64_t *a, int b)
{
	int n;

	if (b == 0)
		return;

	if (b < 0) {
		/*
		 * Right shift by n = -b bits.
		 */
		n = -b;
		if (n >= 64) {
			a[0] = a[1] >> (n - 64);
			a[1] = 0;
		} else {
			a[0] = (a[0] >> n) | (a[1] << (64 - n));
			a[1] >>= n;
		}
	} else {
		/*
		 * Left shift by b bits.
		 */
		if (b >= 64) {
			a[1] = a[0] << (b - 64);
			a[0] = 0;
		} else {
			a[1] = (a[1] << b) | (a[0] >> (64 - b));
			a[0] <<= b;
		}
	}
}
1264
1265/*
1266 * The basic idea is to break the 2 64-bit values into 4 32-bit values,
1267 * use native multiplication on those, and then re-combine into the
1268 * resulting 128-bit value.
1269 *
1270 * (hi1 << 32 + lo1) * (hi2 << 32 + lo2) =
1271 *     hi1 * hi2 << 64 +
1272 *     hi1 * lo2 << 32 +
1273 *     hi2 * lo1 << 32 +
1274 *     lo1 * lo2
1275 */
/*
 * Compute the full 128-bit product of two 64-bit factors by grade-school
 * multiplication on 32-bit digits:
 *
 * (hi1 << 32 + lo1) * (hi2 << 32 + lo2) =
 *     hi1 * hi2 << 64 +
 *     hi1 * lo2 << 32 +
 *     hi2 * lo1 << 32 +
 *     lo1 * lo2
 *
 * Each partial product fits in 64 bits; the two cross terms are folded
 * into the running 128-bit result with explicit carry propagation.
 */
static void
dtrace_multiply_128(uint64_t factor1, uint64_t factor2, uint64_t *product)
{
	uint64_t hi1 = factor1 >> 32;
	uint64_t lo1 = factor1 & 0x00000000ffffffffULL;
	uint64_t hi2 = factor2 >> 32;
	uint64_t lo2 = factor2 & 0x00000000ffffffffULL;
	uint64_t cross, lo;

	product[0] = lo1 * lo2;
	product[1] = hi1 * hi2;

	/*
	 * Fold in hi1 * lo2 << 32:  the low half shifts into the low word
	 * (carrying on unsigned wrap), the high half lands in the high word.
	 */
	cross = hi1 * lo2;
	lo = product[0] + (cross << 32);
	product[1] += (cross >> 32) + (lo < product[0] ? 1 : 0);
	product[0] = lo;

	/*
	 * Fold in hi2 * lo1 << 32 the same way.
	 */
	cross = hi2 * lo1;
	lo = product[0] + (cross << 32);
	product[1] += (cross >> 32) + (lo < product[0] ? 1 : 0);
	product[0] = lo;
}
1301
1302/*
1303 * This privilege check should be used by actions and subroutines to
1304 * verify that the user credentials of the process that enabled the
1305 * invoking ECB match the target credentials
1306 */
1307static int
1308dtrace_priv_proc_common_user(dtrace_state_t *state)
1309{
1310	cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
1311
1312	/*
1313	 * We should always have a non-NULL state cred here, since if cred
1314	 * is null (anonymous tracing), we fast-path bypass this routine.
1315	 */
1316	ASSERT(s_cr != NULL);
1317
1318	if ((cr = CRED()) != NULL &&
1319	    s_cr->cr_uid == cr->cr_uid &&
1320	    s_cr->cr_uid == cr->cr_ruid &&
1321	    s_cr->cr_uid == cr->cr_suid &&
1322	    s_cr->cr_gid == cr->cr_gid &&
1323	    s_cr->cr_gid == cr->cr_rgid &&
1324	    s_cr->cr_gid == cr->cr_sgid)
1325		return (1);
1326
1327	return (0);
1328}
1329
1330/*
1331 * This privilege check should be used by actions and subroutines to
1332 * verify that the zone of the process that enabled the invoking ECB
1333 * matches the target credentials
1334 */
1335static int
1336dtrace_priv_proc_common_zone(dtrace_state_t *state)
1337{
1338#if defined(sun)
1339	cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
1340
1341	/*
1342	 * We should always have a non-NULL state cred here, since if cred
1343	 * is null (anonymous tracing), we fast-path bypass this routine.
1344	 */
1345	ASSERT(s_cr != NULL);
1346
1347	if ((cr = CRED()) != NULL && s_cr->cr_zone == cr->cr_zone)
1348		return (1);
1349
1350	return (0);
1351#else
1352	return (1);
1353#endif
1354}
1355
1356/*
1357 * This privilege check should be used by actions and subroutines to
1358 * verify that the process has not setuid or changed credentials.
1359 */
1360static int
1361dtrace_priv_proc_common_nocd(void)
1362{
1363	proc_t *proc;
1364
1365	if ((proc = ttoproc(curthread)) != NULL &&
1366	    !(proc->p_flag & SNOCD))
1367		return (1);
1368
1369	return (0);
1370}
1371
1372static int
1373dtrace_priv_proc_destructive(dtrace_state_t *state)
1374{
1375	int action = state->dts_cred.dcr_action;
1376
1377	if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE) == 0) &&
1378	    dtrace_priv_proc_common_zone(state) == 0)
1379		goto bad;
1380
1381	if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER) == 0) &&
1382	    dtrace_priv_proc_common_user(state) == 0)
1383		goto bad;
1384
1385	if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG) == 0) &&
1386	    dtrace_priv_proc_common_nocd() == 0)
1387		goto bad;
1388
1389	return (1);
1390
1391bad:
1392	cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1393
1394	return (0);
1395}
1396
1397static int
1398dtrace_priv_proc_control(dtrace_state_t *state)
1399{
1400	if (state->dts_cred.dcr_action & DTRACE_CRA_PROC_CONTROL)
1401		return (1);
1402
1403	if (dtrace_priv_proc_common_zone(state) &&
1404	    dtrace_priv_proc_common_user(state) &&
1405	    dtrace_priv_proc_common_nocd())
1406		return (1);
1407
1408	cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1409
1410	return (0);
1411}
1412
1413static int
1414dtrace_priv_proc(dtrace_state_t *state)
1415{
1416	if (state->dts_cred.dcr_action & DTRACE_CRA_PROC)
1417		return (1);
1418
1419	cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1420
1421	return (0);
1422}
1423
1424static int
1425dtrace_priv_kernel(dtrace_state_t *state)
1426{
1427	if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL)
1428		return (1);
1429
1430	cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
1431
1432	return (0);
1433}
1434
1435static int
1436dtrace_priv_kernel_destructive(dtrace_state_t *state)
1437{
1438	if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL_DESTRUCTIVE)
1439		return (1);
1440
1441	cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
1442
1443	return (0);
1444}
1445
1446/*
1447 * Determine if the dte_cond of the specified ECB allows for processing of
1448 * the current probe to continue.  Note that this routine may allow continued
1449 * processing, but with access(es) stripped from the mstate's dtms_access
1450 * field.
1451 */
static int
dtrace_priv_probe(dtrace_state_t *state, dtrace_mstate_t *mstate,
    dtrace_ecb_t *ecb)
{
	dtrace_probe_t *probe = ecb->dte_probe;
	dtrace_provider_t *prov = probe->dtpr_provider;
	dtrace_pops_t *pops = &prov->dtpv_pops;
	/* Default if the provider supplies no dtps_mode() entry point. */
	int mode = DTRACE_MODE_NOPRIV_DROP;

	ASSERT(ecb->dte_cond);

#if defined(sun)
	if (pops->dtps_mode != NULL) {
		mode = pops->dtps_mode(prov->dtpv_arg,
		    probe->dtpr_id, probe->dtpr_arg);

		ASSERT((mode & DTRACE_MODE_USER) ||
		    (mode & DTRACE_MODE_KERNEL));
		ASSERT((mode & DTRACE_MODE_NOPRIV_RESTRICT) ||
		    (mode & DTRACE_MODE_NOPRIV_DROP));
	}

	/*
	 * If the dte_cond bits indicate that this consumer is only allowed to
	 * see user-mode firings of this probe, call the provider's dtps_mode()
	 * entry point to check that the probe was fired while in a user
	 * context.  If that's not the case, use the policy specified by the
	 * provider to determine if we drop the probe or merely restrict
	 * operation.
	 */
	if (ecb->dte_cond & DTRACE_COND_USERMODE) {
		ASSERT(mode != DTRACE_MODE_NOPRIV_DROP);

		if (!(mode & DTRACE_MODE_USER)) {
			if (mode & DTRACE_MODE_NOPRIV_DROP)
				return (0);

			/* Restrict:  strip argument access, keep processing. */
			mstate->dtms_access &= ~DTRACE_ACCESS_ARGS;
		}
	}
#endif

	/*
	 * This is more subtle than it looks. We have to be absolutely certain
	 * that CRED() isn't going to change out from under us so it's only
	 * legit to examine that structure if we're in constrained situations.
	 * Currently, the only time we'll perform this check is if a
	 * non-super-user has enabled the profile or syscall providers --
	 * providers that allow visibility of all processes. For the profile
	 * case, the check above will ensure that we're examining a user
	 * context.
	 */
	if (ecb->dte_cond & DTRACE_COND_OWNER) {
		cred_t *cr;
		cred_t *s_cr = state->dts_cred.dcr_cred;
		proc_t *proc;

		ASSERT(s_cr != NULL);

		/*
		 * All uids and gids of the target must match those of the
		 * enabling, and the target must not have changed creds.
		 */
		if ((cr = CRED()) == NULL ||
		    s_cr->cr_uid != cr->cr_uid ||
		    s_cr->cr_uid != cr->cr_ruid ||
		    s_cr->cr_uid != cr->cr_suid ||
		    s_cr->cr_gid != cr->cr_gid ||
		    s_cr->cr_gid != cr->cr_rgid ||
		    s_cr->cr_gid != cr->cr_sgid ||
		    (proc = ttoproc(curthread)) == NULL ||
		    (proc->p_flag & SNOCD)) {
			if (mode & DTRACE_MODE_NOPRIV_DROP)
				return (0);

#if defined(sun)
			mstate->dtms_access &= ~DTRACE_ACCESS_PROC;
#endif
		}
	}

#if defined(sun)
	/*
	 * If our dte_cond is set to DTRACE_COND_ZONEOWNER and we are not
	 * in our zone, check to see if our mode policy is to restrict rather
	 * than to drop; if to restrict, strip away both DTRACE_ACCESS_PROC
	 * and DTRACE_ACCESS_ARGS
	 */
	if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) {
		cred_t *cr;
		cred_t *s_cr = state->dts_cred.dcr_cred;

		ASSERT(s_cr != NULL);

		if ((cr = CRED()) == NULL ||
		    s_cr->cr_zone->zone_id != cr->cr_zone->zone_id) {
			if (mode & DTRACE_MODE_NOPRIV_DROP)
				return (0);

			mstate->dtms_access &=
			    ~(DTRACE_ACCESS_PROC | DTRACE_ACCESS_ARGS);
		}
	}
#endif

	return (1);
}
1554
1555/*
1556 * Note:  not called from probe context.  This function is called
1557 * asynchronously (and at a regular interval) from outside of probe context to
1558 * clean the dirty dynamic variable lists on all CPUs.  Dynamic variable
1559 * cleaning is explained in detail in <sys/dtrace_impl.h>.
1560 */
void
dtrace_dynvar_clean(dtrace_dstate_t *dstate)
{
	dtrace_dynvar_t *dirty;
	dtrace_dstate_percpu_t *dcpu;
	/* Where this CPU's dirty list will be parked while it rinses. */
	dtrace_dynvar_t **rinsep;
	int i, j, work = 0;

	for (i = 0; i < NCPU; i++) {
		dcpu = &dstate->dtds_percpu[i];
		rinsep = &dcpu->dtdsc_rinsing;

		/*
		 * If the dirty list is NULL, there is no dirty work to do.
		 */
		if (dcpu->dtdsc_dirty == NULL)
			continue;

		if (dcpu->dtdsc_rinsing != NULL) {
			/*
			 * If the rinsing list is non-NULL, then it is because
			 * this CPU was selected to accept another CPU's
			 * dirty list -- and since that time, dirty buffers
			 * have accumulated.  This is a highly unlikely
			 * condition, but we choose to ignore the dirty
			 * buffers -- they'll be picked up in a future cleanse.
			 */
			continue;
		}

		if (dcpu->dtdsc_clean != NULL) {
			/*
			 * If the clean list is non-NULL, then we're in a
			 * situation where a CPU has done deallocations (we
			 * have a non-NULL dirty list) but no allocations (we
			 * also have a non-NULL clean list).  We can't simply
			 * move the dirty list into the clean list on this
			 * CPU, yet we also don't want to allow this condition
			 * to persist, lest a short clean list prevent a
			 * massive dirty list from being cleaned (which in
			 * turn could lead to otherwise avoidable dynamic
			 * drops).  To deal with this, we look for some CPU
			 * with a NULL clean list, NULL dirty list, and NULL
			 * rinsing list -- and then we borrow this CPU to
			 * rinse our dirty list.
			 */
			for (j = 0; j < NCPU; j++) {
				dtrace_dstate_percpu_t *rinser;

				rinser = &dstate->dtds_percpu[j];

				if (rinser->dtdsc_rinsing != NULL)
					continue;

				if (rinser->dtdsc_dirty != NULL)
					continue;

				if (rinser->dtdsc_clean != NULL)
					continue;

				rinsep = &rinser->dtdsc_rinsing;
				break;
			}

			if (j == NCPU) {
				/*
				 * We were unable to find another CPU that
				 * could accept this dirty list -- we are
				 * therefore unable to clean it now.
				 */
				dtrace_dynvar_failclean++;
				continue;
			}
		}

		work = 1;

		/*
		 * Atomically move the dirty list aside.
		 */
		do {
			dirty = dcpu->dtdsc_dirty;

			/*
			 * Before we zap the dirty list, set the rinsing list.
			 * (This allows for a potential assertion in
			 * dtrace_dynvar():  if a free dynamic variable appears
			 * on a hash chain, either the dirty list or the
			 * rinsing list for some CPU must be non-NULL.)
			 */
			*rinsep = dirty;
			dtrace_membar_producer();
		} while (dtrace_casptr(&dcpu->dtdsc_dirty,
		    dirty, NULL) != dirty);
	}

	if (!work) {
		/*
		 * We have no work to do; we can simply return.
		 */
		return;
	}

	/* Wait for all CPUs to leave probe context before reusing buffers. */
	dtrace_sync();

	for (i = 0; i < NCPU; i++) {
		dcpu = &dstate->dtds_percpu[i];

		if (dcpu->dtdsc_rinsing == NULL)
			continue;

		/*
		 * We are now guaranteed that no hash chain contains a pointer
		 * into this dirty list; we can make it clean.
		 */
		ASSERT(dcpu->dtdsc_clean == NULL);
		dcpu->dtdsc_clean = dcpu->dtdsc_rinsing;
		dcpu->dtdsc_rinsing = NULL;
	}

	/*
	 * Before we actually set the state to be DTRACE_DSTATE_CLEAN, make
	 * sure that all CPUs have seen all of the dtdsc_clean pointers.
	 * This prevents a race whereby a CPU incorrectly decides that
	 * the state should be something other than DTRACE_DSTATE_CLEAN
	 * after dtrace_dynvar_clean() has completed.
	 */
	dtrace_sync();

	dstate->dtds_state = DTRACE_DSTATE_CLEAN;
}
1692
1693/*
1694 * Depending on the value of the op parameter, this function looks-up,
1695 * allocates or deallocates an arbitrarily-keyed dynamic variable.  If an
1696 * allocation is requested, this function will return a pointer to a
1697 * dtrace_dynvar_t corresponding to the allocated variable -- or NULL if no
1698 * variable can be allocated.  If NULL is returned, the appropriate counter
1699 * will be incremented.
1700 */
1701dtrace_dynvar_t *
1702dtrace_dynvar(dtrace_dstate_t *dstate, uint_t nkeys,
1703    dtrace_key_t *key, size_t dsize, dtrace_dynvar_op_t op,
1704    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
1705{
1706	uint64_t hashval = DTRACE_DYNHASH_VALID;
1707	dtrace_dynhash_t *hash = dstate->dtds_hash;
1708	dtrace_dynvar_t *free, *new_free, *next, *dvar, *start, *prev = NULL;
1709	processorid_t me = curcpu, cpu = me;
1710	dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[me];
1711	size_t bucket, ksize;
1712	size_t chunksize = dstate->dtds_chunksize;
1713	uintptr_t kdata, lock, nstate;
1714	uint_t i;
1715
1716	ASSERT(nkeys != 0);
1717
1718	/*
1719	 * Hash the key.  As with aggregations, we use Jenkins' "One-at-a-time"
1720	 * algorithm.  For the by-value portions, we perform the algorithm in
1721	 * 16-bit chunks (as opposed to 8-bit chunks).  This speeds things up a
1722	 * bit, and seems to have only a minute effect on distribution.  For
1723	 * the by-reference data, we perform "One-at-a-time" iterating (safely)
1724	 * over each referenced byte.  It's painful to do this, but it's much
1725	 * better than pathological hash distribution.  The efficacy of the
1726	 * hashing algorithm (and a comparison with other algorithms) may be
1727	 * found by running the ::dtrace_dynstat MDB dcmd.
1728	 */
1729	for (i = 0; i < nkeys; i++) {
1730		if (key[i].dttk_size == 0) {
1731			uint64_t val = key[i].dttk_value;
1732
1733			hashval += (val >> 48) & 0xffff;
1734			hashval += (hashval << 10);
1735			hashval ^= (hashval >> 6);
1736
1737			hashval += (val >> 32) & 0xffff;
1738			hashval += (hashval << 10);
1739			hashval ^= (hashval >> 6);
1740
1741			hashval += (val >> 16) & 0xffff;
1742			hashval += (hashval << 10);
1743			hashval ^= (hashval >> 6);
1744
1745			hashval += val & 0xffff;
1746			hashval += (hashval << 10);
1747			hashval ^= (hashval >> 6);
1748		} else {
1749			/*
1750			 * This is incredibly painful, but it beats the hell
1751			 * out of the alternative.
1752			 */
1753			uint64_t j, size = key[i].dttk_size;
1754			uintptr_t base = (uintptr_t)key[i].dttk_value;
1755
1756			if (!dtrace_canload(base, size, mstate, vstate))
1757				break;
1758
1759			for (j = 0; j < size; j++) {
1760				hashval += dtrace_load8(base + j);
1761				hashval += (hashval << 10);
1762				hashval ^= (hashval >> 6);
1763			}
1764		}
1765	}
1766
1767	if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_FAULT))
1768		return (NULL);
1769
1770	hashval += (hashval << 3);
1771	hashval ^= (hashval >> 11);
1772	hashval += (hashval << 15);
1773
1774	/*
1775	 * There is a remote chance (ideally, 1 in 2^31) that our hashval
1776	 * comes out to be one of our two sentinel hash values.  If this
1777	 * actually happens, we set the hashval to be a value known to be a
1778	 * non-sentinel value.
1779	 */
1780	if (hashval == DTRACE_DYNHASH_FREE || hashval == DTRACE_DYNHASH_SINK)
1781		hashval = DTRACE_DYNHASH_VALID;
1782
1783	/*
1784	 * Yes, it's painful to do a divide here.  If the cycle count becomes
1785	 * important here, tricks can be pulled to reduce it.  (However, it's
1786	 * critical that hash collisions be kept to an absolute minimum;
1787	 * they're much more painful than a divide.)  It's better to have a
1788	 * solution that generates few collisions and still keeps things
1789	 * relatively simple.
1790	 */
1791	bucket = hashval % dstate->dtds_hashsize;
1792
1793	if (op == DTRACE_DYNVAR_DEALLOC) {
1794		volatile uintptr_t *lockp = &hash[bucket].dtdh_lock;
1795
1796		for (;;) {
1797			while ((lock = *lockp) & 1)
1798				continue;
1799
1800			if (dtrace_casptr((volatile void *)lockp,
1801			    (volatile void *)lock, (volatile void *)(lock + 1)) == (void *)lock)
1802				break;
1803		}
1804
1805		dtrace_membar_producer();
1806	}
1807
1808top:
1809	prev = NULL;
1810	lock = hash[bucket].dtdh_lock;
1811
1812	dtrace_membar_consumer();
1813
1814	start = hash[bucket].dtdh_chain;
1815	ASSERT(start != NULL && (start->dtdv_hashval == DTRACE_DYNHASH_SINK ||
1816	    start->dtdv_hashval != DTRACE_DYNHASH_FREE ||
1817	    op != DTRACE_DYNVAR_DEALLOC));
1818
1819	for (dvar = start; dvar != NULL; dvar = dvar->dtdv_next) {
1820		dtrace_tuple_t *dtuple = &dvar->dtdv_tuple;
1821		dtrace_key_t *dkey = &dtuple->dtt_key[0];
1822
1823		if (dvar->dtdv_hashval != hashval) {
1824			if (dvar->dtdv_hashval == DTRACE_DYNHASH_SINK) {
1825				/*
1826				 * We've reached the sink, and therefore the
1827				 * end of the hash chain; we can kick out of
1828				 * the loop knowing that we have seen a valid
1829				 * snapshot of state.
1830				 */
1831				ASSERT(dvar->dtdv_next == NULL);
1832				ASSERT(dvar == &dtrace_dynhash_sink);
1833				break;
1834			}
1835
1836			if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE) {
1837				/*
1838				 * We've gone off the rails:  somewhere along
1839				 * the line, one of the members of this hash
1840				 * chain was deleted.  Note that we could also
1841				 * detect this by simply letting this loop run
1842				 * to completion, as we would eventually hit
1843				 * the end of the dirty list.  However, we
1844				 * want to avoid running the length of the
1845				 * dirty list unnecessarily (it might be quite
1846				 * long), so we catch this as early as
1847				 * possible by detecting the hash marker.  In
1848				 * this case, we simply set dvar to NULL and
1849				 * break; the conditional after the loop will
1850				 * send us back to top.
1851				 */
1852				dvar = NULL;
1853				break;
1854			}
1855
1856			goto next;
1857		}
1858
1859		if (dtuple->dtt_nkeys != nkeys)
1860			goto next;
1861
1862		for (i = 0; i < nkeys; i++, dkey++) {
1863			if (dkey->dttk_size != key[i].dttk_size)
1864				goto next; /* size or type mismatch */
1865
1866			if (dkey->dttk_size != 0) {
1867				if (dtrace_bcmp(
1868				    (void *)(uintptr_t)key[i].dttk_value,
1869				    (void *)(uintptr_t)dkey->dttk_value,
1870				    dkey->dttk_size))
1871					goto next;
1872			} else {
1873				if (dkey->dttk_value != key[i].dttk_value)
1874					goto next;
1875			}
1876		}
1877
1878		if (op != DTRACE_DYNVAR_DEALLOC)
1879			return (dvar);
1880
1881		ASSERT(dvar->dtdv_next == NULL ||
1882		    dvar->dtdv_next->dtdv_hashval != DTRACE_DYNHASH_FREE);
1883
1884		if (prev != NULL) {
1885			ASSERT(hash[bucket].dtdh_chain != dvar);
1886			ASSERT(start != dvar);
1887			ASSERT(prev->dtdv_next == dvar);
1888			prev->dtdv_next = dvar->dtdv_next;
1889		} else {
1890			if (dtrace_casptr(&hash[bucket].dtdh_chain,
1891			    start, dvar->dtdv_next) != start) {
1892				/*
1893				 * We have failed to atomically swing the
1894				 * hash table head pointer, presumably because
1895				 * of a conflicting allocation on another CPU.
1896				 * We need to reread the hash chain and try
1897				 * again.
1898				 */
1899				goto top;
1900			}
1901		}
1902
1903		dtrace_membar_producer();
1904
1905		/*
1906		 * Now set the hash value to indicate that it's free.
1907		 */
1908		ASSERT(hash[bucket].dtdh_chain != dvar);
1909		dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
1910
1911		dtrace_membar_producer();
1912
1913		/*
1914		 * Set the next pointer to point at the dirty list, and
1915		 * atomically swing the dirty pointer to the newly freed dvar.
1916		 */
1917		do {
1918			next = dcpu->dtdsc_dirty;
1919			dvar->dtdv_next = next;
1920		} while (dtrace_casptr(&dcpu->dtdsc_dirty, next, dvar) != next);
1921
1922		/*
1923		 * Finally, unlock this hash bucket.
1924		 */
1925		ASSERT(hash[bucket].dtdh_lock == lock);
1926		ASSERT(lock & 1);
1927		hash[bucket].dtdh_lock++;
1928
1929		return (NULL);
1930next:
1931		prev = dvar;
1932		continue;
1933	}
1934
1935	if (dvar == NULL) {
1936		/*
1937		 * If dvar is NULL, it is because we went off the rails:
1938		 * one of the elements that we traversed in the hash chain
1939		 * was deleted while we were traversing it.  In this case,
1940		 * we assert that we aren't doing a dealloc (deallocs lock
1941		 * the hash bucket to prevent themselves from racing with
1942		 * one another), and retry the hash chain traversal.
1943		 */
1944		ASSERT(op != DTRACE_DYNVAR_DEALLOC);
1945		goto top;
1946	}
1947
1948	if (op != DTRACE_DYNVAR_ALLOC) {
1949		/*
1950		 * If we are not to allocate a new variable, we want to
1951		 * return NULL now.  Before we return, check that the value
1952		 * of the lock word hasn't changed.  If it has, we may have
1953		 * seen an inconsistent snapshot.
1954		 */
1955		if (op == DTRACE_DYNVAR_NOALLOC) {
1956			if (hash[bucket].dtdh_lock != lock)
1957				goto top;
1958		} else {
1959			ASSERT(op == DTRACE_DYNVAR_DEALLOC);
1960			ASSERT(hash[bucket].dtdh_lock == lock);
1961			ASSERT(lock & 1);
1962			hash[bucket].dtdh_lock++;
1963		}
1964
1965		return (NULL);
1966	}
1967
1968	/*
1969	 * We need to allocate a new dynamic variable.  The size we need is the
1970	 * size of dtrace_dynvar plus the size of nkeys dtrace_key_t's plus the
1971	 * size of any auxiliary key data (rounded up to 8-byte alignment) plus
1972	 * the size of any referred-to data (dsize).  We then round the final
1973	 * size up to the chunksize for allocation.
1974	 */
1975	for (ksize = 0, i = 0; i < nkeys; i++)
1976		ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
1977
1978	/*
1979	 * This should be pretty much impossible, but could happen if, say,
1980	 * strange DIF specified the tuple.  Ideally, this should be an
1981	 * assertion and not an error condition -- but that requires that the
1982	 * chunksize calculation in dtrace_difo_chunksize() be absolutely
1983	 * bullet-proof.  (That is, it must not be able to be fooled by
1984	 * malicious DIF.)  Given the lack of backwards branches in DIF,
1985	 * solving this would presumably not amount to solving the Halting
1986	 * Problem -- but it still seems awfully hard.
1987	 */
1988	if (sizeof (dtrace_dynvar_t) + sizeof (dtrace_key_t) * (nkeys - 1) +
1989	    ksize + dsize > chunksize) {
1990		dcpu->dtdsc_drops++;
1991		return (NULL);
1992	}
1993
1994	nstate = DTRACE_DSTATE_EMPTY;
1995
1996	do {
1997retry:
1998		free = dcpu->dtdsc_free;
1999
2000		if (free == NULL) {
2001			dtrace_dynvar_t *clean = dcpu->dtdsc_clean;
2002			void *rval;
2003
2004			if (clean == NULL) {
2005				/*
2006				 * We're out of dynamic variable space on
2007				 * this CPU.  Unless we have tried all CPUs,
2008				 * we'll try to allocate from a different
2009				 * CPU.
2010				 */
2011				switch (dstate->dtds_state) {
2012				case DTRACE_DSTATE_CLEAN: {
2013					void *sp = &dstate->dtds_state;
2014
2015					if (++cpu >= NCPU)
2016						cpu = 0;
2017
2018					if (dcpu->dtdsc_dirty != NULL &&
2019					    nstate == DTRACE_DSTATE_EMPTY)
2020						nstate = DTRACE_DSTATE_DIRTY;
2021
2022					if (dcpu->dtdsc_rinsing != NULL)
2023						nstate = DTRACE_DSTATE_RINSING;
2024
2025					dcpu = &dstate->dtds_percpu[cpu];
2026
2027					if (cpu != me)
2028						goto retry;
2029
2030					(void) dtrace_cas32(sp,
2031					    DTRACE_DSTATE_CLEAN, nstate);
2032
2033					/*
2034					 * To increment the correct bean
2035					 * counter, take another lap.
2036					 */
2037					goto retry;
2038				}
2039
2040				case DTRACE_DSTATE_DIRTY:
2041					dcpu->dtdsc_dirty_drops++;
2042					break;
2043
2044				case DTRACE_DSTATE_RINSING:
2045					dcpu->dtdsc_rinsing_drops++;
2046					break;
2047
2048				case DTRACE_DSTATE_EMPTY:
2049					dcpu->dtdsc_drops++;
2050					break;
2051				}
2052
2053				DTRACE_CPUFLAG_SET(CPU_DTRACE_DROP);
2054				return (NULL);
2055			}
2056
2057			/*
2058			 * The clean list appears to be non-empty.  We want to
2059			 * move the clean list to the free list; we start by
2060			 * moving the clean pointer aside.
2061			 */
2062			if (dtrace_casptr(&dcpu->dtdsc_clean,
2063			    clean, NULL) != clean) {
2064				/*
2065				 * We are in one of two situations:
2066				 *
2067				 *  (a)	The clean list was switched to the
2068				 *	free list by another CPU.
2069				 *
2070				 *  (b)	The clean list was added to by the
2071				 *	cleansing cyclic.
2072				 *
2073				 * In either of these situations, we can
2074				 * just reattempt the free list allocation.
2075				 */
2076				goto retry;
2077			}
2078
2079			ASSERT(clean->dtdv_hashval == DTRACE_DYNHASH_FREE);
2080
2081			/*
2082			 * Now we'll move the clean list to our free list.
2083			 * It's impossible for this to fail:  the only way
2084			 * the free list can be updated is through this
2085			 * code path, and only one CPU can own the clean list.
2086			 * Thus, it would only be possible for this to fail if
2087			 * this code were racing with dtrace_dynvar_clean().
2088			 * (That is, if dtrace_dynvar_clean() updated the clean
2089			 * list, and we ended up racing to update the free
2090			 * list.)  This race is prevented by the dtrace_sync()
2091			 * in dtrace_dynvar_clean() -- which flushes the
2092			 * owners of the clean lists out before resetting
2093			 * the clean lists.
2094			 */
2095			dcpu = &dstate->dtds_percpu[me];
2096			rval = dtrace_casptr(&dcpu->dtdsc_free, NULL, clean);
2097			ASSERT(rval == NULL);
2098			goto retry;
2099		}
2100
2101		dvar = free;
2102		new_free = dvar->dtdv_next;
2103	} while (dtrace_casptr(&dcpu->dtdsc_free, free, new_free) != free);
2104
2105	/*
2106	 * We have now allocated a new chunk.  We copy the tuple keys into the
2107	 * tuple array and copy any referenced key data into the data space
2108	 * following the tuple array.  As we do this, we relocate dttk_value
2109	 * in the final tuple to point to the key data address in the chunk.
2110	 */
2111	kdata = (uintptr_t)&dvar->dtdv_tuple.dtt_key[nkeys];
2112	dvar->dtdv_data = (void *)(kdata + ksize);
2113	dvar->dtdv_tuple.dtt_nkeys = nkeys;
2114
2115	for (i = 0; i < nkeys; i++) {
2116		dtrace_key_t *dkey = &dvar->dtdv_tuple.dtt_key[i];
2117		size_t kesize = key[i].dttk_size;
2118
2119		if (kesize != 0) {
2120			dtrace_bcopy(
2121			    (const void *)(uintptr_t)key[i].dttk_value,
2122			    (void *)kdata, kesize);
2123			dkey->dttk_value = kdata;
2124			kdata += P2ROUNDUP(kesize, sizeof (uint64_t));
2125		} else {
2126			dkey->dttk_value = key[i].dttk_value;
2127		}
2128
2129		dkey->dttk_size = kesize;
2130	}
2131
2132	ASSERT(dvar->dtdv_hashval == DTRACE_DYNHASH_FREE);
2133	dvar->dtdv_hashval = hashval;
2134	dvar->dtdv_next = start;
2135
2136	if (dtrace_casptr(&hash[bucket].dtdh_chain, start, dvar) == start)
2137		return (dvar);
2138
2139	/*
2140	 * The cas has failed.  Either another CPU is adding an element to
2141	 * this hash chain, or another CPU is deleting an element from this
2142	 * hash chain.  The simplest way to deal with both of these cases
2143	 * (though not necessarily the most efficient) is to free our
2144	 * allocated block and tail-call ourselves.  Note that the free is
2145	 * to the dirty list and _not_ to the free list.  This is to prevent
2146	 * races with allocators, above.
2147	 */
2148	dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
2149
2150	dtrace_membar_producer();
2151
2152	do {
2153		free = dcpu->dtdsc_dirty;
2154		dvar->dtdv_next = free;
2155	} while (dtrace_casptr(&dcpu->dtdsc_dirty, free, dvar) != free);
2156
2157	return (dtrace_dynvar(dstate, nkeys, key, dsize, op, mstate, vstate));
2158}
2159
2160/*ARGSUSED*/
2161static void
2162dtrace_aggregate_min(uint64_t *oval, uint64_t nval, uint64_t arg)
2163{
2164	if ((int64_t)nval < (int64_t)*oval)
2165		*oval = nval;
2166}
2167
2168/*ARGSUSED*/
2169static void
2170dtrace_aggregate_max(uint64_t *oval, uint64_t nval, uint64_t arg)
2171{
2172	if ((int64_t)nval > (int64_t)*oval)
2173		*oval = nval;
2174}
2175
2176static void
2177dtrace_aggregate_quantize(uint64_t *quanta, uint64_t nval, uint64_t incr)
2178{
2179	int i, zero = DTRACE_QUANTIZE_ZEROBUCKET;
2180	int64_t val = (int64_t)nval;
2181
2182	if (val < 0) {
2183		for (i = 0; i < zero; i++) {
2184			if (val <= DTRACE_QUANTIZE_BUCKETVAL(i)) {
2185				quanta[i] += incr;
2186				return;
2187			}
2188		}
2189	} else {
2190		for (i = zero + 1; i < DTRACE_QUANTIZE_NBUCKETS; i++) {
2191			if (val < DTRACE_QUANTIZE_BUCKETVAL(i)) {
2192				quanta[i - 1] += incr;
2193				return;
2194			}
2195		}
2196
2197		quanta[DTRACE_QUANTIZE_NBUCKETS - 1] += incr;
2198		return;
2199	}
2200
2201	ASSERT(0);
2202}
2203
2204static void
2205dtrace_aggregate_lquantize(uint64_t *lquanta, uint64_t nval, uint64_t incr)
2206{
2207	uint64_t arg = *lquanta++;
2208	int32_t base = DTRACE_LQUANTIZE_BASE(arg);
2209	uint16_t step = DTRACE_LQUANTIZE_STEP(arg);
2210	uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg);
2211	int32_t val = (int32_t)nval, level;
2212
2213	ASSERT(step != 0);
2214	ASSERT(levels != 0);
2215
2216	if (val < base) {
2217		/*
2218		 * This is an underflow.
2219		 */
2220		lquanta[0] += incr;
2221		return;
2222	}
2223
2224	level = (val - base) / step;
2225
2226	if (level < levels) {
2227		lquanta[level + 1] += incr;
2228		return;
2229	}
2230
2231	/*
2232	 * This is an overflow.
2233	 */
2234	lquanta[levels + 1] += incr;
2235}
2236
2237static int
2238dtrace_aggregate_llquantize_bucket(uint16_t factor, uint16_t low,
2239    uint16_t high, uint16_t nsteps, int64_t value)
2240{
2241	int64_t this = 1, last, next;
2242	int base = 1, order;
2243
2244	ASSERT(factor <= nsteps);
2245	ASSERT(nsteps % factor == 0);
2246
2247	for (order = 0; order < low; order++)
2248		this *= factor;
2249
2250	/*
2251	 * If our value is less than our factor taken to the power of the
2252	 * low order of magnitude, it goes into the zeroth bucket.
2253	 */
2254	if (value < (last = this))
2255		return (0);
2256
2257	for (this *= factor; order <= high; order++) {
2258		int nbuckets = this > nsteps ? nsteps : this;
2259
2260		if ((next = this * factor) < this) {
2261			/*
2262			 * We should not generally get log/linear quantizations
2263			 * with a high magnitude that allows 64-bits to
2264			 * overflow, but we nonetheless protect against this
2265			 * by explicitly checking for overflow, and clamping
2266			 * our value accordingly.
2267			 */
2268			value = this - 1;
2269		}
2270
2271		if (value < this) {
2272			/*
2273			 * If our value lies within this order of magnitude,
2274			 * determine its position by taking the offset within
2275			 * the order of magnitude, dividing by the bucket
2276			 * width, and adding to our (accumulated) base.
2277			 */
2278			return (base + (value - last) / (this / nbuckets));
2279		}
2280
2281		base += nbuckets - (nbuckets / factor);
2282		last = this;
2283		this = next;
2284	}
2285
2286	/*
2287	 * Our value is greater than or equal to our factor taken to the
2288	 * power of one plus the high magnitude -- return the top bucket.
2289	 */
2290	return (base);
2291}
2292
2293static void
2294dtrace_aggregate_llquantize(uint64_t *llquanta, uint64_t nval, uint64_t incr)
2295{
2296	uint64_t arg = *llquanta++;
2297	uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(arg);
2298	uint16_t low = DTRACE_LLQUANTIZE_LOW(arg);
2299	uint16_t high = DTRACE_LLQUANTIZE_HIGH(arg);
2300	uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(arg);
2301
2302	llquanta[dtrace_aggregate_llquantize_bucket(factor,
2303	    low, high, nsteps, nval)] += incr;
2304}
2305
2306/*ARGSUSED*/
2307static void
2308dtrace_aggregate_avg(uint64_t *data, uint64_t nval, uint64_t arg)
2309{
2310	data[0]++;
2311	data[1] += nval;
2312}
2313
2314/*ARGSUSED*/
2315static void
2316dtrace_aggregate_stddev(uint64_t *data, uint64_t nval, uint64_t arg)
2317{
2318	int64_t snval = (int64_t)nval;
2319	uint64_t tmp[2];
2320
2321	data[0]++;
2322	data[1] += nval;
2323
2324	/*
2325	 * What we want to say here is:
2326	 *
2327	 * data[2] += nval * nval;
2328	 *
2329	 * But given that nval is 64-bit, we could easily overflow, so
2330	 * we do this as 128-bit arithmetic.
2331	 */
2332	if (snval < 0)
2333		snval = -snval;
2334
2335	dtrace_multiply_128((uint64_t)snval, (uint64_t)snval, tmp);
2336	dtrace_add_128(data + 2, tmp, data + 2);
2337}
2338
2339/*ARGSUSED*/
2340static void
2341dtrace_aggregate_count(uint64_t *oval, uint64_t nval, uint64_t arg)
2342{
2343	*oval = *oval + 1;
2344}
2345
2346/*ARGSUSED*/
2347static void
2348dtrace_aggregate_sum(uint64_t *oval, uint64_t nval, uint64_t arg)
2349{
2350	*oval += nval;
2351}
2352
2353/*
2354 * Aggregate given the tuple in the principal data buffer, and the aggregating
2355 * action denoted by the specified dtrace_aggregation_t.  The aggregation
2356 * buffer is specified as the buf parameter.  This routine does not return
2357 * failure; if there is no space in the aggregation buffer, the data will be
2358 * dropped, and a corresponding counter incremented.
2359 */
2360static void
2361dtrace_aggregate(dtrace_aggregation_t *agg, dtrace_buffer_t *dbuf,
2362    intptr_t offset, dtrace_buffer_t *buf, uint64_t expr, uint64_t arg)
2363{
2364	dtrace_recdesc_t *rec = &agg->dtag_action.dta_rec;
2365	uint32_t i, ndx, size, fsize;
2366	uint32_t align = sizeof (uint64_t) - 1;
2367	dtrace_aggbuffer_t *agb;
2368	dtrace_aggkey_t *key;
2369	uint32_t hashval = 0, limit, isstr;
2370	caddr_t tomax, data, kdata;
2371	dtrace_actkind_t action;
2372	dtrace_action_t *act;
2373	uintptr_t offs;
2374
2375	if (buf == NULL)
2376		return;
2377
2378	if (!agg->dtag_hasarg) {
2379		/*
2380		 * Currently, only quantize() and lquantize() take additional
2381		 * arguments, and they have the same semantics:  an increment
2382		 * value that defaults to 1 when not present.  If additional
2383		 * aggregating actions take arguments, the setting of the
2384		 * default argument value will presumably have to become more
2385		 * sophisticated...
2386		 */
2387		arg = 1;
2388	}
2389
2390	action = agg->dtag_action.dta_kind - DTRACEACT_AGGREGATION;
2391	size = rec->dtrd_offset - agg->dtag_base;
2392	fsize = size + rec->dtrd_size;
2393
2394	ASSERT(dbuf->dtb_tomax != NULL);
2395	data = dbuf->dtb_tomax + offset + agg->dtag_base;
2396
2397	if ((tomax = buf->dtb_tomax) == NULL) {
2398		dtrace_buffer_drop(buf);
2399		return;
2400	}
2401
2402	/*
2403	 * The metastructure is always at the bottom of the buffer.
2404	 */
2405	agb = (dtrace_aggbuffer_t *)(tomax + buf->dtb_size -
2406	    sizeof (dtrace_aggbuffer_t));
2407
2408	if (buf->dtb_offset == 0) {
2409		/*
2410		 * We just kludge up approximately 1/8th of the size to be
2411		 * buckets.  If this guess ends up being routinely
2412		 * off-the-mark, we may need to dynamically readjust this
2413		 * based on past performance.
2414		 */
2415		uintptr_t hashsize = (buf->dtb_size >> 3) / sizeof (uintptr_t);
2416
2417		if ((uintptr_t)agb - hashsize * sizeof (dtrace_aggkey_t *) <
2418		    (uintptr_t)tomax || hashsize == 0) {
2419			/*
2420			 * We've been given a ludicrously small buffer;
2421			 * increment our drop count and leave.
2422			 */
2423			dtrace_buffer_drop(buf);
2424			return;
2425		}
2426
2427		/*
2428		 * And now, a pathetic attempt to try to get a an odd (or
2429		 * perchance, a prime) hash size for better hash distribution.
2430		 */
2431		if (hashsize > (DTRACE_AGGHASHSIZE_SLEW << 3))
2432			hashsize -= DTRACE_AGGHASHSIZE_SLEW;
2433
2434		agb->dtagb_hashsize = hashsize;
2435		agb->dtagb_hash = (dtrace_aggkey_t **)((uintptr_t)agb -
2436		    agb->dtagb_hashsize * sizeof (dtrace_aggkey_t *));
2437		agb->dtagb_free = (uintptr_t)agb->dtagb_hash;
2438
2439		for (i = 0; i < agb->dtagb_hashsize; i++)
2440			agb->dtagb_hash[i] = NULL;
2441	}
2442
2443	ASSERT(agg->dtag_first != NULL);
2444	ASSERT(agg->dtag_first->dta_intuple);
2445
2446	/*
2447	 * Calculate the hash value based on the key.  Note that we _don't_
2448	 * include the aggid in the hashing (but we will store it as part of
2449	 * the key).  The hashing algorithm is Bob Jenkins' "One-at-a-time"
2450	 * algorithm: a simple, quick algorithm that has no known funnels, and
2451	 * gets good distribution in practice.  The efficacy of the hashing
2452	 * algorithm (and a comparison with other algorithms) may be found by
2453	 * running the ::dtrace_aggstat MDB dcmd.
2454	 */
2455	for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2456		i = act->dta_rec.dtrd_offset - agg->dtag_base;
2457		limit = i + act->dta_rec.dtrd_size;
2458		ASSERT(limit <= size);
2459		isstr = DTRACEACT_ISSTRING(act);
2460
2461		for (; i < limit; i++) {
2462			hashval += data[i];
2463			hashval += (hashval << 10);
2464			hashval ^= (hashval >> 6);
2465
2466			if (isstr && data[i] == '\0')
2467				break;
2468		}
2469	}
2470
2471	hashval += (hashval << 3);
2472	hashval ^= (hashval >> 11);
2473	hashval += (hashval << 15);
2474
2475	/*
2476	 * Yes, the divide here is expensive -- but it's generally the least
2477	 * of the performance issues given the amount of data that we iterate
2478	 * over to compute hash values, compare data, etc.
2479	 */
2480	ndx = hashval % agb->dtagb_hashsize;
2481
2482	for (key = agb->dtagb_hash[ndx]; key != NULL; key = key->dtak_next) {
2483		ASSERT((caddr_t)key >= tomax);
2484		ASSERT((caddr_t)key < tomax + buf->dtb_size);
2485
2486		if (hashval != key->dtak_hashval || key->dtak_size != size)
2487			continue;
2488
2489		kdata = key->dtak_data;
2490		ASSERT(kdata >= tomax && kdata < tomax + buf->dtb_size);
2491
2492		for (act = agg->dtag_first; act->dta_intuple;
2493		    act = act->dta_next) {
2494			i = act->dta_rec.dtrd_offset - agg->dtag_base;
2495			limit = i + act->dta_rec.dtrd_size;
2496			ASSERT(limit <= size);
2497			isstr = DTRACEACT_ISSTRING(act);
2498
2499			for (; i < limit; i++) {
2500				if (kdata[i] != data[i])
2501					goto next;
2502
2503				if (isstr && data[i] == '\0')
2504					break;
2505			}
2506		}
2507
2508		if (action != key->dtak_action) {
2509			/*
2510			 * We are aggregating on the same value in the same
2511			 * aggregation with two different aggregating actions.
2512			 * (This should have been picked up in the compiler,
2513			 * so we may be dealing with errant or devious DIF.)
2514			 * This is an error condition; we indicate as much,
2515			 * and return.
2516			 */
2517			DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
2518			return;
2519		}
2520
2521		/*
2522		 * This is a hit:  we need to apply the aggregator to
2523		 * the value at this key.
2524		 */
2525		agg->dtag_aggregate((uint64_t *)(kdata + size), expr, arg);
2526		return;
2527next:
2528		continue;
2529	}
2530
2531	/*
2532	 * We didn't find it.  We need to allocate some zero-filled space,
2533	 * link it into the hash table appropriately, and apply the aggregator
2534	 * to the (zero-filled) value.
2535	 */
2536	offs = buf->dtb_offset;
2537	while (offs & (align - 1))
2538		offs += sizeof (uint32_t);
2539
2540	/*
2541	 * If we don't have enough room to both allocate a new key _and_
2542	 * its associated data, increment the drop count and return.
2543	 */
2544	if ((uintptr_t)tomax + offs + fsize >
2545	    agb->dtagb_free - sizeof (dtrace_aggkey_t)) {
2546		dtrace_buffer_drop(buf);
2547		return;
2548	}
2549
2550	/*CONSTCOND*/
2551	ASSERT(!(sizeof (dtrace_aggkey_t) & (sizeof (uintptr_t) - 1)));
2552	key = (dtrace_aggkey_t *)(agb->dtagb_free - sizeof (dtrace_aggkey_t));
2553	agb->dtagb_free -= sizeof (dtrace_aggkey_t);
2554
2555	key->dtak_data = kdata = tomax + offs;
2556	buf->dtb_offset = offs + fsize;
2557
2558	/*
2559	 * Now copy the data across.
2560	 */
2561	*((dtrace_aggid_t *)kdata) = agg->dtag_id;
2562
2563	for (i = sizeof (dtrace_aggid_t); i < size; i++)
2564		kdata[i] = data[i];
2565
2566	/*
2567	 * Because strings are not zeroed out by default, we need to iterate
2568	 * looking for actions that store strings, and we need to explicitly
2569	 * pad these strings out with zeroes.
2570	 */
2571	for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2572		int nul;
2573
2574		if (!DTRACEACT_ISSTRING(act))
2575			continue;
2576
2577		i = act->dta_rec.dtrd_offset - agg->dtag_base;
2578		limit = i + act->dta_rec.dtrd_size;
2579		ASSERT(limit <= size);
2580
2581		for (nul = 0; i < limit; i++) {
2582			if (nul) {
2583				kdata[i] = '\0';
2584				continue;
2585			}
2586
2587			if (data[i] != '\0')
2588				continue;
2589
2590			nul = 1;
2591		}
2592	}
2593
2594	for (i = size; i < fsize; i++)
2595		kdata[i] = 0;
2596
2597	key->dtak_hashval = hashval;
2598	key->dtak_size = size;
2599	key->dtak_action = action;
2600	key->dtak_next = agb->dtagb_hash[ndx];
2601	agb->dtagb_hash[ndx] = key;
2602
2603	/*
2604	 * Finally, apply the aggregator.
2605	 */
2606	*((uint64_t *)(key->dtak_data + size)) = agg->dtag_initial;
2607	agg->dtag_aggregate((uint64_t *)(key->dtak_data + size), expr, arg);
2608}
2609
2610/*
2611 * Given consumer state, this routine finds a speculation in the INACTIVE
2612 * state and transitions it into the ACTIVE state.  If there is no speculation
2613 * in the INACTIVE state, 0 is returned.  In this case, no error counter is
2614 * incremented -- it is up to the caller to take appropriate action.
2615 */
2616static int
2617dtrace_speculation(dtrace_state_t *state)
2618{
2619	int i = 0;
2620	dtrace_speculation_state_t current;
2621	uint32_t *stat = &state->dts_speculations_unavail, count;
2622
2623	while (i < state->dts_nspeculations) {
2624		dtrace_speculation_t *spec = &state->dts_speculations[i];
2625
2626		current = spec->dtsp_state;
2627
2628		if (current != DTRACESPEC_INACTIVE) {
2629			if (current == DTRACESPEC_COMMITTINGMANY ||
2630			    current == DTRACESPEC_COMMITTING ||
2631			    current == DTRACESPEC_DISCARDING)
2632				stat = &state->dts_speculations_busy;
2633			i++;
2634			continue;
2635		}
2636
2637		if (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2638		    current, DTRACESPEC_ACTIVE) == current)
2639			return (i + 1);
2640	}
2641
2642	/*
2643	 * We couldn't find a speculation.  If we found as much as a single
2644	 * busy speculation buffer, we'll attribute this failure as "busy"
2645	 * instead of "unavail".
2646	 */
2647	do {
2648		count = *stat;
2649	} while (dtrace_cas32(stat, count, count + 1) != count);
2650
2651	return (0);
2652}
2653
2654/*
2655 * This routine commits an active speculation.  If the specified speculation
2656 * is not in a valid state to perform a commit(), this routine will silently do
2657 * nothing.  The state of the specified speculation is transitioned according
2658 * to the state transition diagram outlined in <sys/dtrace_impl.h>
2659 */
2660static void
2661dtrace_speculation_commit(dtrace_state_t *state, processorid_t cpu,
2662    dtrace_specid_t which)
2663{
2664	dtrace_speculation_t *spec;
2665	dtrace_buffer_t *src, *dest;
2666	uintptr_t daddr, saddr, dlimit, slimit;
2667	dtrace_speculation_state_t current, new = 0;
2668	intptr_t offs;
2669	uint64_t timestamp;
2670
2671	if (which == 0)
2672		return;
2673
2674	if (which > state->dts_nspeculations) {
2675		cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2676		return;
2677	}
2678
2679	spec = &state->dts_speculations[which - 1];
2680	src = &spec->dtsp_buffer[cpu];
2681	dest = &state->dts_buffer[cpu];
2682
2683	do {
2684		current = spec->dtsp_state;
2685
2686		if (current == DTRACESPEC_COMMITTINGMANY)
2687			break;
2688
2689		switch (current) {
2690		case DTRACESPEC_INACTIVE:
2691		case DTRACESPEC_DISCARDING:
2692			return;
2693
2694		case DTRACESPEC_COMMITTING:
2695			/*
2696			 * This is only possible if we are (a) commit()'ing
2697			 * without having done a prior speculate() on this CPU
2698			 * and (b) racing with another commit() on a different
2699			 * CPU.  There's nothing to do -- we just assert that
2700			 * our offset is 0.
2701			 */
2702			ASSERT(src->dtb_offset == 0);
2703			return;
2704
2705		case DTRACESPEC_ACTIVE:
2706			new = DTRACESPEC_COMMITTING;
2707			break;
2708
2709		case DTRACESPEC_ACTIVEONE:
2710			/*
2711			 * This speculation is active on one CPU.  If our
2712			 * buffer offset is non-zero, we know that the one CPU
2713			 * must be us.  Otherwise, we are committing on a
2714			 * different CPU from the speculate(), and we must
2715			 * rely on being asynchronously cleaned.
2716			 */
2717			if (src->dtb_offset != 0) {
2718				new = DTRACESPEC_COMMITTING;
2719				break;
2720			}
2721			/*FALLTHROUGH*/
2722
2723		case DTRACESPEC_ACTIVEMANY:
2724			new = DTRACESPEC_COMMITTINGMANY;
2725			break;
2726
2727		default:
2728			ASSERT(0);
2729		}
2730	} while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2731	    current, new) != current);
2732
2733	/*
2734	 * We have set the state to indicate that we are committing this
2735	 * speculation.  Now reserve the necessary space in the destination
2736	 * buffer.
2737	 */
2738	if ((offs = dtrace_buffer_reserve(dest, src->dtb_offset,
2739	    sizeof (uint64_t), state, NULL)) < 0) {
2740		dtrace_buffer_drop(dest);
2741		goto out;
2742	}
2743
2744	/*
2745	 * We have sufficient space to copy the speculative buffer into the
2746	 * primary buffer.  First, modify the speculative buffer, filling
2747	 * in the timestamp of all entries with the current time.  The data
2748	 * must have the commit() time rather than the time it was traced,
2749	 * so that all entries in the primary buffer are in timestamp order.
2750	 */
2751	timestamp = dtrace_gethrtime();
2752	saddr = (uintptr_t)src->dtb_tomax;
2753	slimit = saddr + src->dtb_offset;
2754	while (saddr < slimit) {
2755		size_t size;
2756		dtrace_rechdr_t *dtrh = (dtrace_rechdr_t *)saddr;
2757
2758		if (dtrh->dtrh_epid == DTRACE_EPIDNONE) {
2759			saddr += sizeof (dtrace_epid_t);
2760			continue;
2761		}
2762		ASSERT3U(dtrh->dtrh_epid, <=, state->dts_necbs);
2763		size = state->dts_ecbs[dtrh->dtrh_epid - 1]->dte_size;
2764
2765		ASSERT3U(saddr + size, <=, slimit);
2766		ASSERT3U(size, >=, sizeof (dtrace_rechdr_t));
2767		ASSERT3U(DTRACE_RECORD_LOAD_TIMESTAMP(dtrh), ==, UINT64_MAX);
2768
2769		DTRACE_RECORD_STORE_TIMESTAMP(dtrh, timestamp);
2770
2771		saddr += size;
2772	}
2773
2774	/*
2775	 * Copy the buffer across.  (Note that this is a
2776	 * highly subobtimal bcopy(); in the unlikely event that this becomes
2777	 * a serious performance issue, a high-performance DTrace-specific
2778	 * bcopy() should obviously be invented.)
2779	 */
2780	daddr = (uintptr_t)dest->dtb_tomax + offs;
2781	dlimit = daddr + src->dtb_offset;
2782	saddr = (uintptr_t)src->dtb_tomax;
2783
2784	/*
2785	 * First, the aligned portion.
2786	 */
2787	while (dlimit - daddr >= sizeof (uint64_t)) {
2788		*((uint64_t *)daddr) = *((uint64_t *)saddr);
2789
2790		daddr += sizeof (uint64_t);
2791		saddr += sizeof (uint64_t);
2792	}
2793
2794	/*
2795	 * Now any left-over bit...
2796	 */
2797	while (dlimit - daddr)
2798		*((uint8_t *)daddr++) = *((uint8_t *)saddr++);
2799
2800	/*
2801	 * Finally, commit the reserved space in the destination buffer.
2802	 */
2803	dest->dtb_offset = offs + src->dtb_offset;
2804
2805out:
2806	/*
2807	 * If we're lucky enough to be the only active CPU on this speculation
2808	 * buffer, we can just set the state back to DTRACESPEC_INACTIVE.
2809	 */
2810	if (current == DTRACESPEC_ACTIVE ||
2811	    (current == DTRACESPEC_ACTIVEONE && new == DTRACESPEC_COMMITTING)) {
2812		uint32_t rval = dtrace_cas32((uint32_t *)&spec->dtsp_state,
2813		    DTRACESPEC_COMMITTING, DTRACESPEC_INACTIVE);
2814
2815		ASSERT(rval == DTRACESPEC_COMMITTING);
2816	}
2817
2818	src->dtb_offset = 0;
2819	src->dtb_xamot_drops += src->dtb_drops;
2820	src->dtb_drops = 0;
2821}
2822
2823/*
2824 * This routine discards an active speculation.  If the specified speculation
2825 * is not in a valid state to perform a discard(), this routine will silently
2826 * do nothing.  The state of the specified speculation is transitioned
2827 * according to the state transition diagram outlined in <sys/dtrace_impl.h>
2828 */
2829static void
2830dtrace_speculation_discard(dtrace_state_t *state, processorid_t cpu,
2831    dtrace_specid_t which)
2832{
2833	dtrace_speculation_t *spec;
2834	dtrace_speculation_state_t current, new = 0;
2835	dtrace_buffer_t *buf;
2836
2837	if (which == 0)
2838		return;
2839
2840	if (which > state->dts_nspeculations) {
2841		cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2842		return;
2843	}
2844
2845	spec = &state->dts_speculations[which - 1];
2846	buf = &spec->dtsp_buffer[cpu];
2847
2848	do {
2849		current = spec->dtsp_state;
2850
2851		switch (current) {
2852		case DTRACESPEC_INACTIVE:
2853		case DTRACESPEC_COMMITTINGMANY:
2854		case DTRACESPEC_COMMITTING:
2855		case DTRACESPEC_DISCARDING:
2856			return;
2857
2858		case DTRACESPEC_ACTIVE:
2859		case DTRACESPEC_ACTIVEMANY:
2860			new = DTRACESPEC_DISCARDING;
2861			break;
2862
2863		case DTRACESPEC_ACTIVEONE:
2864			if (buf->dtb_offset != 0) {
2865				new = DTRACESPEC_INACTIVE;
2866			} else {
2867				new = DTRACESPEC_DISCARDING;
2868			}
2869			break;
2870
2871		default:
2872			ASSERT(0);
2873		}
2874	} while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2875	    current, new) != current);
2876
2877	buf->dtb_offset = 0;
2878	buf->dtb_drops = 0;
2879}
2880
2881/*
2882 * Note:  not called from probe context.  This function is called
2883 * asynchronously from cross call context to clean any speculations that are
2884 * in the COMMITTINGMANY or DISCARDING states.  These speculations may not be
2885 * transitioned back to the INACTIVE state until all CPUs have cleaned the
2886 * speculation.
2887 */
2888static void
2889dtrace_speculation_clean_here(dtrace_state_t *state)
2890{
2891	dtrace_icookie_t cookie;
2892	processorid_t cpu = curcpu;
2893	dtrace_buffer_t *dest = &state->dts_buffer[cpu];
2894	dtrace_specid_t i;
2895
2896	cookie = dtrace_interrupt_disable();
2897
2898	if (dest->dtb_tomax == NULL) {
2899		dtrace_interrupt_enable(cookie);
2900		return;
2901	}
2902
2903	for (i = 0; i < state->dts_nspeculations; i++) {
2904		dtrace_speculation_t *spec = &state->dts_speculations[i];
2905		dtrace_buffer_t *src = &spec->dtsp_buffer[cpu];
2906
2907		if (src->dtb_tomax == NULL)
2908			continue;
2909
2910		if (spec->dtsp_state == DTRACESPEC_DISCARDING) {
2911			src->dtb_offset = 0;
2912			continue;
2913		}
2914
2915		if (spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
2916			continue;
2917
2918		if (src->dtb_offset == 0)
2919			continue;
2920
2921		dtrace_speculation_commit(state, cpu, i + 1);
2922	}
2923
2924	dtrace_interrupt_enable(cookie);
2925}
2926
2927/*
2928 * Note:  not called from probe context.  This function is called
2929 * asynchronously (and at a regular interval) to clean any speculations that
2930 * are in the COMMITTINGMANY or DISCARDING states.  If it discovers that there
2931 * is work to be done, it cross calls all CPUs to perform that work;
2932 * COMMITMANY and DISCARDING speculations may not be transitioned back to the
2933 * INACTIVE state until they have been cleaned by all CPUs.
2934 */
2935static void
2936dtrace_speculation_clean(dtrace_state_t *state)
2937{
2938	int work = 0, rv;
2939	dtrace_specid_t i;
2940
2941	for (i = 0; i < state->dts_nspeculations; i++) {
2942		dtrace_speculation_t *spec = &state->dts_speculations[i];
2943
2944		ASSERT(!spec->dtsp_cleaning);
2945
2946		if (spec->dtsp_state != DTRACESPEC_DISCARDING &&
2947		    spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
2948			continue;
2949
2950		work++;
2951		spec->dtsp_cleaning = 1;
2952	}
2953
2954	if (!work)
2955		return;
2956
2957	dtrace_xcall(DTRACE_CPUALL,
2958	    (dtrace_xcall_t)dtrace_speculation_clean_here, state);
2959
2960	/*
2961	 * We now know that all CPUs have committed or discarded their
2962	 * speculation buffers, as appropriate.  We can now set the state
2963	 * to inactive.
2964	 */
2965	for (i = 0; i < state->dts_nspeculations; i++) {
2966		dtrace_speculation_t *spec = &state->dts_speculations[i];
2967		dtrace_speculation_state_t current, new;
2968
2969		if (!spec->dtsp_cleaning)
2970			continue;
2971
2972		current = spec->dtsp_state;
2973		ASSERT(current == DTRACESPEC_DISCARDING ||
2974		    current == DTRACESPEC_COMMITTINGMANY);
2975
2976		new = DTRACESPEC_INACTIVE;
2977
2978		rv = dtrace_cas32((uint32_t *)&spec->dtsp_state, current, new);
2979		ASSERT(rv == current);
2980		spec->dtsp_cleaning = 0;
2981	}
2982}
2983
2984/*
2985 * Called as part of a speculate() to get the speculative buffer associated
2986 * with a given speculation.  Returns NULL if the specified speculation is not
2987 * in an ACTIVE state.  If the speculation is in the ACTIVEONE state -- and
2988 * the active CPU is not the specified CPU -- the speculation will be
2989 * atomically transitioned into the ACTIVEMANY state.
2990 */
static dtrace_buffer_t *
dtrace_speculation_buffer(dtrace_state_t *state, processorid_t cpuid,
    dtrace_specid_t which)
{
	dtrace_speculation_t *spec;
	dtrace_speculation_state_t current, new = 0;
	dtrace_buffer_t *buf;

	/* Speculation IDs are 1-based; 0 is never a valid speculation. */
	if (which == 0)
		return (NULL);

	if (which > state->dts_nspeculations) {
		cpu_core[cpuid].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
		return (NULL);
	}

	spec = &state->dts_speculations[which - 1];
	buf = &spec->dtsp_buffer[cpuid];

	/*
	 * Loop until we either determine that the speculation is unusable
	 * (returning NULL), or we atomically CAS it into the appropriate
	 * ACTIVE state.  The CAS can fail if another CPU changes the state
	 * concurrently, in which case we re-examine the new state.
	 */
	do {
		current = spec->dtsp_state;

		switch (current) {
		case DTRACESPEC_INACTIVE:
		case DTRACESPEC_COMMITTINGMANY:
		case DTRACESPEC_DISCARDING:
			return (NULL);

		case DTRACESPEC_COMMITTING:
			ASSERT(buf->dtb_offset == 0);
			return (NULL);

		case DTRACESPEC_ACTIVEONE:
			/*
			 * This speculation is currently active on one CPU.
			 * Check the offset in the buffer; if it's non-zero,
			 * that CPU must be us (and we leave the state alone).
			 * If it's zero, assume that we're starting on a new
			 * CPU -- and change the state to indicate that the
			 * speculation is active on more than one CPU.
			 */
			if (buf->dtb_offset != 0)
				return (buf);

			new = DTRACESPEC_ACTIVEMANY;
			break;

		case DTRACESPEC_ACTIVEMANY:
			return (buf);

		case DTRACESPEC_ACTIVE:
			new = DTRACESPEC_ACTIVEONE;
			break;

		default:
			ASSERT(0);
		}
	} while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
	    current, new) != current);

	ASSERT(new == DTRACESPEC_ACTIVEONE || new == DTRACESPEC_ACTIVEMANY);
	return (buf);
}
3054
3055/*
3056 * Return a string.  In the event that the user lacks the privilege to access
3057 * arbitrary kernel memory, we copy the string out to scratch memory so that we
3058 * don't fail access checking.
3059 *
3060 * dtrace_dif_variable() uses this routine as a helper for various
3061 * builtin values such as 'execname' and 'probefunc.'
3062 */
3063uintptr_t
3064dtrace_dif_varstr(uintptr_t addr, dtrace_state_t *state,
3065    dtrace_mstate_t *mstate)
3066{
3067	uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3068	uintptr_t ret;
3069	size_t strsz;
3070
3071	/*
3072	 * The easy case: this probe is allowed to read all of memory, so
3073	 * we can just return this as a vanilla pointer.
3074	 */
3075	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
3076		return (addr);
3077
3078	/*
3079	 * This is the tougher case: we copy the string in question from
3080	 * kernel memory into scratch memory and return it that way: this
3081	 * ensures that we won't trip up when access checking tests the
3082	 * BYREF return value.
3083	 */
3084	strsz = dtrace_strlen((char *)addr, size) + 1;
3085
3086	if (mstate->dtms_scratch_ptr + strsz >
3087	    mstate->dtms_scratch_base + mstate->dtms_scratch_size) {
3088		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3089		return (0);
3090	}
3091
3092	dtrace_strcpy((const void *)addr, (void *)mstate->dtms_scratch_ptr,
3093	    strsz);
3094	ret = mstate->dtms_scratch_ptr;
3095	mstate->dtms_scratch_ptr += strsz;
3096	return (ret);
3097}
3098
3099/*
 * Return a string from a memory address which is known to have one or
3101 * more concatenated, individually zero terminated, sub-strings.
3102 * In the event that the user lacks the privilege to access
3103 * arbitrary kernel memory, we copy the string out to scratch memory so that we
3104 * don't fail access checking.
3105 *
3106 * dtrace_dif_variable() uses this routine as a helper for various
3107 * builtin values such as 'execargs'.
3108 */
3109static uintptr_t
3110dtrace_dif_varstrz(uintptr_t addr, size_t strsz, dtrace_state_t *state,
3111    dtrace_mstate_t *mstate)
3112{
3113	char *p;
3114	size_t i;
3115	uintptr_t ret;
3116
3117	if (mstate->dtms_scratch_ptr + strsz >
3118	    mstate->dtms_scratch_base + mstate->dtms_scratch_size) {
3119		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3120		return (0);
3121	}
3122
3123	dtrace_bcopy((const void *)addr, (void *)mstate->dtms_scratch_ptr,
3124	    strsz);
3125
3126	/* Replace sub-string termination characters with a space. */
3127	for (p = (char *) mstate->dtms_scratch_ptr, i = 0; i < strsz - 1;
3128	    p++, i++)
3129		if (*p == '\0')
3130			*p = ' ';
3131
3132	ret = mstate->dtms_scratch_ptr;
3133	mstate->dtms_scratch_ptr += strsz;
3134	return (ret);
3135}
3136
3137/*
3138 * This function implements the DIF emulator's variable lookups.  The emulator
3139 * passes a reserved variable identifier and optional built-in array index.
3140 */
static uint64_t
dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v,
    uint64_t ndx)
{
	/*
	 * If we're accessing one of the uncached arguments, we'll turn this
	 * into a reference in the args array.
	 */
	if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9) {
		ndx = v - DIF_VAR_ARG0;
		v = DIF_VAR_ARGS;
	}

	switch (v) {
	case DIF_VAR_ARGS:
		ASSERT(mstate->dtms_present & DTRACE_MSTATE_ARGS);
		if (ndx >= sizeof (mstate->dtms_arg) /
		    sizeof (mstate->dtms_arg[0])) {
			int aframes = mstate->dtms_probe->dtpr_aframes + 2;
			dtrace_provider_t *pv;
			uint64_t val;

			/*
			 * The argument is beyond the cached set:  ask the
			 * provider for it (if it implements dtps_getargval),
			 * or pull it from the stack via dtrace_getarg().
			 */
			pv = mstate->dtms_probe->dtpr_provider;
			if (pv->dtpv_pops.dtps_getargval != NULL)
				val = pv->dtpv_pops.dtps_getargval(pv->dtpv_arg,
				    mstate->dtms_probe->dtpr_id,
				    mstate->dtms_probe->dtpr_arg, ndx, aframes);
			else
				val = dtrace_getarg(ndx, aframes);

			/*
			 * This is regrettably required to keep the compiler
			 * from tail-optimizing the call to dtrace_getarg().
			 * The condition always evaluates to true, but the
			 * compiler has no way of figuring that out a priori.
			 * (None of this would be necessary if the compiler
			 * could be relied upon to _always_ tail-optimize
			 * the call to dtrace_getarg() -- but it can't.)
			 */
			if (mstate->dtms_probe != NULL)
				return (val);

			ASSERT(0);
		}

		return (mstate->dtms_arg[ndx]);

#if defined(sun)
	case DIF_VAR_UREGS: {
		klwp_t *lwp;

		if (!dtrace_priv_proc(state))
			return (0);

		if ((lwp = curthread->t_lwp) == NULL) {
			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
			cpu_core[curcpu].cpuc_dtrace_illval = NULL;
			return (0);
		}

		return (dtrace_getreg(lwp->lwp_regs, ndx));
		/*
		 * NOTE(review): the following return is unreachable dead
		 * code (it immediately follows an unconditional return) and
		 * is a candidate for removal.
		 */
		return (0);
	}
#else
	case DIF_VAR_UREGS: {
		struct trapframe *tframe;

		if (!dtrace_priv_proc(state))
			return (0);

		if ((tframe = curthread->td_frame) == NULL) {
			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
			cpu_core[curcpu].cpuc_dtrace_illval = 0;
			return (0);
		}

		return (dtrace_getreg(tframe, ndx));
	}
#endif

	case DIF_VAR_CURTHREAD:
		if (!dtrace_priv_proc(state))
			return (0);
		return ((uint64_t)(uintptr_t)curthread);

	case DIF_VAR_TIMESTAMP:
		/* Cached per-probe-firing so repeated reads agree. */
		if (!(mstate->dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
			mstate->dtms_timestamp = dtrace_gethrtime();
			mstate->dtms_present |= DTRACE_MSTATE_TIMESTAMP;
		}
		return (mstate->dtms_timestamp);

	case DIF_VAR_VTIMESTAMP:
		ASSERT(dtrace_vtime_references != 0);
		return (curthread->t_dtrace_vtime);

	case DIF_VAR_WALLTIMESTAMP:
		if (!(mstate->dtms_present & DTRACE_MSTATE_WALLTIMESTAMP)) {
			mstate->dtms_walltimestamp = dtrace_gethrestime();
			mstate->dtms_present |= DTRACE_MSTATE_WALLTIMESTAMP;
		}
		return (mstate->dtms_walltimestamp);

#if defined(sun)
	case DIF_VAR_IPL:
		if (!dtrace_priv_kernel(state))
			return (0);
		if (!(mstate->dtms_present & DTRACE_MSTATE_IPL)) {
			mstate->dtms_ipl = dtrace_getipl();
			mstate->dtms_present |= DTRACE_MSTATE_IPL;
		}
		return (mstate->dtms_ipl);
#endif

	case DIF_VAR_EPID:
		ASSERT(mstate->dtms_present & DTRACE_MSTATE_EPID);
		return (mstate->dtms_epid);

	case DIF_VAR_ID:
		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
		return (mstate->dtms_probe->dtpr_id);

	case DIF_VAR_STACKDEPTH:
		if (!dtrace_priv_kernel(state))
			return (0);
		if (!(mstate->dtms_present & DTRACE_MSTATE_STACKDEPTH)) {
			int aframes = mstate->dtms_probe->dtpr_aframes + 2;

			mstate->dtms_stackdepth = dtrace_getstackdepth(aframes);
			mstate->dtms_present |= DTRACE_MSTATE_STACKDEPTH;
		}
		return (mstate->dtms_stackdepth);

	case DIF_VAR_USTACKDEPTH:
		if (!dtrace_priv_proc(state))
			return (0);
		if (!(mstate->dtms_present & DTRACE_MSTATE_USTACKDEPTH)) {
			/*
			 * See comment in DIF_VAR_PID.
			 */
			if (DTRACE_ANCHORED(mstate->dtms_probe) &&
			    CPU_ON_INTR(CPU)) {
				mstate->dtms_ustackdepth = 0;
			} else {
				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
				mstate->dtms_ustackdepth =
				    dtrace_getustackdepth();
				DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
			}
			mstate->dtms_present |= DTRACE_MSTATE_USTACKDEPTH;
		}
		return (mstate->dtms_ustackdepth);

	case DIF_VAR_CALLER:
		if (!dtrace_priv_kernel(state))
			return (0);
		if (!(mstate->dtms_present & DTRACE_MSTATE_CALLER)) {
			int aframes = mstate->dtms_probe->dtpr_aframes + 2;

			if (!DTRACE_ANCHORED(mstate->dtms_probe)) {
				/*
				 * If this is an unanchored probe, we are
				 * required to go through the slow path:
				 * dtrace_caller() only guarantees correct
				 * results for anchored probes.
				 */
				pc_t caller[2] = {0, 0};

				dtrace_getpcstack(caller, 2, aframes,
				    (uint32_t *)(uintptr_t)mstate->dtms_arg[0]);
				mstate->dtms_caller = caller[1];
			} else if ((mstate->dtms_caller =
			    dtrace_caller(aframes)) == -1) {
				/*
				 * We have failed to do this the quick way;
				 * we must resort to the slower approach of
				 * calling dtrace_getpcstack().
				 */
				pc_t caller = 0;

				dtrace_getpcstack(&caller, 1, aframes, NULL);
				mstate->dtms_caller = caller;
			}

			mstate->dtms_present |= DTRACE_MSTATE_CALLER;
		}
		return (mstate->dtms_caller);

	case DIF_VAR_UCALLER:
		if (!dtrace_priv_proc(state))
			return (0);

		if (!(mstate->dtms_present & DTRACE_MSTATE_UCALLER)) {
			uint64_t ustack[3];

			/*
			 * dtrace_getupcstack() fills in the first uint64_t
			 * with the current PID.  The second uint64_t will
			 * be the program counter at user-level.  The third
			 * uint64_t will contain the caller, which is what
			 * we're after.
			 */
			ustack[2] = 0;
			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
			dtrace_getupcstack(ustack, 3);
			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
			mstate->dtms_ucaller = ustack[2];
			mstate->dtms_present |= DTRACE_MSTATE_UCALLER;
		}

		return (mstate->dtms_ucaller);

	case DIF_VAR_PROBEPROV:
		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
		return (dtrace_dif_varstr(
		    (uintptr_t)mstate->dtms_probe->dtpr_provider->dtpv_name,
		    state, mstate));

	case DIF_VAR_PROBEMOD:
		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
		return (dtrace_dif_varstr(
		    (uintptr_t)mstate->dtms_probe->dtpr_mod,
		    state, mstate));

	case DIF_VAR_PROBEFUNC:
		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
		return (dtrace_dif_varstr(
		    (uintptr_t)mstate->dtms_probe->dtpr_func,
		    state, mstate));

	case DIF_VAR_PROBENAME:
		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
		return (dtrace_dif_varstr(
		    (uintptr_t)mstate->dtms_probe->dtpr_name,
		    state, mstate));

	case DIF_VAR_PID:
		if (!dtrace_priv_proc(state))
			return (0);

#if defined(sun)
		/*
		 * Note that we are assuming that an unanchored probe is
		 * always due to a high-level interrupt.  (And we're assuming
		 * that there is only a single high level interrupt.)
		 */
		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
			return (pid0.pid_id);

		/*
		 * It is always safe to dereference one's own t_procp pointer:
		 * it always points to a valid, allocated proc structure.
		 * Further, it is always safe to dereference the p_pidp member
		 * of one's own proc structure.  (These are truisms because
		 * threads and processes don't clean up their own state --
		 * they leave that task to whomever reaps them.)
		 */
		return ((uint64_t)curthread->t_procp->p_pidp->pid_id);
#else
		return ((uint64_t)curproc->p_pid);
#endif

	case DIF_VAR_PPID:
		if (!dtrace_priv_proc(state))
			return (0);

#if defined(sun)
		/*
		 * See comment in DIF_VAR_PID.
		 */
		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
			return (pid0.pid_id);

		/*
		 * It is always safe to dereference one's own t_procp pointer:
		 * it always points to a valid, allocated proc structure.
		 * (This is true because threads don't clean up their own
		 * state -- they leave that task to whomever reaps them.)
		 */
		return ((uint64_t)curthread->t_procp->p_ppid);
#else
		if (curproc->p_pid == proc0.p_pid)
			return (curproc->p_pid);
		else
			return (curproc->p_pptr->p_pid);
#endif

	case DIF_VAR_TID:
#if defined(sun)
		/*
		 * See comment in DIF_VAR_PID.
		 */
		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
			return (0);
#endif

		return ((uint64_t)curthread->t_tid);

	case DIF_VAR_EXECARGS: {
		struct pargs *p_args = curthread->td_proc->p_args;

		if (p_args == NULL)
			return(0);

		return (dtrace_dif_varstrz(
		    (uintptr_t) p_args->ar_args, p_args->ar_length, state, mstate));
	}

	case DIF_VAR_EXECNAME:
#if defined(sun)
		if (!dtrace_priv_proc(state))
			return (0);

		/*
		 * See comment in DIF_VAR_PID.
		 */
		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
			return ((uint64_t)(uintptr_t)p0.p_user.u_comm);

		/*
		 * It is always safe to dereference one's own t_procp pointer:
		 * it always points to a valid, allocated proc structure.
		 * (This is true because threads don't clean up their own
		 * state -- they leave that task to whomever reaps them.)
		 */
		return (dtrace_dif_varstr(
		    (uintptr_t)curthread->t_procp->p_user.u_comm,
		    state, mstate));
#else
		return (dtrace_dif_varstr(
		    (uintptr_t) curthread->td_proc->p_comm, state, mstate));
#endif

	case DIF_VAR_ZONENAME:
#if defined(sun)
		if (!dtrace_priv_proc(state))
			return (0);

		/*
		 * See comment in DIF_VAR_PID.
		 */
		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
			return ((uint64_t)(uintptr_t)p0.p_zone->zone_name);

		/*
		 * It is always safe to dereference one's own t_procp pointer:
		 * it always points to a valid, allocated proc structure.
		 * (This is true because threads don't clean up their own
		 * state -- they leave that task to whomever reaps them.)
		 */
		return (dtrace_dif_varstr(
		    (uintptr_t)curthread->t_procp->p_zone->zone_name,
		    state, mstate));
#else
		/* FreeBSD has no zones; there is no zonename to return. */
		return (0);
#endif

	case DIF_VAR_UID:
		if (!dtrace_priv_proc(state))
			return (0);

#if defined(sun)
		/*
		 * See comment in DIF_VAR_PID.
		 */
		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
			return ((uint64_t)p0.p_cred->cr_uid);
#endif

		/*
		 * It is always safe to dereference one's own t_procp pointer:
		 * it always points to a valid, allocated proc structure.
		 * (This is true because threads don't clean up their own
		 * state -- they leave that task to whomever reaps them.)
		 *
		 * Additionally, it is safe to dereference one's own process
		 * credential, since this is never NULL after process birth.
		 */
		return ((uint64_t)curthread->t_procp->p_cred->cr_uid);

	case DIF_VAR_GID:
		if (!dtrace_priv_proc(state))
			return (0);

#if defined(sun)
		/*
		 * See comment in DIF_VAR_PID.
		 */
		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
			return ((uint64_t)p0.p_cred->cr_gid);
#endif

		/*
		 * It is always safe to dereference one's own t_procp pointer:
		 * it always points to a valid, allocated proc structure.
		 * (This is true because threads don't clean up their own
		 * state -- they leave that task to whomever reaps them.)
		 *
		 * Additionally, it is safe to dereference one's own process
		 * credential, since this is never NULL after process birth.
		 */
		return ((uint64_t)curthread->t_procp->p_cred->cr_gid);

	case DIF_VAR_ERRNO: {
#if defined(sun)
		klwp_t *lwp;
		if (!dtrace_priv_proc(state))
			return (0);

		/*
		 * See comment in DIF_VAR_PID.
		 */
		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
			return (0);

		/*
		 * It is always safe to dereference one's own t_lwp pointer in
		 * the event that this pointer is non-NULL.  (This is true
		 * because threads and lwps don't clean up their own state --
		 * they leave that task to whomever reaps them.)
		 */
		if ((lwp = curthread->t_lwp) == NULL)
			return (0);

		return ((uint64_t)lwp->lwp_errno);
#else
		return (curthread->td_errno);
#endif
	}
#if !defined(sun)
	case DIF_VAR_CPU: {
		return curcpu;
	}
#endif
	default:
		/* Unknown variable identifier:  flag an illegal operation. */
		DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
		return (0);
	}
}
3580
3581
/*
 * States for the JSON-parsing state machine in dtrace_json(); each state is
 * described in detail in the block comment above that function.  REST is
 * explicitly 1 so that no state compares equal to zero.
 */
typedef enum dtrace_json_state {
	DTRACE_JSON_REST = 1,
	DTRACE_JSON_OBJECT,
	DTRACE_JSON_STRING,
	DTRACE_JSON_STRING_ESCAPE,
	DTRACE_JSON_STRING_ESCAPE_UNICODE,
	DTRACE_JSON_COLON,
	DTRACE_JSON_COMMA,
	DTRACE_JSON_VALUE,
	DTRACE_JSON_IDENTIFIER,
	DTRACE_JSON_NUMBER,
	DTRACE_JSON_NUMBER_FRAC,
	DTRACE_JSON_NUMBER_EXP,
	DTRACE_JSON_COLLECT_OBJECT
} dtrace_json_state_t;
3597
3598/*
3599 * This function possesses just enough knowledge about JSON to extract a single
3600 * value from a JSON string and store it in the scratch buffer.  It is able
3601 * to extract nested object values, and members of arrays by index.
3602 *
3603 * elemlist is a list of JSON keys, stored as packed NUL-terminated strings, to
3604 * be looked up as we descend into the object tree.  e.g.
3605 *
3606 *    foo[0].bar.baz[32] --> "foo" NUL "0" NUL "bar" NUL "baz" NUL "32" NUL
3607 *       with nelems = 5.
3608 *
3609 * The run time of this function must be bounded above by strsize to limit the
3610 * amount of work done in probe context.  As such, it is implemented as a
3611 * simple state machine, reading one character at a time using safe loads
3612 * until we find the requested element, hit a parsing error or run off the
3613 * end of the object or string.
3614 *
3615 * As there is no way for a subroutine to return an error without interrupting
3616 * clause execution, we simply return NULL in the event of a missing key or any
3617 * other error condition.  Each NULL return in this function is commented with
3618 * the error condition it represents -- parsing or otherwise.
3619 *
3620 * The set of states for the state machine closely matches the JSON
3621 * specification (http://json.org/).  Briefly:
3622 *
3623 *   DTRACE_JSON_REST:
3624 *     Skip whitespace until we find either a top-level Object, moving
3625 *     to DTRACE_JSON_OBJECT; or an Array, moving to DTRACE_JSON_VALUE.
3626 *
3627 *   DTRACE_JSON_OBJECT:
3628 *     Locate the next key String in an Object.  Sets a flag to denote
3629 *     the next String as a key string and moves to DTRACE_JSON_STRING.
3630 *
3631 *   DTRACE_JSON_COLON:
3632 *     Skip whitespace until we find the colon that separates key Strings
3633 *     from their values.  Once found, move to DTRACE_JSON_VALUE.
3634 *
3635 *   DTRACE_JSON_VALUE:
3636 *     Detects the type of the next value (String, Number, Identifier, Object
3637 *     or Array) and routes to the states that process that type.  Here we also
3638 *     deal with the element selector list if we are requested to traverse down
3639 *     into the object tree.
3640 *
3641 *   DTRACE_JSON_COMMA:
3642 *     Skip whitespace until we find the comma that separates key-value pairs
3643 *     in Objects (returning to DTRACE_JSON_OBJECT) or values in Arrays
3644 *     (similarly DTRACE_JSON_VALUE).  All following literal value processing
3645 *     states return to this state at the end of their value, unless otherwise
3646 *     noted.
3647 *
3648 *   DTRACE_JSON_NUMBER, DTRACE_JSON_NUMBER_FRAC, DTRACE_JSON_NUMBER_EXP:
3649 *     Processes a Number literal from the JSON, including any exponent
3650 *     component that may be present.  Numbers are returned as strings, which
3651 *     may be passed to strtoll() if an integer is required.
3652 *
3653 *   DTRACE_JSON_IDENTIFIER:
3654 *     Processes a "true", "false" or "null" literal in the JSON.
3655 *
3656 *   DTRACE_JSON_STRING, DTRACE_JSON_STRING_ESCAPE,
3657 *   DTRACE_JSON_STRING_ESCAPE_UNICODE:
3658 *     Processes a String literal from the JSON, whether the String denotes
3659 *     a key, a value or part of a larger Object.  Handles all escape sequences
3660 *     present in the specification, including four-digit unicode characters,
3661 *     but merely includes the escape sequence without converting it to the
3662 *     actual escaped character.  If the String is flagged as a key, we
3663 *     move to DTRACE_JSON_COLON rather than DTRACE_JSON_COMMA.
3664 *
3665 *   DTRACE_JSON_COLLECT_OBJECT:
3666 *     This state collects an entire Object (or Array), correctly handling
3667 *     embedded strings.  If the full element selector list matches this nested
3668 *     object, we return the Object in full as a string.  If not, we use this
3669 *     state to skip to the next value at this level and continue processing.
3670 *
3671 * NOTE: This function uses various macros from strtolctype.h to manipulate
3672 * digit values, etc -- these have all been checked to ensure they make
3673 * no additional function calls.
3674 */
static char *
dtrace_json(uint64_t size, uintptr_t json, char *elemlist, int nelems,
    char *dest)
{
	dtrace_json_state_t state = DTRACE_JSON_REST;
	int64_t array_elem = INT64_MIN;	/* requested array index, if any */
	int64_t array_pos = 0;		/* current array index */
	uint8_t escape_unicount = 0;	/* hex digits seen in \uXXXX */
	boolean_t string_is_key = B_FALSE;
	boolean_t collect_object = B_FALSE;
	boolean_t found_key = B_FALSE;
	boolean_t in_array = B_FALSE;
	uint32_t braces = 0, brackets = 0;
	char *elem = elemlist;		/* current element selector */
	char *dd = dest;		/* write cursor into dest */
	uintptr_t cur;

	/*
	 * One safe load per character; the loop is bounded by size, which
	 * bounds the work done in probe context.
	 */
	for (cur = json; cur < json + size; cur++) {
		char cc = dtrace_load8(cur);
		if (cc == '\0')
			return (NULL);

		switch (state) {
		case DTRACE_JSON_REST:
			if (isspace(cc))
				break;

			if (cc == '{') {
				state = DTRACE_JSON_OBJECT;
				break;
			}

			if (cc == '[') {
				in_array = B_TRUE;
				array_pos = 0;
				array_elem = dtrace_strtoll(elem, 10, size);
				found_key = array_elem == 0 ? B_TRUE : B_FALSE;
				state = DTRACE_JSON_VALUE;
				break;
			}

			/*
			 * ERROR: expected to find a top-level object or array.
			 */
			return (NULL);
		case DTRACE_JSON_OBJECT:
			if (isspace(cc))
				break;

			if (cc == '"') {
				state = DTRACE_JSON_STRING;
				string_is_key = B_TRUE;
				break;
			}

			/*
			 * ERROR: either the object did not start with a key
			 * string, or we've run off the end of the object
			 * without finding the requested key.
			 */
			return (NULL);
		case DTRACE_JSON_STRING:
			if (cc == '\\') {
				*dd++ = '\\';
				state = DTRACE_JSON_STRING_ESCAPE;
				break;
			}

			if (cc == '"') {
				if (collect_object) {
					/*
					 * We don't reset the dest here, as
					 * the string is part of a larger
					 * object being collected.
					 */
					*dd++ = cc;
					collect_object = B_FALSE;
					state = DTRACE_JSON_COLLECT_OBJECT;
					break;
				}
				*dd = '\0';
				dd = dest; /* reset string buffer */
				if (string_is_key) {
					if (dtrace_strncmp(dest, elem,
					    size) == 0)
						found_key = B_TRUE;
				} else if (found_key) {
					if (nelems > 1) {
						/*
						 * We expected an object, not
						 * this string.
						 */
						return (NULL);
					}
					return (dest);
				}
				state = string_is_key ? DTRACE_JSON_COLON :
				    DTRACE_JSON_COMMA;
				string_is_key = B_FALSE;
				break;
			}

			*dd++ = cc;
			break;
		case DTRACE_JSON_STRING_ESCAPE:
			*dd++ = cc;
			if (cc == 'u') {
				escape_unicount = 0;
				state = DTRACE_JSON_STRING_ESCAPE_UNICODE;
			} else {
				state = DTRACE_JSON_STRING;
			}
			break;
		case DTRACE_JSON_STRING_ESCAPE_UNICODE:
			if (!isxdigit(cc)) {
				/*
				 * ERROR: invalid unicode escape, expected
				 * four valid hexadecimal digits.
				 */
				return (NULL);
			}

			*dd++ = cc;
			if (++escape_unicount == 4)
				state = DTRACE_JSON_STRING;
			break;
		case DTRACE_JSON_COLON:
			if (isspace(cc))
				break;

			if (cc == ':') {
				state = DTRACE_JSON_VALUE;
				break;
			}

			/*
			 * ERROR: expected a colon.
			 */
			return (NULL);
		case DTRACE_JSON_COMMA:
			if (isspace(cc))
				break;

			if (cc == ',') {
				if (in_array) {
					state = DTRACE_JSON_VALUE;
					if (++array_pos == array_elem)
						found_key = B_TRUE;
				} else {
					state = DTRACE_JSON_OBJECT;
				}
				break;
			}

			/*
			 * ERROR: either we hit an unexpected character, or
			 * we reached the end of the object or array without
			 * finding the requested key.
			 */
			return (NULL);
		case DTRACE_JSON_IDENTIFIER:
			if (islower(cc)) {
				*dd++ = cc;
				break;
			}

			*dd = '\0';
			dd = dest; /* reset string buffer */

			if (dtrace_strncmp(dest, "true", 5) == 0 ||
			    dtrace_strncmp(dest, "false", 6) == 0 ||
			    dtrace_strncmp(dest, "null", 5) == 0) {
				if (found_key) {
					if (nelems > 1) {
						/*
						 * ERROR: We expected an object,
						 * not this identifier.
						 */
						return (NULL);
					}
					return (dest);
				} else {
					/* Re-examine this character. */
					cur--;
					state = DTRACE_JSON_COMMA;
					break;
				}
			}

			/*
			 * ERROR: we did not recognise the identifier as one
			 * of those in the JSON specification.
			 */
			return (NULL);
		case DTRACE_JSON_NUMBER:
			if (cc == '.') {
				*dd++ = cc;
				state = DTRACE_JSON_NUMBER_FRAC;
				break;
			}

			if (cc == 'x' || cc == 'X') {
				/*
				 * ERROR: specification explicitly excludes
				 * hexadecimal or octal numbers.
				 */
				return (NULL);
			}

			/* FALLTHRU */
		case DTRACE_JSON_NUMBER_FRAC:
			if (cc == 'e' || cc == 'E') {
				*dd++ = cc;
				state = DTRACE_JSON_NUMBER_EXP;
				break;
			}

			if (cc == '+' || cc == '-') {
				/*
				 * ERROR: expect sign as part of exponent only.
				 */
				return (NULL);
			}
			/* FALLTHRU */
		case DTRACE_JSON_NUMBER_EXP:
			if (isdigit(cc) || cc == '+' || cc == '-') {
				*dd++ = cc;
				break;
			}

			*dd = '\0';
			dd = dest; /* reset string buffer */
			if (found_key) {
				if (nelems > 1) {
					/*
					 * ERROR: We expected an object, not
					 * this number.
					 */
					return (NULL);
				}
				return (dest);
			}

			/* Re-examine this character in the COMMA state. */
			cur--;
			state = DTRACE_JSON_COMMA;
			break;
		case DTRACE_JSON_VALUE:
			if (isspace(cc))
				break;

			if (cc == '{' || cc == '[') {
				if (nelems > 1 && found_key) {
					in_array = cc == '[' ? B_TRUE : B_FALSE;
					/*
					 * If our element selector directs us
					 * to descend into this nested object,
					 * then move to the next selector
					 * element in the list and restart the
					 * state machine.
					 */
					while (*elem != '\0')
						elem++;
					elem++; /* skip the inter-element NUL */
					nelems--;
					dd = dest;
					if (in_array) {
						state = DTRACE_JSON_VALUE;
						array_pos = 0;
						array_elem = dtrace_strtoll(
						    elem, 10, size);
						found_key = array_elem == 0 ?
						    B_TRUE : B_FALSE;
					} else {
						found_key = B_FALSE;
						state = DTRACE_JSON_OBJECT;
					}
					break;
				}

				/*
				 * Otherwise, we wish to either skip this
				 * nested object or return it in full.
				 */
				if (cc == '[')
					brackets = 1;
				else
					braces = 1;
				*dd++ = cc;
				state = DTRACE_JSON_COLLECT_OBJECT;
				break;
			}

			if (cc == '"') {
				state = DTRACE_JSON_STRING;
				break;
			}

			if (islower(cc)) {
				/*
				 * Here we deal with true, false and null.
				 */
				*dd++ = cc;
				state = DTRACE_JSON_IDENTIFIER;
				break;
			}

			if (cc == '-' || isdigit(cc)) {
				*dd++ = cc;
				state = DTRACE_JSON_NUMBER;
				break;
			}

			/*
			 * ERROR: unexpected character at start of value.
			 */
			return (NULL);
		case DTRACE_JSON_COLLECT_OBJECT:
			if (cc == '\0')
				/*
				 * ERROR: unexpected end of input.
				 */
				return (NULL);

			*dd++ = cc;
			if (cc == '"') {
				collect_object = B_TRUE;
				state = DTRACE_JSON_STRING;
				break;
			}

			if (cc == ']') {
				if (brackets-- == 0) {
					/*
					 * ERROR: unbalanced brackets.
					 */
					return (NULL);
				}
			} else if (cc == '}') {
				if (braces-- == 0) {
					/*
					 * ERROR: unbalanced braces.
					 */
					return (NULL);
				}
			} else if (cc == '{') {
				braces++;
			} else if (cc == '[') {
				brackets++;
			}

			if (brackets == 0 && braces == 0) {
				if (found_key) {
					*dd = '\0';
					return (dest);
				}
				dd = dest; /* reset string buffer */
				state = DTRACE_JSON_COMMA;
			}
			break;
		}
	}
	/* Ran off the end of the input without satisfying the selector. */
	return (NULL);
}
4037
4038/*
4039 * Emulate the execution of DTrace ID subroutines invoked by the call opcode.
4040 * Notice that we don't bother validating the proper number of arguments or
4041 * their types in the tuple stack.  This isn't needed because all argument
4042 * interpretation is safe because of our load safety -- the worst that can
4043 * happen is that a bogus program can obtain bogus results.
4044 */
4045static void
4046dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs,
4047    dtrace_key_t *tupregs, int nargs,
4048    dtrace_mstate_t *mstate, dtrace_state_t *state)
4049{
4050	volatile uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags;
4051	volatile uintptr_t *illval = &cpu_core[curcpu].cpuc_dtrace_illval;
4052	dtrace_vstate_t *vstate = &state->dts_vstate;
4053
4054#if defined(sun)
4055	union {
4056		mutex_impl_t mi;
4057		uint64_t mx;
4058	} m;
4059
4060	union {
4061		krwlock_t ri;
4062		uintptr_t rw;
4063	} r;
4064#else
4065	struct thread *lowner;
4066	union {
4067		struct lock_object *li;
4068		uintptr_t lx;
4069	} l;
4070#endif
4071
4072	switch (subr) {
4073	case DIF_SUBR_RAND:
4074		regs[rd] = (dtrace_gethrtime() * 2416 + 374441) % 1771875;
4075		break;
4076
4077#if defined(sun)
4078	case DIF_SUBR_MUTEX_OWNED:
4079		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4080		    mstate, vstate)) {
4081			regs[rd] = 0;
4082			break;
4083		}
4084
4085		m.mx = dtrace_load64(tupregs[0].dttk_value);
4086		if (MUTEX_TYPE_ADAPTIVE(&m.mi))
4087			regs[rd] = MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER;
4088		else
4089			regs[rd] = LOCK_HELD(&m.mi.m_spin.m_spinlock);
4090		break;
4091
4092	case DIF_SUBR_MUTEX_OWNER:
4093		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4094		    mstate, vstate)) {
4095			regs[rd] = 0;
4096			break;
4097		}
4098
4099		m.mx = dtrace_load64(tupregs[0].dttk_value);
4100		if (MUTEX_TYPE_ADAPTIVE(&m.mi) &&
4101		    MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER)
4102			regs[rd] = (uintptr_t)MUTEX_OWNER(&m.mi);
4103		else
4104			regs[rd] = 0;
4105		break;
4106
4107	case DIF_SUBR_MUTEX_TYPE_ADAPTIVE:
4108		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4109		    mstate, vstate)) {
4110			regs[rd] = 0;
4111			break;
4112		}
4113
4114		m.mx = dtrace_load64(tupregs[0].dttk_value);
4115		regs[rd] = MUTEX_TYPE_ADAPTIVE(&m.mi);
4116		break;
4117
4118	case DIF_SUBR_MUTEX_TYPE_SPIN:
4119		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4120		    mstate, vstate)) {
4121			regs[rd] = 0;
4122			break;
4123		}
4124
4125		m.mx = dtrace_load64(tupregs[0].dttk_value);
4126		regs[rd] = MUTEX_TYPE_SPIN(&m.mi);
4127		break;
4128
4129	case DIF_SUBR_RW_READ_HELD: {
4130		uintptr_t tmp;
4131
4132		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
4133		    mstate, vstate)) {
4134			regs[rd] = 0;
4135			break;
4136		}
4137
4138		r.rw = dtrace_loadptr(tupregs[0].dttk_value);
4139		regs[rd] = _RW_READ_HELD(&r.ri, tmp);
4140		break;
4141	}
4142
4143	case DIF_SUBR_RW_WRITE_HELD:
4144		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
4145		    mstate, vstate)) {
4146			regs[rd] = 0;
4147			break;
4148		}
4149
4150		r.rw = dtrace_loadptr(tupregs[0].dttk_value);
4151		regs[rd] = _RW_WRITE_HELD(&r.ri);
4152		break;
4153
4154	case DIF_SUBR_RW_ISWRITER:
4155		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
4156		    mstate, vstate)) {
4157			regs[rd] = 0;
4158			break;
4159		}
4160
4161		r.rw = dtrace_loadptr(tupregs[0].dttk_value);
4162		regs[rd] = _RW_ISWRITER(&r.ri);
4163		break;
4164
4165#else
4166	case DIF_SUBR_MUTEX_OWNED:
4167		if (!dtrace_canload(tupregs[0].dttk_value,
4168			sizeof (struct lock_object), mstate, vstate)) {
4169			regs[rd] = 0;
4170			break;
4171		}
4172		l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
4173		regs[rd] = LOCK_CLASS(l.li)->lc_owner(l.li, &lowner);
4174		break;
4175
4176	case DIF_SUBR_MUTEX_OWNER:
4177		if (!dtrace_canload(tupregs[0].dttk_value,
4178			sizeof (struct lock_object), mstate, vstate)) {
4179			regs[rd] = 0;
4180			break;
4181		}
4182		l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
4183		LOCK_CLASS(l.li)->lc_owner(l.li, &lowner);
4184		regs[rd] = (uintptr_t)lowner;
4185		break;
4186
4187	case DIF_SUBR_MUTEX_TYPE_ADAPTIVE:
4188		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (struct mtx),
4189		    mstate, vstate)) {
4190			regs[rd] = 0;
4191			break;
4192		}
4193		l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
4194		/* XXX - should be only LC_SLEEPABLE? */
4195		regs[rd] = (LOCK_CLASS(l.li)->lc_flags &
4196		    (LC_SLEEPLOCK | LC_SLEEPABLE)) != 0;
4197		break;
4198
4199	case DIF_SUBR_MUTEX_TYPE_SPIN:
4200		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (struct mtx),
4201		    mstate, vstate)) {
4202			regs[rd] = 0;
4203			break;
4204		}
4205		l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
4206		regs[rd] = (LOCK_CLASS(l.li)->lc_flags & LC_SPINLOCK) != 0;
4207		break;
4208
4209	case DIF_SUBR_RW_READ_HELD:
4210	case DIF_SUBR_SX_SHARED_HELD:
4211		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
4212		    mstate, vstate)) {
4213			regs[rd] = 0;
4214			break;
4215		}
4216		l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
4217		regs[rd] = LOCK_CLASS(l.li)->lc_owner(l.li, &lowner) &&
4218		    lowner == NULL;
4219		break;
4220
4221	case DIF_SUBR_RW_WRITE_HELD:
4222	case DIF_SUBR_SX_EXCLUSIVE_HELD:
4223		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
4224		    mstate, vstate)) {
4225			regs[rd] = 0;
4226			break;
4227		}
4228		l.lx = dtrace_loadptr(tupregs[0].dttk_value);
4229		LOCK_CLASS(l.li)->lc_owner(l.li, &lowner);
4230		regs[rd] = (lowner == curthread);
4231		break;
4232
4233	case DIF_SUBR_RW_ISWRITER:
4234	case DIF_SUBR_SX_ISEXCLUSIVE:
4235		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
4236		    mstate, vstate)) {
4237			regs[rd] = 0;
4238			break;
4239		}
4240		l.lx = dtrace_loadptr(tupregs[0].dttk_value);
4241		regs[rd] = LOCK_CLASS(l.li)->lc_owner(l.li, &lowner) &&
4242		    lowner != NULL;
4243		break;
4244#endif /* ! defined(sun) */
4245
4246	case DIF_SUBR_BCOPY: {
4247		/*
4248		 * We need to be sure that the destination is in the scratch
4249		 * region -- no other region is allowed.
4250		 */
4251		uintptr_t src = tupregs[0].dttk_value;
4252		uintptr_t dest = tupregs[1].dttk_value;
4253		size_t size = tupregs[2].dttk_value;
4254
4255		if (!dtrace_inscratch(dest, size, mstate)) {
4256			*flags |= CPU_DTRACE_BADADDR;
4257			*illval = regs[rd];
4258			break;
4259		}
4260
4261		if (!dtrace_canload(src, size, mstate, vstate)) {
4262			regs[rd] = 0;
4263			break;
4264		}
4265
4266		dtrace_bcopy((void *)src, (void *)dest, size);
4267		break;
4268	}
4269
4270	case DIF_SUBR_ALLOCA:
4271	case DIF_SUBR_COPYIN: {
4272		uintptr_t dest = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
4273		uint64_t size =
4274		    tupregs[subr == DIF_SUBR_ALLOCA ? 0 : 1].dttk_value;
4275		size_t scratch_size = (dest - mstate->dtms_scratch_ptr) + size;
4276
4277		/*
4278		 * This action doesn't require any credential checks since
4279		 * probes will not activate in user contexts to which the
4280		 * enabling user does not have permissions.
4281		 */
4282
4283		/*
4284		 * Rounding up the user allocation size could have overflowed
4285		 * a large, bogus allocation (like -1ULL) to 0.
4286		 */
4287		if (scratch_size < size ||
4288		    !DTRACE_INSCRATCH(mstate, scratch_size)) {
4289			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4290			regs[rd] = 0;
4291			break;
4292		}
4293
4294		if (subr == DIF_SUBR_COPYIN) {
4295			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4296			dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
4297			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4298		}
4299
4300		mstate->dtms_scratch_ptr += scratch_size;
4301		regs[rd] = dest;
4302		break;
4303	}
4304
4305	case DIF_SUBR_COPYINTO: {
4306		uint64_t size = tupregs[1].dttk_value;
4307		uintptr_t dest = tupregs[2].dttk_value;
4308
4309		/*
4310		 * This action doesn't require any credential checks since
4311		 * probes will not activate in user contexts to which the
4312		 * enabling user does not have permissions.
4313		 */
4314		if (!dtrace_inscratch(dest, size, mstate)) {
4315			*flags |= CPU_DTRACE_BADADDR;
4316			*illval = regs[rd];
4317			break;
4318		}
4319
4320		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4321		dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
4322		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4323		break;
4324	}
4325
4326	case DIF_SUBR_COPYINSTR: {
4327		uintptr_t dest = mstate->dtms_scratch_ptr;
4328		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4329
4330		if (nargs > 1 && tupregs[1].dttk_value < size)
4331			size = tupregs[1].dttk_value + 1;
4332
4333		/*
4334		 * This action doesn't require any credential checks since
4335		 * probes will not activate in user contexts to which the
4336		 * enabling user does not have permissions.
4337		 */
4338		if (!DTRACE_INSCRATCH(mstate, size)) {
4339			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4340			regs[rd] = 0;
4341			break;
4342		}
4343
4344		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4345		dtrace_copyinstr(tupregs[0].dttk_value, dest, size, flags);
4346		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4347
4348		((char *)dest)[size - 1] = '\0';
4349		mstate->dtms_scratch_ptr += size;
4350		regs[rd] = dest;
4351		break;
4352	}
4353
4354#if defined(sun)
4355	case DIF_SUBR_MSGSIZE:
4356	case DIF_SUBR_MSGDSIZE: {
4357		uintptr_t baddr = tupregs[0].dttk_value, daddr;
4358		uintptr_t wptr, rptr;
4359		size_t count = 0;
4360		int cont = 0;
4361
4362		while (baddr != 0 && !(*flags & CPU_DTRACE_FAULT)) {
4363
4364			if (!dtrace_canload(baddr, sizeof (mblk_t), mstate,
4365			    vstate)) {
4366				regs[rd] = 0;
4367				break;
4368			}
4369
4370			wptr = dtrace_loadptr(baddr +
4371			    offsetof(mblk_t, b_wptr));
4372
4373			rptr = dtrace_loadptr(baddr +
4374			    offsetof(mblk_t, b_rptr));
4375
4376			if (wptr < rptr) {
4377				*flags |= CPU_DTRACE_BADADDR;
4378				*illval = tupregs[0].dttk_value;
4379				break;
4380			}
4381
4382			daddr = dtrace_loadptr(baddr +
4383			    offsetof(mblk_t, b_datap));
4384
4385			baddr = dtrace_loadptr(baddr +
4386			    offsetof(mblk_t, b_cont));
4387
4388			/*
4389			 * We want to prevent against denial-of-service here,
4390			 * so we're only going to search the list for
4391			 * dtrace_msgdsize_max mblks.
4392			 */
4393			if (cont++ > dtrace_msgdsize_max) {
4394				*flags |= CPU_DTRACE_ILLOP;
4395				break;
4396			}
4397
4398			if (subr == DIF_SUBR_MSGDSIZE) {
4399				if (dtrace_load8(daddr +
4400				    offsetof(dblk_t, db_type)) != M_DATA)
4401					continue;
4402			}
4403
4404			count += wptr - rptr;
4405		}
4406
4407		if (!(*flags & CPU_DTRACE_FAULT))
4408			regs[rd] = count;
4409
4410		break;
4411	}
4412#endif
4413
4414	case DIF_SUBR_PROGENYOF: {
4415		pid_t pid = tupregs[0].dttk_value;
4416		proc_t *p;
4417		int rval = 0;
4418
4419		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4420
4421		for (p = curthread->t_procp; p != NULL; p = p->p_parent) {
4422#if defined(sun)
4423			if (p->p_pidp->pid_id == pid) {
4424#else
4425			if (p->p_pid == pid) {
4426#endif
4427				rval = 1;
4428				break;
4429			}
4430		}
4431
4432		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4433
4434		regs[rd] = rval;
4435		break;
4436	}
4437
4438	case DIF_SUBR_SPECULATION:
4439		regs[rd] = dtrace_speculation(state);
4440		break;
4441
4442	case DIF_SUBR_COPYOUT: {
4443		uintptr_t kaddr = tupregs[0].dttk_value;
4444		uintptr_t uaddr = tupregs[1].dttk_value;
4445		uint64_t size = tupregs[2].dttk_value;
4446
4447		if (!dtrace_destructive_disallow &&
4448		    dtrace_priv_proc_control(state) &&
4449		    !dtrace_istoxic(kaddr, size)) {
4450			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4451			dtrace_copyout(kaddr, uaddr, size, flags);
4452			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4453		}
4454		break;
4455	}
4456
4457	case DIF_SUBR_COPYOUTSTR: {
4458		uintptr_t kaddr = tupregs[0].dttk_value;
4459		uintptr_t uaddr = tupregs[1].dttk_value;
4460		uint64_t size = tupregs[2].dttk_value;
4461
4462		if (!dtrace_destructive_disallow &&
4463		    dtrace_priv_proc_control(state) &&
4464		    !dtrace_istoxic(kaddr, size)) {
4465			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4466			dtrace_copyoutstr(kaddr, uaddr, size, flags);
4467			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4468		}
4469		break;
4470	}
4471
4472	case DIF_SUBR_STRLEN: {
4473		size_t sz;
4474		uintptr_t addr = (uintptr_t)tupregs[0].dttk_value;
4475		sz = dtrace_strlen((char *)addr,
4476		    state->dts_options[DTRACEOPT_STRSIZE]);
4477
4478		if (!dtrace_canload(addr, sz + 1, mstate, vstate)) {
4479			regs[rd] = 0;
4480			break;
4481		}
4482
4483		regs[rd] = sz;
4484
4485		break;
4486	}
4487
4488	case DIF_SUBR_STRCHR:
4489	case DIF_SUBR_STRRCHR: {
4490		/*
4491		 * We're going to iterate over the string looking for the
4492		 * specified character.  We will iterate until we have reached
4493		 * the string length or we have found the character.  If this
4494		 * is DIF_SUBR_STRRCHR, we will look for the last occurrence
4495		 * of the specified character instead of the first.
4496		 */
4497		uintptr_t saddr = tupregs[0].dttk_value;
4498		uintptr_t addr = tupregs[0].dttk_value;
4499		uintptr_t limit = addr + state->dts_options[DTRACEOPT_STRSIZE];
4500		char c, target = (char)tupregs[1].dttk_value;
4501
4502		for (regs[rd] = 0; addr < limit; addr++) {
4503			if ((c = dtrace_load8(addr)) == target) {
4504				regs[rd] = addr;
4505
4506				if (subr == DIF_SUBR_STRCHR)
4507					break;
4508			}
4509
4510			if (c == '\0')
4511				break;
4512		}
4513
4514		if (!dtrace_canload(saddr, addr - saddr, mstate, vstate)) {
4515			regs[rd] = 0;
4516			break;
4517		}
4518
4519		break;
4520	}
4521
4522	case DIF_SUBR_STRSTR:
4523	case DIF_SUBR_INDEX:
4524	case DIF_SUBR_RINDEX: {
4525		/*
4526		 * We're going to iterate over the string looking for the
4527		 * specified string.  We will iterate until we have reached
4528		 * the string length or we have found the string.  (Yes, this
4529		 * is done in the most naive way possible -- but considering
4530		 * that the string we're searching for is likely to be
4531		 * relatively short, the complexity of Rabin-Karp or similar
4532		 * hardly seems merited.)
4533		 */
4534		char *addr = (char *)(uintptr_t)tupregs[0].dttk_value;
4535		char *substr = (char *)(uintptr_t)tupregs[1].dttk_value;
4536		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4537		size_t len = dtrace_strlen(addr, size);
4538		size_t sublen = dtrace_strlen(substr, size);
4539		char *limit = addr + len, *orig = addr;
4540		int notfound = subr == DIF_SUBR_STRSTR ? 0 : -1;
4541		int inc = 1;
4542
4543		regs[rd] = notfound;
4544
4545		if (!dtrace_canload((uintptr_t)addr, len + 1, mstate, vstate)) {
4546			regs[rd] = 0;
4547			break;
4548		}
4549
4550		if (!dtrace_canload((uintptr_t)substr, sublen + 1, mstate,
4551		    vstate)) {
4552			regs[rd] = 0;
4553			break;
4554		}
4555
4556		/*
4557		 * strstr() and index()/rindex() have similar semantics if
4558		 * both strings are the empty string: strstr() returns a
4559		 * pointer to the (empty) string, and index() and rindex()
4560		 * both return index 0 (regardless of any position argument).
4561		 */
4562		if (sublen == 0 && len == 0) {
4563			if (subr == DIF_SUBR_STRSTR)
4564				regs[rd] = (uintptr_t)addr;
4565			else
4566				regs[rd] = 0;
4567			break;
4568		}
4569
4570		if (subr != DIF_SUBR_STRSTR) {
4571			if (subr == DIF_SUBR_RINDEX) {
4572				limit = orig - 1;
4573				addr += len;
4574				inc = -1;
4575			}
4576
4577			/*
4578			 * Both index() and rindex() take an optional position
4579			 * argument that denotes the starting position.
4580			 */
4581			if (nargs == 3) {
4582				int64_t pos = (int64_t)tupregs[2].dttk_value;
4583
4584				/*
4585				 * If the position argument to index() is
4586				 * negative, Perl implicitly clamps it at
4587				 * zero.  This semantic is a little surprising
4588				 * given the special meaning of negative
4589				 * positions to similar Perl functions like
4590				 * substr(), but it appears to reflect a
4591				 * notion that index() can start from a
4592				 * negative index and increment its way up to
4593				 * the string.  Given this notion, Perl's
4594				 * rindex() is at least self-consistent in
4595				 * that it implicitly clamps positions greater
4596				 * than the string length to be the string
4597				 * length.  Where Perl completely loses
4598				 * coherence, however, is when the specified
4599				 * substring is the empty string ("").  In
4600				 * this case, even if the position is
4601				 * negative, rindex() returns 0 -- and even if
4602				 * the position is greater than the length,
4603				 * index() returns the string length.  These
4604				 * semantics violate the notion that index()
4605				 * should never return a value less than the
4606				 * specified position and that rindex() should
4607				 * never return a value greater than the
4608				 * specified position.  (One assumes that
4609				 * these semantics are artifacts of Perl's
4610				 * implementation and not the results of
4611				 * deliberate design -- it beggars belief that
4612				 * even Larry Wall could desire such oddness.)
4613				 * While in the abstract one would wish for
4614				 * consistent position semantics across
4615				 * substr(), index() and rindex() -- or at the
4616				 * very least self-consistent position
4617				 * semantics for index() and rindex() -- we
4618				 * instead opt to keep with the extant Perl
4619				 * semantics, in all their broken glory.  (Do
4620				 * we have more desire to maintain Perl's
4621				 * semantics than Perl does?  Probably.)
4622				 */
4623				if (subr == DIF_SUBR_RINDEX) {
4624					if (pos < 0) {
4625						if (sublen == 0)
4626							regs[rd] = 0;
4627						break;
4628					}
4629
4630					if (pos > len)
4631						pos = len;
4632				} else {
4633					if (pos < 0)
4634						pos = 0;
4635
4636					if (pos >= len) {
4637						if (sublen == 0)
4638							regs[rd] = len;
4639						break;
4640					}
4641				}
4642
4643				addr = orig + pos;
4644			}
4645		}
4646
4647		for (regs[rd] = notfound; addr != limit; addr += inc) {
4648			if (dtrace_strncmp(addr, substr, sublen) == 0) {
4649				if (subr != DIF_SUBR_STRSTR) {
4650					/*
4651					 * As D index() and rindex() are
4652					 * modeled on Perl (and not on awk),
4653					 * we return a zero-based (and not a
4654					 * one-based) index.  (For you Perl
4655					 * weenies: no, we're not going to add
4656					 * $[ -- and shouldn't you be at a con
4657					 * or something?)
4658					 */
4659					regs[rd] = (uintptr_t)(addr - orig);
4660					break;
4661				}
4662
4663				ASSERT(subr == DIF_SUBR_STRSTR);
4664				regs[rd] = (uintptr_t)addr;
4665				break;
4666			}
4667		}
4668
4669		break;
4670	}
4671
4672	case DIF_SUBR_STRTOK: {
4673		uintptr_t addr = tupregs[0].dttk_value;
4674		uintptr_t tokaddr = tupregs[1].dttk_value;
4675		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4676		uintptr_t limit, toklimit = tokaddr + size;
4677		uint8_t c = 0, tokmap[32];	 /* 256 / 8 */
4678		char *dest = (char *)mstate->dtms_scratch_ptr;
4679		int i;
4680
4681		/*
4682		 * Check both the token buffer and (later) the input buffer,
4683		 * since both could be non-scratch addresses.
4684		 */
4685		if (!dtrace_strcanload(tokaddr, size, mstate, vstate)) {
4686			regs[rd] = 0;
4687			break;
4688		}
4689
4690		if (!DTRACE_INSCRATCH(mstate, size)) {
4691			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4692			regs[rd] = 0;
4693			break;
4694		}
4695
4696		if (addr == 0) {
4697			/*
4698			 * If the address specified is NULL, we use our saved
4699			 * strtok pointer from the mstate.  Note that this
4700			 * means that the saved strtok pointer is _only_
4701			 * valid within multiple enablings of the same probe --
4702			 * it behaves like an implicit clause-local variable.
4703			 */
4704			addr = mstate->dtms_strtok;
4705		} else {
4706			/*
4707			 * If the user-specified address is non-NULL we must
4708			 * access check it.  This is the only time we have
4709			 * a chance to do so, since this address may reside
4710			 * in the string table of this clause-- future calls
4711			 * (when we fetch addr from mstate->dtms_strtok)
4712			 * would fail this access check.
4713			 */
4714			if (!dtrace_strcanload(addr, size, mstate, vstate)) {
4715				regs[rd] = 0;
4716				break;
4717			}
4718		}
4719
4720		/*
4721		 * First, zero the token map, and then process the token
4722		 * string -- setting a bit in the map for every character
4723		 * found in the token string.
4724		 */
4725		for (i = 0; i < sizeof (tokmap); i++)
4726			tokmap[i] = 0;
4727
4728		for (; tokaddr < toklimit; tokaddr++) {
4729			if ((c = dtrace_load8(tokaddr)) == '\0')
4730				break;
4731
4732			ASSERT((c >> 3) < sizeof (tokmap));
4733			tokmap[c >> 3] |= (1 << (c & 0x7));
4734		}
4735
4736		for (limit = addr + size; addr < limit; addr++) {
4737			/*
4738			 * We're looking for a character that is _not_ contained
4739			 * in the token string.
4740			 */
4741			if ((c = dtrace_load8(addr)) == '\0')
4742				break;
4743
4744			if (!(tokmap[c >> 3] & (1 << (c & 0x7))))
4745				break;
4746		}
4747
4748		if (c == '\0') {
4749			/*
4750			 * We reached the end of the string without finding
4751			 * any character that was not in the token string.
4752			 * We return NULL in this case, and we set the saved
4753			 * address to NULL as well.
4754			 */
4755			regs[rd] = 0;
4756			mstate->dtms_strtok = 0;
4757			break;
4758		}
4759
4760		/*
4761		 * From here on, we're copying into the destination string.
4762		 */
4763		for (i = 0; addr < limit && i < size - 1; addr++) {
4764			if ((c = dtrace_load8(addr)) == '\0')
4765				break;
4766
4767			if (tokmap[c >> 3] & (1 << (c & 0x7)))
4768				break;
4769
4770			ASSERT(i < size);
4771			dest[i++] = c;
4772		}
4773
4774		ASSERT(i < size);
4775		dest[i] = '\0';
4776		regs[rd] = (uintptr_t)dest;
4777		mstate->dtms_scratch_ptr += size;
4778		mstate->dtms_strtok = addr;
4779		break;
4780	}
4781
4782	case DIF_SUBR_SUBSTR: {
4783		uintptr_t s = tupregs[0].dttk_value;
4784		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4785		char *d = (char *)mstate->dtms_scratch_ptr;
4786		int64_t index = (int64_t)tupregs[1].dttk_value;
4787		int64_t remaining = (int64_t)tupregs[2].dttk_value;
4788		size_t len = dtrace_strlen((char *)s, size);
4789		int64_t i;
4790
4791		if (!dtrace_canload(s, len + 1, mstate, vstate)) {
4792			regs[rd] = 0;
4793			break;
4794		}
4795
4796		if (!DTRACE_INSCRATCH(mstate, size)) {
4797			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4798			regs[rd] = 0;
4799			break;
4800		}
4801
4802		if (nargs <= 2)
4803			remaining = (int64_t)size;
4804
4805		if (index < 0) {
4806			index += len;
4807
4808			if (index < 0 && index + remaining > 0) {
4809				remaining += index;
4810				index = 0;
4811			}
4812		}
4813
4814		if (index >= len || index < 0) {
4815			remaining = 0;
4816		} else if (remaining < 0) {
4817			remaining += len - index;
4818		} else if (index + remaining > size) {
4819			remaining = size - index;
4820		}
4821
4822		for (i = 0; i < remaining; i++) {
4823			if ((d[i] = dtrace_load8(s + index + i)) == '\0')
4824				break;
4825		}
4826
4827		d[i] = '\0';
4828
4829		mstate->dtms_scratch_ptr += size;
4830		regs[rd] = (uintptr_t)d;
4831		break;
4832	}
4833
4834	case DIF_SUBR_JSON: {
4835		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4836		uintptr_t json = tupregs[0].dttk_value;
4837		size_t jsonlen = dtrace_strlen((char *)json, size);
4838		uintptr_t elem = tupregs[1].dttk_value;
4839		size_t elemlen = dtrace_strlen((char *)elem, size);
4840
4841		char *dest = (char *)mstate->dtms_scratch_ptr;
4842		char *elemlist = (char *)mstate->dtms_scratch_ptr + jsonlen + 1;
4843		char *ee = elemlist;
4844		int nelems = 1;
4845		uintptr_t cur;
4846
4847		if (!dtrace_canload(json, jsonlen + 1, mstate, vstate) ||
4848		    !dtrace_canload(elem, elemlen + 1, mstate, vstate)) {
4849			regs[rd] = 0;
4850			break;
4851		}
4852
4853		if (!DTRACE_INSCRATCH(mstate, jsonlen + 1 + elemlen + 1)) {
4854			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4855			regs[rd] = 0;
4856			break;
4857		}
4858
4859		/*
4860		 * Read the element selector and split it up into a packed list
4861		 * of strings.
4862		 */
4863		for (cur = elem; cur < elem + elemlen; cur++) {
4864			char cc = dtrace_load8(cur);
4865
4866			if (cur == elem && cc == '[') {
4867				/*
4868				 * If the first element selector key is
4869				 * actually an array index then ignore the
4870				 * bracket.
4871				 */
4872				continue;
4873			}
4874
4875			if (cc == ']')
4876				continue;
4877
4878			if (cc == '.' || cc == '[') {
4879				nelems++;
4880				cc = '\0';
4881			}
4882
4883			*ee++ = cc;
4884		}
4885		*ee++ = '\0';
4886
4887		if ((regs[rd] = (uintptr_t)dtrace_json(size, json, elemlist,
4888		    nelems, dest)) != 0)
4889			mstate->dtms_scratch_ptr += jsonlen + 1;
4890		break;
4891	}
4892
4893	case DIF_SUBR_TOUPPER:
4894	case DIF_SUBR_TOLOWER: {
4895		uintptr_t s = tupregs[0].dttk_value;
4896		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4897		char *dest = (char *)mstate->dtms_scratch_ptr, c;
4898		size_t len = dtrace_strlen((char *)s, size);
4899		char lower, upper, convert;
4900		int64_t i;
4901
4902		if (subr == DIF_SUBR_TOUPPER) {
4903			lower = 'a';
4904			upper = 'z';
4905			convert = 'A';
4906		} else {
4907			lower = 'A';
4908			upper = 'Z';
4909			convert = 'a';
4910		}
4911
4912		if (!dtrace_canload(s, len + 1, mstate, vstate)) {
4913			regs[rd] = 0;
4914			break;
4915		}
4916
4917		if (!DTRACE_INSCRATCH(mstate, size)) {
4918			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4919			regs[rd] = 0;
4920			break;
4921		}
4922
4923		for (i = 0; i < size - 1; i++) {
4924			if ((c = dtrace_load8(s + i)) == '\0')
4925				break;
4926
4927			if (c >= lower && c <= upper)
4928				c = convert + (c - lower);
4929
4930			dest[i] = c;
4931		}
4932
4933		ASSERT(i < size);
4934		dest[i] = '\0';
4935		regs[rd] = (uintptr_t)dest;
4936		mstate->dtms_scratch_ptr += size;
4937		break;
4938	}
4939
4940#if defined(sun)
4941	case DIF_SUBR_GETMAJOR:
4942#ifdef _LP64
4943		regs[rd] = (tupregs[0].dttk_value >> NBITSMINOR64) & MAXMAJ64;
4944#else
4945		regs[rd] = (tupregs[0].dttk_value >> NBITSMINOR) & MAXMAJ;
4946#endif
4947		break;
4948
4949	case DIF_SUBR_GETMINOR:
4950#ifdef _LP64
4951		regs[rd] = tupregs[0].dttk_value & MAXMIN64;
4952#else
4953		regs[rd] = tupregs[0].dttk_value & MAXMIN;
4954#endif
4955		break;
4956
4957	case DIF_SUBR_DDI_PATHNAME: {
4958		/*
4959		 * This one is a galactic mess.  We are going to roughly
4960		 * emulate ddi_pathname(), but it's made more complicated
4961		 * by the fact that we (a) want to include the minor name and
4962		 * (b) must proceed iteratively instead of recursively.
4963		 */
4964		uintptr_t dest = mstate->dtms_scratch_ptr;
4965		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4966		char *start = (char *)dest, *end = start + size - 1;
4967		uintptr_t daddr = tupregs[0].dttk_value;
4968		int64_t minor = (int64_t)tupregs[1].dttk_value;
4969		char *s;
4970		int i, len, depth = 0;
4971
4972		/*
4973		 * Due to all the pointer jumping we do and context we must
4974		 * rely upon, we just mandate that the user must have kernel
4975		 * read privileges to use this routine.
4976		 */
4977		if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) == 0) {
4978			*flags |= CPU_DTRACE_KPRIV;
4979			*illval = daddr;
4980			regs[rd] = 0;
4981		}
4982
4983		if (!DTRACE_INSCRATCH(mstate, size)) {
4984			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4985			regs[rd] = 0;
4986			break;
4987		}
4988
4989		*end = '\0';
4990
4991		/*
4992		 * We want to have a name for the minor.  In order to do this,
4993		 * we need to walk the minor list from the devinfo.  We want
4994		 * to be sure that we don't infinitely walk a circular list,
4995		 * so we check for circularity by sending a scout pointer
4996		 * ahead two elements for every element that we iterate over;
4997		 * if the list is circular, these will ultimately point to the
4998		 * same element.  You may recognize this little trick as the
4999		 * answer to a stupid interview question -- one that always
5000		 * seems to be asked by those who had to have it laboriously
5001		 * explained to them, and who can't even concisely describe
5002		 * the conditions under which one would be forced to resort to
5003		 * this technique.  Needless to say, those conditions are
5004		 * found here -- and probably only here.  Is this the only use
5005		 * of this infamous trick in shipping, production code?  If it
5006		 * isn't, it probably should be...
5007		 */
5008		if (minor != -1) {
5009			uintptr_t maddr = dtrace_loadptr(daddr +
5010			    offsetof(struct dev_info, devi_minor));
5011
5012			uintptr_t next = offsetof(struct ddi_minor_data, next);
5013			uintptr_t name = offsetof(struct ddi_minor_data,
5014			    d_minor) + offsetof(struct ddi_minor, name);
5015			uintptr_t dev = offsetof(struct ddi_minor_data,
5016			    d_minor) + offsetof(struct ddi_minor, dev);
5017			uintptr_t scout;
5018
5019			if (maddr != NULL)
5020				scout = dtrace_loadptr(maddr + next);
5021
5022			while (maddr != NULL && !(*flags & CPU_DTRACE_FAULT)) {
5023				uint64_t m;
5024#ifdef _LP64
5025				m = dtrace_load64(maddr + dev) & MAXMIN64;
5026#else
5027				m = dtrace_load32(maddr + dev) & MAXMIN;
5028#endif
5029				if (m != minor) {
5030					maddr = dtrace_loadptr(maddr + next);
5031
5032					if (scout == NULL)
5033						continue;
5034
5035					scout = dtrace_loadptr(scout + next);
5036
5037					if (scout == NULL)
5038						continue;
5039
5040					scout = dtrace_loadptr(scout + next);
5041
5042					if (scout == NULL)
5043						continue;
5044
5045					if (scout == maddr) {
5046						*flags |= CPU_DTRACE_ILLOP;
5047						break;
5048					}
5049
5050					continue;
5051				}
5052
5053				/*
5054				 * We have the minor data.  Now we need to
5055				 * copy the minor's name into the end of the
5056				 * pathname.
5057				 */
5058				s = (char *)dtrace_loadptr(maddr + name);
5059				len = dtrace_strlen(s, size);
5060
5061				if (*flags & CPU_DTRACE_FAULT)
5062					break;
5063
5064				if (len != 0) {
5065					if ((end -= (len + 1)) < start)
5066						break;
5067
5068					*end = ':';
5069				}
5070
5071				for (i = 1; i <= len; i++)
5072					end[i] = dtrace_load8((uintptr_t)s++);
5073				break;
5074			}
5075		}
5076
5077		while (daddr != NULL && !(*flags & CPU_DTRACE_FAULT)) {
5078			ddi_node_state_t devi_state;
5079
5080			devi_state = dtrace_load32(daddr +
5081			    offsetof(struct dev_info, devi_node_state));
5082
5083			if (*flags & CPU_DTRACE_FAULT)
5084				break;
5085
5086			if (devi_state >= DS_INITIALIZED) {
5087				s = (char *)dtrace_loadptr(daddr +
5088				    offsetof(struct dev_info, devi_addr));
5089				len = dtrace_strlen(s, size);
5090
5091				if (*flags & CPU_DTRACE_FAULT)
5092					break;
5093
5094				if (len != 0) {
5095					if ((end -= (len + 1)) < start)
5096						break;
5097
5098					*end = '@';
5099				}
5100
5101				for (i = 1; i <= len; i++)
5102					end[i] = dtrace_load8((uintptr_t)s++);
5103			}
5104
5105			/*
5106			 * Now for the node name...
5107			 */
5108			s = (char *)dtrace_loadptr(daddr +
5109			    offsetof(struct dev_info, devi_node_name));
5110
5111			daddr = dtrace_loadptr(daddr +
5112			    offsetof(struct dev_info, devi_parent));
5113
5114			/*
5115			 * If our parent is NULL (that is, if we're the root
5116			 * node), we're going to use the special path
5117			 * "devices".
5118			 */
5119			if (daddr == 0)
5120				s = "devices";
5121
5122			len = dtrace_strlen(s, size);
5123			if (*flags & CPU_DTRACE_FAULT)
5124				break;
5125
5126			if ((end -= (len + 1)) < start)
5127				break;
5128
5129			for (i = 1; i <= len; i++)
5130				end[i] = dtrace_load8((uintptr_t)s++);
5131			*end = '/';
5132
5133			if (depth++ > dtrace_devdepth_max) {
5134				*flags |= CPU_DTRACE_ILLOP;
5135				break;
5136			}
5137		}
5138
5139		if (end < start)
5140			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5141
5142		if (daddr == 0) {
5143			regs[rd] = (uintptr_t)end;
5144			mstate->dtms_scratch_ptr += size;
5145		}
5146
5147		break;
5148	}
5149#endif
5150
	case DIF_SUBR_STRJOIN: {
		/*
		 * strjoin(s1, s2): concatenate two strings into scratch
		 * space, truncated to the strsize option.  Both sources are
		 * loaded byte-by-byte with dtrace_load8() so that faults are
		 * absorbed by the probe-context fault machinery rather than
		 * panicking the kernel.
		 */
		char *d = (char *)mstate->dtms_scratch_ptr;
		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
		uintptr_t s1 = tupregs[0].dttk_value;
		uintptr_t s2 = tupregs[1].dttk_value;
		int i = 0;

		/* Verify both source strings are safely loadable. */
		if (!dtrace_strcanload(s1, size, mstate, vstate) ||
		    !dtrace_strcanload(s2, size, mstate, vstate)) {
			regs[rd] = 0;
			break;
		}

		if (!DTRACE_INSCRATCH(mstate, size)) {
			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
			regs[rd] = 0;
			break;
		}

		/*
		 * Copy s1 up to (but not including) its terminating NUL;
		 * on overflow of the scratch string, flag NOSCRATCH and
		 * return NULL.  (i is promoted to uint64_t for the size
		 * comparison.)
		 */
		for (;;) {
			if (i >= size) {
				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
				regs[rd] = 0;
				break;
			}

			if ((d[i++] = dtrace_load8(s1++)) == '\0') {
				i--;	/* back up over the NUL; s2 overwrites it */
				break;
			}
		}

		/* Append s2, including its terminating NUL. */
		for (;;) {
			if (i >= size) {
				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
				regs[rd] = 0;
				break;
			}

			if ((d[i++] = dtrace_load8(s2++)) == '\0')
				break;
		}

		/*
		 * Only consume scratch and return the result if the joined
		 * string (with NUL) actually fit; otherwise regs[rd] was
		 * already zeroed above.
		 */
		if (i < size) {
			mstate->dtms_scratch_ptr += i;
			regs[rd] = (uintptr_t)d;
		}

		break;
	}
5201
	case DIF_SUBR_STRTOLL: {
		/*
		 * strtoll(s[, base]): parse the string at arg0 as a signed
		 * 64-bit integer in the given base (default 10).  Parsing is
		 * delegated to dtrace_strtoll(); the string is bounded by
		 * the strsize option.
		 */
		uintptr_t s = tupregs[0].dttk_value;
		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
		int base = 10;

		if (nargs > 1) {
			/*
			 * Valid bases are 2..36 ('0'-'9' plus 'a'-'z');
			 * anything else is an illegal-operation fault.
			 */
			if ((base = tupregs[1].dttk_value) <= 1 ||
			    base > ('z' - 'a' + 1) + ('9' - '0' + 1)) {
				*flags |= CPU_DTRACE_ILLOP;
				break;
			}
		}

		/* An unloadable string yields INT64_MIN rather than 0. */
		if (!dtrace_strcanload(s, size, mstate, vstate)) {
			regs[rd] = INT64_MIN;
			break;
		}

		regs[rd] = dtrace_strtoll((char *)s, base, size);
		break;
	}
5223
5224	case DIF_SUBR_LLTOSTR: {
5225		int64_t i = (int64_t)tupregs[0].dttk_value;
5226		uint64_t val, digit;
5227		uint64_t size = 65;	/* enough room for 2^64 in binary */
5228		char *end = (char *)mstate->dtms_scratch_ptr + size - 1;
5229		int base = 10;
5230
5231		if (nargs > 1) {
5232			if ((base = tupregs[1].dttk_value) <= 1 ||
5233			    base > ('z' - 'a' + 1) + ('9' - '0' + 1)) {
5234				*flags |= CPU_DTRACE_ILLOP;
5235				break;
5236			}
5237		}
5238
5239		val = (base == 10 && i < 0) ? i * -1 : i;
5240
5241		if (!DTRACE_INSCRATCH(mstate, size)) {
5242			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5243			regs[rd] = 0;
5244			break;
5245		}
5246
5247		for (*end-- = '\0'; val; val /= base) {
5248			if ((digit = val % base) <= '9' - '0') {
5249				*end-- = '0' + digit;
5250			} else {
5251				*end-- = 'a' + (digit - ('9' - '0') - 1);
5252			}
5253		}
5254
5255		if (i == 0 && base == 16)
5256			*end-- = '0';
5257
5258		if (base == 16)
5259			*end-- = 'x';
5260
5261		if (i == 0 || base == 8 || base == 16)
5262			*end-- = '0';
5263
5264		if (i < 0 && base == 10)
5265			*end-- = '-';
5266
5267		regs[rd] = (uintptr_t)end + 1;
5268		mstate->dtms_scratch_ptr += size;
5269		break;
5270	}
5271
	/*
	 * htons()/ntohs(), htonl()/ntohl(), htonll()/ntohll(): convert
	 * between host and network (big-endian) byte order at 16, 32 and
	 * 64 bits.  On a big-endian host each is just a truncation to the
	 * operand width; otherwise the bytes are swapped.  Each conversion
	 * is its own inverse, so the hton/ntoh pairs share one body.
	 */
	case DIF_SUBR_HTONS:
	case DIF_SUBR_NTOHS:
#if BYTE_ORDER == BIG_ENDIAN
		regs[rd] = (uint16_t)tupregs[0].dttk_value;
#else
		regs[rd] = DT_BSWAP_16((uint16_t)tupregs[0].dttk_value);
#endif
		break;


	case DIF_SUBR_HTONL:
	case DIF_SUBR_NTOHL:
#if BYTE_ORDER == BIG_ENDIAN
		regs[rd] = (uint32_t)tupregs[0].dttk_value;
#else
		regs[rd] = DT_BSWAP_32((uint32_t)tupregs[0].dttk_value);
#endif
		break;


	case DIF_SUBR_HTONLL:
	case DIF_SUBR_NTOHLL:
#if BYTE_ORDER == BIG_ENDIAN
		regs[rd] = (uint64_t)tupregs[0].dttk_value;
#else
		regs[rd] = DT_BSWAP_64((uint64_t)tupregs[0].dttk_value);
#endif
		break;
5300
5301
	/*
	 * dirname()/basename(): compute the directory portion or the final
	 * component of the path in arg0.  Trailing slashes are ignored, the
	 * empty string maps to ".", and an all-slash string maps to "/".
	 * The selected substring [start, end] of the source is copied into
	 * scratch, truncated to the strsize option.
	 */
	case DIF_SUBR_DIRNAME:
	case DIF_SUBR_BASENAME: {
		char *dest = (char *)mstate->dtms_scratch_ptr;
		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
		uintptr_t src = tupregs[0].dttk_value;
		int i, j, len = dtrace_strlen((char *)src, size);
		/*
		 * lastbase/firstbase bound the basename; lastdir is the last
		 * character of the dirname.  -1 means "not found (yet)".
		 */
		int lastbase = -1, firstbase = -1, lastdir = -1;
		int start, end;

		if (!dtrace_canload(src, len + 1, mstate, vstate)) {
			regs[rd] = 0;
			break;
		}

		if (!DTRACE_INSCRATCH(mstate, size)) {
			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
			regs[rd] = 0;
			break;
		}

		/*
		 * The basename and dirname for a zero-length string is
		 * defined to be "."
		 */
		if (len == 0) {
			len = 1;
			src = (uintptr_t)".";
		}

		/*
		 * Start from the back of the string, moving back toward the
		 * front until we see a character that isn't a slash.  That
		 * character is the last character in the basename.
		 */
		for (i = len - 1; i >= 0; i--) {
			if (dtrace_load8(src + i) != '/')
				break;
		}

		if (i >= 0)
			lastbase = i;

		/*
		 * Starting from the last character in the basename, move
		 * towards the front until we find a slash.  The character
		 * that we processed immediately before that is the first
		 * character in the basename.
		 */
		for (; i >= 0; i--) {
			if (dtrace_load8(src + i) == '/')
				break;
		}

		if (i >= 0)
			firstbase = i + 1;

		/*
		 * Now keep going until we find a non-slash character.  That
		 * character is the last character in the dirname.
		 */
		for (; i >= 0; i--) {
			if (dtrace_load8(src + i) != '/')
				break;
		}

		if (i >= 0)
			lastdir = i;

		ASSERT(!(lastbase == -1 && firstbase != -1));
		ASSERT(!(firstbase == -1 && lastdir != -1));

		if (lastbase == -1) {
			/*
			 * We didn't find a non-slash character.  We know that
			 * the length is non-zero, so the whole string must be
			 * slashes.  In either the dirname or the basename
			 * case, we return '/'.
			 */
			ASSERT(firstbase == -1);
			firstbase = lastbase = lastdir = 0;
		}

		if (firstbase == -1) {
			/*
			 * The entire string consists only of a basename
			 * component.  If we're looking for dirname, we need
			 * to change our string to be just "."; if we're
			 * looking for a basename, we'll just set the first
			 * character of the basename to be 0.
			 */
			if (subr == DIF_SUBR_DIRNAME) {
				ASSERT(lastdir == -1);
				src = (uintptr_t)".";
				lastdir = 0;
			} else {
				firstbase = 0;
			}
		}

		if (subr == DIF_SUBR_DIRNAME) {
			if (lastdir == -1) {
				/*
				 * We know that we have a slash in the name --
				 * or lastdir would be set to 0, above.  And
				 * because lastdir is -1, we know that this
				 * slash must be the first character.  (That
				 * is, the full string must be of the form
				 * "/basename".)  In this case, the last
				 * character of the directory name is 0.
				 */
				lastdir = 0;
			}

			start = 0;
			end = lastdir;
		} else {
			ASSERT(subr == DIF_SUBR_BASENAME);
			ASSERT(firstbase != -1 && lastbase != -1);
			start = firstbase;
			end = lastbase;
		}

		/* Copy the chosen span, leaving room for the NUL. */
		for (i = start, j = 0; i <= end && j < size - 1; i++, j++)
			dest[j] = dtrace_load8(src + i);

		dest[j] = '\0';
		regs[rd] = (uintptr_t)dest;
		mstate->dtms_scratch_ptr += size;
		break;
	}
5432
	case DIF_SUBR_GETF: {
		/*
		 * getf(fd): return the file_t * for the given descriptor in
		 * the current process, or NULL.  Requires process-level
		 * tracing privilege; the result is also stashed in
		 * dtms_getf for later access checks.
		 *
		 * NOTE(review): fget_locked() returns the pointer without
		 * taking a reference, and the descriptor table lock is
		 * dropped before the pointer is consumed -- presumably probe
		 * context constrains the file's lifetime here; confirm
		 * against the dtms_getf consumers.
		 */
		uintptr_t fd = tupregs[0].dttk_value;
		struct filedesc *fdp;
		file_t *fp;

		if (!dtrace_priv_proc(state)) {
			regs[rd] = 0;
			break;
		}
		fdp = curproc->p_fd;
		FILEDESC_SLOCK(fdp);
		fp = fget_locked(fdp, fd);
		mstate->dtms_getf = fp;
		regs[rd] = (uintptr_t)fp;
		FILEDESC_SUNLOCK(fdp);
		break;
	}
5450
	case DIF_SUBR_CLEANPATH: {
		/*
		 * cleanpath(s): copy the path at arg0 into scratch while
		 * collapsing "//" runs, "/./" components and "/../"
		 * components (the latter removing the preceding component).
		 * This is a purely lexical cleanup -- no symlink or mount
		 * point resolution is performed.
		 */
		char *dest = (char *)mstate->dtms_scratch_ptr, c;
		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
		uintptr_t src = tupregs[0].dttk_value;
		int i = 0, j = 0;
#if defined(sun)
		zone_t *z;
#endif

		if (!dtrace_strcanload(src, size, mstate, vstate)) {
			regs[rd] = 0;
			break;
		}

		if (!DTRACE_INSCRATCH(mstate, size)) {
			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
			regs[rd] = 0;
			break;
		}

		/*
		 * Move forward, loading each character.
		 */
		do {
			c = dtrace_load8(src + i++);
next:
			/*
			 * Reserve space for the worst case that a single
			 * iteration can emit ("/..c" plus the final NUL).
			 */
			if (j + 5 >= size)	/* 5 = strlen("/..c\0") */
				break;

			if (c != '/') {
				dest[j++] = c;
				continue;
			}

			c = dtrace_load8(src + i++);

			if (c == '/') {
				/*
				 * We have two slashes -- we can just advance
				 * to the next character.
				 */
				goto next;
			}

			if (c != '.') {
				/*
				 * This is not "." and it's not ".." -- we can
				 * just store the "/" and this character and
				 * drive on.
				 */
				dest[j++] = '/';
				dest[j++] = c;
				continue;
			}

			c = dtrace_load8(src + i++);

			if (c == '/') {
				/*
				 * This is a "/./" component.  We're not going
				 * to store anything in the destination buffer;
				 * we're just going to go to the next component.
				 */
				goto next;
			}

			if (c != '.') {
				/*
				 * This is not ".." -- we can just store the
				 * "/." and this character and continue
				 * processing.
				 */
				dest[j++] = '/';
				dest[j++] = '.';
				dest[j++] = c;
				continue;
			}

			c = dtrace_load8(src + i++);

			if (c != '/' && c != '\0') {
				/*
				 * This is not ".." -- it's "..[mumble]".
				 * We'll store the "/.." and this character
				 * and continue processing.
				 */
				dest[j++] = '/';
				dest[j++] = '.';
				dest[j++] = '.';
				dest[j++] = c;
				continue;
			}

			/*
			 * This is "/../" or "/..\0".  We need to back up
			 * our destination pointer until we find a "/".
			 */
			i--;
			while (j != 0 && dest[--j] != '/')
				continue;

			if (c == '\0')
				dest[++j] = '/';
		} while (c != '\0');

		dest[j] = '\0';

#if defined(sun)
		if (mstate->dtms_getf != NULL &&
		    !(mstate->dtms_access & DTRACE_ACCESS_KERNEL) &&
		    (z = state->dts_cred.dcr_cred->cr_zone) != kcred->cr_zone) {
			/*
			 * If we've done a getf() as a part of this ECB and we
			 * don't have kernel access (and we're not in the global
			 * zone), check if the path we cleaned up begins with
			 * the zone's root path, and trim it off if so.  Note
			 * that this is an output cleanliness issue, not a
			 * security issue: knowing one's zone root path does
			 * not enable privilege escalation.
			 */
			if (strstr(dest, z->zone_rootpath) == dest)
				dest += strlen(z->zone_rootpath) - 1;
		}
#endif

		regs[rd] = (uintptr_t)dest;
		mstate->dtms_scratch_ptr += size;
		break;
	}
5580
	/*
	 * inet_ntoa()/inet_ntoa6()/inet_ntop(): stringify an IPv4 or IPv6
	 * address into scratch.  inet_ntop() takes the address family as
	 * arg0 and the address pointer as arg1; the ntoa variants imply the
	 * family and take the pointer as arg0.  The string is built
	 * backwards from the end of the scratch buffer, so the result
	 * pointer (end + 1) generally lies inside the buffer, not at its
	 * start.
	 */
	case DIF_SUBR_INET_NTOA:
	case DIF_SUBR_INET_NTOA6:
	case DIF_SUBR_INET_NTOP: {
		size_t size;
		int af, argi, i;
		char *base, *end;

		if (subr == DIF_SUBR_INET_NTOP) {
			af = (int)tupregs[0].dttk_value;
			argi = 1;
		} else {
			af = subr == DIF_SUBR_INET_NTOA ? AF_INET: AF_INET6;
			argi = 0;
		}

		if (af == AF_INET) {
			ipaddr_t ip4;
			uint8_t *ptr8, val;

			/*
			 * Safely load the IPv4 address.
			 */
			ip4 = dtrace_load32(tupregs[argi].dttk_value);

			/*
			 * Check an IPv4 string will fit in scratch.
			 */
			size = INET_ADDRSTRLEN;
			if (!DTRACE_INSCRATCH(mstate, size)) {
				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
				regs[rd] = 0;
				break;
			}
			base = (char *)mstate->dtms_scratch_ptr;
			end = (char *)mstate->dtms_scratch_ptr + size - 1;

			/*
			 * Stringify as a dotted decimal quad.
			 */
			*end-- = '\0';
			ptr8 = (uint8_t *)&ip4;
			for (i = 3; i >= 0; i--) {
				val = ptr8[i];

				if (val == 0) {
					*end-- = '0';
				} else {
					for (; val; val /= 10) {
						*end-- = '0' + (val % 10);
					}
				}

				if (i > 0)
					*end-- = '.';
			}
			ASSERT(end + 1 >= base);

		} else if (af == AF_INET6) {
			struct in6_addr ip6;
			int firstzero, tryzero, numzero, v6end;
			uint16_t val;
			const char digits[] = "0123456789abcdef";

			/*
			 * Stringify using RFC 1884 convention 2 - 16 bit
			 * hexadecimal values with a zero-run compression.
			 * Lower case hexadecimal digits are used.
			 * 	eg, fe80::214:4fff:fe0b:76c8.
			 * The IPv4 embedded form is returned for inet_ntop,
			 * just the IPv4 string is returned for inet_ntoa6.
			 */

			/*
			 * Safely load the IPv6 address.
			 */
			dtrace_bcopy(
			    (void *)(uintptr_t)tupregs[argi].dttk_value,
			    (void *)(uintptr_t)&ip6, sizeof (struct in6_addr));

			/*
			 * Check an IPv6 string will fit in scratch.
			 */
			size = INET6_ADDRSTRLEN;
			if (!DTRACE_INSCRATCH(mstate, size)) {
				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
				regs[rd] = 0;
				break;
			}
			base = (char *)mstate->dtms_scratch_ptr;
			end = (char *)mstate->dtms_scratch_ptr + size - 1;
			*end-- = '\0';

			/*
			 * Find the longest run of 16 bit zero values
			 * for the single allowed zero compression - "::".
			 */
			firstzero = -1;
			tryzero = -1;
			numzero = 1;
			for (i = 0; i < sizeof (struct in6_addr); i++) {
#if defined(sun)
				if (ip6._S6_un._S6_u8[i] == 0 &&
#else
				if (ip6.__u6_addr.__u6_addr8[i] == 0 &&
#endif
				    tryzero == -1 && i % 2 == 0) {
					tryzero = i;
					continue;
				}

				if (tryzero != -1 &&
#if defined(sun)
				    (ip6._S6_un._S6_u8[i] != 0 ||
#else
				    (ip6.__u6_addr.__u6_addr8[i] != 0 ||
#endif
				    i == sizeof (struct in6_addr) - 1)) {

					if (i - tryzero <= numzero) {
						tryzero = -1;
						continue;
					}

					firstzero = tryzero;
					numzero = i - i % 2 - tryzero;
					tryzero = -1;

#if defined(sun)
					if (ip6._S6_un._S6_u8[i] == 0 &&
#else
					if (ip6.__u6_addr.__u6_addr8[i] == 0 &&
#endif
					    i == sizeof (struct in6_addr) - 1)
						numzero += 2;
				}
			}
			ASSERT(firstzero + numzero <= sizeof (struct in6_addr));

			/*
			 * Check for an IPv4 embedded address.
			 */
			v6end = sizeof (struct in6_addr) - 2;
			if (IN6_IS_ADDR_V4MAPPED(&ip6) ||
			    IN6_IS_ADDR_V4COMPAT(&ip6)) {
				for (i = sizeof (struct in6_addr) - 1;
				    i >= DTRACE_V4MAPPED_OFFSET; i--) {
					ASSERT(end >= base);

#if defined(sun)
					val = ip6._S6_un._S6_u8[i];
#else
					val = ip6.__u6_addr.__u6_addr8[i];
#endif

					if (val == 0) {
						*end-- = '0';
					} else {
						for (; val; val /= 10) {
							*end-- = '0' + val % 10;
						}
					}

					if (i > DTRACE_V4MAPPED_OFFSET)
						*end-- = '.';
				}

				if (subr == DIF_SUBR_INET_NTOA6)
					goto inetout;

				/*
				 * Set v6end to skip the IPv4 address that
				 * we have already stringified.
				 */
				v6end = 10;
			}

			/*
			 * Build the IPv6 string by working through the
			 * address in reverse.
			 */
			for (i = v6end; i >= 0; i -= 2) {
				ASSERT(end >= base);

				if (i == firstzero + numzero - 2) {
					*end-- = ':';
					*end-- = ':';
					i -= numzero - 2;
					continue;
				}

				if (i < 14 && i != firstzero - 2)
					*end-- = ':';

#if defined(sun)
				val = (ip6._S6_un._S6_u8[i] << 8) +
				    ip6._S6_un._S6_u8[i + 1];
#else
				val = (ip6.__u6_addr.__u6_addr8[i] << 8) +
				    ip6.__u6_addr.__u6_addr8[i + 1];
#endif

				if (val == 0) {
					*end-- = '0';
				} else {
					for (; val; val /= 16) {
						*end-- = digits[val % 16];
					}
				}
			}
			ASSERT(end + 1 >= base);

		} else {
			/*
			 * The user didn't use AH_INET or AH_INET6.
			 */
			DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
			regs[rd] = 0;
			break;
		}

inetout:	regs[rd] = (uintptr_t)end + 1;
		mstate->dtms_scratch_ptr += size;
		break;
	}
5805
5806	case DIF_SUBR_MEMREF: {
5807		uintptr_t size = 2 * sizeof(uintptr_t);
5808		uintptr_t *memref = (uintptr_t *) P2ROUNDUP(mstate->dtms_scratch_ptr, sizeof(uintptr_t));
5809		size_t scratch_size = ((uintptr_t) memref - mstate->dtms_scratch_ptr) + size;
5810
5811		/* address and length */
5812		memref[0] = tupregs[0].dttk_value;
5813		memref[1] = tupregs[1].dttk_value;
5814
5815		regs[rd] = (uintptr_t) memref;
5816		mstate->dtms_scratch_ptr += scratch_size;
5817		break;
5818	}
5819
#if !defined(sun)
	case DIF_SUBR_MEMSTR: {
		/*
		 * memstr(addr, c, len) (FreeBSD-only): copy len - 1 bytes
		 * from addr into scratch, replacing each embedded NUL with
		 * the character c, and NUL-terminate the result -- turning
		 * a packed sequence of strings (e.g. an argv image) into a
		 * single printable string.
		 */
		char *str = (char *)mstate->dtms_scratch_ptr;
		uintptr_t mem = tupregs[0].dttk_value;
		char c = tupregs[1].dttk_value;
		size_t size = tupregs[2].dttk_value;
		uint8_t n;
		int i;

		/* Default to failure; only set on full success below. */
		regs[rd] = 0;

		if (size == 0)
			break;

		if (!dtrace_canload(mem, size - 1, mstate, vstate))
			break;

		if (!DTRACE_INSCRATCH(mstate, size)) {
			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
			break;
		}

		/* Enforce the administrative cap on memstr() sizes. */
		if (dtrace_memstr_max != 0 && size > dtrace_memstr_max) {
			*flags |= CPU_DTRACE_ILLOP;
			break;
		}

		for (i = 0; i < size - 1; i++) {
			n = dtrace_load8(mem++);
			str[i] = (n == 0) ? c : n;
		}
		str[size - 1] = 0;

		regs[rd] = (uintptr_t)str;
		mstate->dtms_scratch_ptr += size;
		break;
	}
#endif
5858
5859	case DIF_SUBR_TYPEREF: {
5860		uintptr_t size = 4 * sizeof(uintptr_t);
5861		uintptr_t *typeref = (uintptr_t *) P2ROUNDUP(mstate->dtms_scratch_ptr, sizeof(uintptr_t));
5862		size_t scratch_size = ((uintptr_t) typeref - mstate->dtms_scratch_ptr) + size;
5863
5864		/* address, num_elements, type_str, type_len */
5865		typeref[0] = tupregs[0].dttk_value;
5866		typeref[1] = tupregs[1].dttk_value;
5867		typeref[2] = tupregs[2].dttk_value;
5868		typeref[3] = tupregs[3].dttk_value;
5869
5870		regs[rd] = (uintptr_t) typeref;
5871		mstate->dtms_scratch_ptr += scratch_size;
5872		break;
5873	}
5874	}
5875}
5876
5877/*
5878 * Emulate the execution of DTrace IR instructions specified by the given
5879 * DIF object.  This function is deliberately void of assertions as all of
5880 * the necessary checks are handled by a call to dtrace_difo_validate().
5881 */
5882static uint64_t
5883dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate,
5884    dtrace_vstate_t *vstate, dtrace_state_t *state)
5885{
5886	const dif_instr_t *text = difo->dtdo_buf;
5887	const uint_t textlen = difo->dtdo_len;
5888	const char *strtab = difo->dtdo_strtab;
5889	const uint64_t *inttab = difo->dtdo_inttab;
5890
5891	uint64_t rval = 0;
5892	dtrace_statvar_t *svar;
5893	dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
5894	dtrace_difv_t *v;
5895	volatile uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags;
5896	volatile uintptr_t *illval = &cpu_core[curcpu].cpuc_dtrace_illval;
5897
5898	dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
5899	uint64_t regs[DIF_DIR_NREGS];
5900	uint64_t *tmp;
5901
5902	uint8_t cc_n = 0, cc_z = 0, cc_v = 0, cc_c = 0;
5903	int64_t cc_r;
5904	uint_t pc = 0, id, opc = 0;
5905	uint8_t ttop = 0;
5906	dif_instr_t instr;
5907	uint_t r1, r2, rd;
5908
5909	/*
5910	 * We stash the current DIF object into the machine state: we need it
5911	 * for subsequent access checking.
5912	 */
5913	mstate->dtms_difo = difo;
5914
5915	regs[DIF_REG_R0] = 0; 		/* %r0 is fixed at zero */
5916
5917	while (pc < textlen && !(*flags & CPU_DTRACE_FAULT)) {
5918		opc = pc;
5919
5920		instr = text[pc++];
5921		r1 = DIF_INSTR_R1(instr);
5922		r2 = DIF_INSTR_R2(instr);
5923		rd = DIF_INSTR_RD(instr);
5924
5925		switch (DIF_INSTR_OP(instr)) {
5926		case DIF_OP_OR:
5927			regs[rd] = regs[r1] | regs[r2];
5928			break;
5929		case DIF_OP_XOR:
5930			regs[rd] = regs[r1] ^ regs[r2];
5931			break;
5932		case DIF_OP_AND:
5933			regs[rd] = regs[r1] & regs[r2];
5934			break;
5935		case DIF_OP_SLL:
5936			regs[rd] = regs[r1] << regs[r2];
5937			break;
5938		case DIF_OP_SRL:
5939			regs[rd] = regs[r1] >> regs[r2];
5940			break;
5941		case DIF_OP_SUB:
5942			regs[rd] = regs[r1] - regs[r2];
5943			break;
5944		case DIF_OP_ADD:
5945			regs[rd] = regs[r1] + regs[r2];
5946			break;
5947		case DIF_OP_MUL:
5948			regs[rd] = regs[r1] * regs[r2];
5949			break;
5950		case DIF_OP_SDIV:
5951			if (regs[r2] == 0) {
5952				regs[rd] = 0;
5953				*flags |= CPU_DTRACE_DIVZERO;
5954			} else {
5955				regs[rd] = (int64_t)regs[r1] /
5956				    (int64_t)regs[r2];
5957			}
5958			break;
5959
5960		case DIF_OP_UDIV:
5961			if (regs[r2] == 0) {
5962				regs[rd] = 0;
5963				*flags |= CPU_DTRACE_DIVZERO;
5964			} else {
5965				regs[rd] = regs[r1] / regs[r2];
5966			}
5967			break;
5968
5969		case DIF_OP_SREM:
5970			if (regs[r2] == 0) {
5971				regs[rd] = 0;
5972				*flags |= CPU_DTRACE_DIVZERO;
5973			} else {
5974				regs[rd] = (int64_t)regs[r1] %
5975				    (int64_t)regs[r2];
5976			}
5977			break;
5978
5979		case DIF_OP_UREM:
5980			if (regs[r2] == 0) {
5981				regs[rd] = 0;
5982				*flags |= CPU_DTRACE_DIVZERO;
5983			} else {
5984				regs[rd] = regs[r1] % regs[r2];
5985			}
5986			break;
5987
5988		case DIF_OP_NOT:
5989			regs[rd] = ~regs[r1];
5990			break;
5991		case DIF_OP_MOV:
5992			regs[rd] = regs[r1];
5993			break;
5994		case DIF_OP_CMP:
5995			cc_r = regs[r1] - regs[r2];
5996			cc_n = cc_r < 0;
5997			cc_z = cc_r == 0;
5998			cc_v = 0;
5999			cc_c = regs[r1] < regs[r2];
6000			break;
6001		case DIF_OP_TST:
6002			cc_n = cc_v = cc_c = 0;
6003			cc_z = regs[r1] == 0;
6004			break;
6005		case DIF_OP_BA:
6006			pc = DIF_INSTR_LABEL(instr);
6007			break;
6008		case DIF_OP_BE:
6009			if (cc_z)
6010				pc = DIF_INSTR_LABEL(instr);
6011			break;
6012		case DIF_OP_BNE:
6013			if (cc_z == 0)
6014				pc = DIF_INSTR_LABEL(instr);
6015			break;
6016		case DIF_OP_BG:
6017			if ((cc_z | (cc_n ^ cc_v)) == 0)
6018				pc = DIF_INSTR_LABEL(instr);
6019			break;
6020		case DIF_OP_BGU:
6021			if ((cc_c | cc_z) == 0)
6022				pc = DIF_INSTR_LABEL(instr);
6023			break;
6024		case DIF_OP_BGE:
6025			if ((cc_n ^ cc_v) == 0)
6026				pc = DIF_INSTR_LABEL(instr);
6027			break;
6028		case DIF_OP_BGEU:
6029			if (cc_c == 0)
6030				pc = DIF_INSTR_LABEL(instr);
6031			break;
6032		case DIF_OP_BL:
6033			if (cc_n ^ cc_v)
6034				pc = DIF_INSTR_LABEL(instr);
6035			break;
6036		case DIF_OP_BLU:
6037			if (cc_c)
6038				pc = DIF_INSTR_LABEL(instr);
6039			break;
6040		case DIF_OP_BLE:
6041			if (cc_z | (cc_n ^ cc_v))
6042				pc = DIF_INSTR_LABEL(instr);
6043			break;
6044		case DIF_OP_BLEU:
6045			if (cc_c | cc_z)
6046				pc = DIF_INSTR_LABEL(instr);
6047			break;
6048		case DIF_OP_RLDSB:
6049			if (!dtrace_canload(regs[r1], 1, mstate, vstate))
6050				break;
6051			/*FALLTHROUGH*/
6052		case DIF_OP_LDSB:
6053			regs[rd] = (int8_t)dtrace_load8(regs[r1]);
6054			break;
6055		case DIF_OP_RLDSH:
6056			if (!dtrace_canload(regs[r1], 2, mstate, vstate))
6057				break;
6058			/*FALLTHROUGH*/
6059		case DIF_OP_LDSH:
6060			regs[rd] = (int16_t)dtrace_load16(regs[r1]);
6061			break;
6062		case DIF_OP_RLDSW:
6063			if (!dtrace_canload(regs[r1], 4, mstate, vstate))
6064				break;
6065			/*FALLTHROUGH*/
6066		case DIF_OP_LDSW:
6067			regs[rd] = (int32_t)dtrace_load32(regs[r1]);
6068			break;
6069		case DIF_OP_RLDUB:
6070			if (!dtrace_canload(regs[r1], 1, mstate, vstate))
6071				break;
6072			/*FALLTHROUGH*/
6073		case DIF_OP_LDUB:
6074			regs[rd] = dtrace_load8(regs[r1]);
6075			break;
6076		case DIF_OP_RLDUH:
6077			if (!dtrace_canload(regs[r1], 2, mstate, vstate))
6078				break;
6079			/*FALLTHROUGH*/
6080		case DIF_OP_LDUH:
6081			regs[rd] = dtrace_load16(regs[r1]);
6082			break;
6083		case DIF_OP_RLDUW:
6084			if (!dtrace_canload(regs[r1], 4, mstate, vstate))
6085				break;
6086			/*FALLTHROUGH*/
6087		case DIF_OP_LDUW:
6088			regs[rd] = dtrace_load32(regs[r1]);
6089			break;
6090		case DIF_OP_RLDX:
6091			if (!dtrace_canload(regs[r1], 8, mstate, vstate))
6092				break;
6093			/*FALLTHROUGH*/
6094		case DIF_OP_LDX:
6095			regs[rd] = dtrace_load64(regs[r1]);
6096			break;
6097		case DIF_OP_ULDSB:
6098			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6099			regs[rd] = (int8_t)
6100			    dtrace_fuword8((void *)(uintptr_t)regs[r1]);
6101			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6102			break;
6103		case DIF_OP_ULDSH:
6104			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6105			regs[rd] = (int16_t)
6106			    dtrace_fuword16((void *)(uintptr_t)regs[r1]);
6107			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6108			break;
6109		case DIF_OP_ULDSW:
6110			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6111			regs[rd] = (int32_t)
6112			    dtrace_fuword32((void *)(uintptr_t)regs[r1]);
6113			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6114			break;
6115		case DIF_OP_ULDUB:
6116			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6117			regs[rd] =
6118			    dtrace_fuword8((void *)(uintptr_t)regs[r1]);
6119			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6120			break;
6121		case DIF_OP_ULDUH:
6122			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6123			regs[rd] =
6124			    dtrace_fuword16((void *)(uintptr_t)regs[r1]);
6125			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6126			break;
6127		case DIF_OP_ULDUW:
6128			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6129			regs[rd] =
6130			    dtrace_fuword32((void *)(uintptr_t)regs[r1]);
6131			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6132			break;
6133		case DIF_OP_ULDX:
6134			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6135			regs[rd] =
6136			    dtrace_fuword64((void *)(uintptr_t)regs[r1]);
6137			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6138			break;
6139		case DIF_OP_RET:
6140			rval = regs[rd];
6141			pc = textlen;
6142			break;
6143		case DIF_OP_NOP:
6144			break;
6145		case DIF_OP_SETX:
6146			regs[rd] = inttab[DIF_INSTR_INTEGER(instr)];
6147			break;
6148		case DIF_OP_SETS:
6149			regs[rd] = (uint64_t)(uintptr_t)
6150			    (strtab + DIF_INSTR_STRING(instr));
6151			break;
6152		case DIF_OP_SCMP: {
6153			size_t sz = state->dts_options[DTRACEOPT_STRSIZE];
6154			uintptr_t s1 = regs[r1];
6155			uintptr_t s2 = regs[r2];
6156
6157			if (s1 != 0 &&
6158			    !dtrace_strcanload(s1, sz, mstate, vstate))
6159				break;
6160			if (s2 != 0 &&
6161			    !dtrace_strcanload(s2, sz, mstate, vstate))
6162				break;
6163
6164			cc_r = dtrace_strncmp((char *)s1, (char *)s2, sz);
6165
6166			cc_n = cc_r < 0;
6167			cc_z = cc_r == 0;
6168			cc_v = cc_c = 0;
6169			break;
6170		}
6171		case DIF_OP_LDGA:
6172			regs[rd] = dtrace_dif_variable(mstate, state,
6173			    r1, regs[r2]);
6174			break;
6175		case DIF_OP_LDGS:
6176			id = DIF_INSTR_VAR(instr);
6177
6178			if (id >= DIF_VAR_OTHER_UBASE) {
6179				uintptr_t a;
6180
6181				id -= DIF_VAR_OTHER_UBASE;
6182				svar = vstate->dtvs_globals[id];
6183				ASSERT(svar != NULL);
6184				v = &svar->dtsv_var;
6185
6186				if (!(v->dtdv_type.dtdt_flags & DIF_TF_BYREF)) {
6187					regs[rd] = svar->dtsv_data;
6188					break;
6189				}
6190
6191				a = (uintptr_t)svar->dtsv_data;
6192
6193				if (*(uint8_t *)a == UINT8_MAX) {
6194					/*
6195					 * If the 0th byte is set to UINT8_MAX
6196					 * then this is to be treated as a
6197					 * reference to a NULL variable.
6198					 */
6199					regs[rd] = 0;
6200				} else {
6201					regs[rd] = a + sizeof (uint64_t);
6202				}
6203
6204				break;
6205			}
6206
6207			regs[rd] = dtrace_dif_variable(mstate, state, id, 0);
6208			break;
6209
6210		case DIF_OP_STGS:
6211			id = DIF_INSTR_VAR(instr);
6212
6213			ASSERT(id >= DIF_VAR_OTHER_UBASE);
6214			id -= DIF_VAR_OTHER_UBASE;
6215
6216			svar = vstate->dtvs_globals[id];
6217			ASSERT(svar != NULL);
6218			v = &svar->dtsv_var;
6219
6220			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6221				uintptr_t a = (uintptr_t)svar->dtsv_data;
6222
6223				ASSERT(a != 0);
6224				ASSERT(svar->dtsv_size != 0);
6225
6226				if (regs[rd] == 0) {
6227					*(uint8_t *)a = UINT8_MAX;
6228					break;
6229				} else {
6230					*(uint8_t *)a = 0;
6231					a += sizeof (uint64_t);
6232				}
6233				if (!dtrace_vcanload(
6234				    (void *)(uintptr_t)regs[rd], &v->dtdv_type,
6235				    mstate, vstate))
6236					break;
6237
6238				dtrace_vcopy((void *)(uintptr_t)regs[rd],
6239				    (void *)a, &v->dtdv_type);
6240				break;
6241			}
6242
6243			svar->dtsv_data = regs[rd];
6244			break;
6245
6246		case DIF_OP_LDTA:
6247			/*
6248			 * There are no DTrace built-in thread-local arrays at
6249			 * present.  This opcode is saved for future work.
6250			 */
6251			*flags |= CPU_DTRACE_ILLOP;
6252			regs[rd] = 0;
6253			break;
6254
6255		case DIF_OP_LDLS:
6256			id = DIF_INSTR_VAR(instr);
6257
6258			if (id < DIF_VAR_OTHER_UBASE) {
6259				/*
6260				 * For now, this has no meaning.
6261				 */
6262				regs[rd] = 0;
6263				break;
6264			}
6265
6266			id -= DIF_VAR_OTHER_UBASE;
6267
6268			ASSERT(id < vstate->dtvs_nlocals);
6269			ASSERT(vstate->dtvs_locals != NULL);
6270
6271			svar = vstate->dtvs_locals[id];
6272			ASSERT(svar != NULL);
6273			v = &svar->dtsv_var;
6274
6275			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6276				uintptr_t a = (uintptr_t)svar->dtsv_data;
6277				size_t sz = v->dtdv_type.dtdt_size;
6278
6279				sz += sizeof (uint64_t);
6280				ASSERT(svar->dtsv_size == NCPU * sz);
6281				a += curcpu * sz;
6282
6283				if (*(uint8_t *)a == UINT8_MAX) {
6284					/*
6285					 * If the 0th byte is set to UINT8_MAX
6286					 * then this is to be treated as a
6287					 * reference to a NULL variable.
6288					 */
6289					regs[rd] = 0;
6290				} else {
6291					regs[rd] = a + sizeof (uint64_t);
6292				}
6293
6294				break;
6295			}
6296
6297			ASSERT(svar->dtsv_size == NCPU * sizeof (uint64_t));
6298			tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
6299			regs[rd] = tmp[curcpu];
6300			break;
6301
6302		case DIF_OP_STLS:
6303			id = DIF_INSTR_VAR(instr);
6304
6305			ASSERT(id >= DIF_VAR_OTHER_UBASE);
6306			id -= DIF_VAR_OTHER_UBASE;
6307			ASSERT(id < vstate->dtvs_nlocals);
6308
6309			ASSERT(vstate->dtvs_locals != NULL);
6310			svar = vstate->dtvs_locals[id];
6311			ASSERT(svar != NULL);
6312			v = &svar->dtsv_var;
6313
6314			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6315				uintptr_t a = (uintptr_t)svar->dtsv_data;
6316				size_t sz = v->dtdv_type.dtdt_size;
6317
6318				sz += sizeof (uint64_t);
6319				ASSERT(svar->dtsv_size == NCPU * sz);
6320				a += curcpu * sz;
6321
6322				if (regs[rd] == 0) {
6323					*(uint8_t *)a = UINT8_MAX;
6324					break;
6325				} else {
6326					*(uint8_t *)a = 0;
6327					a += sizeof (uint64_t);
6328				}
6329
6330				if (!dtrace_vcanload(
6331				    (void *)(uintptr_t)regs[rd], &v->dtdv_type,
6332				    mstate, vstate))
6333					break;
6334
6335				dtrace_vcopy((void *)(uintptr_t)regs[rd],
6336				    (void *)a, &v->dtdv_type);
6337				break;
6338			}
6339
6340			ASSERT(svar->dtsv_size == NCPU * sizeof (uint64_t));
6341			tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
6342			tmp[curcpu] = regs[rd];
6343			break;
6344
6345		case DIF_OP_LDTS: {
6346			dtrace_dynvar_t *dvar;
6347			dtrace_key_t *key;
6348
6349			id = DIF_INSTR_VAR(instr);
6350			ASSERT(id >= DIF_VAR_OTHER_UBASE);
6351			id -= DIF_VAR_OTHER_UBASE;
6352			v = &vstate->dtvs_tlocals[id];
6353
6354			key = &tupregs[DIF_DTR_NREGS];
6355			key[0].dttk_value = (uint64_t)id;
6356			key[0].dttk_size = 0;
6357			DTRACE_TLS_THRKEY(key[1].dttk_value);
6358			key[1].dttk_size = 0;
6359
6360			dvar = dtrace_dynvar(dstate, 2, key,
6361			    sizeof (uint64_t), DTRACE_DYNVAR_NOALLOC,
6362			    mstate, vstate);
6363
6364			if (dvar == NULL) {
6365				regs[rd] = 0;
6366				break;
6367			}
6368
6369			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6370				regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
6371			} else {
6372				regs[rd] = *((uint64_t *)dvar->dtdv_data);
6373			}
6374
6375			break;
6376		}
6377
6378		case DIF_OP_STTS: {
6379			dtrace_dynvar_t *dvar;
6380			dtrace_key_t *key;
6381
6382			id = DIF_INSTR_VAR(instr);
6383			ASSERT(id >= DIF_VAR_OTHER_UBASE);
6384			id -= DIF_VAR_OTHER_UBASE;
6385
6386			key = &tupregs[DIF_DTR_NREGS];
6387			key[0].dttk_value = (uint64_t)id;
6388			key[0].dttk_size = 0;
6389			DTRACE_TLS_THRKEY(key[1].dttk_value);
6390			key[1].dttk_size = 0;
6391			v = &vstate->dtvs_tlocals[id];
6392
6393			dvar = dtrace_dynvar(dstate, 2, key,
6394			    v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
6395			    v->dtdv_type.dtdt_size : sizeof (uint64_t),
6396			    regs[rd] ? DTRACE_DYNVAR_ALLOC :
6397			    DTRACE_DYNVAR_DEALLOC, mstate, vstate);
6398
6399			/*
6400			 * Given that we're storing to thread-local data,
6401			 * we need to flush our predicate cache.
6402			 */
6403			curthread->t_predcache = 0;
6404
6405			if (dvar == NULL)
6406				break;
6407
6408			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6409				if (!dtrace_vcanload(
6410				    (void *)(uintptr_t)regs[rd],
6411				    &v->dtdv_type, mstate, vstate))
6412					break;
6413
6414				dtrace_vcopy((void *)(uintptr_t)regs[rd],
6415				    dvar->dtdv_data, &v->dtdv_type);
6416			} else {
6417				*((uint64_t *)dvar->dtdv_data) = regs[rd];
6418			}
6419
6420			break;
6421		}
6422
6423		case DIF_OP_SRA:
6424			regs[rd] = (int64_t)regs[r1] >> regs[r2];
6425			break;
6426
6427		case DIF_OP_CALL:
6428			dtrace_dif_subr(DIF_INSTR_SUBR(instr), rd,
6429			    regs, tupregs, ttop, mstate, state);
6430			break;
6431
6432		case DIF_OP_PUSHTR:
6433			if (ttop == DIF_DTR_NREGS) {
6434				*flags |= CPU_DTRACE_TUPOFLOW;
6435				break;
6436			}
6437
6438			if (r1 == DIF_TYPE_STRING) {
6439				/*
6440				 * If this is a string type and the size is 0,
6441				 * we'll use the system-wide default string
6442				 * size.  Note that we are _not_ looking at
6443				 * the value of the DTRACEOPT_STRSIZE option;
6444				 * had this been set, we would expect to have
6445				 * a non-zero size value in the "pushtr".
6446				 */
6447				tupregs[ttop].dttk_size =
6448				    dtrace_strlen((char *)(uintptr_t)regs[rd],
6449				    regs[r2] ? regs[r2] :
6450				    dtrace_strsize_default) + 1;
6451			} else {
6452				tupregs[ttop].dttk_size = regs[r2];
6453			}
6454
6455			tupregs[ttop++].dttk_value = regs[rd];
6456			break;
6457
6458		case DIF_OP_PUSHTV:
6459			if (ttop == DIF_DTR_NREGS) {
6460				*flags |= CPU_DTRACE_TUPOFLOW;
6461				break;
6462			}
6463
6464			tupregs[ttop].dttk_value = regs[rd];
6465			tupregs[ttop++].dttk_size = 0;
6466			break;
6467
6468		case DIF_OP_POPTS:
6469			if (ttop != 0)
6470				ttop--;
6471			break;
6472
6473		case DIF_OP_FLUSHTS:
6474			ttop = 0;
6475			break;
6476
6477		case DIF_OP_LDGAA:
6478		case DIF_OP_LDTAA: {
6479			dtrace_dynvar_t *dvar;
6480			dtrace_key_t *key = tupregs;
6481			uint_t nkeys = ttop;
6482
6483			id = DIF_INSTR_VAR(instr);
6484			ASSERT(id >= DIF_VAR_OTHER_UBASE);
6485			id -= DIF_VAR_OTHER_UBASE;
6486
6487			key[nkeys].dttk_value = (uint64_t)id;
6488			key[nkeys++].dttk_size = 0;
6489
6490			if (DIF_INSTR_OP(instr) == DIF_OP_LDTAA) {
6491				DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
6492				key[nkeys++].dttk_size = 0;
6493				v = &vstate->dtvs_tlocals[id];
6494			} else {
6495				v = &vstate->dtvs_globals[id]->dtsv_var;
6496			}
6497
6498			dvar = dtrace_dynvar(dstate, nkeys, key,
6499			    v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
6500			    v->dtdv_type.dtdt_size : sizeof (uint64_t),
6501			    DTRACE_DYNVAR_NOALLOC, mstate, vstate);
6502
6503			if (dvar == NULL) {
6504				regs[rd] = 0;
6505				break;
6506			}
6507
6508			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6509				regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
6510			} else {
6511				regs[rd] = *((uint64_t *)dvar->dtdv_data);
6512			}
6513
6514			break;
6515		}
6516
6517		case DIF_OP_STGAA:
6518		case DIF_OP_STTAA: {
6519			dtrace_dynvar_t *dvar;
6520			dtrace_key_t *key = tupregs;
6521			uint_t nkeys = ttop;
6522
6523			id = DIF_INSTR_VAR(instr);
6524			ASSERT(id >= DIF_VAR_OTHER_UBASE);
6525			id -= DIF_VAR_OTHER_UBASE;
6526
6527			key[nkeys].dttk_value = (uint64_t)id;
6528			key[nkeys++].dttk_size = 0;
6529
6530			if (DIF_INSTR_OP(instr) == DIF_OP_STTAA) {
6531				DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
6532				key[nkeys++].dttk_size = 0;
6533				v = &vstate->dtvs_tlocals[id];
6534			} else {
6535				v = &vstate->dtvs_globals[id]->dtsv_var;
6536			}
6537
6538			dvar = dtrace_dynvar(dstate, nkeys, key,
6539			    v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
6540			    v->dtdv_type.dtdt_size : sizeof (uint64_t),
6541			    regs[rd] ? DTRACE_DYNVAR_ALLOC :
6542			    DTRACE_DYNVAR_DEALLOC, mstate, vstate);
6543
6544			if (dvar == NULL)
6545				break;
6546
6547			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6548				if (!dtrace_vcanload(
6549				    (void *)(uintptr_t)regs[rd], &v->dtdv_type,
6550				    mstate, vstate))
6551					break;
6552
6553				dtrace_vcopy((void *)(uintptr_t)regs[rd],
6554				    dvar->dtdv_data, &v->dtdv_type);
6555			} else {
6556				*((uint64_t *)dvar->dtdv_data) = regs[rd];
6557			}
6558
6559			break;
6560		}
6561
6562		case DIF_OP_ALLOCS: {
6563			uintptr_t ptr = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
6564			size_t size = ptr - mstate->dtms_scratch_ptr + regs[r1];
6565
6566			/*
6567			 * Rounding up the user allocation size could have
6568			 * overflowed large, bogus allocations (like -1ULL) to
6569			 * 0.
6570			 */
6571			if (size < regs[r1] ||
6572			    !DTRACE_INSCRATCH(mstate, size)) {
6573				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
6574				regs[rd] = 0;
6575				break;
6576			}
6577
6578			dtrace_bzero((void *) mstate->dtms_scratch_ptr, size);
6579			mstate->dtms_scratch_ptr += size;
6580			regs[rd] = ptr;
6581			break;
6582		}
6583
6584		case DIF_OP_COPYS:
6585			if (!dtrace_canstore(regs[rd], regs[r2],
6586			    mstate, vstate)) {
6587				*flags |= CPU_DTRACE_BADADDR;
6588				*illval = regs[rd];
6589				break;
6590			}
6591
6592			if (!dtrace_canload(regs[r1], regs[r2], mstate, vstate))
6593				break;
6594
6595			dtrace_bcopy((void *)(uintptr_t)regs[r1],
6596			    (void *)(uintptr_t)regs[rd], (size_t)regs[r2]);
6597			break;
6598
6599		case DIF_OP_STB:
6600			if (!dtrace_canstore(regs[rd], 1, mstate, vstate)) {
6601				*flags |= CPU_DTRACE_BADADDR;
6602				*illval = regs[rd];
6603				break;
6604			}
6605			*((uint8_t *)(uintptr_t)regs[rd]) = (uint8_t)regs[r1];
6606			break;
6607
6608		case DIF_OP_STH:
6609			if (!dtrace_canstore(regs[rd], 2, mstate, vstate)) {
6610				*flags |= CPU_DTRACE_BADADDR;
6611				*illval = regs[rd];
6612				break;
6613			}
6614			if (regs[rd] & 1) {
6615				*flags |= CPU_DTRACE_BADALIGN;
6616				*illval = regs[rd];
6617				break;
6618			}
6619			*((uint16_t *)(uintptr_t)regs[rd]) = (uint16_t)regs[r1];
6620			break;
6621
6622		case DIF_OP_STW:
6623			if (!dtrace_canstore(regs[rd], 4, mstate, vstate)) {
6624				*flags |= CPU_DTRACE_BADADDR;
6625				*illval = regs[rd];
6626				break;
6627			}
6628			if (regs[rd] & 3) {
6629				*flags |= CPU_DTRACE_BADALIGN;
6630				*illval = regs[rd];
6631				break;
6632			}
6633			*((uint32_t *)(uintptr_t)regs[rd]) = (uint32_t)regs[r1];
6634			break;
6635
6636		case DIF_OP_STX:
6637			if (!dtrace_canstore(regs[rd], 8, mstate, vstate)) {
6638				*flags |= CPU_DTRACE_BADADDR;
6639				*illval = regs[rd];
6640				break;
6641			}
6642			if (regs[rd] & 7) {
6643				*flags |= CPU_DTRACE_BADALIGN;
6644				*illval = regs[rd];
6645				break;
6646			}
6647			*((uint64_t *)(uintptr_t)regs[rd]) = regs[r1];
6648			break;
6649		}
6650	}
6651
6652	if (!(*flags & CPU_DTRACE_FAULT))
6653		return (rval);
6654
6655	mstate->dtms_fltoffs = opc * sizeof (dif_instr_t);
6656	mstate->dtms_present |= DTRACE_MSTATE_FLTOFFS;
6657
6658	return (0);
6659}
6660
6661static void
6662dtrace_action_breakpoint(dtrace_ecb_t *ecb)
6663{
6664	dtrace_probe_t *probe = ecb->dte_probe;
6665	dtrace_provider_t *prov = probe->dtpr_provider;
6666	char c[DTRACE_FULLNAMELEN + 80], *str;
6667	char *msg = "dtrace: breakpoint action at probe ";
6668	char *ecbmsg = " (ecb ";
6669	uintptr_t mask = (0xf << (sizeof (uintptr_t) * NBBY / 4));
6670	uintptr_t val = (uintptr_t)ecb;
6671	int shift = (sizeof (uintptr_t) * NBBY) - 4, i = 0;
6672
6673	if (dtrace_destructive_disallow)
6674		return;
6675
6676	/*
6677	 * It's impossible to be taking action on the NULL probe.
6678	 */
6679	ASSERT(probe != NULL);
6680
6681	/*
6682	 * This is a poor man's (destitute man's?) sprintf():  we want to
6683	 * print the provider name, module name, function name and name of
6684	 * the probe, along with the hex address of the ECB with the breakpoint
6685	 * action -- all of which we must place in the character buffer by
6686	 * hand.
6687	 */
6688	while (*msg != '\0')
6689		c[i++] = *msg++;
6690
6691	for (str = prov->dtpv_name; *str != '\0'; str++)
6692		c[i++] = *str;
6693	c[i++] = ':';
6694
6695	for (str = probe->dtpr_mod; *str != '\0'; str++)
6696		c[i++] = *str;
6697	c[i++] = ':';
6698
6699	for (str = probe->dtpr_func; *str != '\0'; str++)
6700		c[i++] = *str;
6701	c[i++] = ':';
6702
6703	for (str = probe->dtpr_name; *str != '\0'; str++)
6704		c[i++] = *str;
6705
6706	while (*ecbmsg != '\0')
6707		c[i++] = *ecbmsg++;
6708
6709	while (shift >= 0) {
6710		mask = (uintptr_t)0xf << shift;
6711
6712		if (val >= ((uintptr_t)1 << shift))
6713			c[i++] = "0123456789abcdef"[(val & mask) >> shift];
6714		shift -= 4;
6715	}
6716
6717	c[i++] = ')';
6718	c[i] = '\0';
6719
6720#if defined(sun)
6721	debug_enter(c);
6722#else
6723	kdb_enter(KDB_WHY_DTRACE, "breakpoint action");
6724#endif
6725}
6726
6727static void
6728dtrace_action_panic(dtrace_ecb_t *ecb)
6729{
6730	dtrace_probe_t *probe = ecb->dte_probe;
6731
6732	/*
6733	 * It's impossible to be taking action on the NULL probe.
6734	 */
6735	ASSERT(probe != NULL);
6736
6737	if (dtrace_destructive_disallow)
6738		return;
6739
6740	if (dtrace_panicked != NULL)
6741		return;
6742
6743	if (dtrace_casptr(&dtrace_panicked, NULL, curthread) != NULL)
6744		return;
6745
6746	/*
6747	 * We won the right to panic.  (We want to be sure that only one
6748	 * thread calls panic() from dtrace_probe(), and that panic() is
6749	 * called exactly once.)
6750	 */
6751	dtrace_panic("dtrace: panic action at probe %s:%s:%s:%s (ecb %p)",
6752	    probe->dtpr_provider->dtpv_name, probe->dtpr_mod,
6753	    probe->dtpr_func, probe->dtpr_name, (void *)ecb);
6754}
6755
/*
 * Implement the raise() action: deliver signal 'sig' to the current
 * process.  Destructive; suppressed entirely when destructive actions
 * have been administratively disallowed.
 */
static void
dtrace_action_raise(uint64_t sig)
{
	if (dtrace_destructive_disallow)
		return;

	/* Reject out-of-range signal numbers as an illegal operation. */
	if (sig >= NSIG) {
		DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
		return;
	}

#if defined(sun)
	/*
	 * raise() has a queue depth of 1 -- we ignore all subsequent
	 * invocations of the raise() action.
	 */
	if (curthread->t_dtrace_sig == 0)
		curthread->t_dtrace_sig = (uint8_t)sig;

	curthread->t_sig_check = 1;
	aston(curthread);
#else
	/*
	 * On FreeBSD, post the signal immediately under the process lock
	 * rather than deferring it to the thread's return to user mode.
	 */
	struct proc *p = curproc;
	PROC_LOCK(p);
	kern_psignal(p, sig);
	PROC_UNLOCK(p);
#endif
}
6784
/*
 * Implement the stop() action: stop the current process.  Destructive;
 * suppressed entirely when destructive actions have been disallowed.
 */
static void
dtrace_action_stop(void)
{
	if (dtrace_destructive_disallow)
		return;

#if defined(sun)
	/*
	 * Record at most one pending stop and arrange for it to be taken
	 * when the thread next checks for signals (t_sig_check/aston).
	 */
	if (!curthread->t_dtrace_stop) {
		curthread->t_dtrace_stop = 1;
		curthread->t_sig_check = 1;
		aston(curthread);
	}
#else
	/* On FreeBSD, simply post SIGSTOP to the current process. */
	struct proc *p = curproc;
	PROC_LOCK(p);
	kern_psignal(p, SIGSTOP);
	PROC_UNLOCK(p);
#endif
}
6804
/*
 * Implement the chill() action: busy-wait on this CPU for 'val'
 * nanoseconds.  The total time chilled per CPU is tracked and capped
 * (dtrace_chill_max per dtrace_chill_interval) to bound the damage;
 * requests that would exceed the cap set CPU_DTRACE_ILLOP instead of
 * spinning.  Destructive; suppressed when destructive actions are
 * disallowed.
 */
static void
dtrace_action_chill(dtrace_mstate_t *mstate, hrtime_t val)
{
	hrtime_t now;
	volatile uint16_t *flags;
#if defined(sun)
	cpu_t *cpu = CPU;
#else
	cpu_t *cpu = &solaris_cpu[curcpu];
#endif

	if (dtrace_destructive_disallow)
		return;

	flags = (volatile uint16_t *)&cpu_core[curcpu].cpuc_dtrace_flags;

	now = dtrace_gethrtime();

	if (now - cpu->cpu_dtrace_chillmark > dtrace_chill_interval) {
		/*
		 * We need to advance the mark to the current time; the
		 * accumulated chill time resets with each new interval.
		 */
		cpu->cpu_dtrace_chillmark = now;
		cpu->cpu_dtrace_chilled = 0;
	}

	/*
	 * Now check to see if the requested chill time would take us over
	 * the maximum amount of time allowed in the chill interval.  (Or
	 * worse, if the calculation itself induces overflow.)
	 */
	if (cpu->cpu_dtrace_chilled + val > dtrace_chill_max ||
	    cpu->cpu_dtrace_chilled + val < cpu->cpu_dtrace_chilled) {
		*flags |= CPU_DTRACE_ILLOP;
		return;
	}

	/* Spin until the requested interval has elapsed. */
	while (dtrace_gethrtime() - now < val)
		continue;

	/*
	 * Normally, we assure that the value of the variable "timestamp" does
	 * not change within an ECB.  The presence of chill() represents an
	 * exception to this rule, however.
	 */
	mstate->dtms_present &= ~DTRACE_MSTATE_TIMESTAMP;
	cpu->cpu_dtrace_chilled += val;
}
6853
/*
 * Implement the ustack()/jstack() action in the case where string space
 * has been allocated: gather up to 'nframes' user program counters (and
 * frame pointers) and translate each PC via the USTACK helper, appending
 * the resulting strings -- total size 'strsize' -- after the PC array in
 * 'buf'.  'arg' packs both sizes (DTRACE_USTACK_NFRAMES/STRSIZE).
 * Frame pointers are staged in scratch space, which is restored on exit.
 */
static void
dtrace_action_ustack(dtrace_mstate_t *mstate, dtrace_state_t *state,
    uint64_t *buf, uint64_t arg)
{
	int nframes = DTRACE_USTACK_NFRAMES(arg);
	int strsize = DTRACE_USTACK_STRSIZE(arg);
	uint64_t *pcs = &buf[1], *fps;
	char *str = (char *)&pcs[nframes];
	int size, offs = 0, i, j;
	uintptr_t old = mstate->dtms_scratch_ptr, saved;
	uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags;
	char *sym;

	/*
	 * Should be taking a faster path if string space has not been
	 * allocated.
	 */
	ASSERT(strsize != 0);

	/*
	 * We will first allocate some temporary space for the frame pointers.
	 */
	fps = (uint64_t *)P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
	size = (uintptr_t)fps - mstate->dtms_scratch_ptr +
	    (nframes * sizeof (uint64_t));

	if (!DTRACE_INSCRATCH(mstate, size)) {
		/*
		 * Not enough room for our frame pointers -- need to indicate
		 * that we ran out of scratch space.
		 */
		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
		return;
	}

	mstate->dtms_scratch_ptr += size;
	saved = mstate->dtms_scratch_ptr;

	/*
	 * Now get a stack with both program counters and frame pointers.
	 */
	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
	dtrace_getufpstack(buf, fps, nframes + 1);
	DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);

	/*
	 * If that faulted, we're cooked.
	 */
	if (*flags & CPU_DTRACE_FAULT)
		goto out;

	/*
	 * Now we want to walk up the stack, calling the USTACK helper.  For
	 * each iteration, we restore the scratch pointer.
	 */
	for (i = 0; i < nframes; i++) {
		mstate->dtms_scratch_ptr = saved;

		if (offs >= strsize)
			break;

		sym = (char *)(uintptr_t)dtrace_helper(
		    DTRACE_HELPER_ACTION_USTACK,
		    mstate, state, pcs[i], fps[i]);

		/*
		 * If we faulted while running the helper, we're going to
		 * clear the fault and null out the corresponding string.
		 */
		if (*flags & CPU_DTRACE_FAULT) {
			*flags &= ~CPU_DTRACE_FAULT;
			str[offs++] = '\0';
			continue;
		}

		if (sym == NULL) {
			/* No translation for this frame: empty string. */
			str[offs++] = '\0';
			continue;
		}

		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);

		/*
		 * Now copy in the string that the helper returned to us.
		 */
		for (j = 0; offs + j < strsize; j++) {
			if ((str[offs + j] = sym[j]) == '\0')
				break;
		}

		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);

		offs += j + 1;
	}

	if (offs >= strsize) {
		/*
		 * If we didn't have room for all of the strings, we don't
		 * abort processing -- this needn't be a fatal error -- but we
		 * still want to increment a counter (dts_stkstroverflows) to
		 * allow this condition to be warned about.  (If this is from
		 * a jstack() action, it is easily tuned via jstackstrsize.)
		 */
		dtrace_error(&state->dts_stkstroverflows);
	}

	/* Zero-fill any remaining string space. */
	while (offs < strsize)
		str[offs++] = '\0';

out:
	/* Release our temporary scratch allocation. */
	mstate->dtms_scratch_ptr = old;
}
6966
/*
 * Copy a by-reference value into a trace buffer, byte by byte, from the
 * source address in *valp into 'tomax' starting at offset *valoffsp;
 * both the source address and the offset are advanced and written back
 * on return.  'dtkind' selects the load primitive:  DIF_TF_BYREF loads
 * via dtrace_load8(), DIF_TF_BYUREF via dtrace_fuword8() with faults
 * suppressed (aborting the copy if a fault is taken).  For string-typed
 * data, loading stops at the terminating NUL, after which zero bytes
 * are stored up to 'size' -- or, when 'intuple' is set, the store ends
 * immediately after the NUL.  Non-string data is copied until the
 * offset reaches 'end'.
 */
static void
dtrace_store_by_ref(dtrace_difo_t *dp, caddr_t tomax, size_t size,
    size_t *valoffsp, uint64_t *valp, uint64_t end, int intuple, int dtkind)
{
	volatile uint16_t *flags;
	uint64_t val = *valp;
	size_t valoffs = *valoffsp;

	flags = (volatile uint16_t *)&cpu_core[curcpu].cpuc_dtrace_flags;
	ASSERT(dtkind == DIF_TF_BYREF || dtkind == DIF_TF_BYUREF);

	/*
	 * If this is a string, we're going to only load until we find the zero
	 * byte -- after which we'll store zero bytes.
	 */
	if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) {
		char c = '\0' + 1;	/* any non-NUL value primes the loop */
		size_t s;

		for (s = 0; s < size; s++) {
			if (c != '\0' && dtkind == DIF_TF_BYREF) {
				c = dtrace_load8(val++);
			} else if (c != '\0' && dtkind == DIF_TF_BYUREF) {
				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
				c = dtrace_fuword8((void *)(uintptr_t)val++);
				DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
				if (*flags & CPU_DTRACE_FAULT)
					break;
			}

			DTRACE_STORE(uint8_t, tomax, valoffs++, c);

			/* In a tuple, stop right after the terminating NUL. */
			if (c == '\0' && intuple)
				break;
		}
	} else {
		uint8_t c;
		while (valoffs < end) {
			if (dtkind == DIF_TF_BYREF) {
				c = dtrace_load8(val++);
			} else if (dtkind == DIF_TF_BYUREF) {
				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
				c = dtrace_fuword8((void *)(uintptr_t)val++);
				DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
				if (*flags & CPU_DTRACE_FAULT)
					break;
			}

			DTRACE_STORE(uint8_t, tomax,
			    valoffs++, c);
		}
	}

	*valp = val;
	*valoffsp = valoffs;
}
7023
7024/*
7025 * If you're looking for the epicenter of DTrace, you just found it.  This
7026 * is the function called by the provider to fire a probe -- from which all
7027 * subsequent probe-context DTrace activity emanates.
7028 */
7029void
7030dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1,
7031    uintptr_t arg2, uintptr_t arg3, uintptr_t arg4)
7032{
7033	processorid_t cpuid;
7034	dtrace_icookie_t cookie;
7035	dtrace_probe_t *probe;
7036	dtrace_mstate_t mstate;
7037	dtrace_ecb_t *ecb;
7038	dtrace_action_t *act;
7039	intptr_t offs;
7040	size_t size;
7041	int vtime, onintr;
7042	volatile uint16_t *flags;
7043	hrtime_t now;
7044
7045	if (panicstr != NULL)
7046		return;
7047
7048#if defined(sun)
7049	/*
7050	 * Kick out immediately if this CPU is still being born (in which case
7051	 * curthread will be set to -1) or the current thread can't allow
7052	 * probes in its current context.
7053	 */
7054	if (((uintptr_t)curthread & 1) || (curthread->t_flag & T_DONTDTRACE))
7055		return;
7056#endif
7057
7058	cookie = dtrace_interrupt_disable();
7059	probe = dtrace_probes[id - 1];
7060	cpuid = curcpu;
7061	onintr = CPU_ON_INTR(CPU);
7062
7063	if (!onintr && probe->dtpr_predcache != DTRACE_CACHEIDNONE &&
7064	    probe->dtpr_predcache == curthread->t_predcache) {
7065		/*
7066		 * We have hit in the predicate cache; we know that
7067		 * this predicate would evaluate to be false.
7068		 */
7069		dtrace_interrupt_enable(cookie);
7070		return;
7071	}
7072
7073#if defined(sun)
7074	if (panic_quiesce) {
7075#else
7076	if (panicstr != NULL) {
7077#endif
7078		/*
7079		 * We don't trace anything if we're panicking.
7080		 */
7081		dtrace_interrupt_enable(cookie);
7082		return;
7083	}
7084
7085	now = dtrace_gethrtime();
7086	vtime = dtrace_vtime_references != 0;
7087
7088	if (vtime && curthread->t_dtrace_start)
7089		curthread->t_dtrace_vtime += now - curthread->t_dtrace_start;
7090
7091	mstate.dtms_difo = NULL;
7092	mstate.dtms_probe = probe;
7093	mstate.dtms_strtok = 0;
7094	mstate.dtms_arg[0] = arg0;
7095	mstate.dtms_arg[1] = arg1;
7096	mstate.dtms_arg[2] = arg2;
7097	mstate.dtms_arg[3] = arg3;
7098	mstate.dtms_arg[4] = arg4;
7099
7100	flags = (volatile uint16_t *)&cpu_core[cpuid].cpuc_dtrace_flags;
7101
7102	for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
7103		dtrace_predicate_t *pred = ecb->dte_predicate;
7104		dtrace_state_t *state = ecb->dte_state;
7105		dtrace_buffer_t *buf = &state->dts_buffer[cpuid];
7106		dtrace_buffer_t *aggbuf = &state->dts_aggbuffer[cpuid];
7107		dtrace_vstate_t *vstate = &state->dts_vstate;
7108		dtrace_provider_t *prov = probe->dtpr_provider;
7109		uint64_t tracememsize = 0;
7110		int committed = 0;
7111		caddr_t tomax;
7112
7113		/*
7114		 * A little subtlety with the following (seemingly innocuous)
7115		 * declaration of the automatic 'val':  by looking at the
7116		 * code, you might think that it could be declared in the
7117		 * action processing loop, below.  (That is, it's only used in
7118		 * the action processing loop.)  However, it must be declared
7119		 * out of that scope because in the case of DIF expression
7120		 * arguments to aggregating actions, one iteration of the
7121		 * action loop will use the last iteration's value.
7122		 */
7123		uint64_t val = 0;
7124
7125		mstate.dtms_present = DTRACE_MSTATE_ARGS | DTRACE_MSTATE_PROBE;
7126		mstate.dtms_getf = NULL;
7127
7128		*flags &= ~CPU_DTRACE_ERROR;
7129
7130		if (prov == dtrace_provider) {
7131			/*
7132			 * If dtrace itself is the provider of this probe,
7133			 * we're only going to continue processing the ECB if
7134			 * arg0 (the dtrace_state_t) is equal to the ECB's
7135			 * creating state.  (This prevents disjoint consumers
7136			 * from seeing one another's metaprobes.)
7137			 */
7138			if (arg0 != (uint64_t)(uintptr_t)state)
7139				continue;
7140		}
7141
7142		if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE) {
7143			/*
7144			 * We're not currently active.  If our provider isn't
7145			 * the dtrace pseudo provider, we're not interested.
7146			 */
7147			if (prov != dtrace_provider)
7148				continue;
7149
7150			/*
7151			 * Now we must further check if we are in the BEGIN
7152			 * probe.  If we are, we will only continue processing
7153			 * if we're still in WARMUP -- if one BEGIN enabling
7154			 * has invoked the exit() action, we don't want to
7155			 * evaluate subsequent BEGIN enablings.
7156			 */
7157			if (probe->dtpr_id == dtrace_probeid_begin &&
7158			    state->dts_activity != DTRACE_ACTIVITY_WARMUP) {
7159				ASSERT(state->dts_activity ==
7160				    DTRACE_ACTIVITY_DRAINING);
7161				continue;
7162			}
7163		}
7164
7165		if (ecb->dte_cond) {
7166			/*
7167			 * If the dte_cond bits indicate that this
7168			 * consumer is only allowed to see user-mode firings
7169			 * of this probe, call the provider's dtps_usermode()
7170			 * entry point to check that the probe was fired
7171			 * while in a user context. Skip this ECB if that's
7172			 * not the case.
7173			 */
7174			if ((ecb->dte_cond & DTRACE_COND_USERMODE) &&
7175			    prov->dtpv_pops.dtps_usermode(prov->dtpv_arg,
7176			    probe->dtpr_id, probe->dtpr_arg) == 0)
7177				continue;
7178
7179#if defined(sun)
7180			/*
7181			 * This is more subtle than it looks. We have to be
7182			 * absolutely certain that CRED() isn't going to
7183			 * change out from under us so it's only legit to
7184			 * examine that structure if we're in constrained
			 * situations. Currently, the only times we'll do this
7186			 * check is if a non-super-user has enabled the
7187			 * profile or syscall providers -- providers that
7188			 * allow visibility of all processes. For the
7189			 * profile case, the check above will ensure that
7190			 * we're examining a user context.
7191			 */
7192			if (ecb->dte_cond & DTRACE_COND_OWNER) {
7193				cred_t *cr;
7194				cred_t *s_cr =
7195				    ecb->dte_state->dts_cred.dcr_cred;
7196				proc_t *proc;
7197
7198				ASSERT(s_cr != NULL);
7199
7200				if ((cr = CRED()) == NULL ||
7201				    s_cr->cr_uid != cr->cr_uid ||
7202				    s_cr->cr_uid != cr->cr_ruid ||
7203				    s_cr->cr_uid != cr->cr_suid ||
7204				    s_cr->cr_gid != cr->cr_gid ||
7205				    s_cr->cr_gid != cr->cr_rgid ||
7206				    s_cr->cr_gid != cr->cr_sgid ||
7207				    (proc = ttoproc(curthread)) == NULL ||
7208				    (proc->p_flag & SNOCD))
7209					continue;
7210			}
7211
7212			if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) {
7213				cred_t *cr;
7214				cred_t *s_cr =
7215				    ecb->dte_state->dts_cred.dcr_cred;
7216
7217				ASSERT(s_cr != NULL);
7218
7219				if ((cr = CRED()) == NULL ||
7220				    s_cr->cr_zone->zone_id !=
7221				    cr->cr_zone->zone_id)
7222					continue;
7223			}
7224#endif
7225		}
7226
7227		if (now - state->dts_alive > dtrace_deadman_timeout) {
7228			/*
7229			 * We seem to be dead.  Unless we (a) have kernel
7230			 * destructive permissions (b) have explicitly enabled
7231			 * destructive actions and (c) destructive actions have
7232			 * not been disabled, we're going to transition into
7233			 * the KILLED state, from which no further processing
7234			 * on this state will be performed.
7235			 */
7236			if (!dtrace_priv_kernel_destructive(state) ||
7237			    !state->dts_cred.dcr_destructive ||
7238			    dtrace_destructive_disallow) {
7239				void *activity = &state->dts_activity;
7240				dtrace_activity_t current;
7241
7242				do {
7243					current = state->dts_activity;
7244				} while (dtrace_cas32(activity, current,
7245				    DTRACE_ACTIVITY_KILLED) != current);
7246
7247				continue;
7248			}
7249		}
7250
7251		if ((offs = dtrace_buffer_reserve(buf, ecb->dte_needed,
7252		    ecb->dte_alignment, state, &mstate)) < 0)
7253			continue;
7254
7255		tomax = buf->dtb_tomax;
7256		ASSERT(tomax != NULL);
7257
7258		if (ecb->dte_size != 0) {
7259			dtrace_rechdr_t dtrh;
7260			if (!(mstate.dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
7261				mstate.dtms_timestamp = dtrace_gethrtime();
7262				mstate.dtms_present |= DTRACE_MSTATE_TIMESTAMP;
7263			}
7264			ASSERT3U(ecb->dte_size, >=, sizeof (dtrace_rechdr_t));
7265			dtrh.dtrh_epid = ecb->dte_epid;
7266			DTRACE_RECORD_STORE_TIMESTAMP(&dtrh,
7267			    mstate.dtms_timestamp);
7268			*((dtrace_rechdr_t *)(tomax + offs)) = dtrh;
7269		}
7270
7271		mstate.dtms_epid = ecb->dte_epid;
7272		mstate.dtms_present |= DTRACE_MSTATE_EPID;
7273
7274		if (state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)
7275			mstate.dtms_access = DTRACE_ACCESS_KERNEL;
7276		else
7277			mstate.dtms_access = 0;
7278
7279		if (pred != NULL) {
7280			dtrace_difo_t *dp = pred->dtp_difo;
7281			int rval;
7282
7283			rval = dtrace_dif_emulate(dp, &mstate, vstate, state);
7284
7285			if (!(*flags & CPU_DTRACE_ERROR) && !rval) {
7286				dtrace_cacheid_t cid = probe->dtpr_predcache;
7287
7288				if (cid != DTRACE_CACHEIDNONE && !onintr) {
7289					/*
7290					 * Update the predicate cache...
7291					 */
7292					ASSERT(cid == pred->dtp_cacheid);
7293					curthread->t_predcache = cid;
7294				}
7295
7296				continue;
7297			}
7298		}
7299
7300		for (act = ecb->dte_action; !(*flags & CPU_DTRACE_ERROR) &&
7301		    act != NULL; act = act->dta_next) {
7302			size_t valoffs;
7303			dtrace_difo_t *dp;
7304			dtrace_recdesc_t *rec = &act->dta_rec;
7305
7306			size = rec->dtrd_size;
7307			valoffs = offs + rec->dtrd_offset;
7308
7309			if (DTRACEACT_ISAGG(act->dta_kind)) {
7310				uint64_t v = 0xbad;
7311				dtrace_aggregation_t *agg;
7312
7313				agg = (dtrace_aggregation_t *)act;
7314
7315				if ((dp = act->dta_difo) != NULL)
7316					v = dtrace_dif_emulate(dp,
7317					    &mstate, vstate, state);
7318
7319				if (*flags & CPU_DTRACE_ERROR)
7320					continue;
7321
7322				/*
7323				 * Note that we always pass the expression
7324				 * value from the previous iteration of the
7325				 * action loop.  This value will only be used
7326				 * if there is an expression argument to the
7327				 * aggregating action, denoted by the
7328				 * dtag_hasarg field.
7329				 */
7330				dtrace_aggregate(agg, buf,
7331				    offs, aggbuf, v, val);
7332				continue;
7333			}
7334
7335			switch (act->dta_kind) {
7336			case DTRACEACT_STOP:
7337				if (dtrace_priv_proc_destructive(state))
7338					dtrace_action_stop();
7339				continue;
7340
7341			case DTRACEACT_BREAKPOINT:
7342				if (dtrace_priv_kernel_destructive(state))
7343					dtrace_action_breakpoint(ecb);
7344				continue;
7345
7346			case DTRACEACT_PANIC:
7347				if (dtrace_priv_kernel_destructive(state))
7348					dtrace_action_panic(ecb);
7349				continue;
7350
7351			case DTRACEACT_STACK:
7352				if (!dtrace_priv_kernel(state))
7353					continue;
7354
7355				dtrace_getpcstack((pc_t *)(tomax + valoffs),
7356				    size / sizeof (pc_t), probe->dtpr_aframes,
7357				    DTRACE_ANCHORED(probe) ? NULL :
7358				    (uint32_t *)arg0);
7359				continue;
7360
7361			case DTRACEACT_JSTACK:
7362			case DTRACEACT_USTACK:
7363				if (!dtrace_priv_proc(state))
7364					continue;
7365
7366				/*
7367				 * See comment in DIF_VAR_PID.
7368				 */
7369				if (DTRACE_ANCHORED(mstate.dtms_probe) &&
7370				    CPU_ON_INTR(CPU)) {
7371					int depth = DTRACE_USTACK_NFRAMES(
7372					    rec->dtrd_arg) + 1;
7373
7374					dtrace_bzero((void *)(tomax + valoffs),
7375					    DTRACE_USTACK_STRSIZE(rec->dtrd_arg)
7376					    + depth * sizeof (uint64_t));
7377
7378					continue;
7379				}
7380
7381				if (DTRACE_USTACK_STRSIZE(rec->dtrd_arg) != 0 &&
7382				    curproc->p_dtrace_helpers != NULL) {
7383					/*
7384					 * This is the slow path -- we have
7385					 * allocated string space, and we're
7386					 * getting the stack of a process that
7387					 * has helpers.  Call into a separate
7388					 * routine to perform this processing.
7389					 */
7390					dtrace_action_ustack(&mstate, state,
7391					    (uint64_t *)(tomax + valoffs),
7392					    rec->dtrd_arg);
7393					continue;
7394				}
7395
7396				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
7397				dtrace_getupcstack((uint64_t *)
7398				    (tomax + valoffs),
7399				    DTRACE_USTACK_NFRAMES(rec->dtrd_arg) + 1);
7400				DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
7401				continue;
7402
7403			default:
7404				break;
7405			}
7406
7407			dp = act->dta_difo;
7408			ASSERT(dp != NULL);
7409
7410			val = dtrace_dif_emulate(dp, &mstate, vstate, state);
7411
7412			if (*flags & CPU_DTRACE_ERROR)
7413				continue;
7414
7415			switch (act->dta_kind) {
7416			case DTRACEACT_SPECULATE: {
7417				dtrace_rechdr_t *dtrh;
7418
7419				ASSERT(buf == &state->dts_buffer[cpuid]);
7420				buf = dtrace_speculation_buffer(state,
7421				    cpuid, val);
7422
7423				if (buf == NULL) {
7424					*flags |= CPU_DTRACE_DROP;
7425					continue;
7426				}
7427
7428				offs = dtrace_buffer_reserve(buf,
7429				    ecb->dte_needed, ecb->dte_alignment,
7430				    state, NULL);
7431
7432				if (offs < 0) {
7433					*flags |= CPU_DTRACE_DROP;
7434					continue;
7435				}
7436
7437				tomax = buf->dtb_tomax;
7438				ASSERT(tomax != NULL);
7439
7440				if (ecb->dte_size == 0)
7441					continue;
7442
7443				ASSERT3U(ecb->dte_size, >=,
7444				    sizeof (dtrace_rechdr_t));
7445				dtrh = ((void *)(tomax + offs));
7446				dtrh->dtrh_epid = ecb->dte_epid;
7447				/*
7448				 * When the speculation is committed, all of
7449				 * the records in the speculative buffer will
7450				 * have their timestamps set to the commit
7451				 * time.  Until then, it is set to a sentinel
				 * value, for debuggability.
7453				 */
7454				DTRACE_RECORD_STORE_TIMESTAMP(dtrh, UINT64_MAX);
7455				continue;
7456			}
7457
7458			case DTRACEACT_PRINTM: {
7459				/* The DIF returns a 'memref'. */
7460				uintptr_t *memref = (uintptr_t *)(uintptr_t) val;
7461
7462				/* Get the size from the memref. */
7463				size = memref[1];
7464
7465				/*
7466				 * Check if the size exceeds the allocated
7467				 * buffer size.
7468				 */
7469				if (size + sizeof(uintptr_t) > dp->dtdo_rtype.dtdt_size) {
7470					/* Flag a drop! */
7471					*flags |= CPU_DTRACE_DROP;
7472					continue;
7473				}
7474
7475				/* Store the size in the buffer first. */
7476				DTRACE_STORE(uintptr_t, tomax,
7477				    valoffs, size);
7478
7479				/*
7480				 * Offset the buffer address to the start
7481				 * of the data.
7482				 */
7483				valoffs += sizeof(uintptr_t);
7484
7485				/*
7486				 * Reset to the memory address rather than
7487				 * the memref array, then let the BYREF
7488				 * code below do the work to store the
7489				 * memory data in the buffer.
7490				 */
7491				val = memref[0];
7492				break;
7493			}
7494
7495			case DTRACEACT_PRINTT: {
7496				/* The DIF returns a 'typeref'. */
7497				uintptr_t *typeref = (uintptr_t *)(uintptr_t) val;
7498				char c = '\0' + 1;
7499				size_t s;
7500
7501				/*
7502				 * Get the type string length and round it
7503				 * up so that the data that follows is
7504				 * aligned for easy access.
7505				 */
7506				size_t typs = strlen((char *) typeref[2]) + 1;
7507				typs = roundup(typs,  sizeof(uintptr_t));
7508
7509				/*
				 * Get the size from the typeref using the
7511				 * number of elements and the type size.
7512				 */
7513				size = typeref[1] * typeref[3];
7514
7515				/*
7516				 * Check if the size exceeds the allocated
7517				 * buffer size.
7518				 */
7519				if (size + typs + 2 * sizeof(uintptr_t) > dp->dtdo_rtype.dtdt_size) {
7520					/* Flag a drop! */
7521					*flags |= CPU_DTRACE_DROP;
7522
7523				}
7524
7525				/* Store the size in the buffer first. */
7526				DTRACE_STORE(uintptr_t, tomax,
7527				    valoffs, size);
7528				valoffs += sizeof(uintptr_t);
7529
7530				/* Store the type size in the buffer. */
7531				DTRACE_STORE(uintptr_t, tomax,
7532				    valoffs, typeref[3]);
7533				valoffs += sizeof(uintptr_t);
7534
7535				val = typeref[2];
7536
7537				for (s = 0; s < typs; s++) {
7538					if (c != '\0')
7539						c = dtrace_load8(val++);
7540
7541					DTRACE_STORE(uint8_t, tomax,
7542					    valoffs++, c);
7543				}
7544
7545				/*
7546				 * Reset to the memory address rather than
7547				 * the typeref array, then let the BYREF
7548				 * code below do the work to store the
7549				 * memory data in the buffer.
7550				 */
7551				val = typeref[0];
7552				break;
7553			}
7554
7555			case DTRACEACT_CHILL:
7556				if (dtrace_priv_kernel_destructive(state))
7557					dtrace_action_chill(&mstate, val);
7558				continue;
7559
7560			case DTRACEACT_RAISE:
7561				if (dtrace_priv_proc_destructive(state))
7562					dtrace_action_raise(val);
7563				continue;
7564
7565			case DTRACEACT_COMMIT:
7566				ASSERT(!committed);
7567
7568				/*
7569				 * We need to commit our buffer state.
7570				 */
7571				if (ecb->dte_size)
7572					buf->dtb_offset = offs + ecb->dte_size;
7573				buf = &state->dts_buffer[cpuid];
7574				dtrace_speculation_commit(state, cpuid, val);
7575				committed = 1;
7576				continue;
7577
7578			case DTRACEACT_DISCARD:
7579				dtrace_speculation_discard(state, cpuid, val);
7580				continue;
7581
7582			case DTRACEACT_DIFEXPR:
7583			case DTRACEACT_LIBACT:
7584			case DTRACEACT_PRINTF:
7585			case DTRACEACT_PRINTA:
7586			case DTRACEACT_SYSTEM:
7587			case DTRACEACT_FREOPEN:
7588			case DTRACEACT_TRACEMEM:
7589				break;
7590
7591			case DTRACEACT_TRACEMEM_DYNSIZE:
7592				tracememsize = val;
7593				break;
7594
7595			case DTRACEACT_SYM:
7596			case DTRACEACT_MOD:
7597				if (!dtrace_priv_kernel(state))
7598					continue;
7599				break;
7600
7601			case DTRACEACT_USYM:
7602			case DTRACEACT_UMOD:
7603			case DTRACEACT_UADDR: {
7604#if defined(sun)
7605				struct pid *pid = curthread->t_procp->p_pidp;
7606#endif
7607
7608				if (!dtrace_priv_proc(state))
7609					continue;
7610
7611				DTRACE_STORE(uint64_t, tomax,
7612#if defined(sun)
7613				    valoffs, (uint64_t)pid->pid_id);
7614#else
7615				    valoffs, (uint64_t) curproc->p_pid);
7616#endif
7617				DTRACE_STORE(uint64_t, tomax,
7618				    valoffs + sizeof (uint64_t), val);
7619
7620				continue;
7621			}
7622
7623			case DTRACEACT_EXIT: {
7624				/*
7625				 * For the exit action, we are going to attempt
7626				 * to atomically set our activity to be
7627				 * draining.  If this fails (either because
7628				 * another CPU has beat us to the exit action,
7629				 * or because our current activity is something
7630				 * other than ACTIVE or WARMUP), we will
7631				 * continue.  This assures that the exit action
7632				 * can be successfully recorded at most once
7633				 * when we're in the ACTIVE state.  If we're
7634				 * encountering the exit() action while in
7635				 * COOLDOWN, however, we want to honor the new
7636				 * status code.  (We know that we're the only
7637				 * thread in COOLDOWN, so there is no race.)
7638				 */
7639				void *activity = &state->dts_activity;
7640				dtrace_activity_t current = state->dts_activity;
7641
7642				if (current == DTRACE_ACTIVITY_COOLDOWN)
7643					break;
7644
7645				if (current != DTRACE_ACTIVITY_WARMUP)
7646					current = DTRACE_ACTIVITY_ACTIVE;
7647
7648				if (dtrace_cas32(activity, current,
7649				    DTRACE_ACTIVITY_DRAINING) != current) {
7650					*flags |= CPU_DTRACE_DROP;
7651					continue;
7652				}
7653
7654				break;
7655			}
7656
7657			default:
7658				ASSERT(0);
7659			}
7660
7661			if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF ||
7662			    dp->dtdo_rtype.dtdt_flags & DIF_TF_BYUREF) {
7663				uintptr_t end = valoffs + size;
7664
7665				if (tracememsize != 0 &&
7666				    valoffs + tracememsize < end) {
7667					end = valoffs + tracememsize;
7668					tracememsize = 0;
7669				}
7670
7671				if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF &&
7672				    !dtrace_vcanload((void *)(uintptr_t)val,
7673				    &dp->dtdo_rtype, &mstate, vstate))
7674					continue;
7675
7676				dtrace_store_by_ref(dp, tomax, size, &valoffs,
7677				    &val, end, act->dta_intuple,
7678				    dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF ?
7679				    DIF_TF_BYREF: DIF_TF_BYUREF);
7680				continue;
7681			}
7682
7683			switch (size) {
7684			case 0:
7685				break;
7686
7687			case sizeof (uint8_t):
7688				DTRACE_STORE(uint8_t, tomax, valoffs, val);
7689				break;
7690			case sizeof (uint16_t):
7691				DTRACE_STORE(uint16_t, tomax, valoffs, val);
7692				break;
7693			case sizeof (uint32_t):
7694				DTRACE_STORE(uint32_t, tomax, valoffs, val);
7695				break;
7696			case sizeof (uint64_t):
7697				DTRACE_STORE(uint64_t, tomax, valoffs, val);
7698				break;
7699			default:
7700				/*
7701				 * Any other size should have been returned by
7702				 * reference, not by value.
7703				 */
7704				ASSERT(0);
7705				break;
7706			}
7707		}
7708
7709		if (*flags & CPU_DTRACE_DROP)
7710			continue;
7711
7712		if (*flags & CPU_DTRACE_FAULT) {
7713			int ndx;
7714			dtrace_action_t *err;
7715
7716			buf->dtb_errors++;
7717
7718			if (probe->dtpr_id == dtrace_probeid_error) {
7719				/*
7720				 * There's nothing we can do -- we had an
7721				 * error on the error probe.  We bump an
7722				 * error counter to at least indicate that
7723				 * this condition happened.
7724				 */
7725				dtrace_error(&state->dts_dblerrors);
7726				continue;
7727			}
7728
7729			if (vtime) {
7730				/*
7731				 * Before recursing on dtrace_probe(), we
7732				 * need to explicitly clear out our start
7733				 * time to prevent it from being accumulated
7734				 * into t_dtrace_vtime.
7735				 */
7736				curthread->t_dtrace_start = 0;
7737			}
7738
7739			/*
7740			 * Iterate over the actions to figure out which action
7741			 * we were processing when we experienced the error.
7742			 * Note that act points _past_ the faulting action; if
7743			 * act is ecb->dte_action, the fault was in the
7744			 * predicate, if it's ecb->dte_action->dta_next it's
7745			 * in action #1, and so on.
7746			 */
7747			for (err = ecb->dte_action, ndx = 0;
7748			    err != act; err = err->dta_next, ndx++)
7749				continue;
7750
7751			dtrace_probe_error(state, ecb->dte_epid, ndx,
7752			    (mstate.dtms_present & DTRACE_MSTATE_FLTOFFS) ?
7753			    mstate.dtms_fltoffs : -1, DTRACE_FLAGS2FLT(*flags),
7754			    cpu_core[cpuid].cpuc_dtrace_illval);
7755
7756			continue;
7757		}
7758
7759		if (!committed)
7760			buf->dtb_offset = offs + ecb->dte_size;
7761	}
7762
7763	if (vtime)
7764		curthread->t_dtrace_start = dtrace_gethrtime();
7765
7766	dtrace_interrupt_enable(cookie);
7767}
7768
7769/*
7770 * DTrace Probe Hashing Functions
7771 *
7772 * The functions in this section (and indeed, the functions in remaining
7773 * sections) are not _called_ from probe context.  (Any exceptions to this are
7774 * marked with a "Note:".)  Rather, they are called from elsewhere in the
7775 * DTrace framework to look-up probes in, add probes to and remove probes from
7776 * the DTrace probe hashes.  (Each probe is hashed by each element of the
7777 * probe tuple -- allowing for fast lookups, regardless of what was
7778 * specified.)
7779 */
7780static uint_t
7781dtrace_hash_str(const char *p)
7782{
7783	unsigned int g;
7784	uint_t hval = 0;
7785
7786	while (*p) {
7787		hval = (hval << 4) + *p++;
7788		if ((g = (hval & 0xf0000000)) != 0)
7789			hval ^= g >> 24;
7790		hval &= ~g;
7791	}
7792	return (hval);
7793}
7794
7795static dtrace_hash_t *
7796dtrace_hash_create(uintptr_t stroffs, uintptr_t nextoffs, uintptr_t prevoffs)
7797{
7798	dtrace_hash_t *hash = kmem_zalloc(sizeof (dtrace_hash_t), KM_SLEEP);
7799
7800	hash->dth_stroffs = stroffs;
7801	hash->dth_nextoffs = nextoffs;
7802	hash->dth_prevoffs = prevoffs;
7803
7804	hash->dth_size = 1;
7805	hash->dth_mask = hash->dth_size - 1;
7806
7807	hash->dth_tab = kmem_zalloc(hash->dth_size *
7808	    sizeof (dtrace_hashbucket_t *), KM_SLEEP);
7809
7810	return (hash);
7811}
7812
/*
 * Free a probe hash table created with dtrace_hash_create().  The caller
 * must already have removed every probe from the table; under DEBUG we
 * assert that every bucket slot is empty.
 */
static void
dtrace_hash_destroy(dtrace_hash_t *hash)
{
#ifdef DEBUG
	int i;

	for (i = 0; i < hash->dth_size; i++)
		ASSERT(hash->dth_tab[i] == NULL);
#endif

	kmem_free(hash->dth_tab,
	    hash->dth_size * sizeof (dtrace_hashbucket_t *));
	kmem_free(hash, sizeof (dtrace_hash_t));
}
7827
7828static void
7829dtrace_hash_resize(dtrace_hash_t *hash)
7830{
7831	int size = hash->dth_size, i, ndx;
7832	int new_size = hash->dth_size << 1;
7833	int new_mask = new_size - 1;
7834	dtrace_hashbucket_t **new_tab, *bucket, *next;
7835
7836	ASSERT((new_size & new_mask) == 0);
7837
7838	new_tab = kmem_zalloc(new_size * sizeof (void *), KM_SLEEP);
7839
7840	for (i = 0; i < size; i++) {
7841		for (bucket = hash->dth_tab[i]; bucket != NULL; bucket = next) {
7842			dtrace_probe_t *probe = bucket->dthb_chain;
7843
7844			ASSERT(probe != NULL);
7845			ndx = DTRACE_HASHSTR(hash, probe) & new_mask;
7846
7847			next = bucket->dthb_next;
7848			bucket->dthb_next = new_tab[ndx];
7849			new_tab[ndx] = bucket;
7850		}
7851	}
7852
7853	kmem_free(hash->dth_tab, hash->dth_size * sizeof (void *));
7854	hash->dth_tab = new_tab;
7855	hash->dth_size = new_size;
7856	hash->dth_mask = new_mask;
7857}
7858
/*
 * Add the specified probe to the hash table.  Probes whose hashed element
 * compares equal share a single bucket and are chained together through
 * the next/prev pointers located via dth_nextoffs/dth_prevoffs.  When the
 * number of buckets exceeds twice the table size, the table is resized
 * and the insert is retried.
 */
static void
dtrace_hash_add(dtrace_hash_t *hash, dtrace_probe_t *new)
{
	int hashval = DTRACE_HASHSTR(hash, new);
	int ndx = hashval & hash->dth_mask;
	dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
	dtrace_probe_t **nextp, **prevp;

	/*
	 * If a bucket for this key already exists, just chain onto it.
	 */
	for (; bucket != NULL; bucket = bucket->dthb_next) {
		if (DTRACE_HASHEQ(hash, bucket->dthb_chain, new))
			goto add;
	}

	if ((hash->dth_nbuckets >> 1) > hash->dth_size) {
		/*
		 * Too many buckets for the current table size:  grow the
		 * table and retry (the slot index may change).
		 */
		dtrace_hash_resize(hash);
		dtrace_hash_add(hash, new);
		return;
	}

	bucket = kmem_zalloc(sizeof (dtrace_hashbucket_t), KM_SLEEP);
	bucket->dthb_next = hash->dth_tab[ndx];
	hash->dth_tab[ndx] = bucket;
	hash->dth_nbuckets++;

add:
	/*
	 * Push the new probe onto the head of the bucket's chain, fixing
	 * up the previous head's back-pointer if it exists.
	 */
	nextp = DTRACE_HASHNEXT(hash, new);
	ASSERT(*nextp == NULL && *(DTRACE_HASHPREV(hash, new)) == NULL);
	*nextp = bucket->dthb_chain;

	if (bucket->dthb_chain != NULL) {
		prevp = DTRACE_HASHPREV(hash, bucket->dthb_chain);
		ASSERT(*prevp == NULL);
		*prevp = new;
	}

	bucket->dthb_chain = new;
	bucket->dthb_len++;
}
7897
7898static dtrace_probe_t *
7899dtrace_hash_lookup(dtrace_hash_t *hash, dtrace_probe_t *template)
7900{
7901	int hashval = DTRACE_HASHSTR(hash, template);
7902	int ndx = hashval & hash->dth_mask;
7903	dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7904
7905	for (; bucket != NULL; bucket = bucket->dthb_next) {
7906		if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
7907			return (bucket->dthb_chain);
7908	}
7909
7910	return (NULL);
7911}
7912
7913static int
7914dtrace_hash_collisions(dtrace_hash_t *hash, dtrace_probe_t *template)
7915{
7916	int hashval = DTRACE_HASHSTR(hash, template);
7917	int ndx = hashval & hash->dth_mask;
7918	dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7919
7920	for (; bucket != NULL; bucket = bucket->dthb_next) {
7921		if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
7922			return (bucket->dthb_len);
7923	}
7924
7925	return (0);
7926}
7927
/*
 * Remove the specified probe from its hash chain, freeing the containing
 * bucket if this was the only probe with its key.
 */
static void
dtrace_hash_remove(dtrace_hash_t *hash, dtrace_probe_t *probe)
{
	int ndx = DTRACE_HASHSTR(hash, probe) & hash->dth_mask;
	dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];

	dtrace_probe_t **prevp = DTRACE_HASHPREV(hash, probe);
	dtrace_probe_t **nextp = DTRACE_HASHNEXT(hash, probe);

	/*
	 * Find the bucket that we're removing this probe from.
	 */
	for (; bucket != NULL; bucket = bucket->dthb_next) {
		if (DTRACE_HASHEQ(hash, bucket->dthb_chain, probe))
			break;
	}

	ASSERT(bucket != NULL);

	if (*prevp == NULL) {
		if (*nextp == NULL) {
			/*
			 * The removed probe was the only probe on this
			 * bucket; we need to remove the bucket.
			 */
			dtrace_hashbucket_t *b = hash->dth_tab[ndx];

			ASSERT(bucket->dthb_chain == probe);
			ASSERT(b != NULL);

			if (b == bucket) {
				hash->dth_tab[ndx] = bucket->dthb_next;
			} else {
				/* Unlink the bucket from mid-chain. */
				while (b->dthb_next != bucket)
					b = b->dthb_next;
				b->dthb_next = bucket->dthb_next;
			}

			ASSERT(hash->dth_nbuckets > 0);
			hash->dth_nbuckets--;
			kmem_free(bucket, sizeof (dtrace_hashbucket_t));
			return;
		}

		/* Probe was at the head of a longer chain. */
		bucket->dthb_chain = *nextp;
	} else {
		*(DTRACE_HASHNEXT(hash, *prevp)) = *nextp;
	}

	/*
	 * NOTE(review): dthb_len is not decremented on this path -- it is
	 * only ever incremented in dtrace_hash_add().  Confirm this is
	 * intentional before relying on dthb_len as an exact chain length.
	 */
	if (*nextp != NULL)
		*(DTRACE_HASHPREV(hash, *nextp)) = *prevp;
}
7980
7981/*
7982 * DTrace Utility Functions
7983 *
7984 * These are random utility functions that are _not_ called from probe context.
7985 */
7986static int
7987dtrace_badattr(const dtrace_attribute_t *a)
7988{
7989	return (a->dtat_name > DTRACE_STABILITY_MAX ||
7990	    a->dtat_data > DTRACE_STABILITY_MAX ||
7991	    a->dtat_class > DTRACE_CLASS_MAX);
7992}
7993
7994/*
7995 * Return a duplicate copy of a string.  If the specified string is NULL,
7996 * this function returns a zero-length string.
7997 */
7998static char *
7999dtrace_strdup(const char *str)
8000{
8001	char *new = kmem_zalloc((str != NULL ? strlen(str) : 0) + 1, KM_SLEEP);
8002
8003	if (str != NULL)
8004		(void) strcpy(new, str);
8005
8006	return (new);
8007}
8008
8009#define	DTRACE_ISALPHA(c)	\
8010	(((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z'))
8011
8012static int
8013dtrace_badname(const char *s)
8014{
8015	char c;
8016
8017	if (s == NULL || (c = *s++) == '\0')
8018		return (0);
8019
8020	if (!DTRACE_ISALPHA(c) && c != '-' && c != '_' && c != '.')
8021		return (1);
8022
8023	while ((c = *s++) != '\0') {
8024		if (!DTRACE_ISALPHA(c) && (c < '0' || c > '9') &&
8025		    c != '-' && c != '_' && c != '.' && c != '`')
8026			return (1);
8027	}
8028
8029	return (0);
8030}
8031
/*
 * Derive the DTrace privilege flags (DTRACE_PRIV_*), uid and zoneid from
 * the specified credential.  A NULL credential, or one holding all
 * privileges, yields DTRACE_PRIV_ALL -- in which case *uidp and *zoneidp
 * are not written.  On non-Solaris platforms all privileges are granted
 * unconditionally.
 */
static void
dtrace_cred2priv(cred_t *cr, uint32_t *privp, uid_t *uidp, zoneid_t *zoneidp)
{
	uint32_t priv;

#if defined(sun)
	if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
		/*
		 * For DTRACE_PRIV_ALL, the uid and zoneid don't matter.
		 */
		priv = DTRACE_PRIV_ALL;
	} else {
		*uidp = crgetuid(cr);
		*zoneidp = crgetzoneid(cr);

		/* Map each held privilege onto its DTRACE_PRIV_* flag. */
		priv = 0;
		if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE))
			priv |= DTRACE_PRIV_KERNEL | DTRACE_PRIV_USER;
		else if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE))
			priv |= DTRACE_PRIV_USER;
		if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE))
			priv |= DTRACE_PRIV_PROC;
		if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
			priv |= DTRACE_PRIV_OWNER;
		if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
			priv |= DTRACE_PRIV_ZONEOWNER;
	}
#else
	priv = DTRACE_PRIV_ALL;
#endif

	*privp = priv;
}
8065
#ifdef DTRACE_ERRDEBUG
/*
 * Record an error message in the error hash for post-mortem inspection.
 * The hash is open-addressed with linear probing:  entries are matched
 * by pointer identity (so duplicate messages must share storage), and a
 * per-message count is maintained.  dtrace_errlast and dtrace_errthread
 * always track the most recent error.  Panics if the hash fills up.
 */
static void
dtrace_errdebug(const char *str)
{
	int hval = dtrace_hash_str(str) % DTRACE_ERRHASHSZ;
	int occupied = 0;

	mutex_enter(&dtrace_errlock);
	dtrace_errlast = str;
	dtrace_errthread = curthread;

	while (occupied++ < DTRACE_ERRHASHSZ) {
		if (dtrace_errhash[hval].dter_msg == str) {
			/* Seen this message before; just bump its count. */
			dtrace_errhash[hval].dter_count++;
			goto out;
		}

		if (dtrace_errhash[hval].dter_msg != NULL) {
			/* Slot taken by a different message; probe onward. */
			hval = (hval + 1) % DTRACE_ERRHASHSZ;
			continue;
		}

		/* Free slot:  claim it for this message. */
		dtrace_errhash[hval].dter_msg = str;
		dtrace_errhash[hval].dter_count = 1;
		goto out;
	}

	panic("dtrace: undersized error hash");
out:
	mutex_exit(&dtrace_errlock);
}
#endif
8098
8099/*
8100 * DTrace Matching Functions
8101 *
8102 * These functions are used to match groups of probes, given some elements of
8103 * a probe tuple, or some globbed expressions for elements of a probe tuple.
8104 */
/*
 * Determine whether the credentials described by (priv, uid, zoneid)
 * suffice to observe the specified probe, based on the privilege flags
 * advertised by the probe's provider.  Returns 1 if the probe is
 * visible to the caller, 0 if not.
 */
static int
dtrace_match_priv(const dtrace_probe_t *prp, uint32_t priv, uid_t uid,
    zoneid_t zoneid)
{
	if (priv != DTRACE_PRIV_ALL) {
		uint32_t ppriv = prp->dtpr_provider->dtpv_priv.dtpp_flags;
		uint32_t match = priv & ppriv;

		/*
		 * No PRIV_DTRACE_* privileges...
		 */
		if ((priv & (DTRACE_PRIV_PROC | DTRACE_PRIV_USER |
		    DTRACE_PRIV_KERNEL)) == 0)
			return (0);

		/*
		 * No matching bits, but there were bits to match...
		 */
		if (match == 0 && ppriv != 0)
			return (0);

		/*
		 * Need to have permissions to the process, but don't...
		 */
		if (((ppriv & ~match) & DTRACE_PRIV_OWNER) != 0 &&
		    uid != prp->dtpr_provider->dtpv_priv.dtpp_uid) {
			return (0);
		}

		/*
		 * Need to be in the same zone unless we possess the
		 * privilege to examine all zones.
		 */
		if (((ppriv & ~match) & DTRACE_PRIV_ZONEOWNER) != 0 &&
		    zoneid != prp->dtpr_provider->dtpv_priv.dtpp_zoneid) {
			return (0);
		}
	}

	return (1);
}
8146
8147/*
8148 * dtrace_match_probe compares a dtrace_probe_t to a pre-compiled key, which
8149 * consists of input pattern strings and an ops-vector to evaluate them.
8150 * This function returns >0 for match, 0 for no match, and <0 for error.
8151 */
static int
dtrace_match_probe(const dtrace_probe_t *prp, const dtrace_probekey_t *pkp,
    uint32_t priv, uid_t uid, zoneid_t zoneid)
{
	dtrace_provider_t *pvp = prp->dtpr_provider;
	int rv;

	/* Probes of defunct providers never match. */
	if (pvp->dtpv_defunct)
		return (0);

	/*
	 * Compare each element of the probe tuple with its pre-compiled
	 * match function; the first non-match (0) or error (<0) ends the
	 * comparison.
	 */
	if ((rv = pkp->dtpk_pmatch(pvp->dtpv_name, pkp->dtpk_prov, 0)) <= 0)
		return (rv);

	if ((rv = pkp->dtpk_mmatch(prp->dtpr_mod, pkp->dtpk_mod, 0)) <= 0)
		return (rv);

	if ((rv = pkp->dtpk_fmatch(prp->dtpr_func, pkp->dtpk_func, 0)) <= 0)
		return (rv);

	if ((rv = pkp->dtpk_nmatch(prp->dtpr_name, pkp->dtpk_name, 0)) <= 0)
		return (rv);

	/* Finally, verify the caller has privilege to observe the probe. */
	if (dtrace_match_priv(prp, priv, uid, zoneid) == 0)
		return (0);

	return (rv);
}
8179
8180/*
8181 * dtrace_match_glob() is a safe kernel implementation of the gmatch(3GEN)
8182 * interface for matching a glob pattern 'p' to an input string 's'.  Unlike
8183 * libc's version, the kernel version only applies to 8-bit ASCII strings.
8184 * In addition, all of the recursion cases except for '*' matching have been
8185 * unwound.  For '*', we still implement recursive evaluation, but a depth
8186 * counter is maintained and matching is aborted if we recurse too deep.
8187 * The function returns 0 if no match, >0 if match, and <0 if recursion error.
8188 */
static int
dtrace_match_glob(const char *s, const char *p, int depth)
{
	const char *olds;
	char s1, c;
	int gs;

	if (depth > DTRACE_PROBEKEY_MAXDEPTH)
		return (-1);

	if (s == NULL)
		s = ""; /* treat NULL as empty string */

top:
	/* Consume one input character and one pattern token per pass. */
	olds = s;
	s1 = *s++;

	if (p == NULL)
		return (0);

	if ((c = *p++) == '\0')
		return (s1 == '\0');

	switch (c) {
	case '[': {
		/*
		 * Character class:  scan each member (including ranges),
		 * setting 'ok' when the class accepts the current input
		 * character 's1'.
		 */
		int ok = 0, notflag = 0;
		char lc = '\0';

		if (s1 == '\0')
			return (0);

		if (*p == '!') {
			notflag = 1;
			p++;
		}

		if ((c = *p++) == '\0')
			return (0);

		do {
			if (c == '-' && lc != '\0' && *p != ']') {
				/*
				 * Range expression (e.g. "a-z"); the left
				 * endpoint is the previous character 'lc'.
				 */
				if ((c = *p++) == '\0')
					return (0);
				if (c == '\\' && (c = *p++) == '\0')
					return (0);

				if (notflag) {
					if (s1 < lc || s1 > c)
						ok++;
					else
						return (0);
				} else if (lc <= s1 && s1 <= c)
					ok++;

			} else if (c == '\\' && (c = *p++) == '\0')
				return (0);

			lc = c; /* save left-hand 'c' for next iteration */

			if (notflag) {
				if (s1 != c)
					ok++;
				else
					return (0);
			} else if (s1 == c)
				ok++;

			if ((c = *p++) == '\0')
				return (0);

		} while (c != ']');

		if (ok)
			goto top;

		return (0);
	}

	case '\\':
		/* Escape:  the next pattern character is a literal. */
		if ((c = *p++) == '\0')
			return (0);
		/*FALLTHRU*/

	default:
		if (c != s1)
			return (0);
		/*FALLTHRU*/

	case '?':
		/* '?' (and a matched literal) consumes one input char. */
		if (s1 != '\0')
			goto top;
		return (0);

	case '*':
		while (*p == '*')
			p++; /* consecutive *'s are identical to a single one */

		if (*p == '\0')
			return (1);

		/*
		 * Try the rest of the pattern against every suffix of the
		 * input; recursion depth is bounded by the check above.
		 */
		for (s = olds; *s != '\0'; s++) {
			if ((gs = dtrace_match_glob(s, p, depth + 1)) != 0)
				return (gs);
		}

		return (0);
	}
}
8297
/*ARGSUSED*/
/*
 * Exact (non-glob) string comparison; a NULL input never matches.  The
 * depth argument is unused but required by the dtrace_probekey_f
 * signature.
 */
static int
dtrace_match_string(const char *s, const char *p, int depth)
{
	if (s == NULL)
		return (0);

	return (strcmp(s, p) == 0);
}
8304
/*ARGSUSED*/
/*
 * Match function selected for NULL or empty pattern strings:  every
 * input (including NULL) is considered a match.
 */
static int
dtrace_match_nul(const char *s, const char *p, int depth)
{
	return (1); /* always match the empty pattern */
}
8311
8312/*ARGSUSED*/
8313static int
8314dtrace_match_nonzero(const char *s, const char *p, int depth)
8315{
8316	return (s != NULL && s[0] != '\0');
8317}
8318
/*
 * Iterate over all probes matching the specified probe key with the
 * specified privileges, invoking the 'matched' callback for each one.
 * Iteration stops early if the callback returns anything other than
 * DTRACE_MATCH_NEXT.  Returns the number of matching probes seen.
 * Must be called with dtrace_lock held.
 */
static int
dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid,
    zoneid_t zoneid, int (*matched)(dtrace_probe_t *, void *), void *arg)
{
	dtrace_probe_t template, *probe;
	dtrace_hash_t *hash = NULL;
	int len, best = INT_MAX, nmatched = 0;
	dtrace_id_t i;

	ASSERT(MUTEX_HELD(&dtrace_lock));

	/*
	 * If the probe ID is specified in the key, just lookup by ID and
	 * invoke the match callback once if a matching probe is found.
	 */
	if (pkp->dtpk_id != DTRACE_IDNONE) {
		if ((probe = dtrace_probe_lookup_id(pkp->dtpk_id)) != NULL &&
		    dtrace_match_probe(probe, pkp, priv, uid, zoneid) > 0) {
			(void) (*matched)(probe, arg);
			nmatched++;
		}
		return (nmatched);
	}

	template.dtpr_mod = (char *)pkp->dtpk_mod;
	template.dtpr_func = (char *)pkp->dtpk_func;
	template.dtpr_name = (char *)pkp->dtpk_name;

	/*
	 * We want to find the most distinct of the module name, function
	 * name, and name.  So for each one that is not a glob pattern or
	 * empty string, we perform a lookup in the corresponding hash and
	 * use the hash table with the fewest collisions to do our search.
	 */
	if (pkp->dtpk_mmatch == &dtrace_match_string &&
	    (len = dtrace_hash_collisions(dtrace_bymod, &template)) < best) {
		best = len;
		hash = dtrace_bymod;
	}

	if (pkp->dtpk_fmatch == &dtrace_match_string &&
	    (len = dtrace_hash_collisions(dtrace_byfunc, &template)) < best) {
		best = len;
		hash = dtrace_byfunc;
	}

	if (pkp->dtpk_nmatch == &dtrace_match_string &&
	    (len = dtrace_hash_collisions(dtrace_byname, &template)) < best) {
		best = len;
		hash = dtrace_byname;
	}

	/*
	 * If we did not select a hash table, iterate over every probe and
	 * invoke our callback for each one that matches our input probe key.
	 */
	if (hash == NULL) {
		for (i = 0; i < dtrace_nprobes; i++) {
			if ((probe = dtrace_probes[i]) == NULL ||
			    dtrace_match_probe(probe, pkp, priv, uid,
			    zoneid) <= 0)
				continue;

			nmatched++;

			if ((*matched)(probe, arg) != DTRACE_MATCH_NEXT)
				break;
		}

		return (nmatched);
	}

	/*
	 * If we selected a hash table, iterate over each probe of the same key
	 * name and invoke the callback for every probe that matches the other
	 * attributes of our input probe key.
	 */
	for (probe = dtrace_hash_lookup(hash, &template); probe != NULL;
	    probe = *(DTRACE_HASHNEXT(hash, probe))) {

		if (dtrace_match_probe(probe, pkp, priv, uid, zoneid) <= 0)
			continue;

		nmatched++;

		if ((*matched)(probe, arg) != DTRACE_MATCH_NEXT)
			break;
	}

	return (nmatched);
}
8410
8411/*
8412 * Return the function pointer dtrace_probecmp() should use to compare the
8413 * specified pattern with a string.  For NULL or empty patterns, we select
8414 * dtrace_match_nul().  For glob pattern strings, we use dtrace_match_glob().
8415 * For non-empty non-glob strings, we use dtrace_match_string().
8416 */
8417static dtrace_probekey_f *
8418dtrace_probekey_func(const char *p)
8419{
8420	char c;
8421
8422	if (p == NULL || *p == '\0')
8423		return (&dtrace_match_nul);
8424
8425	while ((c = *p++) != '\0') {
8426		if (c == '[' || c == '?' || c == '*' || c == '\\')
8427			return (&dtrace_match_glob);
8428	}
8429
8430	return (&dtrace_match_string);
8431}
8432
8433/*
8434 * Build a probe comparison key for use with dtrace_match_probe() from the
8435 * given probe description.  By convention, a null key only matches anchored
8436 * probes: if each field is the empty string, reset dtpk_fmatch to
8437 * dtrace_match_nonzero().
8438 */
static void
dtrace_probekey(dtrace_probedesc_t *pdp, dtrace_probekey_t *pkp)
{
	/* Copy each tuple element and select its comparison function. */
	pkp->dtpk_prov = pdp->dtpd_provider;
	pkp->dtpk_pmatch = dtrace_probekey_func(pdp->dtpd_provider);

	pkp->dtpk_mod = pdp->dtpd_mod;
	pkp->dtpk_mmatch = dtrace_probekey_func(pdp->dtpd_mod);

	pkp->dtpk_func = pdp->dtpd_func;
	pkp->dtpk_fmatch = dtrace_probekey_func(pdp->dtpd_func);

	pkp->dtpk_name = pdp->dtpd_name;
	pkp->dtpk_nmatch = dtrace_probekey_func(pdp->dtpd_name);

	pkp->dtpk_id = pdp->dtpd_id;

	/*
	 * An entirely empty key should match only anchored probes:
	 * substitute dtrace_match_nonzero() for the function element so
	 * that probes with an empty function name are excluded.
	 */
	if (pkp->dtpk_id == DTRACE_IDNONE &&
	    pkp->dtpk_pmatch == &dtrace_match_nul &&
	    pkp->dtpk_mmatch == &dtrace_match_nul &&
	    pkp->dtpk_fmatch == &dtrace_match_nul &&
	    pkp->dtpk_nmatch == &dtrace_match_nul)
		pkp->dtpk_fmatch = &dtrace_match_nonzero;
}
8463
8464/*
8465 * DTrace Provider-to-Framework API Functions
8466 *
8467 * These functions implement much of the Provider-to-Framework API, as
8468 * described in <sys/dtrace.h>.  The parts of the API not in this section are
8469 * the functions in the API for probe management (found below), and
8470 * dtrace_probe() itself (found above).
8471 */
8472
8473/*
8474 * Register the calling provider with the DTrace framework.  This should
8475 * generally be called by DTrace providers in their attach(9E) entry point.
8476 */
int
dtrace_register(const char *name, const dtrace_pattr_t *pap, uint32_t priv,
    cred_t *cr, const dtrace_pops_t *pops, void *arg, dtrace_provider_id_t *idp)
{
	dtrace_provider_t *provider;

	/*
	 * Validate the arguments:  name, attributes, ops vector and the
	 * output identifier pointer are all required.
	 */
	if (name == NULL || pap == NULL || pops == NULL || idp == NULL) {
		cmn_err(CE_WARN, "failed to register provider '%s': invalid "
		    "arguments", name ? name : "<NULL>");
		return (EINVAL);
	}

	if (name[0] == '\0' || dtrace_badname(name)) {
		cmn_err(CE_WARN, "failed to register provider '%s': invalid "
		    "provider name", name);
		return (EINVAL);
	}

	/*
	 * The provider must supply at least one of dtps_provide and
	 * dtps_provide_module, must supply enable/disable/destroy, and
	 * must supply dtps_resume and dtps_suspend together or not at all.
	 */
	if ((pops->dtps_provide == NULL && pops->dtps_provide_module == NULL) ||
	    pops->dtps_enable == NULL || pops->dtps_disable == NULL ||
	    pops->dtps_destroy == NULL ||
	    ((pops->dtps_resume == NULL) != (pops->dtps_suspend == NULL))) {
		cmn_err(CE_WARN, "failed to register provider '%s': invalid "
		    "provider ops", name);
		return (EINVAL);
	}

	if (dtrace_badattr(&pap->dtpa_provider) ||
	    dtrace_badattr(&pap->dtpa_mod) ||
	    dtrace_badattr(&pap->dtpa_func) ||
	    dtrace_badattr(&pap->dtpa_name) ||
	    dtrace_badattr(&pap->dtpa_args)) {
		cmn_err(CE_WARN, "failed to register provider '%s': invalid "
		    "provider attributes", name);
		return (EINVAL);
	}

	if (priv & ~DTRACE_PRIV_ALL) {
		cmn_err(CE_WARN, "failed to register provider '%s': invalid "
		    "privilege attributes", name);
		return (EINVAL);
	}

	/*
	 * A provider mixing kernel privileges with user/owner privileges
	 * must be able to report the mode of a given probe.
	 */
	if ((priv & DTRACE_PRIV_KERNEL) &&
	    (priv & (DTRACE_PRIV_USER | DTRACE_PRIV_OWNER)) &&
	    pops->dtps_usermode == NULL) {
		cmn_err(CE_WARN, "failed to register provider '%s': need "
		    "dtps_usermode() op for given privilege attributes", name);
		return (EINVAL);
	}

	provider = kmem_zalloc(sizeof (dtrace_provider_t), KM_SLEEP);
	provider->dtpv_name = kmem_alloc(strlen(name) + 1, KM_SLEEP);
	(void) strcpy(provider->dtpv_name, name);

	provider->dtpv_attr = *pap;
	provider->dtpv_priv.dtpp_flags = priv;
	if (cr != NULL) {
		provider->dtpv_priv.dtpp_uid = crgetuid(cr);
		provider->dtpv_priv.dtpp_zoneid = crgetzoneid(cr);
	}
	provider->dtpv_pops = *pops;

	/*
	 * Substitute no-op stubs for any optional ops the provider
	 * omitted, so callers need not check for NULL.
	 */
	if (pops->dtps_provide == NULL) {
		ASSERT(pops->dtps_provide_module != NULL);
		provider->dtpv_pops.dtps_provide =
		    (void (*)(void *, dtrace_probedesc_t *))dtrace_nullop;
	}

	if (pops->dtps_provide_module == NULL) {
		ASSERT(pops->dtps_provide != NULL);
		provider->dtpv_pops.dtps_provide_module =
		    (void (*)(void *, modctl_t *))dtrace_nullop;
	}

	if (pops->dtps_suspend == NULL) {
		ASSERT(pops->dtps_resume == NULL);
		provider->dtpv_pops.dtps_suspend =
		    (void (*)(void *, dtrace_id_t, void *))dtrace_nullop;
		provider->dtpv_pops.dtps_resume =
		    (void (*)(void *, dtrace_id_t, void *))dtrace_nullop;
	}

	provider->dtpv_arg = arg;
	*idp = (dtrace_provider_id_t)provider;

	if (pops == &dtrace_provider_ops) {
		/*
		 * DTrace registering itself:  the locks are already held.
		 */
		ASSERT(MUTEX_HELD(&dtrace_provider_lock));
		ASSERT(MUTEX_HELD(&dtrace_lock));
		ASSERT(dtrace_anon.dta_enabling == NULL);

		/*
		 * We make sure that the DTrace provider is at the head of
		 * the provider chain.
		 */
		provider->dtpv_next = dtrace_provider;
		dtrace_provider = provider;
		return (0);
	}

	mutex_enter(&dtrace_provider_lock);
	mutex_enter(&dtrace_lock);

	/*
	 * If there is at least one provider registered, we'll add this
	 * provider after the first provider.
	 */
	if (dtrace_provider != NULL) {
		provider->dtpv_next = dtrace_provider->dtpv_next;
		dtrace_provider->dtpv_next = provider;
	} else {
		dtrace_provider = provider;
	}

	if (dtrace_retained != NULL) {
		dtrace_enabling_provide(provider);

		/*
		 * Now we need to call dtrace_enabling_matchall() -- which
		 * will acquire cpu_lock and dtrace_lock.  We therefore need
		 * to drop all of our locks before calling into it...
		 */
		mutex_exit(&dtrace_lock);
		mutex_exit(&dtrace_provider_lock);
		dtrace_enabling_matchall();

		return (0);
	}

	mutex_exit(&dtrace_lock);
	mutex_exit(&dtrace_provider_lock);

	return (0);
}
8611
8612/*
8613 * Unregister the specified provider from the DTrace framework.  This should
8614 * generally be called by DTrace providers in their detach(9E) entry point.
8615 */
int
dtrace_unregister(dtrace_provider_id_t id)
{
	dtrace_provider_t *old = (dtrace_provider_t *)id;
	dtrace_provider_t *prev = NULL;
	int i, self = 0, noreap = 0;
	dtrace_probe_t *probe, *first = NULL;

	/*
	 * Returns 0 on success, EBUSY if the provider (or its probes) is
	 * still in use, or EAGAIN if a defunct provider's enablings are
	 * being reaped asynchronously and the caller should retry later.
	 */
	if (old->dtpv_pops.dtps_enable ==
	    (void (*)(void *, dtrace_id_t, void *))dtrace_nullop) {
		/*
		 * If DTrace itself is the provider, we're called with locks
		 * already held.
		 */
		ASSERT(old == dtrace_provider);
#if defined(sun)
		ASSERT(dtrace_devi != NULL);
#endif
		ASSERT(MUTEX_HELD(&dtrace_provider_lock));
		ASSERT(MUTEX_HELD(&dtrace_lock));
		self = 1;

		if (dtrace_provider->dtpv_next != NULL) {
			/*
			 * There's another provider here; return failure.
			 */
			return (EBUSY);
		}
	} else {
		mutex_enter(&dtrace_provider_lock);
#if defined(sun)
		mutex_enter(&mod_lock);
#endif
		mutex_enter(&dtrace_lock);
	}

	/*
	 * If anyone has /dev/dtrace open, or if there are anonymous enabled
	 * probes, we refuse to let providers slither away, unless this
	 * provider has already been explicitly invalidated.
	 */
	if (!old->dtpv_defunct &&
	    (dtrace_opens || (dtrace_anon.dta_state != NULL &&
	    dtrace_anon.dta_state->dts_necbs > 0))) {
		if (!self) {
			mutex_exit(&dtrace_lock);
#if defined(sun)
			mutex_exit(&mod_lock);
#endif
			mutex_exit(&dtrace_provider_lock);
		}
		return (EBUSY);
	}

	/*
	 * Attempt to destroy the probes associated with this provider.
	 * Finding any probe of ours with an ECB still attached aborts the
	 * unregistration.
	 */
	for (i = 0; i < dtrace_nprobes; i++) {
		if ((probe = dtrace_probes[i]) == NULL)
			continue;

		if (probe->dtpr_provider != old)
			continue;

		if (probe->dtpr_ecb == NULL)
			continue;

		/*
		 * If we are trying to unregister a defunct provider, and the
		 * provider was made defunct within the interval dictated by
		 * dtrace_unregister_defunct_reap, we'll (asynchronously)
		 * attempt to reap our enablings.  To denote that the provider
		 * should reattempt to unregister itself at some point in the
		 * future, we will return a differentiable error code (EAGAIN
		 * instead of EBUSY) in this case.
		 */
		if (dtrace_gethrtime() - old->dtpv_defunct >
		    dtrace_unregister_defunct_reap)
			noreap = 1;

		if (!self) {
			mutex_exit(&dtrace_lock);
#if defined(sun)
			mutex_exit(&mod_lock);
#endif
			mutex_exit(&dtrace_provider_lock);
		}

		if (noreap)
			return (EBUSY);

		/*
		 * Kick off the reap from taskq context; it will acquire its
		 * own locks, so ours must already be dropped (above).
		 */
		(void) taskq_dispatch(dtrace_taskq,
		    (task_func_t *)dtrace_enabling_reap, NULL, TQ_SLEEP);

		return (EAGAIN);
	}

	/*
	 * All of the probes for this provider are disabled; we can safely
	 * remove all of them from their hash chains and from the probe array.
	 */
	for (i = 0; i < dtrace_nprobes; i++) {
		if ((probe = dtrace_probes[i]) == NULL)
			continue;

		if (probe->dtpr_provider != old)
			continue;

		dtrace_probes[i] = NULL;

		dtrace_hash_remove(dtrace_bymod, probe);
		dtrace_hash_remove(dtrace_byfunc, probe);
		dtrace_hash_remove(dtrace_byname, probe);

		/*
		 * Chain the doomed probes together through dtpr_nextmod
		 * (reused here as a scratch link) so they can be destroyed
		 * after the dtrace_sync() below.
		 */
		if (first == NULL) {
			first = probe;
			probe->dtpr_nextmod = NULL;
		} else {
			probe->dtpr_nextmod = first;
			first = probe;
		}
	}

	/*
	 * The provider's probes have been removed from the hash chains and
	 * from the probe array.  Now issue a dtrace_sync() to be sure that
	 * everyone has cleared out from any probe array processing.
	 */
	dtrace_sync();

	for (probe = first; probe != NULL; probe = first) {
		first = probe->dtpr_nextmod;

		old->dtpv_pops.dtps_destroy(old->dtpv_arg, probe->dtpr_id,
		    probe->dtpr_arg);
		kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
		kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
		kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
#if defined(sun)
		vmem_free(dtrace_arena, (void *)(uintptr_t)(probe->dtpr_id), 1);
#else
		free_unr(dtrace_arena, probe->dtpr_id);
#endif
		kmem_free(probe, sizeof (dtrace_probe_t));
	}

	/*
	 * Unlink the provider from the provider list.  A provider that is
	 * not on the list at all indicates caller error and is fatal.
	 */
	if ((prev = dtrace_provider) == old) {
#if defined(sun)
		ASSERT(self || dtrace_devi == NULL);
		ASSERT(old->dtpv_next == NULL || dtrace_devi == NULL);
#endif
		dtrace_provider = old->dtpv_next;
	} else {
		while (prev != NULL && prev->dtpv_next != old)
			prev = prev->dtpv_next;

		if (prev == NULL) {
			panic("attempt to unregister non-existent "
			    "dtrace provider %p\n", (void *)id);
		}

		prev->dtpv_next = old->dtpv_next;
	}

	if (!self) {
		mutex_exit(&dtrace_lock);
#if defined(sun)
		mutex_exit(&mod_lock);
#endif
		mutex_exit(&dtrace_provider_lock);
	}

	kmem_free(old->dtpv_name, strlen(old->dtpv_name) + 1);
	kmem_free(old, sizeof (dtrace_provider_t));

	return (0);
}
8793
8794/*
8795 * Invalidate the specified provider.  All subsequent probe lookups for the
8796 * specified provider will fail, but its probes will not be removed.
8797 */
void
dtrace_invalidate(dtrace_provider_id_t id)
{
	dtrace_provider_t *pvp = (dtrace_provider_t *)id;

	/*
	 * The dtrace provider itself (identified by its nullop enable
	 * entry point) must never be invalidated.
	 */
	ASSERT(pvp->dtpv_pops.dtps_enable !=
	    (void (*)(void *, dtrace_id_t, void *))dtrace_nullop);

	mutex_enter(&dtrace_provider_lock);
	mutex_enter(&dtrace_lock);

	/*
	 * Record the invalidation time rather than a boolean flag:
	 * dtrace_unregister() compares this timestamp against
	 * dtrace_unregister_defunct_reap to decide between reaping
	 * enablings (EAGAIN) and failing outright (EBUSY).
	 */
	pvp->dtpv_defunct = dtrace_gethrtime();

	mutex_exit(&dtrace_lock);
	mutex_exit(&dtrace_provider_lock);
}
8814
8815/*
8816 * Indicate whether or not DTrace has attached.
8817 */
8818int
8819dtrace_attached(void)
8820{
8821	/*
8822	 * dtrace_provider will be non-NULL iff the DTrace driver has
8823	 * attached.  (It's non-NULL because DTrace is always itself a
8824	 * provider.)
8825	 */
8826	return (dtrace_provider != NULL);
8827}
8828
8829/*
8830 * Remove all the unenabled probes for the given provider.  This function is
8831 * not unlike dtrace_unregister(), except that it doesn't remove the provider
8832 * -- just as many of its associated probes as it can.
8833 */
int
dtrace_condense(dtrace_provider_id_t id)
{
	dtrace_provider_t *prov = (dtrace_provider_t *)id;
	int i;
	dtrace_probe_t *probe;

	/*
	 * Make sure this isn't the dtrace provider itself.
	 */
	ASSERT(prov->dtpv_pops.dtps_enable !=
	    (void (*)(void *, dtrace_id_t, void *))dtrace_nullop);

	mutex_enter(&dtrace_provider_lock);
	mutex_enter(&dtrace_lock);

	/*
	 * Attempt to destroy the probes associated with this provider.
	 * Unlike dtrace_unregister(), probes with an ECB attached are
	 * simply skipped rather than causing failure.
	 */
	for (i = 0; i < dtrace_nprobes; i++) {
		if ((probe = dtrace_probes[i]) == NULL)
			continue;

		if (probe->dtpr_provider != prov)
			continue;

		if (probe->dtpr_ecb != NULL)
			continue;

		dtrace_probes[i] = NULL;

		dtrace_hash_remove(dtrace_bymod, probe);
		dtrace_hash_remove(dtrace_byfunc, probe);
		dtrace_hash_remove(dtrace_byname, probe);

		/*
		 * i + 1 is the probe's ID: dtrace_probe_create() stores
		 * each probe at dtrace_probes[id - 1].
		 */
		prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, i + 1,
		    probe->dtpr_arg);
		kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
		kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
		kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
		kmem_free(probe, sizeof (dtrace_probe_t));
#if defined(sun)
		vmem_free(dtrace_arena, (void *)((uintptr_t)i + 1), 1);
#else
		free_unr(dtrace_arena, i + 1);
#endif
	}

	mutex_exit(&dtrace_lock);
	mutex_exit(&dtrace_provider_lock);

	return (0);
}
8887
8888/*
8889 * DTrace Probe Management Functions
8890 *
8891 * The functions in this section perform the DTrace probe management,
8892 * including functions to create probes, look-up probes, and call into the
8893 * providers to request that probes be provided.  Some of these functions are
8894 * in the Provider-to-Framework API; these functions can be identified by the
8895 * fact that they are not declared "static".
8896 */
8897
8898/*
8899 * Create a probe with the specified module name, function name, and name.
8900 */
dtrace_id_t
dtrace_probe_create(dtrace_provider_id_t prov, const char *mod,
    const char *func, const char *name, int aframes, void *arg)
{
	dtrace_probe_t *probe, **probes;
	dtrace_provider_t *provider = (dtrace_provider_t *)prov;
	dtrace_id_t id;

	/*
	 * The dtrace provider creates its probes with dtrace_lock already
	 * held; everyone else acquires it here.
	 */
	if (provider == dtrace_provider) {
		ASSERT(MUTEX_HELD(&dtrace_lock));
	} else {
		mutex_enter(&dtrace_lock);
	}

	/*
	 * Allocate a probe ID.  IDs are 1-based; the probe lives at
	 * dtrace_probes[id - 1].
	 */
#if defined(sun)
	id = (dtrace_id_t)(uintptr_t)vmem_alloc(dtrace_arena, 1,
	    VM_BESTFIT | VM_SLEEP);
#else
	id = alloc_unr(dtrace_arena);
#endif
	probe = kmem_zalloc(sizeof (dtrace_probe_t), KM_SLEEP);

	probe->dtpr_id = id;
	probe->dtpr_gen = dtrace_probegen++;
	probe->dtpr_mod = dtrace_strdup(mod);
	probe->dtpr_func = dtrace_strdup(func);
	probe->dtpr_name = dtrace_strdup(name);
	probe->dtpr_arg = arg;
	probe->dtpr_aframes = aframes;
	probe->dtpr_provider = provider;

	dtrace_hash_add(dtrace_bymod, probe);
	dtrace_hash_add(dtrace_byfunc, probe);
	dtrace_hash_add(dtrace_byname, probe);

	/*
	 * Grow the probe array (doubling it) if this ID doesn't fit.
	 */
	if (id - 1 >= dtrace_nprobes) {
		size_t osize = dtrace_nprobes * sizeof (dtrace_probe_t *);
		size_t nsize = osize << 1;

		if (nsize == 0) {
			ASSERT(osize == 0);
			ASSERT(dtrace_probes == NULL);
			nsize = sizeof (dtrace_probe_t *);
		}

		probes = kmem_zalloc(nsize, KM_SLEEP);

		if (dtrace_probes == NULL) {
			ASSERT(osize == 0);
			dtrace_probes = probes;
			dtrace_nprobes = 1;
		} else {
			dtrace_probe_t **oprobes = dtrace_probes;

			/*
			 * Publish the new array with a producer barrier so
			 * lock-free readers in probe context never see a
			 * partially-copied array, then dtrace_sync() before
			 * freeing the old one.
			 */
			bcopy(oprobes, probes, osize);
			dtrace_membar_producer();
			dtrace_probes = probes;

			dtrace_sync();

			/*
			 * All CPUs are now seeing the new probes array; we can
			 * safely free the old array.
			 */
			kmem_free(oprobes, osize);
			dtrace_nprobes <<= 1;
		}

		ASSERT(id - 1 < dtrace_nprobes);
	}

	ASSERT(dtrace_probes[id - 1] == NULL);
	dtrace_probes[id - 1] = probe;

	if (provider != dtrace_provider)
		mutex_exit(&dtrace_lock);

	return (id);
}
8980
8981static dtrace_probe_t *
8982dtrace_probe_lookup_id(dtrace_id_t id)
8983{
8984	ASSERT(MUTEX_HELD(&dtrace_lock));
8985
8986	if (id == 0 || id > dtrace_nprobes)
8987		return (NULL);
8988
8989	return (dtrace_probes[id - 1]);
8990}
8991
8992static int
8993dtrace_probe_lookup_match(dtrace_probe_t *probe, void *arg)
8994{
8995	*((dtrace_id_t *)arg) = probe->dtpr_id;
8996
8997	return (DTRACE_MATCH_DONE);
8998}
8999
9000/*
9001 * Look up a probe based on provider and one or more of module name, function
9002 * name and probe name.
9003 */
9004dtrace_id_t
9005dtrace_probe_lookup(dtrace_provider_id_t prid, char *mod,
9006    char *func, char *name)
9007{
9008	dtrace_probekey_t pkey;
9009	dtrace_id_t id;
9010	int match;
9011
9012	pkey.dtpk_prov = ((dtrace_provider_t *)prid)->dtpv_name;
9013	pkey.dtpk_pmatch = &dtrace_match_string;
9014	pkey.dtpk_mod = mod;
9015	pkey.dtpk_mmatch = mod ? &dtrace_match_string : &dtrace_match_nul;
9016	pkey.dtpk_func = func;
9017	pkey.dtpk_fmatch = func ? &dtrace_match_string : &dtrace_match_nul;
9018	pkey.dtpk_name = name;
9019	pkey.dtpk_nmatch = name ? &dtrace_match_string : &dtrace_match_nul;
9020	pkey.dtpk_id = DTRACE_IDNONE;
9021
9022	mutex_enter(&dtrace_lock);
9023	match = dtrace_match(&pkey, DTRACE_PRIV_ALL, 0, 0,
9024	    dtrace_probe_lookup_match, &id);
9025	mutex_exit(&dtrace_lock);
9026
9027	ASSERT(match == 1 || match == 0);
9028	return (match ? id : 0);
9029}
9030
9031/*
9032 * Returns the probe argument associated with the specified probe.
9033 */
9034void *
9035dtrace_probe_arg(dtrace_provider_id_t id, dtrace_id_t pid)
9036{
9037	dtrace_probe_t *probe;
9038	void *rval = NULL;
9039
9040	mutex_enter(&dtrace_lock);
9041
9042	if ((probe = dtrace_probe_lookup_id(pid)) != NULL &&
9043	    probe->dtpr_provider == (dtrace_provider_t *)id)
9044		rval = probe->dtpr_arg;
9045
9046	mutex_exit(&dtrace_lock);
9047
9048	return (rval);
9049}
9050
9051/*
9052 * Copy a probe into a probe description.
9053 */
static void
dtrace_probe_description(const dtrace_probe_t *prp, dtrace_probedesc_t *pdp)
{
	/*
	 * The destination is zeroed first, so each strncpy() below -- which
	 * copies at most size - 1 bytes -- always leaves a NUL-terminated
	 * string even if the source name is truncated.
	 */
	bzero(pdp, sizeof (dtrace_probedesc_t));
	pdp->dtpd_id = prp->dtpr_id;

	(void) strncpy(pdp->dtpd_provider,
	    prp->dtpr_provider->dtpv_name, DTRACE_PROVNAMELEN - 1);

	(void) strncpy(pdp->dtpd_mod, prp->dtpr_mod, DTRACE_MODNAMELEN - 1);
	(void) strncpy(pdp->dtpd_func, prp->dtpr_func, DTRACE_FUNCNAMELEN - 1);
	(void) strncpy(pdp->dtpd_name, prp->dtpr_name, DTRACE_NAMELEN - 1);
}
9067
9068/*
9069 * Called to indicate that a probe -- or probes -- should be provided by a
9070 * specfied provider.  If the specified description is NULL, the provider will
9071 * be told to provide all of its probes.  (This is done whenever a new
9072 * consumer comes along, or whenever a retained enabling is to be matched.) If
9073 * the specified description is non-NULL, the provider is given the
9074 * opportunity to dynamically provide the specified probe, allowing providers
9075 * to support the creation of probes on-the-fly.  (So-called _autocreated_
9076 * probes.)  If the provider is NULL, the operations will be applied to all
9077 * providers; if the provider is non-NULL the operations will only be applied
9078 * to the specified provider.  The dtrace_provider_lock must be held, and the
9079 * dtrace_lock must _not_ be held -- the provider's dtps_provide() operation
9080 * will need to grab the dtrace_lock when it reenters the framework through
9081 * dtrace_probe_lookup(), dtrace_probe_create(), etc.
9082 */
static void
dtrace_probe_provide(dtrace_probedesc_t *desc, dtrace_provider_t *prv)
{
#if defined(sun)
	modctl_t *ctl;
#endif
	int all = 0;

	ASSERT(MUTEX_HELD(&dtrace_provider_lock));

	/*
	 * A NULL provider means "all providers": start at the head of the
	 * provider list and let the loop below walk the chain.
	 */
	if (prv == NULL) {
		all = 1;
		prv = dtrace_provider;
	}

	do {
		/*
		 * First, call the blanket provide operation.
		 */
		prv->dtpv_pops.dtps_provide(prv->dtpv_arg, desc);

#if defined(sun)
		/*
		 * Now call the per-module provide operation.  We will grab
		 * mod_lock to prevent the list from being modified.  Note
		 * that this also prevents the mod_busy bits from changing.
		 * (mod_busy can only be changed with mod_lock held.)
		 */
		mutex_enter(&mod_lock);

		ctl = &modules;
		do {
			if (ctl->mod_busy || ctl->mod_mp == NULL)
				continue;

			prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);

		} while ((ctl = ctl->mod_next) != &modules);

		mutex_exit(&mod_lock);
#endif
	} while (all && (prv = prv->dtpv_next) != NULL);
}
9126
9127#if defined(sun)
9128/*
9129 * Iterate over each probe, and call the Framework-to-Provider API function
9130 * denoted by offs.
9131 */
static void
dtrace_probe_foreach(uintptr_t offs)
{
	dtrace_provider_t *prov;
	void (*func)(void *, dtrace_id_t, void *);
	dtrace_probe_t *probe;
	dtrace_icookie_t cookie;
	int i;

	/*
	 * We disable interrupts to walk through the probe array.  This is
	 * safe -- the dtrace_sync() in dtrace_unregister() assures that we
	 * won't see stale data.
	 */
	cookie = dtrace_interrupt_disable();

	for (i = 0; i < dtrace_nprobes; i++) {
		if ((probe = dtrace_probes[i]) == NULL)
			continue;

		if (probe->dtpr_ecb == NULL) {
			/*
			 * This probe isn't enabled -- don't call the function.
			 */
			continue;
		}

		/*
		 * offs is the byte offset of the desired entry point within
		 * the provider's dtpv_pops; read the function pointer at
		 * that offset.  The probe ID is the array index plus one.
		 */
		prov = probe->dtpr_provider;
		func = *((void(**)(void *, dtrace_id_t, void *))
		    ((uintptr_t)&prov->dtpv_pops + offs));

		func(prov->dtpv_arg, i + 1, probe->dtpr_arg);
	}

	dtrace_interrupt_enable(cookie);
}
9168#endif
9169
/*
 * Enable (create ECBs for) all probes matching the given description on
 * behalf of the given enabling.  A NULL description creates a single ECB
 * with a NULL probe.  Returns the result of dtrace_match() (or 0 for the
 * NULL-description case).
 */
static int
dtrace_probe_enable(dtrace_probedesc_t *desc, dtrace_enabling_t *enab)
{
	dtrace_probekey_t pkey;
	uint32_t priv;
	uid_t uid;
	zoneid_t zoneid;

	ASSERT(MUTEX_HELD(&dtrace_lock));
	dtrace_ecb_create_cache = NULL;

	if (desc == NULL) {
		/*
		 * If we're passed a NULL description, we're being asked to
		 * create an ECB with a NULL probe.
		 */
		(void) dtrace_ecb_create_enable(NULL, enab);
		return (0);
	}

	/*
	 * Derive the matching privileges from the enabling's credentials so
	 * that only probes visible to this consumer are matched.
	 */
	dtrace_probekey(desc, &pkey);
	dtrace_cred2priv(enab->dten_vstate->dtvs_state->dts_cred.dcr_cred,
	    &priv, &uid, &zoneid);

	return (dtrace_match(&pkey, priv, uid, zoneid, dtrace_ecb_create_enable,
	    enab));
}
9197
9198/*
9199 * DTrace Helper Provider Functions
9200 */
9201static void
9202dtrace_dofattr2attr(dtrace_attribute_t *attr, const dof_attr_t dofattr)
9203{
9204	attr->dtat_name = DOF_ATTR_NAME(dofattr);
9205	attr->dtat_data = DOF_ATTR_DATA(dofattr);
9206	attr->dtat_class = DOF_ATTR_CLASS(dofattr);
9207}
9208
9209static void
9210dtrace_dofprov2hprov(dtrace_helper_provdesc_t *hprov,
9211    const dof_provider_t *dofprov, char *strtab)
9212{
9213	hprov->dthpv_provname = strtab + dofprov->dofpv_name;
9214	dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_provider,
9215	    dofprov->dofpv_provattr);
9216	dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_mod,
9217	    dofprov->dofpv_modattr);
9218	dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_func,
9219	    dofprov->dofpv_funcattr);
9220	dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_name,
9221	    dofprov->dofpv_nameattr);
9222	dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_args,
9223	    dofprov->dofpv_argsattr);
9224}
9225
/*
 * Process one DOF_SECT_PROVIDER section for the given process: register the
 * provider with the meta provider and create each of its probes.  Silently
 * returns if the meta provider declines to provide for this pid.
 */
static void
dtrace_helper_provide_one(dof_helper_t *dhp, dof_sec_t *sec, pid_t pid)
{
	uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
	dof_hdr_t *dof = (dof_hdr_t *)daddr;
	dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
	dof_provider_t *provider;
	dof_probe_t *probe;
	uint32_t *off, *enoff;
	uint8_t *arg;
	char *strtab;
	uint_t i, nprobes;
	dtrace_helper_provdesc_t dhpv;
	dtrace_helper_probedesc_t dhpb;
	dtrace_meta_t *meta = dtrace_meta_pid;
	dtrace_mops_t *mops = &meta->dtm_mops;
	void *parg;

	/*
	 * Resolve the sections referenced by the provider section: the
	 * string table, probes, probe arguments and probe offsets.
	 */
	provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
	str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
	    provider->dofpv_strtab * dof->dofh_secsize);
	prb_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
	    provider->dofpv_probes * dof->dofh_secsize);
	arg_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
	    provider->dofpv_prargs * dof->dofh_secsize);
	off_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
	    provider->dofpv_proffs * dof->dofh_secsize);

	strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
	off = (uint32_t *)(uintptr_t)(daddr + off_sec->dofs_offset);
	arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
	enoff = NULL;

	/*
	 * See dtrace_helper_provider_validate().
	 */
	if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
	    provider->dofpv_prenoffs != DOF_SECT_NONE) {
		enoff_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
		    provider->dofpv_prenoffs * dof->dofh_secsize);
		enoff = (uint32_t *)(uintptr_t)(daddr + enoff_sec->dofs_offset);
	}

	nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;

	/*
	 * Create the provider.
	 */
	dtrace_dofprov2hprov(&dhpv, provider, strtab);

	if ((parg = mops->dtms_provide_pid(meta->dtm_arg, &dhpv, pid)) == NULL)
		return;

	meta->dtm_count++;

	/*
	 * Create the probes.
	 */
	for (i = 0; i < nprobes; i++) {
		probe = (dof_probe_t *)(uintptr_t)(daddr +
		    prb_sec->dofs_offset + i * prb_sec->dofs_entsize);

		dhpb.dthpb_mod = dhp->dofhp_mod;
		dhpb.dthpb_func = strtab + probe->dofpr_func;
		dhpb.dthpb_name = strtab + probe->dofpr_name;
		dhpb.dthpb_base = probe->dofpr_addr;
		dhpb.dthpb_offs = off + probe->dofpr_offidx;
		dhpb.dthpb_noffs = probe->dofpr_noffs;
		if (enoff != NULL) {
			dhpb.dthpb_enoffs = enoff + probe->dofpr_enoffidx;
			dhpb.dthpb_nenoffs = probe->dofpr_nenoffs;
		} else {
			dhpb.dthpb_enoffs = NULL;
			dhpb.dthpb_nenoffs = 0;
		}
		dhpb.dthpb_args = arg + probe->dofpr_argidx;
		dhpb.dthpb_nargc = probe->dofpr_nargc;
		dhpb.dthpb_xargc = probe->dofpr_xargc;
		dhpb.dthpb_ntypes = strtab + probe->dofpr_nargv;
		dhpb.dthpb_xtypes = strtab + probe->dofpr_xargv;

		mops->dtms_create_probe(meta->dtm_arg, parg, &dhpb);
	}
}
9310
/*
 * Provide all helper providers contained in the given DOF image to the meta
 * provider on behalf of the given process.
 */
static void
dtrace_helper_provide(dof_helper_t *dhp, pid_t pid)
{
	uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
	dof_hdr_t *dof = (dof_hdr_t *)daddr;
	int i;

	ASSERT(MUTEX_HELD(&dtrace_meta_lock));

	for (i = 0; i < dof->dofh_secnum; i++) {
		dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
		    dof->dofh_secoff + i * dof->dofh_secsize);

		if (sec->dofs_type != DOF_SECT_PROVIDER)
			continue;

		dtrace_helper_provide_one(dhp, sec, pid);
	}

	/*
	 * We may have just created probes, so we must now rematch against
	 * any retained enablings.  Note that this call will acquire both
	 * cpu_lock and dtrace_lock; the fact that we are holding
	 * dtrace_meta_lock now is what defines the ordering with respect to
	 * these three locks.
	 */
	dtrace_enabling_matchall();
}
9339
9340static void
9341dtrace_helper_provider_remove_one(dof_helper_t *dhp, dof_sec_t *sec, pid_t pid)
9342{
9343	uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
9344	dof_hdr_t *dof = (dof_hdr_t *)daddr;
9345	dof_sec_t *str_sec;
9346	dof_provider_t *provider;
9347	char *strtab;
9348	dtrace_helper_provdesc_t dhpv;
9349	dtrace_meta_t *meta = dtrace_meta_pid;
9350	dtrace_mops_t *mops = &meta->dtm_mops;
9351
9352	provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
9353	str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9354	    provider->dofpv_strtab * dof->dofh_secsize);
9355
9356	strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
9357
9358	/*
9359	 * Create the provider.
9360	 */
9361	dtrace_dofprov2hprov(&dhpv, provider, strtab);
9362
9363	mops->dtms_remove_pid(meta->dtm_arg, &dhpv, pid);
9364
9365	meta->dtm_count--;
9366}
9367
9368static void
9369dtrace_helper_provider_remove(dof_helper_t *dhp, pid_t pid)
9370{
9371	uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
9372	dof_hdr_t *dof = (dof_hdr_t *)daddr;
9373	int i;
9374
9375	ASSERT(MUTEX_HELD(&dtrace_meta_lock));
9376
9377	for (i = 0; i < dof->dofh_secnum; i++) {
9378		dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
9379		    dof->dofh_secoff + i * dof->dofh_secsize);
9380
9381		if (sec->dofs_type != DOF_SECT_PROVIDER)
9382			continue;
9383
9384		dtrace_helper_provider_remove_one(dhp, sec, pid);
9385	}
9386}
9387
9388/*
9389 * DTrace Meta Provider-to-Framework API Functions
9390 *
9391 * These functions implement the Meta Provider-to-Framework API, as described
9392 * in <sys/dtrace.h>.
9393 */
9394int
9395dtrace_meta_register(const char *name, const dtrace_mops_t *mops, void *arg,
9396    dtrace_meta_provider_id_t *idp)
9397{
9398	dtrace_meta_t *meta;
9399	dtrace_helpers_t *help, *next;
9400	int i;
9401
9402	*idp = DTRACE_METAPROVNONE;
9403
9404	/*
9405	 * We strictly don't need the name, but we hold onto it for
9406	 * debuggability. All hail error queues!
9407	 */
9408	if (name == NULL) {
9409		cmn_err(CE_WARN, "failed to register meta-provider: "
9410		    "invalid name");
9411		return (EINVAL);
9412	}
9413
9414	if (mops == NULL ||
9415	    mops->dtms_create_probe == NULL ||
9416	    mops->dtms_provide_pid == NULL ||
9417	    mops->dtms_remove_pid == NULL) {
9418		cmn_err(CE_WARN, "failed to register meta-register %s: "
9419		    "invalid ops", name);
9420		return (EINVAL);
9421	}
9422
9423	meta = kmem_zalloc(sizeof (dtrace_meta_t), KM_SLEEP);
9424	meta->dtm_mops = *mops;
9425	meta->dtm_name = kmem_alloc(strlen(name) + 1, KM_SLEEP);
9426	(void) strcpy(meta->dtm_name, name);
9427	meta->dtm_arg = arg;
9428
9429	mutex_enter(&dtrace_meta_lock);
9430	mutex_enter(&dtrace_lock);
9431
9432	if (dtrace_meta_pid != NULL) {
9433		mutex_exit(&dtrace_lock);
9434		mutex_exit(&dtrace_meta_lock);
9435		cmn_err(CE_WARN, "failed to register meta-register %s: "
9436		    "user-land meta-provider exists", name);
9437		kmem_free(meta->dtm_name, strlen(meta->dtm_name) + 1);
9438		kmem_free(meta, sizeof (dtrace_meta_t));
9439		return (EINVAL);
9440	}
9441
9442	dtrace_meta_pid = meta;
9443	*idp = (dtrace_meta_provider_id_t)meta;
9444
9445	/*
9446	 * If there are providers and probes ready to go, pass them
9447	 * off to the new meta provider now.
9448	 */
9449
9450	help = dtrace_deferred_pid;
9451	dtrace_deferred_pid = NULL;
9452
9453	mutex_exit(&dtrace_lock);
9454
9455	while (help != NULL) {
9456		for (i = 0; i < help->dthps_nprovs; i++) {
9457			dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
9458			    help->dthps_pid);
9459		}
9460
9461		next = help->dthps_next;
9462		help->dthps_next = NULL;
9463		help->dthps_prev = NULL;
9464		help->dthps_deferred = 0;
9465		help = next;
9466	}
9467
9468	mutex_exit(&dtrace_meta_lock);
9469
9470	return (0);
9471}
9472
/*
 * Unregister the user-land meta provider.  Returns 0 on success or EBUSY if
 * the meta provider still has providers registered through it.  Panics if
 * the given ID is not the registered meta provider.
 */
int
dtrace_meta_unregister(dtrace_meta_provider_id_t id)
{
	dtrace_meta_t **pp, *old = (dtrace_meta_t *)id;

	mutex_enter(&dtrace_meta_lock);
	mutex_enter(&dtrace_lock);

	if (old == dtrace_meta_pid) {
		pp = &dtrace_meta_pid;
	} else {
		panic("attempt to unregister non-existent "
		    "dtrace meta-provider %p\n", (void *)old);
	}

	/*
	 * dtm_count tracks providers handed to the meta provider (see
	 * dtrace_helper_provide_one()); a nonzero count means it is still
	 * in use.
	 */
	if (old->dtm_count != 0) {
		mutex_exit(&dtrace_lock);
		mutex_exit(&dtrace_meta_lock);
		return (EBUSY);
	}

	*pp = NULL;

	mutex_exit(&dtrace_lock);
	mutex_exit(&dtrace_meta_lock);

	kmem_free(old->dtm_name, strlen(old->dtm_name) + 1);
	kmem_free(old, sizeof (dtrace_meta_t));

	return (0);
}
9504
9505
9506/*
9507 * DTrace DIF Object Functions
9508 */
/*
 * Report a DIF object validation error at the given program counter.
 * Always returns 1 so that callers (see dtrace_difo_validate()) can
 * accumulate error counts via "err += efunc(...)".
 */
static int
dtrace_difo_err(uint_t pc, const char *format, ...)
{
	if (dtrace_err_verbose) {
		va_list alist;

		(void) uprintf("dtrace DIF object error: [%u]: ", pc);
		va_start(alist, format);
		(void) vuprintf(format, alist);
		va_end(alist);
	}

#ifdef DTRACE_ERRDEBUG
	dtrace_errdebug(format);
#endif
	return (1);
}
9526
9527/*
9528 * Validate a DTrace DIF object by checking the IR instructions.  The following
9529 * rules are currently enforced by dtrace_difo_validate():
9530 *
9531 * 1. Each instruction must have a valid opcode
9532 * 2. Each register, string, variable, or subroutine reference must be valid
9533 * 3. No instruction can modify register %r0 (must be zero)
9534 * 4. All instruction reserved bits must be set to zero
9535 * 5. The last instruction must be a "ret" instruction
9536 * 6. All branch targets must reference a valid instruction _after_ the branch
9537 */
9538static int
9539dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs,
9540    cred_t *cr)
9541{
9542	int err = 0, i;
9543	int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
9544	int kcheckload;
9545	uint_t pc;
9546
9547	kcheckload = cr == NULL ||
9548	    (vstate->dtvs_state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) == 0;
9549
9550	dp->dtdo_destructive = 0;
9551
9552	for (pc = 0; pc < dp->dtdo_len && err == 0; pc++) {
9553		dif_instr_t instr = dp->dtdo_buf[pc];
9554
9555		uint_t r1 = DIF_INSTR_R1(instr);
9556		uint_t r2 = DIF_INSTR_R2(instr);
9557		uint_t rd = DIF_INSTR_RD(instr);
9558		uint_t rs = DIF_INSTR_RS(instr);
9559		uint_t label = DIF_INSTR_LABEL(instr);
9560		uint_t v = DIF_INSTR_VAR(instr);
9561		uint_t subr = DIF_INSTR_SUBR(instr);
9562		uint_t type = DIF_INSTR_TYPE(instr);
9563		uint_t op = DIF_INSTR_OP(instr);
9564
9565		switch (op) {
9566		case DIF_OP_OR:
9567		case DIF_OP_XOR:
9568		case DIF_OP_AND:
9569		case DIF_OP_SLL:
9570		case DIF_OP_SRL:
9571		case DIF_OP_SRA:
9572		case DIF_OP_SUB:
9573		case DIF_OP_ADD:
9574		case DIF_OP_MUL:
9575		case DIF_OP_SDIV:
9576		case DIF_OP_UDIV:
9577		case DIF_OP_SREM:
9578		case DIF_OP_UREM:
9579		case DIF_OP_COPYS:
9580			if (r1 >= nregs)
9581				err += efunc(pc, "invalid register %u\n", r1);
9582			if (r2 >= nregs)
9583				err += efunc(pc, "invalid register %u\n", r2);
9584			if (rd >= nregs)
9585				err += efunc(pc, "invalid register %u\n", rd);
9586			if (rd == 0)
9587				err += efunc(pc, "cannot write to %r0\n");
9588			break;
9589		case DIF_OP_NOT:
9590		case DIF_OP_MOV:
9591		case DIF_OP_ALLOCS:
9592			if (r1 >= nregs)
9593				err += efunc(pc, "invalid register %u\n", r1);
9594			if (r2 != 0)
9595				err += efunc(pc, "non-zero reserved bits\n");
9596			if (rd >= nregs)
9597				err += efunc(pc, "invalid register %u\n", rd);
9598			if (rd == 0)
9599				err += efunc(pc, "cannot write to %r0\n");
9600			break;
9601		case DIF_OP_LDSB:
9602		case DIF_OP_LDSH:
9603		case DIF_OP_LDSW:
9604		case DIF_OP_LDUB:
9605		case DIF_OP_LDUH:
9606		case DIF_OP_LDUW:
9607		case DIF_OP_LDX:
9608			if (r1 >= nregs)
9609				err += efunc(pc, "invalid register %u\n", r1);
9610			if (r2 != 0)
9611				err += efunc(pc, "non-zero reserved bits\n");
9612			if (rd >= nregs)
9613				err += efunc(pc, "invalid register %u\n", rd);
9614			if (rd == 0)
9615				err += efunc(pc, "cannot write to %r0\n");
9616			if (kcheckload)
9617				dp->dtdo_buf[pc] = DIF_INSTR_LOAD(op +
9618				    DIF_OP_RLDSB - DIF_OP_LDSB, r1, rd);
9619			break;
9620		case DIF_OP_RLDSB:
9621		case DIF_OP_RLDSH:
9622		case DIF_OP_RLDSW:
9623		case DIF_OP_RLDUB:
9624		case DIF_OP_RLDUH:
9625		case DIF_OP_RLDUW:
9626		case DIF_OP_RLDX:
9627			if (r1 >= nregs)
9628				err += efunc(pc, "invalid register %u\n", r1);
9629			if (r2 != 0)
9630				err += efunc(pc, "non-zero reserved bits\n");
9631			if (rd >= nregs)
9632				err += efunc(pc, "invalid register %u\n", rd);
9633			if (rd == 0)
9634				err += efunc(pc, "cannot write to %r0\n");
9635			break;
9636		case DIF_OP_ULDSB:
9637		case DIF_OP_ULDSH:
9638		case DIF_OP_ULDSW:
9639		case DIF_OP_ULDUB:
9640		case DIF_OP_ULDUH:
9641		case DIF_OP_ULDUW:
9642		case DIF_OP_ULDX:
9643			if (r1 >= nregs)
9644				err += efunc(pc, "invalid register %u\n", r1);
9645			if (r2 != 0)
9646				err += efunc(pc, "non-zero reserved bits\n");
9647			if (rd >= nregs)
9648				err += efunc(pc, "invalid register %u\n", rd);
9649			if (rd == 0)
9650				err += efunc(pc, "cannot write to %r0\n");
9651			break;
9652		case DIF_OP_STB:
9653		case DIF_OP_STH:
9654		case DIF_OP_STW:
9655		case DIF_OP_STX:
9656			if (r1 >= nregs)
9657				err += efunc(pc, "invalid register %u\n", r1);
9658			if (r2 != 0)
9659				err += efunc(pc, "non-zero reserved bits\n");
9660			if (rd >= nregs)
9661				err += efunc(pc, "invalid register %u\n", rd);
9662			if (rd == 0)
9663				err += efunc(pc, "cannot write to 0 address\n");
9664			break;
9665		case DIF_OP_CMP:
9666		case DIF_OP_SCMP:
9667			if (r1 >= nregs)
9668				err += efunc(pc, "invalid register %u\n", r1);
9669			if (r2 >= nregs)
9670				err += efunc(pc, "invalid register %u\n", r2);
9671			if (rd != 0)
9672				err += efunc(pc, "non-zero reserved bits\n");
9673			break;
9674		case DIF_OP_TST:
9675			if (r1 >= nregs)
9676				err += efunc(pc, "invalid register %u\n", r1);
9677			if (r2 != 0 || rd != 0)
9678				err += efunc(pc, "non-zero reserved bits\n");
9679			break;
9680		case DIF_OP_BA:
9681		case DIF_OP_BE:
9682		case DIF_OP_BNE:
9683		case DIF_OP_BG:
9684		case DIF_OP_BGU:
9685		case DIF_OP_BGE:
9686		case DIF_OP_BGEU:
9687		case DIF_OP_BL:
9688		case DIF_OP_BLU:
9689		case DIF_OP_BLE:
9690		case DIF_OP_BLEU:
9691			if (label >= dp->dtdo_len) {
9692				err += efunc(pc, "invalid branch target %u\n",
9693				    label);
9694			}
9695			if (label <= pc) {
9696				err += efunc(pc, "backward branch to %u\n",
9697				    label);
9698			}
9699			break;
9700		case DIF_OP_RET:
9701			if (r1 != 0 || r2 != 0)
9702				err += efunc(pc, "non-zero reserved bits\n");
9703			if (rd >= nregs)
9704				err += efunc(pc, "invalid register %u\n", rd);
9705			break;
9706		case DIF_OP_NOP:
9707		case DIF_OP_POPTS:
9708		case DIF_OP_FLUSHTS:
9709			if (r1 != 0 || r2 != 0 || rd != 0)
9710				err += efunc(pc, "non-zero reserved bits\n");
9711			break;
9712		case DIF_OP_SETX:
9713			if (DIF_INSTR_INTEGER(instr) >= dp->dtdo_intlen) {
9714				err += efunc(pc, "invalid integer ref %u\n",
9715				    DIF_INSTR_INTEGER(instr));
9716			}
9717			if (rd >= nregs)
9718				err += efunc(pc, "invalid register %u\n", rd);
9719			if (rd == 0)
9720				err += efunc(pc, "cannot write to %r0\n");
9721			break;
9722		case DIF_OP_SETS:
9723			if (DIF_INSTR_STRING(instr) >= dp->dtdo_strlen) {
9724				err += efunc(pc, "invalid string ref %u\n",
9725				    DIF_INSTR_STRING(instr));
9726			}
9727			if (rd >= nregs)
9728				err += efunc(pc, "invalid register %u\n", rd);
9729			if (rd == 0)
9730				err += efunc(pc, "cannot write to %r0\n");
9731			break;
9732		case DIF_OP_LDGA:
9733		case DIF_OP_LDTA:
9734			if (r1 > DIF_VAR_ARRAY_MAX)
9735				err += efunc(pc, "invalid array %u\n", r1);
9736			if (r2 >= nregs)
9737				err += efunc(pc, "invalid register %u\n", r2);
9738			if (rd >= nregs)
9739				err += efunc(pc, "invalid register %u\n", rd);
9740			if (rd == 0)
9741				err += efunc(pc, "cannot write to %r0\n");
9742			break;
9743		case DIF_OP_LDGS:
9744		case DIF_OP_LDTS:
9745		case DIF_OP_LDLS:
9746		case DIF_OP_LDGAA:
9747		case DIF_OP_LDTAA:
9748			if (v < DIF_VAR_OTHER_MIN || v > DIF_VAR_OTHER_MAX)
9749				err += efunc(pc, "invalid variable %u\n", v);
9750			if (rd >= nregs)
9751				err += efunc(pc, "invalid register %u\n", rd);
9752			if (rd == 0)
9753				err += efunc(pc, "cannot write to %r0\n");
9754			break;
9755		case DIF_OP_STGS:
9756		case DIF_OP_STTS:
9757		case DIF_OP_STLS:
9758		case DIF_OP_STGAA:
9759		case DIF_OP_STTAA:
9760			if (v < DIF_VAR_OTHER_UBASE || v > DIF_VAR_OTHER_MAX)
9761				err += efunc(pc, "invalid variable %u\n", v);
9762			if (rs >= nregs)
9763				err += efunc(pc, "invalid register %u\n", rd);
9764			break;
9765		case DIF_OP_CALL:
9766			if (subr > DIF_SUBR_MAX)
9767				err += efunc(pc, "invalid subr %u\n", subr);
9768			if (rd >= nregs)
9769				err += efunc(pc, "invalid register %u\n", rd);
9770			if (rd == 0)
9771				err += efunc(pc, "cannot write to %r0\n");
9772
9773			if (subr == DIF_SUBR_COPYOUT ||
9774			    subr == DIF_SUBR_COPYOUTSTR) {
9775				dp->dtdo_destructive = 1;
9776			}
9777
9778			if (subr == DIF_SUBR_GETF) {
9779				/*
9780				 * If we have a getf() we need to record that
9781				 * in our state.  Note that our state can be
9782				 * NULL if this is a helper -- but in that
9783				 * case, the call to getf() is itself illegal,
9784				 * and will be caught (slightly later) when
9785				 * the helper is validated.
9786				 */
9787				if (vstate->dtvs_state != NULL)
9788					vstate->dtvs_state->dts_getf++;
9789			}
9790
9791			break;
9792		case DIF_OP_PUSHTR:
9793			if (type != DIF_TYPE_STRING && type != DIF_TYPE_CTF)
9794				err += efunc(pc, "invalid ref type %u\n", type);
9795			if (r2 >= nregs)
9796				err += efunc(pc, "invalid register %u\n", r2);
9797			if (rs >= nregs)
9798				err += efunc(pc, "invalid register %u\n", rs);
9799			break;
9800		case DIF_OP_PUSHTV:
9801			if (type != DIF_TYPE_CTF)
9802				err += efunc(pc, "invalid val type %u\n", type);
9803			if (r2 >= nregs)
9804				err += efunc(pc, "invalid register %u\n", r2);
9805			if (rs >= nregs)
9806				err += efunc(pc, "invalid register %u\n", rs);
9807			break;
9808		default:
9809			err += efunc(pc, "invalid opcode %u\n",
9810			    DIF_INSTR_OP(instr));
9811		}
9812	}
9813
9814	if (dp->dtdo_len != 0 &&
9815	    DIF_INSTR_OP(dp->dtdo_buf[dp->dtdo_len - 1]) != DIF_OP_RET) {
9816		err += efunc(dp->dtdo_len - 1,
9817		    "expected 'ret' as last DIF instruction\n");
9818	}
9819
9820	if (!(dp->dtdo_rtype.dtdt_flags & (DIF_TF_BYREF | DIF_TF_BYUREF))) {
9821		/*
9822		 * If we're not returning by reference, the size must be either
9823		 * 0 or the size of one of the base types.
9824		 */
9825		switch (dp->dtdo_rtype.dtdt_size) {
9826		case 0:
9827		case sizeof (uint8_t):
9828		case sizeof (uint16_t):
9829		case sizeof (uint32_t):
9830		case sizeof (uint64_t):
9831			break;
9832
9833		default:
9834			err += efunc(dp->dtdo_len - 1, "bad return size\n");
9835		}
9836	}
9837
9838	for (i = 0; i < dp->dtdo_varlen && err == 0; i++) {
9839		dtrace_difv_t *v = &dp->dtdo_vartab[i], *existing = NULL;
9840		dtrace_diftype_t *vt, *et;
9841		uint_t id, ndx;
9842
9843		if (v->dtdv_scope != DIFV_SCOPE_GLOBAL &&
9844		    v->dtdv_scope != DIFV_SCOPE_THREAD &&
9845		    v->dtdv_scope != DIFV_SCOPE_LOCAL) {
9846			err += efunc(i, "unrecognized variable scope %d\n",
9847			    v->dtdv_scope);
9848			break;
9849		}
9850
9851		if (v->dtdv_kind != DIFV_KIND_ARRAY &&
9852		    v->dtdv_kind != DIFV_KIND_SCALAR) {
9853			err += efunc(i, "unrecognized variable type %d\n",
9854			    v->dtdv_kind);
9855			break;
9856		}
9857
9858		if ((id = v->dtdv_id) > DIF_VARIABLE_MAX) {
9859			err += efunc(i, "%d exceeds variable id limit\n", id);
9860			break;
9861		}
9862
9863		if (id < DIF_VAR_OTHER_UBASE)
9864			continue;
9865
9866		/*
9867		 * For user-defined variables, we need to check that this
9868		 * definition is identical to any previous definition that we
9869		 * encountered.
9870		 */
9871		ndx = id - DIF_VAR_OTHER_UBASE;
9872
9873		switch (v->dtdv_scope) {
9874		case DIFV_SCOPE_GLOBAL:
9875			if (ndx < vstate->dtvs_nglobals) {
9876				dtrace_statvar_t *svar;
9877
9878				if ((svar = vstate->dtvs_globals[ndx]) != NULL)
9879					existing = &svar->dtsv_var;
9880			}
9881
9882			break;
9883
9884		case DIFV_SCOPE_THREAD:
9885			if (ndx < vstate->dtvs_ntlocals)
9886				existing = &vstate->dtvs_tlocals[ndx];
9887			break;
9888
9889		case DIFV_SCOPE_LOCAL:
9890			if (ndx < vstate->dtvs_nlocals) {
9891				dtrace_statvar_t *svar;
9892
9893				if ((svar = vstate->dtvs_locals[ndx]) != NULL)
9894					existing = &svar->dtsv_var;
9895			}
9896
9897			break;
9898		}
9899
9900		vt = &v->dtdv_type;
9901
9902		if (vt->dtdt_flags & DIF_TF_BYREF) {
9903			if (vt->dtdt_size == 0) {
9904				err += efunc(i, "zero-sized variable\n");
9905				break;
9906			}
9907
9908			if (v->dtdv_scope == DIFV_SCOPE_GLOBAL &&
9909			    vt->dtdt_size > dtrace_global_maxsize) {
9910				err += efunc(i, "oversized by-ref global\n");
9911				break;
9912			}
9913		}
9914
9915		if (existing == NULL || existing->dtdv_id == 0)
9916			continue;
9917
9918		ASSERT(existing->dtdv_id == v->dtdv_id);
9919		ASSERT(existing->dtdv_scope == v->dtdv_scope);
9920
9921		if (existing->dtdv_kind != v->dtdv_kind)
9922			err += efunc(i, "%d changed variable kind\n", id);
9923
9924		et = &existing->dtdv_type;
9925
9926		if (vt->dtdt_flags != et->dtdt_flags) {
9927			err += efunc(i, "%d changed variable type flags\n", id);
9928			break;
9929		}
9930
9931		if (vt->dtdt_size != 0 && vt->dtdt_size != et->dtdt_size) {
9932			err += efunc(i, "%d changed variable type size\n", id);
9933			break;
9934		}
9935	}
9936
9937	return (err);
9938}
9939
9940/*
9941 * Validate a DTrace DIF object that it is to be used as a helper.  Helpers
9942 * are much more constrained than normal DIFOs.  Specifically, they may
9943 * not:
9944 *
9945 * 1. Make calls to subroutines other than copyin(), copyinstr() or
9946 *    miscellaneous string routines
9947 * 2. Access DTrace variables other than the args[] array, and the
9948 *    curthread, pid, ppid, tid, execname, zonename, uid and gid variables.
9949 * 3. Have thread-local variables.
9950 * 4. Have dynamic variables.
9951 */
static int
dtrace_difo_validate_helper(dtrace_difo_t *dp)
{
	int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
	int err = 0;
	uint_t pc;

	/*
	 * Walk every instruction, rejecting any operation that a helper is
	 * not permitted to perform.  The general validator has already
	 * checked operand registers and table references; this pass only
	 * enforces the (much tighter) helper policy described above.
	 */
	for (pc = 0; pc < dp->dtdo_len; pc++) {
		dif_instr_t instr = dp->dtdo_buf[pc];

		uint_t v = DIF_INSTR_VAR(instr);
		uint_t subr = DIF_INSTR_SUBR(instr);
		uint_t op = DIF_INSTR_OP(instr);

		switch (op) {
		/*
		 * Arithmetic, logical, load/store, comparison, branch,
		 * tuple-stack and clause-local operations are always
		 * permitted in helpers.
		 */
		case DIF_OP_OR:
		case DIF_OP_XOR:
		case DIF_OP_AND:
		case DIF_OP_SLL:
		case DIF_OP_SRL:
		case DIF_OP_SRA:
		case DIF_OP_SUB:
		case DIF_OP_ADD:
		case DIF_OP_MUL:
		case DIF_OP_SDIV:
		case DIF_OP_UDIV:
		case DIF_OP_SREM:
		case DIF_OP_UREM:
		case DIF_OP_COPYS:
		case DIF_OP_NOT:
		case DIF_OP_MOV:
		case DIF_OP_RLDSB:
		case DIF_OP_RLDSH:
		case DIF_OP_RLDSW:
		case DIF_OP_RLDUB:
		case DIF_OP_RLDUH:
		case DIF_OP_RLDUW:
		case DIF_OP_RLDX:
		case DIF_OP_ULDSB:
		case DIF_OP_ULDSH:
		case DIF_OP_ULDSW:
		case DIF_OP_ULDUB:
		case DIF_OP_ULDUH:
		case DIF_OP_ULDUW:
		case DIF_OP_ULDX:
		case DIF_OP_STB:
		case DIF_OP_STH:
		case DIF_OP_STW:
		case DIF_OP_STX:
		case DIF_OP_ALLOCS:
		case DIF_OP_CMP:
		case DIF_OP_SCMP:
		case DIF_OP_TST:
		case DIF_OP_BA:
		case DIF_OP_BE:
		case DIF_OP_BNE:
		case DIF_OP_BG:
		case DIF_OP_BGU:
		case DIF_OP_BGE:
		case DIF_OP_BGEU:
		case DIF_OP_BL:
		case DIF_OP_BLU:
		case DIF_OP_BLE:
		case DIF_OP_BLEU:
		case DIF_OP_RET:
		case DIF_OP_NOP:
		case DIF_OP_POPTS:
		case DIF_OP_FLUSHTS:
		case DIF_OP_SETX:
		case DIF_OP_SETS:
		case DIF_OP_LDGA:
		case DIF_OP_LDLS:
		case DIF_OP_STGS:
		case DIF_OP_STLS:
		case DIF_OP_PUSHTR:
		case DIF_OP_PUSHTV:
			break;

		case DIF_OP_LDGS:
			/*
			 * Global-variable loads are restricted:  helpers may
			 * read user-defined variables, the args[] array, and
			 * a small fixed set of process-context variables.
			 */
			if (v >= DIF_VAR_OTHER_UBASE)
				break;

			if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9)
				break;

			if (v == DIF_VAR_CURTHREAD || v == DIF_VAR_PID ||
			    v == DIF_VAR_PPID || v == DIF_VAR_TID ||
			    v == DIF_VAR_EXECARGS ||
			    v == DIF_VAR_EXECNAME || v == DIF_VAR_ZONENAME ||
			    v == DIF_VAR_UID || v == DIF_VAR_GID)
				break;

			err += efunc(pc, "illegal variable %u\n", v);
			break;

		/* Helpers may have neither thread-local nor dynamic variables. */
		case DIF_OP_LDTA:
		case DIF_OP_LDTS:
		case DIF_OP_LDGAA:
		case DIF_OP_LDTAA:
			err += efunc(pc, "illegal dynamic variable load\n");
			break;

		case DIF_OP_STTS:
		case DIF_OP_STGAA:
		case DIF_OP_STTAA:
			err += efunc(pc, "illegal dynamic variable store\n");
			break;

		case DIF_OP_CALL:
			/*
			 * Only copyin()/copyinstr() and side-effect-free
			 * string/byte-order subroutines are callable from a
			 * helper.
			 */
			if (subr == DIF_SUBR_ALLOCA ||
			    subr == DIF_SUBR_BCOPY ||
			    subr == DIF_SUBR_COPYIN ||
			    subr == DIF_SUBR_COPYINTO ||
			    subr == DIF_SUBR_COPYINSTR ||
			    subr == DIF_SUBR_INDEX ||
			    subr == DIF_SUBR_INET_NTOA ||
			    subr == DIF_SUBR_INET_NTOA6 ||
			    subr == DIF_SUBR_INET_NTOP ||
			    subr == DIF_SUBR_JSON ||
			    subr == DIF_SUBR_LLTOSTR ||
			    subr == DIF_SUBR_STRTOLL ||
			    subr == DIF_SUBR_RINDEX ||
			    subr == DIF_SUBR_STRCHR ||
			    subr == DIF_SUBR_STRJOIN ||
			    subr == DIF_SUBR_STRRCHR ||
			    subr == DIF_SUBR_STRSTR ||
			    subr == DIF_SUBR_HTONS ||
			    subr == DIF_SUBR_HTONL ||
			    subr == DIF_SUBR_HTONLL ||
			    subr == DIF_SUBR_NTOHS ||
			    subr == DIF_SUBR_NTOHL ||
			    subr == DIF_SUBR_NTOHLL ||
			    subr == DIF_SUBR_MEMREF ||
#if !defined(sun)
			    subr == DIF_SUBR_MEMSTR ||
#endif
			    subr == DIF_SUBR_TYPEREF)
				break;

			err += efunc(pc, "invalid subr %u\n", subr);
			break;

		default:
			err += efunc(pc, "invalid opcode %u\n",
			    DIF_INSTR_OP(instr));
		}
	}

	return (err);
}
10102
10103/*
10104 * Returns 1 if the expression in the DIF object can be cached on a per-thread
10105 * basis; 0 if not.
10106 */
10107static int
10108dtrace_difo_cacheable(dtrace_difo_t *dp)
10109{
10110	int i;
10111
10112	if (dp == NULL)
10113		return (0);
10114
10115	for (i = 0; i < dp->dtdo_varlen; i++) {
10116		dtrace_difv_t *v = &dp->dtdo_vartab[i];
10117
10118		if (v->dtdv_scope != DIFV_SCOPE_GLOBAL)
10119			continue;
10120
10121		switch (v->dtdv_id) {
10122		case DIF_VAR_CURTHREAD:
10123		case DIF_VAR_PID:
10124		case DIF_VAR_TID:
10125		case DIF_VAR_EXECARGS:
10126		case DIF_VAR_EXECNAME:
10127		case DIF_VAR_ZONENAME:
10128			break;
10129
10130		default:
10131			return (0);
10132		}
10133	}
10134
10135	/*
10136	 * This DIF object may be cacheable.  Now we need to look for any
10137	 * array loading instructions, any memory loading instructions, or
10138	 * any stores to thread-local variables.
10139	 */
10140	for (i = 0; i < dp->dtdo_len; i++) {
10141		uint_t op = DIF_INSTR_OP(dp->dtdo_buf[i]);
10142
10143		if ((op >= DIF_OP_LDSB && op <= DIF_OP_LDX) ||
10144		    (op >= DIF_OP_ULDSB && op <= DIF_OP_ULDX) ||
10145		    (op >= DIF_OP_RLDSB && op <= DIF_OP_RLDX) ||
10146		    op == DIF_OP_LDGA || op == DIF_OP_STTS)
10147			return (0);
10148	}
10149
10150	return (1);
10151}
10152
10153static void
10154dtrace_difo_hold(dtrace_difo_t *dp)
10155{
10156	int i;
10157
10158	ASSERT(MUTEX_HELD(&dtrace_lock));
10159
10160	dp->dtdo_refcnt++;
10161	ASSERT(dp->dtdo_refcnt != 0);
10162
10163	/*
10164	 * We need to check this DIF object for references to the variable
10165	 * DIF_VAR_VTIMESTAMP.
10166	 */
10167	for (i = 0; i < dp->dtdo_varlen; i++) {
10168		dtrace_difv_t *v = &dp->dtdo_vartab[i];
10169
10170		if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
10171			continue;
10172
10173		if (dtrace_vtime_references++ == 0)
10174			dtrace_vtime_enable();
10175	}
10176}
10177
10178/*
10179 * This routine calculates the dynamic variable chunksize for a given DIF
10180 * object.  The calculation is not fool-proof, and can probably be tricked by
10181 * malicious DIF -- but it works for all compiler-generated DIF.  Because this
10182 * calculation is likely imperfect, dtrace_dynvar() is able to gracefully fail
10183 * if a dynamic variable size exceeds the chunksize.
10184 */
static void
dtrace_difo_chunksize(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
{
	uint64_t sval = 0;
	dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
	const dif_instr_t *text = dp->dtdo_buf;
	uint_t pc, srd = 0;
	uint_t ttop = 0;
	size_t size, ksize;
	uint_t id, i;

	/*
	 * Abstractly interpret the DIF text, modeling only the tuple stack
	 * and the most recent "setx" constant, to find every dynamic
	 * variable allocation and compute an upper bound on its size.
	 */
	for (pc = 0; pc < dp->dtdo_len; pc++) {
		dif_instr_t instr = text[pc];
		uint_t op = DIF_INSTR_OP(instr);
		uint_t rd = DIF_INSTR_RD(instr);
		uint_t r1 = DIF_INSTR_R1(instr);
		uint_t nkeys = 0;
		uchar_t scope = 0;

		dtrace_key_t *key = tupregs;

		switch (op) {
		case DIF_OP_SETX:
			/*
			 * Track the constant and its destination register; a
			 * following "pushtr" may use it as a key size.
			 */
			sval = dp->dtdo_inttab[DIF_INSTR_INTEGER(instr)];
			srd = rd;
			continue;

		case DIF_OP_STTS:
			/*
			 * A thread-local store implicitly adds two keys
			 * (thread and variable id) beyond the tuple stack.
			 */
			key = &tupregs[DIF_DTR_NREGS];
			key[0].dttk_size = 0;
			key[1].dttk_size = 0;
			nkeys = 2;
			scope = DIFV_SCOPE_THREAD;
			break;

		case DIF_OP_STGAA:
		case DIF_OP_STTAA:
			nkeys = ttop;

			/* Thread-local aggregations carry one extra key. */
			if (DIF_INSTR_OP(instr) == DIF_OP_STTAA)
				key[nkeys++].dttk_size = 0;

			key[nkeys++].dttk_size = 0;

			if (op == DIF_OP_STTAA) {
				scope = DIFV_SCOPE_THREAD;
			} else {
				scope = DIFV_SCOPE_GLOBAL;
			}

			break;

		case DIF_OP_PUSHTR:
			/* Tuple stack overflow: bail out of the estimate. */
			if (ttop == DIF_DTR_NREGS)
				return;

			if ((srd == 0 || sval == 0) && r1 == DIF_TYPE_STRING) {
				/*
				 * If the register for the size of the "pushtr"
				 * is %r0 (or the value is 0) and the type is
				 * a string, we'll use the system-wide default
				 * string size.
				 */
				tupregs[ttop++].dttk_size =
				    dtrace_strsize_default;
			} else {
				/* Size register isn't a tracked constant. */
				if (srd == 0)
					return;

				tupregs[ttop++].dttk_size = sval;
			}

			break;

		case DIF_OP_PUSHTV:
			if (ttop == DIF_DTR_NREGS)
				return;

			/* By-value keys are stored inline; no extra space. */
			tupregs[ttop++].dttk_size = 0;
			break;

		case DIF_OP_FLUSHTS:
			ttop = 0;
			break;

		case DIF_OP_POPTS:
			if (ttop != 0)
				ttop--;
			break;
		}

		/* Any non-setx instruction invalidates the tracked constant. */
		sval = 0;
		srd = 0;

		if (nkeys == 0)
			continue;

		/*
		 * We have a dynamic variable allocation; calculate its size.
		 */
		for (ksize = 0, i = 0; i < nkeys; i++)
			ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));

		size = sizeof (dtrace_dynvar_t);
		size += sizeof (dtrace_key_t) * (nkeys - 1);
		size += ksize;

		/*
		 * Now we need to determine the size of the stored data.
		 */
		id = DIF_INSTR_VAR(instr);

		for (i = 0; i < dp->dtdo_varlen; i++) {
			dtrace_difv_t *v = &dp->dtdo_vartab[i];

			if (v->dtdv_id == id && v->dtdv_scope == scope) {
				size += v->dtdv_type.dtdt_size;
				break;
			}
		}

		/* Variable not found in the table; give up on the estimate. */
		if (i == dp->dtdo_varlen)
			return;

		/*
		 * We have the size.  If this is larger than the chunk size
		 * for our dynamic variable state, reset the chunk size.
		 */
		size = P2ROUNDUP(size, sizeof (uint64_t));

		if (size > vstate->dtvs_dynvars.dtds_chunksize)
			vstate->dtvs_dynvars.dtds_chunksize = size;
	}
}
10319
/*
 * Bind a DIF object to a vstate:  allocate (or take references on) the
 * backing storage for each of its variables, update the dynamic variable
 * chunk size, and take a hold on the DIFO itself.
 */
static void
dtrace_difo_init(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
{
	int i, oldsvars, osz, nsz, otlocals, ntlocals;
	uint_t id;

	ASSERT(MUTEX_HELD(&dtrace_lock));
	ASSERT(dp->dtdo_buf != NULL && dp->dtdo_len != 0);

	for (i = 0; i < dp->dtdo_varlen; i++) {
		dtrace_difv_t *v = &dp->dtdo_vartab[i];
		dtrace_statvar_t *svar, ***svarp = NULL;
		size_t dsize = 0;
		uint8_t scope = v->dtdv_scope;
		int *np = NULL;

		/* Built-in variables need no per-state storage. */
		if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
			continue;

		id -= DIF_VAR_OTHER_UBASE;

		switch (scope) {
		case DIFV_SCOPE_THREAD:
			/*
			 * Grow the thread-local variable table (doubling it
			 * each pass) until it covers this variable's index,
			 * then record the variable and move on -- thread
			 * locals have no statvar.
			 */
			while (id >= (otlocals = vstate->dtvs_ntlocals)) {
				dtrace_difv_t *tlocals;

				if ((ntlocals = (otlocals << 1)) == 0)
					ntlocals = 1;

				osz = otlocals * sizeof (dtrace_difv_t);
				nsz = ntlocals * sizeof (dtrace_difv_t);

				tlocals = kmem_zalloc(nsz, KM_SLEEP);

				if (osz != 0) {
					bcopy(vstate->dtvs_tlocals,
					    tlocals, osz);
					kmem_free(vstate->dtvs_tlocals, osz);
				}

				vstate->dtvs_tlocals = tlocals;
				vstate->dtvs_ntlocals = ntlocals;
			}

			vstate->dtvs_tlocals[id] = *v;
			continue;

		case DIFV_SCOPE_LOCAL:
			np = &vstate->dtvs_nlocals;
			svarp = &vstate->dtvs_locals;

			/* Clause-local storage is replicated per CPU. */
			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
				dsize = NCPU * (v->dtdv_type.dtdt_size +
				    sizeof (uint64_t));
			else
				dsize = NCPU * sizeof (uint64_t);

			break;

		case DIFV_SCOPE_GLOBAL:
			np = &vstate->dtvs_nglobals;
			svarp = &vstate->dtvs_globals;

			/*
			 * Only by-ref globals need separate data storage;
			 * scalars live in the statvar itself (dsize == 0).
			 */
			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
				dsize = v->dtdv_type.dtdt_size +
				    sizeof (uint64_t);

			break;

		default:
			ASSERT(0);
		}

		/* Grow the statvar pointer array to cover this index. */
		while (id >= (oldsvars = *np)) {
			dtrace_statvar_t **statics;
			int newsvars, oldsize, newsize;

			if ((newsvars = (oldsvars << 1)) == 0)
				newsvars = 1;

			oldsize = oldsvars * sizeof (dtrace_statvar_t *);
			newsize = newsvars * sizeof (dtrace_statvar_t *);

			statics = kmem_zalloc(newsize, KM_SLEEP);

			if (oldsize != 0) {
				bcopy(*svarp, statics, oldsize);
				kmem_free(*svarp, oldsize);
			}

			*svarp = statics;
			*np = newsvars;
		}

		/* Allocate the statvar (and its data) on first reference. */
		if ((svar = (*svarp)[id]) == NULL) {
			svar = kmem_zalloc(sizeof (dtrace_statvar_t), KM_SLEEP);
			svar->dtsv_var = *v;

			if ((svar->dtsv_size = dsize) != 0) {
				svar->dtsv_data = (uint64_t)(uintptr_t)
				    kmem_zalloc(dsize, KM_SLEEP);
			}

			(*svarp)[id] = svar;
		}

		svar->dtsv_refcnt++;
	}

	dtrace_difo_chunksize(dp, vstate);
	dtrace_difo_hold(dp);
}
10432
10433static dtrace_difo_t *
10434dtrace_difo_duplicate(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10435{
10436	dtrace_difo_t *new;
10437	size_t sz;
10438
10439	ASSERT(dp->dtdo_buf != NULL);
10440	ASSERT(dp->dtdo_refcnt != 0);
10441
10442	new = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
10443
10444	ASSERT(dp->dtdo_buf != NULL);
10445	sz = dp->dtdo_len * sizeof (dif_instr_t);
10446	new->dtdo_buf = kmem_alloc(sz, KM_SLEEP);
10447	bcopy(dp->dtdo_buf, new->dtdo_buf, sz);
10448	new->dtdo_len = dp->dtdo_len;
10449
10450	if (dp->dtdo_strtab != NULL) {
10451		ASSERT(dp->dtdo_strlen != 0);
10452		new->dtdo_strtab = kmem_alloc(dp->dtdo_strlen, KM_SLEEP);
10453		bcopy(dp->dtdo_strtab, new->dtdo_strtab, dp->dtdo_strlen);
10454		new->dtdo_strlen = dp->dtdo_strlen;
10455	}
10456
10457	if (dp->dtdo_inttab != NULL) {
10458		ASSERT(dp->dtdo_intlen != 0);
10459		sz = dp->dtdo_intlen * sizeof (uint64_t);
10460		new->dtdo_inttab = kmem_alloc(sz, KM_SLEEP);
10461		bcopy(dp->dtdo_inttab, new->dtdo_inttab, sz);
10462		new->dtdo_intlen = dp->dtdo_intlen;
10463	}
10464
10465	if (dp->dtdo_vartab != NULL) {
10466		ASSERT(dp->dtdo_varlen != 0);
10467		sz = dp->dtdo_varlen * sizeof (dtrace_difv_t);
10468		new->dtdo_vartab = kmem_alloc(sz, KM_SLEEP);
10469		bcopy(dp->dtdo_vartab, new->dtdo_vartab, sz);
10470		new->dtdo_varlen = dp->dtdo_varlen;
10471	}
10472
10473	dtrace_difo_init(new, vstate);
10474	return (new);
10475}
10476
/*
 * Free a DIF object whose reference count has dropped to zero, releasing
 * the references it holds on its static variables along the way.
 */
static void
dtrace_difo_destroy(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
{
	int i;

	ASSERT(dp->dtdo_refcnt == 0);

	for (i = 0; i < dp->dtdo_varlen; i++) {
		dtrace_difv_t *v = &dp->dtdo_vartab[i];
		dtrace_statvar_t *svar, **svarp = NULL;
		uint_t id;
		uint8_t scope = v->dtdv_scope;
		int *np = NULL;

		switch (scope) {
		case DIFV_SCOPE_THREAD:
			/* Thread-locals have no per-variable allocation. */
			continue;

		case DIFV_SCOPE_LOCAL:
			np = &vstate->dtvs_nlocals;
			svarp = vstate->dtvs_locals;
			break;

		case DIFV_SCOPE_GLOBAL:
			np = &vstate->dtvs_nglobals;
			svarp = vstate->dtvs_globals;
			break;

		default:
			ASSERT(0);
		}

		/* Built-in variables are not reference-counted. */
		if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
			continue;

		id -= DIF_VAR_OTHER_UBASE;
		ASSERT(id < *np);

		svar = svarp[id];
		ASSERT(svar != NULL);
		ASSERT(svar->dtsv_refcnt > 0);

		/* Only the last DIFO referencing the statvar frees it. */
		if (--svar->dtsv_refcnt > 0)
			continue;

		if (svar->dtsv_size != 0) {
			ASSERT(svar->dtsv_data != 0);
			kmem_free((void *)(uintptr_t)svar->dtsv_data,
			    svar->dtsv_size);
		}

		kmem_free(svar, sizeof (dtrace_statvar_t));
		svarp[id] = NULL;
	}

	/* Finally, free the DIFO's own tables and the DIFO itself. */
	if (dp->dtdo_buf != NULL)
		kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
	if (dp->dtdo_inttab != NULL)
		kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
	if (dp->dtdo_strtab != NULL)
		kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
	if (dp->dtdo_vartab != NULL)
		kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));

	kmem_free(dp, sizeof (dtrace_difo_t));
}
10543
10544static void
10545dtrace_difo_release(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10546{
10547	int i;
10548
10549	ASSERT(MUTEX_HELD(&dtrace_lock));
10550	ASSERT(dp->dtdo_refcnt != 0);
10551
10552	for (i = 0; i < dp->dtdo_varlen; i++) {
10553		dtrace_difv_t *v = &dp->dtdo_vartab[i];
10554
10555		if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
10556			continue;
10557
10558		ASSERT(dtrace_vtime_references > 0);
10559		if (--dtrace_vtime_references == 0)
10560			dtrace_vtime_disable();
10561	}
10562
10563	if (--dp->dtdo_refcnt == 0)
10564		dtrace_difo_destroy(dp, vstate);
10565}
10566
10567/*
10568 * DTrace Format Functions
10569 */
10570static uint16_t
10571dtrace_format_add(dtrace_state_t *state, char *str)
10572{
10573	char *fmt, **new;
10574	uint16_t ndx, len = strlen(str) + 1;
10575
10576	fmt = kmem_zalloc(len, KM_SLEEP);
10577	bcopy(str, fmt, len);
10578
10579	for (ndx = 0; ndx < state->dts_nformats; ndx++) {
10580		if (state->dts_formats[ndx] == NULL) {
10581			state->dts_formats[ndx] = fmt;
10582			return (ndx + 1);
10583		}
10584	}
10585
10586	if (state->dts_nformats == USHRT_MAX) {
10587		/*
10588		 * This is only likely if a denial-of-service attack is being
10589		 * attempted.  As such, it's okay to fail silently here.
10590		 */
10591		kmem_free(fmt, len);
10592		return (0);
10593	}
10594
10595	/*
10596	 * For simplicity, we always resize the formats array to be exactly the
10597	 * number of formats.
10598	 */
10599	ndx = state->dts_nformats++;
10600	new = kmem_alloc((ndx + 1) * sizeof (char *), KM_SLEEP);
10601
10602	if (state->dts_formats != NULL) {
10603		ASSERT(ndx != 0);
10604		bcopy(state->dts_formats, new, ndx * sizeof (char *));
10605		kmem_free(state->dts_formats, ndx * sizeof (char *));
10606	}
10607
10608	state->dts_formats = new;
10609	state->dts_formats[ndx] = fmt;
10610
10611	return (ndx + 1);
10612}
10613
10614static void
10615dtrace_format_remove(dtrace_state_t *state, uint16_t format)
10616{
10617	char *fmt;
10618
10619	ASSERT(state->dts_formats != NULL);
10620	ASSERT(format <= state->dts_nformats);
10621	ASSERT(state->dts_formats[format - 1] != NULL);
10622
10623	fmt = state->dts_formats[format - 1];
10624	kmem_free(fmt, strlen(fmt) + 1);
10625	state->dts_formats[format - 1] = NULL;
10626}
10627
10628static void
10629dtrace_format_destroy(dtrace_state_t *state)
10630{
10631	int i;
10632
10633	if (state->dts_nformats == 0) {
10634		ASSERT(state->dts_formats == NULL);
10635		return;
10636	}
10637
10638	ASSERT(state->dts_formats != NULL);
10639
10640	for (i = 0; i < state->dts_nformats; i++) {
10641		char *fmt = state->dts_formats[i];
10642
10643		if (fmt == NULL)
10644			continue;
10645
10646		kmem_free(fmt, strlen(fmt) + 1);
10647	}
10648
10649	kmem_free(state->dts_formats, state->dts_nformats * sizeof (char *));
10650	state->dts_nformats = 0;
10651	state->dts_formats = NULL;
10652}
10653
10654/*
10655 * DTrace Predicate Functions
10656 */
10657static dtrace_predicate_t *
10658dtrace_predicate_create(dtrace_difo_t *dp)
10659{
10660	dtrace_predicate_t *pred;
10661
10662	ASSERT(MUTEX_HELD(&dtrace_lock));
10663	ASSERT(dp->dtdo_refcnt != 0);
10664
10665	pred = kmem_zalloc(sizeof (dtrace_predicate_t), KM_SLEEP);
10666	pred->dtp_difo = dp;
10667	pred->dtp_refcnt = 1;
10668
10669	if (!dtrace_difo_cacheable(dp))
10670		return (pred);
10671
10672	if (dtrace_predcache_id == DTRACE_CACHEIDNONE) {
10673		/*
10674		 * This is only theoretically possible -- we have had 2^32
10675		 * cacheable predicates on this machine.  We cannot allow any
10676		 * more predicates to become cacheable:  as unlikely as it is,
10677		 * there may be a thread caching a (now stale) predicate cache
10678		 * ID. (N.B.: the temptation is being successfully resisted to
10679		 * have this cmn_err() "Holy shit -- we executed this code!")
10680		 */
10681		return (pred);
10682	}
10683
10684	pred->dtp_cacheid = dtrace_predcache_id++;
10685
10686	return (pred);
10687}
10688
/* Take an additional reference on a live, DIF-backed predicate. */
static void
dtrace_predicate_hold(dtrace_predicate_t *pred)
{
	ASSERT(MUTEX_HELD(&dtrace_lock));
	ASSERT(pred->dtp_difo != NULL && pred->dtp_difo->dtdo_refcnt != 0);
	ASSERT(pred->dtp_refcnt > 0);

	pred->dtp_refcnt++;
}
10698
10699static void
10700dtrace_predicate_release(dtrace_predicate_t *pred, dtrace_vstate_t *vstate)
10701{
10702	dtrace_difo_t *dp = pred->dtp_difo;
10703
10704	ASSERT(MUTEX_HELD(&dtrace_lock));
10705	ASSERT(dp != NULL && dp->dtdo_refcnt != 0);
10706	ASSERT(pred->dtp_refcnt > 0);
10707
10708	if (--pred->dtp_refcnt == 0) {
10709		dtrace_difo_release(pred->dtp_difo, vstate);
10710		kmem_free(pred, sizeof (dtrace_predicate_t));
10711	}
10712}
10713
10714/*
10715 * DTrace Action Description Functions
10716 */
10717static dtrace_actdesc_t *
10718dtrace_actdesc_create(dtrace_actkind_t kind, uint32_t ntuple,
10719    uint64_t uarg, uint64_t arg)
10720{
10721	dtrace_actdesc_t *act;
10722
10723#if defined(sun)
10724	ASSERT(!DTRACEACT_ISPRINTFLIKE(kind) || (arg != NULL &&
10725	    arg >= KERNELBASE) || (arg == NULL && kind == DTRACEACT_PRINTA));
10726#endif
10727
10728	act = kmem_zalloc(sizeof (dtrace_actdesc_t), KM_SLEEP);
10729	act->dtad_kind = kind;
10730	act->dtad_ntuple = ntuple;
10731	act->dtad_uarg = uarg;
10732	act->dtad_arg = arg;
10733	act->dtad_refcnt = 1;
10734
10735	return (act);
10736}
10737
/* Take an additional reference on an action description. */
static void
dtrace_actdesc_hold(dtrace_actdesc_t *act)
{
	ASSERT(act->dtad_refcnt >= 1);
	act->dtad_refcnt++;
}
10744
/*
 * Release a reference on an action description; on the last release,
 * drop its DIF object, free any printf()-like format string, and free
 * the action description itself.
 */
static void
dtrace_actdesc_release(dtrace_actdesc_t *act, dtrace_vstate_t *vstate)
{
	dtrace_actkind_t kind = act->dtad_kind;
	dtrace_difo_t *dp;

	ASSERT(act->dtad_refcnt >= 1);

	if (--act->dtad_refcnt != 0)
		return;

	if ((dp = act->dtad_difo) != NULL)
		dtrace_difo_release(dp, vstate);

	if (DTRACEACT_ISPRINTFLIKE(kind)) {
		/* dtad_arg holds the kernel address of the format string. */
		char *str = (char *)(uintptr_t)act->dtad_arg;

#if defined(sun)
		ASSERT((str != NULL && (uintptr_t)str >= KERNELBASE) ||
		    (str == NULL && act->dtad_kind == DTRACEACT_PRINTA));
#endif

		/* printa() may have no format; only free a real string. */
		if (str != NULL)
			kmem_free(str, strlen(str) + 1);
	}

	kmem_free(act, sizeof (dtrace_actdesc_t));
}
10773
10774/*
10775 * DTrace ECB Functions
10776 */
/*
 * Allocate a new ECB for the given probe, assign it the next EPID for the
 * state, and publish it in the state's dts_ecbs array -- growing the array
 * (with careful ordering for lock-free readers) if necessary.
 */
static dtrace_ecb_t *
dtrace_ecb_add(dtrace_state_t *state, dtrace_probe_t *probe)
{
	dtrace_ecb_t *ecb;
	dtrace_epid_t epid;

	ASSERT(MUTEX_HELD(&dtrace_lock));

	ecb = kmem_zalloc(sizeof (dtrace_ecb_t), KM_SLEEP);
	ecb->dte_predicate = NULL;
	ecb->dte_probe = probe;

	/*
	 * The default size is the size of the default action: recording
	 * the header.
	 */
	ecb->dte_size = ecb->dte_needed = sizeof (dtrace_rechdr_t);
	ecb->dte_alignment = sizeof (dtrace_epid_t);

	/* EPIDs are 1-based; EPID e lives at dts_ecbs[e - 1]. */
	epid = state->dts_epid++;

	if (epid - 1 >= state->dts_necbs) {
		dtrace_ecb_t **oecbs = state->dts_ecbs, **ecbs;
		int necbs = state->dts_necbs << 1;

		ASSERT(epid == state->dts_necbs + 1);

		if (necbs == 0) {
			ASSERT(oecbs == NULL);
			necbs = 1;
		}

		ecbs = kmem_zalloc(necbs * sizeof (*ecbs), KM_SLEEP);

		if (oecbs != NULL)
			bcopy(oecbs, ecbs, state->dts_necbs * sizeof (*ecbs));

		/*
		 * Publish the new array before the new size so that a
		 * concurrent reader never indexes past the array it sees.
		 */
		dtrace_membar_producer();
		state->dts_ecbs = ecbs;

		if (oecbs != NULL) {
			/*
			 * If this state is active, we must dtrace_sync()
			 * before we can free the old dts_ecbs array:  we're
			 * coming in hot, and there may be active ring
			 * buffer processing (which indexes into the dts_ecbs
			 * array) on another CPU.
			 */
			if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
				dtrace_sync();

			kmem_free(oecbs, state->dts_necbs * sizeof (*ecbs));
		}

		dtrace_membar_producer();
		state->dts_necbs = necbs;
	}

	ecb->dte_state = state;

	ASSERT(state->dts_ecbs[epid - 1] == NULL);
	/* Ensure the ECB is fully initialized before it becomes visible. */
	dtrace_membar_producer();
	state->dts_ecbs[(ecb->dte_epid = epid) - 1] = ecb;

	return (ecb);
}
10843
/*
 * Link an ECB onto its probe's ECB chain.  If this is the first ECB on
 * the probe, the probe is enabled with its provider; otherwise the ECB
 * is appended to the chain and a dtrace_sync() assures all CPUs see the
 * change.  The caller must hold cpu_lock and dtrace_lock.
 */
static void
dtrace_ecb_enable(dtrace_ecb_t *ecb)
{
	dtrace_probe_t *probe = ecb->dte_probe;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(MUTEX_HELD(&dtrace_lock));
	ASSERT(ecb->dte_next == NULL);

	if (probe == NULL) {
		/*
		 * This is the NULL probe -- there's nothing to do.
		 */
		return;
	}

	if (probe->dtpr_ecb == NULL) {
		dtrace_provider_t *prov = probe->dtpr_provider;

		/*
		 * We're the first ECB on this probe.
		 */
		probe->dtpr_ecb = probe->dtpr_ecb_last = ecb;

		/* With a single ECB, its predicate may be cached on the probe. */
		if (ecb->dte_predicate != NULL)
			probe->dtpr_predcache = ecb->dte_predicate->dtp_cacheid;

		prov->dtpv_pops.dtps_enable(prov->dtpv_arg,
		    probe->dtpr_id, probe->dtpr_arg);
	} else {
		/*
		 * This probe is already active.  Swing the last pointer to
		 * point to the new ECB, and issue a dtrace_sync() to assure
		 * that all CPUs have seen the change.
		 */
		ASSERT(probe->dtpr_ecb_last != NULL);
		probe->dtpr_ecb_last->dte_next = ecb;
		probe->dtpr_ecb_last = ecb;
		/* More than one ECB:  the predicate cache is no longer valid. */
		probe->dtpr_predcache = 0;

		dtrace_sync();
	}
}
10887
/*
 * Recompute the ECB's record layout:  walk the action list assigning
 * each record its offset and alignment, and update dte_size (the size
 * of the data committed to the buffer), dte_needed (the scratch size
 * needed, including aggregation tuple staging) and the state's
 * dts_needed high-water mark.
 */
static void
dtrace_ecb_resize(dtrace_ecb_t *ecb)
{
	dtrace_action_t *act;
	uint32_t curneeded = UINT32_MAX;
	uint32_t aggbase = UINT32_MAX;

	/*
	 * If we record anything, we always record the dtrace_rechdr_t.  (And
	 * we always record it first.)
	 */
	ecb->dte_size = sizeof (dtrace_rechdr_t);
	ecb->dte_alignment = sizeof (dtrace_epid_t);

	for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
		dtrace_recdesc_t *rec = &act->dta_rec;
		ASSERT(rec->dtrd_size > 0 || rec->dtrd_alignment == 1);

		ecb->dte_alignment = MAX(ecb->dte_alignment,
		    rec->dtrd_alignment);

		if (DTRACEACT_ISAGG(act->dta_kind)) {
			dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;

			/*
			 * An aggregation terminates the tuple that the
			 * preceding in-tuple actions (its key) have been
			 * staging in curneeded-tracked space.
			 */
			ASSERT(rec->dtrd_size != 0);
			ASSERT(agg->dtag_first != NULL);
			ASSERT(act->dta_prev->dta_intuple);
			ASSERT(aggbase != UINT32_MAX);
			ASSERT(curneeded != UINT32_MAX);

			agg->dtag_base = aggbase;

			curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment);
			rec->dtrd_offset = curneeded;
			curneeded += rec->dtrd_size;
			ecb->dte_needed = MAX(ecb->dte_needed, curneeded);

			/* Reset tuple tracking for any subsequent tuple. */
			aggbase = UINT32_MAX;
			curneeded = UINT32_MAX;
		} else if (act->dta_intuple) {
			if (curneeded == UINT32_MAX) {
				/*
				 * This is the first record in a tuple.  Align
				 * curneeded to be at offset 4 in an 8-byte
				 * aligned block.
				 */
				ASSERT(act->dta_prev == NULL ||
				    !act->dta_prev->dta_intuple);
				ASSERT3U(aggbase, ==, UINT32_MAX);
				curneeded = P2PHASEUP(ecb->dte_size,
				    sizeof (uint64_t), sizeof (dtrace_aggid_t));

				aggbase = curneeded - sizeof (dtrace_aggid_t);
				ASSERT(IS_P2ALIGNED(aggbase,
				    sizeof (uint64_t)));
			}
			curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment);
			rec->dtrd_offset = curneeded;
			curneeded += rec->dtrd_size;
		} else {
			/* tuples must be followed by an aggregation */
			ASSERT(act->dta_prev == NULL ||
			    !act->dta_prev->dta_intuple);

			/* A plain data-recording action grows dte_size. */
			ecb->dte_size = P2ROUNDUP(ecb->dte_size,
			    rec->dtrd_alignment);
			rec->dtrd_offset = ecb->dte_size;
			ecb->dte_size += rec->dtrd_size;
			ecb->dte_needed = MAX(ecb->dte_needed, ecb->dte_size);
		}
	}

	if ((act = ecb->dte_action) != NULL &&
	    !(act->dta_kind == DTRACEACT_SPECULATE && act->dta_next == NULL) &&
	    ecb->dte_size == sizeof (dtrace_rechdr_t)) {
		/*
		 * If the size is still sizeof (dtrace_rechdr_t), then all
		 * actions store no data; set the size to 0.
		 */
		ecb->dte_size = 0;
	}

	ecb->dte_size = P2ROUNDUP(ecb->dte_size, sizeof (dtrace_epid_t));
	ecb->dte_needed = P2ROUNDUP(ecb->dte_needed, (sizeof (dtrace_epid_t)));
	ecb->dte_state->dts_needed = MAX(ecb->dte_state->dts_needed,
	    ecb->dte_needed);
}
10975
/*
 * Create the aggregating action described by desc on the given ECB:
 * select the aggregating function (and per-key buffer size) for the
 * aggregation kind, verify that the preceding ntuple actions constitute
 * the aggregation's key tuple, allocate an aggregation ID, and grow the
 * state's aggregation lookup array as necessary.  Returns the embedded
 * dtrace_action_t on success, or NULL if the description is malformed.
 */
static dtrace_action_t *
dtrace_ecb_aggregation_create(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
{
	dtrace_aggregation_t *agg;
	size_t size = sizeof (uint64_t);
	int ntuple = desc->dtad_ntuple;
	dtrace_action_t *act;
	dtrace_recdesc_t *frec;
	dtrace_aggid_t aggid;
	dtrace_state_t *state = ecb->dte_state;

	agg = kmem_zalloc(sizeof (dtrace_aggregation_t), KM_SLEEP);
	agg->dtag_ecb = ecb;

	ASSERT(DTRACEACT_ISAGG(desc->dtad_kind));

	switch (desc->dtad_kind) {
	case DTRACEAGG_MIN:
		agg->dtag_initial = INT64_MAX;
		agg->dtag_aggregate = dtrace_aggregate_min;
		break;

	case DTRACEAGG_MAX:
		agg->dtag_initial = INT64_MIN;
		agg->dtag_aggregate = dtrace_aggregate_max;
		break;

	case DTRACEAGG_COUNT:
		agg->dtag_aggregate = dtrace_aggregate_count;
		break;

	case DTRACEAGG_QUANTIZE:
		agg->dtag_aggregate = dtrace_aggregate_quantize;
		/* One bucket per power of two, positive and negative, plus zero. */
		size = (((sizeof (uint64_t) * NBBY) - 1) * 2 + 1) *
		    sizeof (uint64_t);
		break;

	case DTRACEAGG_LQUANTIZE: {
		uint16_t step = DTRACE_LQUANTIZE_STEP(desc->dtad_arg);
		uint16_t levels = DTRACE_LQUANTIZE_LEVELS(desc->dtad_arg);

		agg->dtag_initial = desc->dtad_arg;
		agg->dtag_aggregate = dtrace_aggregate_lquantize;

		if (step == 0 || levels == 0)
			goto err;

		/* levels buckets plus underflow, overflow and the arg slot. */
		size = levels * sizeof (uint64_t) + 3 * sizeof (uint64_t);
		break;
	}

	case DTRACEAGG_LLQUANTIZE: {
		uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(desc->dtad_arg);
		uint16_t low = DTRACE_LLQUANTIZE_LOW(desc->dtad_arg);
		uint16_t high = DTRACE_LLQUANTIZE_HIGH(desc->dtad_arg);
		uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(desc->dtad_arg);
		int64_t v;

		agg->dtag_initial = desc->dtad_arg;
		agg->dtag_aggregate = dtrace_aggregate_llquantize;

		if (factor < 2 || low >= high || nsteps < factor)
			goto err;

		/*
		 * Now check that the number of steps evenly divides a power
		 * of the factor.  (This assures both integer bucket size and
		 * linearity within each magnitude.)
		 */
		for (v = factor; v < nsteps; v *= factor)
			continue;

		if ((v % nsteps) || (nsteps % factor))
			goto err;

		size = (dtrace_aggregate_llquantize_bucket(factor,
		    low, high, nsteps, INT64_MAX) + 2) * sizeof (uint64_t);
		break;
	}

	case DTRACEAGG_AVG:
		agg->dtag_aggregate = dtrace_aggregate_avg;
		size = sizeof (uint64_t) * 2;
		break;

	case DTRACEAGG_STDDEV:
		agg->dtag_aggregate = dtrace_aggregate_stddev;
		size = sizeof (uint64_t) * 4;
		break;

	case DTRACEAGG_SUM:
		agg->dtag_aggregate = dtrace_aggregate_sum;
		break;

	default:
		goto err;
	}

	agg->dtag_action.dta_rec.dtrd_size = size;

	if (ntuple == 0)
		goto err;

	/*
	 * We must make sure that we have enough actions for the n-tuple.
	 */
	for (act = ecb->dte_action_last; act != NULL; act = act->dta_prev) {
		if (DTRACEACT_ISAGG(act->dta_kind))
			break;

		if (--ntuple == 0) {
			/*
			 * This is the action with which our n-tuple begins.
			 */
			agg->dtag_first = act;
			goto success;
		}
	}

	/*
	 * This n-tuple is short by ntuple elements.  Return failure.
	 */
	ASSERT(ntuple != 0);
err:
	kmem_free(agg, sizeof (dtrace_aggregation_t));
	return (NULL);

success:
	/*
	 * If the last action in the tuple has a size of zero, it's actually
	 * an expression argument for the aggregating action.
	 */
	ASSERT(ecb->dte_action_last != NULL);
	act = ecb->dte_action_last;

	if (act->dta_kind == DTRACEACT_DIFEXPR) {
		ASSERT(act->dta_difo != NULL);

		if (act->dta_difo->dtdo_rtype.dtdt_size == 0)
			agg->dtag_hasarg = 1;
	}

	/*
	 * We need to allocate an id for this aggregation.
	 */
#if defined(sun)
	aggid = (dtrace_aggid_t)(uintptr_t)vmem_alloc(state->dts_aggid_arena, 1,
	    VM_BESTFIT | VM_SLEEP);
#else
	aggid = alloc_unr(state->dts_aggid_arena);
#endif

	/* Grow the aggregation lookup array (by doubling) if needed. */
	if (aggid - 1 >= state->dts_naggregations) {
		dtrace_aggregation_t **oaggs = state->dts_aggregations;
		dtrace_aggregation_t **aggs;
		int naggs = state->dts_naggregations << 1;
		int onaggs = state->dts_naggregations;

		ASSERT(aggid == state->dts_naggregations + 1);

		if (naggs == 0) {
			ASSERT(oaggs == NULL);
			naggs = 1;
		}

		aggs = kmem_zalloc(naggs * sizeof (*aggs), KM_SLEEP);

		if (oaggs != NULL) {
			bcopy(oaggs, aggs, onaggs * sizeof (*aggs));
			kmem_free(oaggs, onaggs * sizeof (*aggs));
		}

		state->dts_aggregations = aggs;
		state->dts_naggregations = naggs;
	}

	ASSERT(state->dts_aggregations[aggid - 1] == NULL);
	state->dts_aggregations[(agg->dtag_id = aggid) - 1] = agg;

	/* The key's first record must be able to hold the aggregation ID. */
	frec = &agg->dtag_first->dta_rec;
	if (frec->dtrd_alignment < sizeof (dtrace_aggid_t))
		frec->dtrd_alignment = sizeof (dtrace_aggid_t);

	/* Mark every action in the key as belonging to this tuple. */
	for (act = agg->dtag_first; act != NULL; act = act->dta_next) {
		ASSERT(!act->dta_intuple);
		act->dta_intuple = 1;
	}

	return (&agg->dtag_action);
}
11166
/*
 * Destroy an aggregating action:  return its aggregation ID to the
 * arena, clear its slot in the state's aggregation array and free it.
 * This is the counterpart of dtrace_ecb_aggregation_create().
 */
static void
dtrace_ecb_aggregation_destroy(dtrace_ecb_t *ecb, dtrace_action_t *act)
{
	dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
	dtrace_state_t *state = ecb->dte_state;
	dtrace_aggid_t aggid = agg->dtag_id;

	ASSERT(DTRACEACT_ISAGG(act->dta_kind));
#if defined(sun)
	vmem_free(state->dts_aggid_arena, (void *)(uintptr_t)aggid, 1);
#else
	free_unr(state->dts_aggid_arena, aggid);
#endif

	ASSERT(state->dts_aggregations[aggid - 1] == agg);
	state->dts_aggregations[aggid - 1] = NULL;

	kmem_free(agg, sizeof (dtrace_aggregation_t));
}
11186
11187static int
11188dtrace_ecb_action_add(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
11189{
11190	dtrace_action_t *action, *last;
11191	dtrace_difo_t *dp = desc->dtad_difo;
11192	uint32_t size = 0, align = sizeof (uint8_t), mask;
11193	uint16_t format = 0;
11194	dtrace_recdesc_t *rec;
11195	dtrace_state_t *state = ecb->dte_state;
11196	dtrace_optval_t *opt = state->dts_options, nframes = 0, strsize;
11197	uint64_t arg = desc->dtad_arg;
11198
11199	ASSERT(MUTEX_HELD(&dtrace_lock));
11200	ASSERT(ecb->dte_action == NULL || ecb->dte_action->dta_refcnt == 1);
11201
11202	if (DTRACEACT_ISAGG(desc->dtad_kind)) {
11203		/*
11204		 * If this is an aggregating action, there must be neither
11205		 * a speculate nor a commit on the action chain.
11206		 */
11207		dtrace_action_t *act;
11208
11209		for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
11210			if (act->dta_kind == DTRACEACT_COMMIT)
11211				return (EINVAL);
11212
11213			if (act->dta_kind == DTRACEACT_SPECULATE)
11214				return (EINVAL);
11215		}
11216
11217		action = dtrace_ecb_aggregation_create(ecb, desc);
11218
11219		if (action == NULL)
11220			return (EINVAL);
11221	} else {
11222		if (DTRACEACT_ISDESTRUCTIVE(desc->dtad_kind) ||
11223		    (desc->dtad_kind == DTRACEACT_DIFEXPR &&
11224		    dp != NULL && dp->dtdo_destructive)) {
11225			state->dts_destructive = 1;
11226		}
11227
11228		switch (desc->dtad_kind) {
11229		case DTRACEACT_PRINTF:
11230		case DTRACEACT_PRINTA:
11231		case DTRACEACT_SYSTEM:
11232		case DTRACEACT_FREOPEN:
11233		case DTRACEACT_DIFEXPR:
11234			/*
11235			 * We know that our arg is a string -- turn it into a
11236			 * format.
11237			 */
11238			if (arg == 0) {
11239				ASSERT(desc->dtad_kind == DTRACEACT_PRINTA ||
11240				    desc->dtad_kind == DTRACEACT_DIFEXPR);
11241				format = 0;
11242			} else {
11243				ASSERT(arg != 0);
11244#if defined(sun)
11245				ASSERT(arg > KERNELBASE);
11246#endif
11247				format = dtrace_format_add(state,
11248				    (char *)(uintptr_t)arg);
11249			}
11250
11251			/*FALLTHROUGH*/
11252		case DTRACEACT_LIBACT:
11253		case DTRACEACT_TRACEMEM:
11254		case DTRACEACT_TRACEMEM_DYNSIZE:
11255			if (dp == NULL)
11256				return (EINVAL);
11257
11258			if ((size = dp->dtdo_rtype.dtdt_size) != 0)
11259				break;
11260
11261			if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) {
11262				if (!(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
11263					return (EINVAL);
11264
11265				size = opt[DTRACEOPT_STRSIZE];
11266			}
11267
11268			break;
11269
11270		case DTRACEACT_STACK:
11271			if ((nframes = arg) == 0) {
11272				nframes = opt[DTRACEOPT_STACKFRAMES];
11273				ASSERT(nframes > 0);
11274				arg = nframes;
11275			}
11276
11277			size = nframes * sizeof (pc_t);
11278			break;
11279
11280		case DTRACEACT_JSTACK:
11281			if ((strsize = DTRACE_USTACK_STRSIZE(arg)) == 0)
11282				strsize = opt[DTRACEOPT_JSTACKSTRSIZE];
11283
11284			if ((nframes = DTRACE_USTACK_NFRAMES(arg)) == 0)
11285				nframes = opt[DTRACEOPT_JSTACKFRAMES];
11286
11287			arg = DTRACE_USTACK_ARG(nframes, strsize);
11288
11289			/*FALLTHROUGH*/
11290		case DTRACEACT_USTACK:
11291			if (desc->dtad_kind != DTRACEACT_JSTACK &&
11292			    (nframes = DTRACE_USTACK_NFRAMES(arg)) == 0) {
11293				strsize = DTRACE_USTACK_STRSIZE(arg);
11294				nframes = opt[DTRACEOPT_USTACKFRAMES];
11295				ASSERT(nframes > 0);
11296				arg = DTRACE_USTACK_ARG(nframes, strsize);
11297			}
11298
11299			/*
11300			 * Save a slot for the pid.
11301			 */
11302			size = (nframes + 1) * sizeof (uint64_t);
11303			size += DTRACE_USTACK_STRSIZE(arg);
11304			size = P2ROUNDUP(size, (uint32_t)(sizeof (uintptr_t)));
11305
11306			break;
11307
11308		case DTRACEACT_SYM:
11309		case DTRACEACT_MOD:
11310			if (dp == NULL || ((size = dp->dtdo_rtype.dtdt_size) !=
11311			    sizeof (uint64_t)) ||
11312			    (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
11313				return (EINVAL);
11314			break;
11315
11316		case DTRACEACT_USYM:
11317		case DTRACEACT_UMOD:
11318		case DTRACEACT_UADDR:
11319			if (dp == NULL ||
11320			    (dp->dtdo_rtype.dtdt_size != sizeof (uint64_t)) ||
11321			    (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
11322				return (EINVAL);
11323
11324			/*
11325			 * We have a slot for the pid, plus a slot for the
11326			 * argument.  To keep things simple (aligned with
11327			 * bitness-neutral sizing), we store each as a 64-bit
11328			 * quantity.
11329			 */
11330			size = 2 * sizeof (uint64_t);
11331			break;
11332
11333		case DTRACEACT_STOP:
11334		case DTRACEACT_BREAKPOINT:
11335		case DTRACEACT_PANIC:
11336			break;
11337
11338		case DTRACEACT_CHILL:
11339		case DTRACEACT_DISCARD:
11340		case DTRACEACT_RAISE:
11341			if (dp == NULL)
11342				return (EINVAL);
11343			break;
11344
11345		case DTRACEACT_EXIT:
11346			if (dp == NULL ||
11347			    (size = dp->dtdo_rtype.dtdt_size) != sizeof (int) ||
11348			    (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
11349				return (EINVAL);
11350			break;
11351
11352		case DTRACEACT_SPECULATE:
11353			if (ecb->dte_size > sizeof (dtrace_rechdr_t))
11354				return (EINVAL);
11355
11356			if (dp == NULL)
11357				return (EINVAL);
11358
11359			state->dts_speculates = 1;
11360			break;
11361
11362		case DTRACEACT_PRINTM:
11363		    	size = dp->dtdo_rtype.dtdt_size;
11364			break;
11365
11366		case DTRACEACT_PRINTT:
11367		    	size = dp->dtdo_rtype.dtdt_size;
11368			break;
11369
11370		case DTRACEACT_COMMIT: {
11371			dtrace_action_t *act = ecb->dte_action;
11372
11373			for (; act != NULL; act = act->dta_next) {
11374				if (act->dta_kind == DTRACEACT_COMMIT)
11375					return (EINVAL);
11376			}
11377
11378			if (dp == NULL)
11379				return (EINVAL);
11380			break;
11381		}
11382
11383		default:
11384			return (EINVAL);
11385		}
11386
11387		if (size != 0 || desc->dtad_kind == DTRACEACT_SPECULATE) {
11388			/*
11389			 * If this is a data-storing action or a speculate,
11390			 * we must be sure that there isn't a commit on the
11391			 * action chain.
11392			 */
11393			dtrace_action_t *act = ecb->dte_action;
11394
11395			for (; act != NULL; act = act->dta_next) {
11396				if (act->dta_kind == DTRACEACT_COMMIT)
11397					return (EINVAL);
11398			}
11399		}
11400
11401		action = kmem_zalloc(sizeof (dtrace_action_t), KM_SLEEP);
11402		action->dta_rec.dtrd_size = size;
11403	}
11404
11405	action->dta_refcnt = 1;
11406	rec = &action->dta_rec;
11407	size = rec->dtrd_size;
11408
11409	for (mask = sizeof (uint64_t) - 1; size != 0 && mask > 0; mask >>= 1) {
11410		if (!(size & mask)) {
11411			align = mask + 1;
11412			break;
11413		}
11414	}
11415
11416	action->dta_kind = desc->dtad_kind;
11417
11418	if ((action->dta_difo = dp) != NULL)
11419		dtrace_difo_hold(dp);
11420
11421	rec->dtrd_action = action->dta_kind;
11422	rec->dtrd_arg = arg;
11423	rec->dtrd_uarg = desc->dtad_uarg;
11424	rec->dtrd_alignment = (uint16_t)align;
11425	rec->dtrd_format = format;
11426
11427	if ((last = ecb->dte_action_last) != NULL) {
11428		ASSERT(ecb->dte_action != NULL);
11429		action->dta_prev = last;
11430		last->dta_next = action;
11431	} else {
11432		ASSERT(ecb->dte_action == NULL);
11433		ecb->dte_action = action;
11434	}
11435
11436	ecb->dte_action_last = action;
11437
11438	return (0);
11439}
11440
/*
 * Remove (and destroy) all actions from an ECB.  If the action list is
 * shared with other ECBs (refcnt > 1) only the reference is dropped;
 * otherwise each action's format, DIFO and the action itself (or its
 * containing aggregation) are freed.
 */
static void
dtrace_ecb_action_remove(dtrace_ecb_t *ecb)
{
	dtrace_action_t *act = ecb->dte_action, *next;
	dtrace_vstate_t *vstate = &ecb->dte_state->dts_vstate;
	dtrace_difo_t *dp;
	uint16_t format;

	if (act != NULL && act->dta_refcnt > 1) {
		/* Shared list:  only the head carries the shared refcount. */
		ASSERT(act->dta_next == NULL || act->dta_next->dta_refcnt == 1);
		act->dta_refcnt--;
	} else {
		for (; act != NULL; act = next) {
			next = act->dta_next;
			ASSERT(next != NULL || act == ecb->dte_action_last);
			ASSERT(act->dta_refcnt == 1);

			if ((format = act->dta_rec.dtrd_format) != 0)
				dtrace_format_remove(ecb->dte_state, format);

			if ((dp = act->dta_difo) != NULL)
				dtrace_difo_release(dp, vstate);

			if (DTRACEACT_ISAGG(act->dta_kind)) {
				dtrace_ecb_aggregation_destroy(ecb, act);
			} else {
				kmem_free(act, sizeof (dtrace_action_t));
			}
		}
	}

	ecb->dte_action = NULL;
	ecb->dte_action_last = NULL;
	ecb->dte_size = 0;
}
11476
/*
 * Unlink an ECB from its probe's chain and dtrace_sync() so no CPU can
 * still see it.  If it was the last ECB on the probe, the probe is
 * disabled with its provider.  The caller must hold dtrace_lock.
 */
static void
dtrace_ecb_disable(dtrace_ecb_t *ecb)
{
	/*
	 * We disable the ECB by removing it from its probe.
	 */
	dtrace_ecb_t *pecb, *prev = NULL;
	dtrace_probe_t *probe = ecb->dte_probe;

	ASSERT(MUTEX_HELD(&dtrace_lock));

	if (probe == NULL) {
		/*
		 * This is the NULL probe; there is nothing to disable.
		 */
		return;
	}

	/* Find the ECB on the probe's chain, tracking its predecessor. */
	for (pecb = probe->dtpr_ecb; pecb != NULL; pecb = pecb->dte_next) {
		if (pecb == ecb)
			break;
		prev = pecb;
	}

	ASSERT(pecb != NULL);

	if (prev == NULL) {
		probe->dtpr_ecb = ecb->dte_next;
	} else {
		prev->dte_next = ecb->dte_next;
	}

	if (ecb == probe->dtpr_ecb_last) {
		ASSERT(ecb->dte_next == NULL);
		probe->dtpr_ecb_last = prev;
	}

	/*
	 * The ECB has been disconnected from the probe; now sync to assure
	 * that all CPUs have seen the change.
	 */
	dtrace_sync();

	if (probe->dtpr_ecb == NULL) {
		/*
		 * That was the last ECB on the probe; clear the predicate
		 * cache ID for the probe, disable it and sync one more time
		 * to assure that we'll never hit it again.
		 */
		dtrace_provider_t *prov = probe->dtpr_provider;

		ASSERT(ecb->dte_next == NULL);
		ASSERT(probe->dtpr_ecb_last == NULL);
		probe->dtpr_predcache = DTRACE_CACHEIDNONE;
		prov->dtpv_pops.dtps_disable(prov->dtpv_arg,
		    probe->dtpr_id, probe->dtpr_arg);
		dtrace_sync();
	} else {
		/*
		 * There is at least one ECB remaining on the probe.  If there
		 * is _exactly_ one, set the probe's predicate cache ID to be
		 * the predicate cache ID of the remaining ECB.
		 */
		ASSERT(probe->dtpr_ecb_last != NULL);
		ASSERT(probe->dtpr_predcache == DTRACE_CACHEIDNONE);

		if (probe->dtpr_ecb == probe->dtpr_ecb_last) {
			dtrace_predicate_t *p = probe->dtpr_ecb->dte_predicate;

			ASSERT(probe->dtpr_ecb->dte_next == NULL);

			if (p != NULL)
				probe->dtpr_predcache = p->dtp_cacheid;
		}

		ecb->dte_next = NULL;
	}
}
11555
/*
 * Destroy a disabled ECB:  release its predicate and actions, clear its
 * EPID slot in the state, and free it.  The ECB must already have been
 * removed from its probe (see dtrace_ecb_disable()).
 */
static void
dtrace_ecb_destroy(dtrace_ecb_t *ecb)
{
	dtrace_state_t *state = ecb->dte_state;
	dtrace_vstate_t *vstate = &state->dts_vstate;
	dtrace_predicate_t *pred;
	dtrace_epid_t epid = ecb->dte_epid;

	ASSERT(MUTEX_HELD(&dtrace_lock));
	ASSERT(ecb->dte_next == NULL);
	ASSERT(ecb->dte_probe == NULL || ecb->dte_probe->dtpr_ecb != ecb);

	if ((pred = ecb->dte_predicate) != NULL)
		dtrace_predicate_release(pred, vstate);

	dtrace_ecb_action_remove(ecb);

	ASSERT(state->dts_ecbs[epid - 1] == ecb);
	state->dts_ecbs[epid - 1] = NULL;

	kmem_free(ecb, sizeof (dtrace_ecb_t));
}
11578
/*
 * Create an ECB for the given probe from the enabling's current ECB
 * description:  attach the predicate, apply implicit privilege-based
 * predicate conditions, and build the action list -- or share the
 * cached action list left by a previous identical creation.  On
 * action-creation failure the error is recorded in enab->dten_error
 * and NULL is returned.
 */
static dtrace_ecb_t *
dtrace_ecb_create(dtrace_state_t *state, dtrace_probe_t *probe,
    dtrace_enabling_t *enab)
{
	dtrace_ecb_t *ecb;
	dtrace_predicate_t *pred;
	dtrace_actdesc_t *act;
	dtrace_provider_t *prov;
	dtrace_ecbdesc_t *desc = enab->dten_current;

	ASSERT(MUTEX_HELD(&dtrace_lock));
	ASSERT(state != NULL);

	ecb = dtrace_ecb_add(state, probe);
	ecb->dte_uarg = desc->dted_uarg;

	if ((pred = desc->dted_pred.dtpdd_predicate) != NULL) {
		dtrace_predicate_hold(pred);
		ecb->dte_predicate = pred;
	}

	if (probe != NULL) {
		/*
		 * If the provider shows more leg than the consumer is old
		 * enough to see, we need to enable the appropriate implicit
		 * predicate bits to prevent the ecb from activating at
		 * revealing times.
		 *
		 * Providers specifying DTRACE_PRIV_USER at register time
		 * are stating that they need the /proc-style privilege
		 * model to be enforced, and this is what DTRACE_COND_OWNER
		 * and DTRACE_COND_ZONEOWNER will then do at probe time.
		 */
		prov = probe->dtpr_provider;
		if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLPROC) &&
		    (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
			ecb->dte_cond |= DTRACE_COND_OWNER;

		if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLZONE) &&
		    (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
			ecb->dte_cond |= DTRACE_COND_ZONEOWNER;

		/*
		 * If the provider shows us kernel innards and the user
		 * is lacking sufficient privilege, enable the
		 * DTRACE_COND_USERMODE implicit predicate.
		 */
		if (!(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) &&
		    (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_KERNEL))
			ecb->dte_cond |= DTRACE_COND_USERMODE;
	}

	if (dtrace_ecb_create_cache != NULL) {
		/*
		 * If we have a cached ecb, we'll use its action list instead
		 * of creating our own (saving both time and space).
		 */
		dtrace_ecb_t *cached = dtrace_ecb_create_cache;
		dtrace_action_t *act = cached->dte_action;

		if (act != NULL) {
			ASSERT(act->dta_refcnt > 0);
			act->dta_refcnt++;
			ecb->dte_action = act;
			ecb->dte_action_last = cached->dte_action_last;
			ecb->dte_needed = cached->dte_needed;
			ecb->dte_size = cached->dte_size;
			ecb->dte_alignment = cached->dte_alignment;
		}

		return (ecb);
	}

	for (act = desc->dted_action; act != NULL; act = act->dtad_next) {
		if ((enab->dten_error = dtrace_ecb_action_add(ecb, act)) != 0) {
			dtrace_ecb_destroy(ecb);
			return (NULL);
		}
	}

	dtrace_ecb_resize(ecb);

	/* Cache this ECB so identical subsequent creations can share it. */
	return (dtrace_ecb_create_cache = ecb);
}
11663
11664static int
11665dtrace_ecb_create_enable(dtrace_probe_t *probe, void *arg)
11666{
11667	dtrace_ecb_t *ecb;
11668	dtrace_enabling_t *enab = arg;
11669	dtrace_state_t *state = enab->dten_vstate->dtvs_state;
11670
11671	ASSERT(state != NULL);
11672
11673	if (probe != NULL && probe->dtpr_gen < enab->dten_probegen) {
11674		/*
11675		 * This probe was created in a generation for which this
11676		 * enabling has previously created ECBs; we don't want to
11677		 * enable it again, so just kick out.
11678		 */
11679		return (DTRACE_MATCH_NEXT);
11680	}
11681
11682	if ((ecb = dtrace_ecb_create(state, probe, enab)) == NULL)
11683		return (DTRACE_MATCH_DONE);
11684
11685	dtrace_ecb_enable(ecb);
11686	return (DTRACE_MATCH_NEXT);
11687}
11688
11689static dtrace_ecb_t *
11690dtrace_epid2ecb(dtrace_state_t *state, dtrace_epid_t id)
11691{
11692	dtrace_ecb_t *ecb;
11693
11694	ASSERT(MUTEX_HELD(&dtrace_lock));
11695
11696	if (id == 0 || id > state->dts_necbs)
11697		return (NULL);
11698
11699	ASSERT(state->dts_necbs > 0 && state->dts_ecbs != NULL);
11700	ASSERT((ecb = state->dts_ecbs[id - 1]) == NULL || ecb->dte_epid == id);
11701
11702	return (state->dts_ecbs[id - 1]);
11703}
11704
11705static dtrace_aggregation_t *
11706dtrace_aggid2agg(dtrace_state_t *state, dtrace_aggid_t id)
11707{
11708	dtrace_aggregation_t *agg;
11709
11710	ASSERT(MUTEX_HELD(&dtrace_lock));
11711
11712	if (id == 0 || id > state->dts_naggregations)
11713		return (NULL);
11714
11715	ASSERT(state->dts_naggregations > 0 && state->dts_aggregations != NULL);
11716	ASSERT((agg = state->dts_aggregations[id - 1]) == NULL ||
11717	    agg->dtag_id == id);
11718
11719	return (state->dts_aggregations[id - 1]);
11720}
11721
11722/*
11723 * DTrace Buffer Functions
11724 *
11725 * The following functions manipulate DTrace buffers.  Most of these functions
11726 * are called in the context of establishing or processing consumer state;
11727 * exceptions are explicitly noted.
11728 */
11729
11730/*
11731 * Note:  called from cross call context.  This function switches the two
11732 * buffers on a given CPU.  The atomicity of this operation is assured by
11733 * disabling interrupts while the actual switch takes place; the disabling of
11734 * interrupts serializes the execution with any execution of dtrace_probe() on
11735 * the same CPU.
11736 */
static void
dtrace_buffer_switch(dtrace_buffer_t *buf)
{
	caddr_t tomax = buf->dtb_tomax;
	caddr_t xamot = buf->dtb_xamot;
	dtrace_icookie_t cookie;
	hrtime_t now;

	/* Switching makes no sense for no-switch or ring buffers. */
	ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
	ASSERT(!(buf->dtb_flags & DTRACEBUF_RING));

	cookie = dtrace_interrupt_disable();
	now = dtrace_gethrtime();
	/* Exchange the active (tomax) and inactive (xamot) buffers... */
	buf->dtb_tomax = xamot;
	buf->dtb_xamot = tomax;
	/* ...and move the active side's accounting over to the xamot side. */
	buf->dtb_xamot_drops = buf->dtb_drops;
	buf->dtb_xamot_offset = buf->dtb_offset;
	buf->dtb_xamot_errors = buf->dtb_errors;
	buf->dtb_xamot_flags = buf->dtb_flags;
	/* The newly-active buffer starts empty. */
	buf->dtb_offset = 0;
	buf->dtb_drops = 0;
	buf->dtb_errors = 0;
	buf->dtb_flags &= ~(DTRACEBUF_ERROR | DTRACEBUF_DROPPED);
	buf->dtb_interval = now - buf->dtb_switched;
	buf->dtb_switched = now;
	dtrace_interrupt_enable(cookie);
}
11764
11765/*
11766 * Note:  called from cross call context.  This function activates a buffer
11767 * on a CPU.  As with dtrace_buffer_switch(), the atomicity of the operation
11768 * is guaranteed by the disabling of interrupts.
11769 */
11770static void
11771dtrace_buffer_activate(dtrace_state_t *state)
11772{
11773	dtrace_buffer_t *buf;
11774	dtrace_icookie_t cookie = dtrace_interrupt_disable();
11775
11776	buf = &state->dts_buffer[curcpu];
11777
11778	if (buf->dtb_tomax != NULL) {
11779		/*
11780		 * We might like to assert that the buffer is marked inactive,
11781		 * but this isn't necessarily true:  the buffer for the CPU
11782		 * that processes the BEGIN probe has its buffer activated
11783		 * manually.  In this case, we take the (harmless) action
11784		 * re-clearing the bit INACTIVE bit.
11785		 */
11786		buf->dtb_flags &= ~DTRACEBUF_INACTIVE;
11787	}
11788
11789	dtrace_interrupt_enable(cookie);
11790}
11791
/*
 * Allocate the principal (dtb_tomax) and -- unless DTRACEBUF_NOSWITCH is
 * set -- the switch (dtb_xamot) buffer of the given size for the specified
 * CPU, or for every CPU if cpu == DTRACE_CPUALL.  On allocation failure,
 * all buffers allocated here are freed again, *factor is set to the ratio
 * of desired to successfully-allocated buffers (so the caller can scale
 * down the requested size before retrying), and ENOMEM is returned.
 */
static int
dtrace_buffer_alloc(dtrace_buffer_t *bufs, size_t size, int flags,
    processorid_t cpu, int *factor)
{
#if defined(sun)
	cpu_t *cp;
#endif
	dtrace_buffer_t *buf;
	int allocated = 0, desired = 0;

#if defined(sun)
	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(MUTEX_HELD(&dtrace_lock));

	*factor = 1;

	if (size > dtrace_nonroot_maxsize &&
	    !PRIV_POLICY_CHOICE(CRED(), PRIV_ALL, B_FALSE))
		return (EFBIG);

	cp = cpu_list;

	do {
		if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
			continue;

		buf = &bufs[cp->cpu_id];

		/*
		 * If there is already a buffer allocated for this CPU, it
		 * is only possible that this is a DR event.  In this case,
		 * the buffer size must match our specified size.
		 */
		if (buf->dtb_tomax != NULL) {
			ASSERT(buf->dtb_size == size);
			continue;
		}

		ASSERT(buf->dtb_xamot == NULL);

		if ((buf->dtb_tomax = kmem_zalloc(size,
		    KM_NOSLEEP | KM_NORMALPRI)) == NULL)
			goto err;

		buf->dtb_size = size;
		buf->dtb_flags = flags;
		buf->dtb_offset = 0;
		buf->dtb_drops = 0;

		if (flags & DTRACEBUF_NOSWITCH)
			continue;

		if ((buf->dtb_xamot = kmem_zalloc(size,
		    KM_NOSLEEP | KM_NORMALPRI)) == NULL)
			goto err;
	} while ((cp = cp->cpu_next) != cpu_list);

	return (0);

err:
	cp = cpu_list;

	do {
		if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
			continue;

		buf = &bufs[cp->cpu_id];

		/*
		 * Each CPU wanted two buffers (principal and switch);
		 * account for both so *factor reflects the shortfall.
		 */
		desired += 2;

		if (buf->dtb_xamot != NULL) {
			ASSERT(buf->dtb_tomax != NULL);
			ASSERT(buf->dtb_size == size);
			kmem_free(buf->dtb_xamot, size);
			allocated++;
		}

		if (buf->dtb_tomax != NULL) {
			ASSERT(buf->dtb_size == size);
			kmem_free(buf->dtb_tomax, size);
			allocated++;
		}

		buf->dtb_tomax = NULL;
		buf->dtb_xamot = NULL;
		buf->dtb_size = 0;
	} while ((cp = cp->cpu_next) != cpu_list);
#else
	int i;

	*factor = 1;
#if defined(__amd64__) || defined(__mips__) || defined(__powerpc__)
	/*
	 * FreeBSD isn't good at limiting the amount of memory we
	 * ask to malloc, so let's place a limit here before trying
	 * to do something that might well end in tears at bedtime.
	 */
	if (size > physmem * PAGE_SIZE / (128 * (mp_maxid + 1)))
		return (ENOMEM);
#endif

	ASSERT(MUTEX_HELD(&dtrace_lock));
	CPU_FOREACH(i) {
		if (cpu != DTRACE_CPUALL && cpu != i)
			continue;

		buf = &bufs[i];

		/*
		 * If there is already a buffer allocated for this CPU, it
		 * is only possible that this is a DR event.  In this case,
		 * the buffer size must match our specified size.
		 */
		if (buf->dtb_tomax != NULL) {
			ASSERT(buf->dtb_size == size);
			continue;
		}

		ASSERT(buf->dtb_xamot == NULL);

		if ((buf->dtb_tomax = kmem_zalloc(size,
		    KM_NOSLEEP | KM_NORMALPRI)) == NULL)
			goto err;

		buf->dtb_size = size;
		buf->dtb_flags = flags;
		buf->dtb_offset = 0;
		buf->dtb_drops = 0;

		if (flags & DTRACEBUF_NOSWITCH)
			continue;

		if ((buf->dtb_xamot = kmem_zalloc(size,
		    KM_NOSLEEP | KM_NORMALPRI)) == NULL)
			goto err;
	}

	return (0);

err:
	/*
	 * Error allocating memory, so free the buffers that were
	 * allocated before the failed allocation.
	 */
	CPU_FOREACH(i) {
		if (cpu != DTRACE_CPUALL && cpu != i)
			continue;

		buf = &bufs[i];

		/*
		 * Each CPU wanted two buffers (principal and switch);
		 * account for both so *factor reflects the shortfall.
		 */
		desired += 2;

		if (buf->dtb_xamot != NULL) {
			ASSERT(buf->dtb_tomax != NULL);
			ASSERT(buf->dtb_size == size);
			kmem_free(buf->dtb_xamot, size);
			allocated++;
		}

		if (buf->dtb_tomax != NULL) {
			ASSERT(buf->dtb_size == size);
			kmem_free(buf->dtb_tomax, size);
			allocated++;
		}

		buf->dtb_tomax = NULL;
		buf->dtb_xamot = NULL;
		buf->dtb_size = 0;

	}
#endif
	/*
	 * Tell the caller how much smaller its next request should be:
	 * the ratio of buffers we wanted to buffers we actually got.
	 */
	*factor = desired / (allocated > 0 ? allocated : 1);

	return (ENOMEM);
}
11964
11965/*
11966 * Note:  called from probe context.  This function just increments the drop
11967 * count on a buffer.  It has been made a function to allow for the
11968 * possibility of understanding the source of mysterious drop counts.  (A
11969 * problem for which one may be particularly disappointed that DTrace cannot
11970 * be used to understand DTrace.)
11971 */
11972static void
11973dtrace_buffer_drop(dtrace_buffer_t *buf)
11974{
11975	buf->dtb_drops++;
11976}
11977
11978/*
11979 * Note:  called from probe context.  This function is called to reserve space
11980 * in a buffer.  If mstate is non-NULL, sets the scratch base and size in the
11981 * mstate.  Returns the new offset in the buffer, or a negative value if an
11982 * error has occurred.
11983 */
static intptr_t
dtrace_buffer_reserve(dtrace_buffer_t *buf, size_t needed, size_t align,
    dtrace_state_t *state, dtrace_mstate_t *mstate)
{
	intptr_t offs = buf->dtb_offset, soffs;
	intptr_t woffs;
	caddr_t tomax;
	size_t total;

	if (buf->dtb_flags & DTRACEBUF_INACTIVE)
		return (-1);

	if ((tomax = buf->dtb_tomax) == NULL) {
		dtrace_buffer_drop(buf);
		return (-1);
	}

	/*
	 * The simple case:  a switching (or NOSWITCH) buffer.  Pad up to
	 * the requested alignment, check for room, and optionally point
	 * the mstate's scratch region at the unreserved tail.
	 */
	if (!(buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL))) {
		while (offs & (align - 1)) {
			/*
			 * Assert that our alignment is off by a number which
			 * is itself sizeof (uint32_t) aligned.
			 */
			ASSERT(!((align - (offs & (align - 1))) &
			    (sizeof (uint32_t) - 1)));
			DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
			offs += sizeof (uint32_t);
		}

		if ((soffs = offs + needed) > buf->dtb_size) {
			dtrace_buffer_drop(buf);
			return (-1);
		}

		if (mstate == NULL)
			return (offs);

		/*
		 * Scratch space is whatever lies beyond the reservation.
		 */
		mstate->dtms_scratch_base = (uintptr_t)tomax + soffs;
		mstate->dtms_scratch_size = buf->dtb_size - soffs;
		mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;

		return (offs);
	}

	/*
	 * A fill buffer that has filled refuses further reservations --
	 * unless the state has reached cooldown, in which case the
	 * reservation is still permitted.
	 */
	if (buf->dtb_flags & DTRACEBUF_FILL) {
		if (state->dts_activity != DTRACE_ACTIVITY_COOLDOWN &&
		    (buf->dtb_flags & DTRACEBUF_FULL))
			return (-1);
		goto out;
	}

	/* Total space needed, including any leading alignment padding. */
	total = needed + (offs & (align - 1));

	/*
	 * For a ring buffer, life is quite a bit more complicated.  Before
	 * we can store any padding, we need to adjust our wrapping offset.
	 * (If we've never before wrapped or we're not about to, no adjustment
	 * is required.)
	 */
	if ((buf->dtb_flags & DTRACEBUF_WRAPPED) ||
	    offs + total > buf->dtb_size) {
		/*
		 * For ring buffers, dtb_xamot_offset holds the wrapped
		 * offset:  the point up to which old records have been
		 * sacrificed to make room for newer ones.
		 */
		woffs = buf->dtb_xamot_offset;

		if (offs + total > buf->dtb_size) {
			/*
			 * We can't fit in the end of the buffer.  First, a
			 * sanity check that we can fit in the buffer at all.
			 */
			if (total > buf->dtb_size) {
				dtrace_buffer_drop(buf);
				return (-1);
			}

			/*
			 * We're going to be storing at the top of the buffer,
			 * so now we need to deal with the wrapped offset.  We
			 * only reset our wrapped offset to 0 if it is
			 * currently greater than the current offset.  If it
			 * is less than the current offset, it is because a
			 * previous allocation induced a wrap -- but the
			 * allocation didn't subsequently take the space due
			 * to an error or false predicate evaluation.  In this
			 * case, we'll just leave the wrapped offset alone: if
			 * the wrapped offset hasn't been advanced far enough
			 * for this allocation, it will be adjusted in the
			 * lower loop.
			 */
			if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
				if (woffs >= offs)
					woffs = 0;
			} else {
				woffs = 0;
			}

			/*
			 * Now we know that we're going to be storing to the
			 * top of the buffer and that there is room for us
			 * there.  We need to clear the buffer from the current
			 * offset to the end (there may be old gunk there).
			 */
			while (offs < buf->dtb_size)
				tomax[offs++] = 0;

			/*
			 * We need to set our offset to zero.  And because we
			 * are wrapping, we need to set the bit indicating as
			 * much.  We can also adjust our needed space back
			 * down to the space required by the ECB -- we know
			 * that the top of the buffer is aligned.
			 */
			offs = 0;
			total = needed;
			buf->dtb_flags |= DTRACEBUF_WRAPPED;
		} else {
			/*
			 * There is room for us in the buffer, so we simply
			 * need to check the wrapped offset.
			 */
			if (woffs < offs) {
				/*
				 * The wrapped offset is less than the offset.
				 * This can happen if we allocated buffer space
				 * that induced a wrap, but then we didn't
				 * subsequently take the space due to an error
				 * or false predicate evaluation.  This is
				 * okay; we know that _this_ allocation isn't
				 * going to induce a wrap.  We still can't
				 * reset the wrapped offset to be zero,
				 * however: the space may have been trashed in
				 * the previous failed probe attempt.  But at
				 * least the wrapped offset doesn't need to
				 * be adjusted at all...
				 */
				goto out;
			}
		}

		/*
		 * Advance the wrapped offset over whole records (or
		 * EPID-none padding words) until the reservation fits
		 * between offs and woffs.
		 */
		while (offs + total > woffs) {
			dtrace_epid_t epid = *(uint32_t *)(tomax + woffs);
			size_t size;

			if (epid == DTRACE_EPIDNONE) {
				size = sizeof (uint32_t);
			} else {
				ASSERT3U(epid, <=, state->dts_necbs);
				ASSERT(state->dts_ecbs[epid - 1] != NULL);

				size = state->dts_ecbs[epid - 1]->dte_size;
			}

			ASSERT(woffs + size <= buf->dtb_size);
			ASSERT(size != 0);

			if (woffs + size == buf->dtb_size) {
				/*
				 * We've reached the end of the buffer; we want
				 * to set the wrapped offset to 0 and break
				 * out.  However, if the offs is 0, then we're
				 * in a strange edge-condition:  the amount of
				 * space that we want to reserve plus the size
				 * of the record that we're overwriting is
				 * greater than the size of the buffer.  This
				 * is problematic because if we reserve the
				 * space but subsequently don't consume it (due
				 * to a failed predicate or error) the wrapped
				 * offset will be 0 -- yet the EPID at offset 0
				 * will not be committed.  This situation is
				 * relatively easy to deal with:  if we're in
				 * this case, the buffer is indistinguishable
				 * from one that hasn't wrapped; we need only
				 * finish the job by clearing the wrapped bit,
				 * explicitly setting the offset to be 0, and
				 * zero'ing out the old data in the buffer.
				 */
				if (offs == 0) {
					buf->dtb_flags &= ~DTRACEBUF_WRAPPED;
					buf->dtb_offset = 0;
					woffs = total;

					while (woffs < buf->dtb_size)
						tomax[woffs++] = 0;
				}

				woffs = 0;
				break;
			}

			woffs += size;
		}

		/*
		 * We have a wrapped offset.  It may be that the wrapped offset
		 * has become zero -- that's okay.
		 */
		buf->dtb_xamot_offset = woffs;
	}

out:
	/*
	 * Now we can plow the buffer with any necessary padding.
	 */
	while (offs & (align - 1)) {
		/*
		 * Assert that our alignment is off by a number which
		 * is itself sizeof (uint32_t) aligned.
		 */
		ASSERT(!((align - (offs & (align - 1))) &
		    (sizeof (uint32_t) - 1)));
		DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
		offs += sizeof (uint32_t);
	}

	if (buf->dtb_flags & DTRACEBUF_FILL) {
		/*
		 * Honor the state's reserved space at the end of a fill
		 * buffer (dts_reserve):  once the reservation would intrude
		 * on it, the buffer is marked full.
		 */
		if (offs + needed > buf->dtb_size - state->dts_reserve) {
			buf->dtb_flags |= DTRACEBUF_FULL;
			return (-1);
		}
	}

	if (mstate == NULL)
		return (offs);

	/*
	 * For ring buffers and fill buffers, the scratch space is always
	 * the inactive buffer.
	 */
	mstate->dtms_scratch_base = (uintptr_t)buf->dtb_xamot;
	mstate->dtms_scratch_size = buf->dtb_size;
	mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;

	return (offs);
}
12216
12217static void
12218dtrace_buffer_polish(dtrace_buffer_t *buf)
12219{
12220	ASSERT(buf->dtb_flags & DTRACEBUF_RING);
12221	ASSERT(MUTEX_HELD(&dtrace_lock));
12222
12223	if (!(buf->dtb_flags & DTRACEBUF_WRAPPED))
12224		return;
12225
12226	/*
12227	 * We need to polish the ring buffer.  There are three cases:
12228	 *
12229	 * - The first (and presumably most common) is that there is no gap
12230	 *   between the buffer offset and the wrapped offset.  In this case,
12231	 *   there is nothing in the buffer that isn't valid data; we can
12232	 *   mark the buffer as polished and return.
12233	 *
12234	 * - The second (less common than the first but still more common
12235	 *   than the third) is that there is a gap between the buffer offset
12236	 *   and the wrapped offset, and the wrapped offset is larger than the
12237	 *   buffer offset.  This can happen because of an alignment issue, or
12238	 *   can happen because of a call to dtrace_buffer_reserve() that
12239	 *   didn't subsequently consume the buffer space.  In this case,
12240	 *   we need to zero the data from the buffer offset to the wrapped
12241	 *   offset.
12242	 *
12243	 * - The third (and least common) is that there is a gap between the
12244	 *   buffer offset and the wrapped offset, but the wrapped offset is
12245	 *   _less_ than the buffer offset.  This can only happen because a
12246	 *   call to dtrace_buffer_reserve() induced a wrap, but the space
12247	 *   was not subsequently consumed.  In this case, we need to zero the
12248	 *   space from the offset to the end of the buffer _and_ from the
12249	 *   top of the buffer to the wrapped offset.
12250	 */
12251	if (buf->dtb_offset < buf->dtb_xamot_offset) {
12252		bzero(buf->dtb_tomax + buf->dtb_offset,
12253		    buf->dtb_xamot_offset - buf->dtb_offset);
12254	}
12255
12256	if (buf->dtb_offset > buf->dtb_xamot_offset) {
12257		bzero(buf->dtb_tomax + buf->dtb_offset,
12258		    buf->dtb_size - buf->dtb_offset);
12259		bzero(buf->dtb_tomax, buf->dtb_xamot_offset);
12260	}
12261}
12262
12263/*
12264 * This routine determines if data generated at the specified time has likely
12265 * been entirely consumed at user-level.  This routine is called to determine
12266 * if an ECB on a defunct probe (but for an active enabling) can be safely
12267 * disabled and destroyed.
12268 */
12269static int
12270dtrace_buffer_consumed(dtrace_buffer_t *bufs, hrtime_t when)
12271{
12272	int i;
12273
12274	for (i = 0; i < NCPU; i++) {
12275		dtrace_buffer_t *buf = &bufs[i];
12276
12277		if (buf->dtb_size == 0)
12278			continue;
12279
12280		if (buf->dtb_flags & DTRACEBUF_RING)
12281			return (0);
12282
12283		if (!buf->dtb_switched && buf->dtb_offset != 0)
12284			return (0);
12285
12286		if (buf->dtb_switched - buf->dtb_interval < when)
12287			return (0);
12288	}
12289
12290	return (1);
12291}
12292
12293static void
12294dtrace_buffer_free(dtrace_buffer_t *bufs)
12295{
12296	int i;
12297
12298	for (i = 0; i < NCPU; i++) {
12299		dtrace_buffer_t *buf = &bufs[i];
12300
12301		if (buf->dtb_tomax == NULL) {
12302			ASSERT(buf->dtb_xamot == NULL);
12303			ASSERT(buf->dtb_size == 0);
12304			continue;
12305		}
12306
12307		if (buf->dtb_xamot != NULL) {
12308			ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
12309			kmem_free(buf->dtb_xamot, buf->dtb_size);
12310		}
12311
12312		kmem_free(buf->dtb_tomax, buf->dtb_size);
12313		buf->dtb_size = 0;
12314		buf->dtb_tomax = NULL;
12315		buf->dtb_xamot = NULL;
12316	}
12317}
12318
12319/*
12320 * DTrace Enabling Functions
12321 */
12322static dtrace_enabling_t *
12323dtrace_enabling_create(dtrace_vstate_t *vstate)
12324{
12325	dtrace_enabling_t *enab;
12326
12327	enab = kmem_zalloc(sizeof (dtrace_enabling_t), KM_SLEEP);
12328	enab->dten_vstate = vstate;
12329
12330	return (enab);
12331}
12332
12333static void
12334dtrace_enabling_add(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb)
12335{
12336	dtrace_ecbdesc_t **ndesc;
12337	size_t osize, nsize;
12338
12339	/*
12340	 * We can't add to enablings after we've enabled them, or after we've
12341	 * retained them.
12342	 */
12343	ASSERT(enab->dten_probegen == 0);
12344	ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
12345
12346	if (enab->dten_ndesc < enab->dten_maxdesc) {
12347		enab->dten_desc[enab->dten_ndesc++] = ecb;
12348		return;
12349	}
12350
12351	osize = enab->dten_maxdesc * sizeof (dtrace_enabling_t *);
12352
12353	if (enab->dten_maxdesc == 0) {
12354		enab->dten_maxdesc = 1;
12355	} else {
12356		enab->dten_maxdesc <<= 1;
12357	}
12358
12359	ASSERT(enab->dten_ndesc < enab->dten_maxdesc);
12360
12361	nsize = enab->dten_maxdesc * sizeof (dtrace_enabling_t *);
12362	ndesc = kmem_zalloc(nsize, KM_SLEEP);
12363	bcopy(enab->dten_desc, ndesc, osize);
12364	if (enab->dten_desc != NULL)
12365		kmem_free(enab->dten_desc, osize);
12366
12367	enab->dten_desc = ndesc;
12368	enab->dten_desc[enab->dten_ndesc++] = ecb;
12369}
12370
12371static void
12372dtrace_enabling_addlike(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb,
12373    dtrace_probedesc_t *pd)
12374{
12375	dtrace_ecbdesc_t *new;
12376	dtrace_predicate_t *pred;
12377	dtrace_actdesc_t *act;
12378
12379	/*
12380	 * We're going to create a new ECB description that matches the
12381	 * specified ECB in every way, but has the specified probe description.
12382	 */
12383	new = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
12384
12385	if ((pred = ecb->dted_pred.dtpdd_predicate) != NULL)
12386		dtrace_predicate_hold(pred);
12387
12388	for (act = ecb->dted_action; act != NULL; act = act->dtad_next)
12389		dtrace_actdesc_hold(act);
12390
12391	new->dted_action = ecb->dted_action;
12392	new->dted_pred = ecb->dted_pred;
12393	new->dted_probe = *pd;
12394	new->dted_uarg = ecb->dted_uarg;
12395
12396	dtrace_enabling_add(enab, new);
12397}
12398
12399static void
12400dtrace_enabling_dump(dtrace_enabling_t *enab)
12401{
12402	int i;
12403
12404	for (i = 0; i < enab->dten_ndesc; i++) {
12405		dtrace_probedesc_t *desc = &enab->dten_desc[i]->dted_probe;
12406
12407		cmn_err(CE_NOTE, "enabling probe %d (%s:%s:%s:%s)", i,
12408		    desc->dtpd_provider, desc->dtpd_mod,
12409		    desc->dtpd_func, desc->dtpd_name);
12410	}
12411}
12412
12413static void
12414dtrace_enabling_destroy(dtrace_enabling_t *enab)
12415{
12416	int i;
12417	dtrace_ecbdesc_t *ep;
12418	dtrace_vstate_t *vstate = enab->dten_vstate;
12419
12420	ASSERT(MUTEX_HELD(&dtrace_lock));
12421
12422	for (i = 0; i < enab->dten_ndesc; i++) {
12423		dtrace_actdesc_t *act, *next;
12424		dtrace_predicate_t *pred;
12425
12426		ep = enab->dten_desc[i];
12427
12428		if ((pred = ep->dted_pred.dtpdd_predicate) != NULL)
12429			dtrace_predicate_release(pred, vstate);
12430
12431		for (act = ep->dted_action; act != NULL; act = next) {
12432			next = act->dtad_next;
12433			dtrace_actdesc_release(act, vstate);
12434		}
12435
12436		kmem_free(ep, sizeof (dtrace_ecbdesc_t));
12437	}
12438
12439	if (enab->dten_desc != NULL)
12440		kmem_free(enab->dten_desc,
12441		    enab->dten_maxdesc * sizeof (dtrace_enabling_t *));
12442
12443	/*
12444	 * If this was a retained enabling, decrement the dts_nretained count
12445	 * and take it off of the dtrace_retained list.
12446	 */
12447	if (enab->dten_prev != NULL || enab->dten_next != NULL ||
12448	    dtrace_retained == enab) {
12449		ASSERT(enab->dten_vstate->dtvs_state != NULL);
12450		ASSERT(enab->dten_vstate->dtvs_state->dts_nretained > 0);
12451		enab->dten_vstate->dtvs_state->dts_nretained--;
12452		dtrace_retained_gen++;
12453	}
12454
12455	if (enab->dten_prev == NULL) {
12456		if (dtrace_retained == enab) {
12457			dtrace_retained = enab->dten_next;
12458
12459			if (dtrace_retained != NULL)
12460				dtrace_retained->dten_prev = NULL;
12461		}
12462	} else {
12463		ASSERT(enab != dtrace_retained);
12464		ASSERT(dtrace_retained != NULL);
12465		enab->dten_prev->dten_next = enab->dten_next;
12466	}
12467
12468	if (enab->dten_next != NULL) {
12469		ASSERT(dtrace_retained != NULL);
12470		enab->dten_next->dten_prev = enab->dten_prev;
12471	}
12472
12473	kmem_free(enab, sizeof (dtrace_enabling_t));
12474}
12475
12476static int
12477dtrace_enabling_retain(dtrace_enabling_t *enab)
12478{
12479	dtrace_state_t *state;
12480
12481	ASSERT(MUTEX_HELD(&dtrace_lock));
12482	ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
12483	ASSERT(enab->dten_vstate != NULL);
12484
12485	state = enab->dten_vstate->dtvs_state;
12486	ASSERT(state != NULL);
12487
12488	/*
12489	 * We only allow each state to retain dtrace_retain_max enablings.
12490	 */
12491	if (state->dts_nretained >= dtrace_retain_max)
12492		return (ENOSPC);
12493
12494	state->dts_nretained++;
12495	dtrace_retained_gen++;
12496
12497	if (dtrace_retained == NULL) {
12498		dtrace_retained = enab;
12499		return (0);
12500	}
12501
12502	enab->dten_next = dtrace_retained;
12503	dtrace_retained->dten_prev = enab;
12504	dtrace_retained = enab;
12505
12506	return (0);
12507}
12508
12509static int
12510dtrace_enabling_replicate(dtrace_state_t *state, dtrace_probedesc_t *match,
12511    dtrace_probedesc_t *create)
12512{
12513	dtrace_enabling_t *new, *enab;
12514	int found = 0, err = ENOENT;
12515
12516	ASSERT(MUTEX_HELD(&dtrace_lock));
12517	ASSERT(strlen(match->dtpd_provider) < DTRACE_PROVNAMELEN);
12518	ASSERT(strlen(match->dtpd_mod) < DTRACE_MODNAMELEN);
12519	ASSERT(strlen(match->dtpd_func) < DTRACE_FUNCNAMELEN);
12520	ASSERT(strlen(match->dtpd_name) < DTRACE_NAMELEN);
12521
12522	new = dtrace_enabling_create(&state->dts_vstate);
12523
12524	/*
12525	 * Iterate over all retained enablings, looking for enablings that
12526	 * match the specified state.
12527	 */
12528	for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
12529		int i;
12530
12531		/*
12532		 * dtvs_state can only be NULL for helper enablings -- and
12533		 * helper enablings can't be retained.
12534		 */
12535		ASSERT(enab->dten_vstate->dtvs_state != NULL);
12536
12537		if (enab->dten_vstate->dtvs_state != state)
12538			continue;
12539
12540		/*
12541		 * Now iterate over each probe description; we're looking for
12542		 * an exact match to the specified probe description.
12543		 */
12544		for (i = 0; i < enab->dten_ndesc; i++) {
12545			dtrace_ecbdesc_t *ep = enab->dten_desc[i];
12546			dtrace_probedesc_t *pd = &ep->dted_probe;
12547
12548			if (strcmp(pd->dtpd_provider, match->dtpd_provider))
12549				continue;
12550
12551			if (strcmp(pd->dtpd_mod, match->dtpd_mod))
12552				continue;
12553
12554			if (strcmp(pd->dtpd_func, match->dtpd_func))
12555				continue;
12556
12557			if (strcmp(pd->dtpd_name, match->dtpd_name))
12558				continue;
12559
12560			/*
12561			 * We have a winning probe!  Add it to our growing
12562			 * enabling.
12563			 */
12564			found = 1;
12565			dtrace_enabling_addlike(new, ep, create);
12566		}
12567	}
12568
12569	if (!found || (err = dtrace_enabling_retain(new)) != 0) {
12570		dtrace_enabling_destroy(new);
12571		return (err);
12572	}
12573
12574	return (0);
12575}
12576
12577static void
12578dtrace_enabling_retract(dtrace_state_t *state)
12579{
12580	dtrace_enabling_t *enab, *next;
12581
12582	ASSERT(MUTEX_HELD(&dtrace_lock));
12583
12584	/*
12585	 * Iterate over all retained enablings, destroy the enablings retained
12586	 * for the specified state.
12587	 */
12588	for (enab = dtrace_retained; enab != NULL; enab = next) {
12589		next = enab->dten_next;
12590
12591		/*
12592		 * dtvs_state can only be NULL for helper enablings -- and
12593		 * helper enablings can't be retained.
12594		 */
12595		ASSERT(enab->dten_vstate->dtvs_state != NULL);
12596
12597		if (enab->dten_vstate->dtvs_state == state) {
12598			ASSERT(state->dts_nretained > 0);
12599			dtrace_enabling_destroy(enab);
12600		}
12601	}
12602
12603	ASSERT(state->dts_nretained == 0);
12604}
12605
/*
 * Attempt to enable the probes for every ECB description in the given
 * enabling.  On success, the total number of probes matched is stored
 * through nmatched (if non-NULL) and 0 is returned; on failure, the error
 * reported through dten_error is returned.  A NULL nmatched indicates an
 * anonymous enabling, in which case errors are also logged via cmn_err().
 */
static int
dtrace_enabling_match(dtrace_enabling_t *enab, int *nmatched)
{
	int i = 0;
	int matched = 0;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(MUTEX_HELD(&dtrace_lock));

	for (i = 0; i < enab->dten_ndesc; i++) {
		dtrace_ecbdesc_t *ep = enab->dten_desc[i];

		/*
		 * dten_current and dten_error act as in/out parameters for
		 * the enable path:  the current description is published
		 * before the call, and any error is read back afterwards.
		 */
		enab->dten_current = ep;
		enab->dten_error = 0;

		matched += dtrace_probe_enable(&ep->dted_probe, enab);

		if (enab->dten_error != 0) {
			/*
			 * If we get an error half-way through enabling the
			 * probes, we kick out -- perhaps with some number of
			 * them enabled.  Leaving enabled probes enabled may
			 * be slightly confusing for user-level, but we expect
			 * that no one will attempt to actually drive on in
			 * the face of such errors.  If this is an anonymous
			 * enabling (indicated with a NULL nmatched pointer),
			 * we cmn_err() a message.  We aren't expecting to
			 * get such an error -- such as it can exist at all,
			 * it would be a result of corrupted DOF in the driver
			 * properties.
			 */
			if (nmatched == NULL) {
				cmn_err(CE_WARN, "dtrace_enabling_match() "
				    "error on %p: %d", (void *)ep,
				    enab->dten_error);
			}

			return (enab->dten_error);
		}
	}

	/* Record the probe generation against which we matched. */
	enab->dten_probegen = dtrace_probegen;
	if (nmatched != NULL)
		*nmatched = matched;

	return (0);
}
12653
/*
 * Match every retained enabling against the current set of probes.
 */
static void
dtrace_enabling_matchall(void)
{
	dtrace_enabling_t *enab;

	mutex_enter(&cpu_lock);
	mutex_enter(&dtrace_lock);

	/*
	 * Iterate over all retained enablings to see if any probes match
	 * against them.  We only perform this operation on enablings for which
	 * we have sufficient permissions by virtue of being in the global zone
	 * or in the same zone as the DTrace client.  Because we can be called
	 * after dtrace_detach() has been called, we cannot assert that there
	 * are retained enablings.  We can safely load from dtrace_retained,
	 * however:  the taskq_destroy() at the end of dtrace_detach() will
	 * block pending our completion.
	 */
	for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
		/*
		 * The zone check below applies only on Solaris; elsewhere
		 * the braceless if is compiled out and the match call is
		 * made unconditionally.
		 */
#if defined(sun)
		cred_t *cr = enab->dten_vstate->dtvs_state->dts_cred.dcr_cred;

		if (INGLOBALZONE(curproc) ||
		    cr != NULL && getzoneid() == crgetzoneid(cr))
#endif
			(void) dtrace_enabling_match(enab, NULL);
	}

	mutex_exit(&dtrace_lock);
	mutex_exit(&cpu_lock);
}
12685
12686/*
12687 * If an enabling is to be enabled without having matched probes (that is, if
12688 * dtrace_state_go() is to be called on the underlying dtrace_state_t), the
12689 * enabling must be _primed_ by creating an ECB for every ECB description.
12690 * This must be done to assure that we know the number of speculations, the
12691 * number of aggregations, the minimum buffer size needed, etc. before we
12692 * transition out of DTRACE_ACTIVITY_INACTIVE.  To do this without actually
 * enabling any probes, we create ECBs for every ECB description, but with a
12694 * NULL probe -- which is exactly what this function does.
12695 */
12696static void
12697dtrace_enabling_prime(dtrace_state_t *state)
12698{
12699	dtrace_enabling_t *enab;
12700	int i;
12701
12702	for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
12703		ASSERT(enab->dten_vstate->dtvs_state != NULL);
12704
12705		if (enab->dten_vstate->dtvs_state != state)
12706			continue;
12707
12708		/*
12709		 * We don't want to prime an enabling more than once, lest
12710		 * we allow a malicious user to induce resource exhaustion.
12711		 * (The ECBs that result from priming an enabling aren't
12712		 * leaked -- but they also aren't deallocated until the
12713		 * consumer state is destroyed.)
12714		 */
12715		if (enab->dten_primed)
12716			continue;
12717
12718		for (i = 0; i < enab->dten_ndesc; i++) {
12719			enab->dten_current = enab->dten_desc[i];
12720			(void) dtrace_probe_enable(NULL, enab);
12721		}
12722
12723		enab->dten_primed = 1;
12724	}
12725}
12726
12727/*
12728 * Called to indicate that probes should be provided due to retained
12729 * enablings.  This is implemented in terms of dtrace_probe_provide(), but it
12730 * must take an initial lap through the enabling calling the dtps_provide()
12731 * entry point explicitly to allow for autocreated probes.
12732 */
static void
dtrace_enabling_provide(dtrace_provider_t *prv)
{
	int i, all = 0;
	dtrace_probedesc_t desc;
	dtrace_genid_t gen;

	ASSERT(MUTEX_HELD(&dtrace_lock));
	ASSERT(MUTEX_HELD(&dtrace_provider_lock));

	/* A NULL provider means "all registered providers". */
	if (prv == NULL) {
		all = 1;
		prv = dtrace_provider;
	}

	do {
		dtrace_enabling_t *enab;
		void *parg = prv->dtpv_arg;

retry:
		gen = dtrace_retained_gen;
		for (enab = dtrace_retained; enab != NULL;
		    enab = enab->dten_next) {
			for (i = 0; i < enab->dten_ndesc; i++) {
				/*
				 * Copy the probe description by value:
				 * dtrace_lock is dropped across the
				 * dtps_provide() call, so we must not
				 * reference the enabling's own storage.
				 */
				desc = enab->dten_desc[i]->dted_probe;
				mutex_exit(&dtrace_lock);
				prv->dtpv_pops.dtps_provide(parg, &desc);
				mutex_enter(&dtrace_lock);
				/*
				 * Process the retained enablings again if
				 * they have changed while we weren't holding
				 * dtrace_lock.
				 */
				if (gen != dtrace_retained_gen)
					goto retry;
			}
		}
	} while (all && (prv = prv->dtpv_next) != NULL);

	/*
	 * Finally, drop dtrace_lock across the lap through
	 * dtrace_probe_provide() for the affected provider(s).
	 */
	mutex_exit(&dtrace_lock);
	dtrace_probe_provide(NULL, all ? NULL : prv);
	mutex_enter(&dtrace_lock);
}
12776
12777/*
12778 * Called to reap ECBs that are attached to probes from defunct providers.
12779 */
static void
dtrace_enabling_reap(void)
{
	dtrace_provider_t *prov;
	dtrace_probe_t *probe;
	dtrace_ecb_t *ecb;
	hrtime_t when;
	int i;

	/* cpu_lock before dtrace_lock, matching the other paths here. */
	mutex_enter(&cpu_lock);
	mutex_enter(&dtrace_lock);

	for (i = 0; i < dtrace_nprobes; i++) {
		if ((probe = dtrace_probes[i]) == NULL)
			continue;

		if (probe->dtpr_ecb == NULL)
			continue;

		prov = probe->dtpr_provider;

		/*
		 * dtpv_defunct holds the time at which the provider became
		 * defunct; zero means the provider is still live and its
		 * ECBs must not be reaped.
		 */
		if ((when = prov->dtpv_defunct) == 0)
			continue;

		/*
		 * We have ECBs on a defunct provider:  we want to reap these
		 * ECBs to allow the provider to unregister.  The destruction
		 * of these ECBs must be done carefully:  if we destroy the ECB
		 * and the consumer later wishes to consume an EPID that
		 * corresponds to the destroyed ECB (and if the EPID metadata
		 * has not been previously consumed), the consumer will abort
		 * processing on the unknown EPID.  To reduce (but not, sadly,
		 * eliminate) the possibility of this, we will only destroy an
		 * ECB for a defunct provider if, for the state that
		 * corresponds to the ECB:
		 *
		 *  (a)	There is no speculative tracing (which can effectively
		 *	cache an EPID for an arbitrary amount of time).
		 *
		 *  (b)	The principal buffers have been switched twice since the
		 *	provider became defunct.
		 *
		 *  (c)	The aggregation buffers are of zero size or have been
		 *	switched twice since the provider became defunct.
		 *
		 * We use dts_speculates to determine (a) and call a function
		 * (dtrace_buffer_consumed()) to determine (b) and (c).  Note
		 * that as soon as we've been unable to destroy one of the ECBs
		 * associated with the probe, we quit trying -- reaping is only
		 * fruitful in as much as we can destroy all ECBs associated
		 * with the defunct provider's probes.
		 */
		while ((ecb = probe->dtpr_ecb) != NULL) {
			dtrace_state_t *state = ecb->dte_state;
			dtrace_buffer_t *buf = state->dts_buffer;
			dtrace_buffer_t *aggbuf = state->dts_aggbuffer;

			if (state->dts_speculates)
				break;

			if (!dtrace_buffer_consumed(buf, when))
				break;

			if (!dtrace_buffer_consumed(aggbuf, when))
				break;

			/* Disabling must unhook the ECB from the probe. */
			dtrace_ecb_disable(ecb);
			ASSERT(probe->dtpr_ecb != ecb);
			dtrace_ecb_destroy(ecb);
		}
	}

	mutex_exit(&dtrace_lock);
	mutex_exit(&cpu_lock);
}
12855
12856/*
12857 * DTrace DOF Functions
12858 */
12859/*ARGSUSED*/
static void
dtrace_dof_error(dof_hdr_t *dof, const char *str)
{
	/*
	 * Report a DOF processing failure.  The offending DOF header is
	 * currently unused (hence the ARGSUSED above); the message is only
	 * logged when verbose DTrace error reporting is enabled.
	 */
	if (dtrace_err_verbose)
		cmn_err(CE_WARN, "failed to process DOF: %s", str);

#ifdef DTRACE_ERRDEBUG
	/* When error debugging is compiled in, also record the message. */
	dtrace_errdebug(str);
#endif
}
12870
12871/*
12872 * Create DOF out of a currently enabled state.  Right now, we only create
12873 * DOF containing the run-time options -- but this could be expanded to create
12874 * complete DOF representing the enabled state.
12875 */
12876static dof_hdr_t *
12877dtrace_dof_create(dtrace_state_t *state)
12878{
12879	dof_hdr_t *dof;
12880	dof_sec_t *sec;
12881	dof_optdesc_t *opt;
12882	int i, len = sizeof (dof_hdr_t) +
12883	    roundup(sizeof (dof_sec_t), sizeof (uint64_t)) +
12884	    sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
12885
12886	ASSERT(MUTEX_HELD(&dtrace_lock));
12887
12888	dof = kmem_zalloc(len, KM_SLEEP);
12889	dof->dofh_ident[DOF_ID_MAG0] = DOF_MAG_MAG0;
12890	dof->dofh_ident[DOF_ID_MAG1] = DOF_MAG_MAG1;
12891	dof->dofh_ident[DOF_ID_MAG2] = DOF_MAG_MAG2;
12892	dof->dofh_ident[DOF_ID_MAG3] = DOF_MAG_MAG3;
12893
12894	dof->dofh_ident[DOF_ID_MODEL] = DOF_MODEL_NATIVE;
12895	dof->dofh_ident[DOF_ID_ENCODING] = DOF_ENCODE_NATIVE;
12896	dof->dofh_ident[DOF_ID_VERSION] = DOF_VERSION;
12897	dof->dofh_ident[DOF_ID_DIFVERS] = DIF_VERSION;
12898	dof->dofh_ident[DOF_ID_DIFIREG] = DIF_DIR_NREGS;
12899	dof->dofh_ident[DOF_ID_DIFTREG] = DIF_DTR_NREGS;
12900
12901	dof->dofh_flags = 0;
12902	dof->dofh_hdrsize = sizeof (dof_hdr_t);
12903	dof->dofh_secsize = sizeof (dof_sec_t);
12904	dof->dofh_secnum = 1;	/* only DOF_SECT_OPTDESC */
12905	dof->dofh_secoff = sizeof (dof_hdr_t);
12906	dof->dofh_loadsz = len;
12907	dof->dofh_filesz = len;
12908	dof->dofh_pad = 0;
12909
12910	/*
12911	 * Fill in the option section header...
12912	 */
12913	sec = (dof_sec_t *)((uintptr_t)dof + sizeof (dof_hdr_t));
12914	sec->dofs_type = DOF_SECT_OPTDESC;
12915	sec->dofs_align = sizeof (uint64_t);
12916	sec->dofs_flags = DOF_SECF_LOAD;
12917	sec->dofs_entsize = sizeof (dof_optdesc_t);
12918
12919	opt = (dof_optdesc_t *)((uintptr_t)sec +
12920	    roundup(sizeof (dof_sec_t), sizeof (uint64_t)));
12921
12922	sec->dofs_offset = (uintptr_t)opt - (uintptr_t)dof;
12923	sec->dofs_size = sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
12924
12925	for (i = 0; i < DTRACEOPT_MAX; i++) {
12926		opt[i].dofo_option = i;
12927		opt[i].dofo_strtab = DOF_SECIDX_NONE;
12928		opt[i].dofo_value = state->dts_options[i];
12929	}
12930
12931	return (dof);
12932}
12933
12934static dof_hdr_t *
12935dtrace_dof_copyin(uintptr_t uarg, int *errp)
12936{
12937	dof_hdr_t hdr, *dof;
12938
12939	ASSERT(!MUTEX_HELD(&dtrace_lock));
12940
12941	/*
12942	 * First, we're going to copyin() the sizeof (dof_hdr_t).
12943	 */
12944	if (copyin((void *)uarg, &hdr, sizeof (hdr)) != 0) {
12945		dtrace_dof_error(NULL, "failed to copyin DOF header");
12946		*errp = EFAULT;
12947		return (NULL);
12948	}
12949
12950	/*
12951	 * Now we'll allocate the entire DOF and copy it in -- provided
12952	 * that the length isn't outrageous.
12953	 */
12954	if (hdr.dofh_loadsz >= dtrace_dof_maxsize) {
12955		dtrace_dof_error(&hdr, "load size exceeds maximum");
12956		*errp = E2BIG;
12957		return (NULL);
12958	}
12959
12960	if (hdr.dofh_loadsz < sizeof (hdr)) {
12961		dtrace_dof_error(&hdr, "invalid load size");
12962		*errp = EINVAL;
12963		return (NULL);
12964	}
12965
12966	dof = kmem_alloc(hdr.dofh_loadsz, KM_SLEEP);
12967
12968	if (copyin((void *)uarg, dof, hdr.dofh_loadsz) != 0 ||
12969	    dof->dofh_loadsz != hdr.dofh_loadsz) {
12970		kmem_free(dof, hdr.dofh_loadsz);
12971		*errp = EFAULT;
12972		return (NULL);
12973	}
12974
12975	return (dof);
12976}
12977
12978#if !defined(sun)
12979static __inline uchar_t
12980dtrace_dof_char(char c) {
12981	switch (c) {
12982	case '0':
12983	case '1':
12984	case '2':
12985	case '3':
12986	case '4':
12987	case '5':
12988	case '6':
12989	case '7':
12990	case '8':
12991	case '9':
12992		return (c - '0');
12993	case 'A':
12994	case 'B':
12995	case 'C':
12996	case 'D':
12997	case 'E':
12998	case 'F':
12999		return (c - 'A' + 10);
13000	case 'a':
13001	case 'b':
13002	case 'c':
13003	case 'd':
13004	case 'e':
13005	case 'f':
13006		return (c - 'a' + 10);
13007	}
13008	/* Should not reach here. */
13009	return (0);
13010}
13011#endif
13012
/*
 * Look up DOF by name from platform configuration: a driver .conf
 * property on illumos/Solaris, or a hex-string kernel environment
 * variable on FreeBSD.  Returns a kmem-allocated DOF object (to be
 * released with dtrace_dof_destroy()) or NULL on any failure.
 */
static dof_hdr_t *
dtrace_dof_property(const char *name)
{
	uchar_t *buf;
	uint64_t loadsz;
	unsigned int len, i;
	dof_hdr_t *dof;

#if defined(sun)
	/*
	 * Unfortunately, array of values in .conf files are always (and
	 * only) interpreted to be integer arrays.  We must read our DOF
	 * as an integer array, and then squeeze it into a byte array.
	 */
	if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dtrace_devi, 0,
	    (char *)name, (int **)&buf, &len) != DDI_PROP_SUCCESS)
		return (NULL);

	/* Squeeze in place: keep only the low byte of each int. */
	for (i = 0; i < len; i++)
		buf[i] = (uchar_t)(((int *)buf)[i]);

	if (len < sizeof (dof_hdr_t)) {
		ddi_prop_free(buf);
		dtrace_dof_error(NULL, "truncated header");
		return (NULL);
	}

	if (len < (loadsz = ((dof_hdr_t *)buf)->dofh_loadsz)) {
		ddi_prop_free(buf);
		dtrace_dof_error(NULL, "truncated DOF");
		return (NULL);
	}

	if (loadsz >= dtrace_dof_maxsize) {
		ddi_prop_free(buf);
		dtrace_dof_error(NULL, "oversized DOF");
		return (NULL);
	}

	dof = kmem_alloc(loadsz, KM_SLEEP);
	bcopy(buf, dof, loadsz);
	ddi_prop_free(buf);
#else
	char *p;
	char *p_env;

	if ((p_env = getenv(name)) == NULL)
		return (NULL);

	/*
	 * On FreeBSD the DOF arrives as a hex string: decode two hex
	 * characters per output byte.
	 * NOTE(review): an odd-length string silently drops its final
	 * character, and non-hex characters decode as 0 nibbles (see
	 * dtrace_dof_char()) -- presumes callers only set well-formed
	 * values; confirm.
	 */
	len = strlen(p_env) / 2;

	buf = kmem_alloc(len, KM_SLEEP);

	dof = (dof_hdr_t *) buf;

	p = p_env;

	for (i = 0; i < len; i++) {
		buf[i] = (dtrace_dof_char(p[0]) << 4) |
		     dtrace_dof_char(p[1]);
		p += 2;
	}

	freeenv(p_env);

	if (len < sizeof (dof_hdr_t)) {
		/*
		 * NOTE(review): freed with size 0 rather than 'len' --
		 * presumes the FreeBSD kmem_free() shim ignores the size
		 * argument; confirm (same for the two frees below).
		 */
		kmem_free(buf, 0);
		dtrace_dof_error(NULL, "truncated header");
		return (NULL);
	}

	if (len < (loadsz = dof->dofh_loadsz)) {
		kmem_free(buf, 0);
		dtrace_dof_error(NULL, "truncated DOF");
		return (NULL);
	}

	if (loadsz >= dtrace_dof_maxsize) {
		kmem_free(buf, 0);
		dtrace_dof_error(NULL, "oversized DOF");
		return (NULL);
	}
#endif

	return (dof);
}
13099
/*
 * Free a DOF object previously obtained from dtrace_dof_copyin() or
 * dtrace_dof_property().
 *
 * NOTE(review): the release size is taken from the header itself
 * (dofh_loadsz).  The FreeBSD path of dtrace_dof_property() allocates
 * 'len' bytes, which may exceed dofh_loadsz -- this presumes kmem_free()
 * tolerates that mismatch on that platform; confirm.
 */
static void
dtrace_dof_destroy(dof_hdr_t *dof)
{
	kmem_free(dof, dof->dofh_loadsz);
}
13105
13106/*
13107 * Return the dof_sec_t pointer corresponding to a given section index.  If the
13108 * index is not valid, dtrace_dof_error() is called and NULL is returned.  If
13109 * a type other than DOF_SECT_NONE is specified, the header is checked against
13110 * this type and NULL is returned if the types do not match.
13111 */
13112static dof_sec_t *
13113dtrace_dof_sect(dof_hdr_t *dof, uint32_t type, dof_secidx_t i)
13114{
13115	dof_sec_t *sec = (dof_sec_t *)(uintptr_t)
13116	    ((uintptr_t)dof + dof->dofh_secoff + i * dof->dofh_secsize);
13117
13118	if (i >= dof->dofh_secnum) {
13119		dtrace_dof_error(dof, "referenced section index is invalid");
13120		return (NULL);
13121	}
13122
13123	if (!(sec->dofs_flags & DOF_SECF_LOAD)) {
13124		dtrace_dof_error(dof, "referenced section is not loadable");
13125		return (NULL);
13126	}
13127
13128	if (type != DOF_SECT_NONE && type != sec->dofs_type) {
13129		dtrace_dof_error(dof, "referenced section is the wrong type");
13130		return (NULL);
13131	}
13132
13133	return (sec);
13134}
13135
/*
 * Decode a DOF_SECT_PROBEDESC section into the given dtrace_probedesc_t.
 * Each of the four name components is bounds-checked against the named
 * string table before being copied out.  Returns 'desc' on success, or
 * NULL (after calling dtrace_dof_error()) on any validation failure.
 */
static dtrace_probedesc_t *
dtrace_dof_probedesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_probedesc_t *desc)
{
	dof_probedesc_t *probe;
	dof_sec_t *strtab;
	uintptr_t daddr = (uintptr_t)dof;
	uintptr_t str;
	size_t size;

	if (sec->dofs_type != DOF_SECT_PROBEDESC) {
		dtrace_dof_error(dof, "invalid probe section");
		return (NULL);
	}

	if (sec->dofs_align != sizeof (dof_secidx_t)) {
		dtrace_dof_error(dof, "bad alignment in probe description");
		return (NULL);
	}

	if (sec->dofs_offset + sizeof (dof_probedesc_t) > dof->dofh_loadsz) {
		dtrace_dof_error(dof, "truncated probe description");
		return (NULL);
	}

	probe = (dof_probedesc_t *)(uintptr_t)(daddr + sec->dofs_offset);
	strtab = dtrace_dof_sect(dof, DOF_SECT_STRTAB, probe->dofp_strtab);

	if (strtab == NULL)
		return (NULL);

	str = daddr + strtab->dofs_offset;
	size = strtab->dofs_size;

	if (probe->dofp_provider >= strtab->dofs_size) {
		dtrace_dof_error(dof, "corrupt probe provider");
		return (NULL);
	}

	/*
	 * NOTE(review): strncpy() does not NUL-terminate when the source
	 * fills the bound; the copies below appear to rely on 'desc'
	 * having been zeroed by the caller (cf. the kmem_zalloc()'d
	 * ecbdesc in dtrace_dof_ecbdesc()) -- confirm.
	 */
	(void) strncpy(desc->dtpd_provider,
	    (char *)(str + probe->dofp_provider),
	    MIN(DTRACE_PROVNAMELEN - 1, size - probe->dofp_provider));

	if (probe->dofp_mod >= strtab->dofs_size) {
		dtrace_dof_error(dof, "corrupt probe module");
		return (NULL);
	}

	(void) strncpy(desc->dtpd_mod, (char *)(str + probe->dofp_mod),
	    MIN(DTRACE_MODNAMELEN - 1, size - probe->dofp_mod));

	if (probe->dofp_func >= strtab->dofs_size) {
		dtrace_dof_error(dof, "corrupt probe function");
		return (NULL);
	}

	(void) strncpy(desc->dtpd_func, (char *)(str + probe->dofp_func),
	    MIN(DTRACE_FUNCNAMELEN - 1, size - probe->dofp_func));

	if (probe->dofp_name >= strtab->dofs_size) {
		dtrace_dof_error(dof, "corrupt probe name");
		return (NULL);
	}

	(void) strncpy(desc->dtpd_name, (char *)(str + probe->dofp_name),
	    MIN(DTRACE_NAMELEN - 1, size - probe->dofp_name));

	return (desc);
}
13204
/*
 * Construct a dtrace_difo_t from a DOF_SECT_DIFOHDR section.  The DIFO
 * header names a set of sub-sections (DIF text, integer table, string
 * table, variable table); each is validated against the static 'difo'
 * description table below and copied into a freshly-allocated DIFO,
 * which is then validated and initialized.  Returns NULL (after calling
 * dtrace_dof_error()) on any failure.
 */
static dtrace_difo_t *
dtrace_dof_difo(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
    cred_t *cr)
{
	dtrace_difo_t *dp;
	size_t ttl = 0;		/* running total of sub-section sizes */
	dof_difohdr_t *dofd;
	uintptr_t daddr = (uintptr_t)dof;
	size_t max = dtrace_difo_maxsize;
	int i, l, n;

	/*
	 * One entry per recognized sub-section type: where its buffer
	 * pointer and length live within dtrace_difo_t, the expected
	 * entry size and alignment, and the error to report if the type
	 * appears more than once.
	 */
	static const struct {
		int section;
		int bufoffs;
		int lenoffs;
		int entsize;
		int align;
		const char *msg;
	} difo[] = {
		{ DOF_SECT_DIF, offsetof(dtrace_difo_t, dtdo_buf),
		offsetof(dtrace_difo_t, dtdo_len), sizeof (dif_instr_t),
		sizeof (dif_instr_t), "multiple DIF sections" },

		{ DOF_SECT_INTTAB, offsetof(dtrace_difo_t, dtdo_inttab),
		offsetof(dtrace_difo_t, dtdo_intlen), sizeof (uint64_t),
		sizeof (uint64_t), "multiple integer tables" },

		{ DOF_SECT_STRTAB, offsetof(dtrace_difo_t, dtdo_strtab),
		offsetof(dtrace_difo_t, dtdo_strlen), 0,
		sizeof (char), "multiple string tables" },

		{ DOF_SECT_VARTAB, offsetof(dtrace_difo_t, dtdo_vartab),
		offsetof(dtrace_difo_t, dtdo_varlen), sizeof (dtrace_difv_t),
		sizeof (uint_t), "multiple variable tables" },

		{ DOF_SECT_NONE, 0, 0, 0, 0, NULL }
	};

	if (sec->dofs_type != DOF_SECT_DIFOHDR) {
		dtrace_dof_error(dof, "invalid DIFO header section");
		return (NULL);
	}

	if (sec->dofs_align != sizeof (dof_secidx_t)) {
		dtrace_dof_error(dof, "bad alignment in DIFO header");
		return (NULL);
	}

	if (sec->dofs_size < sizeof (dof_difohdr_t) ||
	    sec->dofs_size % sizeof (dof_secidx_t)) {
		dtrace_dof_error(dof, "bad size in DIFO header");
		return (NULL);
	}

	dofd = (dof_difohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
	/* The '+ 1' accounts for the link embedded in dof_difohdr_t itself. */
	n = (sec->dofs_size - sizeof (*dofd)) / sizeof (dof_secidx_t) + 1;

	dp = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
	dp->dtdo_rtype = dofd->dofd_rtype;

	for (l = 0; l < n; l++) {
		dof_sec_t *subsec;
		void **bufp;
		uint32_t *lenp;

		if ((subsec = dtrace_dof_sect(dof, DOF_SECT_NONE,
		    dofd->dofd_links[l])) == NULL)
			goto err; /* invalid section link */

		/* Cap the aggregate size of all sub-sections. */
		if (ttl + subsec->dofs_size > max) {
			dtrace_dof_error(dof, "exceeds maximum size");
			goto err;
		}

		ttl += subsec->dofs_size;

		for (i = 0; difo[i].section != DOF_SECT_NONE; i++) {
			if (subsec->dofs_type != difo[i].section)
				continue;

			if (!(subsec->dofs_flags & DOF_SECF_LOAD)) {
				dtrace_dof_error(dof, "section not loaded");
				goto err;
			}

			if (subsec->dofs_align != difo[i].align) {
				dtrace_dof_error(dof, "bad alignment");
				goto err;
			}

			/* Locate this sub-section's slot within the DIFO. */
			bufp = (void **)((uintptr_t)dp + difo[i].bufoffs);
			lenp = (uint32_t *)((uintptr_t)dp + difo[i].lenoffs);

			if (*bufp != NULL) {
				dtrace_dof_error(dof, difo[i].msg);
				goto err;
			}

			if (difo[i].entsize != subsec->dofs_entsize) {
				dtrace_dof_error(dof, "entry size mismatch");
				goto err;
			}

			if (subsec->dofs_entsize != 0 &&
			    (subsec->dofs_size % subsec->dofs_entsize) != 0) {
				dtrace_dof_error(dof, "corrupt entry size");
				goto err;
			}

			*lenp = subsec->dofs_size;
			*bufp = kmem_alloc(subsec->dofs_size, KM_SLEEP);
			bcopy((char *)(uintptr_t)(daddr + subsec->dofs_offset),
			    *bufp, subsec->dofs_size);

			/* Convert the byte length into an entry count. */
			if (subsec->dofs_entsize != 0)
				*lenp /= subsec->dofs_entsize;

			break;
		}

		/*
		 * If we encounter a loadable DIFO sub-section that is not
		 * known to us, assume this is a broken program and fail.
		 */
		if (difo[i].section == DOF_SECT_NONE &&
		    (subsec->dofs_flags & DOF_SECF_LOAD)) {
			dtrace_dof_error(dof, "unrecognized DIFO subsection");
			goto err;
		}
	}

	if (dp->dtdo_buf == NULL) {
		/*
		 * We can't have a DIF object without DIF text.
		 */
		dtrace_dof_error(dof, "missing DIF text");
		goto err;
	}

	/*
	 * Before we validate the DIF object, run through the variable table
	 * looking for the strings -- if any of their size are under, we'll set
	 * their size to be the system-wide default string size.  Note that
	 * this should _not_ happen if the "strsize" option has been set --
	 * in this case, the compiler should have set the size to reflect the
	 * setting of the option.
	 */
	for (i = 0; i < dp->dtdo_varlen; i++) {
		dtrace_difv_t *v = &dp->dtdo_vartab[i];
		dtrace_diftype_t *t = &v->dtdv_type;

		if (v->dtdv_id < DIF_VAR_OTHER_UBASE)
			continue;

		if (t->dtdt_kind == DIF_TYPE_STRING && t->dtdt_size == 0)
			t->dtdt_size = dtrace_strsize_default;
	}

	if (dtrace_difo_validate(dp, vstate, DIF_DIR_NREGS, cr) != 0)
		goto err;

	dtrace_difo_init(dp, vstate);
	return (dp);

err:
	/*
	 * NOTE(review): tables that were never populated are freed as a
	 * (NULL, 0) pair -- presumes kmem_free() accepts that; confirm.
	 */
	kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
	kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
	kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
	kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));

	kmem_free(dp, sizeof (dtrace_difo_t));
	return (NULL);
}
13378
13379static dtrace_predicate_t *
13380dtrace_dof_predicate(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
13381    cred_t *cr)
13382{
13383	dtrace_difo_t *dp;
13384
13385	if ((dp = dtrace_dof_difo(dof, sec, vstate, cr)) == NULL)
13386		return (NULL);
13387
13388	return (dtrace_predicate_create(dp));
13389}
13390
/*
 * Decode a DOF_SECT_ACTDESC section into a linked list of
 * dtrace_actdesc_t structures, returning the list head.  For
 * printf()-like actions (and string-carrying DIFEXPR actions), the
 * string argument is bounds-checked against the referenced string table
 * and copied into allocated memory.  On failure, every action created so
 * far is released and NULL is returned.
 */
static dtrace_actdesc_t *
dtrace_dof_actdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
    cred_t *cr)
{
	dtrace_actdesc_t *act, *first = NULL, *last = NULL, *next;
	dof_actdesc_t *desc;
	dof_sec_t *difosec;
	size_t offs;
	uintptr_t daddr = (uintptr_t)dof;
	uint64_t arg;
	dtrace_actkind_t kind;

	if (sec->dofs_type != DOF_SECT_ACTDESC) {
		dtrace_dof_error(dof, "invalid action section");
		return (NULL);
	}

	if (sec->dofs_offset + sizeof (dof_actdesc_t) > dof->dofh_loadsz) {
		dtrace_dof_error(dof, "truncated action description");
		return (NULL);
	}

	if (sec->dofs_align != sizeof (uint64_t)) {
		dtrace_dof_error(dof, "bad alignment in action description");
		return (NULL);
	}

	if (sec->dofs_size < sec->dofs_entsize) {
		dtrace_dof_error(dof, "section entry size exceeds total size");
		return (NULL);
	}

	if (sec->dofs_entsize != sizeof (dof_actdesc_t)) {
		dtrace_dof_error(dof, "bad entry size in action description");
		return (NULL);
	}

	/* Bound the total number of actions per ECB. */
	if (sec->dofs_size / sec->dofs_entsize > dtrace_actions_max) {
		dtrace_dof_error(dof, "actions exceed dtrace_actions_max");
		return (NULL);
	}

	for (offs = 0; offs < sec->dofs_size; offs += sec->dofs_entsize) {
		desc = (dof_actdesc_t *)(daddr +
		    (uintptr_t)sec->dofs_offset + offs);
		kind = (dtrace_actkind_t)desc->dofa_kind;

		if ((DTRACEACT_ISPRINTFLIKE(kind) &&
		    (kind != DTRACEACT_PRINTA ||
		    desc->dofa_strtab != DOF_SECIDX_NONE)) ||
		    (kind == DTRACEACT_DIFEXPR &&
		    desc->dofa_strtab != DOF_SECIDX_NONE)) {
			dof_sec_t *strtab;
			char *str, *fmt;
			uint64_t i;

			/*
			 * The argument to these actions is an index into the
			 * DOF string table.  For printf()-like actions, this
			 * is the format string.  For print(), this is the
			 * CTF type of the expression result.
			 */
			if ((strtab = dtrace_dof_sect(dof,
			    DOF_SECT_STRTAB, desc->dofa_strtab)) == NULL)
				goto err;

			str = (char *)((uintptr_t)dof +
			    (uintptr_t)strtab->dofs_offset);

			/* Locate the string's terminator within the table. */
			for (i = desc->dofa_arg; i < strtab->dofs_size; i++) {
				if (str[i] == '\0')
					break;
			}

			if (i >= strtab->dofs_size) {
				dtrace_dof_error(dof, "bogus format string");
				goto err;
			}

			if (i == desc->dofa_arg) {
				dtrace_dof_error(dof, "empty format string");
				goto err;
			}

			/* Copy the string, including its NUL terminator. */
			i -= desc->dofa_arg;
			fmt = kmem_alloc(i + 1, KM_SLEEP);
			bcopy(&str[desc->dofa_arg], fmt, i + 1);
			arg = (uint64_t)(uintptr_t)fmt;
		} else {
			if (kind == DTRACEACT_PRINTA) {
				ASSERT(desc->dofa_strtab == DOF_SECIDX_NONE);
				arg = 0;
			} else {
				arg = desc->dofa_arg;
			}
		}

		act = dtrace_actdesc_create(kind, desc->dofa_ntuple,
		    desc->dofa_uarg, arg);

		/* Append the new action to the list being accumulated. */
		if (last != NULL) {
			last->dtad_next = act;
		} else {
			first = act;
		}

		last = act;

		if (desc->dofa_difo == DOF_SECIDX_NONE)
			continue;

		if ((difosec = dtrace_dof_sect(dof,
		    DOF_SECT_DIFOHDR, desc->dofa_difo)) == NULL)
			goto err;

		act->dtad_difo = dtrace_dof_difo(dof, difosec, vstate, cr);

		if (act->dtad_difo == NULL)
			goto err;
	}

	ASSERT(first != NULL);
	return (first);

err:
	/* Unwind: release every action created before the failure. */
	for (act = first; act != NULL; act = next) {
		next = act->dtad_next;
		dtrace_actdesc_release(act, vstate);
	}

	return (NULL);
}
13523
/*
 * Decode a DOF_SECT_ECBDESC section into a newly-allocated
 * dtrace_ecbdesc_t: resolve its probe description and, when present, its
 * predicate DIFO and action list.  Returns NULL on failure with any
 * partially-constructed state released.
 */
static dtrace_ecbdesc_t *
dtrace_dof_ecbdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
    cred_t *cr)
{
	dtrace_ecbdesc_t *ep;
	dof_ecbdesc_t *ecb;
	dtrace_probedesc_t *desc;
	dtrace_predicate_t *pred = NULL;

	if (sec->dofs_size < sizeof (dof_ecbdesc_t)) {
		dtrace_dof_error(dof, "truncated ECB description");
		return (NULL);
	}

	if (sec->dofs_align != sizeof (uint64_t)) {
		dtrace_dof_error(dof, "bad alignment in ECB description");
		return (NULL);
	}

	ecb = (dof_ecbdesc_t *)((uintptr_t)dof + (uintptr_t)sec->dofs_offset);
	/* 'sec' is reused below to refer to each referenced sub-section. */
	sec = dtrace_dof_sect(dof, DOF_SECT_PROBEDESC, ecb->dofe_probes);

	if (sec == NULL)
		return (NULL);

	ep = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
	ep->dted_uarg = ecb->dofe_uarg;
	desc = &ep->dted_probe;

	if (dtrace_dof_probedesc(dof, sec, desc) == NULL)
		goto err;

	if (ecb->dofe_pred != DOF_SECIDX_NONE) {
		if ((sec = dtrace_dof_sect(dof,
		    DOF_SECT_DIFOHDR, ecb->dofe_pred)) == NULL)
			goto err;

		if ((pred = dtrace_dof_predicate(dof, sec, vstate, cr)) == NULL)
			goto err;

		ep->dted_pred.dtpdd_predicate = pred;
	}

	if (ecb->dofe_actions != DOF_SECIDX_NONE) {
		if ((sec = dtrace_dof_sect(dof,
		    DOF_SECT_ACTDESC, ecb->dofe_actions)) == NULL)
			goto err;

		ep->dted_action = dtrace_dof_actdesc(dof, sec, vstate, cr);

		if (ep->dted_action == NULL)
			goto err;
	}

	return (ep);

err:
	/* Release the predicate (if created) before freeing the ecbdesc. */
	if (pred != NULL)
		dtrace_predicate_release(pred, vstate);
	kmem_free(ep, sizeof (dtrace_ecbdesc_t));
	return (NULL);
}
13586
13587/*
13588 * Apply the relocations from the specified 'sec' (a DOF_SECT_URELHDR) to the
13589 * specified DOF.  At present, this amounts to simply adding 'ubase' to the
13590 * site of any user SETX relocations to account for load object base address.
13591 * In the future, if we need other relocations, this function can be extended.
13592 */
static int
dtrace_dof_relocate(dof_hdr_t *dof, dof_sec_t *sec, uint64_t ubase)
{
	uintptr_t daddr = (uintptr_t)dof;
	dof_relohdr_t *dofr =
	    (dof_relohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
	dof_sec_t *ss, *rs, *ts;
	dof_relodesc_t *r;
	uint_t i, n;

	if (sec->dofs_size < sizeof (dof_relohdr_t) ||
	    sec->dofs_align != sizeof (dof_secidx_t)) {
		dtrace_dof_error(dof, "invalid relocation header");
		return (-1);
	}

	/* Resolve the string, relocation and target sections. */
	ss = dtrace_dof_sect(dof, DOF_SECT_STRTAB, dofr->dofr_strtab);
	rs = dtrace_dof_sect(dof, DOF_SECT_RELTAB, dofr->dofr_relsec);
	ts = dtrace_dof_sect(dof, DOF_SECT_NONE, dofr->dofr_tgtsec);

	if (ss == NULL || rs == NULL || ts == NULL)
		return (-1); /* dtrace_dof_error() has been called already */

	if (rs->dofs_entsize < sizeof (dof_relodesc_t) ||
	    rs->dofs_align != sizeof (uint64_t)) {
		dtrace_dof_error(dof, "invalid relocation section");
		return (-1);
	}

	r = (dof_relodesc_t *)(uintptr_t)(daddr + rs->dofs_offset);
	n = rs->dofs_size / rs->dofs_entsize;

	for (i = 0; i < n; i++) {
		uintptr_t taddr = daddr + ts->dofs_offset + r->dofr_offset;

		switch (r->dofr_type) {
		case DOF_RELO_NONE:
			break;
		case DOF_RELO_SETX:
			/*
			 * The offset is checked both alone and with the
			 * width of the relocated value added, so that the
			 * second comparison cannot be defeated by the
			 * addition wrapping.
			 */
			if (r->dofr_offset >= ts->dofs_size || r->dofr_offset +
			    sizeof (uint64_t) > ts->dofs_size) {
				dtrace_dof_error(dof, "bad relocation offset");
				return (-1);
			}

			if (!IS_P2ALIGNED(taddr, sizeof (uint64_t))) {
				dtrace_dof_error(dof, "misaligned setx relo");
				return (-1);
			}

			/* Bias the 64-bit site by the load object base. */
			*(uint64_t *)taddr += ubase;
			break;
		default:
			dtrace_dof_error(dof, "invalid relocation type");
			return (-1);
		}

		/* Advance by the declared entry size, not sizeof (*r). */
		r = (dof_relodesc_t *)((uintptr_t)r + rs->dofs_entsize);
	}

	return (0);
}
13655
13656/*
13657 * The dof_hdr_t passed to dtrace_dof_slurp() should be a partially validated
13658 * header:  it should be at the front of a memory region that is at least
13659 * sizeof (dof_hdr_t) in size -- and then at least dof_hdr.dofh_loadsz in
13660 * size.  It need not be validated in any other way.
13661 */
static int
dtrace_dof_slurp(dof_hdr_t *dof, dtrace_vstate_t *vstate, cred_t *cr,
    dtrace_enabling_t **enabp, uint64_t ubase, int noprobes)
{
	uint64_t len = dof->dofh_loadsz, seclen;
	uintptr_t daddr = (uintptr_t)dof;
	dtrace_ecbdesc_t *ep;
	dtrace_enabling_t *enab;
	uint_t i;

	ASSERT(MUTEX_HELD(&dtrace_lock));
	ASSERT(dof->dofh_loadsz >= sizeof (dof_hdr_t));

	/*
	 * Check the DOF header identification bytes.  In addition to checking
	 * valid settings, we also verify that unused bits/bytes are zeroed so
	 * we can use them later without fear of regressing existing binaries.
	 */
	if (bcmp(&dof->dofh_ident[DOF_ID_MAG0],
	    DOF_MAG_STRING, DOF_MAG_STRLEN) != 0) {
		dtrace_dof_error(dof, "DOF magic string mismatch");
		return (-1);
	}

	if (dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_ILP32 &&
	    dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_LP64) {
		dtrace_dof_error(dof, "DOF has invalid data model");
		return (-1);
	}

	if (dof->dofh_ident[DOF_ID_ENCODING] != DOF_ENCODE_NATIVE) {
		dtrace_dof_error(dof, "DOF encoding mismatch");
		return (-1);
	}

	if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
	    dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_2) {
		dtrace_dof_error(dof, "DOF version mismatch");
		return (-1);
	}

	if (dof->dofh_ident[DOF_ID_DIFVERS] != DIF_VERSION_2) {
		dtrace_dof_error(dof, "DOF uses unsupported instruction set");
		return (-1);
	}

	if (dof->dofh_ident[DOF_ID_DIFIREG] > DIF_DIR_NREGS) {
		dtrace_dof_error(dof, "DOF uses too many integer registers");
		return (-1);
	}

	if (dof->dofh_ident[DOF_ID_DIFTREG] > DIF_DTR_NREGS) {
		dtrace_dof_error(dof, "DOF uses too many tuple registers");
		return (-1);
	}

	/* The reserved identification bytes must all be zero. */
	for (i = DOF_ID_PAD; i < DOF_ID_SIZE; i++) {
		if (dof->dofh_ident[i] != 0) {
			dtrace_dof_error(dof, "DOF has invalid ident byte set");
			return (-1);
		}
	}

	if (dof->dofh_flags & ~DOF_FL_VALID) {
		dtrace_dof_error(dof, "DOF has invalid flag bits set");
		return (-1);
	}

	if (dof->dofh_secsize == 0) {
		dtrace_dof_error(dof, "zero section header size");
		return (-1);
	}

	/*
	 * Check that the section headers don't exceed the amount of DOF
	 * data.  Note that we cast the section size and number of sections
	 * to uint64_t's to prevent possible overflow in the multiplication.
	 */
	seclen = (uint64_t)dof->dofh_secnum * (uint64_t)dof->dofh_secsize;

	if (dof->dofh_secoff > len || seclen > len ||
	    dof->dofh_secoff + seclen > len) {
		dtrace_dof_error(dof, "truncated section headers");
		return (-1);
	}

	if (!IS_P2ALIGNED(dof->dofh_secoff, sizeof (uint64_t))) {
		dtrace_dof_error(dof, "misaligned section headers");
		return (-1);
	}

	if (!IS_P2ALIGNED(dof->dofh_secsize, sizeof (uint64_t))) {
		dtrace_dof_error(dof, "misaligned section size");
		return (-1);
	}

	/*
	 * Take an initial pass through the section headers to be sure that
	 * the headers don't have stray offsets.  If the 'noprobes' flag is
	 * set, do not permit sections relating to providers, probes, or args.
	 */
	for (i = 0; i < dof->dofh_secnum; i++) {
		dof_sec_t *sec = (dof_sec_t *)(daddr +
		    (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);

		if (noprobes) {
			switch (sec->dofs_type) {
			case DOF_SECT_PROVIDER:
			case DOF_SECT_PROBES:
			case DOF_SECT_PRARGS:
			case DOF_SECT_PROFFS:
				dtrace_dof_error(dof, "illegal sections "
				    "for enabling");
				return (-1);
			}
		}

		if (DOF_SEC_ISLOADABLE(sec->dofs_type) &&
		    !(sec->dofs_flags & DOF_SECF_LOAD)) {
			dtrace_dof_error(dof, "loadable section with load "
			    "flag unset");
			return (-1);
		}

		if (!(sec->dofs_flags & DOF_SECF_LOAD))
			continue; /* just ignore non-loadable sections */

		if (!ISP2(sec->dofs_align)) {
			dtrace_dof_error(dof, "bad section alignment");
			return (-1);
		}

		if (sec->dofs_offset & (sec->dofs_align - 1)) {
			dtrace_dof_error(dof, "misaligned section");
			return (-1);
		}

		if (sec->dofs_offset > len || sec->dofs_size > len ||
		    sec->dofs_offset + sec->dofs_size > len) {
			dtrace_dof_error(dof, "corrupt section header");
			return (-1);
		}

		/* A string table must end with a NUL byte. */
		if (sec->dofs_type == DOF_SECT_STRTAB && *((char *)daddr +
		    sec->dofs_offset + sec->dofs_size - 1) != '\0') {
			dtrace_dof_error(dof, "non-terminating string table");
			return (-1);
		}
	}

	/*
	 * Take a second pass through the sections and locate and perform any
	 * relocations that are present.  We do this after the first pass to
	 * be sure that all sections have had their headers validated.
	 */
	for (i = 0; i < dof->dofh_secnum; i++) {
		dof_sec_t *sec = (dof_sec_t *)(daddr +
		    (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);

		if (!(sec->dofs_flags & DOF_SECF_LOAD))
			continue; /* skip sections that are not loadable */

		switch (sec->dofs_type) {
		case DOF_SECT_URELHDR:
			if (dtrace_dof_relocate(dof, sec, ubase) != 0)
				return (-1);
			break;
		}
	}

	/* Create the enabling now if the caller didn't pass one in. */
	if ((enab = *enabp) == NULL)
		enab = *enabp = dtrace_enabling_create(vstate);

	/*
	 * Final pass: convert each DOF_SECT_ECBDESC section into an
	 * ecbdesc and add it to the enabling.  On failure the entire
	 * enabling -- including ecbdescs added by earlier iterations --
	 * is destroyed and *enabp is cleared.
	 */
	for (i = 0; i < dof->dofh_secnum; i++) {
		dof_sec_t *sec = (dof_sec_t *)(daddr +
		    (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);

		if (sec->dofs_type != DOF_SECT_ECBDESC)
			continue;

		if ((ep = dtrace_dof_ecbdesc(dof, sec, vstate, cr)) == NULL) {
			dtrace_enabling_destroy(enab);
			*enabp = NULL;
			return (-1);
		}

		dtrace_enabling_add(enab, ep);
	}

	return (0);
}
13853
13854/*
13855 * Process DOF for any options.  This routine assumes that the DOF has been
13856 * at least processed by dtrace_dof_slurp().
13857 */
13858static int
13859dtrace_dof_options(dof_hdr_t *dof, dtrace_state_t *state)
13860{
13861	int i, rval;
13862	uint32_t entsize;
13863	size_t offs;
13864	dof_optdesc_t *desc;
13865
13866	for (i = 0; i < dof->dofh_secnum; i++) {
13867		dof_sec_t *sec = (dof_sec_t *)((uintptr_t)dof +
13868		    (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
13869
13870		if (sec->dofs_type != DOF_SECT_OPTDESC)
13871			continue;
13872
13873		if (sec->dofs_align != sizeof (uint64_t)) {
13874			dtrace_dof_error(dof, "bad alignment in "
13875			    "option description");
13876			return (EINVAL);
13877		}
13878
13879		if ((entsize = sec->dofs_entsize) == 0) {
13880			dtrace_dof_error(dof, "zeroed option entry size");
13881			return (EINVAL);
13882		}
13883
13884		if (entsize < sizeof (dof_optdesc_t)) {
13885			dtrace_dof_error(dof, "bad option entry size");
13886			return (EINVAL);
13887		}
13888
13889		for (offs = 0; offs < sec->dofs_size; offs += entsize) {
13890			desc = (dof_optdesc_t *)((uintptr_t)dof +
13891			    (uintptr_t)sec->dofs_offset + offs);
13892
13893			if (desc->dofo_strtab != DOF_SECIDX_NONE) {
13894				dtrace_dof_error(dof, "non-zero option string");
13895				return (EINVAL);
13896			}
13897
13898			if (desc->dofo_value == DTRACEOPT_UNSET) {
13899				dtrace_dof_error(dof, "unset option");
13900				return (EINVAL);
13901			}
13902
13903			if ((rval = dtrace_state_option(state,
13904			    desc->dofo_option, desc->dofo_value)) != 0) {
13905				dtrace_dof_error(dof, "rejected option");
13906				return (rval);
13907			}
13908		}
13909	}
13910
13911	return (0);
13912}
13913
13914/*
13915 * DTrace Consumer State Functions
13916 */
/*
 * Initialize the dynamic variable state "dstate": allocate "size" bytes and
 * carve them into a hash table of dynamic variable chains followed by
 * per-CPU free lists of fixed-size chunks.  Returns 0 on success or ENOMEM
 * if the backing allocation fails.  Caller must hold dtrace_lock.
 */
static int
dtrace_dstate_init(dtrace_dstate_t *dstate, size_t size)
{
	size_t hashsize, maxper, min, chunksize = dstate->dtds_chunksize;
	void *base;
	uintptr_t limit;
	dtrace_dynvar_t *dvar, *next, *start;
	int i;

	ASSERT(MUTEX_HELD(&dtrace_lock));
	ASSERT(dstate->dtds_base == NULL && dstate->dtds_percpu == NULL);

	bzero(dstate, sizeof (dtrace_dstate_t));

	/* A chunk size of zero means "use the default". */
	if ((dstate->dtds_chunksize = chunksize) == 0)
		dstate->dtds_chunksize = DTRACE_DYNVAR_CHUNKSIZE;

	/* We need room for at least one chunk and one hash bucket. */
	if (size < (min = dstate->dtds_chunksize + sizeof (dtrace_dynhash_t)))
		size = min;

	if ((base = kmem_zalloc(size, KM_NOSLEEP | KM_NORMALPRI)) == NULL)
		return (ENOMEM);

	dstate->dtds_size = size;
	dstate->dtds_base = base;
	dstate->dtds_percpu = kmem_cache_alloc(dtrace_state_cache, KM_SLEEP);
	bzero(dstate->dtds_percpu, NCPU * sizeof (dtrace_dstate_percpu_t));

	hashsize = size / (dstate->dtds_chunksize + sizeof (dtrace_dynhash_t));

	/* If the hash size is odd (and not one), round it down to even. */
	if (hashsize != 1 && (hashsize & 1))
		hashsize--;

	dstate->dtds_hashsize = hashsize;
	dstate->dtds_hash = dstate->dtds_base;

	/*
	 * Set all of our hash buckets to point to the single sink, and (if
	 * it hasn't already been set), set the sink's hash value to be the
	 * sink sentinel value.  The sink is needed for dynamic variable
	 * lookups to know that they have iterated over an entire, valid hash
	 * chain.
	 */
	for (i = 0; i < hashsize; i++)
		dstate->dtds_hash[i].dtdh_chain = &dtrace_dynhash_sink;

	if (dtrace_dynhash_sink.dtdv_hashval != DTRACE_DYNHASH_SINK)
		dtrace_dynhash_sink.dtdv_hashval = DTRACE_DYNHASH_SINK;

	/*
	 * Determine number of active CPUs.  Divide free list evenly among
	 * active CPUs.
	 */
	start = (dtrace_dynvar_t *)
	    ((uintptr_t)base + hashsize * sizeof (dtrace_dynhash_t));
	limit = (uintptr_t)base + size;

	/* Round each CPU's share down to a whole number of chunks. */
	maxper = (limit - (uintptr_t)start) / NCPU;
	maxper = (maxper / dstate->dtds_chunksize) * dstate->dtds_chunksize;

#if !defined(sun)
	CPU_FOREACH(i) {
#else
	for (i = 0; i < NCPU; i++) {
#endif
		dstate->dtds_percpu[i].dtdsc_free = dvar = start;

		/*
		 * If we don't even have enough chunks to make it once through
		 * NCPUs, we're just going to allocate everything to the first
		 * CPU.  And if we're on the last CPU, we're going to allocate
		 * whatever is left over.  In either case, we set the limit to
		 * be the limit of the dynamic variable space.
		 */
		if (maxper == 0 || i == NCPU - 1) {
			limit = (uintptr_t)base + size;
			start = NULL;
		} else {
			limit = (uintptr_t)start + maxper;
			start = (dtrace_dynvar_t *)limit;
		}

		ASSERT(limit <= (uintptr_t)base + size);

		/*
		 * Chain this CPU's chunks into a free list.  The final
		 * chunk's dtdv_next is left NULL (the region was zeroed
		 * at allocation).
		 */
		for (;;) {
			next = (dtrace_dynvar_t *)((uintptr_t)dvar +
			    dstate->dtds_chunksize);

			if ((uintptr_t)next + dstate->dtds_chunksize >= limit)
				break;

			dvar->dtdv_next = next;
			dvar = next;
		}

		if (maxper == 0)
			break;
	}

	return (0);
}
14018
14019static void
14020dtrace_dstate_fini(dtrace_dstate_t *dstate)
14021{
14022	ASSERT(MUTEX_HELD(&cpu_lock));
14023
14024	if (dstate->dtds_base == NULL)
14025		return;
14026
14027	kmem_free(dstate->dtds_base, dstate->dtds_size);
14028	kmem_cache_free(dtrace_state_cache, dstate->dtds_percpu);
14029}
14030
14031static void
14032dtrace_vstate_fini(dtrace_vstate_t *vstate)
14033{
14034	/*
14035	 * Logical XOR, where are you?
14036	 */
14037	ASSERT((vstate->dtvs_nglobals == 0) ^ (vstate->dtvs_globals != NULL));
14038
14039	if (vstate->dtvs_nglobals > 0) {
14040		kmem_free(vstate->dtvs_globals, vstate->dtvs_nglobals *
14041		    sizeof (dtrace_statvar_t *));
14042	}
14043
14044	if (vstate->dtvs_ntlocals > 0) {
14045		kmem_free(vstate->dtvs_tlocals, vstate->dtvs_ntlocals *
14046		    sizeof (dtrace_difv_t));
14047	}
14048
14049	ASSERT((vstate->dtvs_nlocals == 0) ^ (vstate->dtvs_locals != NULL));
14050
14051	if (vstate->dtvs_nlocals > 0) {
14052		kmem_free(vstate->dtvs_locals, vstate->dtvs_nlocals *
14053		    sizeof (dtrace_statvar_t *));
14054	}
14055}
14056
14057#if defined(sun)
14058static void
14059dtrace_state_clean(dtrace_state_t *state)
14060{
14061	if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE)
14062		return;
14063
14064	dtrace_dynvar_clean(&state->dts_vstate.dtvs_dynvars);
14065	dtrace_speculation_clean(state);
14066}
14067
/*
 * Deadman handler: refresh dts_alive with the current time.  (See the
 * block comment below for why the refresh is done in three steps.)
 */
static void
dtrace_state_deadman(dtrace_state_t *state)
{
	hrtime_t now;

	dtrace_sync();

	now = dtrace_gethrtime();

	/*
	 * If a (non-anonymous) consumer has not updated its status within
	 * dtrace_deadman_user, return without refreshing dts_alive.
	 */
	if (state != dtrace_anon.dta_state &&
	    now - state->dts_laststatus >= dtrace_deadman_user)
		return;

	/*
	 * We must be sure that dts_alive never appears to be less than the
	 * value upon entry to dtrace_state_deadman(), and because we lack a
	 * dtrace_cas64(), we cannot store to it atomically.  We thus instead
	 * store INT64_MAX to it, followed by a memory barrier, followed by
	 * the new value.  This assures that dts_alive never appears to be
	 * less than its true value, regardless of the order in which the
	 * stores to the underlying storage are issued.
	 */
	state->dts_alive = INT64_MAX;
	dtrace_membar_producer();
	state->dts_alive = now;
}
14094#else
/*
 * Periodic cleaner callout (FreeBSD): reclaim dirty dynamic variables and
 * clean up speculations, then rearm ourselves at the configured cleanrate.
 * An inactive state returns early and does not rearm the callout.
 */
static void
dtrace_state_clean(void *arg)
{
	dtrace_state_t *state = arg;
	dtrace_optval_t *opt = state->dts_options;

	if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE)
		return;

	dtrace_dynvar_clean(&state->dts_vstate.dtvs_dynvars);
	dtrace_speculation_clean(state);

	/* Rearm; the cleanrate option is in nanoseconds, convert to ticks. */
	callout_reset(&state->dts_cleaner, hz * opt[DTRACEOPT_CLEANRATE] / NANOSEC,
	    dtrace_state_clean, state);
}
14110
/*
 * Deadman callout (FreeBSD): refresh dts_alive with the current time and
 * rearm at dtrace_deadman_interval.  (See the block comment below for why
 * the refresh is done in three steps.)
 */
static void
dtrace_state_deadman(void *arg)
{
	dtrace_state_t *state = arg;
	hrtime_t now;

	dtrace_sync();

	dtrace_debug_output();

	now = dtrace_gethrtime();

	/*
	 * If a (non-anonymous) consumer has not updated its status within
	 * dtrace_deadman_user, return without refreshing dts_alive and
	 * without rearming the callout.
	 */
	if (state != dtrace_anon.dta_state &&
	    now - state->dts_laststatus >= dtrace_deadman_user)
		return;

	/*
	 * We must be sure that dts_alive never appears to be less than the
	 * value upon entry to dtrace_state_deadman(), and because we lack a
	 * dtrace_cas64(), we cannot store to it atomically.  We thus instead
	 * store INT64_MAX to it, followed by a memory barrier, followed by
	 * the new value.  This assures that dts_alive never appears to be
	 * less than its true value, regardless of the order in which the
	 * stores to the underlying storage are issued.
	 */
	state->dts_alive = INT64_MAX;
	dtrace_membar_producer();
	state->dts_alive = now;

	callout_reset(&state->dts_deadman, hz * dtrace_deadman_interval / NANOSEC,
	    dtrace_state_deadman, state);
}
14143#endif
14144
/*
 * Create and initialize a new DTrace consumer state: allocate the state
 * and its per-CPU buffers, set every option to its default, and compute
 * the credential-derived visibility and destructiveness flags.  Callers
 * must hold both dtrace_lock and cpu_lock.
 */
static dtrace_state_t *
#if defined(sun)
dtrace_state_create(dev_t *devp, cred_t *cr)
#else
dtrace_state_create(struct cdev *dev)
#endif
{
#if defined(sun)
	minor_t minor;
	major_t major;
#else
	cred_t *cr = NULL;
	int m = 0;
#endif
	char c[30];
	dtrace_state_t *state;
	dtrace_optval_t *opt;
	int bufsize = NCPU * sizeof (dtrace_buffer_t), i;

	ASSERT(MUTEX_HELD(&dtrace_lock));
	ASSERT(MUTEX_HELD(&cpu_lock));

#if defined(sun)
	minor = (minor_t)(uintptr_t)vmem_alloc(dtrace_minor, 1,
	    VM_BESTFIT | VM_SLEEP);

	if (ddi_soft_state_zalloc(dtrace_softstate, minor) != DDI_SUCCESS) {
		vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
		return (NULL);
	}

	state = ddi_get_soft_state(dtrace_softstate, minor);
#else
	/* On FreeBSD, derive the credential and unit from the cdev. */
	if (dev != NULL) {
		cr = dev->si_cred;
		m = dev2unit(dev);
	}

	/* Allocate memory for the state. */
	state = kmem_zalloc(sizeof(dtrace_state_t), KM_SLEEP);
#endif

	/* EPID 0 is reserved (DTRACE_EPIDNONE); start above it. */
	state->dts_epid = DTRACE_EPIDNONE + 1;

	(void) snprintf(c, sizeof (c), "dtrace_aggid_%d", m);
#if defined(sun)
	state->dts_aggid_arena = vmem_create(c, (void *)1, UINT32_MAX, 1,
	    NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);

	if (devp != NULL) {
		major = getemajor(*devp);
	} else {
		major = ddi_driver_major(dtrace_devi);
	}

	state->dts_dev = makedevice(major, minor);

	if (devp != NULL)
		*devp = state->dts_dev;
#else
	/* Aggregation IDs come from a unit-number allocator on FreeBSD. */
	state->dts_aggid_arena = new_unrhdr(1, INT_MAX, &dtrace_unr_mtx);
	state->dts_dev = dev;
#endif

	/*
	 * We allocate NCPU buffers.  On the one hand, this can be quite
	 * a bit of memory per instance (nearly 36K on a Starcat).  On the
	 * other hand, it saves an additional memory reference in the probe
	 * path.
	 */
	state->dts_buffer = kmem_zalloc(bufsize, KM_SLEEP);
	state->dts_aggbuffer = kmem_zalloc(bufsize, KM_SLEEP);

#if defined(sun)
	state->dts_cleaner = CYCLIC_NONE;
	state->dts_deadman = CYCLIC_NONE;
#else
	callout_init(&state->dts_cleaner, CALLOUT_MPSAFE);
	callout_init(&state->dts_deadman, CALLOUT_MPSAFE);
#endif
	state->dts_vstate.dtvs_state = state;

	for (i = 0; i < DTRACEOPT_MAX; i++)
		state->dts_options[i] = DTRACEOPT_UNSET;

	/*
	 * Set the default options.
	 */
	opt = state->dts_options;
	opt[DTRACEOPT_BUFPOLICY] = DTRACEOPT_BUFPOLICY_SWITCH;
	opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_AUTO;
	opt[DTRACEOPT_NSPEC] = dtrace_nspec_default;
	opt[DTRACEOPT_SPECSIZE] = dtrace_specsize_default;
	opt[DTRACEOPT_CPU] = (dtrace_optval_t)DTRACE_CPUALL;
	opt[DTRACEOPT_STRSIZE] = dtrace_strsize_default;
	opt[DTRACEOPT_STACKFRAMES] = dtrace_stackframes_default;
	opt[DTRACEOPT_USTACKFRAMES] = dtrace_ustackframes_default;
	opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_default;
	opt[DTRACEOPT_AGGRATE] = dtrace_aggrate_default;
	opt[DTRACEOPT_SWITCHRATE] = dtrace_switchrate_default;
	opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_default;
	opt[DTRACEOPT_JSTACKFRAMES] = dtrace_jstackframes_default;
	opt[DTRACEOPT_JSTACKSTRSIZE] = dtrace_jstackstrsize_default;

	state->dts_activity = DTRACE_ACTIVITY_INACTIVE;

	/*
	 * Depending on the user credentials, we set flag bits which alter probe
	 * visibility or the amount of destructiveness allowed.  In the case of
	 * actual anonymous tracing, or the possession of all privileges, all of
	 * the normal checks are bypassed.
	 */
	if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
		state->dts_cred.dcr_visible = DTRACE_CRV_ALL;
		state->dts_cred.dcr_action = DTRACE_CRA_ALL;
	} else {
		/*
		 * Set up the credentials for this instantiation.  We take a
		 * hold on the credential to prevent it from disappearing on
		 * us; this in turn prevents the zone_t referenced by this
		 * credential from disappearing.  This means that we can
		 * examine the credential and the zone from probe context.
		 */
		crhold(cr);
		state->dts_cred.dcr_cred = cr;

		/*
		 * CRA_PROC means "we have *some* privilege for dtrace" and
		 * unlocks the use of variables like pid, zonename, etc.
		 */
		if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE) ||
		    PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
			state->dts_cred.dcr_action |= DTRACE_CRA_PROC;
		}

		/*
		 * dtrace_user allows use of syscall and profile providers.
		 * If the user also has proc_owner and/or proc_zone, we
		 * extend the scope to include additional visibility and
		 * destructive power.
		 */
		if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE)) {
			if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE)) {
				state->dts_cred.dcr_visible |=
				    DTRACE_CRV_ALLPROC;

				state->dts_cred.dcr_action |=
				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
			}

			if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE)) {
				state->dts_cred.dcr_visible |=
				    DTRACE_CRV_ALLZONE;

				state->dts_cred.dcr_action |=
				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
			}

			/*
			 * If we have all privs in whatever zone this is,
			 * we can do destructive things to processes which
			 * have altered credentials.
			 */
#if defined(sun)
			if (priv_isequalset(priv_getset(cr, PRIV_EFFECTIVE),
			    cr->cr_zone->zone_privset)) {
				state->dts_cred.dcr_action |=
				    DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
			}
#endif
		}

		/*
		 * Holding the dtrace_kernel privilege also implies that
		 * the user has the dtrace_user privilege from a visibility
		 * perspective.  But without further privileges, some
		 * destructive actions are not available.
		 */
		if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE)) {
			/*
			 * Make all probes in all zones visible.  However,
			 * this doesn't mean that all actions become available
			 * to all zones.
			 */
			state->dts_cred.dcr_visible |= DTRACE_CRV_KERNEL |
			    DTRACE_CRV_ALLPROC | DTRACE_CRV_ALLZONE;

			state->dts_cred.dcr_action |= DTRACE_CRA_KERNEL |
			    DTRACE_CRA_PROC;
			/*
			 * Holding proc_owner means that destructive actions
			 * for *this* zone are allowed.
			 */
			if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
				state->dts_cred.dcr_action |=
				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;

			/*
			 * Holding proc_zone means that destructive actions
			 * for this user/group ID in all zones is allowed.
			 */
			if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
				state->dts_cred.dcr_action |=
				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;

#if defined(sun)
			/*
			 * If we have all privs in whatever zone this is,
			 * we can do destructive things to processes which
			 * have altered credentials.
			 */
			if (priv_isequalset(priv_getset(cr, PRIV_EFFECTIVE),
			    cr->cr_zone->zone_privset)) {
				state->dts_cred.dcr_action |=
				    DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
			}
#endif
		}

		/*
		 * Holding the dtrace_proc privilege gives control over fasttrap
		 * and pid providers.  We need to grant wider destructive
		 * privileges in the event that the user has proc_owner and/or
		 * proc_zone.
		 */
		if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
			if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
				state->dts_cred.dcr_action |=
				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;

			if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
				state->dts_cred.dcr_action |=
				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
		}
	}

	return (state);
}
14383
14384static int
14385dtrace_state_buffer(dtrace_state_t *state, dtrace_buffer_t *buf, int which)
14386{
14387	dtrace_optval_t *opt = state->dts_options, size;
14388	processorid_t cpu = 0;;
14389	int flags = 0, rval, factor, divisor = 1;
14390
14391	ASSERT(MUTEX_HELD(&dtrace_lock));
14392	ASSERT(MUTEX_HELD(&cpu_lock));
14393	ASSERT(which < DTRACEOPT_MAX);
14394	ASSERT(state->dts_activity == DTRACE_ACTIVITY_INACTIVE ||
14395	    (state == dtrace_anon.dta_state &&
14396	    state->dts_activity == DTRACE_ACTIVITY_ACTIVE));
14397
14398	if (opt[which] == DTRACEOPT_UNSET || opt[which] == 0)
14399		return (0);
14400
14401	if (opt[DTRACEOPT_CPU] != DTRACEOPT_UNSET)
14402		cpu = opt[DTRACEOPT_CPU];
14403
14404	if (which == DTRACEOPT_SPECSIZE)
14405		flags |= DTRACEBUF_NOSWITCH;
14406
14407	if (which == DTRACEOPT_BUFSIZE) {
14408		if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_RING)
14409			flags |= DTRACEBUF_RING;
14410
14411		if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_FILL)
14412			flags |= DTRACEBUF_FILL;
14413
14414		if (state != dtrace_anon.dta_state ||
14415		    state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
14416			flags |= DTRACEBUF_INACTIVE;
14417	}
14418
14419	for (size = opt[which]; size >= sizeof (uint64_t); size /= divisor) {
14420		/*
14421		 * The size must be 8-byte aligned.  If the size is not 8-byte
14422		 * aligned, drop it down by the difference.
14423		 */
14424		if (size & (sizeof (uint64_t) - 1))
14425			size -= size & (sizeof (uint64_t) - 1);
14426
14427		if (size < state->dts_reserve) {
14428			/*
14429			 * Buffers always must be large enough to accommodate
14430			 * their prereserved space.  We return E2BIG instead
14431			 * of ENOMEM in this case to allow for user-level
14432			 * software to differentiate the cases.
14433			 */
14434			return (E2BIG);
14435		}
14436
14437		rval = dtrace_buffer_alloc(buf, size, flags, cpu, &factor);
14438
14439		if (rval != ENOMEM) {
14440			opt[which] = size;
14441			return (rval);
14442		}
14443
14444		if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
14445			return (rval);
14446
14447		for (divisor = 2; divisor < factor; divisor <<= 1)
14448			continue;
14449	}
14450
14451	return (ENOMEM);
14452}
14453
14454static int
14455dtrace_state_buffers(dtrace_state_t *state)
14456{
14457	dtrace_speculation_t *spec = state->dts_speculations;
14458	int rval, i;
14459
14460	if ((rval = dtrace_state_buffer(state, state->dts_buffer,
14461	    DTRACEOPT_BUFSIZE)) != 0)
14462		return (rval);
14463
14464	if ((rval = dtrace_state_buffer(state, state->dts_aggbuffer,
14465	    DTRACEOPT_AGGSIZE)) != 0)
14466		return (rval);
14467
14468	for (i = 0; i < state->dts_nspeculations; i++) {
14469		if ((rval = dtrace_state_buffer(state,
14470		    spec[i].dtsp_buffer, DTRACEOPT_SPECSIZE)) != 0)
14471			return (rval);
14472	}
14473
14474	return (0);
14475}
14476
14477static void
14478dtrace_state_prereserve(dtrace_state_t *state)
14479{
14480	dtrace_ecb_t *ecb;
14481	dtrace_probe_t *probe;
14482
14483	state->dts_reserve = 0;
14484
14485	if (state->dts_options[DTRACEOPT_BUFPOLICY] != DTRACEOPT_BUFPOLICY_FILL)
14486		return;
14487
14488	/*
14489	 * If our buffer policy is a "fill" buffer policy, we need to set the
14490	 * prereserved space to be the space required by the END probes.
14491	 */
14492	probe = dtrace_probes[dtrace_probeid_end - 1];
14493	ASSERT(probe != NULL);
14494
14495	for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
14496		if (ecb->dte_state != state)
14497			continue;
14498
14499		state->dts_reserve += ecb->dte_needed + ecb->dte_alignment;
14500	}
14501}
14502
/*
 * Transition a state from INACTIVE toward ACTIVE: prime retained enablings,
 * allocate speculation, principal and aggregation buffers, initialize the
 * dynamic variable space, start the cleaner and deadman timers, and fire
 * the BEGIN probe.  On success, *cpu is set to the CPU on which BEGIN
 * fired.  Returns 0 on success or an errno value on failure.
 */
static int
dtrace_state_go(dtrace_state_t *state, processorid_t *cpu)
{
	dtrace_optval_t *opt = state->dts_options, sz, nspec;
	dtrace_speculation_t *spec;
	dtrace_buffer_t *buf;
#if defined(sun)
	cyc_handler_t hdlr;
	cyc_time_t when;
#endif
	int rval = 0, i, bufsize = NCPU * sizeof (dtrace_buffer_t);
	dtrace_icookie_t cookie;

	mutex_enter(&cpu_lock);
	mutex_enter(&dtrace_lock);

	if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
		rval = EBUSY;
		goto out;
	}

	/*
	 * Before we can perform any checks, we must prime all of the
	 * retained enablings that correspond to this state.
	 */
	dtrace_enabling_prime(state);

	if (state->dts_destructive && !state->dts_cred.dcr_destructive) {
		rval = EACCES;
		goto out;
	}

	dtrace_state_prereserve(state);

	/*
	 * Now we try to allocate our speculations.  We do not automatically
	 * resize the number of speculations; if this fails, we will fail
	 * the operation.
	 */
	nspec = opt[DTRACEOPT_NSPEC];
	ASSERT(nspec != DTRACEOPT_UNSET);

	if (nspec > INT_MAX) {
		rval = ENOMEM;
		goto out;
	}

	spec = kmem_zalloc(nspec * sizeof (dtrace_speculation_t),
	    KM_NOSLEEP | KM_NORMALPRI);

	if (spec == NULL) {
		rval = ENOMEM;
		goto out;
	}

	state->dts_speculations = spec;
	state->dts_nspeculations = (int)nspec;

	for (i = 0; i < nspec; i++) {
		if ((buf = kmem_zalloc(bufsize,
		    KM_NOSLEEP | KM_NORMALPRI)) == NULL) {
			rval = ENOMEM;
			goto err;
		}

		spec[i].dtsp_buffer = buf;
	}

	if (opt[DTRACEOPT_GRABANON] != DTRACEOPT_UNSET) {
		if (dtrace_anon.dta_state == NULL) {
			rval = ENOENT;
			goto out;
		}

		if (state->dts_necbs != 0) {
			rval = EALREADY;
			goto out;
		}

		state->dts_anon = dtrace_anon_grab();
		ASSERT(state->dts_anon != NULL);
		state = state->dts_anon;

		/*
		 * We want "grabanon" to be set in the grabbed state, so we'll
		 * copy that option value from the grabbing state into the
		 * grabbed state.
		 */
		state->dts_options[DTRACEOPT_GRABANON] =
		    opt[DTRACEOPT_GRABANON];

		*cpu = dtrace_anon.dta_beganon;

		/*
		 * If the anonymous state is active (as it almost certainly
		 * is if the anonymous enabling ultimately matched anything),
		 * we don't allow any further option processing -- but we
		 * don't return failure.
		 */
		if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
			goto out;
	}

	if (opt[DTRACEOPT_AGGSIZE] != DTRACEOPT_UNSET &&
	    opt[DTRACEOPT_AGGSIZE] != 0) {
		if (state->dts_aggregations == NULL) {
			/*
			 * We're not going to create an aggregation buffer
			 * because we don't have any ECBs that contain
			 * aggregations -- set this option to 0.
			 */
			opt[DTRACEOPT_AGGSIZE] = 0;
		} else {
			/*
			 * If we have an aggregation buffer, we must also have
			 * a buffer to use as scratch.
			 */
			if (opt[DTRACEOPT_BUFSIZE] == DTRACEOPT_UNSET ||
			    opt[DTRACEOPT_BUFSIZE] < state->dts_needed) {
				opt[DTRACEOPT_BUFSIZE] = state->dts_needed;
			}
		}
	}

	if (opt[DTRACEOPT_SPECSIZE] != DTRACEOPT_UNSET &&
	    opt[DTRACEOPT_SPECSIZE] != 0) {
		if (!state->dts_speculates) {
			/*
			 * We're not going to create speculation buffers
			 * because we don't have any ECBs that actually
			 * speculate -- set the speculation size to 0.
			 */
			opt[DTRACEOPT_SPECSIZE] = 0;
		}
	}

	/*
	 * The bare minimum size for any buffer that we're actually going to
	 * do anything to is sizeof (uint64_t).
	 */
	sz = sizeof (uint64_t);

	if ((state->dts_needed != 0 && opt[DTRACEOPT_BUFSIZE] < sz) ||
	    (state->dts_speculates && opt[DTRACEOPT_SPECSIZE] < sz) ||
	    (state->dts_aggregations != NULL && opt[DTRACEOPT_AGGSIZE] < sz)) {
		/*
		 * A buffer size has been explicitly set to 0 (or to a size
		 * that will be adjusted to 0) and we need the space -- we
		 * need to return failure.  We return ENOSPC to differentiate
		 * it from failing to allocate a buffer due to failure to meet
		 * the reserve (for which we return E2BIG).
		 */
		rval = ENOSPC;
		goto out;
	}

	if ((rval = dtrace_state_buffers(state)) != 0)
		goto err;

	if ((sz = opt[DTRACEOPT_DYNVARSIZE]) == DTRACEOPT_UNSET)
		sz = dtrace_dstate_defsize;

	/*
	 * Initialize the dynamic variable space, halving the size on each
	 * allocation failure unless the resize policy is manual.
	 */
	do {
		rval = dtrace_dstate_init(&state->dts_vstate.dtvs_dynvars, sz);

		if (rval == 0)
			break;

		if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
			goto err;
	} while (sz >>= 1);

	opt[DTRACEOPT_DYNVARSIZE] = sz;

	if (rval != 0)
		goto err;

	if (opt[DTRACEOPT_STATUSRATE] > dtrace_statusrate_max)
		opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_max;

	/* Clamp the cleaning rate into [cleanrate_min, cleanrate_max]. */
	if (opt[DTRACEOPT_CLEANRATE] == 0)
		opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;

	if (opt[DTRACEOPT_CLEANRATE] < dtrace_cleanrate_min)
		opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_min;

	if (opt[DTRACEOPT_CLEANRATE] > dtrace_cleanrate_max)
		opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;

	state->dts_alive = state->dts_laststatus = dtrace_gethrtime();
#if defined(sun)
	hdlr.cyh_func = (cyc_func_t)dtrace_state_clean;
	hdlr.cyh_arg = state;
	hdlr.cyh_level = CY_LOW_LEVEL;

	when.cyt_when = 0;
	when.cyt_interval = opt[DTRACEOPT_CLEANRATE];

	state->dts_cleaner = cyclic_add(&hdlr, &when);

	hdlr.cyh_func = (cyc_func_t)dtrace_state_deadman;
	hdlr.cyh_arg = state;
	hdlr.cyh_level = CY_LOW_LEVEL;

	when.cyt_when = 0;
	when.cyt_interval = dtrace_deadman_interval;

	state->dts_deadman = cyclic_add(&hdlr, &when);
#else
	/* The callout handlers rearm themselves on each invocation. */
	callout_reset(&state->dts_cleaner, hz * opt[DTRACEOPT_CLEANRATE] / NANOSEC,
	    dtrace_state_clean, state);
	callout_reset(&state->dts_deadman, hz * dtrace_deadman_interval / NANOSEC,
	    dtrace_state_deadman, state);
#endif

	state->dts_activity = DTRACE_ACTIVITY_WARMUP;

#if defined(sun)
	if (state->dts_getf != 0 &&
	    !(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)) {
		/*
		 * We don't have kernel privs but we have at least one call
		 * to getf(); we need to bump our zone's count, and (if
		 * this is the first enabling to have an unprivileged call
		 * to getf()) we need to hook into closef().
		 */
		state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf++;

		if (dtrace_getf++ == 0) {
			ASSERT(dtrace_closef == NULL);
			dtrace_closef = dtrace_getf_barrier;
		}
	}
#endif

	/*
	 * Now it's time to actually fire the BEGIN probe.  We need to disable
	 * interrupts here both to record the CPU on which we fired the BEGIN
	 * probe (the data from this CPU will be processed first at user
	 * level) and to manually activate the buffer for this CPU.
	 */
	cookie = dtrace_interrupt_disable();
	*cpu = curcpu;
	ASSERT(state->dts_buffer[*cpu].dtb_flags & DTRACEBUF_INACTIVE);
	state->dts_buffer[*cpu].dtb_flags &= ~DTRACEBUF_INACTIVE;

	dtrace_probe(dtrace_probeid_begin,
	    (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
	dtrace_interrupt_enable(cookie);
	/*
	 * We may have had an exit action from a BEGIN probe; only change our
	 * state to ACTIVE if we're still in WARMUP.
	 */
	ASSERT(state->dts_activity == DTRACE_ACTIVITY_WARMUP ||
	    state->dts_activity == DTRACE_ACTIVITY_DRAINING);

	if (state->dts_activity == DTRACE_ACTIVITY_WARMUP)
		state->dts_activity = DTRACE_ACTIVITY_ACTIVE;

	/*
	 * Regardless of whether or not now we're in ACTIVE or DRAINING, we
	 * want each CPU to transition its principal buffer out of the
	 * INACTIVE state.  Doing this assures that no CPU will suddenly begin
	 * processing an ECB halfway down a probe's ECB chain; all CPUs will
	 * atomically transition from processing none of a state's ECBs to
	 * processing all of them.
	 */
	dtrace_xcall(DTRACE_CPUALL,
	    (dtrace_xcall_t)dtrace_buffer_activate, state);
	goto out;

	/* Error path: unwind the buffer and speculation allocations. */
err:
	dtrace_buffer_free(state->dts_buffer);
	dtrace_buffer_free(state->dts_aggbuffer);

	if ((nspec = state->dts_nspeculations) == 0) {
		ASSERT(state->dts_speculations == NULL);
		goto out;
	}

	spec = state->dts_speculations;
	ASSERT(spec != NULL);

	for (i = 0; i < state->dts_nspeculations; i++) {
		if ((buf = spec[i].dtsp_buffer) == NULL)
			break;

		dtrace_buffer_free(buf);
		kmem_free(buf, bufsize);
	}

	kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
	state->dts_nspeculations = 0;
	state->dts_speculations = NULL;

out:
	mutex_exit(&dtrace_lock);
	mutex_exit(&cpu_lock);

	return (rval);
}
14804
/*
 * Stop an ACTIVE (or DRAINING) state: transition through DRAINING and
 * COOLDOWN (with a dtrace_sync() after each transition so every CPU has
 * observed it), fire the END probe, and move to STOPPED.  On success,
 * *cpu is set to the CPU on which the END probe fired.  Returns EINVAL
 * if the state is not currently running.  Caller must hold dtrace_lock.
 */
static int
dtrace_state_stop(dtrace_state_t *state, processorid_t *cpu)
{
	dtrace_icookie_t cookie;

	ASSERT(MUTEX_HELD(&dtrace_lock));

	if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE &&
	    state->dts_activity != DTRACE_ACTIVITY_DRAINING)
		return (EINVAL);

	/*
	 * We'll set the activity to DTRACE_ACTIVITY_DRAINING, and issue a sync
	 * to be sure that every CPU has seen it.  See below for the details
	 * on why this is done.
	 */
	state->dts_activity = DTRACE_ACTIVITY_DRAINING;
	dtrace_sync();

	/*
	 * By this point, it is impossible for any CPU to be still processing
	 * with DTRACE_ACTIVITY_ACTIVE.  We can thus set our activity to
	 * DTRACE_ACTIVITY_COOLDOWN and know that we're not racing with any
	 * other CPU in dtrace_buffer_reserve().  This allows dtrace_probe()
	 * and callees to know that the activity is DTRACE_ACTIVITY_COOLDOWN
	 * iff we're in the END probe.
	 */
	state->dts_activity = DTRACE_ACTIVITY_COOLDOWN;
	dtrace_sync();
	ASSERT(state->dts_activity == DTRACE_ACTIVITY_COOLDOWN);

	/*
	 * Finally, we can release the reserve and call the END probe.  We
	 * disable interrupts across calling the END probe to allow us to
	 * return the CPU on which we actually called the END probe.  This
	 * allows user-land to be sure that this CPU's principal buffer is
	 * processed last.
	 */
	state->dts_reserve = 0;

	cookie = dtrace_interrupt_disable();
	*cpu = curcpu;
	dtrace_probe(dtrace_probeid_end,
	    (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
	dtrace_interrupt_enable(cookie);

	state->dts_activity = DTRACE_ACTIVITY_STOPPED;
	dtrace_sync();

#if defined(sun)
	if (state->dts_getf != 0 &&
	    !(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)) {
		/*
		 * We don't have kernel privs but we have at least one call
		 * to getf(); we need to lower our zone's count, and (if
		 * this is the last enabling to have an unprivileged call
		 * to getf()) we need to clear the closef() hook.
		 */
		ASSERT(state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf > 0);
		ASSERT(dtrace_closef == dtrace_getf_barrier);
		ASSERT(dtrace_getf > 0);

		state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf--;

		if (--dtrace_getf == 0)
			dtrace_closef = NULL;
	}
#endif

	return (0);
}
14876
14877static int
14878dtrace_state_option(dtrace_state_t *state, dtrace_optid_t option,
14879    dtrace_optval_t val)
14880{
14881	ASSERT(MUTEX_HELD(&dtrace_lock));
14882
14883	if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
14884		return (EBUSY);
14885
14886	if (option >= DTRACEOPT_MAX)
14887		return (EINVAL);
14888
14889	if (option != DTRACEOPT_CPU && val < 0)
14890		return (EINVAL);
14891
14892	switch (option) {
14893	case DTRACEOPT_DESTRUCTIVE:
14894		if (dtrace_destructive_disallow)
14895			return (EACCES);
14896
14897		state->dts_cred.dcr_destructive = 1;
14898		break;
14899
14900	case DTRACEOPT_BUFSIZE:
14901	case DTRACEOPT_DYNVARSIZE:
14902	case DTRACEOPT_AGGSIZE:
14903	case DTRACEOPT_SPECSIZE:
14904	case DTRACEOPT_STRSIZE:
14905		if (val < 0)
14906			return (EINVAL);
14907
14908		if (val >= LONG_MAX) {
14909			/*
14910			 * If this is an otherwise negative value, set it to
14911			 * the highest multiple of 128m less than LONG_MAX.
14912			 * Technically, we're adjusting the size without
14913			 * regard to the buffer resizing policy, but in fact,
14914			 * this has no effect -- if we set the buffer size to
14915			 * ~LONG_MAX and the buffer policy is ultimately set to
14916			 * be "manual", the buffer allocation is guaranteed to
14917			 * fail, if only because the allocation requires two
14918			 * buffers.  (We set the the size to the highest
14919			 * multiple of 128m because it ensures that the size
14920			 * will remain a multiple of a megabyte when
14921			 * repeatedly halved -- all the way down to 15m.)
14922			 */
14923			val = LONG_MAX - (1 << 27) + 1;
14924		}
14925	}
14926
14927	state->dts_options[option] = val;
14928
14929	return (0);
14930}
14931
/*
 * Destroy the given DTrace consumer state:  retract any retained enablings,
 * disable and destroy every ECB, and release the buffers, speculations,
 * variable state, aggregations, timers, formats and the aggregation ID
 * arena that the state owns.  The caller must hold both dtrace_lock and
 * cpu_lock.
 */
static void
dtrace_state_destroy(dtrace_state_t *state)
{
	dtrace_ecb_t *ecb;
	dtrace_vstate_t *vstate = &state->dts_vstate;
#if defined(sun)
	minor_t minor = getminor(state->dts_dev);
#endif
	int i, bufsize = NCPU * sizeof (dtrace_buffer_t);
	dtrace_speculation_t *spec = state->dts_speculations;
	int nspec = state->dts_nspeculations;
	uint32_t match;

	ASSERT(MUTEX_HELD(&dtrace_lock));
	ASSERT(MUTEX_HELD(&cpu_lock));

	/*
	 * First, retract any retained enablings for this state.
	 */
	dtrace_enabling_retract(state);
	ASSERT(state->dts_nretained == 0);

	if (state->dts_activity == DTRACE_ACTIVITY_ACTIVE ||
	    state->dts_activity == DTRACE_ACTIVITY_DRAINING) {
		/*
		 * We have managed to come into dtrace_state_destroy() on a
		 * hot enabling -- almost certainly because of a disorderly
		 * shutdown of a consumer.  (That is, a consumer that is
		 * exiting without having called dtrace_stop().) In this case,
		 * we're going to set our activity to be KILLED, and then
		 * issue a sync to be sure that everyone is out of probe
		 * context before we start blowing away ECBs.
		 */
		state->dts_activity = DTRACE_ACTIVITY_KILLED;
		dtrace_sync();
	}

	/*
	 * Release the credential hold we took in dtrace_state_create().
	 */
	if (state->dts_cred.dcr_cred != NULL)
		crfree(state->dts_cred.dcr_cred);

	/*
	 * Now we can safely disable and destroy any enabled probes.  Because
	 * any DTRACE_PRIV_KERNEL probes may actually be slowing our progress
	 * (especially if they're all enabled), we take two passes through the
	 * ECBs:  in the first, we disable just DTRACE_PRIV_KERNEL probes, and
	 * in the second we disable whatever is left over.
	 */
	for (match = DTRACE_PRIV_KERNEL; ; match = 0) {
		for (i = 0; i < state->dts_necbs; i++) {
			if ((ecb = state->dts_ecbs[i]) == NULL)
				continue;

			if (match && ecb->dte_probe != NULL) {
				dtrace_probe_t *probe = ecb->dte_probe;
				dtrace_provider_t *prov = probe->dtpr_provider;

				/*
				 * On the first pass, skip ECBs whose provider
				 * privilege flags don't include 'match'.
				 */
				if (!(prov->dtpv_priv.dtpp_flags & match))
					continue;
			}

			dtrace_ecb_disable(ecb);
			dtrace_ecb_destroy(ecb);
		}

		if (!match)
			break;
	}

	/*
	 * Before we free the buffers, perform one more sync to assure that
	 * every CPU is out of probe context.
	 */
	dtrace_sync();

	dtrace_buffer_free(state->dts_buffer);
	dtrace_buffer_free(state->dts_aggbuffer);

	for (i = 0; i < nspec; i++)
		dtrace_buffer_free(spec[i].dtsp_buffer);

#if defined(sun)
	if (state->dts_cleaner != CYCLIC_NONE)
		cyclic_remove(state->dts_cleaner);

	if (state->dts_deadman != CYCLIC_NONE)
		cyclic_remove(state->dts_deadman);
#else
	/*
	 * Stop the cleaner and deadman callouts, then drain them so that
	 * neither can still be running when we tear down the state below.
	 */
	callout_stop(&state->dts_cleaner);
	callout_drain(&state->dts_cleaner);
	callout_stop(&state->dts_deadman);
	callout_drain(&state->dts_deadman);
#endif

	dtrace_dstate_fini(&vstate->dtvs_dynvars);
	dtrace_vstate_fini(vstate);
	if (state->dts_ecbs != NULL)
		kmem_free(state->dts_ecbs, state->dts_necbs * sizeof (dtrace_ecb_t *));

	if (state->dts_aggregations != NULL) {
#ifdef DEBUG
		/* All aggregations must have been destroyed with the ECBs. */
		for (i = 0; i < state->dts_naggregations; i++)
			ASSERT(state->dts_aggregations[i] == NULL);
#endif
		ASSERT(state->dts_naggregations > 0);
		kmem_free(state->dts_aggregations,
		    state->dts_naggregations * sizeof (dtrace_aggregation_t *));
	}

	kmem_free(state->dts_buffer, bufsize);
	kmem_free(state->dts_aggbuffer, bufsize);

	for (i = 0; i < nspec; i++)
		kmem_free(spec[i].dtsp_buffer, bufsize);

	if (spec != NULL)
		kmem_free(spec, nspec * sizeof (dtrace_speculation_t));

	dtrace_format_destroy(state);

	if (state->dts_aggid_arena != NULL) {
#if defined(sun)
		vmem_destroy(state->dts_aggid_arena);
#else
		delete_unrhdr(state->dts_aggid_arena);
#endif
		state->dts_aggid_arena = NULL;
	}
#if defined(sun)
	ddi_soft_state_free(dtrace_softstate, minor);
	vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
#endif
}
15067
15068/*
15069 * DTrace Anonymous Enabling Functions
15070 */
15071static dtrace_state_t *
15072dtrace_anon_grab(void)
15073{
15074	dtrace_state_t *state;
15075
15076	ASSERT(MUTEX_HELD(&dtrace_lock));
15077
15078	if ((state = dtrace_anon.dta_state) == NULL) {
15079		ASSERT(dtrace_anon.dta_enabling == NULL);
15080		return (NULL);
15081	}
15082
15083	ASSERT(dtrace_anon.dta_enabling != NULL);
15084	ASSERT(dtrace_retained != NULL);
15085
15086	dtrace_enabling_destroy(dtrace_anon.dta_enabling);
15087	dtrace_anon.dta_enabling = NULL;
15088	dtrace_anon.dta_state = NULL;
15089
15090	return (state);
15091}
15092
/*
 * Process anonymous enablings:  walk the "dof-data-N" properties (N = 0,
 * 1, 2, ...), creating the anonymous state on first use and slurping each
 * DOF into it.  The walk stops at the first missing property or on any
 * error; if an enabling results, it is retained and dumped.  The caller
 * must hold both dtrace_lock and cpu_lock.
 */
static void
dtrace_anon_property(void)
{
	int i, rv;
	dtrace_state_t *state;
	dof_hdr_t *dof;
	char c[32];		/* enough for "dof-data-" + digits */

	ASSERT(MUTEX_HELD(&dtrace_lock));
	ASSERT(MUTEX_HELD(&cpu_lock));

	for (i = 0; ; i++) {
		(void) snprintf(c, sizeof (c), "dof-data-%d", i);

		/* Report DOF errors verbosely while processing properties. */
		dtrace_err_verbose = 1;

		if ((dof = dtrace_dof_property(c)) == NULL) {
			dtrace_err_verbose = 0;
			break;
		}

#if defined(sun)
		/*
		 * We want to create anonymous state, so we need to transition
		 * the kernel debugger to indicate that DTrace is active.  If
		 * this fails (e.g. because the debugger has modified text in
		 * some way), we won't continue with the processing.
		 */
		if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
			cmn_err(CE_NOTE, "kernel debugger active; anonymous "
			    "enabling ignored.");
			dtrace_dof_destroy(dof);
			break;
		}
#endif

		/*
		 * If we haven't allocated an anonymous state, we'll do so now.
		 */
		if ((state = dtrace_anon.dta_state) == NULL) {
#if defined(sun)
			state = dtrace_state_create(NULL, NULL);
#else
			state = dtrace_state_create(NULL);
#endif
			dtrace_anon.dta_state = state;

			if (state == NULL) {
				/*
				 * This basically shouldn't happen:  the only
				 * failure mode from dtrace_state_create() is a
				 * failure of ddi_soft_state_zalloc() that
				 * itself should never happen.  Still, the
				 * interface allows for a failure mode, and
				 * we want to fail as gracefully as possible:
				 * we'll emit an error message and cease
				 * processing anonymous state in this case.
				 */
				cmn_err(CE_WARN, "failed to create "
				    "anonymous state");
				dtrace_dof_destroy(dof);
				break;
			}
		}

		rv = dtrace_dof_slurp(dof, &state->dts_vstate, CRED(),
		    &dtrace_anon.dta_enabling, 0, B_TRUE);

		if (rv == 0)
			rv = dtrace_dof_options(dof, state);

		dtrace_err_verbose = 0;
		dtrace_dof_destroy(dof);

		if (rv != 0) {
			/*
			 * This is malformed DOF; chuck any anonymous state
			 * that we created.
			 */
			ASSERT(dtrace_anon.dta_enabling == NULL);
			dtrace_state_destroy(state);
			dtrace_anon.dta_state = NULL;
			break;
		}

		ASSERT(dtrace_anon.dta_enabling != NULL);
	}

	if (dtrace_anon.dta_enabling != NULL) {
		int rval;

		/*
		 * dtrace_enabling_retain() can only fail because we are
		 * trying to retain more enablings than are allowed -- but
		 * we only have one anonymous enabling, and we are guaranteed
		 * to be allowed at least one retained enabling; we assert
		 * that dtrace_enabling_retain() returns success.
		 */
		rval = dtrace_enabling_retain(dtrace_anon.dta_enabling);
		ASSERT(rval == 0);

		dtrace_enabling_dump(dtrace_anon.dta_enabling);
	}
}
15197
15198/*
15199 * DTrace Helper Functions
15200 */
/*
 * Record an entry in the global helper trace buffer describing the
 * execution of 'helper' at the point given by 'where' (an action index or
 * one of the DTRACE_HELPTRACE_* markers), along with the current fault
 * state and a snapshot of this CPU's copy of every helper local.  This is
 * a no-op unless helper tracing is enabled.
 */
static void
dtrace_helper_trace(dtrace_helper_action_t *helper,
    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate, int where)
{
	uint32_t size, next, nnext, i;
	dtrace_helptrace_t *ent;
	uint16_t flags = cpu_core[curcpu].cpuc_dtrace_flags;

	if (!dtrace_helptrace_enabled)
		return;

	ASSERT(vstate->dtvs_nlocals <= dtrace_helptrace_nlocals);

	/*
	 * What would a tracing framework be without its own tracing
	 * framework?  (Well, a hell of a lot simpler, for starters...)
	 */
	size = sizeof (dtrace_helptrace_t) + dtrace_helptrace_nlocals *
	    sizeof (uint64_t) - sizeof (uint64_t);

	/*
	 * Iterate until we can allocate a slot in the trace buffer.
	 */
	do {
		next = dtrace_helptrace_next;

		if (next + size < dtrace_helptrace_bufsize) {
			nnext = next + size;
		} else {
			/* No room at the end; wrap back to the start. */
			nnext = size;
		}
	} while (dtrace_cas32(&dtrace_helptrace_next, next, nnext) != next);

	/*
	 * We have our slot; fill it in.
	 */
	if (nnext == size)
		next = 0;

	ent = (dtrace_helptrace_t *)&dtrace_helptrace_buffer[next];
	ent->dtht_helper = helper;
	ent->dtht_where = where;
	ent->dtht_nlocals = vstate->dtvs_nlocals;

	/* Capture any pending fault state for this CPU. */
	ent->dtht_fltoffs = (mstate->dtms_present & DTRACE_MSTATE_FLTOFFS) ?
	    mstate->dtms_fltoffs : -1;
	ent->dtht_fault = DTRACE_FLAGS2FLT(flags);
	ent->dtht_illval = cpu_core[curcpu].cpuc_dtrace_illval;

	/* Snapshot this CPU's value of each allocated helper local. */
	for (i = 0; i < vstate->dtvs_nlocals; i++) {
		dtrace_statvar_t *svar;

		if ((svar = vstate->dtvs_locals[i]) == NULL)
			continue;

		ASSERT(svar->dtsv_size >= NCPU * sizeof (uint64_t));
		ent->dtht_locals[i] =
		    ((uint64_t *)(uintptr_t)svar->dtsv_data)[curcpu];
	}
}
15261
/*
 * Execute the current process's helper actions of the given kind ('which'
 * is a DTRACE_HELPER_ACTION_* index).  For each helper on the list, the
 * predicate (if any) is emulated first; if it evaluates to true, each of
 * the helper's action DIFOs is emulated in turn.  Returns the value of the
 * last action emulated, or 0 if the process has no matching helpers or a
 * fault is induced.  mstate's first two arguments are temporarily replaced
 * with arg0/arg1 and restored before returning.
 */
static uint64_t
dtrace_helper(int which, dtrace_mstate_t *mstate,
    dtrace_state_t *state, uint64_t arg0, uint64_t arg1)
{
	uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags;
	uint64_t sarg0 = mstate->dtms_arg[0];
	uint64_t sarg1 = mstate->dtms_arg[1];
	uint64_t rval = 0;
	dtrace_helpers_t *helpers = curproc->p_dtrace_helpers;
	dtrace_helper_action_t *helper;
	dtrace_vstate_t *vstate;
	dtrace_difo_t *pred;
	int i, trace = dtrace_helptrace_enabled;

	ASSERT(which >= 0 && which < DTRACE_NHELPER_ACTIONS);

	if (helpers == NULL)
		return (0);

	if ((helper = helpers->dthps_actions[which]) == NULL)
		return (0);

	vstate = &helpers->dthps_vstate;
	mstate->dtms_arg[0] = arg0;
	mstate->dtms_arg[1] = arg1;

	/*
	 * Now iterate over each helper.  If its predicate evaluates to 'true',
	 * we'll call the corresponding actions.  Note that the below calls
	 * to dtrace_dif_emulate() may set faults in machine state.  This is
	 * okay:  our caller (the outer dtrace_dif_emulate()) will simply plow
	 * the stored DIF offset with its own (which is the desired behavior).
	 * Also, note the calls to dtrace_dif_emulate() may allocate scratch
	 * from machine state; this is okay, too.
	 */
	for (; helper != NULL; helper = helper->dtha_next) {
		if ((pred = helper->dtha_predicate) != NULL) {
			if (trace)
				dtrace_helper_trace(helper, mstate, vstate, 0);

			/* A false predicate skips this helper's actions. */
			if (!dtrace_dif_emulate(pred, mstate, vstate, state))
				goto next;

			if (*flags & CPU_DTRACE_FAULT)
				goto err;
		}

		for (i = 0; i < helper->dtha_nactions; i++) {
			if (trace)
				dtrace_helper_trace(helper,
				    mstate, vstate, i + 1);

			rval = dtrace_dif_emulate(helper->dtha_actions[i],
			    mstate, vstate, state);

			if (*flags & CPU_DTRACE_FAULT)
				goto err;
		}

next:
		if (trace)
			dtrace_helper_trace(helper, mstate, vstate,
			    DTRACE_HELPTRACE_NEXT);
	}

	if (trace)
		dtrace_helper_trace(helper, mstate, vstate,
		    DTRACE_HELPTRACE_DONE);

	/*
	 * Restore the arg0 that we saved upon entry.
	 */
	mstate->dtms_arg[0] = sarg0;
	mstate->dtms_arg[1] = sarg1;

	return (rval);

err:
	if (trace)
		dtrace_helper_trace(helper, mstate, vstate,
		    DTRACE_HELPTRACE_ERR);

	/*
	 * Restore the arg0 that we saved upon entry.
	 */
	mstate->dtms_arg[0] = sarg0;
	mstate->dtms_arg[1] = sarg1;

	return (0);
}
15352
15353static void
15354dtrace_helper_action_destroy(dtrace_helper_action_t *helper,
15355    dtrace_vstate_t *vstate)
15356{
15357	int i;
15358
15359	if (helper->dtha_predicate != NULL)
15360		dtrace_difo_release(helper->dtha_predicate, vstate);
15361
15362	for (i = 0; i < helper->dtha_nactions; i++) {
15363		ASSERT(helper->dtha_actions[i] != NULL);
15364		dtrace_difo_release(helper->dtha_actions[i], vstate);
15365	}
15366
15367	kmem_free(helper->dtha_actions,
15368	    helper->dtha_nactions * sizeof (dtrace_difo_t *));
15369	kmem_free(helper, sizeof (dtrace_helper_action_t));
15370}
15371
/*
 * Destroy all of the current process's helper actions and helper providers
 * whose generation matches 'gen'.  Returns EINVAL if the process has no
 * helpers or the generation exceeds the current one.  dtrace_lock must be
 * held on entry; note that it is dropped and reacquired while removing
 * each helper provider from the meta provider.
 */
static int
dtrace_helper_destroygen(int gen)
{
	proc_t *p = curproc;
	dtrace_helpers_t *help = p->p_dtrace_helpers;
	dtrace_vstate_t *vstate;
	int i;

	ASSERT(MUTEX_HELD(&dtrace_lock));

	if (help == NULL || gen > help->dthps_generation)
		return (EINVAL);

	vstate = &help->dthps_vstate;

	/*
	 * Unlink and destroy every helper action of this generation,
	 * preserving list order for the survivors.
	 */
	for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
		dtrace_helper_action_t *last = NULL, *h, *next;

		for (h = help->dthps_actions[i]; h != NULL; h = next) {
			next = h->dtha_next;

			if (h->dtha_generation == gen) {
				if (last != NULL) {
					last->dtha_next = next;
				} else {
					help->dthps_actions[i] = next;
				}

				dtrace_helper_action_destroy(h, vstate);
			} else {
				last = h;
			}
		}
	}

	/*
	 * Iterate until we've cleared out all helper providers with the
	 * given generation number.
	 */
	for (;;) {
		dtrace_helper_provider_t *prov;

		/*
		 * Look for a helper provider with the right generation. We
		 * have to start back at the beginning of the list each time
		 * because we drop dtrace_lock. It's unlikely that we'll make
		 * more than two passes.
		 */
		for (i = 0; i < help->dthps_nprovs; i++) {
			prov = help->dthps_provs[i];

			if (prov->dthp_generation == gen)
				break;
		}

		/*
		 * If there were no matches, we're done.
		 */
		if (i == help->dthps_nprovs)
			break;

		/*
		 * Move the last helper provider into this slot.
		 */
		help->dthps_nprovs--;
		help->dthps_provs[i] = help->dthps_provs[help->dthps_nprovs];
		help->dthps_provs[help->dthps_nprovs] = NULL;

		mutex_exit(&dtrace_lock);

		/*
		 * If we have a meta provider, remove this helper provider.
		 */
		mutex_enter(&dtrace_meta_lock);
		if (dtrace_meta_pid != NULL) {
			ASSERT(dtrace_deferred_pid == NULL);
			dtrace_helper_provider_remove(&prov->dthp_prov,
			    p->p_pid);
		}
		mutex_exit(&dtrace_meta_lock);

		dtrace_helper_provider_destroy(prov);

		mutex_enter(&dtrace_lock);
	}

	return (0);
}
15460
15461static int
15462dtrace_helper_validate(dtrace_helper_action_t *helper)
15463{
15464	int err = 0, i;
15465	dtrace_difo_t *dp;
15466
15467	if ((dp = helper->dtha_predicate) != NULL)
15468		err += dtrace_difo_validate_helper(dp);
15469
15470	for (i = 0; i < helper->dtha_nactions; i++)
15471		err += dtrace_difo_validate_helper(helper->dtha_actions[i]);
15472
15473	return (err == 0);
15474}
15475
/*
 * Build a helper action of kind 'which' (a DTRACE_HELPER_ACTION_* index)
 * from the given ECB description and append it to the current process's
 * list for that kind.  Every action in the description must be a DIFEXPR
 * with a DIFO.  Returns 0 on success, ENOSPC if the per-kind limit
 * (dtrace_helper_actions_max) would be exceeded, and EINVAL on a
 * malformed description or validation failure.
 */
static int
dtrace_helper_action_add(int which, dtrace_ecbdesc_t *ep)
{
	dtrace_helpers_t *help;
	dtrace_helper_action_t *helper, *last;
	dtrace_actdesc_t *act;
	dtrace_vstate_t *vstate;
	dtrace_predicate_t *pred;
	int count = 0, nactions = 0, i;

	if (which < 0 || which >= DTRACE_NHELPER_ACTIONS)
		return (EINVAL);

	help = curproc->p_dtrace_helpers;
	last = help->dthps_actions[which];
	vstate = &help->dthps_vstate;

	/* Walk to the tail of the list, counting existing helpers. */
	for (count = 0; last != NULL; last = last->dtha_next) {
		count++;
		if (last->dtha_next == NULL)
			break;
	}

	/*
	 * If we already have dtrace_helper_actions_max helper actions for this
	 * helper action type, we'll refuse to add a new one.
	 */
	if (count >= dtrace_helper_actions_max)
		return (ENOSPC);

	helper = kmem_zalloc(sizeof (dtrace_helper_action_t), KM_SLEEP);
	helper->dtha_generation = help->dthps_generation;

	/* Take a hold on the predicate's DIFO, if a predicate is present. */
	if ((pred = ep->dted_pred.dtpdd_predicate) != NULL) {
		ASSERT(pred->dtp_difo != NULL);
		dtrace_difo_hold(pred->dtp_difo);
		helper->dtha_predicate = pred->dtp_difo;
	}

	/* Every action must be a DIFEXPR with a DIFO; count them. */
	for (act = ep->dted_action; act != NULL; act = act->dtad_next) {
		if (act->dtad_kind != DTRACEACT_DIFEXPR)
			goto err;

		if (act->dtad_difo == NULL)
			goto err;

		nactions++;
	}

	helper->dtha_actions = kmem_zalloc(sizeof (dtrace_difo_t *) *
	    (helper->dtha_nactions = nactions), KM_SLEEP);

	/* Take a hold on each action DIFO and record it. */
	for (act = ep->dted_action, i = 0; act != NULL; act = act->dtad_next) {
		dtrace_difo_hold(act->dtad_difo);
		helper->dtha_actions[i++] = act->dtad_difo;
	}

	if (!dtrace_helper_validate(helper))
		goto err;

	/* Validation passed; link the new helper at the end of the list. */
	if (last == NULL) {
		help->dthps_actions[which] = helper;
	} else {
		last->dtha_next = helper;
	}

	/*
	 * If this vstate has more locals than the helper trace bookkeeping
	 * accounts for, grow the count and restart the trace buffer (trace
	 * entry size depends on the number of locals).
	 */
	if (vstate->dtvs_nlocals > dtrace_helptrace_nlocals) {
		dtrace_helptrace_nlocals = vstate->dtvs_nlocals;
		dtrace_helptrace_next = 0;
	}

	return (0);
err:
	/* Release any holds taken above and free the partial helper. */
	dtrace_helper_action_destroy(helper, vstate);
	return (EINVAL);
}
15552
/*
 * Make the helper provider descriptions for 'help' known to the meta
 * provider:  either the single description 'dofhp' (if non-NULL), or all
 * of them.  If DTrace isn't attached or no meta provider has registered,
 * the helpers are instead queued on the deferred list for later
 * processing.  dtrace_lock must not be held on entry; dtrace_meta_lock
 * and dtrace_lock are taken (and released) internally.
 */
static void
dtrace_helper_provider_register(proc_t *p, dtrace_helpers_t *help,
    dof_helper_t *dofhp)
{
	ASSERT(MUTEX_NOT_HELD(&dtrace_lock));

	mutex_enter(&dtrace_meta_lock);
	mutex_enter(&dtrace_lock);

	if (!dtrace_attached() || dtrace_meta_pid == NULL) {
		/*
		 * If the dtrace module is loaded but not attached, or if
		 * there isn't a meta provider registered to deal with
		 * these provider descriptions, we need to postpone creating
		 * the actual providers until later.
		 */

		if (help->dthps_next == NULL && help->dthps_prev == NULL &&
		    dtrace_deferred_pid != help) {
			/* Push 'help' onto the head of the deferred list. */
			help->dthps_deferred = 1;
			help->dthps_pid = p->p_pid;
			help->dthps_next = dtrace_deferred_pid;
			help->dthps_prev = NULL;
			if (dtrace_deferred_pid != NULL)
				dtrace_deferred_pid->dthps_prev = help;
			dtrace_deferred_pid = help;
		}

		mutex_exit(&dtrace_lock);

	} else if (dofhp != NULL) {
		/*
		 * If the dtrace module is loaded and we have a particular
		 * helper provider description, pass that off to the
		 * meta provider.
		 */

		mutex_exit(&dtrace_lock);

		dtrace_helper_provide(dofhp, p->p_pid);

	} else {
		/*
		 * Otherwise, just pass all the helper provider descriptions
		 * off to the meta provider.
		 */

		int i;
		mutex_exit(&dtrace_lock);

		for (i = 0; i < help->dthps_nprovs; i++) {
			dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
			    p->p_pid);
		}
	}

	mutex_exit(&dtrace_meta_lock);
}
15611
/*
 * Record a new helper provider description of generation 'gen' for the
 * current process, growing the provider table (doubling, capped at
 * dtrace_helper_providers_max) if it is full.  Returns 0 on success,
 * ENOSPC when the provider limit is reached, or EALREADY if the same DOF
 * has already been added.  dtrace_lock must be held.
 */
static int
dtrace_helper_provider_add(dof_helper_t *dofhp, int gen)
{
	dtrace_helpers_t *help;
	dtrace_helper_provider_t *hprov, **tmp_provs;
	uint_t tmp_maxprovs, i;

	ASSERT(MUTEX_HELD(&dtrace_lock));

	help = curproc->p_dtrace_helpers;
	ASSERT(help != NULL);

	/*
	 * If we already have dtrace_helper_providers_max helper providers,
	 * we'll refuse to add a new one.
	 */
	if (help->dthps_nprovs >= dtrace_helper_providers_max)
		return (ENOSPC);

	/*
	 * Check to make sure this isn't a duplicate.
	 */
	for (i = 0; i < help->dthps_nprovs; i++) {
		if (dofhp->dofhp_dof ==
		    help->dthps_provs[i]->dthp_prov.dofhp_dof)
			return (EALREADY);
	}

	hprov = kmem_zalloc(sizeof (dtrace_helper_provider_t), KM_SLEEP);
	hprov->dthp_prov = *dofhp;
	hprov->dthp_ref = 1;
	hprov->dthp_generation = gen;

	/*
	 * Allocate a bigger table for helper providers if it's already full.
	 */
	if (help->dthps_maxprovs == help->dthps_nprovs) {
		tmp_maxprovs = help->dthps_maxprovs;
		tmp_provs = help->dthps_provs;

		if (help->dthps_maxprovs == 0)
			help->dthps_maxprovs = 2;
		else
			help->dthps_maxprovs *= 2;
		if (help->dthps_maxprovs > dtrace_helper_providers_max)
			help->dthps_maxprovs = dtrace_helper_providers_max;

		ASSERT(tmp_maxprovs < help->dthps_maxprovs);

		help->dthps_provs = kmem_zalloc(help->dthps_maxprovs *
		    sizeof (dtrace_helper_provider_t *), KM_SLEEP);

		/* Copy the old table into the new one, then free the old. */
		if (tmp_provs != NULL) {
			bcopy(tmp_provs, help->dthps_provs, tmp_maxprovs *
			    sizeof (dtrace_helper_provider_t *));
			kmem_free(tmp_provs, tmp_maxprovs *
			    sizeof (dtrace_helper_provider_t *));
		}
	}

	help->dthps_provs[help->dthps_nprovs] = hprov;
	help->dthps_nprovs++;

	return (0);
}
15677
15678static void
15679dtrace_helper_provider_destroy(dtrace_helper_provider_t *hprov)
15680{
15681	mutex_enter(&dtrace_lock);
15682
15683	if (--hprov->dthp_ref == 0) {
15684		dof_hdr_t *dof;
15685		mutex_exit(&dtrace_lock);
15686		dof = (dof_hdr_t *)(uintptr_t)hprov->dthp_prov.dofhp_dof;
15687		dtrace_dof_destroy(dof);
15688		kmem_free(hprov, sizeof (dtrace_helper_provider_t));
15689	} else {
15690		mutex_exit(&dtrace_lock);
15691	}
15692}
15693
/*
 * Validate a DOF_SECT_PROVIDER section before it is used to create a
 * helper provider:  check the section's alignment and size, resolve and
 * sanity-check the referenced string table, probe, argument and offset
 * sections, and then verify every probe's names, offsets, is-enabled
 * offsets and argument mappings (including overflow of the index
 * arithmetic).  Returns 0 if well-formed; otherwise reports the problem
 * via dtrace_dof_error() and returns -1.
 */
static int
dtrace_helper_provider_validate(dof_hdr_t *dof, dof_sec_t *sec)
{
	uintptr_t daddr = (uintptr_t)dof;
	dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
	dof_provider_t *provider;
	dof_probe_t *probe;
	uint8_t *arg;
	char *strtab, *typestr;
	dof_stridx_t typeidx;
	size_t typesz;
	uint_t nprobes, j, k;

	ASSERT(sec->dofs_type == DOF_SECT_PROVIDER);

	if (sec->dofs_offset & (sizeof (uint_t) - 1)) {
		dtrace_dof_error(dof, "misaligned section offset");
		return (-1);
	}

	/*
	 * The section needs to be large enough to contain the DOF provider
	 * structure appropriate for the given version.
	 */
	if (sec->dofs_size <
	    ((dof->dofh_ident[DOF_ID_VERSION] == DOF_VERSION_1) ?
	    offsetof(dof_provider_t, dofpv_prenoffs) :
	    sizeof (dof_provider_t))) {
		dtrace_dof_error(dof, "provider section too small");
		return (-1);
	}

	/* Resolve the sections that the provider refers to. */
	provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
	str_sec = dtrace_dof_sect(dof, DOF_SECT_STRTAB, provider->dofpv_strtab);
	prb_sec = dtrace_dof_sect(dof, DOF_SECT_PROBES, provider->dofpv_probes);
	arg_sec = dtrace_dof_sect(dof, DOF_SECT_PRARGS, provider->dofpv_prargs);
	off_sec = dtrace_dof_sect(dof, DOF_SECT_PROFFS, provider->dofpv_proffs);

	if (str_sec == NULL || prb_sec == NULL ||
	    arg_sec == NULL || off_sec == NULL)
		return (-1);

	enoff_sec = NULL;

	/* The is-enabled offsets section only exists post-version-1 DOF. */
	if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
	    provider->dofpv_prenoffs != DOF_SECT_NONE &&
	    (enoff_sec = dtrace_dof_sect(dof, DOF_SECT_PRENOFFS,
	    provider->dofpv_prenoffs)) == NULL)
		return (-1);

	strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);

	if (provider->dofpv_name >= str_sec->dofs_size ||
	    strlen(strtab + provider->dofpv_name) >= DTRACE_PROVNAMELEN) {
		dtrace_dof_error(dof, "invalid provider name");
		return (-1);
	}

	if (prb_sec->dofs_entsize == 0 ||
	    prb_sec->dofs_entsize > prb_sec->dofs_size) {
		dtrace_dof_error(dof, "invalid entry size");
		return (-1);
	}

	if (prb_sec->dofs_entsize & (sizeof (uintptr_t) - 1)) {
		dtrace_dof_error(dof, "misaligned entry size");
		return (-1);
	}

	if (off_sec->dofs_entsize != sizeof (uint32_t)) {
		dtrace_dof_error(dof, "invalid entry size");
		return (-1);
	}

	if (off_sec->dofs_offset & (sizeof (uint32_t) - 1)) {
		dtrace_dof_error(dof, "misaligned section offset");
		return (-1);
	}

	if (arg_sec->dofs_entsize != sizeof (uint8_t)) {
		dtrace_dof_error(dof, "invalid entry size");
		return (-1);
	}

	arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);

	nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;

	/*
	 * Take a pass through the probes to check for errors.
	 */
	for (j = 0; j < nprobes; j++) {
		probe = (dof_probe_t *)(uintptr_t)(daddr +
		    prb_sec->dofs_offset + j * prb_sec->dofs_entsize);

		if (probe->dofpr_func >= str_sec->dofs_size) {
			dtrace_dof_error(dof, "invalid function name");
			return (-1);
		}

		if (strlen(strtab + probe->dofpr_func) >= DTRACE_FUNCNAMELEN) {
			dtrace_dof_error(dof, "function name too long");
			return (-1);
		}

		if (probe->dofpr_name >= str_sec->dofs_size ||
		    strlen(strtab + probe->dofpr_name) >= DTRACE_NAMELEN) {
			dtrace_dof_error(dof, "invalid probe name");
			return (-1);
		}

		/*
		 * The offset count must not wrap the index, and the offsets
		 * must also not overflow the section's data.
		 */
		if (probe->dofpr_offidx + probe->dofpr_noffs <
		    probe->dofpr_offidx ||
		    (probe->dofpr_offidx + probe->dofpr_noffs) *
		    off_sec->dofs_entsize > off_sec->dofs_size) {
			dtrace_dof_error(dof, "invalid probe offset");
			return (-1);
		}

		if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1) {
			/*
			 * If there's no is-enabled offset section, make sure
			 * there aren't any is-enabled offsets. Otherwise
			 * perform the same checks as for probe offsets
			 * (immediately above).
			 */
			if (enoff_sec == NULL) {
				if (probe->dofpr_enoffidx != 0 ||
				    probe->dofpr_nenoffs != 0) {
					dtrace_dof_error(dof, "is-enabled "
					    "offsets with null section");
					return (-1);
				}
			} else if (probe->dofpr_enoffidx +
			    probe->dofpr_nenoffs < probe->dofpr_enoffidx ||
			    (probe->dofpr_enoffidx + probe->dofpr_nenoffs) *
			    enoff_sec->dofs_entsize > enoff_sec->dofs_size) {
				dtrace_dof_error(dof, "invalid is-enabled "
				    "offset");
				return (-1);
			}

			if (probe->dofpr_noffs + probe->dofpr_nenoffs == 0) {
				dtrace_dof_error(dof, "zero probe and "
				    "is-enabled offsets");
				return (-1);
			}
		} else if (probe->dofpr_noffs == 0) {
			dtrace_dof_error(dof, "zero probe offsets");
			return (-1);
		}

		/* Argument indices get the same wrap/overflow treatment. */
		if (probe->dofpr_argidx + probe->dofpr_xargc <
		    probe->dofpr_argidx ||
		    (probe->dofpr_argidx + probe->dofpr_xargc) *
		    arg_sec->dofs_entsize > arg_sec->dofs_size) {
			dtrace_dof_error(dof, "invalid args");
			return (-1);
		}

		/* Walk the native argument type strings, bounds-checking. */
		typeidx = probe->dofpr_nargv;
		typestr = strtab + probe->dofpr_nargv;
		for (k = 0; k < probe->dofpr_nargc; k++) {
			if (typeidx >= str_sec->dofs_size) {
				dtrace_dof_error(dof, "bad "
				    "native argument type");
				return (-1);
			}

			typesz = strlen(typestr) + 1;
			if (typesz > DTRACE_ARGTYPELEN) {
				dtrace_dof_error(dof, "native "
				    "argument type too long");
				return (-1);
			}
			typeidx += typesz;
			typestr += typesz;
		}

		/* Likewise for the translated argument type strings. */
		typeidx = probe->dofpr_xargv;
		typestr = strtab + probe->dofpr_xargv;
		for (k = 0; k < probe->dofpr_xargc; k++) {
			if (arg[probe->dofpr_argidx + k] > probe->dofpr_nargc) {
				dtrace_dof_error(dof, "bad "
				    "native argument index");
				return (-1);
			}

			if (typeidx >= str_sec->dofs_size) {
				dtrace_dof_error(dof, "bad "
				    "translated argument type");
				return (-1);
			}

			typesz = strlen(typestr) + 1;
			if (typesz > DTRACE_ARGTYPELEN) {
				dtrace_dof_error(dof, "translated argument "
				    "type too long");
				return (-1);
			}

			typeidx += typesz;
			typestr += typesz;
		}
	}

	return (0);
}
15906
/*
 * Consume a DOF image containing helper actions and (optionally, when
 * 'dhp' is non-NULL) helper provider descriptions for the current
 * process.  Ownership of 'dof' passes to this function:  it is destroyed
 * here unless a helper provider retains it.  Returns the helper
 * generation number on success, a positive errno if the DOF cannot be
 * slurped, or -1 on provider validation or helper-addition failure.
 * dtrace_lock must be held on entry; it is dropped and reacquired around
 * provider registration.
 */
static int
dtrace_helper_slurp(dof_hdr_t *dof, dof_helper_t *dhp)
{
	dtrace_helpers_t *help;
	dtrace_vstate_t *vstate;
	dtrace_enabling_t *enab = NULL;
	int i, gen, rv, nhelpers = 0, nprovs = 0, destroy = 1;
	uintptr_t daddr = (uintptr_t)dof;

	ASSERT(MUTEX_HELD(&dtrace_lock));

	/* Create the per-process helper state on first use. */
	if ((help = curproc->p_dtrace_helpers) == NULL)
		help = dtrace_helpers_create(curproc);

	vstate = &help->dthps_vstate;

	if ((rv = dtrace_dof_slurp(dof, vstate, NULL, &enab,
	    dhp != NULL ? dhp->dofhp_addr : 0, B_FALSE)) != 0) {
		dtrace_dof_destroy(dof);
		return (rv);
	}

	/*
	 * Look for helper providers and validate their descriptions.
	 */
	if (dhp != NULL) {
		for (i = 0; i < dof->dofh_secnum; i++) {
			dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
			    dof->dofh_secoff + i * dof->dofh_secsize);

			if (sec->dofs_type != DOF_SECT_PROVIDER)
				continue;

			if (dtrace_helper_provider_validate(dof, sec) != 0) {
				dtrace_enabling_destroy(enab);
				dtrace_dof_destroy(dof);
				return (-1);
			}

			nprovs++;
		}
	}

	/*
	 * Now we need to walk through the ECB descriptions in the enabling.
	 */
	for (i = 0; i < enab->dten_ndesc; i++) {
		dtrace_ecbdesc_t *ep = enab->dten_desc[i];
		dtrace_probedesc_t *desc = &ep->dted_probe;

		/* Only dtrace:helper:ustack: descriptions are accepted. */
		if (strcmp(desc->dtpd_provider, "dtrace") != 0)
			continue;

		if (strcmp(desc->dtpd_mod, "helper") != 0)
			continue;

		if (strcmp(desc->dtpd_func, "ustack") != 0)
			continue;

		if ((rv = dtrace_helper_action_add(DTRACE_HELPER_ACTION_USTACK,
		    ep)) != 0) {
			/*
			 * Adding this helper action failed -- we are now going
			 * to rip out the entire generation and return failure.
			 */
			(void) dtrace_helper_destroygen(help->dthps_generation);
			dtrace_enabling_destroy(enab);
			dtrace_dof_destroy(dof);
			return (-1);
		}

		nhelpers++;
	}

	if (nhelpers < enab->dten_ndesc)
		dtrace_dof_error(dof, "unmatched helpers");

	gen = help->dthps_generation++;
	dtrace_enabling_destroy(enab);

	if (dhp != NULL && nprovs > 0) {
		/*
		 * The provider retains the DOF; register it with the meta
		 * provider (dropping dtrace_lock across the call).
		 */
		dhp->dofhp_dof = (uint64_t)(uintptr_t)dof;
		if (dtrace_helper_provider_add(dhp, gen) == 0) {
			mutex_exit(&dtrace_lock);
			dtrace_helper_provider_register(curproc, help, dhp);
			mutex_enter(&dtrace_lock);

			destroy = 0;
		}
	}

	if (destroy)
		dtrace_dof_destroy(dof);

	return (gen);
}
16003
16004static dtrace_helpers_t *
16005dtrace_helpers_create(proc_t *p)
16006{
16007	dtrace_helpers_t *help;
16008
16009	ASSERT(MUTEX_HELD(&dtrace_lock));
16010	ASSERT(p->p_dtrace_helpers == NULL);
16011
16012	help = kmem_zalloc(sizeof (dtrace_helpers_t), KM_SLEEP);
16013	help->dthps_actions = kmem_zalloc(sizeof (dtrace_helper_action_t *) *
16014	    DTRACE_NHELPER_ACTIONS, KM_SLEEP);
16015
16016	p->p_dtrace_helpers = help;
16017	dtrace_helpers++;
16018
16019	return (help);
16020}
16021
#if defined(sun)
static
#endif
/*
 * Tear down all helper state for process 'p': helper actions, helper
 * providers (unregistering them from any meta provider or removing them
 * from the deferred list), the helper vstate, and the helpers structure
 * itself.
 */
void
dtrace_helpers_destroy(proc_t *p)
{
	dtrace_helpers_t *help;
	dtrace_vstate_t *vstate;
#if defined(sun)
	/*
	 * NOTE(review): on sun this local would clash with the 'p' parameter
	 * above; the upstream illumos signature presumably differs -- confirm
	 * against illumos before building with 'sun' defined.
	 */
	proc_t *p = curproc;
#endif
	int i;

	mutex_enter(&dtrace_lock);

	ASSERT(p->p_dtrace_helpers != NULL);
	ASSERT(dtrace_helpers > 0);

	help = p->p_dtrace_helpers;
	vstate = &help->dthps_vstate;

	/*
	 * We're now going to lose the help from this process.
	 */
	p->p_dtrace_helpers = NULL;
	dtrace_sync();

	/*
	 * Destroy the helper actions.
	 */
	for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
		dtrace_helper_action_t *h, *next;

		for (h = help->dthps_actions[i]; h != NULL; h = next) {
			next = h->dtha_next;
			dtrace_helper_action_destroy(h, vstate);
			/* Redundant with the loop increment; harmless. */
			h = next;
		}
	}

	mutex_exit(&dtrace_lock);

	/*
	 * Destroy the helper providers.
	 */
	if (help->dthps_maxprovs > 0) {
		mutex_enter(&dtrace_meta_lock);
		if (dtrace_meta_pid != NULL) {
			ASSERT(dtrace_deferred_pid == NULL);

			/*
			 * A meta provider is registered: unregister each
			 * helper provider from it by pid.
			 */
			for (i = 0; i < help->dthps_nprovs; i++) {
				dtrace_helper_provider_remove(
				    &help->dthps_provs[i]->dthp_prov, p->p_pid);
			}
		} else {
			mutex_enter(&dtrace_lock);
			ASSERT(help->dthps_deferred == 0 ||
			    help->dthps_next != NULL ||
			    help->dthps_prev != NULL ||
			    help == dtrace_deferred_pid);

			/*
			 * Remove the helper from the deferred list.
			 */
			if (help->dthps_next != NULL)
				help->dthps_next->dthps_prev = help->dthps_prev;
			if (help->dthps_prev != NULL)
				help->dthps_prev->dthps_next = help->dthps_next;
			if (dtrace_deferred_pid == help) {
				dtrace_deferred_pid = help->dthps_next;
				ASSERT(help->dthps_prev == NULL);
			}

			mutex_exit(&dtrace_lock);
		}

		mutex_exit(&dtrace_meta_lock);

		for (i = 0; i < help->dthps_nprovs; i++) {
			dtrace_helper_provider_destroy(help->dthps_provs[i]);
		}

		kmem_free(help->dthps_provs, help->dthps_maxprovs *
		    sizeof (dtrace_helper_provider_t *));
	}

	mutex_enter(&dtrace_lock);

	dtrace_vstate_fini(&help->dthps_vstate);
	kmem_free(help->dthps_actions,
	    sizeof (dtrace_helper_action_t *) * DTRACE_NHELPER_ACTIONS);
	kmem_free(help, sizeof (dtrace_helpers_t));

	--dtrace_helpers;
	mutex_exit(&dtrace_lock);
}
16118
#if defined(sun)
static
#endif
/*
 * Duplicate the helper state of process 'from' into (fresh helper state
 * for) process 'to': helper actions and their DIFOs are deep-copied into
 * the new vstate; helper providers are shared by reference (dthp_ref is
 * bumped) and then registered for 'to'.  Used on fork.
 */
void
dtrace_helpers_duplicate(proc_t *from, proc_t *to)
{
	dtrace_helpers_t *help, *newhelp;
	dtrace_helper_action_t *helper, *new, *last;
	dtrace_difo_t *dp;
	dtrace_vstate_t *vstate;
	int i, j, sz, hasprovs = 0;

	mutex_enter(&dtrace_lock);
	ASSERT(from->p_dtrace_helpers != NULL);
	ASSERT(dtrace_helpers > 0);

	help = from->p_dtrace_helpers;
	newhelp = dtrace_helpers_create(to);
	ASSERT(to->p_dtrace_helpers != NULL);

	newhelp->dthps_generation = help->dthps_generation;
	vstate = &newhelp->dthps_vstate;

	/*
	 * Duplicate the helper actions.
	 */
	for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
		if ((helper = help->dthps_actions[i]) == NULL)
			continue;

		for (last = NULL; helper != NULL; helper = helper->dtha_next) {
			new = kmem_zalloc(sizeof (dtrace_helper_action_t),
			    KM_SLEEP);
			new->dtha_generation = helper->dtha_generation;

			/* Deep-copy the predicate DIFO, if any. */
			if ((dp = helper->dtha_predicate) != NULL) {
				dp = dtrace_difo_duplicate(dp, vstate);
				new->dtha_predicate = dp;
			}

			new->dtha_nactions = helper->dtha_nactions;
			sz = sizeof (dtrace_difo_t *) * new->dtha_nactions;
			new->dtha_actions = kmem_alloc(sz, KM_SLEEP);

			/* Deep-copy each action DIFO. */
			for (j = 0; j < new->dtha_nactions; j++) {
				dtrace_difo_t *dp = helper->dtha_actions[j];

				ASSERT(dp != NULL);
				dp = dtrace_difo_duplicate(dp, vstate);
				new->dtha_actions[j] = dp;
			}

			/* Append to the new list, preserving order. */
			if (last != NULL) {
				last->dtha_next = new;
			} else {
				newhelp->dthps_actions[i] = new;
			}

			last = new;
		}
	}

	/*
	 * Duplicate the helper providers and register them with the
	 * DTrace framework.
	 */
	if (help->dthps_nprovs > 0) {
		newhelp->dthps_nprovs = help->dthps_nprovs;
		newhelp->dthps_maxprovs = help->dthps_nprovs;
		newhelp->dthps_provs = kmem_alloc(newhelp->dthps_nprovs *
		    sizeof (dtrace_helper_provider_t *), KM_SLEEP);
		for (i = 0; i < newhelp->dthps_nprovs; i++) {
			newhelp->dthps_provs[i] = help->dthps_provs[i];
			newhelp->dthps_provs[i]->dthp_ref++;
		}

		hasprovs = 1;
	}

	mutex_exit(&dtrace_lock);

	/* Registration is done after dropping dtrace_lock. */
	if (hasprovs)
		dtrace_helper_provider_register(to, newhelp, NULL);
}
16203
16204/*
16205 * DTrace Hook Functions
16206 */
/*
 * Module-load hook: ask every registered provider to provide its probes
 * for the newly loaded module, then (if there are retained enablings)
 * dispatch a task to match them against the new probes.
 */
static void
dtrace_module_loaded(modctl_t *ctl)
{
	dtrace_provider_t *prv;

	mutex_enter(&dtrace_provider_lock);
#if defined(sun)
	mutex_enter(&mod_lock);
#endif

#if defined(sun)
	ASSERT(ctl->mod_busy);
#endif

	/*
	 * We're going to call each provider's per-module provide operation
	 * specifying only this module.
	 */
	for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next)
		prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);

#if defined(sun)
	mutex_exit(&mod_lock);
#endif
	mutex_exit(&dtrace_provider_lock);

	/*
	 * If we have any retained enablings, we need to match against them.
	 * Enabling probes requires that cpu_lock be held, and we cannot hold
	 * cpu_lock here -- it is legal for cpu_lock to be held when loading a
	 * module.  (In particular, this happens when loading scheduling
	 * classes.)  So if we have any retained enablings, we need to dispatch
	 * our task queue to do the match for us.
	 */
	mutex_enter(&dtrace_lock);

	if (dtrace_retained == NULL) {
		mutex_exit(&dtrace_lock);
		return;
	}

	(void) taskq_dispatch(dtrace_taskq,
	    (task_func_t *)dtrace_enabling_matchall, NULL, TQ_SLEEP);

	mutex_exit(&dtrace_lock);

	/*
	 * And now, for a little heuristic sleaze:  in general, we want to
	 * match modules as soon as they load.  However, we cannot guarantee
	 * this, because it would lead us to the lock ordering violation
	 * outlined above.  The common case, of course, is that cpu_lock is
	 * _not_ held -- so we delay here for a clock tick, hoping that that's
	 * long enough for the task queue to do its work.  If it's not, it's
	 * not a serious problem -- it just means that the module that we
	 * just loaded may not be immediately instrumentable.
	 */
	delay(1);
}
16265
/*
 * Module-unload hook: remove and destroy every probe belonging to the
 * module being unloaded.  On FreeBSD the unload is vetoed (via *error)
 * when the module still has enabled probes.
 */
static void
#if defined(sun)
dtrace_module_unloaded(modctl_t *ctl)
#else
dtrace_module_unloaded(modctl_t *ctl, int *error)
#endif
{
	dtrace_probe_t template, *probe, *first, *next;
	dtrace_provider_t *prov;
#if !defined(sun)
	char modname[DTRACE_MODNAMELEN];
	size_t len;
#endif

#if defined(sun)
	template.dtpr_mod = ctl->mod_modname;
#else
	/* Handle the fact that ctl->filename may end in ".ko". */
	strlcpy(modname, ctl->filename, sizeof(modname));
	len = strlen(ctl->filename);
	if (len > 3 && strcmp(modname + len - 3, ".ko") == 0)
		modname[len - 3] = '\0';
	template.dtpr_mod = modname;
#endif

	mutex_enter(&dtrace_provider_lock);
#if defined(sun)
	mutex_enter(&mod_lock);
#endif
	mutex_enter(&dtrace_lock);

#if !defined(sun)
	if (ctl->nenabled > 0) {
		/* Don't allow unloads if a probe is enabled. */
		mutex_exit(&dtrace_provider_lock);
		mutex_exit(&dtrace_lock);
		*error = -1;
		printf(
	"kldunload: attempt to unload module that has DTrace probes enabled\n");
		return;
	}
#endif

	if (dtrace_bymod == NULL) {
		/*
		 * The DTrace module is loaded (obviously) but not attached;
		 * we don't have any work to do.
		 */
		mutex_exit(&dtrace_provider_lock);
#if defined(sun)
		mutex_exit(&mod_lock);
#endif
		mutex_exit(&dtrace_lock);
		return;
	}

	/*
	 * First pass: if any of the module's probes is enabled, bail out
	 * (with a warning) rather than tearing it down.
	 */
	for (probe = first = dtrace_hash_lookup(dtrace_bymod, &template);
	    probe != NULL; probe = probe->dtpr_nextmod) {
		if (probe->dtpr_ecb != NULL) {
			mutex_exit(&dtrace_provider_lock);
#if defined(sun)
			mutex_exit(&mod_lock);
#endif
			mutex_exit(&dtrace_lock);

			/*
			 * This shouldn't _actually_ be possible -- we're
			 * unloading a module that has an enabled probe in it.
			 * (It's normally up to the provider to make sure that
			 * this can't happen.)  However, because dtps_enable()
			 * doesn't have a failure mode, there can be an
			 * enable/unload race.  Upshot:  we don't want to
			 * assert, but we're not going to disable the
			 * probe, either.
			 */
			if (dtrace_err_verbose) {
#if defined(sun)
				cmn_err(CE_WARN, "unloaded module '%s' had "
				    "enabled probes", ctl->mod_modname);
#else
				cmn_err(CE_WARN, "unloaded module '%s' had "
				    "enabled probes", modname);
#endif
			}

			return;
		}
	}

	probe = first;

	/*
	 * Second pass: unlink each probe from the probe array and the
	 * hash tables, collecting them on a private list rooted at 'first'.
	 */
	for (first = NULL; probe != NULL; probe = next) {
		ASSERT(dtrace_probes[probe->dtpr_id - 1] == probe);

		dtrace_probes[probe->dtpr_id - 1] = NULL;

		next = probe->dtpr_nextmod;
		dtrace_hash_remove(dtrace_bymod, probe);
		dtrace_hash_remove(dtrace_byfunc, probe);
		dtrace_hash_remove(dtrace_byname, probe);

		if (first == NULL) {
			first = probe;
			probe->dtpr_nextmod = NULL;
		} else {
			probe->dtpr_nextmod = first;
			first = probe;
		}
	}

	/*
	 * We've removed all of the module's probes from the hash chains and
	 * from the probe array.  Now issue a dtrace_sync() to be sure that
	 * everyone has cleared out from any probe array processing.
	 */
	dtrace_sync();

	/* Third pass: let the providers destroy the probes and free them. */
	for (probe = first; probe != NULL; probe = first) {
		first = probe->dtpr_nextmod;
		prov = probe->dtpr_provider;
		prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, probe->dtpr_id,
		    probe->dtpr_arg);
		kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
		kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
		kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
#if defined(sun)
		vmem_free(dtrace_arena, (void *)(uintptr_t)probe->dtpr_id, 1);
#else
		free_unr(dtrace_arena, probe->dtpr_id);
#endif
		kmem_free(probe, sizeof (dtrace_probe_t));
	}

	mutex_exit(&dtrace_lock);
#if defined(sun)
	mutex_exit(&mod_lock);
#endif
	mutex_exit(&dtrace_provider_lock);
}
16405
16406#if !defined(sun)
/*
 * FreeBSD linker event handler for kld loads: forward to the common
 * module-loaded hook.
 */
static void
dtrace_kld_load(void *arg __unused, linker_file_t lf)
{

	dtrace_module_loaded(lf);
}
16413
16414static void
16415dtrace_kld_unload_try(void *arg __unused, linker_file_t lf, int *error)
16416{
16417
16418	if (*error != 0)
16419		/* We already have an error, so don't do anything. */
16420		return;
16421	dtrace_module_unloaded(lf, error);
16422}
16423#endif
16424
16425#if defined(sun)
/*
 * Invoke each provider's dtps_suspend entry point for every probe
 * (used around CPU start and debugger entry).
 */
static void
dtrace_suspend(void)
{
	dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_suspend));
}
16431
/*
 * Invoke each provider's dtps_resume entry point for every probe
 * (the counterpart to dtrace_suspend()).
 */
static void
dtrace_resume(void)
{
	dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_resume));
}
16437#endif
16438
/*
 * CPU configuration callback.  On CPU_CONFIG, allocate buffers on the new
 * CPU for any active anonymous state whose CPU option covers it; buffers
 * are deliberately not freed on CPU_UNCONFIG.  Always returns 0.
 */
static int
dtrace_cpu_setup(cpu_setup_t what, processorid_t cpu)
{
	ASSERT(MUTEX_HELD(&cpu_lock));
	mutex_enter(&dtrace_lock);

	switch (what) {
	case CPU_CONFIG: {
		dtrace_state_t *state;
		dtrace_optval_t *opt, rs, c;

		/*
		 * For now, we only allocate a new buffer for anonymous state.
		 */
		if ((state = dtrace_anon.dta_state) == NULL)
			break;

		if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
			break;

		opt = state->dts_options;
		c = opt[DTRACEOPT_CPU];

		/* Skip if the state is bound to some other specific CPU. */
		if (c != DTRACE_CPUALL && c != DTRACEOPT_UNSET && c != cpu)
			break;

		/*
		 * Regardless of what the actual policy is, we're going to
		 * temporarily set our resize policy to be manual.  We're
		 * also going to temporarily set our CPU option to denote
		 * the newly configured CPU.
		 */
		rs = opt[DTRACEOPT_BUFRESIZE];
		opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_MANUAL;
		opt[DTRACEOPT_CPU] = (dtrace_optval_t)cpu;

		(void) dtrace_state_buffers(state);

		/* Restore the saved options. */
		opt[DTRACEOPT_BUFRESIZE] = rs;
		opt[DTRACEOPT_CPU] = c;

		break;
	}

	case CPU_UNCONFIG:
		/*
		 * We don't free the buffer in the CPU_UNCONFIG case.  (The
		 * buffer will be freed when the consumer exits.)
		 */
		break;

	default:
		break;
	}

	mutex_exit(&dtrace_lock);
	return (0);
}
16497
16498#if defined(sun)
/*
 * Wrapper used as the dtrace_cpu_init hook: treat an initial CPU as a
 * newly configured one.
 */
static void
dtrace_cpu_setup_initial(processorid_t cpu)
{
	(void) dtrace_cpu_setup(CPU_CONFIG, cpu);
}
16504#endif
16505
/*
 * Append the address range [base, limit) to the global toxic-range table,
 * growing the table (by doubling, starting at one entry) as needed.
 */
static void
dtrace_toxrange_add(uintptr_t base, uintptr_t limit)
{
	if (dtrace_toxranges >= dtrace_toxranges_max) {
		int osize, nsize;
		dtrace_toxrange_t *range;

		osize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);

		if (osize == 0) {
			ASSERT(dtrace_toxrange == NULL);
			ASSERT(dtrace_toxranges_max == 0);
			dtrace_toxranges_max = 1;
		} else {
			dtrace_toxranges_max <<= 1;
		}

		nsize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
		range = kmem_zalloc(nsize, KM_SLEEP);

		/* Copy the old table over and release it. */
		if (dtrace_toxrange != NULL) {
			ASSERT(osize != 0);
			bcopy(dtrace_toxrange, range, osize);
			kmem_free(dtrace_toxrange, osize);
		}

		dtrace_toxrange = range;
	}

	/* The slot must be vacant (the new table was zero-filled). */
	ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_base == 0);
	ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_limit == 0);

	dtrace_toxrange[dtrace_toxranges].dtt_base = base;
	dtrace_toxrange[dtrace_toxranges].dtt_limit = limit;
	dtrace_toxranges++;
}
16542
/*
 * Synchronization barrier used around getf()/closef() interactions.
 * On FreeBSD (!sun) this is currently a no-op.
 */
static void
dtrace_getf_barrier(void)	/* (void): proper prototype per style(9) */
{
#if defined(sun)
	/*
	 * When we have unprivileged (that is, non-DTRACE_CRV_KERNEL) enablings
	 * that contain calls to getf(), this routine will be called on every
	 * closef() before either the underlying vnode is released or the
	 * file_t itself is freed.  By the time we are here, it is essential
	 * that the file_t can no longer be accessed from a call to getf()
	 * in probe context -- that assures that a dtrace_sync() can be used
	 * to clear out any enablings referring to the old structures.
	 */
	if (curthread->t_procp->p_zone->zone_dtrace_getf != 0 ||
	    kcred->cr_zone->zone_dtrace_getf != 0)
		dtrace_sync();
#endif
}
16561
16562/*
16563 * DTrace Driver Cookbook Functions
16564 */
16565#if defined(sun)
/*ARGSUSED*/
/*
 * Driver attach entry point (illumos only): initialize soft state and the
 * /dev minor nodes, install the DTrace hook functions, create the global
 * arenas, hash tables and taskq, register the "dtrace" pseudo provider
 * and its BEGIN/END/ERROR probes, and finally process any anonymous
 * enabling supplied via driver properties.
 */
static int
dtrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
{
	dtrace_provider_id_t id;
	dtrace_state_t *state = NULL;
	dtrace_enabling_t *enab;

	mutex_enter(&cpu_lock);
	mutex_enter(&dtrace_provider_lock);
	mutex_enter(&dtrace_lock);

	if (ddi_soft_state_init(&dtrace_softstate,
	    sizeof (dtrace_state_t), 0) != 0) {
		cmn_err(CE_NOTE, "/dev/dtrace failed to initialize soft state");
		mutex_exit(&cpu_lock);
		mutex_exit(&dtrace_provider_lock);
		mutex_exit(&dtrace_lock);
		return (DDI_FAILURE);
	}

	if (ddi_create_minor_node(devi, DTRACEMNR_DTRACE, S_IFCHR,
	    DTRACEMNRN_DTRACE, DDI_PSEUDO, NULL) == DDI_FAILURE ||
	    ddi_create_minor_node(devi, DTRACEMNR_HELPER, S_IFCHR,
	    DTRACEMNRN_HELPER, DDI_PSEUDO, NULL) == DDI_FAILURE) {
		cmn_err(CE_NOTE, "/dev/dtrace couldn't create minor nodes");
		ddi_remove_minor_node(devi, NULL);
		ddi_soft_state_fini(&dtrace_softstate);
		mutex_exit(&cpu_lock);
		mutex_exit(&dtrace_provider_lock);
		mutex_exit(&dtrace_lock);
		return (DDI_FAILURE);
	}

	ddi_report_dev(devi);
	dtrace_devi = devi;

	/* Install the framework hook functions. */
	dtrace_modload = dtrace_module_loaded;
	dtrace_modunload = dtrace_module_unloaded;
	dtrace_cpu_init = dtrace_cpu_setup_initial;
	dtrace_helpers_cleanup = dtrace_helpers_destroy;
	dtrace_helpers_fork = dtrace_helpers_duplicate;
	dtrace_cpustart_init = dtrace_suspend;
	dtrace_cpustart_fini = dtrace_resume;
	dtrace_debugger_init = dtrace_suspend;
	dtrace_debugger_fini = dtrace_resume;

	register_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);

	ASSERT(MUTEX_HELD(&cpu_lock));

	/* Identifier arenas for probe IDs and clone minors. */
	dtrace_arena = vmem_create("dtrace", (void *)1, UINT32_MAX, 1,
	    NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
	dtrace_minor = vmem_create("dtrace_minor", (void *)DTRACEMNRN_CLONE,
	    UINT32_MAX - DTRACEMNRN_CLONE, 1, NULL, NULL, NULL, 0,
	    VM_SLEEP | VMC_IDENTIFIER);
	dtrace_taskq = taskq_create("dtrace_taskq", 1, maxclsyspri,
	    1, INT_MAX, 0);

	dtrace_state_cache = kmem_cache_create("dtrace_state_cache",
	    sizeof (dtrace_dstate_percpu_t) * NCPU, DTRACE_STATE_ALIGN,
	    NULL, NULL, NULL, NULL, NULL, 0);

	ASSERT(MUTEX_HELD(&cpu_lock));
	/* Probe lookup hashes by module, function and name. */
	dtrace_bymod = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_mod),
	    offsetof(dtrace_probe_t, dtpr_nextmod),
	    offsetof(dtrace_probe_t, dtpr_prevmod));

	dtrace_byfunc = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_func),
	    offsetof(dtrace_probe_t, dtpr_nextfunc),
	    offsetof(dtrace_probe_t, dtpr_prevfunc));

	dtrace_byname = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_name),
	    offsetof(dtrace_probe_t, dtpr_nextname),
	    offsetof(dtrace_probe_t, dtpr_prevname));

	if (dtrace_retain_max < 1) {
		cmn_err(CE_WARN, "illegal value (%lu) for dtrace_retain_max; "
		    "setting to 1", dtrace_retain_max);
		dtrace_retain_max = 1;
	}

	/*
	 * Now discover our toxic ranges.
	 */
	dtrace_toxic_ranges(dtrace_toxrange_add);

	/*
	 * Before we register ourselves as a provider to our own framework,
	 * we would like to assert that dtrace_provider is NULL -- but that's
	 * not true if we were loaded as a dependency of a DTrace provider.
	 * Once we've registered, we can assert that dtrace_provider is our
	 * pseudo provider.
	 */
	(void) dtrace_register("dtrace", &dtrace_provider_attr,
	    DTRACE_PRIV_NONE, 0, &dtrace_provider_ops, NULL, &id);

	ASSERT(dtrace_provider != NULL);
	ASSERT((dtrace_provider_id_t)dtrace_provider == id);

	/* The framework's own BEGIN, END and ERROR probes. */
	dtrace_probeid_begin = dtrace_probe_create((dtrace_provider_id_t)
	    dtrace_provider, NULL, NULL, "BEGIN", 0, NULL);
	dtrace_probeid_end = dtrace_probe_create((dtrace_provider_id_t)
	    dtrace_provider, NULL, NULL, "END", 0, NULL);
	dtrace_probeid_error = dtrace_probe_create((dtrace_provider_id_t)
	    dtrace_provider, NULL, NULL, "ERROR", 1, NULL);

	dtrace_anon_property();
	mutex_exit(&cpu_lock);

	/*
	 * If DTrace helper tracing is enabled, we need to allocate the
	 * trace buffer and initialize the values.
	 */
	if (dtrace_helptrace_enabled) {
		ASSERT(dtrace_helptrace_buffer == NULL);
		dtrace_helptrace_buffer =
		    kmem_zalloc(dtrace_helptrace_bufsize, KM_SLEEP);
		dtrace_helptrace_next = 0;
	}

	/*
	 * If there are already providers, we must ask them to provide their
	 * probes, and then match any anonymous enabling against them.  Note
	 * that there should be no other retained enablings at this time:
	 * the only retained enablings at this time should be the anonymous
	 * enabling.
	 */
	if (dtrace_anon.dta_enabling != NULL) {
		ASSERT(dtrace_retained == dtrace_anon.dta_enabling);

		dtrace_enabling_provide(NULL);
		state = dtrace_anon.dta_state;

		/*
		 * We couldn't hold cpu_lock across the above call to
		 * dtrace_enabling_provide(), but we must hold it to actually
		 * enable the probes.  We have to drop all of our locks, pick
		 * up cpu_lock, and regain our locks before matching the
		 * retained anonymous enabling.
		 */
		mutex_exit(&dtrace_lock);
		mutex_exit(&dtrace_provider_lock);

		mutex_enter(&cpu_lock);
		mutex_enter(&dtrace_provider_lock);
		mutex_enter(&dtrace_lock);

		if ((enab = dtrace_anon.dta_enabling) != NULL)
			(void) dtrace_enabling_match(enab, NULL);

		mutex_exit(&cpu_lock);
	}

	mutex_exit(&dtrace_lock);
	mutex_exit(&dtrace_provider_lock);

	if (state != NULL) {
		/*
		 * If we created any anonymous state, set it going now.
		 */
		(void) dtrace_state_go(state, &dtrace_anon.dta_beganon);
	}

	return (DDI_SUCCESS);
}
16732#endif
16733
16734#if !defined(sun)
16735static void dtrace_dtr(void *);
16736#endif
16737
/*ARGSUSED*/
/*
 * Open entry point for the dtrace device.  Checks the caller's DTrace
 * privileges, asks all providers to provide their probes, and creates a
 * new consumer state.  On FreeBSD the state is attached to the cdev as
 * per-open private data with dtrace_dtr() as its destructor.
 *
 * NOTE(review): the #if/#else braiding below is delicate -- the trailing
 * "return (EACCES); }" after the #endif pairs with the "if (priv == ...)"
 * opened inside the !sun branch.  Under sun that brace would be
 * unbalanced; confirm against upstream before building with 'sun'.
 */
static int
#if defined(sun)
dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
#else
dtrace_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
#endif
{
	dtrace_state_t *state;
	uint32_t priv;
	uid_t uid;
	zoneid_t zoneid;

#if defined(sun)
	if (getminor(*devp) == DTRACEMNRN_HELPER)
		return (0);

	/*
	 * If this wasn't an open with the "helper" minor, then it must be
	 * the "dtrace" minor.
	 */
	if (getminor(*devp) == DTRACEMNRN_DTRACE)
		return (ENXIO);
#else
	cred_t *cred_p = NULL;
	cred_p = dev->si_cred;

	/*
	 * If no DTRACE_PRIV_* bits are set in the credential, then the
	 * caller lacks sufficient permission to do anything with DTrace.
	 */
	dtrace_cred2priv(cred_p, &priv, &uid, &zoneid);
	if (priv == DTRACE_PRIV_NONE) {
#endif

		return (EACCES);
	}

	/*
	 * Ask all providers to provide all their probes.
	 */
	mutex_enter(&dtrace_provider_lock);
	dtrace_probe_provide(NULL, NULL);
	mutex_exit(&dtrace_provider_lock);

	mutex_enter(&cpu_lock);
	mutex_enter(&dtrace_lock);
	dtrace_opens++;
	dtrace_membar_producer();

#if defined(sun)
	/*
	 * If the kernel debugger is active (that is, if the kernel debugger
	 * modified text in some way), we won't allow the open.
	 */
	if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
		dtrace_opens--;
		mutex_exit(&cpu_lock);
		mutex_exit(&dtrace_lock);
		return (EBUSY);
	}

	state = dtrace_state_create(devp, cred_p);
#else
	state = dtrace_state_create(dev);
	devfs_set_cdevpriv(state, dtrace_dtr);
#endif

	mutex_exit(&cpu_lock);

	if (state == NULL) {
		/* State creation failed: undo the open count. */
#if defined(sun)
		if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL)
			(void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
#else
		--dtrace_opens;
#endif
		mutex_exit(&dtrace_lock);
		return (EAGAIN);
	}

	mutex_exit(&dtrace_lock);

	return (0);
}
16823
/*ARGSUSED*/
/*
 * Consumer teardown: dtrace_close() on illumos, or the cdevpriv destructor
 * dtrace_dtr() on FreeBSD.  Destroys the consumer state (and any anonymous
 * state it grabbed) and drops the open count.
 *
 * NOTE(review): this function mixes "#ifdef illumos" with the file's usual
 * "#if defined(sun)" guards; on this FreeBSD branch neither is defined, so
 * the #else arms are the live code -- but the inconsistency is worth
 * unifying.
 */
#if defined(sun)
static int
dtrace_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
#else
static void
dtrace_dtr(void *data)
#endif
{
#if defined(sun)
	minor_t minor = getminor(dev);
	dtrace_state_t *state;

	if (minor == DTRACEMNRN_HELPER)
		return (0);

	state = ddi_get_soft_state(dtrace_softstate, minor);
#else
	dtrace_state_t *state = data;
#endif

	mutex_enter(&cpu_lock);
	mutex_enter(&dtrace_lock);

#ifdef illumos
	if (state->dts_anon)
#else
	if (state != NULL && state->dts_anon)
#endif
	{
		/*
		 * There is anonymous state. Destroy that first.
		 */
		ASSERT(dtrace_anon.dta_state == NULL);
		dtrace_state_destroy(state->dts_anon);
	}

#ifdef illumos
	dtrace_state_destroy(state);
#else
	if (state != NULL) {
		dtrace_state_destroy(state);
		/*
		 * NOTE(review): size 0 here presumably relies on the FreeBSD
		 * kmem_free() shim ignoring the size argument -- confirm.
		 */
		kmem_free(state, 0);
	}
#endif
	ASSERT(dtrace_opens > 0);

#if defined(sun)
	/*
	 * Only relinquish control of the kernel debugger interface when there
	 * are no consumers and no anonymous enablings.
	 */
	if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL)
		(void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
#else
	--dtrace_opens;
#endif

	mutex_exit(&dtrace_lock);
	mutex_exit(&cpu_lock);

#if defined(sun)
	return (0);
#endif
}
16889
16890#if defined(sun)
/*ARGSUSED*/
/*
 * ioctl handler for the "helper" minor: add a DOF (optionally wrapped in a
 * dof_helper_t for DTRACEHIOC_ADDDOF) or remove a helper generation.  On
 * successful add, *rv receives the generation returned by
 * dtrace_helper_slurp().
 */
static int
dtrace_ioctl_helper(int cmd, intptr_t arg, int *rv)
{
	int rval;
	dof_helper_t help, *dhp = NULL;

	switch (cmd) {
	case DTRACEHIOC_ADDDOF:
		if (copyin((void *)arg, &help, sizeof (help)) != 0) {
			dtrace_dof_error(NULL, "failed to copyin DOF helper");
			return (EFAULT);
		}

		/* Point 'arg' at the embedded DOF and fall into ADD. */
		dhp = &help;
		arg = (intptr_t)help.dofhp_dof;
		/*FALLTHROUGH*/

	case DTRACEHIOC_ADD: {
		dof_hdr_t *dof = dtrace_dof_copyin(arg, &rval);

		if (dof == NULL)
			return (rval);

		mutex_enter(&dtrace_lock);

		/*
		 * dtrace_helper_slurp() takes responsibility for the dof --
		 * it may free it now or it may save it and free it later.
		 */
		if ((rval = dtrace_helper_slurp(dof, dhp)) != -1) {
			*rv = rval;
			rval = 0;
		} else {
			rval = EINVAL;
		}

		mutex_exit(&dtrace_lock);
		return (rval);
	}

	case DTRACEHIOC_REMOVE: {
		mutex_enter(&dtrace_lock);
		rval = dtrace_helper_destroygen(arg);
		mutex_exit(&dtrace_lock);

		return (rval);
	}

	default:
		break;
	}

	return (ENOTTY);
}
16946
16947/*ARGSUSED*/
16948static int
16949dtrace_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv)
16950{
16951	minor_t minor = getminor(dev);
16952	dtrace_state_t *state;
16953	int rval;
16954
16955	if (minor == DTRACEMNRN_HELPER)
16956		return (dtrace_ioctl_helper(cmd, arg, rv));
16957
16958	state = ddi_get_soft_state(dtrace_softstate, minor);
16959
16960	if (state->dts_anon) {
16961		ASSERT(dtrace_anon.dta_state == NULL);
16962		state = state->dts_anon;
16963	}
16964
16965	switch (cmd) {
16966	case DTRACEIOC_PROVIDER: {
16967		dtrace_providerdesc_t pvd;
16968		dtrace_provider_t *pvp;
16969
16970		if (copyin((void *)arg, &pvd, sizeof (pvd)) != 0)
16971			return (EFAULT);
16972
16973		pvd.dtvd_name[DTRACE_PROVNAMELEN - 1] = '\0';
16974		mutex_enter(&dtrace_provider_lock);
16975
16976		for (pvp = dtrace_provider; pvp != NULL; pvp = pvp->dtpv_next) {
16977			if (strcmp(pvp->dtpv_name, pvd.dtvd_name) == 0)
16978				break;
16979		}
16980
16981		mutex_exit(&dtrace_provider_lock);
16982
16983		if (pvp == NULL)
16984			return (ESRCH);
16985
16986		bcopy(&pvp->dtpv_priv, &pvd.dtvd_priv, sizeof (dtrace_ppriv_t));
16987		bcopy(&pvp->dtpv_attr, &pvd.dtvd_attr, sizeof (dtrace_pattr_t));
16988
16989		if (copyout(&pvd, (void *)arg, sizeof (pvd)) != 0)
16990			return (EFAULT);
16991
16992		return (0);
16993	}
16994
16995	case DTRACEIOC_EPROBE: {
16996		dtrace_eprobedesc_t epdesc;
16997		dtrace_ecb_t *ecb;
16998		dtrace_action_t *act;
16999		void *buf;
17000		size_t size;
17001		uintptr_t dest;
17002		int nrecs;
17003
17004		if (copyin((void *)arg, &epdesc, sizeof (epdesc)) != 0)
17005			return (EFAULT);
17006
17007		mutex_enter(&dtrace_lock);
17008
17009		if ((ecb = dtrace_epid2ecb(state, epdesc.dtepd_epid)) == NULL) {
17010			mutex_exit(&dtrace_lock);
17011			return (EINVAL);
17012		}
17013
17014		if (ecb->dte_probe == NULL) {
17015			mutex_exit(&dtrace_lock);
17016			return (EINVAL);
17017		}
17018
17019		epdesc.dtepd_probeid = ecb->dte_probe->dtpr_id;
17020		epdesc.dtepd_uarg = ecb->dte_uarg;
17021		epdesc.dtepd_size = ecb->dte_size;
17022
17023		nrecs = epdesc.dtepd_nrecs;
17024		epdesc.dtepd_nrecs = 0;
17025		for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
17026			if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
17027				continue;
17028
17029			epdesc.dtepd_nrecs++;
17030		}
17031
17032		/*
17033		 * Now that we have the size, we need to allocate a temporary
17034		 * buffer in which to store the complete description.  We need
17035		 * the temporary buffer to be able to drop dtrace_lock()
17036		 * across the copyout(), below.
17037		 */
17038		size = sizeof (dtrace_eprobedesc_t) +
17039		    (epdesc.dtepd_nrecs * sizeof (dtrace_recdesc_t));
17040
17041		buf = kmem_alloc(size, KM_SLEEP);
17042		dest = (uintptr_t)buf;
17043
17044		bcopy(&epdesc, (void *)dest, sizeof (epdesc));
17045		dest += offsetof(dtrace_eprobedesc_t, dtepd_rec[0]);
17046
17047		for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
17048			if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
17049				continue;
17050
17051			if (nrecs-- == 0)
17052				break;
17053
17054			bcopy(&act->dta_rec, (void *)dest,
17055			    sizeof (dtrace_recdesc_t));
17056			dest += sizeof (dtrace_recdesc_t);
17057		}
17058
17059		mutex_exit(&dtrace_lock);
17060
17061		if (copyout(buf, (void *)arg, dest - (uintptr_t)buf) != 0) {
17062			kmem_free(buf, size);
17063			return (EFAULT);
17064		}
17065
17066		kmem_free(buf, size);
17067		return (0);
17068	}
17069
17070	case DTRACEIOC_AGGDESC: {
17071		dtrace_aggdesc_t aggdesc;
17072		dtrace_action_t *act;
17073		dtrace_aggregation_t *agg;
17074		int nrecs;
17075		uint32_t offs;
17076		dtrace_recdesc_t *lrec;
17077		void *buf;
17078		size_t size;
17079		uintptr_t dest;
17080
17081		if (copyin((void *)arg, &aggdesc, sizeof (aggdesc)) != 0)
17082			return (EFAULT);
17083
17084		mutex_enter(&dtrace_lock);
17085
17086		if ((agg = dtrace_aggid2agg(state, aggdesc.dtagd_id)) == NULL) {
17087			mutex_exit(&dtrace_lock);
17088			return (EINVAL);
17089		}
17090
17091		aggdesc.dtagd_epid = agg->dtag_ecb->dte_epid;
17092
17093		nrecs = aggdesc.dtagd_nrecs;
17094		aggdesc.dtagd_nrecs = 0;
17095
17096		offs = agg->dtag_base;
17097		lrec = &agg->dtag_action.dta_rec;
17098		aggdesc.dtagd_size = lrec->dtrd_offset + lrec->dtrd_size - offs;
17099
17100		for (act = agg->dtag_first; ; act = act->dta_next) {
17101			ASSERT(act->dta_intuple ||
17102			    DTRACEACT_ISAGG(act->dta_kind));
17103
17104			/*
17105			 * If this action has a record size of zero, it
17106			 * denotes an argument to the aggregating action.
17107			 * Because the presence of this record doesn't (or
17108			 * shouldn't) affect the way the data is interpreted,
17109			 * we don't copy it out to save user-level the
17110			 * confusion of dealing with a zero-length record.
17111			 */
17112			if (act->dta_rec.dtrd_size == 0) {
17113				ASSERT(agg->dtag_hasarg);
17114				continue;
17115			}
17116
17117			aggdesc.dtagd_nrecs++;
17118
17119			if (act == &agg->dtag_action)
17120				break;
17121		}
17122
17123		/*
17124		 * Now that we have the size, we need to allocate a temporary
17125		 * buffer in which to store the complete description.  We need
17126		 * the temporary buffer to be able to drop dtrace_lock()
17127		 * across the copyout(), below.
17128		 */
17129		size = sizeof (dtrace_aggdesc_t) +
17130		    (aggdesc.dtagd_nrecs * sizeof (dtrace_recdesc_t));
17131
17132		buf = kmem_alloc(size, KM_SLEEP);
17133		dest = (uintptr_t)buf;
17134
17135		bcopy(&aggdesc, (void *)dest, sizeof (aggdesc));
17136		dest += offsetof(dtrace_aggdesc_t, dtagd_rec[0]);
17137
17138		for (act = agg->dtag_first; ; act = act->dta_next) {
17139			dtrace_recdesc_t rec = act->dta_rec;
17140
17141			/*
17142			 * See the comment in the above loop for why we pass
17143			 * over zero-length records.
17144			 */
17145			if (rec.dtrd_size == 0) {
17146				ASSERT(agg->dtag_hasarg);
17147				continue;
17148			}
17149
17150			if (nrecs-- == 0)
17151				break;
17152
17153			rec.dtrd_offset -= offs;
17154			bcopy(&rec, (void *)dest, sizeof (rec));
17155			dest += sizeof (dtrace_recdesc_t);
17156
17157			if (act == &agg->dtag_action)
17158				break;
17159		}
17160
17161		mutex_exit(&dtrace_lock);
17162
17163		if (copyout(buf, (void *)arg, dest - (uintptr_t)buf) != 0) {
17164			kmem_free(buf, size);
17165			return (EFAULT);
17166		}
17167
17168		kmem_free(buf, size);
17169		return (0);
17170	}
17171
17172	case DTRACEIOC_ENABLE: {
17173		dof_hdr_t *dof;
17174		dtrace_enabling_t *enab = NULL;
17175		dtrace_vstate_t *vstate;
17176		int err = 0;
17177
17178		*rv = 0;
17179
17180		/*
17181		 * If a NULL argument has been passed, we take this as our
17182		 * cue to reevaluate our enablings.
17183		 */
17184		if (arg == NULL) {
17185			dtrace_enabling_matchall();
17186
17187			return (0);
17188		}
17189
17190		if ((dof = dtrace_dof_copyin(arg, &rval)) == NULL)
17191			return (rval);
17192
17193		mutex_enter(&cpu_lock);
17194		mutex_enter(&dtrace_lock);
17195		vstate = &state->dts_vstate;
17196
17197		if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
17198			mutex_exit(&dtrace_lock);
17199			mutex_exit(&cpu_lock);
17200			dtrace_dof_destroy(dof);
17201			return (EBUSY);
17202		}
17203
17204		if (dtrace_dof_slurp(dof, vstate, cr, &enab, 0, B_TRUE) != 0) {
17205			mutex_exit(&dtrace_lock);
17206			mutex_exit(&cpu_lock);
17207			dtrace_dof_destroy(dof);
17208			return (EINVAL);
17209		}
17210
17211		if ((rval = dtrace_dof_options(dof, state)) != 0) {
17212			dtrace_enabling_destroy(enab);
17213			mutex_exit(&dtrace_lock);
17214			mutex_exit(&cpu_lock);
17215			dtrace_dof_destroy(dof);
17216			return (rval);
17217		}
17218
17219		if ((err = dtrace_enabling_match(enab, rv)) == 0) {
17220			err = dtrace_enabling_retain(enab);
17221		} else {
17222			dtrace_enabling_destroy(enab);
17223		}
17224
17225		mutex_exit(&cpu_lock);
17226		mutex_exit(&dtrace_lock);
17227		dtrace_dof_destroy(dof);
17228
17229		return (err);
17230	}
17231
17232	case DTRACEIOC_REPLICATE: {
17233		dtrace_repldesc_t desc;
17234		dtrace_probedesc_t *match = &desc.dtrpd_match;
17235		dtrace_probedesc_t *create = &desc.dtrpd_create;
17236		int err;
17237
17238		if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
17239			return (EFAULT);
17240
17241		match->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
17242		match->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
17243		match->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
17244		match->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
17245
17246		create->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
17247		create->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
17248		create->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
17249		create->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
17250
17251		mutex_enter(&dtrace_lock);
17252		err = dtrace_enabling_replicate(state, match, create);
17253		mutex_exit(&dtrace_lock);
17254
17255		return (err);
17256	}
17257
17258	case DTRACEIOC_PROBEMATCH:
17259	case DTRACEIOC_PROBES: {
17260		dtrace_probe_t *probe = NULL;
17261		dtrace_probedesc_t desc;
17262		dtrace_probekey_t pkey;
17263		dtrace_id_t i;
17264		int m = 0;
17265		uint32_t priv;
17266		uid_t uid;
17267		zoneid_t zoneid;
17268
17269		if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
17270			return (EFAULT);
17271
17272		desc.dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
17273		desc.dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
17274		desc.dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
17275		desc.dtpd_name[DTRACE_NAMELEN - 1] = '\0';
17276
17277		/*
17278		 * Before we attempt to match this probe, we want to give
17279		 * all providers the opportunity to provide it.
17280		 */
17281		if (desc.dtpd_id == DTRACE_IDNONE) {
17282			mutex_enter(&dtrace_provider_lock);
17283			dtrace_probe_provide(&desc, NULL);
17284			mutex_exit(&dtrace_provider_lock);
17285			desc.dtpd_id++;
17286		}
17287
17288		if (cmd == DTRACEIOC_PROBEMATCH)  {
17289			dtrace_probekey(&desc, &pkey);
17290			pkey.dtpk_id = DTRACE_IDNONE;
17291		}
17292
17293		dtrace_cred2priv(cr, &priv, &uid, &zoneid);
17294
17295		mutex_enter(&dtrace_lock);
17296
17297		if (cmd == DTRACEIOC_PROBEMATCH) {
17298			for (i = desc.dtpd_id; i <= dtrace_nprobes; i++) {
17299				if ((probe = dtrace_probes[i - 1]) != NULL &&
17300				    (m = dtrace_match_probe(probe, &pkey,
17301				    priv, uid, zoneid)) != 0)
17302					break;
17303			}
17304
17305			if (m < 0) {
17306				mutex_exit(&dtrace_lock);
17307				return (EINVAL);
17308			}
17309
17310		} else {
17311			for (i = desc.dtpd_id; i <= dtrace_nprobes; i++) {
17312				if ((probe = dtrace_probes[i - 1]) != NULL &&
17313				    dtrace_match_priv(probe, priv, uid, zoneid))
17314					break;
17315			}
17316		}
17317
17318		if (probe == NULL) {
17319			mutex_exit(&dtrace_lock);
17320			return (ESRCH);
17321		}
17322
17323		dtrace_probe_description(probe, &desc);
17324		mutex_exit(&dtrace_lock);
17325
17326		if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
17327			return (EFAULT);
17328
17329		return (0);
17330	}
17331
17332	case DTRACEIOC_PROBEARG: {
17333		dtrace_argdesc_t desc;
17334		dtrace_probe_t *probe;
17335		dtrace_provider_t *prov;
17336
17337		if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
17338			return (EFAULT);
17339
17340		if (desc.dtargd_id == DTRACE_IDNONE)
17341			return (EINVAL);
17342
17343		if (desc.dtargd_ndx == DTRACE_ARGNONE)
17344			return (EINVAL);
17345
17346		mutex_enter(&dtrace_provider_lock);
17347		mutex_enter(&mod_lock);
17348		mutex_enter(&dtrace_lock);
17349
17350		if (desc.dtargd_id > dtrace_nprobes) {
17351			mutex_exit(&dtrace_lock);
17352			mutex_exit(&mod_lock);
17353			mutex_exit(&dtrace_provider_lock);
17354			return (EINVAL);
17355		}
17356
17357		if ((probe = dtrace_probes[desc.dtargd_id - 1]) == NULL) {
17358			mutex_exit(&dtrace_lock);
17359			mutex_exit(&mod_lock);
17360			mutex_exit(&dtrace_provider_lock);
17361			return (EINVAL);
17362		}
17363
17364		mutex_exit(&dtrace_lock);
17365
17366		prov = probe->dtpr_provider;
17367
17368		if (prov->dtpv_pops.dtps_getargdesc == NULL) {
17369			/*
17370			 * There isn't any typed information for this probe.
17371			 * Set the argument number to DTRACE_ARGNONE.
17372			 */
17373			desc.dtargd_ndx = DTRACE_ARGNONE;
17374		} else {
17375			desc.dtargd_native[0] = '\0';
17376			desc.dtargd_xlate[0] = '\0';
17377			desc.dtargd_mapping = desc.dtargd_ndx;
17378
17379			prov->dtpv_pops.dtps_getargdesc(prov->dtpv_arg,
17380			    probe->dtpr_id, probe->dtpr_arg, &desc);
17381		}
17382
17383		mutex_exit(&mod_lock);
17384		mutex_exit(&dtrace_provider_lock);
17385
17386		if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
17387			return (EFAULT);
17388
17389		return (0);
17390	}
17391
17392	case DTRACEIOC_GO: {
17393		processorid_t cpuid;
17394		rval = dtrace_state_go(state, &cpuid);
17395
17396		if (rval != 0)
17397			return (rval);
17398
17399		if (copyout(&cpuid, (void *)arg, sizeof (cpuid)) != 0)
17400			return (EFAULT);
17401
17402		return (0);
17403	}
17404
17405	case DTRACEIOC_STOP: {
17406		processorid_t cpuid;
17407
17408		mutex_enter(&dtrace_lock);
17409		rval = dtrace_state_stop(state, &cpuid);
17410		mutex_exit(&dtrace_lock);
17411
17412		if (rval != 0)
17413			return (rval);
17414
17415		if (copyout(&cpuid, (void *)arg, sizeof (cpuid)) != 0)
17416			return (EFAULT);
17417
17418		return (0);
17419	}
17420
17421	case DTRACEIOC_DOFGET: {
17422		dof_hdr_t hdr, *dof;
17423		uint64_t len;
17424
17425		if (copyin((void *)arg, &hdr, sizeof (hdr)) != 0)
17426			return (EFAULT);
17427
17428		mutex_enter(&dtrace_lock);
17429		dof = dtrace_dof_create(state);
17430		mutex_exit(&dtrace_lock);
17431
17432		len = MIN(hdr.dofh_loadsz, dof->dofh_loadsz);
17433		rval = copyout(dof, (void *)arg, len);
17434		dtrace_dof_destroy(dof);
17435
17436		return (rval == 0 ? 0 : EFAULT);
17437	}
17438
17439	case DTRACEIOC_AGGSNAP:
17440	case DTRACEIOC_BUFSNAP: {
17441		dtrace_bufdesc_t desc;
17442		caddr_t cached;
17443		dtrace_buffer_t *buf;
17444
17445		if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
17446			return (EFAULT);
17447
17448		if (desc.dtbd_cpu < 0 || desc.dtbd_cpu >= NCPU)
17449			return (EINVAL);
17450
17451		mutex_enter(&dtrace_lock);
17452
17453		if (cmd == DTRACEIOC_BUFSNAP) {
17454			buf = &state->dts_buffer[desc.dtbd_cpu];
17455		} else {
17456			buf = &state->dts_aggbuffer[desc.dtbd_cpu];
17457		}
17458
17459		if (buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL)) {
17460			size_t sz = buf->dtb_offset;
17461
17462			if (state->dts_activity != DTRACE_ACTIVITY_STOPPED) {
17463				mutex_exit(&dtrace_lock);
17464				return (EBUSY);
17465			}
17466
17467			/*
17468			 * If this buffer has already been consumed, we're
17469			 * going to indicate that there's nothing left here
17470			 * to consume.
17471			 */
17472			if (buf->dtb_flags & DTRACEBUF_CONSUMED) {
17473				mutex_exit(&dtrace_lock);
17474
17475				desc.dtbd_size = 0;
17476				desc.dtbd_drops = 0;
17477				desc.dtbd_errors = 0;
17478				desc.dtbd_oldest = 0;
17479				sz = sizeof (desc);
17480
17481				if (copyout(&desc, (void *)arg, sz) != 0)
17482					return (EFAULT);
17483
17484				return (0);
17485			}
17486
17487			/*
17488			 * If this is a ring buffer that has wrapped, we want
17489			 * to copy the whole thing out.
17490			 */
17491			if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
17492				dtrace_buffer_polish(buf);
17493				sz = buf->dtb_size;
17494			}
17495
17496			if (copyout(buf->dtb_tomax, desc.dtbd_data, sz) != 0) {
17497				mutex_exit(&dtrace_lock);
17498				return (EFAULT);
17499			}
17500
17501			desc.dtbd_size = sz;
17502			desc.dtbd_drops = buf->dtb_drops;
17503			desc.dtbd_errors = buf->dtb_errors;
17504			desc.dtbd_oldest = buf->dtb_xamot_offset;
17505			desc.dtbd_timestamp = dtrace_gethrtime();
17506
17507			mutex_exit(&dtrace_lock);
17508
17509			if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
17510				return (EFAULT);
17511
17512			buf->dtb_flags |= DTRACEBUF_CONSUMED;
17513
17514			return (0);
17515		}
17516
17517		if (buf->dtb_tomax == NULL) {
17518			ASSERT(buf->dtb_xamot == NULL);
17519			mutex_exit(&dtrace_lock);
17520			return (ENOENT);
17521		}
17522
17523		cached = buf->dtb_tomax;
17524		ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
17525
17526		dtrace_xcall(desc.dtbd_cpu,
17527		    (dtrace_xcall_t)dtrace_buffer_switch, buf);
17528
17529		state->dts_errors += buf->dtb_xamot_errors;
17530
17531		/*
17532		 * If the buffers did not actually switch, then the cross call
17533		 * did not take place -- presumably because the given CPU is
17534		 * not in the ready set.  If this is the case, we'll return
17535		 * ENOENT.
17536		 */
17537		if (buf->dtb_tomax == cached) {
17538			ASSERT(buf->dtb_xamot != cached);
17539			mutex_exit(&dtrace_lock);
17540			return (ENOENT);
17541		}
17542
17543		ASSERT(cached == buf->dtb_xamot);
17544
17545		/*
17546		 * We have our snapshot; now copy it out.
17547		 */
17548		if (copyout(buf->dtb_xamot, desc.dtbd_data,
17549		    buf->dtb_xamot_offset) != 0) {
17550			mutex_exit(&dtrace_lock);
17551			return (EFAULT);
17552		}
17553
17554		desc.dtbd_size = buf->dtb_xamot_offset;
17555		desc.dtbd_drops = buf->dtb_xamot_drops;
17556		desc.dtbd_errors = buf->dtb_xamot_errors;
17557		desc.dtbd_oldest = 0;
17558		desc.dtbd_timestamp = buf->dtb_switched;
17559
17560		mutex_exit(&dtrace_lock);
17561
17562		/*
17563		 * Finally, copy out the buffer description.
17564		 */
17565		if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
17566			return (EFAULT);
17567
17568		return (0);
17569	}
17570
17571	case DTRACEIOC_CONF: {
17572		dtrace_conf_t conf;
17573
17574		bzero(&conf, sizeof (conf));
17575		conf.dtc_difversion = DIF_VERSION;
17576		conf.dtc_difintregs = DIF_DIR_NREGS;
17577		conf.dtc_diftupregs = DIF_DTR_NREGS;
17578		conf.dtc_ctfmodel = CTF_MODEL_NATIVE;
17579
17580		if (copyout(&conf, (void *)arg, sizeof (conf)) != 0)
17581			return (EFAULT);
17582
17583		return (0);
17584	}
17585
17586	case DTRACEIOC_STATUS: {
17587		dtrace_status_t stat;
17588		dtrace_dstate_t *dstate;
17589		int i, j;
17590		uint64_t nerrs;
17591
17592		/*
17593		 * See the comment in dtrace_state_deadman() for the reason
17594		 * for setting dts_laststatus to INT64_MAX before setting
17595		 * it to the correct value.
17596		 */
17597		state->dts_laststatus = INT64_MAX;
17598		dtrace_membar_producer();
17599		state->dts_laststatus = dtrace_gethrtime();
17600
17601		bzero(&stat, sizeof (stat));
17602
17603		mutex_enter(&dtrace_lock);
17604
17605		if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE) {
17606			mutex_exit(&dtrace_lock);
17607			return (ENOENT);
17608		}
17609
17610		if (state->dts_activity == DTRACE_ACTIVITY_DRAINING)
17611			stat.dtst_exiting = 1;
17612
17613		nerrs = state->dts_errors;
17614		dstate = &state->dts_vstate.dtvs_dynvars;
17615
17616		for (i = 0; i < NCPU; i++) {
17617			dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[i];
17618
17619			stat.dtst_dyndrops += dcpu->dtdsc_drops;
17620			stat.dtst_dyndrops_dirty += dcpu->dtdsc_dirty_drops;
17621			stat.dtst_dyndrops_rinsing += dcpu->dtdsc_rinsing_drops;
17622
17623			if (state->dts_buffer[i].dtb_flags & DTRACEBUF_FULL)
17624				stat.dtst_filled++;
17625
17626			nerrs += state->dts_buffer[i].dtb_errors;
17627
17628			for (j = 0; j < state->dts_nspeculations; j++) {
17629				dtrace_speculation_t *spec;
17630				dtrace_buffer_t *buf;
17631
17632				spec = &state->dts_speculations[j];
17633				buf = &spec->dtsp_buffer[i];
17634				stat.dtst_specdrops += buf->dtb_xamot_drops;
17635			}
17636		}
17637
17638		stat.dtst_specdrops_busy = state->dts_speculations_busy;
17639		stat.dtst_specdrops_unavail = state->dts_speculations_unavail;
17640		stat.dtst_stkstroverflows = state->dts_stkstroverflows;
17641		stat.dtst_dblerrors = state->dts_dblerrors;
17642		stat.dtst_killed =
17643		    (state->dts_activity == DTRACE_ACTIVITY_KILLED);
17644		stat.dtst_errors = nerrs;
17645
17646		mutex_exit(&dtrace_lock);
17647
17648		if (copyout(&stat, (void *)arg, sizeof (stat)) != 0)
17649			return (EFAULT);
17650
17651		return (0);
17652	}
17653
17654	case DTRACEIOC_FORMAT: {
17655		dtrace_fmtdesc_t fmt;
17656		char *str;
17657		int len;
17658
17659		if (copyin((void *)arg, &fmt, sizeof (fmt)) != 0)
17660			return (EFAULT);
17661
17662		mutex_enter(&dtrace_lock);
17663
17664		if (fmt.dtfd_format == 0 ||
17665		    fmt.dtfd_format > state->dts_nformats) {
17666			mutex_exit(&dtrace_lock);
17667			return (EINVAL);
17668		}
17669
17670		/*
17671		 * Format strings are allocated contiguously and they are
17672		 * never freed; if a format index is less than the number
17673		 * of formats, we can assert that the format map is non-NULL
17674		 * and that the format for the specified index is non-NULL.
17675		 */
17676		ASSERT(state->dts_formats != NULL);
17677		str = state->dts_formats[fmt.dtfd_format - 1];
17678		ASSERT(str != NULL);
17679
17680		len = strlen(str) + 1;
17681
17682		if (len > fmt.dtfd_length) {
17683			fmt.dtfd_length = len;
17684
17685			if (copyout(&fmt, (void *)arg, sizeof (fmt)) != 0) {
17686				mutex_exit(&dtrace_lock);
17687				return (EINVAL);
17688			}
17689		} else {
17690			if (copyout(str, fmt.dtfd_string, len) != 0) {
17691				mutex_exit(&dtrace_lock);
17692				return (EINVAL);
17693			}
17694		}
17695
17696		mutex_exit(&dtrace_lock);
17697		return (0);
17698	}
17699
17700	default:
17701		break;
17702	}
17703
17704	return (ENOTTY);
17705}
17706
/*ARGSUSED*/
/*
 * DDI detach(9E) entry point for the DTrace pseudo-device (Solaris build).
 *
 * DDI_SUSPEND is a no-op; DDI_DETACH tears down all global framework state.
 * Detach fails (DDI_FAILURE) if any helpers still exist or if the built-in
 * "dtrace" meta-provider cannot be unregistered.  On success, any anonymous
 * state is destroyed, all probe bookkeeping (probe array, hashes, arenas,
 * caches) is released, and the minor node and soft state are removed.
 */
static int
dtrace_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	dtrace_state_t *state;

	switch (cmd) {
	case DDI_DETACH:
		break;

	case DDI_SUSPEND:
		/* Nothing to quiesce here; suspend always succeeds. */
		return (DDI_SUCCESS);

	default:
		return (DDI_FAILURE);
	}

	/*
	 * Take the locks in the framework's canonical order:
	 * cpu_lock -> dtrace_provider_lock -> dtrace_lock.
	 */
	mutex_enter(&cpu_lock);
	mutex_enter(&dtrace_provider_lock);
	mutex_enter(&dtrace_lock);

	/* The driver framework should not detach us with open consumers. */
	ASSERT(dtrace_opens == 0);

	if (dtrace_helpers > 0) {
		/* Outstanding helpers pin the framework; refuse to detach. */
		mutex_exit(&dtrace_provider_lock);
		mutex_exit(&dtrace_lock);
		mutex_exit(&cpu_lock);
		return (DDI_FAILURE);
	}

	if (dtrace_unregister((dtrace_provider_id_t)dtrace_provider) != 0) {
		/*
		 * The "dtrace" provider itself could not be unregistered
		 * (presumably because probes are still enabled); bail out.
		 */
		mutex_exit(&dtrace_provider_lock);
		mutex_exit(&dtrace_lock);
		mutex_exit(&cpu_lock);
		return (DDI_FAILURE);
	}

	dtrace_provider = NULL;

	if ((state = dtrace_anon_grab()) != NULL) {
		/*
		 * If there were ECBs on this state, the provider should
		 * have not been allowed to detach; assert that there is
		 * none.
		 */
		ASSERT(state->dts_necbs == 0);
		dtrace_state_destroy(state);

		/*
		 * If we're being detached with anonymous state, we need to
		 * indicate to the kernel debugger that DTrace is now inactive.
		 */
		(void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
	}

	/*
	 * Clear every hook the rest of the kernel uses to call into DTrace
	 * so that nothing fires after we have torn the framework down.
	 */
	bzero(&dtrace_anon, sizeof (dtrace_anon_t));
	unregister_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
	dtrace_cpu_init = NULL;
	dtrace_helpers_cleanup = NULL;
	dtrace_helpers_fork = NULL;
	dtrace_cpustart_init = NULL;
	dtrace_cpustart_fini = NULL;
	dtrace_debugger_init = NULL;
	dtrace_debugger_fini = NULL;
	dtrace_modload = NULL;
	dtrace_modunload = NULL;

	ASSERT(dtrace_getf == 0);
	ASSERT(dtrace_closef == NULL);

	mutex_exit(&cpu_lock);

	if (dtrace_helptrace_enabled) {
		kmem_free(dtrace_helptrace_buffer, dtrace_helptrace_bufsize);
		dtrace_helptrace_buffer = NULL;
	}

	/* Release the probe array and the by-name lookup hashes. */
	kmem_free(dtrace_probes, dtrace_nprobes * sizeof (dtrace_probe_t *));
	dtrace_probes = NULL;
	dtrace_nprobes = 0;

	dtrace_hash_destroy(dtrace_bymod);
	dtrace_hash_destroy(dtrace_byfunc);
	dtrace_hash_destroy(dtrace_byname);
	dtrace_bymod = NULL;
	dtrace_byfunc = NULL;
	dtrace_byname = NULL;

	kmem_cache_destroy(dtrace_state_cache);
	vmem_destroy(dtrace_minor);
	vmem_destroy(dtrace_arena);

	if (dtrace_toxrange != NULL) {
		kmem_free(dtrace_toxrange,
		    dtrace_toxranges_max * sizeof (dtrace_toxrange_t));
		dtrace_toxrange = NULL;
		dtrace_toxranges = 0;
		dtrace_toxranges_max = 0;
	}

	ddi_remove_minor_node(dtrace_devi, NULL);
	dtrace_devi = NULL;

	ddi_soft_state_fini(&dtrace_softstate);

	ASSERT(dtrace_vtime_references == 0);
	ASSERT(dtrace_opens == 0);
	ASSERT(dtrace_retained == NULL);

	mutex_exit(&dtrace_lock);
	mutex_exit(&dtrace_provider_lock);

	/*
	 * We don't destroy the task queue until after we have dropped our
	 * locks (taskq_destroy() may block on running tasks).  To prevent
	 * attempting to do work after we have effectively detached but before
	 * the task queue has been destroyed, all tasks dispatched via the
	 * task queue must check that DTrace is still attached before
	 * performing any operation.
	 */
	taskq_destroy(dtrace_taskq);
	dtrace_taskq = NULL;

	return (DDI_SUCCESS);
}
17832#endif
17833
17834#if defined(sun)
17835/*ARGSUSED*/
17836static int
17837dtrace_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
17838{
17839	int error;
17840
17841	switch (infocmd) {
17842	case DDI_INFO_DEVT2DEVINFO:
17843		*result = (void *)dtrace_devi;
17844		error = DDI_SUCCESS;
17845		break;
17846	case DDI_INFO_DEVT2INSTANCE:
17847		*result = (void *)0;
17848		error = DDI_SUCCESS;
17849		break;
17850	default:
17851		error = DDI_FAILURE;
17852	}
17853	return (error);
17854}
17855#endif
17856
17857#if defined(sun)
/*
 * Solaris character-device entry points for /dev/dtrace.  Consumers open
 * the device (dtrace_open) and drive the framework via dtrace_ioctl;
 * read/write and the other entry points are stubbed out.
 */
static struct cb_ops dtrace_cb_ops = {
	dtrace_open,		/* open */
	dtrace_close,		/* close */
	nulldev,		/* strategy */
	nulldev,		/* print */
	nodev,			/* dump */
	nodev,			/* read */
	nodev,			/* write */
	dtrace_ioctl,		/* ioctl */
	nodev,			/* devmap */
	nodev,			/* mmap */
	nodev,			/* segmap */
	nochpoll,		/* poll */
	ddi_prop_op,		/* cb_prop_op */
	0,			/* streamtab  */
	D_NEW | D_MP		/* Driver compatibility flag */
};
17875
/*
 * Solaris driver operations vector: wires up dtrace_attach/dtrace_detach,
 * the getinfo handler, and the character-device entry points.
 */
static struct dev_ops dtrace_ops = {
	DEVO_REV,		/* devo_rev */
	0,			/* refcnt */
	dtrace_info,		/* get_dev_info */
	nulldev,		/* identify */
	nulldev,		/* probe */
	dtrace_attach,		/* attach */
	dtrace_detach,		/* detach */
	nodev,			/* reset */
	&dtrace_cb_ops,		/* driver operations */
	NULL,			/* bus operations */
	nodev			/* dev power */
};
17889
/* Solaris module description: advertise dtrace as a (pseudo) driver. */
static struct modldrv modldrv = {
	&mod_driverops,		/* module type (this is a pseudo driver) */
	"Dynamic Tracing",	/* name of module */
	&dtrace_ops,		/* driver ops */
};
17895
/* Module linkage handed to mod_install()/mod_remove()/mod_info() below. */
static struct modlinkage modlinkage = {
	MODREV_1,
	(void *)&modldrv,
	NULL
};
17901
17902int
17903_init(void)
17904{
17905	return (mod_install(&modlinkage));
17906}
17907
17908int
17909_info(struct modinfo *modinfop)
17910{
17911	return (mod_info(&modlinkage, modinfop));
17912}
17913
17914int
17915_fini(void)
17916{
17917	return (mod_remove(&modlinkage));
17918}
17919#else
17920
/*
 * FreeBSD glue: forward declarations for the ioctl handlers and the
 * load/unload routines, whose implementations come from the dtrace_*.c
 * files included further below.
 */
static d_ioctl_t	dtrace_ioctl;
static d_ioctl_t	dtrace_ioctl_helper;
static void		dtrace_load(void *);
static int		dtrace_unload(void);
/* Device nodes for /dev/dtrace and the DTrace helper device. */
static struct cdev	*dtrace_dev;
static struct cdev	*helper_dev;

void dtrace_invop_init(void);
void dtrace_invop_uninit(void);
17930
/*
 * FreeBSD character-device switch for /dev/dtrace: consumers open the
 * device (dtrace_open) and drive the framework through dtrace_ioctl.
 */
static struct cdevsw dtrace_cdevsw = {
	.d_version	= D_VERSION,
	.d_ioctl	= dtrace_ioctl,
	.d_open		= dtrace_open,
	.d_name		= "dtrace",
};
17937
/*
 * Character-device switch for the DTrace helper device; it supports only
 * ioctl (dtrace_ioctl_helper) -- note there is no open entry point here.
 */
static struct cdevsw helper_cdevsw = {
	.d_version	= D_VERSION,
	.d_ioctl	= dtrace_ioctl_helper,
	.d_name		= "helper",
};
17943
17944#include <dtrace_anon.c>
17945#include <dtrace_ioctl.c>
17946#include <dtrace_load.c>
17947#include <dtrace_modevent.c>
17948#include <dtrace_sysctl.c>
17949#include <dtrace_unload.c>
17950#include <dtrace_vtime.c>
17951#include <dtrace_hacks.c>
17952#include <dtrace_isa.c>
17953
/* Load/unload the framework at the dedicated DTrace SYSINIT stage. */
SYSINIT(dtrace_load, SI_SUB_DTRACE, SI_ORDER_FIRST, dtrace_load, NULL);
SYSUNINIT(dtrace_unload, SI_SUB_DTRACE, SI_ORDER_FIRST, dtrace_unload, NULL);
/* Anonymous (boot-time) enablings are set up at a later SYSINIT stage. */
SYSINIT(dtrace_anon_init, SI_SUB_DTRACE_ANON, SI_ORDER_FIRST, dtrace_anon_init, NULL);

DEV_MODULE(dtrace, dtrace_modevent, NULL);
MODULE_VERSION(dtrace, 1);
MODULE_DEPEND(dtrace, opensolaris, 1, 1, 1);
17961#endif
17962