/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 *
 * $FreeBSD: stable/9/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c 262038 2014-02-17 12:42:57Z avg $
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Copyright (c) 2012 by Delphix. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * DTrace - Dynamic Tracing for Solaris
 *
 * This is the implementation of the Solaris Dynamic Tracing framework
 * (DTrace).  The user-visible interface to DTrace is described at length in
 * the "Solaris Dynamic Tracing Guide".  The interfaces between the libdtrace
 * library, the in-kernel DTrace framework, and the DTrace providers are
 * described in the block comments in the <sys/dtrace.h> header file.  The
 * internal architecture of DTrace is described in the block comments in the
 * <sys/dtrace_impl.h> header file.  The comments contained within the DTrace
 * implementation very much assume mastery of all of these sources; if one has
 * an unanswered question about the implementation, one should consult them
 * first.
 *
 * The functions here are ordered roughly as follows:
 *
 *   - Probe context functions
 *   - Probe hashing functions
 *   - Non-probe context utility functions
 *   - Matching functions
 *   - Provider-to-Framework API functions
 *   - Probe management functions
 *   - DIF object functions
 *   - Format functions
 *   - Predicate functions
 *   - ECB functions
 *   - Buffer functions
 *   - Enabling functions
 *   - DOF functions
 *   - Anonymous enabling functions
 *   - Consumer state functions
 *   - Helper functions
 *   - Hook functions
 *   - Driver cookbook functions
 *
 * Each group of functions begins with a block comment labelled the "DTrace
 * [Group] Functions", allowing one to find each block by searching forward
 * on capital-f functions.
 */
#include <sys/errno.h>
#if !defined(sun)
#include <sys/time.h>
#endif
#include <sys/stat.h>
#include <sys/modctl.h>
#include <sys/conf.h>
#include <sys/systm.h>
#if defined(sun)
#include <sys/ddi.h>
#include <sys/sunddi.h>
#endif
#include <sys/cpuvar.h>
#include <sys/kmem.h>
#if defined(sun)
#include <sys/strsubr.h>
#endif
#include <sys/sysmacros.h>
#include <sys/dtrace_impl.h>
#include <sys/atomic.h>
#include <sys/cmn_err.h>
#if defined(sun)
#include <sys/mutex_impl.h>
#include <sys/rwlock_impl.h>
#endif
#include <sys/ctf_api.h>
#if defined(sun)
#include <sys/panic.h>
#include <sys/priv_impl.h>
#endif
#include <sys/policy.h>
#if defined(sun)
#include <sys/cred_impl.h>
#include <sys/procfs_isa.h>
#endif
#include <sys/taskq.h>
#if defined(sun)
#include <sys/mkdev.h>
#include <sys/kdi.h>
#endif
#include <sys/zone.h>
#include <sys/socket.h>
#include <netinet/in.h>

/* FreeBSD includes: */
#if !defined(sun)
#include <sys/callout.h>
#include <sys/ctype.h>
#include <sys/eventhandler.h>
#include <sys/limits.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/sysctl.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/rwlock.h>
#include <sys/sx.h>
#include <sys/dtrace_bsd.h>
#include <netinet/in.h>
#include "dtrace_cddl.h"
#include "dtrace_debug.c"
#endif

/*
 * DTrace Tunable Variables
 *
 * The following variables may be tuned by adding a line to /etc/system that
 * includes both the name of the DTrace module ("dtrace") and the name of the
 * variable.  For example:
 *
 *   set dtrace:dtrace_destructive_disallow = 1
 *
 * In general, the only variables that one should be tuning this way are those
 * that affect system-wide DTrace behavior, and for which the default behavior
 * is undesirable.  Most of these variables are tunable on a per-consumer
 * basis using DTrace options, and need not be tuned on a system-wide basis.
 * When tuning these variables, avoid pathological values; while some attempt
 * is made to verify the integrity of these variables, they are not considered
 * part of the supported interface to DTrace, and they are therefore not
 * checked comprehensively.  Further, these variables should not be tuned
 * dynamically via "mdb -kw" or other means; they should only be tuned via
 * /etc/system.
 */
int		dtrace_destructive_disallow = 0;
dtrace_optval_t	dtrace_nonroot_maxsize = (16 * 1024 * 1024);
size_t		dtrace_difo_maxsize = (256 * 1024);
dtrace_optval_t	dtrace_dof_maxsize = (256 * 1024);
size_t		dtrace_global_maxsize = (16 * 1024);
size_t		dtrace_actions_max = (16 * 1024);
size_t		dtrace_retain_max = 1024;
dtrace_optval_t	dtrace_helper_actions_max = 128;
dtrace_optval_t	dtrace_helper_providers_max = 32;
dtrace_optval_t	dtrace_dstate_defsize = (1 * 1024 * 1024);
size_t		dtrace_strsize_default = 256;
dtrace_optval_t	dtrace_cleanrate_default = 9900990;		/* 101 hz */
dtrace_optval_t	dtrace_cleanrate_min = 200000;			/* 5000 hz */
dtrace_optval_t	dtrace_cleanrate_max = (uint64_t)60 * NANOSEC;	/* 1/minute */
dtrace_optval_t	dtrace_aggrate_default = NANOSEC;		/* 1 hz */
dtrace_optval_t	dtrace_statusrate_default = NANOSEC;		/* 1 hz */
dtrace_optval_t dtrace_statusrate_max = (hrtime_t)10 * NANOSEC;	 /* 6/minute */
dtrace_optval_t	dtrace_switchrate_default = NANOSEC;		/* 1 hz */
dtrace_optval_t	dtrace_nspec_default = 1;
dtrace_optval_t	dtrace_specsize_default = 32 * 1024;
dtrace_optval_t dtrace_stackframes_default = 20;
dtrace_optval_t dtrace_ustackframes_default = 20;
dtrace_optval_t dtrace_jstackframes_default = 50;
dtrace_optval_t dtrace_jstackstrsize_default = 512;
int		dtrace_msgdsize_max = 128;
hrtime_t	dtrace_chill_max = 500 * (NANOSEC / MILLISEC);	/* 500 ms */
hrtime_t	dtrace_chill_interval = NANOSEC;		/* 1000 ms */
int		dtrace_devdepth_max = 32;
int		dtrace_err_verbose;
hrtime_t	dtrace_deadman_interval = NANOSEC;
hrtime_t	dtrace_deadman_timeout = (hrtime_t)10 * NANOSEC;
hrtime_t	dtrace_deadman_user = (hrtime_t)30 * NANOSEC;
hrtime_t	dtrace_unregister_defunct_reap = (hrtime_t)60 * NANOSEC;

/*
 * DTrace External Variables
 *
 * As dtrace(7D) is a kernel module, any DTrace variables are obviously
 * available to DTrace consumers via the backtick (`) syntax.  One of these,
 * dtrace_zero, is made deliberately so:  it is provided as a source of
 * well-known, zero-filled memory.  While this variable is not documented,
 * it is used by some translators as an implementation detail.
 */
const char	dtrace_zero[256] = { 0 };	/* zero-filled memory */
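
/*
 * An illustrative sketch of the backtick syntax mentioned above (not
 * canonical usage):  a D clause may read a byte of this region as
 * `dtrace_zero[0], just as it can name any other kernel variable with a
 * backtick prefix.
 */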

/*
 * DTrace Internal Variables
 */
#if defined(sun)
static dev_info_t	*dtrace_devi;		/* device info */
#endif
#if defined(sun)
static vmem_t		*dtrace_arena;		/* probe ID arena */
static vmem_t		*dtrace_minor;		/* minor number arena */
#else
static taskq_t		*dtrace_taskq;		/* task queue */
static struct unrhdr	*dtrace_arena;		/* Probe ID number.     */
#endif
static dtrace_probe_t	**dtrace_probes;	/* array of all probes */
static int		dtrace_nprobes;		/* number of probes */
static dtrace_provider_t *dtrace_provider;	/* provider list */
static dtrace_meta_t	*dtrace_meta_pid;	/* user-land meta provider */
static int		dtrace_opens;		/* number of opens */
static int		dtrace_helpers;		/* number of helpers */
#if defined(sun)
static void		*dtrace_softstate;	/* softstate pointer */
#endif
static dtrace_hash_t	*dtrace_bymod;		/* probes hashed by module */
static dtrace_hash_t	*dtrace_byfunc;		/* probes hashed by function */
static dtrace_hash_t	*dtrace_byname;		/* probes hashed by name */
static dtrace_toxrange_t *dtrace_toxrange;	/* toxic range array */
static int		dtrace_toxranges;	/* number of toxic ranges */
static int		dtrace_toxranges_max;	/* size of toxic range array */
static dtrace_anon_t	dtrace_anon;		/* anonymous enabling */
static kmem_cache_t	*dtrace_state_cache;	/* cache for dynamic state */
static uint64_t		dtrace_vtime_references; /* number of vtimestamp refs */
static kthread_t	*dtrace_panicked;	/* panicking thread */
static dtrace_ecb_t	*dtrace_ecb_create_cache; /* cached created ECB */
static dtrace_genid_t	dtrace_probegen;	/* current probe generation */
static dtrace_helpers_t *dtrace_deferred_pid;	/* deferred helper list */
static dtrace_enabling_t *dtrace_retained;	/* list of retained enablings */
static dtrace_dynvar_t	dtrace_dynhash_sink;	/* end of dynamic hash chains */
#if !defined(sun)
static struct mtx	dtrace_unr_mtx;
MTX_SYSINIT(dtrace_unr_mtx, &dtrace_unr_mtx, "Unique resource identifier", MTX_DEF);
int		dtrace_in_probe;	/* non-zero if executing a probe */
#if defined(__i386__) || defined(__amd64__)
uintptr_t	dtrace_in_probe_addr;	/* Address of invop when already in probe */
#endif
static eventhandler_tag	dtrace_kld_load_tag;
static eventhandler_tag	dtrace_kld_unload_try_tag;
#endif

/*
 * DTrace Locking
 * DTrace is protected by three (relatively coarse-grained) locks:
 *
 * (1) dtrace_lock is required to manipulate essentially any DTrace state,
 *     including enabling state, probes, ECBs, consumer state, helper state,
 *     etc.  Importantly, dtrace_lock is _not_ required when in probe context;
 *     probe context is lock-free -- synchronization is handled via the
 *     dtrace_sync() cross call mechanism.
 *
 * (2) dtrace_provider_lock is required when manipulating provider state, or
 *     when provider state must be held constant.
 *
 * (3) dtrace_meta_lock is required when manipulating meta provider state, or
 *     when meta provider state must be held constant.
 *
 * The lock ordering between these three locks is dtrace_meta_lock before
 * dtrace_provider_lock before dtrace_lock.  (In particular, there are
 * several places where dtrace_provider_lock is held by the framework as it
 * calls into the providers -- which then call back into the framework,
 * grabbing dtrace_lock.)
 *
 * There are two other locks in the mix:  mod_lock and cpu_lock.  With respect
 * to dtrace_provider_lock and dtrace_lock, cpu_lock continues its historical
 * role as a coarse-grained lock; it is acquired before both of these locks.
 * With respect to dtrace_meta_lock, its behavior is stranger:  cpu_lock must
 * be acquired _between_ dtrace_meta_lock and any other DTrace locks.
 * mod_lock is similar with respect to dtrace_provider_lock in that it must be
 * acquired _between_ dtrace_provider_lock and dtrace_lock.
 */
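
/*
 * Taken together, the rules above imply a single total acquisition order
 * (a reading aid, not an additional constraint):
 *
 *   dtrace_meta_lock -> cpu_lock -> dtrace_provider_lock -> mod_lock ->
 *   dtrace_lock
 */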
static kmutex_t		dtrace_lock;		/* probe state lock */
static kmutex_t		dtrace_provider_lock;	/* provider state lock */
static kmutex_t		dtrace_meta_lock;	/* meta-provider state lock */

#if !defined(sun)
/* XXX FreeBSD hacks. */
static kmutex_t		mod_lock;

#define cr_suid		cr_svuid
#define cr_sgid		cr_svgid
#define	ipaddr_t	in_addr_t
#define mod_modname	pathname
#define vuprintf	vprintf
#define ttoproc(_a)	((_a)->td_proc)
#define crgetzoneid(_a)	0
#define	NCPU		MAXCPU
#define SNOCD		0
#define CPU_ON_INTR(_a)	0

#define PRIV_EFFECTIVE		(1 << 0)
#define PRIV_DTRACE_KERNEL	(1 << 1)
#define PRIV_DTRACE_PROC	(1 << 2)
#define PRIV_DTRACE_USER	(1 << 3)
#define PRIV_PROC_OWNER		(1 << 4)
#define PRIV_PROC_ZONE		(1 << 5)
#define PRIV_ALL		~0

SYSCTL_NODE(_debug, OID_AUTO, dtrace, CTLFLAG_RD, 0, "DTrace Information");
#endif

#if defined(sun)
#define curcpu	CPU->cpu_id
#endif


/*
 * DTrace Provider Variables
 *
 * These are the variables relating to DTrace as a provider (that is, the
 * provider of the BEGIN, END, and ERROR probes).
 */
static dtrace_pattr_t	dtrace_provider_attr = {
{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
};

static void
dtrace_nullop(void)
{}

static dtrace_pops_t	dtrace_provider_ops = {
	(void (*)(void *, dtrace_probedesc_t *))dtrace_nullop,
	(void (*)(void *, modctl_t *))dtrace_nullop,
	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
	NULL,
	NULL,
	NULL,
	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop
};

static dtrace_id_t	dtrace_probeid_begin;	/* special BEGIN probe */
static dtrace_id_t	dtrace_probeid_end;	/* special END probe */
dtrace_id_t		dtrace_probeid_error;	/* special ERROR probe */

/*
 * DTrace Helper Tracing Variables
 */
uint32_t dtrace_helptrace_next = 0;
uint32_t dtrace_helptrace_nlocals;
char	*dtrace_helptrace_buffer;
int	dtrace_helptrace_bufsize = 512 * 1024;

#ifdef DEBUG
int	dtrace_helptrace_enabled = 1;
#else
int	dtrace_helptrace_enabled = 0;
#endif

/*
 * DTrace Error Hashing
 *
 * On DEBUG kernels, DTrace will track the errors that it has seen in a hash
 * table.  This is very useful for checking coverage of tests that are
 * expected to induce DIF or DOF processing errors, and may be useful for
 * debugging problems in the DIF code generator or in DOF generation.  The
 * error hash may be examined with the ::dtrace_errhash MDB dcmd.
 */
#ifdef DEBUG
static dtrace_errhash_t	dtrace_errhash[DTRACE_ERRHASHSZ];
static const char *dtrace_errlast;
static kthread_t *dtrace_errthread;
static kmutex_t dtrace_errlock;
#endif

/*
 * DTrace Macros and Constants
 *
 * These are various macros that are useful in various spots in the
 * implementation, along with a few random constants that have no meaning
 * outside of the implementation.  There is no real structure to this cpp
 * mishmash -- but is there ever?
 */
#define	DTRACE_HASHSTR(hash, probe)	\
	dtrace_hash_str(*((char **)((uintptr_t)(probe) + (hash)->dth_stroffs)))

#define	DTRACE_HASHNEXT(hash, probe)	\
	(dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_nextoffs)

#define	DTRACE_HASHPREV(hash, probe)	\
	(dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_prevoffs)

#define	DTRACE_HASHEQ(hash, lhs, rhs)	\
	(strcmp(*((char **)((uintptr_t)(lhs) + (hash)->dth_stroffs)), \
	    *((char **)((uintptr_t)(rhs) + (hash)->dth_stroffs))) == 0)

#define	DTRACE_AGGHASHSIZE_SLEW		17

#define	DTRACE_V4MAPPED_OFFSET		(sizeof (uint32_t) * 3)

/*
 * The key for a thread-local variable consists of the lower 61 bits of the
 * t_did, plus the 3 bits of the highest active interrupt above LOCK_LEVEL.
 * We add DIF_VARIABLE_MAX to t_did to assure that the thread key is never
 * equal to a variable identifier.  This is necessary (but not sufficient) to
 * assure that global associative arrays never collide with thread-local
 * variables.  To guarantee that they cannot collide, we must also define the
 * order for keying dynamic variables.  That order is:
 *
 *   [ key0 ] ... [ keyn ] [ variable-key ] [ tls-key ]
 *
 * Because the variable-key and the tls-key are in orthogonal spaces, there is
 * no way for a global variable key signature to match a thread-local key
 * signature.
 */
#if defined(sun)
#define	DTRACE_TLS_THRKEY(where) { \
	uint_t intr = 0; \
	uint_t actv = CPU->cpu_intr_actv >> (LOCK_LEVEL + 1); \
	for (; actv; actv >>= 1) \
		intr++; \
	ASSERT(intr < (1 << 3)); \
	(where) = ((curthread->t_did + DIF_VARIABLE_MAX) & \
	    (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
}
#else
#define	DTRACE_TLS_THRKEY(where) { \
	solaris_cpu_t *_c = &solaris_cpu[curcpu]; \
	uint_t intr = 0; \
	uint_t actv = _c->cpu_intr_actv; \
	for (; actv; actv >>= 1) \
		intr++; \
	ASSERT(intr < (1 << 3)); \
	(where) = ((curthread->td_tid + DIF_VARIABLE_MAX) & \
	    (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
}
#endif

#define	DT_BSWAP_8(x)	((x) & 0xff)
#define	DT_BSWAP_16(x)	((DT_BSWAP_8(x) << 8) | DT_BSWAP_8((x) >> 8))
#define	DT_BSWAP_32(x)	((DT_BSWAP_16(x) << 16) | DT_BSWAP_16((x) >> 16))
#define	DT_BSWAP_64(x)	((DT_BSWAP_32(x) << 32) | DT_BSWAP_32((x) >> 32))
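
/*
 * Worked example:  DT_BSWAP_16(0x1234) yields 0x3412, and
 * DT_BSWAP_32(0x12345678) yields 0x78563412 -- each level swaps the two
 * halves produced by the level below it.
 */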

#define	DT_MASK_LO 0x00000000FFFFFFFFULL

#define	DTRACE_STORE(type, tomax, offset, what) \
	*((type *)((uintptr_t)(tomax) + (uintptr_t)offset)) = (type)(what);

#ifndef __x86
#define	DTRACE_ALIGNCHECK(addr, size, flags)				\
	if (addr & (size - 1)) {					\
		*flags |= CPU_DTRACE_BADALIGN;				\
		cpu_core[curcpu].cpuc_dtrace_illval = addr;	\
		return (0);						\
	}
#else
#define	DTRACE_ALIGNCHECK(addr, size, flags)
#endif

/*
 * Test whether a range of memory starting at testaddr of size testsz falls
 * within the range of memory described by baseaddr, basesz.  We take care
 * to avoid problems with overflow and underflow of the unsigned quantities,
 * and disallow all negative sizes.  Ranges of size 0 are allowed.
 */
#define	DTRACE_INRANGE(testaddr, testsz, baseaddr, basesz) \
	((testaddr) - (baseaddr) < (basesz) && \
	(testaddr) + (testsz) - (baseaddr) <= (basesz) && \
	(testaddr) + (testsz) >= (testaddr))
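
/*
 * Worked example of the overflow guard:  on a 64-bit kernel, testaddr =
 * 0xfffffffffffffff8 with testsz = 0x10 describes a range that wraps past
 * the top of the address space.  For a base region that also ends at the
 * top of memory, the first two clauses alone would accept it; the third
 * clause -- testaddr + testsz >= testaddr -- fails on the wrap and
 * correctly rejects the range.
 */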

/*
 * Test whether alloc_sz bytes will fit in the scratch region.  We isolate
 * alloc_sz on the righthand side of the comparison in order to avoid overflow
 * or underflow in the comparison with it.  This is simpler than the INRANGE
 * check above, because we know that the dtms_scratch_ptr is valid in the
 * range.  Allocations of size zero are allowed.
 */
#define	DTRACE_INSCRATCH(mstate, alloc_sz) \
	((mstate)->dtms_scratch_base + (mstate)->dtms_scratch_size - \
	(mstate)->dtms_scratch_ptr >= (alloc_sz))

#define	DTRACE_LOADFUNC(bits)						\
/*CSTYLED*/								\
uint##bits##_t								\
dtrace_load##bits(uintptr_t addr)					\
{									\
	size_t size = bits / NBBY;					\
	/*CSTYLED*/							\
	uint##bits##_t rval;						\
	int i;								\
	volatile uint16_t *flags = (volatile uint16_t *)		\
	    &cpu_core[curcpu].cpuc_dtrace_flags;			\
									\
	DTRACE_ALIGNCHECK(addr, size, flags);				\
									\
	for (i = 0; i < dtrace_toxranges; i++) {			\
		if (addr >= dtrace_toxrange[i].dtt_limit)		\
			continue;					\
									\
		if (addr + size <= dtrace_toxrange[i].dtt_base)		\
			continue;					\
									\
		/*							\
		 * This address falls within a toxic region; return 0.	\
		 */							\
		*flags |= CPU_DTRACE_BADADDR;				\
		cpu_core[curcpu].cpuc_dtrace_illval = addr;		\
		return (0);						\
	}								\
									\
	*flags |= CPU_DTRACE_NOFAULT;					\
	/*CSTYLED*/							\
	rval = *((volatile uint##bits##_t *)addr);			\
	*flags &= ~CPU_DTRACE_NOFAULT;					\
									\
	return (!(*flags & CPU_DTRACE_FAULT) ? rval : 0);		\
}
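
/*
 * For reference, DTRACE_LOADFUNC(8) expands (CSTYLED comments aside) to a
 * function with the signature
 *
 *	uint8_t dtrace_load8(uintptr_t addr);
 *
 * that returns 0 -- with CPU_DTRACE_BADADDR or CPU_DTRACE_BADALIGN set --
 * instead of faulting when asked to load from a toxic or misaligned
 * address.
 */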

#ifdef _LP64
#define	dtrace_loadptr	dtrace_load64
#else
#define	dtrace_loadptr	dtrace_load32
#endif

#define	DTRACE_DYNHASH_FREE	0
#define	DTRACE_DYNHASH_SINK	1
#define	DTRACE_DYNHASH_VALID	2

#define	DTRACE_MATCH_NEXT	0
#define	DTRACE_MATCH_DONE	1
#define	DTRACE_ANCHORED(probe)	((probe)->dtpr_func[0] != '\0')
#define	DTRACE_STATE_ALIGN	64

#define	DTRACE_FLAGS2FLT(flags)						\
	(((flags) & CPU_DTRACE_BADADDR) ? DTRACEFLT_BADADDR :		\
	((flags) & CPU_DTRACE_ILLOP) ? DTRACEFLT_ILLOP :		\
	((flags) & CPU_DTRACE_DIVZERO) ? DTRACEFLT_DIVZERO :		\
	((flags) & CPU_DTRACE_KPRIV) ? DTRACEFLT_KPRIV :		\
	((flags) & CPU_DTRACE_UPRIV) ? DTRACEFLT_UPRIV :		\
	((flags) & CPU_DTRACE_TUPOFLOW) ?  DTRACEFLT_TUPOFLOW :		\
	((flags) & CPU_DTRACE_BADALIGN) ?  DTRACEFLT_BADALIGN :		\
	((flags) & CPU_DTRACE_NOSCRATCH) ?  DTRACEFLT_NOSCRATCH :	\
	((flags) & CPU_DTRACE_BADSTACK) ?  DTRACEFLT_BADSTACK :		\
	DTRACEFLT_UNKNOWN)

#define	DTRACEACT_ISSTRING(act)						\
	((act)->dta_kind == DTRACEACT_DIFEXPR &&			\
	(act)->dta_difo->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING)

/* Function prototype definitions: */
static size_t dtrace_strlen(const char *, size_t);
static dtrace_probe_t *dtrace_probe_lookup_id(dtrace_id_t id);
static void dtrace_enabling_provide(dtrace_provider_t *);
static int dtrace_enabling_match(dtrace_enabling_t *, int *);
static void dtrace_enabling_matchall(void);
static void dtrace_enabling_reap(void);
static dtrace_state_t *dtrace_anon_grab(void);
static uint64_t dtrace_helper(int, dtrace_mstate_t *,
    dtrace_state_t *, uint64_t, uint64_t);
static dtrace_helpers_t *dtrace_helpers_create(proc_t *);
static void dtrace_buffer_drop(dtrace_buffer_t *);
static int dtrace_buffer_consumed(dtrace_buffer_t *, hrtime_t when);
static intptr_t dtrace_buffer_reserve(dtrace_buffer_t *, size_t, size_t,
    dtrace_state_t *, dtrace_mstate_t *);
static int dtrace_state_option(dtrace_state_t *, dtrace_optid_t,
    dtrace_optval_t);
static int dtrace_ecb_create_enable(dtrace_probe_t *, void *);
static void dtrace_helper_provider_destroy(dtrace_helper_provider_t *);
uint16_t dtrace_load16(uintptr_t);
uint32_t dtrace_load32(uintptr_t);
uint64_t dtrace_load64(uintptr_t);
uint8_t dtrace_load8(uintptr_t);
void dtrace_dynvar_clean(dtrace_dstate_t *);
dtrace_dynvar_t *dtrace_dynvar(dtrace_dstate_t *, uint_t, dtrace_key_t *,
    size_t, dtrace_dynvar_op_t, dtrace_mstate_t *, dtrace_vstate_t *);
uintptr_t dtrace_dif_varstr(uintptr_t, dtrace_state_t *, dtrace_mstate_t *);

/*
 * DTrace Probe Context Functions
 *
 * These functions are called from probe context.  Because probe context is
 * any context in which C may be called, arbitrary locks may be held,
 * interrupts may be disabled, we may be in arbitrary dispatched state, etc.
 * As a result, functions called from probe context may only call other DTrace
 * support functions -- they may not interact at all with the system at large.
 * (Note that the ASSERT macro is made probe-context safe by redefining it in
 * terms of dtrace_assfail(), a probe-context safe function.) If arbitrary
 * loads are to be performed from probe context, they _must_ be in terms of
 * the safe dtrace_load*() variants.
 *
 * Some functions in this block are not actually called from probe context;
 * for these functions, there will be a comment above the function reading
 * "Note:  not called from probe context."
 */
void
dtrace_panic(const char *format, ...)
{
	va_list alist;

	va_start(alist, format);
	dtrace_vpanic(format, alist);
	va_end(alist);
}

int
dtrace_assfail(const char *a, const char *f, int l)
{
	dtrace_panic("assertion failed: %s, file: %s, line: %d", a, f, l);

	/*
	 * We just need something here that even the most clever compiler
	 * cannot optimize away.
	 */
	return (a[(uintptr_t)f]);
}

/*
 * Atomically increment a specified error counter from probe context.
 */
static void
dtrace_error(uint32_t *counter)
{
	/*
	 * Most counters stored to in probe context are per-CPU counters.
	 * However, there are some error conditions that are sufficiently
	 * arcane that they don't merit per-CPU storage.  If these counters
	 * are incremented concurrently on different CPUs, scalability will be
	 * adversely affected -- but we don't expect them to be white-hot in a
	 * correctly constructed enabling...
	 */
	uint32_t oval, nval;

	do {
		oval = *counter;

		if ((nval = oval + 1) == 0) {
			/*
			 * If the counter would wrap, set it to 1 -- assuring
			 * that the counter is never zero when we have seen
			 * errors.  (The counter must be 32-bits because we
			 * aren't guaranteed a 64-bit compare&swap operation.)
			 * To save this code both the infamy of being fingered
			 * by a priggish news story and the indignity of being
			 * the target of a neo-puritan witch trial, we're
			 * carefully avoiding any colorful description of the
			 * likelihood of this condition -- but suffice it to
			 * say that it is only slightly more likely than the
			 * overflow of predicate cache IDs, as discussed in
			 * dtrace_predicate_create().
			 */
			nval = 1;
		}
	} while (dtrace_cas32(counter, oval, nval) != oval);
}

/*
 * Use the DTRACE_LOADFUNC macro to define functions for each of loading a
 * uint8_t, a uint16_t, a uint32_t and a uint64_t.
 */
DTRACE_LOADFUNC(8)
DTRACE_LOADFUNC(16)
DTRACE_LOADFUNC(32)
DTRACE_LOADFUNC(64)

static int
dtrace_inscratch(uintptr_t dest, size_t size, dtrace_mstate_t *mstate)
{
	if (dest < mstate->dtms_scratch_base)
		return (0);

	if (dest + size < dest)
		return (0);

	if (dest + size > mstate->dtms_scratch_ptr)
		return (0);

	return (1);
}

static int
dtrace_canstore_statvar(uint64_t addr, size_t sz,
    dtrace_statvar_t **svars, int nsvars)
{
	int i;

	for (i = 0; i < nsvars; i++) {
		dtrace_statvar_t *svar = svars[i];

		if (svar == NULL || svar->dtsv_size == 0)
			continue;

		if (DTRACE_INRANGE(addr, sz, svar->dtsv_data, svar->dtsv_size))
			return (1);
	}

	return (0);
}

/*
 * Check to see if the address is within a memory region to which a store may
 * be issued.  This includes the DTrace scratch areas, and any DTrace variable
 * region.  The caller of dtrace_canstore() is responsible for performing any
 * alignment checks that are needed before stores are actually executed.
 */
static int
dtrace_canstore(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
    dtrace_vstate_t *vstate)
{
	/*
	 * First, check to see if the address is in scratch space...
	 */
	if (DTRACE_INRANGE(addr, sz, mstate->dtms_scratch_base,
	    mstate->dtms_scratch_size))
		return (1);

	/*
	 * Now check to see if it's a dynamic variable.  This check will pick
	 * up both thread-local variables and any global dynamically-allocated
	 * variables.
	 */
	if (DTRACE_INRANGE(addr, sz, (uintptr_t)vstate->dtvs_dynvars.dtds_base,
	    vstate->dtvs_dynvars.dtds_size)) {
		dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
		uintptr_t base = (uintptr_t)dstate->dtds_base +
		    (dstate->dtds_hashsize * sizeof (dtrace_dynhash_t));
		uintptr_t chunkoffs;

		/*
		 * Before we assume that we can store here, we need to make
		 * sure that it isn't in our metadata -- storing to our
		 * dynamic variable metadata would corrupt our state.  For
		 * the range to not include any dynamic variable metadata,
		 * it must:
		 *
		 *	(1) Start above the hash table that is at the base of
		 *	the dynamic variable space
		 *
		 *	(2) Have a starting chunk offset that is beyond the
		 *	dtrace_dynvar_t that is at the base of every chunk
		 *
		 *	(3) Not span a chunk boundary
		 *
		 */
		if (addr < base)
			return (0);

		chunkoffs = (addr - base) % dstate->dtds_chunksize;

		if (chunkoffs < sizeof (dtrace_dynvar_t))
			return (0);

		if (chunkoffs + sz > dstate->dtds_chunksize)
			return (0);

		return (1);
	}

	/*
	 * Finally, check the static local and global variables.  These checks
	 * take the longest, so we perform them last.
	 */
	if (dtrace_canstore_statvar(addr, sz,
	    vstate->dtvs_locals, vstate->dtvs_nlocals))
		return (1);

	if (dtrace_canstore_statvar(addr, sz,
	    vstate->dtvs_globals, vstate->dtvs_nglobals))
		return (1);

	return (0);
}


/*
 * Convenience routine to check to see if the address is within a memory
 * region in which a load may be issued given the user's privilege level;
 * if not, it sets the appropriate error flags and loads 'addr' into the
 * illegal value slot.
 *
 * DTrace subroutines (DIF_SUBR_*) should use this helper to implement
 * appropriate memory access protection.
 */
static int
dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
    dtrace_vstate_t *vstate)
{
	volatile uintptr_t *illval = &cpu_core[curcpu].cpuc_dtrace_illval;

	/*
	 * If we hold the privilege to read from kernel memory, then
	 * everything is readable.
	 */
	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
		return (1);

	/*
	 * You can obviously read that which you can store.
	 */
	if (dtrace_canstore(addr, sz, mstate, vstate))
		return (1);

	/*
	 * We're allowed to read from our own string table.
	 */
	if (DTRACE_INRANGE(addr, sz, (uintptr_t)mstate->dtms_difo->dtdo_strtab,
	    mstate->dtms_difo->dtdo_strlen))
		return (1);

	DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV);
	*illval = addr;
	return (0);
}

/*
 * Convenience routine to check to see if a given string is within a memory
 * region in which a load may be issued given the user's privilege level;
 * this exists so that we don't need to issue unnecessary dtrace_strlen()
 * calls in the event that the user has all privileges.
 */
static int
dtrace_strcanload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
    dtrace_vstate_t *vstate)
{
	size_t strsz;

	/*
	 * If we hold the privilege to read from kernel memory, then
	 * everything is readable.
	 */
	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
		return (1);

	strsz = 1 + dtrace_strlen((char *)(uintptr_t)addr, sz);
	if (dtrace_canload(addr, strsz, mstate, vstate))
		return (1);

	return (0);
}

/*
 * Convenience routine to check to see if a given variable is within a memory
 * region in which a load may be issued given the user's privilege level.
 */
static int
dtrace_vcanload(void *src, dtrace_diftype_t *type, dtrace_mstate_t *mstate,
    dtrace_vstate_t *vstate)
{
	size_t sz;
	ASSERT(type->dtdt_flags & DIF_TF_BYREF);

	/*
	 * If we hold the privilege to read from kernel memory, then
	 * everything is readable.
	 */
	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
		return (1);

	if (type->dtdt_kind == DIF_TYPE_STRING)
		sz = dtrace_strlen(src,
		    vstate->dtvs_state->dts_options[DTRACEOPT_STRSIZE]) + 1;
	else
		sz = type->dtdt_size;

	return (dtrace_canload((uintptr_t)src, sz, mstate, vstate));
}

/*
 * Compare two strings using safe loads.
 */
static int
dtrace_strncmp(char *s1, char *s2, size_t limit)
{
	uint8_t c1, c2;
	volatile uint16_t *flags;

	if (s1 == s2 || limit == 0)
		return (0);

	flags = (volatile uint16_t *)&cpu_core[curcpu].cpuc_dtrace_flags;

	do {
		if (s1 == NULL) {
			c1 = '\0';
		} else {
			c1 = dtrace_load8((uintptr_t)s1++);
		}

		if (s2 == NULL) {
			c2 = '\0';
		} else {
			c2 = dtrace_load8((uintptr_t)s2++);
		}

		if (c1 != c2)
			return (c1 - c2);
	} while (--limit && c1 != '\0' && !(*flags & CPU_DTRACE_FAULT));

	return (0);
}

/*
 * Compute strlen(s) for a string using safe memory accesses.  The additional
 * len parameter is used to specify a maximum length to ensure completion.
 */
static size_t
dtrace_strlen(const char *s, size_t lim)
{
	uint_t len;

	for (len = 0; len != lim; len++) {
		if (dtrace_load8((uintptr_t)s++) == '\0')
			break;
	}

	return (len);
}

/*
 * Check if an address falls within a toxic region.
 */
static int
dtrace_istoxic(uintptr_t kaddr, size_t size)
{
	uintptr_t taddr, tsize;
	int i;

	for (i = 0; i < dtrace_toxranges; i++) {
		taddr = dtrace_toxrange[i].dtt_base;
		tsize = dtrace_toxrange[i].dtt_limit - taddr;

		if (kaddr - taddr < tsize) {
			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
			cpu_core[curcpu].cpuc_dtrace_illval = kaddr;
			return (1);
		}

		if (taddr - kaddr < size) {
			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
			cpu_core[curcpu].cpuc_dtrace_illval = taddr;
			return (1);
		}
	}

	return (0);
}

/*
 * Copy src to dst using safe memory accesses.  The src is assumed to be unsafe
 * memory specified by the DIF program.  The dst is assumed to be safe memory
 * that we can store to directly because it is managed by DTrace.  As with
 * standard bcopy, overlapping copies are handled properly.
 */
static void
dtrace_bcopy(const void *src, void *dst, size_t len)
{
	if (len != 0) {
		uint8_t *s1 = dst;
		const uint8_t *s2 = src;

		if (s1 <= s2) {
			do {
				*s1++ = dtrace_load8((uintptr_t)s2++);
			} while (--len != 0);
		} else {
			s2 += len;
			s1 += len;

			do {
				*--s1 = dtrace_load8((uintptr_t)--s2);
			} while (--len != 0);
		}
	}
}

/*
 * Copy src to dst using safe memory accesses, up to either the specified
 * length, or the point that a nul byte is encountered.  The src is assumed to
 * be unsafe memory specified by the DIF program.  The dst is assumed to be
 * safe memory that we can store to directly because it is managed by DTrace.
 * Unlike dtrace_bcopy(), overlapping regions are not handled.
 */
static void
dtrace_strcpy(const void *src, void *dst, size_t len)
{
	if (len != 0) {
		uint8_t *s1 = dst, c;
		const uint8_t *s2 = src;

		do {
			*s1++ = c = dtrace_load8((uintptr_t)s2++);
		} while (--len != 0 && c != '\0');
	}
}

/*
 * Copy src to dst, deriving the size and type from the specified (BYREF)
 * variable type.  The src is assumed to be unsafe memory specified by the DIF
 * program.  The dst is assumed to be DTrace variable memory that is of the
 * specified type; we assume that we can store to directly.
 */
static void
dtrace_vcopy(void *src, void *dst, dtrace_diftype_t *type)
{
	ASSERT(type->dtdt_flags & DIF_TF_BYREF);

	if (type->dtdt_kind == DIF_TYPE_STRING) {
		dtrace_strcpy(src, dst, type->dtdt_size);
	} else {
		dtrace_bcopy(src, dst, type->dtdt_size);
	}
}

/*
 * Compare s1 to s2 using safe memory accesses.  The s1 data is assumed to be
 * unsafe memory specified by the DIF program.  The s2 data is assumed to be
 * safe memory that we can access directly because it is managed by DTrace.
 */
static int
dtrace_bcmp(const void *s1, const void *s2, size_t len)
{
	volatile uint16_t *flags;

	flags = (volatile uint16_t *)&cpu_core[curcpu].cpuc_dtrace_flags;

	if (s1 == s2)
		return (0);

	if (s1 == NULL || s2 == NULL)
		return (1);

	if (s1 != s2 && len != 0) {
		const uint8_t *ps1 = s1;
		const uint8_t *ps2 = s2;

		do {
			if (dtrace_load8((uintptr_t)ps1++) != *ps2++)
				return (1);
		} while (--len != 0 && !(*flags & CPU_DTRACE_FAULT));
	}
	return (0);
}

/*
 * Zero the specified region using a simple byte-by-byte loop.  Note that this
 * is for safe DTrace-managed memory only.
 */
static void
dtrace_bzero(void *dst, size_t len)
{
	uchar_t *cp;

	for (cp = dst; len != 0; len--)
		*cp++ = 0;
}

static void
dtrace_add_128(uint64_t *addend1, uint64_t *addend2, uint64_t *sum)
{
	uint64_t result[2];

	result[0] = addend1[0] + addend2[0];
	result[1] = addend1[1] + addend2[1] +
	    (result[0] < addend1[0] || result[0] < addend2[0] ? 1 : 0);

	sum[0] = result[0];
	sum[1] = result[1];
}

/*
 * Shift the 128-bit value in a by b. If b is positive, shift left.
 * If b is negative, shift right.
 */
static void
dtrace_shift_128(uint64_t *a, int b)
{
	uint64_t mask;

	if (b == 0)
		return;

	if (b < 0) {
		b = -b;
		if (b >= 64) {
			a[0] = a[1] >> (b - 64);
			a[1] = 0;
		} else {
			a[0] >>= b;
			mask = 1LL << (64 - b);
			mask -= 1;
			a[0] |= ((a[1] & mask) << (64 - b));
			a[1] >>= b;
		}
	} else {
		if (b >= 64) {
			a[1] = a[0] << (b - 64);
			a[0] = 0;
		} else {
			a[1] <<= b;
			mask = a[0] >> (64 - b);
			a[1] |= mask;
			a[0] <<= b;
		}
	}
}
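
/*
 * A quick worked example (illustrative only):  shifting { a[0] = 1,
 * a[1] = 0 } by +96 takes the b >= 64 branch and leaves a[1] = 1 << 32,
 * a[0] = 0 -- i.e. 2^96.  Shifting that result by -96 restores a value
 * of 1.
 */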

/*
 * The basic idea is to break the 2 64-bit values into 4 32-bit values,
 * use native multiplication on those, and then re-combine into the
 * resulting 128-bit value.
 *
 * (hi1 << 32 + lo1) * (hi2 << 32 + lo2) =
 *     hi1 * hi2 << 64 +
 *     hi1 * lo2 << 32 +
 *     hi2 * lo1 << 32 +
 *     lo1 * lo2
 */
static void
dtrace_multiply_128(uint64_t factor1, uint64_t factor2, uint64_t *product)
{
	uint64_t hi1, hi2, lo1, lo2;
	uint64_t tmp[2];

	hi1 = factor1 >> 32;
	hi2 = factor2 >> 32;

	lo1 = factor1 & DT_MASK_LO;
	lo2 = factor2 & DT_MASK_LO;

	product[0] = lo1 * lo2;
	product[1] = hi1 * hi2;

	tmp[0] = hi1 * lo2;
	tmp[1] = 0;
	dtrace_shift_128(tmp, 32);
	dtrace_add_128(product, tmp, product);

	tmp[0] = hi2 * lo1;
	tmp[1] = 0;
	dtrace_shift_128(tmp, 32);
	dtrace_add_128(product, tmp, product);
}
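
/*
 * Sanity example:  with factor1 = factor2 = 0xffffffff, both hi words are
 * zero, so product[0] = 0xfffffffe00000001 and product[1] = 0 -- the true
 * 128-bit square.  With factor1 = factor2 = 0x100000000, only hi1 * hi2
 * contributes, giving product[1] = 1, product[0] = 0, i.e. 2^64.
 */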
1144
1145/*
1146 * This privilege check should be used by actions and subroutines to
1147 * verify that the user credentials of the process that enabled the
1148 * invoking ECB match the target credentials
1149 */
1150static int
1151dtrace_priv_proc_common_user(dtrace_state_t *state)
1152{
1153	cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
1154
1155	/*
1156	 * We should always have a non-NULL state cred here, since if cred
1157	 * is null (anonymous tracing), we fast-path bypass this routine.
1158	 */
1159	ASSERT(s_cr != NULL);
1160
1161	if ((cr = CRED()) != NULL &&
1162	    s_cr->cr_uid == cr->cr_uid &&
1163	    s_cr->cr_uid == cr->cr_ruid &&
1164	    s_cr->cr_uid == cr->cr_suid &&
1165	    s_cr->cr_gid == cr->cr_gid &&
1166	    s_cr->cr_gid == cr->cr_rgid &&
1167	    s_cr->cr_gid == cr->cr_sgid)
1168		return (1);
1169
1170	return (0);
1171}
1172
1173/*
1174 * This privilege check should be used by actions and subroutines to
1175 * verify that the zone of the process that enabled the invoking ECB
1176 * matches the target credentials
1177 */
1178static int
1179dtrace_priv_proc_common_zone(dtrace_state_t *state)
1180{
1181#if defined(sun)
1182	cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
1183
1184	/*
1185	 * We should always have a non-NULL state cred here, since if cred
1186	 * is null (anonymous tracing), we fast-path bypass this routine.
1187	 */
1188	ASSERT(s_cr != NULL);
1189
1190	if ((cr = CRED()) != NULL &&
1191	    s_cr->cr_zone == cr->cr_zone)
1192		return (1);
1193
1194	return (0);
1195#else
1196	return (1);
1197#endif
1198}
1199
1200/*
1201 * This privilege check should be used by actions and subroutines to
1202 * verify that the process has not setuid or changed credentials.
1203 */
1204static int
1205dtrace_priv_proc_common_nocd(void)
1206{
1207	proc_t *proc;
1208
1209	if ((proc = ttoproc(curthread)) != NULL &&
1210	    !(proc->p_flag & SNOCD))
1211		return (1);
1212
1213	return (0);
1214}
1215
1216static int
1217dtrace_priv_proc_destructive(dtrace_state_t *state)
1218{
1219	int action = state->dts_cred.dcr_action;
1220
1221	if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE) == 0) &&
1222	    dtrace_priv_proc_common_zone(state) == 0)
1223		goto bad;
1224
1225	if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER) == 0) &&
1226	    dtrace_priv_proc_common_user(state) == 0)
1227		goto bad;
1228
1229	if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG) == 0) &&
1230	    dtrace_priv_proc_common_nocd() == 0)
1231		goto bad;
1232
1233	return (1);
1234
1235bad:
1236	cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1237
1238	return (0);
1239}
1240
1241static int
1242dtrace_priv_proc_control(dtrace_state_t *state)
1243{
1244	if (state->dts_cred.dcr_action & DTRACE_CRA_PROC_CONTROL)
1245		return (1);
1246
1247	if (dtrace_priv_proc_common_zone(state) &&
1248	    dtrace_priv_proc_common_user(state) &&
1249	    dtrace_priv_proc_common_nocd())
1250		return (1);
1251
1252	cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1253
1254	return (0);
1255}
1256
1257static int
1258dtrace_priv_proc(dtrace_state_t *state)
1259{
1260	if (state->dts_cred.dcr_action & DTRACE_CRA_PROC)
1261		return (1);
1262
1263	cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1264
1265	return (0);
1266}
1267
1268static int
1269dtrace_priv_kernel(dtrace_state_t *state)
1270{
1271	if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL)
1272		return (1);
1273
1274	cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
1275
1276	return (0);
1277}
1278
1279static int
1280dtrace_priv_kernel_destructive(dtrace_state_t *state)
1281{
1282	if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL_DESTRUCTIVE)
1283		return (1);
1284
1285	cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
1286
1287	return (0);
1288}
1289
1290/*
1291 * Note:  not called from probe context.  This function is called
1292 * asynchronously (and at a regular interval) from outside of probe context to
1293 * clean the dirty dynamic variable lists on all CPUs.  Dynamic variable
1294 * cleaning is explained in detail in <sys/dtrace_impl.h>.
1295 */
1296void
1297dtrace_dynvar_clean(dtrace_dstate_t *dstate)
1298{
1299	dtrace_dynvar_t *dirty;
1300	dtrace_dstate_percpu_t *dcpu;
1301	int i, work = 0;
1302
1303	for (i = 0; i < NCPU; i++) {
1304		dcpu = &dstate->dtds_percpu[i];
1305
1306		ASSERT(dcpu->dtdsc_rinsing == NULL);
1307
1308		/*
1309		 * If the dirty list is NULL, there is no dirty work to do.
1310		 */
1311		if (dcpu->dtdsc_dirty == NULL)
1312			continue;
1313
1314		/*
1315		 * If the clean list is non-NULL, then we're not going to do
1316		 * any work for this CPU -- it means that there has not been
1317		 * a dtrace_dynvar() allocation on this CPU (or from this CPU)
1318		 * since the last time we cleaned house.
1319		 */
1320		if (dcpu->dtdsc_clean != NULL)
1321			continue;
1322
1323		work = 1;
1324
1325		/*
1326		 * Atomically move the dirty list aside.
1327		 */
1328		do {
1329			dirty = dcpu->dtdsc_dirty;
1330
1331			/*
1332			 * Before we zap the dirty list, set the rinsing list.
1333			 * (This allows for a potential assertion in
1334			 * dtrace_dynvar():  if a free dynamic variable appears
1335			 * on a hash chain, either the dirty list or the
1336			 * rinsing list for some CPU must be non-NULL.)
1337			 */
1338			dcpu->dtdsc_rinsing = dirty;
1339			dtrace_membar_producer();
1340		} while (dtrace_casptr(&dcpu->dtdsc_dirty,
1341		    dirty, NULL) != dirty);
1342	}
1343
1344	if (!work) {
1345		/*
1346		 * We have no work to do; we can simply return.
1347		 */
1348		return;
1349	}
1350
1351	dtrace_sync();
1352
1353	for (i = 0; i < NCPU; i++) {
1354		dcpu = &dstate->dtds_percpu[i];
1355
1356		if (dcpu->dtdsc_rinsing == NULL)
1357			continue;
1358
1359		/*
1360		 * We are now guaranteed that no hash chain contains a pointer
1361		 * into this dirty list; we can make it clean.
1362		 */
1363		ASSERT(dcpu->dtdsc_clean == NULL);
1364		dcpu->dtdsc_clean = dcpu->dtdsc_rinsing;
1365		dcpu->dtdsc_rinsing = NULL;
1366	}
1367
1368	/*
1369	 * Before we actually set the state to be DTRACE_DSTATE_CLEAN, make
1370	 * sure that all CPUs have seen all of the dtdsc_clean pointers.
1371	 * This prevents a race whereby a CPU incorrectly decides that
1372	 * the state should be something other than DTRACE_DSTATE_CLEAN
1373	 * after dtrace_dynvar_clean() has completed.
1374	 */
1375	dtrace_sync();
1376
1377	dstate->dtds_state = DTRACE_DSTATE_CLEAN;
1378}
1379
1380/*
1381 * Depending on the value of the op parameter, this function looks-up,
1382 * allocates or deallocates an arbitrarily-keyed dynamic variable.  If an
1383 * allocation is requested, this function will return a pointer to a
1384 * dtrace_dynvar_t corresponding to the allocated variable -- or NULL if no
1385 * variable can be allocated.  If NULL is returned, the appropriate counter
1386 * will be incremented.
1387 */
1388dtrace_dynvar_t *
1389dtrace_dynvar(dtrace_dstate_t *dstate, uint_t nkeys,
1390    dtrace_key_t *key, size_t dsize, dtrace_dynvar_op_t op,
1391    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
1392{
1393	uint64_t hashval = DTRACE_DYNHASH_VALID;
1394	dtrace_dynhash_t *hash = dstate->dtds_hash;
1395	dtrace_dynvar_t *free, *new_free, *next, *dvar, *start, *prev = NULL;
1396	processorid_t me = curcpu, cpu = me;
1397	dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[me];
1398	size_t bucket, ksize;
1399	size_t chunksize = dstate->dtds_chunksize;
1400	uintptr_t kdata, lock, nstate;
1401	uint_t i;
1402
1403	ASSERT(nkeys != 0);
1404
1405	/*
1406	 * Hash the key.  As with aggregations, we use Jenkins' "One-at-a-time"
1407	 * algorithm.  For the by-value portions, we perform the algorithm in
1408	 * 16-bit chunks (as opposed to 8-bit chunks).  This speeds things up a
1409	 * bit, and seems to have only a minute effect on distribution.  For
1410	 * the by-reference data, we perform "One-at-a-time" iterating (safely)
1411	 * over each referenced byte.  It's painful to do this, but it's much
1412	 * better than pathological hash distribution.  The efficacy of the
1413	 * hashing algorithm (and a comparison with other algorithms) may be
1414	 * found by running the ::dtrace_dynstat MDB dcmd.
1415	 */
1416	for (i = 0; i < nkeys; i++) {
1417		if (key[i].dttk_size == 0) {
1418			uint64_t val = key[i].dttk_value;
1419
1420			hashval += (val >> 48) & 0xffff;
1421			hashval += (hashval << 10);
1422			hashval ^= (hashval >> 6);
1423
1424			hashval += (val >> 32) & 0xffff;
1425			hashval += (hashval << 10);
1426			hashval ^= (hashval >> 6);
1427
1428			hashval += (val >> 16) & 0xffff;
1429			hashval += (hashval << 10);
1430			hashval ^= (hashval >> 6);
1431
1432			hashval += val & 0xffff;
1433			hashval += (hashval << 10);
1434			hashval ^= (hashval >> 6);
1435		} else {
1436			/*
1437			 * This is incredibly painful, but it beats the hell
1438			 * out of the alternative.
1439			 */
1440			uint64_t j, size = key[i].dttk_size;
1441			uintptr_t base = (uintptr_t)key[i].dttk_value;
1442
1443			if (!dtrace_canload(base, size, mstate, vstate))
1444				break;
1445
1446			for (j = 0; j < size; j++) {
1447				hashval += dtrace_load8(base + j);
1448				hashval += (hashval << 10);
1449				hashval ^= (hashval >> 6);
1450			}
1451		}
1452	}
1453
1454	if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_FAULT))
1455		return (NULL);
1456
1457	hashval += (hashval << 3);
1458	hashval ^= (hashval >> 11);
1459	hashval += (hashval << 15);
1460
1461	/*
1462	 * There is a remote chance (ideally, 1 in 2^31) that our hashval
1463	 * comes out to be one of our two sentinel hash values.  If this
1464	 * actually happens, we set the hashval to be a value known to be a
1465	 * non-sentinel value.
1466	 */
1467	if (hashval == DTRACE_DYNHASH_FREE || hashval == DTRACE_DYNHASH_SINK)
1468		hashval = DTRACE_DYNHASH_VALID;
1469
1470	/*
1471	 * Yes, it's painful to do a divide here.  If the cycle count becomes
1472	 * important here, tricks can be pulled to reduce it.  (However, it's
1473	 * critical that hash collisions be kept to an absolute minimum;
1474	 * they're much more painful than a divide.)  It's better to have a
1475	 * solution that generates few collisions and still keeps things
1476	 * relatively simple.
1477	 */
1478	bucket = hashval % dstate->dtds_hashsize;
1479
1480	if (op == DTRACE_DYNVAR_DEALLOC) {
1481		volatile uintptr_t *lockp = &hash[bucket].dtdh_lock;
1482
1483		for (;;) {
1484			while ((lock = *lockp) & 1)
1485				continue;
1486
1487			if (dtrace_casptr((volatile void *)lockp,
1488			    (volatile void *)lock, (volatile void *)(lock + 1)) == (void *)lock)
1489				break;
1490		}
1491
1492		dtrace_membar_producer();
1493	}
1494
1495top:
1496	prev = NULL;
1497	lock = hash[bucket].dtdh_lock;
1498
1499	dtrace_membar_consumer();
1500
1501	start = hash[bucket].dtdh_chain;
1502	ASSERT(start != NULL && (start->dtdv_hashval == DTRACE_DYNHASH_SINK ||
1503	    start->dtdv_hashval != DTRACE_DYNHASH_FREE ||
1504	    op != DTRACE_DYNVAR_DEALLOC));
1505
1506	for (dvar = start; dvar != NULL; dvar = dvar->dtdv_next) {
1507		dtrace_tuple_t *dtuple = &dvar->dtdv_tuple;
1508		dtrace_key_t *dkey = &dtuple->dtt_key[0];
1509
1510		if (dvar->dtdv_hashval != hashval) {
1511			if (dvar->dtdv_hashval == DTRACE_DYNHASH_SINK) {
1512				/*
1513				 * We've reached the sink, and therefore the
1514				 * end of the hash chain; we can kick out of
1515				 * the loop knowing that we have seen a valid
1516				 * snapshot of state.
1517				 */
1518				ASSERT(dvar->dtdv_next == NULL);
1519				ASSERT(dvar == &dtrace_dynhash_sink);
1520				break;
1521			}
1522
1523			if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE) {
1524				/*
1525				 * We've gone off the rails:  somewhere along
1526				 * the line, one of the members of this hash
1527				 * chain was deleted.  Note that we could also
1528				 * detect this by simply letting this loop run
1529				 * to completion, as we would eventually hit
1530				 * the end of the dirty list.  However, we
1531				 * want to avoid running the length of the
1532				 * dirty list unnecessarily (it might be quite
1533				 * long), so we catch this as early as
1534				 * possible by detecting the hash marker.  In
1535				 * this case, we simply set dvar to NULL and
1536				 * break; the conditional after the loop will
1537				 * send us back to top.
1538				 */
1539				dvar = NULL;
1540				break;
1541			}
1542
1543			goto next;
1544		}
1545
1546		if (dtuple->dtt_nkeys != nkeys)
1547			goto next;
1548
1549		for (i = 0; i < nkeys; i++, dkey++) {
1550			if (dkey->dttk_size != key[i].dttk_size)
1551				goto next; /* size or type mismatch */
1552
1553			if (dkey->dttk_size != 0) {
1554				if (dtrace_bcmp(
1555				    (void *)(uintptr_t)key[i].dttk_value,
1556				    (void *)(uintptr_t)dkey->dttk_value,
1557				    dkey->dttk_size))
1558					goto next;
1559			} else {
1560				if (dkey->dttk_value != key[i].dttk_value)
1561					goto next;
1562			}
1563		}
1564
1565		if (op != DTRACE_DYNVAR_DEALLOC)
1566			return (dvar);
1567
1568		ASSERT(dvar->dtdv_next == NULL ||
1569		    dvar->dtdv_next->dtdv_hashval != DTRACE_DYNHASH_FREE);
1570
1571		if (prev != NULL) {
1572			ASSERT(hash[bucket].dtdh_chain != dvar);
1573			ASSERT(start != dvar);
1574			ASSERT(prev->dtdv_next == dvar);
1575			prev->dtdv_next = dvar->dtdv_next;
1576		} else {
1577			if (dtrace_casptr(&hash[bucket].dtdh_chain,
1578			    start, dvar->dtdv_next) != start) {
1579				/*
1580				 * We have failed to atomically swing the
1581				 * hash table head pointer, presumably because
1582				 * of a conflicting allocation on another CPU.
1583				 * We need to reread the hash chain and try
1584				 * again.
1585				 */
1586				goto top;
1587			}
1588		}
1589
1590		dtrace_membar_producer();
1591
1592		/*
1593		 * Now set the hash value to indicate that it's free.
1594		 */
1595		ASSERT(hash[bucket].dtdh_chain != dvar);
1596		dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
1597
1598		dtrace_membar_producer();
1599
1600		/*
1601		 * Set the next pointer to point at the dirty list, and
1602		 * atomically swing the dirty pointer to the newly freed dvar.
1603		 */
1604		do {
1605			next = dcpu->dtdsc_dirty;
1606			dvar->dtdv_next = next;
1607		} while (dtrace_casptr(&dcpu->dtdsc_dirty, next, dvar) != next);
1608
1609		/*
1610		 * Finally, unlock this hash bucket.
1611		 */
1612		ASSERT(hash[bucket].dtdh_lock == lock);
1613		ASSERT(lock & 1);
1614		hash[bucket].dtdh_lock++;
1615
1616		return (NULL);
1617next:
1618		prev = dvar;
1619		continue;
1620	}
1621
1622	if (dvar == NULL) {
1623		/*
1624		 * If dvar is NULL, it is because we went off the rails:
1625		 * one of the elements that we traversed in the hash chain
1626		 * was deleted while we were traversing it.  In this case,
1627		 * we assert that we aren't doing a dealloc (deallocs lock
1628		 * the hash bucket to prevent themselves from racing with
1629		 * one another), and retry the hash chain traversal.
1630		 */
1631		ASSERT(op != DTRACE_DYNVAR_DEALLOC);
1632		goto top;
1633	}
1634
1635	if (op != DTRACE_DYNVAR_ALLOC) {
1636		/*
1637		 * If we are not to allocate a new variable, we want to
1638		 * return NULL now.  Before we return, check that the value
1639		 * of the lock word hasn't changed.  If it has, we may have
1640		 * seen an inconsistent snapshot.
1641		 */
1642		if (op == DTRACE_DYNVAR_NOALLOC) {
1643			if (hash[bucket].dtdh_lock != lock)
1644				goto top;
1645		} else {
1646			ASSERT(op == DTRACE_DYNVAR_DEALLOC);
1647			ASSERT(hash[bucket].dtdh_lock == lock);
1648			ASSERT(lock & 1);
1649			hash[bucket].dtdh_lock++;
1650		}
1651
1652		return (NULL);
1653	}
1654
1655	/*
1656	 * We need to allocate a new dynamic variable.  The size we need is the
1657	 * size of dtrace_dynvar plus the size of nkeys dtrace_key_t's plus the
1658	 * size of any auxiliary key data (rounded up to 8-byte alignment) plus
1659	 * the size of any referred-to data (dsize).  We then round the final
1660	 * size up to the chunksize for allocation.
1661	 */
1662	for (ksize = 0, i = 0; i < nkeys; i++)
1663		ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
1664
1665	/*
1666	 * This should be pretty much impossible, but could happen if, say,
1667	 * strange DIF specified the tuple.  Ideally, this should be an
1668	 * assertion and not an error condition -- but that requires that the
1669	 * chunksize calculation in dtrace_difo_chunksize() be absolutely
1670	 * bullet-proof.  (That is, it must not be able to be fooled by
1671	 * malicious DIF.)  Given the lack of backwards branches in DIF,
1672	 * solving this would presumably not amount to solving the Halting
1673	 * Problem -- but it still seems awfully hard.
1674	 */
1675	if (sizeof (dtrace_dynvar_t) + sizeof (dtrace_key_t) * (nkeys - 1) +
1676	    ksize + dsize > chunksize) {
1677		dcpu->dtdsc_drops++;
1678		return (NULL);
1679	}
1680
1681	nstate = DTRACE_DSTATE_EMPTY;
1682
1683	do {
1684retry:
1685		free = dcpu->dtdsc_free;
1686
1687		if (free == NULL) {
1688			dtrace_dynvar_t *clean = dcpu->dtdsc_clean;
1689			void *rval;
1690
1691			if (clean == NULL) {
1692				/*
1693				 * We're out of dynamic variable space on
1694				 * this CPU.  Unless we have tried all CPUs,
1695				 * we'll try to allocate from a different
1696				 * CPU.
1697				 */
1698				switch (dstate->dtds_state) {
1699				case DTRACE_DSTATE_CLEAN: {
1700					void *sp = &dstate->dtds_state;
1701
1702					if (++cpu >= NCPU)
1703						cpu = 0;
1704
1705					if (dcpu->dtdsc_dirty != NULL &&
1706					    nstate == DTRACE_DSTATE_EMPTY)
1707						nstate = DTRACE_DSTATE_DIRTY;
1708
1709					if (dcpu->dtdsc_rinsing != NULL)
1710						nstate = DTRACE_DSTATE_RINSING;
1711
1712					dcpu = &dstate->dtds_percpu[cpu];
1713
1714					if (cpu != me)
1715						goto retry;
1716
1717					(void) dtrace_cas32(sp,
1718					    DTRACE_DSTATE_CLEAN, nstate);
1719
1720					/*
1721					 * To increment the correct bean
1722					 * counter, take another lap.
1723					 */
1724					goto retry;
1725				}
1726
1727				case DTRACE_DSTATE_DIRTY:
1728					dcpu->dtdsc_dirty_drops++;
1729					break;
1730
1731				case DTRACE_DSTATE_RINSING:
1732					dcpu->dtdsc_rinsing_drops++;
1733					break;
1734
1735				case DTRACE_DSTATE_EMPTY:
1736					dcpu->dtdsc_drops++;
1737					break;
1738				}
1739
1740				DTRACE_CPUFLAG_SET(CPU_DTRACE_DROP);
1741				return (NULL);
1742			}
1743
1744			/*
1745			 * The clean list appears to be non-empty.  We want to
1746			 * move the clean list to the free list; we start by
1747			 * moving the clean pointer aside.
1748			 */
1749			if (dtrace_casptr(&dcpu->dtdsc_clean,
1750			    clean, NULL) != clean) {
1751				/*
1752				 * We are in one of two situations:
1753				 *
1754				 *  (a)	The clean list was switched to the
1755				 *	free list by another CPU.
1756				 *
1757				 *  (b)	The clean list was added to by the
1758				 *	cleansing cyclic.
1759				 *
1760				 * In either of these situations, we can
1761				 * just reattempt the free list allocation.
1762				 */
1763				goto retry;
1764			}
1765
1766			ASSERT(clean->dtdv_hashval == DTRACE_DYNHASH_FREE);
1767
1768			/*
1769			 * Now we'll move the clean list to the free list.
1770			 * It's impossible for this to fail:  the only way
1771			 * the free list can be updated is through this
1772			 * code path, and only one CPU can own the clean list.
1773			 * Thus, it would only be possible for this to fail if
1774			 * this code were racing with dtrace_dynvar_clean().
1775			 * (That is, if dtrace_dynvar_clean() updated the clean
1776			 * list, and we ended up racing to update the free
1777			 * list.)  This race is prevented by the dtrace_sync()
1778			 * in dtrace_dynvar_clean() -- which flushes the
1779			 * owners of the clean lists out before resetting
1780			 * the clean lists.
1781			 */
1782			rval = dtrace_casptr(&dcpu->dtdsc_free, NULL, clean);
1783			ASSERT(rval == NULL);
1784			goto retry;
1785		}
1786
1787		dvar = free;
1788		new_free = dvar->dtdv_next;
1789	} while (dtrace_casptr(&dcpu->dtdsc_free, free, new_free) != free);
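	/*
	 * The loop above is a conventional lock-free list pop:  snapshot
	 * the free list head, compute its successor, and compare-and-swap
	 * the head forward -- retrying (after possibly refilling the free
	 * list from the clean list) whenever another CPU wins the race.
	 */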
1790
1791	/*
1792	 * We have now allocated a new chunk.  We copy the tuple keys into the
1793	 * tuple array and copy any referenced key data into the data space
1794	 * following the tuple array.  As we do this, we relocate dttk_value
1795	 * in the final tuple to point to the key data address in the chunk.
1796	 */
1797	kdata = (uintptr_t)&dvar->dtdv_tuple.dtt_key[nkeys];
1798	dvar->dtdv_data = (void *)(kdata + ksize);
1799	dvar->dtdv_tuple.dtt_nkeys = nkeys;
1800
1801	for (i = 0; i < nkeys; i++) {
1802		dtrace_key_t *dkey = &dvar->dtdv_tuple.dtt_key[i];
1803		size_t kesize = key[i].dttk_size;
1804
1805		if (kesize != 0) {
1806			dtrace_bcopy(
1807			    (const void *)(uintptr_t)key[i].dttk_value,
1808			    (void *)kdata, kesize);
1809			dkey->dttk_value = kdata;
1810			kdata += P2ROUNDUP(kesize, sizeof (uint64_t));
1811		} else {
1812			dkey->dttk_value = key[i].dttk_value;
1813		}
1814
1815		dkey->dttk_size = kesize;
1816	}
1817
1818	ASSERT(dvar->dtdv_hashval == DTRACE_DYNHASH_FREE);
1819	dvar->dtdv_hashval = hashval;
1820	dvar->dtdv_next = start;
1821
1822	if (dtrace_casptr(&hash[bucket].dtdh_chain, start, dvar) == start)
1823		return (dvar);
1824
1825	/*
1826	 * The cas has failed.  Either another CPU is adding an element to
1827	 * this hash chain, or another CPU is deleting an element from this
1828	 * hash chain.  The simplest way to deal with both of these cases
1829	 * (though not necessarily the most efficient) is to free our
1830	 * allocated block and tail-call ourselves.  Note that the free is
1831	 * to the dirty list and _not_ to the free list.  This is to prevent
1832	 * races with allocators, above.
1833	 */
1834	dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
1835
1836	dtrace_membar_producer();
1837
1838	do {
1839		free = dcpu->dtdsc_dirty;
1840		dvar->dtdv_next = free;
1841	} while (dtrace_casptr(&dcpu->dtdsc_dirty, free, dvar) != free);
1842
1843	return (dtrace_dynvar(dstate, nkeys, key, dsize, op, mstate, vstate));
1844}
1845
1846/*ARGSUSED*/
1847static void
1848dtrace_aggregate_min(uint64_t *oval, uint64_t nval, uint64_t arg)
1849{
1850	if ((int64_t)nval < (int64_t)*oval)
1851		*oval = nval;
1852}
1853
1854/*ARGSUSED*/
1855static void
1856dtrace_aggregate_max(uint64_t *oval, uint64_t nval, uint64_t arg)
1857{
1858	if ((int64_t)nval > (int64_t)*oval)
1859		*oval = nval;
1860}
1861
1862static void
1863dtrace_aggregate_quantize(uint64_t *quanta, uint64_t nval, uint64_t incr)
1864{
1865	int i, zero = DTRACE_QUANTIZE_ZEROBUCKET;
1866	int64_t val = (int64_t)nval;
1867
1868	if (val < 0) {
1869		for (i = 0; i < zero; i++) {
1870			if (val <= DTRACE_QUANTIZE_BUCKETVAL(i)) {
1871				quanta[i] += incr;
1872				return;
1873			}
1874		}
1875	} else {
1876		for (i = zero + 1; i < DTRACE_QUANTIZE_NBUCKETS; i++) {
1877			if (val < DTRACE_QUANTIZE_BUCKETVAL(i)) {
1878				quanta[i - 1] += incr;
1879				return;
1880			}
1881		}
1882
1883		quanta[DTRACE_QUANTIZE_NBUCKETS - 1] += incr;
1884		return;
1885	}
1886
1887	ASSERT(0);
1888}
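/*
 * An illustrative example of the bucketing above (the value is assumed, and
 * the bucket values follow the DTRACE_QUANTIZE_BUCKETVAL() definition in
 * <sys/dtrace.h>):  the buckets take the values ..., -4, -2, -1, 0, 1, 2,
 * 4, 8, ...; a positive value of 7 ends the upward scan at the first bucket
 * value that exceeds it (8), and increments the preceding bucket -- the one
 * labelled 4, which thus counts values in the interval [4, 8).
 */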
1889
1890static void
1891dtrace_aggregate_lquantize(uint64_t *lquanta, uint64_t nval, uint64_t incr)
1892{
1893	uint64_t arg = *lquanta++;
1894	int32_t base = DTRACE_LQUANTIZE_BASE(arg);
1895	uint16_t step = DTRACE_LQUANTIZE_STEP(arg);
1896	uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg);
1897	int32_t val = (int32_t)nval, level;
1898
1899	ASSERT(step != 0);
1900	ASSERT(levels != 0);
1901
1902	if (val < base) {
1903		/*
1904		 * This is an underflow.
1905		 */
1906		lquanta[0] += incr;
1907		return;
1908	}
1909
1910	level = (val - base) / step;
1911
1912	if (level < levels) {
1913		lquanta[level + 1] += incr;
1914		return;
1915	}
1916
1917	/*
1918	 * This is an overflow.
1919	 */
1920	lquanta[levels + 1] += incr;
1921}
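/*
 * A worked example of the linear quantization above (parameters assumed):
 * for base = 0, step = 10 and levels = 10, a value of 37 yields
 * level = (37 - 0) / 10 = 3 and increments lquanta[4] -- slot 0 being the
 * underflow bucket (values below 0) and slot levels + 1 (11) being the
 * overflow bucket (values of 100 and above).
 */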
1922
1923static int
1924dtrace_aggregate_llquantize_bucket(uint16_t factor, uint16_t low,
1925    uint16_t high, uint16_t nsteps, int64_t value)
1926{
1927	int64_t this = 1, last, next;
1928	int base = 1, order;
1929
1930	ASSERT(factor <= nsteps);
1931	ASSERT(nsteps % factor == 0);
1932
1933	for (order = 0; order < low; order++)
1934		this *= factor;
1935
1936	/*
1937	 * If our value is less than our factor taken to the power of the
1938	 * low order of magnitude, it goes into the zeroth bucket.
1939	 */
1940	if (value < (last = this))
1941		return (0);
1942
1943	for (this *= factor; order <= high; order++) {
1944		int nbuckets = this > nsteps ? nsteps : this;
1945
1946		if ((next = this * factor) < this) {
1947			/*
1948			 * We should not generally get log/linear quantizations
1949			 * with a high magnitude that allows 64 bits to
1950			 * overflow, but we nonetheless protect against this
1951			 * by explicitly checking for overflow, and clamping
1952			 * our value accordingly.
1953			 */
1954			value = this - 1;
1955		}
1956
1957		if (value < this) {
1958			/*
1959			 * If our value lies within this order of magnitude,
1960			 * determine its position by taking the offset within
1961			 * the order of magnitude, dividing by the bucket
1962			 * width, and adding to our (accumulated) base.
1963			 */
1964			return (base + (value - last) / (this / nbuckets));
1965		}
1966
1967		base += nbuckets - (nbuckets / factor);
1968		last = this;
1969		this = next;
1970	}
1971
1972	/*
1973	 * Our value is greater than or equal to our factor taken to the
1974	 * power of one plus the high magnitude -- return the top bucket.
1975	 */
1976	return (base);
1977}
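/*
 * A worked example (parameters assumed):  for factor = 10, low = 0,
 * high = 2 and nsteps = 10, a value of 42 is bucketed as follows.  It is
 * not below factor^low (1), so it escapes the zeroth bucket.  In the
 * order-0 pass (last = 1, this = 10), 42 is not below 10, so base advances
 * by 10 - 10 / 10 = 9 from 1 to 10.  In the order-1 pass (last = 10,
 * this = 100, bucket width 100 / 10 = 10), 42 is below 100, so the
 * function returns 10 + (42 - 10) / 10 = bucket 13.
 */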
1978
1979static void
1980dtrace_aggregate_llquantize(uint64_t *llquanta, uint64_t nval, uint64_t incr)
1981{
1982	uint64_t arg = *llquanta++;
1983	uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(arg);
1984	uint16_t low = DTRACE_LLQUANTIZE_LOW(arg);
1985	uint16_t high = DTRACE_LLQUANTIZE_HIGH(arg);
1986	uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(arg);
1987
1988	llquanta[dtrace_aggregate_llquantize_bucket(factor,
1989	    low, high, nsteps, nval)] += incr;
1990}
1991
1992/*ARGSUSED*/
1993static void
1994dtrace_aggregate_avg(uint64_t *data, uint64_t nval, uint64_t arg)
1995{
1996	data[0]++;
1997	data[1] += nval;
1998}
1999
2000/*ARGSUSED*/
2001static void
2002dtrace_aggregate_stddev(uint64_t *data, uint64_t nval, uint64_t arg)
2003{
2004	int64_t snval = (int64_t)nval;
2005	uint64_t tmp[2];
2006
2007	data[0]++;
2008	data[1] += nval;
2009
2010	/*
2011	 * What we want to say here is:
2012	 *
2013	 * data[2] += nval * nval;
2014	 *
2015	 * But given that nval is 64-bit, we could easily overflow, so
2016	 * we do this as 128-bit arithmetic.
2017	 */
2018	if (snval < 0)
2019		snval = -snval;
2020
2021	dtrace_multiply_128((uint64_t)snval, (uint64_t)snval, tmp);
2022	dtrace_add_128(data + 2, tmp, data + 2);
2023}
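/*
 * The running state maintained above -- data[0] = n, data[1] = sum(x) and
 * data[2..3] = sum(x * x) kept as a 128-bit quantity -- suffices for the
 * consumer to recover the standard deviation as
 * sqrt(sum(x * x) / n - (sum(x) / n)^2); only the accumulation needs to
 * happen in probe context.
 */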
2024
2025/*ARGSUSED*/
2026static void
2027dtrace_aggregate_count(uint64_t *oval, uint64_t nval, uint64_t arg)
2028{
2029	*oval = *oval + 1;
2030}
2031
2032/*ARGSUSED*/
2033static void
2034dtrace_aggregate_sum(uint64_t *oval, uint64_t nval, uint64_t arg)
2035{
2036	*oval += nval;
2037}
2038
2039/*
2040 * Aggregate given the tuple in the principal data buffer, and the aggregating
2041 * action denoted by the specified dtrace_aggregation_t.  The aggregation
2042 * buffer is specified as the buf parameter.  This routine does not return
2043 * failure; if there is no space in the aggregation buffer, the data will be
2044 * dropped, and a corresponding counter incremented.
2045 */
2046static void
2047dtrace_aggregate(dtrace_aggregation_t *agg, dtrace_buffer_t *dbuf,
2048    intptr_t offset, dtrace_buffer_t *buf, uint64_t expr, uint64_t arg)
2049{
2050	dtrace_recdesc_t *rec = &agg->dtag_action.dta_rec;
2051	uint32_t i, ndx, size, fsize;
2052	uint32_t align = sizeof (uint64_t) - 1;
2053	dtrace_aggbuffer_t *agb;
2054	dtrace_aggkey_t *key;
2055	uint32_t hashval = 0, limit, isstr;
2056	caddr_t tomax, data, kdata;
2057	dtrace_actkind_t action;
2058	dtrace_action_t *act;
2059	uintptr_t offs;
2060
2061	if (buf == NULL)
2062		return;
2063
2064	if (!agg->dtag_hasarg) {
2065		/*
2066		 * Currently, only quantize() and lquantize() take additional
2067		 * arguments, and they have the same semantics:  an increment
2068		 * value that defaults to 1 when not present.  If additional
2069		 * aggregating actions take arguments, the setting of the
2070		 * default argument value will presumably have to become more
2071		 * sophisticated...
2072		 */
2073		arg = 1;
2074	}
2075
2076	action = agg->dtag_action.dta_kind - DTRACEACT_AGGREGATION;
2077	size = rec->dtrd_offset - agg->dtag_base;
2078	fsize = size + rec->dtrd_size;
2079
2080	ASSERT(dbuf->dtb_tomax != NULL);
2081	data = dbuf->dtb_tomax + offset + agg->dtag_base;
2082
2083	if ((tomax = buf->dtb_tomax) == NULL) {
2084		dtrace_buffer_drop(buf);
2085		return;
2086	}
2087
2088	/*
2089	 * The metastructure is always at the bottom of the buffer.
2090	 */
2091	agb = (dtrace_aggbuffer_t *)(tomax + buf->dtb_size -
2092	    sizeof (dtrace_aggbuffer_t));
2093
2094	if (buf->dtb_offset == 0) {
2095		/*
2096		 * We just kludge up approximately 1/8th of the size to be
2097		 * buckets.  If this guess ends up being routinely
2098		 * off-the-mark, we may need to dynamically readjust this
2099		 * based on past performance.
2100		 */
2101		uintptr_t hashsize = (buf->dtb_size >> 3) / sizeof (uintptr_t);
2102
2103		if ((uintptr_t)agb - hashsize * sizeof (dtrace_aggkey_t *) <
2104		    (uintptr_t)tomax || hashsize == 0) {
2105			/*
2106			 * We've been given a ludicrously small buffer;
2107			 * increment our drop count and leave.
2108			 */
2109			dtrace_buffer_drop(buf);
2110			return;
2111		}
2112
2113		/*
2114		 * And now, a pathetic attempt to get an odd (or perchance,
2115		 * a prime) hash size for better hash distribution.
2116		 */
2117		if (hashsize > (DTRACE_AGGHASHSIZE_SLEW << 3))
2118			hashsize -= DTRACE_AGGHASHSIZE_SLEW;
2119
2120		agb->dtagb_hashsize = hashsize;
2121		agb->dtagb_hash = (dtrace_aggkey_t **)((uintptr_t)agb -
2122		    agb->dtagb_hashsize * sizeof (dtrace_aggkey_t *));
2123		agb->dtagb_free = (uintptr_t)agb->dtagb_hash;
2124
2125		for (i = 0; i < agb->dtagb_hashsize; i++)
2126			agb->dtagb_hash[i] = NULL;
2127	}
2128
2129	ASSERT(agg->dtag_first != NULL);
2130	ASSERT(agg->dtag_first->dta_intuple);
2131
2132	/*
2133	 * Calculate the hash value based on the key.  Note that we _don't_
2134	 * include the aggid in the hashing (but we will store it as part of
2135	 * the key).  The hashing algorithm is Bob Jenkins' "One-at-a-time"
2136	 * algorithm: a simple, quick algorithm that has no known funnels, and
2137	 * gets good distribution in practice.  The efficacy of the hashing
2138	 * algorithm (and a comparison with other algorithms) may be found by
2139	 * running the ::dtrace_aggstat MDB dcmd.
2140	 */
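	/*
	 * In outline, the algorithm implemented below is:
	 *
	 *	hash = 0;
	 *	for (each input byte b) {
	 *		hash += b;
	 *		hash += (hash << 10);
	 *		hash ^= (hash >> 6);
	 *	}
	 *	hash += (hash << 3);
	 *	hash ^= (hash >> 11);
	 *	hash += (hash << 15);
	 */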
2141	for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2142		i = act->dta_rec.dtrd_offset - agg->dtag_base;
2143		limit = i + act->dta_rec.dtrd_size;
2144		ASSERT(limit <= size);
2145		isstr = DTRACEACT_ISSTRING(act);
2146
2147		for (; i < limit; i++) {
2148			hashval += data[i];
2149			hashval += (hashval << 10);
2150			hashval ^= (hashval >> 6);
2151
2152			if (isstr && data[i] == '\0')
2153				break;
2154		}
2155	}
2156
2157	hashval += (hashval << 3);
2158	hashval ^= (hashval >> 11);
2159	hashval += (hashval << 15);
2160
2161	/*
2162	 * Yes, the divide here is expensive -- but it's generally the least
2163	 * of the performance issues given the amount of data that we iterate
2164	 * over to compute hash values, compare data, etc.
2165	 */
2166	ndx = hashval % agb->dtagb_hashsize;
2167
2168	for (key = agb->dtagb_hash[ndx]; key != NULL; key = key->dtak_next) {
2169		ASSERT((caddr_t)key >= tomax);
2170		ASSERT((caddr_t)key < tomax + buf->dtb_size);
2171
2172		if (hashval != key->dtak_hashval || key->dtak_size != size)
2173			continue;
2174
2175		kdata = key->dtak_data;
2176		ASSERT(kdata >= tomax && kdata < tomax + buf->dtb_size);
2177
2178		for (act = agg->dtag_first; act->dta_intuple;
2179		    act = act->dta_next) {
2180			i = act->dta_rec.dtrd_offset - agg->dtag_base;
2181			limit = i + act->dta_rec.dtrd_size;
2182			ASSERT(limit <= size);
2183			isstr = DTRACEACT_ISSTRING(act);
2184
2185			for (; i < limit; i++) {
2186				if (kdata[i] != data[i])
2187					goto next;
2188
2189				if (isstr && data[i] == '\0')
2190					break;
2191			}
2192		}
2193
2194		if (action != key->dtak_action) {
2195			/*
2196			 * We are aggregating on the same value in the same
2197			 * aggregation with two different aggregating actions.
2198			 * (This should have been picked up in the compiler,
2199			 * so we may be dealing with errant or devious DIF.)
2200			 * This is an error condition; we indicate as much,
2201			 * and return.
2202			 */
2203			DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
2204			return;
2205		}
2206
2207		/*
2208		 * This is a hit:  we need to apply the aggregator to
2209		 * the value at this key.
2210		 */
2211		agg->dtag_aggregate((uint64_t *)(kdata + size), expr, arg);
2212		return;
2213next:
2214		continue;
2215	}
2216
2217	/*
2218	 * We didn't find it.  We need to allocate some zero-filled space,
2219	 * link it into the hash table appropriately, and apply the aggregator
2220	 * to the (zero-filled) value.
2221	 */
2222	offs = buf->dtb_offset;
2223	while (offs & (align - 1))
2224		offs += sizeof (uint32_t);
2225
2226	/*
2227	 * If we don't have enough room to both allocate a new key _and_
2228	 * its associated data, increment the drop count and return.
2229	 */
2230	if ((uintptr_t)tomax + offs + fsize >
2231	    agb->dtagb_free - sizeof (dtrace_aggkey_t)) {
2232		dtrace_buffer_drop(buf);
2233		return;
2234	}
2235
2236	/*CONSTCOND*/
2237	ASSERT(!(sizeof (dtrace_aggkey_t) & (sizeof (uintptr_t) - 1)));
2238	key = (dtrace_aggkey_t *)(agb->dtagb_free - sizeof (dtrace_aggkey_t));
2239	agb->dtagb_free -= sizeof (dtrace_aggkey_t);
2240
2241	key->dtak_data = kdata = tomax + offs;
2242	buf->dtb_offset = offs + fsize;
2243
2244	/*
2245	 * Now copy the data across.
2246	 */
2247	*((dtrace_aggid_t *)kdata) = agg->dtag_id;
2248
2249	for (i = sizeof (dtrace_aggid_t); i < size; i++)
2250		kdata[i] = data[i];
2251
2252	/*
2253	 * Because strings are not zeroed out by default, we need to iterate
2254	 * looking for actions that store strings, and we need to explicitly
2255	 * pad these strings out with zeroes.
2256	 */
2257	for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2258		int nul;
2259
2260		if (!DTRACEACT_ISSTRING(act))
2261			continue;
2262
2263		i = act->dta_rec.dtrd_offset - agg->dtag_base;
2264		limit = i + act->dta_rec.dtrd_size;
2265		ASSERT(limit <= size);
2266
2267		for (nul = 0; i < limit; i++) {
2268			if (nul) {
2269				kdata[i] = '\0';
2270				continue;
2271			}
2272
2273			if (data[i] != '\0')
2274				continue;
2275
2276			nul = 1;
2277		}
2278	}
2279
2280	for (i = size; i < fsize; i++)
2281		kdata[i] = 0;
2282
2283	key->dtak_hashval = hashval;
2284	key->dtak_size = size;
2285	key->dtak_action = action;
2286	key->dtak_next = agb->dtagb_hash[ndx];
2287	agb->dtagb_hash[ndx] = key;
2288
2289	/*
2290	 * Finally, apply the aggregator.
2291	 */
2292	*((uint64_t *)(key->dtak_data + size)) = agg->dtag_initial;
2293	agg->dtag_aggregate((uint64_t *)(key->dtak_data + size), expr, arg);
2294}
2295
2296/*
2297 * Given consumer state, this routine finds a speculation in the INACTIVE
2298 * state and transitions it into the ACTIVE state.  If there is no speculation
2299 * in the INACTIVE state, 0 is returned.  In this case, no error counter is
2300 * incremented -- it is up to the caller to take appropriate action.
2301 */
2302static int
2303dtrace_speculation(dtrace_state_t *state)
2304{
2305	int i = 0;
2306	dtrace_speculation_state_t current;
2307	uint32_t *stat = &state->dts_speculations_unavail, count;
2308
2309	while (i < state->dts_nspeculations) {
2310		dtrace_speculation_t *spec = &state->dts_speculations[i];
2311
2312		current = spec->dtsp_state;
2313
2314		if (current != DTRACESPEC_INACTIVE) {
2315			if (current == DTRACESPEC_COMMITTINGMANY ||
2316			    current == DTRACESPEC_COMMITTING ||
2317			    current == DTRACESPEC_DISCARDING)
2318				stat = &state->dts_speculations_busy;
2319			i++;
2320			continue;
2321		}
2322
2323		if (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2324		    current, DTRACESPEC_ACTIVE) == current)
2325			return (i + 1);
2326	}
2327
2328	/*
2329	 * We couldn't find a speculation.  If we found as much as a single
2330	 * busy speculation buffer, we'll attribute this failure as "busy"
2331	 * instead of "unavail".
2332	 */
2333	do {
2334		count = *stat;
2335	} while (dtrace_cas32(stat, count, count + 1) != count);
2336
2337	return (0);
2338}
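/*
 * From the D program's perspective (an illustrative outline, not code from
 * this file):  the speculation() action obtains an identifier via the
 * routine above, speculate(id) directs a clause's records into that
 * speculation's per-CPU buffer, and a subsequent commit(id) or discard(id)
 * resolves the speculation through the routines that follow.
 */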
2339
2340/*
2341 * This routine commits an active speculation.  If the specified speculation
2342 * is not in a valid state to perform a commit(), this routine will silently do
2343 * nothing.  The state of the specified speculation is transitioned according
2344 * to the state transition diagram outlined in <sys/dtrace_impl.h>.
2345 */
2346static void
2347dtrace_speculation_commit(dtrace_state_t *state, processorid_t cpu,
2348    dtrace_specid_t which)
2349{
2350	dtrace_speculation_t *spec;
2351	dtrace_buffer_t *src, *dest;
2352	uintptr_t daddr, saddr, dlimit, slimit;
2353	dtrace_speculation_state_t current, new = 0;
2354	intptr_t offs;
2355	uint64_t timestamp;
2356
2357	if (which == 0)
2358		return;
2359
2360	if (which > state->dts_nspeculations) {
2361		cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2362		return;
2363	}
2364
2365	spec = &state->dts_speculations[which - 1];
2366	src = &spec->dtsp_buffer[cpu];
2367	dest = &state->dts_buffer[cpu];
2368
2369	do {
2370		current = spec->dtsp_state;
2371
2372		if (current == DTRACESPEC_COMMITTINGMANY)
2373			break;
2374
2375		switch (current) {
2376		case DTRACESPEC_INACTIVE:
2377		case DTRACESPEC_DISCARDING:
2378			return;
2379
2380		case DTRACESPEC_COMMITTING:
2381			/*
2382			 * This is only possible if we are (a) commit()'ing
2383			 * without having done a prior speculate() on this CPU
2384			 * and (b) racing with another commit() on a different
2385			 * CPU.  There's nothing to do -- we just assert that
2386			 * our offset is 0.
2387			 */
2388			ASSERT(src->dtb_offset == 0);
2389			return;
2390
2391		case DTRACESPEC_ACTIVE:
2392			new = DTRACESPEC_COMMITTING;
2393			break;
2394
2395		case DTRACESPEC_ACTIVEONE:
2396			/*
2397			 * This speculation is active on one CPU.  If our
2398			 * buffer offset is non-zero, we know that the one CPU
2399			 * must be us.  Otherwise, we are committing on a
2400			 * different CPU from the speculate(), and we must
2401			 * rely on being asynchronously cleaned.
2402			 */
2403			if (src->dtb_offset != 0) {
2404				new = DTRACESPEC_COMMITTING;
2405				break;
2406			}
2407			/*FALLTHROUGH*/
2408
2409		case DTRACESPEC_ACTIVEMANY:
2410			new = DTRACESPEC_COMMITTINGMANY;
2411			break;
2412
2413		default:
2414			ASSERT(0);
2415		}
2416	} while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2417	    current, new) != current);
2418
2419	/*
2420	 * We have set the state to indicate that we are committing this
2421	 * speculation.  Now reserve the necessary space in the destination
2422	 * buffer.
2423	 */
2424	if ((offs = dtrace_buffer_reserve(dest, src->dtb_offset,
2425	    sizeof (uint64_t), state, NULL)) < 0) {
2426		dtrace_buffer_drop(dest);
2427		goto out;
2428	}
2429
2430	/*
2431	 * We have sufficient space to copy the speculative buffer into the
2432	 * primary buffer.  First, modify the speculative buffer, filling
2433	 * in the timestamp of all entries with the current time.  The data
2434	 * must have the commit() time rather than the time it was traced,
2435	 * so that all entries in the primary buffer are in timestamp order.
2436	 */
2437	timestamp = dtrace_gethrtime();
2438	saddr = (uintptr_t)src->dtb_tomax;
2439	slimit = saddr + src->dtb_offset;
2440	while (saddr < slimit) {
2441		size_t size;
2442		dtrace_rechdr_t *dtrh = (dtrace_rechdr_t *)saddr;
2443
2444		if (dtrh->dtrh_epid == DTRACE_EPIDNONE) {
2445			saddr += sizeof (dtrace_epid_t);
2446			continue;
2447		}
2448		ASSERT3U(dtrh->dtrh_epid, <=, state->dts_necbs);
2449		size = state->dts_ecbs[dtrh->dtrh_epid - 1]->dte_size;
2450
2451		ASSERT3U(saddr + size, <=, slimit);
2452		ASSERT3U(size, >=, sizeof (dtrace_rechdr_t));
2453		ASSERT3U(DTRACE_RECORD_LOAD_TIMESTAMP(dtrh), ==, UINT64_MAX);
2454
2455		DTRACE_RECORD_STORE_TIMESTAMP(dtrh, timestamp);
2456
2457		saddr += size;
2458	}
2459
2460	/*
2461	 * Copy the buffer across.  (Note that this is a
2462	 * highly suboptimal bcopy(); in the unlikely event that this becomes
2463	 * a serious performance issue, a high-performance DTrace-specific
2464	 * bcopy() should obviously be invented.)
2465	 */
2466	daddr = (uintptr_t)dest->dtb_tomax + offs;
2467	dlimit = daddr + src->dtb_offset;
2468	saddr = (uintptr_t)src->dtb_tomax;
2469
2470	/*
2471	 * First, the aligned portion.
2472	 */
2473	while (dlimit - daddr >= sizeof (uint64_t)) {
2474		*((uint64_t *)daddr) = *((uint64_t *)saddr);
2475
2476		daddr += sizeof (uint64_t);
2477		saddr += sizeof (uint64_t);
2478	}
2479
2480	/*
2481	 * Now any left-over bit...
2482	 */
2483	while (dlimit - daddr)
2484		*((uint8_t *)daddr++) = *((uint8_t *)saddr++);
2485
2486	/*
2487	 * Finally, commit the reserved space in the destination buffer.
2488	 */
2489	dest->dtb_offset = offs + src->dtb_offset;
2490
2491out:
2492	/*
2493	 * If we're lucky enough to be the only active CPU on this speculation
2494	 * buffer, we can just set the state back to DTRACESPEC_INACTIVE.
2495	 */
2496	if (current == DTRACESPEC_ACTIVE ||
2497	    (current == DTRACESPEC_ACTIVEONE && new == DTRACESPEC_COMMITTING)) {
2498		uint32_t rval = dtrace_cas32((uint32_t *)&spec->dtsp_state,
2499		    DTRACESPEC_COMMITTING, DTRACESPEC_INACTIVE);
2500
2501		ASSERT(rval == DTRACESPEC_COMMITTING);
2502	}
2503
2504	src->dtb_offset = 0;
2505	src->dtb_xamot_drops += src->dtb_drops;
2506	src->dtb_drops = 0;
2507}
2508
2509/*
2510 * This routine discards an active speculation.  If the specified speculation
2511 * is not in a valid state to perform a discard(), this routine will silently
2512 * do nothing.  The state of the specified speculation is transitioned
2513 * according to the state transition diagram outlined in <sys/dtrace_impl.h>.
2514 */
2515static void
2516dtrace_speculation_discard(dtrace_state_t *state, processorid_t cpu,
2517    dtrace_specid_t which)
2518{
2519	dtrace_speculation_t *spec;
2520	dtrace_speculation_state_t current, new = 0;
2521	dtrace_buffer_t *buf;
2522
2523	if (which == 0)
2524		return;
2525
2526	if (which > state->dts_nspeculations) {
2527		cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2528		return;
2529	}
2530
2531	spec = &state->dts_speculations[which - 1];
2532	buf = &spec->dtsp_buffer[cpu];
2533
2534	do {
2535		current = spec->dtsp_state;
2536
2537		switch (current) {
2538		case DTRACESPEC_INACTIVE:
2539		case DTRACESPEC_COMMITTINGMANY:
2540		case DTRACESPEC_COMMITTING:
2541		case DTRACESPEC_DISCARDING:
2542			return;
2543
2544		case DTRACESPEC_ACTIVE:
2545		case DTRACESPEC_ACTIVEMANY:
2546			new = DTRACESPEC_DISCARDING;
2547			break;
2548
2549		case DTRACESPEC_ACTIVEONE:
2550			if (buf->dtb_offset != 0) {
2551				new = DTRACESPEC_INACTIVE;
2552			} else {
2553				new = DTRACESPEC_DISCARDING;
2554			}
2555			break;
2556
2557		default:
2558			ASSERT(0);
2559		}
2560	} while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2561	    current, new) != current);
2562
2563	buf->dtb_offset = 0;
2564	buf->dtb_drops = 0;
2565}
2566
2567/*
2568 * Note:  not called from probe context.  This function is called
2569 * asynchronously from cross call context to clean any speculations that are
2570 * in the COMMITTINGMANY or DISCARDING states.  These speculations may not be
2571 * transitioned back to the INACTIVE state until all CPUs have cleaned the
2572 * speculation.
2573 */
2574static void
2575dtrace_speculation_clean_here(dtrace_state_t *state)
2576{
2577	dtrace_icookie_t cookie;
2578	processorid_t cpu = curcpu;
2579	dtrace_buffer_t *dest = &state->dts_buffer[cpu];
2580	dtrace_specid_t i;
2581
2582	cookie = dtrace_interrupt_disable();
2583
2584	if (dest->dtb_tomax == NULL) {
2585		dtrace_interrupt_enable(cookie);
2586		return;
2587	}
2588
2589	for (i = 0; i < state->dts_nspeculations; i++) {
2590		dtrace_speculation_t *spec = &state->dts_speculations[i];
2591		dtrace_buffer_t *src = &spec->dtsp_buffer[cpu];
2592
2593		if (src->dtb_tomax == NULL)
2594			continue;
2595
2596		if (spec->dtsp_state == DTRACESPEC_DISCARDING) {
2597			src->dtb_offset = 0;
2598			continue;
2599		}
2600
2601		if (spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
2602			continue;
2603
2604		if (src->dtb_offset == 0)
2605			continue;
2606
2607		dtrace_speculation_commit(state, cpu, i + 1);
2608	}
2609
2610	dtrace_interrupt_enable(cookie);
2611}
2612
2613/*
2614 * Note:  not called from probe context.  This function is called
2615 * asynchronously (and at a regular interval) to clean any speculations that
2616 * are in the COMMITTINGMANY or DISCARDING states.  If it discovers that there
2617 * is work to be done, it cross calls all CPUs to perform that work;
2618 * COMMITMANY and DISCARDING speculations may not be transitioned back to the
2619 * INACTIVE state until they have been cleaned by all CPUs.
2620 */
2621static void
2622dtrace_speculation_clean(dtrace_state_t *state)
2623{
2624	int work = 0, rv;
2625	dtrace_specid_t i;
2626
2627	for (i = 0; i < state->dts_nspeculations; i++) {
2628		dtrace_speculation_t *spec = &state->dts_speculations[i];
2629
2630		ASSERT(!spec->dtsp_cleaning);
2631
2632		if (spec->dtsp_state != DTRACESPEC_DISCARDING &&
2633		    spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
2634			continue;
2635
2636		work++;
2637		spec->dtsp_cleaning = 1;
2638	}
2639
2640	if (!work)
2641		return;
2642
2643	dtrace_xcall(DTRACE_CPUALL,
2644	    (dtrace_xcall_t)dtrace_speculation_clean_here, state);
2645
2646	/*
2647	 * We now know that all CPUs have committed or discarded their
2648	 * speculation buffers, as appropriate.  We can now set the state
2649	 * to inactive.
2650	 */
2651	for (i = 0; i < state->dts_nspeculations; i++) {
2652		dtrace_speculation_t *spec = &state->dts_speculations[i];
2653		dtrace_speculation_state_t current, new;
2654
2655		if (!spec->dtsp_cleaning)
2656			continue;
2657
2658		current = spec->dtsp_state;
2659		ASSERT(current == DTRACESPEC_DISCARDING ||
2660		    current == DTRACESPEC_COMMITTINGMANY);
2661
2662		new = DTRACESPEC_INACTIVE;
2663
2664		rv = dtrace_cas32((uint32_t *)&spec->dtsp_state, current, new);
2665		ASSERT(rv == current);
2666		spec->dtsp_cleaning = 0;
2667	}
2668}
2669
2670/*
2671 * Called as part of a speculate() to get the speculative buffer associated
2672 * with a given speculation.  Returns NULL if the specified speculation is not
2673 * in an ACTIVE state.  If the speculation is in the ACTIVEONE state -- and
2674 * the active CPU is not the specified CPU -- the speculation will be
2675 * atomically transitioned into the ACTIVEMANY state.
2676 */
2677static dtrace_buffer_t *
2678dtrace_speculation_buffer(dtrace_state_t *state, processorid_t cpuid,
2679    dtrace_specid_t which)
2680{
2681	dtrace_speculation_t *spec;
2682	dtrace_speculation_state_t current, new = 0;
2683	dtrace_buffer_t *buf;
2684
2685	if (which == 0)
2686		return (NULL);
2687
2688	if (which > state->dts_nspeculations) {
2689		cpu_core[cpuid].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2690		return (NULL);
2691	}
2692
2693	spec = &state->dts_speculations[which - 1];
2694	buf = &spec->dtsp_buffer[cpuid];
2695
2696	do {
2697		current = spec->dtsp_state;
2698
2699		switch (current) {
2700		case DTRACESPEC_INACTIVE:
2701		case DTRACESPEC_COMMITTINGMANY:
2702		case DTRACESPEC_DISCARDING:
2703			return (NULL);
2704
2705		case DTRACESPEC_COMMITTING:
2706			ASSERT(buf->dtb_offset == 0);
2707			return (NULL);
2708
2709		case DTRACESPEC_ACTIVEONE:
2710			/*
2711			 * This speculation is currently active on one CPU.
2712			 * Check the offset in the buffer; if it's non-zero,
2713			 * that CPU must be us (and we leave the state alone).
2714			 * If it's zero, assume that we're starting on a new
2715			 * CPU -- and change the state to indicate that the
2716			 * speculation is active on more than one CPU.
2717			 */
2718			if (buf->dtb_offset != 0)
2719				return (buf);
2720
2721			new = DTRACESPEC_ACTIVEMANY;
2722			break;
2723
2724		case DTRACESPEC_ACTIVEMANY:
2725			return (buf);
2726
2727		case DTRACESPEC_ACTIVE:
2728			new = DTRACESPEC_ACTIVEONE;
2729			break;
2730
2731		default:
2732			ASSERT(0);
2733		}
2734	} while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2735	    current, new) != current);
2736
2737	ASSERT(new == DTRACESPEC_ACTIVEONE || new == DTRACESPEC_ACTIVEMANY);
2738	return (buf);
2739}
2740
2741/*
2742 * Return a string.  In the event that the user lacks the privilege to access
2743 * arbitrary kernel memory, we copy the string out to scratch memory so that we
2744 * don't fail access checking.
2745 *
2746 * dtrace_dif_variable() uses this routine as a helper for various
2747 * builtin values such as 'execname' and 'probefunc.'
2748 */
2749uintptr_t
2750dtrace_dif_varstr(uintptr_t addr, dtrace_state_t *state,
2751    dtrace_mstate_t *mstate)
2752{
2753	uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
2754	uintptr_t ret;
2755	size_t strsz;
2756
2757	/*
2758	 * The easy case: this probe is allowed to read all of memory, so
2759	 * we can just return this as a vanilla pointer.
2760	 */
2761	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
2762		return (addr);
2763
2764	/*
2765	 * This is the tougher case: we copy the string in question from
2766	 * kernel memory into scratch memory and return it that way: this
2767	 * ensures that we won't trip up when access checking tests the
2768	 * BYREF return value.
2769	 */
2770	strsz = dtrace_strlen((char *)addr, size) + 1;
2771
2772	if (mstate->dtms_scratch_ptr + strsz >
2773	    mstate->dtms_scratch_base + mstate->dtms_scratch_size) {
2774		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
2775		return (0);
2776	}
2777
2778	dtrace_strcpy((const void *)addr, (void *)mstate->dtms_scratch_ptr,
2779	    strsz);
2780	ret = mstate->dtms_scratch_ptr;
2781	mstate->dtms_scratch_ptr += strsz;
2782	return (ret);
2783}
2784
2785/*
2786 * Return a string from a memory address which is known to have one or
2787 * more concatenated, individually zero-terminated sub-strings.
2788 * In the event that the user lacks the privilege to access
2789 * arbitrary kernel memory, we copy the string out to scratch memory so that we
2790 * don't fail access checking.
2791 *
2792 * dtrace_dif_variable() uses this routine as a helper for various
2793 * builtin values such as 'execargs'.
2794 */
2795static uintptr_t
2796dtrace_dif_varstrz(uintptr_t addr, size_t strsz, dtrace_state_t *state,
2797    dtrace_mstate_t *mstate)
2798{
2799	char *p;
2800	size_t i;
2801	uintptr_t ret;
2802
2803	if (mstate->dtms_scratch_ptr + strsz >
2804	    mstate->dtms_scratch_base + mstate->dtms_scratch_size) {
2805		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
2806		return (0);
2807	}
2808
2809	dtrace_bcopy((const void *)addr, (void *)mstate->dtms_scratch_ptr,
2810	    strsz);
2811
2812	/* Replace sub-string termination characters with a space. */
2813	for (p = (char *) mstate->dtms_scratch_ptr, i = 0; i < strsz - 1;
2814	    p++, i++)
2815		if (*p == '\0')
2816			*p = ' ';
2817
2818	ret = mstate->dtms_scratch_ptr;
2819	mstate->dtms_scratch_ptr += strsz;
2820	return (ret);
2821}
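/*
 * For example (input assumed):  an execargs region holding "ls\0-l\0" with
 * strsz = 6 comes back from scratch as "ls -l" -- the interior NUL is
 * rewritten to a space, while the final NUL is left intact as the
 * terminator.
 */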
2822
2823/*
2824 * This function implements the DIF emulator's variable lookups.  The emulator
2825 * passes a reserved variable identifier and optional built-in array index.
2826 */
2827static uint64_t
2828dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v,
2829    uint64_t ndx)
2830{
2831	/*
2832	 * If we're accessing one of the uncached arguments, we'll turn this
2833	 * into a reference in the args array.
2834	 */
2835	if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9) {
2836		ndx = v - DIF_VAR_ARG0;
2837		v = DIF_VAR_ARGS;
2838	}
2839
2840	switch (v) {
2841	case DIF_VAR_ARGS:
2842		ASSERT(mstate->dtms_present & DTRACE_MSTATE_ARGS);
2843		if (ndx >= sizeof (mstate->dtms_arg) /
2844		    sizeof (mstate->dtms_arg[0])) {
2845			int aframes = mstate->dtms_probe->dtpr_aframes + 2;
2846			dtrace_provider_t *pv;
2847			uint64_t val;
2848
2849			pv = mstate->dtms_probe->dtpr_provider;
2850			if (pv->dtpv_pops.dtps_getargval != NULL)
2851				val = pv->dtpv_pops.dtps_getargval(pv->dtpv_arg,
2852				    mstate->dtms_probe->dtpr_id,
2853				    mstate->dtms_probe->dtpr_arg, ndx, aframes);
2854			else
2855				val = dtrace_getarg(ndx, aframes);
2856
2857			/*
2858			 * This is regrettably required to keep the compiler
2859			 * from tail-optimizing the call to dtrace_getarg().
2860			 * The condition always evaluates to true, but the
2861			 * compiler has no way of figuring that out a priori.
2862			 * (None of this would be necessary if the compiler
2863			 * could be relied upon to _always_ tail-optimize
2864			 * the call to dtrace_getarg() -- but it can't.)
2865			 */
2866			if (mstate->dtms_probe != NULL)
2867				return (val);
2868
2869			ASSERT(0);
2870		}
2871
2872		return (mstate->dtms_arg[ndx]);
2873
2874#if defined(sun)
2875	case DIF_VAR_UREGS: {
2876		klwp_t *lwp;
2877
2878		if (!dtrace_priv_proc(state))
2879			return (0);
2880
2881		if ((lwp = curthread->t_lwp) == NULL) {
2882			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
2883			cpu_core[curcpu].cpuc_dtrace_illval = 0;
2884			return (0);
2885		}
2886
2887		return (dtrace_getreg(lwp->lwp_regs, ndx));
2889	}
2890#else
2891	case DIF_VAR_UREGS: {
2892		struct trapframe *tframe;
2893
2894		if (!dtrace_priv_proc(state))
2895			return (0);
2896
2897		if ((tframe = curthread->td_frame) == NULL) {
2898			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
2899			cpu_core[curcpu].cpuc_dtrace_illval = 0;
2900			return (0);
2901		}
2902
2903		return (dtrace_getreg(tframe, ndx));
2904	}
2905#endif
2906
2907	case DIF_VAR_CURTHREAD:
2908		if (!dtrace_priv_kernel(state))
2909			return (0);
2910		return ((uint64_t)(uintptr_t)curthread);
2911
2912	case DIF_VAR_TIMESTAMP:
2913		if (!(mstate->dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
2914			mstate->dtms_timestamp = dtrace_gethrtime();
2915			mstate->dtms_present |= DTRACE_MSTATE_TIMESTAMP;
2916		}
2917		return (mstate->dtms_timestamp);
2918
2919	case DIF_VAR_VTIMESTAMP:
2920		ASSERT(dtrace_vtime_references != 0);
2921		return (curthread->t_dtrace_vtime);
2922
2923	case DIF_VAR_WALLTIMESTAMP:
2924		if (!(mstate->dtms_present & DTRACE_MSTATE_WALLTIMESTAMP)) {
2925			mstate->dtms_walltimestamp = dtrace_gethrestime();
2926			mstate->dtms_present |= DTRACE_MSTATE_WALLTIMESTAMP;
2927		}
2928		return (mstate->dtms_walltimestamp);
2929
2930#if defined(sun)
2931	case DIF_VAR_IPL:
2932		if (!dtrace_priv_kernel(state))
2933			return (0);
2934		if (!(mstate->dtms_present & DTRACE_MSTATE_IPL)) {
2935			mstate->dtms_ipl = dtrace_getipl();
2936			mstate->dtms_present |= DTRACE_MSTATE_IPL;
2937		}
2938		return (mstate->dtms_ipl);
2939#endif
2940
2941	case DIF_VAR_EPID:
2942		ASSERT(mstate->dtms_present & DTRACE_MSTATE_EPID);
2943		return (mstate->dtms_epid);
2944
2945	case DIF_VAR_ID:
2946		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
2947		return (mstate->dtms_probe->dtpr_id);
2948
2949	case DIF_VAR_STACKDEPTH:
2950		if (!dtrace_priv_kernel(state))
2951			return (0);
2952		if (!(mstate->dtms_present & DTRACE_MSTATE_STACKDEPTH)) {
2953			int aframes = mstate->dtms_probe->dtpr_aframes + 2;
2954
2955			mstate->dtms_stackdepth = dtrace_getstackdepth(aframes);
2956			mstate->dtms_present |= DTRACE_MSTATE_STACKDEPTH;
2957		}
2958		return (mstate->dtms_stackdepth);
2959
2960	case DIF_VAR_USTACKDEPTH:
2961		if (!dtrace_priv_proc(state))
2962			return (0);
2963		if (!(mstate->dtms_present & DTRACE_MSTATE_USTACKDEPTH)) {
2964			/*
2965			 * See comment in DIF_VAR_PID.
2966			 */
2967			if (DTRACE_ANCHORED(mstate->dtms_probe) &&
2968			    CPU_ON_INTR(CPU)) {
2969				mstate->dtms_ustackdepth = 0;
2970			} else {
2971				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
2972				mstate->dtms_ustackdepth =
2973				    dtrace_getustackdepth();
2974				DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
2975			}
2976			mstate->dtms_present |= DTRACE_MSTATE_USTACKDEPTH;
2977		}
2978		return (mstate->dtms_ustackdepth);
2979
2980	case DIF_VAR_CALLER:
2981		if (!dtrace_priv_kernel(state))
2982			return (0);
2983		if (!(mstate->dtms_present & DTRACE_MSTATE_CALLER)) {
2984			int aframes = mstate->dtms_probe->dtpr_aframes + 2;
2985
2986			if (!DTRACE_ANCHORED(mstate->dtms_probe)) {
2987				/*
2988				 * If this is an unanchored probe, we are
2989				 * required to go through the slow path:
2990				 * dtrace_caller() only guarantees correct
2991				 * results for anchored probes.
2992				 */
2993				pc_t caller[2] = {0, 0};
2994
2995				dtrace_getpcstack(caller, 2, aframes,
2996				    (uint32_t *)(uintptr_t)mstate->dtms_arg[0]);
2997				mstate->dtms_caller = caller[1];
2998			} else if ((mstate->dtms_caller =
2999			    dtrace_caller(aframes)) == -1) {
3000				/*
3001				 * We have failed to do this the quick way;
3002				 * we must resort to the slower approach of
3003				 * calling dtrace_getpcstack().
3004				 */
3005				pc_t caller = 0;
3006
3007				dtrace_getpcstack(&caller, 1, aframes, NULL);
3008				mstate->dtms_caller = caller;
3009			}
3010
3011			mstate->dtms_present |= DTRACE_MSTATE_CALLER;
3012		}
3013		return (mstate->dtms_caller);
3014
3015	case DIF_VAR_UCALLER:
3016		if (!dtrace_priv_proc(state))
3017			return (0);
3018
3019		if (!(mstate->dtms_present & DTRACE_MSTATE_UCALLER)) {
3020			uint64_t ustack[3];
3021
3022			/*
3023			 * dtrace_getupcstack() fills in the first uint64_t
3024			 * with the current PID.  The second uint64_t will
3025			 * be the program counter at user-level.  The third
3026			 * uint64_t will contain the caller, which is what
3027			 * we're after.
3028			 */
3029			ustack[2] = 0;
3030			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3031			dtrace_getupcstack(ustack, 3);
3032			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3033			mstate->dtms_ucaller = ustack[2];
3034			mstate->dtms_present |= DTRACE_MSTATE_UCALLER;
3035		}
3036
3037		return (mstate->dtms_ucaller);
3038
3039	case DIF_VAR_PROBEPROV:
3040		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3041		return (dtrace_dif_varstr(
3042		    (uintptr_t)mstate->dtms_probe->dtpr_provider->dtpv_name,
3043		    state, mstate));
3044
3045	case DIF_VAR_PROBEMOD:
3046		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3047		return (dtrace_dif_varstr(
3048		    (uintptr_t)mstate->dtms_probe->dtpr_mod,
3049		    state, mstate));
3050
3051	case DIF_VAR_PROBEFUNC:
3052		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3053		return (dtrace_dif_varstr(
3054		    (uintptr_t)mstate->dtms_probe->dtpr_func,
3055		    state, mstate));
3056
3057	case DIF_VAR_PROBENAME:
3058		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3059		return (dtrace_dif_varstr(
3060		    (uintptr_t)mstate->dtms_probe->dtpr_name,
3061		    state, mstate));
3062
3063	case DIF_VAR_PID:
3064		if (!dtrace_priv_proc(state))
3065			return (0);
3066
3067#if defined(sun)
3068		/*
3069		 * Note that we are assuming that an unanchored probe is
3070		 * always due to a high-level interrupt.  (And we're assuming
3071		 * that there is only a single high-level interrupt.)
3072		 */
3073		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3074			return (pid0.pid_id);
3075
3076		/*
3077		 * It is always safe to dereference one's own t_procp pointer:
3078		 * it always points to a valid, allocated proc structure.
3079		 * Further, it is always safe to dereference the p_pidp member
3080		 * of one's own proc structure.  (These are truisms becuase
3081		 * threads and processes don't clean up their own state --
3082		 * they leave that task to whomever reaps them.)
3083		 */
3084		return ((uint64_t)curthread->t_procp->p_pidp->pid_id);
3085#else
3086		return ((uint64_t)curproc->p_pid);
3087#endif
3088
3089	case DIF_VAR_PPID:
3090		if (!dtrace_priv_proc(state))
3091			return (0);
3092
3093#if defined(sun)
3094		/*
3095		 * See comment in DIF_VAR_PID.
3096		 */
3097		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3098			return (pid0.pid_id);
3099
3100		/*
3101		 * It is always safe to dereference one's own t_procp pointer:
3102		 * it always points to a valid, allocated proc structure.
3103		 * (This is true because threads don't clean up their own
3104		 * state -- they leave that task to whomever reaps them.)
3105		 */
3106		return ((uint64_t)curthread->t_procp->p_ppid);
3107#else
3108		return ((uint64_t)curproc->p_pptr->p_pid);
3109#endif
3110
3111	case DIF_VAR_TID:
3112#if defined(sun)
3113		/*
3114		 * See comment in DIF_VAR_PID.
3115		 */
3116		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3117			return (0);
3118#endif
3119
3120		return ((uint64_t)curthread->t_tid);
3121
3122	case DIF_VAR_EXECARGS: {
3123		struct pargs *p_args = curthread->td_proc->p_args;
3124
3125		if (p_args == NULL)
3126			return (0);
3127
3128		return (dtrace_dif_varstrz(
3129		    (uintptr_t) p_args->ar_args, p_args->ar_length, state, mstate));
3130	}
3131
3132	case DIF_VAR_EXECNAME:
3133#if defined(sun)
3134		if (!dtrace_priv_proc(state))
3135			return (0);
3136
3137		/*
3138		 * See comment in DIF_VAR_PID.
3139		 */
3140		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3141			return ((uint64_t)(uintptr_t)p0.p_user.u_comm);
3142
3143		/*
3144		 * It is always safe to dereference one's own t_procp pointer:
3145		 * it always points to a valid, allocated proc structure.
3146		 * (This is true because threads don't clean up their own
3147		 * state -- they leave that task to whomever reaps them.)
3148		 */
3149		return (dtrace_dif_varstr(
3150		    (uintptr_t)curthread->t_procp->p_user.u_comm,
3151		    state, mstate));
3152#else
3153		return (dtrace_dif_varstr(
3154		    (uintptr_t) curthread->td_proc->p_comm, state, mstate));
3155#endif
3156
3157	case DIF_VAR_ZONENAME:
3158#if defined(sun)
3159		if (!dtrace_priv_proc(state))
3160			return (0);
3161
3162		/*
3163		 * See comment in DIF_VAR_PID.
3164		 */
3165		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3166			return ((uint64_t)(uintptr_t)p0.p_zone->zone_name);
3167
3168		/*
3169		 * It is always safe to dereference one's own t_procp pointer:
3170		 * it always points to a valid, allocated proc structure.
3171		 * (This is true because threads don't clean up their own
3172		 * state -- they leave that task to whomever reaps them.)
3173		 */
3174		return (dtrace_dif_varstr(
3175		    (uintptr_t)curthread->t_procp->p_zone->zone_name,
3176		    state, mstate));
3177#else
3178		return (0);
3179#endif
3180
3181	case DIF_VAR_UID:
3182		if (!dtrace_priv_proc(state))
3183			return (0);
3184
3185#if defined(sun)
3186		/*
3187		 * See comment in DIF_VAR_PID.
3188		 */
3189		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3190			return ((uint64_t)p0.p_cred->cr_uid);
3191#endif
3192
3193		/*
3194		 * It is always safe to dereference one's own t_procp pointer:
3195		 * it always points to a valid, allocated proc structure.
3196		 * (This is true because threads don't clean up their own
3197		 * state -- they leave that task to whomever reaps them.)
3198		 *
3199		 * Additionally, it is safe to dereference one's own process
3200		 * credential, since this is never NULL after process birth.
3201		 */
3202		return ((uint64_t)curthread->t_procp->p_cred->cr_uid);
3203
3204	case DIF_VAR_GID:
3205		if (!dtrace_priv_proc(state))
3206			return (0);
3207
3208#if defined(sun)
3209		/*
3210		 * See comment in DIF_VAR_PID.
3211		 */
3212		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3213			return ((uint64_t)p0.p_cred->cr_gid);
3214#endif
3215
3216		/*
3217		 * It is always safe to dereference one's own t_procp pointer:
3218		 * it always points to a valid, allocated proc structure.
3219		 * (This is true because threads don't clean up their own
3220		 * state -- they leave that task to whomever reaps them.)
3221		 *
3222		 * Additionally, it is safe to dereference one's own process
3223		 * credential, since this is never NULL after process birth.
3224		 */
3225		return ((uint64_t)curthread->t_procp->p_cred->cr_gid);
3226
3227	case DIF_VAR_ERRNO: {
3228#if defined(sun)
3229		klwp_t *lwp;
3230		if (!dtrace_priv_proc(state))
3231			return (0);
3232
3233		/*
3234		 * See comment in DIF_VAR_PID.
3235		 */
3236		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3237			return (0);
3238
3239		/*
3240		 * It is always safe to dereference one's own t_lwp pointer in
3241		 * the event that this pointer is non-NULL.  (This is true
3242		 * because threads and lwps don't clean up their own state --
3243		 * they leave that task to whomever reaps them.)
3244		 */
3245		if ((lwp = curthread->t_lwp) == NULL)
3246			return (0);
3247
3248		return ((uint64_t)lwp->lwp_errno);
3249#else
3250		return (curthread->td_errno);
3251#endif
3252	}
3253#if !defined(sun)
3254	case DIF_VAR_CPU: {
3255		return (curcpu);
3256	}
3257#endif
3258	default:
3259		DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3260		return (0);
3261	}
3262}
3263
3264/*
3265 * Emulate the execution of DTrace ID subroutines invoked by the call opcode.
3266 * Notice that we don't bother validating the proper number of arguments or
3267 * their types in the tuple stack.  This isn't needed:  all argument
3268 * interpretation is made safe by our load safety -- the worst that can
3269 * happen is that a bogus program can obtain bogus results.
3270 */
3271static void
3272dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs,
3273    dtrace_key_t *tupregs, int nargs,
3274    dtrace_mstate_t *mstate, dtrace_state_t *state)
3275{
3276	volatile uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags;
3277	volatile uintptr_t *illval = &cpu_core[curcpu].cpuc_dtrace_illval;
3278	dtrace_vstate_t *vstate = &state->dts_vstate;
3279
3280#if defined(sun)
3281	union {
3282		mutex_impl_t mi;
3283		uint64_t mx;
3284	} m;
3285
3286	union {
3287		krwlock_t ri;
3288		uintptr_t rw;
3289	} r;
3290#else
3291	struct thread *lowner;
3292	union {
3293		struct lock_object *li;
3294		uintptr_t lx;
3295	} l;
3296#endif
3297
3298	switch (subr) {
3299	case DIF_SUBR_RAND:
3300		regs[rd] = (dtrace_gethrtime() * 2416 + 374441) % 1771875;
3301		break;
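		/*
		 * The constants above apply, in effect, a single
		 * linear-congruential step to the high-resolution timestamp.
		 * This is cheap and safe in probe context, but it is not a
		 * statistically strong (much less cryptographic) source of
		 * randomness.
		 */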
3302
3303#if defined(sun)
3304	case DIF_SUBR_MUTEX_OWNED:
3305		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
3306		    mstate, vstate)) {
3307			regs[rd] = 0;
3308			break;
3309		}
3310
3311		m.mx = dtrace_load64(tupregs[0].dttk_value);
3312		if (MUTEX_TYPE_ADAPTIVE(&m.mi))
3313			regs[rd] = MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER;
3314		else
3315			regs[rd] = LOCK_HELD(&m.mi.m_spin.m_spinlock);
3316		break;
3317
3318	case DIF_SUBR_MUTEX_OWNER:
3319		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
3320		    mstate, vstate)) {
3321			regs[rd] = 0;
3322			break;
3323		}
3324
3325		m.mx = dtrace_load64(tupregs[0].dttk_value);
3326		if (MUTEX_TYPE_ADAPTIVE(&m.mi) &&
3327		    MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER)
3328			regs[rd] = (uintptr_t)MUTEX_OWNER(&m.mi);
3329		else
3330			regs[rd] = 0;
3331		break;
3332
3333	case DIF_SUBR_MUTEX_TYPE_ADAPTIVE:
3334		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
3335		    mstate, vstate)) {
3336			regs[rd] = 0;
3337			break;
3338		}
3339
3340		m.mx = dtrace_load64(tupregs[0].dttk_value);
3341		regs[rd] = MUTEX_TYPE_ADAPTIVE(&m.mi);
3342		break;
3343
3344	case DIF_SUBR_MUTEX_TYPE_SPIN:
3345		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
3346		    mstate, vstate)) {
3347			regs[rd] = 0;
3348			break;
3349		}
3350
3351		m.mx = dtrace_load64(tupregs[0].dttk_value);
3352		regs[rd] = MUTEX_TYPE_SPIN(&m.mi);
3353		break;
3354
3355	case DIF_SUBR_RW_READ_HELD: {
3356		uintptr_t tmp;
3357
3358		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
3359		    mstate, vstate)) {
3360			regs[rd] = 0;
3361			break;
3362		}
3363
3364		r.rw = dtrace_loadptr(tupregs[0].dttk_value);
3365		regs[rd] = _RW_READ_HELD(&r.ri, tmp);
3366		break;
3367	}
3368
3369	case DIF_SUBR_RW_WRITE_HELD:
3370		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
3371		    mstate, vstate)) {
3372			regs[rd] = 0;
3373			break;
3374		}
3375
3376		r.rw = dtrace_loadptr(tupregs[0].dttk_value);
3377		regs[rd] = _RW_WRITE_HELD(&r.ri);
3378		break;
3379
3380	case DIF_SUBR_RW_ISWRITER:
3381		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
3382		    mstate, vstate)) {
3383			regs[rd] = 0;
3384			break;
3385		}
3386
3387		r.rw = dtrace_loadptr(tupregs[0].dttk_value);
3388		regs[rd] = _RW_ISWRITER(&r.ri);
3389		break;
3390
3391#else
3392	case DIF_SUBR_MUTEX_OWNED:
3393		if (!dtrace_canload(tupregs[0].dttk_value,
3394			sizeof (struct lock_object), mstate, vstate)) {
3395			regs[rd] = 0;
3396			break;
3397		}
3398		l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
3399		regs[rd] = LOCK_CLASS(l.li)->lc_owner(l.li, &lowner);
3400		break;
3401
3402	case DIF_SUBR_MUTEX_OWNER:
3403		if (!dtrace_canload(tupregs[0].dttk_value,
3404			sizeof (struct lock_object), mstate, vstate)) {
3405			regs[rd] = 0;
3406			break;
3407		}
3408		l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
3409		LOCK_CLASS(l.li)->lc_owner(l.li, &lowner);
3410		regs[rd] = (uintptr_t)lowner;
3411		break;
3412
3413	case DIF_SUBR_MUTEX_TYPE_ADAPTIVE:
3414		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (struct mtx),
3415		    mstate, vstate)) {
3416			regs[rd] = 0;
3417			break;
3418		}
3419		l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
3420		/* XXX - should be only LC_SLEEPABLE? */
3421		regs[rd] = (LOCK_CLASS(l.li)->lc_flags &
3422		    (LC_SLEEPLOCK | LC_SLEEPABLE)) != 0;
3423		break;
3424
3425	case DIF_SUBR_MUTEX_TYPE_SPIN:
3426		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (struct mtx),
3427		    mstate, vstate)) {
3428			regs[rd] = 0;
3429			break;
3430		}
3431		l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
3432		regs[rd] = (LOCK_CLASS(l.li)->lc_flags & LC_SPINLOCK) != 0;
3433		break;
3434
3435	case DIF_SUBR_RW_READ_HELD:
3436	case DIF_SUBR_SX_SHARED_HELD:
3437		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
3438		    mstate, vstate)) {
3439			regs[rd] = 0;
3440			break;
3441		}
3442		l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
3443		regs[rd] = LOCK_CLASS(l.li)->lc_owner(l.li, &lowner) &&
3444		    lowner == NULL;
3445		break;
3446
3447	case DIF_SUBR_RW_WRITE_HELD:
3448	case DIF_SUBR_SX_EXCLUSIVE_HELD:
3449		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
3450		    mstate, vstate)) {
3451			regs[rd] = 0;
3452			break;
3453		}
3454		l.lx = dtrace_loadptr(tupregs[0].dttk_value);
3455		LOCK_CLASS(l.li)->lc_owner(l.li, &lowner);
3456		regs[rd] = (lowner == curthread);
3457		break;
3458
3459	case DIF_SUBR_RW_ISWRITER:
3460	case DIF_SUBR_SX_ISEXCLUSIVE:
3461		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
3462		    mstate, vstate)) {
3463			regs[rd] = 0;
3464			break;
3465		}
3466		l.lx = dtrace_loadptr(tupregs[0].dttk_value);
3467		regs[rd] = LOCK_CLASS(l.li)->lc_owner(l.li, &lowner) &&
3468		    lowner != NULL;
3469		break;
3470#endif /* ! defined(sun) */
3471
3472	case DIF_SUBR_BCOPY: {
3473		/*
3474		 * We need to be sure that the destination is in the scratch
3475		 * region -- no other region is allowed.
3476		 */
3477		uintptr_t src = tupregs[0].dttk_value;
3478		uintptr_t dest = tupregs[1].dttk_value;
3479		size_t size = tupregs[2].dttk_value;
3480
3481		if (!dtrace_inscratch(dest, size, mstate)) {
3482			*flags |= CPU_DTRACE_BADADDR;
3483			*illval = regs[rd];
3484			break;
3485		}
3486
3487		if (!dtrace_canload(src, size, mstate, vstate)) {
3488			regs[rd] = 0;
3489			break;
3490		}
3491
3492		dtrace_bcopy((void *)src, (void *)dest, size);
3493		break;
3494	}
3495
3496	case DIF_SUBR_ALLOCA:
3497	case DIF_SUBR_COPYIN: {
3498		uintptr_t dest = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
3499		uint64_t size =
3500		    tupregs[subr == DIF_SUBR_ALLOCA ? 0 : 1].dttk_value;
3501		size_t scratch_size = (dest - mstate->dtms_scratch_ptr) + size;
3502
3503		/*
3504		 * This action doesn't require any credential checks since
3505		 * probes will not activate in user contexts to which the
3506		 * enabling user does not have permissions.
3507		 */
3508
3509		/*
3510		 * Rounding up the user allocation size could have overflowed
3511		 * a large, bogus allocation (like -1ULL) to 0.
3512		 */
3513		if (scratch_size < size ||
3514		    !DTRACE_INSCRATCH(mstate, scratch_size)) {
3515			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3516			regs[rd] = 0;
3517			break;
3518		}
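		/*
		 * For illustration (values assumed):  alloca(-1) arrives
		 * here with size = 0xffffffffffffffff.  If the alignment
		 * slop (dest - dtms_scratch_ptr) is, say, 1, scratch_size
		 * wraps around to 0, and it is the scratch_size < size
		 * comparison above that catches the overflow.
		 */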
3519
3520		if (subr == DIF_SUBR_COPYIN) {
3521			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3522			dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
3523			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3524		}
3525
3526		mstate->dtms_scratch_ptr += scratch_size;
3527		regs[rd] = dest;
3528		break;
3529	}
3530
3531	case DIF_SUBR_COPYINTO: {
3532		uint64_t size = tupregs[1].dttk_value;
3533		uintptr_t dest = tupregs[2].dttk_value;
3534
3535		/*
3536		 * This action doesn't require any credential checks since
3537		 * probes will not activate in user contexts to which the
3538		 * enabling user does not have permissions.
3539		 */
3540		if (!dtrace_inscratch(dest, size, mstate)) {
3541			*flags |= CPU_DTRACE_BADADDR;
3542			*illval = regs[rd];
3543			break;
3544		}
3545
3546		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3547		dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
3548		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3549		break;
3550	}
3551
3552	case DIF_SUBR_COPYINSTR: {
3553		uintptr_t dest = mstate->dtms_scratch_ptr;
3554		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3555
3556		if (nargs > 1 && tupregs[1].dttk_value < size)
3557			size = tupregs[1].dttk_value + 1;
3558
3559		/*
3560		 * This action doesn't require any credential checks since
3561		 * probes will not activate in user contexts to which the
3562		 * enabling user does not have permissions.
3563		 */
3564		if (!DTRACE_INSCRATCH(mstate, size)) {
3565			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3566			regs[rd] = 0;
3567			break;
3568		}
3569
3570		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3571		dtrace_copyinstr(tupregs[0].dttk_value, dest, size, flags);
3572		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3573
3574		((char *)dest)[size - 1] = '\0';
3575		mstate->dtms_scratch_ptr += size;
3576		regs[rd] = dest;
3577		break;
3578	}
3579
3580#if defined(sun)
3581	case DIF_SUBR_MSGSIZE:
3582	case DIF_SUBR_MSGDSIZE: {
3583		uintptr_t baddr = tupregs[0].dttk_value, daddr;
3584		uintptr_t wptr, rptr;
3585		size_t count = 0;
3586		int cont = 0;
3587
3588		while (baddr != 0 && !(*flags & CPU_DTRACE_FAULT)) {
3589
3590			if (!dtrace_canload(baddr, sizeof (mblk_t), mstate,
3591			    vstate)) {
3592				regs[rd] = 0;
3593				break;
3594			}
3595
3596			wptr = dtrace_loadptr(baddr +
3597			    offsetof(mblk_t, b_wptr));
3598
3599			rptr = dtrace_loadptr(baddr +
3600			    offsetof(mblk_t, b_rptr));
3601
3602			if (wptr < rptr) {
3603				*flags |= CPU_DTRACE_BADADDR;
3604				*illval = tupregs[0].dttk_value;
3605				break;
3606			}
3607
3608			daddr = dtrace_loadptr(baddr +
3609			    offsetof(mblk_t, b_datap));
3610
3611			baddr = dtrace_loadptr(baddr +
3612			    offsetof(mblk_t, b_cont));
3613
3614			/*
			 * We want to protect against denial-of-service here,
			 * so we only search the list for dtrace_msgdsize_max
			 * mblks.
3618			 */
3619			if (cont++ > dtrace_msgdsize_max) {
3620				*flags |= CPU_DTRACE_ILLOP;
3621				break;
3622			}
3623
3624			if (subr == DIF_SUBR_MSGDSIZE) {
3625				if (dtrace_load8(daddr +
3626				    offsetof(dblk_t, db_type)) != M_DATA)
3627					continue;
3628			}
3629
3630			count += wptr - rptr;
3631		}
3632
3633		if (!(*flags & CPU_DTRACE_FAULT))
3634			regs[rd] = count;
3635
3636		break;
3637	}
3638#endif
3639
3640	case DIF_SUBR_PROGENYOF: {
3641		pid_t pid = tupregs[0].dttk_value;
3642		proc_t *p;
3643		int rval = 0;
3644
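		/*
		 * Walk from the current process up the p_parent chain: the
		 * result is 1 iff the given pid names the current process
		 * or one of its ancestors.
		 */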
3645		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3646
3647		for (p = curthread->t_procp; p != NULL; p = p->p_parent) {
3648#if defined(sun)
3649			if (p->p_pidp->pid_id == pid) {
3650#else
3651			if (p->p_pid == pid) {
3652#endif
3653				rval = 1;
3654				break;
3655			}
3656		}
3657
3658		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3659
3660		regs[rd] = rval;
3661		break;
3662	}
3663
3664	case DIF_SUBR_SPECULATION:
3665		regs[rd] = dtrace_speculation(state);
3666		break;
3667
3668	case DIF_SUBR_COPYOUT: {
3669		uintptr_t kaddr = tupregs[0].dttk_value;
3670		uintptr_t uaddr = tupregs[1].dttk_value;
3671		uint64_t size = tupregs[2].dttk_value;
3672
3673		if (!dtrace_destructive_disallow &&
3674		    dtrace_priv_proc_control(state) &&
3675		    !dtrace_istoxic(kaddr, size)) {
3676			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3677			dtrace_copyout(kaddr, uaddr, size, flags);
3678			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3679		}
3680		break;
3681	}
3682
3683	case DIF_SUBR_COPYOUTSTR: {
3684		uintptr_t kaddr = tupregs[0].dttk_value;
3685		uintptr_t uaddr = tupregs[1].dttk_value;
3686		uint64_t size = tupregs[2].dttk_value;
3687
3688		if (!dtrace_destructive_disallow &&
3689		    dtrace_priv_proc_control(state) &&
3690		    !dtrace_istoxic(kaddr, size)) {
3691			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3692			dtrace_copyoutstr(kaddr, uaddr, size, flags);
3693			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3694		}
3695		break;
3696	}
3697
3698	case DIF_SUBR_STRLEN: {
3699		size_t sz;
3700		uintptr_t addr = (uintptr_t)tupregs[0].dttk_value;
3701		sz = dtrace_strlen((char *)addr,
3702		    state->dts_options[DTRACEOPT_STRSIZE]);
3703
3704		if (!dtrace_canload(addr, sz + 1, mstate, vstate)) {
3705			regs[rd] = 0;
3706			break;
3707		}
3708
3709		regs[rd] = sz;
3710
3711		break;
3712	}
3713
3714	case DIF_SUBR_STRCHR:
3715	case DIF_SUBR_STRRCHR: {
3716		/*
3717		 * We're going to iterate over the string looking for the
3718		 * specified character.  We will iterate until we have reached
3719		 * the string length or we have found the character.  If this
3720		 * is DIF_SUBR_STRRCHR, we will look for the last occurrence
3721		 * of the specified character instead of the first.
3722		 */
3723		uintptr_t saddr = tupregs[0].dttk_value;
3724		uintptr_t addr = tupregs[0].dttk_value;
3725		uintptr_t limit = addr + state->dts_options[DTRACEOPT_STRSIZE];
3726		char c, target = (char)tupregs[1].dttk_value;
3727
3728		for (regs[rd] = 0; addr < limit; addr++) {
3729			if ((c = dtrace_load8(addr)) == target) {
3730				regs[rd] = addr;
3731
3732				if (subr == DIF_SUBR_STRCHR)
3733					break;
3734			}
3735
3736			if (c == '\0')
3737				break;
3738		}
3739
3740		if (!dtrace_canload(saddr, addr - saddr, mstate, vstate)) {
3741			regs[rd] = 0;
3742			break;
3743		}
3744
3745		break;
3746	}
3747
3748	case DIF_SUBR_STRSTR:
3749	case DIF_SUBR_INDEX:
3750	case DIF_SUBR_RINDEX: {
3751		/*
3752		 * We're going to iterate over the string looking for the
3753		 * specified string.  We will iterate until we have reached
3754		 * the string length or we have found the string.  (Yes, this
3755		 * is done in the most naive way possible -- but considering
3756		 * that the string we're searching for is likely to be
3757		 * relatively short, the complexity of Rabin-Karp or similar
3758		 * hardly seems merited.)
3759		 */
3760		char *addr = (char *)(uintptr_t)tupregs[0].dttk_value;
3761		char *substr = (char *)(uintptr_t)tupregs[1].dttk_value;
3762		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3763		size_t len = dtrace_strlen(addr, size);
3764		size_t sublen = dtrace_strlen(substr, size);
3765		char *limit = addr + len, *orig = addr;
3766		int notfound = subr == DIF_SUBR_STRSTR ? 0 : -1;
3767		int inc = 1;
3768
3769		regs[rd] = notfound;
3770
3771		if (!dtrace_canload((uintptr_t)addr, len + 1, mstate, vstate)) {
3772			regs[rd] = 0;
3773			break;
3774		}
3775
3776		if (!dtrace_canload((uintptr_t)substr, sublen + 1, mstate,
3777		    vstate)) {
3778			regs[rd] = 0;
3779			break;
3780		}
3781
3782		/*
3783		 * strstr() and index()/rindex() have similar semantics if
3784		 * both strings are the empty string: strstr() returns a
3785		 * pointer to the (empty) string, and index() and rindex()
3786		 * both return index 0 (regardless of any position argument).
3787		 */
3788		if (sublen == 0 && len == 0) {
3789			if (subr == DIF_SUBR_STRSTR)
3790				regs[rd] = (uintptr_t)addr;
3791			else
3792				regs[rd] = 0;
3793			break;
3794		}
3795
3796		if (subr != DIF_SUBR_STRSTR) {
3797			if (subr == DIF_SUBR_RINDEX) {
3798				limit = orig - 1;
3799				addr += len;
3800				inc = -1;
3801			}
3802
3803			/*
3804			 * Both index() and rindex() take an optional position
3805			 * argument that denotes the starting position.
3806			 */
3807			if (nargs == 3) {
3808				int64_t pos = (int64_t)tupregs[2].dttk_value;
3809
3810				/*
3811				 * If the position argument to index() is
3812				 * negative, Perl implicitly clamps it at
3813				 * zero.  This semantic is a little surprising
3814				 * given the special meaning of negative
3815				 * positions to similar Perl functions like
3816				 * substr(), but it appears to reflect a
3817				 * notion that index() can start from a
3818				 * negative index and increment its way up to
3819				 * the string.  Given this notion, Perl's
3820				 * rindex() is at least self-consistent in
3821				 * that it implicitly clamps positions greater
3822				 * than the string length to be the string
3823				 * length.  Where Perl completely loses
3824				 * coherence, however, is when the specified
3825				 * substring is the empty string ("").  In
3826				 * this case, even if the position is
3827				 * negative, rindex() returns 0 -- and even if
3828				 * the position is greater than the length,
3829				 * index() returns the string length.  These
3830				 * semantics violate the notion that index()
3831				 * should never return a value less than the
3832				 * specified position and that rindex() should
3833				 * never return a value greater than the
3834				 * specified position.  (One assumes that
3835				 * these semantics are artifacts of Perl's
3836				 * implementation and not the results of
3837				 * deliberate design -- it beggars belief that
3838				 * even Larry Wall could desire such oddness.)
3839				 * While in the abstract one would wish for
3840				 * consistent position semantics across
3841				 * substr(), index() and rindex() -- or at the
3842				 * very least self-consistent position
3843				 * semantics for index() and rindex() -- we
3844				 * instead opt to keep with the extant Perl
3845				 * semantics, in all their broken glory.  (Do
3846				 * we have more desire to maintain Perl's
3847				 * semantics than Perl does?  Probably.)
3848				 */
3849				if (subr == DIF_SUBR_RINDEX) {
3850					if (pos < 0) {
3851						if (sublen == 0)
3852							regs[rd] = 0;
3853						break;
3854					}
3855
3856					if (pos > len)
3857						pos = len;
3858				} else {
3859					if (pos < 0)
3860						pos = 0;
3861
3862					if (pos >= len) {
3863						if (sublen == 0)
3864							regs[rd] = len;
3865						break;
3866					}
3867				}
3868
3869				addr = orig + pos;
3870			}
3871		}
3872
3873		for (regs[rd] = notfound; addr != limit; addr += inc) {
3874			if (dtrace_strncmp(addr, substr, sublen) == 0) {
3875				if (subr != DIF_SUBR_STRSTR) {
3876					/*
3877					 * As D index() and rindex() are
3878					 * modeled on Perl (and not on awk),
3879					 * we return a zero-based (and not a
3880					 * one-based) index.  (For you Perl
3881					 * weenies: no, we're not going to add
3882					 * $[ -- and shouldn't you be at a con
3883					 * or something?)
3884					 */
3885					regs[rd] = (uintptr_t)(addr - orig);
3886					break;
3887				}
3888
3889				ASSERT(subr == DIF_SUBR_STRSTR);
3890				regs[rd] = (uintptr_t)addr;
3891				break;
3892			}
3893		}
3894
3895		break;
3896	}
3897
3898	case DIF_SUBR_STRTOK: {
3899		uintptr_t addr = tupregs[0].dttk_value;
3900		uintptr_t tokaddr = tupregs[1].dttk_value;
3901		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3902		uintptr_t limit, toklimit = tokaddr + size;
3903		uint8_t c = 0, tokmap[32];	 /* 256 / 8 */
3904		char *dest = (char *)mstate->dtms_scratch_ptr;
3905		int i;
3906
3907		/*
3908		 * Check both the token buffer and (later) the input buffer,
3909		 * since both could be non-scratch addresses.
3910		 */
3911		if (!dtrace_strcanload(tokaddr, size, mstate, vstate)) {
3912			regs[rd] = 0;
3913			break;
3914		}
3915
3916		if (!DTRACE_INSCRATCH(mstate, size)) {
3917			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3918			regs[rd] = 0;
3919			break;
3920		}
3921
3922		if (addr == 0) {
3923			/*
3924			 * If the address specified is NULL, we use our saved
3925			 * strtok pointer from the mstate.  Note that this
3926			 * means that the saved strtok pointer is _only_
3927			 * valid within multiple enablings of the same probe --
3928			 * it behaves like an implicit clause-local variable.
3929			 */
3930			addr = mstate->dtms_strtok;
3931		} else {
3932			/*
3933			 * If the user-specified address is non-NULL we must
3934			 * access check it.  This is the only time we have
3935			 * a chance to do so, since this address may reside
			 * in the string table of this clause -- future calls
3937			 * (when we fetch addr from mstate->dtms_strtok)
3938			 * would fail this access check.
3939			 */
3940			if (!dtrace_strcanload(addr, size, mstate, vstate)) {
3941				regs[rd] = 0;
3942				break;
3943			}
3944		}
3945
3946		/*
3947		 * First, zero the token map, and then process the token
3948		 * string -- setting a bit in the map for every character
3949		 * found in the token string.
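		 * (For instance, a token character 'a' -- 0x61 -- sets bit 1
		 * of tokmap[12], since 0x61 >> 3 == 12 and 0x61 & 0x7 == 1.)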
3950		 */
3951		for (i = 0; i < sizeof (tokmap); i++)
3952			tokmap[i] = 0;
3953
3954		for (; tokaddr < toklimit; tokaddr++) {
3955			if ((c = dtrace_load8(tokaddr)) == '\0')
3956				break;
3957
3958			ASSERT((c >> 3) < sizeof (tokmap));
3959			tokmap[c >> 3] |= (1 << (c & 0x7));
3960		}
3961
3962		for (limit = addr + size; addr < limit; addr++) {
3963			/*
3964			 * We're looking for a character that is _not_ contained
3965			 * in the token string.
3966			 */
3967			if ((c = dtrace_load8(addr)) == '\0')
3968				break;
3969
3970			if (!(tokmap[c >> 3] & (1 << (c & 0x7))))
3971				break;
3972		}
3973
3974		if (c == '\0') {
3975			/*
3976			 * We reached the end of the string without finding
3977			 * any character that was not in the token string.
3978			 * We return NULL in this case, and we set the saved
3979			 * address to NULL as well.
3980			 */
3981			regs[rd] = 0;
3982			mstate->dtms_strtok = 0;
3983			break;
3984		}
3985
3986		/*
3987		 * From here on, we're copying into the destination string.
3988		 */
3989		for (i = 0; addr < limit && i < size - 1; addr++) {
3990			if ((c = dtrace_load8(addr)) == '\0')
3991				break;
3992
3993			if (tokmap[c >> 3] & (1 << (c & 0x7)))
3994				break;
3995
3996			ASSERT(i < size);
3997			dest[i++] = c;
3998		}
3999
4000		ASSERT(i < size);
4001		dest[i] = '\0';
4002		regs[rd] = (uintptr_t)dest;
4003		mstate->dtms_scratch_ptr += size;
4004		mstate->dtms_strtok = addr;
4005		break;
4006	}
4007
4008	case DIF_SUBR_SUBSTR: {
4009		uintptr_t s = tupregs[0].dttk_value;
4010		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4011		char *d = (char *)mstate->dtms_scratch_ptr;
4012		int64_t index = (int64_t)tupregs[1].dttk_value;
4013		int64_t remaining = (int64_t)tupregs[2].dttk_value;
4014		size_t len = dtrace_strlen((char *)s, size);
4015		int64_t i = 0;
4016
4017		if (!dtrace_canload(s, len + 1, mstate, vstate)) {
4018			regs[rd] = 0;
4019			break;
4020		}
4021
4022		if (!DTRACE_INSCRATCH(mstate, size)) {
4023			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4024			regs[rd] = 0;
4025			break;
4026		}
4027
4028		if (nargs <= 2)
4029			remaining = (int64_t)size;
4030
4031		if (index < 0) {
4032			index += len;
4033
4034			if (index < 0 && index + remaining > 0) {
4035				remaining += index;
4036				index = 0;
4037			}
4038		}
4039
4040		if (index >= len || index < 0) {
4041			remaining = 0;
4042		} else if (remaining < 0) {
4043			remaining += len - index;
4044		} else if (index + remaining > size) {
4045			remaining = size - index;
4046		}
4047
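		/*
		 * As an illustrative trace of the clamping above: for
		 * substr("hello", -3, 2), len is 5, so index becomes
		 * -3 + 5 == 2 and remaining stays 2 -- the loop below
		 * copies "ll".
		 */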
4048		for (i = 0; i < remaining; i++) {
4049			if ((d[i] = dtrace_load8(s + index + i)) == '\0')
4050				break;
4051		}
4052
4053		d[i] = '\0';
4054
4055		mstate->dtms_scratch_ptr += size;
4056		regs[rd] = (uintptr_t)d;
4057		break;
4058	}
4059
4060	case DIF_SUBR_TOUPPER:
4061	case DIF_SUBR_TOLOWER: {
4062		uintptr_t s = tupregs[0].dttk_value;
4063		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4064		char *dest = (char *)mstate->dtms_scratch_ptr, c;
4065		size_t len = dtrace_strlen((char *)s, size);
4066		char lower, upper, convert;
4067		int64_t i;
4068
4069		if (subr == DIF_SUBR_TOUPPER) {
4070			lower = 'a';
4071			upper = 'z';
4072			convert = 'A';
4073		} else {
4074			lower = 'A';
4075			upper = 'Z';
4076			convert = 'a';
4077		}
4078
4079		if (!dtrace_canload(s, len + 1, mstate, vstate)) {
4080			regs[rd] = 0;
4081			break;
4082		}
4083
4084		if (!DTRACE_INSCRATCH(mstate, size)) {
4085			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4086			regs[rd] = 0;
4087			break;
4088		}
4089
4090		for (i = 0; i < size - 1; i++) {
4091			if ((c = dtrace_load8(s + i)) == '\0')
4092				break;
4093
4094			if (c >= lower && c <= upper)
4095				c = convert + (c - lower);
4096
4097			dest[i] = c;
4098		}
4099
4100		ASSERT(i < size);
4101		dest[i] = '\0';
4102		regs[rd] = (uintptr_t)dest;
4103		mstate->dtms_scratch_ptr += size;
4104		break;
4105	}
4106
4107#if defined(sun)
4108	case DIF_SUBR_GETMAJOR:
4109#ifdef _LP64
4110		regs[rd] = (tupregs[0].dttk_value >> NBITSMINOR64) & MAXMAJ64;
4111#else
4112		regs[rd] = (tupregs[0].dttk_value >> NBITSMINOR) & MAXMAJ;
4113#endif
4114		break;
4115
4116	case DIF_SUBR_GETMINOR:
4117#ifdef _LP64
4118		regs[rd] = tupregs[0].dttk_value & MAXMIN64;
4119#else
4120		regs[rd] = tupregs[0].dttk_value & MAXMIN;
4121#endif
4122		break;
4123
4124	case DIF_SUBR_DDI_PATHNAME: {
4125		/*
4126		 * This one is a galactic mess.  We are going to roughly
4127		 * emulate ddi_pathname(), but it's made more complicated
4128		 * by the fact that we (a) want to include the minor name and
4129		 * (b) must proceed iteratively instead of recursively.
4130		 */
4131		uintptr_t dest = mstate->dtms_scratch_ptr;
4132		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4133		char *start = (char *)dest, *end = start + size - 1;
4134		uintptr_t daddr = tupregs[0].dttk_value;
4135		int64_t minor = (int64_t)tupregs[1].dttk_value;
4136		char *s;
4137		int i, len, depth = 0;
4138
4139		/*
		 * Due to all the pointer jumping we do and the context we
		 * must rely upon, we simply mandate that the user have
		 * kernel read privileges to use this routine.
4143		 */
4144		if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) == 0) {
4145			*flags |= CPU_DTRACE_KPRIV;
4146			*illval = daddr;
4147			regs[rd] = 0;
4148		}
4149
4150		if (!DTRACE_INSCRATCH(mstate, size)) {
4151			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4152			regs[rd] = 0;
4153			break;
4154		}
4155
4156		*end = '\0';
4157
4158		/*
4159		 * We want to have a name for the minor.  In order to do this,
4160		 * we need to walk the minor list from the devinfo.  We want
4161		 * to be sure that we don't infinitely walk a circular list,
4162		 * so we check for circularity by sending a scout pointer
4163		 * ahead two elements for every element that we iterate over;
4164		 * if the list is circular, these will ultimately point to the
4165		 * same element.  You may recognize this little trick as the
4166		 * answer to a stupid interview question -- one that always
4167		 * seems to be asked by those who had to have it laboriously
4168		 * explained to them, and who can't even concisely describe
4169		 * the conditions under which one would be forced to resort to
4170		 * this technique.  Needless to say, those conditions are
4171		 * found here -- and probably only here.  Is this the only use
4172		 * of this infamous trick in shipping, production code?  If it
4173		 * isn't, it probably should be...
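		 * (Concretely: the scout advances two links for every one
		 * that maddr advances, so if the list is circular the two
		 * pointers must eventually coincide -- the usual
		 * tortoise-and-hare argument.)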
4174		 */
4175		if (minor != -1) {
4176			uintptr_t maddr = dtrace_loadptr(daddr +
4177			    offsetof(struct dev_info, devi_minor));
4178
4179			uintptr_t next = offsetof(struct ddi_minor_data, next);
4180			uintptr_t name = offsetof(struct ddi_minor_data,
4181			    d_minor) + offsetof(struct ddi_minor, name);
4182			uintptr_t dev = offsetof(struct ddi_minor_data,
4183			    d_minor) + offsetof(struct ddi_minor, dev);
4184			uintptr_t scout;
4185
4186			if (maddr != NULL)
4187				scout = dtrace_loadptr(maddr + next);
4188
4189			while (maddr != NULL && !(*flags & CPU_DTRACE_FAULT)) {
4190				uint64_t m;
4191#ifdef _LP64
4192				m = dtrace_load64(maddr + dev) & MAXMIN64;
4193#else
4194				m = dtrace_load32(maddr + dev) & MAXMIN;
4195#endif
4196				if (m != minor) {
4197					maddr = dtrace_loadptr(maddr + next);
4198
4199					if (scout == NULL)
4200						continue;
4201
4202					scout = dtrace_loadptr(scout + next);
4203
4204					if (scout == NULL)
4205						continue;
4206
4207					scout = dtrace_loadptr(scout + next);
4208
4209					if (scout == NULL)
4210						continue;
4211
4212					if (scout == maddr) {
4213						*flags |= CPU_DTRACE_ILLOP;
4214						break;
4215					}
4216
4217					continue;
4218				}
4219
4220				/*
4221				 * We have the minor data.  Now we need to
4222				 * copy the minor's name into the end of the
4223				 * pathname.
4224				 */
4225				s = (char *)dtrace_loadptr(maddr + name);
4226				len = dtrace_strlen(s, size);
4227
4228				if (*flags & CPU_DTRACE_FAULT)
4229					break;
4230
4231				if (len != 0) {
4232					if ((end -= (len + 1)) < start)
4233						break;
4234
4235					*end = ':';
4236				}
4237
4238				for (i = 1; i <= len; i++)
4239					end[i] = dtrace_load8((uintptr_t)s++);
4240				break;
4241			}
4242		}
4243
4244		while (daddr != NULL && !(*flags & CPU_DTRACE_FAULT)) {
4245			ddi_node_state_t devi_state;
4246
4247			devi_state = dtrace_load32(daddr +
4248			    offsetof(struct dev_info, devi_node_state));
4249
4250			if (*flags & CPU_DTRACE_FAULT)
4251				break;
4252
4253			if (devi_state >= DS_INITIALIZED) {
4254				s = (char *)dtrace_loadptr(daddr +
4255				    offsetof(struct dev_info, devi_addr));
4256				len = dtrace_strlen(s, size);
4257
4258				if (*flags & CPU_DTRACE_FAULT)
4259					break;
4260
4261				if (len != 0) {
4262					if ((end -= (len + 1)) < start)
4263						break;
4264
4265					*end = '@';
4266				}
4267
4268				for (i = 1; i <= len; i++)
4269					end[i] = dtrace_load8((uintptr_t)s++);
4270			}
4271
4272			/*
4273			 * Now for the node name...
4274			 */
4275			s = (char *)dtrace_loadptr(daddr +
4276			    offsetof(struct dev_info, devi_node_name));
4277
4278			daddr = dtrace_loadptr(daddr +
4279			    offsetof(struct dev_info, devi_parent));
4280
4281			/*
4282			 * If our parent is NULL (that is, if we're the root
4283			 * node), we're going to use the special path
4284			 * "devices".
4285			 */
4286			if (daddr == 0)
4287				s = "devices";
4288
4289			len = dtrace_strlen(s, size);
4290			if (*flags & CPU_DTRACE_FAULT)
4291				break;
4292
4293			if ((end -= (len + 1)) < start)
4294				break;
4295
4296			for (i = 1; i <= len; i++)
4297				end[i] = dtrace_load8((uintptr_t)s++);
4298			*end = '/';
4299
4300			if (depth++ > dtrace_devdepth_max) {
4301				*flags |= CPU_DTRACE_ILLOP;
4302				break;
4303			}
4304		}
4305
4306		if (end < start)
4307			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4308
4309		if (daddr == 0) {
4310			regs[rd] = (uintptr_t)end;
4311			mstate->dtms_scratch_ptr += size;
4312		}
4313
4314		break;
4315	}
4316#endif
4317
4318	case DIF_SUBR_STRJOIN: {
4319		char *d = (char *)mstate->dtms_scratch_ptr;
4320		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4321		uintptr_t s1 = tupregs[0].dttk_value;
4322		uintptr_t s2 = tupregs[1].dttk_value;
4323		int i = 0;
4324
4325		if (!dtrace_strcanload(s1, size, mstate, vstate) ||
4326		    !dtrace_strcanload(s2, size, mstate, vstate)) {
4327			regs[rd] = 0;
4328			break;
4329		}
4330
4331		if (!DTRACE_INSCRATCH(mstate, size)) {
4332			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4333			regs[rd] = 0;
4334			break;
4335		}
4336
4337		for (;;) {
4338			if (i >= size) {
4339				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4340				regs[rd] = 0;
4341				break;
4342			}
4343
4344			if ((d[i++] = dtrace_load8(s1++)) == '\0') {
4345				i--;
4346				break;
4347			}
4348		}
4349
4350		for (;;) {
4351			if (i >= size) {
4352				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4353				regs[rd] = 0;
4354				break;
4355			}
4356
4357			if ((d[i++] = dtrace_load8(s2++)) == '\0')
4358				break;
4359		}
4360
4361		if (i < size) {
4362			mstate->dtms_scratch_ptr += i;
4363			regs[rd] = (uintptr_t)d;
4364		}
4365
4366		break;
4367	}
4368
4369	case DIF_SUBR_LLTOSTR: {
4370		int64_t i = (int64_t)tupregs[0].dttk_value;
4371		uint64_t val, digit;
4372		uint64_t size = 65;	/* enough room for 2^64 in binary */
4373		char *end = (char *)mstate->dtms_scratch_ptr + size - 1;
4374		int base = 10;
4375
4376		if (nargs > 1) {
4377			if ((base = tupregs[1].dttk_value) <= 1 ||
4378			    base > ('z' - 'a' + 1) + ('9' - '0' + 1)) {
4379				*flags |= CPU_DTRACE_ILLOP;
4380				break;
4381			}
4382		}
4383
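		/*
		 * Illustrative results of the rules below: lltostr(-10)
		 * yields "-10" (base 10 is the only signed case), and
		 * lltostr(255, 16) yields "0xff" (base 16 gains an "0x"
		 * prefix, base 8 a leading "0").
		 */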
4384		val = (base == 10 && i < 0) ? i * -1 : i;
4385
4386		if (!DTRACE_INSCRATCH(mstate, size)) {
4387			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4388			regs[rd] = 0;
4389			break;
4390		}
4391
4392		for (*end-- = '\0'; val; val /= base) {
4393			if ((digit = val % base) <= '9' - '0') {
4394				*end-- = '0' + digit;
4395			} else {
4396				*end-- = 'a' + (digit - ('9' - '0') - 1);
4397			}
4398		}
4399
4400		if (i == 0 && base == 16)
4401			*end-- = '0';
4402
4403		if (base == 16)
4404			*end-- = 'x';
4405
4406		if (i == 0 || base == 8 || base == 16)
4407			*end-- = '0';
4408
4409		if (i < 0 && base == 10)
4410			*end-- = '-';
4411
4412		regs[rd] = (uintptr_t)end + 1;
4413		mstate->dtms_scratch_ptr += size;
4414		break;
4415	}
4416
4417	case DIF_SUBR_HTONS:
4418	case DIF_SUBR_NTOHS:
4419#if BYTE_ORDER == BIG_ENDIAN
4420		regs[rd] = (uint16_t)tupregs[0].dttk_value;
4421#else
4422		regs[rd] = DT_BSWAP_16((uint16_t)tupregs[0].dttk_value);
4423#endif
4424		break;
4425
	case DIF_SUBR_HTONL:
4428	case DIF_SUBR_NTOHL:
4429#if BYTE_ORDER == BIG_ENDIAN
4430		regs[rd] = (uint32_t)tupregs[0].dttk_value;
4431#else
4432		regs[rd] = DT_BSWAP_32((uint32_t)tupregs[0].dttk_value);
4433#endif
4434		break;
4435
	case DIF_SUBR_HTONLL:
4438	case DIF_SUBR_NTOHLL:
4439#if BYTE_ORDER == BIG_ENDIAN
4440		regs[rd] = (uint64_t)tupregs[0].dttk_value;
4441#else
4442		regs[rd] = DT_BSWAP_64((uint64_t)tupregs[0].dttk_value);
4443#endif
4444		break;
4445
	case DIF_SUBR_DIRNAME:
4448	case DIF_SUBR_BASENAME: {
4449		char *dest = (char *)mstate->dtms_scratch_ptr;
4450		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4451		uintptr_t src = tupregs[0].dttk_value;
4452		int i, j, len = dtrace_strlen((char *)src, size);
4453		int lastbase = -1, firstbase = -1, lastdir = -1;
4454		int start, end;
4455
4456		if (!dtrace_canload(src, len + 1, mstate, vstate)) {
4457			regs[rd] = 0;
4458			break;
4459		}
4460
4461		if (!DTRACE_INSCRATCH(mstate, size)) {
4462			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4463			regs[rd] = 0;
4464			break;
4465		}
4466
4467		/*
		 * The basename and dirname of a zero-length string are
		 * defined to be ".".
4470		 */
4471		if (len == 0) {
4472			len = 1;
4473			src = (uintptr_t)".";
4474		}
4475
4476		/*
4477		 * Start from the back of the string, moving back toward the
4478		 * front until we see a character that isn't a slash.  That
4479		 * character is the last character in the basename.
4480		 */
4481		for (i = len - 1; i >= 0; i--) {
4482			if (dtrace_load8(src + i) != '/')
4483				break;
4484		}
4485
4486		if (i >= 0)
4487			lastbase = i;
4488
4489		/*
4490		 * Starting from the last character in the basename, move
4491		 * towards the front until we find a slash.  The character
4492		 * that we processed immediately before that is the first
4493		 * character in the basename.
4494		 */
4495		for (; i >= 0; i--) {
4496			if (dtrace_load8(src + i) == '/')
4497				break;
4498		}
4499
4500		if (i >= 0)
4501			firstbase = i + 1;
4502
4503		/*
4504		 * Now keep going until we find a non-slash character.  That
4505		 * character is the last character in the dirname.
4506		 */
4507		for (; i >= 0; i--) {
4508			if (dtrace_load8(src + i) != '/')
4509				break;
4510		}
4511
4512		if (i >= 0)
4513			lastdir = i;
4514
4515		ASSERT(!(lastbase == -1 && firstbase != -1));
4516		ASSERT(!(firstbase == -1 && lastdir != -1));
4517
4518		if (lastbase == -1) {
4519			/*
4520			 * We didn't find a non-slash character.  We know that
4521			 * the length is non-zero, so the whole string must be
4522			 * slashes.  In either the dirname or the basename
4523			 * case, we return '/'.
4524			 */
4525			ASSERT(firstbase == -1);
4526			firstbase = lastbase = lastdir = 0;
4527		}
4528
4529		if (firstbase == -1) {
4530			/*
4531			 * The entire string consists only of a basename
4532			 * component.  If we're looking for dirname, we need
4533			 * to change our string to be just "."; if we're
4534			 * looking for a basename, we'll just set the first
4535			 * character of the basename to be 0.
4536			 */
4537			if (subr == DIF_SUBR_DIRNAME) {
4538				ASSERT(lastdir == -1);
4539				src = (uintptr_t)".";
4540				lastdir = 0;
4541			} else {
4542				firstbase = 0;
4543			}
4544		}
4545
4546		if (subr == DIF_SUBR_DIRNAME) {
4547			if (lastdir == -1) {
4548				/*
4549				 * We know that we have a slash in the name --
4550				 * or lastdir would be set to 0, above.  And
4551				 * because lastdir is -1, we know that this
4552				 * slash must be the first character.  (That
4553				 * is, the full string must be of the form
4554				 * "/basename".)  In this case, the last
4555				 * character of the directory name is 0.
4556				 */
4557				lastdir = 0;
4558			}
4559
4560			start = 0;
4561			end = lastdir;
4562		} else {
4563			ASSERT(subr == DIF_SUBR_BASENAME);
4564			ASSERT(firstbase != -1 && lastbase != -1);
4565			start = firstbase;
4566			end = lastbase;
4567		}
4568
4569		for (i = start, j = 0; i <= end && j < size - 1; i++, j++)
4570			dest[j] = dtrace_load8(src + i);
4571
4572		dest[j] = '\0';
4573		regs[rd] = (uintptr_t)dest;
4574		mstate->dtms_scratch_ptr += size;
4575		break;
4576	}
4577
4578	case DIF_SUBR_CLEANPATH: {
4579		char *dest = (char *)mstate->dtms_scratch_ptr, c;
4580		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4581		uintptr_t src = tupregs[0].dttk_value;
4582		int i = 0, j = 0;
4583
4584		if (!dtrace_strcanload(src, size, mstate, vstate)) {
4585			regs[rd] = 0;
4586			break;
4587		}
4588
4589		if (!DTRACE_INSCRATCH(mstate, size)) {
4590			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4591			regs[rd] = 0;
4592			break;
4593		}
4594
4595		/*
4596		 * Move forward, loading each character.
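		 * (Illustratively, "/a//b/./c" collapses to "/a/b/c", and
		 * "/a/.." collapses to "/".)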
4597		 */
4598		do {
4599			c = dtrace_load8(src + i++);
4600next:
			if (j + 5 >= size)	/* 5 = sizeof ("/..c") */
4602				break;
4603
4604			if (c != '/') {
4605				dest[j++] = c;
4606				continue;
4607			}
4608
4609			c = dtrace_load8(src + i++);
4610
4611			if (c == '/') {
4612				/*
4613				 * We have two slashes -- we can just advance
4614				 * to the next character.
4615				 */
4616				goto next;
4617			}
4618
4619			if (c != '.') {
4620				/*
4621				 * This is not "." and it's not ".." -- we can
4622				 * just store the "/" and this character and
4623				 * drive on.
4624				 */
4625				dest[j++] = '/';
4626				dest[j++] = c;
4627				continue;
4628			}
4629
4630			c = dtrace_load8(src + i++);
4631
4632			if (c == '/') {
4633				/*
4634				 * This is a "/./" component.  We're not going
4635				 * to store anything in the destination buffer;
4636				 * we're just going to go to the next component.
4637				 */
4638				goto next;
4639			}
4640
4641			if (c != '.') {
4642				/*
4643				 * This is not ".." -- we can just store the
4644				 * "/." and this character and continue
4645				 * processing.
4646				 */
4647				dest[j++] = '/';
4648				dest[j++] = '.';
4649				dest[j++] = c;
4650				continue;
4651			}
4652
4653			c = dtrace_load8(src + i++);
4654
4655			if (c != '/' && c != '\0') {
4656				/*
4657				 * This is not ".." -- it's "..[mumble]".
4658				 * We'll store the "/.." and this character
4659				 * and continue processing.
4660				 */
4661				dest[j++] = '/';
4662				dest[j++] = '.';
4663				dest[j++] = '.';
4664				dest[j++] = c;
4665				continue;
4666			}
4667
4668			/*
4669			 * This is "/../" or "/..\0".  We need to back up
4670			 * our destination pointer until we find a "/".
4671			 */
4672			i--;
4673			while (j != 0 && dest[--j] != '/')
4674				continue;
4675
4676			if (c == '\0')
4677				dest[++j] = '/';
4678		} while (c != '\0');
4679
4680		dest[j] = '\0';
4681		regs[rd] = (uintptr_t)dest;
4682		mstate->dtms_scratch_ptr += size;
4683		break;
4684	}
4685
4686	case DIF_SUBR_INET_NTOA:
4687	case DIF_SUBR_INET_NTOA6:
4688	case DIF_SUBR_INET_NTOP: {
4689		size_t size;
4690		int af, argi, i;
4691		char *base, *end;
4692
4693		if (subr == DIF_SUBR_INET_NTOP) {
4694			af = (int)tupregs[0].dttk_value;
4695			argi = 1;
4696		} else {
4697			af = subr == DIF_SUBR_INET_NTOA ? AF_INET: AF_INET6;
4698			argi = 0;
4699		}
4700
4701		if (af == AF_INET) {
4702			ipaddr_t ip4;
4703			uint8_t *ptr8, val;
4704
4705			/*
4706			 * Safely load the IPv4 address.
4707			 */
4708			ip4 = dtrace_load32(tupregs[argi].dttk_value);
4709
4710			/*
			 * Check that an IPv4 string will fit in scratch.
4712			 */
4713			size = INET_ADDRSTRLEN;
4714			if (!DTRACE_INSCRATCH(mstate, size)) {
4715				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4716				regs[rd] = 0;
4717				break;
4718			}
4719			base = (char *)mstate->dtms_scratch_ptr;
4720			end = (char *)mstate->dtms_scratch_ptr + size - 1;
4721
4722			/*
4723			 * Stringify as a dotted decimal quad.
4724			 */
4725			*end-- = '\0';
4726			ptr8 = (uint8_t *)&ip4;
4727			for (i = 3; i >= 0; i--) {
4728				val = ptr8[i];
4729
4730				if (val == 0) {
4731					*end-- = '0';
4732				} else {
4733					for (; val; val /= 10) {
4734						*end-- = '0' + (val % 10);
4735					}
4736				}
4737
4738				if (i > 0)
4739					*end-- = '.';
4740			}
4741			ASSERT(end + 1 >= base);
4742
4743		} else if (af == AF_INET6) {
4744			struct in6_addr ip6;
4745			int firstzero, tryzero, numzero, v6end;
4746			uint16_t val;
4747			const char digits[] = "0123456789abcdef";
4748
4749			/*
			 * Stringify using RFC 1884 convention 2 -- 16-bit
			 * hexadecimal values with a zero-run compression.
			 * Lower-case hexadecimal digits are used.
			 * 	e.g., fe80::214:4fff:fe0b:76c8.
			 * The IPv4 embedded form is returned for inet_ntop;
			 * just the IPv4 string is returned for inet_ntoa6.
4756			 */
4757
4758			/*
4759			 * Safely load the IPv6 address.
4760			 */
4761			dtrace_bcopy(
4762			    (void *)(uintptr_t)tupregs[argi].dttk_value,
4763			    (void *)(uintptr_t)&ip6, sizeof (struct in6_addr));
4764
4765			/*
			 * Check that an IPv6 string will fit in scratch.
4767			 */
4768			size = INET6_ADDRSTRLEN;
4769			if (!DTRACE_INSCRATCH(mstate, size)) {
4770				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4771				regs[rd] = 0;
4772				break;
4773			}
4774			base = (char *)mstate->dtms_scratch_ptr;
4775			end = (char *)mstate->dtms_scratch_ptr + size - 1;
4776			*end-- = '\0';
4777
4778			/*
4779			 * Find the longest run of 16 bit zero values
4780			 * for the single allowed zero compression - "::".
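			 * firstzero and numzero are byte offsets into the
			 * 16-byte address; only runs beginning on a 16-bit
			 * boundary (i % 2 == 0 below) are candidates, and
			 * only the longest such run is compressed.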
4781			 */
4782			firstzero = -1;
4783			tryzero = -1;
4784			numzero = 1;
4785			for (i = 0; i < sizeof (struct in6_addr); i++) {
4786#if defined(sun)
4787				if (ip6._S6_un._S6_u8[i] == 0 &&
4788#else
4789				if (ip6.__u6_addr.__u6_addr8[i] == 0 &&
4790#endif
4791				    tryzero == -1 && i % 2 == 0) {
4792					tryzero = i;
4793					continue;
4794				}
4795
4796				if (tryzero != -1 &&
4797#if defined(sun)
4798				    (ip6._S6_un._S6_u8[i] != 0 ||
4799#else
4800				    (ip6.__u6_addr.__u6_addr8[i] != 0 ||
4801#endif
4802				    i == sizeof (struct in6_addr) - 1)) {
4803
4804					if (i - tryzero <= numzero) {
4805						tryzero = -1;
4806						continue;
4807					}
4808
4809					firstzero = tryzero;
4810					numzero = i - i % 2 - tryzero;
4811					tryzero = -1;
4812
4813#if defined(sun)
4814					if (ip6._S6_un._S6_u8[i] == 0 &&
4815#else
4816					if (ip6.__u6_addr.__u6_addr8[i] == 0 &&
4817#endif
4818					    i == sizeof (struct in6_addr) - 1)
4819						numzero += 2;
4820				}
4821			}
4822			ASSERT(firstzero + numzero <= sizeof (struct in6_addr));
4823
4824			/*
4825			 * Check for an IPv4 embedded address.
4826			 */
4827			v6end = sizeof (struct in6_addr) - 2;
4828			if (IN6_IS_ADDR_V4MAPPED(&ip6) ||
4829			    IN6_IS_ADDR_V4COMPAT(&ip6)) {
4830				for (i = sizeof (struct in6_addr) - 1;
4831				    i >= DTRACE_V4MAPPED_OFFSET; i--) {
4832					ASSERT(end >= base);
4833
4834#if defined(sun)
4835					val = ip6._S6_un._S6_u8[i];
4836#else
4837					val = ip6.__u6_addr.__u6_addr8[i];
4838#endif
4839
4840					if (val == 0) {
4841						*end-- = '0';
4842					} else {
4843						for (; val; val /= 10) {
4844							*end-- = '0' + val % 10;
4845						}
4846					}
4847
4848					if (i > DTRACE_V4MAPPED_OFFSET)
4849						*end-- = '.';
4850				}
4851
4852				if (subr == DIF_SUBR_INET_NTOA6)
4853					goto inetout;
4854
4855				/*
4856				 * Set v6end to skip the IPv4 address that
4857				 * we have already stringified.
4858				 */
4859				v6end = 10;
4860			}
4861
4862			/*
4863			 * Build the IPv6 string by working through the
4864			 * address in reverse.
4865			 */
4866			for (i = v6end; i >= 0; i -= 2) {
4867				ASSERT(end >= base);
4868
4869				if (i == firstzero + numzero - 2) {
4870					*end-- = ':';
4871					*end-- = ':';
4872					i -= numzero - 2;
4873					continue;
4874				}
4875
4876				if (i < 14 && i != firstzero - 2)
4877					*end-- = ':';
4878
4879#if defined(sun)
4880				val = (ip6._S6_un._S6_u8[i] << 8) +
4881				    ip6._S6_un._S6_u8[i + 1];
4882#else
4883				val = (ip6.__u6_addr.__u6_addr8[i] << 8) +
4884				    ip6.__u6_addr.__u6_addr8[i + 1];
4885#endif
4886
4887				if (val == 0) {
4888					*end-- = '0';
4889				} else {
4890					for (; val; val /= 16) {
4891						*end-- = digits[val % 16];
4892					}
4893				}
4894			}
4895			ASSERT(end + 1 >= base);
4896
4897		} else {
4898			/*
			 * The user didn't use AF_INET or AF_INET6.
4900			 */
4901			DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
4902			regs[rd] = 0;
4903			break;
4904		}
4905
4906inetout:	regs[rd] = (uintptr_t)end + 1;
4907		mstate->dtms_scratch_ptr += size;
4908		break;
4909	}
4910
4911	case DIF_SUBR_MEMREF: {
		uintptr_t size = 2 * sizeof (uintptr_t);
		uintptr_t *memref = (uintptr_t *)P2ROUNDUP(
		    mstate->dtms_scratch_ptr, sizeof (uintptr_t));
		size_t scratch_size = ((uintptr_t)memref -
		    mstate->dtms_scratch_ptr) + size;

		/* Refuse the store if the pair won't fit in scratch. */
		if (!DTRACE_INSCRATCH(mstate, scratch_size)) {
			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
			regs[rd] = 0;
			break;
		}

		/* address and length */
		memref[0] = tupregs[0].dttk_value;
		memref[1] = tupregs[1].dttk_value;

		regs[rd] = (uintptr_t)memref;
		mstate->dtms_scratch_ptr += scratch_size;
4922		break;
4923	}
4924
4925	case DIF_SUBR_TYPEREF: {
		uintptr_t size = 4 * sizeof (uintptr_t);
		uintptr_t *typeref = (uintptr_t *)P2ROUNDUP(
		    mstate->dtms_scratch_ptr, sizeof (uintptr_t));
		size_t scratch_size = ((uintptr_t)typeref -
		    mstate->dtms_scratch_ptr) + size;

		/* Refuse the store if the quad won't fit in scratch. */
		if (!DTRACE_INSCRATCH(mstate, scratch_size)) {
			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
			regs[rd] = 0;
			break;
		}

		/* address, num_elements, type_str, type_len */
		typeref[0] = tupregs[0].dttk_value;
		typeref[1] = tupregs[1].dttk_value;
		typeref[2] = tupregs[2].dttk_value;
		typeref[3] = tupregs[3].dttk_value;

		regs[rd] = (uintptr_t)typeref;
		mstate->dtms_scratch_ptr += scratch_size;
4938		break;
4939	}
4940	}
4941}
4942
4943/*
4944 * Emulate the execution of DTrace IR instructions specified by the given
4945 * DIF object.  This function is deliberately void of assertions as all of
4946 * the necessary checks are handled by a call to dtrace_difo_validate().
4947 */
4948static uint64_t
4949dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate,
4950    dtrace_vstate_t *vstate, dtrace_state_t *state)
4951{
4952	const dif_instr_t *text = difo->dtdo_buf;
4953	const uint_t textlen = difo->dtdo_len;
4954	const char *strtab = difo->dtdo_strtab;
4955	const uint64_t *inttab = difo->dtdo_inttab;
4956
4957	uint64_t rval = 0;
4958	dtrace_statvar_t *svar;
4959	dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
4960	dtrace_difv_t *v;
4961	volatile uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags;
4962	volatile uintptr_t *illval = &cpu_core[curcpu].cpuc_dtrace_illval;
4963
4964	dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
4965	uint64_t regs[DIF_DIR_NREGS];
4966	uint64_t *tmp;
4967
4968	uint8_t cc_n = 0, cc_z = 0, cc_v = 0, cc_c = 0;
4969	int64_t cc_r;
4970	uint_t pc = 0, id, opc = 0;
4971	uint8_t ttop = 0;
4972	dif_instr_t instr;
4973	uint_t r1, r2, rd;
4974
4975	/*
4976	 * We stash the current DIF object into the machine state: we need it
4977	 * for subsequent access checking.
4978	 */
4979	mstate->dtms_difo = difo;
4980
4981	regs[DIF_REG_R0] = 0; 		/* %r0 is fixed at zero */
4982
4983	while (pc < textlen && !(*flags & CPU_DTRACE_FAULT)) {
4984		opc = pc;
4985
4986		instr = text[pc++];
4987		r1 = DIF_INSTR_R1(instr);
4988		r2 = DIF_INSTR_R2(instr);
4989		rd = DIF_INSTR_RD(instr);
4990
4991		switch (DIF_INSTR_OP(instr)) {
4992		case DIF_OP_OR:
4993			regs[rd] = regs[r1] | regs[r2];
4994			break;
4995		case DIF_OP_XOR:
4996			regs[rd] = regs[r1] ^ regs[r2];
4997			break;
4998		case DIF_OP_AND:
4999			regs[rd] = regs[r1] & regs[r2];
5000			break;
5001		case DIF_OP_SLL:
5002			regs[rd] = regs[r1] << regs[r2];
5003			break;
5004		case DIF_OP_SRL:
5005			regs[rd] = regs[r1] >> regs[r2];
5006			break;
5007		case DIF_OP_SUB:
5008			regs[rd] = regs[r1] - regs[r2];
5009			break;
5010		case DIF_OP_ADD:
5011			regs[rd] = regs[r1] + regs[r2];
5012			break;
5013		case DIF_OP_MUL:
5014			regs[rd] = regs[r1] * regs[r2];
5015			break;
5016		case DIF_OP_SDIV:
5017			if (regs[r2] == 0) {
5018				regs[rd] = 0;
5019				*flags |= CPU_DTRACE_DIVZERO;
5020			} else {
5021				regs[rd] = (int64_t)regs[r1] /
5022				    (int64_t)regs[r2];
5023			}
5024			break;
5025
5026		case DIF_OP_UDIV:
5027			if (regs[r2] == 0) {
5028				regs[rd] = 0;
5029				*flags |= CPU_DTRACE_DIVZERO;
5030			} else {
5031				regs[rd] = regs[r1] / regs[r2];
5032			}
5033			break;
5034
5035		case DIF_OP_SREM:
5036			if (regs[r2] == 0) {
5037				regs[rd] = 0;
5038				*flags |= CPU_DTRACE_DIVZERO;
5039			} else {
5040				regs[rd] = (int64_t)regs[r1] %
5041				    (int64_t)regs[r2];
5042			}
5043			break;
5044
5045		case DIF_OP_UREM:
5046			if (regs[r2] == 0) {
5047				regs[rd] = 0;
5048				*flags |= CPU_DTRACE_DIVZERO;
5049			} else {
5050				regs[rd] = regs[r1] % regs[r2];
5051			}
5052			break;
5053
5054		case DIF_OP_NOT:
5055			regs[rd] = ~regs[r1];
5056			break;
5057		case DIF_OP_MOV:
5058			regs[rd] = regs[r1];
5059			break;
5060		case DIF_OP_CMP:
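			/*
			 * Compute r1 - r2 and latch SPARC-style condition
			 * codes for the conditional branches below; note
			 * that signed overflow (cc_v) is forced to zero
			 * rather than modeled.
			 */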
5061			cc_r = regs[r1] - regs[r2];
5062			cc_n = cc_r < 0;
5063			cc_z = cc_r == 0;
5064			cc_v = 0;
5065			cc_c = regs[r1] < regs[r2];
5066			break;
5067		case DIF_OP_TST:
5068			cc_n = cc_v = cc_c = 0;
5069			cc_z = regs[r1] == 0;
5070			break;
5071		case DIF_OP_BA:
5072			pc = DIF_INSTR_LABEL(instr);
5073			break;
5074		case DIF_OP_BE:
5075			if (cc_z)
5076				pc = DIF_INSTR_LABEL(instr);
5077			break;
5078		case DIF_OP_BNE:
5079			if (cc_z == 0)
5080				pc = DIF_INSTR_LABEL(instr);
5081			break;
5082		case DIF_OP_BG:
5083			if ((cc_z | (cc_n ^ cc_v)) == 0)
5084				pc = DIF_INSTR_LABEL(instr);
5085			break;
5086		case DIF_OP_BGU:
5087			if ((cc_c | cc_z) == 0)
5088				pc = DIF_INSTR_LABEL(instr);
5089			break;
5090		case DIF_OP_BGE:
5091			if ((cc_n ^ cc_v) == 0)
5092				pc = DIF_INSTR_LABEL(instr);
5093			break;
5094		case DIF_OP_BGEU:
5095			if (cc_c == 0)
5096				pc = DIF_INSTR_LABEL(instr);
5097			break;
5098		case DIF_OP_BL:
5099			if (cc_n ^ cc_v)
5100				pc = DIF_INSTR_LABEL(instr);
5101			break;
5102		case DIF_OP_BLU:
5103			if (cc_c)
5104				pc = DIF_INSTR_LABEL(instr);
5105			break;
5106		case DIF_OP_BLE:
5107			if (cc_z | (cc_n ^ cc_v))
5108				pc = DIF_INSTR_LABEL(instr);
5109			break;
5110		case DIF_OP_BLEU:
5111			if (cc_c | cc_z)
5112				pc = DIF_INSTR_LABEL(instr);
5113			break;
5114		case DIF_OP_RLDSB:
5115			if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) {
5116				*flags |= CPU_DTRACE_KPRIV;
5117				*illval = regs[r1];
5118				break;
5119			}
5120			/*FALLTHROUGH*/
5121		case DIF_OP_LDSB:
5122			regs[rd] = (int8_t)dtrace_load8(regs[r1]);
5123			break;
5124		case DIF_OP_RLDSH:
5125			if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) {
5126				*flags |= CPU_DTRACE_KPRIV;
5127				*illval = regs[r1];
5128				break;
5129			}
5130			/*FALLTHROUGH*/
5131		case DIF_OP_LDSH:
5132			regs[rd] = (int16_t)dtrace_load16(regs[r1]);
5133			break;
5134		case DIF_OP_RLDSW:
5135			if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) {
5136				*flags |= CPU_DTRACE_KPRIV;
5137				*illval = regs[r1];
5138				break;
5139			}
5140			/*FALLTHROUGH*/
5141		case DIF_OP_LDSW:
5142			regs[rd] = (int32_t)dtrace_load32(regs[r1]);
5143			break;
5144		case DIF_OP_RLDUB:
5145			if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) {
5146				*flags |= CPU_DTRACE_KPRIV;
5147				*illval = regs[r1];
5148				break;
5149			}
5150			/*FALLTHROUGH*/
5151		case DIF_OP_LDUB:
5152			regs[rd] = dtrace_load8(regs[r1]);
5153			break;
5154		case DIF_OP_RLDUH:
5155			if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) {
5156				*flags |= CPU_DTRACE_KPRIV;
5157				*illval = regs[r1];
5158				break;
5159			}
5160			/*FALLTHROUGH*/
5161		case DIF_OP_LDUH:
5162			regs[rd] = dtrace_load16(regs[r1]);
5163			break;
5164		case DIF_OP_RLDUW:
5165			if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) {
5166				*flags |= CPU_DTRACE_KPRIV;
5167				*illval = regs[r1];
5168				break;
5169			}
5170			/*FALLTHROUGH*/
5171		case DIF_OP_LDUW:
5172			regs[rd] = dtrace_load32(regs[r1]);
5173			break;
5174		case DIF_OP_RLDX:
5175			if (!dtrace_canstore(regs[r1], 8, mstate, vstate)) {
5176				*flags |= CPU_DTRACE_KPRIV;
5177				*illval = regs[r1];
5178				break;
5179			}
5180			/*FALLTHROUGH*/
5181		case DIF_OP_LDX:
5182			regs[rd] = dtrace_load64(regs[r1]);
5183			break;
5184		case DIF_OP_ULDSB:
5185			regs[rd] = (int8_t)
5186			    dtrace_fuword8((void *)(uintptr_t)regs[r1]);
5187			break;
5188		case DIF_OP_ULDSH:
5189			regs[rd] = (int16_t)
5190			    dtrace_fuword16((void *)(uintptr_t)regs[r1]);
5191			break;
5192		case DIF_OP_ULDSW:
5193			regs[rd] = (int32_t)
5194			    dtrace_fuword32((void *)(uintptr_t)regs[r1]);
5195			break;
5196		case DIF_OP_ULDUB:
5197			regs[rd] =
5198			    dtrace_fuword8((void *)(uintptr_t)regs[r1]);
5199			break;
5200		case DIF_OP_ULDUH:
5201			regs[rd] =
5202			    dtrace_fuword16((void *)(uintptr_t)regs[r1]);
5203			break;
5204		case DIF_OP_ULDUW:
5205			regs[rd] =
5206			    dtrace_fuword32((void *)(uintptr_t)regs[r1]);
5207			break;
5208		case DIF_OP_ULDX:
5209			regs[rd] =
5210			    dtrace_fuword64((void *)(uintptr_t)regs[r1]);
5211			break;
5212		case DIF_OP_RET:
5213			rval = regs[rd];
5214			pc = textlen;
5215			break;
5216		case DIF_OP_NOP:
5217			break;
5218		case DIF_OP_SETX:
5219			regs[rd] = inttab[DIF_INSTR_INTEGER(instr)];
5220			break;
5221		case DIF_OP_SETS:
5222			regs[rd] = (uint64_t)(uintptr_t)
5223			    (strtab + DIF_INSTR_STRING(instr));
5224			break;
5225		case DIF_OP_SCMP: {
5226			size_t sz = state->dts_options[DTRACEOPT_STRSIZE];
5227			uintptr_t s1 = regs[r1];
5228			uintptr_t s2 = regs[r2];
5229
5230			if (s1 != 0 &&
5231			    !dtrace_strcanload(s1, sz, mstate, vstate))
5232				break;
5233			if (s2 != 0 &&
5234			    !dtrace_strcanload(s2, sz, mstate, vstate))
5235				break;
5236
5237			cc_r = dtrace_strncmp((char *)s1, (char *)s2, sz);
5238
5239			cc_n = cc_r < 0;
5240			cc_z = cc_r == 0;
5241			cc_v = cc_c = 0;
5242			break;
5243		}
5244		case DIF_OP_LDGA:
5245			regs[rd] = dtrace_dif_variable(mstate, state,
5246			    r1, regs[r2]);
5247			break;
5248		case DIF_OP_LDGS:
5249			id = DIF_INSTR_VAR(instr);
5250
5251			if (id >= DIF_VAR_OTHER_UBASE) {
5252				uintptr_t a;
5253
5254				id -= DIF_VAR_OTHER_UBASE;
5255				svar = vstate->dtvs_globals[id];
5256				ASSERT(svar != NULL);
5257				v = &svar->dtsv_var;
5258
5259				if (!(v->dtdv_type.dtdt_flags & DIF_TF_BYREF)) {
5260					regs[rd] = svar->dtsv_data;
5261					break;
5262				}
5263
5264				a = (uintptr_t)svar->dtsv_data;
5265
5266				if (*(uint8_t *)a == UINT8_MAX) {
5267					/*
5268					 * If the 0th byte is set to UINT8_MAX
5269					 * then this is to be treated as a
5270					 * reference to a NULL variable.
5271					 */
5272					regs[rd] = 0;
5273				} else {
5274					regs[rd] = a + sizeof (uint64_t);
5275				}
5276
5277				break;
5278			}
5279
5280			regs[rd] = dtrace_dif_variable(mstate, state, id, 0);
5281			break;
5282
5283		case DIF_OP_STGS:
5284			id = DIF_INSTR_VAR(instr);
5285
5286			ASSERT(id >= DIF_VAR_OTHER_UBASE);
5287			id -= DIF_VAR_OTHER_UBASE;
5288
5289			svar = vstate->dtvs_globals[id];
5290			ASSERT(svar != NULL);
5291			v = &svar->dtsv_var;
5292
5293			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5294				uintptr_t a = (uintptr_t)svar->dtsv_data;
5295
5296				ASSERT(a != 0);
5297				ASSERT(svar->dtsv_size != 0);
5298
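				/*
				 * Storing NULL records the UINT8_MAX
				 * sentinel that DIF_OP_LDGS interprets as
				 * a NULL reference; otherwise clear the
				 * sentinel and copy past the leading
				 * 64-bit word.
				 */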
5299				if (regs[rd] == 0) {
5300					*(uint8_t *)a = UINT8_MAX;
5301					break;
5302				} else {
5303					*(uint8_t *)a = 0;
5304					a += sizeof (uint64_t);
5305				}
5306				if (!dtrace_vcanload(
5307				    (void *)(uintptr_t)regs[rd], &v->dtdv_type,
5308				    mstate, vstate))
5309					break;
5310
5311				dtrace_vcopy((void *)(uintptr_t)regs[rd],
5312				    (void *)a, &v->dtdv_type);
5313				break;
5314			}
5315
5316			svar->dtsv_data = regs[rd];
5317			break;
5318
5319		case DIF_OP_LDTA:
5320			/*
5321			 * There are no DTrace built-in thread-local arrays at
5322			 * present.  This opcode is saved for future work.
5323			 */
5324			*flags |= CPU_DTRACE_ILLOP;
5325			regs[rd] = 0;
5326			break;
5327
5328		case DIF_OP_LDLS:
5329			id = DIF_INSTR_VAR(instr);
5330
5331			if (id < DIF_VAR_OTHER_UBASE) {
5332				/*
5333				 * For now, this has no meaning.
5334				 */
5335				regs[rd] = 0;
5336				break;
5337			}
5338
5339			id -= DIF_VAR_OTHER_UBASE;
5340
5341			ASSERT(id < vstate->dtvs_nlocals);
5342			ASSERT(vstate->dtvs_locals != NULL);
5343
5344			svar = vstate->dtvs_locals[id];
5345			ASSERT(svar != NULL);
5346			v = &svar->dtsv_var;
5347
5348			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5349				uintptr_t a = (uintptr_t)svar->dtsv_data;
5350				size_t sz = v->dtdv_type.dtdt_size;
5351
5352				sz += sizeof (uint64_t);
5353				ASSERT(svar->dtsv_size == NCPU * sz);
5354				a += curcpu * sz;
5355
5356				if (*(uint8_t *)a == UINT8_MAX) {
5357					/*
5358					 * If the 0th byte is set to UINT8_MAX
5359					 * then this is to be treated as a
5360					 * reference to a NULL variable.
5361					 */
5362					regs[rd] = 0;
5363				} else {
5364					regs[rd] = a + sizeof (uint64_t);
5365				}
5366
5367				break;
5368			}
5369
5370			ASSERT(svar->dtsv_size == NCPU * sizeof (uint64_t));
5371			tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
5372			regs[rd] = tmp[curcpu];
5373			break;
5374
5375		case DIF_OP_STLS:
5376			id = DIF_INSTR_VAR(instr);
5377
5378			ASSERT(id >= DIF_VAR_OTHER_UBASE);
5379			id -= DIF_VAR_OTHER_UBASE;
5380			ASSERT(id < vstate->dtvs_nlocals);
5381
5382			ASSERT(vstate->dtvs_locals != NULL);
5383			svar = vstate->dtvs_locals[id];
5384			ASSERT(svar != NULL);
5385			v = &svar->dtsv_var;
5386
5387			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5388				uintptr_t a = (uintptr_t)svar->dtsv_data;
5389				size_t sz = v->dtdv_type.dtdt_size;
5390
5391				sz += sizeof (uint64_t);
5392				ASSERT(svar->dtsv_size == NCPU * sz);
5393				a += curcpu * sz;
5394
5395				if (regs[rd] == 0) {
5396					*(uint8_t *)a = UINT8_MAX;
5397					break;
5398				} else {
5399					*(uint8_t *)a = 0;
5400					a += sizeof (uint64_t);
5401				}
5402
5403				if (!dtrace_vcanload(
5404				    (void *)(uintptr_t)regs[rd], &v->dtdv_type,
5405				    mstate, vstate))
5406					break;
5407
5408				dtrace_vcopy((void *)(uintptr_t)regs[rd],
5409				    (void *)a, &v->dtdv_type);
5410				break;
5411			}
5412
5413			ASSERT(svar->dtsv_size == NCPU * sizeof (uint64_t));
5414			tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
5415			tmp[curcpu] = regs[rd];
5416			break;
5417
5418		case DIF_OP_LDTS: {
5419			dtrace_dynvar_t *dvar;
5420			dtrace_key_t *key;
5421
5422			id = DIF_INSTR_VAR(instr);
5423			ASSERT(id >= DIF_VAR_OTHER_UBASE);
5424			id -= DIF_VAR_OTHER_UBASE;
5425			v = &vstate->dtvs_tlocals[id];
5426
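			/*
			 * Thread-locals are dynamic variables keyed by the
			 * (variable id, thread identity) tuple built here,
			 * in the two slots reserved past the tuple
			 * registers.
			 */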
5427			key = &tupregs[DIF_DTR_NREGS];
5428			key[0].dttk_value = (uint64_t)id;
5429			key[0].dttk_size = 0;
5430			DTRACE_TLS_THRKEY(key[1].dttk_value);
5431			key[1].dttk_size = 0;
5432
5433			dvar = dtrace_dynvar(dstate, 2, key,
5434			    sizeof (uint64_t), DTRACE_DYNVAR_NOALLOC,
5435			    mstate, vstate);
5436
5437			if (dvar == NULL) {
5438				regs[rd] = 0;
5439				break;
5440			}
5441
5442			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5443				regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
5444			} else {
5445				regs[rd] = *((uint64_t *)dvar->dtdv_data);
5446			}
5447
5448			break;
5449		}
5450
5451		case DIF_OP_STTS: {
5452			dtrace_dynvar_t *dvar;
5453			dtrace_key_t *key;
5454
5455			id = DIF_INSTR_VAR(instr);
5456			ASSERT(id >= DIF_VAR_OTHER_UBASE);
5457			id -= DIF_VAR_OTHER_UBASE;
5458
5459			key = &tupregs[DIF_DTR_NREGS];
5460			key[0].dttk_value = (uint64_t)id;
5461			key[0].dttk_size = 0;
5462			DTRACE_TLS_THRKEY(key[1].dttk_value);
5463			key[1].dttk_size = 0;
5464			v = &vstate->dtvs_tlocals[id];
5465
5466			dvar = dtrace_dynvar(dstate, 2, key,
5467			    v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
5468			    v->dtdv_type.dtdt_size : sizeof (uint64_t),
5469			    regs[rd] ? DTRACE_DYNVAR_ALLOC :
5470			    DTRACE_DYNVAR_DEALLOC, mstate, vstate);
5471
5472			/*
5473			 * Given that we're storing to thread-local data,
5474			 * we need to flush our predicate cache.
5475			 */
5476			curthread->t_predcache = 0;
5477
5478			if (dvar == NULL)
5479				break;
5480
5481			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5482				if (!dtrace_vcanload(
5483				    (void *)(uintptr_t)regs[rd],
5484				    &v->dtdv_type, mstate, vstate))
5485					break;
5486
5487				dtrace_vcopy((void *)(uintptr_t)regs[rd],
5488				    dvar->dtdv_data, &v->dtdv_type);
5489			} else {
5490				*((uint64_t *)dvar->dtdv_data) = regs[rd];
5491			}
5492
5493			break;
5494		}
5495
5496		case DIF_OP_SRA:
5497			regs[rd] = (int64_t)regs[r1] >> regs[r2];
5498			break;
5499
5500		case DIF_OP_CALL:
5501			dtrace_dif_subr(DIF_INSTR_SUBR(instr), rd,
5502			    regs, tupregs, ttop, mstate, state);
5503			break;
5504
5505		case DIF_OP_PUSHTR:
5506			if (ttop == DIF_DTR_NREGS) {
5507				*flags |= CPU_DTRACE_TUPOFLOW;
5508				break;
5509			}
5510
5511			if (r1 == DIF_TYPE_STRING) {
5512				/*
5513				 * If this is a string type and the size is 0,
5514				 * we'll use the system-wide default string
5515				 * size.  Note that we are _not_ looking at
5516				 * the value of the DTRACEOPT_STRSIZE option;
5517				 * had this been set, we would expect to have
5518				 * a non-zero size value in the "pushtr".
5519				 */
5520				tupregs[ttop].dttk_size =
5521				    dtrace_strlen((char *)(uintptr_t)regs[rd],
5522				    regs[r2] ? regs[r2] :
5523				    dtrace_strsize_default) + 1;
5524			} else {
5525				tupregs[ttop].dttk_size = regs[r2];
5526			}
5527
5528			tupregs[ttop++].dttk_value = regs[rd];
5529			break;
5530
5531		case DIF_OP_PUSHTV:
5532			if (ttop == DIF_DTR_NREGS) {
5533				*flags |= CPU_DTRACE_TUPOFLOW;
5534				break;
5535			}
5536
5537			tupregs[ttop].dttk_value = regs[rd];
5538			tupregs[ttop++].dttk_size = 0;
5539			break;
5540
5541		case DIF_OP_POPTS:
5542			if (ttop != 0)
5543				ttop--;
5544			break;
5545
5546		case DIF_OP_FLUSHTS:
5547			ttop = 0;
5548			break;
5549
5550		case DIF_OP_LDGAA:
5551		case DIF_OP_LDTAA: {
5552			dtrace_dynvar_t *dvar;
5553			dtrace_key_t *key = tupregs;
5554			uint_t nkeys = ttop;
5555
5556			id = DIF_INSTR_VAR(instr);
5557			ASSERT(id >= DIF_VAR_OTHER_UBASE);
5558			id -= DIF_VAR_OTHER_UBASE;
5559
5560			key[nkeys].dttk_value = (uint64_t)id;
5561			key[nkeys++].dttk_size = 0;
5562
5563			if (DIF_INSTR_OP(instr) == DIF_OP_LDTAA) {
5564				DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
5565				key[nkeys++].dttk_size = 0;
5566				v = &vstate->dtvs_tlocals[id];
5567			} else {
5568				v = &vstate->dtvs_globals[id]->dtsv_var;
5569			}
5570
5571			dvar = dtrace_dynvar(dstate, nkeys, key,
5572			    v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
5573			    v->dtdv_type.dtdt_size : sizeof (uint64_t),
5574			    DTRACE_DYNVAR_NOALLOC, mstate, vstate);
5575
5576			if (dvar == NULL) {
5577				regs[rd] = 0;
5578				break;
5579			}
5580
5581			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5582				regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
5583			} else {
5584				regs[rd] = *((uint64_t *)dvar->dtdv_data);
5585			}
5586
5587			break;
5588		}
5589
5590		case DIF_OP_STGAA:
5591		case DIF_OP_STTAA: {
5592			dtrace_dynvar_t *dvar;
5593			dtrace_key_t *key = tupregs;
5594			uint_t nkeys = ttop;
5595
5596			id = DIF_INSTR_VAR(instr);
5597			ASSERT(id >= DIF_VAR_OTHER_UBASE);
5598			id -= DIF_VAR_OTHER_UBASE;
5599
5600			key[nkeys].dttk_value = (uint64_t)id;
5601			key[nkeys++].dttk_size = 0;
5602
5603			if (DIF_INSTR_OP(instr) == DIF_OP_STTAA) {
5604				DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
5605				key[nkeys++].dttk_size = 0;
5606				v = &vstate->dtvs_tlocals[id];
5607			} else {
5608				v = &vstate->dtvs_globals[id]->dtsv_var;
5609			}
5610
5611			dvar = dtrace_dynvar(dstate, nkeys, key,
5612			    v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
5613			    v->dtdv_type.dtdt_size : sizeof (uint64_t),
5614			    regs[rd] ? DTRACE_DYNVAR_ALLOC :
5615			    DTRACE_DYNVAR_DEALLOC, mstate, vstate);
5616
5617			if (dvar == NULL)
5618				break;
5619
5620			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5621				if (!dtrace_vcanload(
5622				    (void *)(uintptr_t)regs[rd], &v->dtdv_type,
5623				    mstate, vstate))
5624					break;
5625
5626				dtrace_vcopy((void *)(uintptr_t)regs[rd],
5627				    dvar->dtdv_data, &v->dtdv_type);
5628			} else {
5629				*((uint64_t *)dvar->dtdv_data) = regs[rd];
5630			}
5631
5632			break;
5633		}
5634
5635		case DIF_OP_ALLOCS: {
5636			uintptr_t ptr = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
5637			size_t size = ptr - mstate->dtms_scratch_ptr + regs[r1];
5638
5639			/*
5640			 * Rounding up the user allocation size could have
5641			 * overflowed large, bogus allocations (like -1ULL) to
5642			 * 0.
5643			 */
5644			if (size < regs[r1] ||
5645			    !DTRACE_INSCRATCH(mstate, size)) {
5646				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5647				regs[rd] = 0;
5648				break;
5649			}
5650
5651			dtrace_bzero((void *) mstate->dtms_scratch_ptr, size);
5652			mstate->dtms_scratch_ptr += size;
5653			regs[rd] = ptr;
5654			break;
5655		}
5656
5657		case DIF_OP_COPYS:
5658			if (!dtrace_canstore(regs[rd], regs[r2],
5659			    mstate, vstate)) {
5660				*flags |= CPU_DTRACE_BADADDR;
5661				*illval = regs[rd];
5662				break;
5663			}
5664
5665			if (!dtrace_canload(regs[r1], regs[r2], mstate, vstate))
5666				break;
5667
5668			dtrace_bcopy((void *)(uintptr_t)regs[r1],
5669			    (void *)(uintptr_t)regs[rd], (size_t)regs[r2]);
5670			break;
5671
5672		case DIF_OP_STB:
5673			if (!dtrace_canstore(regs[rd], 1, mstate, vstate)) {
5674				*flags |= CPU_DTRACE_BADADDR;
5675				*illval = regs[rd];
5676				break;
5677			}
5678			*((uint8_t *)(uintptr_t)regs[rd]) = (uint8_t)regs[r1];
5679			break;
5680
5681		case DIF_OP_STH:
5682			if (!dtrace_canstore(regs[rd], 2, mstate, vstate)) {
5683				*flags |= CPU_DTRACE_BADADDR;
5684				*illval = regs[rd];
5685				break;
5686			}
5687			if (regs[rd] & 1) {
5688				*flags |= CPU_DTRACE_BADALIGN;
5689				*illval = regs[rd];
5690				break;
5691			}
5692			*((uint16_t *)(uintptr_t)regs[rd]) = (uint16_t)regs[r1];
5693			break;
5694
5695		case DIF_OP_STW:
5696			if (!dtrace_canstore(regs[rd], 4, mstate, vstate)) {
5697				*flags |= CPU_DTRACE_BADADDR;
5698				*illval = regs[rd];
5699				break;
5700			}
5701			if (regs[rd] & 3) {
5702				*flags |= CPU_DTRACE_BADALIGN;
5703				*illval = regs[rd];
5704				break;
5705			}
5706			*((uint32_t *)(uintptr_t)regs[rd]) = (uint32_t)regs[r1];
5707			break;
5708
5709		case DIF_OP_STX:
5710			if (!dtrace_canstore(regs[rd], 8, mstate, vstate)) {
5711				*flags |= CPU_DTRACE_BADADDR;
5712				*illval = regs[rd];
5713				break;
5714			}
5715			if (regs[rd] & 7) {
5716				*flags |= CPU_DTRACE_BADALIGN;
5717				*illval = regs[rd];
5718				break;
5719			}
5720			*((uint64_t *)(uintptr_t)regs[rd]) = regs[r1];
5721			break;
5722		}
5723	}
5724
5725	if (!(*flags & CPU_DTRACE_FAULT))
5726		return (rval);
5727
5728	mstate->dtms_fltoffs = opc * sizeof (dif_instr_t);
5729	mstate->dtms_present |= DTRACE_MSTATE_FLTOFFS;
5730
5731	return (0);
5732}
5733
5734static void
5735dtrace_action_breakpoint(dtrace_ecb_t *ecb)
5736{
5737	dtrace_probe_t *probe = ecb->dte_probe;
5738	dtrace_provider_t *prov = probe->dtpr_provider;
5739	char c[DTRACE_FULLNAMELEN + 80], *str;
5740	char *msg = "dtrace: breakpoint action at probe ";
5741	char *ecbmsg = " (ecb ";
5742	uintptr_t mask = (0xf << (sizeof (uintptr_t) * NBBY / 4));
5743	uintptr_t val = (uintptr_t)ecb;
5744	int shift = (sizeof (uintptr_t) * NBBY) - 4, i = 0;
5745
5746	if (dtrace_destructive_disallow)
5747		return;
5748
5749	/*
5750	 * It's impossible to be taking action on the NULL probe.
5751	 */
5752	ASSERT(probe != NULL);
5753
5754	/*
5755	 * This is a poor man's (destitute man's?) sprintf():  we want to
5756	 * print the provider name, module name, function name and name of
5757	 * the probe, along with the hex address of the ECB with the breakpoint
5758	 * action -- all of which we must place in the character buffer by
5759	 * hand.
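	 *
	 * The assembled string (with hypothetical names) reads something
	 * like "dtrace: breakpoint action at probe fbt:kernel:vn_open:entry
	 * (ecb fffff80002d9e000)".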
5760	 */
5761	while (*msg != '\0')
5762		c[i++] = *msg++;
5763
5764	for (str = prov->dtpv_name; *str != '\0'; str++)
5765		c[i++] = *str;
5766	c[i++] = ':';
5767
5768	for (str = probe->dtpr_mod; *str != '\0'; str++)
5769		c[i++] = *str;
5770	c[i++] = ':';
5771
5772	for (str = probe->dtpr_func; *str != '\0'; str++)
5773		c[i++] = *str;
5774	c[i++] = ':';
5775
5776	for (str = probe->dtpr_name; *str != '\0'; str++)
5777		c[i++] = *str;
5778
5779	while (*ecbmsg != '\0')
5780		c[i++] = *ecbmsg++;
5781
5782	while (shift >= 0) {
5783		mask = (uintptr_t)0xf << shift;
5784
5785		if (val >= ((uintptr_t)1 << shift))
5786			c[i++] = "0123456789abcdef"[(val & mask) >> shift];
5787		shift -= 4;
5788	}
5789
5790	c[i++] = ')';
5791	c[i] = '\0';
5792
5793#if defined(sun)
5794	debug_enter(c);
5795#else
5796	kdb_enter(KDB_WHY_DTRACE, "breakpoint action");
5797#endif
5798}
5799
5800static void
5801dtrace_action_panic(dtrace_ecb_t *ecb)
5802{
5803	dtrace_probe_t *probe = ecb->dte_probe;
5804
5805	/*
5806	 * It's impossible to be taking action on the NULL probe.
5807	 */
5808	ASSERT(probe != NULL);
5809
5810	if (dtrace_destructive_disallow)
5811		return;
5812
5813	if (dtrace_panicked != NULL)
5814		return;
5815
5816	if (dtrace_casptr(&dtrace_panicked, NULL, curthread) != NULL)
5817		return;
5818
5819	/*
5820	 * We won the right to panic.  (We want to be sure that only one
5821	 * thread calls panic() from dtrace_probe(), and that panic() is
5822	 * called exactly once.)
5823	 */
5824	dtrace_panic("dtrace: panic action at probe %s:%s:%s:%s (ecb %p)",
5825	    probe->dtpr_provider->dtpv_name, probe->dtpr_mod,
5826	    probe->dtpr_func, probe->dtpr_name, (void *)ecb);
5827}
5828
5829static void
5830dtrace_action_raise(uint64_t sig)
5831{
5832	if (dtrace_destructive_disallow)
5833		return;
5834
5835	if (sig >= NSIG) {
5836		DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5837		return;
5838	}
5839
5840#if defined(sun)
5841	/*
5842	 * raise() has a queue depth of 1 -- we ignore all subsequent
5843	 * invocations of the raise() action.
5844	 */
5845	if (curthread->t_dtrace_sig == 0)
5846		curthread->t_dtrace_sig = (uint8_t)sig;
5847
5848	curthread->t_sig_check = 1;
5849	aston(curthread);
5850#else
5851	struct proc *p = curproc;
5852	PROC_LOCK(p);
5853	kern_psignal(p, sig);
5854	PROC_UNLOCK(p);
5855#endif
5856}
5857
5858static void
5859dtrace_action_stop(void)
5860{
5861	if (dtrace_destructive_disallow)
5862		return;
5863
5864#if defined(sun)
5865	if (!curthread->t_dtrace_stop) {
5866		curthread->t_dtrace_stop = 1;
5867		curthread->t_sig_check = 1;
5868		aston(curthread);
5869	}
5870#else
5871	struct proc *p = curproc;
5872	PROC_LOCK(p);
5873	kern_psignal(p, SIGSTOP);
5874	PROC_UNLOCK(p);
5875#endif
5876}
5877
5878static void
5879dtrace_action_chill(dtrace_mstate_t *mstate, hrtime_t val)
5880{
5881	hrtime_t now;
5882	volatile uint16_t *flags;
5883#if defined(sun)
5884	cpu_t *cpu = CPU;
5885#else
5886	cpu_t *cpu = &solaris_cpu[curcpu];
5887#endif
5888
5889	if (dtrace_destructive_disallow)
5890		return;
5891
5892	flags = (volatile uint16_t *)&cpu_core[cpu->cpu_id].cpuc_dtrace_flags;
5893
5894	now = dtrace_gethrtime();
5895
5896	if (now - cpu->cpu_dtrace_chillmark > dtrace_chill_interval) {
5897		/*
5898		 * We need to advance the mark to the current time.
5899		 */
5900		cpu->cpu_dtrace_chillmark = now;
5901		cpu->cpu_dtrace_chilled = 0;
5902	}
5903
5904	/*
5905	 * Now check to see if the requested chill time would take us over
5906	 * the maximum amount of time allowed in the chill interval.  (Or
5907	 * worse, if the calculation itself induces overflow.)
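	 *
	 * For example, with a dtrace_chill_max of, say, 500 milliseconds,
	 * two back-to-back chill(400000000) requests within one interval
	 * would fail this check on the second request.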
5908	 */
5909	if (cpu->cpu_dtrace_chilled + val > dtrace_chill_max ||
5910	    cpu->cpu_dtrace_chilled + val < cpu->cpu_dtrace_chilled) {
5911		*flags |= CPU_DTRACE_ILLOP;
5912		return;
5913	}
5914
5915	while (dtrace_gethrtime() - now < val)
5916		continue;
5917
5918	/*
	 * Normally, we ensure that the value of the variable "timestamp" does
5920	 * not change within an ECB.  The presence of chill() represents an
5921	 * exception to this rule, however.
5922	 */
5923	mstate->dtms_present &= ~DTRACE_MSTATE_TIMESTAMP;
5924	cpu->cpu_dtrace_chilled += val;
5925}
5926
5927static void
5928dtrace_action_ustack(dtrace_mstate_t *mstate, dtrace_state_t *state,
5929    uint64_t *buf, uint64_t arg)
5930{
5931	int nframes = DTRACE_USTACK_NFRAMES(arg);
5932	int strsize = DTRACE_USTACK_STRSIZE(arg);
5933	uint64_t *pcs = &buf[1], *fps;
5934	char *str = (char *)&pcs[nframes];
5935	int size, offs = 0, i, j;
5936	uintptr_t old = mstate->dtms_scratch_ptr, saved;
5937	uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags;
5938	char *sym;
5939
5940	/*
	 * We should be taking a faster path if string space has not been
	 * allocated.
5943	 */
5944	ASSERT(strsize != 0);
5945
5946	/*
5947	 * We will first allocate some temporary space for the frame pointers.
5948	 */
5949	fps = (uint64_t *)P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
5950	size = (uintptr_t)fps - mstate->dtms_scratch_ptr +
5951	    (nframes * sizeof (uint64_t));
5952
5953	if (!DTRACE_INSCRATCH(mstate, size)) {
5954		/*
5955		 * Not enough room for our frame pointers -- need to indicate
5956		 * that we ran out of scratch space.
5957		 */
5958		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5959		return;
5960	}
5961
5962	mstate->dtms_scratch_ptr += size;
5963	saved = mstate->dtms_scratch_ptr;
5964
5965	/*
5966	 * Now get a stack with both program counters and frame pointers.
5967	 */
5968	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
5969	dtrace_getufpstack(buf, fps, nframes + 1);
5970	DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
5971
5972	/*
5973	 * If that faulted, we're cooked.
5974	 */
5975	if (*flags & CPU_DTRACE_FAULT)
5976		goto out;
5977
5978	/*
5979	 * Now we want to walk up the stack, calling the USTACK helper.  For
5980	 * each iteration, we restore the scratch pointer.
5981	 */
5982	for (i = 0; i < nframes; i++) {
5983		mstate->dtms_scratch_ptr = saved;
5984
5985		if (offs >= strsize)
5986			break;
5987
5988		sym = (char *)(uintptr_t)dtrace_helper(
5989		    DTRACE_HELPER_ACTION_USTACK,
5990		    mstate, state, pcs[i], fps[i]);
5991
5992		/*
5993		 * If we faulted while running the helper, we're going to
5994		 * clear the fault and null out the corresponding string.
5995		 */
5996		if (*flags & CPU_DTRACE_FAULT) {
5997			*flags &= ~CPU_DTRACE_FAULT;
5998			str[offs++] = '\0';
5999			continue;
6000		}
6001
6002		if (sym == NULL) {
6003			str[offs++] = '\0';
6004			continue;
6005		}
6006
6007		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6008
6009		/*
6010		 * Now copy in the string that the helper returned to us.
6011		 */
6012		for (j = 0; offs + j < strsize; j++) {
6013			if ((str[offs + j] = sym[j]) == '\0')
6014				break;
6015		}
6016
6017		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6018
6019		offs += j + 1;
6020	}
6021
6022	if (offs >= strsize) {
6023		/*
6024		 * If we didn't have room for all of the strings, we don't
6025		 * abort processing -- this needn't be a fatal error -- but we
		 * still want to increment a counter (dts_stkstroverflows) so
		 * that the condition can be reported.  (If this is from
6028		 * a jstack() action, it is easily tuned via jstackstrsize.)
6029		 */
6030		dtrace_error(&state->dts_stkstroverflows);
6031	}
6032
6033	while (offs < strsize)
6034		str[offs++] = '\0';
6035
6036out:
6037	mstate->dtms_scratch_ptr = old;
6038}
6039
6040/*
6041 * If you're looking for the epicenter of DTrace, you just found it.  This
6042 * is the function called by the provider to fire a probe -- from which all
6043 * subsequent probe-context DTrace activity emanates.
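 *
 * (A sketch with hypothetical names: a provider typically fires its probe
 * as
 *
 *	dtrace_probe(sdp->sdp_id, (uintptr_t)arg0, (uintptr_t)arg1, 0, 0, 0);
 *
 * passing the probe ID it was assigned at creation time and up to five
 * arguments.)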
6044 */
6045void
6046dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1,
6047    uintptr_t arg2, uintptr_t arg3, uintptr_t arg4)
6048{
6049	processorid_t cpuid;
6050	dtrace_icookie_t cookie;
6051	dtrace_probe_t *probe;
6052	dtrace_mstate_t mstate;
6053	dtrace_ecb_t *ecb;
6054	dtrace_action_t *act;
6055	intptr_t offs;
6056	size_t size;
6057	int vtime, onintr;
6058	volatile uint16_t *flags;
6059	hrtime_t now;
6060
6061	if (panicstr != NULL)
6062		return;
6063
6064#if defined(sun)
6065	/*
6066	 * Kick out immediately if this CPU is still being born (in which case
6067	 * curthread will be set to -1) or the current thread can't allow
6068	 * probes in its current context.
6069	 */
6070	if (((uintptr_t)curthread & 1) || (curthread->t_flag & T_DONTDTRACE))
6071		return;
6072#endif
6073
6074	cookie = dtrace_interrupt_disable();
6075	probe = dtrace_probes[id - 1];
6076	cpuid = curcpu;
6077	onintr = CPU_ON_INTR(CPU);
6078
6079	if (!onintr && probe->dtpr_predcache != DTRACE_CACHEIDNONE &&
6080	    probe->dtpr_predcache == curthread->t_predcache) {
6081		/*
6082		 * We have hit in the predicate cache; we know that
6083		 * this predicate would evaluate to be false.
6084		 */
6085		dtrace_interrupt_enable(cookie);
6086		return;
6087	}
6088
6089#if defined(sun)
6090	if (panic_quiesce) {
6091#else
6092	if (panicstr != NULL) {
6093#endif
6094		/*
6095		 * We don't trace anything if we're panicking.
6096		 */
6097		dtrace_interrupt_enable(cookie);
6098		return;
6099	}
6100
6101	now = dtrace_gethrtime();
6102	vtime = dtrace_vtime_references != 0;
6103
6104	if (vtime && curthread->t_dtrace_start)
6105		curthread->t_dtrace_vtime += now - curthread->t_dtrace_start;
6106
6107	mstate.dtms_difo = NULL;
6108	mstate.dtms_probe = probe;
6109	mstate.dtms_strtok = 0;
6110	mstate.dtms_arg[0] = arg0;
6111	mstate.dtms_arg[1] = arg1;
6112	mstate.dtms_arg[2] = arg2;
6113	mstate.dtms_arg[3] = arg3;
6114	mstate.dtms_arg[4] = arg4;
6115
6116	flags = (volatile uint16_t *)&cpu_core[cpuid].cpuc_dtrace_flags;
6117
6118	for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
6119		dtrace_predicate_t *pred = ecb->dte_predicate;
6120		dtrace_state_t *state = ecb->dte_state;
6121		dtrace_buffer_t *buf = &state->dts_buffer[cpuid];
6122		dtrace_buffer_t *aggbuf = &state->dts_aggbuffer[cpuid];
6123		dtrace_vstate_t *vstate = &state->dts_vstate;
6124		dtrace_provider_t *prov = probe->dtpr_provider;
6125		uint64_t tracememsize = 0;
6126		int committed = 0;
6127		caddr_t tomax;
6128
6129		/*
6130		 * A little subtlety with the following (seemingly innocuous)
6131		 * declaration of the automatic 'val':  by looking at the
6132		 * code, you might think that it could be declared in the
6133		 * action processing loop, below.  (That is, it's only used in
6134		 * the action processing loop.)  However, it must be declared
6135		 * out of that scope because in the case of DIF expression
6136		 * arguments to aggregating actions, one iteration of the
6137		 * action loop will use the last iteration's value.
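		 * (For example, with an aggregation like
		 * @a = quantize(x, incr), the optional increment expression
		 * is evaluated by the preceding action into 'val', and the
		 * aggregating action then consumes it via the dtag_hasarg
		 * path.)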
6138		 */
6139		uint64_t val = 0;
6140
6141		mstate.dtms_present = DTRACE_MSTATE_ARGS | DTRACE_MSTATE_PROBE;
6142		*flags &= ~CPU_DTRACE_ERROR;
6143
6144		if (prov == dtrace_provider) {
6145			/*
6146			 * If dtrace itself is the provider of this probe,
6147			 * we're only going to continue processing the ECB if
6148			 * arg0 (the dtrace_state_t) is equal to the ECB's
6149			 * creating state.  (This prevents disjoint consumers
6150			 * from seeing one another's metaprobes.)
6151			 */
6152			if (arg0 != (uint64_t)(uintptr_t)state)
6153				continue;
6154		}
6155
6156		if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE) {
6157			/*
6158			 * We're not currently active.  If our provider isn't
6159			 * the dtrace pseudo provider, we're not interested.
6160			 */
6161			if (prov != dtrace_provider)
6162				continue;
6163
6164			/*
6165			 * Now we must further check if we are in the BEGIN
6166			 * probe.  If we are, we will only continue processing
6167			 * if we're still in WARMUP -- if one BEGIN enabling
6168			 * has invoked the exit() action, we don't want to
6169			 * evaluate subsequent BEGIN enablings.
6170			 */
6171			if (probe->dtpr_id == dtrace_probeid_begin &&
6172			    state->dts_activity != DTRACE_ACTIVITY_WARMUP) {
6173				ASSERT(state->dts_activity ==
6174				    DTRACE_ACTIVITY_DRAINING);
6175				continue;
6176			}
6177		}
6178
6179		if (ecb->dte_cond) {
6180			/*
6181			 * If the dte_cond bits indicate that this
6182			 * consumer is only allowed to see user-mode firings
6183			 * of this probe, call the provider's dtps_usermode()
6184			 * entry point to check that the probe was fired
6185			 * while in a user context. Skip this ECB if that's
6186			 * not the case.
6187			 */
6188			if ((ecb->dte_cond & DTRACE_COND_USERMODE) &&
6189			    prov->dtpv_pops.dtps_usermode(prov->dtpv_arg,
6190			    probe->dtpr_id, probe->dtpr_arg) == 0)
6191				continue;
6192
6193#if defined(sun)
6194			/*
6195			 * This is more subtle than it looks. We have to be
6196			 * absolutely certain that CRED() isn't going to
			 * change out from under us, so it's only legitimate
			 * to examine that structure if we're in constrained
			 * situations.  Currently, the only time we'll do this
			 * check is when a non-super-user has enabled the
6201			 * profile or syscall providers -- providers that
6202			 * allow visibility of all processes. For the
6203			 * profile case, the check above will ensure that
6204			 * we're examining a user context.
6205			 */
6206			if (ecb->dte_cond & DTRACE_COND_OWNER) {
6207				cred_t *cr;
6208				cred_t *s_cr =
6209				    ecb->dte_state->dts_cred.dcr_cred;
6210				proc_t *proc;
6211
6212				ASSERT(s_cr != NULL);
6213
6214				if ((cr = CRED()) == NULL ||
6215				    s_cr->cr_uid != cr->cr_uid ||
6216				    s_cr->cr_uid != cr->cr_ruid ||
6217				    s_cr->cr_uid != cr->cr_suid ||
6218				    s_cr->cr_gid != cr->cr_gid ||
6219				    s_cr->cr_gid != cr->cr_rgid ||
6220				    s_cr->cr_gid != cr->cr_sgid ||
6221				    (proc = ttoproc(curthread)) == NULL ||
6222				    (proc->p_flag & SNOCD))
6223					continue;
6224			}
6225
6226			if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) {
6227				cred_t *cr;
6228				cred_t *s_cr =
6229				    ecb->dte_state->dts_cred.dcr_cred;
6230
6231				ASSERT(s_cr != NULL);
6232
6233				if ((cr = CRED()) == NULL ||
6234				    s_cr->cr_zone->zone_id !=
6235				    cr->cr_zone->zone_id)
6236					continue;
6237			}
6238#endif
6239		}
6240
6241		if (now - state->dts_alive > dtrace_deadman_timeout) {
6242			/*
			 * We seem to be dead.  Unless we (a) have kernel
			 * destructive permissions, (b) have explicitly enabled
			 * destructive actions, and (c) destructive actions have
6246			 * not been disabled, we're going to transition into
6247			 * the KILLED state, from which no further processing
6248			 * on this state will be performed.
6249			 */
6250			if (!dtrace_priv_kernel_destructive(state) ||
6251			    !state->dts_cred.dcr_destructive ||
6252			    dtrace_destructive_disallow) {
6253				void *activity = &state->dts_activity;
6254				dtrace_activity_t current;
6255
6256				do {
6257					current = state->dts_activity;
6258				} while (dtrace_cas32(activity, current,
6259				    DTRACE_ACTIVITY_KILLED) != current);
6260
6261				continue;
6262			}
6263		}
6264
6265		if ((offs = dtrace_buffer_reserve(buf, ecb->dte_needed,
6266		    ecb->dte_alignment, state, &mstate)) < 0)
6267			continue;
6268
6269		tomax = buf->dtb_tomax;
6270		ASSERT(tomax != NULL);
6271
6272		if (ecb->dte_size != 0) {
6273			dtrace_rechdr_t dtrh;
6274			if (!(mstate.dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
6275				mstate.dtms_timestamp = dtrace_gethrtime();
6276				mstate.dtms_present |= DTRACE_MSTATE_TIMESTAMP;
6277			}
6278			ASSERT3U(ecb->dte_size, >=, sizeof (dtrace_rechdr_t));
6279			dtrh.dtrh_epid = ecb->dte_epid;
6280			DTRACE_RECORD_STORE_TIMESTAMP(&dtrh,
6281			    mstate.dtms_timestamp);
6282			*((dtrace_rechdr_t *)(tomax + offs)) = dtrh;
6283		}
6284
6285		mstate.dtms_epid = ecb->dte_epid;
6286		mstate.dtms_present |= DTRACE_MSTATE_EPID;
6287
6288		if (state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)
6289			mstate.dtms_access = DTRACE_ACCESS_KERNEL;
6290		else
6291			mstate.dtms_access = 0;
6292
6293		if (pred != NULL) {
6294			dtrace_difo_t *dp = pred->dtp_difo;
6295			int rval;
6296
6297			rval = dtrace_dif_emulate(dp, &mstate, vstate, state);
6298
6299			if (!(*flags & CPU_DTRACE_ERROR) && !rval) {
6300				dtrace_cacheid_t cid = probe->dtpr_predcache;
6301
6302				if (cid != DTRACE_CACHEIDNONE && !onintr) {
6303					/*
6304					 * Update the predicate cache...
6305					 */
6306					ASSERT(cid == pred->dtp_cacheid);
6307					curthread->t_predcache = cid;
6308				}
6309
6310				continue;
6311			}
6312		}
6313
6314		for (act = ecb->dte_action; !(*flags & CPU_DTRACE_ERROR) &&
6315		    act != NULL; act = act->dta_next) {
6316			size_t valoffs;
6317			dtrace_difo_t *dp;
6318			dtrace_recdesc_t *rec = &act->dta_rec;
6319
6320			size = rec->dtrd_size;
6321			valoffs = offs + rec->dtrd_offset;
6322
6323			if (DTRACEACT_ISAGG(act->dta_kind)) {
6324				uint64_t v = 0xbad;
6325				dtrace_aggregation_t *agg;
6326
6327				agg = (dtrace_aggregation_t *)act;
6328
6329				if ((dp = act->dta_difo) != NULL)
6330					v = dtrace_dif_emulate(dp,
6331					    &mstate, vstate, state);
6332
6333				if (*flags & CPU_DTRACE_ERROR)
6334					continue;
6335
6336				/*
6337				 * Note that we always pass the expression
6338				 * value from the previous iteration of the
6339				 * action loop.  This value will only be used
6340				 * if there is an expression argument to the
6341				 * aggregating action, denoted by the
6342				 * dtag_hasarg field.
6343				 */
6344				dtrace_aggregate(agg, buf,
6345				    offs, aggbuf, v, val);
6346				continue;
6347			}
6348
6349			switch (act->dta_kind) {
6350			case DTRACEACT_STOP:
6351				if (dtrace_priv_proc_destructive(state))
6352					dtrace_action_stop();
6353				continue;
6354
6355			case DTRACEACT_BREAKPOINT:
6356				if (dtrace_priv_kernel_destructive(state))
6357					dtrace_action_breakpoint(ecb);
6358				continue;
6359
6360			case DTRACEACT_PANIC:
6361				if (dtrace_priv_kernel_destructive(state))
6362					dtrace_action_panic(ecb);
6363				continue;
6364
6365			case DTRACEACT_STACK:
6366				if (!dtrace_priv_kernel(state))
6367					continue;
6368
6369				dtrace_getpcstack((pc_t *)(tomax + valoffs),
6370				    size / sizeof (pc_t), probe->dtpr_aframes,
6371				    DTRACE_ANCHORED(probe) ? NULL :
6372				    (uint32_t *)arg0);
6373				continue;
6374
6375			case DTRACEACT_JSTACK:
6376			case DTRACEACT_USTACK:
6377				if (!dtrace_priv_proc(state))
6378					continue;
6379
6380				/*
6381				 * See comment in DIF_VAR_PID.
6382				 */
6383				if (DTRACE_ANCHORED(mstate.dtms_probe) &&
6384				    CPU_ON_INTR(CPU)) {
6385					int depth = DTRACE_USTACK_NFRAMES(
6386					    rec->dtrd_arg) + 1;
6387
6388					dtrace_bzero((void *)(tomax + valoffs),
6389					    DTRACE_USTACK_STRSIZE(rec->dtrd_arg)
6390					    + depth * sizeof (uint64_t));
6391
6392					continue;
6393				}
6394
6395				if (DTRACE_USTACK_STRSIZE(rec->dtrd_arg) != 0 &&
6396				    curproc->p_dtrace_helpers != NULL) {
6397					/*
6398					 * This is the slow path -- we have
6399					 * allocated string space, and we're
6400					 * getting the stack of a process that
6401					 * has helpers.  Call into a separate
6402					 * routine to perform this processing.
6403					 */
6404					dtrace_action_ustack(&mstate, state,
6405					    (uint64_t *)(tomax + valoffs),
6406					    rec->dtrd_arg);
6407					continue;
6408				}
6409
6410				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6411				dtrace_getupcstack((uint64_t *)
6412				    (tomax + valoffs),
6413				    DTRACE_USTACK_NFRAMES(rec->dtrd_arg) + 1);
6414				DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6415				continue;
6416
6417			default:
6418				break;
6419			}
6420
6421			dp = act->dta_difo;
6422			ASSERT(dp != NULL);
6423
6424			val = dtrace_dif_emulate(dp, &mstate, vstate, state);
6425
6426			if (*flags & CPU_DTRACE_ERROR)
6427				continue;
6428
6429			switch (act->dta_kind) {
6430			case DTRACEACT_SPECULATE: {
6431				dtrace_rechdr_t *dtrh;
6432
6433				ASSERT(buf == &state->dts_buffer[cpuid]);
6434				buf = dtrace_speculation_buffer(state,
6435				    cpuid, val);
6436
6437				if (buf == NULL) {
6438					*flags |= CPU_DTRACE_DROP;
6439					continue;
6440				}
6441
6442				offs = dtrace_buffer_reserve(buf,
6443				    ecb->dte_needed, ecb->dte_alignment,
6444				    state, NULL);
6445
6446				if (offs < 0) {
6447					*flags |= CPU_DTRACE_DROP;
6448					continue;
6449				}
6450
6451				tomax = buf->dtb_tomax;
6452				ASSERT(tomax != NULL);
6453
6454				if (ecb->dte_size == 0)
6455					continue;
6456
6457				ASSERT3U(ecb->dte_size, >=,
6458				    sizeof (dtrace_rechdr_t));
6459				dtrh = ((void *)(tomax + offs));
6460				dtrh->dtrh_epid = ecb->dte_epid;
6461				/*
6462				 * When the speculation is committed, all of
6463				 * the records in the speculative buffer will
6464				 * have their timestamps set to the commit
6465				 * time.  Until then, it is set to a sentinel
				 * value, for debuggability.
6467				 */
6468				DTRACE_RECORD_STORE_TIMESTAMP(dtrh, UINT64_MAX);
6469				continue;
6470			}
6471
6472			case DTRACEACT_PRINTM: {
6473				/* The DIF returns a 'memref'. */
6474				uintptr_t *memref = (uintptr_t *)(uintptr_t) val;
6475
6476				/* Get the size from the memref. */
6477				size = memref[1];
6478
6479				/*
6480				 * Check if the size exceeds the allocated
6481				 * buffer size.
6482				 */
				if (size + sizeof(uintptr_t) >
				    dp->dtdo_rtype.dtdt_size) {
6484					/* Flag a drop! */
6485					*flags |= CPU_DTRACE_DROP;
6486					continue;
6487				}
6488
6489				/* Store the size in the buffer first. */
6490				DTRACE_STORE(uintptr_t, tomax,
6491				    valoffs, size);
6492
6493				/*
6494				 * Offset the buffer address to the start
6495				 * of the data.
6496				 */
6497				valoffs += sizeof(uintptr_t);
6498
6499				/*
6500				 * Reset to the memory address rather than
6501				 * the memref array, then let the BYREF
6502				 * code below do the work to store the
6503				 * memory data in the buffer.
6504				 */
6505				val = memref[0];
6506				break;
6507			}
6508
6509			case DTRACEACT_PRINTT: {
6510				/* The DIF returns a 'typeref'. */
6511				uintptr_t *typeref = (uintptr_t *)(uintptr_t) val;
6512				char c = '\0' + 1;
6513				size_t s;
6514
6515				/*
6516				 * Get the type string length and round it
6517				 * up so that the data that follows is
6518				 * aligned for easy access.
6519				 */
6520				size_t typs = strlen((char *) typeref[2]) + 1;
6521				typs = roundup(typs,  sizeof(uintptr_t));
6522
6523				/*
				 * Get the size from the typeref using the
6525				 * number of elements and the type size.
6526				 */
6527				size = typeref[1] * typeref[3];
6528
6529				/*
6530				 * Check if the size exceeds the allocated
6531				 * buffer size.
6532				 */
				if (size + typs + 2 * sizeof(uintptr_t) >
				    dp->dtdo_rtype.dtdt_size) {
					/* Flag a drop! */
					*flags |= CPU_DTRACE_DROP;
					continue;
				}
6538
6539				/* Store the size in the buffer first. */
6540				DTRACE_STORE(uintptr_t, tomax,
6541				    valoffs, size);
6542				valoffs += sizeof(uintptr_t);
6543
6544				/* Store the type size in the buffer. */
6545				DTRACE_STORE(uintptr_t, tomax,
6546				    valoffs, typeref[3]);
6547				valoffs += sizeof(uintptr_t);
6548
6549				val = typeref[2];
6550
6551				for (s = 0; s < typs; s++) {
6552					if (c != '\0')
6553						c = dtrace_load8(val++);
6554
6555					DTRACE_STORE(uint8_t, tomax,
6556					    valoffs++, c);
6557				}
6558
6559				/*
6560				 * Reset to the memory address rather than
6561				 * the typeref array, then let the BYREF
6562				 * code below do the work to store the
6563				 * memory data in the buffer.
6564				 */
6565				val = typeref[0];
6566				break;
6567			}
6568
6569			case DTRACEACT_CHILL:
6570				if (dtrace_priv_kernel_destructive(state))
6571					dtrace_action_chill(&mstate, val);
6572				continue;
6573
6574			case DTRACEACT_RAISE:
6575				if (dtrace_priv_proc_destructive(state))
6576					dtrace_action_raise(val);
6577				continue;
6578
6579			case DTRACEACT_COMMIT:
6580				ASSERT(!committed);
6581
6582				/*
6583				 * We need to commit our buffer state.
6584				 */
6585				if (ecb->dte_size)
6586					buf->dtb_offset = offs + ecb->dte_size;
6587				buf = &state->dts_buffer[cpuid];
6588				dtrace_speculation_commit(state, cpuid, val);
6589				committed = 1;
6590				continue;
6591
6592			case DTRACEACT_DISCARD:
6593				dtrace_speculation_discard(state, cpuid, val);
6594				continue;
6595
6596			case DTRACEACT_DIFEXPR:
6597			case DTRACEACT_LIBACT:
6598			case DTRACEACT_PRINTF:
6599			case DTRACEACT_PRINTA:
6600			case DTRACEACT_SYSTEM:
6601			case DTRACEACT_FREOPEN:
6602			case DTRACEACT_TRACEMEM:
6603				break;
6604
6605			case DTRACEACT_TRACEMEM_DYNSIZE:
6606				tracememsize = val;
6607				break;
6608
6609			case DTRACEACT_SYM:
6610			case DTRACEACT_MOD:
6611				if (!dtrace_priv_kernel(state))
6612					continue;
6613				break;
6614
6615			case DTRACEACT_USYM:
6616			case DTRACEACT_UMOD:
6617			case DTRACEACT_UADDR: {
6618#if defined(sun)
6619				struct pid *pid = curthread->t_procp->p_pidp;
6620#endif
6621
6622				if (!dtrace_priv_proc(state))
6623					continue;
6624
6625				DTRACE_STORE(uint64_t, tomax,
6626#if defined(sun)
6627				    valoffs, (uint64_t)pid->pid_id);
6628#else
6629				    valoffs, (uint64_t) curproc->p_pid);
6630#endif
6631				DTRACE_STORE(uint64_t, tomax,
6632				    valoffs + sizeof (uint64_t), val);
6633
6634				continue;
6635			}
6636
6637			case DTRACEACT_EXIT: {
6638				/*
6639				 * For the exit action, we are going to attempt
6640				 * to atomically set our activity to be
6641				 * draining.  If this fails (either because
6642				 * another CPU has beat us to the exit action,
6643				 * or because our current activity is something
6644				 * other than ACTIVE or WARMUP), we will
6645				 * continue.  This assures that the exit action
6646				 * can be successfully recorded at most once
6647				 * when we're in the ACTIVE state.  If we're
6648				 * encountering the exit() action while in
6649				 * COOLDOWN, however, we want to honor the new
6650				 * status code.  (We know that we're the only
6651				 * thread in COOLDOWN, so there is no race.)
6652				 */
6653				void *activity = &state->dts_activity;
6654				dtrace_activity_t current = state->dts_activity;
6655
6656				if (current == DTRACE_ACTIVITY_COOLDOWN)
6657					break;
6658
6659				if (current != DTRACE_ACTIVITY_WARMUP)
6660					current = DTRACE_ACTIVITY_ACTIVE;
6661
6662				if (dtrace_cas32(activity, current,
6663				    DTRACE_ACTIVITY_DRAINING) != current) {
6664					*flags |= CPU_DTRACE_DROP;
6665					continue;
6666				}
6667
6668				break;
6669			}
6670
6671			default:
6672				ASSERT(0);
6673			}
6674
6675			if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF) {
6676				uintptr_t end = valoffs + size;
6677
6678				if (tracememsize != 0 &&
6679				    valoffs + tracememsize < end) {
6680					end = valoffs + tracememsize;
6681					tracememsize = 0;
6682				}
6683
6684				if (!dtrace_vcanload((void *)(uintptr_t)val,
6685				    &dp->dtdo_rtype, &mstate, vstate))
6686					continue;
6687
6688				/*
6689				 * If this is a string, we're going to only
6690				 * load until we find the zero byte -- after
6691				 * which we'll store zero bytes.
6692				 */
6693				if (dp->dtdo_rtype.dtdt_kind ==
6694				    DIF_TYPE_STRING) {
6695					char c = '\0' + 1;
6696					int intuple = act->dta_intuple;
6697					size_t s;
6698
6699					for (s = 0; s < size; s++) {
6700						if (c != '\0')
6701							c = dtrace_load8(val++);
6702
6703						DTRACE_STORE(uint8_t, tomax,
6704						    valoffs++, c);
6705
6706						if (c == '\0' && intuple)
6707							break;
6708					}
6709
6710					continue;
6711				}
6712
6713				while (valoffs < end) {
6714					DTRACE_STORE(uint8_t, tomax, valoffs++,
6715					    dtrace_load8(val++));
6716				}
6717
6718				continue;
6719			}
6720
6721			switch (size) {
6722			case 0:
6723				break;
6724
6725			case sizeof (uint8_t):
6726				DTRACE_STORE(uint8_t, tomax, valoffs, val);
6727				break;
6728			case sizeof (uint16_t):
6729				DTRACE_STORE(uint16_t, tomax, valoffs, val);
6730				break;
6731			case sizeof (uint32_t):
6732				DTRACE_STORE(uint32_t, tomax, valoffs, val);
6733				break;
6734			case sizeof (uint64_t):
6735				DTRACE_STORE(uint64_t, tomax, valoffs, val);
6736				break;
6737			default:
6738				/*
6739				 * Any other size should have been returned by
6740				 * reference, not by value.
6741				 */
6742				ASSERT(0);
6743				break;
6744			}
6745		}
6746
6747		if (*flags & CPU_DTRACE_DROP)
6748			continue;
6749
6750		if (*flags & CPU_DTRACE_FAULT) {
6751			int ndx;
6752			dtrace_action_t *err;
6753
6754			buf->dtb_errors++;
6755
6756			if (probe->dtpr_id == dtrace_probeid_error) {
6757				/*
6758				 * There's nothing we can do -- we had an
6759				 * error on the error probe.  We bump an
6760				 * error counter to at least indicate that
6761				 * this condition happened.
6762				 */
6763				dtrace_error(&state->dts_dblerrors);
6764				continue;
6765			}
6766
6767			if (vtime) {
6768				/*
6769				 * Before recursing on dtrace_probe(), we
6770				 * need to explicitly clear out our start
6771				 * time to prevent it from being accumulated
6772				 * into t_dtrace_vtime.
6773				 */
6774				curthread->t_dtrace_start = 0;
6775			}
6776
6777			/*
6778			 * Iterate over the actions to figure out which action
6779			 * we were processing when we experienced the error.
6780			 * Note that act points _past_ the faulting action; if
6781			 * act is ecb->dte_action, the fault was in the
6782			 * predicate, if it's ecb->dte_action->dta_next it's
6783			 * in action #1, and so on.
6784			 */
6785			for (err = ecb->dte_action, ndx = 0;
6786			    err != act; err = err->dta_next, ndx++)
6787				continue;
6788
6789			dtrace_probe_error(state, ecb->dte_epid, ndx,
6790			    (mstate.dtms_present & DTRACE_MSTATE_FLTOFFS) ?
6791			    mstate.dtms_fltoffs : -1, DTRACE_FLAGS2FLT(*flags),
6792			    cpu_core[cpuid].cpuc_dtrace_illval);
6793
6794			continue;
6795		}
6796
6797		if (!committed)
6798			buf->dtb_offset = offs + ecb->dte_size;
6799	}
6800
6801	if (vtime)
6802		curthread->t_dtrace_start = dtrace_gethrtime();
6803
6804	dtrace_interrupt_enable(cookie);
6805}
6806
6807/*
6808 * DTrace Probe Hashing Functions
6809 *
6810 * The functions in this section (and indeed, the functions in remaining
6811 * sections) are not _called_ from probe context.  (Any exceptions to this are
6812 * marked with a "Note:".)  Rather, they are called from elsewhere in the
 * DTrace framework to look up probes in, add probes to, and remove probes from
6814 * the DTrace probe hashes.  (Each probe is hashed by each element of the
6815 * probe tuple -- allowing for fast lookups, regardless of what was
6816 * specified.)
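 *
 * For instance (a sketch; the actual calls are made at attach time), each
 * hash is built by handing dtrace_hash_create() the offsets of one tuple
 * string and of its chain pointers within dtrace_probe_t:
 *
 *	dtrace_bymod = dtrace_hash_create(
 *	    offsetof(dtrace_probe_t, dtpr_mod),
 *	    offsetof(dtrace_probe_t, dtpr_nextmod),
 *	    offsetof(dtrace_probe_t, dtpr_prevmod));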
6817 */
6818static uint_t
6819dtrace_hash_str(const char *p)
6820{
6821	unsigned int g;
6822	uint_t hval = 0;
6823
6824	while (*p) {
6825		hval = (hval << 4) + *p++;
6826		if ((g = (hval & 0xf0000000)) != 0)
6827			hval ^= g >> 24;
6828		hval &= ~g;
6829	}
6830	return (hval);
6831}
6832
6833static dtrace_hash_t *
6834dtrace_hash_create(uintptr_t stroffs, uintptr_t nextoffs, uintptr_t prevoffs)
6835{
6836	dtrace_hash_t *hash = kmem_zalloc(sizeof (dtrace_hash_t), KM_SLEEP);
6837
6838	hash->dth_stroffs = stroffs;
6839	hash->dth_nextoffs = nextoffs;
6840	hash->dth_prevoffs = prevoffs;
6841
6842	hash->dth_size = 1;
6843	hash->dth_mask = hash->dth_size - 1;
6844
6845	hash->dth_tab = kmem_zalloc(hash->dth_size *
6846	    sizeof (dtrace_hashbucket_t *), KM_SLEEP);
6847
6848	return (hash);
6849}
6850
6851static void
6852dtrace_hash_destroy(dtrace_hash_t *hash)
6853{
6854#ifdef DEBUG
6855	int i;
6856
6857	for (i = 0; i < hash->dth_size; i++)
6858		ASSERT(hash->dth_tab[i] == NULL);
6859#endif
6860
6861	kmem_free(hash->dth_tab,
6862	    hash->dth_size * sizeof (dtrace_hashbucket_t *));
6863	kmem_free(hash, sizeof (dtrace_hash_t));
6864}
6865
6866static void
6867dtrace_hash_resize(dtrace_hash_t *hash)
6868{
6869	int size = hash->dth_size, i, ndx;
6870	int new_size = hash->dth_size << 1;
6871	int new_mask = new_size - 1;
6872	dtrace_hashbucket_t **new_tab, *bucket, *next;
6873
6874	ASSERT((new_size & new_mask) == 0);
6875
6876	new_tab = kmem_zalloc(new_size * sizeof (void *), KM_SLEEP);
6877
6878	for (i = 0; i < size; i++) {
6879		for (bucket = hash->dth_tab[i]; bucket != NULL; bucket = next) {
6880			dtrace_probe_t *probe = bucket->dthb_chain;
6881
6882			ASSERT(probe != NULL);
6883			ndx = DTRACE_HASHSTR(hash, probe) & new_mask;
6884
6885			next = bucket->dthb_next;
6886			bucket->dthb_next = new_tab[ndx];
6887			new_tab[ndx] = bucket;
6888		}
6889	}
6890
6891	kmem_free(hash->dth_tab, hash->dth_size * sizeof (void *));
6892	hash->dth_tab = new_tab;
6893	hash->dth_size = new_size;
6894	hash->dth_mask = new_mask;
6895}
6896
6897static void
6898dtrace_hash_add(dtrace_hash_t *hash, dtrace_probe_t *new)
6899{
6900	int hashval = DTRACE_HASHSTR(hash, new);
6901	int ndx = hashval & hash->dth_mask;
6902	dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
6903	dtrace_probe_t **nextp, **prevp;
6904
6905	for (; bucket != NULL; bucket = bucket->dthb_next) {
6906		if (DTRACE_HASHEQ(hash, bucket->dthb_chain, new))
6907			goto add;
6908	}
6909
6910	if ((hash->dth_nbuckets >> 1) > hash->dth_size) {
6911		dtrace_hash_resize(hash);
6912		dtrace_hash_add(hash, new);
6913		return;
6914	}
6915
6916	bucket = kmem_zalloc(sizeof (dtrace_hashbucket_t), KM_SLEEP);
6917	bucket->dthb_next = hash->dth_tab[ndx];
6918	hash->dth_tab[ndx] = bucket;
6919	hash->dth_nbuckets++;
6920
6921add:
6922	nextp = DTRACE_HASHNEXT(hash, new);
6923	ASSERT(*nextp == NULL && *(DTRACE_HASHPREV(hash, new)) == NULL);
6924	*nextp = bucket->dthb_chain;
6925
6926	if (bucket->dthb_chain != NULL) {
6927		prevp = DTRACE_HASHPREV(hash, bucket->dthb_chain);
6928		ASSERT(*prevp == NULL);
6929		*prevp = new;
6930	}
6931
6932	bucket->dthb_chain = new;
6933	bucket->dthb_len++;
6934}
6935
6936static dtrace_probe_t *
6937dtrace_hash_lookup(dtrace_hash_t *hash, dtrace_probe_t *template)
6938{
6939	int hashval = DTRACE_HASHSTR(hash, template);
6940	int ndx = hashval & hash->dth_mask;
6941	dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
6942
6943	for (; bucket != NULL; bucket = bucket->dthb_next) {
6944		if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
6945			return (bucket->dthb_chain);
6946	}
6947
6948	return (NULL);
6949}
6950
6951static int
6952dtrace_hash_collisions(dtrace_hash_t *hash, dtrace_probe_t *template)
6953{
6954	int hashval = DTRACE_HASHSTR(hash, template);
6955	int ndx = hashval & hash->dth_mask;
6956	dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
6957
6958	for (; bucket != NULL; bucket = bucket->dthb_next) {
6959		if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
6960			return (bucket->dthb_len);
6961	}
6962
6963	return (0);
6964}
6965
6966static void
6967dtrace_hash_remove(dtrace_hash_t *hash, dtrace_probe_t *probe)
6968{
6969	int ndx = DTRACE_HASHSTR(hash, probe) & hash->dth_mask;
6970	dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
6971
6972	dtrace_probe_t **prevp = DTRACE_HASHPREV(hash, probe);
6973	dtrace_probe_t **nextp = DTRACE_HASHNEXT(hash, probe);
6974
6975	/*
6976	 * Find the bucket that we're removing this probe from.
6977	 */
6978	for (; bucket != NULL; bucket = bucket->dthb_next) {
6979		if (DTRACE_HASHEQ(hash, bucket->dthb_chain, probe))
6980			break;
6981	}
6982
6983	ASSERT(bucket != NULL);
6984
6985	if (*prevp == NULL) {
6986		if (*nextp == NULL) {
6987			/*
6988			 * The removed probe was the only probe on this
6989			 * bucket; we need to remove the bucket.
6990			 */
6991			dtrace_hashbucket_t *b = hash->dth_tab[ndx];
6992
6993			ASSERT(bucket->dthb_chain == probe);
6994			ASSERT(b != NULL);
6995
6996			if (b == bucket) {
6997				hash->dth_tab[ndx] = bucket->dthb_next;
6998			} else {
6999				while (b->dthb_next != bucket)
7000					b = b->dthb_next;
7001				b->dthb_next = bucket->dthb_next;
7002			}
7003
7004			ASSERT(hash->dth_nbuckets > 0);
7005			hash->dth_nbuckets--;
7006			kmem_free(bucket, sizeof (dtrace_hashbucket_t));
7007			return;
7008		}
7009
7010		bucket->dthb_chain = *nextp;
7011	} else {
7012		*(DTRACE_HASHNEXT(hash, *prevp)) = *nextp;
7013	}
7014
7015	if (*nextp != NULL)
7016		*(DTRACE_HASHPREV(hash, *nextp)) = *prevp;
7017}
7018
7019/*
7020 * DTrace Utility Functions
7021 *
7022 * These are random utility functions that are _not_ called from probe context.
7023 */
7024static int
7025dtrace_badattr(const dtrace_attribute_t *a)
7026{
7027	return (a->dtat_name > DTRACE_STABILITY_MAX ||
7028	    a->dtat_data > DTRACE_STABILITY_MAX ||
7029	    a->dtat_class > DTRACE_CLASS_MAX);
7030}
7031
7032/*
 * Return a copy of the specified string.  If the string is NULL,
7034 * this function returns a zero-length string.
7035 */
7036static char *
7037dtrace_strdup(const char *str)
7038{
7039	char *new = kmem_zalloc((str != NULL ? strlen(str) : 0) + 1, KM_SLEEP);
7040
7041	if (str != NULL)
7042		(void) strcpy(new, str);
7043
7044	return (new);
7045}
7046
7047#define	DTRACE_ISALPHA(c)	\
7048	(((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z'))
7049
7050static int
7051dtrace_badname(const char *s)
7052{
7053	char c;
7054
7055	if (s == NULL || (c = *s++) == '\0')
7056		return (0);
7057
7058	if (!DTRACE_ISALPHA(c) && c != '-' && c != '_' && c != '.')
7059		return (1);
7060
7061	while ((c = *s++) != '\0') {
7062		if (!DTRACE_ISALPHA(c) && (c < '0' || c > '9') &&
7063		    c != '-' && c != '_' && c != '.' && c != '`')
7064			return (1);
7065	}
7066
7067	return (0);
7068}
7069
7070static void
7071dtrace_cred2priv(cred_t *cr, uint32_t *privp, uid_t *uidp, zoneid_t *zoneidp)
7072{
7073	uint32_t priv;
7074
7075#if defined(sun)
7076	if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
7077		/*
7078		 * For DTRACE_PRIV_ALL, the uid and zoneid don't matter.
7079		 */
7080		priv = DTRACE_PRIV_ALL;
7081	} else {
7082		*uidp = crgetuid(cr);
7083		*zoneidp = crgetzoneid(cr);
7084
7085		priv = 0;
7086		if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE))
7087			priv |= DTRACE_PRIV_KERNEL | DTRACE_PRIV_USER;
7088		else if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE))
7089			priv |= DTRACE_PRIV_USER;
7090		if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE))
7091			priv |= DTRACE_PRIV_PROC;
7092		if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
7093			priv |= DTRACE_PRIV_OWNER;
7094		if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
7095			priv |= DTRACE_PRIV_ZONEOWNER;
7096	}
7097#else
7098	priv = DTRACE_PRIV_ALL;
7099#endif
7100
7101	*privp = priv;
7102}
7103
7104#ifdef DTRACE_ERRDEBUG
7105static void
7106dtrace_errdebug(const char *str)
7107{
7108	int hval = dtrace_hash_str(str) % DTRACE_ERRHASHSZ;
7109	int occupied = 0;
7110
7111	mutex_enter(&dtrace_errlock);
7112	dtrace_errlast = str;
7113	dtrace_errthread = curthread;
7114
7115	while (occupied++ < DTRACE_ERRHASHSZ) {
7116		if (dtrace_errhash[hval].dter_msg == str) {
7117			dtrace_errhash[hval].dter_count++;
7118			goto out;
7119		}
7120
7121		if (dtrace_errhash[hval].dter_msg != NULL) {
7122			hval = (hval + 1) % DTRACE_ERRHASHSZ;
7123			continue;
7124		}
7125
7126		dtrace_errhash[hval].dter_msg = str;
7127		dtrace_errhash[hval].dter_count = 1;
7128		goto out;
7129	}
7130
7131	panic("dtrace: undersized error hash");
7132out:
7133	mutex_exit(&dtrace_errlock);
7134}
7135#endif
7136
7137/*
7138 * DTrace Matching Functions
7139 *
7140 * These functions are used to match groups of probes, given some elements of
7141 * a probe tuple, or some globbed expressions for elements of a probe tuple.
7142 */
7143static int
7144dtrace_match_priv(const dtrace_probe_t *prp, uint32_t priv, uid_t uid,
7145    zoneid_t zoneid)
7146{
7147	if (priv != DTRACE_PRIV_ALL) {
7148		uint32_t ppriv = prp->dtpr_provider->dtpv_priv.dtpp_flags;
7149		uint32_t match = priv & ppriv;
7150
7151		/*
7152		 * No PRIV_DTRACE_* privileges...
7153		 */
7154		if ((priv & (DTRACE_PRIV_PROC | DTRACE_PRIV_USER |
7155		    DTRACE_PRIV_KERNEL)) == 0)
7156			return (0);
7157
7158		/*
7159		 * No matching bits, but there were bits to match...
7160		 */
7161		if (match == 0 && ppriv != 0)
7162			return (0);
7163
7164		/*
7165		 * Need to have permissions to the process, but don't...
7166		 */
7167		if (((ppriv & ~match) & DTRACE_PRIV_OWNER) != 0 &&
7168		    uid != prp->dtpr_provider->dtpv_priv.dtpp_uid) {
7169			return (0);
7170		}
7171
7172		/*
7173		 * Need to be in the same zone unless we possess the
7174		 * privilege to examine all zones.
7175		 */
7176		if (((ppriv & ~match) & DTRACE_PRIV_ZONEOWNER) != 0 &&
7177		    zoneid != prp->dtpr_provider->dtpv_priv.dtpp_zoneid) {
7178			return (0);
7179		}
7180	}
7181
7182	return (1);
7183}
7184
7185/*
7186 * dtrace_match_probe compares a dtrace_probe_t to a pre-compiled key, which
7187 * consists of input pattern strings and an ops-vector to evaluate them.
7188 * This function returns >0 for match, 0 for no match, and <0 for error.
7189 */
7190static int
7191dtrace_match_probe(const dtrace_probe_t *prp, const dtrace_probekey_t *pkp,
7192    uint32_t priv, uid_t uid, zoneid_t zoneid)
7193{
7194	dtrace_provider_t *pvp = prp->dtpr_provider;
7195	int rv;
7196
7197	if (pvp->dtpv_defunct)
7198		return (0);
7199
7200	if ((rv = pkp->dtpk_pmatch(pvp->dtpv_name, pkp->dtpk_prov, 0)) <= 0)
7201		return (rv);
7202
7203	if ((rv = pkp->dtpk_mmatch(prp->dtpr_mod, pkp->dtpk_mod, 0)) <= 0)
7204		return (rv);
7205
7206	if ((rv = pkp->dtpk_fmatch(prp->dtpr_func, pkp->dtpk_func, 0)) <= 0)
7207		return (rv);
7208
7209	if ((rv = pkp->dtpk_nmatch(prp->dtpr_name, pkp->dtpk_name, 0)) <= 0)
7210		return (rv);
7211
7212	if (dtrace_match_priv(prp, priv, uid, zoneid) == 0)
7213		return (0);
7214
7215	return (rv);
7216}
7217
7218/*
7219 * dtrace_match_glob() is a safe kernel implementation of the gmatch(3GEN)
7220 * interface for matching a glob pattern 'p' to an input string 's'.  Unlike
7221 * libc's version, the kernel version only applies to 8-bit ASCII strings.
7222 * In addition, all of the recursion cases except for '*' matching have been
7223 * unwound.  For '*', we still implement recursive evaluation, but a depth
7224 * counter is maintained and matching is aborted if we recurse too deep.
7225 * The function returns 0 if no match, >0 if match, and <0 if recursion error.
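 *
 * For example, dtrace_match_glob("read", "re*", 0) returns 1, as does
 * dtrace_match_glob("read", "r?a[a-e]", 0); dtrace_match_glob("read",
 * "write", 0) returns 0.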
7226 */
7227static int
7228dtrace_match_glob(const char *s, const char *p, int depth)
7229{
7230	const char *olds;
7231	char s1, c;
7232	int gs;
7233
7234	if (depth > DTRACE_PROBEKEY_MAXDEPTH)
7235		return (-1);
7236
7237	if (s == NULL)
7238		s = ""; /* treat NULL as empty string */
7239
7240top:
7241	olds = s;
7242	s1 = *s++;
7243
7244	if (p == NULL)
7245		return (0);
7246
7247	if ((c = *p++) == '\0')
7248		return (s1 == '\0');
7249
7250	switch (c) {
7251	case '[': {
7252		int ok = 0, notflag = 0;
7253		char lc = '\0';
7254
7255		if (s1 == '\0')
7256			return (0);
7257
7258		if (*p == '!') {
7259			notflag = 1;
7260			p++;
7261		}
7262
7263		if ((c = *p++) == '\0')
7264			return (0);
7265
7266		do {
7267			if (c == '-' && lc != '\0' && *p != ']') {
7268				if ((c = *p++) == '\0')
7269					return (0);
7270				if (c == '\\' && (c = *p++) == '\0')
7271					return (0);
7272
7273				if (notflag) {
7274					if (s1 < lc || s1 > c)
7275						ok++;
7276					else
7277						return (0);
7278				} else if (lc <= s1 && s1 <= c)
7279					ok++;
7280
7281			} else if (c == '\\' && (c = *p++) == '\0')
7282				return (0);
7283
7284			lc = c; /* save left-hand 'c' for next iteration */
7285
7286			if (notflag) {
7287				if (s1 != c)
7288					ok++;
7289				else
7290					return (0);
7291			} else if (s1 == c)
7292				ok++;
7293
7294			if ((c = *p++) == '\0')
7295				return (0);
7296
7297		} while (c != ']');
7298
7299		if (ok)
7300			goto top;
7301
7302		return (0);
7303	}
7304
7305	case '\\':
7306		if ((c = *p++) == '\0')
7307			return (0);
7308		/*FALLTHRU*/
7309
7310	default:
7311		if (c != s1)
7312			return (0);
7313		/*FALLTHRU*/
7314
7315	case '?':
7316		if (s1 != '\0')
7317			goto top;
7318		return (0);
7319
7320	case '*':
7321		while (*p == '*')
7322			p++; /* consecutive *'s are identical to a single one */
7323
7324		if (*p == '\0')
7325			return (1);
7326
7327		for (s = olds; *s != '\0'; s++) {
7328			if ((gs = dtrace_match_glob(s, p, depth + 1)) != 0)
7329				return (gs);
7330		}
7331
7332		return (0);
7333	}
7334}
7335
7336/*ARGSUSED*/
7337static int
7338dtrace_match_string(const char *s, const char *p, int depth)
7339{
7340	return (s != NULL && strcmp(s, p) == 0);
7341}
7342
7343/*ARGSUSED*/
7344static int
7345dtrace_match_nul(const char *s, const char *p, int depth)
7346{
7347	return (1); /* always match the empty pattern */
7348}
7349
7350/*ARGSUSED*/
7351static int
7352dtrace_match_nonzero(const char *s, const char *p, int depth)
7353{
7354	return (s != NULL && s[0] != '\0');
7355}
7356
7357static int
7358dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid,
7359    zoneid_t zoneid, int (*matched)(dtrace_probe_t *, void *), void *arg)
7360{
7361	dtrace_probe_t template, *probe;
7362	dtrace_hash_t *hash = NULL;
7363	int len, best = INT_MAX, nmatched = 0;
7364	dtrace_id_t i;
7365
7366	ASSERT(MUTEX_HELD(&dtrace_lock));
7367
7368	/*
7369	 * If the probe ID is specified in the key, just lookup by ID and
7370	 * invoke the match callback once if a matching probe is found.
7371	 */
7372	if (pkp->dtpk_id != DTRACE_IDNONE) {
7373		if ((probe = dtrace_probe_lookup_id(pkp->dtpk_id)) != NULL &&
7374		    dtrace_match_probe(probe, pkp, priv, uid, zoneid) > 0) {
7375			(void) (*matched)(probe, arg);
7376			nmatched++;
7377		}
7378		return (nmatched);
7379	}
7380
7381	template.dtpr_mod = (char *)pkp->dtpk_mod;
7382	template.dtpr_func = (char *)pkp->dtpk_func;
7383	template.dtpr_name = (char *)pkp->dtpk_name;
7384
7385	/*
7386	 * We want to find the most distinct of the module name, function
7387	 * name, and name.  So for each one that is not a glob pattern or
7388	 * empty string, we perform a lookup in the corresponding hash and
7389	 * use the hash table with the fewest collisions to do our search.
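	 *
	 * For example, for a description like syscall::read:entry, "entry"
	 * is shared by a great many probes while "read" is not, so
	 * dtrace_byfunc will typically have the fewest collisions and be
	 * chosen as the hash to search.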
7390	 */
7391	if (pkp->dtpk_mmatch == &dtrace_match_string &&
7392	    (len = dtrace_hash_collisions(dtrace_bymod, &template)) < best) {
7393		best = len;
7394		hash = dtrace_bymod;
7395	}
7396
7397	if (pkp->dtpk_fmatch == &dtrace_match_string &&
7398	    (len = dtrace_hash_collisions(dtrace_byfunc, &template)) < best) {
7399		best = len;
7400		hash = dtrace_byfunc;
7401	}
7402
7403	if (pkp->dtpk_nmatch == &dtrace_match_string &&
7404	    (len = dtrace_hash_collisions(dtrace_byname, &template)) < best) {
7405		best = len;
7406		hash = dtrace_byname;
7407	}
7408
7409	/*
7410	 * If we did not select a hash table, iterate over every probe and
7411	 * invoke our callback for each one that matches our input probe key.
7412	 */
7413	if (hash == NULL) {
7414		for (i = 0; i < dtrace_nprobes; i++) {
7415			if ((probe = dtrace_probes[i]) == NULL ||
7416			    dtrace_match_probe(probe, pkp, priv, uid,
7417			    zoneid) <= 0)
7418				continue;
7419
7420			nmatched++;
7421
7422			if ((*matched)(probe, arg) != DTRACE_MATCH_NEXT)
7423				break;
7424		}
7425
7426		return (nmatched);
7427	}
7428
7429	/*
7430	 * If we selected a hash table, iterate over each probe of the same key
7431	 * name and invoke the callback for every probe that matches the other
7432	 * attributes of our input probe key.
7433	 */
7434	for (probe = dtrace_hash_lookup(hash, &template); probe != NULL;
7435	    probe = *(DTRACE_HASHNEXT(hash, probe))) {
7436
7437		if (dtrace_match_probe(probe, pkp, priv, uid, zoneid) <= 0)
7438			continue;
7439
7440		nmatched++;
7441
7442		if ((*matched)(probe, arg) != DTRACE_MATCH_NEXT)
7443			break;
7444	}
7445
7446	return (nmatched);
7447}
7448
7449/*
 * Return the function pointer dtrace_match_probe() should use to compare the
7451 * specified pattern with a string.  For NULL or empty patterns, we select
7452 * dtrace_match_nul().  For glob pattern strings, we use dtrace_match_glob().
7453 * For non-empty non-glob strings, we use dtrace_match_string().
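 *
 * For example, dtrace_probekey_func(NULL) and dtrace_probekey_func("")
 * both select dtrace_match_nul(); dtrace_probekey_func("re*") selects
 * dtrace_match_glob(); and dtrace_probekey_func("read") selects
 * dtrace_match_string().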
7454 */
7455static dtrace_probekey_f *
7456dtrace_probekey_func(const char *p)
7457{
7458	char c;
7459
7460	if (p == NULL || *p == '\0')
7461		return (&dtrace_match_nul);
7462
7463	while ((c = *p++) != '\0') {
7464		if (c == '[' || c == '?' || c == '*' || c == '\\')
7465			return (&dtrace_match_glob);
7466	}
7467
7468	return (&dtrace_match_string);
7469}
7470
7471/*
7472 * Build a probe comparison key for use with dtrace_match_probe() from the
7473 * given probe description.  By convention, a null key only matches anchored
7474 * probes: if each field is the empty string, reset dtpk_fmatch to
7475 * dtrace_match_nonzero().
7476 */
7477static void
7478dtrace_probekey(dtrace_probedesc_t *pdp, dtrace_probekey_t *pkp)
7479{
7480	pkp->dtpk_prov = pdp->dtpd_provider;
7481	pkp->dtpk_pmatch = dtrace_probekey_func(pdp->dtpd_provider);
7482
7483	pkp->dtpk_mod = pdp->dtpd_mod;
7484	pkp->dtpk_mmatch = dtrace_probekey_func(pdp->dtpd_mod);
7485
7486	pkp->dtpk_func = pdp->dtpd_func;
7487	pkp->dtpk_fmatch = dtrace_probekey_func(pdp->dtpd_func);
7488
7489	pkp->dtpk_name = pdp->dtpd_name;
7490	pkp->dtpk_nmatch = dtrace_probekey_func(pdp->dtpd_name);
7491
7492	pkp->dtpk_id = pdp->dtpd_id;
7493
7494	if (pkp->dtpk_id == DTRACE_IDNONE &&
7495	    pkp->dtpk_pmatch == &dtrace_match_nul &&
7496	    pkp->dtpk_mmatch == &dtrace_match_nul &&
7497	    pkp->dtpk_fmatch == &dtrace_match_nul &&
7498	    pkp->dtpk_nmatch == &dtrace_match_nul)
7499		pkp->dtpk_fmatch = &dtrace_match_nonzero;
7500}
7501
7502/*
7503 * DTrace Provider-to-Framework API Functions
7504 *
7505 * These functions implement much of the Provider-to-Framework API, as
7506 * described in <sys/dtrace.h>.  The parts of the API not in this section are
7507 * the functions in the API for probe management (found below), and
7508 * dtrace_probe() itself (found above).
7509 */
7510
7511/*
7512 * Register the calling provider with the DTrace framework.  This should
7513 * generally be called by DTrace providers in their attach(9E) entry point.
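 *
 * A minimal sketch (hypothetical provider "foo"; foo_attr, foo_pops and
 * foo_id stand in for the provider's own definitions):
 *
 *	if (dtrace_register("foo", &foo_attr, DTRACE_PRIV_KERNEL, NULL,
 *	    &foo_pops, NULL, &foo_id) != 0)
 *		return (DDI_FAILURE);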
7514 */
7515int
7516dtrace_register(const char *name, const dtrace_pattr_t *pap, uint32_t priv,
7517    cred_t *cr, const dtrace_pops_t *pops, void *arg, dtrace_provider_id_t *idp)
7518{
7519	dtrace_provider_t *provider;
7520
7521	if (name == NULL || pap == NULL || pops == NULL || idp == NULL) {
7522		cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7523		    "arguments", name ? name : "<NULL>");
7524		return (EINVAL);
7525	}
7526
7527	if (name[0] == '\0' || dtrace_badname(name)) {
7528		cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7529		    "provider name", name);
7530		return (EINVAL);
7531	}
7532
7533	if ((pops->dtps_provide == NULL && pops->dtps_provide_module == NULL) ||
7534	    pops->dtps_enable == NULL || pops->dtps_disable == NULL ||
7535	    pops->dtps_destroy == NULL ||
7536	    ((pops->dtps_resume == NULL) != (pops->dtps_suspend == NULL))) {
7537		cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7538		    "provider ops", name);
7539		return (EINVAL);
7540	}
7541
7542	if (dtrace_badattr(&pap->dtpa_provider) ||
7543	    dtrace_badattr(&pap->dtpa_mod) ||
7544	    dtrace_badattr(&pap->dtpa_func) ||
7545	    dtrace_badattr(&pap->dtpa_name) ||
7546	    dtrace_badattr(&pap->dtpa_args)) {
7547		cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7548		    "provider attributes", name);
7549		return (EINVAL);
7550	}
7551
7552	if (priv & ~DTRACE_PRIV_ALL) {
7553		cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7554		    "privilege attributes", name);
7555		return (EINVAL);
7556	}
7557
7558	if ((priv & DTRACE_PRIV_KERNEL) &&
7559	    (priv & (DTRACE_PRIV_USER | DTRACE_PRIV_OWNER)) &&
7560	    pops->dtps_usermode == NULL) {
7561		cmn_err(CE_WARN, "failed to register provider '%s': need "
7562		    "dtps_usermode() op for given privilege attributes", name);
7563		return (EINVAL);
7564	}
7565
7566	provider = kmem_zalloc(sizeof (dtrace_provider_t), KM_SLEEP);
7567	provider->dtpv_name = kmem_alloc(strlen(name) + 1, KM_SLEEP);
7568	(void) strcpy(provider->dtpv_name, name);
7569
7570	provider->dtpv_attr = *pap;
7571	provider->dtpv_priv.dtpp_flags = priv;
7572	if (cr != NULL) {
7573		provider->dtpv_priv.dtpp_uid = crgetuid(cr);
7574		provider->dtpv_priv.dtpp_zoneid = crgetzoneid(cr);
7575	}
7576	provider->dtpv_pops = *pops;
7577
7578	if (pops->dtps_provide == NULL) {
7579		ASSERT(pops->dtps_provide_module != NULL);
7580		provider->dtpv_pops.dtps_provide =
7581		    (void (*)(void *, dtrace_probedesc_t *))dtrace_nullop;
7582	}
7583
7584	if (pops->dtps_provide_module == NULL) {
7585		ASSERT(pops->dtps_provide != NULL);
7586		provider->dtpv_pops.dtps_provide_module =
7587		    (void (*)(void *, modctl_t *))dtrace_nullop;
7588	}
7589
7590	if (pops->dtps_suspend == NULL) {
7591		ASSERT(pops->dtps_resume == NULL);
7592		provider->dtpv_pops.dtps_suspend =
7593		    (void (*)(void *, dtrace_id_t, void *))dtrace_nullop;
7594		provider->dtpv_pops.dtps_resume =
7595		    (void (*)(void *, dtrace_id_t, void *))dtrace_nullop;
7596	}
7597
7598	provider->dtpv_arg = arg;
7599	*idp = (dtrace_provider_id_t)provider;
7600
7601	if (pops == &dtrace_provider_ops) {
7602		ASSERT(MUTEX_HELD(&dtrace_provider_lock));
7603		ASSERT(MUTEX_HELD(&dtrace_lock));
7604		ASSERT(dtrace_anon.dta_enabling == NULL);
7605
7606		/*
7607		 * We make sure that the DTrace provider is at the head of
7608		 * the provider chain.
7609		 */
7610		provider->dtpv_next = dtrace_provider;
7611		dtrace_provider = provider;
7612		return (0);
7613	}
7614
7615	mutex_enter(&dtrace_provider_lock);
7616	mutex_enter(&dtrace_lock);
7617
7618	/*
7619	 * If there is at least one provider registered, we'll add this
7620	 * provider after the first provider.
7621	 */
7622	if (dtrace_provider != NULL) {
7623		provider->dtpv_next = dtrace_provider->dtpv_next;
7624		dtrace_provider->dtpv_next = provider;
7625	} else {
7626		dtrace_provider = provider;
7627	}
7628
7629	if (dtrace_retained != NULL) {
7630		dtrace_enabling_provide(provider);
7631
7632		/*
7633		 * Now we need to call dtrace_enabling_matchall() -- which
7634		 * will acquire cpu_lock and dtrace_lock.  We therefore need
7635		 * to drop all of our locks before calling into it...
7636		 */
7637		mutex_exit(&dtrace_lock);
7638		mutex_exit(&dtrace_provider_lock);
7639		dtrace_enabling_matchall();
7640
7641		return (0);
7642	}
7643
7644	mutex_exit(&dtrace_lock);
7645	mutex_exit(&dtrace_provider_lock);
7646
7647	return (0);
7648}
7649
7650/*
7651 * Unregister the specified provider from the DTrace framework.  This should
7652 * generally be called by DTrace providers in their detach(9E) entry point.
7653 */
7654int
7655dtrace_unregister(dtrace_provider_id_t id)
7656{
7657	dtrace_provider_t *old = (dtrace_provider_t *)id;
7658	dtrace_provider_t *prev = NULL;
7659	int i, self = 0, noreap = 0;
7660	dtrace_probe_t *probe, *first = NULL;
7661
7662	if (old->dtpv_pops.dtps_enable ==
7663	    (void (*)(void *, dtrace_id_t, void *))dtrace_nullop) {
7664		/*
7665		 * If DTrace itself is the provider, we're called with locks
7666		 * already held.
7667		 */
7668		ASSERT(old == dtrace_provider);
7669#if defined(sun)
7670		ASSERT(dtrace_devi != NULL);
7671#endif
7672		ASSERT(MUTEX_HELD(&dtrace_provider_lock));
7673		ASSERT(MUTEX_HELD(&dtrace_lock));
7674		self = 1;
7675
7676		if (dtrace_provider->dtpv_next != NULL) {
7677			/*
7678			 * There's another provider here; return failure.
7679			 */
7680			return (EBUSY);
7681		}
7682	} else {
7683		mutex_enter(&dtrace_provider_lock);
7684		mutex_enter(&mod_lock);
7685		mutex_enter(&dtrace_lock);
7686	}
7687
7688	/*
7689	 * If anyone has /dev/dtrace open, or if there are anonymous enabled
7690	 * probes, we refuse to let providers slither away, unless this
7691	 * provider has already been explicitly invalidated.
7692	 */
7693	if (!old->dtpv_defunct &&
7694	    (dtrace_opens || (dtrace_anon.dta_state != NULL &&
7695	    dtrace_anon.dta_state->dts_necbs > 0))) {
7696		if (!self) {
7697			mutex_exit(&dtrace_lock);
7698			mutex_exit(&mod_lock);
7699			mutex_exit(&dtrace_provider_lock);
7700		}
7701		return (EBUSY);
7702	}
7703
7704	/*
7705	 * Attempt to destroy the probes associated with this provider.
7706	 */
7707	for (i = 0; i < dtrace_nprobes; i++) {
7708		if ((probe = dtrace_probes[i]) == NULL)
7709			continue;
7710
7711		if (probe->dtpr_provider != old)
7712			continue;
7713
7714		if (probe->dtpr_ecb == NULL)
7715			continue;
7716
7717		/*
7718		 * If we are trying to unregister a defunct provider, and the
7719		 * provider was made defunct within the interval dictated by
7720		 * dtrace_unregister_defunct_reap, we'll (asynchronously)
7721		 * attempt to reap our enablings.  To denote that the provider
7722		 * should reattempt to unregister itself at some point in the
7723		 * future, we will return a differentiable error code (EAGAIN
7724		 * instead of EBUSY) in this case.
7725		 */
7726		if (dtrace_gethrtime() - old->dtpv_defunct >
7727		    dtrace_unregister_defunct_reap)
7728			noreap = 1;
7729
7730		if (!self) {
7731			mutex_exit(&dtrace_lock);
7732			mutex_exit(&mod_lock);
7733			mutex_exit(&dtrace_provider_lock);
7734		}
7735
7736		if (noreap)
7737			return (EBUSY);
7738
7739		(void) taskq_dispatch(dtrace_taskq,
7740		    (task_func_t *)dtrace_enabling_reap, NULL, TQ_SLEEP);
7741
7742		return (EAGAIN);
7743	}
7744
7745	/*
7746	 * All of the probes for this provider are disabled; we can safely
7747	 * remove all of them from their hash chains and from the probe array.
7748	 */
7749	for (i = 0; i < dtrace_nprobes; i++) {
7750		if ((probe = dtrace_probes[i]) == NULL)
7751			continue;
7752
7753		if (probe->dtpr_provider != old)
7754			continue;
7755
7756		dtrace_probes[i] = NULL;
7757
7758		dtrace_hash_remove(dtrace_bymod, probe);
7759		dtrace_hash_remove(dtrace_byfunc, probe);
7760		dtrace_hash_remove(dtrace_byname, probe);
7761
7762		if (first == NULL) {
7763			first = probe;
7764			probe->dtpr_nextmod = NULL;
7765		} else {
7766			probe->dtpr_nextmod = first;
7767			first = probe;
7768		}
7769	}
7770
7771	/*
7772	 * The provider's probes have been removed from the hash chains and
7773	 * from the probe array.  Now issue a dtrace_sync() to be sure that
7774	 * everyone has cleared out from any probe array processing.
7775	 */
7776	dtrace_sync();
7777
7778	for (probe = first; probe != NULL; probe = first) {
7779		first = probe->dtpr_nextmod;
7780
7781		old->dtpv_pops.dtps_destroy(old->dtpv_arg, probe->dtpr_id,
7782		    probe->dtpr_arg);
7783		kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
7784		kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
7785		kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
7786#if defined(sun)
7787		vmem_free(dtrace_arena, (void *)(uintptr_t)(probe->dtpr_id), 1);
7788#else
7789		free_unr(dtrace_arena, probe->dtpr_id);
7790#endif
7791		kmem_free(probe, sizeof (dtrace_probe_t));
7792	}
7793
7794	if ((prev = dtrace_provider) == old) {
7795#if defined(sun)
7796		ASSERT(self || dtrace_devi == NULL);
7797		ASSERT(old->dtpv_next == NULL || dtrace_devi == NULL);
7798#endif
7799		dtrace_provider = old->dtpv_next;
7800	} else {
7801		while (prev != NULL && prev->dtpv_next != old)
7802			prev = prev->dtpv_next;
7803
7804		if (prev == NULL) {
7805			panic("attempt to unregister non-existent "
7806			    "dtrace provider %p\n", (void *)id);
7807		}
7808
7809		prev->dtpv_next = old->dtpv_next;
7810	}
7811
7812	if (!self) {
7813		mutex_exit(&dtrace_lock);
7814		mutex_exit(&mod_lock);
7815		mutex_exit(&dtrace_provider_lock);
7816	}
7817
7818	kmem_free(old->dtpv_name, strlen(old->dtpv_name) + 1);
7819	kmem_free(old, sizeof (dtrace_provider_t));
7820
7821	return (0);
7822}
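
/*
 * The corresponding detach-time sketch for the hypothetical "foo"
 * provider above:  dtrace_unregister() fails with EBUSY while consumers
 * or anonymous state still exist, and with EAGAIN when a reap of defunct
 * enablings has been dispatched, so detach must be prepared to fail and
 * be retried later:
 *
 *	static int
 *	foo_detach(void)
 *	{
 *		if (dtrace_unregister(foo_id) != 0)
 *			return (DDI_FAILURE);
 *
 *		return (DDI_SUCCESS);
 *	}
 */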
7823
7824/*
7825 * Invalidate the specified provider.  All subsequent probe lookups for the
7826 * specified provider will fail, but its probes will not be removed.
7827 */
7828void
7829dtrace_invalidate(dtrace_provider_id_t id)
7830{
7831	dtrace_provider_t *pvp = (dtrace_provider_t *)id;
7832
7833	ASSERT(pvp->dtpv_pops.dtps_enable !=
7834	    (void (*)(void *, dtrace_id_t, void *))dtrace_nullop);
7835
7836	mutex_enter(&dtrace_provider_lock);
7837	mutex_enter(&dtrace_lock);
7838
7839	pvp->dtpv_defunct = dtrace_gethrtime();
7840
7841	mutex_exit(&dtrace_lock);
7842	mutex_exit(&dtrace_provider_lock);
7843}
7844
7845/*
7846 * Indicate whether or not DTrace has attached.
7847 */
7848int
7849dtrace_attached(void)
7850{
7851	/*
7852	 * dtrace_provider will be non-NULL iff the DTrace driver has
7853	 * attached.  (It's non-NULL because DTrace is always itself a
7854	 * provider.)
7855	 */
7856	return (dtrace_provider != NULL);
7857}
7858
7859/*
7860 * Remove all the unenabled probes for the given provider.  This function is
7861 * not unlike dtrace_unregister(), except that it doesn't remove the provider
7862 * -- just as many of its associated probes as it can.
7863 */
7864int
7865dtrace_condense(dtrace_provider_id_t id)
7866{
7867	dtrace_provider_t *prov = (dtrace_provider_t *)id;
7868	int i;
7869	dtrace_probe_t *probe;
7870
7871	/*
7872	 * Make sure this isn't the dtrace provider itself.
7873	 */
7874	ASSERT(prov->dtpv_pops.dtps_enable !=
7875	    (void (*)(void *, dtrace_id_t, void *))dtrace_nullop);
7876
7877	mutex_enter(&dtrace_provider_lock);
7878	mutex_enter(&dtrace_lock);
7879
7880	/*
7881	 * Attempt to destroy the probes associated with this provider.
7882	 */
7883	for (i = 0; i < dtrace_nprobes; i++) {
7884		if ((probe = dtrace_probes[i]) == NULL)
7885			continue;
7886
7887		if (probe->dtpr_provider != prov)
7888			continue;
7889
7890		if (probe->dtpr_ecb != NULL)
7891			continue;
7892
7893		dtrace_probes[i] = NULL;
7894
7895		dtrace_hash_remove(dtrace_bymod, probe);
7896		dtrace_hash_remove(dtrace_byfunc, probe);
7897		dtrace_hash_remove(dtrace_byname, probe);
7898
7899		prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, i + 1,
7900		    probe->dtpr_arg);
7901		kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
7902		kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
7903		kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
7904		kmem_free(probe, sizeof (dtrace_probe_t));
7905#if defined(sun)
7906		vmem_free(dtrace_arena, (void *)((uintptr_t)i + 1), 1);
7907#else
7908		free_unr(dtrace_arena, i + 1);
7909#endif
7910	}
7911
7912	mutex_exit(&dtrace_lock);
7913	mutex_exit(&dtrace_provider_lock);
7914
7915	return (0);
7916}
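
/*
 * Hypothetical usage:  a provider that creates probes on-the-fly (a pid
 * provider, say) can periodically shed the probes that were never
 * enabled without tearing itself down entirely:
 *
 *	(void) dtrace_condense(foo_id);
 *
 * Unlike dtrace_unregister(), this cannot fail:  probes with ECBs are
 * simply skipped rather than inducing EBUSY.
 */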
7917
7918/*
7919 * DTrace Probe Management Functions
7920 *
7921 * The functions in this section perform the DTrace probe management,
7922 * including functions to create probes, look-up probes, and call into the
7923 * providers to request that probes be provided.  Some of these functions are
7924 * in the Provider-to-Framework API; these functions can be identified by the
7925 * fact that they are not declared "static".
7926 */
7927
7928/*
7929 * Create a probe with the specified module name, function name, and name.
7930 */
7931dtrace_id_t
7932dtrace_probe_create(dtrace_provider_id_t prov, const char *mod,
7933    const char *func, const char *name, int aframes, void *arg)
7934{
7935	dtrace_probe_t *probe, **probes;
7936	dtrace_provider_t *provider = (dtrace_provider_t *)prov;
7937	dtrace_id_t id;
7938
7939	if (provider == dtrace_provider) {
7940		ASSERT(MUTEX_HELD(&dtrace_lock));
7941	} else {
7942		mutex_enter(&dtrace_lock);
7943	}
7944
7945#if defined(sun)
7946	id = (dtrace_id_t)(uintptr_t)vmem_alloc(dtrace_arena, 1,
7947	    VM_BESTFIT | VM_SLEEP);
7948#else
7949	id = alloc_unr(dtrace_arena);
7950#endif
7951	probe = kmem_zalloc(sizeof (dtrace_probe_t), KM_SLEEP);
7952
7953	probe->dtpr_id = id;
7954	probe->dtpr_gen = dtrace_probegen++;
7955	probe->dtpr_mod = dtrace_strdup(mod);
7956	probe->dtpr_func = dtrace_strdup(func);
7957	probe->dtpr_name = dtrace_strdup(name);
7958	probe->dtpr_arg = arg;
7959	probe->dtpr_aframes = aframes;
7960	probe->dtpr_provider = provider;
7961
7962	dtrace_hash_add(dtrace_bymod, probe);
7963	dtrace_hash_add(dtrace_byfunc, probe);
7964	dtrace_hash_add(dtrace_byname, probe);
7965
7966	if (id - 1 >= dtrace_nprobes) {
7967		size_t osize = dtrace_nprobes * sizeof (dtrace_probe_t *);
7968		size_t nsize = osize << 1;
7969
7970		if (nsize == 0) {
7971			ASSERT(osize == 0);
7972			ASSERT(dtrace_probes == NULL);
7973			nsize = sizeof (dtrace_probe_t *);
7974		}
7975
7976		probes = kmem_zalloc(nsize, KM_SLEEP);
7977
7978		if (dtrace_probes == NULL) {
7979			ASSERT(osize == 0);
7980			dtrace_probes = probes;
7981			dtrace_nprobes = 1;
7982		} else {
7983			dtrace_probe_t **oprobes = dtrace_probes;
7984
7985			bcopy(oprobes, probes, osize);
7986			dtrace_membar_producer();
7987			dtrace_probes = probes;
7988
7989			dtrace_sync();
7990
7991			/*
7992			 * All CPUs are now seeing the new probes array; we can
7993			 * safely free the old array.
7994			 */
7995			kmem_free(oprobes, osize);
7996			dtrace_nprobes <<= 1;
7997		}
7998
7999		ASSERT(id - 1 < dtrace_nprobes);
8000	}
8001
8002	ASSERT(dtrace_probes[id - 1] == NULL);
8003	dtrace_probes[id - 1] = probe;
8004
8005	if (provider != dtrace_provider)
8006		mutex_exit(&dtrace_lock);
8007
8008	return (id);
8009}
8010
8011static dtrace_probe_t *
8012dtrace_probe_lookup_id(dtrace_id_t id)
8013{
8014	ASSERT(MUTEX_HELD(&dtrace_lock));
8015
8016	if (id == 0 || id > dtrace_nprobes)
8017		return (NULL);
8018
8019	return (dtrace_probes[id - 1]);
8020}
8021
8022static int
8023dtrace_probe_lookup_match(dtrace_probe_t *probe, void *arg)
8024{
8025	*((dtrace_id_t *)arg) = probe->dtpr_id;
8026
8027	return (DTRACE_MATCH_DONE);
8028}
8029
8030/*
8031 * Look up a probe based on provider and one or more of module name, function
8032 * name and probe name.
8033 */
8034dtrace_id_t
8035dtrace_probe_lookup(dtrace_provider_id_t prid, char *mod,
8036    char *func, char *name)
8037{
8038	dtrace_probekey_t pkey;
8039	dtrace_id_t id;
8040	int match;
8041
8042	pkey.dtpk_prov = ((dtrace_provider_t *)prid)->dtpv_name;
8043	pkey.dtpk_pmatch = &dtrace_match_string;
8044	pkey.dtpk_mod = mod;
8045	pkey.dtpk_mmatch = mod ? &dtrace_match_string : &dtrace_match_nul;
8046	pkey.dtpk_func = func;
8047	pkey.dtpk_fmatch = func ? &dtrace_match_string : &dtrace_match_nul;
8048	pkey.dtpk_name = name;
8049	pkey.dtpk_nmatch = name ? &dtrace_match_string : &dtrace_match_nul;
8050	pkey.dtpk_id = DTRACE_IDNONE;
8051
8052	mutex_enter(&dtrace_lock);
8053	match = dtrace_match(&pkey, DTRACE_PRIV_ALL, 0, 0,
8054	    dtrace_probe_lookup_match, &id);
8055	mutex_exit(&dtrace_lock);
8056
8057	ASSERT(match == 1 || match == 0);
8058	return (match ? id : 0);
8059}
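
/*
 * Taken together, dtrace_probe_lookup() and dtrace_probe_create() yield
 * the idiomatic dtps_provide() implementation:  look the probe up first
 * so that repeated provide callbacks don't create duplicates.  A sketch,
 * with all "foo" names hypothetical:
 *
 *	static void
 *	foo_provide(void *arg, dtrace_probedesc_t *desc)
 *	{
 *		if (dtrace_probe_lookup(foo_id, "foo", "func", "entry") != 0)
 *			return;
 *
 *		(void) dtrace_probe_create(foo_id, "foo", "func", "entry",
 *		    0, NULL);
 *	}
 */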
8060
8061/*
8062 * Returns the probe argument associated with the specified probe.
8063 */
8064void *
8065dtrace_probe_arg(dtrace_provider_id_t id, dtrace_id_t pid)
8066{
8067	dtrace_probe_t *probe;
8068	void *rval = NULL;
8069
8070	mutex_enter(&dtrace_lock);
8071
8072	if ((probe = dtrace_probe_lookup_id(pid)) != NULL &&
8073	    probe->dtpr_provider == (dtrace_provider_t *)id)
8074		rval = probe->dtpr_arg;
8075
8076	mutex_exit(&dtrace_lock);
8077
8078	return (rval);
8079}
8080
8081/*
8082 * Copy a probe into a probe description.
8083 */
8084static void
8085dtrace_probe_description(const dtrace_probe_t *prp, dtrace_probedesc_t *pdp)
8086{
8087	bzero(pdp, sizeof (dtrace_probedesc_t));
8088	pdp->dtpd_id = prp->dtpr_id;
8089
8090	(void) strncpy(pdp->dtpd_provider,
8091	    prp->dtpr_provider->dtpv_name, DTRACE_PROVNAMELEN - 1);
8092
8093	(void) strncpy(pdp->dtpd_mod, prp->dtpr_mod, DTRACE_MODNAMELEN - 1);
8094	(void) strncpy(pdp->dtpd_func, prp->dtpr_func, DTRACE_FUNCNAMELEN - 1);
8095	(void) strncpy(pdp->dtpd_name, prp->dtpr_name, DTRACE_NAMELEN - 1);
8096}
8097
8098/*
8099 * Called to indicate that a probe -- or probes -- should be provided by a
8100 * specified provider.  If the specified description is NULL, the provider will
8101 * be told to provide all of its probes.  (This is done whenever a new
8102 * consumer comes along, or whenever a retained enabling is to be matched.) If
8103 * the specified description is non-NULL, the provider is given the
8104 * opportunity to dynamically provide the specified probe, allowing providers
8105 * to support the creation of probes on-the-fly.  (So-called _autocreated_
8106 * probes.)  If the provider is NULL, the operations will be applied to all
8107 * providers; if the provider is non-NULL the operations will only be applied
8108 * to the specified provider.  The dtrace_provider_lock must be held, and the
8109 * dtrace_lock must _not_ be held -- the provider's dtps_provide() operation
8110 * will need to grab the dtrace_lock when it reenters the framework through
8111 * dtrace_probe_lookup(), dtrace_probe_create(), etc.
8112 */
8113static void
8114dtrace_probe_provide(dtrace_probedesc_t *desc, dtrace_provider_t *prv)
8115{
8116#if defined(sun)
8117	modctl_t *ctl;
8118#endif
8119	int all = 0;
8120
8121	ASSERT(MUTEX_HELD(&dtrace_provider_lock));
8122
8123	if (prv == NULL) {
8124		all = 1;
8125		prv = dtrace_provider;
8126	}
8127
8128	do {
8129		/*
8130		 * First, call the blanket provide operation.
8131		 */
8132		prv->dtpv_pops.dtps_provide(prv->dtpv_arg, desc);
8133
8134		/*
8135		 * Now call the per-module provide operation.  We will grab
8136		 * mod_lock to prevent the list from being modified.  Note
8137		 * that this also prevents the mod_busy bits from changing.
8138		 * (mod_busy can only be changed with mod_lock held.)
8139		 */
8140		mutex_enter(&mod_lock);
8141
8142#if defined(sun)
8143		ctl = &modules;
8144		do {
8145			if (ctl->mod_busy || ctl->mod_mp == NULL)
8146				continue;
8147
8148			prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
8149
8150		} while ((ctl = ctl->mod_next) != &modules);
8151#endif
8152
8153		mutex_exit(&mod_lock);
8154	} while (all && (prv = prv->dtpv_next) != NULL);
8155}
8156
8157#if defined(sun)
8158/*
8159 * Iterate over each probe, and call the Framework-to-Provider API function
8160 * denoted by offs.
8161 */
8162static void
8163dtrace_probe_foreach(uintptr_t offs)
8164{
8165	dtrace_provider_t *prov;
8166	void (*func)(void *, dtrace_id_t, void *);
8167	dtrace_probe_t *probe;
8168	dtrace_icookie_t cookie;
8169	int i;
8170
8171	/*
8172	 * We disable interrupts to walk through the probe array.  This is
8173	 * safe -- the dtrace_sync() in dtrace_unregister() assures that we
8174	 * won't see stale data.
8175	 */
8176	cookie = dtrace_interrupt_disable();
8177
8178	for (i = 0; i < dtrace_nprobes; i++) {
8179		if ((probe = dtrace_probes[i]) == NULL)
8180			continue;
8181
8182		if (probe->dtpr_ecb == NULL) {
8183			/*
8184			 * This probe isn't enabled -- don't call the function.
8185			 */
8186			continue;
8187		}
8188
8189		prov = probe->dtpr_provider;
8190		func = *((void(**)(void *, dtrace_id_t, void *))
8191		    ((uintptr_t)&prov->dtpv_pops + offs));
8192
8193		func(prov->dtpv_arg, i + 1, probe->dtpr_arg);
8194	}
8195
8196	dtrace_interrupt_enable(cookie);
8197}
8198#endif
8199
8200static int
8201dtrace_probe_enable(dtrace_probedesc_t *desc, dtrace_enabling_t *enab)
8202{
8203	dtrace_probekey_t pkey;
8204	uint32_t priv;
8205	uid_t uid;
8206	zoneid_t zoneid;
8207
8208	ASSERT(MUTEX_HELD(&dtrace_lock));
8209	dtrace_ecb_create_cache = NULL;
8210
8211	if (desc == NULL) {
8212		/*
8213		 * If we're passed a NULL description, we're being asked to
8214		 * create an ECB with a NULL probe.
8215		 */
8216		(void) dtrace_ecb_create_enable(NULL, enab);
8217		return (0);
8218	}
8219
8220	dtrace_probekey(desc, &pkey);
8221	dtrace_cred2priv(enab->dten_vstate->dtvs_state->dts_cred.dcr_cred,
8222	    &priv, &uid, &zoneid);
8223
8224	return (dtrace_match(&pkey, priv, uid, zoneid, dtrace_ecb_create_enable,
8225	    enab));
8226}
8227
8228/*
8229 * DTrace Helper Provider Functions
8230 */
8231static void
8232dtrace_dofattr2attr(dtrace_attribute_t *attr, const dof_attr_t dofattr)
8233{
8234	attr->dtat_name = DOF_ATTR_NAME(dofattr);
8235	attr->dtat_data = DOF_ATTR_DATA(dofattr);
8236	attr->dtat_class = DOF_ATTR_CLASS(dofattr);
8237}
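
/*
 * For reference, a dof_attr_t packs all three stability codes into a
 * single 32-bit word (see the DOF_ATTR* macros in <sys/dtrace.h>); the
 * extraction above is simply the inverse of DOF_ATTR():
 *
 *	dof_attr_t a = DOF_ATTR(DTRACE_STABILITY_EVOLVING,
 *	    DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON);
 *
 *	ASSERT(DOF_ATTR_NAME(a) == DTRACE_STABILITY_EVOLVING);
 */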
8238
8239static void
8240dtrace_dofprov2hprov(dtrace_helper_provdesc_t *hprov,
8241    const dof_provider_t *dofprov, char *strtab)
8242{
8243	hprov->dthpv_provname = strtab + dofprov->dofpv_name;
8244	dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_provider,
8245	    dofprov->dofpv_provattr);
8246	dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_mod,
8247	    dofprov->dofpv_modattr);
8248	dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_func,
8249	    dofprov->dofpv_funcattr);
8250	dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_name,
8251	    dofprov->dofpv_nameattr);
8252	dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_args,
8253	    dofprov->dofpv_argsattr);
8254}
8255
8256static void
8257dtrace_helper_provide_one(dof_helper_t *dhp, dof_sec_t *sec, pid_t pid)
8258{
8259	uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
8260	dof_hdr_t *dof = (dof_hdr_t *)daddr;
8261	dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
8262	dof_provider_t *provider;
8263	dof_probe_t *probe;
8264	uint32_t *off, *enoff;
8265	uint8_t *arg;
8266	char *strtab;
8267	uint_t i, nprobes;
8268	dtrace_helper_provdesc_t dhpv;
8269	dtrace_helper_probedesc_t dhpb;
8270	dtrace_meta_t *meta = dtrace_meta_pid;
8271	dtrace_mops_t *mops = &meta->dtm_mops;
8272	void *parg;
8273
8274	provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
8275	str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
8276	    provider->dofpv_strtab * dof->dofh_secsize);
8277	prb_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
8278	    provider->dofpv_probes * dof->dofh_secsize);
8279	arg_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
8280	    provider->dofpv_prargs * dof->dofh_secsize);
8281	off_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
8282	    provider->dofpv_proffs * dof->dofh_secsize);
8283
8284	strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
8285	off = (uint32_t *)(uintptr_t)(daddr + off_sec->dofs_offset);
8286	arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
8287	enoff = NULL;
8288
8289	/*
8290	 * See dtrace_helper_provider_validate().
8291	 */
8292	if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
8293	    provider->dofpv_prenoffs != DOF_SECT_NONE) {
8294		enoff_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
8295		    provider->dofpv_prenoffs * dof->dofh_secsize);
8296		enoff = (uint32_t *)(uintptr_t)(daddr + enoff_sec->dofs_offset);
8297	}
8298
8299	nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
8300
8301	/*
8302	 * Create the provider.
8303	 */
8304	dtrace_dofprov2hprov(&dhpv, provider, strtab);
8305
8306	if ((parg = mops->dtms_provide_pid(meta->dtm_arg, &dhpv, pid)) == NULL)
8307		return;
8308
8309	meta->dtm_count++;
8310
8311	/*
8312	 * Create the probes.
8313	 */
8314	for (i = 0; i < nprobes; i++) {
8315		probe = (dof_probe_t *)(uintptr_t)(daddr +
8316		    prb_sec->dofs_offset + i * prb_sec->dofs_entsize);
8317
8318		dhpb.dthpb_mod = dhp->dofhp_mod;
8319		dhpb.dthpb_func = strtab + probe->dofpr_func;
8320		dhpb.dthpb_name = strtab + probe->dofpr_name;
8321		dhpb.dthpb_base = probe->dofpr_addr;
8322		dhpb.dthpb_offs = off + probe->dofpr_offidx;
8323		dhpb.dthpb_noffs = probe->dofpr_noffs;
8324		if (enoff != NULL) {
8325			dhpb.dthpb_enoffs = enoff + probe->dofpr_enoffidx;
8326			dhpb.dthpb_nenoffs = probe->dofpr_nenoffs;
8327		} else {
8328			dhpb.dthpb_enoffs = NULL;
8329			dhpb.dthpb_nenoffs = 0;
8330		}
8331		dhpb.dthpb_args = arg + probe->dofpr_argidx;
8332		dhpb.dthpb_nargc = probe->dofpr_nargc;
8333		dhpb.dthpb_xargc = probe->dofpr_xargc;
8334		dhpb.dthpb_ntypes = strtab + probe->dofpr_nargv;
8335		dhpb.dthpb_xtypes = strtab + probe->dofpr_xargv;
8336
8337		mops->dtms_create_probe(meta->dtm_arg, parg, &dhpb);
8338	}
8339}
8340
8341static void
8342dtrace_helper_provide(dof_helper_t *dhp, pid_t pid)
8343{
8344	uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
8345	dof_hdr_t *dof = (dof_hdr_t *)daddr;
8346	int i;
8347
8348	ASSERT(MUTEX_HELD(&dtrace_meta_lock));
8349
8350	for (i = 0; i < dof->dofh_secnum; i++) {
8351		dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
8352		    dof->dofh_secoff + i * dof->dofh_secsize);
8353
8354		if (sec->dofs_type != DOF_SECT_PROVIDER)
8355			continue;
8356
8357		dtrace_helper_provide_one(dhp, sec, pid);
8358	}
8359
8360	/*
8361	 * We may have just created probes, so we must now rematch against
8362	 * any retained enablings.  Note that this call will acquire both
8363	 * cpu_lock and dtrace_lock; the fact that we are holding
8364	 * dtrace_meta_lock now is what defines the ordering with respect to
8365	 * these three locks.
8366	 */
8367	dtrace_enabling_matchall();
8368}
8369
8370static void
8371dtrace_helper_provider_remove_one(dof_helper_t *dhp, dof_sec_t *sec, pid_t pid)
8372{
8373	uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
8374	dof_hdr_t *dof = (dof_hdr_t *)daddr;
8375	dof_sec_t *str_sec;
8376	dof_provider_t *provider;
8377	char *strtab;
8378	dtrace_helper_provdesc_t dhpv;
8379	dtrace_meta_t *meta = dtrace_meta_pid;
8380	dtrace_mops_t *mops = &meta->dtm_mops;
8381
8382	provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
8383	str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
8384	    provider->dofpv_strtab * dof->dofh_secsize);
8385
8386	strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
8387
8388	/*
8389	 * Create the provider.
8390	 */
8391	dtrace_dofprov2hprov(&dhpv, provider, strtab);
8392
8393	mops->dtms_remove_pid(meta->dtm_arg, &dhpv, pid);
8394
8395	meta->dtm_count--;
8396}
8397
8398static void
8399dtrace_helper_provider_remove(dof_helper_t *dhp, pid_t pid)
8400{
8401	uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
8402	dof_hdr_t *dof = (dof_hdr_t *)daddr;
8403	int i;
8404
8405	ASSERT(MUTEX_HELD(&dtrace_meta_lock));
8406
8407	for (i = 0; i < dof->dofh_secnum; i++) {
8408		dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
8409		    dof->dofh_secoff + i * dof->dofh_secsize);
8410
8411		if (sec->dofs_type != DOF_SECT_PROVIDER)
8412			continue;
8413
8414		dtrace_helper_provider_remove_one(dhp, sec, pid);
8415	}
8416}
8417
8418/*
8419 * DTrace Meta Provider-to-Framework API Functions
8420 *
8421 * These functions implement the Meta Provider-to-Framework API, as described
8422 * in <sys/dtrace.h>.
8423 */
8424int
8425dtrace_meta_register(const char *name, const dtrace_mops_t *mops, void *arg,
8426    dtrace_meta_provider_id_t *idp)
8427{
8428	dtrace_meta_t *meta;
8429	dtrace_helpers_t *help, *next;
8430	int i;
8431
8432	*idp = DTRACE_METAPROVNONE;
8433
8434	/*
8435	 * We strictly don't need the name, but we hold onto it for
8436	 * debuggability. All hail error queues!
8437	 */
8438	if (name == NULL) {
8439		cmn_err(CE_WARN, "failed to register meta-provider: "
8440		    "invalid name");
8441		return (EINVAL);
8442	}
8443
8444	if (mops == NULL ||
8445	    mops->dtms_create_probe == NULL ||
8446	    mops->dtms_provide_pid == NULL ||
8447	    mops->dtms_remove_pid == NULL) {
8448		cmn_err(CE_WARN, "failed to register meta-register %s: "
8449		    "invalid ops", name);
8450		return (EINVAL);
8451	}
8452
8453	meta = kmem_zalloc(sizeof (dtrace_meta_t), KM_SLEEP);
8454	meta->dtm_mops = *mops;
8455	meta->dtm_name = kmem_alloc(strlen(name) + 1, KM_SLEEP);
8456	(void) strcpy(meta->dtm_name, name);
8457	meta->dtm_arg = arg;
8458
8459	mutex_enter(&dtrace_meta_lock);
8460	mutex_enter(&dtrace_lock);
8461
8462	if (dtrace_meta_pid != NULL) {
8463		mutex_exit(&dtrace_lock);
8464		mutex_exit(&dtrace_meta_lock);
8465		cmn_err(CE_WARN, "failed to register meta-register %s: "
8466		    "user-land meta-provider exists", name);
8467		kmem_free(meta->dtm_name, strlen(meta->dtm_name) + 1);
8468		kmem_free(meta, sizeof (dtrace_meta_t));
8469		return (EINVAL);
8470	}
8471
8472	dtrace_meta_pid = meta;
8473	*idp = (dtrace_meta_provider_id_t)meta;
8474
8475	/*
8476	 * If there are providers and probes ready to go, pass them
8477	 * off to the new meta provider now.
8478	 */
8479
8480	help = dtrace_deferred_pid;
8481	dtrace_deferred_pid = NULL;
8482
8483	mutex_exit(&dtrace_lock);
8484
8485	while (help != NULL) {
8486		for (i = 0; i < help->dthps_nprovs; i++) {
8487			dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
8488			    help->dthps_pid);
8489		}
8490
8491		next = help->dthps_next;
8492		help->dthps_next = NULL;
8493		help->dthps_prev = NULL;
8494		help->dthps_deferred = 0;
8495		help = next;
8496	}
8497
8498	mutex_exit(&dtrace_meta_lock);
8499
8500	return (0);
8501}
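
/*
 * The canonical consumer of this interface is fasttrap.  A sketch of the
 * registration it performs, with the "ft" names hypothetical and the
 * member order annotated rather than assumed:
 *
 *	static dtrace_mops_t ft_mops = {
 *		ft_create_probe,	(dtms_create_probe)
 *		ft_provide_pid,		(dtms_provide_pid)
 *		ft_remove_pid		(dtms_remove_pid)
 *	};
 *
 *	static dtrace_meta_provider_id_t ft_meta_id;
 *
 *	if (dtrace_meta_register("fasttrap", &ft_mops, NULL,
 *	    &ft_meta_id) != 0)
 *		return (DDI_FAILURE);
 */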
8502
8503int
8504dtrace_meta_unregister(dtrace_meta_provider_id_t id)
8505{
8506	dtrace_meta_t **pp, *old = (dtrace_meta_t *)id;
8507
8508	mutex_enter(&dtrace_meta_lock);
8509	mutex_enter(&dtrace_lock);
8510
8511	if (old == dtrace_meta_pid) {
8512		pp = &dtrace_meta_pid;
8513	} else {
8514		panic("attempt to unregister non-existent "
8515		    "dtrace meta-provider %p\n", (void *)old);
8516	}
8517
8518	if (old->dtm_count != 0) {
8519		mutex_exit(&dtrace_lock);
8520		mutex_exit(&dtrace_meta_lock);
8521		return (EBUSY);
8522	}
8523
8524	*pp = NULL;
8525
8526	mutex_exit(&dtrace_lock);
8527	mutex_exit(&dtrace_meta_lock);
8528
8529	kmem_free(old->dtm_name, strlen(old->dtm_name) + 1);
8530	kmem_free(old, sizeof (dtrace_meta_t));
8531
8532	return (0);
8533}
8534
8535
8536/*
8537 * DTrace DIF Object Functions
8538 */
8539static int
8540dtrace_difo_err(uint_t pc, const char *format, ...)
8541{
8542	if (dtrace_err_verbose) {
8543		va_list alist;
8544
8545		(void) uprintf("dtrace DIF object error: [%u]: ", pc);
8546		va_start(alist, format);
8547		(void) vuprintf(format, alist);
8548		va_end(alist);
8549	}
8550
8551#ifdef DTRACE_ERRDEBUG
8552	dtrace_errdebug(format);
8553#endif
8554	return (1);
8555}
8556
8557/*
8558 * Validate a DTrace DIF object by checking the IR instructions.  The following
8559 * rules are currently enforced by dtrace_difo_validate():
8560 *
8561 * 1. Each instruction must have a valid opcode
8562 * 2. Each register, string, variable, or subroutine reference must be valid
8563 * 3. No instruction can modify register %r0 (must be zero)
8564 * 4. All instruction reserved bits must be set to zero
8565 * 5. The last instruction must be a "ret" instruction
8566 * 6. All branch targets must reference a valid instruction _after_ the branch
8567 */
8568static int
8569dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs,
8570    cred_t *cr)
8571{
8572	int err = 0, i;
8573	int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
8574	int kcheckload;
8575	uint_t pc;
8576
8577	kcheckload = cr == NULL ||
8578	    (vstate->dtvs_state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) == 0;
8579
8580	dp->dtdo_destructive = 0;
8581
8582	for (pc = 0; pc < dp->dtdo_len && err == 0; pc++) {
8583		dif_instr_t instr = dp->dtdo_buf[pc];
8584
8585		uint_t r1 = DIF_INSTR_R1(instr);
8586		uint_t r2 = DIF_INSTR_R2(instr);
8587		uint_t rd = DIF_INSTR_RD(instr);
8588		uint_t rs = DIF_INSTR_RS(instr);
8589		uint_t label = DIF_INSTR_LABEL(instr);
8590		uint_t v = DIF_INSTR_VAR(instr);
8591		uint_t subr = DIF_INSTR_SUBR(instr);
8592		uint_t type = DIF_INSTR_TYPE(instr);
8593		uint_t op = DIF_INSTR_OP(instr);
8594
8595		switch (op) {
8596		case DIF_OP_OR:
8597		case DIF_OP_XOR:
8598		case DIF_OP_AND:
8599		case DIF_OP_SLL:
8600		case DIF_OP_SRL:
8601		case DIF_OP_SRA:
8602		case DIF_OP_SUB:
8603		case DIF_OP_ADD:
8604		case DIF_OP_MUL:
8605		case DIF_OP_SDIV:
8606		case DIF_OP_UDIV:
8607		case DIF_OP_SREM:
8608		case DIF_OP_UREM:
8609		case DIF_OP_COPYS:
8610			if (r1 >= nregs)
8611				err += efunc(pc, "invalid register %u\n", r1);
8612			if (r2 >= nregs)
8613				err += efunc(pc, "invalid register %u\n", r2);
8614			if (rd >= nregs)
8615				err += efunc(pc, "invalid register %u\n", rd);
8616			if (rd == 0)
8617				err += efunc(pc, "cannot write to %r0\n");
8618			break;
8619		case DIF_OP_NOT:
8620		case DIF_OP_MOV:
8621		case DIF_OP_ALLOCS:
8622			if (r1 >= nregs)
8623				err += efunc(pc, "invalid register %u\n", r1);
8624			if (r2 != 0)
8625				err += efunc(pc, "non-zero reserved bits\n");
8626			if (rd >= nregs)
8627				err += efunc(pc, "invalid register %u\n", rd);
8628			if (rd == 0)
8629				err += efunc(pc, "cannot write to %r0\n");
8630			break;
8631		case DIF_OP_LDSB:
8632		case DIF_OP_LDSH:
8633		case DIF_OP_LDSW:
8634		case DIF_OP_LDUB:
8635		case DIF_OP_LDUH:
8636		case DIF_OP_LDUW:
8637		case DIF_OP_LDX:
8638			if (r1 >= nregs)
8639				err += efunc(pc, "invalid register %u\n", r1);
8640			if (r2 != 0)
8641				err += efunc(pc, "non-zero reserved bits\n");
8642			if (rd >= nregs)
8643				err += efunc(pc, "invalid register %u\n", rd);
8644			if (rd == 0)
8645				err += efunc(pc, "cannot write to %r0\n");
8646			if (kcheckload)
8647				dp->dtdo_buf[pc] = DIF_INSTR_LOAD(op +
8648				    DIF_OP_RLDSB - DIF_OP_LDSB, r1, rd);
8649			break;
8650		case DIF_OP_RLDSB:
8651		case DIF_OP_RLDSH:
8652		case DIF_OP_RLDSW:
8653		case DIF_OP_RLDUB:
8654		case DIF_OP_RLDUH:
8655		case DIF_OP_RLDUW:
8656		case DIF_OP_RLDX:
8657			if (r1 >= nregs)
8658				err += efunc(pc, "invalid register %u\n", r1);
8659			if (r2 != 0)
8660				err += efunc(pc, "non-zero reserved bits\n");
8661			if (rd >= nregs)
8662				err += efunc(pc, "invalid register %u\n", rd);
8663			if (rd == 0)
8664				err += efunc(pc, "cannot write to %r0\n");
8665			break;
8666		case DIF_OP_ULDSB:
8667		case DIF_OP_ULDSH:
8668		case DIF_OP_ULDSW:
8669		case DIF_OP_ULDUB:
8670		case DIF_OP_ULDUH:
8671		case DIF_OP_ULDUW:
8672		case DIF_OP_ULDX:
8673			if (r1 >= nregs)
8674				err += efunc(pc, "invalid register %u\n", r1);
8675			if (r2 != 0)
8676				err += efunc(pc, "non-zero reserved bits\n");
8677			if (rd >= nregs)
8678				err += efunc(pc, "invalid register %u\n", rd);
8679			if (rd == 0)
8680				err += efunc(pc, "cannot write to %r0\n");
8681			break;
8682		case DIF_OP_STB:
8683		case DIF_OP_STH:
8684		case DIF_OP_STW:
8685		case DIF_OP_STX:
8686			if (r1 >= nregs)
8687				err += efunc(pc, "invalid register %u\n", r1);
8688			if (r2 != 0)
8689				err += efunc(pc, "non-zero reserved bits\n");
8690			if (rd >= nregs)
8691				err += efunc(pc, "invalid register %u\n", rd);
8692			if (rd == 0)
8693				err += efunc(pc, "cannot write to 0 address\n");
8694			break;
8695		case DIF_OP_CMP:
8696		case DIF_OP_SCMP:
8697			if (r1 >= nregs)
8698				err += efunc(pc, "invalid register %u\n", r1);
8699			if (r2 >= nregs)
8700				err += efunc(pc, "invalid register %u\n", r2);
8701			if (rd != 0)
8702				err += efunc(pc, "non-zero reserved bits\n");
8703			break;
8704		case DIF_OP_TST:
8705			if (r1 >= nregs)
8706				err += efunc(pc, "invalid register %u\n", r1);
8707			if (r2 != 0 || rd != 0)
8708				err += efunc(pc, "non-zero reserved bits\n");
8709			break;
8710		case DIF_OP_BA:
8711		case DIF_OP_BE:
8712		case DIF_OP_BNE:
8713		case DIF_OP_BG:
8714		case DIF_OP_BGU:
8715		case DIF_OP_BGE:
8716		case DIF_OP_BGEU:
8717		case DIF_OP_BL:
8718		case DIF_OP_BLU:
8719		case DIF_OP_BLE:
8720		case DIF_OP_BLEU:
8721			if (label >= dp->dtdo_len) {
8722				err += efunc(pc, "invalid branch target %u\n",
8723				    label);
8724			}
8725			if (label <= pc) {
8726				err += efunc(pc, "backward branch to %u\n",
8727				    label);
8728			}
8729			break;
8730		case DIF_OP_RET:
8731			if (r1 != 0 || r2 != 0)
8732				err += efunc(pc, "non-zero reserved bits\n");
8733			if (rd >= nregs)
8734				err += efunc(pc, "invalid register %u\n", rd);
8735			break;
8736		case DIF_OP_NOP:
8737		case DIF_OP_POPTS:
8738		case DIF_OP_FLUSHTS:
8739			if (r1 != 0 || r2 != 0 || rd != 0)
8740				err += efunc(pc, "non-zero reserved bits\n");
8741			break;
8742		case DIF_OP_SETX:
8743			if (DIF_INSTR_INTEGER(instr) >= dp->dtdo_intlen) {
8744				err += efunc(pc, "invalid integer ref %u\n",
8745				    DIF_INSTR_INTEGER(instr));
8746			}
8747			if (rd >= nregs)
8748				err += efunc(pc, "invalid register %u\n", rd);
8749			if (rd == 0)
8750				err += efunc(pc, "cannot write to %r0\n");
8751			break;
8752		case DIF_OP_SETS:
8753			if (DIF_INSTR_STRING(instr) >= dp->dtdo_strlen) {
8754				err += efunc(pc, "invalid string ref %u\n",
8755				    DIF_INSTR_STRING(instr));
8756			}
8757			if (rd >= nregs)
8758				err += efunc(pc, "invalid register %u\n", rd);
8759			if (rd == 0)
8760				err += efunc(pc, "cannot write to %r0\n");
8761			break;
8762		case DIF_OP_LDGA:
8763		case DIF_OP_LDTA:
8764			if (r1 > DIF_VAR_ARRAY_MAX)
8765				err += efunc(pc, "invalid array %u\n", r1);
8766			if (r2 >= nregs)
8767				err += efunc(pc, "invalid register %u\n", r2);
8768			if (rd >= nregs)
8769				err += efunc(pc, "invalid register %u\n", rd);
8770			if (rd == 0)
8771				err += efunc(pc, "cannot write to %r0\n");
8772			break;
8773		case DIF_OP_LDGS:
8774		case DIF_OP_LDTS:
8775		case DIF_OP_LDLS:
8776		case DIF_OP_LDGAA:
8777		case DIF_OP_LDTAA:
8778			if (v < DIF_VAR_OTHER_MIN || v > DIF_VAR_OTHER_MAX)
8779				err += efunc(pc, "invalid variable %u\n", v);
8780			if (rd >= nregs)
8781				err += efunc(pc, "invalid register %u\n", rd);
8782			if (rd == 0)
8783				err += efunc(pc, "cannot write to %r0\n");
8784			break;
8785		case DIF_OP_STGS:
8786		case DIF_OP_STTS:
8787		case DIF_OP_STLS:
8788		case DIF_OP_STGAA:
8789		case DIF_OP_STTAA:
8790			if (v < DIF_VAR_OTHER_UBASE || v > DIF_VAR_OTHER_MAX)
8791				err += efunc(pc, "invalid variable %u\n", v);
8792			if (rs >= nregs)
8793				err += efunc(pc, "invalid register %u\n", rd);
8794			break;
8795		case DIF_OP_CALL:
8796			if (subr > DIF_SUBR_MAX)
8797				err += efunc(pc, "invalid subr %u\n", subr);
8798			if (rd >= nregs)
8799				err += efunc(pc, "invalid register %u\n", rd);
8800			if (rd == 0)
8801				err += efunc(pc, "cannot write to %r0\n");
8802
8803			if (subr == DIF_SUBR_COPYOUT ||
8804			    subr == DIF_SUBR_COPYOUTSTR) {
8805				dp->dtdo_destructive = 1;
8806			}
8807			break;
8808		case DIF_OP_PUSHTR:
8809			if (type != DIF_TYPE_STRING && type != DIF_TYPE_CTF)
8810				err += efunc(pc, "invalid ref type %u\n", type);
8811			if (r2 >= nregs)
8812				err += efunc(pc, "invalid register %u\n", r2);
8813			if (rs >= nregs)
8814				err += efunc(pc, "invalid register %u\n", rs);
8815			break;
8816		case DIF_OP_PUSHTV:
8817			if (type != DIF_TYPE_CTF)
8818				err += efunc(pc, "invalid val type %u\n", type);
8819			if (r2 >= nregs)
8820				err += efunc(pc, "invalid register %u\n", r2);
8821			if (rs >= nregs)
8822				err += efunc(pc, "invalid register %u\n", rs);
8823			break;
8824		default:
8825			err += efunc(pc, "invalid opcode %u\n",
8826			    DIF_INSTR_OP(instr));
8827		}
8828	}
8829
8830	if (dp->dtdo_len != 0 &&
8831	    DIF_INSTR_OP(dp->dtdo_buf[dp->dtdo_len - 1]) != DIF_OP_RET) {
8832		err += efunc(dp->dtdo_len - 1,
8833		    "expected 'ret' as last DIF instruction\n");
8834	}
8835
8836	if (!(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF)) {
8837		/*
8838		 * If we're not returning by reference, the size must be either
8839		 * 0 or the size of one of the base types.
8840		 */
8841		switch (dp->dtdo_rtype.dtdt_size) {
8842		case 0:
8843		case sizeof (uint8_t):
8844		case sizeof (uint16_t):
8845		case sizeof (uint32_t):
8846		case sizeof (uint64_t):
8847			break;
8848
8849		default:
8850			err += efunc(dp->dtdo_len - 1, "bad return size\n");
8851		}
8852	}
8853
8854	for (i = 0; i < dp->dtdo_varlen && err == 0; i++) {
8855		dtrace_difv_t *v = &dp->dtdo_vartab[i], *existing = NULL;
8856		dtrace_diftype_t *vt, *et;
8857		uint_t id, ndx;
8858
8859		if (v->dtdv_scope != DIFV_SCOPE_GLOBAL &&
8860		    v->dtdv_scope != DIFV_SCOPE_THREAD &&
8861		    v->dtdv_scope != DIFV_SCOPE_LOCAL) {
8862			err += efunc(i, "unrecognized variable scope %d\n",
8863			    v->dtdv_scope);
8864			break;
8865		}
8866
8867		if (v->dtdv_kind != DIFV_KIND_ARRAY &&
8868		    v->dtdv_kind != DIFV_KIND_SCALAR) {
8869			err += efunc(i, "unrecognized variable type %d\n",
8870			    v->dtdv_kind);
8871			break;
8872		}
8873
8874		if ((id = v->dtdv_id) > DIF_VARIABLE_MAX) {
8875			err += efunc(i, "%d exceeds variable id limit\n", id);
8876			break;
8877		}
8878
8879		if (id < DIF_VAR_OTHER_UBASE)
8880			continue;
8881
8882		/*
8883		 * For user-defined variables, we need to check that this
8884		 * definition is identical to any previous definition that we
8885		 * encountered.
8886		 */
8887		ndx = id - DIF_VAR_OTHER_UBASE;
8888
8889		switch (v->dtdv_scope) {
8890		case DIFV_SCOPE_GLOBAL:
8891			if (ndx < vstate->dtvs_nglobals) {
8892				dtrace_statvar_t *svar;
8893
8894				if ((svar = vstate->dtvs_globals[ndx]) != NULL)
8895					existing = &svar->dtsv_var;
8896			}
8897
8898			break;
8899
8900		case DIFV_SCOPE_THREAD:
8901			if (ndx < vstate->dtvs_ntlocals)
8902				existing = &vstate->dtvs_tlocals[ndx];
8903			break;
8904
8905		case DIFV_SCOPE_LOCAL:
8906			if (ndx < vstate->dtvs_nlocals) {
8907				dtrace_statvar_t *svar;
8908
8909				if ((svar = vstate->dtvs_locals[ndx]) != NULL)
8910					existing = &svar->dtsv_var;
8911			}
8912
8913			break;
8914		}
8915
8916		vt = &v->dtdv_type;
8917
8918		if (vt->dtdt_flags & DIF_TF_BYREF) {
8919			if (vt->dtdt_size == 0) {
8920				err += efunc(i, "zero-sized variable\n");
8921				break;
8922			}
8923
8924			if (v->dtdv_scope == DIFV_SCOPE_GLOBAL &&
8925			    vt->dtdt_size > dtrace_global_maxsize) {
8926				err += efunc(i, "oversized by-ref global\n");
8927				break;
8928			}
8929		}
8930
8931		if (existing == NULL || existing->dtdv_id == 0)
8932			continue;
8933
8934		ASSERT(existing->dtdv_id == v->dtdv_id);
8935		ASSERT(existing->dtdv_scope == v->dtdv_scope);
8936
8937		if (existing->dtdv_kind != v->dtdv_kind)
8938			err += efunc(i, "%d changed variable kind\n", id);
8939
8940		et = &existing->dtdv_type;
8941
8942		if (vt->dtdt_flags != et->dtdt_flags) {
8943			err += efunc(i, "%d changed variable type flags\n", id);
8944			break;
8945		}
8946
8947		if (vt->dtdt_size != 0 && vt->dtdt_size != et->dtdt_size) {
8948			err += efunc(i, "%d changed variable type size\n", id);
8949			break;
8950		}
8951	}
8952
8953	return (err);
8954}
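
/*
 * As an illustration of rule 6 above, the following hand-written DIF
 * (shown with illustrative mnemonics; no compiler emits this) would be
 * rejected with "backward branch":  the "ba" at pc 1 targets pc 0, which
 * would otherwise permit an unbounded loop:
 *
 *	0:	ldgs DIF_VAR_PID, %r1
 *	1:	ba   0
 *	2:	ret  %r1
 *
 * Forward-only branches are what guarantee that every DIFO terminates.
 */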
8955
8956/*
8957 * Validate a DTrace DIF object that is to be used as a helper.  Helpers
8958 * are much more constrained than normal DIFOs.  Specifically, they may
8959 * not:
8960 *
8961 * 1. Make calls to subroutines other than copyin(), copyinstr() or
8962 *    miscellaneous string routines
8963 * 2. Access DTrace variables other than the args[] array and the curthread,
8964 *    pid, ppid, tid, execargs, execname, zonename, uid and gid variables.
8965 * 3. Have thread-local variables.
8966 * 4. Have dynamic variables.
8967 */
8968static int
8969dtrace_difo_validate_helper(dtrace_difo_t *dp)
8970{
8971	int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
8972	int err = 0;
8973	uint_t pc;
8974
8975	for (pc = 0; pc < dp->dtdo_len; pc++) {
8976		dif_instr_t instr = dp->dtdo_buf[pc];
8977
8978		uint_t v = DIF_INSTR_VAR(instr);
8979		uint_t subr = DIF_INSTR_SUBR(instr);
8980		uint_t op = DIF_INSTR_OP(instr);
8981
8982		switch (op) {
8983		case DIF_OP_OR:
8984		case DIF_OP_XOR:
8985		case DIF_OP_AND:
8986		case DIF_OP_SLL:
8987		case DIF_OP_SRL:
8988		case DIF_OP_SRA:
8989		case DIF_OP_SUB:
8990		case DIF_OP_ADD:
8991		case DIF_OP_MUL:
8992		case DIF_OP_SDIV:
8993		case DIF_OP_UDIV:
8994		case DIF_OP_SREM:
8995		case DIF_OP_UREM:
8996		case DIF_OP_COPYS:
8997		case DIF_OP_NOT:
8998		case DIF_OP_MOV:
8999		case DIF_OP_RLDSB:
9000		case DIF_OP_RLDSH:
9001		case DIF_OP_RLDSW:
9002		case DIF_OP_RLDUB:
9003		case DIF_OP_RLDUH:
9004		case DIF_OP_RLDUW:
9005		case DIF_OP_RLDX:
9006		case DIF_OP_ULDSB:
9007		case DIF_OP_ULDSH:
9008		case DIF_OP_ULDSW:
9009		case DIF_OP_ULDUB:
9010		case DIF_OP_ULDUH:
9011		case DIF_OP_ULDUW:
9012		case DIF_OP_ULDX:
9013		case DIF_OP_STB:
9014		case DIF_OP_STH:
9015		case DIF_OP_STW:
9016		case DIF_OP_STX:
9017		case DIF_OP_ALLOCS:
9018		case DIF_OP_CMP:
9019		case DIF_OP_SCMP:
9020		case DIF_OP_TST:
9021		case DIF_OP_BA:
9022		case DIF_OP_BE:
9023		case DIF_OP_BNE:
9024		case DIF_OP_BG:
9025		case DIF_OP_BGU:
9026		case DIF_OP_BGE:
9027		case DIF_OP_BGEU:
9028		case DIF_OP_BL:
9029		case DIF_OP_BLU:
9030		case DIF_OP_BLE:
9031		case DIF_OP_BLEU:
9032		case DIF_OP_RET:
9033		case DIF_OP_NOP:
9034		case DIF_OP_POPTS:
9035		case DIF_OP_FLUSHTS:
9036		case DIF_OP_SETX:
9037		case DIF_OP_SETS:
9038		case DIF_OP_LDGA:
9039		case DIF_OP_LDLS:
9040		case DIF_OP_STGS:
9041		case DIF_OP_STLS:
9042		case DIF_OP_PUSHTR:
9043		case DIF_OP_PUSHTV:
9044			break;
9045
9046		case DIF_OP_LDGS:
9047			if (v >= DIF_VAR_OTHER_UBASE)
9048				break;
9049
9050			if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9)
9051				break;
9052
9053			if (v == DIF_VAR_CURTHREAD || v == DIF_VAR_PID ||
9054			    v == DIF_VAR_PPID || v == DIF_VAR_TID ||
9055			    v == DIF_VAR_EXECARGS ||
9056			    v == DIF_VAR_EXECNAME || v == DIF_VAR_ZONENAME ||
9057			    v == DIF_VAR_UID || v == DIF_VAR_GID)
9058				break;
9059
9060			err += efunc(pc, "illegal variable %u\n", v);
9061			break;
9062
9063		case DIF_OP_LDTA:
9064		case DIF_OP_LDTS:
9065		case DIF_OP_LDGAA:
9066		case DIF_OP_LDTAA:
9067			err += efunc(pc, "illegal dynamic variable load\n");
9068			break;
9069
9070		case DIF_OP_STTS:
9071		case DIF_OP_STGAA:
9072		case DIF_OP_STTAA:
9073			err += efunc(pc, "illegal dynamic variable store\n");
9074			break;
9075
9076		case DIF_OP_CALL:
9077			if (subr == DIF_SUBR_ALLOCA ||
9078			    subr == DIF_SUBR_BCOPY ||
9079			    subr == DIF_SUBR_COPYIN ||
9080			    subr == DIF_SUBR_COPYINTO ||
9081			    subr == DIF_SUBR_COPYINSTR ||
9082			    subr == DIF_SUBR_INDEX ||
9083			    subr == DIF_SUBR_INET_NTOA ||
9084			    subr == DIF_SUBR_INET_NTOA6 ||
9085			    subr == DIF_SUBR_INET_NTOP ||
9086			    subr == DIF_SUBR_LLTOSTR ||
9087			    subr == DIF_SUBR_RINDEX ||
9088			    subr == DIF_SUBR_STRCHR ||
9089			    subr == DIF_SUBR_STRJOIN ||
9090			    subr == DIF_SUBR_STRRCHR ||
9091			    subr == DIF_SUBR_STRSTR ||
9092			    subr == DIF_SUBR_HTONS ||
9093			    subr == DIF_SUBR_HTONL ||
9094			    subr == DIF_SUBR_HTONLL ||
9095			    subr == DIF_SUBR_NTOHS ||
9096			    subr == DIF_SUBR_NTOHL ||
9097			    subr == DIF_SUBR_NTOHLL ||
9098			    subr == DIF_SUBR_MEMREF ||
9099			    subr == DIF_SUBR_TYPEREF)
9100				break;
9101
9102			err += efunc(pc, "invalid subr %u\n", subr);
9103			break;
9104
9105		default:
9106			err += efunc(pc, "invalid opcode %u\n",
9107			    DIF_INSTR_OP(instr));
9108		}
9109	}
9110
9111	return (err);
9112}
9113
9114/*
9115 * Returns 1 if the expression in the DIF object can be cached on a per-thread
9116 * basis; 0 if not.
9117 */
9118static int
9119dtrace_difo_cacheable(dtrace_difo_t *dp)
9120{
9121	int i;
9122
9123	if (dp == NULL)
9124		return (0);
9125
9126	for (i = 0; i < dp->dtdo_varlen; i++) {
9127		dtrace_difv_t *v = &dp->dtdo_vartab[i];
9128
9129		if (v->dtdv_scope != DIFV_SCOPE_GLOBAL)
9130			continue;
9131
9132		switch (v->dtdv_id) {
9133		case DIF_VAR_CURTHREAD:
9134		case DIF_VAR_PID:
9135		case DIF_VAR_TID:
9136		case DIF_VAR_EXECARGS:
9137		case DIF_VAR_EXECNAME:
9138		case DIF_VAR_ZONENAME:
9139			break;
9140
9141		default:
9142			return (0);
9143		}
9144	}
9145
9146	/*
9147	 * This DIF object may be cacheable.  Now we need to look for any
9148	 * array loading instructions, any memory loading instructions, or
9149	 * any stores to thread-local variables.
9150	 */
9151	for (i = 0; i < dp->dtdo_len; i++) {
9152		uint_t op = DIF_INSTR_OP(dp->dtdo_buf[i]);
9153
9154		if ((op >= DIF_OP_LDSB && op <= DIF_OP_LDX) ||
9155		    (op >= DIF_OP_ULDSB && op <= DIF_OP_ULDX) ||
9156		    (op >= DIF_OP_RLDSB && op <= DIF_OP_RLDX) ||
9157		    op == DIF_OP_LDGA || op == DIF_OP_STTS)
9158			return (0);
9159	}
9160
9161	return (1);
9162}
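
/*
 * Intuition, by example:  a predicate like /pid == 30931/ depends only on
 * a variable (DIF_VAR_PID) that cannot change over a thread's lifetime,
 * so its result may be cached per-thread.  By contrast, /timestamp > 0/
 * -- or anything that loads memory -- may evaluate differently on every
 * firing, and so returns 0 here.
 */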
9163
9164static void
9165dtrace_difo_hold(dtrace_difo_t *dp)
9166{
9167	int i;
9168
9169	ASSERT(MUTEX_HELD(&dtrace_lock));
9170
9171	dp->dtdo_refcnt++;
9172	ASSERT(dp->dtdo_refcnt != 0);
9173
9174	/*
9175	 * We need to check this DIF object for references to the variable
9176	 * DIF_VAR_VTIMESTAMP.
9177	 */
9178	for (i = 0; i < dp->dtdo_varlen; i++) {
9179		dtrace_difv_t *v = &dp->dtdo_vartab[i];
9180
9181		if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
9182			continue;
9183
9184		if (dtrace_vtime_references++ == 0)
9185			dtrace_vtime_enable();
9186	}
9187}
9188
9189/*
9190 * This routine calculates the dynamic variable chunksize for a given DIF
9191 * object.  The calculation is not fool-proof, and can probably be tricked by
9192 * malicious DIF -- but it works for all compiler-generated DIF.  Because this
9193 * calculation is likely imperfect, dtrace_dynvar() is able to gracefully fail
9194 * if a dynamic variable size exceeds the chunksize.
9195 */
9196static void
9197dtrace_difo_chunksize(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
9198{
9199	uint64_t sval = 0;
9200	dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
9201	const dif_instr_t *text = dp->dtdo_buf;
9202	uint_t pc, srd = 0;
9203	uint_t ttop = 0;
9204	size_t size, ksize;
9205	uint_t id, i;
9206
9207	for (pc = 0; pc < dp->dtdo_len; pc++) {
9208		dif_instr_t instr = text[pc];
9209		uint_t op = DIF_INSTR_OP(instr);
9210		uint_t rd = DIF_INSTR_RD(instr);
9211		uint_t r1 = DIF_INSTR_R1(instr);
9212		uint_t nkeys = 0;
9213		uchar_t scope = 0;
9214
9215		dtrace_key_t *key = tupregs;
9216
9217		switch (op) {
9218		case DIF_OP_SETX:
9219			sval = dp->dtdo_inttab[DIF_INSTR_INTEGER(instr)];
9220			srd = rd;
9221			continue;
9222
9223		case DIF_OP_STTS:
9224			key = &tupregs[DIF_DTR_NREGS];
9225			key[0].dttk_size = 0;
9226			key[1].dttk_size = 0;
9227			nkeys = 2;
9228			scope = DIFV_SCOPE_THREAD;
9229			break;
9230
9231		case DIF_OP_STGAA:
9232		case DIF_OP_STTAA:
9233			nkeys = ttop;
9234
9235			if (op == DIF_OP_STTAA)
9236				key[nkeys++].dttk_size = 0;
9237
9238			key[nkeys++].dttk_size = 0;
9239
9240			if (op == DIF_OP_STTAA) {
9241				scope = DIFV_SCOPE_THREAD;
9242			} else {
9243				scope = DIFV_SCOPE_GLOBAL;
9244			}
9245
9246			break;
9247
9248		case DIF_OP_PUSHTR:
9249			if (ttop == DIF_DTR_NREGS)
9250				return;
9251
9252			if ((srd == 0 || sval == 0) && r1 == DIF_TYPE_STRING) {
9253				/*
9254				 * If the register for the size of the "pushtr"
9255				 * is %r0 (or the value is 0) and the type is
9256				 * a string, we'll use the system-wide default
9257				 * string size.
9258				 */
9259				tupregs[ttop++].dttk_size =
9260				    dtrace_strsize_default;
9261			} else {
9262				if (srd == 0)
9263					return;
9264
9265				tupregs[ttop++].dttk_size = sval;
9266			}
9267
9268			break;
9269
9270		case DIF_OP_PUSHTV:
9271			if (ttop == DIF_DTR_NREGS)
9272				return;
9273
9274			tupregs[ttop++].dttk_size = 0;
9275			break;
9276
9277		case DIF_OP_FLUSHTS:
9278			ttop = 0;
9279			break;
9280
9281		case DIF_OP_POPTS:
9282			if (ttop != 0)
9283				ttop--;
9284			break;
9285		}
9286
9287		sval = 0;
9288		srd = 0;
9289
9290		if (nkeys == 0)
9291			continue;
9292
9293		/*
9294		 * We have a dynamic variable allocation; calculate its size.
9295		 */
9296		for (ksize = 0, i = 0; i < nkeys; i++)
9297			ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
9298
9299		size = sizeof (dtrace_dynvar_t);
9300		size += sizeof (dtrace_key_t) * (nkeys - 1);
9301		size += ksize;
9302
9303		/*
9304		 * Now we need to determine the size of the stored data.
9305		 */
9306		id = DIF_INSTR_VAR(instr);
9307
9308		for (i = 0; i < dp->dtdo_varlen; i++) {
9309			dtrace_difv_t *v = &dp->dtdo_vartab[i];
9310
9311			if (v->dtdv_id == id && v->dtdv_scope == scope) {
9312				size += v->dtdv_type.dtdt_size;
9313				break;
9314			}
9315		}
9316
9317		if (i == dp->dtdo_varlen)
9318			return;
9319
9320		/*
9321		 * We have the size.  If this is larger than the chunk size
9322		 * for our dynamic variable state, reset the chunk size.
9323		 */
9324		size = P2ROUNDUP(size, sizeof (uint64_t));
9325
9326		if (size > vstate->dtvs_dynvars.dtds_chunksize)
9327			vstate->dtvs_dynvars.dtds_chunksize = size;
9328	}
9329}
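
/*
 * A worked example of the arithmetic above:  for a thread-local scalar
 * such as self->x declared as a uint64_t, the "stts" contributes two
 * zero-sized keys (thread and id), so, symbolically:
 *
 *	size = sizeof (dtrace_dynvar_t)			(header)
 *	     + sizeof (dtrace_key_t) * (2 - 1)		(keys past the first)
 *	     + 0					(ksize; keys in-line)
 *	     + sizeof (uint64_t)			(the data itself)
 *
 * rounded up to an 8-byte boundary.  If that exceeds the current
 * dtds_chunksize, the chunk size grows to match.
 */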
9330
9331static void
9332dtrace_difo_init(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
9333{
9334	int i, oldsvars, osz, nsz, otlocals, ntlocals;
9335	uint_t id;
9336
9337	ASSERT(MUTEX_HELD(&dtrace_lock));
9338	ASSERT(dp->dtdo_buf != NULL && dp->dtdo_len != 0);
9339
9340	for (i = 0; i < dp->dtdo_varlen; i++) {
9341		dtrace_difv_t *v = &dp->dtdo_vartab[i];
9342		dtrace_statvar_t *svar, ***svarp = NULL;
9343		size_t dsize = 0;
9344		uint8_t scope = v->dtdv_scope;
9345		int *np = NULL;
9346
9347		if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
9348			continue;
9349
9350		id -= DIF_VAR_OTHER_UBASE;
9351
9352		switch (scope) {
9353		case DIFV_SCOPE_THREAD:
9354			while (id >= (otlocals = vstate->dtvs_ntlocals)) {
9355				dtrace_difv_t *tlocals;
9356
9357				if ((ntlocals = (otlocals << 1)) == 0)
9358					ntlocals = 1;
9359
9360				osz = otlocals * sizeof (dtrace_difv_t);
9361				nsz = ntlocals * sizeof (dtrace_difv_t);
9362
9363				tlocals = kmem_zalloc(nsz, KM_SLEEP);
9364
9365				if (osz != 0) {
9366					bcopy(vstate->dtvs_tlocals,
9367					    tlocals, osz);
9368					kmem_free(vstate->dtvs_tlocals, osz);
9369				}
9370
9371				vstate->dtvs_tlocals = tlocals;
9372				vstate->dtvs_ntlocals = ntlocals;
9373			}
9374
9375			vstate->dtvs_tlocals[id] = *v;
9376			continue;
9377
9378		case DIFV_SCOPE_LOCAL:
9379			np = &vstate->dtvs_nlocals;
9380			svarp = &vstate->dtvs_locals;
9381
9382			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
9383				dsize = NCPU * (v->dtdv_type.dtdt_size +
9384				    sizeof (uint64_t));
9385			else
9386				dsize = NCPU * sizeof (uint64_t);
9387
9388			break;
9389
9390		case DIFV_SCOPE_GLOBAL:
9391			np = &vstate->dtvs_nglobals;
9392			svarp = &vstate->dtvs_globals;
9393
9394			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
9395				dsize = v->dtdv_type.dtdt_size +
9396				    sizeof (uint64_t);
9397
9398			break;
9399
9400		default:
9401			ASSERT(0);
9402		}
9403
9404		while (id >= (oldsvars = *np)) {
9405			dtrace_statvar_t **statics;
9406			int newsvars, oldsize, newsize;
9407
9408			if ((newsvars = (oldsvars << 1)) == 0)
9409				newsvars = 1;
9410
9411			oldsize = oldsvars * sizeof (dtrace_statvar_t *);
9412			newsize = newsvars * sizeof (dtrace_statvar_t *);
9413
9414			statics = kmem_zalloc(newsize, KM_SLEEP);
9415
9416			if (oldsize != 0) {
9417				bcopy(*svarp, statics, oldsize);
9418				kmem_free(*svarp, oldsize);
9419			}
9420
9421			*svarp = statics;
9422			*np = newsvars;
9423		}
9424
9425		if ((svar = (*svarp)[id]) == NULL) {
9426			svar = kmem_zalloc(sizeof (dtrace_statvar_t), KM_SLEEP);
9427			svar->dtsv_var = *v;
9428
9429			if ((svar->dtsv_size = dsize) != 0) {
9430				svar->dtsv_data = (uint64_t)(uintptr_t)
9431				    kmem_zalloc(dsize, KM_SLEEP);
9432			}
9433
9434			(*svarp)[id] = svar;
9435		}
9436
9437		svar->dtsv_refcnt++;
9438	}
9439
9440	dtrace_difo_chunksize(dp, vstate);
9441	dtrace_difo_hold(dp);
9442}
9443
9444static dtrace_difo_t *
9445dtrace_difo_duplicate(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
9446{
9447	dtrace_difo_t *new;
9448	size_t sz;
9449
9450	ASSERT(dp->dtdo_buf != NULL);
9451	ASSERT(dp->dtdo_refcnt != 0);
9452
9453	new = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
9454
9455	ASSERT(dp->dtdo_buf != NULL);
9456	sz = dp->dtdo_len * sizeof (dif_instr_t);
9457	new->dtdo_buf = kmem_alloc(sz, KM_SLEEP);
9458	bcopy(dp->dtdo_buf, new->dtdo_buf, sz);
9459	new->dtdo_len = dp->dtdo_len;
9460
9461	if (dp->dtdo_strtab != NULL) {
9462		ASSERT(dp->dtdo_strlen != 0);
9463		new->dtdo_strtab = kmem_alloc(dp->dtdo_strlen, KM_SLEEP);
9464		bcopy(dp->dtdo_strtab, new->dtdo_strtab, dp->dtdo_strlen);
9465		new->dtdo_strlen = dp->dtdo_strlen;
9466	}
9467
9468	if (dp->dtdo_inttab != NULL) {
9469		ASSERT(dp->dtdo_intlen != 0);
9470		sz = dp->dtdo_intlen * sizeof (uint64_t);
9471		new->dtdo_inttab = kmem_alloc(sz, KM_SLEEP);
9472		bcopy(dp->dtdo_inttab, new->dtdo_inttab, sz);
9473		new->dtdo_intlen = dp->dtdo_intlen;
9474	}
9475
9476	if (dp->dtdo_vartab != NULL) {
9477		ASSERT(dp->dtdo_varlen != 0);
9478		sz = dp->dtdo_varlen * sizeof (dtrace_difv_t);
9479		new->dtdo_vartab = kmem_alloc(sz, KM_SLEEP);
9480		bcopy(dp->dtdo_vartab, new->dtdo_vartab, sz);
9481		new->dtdo_varlen = dp->dtdo_varlen;
9482	}
9483
9484	dtrace_difo_init(new, vstate);
9485	return (new);
9486}
9487
9488static void
9489dtrace_difo_destroy(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
9490{
9491	int i;
9492
9493	ASSERT(dp->dtdo_refcnt == 0);
9494
9495	for (i = 0; i < dp->dtdo_varlen; i++) {
9496		dtrace_difv_t *v = &dp->dtdo_vartab[i];
9497		dtrace_statvar_t *svar, **svarp = NULL;
9498		uint_t id;
9499		uint8_t scope = v->dtdv_scope;
9500		int *np = NULL;
9501
9502		switch (scope) {
9503		case DIFV_SCOPE_THREAD:
9504			continue;
9505
9506		case DIFV_SCOPE_LOCAL:
9507			np = &vstate->dtvs_nlocals;
9508			svarp = vstate->dtvs_locals;
9509			break;
9510
9511		case DIFV_SCOPE_GLOBAL:
9512			np = &vstate->dtvs_nglobals;
9513			svarp = vstate->dtvs_globals;
9514			break;
9515
9516		default:
9517			ASSERT(0);
9518		}
9519
9520		if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
9521			continue;
9522
9523		id -= DIF_VAR_OTHER_UBASE;
9524		ASSERT(id < *np);
9525
9526		svar = svarp[id];
9527		ASSERT(svar != NULL);
9528		ASSERT(svar->dtsv_refcnt > 0);
9529
9530		if (--svar->dtsv_refcnt > 0)
9531			continue;
9532
9533		if (svar->dtsv_size != 0) {
9534			ASSERT(svar->dtsv_data != 0);
9535			kmem_free((void *)(uintptr_t)svar->dtsv_data,
9536			    svar->dtsv_size);
9537		}
9538
9539		kmem_free(svar, sizeof (dtrace_statvar_t));
9540		svarp[id] = NULL;
9541	}
9542
9543	if (dp->dtdo_buf != NULL)
9544		kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
9545	if (dp->dtdo_inttab != NULL)
9546		kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
9547	if (dp->dtdo_strtab != NULL)
9548		kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
9549	if (dp->dtdo_vartab != NULL)
9550		kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
9551
9552	kmem_free(dp, sizeof (dtrace_difo_t));
9553}
9554
9555static void
9556dtrace_difo_release(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
9557{
9558	int i;
9559
9560	ASSERT(MUTEX_HELD(&dtrace_lock));
9561	ASSERT(dp->dtdo_refcnt != 0);
9562
9563	for (i = 0; i < dp->dtdo_varlen; i++) {
9564		dtrace_difv_t *v = &dp->dtdo_vartab[i];
9565
9566		if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
9567			continue;
9568
9569		ASSERT(dtrace_vtime_references > 0);
9570		if (--dtrace_vtime_references == 0)
9571			dtrace_vtime_disable();
9572	}
9573
9574	if (--dp->dtdo_refcnt == 0)
9575		dtrace_difo_destroy(dp, vstate);
9576}
9577
9578/*
9579 * DTrace Format Functions
9580 */
9581static uint16_t
9582dtrace_format_add(dtrace_state_t *state, char *str)
9583{
9584	char *fmt, **new;
9585	uint16_t ndx, len = strlen(str) + 1;
9586
9587	fmt = kmem_zalloc(len, KM_SLEEP);
9588	bcopy(str, fmt, len);
9589
9590	for (ndx = 0; ndx < state->dts_nformats; ndx++) {
9591		if (state->dts_formats[ndx] == NULL) {
9592			state->dts_formats[ndx] = fmt;
9593			return (ndx + 1);
9594		}
9595	}
9596
9597	if (state->dts_nformats == USHRT_MAX) {
9598		/*
9599		 * This is only likely if a denial-of-service attack is being
9600		 * attempted.  As such, it's okay to fail silently here.
9601		 */
9602		kmem_free(fmt, len);
9603		return (0);
9604	}
9605
9606	/*
9607	 * For simplicity, we always resize the formats array to be exactly the
9608	 * number of formats.
9609	 */
9610	ndx = state->dts_nformats++;
9611	new = kmem_alloc((ndx + 1) * sizeof (char *), KM_SLEEP);
9612
9613	if (state->dts_formats != NULL) {
9614		ASSERT(ndx != 0);
9615		bcopy(state->dts_formats, new, ndx * sizeof (char *));
9616		kmem_free(state->dts_formats, ndx * sizeof (char *));
9617	}
9618
9619	state->dts_formats = new;
9620	state->dts_formats[ndx] = fmt;
9621
9622	return (ndx + 1);
9623}
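
/*
 * A brief illustration of the 1-based indexing:  because 0 denotes
 * failure, a format index of 0 can also mean "no format" without any
 * separate flag.  The caller shown is hypothetical:
 *
 *	uint16_t fmt = dtrace_format_add(state, "value = %d\n");
 *
 *	if (fmt == 0)
 *		return (ENOMEM);
 *	...
 *	dtrace_format_remove(state, fmt);
 */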
9624
9625static void
9626dtrace_format_remove(dtrace_state_t *state, uint16_t format)
9627{
9628	char *fmt;
9629
9630	ASSERT(state->dts_formats != NULL);
9631	ASSERT(format <= state->dts_nformats);
9632	ASSERT(state->dts_formats[format - 1] != NULL);
9633
9634	fmt = state->dts_formats[format - 1];
9635	kmem_free(fmt, strlen(fmt) + 1);
9636	state->dts_formats[format - 1] = NULL;
9637}
9638
9639static void
9640dtrace_format_destroy(dtrace_state_t *state)
9641{
9642	int i;
9643
9644	if (state->dts_nformats == 0) {
9645		ASSERT(state->dts_formats == NULL);
9646		return;
9647	}
9648
9649	ASSERT(state->dts_formats != NULL);
9650
9651	for (i = 0; i < state->dts_nformats; i++) {
9652		char *fmt = state->dts_formats[i];
9653
9654		if (fmt == NULL)
9655			continue;
9656
9657		kmem_free(fmt, strlen(fmt) + 1);
9658	}
9659
9660	kmem_free(state->dts_formats, state->dts_nformats * sizeof (char *));
9661	state->dts_nformats = 0;
9662	state->dts_formats = NULL;
9663}
9664
9665/*
9666 * DTrace Predicate Functions
9667 */
9668static dtrace_predicate_t *
9669dtrace_predicate_create(dtrace_difo_t *dp)
9670{
9671	dtrace_predicate_t *pred;
9672
9673	ASSERT(MUTEX_HELD(&dtrace_lock));
9674	ASSERT(dp->dtdo_refcnt != 0);
9675
9676	pred = kmem_zalloc(sizeof (dtrace_predicate_t), KM_SLEEP);
9677	pred->dtp_difo = dp;
9678	pred->dtp_refcnt = 1;
9679
9680	if (!dtrace_difo_cacheable(dp))
9681		return (pred);
9682
9683	if (dtrace_predcache_id == DTRACE_CACHEIDNONE) {
9684		/*
9685		 * This is only theoretically possible -- we have had 2^32
9686		 * cacheable predicates on this machine.  We cannot allow any
9687		 * more predicates to become cacheable:  as unlikely as it is,
9688		 * there may be a thread caching a (now stale) predicate cache
9689		 * ID. (N.B.: the temptation is being successfully resisted to
9690		 * have this cmn_err() "Holy shit -- we executed this code!")
9691		 */
9692		return (pred);
9693	}
9694
9695	pred->dtp_cacheid = dtrace_predcache_id++;
9696
9697	return (pred);
9698}
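/*
 * The cache ID assigned above allows probe context to skip evaluating
 * a predicate that is known to have failed for the current thread:
 * in outline, dtrace_probe() performs a check along the lines of
 *
 *	if (probe->dtpr_predcache != DTRACE_CACHEIDNONE &&
 *	    probe->dtpr_predcache == curthread->t_predcache)
 *		return;		(predicate known to evaluate false)
 *
 * before doing any per-ECB work.  (See the probe context functions,
 * earlier in this file.)
 */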
9699
9700static void
9701dtrace_predicate_hold(dtrace_predicate_t *pred)
9702{
9703	ASSERT(MUTEX_HELD(&dtrace_lock));
9704	ASSERT(pred->dtp_difo != NULL && pred->dtp_difo->dtdo_refcnt != 0);
9705	ASSERT(pred->dtp_refcnt > 0);
9706
9707	pred->dtp_refcnt++;
9708}
9709
9710static void
9711dtrace_predicate_release(dtrace_predicate_t *pred, dtrace_vstate_t *vstate)
9712{
9713	dtrace_difo_t *dp = pred->dtp_difo;
9714
9715	ASSERT(MUTEX_HELD(&dtrace_lock));
9716	ASSERT(dp != NULL && dp->dtdo_refcnt != 0);
9717	ASSERT(pred->dtp_refcnt > 0);
9718
9719	if (--pred->dtp_refcnt == 0) {
9720		dtrace_difo_release(pred->dtp_difo, vstate);
9721		kmem_free(pred, sizeof (dtrace_predicate_t));
9722	}
9723}
9724
9725/*
9726 * DTrace Action Description Functions
9727 */
9728static dtrace_actdesc_t *
9729dtrace_actdesc_create(dtrace_actkind_t kind, uint32_t ntuple,
9730    uint64_t uarg, uint64_t arg)
9731{
9732	dtrace_actdesc_t *act;
9733
9734#if defined(sun)
	ASSERT(!DTRACEACT_ISPRINTFLIKE(kind) || (arg != 0 &&
	    arg >= KERNELBASE) || (arg == 0 && kind == DTRACEACT_PRINTA));
9737#endif
9738
9739	act = kmem_zalloc(sizeof (dtrace_actdesc_t), KM_SLEEP);
9740	act->dtad_kind = kind;
9741	act->dtad_ntuple = ntuple;
9742	act->dtad_uarg = uarg;
9743	act->dtad_arg = arg;
9744	act->dtad_refcnt = 1;
9745
9746	return (act);
9747}
9748
9749static void
9750dtrace_actdesc_hold(dtrace_actdesc_t *act)
9751{
9752	ASSERT(act->dtad_refcnt >= 1);
9753	act->dtad_refcnt++;
9754}
9755
9756static void
9757dtrace_actdesc_release(dtrace_actdesc_t *act, dtrace_vstate_t *vstate)
9758{
9759	dtrace_actkind_t kind = act->dtad_kind;
9760	dtrace_difo_t *dp;
9761
9762	ASSERT(act->dtad_refcnt >= 1);
9763
9764	if (--act->dtad_refcnt != 0)
9765		return;
9766
9767	if ((dp = act->dtad_difo) != NULL)
9768		dtrace_difo_release(dp, vstate);
9769
9770	if (DTRACEACT_ISPRINTFLIKE(kind)) {
9771		char *str = (char *)(uintptr_t)act->dtad_arg;
9772
9773#if defined(sun)
9774		ASSERT((str != NULL && (uintptr_t)str >= KERNELBASE) ||
9775		    (str == NULL && act->dtad_kind == DTRACEACT_PRINTA));
9776#endif
9777
9778		if (str != NULL)
9779			kmem_free(str, strlen(str) + 1);
9780	}
9781
9782	kmem_free(act, sizeof (dtrace_actdesc_t));
9783}
9784
9785/*
9786 * DTrace ECB Functions
9787 */
9788static dtrace_ecb_t *
9789dtrace_ecb_add(dtrace_state_t *state, dtrace_probe_t *probe)
9790{
9791	dtrace_ecb_t *ecb;
9792	dtrace_epid_t epid;
9793
9794	ASSERT(MUTEX_HELD(&dtrace_lock));
9795
9796	ecb = kmem_zalloc(sizeof (dtrace_ecb_t), KM_SLEEP);
9797	ecb->dte_predicate = NULL;
9798	ecb->dte_probe = probe;
9799
9800	/*
9801	 * The default size is the size of the default action: recording
9802	 * the header.
9803	 */
9804	ecb->dte_size = ecb->dte_needed = sizeof (dtrace_rechdr_t);
9805	ecb->dte_alignment = sizeof (dtrace_epid_t);
9806
9807	epid = state->dts_epid++;
9808
9809	if (epid - 1 >= state->dts_necbs) {
9810		dtrace_ecb_t **oecbs = state->dts_ecbs, **ecbs;
9811		int necbs = state->dts_necbs << 1;
9812
9813		ASSERT(epid == state->dts_necbs + 1);
9814
9815		if (necbs == 0) {
9816			ASSERT(oecbs == NULL);
9817			necbs = 1;
9818		}
9819
9820		ecbs = kmem_zalloc(necbs * sizeof (*ecbs), KM_SLEEP);
9821
9822		if (oecbs != NULL)
9823			bcopy(oecbs, ecbs, state->dts_necbs * sizeof (*ecbs));
9824
9825		dtrace_membar_producer();
9826		state->dts_ecbs = ecbs;
9827
9828		if (oecbs != NULL) {
9829			/*
9830			 * If this state is active, we must dtrace_sync()
9831			 * before we can free the old dts_ecbs array:  we're
9832			 * coming in hot, and there may be active ring
9833			 * buffer processing (which indexes into the dts_ecbs
9834			 * array) on another CPU.
9835			 */
9836			if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
9837				dtrace_sync();
9838
9839			kmem_free(oecbs, state->dts_necbs * sizeof (*ecbs));
9840		}
9841
9842		dtrace_membar_producer();
9843		state->dts_necbs = necbs;
9844	}
9845
9846	ecb->dte_state = state;
9847
9848	ASSERT(state->dts_ecbs[epid - 1] == NULL);
9849	dtrace_membar_producer();
9850	state->dts_ecbs[(ecb->dte_epid = epid) - 1] = ecb;
9851
9852	return (ecb);
9853}
9854
9855static void
9856dtrace_ecb_enable(dtrace_ecb_t *ecb)
9857{
9858	dtrace_probe_t *probe = ecb->dte_probe;
9859
9860	ASSERT(MUTEX_HELD(&cpu_lock));
9861	ASSERT(MUTEX_HELD(&dtrace_lock));
9862	ASSERT(ecb->dte_next == NULL);
9863
9864	if (probe == NULL) {
9865		/*
9866		 * This is the NULL probe -- there's nothing to do.
9867		 */
9868		return;
9869	}
9870
9871	if (probe->dtpr_ecb == NULL) {
9872		dtrace_provider_t *prov = probe->dtpr_provider;
9873
9874		/*
9875		 * We're the first ECB on this probe.
9876		 */
9877		probe->dtpr_ecb = probe->dtpr_ecb_last = ecb;
9878
9879		if (ecb->dte_predicate != NULL)
9880			probe->dtpr_predcache = ecb->dte_predicate->dtp_cacheid;
9881
9882		prov->dtpv_pops.dtps_enable(prov->dtpv_arg,
9883		    probe->dtpr_id, probe->dtpr_arg);
9884	} else {
9885		/*
9886		 * This probe is already active.  Swing the last pointer to
9887		 * point to the new ECB, and issue a dtrace_sync() to assure
9888		 * that all CPUs have seen the change.
9889		 */
9890		ASSERT(probe->dtpr_ecb_last != NULL);
9891		probe->dtpr_ecb_last->dte_next = ecb;
9892		probe->dtpr_ecb_last = ecb;
9893		probe->dtpr_predcache = 0;
9894
9895		dtrace_sync();
9896	}
9897}
9898
9899static void
9900dtrace_ecb_resize(dtrace_ecb_t *ecb)
9901{
9902	dtrace_action_t *act;
9903	uint32_t curneeded = UINT32_MAX;
9904	uint32_t aggbase = UINT32_MAX;
9905
9906	/*
9907	 * If we record anything, we always record the dtrace_rechdr_t.  (And
9908	 * we always record it first.)
9909	 */
9910	ecb->dte_size = sizeof (dtrace_rechdr_t);
9911	ecb->dte_alignment = sizeof (dtrace_epid_t);
9912
9913	for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
9914		dtrace_recdesc_t *rec = &act->dta_rec;
9915		ASSERT(rec->dtrd_size > 0 || rec->dtrd_alignment == 1);
9916
9917		ecb->dte_alignment = MAX(ecb->dte_alignment,
9918		    rec->dtrd_alignment);
9919
9920		if (DTRACEACT_ISAGG(act->dta_kind)) {
9921			dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
9922
9923			ASSERT(rec->dtrd_size != 0);
9924			ASSERT(agg->dtag_first != NULL);
9925			ASSERT(act->dta_prev->dta_intuple);
9926			ASSERT(aggbase != UINT32_MAX);
9927			ASSERT(curneeded != UINT32_MAX);
9928
9929			agg->dtag_base = aggbase;
9930
9931			curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment);
9932			rec->dtrd_offset = curneeded;
9933			curneeded += rec->dtrd_size;
9934			ecb->dte_needed = MAX(ecb->dte_needed, curneeded);
9935
9936			aggbase = UINT32_MAX;
9937			curneeded = UINT32_MAX;
9938		} else if (act->dta_intuple) {
9939			if (curneeded == UINT32_MAX) {
9940				/*
9941				 * This is the first record in a tuple.  Align
9942				 * curneeded to be at offset 4 in an 8-byte
9943				 * aligned block.
9944				 */
9945				ASSERT(act->dta_prev == NULL ||
9946				    !act->dta_prev->dta_intuple);
9947				ASSERT3U(aggbase, ==, UINT32_MAX);
9948				curneeded = P2PHASEUP(ecb->dte_size,
9949				    sizeof (uint64_t), sizeof (dtrace_aggid_t));
9950
9951				aggbase = curneeded - sizeof (dtrace_aggid_t);
9952				ASSERT(IS_P2ALIGNED(aggbase,
9953				    sizeof (uint64_t)));
9954			}
9955			curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment);
9956			rec->dtrd_offset = curneeded;
9957			curneeded += rec->dtrd_size;
9958		} else {
9959			/* tuples must be followed by an aggregation */
9960			ASSERT(act->dta_prev == NULL ||
9961			    !act->dta_prev->dta_intuple);
9962
9963			ecb->dte_size = P2ROUNDUP(ecb->dte_size,
9964			    rec->dtrd_alignment);
9965			rec->dtrd_offset = ecb->dte_size;
9966			ecb->dte_size += rec->dtrd_size;
9967			ecb->dte_needed = MAX(ecb->dte_needed, ecb->dte_size);
9968		}
9969	}
9970
9971	if ((act = ecb->dte_action) != NULL &&
9972	    !(act->dta_kind == DTRACEACT_SPECULATE && act->dta_next == NULL) &&
9973	    ecb->dte_size == sizeof (dtrace_rechdr_t)) {
9974		/*
9975		 * If the size is still sizeof (dtrace_rechdr_t), then all
9976		 * actions store no data; set the size to 0.
9977		 */
9978		ecb->dte_size = 0;
9979	}
9980
9981	ecb->dte_size = P2ROUNDUP(ecb->dte_size, sizeof (dtrace_epid_t));
9982	ecb->dte_needed = P2ROUNDUP(ecb->dte_needed, (sizeof (dtrace_epid_t)));
9983	ecb->dte_state->dts_needed = MAX(ecb->dte_state->dts_needed,
9984	    ecb->dte_needed);
9985}
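/*
 * A worked example of the sizing performed above (hypothetical action
 * chain; assumes the 12-byte dtrace_rechdr_t -- a 4-byte EPID plus a
 * 64-bit timestamp split across two 32-bit words):  an ECB with one
 * 4-byte record (alignment 4) and one 8-byte record (alignment 8)
 * lays out as
 *
 *	offset  0:	dtrace_rechdr_t
 *	offset 12:	4-byte record
 *	offset 16:	8-byte record
 *
 * for a dte_size of 24 and a dte_alignment of 8; dte_size is then
 * rounded up to a multiple of sizeof (dtrace_epid_t), which it
 * already is.
 */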
9986
9987static dtrace_action_t *
9988dtrace_ecb_aggregation_create(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
9989{
9990	dtrace_aggregation_t *agg;
9991	size_t size = sizeof (uint64_t);
9992	int ntuple = desc->dtad_ntuple;
9993	dtrace_action_t *act;
9994	dtrace_recdesc_t *frec;
9995	dtrace_aggid_t aggid;
9996	dtrace_state_t *state = ecb->dte_state;
9997
9998	agg = kmem_zalloc(sizeof (dtrace_aggregation_t), KM_SLEEP);
9999	agg->dtag_ecb = ecb;
10000
10001	ASSERT(DTRACEACT_ISAGG(desc->dtad_kind));
10002
10003	switch (desc->dtad_kind) {
10004	case DTRACEAGG_MIN:
10005		agg->dtag_initial = INT64_MAX;
10006		agg->dtag_aggregate = dtrace_aggregate_min;
10007		break;
10008
10009	case DTRACEAGG_MAX:
10010		agg->dtag_initial = INT64_MIN;
10011		agg->dtag_aggregate = dtrace_aggregate_max;
10012		break;
10013
10014	case DTRACEAGG_COUNT:
10015		agg->dtag_aggregate = dtrace_aggregate_count;
10016		break;
10017
10018	case DTRACEAGG_QUANTIZE:
10019		agg->dtag_aggregate = dtrace_aggregate_quantize;
10020		size = (((sizeof (uint64_t) * NBBY) - 1) * 2 + 1) *
10021		    sizeof (uint64_t);
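		/*
		 * That is, ((64 - 1) * 2 + 1) == 127 buckets -- one per
		 * power of two in each direction plus one for zero -- at
		 * eight bytes apiece:  1016 bytes per quantization.
		 */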
10022		break;
10023
10024	case DTRACEAGG_LQUANTIZE: {
10025		uint16_t step = DTRACE_LQUANTIZE_STEP(desc->dtad_arg);
10026		uint16_t levels = DTRACE_LQUANTIZE_LEVELS(desc->dtad_arg);
10027
10028		agg->dtag_initial = desc->dtad_arg;
10029		agg->dtag_aggregate = dtrace_aggregate_lquantize;
10030
10031		if (step == 0 || levels == 0)
10032			goto err;
10033
10034		size = levels * sizeof (uint64_t) + 3 * sizeof (uint64_t);
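		/*
		 * The three additional words are the encoded argument
		 * itself (kept as the first word of the aggregation via
		 * dtag_initial, above) plus the underflow and overflow
		 * buckets that bracket the levels.
		 */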
10035		break;
10036	}
10037
10038	case DTRACEAGG_LLQUANTIZE: {
10039		uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(desc->dtad_arg);
10040		uint16_t low = DTRACE_LLQUANTIZE_LOW(desc->dtad_arg);
10041		uint16_t high = DTRACE_LLQUANTIZE_HIGH(desc->dtad_arg);
10042		uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(desc->dtad_arg);
10043		int64_t v;
10044
10045		agg->dtag_initial = desc->dtad_arg;
10046		agg->dtag_aggregate = dtrace_aggregate_llquantize;
10047
10048		if (factor < 2 || low >= high || nsteps < factor)
10049			goto err;
10050
10051		/*
10052		 * Now check that the number of steps evenly divides a power
10053		 * of the factor.  (This assures both integer bucket size and
10054		 * linearity within each magnitude.)
10055		 */
10056		for (v = factor; v < nsteps; v *= factor)
10057			continue;
10058
10059		if ((v % nsteps) || (nsteps % factor))
10060			goto err;
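		/*
		 * For example (illustrative values):  a factor of 10 with
		 * 20 steps passes (v reaches 100, 100 % 20 == 0 and
		 * 20 % 10 == 0), whereas a factor of 10 with 30 steps is
		 * rejected (100 % 30 != 0):  30 steps cannot evenly tile
		 * a decade with integer-sized buckets.
		 */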
10061
10062		size = (dtrace_aggregate_llquantize_bucket(factor,
10063		    low, high, nsteps, INT64_MAX) + 2) * sizeof (uint64_t);
10064		break;
10065	}
10066
10067	case DTRACEAGG_AVG:
10068		agg->dtag_aggregate = dtrace_aggregate_avg;
10069		size = sizeof (uint64_t) * 2;
10070		break;
10071
10072	case DTRACEAGG_STDDEV:
10073		agg->dtag_aggregate = dtrace_aggregate_stddev;
10074		size = sizeof (uint64_t) * 4;
10075		break;
10076
10077	case DTRACEAGG_SUM:
10078		agg->dtag_aggregate = dtrace_aggregate_sum;
10079		break;
10080
10081	default:
10082		goto err;
10083	}
10084
10085	agg->dtag_action.dta_rec.dtrd_size = size;
10086
10087	if (ntuple == 0)
10088		goto err;
10089
10090	/*
10091	 * We must make sure that we have enough actions for the n-tuple.
10092	 */
10093	for (act = ecb->dte_action_last; act != NULL; act = act->dta_prev) {
10094		if (DTRACEACT_ISAGG(act->dta_kind))
10095			break;
10096
10097		if (--ntuple == 0) {
10098			/*
10099			 * This is the action with which our n-tuple begins.
10100			 */
10101			agg->dtag_first = act;
10102			goto success;
10103		}
10104	}
10105
10106	/*
10107	 * This n-tuple is short by ntuple elements.  Return failure.
10108	 */
10109	ASSERT(ntuple != 0);
10110err:
10111	kmem_free(agg, sizeof (dtrace_aggregation_t));
10112	return (NULL);
10113
10114success:
10115	/*
10116	 * If the last action in the tuple has a size of zero, it's actually
10117	 * an expression argument for the aggregating action.
10118	 */
10119	ASSERT(ecb->dte_action_last != NULL);
10120	act = ecb->dte_action_last;
10121
10122	if (act->dta_kind == DTRACEACT_DIFEXPR) {
10123		ASSERT(act->dta_difo != NULL);
10124
10125		if (act->dta_difo->dtdo_rtype.dtdt_size == 0)
10126			agg->dtag_hasarg = 1;
10127	}
10128
10129	/*
10130	 * We need to allocate an id for this aggregation.
10131	 */
10132#if defined(sun)
10133	aggid = (dtrace_aggid_t)(uintptr_t)vmem_alloc(state->dts_aggid_arena, 1,
10134	    VM_BESTFIT | VM_SLEEP);
10135#else
10136	aggid = alloc_unr(state->dts_aggid_arena);
10137#endif
10138
10139	if (aggid - 1 >= state->dts_naggregations) {
10140		dtrace_aggregation_t **oaggs = state->dts_aggregations;
10141		dtrace_aggregation_t **aggs;
10142		int naggs = state->dts_naggregations << 1;
10143		int onaggs = state->dts_naggregations;
10144
10145		ASSERT(aggid == state->dts_naggregations + 1);
10146
10147		if (naggs == 0) {
10148			ASSERT(oaggs == NULL);
10149			naggs = 1;
10150		}
10151
10152		aggs = kmem_zalloc(naggs * sizeof (*aggs), KM_SLEEP);
10153
10154		if (oaggs != NULL) {
10155			bcopy(oaggs, aggs, onaggs * sizeof (*aggs));
10156			kmem_free(oaggs, onaggs * sizeof (*aggs));
10157		}
10158
10159		state->dts_aggregations = aggs;
10160		state->dts_naggregations = naggs;
10161	}
10162
10163	ASSERT(state->dts_aggregations[aggid - 1] == NULL);
10164	state->dts_aggregations[(agg->dtag_id = aggid) - 1] = agg;
10165
10166	frec = &agg->dtag_first->dta_rec;
10167	if (frec->dtrd_alignment < sizeof (dtrace_aggid_t))
10168		frec->dtrd_alignment = sizeof (dtrace_aggid_t);
10169
10170	for (act = agg->dtag_first; act != NULL; act = act->dta_next) {
10171		ASSERT(!act->dta_intuple);
10172		act->dta_intuple = 1;
10173	}
10174
10175	return (&agg->dtag_action);
10176}
10177
10178static void
10179dtrace_ecb_aggregation_destroy(dtrace_ecb_t *ecb, dtrace_action_t *act)
10180{
10181	dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
10182	dtrace_state_t *state = ecb->dte_state;
10183	dtrace_aggid_t aggid = agg->dtag_id;
10184
10185	ASSERT(DTRACEACT_ISAGG(act->dta_kind));
10186#if defined(sun)
10187	vmem_free(state->dts_aggid_arena, (void *)(uintptr_t)aggid, 1);
10188#else
10189	free_unr(state->dts_aggid_arena, aggid);
10190#endif
10191
10192	ASSERT(state->dts_aggregations[aggid - 1] == agg);
10193	state->dts_aggregations[aggid - 1] = NULL;
10194
10195	kmem_free(agg, sizeof (dtrace_aggregation_t));
10196}
10197
10198static int
10199dtrace_ecb_action_add(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
10200{
10201	dtrace_action_t *action, *last;
10202	dtrace_difo_t *dp = desc->dtad_difo;
10203	uint32_t size = 0, align = sizeof (uint8_t), mask;
10204	uint16_t format = 0;
10205	dtrace_recdesc_t *rec;
10206	dtrace_state_t *state = ecb->dte_state;
10207	dtrace_optval_t *opt = state->dts_options, nframes = 0, strsize;
10208	uint64_t arg = desc->dtad_arg;
10209
10210	ASSERT(MUTEX_HELD(&dtrace_lock));
10211	ASSERT(ecb->dte_action == NULL || ecb->dte_action->dta_refcnt == 1);
10212
10213	if (DTRACEACT_ISAGG(desc->dtad_kind)) {
10214		/*
10215		 * If this is an aggregating action, there must be neither
10216		 * a speculate nor a commit on the action chain.
10217		 */
10218		dtrace_action_t *act;
10219
10220		for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
10221			if (act->dta_kind == DTRACEACT_COMMIT)
10222				return (EINVAL);
10223
10224			if (act->dta_kind == DTRACEACT_SPECULATE)
10225				return (EINVAL);
10226		}
10227
10228		action = dtrace_ecb_aggregation_create(ecb, desc);
10229
10230		if (action == NULL)
10231			return (EINVAL);
10232	} else {
10233		if (DTRACEACT_ISDESTRUCTIVE(desc->dtad_kind) ||
10234		    (desc->dtad_kind == DTRACEACT_DIFEXPR &&
10235		    dp != NULL && dp->dtdo_destructive)) {
10236			state->dts_destructive = 1;
10237		}
10238
10239		switch (desc->dtad_kind) {
10240		case DTRACEACT_PRINTF:
10241		case DTRACEACT_PRINTA:
10242		case DTRACEACT_SYSTEM:
10243		case DTRACEACT_FREOPEN:
10244		case DTRACEACT_DIFEXPR:
10245			/*
10246			 * We know that our arg is a string -- turn it into a
10247			 * format.
10248			 */
10249			if (arg == 0) {
10250				ASSERT(desc->dtad_kind == DTRACEACT_PRINTA ||
10251				    desc->dtad_kind == DTRACEACT_DIFEXPR);
10252				format = 0;
10253			} else {
10254				ASSERT(arg != 0);
10255#if defined(sun)
10256				ASSERT(arg > KERNELBASE);
10257#endif
10258				format = dtrace_format_add(state,
10259				    (char *)(uintptr_t)arg);
10260			}
10261
10262			/*FALLTHROUGH*/
10263		case DTRACEACT_LIBACT:
10264		case DTRACEACT_TRACEMEM:
10265		case DTRACEACT_TRACEMEM_DYNSIZE:
10266			if (dp == NULL)
10267				return (EINVAL);
10268
10269			if ((size = dp->dtdo_rtype.dtdt_size) != 0)
10270				break;
10271
10272			if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) {
10273				if (!(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
10274					return (EINVAL);
10275
10276				size = opt[DTRACEOPT_STRSIZE];
10277			}
10278
10279			break;
10280
10281		case DTRACEACT_STACK:
10282			if ((nframes = arg) == 0) {
10283				nframes = opt[DTRACEOPT_STACKFRAMES];
10284				ASSERT(nframes > 0);
10285				arg = nframes;
10286			}
10287
10288			size = nframes * sizeof (pc_t);
10289			break;
10290
10291		case DTRACEACT_JSTACK:
10292			if ((strsize = DTRACE_USTACK_STRSIZE(arg)) == 0)
10293				strsize = opt[DTRACEOPT_JSTACKSTRSIZE];
10294
10295			if ((nframes = DTRACE_USTACK_NFRAMES(arg)) == 0)
10296				nframes = opt[DTRACEOPT_JSTACKFRAMES];
10297
10298			arg = DTRACE_USTACK_ARG(nframes, strsize);
10299
10300			/*FALLTHROUGH*/
10301		case DTRACEACT_USTACK:
10302			if (desc->dtad_kind != DTRACEACT_JSTACK &&
10303			    (nframes = DTRACE_USTACK_NFRAMES(arg)) == 0) {
10304				strsize = DTRACE_USTACK_STRSIZE(arg);
10305				nframes = opt[DTRACEOPT_USTACKFRAMES];
10306				ASSERT(nframes > 0);
10307				arg = DTRACE_USTACK_ARG(nframes, strsize);
10308			}
10309
10310			/*
10311			 * Save a slot for the pid.
10312			 */
10313			size = (nframes + 1) * sizeof (uint64_t);
10314			size += DTRACE_USTACK_STRSIZE(arg);
10315			size = P2ROUNDUP(size, (uint32_t)(sizeof (uintptr_t)));
10316
10317			break;
10318
10319		case DTRACEACT_SYM:
10320		case DTRACEACT_MOD:
10321			if (dp == NULL || ((size = dp->dtdo_rtype.dtdt_size) !=
10322			    sizeof (uint64_t)) ||
10323			    (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
10324				return (EINVAL);
10325			break;
10326
10327		case DTRACEACT_USYM:
10328		case DTRACEACT_UMOD:
10329		case DTRACEACT_UADDR:
10330			if (dp == NULL ||
10331			    (dp->dtdo_rtype.dtdt_size != sizeof (uint64_t)) ||
10332			    (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
10333				return (EINVAL);
10334
10335			/*
10336			 * We have a slot for the pid, plus a slot for the
10337			 * argument.  To keep things simple (aligned with
10338			 * bitness-neutral sizing), we store each as a 64-bit
10339			 * quantity.
10340			 */
10341			size = 2 * sizeof (uint64_t);
10342			break;
10343
10344		case DTRACEACT_STOP:
10345		case DTRACEACT_BREAKPOINT:
10346		case DTRACEACT_PANIC:
10347			break;
10348
10349		case DTRACEACT_CHILL:
10350		case DTRACEACT_DISCARD:
10351		case DTRACEACT_RAISE:
10352			if (dp == NULL)
10353				return (EINVAL);
10354			break;
10355
10356		case DTRACEACT_EXIT:
10357			if (dp == NULL ||
10358			    (size = dp->dtdo_rtype.dtdt_size) != sizeof (int) ||
10359			    (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
10360				return (EINVAL);
10361			break;
10362
10363		case DTRACEACT_SPECULATE:
10364			if (ecb->dte_size > sizeof (dtrace_rechdr_t))
10365				return (EINVAL);
10366
10367			if (dp == NULL)
10368				return (EINVAL);
10369
10370			state->dts_speculates = 1;
10371			break;
10372
		case DTRACEACT_PRINTM:
		case DTRACEACT_PRINTT:
			if (dp == NULL)
				return (EINVAL);

			size = dp->dtdo_rtype.dtdt_size;
			break;
10380
10381		case DTRACEACT_COMMIT: {
10382			dtrace_action_t *act = ecb->dte_action;
10383
10384			for (; act != NULL; act = act->dta_next) {
10385				if (act->dta_kind == DTRACEACT_COMMIT)
10386					return (EINVAL);
10387			}
10388
10389			if (dp == NULL)
10390				return (EINVAL);
10391			break;
10392		}
10393
10394		default:
10395			return (EINVAL);
10396		}
10397
10398		if (size != 0 || desc->dtad_kind == DTRACEACT_SPECULATE) {
10399			/*
10400			 * If this is a data-storing action or a speculate,
10401			 * we must be sure that there isn't a commit on the
10402			 * action chain.
10403			 */
10404			dtrace_action_t *act = ecb->dte_action;
10405
10406			for (; act != NULL; act = act->dta_next) {
10407				if (act->dta_kind == DTRACEACT_COMMIT)
10408					return (EINVAL);
10409			}
10410		}
10411
10412		action = kmem_zalloc(sizeof (dtrace_action_t), KM_SLEEP);
10413		action->dta_rec.dtrd_size = size;
10414	}
10415
10416	action->dta_refcnt = 1;
10417	rec = &action->dta_rec;
10418	size = rec->dtrd_size;
10419
10420	for (mask = sizeof (uint64_t) - 1; size != 0 && mask > 0; mask >>= 1) {
10421		if (!(size & mask)) {
10422			align = mask + 1;
10423			break;
10424		}
10425	}
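	/*
	 * The loop above derives the record's alignment from its size:
	 * the largest power of two, up to eight bytes, that evenly
	 * divides the size.  For example, a 12-byte record yields
	 * 4-byte alignment (12 & 7 != 0 but 12 & 3 == 0), while a
	 * 24-byte record yields 8-byte alignment (24 & 7 == 0).
	 */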
10426
10427	action->dta_kind = desc->dtad_kind;
10428
10429	if ((action->dta_difo = dp) != NULL)
10430		dtrace_difo_hold(dp);
10431
10432	rec->dtrd_action = action->dta_kind;
10433	rec->dtrd_arg = arg;
10434	rec->dtrd_uarg = desc->dtad_uarg;
10435	rec->dtrd_alignment = (uint16_t)align;
10436	rec->dtrd_format = format;
10437
10438	if ((last = ecb->dte_action_last) != NULL) {
10439		ASSERT(ecb->dte_action != NULL);
10440		action->dta_prev = last;
10441		last->dta_next = action;
10442	} else {
10443		ASSERT(ecb->dte_action == NULL);
10444		ecb->dte_action = action;
10445	}
10446
10447	ecb->dte_action_last = action;
10448
10449	return (0);
10450}
10451
10452static void
10453dtrace_ecb_action_remove(dtrace_ecb_t *ecb)
10454{
10455	dtrace_action_t *act = ecb->dte_action, *next;
10456	dtrace_vstate_t *vstate = &ecb->dte_state->dts_vstate;
10457	dtrace_difo_t *dp;
10458	uint16_t format;
10459
10460	if (act != NULL && act->dta_refcnt > 1) {
10461		ASSERT(act->dta_next == NULL || act->dta_next->dta_refcnt == 1);
10462		act->dta_refcnt--;
10463	} else {
10464		for (; act != NULL; act = next) {
10465			next = act->dta_next;
10466			ASSERT(next != NULL || act == ecb->dte_action_last);
10467			ASSERT(act->dta_refcnt == 1);
10468
10469			if ((format = act->dta_rec.dtrd_format) != 0)
10470				dtrace_format_remove(ecb->dte_state, format);
10471
10472			if ((dp = act->dta_difo) != NULL)
10473				dtrace_difo_release(dp, vstate);
10474
10475			if (DTRACEACT_ISAGG(act->dta_kind)) {
10476				dtrace_ecb_aggregation_destroy(ecb, act);
10477			} else {
10478				kmem_free(act, sizeof (dtrace_action_t));
10479			}
10480		}
10481	}
10482
10483	ecb->dte_action = NULL;
10484	ecb->dte_action_last = NULL;
10485	ecb->dte_size = 0;
10486}
10487
10488static void
10489dtrace_ecb_disable(dtrace_ecb_t *ecb)
10490{
10491	/*
10492	 * We disable the ECB by removing it from its probe.
10493	 */
10494	dtrace_ecb_t *pecb, *prev = NULL;
10495	dtrace_probe_t *probe = ecb->dte_probe;
10496
10497	ASSERT(MUTEX_HELD(&dtrace_lock));
10498
10499	if (probe == NULL) {
10500		/*
10501		 * This is the NULL probe; there is nothing to disable.
10502		 */
10503		return;
10504	}
10505
10506	for (pecb = probe->dtpr_ecb; pecb != NULL; pecb = pecb->dte_next) {
10507		if (pecb == ecb)
10508			break;
10509		prev = pecb;
10510	}
10511
10512	ASSERT(pecb != NULL);
10513
10514	if (prev == NULL) {
10515		probe->dtpr_ecb = ecb->dte_next;
10516	} else {
10517		prev->dte_next = ecb->dte_next;
10518	}
10519
10520	if (ecb == probe->dtpr_ecb_last) {
10521		ASSERT(ecb->dte_next == NULL);
10522		probe->dtpr_ecb_last = prev;
10523	}
10524
10525	/*
10526	 * The ECB has been disconnected from the probe; now sync to assure
10527	 * that all CPUs have seen the change before returning.
10528	 */
10529	dtrace_sync();
10530
10531	if (probe->dtpr_ecb == NULL) {
10532		/*
10533		 * That was the last ECB on the probe; clear the predicate
10534		 * cache ID for the probe, disable it and sync one more time
10535		 * to assure that we'll never hit it again.
10536		 */
10537		dtrace_provider_t *prov = probe->dtpr_provider;
10538
10539		ASSERT(ecb->dte_next == NULL);
10540		ASSERT(probe->dtpr_ecb_last == NULL);
10541		probe->dtpr_predcache = DTRACE_CACHEIDNONE;
10542		prov->dtpv_pops.dtps_disable(prov->dtpv_arg,
10543		    probe->dtpr_id, probe->dtpr_arg);
10544		dtrace_sync();
10545	} else {
10546		/*
10547		 * There is at least one ECB remaining on the probe.  If there
10548		 * is _exactly_ one, set the probe's predicate cache ID to be
10549		 * the predicate cache ID of the remaining ECB.
10550		 */
10551		ASSERT(probe->dtpr_ecb_last != NULL);
10552		ASSERT(probe->dtpr_predcache == DTRACE_CACHEIDNONE);
10553
10554		if (probe->dtpr_ecb == probe->dtpr_ecb_last) {
10555			dtrace_predicate_t *p = probe->dtpr_ecb->dte_predicate;
10556
10557			ASSERT(probe->dtpr_ecb->dte_next == NULL);
10558
10559			if (p != NULL)
10560				probe->dtpr_predcache = p->dtp_cacheid;
10561		}
10562
10563		ecb->dte_next = NULL;
10564	}
10565}
10566
10567static void
10568dtrace_ecb_destroy(dtrace_ecb_t *ecb)
10569{
10570	dtrace_state_t *state = ecb->dte_state;
10571	dtrace_vstate_t *vstate = &state->dts_vstate;
10572	dtrace_predicate_t *pred;
10573	dtrace_epid_t epid = ecb->dte_epid;
10574
10575	ASSERT(MUTEX_HELD(&dtrace_lock));
10576	ASSERT(ecb->dte_next == NULL);
10577	ASSERT(ecb->dte_probe == NULL || ecb->dte_probe->dtpr_ecb != ecb);
10578
10579	if ((pred = ecb->dte_predicate) != NULL)
10580		dtrace_predicate_release(pred, vstate);
10581
10582	dtrace_ecb_action_remove(ecb);
10583
10584	ASSERT(state->dts_ecbs[epid - 1] == ecb);
10585	state->dts_ecbs[epid - 1] = NULL;
10586
10587	kmem_free(ecb, sizeof (dtrace_ecb_t));
10588}
10589
10590static dtrace_ecb_t *
10591dtrace_ecb_create(dtrace_state_t *state, dtrace_probe_t *probe,
10592    dtrace_enabling_t *enab)
10593{
10594	dtrace_ecb_t *ecb;
10595	dtrace_predicate_t *pred;
10596	dtrace_actdesc_t *act;
10597	dtrace_provider_t *prov;
10598	dtrace_ecbdesc_t *desc = enab->dten_current;
10599
10600	ASSERT(MUTEX_HELD(&dtrace_lock));
10601	ASSERT(state != NULL);
10602
10603	ecb = dtrace_ecb_add(state, probe);
10604	ecb->dte_uarg = desc->dted_uarg;
10605
10606	if ((pred = desc->dted_pred.dtpdd_predicate) != NULL) {
10607		dtrace_predicate_hold(pred);
10608		ecb->dte_predicate = pred;
10609	}
10610
10611	if (probe != NULL) {
10612		/*
10613		 * If the provider shows more leg than the consumer is old
10614		 * enough to see, we need to enable the appropriate implicit
10615		 * predicate bits to prevent the ecb from activating at
10616		 * revealing times.
10617		 *
10618		 * Providers specifying DTRACE_PRIV_USER at register time
10619		 * are stating that they need the /proc-style privilege
10620		 * model to be enforced, and this is what DTRACE_COND_OWNER
10621		 * and DTRACE_COND_ZONEOWNER will then do at probe time.
10622		 */
10623		prov = probe->dtpr_provider;
10624		if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLPROC) &&
10625		    (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
10626			ecb->dte_cond |= DTRACE_COND_OWNER;
10627
10628		if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLZONE) &&
10629		    (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
10630			ecb->dte_cond |= DTRACE_COND_ZONEOWNER;
10631
10632		/*
10633		 * If the provider shows us kernel innards and the user
10634		 * is lacking sufficient privilege, enable the
10635		 * DTRACE_COND_USERMODE implicit predicate.
10636		 */
10637		if (!(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) &&
10638		    (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_KERNEL))
10639			ecb->dte_cond |= DTRACE_COND_USERMODE;
10640	}
10641
10642	if (dtrace_ecb_create_cache != NULL) {
10643		/*
10644		 * If we have a cached ecb, we'll use its action list instead
10645		 * of creating our own (saving both time and space).
10646		 */
10647		dtrace_ecb_t *cached = dtrace_ecb_create_cache;
10648		dtrace_action_t *act = cached->dte_action;
10649
10650		if (act != NULL) {
10651			ASSERT(act->dta_refcnt > 0);
10652			act->dta_refcnt++;
10653			ecb->dte_action = act;
10654			ecb->dte_action_last = cached->dte_action_last;
10655			ecb->dte_needed = cached->dte_needed;
10656			ecb->dte_size = cached->dte_size;
10657			ecb->dte_alignment = cached->dte_alignment;
10658		}
10659
10660		return (ecb);
10661	}
10662
10663	for (act = desc->dted_action; act != NULL; act = act->dtad_next) {
10664		if ((enab->dten_error = dtrace_ecb_action_add(ecb, act)) != 0) {
10665			dtrace_ecb_destroy(ecb);
10666			return (NULL);
10667		}
10668	}
10669
10670	dtrace_ecb_resize(ecb);
10671
10672	return (dtrace_ecb_create_cache = ecb);
10673}
10674
10675static int
10676dtrace_ecb_create_enable(dtrace_probe_t *probe, void *arg)
10677{
10678	dtrace_ecb_t *ecb;
10679	dtrace_enabling_t *enab = arg;
10680	dtrace_state_t *state = enab->dten_vstate->dtvs_state;
10681
10682	ASSERT(state != NULL);
10683
10684	if (probe != NULL && probe->dtpr_gen < enab->dten_probegen) {
10685		/*
10686		 * This probe was created in a generation for which this
10687		 * enabling has previously created ECBs; we don't want to
10688		 * enable it again, so just kick out.
10689		 */
10690		return (DTRACE_MATCH_NEXT);
10691	}
10692
10693	if ((ecb = dtrace_ecb_create(state, probe, enab)) == NULL)
10694		return (DTRACE_MATCH_DONE);
10695
10696	dtrace_ecb_enable(ecb);
10697	return (DTRACE_MATCH_NEXT);
10698}
10699
10700static dtrace_ecb_t *
10701dtrace_epid2ecb(dtrace_state_t *state, dtrace_epid_t id)
10702{
10703	dtrace_ecb_t *ecb;
10704
10705	ASSERT(MUTEX_HELD(&dtrace_lock));
10706
10707	if (id == 0 || id > state->dts_necbs)
10708		return (NULL);
10709
10710	ASSERT(state->dts_necbs > 0 && state->dts_ecbs != NULL);
10711	ASSERT((ecb = state->dts_ecbs[id - 1]) == NULL || ecb->dte_epid == id);
10712
10713	return (state->dts_ecbs[id - 1]);
10714}
10715
10716static dtrace_aggregation_t *
10717dtrace_aggid2agg(dtrace_state_t *state, dtrace_aggid_t id)
10718{
10719	dtrace_aggregation_t *agg;
10720
10721	ASSERT(MUTEX_HELD(&dtrace_lock));
10722
10723	if (id == 0 || id > state->dts_naggregations)
10724		return (NULL);
10725
10726	ASSERT(state->dts_naggregations > 0 && state->dts_aggregations != NULL);
10727	ASSERT((agg = state->dts_aggregations[id - 1]) == NULL ||
10728	    agg->dtag_id == id);
10729
10730	return (state->dts_aggregations[id - 1]);
10731}
10732
10733/*
10734 * DTrace Buffer Functions
10735 *
10736 * The following functions manipulate DTrace buffers.  Most of these functions
10737 * are called in the context of establishing or processing consumer state;
10738 * exceptions are explicitly noted.
10739 */
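/*
 * Each per-CPU buffer is really a pair:  dtb_tomax is the buffer that
 * probe context is currently writing into, and dtb_xamot ("tomax"
 * reversed) is its twin.  For switchable buffers the two are exchanged
 * wholesale by dtrace_buffer_switch(); ring buffers instead press the
 * dtb_xamot_offset field into service as the wrapped offset (see
 * dtrace_buffer_reserve(), below).
 */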
10740
10741/*
10742 * Note:  called from cross call context.  This function switches the two
10743 * buffers on a given CPU.  The atomicity of this operation is assured by
10744 * disabling interrupts while the actual switch takes place; the disabling of
10745 * interrupts serializes the execution with any execution of dtrace_probe() on
10746 * the same CPU.
10747 */
10748static void
10749dtrace_buffer_switch(dtrace_buffer_t *buf)
10750{
10751	caddr_t tomax = buf->dtb_tomax;
10752	caddr_t xamot = buf->dtb_xamot;
10753	dtrace_icookie_t cookie;
10754	hrtime_t now;
10755
10756	ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
10757	ASSERT(!(buf->dtb_flags & DTRACEBUF_RING));
10758
10759	cookie = dtrace_interrupt_disable();
10760	now = dtrace_gethrtime();
10761	buf->dtb_tomax = xamot;
10762	buf->dtb_xamot = tomax;
10763	buf->dtb_xamot_drops = buf->dtb_drops;
10764	buf->dtb_xamot_offset = buf->dtb_offset;
10765	buf->dtb_xamot_errors = buf->dtb_errors;
10766	buf->dtb_xamot_flags = buf->dtb_flags;
10767	buf->dtb_offset = 0;
10768	buf->dtb_drops = 0;
10769	buf->dtb_errors = 0;
10770	buf->dtb_flags &= ~(DTRACEBUF_ERROR | DTRACEBUF_DROPPED);
10771	buf->dtb_interval = now - buf->dtb_switched;
10772	buf->dtb_switched = now;
10773	dtrace_interrupt_enable(cookie);
10774}
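/*
 * In outline, a consumer-context caller switches a given CPU's buffer
 * by dispatching this function to that CPU as a cross call (cpu here
 * is the target CPU's ID; this sketch follows the pattern used by the
 * BUFSNAP ioctl path):
 *
 *	buf = &state->dts_buffer[cpu];
 *	dtrace_xcall(cpu, (dtrace_xcall_t)dtrace_buffer_switch, buf);
 */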
10775
10776/*
10777 * Note:  called from cross call context.  This function activates a buffer
10778 * on a CPU.  As with dtrace_buffer_switch(), the atomicity of the operation
10779 * is guaranteed by the disabling of interrupts.
10780 */
10781static void
10782dtrace_buffer_activate(dtrace_state_t *state)
10783{
10784	dtrace_buffer_t *buf;
10785	dtrace_icookie_t cookie = dtrace_interrupt_disable();
10786
10787	buf = &state->dts_buffer[curcpu];
10788
10789	if (buf->dtb_tomax != NULL) {
10790		/*
10791		 * We might like to assert that the buffer is marked inactive,
10792		 * but this isn't necessarily true:  the buffer for the CPU
10793		 * that processes the BEGIN probe has its buffer activated
10794		 * manually.  In this case, we take the (harmless) action
		 * manually.  In this case, we take the (harmless) action of
		 * re-clearing the INACTIVE bit.
10797		buf->dtb_flags &= ~DTRACEBUF_INACTIVE;
10798	}
10799
10800	dtrace_interrupt_enable(cookie);
10801}
10802
10803static int
10804dtrace_buffer_alloc(dtrace_buffer_t *bufs, size_t size, int flags,
10805    processorid_t cpu)
10806{
10807#if defined(sun)
10808	cpu_t *cp;
10809#endif
10810	dtrace_buffer_t *buf;
10811
10812#if defined(sun)
10813	ASSERT(MUTEX_HELD(&cpu_lock));
10814	ASSERT(MUTEX_HELD(&dtrace_lock));
10815
10816	if (size > dtrace_nonroot_maxsize &&
10817	    !PRIV_POLICY_CHOICE(CRED(), PRIV_ALL, B_FALSE))
10818		return (EFBIG);
10819
10820	cp = cpu_list;
10821
10822	do {
10823		if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
10824			continue;
10825
10826		buf = &bufs[cp->cpu_id];
10827
10828		/*
10829		 * If there is already a buffer allocated for this CPU, it
		 * is only possible that this is a DR event.  In this case,
		 * the buffer size must match our specified size.
		 */
10832		if (buf->dtb_tomax != NULL) {
10833			ASSERT(buf->dtb_size == size);
10834			continue;
10835		}
10836
10837		ASSERT(buf->dtb_xamot == NULL);
10838
10839		if ((buf->dtb_tomax = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
10840			goto err;
10841
10842		buf->dtb_size = size;
10843		buf->dtb_flags = flags;
10844		buf->dtb_offset = 0;
10845		buf->dtb_drops = 0;
10846
10847		if (flags & DTRACEBUF_NOSWITCH)
10848			continue;
10849
10850		if ((buf->dtb_xamot = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
10851			goto err;
10852	} while ((cp = cp->cpu_next) != cpu_list);
10853
10854	return (0);
10855
10856err:
10857	cp = cpu_list;
10858
10859	do {
10860		if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
10861			continue;
10862
10863		buf = &bufs[cp->cpu_id];
10864
10865		if (buf->dtb_xamot != NULL) {
10866			ASSERT(buf->dtb_tomax != NULL);
10867			ASSERT(buf->dtb_size == size);
10868			kmem_free(buf->dtb_xamot, size);
10869		}
10870
10871		if (buf->dtb_tomax != NULL) {
10872			ASSERT(buf->dtb_size == size);
10873			kmem_free(buf->dtb_tomax, size);
10874		}
10875
10876		buf->dtb_tomax = NULL;
10877		buf->dtb_xamot = NULL;
10878		buf->dtb_size = 0;
10879	} while ((cp = cp->cpu_next) != cpu_list);
10880
10881	return (ENOMEM);
10882#else
10883	int i;
10884
10885#if defined(__amd64__)
10886	/*
10887	 * FreeBSD isn't good at limiting the amount of memory we
10888	 * ask to malloc, so let's place a limit here before trying
10889	 * to do something that might well end in tears at bedtime.
10890	 */
10891	if (size > physmem * PAGE_SIZE / (128 * (mp_maxid + 1)))
		return (ENOMEM);
10893#endif
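	/*
	 * As an illustration of the cap above:  with 8GB of physical
	 * memory and eight CPUs (mp_maxid + 1 == 8), each per-CPU
	 * buffer is limited to 8GB / (128 * 8), or 8MB.
	 */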
10894
10895	ASSERT(MUTEX_HELD(&dtrace_lock));
10896	CPU_FOREACH(i) {
10897		if (cpu != DTRACE_CPUALL && cpu != i)
10898			continue;
10899
10900		buf = &bufs[i];
10901
10902		/*
10903		 * If there is already a buffer allocated for this CPU, it
10904		 * is only possible that this is a DR event.  In this case,
10905		 * the buffer size must match our specified size.
10906		 */
10907		if (buf->dtb_tomax != NULL) {
10908			ASSERT(buf->dtb_size == size);
10909			continue;
10910		}
10911
10912		ASSERT(buf->dtb_xamot == NULL);
10913
10914		if ((buf->dtb_tomax = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
10915			goto err;
10916
10917		buf->dtb_size = size;
10918		buf->dtb_flags = flags;
10919		buf->dtb_offset = 0;
10920		buf->dtb_drops = 0;
10921
10922		if (flags & DTRACEBUF_NOSWITCH)
10923			continue;
10924
10925		if ((buf->dtb_xamot = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
10926			goto err;
10927	}
10928
10929	return (0);
10930
10931err:
10932	/*
10933	 * Error allocating memory, so free the buffers that were
10934	 * allocated before the failed allocation.
10935	 */
10936	CPU_FOREACH(i) {
10937		if (cpu != DTRACE_CPUALL && cpu != i)
10938			continue;
10939
10940		buf = &bufs[i];
10941
10942		if (buf->dtb_xamot != NULL) {
10943			ASSERT(buf->dtb_tomax != NULL);
10944			ASSERT(buf->dtb_size == size);
10945			kmem_free(buf->dtb_xamot, size);
10946		}
10947
10948		if (buf->dtb_tomax != NULL) {
10949			ASSERT(buf->dtb_size == size);
10950			kmem_free(buf->dtb_tomax, size);
10951		}
10952
10953		buf->dtb_tomax = NULL;
10954		buf->dtb_xamot = NULL;
10955		buf->dtb_size = 0;
10956
10957	}
10958
10959	return (ENOMEM);
10960#endif
10961}
10962
10963/*
10964 * Note:  called from probe context.  This function just increments the drop
10965 * count on a buffer.  It has been made a function to allow for the
10966 * possibility of understanding the source of mysterious drop counts.  (A
10967 * problem for which one may be particularly disappointed that DTrace cannot
10968 * be used to understand DTrace.)
10969 */
10970static void
10971dtrace_buffer_drop(dtrace_buffer_t *buf)
10972{
10973	buf->dtb_drops++;
10974}
10975
10976/*
10977 * Note:  called from probe context.  This function is called to reserve space
10978 * in a buffer.  If mstate is non-NULL, sets the scratch base and size in the
10979 * mstate.  Returns the new offset in the buffer, or a negative value if an
10980 * error has occurred.
10981 */
10982static intptr_t
10983dtrace_buffer_reserve(dtrace_buffer_t *buf, size_t needed, size_t align,
10984    dtrace_state_t *state, dtrace_mstate_t *mstate)
10985{
10986	intptr_t offs = buf->dtb_offset, soffs;
10987	intptr_t woffs;
10988	caddr_t tomax;
10989	size_t total;
10990
10991	if (buf->dtb_flags & DTRACEBUF_INACTIVE)
10992		return (-1);
10993
10994	if ((tomax = buf->dtb_tomax) == NULL) {
10995		dtrace_buffer_drop(buf);
10996		return (-1);
10997	}
10998
10999	if (!(buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL))) {
11000		while (offs & (align - 1)) {
11001			/*
11002			 * Assert that our alignment is off by a number which
11003			 * is itself sizeof (uint32_t) aligned.
11004			 */
11005			ASSERT(!((align - (offs & (align - 1))) &
11006			    (sizeof (uint32_t) - 1)));
11007			DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
11008			offs += sizeof (uint32_t);
11009		}
11010
11011		if ((soffs = offs + needed) > buf->dtb_size) {
11012			dtrace_buffer_drop(buf);
11013			return (-1);
11014		}
11015
11016		if (mstate == NULL)
11017			return (offs);
11018
11019		mstate->dtms_scratch_base = (uintptr_t)tomax + soffs;
11020		mstate->dtms_scratch_size = buf->dtb_size - soffs;
11021		mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
11022
11023		return (offs);
11024	}
11025
11026	if (buf->dtb_flags & DTRACEBUF_FILL) {
11027		if (state->dts_activity != DTRACE_ACTIVITY_COOLDOWN &&
11028		    (buf->dtb_flags & DTRACEBUF_FULL))
11029			return (-1);
11030		goto out;
11031	}
11032
11033	total = needed + (offs & (align - 1));
11034
11035	/*
11036	 * For a ring buffer, life is quite a bit more complicated.  Before
11037	 * we can store any padding, we need to adjust our wrapping offset.
11038	 * (If we've never before wrapped or we're not about to, no adjustment
11039	 * is required.)
11040	 */
11041	if ((buf->dtb_flags & DTRACEBUF_WRAPPED) ||
11042	    offs + total > buf->dtb_size) {
11043		woffs = buf->dtb_xamot_offset;
11044
11045		if (offs + total > buf->dtb_size) {
11046			/*
11047			 * We can't fit in the end of the buffer.  First, a
11048			 * sanity check that we can fit in the buffer at all.
11049			 */
11050			if (total > buf->dtb_size) {
11051				dtrace_buffer_drop(buf);
11052				return (-1);
11053			}
11054
11055			/*
11056			 * We're going to be storing at the top of the buffer,
11057			 * so now we need to deal with the wrapped offset.  We
11058			 * only reset our wrapped offset to 0 if it is
11059			 * currently greater than the current offset.  If it
11060			 * is less than the current offset, it is because a
11061			 * previous allocation induced a wrap -- but the
11062			 * allocation didn't subsequently take the space due
11063			 * to an error or false predicate evaluation.  In this
11064			 * case, we'll just leave the wrapped offset alone: if
11065			 * the wrapped offset hasn't been advanced far enough
11066			 * for this allocation, it will be adjusted in the
11067			 * lower loop.
11068			 */
11069			if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
11070				if (woffs >= offs)
11071					woffs = 0;
11072			} else {
11073				woffs = 0;
11074			}
11075
11076			/*
11077			 * Now we know that we're going to be storing to the
11078			 * top of the buffer and that there is room for us
11079			 * there.  We need to clear the buffer from the current
11080			 * offset to the end (there may be old gunk there).
11081			 */
11082			while (offs < buf->dtb_size)
11083				tomax[offs++] = 0;
11084
11085			/*
11086			 * We need to set our offset to zero.  And because we
11087			 * are wrapping, we need to set the bit indicating as
11088			 * much.  We can also adjust our needed space back
11089			 * down to the space required by the ECB -- we know
11090			 * that the top of the buffer is aligned.
11091			 */
11092			offs = 0;
11093			total = needed;
11094			buf->dtb_flags |= DTRACEBUF_WRAPPED;
11095		} else {
11096			/*
11097			 * There is room for us in the buffer, so we simply
11098			 * need to check the wrapped offset.
11099			 */
11100			if (woffs < offs) {
11101				/*
11102				 * The wrapped offset is less than the offset.
11103				 * This can happen if we allocated buffer space
11104				 * that induced a wrap, but then we didn't
11105				 * subsequently take the space due to an error
11106				 * or false predicate evaluation.  This is
11107				 * okay; we know that _this_ allocation isn't
11108				 * going to induce a wrap.  We still can't
11109				 * reset the wrapped offset to be zero,
11110				 * however: the space may have been trashed in
11111				 * the previous failed probe attempt.  But at
11112				 * least the wrapped offset doesn't need to
11113				 * be adjusted at all...
11114				 */
11115				goto out;
11116			}
11117		}
11118
11119		while (offs + total > woffs) {
11120			dtrace_epid_t epid = *(uint32_t *)(tomax + woffs);
11121			size_t size;
11122
11123			if (epid == DTRACE_EPIDNONE) {
11124				size = sizeof (uint32_t);
11125			} else {
11126				ASSERT3U(epid, <=, state->dts_necbs);
11127				ASSERT(state->dts_ecbs[epid - 1] != NULL);
11128
11129				size = state->dts_ecbs[epid - 1]->dte_size;
11130			}
11131
11132			ASSERT(woffs + size <= buf->dtb_size);
11133			ASSERT(size != 0);
11134
11135			if (woffs + size == buf->dtb_size) {
11136				/*
11137				 * We've reached the end of the buffer; we want
11138				 * to set the wrapped offset to 0 and break
11139				 * out.  However, if the offs is 0, then we're
11140				 * in a strange edge-condition:  the amount of
11141				 * space that we want to reserve plus the size
11142				 * of the record that we're overwriting is
11143				 * greater than the size of the buffer.  This
11144				 * is problematic because if we reserve the
11145				 * space but subsequently don't consume it (due
11146				 * to a failed predicate or error) the wrapped
11147				 * offset will be 0 -- yet the EPID at offset 0
11148				 * will not be committed.  This situation is
11149				 * relatively easy to deal with:  if we're in
11150				 * this case, the buffer is indistinguishable
11151				 * from one that hasn't wrapped; we need only
11152				 * finish the job by clearing the wrapped bit,
11153				 * explicitly setting the offset to be 0, and
11154				 * zero'ing out the old data in the buffer.
11155				 */
11156				if (offs == 0) {
11157					buf->dtb_flags &= ~DTRACEBUF_WRAPPED;
11158					buf->dtb_offset = 0;
11159					woffs = total;
11160
11161					while (woffs < buf->dtb_size)
11162						tomax[woffs++] = 0;
11163				}
11164
11165				woffs = 0;
11166				break;
11167			}
11168
11169			woffs += size;
11170		}
11171
11172		/*
11173		 * We have a wrapped offset.  It may be that the wrapped offset
11174		 * has become zero -- that's okay.
11175		 */
11176		buf->dtb_xamot_offset = woffs;
11177	}
11178
11179out:
11180	/*
11181	 * Now we can plow the buffer with any necessary padding.
11182	 */
11183	while (offs & (align - 1)) {
11184		/*
11185		 * Assert that our alignment is off by a number which
11186		 * is itself sizeof (uint32_t) aligned.
11187		 */
11188		ASSERT(!((align - (offs & (align - 1))) &
11189		    (sizeof (uint32_t) - 1)));
11190		DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
11191		offs += sizeof (uint32_t);
11192	}
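	/*
	 * For example (illustrative):  with offs at 12 and an 8-byte
	 * alignment requirement, a single DTRACE_EPIDNONE word is stored
	 * and offs advances to 16.  Consumers recognize EPIDNONE records
	 * as padding and skip over them.
	 */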
11193
11194	if (buf->dtb_flags & DTRACEBUF_FILL) {
11195		if (offs + needed > buf->dtb_size - state->dts_reserve) {
11196			buf->dtb_flags |= DTRACEBUF_FULL;
11197			return (-1);
11198		}
11199	}
11200
11201	if (mstate == NULL)
11202		return (offs);
11203
11204	/*
11205	 * For ring buffers and fill buffers, the scratch space is always
11206	 * the inactive buffer.
11207	 */
11208	mstate->dtms_scratch_base = (uintptr_t)buf->dtb_xamot;
11209	mstate->dtms_scratch_size = buf->dtb_size;
11210	mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
11211
11212	return (offs);
11213}
11214
11215static void
11216dtrace_buffer_polish(dtrace_buffer_t *buf)
11217{
11218	ASSERT(buf->dtb_flags & DTRACEBUF_RING);
11219	ASSERT(MUTEX_HELD(&dtrace_lock));
11220
11221	if (!(buf->dtb_flags & DTRACEBUF_WRAPPED))
11222		return;
11223
11224	/*
11225	 * We need to polish the ring buffer.  There are three cases:
11226	 *
11227	 * - The first (and presumably most common) is that there is no gap
11228	 *   between the buffer offset and the wrapped offset.  In this case,
11229	 *   there is nothing in the buffer that isn't valid data; we can
11230	 *   mark the buffer as polished and return.
11231	 *
11232	 * - The second (less common than the first but still more common
11233	 *   than the third) is that there is a gap between the buffer offset
11234	 *   and the wrapped offset, and the wrapped offset is larger than the
11235	 *   buffer offset.  This can happen because of an alignment issue, or
11236	 *   can happen because of a call to dtrace_buffer_reserve() that
11237	 *   didn't subsequently consume the buffer space.  In this case,
11238	 *   we need to zero the data from the buffer offset to the wrapped
11239	 *   offset.
11240	 *
11241	 * - The third (and least common) is that there is a gap between the
11242	 *   buffer offset and the wrapped offset, but the wrapped offset is
11243	 *   _less_ than the buffer offset.  This can only happen because a
11244	 *   call to dtrace_buffer_reserve() induced a wrap, but the space
11245	 *   was not subsequently consumed.  In this case, we need to zero the
11246	 *   space from the offset to the end of the buffer _and_ from the
11247	 *   top of the buffer to the wrapped offset.
11248	 */
11249	if (buf->dtb_offset < buf->dtb_xamot_offset) {
11250		bzero(buf->dtb_tomax + buf->dtb_offset,
11251		    buf->dtb_xamot_offset - buf->dtb_offset);
11252	}
11253
11254	if (buf->dtb_offset > buf->dtb_xamot_offset) {
11255		bzero(buf->dtb_tomax + buf->dtb_offset,
11256		    buf->dtb_size - buf->dtb_offset);
11257		bzero(buf->dtb_tomax, buf->dtb_xamot_offset);
11258	}
11259}
11260
11261/*
11262 * This routine determines if data generated at the specified time has likely
11263 * been entirely consumed at user-level.  This routine is called to determine
11264 * if an ECB on a defunct probe (but for an active enabling) can be safely
11265 * disabled and destroyed.
11266 */
11267static int
11268dtrace_buffer_consumed(dtrace_buffer_t *bufs, hrtime_t when)
11269{
11270	int i;
11271
11272	for (i = 0; i < NCPU; i++) {
11273		dtrace_buffer_t *buf = &bufs[i];
11274
11275		if (buf->dtb_size == 0)
11276			continue;
11277
11278		if (buf->dtb_flags & DTRACEBUF_RING)
11279			return (0);
11280
11281		if (!buf->dtb_switched && buf->dtb_offset != 0)
11282			return (0);
11283
11284		if (buf->dtb_switched - buf->dtb_interval < when)
11285			return (0);
11286	}
11287
11288	return (1);
11289}
11290
11291static void
11292dtrace_buffer_free(dtrace_buffer_t *bufs)
11293{
11294	int i;
11295
11296	for (i = 0; i < NCPU; i++) {
11297		dtrace_buffer_t *buf = &bufs[i];
11298
11299		if (buf->dtb_tomax == NULL) {
11300			ASSERT(buf->dtb_xamot == NULL);
11301			ASSERT(buf->dtb_size == 0);
11302			continue;
11303		}
11304
11305		if (buf->dtb_xamot != NULL) {
11306			ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
11307			kmem_free(buf->dtb_xamot, buf->dtb_size);
11308		}
11309
11310		kmem_free(buf->dtb_tomax, buf->dtb_size);
11311		buf->dtb_size = 0;
11312		buf->dtb_tomax = NULL;
11313		buf->dtb_xamot = NULL;
11314	}
11315}
11316
11317/*
11318 * DTrace Enabling Functions
11319 */
11320static dtrace_enabling_t *
11321dtrace_enabling_create(dtrace_vstate_t *vstate)
11322{
11323	dtrace_enabling_t *enab;
11324
11325	enab = kmem_zalloc(sizeof (dtrace_enabling_t), KM_SLEEP);
11326	enab->dten_vstate = vstate;
11327
11328	return (enab);
11329}
11330
11331static void
11332dtrace_enabling_add(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb)
11333{
11334	dtrace_ecbdesc_t **ndesc;
11335	size_t osize, nsize;
11336
11337	/*
11338	 * We can't add to enablings after we've enabled them, or after we've
11339	 * retained them.
11340	 */
11341	ASSERT(enab->dten_probegen == 0);
11342	ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
11343
11344	if (enab->dten_ndesc < enab->dten_maxdesc) {
11345		enab->dten_desc[enab->dten_ndesc++] = ecb;
11346		return;
11347	}
11348
	osize = enab->dten_maxdesc * sizeof (dtrace_ecbdesc_t *);
11350
11351	if (enab->dten_maxdesc == 0) {
11352		enab->dten_maxdesc = 1;
11353	} else {
11354		enab->dten_maxdesc <<= 1;
11355	}
11356
11357	ASSERT(enab->dten_ndesc < enab->dten_maxdesc);
11358
	nsize = enab->dten_maxdesc * sizeof (dtrace_ecbdesc_t *);
11360	ndesc = kmem_zalloc(nsize, KM_SLEEP);
11361	bcopy(enab->dten_desc, ndesc, osize);
11362	if (enab->dten_desc != NULL)
11363		kmem_free(enab->dten_desc, osize);
11364
11365	enab->dten_desc = ndesc;
11366	enab->dten_desc[enab->dten_ndesc++] = ecb;
11367}
11368
11369static void
11370dtrace_enabling_addlike(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb,
11371    dtrace_probedesc_t *pd)
11372{
11373	dtrace_ecbdesc_t *new;
11374	dtrace_predicate_t *pred;
11375	dtrace_actdesc_t *act;
11376
11377	/*
11378	 * We're going to create a new ECB description that matches the
11379	 * specified ECB in every way, but has the specified probe description.
11380	 */
11381	new = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
11382
11383	if ((pred = ecb->dted_pred.dtpdd_predicate) != NULL)
11384		dtrace_predicate_hold(pred);
11385
11386	for (act = ecb->dted_action; act != NULL; act = act->dtad_next)
11387		dtrace_actdesc_hold(act);
11388
11389	new->dted_action = ecb->dted_action;
11390	new->dted_pred = ecb->dted_pred;
11391	new->dted_probe = *pd;
11392	new->dted_uarg = ecb->dted_uarg;
11393
11394	dtrace_enabling_add(enab, new);
11395}
11396
11397static void
11398dtrace_enabling_dump(dtrace_enabling_t *enab)
11399{
11400	int i;
11401
11402	for (i = 0; i < enab->dten_ndesc; i++) {
11403		dtrace_probedesc_t *desc = &enab->dten_desc[i]->dted_probe;
11404
11405		cmn_err(CE_NOTE, "enabling probe %d (%s:%s:%s:%s)", i,
11406		    desc->dtpd_provider, desc->dtpd_mod,
11407		    desc->dtpd_func, desc->dtpd_name);
11408	}
11409}
11410
11411static void
11412dtrace_enabling_destroy(dtrace_enabling_t *enab)
11413{
11414	int i;
11415	dtrace_ecbdesc_t *ep;
11416	dtrace_vstate_t *vstate = enab->dten_vstate;
11417
11418	ASSERT(MUTEX_HELD(&dtrace_lock));
11419
11420	for (i = 0; i < enab->dten_ndesc; i++) {
11421		dtrace_actdesc_t *act, *next;
11422		dtrace_predicate_t *pred;
11423
11424		ep = enab->dten_desc[i];
11425
11426		if ((pred = ep->dted_pred.dtpdd_predicate) != NULL)
11427			dtrace_predicate_release(pred, vstate);
11428
11429		for (act = ep->dted_action; act != NULL; act = next) {
11430			next = act->dtad_next;
11431			dtrace_actdesc_release(act, vstate);
11432		}
11433
11434		kmem_free(ep, sizeof (dtrace_ecbdesc_t));
11435	}
11436
11437	if (enab->dten_desc != NULL)
11438		kmem_free(enab->dten_desc,
		    enab->dten_maxdesc * sizeof (dtrace_ecbdesc_t *));
11440
11441	/*
11442	 * If this was a retained enabling, decrement the dts_nretained count
11443	 * and take it off of the dtrace_retained list.
11444	 */
11445	if (enab->dten_prev != NULL || enab->dten_next != NULL ||
11446	    dtrace_retained == enab) {
11447		ASSERT(enab->dten_vstate->dtvs_state != NULL);
11448		ASSERT(enab->dten_vstate->dtvs_state->dts_nretained > 0);
11449		enab->dten_vstate->dtvs_state->dts_nretained--;
11450	}
11451
11452	if (enab->dten_prev == NULL) {
11453		if (dtrace_retained == enab) {
11454			dtrace_retained = enab->dten_next;
11455
11456			if (dtrace_retained != NULL)
11457				dtrace_retained->dten_prev = NULL;
11458		}
11459	} else {
11460		ASSERT(enab != dtrace_retained);
11461		ASSERT(dtrace_retained != NULL);
11462		enab->dten_prev->dten_next = enab->dten_next;
11463	}
11464
11465	if (enab->dten_next != NULL) {
11466		ASSERT(dtrace_retained != NULL);
11467		enab->dten_next->dten_prev = enab->dten_prev;
11468	}
11469
11470	kmem_free(enab, sizeof (dtrace_enabling_t));
11471}
11472
11473static int
11474dtrace_enabling_retain(dtrace_enabling_t *enab)
11475{
11476	dtrace_state_t *state;
11477
11478	ASSERT(MUTEX_HELD(&dtrace_lock));
11479	ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
11480	ASSERT(enab->dten_vstate != NULL);
11481
11482	state = enab->dten_vstate->dtvs_state;
11483	ASSERT(state != NULL);
11484
11485	/*
11486	 * We only allow each state to retain dtrace_retain_max enablings.
11487	 */
11488	if (state->dts_nretained >= dtrace_retain_max)
11489		return (ENOSPC);
11490
11491	state->dts_nretained++;
11492
11493	if (dtrace_retained == NULL) {
11494		dtrace_retained = enab;
11495		return (0);
11496	}
11497
11498	enab->dten_next = dtrace_retained;
11499	dtrace_retained->dten_prev = enab;
11500	dtrace_retained = enab;
11501
11502	return (0);
11503}
11504
11505static int
11506dtrace_enabling_replicate(dtrace_state_t *state, dtrace_probedesc_t *match,
11507    dtrace_probedesc_t *create)
11508{
11509	dtrace_enabling_t *new, *enab;
11510	int found = 0, err = ENOENT;
11511
11512	ASSERT(MUTEX_HELD(&dtrace_lock));
11513	ASSERT(strlen(match->dtpd_provider) < DTRACE_PROVNAMELEN);
11514	ASSERT(strlen(match->dtpd_mod) < DTRACE_MODNAMELEN);
11515	ASSERT(strlen(match->dtpd_func) < DTRACE_FUNCNAMELEN);
11516	ASSERT(strlen(match->dtpd_name) < DTRACE_NAMELEN);
11517
11518	new = dtrace_enabling_create(&state->dts_vstate);
11519
11520	/*
11521	 * Iterate over all retained enablings, looking for enablings that
11522	 * match the specified state.
11523	 */
11524	for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
11525		int i;
11526
11527		/*
11528		 * dtvs_state can only be NULL for helper enablings -- and
11529		 * helper enablings can't be retained.
11530		 */
11531		ASSERT(enab->dten_vstate->dtvs_state != NULL);
11532
11533		if (enab->dten_vstate->dtvs_state != state)
11534			continue;
11535
11536		/*
11537		 * Now iterate over each probe description; we're looking for
11538		 * an exact match to the specified probe description.
11539		 */
11540		for (i = 0; i < enab->dten_ndesc; i++) {
11541			dtrace_ecbdesc_t *ep = enab->dten_desc[i];
11542			dtrace_probedesc_t *pd = &ep->dted_probe;
11543
11544			if (strcmp(pd->dtpd_provider, match->dtpd_provider))
11545				continue;
11546
11547			if (strcmp(pd->dtpd_mod, match->dtpd_mod))
11548				continue;
11549
11550			if (strcmp(pd->dtpd_func, match->dtpd_func))
11551				continue;
11552
11553			if (strcmp(pd->dtpd_name, match->dtpd_name))
11554				continue;
11555
11556			/*
11557			 * We have a winning probe!  Add it to our growing
11558			 * enabling.
11559			 */
11560			found = 1;
11561			dtrace_enabling_addlike(new, ep, create);
11562		}
11563	}
11564
11565	if (!found || (err = dtrace_enabling_retain(new)) != 0) {
11566		dtrace_enabling_destroy(new);
11567		return (err);
11568	}
11569
11570	return (0);
11571}
11572
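/*
 * Destroy all enablings retained on behalf of the specified state.
 */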
11573static void
11574dtrace_enabling_retract(dtrace_state_t *state)
11575{
11576	dtrace_enabling_t *enab, *next;
11577
11578	ASSERT(MUTEX_HELD(&dtrace_lock));
11579
11580	/*
11581	 * Iterate over all retained enablings, destroy the enablings retained
11582	 * Iterate over all retained enablings, destroying those retained
11583	 * for the specified state.
11584	for (enab = dtrace_retained; enab != NULL; enab = next) {
11585		next = enab->dten_next;
11586
11587		/*
11588		 * dtvs_state can only be NULL for helper enablings -- and
11589		 * helper enablings can't be retained.
11590		 */
11591		ASSERT(enab->dten_vstate->dtvs_state != NULL);
11592
11593		if (enab->dten_vstate->dtvs_state == state) {
11594			ASSERT(state->dts_nretained > 0);
11595			dtrace_enabling_destroy(enab);
11596		}
11597	}
11598
11599	ASSERT(state->dts_nretained == 0);
11600}
11601
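/*
 * Attempt to enable the probes for each ECB description in the specified
 * enabling, reporting the number of matched probes via 'nmatched' (if it
 * is non-NULL).
 */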
11602static int
11603dtrace_enabling_match(dtrace_enabling_t *enab, int *nmatched)
11604{
11605	int i = 0;
11606	int matched = 0;
11607
11608	ASSERT(MUTEX_HELD(&cpu_lock));
11609	ASSERT(MUTEX_HELD(&dtrace_lock));
11610
11611	for (i = 0; i < enab->dten_ndesc; i++) {
11612		dtrace_ecbdesc_t *ep = enab->dten_desc[i];
11613
11614		enab->dten_current = ep;
11615		enab->dten_error = 0;
11616
11617		matched += dtrace_probe_enable(&ep->dted_probe, enab);
11618
11619		if (enab->dten_error != 0) {
11620			/*
11621			 * If we get an error half-way through enabling the
11622			 * probes, we kick out -- perhaps with some number of
11623			 * them enabled.  Leaving enabled probes enabled may
11624			 * be slightly confusing for user-level, but we expect
11625			 * that no one will attempt to actually drive on in
11626			 * the face of such errors.  If this is an anonymous
11627			 * enabling (indicated with a NULL nmatched pointer),
11628			 * we cmn_err() a message.  We aren't expecting to
11629			 * get such an error -- such as it can exist at all,
11630			 * get such an error -- insofar as it can exist at all,
11631			 * it would be the result of corrupted DOF in the driver
11632			 */
11633			if (nmatched == NULL) {
11634				cmn_err(CE_WARN, "dtrace_enabling_match() "
11635				    "error on %p: %d", (void *)ep,
11636				    enab->dten_error);
11637			}
11638
11639			return (enab->dten_error);
11640		}
11641	}
11642
11643	enab->dten_probegen = dtrace_probegen;
11644	if (nmatched != NULL)
11645		*nmatched = matched;
11646
11647	return (0);
11648}
11649
11650static void
11651dtrace_enabling_matchall(void)
11652{
11653	dtrace_enabling_t *enab;
11654
11655	mutex_enter(&cpu_lock);
11656	mutex_enter(&dtrace_lock);
11657
11658	/*
11659	 * Iterate over all retained enablings to see if any probes match
11660	 * against them.  We only perform this operation on enablings for which
11661	 * we have sufficient permissions by virtue of being in the global zone
11662	 * or in the same zone as the DTrace client.  Because we can be called
11663	 * after dtrace_detach() has been called, we cannot assert that there
11664	 * are retained enablings.  We can safely load from dtrace_retained,
11665	 * however:  the taskq_destroy() at the end of dtrace_detach() will
11666	 * block pending our completion.
11667	 */
11668	for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
11669#if defined(sun)
11670		cred_t *cr = enab->dten_vstate->dtvs_state->dts_cred.dcr_cred;
11671
11672		if (INGLOBALZONE(curproc) || getzoneid() == crgetzoneid(cr))
11673#endif
11674			(void) dtrace_enabling_match(enab, NULL);
11675	}
11676
11677	mutex_exit(&dtrace_lock);
11678	mutex_exit(&cpu_lock);
11679}
11680
11681/*
11682 * If an enabling is to be enabled without having matched probes (that is, if
11683 * dtrace_state_go() is to be called on the underlying dtrace_state_t), the
11684 * enabling must be _primed_ by creating an ECB for every ECB description.
11685 * This must be done to assure that we know the number of speculations, the
11686 * number of aggregations, the minimum buffer size needed, etc. before we
11687 * transition out of DTRACE_ACTIVITY_INACTIVE.  To do this without actually
11688 * enabling any probes, we create ECBs for every ECB description, but with a
11689 * NULL probe -- which is exactly what this function does.
11690 */
11691static void
11692dtrace_enabling_prime(dtrace_state_t *state)
11693{
11694	dtrace_enabling_t *enab;
11695	int i;
11696
11697	for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
11698		ASSERT(enab->dten_vstate->dtvs_state != NULL);
11699
11700		if (enab->dten_vstate->dtvs_state != state)
11701			continue;
11702
11703		/*
11704		 * We don't want to prime an enabling more than once, lest
11705		 * we allow a malicious user to induce resource exhaustion.
11706		 * (The ECBs that result from priming an enabling aren't
11707		 * leaked -- but they also aren't deallocated until the
11708		 * consumer state is destroyed.)
11709		 */
11710		if (enab->dten_primed)
11711			continue;
11712
11713		for (i = 0; i < enab->dten_ndesc; i++) {
11714			enab->dten_current = enab->dten_desc[i];
11715			(void) dtrace_probe_enable(NULL, enab);
11716		}
11717
11718		enab->dten_primed = 1;
11719	}
11720}
11721
11722/*
11723 * Called to indicate that probes should be provided due to retained
11724 * enablings.  This is implemented in terms of dtrace_probe_provide(), but it
11725 * must take an initial lap through the enabling calling the dtps_provide()
11726 * entry point explicitly to allow for autocreated probes.
11727 */
11728static void
11729dtrace_enabling_provide(dtrace_provider_t *prv)
11730{
11731	int i, all = 0;
11732	dtrace_probedesc_t desc;
11733
11734	ASSERT(MUTEX_HELD(&dtrace_lock));
11735	ASSERT(MUTEX_HELD(&dtrace_provider_lock));
11736
11737	if (prv == NULL) {
11738		all = 1;
11739		prv = dtrace_provider;
11740	}
11741
11742	do {
11743		dtrace_enabling_t *enab = dtrace_retained;
11744		void *parg = prv->dtpv_arg;
11745
11746		for (; enab != NULL; enab = enab->dten_next) {
11747			for (i = 0; i < enab->dten_ndesc; i++) {
11748				desc = enab->dten_desc[i]->dted_probe;
11749				mutex_exit(&dtrace_lock);
11750				prv->dtpv_pops.dtps_provide(parg, &desc);
11751				mutex_enter(&dtrace_lock);
11752			}
11753		}
11754	} while (all && (prv = prv->dtpv_next) != NULL);
11755
11756	mutex_exit(&dtrace_lock);
11757	dtrace_probe_provide(NULL, all ? NULL : prv);
11758	mutex_enter(&dtrace_lock);
11759}
11760
11761/*
11762 * Called to reap ECBs that are attached to probes from defunct providers.
11763 */
11764static void
11765dtrace_enabling_reap(void)
11766{
11767	dtrace_provider_t *prov;
11768	dtrace_probe_t *probe;
11769	dtrace_ecb_t *ecb;
11770	hrtime_t when;
11771	int i;
11772
11773	mutex_enter(&cpu_lock);
11774	mutex_enter(&dtrace_lock);
11775
11776	for (i = 0; i < dtrace_nprobes; i++) {
11777		if ((probe = dtrace_probes[i]) == NULL)
11778			continue;
11779
11780		if (probe->dtpr_ecb == NULL)
11781			continue;
11782
11783		prov = probe->dtpr_provider;
11784
11785		if ((when = prov->dtpv_defunct) == 0)
11786			continue;
11787
11788		/*
11789		 * We have ECBs on a defunct provider:  we want to reap these
11790		 * ECBs to allow the provider to unregister.  The destruction
11791		 * of these ECBs must be done carefully:  if we destroy the ECB
11792		 * and the consumer later wishes to consume an EPID that
11793		 * corresponds to the destroyed ECB (and if the EPID metadata
11794		 * has not been previously consumed), the consumer will abort
11795		 * processing on the unknown EPID.  To reduce (but not, sadly,
11796		 * eliminate) the possibility of this, we will only destroy an
11797		 * ECB for a defunct provider if, for the state that
11798		 * corresponds to the ECB:
11799		 *
11800		 *  (a)	There is no speculative tracing (which can effectively
11801		 *	cache an EPID for an arbitrary amount of time).
11802		 *
11803		 *  (b)	The principal buffers have been switched twice since the
11804		 *	provider became defunct.
11805		 *
11806		 *  (c)	The aggregation buffers are of zero size or have been
11807		 *	switched twice since the provider became defunct.
11808		 *
11809		 * We use dts_speculates to determine (a) and call a function
11810		 * (dtrace_buffer_consumed()) to determine (b) and (c).  Note
11811		 * that as soon as we've been unable to destroy one of the ECBs
11812		 * associated with the probe, we quit trying -- reaping is only
11813		 * fruitful inasmuch as we can destroy all ECBs associated
11814		 * with the defunct provider's probes.
11815		 */
11816		while ((ecb = probe->dtpr_ecb) != NULL) {
11817			dtrace_state_t *state = ecb->dte_state;
11818			dtrace_buffer_t *buf = state->dts_buffer;
11819			dtrace_buffer_t *aggbuf = state->dts_aggbuffer;
11820
11821			if (state->dts_speculates)
11822				break;
11823
11824			if (!dtrace_buffer_consumed(buf, when))
11825				break;
11826
11827			if (!dtrace_buffer_consumed(aggbuf, when))
11828				break;
11829
11830			dtrace_ecb_disable(ecb);
11831			ASSERT(probe->dtpr_ecb != ecb);
11832			dtrace_ecb_destroy(ecb);
11833		}
11834	}
11835
11836	mutex_exit(&dtrace_lock);
11837	mutex_exit(&cpu_lock);
11838}
11839
11840/*
11841 * DTrace DOF Functions
11842 */
11843/*ARGSUSED*/
11844static void
11845dtrace_dof_error(dof_hdr_t *dof, const char *str)
11846{
11847	if (dtrace_err_verbose)
11848		cmn_err(CE_WARN, "failed to process DOF: %s", str);
11849
11850#ifdef DTRACE_ERRDEBUG
11851	dtrace_errdebug(str);
11852#endif
11853}
11854
11855/*
11856 * Create DOF out of a currently enabled state.  Right now, we only create
11857 * DOF containing the run-time options -- but this could be expanded to create
11858 * complete DOF representing the enabled state.
11859 */
11860static dof_hdr_t *
11861dtrace_dof_create(dtrace_state_t *state)
11862{
11863	dof_hdr_t *dof;
11864	dof_sec_t *sec;
11865	dof_optdesc_t *opt;
11866	int i, len = sizeof (dof_hdr_t) +
11867	    roundup(sizeof (dof_sec_t), sizeof (uint64_t)) +
11868	    sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
11869
11870	ASSERT(MUTEX_HELD(&dtrace_lock));
11871
11872	dof = kmem_zalloc(len, KM_SLEEP);
11873	dof->dofh_ident[DOF_ID_MAG0] = DOF_MAG_MAG0;
11874	dof->dofh_ident[DOF_ID_MAG1] = DOF_MAG_MAG1;
11875	dof->dofh_ident[DOF_ID_MAG2] = DOF_MAG_MAG2;
11876	dof->dofh_ident[DOF_ID_MAG3] = DOF_MAG_MAG3;
11877
11878	dof->dofh_ident[DOF_ID_MODEL] = DOF_MODEL_NATIVE;
11879	dof->dofh_ident[DOF_ID_ENCODING] = DOF_ENCODE_NATIVE;
11880	dof->dofh_ident[DOF_ID_VERSION] = DOF_VERSION;
11881	dof->dofh_ident[DOF_ID_DIFVERS] = DIF_VERSION;
11882	dof->dofh_ident[DOF_ID_DIFIREG] = DIF_DIR_NREGS;
11883	dof->dofh_ident[DOF_ID_DIFTREG] = DIF_DTR_NREGS;
11884
11885	dof->dofh_flags = 0;
11886	dof->dofh_hdrsize = sizeof (dof_hdr_t);
11887	dof->dofh_secsize = sizeof (dof_sec_t);
11888	dof->dofh_secnum = 1;	/* only DOF_SECT_OPTDESC */
11889	dof->dofh_secoff = sizeof (dof_hdr_t);
11890	dof->dofh_loadsz = len;
11891	dof->dofh_filesz = len;
11892	dof->dofh_pad = 0;
11893
11894	/*
11895	 * Fill in the option section header...
11896	 */
11897	sec = (dof_sec_t *)((uintptr_t)dof + sizeof (dof_hdr_t));
11898	sec->dofs_type = DOF_SECT_OPTDESC;
11899	sec->dofs_align = sizeof (uint64_t);
11900	sec->dofs_flags = DOF_SECF_LOAD;
11901	sec->dofs_entsize = sizeof (dof_optdesc_t);
11902
11903	opt = (dof_optdesc_t *)((uintptr_t)sec +
11904	    roundup(sizeof (dof_sec_t), sizeof (uint64_t)));
11905
11906	sec->dofs_offset = (uintptr_t)opt - (uintptr_t)dof;
11907	sec->dofs_size = sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
11908
11909	for (i = 0; i < DTRACEOPT_MAX; i++) {
11910		opt[i].dofo_option = i;
11911		opt[i].dofo_strtab = DOF_SECIDX_NONE;
11912		opt[i].dofo_value = state->dts_options[i];
11913	}
11914
11915	return (dof);
11916}
11917
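/*
 * Copy DOF in from the user address 'uarg':  the header is copied in first
 * to determine the load size, and then the DOF is copied in its entirety.
 * On failure, NULL is returned and *errp is set.
 */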
11918static dof_hdr_t *
11919dtrace_dof_copyin(uintptr_t uarg, int *errp)
11920{
11921	dof_hdr_t hdr, *dof;
11922
11923	ASSERT(!MUTEX_HELD(&dtrace_lock));
11924
11925	/*
11926	 * First, we're going to copyin() the sizeof (dof_hdr_t).
11927	 */
11928	if (copyin((void *)uarg, &hdr, sizeof (hdr)) != 0) {
11929		dtrace_dof_error(NULL, "failed to copyin DOF header");
11930		*errp = EFAULT;
11931		return (NULL);
11932	}
11933
11934	/*
11935	 * Now we'll allocate the entire DOF and copy it in -- provided
11936	 * that the length isn't outrageous.
11937	 */
11938	if (hdr.dofh_loadsz >= dtrace_dof_maxsize) {
11939		dtrace_dof_error(&hdr, "load size exceeds maximum");
11940		*errp = E2BIG;
11941		return (NULL);
11942	}
11943
11944	if (hdr.dofh_loadsz < sizeof (hdr)) {
11945		dtrace_dof_error(&hdr, "invalid load size");
11946		*errp = EINVAL;
11947		return (NULL);
11948	}
11949
11950	dof = kmem_alloc(hdr.dofh_loadsz, KM_SLEEP);
11951
11952	if (copyin((void *)uarg, dof, hdr.dofh_loadsz) != 0) {
11953		kmem_free(dof, hdr.dofh_loadsz);
11954		*errp = EFAULT;
11955		return (NULL);
11956	}
11957
11958	return (dof);
11959}
11960
11961#if !defined(sun)
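/*
 * Convert a hexadecimal digit to its numeric value; used to decode DOF
 * that is passed in as a hex string via a kernel environment variable.
 */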
11962static __inline uchar_t
11963dtrace_dof_char(char c)
{
11964	switch (c) {
11965	case '0':
11966	case '1':
11967	case '2':
11968	case '3':
11969	case '4':
11970	case '5':
11971	case '6':
11972	case '7':
11973	case '8':
11974	case '9':
11975		return (c - '0');
11976	case 'A':
11977	case 'B':
11978	case 'C':
11979	case 'D':
11980	case 'E':
11981	case 'F':
11982		return (c - 'A' + 10);
11983	case 'a':
11984	case 'b':
11985	case 'c':
11986	case 'd':
11987	case 'e':
11988	case 'f':
11989		return (c - 'a' + 10);
11990	}
11991	/* Should not reach here. */
11992	return (0);
11993}
11994#endif
11995
11996static dof_hdr_t *
11997dtrace_dof_property(const char *name)
11998{
11999	uchar_t *buf;
12000	uint64_t loadsz;
12001	unsigned int len, i;
12002	dof_hdr_t *dof;
12003
12004#if defined(sun)
12005	/*
12006	 * Unfortunately, arrays of values in .conf files are always (and
12007	 * only) interpreted to be integer arrays.  We must read our DOF
12008	 * as an integer array, and then squeeze it into a byte array.
12009	 */
12010	if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dtrace_devi, 0,
12011	    (char *)name, (int **)&buf, &len) != DDI_PROP_SUCCESS)
12012		return (NULL);
12013
12014	for (i = 0; i < len; i++)
12015		buf[i] = (uchar_t)(((int *)buf)[i]);
12016
12017	if (len < sizeof (dof_hdr_t)) {
12018		ddi_prop_free(buf);
12019		dtrace_dof_error(NULL, "truncated header");
12020		return (NULL);
12021	}
12022
12023	if (len < (loadsz = ((dof_hdr_t *)buf)->dofh_loadsz)) {
12024		ddi_prop_free(buf);
12025		dtrace_dof_error(NULL, "truncated DOF");
12026		return (NULL);
12027	}
12028
12029	if (loadsz >= dtrace_dof_maxsize) {
12030		ddi_prop_free(buf);
12031		dtrace_dof_error(NULL, "oversized DOF");
12032		return (NULL);
12033	}
12034
12035	dof = kmem_alloc(loadsz, KM_SLEEP);
12036	bcopy(buf, dof, loadsz);
12037	ddi_prop_free(buf);
12038#else
12039	char *p;
12040	char *p_env;
12041
12042	if ((p_env = getenv(name)) == NULL)
12043		return (NULL);
12044
12045	len = strlen(p_env) / 2;
12046
12047	buf = kmem_alloc(len, KM_SLEEP);
12048
12049	dof = (dof_hdr_t *)buf;
12050
12051	p = p_env;
12052
12053	for (i = 0; i < len; i++) {
12054		buf[i] = (dtrace_dof_char(p[0]) << 4) |
12055		     dtrace_dof_char(p[1]);
12056		p += 2;
12057	}
12058
12059	freeenv(p_env);
12060
12061	if (len < sizeof (dof_hdr_t)) {
12062		kmem_free(buf, 0);
12063		dtrace_dof_error(NULL, "truncated header");
12064		return (NULL);
12065	}
12066
12067	if (len < (loadsz = dof->dofh_loadsz)) {
12068		kmem_free(buf, 0);
12069		dtrace_dof_error(NULL, "truncated DOF");
12070		return (NULL);
12071	}
12072
12073	if (loadsz >= dtrace_dof_maxsize) {
12074		kmem_free(buf, 0);
12075		dtrace_dof_error(NULL, "oversized DOF");
12076		return (NULL);
12077	}
12078#endif
12079
12080	return (dof);
12081}
12082
12083static void
12084dtrace_dof_destroy(dof_hdr_t *dof)
12085{
12086	kmem_free(dof, dof->dofh_loadsz);
12087}
12088
12089/*
12090 * Return the dof_sec_t pointer corresponding to a given section index.  If the
12091 * index is not valid, dtrace_dof_error() is called and NULL is returned.  If
12092 * a type other than DOF_SECT_NONE is specified, the header is checked against
12093 * this type and NULL is returned if the types do not match.
12094 */
12095static dof_sec_t *
12096dtrace_dof_sect(dof_hdr_t *dof, uint32_t type, dof_secidx_t i)
12097{
12098	dof_sec_t *sec = (dof_sec_t *)(uintptr_t)
12099	    ((uintptr_t)dof + dof->dofh_secoff + i * dof->dofh_secsize);
12100
12101	if (i >= dof->dofh_secnum) {
12102		dtrace_dof_error(dof, "referenced section index is invalid");
12103		return (NULL);
12104	}
12105
12106	if (!(sec->dofs_flags & DOF_SECF_LOAD)) {
12107		dtrace_dof_error(dof, "referenced section is not loadable");
12108		return (NULL);
12109	}
12110
12111	if (type != DOF_SECT_NONE && type != sec->dofs_type) {
12112		dtrace_dof_error(dof, "referenced section is the wrong type");
12113		return (NULL);
12114	}
12115
12116	return (sec);
12117}
12118
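/*
 * Decode a DOF_SECT_PROBEDESC section into the specified probe
 * description, validating each string table offset before copying the
 * corresponding string.
 */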
12119static dtrace_probedesc_t *
12120dtrace_dof_probedesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_probedesc_t *desc)
12121{
12122	dof_probedesc_t *probe;
12123	dof_sec_t *strtab;
12124	uintptr_t daddr = (uintptr_t)dof;
12125	uintptr_t str;
12126	size_t size;
12127
12128	if (sec->dofs_type != DOF_SECT_PROBEDESC) {
12129		dtrace_dof_error(dof, "invalid probe section");
12130		return (NULL);
12131	}
12132
12133	if (sec->dofs_align != sizeof (dof_secidx_t)) {
12134		dtrace_dof_error(dof, "bad alignment in probe description");
12135		return (NULL);
12136	}
12137
12138	if (sec->dofs_offset + sizeof (dof_probedesc_t) > dof->dofh_loadsz) {
12139		dtrace_dof_error(dof, "truncated probe description");
12140		return (NULL);
12141	}
12142
12143	probe = (dof_probedesc_t *)(uintptr_t)(daddr + sec->dofs_offset);
12144	strtab = dtrace_dof_sect(dof, DOF_SECT_STRTAB, probe->dofp_strtab);
12145
12146	if (strtab == NULL)
12147		return (NULL);
12148
12149	str = daddr + strtab->dofs_offset;
12150	size = strtab->dofs_size;
12151
12152	if (probe->dofp_provider >= strtab->dofs_size) {
12153		dtrace_dof_error(dof, "corrupt probe provider");
12154		return (NULL);
12155	}
12156
12157	(void) strncpy(desc->dtpd_provider,
12158	    (char *)(str + probe->dofp_provider),
12159	    MIN(DTRACE_PROVNAMELEN - 1, size - probe->dofp_provider));
12160
12161	if (probe->dofp_mod >= strtab->dofs_size) {
12162		dtrace_dof_error(dof, "corrupt probe module");
12163		return (NULL);
12164	}
12165
12166	(void) strncpy(desc->dtpd_mod, (char *)(str + probe->dofp_mod),
12167	    MIN(DTRACE_MODNAMELEN - 1, size - probe->dofp_mod));
12168
12169	if (probe->dofp_func >= strtab->dofs_size) {
12170		dtrace_dof_error(dof, "corrupt probe function");
12171		return (NULL);
12172	}
12173
12174	(void) strncpy(desc->dtpd_func, (char *)(str + probe->dofp_func),
12175	    MIN(DTRACE_FUNCNAMELEN - 1, size - probe->dofp_func));
12176
12177	if (probe->dofp_name >= strtab->dofs_size) {
12178		dtrace_dof_error(dof, "corrupt probe name");
12179		return (NULL);
12180	}
12181
12182	(void) strncpy(desc->dtpd_name, (char *)(str + probe->dofp_name),
12183	    MIN(DTRACE_NAMELEN - 1, size - probe->dofp_name));
12184
12185	return (desc);
12186}
12187
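/*
 * Construct a DIF object from a DOF_SECT_DIFOHDR section, loading the DIF
 * text, integer table, string table and variable table from the sections
 * that the header references and validating the resulting DIFO.
 */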
12188static dtrace_difo_t *
12189dtrace_dof_difo(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
12190    cred_t *cr)
12191{
12192	dtrace_difo_t *dp;
12193	size_t ttl = 0;
12194	dof_difohdr_t *dofd;
12195	uintptr_t daddr = (uintptr_t)dof;
12196	size_t max = dtrace_difo_maxsize;
12197	int i, l, n;
12198
12199	static const struct {
12200		int section;
12201		int bufoffs;
12202		int lenoffs;
12203		int entsize;
12204		int align;
12205		const char *msg;
12206	} difo[] = {
12207		{ DOF_SECT_DIF, offsetof(dtrace_difo_t, dtdo_buf),
12208		offsetof(dtrace_difo_t, dtdo_len), sizeof (dif_instr_t),
12209		sizeof (dif_instr_t), "multiple DIF sections" },
12210
12211		{ DOF_SECT_INTTAB, offsetof(dtrace_difo_t, dtdo_inttab),
12212		offsetof(dtrace_difo_t, dtdo_intlen), sizeof (uint64_t),
12213		sizeof (uint64_t), "multiple integer tables" },
12214
12215		{ DOF_SECT_STRTAB, offsetof(dtrace_difo_t, dtdo_strtab),
12216		offsetof(dtrace_difo_t, dtdo_strlen), 0,
12217		sizeof (char), "multiple string tables" },
12218
12219		{ DOF_SECT_VARTAB, offsetof(dtrace_difo_t, dtdo_vartab),
12220		offsetof(dtrace_difo_t, dtdo_varlen), sizeof (dtrace_difv_t),
12221		sizeof (uint_t), "multiple variable tables" },
12222
12223		{ DOF_SECT_NONE, 0, 0, 0, 0, NULL }
12224	};
12225
12226	if (sec->dofs_type != DOF_SECT_DIFOHDR) {
12227		dtrace_dof_error(dof, "invalid DIFO header section");
12228		return (NULL);
12229	}
12230
12231	if (sec->dofs_align != sizeof (dof_secidx_t)) {
12232		dtrace_dof_error(dof, "bad alignment in DIFO header");
12233		return (NULL);
12234	}
12235
12236	if (sec->dofs_size < sizeof (dof_difohdr_t) ||
12237	    sec->dofs_size % sizeof (dof_secidx_t)) {
12238		dtrace_dof_error(dof, "bad size in DIFO header");
12239		return (NULL);
12240	}
12241
12242	dofd = (dof_difohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
12243	n = (sec->dofs_size - sizeof (*dofd)) / sizeof (dof_secidx_t) + 1;
12244
12245	dp = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
12246	dp->dtdo_rtype = dofd->dofd_rtype;
12247
12248	for (l = 0; l < n; l++) {
12249		dof_sec_t *subsec;
12250		void **bufp;
12251		uint32_t *lenp;
12252
12253		if ((subsec = dtrace_dof_sect(dof, DOF_SECT_NONE,
12254		    dofd->dofd_links[l])) == NULL)
12255			goto err; /* invalid section link */
12256
12257		if (ttl + subsec->dofs_size > max) {
12258			dtrace_dof_error(dof, "exceeds maximum size");
12259			goto err;
12260		}
12261
12262		ttl += subsec->dofs_size;
12263
12264		for (i = 0; difo[i].section != DOF_SECT_NONE; i++) {
12265			if (subsec->dofs_type != difo[i].section)
12266				continue;
12267
12268			if (!(subsec->dofs_flags & DOF_SECF_LOAD)) {
12269				dtrace_dof_error(dof, "section not loaded");
12270				goto err;
12271			}
12272
12273			if (subsec->dofs_align != difo[i].align) {
12274				dtrace_dof_error(dof, "bad alignment");
12275				goto err;
12276			}
12277
12278			bufp = (void **)((uintptr_t)dp + difo[i].bufoffs);
12279			lenp = (uint32_t *)((uintptr_t)dp + difo[i].lenoffs);
12280
12281			if (*bufp != NULL) {
12282				dtrace_dof_error(dof, difo[i].msg);
12283				goto err;
12284			}
12285
12286			if (difo[i].entsize != subsec->dofs_entsize) {
12287				dtrace_dof_error(dof, "entry size mismatch");
12288				goto err;
12289			}
12290
12291			if (subsec->dofs_entsize != 0 &&
12292			    (subsec->dofs_size % subsec->dofs_entsize) != 0) {
12293				dtrace_dof_error(dof, "corrupt entry size");
12294				goto err;
12295			}
12296
12297			*lenp = subsec->dofs_size;
12298			*bufp = kmem_alloc(subsec->dofs_size, KM_SLEEP);
12299			bcopy((char *)(uintptr_t)(daddr + subsec->dofs_offset),
12300			    *bufp, subsec->dofs_size);
12301
12302			if (subsec->dofs_entsize != 0)
12303				*lenp /= subsec->dofs_entsize;
12304
12305			break;
12306		}
12307
12308		/*
12309		 * If we encounter a loadable DIFO sub-section that is not
12310		 * known to us, assume this is a broken program and fail.
12311		 */
12312		if (difo[i].section == DOF_SECT_NONE &&
12313		    (subsec->dofs_flags & DOF_SECF_LOAD)) {
12314			dtrace_dof_error(dof, "unrecognized DIFO subsection");
12315			goto err;
12316		}
12317	}
12318
12319	if (dp->dtdo_buf == NULL) {
12320		/*
12321		 * We can't have a DIF object without DIF text.
12322		 */
12323		dtrace_dof_error(dof, "missing DIF text");
12324		goto err;
12325	}
12326
12327	/*
12328	 * Before we validate the DIF object, run through the variable table
12329	 * looking for the strings -- if any of their sizes are zero, we'll set
12330	 * them to the system-wide default string size.  Note that
12331	 * this should _not_ happen if the "strsize" option has been set --
12332	 * in this case, the compiler should have set the size to reflect the
12333	 * setting of the option.
12334	 */
12335	for (i = 0; i < dp->dtdo_varlen; i++) {
12336		dtrace_difv_t *v = &dp->dtdo_vartab[i];
12337		dtrace_diftype_t *t = &v->dtdv_type;
12338
12339		if (v->dtdv_id < DIF_VAR_OTHER_UBASE)
12340			continue;
12341
12342		if (t->dtdt_kind == DIF_TYPE_STRING && t->dtdt_size == 0)
12343			t->dtdt_size = dtrace_strsize_default;
12344	}
12345
12346	if (dtrace_difo_validate(dp, vstate, DIF_DIR_NREGS, cr) != 0)
12347		goto err;
12348
12349	dtrace_difo_init(dp, vstate);
12350	return (dp);
12351
12352err:
12353	kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
12354	kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
12355	kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
12356	kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
12357
12358	kmem_free(dp, sizeof (dtrace_difo_t));
12359	return (NULL);
12360}
12361
12362static dtrace_predicate_t *
12363dtrace_dof_predicate(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
12364    cred_t *cr)
12365{
12366	dtrace_difo_t *dp;
12367
12368	if ((dp = dtrace_dof_difo(dof, sec, vstate, cr)) == NULL)
12369		return (NULL);
12370
12371	return (dtrace_predicate_create(dp));
12372}
12373
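/*
 * Construct the list of action descriptions for a DOF_SECT_ACTDESC
 * section.  For actions that take a string argument, the argument is
 * validated as an offset into the DOF string table and copied out.
 */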
12374static dtrace_actdesc_t *
12375dtrace_dof_actdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
12376    cred_t *cr)
12377{
12378	dtrace_actdesc_t *act, *first = NULL, *last = NULL, *next;
12379	dof_actdesc_t *desc;
12380	dof_sec_t *difosec;
12381	size_t offs;
12382	uintptr_t daddr = (uintptr_t)dof;
12383	uint64_t arg;
12384	dtrace_actkind_t kind;
12385
12386	if (sec->dofs_type != DOF_SECT_ACTDESC) {
12387		dtrace_dof_error(dof, "invalid action section");
12388		return (NULL);
12389	}
12390
12391	if (sec->dofs_offset + sizeof (dof_actdesc_t) > dof->dofh_loadsz) {
12392		dtrace_dof_error(dof, "truncated action description");
12393		return (NULL);
12394	}
12395
12396	if (sec->dofs_align != sizeof (uint64_t)) {
12397		dtrace_dof_error(dof, "bad alignment in action description");
12398		return (NULL);
12399	}
12400
12401	if (sec->dofs_size < sec->dofs_entsize) {
12402		dtrace_dof_error(dof, "section entry size exceeds total size");
12403		return (NULL);
12404	}
12405
12406	if (sec->dofs_entsize != sizeof (dof_actdesc_t)) {
12407		dtrace_dof_error(dof, "bad entry size in action description");
12408		return (NULL);
12409	}
12410
12411	if (sec->dofs_size / sec->dofs_entsize > dtrace_actions_max) {
12412		dtrace_dof_error(dof, "actions exceed dtrace_actions_max");
12413		return (NULL);
12414	}
12415
12416	for (offs = 0; offs < sec->dofs_size; offs += sec->dofs_entsize) {
12417		desc = (dof_actdesc_t *)(daddr +
12418		    (uintptr_t)sec->dofs_offset + offs);
12419		kind = (dtrace_actkind_t)desc->dofa_kind;
12420
12421		if ((DTRACEACT_ISPRINTFLIKE(kind) &&
12422		    (kind != DTRACEACT_PRINTA ||
12423		    desc->dofa_strtab != DOF_SECIDX_NONE)) ||
12424		    (kind == DTRACEACT_DIFEXPR &&
12425		    desc->dofa_strtab != DOF_SECIDX_NONE)) {
12426			dof_sec_t *strtab;
12427			char *str, *fmt;
12428			uint64_t i;
12429
12430			/*
12431			 * The argument to these actions is an index into the
12432			 * DOF string table.  For printf()-like actions, this
12433			 * is the format string.  For print(), this is the
12434			 * CTF type of the expression result.
12435			 */
12436			if ((strtab = dtrace_dof_sect(dof,
12437			    DOF_SECT_STRTAB, desc->dofa_strtab)) == NULL)
12438				goto err;
12439
12440			str = (char *)((uintptr_t)dof +
12441			    (uintptr_t)strtab->dofs_offset);
12442
12443			for (i = desc->dofa_arg; i < strtab->dofs_size; i++) {
12444				if (str[i] == '\0')
12445					break;
12446			}
12447
12448			if (i >= strtab->dofs_size) {
12449				dtrace_dof_error(dof, "bogus format string");
12450				goto err;
12451			}
12452
12453			if (i == desc->dofa_arg) {
12454				dtrace_dof_error(dof, "empty format string");
12455				goto err;
12456			}
12457
12458			i -= desc->dofa_arg;
12459			fmt = kmem_alloc(i + 1, KM_SLEEP);
12460			bcopy(&str[desc->dofa_arg], fmt, i + 1);
12461			arg = (uint64_t)(uintptr_t)fmt;
12462		} else {
12463			if (kind == DTRACEACT_PRINTA) {
12464				ASSERT(desc->dofa_strtab == DOF_SECIDX_NONE);
12465				arg = 0;
12466			} else {
12467				arg = desc->dofa_arg;
12468			}
12469		}
12470
12471		act = dtrace_actdesc_create(kind, desc->dofa_ntuple,
12472		    desc->dofa_uarg, arg);
12473
12474		if (last != NULL) {
12475			last->dtad_next = act;
12476		} else {
12477			first = act;
12478		}
12479
12480		last = act;
12481
12482		if (desc->dofa_difo == DOF_SECIDX_NONE)
12483			continue;
12484
12485		if ((difosec = dtrace_dof_sect(dof,
12486		    DOF_SECT_DIFOHDR, desc->dofa_difo)) == NULL)
12487			goto err;
12488
12489		act->dtad_difo = dtrace_dof_difo(dof, difosec, vstate, cr);
12490
12491		if (act->dtad_difo == NULL)
12492			goto err;
12493	}
12494
12495	ASSERT(first != NULL);
12496	return (first);
12497
12498err:
12499	for (act = first; act != NULL; act = next) {
12500		next = act->dtad_next;
12501		dtrace_actdesc_release(act, vstate);
12502	}
12503
12504	return (NULL);
12505}
12506
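/*
 * Construct an ECB description from a DOF_SECT_ECBDESC section, decoding
 * the probe description along with any predicate and actions that the
 * section references.
 */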
12507static dtrace_ecbdesc_t *
12508dtrace_dof_ecbdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
12509    cred_t *cr)
12510{
12511	dtrace_ecbdesc_t *ep;
12512	dof_ecbdesc_t *ecb;
12513	dtrace_probedesc_t *desc;
12514	dtrace_predicate_t *pred = NULL;
12515
12516	if (sec->dofs_size < sizeof (dof_ecbdesc_t)) {
12517		dtrace_dof_error(dof, "truncated ECB description");
12518		return (NULL);
12519	}
12520
12521	if (sec->dofs_align != sizeof (uint64_t)) {
12522		dtrace_dof_error(dof, "bad alignment in ECB description");
12523		return (NULL);
12524	}
12525
12526	ecb = (dof_ecbdesc_t *)((uintptr_t)dof + (uintptr_t)sec->dofs_offset);
12527	sec = dtrace_dof_sect(dof, DOF_SECT_PROBEDESC, ecb->dofe_probes);
12528
12529	if (sec == NULL)
12530		return (NULL);
12531
12532	ep = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
12533	ep->dted_uarg = ecb->dofe_uarg;
12534	desc = &ep->dted_probe;
12535
12536	if (dtrace_dof_probedesc(dof, sec, desc) == NULL)
12537		goto err;
12538
12539	if (ecb->dofe_pred != DOF_SECIDX_NONE) {
12540		if ((sec = dtrace_dof_sect(dof,
12541		    DOF_SECT_DIFOHDR, ecb->dofe_pred)) == NULL)
12542			goto err;
12543
12544		if ((pred = dtrace_dof_predicate(dof, sec, vstate, cr)) == NULL)
12545			goto err;
12546
12547		ep->dted_pred.dtpdd_predicate = pred;
12548	}
12549
12550	if (ecb->dofe_actions != DOF_SECIDX_NONE) {
12551		if ((sec = dtrace_dof_sect(dof,
12552		    DOF_SECT_ACTDESC, ecb->dofe_actions)) == NULL)
12553			goto err;
12554
12555		ep->dted_action = dtrace_dof_actdesc(dof, sec, vstate, cr);
12556
12557		if (ep->dted_action == NULL)
12558			goto err;
12559	}
12560
12561	return (ep);
12562
12563err:
12564	if (pred != NULL)
12565		dtrace_predicate_release(pred, vstate);
12566	kmem_free(ep, sizeof (dtrace_ecbdesc_t));
12567	return (NULL);
12568}
12569
12570/*
12571 * Apply the relocations from the specified 'sec' (a DOF_SECT_URELHDR) to the
12572 * specified DOF.  At present, this amounts to simply adding 'ubase' to the
12573 * site of any user SETX relocations to account for the load object base address.
12574 * In the future, if we need other relocations, this function can be extended.
12575 */
12576static int
12577dtrace_dof_relocate(dof_hdr_t *dof, dof_sec_t *sec, uint64_t ubase)
12578{
12579	uintptr_t daddr = (uintptr_t)dof;
12580	dof_relohdr_t *dofr =
12581	    (dof_relohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
12582	dof_sec_t *ss, *rs, *ts;
12583	dof_relodesc_t *r;
12584	uint_t i, n;
12585
12586	if (sec->dofs_size < sizeof (dof_relohdr_t) ||
12587	    sec->dofs_align != sizeof (dof_secidx_t)) {
12588		dtrace_dof_error(dof, "invalid relocation header");
12589		return (-1);
12590	}
12591
12592	ss = dtrace_dof_sect(dof, DOF_SECT_STRTAB, dofr->dofr_strtab);
12593	rs = dtrace_dof_sect(dof, DOF_SECT_RELTAB, dofr->dofr_relsec);
12594	ts = dtrace_dof_sect(dof, DOF_SECT_NONE, dofr->dofr_tgtsec);
12595
12596	if (ss == NULL || rs == NULL || ts == NULL)
12597		return (-1); /* dtrace_dof_error() has been called already */
12598
12599	if (rs->dofs_entsize < sizeof (dof_relodesc_t) ||
12600	    rs->dofs_align != sizeof (uint64_t)) {
12601		dtrace_dof_error(dof, "invalid relocation section");
12602		return (-1);
12603	}
12604
12605	r = (dof_relodesc_t *)(uintptr_t)(daddr + rs->dofs_offset);
12606	n = rs->dofs_size / rs->dofs_entsize;
12607
12608	for (i = 0; i < n; i++) {
12609		uintptr_t taddr = daddr + ts->dofs_offset + r->dofr_offset;
12610
12611		switch (r->dofr_type) {
12612		case DOF_RELO_NONE:
12613			break;
12614		case DOF_RELO_SETX:
12615			if (r->dofr_offset >= ts->dofs_size || r->dofr_offset +
12616			    sizeof (uint64_t) > ts->dofs_size) {
12617				dtrace_dof_error(dof, "bad relocation offset");
12618				return (-1);
12619			}
12620
12621			if (!IS_P2ALIGNED(taddr, sizeof (uint64_t))) {
12622				dtrace_dof_error(dof, "misaligned setx relo");
12623				return (-1);
12624			}
12625
12626			*(uint64_t *)taddr += ubase;
12627			break;
12628		default:
12629			dtrace_dof_error(dof, "invalid relocation type");
12630			return (-1);
12631		}
12632
12633		r = (dof_relodesc_t *)((uintptr_t)r + rs->dofs_entsize);
12634	}
12635
12636	return (0);
12637}
12638
12639/*
12640 * The dof_hdr_t passed to dtrace_dof_slurp() should be a partially validated
12641 * header:  it should be at the front of a memory region that is at least
12642 * sizeof (dof_hdr_t) in size -- and then at least dof_hdr.dofh_loadsz in
12643 * size.  It need not be validated in any other way.
12644 */
12645static int
12646dtrace_dof_slurp(dof_hdr_t *dof, dtrace_vstate_t *vstate, cred_t *cr,
12647    dtrace_enabling_t **enabp, uint64_t ubase, int noprobes)
12648{
12649	uint64_t len = dof->dofh_loadsz, seclen;
12650	uintptr_t daddr = (uintptr_t)dof;
12651	dtrace_ecbdesc_t *ep;
12652	dtrace_enabling_t *enab;
12653	uint_t i;
12654
12655	ASSERT(MUTEX_HELD(&dtrace_lock));
12656	ASSERT(dof->dofh_loadsz >= sizeof (dof_hdr_t));
12657
12658	/*
12659	 * Check the DOF header identification bytes.  In addition to checking
12660	 * valid settings, we also verify that unused bits/bytes are zeroed so
12661	 * we can use them later without fear of regressing existing binaries.
12662	 */
12663	if (bcmp(&dof->dofh_ident[DOF_ID_MAG0],
12664	    DOF_MAG_STRING, DOF_MAG_STRLEN) != 0) {
12665		dtrace_dof_error(dof, "DOF magic string mismatch");
12666		return (-1);
12667	}
12668
12669	if (dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_ILP32 &&
12670	    dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_LP64) {
12671		dtrace_dof_error(dof, "DOF has invalid data model");
12672		return (-1);
12673	}
12674
12675	if (dof->dofh_ident[DOF_ID_ENCODING] != DOF_ENCODE_NATIVE) {
12676		dtrace_dof_error(dof, "DOF encoding mismatch");
12677		return (-1);
12678	}
12679
12680	if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
12681	    dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_2) {
12682		dtrace_dof_error(dof, "DOF version mismatch");
12683		return (-1);
12684	}
12685
12686	if (dof->dofh_ident[DOF_ID_DIFVERS] != DIF_VERSION_2) {
12687		dtrace_dof_error(dof, "DOF uses unsupported instruction set");
12688		return (-1);
12689	}
12690
12691	if (dof->dofh_ident[DOF_ID_DIFIREG] > DIF_DIR_NREGS) {
12692		dtrace_dof_error(dof, "DOF uses too many integer registers");
12693		return (-1);
12694	}
12695
12696	if (dof->dofh_ident[DOF_ID_DIFTREG] > DIF_DTR_NREGS) {
12697		dtrace_dof_error(dof, "DOF uses too many tuple registers");
12698		return (-1);
12699	}
12700
12701	for (i = DOF_ID_PAD; i < DOF_ID_SIZE; i++) {
12702		if (dof->dofh_ident[i] != 0) {
12703			dtrace_dof_error(dof, "DOF has invalid ident byte set");
12704			return (-1);
12705		}
12706	}
12707
12708	if (dof->dofh_flags & ~DOF_FL_VALID) {
12709		dtrace_dof_error(dof, "DOF has invalid flag bits set");
12710		return (-1);
12711	}
12712
12713	if (dof->dofh_secsize == 0) {
12714		dtrace_dof_error(dof, "zero section header size");
12715		return (-1);
12716	}
12717
12718	/*
12719	 * Check that the section headers don't exceed the amount of DOF
12720	 * data.  Note that we cast the section size and number of sections
12721	 * to uint64_t's to prevent possible overflow in the multiplication.
12722	 */
12723	seclen = (uint64_t)dof->dofh_secnum * (uint64_t)dof->dofh_secsize;
12724
12725	if (dof->dofh_secoff > len || seclen > len ||
12726	    dof->dofh_secoff + seclen > len) {
12727		dtrace_dof_error(dof, "truncated section headers");
12728		return (-1);
12729	}
12730
12731	if (!IS_P2ALIGNED(dof->dofh_secoff, sizeof (uint64_t))) {
12732		dtrace_dof_error(dof, "misaligned section headers");
12733		return (-1);
12734	}
12735
12736	if (!IS_P2ALIGNED(dof->dofh_secsize, sizeof (uint64_t))) {
12737		dtrace_dof_error(dof, "misaligned section size");
12738		return (-1);
12739	}
12740
12741	/*
12742	 * Take an initial pass through the section headers to be sure that
12743	 * the headers don't have stray offsets.  If the 'noprobes' flag is
12744	 * set, do not permit sections relating to providers, probes, or args.
12745	 */
12746	for (i = 0; i < dof->dofh_secnum; i++) {
12747		dof_sec_t *sec = (dof_sec_t *)(daddr +
12748		    (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
12749
12750		if (noprobes) {
12751			switch (sec->dofs_type) {
12752			case DOF_SECT_PROVIDER:
12753			case DOF_SECT_PROBES:
12754			case DOF_SECT_PRARGS:
12755			case DOF_SECT_PROFFS:
12756				dtrace_dof_error(dof, "illegal sections "
12757				    "for enabling");
12758				return (-1);
12759			}
12760		}
12761
12762		if (!(sec->dofs_flags & DOF_SECF_LOAD))
12763			continue; /* just ignore non-loadable sections */
12764
12765		if (sec->dofs_align & (sec->dofs_align - 1)) {
12766			dtrace_dof_error(dof, "bad section alignment");
12767			return (-1);
12768		}
12769
12770		if (sec->dofs_offset & (sec->dofs_align - 1)) {
12771			dtrace_dof_error(dof, "misaligned section");
12772			return (-1);
12773		}
12774
12775		if (sec->dofs_offset > len || sec->dofs_size > len ||
12776		    sec->dofs_offset + sec->dofs_size > len) {
12777			dtrace_dof_error(dof, "corrupt section header");
12778			return (-1);
12779		}
12780
12781		if (sec->dofs_type == DOF_SECT_STRTAB && *((char *)daddr +
12782		    sec->dofs_offset + sec->dofs_size - 1) != '\0') {
12783			dtrace_dof_error(dof, "non-terminating string table");
12784			return (-1);
12785		}
12786	}
12787
12788	/*
12789	 * Take a second pass through the sections and locate and perform any
12790	 * relocations that are present.  We do this after the first pass to
12791	 * be sure that all sections have had their headers validated.
12792	 */
12793	for (i = 0; i < dof->dofh_secnum; i++) {
12794		dof_sec_t *sec = (dof_sec_t *)(daddr +
12795		    (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
12796
12797		if (!(sec->dofs_flags & DOF_SECF_LOAD))
12798			continue; /* skip sections that are not loadable */
12799
12800		switch (sec->dofs_type) {
12801		case DOF_SECT_URELHDR:
12802			if (dtrace_dof_relocate(dof, sec, ubase) != 0)
12803				return (-1);
12804			break;
12805		}
12806	}
12807
12808	if ((enab = *enabp) == NULL)
12809		enab = *enabp = dtrace_enabling_create(vstate);
12810
12811	for (i = 0; i < dof->dofh_secnum; i++) {
12812		dof_sec_t *sec = (dof_sec_t *)(daddr +
12813		    (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
12814
12815		if (sec->dofs_type != DOF_SECT_ECBDESC)
12816			continue;
12817
12818		if ((ep = dtrace_dof_ecbdesc(dof, sec, vstate, cr)) == NULL) {
12819			dtrace_enabling_destroy(enab);
12820			*enabp = NULL;
12821			return (-1);
12822		}
12823
12824		dtrace_enabling_add(enab, ep);
12825	}
12826
12827	return (0);
12828}
12829
12830/*
12831 * Process DOF for any options.  This routine assumes that the DOF has been
12832 * at least processed by dtrace_dof_slurp().
12833 */
12834static int
12835dtrace_dof_options(dof_hdr_t *dof, dtrace_state_t *state)
12836{
12837	int i, rval;
12838	uint32_t entsize;
12839	size_t offs;
12840	dof_optdesc_t *desc;
12841
12842	for (i = 0; i < dof->dofh_secnum; i++) {
12843		dof_sec_t *sec = (dof_sec_t *)((uintptr_t)dof +
12844		    (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
12845
12846		if (sec->dofs_type != DOF_SECT_OPTDESC)
12847			continue;
12848
12849		if (sec->dofs_align != sizeof (uint64_t)) {
12850			dtrace_dof_error(dof, "bad alignment in "
12851			    "option description");
12852			return (EINVAL);
12853		}
12854
12855		if ((entsize = sec->dofs_entsize) == 0) {
12856			dtrace_dof_error(dof, "zeroed option entry size");
12857			return (EINVAL);
12858		}
12859
12860		if (entsize < sizeof (dof_optdesc_t)) {
12861			dtrace_dof_error(dof, "bad option entry size");
12862			return (EINVAL);
12863		}
12864
12865		for (offs = 0; offs < sec->dofs_size; offs += entsize) {
12866			desc = (dof_optdesc_t *)((uintptr_t)dof +
12867			    (uintptr_t)sec->dofs_offset + offs);
12868
12869			if (desc->dofo_strtab != DOF_SECIDX_NONE) {
12870				dtrace_dof_error(dof, "non-zero option string");
12871				return (EINVAL);
12872			}
12873
12874			if (desc->dofo_value == DTRACEOPT_UNSET) {
12875				dtrace_dof_error(dof, "unset option");
12876				return (EINVAL);
12877			}
12878
12879			if ((rval = dtrace_state_option(state,
12880			    desc->dofo_option, desc->dofo_value)) != 0) {
12881				dtrace_dof_error(dof, "rejected option");
12882				return (rval);
12883			}
12884		}
12885	}
12886
12887	return (0);
12888}
12889
12890/*
12891 * DTrace Consumer State Functions
12892 */
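/*
 * Initialize the dynamic variable state, carving the provided allocation
 * into a hash table followed by per-CPU free lists of fixed-size chunks.
 */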
12893static int
12894dtrace_dstate_init(dtrace_dstate_t *dstate, size_t size)
12895{
12896	size_t hashsize, maxper, min, chunksize = dstate->dtds_chunksize;
12897	void *base;
12898	uintptr_t limit;
12899	dtrace_dynvar_t *dvar, *next, *start;
12900	int i;
12901
12902	ASSERT(MUTEX_HELD(&dtrace_lock));
12903	ASSERT(dstate->dtds_base == NULL && dstate->dtds_percpu == NULL);
12904
12905	bzero(dstate, sizeof (dtrace_dstate_t));
12906
12907	if ((dstate->dtds_chunksize = chunksize) == 0)
12908		dstate->dtds_chunksize = DTRACE_DYNVAR_CHUNKSIZE;
12909
12910	if (size < (min = dstate->dtds_chunksize + sizeof (dtrace_dynhash_t)))
12911		size = min;
12912
12913	if ((base = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
12914		return (ENOMEM);
12915
12916	dstate->dtds_size = size;
12917	dstate->dtds_base = base;
12918	dstate->dtds_percpu = kmem_cache_alloc(dtrace_state_cache, KM_SLEEP);
12919	bzero(dstate->dtds_percpu, NCPU * sizeof (dtrace_dstate_percpu_t));
12920
12921	hashsize = size / (dstate->dtds_chunksize + sizeof (dtrace_dynhash_t));
12922
12923	if (hashsize != 1 && (hashsize & 1))
12924		hashsize--;
12925
12926	dstate->dtds_hashsize = hashsize;
12927	dstate->dtds_hash = dstate->dtds_base;
12928
12929	/*
12930	 * Set all of our hash buckets to point to the single sink, and (if
12931	 * it hasn't already been set) set the sink's hash value to be the
12932	 * sink sentinel value.  The sink is needed for dynamic variable
12933	 * lookups to know that they have iterated over an entire, valid hash
12934	 * chain.
12935	 */
12936	for (i = 0; i < hashsize; i++)
12937		dstate->dtds_hash[i].dtdh_chain = &dtrace_dynhash_sink;
12938
12939	if (dtrace_dynhash_sink.dtdv_hashval != DTRACE_DYNHASH_SINK)
12940		dtrace_dynhash_sink.dtdv_hashval = DTRACE_DYNHASH_SINK;
12941
12942	/*
12943	 * Determine the number of active CPUs and divide the free list evenly
12944	 * among them.
12945	 */
12946	start = (dtrace_dynvar_t *)
12947	    ((uintptr_t)base + hashsize * sizeof (dtrace_dynhash_t));
12948	limit = (uintptr_t)base + size;
12949
12950	maxper = (limit - (uintptr_t)start) / NCPU;
12951	maxper = (maxper / dstate->dtds_chunksize) * dstate->dtds_chunksize;
12952
12953#if !defined(sun)
12954	CPU_FOREACH(i) {
12955#else
12956	for (i = 0; i < NCPU; i++) {
12957#endif
12958		dstate->dtds_percpu[i].dtdsc_free = dvar = start;
12959
12960		/*
12961		 * If we don't even have enough chunks to make it once through
12962		 * NCPUs, we're just going to allocate everything to the first
12963		 * CPU.  And if we're on the last CPU, we're going to allocate
12964		 * whatever is left over.  In either case, we set the limit to
12965		 * be the limit of the dynamic variable space.
12966		 */
12967		if (maxper == 0 || i == NCPU - 1) {
12968			limit = (uintptr_t)base + size;
12969			start = NULL;
12970		} else {
12971			limit = (uintptr_t)start + maxper;
12972			start = (dtrace_dynvar_t *)limit;
12973		}
12974
12975		ASSERT(limit <= (uintptr_t)base + size);
12976
12977		for (;;) {
12978			next = (dtrace_dynvar_t *)((uintptr_t)dvar +
12979			    dstate->dtds_chunksize);
12980
12981			if ((uintptr_t)next + dstate->dtds_chunksize >= limit)
12982				break;
12983
12984			dvar->dtdv_next = next;
12985			dvar = next;
12986		}
12987
12988		if (maxper == 0)
12989			break;
12990	}
12991
12992	return (0);
12993}
12994
12995static void
12996dtrace_dstate_fini(dtrace_dstate_t *dstate)
12997{
12998	ASSERT(MUTEX_HELD(&cpu_lock));
12999
13000	if (dstate->dtds_base == NULL)
13001		return;
13002
13003	kmem_free(dstate->dtds_base, dstate->dtds_size);
13004	kmem_cache_free(dtrace_state_cache, dstate->dtds_percpu);
13005}
13006
13007static void
13008dtrace_vstate_fini(dtrace_vstate_t *vstate)
13009{
13010	/*
13011	 * Logical XOR, where are you?
13012	 */
13013	ASSERT((vstate->dtvs_nglobals == 0) ^ (vstate->dtvs_globals != NULL));
13014
13015	if (vstate->dtvs_nglobals > 0) {
13016		kmem_free(vstate->dtvs_globals, vstate->dtvs_nglobals *
13017		    sizeof (dtrace_statvar_t *));
13018	}
13019
13020	if (vstate->dtvs_ntlocals > 0) {
13021		kmem_free(vstate->dtvs_tlocals, vstate->dtvs_ntlocals *
13022		    sizeof (dtrace_difv_t));
13023	}
13024
13025	ASSERT((vstate->dtvs_nlocals == 0) ^ (vstate->dtvs_locals != NULL));
13026
13027	if (vstate->dtvs_nlocals > 0) {
13028		kmem_free(vstate->dtvs_locals, vstate->dtvs_nlocals *
13029		    sizeof (dtrace_statvar_t *));
13030	}
13031}
13032
13033#if defined(sun)
13034static void
13035dtrace_state_clean(dtrace_state_t *state)
13036{
13037	if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE)
13038		return;
13039
13040	dtrace_dynvar_clean(&state->dts_vstate.dtvs_dynvars);
13041	dtrace_speculation_clean(state);
13042}
13043
13044static void
13045dtrace_state_deadman(dtrace_state_t *state)
13046{
13047	hrtime_t now;
13048
13049	dtrace_sync();
13050
13051	now = dtrace_gethrtime();
13052
13053	if (state != dtrace_anon.dta_state &&
13054	    now - state->dts_laststatus >= dtrace_deadman_user)
13055		return;
13056
13057	/*
13058	 * We must be sure that dts_alive never appears to be less than the
13059	 * value upon entry to dtrace_state_deadman(), and because we lack a
13060	 * dtrace_cas64(), we cannot store to it atomically.  We thus instead
13061	 * store INT64_MAX to it, followed by a memory barrier, followed by
13062	 * the new value.  This assures that dts_alive never appears to be
13063	 * less than its true value, regardless of the order in which the
13064	 * stores to the underlying storage are issued.
13065	 */
13066	state->dts_alive = INT64_MAX;
13067	dtrace_membar_producer();
13068	state->dts_alive = now;
13069}
13070#else
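/*
 * On FreeBSD, the cleaner and deadman are driven by callouts rather than
 * cyclics; each handler re-arms its own callout.
 */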
13071static void
13072dtrace_state_clean(void *arg)
13073{
13074	dtrace_state_t *state = arg;
13075	dtrace_optval_t *opt = state->dts_options;
13076
13077	if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE)
13078		return;
13079
13080	dtrace_dynvar_clean(&state->dts_vstate.dtvs_dynvars);
13081	dtrace_speculation_clean(state);
13082
13083	callout_reset(&state->dts_cleaner, hz * opt[DTRACEOPT_CLEANRATE] / NANOSEC,
13084	    dtrace_state_clean, state);
13085}
13086
13087static void
13088dtrace_state_deadman(void *arg)
13089{
13090	dtrace_state_t *state = arg;
13091	hrtime_t now;
13092
13093	dtrace_sync();
13094
13095	dtrace_debug_output();
13096
13097	now = dtrace_gethrtime();
13098
13099	if (state != dtrace_anon.dta_state &&
13100	    now - state->dts_laststatus >= dtrace_deadman_user)
13101		return;
13102
13103	/*
13104	 * We must be sure that dts_alive never appears to be less than the
13105	 * value upon entry to dtrace_state_deadman(), and because we lack a
13106	 * dtrace_cas64(), we cannot store to it atomically.  We thus instead
13107	 * store INT64_MAX to it, followed by a memory barrier, followed by
13108	 * the new value.  This assures that dts_alive never appears to be
13109	 * less than its true value, regardless of the order in which the
13110	 * stores to the underlying storage are issued.
13111	 */
13112	state->dts_alive = INT64_MAX;
13113	dtrace_membar_producer();
13114	state->dts_alive = now;
13115
13116	callout_reset(&state->dts_deadman, hz * dtrace_deadman_interval / NANOSEC,
13117	    dtrace_state_deadman, state);
13118}
13119#endif
13120
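/*
 * Allocate and initialize a new consumer state:  per-CPU buffers are
 * allocated, default options are set, and the credential is examined to
 * determine probe visibility and the destructive actions allowed.
 */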
13121static dtrace_state_t *
13122#if defined(sun)
13123dtrace_state_create(dev_t *devp, cred_t *cr)
13124#else
13125dtrace_state_create(struct cdev *dev)
13126#endif
13127{
13128#if defined(sun)
13129	minor_t minor;
13130	major_t major;
13131#else
13132	cred_t *cr = NULL;
13133	int m = 0;
13134#endif
13135	char c[30];
13136	dtrace_state_t *state;
13137	dtrace_optval_t *opt;
13138	int bufsize = NCPU * sizeof (dtrace_buffer_t), i;
13139
13140	ASSERT(MUTEX_HELD(&dtrace_lock));
13141	ASSERT(MUTEX_HELD(&cpu_lock));
13142
13143#if defined(sun)
13144	minor = (minor_t)(uintptr_t)vmem_alloc(dtrace_minor, 1,
13145	    VM_BESTFIT | VM_SLEEP);
13146
13147	if (ddi_soft_state_zalloc(dtrace_softstate, minor) != DDI_SUCCESS) {
13148		vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
13149		return (NULL);
13150	}
13151
13152	state = ddi_get_soft_state(dtrace_softstate, minor);
13153#else
13154	if (dev != NULL) {
13155		cr = dev->si_cred;
13156		m = dev2unit(dev);
13157	}
13158
13159	/* Allocate memory for the state. */
13160	state = kmem_zalloc(sizeof(dtrace_state_t), KM_SLEEP);
13161#endif
13162
13163	state->dts_epid = DTRACE_EPIDNONE + 1;
13164
13165	(void) snprintf(c, sizeof (c), "dtrace_aggid_%d", m);
13166#if defined(sun)
13167	state->dts_aggid_arena = vmem_create(c, (void *)1, UINT32_MAX, 1,
13168	    NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
13169
13170	if (devp != NULL) {
13171		major = getemajor(*devp);
13172	} else {
13173		major = ddi_driver_major(dtrace_devi);
13174	}
13175
13176	state->dts_dev = makedevice(major, minor);
13177
13178	if (devp != NULL)
13179		*devp = state->dts_dev;
13180#else
13181	state->dts_aggid_arena = new_unrhdr(1, INT_MAX, &dtrace_unr_mtx);
13182	state->dts_dev = dev;
13183#endif
13184
13185	/*
13186	 * We allocate NCPU buffers.  On the one hand, this can be quite
13187	 * a bit of memory per instance (nearly 36K on a Starcat).  On the
13188	 * other hand, it saves an additional memory reference in the probe
13189	 * path.
13190	 */
13191	state->dts_buffer = kmem_zalloc(bufsize, KM_SLEEP);
13192	state->dts_aggbuffer = kmem_zalloc(bufsize, KM_SLEEP);
13193
13194#if defined(sun)
13195	state->dts_cleaner = CYCLIC_NONE;
13196	state->dts_deadman = CYCLIC_NONE;
13197#else
13198	callout_init(&state->dts_cleaner, CALLOUT_MPSAFE);
13199	callout_init(&state->dts_deadman, CALLOUT_MPSAFE);
13200#endif
13201	state->dts_vstate.dtvs_state = state;
13202
13203	for (i = 0; i < DTRACEOPT_MAX; i++)
13204		state->dts_options[i] = DTRACEOPT_UNSET;
13205
13206	/*
13207	 * Set the default options.
13208	 */
13209	opt = state->dts_options;
13210	opt[DTRACEOPT_BUFPOLICY] = DTRACEOPT_BUFPOLICY_SWITCH;
13211	opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_AUTO;
13212	opt[DTRACEOPT_NSPEC] = dtrace_nspec_default;
13213	opt[DTRACEOPT_SPECSIZE] = dtrace_specsize_default;
13214	opt[DTRACEOPT_CPU] = (dtrace_optval_t)DTRACE_CPUALL;
13215	opt[DTRACEOPT_STRSIZE] = dtrace_strsize_default;
13216	opt[DTRACEOPT_STACKFRAMES] = dtrace_stackframes_default;
13217	opt[DTRACEOPT_USTACKFRAMES] = dtrace_ustackframes_default;
13218	opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_default;
13219	opt[DTRACEOPT_AGGRATE] = dtrace_aggrate_default;
13220	opt[DTRACEOPT_SWITCHRATE] = dtrace_switchrate_default;
13221	opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_default;
13222	opt[DTRACEOPT_JSTACKFRAMES] = dtrace_jstackframes_default;
13223	opt[DTRACEOPT_JSTACKSTRSIZE] = dtrace_jstackstrsize_default;
13224
13225	state->dts_activity = DTRACE_ACTIVITY_INACTIVE;
13226
13227	/*
13228	 * Depending on the user credentials, we set flag bits which alter probe
13229	 * visibility or the amount of destructiveness allowed.  In the case of
13230	 * actual anonymous tracing, or the possession of all privileges, all of
13231	 * the normal checks are bypassed.
13232	 */
13233	if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
13234		state->dts_cred.dcr_visible = DTRACE_CRV_ALL;
13235		state->dts_cred.dcr_action = DTRACE_CRA_ALL;
13236	} else {
13237		/*
13238		 * Set up the credentials for this instantiation.  We take a
13239		 * hold on the credential to prevent it from disappearing on
13240		 * us; this in turn prevents the zone_t referenced by this
13241		 * credential from disappearing.  This means that we can
13242		 * examine the credential and the zone from probe context.
13243		 */
13244		crhold(cr);
13245		state->dts_cred.dcr_cred = cr;
13246
13247		/*
13248		 * CRA_PROC means "we have *some* privilege for dtrace" and
13249		 * unlocks the use of variables like pid, zonename, etc.
13250		 */
13251		if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE) ||
13252		    PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
13253			state->dts_cred.dcr_action |= DTRACE_CRA_PROC;
13254		}
13255
13256		/*
13257		 * dtrace_user allows use of syscall and profile providers.
13258		 * If the user also has proc_owner and/or proc_zone, we
13259		 * extend the scope to include additional visibility and
13260		 * destructive power.
13261		 */
13262		if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE)) {
13263			if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE)) {
13264				state->dts_cred.dcr_visible |=
13265				    DTRACE_CRV_ALLPROC;
13266
13267				state->dts_cred.dcr_action |=
13268				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
13269			}
13270
13271			if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE)) {
13272				state->dts_cred.dcr_visible |=
13273				    DTRACE_CRV_ALLZONE;
13274
13275				state->dts_cred.dcr_action |=
13276				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
13277			}
13278
13279			/*
13280			 * If we have all privs in whatever zone this is,
13281			 * we can do destructive things to processes which
13282			 * have altered credentials.
13283			 */
13284#if defined(sun)
13285			if (priv_isequalset(priv_getset(cr, PRIV_EFFECTIVE),
13286			    cr->cr_zone->zone_privset)) {
13287				state->dts_cred.dcr_action |=
13288				    DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
13289			}
13290#endif
13291		}
13292
13293		/*
13294		 * Holding the dtrace_kernel privilege also implies that
13295		 * the user has the dtrace_user privilege from a visibility
13296		 * perspective.  But without further privileges, some
13297		 * destructive actions are not available.
13298		 */
13299		if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE)) {
13300			/*
13301			 * Make all probes in all zones visible.  However,
13302			 * this doesn't mean that all actions become available
13303			 * to all zones.
13304			 */
13305			state->dts_cred.dcr_visible |= DTRACE_CRV_KERNEL |
13306			    DTRACE_CRV_ALLPROC | DTRACE_CRV_ALLZONE;
13307
13308			state->dts_cred.dcr_action |= DTRACE_CRA_KERNEL |
13309			    DTRACE_CRA_PROC;
13310			/*
13311			 * Holding proc_owner means that destructive actions
13312			 * for *this* zone are allowed.
13313			 */
13314			if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
13315				state->dts_cred.dcr_action |=
13316				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
13317
13318			/*
13319			 * Holding proc_zone means that destructive actions
13320			 * for this user/group ID in all zones are allowed.
13321			 */
13322			if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
13323				state->dts_cred.dcr_action |=
13324				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
13325
13326#if defined(sun)
13327			/*
13328			 * If we have all privs in whatever zone this is,
13329			 * we can do destructive things to processes which
13330			 * have altered credentials.
13331			 */
13332			if (priv_isequalset(priv_getset(cr, PRIV_EFFECTIVE),
13333			    cr->cr_zone->zone_privset)) {
13334				state->dts_cred.dcr_action |=
13335				    DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
13336			}
13337#endif
13338		}
13339
13340		/*
13341		 * Holding the dtrace_proc privilege gives control over fasttrap
13342		 * and pid providers.  We need to grant wider destructive
13343		 * privileges in the event that the user has proc_owner and/or
13344		 * proc_zone.
13345		 */
13346		if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
13347			if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
13348				state->dts_cred.dcr_action |=
13349				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
13350
13351			if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
13352				state->dts_cred.dcr_action |=
13353				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
13354		}
13355	}
13356
13357	return (state);
13358}
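
/*
 * Editor's summary of the privilege mapping established above
 * (illustrative only; the authoritative logic is the if-chain in
 * dtrace_state_create()):
 *
 *	privilege(s) held		visibility	actions granted
 *	all privs (or NULL cred)	CRV_ALL		CRA_ALL
 *	dtrace_user or dtrace_proc	-		CRA_PROC
 *	dtrace_user + proc_owner	+CRV_ALLPROC	+DESTRUCTIVE_ALLUSER
 *	dtrace_user + proc_zone		+CRV_ALLZONE	+DESTRUCTIVE_ALLZONE
 *	dtrace_kernel			+CRV_KERNEL,	CRA_KERNEL + CRA_PROC
 *					 ALLPROC,
 *					 ALLZONE
 *	dtrace_kernel + proc_owner	-		+DESTRUCTIVE_ALLUSER
 *	dtrace_kernel + proc_zone	-		+DESTRUCTIVE_ALLZONE
 *	dtrace_proc + proc_owner	-		+DESTRUCTIVE_ALLUSER
 *	dtrace_proc + proc_zone		-		+DESTRUCTIVE_ALLZONE
 *	full zone privset (sun only)	-		+DESTRUCTIVE_CREDCHG
 */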
13359
13360static int
13361dtrace_state_buffer(dtrace_state_t *state, dtrace_buffer_t *buf, int which)
13362{
13363	dtrace_optval_t *opt = state->dts_options, size;
13364	processorid_t cpu = 0;
13365	int flags = 0, rval;
13366
13367	ASSERT(MUTEX_HELD(&dtrace_lock));
13368	ASSERT(MUTEX_HELD(&cpu_lock));
13369	ASSERT(which < DTRACEOPT_MAX);
13370	ASSERT(state->dts_activity == DTRACE_ACTIVITY_INACTIVE ||
13371	    (state == dtrace_anon.dta_state &&
13372	    state->dts_activity == DTRACE_ACTIVITY_ACTIVE));
13373
13374	if (opt[which] == DTRACEOPT_UNSET || opt[which] == 0)
13375		return (0);
13376
13377	if (opt[DTRACEOPT_CPU] != DTRACEOPT_UNSET)
13378		cpu = opt[DTRACEOPT_CPU];
13379
13380	if (which == DTRACEOPT_SPECSIZE)
13381		flags |= DTRACEBUF_NOSWITCH;
13382
13383	if (which == DTRACEOPT_BUFSIZE) {
13384		if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_RING)
13385			flags |= DTRACEBUF_RING;
13386
13387		if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_FILL)
13388			flags |= DTRACEBUF_FILL;
13389
13390		if (state != dtrace_anon.dta_state ||
13391		    state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
13392			flags |= DTRACEBUF_INACTIVE;
13393	}
13394
13395	for (size = opt[which]; size >= sizeof (uint64_t); size >>= 1) {
13396		/*
13397		 * The size must be 8-byte aligned.  If the size is not 8-byte
13398		 * aligned, drop it down by the difference.
13399		 */
13400		if (size & (sizeof (uint64_t) - 1))
13401			size -= size & (sizeof (uint64_t) - 1);
13402
13403		if (size < state->dts_reserve) {
13404			/*
13405			 * Buffers must always be large enough to accommodate
13406			 * their prereserved space.  We return E2BIG instead
13407			 * of ENOMEM in this case to allow user-level
13408			 * software to differentiate the cases.
13409			 */
13410			return (E2BIG);
13411		}
13412
13413		rval = dtrace_buffer_alloc(buf, size, flags, cpu);
13414
13415		if (rval != ENOMEM) {
13416			opt[which] = size;
13417			return (rval);
13418		}
13419
13420		if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
13421			return (rval);
13422	}
13423
13424	return (ENOMEM);
13425}
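
/*
 * Editor's sketch (not part of the original source) of the allocation
 * strategy above in miniature: round the request down to an 8-byte
 * multiple, then halve on each failure until an allocation succeeds or
 * the request can no longer cover the prereserved space.  (The real
 * code also gives up after the first failure when bufresize is set to
 * "manual".)  try_alloc() is a hypothetical stand-in for
 * dtrace_buffer_alloc().
 */
#if 0
#include <stddef.h>
#include <stdint.h>

extern int try_alloc(size_t);	/* hypothetical; returns 0 on success */

static int
alloc_halving(size_t size, size_t reserve)
{
	for (; size >= sizeof (uint64_t); size >>= 1) {
		/* Drop a misaligned size down to an 8-byte multiple. */
		size &= ~(sizeof (uint64_t) - 1);

		if (size < reserve)
			return (-1);	/* can't cover the reserve: E2BIG */

		if (try_alloc(size) == 0)
			return (0);	/* success at this size */
	}

	return (-1);			/* exhausted every size: ENOMEM */
}
#endif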
13426
13427static int
13428dtrace_state_buffers(dtrace_state_t *state)
13429{
13430	dtrace_speculation_t *spec = state->dts_speculations;
13431	int rval, i;
13432
13433	if ((rval = dtrace_state_buffer(state, state->dts_buffer,
13434	    DTRACEOPT_BUFSIZE)) != 0)
13435		return (rval);
13436
13437	if ((rval = dtrace_state_buffer(state, state->dts_aggbuffer,
13438	    DTRACEOPT_AGGSIZE)) != 0)
13439		return (rval);
13440
13441	for (i = 0; i < state->dts_nspeculations; i++) {
13442		if ((rval = dtrace_state_buffer(state,
13443		    spec[i].dtsp_buffer, DTRACEOPT_SPECSIZE)) != 0)
13444			return (rval);
13445	}
13446
13447	return (0);
13448}
13449
13450static void
13451dtrace_state_prereserve(dtrace_state_t *state)
13452{
13453	dtrace_ecb_t *ecb;
13454	dtrace_probe_t *probe;
13455
13456	state->dts_reserve = 0;
13457
13458	if (state->dts_options[DTRACEOPT_BUFPOLICY] != DTRACEOPT_BUFPOLICY_FILL)
13459		return;
13460
13461	/*
13462	 * If our buffer policy is a "fill" buffer policy, we need to set the
13463	 * prereserved space to be the space required by the END probes.
13464	 */
13465	probe = dtrace_probes[dtrace_probeid_end - 1];
13466	ASSERT(probe != NULL);
13467
13468	for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
13469		if (ecb->dte_state != state)
13470			continue;
13471
13472		state->dts_reserve += ecb->dte_needed + ecb->dte_alignment;
13473	}
13474}
13475
13476static int
13477dtrace_state_go(dtrace_state_t *state, processorid_t *cpu)
13478{
13479	dtrace_optval_t *opt = state->dts_options, sz, nspec;
13480	dtrace_speculation_t *spec;
13481	dtrace_buffer_t *buf;
13482#if defined(sun)
13483	cyc_handler_t hdlr;
13484	cyc_time_t when;
13485#endif
13486	int rval = 0, i, bufsize = NCPU * sizeof (dtrace_buffer_t);
13487	dtrace_icookie_t cookie;
13488
13489	mutex_enter(&cpu_lock);
13490	mutex_enter(&dtrace_lock);
13491
13492	if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
13493		rval = EBUSY;
13494		goto out;
13495	}
13496
13497	/*
13498	 * Before we can perform any checks, we must prime all of the
13499	 * retained enablings that correspond to this state.
13500	 */
13501	dtrace_enabling_prime(state);
13502
13503	if (state->dts_destructive && !state->dts_cred.dcr_destructive) {
13504		rval = EACCES;
13505		goto out;
13506	}
13507
13508	dtrace_state_prereserve(state);
13509
13510	/*
13511	 * Now we try to allocate our speculations.
13512	 * We do not automatically resize the number of speculations; if
13513	 * this fails, we will fail the operation.
13514	 */
13515	nspec = opt[DTRACEOPT_NSPEC];
13516	ASSERT(nspec != DTRACEOPT_UNSET);
13517
13518	if (nspec > INT_MAX) {
13519		rval = ENOMEM;
13520		goto out;
13521	}
13522
13523	spec = kmem_zalloc(nspec * sizeof (dtrace_speculation_t), KM_NOSLEEP);
13524
13525	if (spec == NULL) {
13526		rval = ENOMEM;
13527		goto out;
13528	}
13529
13530	state->dts_speculations = spec;
13531	state->dts_nspeculations = (int)nspec;
13532
13533	for (i = 0; i < nspec; i++) {
13534		if ((buf = kmem_zalloc(bufsize, KM_NOSLEEP)) == NULL) {
13535			rval = ENOMEM;
13536			goto err;
13537		}
13538
13539		spec[i].dtsp_buffer = buf;
13540	}
13541
13542	if (opt[DTRACEOPT_GRABANON] != DTRACEOPT_UNSET) {
13543		if (dtrace_anon.dta_state == NULL) {
13544			rval = ENOENT;
13545			goto out;
13546		}
13547
13548		if (state->dts_necbs != 0) {
13549			rval = EALREADY;
13550			goto out;
13551		}
13552
13553		state->dts_anon = dtrace_anon_grab();
13554		ASSERT(state->dts_anon != NULL);
13555		state = state->dts_anon;
13556
13557		/*
13558		 * We want "grabanon" to be set in the grabbed state, so we'll
13559		 * copy that option value from the grabbing state into the
13560		 * grabbed state.
13561		 */
13562		state->dts_options[DTRACEOPT_GRABANON] =
13563		    opt[DTRACEOPT_GRABANON];
13564
13565		*cpu = dtrace_anon.dta_beganon;
13566
13567		/*
13568		 * If the anonymous state is active (as it almost certainly
13569		 * is if the anonymous enabling ultimately matched anything),
13570		 * we don't allow any further option processing -- but we
13571		 * don't return failure.
13572		 */
13573		if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
13574			goto out;
13575	}
13576
13577	if (opt[DTRACEOPT_AGGSIZE] != DTRACEOPT_UNSET &&
13578	    opt[DTRACEOPT_AGGSIZE] != 0) {
13579		if (state->dts_aggregations == NULL) {
13580			/*
13581			 * We're not going to create an aggregation buffer
13582			 * because we don't have any ECBs that contain
13583			 * aggregations -- set this option to 0.
13584			 */
13585			opt[DTRACEOPT_AGGSIZE] = 0;
13586		} else {
13587			/*
13588			 * If we have an aggregation buffer, we must also have
13589			 * a buffer to use as scratch.
13590			 */
13591			if (opt[DTRACEOPT_BUFSIZE] == DTRACEOPT_UNSET ||
13592			    opt[DTRACEOPT_BUFSIZE] < state->dts_needed) {
13593				opt[DTRACEOPT_BUFSIZE] = state->dts_needed;
13594			}
13595		}
13596	}
13597
13598	if (opt[DTRACEOPT_SPECSIZE] != DTRACEOPT_UNSET &&
13599	    opt[DTRACEOPT_SPECSIZE] != 0) {
13600		if (!state->dts_speculates) {
13601			/*
13602			 * We're not going to create speculation buffers
13603			 * because we don't have any ECBs that actually
13604			 * speculate -- set the speculation size to 0.
13605			 */
13606			opt[DTRACEOPT_SPECSIZE] = 0;
13607		}
13608	}
13609
13610	/*
13611	 * The bare minimum size for any buffer that we're actually going to
13612	 * do anything to is sizeof (uint64_t).
13613	 */
13614	sz = sizeof (uint64_t);
13615
13616	if ((state->dts_needed != 0 && opt[DTRACEOPT_BUFSIZE] < sz) ||
13617	    (state->dts_speculates && opt[DTRACEOPT_SPECSIZE] < sz) ||
13618	    (state->dts_aggregations != NULL && opt[DTRACEOPT_AGGSIZE] < sz)) {
13619		/*
13620		 * A buffer size has been explicitly set to 0 (or to a size
13621		 * that will be adjusted to 0) and we need the space -- we
13622		 * need to return failure.  We return ENOSPC to differentiate
13623		 * it from failing to allocate a buffer due to failure to meet
13624		 * the reserve (for which we return E2BIG).
13625		 */
13626		rval = ENOSPC;
13627		goto out;
13628	}
13629
13630	if ((rval = dtrace_state_buffers(state)) != 0)
13631		goto err;
13632
13633	if ((sz = opt[DTRACEOPT_DYNVARSIZE]) == DTRACEOPT_UNSET)
13634		sz = dtrace_dstate_defsize;
13635
13636	do {
13637		rval = dtrace_dstate_init(&state->dts_vstate.dtvs_dynvars, sz);
13638
13639		if (rval == 0)
13640			break;
13641
13642		if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
13643			goto err;
13644	} while (sz >>= 1);
13645
13646	opt[DTRACEOPT_DYNVARSIZE] = sz;
13647
13648	if (rval != 0)
13649		goto err;
13650
13651	if (opt[DTRACEOPT_STATUSRATE] > dtrace_statusrate_max)
13652		opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_max;
13653
13654	if (opt[DTRACEOPT_CLEANRATE] == 0)
13655		opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
13656
13657	if (opt[DTRACEOPT_CLEANRATE] < dtrace_cleanrate_min)
13658		opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_min;
13659
13660	if (opt[DTRACEOPT_CLEANRATE] > dtrace_cleanrate_max)
13661		opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
13662
13663	state->dts_alive = state->dts_laststatus = dtrace_gethrtime();
13664#if defined(sun)
13665	hdlr.cyh_func = (cyc_func_t)dtrace_state_clean;
13666	hdlr.cyh_arg = state;
13667	hdlr.cyh_level = CY_LOW_LEVEL;
13668
13669	when.cyt_when = 0;
13670	when.cyt_interval = opt[DTRACEOPT_CLEANRATE];
13671
13672	state->dts_cleaner = cyclic_add(&hdlr, &when);
13673
13674	hdlr.cyh_func = (cyc_func_t)dtrace_state_deadman;
13675	hdlr.cyh_arg = state;
13676	hdlr.cyh_level = CY_LOW_LEVEL;
13677
13678	when.cyt_when = 0;
13679	when.cyt_interval = dtrace_deadman_interval;
13680
13681	state->dts_deadman = cyclic_add(&hdlr, &when);
13682#else
13683	callout_reset(&state->dts_cleaner, hz * opt[DTRACEOPT_CLEANRATE] / NANOSEC,
13684	    dtrace_state_clean, state);
13685	callout_reset(&state->dts_deadman, hz * dtrace_deadman_interval / NANOSEC,
13686	    dtrace_state_deadman, state);
13687#endif
13688
13689	state->dts_activity = DTRACE_ACTIVITY_WARMUP;
13690
13691	/*
13692	 * Now it's time to actually fire the BEGIN probe.  We need to disable
13693	 * interrupts here both to record the CPU on which we fired the BEGIN
13694	 * probe (the data from this CPU will be processed first at user
13695	 * level) and to manually activate the buffer for this CPU.
13696	 */
13697	cookie = dtrace_interrupt_disable();
13698	*cpu = curcpu;
13699	ASSERT(state->dts_buffer[*cpu].dtb_flags & DTRACEBUF_INACTIVE);
13700	state->dts_buffer[*cpu].dtb_flags &= ~DTRACEBUF_INACTIVE;
13701
13702	dtrace_probe(dtrace_probeid_begin,
13703	    (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
13704	dtrace_interrupt_enable(cookie);
13705	/*
13706	 * We may have had an exit action from a BEGIN probe; only change our
13707	 * state to ACTIVE if we're still in WARMUP.
13708	 */
13709	ASSERT(state->dts_activity == DTRACE_ACTIVITY_WARMUP ||
13710	    state->dts_activity == DTRACE_ACTIVITY_DRAINING);
13711
13712	if (state->dts_activity == DTRACE_ACTIVITY_WARMUP)
13713		state->dts_activity = DTRACE_ACTIVITY_ACTIVE;
13714
13715	/*
13716	 * Regardless of whether we're now in ACTIVE or DRAINING, we
13717	 * want each CPU to transition its principal buffer out of the
13718	 * INACTIVE state.  Doing this assures that no CPU will suddenly begin
13719	 * processing an ECB halfway down a probe's ECB chain; all CPUs will
13720	 * atomically transition from processing none of a state's ECBs to
13721	 * processing all of them.
13722	 */
13723	dtrace_xcall(DTRACE_CPUALL,
13724	    (dtrace_xcall_t)dtrace_buffer_activate, state);
13725	goto out;
13726
13727err:
13728	dtrace_buffer_free(state->dts_buffer);
13729	dtrace_buffer_free(state->dts_aggbuffer);
13730
13731	if ((nspec = state->dts_nspeculations) == 0) {
13732		ASSERT(state->dts_speculations == NULL);
13733		goto out;
13734	}
13735
13736	spec = state->dts_speculations;
13737	ASSERT(spec != NULL);
13738
13739	for (i = 0; i < state->dts_nspeculations; i++) {
13740		if ((buf = spec[i].dtsp_buffer) == NULL)
13741			break;
13742
13743		dtrace_buffer_free(buf);
13744		kmem_free(buf, bufsize);
13745	}
13746
13747	kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
13748	state->dts_nspeculations = 0;
13749	state->dts_speculations = NULL;
13750
13751out:
13752	mutex_exit(&dtrace_lock);
13753	mutex_exit(&cpu_lock);
13754
13755	return (rval);
13756}
13757
13758static int
13759dtrace_state_stop(dtrace_state_t *state, processorid_t *cpu)
13760{
13761	dtrace_icookie_t cookie;
13762
13763	ASSERT(MUTEX_HELD(&dtrace_lock));
13764
13765	if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE &&
13766	    state->dts_activity != DTRACE_ACTIVITY_DRAINING)
13767		return (EINVAL);
13768
13769	/*
13770	 * We'll set the activity to DTRACE_ACTIVITY_DRAINING, and issue a sync
13771	 * to be sure that every CPU has seen it.  See below for the details
13772	 * on why this is done.
13773	 */
13774	state->dts_activity = DTRACE_ACTIVITY_DRAINING;
13775	dtrace_sync();
13776
13777	/*
13778	 * By this point, it is impossible for any CPU to be still processing
13779	 * with DTRACE_ACTIVITY_ACTIVE.  We can thus set our activity to
13780	 * DTRACE_ACTIVITY_COOLDOWN and know that we're not racing with any
13781	 * other CPU in dtrace_buffer_reserve().  This allows dtrace_probe()
13782	 * and callees to know that the activity is DTRACE_ACTIVITY_COOLDOWN
13783	 * iff we're in the END probe.
13784	 */
13785	state->dts_activity = DTRACE_ACTIVITY_COOLDOWN;
13786	dtrace_sync();
13787	ASSERT(state->dts_activity == DTRACE_ACTIVITY_COOLDOWN);
13788
13789	/*
13790	 * Finally, we can release the reserve and call the END probe.  We
13791	 * disable interrupts across calling the END probe to allow us to
13792	 * return the CPU on which we actually called the END probe.  This
13793	 * allows user-land to be sure that this CPU's principal buffer is
13794	 * processed last.
13795	 */
13796	state->dts_reserve = 0;
13797
13798	cookie = dtrace_interrupt_disable();
13799	*cpu = curcpu;
13800	dtrace_probe(dtrace_probeid_end,
13801	    (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
13802	dtrace_interrupt_enable(cookie);
13803
13804	state->dts_activity = DTRACE_ACTIVITY_STOPPED;
13805	dtrace_sync();
13806
13807	return (0);
13808}
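
/*
 * Editor's sketch of the two-phase quiesce used above, assuming a
 * sync primitive that returns only once every CPU has passed through
 * probe context (the role dtrace_sync() plays here).  All names are
 * illustrative.
 */
#if 0
enum activity { ACTIVE, DRAINING, COOLDOWN, STOPPED };

extern volatile enum activity activity;
extern void sync_all_cpus(void);	/* hypothetical dtrace_sync() */

static void
quiesce(void)
{
	activity = DRAINING;	/* new firings see DRAINING... */
	sync_all_cpus();	/* ...and in-flight ones have drained */

	activity = COOLDOWN;	/* from here on, only the END probe fires */
	sync_all_cpus();

	/* The END probe would fire here, then: */
	activity = STOPPED;
	sync_all_cpus();
}
#endif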
13809
13810static int
13811dtrace_state_option(dtrace_state_t *state, dtrace_optid_t option,
13812    dtrace_optval_t val)
13813{
13814	ASSERT(MUTEX_HELD(&dtrace_lock));
13815
13816	if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
13817		return (EBUSY);
13818
13819	if (option >= DTRACEOPT_MAX)
13820		return (EINVAL);
13821
13822	if (option != DTRACEOPT_CPU && val < 0)
13823		return (EINVAL);
13824
13825	switch (option) {
13826	case DTRACEOPT_DESTRUCTIVE:
13827		if (dtrace_destructive_disallow)
13828			return (EACCES);
13829
13830		state->dts_cred.dcr_destructive = 1;
13831		break;
13832
13833	case DTRACEOPT_BUFSIZE:
13834	case DTRACEOPT_DYNVARSIZE:
13835	case DTRACEOPT_AGGSIZE:
13836	case DTRACEOPT_SPECSIZE:
13837	case DTRACEOPT_STRSIZE:
13838		if (val < 0)
13839			return (EINVAL);
13840
13841		if (val >= LONG_MAX) {
13842			/*
13843			 * If this is an otherwise negative value, set it to
13844			 * the highest multiple of 128m less than LONG_MAX.
13845			 * Technically, we're adjusting the size without
13846			 * regard to the buffer resizing policy, but in fact,
13847			 * this has no effect -- if we set the buffer size to
13848			 * ~LONG_MAX and the buffer policy is ultimately set to
13849			 * be "manual", the buffer allocation is guaranteed to
13850			 * fail, if only because the allocation requires two
13851			 * buffers.  (We set the size to the highest
13852			 * multiple of 128m because it ensures that the size
13853			 * will remain a multiple of a megabyte when
13854			 * repeatedly halved -- all the way down to 15m.)
13855			 */
13856			val = LONG_MAX - (1 << 27) + 1;
13857		}
13858	}
13859
13860	state->dts_options[option] = val;
13861
13862	return (0);
13863}
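
/*
 * Editor's worked example (not in the original source) of the clamp
 * above: on an ILP32 kernel, LONG_MAX is 2^31 - 1, so
 *
 *	LONG_MAX - (1 << 27) + 1 = 2^31 - 2^27 = 15 * 128m = 1920m,
 *
 * and repeated halving yields 960m, 480m, 240m, 120m, 60m, 30m, 15m --
 * each a whole number of megabytes, which is what "all the way down to
 * 15m" refers to.  (On LP64, 2^63 - 2^27 is likewise a multiple of
 * 128m.)
 */
#if 0
#include <assert.h>
#include <limits.h>

int
main(void)
{
	long val = LONG_MAX - (1L << 27) + 1;

	assert(val % (1L << 27) == 0);		/* a multiple of 128m */

	/* Halve while the result remains megabyte-aligned. */
	while ((val >> 1) % (1L << 20) == 0)
		val >>= 1;

	/* On ILP32, val is now 15m (15728640 bytes). */
	return (0);
}
#endif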
13864
13865static void
13866dtrace_state_destroy(dtrace_state_t *state)
13867{
13868	dtrace_ecb_t *ecb;
13869	dtrace_vstate_t *vstate = &state->dts_vstate;
13870#if defined(sun)
13871	minor_t minor = getminor(state->dts_dev);
13872#endif
13873	int i, bufsize = NCPU * sizeof (dtrace_buffer_t);
13874	dtrace_speculation_t *spec = state->dts_speculations;
13875	int nspec = state->dts_nspeculations;
13876	uint32_t match;
13877
13878	ASSERT(MUTEX_HELD(&dtrace_lock));
13879	ASSERT(MUTEX_HELD(&cpu_lock));
13880
13881	/*
13882	 * First, retract any retained enablings for this state.
13883	 */
13884	dtrace_enabling_retract(state);
13885	ASSERT(state->dts_nretained == 0);
13886
13887	if (state->dts_activity == DTRACE_ACTIVITY_ACTIVE ||
13888	    state->dts_activity == DTRACE_ACTIVITY_DRAINING) {
13889		/*
13890		 * We have managed to come into dtrace_state_destroy() on a
13891		 * hot enabling -- almost certainly because of a disorderly
13892		 * shutdown of a consumer.  (That is, a consumer that is
13893		 * exiting without having called dtrace_stop().) In this case,
13894		 * we're going to set our activity to be KILLED, and then
13895		 * issue a sync to be sure that everyone is out of probe
13896		 * context before we start blowing away ECBs.
13897		 */
13898		state->dts_activity = DTRACE_ACTIVITY_KILLED;
13899		dtrace_sync();
13900	}
13901
13902	/*
13903	 * Release the credential hold we took in dtrace_state_create().
13904	 */
13905	if (state->dts_cred.dcr_cred != NULL)
13906		crfree(state->dts_cred.dcr_cred);
13907
13908	/*
13909	 * Now we can safely disable and destroy any enabled probes.  Because
13910	 * any DTRACE_PRIV_KERNEL probes may actually be slowing our progress
13911	 * (especially if they're all enabled), we take two passes through the
13912	 * ECBs:  in the first, we disable just DTRACE_PRIV_KERNEL probes, and
13913	 * in the second we disable whatever is left over.
13914	 */
13915	for (match = DTRACE_PRIV_KERNEL; ; match = 0) {
13916		for (i = 0; i < state->dts_necbs; i++) {
13917			if ((ecb = state->dts_ecbs[i]) == NULL)
13918				continue;
13919
13920			if (match && ecb->dte_probe != NULL) {
13921				dtrace_probe_t *probe = ecb->dte_probe;
13922				dtrace_provider_t *prov = probe->dtpr_provider;
13923
13924				if (!(prov->dtpv_priv.dtpp_flags & match))
13925					continue;
13926			}
13927
13928			dtrace_ecb_disable(ecb);
13929			dtrace_ecb_destroy(ecb);
13930		}
13931
13932		if (!match)
13933			break;
13934	}
13935
13936	/*
13937	 * Before we free the buffers, perform one more sync to assure that
13938	 * every CPU is out of probe context.
13939	 */
13940	dtrace_sync();
13941
13942	dtrace_buffer_free(state->dts_buffer);
13943	dtrace_buffer_free(state->dts_aggbuffer);
13944
13945	for (i = 0; i < nspec; i++)
13946		dtrace_buffer_free(spec[i].dtsp_buffer);
13947
13948#if defined(sun)
13949	if (state->dts_cleaner != CYCLIC_NONE)
13950		cyclic_remove(state->dts_cleaner);
13951
13952	if (state->dts_deadman != CYCLIC_NONE)
13953		cyclic_remove(state->dts_deadman);
13954#else
13955	callout_stop(&state->dts_cleaner);
13956	callout_drain(&state->dts_cleaner);
13957	callout_stop(&state->dts_deadman);
13958	callout_drain(&state->dts_deadman);
13959#endif
13960
13961	dtrace_dstate_fini(&vstate->dtvs_dynvars);
13962	dtrace_vstate_fini(vstate);
13963	if (state->dts_ecbs != NULL)
13964		kmem_free(state->dts_ecbs, state->dts_necbs * sizeof (dtrace_ecb_t *));
13965
13966	if (state->dts_aggregations != NULL) {
13967#ifdef DEBUG
13968		for (i = 0; i < state->dts_naggregations; i++)
13969			ASSERT(state->dts_aggregations[i] == NULL);
13970#endif
13971		ASSERT(state->dts_naggregations > 0);
13972		kmem_free(state->dts_aggregations,
13973		    state->dts_naggregations * sizeof (dtrace_aggregation_t *));
13974	}
13975
13976	kmem_free(state->dts_buffer, bufsize);
13977	kmem_free(state->dts_aggbuffer, bufsize);
13978
13979	for (i = 0; i < nspec; i++)
13980		kmem_free(spec[i].dtsp_buffer, bufsize);
13981
13982	if (spec != NULL)
13983		kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
13984
13985	dtrace_format_destroy(state);
13986
13987	if (state->dts_aggid_arena != NULL) {
13988#if defined(sun)
13989		vmem_destroy(state->dts_aggid_arena);
13990#else
13991		delete_unrhdr(state->dts_aggid_arena);
13992#endif
13993		state->dts_aggid_arena = NULL;
13994	}
13995#if defined(sun)
13996	ddi_soft_state_free(dtrace_softstate, minor);
13997	vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
13998#endif
13999}
14000
14001/*
14002 * DTrace Anonymous Enabling Functions
14003 */
14004static dtrace_state_t *
14005dtrace_anon_grab(void)
14006{
14007	dtrace_state_t *state;
14008
14009	ASSERT(MUTEX_HELD(&dtrace_lock));
14010
14011	if ((state = dtrace_anon.dta_state) == NULL) {
14012		ASSERT(dtrace_anon.dta_enabling == NULL);
14013		return (NULL);
14014	}
14015
14016	ASSERT(dtrace_anon.dta_enabling != NULL);
14017	ASSERT(dtrace_retained != NULL);
14018
14019	dtrace_enabling_destroy(dtrace_anon.dta_enabling);
14020	dtrace_anon.dta_enabling = NULL;
14021	dtrace_anon.dta_state = NULL;
14022
14023	return (state);
14024}
14025
14026static void
14027dtrace_anon_property(void)
14028{
14029	int i, rv;
14030	dtrace_state_t *state;
14031	dof_hdr_t *dof;
14032	char c[32];		/* enough for "dof-data-" + digits */
14033
14034	ASSERT(MUTEX_HELD(&dtrace_lock));
14035	ASSERT(MUTEX_HELD(&cpu_lock));
14036
14037	for (i = 0; ; i++) {
14038		(void) snprintf(c, sizeof (c), "dof-data-%d", i);
14039
14040		dtrace_err_verbose = 1;
14041
14042		if ((dof = dtrace_dof_property(c)) == NULL) {
14043			dtrace_err_verbose = 0;
14044			break;
14045		}
14046
14047#if defined(sun)
14048		/*
14049		 * We want to create anonymous state, so we need to transition
14050		 * the kernel debugger to indicate that DTrace is active.  If
14051		 * this fails (e.g. because the debugger has modified text in
14052		 * some way), we won't continue with the processing.
14053		 */
14054		if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
14055			cmn_err(CE_NOTE, "kernel debugger active; anonymous "
14056			    "enabling ignored.");
14057			dtrace_dof_destroy(dof);
14058			break;
14059		}
14060#endif
14061
14062		/*
14063		 * If we haven't allocated an anonymous state, we'll do so now.
14064		 */
14065		if ((state = dtrace_anon.dta_state) == NULL) {
14066#if defined(sun)
14067			state = dtrace_state_create(NULL, NULL);
14068#else
14069			state = dtrace_state_create(NULL);
14070#endif
14071			dtrace_anon.dta_state = state;
14072
14073			if (state == NULL) {
14074				/*
14075				 * This basically shouldn't happen:  the only
14076				 * failure mode from dtrace_state_create() is a
14077				 * failure of ddi_soft_state_zalloc() that
14078				 * itself should never happen.  Still, the
14079				 * interface allows for a failure mode, and
14080				 * we want to fail as gracefully as possible:
14081				 * we'll emit an error message and cease
14082				 * processing anonymous state in this case.
14083				 */
14084				cmn_err(CE_WARN, "failed to create "
14085				    "anonymous state");
14086				dtrace_dof_destroy(dof);
14087				break;
14088			}
14089		}
14090
14091		rv = dtrace_dof_slurp(dof, &state->dts_vstate, CRED(),
14092		    &dtrace_anon.dta_enabling, 0, B_TRUE);
14093
14094		if (rv == 0)
14095			rv = dtrace_dof_options(dof, state);
14096
14097		dtrace_err_verbose = 0;
14098		dtrace_dof_destroy(dof);
14099
14100		if (rv != 0) {
14101			/*
14102			 * This is malformed DOF; chuck any anonymous state
14103			 * that we created.
14104			 */
14105			ASSERT(dtrace_anon.dta_enabling == NULL);
14106			dtrace_state_destroy(state);
14107			dtrace_anon.dta_state = NULL;
14108			break;
14109		}
14110
14111		ASSERT(dtrace_anon.dta_enabling != NULL);
14112	}
14113
14114	if (dtrace_anon.dta_enabling != NULL) {
14115		int rval;
14116
14117		/*
14118		 * dtrace_enabling_retain() can only fail because we are
14119		 * trying to retain more enablings than are allowed -- but
14120		 * we only have one anonymous enabling, and we are guaranteed
14121		 * to be allowed at least one retained enabling; we assert
14122		 * that dtrace_enabling_retain() returns success.
14123		 */
14124		rval = dtrace_enabling_retain(dtrace_anon.dta_enabling);
14125		ASSERT(rval == 0);
14126
14127		dtrace_enabling_dump(dtrace_anon.dta_enabling);
14128	}
14129}
14130
14131/*
14132 * DTrace Helper Functions
14133 */
14134static void
14135dtrace_helper_trace(dtrace_helper_action_t *helper,
14136    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate, int where)
14137{
14138	uint32_t size, next, nnext, i;
14139	dtrace_helptrace_t *ent;
14140	uint16_t flags = cpu_core[curcpu].cpuc_dtrace_flags;
14141
14142	if (!dtrace_helptrace_enabled)
14143		return;
14144
14145	ASSERT(vstate->dtvs_nlocals <= dtrace_helptrace_nlocals);
14146
14147	/*
14148	 * What would a tracing framework be without its own tracing
14149	 * framework?  (Well, a hell of a lot simpler, for starters...)
14150	 */
14151	size = sizeof (dtrace_helptrace_t) + dtrace_helptrace_nlocals *
14152	    sizeof (uint64_t) - sizeof (uint64_t);
14153
14154	/*
14155	 * Iterate until we can allocate a slot in the trace buffer.
14156	 */
14157	do {
14158		next = dtrace_helptrace_next;
14159
14160		if (next + size < dtrace_helptrace_bufsize) {
14161			nnext = next + size;
14162		} else {
14163			nnext = size;
14164		}
14165	} while (dtrace_cas32(&dtrace_helptrace_next, next, nnext) != next);
14166
14167	/*
14168	 * We have our slot; fill it in.
14169	 */
14170	if (nnext == size)
14171		next = 0;
14172
14173	ent = (dtrace_helptrace_t *)&dtrace_helptrace_buffer[next];
14174	ent->dtht_helper = helper;
14175	ent->dtht_where = where;
14176	ent->dtht_nlocals = vstate->dtvs_nlocals;
14177
14178	ent->dtht_fltoffs = (mstate->dtms_present & DTRACE_MSTATE_FLTOFFS) ?
14179	    mstate->dtms_fltoffs : -1;
14180	ent->dtht_fault = DTRACE_FLAGS2FLT(flags);
14181	ent->dtht_illval = cpu_core[curcpu].cpuc_dtrace_illval;
14182
14183	for (i = 0; i < vstate->dtvs_nlocals; i++) {
14184		dtrace_statvar_t *svar;
14185
14186		if ((svar = vstate->dtvs_locals[i]) == NULL)
14187			continue;
14188
14189		ASSERT(svar->dtsv_size >= NCPU * sizeof (uint64_t));
14190		ent->dtht_locals[i] =
14191		    ((uint64_t *)(uintptr_t)svar->dtsv_data)[curcpu];
14192	}
14193}
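
/*
 * Editor's sketch of the lock-free slot reservation above: CPUs race
 * to advance the buffer cursor with compare-and-swap, wrapping to the
 * start when a record won't fit.  C11 atomics stand in for the
 * kernel's dtrace_cas32(); names are illustrative.
 */
#if 0
#include <stdatomic.h>
#include <stdint.h>

static _Atomic uint32_t next_off;

static uint32_t
reserve(uint32_t size, uint32_t bufsize)
{
	uint32_t next, nnext;

	do {
		next = atomic_load(&next_off);

		/* Wrap to the start if the record won't fit at the end. */
		nnext = (next + size < bufsize) ? next + size : size;
	} while (!atomic_compare_exchange_weak(&next_off, &next, nnext));

	/* As above: if we wrapped, our slot begins at offset 0. */
	return (nnext == size ? 0 : next);
}
#endif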
14194
14195static uint64_t
14196dtrace_helper(int which, dtrace_mstate_t *mstate,
14197    dtrace_state_t *state, uint64_t arg0, uint64_t arg1)
14198{
14199	uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags;
14200	uint64_t sarg0 = mstate->dtms_arg[0];
14201	uint64_t sarg1 = mstate->dtms_arg[1];
14202	uint64_t rval = 0;
14203	dtrace_helpers_t *helpers = curproc->p_dtrace_helpers;
14204	dtrace_helper_action_t *helper;
14205	dtrace_vstate_t *vstate;
14206	dtrace_difo_t *pred;
14207	int i, trace = dtrace_helptrace_enabled;
14208
14209	ASSERT(which >= 0 && which < DTRACE_NHELPER_ACTIONS);
14210
14211	if (helpers == NULL)
14212		return (0);
14213
14214	if ((helper = helpers->dthps_actions[which]) == NULL)
14215		return (0);
14216
14217	vstate = &helpers->dthps_vstate;
14218	mstate->dtms_arg[0] = arg0;
14219	mstate->dtms_arg[1] = arg1;
14220
14221	/*
14222	 * Now iterate over each helper.  If its predicate evaluates to 'true',
14223	 * we'll call the corresponding actions.  Note that the below calls
14224	 * to dtrace_dif_emulate() may set faults in machine state.  This is
14225	 * okay:  our caller (the outer dtrace_dif_emulate()) will simply plow
14226	 * the stored DIF offset with its own (which is the desired behavior).
14227	 * Also, note the calls to dtrace_dif_emulate() may allocate scratch
14228	 * from machine state; this is okay, too.
14229	 */
14230	for (; helper != NULL; helper = helper->dtha_next) {
14231		if ((pred = helper->dtha_predicate) != NULL) {
14232			if (trace)
14233				dtrace_helper_trace(helper, mstate, vstate, 0);
14234
14235			if (!dtrace_dif_emulate(pred, mstate, vstate, state))
14236				goto next;
14237
14238			if (*flags & CPU_DTRACE_FAULT)
14239				goto err;
14240		}
14241
14242		for (i = 0; i < helper->dtha_nactions; i++) {
14243			if (trace)
14244				dtrace_helper_trace(helper,
14245				    mstate, vstate, i + 1);
14246
14247			rval = dtrace_dif_emulate(helper->dtha_actions[i],
14248			    mstate, vstate, state);
14249
14250			if (*flags & CPU_DTRACE_FAULT)
14251				goto err;
14252		}
14253
14254next:
14255		if (trace)
14256			dtrace_helper_trace(helper, mstate, vstate,
14257			    DTRACE_HELPTRACE_NEXT);
14258	}
14259
14260	if (trace)
14261		dtrace_helper_trace(helper, mstate, vstate,
14262		    DTRACE_HELPTRACE_DONE);
14263
14264	/*
14265	 * Restore the arg0 that we saved upon entry.
14266	 */
14267	mstate->dtms_arg[0] = sarg0;
14268	mstate->dtms_arg[1] = sarg1;
14269
14270	return (rval);
14271
14272err:
14273	if (trace)
14274		dtrace_helper_trace(helper, mstate, vstate,
14275		    DTRACE_HELPTRACE_ERR);
14276
14277	/*
14278	 * Restore the arg0 that we saved upon entry.
14279	 */
14280	mstate->dtms_arg[0] = sarg0;
14281	mstate->dtms_arg[1] = sarg1;
14282
14283	return (0);
14284}
14285
14286static void
14287dtrace_helper_action_destroy(dtrace_helper_action_t *helper,
14288    dtrace_vstate_t *vstate)
14289{
14290	int i;
14291
14292	if (helper->dtha_predicate != NULL)
14293		dtrace_difo_release(helper->dtha_predicate, vstate);
14294
14295	for (i = 0; i < helper->dtha_nactions; i++) {
14296		ASSERT(helper->dtha_actions[i] != NULL);
14297		dtrace_difo_release(helper->dtha_actions[i], vstate);
14298	}
14299
14300	kmem_free(helper->dtha_actions,
14301	    helper->dtha_nactions * sizeof (dtrace_difo_t *));
14302	kmem_free(helper, sizeof (dtrace_helper_action_t));
14303}
14304
14305static int
14306dtrace_helper_destroygen(int gen)
14307{
14308	proc_t *p = curproc;
14309	dtrace_helpers_t *help = p->p_dtrace_helpers;
14310	dtrace_vstate_t *vstate;
14311	int i;
14312
14313	ASSERT(MUTEX_HELD(&dtrace_lock));
14314
14315	if (help == NULL || gen > help->dthps_generation)
14316		return (EINVAL);
14317
14318	vstate = &help->dthps_vstate;
14319
14320	for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
14321		dtrace_helper_action_t *last = NULL, *h, *next;
14322
14323		for (h = help->dthps_actions[i]; h != NULL; h = next) {
14324			next = h->dtha_next;
14325
14326			if (h->dtha_generation == gen) {
14327				if (last != NULL) {
14328					last->dtha_next = next;
14329				} else {
14330					help->dthps_actions[i] = next;
14331				}
14332
14333				dtrace_helper_action_destroy(h, vstate);
14334			} else {
14335				last = h;
14336			}
14337		}
14338	}
14339
14340	/*
14341	 * Iterate until we've cleared out all helper providers with the
14342	 * given generation number.
14343	 */
14344	for (;;) {
14345		dtrace_helper_provider_t *prov;
14346
14347		/*
14348		 * Look for a helper provider with the right generation. We
14349		 * have to start back at the beginning of the list each time
14350		 * because we drop dtrace_lock. It's unlikely that we'll make
14351		 * more than two passes.
14352		 */
14353		for (i = 0; i < help->dthps_nprovs; i++) {
14354			prov = help->dthps_provs[i];
14355
14356			if (prov->dthp_generation == gen)
14357				break;
14358		}
14359
14360		/*
14361		 * If there were no matches, we're done.
14362		 */
14363		if (i == help->dthps_nprovs)
14364			break;
14365
14366		/*
14367		 * Move the last helper provider into this slot.
14368		 */
14369		help->dthps_nprovs--;
14370		help->dthps_provs[i] = help->dthps_provs[help->dthps_nprovs];
14371		help->dthps_provs[help->dthps_nprovs] = NULL;
14372
14373		mutex_exit(&dtrace_lock);
14374
14375		/*
14376		 * If we have a meta provider, remove this helper provider.
14377		 */
14378		mutex_enter(&dtrace_meta_lock);
14379		if (dtrace_meta_pid != NULL) {
14380			ASSERT(dtrace_deferred_pid == NULL);
14381			dtrace_helper_provider_remove(&prov->dthp_prov,
14382			    p->p_pid);
14383		}
14384		mutex_exit(&dtrace_meta_lock);
14385
14386		dtrace_helper_provider_destroy(prov);
14387
14388		mutex_enter(&dtrace_lock);
14389	}
14390
14391	return (0);
14392}
14393
14394static int
14395dtrace_helper_validate(dtrace_helper_action_t *helper)
14396{
14397	int err = 0, i;
14398	dtrace_difo_t *dp;
14399
14400	if ((dp = helper->dtha_predicate) != NULL)
14401		err += dtrace_difo_validate_helper(dp);
14402
14403	for (i = 0; i < helper->dtha_nactions; i++)
14404		err += dtrace_difo_validate_helper(helper->dtha_actions[i]);
14405
14406	return (err == 0);
14407}
14408
14409static int
14410dtrace_helper_action_add(int which, dtrace_ecbdesc_t *ep)
14411{
14412	dtrace_helpers_t *help;
14413	dtrace_helper_action_t *helper, *last;
14414	dtrace_actdesc_t *act;
14415	dtrace_vstate_t *vstate;
14416	dtrace_predicate_t *pred;
14417	int count = 0, nactions = 0, i;
14418
14419	if (which < 0 || which >= DTRACE_NHELPER_ACTIONS)
14420		return (EINVAL);
14421
14422	help = curproc->p_dtrace_helpers;
14423	last = help->dthps_actions[which];
14424	vstate = &help->dthps_vstate;
14425
14426	for (count = 0; last != NULL; last = last->dtha_next) {
14427		count++;
14428		if (last->dtha_next == NULL)
14429			break;
14430	}
14431
14432	/*
14433	 * If we already have dtrace_helper_actions_max helper actions for this
14434	 * helper action type, we'll refuse to add a new one.
14435	 */
14436	if (count >= dtrace_helper_actions_max)
14437		return (ENOSPC);
14438
14439	helper = kmem_zalloc(sizeof (dtrace_helper_action_t), KM_SLEEP);
14440	helper->dtha_generation = help->dthps_generation;
14441
14442	if ((pred = ep->dted_pred.dtpdd_predicate) != NULL) {
14443		ASSERT(pred->dtp_difo != NULL);
14444		dtrace_difo_hold(pred->dtp_difo);
14445		helper->dtha_predicate = pred->dtp_difo;
14446	}
14447
14448	for (act = ep->dted_action; act != NULL; act = act->dtad_next) {
14449		if (act->dtad_kind != DTRACEACT_DIFEXPR)
14450			goto err;
14451
14452		if (act->dtad_difo == NULL)
14453			goto err;
14454
14455		nactions++;
14456	}
14457
14458	helper->dtha_actions = kmem_zalloc(sizeof (dtrace_difo_t *) *
14459	    (helper->dtha_nactions = nactions), KM_SLEEP);
14460
14461	for (act = ep->dted_action, i = 0; act != NULL; act = act->dtad_next) {
14462		dtrace_difo_hold(act->dtad_difo);
14463		helper->dtha_actions[i++] = act->dtad_difo;
14464	}
14465
14466	if (!dtrace_helper_validate(helper))
14467		goto err;
14468
14469	if (last == NULL) {
14470		help->dthps_actions[which] = helper;
14471	} else {
14472		last->dtha_next = helper;
14473	}
14474
14475	if (vstate->dtvs_nlocals > dtrace_helptrace_nlocals) {
14476		dtrace_helptrace_nlocals = vstate->dtvs_nlocals;
14477		dtrace_helptrace_next = 0;
14478	}
14479
14480	return (0);
14481err:
14482	dtrace_helper_action_destroy(helper, vstate);
14483	return (EINVAL);
14484}
14485
14486static void
14487dtrace_helper_provider_register(proc_t *p, dtrace_helpers_t *help,
14488    dof_helper_t *dofhp)
14489{
14490	ASSERT(MUTEX_NOT_HELD(&dtrace_lock));
14491
14492	mutex_enter(&dtrace_meta_lock);
14493	mutex_enter(&dtrace_lock);
14494
14495	if (!dtrace_attached() || dtrace_meta_pid == NULL) {
14496		/*
14497		 * If the dtrace module is loaded but not attached, or if
14498		 * there isn't a meta provider registered to deal with
14499		 * these provider descriptions, we need to postpone creating
14500		 * the actual providers until later.
14501		 */
14502
14503		if (help->dthps_next == NULL && help->dthps_prev == NULL &&
14504		    dtrace_deferred_pid != help) {
14505			help->dthps_deferred = 1;
14506			help->dthps_pid = p->p_pid;
14507			help->dthps_next = dtrace_deferred_pid;
14508			help->dthps_prev = NULL;
14509			if (dtrace_deferred_pid != NULL)
14510				dtrace_deferred_pid->dthps_prev = help;
14511			dtrace_deferred_pid = help;
14512		}
14513
14514		mutex_exit(&dtrace_lock);
14515
14516	} else if (dofhp != NULL) {
14517		/*
14518		 * If the dtrace module is loaded and we have a particular
14519		 * helper provider description, pass that off to the
14520		 * meta provider.
14521		 */
14522
14523		mutex_exit(&dtrace_lock);
14524
14525		dtrace_helper_provide(dofhp, p->p_pid);
14526
14527	} else {
14528		/*
14529		 * Otherwise, just pass all the helper provider descriptions
14530		 * off to the meta provider.
14531		 */
14532
14533		int i;
14534		mutex_exit(&dtrace_lock);
14535
14536		for (i = 0; i < help->dthps_nprovs; i++) {
14537			dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
14538			    p->p_pid);
14539		}
14540	}
14541
14542	mutex_exit(&dtrace_meta_lock);
14543}
14544
14545static int
14546dtrace_helper_provider_add(dof_helper_t *dofhp, int gen)
14547{
14548	dtrace_helpers_t *help;
14549	dtrace_helper_provider_t *hprov, **tmp_provs;
14550	uint_t tmp_maxprovs, i;
14551
14552	ASSERT(MUTEX_HELD(&dtrace_lock));
14553
14554	help = curproc->p_dtrace_helpers;
14555	ASSERT(help != NULL);
14556
14557	/*
14558	 * If we already have dtrace_helper_providers_max helper providers,
14559	 * we refuse to add a new one.
14560	 */
14561	if (help->dthps_nprovs >= dtrace_helper_providers_max)
14562		return (ENOSPC);
14563
14564	/*
14565	 * Check to make sure this isn't a duplicate.
14566	 */
14567	for (i = 0; i < help->dthps_nprovs; i++) {
14568		if (dofhp->dofhp_addr ==
14569		    help->dthps_provs[i]->dthp_prov.dofhp_addr)
14570			return (EALREADY);
14571	}
14572
14573	hprov = kmem_zalloc(sizeof (dtrace_helper_provider_t), KM_SLEEP);
14574	hprov->dthp_prov = *dofhp;
14575	hprov->dthp_ref = 1;
14576	hprov->dthp_generation = gen;
14577
14578	/*
14579	 * Allocate a bigger table for helper providers if it's already full.
14580	 */
14581	if (help->dthps_maxprovs == help->dthps_nprovs) {
14582		tmp_maxprovs = help->dthps_maxprovs;
14583		tmp_provs = help->dthps_provs;
14584
14585		if (help->dthps_maxprovs == 0)
14586			help->dthps_maxprovs = 2;
14587		else
14588			help->dthps_maxprovs *= 2;
14589		if (help->dthps_maxprovs > dtrace_helper_providers_max)
14590			help->dthps_maxprovs = dtrace_helper_providers_max;
14591
14592		ASSERT(tmp_maxprovs < help->dthps_maxprovs);
14593
14594		help->dthps_provs = kmem_zalloc(help->dthps_maxprovs *
14595		    sizeof (dtrace_helper_provider_t *), KM_SLEEP);
14596
14597		if (tmp_provs != NULL) {
14598			bcopy(tmp_provs, help->dthps_provs, tmp_maxprovs *
14599			    sizeof (dtrace_helper_provider_t *));
14600			kmem_free(tmp_provs, tmp_maxprovs *
14601			    sizeof (dtrace_helper_provider_t *));
14602		}
14603	}
14604
14605	help->dthps_provs[help->dthps_nprovs] = hprov;
14606	help->dthps_nprovs++;
14607
14608	return (0);
14609}
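
/*
 * Editor's sketch of the growth policy above: the table doubles
 * (starting at 2) up to a hard cap, copying and freeing the old
 * array.  malloc(3) stand-ins replace kmem_zalloc()/bcopy(); the
 * caller is assumed to have already rejected n >= cap, as
 * dtrace_helper_provider_add() does.
 */
#if 0
#include <stdlib.h>
#include <string.h>

static int
grow(void ***tabp, unsigned *maxp, unsigned n, unsigned cap)
{
	unsigned omax = *maxp;
	void **otab = *tabp, **ntab;

	if (n < omax)
		return (0);			/* still room */

	*maxp = (omax == 0) ? 2 : omax * 2;
	if (*maxp > cap)
		*maxp = cap;

	if ((ntab = calloc(*maxp, sizeof (void *))) == NULL)
		return (-1);

	if (otab != NULL) {
		memcpy(ntab, otab, omax * sizeof (void *));
		free(otab);
	}

	*tabp = ntab;
	return (0);
}
#endif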
14610
14611static void
14612dtrace_helper_provider_destroy(dtrace_helper_provider_t *hprov)
14613{
14614	mutex_enter(&dtrace_lock);
14615
14616	if (--hprov->dthp_ref == 0) {
14617		dof_hdr_t *dof;
14618		mutex_exit(&dtrace_lock);
14619		dof = (dof_hdr_t *)(uintptr_t)hprov->dthp_prov.dofhp_dof;
14620		dtrace_dof_destroy(dof);
14621		kmem_free(hprov, sizeof (dtrace_helper_provider_t));
14622	} else {
14623		mutex_exit(&dtrace_lock);
14624	}
14625}
14626
14627static int
14628dtrace_helper_provider_validate(dof_hdr_t *dof, dof_sec_t *sec)
14629{
14630	uintptr_t daddr = (uintptr_t)dof;
14631	dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
14632	dof_provider_t *provider;
14633	dof_probe_t *probe;
14634	uint8_t *arg;
14635	char *strtab, *typestr;
14636	dof_stridx_t typeidx;
14637	size_t typesz;
14638	uint_t nprobes, j, k;
14639
14640	ASSERT(sec->dofs_type == DOF_SECT_PROVIDER);
14641
14642	if (sec->dofs_offset & (sizeof (uint_t) - 1)) {
14643		dtrace_dof_error(dof, "misaligned section offset");
14644		return (-1);
14645	}
14646
14647	/*
14648	 * The section needs to be large enough to contain the DOF provider
14649	 * structure appropriate for the given version.
14650	 */
14651	if (sec->dofs_size <
14652	    ((dof->dofh_ident[DOF_ID_VERSION] == DOF_VERSION_1) ?
14653	    offsetof(dof_provider_t, dofpv_prenoffs) :
14654	    sizeof (dof_provider_t))) {
14655		dtrace_dof_error(dof, "provider section too small");
14656		return (-1);
14657	}
14658
14659	provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
14660	str_sec = dtrace_dof_sect(dof, DOF_SECT_STRTAB, provider->dofpv_strtab);
14661	prb_sec = dtrace_dof_sect(dof, DOF_SECT_PROBES, provider->dofpv_probes);
14662	arg_sec = dtrace_dof_sect(dof, DOF_SECT_PRARGS, provider->dofpv_prargs);
14663	off_sec = dtrace_dof_sect(dof, DOF_SECT_PROFFS, provider->dofpv_proffs);
14664
14665	if (str_sec == NULL || prb_sec == NULL ||
14666	    arg_sec == NULL || off_sec == NULL)
14667		return (-1);
14668
14669	enoff_sec = NULL;
14670
14671	if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
14672	    provider->dofpv_prenoffs != DOF_SECT_NONE &&
14673	    (enoff_sec = dtrace_dof_sect(dof, DOF_SECT_PRENOFFS,
14674	    provider->dofpv_prenoffs)) == NULL)
14675		return (-1);
14676
14677	strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
14678
14679	if (provider->dofpv_name >= str_sec->dofs_size ||
14680	    strlen(strtab + provider->dofpv_name) >= DTRACE_PROVNAMELEN) {
14681		dtrace_dof_error(dof, "invalid provider name");
14682		return (-1);
14683	}
14684
14685	if (prb_sec->dofs_entsize == 0 ||
14686	    prb_sec->dofs_entsize > prb_sec->dofs_size) {
14687		dtrace_dof_error(dof, "invalid entry size");
14688		return (-1);
14689	}
14690
14691	if (prb_sec->dofs_entsize & (sizeof (uintptr_t) - 1)) {
14692		dtrace_dof_error(dof, "misaligned entry size");
14693		return (-1);
14694	}
14695
14696	if (off_sec->dofs_entsize != sizeof (uint32_t)) {
14697		dtrace_dof_error(dof, "invalid entry size");
14698		return (-1);
14699	}
14700
14701	if (off_sec->dofs_offset & (sizeof (uint32_t) - 1)) {
14702		dtrace_dof_error(dof, "misaligned section offset");
14703		return (-1);
14704	}
14705
14706	if (arg_sec->dofs_entsize != sizeof (uint8_t)) {
14707		dtrace_dof_error(dof, "invalid entry size");
14708		return (-1);
14709	}
14710
14711	arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
14712
14713	nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
14714
14715	/*
14716	 * Take a pass through the probes to check for errors.
14717	 */
14718	for (j = 0; j < nprobes; j++) {
14719		probe = (dof_probe_t *)(uintptr_t)(daddr +
14720		    prb_sec->dofs_offset + j * prb_sec->dofs_entsize);
14721
14722		if (probe->dofpr_func >= str_sec->dofs_size) {
14723			dtrace_dof_error(dof, "invalid function name");
14724			return (-1);
14725		}
14726
14727		if (strlen(strtab + probe->dofpr_func) >= DTRACE_FUNCNAMELEN) {
14728			dtrace_dof_error(dof, "function name too long");
14729			return (-1);
14730		}
14731
14732		if (probe->dofpr_name >= str_sec->dofs_size ||
14733		    strlen(strtab + probe->dofpr_name) >= DTRACE_NAMELEN) {
14734			dtrace_dof_error(dof, "invalid probe name");
14735			return (-1);
14736		}
14737
14738		/*
14739		 * The offset count must not wrap the index, and the offsets
14740		 * must also not overflow the section's data.
14741		 */
14742		if (probe->dofpr_offidx + probe->dofpr_noffs <
14743		    probe->dofpr_offidx ||
14744		    (probe->dofpr_offidx + probe->dofpr_noffs) *
14745		    off_sec->dofs_entsize > off_sec->dofs_size) {
14746			dtrace_dof_error(dof, "invalid probe offset");
14747			return (-1);
14748		}
14749
14750		if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1) {
14751			/*
14752			 * If there's no is-enabled offset section, make sure
14753			 * there aren't any is-enabled offsets. Otherwise
14754			 * perform the same checks as for probe offsets
14755			 * (immediately above).
14756			 */
14757			if (enoff_sec == NULL) {
14758				if (probe->dofpr_enoffidx != 0 ||
14759				    probe->dofpr_nenoffs != 0) {
14760					dtrace_dof_error(dof, "is-enabled "
14761					    "offsets with null section");
14762					return (-1);
14763				}
14764			} else if (probe->dofpr_enoffidx +
14765			    probe->dofpr_nenoffs < probe->dofpr_enoffidx ||
14766			    (probe->dofpr_enoffidx + probe->dofpr_nenoffs) *
14767			    enoff_sec->dofs_entsize > enoff_sec->dofs_size) {
14768				dtrace_dof_error(dof, "invalid is-enabled "
14769				    "offset");
14770				return (-1);
14771			}
14772
14773			if (probe->dofpr_noffs + probe->dofpr_nenoffs == 0) {
14774				dtrace_dof_error(dof, "zero probe and "
14775				    "is-enabled offsets");
14776				return (-1);
14777			}
14778		} else if (probe->dofpr_noffs == 0) {
14779			dtrace_dof_error(dof, "zero probe offsets");
14780			return (-1);
14781		}
14782
14783		if (probe->dofpr_argidx + probe->dofpr_xargc <
14784		    probe->dofpr_argidx ||
14785		    (probe->dofpr_argidx + probe->dofpr_xargc) *
14786		    arg_sec->dofs_entsize > arg_sec->dofs_size) {
14787			dtrace_dof_error(dof, "invalid args");
14788			return (-1);
14789		}
14790
14791		typeidx = probe->dofpr_nargv;
14792		typestr = strtab + probe->dofpr_nargv;
14793		for (k = 0; k < probe->dofpr_nargc; k++) {
14794			if (typeidx >= str_sec->dofs_size) {
14795				dtrace_dof_error(dof, "bad "
14796				    "native argument type");
14797				return (-1);
14798			}
14799
14800			typesz = strlen(typestr) + 1;
14801			if (typesz > DTRACE_ARGTYPELEN) {
14802				dtrace_dof_error(dof, "native "
14803				    "argument type too long");
14804				return (-1);
14805			}
14806			typeidx += typesz;
14807			typestr += typesz;
14808		}
14809
14810		typeidx = probe->dofpr_xargv;
14811		typestr = strtab + probe->dofpr_xargv;
14812		for (k = 0; k < probe->dofpr_xargc; k++) {
14813			if (arg[probe->dofpr_argidx + k] > probe->dofpr_nargc) {
14814				dtrace_dof_error(dof, "bad "
14815				    "native argument index");
14816				return (-1);
14817			}
14818
14819			if (typeidx >= str_sec->dofs_size) {
14820				dtrace_dof_error(dof, "bad "
14821				    "translated argument type");
14822				return (-1);
14823			}
14824
14825			typesz = strlen(typestr) + 1;
14826			if (typesz > DTRACE_ARGTYPELEN) {
14827				dtrace_dof_error(dof, "translated argument "
14828				    "type too long");
14829				return (-1);
14830			}
14831
14832			typeidx += typesz;
14833			typestr += typesz;
14834		}
14835	}
14836
14837	return (0);
14838}
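
/*
 * Editor's sketch of the overflow-safe bounds check used repeatedly
 * above: for unsigned values, "idx + n < idx" detects wraparound
 * before the scaled extent is compared against the section size --
 * without it, a wrapped sum could make an out-of-bounds range appear
 * valid.  (The 64-bit cast guards the multiplication as well, which
 * is an editorial extra.)
 */
#if 0
#include <stdint.h>

static int
range_ok(uint32_t idx, uint32_t n, uint32_t entsize, uint32_t secsize)
{
	if (idx + n < idx)
		return (0);		/* index arithmetic wrapped */

	if ((uint64_t)(idx + n) * entsize > secsize)
		return (0);		/* extent exceeds the section */

	return (1);
}
#endif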
14839
14840static int
14841dtrace_helper_slurp(dof_hdr_t *dof, dof_helper_t *dhp)
14842{
14843	dtrace_helpers_t *help;
14844	dtrace_vstate_t *vstate;
14845	dtrace_enabling_t *enab = NULL;
14846	int i, gen, rv, nhelpers = 0, nprovs = 0, destroy = 1;
14847	uintptr_t daddr = (uintptr_t)dof;
14848
14849	ASSERT(MUTEX_HELD(&dtrace_lock));
14850
14851	if ((help = curproc->p_dtrace_helpers) == NULL)
14852		help = dtrace_helpers_create(curproc);
14853
14854	vstate = &help->dthps_vstate;
14855
14856	if ((rv = dtrace_dof_slurp(dof, vstate, NULL, &enab,
14857	    dhp != NULL ? dhp->dofhp_addr : 0, B_FALSE)) != 0) {
14858		dtrace_dof_destroy(dof);
14859		return (rv);
14860	}
14861
14862	/*
14863	 * Look for helper providers and validate their descriptions.
14864	 */
14865	if (dhp != NULL) {
14866		for (i = 0; i < dof->dofh_secnum; i++) {
14867			dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
14868			    dof->dofh_secoff + i * dof->dofh_secsize);
14869
14870			if (sec->dofs_type != DOF_SECT_PROVIDER)
14871				continue;
14872
14873			if (dtrace_helper_provider_validate(dof, sec) != 0) {
14874				dtrace_enabling_destroy(enab);
14875				dtrace_dof_destroy(dof);
14876				return (-1);
14877			}
14878
14879			nprovs++;
14880		}
14881	}
14882
14883	/*
14884	 * Now we need to walk through the ECB descriptions in the enabling.
14885	 */
14886	for (i = 0; i < enab->dten_ndesc; i++) {
14887		dtrace_ecbdesc_t *ep = enab->dten_desc[i];
14888		dtrace_probedesc_t *desc = &ep->dted_probe;
14889
14890		if (strcmp(desc->dtpd_provider, "dtrace") != 0)
14891			continue;
14892
14893		if (strcmp(desc->dtpd_mod, "helper") != 0)
14894			continue;
14895
14896		if (strcmp(desc->dtpd_func, "ustack") != 0)
14897			continue;
14898
14899		if ((rv = dtrace_helper_action_add(DTRACE_HELPER_ACTION_USTACK,
14900		    ep)) != 0) {
14901			/*
14902			 * Adding this helper action failed -- we are now going
14903			 * to rip out the entire generation and return failure.
14904			 */
14905			(void) dtrace_helper_destroygen(help->dthps_generation);
14906			dtrace_enabling_destroy(enab);
14907			dtrace_dof_destroy(dof);
14908			return (-1);
14909		}
14910
14911		nhelpers++;
14912	}
14913
14914	if (nhelpers < enab->dten_ndesc)
14915		dtrace_dof_error(dof, "unmatched helpers");
14916
14917	gen = help->dthps_generation++;
14918	dtrace_enabling_destroy(enab);
14919
14920	if (dhp != NULL && nprovs > 0) {
14921		dhp->dofhp_dof = (uint64_t)(uintptr_t)dof;
14922		if (dtrace_helper_provider_add(dhp, gen) == 0) {
14923			mutex_exit(&dtrace_lock);
14924			dtrace_helper_provider_register(curproc, help, dhp);
14925			mutex_enter(&dtrace_lock);
14926
14927			destroy = 0;
14928		}
14929	}
14930
14931	if (destroy)
14932		dtrace_dof_destroy(dof);
14933
14934	return (gen);
14935}
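
/*
 * Editor's sketch of the section walk used above: DOF section headers
 * start at dofh_secoff and are dofh_secsize bytes apart, so the i'th
 * header is located with byte arithmetic from the DOF base address.
 * Types and field names mirror the dof_hdr_t/dof_sec_t definitions
 * this file already relies on.
 */
#if 0
#include <stdint.h>

static dof_sec_t *
dof_sect_at(dof_hdr_t *dof, uint32_t i)
{
	uintptr_t daddr = (uintptr_t)dof;

	return ((dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
	    (uintptr_t)i * dof->dofh_secsize));
}
#endif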
14936
14937static dtrace_helpers_t *
14938dtrace_helpers_create(proc_t *p)
14939{
14940	dtrace_helpers_t *help;
14941
14942	ASSERT(MUTEX_HELD(&dtrace_lock));
14943	ASSERT(p->p_dtrace_helpers == NULL);
14944
14945	help = kmem_zalloc(sizeof (dtrace_helpers_t), KM_SLEEP);
14946	help->dthps_actions = kmem_zalloc(sizeof (dtrace_helper_action_t *) *
14947	    DTRACE_NHELPER_ACTIONS, KM_SLEEP);
14948
14949	p->p_dtrace_helpers = help;
14950	dtrace_helpers++;
14951
14952	return (help);
14953}
14954
14955#if defined(sun)
14956static
14957#endif
14958void
14959dtrace_helpers_destroy(proc_t *p)
14960{
14961	dtrace_helpers_t *help;
14962	dtrace_vstate_t *vstate;
14966	int i;
14967
14968	mutex_enter(&dtrace_lock);
14969
14970	ASSERT(p->p_dtrace_helpers != NULL);
14971	ASSERT(dtrace_helpers > 0);
14972
14973	help = p->p_dtrace_helpers;
14974	vstate = &help->dthps_vstate;
14975
14976	/*
14977	 * We're now going to lose the help from this process.
14978	 */
14979	p->p_dtrace_helpers = NULL;
14980	dtrace_sync();
14981
14982	/*
14983	 * Destroy the helper actions.
14984	 */
14985	for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
14986		dtrace_helper_action_t *h, *next;
14987
14988		for (h = help->dthps_actions[i]; h != NULL; h = next) {
14989			next = h->dtha_next;
14990			dtrace_helper_action_destroy(h, vstate);
14992		}
14993	}
14994
14995	mutex_exit(&dtrace_lock);
14996
14997	/*
14998	 * Destroy the helper providers.
14999	 */
15000	if (help->dthps_maxprovs > 0) {
15001		mutex_enter(&dtrace_meta_lock);
15002		if (dtrace_meta_pid != NULL) {
15003			ASSERT(dtrace_deferred_pid == NULL);
15004
15005			for (i = 0; i < help->dthps_nprovs; i++) {
15006				dtrace_helper_provider_remove(
15007				    &help->dthps_provs[i]->dthp_prov, p->p_pid);
15008			}
15009		} else {
15010			mutex_enter(&dtrace_lock);
15011			ASSERT(help->dthps_deferred == 0 ||
15012			    help->dthps_next != NULL ||
15013			    help->dthps_prev != NULL ||
15014			    help == dtrace_deferred_pid);
15015
15016			/*
15017			 * Remove the helper from the deferred list.
15018			 */
15019			if (help->dthps_next != NULL)
15020				help->dthps_next->dthps_prev = help->dthps_prev;
15021			if (help->dthps_prev != NULL)
15022				help->dthps_prev->dthps_next = help->dthps_next;
15023			if (dtrace_deferred_pid == help) {
15024				dtrace_deferred_pid = help->dthps_next;
15025				ASSERT(help->dthps_prev == NULL);
15026			}
15027
15028			mutex_exit(&dtrace_lock);
15029		}
15030
15031		mutex_exit(&dtrace_meta_lock);
15032
15033		for (i = 0; i < help->dthps_nprovs; i++) {
15034			dtrace_helper_provider_destroy(help->dthps_provs[i]);
15035		}
15036
15037		kmem_free(help->dthps_provs, help->dthps_maxprovs *
15038		    sizeof (dtrace_helper_provider_t *));
15039	}
15040
15041	mutex_enter(&dtrace_lock);
15042
15043	dtrace_vstate_fini(&help->dthps_vstate);
15044	kmem_free(help->dthps_actions,
15045	    sizeof (dtrace_helper_action_t *) * DTRACE_NHELPER_ACTIONS);
15046	kmem_free(help, sizeof (dtrace_helpers_t));
15047
15048	--dtrace_helpers;
15049	mutex_exit(&dtrace_lock);
15050}
15051
15052#if defined(sun)
15053static
15054#endif
15055void
15056dtrace_helpers_duplicate(proc_t *from, proc_t *to)
15057{
15058	dtrace_helpers_t *help, *newhelp;
15059	dtrace_helper_action_t *helper, *new, *last;
15060	dtrace_difo_t *dp;
15061	dtrace_vstate_t *vstate;
15062	int i, j, sz, hasprovs = 0;
15063
15064	mutex_enter(&dtrace_lock);
15065	ASSERT(from->p_dtrace_helpers != NULL);
15066	ASSERT(dtrace_helpers > 0);
15067
15068	help = from->p_dtrace_helpers;
15069	newhelp = dtrace_helpers_create(to);
15070	ASSERT(to->p_dtrace_helpers != NULL);
15071
15072	newhelp->dthps_generation = help->dthps_generation;
15073	vstate = &newhelp->dthps_vstate;
15074
15075	/*
15076	 * Duplicate the helper actions.
15077	 */
15078	for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
15079		if ((helper = help->dthps_actions[i]) == NULL)
15080			continue;
15081
15082		for (last = NULL; helper != NULL; helper = helper->dtha_next) {
15083			new = kmem_zalloc(sizeof (dtrace_helper_action_t),
15084			    KM_SLEEP);
15085			new->dtha_generation = helper->dtha_generation;
15086
15087			if ((dp = helper->dtha_predicate) != NULL) {
15088				dp = dtrace_difo_duplicate(dp, vstate);
15089				new->dtha_predicate = dp;
15090			}
15091
15092			new->dtha_nactions = helper->dtha_nactions;
15093			sz = sizeof (dtrace_difo_t *) * new->dtha_nactions;
15094			new->dtha_actions = kmem_alloc(sz, KM_SLEEP);
15095
15096			for (j = 0; j < new->dtha_nactions; j++) {
15097				dtrace_difo_t *dp = helper->dtha_actions[j];
15098
15099				ASSERT(dp != NULL);
15100				dp = dtrace_difo_duplicate(dp, vstate);
15101				new->dtha_actions[j] = dp;
15102			}
15103
15104			if (last != NULL) {
15105				last->dtha_next = new;
15106			} else {
15107				newhelp->dthps_actions[i] = new;
15108			}
15109
15110			last = new;
15111		}
15112	}
15113
15114	/*
15115	 * Duplicate the helper providers and register them with the
15116	 * DTrace framework.
15117	 */
15118	if (help->dthps_nprovs > 0) {
15119		newhelp->dthps_nprovs = help->dthps_nprovs;
15120		newhelp->dthps_maxprovs = help->dthps_nprovs;
15121		newhelp->dthps_provs = kmem_alloc(newhelp->dthps_nprovs *
15122		    sizeof (dtrace_helper_provider_t *), KM_SLEEP);
15123		for (i = 0; i < newhelp->dthps_nprovs; i++) {
15124			newhelp->dthps_provs[i] = help->dthps_provs[i];
15125			newhelp->dthps_provs[i]->dthp_ref++;
15126		}
15127
15128		hasprovs = 1;
15129	}
15130
15131	mutex_exit(&dtrace_lock);
15132
15133	if (hasprovs)
15134		dtrace_helper_provider_register(to, newhelp, NULL);
15135}
15136
15137/*
15138 * DTrace Hook Functions
15139 */
15140static void
15141dtrace_module_loaded(modctl_t *ctl)
15142{
15143	dtrace_provider_t *prv;
15144
15145	mutex_enter(&dtrace_provider_lock);
15146	mutex_enter(&mod_lock);
15147
15148#if defined(sun)
15149	ASSERT(ctl->mod_busy);
15150#endif
15151
15152	/*
15153	 * We're going to call each provider's per-module provide operation
15154	 * specifying only this module.
15155	 */
15156	for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next)
15157		prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
15158
15159	mutex_exit(&mod_lock);
15160	mutex_exit(&dtrace_provider_lock);
15161
15162	/*
15163	 * If we have any retained enablings, we need to match against them.
15164	 * Enabling probes requires that cpu_lock be held, and we cannot hold
15165	 * cpu_lock here -- it is legal for cpu_lock to be held when loading a
15166	 * module.  (In particular, this happens when loading scheduling
15167	 * classes.)  So if we have any retained enablings, we need to dispatch
15168	 * our task queue to do the match for us.
15169	 */
15170	mutex_enter(&dtrace_lock);
15171
15172	if (dtrace_retained == NULL) {
15173		mutex_exit(&dtrace_lock);
15174		return;
15175	}
15176
15177	(void) taskq_dispatch(dtrace_taskq,
15178	    (task_func_t *)dtrace_enabling_matchall, NULL, TQ_SLEEP);
15179
15180	mutex_exit(&dtrace_lock);
15181
15182	/*
15183	 * And now, for a little heuristic sleaze:  in general, we want to
15184	 * match modules as soon as they load.  However, we cannot guarantee
15185	 * this, because it would lead us to the lock ordering violation
15186	 * outlined above.  The common case, of course, is that cpu_lock is
15187	 * _not_ held -- so we delay here for a clock tick, hoping that that's
15188	 * long enough for the task queue to do its work.  If it's not, it's
15189	 * not a serious problem -- it just means that the module that we
15190	 * just loaded may not be immediately instrumentable.
15191	 */
15192	delay(1);
15193}
15194
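/*
 * Called when a module is unloaded.  On FreeBSD an error pointer is also
 * passed (via dtrace_kld_unload_try() below) so that the unload can be
 * vetoed while the module still has enabled probes.
 */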
15195static void
15196#if defined(sun)
15197dtrace_module_unloaded(modctl_t *ctl)
15198#else
15199dtrace_module_unloaded(modctl_t *ctl, int *error)
15200#endif
15201{
15202	dtrace_probe_t template, *probe, *first, *next;
15203	dtrace_provider_t *prov;
15204#if !defined(sun)
15205	char modname[DTRACE_MODNAMELEN];
15206	size_t len;
15207#endif
15208
15209#if defined(sun)
15210	template.dtpr_mod = ctl->mod_modname;
15211#else
15212	/* Handle the fact that ctl->filename may end in ".ko". */
	strlcpy(modname, ctl->filename, sizeof (modname));
15214	len = strlen(ctl->filename);
15215	if (len > 3 && strcmp(modname + len - 3, ".ko") == 0)
15216		modname[len - 3] = '\0';
15217	template.dtpr_mod = modname;
15218#endif
15219
15220	mutex_enter(&dtrace_provider_lock);
15221	mutex_enter(&mod_lock);
15222	mutex_enter(&dtrace_lock);
15223
15224#if !defined(sun)
	if (ctl->nenabled > 0) {
		/* Don't allow unloads if a probe is enabled. */
		mutex_exit(&dtrace_provider_lock);
		mutex_exit(&mod_lock);
		mutex_exit(&dtrace_lock);
		*error = -1;
		printf(
	"kldunload: attempt to unload module that has DTrace probes enabled\n");
		return;
	}
15234#endif
15235
15236	if (dtrace_bymod == NULL) {
15237		/*
15238		 * The DTrace module is loaded (obviously) but not attached;
15239		 * we don't have any work to do.
15240		 */
15241		mutex_exit(&dtrace_provider_lock);
15242		mutex_exit(&mod_lock);
15243		mutex_exit(&dtrace_lock);
15244		return;
15245	}
15246
15247	for (probe = first = dtrace_hash_lookup(dtrace_bymod, &template);
15248	    probe != NULL; probe = probe->dtpr_nextmod) {
15249		if (probe->dtpr_ecb != NULL) {
15250			mutex_exit(&dtrace_provider_lock);
15251			mutex_exit(&mod_lock);
15252			mutex_exit(&dtrace_lock);
15253
15254			/*
15255			 * This shouldn't _actually_ be possible -- we're
15256			 * unloading a module that has an enabled probe in it.
15257			 * (It's normally up to the provider to make sure that
15258			 * this can't happen.)  However, because dtps_enable()
15259			 * doesn't have a failure mode, there can be an
15260			 * enable/unload race.  Upshot:  we don't want to
15261			 * assert, but we're not going to disable the
15262			 * probe, either.
15263			 */
15264			if (dtrace_err_verbose) {
15265#if defined(sun)
15266				cmn_err(CE_WARN, "unloaded module '%s' had "
15267				    "enabled probes", ctl->mod_modname);
15268#else
15269				cmn_err(CE_WARN, "unloaded module '%s' had "
15270				    "enabled probes", modname);
15271#endif
15272			}
15273
15274			return;
15275		}
15276	}
15277
15278	probe = first;
15279
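	/*
	 * Unhook each of the module's probes from the probe array and the
	 * hash chains, threading them onto a private list (through
	 * dtpr_nextmod) so that they can be destroyed once the
	 * dtrace_sync() below has completed.
	 */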
15280	for (first = NULL; probe != NULL; probe = next) {
15281		ASSERT(dtrace_probes[probe->dtpr_id - 1] == probe);
15282
15283		dtrace_probes[probe->dtpr_id - 1] = NULL;
15284
15285		next = probe->dtpr_nextmod;
15286		dtrace_hash_remove(dtrace_bymod, probe);
15287		dtrace_hash_remove(dtrace_byfunc, probe);
15288		dtrace_hash_remove(dtrace_byname, probe);
15289
15290		if (first == NULL) {
15291			first = probe;
15292			probe->dtpr_nextmod = NULL;
15293		} else {
15294			probe->dtpr_nextmod = first;
15295			first = probe;
15296		}
15297	}
15298
15299	/*
15300	 * We've removed all of the module's probes from the hash chains and
15301	 * from the probe array.  Now issue a dtrace_sync() to be sure that
15302	 * everyone has cleared out from any probe array processing.
15303	 */
15304	dtrace_sync();
15305
15306	for (probe = first; probe != NULL; probe = first) {
15307		first = probe->dtpr_nextmod;
15308		prov = probe->dtpr_provider;
15309		prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, probe->dtpr_id,
15310		    probe->dtpr_arg);
15311		kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
15312		kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
15313		kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
15314#if defined(sun)
15315		vmem_free(dtrace_arena, (void *)(uintptr_t)probe->dtpr_id, 1);
15316#else
15317		free_unr(dtrace_arena, probe->dtpr_id);
15318#endif
15319		kmem_free(probe, sizeof (dtrace_probe_t));
15320	}
15321
15322	mutex_exit(&dtrace_lock);
15323	mutex_exit(&mod_lock);
15324	mutex_exit(&dtrace_provider_lock);
15325}
15326
15327#if !defined(sun)
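/*
 * Wrappers invoked by the kernel linker's kld_load and kld_unload_try
 * event handlers (registration is assumed to occur in dtrace_load.c).
 */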
15328static void
15329dtrace_kld_load(void *arg __unused, linker_file_t lf)
15330{
15331
15332	dtrace_module_loaded(lf);
15333}
15334
15335static void
15336dtrace_kld_unload_try(void *arg __unused, linker_file_t lf, int *error)
15337{
15338
15339	if (*error != 0)
15340		/* We already have an error, so don't do anything. */
15341		return;
15342	dtrace_module_unloaded(lf, error);
15343}
15344#endif
15345
15346#if defined(sun)
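/*
 * Suspend and resume hooks:  invoke every probe's provider-supplied
 * dtps_suspend/dtps_resume entry point across CPU start and debugger
 * transitions.
 */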
15347static void
15348dtrace_suspend(void)
15349{
15350	dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_suspend));
15351}
15352
15353static void
15354dtrace_resume(void)
15355{
15356	dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_resume));
15357}
15358#endif
15359
15360static int
15361dtrace_cpu_setup(cpu_setup_t what, processorid_t cpu)
15362{
15363	ASSERT(MUTEX_HELD(&cpu_lock));
15364	mutex_enter(&dtrace_lock);
15365
15366	switch (what) {
15367	case CPU_CONFIG: {
15368		dtrace_state_t *state;
15369		dtrace_optval_t *opt, rs, c;
15370
15371		/*
15372		 * For now, we only allocate a new buffer for anonymous state.
15373		 */
15374		if ((state = dtrace_anon.dta_state) == NULL)
15375			break;
15376
15377		if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
15378			break;
15379
15380		opt = state->dts_options;
15381		c = opt[DTRACEOPT_CPU];
15382
15383		if (c != DTRACE_CPUALL && c != DTRACEOPT_UNSET && c != cpu)
15384			break;
15385
15386		/*
15387		 * Regardless of what the actual policy is, we're going to
15388		 * temporarily set our resize policy to be manual.  We're
15389		 * also going to temporarily set our CPU option to denote
15390		 * the newly configured CPU.
15391		 */
15392		rs = opt[DTRACEOPT_BUFRESIZE];
15393		opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_MANUAL;
15394		opt[DTRACEOPT_CPU] = (dtrace_optval_t)cpu;
15395
15396		(void) dtrace_state_buffers(state);
15397
15398		opt[DTRACEOPT_BUFRESIZE] = rs;
15399		opt[DTRACEOPT_CPU] = c;
15400
15401		break;
15402	}
15403
15404	case CPU_UNCONFIG:
15405		/*
15406		 * We don't free the buffer in the CPU_UNCONFIG case.  (The
15407		 * buffer will be freed when the consumer exits.)
15408		 */
15409		break;
15410
15411	default:
15412		break;
15413	}
15414
15415	mutex_exit(&dtrace_lock);
15416	return (0);
15417}
15418
15419#if defined(sun)
15420static void
15421dtrace_cpu_setup_initial(processorid_t cpu)
15422{
15423	(void) dtrace_cpu_setup(CPU_CONFIG, cpu);
15424}
15425#endif
15426
15427static void
15428dtrace_toxrange_add(uintptr_t base, uintptr_t limit)
15429{
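	/*
	 * If the range array is full, grow it by doubling (starting from a
	 * single entry), copying any existing ranges into the new
	 * allocation.
	 */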
15430	if (dtrace_toxranges >= dtrace_toxranges_max) {
15431		int osize, nsize;
15432		dtrace_toxrange_t *range;
15433
15434		osize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
15435
15436		if (osize == 0) {
15437			ASSERT(dtrace_toxrange == NULL);
15438			ASSERT(dtrace_toxranges_max == 0);
15439			dtrace_toxranges_max = 1;
15440		} else {
15441			dtrace_toxranges_max <<= 1;
15442		}
15443
15444		nsize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
15445		range = kmem_zalloc(nsize, KM_SLEEP);
15446
15447		if (dtrace_toxrange != NULL) {
15448			ASSERT(osize != 0);
15449			bcopy(dtrace_toxrange, range, osize);
15450			kmem_free(dtrace_toxrange, osize);
15451		}
15452
15453		dtrace_toxrange = range;
15454	}
15455
15456	ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_base == 0);
15457	ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_limit == 0);
15458
15459	dtrace_toxrange[dtrace_toxranges].dtt_base = base;
15460	dtrace_toxrange[dtrace_toxranges].dtt_limit = limit;
15461	dtrace_toxranges++;
15462}
15463
15464/*
15465 * DTrace Driver Cookbook Functions
15466 */
15467#if defined(sun)
15468/*ARGSUSED*/
15469static int
15470dtrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
15471{
15472	dtrace_provider_id_t id;
15473	dtrace_state_t *state = NULL;
15474	dtrace_enabling_t *enab;
15475
15476	mutex_enter(&cpu_lock);
15477	mutex_enter(&dtrace_provider_lock);
15478	mutex_enter(&dtrace_lock);
15479
15480	if (ddi_soft_state_init(&dtrace_softstate,
15481	    sizeof (dtrace_state_t), 0) != 0) {
15482		cmn_err(CE_NOTE, "/dev/dtrace failed to initialize soft state");
15483		mutex_exit(&cpu_lock);
15484		mutex_exit(&dtrace_provider_lock);
15485		mutex_exit(&dtrace_lock);
15486		return (DDI_FAILURE);
15487	}
15488
15489	if (ddi_create_minor_node(devi, DTRACEMNR_DTRACE, S_IFCHR,
15490	    DTRACEMNRN_DTRACE, DDI_PSEUDO, NULL) == DDI_FAILURE ||
15491	    ddi_create_minor_node(devi, DTRACEMNR_HELPER, S_IFCHR,
15492	    DTRACEMNRN_HELPER, DDI_PSEUDO, NULL) == DDI_FAILURE) {
15493		cmn_err(CE_NOTE, "/dev/dtrace couldn't create minor nodes");
15494		ddi_remove_minor_node(devi, NULL);
15495		ddi_soft_state_fini(&dtrace_softstate);
15496		mutex_exit(&cpu_lock);
15497		mutex_exit(&dtrace_provider_lock);
15498		mutex_exit(&dtrace_lock);
15499		return (DDI_FAILURE);
15500	}
15501
15502	ddi_report_dev(devi);
15503	dtrace_devi = devi;
15504
15505	dtrace_modload = dtrace_module_loaded;
15506	dtrace_modunload = dtrace_module_unloaded;
15507	dtrace_cpu_init = dtrace_cpu_setup_initial;
15508	dtrace_helpers_cleanup = dtrace_helpers_destroy;
15509	dtrace_helpers_fork = dtrace_helpers_duplicate;
15510	dtrace_cpustart_init = dtrace_suspend;
15511	dtrace_cpustart_fini = dtrace_resume;
15512	dtrace_debugger_init = dtrace_suspend;
15513	dtrace_debugger_fini = dtrace_resume;
15514
15515	register_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
15516
15517	ASSERT(MUTEX_HELD(&cpu_lock));
15518
15519	dtrace_arena = vmem_create("dtrace", (void *)1, UINT32_MAX, 1,
15520	    NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
15521	dtrace_minor = vmem_create("dtrace_minor", (void *)DTRACEMNRN_CLONE,
15522	    UINT32_MAX - DTRACEMNRN_CLONE, 1, NULL, NULL, NULL, 0,
15523	    VM_SLEEP | VMC_IDENTIFIER);
15524	dtrace_taskq = taskq_create("dtrace_taskq", 1, maxclsyspri,
15525	    1, INT_MAX, 0);
15526
15527	dtrace_state_cache = kmem_cache_create("dtrace_state_cache",
15528	    sizeof (dtrace_dstate_percpu_t) * NCPU, DTRACE_STATE_ALIGN,
15529	    NULL, NULL, NULL, NULL, NULL, 0);
15530
15531	ASSERT(MUTEX_HELD(&cpu_lock));
15532	dtrace_bymod = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_mod),
15533	    offsetof(dtrace_probe_t, dtpr_nextmod),
15534	    offsetof(dtrace_probe_t, dtpr_prevmod));
15535
15536	dtrace_byfunc = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_func),
15537	    offsetof(dtrace_probe_t, dtpr_nextfunc),
15538	    offsetof(dtrace_probe_t, dtpr_prevfunc));
15539
15540	dtrace_byname = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_name),
15541	    offsetof(dtrace_probe_t, dtpr_nextname),
15542	    offsetof(dtrace_probe_t, dtpr_prevname));
15543
15544	if (dtrace_retain_max < 1) {
15545		cmn_err(CE_WARN, "illegal value (%lu) for dtrace_retain_max; "
15546		    "setting to 1", dtrace_retain_max);
15547		dtrace_retain_max = 1;
15548	}
15549
15550	/*
15551	 * Now discover our toxic ranges.
15552	 */
15553	dtrace_toxic_ranges(dtrace_toxrange_add);
15554
15555	/*
15556	 * Before we register ourselves as a provider to our own framework,
15557	 * we would like to assert that dtrace_provider is NULL -- but that's
15558	 * not true if we were loaded as a dependency of a DTrace provider.
15559	 * Once we've registered, we can assert that dtrace_provider is our
15560	 * pseudo provider.
15561	 */
15562	(void) dtrace_register("dtrace", &dtrace_provider_attr,
15563	    DTRACE_PRIV_NONE, 0, &dtrace_provider_ops, NULL, &id);
15564
15565	ASSERT(dtrace_provider != NULL);
15566	ASSERT((dtrace_provider_id_t)dtrace_provider == id);
15567
15568	dtrace_probeid_begin = dtrace_probe_create((dtrace_provider_id_t)
15569	    dtrace_provider, NULL, NULL, "BEGIN", 0, NULL);
15570	dtrace_probeid_end = dtrace_probe_create((dtrace_provider_id_t)
15571	    dtrace_provider, NULL, NULL, "END", 0, NULL);
15572	dtrace_probeid_error = dtrace_probe_create((dtrace_provider_id_t)
15573	    dtrace_provider, NULL, NULL, "ERROR", 1, NULL);
15574
15575	dtrace_anon_property();
15576	mutex_exit(&cpu_lock);
15577
15578	/*
15579	 * If DTrace helper tracing is enabled, we need to allocate the
15580	 * trace buffer and initialize the values.
15581	 */
15582	if (dtrace_helptrace_enabled) {
15583		ASSERT(dtrace_helptrace_buffer == NULL);
15584		dtrace_helptrace_buffer =
15585		    kmem_zalloc(dtrace_helptrace_bufsize, KM_SLEEP);
15586		dtrace_helptrace_next = 0;
15587	}
15588
	/*
	 * If there are already providers, we must ask them to provide their
	 * probes, and then match any anonymous enabling against them.  Note
	 * that the only retained enabling at this point should be the
	 * anonymous enabling itself.
	 */
15596	if (dtrace_anon.dta_enabling != NULL) {
15597		ASSERT(dtrace_retained == dtrace_anon.dta_enabling);
15598
15599		dtrace_enabling_provide(NULL);
15600		state = dtrace_anon.dta_state;
15601
15602		/*
15603		 * We couldn't hold cpu_lock across the above call to
15604		 * dtrace_enabling_provide(), but we must hold it to actually
15605		 * enable the probes.  We have to drop all of our locks, pick
15606		 * up cpu_lock, and regain our locks before matching the
15607		 * retained anonymous enabling.
15608		 */
15609		mutex_exit(&dtrace_lock);
15610		mutex_exit(&dtrace_provider_lock);
15611
15612		mutex_enter(&cpu_lock);
15613		mutex_enter(&dtrace_provider_lock);
15614		mutex_enter(&dtrace_lock);
15615
15616		if ((enab = dtrace_anon.dta_enabling) != NULL)
15617			(void) dtrace_enabling_match(enab, NULL);
15618
15619		mutex_exit(&cpu_lock);
15620	}
15621
15622	mutex_exit(&dtrace_lock);
15623	mutex_exit(&dtrace_provider_lock);
15624
15625	if (state != NULL) {
15626		/*
15627		 * If we created any anonymous state, set it going now.
15628		 */
15629		(void) dtrace_state_go(state, &dtrace_anon.dta_beganon);
15630	}
15631
15632	return (DDI_SUCCESS);
15633}
15634#endif
15635
15636#if !defined(sun)
15637#if __FreeBSD_version >= 800039
15638static void
15639dtrace_dtr(void *data __unused)
15640{
15641}
15642#endif
15643#endif
15644
15645/*ARGSUSED*/
15646static int
15647#if defined(sun)
15648dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
15649#else
15650dtrace_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
15651#endif
15652{
15653	dtrace_state_t *state;
15654	uint32_t priv;
15655	uid_t uid;
15656	zoneid_t zoneid;
15657
15658#if defined(sun)
15659	if (getminor(*devp) == DTRACEMNRN_HELPER)
15660		return (0);
15661
15662	/*
15663	 * If this wasn't an open with the "helper" minor, then it must be
15664	 * the "dtrace" minor.
15665	 */
15666	ASSERT(getminor(*devp) == DTRACEMNRN_DTRACE);
15667#else
15668	cred_t *cred_p = NULL;
15669
15670#if __FreeBSD_version < 800039
	/*
	 * The first minor device is the one that is cloned, so there is
	 * nothing more to do here.
	 */
15675	if (dev2unit(dev) == 0)
15676		return 0;
15677
15678	/*
15679	 * Devices are cloned, so if the DTrace state has already
15680	 * been allocated, that means this device belongs to a
15681	 * different client. Each client should open '/dev/dtrace'
15682	 * to get a cloned device.
15683	 */
15684	if (dev->si_drv1 != NULL)
15685		return (EBUSY);
15686#endif
15687
15688	cred_p = dev->si_cred;
15689#endif
15690
15691	/*
15692	 * If no DTRACE_PRIV_* bits are set in the credential, then the
15693	 * caller lacks sufficient permission to do anything with DTrace.
15694	 */
15695	dtrace_cred2priv(cred_p, &priv, &uid, &zoneid);
15696	if (priv == DTRACE_PRIV_NONE) {
15697#if !defined(sun)
15698#if __FreeBSD_version < 800039
15699		/* Destroy the cloned device. */
		destroy_dev(dev);
15701#endif
15702#endif
15703
15704		return (EACCES);
15705	}
15706
15707	/*
15708	 * Ask all providers to provide all their probes.
15709	 */
15710	mutex_enter(&dtrace_provider_lock);
15711	dtrace_probe_provide(NULL, NULL);
15712	mutex_exit(&dtrace_provider_lock);
15713
15714	mutex_enter(&cpu_lock);
15715	mutex_enter(&dtrace_lock);
15716	dtrace_opens++;
15717	dtrace_membar_producer();
15718
15719#if defined(sun)
15720	/*
15721	 * If the kernel debugger is active (that is, if the kernel debugger
15722	 * modified text in some way), we won't allow the open.
15723	 */
15724	if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
15725		dtrace_opens--;
15726		mutex_exit(&cpu_lock);
15727		mutex_exit(&dtrace_lock);
15728		return (EBUSY);
15729	}
15730
15731	state = dtrace_state_create(devp, cred_p);
15732#else
15733	state = dtrace_state_create(dev);
15734#if __FreeBSD_version < 800039
15735	dev->si_drv1 = state;
15736#else
15737	devfs_set_cdevpriv(state, dtrace_dtr);
15738#endif
15739	/* This code actually belongs in dtrace_attach() */
15740	if (dtrace_opens == 1)
15741		dtrace_taskq = taskq_create("dtrace_taskq", 1, maxclsyspri,
15742		    1, INT_MAX, 0);
15743#endif
15744
15745	mutex_exit(&cpu_lock);
15746
15747	if (state == NULL) {
15748#if defined(sun)
15749		if (--dtrace_opens == 0)
15750			(void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
15751#else
15752		--dtrace_opens;
15753#endif
15754		mutex_exit(&dtrace_lock);
15755#if !defined(sun)
15756#if __FreeBSD_version < 800039
15757		/* Destroy the cloned device. */
		destroy_dev(dev);
15759#endif
15760#endif
15761		return (EAGAIN);
15762	}
15763
15764	mutex_exit(&dtrace_lock);
15765
15766	return (0);
15767}
15768
15769/*ARGSUSED*/
15770static int
15771#if defined(sun)
15772dtrace_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
15773#else
15774dtrace_close(struct cdev *dev, int flags, int fmt __unused, struct thread *td)
15775#endif
15776{
15777#if defined(sun)
15778	minor_t minor = getminor(dev);
15779	dtrace_state_t *state;
15780
15781	if (minor == DTRACEMNRN_HELPER)
15782		return (0);
15783
15784	state = ddi_get_soft_state(dtrace_softstate, minor);
15785#else
15786#if __FreeBSD_version < 800039
15787	dtrace_state_t *state = dev->si_drv1;
15788
15789	/* Check if this is not a cloned device. */
15790	if (dev2unit(dev) == 0)
15791		return (0);
15792#else
15793	dtrace_state_t *state;
15794	devfs_get_cdevpriv((void **) &state);
15795#endif
15796
15797#endif
15798
15799	mutex_enter(&cpu_lock);
15800	mutex_enter(&dtrace_lock);
15801
15802	if (state != NULL) {
15803		if (state->dts_anon) {
15804			/*
15805			 * There is anonymous state. Destroy that first.
15806			 */
15807			ASSERT(dtrace_anon.dta_state == NULL);
15808			dtrace_state_destroy(state->dts_anon);
15809		}
15810
15811		dtrace_state_destroy(state);
15812
15813#if !defined(sun)
15814		kmem_free(state, 0);
15815#if __FreeBSD_version < 800039
15816		dev->si_drv1 = NULL;
15817#endif
15818#endif
15819	}
15820
15821	ASSERT(dtrace_opens > 0);
15822#if defined(sun)
15823	if (--dtrace_opens == 0)
15824		(void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
15825#else
15826	--dtrace_opens;
15827	/* This code actually belongs in dtrace_detach() */
15828	if ((dtrace_opens == 0) && (dtrace_taskq != NULL)) {
15829		taskq_destroy(dtrace_taskq);
15830		dtrace_taskq = NULL;
15831	}
15832#endif
15833
15834	mutex_exit(&dtrace_lock);
15835	mutex_exit(&cpu_lock);
15836
15837#if __FreeBSD_version < 800039
15838	/* Schedule this cloned device to be destroyed. */
15839	destroy_dev_sched(dev);
15840#endif
15841
15842	return (0);
15843}
15844
15845#if defined(sun)
15846/*ARGSUSED*/
15847static int
15848dtrace_ioctl_helper(int cmd, intptr_t arg, int *rv)
15849{
15850	int rval;
15851	dof_helper_t help, *dhp = NULL;
15852
15853	switch (cmd) {
15854	case DTRACEHIOC_ADDDOF:
15855		if (copyin((void *)arg, &help, sizeof (help)) != 0) {
15856			dtrace_dof_error(NULL, "failed to copyin DOF helper");
15857			return (EFAULT);
15858		}
15859
15860		dhp = &help;
15861		arg = (intptr_t)help.dofhp_dof;
15862		/*FALLTHROUGH*/
15863
15864	case DTRACEHIOC_ADD: {
15865		dof_hdr_t *dof = dtrace_dof_copyin(arg, &rval);
15866
15867		if (dof == NULL)
15868			return (rval);
15869
15870		mutex_enter(&dtrace_lock);
15871
15872		/*
15873		 * dtrace_helper_slurp() takes responsibility for the dof --
15874		 * it may free it now or it may save it and free it later.
15875		 */
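		/*
		 * On success, dtrace_helper_slurp() returns the generation
		 * number of the newly-added helpers; it is handed back to
		 * the caller through *rv.
		 */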
15876		if ((rval = dtrace_helper_slurp(dof, dhp)) != -1) {
15877			*rv = rval;
15878			rval = 0;
15879		} else {
15880			rval = EINVAL;
15881		}
15882
15883		mutex_exit(&dtrace_lock);
15884		return (rval);
15885	}
15886
15887	case DTRACEHIOC_REMOVE: {
15888		mutex_enter(&dtrace_lock);
15889		rval = dtrace_helper_destroygen(arg);
15890		mutex_exit(&dtrace_lock);
15891
15892		return (rval);
15893	}
15894
15895	default:
15896		break;
15897	}
15898
15899	return (ENOTTY);
15900}
15901
15902/*ARGSUSED*/
15903static int
15904dtrace_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv)
15905{
15906	minor_t minor = getminor(dev);
15907	dtrace_state_t *state;
15908	int rval;
15909
15910	if (minor == DTRACEMNRN_HELPER)
15911		return (dtrace_ioctl_helper(cmd, arg, rv));
15912
15913	state = ddi_get_soft_state(dtrace_softstate, minor);
15914
15915	if (state->dts_anon) {
15916		ASSERT(dtrace_anon.dta_state == NULL);
15917		state = state->dts_anon;
15918	}
15919
15920	switch (cmd) {
15921	case DTRACEIOC_PROVIDER: {
15922		dtrace_providerdesc_t pvd;
15923		dtrace_provider_t *pvp;
15924
15925		if (copyin((void *)arg, &pvd, sizeof (pvd)) != 0)
15926			return (EFAULT);
15927
15928		pvd.dtvd_name[DTRACE_PROVNAMELEN - 1] = '\0';
15929		mutex_enter(&dtrace_provider_lock);
15930
15931		for (pvp = dtrace_provider; pvp != NULL; pvp = pvp->dtpv_next) {
15932			if (strcmp(pvp->dtpv_name, pvd.dtvd_name) == 0)
15933				break;
15934		}
15935
15936		mutex_exit(&dtrace_provider_lock);
15937
15938		if (pvp == NULL)
15939			return (ESRCH);
15940
15941		bcopy(&pvp->dtpv_priv, &pvd.dtvd_priv, sizeof (dtrace_ppriv_t));
15942		bcopy(&pvp->dtpv_attr, &pvd.dtvd_attr, sizeof (dtrace_pattr_t));
15943
15944		if (copyout(&pvd, (void *)arg, sizeof (pvd)) != 0)
15945			return (EFAULT);
15946
15947		return (0);
15948	}
15949
15950	case DTRACEIOC_EPROBE: {
15951		dtrace_eprobedesc_t epdesc;
15952		dtrace_ecb_t *ecb;
15953		dtrace_action_t *act;
15954		void *buf;
15955		size_t size;
15956		uintptr_t dest;
15957		int nrecs;
15958
15959		if (copyin((void *)arg, &epdesc, sizeof (epdesc)) != 0)
15960			return (EFAULT);
15961
15962		mutex_enter(&dtrace_lock);
15963
15964		if ((ecb = dtrace_epid2ecb(state, epdesc.dtepd_epid)) == NULL) {
15965			mutex_exit(&dtrace_lock);
15966			return (EINVAL);
15967		}
15968
15969		if (ecb->dte_probe == NULL) {
15970			mutex_exit(&dtrace_lock);
15971			return (EINVAL);
15972		}
15973
15974		epdesc.dtepd_probeid = ecb->dte_probe->dtpr_id;
15975		epdesc.dtepd_uarg = ecb->dte_uarg;
15976		epdesc.dtepd_size = ecb->dte_size;
15977
15978		nrecs = epdesc.dtepd_nrecs;
15979		epdesc.dtepd_nrecs = 0;
15980		for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
15981			if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
15982				continue;
15983
15984			epdesc.dtepd_nrecs++;
15985		}
15986
15987		/*
15988		 * Now that we have the size, we need to allocate a temporary
15989		 * buffer in which to store the complete description.  We need
15990		 * the temporary buffer to be able to drop dtrace_lock()
15991		 * across the copyout(), below.
15992		 */
15993		size = sizeof (dtrace_eprobedesc_t) +
15994		    (epdesc.dtepd_nrecs * sizeof (dtrace_recdesc_t));
15995
15996		buf = kmem_alloc(size, KM_SLEEP);
15997		dest = (uintptr_t)buf;
15998
15999		bcopy(&epdesc, (void *)dest, sizeof (epdesc));
16000		dest += offsetof(dtrace_eprobedesc_t, dtepd_rec[0]);
16001
16002		for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
16003			if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
16004				continue;
16005
16006			if (nrecs-- == 0)
16007				break;
16008
16009			bcopy(&act->dta_rec, (void *)dest,
16010			    sizeof (dtrace_recdesc_t));
16011			dest += sizeof (dtrace_recdesc_t);
16012		}
16013
16014		mutex_exit(&dtrace_lock);
16015
16016		if (copyout(buf, (void *)arg, dest - (uintptr_t)buf) != 0) {
16017			kmem_free(buf, size);
16018			return (EFAULT);
16019		}
16020
16021		kmem_free(buf, size);
16022		return (0);
16023	}
16024
16025	case DTRACEIOC_AGGDESC: {
16026		dtrace_aggdesc_t aggdesc;
16027		dtrace_action_t *act;
16028		dtrace_aggregation_t *agg;
16029		int nrecs;
16030		uint32_t offs;
16031		dtrace_recdesc_t *lrec;
16032		void *buf;
16033		size_t size;
16034		uintptr_t dest;
16035
16036		if (copyin((void *)arg, &aggdesc, sizeof (aggdesc)) != 0)
16037			return (EFAULT);
16038
16039		mutex_enter(&dtrace_lock);
16040
16041		if ((agg = dtrace_aggid2agg(state, aggdesc.dtagd_id)) == NULL) {
16042			mutex_exit(&dtrace_lock);
16043			return (EINVAL);
16044		}
16045
16046		aggdesc.dtagd_epid = agg->dtag_ecb->dte_epid;
16047
16048		nrecs = aggdesc.dtagd_nrecs;
16049		aggdesc.dtagd_nrecs = 0;
16050
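		/*
		 * The size of the description runs from the base of the
		 * aggregation to the end of its last record.
		 */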
16051		offs = agg->dtag_base;
16052		lrec = &agg->dtag_action.dta_rec;
16053		aggdesc.dtagd_size = lrec->dtrd_offset + lrec->dtrd_size - offs;
16054
16055		for (act = agg->dtag_first; ; act = act->dta_next) {
16056			ASSERT(act->dta_intuple ||
16057			    DTRACEACT_ISAGG(act->dta_kind));
16058
16059			/*
16060			 * If this action has a record size of zero, it
16061			 * denotes an argument to the aggregating action.
16062			 * Because the presence of this record doesn't (or
16063			 * shouldn't) affect the way the data is interpreted,
			 * we don't copy it out, sparing user level the
			 * confusion of dealing with a zero-length record.
16066			 */
16067			if (act->dta_rec.dtrd_size == 0) {
16068				ASSERT(agg->dtag_hasarg);
16069				continue;
16070			}
16071
16072			aggdesc.dtagd_nrecs++;
16073
16074			if (act == &agg->dtag_action)
16075				break;
16076		}
16077
16078		/*
16079		 * Now that we have the size, we need to allocate a temporary
16080		 * buffer in which to store the complete description.  We need
16081		 * the temporary buffer to be able to drop dtrace_lock()
16082		 * across the copyout(), below.
16083		 */
16084		size = sizeof (dtrace_aggdesc_t) +
16085		    (aggdesc.dtagd_nrecs * sizeof (dtrace_recdesc_t));
16086
16087		buf = kmem_alloc(size, KM_SLEEP);
16088		dest = (uintptr_t)buf;
16089
16090		bcopy(&aggdesc, (void *)dest, sizeof (aggdesc));
16091		dest += offsetof(dtrace_aggdesc_t, dtagd_rec[0]);
16092
16093		for (act = agg->dtag_first; ; act = act->dta_next) {
16094			dtrace_recdesc_t rec = act->dta_rec;
16095
16096			/*
16097			 * See the comment in the above loop for why we pass
16098			 * over zero-length records.
16099			 */
16100			if (rec.dtrd_size == 0) {
16101				ASSERT(agg->dtag_hasarg);
16102				continue;
16103			}
16104
16105			if (nrecs-- == 0)
16106				break;
16107
16108			rec.dtrd_offset -= offs;
16109			bcopy(&rec, (void *)dest, sizeof (rec));
16110			dest += sizeof (dtrace_recdesc_t);
16111
16112			if (act == &agg->dtag_action)
16113				break;
16114		}
16115
16116		mutex_exit(&dtrace_lock);
16117
16118		if (copyout(buf, (void *)arg, dest - (uintptr_t)buf) != 0) {
16119			kmem_free(buf, size);
16120			return (EFAULT);
16121		}
16122
16123		kmem_free(buf, size);
16124		return (0);
16125	}
16126
16127	case DTRACEIOC_ENABLE: {
16128		dof_hdr_t *dof;
16129		dtrace_enabling_t *enab = NULL;
16130		dtrace_vstate_t *vstate;
16131		int err = 0;
16132
16133		*rv = 0;
16134
16135		/*
16136		 * If a NULL argument has been passed, we take this as our
16137		 * cue to reevaluate our enablings.
16138		 */
16139		if (arg == NULL) {
16140			dtrace_enabling_matchall();
16141
16142			return (0);
16143		}
16144
16145		if ((dof = dtrace_dof_copyin(arg, &rval)) == NULL)
16146			return (rval);
16147
16148		mutex_enter(&cpu_lock);
16149		mutex_enter(&dtrace_lock);
16150		vstate = &state->dts_vstate;
16151
16152		if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
16153			mutex_exit(&dtrace_lock);
16154			mutex_exit(&cpu_lock);
16155			dtrace_dof_destroy(dof);
16156			return (EBUSY);
16157		}
16158
16159		if (dtrace_dof_slurp(dof, vstate, cr, &enab, 0, B_TRUE) != 0) {
16160			mutex_exit(&dtrace_lock);
16161			mutex_exit(&cpu_lock);
16162			dtrace_dof_destroy(dof);
16163			return (EINVAL);
16164		}
16165
16166		if ((rval = dtrace_dof_options(dof, state)) != 0) {
16167			dtrace_enabling_destroy(enab);
16168			mutex_exit(&dtrace_lock);
16169			mutex_exit(&cpu_lock);
16170			dtrace_dof_destroy(dof);
16171			return (rval);
16172		}
16173
16174		if ((err = dtrace_enabling_match(enab, rv)) == 0) {
16175			err = dtrace_enabling_retain(enab);
16176		} else {
16177			dtrace_enabling_destroy(enab);
16178		}
16179
16180		mutex_exit(&cpu_lock);
16181		mutex_exit(&dtrace_lock);
16182		dtrace_dof_destroy(dof);
16183
16184		return (err);
16185	}
16186
16187	case DTRACEIOC_REPLICATE: {
16188		dtrace_repldesc_t desc;
16189		dtrace_probedesc_t *match = &desc.dtrpd_match;
16190		dtrace_probedesc_t *create = &desc.dtrpd_create;
16191		int err;
16192
16193		if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
16194			return (EFAULT);
16195
16196		match->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
16197		match->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
16198		match->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
16199		match->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
16200
16201		create->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
16202		create->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
16203		create->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
16204		create->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
16205
16206		mutex_enter(&dtrace_lock);
16207		err = dtrace_enabling_replicate(state, match, create);
16208		mutex_exit(&dtrace_lock);
16209
16210		return (err);
16211	}
16212
16213	case DTRACEIOC_PROBEMATCH:
16214	case DTRACEIOC_PROBES: {
16215		dtrace_probe_t *probe = NULL;
16216		dtrace_probedesc_t desc;
16217		dtrace_probekey_t pkey;
16218		dtrace_id_t i;
16219		int m = 0;
16220		uint32_t priv;
16221		uid_t uid;
16222		zoneid_t zoneid;
16223
16224		if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
16225			return (EFAULT);
16226
16227		desc.dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
16228		desc.dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
16229		desc.dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
16230		desc.dtpd_name[DTRACE_NAMELEN - 1] = '\0';
16231
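		/*
		 * dtpd_id acts as a cursor:  a consumer enumerates probes by
		 * invoking this ioctl repeatedly, each time passing an id one
		 * greater than that of the probe most recently returned,
		 * until ESRCH indicates that the namespace is exhausted.
		 */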
16232		/*
16233		 * Before we attempt to match this probe, we want to give
16234		 * all providers the opportunity to provide it.
16235		 */
16236		if (desc.dtpd_id == DTRACE_IDNONE) {
16237			mutex_enter(&dtrace_provider_lock);
16238			dtrace_probe_provide(&desc, NULL);
16239			mutex_exit(&dtrace_provider_lock);
16240			desc.dtpd_id++;
16241		}
16242
16243		if (cmd == DTRACEIOC_PROBEMATCH)  {
16244			dtrace_probekey(&desc, &pkey);
16245			pkey.dtpk_id = DTRACE_IDNONE;
16246		}
16247
16248		dtrace_cred2priv(cr, &priv, &uid, &zoneid);
16249
16250		mutex_enter(&dtrace_lock);
16251
16252		if (cmd == DTRACEIOC_PROBEMATCH) {
16253			for (i = desc.dtpd_id; i <= dtrace_nprobes; i++) {
16254				if ((probe = dtrace_probes[i - 1]) != NULL &&
16255				    (m = dtrace_match_probe(probe, &pkey,
16256				    priv, uid, zoneid)) != 0)
16257					break;
16258			}
16259
16260			if (m < 0) {
16261				mutex_exit(&dtrace_lock);
16262				return (EINVAL);
16263			}
16264
16265		} else {
16266			for (i = desc.dtpd_id; i <= dtrace_nprobes; i++) {
16267				if ((probe = dtrace_probes[i - 1]) != NULL &&
16268				    dtrace_match_priv(probe, priv, uid, zoneid))
16269					break;
16270			}
16271		}
16272
16273		if (probe == NULL) {
16274			mutex_exit(&dtrace_lock);
16275			return (ESRCH);
16276		}
16277
16278		dtrace_probe_description(probe, &desc);
16279		mutex_exit(&dtrace_lock);
16280
16281		if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
16282			return (EFAULT);
16283
16284		return (0);
16285	}
16286
16287	case DTRACEIOC_PROBEARG: {
16288		dtrace_argdesc_t desc;
16289		dtrace_probe_t *probe;
16290		dtrace_provider_t *prov;
16291
16292		if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
16293			return (EFAULT);
16294
16295		if (desc.dtargd_id == DTRACE_IDNONE)
16296			return (EINVAL);
16297
16298		if (desc.dtargd_ndx == DTRACE_ARGNONE)
16299			return (EINVAL);
16300
16301		mutex_enter(&dtrace_provider_lock);
16302		mutex_enter(&mod_lock);
16303		mutex_enter(&dtrace_lock);
16304
16305		if (desc.dtargd_id > dtrace_nprobes) {
16306			mutex_exit(&dtrace_lock);
16307			mutex_exit(&mod_lock);
16308			mutex_exit(&dtrace_provider_lock);
16309			return (EINVAL);
16310		}
16311
16312		if ((probe = dtrace_probes[desc.dtargd_id - 1]) == NULL) {
16313			mutex_exit(&dtrace_lock);
16314			mutex_exit(&mod_lock);
16315			mutex_exit(&dtrace_provider_lock);
16316			return (EINVAL);
16317		}
16318
16319		mutex_exit(&dtrace_lock);
16320
16321		prov = probe->dtpr_provider;
16322
16323		if (prov->dtpv_pops.dtps_getargdesc == NULL) {
16324			/*
16325			 * There isn't any typed information for this probe.
16326			 * Set the argument number to DTRACE_ARGNONE.
16327			 */
16328			desc.dtargd_ndx = DTRACE_ARGNONE;
16329		} else {
16330			desc.dtargd_native[0] = '\0';
16331			desc.dtargd_xlate[0] = '\0';
16332			desc.dtargd_mapping = desc.dtargd_ndx;
16333
16334			prov->dtpv_pops.dtps_getargdesc(prov->dtpv_arg,
16335			    probe->dtpr_id, probe->dtpr_arg, &desc);
16336		}
16337
16338		mutex_exit(&mod_lock);
16339		mutex_exit(&dtrace_provider_lock);
16340
16341		if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
16342			return (EFAULT);
16343
16344		return (0);
16345	}
16346
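	/*
	 * Taken together, these ioctls implement the consumer life cycle.
	 * A minimal session, sketched with error handling omitted and
	 * assuming a DOF buffer "dof" that the consumer has built:
	 *
	 *	fd = open("/dev/dtrace", O_RDONLY);
	 *	(void) ioctl(fd, DTRACEIOC_ENABLE, dof);
	 *	(void) ioctl(fd, DTRACEIOC_GO, &cpuid);
	 *	(void) ioctl(fd, DTRACEIOC_BUFSNAP, &desc);
	 *	(void) ioctl(fd, DTRACEIOC_STOP, &cpuid);
	 */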
16347	case DTRACEIOC_GO: {
16348		processorid_t cpuid;
16349		rval = dtrace_state_go(state, &cpuid);
16350
16351		if (rval != 0)
16352			return (rval);
16353
16354		if (copyout(&cpuid, (void *)arg, sizeof (cpuid)) != 0)
16355			return (EFAULT);
16356
16357		return (0);
16358	}
16359
16360	case DTRACEIOC_STOP: {
16361		processorid_t cpuid;
16362
16363		mutex_enter(&dtrace_lock);
16364		rval = dtrace_state_stop(state, &cpuid);
16365		mutex_exit(&dtrace_lock);
16366
16367		if (rval != 0)
16368			return (rval);
16369
16370		if (copyout(&cpuid, (void *)arg, sizeof (cpuid)) != 0)
16371			return (EFAULT);
16372
16373		return (0);
16374	}
16375
16376	case DTRACEIOC_DOFGET: {
16377		dof_hdr_t hdr, *dof;
16378		uint64_t len;
16379
16380		if (copyin((void *)arg, &hdr, sizeof (hdr)) != 0)
16381			return (EFAULT);
16382
16383		mutex_enter(&dtrace_lock);
16384		dof = dtrace_dof_create(state);
16385		mutex_exit(&dtrace_lock);
16386
16387		len = MIN(hdr.dofh_loadsz, dof->dofh_loadsz);
16388		rval = copyout(dof, (void *)arg, len);
16389		dtrace_dof_destroy(dof);
16390
16391		return (rval == 0 ? 0 : EFAULT);
16392	}
16393
16394	case DTRACEIOC_AGGSNAP:
16395	case DTRACEIOC_BUFSNAP: {
16396		dtrace_bufdesc_t desc;
16397		caddr_t cached;
16398		dtrace_buffer_t *buf;
16399
16400		if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
16401			return (EFAULT);
16402
16403		if (desc.dtbd_cpu < 0 || desc.dtbd_cpu >= NCPU)
16404			return (EINVAL);
16405
16406		mutex_enter(&dtrace_lock);
16407
16408		if (cmd == DTRACEIOC_BUFSNAP) {
16409			buf = &state->dts_buffer[desc.dtbd_cpu];
16410		} else {
16411			buf = &state->dts_aggbuffer[desc.dtbd_cpu];
16412		}
16413
16414		if (buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL)) {
16415			size_t sz = buf->dtb_offset;
16416
16417			if (state->dts_activity != DTRACE_ACTIVITY_STOPPED) {
16418				mutex_exit(&dtrace_lock);
16419				return (EBUSY);
16420			}
16421
16422			/*
16423			 * If this buffer has already been consumed, we're
16424			 * going to indicate that there's nothing left here
16425			 * to consume.
16426			 */
16427			if (buf->dtb_flags & DTRACEBUF_CONSUMED) {
16428				mutex_exit(&dtrace_lock);
16429
16430				desc.dtbd_size = 0;
16431				desc.dtbd_drops = 0;
16432				desc.dtbd_errors = 0;
16433				desc.dtbd_oldest = 0;
16434				sz = sizeof (desc);
16435
16436				if (copyout(&desc, (void *)arg, sz) != 0)
16437					return (EFAULT);
16438
16439				return (0);
16440			}
16441
16442			/*
16443			 * If this is a ring buffer that has wrapped, we want
16444			 * to copy the whole thing out.
16445			 */
16446			if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
16447				dtrace_buffer_polish(buf);
16448				sz = buf->dtb_size;
16449			}
16450
16451			if (copyout(buf->dtb_tomax, desc.dtbd_data, sz) != 0) {
16452				mutex_exit(&dtrace_lock);
16453				return (EFAULT);
16454			}
16455
16456			desc.dtbd_size = sz;
16457			desc.dtbd_drops = buf->dtb_drops;
16458			desc.dtbd_errors = buf->dtb_errors;
16459			desc.dtbd_oldest = buf->dtb_xamot_offset;
16460			desc.dtbd_timestamp = dtrace_gethrtime();
16461
16462			mutex_exit(&dtrace_lock);
16463
16464			if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
16465				return (EFAULT);
16466
16467			buf->dtb_flags |= DTRACEBUF_CONSUMED;
16468
16469			return (0);
16470		}
16471
16472		if (buf->dtb_tomax == NULL) {
16473			ASSERT(buf->dtb_xamot == NULL);
16474			mutex_exit(&dtrace_lock);
16475			return (ENOENT);
16476		}
16477
16478		cached = buf->dtb_tomax;
16479		ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
16480
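		/*
		 * Switch the active (dtb_tomax) and inactive (dtb_xamot)
		 * buffers on the target CPU itself; if the switch took
		 * place, the snapshot to copy out is whatever now sits in
		 * dtb_xamot.
		 */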
16481		dtrace_xcall(desc.dtbd_cpu,
16482		    (dtrace_xcall_t)dtrace_buffer_switch, buf);
16483
16484		state->dts_errors += buf->dtb_xamot_errors;
16485
16486		/*
16487		 * If the buffers did not actually switch, then the cross call
16488		 * did not take place -- presumably because the given CPU is
16489		 * not in the ready set.  If this is the case, we'll return
16490		 * ENOENT.
16491		 */
16492		if (buf->dtb_tomax == cached) {
16493			ASSERT(buf->dtb_xamot != cached);
16494			mutex_exit(&dtrace_lock);
16495			return (ENOENT);
16496		}
16497
16498		ASSERT(cached == buf->dtb_xamot);
16499
16500		/*
16501		 * We have our snapshot; now copy it out.
16502		 */
16503		if (copyout(buf->dtb_xamot, desc.dtbd_data,
16504		    buf->dtb_xamot_offset) != 0) {
16505			mutex_exit(&dtrace_lock);
16506			return (EFAULT);
16507		}
16508
16509		desc.dtbd_size = buf->dtb_xamot_offset;
16510		desc.dtbd_drops = buf->dtb_xamot_drops;
16511		desc.dtbd_errors = buf->dtb_xamot_errors;
16512		desc.dtbd_oldest = 0;
16513		desc.dtbd_timestamp = buf->dtb_switched;
16514
16515		mutex_exit(&dtrace_lock);
16516
16517		/*
16518		 * Finally, copy out the buffer description.
16519		 */
16520		if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
16521			return (EFAULT);
16522
16523		return (0);
16524	}
16525
16526	case DTRACEIOC_CONF: {
16527		dtrace_conf_t conf;
16528
16529		bzero(&conf, sizeof (conf));
16530		conf.dtc_difversion = DIF_VERSION;
16531		conf.dtc_difintregs = DIF_DIR_NREGS;
16532		conf.dtc_diftupregs = DIF_DTR_NREGS;
16533		conf.dtc_ctfmodel = CTF_MODEL_NATIVE;
16534
16535		if (copyout(&conf, (void *)arg, sizeof (conf)) != 0)
16536			return (EFAULT);
16537
16538		return (0);
16539	}
16540
16541	case DTRACEIOC_STATUS: {
16542		dtrace_status_t stat;
16543		dtrace_dstate_t *dstate;
16544		int i, j;
16545		uint64_t nerrs;
16546
16547		/*
16548		 * See the comment in dtrace_state_deadman() for the reason
16549		 * for setting dts_laststatus to INT64_MAX before setting
16550		 * it to the correct value.
16551		 */
16552		state->dts_laststatus = INT64_MAX;
16553		dtrace_membar_producer();
16554		state->dts_laststatus = dtrace_gethrtime();
16555
16556		bzero(&stat, sizeof (stat));
16557
16558		mutex_enter(&dtrace_lock);
16559
16560		if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE) {
16561			mutex_exit(&dtrace_lock);
16562			return (ENOENT);
16563		}
16564
16565		if (state->dts_activity == DTRACE_ACTIVITY_DRAINING)
16566			stat.dtst_exiting = 1;
16567
16568		nerrs = state->dts_errors;
16569		dstate = &state->dts_vstate.dtvs_dynvars;
16570
16571		for (i = 0; i < NCPU; i++) {
16572			dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[i];
16573
16574			stat.dtst_dyndrops += dcpu->dtdsc_drops;
16575			stat.dtst_dyndrops_dirty += dcpu->dtdsc_dirty_drops;
16576			stat.dtst_dyndrops_rinsing += dcpu->dtdsc_rinsing_drops;
16577
16578			if (state->dts_buffer[i].dtb_flags & DTRACEBUF_FULL)
16579				stat.dtst_filled++;
16580
16581			nerrs += state->dts_buffer[i].dtb_errors;
16582
16583			for (j = 0; j < state->dts_nspeculations; j++) {
16584				dtrace_speculation_t *spec;
16585				dtrace_buffer_t *buf;
16586
16587				spec = &state->dts_speculations[j];
16588				buf = &spec->dtsp_buffer[i];
16589				stat.dtst_specdrops += buf->dtb_xamot_drops;
16590			}
16591		}
16592
16593		stat.dtst_specdrops_busy = state->dts_speculations_busy;
16594		stat.dtst_specdrops_unavail = state->dts_speculations_unavail;
16595		stat.dtst_stkstroverflows = state->dts_stkstroverflows;
16596		stat.dtst_dblerrors = state->dts_dblerrors;
16597		stat.dtst_killed =
16598		    (state->dts_activity == DTRACE_ACTIVITY_KILLED);
16599		stat.dtst_errors = nerrs;
16600
16601		mutex_exit(&dtrace_lock);
16602
16603		if (copyout(&stat, (void *)arg, sizeof (stat)) != 0)
16604			return (EFAULT);
16605
16606		return (0);
16607	}
16608
16609	case DTRACEIOC_FORMAT: {
16610		dtrace_fmtdesc_t fmt;
16611		char *str;
16612		int len;
16613
16614		if (copyin((void *)arg, &fmt, sizeof (fmt)) != 0)
16615			return (EFAULT);
16616
16617		mutex_enter(&dtrace_lock);
16618
16619		if (fmt.dtfd_format == 0 ||
16620		    fmt.dtfd_format > state->dts_nformats) {
16621			mutex_exit(&dtrace_lock);
16622			return (EINVAL);
16623		}
16624
16625		/*
16626		 * Format strings are allocated contiguously and they are
16627		 * never freed; if a format index is less than the number
16628		 * of formats, we can assert that the format map is non-NULL
16629		 * and that the format for the specified index is non-NULL.
16630		 */
16631		ASSERT(state->dts_formats != NULL);
16632		str = state->dts_formats[fmt.dtfd_format - 1];
16633		ASSERT(str != NULL);
16634
16635		len = strlen(str) + 1;
16636
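		/*
		 * A consumer typically calls this ioctl twice:  first with a
		 * dtfd_length that is too small, which merely reports the
		 * required length, and then again with a suitably sized
		 * buffer to retrieve the format string itself.
		 */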
16637		if (len > fmt.dtfd_length) {
16638			fmt.dtfd_length = len;
16639
16640			if (copyout(&fmt, (void *)arg, sizeof (fmt)) != 0) {
16641				mutex_exit(&dtrace_lock);
16642				return (EINVAL);
16643			}
16644		} else {
16645			if (copyout(str, fmt.dtfd_string, len) != 0) {
16646				mutex_exit(&dtrace_lock);
16647				return (EINVAL);
16648			}
16649		}
16650
16651		mutex_exit(&dtrace_lock);
16652		return (0);
16653	}
16654
16655	default:
16656		break;
16657	}
16658
16659	return (ENOTTY);
16660}
16661
16662/*ARGSUSED*/
16663static int
16664dtrace_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
16665{
16666	dtrace_state_t *state;
16667
16668	switch (cmd) {
16669	case DDI_DETACH:
16670		break;
16671
16672	case DDI_SUSPEND:
16673		return (DDI_SUCCESS);
16674
16675	default:
16676		return (DDI_FAILURE);
16677	}
16678
16679	mutex_enter(&cpu_lock);
16680	mutex_enter(&dtrace_provider_lock);
16681	mutex_enter(&dtrace_lock);
16682
16683	ASSERT(dtrace_opens == 0);
16684
16685	if (dtrace_helpers > 0) {
16686		mutex_exit(&dtrace_provider_lock);
16687		mutex_exit(&dtrace_lock);
16688		mutex_exit(&cpu_lock);
16689		return (DDI_FAILURE);
16690	}
16691
16692	if (dtrace_unregister((dtrace_provider_id_t)dtrace_provider) != 0) {
16693		mutex_exit(&dtrace_provider_lock);
16694		mutex_exit(&dtrace_lock);
16695		mutex_exit(&cpu_lock);
16696		return (DDI_FAILURE);
16697	}
16698
16699	dtrace_provider = NULL;
16700
16701	if ((state = dtrace_anon_grab()) != NULL) {
		/*
		 * If there were ECBs on this state, the provider should not
		 * have been allowed to detach; assert that there are none.
		 */
16707		ASSERT(state->dts_necbs == 0);
16708		dtrace_state_destroy(state);
16709
16710		/*
16711		 * If we're being detached with anonymous state, we need to
16712		 * indicate to the kernel debugger that DTrace is now inactive.
16713		 */
16714		(void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
16715	}
16716
16717	bzero(&dtrace_anon, sizeof (dtrace_anon_t));
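	/*
	 * Tear down each of the hooks that dtrace_attach() installed.
	 */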
16718	unregister_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
16719	dtrace_cpu_init = NULL;
16720	dtrace_helpers_cleanup = NULL;
16721	dtrace_helpers_fork = NULL;
16722	dtrace_cpustart_init = NULL;
16723	dtrace_cpustart_fini = NULL;
16724	dtrace_debugger_init = NULL;
16725	dtrace_debugger_fini = NULL;
16726	dtrace_modload = NULL;
16727	dtrace_modunload = NULL;
16728
16729	mutex_exit(&cpu_lock);
16730
16731	if (dtrace_helptrace_enabled) {
16732		kmem_free(dtrace_helptrace_buffer, dtrace_helptrace_bufsize);
16733		dtrace_helptrace_buffer = NULL;
16734	}
16735
16736	kmem_free(dtrace_probes, dtrace_nprobes * sizeof (dtrace_probe_t *));
16737	dtrace_probes = NULL;
16738	dtrace_nprobes = 0;
16739
16740	dtrace_hash_destroy(dtrace_bymod);
16741	dtrace_hash_destroy(dtrace_byfunc);
16742	dtrace_hash_destroy(dtrace_byname);
16743	dtrace_bymod = NULL;
16744	dtrace_byfunc = NULL;
16745	dtrace_byname = NULL;
16746
16747	kmem_cache_destroy(dtrace_state_cache);
16748	vmem_destroy(dtrace_minor);
16749	vmem_destroy(dtrace_arena);
16750
16751	if (dtrace_toxrange != NULL) {
16752		kmem_free(dtrace_toxrange,
16753		    dtrace_toxranges_max * sizeof (dtrace_toxrange_t));
16754		dtrace_toxrange = NULL;
16755		dtrace_toxranges = 0;
16756		dtrace_toxranges_max = 0;
16757	}
16758
16759	ddi_remove_minor_node(dtrace_devi, NULL);
16760	dtrace_devi = NULL;
16761
16762	ddi_soft_state_fini(&dtrace_softstate);
16763
16764	ASSERT(dtrace_vtime_references == 0);
16765	ASSERT(dtrace_opens == 0);
16766	ASSERT(dtrace_retained == NULL);
16767
16768	mutex_exit(&dtrace_lock);
16769	mutex_exit(&dtrace_provider_lock);
16770
16771	/*
16772	 * We don't destroy the task queue until after we have dropped our
16773	 * locks (taskq_destroy() may block on running tasks).  To prevent
16774	 * attempting to do work after we have effectively detached but before
16775	 * the task queue has been destroyed, all tasks dispatched via the
16776	 * task queue must check that DTrace is still attached before
16777	 * performing any operation.
16778	 */
16779	taskq_destroy(dtrace_taskq);
16780	dtrace_taskq = NULL;
16781
16782	return (DDI_SUCCESS);
16783}
16784#endif
16785
16786#if defined(sun)
16787/*ARGSUSED*/
16788static int
16789dtrace_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
16790{
16791	int error;
16792
16793	switch (infocmd) {
16794	case DDI_INFO_DEVT2DEVINFO:
16795		*result = (void *)dtrace_devi;
16796		error = DDI_SUCCESS;
16797		break;
16798	case DDI_INFO_DEVT2INSTANCE:
16799		*result = (void *)0;
16800		error = DDI_SUCCESS;
16801		break;
16802	default:
16803		error = DDI_FAILURE;
16804	}
16805	return (error);
16806}
16807#endif
16808
16809#if defined(sun)
16810static struct cb_ops dtrace_cb_ops = {
16811	dtrace_open,		/* open */
16812	dtrace_close,		/* close */
16813	nulldev,		/* strategy */
16814	nulldev,		/* print */
16815	nodev,			/* dump */
16816	nodev,			/* read */
16817	nodev,			/* write */
16818	dtrace_ioctl,		/* ioctl */
16819	nodev,			/* devmap */
16820	nodev,			/* mmap */
16821	nodev,			/* segmap */
16822	nochpoll,		/* poll */
16823	ddi_prop_op,		/* cb_prop_op */
16824	0,			/* streamtab  */
16825	D_NEW | D_MP		/* Driver compatibility flag */
16826};
16827
16828static struct dev_ops dtrace_ops = {
16829	DEVO_REV,		/* devo_rev */
16830	0,			/* refcnt */
16831	dtrace_info,		/* get_dev_info */
16832	nulldev,		/* identify */
16833	nulldev,		/* probe */
16834	dtrace_attach,		/* attach */
16835	dtrace_detach,		/* detach */
16836	nodev,			/* reset */
16837	&dtrace_cb_ops,		/* driver operations */
16838	NULL,			/* bus operations */
16839	nodev			/* dev power */
16840};
16841
16842static struct modldrv modldrv = {
16843	&mod_driverops,		/* module type (this is a pseudo driver) */
16844	"Dynamic Tracing",	/* name of module */
16845	&dtrace_ops,		/* driver ops */
16846};
16847
16848static struct modlinkage modlinkage = {
16849	MODREV_1,
16850	(void *)&modldrv,
16851	NULL
16852};
16853
16854int
16855_init(void)
16856{
16857	return (mod_install(&modlinkage));
16858}
16859
16860int
16861_info(struct modinfo *modinfop)
16862{
16863	return (mod_info(&modlinkage, modinfop));
16864}
16865
16866int
16867_fini(void)
16868{
16869	return (mod_remove(&modlinkage));
16870}
16871#else
16872
16873static d_ioctl_t	dtrace_ioctl;
16874static d_ioctl_t	dtrace_ioctl_helper;
16875static void		dtrace_load(void *);
16876static int		dtrace_unload(void);
16877#if __FreeBSD_version < 800039
static void		dtrace_clone(void *, struct ucred *, char *, int, struct cdev **);
16879static struct clonedevs	*dtrace_clones;		/* Ptr to the array of cloned devices. */
16880static eventhandler_tag	eh_tag;			/* Event handler tag. */
16881#else
16882static struct cdev	*dtrace_dev;
16883static struct cdev	*helper_dev;
16884#endif
16885
16886void dtrace_invop_init(void);
16887void dtrace_invop_uninit(void);
16888
16889static struct cdevsw dtrace_cdevsw = {
16890	.d_version	= D_VERSION,
16891	.d_flags	= D_TRACKCLOSE | D_NEEDMINOR,
16892	.d_close	= dtrace_close,
16893	.d_ioctl	= dtrace_ioctl,
16894	.d_open		= dtrace_open,
16895	.d_name		= "dtrace",
16896};
16897
16898static struct cdevsw helper_cdevsw = {
16899	.d_version	= D_VERSION,
16900	.d_flags	= D_TRACKCLOSE | D_NEEDMINOR,
16901	.d_ioctl	= dtrace_ioctl_helper,
16902	.d_name		= "helper",
16903};
16904
16905#include <dtrace_anon.c>
16906#if __FreeBSD_version < 800039
16907#include <dtrace_clone.c>
16908#endif
16909#include <dtrace_ioctl.c>
16910#include <dtrace_load.c>
16911#include <dtrace_modevent.c>
16912#include <dtrace_sysctl.c>
16913#include <dtrace_unload.c>
16914#include <dtrace_vtime.c>
16915#include <dtrace_hacks.c>
16916#include <dtrace_isa.c>
16917
16918SYSINIT(dtrace_load, SI_SUB_DTRACE, SI_ORDER_FIRST, dtrace_load, NULL);
16919SYSUNINIT(dtrace_unload, SI_SUB_DTRACE, SI_ORDER_FIRST, dtrace_unload, NULL);
16920SYSINIT(dtrace_anon_init, SI_SUB_DTRACE_ANON, SI_ORDER_FIRST, dtrace_anon_init, NULL);
16921
16922DEV_MODULE(dtrace, dtrace_modevent, NULL);
16923MODULE_VERSION(dtrace, 1);
16924MODULE_DEPEND(dtrace, cyclic, 1, 1, 1);
16925MODULE_DEPEND(dtrace, opensolaris, 1, 1, 1);
16926#endif
16927