1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Portions copyright (c) 2011, Joyent, Inc. All rights reserved.
24 */
25
26/*
27 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
28 * Use is subject to license terms.
29 */
30
31/* #pragma ident	"@(#)dtrace.c	1.65	08/07/02 SMI" */
32
33/*
34 * DTrace - Dynamic Tracing for Solaris
35 *
36 * This is the implementation of the Solaris Dynamic Tracing framework
37 * (DTrace).  The user-visible interface to DTrace is described at length in
38 * the "Solaris Dynamic Tracing Guide".  The interfaces between the libdtrace
39 * library, the in-kernel DTrace framework, and the DTrace providers are
40 * described in the block comments in the <sys/dtrace.h> header file.  The
41 * internal architecture of DTrace is described in the block comments in the
42 * <sys/dtrace_impl.h> header file.  The comments contained within the DTrace
43 * implementation very much assume mastery of all of these sources; if one has
44 * an unanswered question about the implementation, one should consult them
45 * first.
46 *
47 * The functions here are ordered roughly as follows:
48 *
49 *   - Probe context functions
50 *   - Probe hashing functions
51 *   - Non-probe context utility functions
52 *   - Matching functions
53 *   - Provider-to-Framework API functions
54 *   - Probe management functions
55 *   - DIF object functions
56 *   - Format functions
57 *   - Predicate functions
58 *   - ECB functions
59 *   - Buffer functions
60 *   - Enabling functions
61 *   - DOF functions
62 *   - Anonymous enabling functions
63 *   - Consumer state functions
64 *   - Helper functions
65 *   - Hook functions
66 *   - Driver cookbook functions
67 *
68 * Each group of functions begins with a block comment labelled the "DTrace
69 * [Group] Functions", allowing one to find each block by searching forward
70 * on capital-f functions.
71 */
72#if !defined(__APPLE__)
73#include <sys/errno.h>
74#include <sys/stat.h>
75#include <sys/modctl.h>
76#include <sys/conf.h>
77#include <sys/systm.h>
78#include <sys/ddi.h>
79#include <sys/sunddi.h>
80#include <sys/cpuvar.h>
81#include <sys/kmem.h>
82#include <sys/strsubr.h>
83#include <sys/sysmacros.h>
84#include <sys/dtrace_impl.h>
85#include <sys/atomic.h>
86#include <sys/cmn_err.h>
87#include <sys/mutex_impl.h>
88#include <sys/rwlock_impl.h>
89#include <sys/ctf_api.h>
90#include <sys/panic.h>
91#include <sys/priv_impl.h>
92#include <sys/policy.h>
93#include <sys/cred_impl.h>
94#include <sys/procfs_isa.h>
95#include <sys/taskq.h>
96#include <sys/mkdev.h>
97#include <sys/kdi.h>
98#include <sys/zone.h>
99#else
100#include <sys/errno.h>
101#include <sys/types.h>
102#include <sys/stat.h>
103#include <sys/conf.h>
104#include <sys/systm.h>
105#include <sys/dtrace_impl.h>
106#include <sys/param.h>
107#include <sys/proc_internal.h>
108#include <sys/ioctl.h>
109#include <sys/fcntl.h>
110#include <miscfs/devfs/devfs.h>
111#include <sys/malloc.h>
112#include <sys/kernel_types.h>
113#include <sys/proc_internal.h>
114#include <sys/uio_internal.h>
115#include <sys/kauth.h>
116#include <vm/pmap.h>
117#include <sys/user.h>
118#include <mach/exception_types.h>
119#include <sys/signalvar.h>
120#include <mach/task.h>
121#include <kern/zalloc.h>
122#include <kern/ast.h>
123#include <netinet/in.h>
124
125#if defined(__APPLE__)
126#include <kern/cpu_data.h>
127extern uint32_t pmap_find_phys(void *, uint64_t);
128extern boolean_t pmap_valid_page(uint32_t);
129extern void OSKextRegisterKextsWithDTrace(void);
130extern kmod_info_t g_kernel_kmod_info;
131#endif /* __APPLE__ */
132
133
134/* Solaris proc_t is the struct. Darwin's proc_t is a pointer to it. */
135#define proc_t struct proc /* Steer clear of the Darwin typedef for proc_t */
136
137#define t_predcache t_dtrace_predcache /* Cosmetic. Helps readability of thread.h */
138
139extern void dtrace_suspend(void);
140extern void dtrace_resume(void);
141extern void dtrace_init(void);
142extern void helper_init(void);
143extern void fasttrap_init(void);
144extern void dtrace_lazy_dofs_duplicate(proc_t *, proc_t *);
145extern void dtrace_lazy_dofs_destroy(proc_t *);
146extern void dtrace_postinit(void);
147
148#include "../../../osfmk/chud/chud_dtrace.h"
149
150extern kern_return_t chudxnu_dtrace_callback
151	(uint64_t selector, uint64_t *args, uint32_t count);
152
153#endif /* __APPLE__ */
154
155/*
156 * DTrace Tunable Variables
157 *
158 * The following variables may be tuned by adding a line to /etc/system that
159 * includes both the name of the DTrace module ("dtrace") and the name of the
160 * variable.  For example:
161 *
162 *   set dtrace:dtrace_destructive_disallow = 1
163 *
164 * In general, the only variables that one should be tuning this way are those
165 * that affect system-wide DTrace behavior, and for which the default behavior
166 * is undesirable.  Most of these variables are tunable on a per-consumer
167 * basis using DTrace options, and need not be tuned on a system-wide basis.
168 * When tuning these variables, avoid pathological values; while some attempt
169 * is made to verify the integrity of these variables, they are not considered
170 * part of the supported interface to DTrace, and they are therefore not
171 * checked comprehensively.  Further, these variables should not be tuned
172 * dynamically via "mdb -kw" or other means; they should only be tuned via
173 * /etc/system.
174 */
175int		dtrace_destructive_disallow = 0;
176dtrace_optval_t	dtrace_nonroot_maxsize = (16 * 1024 * 1024);
177size_t		dtrace_difo_maxsize = (256 * 1024);
178dtrace_optval_t	dtrace_dof_maxsize = (384 * 1024);
179size_t		dtrace_global_maxsize = (16 * 1024);
180size_t		dtrace_actions_max = (16 * 1024);
181size_t		dtrace_retain_max = 1024;
182dtrace_optval_t	dtrace_helper_actions_max = 32;
183dtrace_optval_t	dtrace_helper_providers_max = 64;
184dtrace_optval_t	dtrace_dstate_defsize = (1 * 1024 * 1024);
185size_t		dtrace_strsize_default = 256;
186dtrace_optval_t	dtrace_cleanrate_default = 990099000;		/* 1.1 hz */
187dtrace_optval_t	dtrace_cleanrate_min = 20000000;			/* 50 hz */
188dtrace_optval_t	dtrace_cleanrate_max = (uint64_t)60 * NANOSEC;	/* 1/minute */
189dtrace_optval_t	dtrace_aggrate_default = NANOSEC;		/* 1 hz */
190dtrace_optval_t	dtrace_statusrate_default = NANOSEC;		/* 1 hz */
191dtrace_optval_t dtrace_statusrate_max = (hrtime_t)10 * NANOSEC;	 /* 6/minute */
192dtrace_optval_t	dtrace_switchrate_default = NANOSEC;		/* 1 hz */
193dtrace_optval_t	dtrace_nspec_default = 1;
194dtrace_optval_t	dtrace_specsize_default = 32 * 1024;
195dtrace_optval_t dtrace_stackframes_default = 20;
196dtrace_optval_t dtrace_ustackframes_default = 20;
197dtrace_optval_t dtrace_jstackframes_default = 50;
198dtrace_optval_t dtrace_jstackstrsize_default = 512;
199int		dtrace_msgdsize_max = 128;
200hrtime_t	dtrace_chill_max = 500 * (NANOSEC / MILLISEC);	/* 500 ms */
201hrtime_t	dtrace_chill_interval = NANOSEC;		/* 1000 ms */
202int		dtrace_devdepth_max = 32;
203int		dtrace_err_verbose;
204hrtime_t	dtrace_deadman_interval = NANOSEC;
205hrtime_t	dtrace_deadman_timeout = (hrtime_t)10 * NANOSEC;
206hrtime_t	dtrace_deadman_user = (hrtime_t)30 * NANOSEC;
207
208/*
209 * DTrace External Variables
210 *
211 * As dtrace(7D) is a kernel module, any DTrace variables are obviously
212 * available to DTrace consumers via the backtick (`) syntax.  One of these,
213 * dtrace_zero, is made deliberately so:  it is provided as a source of
214 * well-known, zero-filled memory.  While this variable is not documented,
215 * it is used by some translators as an implementation detail.
216 */
217const char	dtrace_zero[256] = { 0 };	/* zero-filled memory */
218unsigned int	dtrace_max_cpus = 0;		/* number of enabled cpus */
219/*
220 * DTrace Internal Variables
221 */
222static dev_info_t	*dtrace_devi;		/* device info */
223static vmem_t		*dtrace_arena;		/* probe ID arena */
224static vmem_t		*dtrace_minor;		/* minor number arena */
225static taskq_t		*dtrace_taskq;		/* task queue */
226static dtrace_probe_t	**dtrace_probes;	/* array of all probes */
227static int		dtrace_nprobes;		/* number of probes */
228static dtrace_provider_t *dtrace_provider;	/* provider list */
229static dtrace_meta_t	*dtrace_meta_pid;	/* user-land meta provider */
230static int		dtrace_opens;		/* number of opens */
231static int		dtrace_helpers;		/* number of helpers */
232static void		*dtrace_softstate;	/* softstate pointer */
233static dtrace_hash_t	*dtrace_bymod;		/* probes hashed by module */
234static dtrace_hash_t	*dtrace_byfunc;		/* probes hashed by function */
235static dtrace_hash_t	*dtrace_byname;		/* probes hashed by name */
236static dtrace_toxrange_t *dtrace_toxrange;	/* toxic range array */
237static int		dtrace_toxranges;	/* number of toxic ranges */
238static int		dtrace_toxranges_max;	/* size of toxic range array */
239static dtrace_anon_t	dtrace_anon;		/* anonymous enabling */
240static kmem_cache_t	*dtrace_state_cache;	/* cache for dynamic state */
241static uint64_t		dtrace_vtime_references; /* number of vtimestamp refs */
242static kthread_t	*dtrace_panicked;	/* panicking thread */
243static dtrace_ecb_t	*dtrace_ecb_create_cache; /* cached created ECB */
244static dtrace_genid_t	dtrace_probegen;	/* current probe generation */
245static dtrace_helpers_t *dtrace_deferred_pid;	/* deferred helper list */
246static dtrace_enabling_t *dtrace_retained;	/* list of retained enablings */
247static dtrace_genid_t   dtrace_retained_gen;    /* current retained enab gen */
248static dtrace_dynvar_t	dtrace_dynhash_sink;	/* end of dynamic hash chains */
249#if defined(__APPLE__)
250static int		dtrace_dof_mode;	/* See dtrace_impl.h for a description of Darwin's dof modes. */
251
252			/*
253			 * This does't quite fit as an internal variable, as it must be accessed in
254			 * fbt_provide and sdt_provide. Its clearly not a dtrace tunable variable either...
255			 */
256int			dtrace_kernel_symbol_mode;	/* See dtrace_impl.h for a description of Darwin's kernel symbol modes. */
257#endif
258
259#if defined(__APPLE__)
260/*
261 * To save memory, some common memory allocations are given a
262 * unique zone. For example, dtrace_probe_t is 72 bytes in size,
263 * which means it would fall into the kalloc.128 bucket. With
264 * 20k elements allocated, the space saved is substantial.
265 */
266
267struct zone *dtrace_probe_t_zone;
268
269static int dtrace_module_unloaded(struct kmod_info *kmod);
270#endif /* __APPLE__ */
271
272/*
273 * DTrace Locking
274 * DTrace is protected by three (relatively coarse-grained) locks:
275 *
276 * (1) dtrace_lock is required to manipulate essentially any DTrace state,
277 *     including enabling state, probes, ECBs, consumer state, helper state,
278 *     etc.  Importantly, dtrace_lock is _not_ required when in probe context;
279 *     probe context is lock-free -- synchronization is handled via the
280 *     dtrace_sync() cross call mechanism.
281 *
282 * (2) dtrace_provider_lock is required when manipulating provider state, or
283 *     when provider state must be held constant.
284 *
285 * (3) dtrace_meta_lock is required when manipulating meta provider state, or
286 *     when meta provider state must be held constant.
287 *
288 * The lock ordering between these three locks is dtrace_meta_lock before
289 * dtrace_provider_lock before dtrace_lock.  (In particular, there are
290 * several places where dtrace_provider_lock is held by the framework as it
291 * calls into the providers -- which then call back into the framework,
292 * grabbing dtrace_lock.)
293 *
294 * There are two other locks in the mix:  mod_lock and cpu_lock.  With respect
295 * to dtrace_provider_lock and dtrace_lock, cpu_lock continues its historical
296 * role as a coarse-grained lock; it is acquired before both of these locks.
297 * With respect to dtrace_meta_lock, its behavior is stranger:  cpu_lock must
298 * be acquired _between_ dtrace_meta_lock and any other DTrace locks.
299 * mod_lock is similar with respect to dtrace_provider_lock in that it must be
300 * acquired _between_ dtrace_provider_lock and dtrace_lock.
301 */
302
303#if !defined(__APPLE__)
304static kmutex_t		dtrace_lock;		/* probe state lock */
305static kmutex_t		dtrace_provider_lock;	/* provider state lock */
306static kmutex_t		dtrace_meta_lock;	/* meta-provider state lock */
307#else
308/*
309 * APPLE NOTE:
310 *
311 * All kmutex_t vars have been changed to lck_mtx_t.
312 * Note that lck_mtx_t's require explicit initialization.
313 *
314 * mutex_enter() becomes lck_mtx_lock()
315 * mutex_exit() becomes lck_mtx_unlock()
316 *
317 * Lock asserts are changed like this:
318 *
319 * ASSERT(MUTEX_HELD(&cpu_lock));
320 *	becomes:
321 * lck_mtx_assert(&cpu_lock, LCK_MTX_ASSERT_OWNED);
322 *
323 * Due to the number of these changes, they are not called out explicitly.
324 */
325static lck_mtx_t	dtrace_lock;		/* probe state lock */
326static lck_mtx_t	dtrace_provider_lock;	/* provider state lock */
327static lck_mtx_t	dtrace_meta_lock;	/* meta-provider state lock */
328static lck_rw_t		dtrace_dof_mode_lock;	/* dof mode lock */
329#endif /* __APPLE__ */
330
331/*
332 * DTrace Provider Variables
333 *
334 * These are the variables relating to DTrace as a provider (that is, the
335 * provider of the BEGIN, END, and ERROR probes).
336 */
337static dtrace_pattr_t	dtrace_provider_attr = {
338{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
339{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
340{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
341{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
342{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
343};
344
345static void
346dtrace_nullop(void)
347{}
348
349static int
350dtrace_enable_nullop(void)
351{
352    return (0);
353}
354
355static dtrace_pops_t	dtrace_provider_ops = {
356	(void (*)(void *, const dtrace_probedesc_t *))dtrace_nullop,
357	(void (*)(void *, struct modctl *))dtrace_nullop,
358	(int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop,
359	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
360	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
361	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
362	NULL,
363	NULL,
364	NULL,
365	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop
366};
367
368static dtrace_id_t	dtrace_probeid_begin;	/* special BEGIN probe */
369static dtrace_id_t	dtrace_probeid_end;	/* special END probe */
370dtrace_id_t		dtrace_probeid_error;	/* special ERROR probe */
371
372/*
373 * DTrace Helper Tracing Variables
374 */
375uint32_t dtrace_helptrace_next = 0;
376uint32_t dtrace_helptrace_nlocals;
377char	*dtrace_helptrace_buffer;
378#if !defined(__APPLE__) /* Quiet compiler warning */
379int	dtrace_helptrace_bufsize = 512 * 1024;
380#else
381size_t	dtrace_helptrace_bufsize = 512 * 1024;
382#endif /* __APPLE__ */
383
384#if DEBUG
385int	dtrace_helptrace_enabled = 1;
386#else
387int	dtrace_helptrace_enabled = 0;
388#endif
389
390/*
391 * DTrace Error Hashing
392 *
393 * On DEBUG kernels, DTrace will track the errors that has seen in a hash
394 * table.  This is very useful for checking coverage of tests that are
395 * expected to induce DIF or DOF processing errors, and may be useful for
396 * debugging problems in the DIF code generator or in DOF generation .  The
397 * error hash may be examined with the ::dtrace_errhash MDB dcmd.
398 */
399#if DEBUG
400static dtrace_errhash_t	dtrace_errhash[DTRACE_ERRHASHSZ];
401static const char *dtrace_errlast;
402static kthread_t *dtrace_errthread;
403static lck_mtx_t dtrace_errlock;
404#endif
405
406/*
407 * DTrace Macros and Constants
408 *
409 * These are various macros that are useful in various spots in the
410 * implementation, along with a few random constants that have no meaning
411 * outside of the implementation.  There is no real structure to this cpp
412 * mishmash -- but is there ever?
413 */
414#define	DTRACE_HASHSTR(hash, probe)	\
415	dtrace_hash_str(*((char **)((uintptr_t)(probe) + (hash)->dth_stroffs)))
416
417#define	DTRACE_HASHNEXT(hash, probe)	\
418	(dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_nextoffs)
419
420#define	DTRACE_HASHPREV(hash, probe)	\
421	(dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_prevoffs)
422
423#define	DTRACE_HASHEQ(hash, lhs, rhs)	\
424	(strcmp(*((char **)((uintptr_t)(lhs) + (hash)->dth_stroffs)), \
425	    *((char **)((uintptr_t)(rhs) + (hash)->dth_stroffs))) == 0)
426
427#define	DTRACE_AGGHASHSIZE_SLEW		17
428
429#define	DTRACE_V4MAPPED_OFFSET		(sizeof (uint32_t) * 3)
430
431/*
432 * The key for a thread-local variable consists of the lower 61 bits of the
433 * t_did, plus the 3 bits of the highest active interrupt above LOCK_LEVEL.
434 * We add DIF_VARIABLE_MAX to t_did to assure that the thread key is never
435 * equal to a variable identifier.  This is necessary (but not sufficient) to
436 * assure that global associative arrays never collide with thread-local
437 * variables.  To guarantee that they cannot collide, we must also define the
438 * order for keying dynamic variables.  That order is:
439 *
440 *   [ key0 ] ... [ keyn ] [ variable-key ] [ tls-key ]
441 *
442 * Because the variable-key and the tls-key are in orthogonal spaces, there is
443 * no way for a global variable key signature to match a thread-local key
444 * signature.
445 */
446#if !defined(__APPLE__)
447#define	DTRACE_TLS_THRKEY(where) { \
448	uint_t intr = 0; \
449	uint_t actv = CPU->cpu_intr_actv >> (LOCK_LEVEL + 1); \
450	for (; actv; actv >>= 1) \
451		intr++; \
452	ASSERT(intr < (1 << 3)); \
453	(where) = ((curthread->t_did + DIF_VARIABLE_MAX) & \
454	    (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
455}
456#else
457#if defined (__x86_64__)
458/* FIXME: two function calls!! */
459#define	DTRACE_TLS_THRKEY(where) { \
460	uint_t intr = ml_at_interrupt_context(); /* Note: just one measly bit */ \
461	uint64_t thr = (uintptr_t)current_thread(); \
462	ASSERT(intr < (1 << 3)); \
463	(where) = ((thr + DIF_VARIABLE_MAX) & \
464	    (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
465}
466#else
467#error Unknown architecture
468#endif
469#endif /* __APPLE__ */
470
471#define	DT_BSWAP_8(x)	((x) & 0xff)
472#define	DT_BSWAP_16(x)	((DT_BSWAP_8(x) << 8) | DT_BSWAP_8((x) >> 8))
473#define	DT_BSWAP_32(x)	((DT_BSWAP_16(x) << 16) | DT_BSWAP_16((x) >> 16))
474#define	DT_BSWAP_64(x)	((DT_BSWAP_32(x) << 32) | DT_BSWAP_32((x) >> 32))
475
476#define	DT_MASK_LO 0x00000000FFFFFFFFULL
477
478#define	DTRACE_STORE(type, tomax, offset, what) \
479	*((type *)((uintptr_t)(tomax) + (uintptr_t)offset)) = (type)(what);
480
481
482#define	DTRACE_ALIGNCHECK(addr, size, flags)				\
483	if (addr & (MIN(size,4) - 1)) {					\
484		*flags |= CPU_DTRACE_BADALIGN;				\
485		cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr;	\
486		return (0);						\
487	}
488
489/*
490 * Test whether a range of memory starting at testaddr of size testsz falls
491 * within the range of memory described by addr, sz.  We take care to avoid
492 * problems with overflow and underflow of the unsigned quantities, and
493 * disallow all negative sizes.  Ranges of size 0 are allowed.
494 */
495#define	DTRACE_INRANGE(testaddr, testsz, baseaddr, basesz) \
496	((testaddr) - (baseaddr) < (basesz) && \
497	(testaddr) + (testsz) - (baseaddr) <= (basesz) && \
498	(testaddr) + (testsz) >= (testaddr))
499
500/*
501 * Test whether alloc_sz bytes will fit in the scratch region.  We isolate
502 * alloc_sz on the righthand side of the comparison in order to avoid overflow
503 * or underflow in the comparison with it.  This is simpler than the INRANGE
504 * check above, because we know that the dtms_scratch_ptr is valid in the
505 * range.  Allocations of size zero are allowed.
506 */
507#define	DTRACE_INSCRATCH(mstate, alloc_sz) \
508	((mstate)->dtms_scratch_base + (mstate)->dtms_scratch_size - \
509	(mstate)->dtms_scratch_ptr >= (alloc_sz))
510
511#if !defined(__APPLE__)
512#define	DTRACE_LOADFUNC(bits)						\
513/*CSTYLED*/								\
514uint##bits##_t								\
515dtrace_load##bits(uintptr_t addr)					\
516{									\
517	size_t size = bits / NBBY;					\
518	/*CSTYLED*/							\
519	uint##bits##_t rval;						\
520	int i;								\
521	volatile uint16_t *flags = (volatile uint16_t *)		\
522	    &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;			\
523									\
524	DTRACE_ALIGNCHECK(addr, size, flags);				\
525									\
526	for (i = 0; i < dtrace_toxranges; i++) {			\
527		if (addr >= dtrace_toxrange[i].dtt_limit)		\
528			continue;					\
529									\
530		if (addr + size <= dtrace_toxrange[i].dtt_base)		\
531			continue;					\
532									\
533		/*							\
534		 * This address falls within a toxic region; return 0.	\
535		 */							\
536		*flags |= CPU_DTRACE_BADADDR;				\
537		cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr;	\
538		return (0);						\
539	}								\
540									\
541	*flags |= CPU_DTRACE_NOFAULT;					\
542	/*CSTYLED*/							\
543	rval = *((volatile uint##bits##_t *)addr);			\
544	*flags &= ~CPU_DTRACE_NOFAULT;					\
545									\
546	return (!(*flags & CPU_DTRACE_FAULT) ? rval : 0);		\
547}
548#else /* __APPLE__ */
549#define RECOVER_LABEL(bits) dtraceLoadRecover##bits:
550
551#if defined (__x86_64__)
552#define	DTRACE_LOADFUNC(bits)						\
553/*CSTYLED*/								\
554uint##bits##_t dtrace_load##bits(uintptr_t addr);			\
555									\
556uint##bits##_t								\
557dtrace_load##bits(uintptr_t addr)					\
558{									\
559	size_t size = bits / NBBY;					\
560	/*CSTYLED*/							\
561	uint##bits##_t rval = 0;					\
562	int i;								\
563	volatile uint16_t *flags = (volatile uint16_t *)		\
564	    &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;			\
565									\
566	DTRACE_ALIGNCHECK(addr, size, flags);				\
567									\
568	for (i = 0; i < dtrace_toxranges; i++) {			\
569		if (addr >= dtrace_toxrange[i].dtt_limit)		\
570			continue;					\
571									\
572		if (addr + size <= dtrace_toxrange[i].dtt_base)		\
573			continue;					\
574									\
575		/*							\
576		 * This address falls within a toxic region; return 0.	\
577		 */							\
578		*flags |= CPU_DTRACE_BADADDR;				\
579		cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr;	\
580		return (0);						\
581	}								\
582									\
583	{								\
584	volatile vm_offset_t recover = (vm_offset_t)&&dtraceLoadRecover##bits;		\
585	*flags |= CPU_DTRACE_NOFAULT;					\
586	recover = dtrace_set_thread_recover(current_thread(), recover);	\
587	/*CSTYLED*/							\
588	/*                                                              \
589	* PR6394061 - avoid device memory that is unpredictably		\
590	* mapped and unmapped                                   	\
591	*/								\
592        if (pmap_valid_page(pmap_find_phys(kernel_pmap, addr)))		\
593	    rval = *((volatile uint##bits##_t *)addr);			\
594	RECOVER_LABEL(bits);						\
595	(void)dtrace_set_thread_recover(current_thread(), recover);	\
596	*flags &= ~CPU_DTRACE_NOFAULT;					\
597	}								\
598									\
599	return (rval);							\
600}
601#else /* all other architectures */
602#error Unknown Architecture
603#endif
604#endif /* __APPLE__ */
605
606#ifdef __LP64__
607#define	dtrace_loadptr	dtrace_load64
608#else
609#define	dtrace_loadptr	dtrace_load32
610#endif
611
612#define	DTRACE_DYNHASH_FREE	0
613#define	DTRACE_DYNHASH_SINK	1
614#define	DTRACE_DYNHASH_VALID	2
615
616#define DTRACE_MATCH_FAIL       -1
617#define	DTRACE_MATCH_NEXT	0
618#define	DTRACE_MATCH_DONE	1
619#define	DTRACE_ANCHORED(probe)	((probe)->dtpr_func[0] != '\0')
620#define	DTRACE_STATE_ALIGN	64
621
622#define	DTRACE_FLAGS2FLT(flags)						\
623	(((flags) & CPU_DTRACE_BADADDR) ? DTRACEFLT_BADADDR :		\
624	((flags) & CPU_DTRACE_ILLOP) ? DTRACEFLT_ILLOP :		\
625	((flags) & CPU_DTRACE_DIVZERO) ? DTRACEFLT_DIVZERO :		\
626	((flags) & CPU_DTRACE_KPRIV) ? DTRACEFLT_KPRIV :		\
627	((flags) & CPU_DTRACE_UPRIV) ? DTRACEFLT_UPRIV :		\
628	((flags) & CPU_DTRACE_TUPOFLOW) ?  DTRACEFLT_TUPOFLOW :		\
629	((flags) & CPU_DTRACE_BADALIGN) ?  DTRACEFLT_BADALIGN :		\
630	((flags) & CPU_DTRACE_NOSCRATCH) ?  DTRACEFLT_NOSCRATCH :	\
631	((flags) & CPU_DTRACE_BADSTACK) ?  DTRACEFLT_BADSTACK :		\
632	DTRACEFLT_UNKNOWN)
633
634#define	DTRACEACT_ISSTRING(act)						\
635	((act)->dta_kind == DTRACEACT_DIFEXPR &&			\
636	(act)->dta_difo->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING)
637
638
639#if defined (__APPLE__)
640/* Avoid compiler warnings when assigning regs[rd] = NULL */
641#ifdef NULL
642#undef NULL
643#define NULL (uintptr_t)0
644#endif
645#endif /* __APPLE__ */
646
647static size_t dtrace_strlen(const char *, size_t);
648static dtrace_probe_t *dtrace_probe_lookup_id(dtrace_id_t id);
649static void dtrace_enabling_provide(dtrace_provider_t *);
650static int dtrace_enabling_match(dtrace_enabling_t *, int *);
651static void dtrace_enabling_matchall(void);
652static dtrace_state_t *dtrace_anon_grab(void);
653static uint64_t dtrace_helper(int, dtrace_mstate_t *,
654    dtrace_state_t *, uint64_t, uint64_t);
655static dtrace_helpers_t *dtrace_helpers_create(proc_t *);
656static void dtrace_buffer_drop(dtrace_buffer_t *);
657static intptr_t dtrace_buffer_reserve(dtrace_buffer_t *, size_t, size_t,
658    dtrace_state_t *, dtrace_mstate_t *);
659static int dtrace_state_option(dtrace_state_t *, dtrace_optid_t,
660    dtrace_optval_t);
661static int dtrace_ecb_create_enable(dtrace_probe_t *, void *);
662static void dtrace_helper_provider_destroy(dtrace_helper_provider_t *);
663
664/*
665 * DTrace Probe Context Functions
666 *
667 * These functions are called from probe context.  Because probe context is
668 * any context in which C may be called, arbitrarily locks may be held,
669 * interrupts may be disabled, we may be in arbitrary dispatched state, etc.
670 * As a result, functions called from probe context may only call other DTrace
671 * support functions -- they may not interact at all with the system at large.
672 * (Note that the ASSERT macro is made probe-context safe by redefining it in
673 * terms of dtrace_assfail(), a probe-context safe function.) If arbitrary
674 * loads are to be performed from probe context, they _must_ be in terms of
675 * the safe dtrace_load*() variants.
676 *
677 * Some functions in this block are not actually called from probe context;
678 * for these functions, there will be a comment above the function reading
679 * "Note:  not called from probe context."
680 */
681
682int
683dtrace_assfail(const char *a, const char *f, int l)
684{
685	panic("dtrace: assertion failed: %s, file: %s, line: %d", a, f, l);
686
687	/*
688	 * We just need something here that even the most clever compiler
689	 * cannot optimize away.
690	 */
691	return (a[(uintptr_t)f]);
692}
693
694/*
695 * Atomically increment a specified error counter from probe context.
696 */
697static void
698dtrace_error(uint32_t *counter)
699{
700	/*
701	 * Most counters stored to in probe context are per-CPU counters.
702	 * However, there are some error conditions that are sufficiently
703	 * arcane that they don't merit per-CPU storage.  If these counters
704	 * are incremented concurrently on different CPUs, scalability will be
705	 * adversely affected -- but we don't expect them to be white-hot in a
706	 * correctly constructed enabling...
707	 */
708	uint32_t oval, nval;
709
710	do {
711		oval = *counter;
712
713		if ((nval = oval + 1) == 0) {
714			/*
715			 * If the counter would wrap, set it to 1 -- assuring
716			 * that the counter is never zero when we have seen
717			 * errors.  (The counter must be 32-bits because we
718			 * aren't guaranteed a 64-bit compare&swap operation.)
719			 * To save this code both the infamy of being fingered
720			 * by a priggish news story and the indignity of being
721			 * the target of a neo-puritan witch trial, we're
722			 * carefully avoiding any colorful description of the
723			 * likelihood of this condition -- but suffice it to
724			 * say that it is only slightly more likely than the
725			 * overflow of predicate cache IDs, as discussed in
726			 * dtrace_predicate_create().
727			 */
728			nval = 1;
729		}
730	} while (dtrace_cas32(counter, oval, nval) != oval);
731}
732
733/*
734 * Use the DTRACE_LOADFUNC macro to define functions for each of loading a
735 * uint8_t, a uint16_t, a uint32_t and a uint64_t.
736 */
737DTRACE_LOADFUNC(8)
738DTRACE_LOADFUNC(16)
739DTRACE_LOADFUNC(32)
740DTRACE_LOADFUNC(64)
741
742static int
743dtrace_inscratch(uintptr_t dest, size_t size, dtrace_mstate_t *mstate)
744{
745	if (dest < mstate->dtms_scratch_base)
746		return (0);
747
748	if (dest + size < dest)
749		return (0);
750
751	if (dest + size > mstate->dtms_scratch_ptr)
752		return (0);
753
754	return (1);
755}
756
757static int
758dtrace_canstore_statvar(uint64_t addr, size_t sz,
759    dtrace_statvar_t **svars, int nsvars)
760{
761	int i;
762
763	for (i = 0; i < nsvars; i++) {
764		dtrace_statvar_t *svar = svars[i];
765
766		if (svar == NULL || svar->dtsv_size == 0)
767			continue;
768
769		if (DTRACE_INRANGE(addr, sz, svar->dtsv_data, svar->dtsv_size))
770			return (1);
771	}
772
773	return (0);
774}
775
776/*
777 * Check to see if the address is within a memory region to which a store may
778 * be issued.  This includes the DTrace scratch areas, and any DTrace variable
779 * region.  The caller of dtrace_canstore() is responsible for performing any
780 * alignment checks that are needed before stores are actually executed.
781 */
782static int
783dtrace_canstore(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
784    dtrace_vstate_t *vstate)
785{
786	/*
787	 * First, check to see if the address is in scratch space...
788	 */
789	if (DTRACE_INRANGE(addr, sz, mstate->dtms_scratch_base,
790	    mstate->dtms_scratch_size))
791		return (1);
792
793	/*
794	 * Now check to see if it's a dynamic variable.  This check will pick
795	 * up both thread-local variables and any global dynamically-allocated
796	 * variables.
797	 */
798	if (DTRACE_INRANGE(addr, sz, (uintptr_t)vstate->dtvs_dynvars.dtds_base,
799	    vstate->dtvs_dynvars.dtds_size)) {
800		dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
801		uintptr_t base = (uintptr_t)dstate->dtds_base +
802		    (dstate->dtds_hashsize * sizeof (dtrace_dynhash_t));
803		uintptr_t chunkoffs;
804
805		/*
806		 * Before we assume that we can store here, we need to make
807		 * sure that it isn't in our metadata -- storing to our
808		 * dynamic variable metadata would corrupt our state.  For
809		 * the range to not include any dynamic variable metadata,
810		 * it must:
811		 *
812		 *	(1) Start above the hash table that is at the base of
813		 *	the dynamic variable space
814		 *
815		 *	(2) Have a starting chunk offset that is beyond the
816		 *	dtrace_dynvar_t that is at the base of every chunk
817		 *
818		 *	(3) Not span a chunk boundary
819		 *
820		 */
821		if (addr < base)
822			return (0);
823
824		chunkoffs = (addr - base) % dstate->dtds_chunksize;
825
826		if (chunkoffs < sizeof (dtrace_dynvar_t))
827			return (0);
828
829		if (chunkoffs + sz > dstate->dtds_chunksize)
830			return (0);
831
832		return (1);
833	}
834
835	/*
836	 * Finally, check the static local and global variables.  These checks
837	 * take the longest, so we perform them last.
838	 */
839	if (dtrace_canstore_statvar(addr, sz,
840	    vstate->dtvs_locals, vstate->dtvs_nlocals))
841		return (1);
842
843	if (dtrace_canstore_statvar(addr, sz,
844	    vstate->dtvs_globals, vstate->dtvs_nglobals))
845		return (1);
846
847	return (0);
848}
849
850
851/*
852 * Convenience routine to check to see if the address is within a memory
853 * region in which a load may be issued given the user's privilege level;
854 * if not, it sets the appropriate error flags and loads 'addr' into the
855 * illegal value slot.
856 *
857 * DTrace subroutines (DIF_SUBR_*) should use this helper to implement
858 * appropriate memory access protection.
859 */
860static int
861dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
862    dtrace_vstate_t *vstate)
863{
864#if !defined(__APPLE__)  /* Quiet compiler warning - matches dtrace_dif_emulate */
865	volatile uintptr_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
866#else
867	volatile uint64_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
868#endif /* __APPLE */
869
870	/*
871	 * If we hold the privilege to read from kernel memory, then
872	 * everything is readable.
873	 */
874	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
875		return (1);
876
877	/*
878	 * You can obviously read that which you can store.
879	 */
880	if (dtrace_canstore(addr, sz, mstate, vstate))
881		return (1);
882
883	/*
884	 * We're allowed to read from our own string table.
885	 */
886	if (DTRACE_INRANGE(addr, sz, (uintptr_t)mstate->dtms_difo->dtdo_strtab,
887	    mstate->dtms_difo->dtdo_strlen))
888		return (1);
889
890	DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV);
891	*illval = addr;
892	return (0);
893}
894
895/*
896 * Convenience routine to check to see if a given string is within a memory
897 * region in which a load may be issued given the user's privilege level;
898 * this exists so that we don't need to issue unnecessary dtrace_strlen()
899 * calls in the event that the user has all privileges.
900 */
901static int
902dtrace_strcanload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
903    dtrace_vstate_t *vstate)
904{
905	size_t strsz;
906
907	/*
908	 * If we hold the privilege to read from kernel memory, then
909	 * everything is readable.
910	 */
911	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
912		return (1);
913
914	strsz = 1 + dtrace_strlen((char *)(uintptr_t)addr, sz);
915	if (dtrace_canload(addr, strsz, mstate, vstate))
916		return (1);
917
918	return (0);
919}
920
921/*
922 * Convenience routine to check to see if a given variable is within a memory
923 * region in which a load may be issued given the user's privilege level.
924 */
925static int
926dtrace_vcanload(void *src, dtrace_diftype_t *type, dtrace_mstate_t *mstate,
927    dtrace_vstate_t *vstate)
928{
929	size_t sz;
930	ASSERT(type->dtdt_flags & DIF_TF_BYREF);
931
932	/*
933	 * If we hold the privilege to read from kernel memory, then
934	 * everything is readable.
935	 */
936	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
937		return (1);
938
939	if (type->dtdt_kind == DIF_TYPE_STRING)
940		sz = dtrace_strlen(src,
941		    vstate->dtvs_state->dts_options[DTRACEOPT_STRSIZE]) + 1;
942	else
943		sz = type->dtdt_size;
944
945	return (dtrace_canload((uintptr_t)src, sz, mstate, vstate));
946}
947
948/*
949 * Compare two strings using safe loads.
950 */
951static int
952dtrace_strncmp(char *s1, char *s2, size_t limit)
953{
954	uint8_t c1, c2;
955	volatile uint16_t *flags;
956
957	if (s1 == s2 || limit == 0)
958		return (0);
959
960	flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
961
962	do {
963		if (s1 == NULL) {
964			c1 = '\0';
965		} else {
966			c1 = dtrace_load8((uintptr_t)s1++);
967		}
968
969		if (s2 == NULL) {
970			c2 = '\0';
971		} else {
972			c2 = dtrace_load8((uintptr_t)s2++);
973		}
974
975		if (c1 != c2)
976			return (c1 - c2);
977	} while (--limit && c1 != '\0' && !(*flags & CPU_DTRACE_FAULT));
978
979	return (0);
980}
981
982/*
983 * Compute strlen(s) for a string using safe memory accesses.  The additional
984 * len parameter is used to specify a maximum length to ensure completion.
985 */
986static size_t
987dtrace_strlen(const char *s, size_t lim)
988{
989	uint_t len;
990
991	for (len = 0; len != lim; len++) {
992		if (dtrace_load8((uintptr_t)s++) == '\0')
993			break;
994	}
995
996	return (len);
997}
998
999/*
1000 * Check if an address falls within a toxic region.
1001 */
1002static int
1003dtrace_istoxic(uintptr_t kaddr, size_t size)
1004{
1005	uintptr_t taddr, tsize;
1006	int i;
1007
1008	for (i = 0; i < dtrace_toxranges; i++) {
1009		taddr = dtrace_toxrange[i].dtt_base;
1010		tsize = dtrace_toxrange[i].dtt_limit - taddr;
1011
1012		if (kaddr - taddr < tsize) {
1013			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
1014			cpu_core[CPU->cpu_id].cpuc_dtrace_illval = kaddr;
1015			return (1);
1016		}
1017
1018		if (taddr - kaddr < size) {
1019			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
1020			cpu_core[CPU->cpu_id].cpuc_dtrace_illval = taddr;
1021			return (1);
1022		}
1023	}
1024
1025	return (0);
1026}
1027
1028/*
1029 * Copy src to dst using safe memory accesses.  The src is assumed to be unsafe
1030 * memory specified by the DIF program.  The dst is assumed to be safe memory
1031 * that we can store to directly because it is managed by DTrace.  As with
1032 * standard bcopy, overlapping copies are handled properly.
1033 */
1034static void
1035dtrace_bcopy(const void *src, void *dst, size_t len)
1036{
1037	if (len != 0) {
1038		uint8_t *s1 = dst;
1039		const uint8_t *s2 = src;
1040
1041		if (s1 <= s2) {
1042			do {
1043				*s1++ = dtrace_load8((uintptr_t)s2++);
1044			} while (--len != 0);
1045		} else {
1046			s2 += len;
1047			s1 += len;
1048
1049			do {
1050				*--s1 = dtrace_load8((uintptr_t)--s2);
1051			} while (--len != 0);
1052		}
1053	}
1054}
1055
1056/*
1057 * Copy src to dst using safe memory accesses, up to either the specified
1058 * length, or the point that a nul byte is encountered.  The src is assumed to
1059 * be unsafe memory specified by the DIF program.  The dst is assumed to be
1060 * safe memory that we can store to directly because it is managed by DTrace.
1061 * Unlike dtrace_bcopy(), overlapping regions are not handled.
1062 */
1063static void
1064dtrace_strcpy(const void *src, void *dst, size_t len)
1065{
1066	if (len != 0) {
1067		uint8_t *s1 = dst, c;
1068		const uint8_t *s2 = src;
1069
1070		do {
1071			*s1++ = c = dtrace_load8((uintptr_t)s2++);
1072		} while (--len != 0 && c != '\0');
1073	}
1074}
1075
1076/*
1077 * Copy src to dst, deriving the size and type from the specified (BYREF)
1078 * variable type.  The src is assumed to be unsafe memory specified by the DIF
1079 * program.  The dst is assumed to be DTrace variable memory that is of the
1080 * specified type; we assume that we can store to directly.
1081 */
1082static void
1083dtrace_vcopy(void *src, void *dst, dtrace_diftype_t *type)
1084{
1085	ASSERT(type->dtdt_flags & DIF_TF_BYREF);
1086
1087	if (type->dtdt_kind == DIF_TYPE_STRING) {
1088		dtrace_strcpy(src, dst, type->dtdt_size);
1089	} else {
1090		dtrace_bcopy(src, dst, type->dtdt_size);
1091}
1092}
1093
1094/*
1095 * Compare s1 to s2 using safe memory accesses.  The s1 data is assumed to be
1096 * unsafe memory specified by the DIF program.  The s2 data is assumed to be
1097 * safe memory that we can access directly because it is managed by DTrace.
1098 */
1099static int
1100dtrace_bcmp(const void *s1, const void *s2, size_t len)
1101{
1102	volatile uint16_t *flags;
1103
1104	flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
1105
1106	if (s1 == s2)
1107		return (0);
1108
1109	if (s1 == NULL || s2 == NULL)
1110		return (1);
1111
1112	if (s1 != s2 && len != 0) {
1113		const uint8_t *ps1 = s1;
1114		const uint8_t *ps2 = s2;
1115
1116		do {
1117			if (dtrace_load8((uintptr_t)ps1++) != *ps2++)
1118				return (1);
1119		} while (--len != 0 && !(*flags & CPU_DTRACE_FAULT));
1120	}
1121	return (0);
1122}
1123
1124/*
1125 * Zero the specified region using a simple byte-by-byte loop.  Note that this
1126 * is for safe DTrace-managed memory only.
1127 */
1128static void
1129dtrace_bzero(void *dst, size_t len)
1130{
1131	uchar_t *cp;
1132
1133	for (cp = dst; len != 0; len--)
1134		*cp++ = 0;
1135}
1136
1137static void
1138dtrace_add_128(uint64_t *addend1, uint64_t *addend2, uint64_t *sum)
1139{
1140	uint64_t result[2];
1141
1142	result[0] = addend1[0] + addend2[0];
1143	result[1] = addend1[1] + addend2[1] +
1144	    (result[0] < addend1[0] || result[0] < addend2[0] ? 1 : 0);
1145
1146	sum[0] = result[0];
1147	sum[1] = result[1];
1148}
1149
1150/*
1151 * Shift the 128-bit value in a by b. If b is positive, shift left.
1152 * If b is negative, shift right.
1153 */
1154static void
1155dtrace_shift_128(uint64_t *a, int b)
1156{
1157	uint64_t mask;
1158
1159	if (b == 0)
1160		return;
1161
1162	if (b < 0) {
1163		b = -b;
1164		if (b >= 64) {
1165			a[0] = a[1] >> (b - 64);
1166			a[1] = 0;
1167		} else {
1168			a[0] >>= b;
1169			mask = 1LL << (64 - b);
1170			mask -= 1;
1171			a[0] |= ((a[1] & mask) << (64 - b));
1172			a[1] >>= b;
1173		}
1174	} else {
1175		if (b >= 64) {
1176			a[1] = a[0] << (b - 64);
1177			a[0] = 0;
1178		} else {
1179			a[1] <<= b;
1180			mask = a[0] >> (64 - b);
1181			a[1] |= mask;
1182			a[0] <<= b;
1183		}
1184	}
1185}
1186
1187/*
1188 * The basic idea is to break the 2 64-bit values into 4 32-bit values,
1189 * use native multiplication on those, and then re-combine into the
1190 * resulting 128-bit value.
1191 *
1192 * (hi1 << 32 + lo1) * (hi2 << 32 + lo2) =
1193 *     hi1 * hi2 << 64 +
1194 *     hi1 * lo2 << 32 +
1195 *     hi2 * lo1 << 32 +
1196 *     lo1 * lo2
1197 */
1198static void
1199dtrace_multiply_128(uint64_t factor1, uint64_t factor2, uint64_t *product)
1200{
1201	uint64_t hi1, hi2, lo1, lo2;
1202	uint64_t tmp[2];
1203
1204	hi1 = factor1 >> 32;
1205	hi2 = factor2 >> 32;
1206
1207	lo1 = factor1 & DT_MASK_LO;
1208	lo2 = factor2 & DT_MASK_LO;
1209
1210	product[0] = lo1 * lo2;
1211	product[1] = hi1 * hi2;
1212
1213	tmp[0] = hi1 * lo2;
1214	tmp[1] = 0;
1215	dtrace_shift_128(tmp, 32);
1216	dtrace_add_128(product, tmp, product);
1217
1218	tmp[0] = hi2 * lo1;
1219	tmp[1] = 0;
1220	dtrace_shift_128(tmp, 32);
1221	dtrace_add_128(product, tmp, product);
1222}
1223
1224/*
1225 * This privilege check should be used by actions and subroutines to
1226 * verify that the user credentials of the process that enabled the
1227 * invoking ECB match the target credentials
1228 */
1229static int
1230dtrace_priv_proc_common_user(dtrace_state_t *state)
1231{
1232	cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
1233
1234	/*
1235	 * We should always have a non-NULL state cred here, since if cred
1236	 * is null (anonymous tracing), we fast-path bypass this routine.
1237	 */
1238	ASSERT(s_cr != NULL);
1239
1240#if !defined(__APPLE__)
1241	if ((cr = CRED()) != NULL &&
1242#else
1243	if ((cr = dtrace_CRED()) != NULL &&
1244#endif /* __APPLE__ */
1245	    posix_cred_get(s_cr)->cr_uid == posix_cred_get(cr)->cr_uid &&
1246	    posix_cred_get(s_cr)->cr_uid == posix_cred_get(cr)->cr_ruid &&
1247	    posix_cred_get(s_cr)->cr_uid == posix_cred_get(cr)->cr_suid &&
1248	    posix_cred_get(s_cr)->cr_gid == posix_cred_get(cr)->cr_gid &&
1249	    posix_cred_get(s_cr)->cr_gid == posix_cred_get(cr)->cr_rgid &&
1250	    posix_cred_get(s_cr)->cr_gid == posix_cred_get(cr)->cr_sgid)
1251		return (1);
1252
1253	return (0);
1254}
1255
1256/*
1257 * This privilege check should be used by actions and subroutines to
1258 * verify that the zone of the process that enabled the invoking ECB
1259 * matches the target credentials
1260 */
1261static int
1262dtrace_priv_proc_common_zone(dtrace_state_t *state)
1263{
1264	cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
1265#pragma unused(cr, s_cr) /* __APPLE__ */
1266
1267	/*
1268	 * We should always have a non-NULL state cred here, since if cred
1269	 * is null (anonymous tracing), we fast-path bypass this routine.
1270	 */
1271	ASSERT(s_cr != NULL);
1272
1273#if !defined(__APPLE__)
1274	if ((cr = CRED()) != NULL &&
1275	    s_cr->cr_zone == cr->cr_zone)
1276		return (1);
1277
1278	return (0);
1279#else
1280#pragma unused(state)
1281
1282	return 1; /* Darwin doesn't do zones. */
1283#endif /* __APPLE__ */
1284}
1285
1286/*
1287 * This privilege check should be used by actions and subroutines to
1288 * verify that the process has not setuid or changed credentials.
1289 */
1290#if !defined(__APPLE__)
1291static int
1292dtrace_priv_proc_common_nocd()
1293{
1294	proc_t *proc;
1295
1296	if ((proc = ttoproc(curthread)) != NULL &&
1297	    !(proc->p_flag & SNOCD))
1298		return (1);
1299
1300	return (0);
1301}
1302#else
1303static int
1304dtrace_priv_proc_common_nocd(void)
1305{
1306	return 1; /* Darwin omits "No Core Dump" flag. */
1307}
1308#endif /* __APPLE__ */
1309
1310static int
1311dtrace_priv_proc_destructive(dtrace_state_t *state)
1312{
1313	int action = state->dts_cred.dcr_action;
1314
1315#if defined(__APPLE__)
1316	if (ISSET(current_proc()->p_lflag, P_LNOATTACH))
1317		goto bad;
1318#endif /* __APPLE__ */
1319
1320	if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE) == 0) &&
1321	    dtrace_priv_proc_common_zone(state) == 0)
1322		goto bad;
1323
1324	if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER) == 0) &&
1325	    dtrace_priv_proc_common_user(state) == 0)
1326		goto bad;
1327
1328	if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG) == 0) &&
1329	    dtrace_priv_proc_common_nocd() == 0)
1330		goto bad;
1331
1332	return (1);
1333
1334bad:
1335	cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1336
1337	return (0);
1338}
1339
1340static int
1341dtrace_priv_proc_control(dtrace_state_t *state)
1342{
1343#if defined(__APPLE__)
1344	if (ISSET(current_proc()->p_lflag, P_LNOATTACH))
1345		goto bad;
1346#endif /* __APPLE__ */
1347
1348	if (state->dts_cred.dcr_action & DTRACE_CRA_PROC_CONTROL)
1349		return (1);
1350
1351	if (dtrace_priv_proc_common_zone(state) &&
1352	    dtrace_priv_proc_common_user(state) &&
1353	    dtrace_priv_proc_common_nocd())
1354		return (1);
1355
1356#if defined(__APPLE__)
1357bad:
1358#endif /* __APPLE__ */
1359	cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1360
1361	return (0);
1362}
1363
1364static int
1365dtrace_priv_proc(dtrace_state_t *state)
1366{
1367#if defined(__APPLE__)
1368	if (ISSET(current_proc()->p_lflag, P_LNOATTACH))
1369		goto bad;
1370#endif /* __APPLE__ */
1371
1372	if (state->dts_cred.dcr_action & DTRACE_CRA_PROC)
1373		return (1);
1374
1375#if defined(__APPLE__)
1376bad:
1377#endif /* __APPLE__ */
1378	cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1379
1380	return (0);
1381}
1382
1383#if defined(__APPLE__)
1384/* dtrace_priv_proc() omitting the P_LNOATTACH check. For PID and EXECNAME accesses. */
1385static int
1386dtrace_priv_proc_relaxed(dtrace_state_t *state)
1387{
1388
1389	if (state->dts_cred.dcr_action & DTRACE_CRA_PROC)
1390		return (1);
1391
1392	cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1393
1394	return (0);
1395}
1396#endif /* __APPLE__ */
1397
1398static int
1399dtrace_priv_kernel(dtrace_state_t *state)
1400{
1401	if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL)
1402		return (1);
1403
1404	cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
1405
1406	return (0);
1407}
1408
1409static int
1410dtrace_priv_kernel_destructive(dtrace_state_t *state)
1411{
1412	if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL_DESTRUCTIVE)
1413		return (1);
1414
1415	cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
1416
1417	return (0);
1418}
1419
1420/*
1421 * Note:  not called from probe context.  This function is called
1422 * asynchronously (and at a regular interval) from outside of probe context to
1423 * clean the dirty dynamic variable lists on all CPUs.  Dynamic variable
1424 * cleaning is explained in detail in <sys/dtrace_impl.h>.
1425 */
1426#if defined(__APPLE__) /* Quiet compiler warning. */
1427static
1428#endif /* __APPLE__ */
1429void
1430dtrace_dynvar_clean(dtrace_dstate_t *dstate)
1431{
1432	dtrace_dynvar_t *dirty;
1433	dtrace_dstate_percpu_t *dcpu;
1434	int i, work = 0;
1435
1436	for (i = 0; i < (int)NCPU; i++) {
1437		dcpu = &dstate->dtds_percpu[i];
1438
1439		ASSERT(dcpu->dtdsc_rinsing == NULL);
1440
1441		/*
1442		 * If the dirty list is NULL, there is no dirty work to do.
1443		 */
1444		if (dcpu->dtdsc_dirty == NULL)
1445			continue;
1446
1447		/*
1448		 * If the clean list is non-NULL, then we're not going to do
1449		 * any work for this CPU -- it means that there has not been
1450		 * a dtrace_dynvar() allocation on this CPU (or from this CPU)
1451		 * since the last time we cleaned house.
1452		 */
1453		if (dcpu->dtdsc_clean != NULL)
1454			continue;
1455
1456		work = 1;
1457
1458		/*
1459		 * Atomically move the dirty list aside.
1460		 */
1461		do {
1462			dirty = dcpu->dtdsc_dirty;
1463
1464			/*
1465			 * Before we zap the dirty list, set the rinsing list.
1466			 * (This allows for a potential assertion in
1467			 * dtrace_dynvar():  if a free dynamic variable appears
1468			 * on a hash chain, either the dirty list or the
1469			 * rinsing list for some CPU must be non-NULL.)
1470			 */
1471			dcpu->dtdsc_rinsing = dirty;
1472			dtrace_membar_producer();
1473		} while (dtrace_casptr(&dcpu->dtdsc_dirty,
1474		    dirty, NULL) != dirty);
1475	}
1476
1477	if (!work) {
1478		/*
1479		 * We have no work to do; we can simply return.
1480		 */
1481		return;
1482	}
1483
1484	dtrace_sync();
1485
1486	for (i = 0; i < (int)NCPU; i++) {
1487		dcpu = &dstate->dtds_percpu[i];
1488
1489		if (dcpu->dtdsc_rinsing == NULL)
1490			continue;
1491
1492		/*
1493		 * We are now guaranteed that no hash chain contains a pointer
1494		 * into this dirty list; we can make it clean.
1495		 */
1496		ASSERT(dcpu->dtdsc_clean == NULL);
1497		dcpu->dtdsc_clean = dcpu->dtdsc_rinsing;
1498		dcpu->dtdsc_rinsing = NULL;
1499	}
1500
1501	/*
1502	 * Before we actually set the state to be DTRACE_DSTATE_CLEAN, make
1503	 * sure that all CPUs have seen all of the dtdsc_clean pointers.
1504	 * This prevents a race whereby a CPU incorrectly decides that
1505	 * the state should be something other than DTRACE_DSTATE_CLEAN
1506	 * after dtrace_dynvar_clean() has completed.
1507	 */
1508	dtrace_sync();
1509
1510	dstate->dtds_state = DTRACE_DSTATE_CLEAN;
1511}
1512
1513/*
1514 * Depending on the value of the op parameter, this function looks-up,
1515 * allocates or deallocates an arbitrarily-keyed dynamic variable.  If an
1516 * allocation is requested, this function will return a pointer to a
1517 * dtrace_dynvar_t corresponding to the allocated variable -- or NULL if no
1518 * variable can be allocated.  If NULL is returned, the appropriate counter
1519 * will be incremented.
1520 */
1521#if defined(__APPLE__) /* Quiet compiler warning. */
1522static
1523#endif /* __APPLE__ */
1524dtrace_dynvar_t *
1525dtrace_dynvar(dtrace_dstate_t *dstate, uint_t nkeys,
1526    dtrace_key_t *key, size_t dsize, dtrace_dynvar_op_t op,
1527    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
1528{
1529	uint64_t hashval = DTRACE_DYNHASH_VALID;
1530	dtrace_dynhash_t *hash = dstate->dtds_hash;
1531	dtrace_dynvar_t *free, *new_free, *next, *dvar, *start, *prev = NULL;
1532	processorid_t me = CPU->cpu_id, cpu = me;
1533	dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[me];
1534	size_t bucket, ksize;
1535	size_t chunksize = dstate->dtds_chunksize;
1536	uintptr_t kdata, lock, nstate;
1537	uint_t i;
1538
1539	ASSERT(nkeys != 0);
1540
1541	/*
1542	 * Hash the key.  As with aggregations, we use Jenkins' "One-at-a-time"
1543	 * algorithm.  For the by-value portions, we perform the algorithm in
1544	 * 16-bit chunks (as opposed to 8-bit chunks).  This speeds things up a
1545	 * bit, and seems to have only a minute effect on distribution.  For
1546	 * the by-reference data, we perform "One-at-a-time" iterating (safely)
1547	 * over each referenced byte.  It's painful to do this, but it's much
1548	 * better than pathological hash distribution.  The efficacy of the
1549	 * hashing algorithm (and a comparison with other algorithms) may be
1550	 * found by running the ::dtrace_dynstat MDB dcmd.
1551	 */
1552	for (i = 0; i < nkeys; i++) {
1553		if (key[i].dttk_size == 0) {
1554			uint64_t val = key[i].dttk_value;
1555
1556			hashval += (val >> 48) & 0xffff;
1557			hashval += (hashval << 10);
1558			hashval ^= (hashval >> 6);
1559
1560			hashval += (val >> 32) & 0xffff;
1561			hashval += (hashval << 10);
1562			hashval ^= (hashval >> 6);
1563
1564			hashval += (val >> 16) & 0xffff;
1565			hashval += (hashval << 10);
1566			hashval ^= (hashval >> 6);
1567
1568			hashval += val & 0xffff;
1569			hashval += (hashval << 10);
1570			hashval ^= (hashval >> 6);
1571		} else {
1572			/*
1573			 * This is incredibly painful, but it beats the hell
1574			 * out of the alternative.
1575			 */
1576			uint64_t j, size = key[i].dttk_size;
1577			uintptr_t base = (uintptr_t)key[i].dttk_value;
1578
1579			if (!dtrace_canload(base, size, mstate, vstate))
1580				break;
1581
1582			for (j = 0; j < size; j++) {
1583				hashval += dtrace_load8(base + j);
1584				hashval += (hashval << 10);
1585				hashval ^= (hashval >> 6);
1586			}
1587		}
1588	}
1589
1590	if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_FAULT))
1591		return (NULL);
1592
1593	hashval += (hashval << 3);
1594	hashval ^= (hashval >> 11);
1595	hashval += (hashval << 15);
1596
1597	/*
1598	 * There is a remote chance (ideally, 1 in 2^31) that our hashval
1599	 * comes out to be one of our two sentinel hash values.  If this
1600	 * actually happens, we set the hashval to be a value known to be a
1601	 * non-sentinel value.
1602	 */
1603	if (hashval == DTRACE_DYNHASH_FREE || hashval == DTRACE_DYNHASH_SINK)
1604		hashval = DTRACE_DYNHASH_VALID;
1605
1606	/*
1607	 * Yes, it's painful to do a divide here.  If the cycle count becomes
1608	 * important here, tricks can be pulled to reduce it.  (However, it's
1609	 * critical that hash collisions be kept to an absolute minimum;
1610	 * they're much more painful than a divide.)  It's better to have a
1611	 * solution that generates few collisions and still keeps things
1612	 * relatively simple.
1613	 */
1614	bucket = hashval % dstate->dtds_hashsize;
1615
1616	if (op == DTRACE_DYNVAR_DEALLOC) {
1617		volatile uintptr_t *lockp = &hash[bucket].dtdh_lock;
1618
1619		for (;;) {
1620			while ((lock = *lockp) & 1)
1621				continue;
1622
1623#if !defined(__APPLE__)  /* Quiet compiler warning */
1624			if (dtrace_casptr((void *)lockp,
1625			    (void *)lock, (void *)(lock + 1)) == (void *)lock)
1626				break;
1627#else
1628			if (dtrace_casptr((void *)(uintptr_t)lockp,
1629			    (void *)lock, (void *)(lock + 1)) == (void *)lock)
1630				break;
1631#endif /* __APPLE__ */
1632		}
1633
1634		dtrace_membar_producer();
1635	}
1636
1637top:
1638	prev = NULL;
1639	lock = hash[bucket].dtdh_lock;
1640
1641	dtrace_membar_consumer();
1642
1643	start = hash[bucket].dtdh_chain;
1644	ASSERT(start != NULL && (start->dtdv_hashval == DTRACE_DYNHASH_SINK ||
1645	    start->dtdv_hashval != DTRACE_DYNHASH_FREE ||
1646	    op != DTRACE_DYNVAR_DEALLOC));
1647
1648	for (dvar = start; dvar != NULL; dvar = dvar->dtdv_next) {
1649		dtrace_tuple_t *dtuple = &dvar->dtdv_tuple;
1650		dtrace_key_t *dkey = &dtuple->dtt_key[0];
1651
1652		if (dvar->dtdv_hashval != hashval) {
1653			if (dvar->dtdv_hashval == DTRACE_DYNHASH_SINK) {
1654				/*
1655				 * We've reached the sink, and therefore the
1656				 * end of the hash chain; we can kick out of
1657				 * the loop knowing that we have seen a valid
1658				 * snapshot of state.
1659				 */
1660				ASSERT(dvar->dtdv_next == NULL);
1661				ASSERT(dvar == &dtrace_dynhash_sink);
1662				break;
1663			}
1664
1665			if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE) {
1666				/*
1667				 * We've gone off the rails:  somewhere along
1668				 * the line, one of the members of this hash
1669				 * chain was deleted.  Note that we could also
1670				 * detect this by simply letting this loop run
1671				 * to completion, as we would eventually hit
1672				 * the end of the dirty list.  However, we
1673				 * want to avoid running the length of the
1674				 * dirty list unnecessarily (it might be quite
1675				 * long), so we catch this as early as
1676				 * possible by detecting the hash marker.  In
1677				 * this case, we simply set dvar to NULL and
1678				 * break; the conditional after the loop will
1679				 * send us back to top.
1680				 */
1681				dvar = NULL;
1682				break;
1683			}
1684
1685			goto next;
1686		}
1687
1688		if (dtuple->dtt_nkeys != nkeys)
1689			goto next;
1690
1691		for (i = 0; i < nkeys; i++, dkey++) {
1692			if (dkey->dttk_size != key[i].dttk_size)
1693				goto next; /* size or type mismatch */
1694
1695			if (dkey->dttk_size != 0) {
1696				if (dtrace_bcmp(
1697				    (void *)(uintptr_t)key[i].dttk_value,
1698				    (void *)(uintptr_t)dkey->dttk_value,
1699				    dkey->dttk_size))
1700					goto next;
1701			} else {
1702				if (dkey->dttk_value != key[i].dttk_value)
1703					goto next;
1704			}
1705		}
1706
1707		if (op != DTRACE_DYNVAR_DEALLOC)
1708			return (dvar);
1709
1710		ASSERT(dvar->dtdv_next == NULL ||
1711		    dvar->dtdv_next->dtdv_hashval != DTRACE_DYNHASH_FREE);
1712
1713		if (prev != NULL) {
1714			ASSERT(hash[bucket].dtdh_chain != dvar);
1715			ASSERT(start != dvar);
1716			ASSERT(prev->dtdv_next == dvar);
1717			prev->dtdv_next = dvar->dtdv_next;
1718		} else {
1719			if (dtrace_casptr(&hash[bucket].dtdh_chain,
1720			    start, dvar->dtdv_next) != start) {
1721				/*
1722				 * We have failed to atomically swing the
1723				 * hash table head pointer, presumably because
1724				 * of a conflicting allocation on another CPU.
1725				 * We need to reread the hash chain and try
1726				 * again.
1727				 */
1728				goto top;
1729			}
1730		}
1731
1732		dtrace_membar_producer();
1733
1734		/*
1735		 * Now set the hash value to indicate that it's free.
1736		 */
1737		ASSERT(hash[bucket].dtdh_chain != dvar);
1738		dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
1739
1740		dtrace_membar_producer();
1741
1742		/*
1743		 * Set the next pointer to point at the dirty list, and
1744		 * atomically swing the dirty pointer to the newly freed dvar.
1745		 */
1746		do {
1747			next = dcpu->dtdsc_dirty;
1748			dvar->dtdv_next = next;
1749		} while (dtrace_casptr(&dcpu->dtdsc_dirty, next, dvar) != next);
1750
1751		/*
1752		 * Finally, unlock this hash bucket.
1753		 */
1754		ASSERT(hash[bucket].dtdh_lock == lock);
1755		ASSERT(lock & 1);
1756		hash[bucket].dtdh_lock++;
1757
1758		return (NULL);
1759next:
1760		prev = dvar;
1761		continue;
1762	}
1763
1764	if (dvar == NULL) {
1765		/*
1766		 * If dvar is NULL, it is because we went off the rails:
1767		 * one of the elements that we traversed in the hash chain
1768		 * was deleted while we were traversing it.  In this case,
1769		 * we assert that we aren't doing a dealloc (deallocs lock
1770		 * the hash bucket to prevent themselves from racing with
1771		 * one another), and retry the hash chain traversal.
1772		 */
1773		ASSERT(op != DTRACE_DYNVAR_DEALLOC);
1774		goto top;
1775	}
1776
1777	if (op != DTRACE_DYNVAR_ALLOC) {
1778		/*
1779		 * If we are not to allocate a new variable, we want to
1780		 * return NULL now.  Before we return, check that the value
1781		 * of the lock word hasn't changed.  If it has, we may have
1782		 * seen an inconsistent snapshot.
1783		 */
1784		if (op == DTRACE_DYNVAR_NOALLOC) {
1785			if (hash[bucket].dtdh_lock != lock)
1786				goto top;
1787		} else {
1788			ASSERT(op == DTRACE_DYNVAR_DEALLOC);
1789			ASSERT(hash[bucket].dtdh_lock == lock);
1790			ASSERT(lock & 1);
1791			hash[bucket].dtdh_lock++;
1792		}
1793
1794		return (NULL);
1795	}
1796
1797	/*
1798	 * We need to allocate a new dynamic variable.  The size we need is the
1799	 * size of dtrace_dynvar plus the size of nkeys dtrace_key_t's plus the
1800	 * size of any auxiliary key data (rounded up to 8-byte alignment) plus
1801	 * the size of any referred-to data (dsize).  We then round the final
1802	 * size up to the chunksize for allocation.
1803	 */
1804	for (ksize = 0, i = 0; i < nkeys; i++)
1805		ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
1806
1807	/*
1808	 * This should be pretty much impossible, but could happen if, say,
1809	 * strange DIF specified the tuple.  Ideally, this should be an
1810	 * assertion and not an error condition -- but that requires that the
1811	 * chunksize calculation in dtrace_difo_chunksize() be absolutely
1812	 * bullet-proof.  (That is, it must not be able to be fooled by
1813	 * malicious DIF.)  Given the lack of backwards branches in DIF,
1814	 * solving this would presumably not amount to solving the Halting
1815	 * Problem -- but it still seems awfully hard.
1816	 */
1817	if (sizeof (dtrace_dynvar_t) + sizeof (dtrace_key_t) * (nkeys - 1) +
1818	    ksize + dsize > chunksize) {
1819		dcpu->dtdsc_drops++;
1820		return (NULL);
1821	}
1822
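	/*
	 * Allocation strategy:  we first try to take a chunk from our own
	 * CPU's free list.  If the free list is empty, we attempt to move
	 * our clean list to the free list and retry.  If both are empty, we
	 * walk the other CPUs' lists, remembering (in nstate) whether any
	 * CPU still had dirty or rinsing chunks so that a subsequent failure
	 * is charged to the appropriate drop counter.
	 */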
1823	nstate = DTRACE_DSTATE_EMPTY;
1824
1825	do {
1826retry:
1827		free = dcpu->dtdsc_free;
1828
1829		if (free == NULL) {
1830			dtrace_dynvar_t *clean = dcpu->dtdsc_clean;
1831			void *rval;
1832
1833			if (clean == NULL) {
1834				/*
1835				 * We're out of dynamic variable space on
1836				 * this CPU.  Unless we have tried all CPUs,
1837				 * we'll try to allocate from a different
1838				 * CPU.
1839				 */
1840				switch (dstate->dtds_state) {
1841				case DTRACE_DSTATE_CLEAN: {
1842					void *sp = &dstate->dtds_state;
1843
1844					if (++cpu >= (int)NCPU)
1845						cpu = 0;
1846
1847					if (dcpu->dtdsc_dirty != NULL &&
1848					    nstate == DTRACE_DSTATE_EMPTY)
1849						nstate = DTRACE_DSTATE_DIRTY;
1850
1851					if (dcpu->dtdsc_rinsing != NULL)
1852						nstate = DTRACE_DSTATE_RINSING;
1853
1854					dcpu = &dstate->dtds_percpu[cpu];
1855
1856					if (cpu != me)
1857						goto retry;
1858
1859					(void) dtrace_cas32(sp,
1860					    DTRACE_DSTATE_CLEAN, nstate);
1861
1862					/*
1863					 * To increment the correct bean
1864					 * counter, take another lap.
1865					 */
1866					goto retry;
1867				}
1868
1869				case DTRACE_DSTATE_DIRTY:
1870					dcpu->dtdsc_dirty_drops++;
1871					break;
1872
1873				case DTRACE_DSTATE_RINSING:
1874					dcpu->dtdsc_rinsing_drops++;
1875					break;
1876
1877				case DTRACE_DSTATE_EMPTY:
1878					dcpu->dtdsc_drops++;
1879					break;
1880				}
1881
1882				DTRACE_CPUFLAG_SET(CPU_DTRACE_DROP);
1883				return (NULL);
1884			}
1885
1886			/*
1887			 * The clean list appears to be non-empty.  We want to
1888			 * move the clean list to the free list; we start by
1889			 * moving the clean pointer aside.
1890			 */
1891			if (dtrace_casptr(&dcpu->dtdsc_clean,
1892			    clean, NULL) != clean) {
1893				/*
1894				 * We are in one of two situations:
1895				 *
1896				 *  (a)	The clean list was switched to the
1897				 *	free list by another CPU.
1898				 *
1899				 *  (b)	The clean list was added to by the
1900				 *	cleansing cyclic.
1901				 *
1902				 * In either of these situations, we can
1903				 * just reattempt the free list allocation.
1904				 */
1905				goto retry;
1906			}
1907
1908			ASSERT(clean->dtdv_hashval == DTRACE_DYNHASH_FREE);
1909
1910			/*
1911			 * Now we'll move the clean list to the free list.
1912			 * It's impossible for this to fail:  the only way
1913			 * the free list can be updated is through this
1914			 * code path, and only one CPU can own the clean list.
1915			 * Thus, it would only be possible for this to fail if
1916			 * this code were racing with dtrace_dynvar_clean().
1917			 * (That is, if dtrace_dynvar_clean() updated the clean
1918			 * list, and we ended up racing to update the free
1919			 * list.)  This race is prevented by the dtrace_sync()
1920			 * in dtrace_dynvar_clean() -- which flushes the
1921			 * owners of the clean lists out before resetting
1922			 * the clean lists.
1923			 */
1924			rval = dtrace_casptr(&dcpu->dtdsc_free, NULL, clean);
1925			ASSERT(rval == NULL);
1926			goto retry;
1927		}
1928
1929		dvar = free;
1930		new_free = dvar->dtdv_next;
1931	} while (dtrace_casptr(&dcpu->dtdsc_free, free, new_free) != free);
1932
1933	/*
1934	 * We have now allocated a new chunk.  We copy the tuple keys into the
1935	 * tuple array and copy any referenced key data into the data space
1936	 * following the tuple array.  As we do this, we relocate dttk_value
1937	 * in the final tuple to point to the key data address in the chunk.
1938	 */
1939	kdata = (uintptr_t)&dvar->dtdv_tuple.dtt_key[nkeys];
1940	dvar->dtdv_data = (void *)(kdata + ksize);
1941	dvar->dtdv_tuple.dtt_nkeys = nkeys;
1942
1943	for (i = 0; i < nkeys; i++) {
1944		dtrace_key_t *dkey = &dvar->dtdv_tuple.dtt_key[i];
1945		size_t kesize = key[i].dttk_size;
1946
1947		if (kesize != 0) {
1948			dtrace_bcopy(
1949			    (const void *)(uintptr_t)key[i].dttk_value,
1950			    (void *)kdata, kesize);
1951			dkey->dttk_value = kdata;
1952			kdata += P2ROUNDUP(kesize, sizeof (uint64_t));
1953		} else {
1954			dkey->dttk_value = key[i].dttk_value;
1955		}
1956
1957		dkey->dttk_size = kesize;
1958	}
1959
1960	ASSERT(dvar->dtdv_hashval == DTRACE_DYNHASH_FREE);
1961	dvar->dtdv_hashval = hashval;
1962	dvar->dtdv_next = start;
1963
1964	if (dtrace_casptr(&hash[bucket].dtdh_chain, start, dvar) == start)
1965		return (dvar);
1966
1967	/*
1968	 * The cas has failed.  Either another CPU is adding an element to
1969	 * this hash chain, or another CPU is deleting an element from this
1970	 * hash chain.  The simplest way to deal with both of these cases
1971	 * (though not necessarily the most efficient) is to free our
1972	 * allocated block and tail-call ourselves.  Note that the free is
1973	 * to the dirty list and _not_ to the free list.  This is to prevent
1974	 * races with allocators, above.
1975	 */
1976	dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
1977
1978	dtrace_membar_producer();
1979
1980	do {
1981		free = dcpu->dtdsc_dirty;
1982		dvar->dtdv_next = free;
1983	} while (dtrace_casptr(&dcpu->dtdsc_dirty, free, dvar) != free);
1984
1985	return (dtrace_dynvar(dstate, nkeys, key, dsize, op, mstate, vstate));
1986}
1987
1988/*ARGSUSED*/
1989static void
1990dtrace_aggregate_min(uint64_t *oval, uint64_t nval, uint64_t arg)
1991{
1992#pragma unused(arg) /* __APPLE__ */
1993	if ((int64_t)nval < (int64_t)*oval)
1994		*oval = nval;
1995}
1996
1997/*ARGSUSED*/
1998static void
1999dtrace_aggregate_max(uint64_t *oval, uint64_t nval, uint64_t arg)
2000{
2001#pragma unused(arg) /* __APPLE__ */
2002	if ((int64_t)nval > (int64_t)*oval)
2003		*oval = nval;
2004}
2005
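/*
 * quantize() buckets values by signed powers of two; the bucket values are
 * given by DTRACE_QUANTIZE_BUCKETVAL().  As an illustrative example (for
 * non-negative values only):  a value of 0 lands in the dedicated zero
 * bucket at DTRACE_QUANTIZE_ZEROBUCKET, and values 4 through 7 all land in
 * the bucket whose value is 4 -- the largest bucket value that does not
 * exceed them.
 */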
2006static void
2007dtrace_aggregate_quantize(uint64_t *quanta, uint64_t nval, uint64_t incr)
2008{
2009	int i, zero = DTRACE_QUANTIZE_ZEROBUCKET;
2010	int64_t val = (int64_t)nval;
2011
2012	if (val < 0) {
2013		for (i = 0; i < zero; i++) {
2014			if (val <= DTRACE_QUANTIZE_BUCKETVAL(i)) {
2015				quanta[i] += incr;
2016				return;
2017			}
2018		}
2019	} else {
2020		for (i = zero + 1; i < DTRACE_QUANTIZE_NBUCKETS; i++) {
2021			if (val < DTRACE_QUANTIZE_BUCKETVAL(i)) {
2022				quanta[i - 1] += incr;
2023				return;
2024			}
2025		}
2026
2027		quanta[DTRACE_QUANTIZE_NBUCKETS - 1] += incr;
2028		return;
2029	}
2030
2031	ASSERT(0);
2032}
2033
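/*
 * lquantize() uses a linear bucket layout.  In the indexing used below (that
 * is, after the leading argument word has been consumed), lquanta[0] counts
 * values below the base (underflow), lquanta[1] through lquanta[levels]
 * count the linear buckets of width 'step', and lquanta[levels + 1] counts
 * values at or above base + (levels * step) (overflow).  As an illustrative
 * example with base = 0, step = 10 and levels = 5:  a value of -1 lands in
 * lquanta[0], a value of 47 lands in lquanta[5] (the [40, 50) bucket), and
 * a value of 50 lands in the overflow bucket, lquanta[6].
 */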
2034static void
2035dtrace_aggregate_lquantize(uint64_t *lquanta, uint64_t nval, uint64_t incr)
2036{
2037	uint64_t arg = *lquanta++;
2038	int32_t base = DTRACE_LQUANTIZE_BASE(arg);
2039	uint16_t step = DTRACE_LQUANTIZE_STEP(arg);
2040	uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg);
2041	int32_t val = (int32_t)nval, level;
2042
2043	ASSERT(step != 0);
2044	ASSERT(levels != 0);
2045
2046	if (val < base) {
2047		/*
2048		 * This is an underflow.
2049		 */
2050		lquanta[0] += incr;
2051		return;
2052	}
2053
2054	level = (val - base) / step;
2055
2056	if (level < levels) {
2057		lquanta[level + 1] += incr;
2058		return;
2059	}
2060
2061	/*
2062	 * This is an overflow.
2063	 */
2064	lquanta[levels + 1] += incr;
2065}
2066
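/*
 * Map a value to its log/linear bucket index:  bucket 0 is the underflow
 * bucket (values below factor^low), each order of magnitude from factor^low
 * through factor^high is divided into at most 'nsteps' linear buckets, and
 * the final bucket catches everything at or above factor^(high + 1).  As an
 * illustrative example (this parameterization is not prescribed by anything
 * in the framework), with factor = 10, low = 0, high = 2 and nsteps = 10:
 * buckets 1 through 9 cover the values 1 through 9, buckets 10 through 18
 * cover [10, 100) in steps of 10, and a value of 35 therefore maps to
 * bucket 12 (the [30, 40) bucket).
 */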
2067static int
2068dtrace_aggregate_llquantize_bucket(int16_t factor, int16_t low, int16_t high,
2069    int16_t nsteps, int64_t value)
2070{
2071	int64_t this = 1, last, next;
2072	int base = 1, order;
2073
2074	for (order = 0; order < low; ++order)
2075		this *= factor;
2076
2077	/*
2078	 * If our value is less than our factor taken to the power of the
2079	 * low order of magnitude, it goes into the zeroth bucket.
2080	 */
2081	if (value < this)
2082		return 0;
2083	else
2084		last = this;
2085
2086	for (this *= factor; order <= high; ++order) {
2087		int nbuckets = this > nsteps ? nsteps : this;
2088
2089		/*
2090		 * We should not generally get log/linear quantizations
2091		 * with a high magnitude that allows a 64-bit value to
2092		 * overflow, but we nonetheless protect against this
2093		 * by explicitly checking for overflow, and clamping
2094		 * our value accordingly.
2095		 */
2096		next = this * factor;
2097		if (next < this) {
2098			value = this - 1;
2099		}
2100
2101		/*
2102		 * If our value lies within this order of magnitude,
2103		 * determine its position by taking the offset within
2104		 * the order of magnitude, dividing by the bucket
2105		 * width, and adding to our (accumulated) base.
2106		 */
2107		if (value < this) {
2108			return (base + (value - last) / (this / nbuckets));
2109		}
2110
2111		base += nbuckets - (nbuckets / factor);
2112		last = this;
2113		this = next;
2114	}
2115
2116	/*
2117	 * Our value is greater than or equal to our factor taken to the
2118	 * power of one plus the high magnitude -- return the top bucket.
2119	 */
2120	return base;
2121}
2122
2123static void
2124dtrace_aggregate_llquantize(uint64_t *llquanta, uint64_t nval, uint64_t incr)
2125{
2126	uint64_t arg    = *llquanta++;
2127	uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(arg);
2128	uint16_t low    = DTRACE_LLQUANTIZE_LOW(arg);
2129	uint16_t high   = DTRACE_LLQUANTIZE_HIGH(arg);
2130	uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(arg);
2131
2132	llquanta[dtrace_aggregate_llquantize_bucket(factor, low, high, nsteps, nval)] += incr;
2133}
2134
2135/*ARGSUSED*/
2136static void
2137dtrace_aggregate_avg(uint64_t *data, uint64_t nval, uint64_t arg)
2138{
2139#pragma unused(arg) /* __APPLE__ */
2140	data[0]++;
2141	data[1] += nval;
2142}
2143
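/*
 * For stddev(), we maintain a running count in data[0], a running sum in
 * data[1], and a 128-bit running sum of squares in data[2] and data[3]; the
 * consumer can then derive the standard deviation as
 * sqrt((sum of squares / count) - (sum / count)^2) when the aggregation is
 * reported.
 */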
2144/*ARGSUSED*/
2145static void
2146dtrace_aggregate_stddev(uint64_t *data, uint64_t nval, uint64_t arg)
2147{
2148#pragma unused(arg) /* __APPLE__ */
2149	int64_t snval = (int64_t)nval;
2150	uint64_t tmp[2];
2151
2152	data[0]++;
2153	data[1] += nval;
2154
2155	/*
2156	 * What we want to say here is:
2157	 *
2158	 * data[2] += nval * nval;
2159	 *
2160	 * But given that nval is 64-bit, we could easily overflow, so
2161	 * we do this as 128-bit arithmetic.
2162	 */
2163	if (snval < 0)
2164		snval = -snval;
2165
2166	dtrace_multiply_128((uint64_t)snval, (uint64_t)snval, tmp);
2167	dtrace_add_128(data + 2, tmp, data + 2);
2168}
2169
2170/*ARGSUSED*/
2171static void
2172dtrace_aggregate_count(uint64_t *oval, uint64_t nval, uint64_t arg)
2173{
2174#pragma unused(nval, arg) /* __APPLE__ */
2175	*oval = *oval + 1;
2176}
2177
2178/*ARGSUSED*/
2179static void
2180dtrace_aggregate_sum(uint64_t *oval, uint64_t nval, uint64_t arg)
2181{
2182#pragma unused(arg) /* __APPLE__ */
2183	*oval += nval;
2184}
2185
2186/*
2187 * Aggregate given the tuple in the principal data buffer, and the aggregating
2188 * action denoted by the specified dtrace_aggregation_t.  The aggregation
2189 * buffer is specified as the buf parameter.  This routine does not return
2190 * failure; if there is no space in the aggregation buffer, the data will be
2191 * dropped, and a corresponding counter incremented.
2192 */
2193static void
2194dtrace_aggregate(dtrace_aggregation_t *agg, dtrace_buffer_t *dbuf,
2195    intptr_t offset, dtrace_buffer_t *buf, uint64_t expr, uint64_t arg)
2196{
2197#pragma unused(arg)
2198	dtrace_recdesc_t *rec = &agg->dtag_action.dta_rec;
2199	uint32_t i, ndx, size, fsize;
2200	uint32_t align = sizeof (uint64_t) - 1;
2201	dtrace_aggbuffer_t *agb;
2202	dtrace_aggkey_t *key;
2203	uint32_t hashval = 0, limit, isstr;
2204	caddr_t tomax, data, kdata;
2205	dtrace_actkind_t action;
2206	dtrace_action_t *act;
2207	uintptr_t offs;
2208
2209	if (buf == NULL)
2210		return;
2211
2212	if (!agg->dtag_hasarg) {
2213		/*
2214		 * Currently, only quantize() and lquantize() take additional
2215		 * arguments, and they have the same semantics:  an increment
2216		 * value that defaults to 1 when not present.  If additional
2217		 * aggregating actions take arguments, the setting of the
2218		 * default argument value will presumably have to become more
2219		 * sophisticated...
2220		 */
2221		arg = 1;
2222	}
2223
2224	action = agg->dtag_action.dta_kind - DTRACEACT_AGGREGATION;
2225	size = rec->dtrd_offset - agg->dtag_base;
2226	fsize = size + rec->dtrd_size;
2227
2228	ASSERT(dbuf->dtb_tomax != NULL);
2229	data = dbuf->dtb_tomax + offset + agg->dtag_base;
2230
2231	if ((tomax = buf->dtb_tomax) == NULL) {
2232		dtrace_buffer_drop(buf);
2233		return;
2234	}
2235
2236	/*
2237	 * The metastructure is always at the bottom of the buffer.
2238	 */
2239	agb = (dtrace_aggbuffer_t *)(tomax + buf->dtb_size -
2240	    sizeof (dtrace_aggbuffer_t));
2241
2242	if (buf->dtb_offset == 0) {
2243		/*
2244		 * We just kludge up approximately 1/8th of the size to be
2245		 * buckets.  If this guess ends up being routinely
2246		 * off-the-mark, we may need to dynamically readjust this
2247		 * based on past performance.
2248		 */
2249		uintptr_t hashsize = (buf->dtb_size >> 3) / sizeof (uintptr_t);
2250
2251		if ((uintptr_t)agb - hashsize * sizeof (dtrace_aggkey_t *) <
2252		    (uintptr_t)tomax || hashsize == 0) {
2253			/*
2254			 * We've been given a ludicrously small buffer;
2255			 * increment our drop count and leave.
2256			 */
2257			dtrace_buffer_drop(buf);
2258			return;
2259		}
2260
2261		/*
2262		 * And now, a pathetic attempt to try to get an odd (or
2263		 * perchance, a prime) hash size for better hash distribution.
2264		 */
2265		if (hashsize > (DTRACE_AGGHASHSIZE_SLEW << 3))
2266			hashsize -= DTRACE_AGGHASHSIZE_SLEW;
2267
2268		agb->dtagb_hashsize = hashsize;
2269		agb->dtagb_hash = (dtrace_aggkey_t **)((uintptr_t)agb -
2270		    agb->dtagb_hashsize * sizeof (dtrace_aggkey_t *));
2271		agb->dtagb_free = (uintptr_t)agb->dtagb_hash;
2272
2273		for (i = 0; i < agb->dtagb_hashsize; i++)
2274			agb->dtagb_hash[i] = NULL;
2275	}
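
	/*
	 * At this point the aggregation buffer is laid out as follows:  the
	 * dtrace_aggbuffer_t sits at the end of the buffer (its highest
	 * addresses), the hash bucket array sits immediately below it,
	 * dtrace_aggkey_t structures are carved toward lower addresses from
	 * dtagb_free, and the key data itself (the aggregation ID, the copied
	 * tuple data and the aggregating record) is laid out toward higher
	 * addresses from dtb_offset.  If the two regions would meet, we take
	 * a drop (below).
	 */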
2276
2277	ASSERT(agg->dtag_first != NULL);
2278	ASSERT(agg->dtag_first->dta_intuple);
2279
2280	/*
2281	 * Calculate the hash value based on the key.  Note that we _don't_
2282	 * include the aggid in the hashing (but we will store it as part of
2283	 * the key).  The hashing algorithm is Bob Jenkins' "One-at-a-time"
2284	 * algorithm: a simple, quick algorithm that has no known funnels, and
2285	 * gets good distribution in practice.  The efficacy of the hashing
2286	 * algorithm (and a comparison with other algorithms) may be found by
2287	 * running the ::dtrace_aggstat MDB dcmd.
2288	 */
2289	for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2290		i = act->dta_rec.dtrd_offset - agg->dtag_base;
2291		limit = i + act->dta_rec.dtrd_size;
2292		ASSERT(limit <= size);
2293		isstr = DTRACEACT_ISSTRING(act);
2294
2295		for (; i < limit; i++) {
2296			hashval += data[i];
2297			hashval += (hashval << 10);
2298			hashval ^= (hashval >> 6);
2299
2300			if (isstr && data[i] == '\0')
2301				break;
2302		}
2303	}
2304
2305	hashval += (hashval << 3);
2306	hashval ^= (hashval >> 11);
2307	hashval += (hashval << 15);
2308
2309	/*
2310	 * Yes, the divide here is expensive -- but it's generally the least
2311	 * of the performance issues given the amount of data that we iterate
2312	 * over to compute hash values, compare data, etc.
2313	 */
2314	ndx = hashval % agb->dtagb_hashsize;
2315
2316	for (key = agb->dtagb_hash[ndx]; key != NULL; key = key->dtak_next) {
2317		ASSERT((caddr_t)key >= tomax);
2318		ASSERT((caddr_t)key < tomax + buf->dtb_size);
2319
2320		if (hashval != key->dtak_hashval || key->dtak_size != size)
2321			continue;
2322
2323		kdata = key->dtak_data;
2324		ASSERT(kdata >= tomax && kdata < tomax + buf->dtb_size);
2325
2326		for (act = agg->dtag_first; act->dta_intuple;
2327		    act = act->dta_next) {
2328			i = act->dta_rec.dtrd_offset - agg->dtag_base;
2329			limit = i + act->dta_rec.dtrd_size;
2330			ASSERT(limit <= size);
2331			isstr = DTRACEACT_ISSTRING(act);
2332
2333			for (; i < limit; i++) {
2334				if (kdata[i] != data[i])
2335					goto next;
2336
2337				if (isstr && data[i] == '\0')
2338					break;
2339			}
2340		}
2341
2342		if (action != key->dtak_action) {
2343			/*
2344			 * We are aggregating on the same value in the same
2345			 * aggregation with two different aggregating actions.
2346			 * (This should have been picked up in the compiler,
2347			 * so we may be dealing with errant or devious DIF.)
2348			 * This is an error condition; we indicate as much,
2349			 * and return.
2350			 */
2351			DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
2352			return;
2353		}
2354
2355		/*
2356		 * This is a hit:  we need to apply the aggregator to
2357		 * the value at this key.
2358		 */
2359		agg->dtag_aggregate((uint64_t *)(kdata + size), expr, arg);
2360		return;
2361next:
2362		continue;
2363	}
2364
2365	/*
2366	 * We didn't find it.  We need to allocate some zero-filled space,
2367	 * link it into the hash table appropriately, and apply the aggregator
2368	 * to the (zero-filled) value.
2369	 */
2370	offs = buf->dtb_offset;
2371	while (offs & (align - 1))
2372		offs += sizeof (uint32_t);
2373
2374	/*
2375	 * If we don't have enough room to both allocate a new key _and_
2376	 * its associated data, increment the drop count and return.
2377	 */
2378	if ((uintptr_t)tomax + offs + fsize >
2379	    agb->dtagb_free - sizeof (dtrace_aggkey_t)) {
2380		dtrace_buffer_drop(buf);
2381		return;
2382	}
2383
2384	/*CONSTCOND*/
2385	ASSERT(!(sizeof (dtrace_aggkey_t) & (sizeof (uintptr_t) - 1)));
2386	key = (dtrace_aggkey_t *)(agb->dtagb_free - sizeof (dtrace_aggkey_t));
2387	agb->dtagb_free -= sizeof (dtrace_aggkey_t);
2388
2389	key->dtak_data = kdata = tomax + offs;
2390	buf->dtb_offset = offs + fsize;
2391
2392	/*
2393	 * Now copy the data across.
2394	 */
2395	*((dtrace_aggid_t *)kdata) = agg->dtag_id;
2396
2397	for (i = sizeof (dtrace_aggid_t); i < size; i++)
2398		kdata[i] = data[i];
2399
2400	/*
2401	 * Because strings are not zeroed out by default, we need to iterate
2402	 * looking for actions that store strings, and we need to explicitly
2403	 * pad these strings out with zeroes.
2404	 */
2405	for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2406		int nul;
2407
2408		if (!DTRACEACT_ISSTRING(act))
2409			continue;
2410
2411		i = act->dta_rec.dtrd_offset - agg->dtag_base;
2412		limit = i + act->dta_rec.dtrd_size;
2413		ASSERT(limit <= size);
2414
2415		for (nul = 0; i < limit; i++) {
2416			if (nul) {
2417				kdata[i] = '\0';
2418				continue;
2419			}
2420
2421			if (data[i] != '\0')
2422				continue;
2423
2424			nul = 1;
2425		}
2426	}
2427
2428	for (i = size; i < fsize; i++)
2429		kdata[i] = 0;
2430
2431	key->dtak_hashval = hashval;
2432	key->dtak_size = size;
2433	key->dtak_action = action;
2434	key->dtak_next = agb->dtagb_hash[ndx];
2435	agb->dtagb_hash[ndx] = key;
2436
2437	/*
2438	 * Finally, apply the aggregator.
2439	 */
2440	*((uint64_t *)(key->dtak_data + size)) = agg->dtag_initial;
2441	agg->dtag_aggregate((uint64_t *)(key->dtak_data + size), expr, arg);
2442}
2443
2444/*
2445 * Given consumer state, this routine finds a speculation in the INACTIVE
2446 * state and transitions it into the ACTIVE state.  If there is no speculation
2447 * in the INACTIVE state, 0 is returned.  In this case, no error counter is
2448 * incremented -- it is up to the caller to take appropriate action.
2449 */
2450static int
2451dtrace_speculation(dtrace_state_t *state)
2452{
2453	int i = 0;
2454	dtrace_speculation_state_t current;
2455	uint32_t *stat = &state->dts_speculations_unavail, count;
2456
2457	while (i < state->dts_nspeculations) {
2458		dtrace_speculation_t *spec = &state->dts_speculations[i];
2459
2460		current = spec->dtsp_state;
2461
2462		if (current != DTRACESPEC_INACTIVE) {
2463			if (current == DTRACESPEC_COMMITTINGMANY ||
2464			    current == DTRACESPEC_COMMITTING ||
2465			    current == DTRACESPEC_DISCARDING)
2466				stat = &state->dts_speculations_busy;
2467			i++;
2468			continue;
2469		}
2470
2471		if (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2472		    current, DTRACESPEC_ACTIVE) == current)
2473			return (i + 1);
2474	}
2475
2476	/*
2477	 * We couldn't find a speculation.  If we found as much as a single
2478	 * busy speculation buffer, we'll attribute this failure to "busy"
2479	 * instead of "unavail".
2480	 */
2481	do {
2482		count = *stat;
2483	} while (dtrace_cas32(stat, count, count + 1) != count);
2484
2485	return (0);
2486}
2487
2488/*
2489 * This routine commits an active speculation.  If the specified speculation
2490 * is not in a valid state to perform a commit(), this routine will silently do
2491 * nothing.  The state of the specified speculation is transitioned according
2492 * to the state transition diagram outlined in <sys/dtrace_impl.h>
2493 */
2494static void
2495dtrace_speculation_commit(dtrace_state_t *state, processorid_t cpu,
2496    dtrace_specid_t which)
2497{
2498	dtrace_speculation_t *spec;
2499	dtrace_buffer_t *src, *dest;
2500	uintptr_t daddr, saddr, dlimit;
2501#if !defined(__APPLE__)  /* Quiet compiler warning */
2502	dtrace_speculation_state_t current, new;
2503#else
2504	dtrace_speculation_state_t current,  new = DTRACESPEC_INACTIVE;
2505#endif /* __APPLE__ */
2506	intptr_t offs;
2507
2508	if (which == 0)
2509		return;
2510
2511#if !defined(__APPLE__)  /* Quiet compiler warning */
2512	if (which > state->dts_nspeculations) {
2513		cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2514		return;
2515	}
2516#else
2517	if (which > (dtrace_specid_t)state->dts_nspeculations) {
2518		cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2519		return;
2520	}
2521#endif /* __APPLE__ */
2522
2523	spec = &state->dts_speculations[which - 1];
2524	src = &spec->dtsp_buffer[cpu];
2525	dest = &state->dts_buffer[cpu];
2526
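	/*
	 * Transition the speculation's state with a compare-and-swap loop:
	 * we read the current state, select the corresponding new state
	 * below, and retry if another CPU changed the state out from under
	 * us.
	 */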
2527	do {
2528		current = spec->dtsp_state;
2529
2530		if (current == DTRACESPEC_COMMITTINGMANY)
2531			break;
2532
2533		switch (current) {
2534		case DTRACESPEC_INACTIVE:
2535		case DTRACESPEC_DISCARDING:
2536			return;
2537
2538		case DTRACESPEC_COMMITTING:
2539			/*
2540			 * This is only possible if we are (a) commit()'ing
2541			 * without having done a prior speculate() on this CPU
2542			 * and (b) racing with another commit() on a different
2543			 * CPU.  There's nothing to do -- we just assert that
2544			 * our offset is 0.
2545			 */
2546			ASSERT(src->dtb_offset == 0);
2547			return;
2548
2549		case DTRACESPEC_ACTIVE:
2550			new = DTRACESPEC_COMMITTING;
2551			break;
2552
2553		case DTRACESPEC_ACTIVEONE:
2554			/*
2555			 * This speculation is active on one CPU.  If our
2556			 * buffer offset is non-zero, we know that the one CPU
2557			 * must be us.  Otherwise, we are committing on a
2558			 * different CPU from the speculate(), and we must
2559			 * rely on being asynchronously cleaned.
2560			 */
2561			if (src->dtb_offset != 0) {
2562				new = DTRACESPEC_COMMITTING;
2563				break;
2564			}
2565			/*FALLTHROUGH*/
2566
2567		case DTRACESPEC_ACTIVEMANY:
2568			new = DTRACESPEC_COMMITTINGMANY;
2569			break;
2570
2571		default:
2572			ASSERT(0);
2573		}
2574	} while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2575	    current, new) != current);
2576
2577	/*
2578	 * We have set the state to indicate that we are committing this
2579	 * speculation.  Now reserve the necessary space in the destination
2580	 * buffer.
2581	 */
2582	if ((offs = dtrace_buffer_reserve(dest, src->dtb_offset,
2583	    sizeof (uint64_t), state, NULL)) < 0) {
2584		dtrace_buffer_drop(dest);
2585		goto out;
2586	}
2587
2588	/*
2589	 * We have the space; copy the buffer across.  (Note that this is a
2590	 * highly suboptimal bcopy(); in the unlikely event that this becomes
2591	 * a serious performance issue, a high-performance DTrace-specific
2592	 * bcopy() should obviously be invented.)
2593	 */
2594	daddr = (uintptr_t)dest->dtb_tomax + offs;
2595	dlimit = daddr + src->dtb_offset;
2596	saddr = (uintptr_t)src->dtb_tomax;
2597
2598	/*
2599	 * First, the aligned portion.
2600	 */
2601	while (dlimit - daddr >= sizeof (uint64_t)) {
2602		*((uint64_t *)daddr) = *((uint64_t *)saddr);
2603
2604		daddr += sizeof (uint64_t);
2605		saddr += sizeof (uint64_t);
2606	}
2607
2608	/*
2609	 * Now any left-over bit...
2610	 */
2611	while (dlimit - daddr)
2612		*((uint8_t *)daddr++) = *((uint8_t *)saddr++);
2613
2614	/*
2615	 * Finally, commit the reserved space in the destination buffer.
2616	 */
2617	dest->dtb_offset = offs + src->dtb_offset;
2618
2619out:
2620	/*
2621	 * If we're lucky enough to be the only active CPU on this speculation
2622	 * buffer, we can just set the state back to DTRACESPEC_INACTIVE.
2623	 */
2624	if (current == DTRACESPEC_ACTIVE ||
2625	    (current == DTRACESPEC_ACTIVEONE && new == DTRACESPEC_COMMITTING)) {
2626		uint32_t rval = dtrace_cas32((uint32_t *)&spec->dtsp_state,
2627		    DTRACESPEC_COMMITTING, DTRACESPEC_INACTIVE);
2628#pragma unused(rval) /* __APPLE__ */
2629
2630		ASSERT(rval == DTRACESPEC_COMMITTING);
2631	}
2632
2633	src->dtb_offset = 0;
2634	src->dtb_xamot_drops += src->dtb_drops;
2635	src->dtb_drops = 0;
2636}
2637
2638/*
2639 * This routine discards an active speculation.  If the specified speculation
2640 * is not in a valid state to perform a discard(), this routine will silently
2641 * do nothing.  The state of the specified speculation is transitioned
2642 * according to the state transition diagram outlined in <sys/dtrace_impl.h>
2643 */
2644static void
2645dtrace_speculation_discard(dtrace_state_t *state, processorid_t cpu,
2646    dtrace_specid_t which)
2647{
2648	dtrace_speculation_t *spec;
2649#if !defined(__APPLE__)  /* Quiet compiler warning */
2650	dtrace_speculation_state_t current, new;
2651#else
2652	dtrace_speculation_state_t current, new = DTRACESPEC_INACTIVE;
2653#endif /* __APPLE__ */
2654	dtrace_buffer_t *buf;
2655
2656	if (which == 0)
2657		return;
2658
2659#if !defined(__APPLE__)  /* Quiet compiler warning */
2660	if (which > state->dts_nspeculations) {
2661		cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2662		return;
2663	}
2664#else
2665	if (which > (dtrace_specid_t)state->dts_nspeculations) {
2666		cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2667		return;
2668	}
2669#endif /* __APPLE__ */
2670
2671	spec = &state->dts_speculations[which - 1];
2672	buf = &spec->dtsp_buffer[cpu];
2673
2674	do {
2675		current = spec->dtsp_state;
2676
2677		switch (current) {
2678		case DTRACESPEC_INACTIVE:
2679		case DTRACESPEC_COMMITTINGMANY:
2680		case DTRACESPEC_COMMITTING:
2681		case DTRACESPEC_DISCARDING:
2682			return;
2683
2684		case DTRACESPEC_ACTIVE:
2685		case DTRACESPEC_ACTIVEMANY:
2686			new = DTRACESPEC_DISCARDING;
2687			break;
2688
2689		case DTRACESPEC_ACTIVEONE:
2690			if (buf->dtb_offset != 0) {
2691				new = DTRACESPEC_INACTIVE;
2692			} else {
2693				new = DTRACESPEC_DISCARDING;
2694			}
2695			break;
2696
2697		default:
2698			ASSERT(0);
2699		}
2700	} while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2701	    current, new) != current);
2702
2703	buf->dtb_offset = 0;
2704	buf->dtb_drops = 0;
2705}
2706
2707/*
2708 * Note:  not called from probe context.  This function is called
2709 * asynchronously from cross call context to clean any speculations that are
2710 * in the COMMITTINGMANY or DISCARDING states.  These speculations may not be
2711 * transitioned back to the INACTIVE state until all CPUs have cleaned the
2712 * speculation.
2713 */
2714static void
2715dtrace_speculation_clean_here(dtrace_state_t *state)
2716{
2717	dtrace_icookie_t cookie;
2718	processorid_t cpu = CPU->cpu_id;
2719	dtrace_buffer_t *dest = &state->dts_buffer[cpu];
2720	dtrace_specid_t i;
2721
2722	cookie = dtrace_interrupt_disable();
2723
2724	if (dest->dtb_tomax == NULL) {
2725		dtrace_interrupt_enable(cookie);
2726		return;
2727	}
2728
2729#if !defined(__APPLE__)  /* Quiet compiler warning */
2730	for (i = 0; i < state->dts_nspeculations; i++) {
2731#else
2732	for (i = 0; i < (dtrace_specid_t)state->dts_nspeculations; i++) {
2733#endif /* __APPLE__ */
2734		dtrace_speculation_t *spec = &state->dts_speculations[i];
2735		dtrace_buffer_t *src = &spec->dtsp_buffer[cpu];
2736
2737		if (src->dtb_tomax == NULL)
2738			continue;
2739
2740		if (spec->dtsp_state == DTRACESPEC_DISCARDING) {
2741			src->dtb_offset = 0;
2742			continue;
2743		}
2744
2745		if (spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
2746			continue;
2747
2748		if (src->dtb_offset == 0)
2749			continue;
2750
2751		dtrace_speculation_commit(state, cpu, i + 1);
2752	}
2753
2754	dtrace_interrupt_enable(cookie);
2755}
2756
2757/*
2758 * Note:  not called from probe context.  This function is called
2759 * asynchronously (and at a regular interval) to clean any speculations that
2760 * are in the COMMITTINGMANY or DISCARDING states.  If it discovers that there
2761 * is work to be done, it cross calls all CPUs to perform that work;
2762 * COMMITTINGMANY and DISCARDING speculations may not be transitioned back to
2763 * the INACTIVE state until they have been cleaned by all CPUs.
2764 */
2765static void
2766dtrace_speculation_clean(dtrace_state_t *state)
2767{
2768#if !defined(__APPLE__)  /* Quiet compiler warning */
2769	int work = 0, rv;
2770#else
2771	int work = 0;
2772	uint32_t rv;
2773#endif /* __APPLE__ */
2774	dtrace_specid_t i;
2775
2776#if !defined(__APPLE__)  /* Quiet compiler warning */
2777	for (i = 0; i < state->dts_nspeculations; i++) {
2778#else
2779	for (i = 0; i < (dtrace_specid_t)state->dts_nspeculations; i++) {
2780#endif /* __APPLE__ */
2781		dtrace_speculation_t *spec = &state->dts_speculations[i];
2782
2783		ASSERT(!spec->dtsp_cleaning);
2784
2785		if (spec->dtsp_state != DTRACESPEC_DISCARDING &&
2786		    spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
2787			continue;
2788
2789		work++;
2790		spec->dtsp_cleaning = 1;
2791	}
2792
2793	if (!work)
2794		return;
2795
2796	dtrace_xcall(DTRACE_CPUALL,
2797	    (dtrace_xcall_t)dtrace_speculation_clean_here, state);
2798
2799	/*
2800	 * We now know that all CPUs have committed or discarded their
2801	 * speculation buffers, as appropriate.  We can now set the state
2802	 * to inactive.
2803	 */
2804#if !defined(__APPLE__)  /* Quiet compiler warning */
2805	for (i = 0; i < state->dts_nspeculations; i++) {
2806#else
2807	for (i = 0; i < (dtrace_specid_t)state->dts_nspeculations; i++) {
2808#endif /* __APPLE__ */
2809		dtrace_speculation_t *spec = &state->dts_speculations[i];
2810		dtrace_speculation_state_t current, new;
2811
2812		if (!spec->dtsp_cleaning)
2813			continue;
2814
2815		current = spec->dtsp_state;
2816		ASSERT(current == DTRACESPEC_DISCARDING ||
2817		    current == DTRACESPEC_COMMITTINGMANY);
2818
2819		new = DTRACESPEC_INACTIVE;
2820
2821		rv = dtrace_cas32((uint32_t *)&spec->dtsp_state, current, new);
2822		ASSERT(rv == current);
2823		spec->dtsp_cleaning = 0;
2824	}
2825}
2826
2827/*
2828 * Called as part of a speculate() to get the speculative buffer associated
2829 * with a given speculation.  Returns NULL if the specified speculation is not
2830 * in an ACTIVE state.  If the speculation is in the ACTIVEONE state -- and
2831 * the active CPU is not the specified CPU -- the speculation will be
2832 * atomically transitioned into the ACTIVEMANY state.
2833 */
2834static dtrace_buffer_t *
2835dtrace_speculation_buffer(dtrace_state_t *state, processorid_t cpuid,
2836    dtrace_specid_t which)
2837{
2838	dtrace_speculation_t *spec;
2839#if !defined(__APPLE__)  /* Quiet compiler warning */
2840	dtrace_speculation_state_t current, new;
2841#else
2842	dtrace_speculation_state_t current, new = DTRACESPEC_INACTIVE;
2843#endif /* __APPLE__ */
2844	dtrace_buffer_t *buf;
2845
2846	if (which == 0)
2847		return (NULL);
2848
2849#if !defined(__APPLE__)  /* Quiet compiler warning */
2850	if (which > state->dts_nspeculations) {
2851#else
2852	if (which > (dtrace_specid_t)state->dts_nspeculations) {
2853#endif /* __APPLE__ */
2854		cpu_core[cpuid].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2855		return (NULL);
2856	}
2857
2858	spec = &state->dts_speculations[which - 1];
2859	buf = &spec->dtsp_buffer[cpuid];
2860
2861	do {
2862		current = spec->dtsp_state;
2863
2864		switch (current) {
2865		case DTRACESPEC_INACTIVE:
2866		case DTRACESPEC_COMMITTINGMANY:
2867		case DTRACESPEC_DISCARDING:
2868			return (NULL);
2869
2870		case DTRACESPEC_COMMITTING:
2871			ASSERT(buf->dtb_offset == 0);
2872			return (NULL);
2873
2874		case DTRACESPEC_ACTIVEONE:
2875			/*
2876			 * This speculation is currently active on one CPU.
2877			 * Check the offset in the buffer; if it's non-zero,
2878			 * that CPU must be us (and we leave the state alone).
2879			 * If it's zero, assume that we're starting on a new
2880			 * CPU -- and change the state to indicate that the
2881			 * speculation is active on more than one CPU.
2882			 */
2883			if (buf->dtb_offset != 0)
2884				return (buf);
2885
2886			new = DTRACESPEC_ACTIVEMANY;
2887			break;
2888
2889		case DTRACESPEC_ACTIVEMANY:
2890			return (buf);
2891
2892		case DTRACESPEC_ACTIVE:
2893			new = DTRACESPEC_ACTIVEONE;
2894			break;
2895
2896		default:
2897			ASSERT(0);
2898		}
2899	} while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2900	    current, new) != current);
2901
2902	ASSERT(new == DTRACESPEC_ACTIVEONE || new == DTRACESPEC_ACTIVEMANY);
2903	return (buf);
2904}
2905
2906/*
2907 * Return a string.  In the event that the user lacks the privilege to access
2908 * arbitrary kernel memory, we copy the string out to scratch memory so that we
2909 * don't fail access checking.
2910 *
2911 * dtrace_dif_variable() uses this routine as a helper for various
2912 * builtin values such as 'execname' and 'probefunc.'
2913 */
2914#if defined(__APPLE__) /* Quiet compiler warning. */
2915static
2916#endif /* __APPLE__ */
2917uintptr_t
2918dtrace_dif_varstr(uintptr_t addr, dtrace_state_t *state,
2919    dtrace_mstate_t *mstate)
2920{
2921	uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
2922	uintptr_t ret;
2923	size_t strsz;
2924
2925	/*
2926	 * The easy case: this probe is allowed to read all of memory, so
2927	 * we can just return this as a vanilla pointer.
2928	 */
2929	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
2930		return (addr);
2931
2932	/*
2933	 * This is the tougher case: we copy the string in question from
2934	 * kernel memory into scratch memory and return it that way: this
2935	 * ensures that we won't trip up when access checking tests the
2936	 * BYREF return value.
2937	 */
2938	strsz = dtrace_strlen((char *)addr, size) + 1;
2939
2940	if (mstate->dtms_scratch_ptr + strsz >
2941	    mstate->dtms_scratch_base + mstate->dtms_scratch_size) {
2942		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
2943		return (NULL);
2944	}
2945
2946	dtrace_strcpy((const void *)addr, (void *)mstate->dtms_scratch_ptr,
2947	    strsz);
2948	ret = mstate->dtms_scratch_ptr;
2949	mstate->dtms_scratch_ptr += strsz;
2950	return (ret);
2951}
2952
2953/*
2954 * This function implements the DIF emulator's variable lookups.  The emulator
2955 * passes a reserved variable identifier and optional built-in array index.
2956 */
2957static uint64_t
2958dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v,
2959    uint64_t ndx)
2960{
2961	/*
2962	 * If we're accessing one of the uncached arguments, we'll turn this
2963	 * into a reference in the args array.
2964	 */
2965	if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9) {
2966		ndx = v - DIF_VAR_ARG0;
2967		v = DIF_VAR_ARGS;
2968	}
2969
2970	switch (v) {
2971	case DIF_VAR_ARGS:
2972		ASSERT(mstate->dtms_present & DTRACE_MSTATE_ARGS);
2973		if (ndx >= sizeof (mstate->dtms_arg) /
2974		    sizeof (mstate->dtms_arg[0])) {
2975#if !defined(__APPLE__)
2976			int aframes = mstate->dtms_probe->dtpr_aframes + 2;
2977#else
2978			/* Account for introduction of __dtrace_probe() on xnu. */
2979			int aframes = mstate->dtms_probe->dtpr_aframes + 3;
2980#endif /* __APPLE__ */
2981			dtrace_provider_t *pv;
2982			uint64_t val;
2983
2984			pv = mstate->dtms_probe->dtpr_provider;
2985			if (pv->dtpv_pops.dtps_getargval != NULL)
2986				val = pv->dtpv_pops.dtps_getargval(pv->dtpv_arg,
2987				    mstate->dtms_probe->dtpr_id,
2988				    mstate->dtms_probe->dtpr_arg, ndx, aframes);
2989#if defined(__APPLE__)
2990			/* Special case access of arg5 as passed to dtrace_probe_error() (which see.) */
2991			else if (mstate->dtms_probe->dtpr_id == dtrace_probeid_error && ndx == 5) {
2992			        return ((dtrace_state_t *)(uintptr_t)(mstate->dtms_arg[0]))->dts_arg_error_illval;
2993			}
2994#endif /* __APPLE__ */
2995			else
2996				val = dtrace_getarg(ndx, aframes);
2997
2998			/*
2999			 * This is regrettably required to keep the compiler
3000			 * from tail-optimizing the call to dtrace_getarg().
3001			 * The condition always evaluates to true, but the
3002			 * compiler has no way of figuring that out a priori.
3003			 * (None of this would be necessary if the compiler
3004			 * could be relied upon to _always_ tail-optimize
3005			 * the call to dtrace_getarg() -- but it can't.)
3006			 */
3007			if (mstate->dtms_probe != NULL)
3008				return (val);
3009
3010			ASSERT(0);
3011		}
3012
3013		return (mstate->dtms_arg[ndx]);
3014
3015#if !defined(__APPLE__)
3016	case DIF_VAR_UREGS: {
3017		klwp_t *lwp;
3018
3019		if (!dtrace_priv_proc(state))
3020			return (0);
3021
3022		if ((lwp = curthread->t_lwp) == NULL) {
3023			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
3024			cpu_core[CPU->cpu_id].cpuc_dtrace_illval = NULL;
3025			return (0);
3026		}
3027
3028		return (dtrace_getreg(lwp->lwp_regs, ndx));
3029	}
3030#else
3031	case DIF_VAR_UREGS: {
3032		thread_t thread;
3033
3034		if (!dtrace_priv_proc(state))
3035			return (0);
3036
3037		if ((thread = current_thread()) == NULL) {
3038			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
3039			cpu_core[CPU->cpu_id].cpuc_dtrace_illval = 0;
3040			return (0);
3041		}
3042
3043		return (dtrace_getreg(find_user_regs(thread), ndx));
3044	}
3045#endif /* __APPLE__ */
3046
3047#if !defined(__APPLE__)
3048	case DIF_VAR_CURTHREAD:
3049		if (!dtrace_priv_kernel(state))
3050			return (0);
3051		return ((uint64_t)(uintptr_t)curthread);
3052#else
3053	case DIF_VAR_CURTHREAD:
3054		if (!dtrace_priv_kernel(state))
3055			return (0);
3056
3057		return ((uint64_t)(uintptr_t)current_thread());
3058#endif /* __APPLE__ */
3059
3060	case DIF_VAR_TIMESTAMP:
3061		if (!(mstate->dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
3062			mstate->dtms_timestamp = dtrace_gethrtime();
3063			mstate->dtms_present |= DTRACE_MSTATE_TIMESTAMP;
3064		}
3065		return (mstate->dtms_timestamp);
3066
3067#if !defined(__APPLE__)
3068	case DIF_VAR_VTIMESTAMP:
3069		ASSERT(dtrace_vtime_references != 0);
3070		return (curthread->t_dtrace_vtime);
3071#else
3072	case DIF_VAR_VTIMESTAMP:
3073		ASSERT(dtrace_vtime_references != 0);
3074		return (dtrace_get_thread_vtime(current_thread()));
3075#endif /* __APPLE__ */
3076
3077	case DIF_VAR_WALLTIMESTAMP:
3078		if (!(mstate->dtms_present & DTRACE_MSTATE_WALLTIMESTAMP)) {
3079			mstate->dtms_walltimestamp = dtrace_gethrestime();
3080			mstate->dtms_present |= DTRACE_MSTATE_WALLTIMESTAMP;
3081		}
3082		return (mstate->dtms_walltimestamp);
3083
3084	case DIF_VAR_IPL:
3085		if (!dtrace_priv_kernel(state))
3086			return (0);
3087		if (!(mstate->dtms_present & DTRACE_MSTATE_IPL)) {
3088			mstate->dtms_ipl = dtrace_getipl();
3089			mstate->dtms_present |= DTRACE_MSTATE_IPL;
3090		}
3091		return (mstate->dtms_ipl);
3092
3093	case DIF_VAR_EPID:
3094		ASSERT(mstate->dtms_present & DTRACE_MSTATE_EPID);
3095		return (mstate->dtms_epid);
3096
3097	case DIF_VAR_ID:
3098		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3099		return (mstate->dtms_probe->dtpr_id);
3100
3101	case DIF_VAR_STACKDEPTH:
3102		if (!dtrace_priv_kernel(state))
3103			return (0);
3104		if (!(mstate->dtms_present & DTRACE_MSTATE_STACKDEPTH)) {
3105#if !defined(__APPLE__)
3106			int aframes = mstate->dtms_probe->dtpr_aframes + 2;
3107#else
3108			/* Account for introduction of __dtrace_probe() on xnu. */
3109			int aframes = mstate->dtms_probe->dtpr_aframes + 3;
3110#endif /* __APPLE__ */
3111
3112			mstate->dtms_stackdepth = dtrace_getstackdepth(aframes);
3113			mstate->dtms_present |= DTRACE_MSTATE_STACKDEPTH;
3114		}
3115		return (mstate->dtms_stackdepth);
3116
3117	case DIF_VAR_USTACKDEPTH:
3118		if (!dtrace_priv_proc(state))
3119			return (0);
3120		if (!(mstate->dtms_present & DTRACE_MSTATE_USTACKDEPTH)) {
3121			/*
3122			 * See comment in DIF_VAR_PID.
3123			 */
3124			if (DTRACE_ANCHORED(mstate->dtms_probe) &&
3125			    CPU_ON_INTR(CPU)) {
3126				mstate->dtms_ustackdepth = 0;
3127			} else {
3128				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3129				mstate->dtms_ustackdepth =
3130				    dtrace_getustackdepth();
3131				DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3132			}
3133			mstate->dtms_present |= DTRACE_MSTATE_USTACKDEPTH;
3134		}
3135		return (mstate->dtms_ustackdepth);
3136
3137	case DIF_VAR_CALLER:
3138		if (!dtrace_priv_kernel(state))
3139			return (0);
3140		if (!(mstate->dtms_present & DTRACE_MSTATE_CALLER)) {
3141#if !defined(__APPLE__)
3142			int aframes = mstate->dtms_probe->dtpr_aframes + 2;
3143#else
3144			/* Account for introduction of __dtrace_probe() on xnu. */
3145			int aframes = mstate->dtms_probe->dtpr_aframes + 3;
3146#endif /* __APPLE__ */
3147
3148			if (!DTRACE_ANCHORED(mstate->dtms_probe)) {
3149				/*
3150				 * If this is an unanchored probe, we are
3151				 * required to go through the slow path:
3152				 * dtrace_caller() only guarantees correct
3153				 * results for anchored probes.
3154				 */
3155				pc_t caller[2];
3156
3157				dtrace_getpcstack(caller, 2, aframes,
3158				    (uint32_t *)(uintptr_t)mstate->dtms_arg[0]);
3159				mstate->dtms_caller = caller[1];
3160			} else if ((mstate->dtms_caller =
3161#if !defined(__APPLE__)  /* Quiet compiler warnings */
3162			    dtrace_caller(aframes)) == -1) {
3163#else
3164			    dtrace_caller(aframes)) == (uintptr_t)-1) {
3165#endif /* __APPLE__ */
3166				/*
3167				 * We have failed to do this the quick way;
3168				 * we must resort to the slower approach of
3169				 * calling dtrace_getpcstack().
3170				 */
3171				pc_t caller;
3172
3173				dtrace_getpcstack(&caller, 1, aframes, NULL);
3174				mstate->dtms_caller = caller;
3175			}
3176
3177			mstate->dtms_present |= DTRACE_MSTATE_CALLER;
3178		}
3179		return (mstate->dtms_caller);
3180
3181	case DIF_VAR_UCALLER:
3182		if (!dtrace_priv_proc(state))
3183			return (0);
3184
3185		if (!(mstate->dtms_present & DTRACE_MSTATE_UCALLER)) {
3186			uint64_t ustack[3];
3187
3188			/*
3189			 * dtrace_getupcstack() fills in the first uint64_t
3190			 * with the current PID.  The second uint64_t will
3191			 * be the program counter at user-level.  The third
3192			 * uint64_t will contain the caller, which is what
3193			 * we're after.
3194			 */
3195			ustack[2] = NULL;
3196			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3197			dtrace_getupcstack(ustack, 3);
3198			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3199			mstate->dtms_ucaller = ustack[2];
3200			mstate->dtms_present |= DTRACE_MSTATE_UCALLER;
3201		}
3202
3203		return (mstate->dtms_ucaller);
3204
3205	case DIF_VAR_PROBEPROV:
3206		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3207		return (dtrace_dif_varstr(
3208		    (uintptr_t)mstate->dtms_probe->dtpr_provider->dtpv_name,
3209		    state, mstate));
3210
3211	case DIF_VAR_PROBEMOD:
3212		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3213		return (dtrace_dif_varstr(
3214		    (uintptr_t)mstate->dtms_probe->dtpr_mod,
3215		    state, mstate));
3216
3217	case DIF_VAR_PROBEFUNC:
3218		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3219		return (dtrace_dif_varstr(
3220		    (uintptr_t)mstate->dtms_probe->dtpr_func,
3221		    state, mstate));
3222
3223	case DIF_VAR_PROBENAME:
3224		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3225		return (dtrace_dif_varstr(
3226		    (uintptr_t)mstate->dtms_probe->dtpr_name,
3227		    state, mstate));
3228
3229#if !defined(__APPLE__)
3230	case DIF_VAR_PID:
3231		if (!dtrace_priv_proc(state))
3232			return (0);
3233
3234		/*
3235		 * Note that we are assuming that an unanchored probe is
3236		 * always due to a high-level interrupt.  (And we're assuming
3237		 * that there is only a single high level interrupt.)
3238		 */
3239		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3240			return (pid0.pid_id);
3241
3242		/*
3243		 * It is always safe to dereference one's own t_procp pointer:
3244		 * it always points to a valid, allocated proc structure.
3245		 * Further, it is always safe to dereference the p_pidp member
3246		 * of one's own proc structure.  (These are truisms because
3247		 * threads and processes don't clean up their own state --
3248		 * they leave that task to whomever reaps them.)
3249		 */
3250		return ((uint64_t)curthread->t_procp->p_pidp->pid_id);
3251
3252#else
3253	case DIF_VAR_PID:
3254		if (!dtrace_priv_proc_relaxed(state))
3255			return (0);
3256
3257		/*
3258		 * Note that we are assuming that an unanchored probe is
3259		 * always due to a high-level interrupt.  (And we're assuming
3260		 * that there is only a single high level interrupt.)
3261		 */
3262		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3263			/* Anchored probe that fires while on an interrupt accrues to process 0 */
3264			return 0;
3265
3266		return ((uint64_t)dtrace_proc_selfpid());
3267#endif /* __APPLE__ */
3268
3269#if !defined(__APPLE__)
3270	case DIF_VAR_PPID:
3271		if (!dtrace_priv_proc(state))
3272			return (0);
3273
3274		/*
3275		 * See comment in DIF_VAR_PID.
3276		 */
3277		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3278			return (pid0.pid_id);
3279
3280		/*
3281		 * It is always safe to dereference one's own t_procp pointer:
3282		 * it always points to a valid, allocated proc structure.
3283		 * (This is true because threads don't clean up their own
3284		 * state -- they leave that task to whomever reaps them.)
3285		 */
3286		return ((uint64_t)curthread->t_procp->p_ppid);
3287#else
3288	case DIF_VAR_PPID:
3289		if (!dtrace_priv_proc_relaxed(state))
3290			return (0);
3291
3292		/*
3293		 * See comment in DIF_VAR_PID.
3294		 */
3295		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3296			return (0);
3297
3298		return ((uint64_t)dtrace_proc_selfppid());
3299#endif /* __APPLE__ */
3300
3301#if !defined(__APPLE__)
3302	case DIF_VAR_TID:
3303		/*
3304		 * See comment in DIF_VAR_PID.
3305		 */
3306		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3307			return (0);
3308
3309		return ((uint64_t)curthread->t_tid);
3310#else
3311	case DIF_VAR_TID:
3312		/* We do not need to check for null current_thread() */
3313		return thread_tid(current_thread()); /* globally unique */
3314
3315	case DIF_VAR_PTHREAD_SELF:
3316		if (!dtrace_priv_proc(state))
3317			return (0);
3318
3319		/* Not currently supported, but we should be able to derive pthread_self from the dispatchqaddr and dispatchqoffset. */
3320		return 0;
3321
3322	case DIF_VAR_DISPATCHQADDR:
3323		if (!dtrace_priv_proc(state))
3324			return (0);
3325
3326		/* We do not need to check for null current_thread() */
3327		return thread_dispatchqaddr(current_thread());
3328#endif /* __APPLE__ */
3329
3330#if !defined(__APPLE__)
3331	case DIF_VAR_EXECNAME:
3332		if (!dtrace_priv_proc(state))
3333			return (0);
3334
3335		/*
3336		 * See comment in DIF_VAR_PID.
3337		 */
3338		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3339			return ((uint64_t)(uintptr_t)p0.p_user.u_comm);
3340
3341		/*
3342		 * It is always safe to dereference one's own t_procp pointer:
3343		 * it always points to a valid, allocated proc structure.
3344		 * (This is true because threads don't clean up their own
3345		 * state -- they leave that task to whomever reaps them.)
3346		 */
3347		return (dtrace_dif_varstr(
3348		    (uintptr_t)curthread->t_procp->p_user.u_comm,
3349		    state, mstate));
3350#else
3351	case DIF_VAR_EXECNAME:
3352	{
3353		char *xname = (char *)mstate->dtms_scratch_ptr;
3354		size_t scratch_size = MAXCOMLEN+1;
3355
3356		/* The scratch allocation's lifetime is that of the clause. */
3357		if (!DTRACE_INSCRATCH(mstate, scratch_size)) {
3358			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3359			return 0;
3360		}
3361
3362		if (!dtrace_priv_proc_relaxed(state))
3363			return (0);
3364
3365		mstate->dtms_scratch_ptr += scratch_size;
3366		proc_selfname( xname, MAXCOMLEN );
3367
3368		return ((uint64_t)(uintptr_t)xname);
3369	}
3370#endif /* __APPLE__ */
3371#if !defined(__APPLE__)
3372	case DIF_VAR_ZONENAME:
3373		if (!dtrace_priv_proc(state))
3374			return (0);
3375
3376		/*
3377		 * See comment in DIF_VAR_PID.
3378		 */
3379		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3380			return ((uint64_t)(uintptr_t)p0.p_zone->zone_name);
3381
3382		/*
3383		 * It is always safe to dereference one's own t_procp pointer:
3384		 * it always points to a valid, allocated proc structure.
3385		 * (This is true because threads don't clean up their own
3386		 * state -- they leave that task to whomever reaps them.)
3387		 */
3388		return (dtrace_dif_varstr(
3389		    (uintptr_t)curthread->t_procp->p_zone->zone_name,
3390		    state, mstate));
3391
3392#else
3393	case DIF_VAR_ZONENAME:
3394	{
3395		/* scratch_size is equal to length('global') + 1 for the null-terminator. */
3396		char *zname = (char *)mstate->dtms_scratch_ptr;
3397		size_t scratch_size = 6 + 1;
3398
3399		if (!dtrace_priv_proc(state))
3400			return (0);
3401
3402		/* The scratch allocation's lifetime is that of the clause. */
3403		if (!DTRACE_INSCRATCH(mstate, scratch_size)) {
3404			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3405			return 0;
3406		}
3407
3408		mstate->dtms_scratch_ptr += scratch_size;
3409
3410		/* The kernel does not provide a zone name; it will always return 'global'. */
3411		strlcpy(zname, "global", scratch_size);
3412
3413		return ((uint64_t)(uintptr_t)zname);
3414	}
3415#endif /* __APPLE__ */
3416
3417#if !defined(__APPLE__)
3418	case DIF_VAR_UID:
3419		if (!dtrace_priv_proc(state))
3420			return (0);
3421
3422		/*
3423		 * See comment in DIF_VAR_PID.
3424		 */
3425		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3426			return ((uint64_t)p0.p_cred->cr_uid);
3427
3428		/*
3429		 * It is always safe to dereference one's own t_procp pointer:
3430		 * it always points to a valid, allocated proc structure.
3431		 * (This is true because threads don't clean up their own
3432		 * state -- they leave that task to whomever reaps them.)
3433		 *
3434		 * Additionally, it is safe to dereference one's own process
3435		 * credential, since this is never NULL after process birth.
3436		 */
3437		return ((uint64_t)curthread->t_procp->p_cred->cr_uid);
3438#else
3439	case DIF_VAR_UID:
3440		if (!dtrace_priv_proc_relaxed(state))
3441			return (0);
3442
3443		/*
3444		 * See comment in DIF_VAR_PID.
3445		 */
3446		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3447			return (0);
3448
3449		return ((uint64_t) dtrace_proc_selfruid());
3450#endif /* __APPLE__ */
3451
3452#if !defined(__APPLE__)
3453	case DIF_VAR_GID:
3454		if (!dtrace_priv_proc(state))
3455			return (0);
3456
3457		/*
3458		 * See comment in DIF_VAR_PID.
3459		 */
3460		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3461			return ((uint64_t)p0.p_cred->cr_gid);
3462
3463		/*
3464		 * It is always safe to dereference one's own t_procp pointer:
3465		 * it always points to a valid, allocated proc structure.
3466		 * (This is true because threads don't clean up their own
3467		 * state -- they leave that task to whomever reaps them.)
3468		 *
3469		 * Additionally, it is safe to dereference one's own process
3470		 * credential, since this is never NULL after process birth.
3471		 */
3472		return ((uint64_t)curthread->t_procp->p_cred->cr_gid);
3473#else
3474	case DIF_VAR_GID:
3475		if (!dtrace_priv_proc(state))
3476			return (0);
3477
3478		/*
3479		 * See comment in DIF_VAR_PID.
3480		 */
3481		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3482			return (0);
3483
3484		if (dtrace_CRED() != NULL)
3485			/* Credential does not require lazy initialization. */
3486			return ((uint64_t)kauth_getgid());
3487		else {
3488			/* proc_lock would be taken under kauth_cred_proc_ref() in kauth_cred_get(). */
3489			DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3490			return -1ULL;
3491		}
3492#endif /* __APPLE__ */
3493
3494#if !defined(__APPLE__)
3495	case DIF_VAR_ERRNO: {
3496		klwp_t *lwp;
3497		if (!dtrace_priv_proc(state))
3498			return (0);
3499
3500		/*
3501		 * See comment in DIF_VAR_PID.
3502		 */
3503		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3504			return (0);
3505
3506		/*
3507		 * It is always safe to dereference one's own t_lwp pointer in
3508		 * the event that this pointer is non-NULL.  (This is true
3509		 * because threads and lwps don't clean up their own state --
3510		 * they leave that task to whomever reaps them.)
3511		 */
3512		if ((lwp = curthread->t_lwp) == NULL)
3513			return (0);
3514
3515		return ((uint64_t)lwp->lwp_errno);
3516	}
3517#else
3518	case DIF_VAR_ERRNO: {
3519		uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());
3520		if (!dtrace_priv_proc(state))
3521			return (0);
3522
3523		/*
3524		 * See comment in DIF_VAR_PID.
3525		 */
3526		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3527			return (0);
3528
		if (uthread) {
			return ((uint64_t)uthread->t_dtrace_errno);
		} else {
			DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
			return (-1ULL);
		}
3535	}
3536#endif /* __APPLE__ */
3537
3538	default:
3539		DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3540		return (0);
3541	}
3542}
3543
3544/*
 * Emulate the execution of DTrace DIF subroutines invoked by the call opcode.
 * Notice that we don't bother validating the proper number of arguments or
 * their types in the tuple stack.  This isn't needed because all argument
 * interpretation is safe thanks to our load safety -- the worst that can
 * happen is that a bogus program can obtain bogus results.
3550 */
3551static void
3552dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs,
3553    dtrace_key_t *tupregs, int nargs,
3554    dtrace_mstate_t *mstate, dtrace_state_t *state)
3555{
3556	volatile uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
3557#if !defined(__APPLE__)
3558	volatile uintptr_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
3559#else
3560	volatile uint64_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
3561#endif /* __APPLE__ */
3562	dtrace_vstate_t *vstate = &state->dts_vstate;
3563
3564#if !defined(__APPLE__)
3565	union {
3566		mutex_impl_t mi;
3567		uint64_t mx;
3568	} m;
3569
3570	union {
3571		krwlock_t ri;
3572		uintptr_t rw;
3573	} r;
3574#else
3575/* FIXME: awaits lock/mutex work */
3576#endif /* __APPLE__ */
3577
3578	switch (subr) {
3579	case DIF_SUBR_RAND:
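		/*
		 * A quick linear-congruential-style scramble of the
		 * high-resolution timestamp: adequate for probabilistic
		 * sampling from D, but in no way a strong source of
		 * randomness.
		 */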
3580		regs[rd] = (dtrace_gethrtime() * 2416 + 374441) % 1771875;
3581		break;
3582
3583#if !defined(__APPLE__)
3584	case DIF_SUBR_MUTEX_OWNED:
3585		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
3586		    mstate, vstate)) {
3587			regs[rd] = NULL;
3588			break;
3589		}
3590
3591		m.mx = dtrace_load64(tupregs[0].dttk_value);
3592		if (MUTEX_TYPE_ADAPTIVE(&m.mi))
3593			regs[rd] = MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER;
3594		else
3595			regs[rd] = LOCK_HELD(&m.mi.m_spin.m_spinlock);
3596		break;
3597
3598	case DIF_SUBR_MUTEX_OWNER:
3599		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
3600		    mstate, vstate)) {
3601			regs[rd] = NULL;
3602			break;
3603		}
3604
3605		m.mx = dtrace_load64(tupregs[0].dttk_value);
3606		if (MUTEX_TYPE_ADAPTIVE(&m.mi) &&
3607		    MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER)
3608			regs[rd] = (uintptr_t)MUTEX_OWNER(&m.mi);
3609		else
3610			regs[rd] = 0;
3611		break;
3612
3613	case DIF_SUBR_MUTEX_TYPE_ADAPTIVE:
3614		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
3615		    mstate, vstate)) {
3616			regs[rd] = NULL;
3617			break;
3618		}
3619
3620		m.mx = dtrace_load64(tupregs[0].dttk_value);
3621		regs[rd] = MUTEX_TYPE_ADAPTIVE(&m.mi);
3622		break;
3623
3624	case DIF_SUBR_MUTEX_TYPE_SPIN:
3625		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
3626		    mstate, vstate)) {
3627			regs[rd] = NULL;
3628			break;
3629		}
3630
3631		m.mx = dtrace_load64(tupregs[0].dttk_value);
3632		regs[rd] = MUTEX_TYPE_SPIN(&m.mi);
3633		break;
3634
3635	case DIF_SUBR_RW_READ_HELD: {
3636		uintptr_t tmp;
3637
3638		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
3639		    mstate, vstate)) {
3640			regs[rd] = NULL;
3641			break;
3642		}
3643
3644		r.rw = dtrace_loadptr(tupregs[0].dttk_value);
3645		regs[rd] = _RW_READ_HELD(&r.ri, tmp);
3646		break;
3647	}
3648
3649	case DIF_SUBR_RW_WRITE_HELD:
3650		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
3651		    mstate, vstate)) {
3652			regs[rd] = NULL;
3653			break;
3654		}
3655
3656		r.rw = dtrace_loadptr(tupregs[0].dttk_value);
3657		regs[rd] = _RW_WRITE_HELD(&r.ri);
3658		break;
3659
3660	case DIF_SUBR_RW_ISWRITER:
3661		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
3662		    mstate, vstate)) {
3663			regs[rd] = NULL;
3664			break;
3665		}
3666
3667		r.rw = dtrace_loadptr(tupregs[0].dttk_value);
3668		regs[rd] = _RW_ISWRITER(&r.ri);
3669		break;
3670#else
3671/* FIXME: awaits lock/mutex work */
3672#endif /* __APPLE__ */
3673
3674	case DIF_SUBR_BCOPY: {
3675		/*
3676		 * We need to be sure that the destination is in the scratch
3677		 * region -- no other region is allowed.
3678		 */
3679		uintptr_t src = tupregs[0].dttk_value;
3680		uintptr_t dest = tupregs[1].dttk_value;
3681		size_t size = tupregs[2].dttk_value;
3682
3683		if (!dtrace_inscratch(dest, size, mstate)) {
3684			*flags |= CPU_DTRACE_BADADDR;
3685			*illval = regs[rd];
3686			break;
3687		}
3688
3689		if (!dtrace_canload(src, size, mstate, vstate)) {
3690			regs[rd] = NULL;
3691			break;
3692		}
3693
3694		dtrace_bcopy((void *)src, (void *)dest, size);
3695		break;
3696	}
3697
3698	case DIF_SUBR_ALLOCA:
3699	case DIF_SUBR_COPYIN: {
3700		uintptr_t dest = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
3701		uint64_t size =
3702		    tupregs[subr == DIF_SUBR_ALLOCA ? 0 : 1].dttk_value;
3703		size_t scratch_size = (dest - mstate->dtms_scratch_ptr) + size;
3704
3705		/*
3706		 * This action doesn't require any credential checks since
3707		 * probes will not activate in user contexts to which the
3708		 * enabling user does not have permissions.
3709		 */
3710
3711		/*
3712		 * Rounding up the user allocation size could have overflowed
3713		 * a large, bogus allocation (like -1ULL) to 0.
3714		 */
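		/*
		 * For example, a bogus size of -1ULL plus a few bytes of
		 * alignment padding wraps scratch_size around to a tiny
		 * value; the "scratch_size < size" check below catches
		 * exactly this case.
		 */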
3715		if (scratch_size < size ||
3716		    !DTRACE_INSCRATCH(mstate, scratch_size)) {
3717			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3718			regs[rd] = NULL;
3719			break;
3720		}
3721
3722		if (subr == DIF_SUBR_COPYIN) {
3723			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3724#if !defined(__APPLE__)
3725			dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
3726#else
3727			if (dtrace_priv_proc(state))
3728				dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
3729#endif /* __APPLE__ */
3730			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3731		}
3732
3733		mstate->dtms_scratch_ptr += scratch_size;
3734		regs[rd] = dest;
3735		break;
3736	}
3737
3738	case DIF_SUBR_COPYINTO: {
3739		uint64_t size = tupregs[1].dttk_value;
3740		uintptr_t dest = tupregs[2].dttk_value;
3741
3742		/*
3743		 * This action doesn't require any credential checks since
3744		 * probes will not activate in user contexts to which the
3745		 * enabling user does not have permissions.
3746		 */
3747		if (!dtrace_inscratch(dest, size, mstate)) {
3748			*flags |= CPU_DTRACE_BADADDR;
3749			*illval = regs[rd];
3750			break;
3751		}
3752
3753		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3754#if !defined(__APPLE__)
3755		dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
3756#else
3757		if (dtrace_priv_proc(state))
3758			dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
3759#endif /* __APPLE__ */
3760		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3761		break;
3762	}
3763
3764	case DIF_SUBR_COPYINSTR: {
3765		uintptr_t dest = mstate->dtms_scratch_ptr;
3766		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3767
3768		if (nargs > 1 && tupregs[1].dttk_value < size)
3769			size = tupregs[1].dttk_value + 1;
3770
3771		/*
3772		 * This action doesn't require any credential checks since
3773		 * probes will not activate in user contexts to which the
3774		 * enabling user does not have permissions.
3775		 */
3776		if (!DTRACE_INSCRATCH(mstate, size)) {
3777			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3778			regs[rd] = NULL;
3779			break;
3780		}
3781
3782		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3783#if !defined(__APPLE__)
3784		dtrace_copyinstr(tupregs[0].dttk_value, dest, size, flags);
3785#else
3786		if (dtrace_priv_proc(state))
3787			dtrace_copyinstr(tupregs[0].dttk_value, dest, size, flags);
3788#endif /* __APPLE__ */
3789		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3790
3791		((char *)dest)[size - 1] = '\0';
3792		mstate->dtms_scratch_ptr += size;
3793		regs[rd] = dest;
3794		break;
3795	}
3796
3797#if !defined(__APPLE__)
3798	case DIF_SUBR_MSGSIZE:
3799	case DIF_SUBR_MSGDSIZE: {
3800		uintptr_t baddr = tupregs[0].dttk_value, daddr;
3801		uintptr_t wptr, rptr;
3802		size_t count = 0;
3803		int cont = 0;
3804
3805		while (baddr != NULL && !(*flags & CPU_DTRACE_FAULT)) {
3806
3807			if (!dtrace_canload(baddr, sizeof (mblk_t), mstate,
3808			    vstate)) {
3809				regs[rd] = NULL;
3810				break;
3811			}
3812
3813			wptr = dtrace_loadptr(baddr +
3814			    offsetof(mblk_t, b_wptr));
3815
3816			rptr = dtrace_loadptr(baddr +
3817			    offsetof(mblk_t, b_rptr));
3818
3819			if (wptr < rptr) {
3820				*flags |= CPU_DTRACE_BADADDR;
3821				*illval = tupregs[0].dttk_value;
3822				break;
3823			}
3824
3825			daddr = dtrace_loadptr(baddr +
3826			    offsetof(mblk_t, b_datap));
3827
3828			baddr = dtrace_loadptr(baddr +
3829			    offsetof(mblk_t, b_cont));
3830
3831			/*
			 * We want to guard against denial-of-service here,
3833			 * so we're only going to search the list for
3834			 * dtrace_msgdsize_max mblks.
3835			 */
3836			if (cont++ > dtrace_msgdsize_max) {
3837				*flags |= CPU_DTRACE_ILLOP;
3838				break;
3839			}
3840
3841			if (subr == DIF_SUBR_MSGDSIZE) {
3842				if (dtrace_load8(daddr +
3843				    offsetof(dblk_t, db_type)) != M_DATA)
3844					continue;
3845			}
3846
3847			count += wptr - rptr;
3848		}
3849
3850		if (!(*flags & CPU_DTRACE_FAULT))
3851			regs[rd] = count;
3852
3853		break;
3854	}
3855#else
3856	case DIF_SUBR_MSGSIZE:
3857	case DIF_SUBR_MSGDSIZE: {
		/* Darwin does not implement SysV STREAMS messages */
3859		DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3860		regs[rd] = 0;
3861		break;
3862	}
3863#endif /* __APPLE__ */
3864
3865#if !defined(__APPLE__)
3866	case DIF_SUBR_PROGENYOF: {
3867		pid_t pid = tupregs[0].dttk_value;
3868		proc_t *p;
3869		int rval = 0;
3870
3871		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3872
3873		for (p = curthread->t_procp; p != NULL; p = p->p_parent) {
3874			if (p->p_pidp->pid_id == pid) {
3875				rval = 1;
3876				break;
3877			}
3878		}
3879
3880		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3881
3882		regs[rd] = rval;
3883		break;
3884	}
3885#else
3886	case DIF_SUBR_PROGENYOF: {
3887		pid_t pid = tupregs[0].dttk_value;
3888		struct proc *p = current_proc();
3889		int rval = 0, lim = nprocs;
3890
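		/*
		 * Walk our ancestry via p_pptr, bounding the climb by nprocs
		 * so that a stale or circular parent chain cannot loop
		 * forever.
		 */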
		while (p && (lim-- > 0)) {
3892			pid_t ppid;
3893
3894			ppid = (pid_t)dtrace_load32((uintptr_t)&(p->p_pid));
3895			if (*flags & CPU_DTRACE_FAULT)
3896				break;
3897
3898			if (ppid == pid) {
3899				rval = 1;
3900				break;
3901			}
3902
3903			if (ppid == 0)
3904				break; /* Can't climb process tree any further. */
3905
3906			p = (struct proc *)dtrace_loadptr((uintptr_t)&(p->p_pptr));
3907			if (*flags & CPU_DTRACE_FAULT)
3908				break;
3909		}
3910
3911		regs[rd] = rval;
3912		break;
3913	}
3914#endif /* __APPLE__ */
3915
3916	case DIF_SUBR_SPECULATION:
3917		regs[rd] = dtrace_speculation(state);
3918		break;
3919
3920#if !defined(__APPLE__)
3921	case DIF_SUBR_COPYOUT: {
3922		uintptr_t kaddr = tupregs[0].dttk_value;
3923		uintptr_t uaddr = tupregs[1].dttk_value;
3924		uint64_t size = tupregs[2].dttk_value;
3925
3926		if (!dtrace_destructive_disallow &&
3927		    dtrace_priv_proc_control(state) &&
3928		    !dtrace_istoxic(kaddr, size)) {
3929			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3930			dtrace_copyout(kaddr, uaddr, size, flags);
3931			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3932		}
3933		break;
3934	}
3935
3936	case DIF_SUBR_COPYOUTSTR: {
3937		uintptr_t kaddr = tupregs[0].dttk_value;
3938		uintptr_t uaddr = tupregs[1].dttk_value;
3939		uint64_t size = tupregs[2].dttk_value;
3940
3941		if (!dtrace_destructive_disallow &&
3942		    dtrace_priv_proc_control(state) &&
3943		    !dtrace_istoxic(kaddr, size)) {
3944			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3945			dtrace_copyoutstr(kaddr, uaddr, size, flags);
3946			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3947		}
3948		break;
3949	}
3950#else
3951	case DIF_SUBR_COPYOUT: {
3952		uintptr_t kaddr = tupregs[0].dttk_value;
3953		user_addr_t uaddr = tupregs[1].dttk_value;
3954		uint64_t size = tupregs[2].dttk_value;
3955
3956		if (!dtrace_destructive_disallow &&
3957		    dtrace_priv_proc_control(state) &&
3958		    !dtrace_istoxic(kaddr, size)) {
3959			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3960			dtrace_copyout(kaddr, uaddr, size, flags);
3961			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3962		}
3963		break;
3964	}
3965
3966	case DIF_SUBR_COPYOUTSTR: {
3967		uintptr_t kaddr = tupregs[0].dttk_value;
3968		user_addr_t uaddr = tupregs[1].dttk_value;
3969		uint64_t size = tupregs[2].dttk_value;
3970
3971		if (!dtrace_destructive_disallow &&
3972		    dtrace_priv_proc_control(state) &&
3973		    !dtrace_istoxic(kaddr, size)) {
3974			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3975			dtrace_copyoutstr(kaddr, uaddr, size, flags);
3976			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3977		}
3978		break;
3979	}
3980#endif /* __APPLE__ */
3981
3982	case DIF_SUBR_STRLEN: {
3983		size_t sz;
3984		uintptr_t addr = (uintptr_t)tupregs[0].dttk_value;
3985		sz = dtrace_strlen((char *)addr,
3986		    state->dts_options[DTRACEOPT_STRSIZE]);
3987
3988		if (!dtrace_canload(addr, sz + 1, mstate, vstate)) {
3989			regs[rd] = NULL;
3990			break;
3991		}
3992
3993		regs[rd] = sz;
3994
3995		break;
3996	}
3997
3998	case DIF_SUBR_STRCHR:
3999	case DIF_SUBR_STRRCHR: {
4000		/*
4001		 * We're going to iterate over the string looking for the
4002		 * specified character.  We will iterate until we have reached
4003		 * the string length or we have found the character.  If this
4004		 * is DIF_SUBR_STRRCHR, we will look for the last occurrence
4005		 * of the specified character instead of the first.
4006		 */
4007		uintptr_t saddr = tupregs[0].dttk_value;
4008		uintptr_t addr = tupregs[0].dttk_value;
4009		uintptr_t limit = addr + state->dts_options[DTRACEOPT_STRSIZE];
4010		char c, target = (char)tupregs[1].dttk_value;
4011
4012		for (regs[rd] = NULL; addr < limit; addr++) {
4013			if ((c = dtrace_load8(addr)) == target) {
4014				regs[rd] = addr;
4015
4016				if (subr == DIF_SUBR_STRCHR)
4017					break;
4018			}
4019
4020			if (c == '\0')
4021				break;
4022		}
4023
4024		if (!dtrace_canload(saddr, addr - saddr, mstate, vstate)) {
4025			regs[rd] = NULL;
4026			break;
4027		}
4028
4029		break;
4030	}
4031
4032	case DIF_SUBR_STRSTR:
4033	case DIF_SUBR_INDEX:
4034	case DIF_SUBR_RINDEX: {
4035		/*
4036		 * We're going to iterate over the string looking for the
4037		 * specified string.  We will iterate until we have reached
4038		 * the string length or we have found the string.  (Yes, this
4039		 * is done in the most naive way possible -- but considering
4040		 * that the string we're searching for is likely to be
4041		 * relatively short, the complexity of Rabin-Karp or similar
4042		 * hardly seems merited.)
4043		 */
4044		char *addr = (char *)(uintptr_t)tupregs[0].dttk_value;
4045		char *substr = (char *)(uintptr_t)tupregs[1].dttk_value;
4046		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4047		size_t len = dtrace_strlen(addr, size);
4048		size_t sublen = dtrace_strlen(substr, size);
4049		char *limit = addr + len, *orig = addr;
4050		int notfound = subr == DIF_SUBR_STRSTR ? 0 : -1;
4051		int inc = 1;
4052
4053		regs[rd] = notfound;
4054
4055		if (!dtrace_canload((uintptr_t)addr, len + 1, mstate, vstate)) {
4056			regs[rd] = NULL;
4057			break;
4058		}
4059
4060		if (!dtrace_canload((uintptr_t)substr, sublen + 1, mstate,
4061		    vstate)) {
4062			regs[rd] = NULL;
4063			break;
4064		}
4065
4066		/*
4067		 * strstr() and index()/rindex() have similar semantics if
4068		 * both strings are the empty string: strstr() returns a
4069		 * pointer to the (empty) string, and index() and rindex()
4070		 * both return index 0 (regardless of any position argument).
4071		 */
4072		if (sublen == 0 && len == 0) {
4073			if (subr == DIF_SUBR_STRSTR)
4074				regs[rd] = (uintptr_t)addr;
4075			else
4076				regs[rd] = 0;
4077			break;
4078		}
4079
4080		if (subr != DIF_SUBR_STRSTR) {
4081			if (subr == DIF_SUBR_RINDEX) {
4082				limit = orig - 1;
4083				addr += len;
4084				inc = -1;
4085			}
4086
4087			/*
4088			 * Both index() and rindex() take an optional position
4089			 * argument that denotes the starting position.
4090			 */
4091			if (nargs == 3) {
4092				int64_t pos = (int64_t)tupregs[2].dttk_value;
4093
4094				/*
4095				 * If the position argument to index() is
4096				 * negative, Perl implicitly clamps it at
4097				 * zero.  This semantic is a little surprising
4098				 * given the special meaning of negative
4099				 * positions to similar Perl functions like
4100				 * substr(), but it appears to reflect a
4101				 * notion that index() can start from a
4102				 * negative index and increment its way up to
4103				 * the string.  Given this notion, Perl's
4104				 * rindex() is at least self-consistent in
4105				 * that it implicitly clamps positions greater
4106				 * than the string length to be the string
4107				 * length.  Where Perl completely loses
4108				 * coherence, however, is when the specified
4109				 * substring is the empty string ("").  In
4110				 * this case, even if the position is
4111				 * negative, rindex() returns 0 -- and even if
4112				 * the position is greater than the length,
4113				 * index() returns the string length.  These
4114				 * semantics violate the notion that index()
4115				 * should never return a value less than the
4116				 * specified position and that rindex() should
4117				 * never return a value greater than the
4118				 * specified position.  (One assumes that
4119				 * these semantics are artifacts of Perl's
4120				 * implementation and not the results of
4121				 * deliberate design -- it beggars belief that
4122				 * even Larry Wall could desire such oddness.)
4123				 * While in the abstract one would wish for
4124				 * consistent position semantics across
4125				 * substr(), index() and rindex() -- or at the
4126				 * very least self-consistent position
4127				 * semantics for index() and rindex() -- we
4128				 * instead opt to keep with the extant Perl
4129				 * semantics, in all their broken glory.  (Do
4130				 * we have more desire to maintain Perl's
4131				 * semantics than Perl does?  Probably.)
4132				 */
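				/*
				 * For example, under these rules
				 * index("foo", "o", -5) behaves like
				 * index("foo", "o", 0) and returns 1, while
				 * rindex("foo", "o", 100) clamps the position
				 * to the string length and returns 2.
				 */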
4133				if (subr == DIF_SUBR_RINDEX) {
4134					if (pos < 0) {
4135						if (sublen == 0)
4136							regs[rd] = 0;
4137						break;
4138					}
4139
4140#if !defined(__APPLE__)  /* Quiet compiler warnings */
4141					if (pos > len)
4142#else
4143					if ((size_t)pos > len)
4144#endif /* __APPLE__ */
4145						pos = len;
4146				} else {
4147					if (pos < 0)
4148						pos = 0;
4149
4150#if !defined(__APPLE__)  /* Quiet compiler warnings */
4151					if (pos >= len) {
4152#else
4153					if ((size_t)pos >= len) {
4154#endif /* __APPLE__ */
4155						if (sublen == 0)
4156							regs[rd] = len;
4157						break;
4158					}
4159				}
4160
4161				addr = orig + pos;
4162			}
4163		}
4164
4165		for (regs[rd] = notfound; addr != limit; addr += inc) {
4166			if (dtrace_strncmp(addr, substr, sublen) == 0) {
4167				if (subr != DIF_SUBR_STRSTR) {
4168					/*
4169					 * As D index() and rindex() are
4170					 * modeled on Perl (and not on awk),
4171					 * we return a zero-based (and not a
4172					 * one-based) index.  (For you Perl
4173					 * weenies: no, we're not going to add
4174					 * $[ -- and shouldn't you be at a con
4175					 * or something?)
4176					 */
4177					regs[rd] = (uintptr_t)(addr - orig);
4178					break;
4179				}
4180
4181				ASSERT(subr == DIF_SUBR_STRSTR);
4182				regs[rd] = (uintptr_t)addr;
4183				break;
4184			}
4185		}
4186
4187		break;
4188	}
4189
4190	case DIF_SUBR_STRTOK: {
4191		uintptr_t addr = tupregs[0].dttk_value;
4192		uintptr_t tokaddr = tupregs[1].dttk_value;
4193		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4194		uintptr_t limit, toklimit = tokaddr + size;
4195		char *dest = (char *)mstate->dtms_scratch_ptr;
4196#if !defined(__APPLE__)   /* Quiet compiler warnings */
4197		uint8_t c, tokmap[32];	 /* 256 / 8 */
4198		int i;
4199#else
4200		uint8_t c='\0', tokmap[32];	 /* 256 / 8 */
4201		uint64_t i = 0;
4202#endif /* __APPLE__ */
4203
4204		/*
4205		 * Check both the token buffer and (later) the input buffer,
4206		 * since both could be non-scratch addresses.
4207		 */
4208		if (!dtrace_strcanload(tokaddr, size, mstate, vstate)) {
4209			regs[rd] = NULL;
4210			break;
4211		}
4212
4213		if (!DTRACE_INSCRATCH(mstate, size)) {
4214			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4215			regs[rd] = NULL;
4216			break;
4217		}
4218
4219		if (addr == NULL) {
4220			/*
4221			 * If the address specified is NULL, we use our saved
4222			 * strtok pointer from the mstate.  Note that this
4223			 * means that the saved strtok pointer is _only_
4224			 * valid within multiple enablings of the same probe --
4225			 * it behaves like an implicit clause-local variable.
4226			 */
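			/*
			 * (Typical D usage is therefore strtok(str, delim)
			 * for the first token, followed by strtok(NULL, delim)
			 * for each subsequent token within the same clause.)
			 */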
4227			addr = mstate->dtms_strtok;
4228		} else {
4229			/*
4230			 * If the user-specified address is non-NULL we must
4231			 * access check it.  This is the only time we have
4232			 * a chance to do so, since this address may reside
			 * in the string table of this clause -- future calls
4234			 * (when we fetch addr from mstate->dtms_strtok)
4235			 * would fail this access check.
4236			 */
4237			if (!dtrace_strcanload(addr, size, mstate, vstate)) {
4238				regs[rd] = NULL;
4239				break;
			}
		}
4242
4243		/*
4244		 * First, zero the token map, and then process the token
4245		 * string -- setting a bit in the map for every character
4246		 * found in the token string.
4247		 */
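		/*
		 * For example, with the delimiter string ":" the byte ':'
		 * (0x3a) sets bit 2 of tokmap[7], since 0x3a >> 3 == 7 and
		 * 0x3a & 0x7 == 2.
		 */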
4248		for (i = 0; i < (int)sizeof (tokmap); i++)
4249			tokmap[i] = 0;
4250
4251		for (; tokaddr < toklimit; tokaddr++) {
4252			if ((c = dtrace_load8(tokaddr)) == '\0')
4253				break;
4254
4255			ASSERT((c >> 3) < sizeof (tokmap));
4256			tokmap[c >> 3] |= (1 << (c & 0x7));
4257		}
4258
4259		for (limit = addr + size; addr < limit; addr++) {
4260			/*
4261			 * We're looking for a character that is _not_ contained
4262			 * in the token string.
4263			 */
4264			if ((c = dtrace_load8(addr)) == '\0')
4265				break;
4266
4267			if (!(tokmap[c >> 3] & (1 << (c & 0x7))))
4268				break;
4269		}
4270
4271		if (c == '\0') {
4272			/*
4273			 * We reached the end of the string without finding
4274			 * any character that was not in the token string.
4275			 * We return NULL in this case, and we set the saved
4276			 * address to NULL as well.
4277			 */
4278			regs[rd] = NULL;
4279			mstate->dtms_strtok = NULL;
4280			break;
4281		}
4282
4283		/*
4284		 * From here on, we're copying into the destination string.
4285		 */
4286		for (i = 0; addr < limit && i < size - 1; addr++) {
4287			if ((c = dtrace_load8(addr)) == '\0')
4288				break;
4289
4290			if (tokmap[c >> 3] & (1 << (c & 0x7)))
4291				break;
4292
4293			ASSERT(i < size);
4294			dest[i++] = c;
4295		}
4296
4297		ASSERT(i < size);
4298		dest[i] = '\0';
4299		regs[rd] = (uintptr_t)dest;
4300		mstate->dtms_scratch_ptr += size;
4301		mstate->dtms_strtok = addr;
4302		break;
4303	}
4304
4305	case DIF_SUBR_SUBSTR: {
4306		uintptr_t s = tupregs[0].dttk_value;
4307		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4308		char *d = (char *)mstate->dtms_scratch_ptr;
4309		int64_t index = (int64_t)tupregs[1].dttk_value;
4310		int64_t remaining = (int64_t)tupregs[2].dttk_value;
4311		size_t len = dtrace_strlen((char *)s, size);
4312		int64_t i = 0;
4313
4314		if (!dtrace_canload(s, len + 1, mstate, vstate)) {
4315			regs[rd] = NULL;
4316			break;
4317		}
4318
4319		if (!DTRACE_INSCRATCH(mstate, size)) {
4320			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4321			regs[rd] = NULL;
4322			break;
4323		}
4324
4325		if (nargs <= 2)
4326			remaining = (int64_t)size;
4327
4328		if (index < 0) {
4329			index += len;
4330
4331			if (index < 0 && index + remaining > 0) {
4332				remaining += index;
4333				index = 0;
4334			}
4335		}
4336
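		/*
		 * For example, substr("coconut", -3, 2): the negative index
		 * wraps to len - 3 == 4 and two bytes are copied, yielding
		 * "nu".
		 */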
4337#if !defined(__APPLE__)   /* Quiet compiler warnings */
4338		if (index >= len || index < 0) {
4339			remaining = 0;
4340		} else if (remaining < 0) {
4341			remaining += len - index;
4342		} else if (index + remaining > size) {
4343			remaining = size - index;
4344		}
4345#else
4346		if ((size_t)index >= len || index < 0) {
4347			remaining = 0;
4348		} else if (remaining < 0) {
4349			remaining += len - index;
4350		} else if ((uint64_t)index + (uint64_t)remaining > size) {
4351			remaining = size - index;
4352		}
4353#endif /* __APPLE__ */
4354		for (i = 0; i < remaining; i++) {
4355			if ((d[i] = dtrace_load8(s + index + i)) == '\0')
4356				break;
		}
4358
4359		d[i] = '\0';
4360
4361		mstate->dtms_scratch_ptr += size;
4362		regs[rd] = (uintptr_t)d;
4363		break;
4364	}
4365
4366#if !defined(__APPLE__)
4367	case DIF_SUBR_GETMAJOR:
4368#ifdef _LP64
4369		regs[rd] = (tupregs[0].dttk_value >> NBITSMINOR64) & MAXMAJ64;
4370#else
4371		regs[rd] = (tupregs[0].dttk_value >> NBITSMINOR) & MAXMAJ;
4372#endif
4373		break;
4374
4375#else  /* __APPLE__ */
4376	case DIF_SUBR_GETMAJOR:
4377		regs[rd] = (uintptr_t)major( (dev_t)tupregs[0].dttk_value );
4378		break;
4379#endif /* __APPLE__ */
4380
4381#if !defined(__APPLE__)
4382	case DIF_SUBR_GETMINOR:
4383#ifdef _LP64
4384		regs[rd] = tupregs[0].dttk_value & MAXMIN64;
4385#else
4386		regs[rd] = tupregs[0].dttk_value & MAXMIN;
4387#endif
4388		break;
4389
4390#else  /* __APPLE__ */
4391	case DIF_SUBR_GETMINOR:
4392		regs[rd] = (uintptr_t)minor( (dev_t)tupregs[0].dttk_value );
4393		break;
4394#endif /* __APPLE__ */
4395
4396#if !defined(__APPLE__)
4397	case DIF_SUBR_DDI_PATHNAME: {
4398		/*
4399		 * This one is a galactic mess.  We are going to roughly
4400		 * emulate ddi_pathname(), but it's made more complicated
4401		 * by the fact that we (a) want to include the minor name and
4402		 * (b) must proceed iteratively instead of recursively.
4403		 */
4404		uintptr_t dest = mstate->dtms_scratch_ptr;
4405		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4406		char *start = (char *)dest, *end = start + size - 1;
4407		uintptr_t daddr = tupregs[0].dttk_value;
4408		int64_t minor = (int64_t)tupregs[1].dttk_value;
4409		char *s;
4410		int i, len, depth = 0;
4411
4412		/*
4413		 * Due to all the pointer jumping we do and context we must
4414		 * rely upon, we just mandate that the user must have kernel
4415		 * read privileges to use this routine.
4416		 */
		if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) == 0) {
			*flags |= CPU_DTRACE_KPRIV;
			*illval = daddr;
			regs[rd] = NULL;
			break;
		}
4422
4423		if (!DTRACE_INSCRATCH(mstate, size)) {
4424			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4425			regs[rd] = NULL;
4426			break;
4427		}
4428
4429		*end = '\0';
4430
4431		/*
4432		 * We want to have a name for the minor.  In order to do this,
4433		 * we need to walk the minor list from the devinfo.  We want
4434		 * to be sure that we don't infinitely walk a circular list,
4435		 * so we check for circularity by sending a scout pointer
4436		 * ahead two elements for every element that we iterate over;
4437		 * if the list is circular, these will ultimately point to the
4438		 * same element.  You may recognize this little trick as the
4439		 * answer to a stupid interview question -- one that always
4440		 * seems to be asked by those who had to have it laboriously
4441		 * explained to them, and who can't even concisely describe
4442		 * the conditions under which one would be forced to resort to
4443		 * this technique.  Needless to say, those conditions are
4444		 * found here -- and probably only here.  Is this the only use
4445		 * of this infamous trick in shipping, production code?  If it
4446		 * isn't, it probably should be...
4447		 */
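		/*
		 * (In textbook terms this is the tortoise-and-hare cycle
		 * check: "maddr" advances one link per iteration while
		 * "scout" advances two, so on a circular list the two must
		 * eventually meet.)
		 */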
4448		if (minor != -1) {
4449			uintptr_t maddr = dtrace_loadptr(daddr +
4450			    offsetof(struct dev_info, devi_minor));
4451
4452			uintptr_t next = offsetof(struct ddi_minor_data, next);
4453			uintptr_t name = offsetof(struct ddi_minor_data,
4454			    d_minor) + offsetof(struct ddi_minor, name);
4455			uintptr_t dev = offsetof(struct ddi_minor_data,
4456			    d_minor) + offsetof(struct ddi_minor, dev);
4457			uintptr_t scout;
4458
4459			if (maddr != NULL)
4460				scout = dtrace_loadptr(maddr + next);
4461
4462			while (maddr != NULL && !(*flags & CPU_DTRACE_FAULT)) {
4463				uint64_t m;
4464#ifdef _LP64
4465				m = dtrace_load64(maddr + dev) & MAXMIN64;
4466#else
4467				m = dtrace_load32(maddr + dev) & MAXMIN;
4468#endif
4469				if (m != minor) {
4470					maddr = dtrace_loadptr(maddr + next);
4471
4472					if (scout == NULL)
4473						continue;
4474
4475					scout = dtrace_loadptr(scout + next);
4476
4477					if (scout == NULL)
4478						continue;
4479
4480					scout = dtrace_loadptr(scout + next);
4481
4482					if (scout == NULL)
4483						continue;
4484
4485					if (scout == maddr) {
4486						*flags |= CPU_DTRACE_ILLOP;
4487						break;
4488					}
4489
4490					continue;
4491				}
4492
4493				/*
4494				 * We have the minor data.  Now we need to
4495				 * copy the minor's name into the end of the
4496				 * pathname.
4497				 */
4498				s = (char *)dtrace_loadptr(maddr + name);
4499				len = dtrace_strlen(s, size);
4500
4501				if (*flags & CPU_DTRACE_FAULT)
4502					break;
4503
4504				if (len != 0) {
4505					if ((end -= (len + 1)) < start)
4506						break;
4507
4508					*end = ':';
4509				}
4510
4511				for (i = 1; i <= len; i++)
4512					end[i] = dtrace_load8((uintptr_t)s++);
4513				break;
4514			}
4515		}
4516
4517		while (daddr != NULL && !(*flags & CPU_DTRACE_FAULT)) {
4518			ddi_node_state_t devi_state;
4519
4520			devi_state = dtrace_load32(daddr +
4521			    offsetof(struct dev_info, devi_node_state));
4522
4523			if (*flags & CPU_DTRACE_FAULT)
4524				break;
4525
4526			if (devi_state >= DS_INITIALIZED) {
4527				s = (char *)dtrace_loadptr(daddr +
4528				    offsetof(struct dev_info, devi_addr));
4529				len = dtrace_strlen(s, size);
4530
4531				if (*flags & CPU_DTRACE_FAULT)
4532					break;
4533
4534				if (len != 0) {
4535					if ((end -= (len + 1)) < start)
4536						break;
4537
4538					*end = '@';
4539				}
4540
4541				for (i = 1; i <= len; i++)
4542					end[i] = dtrace_load8((uintptr_t)s++);
4543			}
4544
4545			/*
4546			 * Now for the node name...
4547			 */
4548			s = (char *)dtrace_loadptr(daddr +
4549			    offsetof(struct dev_info, devi_node_name));
4550
4551			daddr = dtrace_loadptr(daddr +
4552			    offsetof(struct dev_info, devi_parent));
4553
4554			/*
4555			 * If our parent is NULL (that is, if we're the root
4556			 * node), we're going to use the special path
4557			 * "devices".
4558			 */
4559			if (daddr == NULL)
4560				s = "devices";
4561
4562			len = dtrace_strlen(s, size);
4563			if (*flags & CPU_DTRACE_FAULT)
4564				break;
4565
4566			if ((end -= (len + 1)) < start)
4567				break;
4568
4569			for (i = 1; i <= len; i++)
4570				end[i] = dtrace_load8((uintptr_t)s++);
4571			*end = '/';
4572
4573			if (depth++ > dtrace_devdepth_max) {
4574				*flags |= CPU_DTRACE_ILLOP;
4575				break;
4576			}
4577		}
4578
4579		if (end < start)
4580			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4581
4582		if (daddr == NULL) {
4583			regs[rd] = (uintptr_t)end;
4584			mstate->dtms_scratch_ptr += size;
4585		}
4586
4587		break;
4588	}
4589#else
4590	case DIF_SUBR_DDI_PATHNAME: {
4591		/* FIXME: awaits galactic disentanglement ;-} */
4592		DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
4593		regs[rd] = NULL;
4594		break;
4595	}
4596#endif /* __APPLE__ */
4597
4598	case DIF_SUBR_STRJOIN: {
4599		char *d = (char *)mstate->dtms_scratch_ptr;
4600		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4601		uintptr_t s1 = tupregs[0].dttk_value;
4602		uintptr_t s2 = tupregs[1].dttk_value;
4603#if !defined(__APPLE__)   /* Quiet compiler warnings */
4604		int i = 0;
4605#else
4606		uint64_t i = 0;
4607#endif /* __APPLE__ */
4608
4609		if (!dtrace_strcanload(s1, size, mstate, vstate) ||
4610		    !dtrace_strcanload(s2, size, mstate, vstate)) {
4611			regs[rd] = NULL;
4612			break;
4613		}
4614
4615		if (!DTRACE_INSCRATCH(mstate, size)) {
4616			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4617			regs[rd] = NULL;
4618			break;
4619		}
4620
4621		for (;;) {
4622			if (i >= size) {
4623				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4624				regs[rd] = NULL;
4625				break;
4626			}
4627
4628			if ((d[i++] = dtrace_load8(s1++)) == '\0') {
4629				i--;
4630				break;
4631			}
4632		}
4633
4634		for (;;) {
4635			if (i >= size) {
4636				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4637				regs[rd] = NULL;
4638				break;
4639			}
4640
4641			if ((d[i++] = dtrace_load8(s2++)) == '\0')
4642				break;
4643		}
4644
4645		if (i < size) {
4646			mstate->dtms_scratch_ptr += i;
4647			regs[rd] = (uintptr_t)d;
4648		}
4649
4650		break;
4651	}
4652
4653	case DIF_SUBR_LLTOSTR: {
4654		int64_t i = (int64_t)tupregs[0].dttk_value;
4655		int64_t val = i < 0 ? i * -1 : i;
		uint64_t size = 22;	/* 20 digits for 2^64 - 1, plus sign and NUL */
4657		char *end = (char *)mstate->dtms_scratch_ptr + size - 1;
4658
4659		if (!DTRACE_INSCRATCH(mstate, size)) {
4660			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4661			regs[rd] = NULL;
4662			break;
4663		}
4664
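		/*
		 * Digits are emitted least-significant first into the tail
		 * of the scratch buffer, so regs[rd] ends up pointing at the
		 * first character; e.g. lltostr(-42) yields "-42".
		 */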
4665		for (*end-- = '\0'; val; val /= 10)
4666			*end-- = '0' + (val % 10);
4667
4668		if (i == 0)
4669			*end-- = '0';
4670
4671		if (i < 0)
4672			*end-- = '-';
4673
4674		regs[rd] = (uintptr_t)end + 1;
4675		mstate->dtms_scratch_ptr += size;
4676		break;
4677	}
4678
4679	case DIF_SUBR_HTONS:
4680	case DIF_SUBR_NTOHS:
4681#ifdef _BIG_ENDIAN
4682		regs[rd] = (uint16_t)tupregs[0].dttk_value;
4683#else
4684		regs[rd] = DT_BSWAP_16((uint16_t)tupregs[0].dttk_value);
4685#endif
4686		break;
4687
4688
4689	case DIF_SUBR_HTONL:
4690	case DIF_SUBR_NTOHL:
4691#ifdef _BIG_ENDIAN
4692		regs[rd] = (uint32_t)tupregs[0].dttk_value;
4693#else
4694		regs[rd] = DT_BSWAP_32((uint32_t)tupregs[0].dttk_value);
4695#endif
4696		break;
4697
4698
4699	case DIF_SUBR_HTONLL:
4700	case DIF_SUBR_NTOHLL:
4701#ifdef _BIG_ENDIAN
4702		regs[rd] = (uint64_t)tupregs[0].dttk_value;
4703#else
4704		regs[rd] = DT_BSWAP_64((uint64_t)tupregs[0].dttk_value);
4705#endif
4706		break;
4707
4708
4709	case DIF_SUBR_DIRNAME:
4710	case DIF_SUBR_BASENAME: {
4711		char *dest = (char *)mstate->dtms_scratch_ptr;
4712		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4713		uintptr_t src = tupregs[0].dttk_value;
4714		int i, j, len = dtrace_strlen((char *)src, size);
4715		int lastbase = -1, firstbase = -1, lastdir = -1;
4716		int start, end;
4717
4718		if (!dtrace_canload(src, len + 1, mstate, vstate)) {
4719			regs[rd] = NULL;
4720			break;
4721		}
4722
4723		if (!DTRACE_INSCRATCH(mstate, size)) {
4724			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4725			regs[rd] = NULL;
4726			break;
4727		}
4728
4729		/*
		 * The basename and dirname for a zero-length string are
		 * defined to be "."
4732		 */
4733		if (len == 0) {
4734			len = 1;
4735			src = (uintptr_t)".";
4736		}
4737
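		/*
		 * For a trailing-slash path such as "/usr/lib/", the scans
		 * below yield a basename of "lib" and a dirname of "/usr",
		 * matching the usual basename(1)/dirname(1) behavior.
		 */
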
4738		/*
4739		 * Start from the back of the string, moving back toward the
4740		 * front until we see a character that isn't a slash.  That
4741		 * character is the last character in the basename.
4742		 */
4743		for (i = len - 1; i >= 0; i--) {
4744			if (dtrace_load8(src + i) != '/')
4745				break;
4746		}
4747
4748		if (i >= 0)
4749			lastbase = i;
4750
4751		/*
4752		 * Starting from the last character in the basename, move
4753		 * towards the front until we find a slash.  The character
4754		 * that we processed immediately before that is the first
4755		 * character in the basename.
4756		 */
4757		for (; i >= 0; i--) {
4758			if (dtrace_load8(src + i) == '/')
4759				break;
4760		}
4761
4762		if (i >= 0)
4763			firstbase = i + 1;
4764
4765		/*
4766		 * Now keep going until we find a non-slash character.  That
4767		 * character is the last character in the dirname.
4768		 */
4769		for (; i >= 0; i--) {
4770			if (dtrace_load8(src + i) != '/')
4771				break;
4772		}
4773
4774		if (i >= 0)
4775			lastdir = i;
4776
4777		ASSERT(!(lastbase == -1 && firstbase != -1));
4778		ASSERT(!(firstbase == -1 && lastdir != -1));
4779
4780		if (lastbase == -1) {
4781			/*
4782			 * We didn't find a non-slash character.  We know that
4783			 * the length is non-zero, so the whole string must be
4784			 * slashes.  In either the dirname or the basename
4785			 * case, we return '/'.
4786			 */
4787			ASSERT(firstbase == -1);
4788			firstbase = lastbase = lastdir = 0;
4789		}
4790
4791		if (firstbase == -1) {
4792			/*
4793			 * The entire string consists only of a basename
4794			 * component.  If we're looking for dirname, we need
4795			 * to change our string to be just "."; if we're
4796			 * looking for a basename, we'll just set the first
4797			 * character of the basename to be 0.
4798			 */
4799			if (subr == DIF_SUBR_DIRNAME) {
4800				ASSERT(lastdir == -1);
4801				src = (uintptr_t)".";
4802				lastdir = 0;
4803			} else {
4804				firstbase = 0;
4805			}
4806		}
4807
4808		if (subr == DIF_SUBR_DIRNAME) {
4809			if (lastdir == -1) {
4810				/*
4811				 * We know that we have a slash in the name --
4812				 * or lastdir would be set to 0, above.  And
4813				 * because lastdir is -1, we know that this
4814				 * slash must be the first character.  (That
4815				 * is, the full string must be of the form
4816				 * "/basename".)  In this case, the last
4817				 * character of the directory name is 0.
4818				 */
4819				lastdir = 0;
4820			}
4821
4822			start = 0;
4823			end = lastdir;
4824		} else {
4825			ASSERT(subr == DIF_SUBR_BASENAME);
4826			ASSERT(firstbase != -1 && lastbase != -1);
4827			start = firstbase;
4828			end = lastbase;
4829		}
4830
4831#if !defined(__APPLE__)   /* Quiet compiler warnings */
4832		for (i = start, j = 0; i <= end && j < size - 1; i++, j++)
4833			dest[j] = dtrace_load8(src + i);
4834#else
4835		for (i = start, j = 0; i <= end && (uint64_t)j < size - 1; i++, j++)
4836			dest[j] = dtrace_load8(src + i);
4837#endif /* __APPLE__ */
4838
4839		dest[j] = '\0';
4840		regs[rd] = (uintptr_t)dest;
4841		mstate->dtms_scratch_ptr += size;
4842		break;
4843	}
4844
4845	case DIF_SUBR_CLEANPATH: {
4846		char *dest = (char *)mstate->dtms_scratch_ptr, c;
4847		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4848		uintptr_t src = tupregs[0].dttk_value;
4849		int i = 0, j = 0;
4850
4851		if (!dtrace_strcanload(src, size, mstate, vstate)) {
4852			regs[rd] = NULL;
4853			break;
4854		}
4855
4856		if (!DTRACE_INSCRATCH(mstate, size)) {
4857			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4858			regs[rd] = NULL;
4859			break;
4860		}
4861
4862		/*
4863		 * Move forward, loading each character.
4864		 */
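		/*
		 * For example, "/a//b/./c/../d" is rewritten as "/a/b/d":
		 * doubled slashes and "/./" components are dropped, and each
		 * "/../" component removes the path component before it.
		 */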
4865		do {
4866			c = dtrace_load8(src + i++);
4867next:
4868#if !defined(__APPLE__)   /* Quiet compiler warnings */
			if (j + 5 >= size)	/* 5 = strlen("/..c") + 1 */
4870				break;
4871#else
			if ((uint64_t)(j + 5) >= size)	/* 5 = strlen("/..c") + 1 */
4873				break;
4874#endif /* __APPLE__ */
4875
4876			if (c != '/') {
4877				dest[j++] = c;
4878				continue;
4879			}
4880
4881			c = dtrace_load8(src + i++);
4882
4883			if (c == '/') {
4884				/*
4885				 * We have two slashes -- we can just advance
4886				 * to the next character.
4887				 */
4888				goto next;
4889			}
4890
4891			if (c != '.') {
4892				/*
4893				 * This is not "." and it's not ".." -- we can
4894				 * just store the "/" and this character and
4895				 * drive on.
4896				 */
4897				dest[j++] = '/';
4898				dest[j++] = c;
4899				continue;
4900			}
4901
4902			c = dtrace_load8(src + i++);
4903
4904			if (c == '/') {
4905				/*
4906				 * This is a "/./" component.  We're not going
4907				 * to store anything in the destination buffer;
4908				 * we're just going to go to the next component.
4909				 */
4910				goto next;
4911			}
4912
4913			if (c != '.') {
4914				/*
4915				 * This is not ".." -- we can just store the
4916				 * "/." and this character and continue
4917				 * processing.
4918				 */
4919				dest[j++] = '/';
4920				dest[j++] = '.';
4921				dest[j++] = c;
4922				continue;
4923			}
4924
4925			c = dtrace_load8(src + i++);
4926
4927			if (c != '/' && c != '\0') {
4928				/*
4929				 * This is not ".." -- it's "..[mumble]".
4930				 * We'll store the "/.." and this character
4931				 * and continue processing.
4932				 */
4933				dest[j++] = '/';
4934				dest[j++] = '.';
4935				dest[j++] = '.';
4936				dest[j++] = c;
4937				continue;
4938			}
4939
4940			/*
4941			 * This is "/../" or "/..\0".  We need to back up
4942			 * our destination pointer until we find a "/".
4943			 */
4944			i--;
4945			while (j != 0 && dest[--j] != '/')
4946				continue;
4947
4948			if (c == '\0')
4949				dest[++j] = '/';
4950		} while (c != '\0');
4951
4952		dest[j] = '\0';
4953		regs[rd] = (uintptr_t)dest;
4954		mstate->dtms_scratch_ptr += size;
4955		break;
4956	}
4957
4958	case DIF_SUBR_INET_NTOA:
4959	case DIF_SUBR_INET_NTOA6:
4960	case DIF_SUBR_INET_NTOP: {
4961		size_t size;
4962		int af, argi, i;
4963		char *base, *end;
4964
4965		if (subr == DIF_SUBR_INET_NTOP) {
4966			af = (int)tupregs[0].dttk_value;
4967			argi = 1;
4968		} else {
4969			af = subr == DIF_SUBR_INET_NTOA ? AF_INET: AF_INET6;
4970			argi = 0;
4971		}
4972
4973		if (af == AF_INET) {
4974#if !defined(__APPLE__)
4975			ipaddr_t ip4;
4976#else
4977			uint32_t ip4;
4978#endif /* __APPLE__ */
4979			uint8_t *ptr8, val;
4980
4981			/*
4982			 * Safely load the IPv4 address.
4983			 */
4984#if !defined(__APPLE__)
4985			ip4 = dtrace_load32(tupregs[argi].dttk_value);
4986#else
4987			dtrace_bcopy(
4988			    (void *)(uintptr_t)tupregs[argi].dttk_value,
4989			    (void *)(uintptr_t)&ip4, sizeof (ip4));
4990#endif /* __APPLE__ */
4991			/*
4992			 * Check an IPv4 string will fit in scratch.
4993			 */
4994#if !defined(__APPLE__)
4995			size = INET_ADDRSTRLEN;
4996#else
4997			size = MAX_IPv4_STR_LEN;
4998#endif /* __APPLE__ */
4999			if (!DTRACE_INSCRATCH(mstate, size)) {
5000				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5001				regs[rd] = NULL;
5002				break;
5003			}
5004			base = (char *)mstate->dtms_scratch_ptr;
5005			end = (char *)mstate->dtms_scratch_ptr + size - 1;
5006
5007			/*
5008			 * Stringify as a dotted decimal quad.
5009			 */
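			/*
			 * The string is assembled right to left, so octet
			 * ptr8[3] lands in the rightmost position.
			 */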
5010			*end-- = '\0';
5011			ptr8 = (uint8_t *)&ip4;
5012			for (i = 3; i >= 0; i--) {
5013				val = ptr8[i];
5014
5015				if (val == 0) {
5016					*end-- = '0';
5017				} else {
5018					for (; val; val /= 10) {
5019						*end-- = '0' + (val % 10);
5020					}
5021				}
5022
5023				if (i > 0)
5024					*end-- = '.';
5025			}
5026			ASSERT(end + 1 >= base);
5027
5028		} else if (af == AF_INET6) {
5029#if defined(__APPLE__)
5030#define _S6_un __u6_addr
5031#define _S6_u8 __u6_addr8
5032#endif /* __APPLE__ */
5033			struct in6_addr ip6;
5034			int firstzero, tryzero, numzero, v6end;
5035			uint16_t val;
5036			const char digits[] = "0123456789abcdef";
5037
5038			/*
			 * Stringify using RFC 1884 convention 2: 16-bit
			 * hexadecimal values with zero-run compression.
			 * Lowercase hexadecimal digits are used,
			 * 	e.g. fe80::214:4fff:fe0b:76c8.
			 * The IPv4-embedded form is returned for inet_ntop;
			 * just the IPv4 string is returned for inet_ntoa6.
5045			 */
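			/*
			 * For example, fe80:0:0:0:214:4fff:fe0b:76c8 renders
			 * as fe80::214:4fff:fe0b:76c8 -- the longest run of
			 * zero-valued 16-bit groups collapses to a single "::".
			 */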
5046
5047			/*
5048			 * Safely load the IPv6 address.
5049			 */
5050			dtrace_bcopy(
5051			    (void *)(uintptr_t)tupregs[argi].dttk_value,
5052			    (void *)(uintptr_t)&ip6, sizeof (struct in6_addr));
5053
5054			/*
5055			 * Check an IPv6 string will fit in scratch.
5056			 */
5057			size = INET6_ADDRSTRLEN;
5058			if (!DTRACE_INSCRATCH(mstate, size)) {
5059				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5060				regs[rd] = NULL;
5061				break;
5062			}
5063			base = (char *)mstate->dtms_scratch_ptr;
5064			end = (char *)mstate->dtms_scratch_ptr + size - 1;
5065			*end-- = '\0';
5066
5067			/*
5068			 * Find the longest run of 16 bit zero values
5069			 * for the single allowed zero compression - "::".
5070			 */
5071			firstzero = -1;
5072			tryzero = -1;
5073			numzero = 1;
5074#if !defined(__APPLE__)   /* Quiet compiler warnings */
5075			for (i = 0; i < sizeof (struct in6_addr); i++) {
5076#else
5077			for (i = 0; i < (int)sizeof (struct in6_addr); i++) {
5078#endif /* __APPLE__ */
5079				if (ip6._S6_un._S6_u8[i] == 0 &&
5080				    tryzero == -1 && i % 2 == 0) {
5081					tryzero = i;
5082					continue;
5083				}
5084
5085				if (tryzero != -1 &&
5086				    (ip6._S6_un._S6_u8[i] != 0 ||
5087				    i == sizeof (struct in6_addr) - 1)) {
5088
5089					if (i - tryzero <= numzero) {
5090						tryzero = -1;
5091						continue;
5092					}
5093
5094					firstzero = tryzero;
5095					numzero = i - i % 2 - tryzero;
5096					tryzero = -1;
5097
5098					if (ip6._S6_un._S6_u8[i] == 0 &&
5099					    i == sizeof (struct in6_addr) - 1)
5100						numzero += 2;
5101				}
5102			}
5103#if !defined(__APPLE__)   /* Quiet compiler warnings */
5104			ASSERT(firstzero + numzero <= sizeof (struct in6_addr));
5105#else
5106			ASSERT(firstzero + numzero <= (int)sizeof (struct in6_addr));
5107#endif /* __APPLE__ */
5108
5109			/*
5110			 * Check for an IPv4 embedded address.
5111			 */
5112			v6end = sizeof (struct in6_addr) - 2;
5113			if (IN6_IS_ADDR_V4MAPPED(&ip6) ||
5114			    IN6_IS_ADDR_V4COMPAT(&ip6)) {
5115#if !defined(__APPLE__)   /* Quiet compiler warnings */
5116				for (i = sizeof (struct in6_addr) - 1;
5117				    i >= DTRACE_V4MAPPED_OFFSET; i--) {
5118#else
5119				for (i = sizeof (struct in6_addr) - 1;
5120				     i >= (int)DTRACE_V4MAPPED_OFFSET; i--) {
5121#endif /* __APPLE__ */
5122					ASSERT(end >= base);
5123
5124					val = ip6._S6_un._S6_u8[i];
5125
5126					if (val == 0) {
5127						*end-- = '0';
5128					} else {
5129						for (; val; val /= 10) {
5130							*end-- = '0' + val % 10;
5131						}
5132					}
5133
5134#if !defined(__APPLE__)   /* Quiet compiler warnings */
5135					if (i > DTRACE_V4MAPPED_OFFSET)
5136						*end-- = '.';
5137#else
5138					if (i > (int)DTRACE_V4MAPPED_OFFSET)
5139						*end-- = '.';
5140#endif /* __APPLE__ */
5141				}
5142
5143				if (subr == DIF_SUBR_INET_NTOA6)
5144					goto inetout;
5145
5146				/*
5147				 * Set v6end to skip the IPv4 address that
5148				 * we have already stringified.
5149				 */
5150				v6end = 10;
5151			}
5152
5153			/*
5154			 * Build the IPv6 string by working through the
5155			 * address in reverse.
5156			 */
5157			for (i = v6end; i >= 0; i -= 2) {
5158				ASSERT(end >= base);
5159
5160				if (i == firstzero + numzero - 2) {
5161					*end-- = ':';
5162					*end-- = ':';
5163					i -= numzero - 2;
5164					continue;
5165				}
5166
5167				if (i < 14 && i != firstzero - 2)
5168					*end-- = ':';
5169
5170				val = (ip6._S6_un._S6_u8[i] << 8) +
5171				    ip6._S6_un._S6_u8[i + 1];
5172
5173				if (val == 0) {
5174					*end-- = '0';
5175				} else {
5176					for (; val; val /= 16) {
5177						*end-- = digits[val % 16];
5178					}
5179				}
5180			}
5181			ASSERT(end + 1 >= base);
5182
5183#if defined(__APPLE__)
5184#undef _S6_un
5185#undef _S6_u8
5186#endif /* __APPLE__ */
5187		} else {
5188			/*
			 * The user didn't use AF_INET or AF_INET6.
5190			 */
5191			DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5192			regs[rd] = NULL;
5193			break;
5194		}
5195
5196inetout:	regs[rd] = (uintptr_t)end + 1;
5197		mstate->dtms_scratch_ptr += size;
5198		break;
5199	}
5200
5201#ifdef __APPLE__
5202
5203	/* CoreProfile callback ('core_profile(uint64_t, [uint64_t], [uint64_t] ...)') */
5204	case DIF_SUBR_COREPROFILE: {
5205		uint64_t selector = tupregs[0].dttk_value;
5206		uint64_t args[DIF_DTR_NREGS-1] = {0ULL};
5207		uint32_t ii;
5208		uint32_t count = (uint32_t)nargs;
5209
		if (count < 1) {
			regs[rd] = KERN_FAILURE;
			break;
		}

		if (count > DIF_DTR_NREGS)
			count = DIF_DTR_NREGS;

		/* copy in any variadic argument list, bounded by DIF_DTR_NREGS */
		for (ii = 0; ii < count - 1; ii++) {
			args[ii] = tupregs[ii + 1].dttk_value;
		}

		kern_return_t ret =
			chudxnu_dtrace_callback(selector, args, count - 1);
		if (KERN_SUCCESS != ret) {
			/* error */
		}
5228
5229		regs[rd] = ret;
5230		break;
5231	}
5232
5233#endif /* __APPLE__ */
5234
5235	}
5236}
5237
5238/*
5239 * Emulate the execution of DTrace IR instructions specified by the given
5240 * DIF object.  This function is deliberately void of assertions as all of
5241 * the necessary checks are handled by a call to dtrace_difo_validate().
5242 */
5243static uint64_t
5244dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate,
5245    dtrace_vstate_t *vstate, dtrace_state_t *state)
5246{
5247	const dif_instr_t *text = difo->dtdo_buf;
5248	const uint_t textlen = difo->dtdo_len;
5249	const char *strtab = difo->dtdo_strtab;
5250	const uint64_t *inttab = difo->dtdo_inttab;
5251
5252	uint64_t rval = 0;
5253	dtrace_statvar_t *svar;
5254	dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
5255	dtrace_difv_t *v;
5256	volatile uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
5257#if !defined(__APPLE__)
5258	volatile uintptr_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
5259#else
5260	volatile uint64_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
5261#endif /* __APPLE__ */
5262
5263	dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
5264	uint64_t regs[DIF_DIR_NREGS];
5265	uint64_t *tmp;
5266
5267	uint8_t cc_n = 0, cc_z = 0, cc_v = 0, cc_c = 0;
5268	int64_t cc_r;
5269#if !defined(__APPLE__)   /* Quiet compiler warnings */
5270	uint_t pc = 0, id, opc;
5271#else
5272	uint_t pc = 0, id, opc = 0;
5273#endif /* __APPLE__ */
5274	uint8_t ttop = 0;
5275	dif_instr_t instr;
5276	uint_t r1, r2, rd;
5277
5278	/*
5279	 * We stash the current DIF object into the machine state: we need it
5280	 * for subsequent access checking.
5281	 */
5282	mstate->dtms_difo = difo;
5283
5284	regs[DIF_REG_R0] = 0; 		/* %r0 is fixed at zero */
5285
5286	while (pc < textlen && !(*flags & CPU_DTRACE_FAULT)) {
5287		opc = pc;
5288
5289		instr = text[pc++];
5290		r1 = DIF_INSTR_R1(instr);
5291		r2 = DIF_INSTR_R2(instr);
5292		rd = DIF_INSTR_RD(instr);
5293
5294		switch (DIF_INSTR_OP(instr)) {
5295		case DIF_OP_OR:
5296			regs[rd] = regs[r1] | regs[r2];
5297			break;
5298		case DIF_OP_XOR:
5299			regs[rd] = regs[r1] ^ regs[r2];
5300			break;
5301		case DIF_OP_AND:
5302			regs[rd] = regs[r1] & regs[r2];
5303			break;
5304		case DIF_OP_SLL:
5305			regs[rd] = regs[r1] << regs[r2];
5306			break;
5307		case DIF_OP_SRL:
5308			regs[rd] = regs[r1] >> regs[r2];
5309			break;
5310		case DIF_OP_SUB:
5311			regs[rd] = regs[r1] - regs[r2];
5312			break;
5313		case DIF_OP_ADD:
5314			regs[rd] = regs[r1] + regs[r2];
5315			break;
5316		case DIF_OP_MUL:
5317			regs[rd] = regs[r1] * regs[r2];
5318			break;
5319		case DIF_OP_SDIV:
5320			if (regs[r2] == 0) {
5321				regs[rd] = 0;
5322				*flags |= CPU_DTRACE_DIVZERO;
5323			} else {
5324				regs[rd] = (int64_t)regs[r1] /
5325				    (int64_t)regs[r2];
5326			}
5327			break;
5328
5329		case DIF_OP_UDIV:
5330			if (regs[r2] == 0) {
5331				regs[rd] = 0;
5332				*flags |= CPU_DTRACE_DIVZERO;
5333			} else {
5334				regs[rd] = regs[r1] / regs[r2];
5335			}
5336			break;
5337
5338		case DIF_OP_SREM:
5339			if (regs[r2] == 0) {
5340				regs[rd] = 0;
5341				*flags |= CPU_DTRACE_DIVZERO;
5342			} else {
5343				regs[rd] = (int64_t)regs[r1] %
5344				    (int64_t)regs[r2];
5345			}
5346			break;
5347
5348		case DIF_OP_UREM:
5349			if (regs[r2] == 0) {
5350				regs[rd] = 0;
5351				*flags |= CPU_DTRACE_DIVZERO;
5352			} else {
5353				regs[rd] = regs[r1] % regs[r2];
5354			}
5355			break;
5356
5357		case DIF_OP_NOT:
5358			regs[rd] = ~regs[r1];
5359			break;
5360		case DIF_OP_MOV:
5361			regs[rd] = regs[r1];
5362			break;
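		/*
		 * The comparison opcodes set simulated condition codes
		 * (negative, zero, overflow, carry) that the conditional
		 * branch opcodes below consult, mirroring a conventional
		 * CPU's flags register.
		 */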
5363		case DIF_OP_CMP:
5364			cc_r = regs[r1] - regs[r2];
5365			cc_n = cc_r < 0;
5366			cc_z = cc_r == 0;
5367			cc_v = 0;
5368			cc_c = regs[r1] < regs[r2];
5369			break;
5370		case DIF_OP_TST:
5371			cc_n = cc_v = cc_c = 0;
5372			cc_z = regs[r1] == 0;
5373			break;
5374		case DIF_OP_BA:
5375			pc = DIF_INSTR_LABEL(instr);
5376			break;
5377		case DIF_OP_BE:
5378			if (cc_z)
5379				pc = DIF_INSTR_LABEL(instr);
5380			break;
5381		case DIF_OP_BNE:
5382			if (cc_z == 0)
5383				pc = DIF_INSTR_LABEL(instr);
5384			break;
5385		case DIF_OP_BG:
5386			if ((cc_z | (cc_n ^ cc_v)) == 0)
5387				pc = DIF_INSTR_LABEL(instr);
5388			break;
5389		case DIF_OP_BGU:
5390			if ((cc_c | cc_z) == 0)
5391				pc = DIF_INSTR_LABEL(instr);
5392			break;
5393		case DIF_OP_BGE:
5394			if ((cc_n ^ cc_v) == 0)
5395				pc = DIF_INSTR_LABEL(instr);
5396			break;
5397		case DIF_OP_BGEU:
5398			if (cc_c == 0)
5399				pc = DIF_INSTR_LABEL(instr);
5400			break;
5401		case DIF_OP_BL:
5402			if (cc_n ^ cc_v)
5403				pc = DIF_INSTR_LABEL(instr);
5404			break;
5405		case DIF_OP_BLU:
5406			if (cc_c)
5407				pc = DIF_INSTR_LABEL(instr);
5408			break;
5409		case DIF_OP_BLE:
5410			if (cc_z | (cc_n ^ cc_v))
5411				pc = DIF_INSTR_LABEL(instr);
5412			break;
5413		case DIF_OP_BLEU:
5414			if (cc_c | cc_z)
5415				pc = DIF_INSTR_LABEL(instr);
5416			break;
5417		case DIF_OP_RLDSB:
5418			if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) {
5419				*flags |= CPU_DTRACE_KPRIV;
5420				*illval = regs[r1];
5421				break;
5422			}
5423			/*FALLTHROUGH*/
5424		case DIF_OP_LDSB:
5425			regs[rd] = (int8_t)dtrace_load8(regs[r1]);
5426			break;
5427		case DIF_OP_RLDSH:
5428			if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) {
5429				*flags |= CPU_DTRACE_KPRIV;
5430				*illval = regs[r1];
5431				break;
5432			}
5433			/*FALLTHROUGH*/
5434		case DIF_OP_LDSH:
5435			regs[rd] = (int16_t)dtrace_load16(regs[r1]);
5436			break;
5437		case DIF_OP_RLDSW:
5438			if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) {
5439				*flags |= CPU_DTRACE_KPRIV;
5440				*illval = regs[r1];
5441				break;
5442			}
5443			/*FALLTHROUGH*/
5444		case DIF_OP_LDSW:
5445			regs[rd] = (int32_t)dtrace_load32(regs[r1]);
5446			break;
5447		case DIF_OP_RLDUB:
5448			if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) {
5449				*flags |= CPU_DTRACE_KPRIV;
5450				*illval = regs[r1];
5451				break;
5452			}
5453			/*FALLTHROUGH*/
5454		case DIF_OP_LDUB:
5455			regs[rd] = dtrace_load8(regs[r1]);
5456			break;
5457		case DIF_OP_RLDUH:
5458			if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) {
5459				*flags |= CPU_DTRACE_KPRIV;
5460				*illval = regs[r1];
5461				break;
5462			}
5463			/*FALLTHROUGH*/
5464		case DIF_OP_LDUH:
5465			regs[rd] = dtrace_load16(regs[r1]);
5466			break;
5467		case DIF_OP_RLDUW:
5468			if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) {
5469				*flags |= CPU_DTRACE_KPRIV;
5470				*illval = regs[r1];
5471				break;
5472			}
5473			/*FALLTHROUGH*/
5474		case DIF_OP_LDUW:
5475			regs[rd] = dtrace_load32(regs[r1]);
5476			break;
5477		case DIF_OP_RLDX:
5478			if (!dtrace_canstore(regs[r1], 8, mstate, vstate)) {
5479				*flags |= CPU_DTRACE_KPRIV;
5480				*illval = regs[r1];
5481				break;
5482			}
5483			/*FALLTHROUGH*/
5484		case DIF_OP_LDX:
5485			regs[rd] = dtrace_load64(regs[r1]);
5486			break;
5487#if !defined(__APPLE__)
5488		case DIF_OP_ULDSB:
5489			regs[rd] = (int8_t)
5490			    dtrace_fuword8((void *)(uintptr_t)regs[r1]);
5491			break;
5492		case DIF_OP_ULDSH:
5493			regs[rd] = (int16_t)
5494			    dtrace_fuword16((void *)(uintptr_t)regs[r1]);
5495			break;
5496		case DIF_OP_ULDSW:
5497			regs[rd] = (int32_t)
5498			    dtrace_fuword32((void *)(uintptr_t)regs[r1]);
5499			break;
5500		case DIF_OP_ULDUB:
5501			regs[rd] =
5502			    dtrace_fuword8((void *)(uintptr_t)regs[r1]);
5503			break;
5504		case DIF_OP_ULDUH:
5505			regs[rd] =
5506			    dtrace_fuword16((void *)(uintptr_t)regs[r1]);
5507			break;
5508		case DIF_OP_ULDUW:
5509			regs[rd] =
5510			    dtrace_fuword32((void *)(uintptr_t)regs[r1]);
5511			break;
5512		case DIF_OP_ULDX:
5513			regs[rd] =
5514			    dtrace_fuword64((void *)(uintptr_t)regs[r1]);
5515			break;
5516#else /* Darwin 32-bit kernel may fetch from 64-bit user. Don't want uintptr_t cast. */
5517		case DIF_OP_ULDSB:
5518			regs[rd] = (int8_t)
5519			    dtrace_fuword8(regs[r1]);
5520			break;
5521		case DIF_OP_ULDSH:
5522			regs[rd] = (int16_t)
5523			    dtrace_fuword16(regs[r1]);
5524			break;
5525		case DIF_OP_ULDSW:
5526			regs[rd] = (int32_t)
5527			    dtrace_fuword32(regs[r1]);
5528			break;
5529		case DIF_OP_ULDUB:
5530			regs[rd] =
5531			    dtrace_fuword8(regs[r1]);
5532			break;
5533		case DIF_OP_ULDUH:
5534			regs[rd] =
5535			    dtrace_fuword16(regs[r1]);
5536			break;
5537		case DIF_OP_ULDUW:
5538			regs[rd] =
5539			    dtrace_fuword32(regs[r1]);
5540			break;
5541		case DIF_OP_ULDX:
5542			regs[rd] =
5543			    dtrace_fuword64(regs[r1]);
5544#endif /* __APPLE__ */
5545			break;
5546		case DIF_OP_RET:
5547			rval = regs[rd];
5548			pc = textlen;
5549			break;
5550		case DIF_OP_NOP:
5551			break;
5552		case DIF_OP_SETX:
5553			regs[rd] = inttab[DIF_INSTR_INTEGER(instr)];
5554			break;
5555		case DIF_OP_SETS:
5556			regs[rd] = (uint64_t)(uintptr_t)
5557			    (strtab + DIF_INSTR_STRING(instr));
5558			break;
5559		case DIF_OP_SCMP: {
5560			size_t sz = state->dts_options[DTRACEOPT_STRSIZE];
5561			uintptr_t s1 = regs[r1];
5562			uintptr_t s2 = regs[r2];
5563
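			/*
			 * Refuse to compare either operand unless the
			 * (non-NULL) string can be safely loaded up to the
			 * configured string-size limit.
			 */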
5564			if (s1 != NULL &&
5565			    !dtrace_strcanload(s1, sz, mstate, vstate))
5566				break;
5567			if (s2 != NULL &&
5568			    !dtrace_strcanload(s2, sz, mstate, vstate))
5569				break;
5570
5571			cc_r = dtrace_strncmp((char *)s1, (char *)s2, sz);
5572
5573			cc_n = cc_r < 0;
5574			cc_z = cc_r == 0;
5575			cc_v = cc_c = 0;
5576			break;
5577		}
5578		case DIF_OP_LDGA:
5579			regs[rd] = dtrace_dif_variable(mstate, state,
5580			    r1, regs[r2]);
5581			break;
5582		case DIF_OP_LDGS:
5583			id = DIF_INSTR_VAR(instr);
5584
5585			if (id >= DIF_VAR_OTHER_UBASE) {
5586				uintptr_t a;
5587
5588				id -= DIF_VAR_OTHER_UBASE;
5589				svar = vstate->dtvs_globals[id];
5590				ASSERT(svar != NULL);
5591				v = &svar->dtsv_var;
5592
5593				if (!(v->dtdv_type.dtdt_flags & DIF_TF_BYREF)) {
5594					regs[rd] = svar->dtsv_data;
5595					break;
5596				}
5597
5598				a = (uintptr_t)svar->dtsv_data;
5599
5600				if (*(uint8_t *)a == UINT8_MAX) {
5601					/*
5602					 * If the 0th byte is set to UINT8_MAX
5603					 * then this is to be treated as a
5604					 * reference to a NULL variable.
5605					 */
5606					regs[rd] = NULL;
5607				} else {
5608					regs[rd] = a + sizeof (uint64_t);
5609				}
5610
5611				break;
5612			}
5613
5614			regs[rd] = dtrace_dif_variable(mstate, state, id, 0);
5615			break;
5616
5617		case DIF_OP_STGS:
5618			id = DIF_INSTR_VAR(instr);
5619
5620			ASSERT(id >= DIF_VAR_OTHER_UBASE);
5621			id -= DIF_VAR_OTHER_UBASE;
5622
5623			svar = vstate->dtvs_globals[id];
5624			ASSERT(svar != NULL);
5625			v = &svar->dtsv_var;
5626
5627			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5628				uintptr_t a = (uintptr_t)svar->dtsv_data;
5629
5630				ASSERT(a != NULL);
5631				ASSERT(svar->dtsv_size != 0);
5632
5633				if (regs[rd] == NULL) {
5634					*(uint8_t *)a = UINT8_MAX;
5635					break;
5636				} else {
5637					*(uint8_t *)a = 0;
5638					a += sizeof (uint64_t);
5639				}
5640				if (!dtrace_vcanload(
5641				    (void *)(uintptr_t)regs[rd], &v->dtdv_type,
5642				    mstate, vstate))
5643					break;
5644
5645				dtrace_vcopy((void *)(uintptr_t)regs[rd],
5646				    (void *)a, &v->dtdv_type);
5647				break;
5648			}
5649
5650			svar->dtsv_data = regs[rd];
5651			break;
5652
5653		case DIF_OP_LDTA:
5654			/*
5655			 * There are no DTrace built-in thread-local arrays at
5656			 * present.  This opcode is reserved for future use.
5657			 */
5658			*flags |= CPU_DTRACE_ILLOP;
5659			regs[rd] = 0;
5660			break;
5661
5662		case DIF_OP_LDLS:
5663			id = DIF_INSTR_VAR(instr);
5664
5665			if (id < DIF_VAR_OTHER_UBASE) {
5666				/*
5667				 * For now, this has no meaning.
5668				 */
5669				regs[rd] = 0;
5670				break;
5671			}
5672
5673			id -= DIF_VAR_OTHER_UBASE;
5674
5675#if !defined(__APPLE__)   /* Quiet compiler warnings */
5676			ASSERT(id < vstate->dtvs_nlocals);
5677#else
5678			ASSERT(id < (uint_t)vstate->dtvs_nlocals);
5679#endif /* __APPLE__ */
5680			ASSERT(vstate->dtvs_locals != NULL);
5681
5682			svar = vstate->dtvs_locals[id];
5683			ASSERT(svar != NULL);
5684			v = &svar->dtsv_var;
5685
5686			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5687				uintptr_t a = (uintptr_t)svar->dtsv_data;
5688				size_t sz = v->dtdv_type.dtdt_size;
5689
5690				sz += sizeof (uint64_t);
5691				ASSERT(svar->dtsv_size == (int)NCPU * sz);
5692				a += CPU->cpu_id * sz;
5693
5694				if (*(uint8_t *)a == UINT8_MAX) {
5695					/*
5696					 * If the 0th byte is set to UINT8_MAX
5697					 * then this is to be treated as a
5698					 * reference to a NULL variable.
5699					 */
5700					regs[rd] = NULL;
5701				} else {
5702					regs[rd] = a + sizeof (uint64_t);
5703				}
5704
5705				break;
5706			}
5707
5708			ASSERT(svar->dtsv_size == (int)NCPU * sizeof (uint64_t));
5709			tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
5710			regs[rd] = tmp[CPU->cpu_id];
5711			break;
5712
5713		case DIF_OP_STLS:
5714			id = DIF_INSTR_VAR(instr);
5715
5716			ASSERT(id >= DIF_VAR_OTHER_UBASE);
5717			id -= DIF_VAR_OTHER_UBASE;
5718#if !defined(__APPLE__)   /* Quiet compiler warnings */
5719			ASSERT(id < vstate->dtvs_nlocals);
5720#else
5721			ASSERT(id < (uint_t)vstate->dtvs_nlocals);
5722#endif /* __APPLE__ */
5723
5724			ASSERT(vstate->dtvs_locals != NULL);
5725			svar = vstate->dtvs_locals[id];
5726			ASSERT(svar != NULL);
5727			v = &svar->dtsv_var;
5728
5729			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5730				uintptr_t a = (uintptr_t)svar->dtsv_data;
5731				size_t sz = v->dtdv_type.dtdt_size;
5732
5733				sz += sizeof (uint64_t);
5734				ASSERT(svar->dtsv_size == (int)NCPU * sz);
5735				a += CPU->cpu_id * sz;
5736
5737				if (regs[rd] == NULL) {
5738					*(uint8_t *)a = UINT8_MAX;
5739					break;
5740				} else {
5741					*(uint8_t *)a = 0;
5742					a += sizeof (uint64_t);
5743				}
5744
5745				if (!dtrace_vcanload(
5746				    (void *)(uintptr_t)regs[rd], &v->dtdv_type,
5747				    mstate, vstate))
5748					break;
5749
5750				dtrace_vcopy((void *)(uintptr_t)regs[rd],
5751				    (void *)a, &v->dtdv_type);
5752				break;
5753			}
5754
5755			ASSERT(svar->dtsv_size == (int)NCPU * sizeof (uint64_t));
5756			tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
5757			tmp[CPU->cpu_id] = regs[rd];
5758			break;
5759
5760		case DIF_OP_LDTS: {
5761			dtrace_dynvar_t *dvar;
5762			dtrace_key_t *key;
5763
5764			id = DIF_INSTR_VAR(instr);
5765			ASSERT(id >= DIF_VAR_OTHER_UBASE);
5766			id -= DIF_VAR_OTHER_UBASE;
5767			v = &vstate->dtvs_tlocals[id];
5768
5769			key = &tupregs[DIF_DTR_NREGS];
5770			key[0].dttk_value = (uint64_t)id;
5771			key[0].dttk_size = 0;
5772			DTRACE_TLS_THRKEY(key[1].dttk_value);
5773			key[1].dttk_size = 0;
5774
5775			dvar = dtrace_dynvar(dstate, 2, key,
5776			    sizeof (uint64_t), DTRACE_DYNVAR_NOALLOC,
5777			    mstate, vstate);
5778
5779			if (dvar == NULL) {
5780				regs[rd] = 0;
5781				break;
5782			}
5783
5784			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5785				regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
5786			} else {
5787				regs[rd] = *((uint64_t *)dvar->dtdv_data);
5788			}
5789
5790			break;
5791		}
5792
5793		case DIF_OP_STTS: {
5794			dtrace_dynvar_t *dvar;
5795			dtrace_key_t *key;
5796
5797			id = DIF_INSTR_VAR(instr);
5798			ASSERT(id >= DIF_VAR_OTHER_UBASE);
5799			id -= DIF_VAR_OTHER_UBASE;
5800
5801			key = &tupregs[DIF_DTR_NREGS];
5802			key[0].dttk_value = (uint64_t)id;
5803			key[0].dttk_size = 0;
5804			DTRACE_TLS_THRKEY(key[1].dttk_value);
5805			key[1].dttk_size = 0;
5806			v = &vstate->dtvs_tlocals[id];
5807
5808			dvar = dtrace_dynvar(dstate, 2, key,
5809			    v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
5810			    v->dtdv_type.dtdt_size : sizeof (uint64_t),
5811			    regs[rd] ? DTRACE_DYNVAR_ALLOC :
5812			    DTRACE_DYNVAR_DEALLOC, mstate, vstate);
5813
5814			/*
5815			 * Given that we're storing to thread-local data,
5816			 * we need to flush our predicate cache.
5817			 */
5818#if !defined(__APPLE__)
5819			curthread->t_predcache = NULL;
5820#else
5821			dtrace_set_thread_predcache(current_thread(), 0);
5822#endif /* __APPLE__ */
5823
5824			if (dvar == NULL)
5825				break;
5826
5827			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5828				if (!dtrace_vcanload(
5829				    (void *)(uintptr_t)regs[rd],
5830				    &v->dtdv_type, mstate, vstate))
5831					break;
5832
5833				dtrace_vcopy((void *)(uintptr_t)regs[rd],
5834				    dvar->dtdv_data, &v->dtdv_type);
5835			} else {
5836				*((uint64_t *)dvar->dtdv_data) = regs[rd];
5837			}
5838
5839			break;
5840		}
5841
5842		case DIF_OP_SRA:
5843			regs[rd] = (int64_t)regs[r1] >> regs[r2];
5844			break;
5845
5846		case DIF_OP_CALL:
5847			dtrace_dif_subr(DIF_INSTR_SUBR(instr), rd,
5848			    regs, tupregs, ttop, mstate, state);
5849			break;
5850
5851		case DIF_OP_PUSHTR:
5852			if (ttop == DIF_DTR_NREGS) {
5853				*flags |= CPU_DTRACE_TUPOFLOW;
5854				break;
5855			}
5856
5857			if (r1 == DIF_TYPE_STRING) {
5858				/*
5859				 * If this is a string type and the size is 0,
5860				 * we'll use the system-wide default string
5861				 * size.  Note that we are _not_ looking at
5862				 * the value of the DTRACEOPT_STRSIZE option;
5863				 * had this been set, we would expect to have
5864				 * a non-zero size value in the "pushtr".
5865				 */
5866				tupregs[ttop].dttk_size =
5867				    dtrace_strlen((char *)(uintptr_t)regs[rd],
5868				    regs[r2] ? regs[r2] :
5869				    dtrace_strsize_default) + 1;
5870			} else {
5871				tupregs[ttop].dttk_size = regs[r2];
5872			}
5873
5874			tupregs[ttop++].dttk_value = regs[rd];
5875			break;
5876
5877		case DIF_OP_PUSHTV:
5878			if (ttop == DIF_DTR_NREGS) {
5879				*flags |= CPU_DTRACE_TUPOFLOW;
5880				break;
5881			}
5882
5883			tupregs[ttop].dttk_value = regs[rd];
5884			tupregs[ttop++].dttk_size = 0;
5885			break;
5886
5887		case DIF_OP_POPTS:
5888			if (ttop != 0)
5889				ttop--;
5890			break;
5891
5892		case DIF_OP_FLUSHTS:
5893			ttop = 0;
5894			break;
5895
5896		case DIF_OP_LDGAA:
5897		case DIF_OP_LDTAA: {
5898			dtrace_dynvar_t *dvar;
5899			dtrace_key_t *key = tupregs;
5900			uint_t nkeys = ttop;
5901
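			/*
			 * Associative-array loads: the tuple keys already
			 * pushed via pushtr/pushtv live in tupregs; we append
			 * the variable id (plus the thread key for a
			 * thread-local array) and look the tuple up in the
			 * dynamic variable space without allocating.
			 */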
5902			id = DIF_INSTR_VAR(instr);
5903			ASSERT(id >= DIF_VAR_OTHER_UBASE);
5904			id -= DIF_VAR_OTHER_UBASE;
5905
5906			key[nkeys].dttk_value = (uint64_t)id;
5907			key[nkeys++].dttk_size = 0;
5908
5909			if (DIF_INSTR_OP(instr) == DIF_OP_LDTAA) {
5910				DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
5911				key[nkeys++].dttk_size = 0;
5912				v = &vstate->dtvs_tlocals[id];
5913			} else {
5914				v = &vstate->dtvs_globals[id]->dtsv_var;
5915			}
5916
5917			dvar = dtrace_dynvar(dstate, nkeys, key,
5918			    v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
5919			    v->dtdv_type.dtdt_size : sizeof (uint64_t),
5920			    DTRACE_DYNVAR_NOALLOC, mstate, vstate);
5921
5922			if (dvar == NULL) {
5923				regs[rd] = 0;
5924				break;
5925			}
5926
5927			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5928				regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
5929			} else {
5930				regs[rd] = *((uint64_t *)dvar->dtdv_data);
5931			}
5932
5933			break;
5934		}
5935
5936		case DIF_OP_STGAA:
5937		case DIF_OP_STTAA: {
5938			dtrace_dynvar_t *dvar;
5939			dtrace_key_t *key = tupregs;
5940			uint_t nkeys = ttop;
5941
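			/*
			 * Associative-array stores mirror the loads above,
			 * except that a non-zero value allocates (or locates)
			 * the dynamic variable while a zero value deallocates
			 * it.
			 */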
5942			id = DIF_INSTR_VAR(instr);
5943			ASSERT(id >= DIF_VAR_OTHER_UBASE);
5944			id -= DIF_VAR_OTHER_UBASE;
5945
5946			key[nkeys].dttk_value = (uint64_t)id;
5947			key[nkeys++].dttk_size = 0;
5948
5949			if (DIF_INSTR_OP(instr) == DIF_OP_STTAA) {
5950				DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
5951				key[nkeys++].dttk_size = 0;
5952				v = &vstate->dtvs_tlocals[id];
5953			} else {
5954				v = &vstate->dtvs_globals[id]->dtsv_var;
5955			}
5956
5957			dvar = dtrace_dynvar(dstate, nkeys, key,
5958			    v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
5959			    v->dtdv_type.dtdt_size : sizeof (uint64_t),
5960			    regs[rd] ? DTRACE_DYNVAR_ALLOC :
5961			    DTRACE_DYNVAR_DEALLOC, mstate, vstate);
5962
5963			if (dvar == NULL)
5964				break;
5965
5966			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5967				if (!dtrace_vcanload(
5968				    (void *)(uintptr_t)regs[rd], &v->dtdv_type,
5969				    mstate, vstate))
5970					break;
5971
5972				dtrace_vcopy((void *)(uintptr_t)regs[rd],
5973				    dvar->dtdv_data, &v->dtdv_type);
5974			} else {
5975				*((uint64_t *)dvar->dtdv_data) = regs[rd];
5976			}
5977
5978			break;
5979		}
5980
5981		case DIF_OP_ALLOCS: {
5982			uintptr_t ptr = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
5983			size_t size = ptr - mstate->dtms_scratch_ptr + regs[r1];
5984
5985			/*
5986			 * Rounding up the user allocation size could have
5987			 * overflowed large, bogus allocations (like -1ULL) to
5988			 * 0.
5989			 */
5990			if (size < regs[r1] ||
5991			    !DTRACE_INSCRATCH(mstate, size)) {
5992				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5993				regs[rd] = NULL;
5994				break;
5995			}
5996
5997			dtrace_bzero((void *)mstate->dtms_scratch_ptr, size);
5998			mstate->dtms_scratch_ptr += size;
5999			regs[rd] = ptr;
6000			break;
6001		}
6002
6003		case DIF_OP_COPYS:
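			/*
			 * copys may only proceed if the destination is
			 * storable and the source is loadable for the full
			 * requested length.
			 */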
6004			if (!dtrace_canstore(regs[rd], regs[r2],
6005			    mstate, vstate)) {
6006				*flags |= CPU_DTRACE_BADADDR;
6007				*illval = regs[rd];
6008				break;
6009			}
6010
6011			if (!dtrace_canload(regs[r1], regs[r2], mstate, vstate))
6012				break;
6013
6014			dtrace_bcopy((void *)(uintptr_t)regs[r1],
6015			    (void *)(uintptr_t)regs[rd], (size_t)regs[r2]);
6016			break;
6017
6018		case DIF_OP_STB:
6019			if (!dtrace_canstore(regs[rd], 1, mstate, vstate)) {
6020				*flags |= CPU_DTRACE_BADADDR;
6021				*illval = regs[rd];
6022				break;
6023			}
6024			*((uint8_t *)(uintptr_t)regs[rd]) = (uint8_t)regs[r1];
6025			break;
6026
6027		case DIF_OP_STH:
6028			if (!dtrace_canstore(regs[rd], 2, mstate, vstate)) {
6029				*flags |= CPU_DTRACE_BADADDR;
6030				*illval = regs[rd];
6031				break;
6032			}
6033			if (regs[rd] & 1) {
6034				*flags |= CPU_DTRACE_BADALIGN;
6035				*illval = regs[rd];
6036				break;
6037			}
6038			*((uint16_t *)(uintptr_t)regs[rd]) = (uint16_t)regs[r1];
6039			break;
6040
6041		case DIF_OP_STW:
6042			if (!dtrace_canstore(regs[rd], 4, mstate, vstate)) {
6043				*flags |= CPU_DTRACE_BADADDR;
6044				*illval = regs[rd];
6045				break;
6046			}
6047			if (regs[rd] & 3) {
6048				*flags |= CPU_DTRACE_BADALIGN;
6049				*illval = regs[rd];
6050				break;
6051			}
6052			*((uint32_t *)(uintptr_t)regs[rd]) = (uint32_t)regs[r1];
6053			break;
6054
6055		case DIF_OP_STX:
6056			if (!dtrace_canstore(regs[rd], 8, mstate, vstate)) {
6057				*flags |= CPU_DTRACE_BADADDR;
6058				*illval = regs[rd];
6059				break;
6060			}
6061#if !defined(__APPLE__)
6062			if (regs[rd] & 7) {
6063#else
6064			if (regs[rd] & 3) { /* Darwin kmem_zalloc() called from dtrace_difo_init() is 4-byte aligned. */
6065#endif /* __APPLE__ */
6066				*flags |= CPU_DTRACE_BADALIGN;
6067				*illval = regs[rd];
6068				break;
6069			}
6070			*((uint64_t *)(uintptr_t)regs[rd]) = regs[r1];
6071			break;
6072		}
6073	}
6074
6075	if (!(*flags & CPU_DTRACE_FAULT))
6076		return (rval);
6077
6078	mstate->dtms_fltoffs = opc * sizeof (dif_instr_t);
6079	mstate->dtms_present |= DTRACE_MSTATE_FLTOFFS;
6080
6081	return (0);
6082}
6083
6084static void
6085dtrace_action_breakpoint(dtrace_ecb_t *ecb)
6086{
6087	dtrace_probe_t *probe = ecb->dte_probe;
6088	dtrace_provider_t *prov = probe->dtpr_provider;
6089	char c[DTRACE_FULLNAMELEN + 80], *str;
6090#if !defined(__APPLE__)   /* Quiet compiler warnings */
6091	char *msg = "dtrace: breakpoint action at probe ";
6092	char *ecbmsg = " (ecb ";
6093#else
6094	const char *msg = "dtrace: breakpoint action at probe ";
6095	const char *ecbmsg = " (ecb ";
6096#endif /* __APPLE__ */
6097	uintptr_t mask = (0xf << (sizeof (uintptr_t) * NBBY / 4));
6098	uintptr_t val = (uintptr_t)ecb;
6099	int shift = (sizeof (uintptr_t) * NBBY) - 4, i = 0;
6100
6101	if (dtrace_destructive_disallow)
6102		return;
6103
6104	/*
6105	 * It's impossible to be taking action on the NULL probe.
6106	 */
6107	ASSERT(probe != NULL);
6108
6109	/*
6110	 * This is a poor man's (destitute man's?) sprintf():  we want to
6111	 * print the provider name, module name, function name and name of
6112	 * the probe, along with the hex address of the ECB with the breakpoint
6113	 * action -- all of which we must place in the character buffer by
6114	 * hand.
6115	 */
6116	while (*msg != '\0')
6117		c[i++] = *msg++;
6118
6119	for (str = prov->dtpv_name; *str != '\0'; str++)
6120		c[i++] = *str;
6121	c[i++] = ':';
6122
6123	for (str = probe->dtpr_mod; *str != '\0'; str++)
6124		c[i++] = *str;
6125	c[i++] = ':';
6126
6127	for (str = probe->dtpr_func; *str != '\0'; str++)
6128		c[i++] = *str;
6129	c[i++] = ':';
6130
6131	for (str = probe->dtpr_name; *str != '\0'; str++)
6132		c[i++] = *str;
6133
6134	while (*ecbmsg != '\0')
6135		c[i++] = *ecbmsg++;
6136
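	/*
	 * Append the ECB address in hex, one nibble at a time, skipping
	 * any leading zero nibbles.
	 */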
6137	while (shift >= 0) {
6138		mask = (uintptr_t)0xf << shift;
6139
6140		if (val >= ((uintptr_t)1 << shift))
6141			c[i++] = "0123456789abcdef"[(val & mask) >> shift];
6142		shift -= 4;
6143	}
6144
6145	c[i++] = ')';
6146	c[i] = '\0';
6147
6148	debug_enter(c);
6149}
6150
6151static void
6152dtrace_action_panic(dtrace_ecb_t *ecb)
6153{
6154	dtrace_probe_t *probe = ecb->dte_probe;
6155
6156	/*
6157	 * It's impossible to be taking action on the NULL probe.
6158	 */
6159	ASSERT(probe != NULL);
6160
6161	if (dtrace_destructive_disallow)
6162		return;
6163
6164	if (dtrace_panicked != NULL)
6165		return;
6166
6167#if !defined(__APPLE__)
6168	if (dtrace_casptr(&dtrace_panicked, NULL, curthread) != NULL)
6169		return;
6170#else
6171	if (dtrace_casptr(&dtrace_panicked, NULL, current_thread()) != NULL)
6172		return;
6173#endif /* __APPLE__ */
6174
6175	/*
6176	 * We won the right to panic.  (We want to be sure that only one
6177	 * thread calls panic() from dtrace_probe(), and that panic() is
6178	 * called exactly once.)
6179	 */
6180	panic("dtrace: panic action at probe %s:%s:%s:%s (ecb %p)",
6181	    probe->dtpr_provider->dtpv_name, probe->dtpr_mod,
6182	    probe->dtpr_func, probe->dtpr_name, (void *)ecb);
6183
6184#if defined(__APPLE__)
6185	/* Mac OS X debug feature -- can return from panic() */
6186	dtrace_panicked = NULL;
6187#endif /* __APPLE__ */
6188}
6189
6190static void
6191dtrace_action_raise(uint64_t sig)
6192{
6193	if (dtrace_destructive_disallow)
6194		return;
6195
6196	if (sig >= NSIG) {
6197		DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
6198		return;
6199	}
6200
6201#if !defined(__APPLE__)
6202	/*
6203	 * raise() has a queue depth of 1 -- we ignore all subsequent
6204	 * invocations of the raise() action.
6205	 */
6206	if (curthread->t_dtrace_sig == 0)
6207		curthread->t_dtrace_sig = (uint8_t)sig;
6208
6209	curthread->t_sig_check = 1;
6210	aston(curthread);
6211#else
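	/*
	 * As above, raise() has a queue depth of 1: the signal is recorded
	 * only if no other signal is already pending on this uthread.
	 */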
6212	uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());
6213
6214	if (uthread && uthread->t_dtrace_sig == 0) {
6215		uthread->t_dtrace_sig = sig;
6216		act_set_astbsd(current_thread());
6217	}
6218#endif /* __APPLE__ */
6219}
6220
6221static void
6222dtrace_action_stop(void)
6223{
6224	if (dtrace_destructive_disallow)
6225		return;
6226
6227#if !defined(__APPLE__)
6228	if (!curthread->t_dtrace_stop) {
6229		curthread->t_dtrace_stop = 1;
6230		curthread->t_sig_check = 1;
6231		aston(curthread);
6232	}
6233#else
6234	uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());
6235	if (uthread) {
6236		/*
6237		 * The currently running process will be suspended (via
6238		 * task_suspend()) when it next leaves the kernel.
6239		 */
6240		uthread->t_dtrace_stop = 1;
6241		act_set_astbsd(current_thread());
6242	}
6243#endif /* __APPLE__ */
6244}
6245
6246#if defined(__APPLE__)
6247static void
6248dtrace_action_pidresume(uint64_t pid)
6249{
6250	if (dtrace_destructive_disallow)
6251		return;
6252
6253	if (kauth_cred_issuser(kauth_cred_get()) == 0) {
6254		DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
6255		return;
6256	}
6257	uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());
6258
6259	/*
6260	 * When the currently running process leaves the kernel, it attempts to
6261	 * task_resume the process (denoted by pid), if that pid appears to have
6262	 * been stopped by dtrace_action_stop().
6263	 * The currently running process has a pidresume() queue depth of 1 --
6264	 * subsequent invocations of the pidresume() action are ignored.
6265	 */
6266
6267	if (pid != 0 && uthread && uthread->t_dtrace_resumepid == 0) {
6268		uthread->t_dtrace_resumepid = pid;
6269		act_set_astbsd(current_thread());
6270	}
6271}
6272#endif /* __APPLE__ */
6273
6274
6275static void
6276dtrace_action_chill(dtrace_mstate_t *mstate, hrtime_t val)
6277{
6278	hrtime_t now;
6279	volatile uint16_t *flags;
6280	dtrace_cpu_t *cpu = CPU;
6281
6282	if (dtrace_destructive_disallow)
6283		return;
6284
6285	flags = (volatile uint16_t *)&cpu_core[cpu->cpu_id].cpuc_dtrace_flags;
6286
6287	now = dtrace_gethrtime();
6288
6289	if (now - cpu->cpu_dtrace_chillmark > dtrace_chill_interval) {
6290		/*
6291		 * We need to advance the mark to the current time.
6292		 */
6293		cpu->cpu_dtrace_chillmark = now;
6294		cpu->cpu_dtrace_chilled = 0;
6295	}
6296
6297	/*
6298	 * Now check to see if the requested chill time would take us over
6299	 * the maximum amount of time allowed in the chill interval.  (Or
6300	 * worse, if the calculation itself induces overflow.)
6301	 */
6302	if (cpu->cpu_dtrace_chilled + val > dtrace_chill_max ||
6303	    cpu->cpu_dtrace_chilled + val < cpu->cpu_dtrace_chilled) {
6304		*flags |= CPU_DTRACE_ILLOP;
6305		return;
6306	}
6307
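	/*
	 * Busy-wait until the requested chill interval has elapsed.
	 */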
6308	while (dtrace_gethrtime() - now < val)
6309		continue;
6310
6311	/*
6312	 * Normally, we assure that the value of the variable "timestamp" does
6313	 * not change within an ECB.  The presence of chill() represents an
6314	 * exception to this rule, however.
6315	 */
6316	mstate->dtms_present &= ~DTRACE_MSTATE_TIMESTAMP;
6317	cpu->cpu_dtrace_chilled += val;
6318}
6319
6320static void
6321dtrace_action_ustack(dtrace_mstate_t *mstate, dtrace_state_t *state,
6322    uint64_t *buf, uint64_t arg)
6323{
6324	int nframes = DTRACE_USTACK_NFRAMES(arg);
6325	int strsize = DTRACE_USTACK_STRSIZE(arg);
6326	uint64_t *pcs = &buf[1], *fps;
6327	char *str = (char *)&pcs[nframes];
6328	int size, offs = 0, i, j;
6329	uintptr_t old = mstate->dtms_scratch_ptr, saved;
6330	uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
6331	char *sym;
6332
6333	/*
6334	 * Should be taking a faster path if string space has not been
6335	 * allocated.
6336	 */
6337	ASSERT(strsize != 0);
6338
6339	/*
6340	 * We will first allocate some temporary space for the frame pointers.
6341	 */
6342	fps = (uint64_t *)P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
6343	size = (uintptr_t)fps - mstate->dtms_scratch_ptr +
6344	    (nframes * sizeof (uint64_t));
6345
6346#if !defined(__APPLE__)   /* Quiet compiler warnings */
6347	if (!DTRACE_INSCRATCH(mstate, size)) {
6348#else
6349	if (!DTRACE_INSCRATCH(mstate, (uintptr_t)size)) {
6350#endif /* __APPLE__ */
6351		/*
6352		 * Not enough room for our frame pointers -- need to indicate
6353		 * that we ran out of scratch space.
6354		 */
6355		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
6356		return;
6357	}
6358
6359	mstate->dtms_scratch_ptr += size;
6360	saved = mstate->dtms_scratch_ptr;
6361
6362	/*
6363	 * Now get a stack with both program counters and frame pointers.
6364	 */
6365	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6366	dtrace_getufpstack(buf, fps, nframes + 1);
6367	DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6368
6369	/*
6370	 * If that faulted, we're cooked.
6371	 */
6372	if (*flags & CPU_DTRACE_FAULT)
6373		goto out;
6374
6375	/*
6376	 * Now we want to walk up the stack, calling the USTACK helper.  For
6377	 * each iteration, we restore the scratch pointer.
6378	 */
6379	for (i = 0; i < nframes; i++) {
6380		mstate->dtms_scratch_ptr = saved;
6381
6382		if (offs >= strsize)
6383			break;
6384
6385		sym = (char *)(uintptr_t)dtrace_helper(
6386		    DTRACE_HELPER_ACTION_USTACK,
6387		    mstate, state, pcs[i], fps[i]);
6388
6389		/*
6390		 * If we faulted while running the helper, we're going to
6391		 * clear the fault and null out the corresponding string.
6392		 */
6393		if (*flags & CPU_DTRACE_FAULT) {
6394			*flags &= ~CPU_DTRACE_FAULT;
6395			str[offs++] = '\0';
6396			continue;
6397		}
6398
6399		if (sym == NULL) {
6400			str[offs++] = '\0';
6401			continue;
6402		}
6403
6404		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6405
6406		/*
6407		 * Now copy in the string that the helper returned to us.
6408		 */
6409		for (j = 0; offs + j < strsize; j++) {
6410			if ((str[offs + j] = sym[j]) == '\0')
6411				break;
6412		}
6413
6414		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6415
6416		offs += j + 1;
6417	}
6418
6419	if (offs >= strsize) {
6420		/*
6421		 * If we didn't have room for all of the strings, we don't
6422		 * abort processing -- this needn't be a fatal error -- but we
6423		 * still want to increment a counter (dts_stkstroverflows) to
6424		 * allow this condition to be warned about.  (If this is from
6425		 * a jstack() action, it is easily tuned via jstackstrsize.)
6426		 */
6427		dtrace_error(&state->dts_stkstroverflows);
6428	}
6429
6430	while (offs < strsize)
6431		str[offs++] = '\0';
6432
6433out:
6434	mstate->dtms_scratch_ptr = old;
6435}
6436
6437/*
6438 * If you're looking for the epicenter of DTrace, you just found it.  This
6439 * is the function called by the provider to fire a probe -- from which all
6440 * subsequent probe-context DTrace activity emanates.
6441 */
6442#if !defined(__APPLE__)
6443void
6444dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1,
6445    uintptr_t arg2, uintptr_t arg3, uintptr_t arg4)
6446#else
6447static void
6448__dtrace_probe(dtrace_id_t id, uint64_t arg0, uint64_t arg1,
6449    uint64_t arg2, uint64_t arg3, uint64_t arg4)
6450#endif /* __APPLE__ */
6451{
6452	processorid_t cpuid;
6453	dtrace_icookie_t cookie;
6454	dtrace_probe_t *probe;
6455	dtrace_mstate_t mstate;
6456	dtrace_ecb_t *ecb;
6457	dtrace_action_t *act;
6458	intptr_t offs;
6459	size_t size;
6460	int vtime, onintr;
6461	volatile uint16_t *flags;
6462	hrtime_t now;
6463
6464#if !defined(__APPLE__)
6465	/*
6466	 * Kick out immediately if this CPU is still being born (in which case
6467	 * curthread will be set to -1) or the current thread can't allow
6468	 * probes in its current context.
6469	 */
6470	if (((uintptr_t)curthread & 1) || (curthread->t_flag & T_DONTDTRACE))
6471		return;
6472#else
6473	/* Not a concern for Darwin */
6474#endif /* __APPLE__ */
6475
6476	cookie = dtrace_interrupt_disable();
6477	probe = dtrace_probes[id - 1];
6478	cpuid = CPU->cpu_id;
6479	onintr = CPU_ON_INTR(CPU);
6480
6481#if !defined(__APPLE__)
6482	if (!onintr && probe->dtpr_predcache != DTRACE_CACHEIDNONE &&
6483	    probe->dtpr_predcache == curthread->t_predcache) {
6484#else
6485	if (!onintr && probe->dtpr_predcache != DTRACE_CACHEIDNONE &&
6486	    probe->dtpr_predcache == dtrace_get_thread_predcache(current_thread())) {
6487#endif /* __APPLE__ */
6488		/*
6489		 * We have hit in the predicate cache; we know that
6490		 * this predicate would evaluate to be false.
6491		 */
6492		dtrace_interrupt_enable(cookie);
6493		return;
6494	}
6495
6496	if (panic_quiesce) {
6497		/*
6498		 * We don't trace anything if we're panicking.
6499		 */
6500		dtrace_interrupt_enable(cookie);
6501		return;
6502	}
6503
6504#if !defined(__APPLE__)
6505	now = dtrace_gethrtime();
6506	vtime = dtrace_vtime_references != 0;
6507
6508	if (vtime && curthread->t_dtrace_start)
6509		curthread->t_dtrace_vtime += now - curthread->t_dtrace_start;
6510#else
6511	/* FIXME: the time spent entering DTrace and arriving to this point is attributed
6512	   to the current thread. Instead it should accrue to DTrace. */
6513	vtime = dtrace_vtime_references != 0;
6514
6515	if (vtime)
6516	{
6517		int64_t dtrace_accum_time, recent_vtime;
6518		thread_t thread = current_thread();
6519
6520		dtrace_accum_time = dtrace_get_thread_tracing(thread); /* Time spent inside DTrace so far (nanoseconds) */
6521
6522		if (dtrace_accum_time >= 0) {
6523			recent_vtime = dtrace_abs_to_nano(dtrace_calc_thread_recent_vtime(thread)); /* up to the moment thread vtime */
6524
6525			recent_vtime = recent_vtime - dtrace_accum_time; /* Time without DTrace contribution */
6526
6527			dtrace_set_thread_vtime(thread, recent_vtime);
6528		}
6529	}
6530
6531	now = dtrace_gethrtime(); /* must not precede dtrace_calc_thread_recent_vtime() call! */
6532#endif /* __APPLE__ */
6533
6534#if defined(__APPLE__)
6535	/*
6536	 * A provider may call dtrace_probe_error() in lieu of dtrace_probe() in some circumstances.
6537	 * See, e.g., fasttrap_isa.c.  However, the provider has no access to ECB context, so it passes
6538	 * 0 through "arg0" and the probe_id of the overridden probe as arg1. Detect that here
6539	 * and cons up a viable state (from the probe_id).
6540	 */
6541	if (dtrace_probeid_error == id && 0 == arg0) {
6542		dtrace_id_t ftp_id = (dtrace_id_t)arg1;
6543		dtrace_probe_t *ftp_probe = dtrace_probes[ftp_id - 1];
6544		dtrace_ecb_t *ftp_ecb = ftp_probe->dtpr_ecb;
6545
6546		if (NULL != ftp_ecb) {
6547			dtrace_state_t *ftp_state = ftp_ecb->dte_state;
6548
6549			arg0 = (uint64_t)(uintptr_t)ftp_state;
6550			arg1 = ftp_ecb->dte_epid;
6551			/*
6552			 * args[2-4] established by caller.
6553			 */
6554			ftp_state->dts_arg_error_illval = -1; /* arg5 */
6555		}
6556	}
6557#endif /* __APPLE__ */
6558
6559	mstate.dtms_difo = NULL;
6560	mstate.dtms_probe = probe;
6561	mstate.dtms_strtok = NULL;
6562	mstate.dtms_arg[0] = arg0;
6563	mstate.dtms_arg[1] = arg1;
6564	mstate.dtms_arg[2] = arg2;
6565	mstate.dtms_arg[3] = arg3;
6566	mstate.dtms_arg[4] = arg4;
6567
6568	flags = (volatile uint16_t *)&cpu_core[cpuid].cpuc_dtrace_flags;
6569
6570	for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
6571		dtrace_predicate_t *pred = ecb->dte_predicate;
6572		dtrace_state_t *state = ecb->dte_state;
6573		dtrace_buffer_t *buf = &state->dts_buffer[cpuid];
6574		dtrace_buffer_t *aggbuf = &state->dts_aggbuffer[cpuid];
6575		dtrace_vstate_t *vstate = &state->dts_vstate;
6576		dtrace_provider_t *prov = probe->dtpr_provider;
6577		int committed = 0;
6578		caddr_t tomax;
6579
6580		/*
6581		 * A little subtlety with the following (seemingly innocuous)
6582		 * declaration of the automatic 'val':  by looking at the
6583		 * code, you might think that it could be declared in the
6584		 * action processing loop, below.  (That is, it's only used in
6585		 * the action processing loop.)  However, it must be declared
6586		 * out of that scope because in the case of DIF expression
6587		 * arguments to aggregating actions, one iteration of the
6588		 * action loop will use the last iteration's value.
6589		 */
6591		uint64_t val = 0;
6595
6596		mstate.dtms_present = DTRACE_MSTATE_ARGS | DTRACE_MSTATE_PROBE;
6597		*flags &= ~CPU_DTRACE_ERROR;
6598
6599		if (prov == dtrace_provider) {
6600			/*
6601			 * If dtrace itself is the provider of this probe,
6602			 * we're only going to continue processing the ECB if
6603			 * arg0 (the dtrace_state_t) is equal to the ECB's
6604			 * creating state.  (This prevents disjoint consumers
6605			 * from seeing one another's metaprobes.)
6606			 */
6607			if (arg0 != (uint64_t)(uintptr_t)state)
6608				continue;
6609		}
6610
6611		if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE) {
6612			/*
6613			 * We're not currently active.  If our provider isn't
6614			 * the dtrace pseudo provider, we're not interested.
6615			 */
6616			if (prov != dtrace_provider)
6617				continue;
6618
6619			/*
6620			 * Now we must further check if we are in the BEGIN
6621			 * probe.  If we are, we will only continue processing
6622			 * if we're still in WARMUP -- if one BEGIN enabling
6623			 * has invoked the exit() action, we don't want to
6624			 * evaluate subsequent BEGIN enablings.
6625			 */
6626			if (probe->dtpr_id == dtrace_probeid_begin &&
6627			    state->dts_activity != DTRACE_ACTIVITY_WARMUP) {
6628				ASSERT(state->dts_activity ==
6629				    DTRACE_ACTIVITY_DRAINING);
6630				continue;
6631			}
6632		}
6633
6634		if (ecb->dte_cond) {
6635			/*
6636			 * If the dte_cond bits indicate that this
6637			 * consumer is only allowed to see user-mode firings
6638			 * of this probe, call the provider's dtps_usermode()
6639			 * entry point to check that the probe was fired
6640			 * while in a user context. Skip this ECB if that's
6641			 * not the case.
6642			 */
6643			if ((ecb->dte_cond & DTRACE_COND_USERMODE) &&
6644			    prov->dtpv_pops.dtps_usermode(prov->dtpv_arg,
6645			    probe->dtpr_id, probe->dtpr_arg) == 0)
6646				continue;
6647
6648			/*
6649			 * This is more subtle than it looks. We have to be
6650			 * absolutely certain that CRED() isn't going to
6651			 * change out from under us so it's only legit to
6652			 * examine that structure if we're in constrained
6653			 * situations. Currently, the only time we'll perform
6654			 * this check is if a non-super-user has enabled the
6655			 * profile or syscall providers -- providers that
6656			 * allow visibility of all processes. For the
6657			 * profile case, the check above will ensure that
6658			 * we're examining a user context.
6659			 */
6660			if (ecb->dte_cond & DTRACE_COND_OWNER) {
6661				cred_t *cr;
6662				cred_t *s_cr =
6663				    ecb->dte_state->dts_cred.dcr_cred;
6664				proc_t *proc;
6665#pragma unused(proc) /* __APPLE__ */
6666
6667				ASSERT(s_cr != NULL);
6668
6669			/*
6670			 * XXX this is hackish, but so is setting a variable
6671			 * XXX in a McCarthy OR...
6672			 */
6673#if !defined(__APPLE__)
6674				if ((cr = CRED()) == NULL ||
6675#else
6676				if ((cr = dtrace_CRED()) == NULL ||
6677#endif /* __APPLE__ */
6678				    posix_cred_get(s_cr)->cr_uid != posix_cred_get(cr)->cr_uid ||
6679				    posix_cred_get(s_cr)->cr_uid != posix_cred_get(cr)->cr_ruid ||
6680				    posix_cred_get(s_cr)->cr_uid != posix_cred_get(cr)->cr_suid ||
6681				    posix_cred_get(s_cr)->cr_gid != posix_cred_get(cr)->cr_gid ||
6682				    posix_cred_get(s_cr)->cr_gid != posix_cred_get(cr)->cr_rgid ||
6683				    posix_cred_get(s_cr)->cr_gid != posix_cred_get(cr)->cr_sgid ||
6684#if !defined(__APPLE__)
6685				    (proc = ttoproc(curthread)) == NULL ||
6686				    (proc->p_flag & SNOCD))
6687#else
6688					1) /* Darwin omits "No Core Dump" flag. */
6689#endif /* __APPLE__ */
6690					continue;
6691			}
6692
6693			if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) {
6694				cred_t *cr;
6695				cred_t *s_cr =
6696				    ecb->dte_state->dts_cred.dcr_cred;
6697#pragma unused(cr, s_cr) /* __APPLE__ */
6698
6699				ASSERT(s_cr != NULL);
6700
6701#if !defined(__APPLE__)
6702				if ((cr = CRED()) == NULL ||
6703				    s_cr->cr_zone->zone_id !=
6704				    cr->cr_zone->zone_id)
6705					continue;
6706#else
6707				/* Darwin doesn't do zones. */
6708#endif /* __APPLE__ */
6709			}
6710		}
6711
6712		if (now - state->dts_alive > dtrace_deadman_timeout) {
6713			/*
6714			 * We seem to be dead.  Unless we (a) have kernel
6715			 * destructive permissions, (b) have explicitly enabled
6716			 * destructive actions, and (c) destructive actions have
6717			 * not been disabled, we're going to transition into
6718			 * the KILLED state, from which no further processing
6719			 * on this state will be performed.
6720			 */
6721			if (!dtrace_priv_kernel_destructive(state) ||
6722			    !state->dts_cred.dcr_destructive ||
6723			    dtrace_destructive_disallow) {
6724				void *activity = &state->dts_activity;
6725				dtrace_activity_t current;
6726
6727				do {
6728					current = state->dts_activity;
6729				} while (dtrace_cas32(activity, current,
6730				    DTRACE_ACTIVITY_KILLED) != current);
6731
6732				continue;
6733			}
6734		}
6735
6736		if ((offs = dtrace_buffer_reserve(buf, ecb->dte_needed,
6737		    ecb->dte_alignment, state, &mstate)) < 0)
6738			continue;
6739
6740		tomax = buf->dtb_tomax;
6741		ASSERT(tomax != NULL);
6742
6743		if (ecb->dte_size != 0)
6744			DTRACE_STORE(uint32_t, tomax, offs, ecb->dte_epid);
6745
6746		mstate.dtms_epid = ecb->dte_epid;
6747		mstate.dtms_present |= DTRACE_MSTATE_EPID;
6748
6749		if (state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)
6750			mstate.dtms_access = DTRACE_ACCESS_KERNEL;
6751		else
6752			mstate.dtms_access = 0;
6753
6754		if (pred != NULL) {
6755			dtrace_difo_t *dp = pred->dtp_difo;
6756			int rval;
6757
6758			rval = dtrace_dif_emulate(dp, &mstate, vstate, state);
6759
6760			if (!(*flags & CPU_DTRACE_ERROR) && !rval) {
6761				dtrace_cacheid_t cid = probe->dtpr_predcache;
6762
6763				if (cid != DTRACE_CACHEIDNONE && !onintr) {
6764					/*
6765					 * Update the predicate cache...
6766					 */
6767					ASSERT(cid == pred->dtp_cacheid);
6768#if !defined(__APPLE__)
6769					curthread->t_predcache = cid;
6770#else
6771					dtrace_set_thread_predcache(current_thread(), cid);
6772#endif /* __APPLE__ */
6773				}
6774
6775				continue;
6776			}
6777		}
6778
6779		for (act = ecb->dte_action; !(*flags & CPU_DTRACE_ERROR) &&
6780		    act != NULL; act = act->dta_next) {
6781			size_t valoffs;
6782			dtrace_difo_t *dp;
6783			dtrace_recdesc_t *rec = &act->dta_rec;
6784
6785			size = rec->dtrd_size;
6786			valoffs = offs + rec->dtrd_offset;
6787
6788			if (DTRACEACT_ISAGG(act->dta_kind)) {
6789				uint64_t v = 0xbad;
6790				dtrace_aggregation_t *agg;
6791
6792				agg = (dtrace_aggregation_t *)act;
6793
6794				if ((dp = act->dta_difo) != NULL)
6795					v = dtrace_dif_emulate(dp,
6796					    &mstate, vstate, state);
6797
6798				if (*flags & CPU_DTRACE_ERROR)
6799					continue;
6800
6801				/*
6802				 * Note that we always pass the expression
6803				 * value from the previous iteration of the
6804				 * action loop.  This value will only be used
6805				 * if there is an expression argument to the
6806				 * aggregating action, denoted by the
6807				 * dtag_hasarg field.
6808				 */
6809				dtrace_aggregate(agg, buf,
6810				    offs, aggbuf, v, val);
6811				continue;
6812			}
6813
6814			switch (act->dta_kind) {
6815			case DTRACEACT_STOP:
6816				if (dtrace_priv_proc_destructive(state))
6817					dtrace_action_stop();
6818				continue;
6819
6820			case DTRACEACT_BREAKPOINT:
6821				if (dtrace_priv_kernel_destructive(state))
6822					dtrace_action_breakpoint(ecb);
6823				continue;
6824
6825			case DTRACEACT_PANIC:
6826				if (dtrace_priv_kernel_destructive(state))
6827					dtrace_action_panic(ecb);
6828				continue;
6829
6830			case DTRACEACT_STACK:
6831				if (!dtrace_priv_kernel(state))
6832					continue;
6833
6834#if !defined(__APPLE__) /* Quiet compiler warnings */
6835				dtrace_getpcstack((pc_t *)(tomax + valoffs),
6836				    size / sizeof (pc_t), probe->dtpr_aframes,
6837				    DTRACE_ANCHORED(probe) ? NULL :
6838				    (uint32_t *)arg0);
6839#else
6840				dtrace_getpcstack((pc_t *)(tomax + valoffs),
6841				    size / sizeof (pc_t), probe->dtpr_aframes,
6842				    DTRACE_ANCHORED(probe) ? NULL :
6843				    (uint32_t *)(uintptr_t)arg0);
6844#endif /* __APPLE__ */
6845
6846				continue;
6847
6848			case DTRACEACT_JSTACK:
6849			case DTRACEACT_USTACK:
6850				if (!dtrace_priv_proc(state))
6851					continue;
6852
6853				/*
6854				 * See comment in DIF_VAR_PID.
6855				 */
6856				if (DTRACE_ANCHORED(mstate.dtms_probe) &&
6857				    CPU_ON_INTR(CPU)) {
6858					int depth = DTRACE_USTACK_NFRAMES(
6859					    rec->dtrd_arg) + 1;
6860
6861					dtrace_bzero((void *)(tomax + valoffs),
6862					    DTRACE_USTACK_STRSIZE(rec->dtrd_arg)
6863					    + depth * sizeof (uint64_t));
6864
6865					continue;
6866				}
6867
6868				if (DTRACE_USTACK_STRSIZE(rec->dtrd_arg) != 0 &&
6869				    curproc->p_dtrace_helpers != NULL) {
6870					/*
6871					 * This is the slow path -- we have
6872					 * allocated string space, and we're
6873					 * getting the stack of a process that
6874					 * has helpers.  Call into a separate
6875					 * routine to perform this processing.
6876					 */
6877					dtrace_action_ustack(&mstate, state,
6878					    (uint64_t *)(tomax + valoffs),
6879					    rec->dtrd_arg);
6880					continue;
6881				}
6882
6883				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6884				dtrace_getupcstack((uint64_t *)
6885				    (tomax + valoffs),
6886				    DTRACE_USTACK_NFRAMES(rec->dtrd_arg) + 1);
6887				DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6888				continue;
6889
6890			default:
6891				break;
6892			}
6893
6894			dp = act->dta_difo;
6895			ASSERT(dp != NULL);
6896
6897			val = dtrace_dif_emulate(dp, &mstate, vstate, state);
6898
6899			if (*flags & CPU_DTRACE_ERROR)
6900				continue;
6901
6902			switch (act->dta_kind) {
6903			case DTRACEACT_SPECULATE:
6904				ASSERT(buf == &state->dts_buffer[cpuid]);
6905				buf = dtrace_speculation_buffer(state,
6906				    cpuid, val);
6907
6908				if (buf == NULL) {
6909					*flags |= CPU_DTRACE_DROP;
6910					continue;
6911				}
6912
6913				offs = dtrace_buffer_reserve(buf,
6914				    ecb->dte_needed, ecb->dte_alignment,
6915				    state, NULL);
6916
6917				if (offs < 0) {
6918					*flags |= CPU_DTRACE_DROP;
6919					continue;
6920				}
6921
6922				tomax = buf->dtb_tomax;
6923				ASSERT(tomax != NULL);
6924
6925				if (ecb->dte_size != 0)
6926					DTRACE_STORE(uint32_t, tomax, offs,
6927					    ecb->dte_epid);
6928				continue;
6929
6930			case DTRACEACT_CHILL:
6931				if (dtrace_priv_kernel_destructive(state))
6932					dtrace_action_chill(&mstate, val);
6933				continue;
6934
6935			case DTRACEACT_RAISE:
6936				if (dtrace_priv_proc_destructive(state))
6937					dtrace_action_raise(val);
6938				continue;
6939
6940#if defined(__APPLE__)
6941			case DTRACEACT_PIDRESUME:
6942				if (dtrace_priv_proc_destructive(state))
6943					dtrace_action_pidresume(val);
6944				continue;
6945#endif /* __APPLE__ */
6946
6947			case DTRACEACT_COMMIT:
6948				ASSERT(!committed);
6949
6950				/*
6951				 * We need to commit our buffer state.
6952				 */
6953				if (ecb->dte_size)
6954					buf->dtb_offset = offs + ecb->dte_size;
6955				buf = &state->dts_buffer[cpuid];
6956				dtrace_speculation_commit(state, cpuid, val);
6957				committed = 1;
6958				continue;
6959
6960			case DTRACEACT_DISCARD:
6961				dtrace_speculation_discard(state, cpuid, val);
6962				continue;
6963
6964			case DTRACEACT_DIFEXPR:
6965			case DTRACEACT_LIBACT:
6966			case DTRACEACT_PRINTF:
6967			case DTRACEACT_PRINTA:
6968			case DTRACEACT_SYSTEM:
6969			case DTRACEACT_FREOPEN:
6970#if defined(__APPLE__)
6971			case DTRACEACT_APPLEBINARY:
6972#endif /* __APPLE__ */
6973				break;
6974
6975			case DTRACEACT_SYM:
6976			case DTRACEACT_MOD:
6977				if (!dtrace_priv_kernel(state))
6978					continue;
6979				break;
6980
6981#if !defined(__APPLE__)
6982			case DTRACEACT_USYM:
6983			case DTRACEACT_UMOD:
6984			case DTRACEACT_UADDR: {
6985				struct pid *pid = curthread->t_procp->p_pidp;
6986
6987				if (!dtrace_priv_proc(state))
6988					continue;
6989
6990				DTRACE_STORE(uint64_t, tomax,
6991				    valoffs, (uint64_t)pid->pid_id);
6992				DTRACE_STORE(uint64_t, tomax,
6993				    valoffs + sizeof (uint64_t), val);
6994
6995				continue;
6996			}
6997#else
6998			case DTRACEACT_USYM:
6999			case DTRACEACT_UMOD:
7000			case DTRACEACT_UADDR: {
7001				if (!dtrace_priv_proc(state))
7002					continue;
7003
7004				DTRACE_STORE(uint64_t, tomax,
7005				    valoffs, (uint64_t)dtrace_proc_selfpid());
7006				DTRACE_STORE(uint64_t, tomax,
7007				    valoffs + sizeof (uint64_t), val);
7008
7009				continue;
7010			}
7011#endif /* __APPLE__ */
7012
7013			case DTRACEACT_EXIT: {
7014				/*
7015				 * For the exit action, we are going to attempt
7016				 * to atomically set our activity to be
7017				 * draining.  If this fails (either because
7018				 * another CPU has beat us to the exit action,
7019				 * or because our current activity is something
7020				 * other than ACTIVE or WARMUP), we will
7021				 * continue.  This assures that the exit action
7022				 * can be successfully recorded at most once
7023				 * when we're in the ACTIVE state.  If we're
7024				 * encountering the exit() action while in
7025				 * COOLDOWN, however, we want to honor the new
7026				 * status code.  (We know that we're the only
7027				 * thread in COOLDOWN, so there is no race.)
7028				 */
7029				void *activity = &state->dts_activity;
7030				dtrace_activity_t current = state->dts_activity;
7031
7032				if (current == DTRACE_ACTIVITY_COOLDOWN)
7033					break;
7034
7035				if (current != DTRACE_ACTIVITY_WARMUP)
7036					current = DTRACE_ACTIVITY_ACTIVE;
7037
7038				if (dtrace_cas32(activity, current,
7039				    DTRACE_ACTIVITY_DRAINING) != current) {
7040					*flags |= CPU_DTRACE_DROP;
7041					continue;
7042				}
7043
7044				break;
7045			}
7046
7047			default:
7048				ASSERT(0);
7049			}
7050
7051			if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF) {
7052				uintptr_t end = valoffs + size;
7053
7054				if (!dtrace_vcanload((void *)(uintptr_t)val,
7055				    &dp->dtdo_rtype, &mstate, vstate))
7056					continue;
7057
7058				/*
7059				 * If this is a string, we're going to only
7060				 * load until we find the zero byte -- after
7061				 * which we'll store zero bytes.
7062				 */
7063				if (dp->dtdo_rtype.dtdt_kind ==
7064				    DIF_TYPE_STRING) {
7065					char c = '\0' + 1;
7066					int intuple = act->dta_intuple;
7067					size_t s;
7068
7069					for (s = 0; s < size; s++) {
7070						if (c != '\0')
7071							c = dtrace_load8(val++);
7072
7073						DTRACE_STORE(uint8_t, tomax,
7074						    valoffs++, c);
7075
7076						if (c == '\0' && intuple)
7077							break;
7078					}
7079
7080					continue;
7081				}
7082
7083				while (valoffs < end) {
7084					DTRACE_STORE(uint8_t, tomax, valoffs++,
7085					    dtrace_load8(val++));
7086				}
7087
7088				continue;
7089			}
7090
7091			switch (size) {
7092			case 0:
7093				break;
7094
7095			case sizeof (uint8_t):
7096				DTRACE_STORE(uint8_t, tomax, valoffs, val);
7097				break;
7098			case sizeof (uint16_t):
7099				DTRACE_STORE(uint16_t, tomax, valoffs, val);
7100				break;
7101			case sizeof (uint32_t):
7102				DTRACE_STORE(uint32_t, tomax, valoffs, val);
7103				break;
7104			case sizeof (uint64_t):
7105				DTRACE_STORE(uint64_t, tomax, valoffs, val);
7106				break;
7107			default:
7108				/*
7109				 * Any other size should have been returned by
7110				 * reference, not by value.
7111				 */
7112				ASSERT(0);
7113				break;
7114			}
7115		}
7116
7117		if (*flags & CPU_DTRACE_DROP)
7118			continue;
7119
7120		if (*flags & CPU_DTRACE_FAULT) {
7121			int ndx;
7122			dtrace_action_t *err;
7123
7124			buf->dtb_errors++;
7125
7126			if (probe->dtpr_id == dtrace_probeid_error) {
7127				/*
7128				 * There's nothing we can do -- we had an
7129				 * error on the error probe.  We bump an
7130				 * error counter to at least indicate that
7131				 * this condition happened.
7132				 */
7133				dtrace_error(&state->dts_dblerrors);
7134				continue;
7135			}
7136
7137			if (vtime) {
7138				/*
7139				 * Before recursing on dtrace_probe(), we
7140				 * need to explicitly clear out our start
7141				 * time to prevent it from being accumulated
7142				 * into t_dtrace_vtime.
7143				 */
7144#if !defined(__APPLE__)
7145				curthread->t_dtrace_start = 0;
7146#else
7147				/* Set the sign bit on t_dtrace_tracing to suspend accumulation to it. */
7148				dtrace_set_thread_tracing(current_thread(),
7149							(1ULL<<63) | dtrace_get_thread_tracing(current_thread()));
7150#endif /* __APPLE__ */
7151			}
7152
7153			/*
7154			 * Iterate over the actions to figure out which action
7155			 * we were processing when we experienced the error.
7156			 * Note that act points _past_ the faulting action; if
7157			 * act is ecb->dte_action, the fault was in the
7158			 * predicate, if it's ecb->dte_action->dta_next it's
7159			 * in action #1, and so on.
7160			 */
7161			for (err = ecb->dte_action, ndx = 0;
7162			    err != act; err = err->dta_next, ndx++)
7163				continue;
7164
7165			dtrace_probe_error(state, ecb->dte_epid, ndx,
7166			    (mstate.dtms_present & DTRACE_MSTATE_FLTOFFS) ?
7167			    mstate.dtms_fltoffs : -1, DTRACE_FLAGS2FLT(*flags),
7168			    cpu_core[cpuid].cpuc_dtrace_illval);
7169
7170			continue;
7171		}
7172
7173		if (!committed)
7174			buf->dtb_offset = offs + ecb->dte_size;
7175	}
7176
7177#if !defined(__APPLE__)
7178	if (vtime)
7179		curthread->t_dtrace_start = dtrace_gethrtime();
7180#else
7181	/* FIXME: the time spent leaving DTrace from this point to the rti is attributed
7182	   to the current thread. Instead it should accrue to DTrace. */
7183	if (vtime) {
7184		thread_t thread = current_thread();
7185		int64_t t = dtrace_get_thread_tracing(thread);
7186
7187		if (t >= 0) {
7188			/* Usual case, accumulate time spent here into t_dtrace_tracing */
7189			dtrace_set_thread_tracing(thread, t + (dtrace_gethrtime() - now));
7190		} else {
7191			/* Return from error recursion. No accumulation, just clear the sign bit on t_dtrace_tracing. */
7192			dtrace_set_thread_tracing(thread, (~(1ULL<<63)) & t);
7193		}
7194	}
7195#endif /* __APPLE__ */
7196
7197	dtrace_interrupt_enable(cookie);
7198}
7199
7200#if defined(__APPLE__)
7201/* Don't allow a thread to re-enter dtrace_probe(). This could occur if a probe is encountered
7202   on some function in the transitive closure of the call to dtrace_probe(). Solaris has some
7203   strong guarantees that this won't happen; the Darwin implementation is not yet mature enough
7204   to make those guarantees. */
7205
7206void
7207dtrace_probe(dtrace_id_t id, uint64_t arg0, uint64_t arg1,
7208    uint64_t arg2, uint64_t arg3, uint64_t arg4)
7209{
7210	thread_t thread = current_thread();
7211	disable_preemption();
7212	if (id == dtrace_probeid_error) {
7213		__dtrace_probe(id, arg0, arg1, arg2, arg3, arg4);
7214		dtrace_getipl(); /* Defeat tail-call optimization of __dtrace_probe() */
7215	} else if (!dtrace_get_thread_reentering(thread)) {
7216		dtrace_set_thread_reentering(thread, TRUE);
7217		__dtrace_probe(id, arg0, arg1, arg2, arg3, arg4);
7218		dtrace_set_thread_reentering(thread, FALSE);
7219	}
7220#if DEBUG
7221	else __dtrace_probe(dtrace_probeid_error, 0, id, 1, -1, DTRACEFLT_UNKNOWN);
7222#endif
7223	enable_preemption();
7224}
7225#endif /* __APPLE__ */
7226
7227/*
7228 * DTrace Probe Hashing Functions
7229 *
7230 * The functions in this section (and indeed, the functions in remaining
7231 * sections) are not _called_ from probe context.  (Any exceptions to this are
7232 * marked with a "Note:".)  Rather, they are called from elsewhere in the
7233 * DTrace framework to look-up probes in, add probes to and remove probes from
7234 * the DTrace probe hashes.  (Each probe is hashed by each element of the
7235 * probe tuple -- allowing for fast lookups, regardless of what was
7236 * specified.)
7237 */
7238static uint_t
7239#if !defined(__APPLE__)  /* Quiet compiler warnings */
7240dtrace_hash_str(char *p)
7241#else
7242dtrace_hash_str(const char *p)
7243#endif /* __APPLE__ */
7244{
7245	unsigned int g;
7246	uint_t hval = 0;
7247
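	/*
	 * Classic shift-and-XOR string hash: fold any bits that reach the
	 * top nibble back into the accumulator so that long strings do not
	 * simply shift earlier characters out of the hash.
	 */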
7248	while (*p) {
7249		hval = (hval << 4) + *p++;
7250		if ((g = (hval & 0xf0000000)) != 0)
7251			hval ^= g >> 24;
7252		hval &= ~g;
7253	}
7254	return (hval);
7255}
7256
7257static dtrace_hash_t *
7258dtrace_hash_create(uintptr_t stroffs, uintptr_t nextoffs, uintptr_t prevoffs)
7259{
7260	dtrace_hash_t *hash = kmem_zalloc(sizeof (dtrace_hash_t), KM_SLEEP);
7261
7262	hash->dth_stroffs = stroffs;
7263	hash->dth_nextoffs = nextoffs;
7264	hash->dth_prevoffs = prevoffs;
7265
7266	hash->dth_size = 1;
7267	hash->dth_mask = hash->dth_size - 1;
7268
7269	hash->dth_tab = kmem_zalloc(hash->dth_size *
7270	    sizeof (dtrace_hashbucket_t *), KM_SLEEP);
7271
7272	return (hash);
7273}
7274
7275#if !defined(__APPLE__) /* Unused. Quiet compiler warning. */
7276static void
7277dtrace_hash_destroy(dtrace_hash_t *hash)
7278{
7279#if DEBUG
7280	int i;
7281
7282	for (i = 0; i < hash->dth_size; i++)
7283		ASSERT(hash->dth_tab[i] == NULL);
7284#endif
7285
7286	kmem_free(hash->dth_tab,
7287	    hash->dth_size * sizeof (dtrace_hashbucket_t *));
7288	kmem_free(hash, sizeof (dtrace_hash_t));
7289}
7290#endif /* __APPLE__ */
7291
7292static void
7293dtrace_hash_resize(dtrace_hash_t *hash)
7294{
7295	int size = hash->dth_size, i, ndx;
7296	int new_size = hash->dth_size << 1;
7297	int new_mask = new_size - 1;
7298	dtrace_hashbucket_t **new_tab, *bucket, *next;
7299
7300	ASSERT((new_size & new_mask) == 0);
7301
7302	new_tab = kmem_zalloc(new_size * sizeof (void *), KM_SLEEP);
7303
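	/*
	 * Rehash each existing bucket into the doubled table; only the
	 * bucket headers are relinked -- the probe chains they anchor are
	 * left untouched.
	 */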
7304	for (i = 0; i < size; i++) {
7305		for (bucket = hash->dth_tab[i]; bucket != NULL; bucket = next) {
7306			dtrace_probe_t *probe = bucket->dthb_chain;
7307
7308			ASSERT(probe != NULL);
7309			ndx = DTRACE_HASHSTR(hash, probe) & new_mask;
7310
7311			next = bucket->dthb_next;
7312			bucket->dthb_next = new_tab[ndx];
7313			new_tab[ndx] = bucket;
7314		}
7315	}
7316
7317	kmem_free(hash->dth_tab, hash->dth_size * sizeof (void *));
7318	hash->dth_tab = new_tab;
7319	hash->dth_size = new_size;
7320	hash->dth_mask = new_mask;
7321}
7322
7323static void
7324dtrace_hash_add(dtrace_hash_t *hash, dtrace_probe_t *new)
7325{
7326	int hashval = DTRACE_HASHSTR(hash, new);
7327	int ndx = hashval & hash->dth_mask;
7328	dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7329	dtrace_probe_t **nextp, **prevp;
7330
7331	for (; bucket != NULL; bucket = bucket->dthb_next) {
7332		if (DTRACE_HASHEQ(hash, bucket->dthb_chain, new))
7333			goto add;
7334	}
7335
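	/*
	 * No bucket matches this probe's hashed element.  If we already have
	 * more than twice as many buckets as table slots, double the table
	 * and retry the insertion; otherwise create a new bucket here.
	 */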
7336	if ((hash->dth_nbuckets >> 1) > hash->dth_size) {
7337		dtrace_hash_resize(hash);
7338		dtrace_hash_add(hash, new);
7339		return;
7340	}
7341
7342	bucket = kmem_zalloc(sizeof (dtrace_hashbucket_t), KM_SLEEP);
7343	bucket->dthb_next = hash->dth_tab[ndx];
7344	hash->dth_tab[ndx] = bucket;
7345	hash->dth_nbuckets++;
7346
7347add:
7348	nextp = DTRACE_HASHNEXT(hash, new);
7349	ASSERT(*nextp == NULL && *(DTRACE_HASHPREV(hash, new)) == NULL);
7350	*nextp = bucket->dthb_chain;
7351
7352	if (bucket->dthb_chain != NULL) {
7353		prevp = DTRACE_HASHPREV(hash, bucket->dthb_chain);
7354		ASSERT(*prevp == NULL);
7355		*prevp = new;
7356	}
7357
7358	bucket->dthb_chain = new;
7359	bucket->dthb_len++;
7360}
7361
7362static dtrace_probe_t *
7363dtrace_hash_lookup(dtrace_hash_t *hash, dtrace_probe_t *template)
7364{
7365	int hashval = DTRACE_HASHSTR(hash, template);
7366	int ndx = hashval & hash->dth_mask;
7367	dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7368
7369	for (; bucket != NULL; bucket = bucket->dthb_next) {
7370		if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
7371			return (bucket->dthb_chain);
7372	}
7373
7374	return (NULL);
7375}
7376
7377static int
7378dtrace_hash_collisions(dtrace_hash_t *hash, dtrace_probe_t *template)
7379{
7380	int hashval = DTRACE_HASHSTR(hash, template);
7381	int ndx = hashval & hash->dth_mask;
7382	dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7383
7384	for (; bucket != NULL; bucket = bucket->dthb_next) {
7385		if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
7386			return (bucket->dthb_len);
7387	}
7388
7389	return (0);
7390}
7391
7392static void
7393dtrace_hash_remove(dtrace_hash_t *hash, dtrace_probe_t *probe)
7394{
7395	int ndx = DTRACE_HASHSTR(hash, probe) & hash->dth_mask;
7396	dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7397
7398	dtrace_probe_t **prevp = DTRACE_HASHPREV(hash, probe);
7399	dtrace_probe_t **nextp = DTRACE_HASHNEXT(hash, probe);
7400
7401	/*
7402	 * Find the bucket that we're removing this probe from.
7403	 */
7404	for (; bucket != NULL; bucket = bucket->dthb_next) {
7405		if (DTRACE_HASHEQ(hash, bucket->dthb_chain, probe))
7406			break;
7407	}
7408
7409	ASSERT(bucket != NULL);
7410
7411	if (*prevp == NULL) {
7412		if (*nextp == NULL) {
7413			/*
7414			 * The removed probe was the only probe on this
7415			 * bucket; we need to remove the bucket.
7416			 */
7417			dtrace_hashbucket_t *b = hash->dth_tab[ndx];
7418
7419			ASSERT(bucket->dthb_chain == probe);
7420			ASSERT(b != NULL);
7421
7422			if (b == bucket) {
7423				hash->dth_tab[ndx] = bucket->dthb_next;
7424			} else {
7425				while (b->dthb_next != bucket)
7426					b = b->dthb_next;
7427				b->dthb_next = bucket->dthb_next;
7428			}
7429
7430			ASSERT(hash->dth_nbuckets > 0);
7431			hash->dth_nbuckets--;
7432			kmem_free(bucket, sizeof (dtrace_hashbucket_t));
7433			return;
7434		}
7435
7436		bucket->dthb_chain = *nextp;
7437	} else {
7438		*(DTRACE_HASHNEXT(hash, *prevp)) = *nextp;
7439	}
7440
7441	if (*nextp != NULL)
7442		*(DTRACE_HASHPREV(hash, *nextp)) = *prevp;
7443}
7444
7445/*
7446 * DTrace Utility Functions
7447 *
7448 * These are random utility functions that are _not_ called from probe context.
7449 */
7450static int
7451dtrace_badattr(const dtrace_attribute_t *a)
7452{
7453	return (a->dtat_name > DTRACE_STABILITY_MAX ||
7454	    a->dtat_data > DTRACE_STABILITY_MAX ||
7455	    a->dtat_class > DTRACE_CLASS_MAX);
7456}
7457
7458/*
7459 * Return a duplicate copy of a string.  If the specified string is NULL,
7460 * this function returns a zero-length string.
7461 */
7462#if !defined(__APPLE__)
7463static char *
7464dtrace_strdup(const char *str)
7465{
7466	char *new = kmem_zalloc((str != NULL ? strlen(str) : 0) + 1, KM_SLEEP);
7467
7468	if (str != NULL)
7469		(void) strcpy(new, str);
7470
7471	return (new);
7472}
7473#else /* Employ size bounded string operation. */
7474static char *
7475dtrace_strdup(const char *str)
7476{
7477	size_t bufsize = (str != NULL ? strlen(str) : 0) + 1;
7478	char *new = kmem_zalloc(bufsize, KM_SLEEP);
7479
7480	if (str != NULL)
7481		(void) strlcpy(new, str, bufsize);
7482
7483	return (new);
7484}
7485#endif /* __APPLE__ */
7486
7487#define	DTRACE_ISALPHA(c)	\
7488	(((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z'))
7489
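/*
 * A name is "bad" unless it starts with a letter, '-', '_' or '.' and
 * contains only letters, digits, '-', '_', '.' and '`' thereafter.
 */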
7490static int
7491dtrace_badname(const char *s)
7492{
7493	char c;
7494
7495	if (s == NULL || (c = *s++) == '\0')
7496		return (0);
7497
7498	if (!DTRACE_ISALPHA(c) && c != '-' && c != '_' && c != '.')
7499		return (1);
7500
7501	while ((c = *s++) != '\0') {
7502		if (!DTRACE_ISALPHA(c) && (c < '0' || c > '9') &&
7503		    c != '-' && c != '_' && c != '.' && c != '`')
7504			return (1);
7505	}
7506
7507	return (0);
7508}
7509
7510static void
7511dtrace_cred2priv(cred_t *cr, uint32_t *privp, uid_t *uidp, zoneid_t *zoneidp)
7512{
7513	uint32_t priv;
7514
7515	if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
7516		/*
7517		 * For DTRACE_PRIV_ALL, the uid and zoneid don't matter.
7518		 */
7519		priv = DTRACE_PRIV_ALL;
7520	} else {
7521		*uidp = crgetuid(cr);
7522		*zoneidp = crgetzoneid(cr);
7523
7524		priv = 0;
7525		if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE))
7526			priv |= DTRACE_PRIV_KERNEL | DTRACE_PRIV_USER;
7527		else if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE))
7528			priv |= DTRACE_PRIV_USER;
7529		if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE))
7530			priv |= DTRACE_PRIV_PROC;
7531		if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
7532			priv |= DTRACE_PRIV_OWNER;
7533		if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
7534			priv |= DTRACE_PRIV_ZONEOWNER;
7535	}
7536
7537	*privp = priv;
7538}
7539
7540#ifdef DTRACE_ERRDEBUG
7541static void
7542dtrace_errdebug(const char *str)
7543{
7544#if !defined(__APPLE__) /* Quiet compiler warnings */
7545	int hval = dtrace_hash_str((char *)str) % DTRACE_ERRHASHSZ;
7546#else
7547	int hval = dtrace_hash_str(str) % DTRACE_ERRHASHSZ;
7548#endif /* __APPLE__ */
7549	int occupied = 0;
7550
7551	lck_mtx_lock(&dtrace_errlock);
7552	dtrace_errlast = str;
7553#if !defined(__APPLE__)
7554	dtrace_errthread = curthread;
7555#else
7556	dtrace_errthread = (kthread_t *)current_thread();
7557#endif /* __APPLE__ */
7558
7559	while (occupied++ < DTRACE_ERRHASHSZ) {
7560		if (dtrace_errhash[hval].dter_msg == str) {
7561			dtrace_errhash[hval].dter_count++;
7562			goto out;
7563		}
7564
7565		if (dtrace_errhash[hval].dter_msg != NULL) {
7566			hval = (hval + 1) % DTRACE_ERRHASHSZ;
7567			continue;
7568		}
7569
7570		dtrace_errhash[hval].dter_msg = str;
7571		dtrace_errhash[hval].dter_count = 1;
7572		goto out;
7573	}
7574
7575	panic("dtrace: undersized error hash");
7576out:
7577	lck_mtx_unlock(&dtrace_errlock);
7578}
7579#endif
7580
7581/*
7582 * DTrace Matching Functions
7583 *
7584 * These functions are used to match groups of probes, given some elements of
7585 * a probe tuple, or some globbed expressions for elements of a probe tuple.
7586 */
7587static int
7588dtrace_match_priv(const dtrace_probe_t *prp, uint32_t priv, uid_t uid,
7589    zoneid_t zoneid)
7590{
7591	if (priv != DTRACE_PRIV_ALL) {
7592		uint32_t ppriv = prp->dtpr_provider->dtpv_priv.dtpp_flags;
7593		uint32_t match = priv & ppriv;
7594
7595		/*
7596		 * No PRIV_DTRACE_* privileges...
7597		 */
7598		if ((priv & (DTRACE_PRIV_PROC | DTRACE_PRIV_USER |
7599		    DTRACE_PRIV_KERNEL)) == 0)
7600			return (0);
7601
7602		/*
7603		 * No matching bits, but there were bits to match...
7604		 */
7605		if (match == 0 && ppriv != 0)
7606			return (0);
7607
7608		/*
7609		 * Need to have permissions to the process, but don't...
7610		 */
7611		if (((ppriv & ~match) & DTRACE_PRIV_OWNER) != 0 &&
7612		    uid != prp->dtpr_provider->dtpv_priv.dtpp_uid) {
7613			return (0);
7614		}
7615
7616		/*
7617		 * Need to be in the same zone unless we possess the
7618		 * privilege to examine all zones.
7619		 */
7620		if (((ppriv & ~match) & DTRACE_PRIV_ZONEOWNER) != 0 &&
7621		    zoneid != prp->dtpr_provider->dtpv_priv.dtpp_zoneid) {
7622			return (0);
7623		}
7624	}
7625
7626	return (1);
7627}
7628
7629/*
7630 * dtrace_match_probe compares a dtrace_probe_t to a pre-compiled key, which
7631 * consists of input pattern strings and an ops-vector to evaluate them.
7632 * This function returns >0 for match, 0 for no match, and <0 for error.
7633 */
7634static int
7635dtrace_match_probe(const dtrace_probe_t *prp, const dtrace_probekey_t *pkp,
7636    uint32_t priv, uid_t uid, zoneid_t zoneid)
7637{
7638	dtrace_provider_t *pvp = prp->dtpr_provider;
7639	int rv;
7640
7641	if (pvp->dtpv_defunct)
7642		return (0);
7643
7644	if ((rv = pkp->dtpk_pmatch(pvp->dtpv_name, pkp->dtpk_prov, 0)) <= 0)
7645		return (rv);
7646
7647	if ((rv = pkp->dtpk_mmatch(prp->dtpr_mod, pkp->dtpk_mod, 0)) <= 0)
7648		return (rv);
7649
7650	if ((rv = pkp->dtpk_fmatch(prp->dtpr_func, pkp->dtpk_func, 0)) <= 0)
7651		return (rv);
7652
7653	if ((rv = pkp->dtpk_nmatch(prp->dtpr_name, pkp->dtpk_name, 0)) <= 0)
7654		return (rv);
7655
7656	if (dtrace_match_priv(prp, priv, uid, zoneid) == 0)
7657		return (0);
7658
7659	return (rv);
7660}
7661
7662/*
7663 * dtrace_match_glob() is a safe kernel implementation of the gmatch(3GEN)
7664 * interface for matching a glob pattern 'p' to an input string 's'.  Unlike
7665 * libc's version, the kernel version only applies to 8-bit ASCII strings.
7666 * In addition, all of the recursion cases except for '*' matching have been
7667 * unwound.  For '*', we still implement recursive evaluation, but a depth
7668 * counter is maintained and matching is aborted if we recurse too deep.
7669 * The function returns 0 if no match, >0 if match, and <0 if recursion error.
7670 */
7671static int
7672dtrace_match_glob(const char *s, const char *p, int depth)
7673{
7674	const char *olds;
7675	char s1, c;
7676	int gs;
7677
7678	if (depth > DTRACE_PROBEKEY_MAXDEPTH)
7679		return (-1);
7680
7681	if (s == NULL)
7682		s = ""; /* treat NULL as empty string */
7683
7684top:
7685	olds = s;
7686	s1 = *s++;
7687
7688	if (p == NULL)
7689		return (0);
7690
7691	if ((c = *p++) == '\0')
7692		return (s1 == '\0');
7693
7694	switch (c) {
7695	case '[': {
7696		int ok = 0, notflag = 0;
7697		char lc = '\0';
7698
7699		if (s1 == '\0')
7700			return (0);
7701
7702		if (*p == '!') {
7703			notflag = 1;
7704			p++;
7705		}
7706
7707		if ((c = *p++) == '\0')
7708			return (0);
7709
7710		do {
7711			if (c == '-' && lc != '\0' && *p != ']') {
7712				if ((c = *p++) == '\0')
7713					return (0);
7714				if (c == '\\' && (c = *p++) == '\0')
7715					return (0);
7716
7717				if (notflag) {
7718					if (s1 < lc || s1 > c)
7719						ok++;
7720					else
7721						return (0);
7722				} else if (lc <= s1 && s1 <= c)
7723					ok++;
7724
7725			} else if (c == '\\' && (c = *p++) == '\0')
7726				return (0);
7727
7728			lc = c; /* save left-hand 'c' for next iteration */
7729
7730			if (notflag) {
7731				if (s1 != c)
7732					ok++;
7733				else
7734					return (0);
7735			} else if (s1 == c)
7736				ok++;
7737
7738			if ((c = *p++) == '\0')
7739				return (0);
7740
7741		} while (c != ']');
7742
7743		if (ok)
7744			goto top;
7745
7746		return (0);
7747	}
7748
7749	case '\\':
7750		if ((c = *p++) == '\0')
7751			return (0);
7752		/*FALLTHRU*/
7753
7754	default:
7755		if (c != s1)
7756			return (0);
7757		/*FALLTHRU*/
7758
7759	case '?':
7760		if (s1 != '\0')
7761			goto top;
7762		return (0);
7763
7764	case '*':
7765		while (*p == '*')
7766			p++; /* consecutive *'s are identical to a single one */
7767
7768		if (*p == '\0')
7769			return (1);
7770
7771		for (s = olds; *s != '\0'; s++) {
7772			if ((gs = dtrace_match_glob(s, p, depth + 1)) != 0)
7773				return (gs);
7774		}
7775
7776		return (0);
7777	}
7778}
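
/*
 * Illustrative sketch (not part of the build): sample evaluations of
 * dtrace_match_glob() under the rules above, with the depth counter
 * starting at zero:
 *
 *	dtrace_match_glob("read", "read", 0)	!= 0	(exact match)
 *	dtrace_match_glob("readv", "read*", 0)	!= 0	('*' matches "v")
 *	dtrace_match_glob("read", "r?a[cd]", 0)	!= 0	(class matches 'd')
 *	dtrace_match_glob("write", "read*", 0)	== 0	(no match)
 *	dtrace_match_glob(NULL, "*", 0)		!= 0	(NULL treated as "")
 */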
7779
7780/*ARGSUSED*/
7781static int
7782dtrace_match_string(const char *s, const char *p, int depth)
7783{
7784#pragma unused(depth) /* __APPLE__ */
7785#if !defined(__APPLE__)
7786	return (s != NULL && strcmp(s, p) == 0);
7787#else /* Employ size bounded string operation. */
7788	return (s != NULL && strncmp(s, p, strlen(s) + 1) == 0);
7789#endif /* __APPLE__ */
7790}
7791
7792/*ARGSUSED*/
7793static int
7794dtrace_match_nul(const char *s, const char *p, int depth)
7795{
7796#pragma unused(s, p, depth) /* __APPLE__ */
7797	return (1); /* always match the empty pattern */
7798}
7799
7800/*ARGSUSED*/
7801static int
7802dtrace_match_nonzero(const char *s, const char *p, int depth)
7803{
7804#pragma unused(p, depth) /* __APPLE__ */
7805	return (s != NULL && s[0] != '\0');
7806}
7807
7808static int
7809dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid,
7810    zoneid_t zoneid, int (*matched)(dtrace_probe_t *, void *), void *arg)
7811{
7812	dtrace_probe_t template, *probe;
7813	dtrace_hash_t *hash = NULL;
7814	int len, rc, best = INT_MAX, nmatched = 0;
7815	dtrace_id_t i;
7816
7817	lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
7818
7819	/*
7820	 * If the probe ID is specified in the key, just lookup by ID and
7821	 * invoke the match callback once if a matching probe is found.
7822	 */
7823	if (pkp->dtpk_id != DTRACE_IDNONE) {
7824		if ((probe = dtrace_probe_lookup_id(pkp->dtpk_id)) != NULL &&
7825		    dtrace_match_probe(probe, pkp, priv, uid, zoneid) > 0) {
7826			if ((*matched)(probe, arg) == DTRACE_MATCH_FAIL)
7827				return (DTRACE_MATCH_FAIL);
7828			nmatched++;
7829		}
7830		return (nmatched);
7831	}
7832
7833#if !defined(__APPLE__)   /* Quiet compiler warnings */
7834	template.dtpr_mod = (char *)pkp->dtpk_mod;
7835	template.dtpr_func = (char *)pkp->dtpk_func;
7836	template.dtpr_name = (char *)pkp->dtpk_name;
7837#else
7838	template.dtpr_mod =  (char *)(uintptr_t)pkp->dtpk_mod;
7839	template.dtpr_func = (char *)(uintptr_t)pkp->dtpk_func;
7840	template.dtpr_name = (char *)(uintptr_t)pkp->dtpk_name;
7841#endif /* __APPLE__ */
7842
7843	/*
7844	 * We want to find the most distinct of the module name, function
7845	 * name, and name.  So for each one that is not a glob pattern or
7846	 * empty string, we perform a lookup in the corresponding hash and
7847	 * use the hash table with the fewest collisions to do our search.
7848	 */
7849	if (pkp->dtpk_mmatch == &dtrace_match_string &&
7850	    (len = dtrace_hash_collisions(dtrace_bymod, &template)) < best) {
7851		best = len;
7852		hash = dtrace_bymod;
7853	}
7854
7855	if (pkp->dtpk_fmatch == &dtrace_match_string &&
7856	    (len = dtrace_hash_collisions(dtrace_byfunc, &template)) < best) {
7857		best = len;
7858		hash = dtrace_byfunc;
7859	}
7860
7861	if (pkp->dtpk_nmatch == &dtrace_match_string &&
7862	    (len = dtrace_hash_collisions(dtrace_byname, &template)) < best) {
7863		best = len;
7864		hash = dtrace_byname;
7865	}
7866
7867	/*
7868	 * If we did not select a hash table, iterate over every probe and
7869	 * invoke our callback for each one that matches our input probe key.
7870	 */
7871	if (hash == NULL) {
7872#if !defined(__APPLE__)  /* Quiet compiler warning */
7873		for (i = 0; i < dtrace_nprobes; i++) {
7874#else
7875		for (i = 0; i < (dtrace_id_t)dtrace_nprobes; i++) {
7876#endif /* __APPLE__ */
7877			if ((probe = dtrace_probes[i]) == NULL ||
7878			    dtrace_match_probe(probe, pkp, priv, uid,
7879			    zoneid) <= 0)
7880				continue;
7881
7882			nmatched++;
7883
7884			if ((rc = (*matched)(probe, arg)) != DTRACE_MATCH_NEXT) {
7885				if (rc == DTRACE_MATCH_FAIL)
7886					return (DTRACE_MATCH_FAIL);
7887				break;
7888			}
7889		}
7890
7891		return (nmatched);
7892	}
7893
7894	/*
7895	 * If we selected a hash table, iterate over each probe of the same key
7896	 * name and invoke the callback for every probe that matches the other
7897	 * attributes of our input probe key.
7898	 */
7899	for (probe = dtrace_hash_lookup(hash, &template); probe != NULL;
7900	    probe = *(DTRACE_HASHNEXT(hash, probe))) {
7901
7902		if (dtrace_match_probe(probe, pkp, priv, uid, zoneid) <= 0)
7903			continue;
7904
7905		nmatched++;
7906
7907		if ((rc = (*matched)(probe, arg)) != DTRACE_MATCH_NEXT) {
7908			if (rc == DTRACE_MATCH_FAIL)
7909				return (DTRACE_MATCH_FAIL);
7910			break;
7911		}
7912	}
7913
7914	return (nmatched);
7915}
7916
7917/*
7918 * Return the function pointer dtrace_probekey() should install to compare the
7919 * specified pattern with a string.  For NULL or empty patterns, we select
7920 * dtrace_match_nul().  For glob pattern strings, we use dtrace_match_glob().
7921 * For non-empty non-glob strings, we use dtrace_match_string().
7922 */
7923static dtrace_probekey_f *
7924dtrace_probekey_func(const char *p)
7925{
7926	char c;
7927
7928	if (p == NULL || *p == '\0')
7929		return (&dtrace_match_nul);
7930
7931	while ((c = *p++) != '\0') {
7932		if (c == '[' || c == '?' || c == '*' || c == '\\')
7933			return (&dtrace_match_glob);
7934	}
7935
7936	return (&dtrace_match_string);
7937}
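
/*
 * Illustrative sketch (not part of the build): the matcher selected for a
 * few representative patterns under the rules above:
 *
 *	dtrace_probekey_func(NULL)	== &dtrace_match_nul
 *	dtrace_probekey_func("")	== &dtrace_match_nul
 *	dtrace_probekey_func("syscall")	== &dtrace_match_string
 *	dtrace_probekey_func("read*")	== &dtrace_match_glob
 */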
7938
7939/*
7940 * Build a probe comparison key for use with dtrace_match_probe() from the
7941 * given probe description.  By convention, a null key only matches anchored
7942 * probes: if each field is the empty string, reset dtpk_fmatch to
7943 * dtrace_match_nonzero().
7944 */
7945static void
7946dtrace_probekey(const dtrace_probedesc_t *pdp, dtrace_probekey_t *pkp)
7947{
7948	pkp->dtpk_prov = pdp->dtpd_provider;
7949	pkp->dtpk_pmatch = dtrace_probekey_func(pdp->dtpd_provider);
7950
7951	pkp->dtpk_mod = pdp->dtpd_mod;
7952	pkp->dtpk_mmatch = dtrace_probekey_func(pdp->dtpd_mod);
7953
7954	pkp->dtpk_func = pdp->dtpd_func;
7955	pkp->dtpk_fmatch = dtrace_probekey_func(pdp->dtpd_func);
7956
7957	pkp->dtpk_name = pdp->dtpd_name;
7958	pkp->dtpk_nmatch = dtrace_probekey_func(pdp->dtpd_name);
7959
7960	pkp->dtpk_id = pdp->dtpd_id;
7961
7962	if (pkp->dtpk_id == DTRACE_IDNONE &&
7963	    pkp->dtpk_pmatch == &dtrace_match_nul &&
7964	    pkp->dtpk_mmatch == &dtrace_match_nul &&
7965	    pkp->dtpk_fmatch == &dtrace_match_nul &&
7966	    pkp->dtpk_nmatch == &dtrace_match_nul)
7967		pkp->dtpk_fmatch = &dtrace_match_nonzero;
7968}
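
/*
 * Illustrative sketch (not part of the build), assuming a caller-supplied
 * dtrace_probedesc_t 'desc' that names only a provider:
 *
 *	dtrace_probekey_t pkey;
 *
 *	(void) strlcpy(desc.dtpd_provider, "syscall", DTRACE_PROVNAMELEN);
 *	desc.dtpd_mod[0] = desc.dtpd_func[0] = desc.dtpd_name[0] = '\0';
 *	desc.dtpd_id = DTRACE_IDNONE;
 *
 *	dtrace_probekey(&desc, &pkey);
 *	ASSERT(pkey.dtpk_pmatch == &dtrace_match_string);
 *	ASSERT(pkey.dtpk_fmatch == &dtrace_match_nul);
 */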
7969
7970/*
7971 * DTrace Provider-to-Framework API Functions
7972 *
7973 * These functions implement much of the Provider-to-Framework API, as
7974 * described in <sys/dtrace.h>.  The parts of the API not in this section are
7975 * the functions in the API for probe management (found below), and
7976 * dtrace_probe() itself (found above).
7977 */
7978
7979/*
7980 * Register the calling provider with the DTrace framework.  This should
7981 * generally be called by DTrace providers in their attach(9E) entry point.
7982 */
7983int
7984dtrace_register(const char *name, const dtrace_pattr_t *pap, uint32_t priv,
7985    cred_t *cr, const dtrace_pops_t *pops, void *arg, dtrace_provider_id_t *idp)
7986{
7987	dtrace_provider_t *provider;
7988
7989	if (name == NULL || pap == NULL || pops == NULL || idp == NULL) {
7990		cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7991		    "arguments", name ? name : "<NULL>");
7992		return (EINVAL);
7993	}
7994
7995	if (name[0] == '\0' || dtrace_badname(name)) {
7996		cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7997		    "provider name", name);
7998		return (EINVAL);
7999	}
8000
8001	if ((pops->dtps_provide == NULL && pops->dtps_provide_module == NULL) ||
8002	    pops->dtps_enable == NULL || pops->dtps_disable == NULL ||
8003	    pops->dtps_destroy == NULL ||
8004	    ((pops->dtps_resume == NULL) != (pops->dtps_suspend == NULL))) {
8005		cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8006		    "provider ops", name);
8007		return (EINVAL);
8008	}
8009
8010	if (dtrace_badattr(&pap->dtpa_provider) ||
8011	    dtrace_badattr(&pap->dtpa_mod) ||
8012	    dtrace_badattr(&pap->dtpa_func) ||
8013	    dtrace_badattr(&pap->dtpa_name) ||
8014	    dtrace_badattr(&pap->dtpa_args)) {
8015		cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8016		    "provider attributes", name);
8017		return (EINVAL);
8018	}
8019
8020	if (priv & ~DTRACE_PRIV_ALL) {
8021		cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8022		    "privilege attributes", name);
8023		return (EINVAL);
8024	}
8025
8026	if ((priv & DTRACE_PRIV_KERNEL) &&
8027	    (priv & (DTRACE_PRIV_USER | DTRACE_PRIV_OWNER)) &&
8028	    pops->dtps_usermode == NULL) {
8029		cmn_err(CE_WARN, "failed to register provider '%s': need "
8030		    "dtps_usermode() op for given privilege attributes", name);
8031		return (EINVAL);
8032	}
8033
8034	provider = kmem_zalloc(sizeof (dtrace_provider_t), KM_SLEEP);
8035#if !defined(__APPLE__)
8036	provider->dtpv_name = kmem_alloc(strlen(name) + 1, KM_SLEEP);
8037	(void) strcpy(provider->dtpv_name, name);
8038#else /* Employ size bounded string operation. */
8039	{
8040	size_t bufsize = strlen(name) + 1;
8041	provider->dtpv_name = kmem_alloc(bufsize, KM_SLEEP);
8042	(void) strlcpy(provider->dtpv_name, name, bufsize);
8043	}
8044#endif /* __APPLE__ */
8045
8046	provider->dtpv_attr = *pap;
8047	provider->dtpv_priv.dtpp_flags = priv;
8048	if (cr != NULL) {
8049		provider->dtpv_priv.dtpp_uid = crgetuid(cr);
8050		provider->dtpv_priv.dtpp_zoneid = crgetzoneid(cr);
8051	}
8052	provider->dtpv_pops = *pops;
8053
8054	if (pops->dtps_provide == NULL) {
8055		ASSERT(pops->dtps_provide_module != NULL);
8056		provider->dtpv_pops.dtps_provide =
8057		    (void (*)(void *, const dtrace_probedesc_t *))dtrace_nullop;
8058	}
8059
8060	if (pops->dtps_provide_module == NULL) {
8061		ASSERT(pops->dtps_provide != NULL);
8062		provider->dtpv_pops.dtps_provide_module =
8063		    (void (*)(void *, struct modctl *))dtrace_nullop;
8064	}
8065
8066	if (pops->dtps_suspend == NULL) {
8067		ASSERT(pops->dtps_resume == NULL);
8068		provider->dtpv_pops.dtps_suspend =
8069		    (void (*)(void *, dtrace_id_t, void *))dtrace_nullop;
8070		provider->dtpv_pops.dtps_resume =
8071		    (void (*)(void *, dtrace_id_t, void *))dtrace_nullop;
8072	}
8073
8074	provider->dtpv_arg = arg;
8075	*idp = (dtrace_provider_id_t)provider;
8076
8077	if (pops == &dtrace_provider_ops) {
8078		lck_mtx_assert(&dtrace_provider_lock, LCK_MTX_ASSERT_OWNED);
8079		lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8080		ASSERT(dtrace_anon.dta_enabling == NULL);
8081
8082		/*
8083		 * We make sure that the DTrace provider is at the head of
8084		 * the provider chain.
8085		 */
8086		provider->dtpv_next = dtrace_provider;
8087		dtrace_provider = provider;
8088		return (0);
8089	}
8090
8091	lck_mtx_lock(&dtrace_provider_lock);
8092	lck_mtx_lock(&dtrace_lock);
8093
8094	/*
8095	 * If there is at least one provider registered, we'll add this
8096	 * provider after the first provider.
8097	 */
8098	if (dtrace_provider != NULL) {
8099		provider->dtpv_next = dtrace_provider->dtpv_next;
8100		dtrace_provider->dtpv_next = provider;
8101	} else {
8102		dtrace_provider = provider;
8103	}
8104
8105	if (dtrace_retained != NULL) {
8106		dtrace_enabling_provide(provider);
8107
8108		/*
8109		 * Now we need to call dtrace_enabling_matchall() -- which
8110		 * will acquire cpu_lock and dtrace_lock.  We therefore need
8111		 * to drop all of our locks before calling into it...
8112		 */
8113		lck_mtx_unlock(&dtrace_lock);
8114		lck_mtx_unlock(&dtrace_provider_lock);
8115		dtrace_enabling_matchall();
8116
8117		return (0);
8118	}
8119
8120	lck_mtx_unlock(&dtrace_lock);
8121	lck_mtx_unlock(&dtrace_provider_lock);
8122
8123	return (0);
8124}
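
/*
 * Illustrative sketch (not part of the build): a minimal provider
 * registering itself from its attach routine.  The "example" identifiers
 * and the example_attr/example_pops structures are hypothetical; a real
 * provider fills in a complete dtrace_pops_t as described in
 * <sys/dtrace.h>.
 *
 *	static dtrace_provider_id_t example_id;
 *
 *	if (dtrace_register("example", &example_attr, DTRACE_PRIV_KERNEL,
 *	    NULL, &example_pops, NULL, &example_id) != 0)
 *		return (DDI_FAILURE);
 */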
8125
8126/*
8127 * Unregister the specified provider from the DTrace framework.  This should
8128 * generally be called by DTrace providers in their detach(9E) entry point.
8129 */
8130int
8131dtrace_unregister(dtrace_provider_id_t id)
8132{
8133	dtrace_provider_t *old = (dtrace_provider_t *)id;
8134	dtrace_provider_t *prev = NULL;
8135	int i, self = 0;
8136	dtrace_probe_t *probe, *first = NULL;
8137
8138	if (old->dtpv_pops.dtps_enable ==
8139	    (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop) {
8140		/*
8141		 * If DTrace itself is the provider, we're called with locks
8142		 * already held.
8143		 */
8144		ASSERT(old == dtrace_provider);
8145		ASSERT(dtrace_devi != NULL);
8146		lck_mtx_assert(&dtrace_provider_lock, LCK_MTX_ASSERT_OWNED);
8147		lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8148		self = 1;
8149
8150		if (dtrace_provider->dtpv_next != NULL) {
8151			/*
8152			 * There's another provider here; return failure.
8153			 */
8154			return (EBUSY);
8155		}
8156	} else {
8157		lck_mtx_lock(&dtrace_provider_lock);
8158		lck_mtx_lock(&mod_lock);
8159		lck_mtx_lock(&dtrace_lock);
8160	}
8161
8162	/*
8163	 * If anyone has /dev/dtrace open, or if there are anonymous enabled
8164	 * probes, we refuse to let providers slither away, unless this
8165	 * provider has already been explicitly invalidated.
8166	 */
8167	if (!old->dtpv_defunct &&
8168	    (dtrace_opens || (dtrace_anon.dta_state != NULL &&
8169	    dtrace_anon.dta_state->dts_necbs > 0))) {
8170		if (!self) {
8171			lck_mtx_unlock(&dtrace_lock);
8172			lck_mtx_unlock(&mod_lock);
8173			lck_mtx_unlock(&dtrace_provider_lock);
8174		}
8175		return (EBUSY);
8176	}
8177
8178	/*
8179	 * We can't remove this provider while any of its probes is enabled.
8180	 */
8181	if (old->ecb_count != 0) {
8182		/*
8183		 * We have at least one ECB; we can't remove this provider.
8184		 */
8185		if (!self) {
8186			lck_mtx_unlock(&dtrace_lock);
8187			lck_mtx_unlock(&mod_lock);
8188			lck_mtx_unlock(&dtrace_provider_lock);
8189		}
8190		return (EBUSY);
8191	}
8192
8193	/*
8194	 * All of the probes for this provider are disabled; we can safely
8195	 * remove all of them from their hash chains and from the probe array.
8196	 */
8197	for (i = 0; i < dtrace_nprobes && old->probe_count != 0; i++) {
8198		if ((probe = dtrace_probes[i]) == NULL)
8199			continue;
8200
8201		if (probe->dtpr_provider != old)
8202			continue;
8203
8204		dtrace_probes[i] = NULL;
8205		old->probe_count--;
8206
8207		dtrace_hash_remove(dtrace_bymod, probe);
8208		dtrace_hash_remove(dtrace_byfunc, probe);
8209		dtrace_hash_remove(dtrace_byname, probe);
8210
8211		if (first == NULL) {
8212			first = probe;
8213			probe->dtpr_nextmod = NULL;
8214		} else {
8215			probe->dtpr_nextmod = first;
8216			first = probe;
8217		}
8218	}
8219
8220	/*
8221	 * The provider's probes have been removed from the hash chains and
8222	 * from the probe array.  Now issue a dtrace_sync() to be sure that
8223	 * everyone has cleared out from any probe array processing.
8224	 */
8225	dtrace_sync();
8226
8227	for (probe = first; probe != NULL; probe = first) {
8228		first = probe->dtpr_nextmod;
8229
8230		old->dtpv_pops.dtps_destroy(old->dtpv_arg, probe->dtpr_id,
8231		    probe->dtpr_arg);
8232		kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
8233		kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
8234		kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
8235		vmem_free(dtrace_arena, (void *)(uintptr_t)(probe->dtpr_id), 1);
8236#if !defined(__APPLE__)
8237		kmem_free(probe, sizeof (dtrace_probe_t));
8238#else
8239		zfree(dtrace_probe_t_zone, probe);
8240#endif
8241	}
8242
8243	if ((prev = dtrace_provider) == old) {
8244		ASSERT(self || dtrace_devi == NULL);
8245		ASSERT(old->dtpv_next == NULL || dtrace_devi == NULL);
8246		dtrace_provider = old->dtpv_next;
8247	} else {
8248		while (prev != NULL && prev->dtpv_next != old)
8249			prev = prev->dtpv_next;
8250
8251		if (prev == NULL) {
8252			panic("attempt to unregister non-existent "
8253			    "dtrace provider %p\n", (void *)id);
8254		}
8255
8256		prev->dtpv_next = old->dtpv_next;
8257	}
8258
8259	if (!self) {
8260		lck_mtx_unlock(&dtrace_lock);
8261		lck_mtx_unlock(&mod_lock);
8262		lck_mtx_unlock(&dtrace_provider_lock);
8263	}
8264
8265	kmem_free(old->dtpv_name, strlen(old->dtpv_name) + 1);
8266	kmem_free(old, sizeof (dtrace_provider_t));
8267
8268	return (0);
8269}
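
/*
 * Illustrative sketch (not part of the build): the matching detach path
 * for the hypothetical provider registered above.  A return value of
 * EBUSY means consumers are attached or probes are still enabled, and
 * detach must fail rather than tear the provider down.
 *
 *	if (dtrace_unregister(example_id) != 0)
 *		return (DDI_FAILURE);
 */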
8270
8271/*
8272 * Invalidate the specified provider.  All subsequent probe lookups for the
8273 * specified provider will fail, but its probes will not be removed.
8274 */
8275void
8276dtrace_invalidate(dtrace_provider_id_t id)
8277{
8278	dtrace_provider_t *pvp = (dtrace_provider_t *)id;
8279
8280	ASSERT(pvp->dtpv_pops.dtps_enable !=
8281	    (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop);
8282
8283	lck_mtx_lock(&dtrace_provider_lock);
8284	lck_mtx_lock(&dtrace_lock);
8285
8286	pvp->dtpv_defunct = 1;
8287
8288	lck_mtx_unlock(&dtrace_lock);
8289	lck_mtx_unlock(&dtrace_provider_lock);
8290}
8291
8292/*
8293 * Indicate whether or not DTrace has attached.
8294 */
8295int
8296dtrace_attached(void)
8297{
8298	/*
8299	 * dtrace_provider will be non-NULL iff the DTrace driver has
8300	 * attached.  (It's non-NULL because DTrace is always itself a
8301	 * provider.)
8302	 */
8303	return (dtrace_provider != NULL);
8304}
8305
8306/*
8307 * Remove all the unenabled probes for the given provider.  This function is
8308 * not unlike dtrace_unregister(), except that it doesn't remove the provider
8309 * -- just as many of its associated probes as it can.
8310 */
8311int
8312dtrace_condense(dtrace_provider_id_t id)
8313{
8314	dtrace_provider_t *prov = (dtrace_provider_t *)id;
8315	int i;
8316	dtrace_probe_t *probe;
8317
8318	/*
8319	 * Make sure this isn't the dtrace provider itself.
8320	 */
8321	ASSERT(prov->dtpv_pops.dtps_enable !=
8322	    (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop);
8323
8324	lck_mtx_lock(&dtrace_provider_lock);
8325	lck_mtx_lock(&dtrace_lock);
8326
8327	/*
8328	 * Attempt to destroy the probes associated with this provider.
8329	 */
8330	for (i = 0; i < dtrace_nprobes; i++) {
8331		if ((probe = dtrace_probes[i]) == NULL)
8332			continue;
8333
8334		if (probe->dtpr_provider != prov)
8335			continue;
8336
8337		if (probe->dtpr_ecb != NULL)
8338			continue;
8339
8340		dtrace_probes[i] = NULL;
8341		prov->probe_count--;
8342
8343		dtrace_hash_remove(dtrace_bymod, probe);
8344		dtrace_hash_remove(dtrace_byfunc, probe);
8345		dtrace_hash_remove(dtrace_byname, probe);
8346
8347		prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, i + 1,
8348		    probe->dtpr_arg);
8349		kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
8350		kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
8351		kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
8352#if !defined(__APPLE__)
8353		kmem_free(probe, sizeof (dtrace_probe_t));
8354#else
8355		zfree(dtrace_probe_t_zone, probe);
8356#endif
8357		vmem_free(dtrace_arena, (void *)((uintptr_t)i + 1), 1);
8358	}
8359
8360	lck_mtx_unlock(&dtrace_lock);
8361	lck_mtx_unlock(&dtrace_provider_lock);
8362
8363	return (0);
8364}
8365
8366/*
8367 * DTrace Probe Management Functions
8368 *
8369 * The functions in this section perform the DTrace probe management,
8370 * including functions to create probes, look-up probes, and call into the
8371 * providers to request that probes be provided.  Some of these functions are
8372 * in the Provider-to-Framework API; these functions can be identified by the
8373 * fact that they are not declared "static".
8374 */
8375
8376/*
8377 * Create a probe with the specified module name, function name, and name.
8378 */
8379dtrace_id_t
8380dtrace_probe_create(dtrace_provider_id_t prov, const char *mod,
8381    const char *func, const char *name, int aframes, void *arg)
8382{
8383	dtrace_probe_t *probe, **probes;
8384	dtrace_provider_t *provider = (dtrace_provider_t *)prov;
8385	dtrace_id_t id;
8386
8387	if (provider == dtrace_provider) {
8388		lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8389	} else {
8390		lck_mtx_lock(&dtrace_lock);
8391	}
8392
8393	id = (dtrace_id_t)(uintptr_t)vmem_alloc(dtrace_arena, 1,
8394	    VM_BESTFIT | VM_SLEEP);
8395#if !defined(__APPLE__)
8396	probe = kmem_zalloc(sizeof (dtrace_probe_t), KM_SLEEP);
8397#else
8398	probe = zalloc(dtrace_probe_t_zone);
8399	bzero(probe, sizeof (dtrace_probe_t));
8400#endif
8401
8402	probe->dtpr_id = id;
8403	probe->dtpr_gen = dtrace_probegen++;
8404	probe->dtpr_mod = dtrace_strdup(mod);
8405	probe->dtpr_func = dtrace_strdup(func);
8406	probe->dtpr_name = dtrace_strdup(name);
8407	probe->dtpr_arg = arg;
8408	probe->dtpr_aframes = aframes;
8409	probe->dtpr_provider = provider;
8410
8411	dtrace_hash_add(dtrace_bymod, probe);
8412	dtrace_hash_add(dtrace_byfunc, probe);
8413	dtrace_hash_add(dtrace_byname, probe);
8414
8415#if !defined(__APPLE__)  /* Quiet compiler warning */
8416	if (id - 1 >= dtrace_nprobes) {
8417#else
8418	if (id - 1 >= (dtrace_id_t)dtrace_nprobes) {
8419#endif /* __APPLE__ */
8420		size_t osize = dtrace_nprobes * sizeof (dtrace_probe_t *);
8421		size_t nsize = osize << 1;
8422
8423		if (nsize == 0) {
8424			ASSERT(osize == 0);
8425			ASSERT(dtrace_probes == NULL);
8426			nsize = sizeof (dtrace_probe_t *);
8427		}
8428
8429		probes = kmem_zalloc(nsize, KM_SLEEP);
8430
8431		if (dtrace_probes == NULL) {
8432			ASSERT(osize == 0);
8433			dtrace_probes = probes;
8434			dtrace_nprobes = 1;
8435		} else {
8436			dtrace_probe_t **oprobes = dtrace_probes;
8437
8438			bcopy(oprobes, probes, osize);
8439			dtrace_membar_producer();
8440			dtrace_probes = probes;
8441
8442			dtrace_sync();
8443
8444			/*
8445			 * All CPUs are now seeing the new probes array; we can
8446			 * safely free the old array.
8447			 */
8448			kmem_free(oprobes, osize);
8449			dtrace_nprobes <<= 1;
8450		}
8451
8452#if !defined(__APPLE__)  /* Quiet compiler warning */
8453		ASSERT(id - 1 < dtrace_nprobes);
8454#else
8455		ASSERT(id - 1 < (dtrace_id_t)dtrace_nprobes);
8456#endif /* __APPLE__ */
8457	}
8458
8459	ASSERT(dtrace_probes[id - 1] == NULL);
8460	dtrace_probes[id - 1] = probe;
8461	provider->probe_count++;
8462
8463	if (provider != dtrace_provider)
8464		lck_mtx_unlock(&dtrace_lock);
8465
8466	return (id);
8467}
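
/*
 * Illustrative sketch (not part of the build): a provider typically calls
 * dtrace_probe_create() from its dtps_provide() entry point.  The module,
 * function, and probe names below are hypothetical; the final argument is
 * the provider-private cookie later passed back via dtpr_arg.
 *
 *	dtrace_id_t id;
 *
 *	id = dtrace_probe_create(example_id, "example_mod",
 *	    "example_func", "entry", 0, NULL);
 */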
8468
8469static dtrace_probe_t *
8470dtrace_probe_lookup_id(dtrace_id_t id)
8471{
8472	lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8473
8474#if !defined(__APPLE__)  /* Quiet compiler warning */
8475	if (id == 0 || id > dtrace_nprobes)
8476		return (NULL);
8477#else
8478	if (id == 0 || id > (dtrace_id_t)dtrace_nprobes)
8479		return (NULL);
8480#endif /* __APPLE__ */
8481
8482	return (dtrace_probes[id - 1]);
8483}
8484
8485static int
8486dtrace_probe_lookup_match(dtrace_probe_t *probe, void *arg)
8487{
8488	*((dtrace_id_t *)arg) = probe->dtpr_id;
8489
8490	return (DTRACE_MATCH_DONE);
8491}
8492
8493/*
8494 * Look up a probe based on provider and one or more of module name, function
8495 * name and probe name.
8496 */
8497dtrace_id_t
8498dtrace_probe_lookup(dtrace_provider_id_t prid, const char *mod,
8499    const char *func, const char *name)
8500{
8501	dtrace_probekey_t pkey;
8502	dtrace_id_t id;
8503	int match;
8504
8505	pkey.dtpk_prov = ((dtrace_provider_t *)prid)->dtpv_name;
8506	pkey.dtpk_pmatch = &dtrace_match_string;
8507	pkey.dtpk_mod = mod;
8508	pkey.dtpk_mmatch = mod ? &dtrace_match_string : &dtrace_match_nul;
8509	pkey.dtpk_func = func;
8510	pkey.dtpk_fmatch = func ? &dtrace_match_string : &dtrace_match_nul;
8511	pkey.dtpk_name = name;
8512	pkey.dtpk_nmatch = name ? &dtrace_match_string : &dtrace_match_nul;
8513	pkey.dtpk_id = DTRACE_IDNONE;
8514
8515	lck_mtx_lock(&dtrace_lock);
8516	match = dtrace_match(&pkey, DTRACE_PRIV_ALL, 0, 0,
8517	    dtrace_probe_lookup_match, &id);
8518	lck_mtx_unlock(&dtrace_lock);
8519
8520	ASSERT(match == 1 || match == 0);
8521	return (match ? id : 0);
8522}
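
/*
 * Illustrative sketch (not part of the build): the usual dtps_provide()
 * idiom is to look a probe up before creating it, so that repeated provide
 * requests do not create duplicate probes.  Names are hypothetical.
 *
 *	if (dtrace_probe_lookup(example_id, "example_mod",
 *	    "example_func", "entry") == 0) {
 *		(void) dtrace_probe_create(example_id, "example_mod",
 *		    "example_func", "entry", 0, NULL);
 *	}
 */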
8523
8524/*
8525 * Returns the probe argument associated with the specified probe.
8526 */
8527void *
8528dtrace_probe_arg(dtrace_provider_id_t id, dtrace_id_t pid)
8529{
8530	dtrace_probe_t *probe;
8531	void *rval = NULL;
8532
8533	lck_mtx_lock(&dtrace_lock);
8534
8535	if ((probe = dtrace_probe_lookup_id(pid)) != NULL &&
8536	    probe->dtpr_provider == (dtrace_provider_t *)id)
8537		rval = probe->dtpr_arg;
8538
8539	lck_mtx_unlock(&dtrace_lock);
8540
8541	return (rval);
8542}
8543
8544/*
8545 * Copy a probe into a probe description.
8546 */
8547static void
8548dtrace_probe_description(const dtrace_probe_t *prp, dtrace_probedesc_t *pdp)
8549{
8550	bzero(pdp, sizeof (dtrace_probedesc_t));
8551	pdp->dtpd_id = prp->dtpr_id;
8552
8553#if !defined(__APPLE__)
8554	(void) strncpy(pdp->dtpd_provider,
8555	    prp->dtpr_provider->dtpv_name, DTRACE_PROVNAMELEN - 1);
8556
8557	(void) strncpy(pdp->dtpd_mod, prp->dtpr_mod, DTRACE_MODNAMELEN - 1);
8558	(void) strncpy(pdp->dtpd_func, prp->dtpr_func, DTRACE_FUNCNAMELEN - 1);
8559	(void) strncpy(pdp->dtpd_name, prp->dtpr_name, DTRACE_NAMELEN - 1);
8560#else /* Employ size bounded string operation. */
8561	(void) strlcpy(pdp->dtpd_provider,
8562	    prp->dtpr_provider->dtpv_name, DTRACE_PROVNAMELEN);
8563
8564	(void) strlcpy(pdp->dtpd_mod, prp->dtpr_mod, DTRACE_MODNAMELEN);
8565	(void) strlcpy(pdp->dtpd_func, prp->dtpr_func, DTRACE_FUNCNAMELEN);
8566	(void) strlcpy(pdp->dtpd_name, prp->dtpr_name, DTRACE_NAMELEN);
8567#endif /* __APPLE__ */
8568}
8569
8570/*
8571 * Called to indicate that a probe -- or probes -- should be provided by a
8572 * specified provider.  If the specified description is NULL, the provider will
8573 * be told to provide all of its probes.  (This is done whenever a new
8574 * consumer comes along, or whenever a retained enabling is to be matched.) If
8575 * the specified description is non-NULL, the provider is given the
8576 * opportunity to dynamically provide the specified probe, allowing providers
8577 * to support the creation of probes on-the-fly.  (So-called _autocreated_
8578 * probes.)  If the provider is NULL, the operations will be applied to all
8579 * providers; if the provider is non-NULL the operations will only be applied
8580 * to the specified provider.  The dtrace_provider_lock must be held, and the
8581 * dtrace_lock must _not_ be held -- the provider's dtps_provide() operation
8582 * will need to grab the dtrace_lock when it reenters the framework through
8583 * dtrace_probe_lookup(), dtrace_probe_create(), etc.
8584 */
8585static void
8586dtrace_probe_provide(dtrace_probedesc_t *desc, dtrace_provider_t *prv)
8587{
8588	struct modctl *ctl;
8589	int all = 0;
8590
8591	lck_mtx_assert(&dtrace_provider_lock, LCK_MTX_ASSERT_OWNED);
8592
8593	if (prv == NULL) {
8594		all = 1;
8595		prv = dtrace_provider;
8596	}
8597
8598	do {
8599		/*
8600		 * First, call the blanket provide operation.
8601		 */
8602		prv->dtpv_pops.dtps_provide(prv->dtpv_arg, desc);
8603
8604		/*
8605		 * Now call the per-module provide operation.  We will grab
8606		 * mod_lock to prevent the list from being modified.  Note
8607		 * that this also prevents the mod_busy bits from changing.
8608		 * (mod_busy can only be changed with mod_lock held.)
8609		 */
8610		lck_mtx_lock(&mod_lock);
8611
8612#if !defined(__APPLE__)
8613		ctl = &modules;
8614		do {
8615			if (ctl->mod_busy || ctl->mod_mp == NULL)
8616				continue;
8617
8618			prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
8619
8620		} while ((ctl = ctl->mod_next) != &modules);
8621#else
8622		ctl = dtrace_modctl_list;
8623		while (ctl) {
8624			prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
8625			ctl = ctl->mod_next;
8626		}
8627#endif
8628
8629		lck_mtx_unlock(&mod_lock);
8630	} while (all && (prv = prv->dtpv_next) != NULL);
8631}
8632
8633/*
8634 * Iterate over each probe, and call the Framework-to-Provider API function
8635 * denoted by offs.
8636 */
8637static void
8638dtrace_probe_foreach(uintptr_t offs)
8639{
8640	dtrace_provider_t *prov;
8641	void (*func)(void *, dtrace_id_t, void *);
8642	dtrace_probe_t *probe;
8643	dtrace_icookie_t cookie;
8644	int i;
8645
8646	/*
8647	 * We disable interrupts to walk through the probe array.  This is
8648	 * safe -- the dtrace_sync() in dtrace_unregister() assures that we
8649	 * won't see stale data.
8650	 */
8651	cookie = dtrace_interrupt_disable();
8652
8653	for (i = 0; i < dtrace_nprobes; i++) {
8654		if ((probe = dtrace_probes[i]) == NULL)
8655			continue;
8656
8657		if (probe->dtpr_ecb == NULL) {
8658			/*
8659			 * This probe isn't enabled -- don't call the function.
8660			 */
8661			continue;
8662		}
8663
8664		prov = probe->dtpr_provider;
8665		func = *((void(**)(void *, dtrace_id_t, void *))
8666		    ((uintptr_t)&prov->dtpv_pops + offs));
8667
8668		func(prov->dtpv_arg, i + 1, probe->dtpr_arg);
8669	}
8670
8671	dtrace_interrupt_enable(cookie);
8672}
8673
8674static int
8675dtrace_probe_enable(const dtrace_probedesc_t *desc, dtrace_enabling_t *enab)
8676{
8677	dtrace_probekey_t pkey;
8678	uint32_t priv;
8679	uid_t uid;
8680	zoneid_t zoneid;
8681
8682	lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8683
8684	dtrace_ecb_create_cache = NULL;
8685
8686	if (desc == NULL) {
8687		/*
8688		 * If we're passed a NULL description, we're being asked to
8689		 * create an ECB with a NULL probe.
8690		 */
8691		(void) dtrace_ecb_create_enable(NULL, enab);
8692		return (0);
8693	}
8694
8695	dtrace_probekey(desc, &pkey);
8696	dtrace_cred2priv(enab->dten_vstate->dtvs_state->dts_cred.dcr_cred,
8697	    &priv, &uid, &zoneid);
8698
8699	return (dtrace_match(&pkey, priv, uid, zoneid, dtrace_ecb_create_enable,
8700	    enab));
8701}
8702
8703/*
8704 * DTrace Helper Provider Functions
8705 */
8706static void
8707dtrace_dofattr2attr(dtrace_attribute_t *attr, const dof_attr_t dofattr)
8708{
8709	attr->dtat_name = DOF_ATTR_NAME(dofattr);
8710	attr->dtat_data = DOF_ATTR_DATA(dofattr);
8711	attr->dtat_class = DOF_ATTR_CLASS(dofattr);
8712}
8713
8714static void
8715dtrace_dofprov2hprov(dtrace_helper_provdesc_t *hprov,
8716    const dof_provider_t *dofprov, char *strtab)
8717{
8718	hprov->dthpv_provname = strtab + dofprov->dofpv_name;
8719	dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_provider,
8720	    dofprov->dofpv_provattr);
8721	dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_mod,
8722	    dofprov->dofpv_modattr);
8723	dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_func,
8724	    dofprov->dofpv_funcattr);
8725	dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_name,
8726	    dofprov->dofpv_nameattr);
8727	dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_args,
8728	    dofprov->dofpv_argsattr);
8729}
8730
8731static void
8732dtrace_helper_provide_one(dof_helper_t *dhp, dof_sec_t *sec, pid_t pid)
8733{
8734	uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
8735	dof_hdr_t *dof = (dof_hdr_t *)daddr;
8736	dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
8737	dof_provider_t *provider;
8738	dof_probe_t *probe;
8739	uint32_t *off, *enoff;
8740	uint8_t *arg;
8741	char *strtab;
8742	uint_t i, nprobes;
8743	dtrace_helper_provdesc_t dhpv;
8744	dtrace_helper_probedesc_t dhpb;
8745	dtrace_meta_t *meta = dtrace_meta_pid;
8746	dtrace_mops_t *mops = &meta->dtm_mops;
8747	void *parg;
8748
8749	provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
8750	str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
8751	    provider->dofpv_strtab * dof->dofh_secsize);
8752	prb_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
8753	    provider->dofpv_probes * dof->dofh_secsize);
8754	arg_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
8755	    provider->dofpv_prargs * dof->dofh_secsize);
8756	off_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
8757	    provider->dofpv_proffs * dof->dofh_secsize);
8758
8759	strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
8760	off = (uint32_t *)(uintptr_t)(daddr + off_sec->dofs_offset);
8761	arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
8762	enoff = NULL;
8763
8764	/*
8765	 * See dtrace_helper_provider_validate().
8766	 */
8767	if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
8768	    provider->dofpv_prenoffs != DOF_SECT_NONE) {
8769		enoff_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
8770		    provider->dofpv_prenoffs * dof->dofh_secsize);
8771		enoff = (uint32_t *)(uintptr_t)(daddr + enoff_sec->dofs_offset);
8772	}
8773
8774	nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
8775
8776	/*
8777	 * Create the provider.
8778	 */
8779	dtrace_dofprov2hprov(&dhpv, provider, strtab);
8780
8781	if ((parg = mops->dtms_provide_pid(meta->dtm_arg, &dhpv, pid)) == NULL)
8782		return;
8783
8784	meta->dtm_count++;
8785
8786	/*
8787	 * Create the probes.
8788	 */
8789	for (i = 0; i < nprobes; i++) {
8790		probe = (dof_probe_t *)(uintptr_t)(daddr +
8791		    prb_sec->dofs_offset + i * prb_sec->dofs_entsize);
8792
8793		dhpb.dthpb_mod = dhp->dofhp_mod;
8794		dhpb.dthpb_func = strtab + probe->dofpr_func;
8795		dhpb.dthpb_name = strtab + probe->dofpr_name;
8796#if !defined(__APPLE__)
8797		dhpb.dthpb_base = probe->dofpr_addr;
8798#else
8799		dhpb.dthpb_base = dhp->dofhp_addr; /* FIXME: James, why? */
8800#endif
8801#if !defined(__APPLE__)  /* Quiet compiler warning */
8802		dhpb.dthpb_offs = off + probe->dofpr_offidx;
8803#else
8804		dhpb.dthpb_offs = (int32_t *)(off + probe->dofpr_offidx);
8805#endif /* __APPLE__ */
8806		dhpb.dthpb_noffs = probe->dofpr_noffs;
8807		if (enoff != NULL) {
8808#if !defined(__APPLE__)  /* Quiet compiler warning */
8809			dhpb.dthpb_enoffs = enoff + probe->dofpr_enoffidx;
8810#else
8811			dhpb.dthpb_enoffs = (int32_t *)(enoff + probe->dofpr_enoffidx);
8812#endif /* __APPLE__ */
8813			dhpb.dthpb_nenoffs = probe->dofpr_nenoffs;
8814		} else {
8815			dhpb.dthpb_enoffs = NULL;
8816			dhpb.dthpb_nenoffs = 0;
8817		}
8818		dhpb.dthpb_args = arg + probe->dofpr_argidx;
8819		dhpb.dthpb_nargc = probe->dofpr_nargc;
8820		dhpb.dthpb_xargc = probe->dofpr_xargc;
8821		dhpb.dthpb_ntypes = strtab + probe->dofpr_nargv;
8822		dhpb.dthpb_xtypes = strtab + probe->dofpr_xargv;
8823
8824		mops->dtms_create_probe(meta->dtm_arg, parg, &dhpb);
8825	}
8826}
8827
8828static void
8829dtrace_helper_provide(dof_helper_t *dhp, pid_t pid)
8830{
8831	uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
8832	dof_hdr_t *dof = (dof_hdr_t *)daddr;
8833#if !defined(__APPLE__)  /* Quiet compiler warning */
8834	int i;
8835#else
8836	uint32_t i;
8837#endif /* __APPLE__ */
8838
8839	lck_mtx_assert(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED);
8840
8841	for (i = 0; i < dof->dofh_secnum; i++) {
8842		dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
8843		    dof->dofh_secoff + i * dof->dofh_secsize);
8844
8845		if (sec->dofs_type != DOF_SECT_PROVIDER)
8846			continue;
8847
8848		dtrace_helper_provide_one(dhp, sec, pid);
8849	}
8850
8851	/*
8852	 * We may have just created probes, so we must now rematch against
8853	 * any retained enablings.  Note that this call will acquire both
8854	 * cpu_lock and dtrace_lock; the fact that we are holding
8855	 * dtrace_meta_lock now is what defines the ordering with respect to
8856	 * these three locks.
8857	 */
8858	dtrace_enabling_matchall();
8859}
8860
8861static void
8862dtrace_helper_provider_remove_one(dof_helper_t *dhp, dof_sec_t *sec, pid_t pid)
8863{
8864	uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
8865	dof_hdr_t *dof = (dof_hdr_t *)daddr;
8866	dof_sec_t *str_sec;
8867	dof_provider_t *provider;
8868	char *strtab;
8869	dtrace_helper_provdesc_t dhpv;
8870	dtrace_meta_t *meta = dtrace_meta_pid;
8871	dtrace_mops_t *mops = &meta->dtm_mops;
8872
8873	provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
8874	str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
8875	    provider->dofpv_strtab * dof->dofh_secsize);
8876
8877	strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
8878
8879	/*
8880	 * Create the provider.
8881	 */
8882	dtrace_dofprov2hprov(&dhpv, provider, strtab);
8883
8884	mops->dtms_remove_pid(meta->dtm_arg, &dhpv, pid);
8885
8886	meta->dtm_count--;
8887}
8888
8889static void
8890dtrace_helper_provider_remove(dof_helper_t *dhp, pid_t pid)
8891{
8892	uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
8893	dof_hdr_t *dof = (dof_hdr_t *)daddr;
8894#if !defined(__APPLE__)  /* Quiet compiler warning */
8895	int i;
8896#else
8897	uint32_t i;
8898#endif /* __APPLE__ */
8899
8900	lck_mtx_assert(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED);
8901
8902	for (i = 0; i < dof->dofh_secnum; i++) {
8903		dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
8904		    dof->dofh_secoff + i * dof->dofh_secsize);
8905
8906		if (sec->dofs_type != DOF_SECT_PROVIDER)
8907			continue;
8908
8909		dtrace_helper_provider_remove_one(dhp, sec, pid);
8910	}
8911}
8912
8913/*
8914 * DTrace Meta Provider-to-Framework API Functions
8915 *
8916 * These functions implement the Meta Provider-to-Framework API, as described
8917 * in <sys/dtrace.h>.
8918 */
8919int
8920dtrace_meta_register(const char *name, const dtrace_mops_t *mops, void *arg,
8921    dtrace_meta_provider_id_t *idp)
8922{
8923	dtrace_meta_t *meta;
8924	dtrace_helpers_t *help, *next;
8925#if !defined(__APPLE__)  /* Quiet compiler warning */
8926	int i;
8927#else
8928	uint_t i;
8929#endif /* __APPLE__ */
8930
8931	*idp = DTRACE_METAPROVNONE;
8932
8933	/*
8934	 * We strictly don't need the name, but we hold onto it for
8935	 * debuggability. All hail error queues!
8936	 */
8937	if (name == NULL) {
8938		cmn_err(CE_WARN, "failed to register meta-provider: "
8939		    "invalid name");
8940		return (EINVAL);
8941	}
8942
8943	if (mops == NULL ||
8944	    mops->dtms_create_probe == NULL ||
8945	    mops->dtms_provide_pid == NULL ||
8946	    mops->dtms_remove_pid == NULL) {
8947		cmn_err(CE_WARN, "failed to register meta-provider %s: "
8948		    "invalid ops", name);
8949		return (EINVAL);
8950	}
8951
8952	meta = kmem_zalloc(sizeof (dtrace_meta_t), KM_SLEEP);
8953	meta->dtm_mops = *mops;
8954#if !defined(__APPLE__)
8955	meta->dtm_name = kmem_alloc(strlen(name) + 1, KM_SLEEP);
8956	(void) strcpy(meta->dtm_name, name);
8957#else /* Employ size bounded string operation. */
8958	{
8959	size_t bufsize = strlen(name) + 1;
8960	meta->dtm_name = kmem_alloc(bufsize, KM_SLEEP);
8961	(void) strlcpy(meta->dtm_name, name, bufsize);
8962	}
8963#endif /* __APPLE__ */
8964	meta->dtm_arg = arg;
8965
8966	lck_mtx_lock(&dtrace_meta_lock);
8967	lck_mtx_lock(&dtrace_lock);
8968
8969	if (dtrace_meta_pid != NULL) {
8970		lck_mtx_unlock(&dtrace_lock);
8971		lck_mtx_unlock(&dtrace_meta_lock);
8972		cmn_err(CE_WARN, "failed to register meta-provider %s: "
8973		    "user-land meta-provider exists", name);
8974		kmem_free(meta->dtm_name, strlen(meta->dtm_name) + 1);
8975		kmem_free(meta, sizeof (dtrace_meta_t));
8976		return (EINVAL);
8977	}
8978
8979	dtrace_meta_pid = meta;
8980	*idp = (dtrace_meta_provider_id_t)meta;
8981
8982	/*
8983	 * If there are providers and probes ready to go, pass them
8984	 * off to the new meta provider now.
8985	 */
8986
8987	help = dtrace_deferred_pid;
8988	dtrace_deferred_pid = NULL;
8989
8990	lck_mtx_unlock(&dtrace_lock);
8991
8992	while (help != NULL) {
8993		for (i = 0; i < help->dthps_nprovs; i++) {
8994			dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
8995			    help->dthps_pid);
8996		}
8997
8998		next = help->dthps_next;
8999		help->dthps_next = NULL;
9000		help->dthps_prev = NULL;
9001		help->dthps_deferred = 0;
9002		help = next;
9003	}
9004
9005	lck_mtx_unlock(&dtrace_meta_lock);
9006
9007	return (0);
9008}
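
/*
 * Illustrative sketch (not part of the build): a user-land meta-provider
 * supplies the three mandatory dtrace_mops_t entry points checked above
 * and registers exactly once.  All "example" names are hypothetical.
 *
 *	static dtrace_mops_t example_mops = {
 *		.dtms_create_probe = example_create_probe,
 *		.dtms_provide_pid = example_provide_pid,
 *		.dtms_remove_pid = example_remove_pid
 *	};
 *	static dtrace_meta_provider_id_t example_meta_id;
 *
 *	if (dtrace_meta_register("example", &example_mops, NULL,
 *	    &example_meta_id) != 0)
 *		return (DDI_FAILURE);
 */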
9009
9010int
9011dtrace_meta_unregister(dtrace_meta_provider_id_t id)
9012{
9013	dtrace_meta_t **pp, *old = (dtrace_meta_t *)id;
9014
9015	lck_mtx_lock(&dtrace_meta_lock);
9016	lck_mtx_lock(&dtrace_lock);
9017
9018	if (old == dtrace_meta_pid) {
9019		pp = &dtrace_meta_pid;
9020	} else {
9021		panic("attempt to unregister non-existent "
9022		    "dtrace meta-provider %p\n", (void *)old);
9023	}
9024
9025	if (old->dtm_count != 0) {
9026		lck_mtx_unlock(&dtrace_lock);
9027		lck_mtx_unlock(&dtrace_meta_lock);
9028		return (EBUSY);
9029	}
9030
9031	*pp = NULL;
9032
9033	lck_mtx_unlock(&dtrace_lock);
9034	lck_mtx_unlock(&dtrace_meta_lock);
9035
9036	kmem_free(old->dtm_name, strlen(old->dtm_name) + 1);
9037	kmem_free(old, sizeof (dtrace_meta_t));
9038
9039	return (0);
9040}
9041
9042
9043/*
9044 * DTrace DIF Object Functions
9045 */
9046static int
9047dtrace_difo_err(uint_t pc, const char *format, ...)
9048{
9049	if (dtrace_err_verbose) {
9050		va_list alist;
9051
9052		(void) uprintf("dtrace DIF object error: [%u]: ", pc);
9053		va_start(alist, format);
9054		(void) vuprintf(format, alist);
9055		va_end(alist);
9056	}
9057
9058#ifdef DTRACE_ERRDEBUG
9059	dtrace_errdebug(format);
9060#endif
9061	return (1);
9062}
9063
9064/*
9065 * Validate a DTrace DIF object by checking the IR instructions.  The following
9066 * rules are currently enforced by dtrace_difo_validate():
9067 *
9068 * 1. Each instruction must have a valid opcode
9069 * 2. Each register, string, variable, or subroutine reference must be valid
9070 * 3. No instruction can modify register %r0 (must be zero)
9071 * 4. All instruction reserved bits must be set to zero
9072 * 5. The last instruction must be a "ret" instruction
9073 * 6. All branch targets must reference a valid instruction _after_ the branch
9074 */
9075static int
9076dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs,
9077    cred_t *cr)
9078{
9079#if !defined(__APPLE__)  /* Quiet compiler warnings */
9080	int err = 0, i;
9081#else
9082	int err = 0;
9083	uint_t i;
9084#endif /* __APPLE__ */
9085	int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
9086	int kcheckload;
9087	uint_t pc;
9088
9089	kcheckload = cr == NULL ||
9090	    (vstate->dtvs_state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) == 0;
9091
9092	dp->dtdo_destructive = 0;
9093
9094	for (pc = 0; pc < dp->dtdo_len && err == 0; pc++) {
9095		dif_instr_t instr = dp->dtdo_buf[pc];
9096
9097		uint_t r1 = DIF_INSTR_R1(instr);
9098		uint_t r2 = DIF_INSTR_R2(instr);
9099		uint_t rd = DIF_INSTR_RD(instr);
9100		uint_t rs = DIF_INSTR_RS(instr);
9101		uint_t label = DIF_INSTR_LABEL(instr);
9102		uint_t v = DIF_INSTR_VAR(instr);
9103		uint_t subr = DIF_INSTR_SUBR(instr);
9104		uint_t type = DIF_INSTR_TYPE(instr);
9105		uint_t op = DIF_INSTR_OP(instr);
9106
9107		switch (op) {
9108		case DIF_OP_OR:
9109		case DIF_OP_XOR:
9110		case DIF_OP_AND:
9111		case DIF_OP_SLL:
9112		case DIF_OP_SRL:
9113		case DIF_OP_SRA:
9114		case DIF_OP_SUB:
9115		case DIF_OP_ADD:
9116		case DIF_OP_MUL:
9117		case DIF_OP_SDIV:
9118		case DIF_OP_UDIV:
9119		case DIF_OP_SREM:
9120		case DIF_OP_UREM:
9121		case DIF_OP_COPYS:
9122			if (r1 >= nregs)
9123				err += efunc(pc, "invalid register %u\n", r1);
9124			if (r2 >= nregs)
9125				err += efunc(pc, "invalid register %u\n", r2);
9126			if (rd >= nregs)
9127				err += efunc(pc, "invalid register %u\n", rd);
9128			if (rd == 0)
9129				err += efunc(pc, "cannot write to %r0\n");
9130			break;
9131		case DIF_OP_NOT:
9132		case DIF_OP_MOV:
9133		case DIF_OP_ALLOCS:
9134			if (r1 >= nregs)
9135				err += efunc(pc, "invalid register %u\n", r1);
9136			if (r2 != 0)
9137				err += efunc(pc, "non-zero reserved bits\n");
9138			if (rd >= nregs)
9139				err += efunc(pc, "invalid register %u\n", rd);
9140			if (rd == 0)
9141				err += efunc(pc, "cannot write to %r0\n");
9142			break;
9143		case DIF_OP_LDSB:
9144		case DIF_OP_LDSH:
9145		case DIF_OP_LDSW:
9146		case DIF_OP_LDUB:
9147		case DIF_OP_LDUH:
9148		case DIF_OP_LDUW:
9149		case DIF_OP_LDX:
9150			if (r1 >= nregs)
9151				err += efunc(pc, "invalid register %u\n", r1);
9152			if (r2 != 0)
9153				err += efunc(pc, "non-zero reserved bits\n");
9154			if (rd >= nregs)
9155				err += efunc(pc, "invalid register %u\n", rd);
9156			if (rd == 0)
9157				err += efunc(pc, "cannot write to %r0\n");
9158			if (kcheckload)
9159				dp->dtdo_buf[pc] = DIF_INSTR_LOAD(op +
9160				    DIF_OP_RLDSB - DIF_OP_LDSB, r1, rd);
9161			break;
9162		case DIF_OP_RLDSB:
9163		case DIF_OP_RLDSH:
9164		case DIF_OP_RLDSW:
9165		case DIF_OP_RLDUB:
9166		case DIF_OP_RLDUH:
9167		case DIF_OP_RLDUW:
9168		case DIF_OP_RLDX:
9169			if (r1 >= nregs)
9170				err += efunc(pc, "invalid register %u\n", r1);
9171			if (r2 != 0)
9172				err += efunc(pc, "non-zero reserved bits\n");
9173			if (rd >= nregs)
9174				err += efunc(pc, "invalid register %u\n", rd);
9175			if (rd == 0)
9176				err += efunc(pc, "cannot write to %r0\n");
9177			break;
9178		case DIF_OP_ULDSB:
9179		case DIF_OP_ULDSH:
9180		case DIF_OP_ULDSW:
9181		case DIF_OP_ULDUB:
9182		case DIF_OP_ULDUH:
9183		case DIF_OP_ULDUW:
9184		case DIF_OP_ULDX:
9185			if (r1 >= nregs)
9186				err += efunc(pc, "invalid register %u\n", r1);
9187			if (r2 != 0)
9188				err += efunc(pc, "non-zero reserved bits\n");
9189			if (rd >= nregs)
9190				err += efunc(pc, "invalid register %u\n", rd);
9191			if (rd == 0)
9192				err += efunc(pc, "cannot write to %r0\n");
9193			break;
9194		case DIF_OP_STB:
9195		case DIF_OP_STH:
9196		case DIF_OP_STW:
9197		case DIF_OP_STX:
9198			if (r1 >= nregs)
9199				err += efunc(pc, "invalid register %u\n", r1);
9200			if (r2 != 0)
9201				err += efunc(pc, "non-zero reserved bits\n");
9202			if (rd >= nregs)
9203				err += efunc(pc, "invalid register %u\n", rd);
9204			if (rd == 0)
9205				err += efunc(pc, "cannot write to 0 address\n");
9206			break;
9207		case DIF_OP_CMP:
9208		case DIF_OP_SCMP:
9209			if (r1 >= nregs)
9210				err += efunc(pc, "invalid register %u\n", r1);
9211			if (r2 >= nregs)
9212				err += efunc(pc, "invalid register %u\n", r2);
9213			if (rd != 0)
9214				err += efunc(pc, "non-zero reserved bits\n");
9215			break;
9216		case DIF_OP_TST:
9217			if (r1 >= nregs)
9218				err += efunc(pc, "invalid register %u\n", r1);
9219			if (r2 != 0 || rd != 0)
9220				err += efunc(pc, "non-zero reserved bits\n");
9221			break;
9222		case DIF_OP_BA:
9223		case DIF_OP_BE:
9224		case DIF_OP_BNE:
9225		case DIF_OP_BG:
9226		case DIF_OP_BGU:
9227		case DIF_OP_BGE:
9228		case DIF_OP_BGEU:
9229		case DIF_OP_BL:
9230		case DIF_OP_BLU:
9231		case DIF_OP_BLE:
9232		case DIF_OP_BLEU:
9233			if (label >= dp->dtdo_len) {
9234				err += efunc(pc, "invalid branch target %u\n",
9235				    label);
9236			}
9237			if (label <= pc) {
9238				err += efunc(pc, "backward branch to %u\n",
9239				    label);
9240			}
9241			break;
9242		case DIF_OP_RET:
9243			if (r1 != 0 || r2 != 0)
9244				err += efunc(pc, "non-zero reserved bits\n");
9245			if (rd >= nregs)
9246				err += efunc(pc, "invalid register %u\n", rd);
9247			break;
9248		case DIF_OP_NOP:
9249		case DIF_OP_POPTS:
9250		case DIF_OP_FLUSHTS:
9251			if (r1 != 0 || r2 != 0 || rd != 0)
9252				err += efunc(pc, "non-zero reserved bits\n");
9253			break;
9254		case DIF_OP_SETX:
9255			if (DIF_INSTR_INTEGER(instr) >= dp->dtdo_intlen) {
9256				err += efunc(pc, "invalid integer ref %u\n",
9257				    DIF_INSTR_INTEGER(instr));
9258			}
9259			if (rd >= nregs)
9260				err += efunc(pc, "invalid register %u\n", rd);
9261			if (rd == 0)
9262				err += efunc(pc, "cannot write to %r0\n");
9263			break;
9264		case DIF_OP_SETS:
9265			if (DIF_INSTR_STRING(instr) >= dp->dtdo_strlen) {
9266				err += efunc(pc, "invalid string ref %u\n",
9267				    DIF_INSTR_STRING(instr));
9268			}
9269			if (rd >= nregs)
9270				err += efunc(pc, "invalid register %u\n", rd);
9271			if (rd == 0)
9272				err += efunc(pc, "cannot write to %r0\n");
9273			break;
9274		case DIF_OP_LDGA:
9275		case DIF_OP_LDTA:
9276			if (r1 > DIF_VAR_ARRAY_MAX)
9277				err += efunc(pc, "invalid array %u\n", r1);
9278			if (r2 >= nregs)
9279				err += efunc(pc, "invalid register %u\n", r2);
9280			if (rd >= nregs)
9281				err += efunc(pc, "invalid register %u\n", rd);
9282			if (rd == 0)
9283				err += efunc(pc, "cannot write to %r0\n");
9284			break;
9285		case DIF_OP_LDGS:
9286		case DIF_OP_LDTS:
9287		case DIF_OP_LDLS:
9288		case DIF_OP_LDGAA:
9289		case DIF_OP_LDTAA:
9290			if (v < DIF_VAR_OTHER_MIN || v > DIF_VAR_OTHER_MAX)
9291				err += efunc(pc, "invalid variable %u\n", v);
9292			if (rd >= nregs)
9293				err += efunc(pc, "invalid register %u\n", rd);
9294			if (rd == 0)
9295				err += efunc(pc, "cannot write to %r0\n");
9296			break;
9297		case DIF_OP_STGS:
9298		case DIF_OP_STTS:
9299		case DIF_OP_STLS:
9300		case DIF_OP_STGAA:
9301		case DIF_OP_STTAA:
9302			if (v < DIF_VAR_OTHER_UBASE || v > DIF_VAR_OTHER_MAX)
9303				err += efunc(pc, "invalid variable %u\n", v);
9304			if (rs >= nregs)
9305				err += efunc(pc, "invalid register %u\n", rs);
9306			break;
9307		case DIF_OP_CALL:
9308			if (subr > DIF_SUBR_MAX)
9309				err += efunc(pc, "invalid subr %u\n", subr);
9310			if (rd >= nregs)
9311				err += efunc(pc, "invalid register %u\n", rd);
9312			if (rd == 0)
9313				err += efunc(pc, "cannot write to %r0\n");
9314
9315			if (subr == DIF_SUBR_COPYOUT ||
9316			    subr == DIF_SUBR_COPYOUTSTR) {
9317				dp->dtdo_destructive = 1;
9318			}
9319			break;
9320		case DIF_OP_PUSHTR:
9321			if (type != DIF_TYPE_STRING && type != DIF_TYPE_CTF)
9322				err += efunc(pc, "invalid ref type %u\n", type);
9323			if (r2 >= nregs)
9324				err += efunc(pc, "invalid register %u\n", r2);
9325			if (rs >= nregs)
9326				err += efunc(pc, "invalid register %u\n", rs);
9327			break;
9328		case DIF_OP_PUSHTV:
9329			if (type != DIF_TYPE_CTF)
9330				err += efunc(pc, "invalid val type %u\n", type);
9331			if (r2 >= nregs)
9332				err += efunc(pc, "invalid register %u\n", r2);
9333			if (rs >= nregs)
9334				err += efunc(pc, "invalid register %u\n", rs);
9335			break;
9336		default:
9337			err += efunc(pc, "invalid opcode %u\n",
9338			    DIF_INSTR_OP(instr));
9339		}
9340	}
9341
9342	if (dp->dtdo_len != 0 &&
9343	    DIF_INSTR_OP(dp->dtdo_buf[dp->dtdo_len - 1]) != DIF_OP_RET) {
9344		err += efunc(dp->dtdo_len - 1,
9345		    "expected 'ret' as last DIF instruction\n");
9346	}
9347
9348	if (!(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF)) {
9349		/*
9350		 * If we're not returning by reference, the size must be either
9351		 * 0 or the size of one of the base types.
9352		 */
9353		switch (dp->dtdo_rtype.dtdt_size) {
9354		case 0:
9355		case sizeof (uint8_t):
9356		case sizeof (uint16_t):
9357		case sizeof (uint32_t):
9358		case sizeof (uint64_t):
9359			break;
9360
9361		default:
9362			err += efunc(dp->dtdo_len - 1, "bad return size\n");
9363		}
9364	}
9365
9366	for (i = 0; i < dp->dtdo_varlen && err == 0; i++) {
9367		dtrace_difv_t *v = &dp->dtdo_vartab[i], *existing = NULL;
9368		dtrace_diftype_t *vt, *et;
9369#if !defined(__APPLE__) /* Quiet compiler warnings */
9370		uint_t id, ndx;
9371#else
9372		uint_t id;
9373		int ndx;
9374#endif /* __APPLE__ */
9375
9376		if (v->dtdv_scope != DIFV_SCOPE_GLOBAL &&
9377		    v->dtdv_scope != DIFV_SCOPE_THREAD &&
9378		    v->dtdv_scope != DIFV_SCOPE_LOCAL) {
9379			err += efunc(i, "unrecognized variable scope %d\n",
9380			    v->dtdv_scope);
9381			break;
9382		}
9383
9384		if (v->dtdv_kind != DIFV_KIND_ARRAY &&
9385		    v->dtdv_kind != DIFV_KIND_SCALAR) {
9386			err += efunc(i, "unrecognized variable type %d\n",
9387			    v->dtdv_kind);
9388			break;
9389		}
9390
9391		if ((id = v->dtdv_id) > DIF_VARIABLE_MAX) {
9392			err += efunc(i, "%d exceeds variable id limit\n", id);
9393			break;
9394		}
9395
9396		if (id < DIF_VAR_OTHER_UBASE)
9397			continue;
9398
9399		/*
9400		 * For user-defined variables, we need to check that this
9401		 * definition is identical to any previous definition that we
9402		 * encountered.
9403		 */
9404		ndx = id - DIF_VAR_OTHER_UBASE;
9405
9406		switch (v->dtdv_scope) {
9407		case DIFV_SCOPE_GLOBAL:
9408			if (ndx < vstate->dtvs_nglobals) {
9409				dtrace_statvar_t *svar;
9410
9411				if ((svar = vstate->dtvs_globals[ndx]) != NULL)
9412					existing = &svar->dtsv_var;
9413			}
9414
9415			break;
9416
9417		case DIFV_SCOPE_THREAD:
9418			if (ndx < vstate->dtvs_ntlocals)
9419				existing = &vstate->dtvs_tlocals[ndx];
9420			break;
9421
9422		case DIFV_SCOPE_LOCAL:
9423			if (ndx < vstate->dtvs_nlocals) {
9424				dtrace_statvar_t *svar;
9425
9426				if ((svar = vstate->dtvs_locals[ndx]) != NULL)
9427					existing = &svar->dtsv_var;
9428			}
9429
9430			break;
9431		}
9432
9433		vt = &v->dtdv_type;
9434
9435		if (vt->dtdt_flags & DIF_TF_BYREF) {
9436			if (vt->dtdt_size == 0) {
9437				err += efunc(i, "zero-sized variable\n");
9438				break;
9439			}
9440
9441			if (v->dtdv_scope == DIFV_SCOPE_GLOBAL &&
9442			    vt->dtdt_size > dtrace_global_maxsize) {
9443				err += efunc(i, "oversized by-ref global\n");
9444				break;
9445			}
9446		}
9447
9448		if (existing == NULL || existing->dtdv_id == 0)
9449			continue;
9450
9451		ASSERT(existing->dtdv_id == v->dtdv_id);
9452		ASSERT(existing->dtdv_scope == v->dtdv_scope);
9453
9454		if (existing->dtdv_kind != v->dtdv_kind)
9455			err += efunc(i, "%d changed variable kind\n", id);
9456
9457		et = &existing->dtdv_type;
9458
9459		if (vt->dtdt_flags != et->dtdt_flags) {
9460			err += efunc(i, "%d changed variable type flags\n", id);
9461			break;
9462		}
9463
9464		if (vt->dtdt_size != 0 && vt->dtdt_size != et->dtdt_size) {
9465			err += efunc(i, "%d changed variable type size\n", id);
9466			break;
9467		}
9468	}
9469
9470	return (err);
9471}
9472
9473/*
9474 * Validate a DTrace DIF object that is to be used as a helper.  Helpers
9475 * are much more constrained than normal DIFOs.  Specifically, they may
9476 * not:
9477 *
9478 * 1. Make calls to subroutines other than copyin(), copyinstr() or
9479 *    miscellaneous string routines
9480 * 2. Access DTrace variables other than the args[] array, and the
9481 *    curthread, pid, ppid, tid, execname, zonename, uid and gid variables.
9482 * 3. Have thread-local variables.
9483 * 4. Have dynamic variables.
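 *
 * (So, for example, a helper expression may call copyin(), copyinstr() or
 * strjoin(), but any load or store of a dynamic (thread-local or
 * associative) variable is rejected by the checks below.)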
9484 */
9485static int
9486dtrace_difo_validate_helper(dtrace_difo_t *dp)
9487{
9488	int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
9489	int err = 0;
9490	uint_t pc;
9491
9492	for (pc = 0; pc < dp->dtdo_len; pc++) {
9493		dif_instr_t instr = dp->dtdo_buf[pc];
9494
9495		uint_t v = DIF_INSTR_VAR(instr);
9496		uint_t subr = DIF_INSTR_SUBR(instr);
9497		uint_t op = DIF_INSTR_OP(instr);
9498
9499		switch (op) {
9500		case DIF_OP_OR:
9501		case DIF_OP_XOR:
9502		case DIF_OP_AND:
9503		case DIF_OP_SLL:
9504		case DIF_OP_SRL:
9505		case DIF_OP_SRA:
9506		case DIF_OP_SUB:
9507		case DIF_OP_ADD:
9508		case DIF_OP_MUL:
9509		case DIF_OP_SDIV:
9510		case DIF_OP_UDIV:
9511		case DIF_OP_SREM:
9512		case DIF_OP_UREM:
9513		case DIF_OP_COPYS:
9514		case DIF_OP_NOT:
9515		case DIF_OP_MOV:
9516		case DIF_OP_RLDSB:
9517		case DIF_OP_RLDSH:
9518		case DIF_OP_RLDSW:
9519		case DIF_OP_RLDUB:
9520		case DIF_OP_RLDUH:
9521		case DIF_OP_RLDUW:
9522		case DIF_OP_RLDX:
9523		case DIF_OP_ULDSB:
9524		case DIF_OP_ULDSH:
9525		case DIF_OP_ULDSW:
9526		case DIF_OP_ULDUB:
9527		case DIF_OP_ULDUH:
9528		case DIF_OP_ULDUW:
9529		case DIF_OP_ULDX:
9530		case DIF_OP_STB:
9531		case DIF_OP_STH:
9532		case DIF_OP_STW:
9533		case DIF_OP_STX:
9534		case DIF_OP_ALLOCS:
9535		case DIF_OP_CMP:
9536		case DIF_OP_SCMP:
9537		case DIF_OP_TST:
9538		case DIF_OP_BA:
9539		case DIF_OP_BE:
9540		case DIF_OP_BNE:
9541		case DIF_OP_BG:
9542		case DIF_OP_BGU:
9543		case DIF_OP_BGE:
9544		case DIF_OP_BGEU:
9545		case DIF_OP_BL:
9546		case DIF_OP_BLU:
9547		case DIF_OP_BLE:
9548		case DIF_OP_BLEU:
9549		case DIF_OP_RET:
9550		case DIF_OP_NOP:
9551		case DIF_OP_POPTS:
9552		case DIF_OP_FLUSHTS:
9553		case DIF_OP_SETX:
9554		case DIF_OP_SETS:
9555		case DIF_OP_LDGA:
9556		case DIF_OP_LDLS:
9557		case DIF_OP_STGS:
9558		case DIF_OP_STLS:
9559		case DIF_OP_PUSHTR:
9560		case DIF_OP_PUSHTV:
9561			break;
9562
9563		case DIF_OP_LDGS:
9564			if (v >= DIF_VAR_OTHER_UBASE)
9565				break;
9566
9567			if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9)
9568				break;
9569
9570			if (v == DIF_VAR_CURTHREAD || v == DIF_VAR_PID ||
9571			    v == DIF_VAR_PPID || v == DIF_VAR_TID ||
9572			    v == DIF_VAR_EXECNAME || v == DIF_VAR_ZONENAME ||
9573			    v == DIF_VAR_UID || v == DIF_VAR_GID)
9574				break;
9575
9576			err += efunc(pc, "illegal variable %u\n", v);
9577			break;
9578
9579		case DIF_OP_LDTA:
9580		case DIF_OP_LDTS:
9581		case DIF_OP_LDGAA:
9582		case DIF_OP_LDTAA:
9583			err += efunc(pc, "illegal dynamic variable load\n");
9584			break;
9585
9586		case DIF_OP_STTS:
9587		case DIF_OP_STGAA:
9588		case DIF_OP_STTAA:
9589			err += efunc(pc, "illegal dynamic variable store\n");
9590			break;
9591
9592		case DIF_OP_CALL:
9593			if (subr == DIF_SUBR_ALLOCA ||
9594			    subr == DIF_SUBR_BCOPY ||
9595			    subr == DIF_SUBR_COPYIN ||
9596			    subr == DIF_SUBR_COPYINTO ||
9597			    subr == DIF_SUBR_COPYINSTR ||
9598			    subr == DIF_SUBR_INDEX ||
9599			    subr == DIF_SUBR_INET_NTOA ||
9600			    subr == DIF_SUBR_INET_NTOA6 ||
9601			    subr == DIF_SUBR_INET_NTOP ||
9602			    subr == DIF_SUBR_LLTOSTR ||
9603			    subr == DIF_SUBR_RINDEX ||
9604			    subr == DIF_SUBR_STRCHR ||
9605			    subr == DIF_SUBR_STRJOIN ||
9606			    subr == DIF_SUBR_STRRCHR ||
9607			    subr == DIF_SUBR_STRSTR ||
9608#if defined(__APPLE__)
9609			    subr == DIF_SUBR_COREPROFILE ||
9610#endif /* __APPLE__ */
9611			    subr == DIF_SUBR_HTONS ||
9612			    subr == DIF_SUBR_HTONL ||
9613			    subr == DIF_SUBR_HTONLL ||
9614			    subr == DIF_SUBR_NTOHS ||
9615			    subr == DIF_SUBR_NTOHL ||
9616			    subr == DIF_SUBR_NTOHLL)
9617				break;
9618
9619			err += efunc(pc, "invalid subr %u\n", subr);
9620			break;
9621
9622		default:
9623			err += efunc(pc, "invalid opcode %u\n",
9624			    DIF_INSTR_OP(instr));
9625		}
9626	}
9627
9628	return (err);
9629}
9630
9631/*
9632 * Returns 1 if the expression in the DIF object can be cached on a per-thread
9633 * basis; 0 if not.
9634 */
9635static int
9636dtrace_difo_cacheable(dtrace_difo_t *dp)
9637{
9638#if !defined(__APPLE__) /* Quiet compiler warnings */
9639	int i;
9640#else
9641	uint_t i;
9642#endif /* __APPLE__ */
9643
9644	if (dp == NULL)
9645		return (0);
9646
9647	for (i = 0; i < dp->dtdo_varlen; i++) {
9648		dtrace_difv_t *v = &dp->dtdo_vartab[i];
9649
9650		if (v->dtdv_scope != DIFV_SCOPE_GLOBAL)
9651			continue;
9652
9653		switch (v->dtdv_id) {
9654		case DIF_VAR_CURTHREAD:
9655		case DIF_VAR_PID:
9656		case DIF_VAR_TID:
9657		case DIF_VAR_EXECNAME:
9658		case DIF_VAR_ZONENAME:
9659			break;
9660
9661		default:
9662			return (0);
9663		}
9664	}
9665
9666	/*
9667	 * This DIF object may be cacheable.  Now we need to look for any
9668	 * array loading instructions, any memory loading instructions, or
9669	 * any stores to thread-local variables.
9670	 */
9671	for (i = 0; i < dp->dtdo_len; i++) {
9672		uint_t op = DIF_INSTR_OP(dp->dtdo_buf[i]);
9673
9674		if ((op >= DIF_OP_LDSB && op <= DIF_OP_LDX) ||
9675		    (op >= DIF_OP_ULDSB && op <= DIF_OP_ULDX) ||
9676		    (op >= DIF_OP_RLDSB && op <= DIF_OP_RLDX) ||
9677		    op == DIF_OP_LDGA || op == DIF_OP_STTS)
9678			return (0);
9679	}
9680
9681	return (1);
9682}
9683
9684static void
9685dtrace_difo_hold(dtrace_difo_t *dp)
9686{
9687#if !defined(__APPLE__) /* Quiet compiler warnings */
9688	int i;
9689#else
9690	uint_t i;
9691#endif /* __APPLE__ */
9692
9693	lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
9694
9695	dp->dtdo_refcnt++;
9696	ASSERT(dp->dtdo_refcnt != 0);
9697
9698	/*
9699	 * We need to check this DIF object for references to the variable
9700	 * DIF_VAR_VTIMESTAMP.
9701	 */
9702	for (i = 0; i < dp->dtdo_varlen; i++) {
9703		dtrace_difv_t *v = &dp->dtdo_vartab[i];
9704
9705		if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
9706			continue;
9707
9708		if (dtrace_vtime_references++ == 0)
9709			dtrace_vtime_enable();
9710	}
9711}
9712
9713/*
9714 * This routine calculates the dynamic variable chunksize for a given DIF
9715 * object.  The calculation is not fool-proof, and can probably be tricked by
9716 * malicious DIF -- but it works for all compiler-generated DIF.  Because this
9717 * calculation is likely imperfect, dtrace_dynvar() is able to gracefully fail
9718 * if a dynamic variable size exceeds the chunksize.
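 *
 * To illustrate the arithmetic below (an informal sketch, not a spec):  a
 * store to a thread-local associative array keyed by a single string
 * typically compiles to a "pushtr" of the key followed by an "sttaa".  The
 * code below then sees three keys (the string key plus two zero-sized keys
 * added for "sttaa"), so the chunk must cover sizeof (dtrace_dynvar_t), two
 * additional dtrace_key_t slots, the key size (the default string size if
 * no explicit size register was set) rounded up to eight bytes, and the
 * size of the stored value -- all rounded up to an eight-byte multiple.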
9719 */
9720static void
9721dtrace_difo_chunksize(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
9722{
9723#if !defined(__APPLE__) /* Quiet compiler warnings */
9724	uint64_t sval;
9725#else
9726	uint64_t sval = 0;
9727#endif /* __APPLE__ */
9728	dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
9729	const dif_instr_t *text = dp->dtdo_buf;
9730	uint_t pc, srd = 0;
9731	uint_t ttop = 0;
9732	size_t size, ksize;
9733	uint_t id, i;
9734
9735	for (pc = 0; pc < dp->dtdo_len; pc++) {
9736		dif_instr_t instr = text[pc];
9737		uint_t op = DIF_INSTR_OP(instr);
9738		uint_t rd = DIF_INSTR_RD(instr);
9739		uint_t r1 = DIF_INSTR_R1(instr);
9740		uint_t nkeys = 0;
9741		uchar_t scope;
9742
9743		dtrace_key_t *key = tupregs;
9744
9745		switch (op) {
9746		case DIF_OP_SETX:
9747			sval = dp->dtdo_inttab[DIF_INSTR_INTEGER(instr)];
9748			srd = rd;
9749			continue;
9750
9751		case DIF_OP_STTS:
9752			key = &tupregs[DIF_DTR_NREGS];
9753			key[0].dttk_size = 0;
9754			key[1].dttk_size = 0;
9755			nkeys = 2;
9756			scope = DIFV_SCOPE_THREAD;
9757			break;
9758
9759		case DIF_OP_STGAA:
9760		case DIF_OP_STTAA:
9761			nkeys = ttop;
9762
9763			if (DIF_INSTR_OP(instr) == DIF_OP_STTAA)
9764				key[nkeys++].dttk_size = 0;
9765
9766			key[nkeys++].dttk_size = 0;
9767
9768			if (op == DIF_OP_STTAA) {
9769				scope = DIFV_SCOPE_THREAD;
9770			} else {
9771				scope = DIFV_SCOPE_GLOBAL;
9772			}
9773
9774			break;
9775
9776		case DIF_OP_PUSHTR:
9777			if (ttop == DIF_DTR_NREGS)
9778				return;
9779
9780			if ((srd == 0 || sval == 0) && r1 == DIF_TYPE_STRING) {
9781				/*
9782				 * If the register for the size of the "pushtr"
9783				 * is %r0 (or the value is 0) and the type is
9784				 * a string, we'll use the system-wide default
9785				 * string size.
9786				 */
9787				tupregs[ttop++].dttk_size =
9788				    dtrace_strsize_default;
9789			} else {
9790				if (srd == 0)
9791					return;
9792
9793				tupregs[ttop++].dttk_size = sval;
9794			}
9795
9796			break;
9797
9798		case DIF_OP_PUSHTV:
9799			if (ttop == DIF_DTR_NREGS)
9800				return;
9801
9802			tupregs[ttop++].dttk_size = 0;
9803			break;
9804
9805		case DIF_OP_FLUSHTS:
9806			ttop = 0;
9807			break;
9808
9809		case DIF_OP_POPTS:
9810			if (ttop != 0)
9811				ttop--;
9812			break;
9813		}
9814
9815		sval = 0;
9816		srd = 0;
9817
9818		if (nkeys == 0)
9819			continue;
9820
9821		/*
9822		 * We have a dynamic variable allocation; calculate its size.
9823		 */
9824		for (ksize = 0, i = 0; i < nkeys; i++)
9825			ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
9826
9827		size = sizeof (dtrace_dynvar_t);
9828		size += sizeof (dtrace_key_t) * (nkeys - 1);
9829		size += ksize;
9830
9831		/*
9832		 * Now we need to determine the size of the stored data.
9833		 */
9834		id = DIF_INSTR_VAR(instr);
9835
9836		for (i = 0; i < dp->dtdo_varlen; i++) {
9837			dtrace_difv_t *v = &dp->dtdo_vartab[i];
9838
9839			if (v->dtdv_id == id && v->dtdv_scope == scope) {
9840				size += v->dtdv_type.dtdt_size;
9841				break;
9842			}
9843		}
9844
9845		if (i == dp->dtdo_varlen)
9846			return;
9847
9848		/*
9849		 * We have the size.  If this is larger than the chunk size
9850		 * for our dynamic variable state, reset the chunk size.
9851		 */
9852		size = P2ROUNDUP(size, sizeof (uint64_t));
9853
9854		if (size > vstate->dtvs_dynvars.dtds_chunksize)
9855			vstate->dtvs_dynvars.dtds_chunksize = size;
9856	}
9857}
9858
9859static void
9860dtrace_difo_init(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
9861{
9862#if !defined(__APPLE__) /* Quiet compiler warnings */
9863	int i, oldsvars, osz, nsz, otlocals, ntlocals;
9864	uint_t id;
9865#else
9866	int oldsvars, osz, nsz, otlocals, ntlocals;
9867	uint_t i, id;
9868#endif /* __APPLE__ */
9869
9870	lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
9871	ASSERT(dp->dtdo_buf != NULL && dp->dtdo_len != 0);
9872
9873	for (i = 0; i < dp->dtdo_varlen; i++) {
9874		dtrace_difv_t *v = &dp->dtdo_vartab[i];
9875#if !defined(__APPLE__) /* Quiet compiler warnings */
9876		dtrace_statvar_t *svar, ***svarp;
9877#else
9878		dtrace_statvar_t *svar;
9879		dtrace_statvar_t ***svarp = NULL;
9880#endif /* __APPLE__ */
9881		size_t dsize = 0;
9882		uint8_t scope = v->dtdv_scope;
9883		int *np = (int *)NULL;
9884
9885		if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
9886			continue;
9887
9888		id -= DIF_VAR_OTHER_UBASE;
9889
9890		switch (scope) {
9891		case DIFV_SCOPE_THREAD:
9892#if !defined(__APPLE__) /* Quiet compiler warnings */
9893			while (id >= (otlocals = vstate->dtvs_ntlocals)) {
9894#else
9895			while (id >= (uint_t)(otlocals = vstate->dtvs_ntlocals)) {
9896#endif /* __APPLE__ */
9897				dtrace_difv_t *tlocals;
9898
9899				if ((ntlocals = (otlocals << 1)) == 0)
9900					ntlocals = 1;
9901
9902				osz = otlocals * sizeof (dtrace_difv_t);
9903				nsz = ntlocals * sizeof (dtrace_difv_t);
9904
9905				tlocals = kmem_zalloc(nsz, KM_SLEEP);
9906
9907				if (osz != 0) {
9908					bcopy(vstate->dtvs_tlocals,
9909					    tlocals, osz);
9910					kmem_free(vstate->dtvs_tlocals, osz);
9911				}
9912
9913				vstate->dtvs_tlocals = tlocals;
9914				vstate->dtvs_ntlocals = ntlocals;
9915			}
9916
9917			vstate->dtvs_tlocals[id] = *v;
9918			continue;
9919
9920		case DIFV_SCOPE_LOCAL:
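			/*
			 * Clause-local ("this->") variables are backed by
			 * per-CPU storage -- hence the NCPU multiplier in
			 * the size calculations below.
			 */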
9921			np = &vstate->dtvs_nlocals;
9922			svarp = &vstate->dtvs_locals;
9923
9924			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
9925				dsize = (int)NCPU * (v->dtdv_type.dtdt_size +
9926				    sizeof (uint64_t));
9927			else
9928				dsize = (int)NCPU * sizeof (uint64_t);
9929
9930			break;
9931
9932		case DIFV_SCOPE_GLOBAL:
9933			np = &vstate->dtvs_nglobals;
9934			svarp = &vstate->dtvs_globals;
9935
9936			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
9937				dsize = v->dtdv_type.dtdt_size +
9938				    sizeof (uint64_t);
9939
9940			break;
9941
9942		default:
9943			ASSERT(0);
9944		}
9945
9946#if !defined(__APPLE__) /* Quiet compiler warnings */
9947		while (id >= (oldsvars = *np)) {
9948#else
9949		while (id >= (uint_t)(oldsvars = *np)) {
9950#endif /* __APPLE__ */
9951			dtrace_statvar_t **statics;
9952			int newsvars, oldsize, newsize;
9953
9954			if ((newsvars = (oldsvars << 1)) == 0)
9955				newsvars = 1;
9956
9957			oldsize = oldsvars * sizeof (dtrace_statvar_t *);
9958			newsize = newsvars * sizeof (dtrace_statvar_t *);
9959
9960			statics = kmem_zalloc(newsize, KM_SLEEP);
9961
9962			if (oldsize != 0) {
9963				bcopy(*svarp, statics, oldsize);
9964				kmem_free(*svarp, oldsize);
9965			}
9966
9967			*svarp = statics;
9968			*np = newsvars;
9969		}
9970
9971		if ((svar = (*svarp)[id]) == NULL) {
9972			svar = kmem_zalloc(sizeof (dtrace_statvar_t), KM_SLEEP);
9973			svar->dtsv_var = *v;
9974
9975			if ((svar->dtsv_size = dsize) != 0) {
9976				svar->dtsv_data = (uint64_t)(uintptr_t)
9977				    kmem_zalloc(dsize, KM_SLEEP);
9978			}
9979
9980			(*svarp)[id] = svar;
9981		}
9982
9983		svar->dtsv_refcnt++;
9984	}
9985
9986	dtrace_difo_chunksize(dp, vstate);
9987	dtrace_difo_hold(dp);
9988}
9989
9990static dtrace_difo_t *
9991dtrace_difo_duplicate(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
9992{
9993	dtrace_difo_t *new;
9994	size_t sz;
9995
9996	ASSERT(dp->dtdo_buf != NULL);
9997	ASSERT(dp->dtdo_refcnt != 0);
9998
9999	new = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
10000
10001	ASSERT(dp->dtdo_buf != NULL);
10002	sz = dp->dtdo_len * sizeof (dif_instr_t);
10003	new->dtdo_buf = kmem_alloc(sz, KM_SLEEP);
10004	bcopy(dp->dtdo_buf, new->dtdo_buf, sz);
10005	new->dtdo_len = dp->dtdo_len;
10006
10007	if (dp->dtdo_strtab != NULL) {
10008		ASSERT(dp->dtdo_strlen != 0);
10009		new->dtdo_strtab = kmem_alloc(dp->dtdo_strlen, KM_SLEEP);
10010		bcopy(dp->dtdo_strtab, new->dtdo_strtab, dp->dtdo_strlen);
10011		new->dtdo_strlen = dp->dtdo_strlen;
10012	}
10013
10014	if (dp->dtdo_inttab != NULL) {
10015		ASSERT(dp->dtdo_intlen != 0);
10016		sz = dp->dtdo_intlen * sizeof (uint64_t);
10017		new->dtdo_inttab = kmem_alloc(sz, KM_SLEEP);
10018		bcopy(dp->dtdo_inttab, new->dtdo_inttab, sz);
10019		new->dtdo_intlen = dp->dtdo_intlen;
10020	}
10021
10022	if (dp->dtdo_vartab != NULL) {
10023		ASSERT(dp->dtdo_varlen != 0);
10024		sz = dp->dtdo_varlen * sizeof (dtrace_difv_t);
10025		new->dtdo_vartab = kmem_alloc(sz, KM_SLEEP);
10026		bcopy(dp->dtdo_vartab, new->dtdo_vartab, sz);
10027		new->dtdo_varlen = dp->dtdo_varlen;
10028	}
10029
10030	dtrace_difo_init(new, vstate);
10031	return (new);
10032}
10033
10034static void
10035dtrace_difo_destroy(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10036{
10037#if !defined(__APPLE__) /* Quiet compiler warnings */
10038	int i;
10039#else
10040	uint_t i;
10041#endif /* __APPLE__ */
10042
10043	ASSERT(dp->dtdo_refcnt == 0);
10044
10045	for (i = 0; i < dp->dtdo_varlen; i++) {
10046		dtrace_difv_t *v = &dp->dtdo_vartab[i];
10047#if !defined(__APPLE__) /* Quiet compiler warnings */
10048		dtrace_statvar_t *svar, **svarp;
10049		uint_t id;
10050		uint8_t scope = v->dtdv_scope;
10051		int *np;
10052#else
10053		dtrace_statvar_t *svar;
10054		dtrace_statvar_t **svarp = NULL;
10055		uint_t id;
10056		uint8_t scope = v->dtdv_scope;
10057		int *np = NULL;
10058#endif /* __APPLE__ */
10059
10060		switch (scope) {
10061		case DIFV_SCOPE_THREAD:
10062			continue;
10063
10064		case DIFV_SCOPE_LOCAL:
10065			np = &vstate->dtvs_nlocals;
10066			svarp = vstate->dtvs_locals;
10067			break;
10068
10069		case DIFV_SCOPE_GLOBAL:
10070			np = &vstate->dtvs_nglobals;
10071			svarp = vstate->dtvs_globals;
10072			break;
10073
10074		default:
10075			ASSERT(0);
10076		}
10077
10078		if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
10079			continue;
10080
10081		id -= DIF_VAR_OTHER_UBASE;
10082
10083#if !defined(__APPLE__) /* Quiet compiler warnings */
10084		ASSERT(id < *np);
10085#else
10086		ASSERT(id < (uint_t)*np);
10087#endif /* __APPLE__ */
10088
10089		svar = svarp[id];
10090		ASSERT(svar != NULL);
10091		ASSERT(svar->dtsv_refcnt > 0);
10092
10093		if (--svar->dtsv_refcnt > 0)
10094			continue;
10095
10096		if (svar->dtsv_size != 0) {
10097			ASSERT(svar->dtsv_data != NULL);
10098			kmem_free((void *)(uintptr_t)svar->dtsv_data,
10099			    svar->dtsv_size);
10100		}
10101
10102		kmem_free(svar, sizeof (dtrace_statvar_t));
10103		svarp[id] = NULL;
10104	}
10105
10106	kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
10107	kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
10108	kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
10109	kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
10110
10111	kmem_free(dp, sizeof (dtrace_difo_t));
10112}
10113
10114static void
10115dtrace_difo_release(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10116{
10117#if !defined(__APPLE__) /* Quiet compiler warnings */
10118	int i;
10119#else
10120	uint_t i;
10121#endif /* __APPLE__ */
10122
10123	lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10124	ASSERT(dp->dtdo_refcnt != 0);
10125
10126	for (i = 0; i < dp->dtdo_varlen; i++) {
10127		dtrace_difv_t *v = &dp->dtdo_vartab[i];
10128
10129		if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
10130			continue;
10131
10132		ASSERT(dtrace_vtime_references > 0);
10133		if (--dtrace_vtime_references == 0)
10134			dtrace_vtime_disable();
10135	}
10136
10137	if (--dp->dtdo_refcnt == 0)
10138		dtrace_difo_destroy(dp, vstate);
10139}
10140
10141/*
10142 * DTrace Format Functions
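 *
 * Format strings are handed out by one-based index:  dtrace_format_add()
 * returns the format's index plus one (or zero on failure), and consumers
 * hand that value back, with dts_formats itself indexed by (format - 1).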
10143 */
10144static uint16_t
10145dtrace_format_add(dtrace_state_t *state, char *str)
10146{
10147	char *fmt, **new;
10148	uint16_t ndx, len = strlen(str) + 1;
10149
10150	fmt = kmem_zalloc(len, KM_SLEEP);
10151	bcopy(str, fmt, len);
10152
10153	for (ndx = 0; ndx < state->dts_nformats; ndx++) {
10154		if (state->dts_formats[ndx] == NULL) {
10155			state->dts_formats[ndx] = fmt;
10156			return (ndx + 1);
10157		}
10158	}
10159
10160	if (state->dts_nformats == USHRT_MAX) {
10161		/*
10162		 * This is only likely if a denial-of-service attack is being
10163		 * attempted.  As such, it's okay to fail silently here.
10164		 */
10165		kmem_free(fmt, len);
10166		return (0);
10167	}
10168
10169	/*
10170	 * For simplicity, we always resize the formats array to be exactly the
10171	 * number of formats.
10172	 */
10173	ndx = state->dts_nformats++;
10174	new = kmem_alloc((ndx + 1) * sizeof (char *), KM_SLEEP);
10175
10176	if (state->dts_formats != NULL) {
10177		ASSERT(ndx != 0);
10178		bcopy(state->dts_formats, new, ndx * sizeof (char *));
10179		kmem_free(state->dts_formats, ndx * sizeof (char *));
10180	}
10181
10182	state->dts_formats = new;
10183	state->dts_formats[ndx] = fmt;
10184
10185	return (ndx + 1);
10186}
10187
10188static void
10189dtrace_format_remove(dtrace_state_t *state, uint16_t format)
10190{
10191	char *fmt;
10192
10193	ASSERT(state->dts_formats != NULL);
10194	ASSERT(format <= state->dts_nformats);
10195	ASSERT(state->dts_formats[format - 1] != NULL);
10196
10197	fmt = state->dts_formats[format - 1];
10198	kmem_free(fmt, strlen(fmt) + 1);
10199	state->dts_formats[format - 1] = NULL;
10200}
10201
10202static void
10203dtrace_format_destroy(dtrace_state_t *state)
10204{
10205	int i;
10206
10207	if (state->dts_nformats == 0) {
10208		ASSERT(state->dts_formats == NULL);
10209		return;
10210	}
10211
10212	ASSERT(state->dts_formats != NULL);
10213
10214	for (i = 0; i < state->dts_nformats; i++) {
10215		char *fmt = state->dts_formats[i];
10216
10217		if (fmt == NULL)
10218			continue;
10219
10220		kmem_free(fmt, strlen(fmt) + 1);
10221	}
10222
10223	kmem_free(state->dts_formats, state->dts_nformats * sizeof (char *));
10224	state->dts_nformats = 0;
10225	state->dts_formats = NULL;
10226}
10227
10228/*
10229 * DTrace Predicate Functions
10230 */
10231static dtrace_predicate_t *
10232dtrace_predicate_create(dtrace_difo_t *dp)
10233{
10234	dtrace_predicate_t *pred;
10235
10236	lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10237	ASSERT(dp->dtdo_refcnt != 0);
10238
10239	pred = kmem_zalloc(sizeof (dtrace_predicate_t), KM_SLEEP);
10240	pred->dtp_difo = dp;
10241	pred->dtp_refcnt = 1;
10242
10243	if (!dtrace_difo_cacheable(dp))
10244		return (pred);
10245
10246	if (dtrace_predcache_id == DTRACE_CACHEIDNONE) {
10247		/*
10248		 * This is only theoretically possible -- we have had 2^32
10249		 * cacheable predicates on this machine.  We cannot allow any
10250		 * more predicates to become cacheable:  as unlikely as it is,
10251		 * there may be a thread caching a (now stale) predicate cache
10252		 * ID. (N.B.: the temptation to have this cmn_err() say "Holy
10253		 * shit -- we executed this code!" is being successfully resisted.)
10254		 */
10255		return (pred);
10256	}
10257
10258	pred->dtp_cacheid = dtrace_predcache_id++;
10259
10260	return (pred);
10261}
10262
10263static void
10264dtrace_predicate_hold(dtrace_predicate_t *pred)
10265{
10266	lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10267	ASSERT(pred->dtp_difo != NULL && pred->dtp_difo->dtdo_refcnt != 0);
10268	ASSERT(pred->dtp_refcnt > 0);
10269
10270	pred->dtp_refcnt++;
10271}
10272
10273static void
10274dtrace_predicate_release(dtrace_predicate_t *pred, dtrace_vstate_t *vstate)
10275{
10276	dtrace_difo_t *dp = pred->dtp_difo;
10277#pragma unused(dp) /* __APPLE__ */
10278
10279	lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10280	ASSERT(dp != NULL && dp->dtdo_refcnt != 0);
10281	ASSERT(pred->dtp_refcnt > 0);
10282
10283	if (--pred->dtp_refcnt == 0) {
10284		dtrace_difo_release(pred->dtp_difo, vstate);
10285		kmem_free(pred, sizeof (dtrace_predicate_t));
10286	}
10287}
10288
10289/*
10290 * DTrace Action Description Functions
10291 */
10292static dtrace_actdesc_t *
10293dtrace_actdesc_create(dtrace_actkind_t kind, uint32_t ntuple,
10294    uint64_t uarg, uint64_t arg)
10295{
10296	dtrace_actdesc_t *act;
10297
10298	ASSERT(!DTRACEACT_ISPRINTFLIKE(kind) || (arg != NULL &&
10299	    arg >= KERNELBASE) || (arg == NULL && kind == DTRACEACT_PRINTA));
10300
10301	act = kmem_zalloc(sizeof (dtrace_actdesc_t), KM_SLEEP);
10302	act->dtad_kind = kind;
10303	act->dtad_ntuple = ntuple;
10304	act->dtad_uarg = uarg;
10305	act->dtad_arg = arg;
10306	act->dtad_refcnt = 1;
10307
10308	return (act);
10309}
10310
10311static void
10312dtrace_actdesc_hold(dtrace_actdesc_t *act)
10313{
10314	ASSERT(act->dtad_refcnt >= 1);
10315	act->dtad_refcnt++;
10316}
10317
10318static void
10319dtrace_actdesc_release(dtrace_actdesc_t *act, dtrace_vstate_t *vstate)
10320{
10321	dtrace_actkind_t kind = act->dtad_kind;
10322	dtrace_difo_t *dp;
10323
10324	ASSERT(act->dtad_refcnt >= 1);
10325
10326	if (--act->dtad_refcnt != 0)
10327		return;
10328
10329	if ((dp = act->dtad_difo) != NULL)
10330		dtrace_difo_release(dp, vstate);
10331
10332	if (DTRACEACT_ISPRINTFLIKE(kind)) {
10333		char *str = (char *)(uintptr_t)act->dtad_arg;
10334
10335		ASSERT((str != NULL && (uintptr_t)str >= KERNELBASE) ||
10336		    (str == NULL && act->dtad_kind == DTRACEACT_PRINTA));
10337
10338		if (str != NULL)
10339			kmem_free(str, strlen(str) + 1);
10340	}
10341
10342	kmem_free(act, sizeof (dtrace_actdesc_t));
10343}
10344
10345/*
10346 * DTrace ECB Functions
10347 */
10348static dtrace_ecb_t *
10349dtrace_ecb_add(dtrace_state_t *state, dtrace_probe_t *probe)
10350{
10351	dtrace_ecb_t *ecb;
10352	dtrace_epid_t epid;
10353
10354	lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10355
10356	ecb = kmem_zalloc(sizeof (dtrace_ecb_t), KM_SLEEP);
10357	ecb->dte_predicate = NULL;
10358	ecb->dte_probe = probe;
10359
10360	/*
10361	 * The default size is the size of the default action: recording
10362	 * the epid.
10363	 */
10364	ecb->dte_size = ecb->dte_needed = sizeof (dtrace_epid_t);
10365	ecb->dte_alignment = sizeof (dtrace_epid_t);
10366
10367	epid = state->dts_epid++;
10368
10369#if !defined(__APPLE__) /* Quiet compiler warnings */
10370	if (epid - 1 >= state->dts_necbs) {
10371#else
10372	if (epid - 1 >= (dtrace_epid_t)state->dts_necbs) {
10373#endif /* __APPLE__ */
10374		dtrace_ecb_t **oecbs = state->dts_ecbs, **ecbs;
10375		int necbs = state->dts_necbs << 1;
10376
10377#if !defined(__APPLE__) /* Quiet compiler warnings */
10378		ASSERT(epid == state->dts_necbs + 1);
10379#else
10380		ASSERT(epid == (dtrace_epid_t)state->dts_necbs + 1);
10381#endif /* __APPLE__ */
10382
10383		if (necbs == 0) {
10384			ASSERT(oecbs == NULL);
10385			necbs = 1;
10386		}
10387
10388		ecbs = kmem_zalloc(necbs * sizeof (*ecbs), KM_SLEEP);
10389
10390		if (oecbs != NULL)
10391			bcopy(oecbs, ecbs, state->dts_necbs * sizeof (*ecbs));
10392
10393		dtrace_membar_producer();
10394		state->dts_ecbs = ecbs;
10395
10396		if (oecbs != NULL) {
10397			/*
10398			 * If this state is active, we must dtrace_sync()
10399			 * before we can free the old dts_ecbs array:  we're
10400			 * coming in hot, and there may be active ring
10401			 * buffer processing (which indexes into the dts_ecbs
10402			 * array) on another CPU.
10403			 */
10404			if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
10405				dtrace_sync();
10406
10407			kmem_free(oecbs, state->dts_necbs * sizeof (*ecbs));
10408		}
10409
10410		dtrace_membar_producer();
10411		state->dts_necbs = necbs;
10412	}
10413
10414	ecb->dte_state = state;
10415
10416	ASSERT(state->dts_ecbs[epid - 1] == NULL);
10417	dtrace_membar_producer();
10418	state->dts_ecbs[(ecb->dte_epid = epid) - 1] = ecb;
10419
10420	return (ecb);
10421}
10422
10423static int
10424dtrace_ecb_enable(dtrace_ecb_t *ecb)
10425{
10426	dtrace_probe_t *probe = ecb->dte_probe;
10427
10428	lck_mtx_assert(&cpu_lock, LCK_MTX_ASSERT_OWNED);
10429	lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10430	ASSERT(ecb->dte_next == NULL);
10431
10432	if (probe == NULL) {
10433		/*
10434		 * This is the NULL probe -- there's nothing to do.
10435		 */
10436		return (0);
10437	}
10438
10439	probe->dtpr_provider->ecb_count++;
10440	if (probe->dtpr_ecb == NULL) {
10441		dtrace_provider_t *prov = probe->dtpr_provider;
10442
10443		/*
10444		 * We're the first ECB on this probe.
10445		 */
10446		probe->dtpr_ecb = probe->dtpr_ecb_last = ecb;
10447
10448		if (ecb->dte_predicate != NULL)
10449			probe->dtpr_predcache = ecb->dte_predicate->dtp_cacheid;
10450
10451		return (prov->dtpv_pops.dtps_enable(prov->dtpv_arg,
10452		    probe->dtpr_id, probe->dtpr_arg));
10453	} else {
10454		/*
10455		 * This probe is already active.  Swing the last pointer to
10456		 * point to the new ECB, and issue a dtrace_sync() to assure
10457		 * that all CPUs have seen the change.
10458		 */
10459		ASSERT(probe->dtpr_ecb_last != NULL);
10460		probe->dtpr_ecb_last->dte_next = ecb;
10461		probe->dtpr_ecb_last = ecb;
10462		probe->dtpr_predcache = 0;
10463
10464		dtrace_sync();
10465		return(0);
10466	}
10467}
10468
10469static void
10470dtrace_ecb_resize(dtrace_ecb_t *ecb)
10471{
10472	uint32_t maxalign = sizeof (dtrace_epid_t);
10473	uint32_t align = sizeof (uint8_t), offs, diff;
10474	dtrace_action_t *act;
10475	int wastuple = 0;
10476	uint32_t aggbase = UINT32_MAX;
10477	dtrace_state_t *state = ecb->dte_state;
10478
10479	/*
10480	 * If we record anything, we always record the epid.  (And we always
10481	 * record it first.)
10482	 */
10483	offs = sizeof (dtrace_epid_t);
10484	ecb->dte_size = ecb->dte_needed = sizeof (dtrace_epid_t);
10485
10486	for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
10487		dtrace_recdesc_t *rec = &act->dta_rec;
10488
10489		if ((align = rec->dtrd_alignment) > maxalign)
10490			maxalign = align;
10491
10492		if (!wastuple && act->dta_intuple) {
10493			/*
10494			 * This is the first record in a tuple.  Align the
10495			 * offset to be at offset 4 in an 8-byte aligned
10496			 * block.
10497			 */
10498			diff = offs + sizeof (dtrace_aggid_t);
10499
10500			if ((diff = (diff & (sizeof (uint64_t) - 1))))
10501				offs += sizeof (uint64_t) - diff;
10502
10503			aggbase = offs - sizeof (dtrace_aggid_t);
10504			ASSERT(!(aggbase & (sizeof (uint64_t) - 1)));
10505		}
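		/*
		 * (Illustrative numbers, assuming a 32-bit dtrace_aggid_t:
		 * had offs been 8 on entry above, diff would become
		 * 12 & 7 == 4, bumping offs to 12 and leaving aggbase at 8;
		 * the aggregation ID thus sits at an 8-byte-aligned aggbase
		 * with the tuple data following at offset 4.)
		 */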
10506
10507		/*LINTED*/
10508		if (rec->dtrd_size != 0 && (diff = (offs & (align - 1)))) {
10509			/*
10510			 * The current offset is not properly aligned; align it.
10511			 */
10512			offs += align - diff;
10513		}
10514
10515		rec->dtrd_offset = offs;
10516
10517		if (offs + rec->dtrd_size > ecb->dte_needed) {
10518			ecb->dte_needed = offs + rec->dtrd_size;
10519
10520			if (ecb->dte_needed > state->dts_needed)
10521				state->dts_needed = ecb->dte_needed;
10522		}
10523
10524		if (DTRACEACT_ISAGG(act->dta_kind)) {
10525			dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
10526			dtrace_action_t *first = agg->dtag_first, *prev;
10527
10528			ASSERT(rec->dtrd_size != 0 && first != NULL);
10529			ASSERT(wastuple);
10530			ASSERT(aggbase != UINT32_MAX);
10531
10532			agg->dtag_base = aggbase;
10533
10534			while ((prev = first->dta_prev) != NULL &&
10535			    DTRACEACT_ISAGG(prev->dta_kind)) {
10536				agg = (dtrace_aggregation_t *)prev;
10537				first = agg->dtag_first;
10538			}
10539
10540			if (prev != NULL) {
10541				offs = prev->dta_rec.dtrd_offset +
10542				    prev->dta_rec.dtrd_size;
10543			} else {
10544				offs = sizeof (dtrace_epid_t);
10545			}
10546			wastuple = 0;
10547		} else {
10548			if (!act->dta_intuple)
10549				ecb->dte_size = offs + rec->dtrd_size;
10550
10551			offs += rec->dtrd_size;
10552		}
10553
10554		wastuple = act->dta_intuple;
10555	}
10556
10557	if ((act = ecb->dte_action) != NULL &&
10558	    !(act->dta_kind == DTRACEACT_SPECULATE && act->dta_next == NULL) &&
10559	    ecb->dte_size == sizeof (dtrace_epid_t)) {
10560		/*
10561		 * If the size is still sizeof (dtrace_epid_t), then all
10562		 * actions store no data; set the size to 0.
10563		 */
10564		ecb->dte_alignment = maxalign;
10565		ecb->dte_size = 0;
10566
10567		/*
10568		 * If the needed space is still sizeof (dtrace_epid_t), then
10569		 * all actions need no additional space; set the needed
10570		 * size to 0.
10571		 */
10572		if (ecb->dte_needed == sizeof (dtrace_epid_t))
10573			ecb->dte_needed = 0;
10574
10575		return;
10576	}
10577
10578	/*
10579	 * Set our alignment, and make sure that the dte_size and dte_needed
10580	 * are aligned to the size of an EPID.
10581	 */
10582	ecb->dte_alignment = maxalign;
10583	ecb->dte_size = (ecb->dte_size + (sizeof (dtrace_epid_t) - 1)) &
10584	    ~(sizeof (dtrace_epid_t) - 1);
10585	ecb->dte_needed = (ecb->dte_needed + (sizeof (dtrace_epid_t) - 1)) &
10586	    ~(sizeof (dtrace_epid_t) - 1);
10587	ASSERT(ecb->dte_size <= ecb->dte_needed);
10588}
10589
10590static dtrace_action_t *
10591dtrace_ecb_aggregation_create(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
10592{
10593	dtrace_aggregation_t *agg;
10594	size_t size = sizeof (uint64_t);
10595	int ntuple = desc->dtad_ntuple;
10596	dtrace_action_t *act;
10597	dtrace_recdesc_t *frec;
10598	dtrace_aggid_t aggid;
10599	dtrace_state_t *state = ecb->dte_state;
10600
10601	agg = kmem_zalloc(sizeof (dtrace_aggregation_t), KM_SLEEP);
10602	agg->dtag_ecb = ecb;
10603
10604	ASSERT(DTRACEACT_ISAGG(desc->dtad_kind));
10605
10606	switch (desc->dtad_kind) {
10607	case DTRACEAGG_MIN:
10608		agg->dtag_initial = INT64_MAX;
10609		agg->dtag_aggregate = dtrace_aggregate_min;
10610		break;
10611
10612	case DTRACEAGG_MAX:
10613		agg->dtag_initial = INT64_MIN;
10614		agg->dtag_aggregate = dtrace_aggregate_max;
10615		break;
10616
10617	case DTRACEAGG_COUNT:
10618		agg->dtag_aggregate = dtrace_aggregate_count;
10619		break;
10620
10621	case DTRACEAGG_QUANTIZE:
10622		agg->dtag_aggregate = dtrace_aggregate_quantize;
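		/*
		 * A power-of-two quantization needs 127 buckets:  one for
		 * zero, plus 63 buckets of increasing magnitude in each
		 * direction -- hence the expression below.
		 */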
10623		size = (((sizeof (uint64_t) * NBBY) - 1) * 2 + 1) *
10624		    sizeof (uint64_t);
10625		break;
10626
10627	case DTRACEAGG_LQUANTIZE: {
10628		uint16_t step = DTRACE_LQUANTIZE_STEP(desc->dtad_arg);
10629		uint16_t levels = DTRACE_LQUANTIZE_LEVELS(desc->dtad_arg);
10630
10631		agg->dtag_initial = desc->dtad_arg;
10632		agg->dtag_aggregate = dtrace_aggregate_lquantize;
10633
10634		if (step == 0 || levels == 0)
10635			goto err;
10636
10637		size = levels * sizeof (uint64_t) + 3 * sizeof (uint64_t);
10638		break;
10639	}
10640
10641	case DTRACEAGG_LLQUANTIZE: {
10642		uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(desc->dtad_arg);
10643		uint16_t low    = DTRACE_LLQUANTIZE_LOW(desc->dtad_arg);
10644		uint16_t high   = DTRACE_LLQUANTIZE_HIGH(desc->dtad_arg);
10645		uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(desc->dtad_arg);
10646		int64_t v;
10647
10648		agg->dtag_initial = desc->dtad_arg;
10649		agg->dtag_aggregate = dtrace_aggregate_llquantize;
10650
10651		if (factor < 2 || low >= high || nsteps < factor)
10652			goto err;
10653
10654		/*
10655		 * Now check that the number of steps evenly divides a power
10656		 * of the factor.  (This assures both integer bucket size and
10657		 * linearity within each magnitude.)
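		 *
		 * For example:  factor 10 with 20 steps passes (v reaches 100,
		 * and 100 % 20 == 0 and 20 % 10 == 0), while factor 10 with
		 * 15 steps fails, since 100 % 15 != 0.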
10658		 */
10659		for (v = factor; v < nsteps; v *= factor)
10660			continue;
10661
10662		if ((v % nsteps) || (nsteps % factor))
10663			goto err;
10664
10665		size = (dtrace_aggregate_llquantize_bucket(factor, low, high, nsteps, INT64_MAX) + 2) * sizeof (uint64_t);
10666		break;
10667	}
10668
10669	case DTRACEAGG_AVG:
10670		agg->dtag_aggregate = dtrace_aggregate_avg;
10671		size = sizeof (uint64_t) * 2;
10672		break;
10673
10674	case DTRACEAGG_STDDEV:
10675		agg->dtag_aggregate = dtrace_aggregate_stddev;
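		/*
		 * Standard deviation keeps four 64-bit words per key:  the
		 * count, the running sum, and (in the remaining two words)
		 * the sum of squares maintained as a 128-bit quantity.
		 */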
10676		size = sizeof (uint64_t) * 4;
10677		break;
10678
10679	case DTRACEAGG_SUM:
10680		agg->dtag_aggregate = dtrace_aggregate_sum;
10681		break;
10682
10683	default:
10684		goto err;
10685	}
10686
10687	agg->dtag_action.dta_rec.dtrd_size = size;
10688
10689	if (ntuple == 0)
10690		goto err;
10691
10692	/*
10693	 * We must make sure that we have enough actions for the n-tuple.
10694	 */
10695	for (act = ecb->dte_action_last; act != NULL; act = act->dta_prev) {
10696		if (DTRACEACT_ISAGG(act->dta_kind))
10697			break;
10698
10699		if (--ntuple == 0) {
10700			/*
10701			 * This is the action with which our n-tuple begins.
10702			 */
10703			agg->dtag_first = act;
10704			goto success;
10705		}
10706	}
10707
10708	/*
10709	 * This n-tuple is short by ntuple elements.  Return failure.
10710	 */
10711	ASSERT(ntuple != 0);
10712err:
10713	kmem_free(agg, sizeof (dtrace_aggregation_t));
10714	return (NULL);
10715
10716success:
10717	/*
10718	 * If the last action in the tuple has a size of zero, it's actually
10719	 * an expression argument for the aggregating action.
10720	 */
10721	ASSERT(ecb->dte_action_last != NULL);
10722	act = ecb->dte_action_last;
10723
10724	if (act->dta_kind == DTRACEACT_DIFEXPR) {
10725		ASSERT(act->dta_difo != NULL);
10726
10727		if (act->dta_difo->dtdo_rtype.dtdt_size == 0)
10728			agg->dtag_hasarg = 1;
10729	}
10730
10731	/*
10732	 * We need to allocate an id for this aggregation.
10733	 */
10734	aggid = (dtrace_aggid_t)(uintptr_t)vmem_alloc(state->dts_aggid_arena, 1,
10735	    VM_BESTFIT | VM_SLEEP);
10736
10737#if !defined(__APPLE__) /* Quiet compiler warnings */
10738	if (aggid - 1 >= state->dts_naggregations) {
10739#else
10740	if (aggid - 1 >= (dtrace_aggid_t)state->dts_naggregations) {
10741#endif /* __APPLE__ */
10742		dtrace_aggregation_t **oaggs = state->dts_aggregations;
10743		dtrace_aggregation_t **aggs;
10744		int naggs = state->dts_naggregations << 1;
10745		int onaggs = state->dts_naggregations;
10746
10747#if !defined(__APPLE__) /* Quiet compiler warnings */
10748		ASSERT(aggid == state->dts_naggregations + 1);
10749#else
10750		ASSERT(aggid == (dtrace_aggid_t)state->dts_naggregations + 1);
10751#endif /* __APPLE__ */
10752
10753		if (naggs == 0) {
10754			ASSERT(oaggs == NULL);
10755			naggs = 1;
10756		}
10757
10758		aggs = kmem_zalloc(naggs * sizeof (*aggs), KM_SLEEP);
10759
10760		if (oaggs != NULL) {
10761			bcopy(oaggs, aggs, onaggs * sizeof (*aggs));
10762			kmem_free(oaggs, onaggs * sizeof (*aggs));
10763		}
10764
10765		state->dts_aggregations = aggs;
10766		state->dts_naggregations = naggs;
10767	}
10768
10769	ASSERT(state->dts_aggregations[aggid - 1] == NULL);
10770	state->dts_aggregations[(agg->dtag_id = aggid) - 1] = agg;
10771
10772	frec = &agg->dtag_first->dta_rec;
10773	if (frec->dtrd_alignment < sizeof (dtrace_aggid_t))
10774		frec->dtrd_alignment = sizeof (dtrace_aggid_t);
10775
10776	for (act = agg->dtag_first; act != NULL; act = act->dta_next) {
10777		ASSERT(!act->dta_intuple);
10778		act->dta_intuple = 1;
10779	}
10780
10781	return (&agg->dtag_action);
10782}
10783
10784static void
10785dtrace_ecb_aggregation_destroy(dtrace_ecb_t *ecb, dtrace_action_t *act)
10786{
10787	dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
10788	dtrace_state_t *state = ecb->dte_state;
10789	dtrace_aggid_t aggid = agg->dtag_id;
10790
10791	ASSERT(DTRACEACT_ISAGG(act->dta_kind));
10792	vmem_free(state->dts_aggid_arena, (void *)(uintptr_t)aggid, 1);
10793
10794	ASSERT(state->dts_aggregations[aggid - 1] == agg);
10795	state->dts_aggregations[aggid - 1] = NULL;
10796
10797	kmem_free(agg, sizeof (dtrace_aggregation_t));
10798}
10799
10800static int
10801dtrace_ecb_action_add(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
10802{
10803	dtrace_action_t *action, *last;
10804	dtrace_difo_t *dp = desc->dtad_difo;
10805	uint32_t size = 0, align = sizeof (uint8_t), mask;
10806	uint16_t format = 0;
10807	dtrace_recdesc_t *rec;
10808	dtrace_state_t *state = ecb->dte_state;
10809#if !defined(__APPLE__) /* Quiet compiler warnings */
10810	dtrace_optval_t *opt = state->dts_options, nframes, strsize;
10811#else
10812	dtrace_optval_t *opt = state->dts_options;
10813	dtrace_optval_t nframes = 0, strsize;
10814#endif /* __APPLE__ */
10815	uint64_t arg = desc->dtad_arg;
10816
10817	lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10818	ASSERT(ecb->dte_action == NULL || ecb->dte_action->dta_refcnt == 1);
10819
10820	if (DTRACEACT_ISAGG(desc->dtad_kind)) {
10821		/*
10822		 * If this is an aggregating action, there must be neither
10823		 * a speculate nor a commit on the action chain.
10824		 */
10825		dtrace_action_t *act;
10826
10827		for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
10828			if (act->dta_kind == DTRACEACT_COMMIT)
10829				return (EINVAL);
10830
10831			if (act->dta_kind == DTRACEACT_SPECULATE)
10832				return (EINVAL);
10833		}
10834
10835		action = dtrace_ecb_aggregation_create(ecb, desc);
10836
10837		if (action == NULL)
10838			return (EINVAL);
10839	} else {
10840		if (DTRACEACT_ISDESTRUCTIVE(desc->dtad_kind) ||
10841		    (desc->dtad_kind == DTRACEACT_DIFEXPR &&
10842		    dp != NULL && dp->dtdo_destructive)) {
10843			state->dts_destructive = 1;
10844		}
10845
10846		switch (desc->dtad_kind) {
10847		case DTRACEACT_PRINTF:
10848		case DTRACEACT_PRINTA:
10849		case DTRACEACT_SYSTEM:
10850		case DTRACEACT_FREOPEN:
10851			/*
10852			 * We know that our arg is a string -- turn it into a
10853			 * format.
10854			 */
10855			if (arg == NULL) {
10856				ASSERT(desc->dtad_kind == DTRACEACT_PRINTA);
10857				format = 0;
10858			} else {
10859				ASSERT(arg != NULL);
10860				ASSERT(arg > KERNELBASE);
10861				format = dtrace_format_add(state,
10862				    (char *)(uintptr_t)arg);
10863			}
10864
10865			/*FALLTHROUGH*/
10866		case DTRACEACT_LIBACT:
10867		case DTRACEACT_DIFEXPR:
10868#if defined(__APPLE__)
10869		case DTRACEACT_APPLEBINARY:
10870#endif /* __APPLE__ */
10871			if (dp == NULL)
10872				return (EINVAL);
10873
10874			if ((size = dp->dtdo_rtype.dtdt_size) != 0)
10875				break;
10876
10877			if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) {
10878				if (!(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
10879					return (EINVAL);
10880
10881				size = opt[DTRACEOPT_STRSIZE];
10882			}
10883
10884			break;
10885
10886		case DTRACEACT_STACK:
10887			if ((nframes = arg) == 0) {
10888				nframes = opt[DTRACEOPT_STACKFRAMES];
10889				ASSERT(nframes > 0);
10890				arg = nframes;
10891			}
10892
10893			size = nframes * sizeof (pc_t);
10894			break;
10895
10896		case DTRACEACT_JSTACK:
10897			if ((strsize = DTRACE_USTACK_STRSIZE(arg)) == 0)
10898				strsize = opt[DTRACEOPT_JSTACKSTRSIZE];
10899
10900			if ((nframes = DTRACE_USTACK_NFRAMES(arg)) == 0)
10901				nframes = opt[DTRACEOPT_JSTACKFRAMES];
10902
10903			arg = DTRACE_USTACK_ARG(nframes, strsize);
10904
10905			/*FALLTHROUGH*/
10906		case DTRACEACT_USTACK:
10907			if (desc->dtad_kind != DTRACEACT_JSTACK &&
10908			    (nframes = DTRACE_USTACK_NFRAMES(arg)) == 0) {
10909				strsize = DTRACE_USTACK_STRSIZE(arg);
10910				nframes = opt[DTRACEOPT_USTACKFRAMES];
10911				ASSERT(nframes > 0);
10912				arg = DTRACE_USTACK_ARG(nframes, strsize);
10913			}
10914
10915			/*
10916			 * Save a slot for the pid.
10917			 */
10918			size = (nframes + 1) * sizeof (uint64_t);
10919			size += DTRACE_USTACK_STRSIZE(arg);
10920			size = P2ROUNDUP(size, (uint32_t)(sizeof (uintptr_t)));
10921
10922			break;
10923
10924		case DTRACEACT_SYM:
10925		case DTRACEACT_MOD:
10926			if (dp == NULL || ((size = dp->dtdo_rtype.dtdt_size) !=
10927			    sizeof (uint64_t)) ||
10928			    (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
10929				return (EINVAL);
10930			break;
10931
10932		case DTRACEACT_USYM:
10933		case DTRACEACT_UMOD:
10934		case DTRACEACT_UADDR:
10935			if (dp == NULL ||
10936			    (dp->dtdo_rtype.dtdt_size != sizeof (uint64_t)) ||
10937			    (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
10938				return (EINVAL);
10939
10940			/*
10941			 * We have a slot for the pid, plus a slot for the
10942			 * argument.  To keep things simple (aligned with
10943			 * bitness-neutral sizing), we store each as a 64-bit
10944			 * quantity.
10945			 */
10946			size = 2 * sizeof (uint64_t);
10947			break;
10948
10949		case DTRACEACT_STOP:
10950		case DTRACEACT_BREAKPOINT:
10951		case DTRACEACT_PANIC:
10952			break;
10953
10954		case DTRACEACT_CHILL:
10955		case DTRACEACT_DISCARD:
10956		case DTRACEACT_RAISE:
10957#if defined(__APPLE__)
10958		case DTRACEACT_PIDRESUME:
10959#endif /* __APPLE__ */
10960			if (dp == NULL)
10961				return (EINVAL);
10962			break;
10963
10964		case DTRACEACT_EXIT:
10965			if (dp == NULL ||
10966			    (size = dp->dtdo_rtype.dtdt_size) != sizeof (int) ||
10967			    (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
10968				return (EINVAL);
10969			break;
10970
10971		case DTRACEACT_SPECULATE:
10972			if (ecb->dte_size > sizeof (dtrace_epid_t))
10973				return (EINVAL);
10974
10975			if (dp == NULL)
10976				return (EINVAL);
10977
10978			state->dts_speculates = 1;
10979			break;
10980
10981		case DTRACEACT_COMMIT: {
10982			dtrace_action_t *act = ecb->dte_action;
10983
10984			for (; act != NULL; act = act->dta_next) {
10985				if (act->dta_kind == DTRACEACT_COMMIT)
10986					return (EINVAL);
10987			}
10988
10989			if (dp == NULL)
10990				return (EINVAL);
10991			break;
10992		}
10993
10994		default:
10995			return (EINVAL);
10996		}
10997
10998		if (size != 0 || desc->dtad_kind == DTRACEACT_SPECULATE) {
10999			/*
11000			 * If this is a data-storing action or a speculate,
11001			 * we must be sure that there isn't a commit on the
11002			 * action chain.
11003			 */
11004			dtrace_action_t *act = ecb->dte_action;
11005
11006			for (; act != NULL; act = act->dta_next) {
11007				if (act->dta_kind == DTRACEACT_COMMIT)
11008					return (EINVAL);
11009			}
11010		}
11011
11012		action = kmem_zalloc(sizeof (dtrace_action_t), KM_SLEEP);
11013		action->dta_rec.dtrd_size = size;
11014	}
11015
11016	action->dta_refcnt = 1;
11017	rec = &action->dta_rec;
11018	size = rec->dtrd_size;
11019
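	/*
	 * Infer the record's alignment from its size:  the loop below picks
	 * the largest power of two (up to eight bytes) that evenly divides
	 * the size -- e.g. a 12-byte record gets 4-byte alignment and a
	 * 16-byte record gets 8-byte alignment.  A zero-sized record keeps
	 * the default byte alignment.
	 */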
11020	for (mask = sizeof (uint64_t) - 1; size != 0 && mask > 0; mask >>= 1) {
11021		if (!(size & mask)) {
11022			align = mask + 1;
11023			break;
11024		}
11025	}
11026
11027	action->dta_kind = desc->dtad_kind;
11028
11029	if ((action->dta_difo = dp) != NULL)
11030		dtrace_difo_hold(dp);
11031
11032	rec->dtrd_action = action->dta_kind;
11033	rec->dtrd_arg = arg;
11034	rec->dtrd_uarg = desc->dtad_uarg;
11035	rec->dtrd_alignment = (uint16_t)align;
11036	rec->dtrd_format = format;
11037
11038	if ((last = ecb->dte_action_last) != NULL) {
11039		ASSERT(ecb->dte_action != NULL);
11040		action->dta_prev = last;
11041		last->dta_next = action;
11042	} else {
11043		ASSERT(ecb->dte_action == NULL);
11044		ecb->dte_action = action;
11045	}
11046
11047	ecb->dte_action_last = action;
11048
11049	return (0);
11050}
11051
11052static void
11053dtrace_ecb_action_remove(dtrace_ecb_t *ecb)
11054{
11055	dtrace_action_t *act = ecb->dte_action, *next;
11056	dtrace_vstate_t *vstate = &ecb->dte_state->dts_vstate;
11057	dtrace_difo_t *dp;
11058	uint16_t format;
11059
11060	if (act != NULL && act->dta_refcnt > 1) {
11061		ASSERT(act->dta_next == NULL || act->dta_next->dta_refcnt == 1);
11062		act->dta_refcnt--;
11063	} else {
11064		for (; act != NULL; act = next) {
11065			next = act->dta_next;
11066			ASSERT(next != NULL || act == ecb->dte_action_last);
11067			ASSERT(act->dta_refcnt == 1);
11068
11069			if ((format = act->dta_rec.dtrd_format) != 0)
11070				dtrace_format_remove(ecb->dte_state, format);
11071
11072			if ((dp = act->dta_difo) != NULL)
11073				dtrace_difo_release(dp, vstate);
11074
11075			if (DTRACEACT_ISAGG(act->dta_kind)) {
11076				dtrace_ecb_aggregation_destroy(ecb, act);
11077			} else {
11078				kmem_free(act, sizeof (dtrace_action_t));
11079			}
11080		}
11081	}
11082
11083	ecb->dte_action = NULL;
11084	ecb->dte_action_last = NULL;
11085	ecb->dte_size = sizeof (dtrace_epid_t);
11086}
11087
11088static void
11089dtrace_ecb_disable(dtrace_ecb_t *ecb)
11090{
11091	/*
11092	 * We disable the ECB by removing it from its probe.
11093	 */
11094	dtrace_ecb_t *pecb, *prev = NULL;
11095	dtrace_probe_t *probe = ecb->dte_probe;
11096
11097	lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11098
11099	if (probe == NULL) {
11100		/*
11101		 * This is the NULL probe; there is nothing to disable.
11102		 */
11103		return;
11104	}
11105
11106	for (pecb = probe->dtpr_ecb; pecb != NULL; pecb = pecb->dte_next) {
11107		if (pecb == ecb)
11108			break;
11109		prev = pecb;
11110	}
11111
11112	ASSERT(pecb != NULL);
11113
11114	if (prev == NULL) {
11115		probe->dtpr_ecb = ecb->dte_next;
11116	} else {
11117		prev->dte_next = ecb->dte_next;
11118	}
11119
11120	if (ecb == probe->dtpr_ecb_last) {
11121		ASSERT(ecb->dte_next == NULL);
11122		probe->dtpr_ecb_last = prev;
11123	}
11124
11125	probe->dtpr_provider->ecb_count--;
11126	/*
11127	 * The ECB has been disconnected from the probe; now sync to assure
11128	 * that all CPUs have seen the change before returning.
11129	 */
11130	dtrace_sync();
11131
11132	if (probe->dtpr_ecb == NULL) {
11133		/*
11134		 * That was the last ECB on the probe; clear the predicate
11135		 * cache ID for the probe, disable it and sync one more time
11136		 * to assure that we'll never hit it again.
11137		 */
11138		dtrace_provider_t *prov = probe->dtpr_provider;
11139
11140		ASSERT(ecb->dte_next == NULL);
11141		ASSERT(probe->dtpr_ecb_last == NULL);
11142		probe->dtpr_predcache = DTRACE_CACHEIDNONE;
11143		prov->dtpv_pops.dtps_disable(prov->dtpv_arg,
11144		    probe->dtpr_id, probe->dtpr_arg);
11145		dtrace_sync();
11146	} else {
11147		/*
11148		 * There is at least one ECB remaining on the probe.  If there
11149		 * is _exactly_ one, set the probe's predicate cache ID to be
11150		 * the predicate cache ID of the remaining ECB.
11151		 */
11152		ASSERT(probe->dtpr_ecb_last != NULL);
11153		ASSERT(probe->dtpr_predcache == DTRACE_CACHEIDNONE);
11154
11155		if (probe->dtpr_ecb == probe->dtpr_ecb_last) {
11156			dtrace_predicate_t *p = probe->dtpr_ecb->dte_predicate;
11157
11158			ASSERT(probe->dtpr_ecb->dte_next == NULL);
11159
11160			if (p != NULL)
11161				probe->dtpr_predcache = p->dtp_cacheid;
11162		}
11163
11164		ecb->dte_next = NULL;
11165	}
11166}
11167
11168static void
11169dtrace_ecb_destroy(dtrace_ecb_t *ecb)
11170{
11171	dtrace_state_t *state = ecb->dte_state;
11172	dtrace_vstate_t *vstate = &state->dts_vstate;
11173	dtrace_predicate_t *pred;
11174	dtrace_epid_t epid = ecb->dte_epid;
11175
11176	lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11177	ASSERT(ecb->dte_next == NULL);
11178	ASSERT(ecb->dte_probe == NULL || ecb->dte_probe->dtpr_ecb != ecb);
11179
11180	if ((pred = ecb->dte_predicate) != NULL)
11181		dtrace_predicate_release(pred, vstate);
11182
11183	dtrace_ecb_action_remove(ecb);
11184
11185	ASSERT(state->dts_ecbs[epid - 1] == ecb);
11186	state->dts_ecbs[epid - 1] = NULL;
11187
11188	kmem_free(ecb, sizeof (dtrace_ecb_t));
11189}
11190
11191static dtrace_ecb_t *
11192dtrace_ecb_create(dtrace_state_t *state, dtrace_probe_t *probe,
11193    dtrace_enabling_t *enab)
11194{
11195	dtrace_ecb_t *ecb;
11196	dtrace_predicate_t *pred;
11197	dtrace_actdesc_t *act;
11198	dtrace_provider_t *prov;
11199	dtrace_ecbdesc_t *desc = enab->dten_current;
11200
11201	lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11202	ASSERT(state != NULL);
11203
11204	ecb = dtrace_ecb_add(state, probe);
11205	ecb->dte_uarg = desc->dted_uarg;
11206
11207	if ((pred = desc->dted_pred.dtpdd_predicate) != NULL) {
11208		dtrace_predicate_hold(pred);
11209		ecb->dte_predicate = pred;
11210	}
11211
11212	if (probe != NULL) {
11213		/*
11214		 * If the provider shows more leg than the consumer is old
11215		 * enough to see, we need to enable the appropriate implicit
11216		 * predicate bits to prevent the ecb from activating at
11217		 * revealing times.
11218		 *
11219		 * Providers specifying DTRACE_PRIV_USER at register time
11220		 * are stating that they need the /proc-style privilege
11221		 * model to be enforced, and this is what DTRACE_COND_OWNER
11222		 * and DTRACE_COND_ZONEOWNER will then do at probe time.
11223		 */
11224		prov = probe->dtpr_provider;
11225		if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLPROC) &&
11226		    (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
11227			ecb->dte_cond |= DTRACE_COND_OWNER;
11228
11229		if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLZONE) &&
11230		    (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
11231			ecb->dte_cond |= DTRACE_COND_ZONEOWNER;
11232
11233		/*
11234		 * If the provider shows us kernel innards and the user
11235		 * is lacking sufficient privilege, enable the
11236		 * DTRACE_COND_USERMODE implicit predicate.
11237		 */
11238		if (!(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) &&
11239		    (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_KERNEL))
11240			ecb->dte_cond |= DTRACE_COND_USERMODE;
11241	}
11242
11243	if (dtrace_ecb_create_cache != NULL) {
11244		/*
11245		 * If we have a cached ecb, we'll use its action list instead
11246		 * of creating our own (saving both time and space).
11247		 */
11248		dtrace_ecb_t *cached = dtrace_ecb_create_cache;
11249		dtrace_action_t *act_if = cached->dte_action;
11250
11251		if (act_if != NULL) {
11252			ASSERT(act_if->dta_refcnt > 0);
11253			act_if->dta_refcnt++;
11254			ecb->dte_action = act_if;
11255			ecb->dte_action_last = cached->dte_action_last;
11256			ecb->dte_needed = cached->dte_needed;
11257			ecb->dte_size = cached->dte_size;
11258			ecb->dte_alignment = cached->dte_alignment;
11259		}
11260
11261		return (ecb);
11262	}
11263
11264	for (act = desc->dted_action; act != NULL; act = act->dtad_next) {
11265		if ((enab->dten_error = dtrace_ecb_action_add(ecb, act)) != 0) {
11266			dtrace_ecb_destroy(ecb);
11267			return (NULL);
11268		}
11269	}
11270
11271	dtrace_ecb_resize(ecb);
11272
11273	return (dtrace_ecb_create_cache = ecb);
11274}
11275
11276static int
11277dtrace_ecb_create_enable(dtrace_probe_t *probe, void *arg)
11278{
11279	dtrace_ecb_t *ecb;
11280	dtrace_enabling_t *enab = arg;
11281	dtrace_state_t *state = enab->dten_vstate->dtvs_state;
11282
11283	ASSERT(state != NULL);
11284
11285	if (probe != NULL && probe->dtpr_gen < enab->dten_probegen) {
11286		/*
11287		 * This probe was created in a generation for which this
11288		 * enabling has previously created ECBs; we don't want to
11289		 * enable it again, so just kick out.
11290		 */
11291		return (DTRACE_MATCH_NEXT);
11292	}
11293
11294	if ((ecb = dtrace_ecb_create(state, probe, enab)) == NULL)
11295		return (DTRACE_MATCH_DONE);
11296
11297	if (dtrace_ecb_enable(ecb) < 0)
11298		return (DTRACE_MATCH_FAIL);
11299
11300	return (DTRACE_MATCH_NEXT);
11301}
11302
11303static dtrace_ecb_t *
11304dtrace_epid2ecb(dtrace_state_t *state, dtrace_epid_t id)
11305{
11306	dtrace_ecb_t *ecb;
11307#pragma unused(ecb) /* __APPLE__ */
11308
11309	lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11310
11311#if !defined(__APPLE__) /* Quiet compiler warnings */
11312	if (id == 0 || id > state->dts_necbs)
11313#else
11314	if (id == 0 || id > (dtrace_epid_t)state->dts_necbs)
11315#endif /* __APPLE__ */
11316		return (NULL);
11317
11318	ASSERT(state->dts_necbs > 0 && state->dts_ecbs != NULL);
11319	ASSERT((ecb = state->dts_ecbs[id - 1]) == NULL || ecb->dte_epid == id);
11320
11321	return (state->dts_ecbs[id - 1]);
11322}
11323
11324static dtrace_aggregation_t *
11325dtrace_aggid2agg(dtrace_state_t *state, dtrace_aggid_t id)
11326{
11327	dtrace_aggregation_t *agg;
11328#pragma unused(agg) /* __APPLE__ */
11329
11330	lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11331
11332#if !defined(__APPLE__) /* Quiet compiler warnings */
11333	if (id == 0 || id > state->dts_naggregations)
11334#else
11335	if (id == 0 || id > (dtrace_aggid_t)state->dts_naggregations)
11336#endif /* __APPLE__ */
11337		return (NULL);
11338
11339	ASSERT(state->dts_naggregations > 0 && state->dts_aggregations != NULL);
11340	ASSERT((agg = state->dts_aggregations[id - 1]) == NULL ||
11341	    agg->dtag_id == id);
11342
11343	return (state->dts_aggregations[id - 1]);
11344}
11345
11346/*
11347 * DTrace Buffer Functions
11348 *
11349 * The following functions manipulate DTrace buffers.  Most of these functions
11350 * are called in the context of establishing or processing consumer state;
11351 * exceptions are explicitly noted.
11352 */
11353
11354/*
11355 * Note:  called from cross call context.  This function switches the two
11356 * buffers on a given CPU.  The atomicity of this operation is assured by
11357 * disabling interrupts while the actual switch takes place; the disabling of
11358 * interrupts serializes the execution with any execution of dtrace_probe() on
11359 * the same CPU.
11360 */
11361static void
11362dtrace_buffer_switch(dtrace_buffer_t *buf)
11363{
11364	caddr_t tomax = buf->dtb_tomax;
11365	caddr_t xamot = buf->dtb_xamot;
11366	dtrace_icookie_t cookie;
11367
11368	ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
11369	ASSERT(!(buf->dtb_flags & DTRACEBUF_RING));
11370
11371	cookie = dtrace_interrupt_disable();
11372	buf->dtb_tomax = xamot;
11373	buf->dtb_xamot = tomax;
11374	buf->dtb_xamot_drops = buf->dtb_drops;
11375	buf->dtb_xamot_offset = buf->dtb_offset;
11376	buf->dtb_xamot_errors = buf->dtb_errors;
11377	buf->dtb_xamot_flags = buf->dtb_flags;
11378	buf->dtb_offset = 0;
11379	buf->dtb_drops = 0;
11380	buf->dtb_errors = 0;
11381	buf->dtb_flags &= ~(DTRACEBUF_ERROR | DTRACEBUF_DROPPED);
11382	dtrace_interrupt_enable(cookie);
11383}
11384
11385/*
11386 * Note:  called from cross call context.  This function activates a buffer
11387 * on a CPU.  As with dtrace_buffer_switch(), the atomicity of the operation
11388 * is guaranteed by the disabling of interrupts.
11389 */
11390static void
11391dtrace_buffer_activate(dtrace_state_t *state)
11392{
11393	dtrace_buffer_t *buf;
11394	dtrace_icookie_t cookie = dtrace_interrupt_disable();
11395
11396	buf = &state->dts_buffer[CPU->cpu_id];
11397
11398	if (buf->dtb_tomax != NULL) {
11399		/*
11400		 * We might like to assert that the buffer is marked inactive,
11401		 * but this isn't necessarily true:  the buffer for the CPU
11402		 * that processes the BEGIN probe has its buffer activated
11403		 * manually.  In this case, we take the (harmless) action of
11404		 * re-clearing the INACTIVE bit.
11405		 */
11406		buf->dtb_flags &= ~DTRACEBUF_INACTIVE;
11407	}
11408
11409	dtrace_interrupt_enable(cookie);
11410}
11411
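/*
 * Allocate a buffer of the specified size for each CPU -- or only for the
 * CPU denoted by 'cpu' if it is not DTRACE_CPUALL -- along with a paired
 * switch buffer unless DTRACEBUF_NOSWITCH is set.  On allocation failure,
 * the buffers held in the array for the affected CPUs are freed and ENOMEM
 * is returned.
 */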
11412static int
11413dtrace_buffer_alloc(dtrace_buffer_t *bufs, size_t size, int flags,
11414    processorid_t cpu)
11415{
11416	dtrace_cpu_t *cp;
11417	dtrace_buffer_t *buf;
11418
11419	lck_mtx_assert(&cpu_lock, LCK_MTX_ASSERT_OWNED);
11420	lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11421
11422#if !defined(__APPLE__) /* Quiet compiler warnings */
11423	if (size > dtrace_nonroot_maxsize &&
11424	    !PRIV_POLICY_CHOICE(CRED(), PRIV_ALL, B_FALSE))
11425		return (EFBIG);
11426#else
11427	if (size > (size_t)dtrace_nonroot_maxsize &&
11428	    !PRIV_POLICY_CHOICE(CRED(), PRIV_ALL, B_FALSE))
11429		return (EFBIG);
11430#endif /* __APPLE__ */
11431
11432
11433#if defined(__APPLE__)
11434	if (size > (sane_size / 8) / (int)NCPU) /* As in kdbg_set_nkdbufs(), roughly. */
11435		return (ENOMEM);
11436#endif /* __APPLE__ */
11437
11438	cp = cpu_list;
11439
11440	do {
11441		if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
11442			continue;
11443
11444		buf = &bufs[cp->cpu_id];
11445
11446		/*
11447		 * If there is already a buffer allocated for this CPU, it
11448		 * is only possible that this is a DR event.  In this case,
11449		 * the buffer size must match our specified size.
11450		 */
11451		if (buf->dtb_tomax != NULL) {
11452			ASSERT(buf->dtb_size == size);
11453			continue;
11454		}
11455
11456		ASSERT(buf->dtb_xamot == NULL);
11457
11458		if ((buf->dtb_tomax = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
11459			goto err;
11460
11461		buf->dtb_size = size;
11462		buf->dtb_flags = flags;
11463		buf->dtb_offset = 0;
11464		buf->dtb_drops = 0;
11465
11466		if (flags & DTRACEBUF_NOSWITCH)
11467			continue;
11468
11469		if ((buf->dtb_xamot = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
11470			goto err;
11471	} while ((cp = cp->cpu_next) != cpu_list);
11472
11473	return (0);
11474
11475err:
11476	cp = cpu_list;
11477
11478	do {
11479		if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
11480			continue;
11481
11482		buf = &bufs[cp->cpu_id];
11483
11484		if (buf->dtb_xamot != NULL) {
11485			ASSERT(buf->dtb_tomax != NULL);
11486			ASSERT(buf->dtb_size == size);
11487			kmem_free(buf->dtb_xamot, size);
11488		}
11489
11490		if (buf->dtb_tomax != NULL) {
11491			ASSERT(buf->dtb_size == size);
11492			kmem_free(buf->dtb_tomax, size);
11493		}
11494
11495		buf->dtb_tomax = NULL;
11496		buf->dtb_xamot = NULL;
11497		buf->dtb_size = 0;
11498	} while ((cp = cp->cpu_next) != cpu_list);
11499
11500	return (ENOMEM);
11501}
11502
11503/*
11504 * Note:  called from probe context.  This function just increments the drop
11505 * count on a buffer.  It has been made a function to allow for the
11506 * possibility of understanding the source of mysterious drop counts.  (A
11507 * problem for which one may be particularly disappointed that DTrace cannot
11508 * be used to understand DTrace.)
11509 */
11510static void
11511dtrace_buffer_drop(dtrace_buffer_t *buf)
11512{
11513	buf->dtb_drops++;
11514}
11515
11516/*
11517 * Note:  called from probe context.  This function is called to reserve space
11518 * in a buffer.  If mstate is non-NULL, sets the scratch base and size in the
11519 * mstate.  Returns the new offset in the buffer, or a negative value if an
11520 * error has occurred.
11521 */
11522static intptr_t
11523dtrace_buffer_reserve(dtrace_buffer_t *buf, size_t needed, size_t align,
11524    dtrace_state_t *state, dtrace_mstate_t *mstate)
11525{
11526	intptr_t offs = buf->dtb_offset, soffs;
11527	intptr_t woffs;
11528	caddr_t tomax;
11529	size_t total_off;
11530
11531	if (buf->dtb_flags & DTRACEBUF_INACTIVE)
11532		return (-1);
11533
11534	if ((tomax = buf->dtb_tomax) == NULL) {
11535		dtrace_buffer_drop(buf);
11536		return (-1);
11537	}
11538
11539	if (!(buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL))) {
11540		while (offs & (align - 1)) {
11541			/*
11542			 * Assert that our alignment is off by a number which
11543			 * is itself sizeof (uint32_t) aligned.
11544			 */
11545			ASSERT(!((align - (offs & (align - 1))) &
11546			    (sizeof (uint32_t) - 1)));
11547			DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
11548			offs += sizeof (uint32_t);
11549		}
11550
11551#if !defined(__APPLE__) /* Quiet compiler warnings */
11552		if ((soffs = offs + needed) > buf->dtb_size) {
11553#else
11554		if ((uint64_t)(soffs = offs + needed) > buf->dtb_size) {
11555#endif /* __APPLE__ */
11556			dtrace_buffer_drop(buf);
11557			return (-1);
11558		}
11559
11560		if (mstate == NULL)
11561			return (offs);
11562
11563		mstate->dtms_scratch_base = (uintptr_t)tomax + soffs;
11564		mstate->dtms_scratch_size = buf->dtb_size - soffs;
11565		mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
11566
11567		return (offs);
11568	}
11569
11570	if (buf->dtb_flags & DTRACEBUF_FILL) {
11571		if (state->dts_activity != DTRACE_ACTIVITY_COOLDOWN &&
11572		    (buf->dtb_flags & DTRACEBUF_FULL))
11573			return (-1);
11574		goto out;
11575	}
11576
11577	total_off = needed + (offs & (align - 1));
11578
11579	/*
11580	 * For a ring buffer, life is quite a bit more complicated.  Before
11581	 * we can store any padding, we need to adjust our wrapping offset.
11582	 * (If we've never before wrapped or we're not about to, no adjustment
11583	 * is required.)
11584	 */
11585	if ((buf->dtb_flags & DTRACEBUF_WRAPPED) ||
11586	    offs + total_off > buf->dtb_size) {
11587		woffs = buf->dtb_xamot_offset;
11588
11589		if (offs + total_off > buf->dtb_size) {
11590			/*
11591			 * We can't fit in the end of the buffer.  First, a
11592			 * sanity check that we can fit in the buffer at all.
11593			 */
11594			if (total_off > buf->dtb_size) {
11595				dtrace_buffer_drop(buf);
11596				return (-1);
11597			}
11598
11599			/*
11600			 * We're going to be storing at the top of the buffer,
11601			 * so now we need to deal with the wrapped offset.  We
11602			 * only reset our wrapped offset to 0 if it is
11603			 * currently greater than the current offset.  If it
11604			 * is less than the current offset, it is because a
11605			 * previous allocation induced a wrap -- but the
11606			 * allocation didn't subsequently take the space due
11607			 * to an error or false predicate evaluation.  In this
11608			 * case, we'll just leave the wrapped offset alone: if
11609			 * the wrapped offset hasn't been advanced far enough
11610			 * for this allocation, it will be adjusted in the
11611			 * lower loop.
11612			 */
11613			if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
11614				if (woffs >= offs)
11615					woffs = 0;
11616			} else {
11617				woffs = 0;
11618			}
11619
11620			/*
11621			 * Now we know that we're going to be storing to the
11622			 * top of the buffer and that there is room for us
11623			 * there.  We need to clear the buffer from the current
11624			 * offset to the end (there may be old gunk there).
11625			 */
11626#if !defined(__APPLE__) /* Quiet compiler warnings */
11627			while (offs < buf->dtb_size)
11628#else
11629			while ((uint64_t)offs < buf->dtb_size)
11630#endif /* __APPLE__ */
11631				tomax[offs++] = 0;
11632
11633			/*
11634			 * We need to set our offset to zero.  And because we
11635			 * are wrapping, we need to set the bit indicating as
11636			 * much.  We can also adjust our needed space back
11637			 * down to the space required by the ECB -- we know
11638			 * that the top of the buffer is aligned.
11639			 */
11640			offs = 0;
11641			total_off = needed;
11642			buf->dtb_flags |= DTRACEBUF_WRAPPED;
11643		} else {
11644			/*
11645			 * There is room for us in the buffer, so we simply
11646			 * need to check the wrapped offset.
11647			 */
11648			if (woffs < offs) {
11649				/*
11650				 * The wrapped offset is less than the offset.
11651				 * This can happen if we allocated buffer space
11652				 * that induced a wrap, but then we didn't
11653				 * subsequently take the space due to an error
11654				 * or false predicate evaluation.  This is
11655				 * okay; we know that _this_ allocation isn't
11656				 * going to induce a wrap.  We still can't
11657				 * reset the wrapped offset to be zero,
11658				 * however: the space may have been trashed in
11659				 * the previous failed probe attempt.  But at
11660				 * least the wrapped offset doesn't need to
11661				 * be adjusted at all...
11662				 */
11663				goto out;
11664			}
11665		}
11666
11667#if !defined(__APPLE__) /* Quiet compiler warnings */
11668		while (offs + total_off > woffs) {
11669#else
11670		while (offs + total_off > (size_t)woffs) {
11671#endif /* __APPLE__ */
11672			dtrace_epid_t epid = *(uint32_t *)(tomax + woffs);
11673			size_t size;
11674
11675			if (epid == DTRACE_EPIDNONE) {
11676				size = sizeof (uint32_t);
11677			} else {
11678#if !defined(__APPLE__) /* Quiet compiler warnings */
11679				ASSERT(epid <= state->dts_necbs);
11680#else
11681				ASSERT(epid <= (dtrace_epid_t)state->dts_necbs);
11682#endif /* __APPLE__ */
11683				ASSERT(state->dts_ecbs[epid - 1] != NULL);
11684
11685				size = state->dts_ecbs[epid - 1]->dte_size;
11686			}
11687
11688			ASSERT(woffs + size <= buf->dtb_size);
11689			ASSERT(size != 0);
11690
11691			if (woffs + size == buf->dtb_size) {
11692				/*
11693				 * We've reached the end of the buffer; we want
11694				 * to set the wrapped offset to 0 and break
11695				 * out.  However, if the offs is 0, then we're
11696				 * in a strange edge-condition:  the amount of
11697				 * space that we want to reserve plus the size
11698				 * of the record that we're overwriting is
11699				 * greater than the size of the buffer.  This
11700				 * is problematic because if we reserve the
11701				 * space but subsequently don't consume it (due
11702				 * to a failed predicate or error) the wrapped
11703				 * offset will be 0 -- yet the EPID at offset 0
11704				 * will not be committed.  This situation is
11705				 * relatively easy to deal with:  if we're in
11706				 * this case, the buffer is indistinguishable
11707				 * from one that hasn't wrapped; we need only
11708				 * finish the job by clearing the wrapped bit,
11709				 * explicitly setting the offset to be 0, and
11710				 * zero'ing out the old data in the buffer.
11711				 */
11712				if (offs == 0) {
11713					buf->dtb_flags &= ~DTRACEBUF_WRAPPED;
11714					buf->dtb_offset = 0;
11715					woffs = total_off;
11716
11717#if !defined(__APPLE__) /* Quiet compiler warnings */
11718					while (woffs < buf->dtb_size)
11719#else
11720					while ((uint64_t)woffs < buf->dtb_size)
11721#endif /* __APPLE__ */
11722
11723						tomax[woffs++] = 0;
11724				}
11725
11726				woffs = 0;
11727				break;
11728			}
11729
11730			woffs += size;
11731		}
11732
11733		/*
11734		 * We have a wrapped offset.  It may be that the wrapped offset
11735		 * has become zero -- that's okay.
11736		 */
11737		buf->dtb_xamot_offset = woffs;
11738	}
11739
11740out:
11741	/*
11742	 * Now we can plow the buffer with any necessary padding.
11743	 */
11744	while (offs & (align - 1)) {
11745		/*
11746		 * Assert that our alignment is off by a number which
11747		 * is itself sizeof (uint32_t) aligned.
11748		 */
11749		ASSERT(!((align - (offs & (align - 1))) &
11750		    (sizeof (uint32_t) - 1)));
11751		DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
11752		offs += sizeof (uint32_t);
11753	}
11754
11755	if (buf->dtb_flags & DTRACEBUF_FILL) {
11756		if (offs + needed > buf->dtb_size - state->dts_reserve) {
11757			buf->dtb_flags |= DTRACEBUF_FULL;
11758			return (-1);
11759		}
11760	}
11761
11762	if (mstate == NULL)
11763		return (offs);
11764
11765	/*
11766	 * For ring buffers and fill buffers, the scratch space is always
11767	 * the inactive buffer.
11768	 */
11769	mstate->dtms_scratch_base = (uintptr_t)buf->dtb_xamot;
11770	mstate->dtms_scratch_size = buf->dtb_size;
11771	mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
11772
11773	return (offs);
11774}
11775
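/*
 * Polish a wrapped ring buffer by zeroing any region of it that does not
 * contain valid data; the three possible cases are enumerated in the
 * comment below.
 */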
11776static void
11777dtrace_buffer_polish(dtrace_buffer_t *buf)
11778{
11779	ASSERT(buf->dtb_flags & DTRACEBUF_RING);
11780	lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11781
11782	if (!(buf->dtb_flags & DTRACEBUF_WRAPPED))
11783		return;
11784
11785	/*
11786	 * We need to polish the ring buffer.  There are three cases:
11787	 *
11788	 * - The first (and presumably most common) is that there is no gap
11789	 *   between the buffer offset and the wrapped offset.  In this case,
11790	 *   there is nothing in the buffer that isn't valid data; we can
11791	 *   mark the buffer as polished and return.
11792	 *
11793	 * - The second (less common than the first but still more common
11794	 *   than the third) is that there is a gap between the buffer offset
11795	 *   and the wrapped offset, and the wrapped offset is larger than the
11796	 *   buffer offset.  This can happen because of an alignment issue, or
11797	 *   can happen because of a call to dtrace_buffer_reserve() that
11798	 *   didn't subsequently consume the buffer space.  In this case,
11799	 *   we need to zero the data from the buffer offset to the wrapped
11800	 *   offset.
11801	 *
11802	 * - The third (and least common) is that there is a gap between the
11803	 *   buffer offset and the wrapped offset, but the wrapped offset is
11804	 *   _less_ than the buffer offset.  This can only happen because a
11805	 *   call to dtrace_buffer_reserve() induced a wrap, but the space
11806	 *   was not subsequently consumed.  In this case, we need to zero the
11807	 *   space from the offset to the end of the buffer _and_ from the
11808	 *   top of the buffer to the wrapped offset.
11809	 */
11810	if (buf->dtb_offset < buf->dtb_xamot_offset) {
11811		bzero(buf->dtb_tomax + buf->dtb_offset,
11812		    buf->dtb_xamot_offset - buf->dtb_offset);
11813	}
11814
11815	if (buf->dtb_offset > buf->dtb_xamot_offset) {
11816		bzero(buf->dtb_tomax + buf->dtb_offset,
11817		    buf->dtb_size - buf->dtb_offset);
11818		bzero(buf->dtb_tomax, buf->dtb_xamot_offset);
11819	}
11820}
11821
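/*
 * Free the principal buffer (and any paired switch buffer) for every CPU in
 * the specified buffer array, resetting each entry to its unallocated state.
 */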
11822static void
11823dtrace_buffer_free(dtrace_buffer_t *bufs)
11824{
11825	int i;
11826
11827	for (i = 0; i < (int)NCPU; i++) {
11828		dtrace_buffer_t *buf = &bufs[i];
11829
11830		if (buf->dtb_tomax == NULL) {
11831			ASSERT(buf->dtb_xamot == NULL);
11832			ASSERT(buf->dtb_size == 0);
11833			continue;
11834		}
11835
11836		if (buf->dtb_xamot != NULL) {
11837			ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
11838			kmem_free(buf->dtb_xamot, buf->dtb_size);
11839		}
11840
11841		kmem_free(buf->dtb_tomax, buf->dtb_size);
11842		buf->dtb_size = 0;
11843		buf->dtb_tomax = NULL;
11844		buf->dtb_xamot = NULL;
11845	}
11846}
11847
11848/*
11849 * DTrace Enabling Functions
11850 */
11851static dtrace_enabling_t *
11852dtrace_enabling_create(dtrace_vstate_t *vstate)
11853{
11854	dtrace_enabling_t *enab;
11855
11856	enab = kmem_zalloc(sizeof (dtrace_enabling_t), KM_SLEEP);
11857	enab->dten_vstate = vstate;
11858
11859	return (enab);
11860}
11861
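/*
 * Add the specified ECB description to the specified enabling, doubling the
 * size of the description array if it is full.
 */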
11862static void
11863dtrace_enabling_add(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb)
11864{
11865	dtrace_ecbdesc_t **ndesc;
11866	size_t osize, nsize;
11867
11868	/*
11869	 * We can't add to enablings after we've enabled them, or after we've
11870	 * retained them.
11871	 */
11872	ASSERT(enab->dten_probegen == 0);
11873	ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
11874
11875#if defined(__APPLE__)
11876	if (ecb == NULL) return; /* Note: protection against gcc 4.0 botch on x86 */
11877#endif /* __APPLE__ */
11878
11879	if (enab->dten_ndesc < enab->dten_maxdesc) {
11880		enab->dten_desc[enab->dten_ndesc++] = ecb;
11881		return;
11882	}
11883
11884	osize = enab->dten_maxdesc * sizeof (dtrace_enabling_t *);
11885
11886	if (enab->dten_maxdesc == 0) {
11887		enab->dten_maxdesc = 1;
11888	} else {
11889		enab->dten_maxdesc <<= 1;
11890	}
11891
11892	ASSERT(enab->dten_ndesc < enab->dten_maxdesc);
11893
11894	nsize = enab->dten_maxdesc * sizeof (dtrace_enabling_t *);
11895	ndesc = kmem_zalloc(nsize, KM_SLEEP);
11896	bcopy(enab->dten_desc, ndesc, osize);
11897	kmem_free(enab->dten_desc, osize);
11898
11899	enab->dten_desc = ndesc;
11900	enab->dten_desc[enab->dten_ndesc++] = ecb;
11901}
11902
11903static void
11904dtrace_enabling_addlike(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb,
11905    dtrace_probedesc_t *pd)
11906{
11907	dtrace_ecbdesc_t *new;
11908	dtrace_predicate_t *pred;
11909	dtrace_actdesc_t *act;
11910
11911	/*
11912	 * We're going to create a new ECB description that matches the
11913	 * specified ECB in every way, but has the specified probe description.
11914	 */
11915	new = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
11916
11917	if ((pred = ecb->dted_pred.dtpdd_predicate) != NULL)
11918		dtrace_predicate_hold(pred);
11919
11920	for (act = ecb->dted_action; act != NULL; act = act->dtad_next)
11921		dtrace_actdesc_hold(act);
11922
11923	new->dted_action = ecb->dted_action;
11924	new->dted_pred = ecb->dted_pred;
11925	new->dted_probe = *pd;
11926	new->dted_uarg = ecb->dted_uarg;
11927
11928	dtrace_enabling_add(enab, new);
11929}
11930
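/*
 * Log each probe description in the specified enabling via cmn_err().
 */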
11931static void
11932dtrace_enabling_dump(dtrace_enabling_t *enab)
11933{
11934	int i;
11935
11936	for (i = 0; i < enab->dten_ndesc; i++) {
11937		dtrace_probedesc_t *desc = &enab->dten_desc[i]->dted_probe;
11938
11939		cmn_err(CE_NOTE, "enabling probe %d (%s:%s:%s:%s)", i,
11940		    desc->dtpd_provider, desc->dtpd_mod,
11941		    desc->dtpd_func, desc->dtpd_name);
11942	}
11943}
11944
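/*
 * Destroy the specified enabling:  release its predicates and action
 * descriptions, free its ECB descriptions and, if the enabling was retained,
 * remove it from the dtrace_retained list (bumping dtrace_retained_gen).
 */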
11945static void
11946dtrace_enabling_destroy(dtrace_enabling_t *enab)
11947{
11948	int i;
11949	dtrace_ecbdesc_t *ep;
11950	dtrace_vstate_t *vstate = enab->dten_vstate;
11951
11952	lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11953
11954	for (i = 0; i < enab->dten_ndesc; i++) {
11955		dtrace_actdesc_t *act, *next;
11956		dtrace_predicate_t *pred;
11957
11958		ep = enab->dten_desc[i];
11959
11960		if ((pred = ep->dted_pred.dtpdd_predicate) != NULL)
11961			dtrace_predicate_release(pred, vstate);
11962
11963		for (act = ep->dted_action; act != NULL; act = next) {
11964			next = act->dtad_next;
11965			dtrace_actdesc_release(act, vstate);
11966		}
11967
11968		kmem_free(ep, sizeof (dtrace_ecbdesc_t));
11969	}
11970
11971	kmem_free(enab->dten_desc,
11972	    enab->dten_maxdesc * sizeof (dtrace_enabling_t *));
11973
11974	/*
11975	 * If this was a retained enabling, decrement the dts_nretained count
11976	 * and take it off of the dtrace_retained list.
11977	 */
11978	if (enab->dten_prev != NULL || enab->dten_next != NULL ||
11979	    dtrace_retained == enab) {
11980		ASSERT(enab->dten_vstate->dtvs_state != NULL);
11981		ASSERT(enab->dten_vstate->dtvs_state->dts_nretained > 0);
11982		enab->dten_vstate->dtvs_state->dts_nretained--;
11983		dtrace_retained_gen++;
11984	}
11985
11986	if (enab->dten_prev == NULL) {
11987		if (dtrace_retained == enab) {
11988			dtrace_retained = enab->dten_next;
11989
11990			if (dtrace_retained != NULL)
11991				dtrace_retained->dten_prev = NULL;
11992		}
11993	} else {
11994		ASSERT(enab != dtrace_retained);
11995		ASSERT(dtrace_retained != NULL);
11996		enab->dten_prev->dten_next = enab->dten_next;
11997	}
11998
11999	if (enab->dten_next != NULL) {
12000		ASSERT(dtrace_retained != NULL);
12001		enab->dten_next->dten_prev = enab->dten_prev;
12002	}
12003
12004	kmem_free(enab, sizeof (dtrace_enabling_t));
12005}
12006
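/*
 * Retain the specified enabling by adding it to the head of the
 * dtrace_retained list, failing with ENOSPC if the owning state has already
 * retained dtrace_retain_max enablings.
 */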
12007static int
12008dtrace_enabling_retain(dtrace_enabling_t *enab)
12009{
12010	dtrace_state_t *state;
12011
12012	lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12013	ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
12014	ASSERT(enab->dten_vstate != NULL);
12015
12016	state = enab->dten_vstate->dtvs_state;
12017	ASSERT(state != NULL);
12018
12019	/*
12020	 * We only allow each state to retain dtrace_retain_max enablings.
12021	 */
12022	if (state->dts_nretained >= dtrace_retain_max)
12023		return (ENOSPC);
12024
12025	state->dts_nretained++;
12026	dtrace_retained_gen++;
12027
12028	if (dtrace_retained == NULL) {
12029		dtrace_retained = enab;
12030		return (0);
12031	}
12032
12033	enab->dten_next = dtrace_retained;
12034	dtrace_retained->dten_prev = enab;
12035	dtrace_retained = enab;
12036
12037	return (0);
12038}
12039
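/*
 * For every retained ECB description in the specified state that exactly
 * matches the 'match' probe description, add an identical description with
 * the 'create' probe description to a new enabling -- and retain that new
 * enabling.  ENOENT is returned if nothing matched.
 */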
12040static int
12041dtrace_enabling_replicate(dtrace_state_t *state, dtrace_probedesc_t *match,
12042    dtrace_probedesc_t *create)
12043{
12044	dtrace_enabling_t *new, *enab;
12045	int found = 0, err = ENOENT;
12046
12047	lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12048	ASSERT(strlen(match->dtpd_provider) < DTRACE_PROVNAMELEN);
12049	ASSERT(strlen(match->dtpd_mod) < DTRACE_MODNAMELEN);
12050	ASSERT(strlen(match->dtpd_func) < DTRACE_FUNCNAMELEN);
12051	ASSERT(strlen(match->dtpd_name) < DTRACE_NAMELEN);
12052
12053	new = dtrace_enabling_create(&state->dts_vstate);
12054
12055	/*
12056	 * Iterate over all retained enablings, looking for enablings that
12057	 * match the specified state.
12058	 */
12059	for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
12060		int i;
12061
12062		/*
12063		 * dtvs_state can only be NULL for helper enablings -- and
12064		 * helper enablings can't be retained.
12065		 */
12066		ASSERT(enab->dten_vstate->dtvs_state != NULL);
12067
12068		if (enab->dten_vstate->dtvs_state != state)
12069			continue;
12070
12071		/*
12072		 * Now iterate over each probe description; we're looking for
12073		 * an exact match to the specified probe description.
12074		 */
12075		for (i = 0; i < enab->dten_ndesc; i++) {
12076			dtrace_ecbdesc_t *ep = enab->dten_desc[i];
12077			dtrace_probedesc_t *pd = &ep->dted_probe;
12078
12079#if !defined(__APPLE__)
12080			if (strcmp(pd->dtpd_provider, match->dtpd_provider))
12081				continue;
12082
12083			if (strcmp(pd->dtpd_mod, match->dtpd_mod))
12084				continue;
12085
12086			if (strcmp(pd->dtpd_func, match->dtpd_func))
12087				continue;
12088
12089			if (strcmp(pd->dtpd_name, match->dtpd_name))
12090				continue;
12091#else /* Employ size bounded string operation. */
12092			if (strncmp(pd->dtpd_provider, match->dtpd_provider, DTRACE_PROVNAMELEN))
12093				continue;
12094
12095			if (strncmp(pd->dtpd_mod, match->dtpd_mod, DTRACE_MODNAMELEN))
12096				continue;
12097
12098			if (strncmp(pd->dtpd_func, match->dtpd_func, DTRACE_FUNCNAMELEN))
12099				continue;
12100
12101			if (strncmp(pd->dtpd_name, match->dtpd_name, DTRACE_NAMELEN))
12102				continue;
12103#endif /* __APPLE__ */
12104
12105			/*
12106			 * We have a winning probe!  Add it to our growing
12107			 * enabling.
12108			 */
12109			found = 1;
12110			dtrace_enabling_addlike(new, ep, create);
12111		}
12112	}
12113
12114	if (!found || (err = dtrace_enabling_retain(new)) != 0) {
12115		dtrace_enabling_destroy(new);
12116		return (err);
12117	}
12118
12119	return (0);
12120}
12121
12122static void
12123dtrace_enabling_retract(dtrace_state_t *state)
12124{
12125	dtrace_enabling_t *enab, *next;
12126
12127	lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12128
12129	/*
12130	 * Iterate over all retained enablings, destroy the enablings retained
12131	 * for the specified state.
12132	 */
12133	for (enab = dtrace_retained; enab != NULL; enab = next) {
12134		next = enab->dten_next;
12135
12136		/*
12137		 * dtvs_state can only be NULL for helper enablings -- and
12138		 * helper enablings can't be retained.
12139		 */
12140		ASSERT(enab->dten_vstate->dtvs_state != NULL);
12141
12142		if (enab->dten_vstate->dtvs_state == state) {
12143			ASSERT(state->dts_nretained > 0);
12144			dtrace_enabling_destroy(enab);
12145		}
12146	}
12147
12148	ASSERT(state->dts_nretained == 0);
12149}
12150
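/*
 * Match (and thereby enable) each ECB description in the specified enabling,
 * recording the total number of probes matched in '*nmatched' if it is
 * non-NULL.  If a provider fails to enable a probe, EBUSY is returned; if an
 * error occurs part-way through, that error is returned with some probes
 * potentially left enabled.
 */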
12151static int
12152dtrace_enabling_match(dtrace_enabling_t *enab, int *nmatched)
12153{
12154	int i = 0;
12155	int total_matched = 0, matched = 0;
12156
12157	lck_mtx_assert(&cpu_lock, LCK_MTX_ASSERT_OWNED);
12158	lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12159
12160	for (i = 0; i < enab->dten_ndesc; i++) {
12161		dtrace_ecbdesc_t *ep = enab->dten_desc[i];
12162
12163		enab->dten_current = ep;
12164		enab->dten_error = 0;
12165
12166		/*
12167		 * If a provider failed to enable a probe then get out and
12168		 * let the consumer know we failed.
12169		 */
12170		if ((matched = dtrace_probe_enable(&ep->dted_probe, enab)) < 0)
12171			return (EBUSY);
12172
12173		total_matched += matched;
12174
12175		if (enab->dten_error != 0) {
12176			/*
12177			 * If we get an error half-way through enabling the
12178			 * probes, we kick out -- perhaps with some number of
12179			 * them enabled.  Leaving enabled probes enabled may
12180			 * be slightly confusing for user-level, but we expect
12181			 * that no one will attempt to actually drive on in
12182			 * the face of such errors.  If this is an anonymous
12183			 * enabling (indicated with a NULL nmatched pointer),
12184			 * we cmn_err() a message.  We aren't expecting to
12185			 * get such an error -- to the extent that it can exist
12186			 * at all, it would be the result of corrupted DOF in
12187			 * the driver properties.
12188			 */
12189			if (nmatched == NULL) {
12190				cmn_err(CE_WARN, "dtrace_enabling_match() "
12191				    "error on %p: %d", (void *)ep,
12192				    enab->dten_error);
12193			}
12194
12195			return (enab->dten_error);
12196		}
12197	}
12198
12199	enab->dten_probegen = dtrace_probegen;
12200	if (nmatched != NULL)
12201		*nmatched = total_matched;
12202
12203	return (0);
12204}
12205
12206static void
12207dtrace_enabling_matchall(void)
12208{
12209	dtrace_enabling_t *enab;
12210
12211	lck_mtx_lock(&cpu_lock);
12212	lck_mtx_lock(&dtrace_lock);
12213
12214	/*
12215	 * Iterate over all retained enablings to see if any probes match
12216	 * against them.  We only perform this operation on enablings for which
12217	 * we have sufficient permissions by virtue of being in the global zone
12218	 * or in the same zone as the DTrace client.  Because we can be called
12219	 * after dtrace_detach() has been called, we cannot assert that there
12220	 * are retained enablings.  We can safely load from dtrace_retained,
12221	 * however:  the taskq_destroy() at the end of dtrace_detach() will
12222	 * block pending our completion.
12223	 */
12224	for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
12225#if !defined(__APPLE__)
12226		cred_t *cr = enab->dten_vstate->dtvs_state->dts_cred.dcr_cred;
12227
12228		if (INGLOBALZONE(curproc) ||
12229		    cr != NULL && getzoneid() == crgetzoneid(cr))
12230			(void) dtrace_enabling_match(enab, NULL);
12231#else
12232		(void) dtrace_enabling_match(enab, NULL); /* As if always in "global" zone. */
12233#endif /* __APPLE__ */
12234	}
12235
12236	lck_mtx_unlock(&dtrace_lock);
12237	lck_mtx_unlock(&cpu_lock);
12238}
12239
12240/*
12241 * If an enabling is to be enabled without having matched probes (that is, if
12242 * dtrace_state_go() is to be called on the underlying dtrace_state_t), the
12243 * enabling must be _primed_ by creating an ECB for every ECB description.
12244 * This must be done to assure that we know the number of speculations, the
12245 * number of aggregations, the minimum buffer size needed, etc. before we
12246 * transition out of DTRACE_ACTIVITY_INACTIVE.  To do this without actually
12247 * enabling any probes, we create ECBs for every ECB description, but with a
12248 * NULL probe -- which is exactly what this function does.
12249 */
12250static void
12251dtrace_enabling_prime(dtrace_state_t *state)
12252{
12253	dtrace_enabling_t *enab;
12254	int i;
12255
12256	for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
12257		ASSERT(enab->dten_vstate->dtvs_state != NULL);
12258
12259		if (enab->dten_vstate->dtvs_state != state)
12260			continue;
12261
12262		/*
12263		 * We don't want to prime an enabling more than once, lest
12264		 * we allow a malicious user to induce resource exhaustion.
12265		 * (The ECBs that result from priming an enabling aren't
12266		 * leaked -- but they also aren't deallocated until the
12267		 * consumer state is destroyed.)
12268		 */
12269		if (enab->dten_primed)
12270			continue;
12271
12272		for (i = 0; i < enab->dten_ndesc; i++) {
12273			enab->dten_current = enab->dten_desc[i];
12274			(void) dtrace_probe_enable(NULL, enab);
12275		}
12276
12277		enab->dten_primed = 1;
12278	}
12279}
12280
12281/*
12282 * Called to indicate that probes should be provided due to retained
12283 * enablings.  This is implemented in terms of dtrace_probe_provide(), but it
12284 * must take an initial lap through the enabling calling the dtps_provide()
12285 * entry point explicitly to allow for autocreated probes.
12286 */
12287static void
12288dtrace_enabling_provide(dtrace_provider_t *prv)
12289{
12290	int i, all = 0;
12291	dtrace_probedesc_t desc;
12292	dtrace_genid_t gen;
12293
12294	lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12295	lck_mtx_assert(&dtrace_provider_lock, LCK_MTX_ASSERT_OWNED);
12296
12297	if (prv == NULL) {
12298		all = 1;
12299		prv = dtrace_provider;
12300	}
12301
12302	do {
12303		dtrace_enabling_t *enab;
12304		void *parg = prv->dtpv_arg;
12305
12306retry:
12307		gen = dtrace_retained_gen;
12308		for (enab = dtrace_retained; enab != NULL;
12309		    enab = enab->dten_next) {
12310			for (i = 0; i < enab->dten_ndesc; i++) {
12311				desc = enab->dten_desc[i]->dted_probe;
12312				lck_mtx_unlock(&dtrace_lock);
12313				prv->dtpv_pops.dtps_provide(parg, &desc);
12314				lck_mtx_lock(&dtrace_lock);
12315				/*
12316				 * Process the retained enablings again if
12317				 * they have changed while we weren't holding
12318				 * dtrace_lock.
12319				 */
12320				if (gen != dtrace_retained_gen)
12321					goto retry;
12322			}
12323		}
12324	} while (all && (prv = prv->dtpv_next) != NULL);
12325
12326	lck_mtx_unlock(&dtrace_lock);
12327	dtrace_probe_provide(NULL, all ? NULL : prv);
12328	lck_mtx_lock(&dtrace_lock);
12329}
12330
12331/*
12332 * DTrace DOF Functions
12333 */
12334/*ARGSUSED*/
12335static void
12336dtrace_dof_error(dof_hdr_t *dof, const char *str)
12337{
12338#pragma unused(dof) /* __APPLE__ */
12339	if (dtrace_err_verbose)
12340		cmn_err(CE_WARN, "failed to process DOF: %s", str);
12341
12342#ifdef DTRACE_ERRDEBUG
12343	dtrace_errdebug(str);
12344#endif
12345}
12346
12347/*
12348 * Create DOF out of a currently enabled state.  Right now, we only create
12349 * DOF containing the run-time options -- but this could be expanded to create
12350 * complete DOF representing the enabled state.
12351 */
12352static dof_hdr_t *
12353dtrace_dof_create(dtrace_state_t *state)
12354{
12355	dof_hdr_t *dof;
12356	dof_sec_t *sec;
12357	dof_optdesc_t *opt;
12358	int i, len = sizeof (dof_hdr_t) +
12359	    roundup(sizeof (dof_sec_t), sizeof (uint64_t)) +
12360	    sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
12361
12362	lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12363
12364#if !defined(__APPLE__)
12365	dof = kmem_zalloc(len, KM_SLEEP);
12366#else
12367	dof = dt_kmem_zalloc_aligned(len, 8, KM_SLEEP);
12368#endif /* __APPLE__ */
12369	dof->dofh_ident[DOF_ID_MAG0] = DOF_MAG_MAG0;
12370	dof->dofh_ident[DOF_ID_MAG1] = DOF_MAG_MAG1;
12371	dof->dofh_ident[DOF_ID_MAG2] = DOF_MAG_MAG2;
12372	dof->dofh_ident[DOF_ID_MAG3] = DOF_MAG_MAG3;
12373
12374	dof->dofh_ident[DOF_ID_MODEL] = DOF_MODEL_NATIVE;
12375	dof->dofh_ident[DOF_ID_ENCODING] = DOF_ENCODE_NATIVE;
12376	dof->dofh_ident[DOF_ID_VERSION] = DOF_VERSION;
12377	dof->dofh_ident[DOF_ID_DIFVERS] = DIF_VERSION;
12378	dof->dofh_ident[DOF_ID_DIFIREG] = DIF_DIR_NREGS;
12379	dof->dofh_ident[DOF_ID_DIFTREG] = DIF_DTR_NREGS;
12380
12381	dof->dofh_flags = 0;
12382	dof->dofh_hdrsize = sizeof (dof_hdr_t);
12383	dof->dofh_secsize = sizeof (dof_sec_t);
12384	dof->dofh_secnum = 1;	/* only DOF_SECT_OPTDESC */
12385	dof->dofh_secoff = sizeof (dof_hdr_t);
12386	dof->dofh_loadsz = len;
12387	dof->dofh_filesz = len;
12388	dof->dofh_pad = 0;
12389
12390	/*
12391	 * Fill in the option section header...
12392	 */
12393	sec = (dof_sec_t *)((uintptr_t)dof + sizeof (dof_hdr_t));
12394	sec->dofs_type = DOF_SECT_OPTDESC;
12395	sec->dofs_align = sizeof (uint64_t);
12396	sec->dofs_flags = DOF_SECF_LOAD;
12397	sec->dofs_entsize = sizeof (dof_optdesc_t);
12398
12399	opt = (dof_optdesc_t *)((uintptr_t)sec +
12400	    roundup(sizeof (dof_sec_t), sizeof (uint64_t)));
12401
12402	sec->dofs_offset = (uintptr_t)opt - (uintptr_t)dof;
12403	sec->dofs_size = sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
12404
12405	for (i = 0; i < DTRACEOPT_MAX; i++) {
12406		opt[i].dofo_option = i;
12407		opt[i].dofo_strtab = DOF_SECIDX_NONE;
12408		opt[i].dofo_value = state->dts_options[i];
12409	}
12410
12411	return (dof);
12412}
12413
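/*
 * Copy DOF in from the specified user address:  the header is copied in
 * first to validate the load size, and then the entire DOF is copied into
 * kernel memory.  The result should later be freed with dtrace_dof_destroy().
 */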
12414static dof_hdr_t *
12415#if !defined(__APPLE__)
12416dtrace_dof_copyin(uintptr_t uarg, int *errp)
12417#else
12418dtrace_dof_copyin(user_addr_t uarg, int *errp)
12419#endif
12420{
12421	dof_hdr_t hdr, *dof;
12422
12423	lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_NOTOWNED);
12424
12425	/*
12426	 * First, we're going to copyin() the sizeof (dof_hdr_t).
12427	 */
12428#if !defined(__APPLE__)
12429	if (copyin((void *)uarg, &hdr, sizeof (hdr)) != 0) {
12430#else
12431	if (copyin(uarg, &hdr, sizeof (hdr)) != 0) {
12432#endif
12433		dtrace_dof_error(NULL, "failed to copyin DOF header");
12434		*errp = EFAULT;
12435		return (NULL);
12436	}
12437
12438	/*
12439	 * Now we'll allocate the entire DOF and copy it in -- provided
12440	 * that the length isn't outrageous.
12441	 */
12442#if !defined(__APPLE__) /* Quiet compiler warnings */
12443	if (hdr.dofh_loadsz >= dtrace_dof_maxsize) {
12444#else
12445	if (hdr.dofh_loadsz >= (uint64_t)dtrace_dof_maxsize) {
12446#endif /* __APPLE__ */
12447		dtrace_dof_error(&hdr, "load size exceeds maximum");
12448		*errp = E2BIG;
12449		return (NULL);
12450	}
12451
12452	if (hdr.dofh_loadsz < sizeof (hdr)) {
12453		dtrace_dof_error(&hdr, "invalid load size");
12454		*errp = EINVAL;
12455		return (NULL);
12456	}
12457
12458#if !defined(__APPLE__)
12459	dof = kmem_alloc(hdr.dofh_loadsz, KM_SLEEP);
12460
12461	if (copyin((void *)uarg, dof, hdr.dofh_loadsz) != 0 ||
12462	    dof->dofh_loadsz != hdr.dofh_loadsz) {
12463		kmem_free(dof, hdr.dofh_loadsz);
12464		*errp = EFAULT;
12465		return (NULL);
12466	}
12467#else
12468	dof = dt_kmem_alloc_aligned(hdr.dofh_loadsz, 8, KM_SLEEP);
12469
12470	if (copyin(uarg, dof, hdr.dofh_loadsz) != 0 ||
12471	    dof->dofh_loadsz != hdr.dofh_loadsz) {
12472		dt_kmem_free_aligned(dof, hdr.dofh_loadsz);
12473		*errp = EFAULT;
12474		return (NULL);
12475	}
12476#endif
12477
12478	return (dof);
12479}
12480
12481#if defined(__APPLE__)
12482
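/*
 * As dtrace_dof_copyin(), but the DOF is read (via uread()) from the address
 * space of the specified process rather than that of the current thread.
 */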
12483static dof_hdr_t *
12484dtrace_dof_copyin_from_proc(proc_t* p, user_addr_t uarg, int *errp)
12485{
12486	dof_hdr_t hdr, *dof;
12487
12488	lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_NOTOWNED);
12489
12490	/*
12491	 * First, we're going to copyin() the sizeof (dof_hdr_t).
12492	 */
12493	if (uread(p, &hdr, sizeof(hdr), uarg) != KERN_SUCCESS) {
12494		dtrace_dof_error(NULL, "failed to copyin DOF header");
12495		*errp = EFAULT;
12496		return (NULL);
12497	}
12498
12499	/*
12500	 * Now we'll allocate the entire DOF and copy it in -- provided
12501	 * that the length isn't outrageous.
12502	 */
12503	if (hdr.dofh_loadsz >= (uint64_t)dtrace_dof_maxsize) {
12504		dtrace_dof_error(&hdr, "load size exceeds maximum");
12505		*errp = E2BIG;
12506		return (NULL);
12507	}
12508
12509	if (hdr.dofh_loadsz < sizeof (hdr)) {
12510		dtrace_dof_error(&hdr, "invalid load size");
12511		*errp = EINVAL;
12512		return (NULL);
12513	}
12514
12515	dof = dt_kmem_alloc_aligned(hdr.dofh_loadsz, 8, KM_SLEEP);
12516
12517	if (uread(p, dof, hdr.dofh_loadsz, uarg) != KERN_SUCCESS) {
12518		dt_kmem_free_aligned(dof, hdr.dofh_loadsz);
12519		*errp = EFAULT;
12520		return (NULL);
12521	}
12522
12523	return (dof);
12524}
12525
12526#endif /* __APPLE__ */
12527
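/*
 * Look up DOF stored as an integer-array property of the given name on the
 * DTrace device node, returning a kernel copy of the DOF or NULL if the
 * property is absent, truncated or oversized.
 */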
12528static dof_hdr_t *
12529dtrace_dof_property(const char *name)
12530{
12531	uchar_t *buf;
12532	uint64_t loadsz;
12533	unsigned int len, i;
12534	dof_hdr_t *dof;
12535
12536	/*
12537	 * Unfortunately, arrays of values in .conf files are always (and
12538	 * only) interpreted as integer arrays.  We must read our DOF
12539	 * as an integer array, and then squeeze it into a byte array.
12540	 */
12541#if !defined(__APPLE__) /* Quiet compiler warnings */
12542	if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dtrace_devi, 0,
12543	    (char *)name, (int **)&buf, &len) != DDI_PROP_SUCCESS)
12544		return (NULL);
12545#else
12546	if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dtrace_devi, 0,
12547	    name, (int **)&buf, &len) != DDI_PROP_SUCCESS)
12548		return (NULL);
12549#endif /* __APPLE__ */
12550
12551	for (i = 0; i < len; i++)
12552		buf[i] = (uchar_t)(((int *)buf)[i]);
12553
12554	if (len < sizeof (dof_hdr_t)) {
12555		ddi_prop_free(buf);
12556		dtrace_dof_error(NULL, "truncated header");
12557		return (NULL);
12558	}
12559
12560	if (len < (loadsz = ((dof_hdr_t *)buf)->dofh_loadsz)) {
12561		ddi_prop_free(buf);
12562		dtrace_dof_error(NULL, "truncated DOF");
12563		return (NULL);
12564	}
12565
12566#if !defined(__APPLE__) /* Quiet compiler warnings */
12567	if (loadsz >= dtrace_dof_maxsize) {
12568#else
12569	if (loadsz >= (uint64_t)dtrace_dof_maxsize) {
12570#endif /* __APPLE__ */
12571		ddi_prop_free(buf);
12572		dtrace_dof_error(NULL, "oversized DOF");
12573		return (NULL);
12574	}
12575
12576#if !defined(__APPLE__)
12577	dof = kmem_alloc(loadsz, KM_SLEEP);
12578#else
12579	dof = dt_kmem_alloc_aligned(loadsz, 8, KM_SLEEP);
12580#endif /* __APPLE__ */
12581	bcopy(buf, dof, loadsz);
12582	ddi_prop_free(buf);
12583
12584	return (dof);
12585}
12586
12587static void
12588dtrace_dof_destroy(dof_hdr_t *dof)
12589{
12590#if !defined(__APPLE__)
12591	kmem_free(dof, dof->dofh_loadsz);
12592#else
12593	dt_kmem_free_aligned(dof, dof->dofh_loadsz);
12594#endif /* __APPLE__ */
12595}
12596
12597/*
12598 * Return the dof_sec_t pointer corresponding to a given section index.  If the
12599 * index is not valid, dtrace_dof_error() is called and NULL is returned.  If
12600 * a type other than DOF_SECT_NONE is specified, the header is checked against
12601 * this type and NULL is returned if the types do not match.
12602 */
12603static dof_sec_t *
12604dtrace_dof_sect(dof_hdr_t *dof, uint32_t type, dof_secidx_t i)
12605{
12606	dof_sec_t *sec = (dof_sec_t *)(uintptr_t)
12607	    ((uintptr_t)dof + dof->dofh_secoff + i * dof->dofh_secsize);
12608
12609	if (i >= dof->dofh_secnum) {
12610		dtrace_dof_error(dof, "referenced section index is invalid");
12611		return (NULL);
12612	}
12613
12614	if (!(sec->dofs_flags & DOF_SECF_LOAD)) {
12615		dtrace_dof_error(dof, "referenced section is not loadable");
12616		return (NULL);
12617	}
12618
12619	if (type != DOF_SECT_NONE && type != sec->dofs_type) {
12620		dtrace_dof_error(dof, "referenced section is the wrong type");
12621		return (NULL);
12622	}
12623
12624	return (sec);
12625}
12626
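/*
 * Fill in the specified probe description from a DOF_SECT_PROBEDESC section,
 * bounds-checking each string table reference.  NULL is returned (after
 * calling dtrace_dof_error()) on any failure.
 */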
12627static dtrace_probedesc_t *
12628dtrace_dof_probedesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_probedesc_t *desc)
12629{
12630	dof_probedesc_t *probe;
12631	dof_sec_t *strtab;
12632	uintptr_t daddr = (uintptr_t)dof;
12633	uintptr_t str;
12634	size_t size;
12635
12636	if (sec->dofs_type != DOF_SECT_PROBEDESC) {
12637		dtrace_dof_error(dof, "invalid probe section");
12638		return (NULL);
12639	}
12640
12641	if (sec->dofs_align != sizeof (dof_secidx_t)) {
12642		dtrace_dof_error(dof, "bad alignment in probe description");
12643		return (NULL);
12644	}
12645
12646	if (sec->dofs_offset + sizeof (dof_probedesc_t) > dof->dofh_loadsz) {
12647		dtrace_dof_error(dof, "truncated probe description");
12648		return (NULL);
12649	}
12650
12651	probe = (dof_probedesc_t *)(uintptr_t)(daddr + sec->dofs_offset);
12652	strtab = dtrace_dof_sect(dof, DOF_SECT_STRTAB, probe->dofp_strtab);
12653
12654	if (strtab == NULL)
12655		return (NULL);
12656
12657	str = daddr + strtab->dofs_offset;
12658	size = strtab->dofs_size;
12659
12660	if (probe->dofp_provider >= strtab->dofs_size) {
12661		dtrace_dof_error(dof, "corrupt probe provider");
12662		return (NULL);
12663	}
12664
12665	(void) strncpy(desc->dtpd_provider,
12666	    (char *)(str + probe->dofp_provider),
12667	    MIN(DTRACE_PROVNAMELEN - 1, size - probe->dofp_provider));
12668#if defined(__APPLE__) /* Employ size bounded string operation. */
12669	desc->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
12670#endif /* __APPLE__ */
12671
12672	if (probe->dofp_mod >= strtab->dofs_size) {
12673		dtrace_dof_error(dof, "corrupt probe module");
12674		return (NULL);
12675	}
12676
12677	(void) strncpy(desc->dtpd_mod, (char *)(str + probe->dofp_mod),
12678	    MIN(DTRACE_MODNAMELEN - 1, size - probe->dofp_mod));
12679#if defined(__APPLE__) /* Employ size bounded string operation. */
12680	desc->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
12681#endif /* __APPLE__ */
12682
12683	if (probe->dofp_func >= strtab->dofs_size) {
12684		dtrace_dof_error(dof, "corrupt probe function");
12685		return (NULL);
12686	}
12687
12688	(void) strncpy(desc->dtpd_func, (char *)(str + probe->dofp_func),
12689	    MIN(DTRACE_FUNCNAMELEN - 1, size - probe->dofp_func));
12690#if defined(__APPLE__) /* Employ size bounded string operation. */
12691	desc->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
12692#endif /* __APPLE__ */
12693
12694	if (probe->dofp_name >= strtab->dofs_size) {
12695		dtrace_dof_error(dof, "corrupt probe name");
12696		return (NULL);
12697	}
12698
12699	(void) strncpy(desc->dtpd_name, (char *)(str + probe->dofp_name),
12700	    MIN(DTRACE_NAMELEN - 1, size - probe->dofp_name));
12701#if defined(__APPLE__) /* Employ size bounded string operation. */
12702	desc->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
12703#endif /* __APPLE__ */
12704
12705	return (desc);
12706}
12707
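/*
 * Construct a DIF object from a DOF_SECT_DIFOHDR section and its referenced
 * DIF, integer table, string table and variable table sub-sections.  The
 * resulting DIFO is validated and initialized before it is returned; NULL is
 * returned on any error.
 */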
12708static dtrace_difo_t *
12709dtrace_dof_difo(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
12710    cred_t *cr)
12711{
12712	dtrace_difo_t *dp;
12713	size_t ttl = 0;
12714	dof_difohdr_t *dofd;
12715	uintptr_t daddr = (uintptr_t)dof;
12716	size_t max_size = dtrace_difo_maxsize;
12717#if !defined(__APPLE__) /* Quiet compiler warnings */
12718	int i, l, n;
12719#else
12720	uint_t i;
12721	int l, n;
12722#endif /* __APPLE__ */
12723
12724
12725	static const struct {
12726		int section;
12727		int bufoffs;
12728		int lenoffs;
12729		int entsize;
12730		int align;
12731		const char *msg;
12732	} difo[] = {
12733		{ DOF_SECT_DIF, offsetof(dtrace_difo_t, dtdo_buf),
12734		offsetof(dtrace_difo_t, dtdo_len), sizeof (dif_instr_t),
12735		sizeof (dif_instr_t), "multiple DIF sections" },
12736
12737		{ DOF_SECT_INTTAB, offsetof(dtrace_difo_t, dtdo_inttab),
12738		offsetof(dtrace_difo_t, dtdo_intlen), sizeof (uint64_t),
12739		sizeof (uint64_t), "multiple integer tables" },
12740
12741		{ DOF_SECT_STRTAB, offsetof(dtrace_difo_t, dtdo_strtab),
12742		offsetof(dtrace_difo_t, dtdo_strlen), 0,
12743		sizeof (char), "multiple string tables" },
12744
12745		{ DOF_SECT_VARTAB, offsetof(dtrace_difo_t, dtdo_vartab),
12746		offsetof(dtrace_difo_t, dtdo_varlen), sizeof (dtrace_difv_t),
12747		sizeof (uint_t), "multiple variable tables" },
12748
12749#if !defined(__APPLE__)
12750		{ DOF_SECT_NONE, 0, 0, 0, NULL }
12751#else
12752		{ DOF_SECT_NONE, 0, 0, 0, 0, NULL }
12753#endif /* __APPLE__ */
12754	};
12755
12756	if (sec->dofs_type != DOF_SECT_DIFOHDR) {
12757		dtrace_dof_error(dof, "invalid DIFO header section");
12758		return (NULL);
12759	}
12760
12761	if (sec->dofs_align != sizeof (dof_secidx_t)) {
12762		dtrace_dof_error(dof, "bad alignment in DIFO header");
12763		return (NULL);
12764	}
12765
12766	if (sec->dofs_size < sizeof (dof_difohdr_t) ||
12767	    sec->dofs_size % sizeof (dof_secidx_t)) {
12768		dtrace_dof_error(dof, "bad size in DIFO header");
12769		return (NULL);
12770	}
12771
12772	dofd = (dof_difohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
12773	n = (sec->dofs_size - sizeof (*dofd)) / sizeof (dof_secidx_t) + 1;
12774
12775	dp = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
12776	dp->dtdo_rtype = dofd->dofd_rtype;
12777
12778	for (l = 0; l < n; l++) {
12779		dof_sec_t *subsec;
12780		void **bufp;
12781		uint32_t *lenp;
12782
12783		if ((subsec = dtrace_dof_sect(dof, DOF_SECT_NONE,
12784		    dofd->dofd_links[l])) == NULL)
12785			goto err; /* invalid section link */
12786
12787		if (ttl + subsec->dofs_size > max_size) {
12788			dtrace_dof_error(dof, "exceeds maximum size");
12789			goto err;
12790		}
12791
12792		ttl += subsec->dofs_size;
12793
12794		for (i = 0; difo[i].section != DOF_SECT_NONE; i++) {
12795
12796#if !defined(__APPLE__) /* Quiet compiler warnings */
12797			if (subsec->dofs_type != difo[i].section)
12798				continue;
12799#else
12800			if (subsec->dofs_type != (uint32_t)difo[i].section)
12801				continue;
12802#endif /* __APPLE __ */
12803
12804			if (!(subsec->dofs_flags & DOF_SECF_LOAD)) {
12805				dtrace_dof_error(dof, "section not loaded");
12806				goto err;
12807			}
12808
12809#if !defined(__APPLE__) /* Quiet compiler warnings */
12810			if (subsec->dofs_align != difo[i].align) {
12811				dtrace_dof_error(dof, "bad alignment");
12812				goto err;
12813			}
12814#else
12815			if (subsec->dofs_align != (uint32_t)difo[i].align) {
12816				dtrace_dof_error(dof, "bad alignment");
12817				goto err;
12818			}
12819#endif /* __APPLE__ */
12820
12821			bufp = (void **)((uintptr_t)dp + difo[i].bufoffs);
12822			lenp = (uint32_t *)((uintptr_t)dp + difo[i].lenoffs);
12823
12824			if (*bufp != NULL) {
12825				dtrace_dof_error(dof, difo[i].msg);
12826				goto err;
12827			}
12828
12829#if !defined(__APPLE__) /* Quiet compiler warnings */
12830			if (difo[i].entsize != subsec->dofs_entsize) {
12831				dtrace_dof_error(dof, "entry size mismatch");
12832				goto err;
12833			}
12834#else
12835			if ((uint32_t)difo[i].entsize != subsec->dofs_entsize) {
12836				dtrace_dof_error(dof, "entry size mismatch");
12837				goto err;
12838			}
12839#endif /* __APPLE__ */
12840
12841			if (subsec->dofs_entsize != 0 &&
12842			    (subsec->dofs_size % subsec->dofs_entsize) != 0) {
12843				dtrace_dof_error(dof, "corrupt entry size");
12844				goto err;
12845			}
12846
12847			*lenp = subsec->dofs_size;
12848			*bufp = kmem_alloc(subsec->dofs_size, KM_SLEEP);
12849			bcopy((char *)(uintptr_t)(daddr + subsec->dofs_offset),
12850			    *bufp, subsec->dofs_size);
12851
12852			if (subsec->dofs_entsize != 0)
12853				*lenp /= subsec->dofs_entsize;
12854
12855			break;
12856		}
12857
12858		/*
12859		 * If we encounter a loadable DIFO sub-section that is not
12860		 * known to us, assume this is a broken program and fail.
12861		 */
12862		if (difo[i].section == DOF_SECT_NONE &&
12863		    (subsec->dofs_flags & DOF_SECF_LOAD)) {
12864			dtrace_dof_error(dof, "unrecognized DIFO subsection");
12865			goto err;
12866		}
12867	}
12868
12869	if (dp->dtdo_buf == NULL) {
12870		/*
12871		 * We can't have a DIF object without DIF text.
12872		 */
12873		dtrace_dof_error(dof, "missing DIF text");
12874		goto err;
12875	}
12876
12877	/*
12878	 * Before we validate the DIF object, run through the variable table
12879	 * looking for string variables -- if any have a size of zero, we'll
12880	 * set their size to the system-wide default string size.  Note that
12881	 * this should _not_ happen if the "strsize" option has been set --
12882	 * in this case, the compiler should have set the size to reflect the
12883	 * setting of the option.
12884	 */
12885	for (i = 0; i < dp->dtdo_varlen; i++) {
12886		dtrace_difv_t *v = &dp->dtdo_vartab[i];
12887		dtrace_diftype_t *t = &v->dtdv_type;
12888
12889		if (v->dtdv_id < DIF_VAR_OTHER_UBASE)
12890			continue;
12891
12892		if (t->dtdt_kind == DIF_TYPE_STRING && t->dtdt_size == 0)
12893			t->dtdt_size = dtrace_strsize_default;
12894	}
12895
12896	if (dtrace_difo_validate(dp, vstate, DIF_DIR_NREGS, cr) != 0)
12897		goto err;
12898
12899	dtrace_difo_init(dp, vstate);
12900	return (dp);
12901
12902err:
12903	kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
12904	kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
12905	kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
12906	kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
12907
12908	kmem_free(dp, sizeof (dtrace_difo_t));
12909	return (NULL);
12910}
12911
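/*
 * Create a predicate from the DIFO described by the specified DOF_SECT_DIFOHDR
 * section.
 */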
12912static dtrace_predicate_t *
12913dtrace_dof_predicate(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
12914    cred_t *cr)
12915{
12916	dtrace_difo_t *dp;
12917
12918	if ((dp = dtrace_dof_difo(dof, sec, vstate, cr)) == NULL)
12919		return (NULL);
12920
12921	return (dtrace_predicate_create(dp));
12922}
12923
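/*
 * Construct a list of action descriptions from a DOF_SECT_ACTDESC section,
 * pulling in format strings for printf()-like actions and DIF objects for
 * actions that have them.  The head of the list is returned, or NULL on
 * error.
 */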
12924static dtrace_actdesc_t *
12925dtrace_dof_actdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
12926    cred_t *cr)
12927{
12928	dtrace_actdesc_t *act, *first = NULL, *last = NULL, *next;
12929	dof_actdesc_t *desc;
12930	dof_sec_t *difosec;
12931	size_t offs;
12932	uintptr_t daddr = (uintptr_t)dof;
12933	uint64_t arg;
12934	dtrace_actkind_t kind;
12935
12936	if (sec->dofs_type != DOF_SECT_ACTDESC) {
12937		dtrace_dof_error(dof, "invalid action section");
12938		return (NULL);
12939	}
12940
12941	if (sec->dofs_offset + sizeof (dof_actdesc_t) > dof->dofh_loadsz) {
12942		dtrace_dof_error(dof, "truncated action description");
12943		return (NULL);
12944	}
12945
12946	if (sec->dofs_align != sizeof (uint64_t)) {
12947		dtrace_dof_error(dof, "bad alignment in action description");
12948		return (NULL);
12949	}
12950
12951	if (sec->dofs_size < sec->dofs_entsize) {
12952		dtrace_dof_error(dof, "section entry size exceeds total size");
12953		return (NULL);
12954	}
12955
12956	if (sec->dofs_entsize != sizeof (dof_actdesc_t)) {
12957		dtrace_dof_error(dof, "bad entry size in action description");
12958		return (NULL);
12959	}
12960
12961	if (sec->dofs_size / sec->dofs_entsize > dtrace_actions_max) {
12962		dtrace_dof_error(dof, "actions exceed dtrace_actions_max");
12963		return (NULL);
12964	}
12965
12966	for (offs = 0; offs < sec->dofs_size; offs += sec->dofs_entsize) {
12967		desc = (dof_actdesc_t *)(daddr +
12968		    (uintptr_t)sec->dofs_offset + offs);
12969		kind = (dtrace_actkind_t)desc->dofa_kind;
12970
12971		if (DTRACEACT_ISPRINTFLIKE(kind) &&
12972		    (kind != DTRACEACT_PRINTA ||
12973		    desc->dofa_strtab != DOF_SECIDX_NONE)) {
12974			dof_sec_t *strtab;
12975			char *str, *fmt;
12976			uint64_t i;
12977
12978			/*
12979			 * printf()-like actions must have a format string.
12980			 */
12981			if ((strtab = dtrace_dof_sect(dof,
12982			    DOF_SECT_STRTAB, desc->dofa_strtab)) == NULL)
12983				goto err;
12984
12985			str = (char *)((uintptr_t)dof +
12986			    (uintptr_t)strtab->dofs_offset);
12987
12988			for (i = desc->dofa_arg; i < strtab->dofs_size; i++) {
12989				if (str[i] == '\0')
12990					break;
12991			}
12992
12993			if (i >= strtab->dofs_size) {
12994				dtrace_dof_error(dof, "bogus format string");
12995				goto err;
12996			}
12997
12998			if (i == desc->dofa_arg) {
12999				dtrace_dof_error(dof, "empty format string");
13000				goto err;
13001			}
13002
13003			i -= desc->dofa_arg;
13004			fmt = kmem_alloc(i + 1, KM_SLEEP);
13005			bcopy(&str[desc->dofa_arg], fmt, i + 1);
13006			arg = (uint64_t)(uintptr_t)fmt;
13007		} else {
13008			if (kind == DTRACEACT_PRINTA) {
13009				ASSERT(desc->dofa_strtab == DOF_SECIDX_NONE);
13010				arg = 0;
13011			} else {
13012				arg = desc->dofa_arg;
13013			}
13014		}
13015
13016		act = dtrace_actdesc_create(kind, desc->dofa_ntuple,
13017		    desc->dofa_uarg, arg);
13018
13019		if (last != NULL) {
13020			last->dtad_next = act;
13021		} else {
13022			first = act;
13023		}
13024
13025		last = act;
13026
13027		if (desc->dofa_difo == DOF_SECIDX_NONE)
13028			continue;
13029
13030		if ((difosec = dtrace_dof_sect(dof,
13031		    DOF_SECT_DIFOHDR, desc->dofa_difo)) == NULL)
13032			goto err;
13033
13034		act->dtad_difo = dtrace_dof_difo(dof, difosec, vstate, cr);
13035
13036		if (act->dtad_difo == NULL)
13037			goto err;
13038	}
13039
13040	ASSERT(first != NULL);
13041	return (first);
13042
13043err:
13044	for (act = first; act != NULL; act = next) {
13045		next = act->dtad_next;
13046		dtrace_actdesc_release(act, vstate);
13047	}
13048
13049	return (NULL);
13050}
13051
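/*
 * Construct an ECB description from a DOF_SECT_ECBDESC section, along with
 * its probe description and any predicate and action sections that it
 * references.  NULL is returned on error.
 */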
13052static dtrace_ecbdesc_t *
13053dtrace_dof_ecbdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
13054    cred_t *cr)
13055{
13056	dtrace_ecbdesc_t *ep;
13057	dof_ecbdesc_t *ecb;
13058	dtrace_probedesc_t *desc;
13059	dtrace_predicate_t *pred = NULL;
13060
13061	if (sec->dofs_size < sizeof (dof_ecbdesc_t)) {
13062		dtrace_dof_error(dof, "truncated ECB description");
13063		return (NULL);
13064	}
13065
13066	if (sec->dofs_align != sizeof (uint64_t)) {
13067		dtrace_dof_error(dof, "bad alignment in ECB description");
13068		return (NULL);
13069	}
13070
13071	ecb = (dof_ecbdesc_t *)((uintptr_t)dof + (uintptr_t)sec->dofs_offset);
13072	sec = dtrace_dof_sect(dof, DOF_SECT_PROBEDESC, ecb->dofe_probes);
13073
13074	if (sec == NULL)
13075		return (NULL);
13076
13077	ep = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
13078	ep->dted_uarg = ecb->dofe_uarg;
13079	desc = &ep->dted_probe;
13080
13081	if (dtrace_dof_probedesc(dof, sec, desc) == NULL)
13082		goto err;
13083
13084	if (ecb->dofe_pred != DOF_SECIDX_NONE) {
13085		if ((sec = dtrace_dof_sect(dof,
13086		    DOF_SECT_DIFOHDR, ecb->dofe_pred)) == NULL)
13087			goto err;
13088
13089		if ((pred = dtrace_dof_predicate(dof, sec, vstate, cr)) == NULL)
13090			goto err;
13091
13092		ep->dted_pred.dtpdd_predicate = pred;
13093	}
13094
13095	if (ecb->dofe_actions != DOF_SECIDX_NONE) {
13096		if ((sec = dtrace_dof_sect(dof,
13097		    DOF_SECT_ACTDESC, ecb->dofe_actions)) == NULL)
13098			goto err;
13099
13100		ep->dted_action = dtrace_dof_actdesc(dof, sec, vstate, cr);
13101
13102		if (ep->dted_action == NULL)
13103			goto err;
13104	}
13105
13106	return (ep);
13107
13108err:
13109	if (pred != NULL)
13110		dtrace_predicate_release(pred, vstate);
13111	kmem_free(ep, sizeof (dtrace_ecbdesc_t));
13112	return (NULL);
13113}
13114
13115#if !defined(__APPLE__) /* APPLE dyld has already done this for us */
13116/*
13117 * Apply the relocations from the specified 'sec' (a DOF_SECT_URELHDR) to the
13118 * specified DOF.  At present, this amounts to simply adding 'ubase' to the
13119 * site of any user SETX relocations to account for load object base address.
13120 * In the future, if we need other relocations, this function can be extended.
13121 */
13122static int
13123dtrace_dof_relocate(dof_hdr_t *dof, dof_sec_t *sec, uint64_t ubase)
13124{
13125	uintptr_t daddr = (uintptr_t)dof;
13126	dof_relohdr_t *dofr =
13127	    (dof_relohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
13128	dof_sec_t *ss, *rs, *ts;
13129	dof_relodesc_t *r;
13130	uint_t i, n;
13131
13132	if (sec->dofs_size < sizeof (dof_relohdr_t) ||
13133	    sec->dofs_align != sizeof (dof_secidx_t)) {
13134		dtrace_dof_error(dof, "invalid relocation header");
13135		return (-1);
13136	}
13137
13138	ss = dtrace_dof_sect(dof, DOF_SECT_STRTAB, dofr->dofr_strtab);
13139	rs = dtrace_dof_sect(dof, DOF_SECT_RELTAB, dofr->dofr_relsec);
13140	ts = dtrace_dof_sect(dof, DOF_SECT_NONE, dofr->dofr_tgtsec);
13141
13142	if (ss == NULL || rs == NULL || ts == NULL)
13143		return (-1); /* dtrace_dof_error() has been called already */
13144
13145	if (rs->dofs_entsize < sizeof (dof_relodesc_t) ||
13146	    rs->dofs_align != sizeof (uint64_t)) {
13147		dtrace_dof_error(dof, "invalid relocation section");
13148		return (-1);
13149	}
13150
13151	r = (dof_relodesc_t *)(uintptr_t)(daddr + rs->dofs_offset);
13152	n = rs->dofs_size / rs->dofs_entsize;
13153
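	/*
	 * Walk each relocation descriptor, patching the target section in
	 * place as we go.
	 */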
13154	for (i = 0; i < n; i++) {
13155		uintptr_t taddr = daddr + ts->dofs_offset + r->dofr_offset;
13156
13157		switch (r->dofr_type) {
13158		case DOF_RELO_NONE:
13159			break;
13160		case DOF_RELO_SETX:
13161			if (r->dofr_offset >= ts->dofs_size || r->dofr_offset +
13162			    sizeof (uint64_t) > ts->dofs_size) {
13163				dtrace_dof_error(dof, "bad relocation offset");
13164				return (-1);
13165			}
13166
13167			if (!IS_P2ALIGNED(taddr, sizeof (uint64_t))) {
13168				dtrace_dof_error(dof, "misaligned setx relo");
13169				return (-1);
13170			}
13171
13172			*(uint64_t *)taddr += ubase;
13173			break;
13174		default:
13175			dtrace_dof_error(dof, "invalid relocation type");
13176			return (-1);
13177		}
13178
13179		r = (dof_relodesc_t *)((uintptr_t)r + rs->dofs_entsize);
13180	}
13181
13182	return (0);
13183}
13184#endif /* __APPLE__ */
13185
13186/*
13187 * The dof_hdr_t passed to dtrace_dof_slurp() should be a partially validated
13188 * header:  it should be at the front of a memory region that is at least
13189 * sizeof (dof_hdr_t) in size -- and then at least dof_hdr.dofh_loadsz in
13190 * size.  It need not be validated in any other way.
13191 */
13192static int
13193dtrace_dof_slurp(dof_hdr_t *dof, dtrace_vstate_t *vstate, cred_t *cr,
13194    dtrace_enabling_t **enabp, uint64_t ubase, int noprobes)
13195{
13196#pragma unused(ubase) /* __APPLE__ */
13197	uint64_t len = dof->dofh_loadsz, seclen;
13198	uintptr_t daddr = (uintptr_t)dof;
13199	dtrace_ecbdesc_t *ep;
13200	dtrace_enabling_t *enab;
13201	uint_t i;
13202
13203	lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
13204	ASSERT(dof->dofh_loadsz >= sizeof (dof_hdr_t));
13205
13206	/*
13207	 * Check the DOF header identification bytes.  In addition to checking
13208	 * valid settings, we also verify that unused bits/bytes are zeroed so
13209	 * we can use them later without fear of regressing existing binaries.
13210	 */
13211	if (bcmp(&dof->dofh_ident[DOF_ID_MAG0],
13212	    DOF_MAG_STRING, DOF_MAG_STRLEN) != 0) {
13213		dtrace_dof_error(dof, "DOF magic string mismatch");
13214		return (-1);
13215	}
13216
13217	if (dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_ILP32 &&
13218	    dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_LP64) {
13219		dtrace_dof_error(dof, "DOF has invalid data model");
13220		return (-1);
13221	}
13222
13223	if (dof->dofh_ident[DOF_ID_ENCODING] != DOF_ENCODE_NATIVE) {
13224		dtrace_dof_error(dof, "DOF encoding mismatch");
13225		return (-1);
13226	}
13227
13228#if !defined(__APPLE__)
13229	if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
13230	    dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_2) {
13231		dtrace_dof_error(dof, "DOF version mismatch");
13232		return (-1);
13233	}
13234#else
13235	/*
13236	 * We only support DOF_VERSION_3 for now.
13237	 */
13238	if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_3) {
13239		dtrace_dof_error(dof, "DOF version mismatch");
13240		return (-1);
13241	}
13242#endif
13243
13244	if (dof->dofh_ident[DOF_ID_DIFVERS] != DIF_VERSION_2) {
13245		dtrace_dof_error(dof, "DOF uses unsupported instruction set");
13246		return (-1);
13247	}
13248
13249	if (dof->dofh_ident[DOF_ID_DIFIREG] > DIF_DIR_NREGS) {
13250		dtrace_dof_error(dof, "DOF uses too many integer registers");
13251		return (-1);
13252	}
13253
13254	if (dof->dofh_ident[DOF_ID_DIFTREG] > DIF_DTR_NREGS) {
13255		dtrace_dof_error(dof, "DOF uses too many tuple registers");
13256		return (-1);
13257	}
13258
13259	for (i = DOF_ID_PAD; i < DOF_ID_SIZE; i++) {
13260		if (dof->dofh_ident[i] != 0) {
13261			dtrace_dof_error(dof, "DOF has invalid ident byte set");
13262			return (-1);
13263		}
13264	}
13265
13266	if (dof->dofh_flags & ~DOF_FL_VALID) {
13267		dtrace_dof_error(dof, "DOF has invalid flag bits set");
13268		return (-1);
13269	}
13270
13271	if (dof->dofh_secsize == 0) {
13272		dtrace_dof_error(dof, "zero section header size");
13273		return (-1);
13274	}
13275
13276	/*
13277	 * Check that the section headers don't exceed the amount of DOF
13278	 * data.  Note that we cast the section size and number of sections
13279	 * to uint64_t's to prevent possible overflow in the multiplication.
13280	 */
13281	seclen = (uint64_t)dof->dofh_secnum * (uint64_t)dof->dofh_secsize;
13282
13283	if (dof->dofh_secoff > len || seclen > len ||
13284	    dof->dofh_secoff + seclen > len) {
13285		dtrace_dof_error(dof, "truncated section headers");
13286		return (-1);
13287	}
13288
13289	if (!IS_P2ALIGNED(dof->dofh_secoff, sizeof (uint64_t))) {
13290		dtrace_dof_error(dof, "misaligned section headers");
13291		return (-1);
13292	}
13293
13294	if (!IS_P2ALIGNED(dof->dofh_secsize, sizeof (uint64_t))) {
13295		dtrace_dof_error(dof, "misaligned section size");
13296		return (-1);
13297	}
13298
13299	/*
13300	 * Take an initial pass through the section headers to be sure that
13301	 * the headers don't have stray offsets.  If the 'noprobes' flag is
13302	 * set, do not permit sections relating to providers, probes, or args.
13303	 */
13304	for (i = 0; i < dof->dofh_secnum; i++) {
13305		dof_sec_t *sec = (dof_sec_t *)(daddr +
13306		    (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
13307
13308		if (noprobes) {
13309			switch (sec->dofs_type) {
13310			case DOF_SECT_PROVIDER:
13311			case DOF_SECT_PROBES:
13312			case DOF_SECT_PRARGS:
13313			case DOF_SECT_PROFFS:
13314				dtrace_dof_error(dof, "illegal sections "
13315				    "for enabling");
13316				return (-1);
13317			}
13318		}
13319
13320		if (!(sec->dofs_flags & DOF_SECF_LOAD))
13321			continue; /* just ignore non-loadable sections */
13322
13323		if (sec->dofs_align & (sec->dofs_align - 1)) {
13324			dtrace_dof_error(dof, "bad section alignment");
13325			return (-1);
13326		}
13327
13328		if (sec->dofs_offset & (sec->dofs_align - 1)) {
13329			dtrace_dof_error(dof, "misaligned section");
13330			return (-1);
13331		}
13332
13333		if (sec->dofs_offset > len || sec->dofs_size > len ||
13334		    sec->dofs_offset + sec->dofs_size > len) {
13335			dtrace_dof_error(dof, "corrupt section header");
13336			return (-1);
13337		}
13338
13339		if (sec->dofs_type == DOF_SECT_STRTAB && *((char *)daddr +
13340		    sec->dofs_offset + sec->dofs_size - 1) != '\0') {
13341			dtrace_dof_error(dof, "non-terminating string table");
13342			return (-1);
13343		}
13344	}
13345
13346#if !defined(__APPLE__)
13347	/*
13348	 * Take a second pass through the sections and locate and perform any
13349	 * relocations that are present.  We do this after the first pass to
13350	 * be sure that all sections have had their headers validated.
13351	 */
13352	for (i = 0; i < dof->dofh_secnum; i++) {
13353		dof_sec_t *sec = (dof_sec_t *)(daddr +
13354		    (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
13355
13356		if (!(sec->dofs_flags & DOF_SECF_LOAD))
13357			continue; /* skip sections that are not loadable */
13358
13359		switch (sec->dofs_type) {
13360		case DOF_SECT_URELHDR:
13361			if (dtrace_dof_relocate(dof, sec, ubase) != 0)
13362				return (-1);
13363			break;
13364		}
13365	}
13366#else
13367	/*
13368	 * APPLE NOTE: We have no relocation to perform. All dof values are
13369	 * relative offsets.
13370	 */
13371#endif /* __APPLE__ */
13372
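	/*
	 * With the headers validated (and any relocations applied), gather
	 * the ECB descriptions into an enabling, creating one if the caller
	 * didn't supply it.
	 */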
13373	if ((enab = *enabp) == NULL)
13374		enab = *enabp = dtrace_enabling_create(vstate);
13375
13376	for (i = 0; i < dof->dofh_secnum; i++) {
13377		dof_sec_t *sec = (dof_sec_t *)(daddr +
13378		    (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
13379
13380		if (sec->dofs_type != DOF_SECT_ECBDESC)
13381			continue;
13382
13383#if !defined(__APPLE__)
13384		if ((ep = dtrace_dof_ecbdesc(dof, sec, vstate, cr)) == NULL) {
13385			dtrace_enabling_destroy(enab);
13386			*enabp = NULL;
13387			return (-1);
13388		}
13389#else
13390		/* Note: Defend against gcc 4.0 botch on x86 (not all paths out of
13391		 * inlined dtrace_dof_ecbdesc are checked for the NULL return value.) */
13392		ep = dtrace_dof_ecbdesc(dof, sec, vstate, cr);
13393		if (ep == NULL) {
13394			dtrace_enabling_destroy(enab);
13395			*enabp = NULL;
13396			return (-1);
13397		}
13398#endif /* __APPLE__ */
13399
13400		dtrace_enabling_add(enab, ep);
13401	}
13402
13403	return (0);
13404}
13405
13406/*
13407 * Process DOF for any options.  This routine assumes that the DOF has been
13408 * at least processed by dtrace_dof_slurp().
13409 */
13410static int
13411dtrace_dof_options(dof_hdr_t *dof, dtrace_state_t *state)
13412{
13413#if !defined(__APPLE__) /* Quiet compiler warnings */
13414	int i, rval;
13415#else
13416	uint_t i;
13417	int rval;
13418#endif /* __APPLE__ */
13419	uint32_t entsize;
13420	size_t offs;
13421	dof_optdesc_t *desc;
13422
13423	for (i = 0; i < dof->dofh_secnum; i++) {
13424		dof_sec_t *sec = (dof_sec_t *)((uintptr_t)dof +
13425		    (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
13426
13427		if (sec->dofs_type != DOF_SECT_OPTDESC)
13428			continue;
13429
13430		if (sec->dofs_align != sizeof (uint64_t)) {
13431			dtrace_dof_error(dof, "bad alignment in "
13432			    "option description");
13433			return (EINVAL);
13434		}
13435
13436		if ((entsize = sec->dofs_entsize) == 0) {
13437			dtrace_dof_error(dof, "zeroed option entry size");
13438			return (EINVAL);
13439		}
13440
13441		if (entsize < sizeof (dof_optdesc_t)) {
13442			dtrace_dof_error(dof, "bad option entry size");
13443			return (EINVAL);
13444		}
13445
13446		for (offs = 0; offs < sec->dofs_size; offs += entsize) {
13447			desc = (dof_optdesc_t *)((uintptr_t)dof +
13448			    (uintptr_t)sec->dofs_offset + offs);
13449
13450			if (desc->dofo_strtab != DOF_SECIDX_NONE) {
13451				dtrace_dof_error(dof, "non-zero option string");
13452				return (EINVAL);
13453			}
13454
13455#if !defined(__APPLE__) /* Quiet compiler warnings */
13456			if (desc->dofo_value == DTRACEOPT_UNSET) {
13457#else
13458			if (desc->dofo_value == (uint64_t)DTRACEOPT_UNSET) {
13459#endif /* __APPLE__ */
13460				dtrace_dof_error(dof, "unset option");
13461				return (EINVAL);
13462			}
13463
13464			if ((rval = dtrace_state_option(state,
13465			    desc->dofo_option, desc->dofo_value)) != 0) {
13466				dtrace_dof_error(dof, "rejected option");
13467				return (rval);
13468			}
13469		}
13470	}
13471
13472	return (0);
13473}
13474
13475/*
13476 * DTrace Consumer State Functions
13477 */
13478#if defined(__APPLE__) /* Quiet compiler warning. */
13479static
13480#endif /* __APPLE__ */
13481int
13482dtrace_dstate_init(dtrace_dstate_t *dstate, size_t size)
13483{
13484	size_t hashsize, maxper, min_size, chunksize = dstate->dtds_chunksize;
13485	void *base;
13486	uintptr_t limit;
13487	dtrace_dynvar_t *dvar, *next, *start;
13488#if !defined(__APPLE__) /* Quiet compiler warning */
13489	int i;
13490#else
13491	size_t i;
13492#endif /* __APPLE__ */
13493
13494	lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
13495	ASSERT(dstate->dtds_base == NULL && dstate->dtds_percpu == NULL);
13496
13497	bzero(dstate, sizeof (dtrace_dstate_t));
13498
13499	if ((dstate->dtds_chunksize = chunksize) == 0)
13500		dstate->dtds_chunksize = DTRACE_DYNVAR_CHUNKSIZE;
13501
13502	if (size < (min_size = dstate->dtds_chunksize + sizeof (dtrace_dynhash_t)))
13503		size = min_size;
13504
13505	if ((base = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
13506		return (ENOMEM);
13507
13508	dstate->dtds_size = size;
13509	dstate->dtds_base = base;
13510	dstate->dtds_percpu = kmem_cache_alloc(dtrace_state_cache, KM_SLEEP);
13511	bzero(dstate->dtds_percpu, (int)NCPU * sizeof (dtrace_dstate_percpu_t));
13512
13513	hashsize = size / (dstate->dtds_chunksize + sizeof (dtrace_dynhash_t));
13514
13515	if (hashsize != 1 && (hashsize & 1))
13516		hashsize--;
13517
13518	dstate->dtds_hashsize = hashsize;
13519	dstate->dtds_hash = dstate->dtds_base;
13520
13521	/*
13522	 * Set all of our hash buckets to point to the single sink, and (if
13523	 * it hasn't already been set), set the sink's hash value to be the
13524	 * sink sentinel value.  The sink is needed for dynamic variable
13525	 * lookups to know that they have iterated over an entire, valid hash
13526	 * chain.
13527	 */
13528	for (i = 0; i < hashsize; i++)
13529		dstate->dtds_hash[i].dtdh_chain = &dtrace_dynhash_sink;
13530
13531	if (dtrace_dynhash_sink.dtdv_hashval != DTRACE_DYNHASH_SINK)
13532		dtrace_dynhash_sink.dtdv_hashval = DTRACE_DYNHASH_SINK;
13533
13534	/*
13535	 * Determine number of active CPUs.  Divide free list evenly among
13536	 * active CPUs.
13537	 */
13538	start = (dtrace_dynvar_t *)
13539	    ((uintptr_t)base + hashsize * sizeof (dtrace_dynhash_t));
13540	limit = (uintptr_t)base + size;
13541
13542	maxper = (limit - (uintptr_t)start) / (int)NCPU;
13543	maxper = (maxper / dstate->dtds_chunksize) * dstate->dtds_chunksize;
13544
13545	for (i = 0; i < NCPU; i++) {
13546		dstate->dtds_percpu[i].dtdsc_free = dvar = start;
13547
13548		/*
13549		 * If we don't even have enough chunks to make it once through
13550		 * NCPUs, we're just going to allocate everything to the first
13551		 * CPU.  And if we're on the last CPU, we're going to allocate
13552		 * whatever is left over.  In either case, we set the limit to
13553		 * be the limit of the dynamic variable space.
13554		 */
13555		if (maxper == 0 || i == NCPU - 1) {
13556			limit = (uintptr_t)base + size;
13557			start = NULL;
13558		} else {
13559			limit = (uintptr_t)start + maxper;
13560			start = (dtrace_dynvar_t *)limit;
13561		}
13562
13563		ASSERT(limit <= (uintptr_t)base + size);
13564
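		/*
		 * Chain this CPU's chunks together to form its free list,
		 * stopping once the next chunk would extend past the limit.
		 */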
13565		for (;;) {
13566			next = (dtrace_dynvar_t *)((uintptr_t)dvar +
13567			    dstate->dtds_chunksize);
13568
13569			if ((uintptr_t)next + dstate->dtds_chunksize >= limit)
13570				break;
13571
13572			dvar->dtdv_next = next;
13573			dvar = next;
13574		}
13575
13576		if (maxper == 0)
13577			break;
13578	}
13579
13580	return (0);
13581}
13582
13583#if defined(__APPLE__) /* Quiet compiler warning. */
13584static
13585#endif /* __APPLE__ */
13586void
13587dtrace_dstate_fini(dtrace_dstate_t *dstate)
13588{
13589	lck_mtx_assert(&cpu_lock, LCK_MTX_ASSERT_OWNED);
13590
13591	if (dstate->dtds_base == NULL)
13592		return;
13593
13594	kmem_free(dstate->dtds_base, dstate->dtds_size);
13595	kmem_cache_free(dtrace_state_cache, dstate->dtds_percpu);
13596}
13597
13598static void
13599dtrace_vstate_fini(dtrace_vstate_t *vstate)
13600{
13601	/*
13602	 * Logical XOR, where are you?
13603	 */
13604	ASSERT((vstate->dtvs_nglobals == 0) ^ (vstate->dtvs_globals != NULL));
13605
13606	if (vstate->dtvs_nglobals > 0) {
13607		kmem_free(vstate->dtvs_globals, vstate->dtvs_nglobals *
13608		    sizeof (dtrace_statvar_t *));
13609	}
13610
13611	if (vstate->dtvs_ntlocals > 0) {
13612		kmem_free(vstate->dtvs_tlocals, vstate->dtvs_ntlocals *
13613		    sizeof (dtrace_difv_t));
13614	}
13615
13616	ASSERT((vstate->dtvs_nlocals == 0) ^ (vstate->dtvs_locals != NULL));
13617
13618	if (vstate->dtvs_nlocals > 0) {
13619		kmem_free(vstate->dtvs_locals, vstate->dtvs_nlocals *
13620		    sizeof (dtrace_statvar_t *));
13621	}
13622}
13623
13624static void
13625dtrace_state_clean(dtrace_state_t *state)
13626{
13627	if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE)
13628		return;
13629
13630	dtrace_dynvar_clean(&state->dts_vstate.dtvs_dynvars);
13631	dtrace_speculation_clean(state);
13632}
13633
13634static void
13635dtrace_state_deadman(dtrace_state_t *state)
13636{
13637	hrtime_t now;
13638
13639	dtrace_sync();
13640
13641	now = dtrace_gethrtime();
13642
13643	if (state != dtrace_anon.dta_state &&
13644	    now - state->dts_laststatus >= dtrace_deadman_user)
13645		return;
13646
13647	/*
13648	 * We must be sure that dts_alive never appears to be less than the
13649	 * value upon entry to dtrace_state_deadman(), and because we lack a
13650	 * dtrace_cas64(), we cannot store to it atomically.  We thus instead
13651	 * store INT64_MAX to it, followed by a memory barrier, followed by
13652	 * the new value.  This assures that dts_alive never appears to be
13653	 * less than its true value, regardless of the order in which the
13654	 * stores to the underlying storage are issued.
13655	 */
13656	state->dts_alive = INT64_MAX;
13657	dtrace_membar_producer();
13658	state->dts_alive = now;
13659}
13660
13661#if !defined(__APPLE__)
13662dtrace_state_t *
13663dtrace_state_create(dev_t *devp, cred_t *cr)
13664#else
13665static int
13666dtrace_state_create(dev_t *devp, cred_t *cr, dtrace_state_t **new_state)
13667#endif /* __APPLE__ */
13668{
13669	minor_t minor;
13670	major_t major;
13671	char c[30];
13672	dtrace_state_t *state;
13673	dtrace_optval_t *opt;
13674	int bufsize = (int)NCPU * sizeof (dtrace_buffer_t), i;
13675
13676	lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
13677	lck_mtx_assert(&cpu_lock, LCK_MTX_ASSERT_OWNED);
13678
13679#if !defined(__APPLE__)
13680	minor = (minor_t)(uintptr_t)vmem_alloc(dtrace_minor, 1,
13681	    VM_BESTFIT | VM_SLEEP);
13682
13683	if (ddi_soft_state_zalloc(dtrace_softstate, minor) != DDI_SUCCESS) {
13684		vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
13685		return (NULL);
13686	}
13687#else
13688	/* Cause restart */
13689	*new_state = NULL;
13690
13691	/*
13692	 * Darwin's DEVFS layer acquired the minor number for this "device" when it called
13693	 * dtrace_devfs_clone_func(). At that time, dtrace_devfs_clone_func() proposed a minor number
13694	 * (next unused according to vmem_alloc()) and then immediately put the number back in play
13695	 * (by calling vmem_free()). That minor number is now being used for an open, so we commit it
13696	 * to use here. The following vmem_alloc() must deliver that same minor number. FIXME.
13697	 */
13698
13699	minor = (minor_t)(uintptr_t)vmem_alloc(dtrace_minor, 1,
13700	    VM_BESTFIT | VM_SLEEP);
13701
13702	if (NULL != devp) {
13703		ASSERT(getminor(*devp) == minor);
13704		if (getminor(*devp) != minor) {
13705			printf("dtrace_open: couldn't re-acquire vended minor number %d. Instead got %d\n",
13706					getminor(*devp), minor);
13707			vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
13708			return (ERESTART);	/* can't reacquire */
13709		}
13710	} else {
13711		/* NULL==devp iff "Anonymous state" (see dtrace_anon_property),
13712		 * so just vend the minor device number here de novo since no "open" has occurred. */
13713	}
13714
13715	if (ddi_soft_state_zalloc(dtrace_softstate, minor) != DDI_SUCCESS) {
13716		vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
13717		return (EAGAIN);	/* temporary resource shortage */
13718	}
13719
13720#endif /* __APPLE__ */
13721
13722	state = ddi_get_soft_state(dtrace_softstate, minor);
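	/*
	 * Enabled probe IDs for this state are handed out starting just
	 * past DTRACE_EPIDNONE.
	 */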
13723	state->dts_epid = DTRACE_EPIDNONE + 1;
13724
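	/*
	 * Each state gets its own arena from which aggregation IDs are
	 * allocated, starting at 1.
	 */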
13725	(void) snprintf(c, sizeof (c), "dtrace_aggid_%d", minor);
13726	state->dts_aggid_arena = vmem_create(c, (void *)1, UINT32_MAX, 1,
13727	    NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
13728
13729	if (devp != NULL) {
13730		major = getemajor(*devp);
13731	} else {
13732		major = ddi_driver_major(dtrace_devi);
13733	}
13734
13735	state->dts_dev = makedevice(major, minor);
13736
13737	if (devp != NULL)
13738		*devp = state->dts_dev;
13739
13740	/*
13741	 * We allocate NCPU buffers.  On the one hand, this can be quite
13742	 * a bit of memory per instance (nearly 36K on a Starcat).  On the
13743	 * other hand, it saves an additional memory reference in the probe
13744	 * path.
13745	 */
13746	state->dts_buffer = kmem_zalloc(bufsize, KM_SLEEP);
13747	state->dts_aggbuffer = kmem_zalloc(bufsize, KM_SLEEP);
13748	state->dts_cleaner = CYCLIC_NONE;
13749	state->dts_deadman = CYCLIC_NONE;
13750	state->dts_vstate.dtvs_state = state;
13751
13752	for (i = 0; i < DTRACEOPT_MAX; i++)
13753		state->dts_options[i] = DTRACEOPT_UNSET;
13754
13755	/*
13756	 * Set the default options.
13757	 */
13758	opt = state->dts_options;
13759	opt[DTRACEOPT_BUFPOLICY] = DTRACEOPT_BUFPOLICY_SWITCH;
13760	opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_AUTO;
13761	opt[DTRACEOPT_NSPEC] = dtrace_nspec_default;
13762	opt[DTRACEOPT_SPECSIZE] = dtrace_specsize_default;
13763	opt[DTRACEOPT_CPU] = (dtrace_optval_t)DTRACE_CPUALL;
13764	opt[DTRACEOPT_STRSIZE] = dtrace_strsize_default;
13765	opt[DTRACEOPT_STACKFRAMES] = dtrace_stackframes_default;
13766	opt[DTRACEOPT_USTACKFRAMES] = dtrace_ustackframes_default;
13767	opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_default;
13768	opt[DTRACEOPT_AGGRATE] = dtrace_aggrate_default;
13769	opt[DTRACEOPT_SWITCHRATE] = dtrace_switchrate_default;
13770	opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_default;
13771	opt[DTRACEOPT_JSTACKFRAMES] = dtrace_jstackframes_default;
13772	opt[DTRACEOPT_JSTACKSTRSIZE] = dtrace_jstackstrsize_default;
13773
13774	state->dts_activity = DTRACE_ACTIVITY_INACTIVE;
13775
13776	/*
13777	 * Depending on the user credentials, we set flag bits which alter probe
13778	 * visibility or the amount of destructiveness allowed.  In the case of
13779	 * actual anonymous tracing, or the possession of all privileges, all of
13780	 * the normal checks are bypassed.
13781	 */
13782	if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
13783		state->dts_cred.dcr_visible = DTRACE_CRV_ALL;
13784		state->dts_cred.dcr_action = DTRACE_CRA_ALL;
13785	} else {
13786		/*
13787		 * Set up the credentials for this instantiation.  We take a
13788		 * hold on the credential to prevent it from disappearing on
13789		 * us; this in turn prevents the zone_t referenced by this
13790		 * credential from disappearing.  This means that we can
13791		 * examine the credential and the zone from probe context.
13792		 */
13793		crhold(cr);
13794		state->dts_cred.dcr_cred = cr;
13795
13796		/*
13797		 * CRA_PROC means "we have *some* privilege for dtrace" and
13798		 * unlocks the use of variables like pid, zonename, etc.
13799		 */
13800		if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE) ||
13801		    PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
13802			state->dts_cred.dcr_action |= DTRACE_CRA_PROC;
13803		}
13804
13805		/*
13806		 * dtrace_user allows use of syscall and profile providers.
13807		 * If the user also has proc_owner and/or proc_zone, we
13808		 * extend the scope to include additional visibility and
13809		 * destructive power.
13810		 */
13811		if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE)) {
13812			if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE)) {
13813				state->dts_cred.dcr_visible |=
13814				    DTRACE_CRV_ALLPROC;
13815
13816				state->dts_cred.dcr_action |=
13817				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
13818			}
13819
13820			if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE)) {
13821				state->dts_cred.dcr_visible |=
13822				    DTRACE_CRV_ALLZONE;
13823
13824				state->dts_cred.dcr_action |=
13825				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
13826			}
13827
13828			/*
13829			 * If we have all privs in whatever zone this is,
13830			 * we can do destructive things to processes which
13831			 * have altered credentials.
13832			 */
13833#if !defined(__APPLE__)
13834			if (priv_isequalset(priv_getset(cr, PRIV_EFFECTIVE),
13835			    cr->cr_zone->zone_privset)) {
13836				state->dts_cred.dcr_action |=
13837					DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
13838			}
13839#else
13840			/* Darwin doesn't do zones. */
13841			state->dts_cred.dcr_action |=
13842				DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
13843#endif /* __APPLE__ */
13844		}
13845
13846		/*
13847		 * Holding the dtrace_kernel privilege also implies that
13848		 * the user has the dtrace_user privilege from a visibility
13849		 * perspective.  But without further privileges, some
13850		 * destructive actions are not available.
13851		 */
13852		if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE)) {
13853			/*
13854			 * Make all probes in all zones visible.  However,
13855			 * this doesn't mean that all actions become available
13856			 * to all zones.
13857			 */
13858			state->dts_cred.dcr_visible |= DTRACE_CRV_KERNEL |
13859			    DTRACE_CRV_ALLPROC | DTRACE_CRV_ALLZONE;
13860
13861			state->dts_cred.dcr_action |= DTRACE_CRA_KERNEL |
13862			    DTRACE_CRA_PROC;
13863			/*
13864			 * Holding proc_owner means that destructive actions
13865			 * for *this* zone are allowed.
13866			 */
13867			if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
13868				state->dts_cred.dcr_action |=
13869				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
13870
13871			/*
13872			 * Holding proc_zone means that destructive actions
13873			 * for this user/group ID in all zones are allowed.
13874			 */
13875			if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
13876				state->dts_cred.dcr_action |=
13877				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
13878
13879			/*
13880			 * If we have all privs in whatever zone this is,
13881			 * we can do destructive things to processes which
13882			 * have altered credentials.
13883			 */
13884#if !defined(__APPLE__)
13885			if (priv_isequalset(priv_getset(cr, PRIV_EFFECTIVE),
13886			    cr->cr_zone->zone_privset)) {
13887				state->dts_cred.dcr_action |=
13888				    DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
13889			}
13890#else
13891			/* Darwin doesn't do zones. */
13892			state->dts_cred.dcr_action |=
13893				DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
13894#endif /* __APPLE__ */
13895		}
13896
13897		/*
13898		 * Holding the dtrace_proc privilege gives control over fasttrap
13899		 * and pid providers.  We need to grant wider destructive
13900		 * privileges in the event that the user has proc_owner and/or
13901		 * proc_zone.
13902		 */
13903		if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
13904			if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
13905				state->dts_cred.dcr_action |=
13906				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
13907
13908			if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
13909				state->dts_cred.dcr_action |=
13910				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
13911		}
13912	}
13913
13914#if !defined(__APPLE__)
13915	return (state);
13916#else
13917	*new_state = state;
13918	return (0);  /* Success */
13919#endif /* __APPLE__ */
13920}
13921
13922static int
13923dtrace_state_buffer(dtrace_state_t *state, dtrace_buffer_t *buf, int which)
13924{
13925	dtrace_optval_t *opt = state->dts_options, size;
13926	processorid_t cpu = 0;
13927	int flags = 0, rval;
13928
13929	lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
13930	lck_mtx_assert(&cpu_lock, LCK_MTX_ASSERT_OWNED);
13931	ASSERT(which < DTRACEOPT_MAX);
13932	ASSERT(state->dts_activity == DTRACE_ACTIVITY_INACTIVE ||
13933	    (state == dtrace_anon.dta_state &&
13934	    state->dts_activity == DTRACE_ACTIVITY_ACTIVE));
13935
13936	if (opt[which] == DTRACEOPT_UNSET || opt[which] == 0)
13937		return (0);
13938
13939	if (opt[DTRACEOPT_CPU] != DTRACEOPT_UNSET)
13940		cpu = opt[DTRACEOPT_CPU];
13941
13942	if (which == DTRACEOPT_SPECSIZE)
13943		flags |= DTRACEBUF_NOSWITCH;
13944
13945	if (which == DTRACEOPT_BUFSIZE) {
13946		if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_RING)
13947			flags |= DTRACEBUF_RING;
13948
13949		if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_FILL)
13950			flags |= DTRACEBUF_FILL;
13951
13952		if (state != dtrace_anon.dta_state ||
13953		    state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
13954			flags |= DTRACEBUF_INACTIVE;
13955	}
13956
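	/*
	 * Work down from the requested size: if an allocation fails with
	 * ENOMEM (and the resize policy isn't manual), retry with half the
	 * size until we drop below a single uint64_t.
	 */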
13957#if !defined(__APPLE__) /* Quiet compiler warning */
13958	for (size = opt[which]; size >= sizeof (uint64_t); size >>= 1) {
13959#else
13960	for (size = opt[which]; (size_t)size >= sizeof (uint64_t); size >>= 1) {
13961#endif /* __APPLE__ */
13962		/*
13963		 * The size must be 8-byte aligned.  If the size is not 8-byte
13964		 * aligned, drop it down by the difference.
13965		 */
13966		if (size & (sizeof (uint64_t) - 1))
13967			size -= size & (sizeof (uint64_t) - 1);
13968
13969		if (size < state->dts_reserve) {
13970			/*
13971			 * Buffers must always be large enough to accommodate
13972			 * their prereserved space.  We return E2BIG instead
13973			 * of ENOMEM in this case to allow for user-level
13974			 * software to differentiate the cases.
13975			 */
13976			return (E2BIG);
13977		}
13978
13979		rval = dtrace_buffer_alloc(buf, size, flags, cpu);
13980
13981		if (rval != ENOMEM) {
13982			opt[which] = size;
13983			return (rval);
13984		}
13985
13986		if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
13987			return (rval);
13988	}
13989
13990	return (ENOMEM);
13991}
13992
13993static int
13994dtrace_state_buffers(dtrace_state_t *state)
13995{
13996	dtrace_speculation_t *spec = state->dts_speculations;
13997	int rval, i;
13998
13999	if ((rval = dtrace_state_buffer(state, state->dts_buffer,
14000	    DTRACEOPT_BUFSIZE)) != 0)
14001		return (rval);
14002
14003	if ((rval = dtrace_state_buffer(state, state->dts_aggbuffer,
14004	    DTRACEOPT_AGGSIZE)) != 0)
14005		return (rval);
14006
14007	for (i = 0; i < state->dts_nspeculations; i++) {
14008		if ((rval = dtrace_state_buffer(state,
14009		    spec[i].dtsp_buffer, DTRACEOPT_SPECSIZE)) != 0)
14010			return (rval);
14011	}
14012
14013	return (0);
14014}
14015
14016static void
14017dtrace_state_prereserve(dtrace_state_t *state)
14018{
14019	dtrace_ecb_t *ecb;
14020	dtrace_probe_t *probe;
14021
14022	state->dts_reserve = 0;
14023
14024	if (state->dts_options[DTRACEOPT_BUFPOLICY] != DTRACEOPT_BUFPOLICY_FILL)
14025		return;
14026
14027	/*
14028	 * If our buffer policy is a "fill" buffer policy, we need to set the
14029	 * prereserved space to be the space required by the END probes.
14030	 */
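	/* dtrace_probes[] is indexed by probe ID minus one. */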
14031	probe = dtrace_probes[dtrace_probeid_end - 1];
14032	ASSERT(probe != NULL);
14033
14034	for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
14035		if (ecb->dte_state != state)
14036			continue;
14037
14038		state->dts_reserve += ecb->dte_needed + ecb->dte_alignment;
14039	}
14040}
14041
14042static int
14043dtrace_state_go(dtrace_state_t *state, processorid_t *cpu)
14044{
14045	dtrace_optval_t *opt = state->dts_options, sz, nspec;
14046	dtrace_speculation_t *spec;
14047	dtrace_buffer_t *buf;
14048	cyc_handler_t hdlr;
14049	cyc_time_t when;
14050	int rval = 0, i, bufsize = (int)NCPU * sizeof (dtrace_buffer_t);
14051	dtrace_icookie_t cookie;
14052
14053	lck_mtx_lock(&cpu_lock);
14054	lck_mtx_lock(&dtrace_lock);
14055
14056	if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
14057		rval = EBUSY;
14058		goto out;
14059	}
14060
14061	/*
14062	 * Before we can perform any checks, we must prime all of the
14063	 * retained enablings that correspond to this state.
14064	 */
14065	dtrace_enabling_prime(state);
14066
14067	if (state->dts_destructive && !state->dts_cred.dcr_destructive) {
14068		rval = EACCES;
14069		goto out;
14070	}
14071
14072	dtrace_state_prereserve(state);
14073
14074	/*
14075	 * Now we try to allocate our speculations.
14076	 * We do not automatically resize the number of speculations; if
14077	 * this fails, we will fail the operation.
14078	 */
14079	nspec = opt[DTRACEOPT_NSPEC];
14080	ASSERT(nspec != DTRACEOPT_UNSET);
14081
14082	if (nspec > INT_MAX) {
14083		rval = ENOMEM;
14084		goto out;
14085	}
14086
14087	spec = kmem_zalloc(nspec * sizeof (dtrace_speculation_t), KM_NOSLEEP);
14088
14089	if (spec == NULL) {
14090		rval = ENOMEM;
14091		goto out;
14092	}
14093
14094	state->dts_speculations = spec;
14095	state->dts_nspeculations = (int)nspec;
14096
14097	for (i = 0; i < nspec; i++) {
14098		if ((buf = kmem_zalloc(bufsize, KM_NOSLEEP)) == NULL) {
14099			rval = ENOMEM;
14100			goto err;
14101		}
14102
14103		spec[i].dtsp_buffer = buf;
14104	}
14105
14106	if (opt[DTRACEOPT_GRABANON] != DTRACEOPT_UNSET) {
14107		if (dtrace_anon.dta_state == NULL) {
14108			rval = ENOENT;
14109			goto out;
14110		}
14111
14112		if (state->dts_necbs != 0) {
14113			rval = EALREADY;
14114			goto out;
14115		}
14116
14117		state->dts_anon = dtrace_anon_grab();
14118		ASSERT(state->dts_anon != NULL);
14119		state = state->dts_anon;
14120
14121		/*
14122		 * We want "grabanon" to be set in the grabbed state, so we'll
14123		 * copy that option value from the grabbing state into the
14124		 * grabbed state.
14125		 */
14126		state->dts_options[DTRACEOPT_GRABANON] =
14127		    opt[DTRACEOPT_GRABANON];
14128
14129		*cpu = dtrace_anon.dta_beganon;
14130
14131		/*
14132		 * If the anonymous state is active (as it almost certainly
14133		 * is if the anonymous enabling ultimately matched anything),
14134		 * we don't allow any further option processing -- but we
14135		 * don't return failure.
14136		 */
14137		if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
14138			goto out;
14139	}
14140
14141	if (opt[DTRACEOPT_AGGSIZE] != DTRACEOPT_UNSET &&
14142	    opt[DTRACEOPT_AGGSIZE] != 0) {
14143		if (state->dts_aggregations == NULL) {
14144			/*
14145			 * We're not going to create an aggregation buffer
14146			 * because we don't have any ECBs that contain
14147			 * aggregations -- set this option to 0.
14148			 */
14149			opt[DTRACEOPT_AGGSIZE] = 0;
14150		} else {
14151			/*
14152			 * If we have an aggregation buffer, we must also have
14153			 * a buffer to use as scratch.
14154			 */
14155#if !defined(__APPLE__) /* Quiet compiler warning */
14156			if (opt[DTRACEOPT_BUFSIZE] == DTRACEOPT_UNSET ||
14157			    opt[DTRACEOPT_BUFSIZE] < state->dts_needed) {
14158				opt[DTRACEOPT_BUFSIZE] = state->dts_needed;
14159			}
14160#else
14161			if (opt[DTRACEOPT_BUFSIZE] == DTRACEOPT_UNSET ||
14162			  (size_t)opt[DTRACEOPT_BUFSIZE] < state->dts_needed) {
14163				opt[DTRACEOPT_BUFSIZE] = state->dts_needed;
14164			}
14165#endif /* __APPLE__ */
14166		}
14167	}
14168
14169	if (opt[DTRACEOPT_SPECSIZE] != DTRACEOPT_UNSET &&
14170	    opt[DTRACEOPT_SPECSIZE] != 0) {
14171		if (!state->dts_speculates) {
14172			/*
14173			 * We're not going to create speculation buffers
14174			 * because we don't have any ECBs that actually
14175			 * speculate -- set the speculation size to 0.
14176			 */
14177			opt[DTRACEOPT_SPECSIZE] = 0;
14178		}
14179	}
14180
14181	/*
14182	 * The bare minimum size for any buffer that we're actually going to
14183	 * do anything to is sizeof (uint64_t).
14184	 */
14185	sz = sizeof (uint64_t);
14186
14187	if ((state->dts_needed != 0 && opt[DTRACEOPT_BUFSIZE] < sz) ||
14188	    (state->dts_speculates && opt[DTRACEOPT_SPECSIZE] < sz) ||
14189	    (state->dts_aggregations != NULL && opt[DTRACEOPT_AGGSIZE] < sz)) {
14190		/*
14191		 * A buffer size has been explicitly set to 0 (or to a size
14192		 * that will be adjusted to 0) and we need the space -- we
14193		 * need to return failure.  We return ENOSPC to differentiate
14194		 * it from failing to allocate a buffer due to failure to meet
14195		 * the reserve (for which we return E2BIG).
14196		 */
14197		rval = ENOSPC;
14198		goto out;
14199	}
14200
14201	if ((rval = dtrace_state_buffers(state)) != 0)
14202		goto err;
14203
14204	if ((sz = opt[DTRACEOPT_DYNVARSIZE]) == DTRACEOPT_UNSET)
14205		sz = dtrace_dstate_defsize;
14206
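	/*
	 * As with the principal buffers, keep halving the dynamic variable
	 * size on failure unless the resize policy has been set to manual.
	 */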
14207	do {
14208		rval = dtrace_dstate_init(&state->dts_vstate.dtvs_dynvars, sz);
14209
14210		if (rval == 0)
14211			break;
14212
14213		if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
14214			goto err;
14215	} while (sz >>= 1);
14216
14217	opt[DTRACEOPT_DYNVARSIZE] = sz;
14218
14219	if (rval != 0)
14220		goto err;
14221
14222	if (opt[DTRACEOPT_STATUSRATE] > dtrace_statusrate_max)
14223		opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_max;
14224
14225	if (opt[DTRACEOPT_CLEANRATE] == 0)
14226		opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
14227
14228	if (opt[DTRACEOPT_CLEANRATE] < dtrace_cleanrate_min)
14229		opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_min;
14230
14231	if (opt[DTRACEOPT_CLEANRATE] > dtrace_cleanrate_max)
14232		opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
14233
14234	hdlr.cyh_func = (cyc_func_t)dtrace_state_clean;
14235	hdlr.cyh_arg = state;
14236	hdlr.cyh_level = CY_LOW_LEVEL;
14237
14238	when.cyt_when = 0;
14239	when.cyt_interval = opt[DTRACEOPT_CLEANRATE];
14240
14241	state->dts_cleaner = cyclic_add(&hdlr, &when);
14242
14243	hdlr.cyh_func = (cyc_func_t)dtrace_state_deadman;
14244	hdlr.cyh_arg = state;
14245	hdlr.cyh_level = CY_LOW_LEVEL;
14246
14247	when.cyt_when = 0;
14248	when.cyt_interval = dtrace_deadman_interval;
14249
14250	state->dts_alive = state->dts_laststatus = dtrace_gethrtime();
14251	state->dts_deadman = cyclic_add(&hdlr, &when);
14252
14253	state->dts_activity = DTRACE_ACTIVITY_WARMUP;
14254
14255	/*
14256	 * Now it's time to actually fire the BEGIN probe.  We need to disable
14257	 * interrupts here both to record the CPU on which we fired the BEGIN
14258	 * probe (the data from this CPU will be processed first at user
14259	 * level) and to manually activate the buffer for this CPU.
14260	 */
14261	cookie = dtrace_interrupt_disable();
14262	*cpu = CPU->cpu_id;
14263	ASSERT(state->dts_buffer[*cpu].dtb_flags & DTRACEBUF_INACTIVE);
14264	state->dts_buffer[*cpu].dtb_flags &= ~DTRACEBUF_INACTIVE;
14265
14266	dtrace_probe(dtrace_probeid_begin,
14267	    (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
14268	dtrace_interrupt_enable(cookie);
14269	/*
14270	 * We may have had an exit action from a BEGIN probe; only change our
14271	 * state to ACTIVE if we're still in WARMUP.
14272	 */
14273	ASSERT(state->dts_activity == DTRACE_ACTIVITY_WARMUP ||
14274	    state->dts_activity == DTRACE_ACTIVITY_DRAINING);
14275
14276	if (state->dts_activity == DTRACE_ACTIVITY_WARMUP)
14277		state->dts_activity = DTRACE_ACTIVITY_ACTIVE;
14278
14279	/*
14280	 * Regardless of whether we're now in ACTIVE or DRAINING, we
14281	 * want each CPU to transition its principal buffer out of the
14282	 * INACTIVE state.  Doing this assures that no CPU will suddenly begin
14283	 * processing an ECB halfway down a probe's ECB chain; all CPUs will
14284	 * atomically transition from processing none of a state's ECBs to
14285	 * processing all of them.
14286	 */
14287	dtrace_xcall(DTRACE_CPUALL,
14288	    (dtrace_xcall_t)dtrace_buffer_activate, state);
14289	goto out;
14290
14291err:
14292	dtrace_buffer_free(state->dts_buffer);
14293	dtrace_buffer_free(state->dts_aggbuffer);
14294
14295	if ((nspec = state->dts_nspeculations) == 0) {
14296		ASSERT(state->dts_speculations == NULL);
14297		goto out;
14298	}
14299
14300	spec = state->dts_speculations;
14301	ASSERT(spec != NULL);
14302
14303	for (i = 0; i < state->dts_nspeculations; i++) {
14304		if ((buf = spec[i].dtsp_buffer) == NULL)
14305			break;
14306
14307		dtrace_buffer_free(buf);
14308		kmem_free(buf, bufsize);
14309	}
14310
14311	kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
14312	state->dts_nspeculations = 0;
14313	state->dts_speculations = NULL;
14314
14315out:
14316	lck_mtx_unlock(&dtrace_lock);
14317	lck_mtx_unlock(&cpu_lock);
14318
14319	return (rval);
14320}
14321
14322static int
14323dtrace_state_stop(dtrace_state_t *state, processorid_t *cpu)
14324{
14325	dtrace_icookie_t cookie;
14326
14327	lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14328
14329	if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE &&
14330	    state->dts_activity != DTRACE_ACTIVITY_DRAINING)
14331		return (EINVAL);
14332
14333	/*
14334	 * We'll set the activity to DTRACE_ACTIVITY_DRAINING, and issue a sync
14335	 * to be sure that every CPU has seen it.  See below for the details
14336	 * on why this is done.
14337	 */
14338	state->dts_activity = DTRACE_ACTIVITY_DRAINING;
14339	dtrace_sync();
14340
14341	/*
14342	 * By this point, it is impossible for any CPU to be still processing
14343	 * with DTRACE_ACTIVITY_ACTIVE.  We can thus set our activity to
14344	 * DTRACE_ACTIVITY_COOLDOWN and know that we're not racing with any
14345	 * other CPU in dtrace_buffer_reserve().  This allows dtrace_probe()
14346	 * and callees to know that the activity is DTRACE_ACTIVITY_COOLDOWN
14347	 * iff we're in the END probe.
14348	 */
14349	state->dts_activity = DTRACE_ACTIVITY_COOLDOWN;
14350	dtrace_sync();
14351	ASSERT(state->dts_activity == DTRACE_ACTIVITY_COOLDOWN);
14352
14353	/*
14354	 * Finally, we can release the reserve and call the END probe.  We
14355	 * disable interrupts across calling the END probe to allow us to
14356	 * return the CPU on which we actually called the END probe.  This
14357	 * allows user-land to be sure that this CPU's principal buffer is
14358	 * processed last.
14359	 */
14360	state->dts_reserve = 0;
14361
14362	cookie = dtrace_interrupt_disable();
14363	*cpu = CPU->cpu_id;
14364	dtrace_probe(dtrace_probeid_end,
14365	    (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
14366	dtrace_interrupt_enable(cookie);
14367
14368	state->dts_activity = DTRACE_ACTIVITY_STOPPED;
14369	dtrace_sync();
14370
14371	return (0);
14372}
14373
14374static int
14375dtrace_state_option(dtrace_state_t *state, dtrace_optid_t option,
14376    dtrace_optval_t val)
14377{
14378	lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14379
14380	if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
14381		return (EBUSY);
14382
14383	if (option >= DTRACEOPT_MAX)
14384		return (EINVAL);
14385
14386	if (option != DTRACEOPT_CPU && val < 0)
14387		return (EINVAL);
14388
14389	switch (option) {
14390	case DTRACEOPT_DESTRUCTIVE:
14391		if (dtrace_destructive_disallow)
14392			return (EACCES);
14393
14394		state->dts_cred.dcr_destructive = 1;
14395		break;
14396
14397	case DTRACEOPT_BUFSIZE:
14398	case DTRACEOPT_DYNVARSIZE:
14399	case DTRACEOPT_AGGSIZE:
14400	case DTRACEOPT_SPECSIZE:
14401	case DTRACEOPT_STRSIZE:
14402		if (val < 0)
14403			return (EINVAL);
14404
14405		if (val >= LONG_MAX) {
14406			/*
14407			 * If this is an otherwise negative value, set it to
14408			 * the highest multiple of 128m less than LONG_MAX.
14409			 * Technically, we're adjusting the size without
14410			 * regard to the buffer resizing policy, but in fact,
14411			 * this has no effect -- if we set the buffer size to
14412			 * ~LONG_MAX and the buffer policy is ultimately set to
14413			 * be "manual", the buffer allocation is guaranteed to
14414			 * fail, if only because the allocation requires two
14415			 * buffers.  (We set the size to the highest
14416			 * multiple of 128m because it ensures that the size
14417			 * will remain a multiple of a megabyte when
14418			 * repeatedly halved -- all the way down to 15m.)
14419			 */
14420			val = LONG_MAX - (1 << 27) + 1;
14421		}
14422	}
14423
14424	state->dts_options[option] = val;
14425
14426	return (0);
14427}
14428
14429static void
14430dtrace_state_destroy(dtrace_state_t *state)
14431{
14432	dtrace_ecb_t *ecb;
14433	dtrace_vstate_t *vstate = &state->dts_vstate;
14434	minor_t minor = getminor(state->dts_dev);
14435	int i, bufsize = (int)NCPU * sizeof (dtrace_buffer_t);
14436	dtrace_speculation_t *spec = state->dts_speculations;
14437	int nspec = state->dts_nspeculations;
14438	uint32_t match;
14439
14440	lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14441	lck_mtx_assert(&cpu_lock, LCK_MTX_ASSERT_OWNED);
14442
14443	/*
14444	 * First, retract any retained enablings for this state.
14445	 */
14446	dtrace_enabling_retract(state);
14447	ASSERT(state->dts_nretained == 0);
14448
14449	if (state->dts_activity == DTRACE_ACTIVITY_ACTIVE ||
14450	    state->dts_activity == DTRACE_ACTIVITY_DRAINING) {
14451		/*
14452		 * We have managed to come into dtrace_state_destroy() on a
14453		 * hot enabling -- almost certainly because of a disorderly
14454		 * shutdown of a consumer.  (That is, a consumer that is
14455		 * exiting without having called dtrace_stop().) In this case,
14456		 * we're going to set our activity to be KILLED, and then
14457		 * issue a sync to be sure that everyone is out of probe
14458		 * context before we start blowing away ECBs.
14459		 */
14460		state->dts_activity = DTRACE_ACTIVITY_KILLED;
14461		dtrace_sync();
14462	}
14463
14464	/*
14465	 * Release the credential hold we took in dtrace_state_create().
14466	 */
14467	if (state->dts_cred.dcr_cred != NULL)
14468		crfree(state->dts_cred.dcr_cred);
14469
14470	/*
14471	 * Now we can safely disable and destroy any enabled probes.  Because
14472	 * any DTRACE_PRIV_KERNEL probes may actually be slowing our progress
14473	 * (especially if they're all enabled), we take two passes through the
14474	 * ECBs:  in the first, we disable just DTRACE_PRIV_KERNEL probes, and
14475	 * in the second we disable whatever is left over.
14476	 */
14477	for (match = DTRACE_PRIV_KERNEL; ; match = 0) {
14478		for (i = 0; i < state->dts_necbs; i++) {
14479			if ((ecb = state->dts_ecbs[i]) == NULL)
14480				continue;
14481
14482			if (match && ecb->dte_probe != NULL) {
14483				dtrace_probe_t *probe = ecb->dte_probe;
14484				dtrace_provider_t *prov = probe->dtpr_provider;
14485
14486				if (!(prov->dtpv_priv.dtpp_flags & match))
14487					continue;
14488			}
14489
14490			dtrace_ecb_disable(ecb);
14491			dtrace_ecb_destroy(ecb);
14492		}
14493
14494		if (!match)
14495			break;
14496	}
14497
14498	/*
14499	 * Before we free the buffers, perform one more sync to assure that
14500	 * every CPU is out of probe context.
14501	 */
14502	dtrace_sync();
14503
14504	dtrace_buffer_free(state->dts_buffer);
14505	dtrace_buffer_free(state->dts_aggbuffer);
14506
14507	for (i = 0; i < nspec; i++)
14508		dtrace_buffer_free(spec[i].dtsp_buffer);
14509
14510	if (state->dts_cleaner != CYCLIC_NONE)
14511		cyclic_remove(state->dts_cleaner);
14512
14513	if (state->dts_deadman != CYCLIC_NONE)
14514		cyclic_remove(state->dts_deadman);
14515
14516	dtrace_dstate_fini(&vstate->dtvs_dynvars);
14517	dtrace_vstate_fini(vstate);
14518	kmem_free(state->dts_ecbs, state->dts_necbs * sizeof (dtrace_ecb_t *));
14519
14520	if (state->dts_aggregations != NULL) {
14521#if DEBUG
14522		for (i = 0; i < state->dts_naggregations; i++)
14523			ASSERT(state->dts_aggregations[i] == NULL);
14524#endif
14525		ASSERT(state->dts_naggregations > 0);
14526		kmem_free(state->dts_aggregations,
14527		    state->dts_naggregations * sizeof (dtrace_aggregation_t *));
14528	}
14529
14530	kmem_free(state->dts_buffer, bufsize);
14531	kmem_free(state->dts_aggbuffer, bufsize);
14532
14533	for (i = 0; i < nspec; i++)
14534		kmem_free(spec[i].dtsp_buffer, bufsize);
14535
14536	kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
14537
14538	dtrace_format_destroy(state);
14539
14540	vmem_destroy(state->dts_aggid_arena);
14541	ddi_soft_state_free(dtrace_softstate, minor);
14542	vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
14543}
14544
14545/*
14546 * DTrace Anonymous Enabling Functions
14547 */
14548static dtrace_state_t *
14549dtrace_anon_grab(void)
14550{
14551	dtrace_state_t *state;
14552
14553	lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14554
14555	if ((state = dtrace_anon.dta_state) == NULL) {
14556		ASSERT(dtrace_anon.dta_enabling == NULL);
14557		return (NULL);
14558	}
14559
14560	ASSERT(dtrace_anon.dta_enabling != NULL);
14561	ASSERT(dtrace_retained != NULL);
14562
14563	dtrace_enabling_destroy(dtrace_anon.dta_enabling);
14564	dtrace_anon.dta_enabling = NULL;
14565	dtrace_anon.dta_state = NULL;
14566
14567	return (state);
14568}
14569
14570static void
14571dtrace_anon_property(void)
14572{
14573	int i, rv;
14574	dtrace_state_t *state;
14575	dof_hdr_t *dof;
14576	char c[32];		/* enough for "dof-data-" + digits */
14577
14578	lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14579	lck_mtx_assert(&cpu_lock, LCK_MTX_ASSERT_OWNED);
14580
14581	for (i = 0; ; i++) {
14582		(void) snprintf(c, sizeof (c), "dof-data-%d", i);
14583
14584		dtrace_err_verbose = 1;
14585
14586		if ((dof = dtrace_dof_property(c)) == NULL) {
14587			dtrace_err_verbose = 0;
14588			break;
14589		}
14590
14591		/*
14592		 * We want to create anonymous state, so we need to transition
14593		 * the kernel debugger to indicate that DTrace is active.  If
14594		 * this fails (e.g. because the debugger has modified text in
14595		 * some way), we won't continue with the processing.
14596		 */
14597		if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
14598			cmn_err(CE_NOTE, "kernel debugger active; anonymous "
14599			    "enabling ignored.");
14600			dtrace_dof_destroy(dof);
14601			break;
14602		}
14603
14604		/*
14605		 * If we haven't allocated an anonymous state, we'll do so now.
14606		 */
14607		if ((state = dtrace_anon.dta_state) == NULL) {
14608#if !defined(__APPLE__)
14609			state = dtrace_state_create(NULL, NULL);
14610			dtrace_anon.dta_state = state;
14611			if (state == NULL) {
14612#else
14613			rv = dtrace_state_create(NULL, NULL, &state);
14614			dtrace_anon.dta_state = state;
14615			if (rv != 0 || state == NULL) {
14616#endif /* __APPLE__ */
14617				/*
14618				 * This basically shouldn't happen:  the only
14619				 * failure mode from dtrace_state_create() is a
14620				 * failure of ddi_soft_state_zalloc() that
14621				 * itself should never happen.  Still, the
14622				 * interface allows for a failure mode, and
14623				 * we want to fail as gracefully as possible:
14624				 * we'll emit an error message and cease
14625				 * processing anonymous state in this case.
14626				 */
14627				cmn_err(CE_WARN, "failed to create "
14628				    "anonymous state");
14629				dtrace_dof_destroy(dof);
14630				break;
14631			}
14632		}
14633
14634		rv = dtrace_dof_slurp(dof, &state->dts_vstate, CRED(),
14635		    &dtrace_anon.dta_enabling, 0, B_TRUE);
14636
14637		if (rv == 0)
14638			rv = dtrace_dof_options(dof, state);
14639
14640		dtrace_err_verbose = 0;
14641		dtrace_dof_destroy(dof);
14642
14643		if (rv != 0) {
14644			/*
14645			 * This is malformed DOF; chuck any anonymous state
14646			 * that we created.
14647			 */
14648			ASSERT(dtrace_anon.dta_enabling == NULL);
14649			dtrace_state_destroy(state);
14650			dtrace_anon.dta_state = NULL;
14651			break;
14652		}
14653
14654		ASSERT(dtrace_anon.dta_enabling != NULL);
14655	}
14656
14657	if (dtrace_anon.dta_enabling != NULL) {
14658		int rval;
14659
14660		/*
14661		 * dtrace_enabling_retain() can only fail because we are
14662		 * trying to retain more enablings than are allowed -- but
14663		 * we only have one anonymous enabling, and we are guaranteed
14664		 * to be allowed at least one retained enabling; we assert
14665		 * that dtrace_enabling_retain() returns success.
14666		 */
14667		rval = dtrace_enabling_retain(dtrace_anon.dta_enabling);
14668		ASSERT(rval == 0);
14669
14670		dtrace_enabling_dump(dtrace_anon.dta_enabling);
14671	}
14672}
14673
14674/*
14675 * DTrace Helper Functions
14676 */
14677static void
14678dtrace_helper_trace(dtrace_helper_action_t *helper,
14679    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate, int where)
14680{
14681#if !defined(__APPLE__) /* Quiet compiler warning */
14682	uint32_t size, next, nnext, i;
14683#else
14684	uint32_t size, next, nnext;
14685	int i;
14686#endif /* __APPLE__ */
14687	dtrace_helptrace_t *ent;
14688	uint16_t flags = cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
14689
14690	if (!dtrace_helptrace_enabled)
14691		return;
14692
14693#if !defined(__APPLE__) /* Quiet compiler warning */
14694	ASSERT(vstate->dtvs_nlocals <= dtrace_helptrace_nlocals);
14695#else
14696	ASSERT((uint32_t)vstate->dtvs_nlocals <= dtrace_helptrace_nlocals);
14697#endif /* __APPLE__ */
14698
14699	/*
14700	 * What would a tracing framework be without its own tracing
14701	 * framework?  (Well, a hell of a lot simpler, for starters...)
14702	 */
14703	size = sizeof (dtrace_helptrace_t) + dtrace_helptrace_nlocals *
14704	    sizeof (uint64_t) - sizeof (uint64_t);
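	/*
	 * (The trailing sizeof (uint64_t) is subtracted because the record
	 * structure already reserves one dtht_locals[] slot.)
	 */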
14705
14706	/*
14707	 * Iterate until we can allocate a slot in the trace buffer.
14708	 */
14709	do {
14710		next = dtrace_helptrace_next;
14711
14712		if (next + size < dtrace_helptrace_bufsize) {
14713			nnext = next + size;
14714		} else {
14715			nnext = size;
14716		}
14717	} while (dtrace_cas32(&dtrace_helptrace_next, next, nnext) != next);
14718
14719	/*
14720	 * We have our slot; fill it in.
14721	 */
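	/* nnext == size means our reservation wrapped to the buffer's start. */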
14722	if (nnext == size)
14723		next = 0;
14724
14725	ent = (dtrace_helptrace_t *)&dtrace_helptrace_buffer[next];
14726	ent->dtht_helper = helper;
14727	ent->dtht_where = where;
14728	ent->dtht_nlocals = vstate->dtvs_nlocals;
14729
14730	ent->dtht_fltoffs = (mstate->dtms_present & DTRACE_MSTATE_FLTOFFS) ?
14731	    mstate->dtms_fltoffs : -1;
14732	ent->dtht_fault = DTRACE_FLAGS2FLT(flags);
14733	ent->dtht_illval = cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
14734
14735	for (i = 0; i < vstate->dtvs_nlocals; i++) {
14736		dtrace_statvar_t *svar;
14737
14738		if ((svar = vstate->dtvs_locals[i]) == NULL)
14739			continue;
14740
14741		ASSERT(svar->dtsv_size >= (int)NCPU * sizeof (uint64_t));
14742		ent->dtht_locals[i] =
14743		    ((uint64_t *)(uintptr_t)svar->dtsv_data)[CPU->cpu_id];
14744	}
14745}
14746
14747static uint64_t
14748dtrace_helper(int which, dtrace_mstate_t *mstate,
14749    dtrace_state_t *state, uint64_t arg0, uint64_t arg1)
14750{
14751	uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
14752	uint64_t sarg0 = mstate->dtms_arg[0];
14753	uint64_t sarg1 = mstate->dtms_arg[1];
14754	uint64_t rval = 0;
14755	dtrace_helpers_t *helpers = curproc->p_dtrace_helpers;
14756	dtrace_helper_action_t *helper;
14757	dtrace_vstate_t *vstate;
14758	dtrace_difo_t *pred;
14759	int i, trace = dtrace_helptrace_enabled;
14760
14761	ASSERT(which >= 0 && which < DTRACE_NHELPER_ACTIONS);
14762
14763	if (helpers == NULL)
14764		return (0);
14765
14766	if ((helper = helpers->dthps_actions[which]) == NULL)
14767		return (0);
14768
14769	vstate = &helpers->dthps_vstate;
14770	mstate->dtms_arg[0] = arg0;
14771	mstate->dtms_arg[1] = arg1;
14772
14773	/*
14774	 * Now iterate over each helper.  If its predicate evaluates to 'true',
14775	 * we'll call the corresponding actions.  Note that the below calls
14776	 * to dtrace_dif_emulate() may set faults in machine state.  This is
14777	 * okay:  our caller (the outer dtrace_dif_emulate()) will simply plow
14778	 * the stored DIF offset with its own (which is the desired behavior).
14779	 * Also, note the calls to dtrace_dif_emulate() may allocate scratch
14780	 * from machine state; this is okay, too.
14781	 */
14782	for (; helper != NULL; helper = helper->dtha_next) {
14783		if ((pred = helper->dtha_predicate) != NULL) {
14784			if (trace)
14785				dtrace_helper_trace(helper, mstate, vstate, 0);
14786
14787			if (!dtrace_dif_emulate(pred, mstate, vstate, state))
14788				goto next;
14789
14790			if (*flags & CPU_DTRACE_FAULT)
14791				goto err;
14792		}
14793
14794		for (i = 0; i < helper->dtha_nactions; i++) {
14795			if (trace)
14796				dtrace_helper_trace(helper,
14797				    mstate, vstate, i + 1);
14798
14799			rval = dtrace_dif_emulate(helper->dtha_actions[i],
14800			    mstate, vstate, state);
14801
14802			if (*flags & CPU_DTRACE_FAULT)
14803				goto err;
14804		}
14805
14806next:
14807		if (trace)
14808			dtrace_helper_trace(helper, mstate, vstate,
14809			    DTRACE_HELPTRACE_NEXT);
14810	}
14811
14812	if (trace)
14813		dtrace_helper_trace(helper, mstate, vstate,
14814		    DTRACE_HELPTRACE_DONE);
14815
14816	/*
14817	 * Restore the args that we saved upon entry.
14818	 */
14819	mstate->dtms_arg[0] = sarg0;
14820	mstate->dtms_arg[1] = sarg1;
14821
14822	return (rval);
14823
14824err:
14825	if (trace)
14826		dtrace_helper_trace(helper, mstate, vstate,
14827		    DTRACE_HELPTRACE_ERR);
14828
14829	/*
14830	 * Restore the args that we saved upon entry.
14831	 */
14832	mstate->dtms_arg[0] = sarg0;
14833	mstate->dtms_arg[1] = sarg1;
14834
14835	return (0);
14836}
14837
14838static void
14839dtrace_helper_action_destroy(dtrace_helper_action_t *helper,
14840    dtrace_vstate_t *vstate)
14841{
14842	int i;
14843
14844	if (helper->dtha_predicate != NULL)
14845		dtrace_difo_release(helper->dtha_predicate, vstate);
14846
14847	for (i = 0; i < helper->dtha_nactions; i++) {
14848		ASSERT(helper->dtha_actions[i] != NULL);
14849		dtrace_difo_release(helper->dtha_actions[i], vstate);
14850	}
14851
14852	kmem_free(helper->dtha_actions,
14853	    helper->dtha_nactions * sizeof (dtrace_difo_t *));
14854	kmem_free(helper, sizeof (dtrace_helper_action_t));
14855}
14856
14857#if !defined(__APPLE__)
14858static int
14859dtrace_helper_destroygen(int gen)
14860{
14861	proc_t *p = curproc;
14862#else
14863static int
14864dtrace_helper_destroygen(proc_t* p, int gen)
14865{
14866#endif
14867	dtrace_helpers_t *help = p->p_dtrace_helpers;
14868	dtrace_vstate_t *vstate;
14869#if !defined(__APPLE__) /* Quiet compiler warning */
14870	int i;
14871#else
14872	uint_t i;
14873#endif /* __APPLE__ */
14874
14875	lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14876
14877	if (help == NULL || gen > help->dthps_generation)
14878		return (EINVAL);
14879
14880	vstate = &help->dthps_vstate;
14881
14882	for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
14883		dtrace_helper_action_t *last = NULL, *h, *next;
14884
14885		for (h = help->dthps_actions[i]; h != NULL; h = next) {
14886			next = h->dtha_next;
14887
14888			if (h->dtha_generation == gen) {
14889				if (last != NULL) {
14890					last->dtha_next = next;
14891				} else {
14892					help->dthps_actions[i] = next;
14893				}
14894
14895				dtrace_helper_action_destroy(h, vstate);
14896			} else {
14897				last = h;
14898			}
14899		}
14900	}
14901
14902	/*
14903	 * Iterate until we've cleared out all helper providers with the
14904	 * given generation number.
14905	 */
14906	for (;;) {
14907		dtrace_helper_provider_t *prov = NULL;
14908
14909		/*
14910		 * Look for a helper provider with the right generation. We
14911		 * have to start back at the beginning of the list each time
14912		 * because we drop dtrace_lock. It's unlikely that we'll make
14913		 * more than two passes.
14914		 */
14915		for (i = 0; i < help->dthps_nprovs; i++) {
14916			prov = help->dthps_provs[i];
14917
14918			if (prov->dthp_generation == gen)
14919				break;
14920		}
14921
14922		/*
14923		 * If there were no matches, we're done.
14924		 */
14925		if (i == help->dthps_nprovs)
14926			break;
14927
14928		/*
14929		 * Move the last helper provider into this slot.
14930		 */
14931		help->dthps_nprovs--;
14932		help->dthps_provs[i] = help->dthps_provs[help->dthps_nprovs];
14933		help->dthps_provs[help->dthps_nprovs] = NULL;
14934
14935		lck_mtx_unlock(&dtrace_lock);
14936
14937		/*
14938		 * If we have a meta provider, remove this helper provider.
14939		 */
14940		lck_mtx_lock(&dtrace_meta_lock);
14941		if (dtrace_meta_pid != NULL) {
14942			ASSERT(dtrace_deferred_pid == NULL);
14943			dtrace_helper_provider_remove(&prov->dthp_prov,
14944			    p->p_pid);
14945		}
14946		lck_mtx_unlock(&dtrace_meta_lock);
14947
14948		dtrace_helper_provider_destroy(prov);
14949
14950		lck_mtx_lock(&dtrace_lock);
14951	}
14952
14953	return (0);
14954}
14955
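/*
 * Returns non-zero if every DIFO in the helper action -- the optional
 * predicate and all of its actions -- passes dtrace_difo_validate_helper().
 */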
14956static int
14957dtrace_helper_validate(dtrace_helper_action_t *helper)
14958{
14959	int err = 0, i;
14960	dtrace_difo_t *dp;
14961
14962	if ((dp = helper->dtha_predicate) != NULL)
14963		err += dtrace_difo_validate_helper(dp);
14964
14965	for (i = 0; i < helper->dtha_nactions; i++)
14966		err += dtrace_difo_validate_helper(helper->dtha_actions[i]);
14967
14968	return (err == 0);
14969}
14970
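/*
 * Build a helper action from the given ECB description and append it to the
 * process's list for the specified helper action kind.  Fails with ENOSPC
 * once dtrace_helper_actions_max actions of that kind exist, and with EINVAL
 * for an out-of-range action kind or a description containing anything other
 * than valid DIF expressions.
 */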
14971#if !defined(__APPLE__)
14972static int
14973dtrace_helper_action_add(int which, dtrace_ecbdesc_t *ep)
14974#else
14975static int
14976dtrace_helper_action_add(proc_t* p, int which, dtrace_ecbdesc_t *ep)
14977#endif
14978{
14979	dtrace_helpers_t *help;
14980	dtrace_helper_action_t *helper, *last;
14981	dtrace_actdesc_t *act;
14982	dtrace_vstate_t *vstate;
14983	dtrace_predicate_t *pred;
14984	int count = 0, nactions = 0, i;
14985
14986	if (which < 0 || which >= DTRACE_NHELPER_ACTIONS)
14987		return (EINVAL);
14988
14989#if !defined(__APPLE__)
14990	help = curproc->p_dtrace_helpers;
14991#else
14992	help = p->p_dtrace_helpers;
14993#endif
14994	last = help->dthps_actions[which];
14995	vstate = &help->dthps_vstate;
14996
14997	for (count = 0; last != NULL; last = last->dtha_next) {
14998		count++;
14999		if (last->dtha_next == NULL)
15000			break;
15001	}
15002
15003	/*
15004	 * If we already have dtrace_helper_actions_max helper actions for this
15005	 * helper action type, we'll refuse to add a new one.
15006	 */
15007	if (count >= dtrace_helper_actions_max)
15008		return (ENOSPC);
15009
15010	helper = kmem_zalloc(sizeof (dtrace_helper_action_t), KM_SLEEP);
15011	helper->dtha_generation = help->dthps_generation;
15012
15013	if ((pred = ep->dted_pred.dtpdd_predicate) != NULL) {
15014		ASSERT(pred->dtp_difo != NULL);
15015		dtrace_difo_hold(pred->dtp_difo);
15016		helper->dtha_predicate = pred->dtp_difo;
15017	}
15018
15019	for (act = ep->dted_action; act != NULL; act = act->dtad_next) {
15020		if (act->dtad_kind != DTRACEACT_DIFEXPR)
15021			goto err;
15022
15023		if (act->dtad_difo == NULL)
15024			goto err;
15025
15026		nactions++;
15027	}
15028
15029	helper->dtha_actions = kmem_zalloc(sizeof (dtrace_difo_t *) *
15030	    (helper->dtha_nactions = nactions), KM_SLEEP);
15031
15032	for (act = ep->dted_action, i = 0; act != NULL; act = act->dtad_next) {
15033		dtrace_difo_hold(act->dtad_difo);
15034		helper->dtha_actions[i++] = act->dtad_difo;
15035	}
15036
15037	if (!dtrace_helper_validate(helper))
15038		goto err;
15039
15040	if (last == NULL) {
15041		help->dthps_actions[which] = helper;
15042	} else {
15043		last->dtha_next = helper;
15044	}
15045
15046#if !defined(__APPLE__) /* Quiet compiler warning */
15047	if (vstate->dtvs_nlocals > dtrace_helptrace_nlocals) {
15048#else
15049	if ((uint32_t)vstate->dtvs_nlocals > dtrace_helptrace_nlocals) {
15050#endif /* __APPLE__ */
15051		dtrace_helptrace_nlocals = vstate->dtvs_nlocals;
15052		dtrace_helptrace_next = 0;
15053	}
15054
15055	return (0);
15056err:
15057	dtrace_helper_action_destroy(helper, vstate);
15058	return (EINVAL);
15059}
15060
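/*
 * Hand the process's helper provider descriptions off to the meta provider,
 * or place the process on the dtrace_deferred_pid list if DTrace is not yet
 * attached or no meta provider has registered.  dtrace_lock must not be held
 * on entry.
 */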
15061static void
15062dtrace_helper_provider_register(proc_t *p, dtrace_helpers_t *help,
15063    dof_helper_t *dofhp)
15064{
15065	lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_NOTOWNED);
15066
15067	lck_mtx_lock(&dtrace_meta_lock);
15068	lck_mtx_lock(&dtrace_lock);
15069
15070	if (!dtrace_attached() || dtrace_meta_pid == NULL) {
15071		/*
15072		 * If the dtrace module is loaded but not attached, or if
15073		 * there isn't a meta provider registered to deal with
15074		 * these provider descriptions, we need to postpone creating
15075		 * the actual providers until later.
15076		 */
15077
15078		if (help->dthps_next == NULL && help->dthps_prev == NULL &&
15079		    dtrace_deferred_pid != help) {
15080			help->dthps_deferred = 1;
15081			help->dthps_pid = p->p_pid;
15082			help->dthps_next = dtrace_deferred_pid;
15083			help->dthps_prev = NULL;
15084			if (dtrace_deferred_pid != NULL)
15085				dtrace_deferred_pid->dthps_prev = help;
15086			dtrace_deferred_pid = help;
15087		}
15088
15089		lck_mtx_unlock(&dtrace_lock);
15090
15091	} else if (dofhp != NULL) {
15092		/*
15093		 * If the dtrace module is loaded and we have a particular
15094		 * helper provider description, pass that off to the
15095		 * meta provider.
15096		 */
15097
15098		lck_mtx_unlock(&dtrace_lock);
15099
15100		dtrace_helper_provide(dofhp, p->p_pid);
15101
15102	} else {
15103		/*
15104		 * Otherwise, just pass all the helper provider descriptions
15105		 * off to the meta provider.
15106		 */
15107
15108#if !defined(__APPLE__) /* Quiet compiler warning */
15109		int i;
15110#else
15111		uint_t i;
15112#endif /* __APPLE__ */
15113		lck_mtx_unlock(&dtrace_lock);
15114
15115		for (i = 0; i < help->dthps_nprovs; i++) {
15116			dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
15117			    p->p_pid);
15118		}
15119	}
15120
15121	lck_mtx_unlock(&dtrace_meta_lock);
15122}
15123
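/*
 * Record a new helper provider description for the process, growing the
 * dthps_provs table as needed.  Returns ENOSPC once
 * dtrace_helper_providers_max providers exist and EALREADY if a provider
 * with the same dofhp_addr has already been added.  dtrace_lock must be held.
 */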
15124#if !defined(__APPLE__)
15125static int
15126dtrace_helper_provider_add(dof_helper_t *dofhp, int gen)
15127#else
15128static int
15129dtrace_helper_provider_add(proc_t* p, dof_helper_t *dofhp, int gen)
15130#endif
15131{
15132	dtrace_helpers_t *help;
15133	dtrace_helper_provider_t *hprov, **tmp_provs;
15134	uint_t tmp_maxprovs, i;
15135
15136	lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
15137
15138#if !defined(__APPLE__)
15139	help = curproc->p_dtrace_helpers;
15140#else
15141	help = p->p_dtrace_helpers;
15142#endif
15143	ASSERT(help != NULL);
15144
15145	/*
15146	 * If we already have dtrace_helper_providers_max helper providers,
15147	 * we refuse to add a new one.
15148	 */
15149	if (help->dthps_nprovs >= dtrace_helper_providers_max)
15150		return (ENOSPC);
15151
15152	/*
15153	 * Check to make sure this isn't a duplicate.
15154	 */
15155	for (i = 0; i < help->dthps_nprovs; i++) {
15156		if (dofhp->dofhp_addr ==
15157		    help->dthps_provs[i]->dthp_prov.dofhp_addr)
15158			return (EALREADY);
15159	}
15160
15161	hprov = kmem_zalloc(sizeof (dtrace_helper_provider_t), KM_SLEEP);
15162	hprov->dthp_prov = *dofhp;
15163	hprov->dthp_ref = 1;
15164	hprov->dthp_generation = gen;
15165
15166	/*
15167	 * Allocate a bigger table for helper providers if it's already full.
15168	 */
15169	if (help->dthps_maxprovs == help->dthps_nprovs) {
15170		tmp_maxprovs = help->dthps_maxprovs;
15171		tmp_provs = help->dthps_provs;
15172
15173		if (help->dthps_maxprovs == 0)
15174			help->dthps_maxprovs = 2;
15175		else
15176			help->dthps_maxprovs *= 2;
15177		if (help->dthps_maxprovs > dtrace_helper_providers_max)
15178			help->dthps_maxprovs = dtrace_helper_providers_max;
15179
15180		ASSERT(tmp_maxprovs < help->dthps_maxprovs);
15181
15182		help->dthps_provs = kmem_zalloc(help->dthps_maxprovs *
15183		    sizeof (dtrace_helper_provider_t *), KM_SLEEP);
15184
15185		if (tmp_provs != NULL) {
15186			bcopy(tmp_provs, help->dthps_provs, tmp_maxprovs *
15187			    sizeof (dtrace_helper_provider_t *));
15188			kmem_free(tmp_provs, tmp_maxprovs *
15189			    sizeof (dtrace_helper_provider_t *));
15190		}
15191	}
15192
15193	help->dthps_provs[help->dthps_nprovs] = hprov;
15194	help->dthps_nprovs++;
15195
15196	return (0);
15197}
15198
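/*
 * Drop a reference on a helper provider; when the last reference goes away,
 * destroy its DOF and free the structure.
 */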
15199static void
15200dtrace_helper_provider_destroy(dtrace_helper_provider_t *hprov)
15201{
15202	lck_mtx_lock(&dtrace_lock);
15203
15204	if (--hprov->dthp_ref == 0) {
15205		dof_hdr_t *dof;
15206		lck_mtx_unlock(&dtrace_lock);
15207		dof = (dof_hdr_t *)(uintptr_t)hprov->dthp_prov.dofhp_dof;
15208		dtrace_dof_destroy(dof);
15209		kmem_free(hprov, sizeof (dtrace_helper_provider_t));
15210	} else {
15211		lck_mtx_unlock(&dtrace_lock);
15212	}
15213}
15214
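/*
 * Sanity-check a DOF_SECT_PROVIDER section and the string table, probe,
 * argument and offset sections it references before any helper providers
 * are created from it.  Returns 0 if the section is well-formed, -1 (after
 * calling dtrace_dof_error()) otherwise.
 */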
15215static int
15216dtrace_helper_provider_validate(dof_hdr_t *dof, dof_sec_t *sec)
15217{
15218	uintptr_t daddr = (uintptr_t)dof;
15219	dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
15220	dof_provider_t *provider;
15221	dof_probe_t *probe;
15222	uint8_t *arg;
15223	char *strtab, *typestr;
15224	dof_stridx_t typeidx;
15225	size_t typesz;
15226	uint_t nprobes, j, k;
15227
15228	ASSERT(sec->dofs_type == DOF_SECT_PROVIDER);
15229
15230	if (sec->dofs_offset & (sizeof (uint_t) - 1)) {
15231		dtrace_dof_error(dof, "misaligned section offset");
15232		return (-1);
15233	}
15234
15235	/*
15236	 * The section needs to be large enough to contain the DOF provider
15237	 * structure appropriate for the given version.
15238	 */
15239	if (sec->dofs_size <
15240	    ((dof->dofh_ident[DOF_ID_VERSION] == DOF_VERSION_1) ?
15241	    offsetof(dof_provider_t, dofpv_prenoffs) :
15242	    sizeof (dof_provider_t))) {
15243		dtrace_dof_error(dof, "provider section too small");
15244		return (-1);
15245	}
15246
15247	provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
15248	str_sec = dtrace_dof_sect(dof, DOF_SECT_STRTAB, provider->dofpv_strtab);
15249	prb_sec = dtrace_dof_sect(dof, DOF_SECT_PROBES, provider->dofpv_probes);
15250	arg_sec = dtrace_dof_sect(dof, DOF_SECT_PRARGS, provider->dofpv_prargs);
15251	off_sec = dtrace_dof_sect(dof, DOF_SECT_PROFFS, provider->dofpv_proffs);
15252
15253	if (str_sec == NULL || prb_sec == NULL ||
15254	    arg_sec == NULL || off_sec == NULL)
15255		return (-1);
15256
15257	enoff_sec = NULL;
15258
15259	if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
15260	    provider->dofpv_prenoffs != DOF_SECT_NONE &&
15261	    (enoff_sec = dtrace_dof_sect(dof, DOF_SECT_PRENOFFS,
15262	    provider->dofpv_prenoffs)) == NULL)
15263		return (-1);
15264
15265	strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
15266
15267	if (provider->dofpv_name >= str_sec->dofs_size ||
15268	    strlen(strtab + provider->dofpv_name) >= DTRACE_PROVNAMELEN) {
15269		dtrace_dof_error(dof, "invalid provider name");
15270		return (-1);
15271	}
15272
15273	if (prb_sec->dofs_entsize == 0 ||
15274	    prb_sec->dofs_entsize > prb_sec->dofs_size) {
15275		dtrace_dof_error(dof, "invalid entry size");
15276		return (-1);
15277	}
15278
15279	if (prb_sec->dofs_entsize & (sizeof (uintptr_t) - 1)) {
15280		dtrace_dof_error(dof, "misaligned entry size");
15281		return (-1);
15282	}
15283
15284	if (off_sec->dofs_entsize != sizeof (uint32_t)) {
15285		dtrace_dof_error(dof, "invalid entry size");
15286		return (-1);
15287	}
15288
15289	if (off_sec->dofs_offset & (sizeof (uint32_t) - 1)) {
15290		dtrace_dof_error(dof, "misaligned section offset");
15291		return (-1);
15292	}
15293
15294	if (arg_sec->dofs_entsize != sizeof (uint8_t)) {
15295		dtrace_dof_error(dof, "invalid entry size");
15296		return (-1);
15297	}
15298
15299	arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
15300
15301	nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
15302
15303	/*
15304	 * Take a pass through the probes to check for errors.
15305	 */
15306	for (j = 0; j < nprobes; j++) {
15307		probe = (dof_probe_t *)(uintptr_t)(daddr +
15308		    prb_sec->dofs_offset + j * prb_sec->dofs_entsize);
15309
15310		if (probe->dofpr_func >= str_sec->dofs_size) {
15311			dtrace_dof_error(dof, "invalid function name");
15312			return (-1);
15313		}
15314
15315		if (strlen(strtab + probe->dofpr_func) >= DTRACE_FUNCNAMELEN) {
15316			dtrace_dof_error(dof, "function name too long");
15317			return (-1);
15318		}
15319
15320		if (probe->dofpr_name >= str_sec->dofs_size ||
15321		    strlen(strtab + probe->dofpr_name) >= DTRACE_NAMELEN) {
15322			dtrace_dof_error(dof, "invalid probe name");
15323			return (-1);
15324		}
15325
15326		/*
15327		 * The offset count must not wrap the index, and the offsets
15328		 * must also not overflow the section's data.
15329		 */
15330		if (probe->dofpr_offidx + probe->dofpr_noffs <
15331		    probe->dofpr_offidx ||
15332		    (probe->dofpr_offidx + probe->dofpr_noffs) *
15333		    off_sec->dofs_entsize > off_sec->dofs_size) {
15334			dtrace_dof_error(dof, "invalid probe offset");
15335			return (-1);
15336		}
15337
15338		if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1) {
15339			/*
15340			 * If there's no is-enabled offset section, make sure
15341			 * there aren't any is-enabled offsets. Otherwise
15342			 * perform the same checks as for probe offsets
15343			 * (immediately above).
15344			 */
15345			if (enoff_sec == NULL) {
15346				if (probe->dofpr_enoffidx != 0 ||
15347				    probe->dofpr_nenoffs != 0) {
15348					dtrace_dof_error(dof, "is-enabled "
15349					    "offsets with null section");
15350					return (-1);
15351				}
15352			} else if (probe->dofpr_enoffidx +
15353			    probe->dofpr_nenoffs < probe->dofpr_enoffidx ||
15354			    (probe->dofpr_enoffidx + probe->dofpr_nenoffs) *
15355			    enoff_sec->dofs_entsize > enoff_sec->dofs_size) {
15356				dtrace_dof_error(dof, "invalid is-enabled "
15357				    "offset");
15358				return (-1);
15359			}
15360
15361			if (probe->dofpr_noffs + probe->dofpr_nenoffs == 0) {
15362				dtrace_dof_error(dof, "zero probe and "
15363				    "is-enabled offsets");
15364				return (-1);
15365			}
15366		} else if (probe->dofpr_noffs == 0) {
15367			dtrace_dof_error(dof, "zero probe offsets");
15368			return (-1);
15369		}
15370
15371		if (probe->dofpr_argidx + probe->dofpr_xargc <
15372		    probe->dofpr_argidx ||
15373		    (probe->dofpr_argidx + probe->dofpr_xargc) *
15374		    arg_sec->dofs_entsize > arg_sec->dofs_size) {
15375			dtrace_dof_error(dof, "invalid args");
15376			return (-1);
15377		}
15378
15379		typeidx = probe->dofpr_nargv;
15380		typestr = strtab + probe->dofpr_nargv;
15381		for (k = 0; k < probe->dofpr_nargc; k++) {
15382			if (typeidx >= str_sec->dofs_size) {
15383				dtrace_dof_error(dof, "bad "
15384				    "native argument type");
15385				return (-1);
15386			}
15387
15388			typesz = strlen(typestr) + 1;
15389			if (typesz > DTRACE_ARGTYPELEN) {
15390				dtrace_dof_error(dof, "native "
15391				    "argument type too long");
15392				return (-1);
15393			}
15394			typeidx += typesz;
15395			typestr += typesz;
15396		}
15397
15398		typeidx = probe->dofpr_xargv;
15399		typestr = strtab + probe->dofpr_xargv;
15400		for (k = 0; k < probe->dofpr_xargc; k++) {
15401			if (arg[probe->dofpr_argidx + k] > probe->dofpr_nargc) {
15402				dtrace_dof_error(dof, "bad "
15403				    "native argument index");
15404				return (-1);
15405			}
15406
15407			if (typeidx >= str_sec->dofs_size) {
15408				dtrace_dof_error(dof, "bad "
15409				    "translated argument type");
15410				return (-1);
15411			}
15412
15413			typesz = strlen(typestr) + 1;
15414			if (typesz > DTRACE_ARGTYPELEN) {
15415				dtrace_dof_error(dof, "translated argument "
15416				    "type too long");
15417				return (-1);
15418			}
15419
15420			typeidx += typesz;
15421			typestr += typesz;
15422		}
15423	}
15424
15425	return (0);
15426}
15427
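/*
 * Consume a DOF object containing helper actions and/or helper provider
 * descriptions for the process.  Returns the new generation number on
 * success; on failure, any actions added from this DOF are torn down and
 * -1 (or the error from dtrace_dof_slurp()) is returned.  The DOF is
 * destroyed here unless it has been handed off to a helper provider.
 */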
15428#if !defined(__APPLE__)
15429static int
15430dtrace_helper_slurp(dof_hdr_t *dof, dof_helper_t *dhp)
15431#else
15432static int
15433dtrace_helper_slurp(proc_t* p, dof_hdr_t *dof, dof_helper_t *dhp)
15434#endif
15435{
15436	dtrace_helpers_t *help;
15437	dtrace_vstate_t *vstate;
15438	dtrace_enabling_t *enab = NULL;
15439	int i, gen, rv, nhelpers = 0, nprovs = 0, destroy = 1;
15440	uintptr_t daddr = (uintptr_t)dof;
15441
15442	lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
15443
15444#if !defined(__APPLE__)
15445	if ((help = curproc->p_dtrace_helpers) == NULL)
15446		help = dtrace_helpers_create(curproc);
15447#else
15448	if ((help = p->p_dtrace_helpers) == NULL)
15449		help = dtrace_helpers_create(p);
15450#endif
15451
15452	vstate = &help->dthps_vstate;
15453
15454	if ((rv = dtrace_dof_slurp(dof, vstate, NULL, &enab,
15455	    dhp != NULL ? dhp->dofhp_addr : 0, B_FALSE)) != 0) {
15456		dtrace_dof_destroy(dof);
15457		return (rv);
15458	}
15459
15460	/*
15461	 * Look for helper providers and validate their descriptions.
15462	 */
15463	if (dhp != NULL) {
15464#if !defined(__APPLE__) /* Quiet compiler warning */
15465		for (i = 0; i < dof->dofh_secnum; i++) {
15466#else
15467		for (i = 0; (uint32_t)i < dof->dofh_secnum; i++) {
15468#endif /* __APPLE__ */
15469			dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
15470			    dof->dofh_secoff + i * dof->dofh_secsize);
15471
15472			if (sec->dofs_type != DOF_SECT_PROVIDER)
15473				continue;
15474
15475			if (dtrace_helper_provider_validate(dof, sec) != 0) {
15476				dtrace_enabling_destroy(enab);
15477				dtrace_dof_destroy(dof);
15478				return (-1);
15479			}
15480
15481			nprovs++;
15482		}
15483	}
15484
15485	/*
15486	 * Now we need to walk through the ECB descriptions in the enabling.
15487	 */
15488	for (i = 0; i < enab->dten_ndesc; i++) {
15489		dtrace_ecbdesc_t *ep = enab->dten_desc[i];
15490		dtrace_probedesc_t *desc = &ep->dted_probe;
15491
15492#if !defined(__APPLE__)
15493		if (strcmp(desc->dtpd_provider, "dtrace") != 0)
15494			continue;
15495
15496		if (strcmp(desc->dtpd_mod, "helper") != 0)
15497			continue;
15498
15499		if (strcmp(desc->dtpd_func, "ustack") != 0)
15500			continue;
15501#else /* Employ size bounded string operation. */
15502		if (!LIT_STRNEQL(desc->dtpd_provider, "dtrace"))
15503			continue;
15504
15505		if (!LIT_STRNEQL(desc->dtpd_mod, "helper"))
15506			continue;
15507
15508		if (!LIT_STRNEQL(desc->dtpd_func, "ustack"))
15509			continue;
15510#endif /* __APPLE__ */
15511
15512#if !defined(__APPLE__)
15513		if ((rv = dtrace_helper_action_add(DTRACE_HELPER_ACTION_USTACK,
15514		    ep)) != 0) {
15515#else
15516		if ((rv = dtrace_helper_action_add(p, DTRACE_HELPER_ACTION_USTACK,
15517		    ep)) != 0) {
15518#endif
15519			/*
15520			 * Adding this helper action failed -- we are now going
15521			 * to rip out the entire generation and return failure.
15522			 */
15523#if !defined(__APPLE__)
15524			(void) dtrace_helper_destroygen(help->dthps_generation);
15525#else
15526			(void) dtrace_helper_destroygen(p, help->dthps_generation);
15527#endif
15528			dtrace_enabling_destroy(enab);
15529			dtrace_dof_destroy(dof);
15530			return (-1);
15531		}
15532
15533		nhelpers++;
15534	}
15535
15536	if (nhelpers < enab->dten_ndesc)
15537		dtrace_dof_error(dof, "unmatched helpers");
15538
15539	gen = help->dthps_generation++;
15540	dtrace_enabling_destroy(enab);
15541
15542	if (dhp != NULL && nprovs > 0) {
15543		dhp->dofhp_dof = (uint64_t)(uintptr_t)dof;
15544#if !defined(__APPLE__)
15545		if (dtrace_helper_provider_add(dhp, gen) == 0) {
15546#else
15547		if (dtrace_helper_provider_add(p, dhp, gen) == 0) {
15548#endif
15549			lck_mtx_unlock(&dtrace_lock);
15550#if !defined(__APPLE__)
15551			dtrace_helper_provider_register(curproc, help, dhp);
15552#else
15553			dtrace_helper_provider_register(p, help, dhp);
15554#endif
15555			lck_mtx_lock(&dtrace_lock);
15556
15557			destroy = 0;
15558		}
15559	}
15560
15561	if (destroy)
15562		dtrace_dof_destroy(dof);
15563
15564	return (gen);
15565}
15566
15567#if defined(__APPLE__)
15568
15569/*
15570 * DTrace lazy dof
15571 *
15572 * DTrace user static probes (USDT probes) and helper actions are loaded
15573 * in a process by proccessing dof sections. The dof sections are passed
15574 * in a process by processing dof sections. The dof sections are passed
15575 * expensive to process dof for a process that will never use it. There
15576 * is a memory cost (allocating the providers/probes), and a cpu cost
15577 * (creating the providers/probes).
15578 *
15579 * To reduce this cost, we use "lazy dof". The normal procedure for
15580 * dof processing is to copyin the dof(s) pointed to by the dof_ioctl_data_t
15581 * block, and invoke dtrace_helper_slurp() on them. When "lazy dof" is
15582 * used, each process retains the dof_ioctl_data_t block, instead of
15583 * copying in the data it points to.
15584 *
15585 * The dof_ioctl_data_t blocks are managed as if they were the actual
15586 * processed dof; on fork the block is copied to the child, on exec and
15587 * exit the block is freed.
15588 *
15589 * If the process loads library(s) containing additional dof, the
15590 * new dof_ioctl_data_t is merged with the existing block.
15591 *
15592 * There are a few catches that make this slightly more difficult.
15593 * When dyld registers dof_ioctl_data_t blocks, it expects a unique
15594 * identifier value for each dof in the block. In non-lazy dof terms,
15595 * this is the generation that dof was loaded in. If we hand back
15596 * a UID for a lazy dof, that same UID must be able to unload the
15597 * dof once it has become non-lazy. To meet this requirement, the
15598 * code that loads lazy dof requires that the UIDs for dof(s) in
15599 * the lazy dof be sorted in ascending order. It is okay to skip
15600 * UIDs, e.g., 1 -> 5 -> 6 is legal.
15601 *
15602 * Once a process has become non-lazy, it will stay non-lazy. All
15603 * future dof operations for that process will be non-lazy, even
15604 * if the dof mode transitions back to lazy.
15605 *
15606 * Always do lazy dof checks before non-lazy ones (i.e., in fork, exit, and exec).
15607 * That way if the lazy check fails due to transitioning to non-lazy, the
15608 * right thing is done with the newly faulted in dof.
15609 */
15610
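/*
 * For example, if a process already holds lazy dofs with generations 1, 2,
 * and 5 and dyld registers two more, dtrace_lazy_dofs_add() below assigns
 * the new entries generations 6 and 7 (one past the highest existing
 * generation, counting up).
 */
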
15611/*
15612 * This method is a bit squicky. It must handle:
15613 *
15614 * dof should not be lazy.
15615 * dof should have been handled lazily, but there was an error.
15616 * dof was handled lazily, and needs to be freed.
15617 * dof was handled lazily, and must not be freed.
15618 *
15619 *
15620 * Returns EACCES if dof should be handled non-lazily.
15621 *
15622 * KERN_SUCCESS and all other return codes indicate lazy handling of dof.
15623 *
15624 * If the dofs data is claimed by this method, dofs_claimed will be set.
15625 * Callers should not free claimed dofs.
15626 */
15627static int
15628dtrace_lazy_dofs_add(proc_t *p, dof_ioctl_data_t* incoming_dofs, int *dofs_claimed)
15629{
15630	ASSERT(p);
15631	ASSERT(incoming_dofs && incoming_dofs->dofiod_count > 0);
15632
15633	int rval = 0;
15634	*dofs_claimed = 0;
15635
15636	lck_rw_lock_shared(&dtrace_dof_mode_lock);
15637
15638	/*
15639	 * If we have lazy dof, dof mode better be LAZY_ON.
15640	 */
15641	ASSERT(p->p_dtrace_lazy_dofs == NULL || dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON);
15642	ASSERT(p->p_dtrace_lazy_dofs == NULL || p->p_dtrace_helpers == NULL);
15643	ASSERT(dtrace_dof_mode != DTRACE_DOF_MODE_NEVER);
15644
15645	/*
15646	 * Any existing helpers force non-lazy behavior.
15647	 */
15648	if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON && (p->p_dtrace_helpers == NULL)) {
15649		lck_mtx_lock(&p->p_dtrace_sprlock);
15650
15651		dof_ioctl_data_t* existing_dofs = p->p_dtrace_lazy_dofs;
15652		unsigned int existing_dofs_count = (existing_dofs) ? existing_dofs->dofiod_count : 0;
15653		unsigned int i, merged_dofs_count = incoming_dofs->dofiod_count + existing_dofs_count;
15654
15655		/*
15656		 * Range check...
15657		 */
15658		if (merged_dofs_count == 0 || merged_dofs_count > 1024) {
15659			dtrace_dof_error(NULL, "lazy_dofs_add merged_dofs_count out of range");
15660			rval = EINVAL;
15661			goto unlock;
15662		}
15663
15664		/*
15665		 * Each dof being added must be assigned a unique generation.
15666		 */
15667		uint64_t generation = (existing_dofs) ? existing_dofs->dofiod_helpers[existing_dofs_count - 1].dofhp_dof + 1 : 1;
15668		for (i=0; i<incoming_dofs->dofiod_count; i++) {
15669			/*
15670			 * We rely on these being the same so we can overwrite dofhp_dof and not lose info.
15671			 */
15672			ASSERT(incoming_dofs->dofiod_helpers[i].dofhp_dof == incoming_dofs->dofiod_helpers[i].dofhp_addr);
15673			incoming_dofs->dofiod_helpers[i].dofhp_dof = generation++;
15674		}
15675
15676
15677		if (existing_dofs) {
15678			/*
15679			 * Merge the existing and incoming dofs
15680			 */
15681			size_t merged_dofs_size = DOF_IOCTL_DATA_T_SIZE(merged_dofs_count);
15682			dof_ioctl_data_t* merged_dofs = kmem_alloc(merged_dofs_size, KM_SLEEP);
15683
15684			bcopy(&existing_dofs->dofiod_helpers[0],
15685			      &merged_dofs->dofiod_helpers[0],
15686			      sizeof(dof_helper_t) * existing_dofs_count);
15687			bcopy(&incoming_dofs->dofiod_helpers[0],
15688			      &merged_dofs->dofiod_helpers[existing_dofs_count],
15689			      sizeof(dof_helper_t) * incoming_dofs->dofiod_count);
15690
15691			merged_dofs->dofiod_count = merged_dofs_count;
15692
15693			kmem_free(existing_dofs, DOF_IOCTL_DATA_T_SIZE(existing_dofs_count));
15694
15695			p->p_dtrace_lazy_dofs = merged_dofs;
15696		} else {
15697			/*
15698			 * Claim the incoming dofs
15699			 */
15700			*dofs_claimed = 1;
15701			p->p_dtrace_lazy_dofs = incoming_dofs;
15702		}
15703
15704#if DEBUG
15705		dof_ioctl_data_t* all_dofs = p->p_dtrace_lazy_dofs;
15706		for (i=0; i<all_dofs->dofiod_count-1; i++) {
15707			ASSERT(all_dofs->dofiod_helpers[i].dofhp_dof < all_dofs->dofiod_helpers[i+1].dofhp_dof);
15708		}
15709#endif /* DEBUG */
15710
15711unlock:
15712		lck_mtx_unlock(&p->p_dtrace_sprlock);
15713	} else {
15714		rval = EACCES;
15715	}
15716
15717 	lck_rw_unlock_shared(&dtrace_dof_mode_lock);
15718
15719	return rval;
15720}
15721
15722/*
15723 * Returns:
15724 *
15725 * EINVAL: lazy dof is enabled, but the requested generation was not found.
15726 * EACCES: This removal needs to be handled non-lazily.
15727 */
15728static int
15729dtrace_lazy_dofs_remove(proc_t *p, int generation)
15730{
15731	int rval = EINVAL;
15732
15733	lck_rw_lock_shared(&dtrace_dof_mode_lock);
15734
15735	/*
15736	 * If we have lazy dof, dof mode better be LAZY_ON.
15737	 */
15738	ASSERT(p->p_dtrace_lazy_dofs == NULL || dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON);
15739	ASSERT(p->p_dtrace_lazy_dofs == NULL || p->p_dtrace_helpers == NULL);
15740	ASSERT(dtrace_dof_mode != DTRACE_DOF_MODE_NEVER);
15741
15742	/*
15743	 * Any existing helpers force non-lazy behavior.
15744	 */
15745	if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON && (p->p_dtrace_helpers == NULL)) {
15746		lck_mtx_lock(&p->p_dtrace_sprlock);
15747
15748		dof_ioctl_data_t* existing_dofs = p->p_dtrace_lazy_dofs;
15749
15750		if (existing_dofs) {
15751			int index, existing_dofs_count = existing_dofs->dofiod_count;
15752			for (index=0; index<existing_dofs_count; index++) {
15753				if ((int)existing_dofs->dofiod_helpers[index].dofhp_dof == generation) {
15754					dof_ioctl_data_t* removed_dofs = NULL;
15755
15756					/*
15757					 * If there is only 1 dof, we'll delete it and swap in NULL.
15758					 */
15759					if (existing_dofs_count > 1) {
15760						int removed_dofs_count = existing_dofs_count - 1;
15761						size_t removed_dofs_size = DOF_IOCTL_DATA_T_SIZE(removed_dofs_count);
15762
15763						removed_dofs = kmem_alloc(removed_dofs_size, KM_SLEEP);
15764						removed_dofs->dofiod_count = removed_dofs_count;
15765
15766						/*
15767						 * copy the remaining data.
15768						 */
15769						if (index > 0) {
15770							bcopy(&existing_dofs->dofiod_helpers[0],
15771							      &removed_dofs->dofiod_helpers[0],
15772							      index * sizeof(dof_helper_t));
15773						}
15774
15775						if (index < existing_dofs_count-1) {
15776							bcopy(&existing_dofs->dofiod_helpers[index+1],
15777							      &removed_dofs->dofiod_helpers[index],
15778							      (existing_dofs_count - index - 1) * sizeof(dof_helper_t));
15779						}
15780					}
15781
15782					kmem_free(existing_dofs, DOF_IOCTL_DATA_T_SIZE(existing_dofs_count));
15783
15784					p->p_dtrace_lazy_dofs = removed_dofs;
15785
15786					rval = KERN_SUCCESS;
15787
15788					break;
15789				}
15790			}
15791
15792#if DEBUG
15793			dof_ioctl_data_t* all_dofs = p->p_dtrace_lazy_dofs;
15794			if (all_dofs) {
15795				unsigned int i;
15796				for (i=0; i<all_dofs->dofiod_count-1; i++) {
15797					ASSERT(all_dofs->dofiod_helpers[i].dofhp_dof < all_dofs->dofiod_helpers[i+1].dofhp_dof);
15798				}
15799			}
15800#endif
15801
15802		}
15803
15804		lck_mtx_unlock(&p->p_dtrace_sprlock);
15805	} else {
15806		rval = EACCES;
15807	}
15808
15809	lck_rw_unlock_shared(&dtrace_dof_mode_lock);
15810
15811	return rval;
15812}
15813
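/*
 * Discard any lazy dof retained by the process; the saved dof_ioctl_data_t
 * block is simply freed.  Called on exit and exec.
 */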
15814void
15815dtrace_lazy_dofs_destroy(proc_t *p)
15816{
15817	lck_rw_lock_shared(&dtrace_dof_mode_lock);
15818	lck_mtx_lock(&p->p_dtrace_sprlock);
15819
15820	/*
15821	 * If we have lazy dof, dof mode better be LAZY_ON, or we must be exiting.
15822	 * We cannot assert against DTRACE_DOF_MODE_NEVER here, because we are called from
15823	 * kern_exit.c and kern_exec.c.
15824	 */
15825	ASSERT(p->p_dtrace_lazy_dofs == NULL || dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON || p->p_lflag & P_LEXIT);
15826	ASSERT(p->p_dtrace_lazy_dofs == NULL || p->p_dtrace_helpers == NULL);
15827
15828	dof_ioctl_data_t* lazy_dofs = p->p_dtrace_lazy_dofs;
15829	p->p_dtrace_lazy_dofs = NULL;
15830
15831	lck_mtx_unlock(&p->p_dtrace_sprlock);
15832	lck_rw_unlock_shared(&dtrace_dof_mode_lock);
15833
15834	if (lazy_dofs) {
15835		kmem_free(lazy_dofs, DOF_IOCTL_DATA_T_SIZE(lazy_dofs->dofiod_count));
15836	}
15837}
15838
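/*
 * At fork, copy the parent's lazy dof block (if any) into the child.
 */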
15839void
15840dtrace_lazy_dofs_duplicate(proc_t *parent, proc_t *child)
15841{
15842	lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_NOTOWNED);
15843	lck_mtx_assert(&parent->p_dtrace_sprlock, LCK_MTX_ASSERT_NOTOWNED);
15844	lck_mtx_assert(&child->p_dtrace_sprlock, LCK_MTX_ASSERT_NOTOWNED);
15845
15846	lck_rw_lock_shared(&dtrace_dof_mode_lock);
15847	lck_mtx_lock(&parent->p_dtrace_sprlock);
15848
15849	/*
15850	 * If we have lazy dof, dof mode better be LAZY_ON, or we must be exiting.
15851	 * We cannot assert against DTRACE_DOF_MODE_NEVER here, because we are called from
15852	 * kern_fork.c
15853	 */
15854	ASSERT(parent->p_dtrace_lazy_dofs == NULL || dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON);
15855	ASSERT(parent->p_dtrace_lazy_dofs == NULL || parent->p_dtrace_helpers == NULL);
15856	/*
15857	 * In theory we should hold the child sprlock, but this is safe...
15858	 */
15859	ASSERT(child->p_dtrace_lazy_dofs == NULL && child->p_dtrace_helpers == NULL);
15860
15861	dof_ioctl_data_t* parent_dofs = parent->p_dtrace_lazy_dofs;
15862	dof_ioctl_data_t* child_dofs = NULL;
15863	if (parent_dofs) {
15864		size_t parent_dofs_size = DOF_IOCTL_DATA_T_SIZE(parent_dofs->dofiod_count);
15865		child_dofs = kmem_alloc(parent_dofs_size, KM_SLEEP);
15866		bcopy(parent_dofs, child_dofs, parent_dofs_size);
15867	}
15868
15869	lck_mtx_unlock(&parent->p_dtrace_sprlock);
15870
15871	if (child_dofs) {
15872		lck_mtx_lock(&child->p_dtrace_sprlock);
15873		child->p_dtrace_lazy_dofs = child_dofs;
15874		lck_mtx_unlock(&child->p_dtrace_sprlock);
15875	}
15876
15877	lck_rw_unlock_shared(&dtrace_dof_mode_lock);
15878}
15879
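/*
 * Filter and worker routines used when iterating processes to fault in lazy
 * dof after the dof mode has transitioned to DTRACE_DOF_MODE_LAZY_OFF: the
 * filter selects processes that still hold lazy dof, and the worker copies
 * each dof in from the process and feeds it to dtrace_helper_slurp().
 */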
15880static int
15881dtrace_lazy_dofs_proc_iterate_filter(proc_t *p, void* ignored)
15882{
15883#pragma unused(ignored)
15884	/*
15885	 * Okay to NULL test without taking the sprlock.
15886	 */
15887	return p->p_dtrace_lazy_dofs != NULL;
15888}
15889
15890static int
15891dtrace_lazy_dofs_proc_iterate_doit(proc_t *p, void* ignored)
15892{
15893#pragma unused(ignored)
15894	/*
15895	 * It is possible this process may exit during our attempt to
15896	 * fault in the dof. We could fix this by holding locks longer,
15897	 * but the errors are benign.
15898	 */
15899	lck_mtx_lock(&p->p_dtrace_sprlock);
15900
15901	/*
15902	 * In this case only, it is okay to have lazy dof when dof mode is DTRACE_DOF_MODE_LAZY_OFF.
15903	 */
15904	ASSERT(p->p_dtrace_lazy_dofs == NULL || p->p_dtrace_helpers == NULL);
15905	ASSERT(dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_OFF);
15906
15907
15908	dof_ioctl_data_t* lazy_dofs = p->p_dtrace_lazy_dofs;
15909	p->p_dtrace_lazy_dofs = NULL;
15910
15911	lck_mtx_unlock(&p->p_dtrace_sprlock);
15912
15913	/*
15914	 * Process each dof_helper_t
15915	 */
15916	if (lazy_dofs != NULL) {
15917		unsigned int i;
15918		int rval;
15919
15920		for (i=0; i<lazy_dofs->dofiod_count; i++) {
15921			/*
15922			 * When loading lazy dof, we depend on the generations being sorted in ascending order.
15923			 */
15924			ASSERT(i >= (lazy_dofs->dofiod_count - 1) || lazy_dofs->dofiod_helpers[i].dofhp_dof < lazy_dofs->dofiod_helpers[i+1].dofhp_dof);
15925
15926			dof_helper_t *dhp = &lazy_dofs->dofiod_helpers[i];
15927
15928			/*
15929			 * We stored the generation in dofhp_dof. Save it, and restore the original value.
15930			 */
15931			int generation = dhp->dofhp_dof;
15932			dhp->dofhp_dof = dhp->dofhp_addr;
15933
15934			dof_hdr_t *dof = dtrace_dof_copyin_from_proc(p, dhp->dofhp_dof, &rval);
15935
15936			if (dof != NULL) {
15937				dtrace_helpers_t *help;
15938
15939				lck_mtx_lock(&dtrace_lock);
15940
15941				/*
15942				 * This must be done with the dtrace_lock held
15943				 */
15944				if ((help = p->p_dtrace_helpers) == NULL)
15945					help = dtrace_helpers_create(p);
15946
15947				/*
15948				 * If the generation value has been bumped, someone snuck in
15949				 * when we released the dtrace lock. We have to dump this generation;
15950				 * there is no safe way to load it.
15951				 */
15952				if (help->dthps_generation <= generation) {
15953					help->dthps_generation = generation;
15954
15955					/*
15956					 * dtrace_helper_slurp() takes responsibility for the dof --
15957					 * it may free it now or it may save it and free it later.
15958					 */
15959					if ((rval = dtrace_helper_slurp(p, dof, dhp)) != generation) {
15960						dtrace_dof_error(NULL, "returned value did not match expected generation");
15961					}
15962				}
15963
15964				lck_mtx_unlock(&dtrace_lock);
15965			}
15966		}
15967
15968		kmem_free(lazy_dofs, DOF_IOCTL_DATA_T_SIZE(lazy_dofs->dofiod_count));
15969	}
15970
15971	return PROC_RETURNED;
15972}
15973
15974#endif /* __APPLE__ */
15975
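/*
 * Allocate the per-process helpers structure and its helper action table and
 * install it as p->p_dtrace_helpers.  dtrace_lock must be held, and the
 * process must not already have helpers.
 */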
15976static dtrace_helpers_t *
15977dtrace_helpers_create(proc_t *p)
15978{
15979	dtrace_helpers_t *help;
15980
15981	lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
15982	ASSERT(p->p_dtrace_helpers == NULL);
15983
15984	help = kmem_zalloc(sizeof (dtrace_helpers_t), KM_SLEEP);
15985	help->dthps_actions = kmem_zalloc(sizeof (dtrace_helper_action_t *) *
15986	    DTRACE_NHELPER_ACTIONS, KM_SLEEP);
15987
15988	p->p_dtrace_helpers = help;
15989	dtrace_helpers++;
15990
15991	return (help);
15992}
15993
15994#if !defined(__APPLE__)
15995static void
15996dtrace_helpers_destroy(void)
15997{
15998	dtrace_helpers_t *help;
15999	dtrace_vstate_t *vstate;
16000	proc_t *p = curproc;
16001	int i;
16002#else
16003static void
16004dtrace_helpers_destroy(proc_t* p)
16005{
16006	dtrace_helpers_t *help;
16007	dtrace_vstate_t *vstate;
16008	uint_t i;
16009#endif
16010
16011	lck_mtx_lock(&dtrace_lock);
16012
16013	ASSERT(p->p_dtrace_helpers != NULL);
16014	ASSERT(dtrace_helpers > 0);
16015
16016	help = p->p_dtrace_helpers;
16017	vstate = &help->dthps_vstate;
16018
16019	/*
16020	 * We're now going to lose the help from this process.
16021	 */
16022	p->p_dtrace_helpers = NULL;
16023	dtrace_sync();
16024
16025	/*
16026	 * Destroy the helper actions.
16027	 */
16028	for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
16029		dtrace_helper_action_t *h, *next;
16030
16031		for (h = help->dthps_actions[i]; h != NULL; h = next) {
16032			next = h->dtha_next;
16033			dtrace_helper_action_destroy(h, vstate);
16034			h = next;
16035		}
16036	}
16037
16038	lck_mtx_unlock(&dtrace_lock);
16039
16040	/*
16041	 * Destroy the helper providers.
16042	 */
16043	if (help->dthps_maxprovs > 0) {
16044		lck_mtx_lock(&dtrace_meta_lock);
16045		if (dtrace_meta_pid != NULL) {
16046			ASSERT(dtrace_deferred_pid == NULL);
16047
16048			for (i = 0; i < help->dthps_nprovs; i++) {
16049				dtrace_helper_provider_remove(
16050				    &help->dthps_provs[i]->dthp_prov, p->p_pid);
16051			}
16052		} else {
16053			lck_mtx_lock(&dtrace_lock);
16054			ASSERT(help->dthps_deferred == 0 ||
16055			    help->dthps_next != NULL ||
16056			    help->dthps_prev != NULL ||
16057			    help == dtrace_deferred_pid);
16058
16059			/*
16060			 * Remove the helper from the deferred list.
16061			 */
16062			if (help->dthps_next != NULL)
16063				help->dthps_next->dthps_prev = help->dthps_prev;
16064			if (help->dthps_prev != NULL)
16065				help->dthps_prev->dthps_next = help->dthps_next;
16066			if (dtrace_deferred_pid == help) {
16067				dtrace_deferred_pid = help->dthps_next;
16068				ASSERT(help->dthps_prev == NULL);
16069			}
16070
16071			lck_mtx_unlock(&dtrace_lock);
16072		}
16073
16074		lck_mtx_unlock(&dtrace_meta_lock);
16075
16076		for (i = 0; i < help->dthps_nprovs; i++) {
16077			dtrace_helper_provider_destroy(help->dthps_provs[i]);
16078		}
16079
16080		kmem_free(help->dthps_provs, help->dthps_maxprovs *
16081		    sizeof (dtrace_helper_provider_t *));
16082	}
16083
16084	lck_mtx_lock(&dtrace_lock);
16085
16086	dtrace_vstate_fini(&help->dthps_vstate);
16087	kmem_free(help->dthps_actions,
16088	    sizeof (dtrace_helper_action_t *) * DTRACE_NHELPER_ACTIONS);
16089	kmem_free(help, sizeof (dtrace_helpers_t));
16090
16091	--dtrace_helpers;
16092	lck_mtx_unlock(&dtrace_lock);
16093}
16094
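/*
 * At fork, duplicate the parent's helper actions (deep-copying each DIFO)
 * and helper providers (by reference, bumping dthp_ref) into the child, then
 * register the duplicated providers with the framework.
 */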
16095static void
16096dtrace_helpers_duplicate(proc_t *from, proc_t *to)
16097{
16098	dtrace_helpers_t *help, *newhelp;
16099	dtrace_helper_action_t *helper, *new, *last;
16100	dtrace_difo_t *dp;
16101	dtrace_vstate_t *vstate;
16102#if !defined(__APPLE__) /* Quiet compiler warning */
16103	int i, j, sz, hasprovs = 0;
16104#else
16105	uint_t i;
16106	int j, sz, hasprovs = 0;
16107#endif /* __APPLE__ */
16108
16109	lck_mtx_lock(&dtrace_lock);
16110	ASSERT(from->p_dtrace_helpers != NULL);
16111	ASSERT(dtrace_helpers > 0);
16112
16113	help = from->p_dtrace_helpers;
16114	newhelp = dtrace_helpers_create(to);
16115	ASSERT(to->p_dtrace_helpers != NULL);
16116
16117	newhelp->dthps_generation = help->dthps_generation;
16118	vstate = &newhelp->dthps_vstate;
16119
16120	/*
16121	 * Duplicate the helper actions.
16122	 */
16123	for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
16124		if ((helper = help->dthps_actions[i]) == NULL)
16125			continue;
16126
16127		for (last = NULL; helper != NULL; helper = helper->dtha_next) {
16128			new = kmem_zalloc(sizeof (dtrace_helper_action_t),
16129			    KM_SLEEP);
16130			new->dtha_generation = helper->dtha_generation;
16131
16132			if ((dp = helper->dtha_predicate) != NULL) {
16133				dp = dtrace_difo_duplicate(dp, vstate);
16134				new->dtha_predicate = dp;
16135			}
16136
16137			new->dtha_nactions = helper->dtha_nactions;
16138			sz = sizeof (dtrace_difo_t *) * new->dtha_nactions;
16139			new->dtha_actions = kmem_alloc(sz, KM_SLEEP);
16140
16141#if !defined(__APPLE__) /* Quiet compiler warning */
16142			for (j = 0; j < new->dtha_nactions; j++) {
16143				dtrace_difo_t *dp = helper->dtha_actions[j];
16144
16145				ASSERT(dp != NULL);
16146				dp = dtrace_difo_duplicate(dp, vstate);
16147				new->dtha_actions[j] = dp;
16148			}
16149#else
16150			for (j = 0; j < new->dtha_nactions; j++) {
16151				dtrace_difo_t *dpj = helper->dtha_actions[j];
16152
16153				ASSERT(dpj != NULL);
16154				dpj = dtrace_difo_duplicate(dpj, vstate);
16155				new->dtha_actions[j] = dpj;
16156			}
16157#endif /* __APPLE__ */
16158
16159			if (last != NULL) {
16160				last->dtha_next = new;
16161			} else {
16162				newhelp->dthps_actions[i] = new;
16163			}
16164
16165			last = new;
16166		}
16167	}
16168
16169	/*
16170	 * Duplicate the helper providers and register them with the
16171	 * DTrace framework.
16172	 */
16173	if (help->dthps_nprovs > 0) {
16174		newhelp->dthps_nprovs = help->dthps_nprovs;
16175		newhelp->dthps_maxprovs = help->dthps_nprovs;
16176		newhelp->dthps_provs = kmem_alloc(newhelp->dthps_nprovs *
16177		    sizeof (dtrace_helper_provider_t *), KM_SLEEP);
16178		for (i = 0; i < newhelp->dthps_nprovs; i++) {
16179			newhelp->dthps_provs[i] = help->dthps_provs[i];
16180			newhelp->dthps_provs[i]->dthp_ref++;
16181		}
16182
16183		hasprovs = 1;
16184	}
16185
16186	lck_mtx_unlock(&dtrace_lock);
16187
16188	if (hasprovs)
16189		dtrace_helper_provider_register(to, newhelp, NULL);
16190}
16191
16192/*
16193 * DTrace Hook Functions
16194 */
16195
16196#if defined(__APPLE__)
16197/*
16198 * Routines to manipulate the modctl list within dtrace
16199 */
16200
16201modctl_t *dtrace_modctl_list;
16202
16203static void
16204dtrace_modctl_add(struct modctl * newctl)
16205{
16206	struct modctl *nextp, *prevp;
16207
16208	ASSERT(newctl != NULL);
16209	lck_mtx_assert(&mod_lock, LCK_MTX_ASSERT_OWNED);
16210
16211	// Insert the new module at the front of the list.
16212
16213	newctl->mod_next = dtrace_modctl_list;
16214	dtrace_modctl_list = newctl;
16215
16216	/*
16217	 * If a module exists with the same name, then that module
16218	 * must have been unloaded with enabled probes. We will move
16219	 * the unloaded module to the new module's stale chain and
16220	 * then stop traversing the list.
16221	 */
16222
16223	prevp = newctl;
16224	nextp = newctl->mod_next;
16225
16226	while (nextp != NULL) {
16227		if (nextp->mod_loaded) {
16228			/* This is a loaded module. Keep traversing. */
16229			prevp = nextp;
16230			nextp = nextp->mod_next;
16231			continue;
16232		}
16233		else {
16234			/* Found an unloaded module */
16235			if (strncmp (newctl->mod_modname, nextp->mod_modname, KMOD_MAX_NAME)) {
16236				/* Names don't match. Keep traversing. */
16237				prevp = nextp;
16238				nextp = nextp->mod_next;
16239				continue;
16240			}
16241			else {
16242				/* We found a stale entry, move it. We're done. */
16243				prevp->mod_next = nextp->mod_next;
16244				newctl->mod_stale = nextp;
16245				nextp->mod_next = NULL;
16246				break;
16247			}
16248		}
16249	}
16250}
16251
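/*
 * Find the modctl structure shadowing the given kmod, matching by kmod id.
 * The mod_lock must be held.  Returns NULL if the kext has not been seen.
 */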
16252static modctl_t *
16253dtrace_modctl_lookup(struct kmod_info *kmod)
16254{
16255	lck_mtx_assert(&mod_lock, LCK_MTX_ASSERT_OWNED);
16256
16257	struct modctl *ctl;
16258
16259	for (ctl = dtrace_modctl_list; ctl; ctl = ctl->mod_next) {
16260		if (ctl->mod_id == kmod->id)
16261			return (ctl);
16262	}
16263	return (NULL);
16264}
16265
16266/*
16267 * This routine is called from dtrace_module_unloaded().
16268 * It removes a modctl structure and its stale chain
16269 * from the kext shadow list.
16270 */
16271static void
16272dtrace_modctl_remove(struct modctl * ctl)
16273{
16274	ASSERT(ctl != NULL);
16275	lck_mtx_assert(&mod_lock, LCK_MTX_ASSERT_OWNED);
16276	modctl_t *prevp, *nextp, *curp;
16277
16278	// Remove stale chain first
16279	for (curp=ctl->mod_stale; curp != NULL; curp=nextp) {
16280		nextp = curp->mod_stale;
16281		/* There should NEVER be user symbols allocated at this point */
16282		ASSERT(curp->mod_user_symbols == NULL);
16283		kmem_free(curp, sizeof(modctl_t));
16284	}
16285
16286	prevp = NULL;
16287	curp = dtrace_modctl_list;
16288
16289	while (curp != ctl) {
16290		prevp = curp;
16291		curp = curp->mod_next;
16292	}
16293
16294	if (prevp != NULL) {
16295		prevp->mod_next = ctl->mod_next;
16296	}
16297	else {
16298		dtrace_modctl_list = ctl->mod_next;
16299	}
16300
16301	/* There should NEVER be user symbols allocated at this point */
16302	ASSERT(ctl->mod_user_symbols == NULL);
16303
16304	kmem_free (ctl, sizeof(modctl_t));
16305}
16306
16307#endif /* __APPLE__ */
16308
16309/*
16310 * APPLE NOTE: The kext loader will call dtrace_module_loaded
16311 * when the kext is loaded in memory, but before calling the
16312 * kext's start routine.
16313 *
16314 * Return 0 on success
16315 * Return -1 on failure
16316 */
16317
16318#if !defined (__APPLE__)
16319static void
16320dtrace_module_loaded(struct modctl *ctl)
16321#else
16322static int
16323dtrace_module_loaded(struct kmod_info *kmod, uint32_t flag)
16324#endif /* __APPLE__ */
16325{
16326	dtrace_provider_t *prv;
16327
16328#if !defined(__APPLE__)
16329	mutex_enter(&dtrace_provider_lock);
16330	mutex_enter(&mod_lock);
16331
16332	ASSERT(ctl->mod_busy);
16333#else
16334
16335	/*
16336	 * If kernel symbols have been disabled, return immediately.
16337	 * DTRACE_KERNEL_SYMBOLS_NEVER is a permanent mode; it is safe to test it without holding locks.
16338	 */
16339	if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_NEVER)
16340		return 0;
16341
16342	struct modctl *ctl = NULL;
16343	if (!kmod || kmod->address == 0 || kmod->size == 0)
16344		return(-1);
16345
16346	lck_mtx_lock(&dtrace_provider_lock);
16347	lck_mtx_lock(&mod_lock);
16348
16349	/*
16350	 * Have we seen this kext before?
16351	 */
16352
16353	ctl = dtrace_modctl_lookup(kmod);
16354
16355	if (ctl != NULL) {
16356		/* bail... we already have this kext in the modctl list */
16357		lck_mtx_unlock(&mod_lock);
16358		lck_mtx_unlock(&dtrace_provider_lock);
16359		if (dtrace_err_verbose)
16360			cmn_err(CE_WARN, "dtrace load module already exists '%s %u' is failing against '%s %u'", kmod->name, (uint_t)kmod->id, ctl->mod_modname, ctl->mod_id);
16361		return(-1);
16362	}
16363	else {
16364		ctl = kmem_alloc(sizeof(struct modctl), KM_SLEEP);
16365		if (ctl == NULL) {
16366			if (dtrace_err_verbose)
16367				cmn_err(CE_WARN, "dtrace module load '%s %u' is failing ", kmod->name, (uint_t)kmod->id);
16368			lck_mtx_unlock(&mod_lock);
16369			lck_mtx_unlock(&dtrace_provider_lock);
16370			return (-1);
16371		}
16372		ctl->mod_next = NULL;
16373		ctl->mod_stale = NULL;
16374		strlcpy (ctl->mod_modname, kmod->name, sizeof(ctl->mod_modname));
16375		ctl->mod_loadcnt = kmod->id;
16376		ctl->mod_nenabled = 0;
16377		ctl->mod_address  = kmod->address;
16378		ctl->mod_size = kmod->size;
16379		ctl->mod_id = kmod->id;
16380		ctl->mod_loaded = 1;
16381		ctl->mod_flags = 0;
16382		ctl->mod_user_symbols = NULL;
16383
16384		/*
16385		 * Find the UUID for this module, if it has one
16386		 */
16387		kernel_mach_header_t* header = (kernel_mach_header_t *)ctl->mod_address;
16388		struct load_command* load_cmd = (struct load_command *)&header[1];
16389		uint32_t i;
16390		for (i = 0; i < header->ncmds; i++) {
16391			if (load_cmd->cmd == LC_UUID) {
16392				struct uuid_command* uuid_cmd = (struct uuid_command *)load_cmd;
16393				memcpy(ctl->mod_uuid, uuid_cmd->uuid, sizeof(uuid_cmd->uuid));
16394				ctl->mod_flags |= MODCTL_HAS_UUID;
16395				break;
16396			}
16397			load_cmd = (struct load_command *)((caddr_t)load_cmd + load_cmd->cmdsize);
16398		}
16399
16400		if (ctl->mod_address == g_kernel_kmod_info.address) {
16401			ctl->mod_flags |= MODCTL_IS_MACH_KERNEL;
16402		}
16403	}
16404	dtrace_modctl_add(ctl);
16405
16406	/*
16407	 * We must hold the dtrace_lock to safely test the non-permanent dtrace_kernel_symbol_mode values.
16408	 */
16409	lck_mtx_lock(&dtrace_lock);
16410
16411	/*
16412	 * DTrace must decide if it will instrument modules lazily via
16413	 * userspace symbols (default mode), or instrument immediately via
16414	 * kernel symbols (non-default mode)
16415	 *
16416	 * When in default/lazy mode, DTrace will only support modules
16417	 * built with a valid UUID.
16418	 *
16419	 * Overriding the default can be done explicitly in one of
16420	 * the following two ways.
16421	 *
16422	 * A module can force symbols from kernel space using the plist key,
16423	 * OSBundleForceDTraceInit (see kmod.h).  If this per kext state is set,
16424	 * we fall through and instrument this module now.
16425	 *
16426	 * Or, the boot-arg, dtrace_kernel_symbol_mode, can be set to force symbols
16427	 * from kernel space (see dtrace_impl.h).  If this system state is set
16428	 * to a non-userspace mode, we fall through and instrument the module now.
16429	 */
16430
16431	if ((dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE) &&
16432	    (!(flag & KMOD_DTRACE_FORCE_INIT)))
16433	{
16434		/* We will instrument the module lazily -- this is the default */
16435		lck_mtx_unlock(&dtrace_lock);
16436		lck_mtx_unlock(&mod_lock);
16437		lck_mtx_unlock(&dtrace_provider_lock);
16438		return 0;
16439	}
16440
16441	/* We will instrument the module immediately using kernel symbols */
16442	ctl->mod_flags |= MODCTL_HAS_KERNEL_SYMBOLS;
16443
16444	lck_mtx_unlock(&dtrace_lock);
16445#endif /* __APPLE__ */
16446
16447	/*
16448	 * We're going to call each providers per-module provide operation
16449	 * specifying only this module.
16450	 */
16451	for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next)
16452		prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
16453
16454#if defined(__APPLE__)
16455	/*
16456	 * The contract with the kext loader is that once this function has completed,
16457	 * it may delete kernel symbols at will. We must set this while still holding
16458	 * the mod_lock.
16459	 */
16460	ctl->mod_flags &= ~MODCTL_HAS_KERNEL_SYMBOLS;
16461#endif
16462
16463	lck_mtx_unlock(&mod_lock);
16464	lck_mtx_unlock(&dtrace_provider_lock);
16465
16466	/*
16467	 * If we have any retained enablings, we need to match against them.
16468	 * Enabling probes requires that cpu_lock be held, and we cannot hold
16469	 * cpu_lock here -- it is legal for cpu_lock to be held when loading a
16470	 * module.  (In particular, this happens when loading scheduling
16471	 * classes.)  So if we have any retained enablings, we need to dispatch
16472	 * our task queue to do the match for us.
16473	 */
16474	lck_mtx_lock(&dtrace_lock);
16475
16476	if (dtrace_retained == NULL) {
16477		lck_mtx_unlock(&dtrace_lock);
16478#if !defined(__APPLE__)
16479		return;
16480#else
16481		return 0;
16482#endif
16483	}
16484
16485#if !defined(__APPLE__)
16486	(void) taskq_dispatch(dtrace_taskq,
16487			      (task_func_t *)dtrace_enabling_matchall, NULL, TQ_SLEEP);
16488
16489	mutex_exit(&dtrace_lock);
16490
16491	/*
16492	 * And now, for a little heuristic sleaze:  in general, we want to
16493	 * match modules as soon as they load.  However, we cannot guarantee
16494	 * this, because it would lead us to the lock ordering violation
16495	 * outlined above.  The common case, of course, is that cpu_lock is
16496	 * _not_ held -- so we delay here for a clock tick, hoping that that's
16497	 * long enough for the task queue to do its work.  If it's not, it's
16498	 * not a serious problem -- it just means that the module that we
16499	 * just loaded may not be immediately instrumentable.
16500	 */
16501	delay(1);
16502#else
16503	/* APPLE NOTE!
16504	 *
16505	 * The cpu_lock mentioned above is only held by dtrace code; Apple's xnu never actually
16506	 * holds it for any reason. Thus the comment above is invalid: we can directly invoke
16507	 * dtrace_enabling_matchall without jumping through all the hoops, and we can avoid
16508	 * the delay call as well.
16509	 */
16510	lck_mtx_unlock(&dtrace_lock);
16511
16512	dtrace_enabling_matchall();
16513
16514	return 0;
16515#endif /* __APPLE__ */
16516}
16517
16518#if !defined(__APPLE__)
16519static void
16520dtrace_module_unloaded(struct modctl *ctl)
16521{
16522	dtrace_probe_t template, *probe, *first, *next;
16523	dtrace_provider_t *prov;
16524
16525	template.dtpr_mod = ctl->mod_modname;
16526
16527	mutex_enter(&dtrace_provider_lock);
16528	mutex_enter(&mod_lock);
16529	mutex_enter(&dtrace_lock);
16530
16531	if (dtrace_bymod == NULL) {
16532		/*
16533		 * The DTrace module is loaded (obviously) but not attached;
16534		 * we don't have any work to do.
16535		 */
16536		mutex_exit(&dtrace_provider_lock);
16537		mutex_exit(&mod_lock);
16538		mutex_exit(&dtrace_lock);
16539		return;
16540	}
16541
16542	for (probe = first = dtrace_hash_lookup(dtrace_bymod, &template);
16543	    probe != NULL; probe = probe->dtpr_nextmod) {
16544		if (probe->dtpr_ecb != NULL) {
16545			mutex_exit(&dtrace_provider_lock);
16546			mutex_exit(&mod_lock);
16547			mutex_exit(&dtrace_lock);
16548
16549			/*
16550			 * This shouldn't _actually_ be possible -- we're
16551			 * unloading a module that has an enabled probe in it.
16552			 * (It's normally up to the provider to make sure that
16553			 * this can't happen.)  However, because dtps_enable()
16554			 * doesn't have a failure mode, there can be an
16555			 * enable/unload race.  Upshot:  we don't want to
16556			 * assert, but we're not going to disable the
16557			 * probe, either.
16558			 */
16559			if (dtrace_err_verbose) {
16560				cmn_err(CE_WARN, "unloaded module '%s' had "
16561				    "enabled probes", ctl->mod_modname);
16562			}
16563
16564			return;
16565		}
16566	}
16567
16568	probe = first;
16569
16570	for (first = NULL; probe != NULL; probe = next) {
16571		ASSERT(dtrace_probes[probe->dtpr_id - 1] == probe);
16572
16573		dtrace_probes[probe->dtpr_id - 1] = NULL;
16574		probe->dtpr_provider->probe_count--;
16575
16576		next = probe->dtpr_nextmod;
16577		dtrace_hash_remove(dtrace_bymod, probe);
16578		dtrace_hash_remove(dtrace_byfunc, probe);
16579		dtrace_hash_remove(dtrace_byname, probe);
16580
16581		if (first == NULL) {
16582			first = probe;
16583			probe->dtpr_nextmod = NULL;
16584		} else {
16585			probe->dtpr_nextmod = first;
16586			first = probe;
16587		}
16588	}
16589
16590	/*
16591	 * We've removed all of the module's probes from the hash chains and
16592	 * from the probe array.  Now issue a dtrace_sync() to be sure that
16593	 * everyone has cleared out from any probe array processing.
16594	 */
16595	dtrace_sync();
16596
16597	for (probe = first; probe != NULL; probe = first) {
16598		first = probe->dtpr_nextmod;
16599		prov = probe->dtpr_provider;
16600		prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, probe->dtpr_id,
16601		    probe->dtpr_arg);
16602		kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
16603		kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
16604		kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
16605		vmem_free(dtrace_arena, (void *)(uintptr_t)probe->dtpr_id, 1);
16606		kmem_free(probe, sizeof (dtrace_probe_t));
16607	}
16608
16609	mutex_exit(&dtrace_lock);
16610	mutex_exit(&mod_lock);
16611	mutex_exit(&dtrace_provider_lock);
16612}
16613#else  /* __APPLE__ */
16614
16615/*
16616 * Return 0 on success
16617 * Return -1 on failure
16618 */
16619static int
16620dtrace_module_unloaded(struct kmod_info *kmod)
16621{
16622	dtrace_probe_t template, *probe, *first, *next;
16623	dtrace_provider_t *prov;
16624        struct modctl *ctl = NULL;
16625	struct modctl *syncctl = NULL;
16626	struct modctl *nextsyncctl = NULL;
16627	int syncmode = 0;
16628
16629        lck_mtx_lock(&dtrace_provider_lock);
16630	lck_mtx_lock(&mod_lock);
16631	lck_mtx_lock(&dtrace_lock);
16632
16633	if (kmod == NULL) {
16634	    syncmode = 1;
16635	}
16636	else {
16637	    ctl = dtrace_modctl_lookup(kmod);
16638	    if (ctl == NULL)
16639	    {
16640		lck_mtx_unlock(&dtrace_lock);
16641		lck_mtx_unlock(&mod_lock);
16642		lck_mtx_unlock(&dtrace_provider_lock);
16643		return (-1);
16644	    }
16645	    ctl->mod_loaded = 0;
16646	    ctl->mod_address = 0;
16647	    ctl->mod_size = 0;
16648	}
16649
16650	if (dtrace_bymod == NULL) {
16651		/*
16652		 * The DTrace module is loaded (obviously) but not attached;
16653		 * we don't have any work to do.
16654		 */
		if (ctl != NULL)
			(void)dtrace_modctl_remove(ctl);
		lck_mtx_unlock(&dtrace_provider_lock);
		lck_mtx_unlock(&mod_lock);
		lck_mtx_unlock(&dtrace_lock);
		return (0);
16661	}
16662
	/* Syncmode set means we target and traverse the entire modctl list. */
	if (syncmode)
		nextsyncctl = dtrace_modctl_list;
16666
syncloop:
	if (syncmode) {
		/* find a stale modctl struct */
		for (syncctl = nextsyncctl; syncctl != NULL; syncctl = syncctl->mod_next) {
			if (syncctl->mod_address == 0)
				break;
		}
		if (syncctl == NULL) {
			/* We have no more work to do */
			lck_mtx_unlock(&dtrace_provider_lock);
			lck_mtx_unlock(&mod_lock);
			lck_mtx_unlock(&dtrace_lock);
			return (0);
		} else {
			/* keep track of next syncctl in case this one is removed */
			nextsyncctl = syncctl->mod_next;
			ctl = syncctl;
		}
	}
16689
16690	template.dtpr_mod = ctl->mod_modname;
16691
16692	for (probe = first = dtrace_hash_lookup(dtrace_bymod, &template);
16693	    probe != NULL; probe = probe->dtpr_nextmod) {
		if (probe->dtpr_ecb != NULL) {
			/*
			 * This shouldn't _actually_ be possible -- we're
			 * unloading a module that has an enabled probe in it.
			 * (It's normally up to the provider to make sure that
			 * this can't happen.)  However, because dtps_enable()
			 * doesn't have a failure mode, there can be an
			 * enable/unload race.  Upshot:  we don't want to
			 * assert, but we're not going to disable the
			 * probe, either.
			 */
			if (syncmode) {
				/* We're syncing, let's look at next in list */
				goto syncloop;
			}

			lck_mtx_unlock(&dtrace_provider_lock);
			lck_mtx_unlock(&mod_lock);
			lck_mtx_unlock(&dtrace_lock);

			if (dtrace_err_verbose) {
				cmn_err(CE_WARN, "unloaded module '%s' had "
				    "enabled probes", ctl->mod_modname);
			}
			return (-1);
		}
16722	}
16723
16724	probe = first;
16725
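	/*
	 * Unhook each of the module's probes from the probe array and from the
	 * by-module, by-function and by-name hashes, relinking them (via
	 * dtpr_nextmod) into a private list to be destroyed after the
	 * dtrace_sync() below.
	 */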
16726	for (first = NULL; probe != NULL; probe = next) {
16727		ASSERT(dtrace_probes[probe->dtpr_id - 1] == probe);
16728
16729		dtrace_probes[probe->dtpr_id - 1] = NULL;
16730		probe->dtpr_provider->probe_count--;
16731
16732		next = probe->dtpr_nextmod;
16733		dtrace_hash_remove(dtrace_bymod, probe);
16734		dtrace_hash_remove(dtrace_byfunc, probe);
16735		dtrace_hash_remove(dtrace_byname, probe);
16736
16737		if (first == NULL) {
16738			first = probe;
16739			probe->dtpr_nextmod = NULL;
16740		} else {
16741			probe->dtpr_nextmod = first;
16742			first = probe;
16743		}
16744	}
16745
16746	/*
16747	 * We've removed all of the module's probes from the hash chains and
16748	 * from the probe array.  Now issue a dtrace_sync() to be sure that
16749	 * everyone has cleared out from any probe array processing.
16750	 */
16751	dtrace_sync();
16752
16753	for (probe = first; probe != NULL; probe = first) {
16754		first = probe->dtpr_nextmod;
16755		prov = probe->dtpr_provider;
16756		prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, probe->dtpr_id,
16757		    probe->dtpr_arg);
16758		kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
16759		kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
16760		kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
16761		vmem_free(dtrace_arena, (void *)(uintptr_t)probe->dtpr_id, 1);
16762
16763		zfree(dtrace_probe_t_zone, probe);
16764	}
16765
16766	dtrace_modctl_remove(ctl);
16767
	if (syncmode)
		goto syncloop;
16770
16771	lck_mtx_unlock(&dtrace_lock);
16772	lck_mtx_unlock(&mod_lock);
16773	lck_mtx_unlock(&dtrace_provider_lock);
16774
	return (0);
16776}
16777#endif /* __APPLE__ */
16778
16779void
16780dtrace_suspend(void)
16781{
16782	dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_suspend));
16783}
16784
16785void
16786dtrace_resume(void)
16787{
16788	dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_resume));
16789}
16790
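/*
 * CPU configuration callback (registered via register_cpu_setup_func() in
 * dtrace_attach()):  when a CPU is configured, allocate per-CPU buffers for
 * any active anonymous state; nothing is freed on CPU_UNCONFIG.
 */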
16791static int
16792dtrace_cpu_setup(cpu_setup_t what, processorid_t cpu)
16793{
16794	lck_mtx_assert(&cpu_lock, LCK_MTX_ASSERT_OWNED);
16795	lck_mtx_lock(&dtrace_lock);
16796
16797	switch (what) {
16798	case CPU_CONFIG: {
16799		dtrace_state_t *state;
16800		dtrace_optval_t *opt, rs, c;
16801
16802		/*
16803		 * For now, we only allocate a new buffer for anonymous state.
16804		 */
16805		if ((state = dtrace_anon.dta_state) == NULL)
16806			break;
16807
16808		if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
16809			break;
16810
16811		opt = state->dts_options;
16812		c = opt[DTRACEOPT_CPU];
16813
16814		if (c != DTRACE_CPUALL && c != DTRACEOPT_UNSET && c != cpu)
16815			break;
16816
16817		/*
16818		 * Regardless of what the actual policy is, we're going to
16819		 * temporarily set our resize policy to be manual.  We're
16820		 * also going to temporarily set our CPU option to denote
16821		 * the newly configured CPU.
16822		 */
16823		rs = opt[DTRACEOPT_BUFRESIZE];
16824		opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_MANUAL;
16825		opt[DTRACEOPT_CPU] = (dtrace_optval_t)cpu;
16826
16827		(void) dtrace_state_buffers(state);
16828
16829		opt[DTRACEOPT_BUFRESIZE] = rs;
16830		opt[DTRACEOPT_CPU] = c;
16831
16832		break;
16833	}
16834
16835	case CPU_UNCONFIG:
16836		/*
16837		 * We don't free the buffer in the CPU_UNCONFIG case.  (The
16838		 * buffer will be freed when the consumer exits.)
16839		 */
16840		break;
16841
16842	default:
16843		break;
16844	}
16845
16846	lck_mtx_unlock(&dtrace_lock);
16847	return (0);
16848}
16849
16850static void
16851dtrace_cpu_setup_initial(processorid_t cpu)
16852{
16853	(void) dtrace_cpu_setup(CPU_CONFIG, cpu);
16854}
16855
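/*
 * Record a [base, limit) address range that probe context must never
 * dereference.  The range array is grown by doubling whenever it fills up.
 */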
16856static void
16857dtrace_toxrange_add(uintptr_t base, uintptr_t limit)
16858{
16859	if (dtrace_toxranges >= dtrace_toxranges_max) {
16860		int osize, nsize;
16861		dtrace_toxrange_t *range;
16862
16863		osize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
16864
16865		if (osize == 0) {
16866			ASSERT(dtrace_toxrange == NULL);
16867			ASSERT(dtrace_toxranges_max == 0);
16868			dtrace_toxranges_max = 1;
16869		} else {
16870			dtrace_toxranges_max <<= 1;
16871		}
16872
16873		nsize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
16874		range = kmem_zalloc(nsize, KM_SLEEP);
16875
16876		if (dtrace_toxrange != NULL) {
16877			ASSERT(osize != 0);
16878			bcopy(dtrace_toxrange, range, osize);
16879			kmem_free(dtrace_toxrange, osize);
16880		}
16881
16882		dtrace_toxrange = range;
16883	}
16884
16885	ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_base == NULL);
16886	ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_limit == NULL);
16887
16888	dtrace_toxrange[dtrace_toxranges].dtt_base = base;
16889	dtrace_toxrange[dtrace_toxranges].dtt_limit = limit;
16890	dtrace_toxranges++;
16891}
16892
16893/*
16894 * DTrace Driver Cookbook Functions
16895 */
16896/*ARGSUSED*/
16897static int
16898dtrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
16899{
16900#pragma unused(cmd) /* __APPLE__ */
16901	dtrace_provider_id_t id;
16902	dtrace_state_t *state = NULL;
16903	dtrace_enabling_t *enab;
16904
16905	lck_mtx_lock(&cpu_lock);
16906	lck_mtx_lock(&dtrace_provider_lock);
16907	lck_mtx_lock(&dtrace_lock);
16908
16909	if (ddi_soft_state_init(&dtrace_softstate,
16910	    sizeof (dtrace_state_t), 0) != 0) {
16911		cmn_err(CE_NOTE, "/dev/dtrace failed to initialize soft state");
16912		lck_mtx_unlock(&cpu_lock);
16913		lck_mtx_unlock(&dtrace_provider_lock);
16914		lck_mtx_unlock(&dtrace_lock);
16915		return (DDI_FAILURE);
16916	}
16917
16918#if !defined(__APPLE__)
16919	if (ddi_create_minor_node(devi, DTRACEMNR_DTRACE, S_IFCHR,
16920	    DTRACEMNRN_DTRACE, DDI_PSEUDO, NULL) == DDI_FAILURE ||
16921	    ddi_create_minor_node(devi, DTRACEMNR_HELPER, S_IFCHR,
16922	    DTRACEMNRN_HELPER, DDI_PSEUDO, NULL) == DDI_FAILURE) {
16923		cmn_err(CE_NOTE, "/dev/dtrace couldn't create minor nodes");
16924		ddi_remove_minor_node(devi, NULL);
16925		ddi_soft_state_fini(&dtrace_softstate);
16926		lck_mtx_unlock(&cpu_lock);
16927		lck_mtx_unlock(&dtrace_provider_lock);
16928		lck_mtx_unlock(&dtrace_lock);
16929		return (DDI_FAILURE);
16930	}
16931#else
	/* Darwin uses the BSD cloning device driver to automagically obtain minor device numbers. */
16933#endif /* __APPLE__ */
16934
16935	ddi_report_dev(devi);
16936	dtrace_devi = devi;
16937
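	/*
	 * Publish the hooks through which the rest of the system notifies
	 * DTrace of module load/unload, CPU configuration and startup,
	 * helper cleanup and duplication across fork, and debugger entry/exit.
	 */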
16938	dtrace_modload = dtrace_module_loaded;
16939	dtrace_modunload = dtrace_module_unloaded;
16940	dtrace_cpu_init = dtrace_cpu_setup_initial;
16941	dtrace_helpers_cleanup = dtrace_helpers_destroy;
16942	dtrace_helpers_fork = dtrace_helpers_duplicate;
16943	dtrace_cpustart_init = dtrace_suspend;
16944	dtrace_cpustart_fini = dtrace_resume;
16945	dtrace_debugger_init = dtrace_suspend;
16946	dtrace_debugger_fini = dtrace_resume;
16947
16948	register_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
16949
16950	lck_mtx_assert(&cpu_lock, LCK_MTX_ASSERT_OWNED);
16951
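	/*
	 * dtrace_arena is the source of probe IDs; dtrace_minor hands out
	 * minor numbers for consumer state.
	 */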
16952	dtrace_arena = vmem_create("dtrace", (void *)1, UINT32_MAX, 1,
16953	    NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
16954	dtrace_minor = vmem_create("dtrace_minor", (void *)DTRACEMNRN_CLONE,
16955	    UINT32_MAX - DTRACEMNRN_CLONE, 1, NULL, NULL, NULL, 0,
16956	    VM_SLEEP | VMC_IDENTIFIER);
16957	dtrace_taskq = taskq_create("dtrace_taskq", 1, maxclsyspri,
16958	    1, INT_MAX, 0);
16959
16960	dtrace_state_cache = kmem_cache_create("dtrace_state_cache",
16961	    sizeof (dtrace_dstate_percpu_t) * (int)NCPU, DTRACE_STATE_ALIGN,
16962	    NULL, NULL, NULL, NULL, NULL, 0);
16963
16964	lck_mtx_assert(&cpu_lock, LCK_MTX_ASSERT_OWNED);
16965	dtrace_bymod = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_mod),
16966	    offsetof(dtrace_probe_t, dtpr_nextmod),
16967	    offsetof(dtrace_probe_t, dtpr_prevmod));
16968
16969	dtrace_byfunc = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_func),
16970	    offsetof(dtrace_probe_t, dtpr_nextfunc),
16971	    offsetof(dtrace_probe_t, dtpr_prevfunc));
16972
16973	dtrace_byname = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_name),
16974	    offsetof(dtrace_probe_t, dtpr_nextname),
16975	    offsetof(dtrace_probe_t, dtpr_prevname));
16976
16977	if (dtrace_retain_max < 1) {
16978		cmn_err(CE_WARN, "illegal value (%lu) for dtrace_retain_max; "
16979		    "setting to 1", dtrace_retain_max);
16980		dtrace_retain_max = 1;
16981	}
16982
16983	/*
16984	 * Now discover our toxic ranges.
16985	 */
16986	dtrace_toxic_ranges(dtrace_toxrange_add);
16987
16988	/*
16989	 * Before we register ourselves as a provider to our own framework,
16990	 * we would like to assert that dtrace_provider is NULL -- but that's
16991	 * not true if we were loaded as a dependency of a DTrace provider.
16992	 * Once we've registered, we can assert that dtrace_provider is our
16993	 * pseudo provider.
16994	 */
16995	(void) dtrace_register("dtrace", &dtrace_provider_attr,
16996	    DTRACE_PRIV_NONE, 0, &dtrace_provider_ops, NULL, &id);
16997
16998	ASSERT(dtrace_provider != NULL);
16999	ASSERT((dtrace_provider_id_t)dtrace_provider == id);
17000
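	/*
	 * Create the dtrace provider's own BEGIN, END and ERROR probes.
	 */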
17001#if !defined(__APPLE__)
17002	dtrace_probeid_begin = dtrace_probe_create((dtrace_provider_id_t)
17003	    dtrace_provider, NULL, NULL, "BEGIN", 0, NULL);
17004	dtrace_probeid_end = dtrace_probe_create((dtrace_provider_id_t)
17005	    dtrace_provider, NULL, NULL, "END", 0, NULL);
17006	dtrace_probeid_error = dtrace_probe_create((dtrace_provider_id_t)
17007	    dtrace_provider, NULL, NULL, "ERROR", 1, NULL);
17008#elif defined (__x86_64__)
17009	dtrace_probeid_begin = dtrace_probe_create((dtrace_provider_id_t)
17010	    dtrace_provider, NULL, NULL, "BEGIN", 1, NULL);
17011	dtrace_probeid_end = dtrace_probe_create((dtrace_provider_id_t)
17012	    dtrace_provider, NULL, NULL, "END", 0, NULL);
17013	dtrace_probeid_error = dtrace_probe_create((dtrace_provider_id_t)
17014	    dtrace_provider, NULL, NULL, "ERROR", 3, NULL);
17015#else
17016#error Unknown Architecture
17017#endif /* __APPLE__ */
17018
17019	dtrace_anon_property();
17020	lck_mtx_unlock(&cpu_lock);
17021
17022	/*
17023	 * If DTrace helper tracing is enabled, we need to allocate the
17024	 * trace buffer and initialize the values.
17025	 */
17026	if (dtrace_helptrace_enabled) {
17027		ASSERT(dtrace_helptrace_buffer == NULL);
17028		dtrace_helptrace_buffer =
17029		    kmem_zalloc(dtrace_helptrace_bufsize, KM_SLEEP);
17030		dtrace_helptrace_next = 0;
17031	}
17032
17033	/*
17034	 * If there are already providers, we must ask them to provide their
17035	 * probes, and then match any anonymous enabling against them.  Note
17036	 * that there should be no other retained enablings at this time:
17037	 * the only retained enablings at this time should be the anonymous
17038	 * enabling.
17039	 */
17040	if (dtrace_anon.dta_enabling != NULL) {
17041		ASSERT(dtrace_retained == dtrace_anon.dta_enabling);
17042
17043#if defined(__APPLE__)
17044		/*
17045		 * If there is anonymous dof, we should switch symbol modes.
17046		 */
17047		if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE) {
17048			dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_KERNEL;
17049		}
17050#endif
17051
17052		dtrace_enabling_provide(NULL);
17053		state = dtrace_anon.dta_state;
17054
17055		/*
17056		 * We couldn't hold cpu_lock across the above call to
17057		 * dtrace_enabling_provide(), but we must hold it to actually
17058		 * enable the probes.  We have to drop all of our locks, pick
17059		 * up cpu_lock, and regain our locks before matching the
17060		 * retained anonymous enabling.
17061		 */
17062		lck_mtx_unlock(&dtrace_lock);
17063		lck_mtx_unlock(&dtrace_provider_lock);
17064
17065		lck_mtx_lock(&cpu_lock);
17066		lck_mtx_lock(&dtrace_provider_lock);
17067		lck_mtx_lock(&dtrace_lock);
17068
17069		if ((enab = dtrace_anon.dta_enabling) != NULL)
17070			(void) dtrace_enabling_match(enab, NULL);
17071
17072		lck_mtx_unlock(&cpu_lock);
17073	}
17074
17075	lck_mtx_unlock(&dtrace_lock);
17076	lck_mtx_unlock(&dtrace_provider_lock);
17077
17078	if (state != NULL) {
17079		/*
17080		 * If we created any anonymous state, set it going now.
17081		 */
17082		(void) dtrace_state_go(state, &dtrace_anon.dta_beganon);
17083	}
17084
17085	return (DDI_SUCCESS);
17086}
17087
17088/*ARGSUSED*/
17089static int
17090dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
17091{
17092#pragma unused(flag, otyp)
17093	dtrace_state_t *state;
17094	uint32_t priv;
17095	uid_t uid;
17096	zoneid_t zoneid;
17097#if defined (__APPLE__)
17098	int rv;
17099#endif /* __APPLE__ */
17100
17101#if !defined(__APPLE__)
17102	if (getminor(*devp) == DTRACEMNRN_HELPER)
17103		return (0);
17104
17105	/*
17106	 * If this wasn't an open with the "helper" minor, then it must be
17107	 * the "dtrace" minor.
17108	 */
17109	if (getminor(*devp) != DTRACEMNRN_DTRACE)
17110		return (ENXIO);
17111#else
17112	/* Darwin puts Helper on its own major device. */
17113#endif /* __APPLE__ */
17114
17115	/*
17116	 * If no DTRACE_PRIV_* bits are set in the credential, then the
17117	 * caller lacks sufficient permission to do anything with DTrace.
17118	 */
17119	dtrace_cred2priv(cred_p, &priv, &uid, &zoneid);
17120	if (priv == DTRACE_PRIV_NONE)
17121		return (EACCES);
17122
17123#if defined(__APPLE__)
17124	/*
17125	 * We delay the initialization of fasttrap as late as possible.
17126	 * It certainly can't be later than now!
17127	 */
17128	fasttrap_init();
17129#endif /* __APPLE__ */
17130
17131	/*
17132	 * Ask all providers to provide all their probes.
17133	 */
17134	lck_mtx_lock(&dtrace_provider_lock);
17135	dtrace_probe_provide(NULL, NULL);
17136	lck_mtx_unlock(&dtrace_provider_lock);
17137
17138	lck_mtx_lock(&cpu_lock);
17139	lck_mtx_lock(&dtrace_lock);
17140	dtrace_opens++;
17141	dtrace_membar_producer();
17142
17143	/*
17144	 * If the kernel debugger is active (that is, if the kernel debugger
17145	 * modified text in some way), we won't allow the open.
17146	 */
17147	if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
17148		dtrace_opens--;
17149		lck_mtx_unlock(&cpu_lock);
17150		lck_mtx_unlock(&dtrace_lock);
17151		return (EBUSY);
17152	}
17153
17154#if !defined(__APPLE__)
17155	state = dtrace_state_create(devp, cred_p);
17156	lck_mtx_unlock(&cpu_lock);
17157
17158	if (state == NULL) {
17159		if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL)
17160			(void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
17161		lck_mtx_unlock(&dtrace_lock);
17162		return (EAGAIN);
17163	}
17164
17165	lck_mtx_unlock(&dtrace_lock);
17166#else
17167	rv = dtrace_state_create(devp, cred_p, &state);
17168	lck_mtx_unlock(&cpu_lock);
17169
17170	if (rv != 0 || state == NULL) {
17171		if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL)
17172			(void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
17173		lck_mtx_unlock(&dtrace_lock);
17174		/* propagate EAGAIN or ERESTART */
17175		return (rv);
17176	}
17177
17178	lck_mtx_unlock(&dtrace_lock);
17179
17180	lck_rw_lock_exclusive(&dtrace_dof_mode_lock);
17181
17182	/*
17183	 * If we are currently lazy, transition states.
17184	 *
	 * Unlike dtrace_close, we do not need to check the
	 * value of dtrace_opens, as any positive value (and
	 * this open counts as one) means we transition states.
17188	 */
17189	if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON) {
17190		dtrace_dof_mode = DTRACE_DOF_MODE_LAZY_OFF;
17191
17192		/*
17193		 * Iterate all existing processes and load lazy dofs.
17194		 */
17195		proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS,
17196			     dtrace_lazy_dofs_proc_iterate_doit,
17197			     NULL,
17198			     dtrace_lazy_dofs_proc_iterate_filter,
17199			     NULL);
17200	}
17201
17202	lck_rw_unlock_exclusive(&dtrace_dof_mode_lock);
17203
17204	/*
17205	 * Update kernel symbol state.
17206	 *
17207	 * We must own the provider and dtrace locks.
17208	 *
	 * NOTE! It may appear that setting this value so late, after the call to
	 * dtrace_probe_provide, introduces a race. However, any kext loaded after
	 * that call and before we set LAZY_OFF will be marked as eligible for
	 * symbols from userspace. The same dtrace that is currently calling
	 * dtrace_open() (this call!) will get a list of kexts needing symbols and
	 * fill them in, thus closing the race window.
	 *
	 * We want to set this value only once we are certain it will succeed, as
	 * this significantly reduces the complexity of error exits.
17218	 */
17219	lck_mtx_lock(&dtrace_lock);
17220	if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE) {
17221		dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_KERNEL;
17222	}
17223	lck_mtx_unlock(&dtrace_lock);
17224#endif /* __APPLE__ */
17225
17226	return (0);
17227}
17228
17229/*ARGSUSED*/
17230static int
17231dtrace_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
17232{
17233#pragma unused(flag, otyp, cred_p) /* __APPLE__ */
17234	minor_t minor = getminor(dev);
17235	dtrace_state_t *state;
17236
17237#if !defined(__APPLE__)
17238	if (minor == DTRACEMNRN_HELPER)
17239		return (0);
17240#else
17241	/* Darwin puts Helper on its own major device. */
17242#endif /* __APPLE__ */
17243
17244	state = ddi_get_soft_state(dtrace_softstate, minor);
17245
17246	lck_mtx_lock(&cpu_lock);
17247	lck_mtx_lock(&dtrace_lock);
17248
17249	if (state->dts_anon) {
17250		/*
17251		 * There is anonymous state. Destroy that first.
17252		 */
17253		ASSERT(dtrace_anon.dta_state == NULL);
17254		dtrace_state_destroy(state->dts_anon);
17255	}
17256
17257	dtrace_state_destroy(state);
17258	ASSERT(dtrace_opens > 0);
17259
17260	/*
17261	 * Only relinquish control of the kernel debugger interface when there
17262	 * are no consumers and no anonymous enablings.
17263	 */
17264	if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL)
17265		(void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
17266
17267	lck_mtx_unlock(&dtrace_lock);
17268	lck_mtx_unlock(&cpu_lock);
17269
17270#if defined(__APPLE__)
17271	/*
17272	 * Lock ordering requires the dof mode lock be taken before
17273	 * the dtrace_lock.
17274	 */
17275	lck_rw_lock_exclusive(&dtrace_dof_mode_lock);
17276	lck_mtx_lock(&dtrace_lock);
17277
17278	if (dtrace_opens == 0) {
17279		/*
17280		 * If we are currently lazy-off, and this is the last close, transition to
17281		 * lazy state.
17282		 */
17283		if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_OFF) {
17284			dtrace_dof_mode = DTRACE_DOF_MODE_LAZY_ON;
17285		}
17286
17287		/*
17288		 * If we are the last dtrace client, switch back to lazy (from userspace) symbols
17289		 */
17290		if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_KERNEL) {
17291			dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE;
17292		}
17293	}
17294
17295	lck_mtx_unlock(&dtrace_lock);
17296	lck_rw_unlock_exclusive(&dtrace_dof_mode_lock);
17297
17298	/*
17299	 * Kext probes may be retained past the end of the kext's lifespan. The
17300	 * probes are kept until the last reference to them has been removed.
	 * Since closing an active dtrace context is likely to drop that last reference,
	 * let's take a shot at cleaning out the orphaned probes now.
17303	 */
17304	dtrace_module_unloaded(NULL);
17305#endif /* __APPLE__ */
17306
17307	return (0);
17308}
17309
17310#if !defined(__APPLE__)
17311/*ARGSUSED*/
17312static int
17313dtrace_ioctl_helper(int cmd, intptr_t arg, int *rv)
17314{
17315	int rval;
17316	dof_helper_t help, *dhp = NULL;
17317
17318	switch (cmd) {
17319	case DTRACEHIOC_ADDDOF:
17320		if (copyin((void *)arg, &help, sizeof (help)) != 0) {
17321			dtrace_dof_error(NULL, "failed to copyin DOF helper");
17322			return (EFAULT);
17323		}
17324
17325		dhp = &help;
17326		arg = (intptr_t)help.dofhp_dof;
17327		/*FALLTHROUGH*/
17328
17329	case DTRACEHIOC_ADD: {
17330		dof_hdr_t *dof = dtrace_dof_copyin(arg, &rval);
17331
17332		if (dof == NULL)
17333			return (rval);
17334
17335		mutex_enter(&dtrace_lock);
17336
17337		/*
17338		 * dtrace_helper_slurp() takes responsibility for the dof --
17339		 * it may free it now or it may save it and free it later.
17340		 */
17341		if ((rval = dtrace_helper_slurp(dof, dhp)) != -1) {
17342			*rv = rval;
17343			rval = 0;
17344		} else {
17345			rval = EINVAL;
17346		}
17347
17348		mutex_exit(&dtrace_lock);
17349		return (rval);
17350	}
17351
17352	case DTRACEHIOC_REMOVE: {
17353		mutex_enter(&dtrace_lock);
17354		rval = dtrace_helper_destroygen(arg);
17355		mutex_exit(&dtrace_lock);
17356
17357		return (rval);
17358	}
17359
17360	default:
17361		break;
17362	}
17363
17364	return (ENOTTY);
17365}
17366
17367/*ARGSUSED*/
17368static int
17369dtrace_ioctl(dev_t dev, u_long cmd, intptr_t arg, int md, cred_t *cr, int *rv)
17370{
17371	minor_t minor = getminor(dev);
17372	dtrace_state_t *state;
17373	int rval;
17374
17375	if (minor == DTRACEMNRN_HELPER)
17376		return (dtrace_ioctl_helper(cmd, arg, rv));
17377
17378	state = ddi_get_soft_state(dtrace_softstate, minor);
17379
17380	if (state->dts_anon) {
17381		ASSERT(dtrace_anon.dta_state == NULL);
17382		state = state->dts_anon;
17383	}
17384
17385	switch (cmd) {
17386	case DTRACEIOC_PROVIDER: {
17387		dtrace_providerdesc_t pvd;
17388		dtrace_provider_t *pvp;
17389
17390		if (copyin((void *)arg, &pvd, sizeof (pvd)) != 0)
17391			return (EFAULT);
17392
17393		pvd.dtvd_name[DTRACE_PROVNAMELEN - 1] = '\0';
17394		lck_mtx_lock(&dtrace_provider_lock);
17395
17396		for (pvp = dtrace_provider; pvp != NULL; pvp = pvp->dtpv_next) {
17397			if (strcmp(pvp->dtpv_name, pvd.dtvd_name) == 0)
17398				break;
17399		}
17400
17401		lck_mtx_unlock(&dtrace_provider_lock);
17402
17403		if (pvp == NULL)
17404			return (ESRCH);
17405
17406		bcopy(&pvp->dtpv_priv, &pvd.dtvd_priv, sizeof (dtrace_ppriv_t));
17407		bcopy(&pvp->dtpv_attr, &pvd.dtvd_attr, sizeof (dtrace_pattr_t));
17408		if (copyout(&pvd, (void *)arg, sizeof (pvd)) != 0)
17409			return (EFAULT);
17410
17411		return (0);
17412	}
17413
17414	case DTRACEIOC_EPROBE: {
17415		dtrace_eprobedesc_t epdesc;
17416		dtrace_ecb_t *ecb;
17417		dtrace_action_t *act;
17418		void *buf;
17419		size_t size;
17420		uintptr_t dest;
17421		int nrecs;
17422
17423		if (copyin((void *)arg, &epdesc, sizeof (epdesc)) != 0)
17424			return (EFAULT);
17425
17426		lck_mtx_lock(&dtrace_lock);
17427
17428		if ((ecb = dtrace_epid2ecb(state, epdesc.dtepd_epid)) == NULL) {
17429			lck_mtx_unlock(&dtrace_lock);
17430			return (EINVAL);
17431		}
17432
17433		if (ecb->dte_probe == NULL) {
17434			lck_mtx_unlock(&dtrace_lock);
17435			return (EINVAL);
17436		}
17437
17438		epdesc.dtepd_probeid = ecb->dte_probe->dtpr_id;
17439		epdesc.dtepd_uarg = ecb->dte_uarg;
17440		epdesc.dtepd_size = ecb->dte_size;
17441
17442		nrecs = epdesc.dtepd_nrecs;
17443		epdesc.dtepd_nrecs = 0;
17444		for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
17445			if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
17446				continue;
17447
17448			epdesc.dtepd_nrecs++;
17449		}
17450
17451		/*
17452		 * Now that we have the size, we need to allocate a temporary
17453		 * buffer in which to store the complete description.  We need
17454		 * the temporary buffer to be able to drop dtrace_lock()
17455		 * across the copyout(), below.
17456		 */
17457		size = sizeof (dtrace_eprobedesc_t) +
17458		    (epdesc.dtepd_nrecs * sizeof (dtrace_recdesc_t));
17459
17460		buf = kmem_alloc(size, KM_SLEEP);
17461		dest = (uintptr_t)buf;
17462
17463		bcopy(&epdesc, (void *)dest, sizeof (epdesc));
17464		dest += offsetof(dtrace_eprobedesc_t, dtepd_rec[0]);
17465
17466		for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
17467			if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
17468				continue;
17469
17470			if (nrecs-- == 0)
17471				break;
17472
17473			bcopy(&act->dta_rec, (void *)dest,
17474			    sizeof (dtrace_recdesc_t));
17475			dest += sizeof (dtrace_recdesc_t);
17476		}
17477
17478		lck_mtx_unlock(&dtrace_lock);
17479
17480		if (copyout(buf, (void *)arg, dest - (uintptr_t)buf) != 0) {
17481			kmem_free(buf, size);
17482			return (EFAULT);
17483		}
17484
17485		kmem_free(buf, size);
17486		return (0);
17487	}
17488
17489	case DTRACEIOC_AGGDESC: {
17490		dtrace_aggdesc_t aggdesc;
17491		dtrace_action_t *act;
17492		dtrace_aggregation_t *agg;
17493		int nrecs;
17494		uint32_t offs;
17495		dtrace_recdesc_t *lrec;
17496		void *buf;
17497		size_t size;
17498		uintptr_t dest;
17499
17500		if (copyin((void *)arg, &aggdesc, sizeof (aggdesc)) != 0)
17501			return (EFAULT);
17502
17503		lck_mtx_lock(&dtrace_lock);
17504
17505		if ((agg = dtrace_aggid2agg(state, aggdesc.dtagd_id)) == NULL) {
17506			lck_mtx_unlock(&dtrace_lock);
17507			return (EINVAL);
17508		}
17509
17510		aggdesc.dtagd_epid = agg->dtag_ecb->dte_epid;
17511
17512		nrecs = aggdesc.dtagd_nrecs;
17513		aggdesc.dtagd_nrecs = 0;
17514
17515		offs = agg->dtag_base;
17516		lrec = &agg->dtag_action.dta_rec;
17517		aggdesc.dtagd_size = lrec->dtrd_offset + lrec->dtrd_size - offs;
17518
17519		for (act = agg->dtag_first; ; act = act->dta_next) {
17520			ASSERT(act->dta_intuple ||
17521			    DTRACEACT_ISAGG(act->dta_kind));
17522
17523			/*
17524			 * If this action has a record size of zero, it
17525			 * denotes an argument to the aggregating action.
17526			 * Because the presence of this record doesn't (or
17527			 * shouldn't) affect the way the data is interpreted,
17528			 * we don't copy it out to save user-level the
17529			 * confusion of dealing with a zero-length record.
17530			 */
17531			if (act->dta_rec.dtrd_size == 0) {
17532				ASSERT(agg->dtag_hasarg);
17533				continue;
17534			}
17535
17536			aggdesc.dtagd_nrecs++;
17537
17538			if (act == &agg->dtag_action)
17539				break;
17540		}
17541
17542		/*
17543		 * Now that we have the size, we need to allocate a temporary
17544		 * buffer in which to store the complete description.  We need
17545		 * the temporary buffer to be able to drop dtrace_lock()
17546		 * across the copyout(), below.
17547		 */
17548		size = sizeof (dtrace_aggdesc_t) +
17549		    (aggdesc.dtagd_nrecs * sizeof (dtrace_recdesc_t));
17550
17551		buf = kmem_alloc(size, KM_SLEEP);
17552		dest = (uintptr_t)buf;
17553
17554		bcopy(&aggdesc, (void *)dest, sizeof (aggdesc));
17555		dest += offsetof(dtrace_aggdesc_t, dtagd_rec[0]);
17556
17557		for (act = agg->dtag_first; ; act = act->dta_next) {
17558			dtrace_recdesc_t rec = act->dta_rec;
17559
17560			/*
17561			 * See the comment in the above loop for why we pass
17562			 * over zero-length records.
17563			 */
17564			if (rec.dtrd_size == 0) {
17565				ASSERT(agg->dtag_hasarg);
17566				continue;
17567			}
17568
17569			if (nrecs-- == 0)
17570				break;
17571
17572			rec.dtrd_offset -= offs;
17573			bcopy(&rec, (void *)dest, sizeof (rec));
17574			dest += sizeof (dtrace_recdesc_t);
17575
17576			if (act == &agg->dtag_action)
17577				break;
17578		}
17579
17580		lck_mtx_unlock(&dtrace_lock);
17581
17582		if (copyout(buf, (void *)arg, dest - (uintptr_t)buf) != 0) {
17583			kmem_free(buf, size);
17584			return (EFAULT);
17585		}
17586
17587		kmem_free(buf, size);
17588		return (0);
17589	}
17590
17591	case DTRACEIOC_ENABLE: {
17592		dof_hdr_t *dof;
17593		dtrace_enabling_t *enab = NULL;
17594		dtrace_vstate_t *vstate;
17595		int err = 0;
17596
17597		*rv = 0;
17598
17599		/*
17600		 * If a NULL argument has been passed, we take this as our
17601		 * cue to reevaluate our enablings.
17602		 */
17603		if (arg == NULL) {
17604			dtrace_enabling_matchall();
17605
17606			return (0);
17607		}
17608
17609		if ((dof = dtrace_dof_copyin(arg, &rval)) == NULL)
17610			return (rval);
17611
17612		lck_mtx_lock(&cpu_lock);
17613		lck_mtx_lock(&dtrace_lock);
17614		vstate = &state->dts_vstate;
17615
17616		if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
17617			lck_mtx_unlock(&dtrace_lock);
17618			lck_mtx_unlock(&cpu_lock);
17619			dtrace_dof_destroy(dof);
17620			return (EBUSY);
17621		}
17622
17623		if (dtrace_dof_slurp(dof, vstate, cr, &enab, 0, B_TRUE) != 0) {
17624			lck_mtx_unlock(&dtrace_lock);
17625			lck_mtx_unlock(&cpu_lock);
17626			dtrace_dof_destroy(dof);
17627			return (EINVAL);
17628		}
17629
17630		if ((rval = dtrace_dof_options(dof, state)) != 0) {
17631			dtrace_enabling_destroy(enab);
17632			lck_mtx_unlock(&dtrace_lock);
17633			lck_mtx_unlock(&cpu_lock);
17634			dtrace_dof_destroy(dof);
17635			return (rval);
17636		}
17637
17638		if ((err = dtrace_enabling_match(enab, rv)) == 0) {
17639			err = dtrace_enabling_retain(enab);
17640		} else {
17641			dtrace_enabling_destroy(enab);
17642		}
17643
17644		lck_mtx_unlock(&cpu_lock);
17645		lck_mtx_unlock(&dtrace_lock);
17646		dtrace_dof_destroy(dof);
17647
17648		return (err);
17649	}
17650
17651	case DTRACEIOC_REPLICATE: {
17652		dtrace_repldesc_t desc;
17653		dtrace_probedesc_t *match = &desc.dtrpd_match;
17654		dtrace_probedesc_t *create = &desc.dtrpd_create;
17655		int err;
17656
17657		if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
17658			return (EFAULT);
17659
17660		match->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
17661		match->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
17662		match->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
17663		match->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
17664
17665		create->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
17666		create->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
17667		create->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
17668		create->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
17669
17670		lck_mtx_lock(&dtrace_lock);
17671		err = dtrace_enabling_replicate(state, match, create);
17672		lck_mtx_unlock(&dtrace_lock);
17673
17674		return (err);
17675	}
17676
17677	case DTRACEIOC_PROBEMATCH:
17678	case DTRACEIOC_PROBES: {
17679		dtrace_probe_t *probe = NULL;
17680		dtrace_probedesc_t desc;
17681		dtrace_probekey_t pkey;
17682		dtrace_id_t i;
17683		int m = 0;
17684		uint32_t priv;
17685		uid_t uid;
17686		zoneid_t zoneid;
17687
17688		if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
17689			return (EFAULT);
17690
17691		desc.dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
17692		desc.dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
17693		desc.dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
17694		desc.dtpd_name[DTRACE_NAMELEN - 1] = '\0';
17695
17696		/*
17697		 * Before we attempt to match this probe, we want to give
17698		 * all providers the opportunity to provide it.
17699		 */
17700		if (desc.dtpd_id == DTRACE_IDNONE) {
17701			lck_mtx_lock(&dtrace_provider_lock);
17702			dtrace_probe_provide(&desc, NULL);
17703			lck_mtx_unlock(&dtrace_provider_lock);
17704			desc.dtpd_id++;
17705		}
17706
17707		if (cmd == DTRACEIOC_PROBEMATCH)  {
17708			dtrace_probekey(&desc, &pkey);
17709			pkey.dtpk_id = DTRACE_IDNONE;
17710		}
17711
17712		dtrace_cred2priv(cr, &priv, &uid, &zoneid);
17713
17714		lck_mtx_lock(&dtrace_lock);
17715
17716		if (cmd == DTRACEIOC_PROBEMATCH) {
17717			for (i = desc.dtpd_id; i <= dtrace_nprobes; i++) {
17718				if ((probe = dtrace_probes[i - 1]) != NULL &&
17719				    (m = dtrace_match_probe(probe, &pkey,
17720				    priv, uid, zoneid)) != 0)
17721					break;
17722			}
17723
17724			if (m < 0) {
17725				lck_mtx_unlock(&dtrace_lock);
17726				return (EINVAL);
17727			}
17728
17729		} else {
17730			for (i = desc.dtpd_id; i <= dtrace_nprobes; i++) {
17731				if ((probe = dtrace_probes[i - 1]) != NULL &&
17732				    dtrace_match_priv(probe, priv, uid, zoneid))
17733					break;
17734			}
17735		}
17736
17737		if (probe == NULL) {
17738			lck_mtx_unlock(&dtrace_lock);
17739			return (ESRCH);
17740		}
17741
17742		dtrace_probe_description(probe, &desc);
17743		lck_mtx_unlock(&dtrace_lock);
17744
17745		if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
17746			return (EFAULT);
17747
17748		return (0);
17749	}
17750
17751	case DTRACEIOC_PROBEARG: {
17752		dtrace_argdesc_t desc;
17753		dtrace_probe_t *probe;
17754		dtrace_provider_t *prov;
17755
17756		if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
17757			return (EFAULT);
17758
17759		if (desc.dtargd_id == DTRACE_IDNONE)
17760			return (EINVAL);
17761
17762		if (desc.dtargd_ndx == DTRACE_ARGNONE)
17763			return (EINVAL);
17764
17765		lck_mtx_lock(&dtrace_provider_lock);
17766		lck_mtx_lock(&mod_lock);
17767		lck_mtx_lock(&dtrace_lock);
17768
17769		if (desc.dtargd_id > dtrace_nprobes) {
17770			lck_mtx_unlock(&dtrace_lock);
17771			lck_mtx_unlock(&mod_lock);
17772			lck_mtx_unlock(&dtrace_provider_lock);
17773			return (EINVAL);
17774		}
17775
17776		if ((probe = dtrace_probes[desc.dtargd_id - 1]) == NULL) {
17777			lck_mtx_unlock(&dtrace_lock);
17778			lck_mtx_unlock(&mod_lock);
17779			lck_mtx_unlock(&dtrace_provider_lock);
17780			return (EINVAL);
17781		}
17782
17783		lck_mtx_unlock(&dtrace_lock);
17784
17785		prov = probe->dtpr_provider;
17786
17787		if (prov->dtpv_pops.dtps_getargdesc == NULL) {
17788			/*
17789			 * There isn't any typed information for this probe.
17790			 * Set the argument number to DTRACE_ARGNONE.
17791			 */
17792			desc.dtargd_ndx = DTRACE_ARGNONE;
17793		} else {
17794			desc.dtargd_native[0] = '\0';
17795			desc.dtargd_xlate[0] = '\0';
17796			desc.dtargd_mapping = desc.dtargd_ndx;
17797
17798			prov->dtpv_pops.dtps_getargdesc(prov->dtpv_arg,
17799			    probe->dtpr_id, probe->dtpr_arg, &desc);
17800		}
17801
17802		lck_mtx_unlock(&mod_lock);
17803		lck_mtx_unlock(&dtrace_provider_lock);
17804
17805		if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
17806			return (EFAULT);
17807
17808		return (0);
17809	}
17810
17811	case DTRACEIOC_GO: {
17812		processorid_t cpuid;
17813		rval = dtrace_state_go(state, &cpuid);
17814
17815		if (rval != 0)
17816			return (rval);
17817
17818		if (copyout(&cpuid, (void *)arg, sizeof (cpuid)) != 0)
17819			return (EFAULT);
17820
17821		return (0);
17822	}
17823
17824	case DTRACEIOC_STOP: {
17825		processorid_t cpuid;
17826
17827		lck_mtx_lock(&dtrace_lock);
17828		rval = dtrace_state_stop(state, &cpuid);
17829		lck_mtx_unlock(&dtrace_lock);
17830
17831		if (rval != 0)
17832			return (rval);
17833
17834		if (copyout(&cpuid, (void *)arg, sizeof (cpuid)) != 0)
17835			return (EFAULT);
17836
17837		return (0);
17838	}
17839
17840	case DTRACEIOC_DOFGET: {
17841		dof_hdr_t hdr, *dof;
17842		uint64_t len;
17843
17844		if (copyin((void *)arg, &hdr, sizeof (hdr)) != 0)
17845			return (EFAULT);
17846
17847		lck_mtx_lock(&dtrace_lock);
17848		dof = dtrace_dof_create(state);
17849		lck_mtx_unlock(&dtrace_lock);
17850
17851		len = MIN(hdr.dofh_loadsz, dof->dofh_loadsz);
17852		rval = copyout(dof, (void *)arg, len);
17853		dtrace_dof_destroy(dof);
17854
17855		return (rval == 0 ? 0 : EFAULT);
17856	}
17857
17858	case DTRACEIOC_AGGSNAP:
17859	case DTRACEIOC_BUFSNAP: {
17860		dtrace_bufdesc_t desc;
17861		caddr_t cached;
17862		dtrace_buffer_t *buf;
17863
17864		if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
17865			return (EFAULT);
17866
17867		if (desc.dtbd_cpu < 0 || desc.dtbd_cpu >= NCPU)
17868			return (EINVAL);
17869
17870		lck_mtx_lock(&dtrace_lock);
17871
17872		if (cmd == DTRACEIOC_BUFSNAP) {
17873			buf = &state->dts_buffer[desc.dtbd_cpu];
17874		} else {
17875			buf = &state->dts_aggbuffer[desc.dtbd_cpu];
17876		}
17877
17878		if (buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL)) {
17879			size_t sz = buf->dtb_offset;
17880
17881			if (state->dts_activity != DTRACE_ACTIVITY_STOPPED) {
17882				lck_mtx_unlock(&dtrace_lock);
17883				return (EBUSY);
17884			}
17885
17886			/*
17887			 * If this buffer has already been consumed, we're
17888			 * going to indicate that there's nothing left here
17889			 * to consume.
17890			 */
17891			if (buf->dtb_flags & DTRACEBUF_CONSUMED) {
17892				lck_mtx_unlock(&dtrace_lock);
17893
17894				desc.dtbd_size = 0;
17895				desc.dtbd_drops = 0;
17896				desc.dtbd_errors = 0;
17897				desc.dtbd_oldest = 0;
17898				sz = sizeof (desc);
17899
17900				if (copyout(&desc, (void *)arg, sz) != 0)
17901					return (EFAULT);
17902
17903				return (0);
17904			}
17905
17906			/*
17907			 * If this is a ring buffer that has wrapped, we want
17908			 * to copy the whole thing out.
17909			 */
17910			if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
17911				dtrace_buffer_polish(buf);
17912				sz = buf->dtb_size;
17913			}
17914
17915			if (copyout(buf->dtb_tomax, desc.dtbd_data, sz) != 0) {
17916				lck_mtx_unlock(&dtrace_lock);
17917				return (EFAULT);
17918			}
17919
17920			desc.dtbd_size = sz;
17921			desc.dtbd_drops = buf->dtb_drops;
17922			desc.dtbd_errors = buf->dtb_errors;
17923			desc.dtbd_oldest = buf->dtb_xamot_offset;
17924
17925			lck_mtx_unlock(&dtrace_lock);
17926
17927			if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
17928				return (EFAULT);
17929
17930			buf->dtb_flags |= DTRACEBUF_CONSUMED;
17931
17932			return (0);
17933		}
17934
17935		if (buf->dtb_tomax == NULL) {
17936			ASSERT(buf->dtb_xamot == NULL);
17937			lck_mtx_unlock(&dtrace_lock);
17938			return (ENOENT);
17939		}
17940
17941		cached = buf->dtb_tomax;
17942		ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
17943
17944		dtrace_xcall(desc.dtbd_cpu,
17945		    (dtrace_xcall_t)dtrace_buffer_switch, buf);
17946
17947		state->dts_errors += buf->dtb_xamot_errors;
17948
17949		/*
17950		 * If the buffers did not actually switch, then the cross call
17951		 * did not take place -- presumably because the given CPU is
17952		 * not in the ready set.  If this is the case, we'll return
17953		 * ENOENT.
17954		 */
17955		if (buf->dtb_tomax == cached) {
17956			ASSERT(buf->dtb_xamot != cached);
17957			lck_mtx_unlock(&dtrace_lock);
17958			return (ENOENT);
17959		}
17960
17961		ASSERT(cached == buf->dtb_xamot);
17962
17963		/*
17964		 * We have our snapshot; now copy it out.
17965		 */
17966		if (copyout(buf->dtb_xamot, desc.dtbd_data,
17967		    buf->dtb_xamot_offset) != 0) {
17968			lck_mtx_unlock(&dtrace_lock);
17969			return (EFAULT);
17970		}
17971
17972		desc.dtbd_size = buf->dtb_xamot_offset;
17973		desc.dtbd_drops = buf->dtb_xamot_drops;
17974		desc.dtbd_errors = buf->dtb_xamot_errors;
17975		desc.dtbd_oldest = 0;
17976
17977		lck_mtx_unlock(&dtrace_lock);
17978
17979		/*
17980		 * Finally, copy out the buffer description.
17981		 */
17982		if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
17983			return (EFAULT);
17984
17985		return (0);
17986	}
17987
17988	case DTRACEIOC_CONF: {
17989		dtrace_conf_t conf;
17990
17991		bzero(&conf, sizeof (conf));
17992		conf.dtc_difversion = DIF_VERSION;
17993		conf.dtc_difintregs = DIF_DIR_NREGS;
17994		conf.dtc_diftupregs = DIF_DTR_NREGS;
17995		conf.dtc_ctfmodel = CTF_MODEL_NATIVE;
17996
17997		if (copyout(&conf, (void *)arg, sizeof (conf)) != 0)
17998			return (EFAULT);
17999
18000		return (0);
18001	}
18002
18003	case DTRACEIOC_STATUS: {
18004		dtrace_status_t stat;
18005		dtrace_dstate_t *dstate;
18006		int i, j;
18007		uint64_t nerrs;
18008
18009		/*
18010		 * See the comment in dtrace_state_deadman() for the reason
18011		 * for setting dts_laststatus to INT64_MAX before setting
18012		 * it to the correct value.
18013		 */
18014		state->dts_laststatus = INT64_MAX;
18015		dtrace_membar_producer();
18016		state->dts_laststatus = dtrace_gethrtime();
18017
18018		bzero(&stat, sizeof (stat));
18019
18020		lck_mtx_lock(&dtrace_lock);
18021
18022		if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE) {
18023			lck_mtx_unlock(&dtrace_lock);
18024			return (ENOENT);
18025		}
18026
18027		if (state->dts_activity == DTRACE_ACTIVITY_DRAINING)
18028			stat.dtst_exiting = 1;
18029
18030		nerrs = state->dts_errors;
18031		dstate = &state->dts_vstate.dtvs_dynvars;
18032
18033		for (i = 0; i < NCPU; i++) {
18034			dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[i];
18035
18036			stat.dtst_dyndrops += dcpu->dtdsc_drops;
18037			stat.dtst_dyndrops_dirty += dcpu->dtdsc_dirty_drops;
18038			stat.dtst_dyndrops_rinsing += dcpu->dtdsc_rinsing_drops;
18039
18040			if (state->dts_buffer[i].dtb_flags & DTRACEBUF_FULL)
18041				stat.dtst_filled++;
18042
18043			nerrs += state->dts_buffer[i].dtb_errors;
18044
18045			for (j = 0; j < state->dts_nspeculations; j++) {
18046				dtrace_speculation_t *spec;
18047				dtrace_buffer_t *buf;
18048
18049				spec = &state->dts_speculations[j];
18050				buf = &spec->dtsp_buffer[i];
18051				stat.dtst_specdrops += buf->dtb_xamot_drops;
18052			}
18053		}
18054
18055		stat.dtst_specdrops_busy = state->dts_speculations_busy;
18056		stat.dtst_specdrops_unavail = state->dts_speculations_unavail;
18057		stat.dtst_stkstroverflows = state->dts_stkstroverflows;
18058		stat.dtst_dblerrors = state->dts_dblerrors;
18059		stat.dtst_killed =
18060		    (state->dts_activity == DTRACE_ACTIVITY_KILLED);
18061		stat.dtst_errors = nerrs;
18062
18063		lck_mtx_unlock(&dtrace_lock);
18064
18065		if (copyout(&stat, (void *)arg, sizeof (stat)) != 0)
18066			return (EFAULT);
18067
18068		return (0);
18069	}
18070
18071	case DTRACEIOC_FORMAT: {
18072		dtrace_fmtdesc_t fmt;
18073		char *str;
18074		int len;
18075
18076		if (copyin((void *)arg, &fmt, sizeof (fmt)) != 0)
18077			return (EFAULT);
18078
18079		lck_mtx_lock(&dtrace_lock);
18080
18081		if (fmt.dtfd_format == 0 ||
18082		    fmt.dtfd_format > state->dts_nformats) {
18083			lck_mtx_unlock(&dtrace_lock);
18084			return (EINVAL);
18085		}
18086
18087		/*
18088		 * Format strings are allocated contiguously and they are
18089		 * never freed; if a format index is less than the number
18090		 * of formats, we can assert that the format map is non-NULL
18091		 * and that the format for the specified index is non-NULL.
18092		 */
18093		ASSERT(state->dts_formats != NULL);
18094		str = state->dts_formats[fmt.dtfd_format - 1];
18095		ASSERT(str != NULL);
18096
18097		len = strlen(str) + 1;
18098
18099		if (len > fmt.dtfd_length) {
18100			fmt.dtfd_length = len;
18101
18102			if (copyout(&fmt, (void *)arg, sizeof (fmt)) != 0) {
18103				lck_mtx_unlock(&dtrace_lock);
18104				return (EINVAL);
18105			}
18106		} else {
18107			if (copyout(str, fmt.dtfd_string, len) != 0) {
18108				lck_mtx_unlock(&dtrace_lock);
18109				return (EINVAL);
18110			}
18111		}
18112
18113		lck_mtx_unlock(&dtrace_lock);
18114		return (0);
18115	}
18116
18117	default:
18118		break;
18119	}
18120
18121	return (ENOTTY);
18122}
18123#else
18124/*ARGSUSED*/
18125static int
18126dtrace_ioctl_helper(u_long cmd, caddr_t arg, int *rv)
18127{
18128#pragma unused(rv)
18129	/*
18130	 * Safe to check this outside the dof mode lock
18131	 */
18132	if (dtrace_dof_mode == DTRACE_DOF_MODE_NEVER)
18133		return KERN_SUCCESS;
18134
18135	switch (cmd) {
	case DTRACEHIOC_ADDDOF:
		{
18138			dof_helper_t *dhp = NULL;
18139			size_t dof_ioctl_data_size;
18140			dof_ioctl_data_t* multi_dof;
18141			unsigned int i;
18142			int rval = 0;
18143			user_addr_t user_address = *(user_addr_t*)arg;
18144			uint64_t dof_count;
18145			int multi_dof_claimed = 0;
18146			proc_t* p = current_proc();
18147
18148			/*
18149			 * Read the number of DOF sections being passed in.
18150			 */
18151			if (copyin(user_address + offsetof(dof_ioctl_data_t, dofiod_count),
18152				   &dof_count,
18153				   sizeof(dof_count))) {
18154				dtrace_dof_error(NULL, "failed to copyin dofiod_count");
18155				return (EFAULT);
18156			}
18157
18158			/*
18159			 * Range check the count.
18160			 */
18161			if (dof_count == 0 || dof_count > 1024) {
18162				dtrace_dof_error(NULL, "dofiod_count is not valid");
18163				return (EINVAL);
18164			}
18165
18166			/*
18167			 * Allocate a correctly sized structure and copyin the data.
18168			 */
18169			dof_ioctl_data_size = DOF_IOCTL_DATA_T_SIZE(dof_count);
18170			if ((multi_dof = kmem_alloc(dof_ioctl_data_size, KM_SLEEP)) == NULL)
18171				return (ENOMEM);
18172
			/* NOTE! From this point on we must exit via the cleanup label so multi_dof is freed. */
18174			if (copyin(user_address, multi_dof, dof_ioctl_data_size) != 0) {
18175				dtrace_dof_error(NULL, "failed copyin of dof_ioctl_data_t");
18176				rval = EFAULT;
18177				goto cleanup;
18178			}
18179
18180			/*
18181			 * Check that the count didn't change between the first copyin and the second.
18182			 */
18183			if (multi_dof->dofiod_count != dof_count) {
18184				rval = EINVAL;
18185				goto cleanup;
18186			}
18187
18188			/*
18189			 * Try to process lazily first.
18190			 */
18191			rval = dtrace_lazy_dofs_add(p, multi_dof, &multi_dof_claimed);
18192
18193			/*
18194			 * If rval is EACCES, we must be non-lazy.
18195			 */
18196			if (rval == EACCES) {
18197				rval = 0;
18198				/*
18199				 * Process each dof_helper_t
18200				 */
18201				i = 0;
18202				do {
18203					dhp = &multi_dof->dofiod_helpers[i];
18204
18205					dof_hdr_t *dof = dtrace_dof_copyin(dhp->dofhp_dof, &rval);
18206
18207					if (dof != NULL) {
18208						lck_mtx_lock(&dtrace_lock);
18209
18210						/*
18211						 * dtrace_helper_slurp() takes responsibility for the dof --
18212						 * it may free it now or it may save it and free it later.
18213						 */
18214						if ((dhp->dofhp_dof = (uint64_t)dtrace_helper_slurp(p, dof, dhp)) == -1ULL) {
18215							rval = EINVAL;
18216						}
18217
18218						lck_mtx_unlock(&dtrace_lock);
18219					}
18220				} while (++i < multi_dof->dofiod_count && rval == 0);
18221			}
18222
18223			/*
18224			 * We need to copyout the multi_dof struct, because it contains
18225			 * the generation (unique id) values needed to call DTRACEHIOC_REMOVE
18226			 *
18227			 * This could certainly be better optimized.
18228			 */
18229			if (copyout(multi_dof, user_address, dof_ioctl_data_size) != 0) {
18230				dtrace_dof_error(NULL, "failed copyout of dof_ioctl_data_t");
18231				/* Don't overwrite pre-existing error code */
18232				if (rval == 0) rval = EFAULT;
18233			}
18234
18235		cleanup:
			/*
			 * Free the multi_dof allocation unless the lazy dofs code
			 * took ownership of it (multi_dof_claimed).
			 */
18239			if (multi_dof != NULL && !multi_dof_claimed) {
18240				kmem_free(multi_dof, dof_ioctl_data_size);
18241			}
18242
18243			return rval;
18244		}
18245
18246		case DTRACEHIOC_REMOVE: {
18247			int generation = *(int*)arg;
18248			proc_t* p = current_proc();
18249
18250			/*
18251			 * Try lazy first.
18252			 */
18253			int rval = dtrace_lazy_dofs_remove(p, generation);
18254
18255			/*
18256			 * EACCES means non-lazy
18257			 */
18258			if (rval == EACCES) {
18259				lck_mtx_lock(&dtrace_lock);
18260				rval = dtrace_helper_destroygen(p, generation);
18261				lck_mtx_unlock(&dtrace_lock);
18262			}
18263
18264			return (rval);
18265		}
18266
18267		default:
18268			break;
18269	}
18270
18271	return ENOTTY;
18272}
18273
18274/*ARGSUSED*/
18275static int
18276dtrace_ioctl(dev_t dev, u_long cmd, user_addr_t arg, int md, cred_t *cr, int *rv)
18277{
18278#pragma unused(md)
18279	minor_t minor = getminor(dev);
18280	dtrace_state_t *state;
18281	int rval;
18282
18283	/* Darwin puts Helper on its own major device. */
18284
18285	state = ddi_get_soft_state(dtrace_softstate, minor);
18286
18287	if (state->dts_anon) {
		ASSERT(dtrace_anon.dta_state == NULL);
		state = state->dts_anon;
18290	}
18291
18292	switch (cmd) {
18293	case DTRACEIOC_PROVIDER: {
18294		dtrace_providerdesc_t pvd;
18295		dtrace_provider_t *pvp;
18296
18297		if (copyin(arg, &pvd, sizeof (pvd)) != 0)
18298			return (EFAULT);
18299
18300		pvd.dtvd_name[DTRACE_PROVNAMELEN - 1] = '\0';
18301		lck_mtx_lock(&dtrace_provider_lock);
18302
18303		for (pvp = dtrace_provider; pvp != NULL; pvp = pvp->dtpv_next) {
18304			if (strncmp(pvp->dtpv_name, pvd.dtvd_name, DTRACE_PROVNAMELEN) == 0)
18305				break;
18306		}
18307
18308		lck_mtx_unlock(&dtrace_provider_lock);
18309
18310		if (pvp == NULL)
18311			return (ESRCH);
18312
18313		bcopy(&pvp->dtpv_priv, &pvd.dtvd_priv, sizeof (dtrace_ppriv_t));
18314		bcopy(&pvp->dtpv_attr, &pvd.dtvd_attr, sizeof (dtrace_pattr_t));
18315		if (copyout(&pvd, arg, sizeof (pvd)) != 0)
18316			return (EFAULT);
18317
18318		return (0);
18319	}
18320
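	/*
	 * DTRACEIOC_EPROBE:  copy out the description of an enabled probe
	 * (ECB) -- its probe ID, user argument and size -- followed by one
	 * record per non-aggregating, non-tuple action, limited to the number
	 * of records the caller has room for (dtepd_nrecs).
	 */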
18321	case DTRACEIOC_EPROBE: {
18322		dtrace_eprobedesc_t epdesc;
18323		dtrace_ecb_t *ecb;
18324		dtrace_action_t *act;
18325		void *buf;
18326		size_t size;
18327		uintptr_t dest;
18328		int nrecs;
18329
18330		if (copyin(arg, &epdesc, sizeof (epdesc)) != 0)
18331			return (EFAULT);
18332
18333		lck_mtx_lock(&dtrace_lock);
18334
18335		if ((ecb = dtrace_epid2ecb(state, epdesc.dtepd_epid)) == NULL) {
18336			lck_mtx_unlock(&dtrace_lock);
18337			return (EINVAL);
18338		}
18339
18340		if (ecb->dte_probe == NULL) {
18341			lck_mtx_unlock(&dtrace_lock);
18342			return (EINVAL);
18343		}
18344
18345		epdesc.dtepd_probeid = ecb->dte_probe->dtpr_id;
18346		epdesc.dtepd_uarg = ecb->dte_uarg;
18347		epdesc.dtepd_size = ecb->dte_size;
18348
18349		nrecs = epdesc.dtepd_nrecs;
18350		epdesc.dtepd_nrecs = 0;
18351		for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
18352			if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
18353				continue;
18354
18355			epdesc.dtepd_nrecs++;
18356		}
18357
18358		/*
18359		 * Now that we have the size, we need to allocate a temporary
18360		 * buffer in which to store the complete description.  We need
18361		 * the temporary buffer to be able to drop dtrace_lock()
18362		 * across the copyout(), below.
18363		 */
18364		size = sizeof (dtrace_eprobedesc_t) +
18365			(epdesc.dtepd_nrecs * sizeof (dtrace_recdesc_t));
18366
18367		buf = kmem_alloc(size, KM_SLEEP);
18368		dest = (uintptr_t)buf;
18369
18370		bcopy(&epdesc, (void *)dest, sizeof (epdesc));
18371		dest += offsetof(dtrace_eprobedesc_t, dtepd_rec[0]);
18372
18373		for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
18374			if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
18375				continue;
18376
18377			if (nrecs-- == 0)
18378				break;
18379
18380			bcopy(&act->dta_rec, (void *)dest,
18381			sizeof (dtrace_recdesc_t));
18382			dest += sizeof (dtrace_recdesc_t);
18383		}
18384
18385		lck_mtx_unlock(&dtrace_lock);
18386
18387		if (copyout(buf, arg, dest - (uintptr_t)buf) != 0) {
18388			kmem_free(buf, size);
18389			return (EFAULT);
18390		}
18391
18392		kmem_free(buf, size);
18393		return (0);
18394	}
18395
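	/*
	 * DTRACEIOC_AGGDESC:  copy out the description of an aggregation --
	 * the EPID of its ECB and its total size -- followed by one record
	 * per key/value action, with offsets rebased to the start of the
	 * aggregation and the count limited by the caller's dtagd_nrecs.
	 */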
18396	case DTRACEIOC_AGGDESC: {
18397		dtrace_aggdesc_t aggdesc;
18398		dtrace_action_t *act;
18399		dtrace_aggregation_t *agg;
18400		int nrecs;
18401		uint32_t offs;
18402		dtrace_recdesc_t *lrec;
18403		void *buf;
18404		size_t size;
18405		uintptr_t dest;
18406
18407		if (copyin(arg, &aggdesc, sizeof (aggdesc)) != 0)
18408			return (EFAULT);
18409
18410		lck_mtx_lock(&dtrace_lock);
18411
18412		if ((agg = dtrace_aggid2agg(state, aggdesc.dtagd_id)) == NULL) {
18413			lck_mtx_unlock(&dtrace_lock);
18414			return (EINVAL);
18415		}
18416
18417		aggdesc.dtagd_epid = agg->dtag_ecb->dte_epid;
18418
18419		nrecs = aggdesc.dtagd_nrecs;
18420		aggdesc.dtagd_nrecs = 0;
18421
18422		offs = agg->dtag_base;
18423		lrec = &agg->dtag_action.dta_rec;
18424		aggdesc.dtagd_size = lrec->dtrd_offset + lrec->dtrd_size - offs;
18425
18426		for (act = agg->dtag_first; ; act = act->dta_next) {
18427			ASSERT(act->dta_intuple ||
18428			DTRACEACT_ISAGG(act->dta_kind));
18429
18430			/*
18431			 * If this action has a record size of zero, it
18432			 * denotes an argument to the aggregating action.
18433			 * Because the presence of this record doesn't (or
18434			 * shouldn't) affect the way the data is interpreted,
18435			 * we don't copy it out to save user-level the
18436			 * confusion of dealing with a zero-length record.
18437			 */
18438			if (act->dta_rec.dtrd_size == 0) {
18439				ASSERT(agg->dtag_hasarg);
18440				continue;
18441			}
18442
18443			aggdesc.dtagd_nrecs++;
18444
18445			if (act == &agg->dtag_action)
18446				break;
18447		}
18448
18449		/*
18450		 * Now that we have the size, we need to allocate a temporary
18451		 * buffer in which to store the complete description.  We need
18452		 * the temporary buffer to be able to drop dtrace_lock()
18453		 * across the copyout(), below.
18454		 */
18455		size = sizeof (dtrace_aggdesc_t) +
18456			(aggdesc.dtagd_nrecs * sizeof (dtrace_recdesc_t));
18457
18458		buf = kmem_alloc(size, KM_SLEEP);
18459		dest = (uintptr_t)buf;
18460
18461		bcopy(&aggdesc, (void *)dest, sizeof (aggdesc));
18462		dest += offsetof(dtrace_aggdesc_t, dtagd_rec[0]);
18463
18464		for (act = agg->dtag_first; ; act = act->dta_next) {
18465			dtrace_recdesc_t rec = act->dta_rec;
18466
18467			/*
18468			 * See the comment in the above loop for why we pass
18469			 * over zero-length records.
18470			 */
18471			if (rec.dtrd_size == 0) {
18472				ASSERT(agg->dtag_hasarg);
18473				continue;
18474			}
18475
18476			if (nrecs-- == 0)
18477				break;
18478
18479			rec.dtrd_offset -= offs;
18480			bcopy(&rec, (void *)dest, sizeof (rec));
18481			dest += sizeof (dtrace_recdesc_t);
18482
18483			if (act == &agg->dtag_action)
18484				break;
18485		}
18486
18487		lck_mtx_unlock(&dtrace_lock);
18488
18489		if (copyout(buf, arg, dest - (uintptr_t)buf) != 0) {
18490			kmem_free(buf, size);
18491			return (EFAULT);
18492		}
18493
18494		kmem_free(buf, size);
18495		return (0);
18496	}
18497
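	/*
	 * DTRACEIOC_ENABLE:  copy in a DOF image, turn its enabling
	 * descriptions into an enabling, apply any option settings it
	 * carries, match it against the available probes and retain it on
	 * success.  A NULL argument simply reevaluates all retained
	 * enablings.
	 */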
18498	case DTRACEIOC_ENABLE: {
18499		dof_hdr_t *dof;
18500		dtrace_enabling_t *enab = NULL;
18501		dtrace_vstate_t *vstate;
18502		int err = 0;
18503
18504		*rv = 0;
18505
18506		/*
18507		 * If a NULL argument has been passed, we take this as our
18508		 * cue to reevaluate our enablings.
18509		 */
18510		if (arg == NULL) {
18511			dtrace_enabling_matchall();
18512
18513			return (0);
18514		}
18515
18516		if ((dof = dtrace_dof_copyin(arg, &rval)) == NULL)
18517			return (rval);
18518
18519		lck_mtx_lock(&cpu_lock);
18520		lck_mtx_lock(&dtrace_lock);
18521		vstate = &state->dts_vstate;
18522
18523		if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
18524			lck_mtx_unlock(&dtrace_lock);
18525			lck_mtx_unlock(&cpu_lock);
18526			dtrace_dof_destroy(dof);
18527			return (EBUSY);
18528		}
18529
18530		if (dtrace_dof_slurp(dof, vstate, cr, &enab, 0, B_TRUE) != 0) {
18531			lck_mtx_unlock(&dtrace_lock);
18532			lck_mtx_unlock(&cpu_lock);
18533			dtrace_dof_destroy(dof);
18534			return (EINVAL);
18535		}
18536
18537		if ((rval = dtrace_dof_options(dof, state)) != 0) {
18538			dtrace_enabling_destroy(enab);
18539			lck_mtx_unlock(&dtrace_lock);
18540			lck_mtx_unlock(&cpu_lock);
18541			dtrace_dof_destroy(dof);
18542			return (rval);
18543		}
18544
18545		if ((err = dtrace_enabling_match(enab, rv)) == 0) {
18546			err = dtrace_enabling_retain(enab);
18547		} else {
18548			dtrace_enabling_destroy(enab);
18549		}
18550
18551		lck_mtx_unlock(&cpu_lock);
18552		lck_mtx_unlock(&dtrace_lock);
18553		dtrace_dof_destroy(dof);
18554
18555		return (err);
18556	}
18557
18558	case DTRACEIOC_REPLICATE: {
18559		dtrace_repldesc_t desc;
18560		dtrace_probedesc_t *match = &desc.dtrpd_match;
18561		dtrace_probedesc_t *create = &desc.dtrpd_create;
18562		int err;
18563
18564		if (copyin(arg, &desc, sizeof (desc)) != 0)
18565			return (EFAULT);
18566
18567		match->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
18568		match->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
18569		match->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
18570		match->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
18571
18572		create->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
18573		create->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
18574		create->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
18575		create->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
18576
18577		lck_mtx_lock(&dtrace_lock);
18578		err = dtrace_enabling_replicate(state, match, create);
18579		lck_mtx_unlock(&dtrace_lock);
18580
18581		return (err);
18582	}
18583
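	/*
	 * DTRACEIOC_PROBEMATCH/DTRACEIOC_PROBES: after giving providers a
	 * chance to provide the probe, walk the probe table starting at the
	 * specified ID and copy out the description of the first probe that
	 * is visible to the caller (and, for PROBEMATCH, matches the key).
	 */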
18584	case DTRACEIOC_PROBEMATCH:
18585	case DTRACEIOC_PROBES: {
18586		dtrace_probe_t *probe = NULL;
18587		dtrace_probedesc_t desc;
18588		dtrace_probekey_t pkey;
18589		dtrace_id_t i;
18590		int m = 0;
18591		uint32_t priv;
18592		uid_t uid;
18593		zoneid_t zoneid;
18594
18595		if (copyin(arg, &desc, sizeof (desc)) != 0)
18596			return (EFAULT);
18597
18598		desc.dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
18599		desc.dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
18600		desc.dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
18601		desc.dtpd_name[DTRACE_NAMELEN - 1] = '\0';
18602
18603		/*
18604		 * Before we attempt to match this probe, we want to give
18605		 * all providers the opportunity to provide it.
18606		 */
18607		if (desc.dtpd_id == DTRACE_IDNONE) {
18608			lck_mtx_lock(&dtrace_provider_lock);
18609			dtrace_probe_provide(&desc, NULL);
18610			lck_mtx_unlock(&dtrace_provider_lock);
18611			desc.dtpd_id++;
18612		}
18613
18614		if (cmd == DTRACEIOC_PROBEMATCH)  {
18615			dtrace_probekey(&desc, &pkey);
18616			pkey.dtpk_id = DTRACE_IDNONE;
18617		}
18618
18619		dtrace_cred2priv(cr, &priv, &uid, &zoneid);
18620
18621		lck_mtx_lock(&dtrace_lock);
18622
18623		if (cmd == DTRACEIOC_PROBEMATCH) {
18624			/* Quiet compiler warning */
18625			for (i = desc.dtpd_id; i <= (dtrace_id_t)dtrace_nprobes; i++) {
18626				if ((probe = dtrace_probes[i - 1]) != NULL &&
18627					(m = dtrace_match_probe(probe, &pkey,
18628					priv, uid, zoneid)) != 0)
18629					break;
18630			}
18631
18632			if (m < 0) {
18633				lck_mtx_unlock(&dtrace_lock);
18634				return (EINVAL);
18635			}
18636
18637		} else {
18638			/* Quiet compiler warning */
18639			for (i = desc.dtpd_id; i <= (dtrace_id_t)dtrace_nprobes; i++) {
18640				if ((probe = dtrace_probes[i - 1]) != NULL &&
18641					dtrace_match_priv(probe, priv, uid, zoneid))
18642					break;
18643			}
18644		}
18645
18646		if (probe == NULL) {
18647			lck_mtx_unlock(&dtrace_lock);
18648			return (ESRCH);
18649		}
18650
18651		dtrace_probe_description(probe, &desc);
18652		lck_mtx_unlock(&dtrace_lock);
18653
18654		if (copyout(&desc, arg, sizeof (desc)) != 0)
18655			return (EFAULT);
18656
18657		return (0);
18658	}
18659
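	/*
	 * DTRACEIOC_PROBEARG: return the typed argument description for the
	 * specified probe and argument index via the provider's getargdesc
	 * entry point (or DTRACE_ARGNONE if the provider has none).
	 */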
18660	case DTRACEIOC_PROBEARG: {
18661		dtrace_argdesc_t desc;
18662		dtrace_probe_t *probe;
18663		dtrace_provider_t *prov;
18664
18665		if (copyin(arg, &desc, sizeof (desc)) != 0)
18666			return (EFAULT);
18667
18668		if (desc.dtargd_id == DTRACE_IDNONE)
18669			return (EINVAL);
18670
18671		if (desc.dtargd_ndx == DTRACE_ARGNONE)
18672			return (EINVAL);
18673
18674		lck_mtx_lock(&dtrace_provider_lock);
18675		lck_mtx_lock(&mod_lock);
18676		lck_mtx_lock(&dtrace_lock);
18677
18678		/* Quiet compiler warning */
18679		if (desc.dtargd_id > (dtrace_id_t)dtrace_nprobes) {
18680			lck_mtx_unlock(&dtrace_lock);
18681			lck_mtx_unlock(&mod_lock);
18682			lck_mtx_unlock(&dtrace_provider_lock);
18683			return (EINVAL);
18684		}
18685
18686		if ((probe = dtrace_probes[desc.dtargd_id - 1]) == NULL) {
18687			lck_mtx_unlock(&dtrace_lock);
18688			lck_mtx_unlock(&mod_lock);
18689			lck_mtx_unlock(&dtrace_provider_lock);
18690			return (EINVAL);
18691		}
18692
18693		lck_mtx_unlock(&dtrace_lock);
18694
18695		prov = probe->dtpr_provider;
18696
18697		if (prov->dtpv_pops.dtps_getargdesc == NULL) {
18698			/*
18699			 * There isn't any typed information for this probe.
18700			 * Set the argument number to DTRACE_ARGNONE.
18701			 */
18702			desc.dtargd_ndx = DTRACE_ARGNONE;
18703		} else {
18704			desc.dtargd_native[0] = '\0';
18705			desc.dtargd_xlate[0] = '\0';
18706			desc.dtargd_mapping = desc.dtargd_ndx;
18707
18708			prov->dtpv_pops.dtps_getargdesc(prov->dtpv_arg,
18709			probe->dtpr_id, probe->dtpr_arg, &desc);
18710		}
18711
18712		lck_mtx_unlock(&mod_lock);
18713		lck_mtx_unlock(&dtrace_provider_lock);
18714
18715		if (copyout(&desc, arg, sizeof (desc)) != 0)
18716			return (EFAULT);
18717
18718		return (0);
18719	}
18720
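	/*
	 * DTRACEIOC_GO: start tracing by transitioning this consumer's state
	 * to active; the relevant CPU ID is copied back out to the caller.
	 */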
18721	case DTRACEIOC_GO: {
18722		processorid_t cpuid;
18723		rval = dtrace_state_go(state, &cpuid);
18724
18725		if (rval != 0)
18726			return (rval);
18727
18728		if (copyout(&cpuid, arg, sizeof (cpuid)) != 0)
18729			return (EFAULT);
18730
18731		return (0);
18732	}
18733
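	/*
	 * DTRACEIOC_STOP: stop tracing for this consumer's state (under
	 * dtrace_lock).
	 */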
18734	case DTRACEIOC_STOP: {
18735		processorid_t cpuid;
18736
18737		lck_mtx_lock(&dtrace_lock);
18738		rval = dtrace_state_stop(state, &cpuid);
18739		lck_mtx_unlock(&dtrace_lock);
18740
18741		if (rval != 0)
18742			return (rval);
18743
18744		if (copyout(&cpuid, arg, sizeof (cpuid)) != 0)
18745			return (EFAULT);
18746
18747		return (0);
18748	}
18749
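	/*
	 * DTRACEIOC_DOFGET: generate a DOF image describing this consumer's
	 * state and copy out no more than the caller's buffer can hold.
	 */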
18750	case DTRACEIOC_DOFGET: {
18751		dof_hdr_t hdr, *dof;
18752		uint64_t len;
18753
18754		if (copyin(arg, &hdr, sizeof (hdr)) != 0)
18755			return (EFAULT);
18756
18757		lck_mtx_lock(&dtrace_lock);
18758		dof = dtrace_dof_create(state);
18759		lck_mtx_unlock(&dtrace_lock);
18760
18761		len = MIN(hdr.dofh_loadsz, dof->dofh_loadsz);
18762		rval = copyout(dof, arg, len);
18763		dtrace_dof_destroy(dof);
18764
18765		return (rval == 0 ? 0 : EFAULT);
18766	}
18767
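	/*
	 * DTRACEIOC_AGGSNAP/DTRACEIOC_BUFSNAP: snapshot a per-CPU aggregation
	 * or principal buffer.  Ring and fill buffers are copied out in place
	 * once tracing has stopped; switching buffers are snapshotted by
	 * cross calling the target CPU to switch its active buffer.
	 */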
18768	case DTRACEIOC_AGGSNAP:
18769	case DTRACEIOC_BUFSNAP: {
18770		dtrace_bufdesc_t desc;
18771		caddr_t cached;
18772		dtrace_buffer_t *buf;
18773
18774		if (copyin(arg, &desc, sizeof (desc)) != 0)
18775			return (EFAULT);
18776
18777		if ((int)desc.dtbd_cpu < 0 || desc.dtbd_cpu >= NCPU)
18778			return (EINVAL);
18779
18780		lck_mtx_lock(&dtrace_lock);
18781
18782		if (cmd == DTRACEIOC_BUFSNAP) {
18783			buf = &state->dts_buffer[desc.dtbd_cpu];
18784		} else {
18785			buf = &state->dts_aggbuffer[desc.dtbd_cpu];
18786		}
18787
18788		if (buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL)) {
18789			size_t sz = buf->dtb_offset;
18790
18791			if (state->dts_activity != DTRACE_ACTIVITY_STOPPED) {
18792				lck_mtx_unlock(&dtrace_lock);
18793				return (EBUSY);
18794			}
18795
18796			/*
18797			 * If this buffer has already been consumed, we're
18798			 * going to indicate that there's nothing left here
18799			 * to consume.
18800			 */
18801			if (buf->dtb_flags & DTRACEBUF_CONSUMED) {
18802				lck_mtx_unlock(&dtrace_lock);
18803
18804				desc.dtbd_size = 0;
18805				desc.dtbd_drops = 0;
18806				desc.dtbd_errors = 0;
18807				desc.dtbd_oldest = 0;
18808				sz = sizeof (desc);
18809
18810				if (copyout(&desc, arg, sz) != 0)
18811					return (EFAULT);
18812
18813				return (0);
18814			}
18815
18816			/*
18817			 * If this is a ring buffer that has wrapped, we want
18818			 * to copy the whole thing out.
18819			 */
18820			if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
18821				dtrace_buffer_polish(buf);
18822				sz = buf->dtb_size;
18823			}
18824
18825			if (copyout(buf->dtb_tomax, (user_addr_t)desc.dtbd_data, sz) != 0) {
18826				lck_mtx_unlock(&dtrace_lock);
18827				return (EFAULT);
18828			}
18829
18830			desc.dtbd_size = sz;
18831			desc.dtbd_drops = buf->dtb_drops;
18832			desc.dtbd_errors = buf->dtb_errors;
18833			desc.dtbd_oldest = buf->dtb_xamot_offset;
18834
18835			lck_mtx_unlock(&dtrace_lock);
18836
18837			if (copyout(&desc, arg, sizeof (desc)) != 0)
18838				return (EFAULT);
18839
18840			buf->dtb_flags |= DTRACEBUF_CONSUMED;
18841
18842			return (0);
18843		}
18844
18845		if (buf->dtb_tomax == NULL) {
18846			ASSERT(buf->dtb_xamot == NULL);
18847			lck_mtx_unlock(&dtrace_lock);
18848			return (ENOENT);
18849		}
18850
18851		cached = buf->dtb_tomax;
18852		ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
18853
18854		dtrace_xcall(desc.dtbd_cpu,
18855			(dtrace_xcall_t)dtrace_buffer_switch, buf);
18856
18857		state->dts_errors += buf->dtb_xamot_errors;
18858
18859		/*
18860		 * If the buffers did not actually switch, then the cross call
18861		 * did not take place -- presumably because the given CPU is
18862		 * not in the ready set.  If this is the case, we'll return
18863		 * ENOENT.
18864		 */
18865		if (buf->dtb_tomax == cached) {
18866			ASSERT(buf->dtb_xamot != cached);
18867			lck_mtx_unlock(&dtrace_lock);
18868			return (ENOENT);
18869		}
18870
18871		ASSERT(cached == buf->dtb_xamot);
18872
18873		/*
18874		 * We have our snapshot; now copy it out.
18875		 */
18876		if (copyout(buf->dtb_xamot, (user_addr_t)desc.dtbd_data,
18877					buf->dtb_xamot_offset) != 0) {
18878			lck_mtx_unlock(&dtrace_lock);
18879			return (EFAULT);
18880		}
18881
18882		desc.dtbd_size = buf->dtb_xamot_offset;
18883		desc.dtbd_drops = buf->dtb_xamot_drops;
18884		desc.dtbd_errors = buf->dtb_xamot_errors;
18885		desc.dtbd_oldest = 0;
18886
18887		lck_mtx_unlock(&dtrace_lock);
18888
18889		/*
18890		 * Finally, copy out the buffer description.
18891		 */
18892		if (copyout(&desc, arg, sizeof (desc)) != 0)
18893			return (EFAULT);
18894
18895		return (0);
18896	}
18897
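	/*
	 * DTRACEIOC_CONF: report the framework's static configuration (DIF
	 * version, DIF integer/tuple register counts and native CTF data
	 * model).
	 */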
18898	case DTRACEIOC_CONF: {
18899		dtrace_conf_t conf;
18900
18901		bzero(&conf, sizeof (conf));
18902		conf.dtc_difversion = DIF_VERSION;
18903		conf.dtc_difintregs = DIF_DIR_NREGS;
18904		conf.dtc_diftupregs = DIF_DTR_NREGS;
18905		conf.dtc_ctfmodel = CTF_MODEL_NATIVE;
18906
18907		if (copyout(&conf, arg, sizeof (conf)) != 0)
18908			return (EFAULT);
18909
18910		return (0);
18911	}
18912
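	/*
	 * DTRACEIOC_STATUS: gather the per-CPU drop and error counts plus the
	 * overall state into a dtrace_status_t and copy it out.
	 */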
18913	case DTRACEIOC_STATUS: {
18914		dtrace_status_t stat;
18915		dtrace_dstate_t *dstate;
18916		int i, j;
18917		uint64_t nerrs;
18918
18919		/*
18920		 * See the comment in dtrace_state_deadman() for the reason
18921		 * for setting dts_laststatus to INT64_MAX before setting
18922		 * it to the correct value.
18923		 */
18924		state->dts_laststatus = INT64_MAX;
18925		dtrace_membar_producer();
18926		state->dts_laststatus = dtrace_gethrtime();
18927
18928		bzero(&stat, sizeof (stat));
18929
18930		lck_mtx_lock(&dtrace_lock);
18931
18932		if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE) {
18933			lck_mtx_unlock(&dtrace_lock);
18934			return (ENOENT);
18935		}
18936
18937		if (state->dts_activity == DTRACE_ACTIVITY_DRAINING)
18938			stat.dtst_exiting = 1;
18939
18940		nerrs = state->dts_errors;
18941		dstate = &state->dts_vstate.dtvs_dynvars;
18942
18943		for (i = 0; i < (int)NCPU; i++) {
18944			dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[i];
18945
18946			stat.dtst_dyndrops += dcpu->dtdsc_drops;
18947			stat.dtst_dyndrops_dirty += dcpu->dtdsc_dirty_drops;
18948			stat.dtst_dyndrops_rinsing += dcpu->dtdsc_rinsing_drops;
18949
18950			if (state->dts_buffer[i].dtb_flags & DTRACEBUF_FULL)
18951				stat.dtst_filled++;
18952
18953			nerrs += state->dts_buffer[i].dtb_errors;
18954
18955			for (j = 0; j < state->dts_nspeculations; j++) {
18956				dtrace_speculation_t *spec;
18957				dtrace_buffer_t *buf;
18958
18959				spec = &state->dts_speculations[j];
18960				buf = &spec->dtsp_buffer[i];
18961				stat.dtst_specdrops += buf->dtb_xamot_drops;
18962			}
18963		}
18964
18965		stat.dtst_specdrops_busy = state->dts_speculations_busy;
18966		stat.dtst_specdrops_unavail = state->dts_speculations_unavail;
18967		stat.dtst_stkstroverflows = state->dts_stkstroverflows;
18968		stat.dtst_dblerrors = state->dts_dblerrors;
18969		stat.dtst_killed =
18970			(state->dts_activity == DTRACE_ACTIVITY_KILLED);
18971		stat.dtst_errors = nerrs;
18972
18973		lck_mtx_unlock(&dtrace_lock);
18974
18975		if (copyout(&stat, arg, sizeof (stat)) != 0)
18976			return (EFAULT);
18977
18978		return (0);
18979	}
18980
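	/*
	 * DTRACEIOC_FORMAT: copy out the format string registered at the
	 * requested index, or report the required length if the caller's
	 * buffer is too small.
	 */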
18981	case DTRACEIOC_FORMAT: {
18982		dtrace_fmtdesc_t fmt;
18983		char *str;
18984		int len;
18985
18986		if (copyin(arg, &fmt, sizeof (fmt)) != 0)
18987			return (EFAULT);
18988
18989		lck_mtx_lock(&dtrace_lock);
18990
18991		if (fmt.dtfd_format == 0 ||
18992			fmt.dtfd_format > state->dts_nformats) {
18993			lck_mtx_unlock(&dtrace_lock);
18994			return (EINVAL);
18995		}
18996
18997		/*
18998		 * Format strings are allocated contiguously and they are
18999		 * never freed; if a format index is less than the number
19000		 * of formats, we can assert that the format map is non-NULL
19001		 * and that the format for the specified index is non-NULL.
19002		 */
19003		ASSERT(state->dts_formats != NULL);
19004		str = state->dts_formats[fmt.dtfd_format - 1];
19005		ASSERT(str != NULL);
19006
19007		len = strlen(str) + 1;
19008
19009		if (len > fmt.dtfd_length) {
19010			fmt.dtfd_length = len;
19011
19012			if (copyout(&fmt, arg, sizeof (fmt)) != 0) {
19013				lck_mtx_unlock(&dtrace_lock);
19014				return (EINVAL);
19015			}
19016		} else {
19017			if (copyout(str, (user_addr_t)fmt.dtfd_string, len) != 0) {
19018				lck_mtx_unlock(&dtrace_lock);
19019				return (EINVAL);
19020			}
19021		}
19022
19023		lck_mtx_unlock(&dtrace_lock);
19024		return (0);
19025	}
19026
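	/*
	 * DTRACEIOC_MODUUIDSLIST (Darwin-specific): report the UUIDs of
	 * loaded modules that still need symbols from userspace.  A count of
	 * zero is treated as a query for the number of such modules.
	 */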
19027	case DTRACEIOC_MODUUIDSLIST: {
19028		size_t module_uuids_list_size;
19029		dtrace_module_uuids_list_t* uuids_list;
19030		uint64_t dtmul_count;
19031
19032		/*
19033		 * Fail if the kernel symbol mode makes this operation illegal.
19034		 * Both NEVER & ALWAYS_FROM_KERNEL are permanent states; it is legal to check
19035		 * for them without holding dtrace_lock.
19036		 */
19037		if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_NEVER ||
19038		    dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL) {
19039			cmn_err(CE_WARN, "dtrace_kernel_symbol_mode of %u disallows DTRACEIOC_MODUUIDSLIST", dtrace_kernel_symbol_mode);
19040			return (EPERM);
19041		}
19042
19043		/*
19044		 * Read the number of module UUIDs being passed in.
19045		 */
19046		if (copyin(arg + offsetof(dtrace_module_uuids_list_t, dtmul_count),
19047			   &dtmul_count,
19048			   sizeof(dtmul_count))) {
19049			cmn_err(CE_WARN, "failed to copyin dtmul_count");
19050			return (EFAULT);
19051		}
19052
19053		/*
19054		 * Range check the count. More than 2k kexts is probably an error.
19055		 */
19056		if (dtmul_count > 2048) {
19057			cmn_err(CE_WARN, "dtmul_count is not valid");
19058			return (EINVAL);
19059		}
19060
19061		/*
19062		 * For all queries, we return EINVAL when the user-specified
19063		 * count does not match the actual number of modules we find
19064		 * available.
19065		 *
19066		 * If the user-specified count is zero, then this serves as a
19067		 * simple query to count the available modules in need of symbols.
19068		 */
19069
19070		rval = 0;
19071
19072		if (dtmul_count == 0)
19073		{
19074			lck_mtx_lock(&mod_lock);
19075			struct modctl* ctl = dtrace_modctl_list;
19076			while (ctl) {
19077				ASSERT(!MOD_HAS_USERSPACE_SYMBOLS(ctl));
19078				if (!MOD_SYMBOLS_DONE(ctl)) {
19079					dtmul_count++;
19080					rval = EINVAL;
19081				}
19082				ctl = ctl->mod_next;
19083			}
19084			lck_mtx_unlock(&mod_lock);
19085
19086			if (copyout(&dtmul_count, arg, sizeof (dtmul_count)) != 0)
19087				return (EFAULT);
19088			else
19089				return (rval);
19090		}
19091
19092		/*
19093		 * If we reach this point, then we have a request for full list data.
19094		 * Allocate a correctly sized structure and copyin the data.
19095		 */
19096		module_uuids_list_size = DTRACE_MODULE_UUIDS_LIST_SIZE(dtmul_count);
19097		if ((uuids_list = kmem_alloc(module_uuids_list_size, KM_SLEEP)) == NULL)
19098			return (ENOMEM);
19099
19100		/* NOTE! We can no longer exit this method via return */
19101		if (copyin(arg, uuids_list, module_uuids_list_size) != 0) {
19102			cmn_err(CE_WARN, "failed copyin of dtrace_module_uuids_list_t");
19103			rval = EFAULT;
19104			goto moduuidslist_cleanup;
19105		}
19106
19107		/*
19108		 * Check that the count didn't change between the first copyin and the second.
19109		 */
19110		if (uuids_list->dtmul_count != dtmul_count) {
19111			rval = EINVAL;
19112			goto moduuidslist_cleanup;
19113		}
19114
19115		/*
19116		 * Build the list of UUIDs that need symbols.
19117		 */
19118		lck_mtx_lock(&mod_lock);
19119
19120		dtmul_count = 0;
19121
19122		struct modctl* ctl = dtrace_modctl_list;
19123		while (ctl) {
19124			/*
19125			 * We assume that userspace symbols will be "better" than kernel level symbols,
19126			 * as userspace can search for dSYM(s) and symbol'd binaries. Even if kernel syms
19127			 * are available, add user syms if the module might use them.
19128			 */
19129			ASSERT(!MOD_HAS_USERSPACE_SYMBOLS(ctl));
19130			if (!MOD_SYMBOLS_DONE(ctl)) {
19131				UUID* uuid = &uuids_list->dtmul_uuid[dtmul_count];
19132				if (dtmul_count++ < uuids_list->dtmul_count) {
19133					memcpy(uuid, ctl->mod_uuid, sizeof(UUID));
19134				}
19135			}
19136			ctl = ctl->mod_next;
19137		}
19138
19139		lck_mtx_unlock(&mod_lock);
19140
19141		if (uuids_list->dtmul_count < dtmul_count)
19142			rval = EINVAL;
19143
19144		uuids_list->dtmul_count = dtmul_count;
19145
19146		/*
19147		 * Copy out the UUIDs list (or at least the count!)
19148		 */
19149		if (copyout(uuids_list, arg, module_uuids_list_size) != 0) {
19150			cmn_err(CE_WARN, "failed copyout of dtrace_module_uuids_list_t");
19151			rval = EFAULT;
19152		}
19153
19154	moduuidslist_cleanup:
19155		/*
19156		 * If we had to allocate struct memory, free it.
19157		 */
19158		if (uuids_list != NULL) {
19159			kmem_free(uuids_list, module_uuids_list_size);
19160		}
19161
19162		return rval;
19163	}
19164
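	/*
	 * DTRACEIOC_PROVMODSYMS (Darwin-specific): accept a userspace-supplied
	 * symbol table for the module identified by UUID and give each
	 * provider a chance to re-provide probes for that module.
	 */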
19165	case DTRACEIOC_PROVMODSYMS: {
19166		size_t module_symbols_size;
19167		dtrace_module_symbols_t* module_symbols;
19168		uint64_t dtmodsyms_count;
19169
19170		/*
19171		 * Fail if the kernel symbol mode makes this operation illegal.
19172		 * Both NEVER & ALWAYS_FROM_KERNEL are permanent states; it is legal to check
19173		 * for them without holding dtrace_lock.
19174		 */
19175		if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_NEVER ||
19176		    dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL) {
19177			cmn_err(CE_WARN, "dtrace_kernel_symbol_mode of %u disallows DTRACEIOC_PROVMODSYMS", dtrace_kernel_symbol_mode);
19178			return (EPERM);
19179		}
19180
19181		/*
19182		 * Read the number of module symbols structs being passed in.
19183		 */
19184		if (copyin(arg + offsetof(dtrace_module_symbols_t, dtmodsyms_count),
19185			   &dtmodsyms_count,
19186			   sizeof(dtmodsyms_count))) {
19187			cmn_err(CE_WARN, "failed to copyin dtmodsyms_count");
19188			return (EFAULT);
19189		}
19190
19191		/*
19192		 * Range check the count. How much data can we pass around?
19193		 * FIX ME!
19194		 */
19195		if (dtmodsyms_count == 0 || (dtmodsyms_count > 100 * 1024)) {
19196			cmn_err(CE_WARN, "dtmodsyms_count is not valid");
19197			return (EINVAL);
19198		}
19199
19200		/*
19201		 * Allocate a correctly sized structure and copyin the data.
19202		 */
19203		module_symbols_size = DTRACE_MODULE_SYMBOLS_SIZE(dtmodsyms_count);
19204		if ((module_symbols = kmem_alloc(module_symbols_size, KM_SLEEP)) == NULL)
19205			return (ENOMEM);
19206
19207		rval = 0;
19208
19209		/* NOTE! We can no longer exit this method via return */
19210		if (copyin(arg, module_symbols, module_symbols_size) != 0) {
19211			cmn_err(CE_WARN, "failed copyin of dtrace_module_symbols_t, symbol count %llu", module_symbols->dtmodsyms_count);
19212			rval = EFAULT;
19213			goto module_symbols_cleanup;
19214		}
19215
19216		/*
19217		 * Check that the count didn't change between the first copyin and the second.
19218		 */
19219		if (module_symbols->dtmodsyms_count != dtmodsyms_count) {
19220			rval = EINVAL;
19221			goto module_symbols_cleanup;
19222		}
19223
19224		/*
19225		 * Find the modctl to add symbols to.
19226		 */
19227		lck_mtx_lock(&dtrace_provider_lock);
19228		lck_mtx_lock(&mod_lock);
19229
19230		struct modctl* ctl = dtrace_modctl_list;
19231		while (ctl) {
19232			ASSERT(!MOD_HAS_USERSPACE_SYMBOLS(ctl));
19233			if (MOD_HAS_UUID(ctl) && !MOD_SYMBOLS_DONE(ctl)) {
19234				if (memcmp(module_symbols->dtmodsyms_uuid, ctl->mod_uuid, sizeof(UUID)) == 0) {
19235					/* BINGO! */
19236					ctl->mod_user_symbols = module_symbols;
19237					break;
19238				}
19239			}
19240			ctl = ctl->mod_next;
19241		}
19242
19243		if (ctl) {
19244			dtrace_provider_t *prv;
19245
19246			/*
19247			 * We're going to call each provider's per-module provide
19248			 * operation, specifying only this module.
19249			 */
19250			for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next)
19251				prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
19252
19253			/*
19254			 * We gave every provider a chance to provide with the user syms; go ahead and clear them.
19255			 */
19256			ctl->mod_user_symbols = NULL; /* MUST reset this to clear HAS_USERSPACE_SYMBOLS */
19257		}
19258
19259		lck_mtx_unlock(&mod_lock);
19260		lck_mtx_unlock(&dtrace_provider_lock);
19261
19262	module_symbols_cleanup:
19263		/*
19264		 * If we had to allocate struct memory, free it.
19265		 */
19266		if (module_symbols != NULL) {
19267			kmem_free(module_symbols, module_symbols_size);
19268		}
19269
19270		return rval;
19271	}
19272
19273		default:
19274			break;
19275	}
19276
19277	return (ENOTTY);
19278}
19279#endif /* __APPLE__ */
19280
19281#if !defined(__APPLE__)
19282/*ARGSUSED*/
19283static int
19284dtrace_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
19285{
19286	dtrace_state_t *state;
19287
19288	switch (cmd) {
19289	case DDI_DETACH:
19290		break;
19291
19292	case DDI_SUSPEND:
19293		return (DDI_SUCCESS);
19294
19295	default:
19296		return (DDI_FAILURE);
19297	}
19298
19299	lck_mtx_lock(&cpu_lock);
19300	lck_mtx_lock(&dtrace_provider_lock);
19301	lck_mtx_lock(&dtrace_lock);
19302
19303	ASSERT(dtrace_opens == 0);
19304
19305	if (dtrace_helpers > 0) {
19306		lck_mtx_unlock(&dtrace_provider_lock);
19307		lck_mtx_unlock(&dtrace_lock);
19308		lck_mtx_unlock(&cpu_lock);
19309		return (DDI_FAILURE);
19310	}
19311
19312	if (dtrace_unregister((dtrace_provider_id_t)dtrace_provider) != 0) {
19313		lck_mtx_unlock(&dtrace_provider_lock);
19314		lck_mtx_unlock(&dtrace_lock);
19315		lck_mtx_unlock(&cpu_lock);
19316		return (DDI_FAILURE);
19317	}
19318
19319	dtrace_provider = NULL;
19320
19321	if ((state = dtrace_anon_grab()) != NULL) {
19322		/*
19323		 * If there were ECBs on this state, the provider should
19324		 * have not been allowed to detach; assert that there is
19325		 * none.
19326		 */
19327		ASSERT(state->dts_necbs == 0);
19328		dtrace_state_destroy(state);
19329
19330		/*
19331		 * If we're being detached with anonymous state, we need to
19332		 * indicate to the kernel debugger that DTrace is now inactive.
19333		 */
19334		(void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
19335	}
19336
19337	bzero(&dtrace_anon, sizeof (dtrace_anon_t));
19338	unregister_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
19339	dtrace_cpu_init = NULL;
19340	dtrace_helpers_cleanup = NULL;
19341	dtrace_helpers_fork = NULL;
19342	dtrace_cpustart_init = NULL;
19343	dtrace_cpustart_fini = NULL;
19344	dtrace_debugger_init = NULL;
19345	dtrace_debugger_fini = NULL;
19346	dtrace_kreloc_init = NULL;
19347	dtrace_kreloc_fini = NULL;
19348	dtrace_modload = NULL;
19349	dtrace_modunload = NULL;
19350
19351	lck_mtx_unlock(&cpu_lock);
19352
19353	if (dtrace_helptrace_enabled) {
19354		kmem_free(dtrace_helptrace_buffer, dtrace_helptrace_bufsize);
19355		dtrace_helptrace_buffer = NULL;
19356	}
19357
19358	kmem_free(dtrace_probes, dtrace_nprobes * sizeof (dtrace_probe_t *));
19359	dtrace_probes = NULL;
19360	dtrace_nprobes = 0;
19361
19362	dtrace_hash_destroy(dtrace_bymod);
19363	dtrace_hash_destroy(dtrace_byfunc);
19364	dtrace_hash_destroy(dtrace_byname);
19365	dtrace_bymod = NULL;
19366	dtrace_byfunc = NULL;
19367	dtrace_byname = NULL;
19368
19369	kmem_cache_destroy(dtrace_state_cache);
19370	vmem_destroy(dtrace_minor);
19371	vmem_destroy(dtrace_arena);
19372
19373	if (dtrace_toxrange != NULL) {
19374		kmem_free(dtrace_toxrange,
19375		    dtrace_toxranges_max * sizeof (dtrace_toxrange_t));
19376		dtrace_toxrange = NULL;
19377		dtrace_toxranges = 0;
19378		dtrace_toxranges_max = 0;
19379	}
19380
19381	ddi_remove_minor_node(dtrace_devi, NULL);
19382	dtrace_devi = NULL;
19383
19384	ddi_soft_state_fini(&dtrace_softstate);
19385
19386	ASSERT(dtrace_vtime_references == 0);
19387	ASSERT(dtrace_opens == 0);
19388	ASSERT(dtrace_retained == NULL);
19389
19390	lck_mtx_unlock(&dtrace_lock);
19391	lck_mtx_unlock(&dtrace_provider_lock);
19392
19393	/*
19394	 * We don't destroy the task queue until after we have dropped our
19395	 * locks (taskq_destroy() may block on running tasks).  To prevent
19396	 * attempting to do work after we have effectively detached but before
19397	 * the task queue has been destroyed, all tasks dispatched via the
19398	 * task queue must check that DTrace is still attached before
19399	 * performing any operation.
19400	 */
19401	taskq_destroy(dtrace_taskq);
19402	dtrace_taskq = NULL;
19403
19404	return (DDI_SUCCESS);
19405}
19406
19407/*ARGSUSED*/
19408static int
19409dtrace_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
19410{
19411	int error;
19412
19413	switch (infocmd) {
19414	case DDI_INFO_DEVT2DEVINFO:
19415		*result = (void *)dtrace_devi;
19416		error = DDI_SUCCESS;
19417		break;
19418	case DDI_INFO_DEVT2INSTANCE:
19419		*result = (void *)0;
19420		error = DDI_SUCCESS;
19421		break;
19422	default:
19423		error = DDI_FAILURE;
19424	}
19425	return (error);
19426}
19427
19428static struct cb_ops dtrace_cb_ops = {
19429	dtrace_open,		/* open */
19430	dtrace_close,		/* close */
19431	nulldev,		/* strategy */
19432	nulldev,		/* print */
19433	nodev,			/* dump */
19434	nodev,			/* read */
19435	nodev,			/* write */
19436	dtrace_ioctl,		/* ioctl */
19437	nodev,			/* devmap */
19438	nodev,			/* mmap */
19439	nodev,			/* segmap */
19440	nochpoll,		/* poll */
19441	ddi_prop_op,		/* cb_prop_op */
19442	0,			/* streamtab  */
19443	D_NEW | D_MP		/* Driver compatibility flag */
19444};
19445
19446static struct dev_ops dtrace_ops = {
19447	DEVO_REV,		/* devo_rev */
19448	0,			/* refcnt */
19449	dtrace_info,		/* get_dev_info */
19450	nulldev,		/* identify */
19451	nulldev,		/* probe */
19452	dtrace_attach,		/* attach */
19453	dtrace_detach,		/* detach */
19454	nodev,			/* reset */
19455	&dtrace_cb_ops,		/* driver operations */
19456	NULL,			/* bus operations */
19457	nodev			/* dev power */
19458};
19459
19460static struct modldrv modldrv = {
19461	&mod_driverops,		/* module type (this is a pseudo driver) */
19462	"Dynamic Tracing",	/* name of module */
19463	&dtrace_ops,		/* driver ops */
19464};
19465
19466static struct modlinkage modlinkage = {
19467	MODREV_1,
19468	(void *)&modldrv,
19469	NULL
19470};
19471
19472int
19473_init(void)
19474{
19475	return (mod_install(&modlinkage));
19476}
19477
19478int
19479_info(struct modinfo *modinfop)
19480{
19481	return (mod_info(&modlinkage, modinfop));
19482}
19483
19484int
19485_fini(void)
19486{
19487	return (mod_remove(&modlinkage));
19488}
19489#else /* Darwin BSD driver model. */
19490
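/*
 * Darwin BSD device entry points.  These thin wrappers adapt the cdevsw
 * interface to the Solaris-style dtrace_open/dtrace_close/dtrace_ioctl
 * routines; the helper_* variants back the DTrace helper device.
 */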
19491d_open_t _dtrace_open, helper_open;
19492d_close_t _dtrace_close, helper_close;
19493d_ioctl_t _dtrace_ioctl, helper_ioctl;
19494
19495int
19496_dtrace_open(dev_t dev, int flags, int devtype, struct proc *p)
19497{
19498#pragma unused(p)
19499	dev_t locdev = dev;
19500
19501	return  dtrace_open( &locdev, flags, devtype, CRED());
19502}
19503
19504int
19505helper_open(dev_t dev, int flags, int devtype, struct proc *p)
19506{
19507#pragma unused(dev,flags,devtype,p)
19508	return 0;
19509}
19510
19511int
19512_dtrace_close(dev_t dev, int flags, int devtype, struct proc *p)
19513{
19514#pragma unused(p)
19515	return dtrace_close( dev, flags, devtype, CRED());
19516}
19517
19518int
19519helper_close(dev_t dev, int flags, int devtype, struct proc *p)
19520{
19521#pragma unused(dev,flags,devtype,p)
19522	return 0;
19523}
19524
19525int
19526_dtrace_ioctl(dev_t dev, u_long cmd, caddr_t data, int fflag, struct proc *p)
19527{
19529	int err, rv = 0;
19530	user_addr_t uaddrp;
19531
19532	if (proc_is64bit(p))
19533		uaddrp = *(user_addr_t *)data;
19534	else
19535		uaddrp = (user_addr_t) *(uint32_t *)data;
19536
19537	err = dtrace_ioctl(dev, cmd, uaddrp, fflag, CRED(), &rv);
19538
19539	/* Darwin's BSD ioctls only return -1 or zero. Overload errno to mimic Solaris. 20 bits suffice. */
19540	if (err != 0) {
19541		ASSERT( (err & 0xfffff000) == 0 );
19542		return (err & 0xfff); /* ioctl will return -1 and will set errno to an error code < 4096 */
19543	} else if (rv != 0) {
19544		ASSERT( (rv & 0xfff00000) == 0 );
19545		return (((rv & 0xfffff) << 12)); /* ioctl will return -1 and will set errno to a value >= 4096 */
19546	} else
19547		return 0;
19548}
19549
19550int
19551helper_ioctl(dev_t dev, u_long cmd, caddr_t data, int fflag, struct proc *p)
19552{
19553#pragma unused(dev,fflag,p)
19554	int err, rv = 0;
19555
19556	err = dtrace_ioctl_helper(cmd, data, &rv);
19557	/* Darwin's BSD ioctls only return -1 or zero. Overload errno to mimic Solaris. 20 bits suffice. */
19558	if (err != 0) {
19559		ASSERT( (err & 0xfffff000) == 0 );
19560		return (err & 0xfff); /* ioctl will return -1 and will set errno to an error code < 4096 */
19561	} else if (rv != 0) {
19562		ASSERT( (rv & 0xfff00000) == 0 );
19563		return (((rv & 0xfffff) << 12)); /* ioctl will return -1 and will set errno to a value >= 4096 */
19564	} else
19565		return 0;
19566}
19567
19568#define HELPER_MAJOR  -24 /* let the kernel pick the device number */
19569
19570/*
19571 * A struct describing which functions will get invoked for certain
19572 * actions.
19573 */
19574static struct cdevsw helper_cdevsw =
19575{
19576	helper_open,		/* open */
19577	helper_close,		/* close */
19578	eno_rdwrt,			/* read */
19579	eno_rdwrt,			/* write */
19580	helper_ioctl,		/* ioctl */
19581	(stop_fcn_t *)nulldev, /* stop */
19582	(reset_fcn_t *)nulldev, /* reset */
19583	NULL,				/* tty's */
19584	eno_select,			/* select */
19585	eno_mmap,			/* mmap */
19586	eno_strat,			/* strategy */
19587	eno_getc,			/* getc */
19588	eno_putc,			/* putc */
19589	0					/* type */
19590};
19591
19592static int helper_majdevno = 0;
19593
19594static int gDTraceInited = 0;
19595
19596void
19597helper_init( void )
19598{
19599	/*
19600	 * Once the "helper" is initialized, it can take ioctl calls that use locks
19601	 * and zones initialized in dtrace_init. Make certain dtrace_init was called
19602	 * before us.
19603	 */
19604
19605	if (!gDTraceInited) {
19606		panic("helper_init before dtrace_init\n");
19607	}
19608
19609	if (0 >= helper_majdevno)
19610	{
19611		helper_majdevno = cdevsw_add(HELPER_MAJOR, &helper_cdevsw);
19612
19613		if (helper_majdevno < 0) {
19614			printf("helper_init: failed to allocate a major number!\n");
19615			return;
19616		}
19617
19618		if (NULL == devfs_make_node( makedev(helper_majdevno, 0), DEVFS_CHAR, UID_ROOT, GID_WHEEL, 0666,
19619					DTRACEMNR_HELPER, 0 )) {
19620			printf("helper_init: failed to devfs_make_node for helper!\n");
19621			return;
19622		}
19623	} else
19624		panic("helper_init: called twice!\n");
19625}
19626
19627#undef HELPER_MAJOR
19628
19629/*
19630 * Called with DEVFS_LOCK held, so vmem_alloc's underlying blist structures are protected.
19631 */
19632static int
19633dtrace_clone_func(dev_t dev, int action)
19634{
19635#pragma unused(dev)
19636
19637	if (action == DEVFS_CLONE_ALLOC) {
19638		if (NULL == dtrace_minor) /* Arena not created yet!?! */
19639			return 0;
19640		else {
19641			/*
19642			 * Propose a minor number, namely the next number that vmem_alloc() will return.
19643			 * Immediately put it back in play by calling vmem_free(). FIXME.
19644			 */
19645			int ret = (int)(uintptr_t)vmem_alloc(dtrace_minor, 1, VM_BESTFIT | VM_SLEEP);
19646
19647			vmem_free(dtrace_minor, (void *)(uintptr_t)ret, 1);
19648
19649			return ret;
19650		}
19651	}
19652	else if (action == DEVFS_CLONE_FREE) {
19653		return 0;
19654	}
19655	else return -1;
19656}
19657
19658#define DTRACE_MAJOR  -24 /* let the kernel pick the device number */
19659
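/*
 * Entry points for the dtrace device itself (cf. helper_cdevsw above).
 */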
19660static struct cdevsw dtrace_cdevsw =
19661{
19662	_dtrace_open,		/* open */
19663	_dtrace_close,		/* close */
19664	eno_rdwrt,			/* read */
19665	eno_rdwrt,			/* write */
19666	_dtrace_ioctl,		/* ioctl */
19667	(stop_fcn_t *)nulldev, /* stop */
19668	(reset_fcn_t *)nulldev, /* reset */
19669	NULL,				/* tty's */
19670	eno_select,			/* select */
19671	eno_mmap,			/* mmap */
19672	eno_strat,			/* strategy */
19673	eno_getc,			/* getc */
19674	eno_putc,			/* putc */
19675	0					/* type */
19676};
19677
19678lck_attr_t* dtrace_lck_attr;
19679lck_grp_attr_t* dtrace_lck_grp_attr;
19680lck_grp_t* dtrace_lck_grp;
19681
19682static int gMajDevNo;
19683
19684void
19685dtrace_init( void )
19686{
19687	if (0 == gDTraceInited) {
19688		int i, ncpu;
19689
19690		/*
19691		 * DTrace allocates buffers based on the maximum number
19692		 * of enabled CPUs. This call avoids any race when finding
19693		 * that count.
19694		 */
19695		ASSERT(dtrace_max_cpus == 0);
19696		ncpu = dtrace_max_cpus = ml_get_max_cpus();
19697
19698		gMajDevNo = cdevsw_add(DTRACE_MAJOR, &dtrace_cdevsw);
19699
19700		if (gMajDevNo < 0) {
19701			printf("dtrace_init: failed to allocate a major number!\n");
19702			gDTraceInited = 0;
19703			return;
19704		}
19705
19706		if (NULL == devfs_make_node_clone( makedev(gMajDevNo, 0), DEVFS_CHAR, UID_ROOT, GID_WHEEL, 0666,
19707					dtrace_clone_func, DTRACEMNR_DTRACE, 0 )) {
19708			printf("dtrace_init: failed to devfs_make_node_clone for dtrace!\n");
19709			gDTraceInited = 0;
19710			return;
19711		}
19712
19713#if defined(DTRACE_MEMORY_ZONES)
19714		/*
19715		 * Initialize the dtrace kalloc-emulation zones.
19716		 */
19717		dtrace_alloc_init();
19718#endif /* DTRACE_MEMORY_ZONES */
19719
19720		/*
19721		 * Allocate the dtrace_probe_t zone
19722		 */
19723		dtrace_probe_t_zone = zinit(sizeof(dtrace_probe_t),
19724					    1024 * sizeof(dtrace_probe_t),
19725					    sizeof(dtrace_probe_t),
19726					    "dtrace.dtrace_probe_t");
19727
19728		/*
19729		 * Create the dtrace lock group and attrs.
19730		 */
19731		dtrace_lck_attr = lck_attr_alloc_init();
19732		dtrace_lck_grp_attr = lck_grp_attr_alloc_init();
19733		dtrace_lck_grp = lck_grp_alloc_init("dtrace",  dtrace_lck_grp_attr);
19734
19735		/*
19736		 * We have to initialize all locks explicitly
19737		 */
19738		lck_mtx_init(&dtrace_lock, dtrace_lck_grp, dtrace_lck_attr);
19739		lck_mtx_init(&dtrace_provider_lock, dtrace_lck_grp, dtrace_lck_attr);
19740		lck_mtx_init(&dtrace_meta_lock, dtrace_lck_grp, dtrace_lck_attr);
19741#if DEBUG
19742		lck_mtx_init(&dtrace_errlock, dtrace_lck_grp, dtrace_lck_attr);
19743#endif
19744		lck_rw_init(&dtrace_dof_mode_lock, dtrace_lck_grp, dtrace_lck_attr);
19745
19746		/*
19747		 * The cpu_core structure consists of per-CPU state available in any context.
19748		 * On some architectures, this may mean that the page(s) containing the
19749		 * NCPU-sized array of cpu_core structures must be locked in the TLB -- it
19750		 * is up to the platform to assure that this is performed properly.  Note that
19751		 * the structure is sized to avoid false sharing.
19752		 */
19753		lck_mtx_init(&cpu_lock, dtrace_lck_grp, dtrace_lck_attr);
19754		lck_mtx_init(&mod_lock, dtrace_lck_grp, dtrace_lck_attr);
19755
19756		dtrace_modctl_list = NULL;
19757
19758		cpu_core = (cpu_core_t *)kmem_zalloc( ncpu * sizeof(cpu_core_t), KM_SLEEP );
19759		for (i = 0; i < ncpu; ++i) {
19760			lck_mtx_init(&cpu_core[i].cpuc_pid_lock, dtrace_lck_grp, dtrace_lck_attr);
19761		}
19762
19763		cpu_list = (dtrace_cpu_t *)kmem_zalloc( ncpu * sizeof(dtrace_cpu_t), KM_SLEEP );
19764		for (i = 0; i < ncpu; ++i) {
19765			cpu_list[i].cpu_id = (processorid_t)i;
19766			cpu_list[i].cpu_next = &(cpu_list[(i+1) % ncpu]);
19767			lck_rw_init(&cpu_list[i].cpu_ft_lock, dtrace_lck_grp, dtrace_lck_attr);
19768		}
19769
19770		lck_mtx_lock(&cpu_lock);
19771		for (i = 0; i < ncpu; ++i)
19772			/* FIXME: track CPU configuration a la CHUD Processor Pref Pane. */
19773			dtrace_cpu_setup_initial( (processorid_t)i ); /* In lieu of register_cpu_setup_func() callback */
19774		lck_mtx_unlock(&cpu_lock);
19775
19776		(void)dtrace_abs_to_nano(0LL); /* Force once only call to clock_timebase_info (which can take a lock) */
19777
19778		dtrace_isa_init();
19779
19780		/*
19781		 * See dtrace_impl.h for a description of dof modes.
19782		 * The default is lazy dof.
19783		 *
19784		 * FIXME: Warn if state is LAZY_OFF? It won't break anything, but
19785		 * makes no sense...
19786		 */
19787		if (!PE_parse_boot_argn("dtrace_dof_mode", &dtrace_dof_mode, sizeof (dtrace_dof_mode))) {
19788			dtrace_dof_mode = DTRACE_DOF_MODE_LAZY_ON;
19789		}
19790
19791		/*
19792		 * Sanity check of dof mode value.
19793		 */
19794		switch (dtrace_dof_mode) {
19795			case DTRACE_DOF_MODE_NEVER:
19796			case DTRACE_DOF_MODE_LAZY_ON:
19797				/* valid modes, but nothing else we need to do */
19798				break;
19799
19800			case DTRACE_DOF_MODE_LAZY_OFF:
19801			case DTRACE_DOF_MODE_NON_LAZY:
19802				/* Cannot wait for a dtrace_open to init fasttrap */
19803				fasttrap_init();
19804				break;
19805
19806			default:
19807				/* Invalid, clamp to non lazy */
19808				dtrace_dof_mode = DTRACE_DOF_MODE_NON_LAZY;
19809				fasttrap_init();
19810				break;
19811		}
19812
19813		/*
19814		 * See dtrace_impl.h for a description of kernel symbol modes.
19815		 * The default is to wait for symbols from userspace (lazy symbols).
19816		 */
19817		if (!PE_parse_boot_argn("dtrace_kernel_symbol_mode", &dtrace_kernel_symbol_mode, sizeof (dtrace_kernel_symbol_mode))) {
19818			dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE;
19819		}
19820
19821		gDTraceInited = 1;
19822
19823	} else
19824		panic("dtrace_init: called twice!\n");
19825}
19826
19827void
19828dtrace_postinit(void)
19829{
19830	/*
19831	 * Called from bsd_init after all providers' *_init() routines have been
19832	 * run. That way, anonymous DOF enabled under dtrace_attach() is safe
19833	 * to go.
19834	 */
19835	dtrace_attach( (dev_info_t *)(uintptr_t)makedev(gMajDevNo, 0), 0 ); /* Punning a dev_t to a dev_info_t* */
19836
19837	/*
19838	 * Add the mach_kernel to the module list for lazy processing
19839	 */
19840	struct kmod_info fake_kernel_kmod;
19841	memset(&fake_kernel_kmod, 0, sizeof(fake_kernel_kmod));
19842
19843	strlcpy(fake_kernel_kmod.name, "mach_kernel", sizeof(fake_kernel_kmod.name));
19844	fake_kernel_kmod.id = 1;
19845	fake_kernel_kmod.address = g_kernel_kmod_info.address;
19846	fake_kernel_kmod.size = g_kernel_kmod_info.size;
19847
19848	if (dtrace_module_loaded(&fake_kernel_kmod, 0) != 0) {
19849		printf("dtrace_postinit: Could not register mach_kernel modctl\n");
19850	}
19851
19852	(void)OSKextRegisterKextsWithDTrace();
19853}
19854#undef DTRACE_MAJOR
19855
19856/*
19857 * Routines used to register interest in CPUs being added to or removed
19858 * from the system.
19859 */
19860void
19861register_cpu_setup_func(cpu_setup_func_t *ignore1, void *ignore2)
19862{
19863#pragma unused(ignore1,ignore2)
19864}
19865
19866void
19867unregister_cpu_setup_func(cpu_setup_func_t *ignore1, void *ignore2)
19868{
19869#pragma unused(ignore1,ignore2)
19870}
19871#endif /* __APPLE__ */
19872