--- dtrace.c	(268572)
+++ dtrace.c	(268578)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 *
- * $FreeBSD: stable/10/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c 268572 2014-07-12 18:23:35Z pfg $
+ * $FreeBSD: stable/10/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c 268578 2014-07-12 22:56:41Z rpaulo $
 */

/*
 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
 * Copyright (c) 2012 by Delphix. All rights reserved.
 */

/*
 * DTrace - Dynamic Tracing for Solaris
 *
 * This is the implementation of the Solaris Dynamic Tracing framework
 * (DTrace). The user-visible interface to DTrace is described at length in
 * the "Solaris Dynamic Tracing Guide". The interfaces between the libdtrace
 * library, the in-kernel DTrace framework, and the DTrace providers are
 * described in the block comments in the <sys/dtrace.h> header file. The
 * internal architecture of DTrace is described in the block comments in the
 * <sys/dtrace_impl.h> header file. The comments contained within the DTrace
 * implementation very much assume mastery of all of these sources; if one has
 * an unanswered question about the implementation, one should consult them
 * first.
 *
 * The functions here are ordered roughly as follows:
 *
 *   - Probe context functions
 *   - Probe hashing functions
 *   - Non-probe context utility functions
 *   - Matching functions
 *   - Provider-to-Framework API functions
 *   - Probe management functions
 *   - DIF object functions
 *   - Format functions
 *   - Predicate functions
 *   - ECB functions
 *   - Buffer functions
 *   - Enabling functions
 *   - DOF functions
 *   - Anonymous enabling functions
 *   - Consumer state functions
 *   - Helper functions
 *   - Hook functions
 *   - Driver cookbook functions
 *
 * Each group of functions begins with a block comment labelled the "DTrace
 * [Group] Functions", allowing one to find each block by searching forward
 * on capital-f functions.
 */
#include <sys/errno.h>
#if !defined(sun)
#include <sys/time.h>
#endif
#include <sys/stat.h>
#include <sys/modctl.h>
#include <sys/conf.h>
#include <sys/systm.h>
#if defined(sun)
#include <sys/ddi.h>
#include <sys/sunddi.h>
#endif
#include <sys/cpuvar.h>
#include <sys/kmem.h>
#if defined(sun)
#include <sys/strsubr.h>
#endif
#include <sys/sysmacros.h>
#include <sys/dtrace_impl.h>
#include <sys/atomic.h>
#include <sys/cmn_err.h>
#if defined(sun)
#include <sys/mutex_impl.h>
#include <sys/rwlock_impl.h>
#endif
#include <sys/ctf_api.h>
#if defined(sun)
#include <sys/panic.h>
#include <sys/priv_impl.h>
#endif
#include <sys/policy.h>
#if defined(sun)
#include <sys/cred_impl.h>
#include <sys/procfs_isa.h>
#endif
#include <sys/taskq.h>
#if defined(sun)
#include <sys/mkdev.h>
#include <sys/kdi.h>
#endif
#include <sys/zone.h>
#include <sys/socket.h>
#include <netinet/in.h>
+#include "strtolctype.h"

/* FreeBSD includes: */
#if !defined(sun)
#include <sys/callout.h>
#include <sys/ctype.h>
#include <sys/eventhandler.h>
#include <sys/limits.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/sysctl.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/rwlock.h>
#include <sys/sx.h>
#include <sys/dtrace_bsd.h>
#include <netinet/in.h>
#include "dtrace_cddl.h"
#include "dtrace_debug.c"
#endif

/*
 * DTrace Tunable Variables
 *
 * The following variables may be tuned by adding a line to /etc/system that
 * includes both the name of the DTrace module ("dtrace") and the name of the
 * variable. For example:
 *
 *   set dtrace:dtrace_destructive_disallow = 1
 *
 * In general, the only variables that one should be tuning this way are those
 * that affect system-wide DTrace behavior, and for which the default behavior
 * is undesirable. Most of these variables are tunable on a per-consumer
 * basis using DTrace options, and need not be tuned on a system-wide basis.
 * When tuning these variables, avoid pathological values; while some attempt
 * is made to verify the integrity of these variables, they are not considered
 * part of the supported interface to DTrace, and they are therefore not
 * checked comprehensively. Further, these variables should not be tuned
 * dynamically via "mdb -kw" or other means; they should only be tuned via
 * /etc/system.
 */
int dtrace_destructive_disallow = 0;
dtrace_optval_t dtrace_nonroot_maxsize = (16 * 1024 * 1024);
size_t dtrace_difo_maxsize = (256 * 1024);
dtrace_optval_t dtrace_dof_maxsize = (8 * 1024 * 1024);
size_t dtrace_global_maxsize = (16 * 1024);
size_t dtrace_actions_max = (16 * 1024);
size_t dtrace_retain_max = 1024;
dtrace_optval_t dtrace_helper_actions_max = 128;
dtrace_optval_t dtrace_helper_providers_max = 32;
dtrace_optval_t dtrace_dstate_defsize = (1 * 1024 * 1024);
size_t dtrace_strsize_default = 256;
dtrace_optval_t dtrace_cleanrate_default = 9900990;		/* 101 hz */
dtrace_optval_t dtrace_cleanrate_min = 200000;			/* 5000 hz */
dtrace_optval_t dtrace_cleanrate_max = (uint64_t)60 * NANOSEC;	/* 1/minute */
dtrace_optval_t dtrace_aggrate_default = NANOSEC;		/* 1 hz */
dtrace_optval_t dtrace_statusrate_default = NANOSEC;		/* 1 hz */
dtrace_optval_t dtrace_statusrate_max = (hrtime_t)10 * NANOSEC;	/* 6/minute */
dtrace_optval_t dtrace_switchrate_default = NANOSEC;		/* 1 hz */
dtrace_optval_t dtrace_nspec_default = 1;
dtrace_optval_t dtrace_specsize_default = 32 * 1024;
dtrace_optval_t dtrace_stackframes_default = 20;
dtrace_optval_t dtrace_ustackframes_default = 20;
dtrace_optval_t dtrace_jstackframes_default = 50;
dtrace_optval_t dtrace_jstackstrsize_default = 512;
int dtrace_msgdsize_max = 128;
hrtime_t dtrace_chill_max = 500 * (NANOSEC / MILLISEC);		/* 500 ms */
hrtime_t dtrace_chill_interval = NANOSEC;			/* 1000 ms */
int dtrace_devdepth_max = 32;
int dtrace_err_verbose;
hrtime_t dtrace_deadman_interval = NANOSEC;
hrtime_t dtrace_deadman_timeout = (hrtime_t)10 * NANOSEC;
hrtime_t dtrace_deadman_user = (hrtime_t)30 * NANOSEC;
hrtime_t dtrace_unregister_defunct_reap = (hrtime_t)60 * NANOSEC;

/*
 * DTrace External Variables
 *
 * As dtrace(7D) is a kernel module, any DTrace variables are obviously
 * available to DTrace consumers via the backtick (`) syntax. One of these,
 * dtrace_zero, is made deliberately so: it is provided as a source of
 * well-known, zero-filled memory. While this variable is not documented,
 * it is used by some translators as an implementation detail.
 */
const char dtrace_zero[256] = { 0 };	/* zero-filled memory */

/*
 * DTrace Internal Variables
 */
#if defined(sun)
static dev_info_t *dtrace_devi;		/* device info */
#endif
#if defined(sun)
static vmem_t *dtrace_arena;		/* probe ID arena */
static vmem_t *dtrace_minor;		/* minor number arena */
#else
static taskq_t *dtrace_taskq;		/* task queue */
static struct unrhdr *dtrace_arena;	/* Probe ID number. */
#endif
static dtrace_probe_t **dtrace_probes;	/* array of all probes */
static int dtrace_nprobes;		/* number of probes */
static dtrace_provider_t *dtrace_provider;	/* provider list */
static dtrace_meta_t *dtrace_meta_pid;	/* user-land meta provider */
static int dtrace_opens;		/* number of opens */
static int dtrace_helpers;		/* number of helpers */
+static int dtrace_getf;			/* number of unpriv getf()s */
#if defined(sun)
static void *dtrace_softstate;		/* softstate pointer */
#endif
static dtrace_hash_t *dtrace_bymod;	/* probes hashed by module */
static dtrace_hash_t *dtrace_byfunc;	/* probes hashed by function */
static dtrace_hash_t *dtrace_byname;	/* probes hashed by name */
static dtrace_toxrange_t *dtrace_toxrange;	/* toxic range array */
static int dtrace_toxranges;		/* number of toxic ranges */
static int dtrace_toxranges_max;	/* size of toxic range array */
static dtrace_anon_t dtrace_anon;	/* anonymous enabling */
static kmem_cache_t *dtrace_state_cache;	/* cache for dynamic state */
static uint64_t dtrace_vtime_references;	/* number of vtimestamp refs */
static kthread_t *dtrace_panicked;	/* panicking thread */
static dtrace_ecb_t *dtrace_ecb_create_cache;	/* cached created ECB */
static dtrace_genid_t dtrace_probegen;	/* current probe generation */
static dtrace_helpers_t *dtrace_deferred_pid;	/* deferred helper list */
static dtrace_enabling_t *dtrace_retained;	/* list of retained enablings */
static dtrace_genid_t dtrace_retained_gen;	/* current retained enab gen */
static dtrace_dynvar_t dtrace_dynhash_sink;	/* end of dynamic hash chains */
#if !defined(sun)
static struct mtx dtrace_unr_mtx;
MTX_SYSINIT(dtrace_unr_mtx, &dtrace_unr_mtx, "Unique resource identifier", MTX_DEF);
int dtrace_in_probe;			/* non-zero if executing a probe */
#if defined(__i386__) || defined(__amd64__) || defined(__mips__) || defined(__powerpc__)
uintptr_t dtrace_in_probe_addr;		/* Address of invop when already in probe */
#endif
static eventhandler_tag dtrace_kld_load_tag;
static eventhandler_tag dtrace_kld_unload_try_tag;
#endif

/*
 * DTrace Locking
 * DTrace is protected by three (relatively coarse-grained) locks:
 *
 * (1) dtrace_lock is required to manipulate essentially any DTrace state,
 *     including enabling state, probes, ECBs, consumer state, helper state,
 *     etc. Importantly, dtrace_lock is _not_ required when in probe context;
 *     probe context is lock-free -- synchronization is handled via the
 *     dtrace_sync() cross call mechanism.
 *
 * (2) dtrace_provider_lock is required when manipulating provider state, or
 *     when provider state must be held constant.
 *
 * (3) dtrace_meta_lock is required when manipulating meta provider state, or
 *     when meta provider state must be held constant.
 *
 * The lock ordering between these three locks is dtrace_meta_lock before
 * dtrace_provider_lock before dtrace_lock. (In particular, there are
 * several places where dtrace_provider_lock is held by the framework as it
 * calls into the providers -- which then call back into the framework,
 * grabbing dtrace_lock.)
 *
 * There are two other locks in the mix: mod_lock and cpu_lock. With respect
 * to dtrace_provider_lock and dtrace_lock, cpu_lock continues its historical
 * role as a coarse-grained lock; it is acquired before both of these locks.
 * With respect to dtrace_meta_lock, its behavior is stranger: cpu_lock must
 * be acquired _between_ dtrace_meta_lock and any other DTrace locks.
 * mod_lock is similar with respect to dtrace_provider_lock in that it must be
 * acquired _between_ dtrace_provider_lock and dtrace_lock.
 */
static kmutex_t dtrace_lock;		/* probe state lock */
static kmutex_t dtrace_provider_lock;	/* provider state lock */
static kmutex_t dtrace_meta_lock;	/* meta-provider state lock */

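/*
 * Illustrative sketch of the ordering documented above (hypothetical
 * caller, not taken from this file): a framework path that needs both
 * provider and probe state would nest the locks as
 *
 *	mutex_enter(&dtrace_provider_lock);
 *	mutex_enter(&dtrace_lock);
 *	... manipulate probe and enabling state ...
 *	mutex_exit(&dtrace_lock);
 *	mutex_exit(&dtrace_provider_lock);
 */
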
#if !defined(sun)
/* XXX FreeBSD hacks. */
#define cr_suid		cr_svuid
#define cr_sgid		cr_svgid
#define ipaddr_t	in_addr_t
#define mod_modname	pathname
#define vuprintf	vprintf
#define ttoproc(_a)	((_a)->td_proc)
#define crgetzoneid(_a)	0
#define NCPU		MAXCPU
#define SNOCD		0
#define CPU_ON_INTR(_a)	0

#define PRIV_EFFECTIVE		(1 << 0)
#define PRIV_DTRACE_KERNEL	(1 << 1)
#define PRIV_DTRACE_PROC	(1 << 2)
#define PRIV_DTRACE_USER	(1 << 3)
#define PRIV_PROC_OWNER		(1 << 4)
#define PRIV_PROC_ZONE		(1 << 5)
#define PRIV_ALL		~0

SYSCTL_DECL(_debug_dtrace);
SYSCTL_DECL(_kern_dtrace);
#endif

#if defined(sun)
#define curcpu	CPU->cpu_id
#endif


/*
 * DTrace Provider Variables
 *
 * These are the variables relating to DTrace as a provider (that is, the
 * provider of the BEGIN, END, and ERROR probes).
 */
static dtrace_pattr_t	dtrace_provider_attr = {
{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
};

static void
dtrace_nullop(void)
{}

static dtrace_pops_t	dtrace_provider_ops = {
	(void (*)(void *, dtrace_probedesc_t *))dtrace_nullop,
	(void (*)(void *, modctl_t *))dtrace_nullop,
	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
	NULL,
	NULL,
	NULL,
	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop
};

static dtrace_id_t	dtrace_probeid_begin;	/* special BEGIN probe */
static dtrace_id_t	dtrace_probeid_end;	/* special END probe */
dtrace_id_t		dtrace_probeid_error;	/* special ERROR probe */

/*
 * DTrace Helper Tracing Variables
 */
uint32_t dtrace_helptrace_next = 0;
uint32_t dtrace_helptrace_nlocals;
char	*dtrace_helptrace_buffer;
int	dtrace_helptrace_bufsize = 512 * 1024;

#ifdef DEBUG
int	dtrace_helptrace_enabled = 1;
#else
int	dtrace_helptrace_enabled = 0;
#endif

/*
 * DTrace Error Hashing
 *
 * On DEBUG kernels, DTrace will track the errors that it has seen in a hash
 * table. This is very useful for checking coverage of tests that are
 * expected to induce DIF or DOF processing errors, and may be useful for
 * debugging problems in the DIF code generator or in DOF generation. The
 * error hash may be examined with the ::dtrace_errhash MDB dcmd.
 */
#ifdef DEBUG
static dtrace_errhash_t	dtrace_errhash[DTRACE_ERRHASHSZ];
static const char *dtrace_errlast;
static kthread_t *dtrace_errthread;
static kmutex_t dtrace_errlock;
#endif

/*
 * DTrace Macros and Constants
 *
 * These are various macros that are useful in various spots in the
 * implementation, along with a few random constants that have no meaning
 * outside of the implementation. There is no real structure to this cpp
 * mishmash -- but is there ever?
 */
#define DTRACE_HASHSTR(hash, probe)	\
	dtrace_hash_str(*((char **)((uintptr_t)(probe) + (hash)->dth_stroffs)))

#define DTRACE_HASHNEXT(hash, probe)	\
	(dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_nextoffs)

#define DTRACE_HASHPREV(hash, probe)	\
	(dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_prevoffs)

#define DTRACE_HASHEQ(hash, lhs, rhs)	\
	(strcmp(*((char **)((uintptr_t)(lhs) + (hash)->dth_stroffs)), \
	    *((char **)((uintptr_t)(rhs) + (hash)->dth_stroffs))) == 0)

#define DTRACE_AGGHASHSIZE_SLEW		17

#define DTRACE_V4MAPPED_OFFSET		(sizeof (uint32_t) * 3)

/*
 * The key for a thread-local variable consists of the lower 61 bits of the
 * t_did, plus the 3 bits of the highest active interrupt above LOCK_LEVEL.
 * We add DIF_VARIABLE_MAX to t_did to assure that the thread key is never
 * equal to a variable identifier. This is necessary (but not sufficient) to
 * assure that global associative arrays never collide with thread-local
 * variables. To guarantee that they cannot collide, we must also define the
 * order for keying dynamic variables. That order is:
 *
 *   [ key0 ] ... [ keyn ] [ variable-key ] [ tls-key ]
 *
 * Because the variable-key and the tls-key are in orthogonal spaces, there is
 * no way for a global variable key signature to match a thread-local key
 * signature.
 */
#if defined(sun)
#define DTRACE_TLS_THRKEY(where) { \
	uint_t intr = 0; \
	uint_t actv = CPU->cpu_intr_actv >> (LOCK_LEVEL + 1); \
	for (; actv; actv >>= 1) \
		intr++; \
	ASSERT(intr < (1 << 3)); \
	(where) = ((curthread->t_did + DIF_VARIABLE_MAX) & \
	    (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
}
#else
#define DTRACE_TLS_THRKEY(where) { \
	solaris_cpu_t *_c = &solaris_cpu[curcpu]; \
	uint_t intr = 0; \
	uint_t actv = _c->cpu_intr_actv; \
	for (; actv; actv >>= 1) \
		intr++; \
	ASSERT(intr < (1 << 3)); \
	(where) = ((curthread->td_tid + DIF_VARIABLE_MAX) & \
	    (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
}
#endif

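/*
 * Usage sketch (assumed; modeled on how the DIF emulator keys
 * thread-local loads and stores): the computed key becomes one element
 * of a dynamic-variable key tuple alongside the variable identifier,
 * in the order described above, e.g.
 *
 *	key[0].dttk_value = (uint64_t)id;
 *	key[0].dttk_size = 0;
 *	DTRACE_TLS_THRKEY(key[1].dttk_value);
 *	key[1].dttk_size = 0;
 */
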
#define DT_BSWAP_8(x)	((x) & 0xff)
#define DT_BSWAP_16(x)	((DT_BSWAP_8(x) << 8) | DT_BSWAP_8((x) >> 8))
#define DT_BSWAP_32(x)	((DT_BSWAP_16(x) << 16) | DT_BSWAP_16((x) >> 16))
#define DT_BSWAP_64(x)	((DT_BSWAP_32(x) << 32) | DT_BSWAP_32((x) >> 32))

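/*
 * Worked example of the byte-swap macros: DT_BSWAP_16(0x1234) yields
 * 0x3412, and DT_BSWAP_32(0x12345678) yields 0x78563412 -- each level
 * swaps the two halves produced by the level below it.
 */
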
#define DT_MASK_LO 0x00000000FFFFFFFFULL

#define DTRACE_STORE(type, tomax, offset, what) \
	*((type *)((uintptr_t)(tomax) + (uintptr_t)offset)) = (type)(what);

#ifndef __x86
#define DTRACE_ALIGNCHECK(addr, size, flags)				\
	if (addr & (size - 1)) {					\
		*flags |= CPU_DTRACE_BADALIGN;				\
		cpu_core[curcpu].cpuc_dtrace_illval = addr;		\
		return (0);						\
	}
#else
#define DTRACE_ALIGNCHECK(addr, size, flags)
#endif

/*
 * Test whether a range of memory starting at testaddr of size testsz falls
 * within the range of memory described by addr, sz. We take care to avoid
 * problems with overflow and underflow of the unsigned quantities, and
 * disallow all negative sizes. Ranges of size 0 are allowed.
 */
#define DTRACE_INRANGE(testaddr, testsz, baseaddr, basesz) \
-	((testaddr) - (baseaddr) < (basesz) && \
-	(testaddr) + (testsz) - (baseaddr) <= (basesz) && \
+	((testaddr) - (uintptr_t)(baseaddr) < (basesz) && \
+	(testaddr) + (testsz) - (uintptr_t)(baseaddr) <= (basesz) && \
	(testaddr) + (testsz) >= (testaddr))

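/*
 * For example, with baseaddr 0x1000 and basesz 0x100, a testaddr of
 * 0x10ff with testsz 1 is accepted (0xff < 0x100 and 0x100 <= 0x100),
 * while testsz 2 fails the second comparison. A testaddr below
 * baseaddr underflows the unsigned subtraction to a huge value and
 * fails the first comparison, as intended.
 */
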
/*
 * Test whether alloc_sz bytes will fit in the scratch region. We isolate
 * alloc_sz on the righthand side of the comparison in order to avoid overflow
 * or underflow in the comparison with it. This is simpler than the INRANGE
 * check above, because we know that the dtms_scratch_ptr is valid in the
 * range. Allocations of size zero are allowed.
 */
#define DTRACE_INSCRATCH(mstate, alloc_sz) \
	((mstate)->dtms_scratch_base + (mstate)->dtms_scratch_size - \
	    (mstate)->dtms_scratch_ptr >= (alloc_sz))

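/*
 * For instance, with a 64-byte scratch region of which 16 bytes are
 * already consumed (dtms_scratch_ptr == dtms_scratch_base + 16),
 * DTRACE_INSCRATCH accepts alloc_sz values up to 48 and rejects 49;
 * because alloc_sz stands alone on the right-hand side, no value of it
 * can overflow the comparison.
 */
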
#define DTRACE_LOADFUNC(bits)						\
/*CSTYLED*/								\
uint##bits##_t								\
dtrace_load##bits(uintptr_t addr)					\
{									\
	size_t size = bits / NBBY;					\
	/*CSTYLED*/							\
	uint##bits##_t rval;						\
	int i;								\
	volatile uint16_t *flags = (volatile uint16_t *)		\
	    &cpu_core[curcpu].cpuc_dtrace_flags;			\
									\
	DTRACE_ALIGNCHECK(addr, size, flags);				\
									\
	for (i = 0; i < dtrace_toxranges; i++) {			\
		if (addr >= dtrace_toxrange[i].dtt_limit)		\
			continue;					\
									\
		if (addr + size <= dtrace_toxrange[i].dtt_base)		\
			continue;					\
									\
		/*							\
		 * This address falls within a toxic region; return 0.	\
		 */							\
		*flags |= CPU_DTRACE_BADADDR;				\
		cpu_core[curcpu].cpuc_dtrace_illval = addr;		\
		return (0);						\
	}								\
									\
	*flags |= CPU_DTRACE_NOFAULT;					\
	/*CSTYLED*/							\
	rval = *((volatile uint##bits##_t *)addr);			\
	*flags &= ~CPU_DTRACE_NOFAULT;					\
									\
	return (!(*flags & CPU_DTRACE_FAULT) ? rval : 0);		\
}

#ifdef _LP64
#define dtrace_loadptr	dtrace_load64
#else
#define dtrace_loadptr	dtrace_load32
#endif

#define DTRACE_DYNHASH_FREE	0
#define DTRACE_DYNHASH_SINK	1
#define DTRACE_DYNHASH_VALID	2

#define DTRACE_MATCH_NEXT	0
#define DTRACE_MATCH_DONE	1
#define DTRACE_ANCHORED(probe)	((probe)->dtpr_func[0] != '\0')
#define DTRACE_STATE_ALIGN	64

#define DTRACE_FLAGS2FLT(flags)						\
	(((flags) & CPU_DTRACE_BADADDR) ? DTRACEFLT_BADADDR :		\
	((flags) & CPU_DTRACE_ILLOP) ? DTRACEFLT_ILLOP :		\
	((flags) & CPU_DTRACE_DIVZERO) ? DTRACEFLT_DIVZERO :		\
	((flags) & CPU_DTRACE_KPRIV) ? DTRACEFLT_KPRIV :		\
	((flags) & CPU_DTRACE_UPRIV) ? DTRACEFLT_UPRIV :		\
	((flags) & CPU_DTRACE_TUPOFLOW) ? DTRACEFLT_TUPOFLOW :		\
	((flags) & CPU_DTRACE_BADALIGN) ? DTRACEFLT_BADALIGN :		\
	((flags) & CPU_DTRACE_NOSCRATCH) ? DTRACEFLT_NOSCRATCH :	\
	((flags) & CPU_DTRACE_BADSTACK) ? DTRACEFLT_BADSTACK :		\
	DTRACEFLT_UNKNOWN)

#define DTRACEACT_ISSTRING(act)						\
	((act)->dta_kind == DTRACEACT_DIFEXPR &&			\
	(act)->dta_difo->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING)

/* Function prototype definitions: */
static size_t dtrace_strlen(const char *, size_t);
static dtrace_probe_t *dtrace_probe_lookup_id(dtrace_id_t id);
static void dtrace_enabling_provide(dtrace_provider_t *);
static int dtrace_enabling_match(dtrace_enabling_t *, int *);
static void dtrace_enabling_matchall(void);
static void dtrace_enabling_reap(void);
static dtrace_state_t *dtrace_anon_grab(void);
static uint64_t dtrace_helper(int, dtrace_mstate_t *,
    dtrace_state_t *, uint64_t, uint64_t);
static dtrace_helpers_t *dtrace_helpers_create(proc_t *);
static void dtrace_buffer_drop(dtrace_buffer_t *);
static int dtrace_buffer_consumed(dtrace_buffer_t *, hrtime_t when);
static intptr_t dtrace_buffer_reserve(dtrace_buffer_t *, size_t, size_t,
    dtrace_state_t *, dtrace_mstate_t *);
static int dtrace_state_option(dtrace_state_t *, dtrace_optid_t,
    dtrace_optval_t);
static int dtrace_ecb_create_enable(dtrace_probe_t *, void *);
static void dtrace_helper_provider_destroy(dtrace_helper_provider_t *);
uint16_t dtrace_load16(uintptr_t);
uint32_t dtrace_load32(uintptr_t);
uint64_t dtrace_load64(uintptr_t);
uint8_t dtrace_load8(uintptr_t);
void dtrace_dynvar_clean(dtrace_dstate_t *);
dtrace_dynvar_t *dtrace_dynvar(dtrace_dstate_t *, uint_t, dtrace_key_t *,
    size_t, dtrace_dynvar_op_t, dtrace_mstate_t *, dtrace_vstate_t *);
uintptr_t dtrace_dif_varstr(uintptr_t, dtrace_state_t *, dtrace_mstate_t *);
+static int dtrace_priv_proc(dtrace_state_t *);
+static void dtrace_getf_barrier(void);

/*
 * DTrace Probe Context Functions
 *
 * These functions are called from probe context. Because probe context is
 * any context in which C may be called, arbitrary locks may be held,
 * interrupts may be disabled, we may be in arbitrary dispatched state, etc.
 * As a result, functions called from probe context may only call other DTrace
 * support functions -- they may not interact at all with the system at large.
 * (Note that the ASSERT macro is made probe-context safe by redefining it in
 * terms of dtrace_assfail(), a probe-context safe function.) If arbitrary
 * loads are to be performed from probe context, they _must_ be in terms of
 * the safe dtrace_load*() variants.
 *
 * Some functions in this block are not actually called from probe context;
 * for these functions, there will be a comment above the function reading
 * "Note: not called from probe context."
 */
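/*
 * For example, probe-context code must fetch a byte as
 *
 *	uint8_t b = dtrace_load8(addr);
 *
 * rather than dereferencing addr directly; the dtrace_load*() routines
 * generated by DTRACE_LOADFUNC above check the toxic ranges and fault
 * gracefully under CPU_DTRACE_NOFAULT.
 */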
void
dtrace_panic(const char *format, ...)
{
	va_list alist;

	va_start(alist, format);
	dtrace_vpanic(format, alist);
	va_end(alist);
}

int
dtrace_assfail(const char *a, const char *f, int l)
{
	dtrace_panic("assertion failed: %s, file: %s, line: %d", a, f, l);

	/*
	 * We just need something here that even the most clever compiler
	 * cannot optimize away.
	 */
	return (a[(uintptr_t)f]);
}

/*
 * Atomically increment a specified error counter from probe context.
 */
static void
dtrace_error(uint32_t *counter)
{
	/*
	 * Most counters stored to in probe context are per-CPU counters.
	 * However, there are some error conditions that are sufficiently
	 * arcane that they don't merit per-CPU storage. If these counters
	 * are incremented concurrently on different CPUs, scalability will be
	 * adversely affected -- but we don't expect them to be white-hot in a
	 * correctly constructed enabling...
	 */
	uint32_t oval, nval;

	do {
		oval = *counter;

		if ((nval = oval + 1) == 0) {
			/*
			 * If the counter would wrap, set it to 1 -- assuring
			 * that the counter is never zero when we have seen
			 * errors. (The counter must be 32-bits because we
			 * aren't guaranteed a 64-bit compare&swap operation.)
			 * To save this code both the infamy of being fingered
			 * by a priggish news story and the indignity of being
			 * the target of a neo-puritan witch trial, we're
			 * carefully avoiding any colorful description of the
			 * likelihood of this condition -- but suffice it to
			 * say that it is only slightly more likely than the
			 * overflow of predicate cache IDs, as discussed in
			 * dtrace_predicate_create().
			 */
			nval = 1;
		}
	} while (dtrace_cas32(counter, oval, nval) != oval);
}

/*
 * Use the DTRACE_LOADFUNC macro to define functions for each of loading a
 * uint8_t, a uint16_t, a uint32_t and a uint64_t.
 */
DTRACE_LOADFUNC(8)
DTRACE_LOADFUNC(16)
DTRACE_LOADFUNC(32)
DTRACE_LOADFUNC(64)

static int
dtrace_inscratch(uintptr_t dest, size_t size, dtrace_mstate_t *mstate)
{
	if (dest < mstate->dtms_scratch_base)
		return (0);

	if (dest + size < dest)
		return (0);

	if (dest + size > mstate->dtms_scratch_ptr)
		return (0);

	return (1);
}

static int
dtrace_canstore_statvar(uint64_t addr, size_t sz,
    dtrace_statvar_t **svars, int nsvars)
{
	int i;

	for (i = 0; i < nsvars; i++) {
		dtrace_statvar_t *svar = svars[i];

		if (svar == NULL || svar->dtsv_size == 0)
			continue;

		if (DTRACE_INRANGE(addr, sz, svar->dtsv_data, svar->dtsv_size))
			return (1);
	}

	return (0);
}

/*
 * Check to see if the address is within a memory region to which a store may
 * be issued. This includes the DTrace scratch areas, and any DTrace variable
 * region. The caller of dtrace_canstore() is responsible for performing any
 * alignment checks that are needed before stores are actually executed.
 */
static int
dtrace_canstore(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
    dtrace_vstate_t *vstate)
{
	/*
	 * First, check to see if the address is in scratch space...
	 */
	if (DTRACE_INRANGE(addr, sz, mstate->dtms_scratch_base,
	    mstate->dtms_scratch_size))
		return (1);

	/*
	 * Now check to see if it's a dynamic variable. This check will pick
	 * up both thread-local variables and any global dynamically-allocated
	 * variables.
	 */
-	if (DTRACE_INRANGE(addr, sz, (uintptr_t)vstate->dtvs_dynvars.dtds_base,
+	if (DTRACE_INRANGE(addr, sz, vstate->dtvs_dynvars.dtds_base,
	    vstate->dtvs_dynvars.dtds_size)) {
		dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
		uintptr_t base = (uintptr_t)dstate->dtds_base +
		    (dstate->dtds_hashsize * sizeof (dtrace_dynhash_t));
		uintptr_t chunkoffs;

		/*
		 * Before we assume that we can store here, we need to make
		 * sure that it isn't in our metadata -- storing to our
		 * dynamic variable metadata would corrupt our state. For
		 * the range to not include any dynamic variable metadata,
		 * it must:
		 *
		 *	(1) Start above the hash table that is at the base of
		 *	the dynamic variable space
		 *
		 *	(2) Have a starting chunk offset that is beyond the
		 *	dtrace_dynvar_t that is at the base of every chunk
		 *
		 *	(3) Not span a chunk boundary
		 *
		 */
		if (addr < base)
			return (0);

		chunkoffs = (addr - base) % dstate->dtds_chunksize;

		if (chunkoffs < sizeof (dtrace_dynvar_t))
			return (0);

		if (chunkoffs + sz > dstate->dtds_chunksize)
			return (0);

		return (1);
	}

	/*
	 * Finally, check the static local and global variables. These checks
	 * take the longest, so we perform them last.
	 */
	if (dtrace_canstore_statvar(addr, sz,
	    vstate->dtvs_locals, vstate->dtvs_nlocals))
		return (1);

	if (dtrace_canstore_statvar(addr, sz,
	    vstate->dtvs_globals, vstate->dtvs_nglobals))
		return (1);

	return (0);
}


/*
 * Convenience routine to check to see if the address is within a memory
 * region in which a load may be issued given the user's privilege level;
 * if not, it sets the appropriate error flags and loads 'addr' into the
 * illegal value slot.
 *
 * DTrace subroutines (DIF_SUBR_*) should use this helper to implement
 * appropriate memory access protection.
 */
static int
dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
    dtrace_vstate_t *vstate)
{
	volatile uintptr_t *illval = &cpu_core[curcpu].cpuc_dtrace_illval;
726 vstate->dtvs_dynvars.dtds_size)) {
727 dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
728 uintptr_t base = (uintptr_t)dstate->dtds_base +
729 (dstate->dtds_hashsize * sizeof (dtrace_dynhash_t));
730 uintptr_t chunkoffs;
731
732 /*
733 * Before we assume that we can store here, we need to make
734 * sure that it isn't in our metadata -- storing to our
735 * dynamic variable metadata would corrupt our state. For
736 * the range to not include any dynamic variable metadata,
737 * it must:
738 *
739 * (1) Start above the hash table that is at the base of
740 * the dynamic variable space
741 *
742 * (2) Have a starting chunk offset that is beyond the
743 * dtrace_dynvar_t that is at the base of every chunk
744 *
745 * (3) Not span a chunk boundary
746 *
747 */
748 if (addr < base)
749 return (0);
750
751 chunkoffs = (addr - base) % dstate->dtds_chunksize;
752
753 if (chunkoffs < sizeof (dtrace_dynvar_t))
754 return (0);
755
756 if (chunkoffs + sz > dstate->dtds_chunksize)
757 return (0);
758
759 return (1);
760 }
761
762 /*
763 * Finally, check the static local and global variables. These checks
764 * take the longest, so we perform them last.
765 */
766 if (dtrace_canstore_statvar(addr, sz,
767 vstate->dtvs_locals, vstate->dtvs_nlocals))
768 return (1);
769
770 if (dtrace_canstore_statvar(addr, sz,
771 vstate->dtvs_globals, vstate->dtvs_nglobals))
772 return (1);
773
774 return (0);
775}
776
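/*
 * To make the chunk checks above concrete with made-up numbers: if
 * dtds_chunksize were 256 bytes and sizeof (dtrace_dynvar_t) were 64,
 * a store at chunk offset 40 would fail check (2) by landing in the
 * dtrace_dynvar_t at the base of the chunk, and a 32-byte store at
 * offset 240 would fail check (3) by spilling into the next chunk;
 * only stores falling entirely within offsets [64, 256) of a single
 * chunk are permitted.
 */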
777
778/*
779 * Convenience routine to check to see if the address is within a memory
780 * region in which a load may be issued given the user's privilege level;
781 * if not, it sets the appropriate error flags and loads 'addr' into the
782 * illegal value slot.
783 *
784 * DTrace subroutines (DIF_SUBR_*) should use this helper to implement
785 * appropriate memory access protection.
786 */
787static int
788dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
789 dtrace_vstate_t *vstate)
790{
791 volatile uintptr_t *illval = &cpu_core[curcpu].cpuc_dtrace_illval;
792 file_t *fp;
793
794 /*
795 * If we hold the privilege to read from kernel memory, then
796 * everything is readable.
797 */
798 if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
799 return (1);
800
801 /*
802 * You can obviously read that which you can store.
803 */
804 if (dtrace_canstore(addr, sz, mstate, vstate))
805 return (1);
806
807 /*
808 * We're allowed to read from our own string table.
809 */
810 if (DTRACE_INRANGE(addr, sz, mstate->dtms_difo->dtdo_strtab,
811 mstate->dtms_difo->dtdo_strlen))
812 return (1);
813
814 if (vstate->dtvs_state != NULL &&
815 dtrace_priv_proc(vstate->dtvs_state)) {
816 proc_t *p;
817
818 /*
819 * When we have privileges to the current process, there are
820 * several context-related kernel structures that are safe to
821 * read, even absent the privilege to read from kernel memory.
822 * These reads are safe because these structures contain only
823 * state that (1) we're permitted to read, (2) is harmless or
824 * (3) contains pointers to additional kernel state that we're
825 * not permitted to read (and as such, do not present an
826 * opportunity for privilege escalation). Finally (and
827 * critically), because of the nature of their relation with
828 * the current thread context, the memory associated with these
829 * structures cannot change over the duration of probe context,
830 * and it is therefore impossible for this memory to be
831 * deallocated and reallocated as something else while it's
832 * being operated upon.
833 */
834 if (DTRACE_INRANGE(addr, sz, curthread, sizeof (kthread_t)))
835 return (1);
836
837 if ((p = curthread->t_procp) != NULL && DTRACE_INRANGE(addr,
838 sz, curthread->t_procp, sizeof (proc_t))) {
839 return (1);
840 }
841
842 if (curthread->t_cred != NULL && DTRACE_INRANGE(addr, sz,
843 curthread->t_cred, sizeof (cred_t))) {
844 return (1);
845 }
846
847#if defined(sun)
848 if (p != NULL && p->p_pidp != NULL && DTRACE_INRANGE(addr, sz,
849 &(p->p_pidp->pid_id), sizeof (pid_t))) {
850 return (1);
851 }
852
853 if (curthread->t_cpu != NULL && DTRACE_INRANGE(addr, sz,
854 curthread->t_cpu, offsetof(cpu_t, cpu_pause_thread))) {
855 return (1);
856 }
857#endif
858 }
859
860 if ((fp = mstate->dtms_getf) != NULL) {
861 uintptr_t psz = sizeof (void *);
862 vnode_t *vp;
863 vnodeops_t *op;
864
865 /*
866 * When getf() returns a file_t, the enabling is implicitly
867 * granted the (transient) right to read the returned file_t
868 * as well as the v_path and v_op->vnop_name of the underlying
869 * vnode. These accesses are allowed after a successful
870 * getf() because the members that they refer to cannot change
871 * once set -- and the barrier logic in the kernel's closef()
872	 * path assures that the file_t and its referenced vnode_t
873	 * cannot themselves be stale (that is, it is impossible for
874 * either dtms_getf itself or its f_vnode member to reference
875 * freed memory).
876 */
877 if (DTRACE_INRANGE(addr, sz, fp, sizeof (file_t)))
878 return (1);
879
880 if ((vp = fp->f_vnode) != NULL) {
881#if defined(sun)
882 if (DTRACE_INRANGE(addr, sz, &vp->v_path, psz))
883 return (1);
884 if (vp->v_path != NULL && DTRACE_INRANGE(addr, sz,
885 vp->v_path, strlen(vp->v_path) + 1)) {
886 return (1);
887 }
888#endif
889
890 if (DTRACE_INRANGE(addr, sz, &vp->v_op, psz))
891 return (1);
892
893#if defined(sun)
894 if ((op = vp->v_op) != NULL &&
895 DTRACE_INRANGE(addr, sz, &op->vnop_name, psz)) {
896 return (1);
897 }
898
899 if (op != NULL && op->vnop_name != NULL &&
900 DTRACE_INRANGE(addr, sz, op->vnop_name,
901 strlen(op->vnop_name) + 1)) {
902 return (1);
903 }
904#endif
905 }
906 }
907
908 DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV);
909 *illval = addr;
910 return (0);
911}
912
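/*
 * As the comment above notes, DIF_SUBR_* implementations are expected
 * to guard their loads with this routine; the usual shape of that
 * check is sketched below, with 'src', 'len' and 'rd' standing in for
 * subroutine-specific values:
 *
 *	if (!dtrace_canload(src, len, mstate, vstate)) {
 *		regs[rd] = 0;
 *		break;
 *	}
 *	c = dtrace_load8(src);
 */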
913/*
914 * Convenience routine to check to see if a given string is within a memory
915 * region in which a load may be issued given the user's privilege level;
916 * this exists so that we don't need to issue unnecessary dtrace_strlen()
917 * calls in the event that the user has all privileges.
918 */
919static int
920dtrace_strcanload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
921 dtrace_vstate_t *vstate)
922{
923 size_t strsz;
924
925 /*
926 * If we hold the privilege to read from kernel memory, then
927 * everything is readable.
928 */
929 if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
930 return (1);
931
932 strsz = 1 + dtrace_strlen((char *)(uintptr_t)addr, sz);
933 if (dtrace_canload(addr, strsz, mstate, vstate))
934 return (1);
935
936 return (0);
937}
938
939/*
940 * Convenience routine to check to see if a given variable is within a memory
941 * region in which a load may be issued given the user's privilege level.
942 */
943static int
944dtrace_vcanload(void *src, dtrace_diftype_t *type, dtrace_mstate_t *mstate,
945 dtrace_vstate_t *vstate)
946{
947 size_t sz;
948 ASSERT(type->dtdt_flags & DIF_TF_BYREF);
949
950 /*
951 * If we hold the privilege to read from kernel memory, then
952 * everything is readable.
953 */
954 if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
955 return (1);
956
957 if (type->dtdt_kind == DIF_TYPE_STRING)
958 sz = dtrace_strlen(src,
959 vstate->dtvs_state->dts_options[DTRACEOPT_STRSIZE]) + 1;
960 else
961 sz = type->dtdt_size;
962
963 return (dtrace_canload((uintptr_t)src, sz, mstate, vstate));
964}
965
966/*
967 * Convert a string to a signed integer using safe loads.
968 *
969 * NOTE: This function uses various macros from strtolctype.h to manipulate
970 * digit values, etc -- these have all been checked to ensure they make
971 * no additional function calls.
972 */
973static int64_t
974dtrace_strtoll(char *input, int base, size_t limit)
975{
976 uintptr_t pos = (uintptr_t)input;
977 int64_t val = 0;
978 int x;
979 boolean_t neg = B_FALSE;
980 char c, cc, ccc;
981 uintptr_t end = pos + limit;
982
983 /*
984 * Consume any whitespace preceding digits.
985 */
986 while ((c = dtrace_load8(pos)) == ' ' || c == '\t')
987 pos++;
988
989 /*
990 * Handle an explicit sign if one is present.
991 */
992 if (c == '-' || c == '+') {
993 if (c == '-')
994 neg = B_TRUE;
995 c = dtrace_load8(++pos);
996 }
997
998 /*
999 * Check for an explicit hexadecimal prefix ("0x" or "0X") and skip it
1000 * if present.
1001 */
1002 if (base == 16 && c == '0' && ((cc = dtrace_load8(pos + 1)) == 'x' ||
1003 cc == 'X') && isxdigit(ccc = dtrace_load8(pos + 2))) {
1004 pos += 2;
1005 c = ccc;
1006 }
1007
1008 /*
1009 * Read in contiguous digits until the first non-digit character.
1010 */
1011 for (; pos < end && c != '\0' && lisalnum(c) && (x = DIGIT(c)) < base;
1012 c = dtrace_load8(++pos))
1013 val = val * base + x;
1014
1015 return (neg ? -val : val);
1016}
1017
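/*
 * For example, given the input "  -0x1a" with base 16, the leading
 * whitespace is consumed, the '-' sets the negative flag, the "0x"
 * prefix is skipped, and the digits yield val = 1 * 16 + 10 = 26,
 * so -26 is returned.
 */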
1018/*
1019 * Compare two strings using safe loads.
1020 */
1021static int
1022dtrace_strncmp(char *s1, char *s2, size_t limit)
1023{
1024 uint8_t c1, c2;
1025 volatile uint16_t *flags;
1026
1027 if (s1 == s2 || limit == 0)
1028 return (0);
1029
1030 flags = (volatile uint16_t *)&cpu_core[curcpu].cpuc_dtrace_flags;
1031
1032 do {
1033 if (s1 == NULL) {
1034 c1 = '\0';
1035 } else {
1036 c1 = dtrace_load8((uintptr_t)s1++);
1037 }
1038
1039 if (s2 == NULL) {
1040 c2 = '\0';
1041 } else {
1042 c2 = dtrace_load8((uintptr_t)s2++);
1043 }
1044
1045 if (c1 != c2)
1046 return (c1 - c2);
1047 } while (--limit && c1 != '\0' && !(*flags & CPU_DTRACE_FAULT));
1048
1049 return (0);
1050}
1051
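/*
 * Note that a NULL argument is treated as an empty string rather than
 * dereferenced: dtrace_strncmp(NULL, "a", 2) compares '\0' against
 * 'a' and returns a negative value.
 */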
1052/*
1053 * Compute strlen(s) for a string using safe memory accesses. The additional
1054 * len parameter is used to specify a maximum length to ensure completion.
1055 */
1056static size_t
1057dtrace_strlen(const char *s, size_t lim)
1058{
1059 uint_t len;
1060
1061 for (len = 0; len != lim; len++) {
1062 if (dtrace_load8((uintptr_t)s++) == '\0')
1063 break;
1064 }
1065
1066 return (len);
1067}
1068
1069/*
1070 * Check if an address falls within a toxic region.
1071 */
1072static int
1073dtrace_istoxic(uintptr_t kaddr, size_t size)
1074{
1075 uintptr_t taddr, tsize;
1076 int i;
1077
1078 for (i = 0; i < dtrace_toxranges; i++) {
1079 taddr = dtrace_toxrange[i].dtt_base;
1080 tsize = dtrace_toxrange[i].dtt_limit - taddr;
1081
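		/*
		 * Both checks below rely on unsigned wraparound: the first
		 * fires when kaddr lies within [taddr, taddr + tsize), the
		 * second when taddr lies within [kaddr, kaddr + size), so
		 * together they catch any overlap of the two ranges.
		 */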
1082 if (kaddr - taddr < tsize) {
1083 DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
1084 cpu_core[curcpu].cpuc_dtrace_illval = kaddr;
1085 return (1);
1086 }
1087
1088 if (taddr - kaddr < size) {
1089 DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
1090 cpu_core[curcpu].cpuc_dtrace_illval = taddr;
1091 return (1);
1092 }
1093 }
1094
1095 return (0);
1096}
1097
1098/*
1099 * Copy src to dst using safe memory accesses. The src is assumed to be unsafe
1100 * memory specified by the DIF program. The dst is assumed to be safe memory
1101 * that we can store to directly because it is managed by DTrace. As with
1102 * standard bcopy, overlapping copies are handled properly.
1103 */
1104static void
1105dtrace_bcopy(const void *src, void *dst, size_t len)
1106{
1107 if (len != 0) {
1108 uint8_t *s1 = dst;
1109 const uint8_t *s2 = src;
1110
1111 if (s1 <= s2) {
1112 do {
1113 *s1++ = dtrace_load8((uintptr_t)s2++);
1114 } while (--len != 0);
1115 } else {
1116 s2 += len;
1117 s1 += len;
1118
1119 do {
1120 *--s1 = dtrace_load8((uintptr_t)--s2);
1121 } while (--len != 0);
1122 }
1123 }
1124}
1125
1126/*
1127 * Copy src to dst using safe memory accesses, up to either the specified
1128 * length, or the point that a nul byte is encountered. The src is assumed to
1129 * be unsafe memory specified by the DIF program. The dst is assumed to be
1130 * safe memory that we can store to directly because it is managed by DTrace.
1131 * Unlike dtrace_bcopy(), overlapping regions are not handled.
1132 */
1133static void
1134dtrace_strcpy(const void *src, void *dst, size_t len)
1135{
1136 if (len != 0) {
1137 uint8_t *s1 = dst, c;
1138 const uint8_t *s2 = src;
1139
1140 do {
1141 *s1++ = c = dtrace_load8((uintptr_t)s2++);
1142 } while (--len != 0 && c != '\0');
1143 }
1144}
1145
1146/*
1147 * Copy src to dst, deriving the size and type from the specified (BYREF)
1148 * variable type. The src is assumed to be unsafe memory specified by the DIF
1149 * program. The dst is assumed to be DTrace variable memory that is of the
1150 * specified type; we assume that we can store to directly.
1151 */
1152static void
1153dtrace_vcopy(void *src, void *dst, dtrace_diftype_t *type)
1154{
1155 ASSERT(type->dtdt_flags & DIF_TF_BYREF);
1156
1157 if (type->dtdt_kind == DIF_TYPE_STRING) {
1158 dtrace_strcpy(src, dst, type->dtdt_size);
1159 } else {
1160 dtrace_bcopy(src, dst, type->dtdt_size);
1161 }
1162}
1163
1164/*
1165 * Compare s1 to s2 using safe memory accesses. The s1 data is assumed to be
1166 * unsafe memory specified by the DIF program. The s2 data is assumed to be
1167 * safe memory that we can access directly because it is managed by DTrace.
1168 */
1169static int
1170dtrace_bcmp(const void *s1, const void *s2, size_t len)
1171{
1172 volatile uint16_t *flags;
1173
1174 flags = (volatile uint16_t *)&cpu_core[curcpu].cpuc_dtrace_flags;
1175
1176 if (s1 == s2)
1177 return (0);
1178
1179 if (s1 == NULL || s2 == NULL)
1180 return (1);
1181
1182 if (s1 != s2 && len != 0) {
1183 const uint8_t *ps1 = s1;
1184 const uint8_t *ps2 = s2;
1185
1186 do {
1187 if (dtrace_load8((uintptr_t)ps1++) != *ps2++)
1188 return (1);
1189 } while (--len != 0 && !(*flags & CPU_DTRACE_FAULT));
1190 }
1191 return (0);
1192}
1193
1194/*
1195 * Zero the specified region using a simple byte-by-byte loop. Note that this
1196 * is for safe DTrace-managed memory only.
1197 */
1198static void
1199dtrace_bzero(void *dst, size_t len)
1200{
1201 uchar_t *cp;
1202
1203 for (cp = dst; len != 0; len--)
1204 *cp++ = 0;
1205}
1206
1207static void
1208dtrace_add_128(uint64_t *addend1, uint64_t *addend2, uint64_t *sum)
1209{
1210 uint64_t result[2];
1211
1212 result[0] = addend1[0] + addend2[0];
1213 result[1] = addend1[1] + addend2[1] +
1214 (result[0] < addend1[0] || result[0] < addend2[0] ? 1 : 0);
1215
1216 sum[0] = result[0];
1217 sum[1] = result[1];
1218}
1219
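/*
 * The carry detection above relies on the fact that an unsigned add
 * wrapped if and only if the result is smaller than an addend; e.g.
 * adding { 0xffffffffffffffff, 0 } to { 1, 0 } yields { 0, 1 }.
 */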
1220/*
1221 * Shift the 128-bit value in a by b. If b is positive, shift left.
1222 * If b is negative, shift right.
1223 */
1224static void
1225dtrace_shift_128(uint64_t *a, int b)
1226{
1227 uint64_t mask;
1228
1229 if (b == 0)
1230 return;
1231
1232 if (b < 0) {
1233 b = -b;
1234 if (b >= 64) {
1235 a[0] = a[1] >> (b - 64);
1236 a[1] = 0;
1237 } else {
1238 a[0] >>= b;
1239 mask = 1LL << (64 - b);
1240 mask -= 1;
1241 a[0] |= ((a[1] & mask) << (64 - b));
1242 a[1] >>= b;
1243 }
1244 } else {
1245 if (b >= 64) {
1246 a[1] = a[0] << (b - 64);
1247 a[0] = 0;
1248 } else {
1249 a[1] <<= b;
1250 mask = a[0] >> (64 - b);
1251 a[1] |= mask;
1252 a[0] <<= b;
1253 }
1254 }
1255}
1256
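/*
 * For example, dtrace_shift_128(a, -64) with a = { 0, 1 } (that is,
 * the value 2^64) moves the high word down, leaving a = { 1, 0 }.
 */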
1257/*
1258 * The basic idea is to break the 2 64-bit values into 4 32-bit values,
1259 * use native multiplication on those, and then re-combine into the
1260 * resulting 128-bit value.
1261 *
1262 * (hi1 << 32 + lo1) * (hi2 << 32 + lo2) =
1263 * hi1 * hi2 << 64 +
1264 * hi1 * lo2 << 32 +
1265 * hi2 * lo1 << 32 +
1266 * lo1 * lo2
1267 */
1268static void
1269dtrace_multiply_128(uint64_t factor1, uint64_t factor2, uint64_t *product)
1270{
1271 uint64_t hi1, hi2, lo1, lo2;
1272 uint64_t tmp[2];
1273
1274 hi1 = factor1 >> 32;
1275 hi2 = factor2 >> 32;
1276
1277 lo1 = factor1 & DT_MASK_LO;
1278 lo2 = factor2 & DT_MASK_LO;
1279
1280 product[0] = lo1 * lo2;
1281 product[1] = hi1 * hi2;
1282
1283 tmp[0] = hi1 * lo2;
1284 tmp[1] = 0;
1285 dtrace_shift_128(tmp, 32);
1286 dtrace_add_128(product, tmp, product);
1287
1288 tmp[0] = hi2 * lo1;
1289 tmp[1] = 0;
1290 dtrace_shift_128(tmp, 32);
1291 dtrace_add_128(product, tmp, product);
1292}
1293
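/*
 * As a worked example of the decomposition above, multiplying
 * (2^32 + 3) by (2^32 + 5) gives lo1 * lo2 = 15, hi1 * hi2 = 1, and
 * cross terms 5 and 3 that are shifted up by 32 bits, for a product
 * of { 8 * 2^32 + 15, 1 } -- that is, 2^64 + 8 * 2^32 + 15.
 */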
1294/*
1295 * This privilege check should be used by actions and subroutines to
1296 * verify that the user credentials of the process that enabled the
1297	 * invoking ECB match the target credentials.
1298 */
1299static int
1300dtrace_priv_proc_common_user(dtrace_state_t *state)
1301{
1302 cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
1303
1304 /*
1305 * We should always have a non-NULL state cred here, since if cred
1306 * is null (anonymous tracing), we fast-path bypass this routine.
1307 */
1308 ASSERT(s_cr != NULL);
1309
1310 if ((cr = CRED()) != NULL &&
1311 s_cr->cr_uid == cr->cr_uid &&
1312 s_cr->cr_uid == cr->cr_ruid &&
1313 s_cr->cr_uid == cr->cr_suid &&
1314 s_cr->cr_gid == cr->cr_gid &&
1315 s_cr->cr_gid == cr->cr_rgid &&
1316 s_cr->cr_gid == cr->cr_sgid)
1317 return (1);
1318
1319 return (0);
1320}
1321
1322/*
1323 * This privilege check should be used by actions and subroutines to
1324 * verify that the zone of the process that enabled the invoking ECB
1325	 * matches the target credentials.
1326 */
1327static int
1328dtrace_priv_proc_common_zone(dtrace_state_t *state)
1329{
1330#if defined(sun)
1331 cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
1332
1333 /*
1334 * We should always have a non-NULL state cred here, since if cred
1335 * is null (anonymous tracing), we fast-path bypass this routine.
1336 */
1337 ASSERT(s_cr != NULL);
1338
1339 if ((cr = CRED()) != NULL && s_cr->cr_zone == cr->cr_zone)
1340 return (1);
1341
1342 return (0);
1343#else
1344 return (1);
1345#endif
1346}
1347
1348/*
1349 * This privilege check should be used by actions and subroutines to
1350 * verify that the process has not setuid or changed credentials.
1351 */
1352static int
1353dtrace_priv_proc_common_nocd(void)
1354{
1355 proc_t *proc;
1356
1357 if ((proc = ttoproc(curthread)) != NULL &&
1358 !(proc->p_flag & SNOCD))
1359 return (1);
1360
1361 return (0);
1362}
1363
1364static int
1365dtrace_priv_proc_destructive(dtrace_state_t *state)
1366{
1367 int action = state->dts_cred.dcr_action;
1368
1369 if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE) == 0) &&
1370 dtrace_priv_proc_common_zone(state) == 0)
1371 goto bad;
1372
1373 if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER) == 0) &&
1374 dtrace_priv_proc_common_user(state) == 0)
1375 goto bad;
1376
1377 if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG) == 0) &&
1378 dtrace_priv_proc_common_nocd() == 0)
1379 goto bad;
1380
1381 return (1);
1382
1383bad:
1384 cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1385
1386 return (0);
1387}
1388
1389static int
1390dtrace_priv_proc_control(dtrace_state_t *state)
1391{
1392 if (state->dts_cred.dcr_action & DTRACE_CRA_PROC_CONTROL)
1393 return (1);
1394
1395 if (dtrace_priv_proc_common_zone(state) &&
1396 dtrace_priv_proc_common_user(state) &&
1397 dtrace_priv_proc_common_nocd())
1398 return (1);
1399
1400 cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1401
1402 return (0);
1403}
1404
1405static int
1406dtrace_priv_proc(dtrace_state_t *state)
1407{
1408 if (state->dts_cred.dcr_action & DTRACE_CRA_PROC)
1409 return (1);
1410
1411 cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1412
1413 return (0);
1414}
1415
1416static int
1417dtrace_priv_kernel(dtrace_state_t *state)
1418{
1419 if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL)
1420 return (1);
1421
1422 cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
1423
1424 return (0);
1425}
1426
1427static int
1428dtrace_priv_kernel_destructive(dtrace_state_t *state)
1429{
1430 if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL_DESTRUCTIVE)
1431 return (1);
1432
1433 cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
1434
1435 return (0);
1436}
1437
1438/*
1439 * Determine if the dte_cond of the specified ECB allows for processing of
1440 * the current probe to continue. Note that this routine may allow continued
1441 * processing, but with access(es) stripped from the mstate's dtms_access
1442 * field.
1443 */
1444static int
1445dtrace_priv_probe(dtrace_state_t *state, dtrace_mstate_t *mstate,
1446 dtrace_ecb_t *ecb)
1447{
1448 dtrace_probe_t *probe = ecb->dte_probe;
1449 dtrace_provider_t *prov = probe->dtpr_provider;
1450 dtrace_pops_t *pops = &prov->dtpv_pops;
1451 int mode = DTRACE_MODE_NOPRIV_DROP;
1452
1453 ASSERT(ecb->dte_cond);
1454
1455#if defined(sun)
1456 if (pops->dtps_mode != NULL) {
1457 mode = pops->dtps_mode(prov->dtpv_arg,
1458 probe->dtpr_id, probe->dtpr_arg);
1459
1460 ASSERT((mode & DTRACE_MODE_USER) ||
1461 (mode & DTRACE_MODE_KERNEL));
1462 ASSERT((mode & DTRACE_MODE_NOPRIV_RESTRICT) ||
1463 (mode & DTRACE_MODE_NOPRIV_DROP));
1464 }
1465
1466 /*
1467 * If the dte_cond bits indicate that this consumer is only allowed to
1468 * see user-mode firings of this probe, call the provider's dtps_mode()
1469 * entry point to check that the probe was fired while in a user
1470 * context. If that's not the case, use the policy specified by the
1471 * provider to determine if we drop the probe or merely restrict
1472 * operation.
1473 */
1474 if (ecb->dte_cond & DTRACE_COND_USERMODE) {
1475 ASSERT(mode != DTRACE_MODE_NOPRIV_DROP);
1476
1477 if (!(mode & DTRACE_MODE_USER)) {
1478 if (mode & DTRACE_MODE_NOPRIV_DROP)
1479 return (0);
1480
1481 mstate->dtms_access &= ~DTRACE_ACCESS_ARGS;
1482 }
1483 }
1484#endif
1485
1486 /*
1487 * This is more subtle than it looks. We have to be absolutely certain
1488 * that CRED() isn't going to change out from under us so it's only
1489 * legit to examine that structure if we're in constrained situations.
1490	 * Currently, the only time we'll perform this check is when a non-super-user
1491 * has enabled the profile or syscall providers -- providers that
1492 * allow visibility of all processes. For the profile case, the check
1493 * above will ensure that we're examining a user context.
1494 */
1495 if (ecb->dte_cond & DTRACE_COND_OWNER) {
1496 cred_t *cr;
1497 cred_t *s_cr = state->dts_cred.dcr_cred;
1498 proc_t *proc;
1499
1500 ASSERT(s_cr != NULL);
1501
1502 if ((cr = CRED()) == NULL ||
1503 s_cr->cr_uid != cr->cr_uid ||
1504 s_cr->cr_uid != cr->cr_ruid ||
1505 s_cr->cr_uid != cr->cr_suid ||
1506 s_cr->cr_gid != cr->cr_gid ||
1507 s_cr->cr_gid != cr->cr_rgid ||
1508 s_cr->cr_gid != cr->cr_sgid ||
1509 (proc = ttoproc(curthread)) == NULL ||
1510 (proc->p_flag & SNOCD)) {
1511 if (mode & DTRACE_MODE_NOPRIV_DROP)
1512 return (0);
1513
1514#if defined(sun)
1515 mstate->dtms_access &= ~DTRACE_ACCESS_PROC;
1516#endif
1517 }
1518 }
1519
1520#if defined(sun)
1521 /*
1522 * If our dte_cond is set to DTRACE_COND_ZONEOWNER and we are not
1523 * in our zone, check to see if our mode policy is to restrict rather
1524 * than to drop; if to restrict, strip away both DTRACE_ACCESS_PROC
1525	 * and DTRACE_ACCESS_ARGS.
1526 */
1527 if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) {
1528 cred_t *cr;
1529 cred_t *s_cr = state->dts_cred.dcr_cred;
1530
1531 ASSERT(s_cr != NULL);
1532
1533 if ((cr = CRED()) == NULL ||
1534 s_cr->cr_zone->zone_id != cr->cr_zone->zone_id) {
1535 if (mode & DTRACE_MODE_NOPRIV_DROP)
1536 return (0);
1537
1538 mstate->dtms_access &=
1539 ~(DTRACE_ACCESS_PROC | DTRACE_ACCESS_ARGS);
1540 }
1541 }
1542#endif
1543
1544 return (1);
1545}
1546
1547/*
1289 * Note: not called from probe context. This function is called
1290 * asynchronously (and at a regular interval) from outside of probe context to
1291 * clean the dirty dynamic variable lists on all CPUs. Dynamic variable
1292 * cleaning is explained in detail in <sys/dtrace_impl.h>.
1293 */
1294void
1295dtrace_dynvar_clean(dtrace_dstate_t *dstate)
1296{
1297 dtrace_dynvar_t *dirty;
1298 dtrace_dstate_percpu_t *dcpu;
1299 int i, work = 0;
1300
1301 for (i = 0; i < NCPU; i++) {
1302 dcpu = &dstate->dtds_percpu[i];
1303
1304 ASSERT(dcpu->dtdsc_rinsing == NULL);
1305
1306 /*
1307 * If the dirty list is NULL, there is no dirty work to do.
1308 */
1309 if (dcpu->dtdsc_dirty == NULL)
1310 continue;
1311
1312 /*
1313 * If the clean list is non-NULL, then we're not going to do
1314 * any work for this CPU -- it means that there has not been
1315 * a dtrace_dynvar() allocation on this CPU (or from this CPU)
1316 * since the last time we cleaned house.
1317 */
1318 if (dcpu->dtdsc_clean != NULL)
1319 continue;
1320
1321 work = 1;
1322
1323 /*
1324 * Atomically move the dirty list aside.
1325 */
1326 do {
1327 dirty = dcpu->dtdsc_dirty;
1328
1329 /*
1330 * Before we zap the dirty list, set the rinsing list.
1331 * (This allows for a potential assertion in
1332 * dtrace_dynvar(): if a free dynamic variable appears
1333 * on a hash chain, either the dirty list or the
1334 * rinsing list for some CPU must be non-NULL.)
1335 */
1336 dcpu->dtdsc_rinsing = dirty;
1337 dtrace_membar_producer();
1338 } while (dtrace_casptr(&dcpu->dtdsc_dirty,
1339 dirty, NULL) != dirty);
1340 }
1341
1342 if (!work) {
1343 /*
1344 * We have no work to do; we can simply return.
1345 */
1346 return;
1347 }
1348
1349 dtrace_sync();
1350
1351 for (i = 0; i < NCPU; i++) {
1352 dcpu = &dstate->dtds_percpu[i];
1353
1354 if (dcpu->dtdsc_rinsing == NULL)
1355 continue;
1356
1357 /*
1358 * We are now guaranteed that no hash chain contains a pointer
1359 * into this dirty list; we can make it clean.
1360 */
1361 ASSERT(dcpu->dtdsc_clean == NULL);
1362 dcpu->dtdsc_clean = dcpu->dtdsc_rinsing;
1363 dcpu->dtdsc_rinsing = NULL;
1364 }
1365
1366 /*
1367 * Before we actually set the state to be DTRACE_DSTATE_CLEAN, make
1368 * sure that all CPUs have seen all of the dtdsc_clean pointers.
1369 * This prevents a race whereby a CPU incorrectly decides that
1370 * the state should be something other than DTRACE_DSTATE_CLEAN
1371 * after dtrace_dynvar_clean() has completed.
1372 */
1373 dtrace_sync();
1374
1375 dstate->dtds_state = DTRACE_DSTATE_CLEAN;
1376}
1377
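/*
 * In short, each per-CPU free list is replenished in stages: freed
 * chunks accumulate on the dirty list, the cleaner above moves a
 * dirty list aside as the rinsing list, waits for every CPU to pass
 * out of probe context (the dtrace_sync() calls), and only then
 * publishes it as the clean list from which dtrace_dynvar() refills
 * the free list.
 */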
1378/*
1379 * Depending on the value of the op parameter, this function looks up,
1380 * allocates or deallocates an arbitrarily-keyed dynamic variable. If an
1381 * allocation is requested, this function will return a pointer to a
1382 * dtrace_dynvar_t corresponding to the allocated variable -- or NULL if no
1383 * variable can be allocated. If NULL is returned, the appropriate counter
1384 * will be incremented.
1385 */
1386dtrace_dynvar_t *
1387dtrace_dynvar(dtrace_dstate_t *dstate, uint_t nkeys,
1388 dtrace_key_t *key, size_t dsize, dtrace_dynvar_op_t op,
1389 dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
1390{
1391 uint64_t hashval = DTRACE_DYNHASH_VALID;
1392 dtrace_dynhash_t *hash = dstate->dtds_hash;
1393 dtrace_dynvar_t *free, *new_free, *next, *dvar, *start, *prev = NULL;
1394 processorid_t me = curcpu, cpu = me;
1395 dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[me];
1396 size_t bucket, ksize;
1397 size_t chunksize = dstate->dtds_chunksize;
1398 uintptr_t kdata, lock, nstate;
1399 uint_t i;
1400
1401 ASSERT(nkeys != 0);
1402
1403 /*
1404 * Hash the key. As with aggregations, we use Jenkins' "One-at-a-time"
1405 * algorithm. For the by-value portions, we perform the algorithm in
1406 * 16-bit chunks (as opposed to 8-bit chunks). This speeds things up a
1407 * bit, and seems to have only a minute effect on distribution. For
1408 * the by-reference data, we perform "One-at-a-time" iterating (safely)
1409 * over each referenced byte. It's painful to do this, but it's much
1410 * better than pathological hash distribution. The efficacy of the
1411 * hashing algorithm (and a comparison with other algorithms) may be
1412 * found by running the ::dtrace_dynstat MDB dcmd.
1413 */
1414 for (i = 0; i < nkeys; i++) {
1415 if (key[i].dttk_size == 0) {
1416 uint64_t val = key[i].dttk_value;
1417
1418 hashval += (val >> 48) & 0xffff;
1419 hashval += (hashval << 10);
1420 hashval ^= (hashval >> 6);
1421
1422 hashval += (val >> 32) & 0xffff;
1423 hashval += (hashval << 10);
1424 hashval ^= (hashval >> 6);
1425
1426 hashval += (val >> 16) & 0xffff;
1427 hashval += (hashval << 10);
1428 hashval ^= (hashval >> 6);
1429
1430 hashval += val & 0xffff;
1431 hashval += (hashval << 10);
1432 hashval ^= (hashval >> 6);
1433 } else {
1434 /*
1435 * This is incredibly painful, but it beats the hell
1436 * out of the alternative.
1437 */
1438 uint64_t j, size = key[i].dttk_size;
1439 uintptr_t base = (uintptr_t)key[i].dttk_value;
1440
1441 if (!dtrace_canload(base, size, mstate, vstate))
1442 break;
1443
1444 for (j = 0; j < size; j++) {
1445 hashval += dtrace_load8(base + j);
1446 hashval += (hashval << 10);
1447 hashval ^= (hashval >> 6);
1448 }
1449 }
1450 }
1451
1452 if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_FAULT))
1453 return (NULL);
1454
1455 hashval += (hashval << 3);
1456 hashval ^= (hashval >> 11);
1457 hashval += (hashval << 15);
1458
1459 /*
1460 * There is a remote chance (ideally, 1 in 2^31) that our hashval
1461 * comes out to be one of our two sentinel hash values. If this
1462 * actually happens, we set the hashval to be a value known to be a
1463 * non-sentinel value.
1464 */
1465 if (hashval == DTRACE_DYNHASH_FREE || hashval == DTRACE_DYNHASH_SINK)
1466 hashval = DTRACE_DYNHASH_VALID;
1467
1468 /*
1469 * Yes, it's painful to do a divide here. If the cycle count becomes
1470 * important here, tricks can be pulled to reduce it. (However, it's
1471 * critical that hash collisions be kept to an absolute minimum;
1472 * they're much more painful than a divide.) It's better to have a
1473 * solution that generates few collisions and still keeps things
1474 * relatively simple.
1475 */
1476 bucket = hashval % dstate->dtds_hashsize;
1477
1478 if (op == DTRACE_DYNVAR_DEALLOC) {
1479 volatile uintptr_t *lockp = &hash[bucket].dtdh_lock;
1480
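		/*
		 * The low bit of dtdh_lock serves as a spin lock on the
		 * bucket; the remaining bits act as a version count that
		 * allows the lock-free lookup path below to detect a
		 * concurrent deallocation and retry.
		 */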
1481 for (;;) {
1482 while ((lock = *lockp) & 1)
1483 continue;
1484
1485 if (dtrace_casptr((volatile void *)lockp,
1486 (volatile void *)lock, (volatile void *)(lock + 1)) == (void *)lock)
1487 break;
1488 }
1489
1490 dtrace_membar_producer();
1491 }
1492
1493top:
1494 prev = NULL;
1495 lock = hash[bucket].dtdh_lock;
1496
1497 dtrace_membar_consumer();
1498
1499 start = hash[bucket].dtdh_chain;
1500 ASSERT(start != NULL && (start->dtdv_hashval == DTRACE_DYNHASH_SINK ||
1501 start->dtdv_hashval != DTRACE_DYNHASH_FREE ||
1502 op != DTRACE_DYNVAR_DEALLOC));
1503
1504 for (dvar = start; dvar != NULL; dvar = dvar->dtdv_next) {
1505 dtrace_tuple_t *dtuple = &dvar->dtdv_tuple;
1506 dtrace_key_t *dkey = &dtuple->dtt_key[0];
1507
1508 if (dvar->dtdv_hashval != hashval) {
1509 if (dvar->dtdv_hashval == DTRACE_DYNHASH_SINK) {
1510 /*
1511 * We've reached the sink, and therefore the
1512 * end of the hash chain; we can kick out of
1513 * the loop knowing that we have seen a valid
1514 * snapshot of state.
1515 */
1516 ASSERT(dvar->dtdv_next == NULL);
1517 ASSERT(dvar == &dtrace_dynhash_sink);
1518 break;
1519 }
1520
1521 if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE) {
1522 /*
1523 * We've gone off the rails: somewhere along
1524 * the line, one of the members of this hash
1525 * chain was deleted. Note that we could also
1526 * detect this by simply letting this loop run
1527 * to completion, as we would eventually hit
1528 * the end of the dirty list. However, we
1529 * want to avoid running the length of the
1530 * dirty list unnecessarily (it might be quite
1531 * long), so we catch this as early as
1532 * possible by detecting the hash marker. In
1533 * this case, we simply set dvar to NULL and
1534 * break; the conditional after the loop will
1535 * send us back to top.
1536 */
1537 dvar = NULL;
1538 break;
1539 }
1540
1541 goto next;
1542 }
1543
1544 if (dtuple->dtt_nkeys != nkeys)
1545 goto next;
1546
1547 for (i = 0; i < nkeys; i++, dkey++) {
1548 if (dkey->dttk_size != key[i].dttk_size)
1549 goto next; /* size or type mismatch */
1550
1551 if (dkey->dttk_size != 0) {
1552 if (dtrace_bcmp(
1553 (void *)(uintptr_t)key[i].dttk_value,
1554 (void *)(uintptr_t)dkey->dttk_value,
1555 dkey->dttk_size))
1556 goto next;
1557 } else {
1558 if (dkey->dttk_value != key[i].dttk_value)
1559 goto next;
1560 }
1561 }
1562
1563 if (op != DTRACE_DYNVAR_DEALLOC)
1564 return (dvar);
1565
1566 ASSERT(dvar->dtdv_next == NULL ||
1567 dvar->dtdv_next->dtdv_hashval != DTRACE_DYNHASH_FREE);
1568
1569 if (prev != NULL) {
1570 ASSERT(hash[bucket].dtdh_chain != dvar);
1571 ASSERT(start != dvar);
1572 ASSERT(prev->dtdv_next == dvar);
1573 prev->dtdv_next = dvar->dtdv_next;
1574 } else {
1575 if (dtrace_casptr(&hash[bucket].dtdh_chain,
1576 start, dvar->dtdv_next) != start) {
1577 /*
1578 * We have failed to atomically swing the
1579 * hash table head pointer, presumably because
1580 * of a conflicting allocation on another CPU.
1581 * We need to reread the hash chain and try
1582 * again.
1583 */
1584 goto top;
1585 }
1586 }
1587
1588 dtrace_membar_producer();
1589
1590 /*
1591 * Now set the hash value to indicate that it's free.
1592 */
1593 ASSERT(hash[bucket].dtdh_chain != dvar);
1594 dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
1595
1596 dtrace_membar_producer();
1597
1598 /*
1599 * Set the next pointer to point at the dirty list, and
1600 * atomically swing the dirty pointer to the newly freed dvar.
1601 */
1602 do {
1603 next = dcpu->dtdsc_dirty;
1604 dvar->dtdv_next = next;
1605 } while (dtrace_casptr(&dcpu->dtdsc_dirty, next, dvar) != next);
1606
1607 /*
1608 * Finally, unlock this hash bucket.
1609 */
1610 ASSERT(hash[bucket].dtdh_lock == lock);
1611 ASSERT(lock & 1);
1612 hash[bucket].dtdh_lock++;
1613
1614 return (NULL);
1615next:
1616 prev = dvar;
1617 continue;
1618 }
1619
1620 if (dvar == NULL) {
1621 /*
1622 * If dvar is NULL, it is because we went off the rails:
1623 * one of the elements that we traversed in the hash chain
1624 * was deleted while we were traversing it. In this case,
1625 * we assert that we aren't doing a dealloc (deallocs lock
1626 * the hash bucket to prevent themselves from racing with
1627 * one another), and retry the hash chain traversal.
1628 */
1629 ASSERT(op != DTRACE_DYNVAR_DEALLOC);
1630 goto top;
1631 }
1632
1633 if (op != DTRACE_DYNVAR_ALLOC) {
1634 /*
1635 * If we are not to allocate a new variable, we want to
1636 * return NULL now. Before we return, check that the value
1637 * of the lock word hasn't changed. If it has, we may have
1638 * seen an inconsistent snapshot.
1639 */
1640 if (op == DTRACE_DYNVAR_NOALLOC) {
1641 if (hash[bucket].dtdh_lock != lock)
1642 goto top;
1643 } else {
1644 ASSERT(op == DTRACE_DYNVAR_DEALLOC);
1645 ASSERT(hash[bucket].dtdh_lock == lock);
1646 ASSERT(lock & 1);
1647 hash[bucket].dtdh_lock++;
1648 }
1649
1650 return (NULL);
1651 }
1652
1653 /*
1654 * We need to allocate a new dynamic variable. The size we need is the
1655 * size of dtrace_dynvar_t plus the size of nkeys dtrace_key_t's plus the
1656 * size of any auxiliary key data (rounded up to 8-byte alignment) plus
1657 * the size of any referred-to data (dsize). We then round the final
1658 * size up to the chunksize for allocation.
1659 */
1660 for (ksize = 0, i = 0; i < nkeys; i++)
1661 ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
1662
1663 /*
1664 * This should be pretty much impossible, but could happen if, say,
1665 * strange DIF specified the tuple. Ideally, this should be an
1666 * assertion and not an error condition -- but that requires that the
1667 * chunksize calculation in dtrace_difo_chunksize() be absolutely
1668 * bullet-proof. (That is, it must not be able to be fooled by
1669 * malicious DIF.) Given the lack of backwards branches in DIF,
1670 * solving this would presumably not amount to solving the Halting
1671 * Problem -- but it still seems awfully hard.
1672 */
1673 if (sizeof (dtrace_dynvar_t) + sizeof (dtrace_key_t) * (nkeys - 1) +
1674 ksize + dsize > chunksize) {
1675 dcpu->dtdsc_drops++;
1676 return (NULL);
1677 }
1678
1679 nstate = DTRACE_DSTATE_EMPTY;
1680
1681 do {
1682retry:
1683 free = dcpu->dtdsc_free;
1684
1685 if (free == NULL) {
1686 dtrace_dynvar_t *clean = dcpu->dtdsc_clean;
1687 void *rval;
1688
1689 if (clean == NULL) {
1690 /*
1691 * We're out of dynamic variable space on
1692 * this CPU. Unless we have tried all CPUs,
1693 * we'll try to allocate from a different
1694 * CPU.
1695 */
1696 switch (dstate->dtds_state) {
1697 case DTRACE_DSTATE_CLEAN: {
1698 void *sp = &dstate->dtds_state;
1699
1700 if (++cpu >= NCPU)
1701 cpu = 0;
1702
1703 if (dcpu->dtdsc_dirty != NULL &&
1704 nstate == DTRACE_DSTATE_EMPTY)
1705 nstate = DTRACE_DSTATE_DIRTY;
1706
1707 if (dcpu->dtdsc_rinsing != NULL)
1708 nstate = DTRACE_DSTATE_RINSING;
1709
1710 dcpu = &dstate->dtds_percpu[cpu];
1711
1712 if (cpu != me)
1713 goto retry;
1714
1715 (void) dtrace_cas32(sp,
1716 DTRACE_DSTATE_CLEAN, nstate);
1717
1718 /*
1719 * To increment the correct bean
1720 * counter, take another lap.
1721 */
1722 goto retry;
1723 }
1724
1725 case DTRACE_DSTATE_DIRTY:
1726 dcpu->dtdsc_dirty_drops++;
1727 break;
1728
1729 case DTRACE_DSTATE_RINSING:
1730 dcpu->dtdsc_rinsing_drops++;
1731 break;
1732
1733 case DTRACE_DSTATE_EMPTY:
1734 dcpu->dtdsc_drops++;
1735 break;
1736 }
1737
1738 DTRACE_CPUFLAG_SET(CPU_DTRACE_DROP);
1739 return (NULL);
1740 }
1741
1742 /*
1743 * The clean list appears to be non-empty. We want to
1744 * move the clean list to the free list; we start by
1745 * moving the clean pointer aside.
1746 */
1747 if (dtrace_casptr(&dcpu->dtdsc_clean,
1748 clean, NULL) != clean) {
1749 /*
1750 * We are in one of two situations:
1751 *
1752 * (a) The clean list was switched to the
1753 * free list by another CPU.
1754 *
1755 * (b) The clean list was added to by the
1756 * cleansing cyclic.
1757 *
1758 * In either of these situations, we can
1759 * just reattempt the free list allocation.
1760 */
1761 goto retry;
1762 }
1763
1764 ASSERT(clean->dtdv_hashval == DTRACE_DYNHASH_FREE);
1765
1766 /*
1767 * Now we'll move the clean list to the free list.
1768 * It's impossible for this to fail: the only way
1769 * the free list can be updated is through this
1770 * code path, and only one CPU can own the clean list.
1771 * Thus, it would only be possible for this to fail if
1772 * this code were racing with dtrace_dynvar_clean().
1773 * (That is, if dtrace_dynvar_clean() updated the clean
1774 * list, and we ended up racing to update the free
1775 * list.) This race is prevented by the dtrace_sync()
1776 * in dtrace_dynvar_clean() -- which flushes the
1777 * owners of the clean lists out before resetting
1778 * the clean lists.
1779 */
1780 rval = dtrace_casptr(&dcpu->dtdsc_free, NULL, clean);
1781 ASSERT(rval == NULL);
1782 goto retry;
1783 }
1784
1785 dvar = free;
1786 new_free = dvar->dtdv_next;
1787 } while (dtrace_casptr(&dcpu->dtdsc_free, free, new_free) != free);
1788
1789 /*
1790 * We have now allocated a new chunk. We copy the tuple keys into the
1791 * tuple array and copy any referenced key data into the data space
1792 * following the tuple array. As we do this, we relocate dttk_value
1793 * in the final tuple to point to the key data address in the chunk.
1794 */
1795 kdata = (uintptr_t)&dvar->dtdv_tuple.dtt_key[nkeys];
1796 dvar->dtdv_data = (void *)(kdata + ksize);
1797 dvar->dtdv_tuple.dtt_nkeys = nkeys;
1798
1799 for (i = 0; i < nkeys; i++) {
1800 dtrace_key_t *dkey = &dvar->dtdv_tuple.dtt_key[i];
1801 size_t kesize = key[i].dttk_size;
1802
1803 if (kesize != 0) {
1804 dtrace_bcopy(
1805 (const void *)(uintptr_t)key[i].dttk_value,
1806 (void *)kdata, kesize);
1807 dkey->dttk_value = kdata;
1808 kdata += P2ROUNDUP(kesize, sizeof (uint64_t));
1809 } else {
1810 dkey->dttk_value = key[i].dttk_value;
1811 }
1812
1813 dkey->dttk_size = kesize;
1814 }
1815
1816 ASSERT(dvar->dtdv_hashval == DTRACE_DYNHASH_FREE);
1817 dvar->dtdv_hashval = hashval;
1818 dvar->dtdv_next = start;
1819
1820 if (dtrace_casptr(&hash[bucket].dtdh_chain, start, dvar) == start)
1821 return (dvar);
1822
1823 /*
1824 * The cas has failed. Either another CPU is adding an element to
1825 * this hash chain, or another CPU is deleting an element from this
1826 * hash chain. The simplest way to deal with both of these cases
1827 * (though not necessarily the most efficient) is to free our
1828 * allocated block and tail-call ourselves. Note that the free is
1829 * to the dirty list and _not_ to the free list. This is to prevent
1830 * races with allocators, above.
1831 */
1832 dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
1833
1834 dtrace_membar_producer();
1835
1836 do {
1837 free = dcpu->dtdsc_dirty;
1838 dvar->dtdv_next = free;
1839 } while (dtrace_casptr(&dcpu->dtdsc_dirty, free, dvar) != free);
1840
1841 return (dtrace_dynvar(dstate, nkeys, key, dsize, op, mstate, vstate));
1842}
1843
1844/*ARGSUSED*/
1845static void
1846dtrace_aggregate_min(uint64_t *oval, uint64_t nval, uint64_t arg)
1847{
1848 if ((int64_t)nval < (int64_t)*oval)
1849 *oval = nval;
1850}
1851
1852/*ARGSUSED*/
1853static void
1854dtrace_aggregate_max(uint64_t *oval, uint64_t nval, uint64_t arg)
1855{
1856 if ((int64_t)nval > (int64_t)*oval)
1857 *oval = nval;
1858}
1859
1860static void
1861dtrace_aggregate_quantize(uint64_t *quanta, uint64_t nval, uint64_t incr)
1862{
1863 int i, zero = DTRACE_QUANTIZE_ZEROBUCKET;
1864 int64_t val = (int64_t)nval;
1865
1866 if (val < 0) {
1867 for (i = 0; i < zero; i++) {
1868 if (val <= DTRACE_QUANTIZE_BUCKETVAL(i)) {
1869 quanta[i] += incr;
1870 return;
1871 }
1872 }
1873 } else {
1874 for (i = zero + 1; i < DTRACE_QUANTIZE_NBUCKETS; i++) {
1875 if (val < DTRACE_QUANTIZE_BUCKETVAL(i)) {
1876 quanta[i - 1] += incr;
1877 return;
1878 }
1879 }
1880
1881 quanta[DTRACE_QUANTIZE_NBUCKETS - 1] += incr;
1882 return;
1883 }
1884
1885 ASSERT(0);
1886}
1887
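/*
 * In other words, a value of 5 is counted in the power-of-two bucket
 * covering [4, 8), a value of -5 in the bucket covering (-8, -4], and
 * zero in its own bucket at DTRACE_QUANTIZE_ZEROBUCKET.
 */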
1888static void
1889dtrace_aggregate_lquantize(uint64_t *lquanta, uint64_t nval, uint64_t incr)
1890{
1891 uint64_t arg = *lquanta++;
1892 int32_t base = DTRACE_LQUANTIZE_BASE(arg);
1893 uint16_t step = DTRACE_LQUANTIZE_STEP(arg);
1894 uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg);
1895 int32_t val = (int32_t)nval, level;
1896
1897 ASSERT(step != 0);
1898 ASSERT(levels != 0);
1899
1900 if (val < base) {
1901 /*
1902 * This is an underflow.
1903 */
1904 lquanta[0] += incr;
1905 return;
1906 }
1907
1908 level = (val - base) / step;
1909
1910 if (level < levels) {
1911 lquanta[level + 1] += incr;
1912 return;
1913 }
1914
1915 /*
1916 * This is an overflow.
1917 */
1918 lquanta[levels + 1] += incr;
1919}
1920
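/*
 * For example, with base = 0, step = 10 and levels = 5, a value of -3
 * lands in the underflow bucket (lquanta[0]), 27 lands in the [20, 30)
 * bucket (lquanta[3]), and 57 lands in the overflow bucket
 * (lquanta[6]).
 */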
1921static int
1922dtrace_aggregate_llquantize_bucket(uint16_t factor, uint16_t low,
1923 uint16_t high, uint16_t nsteps, int64_t value)
1924{
1925 int64_t this = 1, last, next;
1926 int base = 1, order;
1927
1928 ASSERT(factor <= nsteps);
1929 ASSERT(nsteps % factor == 0);
1930
1931 for (order = 0; order < low; order++)
1932 this *= factor;
1933
1934 /*
1935 * If our value is less than our factor taken to the power of the
1936 * low order of magnitude, it goes into the zeroth bucket.
1937 */
1938 if (value < (last = this))
1939 return (0);
1940
1941 for (this *= factor; order <= high; order++) {
1942 int nbuckets = this > nsteps ? nsteps : this;
1943
1944 if ((next = this * factor) < this) {
1945 /*
1946 * We should not generally get log/linear quantizations
1947 * with a high magnitude that allows 64-bits to
1948 * overflow, but we nonetheless protect against this
1949 * by explicitly checking for overflow, and clamping
1950 * our value accordingly.
1951 */
1952 value = this - 1;
1953 }
1954
1955 if (value < this) {
1956 /*
1957 * If our value lies within this order of magnitude,
1958 * determine its position by taking the offset within
1959 * the order of magnitude, dividing by the bucket
1960 * width, and adding to our (accumulated) base.
1961 */
1962 return (base + (value - last) / (this / nbuckets));
1963 }
1964
1965 base += nbuckets - (nbuckets / factor);
1966 last = this;
1967 this = next;
1968 }
1969
1970 /*
1971 * Our value is greater than or equal to our factor taken to the
1972 * power of one plus the high magnitude -- return the top bucket.
1973 */
1974 return (base);
1975}
1976
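/*
 * For example, with factor = 10, low = 0, high = 2 and nsteps = 10,
 * the buckets are: one bucket for values below 1; values 1 through 9
 * mapped one-to-one; nine buckets of width 10 covering [10, 100);
 * nine buckets of width 100 covering [100, 1000); and a single top
 * bucket for values of 1000 and above.
 */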
1977static void
1978dtrace_aggregate_llquantize(uint64_t *llquanta, uint64_t nval, uint64_t incr)
1979{
1980 uint64_t arg = *llquanta++;
1981 uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(arg);
1982 uint16_t low = DTRACE_LLQUANTIZE_LOW(arg);
1983 uint16_t high = DTRACE_LLQUANTIZE_HIGH(arg);
1984 uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(arg);
1985
1986 llquanta[dtrace_aggregate_llquantize_bucket(factor,
1987 low, high, nsteps, nval)] += incr;
1988}
1989
1990/*ARGSUSED*/
1991static void
1992dtrace_aggregate_avg(uint64_t *data, uint64_t nval, uint64_t arg)
1993{
1994 data[0]++;
1995 data[1] += nval;
1996}
1997
1998/*ARGSUSED*/
1999static void
2000dtrace_aggregate_stddev(uint64_t *data, uint64_t nval, uint64_t arg)
2001{
2002 int64_t snval = (int64_t)nval;
2003 uint64_t tmp[2];
2004
2005 data[0]++;
2006 data[1] += nval;
2007
2008 /*
2009 * What we want to say here is:
2010 *
2011 * data[2] += nval * nval;
2012 *
2013 * But given that nval is 64-bit, we could easily overflow, so
2014 * we do this as 128-bit arithmetic.
2015 */
2016 if (snval < 0)
2017 snval = -snval;
2018
2019 dtrace_multiply_128((uint64_t)snval, (uint64_t)snval, tmp);
2020 dtrace_add_128(data + 2, tmp, data + 2);
2021}
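
/*
 * A user-level sketch (not the libdtrace consumer code) of how the
 * state maintained above -- data[0] = n, data[1] = sum(x), and a
 * 128-bit sum(x^2) in data[2..3] -- folds into a standard deviation
 * via Var(x) = E[x^2] - E[x]^2.  For illustration this assumes the
 * squared sum still fits in one 64-bit word and that <math.h> is
 * available (i.e., a userland build):
 */
static double
stddev_finalize_sketch(uint64_t n, uint64_t sum, uint64_t sumsq_lo)
{
	double mean = (double)sum / (double)n;
	double var = (double)sumsq_lo / (double)n - mean * mean;

	/* Clamp tiny negative drift from floating-point rounding. */
	return (var > 0.0 ? sqrt(var) : 0.0);
}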
2022
2023/*ARGSUSED*/
2024static void
2025dtrace_aggregate_count(uint64_t *oval, uint64_t nval, uint64_t arg)
2026{
2027 *oval = *oval + 1;
2028}
2029
2030/*ARGSUSED*/
2031static void
2032dtrace_aggregate_sum(uint64_t *oval, uint64_t nval, uint64_t arg)
2033{
2034 *oval += nval;
2035}
2036
2037/*
2038 * Aggregate given the tuple in the principal data buffer, and the aggregating
2039 * action denoted by the specified dtrace_aggregation_t. The aggregation
2040 * buffer is specified as the buf parameter. This routine does not return
2041 * failure; if there is no space in the aggregation buffer, the data will be
2042 * dropped, and a corresponding counter incremented.
2043 */
2044static void
2045dtrace_aggregate(dtrace_aggregation_t *agg, dtrace_buffer_t *dbuf,
2046 intptr_t offset, dtrace_buffer_t *buf, uint64_t expr, uint64_t arg)
2047{
2048 dtrace_recdesc_t *rec = &agg->dtag_action.dta_rec;
2049 uint32_t i, ndx, size, fsize;
2050 uint32_t align = sizeof (uint64_t) - 1;
2051 dtrace_aggbuffer_t *agb;
2052 dtrace_aggkey_t *key;
2053 uint32_t hashval = 0, limit, isstr;
2054 caddr_t tomax, data, kdata;
2055 dtrace_actkind_t action;
2056 dtrace_action_t *act;
2057 uintptr_t offs;
2058
2059 if (buf == NULL)
2060 return;
2061
2062 if (!agg->dtag_hasarg) {
2063 /*
2064 * Currently, only quantize() and lquantize() take additional
2065 * arguments, and they have the same semantics: an increment
2066 * value that defaults to 1 when not present. If additional
2067 * aggregating actions take arguments, the setting of the
2068 * default argument value will presumably have to become more
2069 * sophisticated...
2070 */
2071 arg = 1;
2072 }
2073
2074 action = agg->dtag_action.dta_kind - DTRACEACT_AGGREGATION;
2075 size = rec->dtrd_offset - agg->dtag_base;
2076 fsize = size + rec->dtrd_size;
2077
2078 ASSERT(dbuf->dtb_tomax != NULL);
2079 data = dbuf->dtb_tomax + offset + agg->dtag_base;
2080
2081 if ((tomax = buf->dtb_tomax) == NULL) {
2082 dtrace_buffer_drop(buf);
2083 return;
2084 }
2085
2086 /*
2087 * The metastructure is always at the bottom of the buffer.
2088 */
2089 agb = (dtrace_aggbuffer_t *)(tomax + buf->dtb_size -
2090 sizeof (dtrace_aggbuffer_t));
2091
2092 if (buf->dtb_offset == 0) {
2093 /*
2094 * We just kludge up approximately 1/8th of the size to be
2095 * buckets. If this guess ends up being routinely
2096 * off-the-mark, we may need to dynamically readjust this
2097 * based on past performance.
2098 */
2099 uintptr_t hashsize = (buf->dtb_size >> 3) / sizeof (uintptr_t);
2100
2101 if ((uintptr_t)agb - hashsize * sizeof (dtrace_aggkey_t *) <
2102 (uintptr_t)tomax || hashsize == 0) {
2103 /*
2104 * We've been given a ludicrously small buffer;
2105 * increment our drop count and leave.
2106 */
2107 dtrace_buffer_drop(buf);
2108 return;
2109 }
2110
2111 /*
2112	 * And now, a pathetic attempt to get an odd (or
2113 * perchance, a prime) hash size for better hash distribution.
2114 */
2115 if (hashsize > (DTRACE_AGGHASHSIZE_SLEW << 3))
2116 hashsize -= DTRACE_AGGHASHSIZE_SLEW;
2117
2118 agb->dtagb_hashsize = hashsize;
2119 agb->dtagb_hash = (dtrace_aggkey_t **)((uintptr_t)agb -
2120 agb->dtagb_hashsize * sizeof (dtrace_aggkey_t *));
2121 agb->dtagb_free = (uintptr_t)agb->dtagb_hash;
2122
2123 for (i = 0; i < agb->dtagb_hashsize; i++)
2124 agb->dtagb_hash[i] = NULL;
2125 }
2126
2127 ASSERT(agg->dtag_first != NULL);
2128 ASSERT(agg->dtag_first->dta_intuple);
2129
2130 /*
2131 * Calculate the hash value based on the key. Note that we _don't_
2132 * include the aggid in the hashing (but we will store it as part of
2133 * the key). The hashing algorithm is Bob Jenkins' "One-at-a-time"
2134 * algorithm: a simple, quick algorithm that has no known funnels, and
2135 * gets good distribution in practice. The efficacy of the hashing
2136 * algorithm (and a comparison with other algorithms) may be found by
2137 * running the ::dtrace_aggstat MDB dcmd.
2138 */
2139 for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2140 i = act->dta_rec.dtrd_offset - agg->dtag_base;
2141 limit = i + act->dta_rec.dtrd_size;
2142 ASSERT(limit <= size);
2143 isstr = DTRACEACT_ISSTRING(act);
2144
2145 for (; i < limit; i++) {
2146 hashval += data[i];
2147 hashval += (hashval << 10);
2148 hashval ^= (hashval >> 6);
2149
2150 if (isstr && data[i] == '\0')
2151 break;
2152 }
2153 }
2154
2155 hashval += (hashval << 3);
2156 hashval ^= (hashval >> 11);
2157 hashval += (hashval << 15);
2158
2159 /*
2160 * Yes, the divide here is expensive -- but it's generally the least
2161 * of the performance issues given the amount of data that we iterate
2162 * over to compute hash values, compare data, etc.
2163 */
2164 ndx = hashval % agb->dtagb_hashsize;
2165
2166 for (key = agb->dtagb_hash[ndx]; key != NULL; key = key->dtak_next) {
2167 ASSERT((caddr_t)key >= tomax);
2168 ASSERT((caddr_t)key < tomax + buf->dtb_size);
2169
2170 if (hashval != key->dtak_hashval || key->dtak_size != size)
2171 continue;
2172
2173 kdata = key->dtak_data;
2174 ASSERT(kdata >= tomax && kdata < tomax + buf->dtb_size);
2175
2176 for (act = agg->dtag_first; act->dta_intuple;
2177 act = act->dta_next) {
2178 i = act->dta_rec.dtrd_offset - agg->dtag_base;
2179 limit = i + act->dta_rec.dtrd_size;
2180 ASSERT(limit <= size);
2181 isstr = DTRACEACT_ISSTRING(act);
2182
2183 for (; i < limit; i++) {
2184 if (kdata[i] != data[i])
2185 goto next;
2186
2187 if (isstr && data[i] == '\0')
2188 break;
2189 }
2190 }
2191
2192 if (action != key->dtak_action) {
2193 /*
2194 * We are aggregating on the same value in the same
2195 * aggregation with two different aggregating actions.
2196 * (This should have been picked up in the compiler,
2197 * so we may be dealing with errant or devious DIF.)
2198 * This is an error condition; we indicate as much,
2199 * and return.
2200 */
2201 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
2202 return;
2203 }
2204
2205 /*
2206 * This is a hit: we need to apply the aggregator to
2207 * the value at this key.
2208 */
2209 agg->dtag_aggregate((uint64_t *)(kdata + size), expr, arg);
2210 return;
2211next:
2212 continue;
2213 }
2214
2215 /*
2216 * We didn't find it. We need to allocate some zero-filled space,
2217 * link it into the hash table appropriately, and apply the aggregator
2218 * to the (zero-filled) value.
2219 */
2220 offs = buf->dtb_offset;
2221 while (offs & (align - 1))
2222 offs += sizeof (uint32_t);
2223
2224 /*
2225 * If we don't have enough room to both allocate a new key _and_
2226 * its associated data, increment the drop count and return.
2227 */
2228 if ((uintptr_t)tomax + offs + fsize >
2229 agb->dtagb_free - sizeof (dtrace_aggkey_t)) {
2230 dtrace_buffer_drop(buf);
2231 return;
2232 }
2233
2234 /*CONSTCOND*/
2235 ASSERT(!(sizeof (dtrace_aggkey_t) & (sizeof (uintptr_t) - 1)));
2236 key = (dtrace_aggkey_t *)(agb->dtagb_free - sizeof (dtrace_aggkey_t));
2237 agb->dtagb_free -= sizeof (dtrace_aggkey_t);
2238
2239 key->dtak_data = kdata = tomax + offs;
2240 buf->dtb_offset = offs + fsize;
2241
2242 /*
2243 * Now copy the data across.
2244 */
2245 *((dtrace_aggid_t *)kdata) = agg->dtag_id;
2246
2247 for (i = sizeof (dtrace_aggid_t); i < size; i++)
2248 kdata[i] = data[i];
2249
2250 /*
2251 * Because strings are not zeroed out by default, we need to iterate
2252 * looking for actions that store strings, and we need to explicitly
2253 * pad these strings out with zeroes.
2254 */
2255 for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2256 int nul;
2257
2258 if (!DTRACEACT_ISSTRING(act))
2259 continue;
2260
2261 i = act->dta_rec.dtrd_offset - agg->dtag_base;
2262 limit = i + act->dta_rec.dtrd_size;
2263 ASSERT(limit <= size);
2264
2265 for (nul = 0; i < limit; i++) {
2266 if (nul) {
2267 kdata[i] = '\0';
2268 continue;
2269 }
2270
2271 if (data[i] != '\0')
2272 continue;
2273
2274 nul = 1;
2275 }
2276 }
2277
2278 for (i = size; i < fsize; i++)
2279 kdata[i] = 0;
2280
2281 key->dtak_hashval = hashval;
2282 key->dtak_size = size;
2283 key->dtak_action = action;
2284 key->dtak_next = agb->dtagb_hash[ndx];
2285 agb->dtagb_hash[ndx] = key;
2286
2287 /*
2288 * Finally, apply the aggregator.
2289 */
2290 *((uint64_t *)(key->dtak_data + size)) = agg->dtag_initial;
2291 agg->dtag_aggregate((uint64_t *)(key->dtak_data + size), expr, arg);
2292}
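
/*
 * For reference, the "One-at-a-time" hash used above, restated as a
 * standalone routine over a plain byte buffer (without the in-tuple
 * offsets and string-termination handling that dtrace_aggregate()
 * layers on top); the function name is an assumption:
 */
static uint32_t
jenkins_one_at_a_time(const uint8_t *data, size_t len)
{
	uint32_t hashval = 0;
	size_t i;

	for (i = 0; i < len; i++) {
		hashval += data[i];
		hashval += (hashval << 10);
		hashval ^= (hashval >> 6);
	}

	hashval += (hashval << 3);
	hashval ^= (hashval >> 11);
	hashval += (hashval << 15);

	return (hashval);
}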
2293
2294/*
2295 * Given consumer state, this routine finds a speculation in the INACTIVE
2296 * state and transitions it into the ACTIVE state. If there is no speculation
2297 * in the INACTIVE state, 0 is returned. In this case, no error counter is
2298 * incremented -- it is up to the caller to take appropriate action.
2299 */
2300static int
2301dtrace_speculation(dtrace_state_t *state)
2302{
2303 int i = 0;
2304 dtrace_speculation_state_t current;
2305 uint32_t *stat = &state->dts_speculations_unavail, count;
2306
2307 while (i < state->dts_nspeculations) {
2308 dtrace_speculation_t *spec = &state->dts_speculations[i];
2309
2310 current = spec->dtsp_state;
2311
2312 if (current != DTRACESPEC_INACTIVE) {
2313 if (current == DTRACESPEC_COMMITTINGMANY ||
2314 current == DTRACESPEC_COMMITTING ||
2315 current == DTRACESPEC_DISCARDING)
2316 stat = &state->dts_speculations_busy;
2317 i++;
2318 continue;
2319 }
2320
2321 if (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2322 current, DTRACESPEC_ACTIVE) == current)
2323 return (i + 1);
2324 }
2325
2326 /*
2327 * We couldn't find a speculation. If we found as much as a single
2328 * busy speculation buffer, we'll attribute this failure as "busy"
2329 * instead of "unavail".
2330 */
2331 do {
2332 count = *stat;
2333 } while (dtrace_cas32(stat, count, count + 1) != count);
2334
2335 return (0);
2336}
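
/*
 * The counter bump above is the usual lock-free read/compare-and-swap
 * retry loop.  The same pattern expressed against the standard C11
 * atomics API instead of dtrace_cas32(), purely as a userland
 * illustration:
 */
#include <stdatomic.h>

static void
counter_increment_sketch(_Atomic uint32_t *stat)
{
	uint32_t count;

	do {
		count = atomic_load(stat);
	} while (!atomic_compare_exchange_weak(stat, &count, count + 1));
}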
2337
2338/*
2339 * This routine commits an active speculation. If the specified speculation
2340 * is not in a valid state to perform a commit(), this routine will silently do
2341 * nothing. The state of the specified speculation is transitioned according
2342 * to the state transition diagram outlined in <sys/dtrace_impl.h>.
2343 */
2344static void
2345dtrace_speculation_commit(dtrace_state_t *state, processorid_t cpu,
2346 dtrace_specid_t which)
2347{
2348 dtrace_speculation_t *spec;
2349 dtrace_buffer_t *src, *dest;
2350 uintptr_t daddr, saddr, dlimit, slimit;
2351 dtrace_speculation_state_t current, new = 0;
2352 intptr_t offs;
2353 uint64_t timestamp;
2354
2355 if (which == 0)
2356 return;
2357
2358 if (which > state->dts_nspeculations) {
2359 cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2360 return;
2361 }
2362
2363 spec = &state->dts_speculations[which - 1];
2364 src = &spec->dtsp_buffer[cpu];
2365 dest = &state->dts_buffer[cpu];
2366
2367 do {
2368 current = spec->dtsp_state;
2369
2370 if (current == DTRACESPEC_COMMITTINGMANY)
2371 break;
2372
2373 switch (current) {
2374 case DTRACESPEC_INACTIVE:
2375 case DTRACESPEC_DISCARDING:
2376 return;
2377
2378 case DTRACESPEC_COMMITTING:
2379 /*
2380 * This is only possible if we are (a) commit()'ing
2381 * without having done a prior speculate() on this CPU
2382 * and (b) racing with another commit() on a different
2383 * CPU. There's nothing to do -- we just assert that
2384 * our offset is 0.
2385 */
2386 ASSERT(src->dtb_offset == 0);
2387 return;
2388
2389 case DTRACESPEC_ACTIVE:
2390 new = DTRACESPEC_COMMITTING;
2391 break;
2392
2393 case DTRACESPEC_ACTIVEONE:
2394 /*
2395 * This speculation is active on one CPU. If our
2396 * buffer offset is non-zero, we know that the one CPU
2397 * must be us. Otherwise, we are committing on a
2398 * different CPU from the speculate(), and we must
2399 * rely on being asynchronously cleaned.
2400 */
2401 if (src->dtb_offset != 0) {
2402 new = DTRACESPEC_COMMITTING;
2403 break;
2404 }
2405 /*FALLTHROUGH*/
2406
2407 case DTRACESPEC_ACTIVEMANY:
2408 new = DTRACESPEC_COMMITTINGMANY;
2409 break;
2410
2411 default:
2412 ASSERT(0);
2413 }
2414 } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2415 current, new) != current);
2416
2417 /*
2418 * We have set the state to indicate that we are committing this
2419 * speculation. Now reserve the necessary space in the destination
2420 * buffer.
2421 */
2422 if ((offs = dtrace_buffer_reserve(dest, src->dtb_offset,
2423 sizeof (uint64_t), state, NULL)) < 0) {
2424 dtrace_buffer_drop(dest);
2425 goto out;
2426 }
2427
2428 /*
2429 * We have sufficient space to copy the speculative buffer into the
2430 * primary buffer. First, modify the speculative buffer, filling
2431 * in the timestamp of all entries with the current time. The data
2432 * must have the commit() time rather than the time it was traced,
2433 * so that all entries in the primary buffer are in timestamp order.
2434 */
2435 timestamp = dtrace_gethrtime();
2436 saddr = (uintptr_t)src->dtb_tomax;
2437 slimit = saddr + src->dtb_offset;
2438 while (saddr < slimit) {
2439 size_t size;
2440 dtrace_rechdr_t *dtrh = (dtrace_rechdr_t *)saddr;
2441
2442 if (dtrh->dtrh_epid == DTRACE_EPIDNONE) {
2443 saddr += sizeof (dtrace_epid_t);
2444 continue;
2445 }
2446 ASSERT3U(dtrh->dtrh_epid, <=, state->dts_necbs);
2447 size = state->dts_ecbs[dtrh->dtrh_epid - 1]->dte_size;
2448
2449 ASSERT3U(saddr + size, <=, slimit);
2450 ASSERT3U(size, >=, sizeof (dtrace_rechdr_t));
2451 ASSERT3U(DTRACE_RECORD_LOAD_TIMESTAMP(dtrh), ==, UINT64_MAX);
2452
2453 DTRACE_RECORD_STORE_TIMESTAMP(dtrh, timestamp);
2454
2455 saddr += size;
2456 }
2457
2458 /*
2459 * Copy the buffer across. (Note that this is a
2460	 * highly suboptimal bcopy(); in the unlikely event that this becomes
2461 * a serious performance issue, a high-performance DTrace-specific
2462 * bcopy() should obviously be invented.)
2463 */
2464 daddr = (uintptr_t)dest->dtb_tomax + offs;
2465 dlimit = daddr + src->dtb_offset;
2466 saddr = (uintptr_t)src->dtb_tomax;
2467
2468 /*
2469 * First, the aligned portion.
2470 */
2471 while (dlimit - daddr >= sizeof (uint64_t)) {
2472 *((uint64_t *)daddr) = *((uint64_t *)saddr);
2473
2474 daddr += sizeof (uint64_t);
2475 saddr += sizeof (uint64_t);
2476 }
2477
2478 /*
2479 * Now any left-over bit...
2480 */
2481 while (dlimit - daddr)
2482 *((uint8_t *)daddr++) = *((uint8_t *)saddr++);
2483
2484 /*
2485 * Finally, commit the reserved space in the destination buffer.
2486 */
2487 dest->dtb_offset = offs + src->dtb_offset;
2488
2489out:
2490 /*
2491 * If we're lucky enough to be the only active CPU on this speculation
2492 * buffer, we can just set the state back to DTRACESPEC_INACTIVE.
2493 */
2494 if (current == DTRACESPEC_ACTIVE ||
2495 (current == DTRACESPEC_ACTIVEONE && new == DTRACESPEC_COMMITTING)) {
2496 uint32_t rval = dtrace_cas32((uint32_t *)&spec->dtsp_state,
2497 DTRACESPEC_COMMITTING, DTRACESPEC_INACTIVE);
2498
2499 ASSERT(rval == DTRACESPEC_COMMITTING);
2500 }
2501
2502 src->dtb_offset = 0;
2503 src->dtb_xamot_drops += src->dtb_drops;
2504 src->dtb_drops = 0;
2505}
2506
2507/*
2508 * This routine discards an active speculation. If the specified speculation
2509 * is not in a valid state to perform a discard(), this routine will silently
2510 * do nothing. The state of the specified speculation is transitioned
2511 * according to the state transition diagram outlined in <sys/dtrace_impl.h>.
2512 */
2513static void
2514dtrace_speculation_discard(dtrace_state_t *state, processorid_t cpu,
2515 dtrace_specid_t which)
2516{
2517 dtrace_speculation_t *spec;
2518 dtrace_speculation_state_t current, new = 0;
2519 dtrace_buffer_t *buf;
2520
2521 if (which == 0)
2522 return;
2523
2524 if (which > state->dts_nspeculations) {
2525 cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2526 return;
2527 }
2528
2529 spec = &state->dts_speculations[which - 1];
2530 buf = &spec->dtsp_buffer[cpu];
2531
2532 do {
2533 current = spec->dtsp_state;
2534
2535 switch (current) {
2536 case DTRACESPEC_INACTIVE:
2537 case DTRACESPEC_COMMITTINGMANY:
2538 case DTRACESPEC_COMMITTING:
2539 case DTRACESPEC_DISCARDING:
2540 return;
2541
2542 case DTRACESPEC_ACTIVE:
2543 case DTRACESPEC_ACTIVEMANY:
2544 new = DTRACESPEC_DISCARDING;
2545 break;
2546
2547 case DTRACESPEC_ACTIVEONE:
2548 if (buf->dtb_offset != 0) {
2549 new = DTRACESPEC_INACTIVE;
2550 } else {
2551 new = DTRACESPEC_DISCARDING;
2552 }
2553 break;
2554
2555 default:
2556 ASSERT(0);
2557 }
2558 } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2559 current, new) != current);
2560
2561 buf->dtb_offset = 0;
2562 buf->dtb_drops = 0;
2563}
2564
2565/*
2566 * Note: not called from probe context. This function is called
2567 * asynchronously from cross call context to clean any speculations that are
2568 * in the COMMITTINGMANY or DISCARDING states. These speculations may not be
2569 * transitioned back to the INACTIVE state until all CPUs have cleaned the
2570 * speculation.
2571 */
2572static void
2573dtrace_speculation_clean_here(dtrace_state_t *state)
2574{
2575 dtrace_icookie_t cookie;
2576 processorid_t cpu = curcpu;
2577 dtrace_buffer_t *dest = &state->dts_buffer[cpu];
2578 dtrace_specid_t i;
2579
2580 cookie = dtrace_interrupt_disable();
2581
2582 if (dest->dtb_tomax == NULL) {
2583 dtrace_interrupt_enable(cookie);
2584 return;
2585 }
2586
2587 for (i = 0; i < state->dts_nspeculations; i++) {
2588 dtrace_speculation_t *spec = &state->dts_speculations[i];
2589 dtrace_buffer_t *src = &spec->dtsp_buffer[cpu];
2590
2591 if (src->dtb_tomax == NULL)
2592 continue;
2593
2594 if (spec->dtsp_state == DTRACESPEC_DISCARDING) {
2595 src->dtb_offset = 0;
2596 continue;
2597 }
2598
2599 if (spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
2600 continue;
2601
2602 if (src->dtb_offset == 0)
2603 continue;
2604
2605 dtrace_speculation_commit(state, cpu, i + 1);
2606 }
2607
2608 dtrace_interrupt_enable(cookie);
2609}
2610
2611/*
2612 * Note: not called from probe context. This function is called
2613 * asynchronously (and at a regular interval) to clean any speculations that
2614 * are in the COMMITTINGMANY or DISCARDING states. If it discovers that there
2615 * is work to be done, it cross calls all CPUs to perform that work;
2616 * COMMITTINGMANY and DISCARDING speculations may not be transitioned back to the
2617 * INACTIVE state until they have been cleaned by all CPUs.
2618 */
2619static void
2620dtrace_speculation_clean(dtrace_state_t *state)
2621{
2622 int work = 0, rv;
2623 dtrace_specid_t i;
2624
2625 for (i = 0; i < state->dts_nspeculations; i++) {
2626 dtrace_speculation_t *spec = &state->dts_speculations[i];
2627
2628 ASSERT(!spec->dtsp_cleaning);
2629
2630 if (spec->dtsp_state != DTRACESPEC_DISCARDING &&
2631 spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
2632 continue;
2633
2634 work++;
2635 spec->dtsp_cleaning = 1;
2636 }
2637
2638 if (!work)
2639 return;
2640
2641 dtrace_xcall(DTRACE_CPUALL,
2642 (dtrace_xcall_t)dtrace_speculation_clean_here, state);
2643
2644 /*
2645 * We now know that all CPUs have committed or discarded their
2646 * speculation buffers, as appropriate. We can now set the state
2647 * to inactive.
2648 */
2649 for (i = 0; i < state->dts_nspeculations; i++) {
2650 dtrace_speculation_t *spec = &state->dts_speculations[i];
2651 dtrace_speculation_state_t current, new;
2652
2653 if (!spec->dtsp_cleaning)
2654 continue;
2655
2656 current = spec->dtsp_state;
2657 ASSERT(current == DTRACESPEC_DISCARDING ||
2658 current == DTRACESPEC_COMMITTINGMANY);
2659
2660 new = DTRACESPEC_INACTIVE;
2661
2662 rv = dtrace_cas32((uint32_t *)&spec->dtsp_state, current, new);
2663 ASSERT(rv == current);
2664 spec->dtsp_cleaning = 0;
2665 }
2666}
2667
2668/*
2669 * Called as part of a speculate() to get the speculative buffer associated
2670 * with a given speculation. Returns NULL if the specified speculation is not
2671 * in an ACTIVE state. If the speculation is in the ACTIVEONE state -- and
2672 * the active CPU is not the specified CPU -- the speculation will be
2673 * atomically transitioned into the ACTIVEMANY state.
2674 */
2675static dtrace_buffer_t *
2676dtrace_speculation_buffer(dtrace_state_t *state, processorid_t cpuid,
2677 dtrace_specid_t which)
2678{
2679 dtrace_speculation_t *spec;
2680 dtrace_speculation_state_t current, new = 0;
2681 dtrace_buffer_t *buf;
2682
2683 if (which == 0)
2684 return (NULL);
2685
2686 if (which > state->dts_nspeculations) {
2687 cpu_core[cpuid].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2688 return (NULL);
2689 }
2690
2691 spec = &state->dts_speculations[which - 1];
2692 buf = &spec->dtsp_buffer[cpuid];
2693
2694 do {
2695 current = spec->dtsp_state;
2696
2697 switch (current) {
2698 case DTRACESPEC_INACTIVE:
2699 case DTRACESPEC_COMMITTINGMANY:
2700 case DTRACESPEC_DISCARDING:
2701 return (NULL);
2702
2703 case DTRACESPEC_COMMITTING:
2704 ASSERT(buf->dtb_offset == 0);
2705 return (NULL);
2706
2707 case DTRACESPEC_ACTIVEONE:
2708 /*
2709 * This speculation is currently active on one CPU.
2710 * Check the offset in the buffer; if it's non-zero,
2711 * that CPU must be us (and we leave the state alone).
2712 * If it's zero, assume that we're starting on a new
2713 * CPU -- and change the state to indicate that the
2714 * speculation is active on more than one CPU.
2715 */
2716 if (buf->dtb_offset != 0)
2717 return (buf);
2718
2719 new = DTRACESPEC_ACTIVEMANY;
2720 break;
2721
2722 case DTRACESPEC_ACTIVEMANY:
2723 return (buf);
2724
2725 case DTRACESPEC_ACTIVE:
2726 new = DTRACESPEC_ACTIVEONE;
2727 break;
2728
2729 default:
2730 ASSERT(0);
2731 }
2732 } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2733 current, new) != current);
2734
2735 ASSERT(new == DTRACESPEC_ACTIVEONE || new == DTRACESPEC_ACTIVEMANY);
2736 return (buf);
2737}
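
/*
 * Taken together, dtrace_speculation(), dtrace_speculation_buffer(),
 * dtrace_speculation_commit(), dtrace_speculation_discard() and the
 * cleaner walk the transitions summarized below.  This condensed view
 * is derived only from the routines above; the diagram in
 * <sys/dtrace_impl.h> remains authoritative.
 *
 *	operation	from		to
 *	---------	----		--
 *	speculation()	INACTIVE	ACTIVE
 *	speculate	ACTIVE		ACTIVEONE
 *	speculate	ACTIVEONE	ACTIVEMANY	(different CPU)
 *	commit()	ACTIVE		COMMITTING
 *	commit()	ACTIVEONE	COMMITTING	(speculating CPU)
 *	commit()	ACTIVEONE	COMMITTINGMANY	(different CPU)
 *	commit()	ACTIVEMANY	COMMITTINGMANY
 *	commit()	COMMITTING	INACTIVE	(single-CPU case)
 *	discard()	ACTIVE		DISCARDING
 *	discard()	ACTIVEONE	INACTIVE	(speculating CPU)
 *	discard()	ACTIVEONE	DISCARDING	(different CPU)
 *	discard()	ACTIVEMANY	DISCARDING
 *	clean		COMMITTINGMANY	INACTIVE	(all CPUs cleaned)
 *	clean		DISCARDING	INACTIVE	(all CPUs cleaned)
 */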
2738
2739/*
2740 * Return a string. In the event that the user lacks the privilege to access
2741 * arbitrary kernel memory, we copy the string out to scratch memory so that we
2742 * don't fail access checking.
2743 *
2744 * dtrace_dif_variable() uses this routine as a helper for various
2745 * builtin values such as 'execname' and 'probefunc.'
2746 */
2747uintptr_t
2748dtrace_dif_varstr(uintptr_t addr, dtrace_state_t *state,
2749 dtrace_mstate_t *mstate)
2750{
2751 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
2752 uintptr_t ret;
2753 size_t strsz;
2754
2755 /*
2756 * The easy case: this probe is allowed to read all of memory, so
2757 * we can just return this as a vanilla pointer.
2758 */
2759 if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
2760 return (addr);
2761
2762 /*
2763 * This is the tougher case: we copy the string in question from
2764 * kernel memory into scratch memory and return it that way: this
2765 * ensures that we won't trip up when access checking tests the
2766 * BYREF return value.
2767 */
2768 strsz = dtrace_strlen((char *)addr, size) + 1;
2769
2770 if (mstate->dtms_scratch_ptr + strsz >
2771 mstate->dtms_scratch_base + mstate->dtms_scratch_size) {
2772 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
2773 return (0);
2774 }
2775
2776 dtrace_strcpy((const void *)addr, (void *)mstate->dtms_scratch_ptr,
2777 strsz);
2778 ret = mstate->dtms_scratch_ptr;
2779 mstate->dtms_scratch_ptr += strsz;
2780 return (ret);
2781}
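
/*
 * The scratch handling above is a simple bump allocator over the
 * per-probe scratch region.  The same discipline, factored out as an
 * illustrative sketch (field names as in dtrace_mstate_t; flagging
 * CPU_DTRACE_NOSCRATCH is left to the caller, as above):
 */
static uintptr_t
scratch_bump_sketch(dtrace_mstate_t *mstate, size_t len)
{
	uintptr_t ret = mstate->dtms_scratch_ptr;

	if (ret + len >
	    mstate->dtms_scratch_base + mstate->dtms_scratch_size)
		return (0);

	mstate->dtms_scratch_ptr += len;
	return (ret);
}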
2782
2783/*
2784 * Return a string from a memory address which is known to have one or
2785 * more concatenated, individually zero-terminated sub-strings.
2786 * In the event that the user lacks the privilege to access
2787 * arbitrary kernel memory, we copy the string out to scratch memory so that we
2788 * don't fail access checking.
2789 *
2790 * dtrace_dif_variable() uses this routine as a helper for various
2791 * builtin values such as 'execargs'.
2792 */
2793static uintptr_t
2794dtrace_dif_varstrz(uintptr_t addr, size_t strsz, dtrace_state_t *state,
2795 dtrace_mstate_t *mstate)
2796{
2797 char *p;
2798 size_t i;
2799 uintptr_t ret;
2800
2801 if (mstate->dtms_scratch_ptr + strsz >
2802 mstate->dtms_scratch_base + mstate->dtms_scratch_size) {
2803 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
2804 return (0);
2805 }
2806
2807 dtrace_bcopy((const void *)addr, (void *)mstate->dtms_scratch_ptr,
2808 strsz);
2809
2810 /* Replace sub-string termination characters with a space. */
2811 for (p = (char *) mstate->dtms_scratch_ptr, i = 0; i < strsz - 1;
2812 p++, i++)
2813 if (*p == '\0')
2814 *p = ' ';
2815
2816 ret = mstate->dtms_scratch_ptr;
2817 mstate->dtms_scratch_ptr += strsz;
2818 return (ret);
2819}
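
/*
 * Illustration: if execargs were stored as the six bytes "ls\0-l\0",
 * the rewrite above would yield the printable "ls -l" -- the final NUL
 * survives because the loop stops at strsz - 1.  The same rewrite as a
 * tiny user-level helper (the name is an assumption):
 */
static void
flatten_strz_sketch(char *buf, size_t strsz)
{
	size_t i;

	for (i = 0; i + 1 < strsz; i++) {
		if (buf[i] == '\0')
			buf[i] = ' ';
	}
}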
2820
2821/*
2822 * This function implements the DIF emulator's variable lookups. The emulator
2823 * passes a reserved variable identifier and optional built-in array index.
2824 */
2825static uint64_t
2826dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v,
2827 uint64_t ndx)
2828{
2829 /*
2830 * If we're accessing one of the uncached arguments, we'll turn this
2831 * into a reference in the args array.
2832 */
2833 if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9) {
2834 ndx = v - DIF_VAR_ARG0;
2835 v = DIF_VAR_ARGS;
2836 }
2837
2838 switch (v) {
2839 case DIF_VAR_ARGS:
2840 ASSERT(mstate->dtms_present & DTRACE_MSTATE_ARGS);
2841 if (ndx >= sizeof (mstate->dtms_arg) /
2842 sizeof (mstate->dtms_arg[0])) {
2843 int aframes = mstate->dtms_probe->dtpr_aframes + 2;
2844 dtrace_provider_t *pv;
2845 uint64_t val;
2846
2847 pv = mstate->dtms_probe->dtpr_provider;
2848 if (pv->dtpv_pops.dtps_getargval != NULL)
2849 val = pv->dtpv_pops.dtps_getargval(pv->dtpv_arg,
2850 mstate->dtms_probe->dtpr_id,
2851 mstate->dtms_probe->dtpr_arg, ndx, aframes);
2852 else
2853 val = dtrace_getarg(ndx, aframes);
2854
2855 /*
2856 * This is regrettably required to keep the compiler
2857 * from tail-optimizing the call to dtrace_getarg().
2858 * The condition always evaluates to true, but the
2859 * compiler has no way of figuring that out a priori.
2860 * (None of this would be necessary if the compiler
2861 * could be relied upon to _always_ tail-optimize
2862 * the call to dtrace_getarg() -- but it can't.)
2863 */
2864 if (mstate->dtms_probe != NULL)
2865 return (val);
2866
2867 ASSERT(0);
2868 }
2869
2870 return (mstate->dtms_arg[ndx]);
2871
2872#if defined(sun)
2873 case DIF_VAR_UREGS: {
2874 klwp_t *lwp;
2875
2876 if (!dtrace_priv_proc(state))
2877 return (0);
2878
2879 if ((lwp = curthread->t_lwp) == NULL) {
2880 DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
2881 cpu_core[curcpu].cpuc_dtrace_illval = NULL;
2882 return (0);
2883 }
2884
2885 return (dtrace_getreg(lwp->lwp_regs, ndx));
2887 }
2888#else
2889 case DIF_VAR_UREGS: {
2890 struct trapframe *tframe;
2891
2892 if (!dtrace_priv_proc(state))
2893 return (0);
2894
2895 if ((tframe = curthread->td_frame) == NULL) {
2896 DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
2897 cpu_core[curcpu].cpuc_dtrace_illval = 0;
2898 return (0);
2899 }
2900
2901 return (dtrace_getreg(tframe, ndx));
2902 }
2903#endif
2904
2905 case DIF_VAR_CURTHREAD:
1548 * Note: not called from probe context. This function is called
1549 * asynchronously (and at a regular interval) from outside of probe context to
1550 * clean the dirty dynamic variable lists on all CPUs. Dynamic variable
1551 * cleaning is explained in detail in <sys/dtrace_impl.h>.
1552 */
1553void
1554dtrace_dynvar_clean(dtrace_dstate_t *dstate)
1555{
1556 dtrace_dynvar_t *dirty;
1557 dtrace_dstate_percpu_t *dcpu;
1558 int i, work = 0;
1559
1560 for (i = 0; i < NCPU; i++) {
1561 dcpu = &dstate->dtds_percpu[i];
1562
1563 ASSERT(dcpu->dtdsc_rinsing == NULL);
1564
1565 /*
1566 * If the dirty list is NULL, there is no dirty work to do.
1567 */
1568 if (dcpu->dtdsc_dirty == NULL)
1569 continue;
1570
1571 /*
1572 * If the clean list is non-NULL, then we're not going to do
1573 * any work for this CPU -- it means that there has not been
1574 * a dtrace_dynvar() allocation on this CPU (or from this CPU)
1575 * since the last time we cleaned house.
1576 */
1577 if (dcpu->dtdsc_clean != NULL)
1578 continue;
1579
1580 work = 1;
1581
1582 /*
1583 * Atomically move the dirty list aside.
1584 */
1585 do {
1586 dirty = dcpu->dtdsc_dirty;
1587
1588 /*
1589 * Before we zap the dirty list, set the rinsing list.
1590 * (This allows for a potential assertion in
1591 * dtrace_dynvar(): if a free dynamic variable appears
1592 * on a hash chain, either the dirty list or the
1593 * rinsing list for some CPU must be non-NULL.)
1594 */
1595 dcpu->dtdsc_rinsing = dirty;
1596 dtrace_membar_producer();
1597 } while (dtrace_casptr(&dcpu->dtdsc_dirty,
1598 dirty, NULL) != dirty);
1599 }
1600
1601 if (!work) {
1602 /*
1603 * We have no work to do; we can simply return.
1604 */
1605 return;
1606 }
1607
1608 dtrace_sync();
1609
1610 for (i = 0; i < NCPU; i++) {
1611 dcpu = &dstate->dtds_percpu[i];
1612
1613 if (dcpu->dtdsc_rinsing == NULL)
1614 continue;
1615
1616 /*
1617 * We are now guaranteed that no hash chain contains a pointer
1618 * into this dirty list; we can make it clean.
1619 */
1620 ASSERT(dcpu->dtdsc_clean == NULL);
1621 dcpu->dtdsc_clean = dcpu->dtdsc_rinsing;
1622 dcpu->dtdsc_rinsing = NULL;
1623 }
1624
1625 /*
1626 * Before we actually set the state to be DTRACE_DSTATE_CLEAN, make
1627 * sure that all CPUs have seen all of the dtdsc_clean pointers.
1628 * This prevents a race whereby a CPU incorrectly decides that
1629 * the state should be something other than DTRACE_DSTATE_CLEAN
1630 * after dtrace_dynvar_clean() has completed.
1631 */
1632 dtrace_sync();
1633
1634 dstate->dtds_state = DTRACE_DSTATE_CLEAN;
1635}
1636
1637/*
1638 * Depending on the value of the op parameter, this function looks-up,
1639 * allocates or deallocates an arbitrarily-keyed dynamic variable. If an
1640 * allocation is requested, this function will return a pointer to a
1641 * dtrace_dynvar_t corresponding to the allocated variable -- or NULL if no
1642 * variable can be allocated. If NULL is returned, the appropriate counter
1643 * will be incremented.
1644 */
1645dtrace_dynvar_t *
1646dtrace_dynvar(dtrace_dstate_t *dstate, uint_t nkeys,
1647 dtrace_key_t *key, size_t dsize, dtrace_dynvar_op_t op,
1648 dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
1649{
1650 uint64_t hashval = DTRACE_DYNHASH_VALID;
1651 dtrace_dynhash_t *hash = dstate->dtds_hash;
1652 dtrace_dynvar_t *free, *new_free, *next, *dvar, *start, *prev = NULL;
1653 processorid_t me = curcpu, cpu = me;
1654 dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[me];
1655 size_t bucket, ksize;
1656 size_t chunksize = dstate->dtds_chunksize;
1657 uintptr_t kdata, lock, nstate;
1658 uint_t i;
1659
1660 ASSERT(nkeys != 0);
1661
1662 /*
1663 * Hash the key. As with aggregations, we use Jenkins' "One-at-a-time"
1664 * algorithm. For the by-value portions, we perform the algorithm in
1665 * 16-bit chunks (as opposed to 8-bit chunks). This speeds things up a
1666 * bit, and seems to have only a minute effect on distribution. For
1667 * the by-reference data, we perform "One-at-a-time" iterating (safely)
1668 * over each referenced byte. It's painful to do this, but it's much
1669 * better than pathological hash distribution. The efficacy of the
1670 * hashing algorithm (and a comparison with other algorithms) may be
1671 * found by running the ::dtrace_dynstat MDB dcmd.
1672 */
1673 for (i = 0; i < nkeys; i++) {
1674 if (key[i].dttk_size == 0) {
1675 uint64_t val = key[i].dttk_value;
1676
1677 hashval += (val >> 48) & 0xffff;
1678 hashval += (hashval << 10);
1679 hashval ^= (hashval >> 6);
1680
1681 hashval += (val >> 32) & 0xffff;
1682 hashval += (hashval << 10);
1683 hashval ^= (hashval >> 6);
1684
1685 hashval += (val >> 16) & 0xffff;
1686 hashval += (hashval << 10);
1687 hashval ^= (hashval >> 6);
1688
1689 hashval += val & 0xffff;
1690 hashval += (hashval << 10);
1691 hashval ^= (hashval >> 6);
1692 } else {
1693 /*
1694 * This is incredibly painful, but it beats the hell
1695 * out of the alternative.
1696 */
1697 uint64_t j, size = key[i].dttk_size;
1698 uintptr_t base = (uintptr_t)key[i].dttk_value;
1699
1700 if (!dtrace_canload(base, size, mstate, vstate))
1701 break;
1702
1703 for (j = 0; j < size; j++) {
1704 hashval += dtrace_load8(base + j);
1705 hashval += (hashval << 10);
1706 hashval ^= (hashval >> 6);
1707 }
1708 }
1709 }
1710
1711 if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_FAULT))
1712 return (NULL);
1713
1714 hashval += (hashval << 3);
1715 hashval ^= (hashval >> 11);
1716 hashval += (hashval << 15);
1717
1718 /*
1719 * There is a remote chance (ideally, 1 in 2^31) that our hashval
1720 * comes out to be one of our two sentinel hash values. If this
1721 * actually happens, we set the hashval to be a value known to be a
1722 * non-sentinel value.
1723 */
1724 if (hashval == DTRACE_DYNHASH_FREE || hashval == DTRACE_DYNHASH_SINK)
1725 hashval = DTRACE_DYNHASH_VALID;
1726
1727 /*
1728 * Yes, it's painful to do a divide here. If the cycle count becomes
1729 * important here, tricks can be pulled to reduce it. (However, it's
1730 * critical that hash collisions be kept to an absolute minimum;
1731 * they're much more painful than a divide.) It's better to have a
1732 * solution that generates few collisions and still keeps things
1733 * relatively simple.
1734 */
1735 bucket = hashval % dstate->dtds_hashsize;
1736
1737 if (op == DTRACE_DYNVAR_DEALLOC) {
1738 volatile uintptr_t *lockp = &hash[bucket].dtdh_lock;
1739
1740 for (;;) {
1741 while ((lock = *lockp) & 1)
1742 continue;
1743
1744 if (dtrace_casptr((volatile void *)lockp,
1745 (volatile void *)lock, (volatile void *)(lock + 1)) == (void *)lock)
1746 break;
1747 }
1748
1749 dtrace_membar_producer();
1750 }
1751
1752top:
1753 prev = NULL;
1754 lock = hash[bucket].dtdh_lock;
1755
1756 dtrace_membar_consumer();
1757
1758 start = hash[bucket].dtdh_chain;
1759 ASSERT(start != NULL && (start->dtdv_hashval == DTRACE_DYNHASH_SINK ||
1760 start->dtdv_hashval != DTRACE_DYNHASH_FREE ||
1761 op != DTRACE_DYNVAR_DEALLOC));
1762
1763 for (dvar = start; dvar != NULL; dvar = dvar->dtdv_next) {
1764 dtrace_tuple_t *dtuple = &dvar->dtdv_tuple;
1765 dtrace_key_t *dkey = &dtuple->dtt_key[0];
1766
1767 if (dvar->dtdv_hashval != hashval) {
1768 if (dvar->dtdv_hashval == DTRACE_DYNHASH_SINK) {
1769 /*
1770 * We've reached the sink, and therefore the
1771 * end of the hash chain; we can kick out of
1772 * the loop knowing that we have seen a valid
1773 * snapshot of state.
1774 */
1775 ASSERT(dvar->dtdv_next == NULL);
1776 ASSERT(dvar == &dtrace_dynhash_sink);
1777 break;
1778 }
1779
1780 if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE) {
1781 /*
1782 * We've gone off the rails: somewhere along
1783 * the line, one of the members of this hash
1784 * chain was deleted. Note that we could also
1785 * detect this by simply letting this loop run
1786 * to completion, as we would eventually hit
1787 * the end of the dirty list. However, we
1788 * want to avoid running the length of the
1789 * dirty list unnecessarily (it might be quite
1790 * long), so we catch this as early as
1791 * possible by detecting the hash marker. In
1792 * this case, we simply set dvar to NULL and
1793 * break; the conditional after the loop will
1794 * send us back to top.
1795 */
1796 dvar = NULL;
1797 break;
1798 }
1799
1800 goto next;
1801 }
1802
1803 if (dtuple->dtt_nkeys != nkeys)
1804 goto next;
1805
1806 for (i = 0; i < nkeys; i++, dkey++) {
1807 if (dkey->dttk_size != key[i].dttk_size)
1808 goto next; /* size or type mismatch */
1809
1810 if (dkey->dttk_size != 0) {
1811 if (dtrace_bcmp(
1812 (void *)(uintptr_t)key[i].dttk_value,
1813 (void *)(uintptr_t)dkey->dttk_value,
1814 dkey->dttk_size))
1815 goto next;
1816 } else {
1817 if (dkey->dttk_value != key[i].dttk_value)
1818 goto next;
1819 }
1820 }
1821
1822 if (op != DTRACE_DYNVAR_DEALLOC)
1823 return (dvar);
1824
1825 ASSERT(dvar->dtdv_next == NULL ||
1826 dvar->dtdv_next->dtdv_hashval != DTRACE_DYNHASH_FREE);
1827
1828 if (prev != NULL) {
1829 ASSERT(hash[bucket].dtdh_chain != dvar);
1830 ASSERT(start != dvar);
1831 ASSERT(prev->dtdv_next == dvar);
1832 prev->dtdv_next = dvar->dtdv_next;
1833 } else {
1834 if (dtrace_casptr(&hash[bucket].dtdh_chain,
1835 start, dvar->dtdv_next) != start) {
1836 /*
1837 * We have failed to atomically swing the
1838 * hash table head pointer, presumably because
1839 * of a conflicting allocation on another CPU.
1840 * We need to reread the hash chain and try
1841 * again.
1842 */
1843 goto top;
1844 }
1845 }
1846
1847 dtrace_membar_producer();
1848
1849 /*
1850 * Now set the hash value to indicate that it's free.
1851 */
1852 ASSERT(hash[bucket].dtdh_chain != dvar);
1853 dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
1854
1855 dtrace_membar_producer();
1856
1857 /*
1858 * Set the next pointer to point at the dirty list, and
1859 * atomically swing the dirty pointer to the newly freed dvar.
1860 */
1861 do {
1862 next = dcpu->dtdsc_dirty;
1863 dvar->dtdv_next = next;
1864 } while (dtrace_casptr(&dcpu->dtdsc_dirty, next, dvar) != next);
1865
1866 /*
1867 * Finally, unlock this hash bucket.
1868 */
1869 ASSERT(hash[bucket].dtdh_lock == lock);
1870 ASSERT(lock & 1);
1871 hash[bucket].dtdh_lock++;
1872
1873 return (NULL);
1874next:
1875 prev = dvar;
1876 continue;
1877 }
1878
1879 if (dvar == NULL) {
1880 /*
1881 * If dvar is NULL, it is because we went off the rails:
1882 * one of the elements that we traversed in the hash chain
1883 * was deleted while we were traversing it. In this case,
1884 * we assert that we aren't doing a dealloc (deallocs lock
1885 * the hash bucket to prevent themselves from racing with
1886 * one another), and retry the hash chain traversal.
1887 */
1888 ASSERT(op != DTRACE_DYNVAR_DEALLOC);
1889 goto top;
1890 }
1891
1892 if (op != DTRACE_DYNVAR_ALLOC) {
1893 /*
1894 * If we are not to allocate a new variable, we want to
1895 * return NULL now. Before we return, check that the value
1896 * of the lock word hasn't changed. If it has, we may have
1897 * seen an inconsistent snapshot.
1898 */
1899 if (op == DTRACE_DYNVAR_NOALLOC) {
1900 if (hash[bucket].dtdh_lock != lock)
1901 goto top;
1902 } else {
1903 ASSERT(op == DTRACE_DYNVAR_DEALLOC);
1904 ASSERT(hash[bucket].dtdh_lock == lock);
1905 ASSERT(lock & 1);
1906 hash[bucket].dtdh_lock++;
1907 }
1908
1909 return (NULL);
1910 }
1911
1912 /*
1913 * We need to allocate a new dynamic variable. The size we need is the
1914 * size of dtrace_dynvar plus the size of nkeys dtrace_key_t's plus the
1915 * size of any auxiliary key data (rounded up to 8-byte alignment) plus
1916 * the size of any referred-to data (dsize). We then round the final
1917 * size up to the chunksize for allocation.
1918 */
1919 for (ksize = 0, i = 0; i < nkeys; i++)
1920 ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
1921
1922 /*
1923 * This should be pretty much impossible, but could happen if, say,
1924 * strange DIF specified the tuple. Ideally, this should be an
1925 * assertion and not an error condition -- but that requires that the
1926 * chunksize calculation in dtrace_difo_chunksize() be absolutely
1927 * bullet-proof. (That is, it must not be able to be fooled by
1928 * malicious DIF.) Given the lack of backwards branches in DIF,
1929 * solving this would presumably not amount to solving the Halting
1930 * Problem -- but it still seems awfully hard.
1931 */
1932 if (sizeof (dtrace_dynvar_t) + sizeof (dtrace_key_t) * (nkeys - 1) +
1933 ksize + dsize > chunksize) {
1934 dcpu->dtdsc_drops++;
1935 return (NULL);
1936 }
1937
1938 nstate = DTRACE_DSTATE_EMPTY;
1939
1940 do {
1941retry:
1942 free = dcpu->dtdsc_free;
1943
1944 if (free == NULL) {
1945 dtrace_dynvar_t *clean = dcpu->dtdsc_clean;
1946 void *rval;
1947
1948 if (clean == NULL) {
1949 /*
1950 * We're out of dynamic variable space on
1951 * this CPU. Unless we have tried all CPUs,
1952 * we'll try to allocate from a different
1953 * CPU.
1954 */
1955 switch (dstate->dtds_state) {
1956 case DTRACE_DSTATE_CLEAN: {
1957 void *sp = &dstate->dtds_state;
1958
1959 if (++cpu >= NCPU)
1960 cpu = 0;
1961
1962 if (dcpu->dtdsc_dirty != NULL &&
1963 nstate == DTRACE_DSTATE_EMPTY)
1964 nstate = DTRACE_DSTATE_DIRTY;
1965
1966 if (dcpu->dtdsc_rinsing != NULL)
1967 nstate = DTRACE_DSTATE_RINSING;
1968
1969 dcpu = &dstate->dtds_percpu[cpu];
1970
1971 if (cpu != me)
1972 goto retry;
1973
1974 (void) dtrace_cas32(sp,
1975 DTRACE_DSTATE_CLEAN, nstate);
1976
1977 /*
1978 * To increment the correct bean
1979 * counter, take another lap.
1980 */
1981 goto retry;
1982 }
1983
1984 case DTRACE_DSTATE_DIRTY:
1985 dcpu->dtdsc_dirty_drops++;
1986 break;
1987
1988 case DTRACE_DSTATE_RINSING:
1989 dcpu->dtdsc_rinsing_drops++;
1990 break;
1991
1992 case DTRACE_DSTATE_EMPTY:
1993 dcpu->dtdsc_drops++;
1994 break;
1995 }
1996
1997 DTRACE_CPUFLAG_SET(CPU_DTRACE_DROP);
1998 return (NULL);
1999 }
2000
2001 /*
2002 * The clean list appears to be non-empty. We want to
2003 * move the clean list to the free list; we start by
2004 * moving the clean pointer aside.
2005 */
2006 if (dtrace_casptr(&dcpu->dtdsc_clean,
2007 clean, NULL) != clean) {
2008 /*
2009 * We are in one of two situations:
2010 *
2011 * (a) The clean list was switched to the
2012 * free list by another CPU.
2013 *
2014 * (b) The clean list was added to by the
2015 * cleansing cyclic.
2016 *
2017 * In either of these situations, we can
2018 * just reattempt the free list allocation.
2019 */
2020 goto retry;
2021 }
2022
2023 ASSERT(clean->dtdv_hashval == DTRACE_DYNHASH_FREE);
2024
2025 /*
2026 * Now we'll move the clean list to the free list.
2027 * It's impossible for this to fail: the only way
2028 * the free list can be updated is through this
2029 * code path, and only one CPU can own the clean list.
2030 * Thus, it would only be possible for this to fail if
2031 * this code were racing with dtrace_dynvar_clean().
2032 * (That is, if dtrace_dynvar_clean() updated the clean
2033 * list, and we ended up racing to update the free
2034 * list.) This race is prevented by the dtrace_sync()
2035 * in dtrace_dynvar_clean() -- which flushes the
2036 * owners of the clean lists out before resetting
2037 * the clean lists.
2038 */
2039 rval = dtrace_casptr(&dcpu->dtdsc_free, NULL, clean);
2040 ASSERT(rval == NULL);
2041 goto retry;
2042 }
2043
2044 dvar = free;
2045 new_free = dvar->dtdv_next;
2046 } while (dtrace_casptr(&dcpu->dtdsc_free, free, new_free) != free);
2047
2048 /*
2049 * We have now allocated a new chunk. We copy the tuple keys into the
2050 * tuple array and copy any referenced key data into the data space
2051 * following the tuple array. As we do this, we relocate dttk_value
2052 * in the final tuple to point to the key data address in the chunk.
2053 */
2054 kdata = (uintptr_t)&dvar->dtdv_tuple.dtt_key[nkeys];
2055 dvar->dtdv_data = (void *)(kdata + ksize);
2056 dvar->dtdv_tuple.dtt_nkeys = nkeys;
2057
2058 for (i = 0; i < nkeys; i++) {
2059 dtrace_key_t *dkey = &dvar->dtdv_tuple.dtt_key[i];
2060 size_t kesize = key[i].dttk_size;
2061
2062 if (kesize != 0) {
2063 dtrace_bcopy(
2064 (const void *)(uintptr_t)key[i].dttk_value,
2065 (void *)kdata, kesize);
2066 dkey->dttk_value = kdata;
2067 kdata += P2ROUNDUP(kesize, sizeof (uint64_t));
2068 } else {
2069 dkey->dttk_value = key[i].dttk_value;
2070 }
2071
2072 dkey->dttk_size = kesize;
2073 }
2074
2075 ASSERT(dvar->dtdv_hashval == DTRACE_DYNHASH_FREE);
2076 dvar->dtdv_hashval = hashval;
2077 dvar->dtdv_next = start;
2078
2079 if (dtrace_casptr(&hash[bucket].dtdh_chain, start, dvar) == start)
2080 return (dvar);
2081
2082 /*
2083 * The cas has failed. Either another CPU is adding an element to
2084 * this hash chain, or another CPU is deleting an element from this
2085 * hash chain. The simplest way to deal with both of these cases
2086 * (though not necessarily the most efficient) is to free our
2087 * allocated block and tail-call ourselves. Note that the free is
2088 * to the dirty list and _not_ to the free list. This is to prevent
2089 * races with allocators, above.
2090 */
2091 dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
2092
2093 dtrace_membar_producer();
2094
2095 do {
2096 free = dcpu->dtdsc_dirty;
2097 dvar->dtdv_next = free;
2098 } while (dtrace_casptr(&dcpu->dtdsc_dirty, free, dvar) != free);
2099
2100 return (dtrace_dynvar(dstate, nkeys, key, dsize, op, mstate, vstate));
2101}
2102
2103/*ARGSUSED*/
2104static void
2105dtrace_aggregate_min(uint64_t *oval, uint64_t nval, uint64_t arg)
2106{
2107 if ((int64_t)nval < (int64_t)*oval)
2108 *oval = nval;
2109}
2110
2111/*ARGSUSED*/
2112static void
2113dtrace_aggregate_max(uint64_t *oval, uint64_t nval, uint64_t arg)
2114{
2115 if ((int64_t)nval > (int64_t)*oval)
2116 *oval = nval;
2117}
2118
2119static void
2120dtrace_aggregate_quantize(uint64_t *quanta, uint64_t nval, uint64_t incr)
2121{
2122 int i, zero = DTRACE_QUANTIZE_ZEROBUCKET;
2123 int64_t val = (int64_t)nval;
2124
2125 if (val < 0) {
2126 for (i = 0; i < zero; i++) {
2127 if (val <= DTRACE_QUANTIZE_BUCKETVAL(i)) {
2128 quanta[i] += incr;
2129 return;
2130 }
2131 }
2132 } else {
2133 for (i = zero + 1; i < DTRACE_QUANTIZE_NBUCKETS; i++) {
2134 if (val < DTRACE_QUANTIZE_BUCKETVAL(i)) {
2135 quanta[i - 1] += incr;
2136 return;
2137 }
2138 }
2139
2140 quanta[DTRACE_QUANTIZE_NBUCKETS - 1] += incr;
2141 return;
2142 }
2143
2144 ASSERT(0);
2145}
2146
2147static void
2148dtrace_aggregate_lquantize(uint64_t *lquanta, uint64_t nval, uint64_t incr)
2149{
2150 uint64_t arg = *lquanta++;
2151 int32_t base = DTRACE_LQUANTIZE_BASE(arg);
2152 uint16_t step = DTRACE_LQUANTIZE_STEP(arg);
2153 uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg);
2154 int32_t val = (int32_t)nval, level;
2155
2156 ASSERT(step != 0);
2157 ASSERT(levels != 0);
2158
2159 if (val < base) {
2160 /*
2161 * This is an underflow.
2162 */
2163 lquanta[0] += incr;
2164 return;
2165 }
2166
2167 level = (val - base) / step;
2168
2169 if (level < levels) {
2170 lquanta[level + 1] += incr;
2171 return;
2172 }
2173
2174 /*
2175 * This is an overflow.
2176 */
2177 lquanta[levels + 1] += incr;
2178}
2179
2180static int
2181dtrace_aggregate_llquantize_bucket(uint16_t factor, uint16_t low,
2182 uint16_t high, uint16_t nsteps, int64_t value)
2183{
2184 int64_t this = 1, last, next;
2185 int base = 1, order;
2186
2187 ASSERT(factor <= nsteps);
2188 ASSERT(nsteps % factor == 0);
2189
2190 for (order = 0; order < low; order++)
2191 this *= factor;
2192
2193 /*
2194 * If our value is less than our factor taken to the power of the
2195 * low order of magnitude, it goes into the zeroth bucket.
2196 */
2197 if (value < (last = this))
2198 return (0);
2199
2200 for (this *= factor; order <= high; order++) {
2201 int nbuckets = this > nsteps ? nsteps : this;
2202
2203 if ((next = this * factor) < this) {
2204 /*
2205 * We should not generally get log/linear quantizations
2206 * with a high magnitude that allows 64-bits to
2207 * overflow, but we nonetheless protect against this
2208 * by explicitly checking for overflow, and clamping
2209 * our value accordingly.
2210 */
2211 value = this - 1;
2212 }
2213
2214 if (value < this) {
2215 /*
2216 * If our value lies within this order of magnitude,
2217 * determine its position by taking the offset within
2218 * the order of magnitude, dividing by the bucket
2219 * width, and adding to our (accumulated) base.
2220 */
2221 return (base + (value - last) / (this / nbuckets));
2222 }
2223
2224 base += nbuckets - (nbuckets / factor);
2225 last = this;
2226 this = next;
2227 }
2228
2229 /*
2230 * Our value is greater than or equal to our factor taken to the
2231 * power of one plus the high magnitude -- return the top bucket.
2232 */
2233 return (base);
2234}
2235
2236static void
2237dtrace_aggregate_llquantize(uint64_t *llquanta, uint64_t nval, uint64_t incr)
2238{
2239 uint64_t arg = *llquanta++;
2240 uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(arg);
2241 uint16_t low = DTRACE_LLQUANTIZE_LOW(arg);
2242 uint16_t high = DTRACE_LLQUANTIZE_HIGH(arg);
2243 uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(arg);
2244
2245 llquanta[dtrace_aggregate_llquantize_bucket(factor,
2246 low, high, nsteps, nval)] += incr;
2247}
2248
2249/*ARGSUSED*/
2250static void
2251dtrace_aggregate_avg(uint64_t *data, uint64_t nval, uint64_t arg)
2252{
2253 data[0]++;
2254 data[1] += nval;
2255}
2256
2257/*ARGSUSED*/
2258static void
2259dtrace_aggregate_stddev(uint64_t *data, uint64_t nval, uint64_t arg)
2260{
2261 int64_t snval = (int64_t)nval;
2262 uint64_t tmp[2];
2263
2264 data[0]++;
2265 data[1] += nval;
2266
2267 /*
2268 * What we want to say here is:
2269 *
2270 * data[2] += nval * nval;
2271 *
2272 * But given that nval is 64-bit, we could easily overflow, so
2273 * we do this as 128-bit arithmetic.
2274 */
2275 if (snval < 0)
2276 snval = -snval;
2277
2278 dtrace_multiply_128((uint64_t)snval, (uint64_t)snval, tmp);
2279 dtrace_add_128(data + 2, tmp, data + 2);
2280}
2281
2282/*ARGSUSED*/
2283static void
2284dtrace_aggregate_count(uint64_t *oval, uint64_t nval, uint64_t arg)
2285{
2286 *oval = *oval + 1;
2287}
2288
2289/*ARGSUSED*/
2290static void
2291dtrace_aggregate_sum(uint64_t *oval, uint64_t nval, uint64_t arg)
2292{
2293 *oval += nval;
2294}
2295
2296/*
2297 * Aggregate given the tuple in the principal data buffer, and the aggregating
2298 * action denoted by the specified dtrace_aggregation_t. The aggregation
2299 * buffer is specified as the buf parameter. This routine does not return
2300 * failure; if there is no space in the aggregation buffer, the data will be
2301 * dropped, and a corresponding counter incremented.
2302 */
2303static void
2304dtrace_aggregate(dtrace_aggregation_t *agg, dtrace_buffer_t *dbuf,
2305 intptr_t offset, dtrace_buffer_t *buf, uint64_t expr, uint64_t arg)
2306{
2307 dtrace_recdesc_t *rec = &agg->dtag_action.dta_rec;
2308 uint32_t i, ndx, size, fsize;
2309 uint32_t align = sizeof (uint64_t) - 1;
2310 dtrace_aggbuffer_t *agb;
2311 dtrace_aggkey_t *key;
2312 uint32_t hashval = 0, limit, isstr;
2313 caddr_t tomax, data, kdata;
2314 dtrace_actkind_t action;
2315 dtrace_action_t *act;
2316 uintptr_t offs;
2317
2318 if (buf == NULL)
2319 return;
2320
2321 if (!agg->dtag_hasarg) {
2322 /*
2323 * Currently, only quantize() and lquantize() take additional
2324 * arguments, and they have the same semantics: an increment
2325 * value that defaults to 1 when not present. If additional
2326 * aggregating actions take arguments, the setting of the
2327 * default argument value will presumably have to become more
2328 * sophisticated...
2329 */
2330 arg = 1;
2331 }
2332
2333 action = agg->dtag_action.dta_kind - DTRACEACT_AGGREGATION;
2334 size = rec->dtrd_offset - agg->dtag_base;
2335 fsize = size + rec->dtrd_size;
2336
2337 ASSERT(dbuf->dtb_tomax != NULL);
2338 data = dbuf->dtb_tomax + offset + agg->dtag_base;
2339
2340 if ((tomax = buf->dtb_tomax) == NULL) {
2341 dtrace_buffer_drop(buf);
2342 return;
2343 }
2344
2345 /*
2346 * The metastructure is always at the bottom of the buffer.
2347 */
2348 agb = (dtrace_aggbuffer_t *)(tomax + buf->dtb_size -
2349 sizeof (dtrace_aggbuffer_t));
2350
2351 if (buf->dtb_offset == 0) {
2352 /*
2353 * We just kludge up approximately 1/8th of the size to be
2354 * buckets. If this guess ends up being routinely
2355 * off-the-mark, we may need to dynamically readjust this
2356 * based on past performance.
2357 */
2358 uintptr_t hashsize = (buf->dtb_size >> 3) / sizeof (uintptr_t);
2359
2360 if ((uintptr_t)agb - hashsize * sizeof (dtrace_aggkey_t *) <
2361 (uintptr_t)tomax || hashsize == 0) {
2362 /*
2363 * We've been given a ludicrously small buffer;
2364 * increment our drop count and leave.
2365 */
2366 dtrace_buffer_drop(buf);
2367 return;
2368 }
2369
2370 /*
2371 * And now, a pathetic attempt to try to get a an odd (or
2372 * perchance, a prime) hash size for better hash distribution.
2373 */
2374 if (hashsize > (DTRACE_AGGHASHSIZE_SLEW << 3))
2375 hashsize -= DTRACE_AGGHASHSIZE_SLEW;
2376
2377 agb->dtagb_hashsize = hashsize;
2378 agb->dtagb_hash = (dtrace_aggkey_t **)((uintptr_t)agb -
2379 agb->dtagb_hashsize * sizeof (dtrace_aggkey_t *));
2380 agb->dtagb_free = (uintptr_t)agb->dtagb_hash;
2381
2382 for (i = 0; i < agb->dtagb_hashsize; i++)
2383 agb->dtagb_hash[i] = NULL;
2384 }
2385
2386 ASSERT(agg->dtag_first != NULL);
2387 ASSERT(agg->dtag_first->dta_intuple);
2388
2389 /*
2390 * Calculate the hash value based on the key. Note that we _don't_
2391 * include the aggid in the hashing (but we will store it as part of
2392 * the key). The hashing algorithm is Bob Jenkins' "One-at-a-time"
2393 * algorithm: a simple, quick algorithm that has no known funnels, and
2394 * gets good distribution in practice. The efficacy of the hashing
2395 * algorithm (and a comparison with other algorithms) may be found by
2396 * running the ::dtrace_aggstat MDB dcmd.
2397 */
2398 for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2399 i = act->dta_rec.dtrd_offset - agg->dtag_base;
2400 limit = i + act->dta_rec.dtrd_size;
2401 ASSERT(limit <= size);
2402 isstr = DTRACEACT_ISSTRING(act);
2403
2404 for (; i < limit; i++) {
2405 hashval += data[i];
2406 hashval += (hashval << 10);
2407 hashval ^= (hashval >> 6);
2408
2409 if (isstr && data[i] == '\0')
2410 break;
2411 }
2412 }
2413
2414 hashval += (hashval << 3);
2415 hashval ^= (hashval >> 11);
2416 hashval += (hashval << 15);
2417
2418 /*
2419 * Yes, the divide here is expensive -- but it's generally the least
2420 * of the performance issues given the amount of data that we iterate
2421 * over to compute hash values, compare data, etc.
2422 */
2423 ndx = hashval % agb->dtagb_hashsize;
2424
2425 for (key = agb->dtagb_hash[ndx]; key != NULL; key = key->dtak_next) {
2426 ASSERT((caddr_t)key >= tomax);
2427 ASSERT((caddr_t)key < tomax + buf->dtb_size);
2428
2429 if (hashval != key->dtak_hashval || key->dtak_size != size)
2430 continue;
2431
2432 kdata = key->dtak_data;
2433 ASSERT(kdata >= tomax && kdata < tomax + buf->dtb_size);
2434
2435 for (act = agg->dtag_first; act->dta_intuple;
2436 act = act->dta_next) {
2437 i = act->dta_rec.dtrd_offset - agg->dtag_base;
2438 limit = i + act->dta_rec.dtrd_size;
2439 ASSERT(limit <= size);
2440 isstr = DTRACEACT_ISSTRING(act);
2441
2442 for (; i < limit; i++) {
2443 if (kdata[i] != data[i])
2444 goto next;
2445
2446 if (isstr && data[i] == '\0')
2447 break;
2448 }
2449 }
2450
2451 if (action != key->dtak_action) {
2452 /*
2453 * We are aggregating on the same value in the same
2454 * aggregation with two different aggregating actions.
2455 * (This should have been picked up in the compiler,
2456 * so we may be dealing with errant or devious DIF.)
2457 * This is an error condition; we indicate as much,
2458 * and return.
2459 */
2460 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
2461 return;
2462 }
2463
2464 /*
2465 * This is a hit: we need to apply the aggregator to
2466 * the value at this key.
2467 */
2468 agg->dtag_aggregate((uint64_t *)(kdata + size), expr, arg);
2469 return;
2470next:
2471 continue;
2472 }
2473
2474 /*
2475 * We didn't find it. We need to allocate some zero-filled space,
2476 * link it into the hash table appropriately, and apply the aggregator
2477 * to the (zero-filled) value.
2478 */
2479 offs = buf->dtb_offset;
2480 while (offs & (align - 1))
2481 offs += sizeof (uint32_t);
2482
2483 /*
2484 * If we don't have enough room to both allocate a new key _and_
2485 * its associated data, increment the drop count and return.
2486 */
2487 if ((uintptr_t)tomax + offs + fsize >
2488 agb->dtagb_free - sizeof (dtrace_aggkey_t)) {
2489 dtrace_buffer_drop(buf);
2490 return;
2491 }
2492
2493 /*CONSTCOND*/
2494 ASSERT(!(sizeof (dtrace_aggkey_t) & (sizeof (uintptr_t) - 1)));
2495 key = (dtrace_aggkey_t *)(agb->dtagb_free - sizeof (dtrace_aggkey_t));
2496 agb->dtagb_free -= sizeof (dtrace_aggkey_t);
2497
2498 key->dtak_data = kdata = tomax + offs;
2499 buf->dtb_offset = offs + fsize;
2500
2501 /*
2502 * Now copy the data across.
2503 */
2504 *((dtrace_aggid_t *)kdata) = agg->dtag_id;
2505
2506 for (i = sizeof (dtrace_aggid_t); i < size; i++)
2507 kdata[i] = data[i];
2508
2509 /*
2510 * Because strings are not zeroed out by default, we need to iterate
2511 * looking for actions that store strings, and we need to explicitly
2512 * pad these strings out with zeroes.
2513 */
2514 for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2515 int nul;
2516
2517 if (!DTRACEACT_ISSTRING(act))
2518 continue;
2519
2520 i = act->dta_rec.dtrd_offset - agg->dtag_base;
2521 limit = i + act->dta_rec.dtrd_size;
2522 ASSERT(limit <= size);
2523
2524 for (nul = 0; i < limit; i++) {
2525 if (nul) {
2526 kdata[i] = '\0';
2527 continue;
2528 }
2529
2530 if (data[i] != '\0')
2531 continue;
2532
2533 nul = 1;
2534 }
2535 }
2536
2537 for (i = size; i < fsize; i++)
2538 kdata[i] = 0;
2539
2540 key->dtak_hashval = hashval;
2541 key->dtak_size = size;
2542 key->dtak_action = action;
2543 key->dtak_next = agb->dtagb_hash[ndx];
2544 agb->dtagb_hash[ndx] = key;
2545
2546 /*
2547 * Finally, apply the aggregator.
2548 */
2549 *((uint64_t *)(key->dtak_data + size)) = agg->dtag_initial;
2550 agg->dtag_aggregate((uint64_t *)(key->dtak_data + size), expr, arg);
2551}
2552
2553/*
2554 * Given consumer state, this routine finds a speculation in the INACTIVE
2555 * state and transitions it into the ACTIVE state. If there is no speculation
2556 * in the INACTIVE state, 0 is returned. In this case, no error counter is
2557 * incremented -- it is up to the caller to take appropriate action.
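 *
 * This is the engine behind the D speculation() subroutine; the usual
 * D-level sequence (a sketch, per the documented speculation API) is:
 *
 *	self->spec = speculation();
 *	speculate(self->spec);
 *	...
 *	commit(self->spec);		(or discard(self->spec))
 *
 * A return of 0 here yields a speculation ID that the subsequent
 * speculate(), commit() and discard() operations all treat as a no-op.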
2558 */
2559static int
2560dtrace_speculation(dtrace_state_t *state)
2561{
2562 int i = 0;
2563 dtrace_speculation_state_t current;
2564 uint32_t *stat = &state->dts_speculations_unavail, count;
2565
2566 while (i < state->dts_nspeculations) {
2567 dtrace_speculation_t *spec = &state->dts_speculations[i];
2568
2569 current = spec->dtsp_state;
2570
2571 if (current != DTRACESPEC_INACTIVE) {
2572 if (current == DTRACESPEC_COMMITTINGMANY ||
2573 current == DTRACESPEC_COMMITTING ||
2574 current == DTRACESPEC_DISCARDING)
2575 stat = &state->dts_speculations_busy;
2576 i++;
2577 continue;
2578 }
2579
2580 if (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2581 current, DTRACESPEC_ACTIVE) == current)
2582 return (i + 1);
2583 }
2584
2585 /*
2586 * We couldn't find a speculation. If we saw even a single busy
2587 * speculation buffer, we'll count this failure as "busy" instead
2588 * of "unavail".
2589 */
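	/* Atomically increment the chosen failure counter via a CAS loop. */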
2590 do {
2591 count = *stat;
2592 } while (dtrace_cas32(stat, count, count + 1) != count);
2593
2594 return (0);
2595}
2596
2597/*
2598 * This routine commits an active speculation. If the specified speculation
2599 * is not in a valid state to perform a commit(), this routine will silently do
2600 * nothing. The state of the specified speculation is transitioned according
2601 * to the state transition diagram outlined in <sys/dtrace_impl.h>.
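 *
 * In brief, as implemented below: ACTIVE -> COMMITTING; ACTIVEONE ->
 * COMMITTING when the commit happens on the speculating CPU (non-zero
 * source buffer offset), otherwise ACTIVEONE -> COMMITTINGMANY; and
 * ACTIVEMANY -> COMMITTINGMANY.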
2602 */
2603static void
2604dtrace_speculation_commit(dtrace_state_t *state, processorid_t cpu,
2605 dtrace_specid_t which)
2606{
2607 dtrace_speculation_t *spec;
2608 dtrace_buffer_t *src, *dest;
2609 uintptr_t daddr, saddr, dlimit, slimit;
2610 dtrace_speculation_state_t current, new = 0;
2611 intptr_t offs;
2612 uint64_t timestamp;
2613
2614 if (which == 0)
2615 return;
2616
2617 if (which > state->dts_nspeculations) {
2618 cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2619 return;
2620 }
2621
2622 spec = &state->dts_speculations[which - 1];
2623 src = &spec->dtsp_buffer[cpu];
2624 dest = &state->dts_buffer[cpu];
2625
2626 do {
2627 current = spec->dtsp_state;
2628
2629 if (current == DTRACESPEC_COMMITTINGMANY)
2630 break;
2631
2632 switch (current) {
2633 case DTRACESPEC_INACTIVE:
2634 case DTRACESPEC_DISCARDING:
2635 return;
2636
2637 case DTRACESPEC_COMMITTING:
2638 /*
2639 * This is only possible if we are (a) commit()'ing
2640 * without having done a prior speculate() on this CPU
2641 * and (b) racing with another commit() on a different
2642 * CPU. There's nothing to do -- we just assert that
2643 * our offset is 0.
2644 */
2645 ASSERT(src->dtb_offset == 0);
2646 return;
2647
2648 case DTRACESPEC_ACTIVE:
2649 new = DTRACESPEC_COMMITTING;
2650 break;
2651
2652 case DTRACESPEC_ACTIVEONE:
2653 /*
2654 * This speculation is active on one CPU. If our
2655 * buffer offset is non-zero, we know that the one CPU
2656 * must be us. Otherwise, we are committing on a
2657 * different CPU from the speculate(), and we must
2658 * rely on being asynchronously cleaned.
2659 */
2660 if (src->dtb_offset != 0) {
2661 new = DTRACESPEC_COMMITTING;
2662 break;
2663 }
2664 /*FALLTHROUGH*/
2665
2666 case DTRACESPEC_ACTIVEMANY:
2667 new = DTRACESPEC_COMMITTINGMANY;
2668 break;
2669
2670 default:
2671 ASSERT(0);
2672 }
2673 } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2674 current, new) != current);
2675
2676 /*
2677 * We have set the state to indicate that we are committing this
2678 * speculation. Now reserve the necessary space in the destination
2679 * buffer.
2680 */
2681 if ((offs = dtrace_buffer_reserve(dest, src->dtb_offset,
2682 sizeof (uint64_t), state, NULL)) < 0) {
2683 dtrace_buffer_drop(dest);
2684 goto out;
2685 }
2686
2687 /*
2688 * We have sufficient space to copy the speculative buffer into the
2689 * primary buffer. First, modify the speculative buffer, filling
2690 * in the timestamp of all entries with the current time. The data
2691 * must have the commit() time rather than the time it was traced,
2692 * so that all entries in the primary buffer are in timestamp order.
2693 */
2694 timestamp = dtrace_gethrtime();
2695 saddr = (uintptr_t)src->dtb_tomax;
2696 slimit = saddr + src->dtb_offset;
2697 while (saddr < slimit) {
2698 size_t size;
2699 dtrace_rechdr_t *dtrh = (dtrace_rechdr_t *)saddr;
2700
2701 if (dtrh->dtrh_epid == DTRACE_EPIDNONE) {
2702 saddr += sizeof (dtrace_epid_t);
2703 continue;
2704 }
2705 ASSERT3U(dtrh->dtrh_epid, <=, state->dts_necbs);
2706 size = state->dts_ecbs[dtrh->dtrh_epid - 1]->dte_size;
2707
2708 ASSERT3U(saddr + size, <=, slimit);
2709 ASSERT3U(size, >=, sizeof (dtrace_rechdr_t));
2710 ASSERT3U(DTRACE_RECORD_LOAD_TIMESTAMP(dtrh), ==, UINT64_MAX);
2711
2712 DTRACE_RECORD_STORE_TIMESTAMP(dtrh, timestamp);
2713
2714 saddr += size;
2715 }
2716
2717 /*
2718 * Copy the buffer across. (Note that this is a
2719 * highly suboptimal bcopy(); in the unlikely event that this becomes
2720 * a serious performance issue, a high-performance DTrace-specific
2721 * bcopy() should obviously be invented.)
2722 */
2723 daddr = (uintptr_t)dest->dtb_tomax + offs;
2724 dlimit = daddr + src->dtb_offset;
2725 saddr = (uintptr_t)src->dtb_tomax;
2726
2727 /*
2728 * First, the aligned portion.
2729 */
2730 while (dlimit - daddr >= sizeof (uint64_t)) {
2731 *((uint64_t *)daddr) = *((uint64_t *)saddr);
2732
2733 daddr += sizeof (uint64_t);
2734 saddr += sizeof (uint64_t);
2735 }
2736
2737 /*
2738 * Now any left-over bit...
2739 */
2740 while (dlimit - daddr)
2741 *((uint8_t *)daddr++) = *((uint8_t *)saddr++);
2742
2743 /*
2744 * Finally, commit the reserved space in the destination buffer.
2745 */
2746 dest->dtb_offset = offs + src->dtb_offset;
2747
2748out:
2749 /*
2750 * If we're lucky enough to be the only active CPU on this speculation
2751 * buffer, we can just set the state back to DTRACESPEC_INACTIVE.
2752 */
2753 if (current == DTRACESPEC_ACTIVE ||
2754 (current == DTRACESPEC_ACTIVEONE && new == DTRACESPEC_COMMITTING)) {
2755 uint32_t rval = dtrace_cas32((uint32_t *)&spec->dtsp_state,
2756 DTRACESPEC_COMMITTING, DTRACESPEC_INACTIVE);
2757
2758 ASSERT(rval == DTRACESPEC_COMMITTING);
2759 }
2760
2761 src->dtb_offset = 0;
2762 src->dtb_xamot_drops += src->dtb_drops;
2763 src->dtb_drops = 0;
2764}
2765
2766/*
2767 * This routine discards an active speculation. If the specified speculation
2768 * is not in a valid state to perform a discard(), this routine will silently
2769 * do nothing. The state of the specified speculation is transitioned
2770 * according to the state transition diagram outlined in <sys/dtrace_impl.h>.
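 *
 * In brief: ACTIVE and ACTIVEMANY -> DISCARDING; ACTIVEONE -> INACTIVE
 * when this CPU holds the speculative data (non-zero buffer offset),
 * otherwise ACTIVEONE -> DISCARDING.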
2771 */
2772static void
2773dtrace_speculation_discard(dtrace_state_t *state, processorid_t cpu,
2774 dtrace_specid_t which)
2775{
2776 dtrace_speculation_t *spec;
2777 dtrace_speculation_state_t current, new = 0;
2778 dtrace_buffer_t *buf;
2779
2780 if (which == 0)
2781 return;
2782
2783 if (which > state->dts_nspeculations) {
2784 cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2785 return;
2786 }
2787
2788 spec = &state->dts_speculations[which - 1];
2789 buf = &spec->dtsp_buffer[cpu];
2790
2791 do {
2792 current = spec->dtsp_state;
2793
2794 switch (current) {
2795 case DTRACESPEC_INACTIVE:
2796 case DTRACESPEC_COMMITTINGMANY:
2797 case DTRACESPEC_COMMITTING:
2798 case DTRACESPEC_DISCARDING:
2799 return;
2800
2801 case DTRACESPEC_ACTIVE:
2802 case DTRACESPEC_ACTIVEMANY:
2803 new = DTRACESPEC_DISCARDING;
2804 break;
2805
2806 case DTRACESPEC_ACTIVEONE:
2807 if (buf->dtb_offset != 0) {
2808 new = DTRACESPEC_INACTIVE;
2809 } else {
2810 new = DTRACESPEC_DISCARDING;
2811 }
2812 break;
2813
2814 default:
2815 ASSERT(0);
2816 }
2817 } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2818 current, new) != current);
2819
2820 buf->dtb_offset = 0;
2821 buf->dtb_drops = 0;
2822}
2823
2824/*
2825 * Note: not called from probe context. This function is called
2826 * asynchronously from cross call context to clean any speculations that are
2827 * in the COMMITTINGMANY or DISCARDING states. These speculations may not be
2828 * transitioned back to the INACTIVE state until all CPUs have cleaned the
2829 * speculation.
2830 */
2831static void
2832dtrace_speculation_clean_here(dtrace_state_t *state)
2833{
2834 dtrace_icookie_t cookie;
2835 processorid_t cpu = curcpu;
2836 dtrace_buffer_t *dest = &state->dts_buffer[cpu];
2837 dtrace_specid_t i;
2838
2839 cookie = dtrace_interrupt_disable();
2840
2841 if (dest->dtb_tomax == NULL) {
2842 dtrace_interrupt_enable(cookie);
2843 return;
2844 }
2845
2846 for (i = 0; i < state->dts_nspeculations; i++) {
2847 dtrace_speculation_t *spec = &state->dts_speculations[i];
2848 dtrace_buffer_t *src = &spec->dtsp_buffer[cpu];
2849
2850 if (src->dtb_tomax == NULL)
2851 continue;
2852
2853 if (spec->dtsp_state == DTRACESPEC_DISCARDING) {
2854 src->dtb_offset = 0;
2855 continue;
2856 }
2857
2858 if (spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
2859 continue;
2860
2861 if (src->dtb_offset == 0)
2862 continue;
2863
2864 dtrace_speculation_commit(state, cpu, i + 1);
2865 }
2866
2867 dtrace_interrupt_enable(cookie);
2868}
2869
2870/*
2871 * Note: not called from probe context. This function is called
2872 * asynchronously (and at a regular interval) to clean any speculations that
2873 * are in the COMMITTINGMANY or DISCARDING states. If it discovers that there
2874 * is work to be done, it cross calls all CPUs to perform that work;
2875 * COMMITTINGMANY and DISCARDING speculations may not be transitioned back to
2876 * the INACTIVE state until they have been cleaned by all CPUs.
2877 */
2878static void
2879dtrace_speculation_clean(dtrace_state_t *state)
2880{
2881 int work = 0, rv;
2882 dtrace_specid_t i;
2883
2884 for (i = 0; i < state->dts_nspeculations; i++) {
2885 dtrace_speculation_t *spec = &state->dts_speculations[i];
2886
2887 ASSERT(!spec->dtsp_cleaning);
2888
2889 if (spec->dtsp_state != DTRACESPEC_DISCARDING &&
2890 spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
2891 continue;
2892
2893 work++;
2894 spec->dtsp_cleaning = 1;
2895 }
2896
2897 if (!work)
2898 return;
2899
2900 dtrace_xcall(DTRACE_CPUALL,
2901 (dtrace_xcall_t)dtrace_speculation_clean_here, state);
2902
2903 /*
2904 * We now know that all CPUs have committed or discarded their
2905 * speculation buffers, as appropriate. We can now set the state
2906 * to inactive.
2907 */
2908 for (i = 0; i < state->dts_nspeculations; i++) {
2909 dtrace_speculation_t *spec = &state->dts_speculations[i];
2910 dtrace_speculation_state_t current, new;
2911
2912 if (!spec->dtsp_cleaning)
2913 continue;
2914
2915 current = spec->dtsp_state;
2916 ASSERT(current == DTRACESPEC_DISCARDING ||
2917 current == DTRACESPEC_COMMITTINGMANY);
2918
2919 new = DTRACESPEC_INACTIVE;
2920
2921 rv = dtrace_cas32((uint32_t *)&spec->dtsp_state, current, new);
2922 ASSERT(rv == current);
2923 spec->dtsp_cleaning = 0;
2924 }
2925}
2926
2927/*
2928 * Called as part of a speculate() to get the speculative buffer associated
2929 * with a given speculation. Returns NULL if the specified speculation is not
2930 * in an ACTIVE state. If the speculation is in the ACTIVEONE state -- and
2931 * the active CPU is not the specified CPU -- the speculation will be
2932 * atomically transitioned into the ACTIVEMANY state.
2933 */
2934static dtrace_buffer_t *
2935dtrace_speculation_buffer(dtrace_state_t *state, processorid_t cpuid,
2936 dtrace_specid_t which)
2937{
2938 dtrace_speculation_t *spec;
2939 dtrace_speculation_state_t current, new = 0;
2940 dtrace_buffer_t *buf;
2941
2942 if (which == 0)
2943 return (NULL);
2944
2945 if (which > state->dts_nspeculations) {
2946 cpu_core[cpuid].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2947 return (NULL);
2948 }
2949
2950 spec = &state->dts_speculations[which - 1];
2951 buf = &spec->dtsp_buffer[cpuid];
2952
2953 do {
2954 current = spec->dtsp_state;
2955
2956 switch (current) {
2957 case DTRACESPEC_INACTIVE:
2958 case DTRACESPEC_COMMITTINGMANY:
2959 case DTRACESPEC_DISCARDING:
2960 return (NULL);
2961
2962 case DTRACESPEC_COMMITTING:
2963 ASSERT(buf->dtb_offset == 0);
2964 return (NULL);
2965
2966 case DTRACESPEC_ACTIVEONE:
2967 /*
2968 * This speculation is currently active on one CPU.
2969 * Check the offset in the buffer; if it's non-zero,
2970 * that CPU must be us (and we leave the state alone).
2971 * If it's zero, assume that we're starting on a new
2972 * CPU -- and change the state to indicate that the
2973 * speculation is active on more than one CPU.
2974 */
2975 if (buf->dtb_offset != 0)
2976 return (buf);
2977
2978 new = DTRACESPEC_ACTIVEMANY;
2979 break;
2980
2981 case DTRACESPEC_ACTIVEMANY:
2982 return (buf);
2983
2984 case DTRACESPEC_ACTIVE:
2985 new = DTRACESPEC_ACTIVEONE;
2986 break;
2987
2988 default:
2989 ASSERT(0);
2990 }
2991 } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2992 current, new) != current);
2993
2994 ASSERT(new == DTRACESPEC_ACTIVEONE || new == DTRACESPEC_ACTIVEMANY);
2995 return (buf);
2996}
2997
2998/*
2999 * Return a string. In the event that the user lacks the privilege to access
3000 * arbitrary kernel memory, we copy the string out to scratch memory so that we
3001 * don't fail access checking.
3002 *
3003 * dtrace_dif_variable() uses this routine as a helper for various
3004 * builtin values such as 'execname' and 'probefunc.'
3005 */
3006uintptr_t
3007dtrace_dif_varstr(uintptr_t addr, dtrace_state_t *state,
3008 dtrace_mstate_t *mstate)
3009{
3010 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3011 uintptr_t ret;
3012 size_t strsz;
3013
3014 /*
3015 * The easy case: this probe is allowed to read all of memory, so
3016 * we can just return this as a vanilla pointer.
3017 */
3018 if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
3019 return (addr);
3020
3021 /*
3022 * This is the tougher case: we copy the string in question from
3023 * kernel memory into scratch memory and return it that way: this
3024 * ensures that we won't trip up when access checking tests the
3025 * BYREF return value.
3026 */
3027 strsz = dtrace_strlen((char *)addr, size) + 1;
3028
3029 if (mstate->dtms_scratch_ptr + strsz >
3030 mstate->dtms_scratch_base + mstate->dtms_scratch_size) {
3031 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3032 return (0);
3033 }
3034
3035 dtrace_strcpy((const void *)addr, (void *)mstate->dtms_scratch_ptr,
3036 strsz);
3037 ret = mstate->dtms_scratch_ptr;
3038 mstate->dtms_scratch_ptr += strsz;
3039 return (ret);
3040}
3041
3042/*
3043 * Return a string from a memory address which is known to hold one or
3044 * more concatenated, individually zero-terminated sub-strings.
3045 * In the event that the user lacks the privilege to access
3046 * arbitrary kernel memory, we copy the string out to scratch memory so that we
3047 * don't fail access checking.
3048 *
3049 * dtrace_dif_variable() uses this routine as a helper for various
3050 * builtin values such as 'execargs'.
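 *
 * For example, with strsz == 6 and the bytes "ls" NUL "-l" NUL at addr,
 * the scratch copy below becomes the single string "ls -l".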
3051 */
3052static uintptr_t
3053dtrace_dif_varstrz(uintptr_t addr, size_t strsz, dtrace_state_t *state,
3054 dtrace_mstate_t *mstate)
3055{
3056 char *p;
3057 size_t i;
3058 uintptr_t ret;
3059
3060 if (mstate->dtms_scratch_ptr + strsz >
3061 mstate->dtms_scratch_base + mstate->dtms_scratch_size) {
3062 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3063 return (0);
3064 }
3065
3066 dtrace_bcopy((const void *)addr, (void *)mstate->dtms_scratch_ptr,
3067 strsz);
3068
3069 /* Replace sub-string termination characters with a space. */
3070 for (p = (char *) mstate->dtms_scratch_ptr, i = 0; i < strsz - 1;
3071 p++, i++)
3072 if (*p == '\0')
3073 *p = ' ';
3074
3075 ret = mstate->dtms_scratch_ptr;
3076 mstate->dtms_scratch_ptr += strsz;
3077 return (ret);
3078}
3079
3080/*
3081 * This function implements the DIF emulator's variable lookups. The emulator
3082 * passes a reserved variable identifier and optional built-in array index.
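 *
 * For example, a D reference to arg3 arrives here as v == DIF_VAR_ARG3
 * and is rewritten below into an args[] lookup: v == DIF_VAR_ARGS with
 * ndx == 3.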
3083 */
3084static uint64_t
3085dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v,
3086 uint64_t ndx)
3087{
3088 /*
3089 * If we're accessing one of the uncached arguments, we'll turn this
3090 * into a reference in the args array.
3091 */
3092 if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9) {
3093 ndx = v - DIF_VAR_ARG0;
3094 v = DIF_VAR_ARGS;
3095 }
3096
3097 switch (v) {
3098 case DIF_VAR_ARGS:
3099 ASSERT(mstate->dtms_present & DTRACE_MSTATE_ARGS);
3100 if (ndx >= sizeof (mstate->dtms_arg) /
3101 sizeof (mstate->dtms_arg[0])) {
3102 int aframes = mstate->dtms_probe->dtpr_aframes + 2;
3103 dtrace_provider_t *pv;
3104 uint64_t val;
3105
3106 pv = mstate->dtms_probe->dtpr_provider;
3107 if (pv->dtpv_pops.dtps_getargval != NULL)
3108 val = pv->dtpv_pops.dtps_getargval(pv->dtpv_arg,
3109 mstate->dtms_probe->dtpr_id,
3110 mstate->dtms_probe->dtpr_arg, ndx, aframes);
3111 else
3112 val = dtrace_getarg(ndx, aframes);
3113
3114 /*
3115 * This is regrettably required to keep the compiler
3116 * from tail-optimizing the call to dtrace_getarg().
3117 * The condition always evaluates to true, but the
3118 * compiler has no way of figuring that out a priori.
3119 * (None of this would be necessary if the compiler
3120 * could be relied upon to _always_ tail-optimize
3121 * the call to dtrace_getarg() -- but it can't.)
3122 */
3123 if (mstate->dtms_probe != NULL)
3124 return (val);
3125
3126 ASSERT(0);
3127 }
3128
3129 return (mstate->dtms_arg[ndx]);
3130
3131#if defined(sun)
3132 case DIF_VAR_UREGS: {
3133 klwp_t *lwp;
3134
3135 if (!dtrace_priv_proc(state))
3136 return (0);
3137
3138 if ((lwp = curthread->t_lwp) == NULL) {
3139 DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
3140 cpu_core[curcpu].cpuc_dtrace_illval = NULL;
3141 return (0);
3142 }
3143
3144 return (dtrace_getreg(lwp->lwp_regs, ndx));
3146 }
3147#else
3148 case DIF_VAR_UREGS: {
3149 struct trapframe *tframe;
3150
3151 if (!dtrace_priv_proc(state))
3152 return (0);
3153
3154 if ((tframe = curthread->td_frame) == NULL) {
3155 DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
3156 cpu_core[curcpu].cpuc_dtrace_illval = 0;
3157 return (0);
3158 }
3159
3160 return (dtrace_getreg(tframe, ndx));
3161 }
3162#endif
3163
3164 case DIF_VAR_CURTHREAD:
3165 if (!dtrace_priv_proc(state))
3166 return (0);
3167 return ((uint64_t)(uintptr_t)curthread);
3168
3169 case DIF_VAR_TIMESTAMP:
3170 if (!(mstate->dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
3171 mstate->dtms_timestamp = dtrace_gethrtime();
3172 mstate->dtms_present |= DTRACE_MSTATE_TIMESTAMP;
3173 }
3174 return (mstate->dtms_timestamp);
3175
3176 case DIF_VAR_VTIMESTAMP:
3177 ASSERT(dtrace_vtime_references != 0);
3178 return (curthread->t_dtrace_vtime);
3179
3180 case DIF_VAR_WALLTIMESTAMP:
3181 if (!(mstate->dtms_present & DTRACE_MSTATE_WALLTIMESTAMP)) {
3182 mstate->dtms_walltimestamp = dtrace_gethrestime();
3183 mstate->dtms_present |= DTRACE_MSTATE_WALLTIMESTAMP;
3184 }
3185 return (mstate->dtms_walltimestamp);
3186
3187#if defined(sun)
3188 case DIF_VAR_IPL:
3189 if (!dtrace_priv_kernel(state))
3190 return (0);
3191 if (!(mstate->dtms_present & DTRACE_MSTATE_IPL)) {
3192 mstate->dtms_ipl = dtrace_getipl();
3193 mstate->dtms_present |= DTRACE_MSTATE_IPL;
3194 }
3195 return (mstate->dtms_ipl);
3196#endif
3197
3198 case DIF_VAR_EPID:
3199 ASSERT(mstate->dtms_present & DTRACE_MSTATE_EPID);
3200 return (mstate->dtms_epid);
3201
3202 case DIF_VAR_ID:
3203 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3204 return (mstate->dtms_probe->dtpr_id);
3205
3206 case DIF_VAR_STACKDEPTH:
3207 if (!dtrace_priv_kernel(state))
3208 return (0);
3209 if (!(mstate->dtms_present & DTRACE_MSTATE_STACKDEPTH)) {
3210 int aframes = mstate->dtms_probe->dtpr_aframes + 2;
3211
3212 mstate->dtms_stackdepth = dtrace_getstackdepth(aframes);
3213 mstate->dtms_present |= DTRACE_MSTATE_STACKDEPTH;
3214 }
3215 return (mstate->dtms_stackdepth);
3216
3217 case DIF_VAR_USTACKDEPTH:
3218 if (!dtrace_priv_proc(state))
3219 return (0);
3220 if (!(mstate->dtms_present & DTRACE_MSTATE_USTACKDEPTH)) {
3221 /*
3222 * See comment in DIF_VAR_PID.
3223 */
3224 if (DTRACE_ANCHORED(mstate->dtms_probe) &&
3225 CPU_ON_INTR(CPU)) {
3226 mstate->dtms_ustackdepth = 0;
3227 } else {
3228 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3229 mstate->dtms_ustackdepth =
3230 dtrace_getustackdepth();
3231 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3232 }
3233 mstate->dtms_present |= DTRACE_MSTATE_USTACKDEPTH;
3234 }
3235 return (mstate->dtms_ustackdepth);
3236
3237 case DIF_VAR_CALLER:
3238 if (!dtrace_priv_kernel(state))
3239 return (0);
3240 if (!(mstate->dtms_present & DTRACE_MSTATE_CALLER)) {
3241 int aframes = mstate->dtms_probe->dtpr_aframes + 2;
3242
3243 if (!DTRACE_ANCHORED(mstate->dtms_probe)) {
3244 /*
3245 * If this is an unanchored probe, we are
3246 * required to go through the slow path:
3247 * dtrace_caller() only guarantees correct
3248 * results for anchored probes.
3249 */
3250 pc_t caller[2] = {0, 0};
3251
3252 dtrace_getpcstack(caller, 2, aframes,
3253 (uint32_t *)(uintptr_t)mstate->dtms_arg[0]);
3254 mstate->dtms_caller = caller[1];
3255 } else if ((mstate->dtms_caller =
3256 dtrace_caller(aframes)) == -1) {
3257 /*
3258 * We have failed to do this the quick way;
3259 * we must resort to the slower approach of
3260 * calling dtrace_getpcstack().
3261 */
3262 pc_t caller = 0;
3263
3264 dtrace_getpcstack(&caller, 1, aframes, NULL);
3265 mstate->dtms_caller = caller;
3266 }
3267
3268 mstate->dtms_present |= DTRACE_MSTATE_CALLER;
3269 }
3270 return (mstate->dtms_caller);
3271
3272 case DIF_VAR_UCALLER:
3273 if (!dtrace_priv_proc(state))
3274 return (0);
3275
3276 if (!(mstate->dtms_present & DTRACE_MSTATE_UCALLER)) {
3277 uint64_t ustack[3];
3278
3279 /*
3280 * dtrace_getupcstack() fills in the first uint64_t
3281 * with the current PID. The second uint64_t will
3282 * be the program counter at user-level. The third
3283 * uint64_t will contain the caller, which is what
3284 * we're after.
3285 */
3286 ustack[2] = 0;
3287 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3288 dtrace_getupcstack(ustack, 3);
3289 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3290 mstate->dtms_ucaller = ustack[2];
3291 mstate->dtms_present |= DTRACE_MSTATE_UCALLER;
3292 }
3293
3294 return (mstate->dtms_ucaller);
3295
3296 case DIF_VAR_PROBEPROV:
3297 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3298 return (dtrace_dif_varstr(
3299 (uintptr_t)mstate->dtms_probe->dtpr_provider->dtpv_name,
3300 state, mstate));
3301
3302 case DIF_VAR_PROBEMOD:
3303 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3304 return (dtrace_dif_varstr(
3305 (uintptr_t)mstate->dtms_probe->dtpr_mod,
3306 state, mstate));
3307
3308 case DIF_VAR_PROBEFUNC:
3309 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3310 return (dtrace_dif_varstr(
3311 (uintptr_t)mstate->dtms_probe->dtpr_func,
3312 state, mstate));
3313
3314 case DIF_VAR_PROBENAME:
3315 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3316 return (dtrace_dif_varstr(
3317 (uintptr_t)mstate->dtms_probe->dtpr_name,
3318 state, mstate));
3319
3320 case DIF_VAR_PID:
3321 if (!dtrace_priv_proc(state))
3322 return (0);
3323
3324#if defined(sun)
3325 /*
3326 * Note that we are assuming that an unanchored probe is
3327 * always due to a high-level interrupt. (And we're assuming
3328 * that there is only a single high level interrupt.)
3329 */
3330 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3331 return (pid0.pid_id);
3332
3333 /*
3334 * It is always safe to dereference one's own t_procp pointer:
3335 * it always points to a valid, allocated proc structure.
3336 * Further, it is always safe to dereference the p_pidp member
3337 * of one's own proc structure. (These are truisms because
3338 * threads and processes don't clean up their own state --
3339 * they leave that task to whomever reaps them.)
3340 */
3341 return ((uint64_t)curthread->t_procp->p_pidp->pid_id);
3342#else
3343 return ((uint64_t)curproc->p_pid);
3344#endif
3345
3346 case DIF_VAR_PPID:
3347 if (!dtrace_priv_proc(state))
3348 return (0);
3349
3350#if defined(sun)
3351 /*
3352 * See comment in DIF_VAR_PID.
3353 */
3354 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3355 return (pid0.pid_id);
3356
3357 /*
3358 * It is always safe to dereference one's own t_procp pointer:
3359 * it always points to a valid, allocated proc structure.
3360 * (This is true because threads don't clean up their own
3361 * state -- they leave that task to whomever reaps them.)
3362 */
3363 return ((uint64_t)curthread->t_procp->p_ppid);
3364#else
3365 return ((uint64_t)curproc->p_pptr->p_pid);
3366#endif
3367
3368 case DIF_VAR_TID:
3369#if defined(sun)
3370 /*
3371 * See comment in DIF_VAR_PID.
3372 */
3373 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3374 return (0);
3375#endif
3376
3377 return ((uint64_t)curthread->t_tid);
3378
3379 case DIF_VAR_EXECARGS: {
3380 struct pargs *p_args = curthread->td_proc->p_args;
3381
3382 if (p_args == NULL)
3383 return (0);
3384
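		/*
		 * p_args->ar_args packs the argument strings with NUL
		 * separators; dtrace_dif_varstrz() rewrites those separators
		 * as spaces.
		 */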
3385 return (dtrace_dif_varstrz(
3386 (uintptr_t) p_args->ar_args, p_args->ar_length, state, mstate));
3387 }
3388
3389 case DIF_VAR_EXECNAME:
3390#if defined(sun)
3391 if (!dtrace_priv_proc(state))
3392 return (0);
3393
3394 /*
3395 * See comment in DIF_VAR_PID.
3396 */
3397 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3398 return ((uint64_t)(uintptr_t)p0.p_user.u_comm);
3399
3400 /*
3401 * It is always safe to dereference one's own t_procp pointer:
3402 * it always points to a valid, allocated proc structure.
3403 * (This is true because threads don't clean up their own
3404 * state -- they leave that task to whomever reaps them.)
3405 */
3406 return (dtrace_dif_varstr(
3407 (uintptr_t)curthread->t_procp->p_user.u_comm,
3408 state, mstate));
3409#else
3410 return (dtrace_dif_varstr(
3411 (uintptr_t) curthread->td_proc->p_comm, state, mstate));
3412#endif
3413
3414 case DIF_VAR_ZONENAME:
3415#if defined(sun)
3416 if (!dtrace_priv_proc(state))
3417 return (0);
3418
3419 /*
3420 * See comment in DIF_VAR_PID.
3421 */
3422 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3423 return ((uint64_t)(uintptr_t)p0.p_zone->zone_name);
3424
3425 /*
3426 * It is always safe to dereference one's own t_procp pointer:
3427 * it always points to a valid, allocated proc structure.
3428 * (This is true because threads don't clean up their own
3429 * state -- they leave that task to whomever reaps them.)
3430 */
3431 return (dtrace_dif_varstr(
3432 (uintptr_t)curthread->t_procp->p_zone->zone_name,
3433 state, mstate));
3434#else
3435 return (0);
3436#endif
3437
3438 case DIF_VAR_UID:
3439 if (!dtrace_priv_proc(state))
3440 return (0);
3441
3442#if defined(sun)
3443 /*
3444 * See comment in DIF_VAR_PID.
3445 */
3446 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3447 return ((uint64_t)p0.p_cred->cr_uid);
3448#endif
3449
3450 /*
3451 * It is always safe to dereference one's own t_procp pointer:
3452 * it always points to a valid, allocated proc structure.
3453 * (This is true because threads don't clean up their own
3454 * state -- they leave that task to whomever reaps them.)
3455 *
3456 * Additionally, it is safe to dereference one's own process
3457 * credential, since this is never NULL after process birth.
3458 */
3459 return ((uint64_t)curthread->t_procp->p_cred->cr_uid);
3460
3461 case DIF_VAR_GID:
3462 if (!dtrace_priv_proc(state))
3463 return (0);
3464
3465#if defined(sun)
3466 /*
3467 * See comment in DIF_VAR_PID.
3468 */
3469 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3470 return ((uint64_t)p0.p_cred->cr_gid);
3471#endif
3472
3473 /*
3474 * It is always safe to dereference one's own t_procp pointer:
3475 * it always points to a valid, allocated proc structure.
3476 * (This is true because threads don't clean up their own
3477 * state -- they leave that task to whomever reaps them.)
3478 *
3479 * Additionally, it is safe to dereference one's own process
3480 * credential, since this is never NULL after process birth.
3481 */
3482 return ((uint64_t)curthread->t_procp->p_cred->cr_gid);
3483
3484 case DIF_VAR_ERRNO: {
3485#if defined(sun)
3486 klwp_t *lwp;
3487 if (!dtrace_priv_proc(state))
3488 return (0);
3489
3490 /*
3491 * See comment in DIF_VAR_PID.
3492 */
3493 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3494 return (0);
3495
3496 /*
3497 * It is always safe to dereference one's own t_lwp pointer in
3498 * the event that this pointer is non-NULL. (This is true
3499 * because threads and lwps don't clean up their own state --
3500 * they leave that task to whomever reaps them.)
3501 */
3502 if ((lwp = curthread->t_lwp) == NULL)
3503 return (0);
3504
3505 return ((uint64_t)lwp->lwp_errno);
3506#else
3507 return (curthread->td_errno);
3508#endif
3509 }
3510#if !defined(sun)
3511 case DIF_VAR_CPU: {
3512 return curcpu;
3513 }
3514#endif
3515 default:
3516 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3517 return (0);
3518 }
3519}
3520
3521
3522typedef enum dtrace_json_state {
3523 DTRACE_JSON_REST = 1,
3524 DTRACE_JSON_OBJECT,
3525 DTRACE_JSON_STRING,
3526 DTRACE_JSON_STRING_ESCAPE,
3527 DTRACE_JSON_STRING_ESCAPE_UNICODE,
3528 DTRACE_JSON_COLON,
3529 DTRACE_JSON_COMMA,
3530 DTRACE_JSON_VALUE,
3531 DTRACE_JSON_IDENTIFIER,
3532 DTRACE_JSON_NUMBER,
3533 DTRACE_JSON_NUMBER_FRAC,
3534 DTRACE_JSON_NUMBER_EXP,
3535 DTRACE_JSON_COLLECT_OBJECT
3536} dtrace_json_state_t;
3537
3538/*
3539 * This function possesses just enough knowledge about JSON to extract a single
3540 * value from a JSON string and store it in the scratch buffer. It is able
3541 * to extract nested object values and members of arrays by index.
3542 *
3543 * elemlist is a list of JSON keys, stored as packed NUL-terminated strings, to
3544 * be looked up as we descend into the object tree. e.g.
3545 *
3546 * foo[0].bar.baz[32] --> "foo" NUL "0" NUL "bar" NUL "baz" NUL "32" NUL
3547 * with nelems = 5.
3548 *
3549 * The run time of this function must be bounded above by strsize to limit the
3550 * amount of work done in probe context. As such, it is implemented as a
3551 * simple state machine, reading one character at a time using safe loads
3552 * until we find the requested element, hit a parsing error or run off the
3553 * end of the object or string.
3554 *
3555 * As there is no way for a subroutine to return an error without interrupting
3556 * clause execution, we simply return NULL in the event of a missing key or any
3557 * other error condition. Each NULL return in this function is commented with
3558 * the error condition it represents -- parsing or otherwise.
3559 *
3560 * The set of states for the state machine closely matches the JSON
3561 * specification (http://json.org/). Briefly:
3562 *
3563 * DTRACE_JSON_REST:
3564 * Skip whitespace until we find either a top-level Object, moving
3565 * to DTRACE_JSON_OBJECT; or an Array, moving to DTRACE_JSON_VALUE.
3566 *
3567 * DTRACE_JSON_OBJECT:
3568 * Locate the next key String in an Object. Sets a flag to denote
3569 * the next String as a key string and moves to DTRACE_JSON_STRING.
3570 *
3571 * DTRACE_JSON_COLON:
3572 * Skip whitespace until we find the colon that separates key Strings
3573 * from their values. Once found, move to DTRACE_JSON_VALUE.
3574 *
3575 * DTRACE_JSON_VALUE:
3576 * Detects the type of the next value (String, Number, Identifier, Object
3577 * or Array) and routes to the states that process that type. Here we also
3578 * deal with the element selector list if we are requested to traverse down
3579 * into the object tree.
3580 *
3581 * DTRACE_JSON_COMMA:
3582 * Skip whitespace until we find the comma that separates key-value pairs
3583 * in Objects (returning to DTRACE_JSON_OBJECT) or values in Arrays
3584 * (similarly DTRACE_JSON_VALUE). All following literal value processing
3585 * states return to this state at the end of their value, unless otherwise
3586 * noted.
3587 *
3588 * DTRACE_JSON_NUMBER, DTRACE_JSON_NUMBER_FRAC, DTRACE_JSON_NUMBER_EXP:
3589 * Processes a Number literal from the JSON, including any exponent
3590 * component that may be present. Numbers are returned as strings, which
3591 * may be passed to strtoll() if an integer is required.
3592 *
3593 * DTRACE_JSON_IDENTIFIER:
3594 * Processes a "true", "false" or "null" literal in the JSON.
3595 *
3596 * DTRACE_JSON_STRING, DTRACE_JSON_STRING_ESCAPE,
3597 * DTRACE_JSON_STRING_ESCAPE_UNICODE:
3598 * Processes a String literal from the JSON, whether the String denotes
3599 * a key, a value or part of a larger Object. Handles all escape sequences
3600 * present in the specification, including four-digit unicode characters,
3601 * but merely includes the escape sequence without converting it to the
3602 * actual escaped character. If the String is flagged as a key, we
3603 * move to DTRACE_JSON_COLON rather than DTRACE_JSON_COMMA.
3604 *
3605 * DTRACE_JSON_COLLECT_OBJECT:
3606 * This state collects an entire Object (or Array), correctly handling
3607 * embedded strings. If the full element selector list matches this nested
3608 * object, we return the Object in full as a string. If not, we use this
3609 * state to skip to the next value at this level and continue processing.
3610 *
3611 * NOTE: This function uses various macros from strtolctype.h to manipulate
3612 * digit values, etc -- these have all been checked to ensure they make
3613 * no additional function calls.
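 *
 * A usage sketch (the D-level entry point is the json() subroutine,
 * which packs its selector argument into elemlist):
 *
 *	json("{\"foo\": [true, false]}", "foo[1]")	returns "false"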
3614 */
3615static char *
3616dtrace_json(uint64_t size, uintptr_t json, char *elemlist, int nelems,
3617 char *dest)
3618{
3619 dtrace_json_state_t state = DTRACE_JSON_REST;
3620 int64_t array_elem = INT64_MIN;
3621 int64_t array_pos = 0;
3622 uint8_t escape_unicount = 0;
3623 boolean_t string_is_key = B_FALSE;
3624 boolean_t collect_object = B_FALSE;
3625 boolean_t found_key = B_FALSE;
3626 boolean_t in_array = B_FALSE;
3627 uint32_t braces = 0, brackets = 0;
3628 char *elem = elemlist;
3629 char *dd = dest;
3630 uintptr_t cur;
3631
3632 for (cur = json; cur < json + size; cur++) {
3633 char cc = dtrace_load8(cur);
3634 if (cc == '\0')
3635 return (NULL);
3636
3637 switch (state) {
3638 case DTRACE_JSON_REST:
3639 if (isspace(cc))
3640 break;
3641
3642 if (cc == '{') {
3643 state = DTRACE_JSON_OBJECT;
3644 break;
3645 }
3646
3647 if (cc == '[') {
3648 in_array = B_TRUE;
3649 array_pos = 0;
3650 array_elem = dtrace_strtoll(elem, 10, size);
3651 found_key = array_elem == 0 ? B_TRUE : B_FALSE;
3652 state = DTRACE_JSON_VALUE;
3653 break;
3654 }
3655
3656 /*
3657 * ERROR: expected to find a top-level object or array.
3658 */
3659 return (NULL);
3660 case DTRACE_JSON_OBJECT:
3661 if (isspace(cc))
3662 break;
3663
3664 if (cc == '"') {
3665 state = DTRACE_JSON_STRING;
3666 string_is_key = B_TRUE;
3667 break;
3668 }
3669
3670 /*
3671 * ERROR: either the object did not start with a key
3672 * string, or we've run off the end of the object
3673 * without finding the requested key.
3674 */
3675 return (NULL);
3676 case DTRACE_JSON_STRING:
3677 if (cc == '\\') {
3678 *dd++ = '\\';
3679 state = DTRACE_JSON_STRING_ESCAPE;
3680 break;
3681 }
3682
3683 if (cc == '"') {
3684 if (collect_object) {
3685 /*
3686 * We don't reset the dest here, as
3687 * the string is part of a larger
3688 * object being collected.
3689 */
3690 *dd++ = cc;
3691 collect_object = B_FALSE;
3692 state = DTRACE_JSON_COLLECT_OBJECT;
3693 break;
3694 }
3695 *dd = '\0';
3696 dd = dest; /* reset string buffer */
3697 if (string_is_key) {
3698 if (dtrace_strncmp(dest, elem,
3699 size) == 0)
3700 found_key = B_TRUE;
3701 } else if (found_key) {
3702 if (nelems > 1) {
3703 /*
3704 * We expected an object, not
3705 * this string.
3706 */
3707 return (NULL);
3708 }
3709 return (dest);
3710 }
3711 state = string_is_key ? DTRACE_JSON_COLON :
3712 DTRACE_JSON_COMMA;
3713 string_is_key = B_FALSE;
3714 break;
3715 }
3716
3717 *dd++ = cc;
3718 break;
3719 case DTRACE_JSON_STRING_ESCAPE:
3720 *dd++ = cc;
3721 if (cc == 'u') {
3722 escape_unicount = 0;
3723 state = DTRACE_JSON_STRING_ESCAPE_UNICODE;
3724 } else {
3725 state = DTRACE_JSON_STRING;
3726 }
3727 break;
3728 case DTRACE_JSON_STRING_ESCAPE_UNICODE:
3729 if (!isxdigit(cc)) {
3730 /*
3731 * ERROR: invalid unicode escape, expected
3732 * four valid hexadecimal digits.
3733 */
3734 return (NULL);
3735 }
3736
3737 *dd++ = cc;
3738 if (++escape_unicount == 4)
3739 state = DTRACE_JSON_STRING;
3740 break;
3741 case DTRACE_JSON_COLON:
3742 if (isspace(cc))
3743 break;
3744
3745 if (cc == ':') {
3746 state = DTRACE_JSON_VALUE;
3747 break;
3748 }
3749
3750 /*
3751 * ERROR: expected a colon.
3752 */
3753 return (NULL);
3754 case DTRACE_JSON_COMMA:
3755 if (isspace(cc))
3756 break;
3757
3758 if (cc == ',') {
3759 if (in_array) {
3760 state = DTRACE_JSON_VALUE;
3761 if (++array_pos == array_elem)
3762 found_key = B_TRUE;
3763 } else {
3764 state = DTRACE_JSON_OBJECT;
3765 }
3766 break;
3767 }
3768
3769 /*
3770 * ERROR: either we hit an unexpected character, or
3771 * we reached the end of the object or array without
3772 * finding the requested key.
3773 */
3774 return (NULL);
3775 case DTRACE_JSON_IDENTIFIER:
3776 if (islower(cc)) {
3777 *dd++ = cc;
3778 break;
3779 }
3780
3781 *dd = '\0';
3782 dd = dest; /* reset string buffer */
3783
3784 if (dtrace_strncmp(dest, "true", 5) == 0 ||
3785 dtrace_strncmp(dest, "false", 6) == 0 ||
3786 dtrace_strncmp(dest, "null", 5) == 0) {
3787 if (found_key) {
3788 if (nelems > 1) {
3789 /*
3790 * ERROR: We expected an object,
3791 * not this identifier.
3792 */
3793 return (NULL);
3794 }
3795 return (dest);
3796 } else {
3797 cur--;
3798 state = DTRACE_JSON_COMMA;
3799 break;
3800 }
3801 }
3802
3803 /*
3804 * ERROR: we did not recognize the identifier as one
3805 * of those in the JSON specification.
3806 */
3807 return (NULL);
3808 case DTRACE_JSON_NUMBER:
3809 if (cc == '.') {
3810 *dd++ = cc;
3811 state = DTRACE_JSON_NUMBER_FRAC;
3812 break;
3813 }
3814
3815 if (cc == 'x' || cc == 'X') {
3816 /*
3817 * ERROR: specification explicitly excludes
3818 * hexadecimal or octal numbers.
3819 */
3820 return (NULL);
3821 }
3822
3823 /* FALLTHRU */
3824 case DTRACE_JSON_NUMBER_FRAC:
3825 if (cc == 'e' || cc == 'E') {
3826 *dd++ = cc;
3827 state = DTRACE_JSON_NUMBER_EXP;
3828 break;
3829 }
3830
3831 if (cc == '+' || cc == '-') {
3832 /*
3833 * ERROR: expect sign as part of exponent only.
3834 */
3835 return (NULL);
3836 }
3837 /* FALLTHRU */
3838 case DTRACE_JSON_NUMBER_EXP:
3839 if (isdigit(cc) || cc == '+' || cc == '-') {
3840 *dd++ = cc;
3841 break;
3842 }
3843
3844 *dd = '\0';
3845 dd = dest; /* reset string buffer */
3846 if (found_key) {
3847 if (nelems > 1) {
3848 /*
3849 * ERROR: We expected an object, not
3850 * this number.
3851 */
3852 return (NULL);
3853 }
3854 return (dest);
3855 }
3856
3857 cur--;
3858 state = DTRACE_JSON_COMMA;
3859 break;
3860 case DTRACE_JSON_VALUE:
3861 if (isspace(cc))
3862 break;
3863
3864 if (cc == '{' || cc == '[') {
3865 if (nelems > 1 && found_key) {
3866 in_array = cc == '[' ? B_TRUE : B_FALSE;
3867 /*
3868 * If our element selector directs us
3869 * to descend into this nested object,
3870 * then move to the next selector
3871 * element in the list and restart the
3872 * state machine.
3873 */
3874 while (*elem != '\0')
3875 elem++;
3876 elem++; /* skip the inter-element NUL */
3877 nelems--;
3878 dd = dest;
3879 if (in_array) {
3880 state = DTRACE_JSON_VALUE;
3881 array_pos = 0;
3882 array_elem = dtrace_strtoll(
3883 elem, 10, size);
3884 found_key = array_elem == 0 ?
3885 B_TRUE : B_FALSE;
3886 } else {
3887 found_key = B_FALSE;
3888 state = DTRACE_JSON_OBJECT;
3889 }
3890 break;
3891 }
3892
3893 /*
3894 * Otherwise, we wish to either skip this
3895 * nested object or return it in full.
3896 */
3897 if (cc == '[')
3898 brackets = 1;
3899 else
3900 braces = 1;
3901 *dd++ = cc;
3902 state = DTRACE_JSON_COLLECT_OBJECT;
3903 break;
3904 }
3905
3906 if (cc == '"') {
3907 state = DTRACE_JSON_STRING;
3908 break;
3909 }
3910
3911 if (islower(cc)) {
3912 /*
3913 * Here we deal with true, false and null.
3914 */
3915 *dd++ = cc;
3916 state = DTRACE_JSON_IDENTIFIER;
3917 break;
3918 }
3919
3920 if (cc == '-' || isdigit(cc)) {
3921 *dd++ = cc;
3922 state = DTRACE_JSON_NUMBER;
3923 break;
3924 }
3925
3926 /*
3927 * ERROR: unexpected character at start of value.
3928 */
3929 return (NULL);
3930 case DTRACE_JSON_COLLECT_OBJECT:
3931 if (cc == '\0')
3932 /*
3933 * ERROR: unexpected end of input.
3934 */
3935 return (NULL);
3936
3937 *dd++ = cc;
3938 if (cc == '"') {
3939 collect_object = B_TRUE;
3940 state = DTRACE_JSON_STRING;
3941 break;
3942 }
3943
3944 if (cc == ']') {
3945 if (brackets-- == 0) {
3946 /*
3947 * ERROR: unbalanced brackets.
3948 */
3949 return (NULL);
3950 }
3951 } else if (cc == '}') {
3952 if (braces-- == 0) {
3953 /*
3954 * ERROR: unbalanced braces.
3955 */
3956 return (NULL);
3957 }
3958 } else if (cc == '{') {
3959 braces++;
3960 } else if (cc == '[') {
3961 brackets++;
3962 }
3963
3964 if (brackets == 0 && braces == 0) {
3965 if (found_key) {
3966 *dd = '\0';
3967 return (dest);
3968 }
3969 dd = dest; /* reset string buffer */
3970 state = DTRACE_JSON_COMMA;
3971 }
3972 break;
3973 }
3974 }
3975 return (NULL);
3976}
3977
3978/*
3263 * Emulate the execution of DTrace ID subroutines invoked by the call opcode.
3264 * Notice that we don't bother validating the proper number of arguments or
3265 * their types in the tuple stack. This isn't needed because all argument
3266 * interpretation is safe because of our load safety -- the worst that can
3267 * happen is that a bogus program can obtain bogus results.
3268 */
3269static void
3270dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs,
3271 dtrace_key_t *tupregs, int nargs,
3272 dtrace_mstate_t *mstate, dtrace_state_t *state)
3273{
3274 volatile uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags;
3275 volatile uintptr_t *illval = &cpu_core[curcpu].cpuc_dtrace_illval;
3276 dtrace_vstate_t *vstate = &state->dts_vstate;
3277
3278#if defined(sun)
3279 union {
3280 mutex_impl_t mi;
3281 uint64_t mx;
3282 } m;
3283
3284 union {
3285 krwlock_t ri;
3286 uintptr_t rw;
3287 } r;
3288#else
3289 struct thread *lowner;
3290 union {
3291 struct lock_object *li;
3292 uintptr_t lx;
3293 } l;
3294#endif
3295
3296 switch (subr) {
3297 case DIF_SUBR_RAND:
3298 regs[rd] = (dtrace_gethrtime() * 2416 + 374441) % 1771875;
3299 break;
3300
3301#if defined(sun)
3302 case DIF_SUBR_MUTEX_OWNED:
3303 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
3304 mstate, vstate)) {
3305 regs[rd] = 0;
3306 break;
3307 }
3308
3309 m.mx = dtrace_load64(tupregs[0].dttk_value);
3310 if (MUTEX_TYPE_ADAPTIVE(&m.mi))
3311 regs[rd] = MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER;
3312 else
3313 regs[rd] = LOCK_HELD(&m.mi.m_spin.m_spinlock);
3314 break;
3315
3978/*
3979 * Emulate the execution of DTrace DIF subroutines invoked by the call opcode.
3980 * Notice that we don't bother validating the proper number of arguments or
3981 * their types in the tuple stack. This isn't needed because all argument
3982 * interpretation is safe because of our load safety -- the worst that can
3983 * happen is that a bogus program can obtain bogus results.
3984 */
3985static void
3986dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs,
3987 dtrace_key_t *tupregs, int nargs,
3988 dtrace_mstate_t *mstate, dtrace_state_t *state)
3989{
3990 volatile uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags;
3991 volatile uintptr_t *illval = &cpu_core[curcpu].cpuc_dtrace_illval;
3992 dtrace_vstate_t *vstate = &state->dts_vstate;
3993
3994#if defined(sun)
3995 union {
3996 mutex_impl_t mi;
3997 uint64_t mx;
3998 } m;
3999
4000 union {
4001 krwlock_t ri;
4002 uintptr_t rw;
4003 } r;
4004#else
4005 struct thread *lowner;
4006 union {
4007 struct lock_object *li;
4008 uintptr_t lx;
4009 } l;
4010#endif
4011
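	/*
	 * A note on the unions above: the raw word fetched by
	 * dtrace_load64()/dtrace_loadptr() is reinterpreted as the
	 * platform's lock representation through a union, so each lock is
	 * examined via a validated local copy and never dereferenced
	 * directly.  Minimal sketch of the idiom (illustrative only; the
	 * names below are not from this file):
	 *
	 *	union { uint64_t raw; struct lock_repr l; } u;
	 *	u.raw = fault_safe_load64(addr);  /- address already vetted -/
	 *	inspect(&u.l);                    /- operate on the copy -/
	 */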
4012 switch (subr) {
4013 case DIF_SUBR_RAND:
4014 regs[rd] = (dtrace_gethrtime() * 2416 + 374441) % 1771875;
4015 break;
4016
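	/*
	 * rand() above is a single linear-congruential step seeded from
	 * the current high-resolution time: cheap and fault-free, but not
	 * a statistically strong (let alone cryptographic) generator.
	 * Illustrative D usage:
	 *
	 *	dtrace -n 'BEGIN { trace(rand() % 100); exit(0); }'
	 */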
4017#if defined(sun)
4018 case DIF_SUBR_MUTEX_OWNED:
4019 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4020 mstate, vstate)) {
4021 regs[rd] = 0;
4022 break;
4023 }
4024
4025 m.mx = dtrace_load64(tupregs[0].dttk_value);
4026 if (MUTEX_TYPE_ADAPTIVE(&m.mi))
4027 regs[rd] = MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER;
4028 else
4029 regs[rd] = LOCK_HELD(&m.mi.m_spin.m_spinlock);
4030 break;
4031
4032 case DIF_SUBR_MUTEX_OWNER:
4033 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4034 mstate, vstate)) {
4035 regs[rd] = 0;
4036 break;
4037 }
4038
4039 m.mx = dtrace_load64(tupregs[0].dttk_value);
4040 if (MUTEX_TYPE_ADAPTIVE(&m.mi) &&
4041 MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER)
4042 regs[rd] = (uintptr_t)MUTEX_OWNER(&m.mi);
4043 else
4044 regs[rd] = 0;
4045 break;
4046
4047 case DIF_SUBR_MUTEX_TYPE_ADAPTIVE:
4048 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4049 mstate, vstate)) {
4050 regs[rd] = 0;
4051 break;
4052 }
4053
4054 m.mx = dtrace_load64(tupregs[0].dttk_value);
4055 regs[rd] = MUTEX_TYPE_ADAPTIVE(&m.mi);
4056 break;
4057
4058 case DIF_SUBR_MUTEX_TYPE_SPIN:
4059 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4060 mstate, vstate)) {
4061 regs[rd] = 0;
4062 break;
4063 }
4064
4065 m.mx = dtrace_load64(tupregs[0].dttk_value);
4066 regs[rd] = MUTEX_TYPE_SPIN(&m.mi);
4067 break;
4068
4069 case DIF_SUBR_RW_READ_HELD: {
4070 uintptr_t tmp;
4071
4072 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
4073 mstate, vstate)) {
4074 regs[rd] = 0;
4075 break;
4076 }
4077
4078 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
4079 regs[rd] = _RW_READ_HELD(&r.ri, tmp);
4080 break;
4081 }
4082
4083 case DIF_SUBR_RW_WRITE_HELD:
4084 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
4085 mstate, vstate)) {
4086 regs[rd] = 0;
4087 break;
4088 }
4089
4090 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
4091 regs[rd] = _RW_WRITE_HELD(&r.ri);
4092 break;
4093
4094 case DIF_SUBR_RW_ISWRITER:
4095 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
4096 mstate, vstate)) {
4097 regs[rd] = 0;
4098 break;
4099 }
4100
4101 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
4102 regs[rd] = _RW_ISWRITER(&r.ri);
4103 break;
4104
4105#else
4106 case DIF_SUBR_MUTEX_OWNED:
4107 if (!dtrace_canload(tupregs[0].dttk_value,
4108 sizeof (struct lock_object), mstate, vstate)) {
4109 regs[rd] = 0;
4110 break;
4111 }
4112 l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
4113 regs[rd] = LOCK_CLASS(l.li)->lc_owner(l.li, &lowner);
4114 break;
4115
4116 case DIF_SUBR_MUTEX_OWNER:
4117 if (!dtrace_canload(tupregs[0].dttk_value,
4118 sizeof (struct lock_object), mstate, vstate)) {
4119 regs[rd] = 0;
4120 break;
4121 }
4122 l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
4123 LOCK_CLASS(l.li)->lc_owner(l.li, &lowner);
4124 regs[rd] = (uintptr_t)lowner;
4125 break;
4126
4127 case DIF_SUBR_MUTEX_TYPE_ADAPTIVE:
4128 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (struct mtx),
4129 mstate, vstate)) {
4130 regs[rd] = 0;
4131 break;
4132 }
4133 l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
4134 /* XXX - should be only LC_SLEEPABLE? */
4135 regs[rd] = (LOCK_CLASS(l.li)->lc_flags &
4136 (LC_SLEEPLOCK | LC_SLEEPABLE)) != 0;
4137 break;
4138
4139 case DIF_SUBR_MUTEX_TYPE_SPIN:
4140 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (struct mtx),
4141 mstate, vstate)) {
4142 regs[rd] = 0;
4143 break;
4144 }
4145 l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
4146 regs[rd] = (LOCK_CLASS(l.li)->lc_flags & LC_SPINLOCK) != 0;
4147 break;
4148
4149 case DIF_SUBR_RW_READ_HELD:
4150 case DIF_SUBR_SX_SHARED_HELD:
4151 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
4152 mstate, vstate)) {
4153 regs[rd] = 0;
4154 break;
4155 }
4156 l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
4157 regs[rd] = LOCK_CLASS(l.li)->lc_owner(l.li, &lowner) &&
4158 lowner == NULL;
4159 break;
4160
4161 case DIF_SUBR_RW_WRITE_HELD:
4162 case DIF_SUBR_SX_EXCLUSIVE_HELD:
4163 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
4164 mstate, vstate)) {
4165 regs[rd] = 0;
4166 break;
4167 }
4168 l.lx = dtrace_loadptr(tupregs[0].dttk_value);
4169 LOCK_CLASS(l.li)->lc_owner(l.li, &lowner);
4170 regs[rd] = (lowner == curthread);
4171 break;
4172
4173 case DIF_SUBR_RW_ISWRITER:
4174 case DIF_SUBR_SX_ISEXCLUSIVE:
4175 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
4176 mstate, vstate)) {
4177 regs[rd] = 0;
4178 break;
4179 }
4180 l.lx = dtrace_loadptr(tupregs[0].dttk_value);
4181 regs[rd] = LOCK_CLASS(l.li)->lc_owner(l.li, &lowner) &&
4182 lowner != NULL;
4183 break;
4184#endif /* ! defined(sun) */
4185
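	/*
	 * All of the lock subroutines above share one shape: validate the
	 * caller-supplied address with dtrace_canload(), fetch the lock
	 * word with a fault-safe load, and derive the answer from that
	 * local copy.  Illustrative D usage (the probe and lock symbol
	 * here are hypothetical):
	 *
	 *	fbt::foo_ioctl:entry
	 *	/mutex_owned(&`foo_mtx)/
	 *	{
	 *		printf("owner: %p", mutex_owner(&`foo_mtx));
	 *	}
	 */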
4186 case DIF_SUBR_BCOPY: {
4187 /*
4188 * We need to be sure that the destination is in the scratch
4189 * region -- no other region is allowed.
4190 */
4191 uintptr_t src = tupregs[0].dttk_value;
4192 uintptr_t dest = tupregs[1].dttk_value;
4193 size_t size = tupregs[2].dttk_value;
4194
4195 if (!dtrace_inscratch(dest, size, mstate)) {
4196 *flags |= CPU_DTRACE_BADADDR;
4197 *illval = regs[rd];
4198 break;
4199 }
4200
4201 if (!dtrace_canload(src, size, mstate, vstate)) {
4202 regs[rd] = 0;
4203 break;
4204 }
4205
4206 dtrace_bcopy((void *)src, (void *)dest, size);
4207 break;
4208 }
4209
4210 case DIF_SUBR_ALLOCA:
4211 case DIF_SUBR_COPYIN: {
4212 uintptr_t dest = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
4213 uint64_t size =
4214 tupregs[subr == DIF_SUBR_ALLOCA ? 0 : 1].dttk_value;
4215 size_t scratch_size = (dest - mstate->dtms_scratch_ptr) + size;
4216
4217 /*
4218 * This action doesn't require any credential checks since
4219 * probes will not activate in user contexts to which the
4220 * enabling user does not have permissions.
4221 */
4222
4223 /*
4224 * Rounding up the user allocation size could have overflowed
4225 * a large, bogus allocation (like -1ULL) to 0.
4226 */
4227 if (scratch_size < size ||
4228 !DTRACE_INSCRATCH(mstate, scratch_size)) {
4229 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4230 regs[rd] = 0;
4231 break;
4232 }
4233
4234 if (subr == DIF_SUBR_COPYIN) {
4235 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4236 dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
4237 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4238 }
4239
4240 mstate->dtms_scratch_ptr += scratch_size;
4241 regs[rd] = dest;
4242 break;
4243 }
4244
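	/*
	 * Worked example of the wrap check above: if dtms_scratch_ptr is
	 * 4 bytes shy of 8-byte alignment (padding of 4) and a bogus size
	 * of -1ULL is requested, scratch_size = 4 + 0xffffffffffffffff
	 * wraps to 3, which is less than size -- so the request is
	 * rejected instead of being mistaken for a tiny allocation.
	 */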
4245 case DIF_SUBR_COPYINTO: {
4246 uint64_t size = tupregs[1].dttk_value;
4247 uintptr_t dest = tupregs[2].dttk_value;
4248
4249 /*
4250 * This action doesn't require any credential checks since
4251 * probes will not activate in user contexts to which the
4252 * enabling user does not have permissions.
4253 */
4254 if (!dtrace_inscratch(dest, size, mstate)) {
4255 *flags |= CPU_DTRACE_BADADDR;
4256 *illval = regs[rd];
4257 break;
4258 }
4259
4260 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4261 dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
4262 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4263 break;
4264 }
4265
4266 case DIF_SUBR_COPYINSTR: {
4267 uintptr_t dest = mstate->dtms_scratch_ptr;
4268 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4269
4270 if (nargs > 1 && tupregs[1].dttk_value < size)
4271 size = tupregs[1].dttk_value + 1;
4272
4273 /*
4274 * This action doesn't require any credential checks since
4275 * probes will not activate in user contexts to which the
4276 * enabling user does not have permissions.
4277 */
4278 if (!DTRACE_INSCRATCH(mstate, size)) {
4279 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4280 regs[rd] = 0;
4281 break;
4282 }
4283
4284 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4285 dtrace_copyinstr(tupregs[0].dttk_value, dest, size, flags);
4286 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4287
4288 ((char *)dest)[size - 1] = '\0';
4289 mstate->dtms_scratch_ptr += size;
4290 regs[rd] = dest;
4291 break;
4292 }
4293
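	/*
	 * Note that copyinstr() always NUL-terminates at size - 1, and the
	 * optional second argument can shrink the copy below the strsize
	 * option.  Illustrative D usage:
	 *
	 *	syscall::open:entry { printf("%s", copyinstr(arg0)); }
	 */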
4294#if defined(sun)
4295 case DIF_SUBR_MSGSIZE:
4296 case DIF_SUBR_MSGDSIZE: {
4297 uintptr_t baddr = tupregs[0].dttk_value, daddr;
4298 uintptr_t wptr, rptr;
4299 size_t count = 0;
4300 int cont = 0;
4301
4302 while (baddr != 0 && !(*flags & CPU_DTRACE_FAULT)) {
4303
4304 if (!dtrace_canload(baddr, sizeof (mblk_t), mstate,
4305 vstate)) {
4306 regs[rd] = 0;
4307 break;
4308 }
4309
4310 wptr = dtrace_loadptr(baddr +
4311 offsetof(mblk_t, b_wptr));
4312
4313 rptr = dtrace_loadptr(baddr +
4314 offsetof(mblk_t, b_rptr));
4315
4316 if (wptr < rptr) {
4317 *flags |= CPU_DTRACE_BADADDR;
4318 *illval = tupregs[0].dttk_value;
4319 break;
4320 }
4321
4322 daddr = dtrace_loadptr(baddr +
4323 offsetof(mblk_t, b_datap));
4324
4325 baddr = dtrace_loadptr(baddr +
4326 offsetof(mblk_t, b_cont));
4327
4328 /*
4329			 * We want to guard against denial-of-service here,
4330			 * so we only search the list for at most
4331 * dtrace_msgdsize_max mblks.
4332 */
4333 if (cont++ > dtrace_msgdsize_max) {
4334 *flags |= CPU_DTRACE_ILLOP;
4335 break;
4336 }
4337
4338 if (subr == DIF_SUBR_MSGDSIZE) {
4339 if (dtrace_load8(daddr +
4340 offsetof(dblk_t, db_type)) != M_DATA)
4341 continue;
4342 }
4343
4344 count += wptr - rptr;
4345 }
4346
4347 if (!(*flags & CPU_DTRACE_FAULT))
4348 regs[rd] = count;
4349
4350 break;
4351 }
4352#endif
4353
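	/*
	 * To summarize the walk above: msgsize() sums (b_wptr - b_rptr)
	 * across the b_cont chain, msgdsize() counts only M_DATA blocks,
	 * and the dtrace_msgdsize_max cap ensures that a corrupt or
	 * circular chain cannot spin forever in probe context.
	 */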
4354 case DIF_SUBR_PROGENYOF: {
4355 pid_t pid = tupregs[0].dttk_value;
4356 proc_t *p;
4357 int rval = 0;
4358
4359 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4360
4361 for (p = curthread->t_procp; p != NULL; p = p->p_parent) {
4362#if defined(sun)
4363 if (p->p_pidp->pid_id == pid) {
4364#else
4365 if (p->p_pid == pid) {
4366#endif
4367 rval = 1;
4368 break;
4369 }
4370 }
4371
4372 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4373
4374 regs[rd] = rval;
4375 break;
4376 }
4377
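	/*
	 * progenyof() is typically used as a predicate to restrict an
	 * enabling to a single process tree.  Illustrative D usage:
	 *
	 *	syscall:::entry
	 *	/progenyof($target)/
	 *	{ @[probefunc] = count(); }
	 */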
4378 case DIF_SUBR_SPECULATION:
4379 regs[rd] = dtrace_speculation(state);
4380 break;
4381
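	/*
	 * speculation() only reserves a speculative buffer and returns its
	 * id; that id is then consumed by speculate(), commit() and
	 * discard() in later clauses.  Sketch of the canonical pattern:
	 *
	 *	syscall::open:entry  { self->s = speculation(); }
	 *	syscall::open:entry  /self->s/ { speculate(self->s);
	 *	    printf("%s", copyinstr(arg0)); }
	 *	syscall::open:return /self->s/ { commit(self->s); self->s = 0; }
	 */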
4382 case DIF_SUBR_COPYOUT: {
4383 uintptr_t kaddr = tupregs[0].dttk_value;
4384 uintptr_t uaddr = tupregs[1].dttk_value;
4385 uint64_t size = tupregs[2].dttk_value;
4386
4387 if (!dtrace_destructive_disallow &&
4388 dtrace_priv_proc_control(state) &&
4389 !dtrace_istoxic(kaddr, size)) {
4390 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4391 dtrace_copyout(kaddr, uaddr, size, flags);
4392 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4393 }
4394 break;
4395 }
4396
4397 case DIF_SUBR_COPYOUTSTR: {
4398 uintptr_t kaddr = tupregs[0].dttk_value;
4399 uintptr_t uaddr = tupregs[1].dttk_value;
4400 uint64_t size = tupregs[2].dttk_value;
4401
4402 if (!dtrace_destructive_disallow &&
4403 dtrace_priv_proc_control(state) &&
4404 !dtrace_istoxic(kaddr, size)) {
4405 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4406 dtrace_copyoutstr(kaddr, uaddr, size, flags);
4407 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4408 }
4409 break;
4410 }
4411
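	/*
	 * copyout() and copyoutstr() are destructive actions: they run
	 * only when destructive actions are globally permitted
	 * (dtrace_destructive_disallow is clear) and the consumer holds
	 * proc-control privilege -- hence dtrace(1)'s -w flag.  Note that
	 * failing these checks silently leaves regs[rd] untouched rather
	 * than raising a fault.
	 */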
4412 case DIF_SUBR_STRLEN: {
4413 size_t sz;
4414 uintptr_t addr = (uintptr_t)tupregs[0].dttk_value;
4415 sz = dtrace_strlen((char *)addr,
4416 state->dts_options[DTRACEOPT_STRSIZE]);
4417
4418 if (!dtrace_canload(addr, sz + 1, mstate, vstate)) {
4419 regs[rd] = 0;
4420 break;
4421 }
4422
4423 regs[rd] = sz;
4424
4425 break;
4426 }
4427
4428 case DIF_SUBR_STRCHR:
4429 case DIF_SUBR_STRRCHR: {
4430 /*
4431 * We're going to iterate over the string looking for the
4432 * specified character. We will iterate until we have reached
4433 * the string length or we have found the character. If this
4434 * is DIF_SUBR_STRRCHR, we will look for the last occurrence
4435 * of the specified character instead of the first.
4436 */
4437 uintptr_t saddr = tupregs[0].dttk_value;
4438 uintptr_t addr = tupregs[0].dttk_value;
4439 uintptr_t limit = addr + state->dts_options[DTRACEOPT_STRSIZE];
4440 char c, target = (char)tupregs[1].dttk_value;
4441
4442 for (regs[rd] = 0; addr < limit; addr++) {
4443 if ((c = dtrace_load8(addr)) == target) {
4444 regs[rd] = addr;
4445
4446 if (subr == DIF_SUBR_STRCHR)
4447 break;
4448 }
4449
4450 if (c == '\0')
4451 break;
4452 }
4453
4454 if (!dtrace_canload(saddr, addr - saddr, mstate, vstate)) {
4455 regs[rd] = 0;
4456 break;
4457 }
4458
4459 break;
4460 }
4461
4462 case DIF_SUBR_STRSTR:
4463 case DIF_SUBR_INDEX:
4464 case DIF_SUBR_RINDEX: {
4465 /*
4466 * We're going to iterate over the string looking for the
4467 * specified string. We will iterate until we have reached
4468 * the string length or we have found the string. (Yes, this
4469 * is done in the most naive way possible -- but considering
4470 * that the string we're searching for is likely to be
4471 * relatively short, the complexity of Rabin-Karp or similar
4472 * hardly seems merited.)
4473 */
4474 char *addr = (char *)(uintptr_t)tupregs[0].dttk_value;
4475 char *substr = (char *)(uintptr_t)tupregs[1].dttk_value;
4476 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4477 size_t len = dtrace_strlen(addr, size);
4478 size_t sublen = dtrace_strlen(substr, size);
4479 char *limit = addr + len, *orig = addr;
4480 int notfound = subr == DIF_SUBR_STRSTR ? 0 : -1;
4481 int inc = 1;
4482
4483 regs[rd] = notfound;
4484
4485 if (!dtrace_canload((uintptr_t)addr, len + 1, mstate, vstate)) {
4486 regs[rd] = 0;
4487 break;
4488 }
4489
4490 if (!dtrace_canload((uintptr_t)substr, sublen + 1, mstate,
4491 vstate)) {
4492 regs[rd] = 0;
4493 break;
4494 }
4495
4496 /*
4497 * strstr() and index()/rindex() have similar semantics if
4498 * both strings are the empty string: strstr() returns a
4499 * pointer to the (empty) string, and index() and rindex()
4500 * both return index 0 (regardless of any position argument).
4501 */
4502 if (sublen == 0 && len == 0) {
4503 if (subr == DIF_SUBR_STRSTR)
4504 regs[rd] = (uintptr_t)addr;
4505 else
4506 regs[rd] = 0;
4507 break;
4508 }
4509
4510 if (subr != DIF_SUBR_STRSTR) {
4511 if (subr == DIF_SUBR_RINDEX) {
4512 limit = orig - 1;
4513 addr += len;
4514 inc = -1;
4515 }
4516
4517 /*
4518 * Both index() and rindex() take an optional position
4519 * argument that denotes the starting position.
4520 */
4521 if (nargs == 3) {
4522 int64_t pos = (int64_t)tupregs[2].dttk_value;
4523
4524 /*
4525 * If the position argument to index() is
4526 * negative, Perl implicitly clamps it at
4527 * zero. This semantic is a little surprising
4528 * given the special meaning of negative
4529 * positions to similar Perl functions like
4530 * substr(), but it appears to reflect a
4531 * notion that index() can start from a
4532 * negative index and increment its way up to
4533 * the string. Given this notion, Perl's
4534 * rindex() is at least self-consistent in
4535 * that it implicitly clamps positions greater
4536 * than the string length to be the string
4537 * length. Where Perl completely loses
4538 * coherence, however, is when the specified
4539 * substring is the empty string (""). In
4540 * this case, even if the position is
4541 * negative, rindex() returns 0 -- and even if
4542 * the position is greater than the length,
4543 * index() returns the string length. These
4544 * semantics violate the notion that index()
4545 * should never return a value less than the
4546 * specified position and that rindex() should
4547 * never return a value greater than the
4548 * specified position. (One assumes that
4549 * these semantics are artifacts of Perl's
4550 * implementation and not the results of
4551 * deliberate design -- it beggars belief that
4552 * even Larry Wall could desire such oddness.)
4553 * While in the abstract one would wish for
4554 * consistent position semantics across
4555 * substr(), index() and rindex() -- or at the
4556 * very least self-consistent position
4557 * semantics for index() and rindex() -- we
4558 * instead opt to keep with the extant Perl
4559 * semantics, in all their broken glory. (Do
4560 * we have more desire to maintain Perl's
4561 * semantics than Perl does? Probably.)
4562 */
4563 if (subr == DIF_SUBR_RINDEX) {
4564 if (pos < 0) {
4565 if (sublen == 0)
4566 regs[rd] = 0;
4567 break;
4568 }
4569
4570 if (pos > len)
4571 pos = len;
4572 } else {
4573 if (pos < 0)
4574 pos = 0;
4575
4576 if (pos >= len) {
4577 if (sublen == 0)
4578 regs[rd] = len;
4579 break;
4580 }
4581 }
4582
4583 addr = orig + pos;
4584 }
4585 }
4586
4587 for (regs[rd] = notfound; addr != limit; addr += inc) {
4588 if (dtrace_strncmp(addr, substr, sublen) == 0) {
4589 if (subr != DIF_SUBR_STRSTR) {
4590 /*
4591 * As D index() and rindex() are
4592 * modeled on Perl (and not on awk),
4593 * we return a zero-based (and not a
4594 * one-based) index. (For you Perl
4595 * weenies: no, we're not going to add
4596 * $[ -- and shouldn't you be at a con
4597 * or something?)
4598 */
4599 regs[rd] = (uintptr_t)(addr - orig);
4600 break;
4601 }
4602
4603 ASSERT(subr == DIF_SUBR_STRSTR);
4604 regs[rd] = (uintptr_t)addr;
4605 break;
4606 }
4607 }
4608
4609 break;
4610 }
4611
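	/*
	 * A few worked examples of the semantics implemented above:
	 *
	 *	strstr("banana", "na")   =>  pointer to "nana"
	 *	index("banana", "na")    =>  2   (zero-based, like Perl)
	 *	rindex("banana", "na")   =>  4
	 *	index("abc", "", 100)    =>  3   (clamped to strlen)
	 *	rindex("abc", "", -5)    =>  0   (Perl's empty-string quirk)
	 */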
4612 case DIF_SUBR_STRTOK: {
4613 uintptr_t addr = tupregs[0].dttk_value;
4614 uintptr_t tokaddr = tupregs[1].dttk_value;
4615 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4616 uintptr_t limit, toklimit = tokaddr + size;
4617 uint8_t c = 0, tokmap[32]; /* 256 / 8 */
4618 char *dest = (char *)mstate->dtms_scratch_ptr;
4619 int i;
4620
4621 /*
4622 * Check both the token buffer and (later) the input buffer,
4623 * since both could be non-scratch addresses.
4624 */
4625 if (!dtrace_strcanload(tokaddr, size, mstate, vstate)) {
4626 regs[rd] = 0;
4627 break;
4628 }
4629
4630 if (!DTRACE_INSCRATCH(mstate, size)) {
4631 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4632 regs[rd] = 0;
4633 break;
4634 }
4635
4636 if (addr == 0) {
4637 /*
4638 * If the address specified is NULL, we use our saved
4639 * strtok pointer from the mstate. Note that this
4640 * means that the saved strtok pointer is _only_
4641 * valid within multiple enablings of the same probe --
4642 * it behaves like an implicit clause-local variable.
4643 */
4644 addr = mstate->dtms_strtok;
4645 } else {
4646 /*
4647 * If the user-specified address is non-NULL we must
4648 * access check it. This is the only time we have
4649 * a chance to do so, since this address may reside
4650			 * in the string table of this clause -- future calls
4651 * (when we fetch addr from mstate->dtms_strtok)
4652 * would fail this access check.
4653 */
4654 if (!dtrace_strcanload(addr, size, mstate, vstate)) {
4655 regs[rd] = 0;
4656 break;
4657 }
4658 }
4659
4660 /*
4661 * First, zero the token map, and then process the token
4662 * string -- setting a bit in the map for every character
4663 * found in the token string.
4664 */
4665 for (i = 0; i < sizeof (tokmap); i++)
4666 tokmap[i] = 0;
4667
4668 for (; tokaddr < toklimit; tokaddr++) {
4669 if ((c = dtrace_load8(tokaddr)) == '\0')
4670 break;
4671
4672 ASSERT((c >> 3) < sizeof (tokmap));
4673 tokmap[c >> 3] |= (1 << (c & 0x7));
4674 }
4675
4676 for (limit = addr + size; addr < limit; addr++) {
4677 /*
4678 * We're looking for a character that is _not_ contained
4679 * in the token string.
4680 */
4681 if ((c = dtrace_load8(addr)) == '\0')
4682 break;
4683
4684 if (!(tokmap[c >> 3] & (1 << (c & 0x7))))
4685 break;
4686 }
4687
4688 if (c == '\0') {
4689 /*
4690 * We reached the end of the string without finding
4691 * any character that was not in the token string.
4692 * We return NULL in this case, and we set the saved
4693 * address to NULL as well.
4694 */
4695 regs[rd] = 0;
4696 mstate->dtms_strtok = 0;
4697 break;
4698 }
4699
4700 /*
4701 * From here on, we're copying into the destination string.
4702 */
4703 for (i = 0; addr < limit && i < size - 1; addr++) {
4704 if ((c = dtrace_load8(addr)) == '\0')
4705 break;
4706
4707 if (tokmap[c >> 3] & (1 << (c & 0x7)))
4708 break;
4709
4710 ASSERT(i < size);
4711 dest[i++] = c;
4712 }
4713
4714 ASSERT(i < size);
4715 dest[i] = '\0';
4716 regs[rd] = (uintptr_t)dest;
4717 mstate->dtms_scratch_ptr += size;
4718 mstate->dtms_strtok = addr;
4719 break;
4720 }
4721
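	/*
	 * The tokmap above is a 256-bit bitmap: byte c sets bit (c & 7) of
	 * tokmap[c >> 3], so membership tests are O(1) per character.
	 * Illustrative strtok() behavior:
	 *
	 *	strtok("/usr/local/bin", "/")  =>  "usr"
	 *	strtok(NULL, "/")              =>  "local"   (resumes from
	 *	                                   the saved dtms_strtok)
	 */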
4722 case DIF_SUBR_SUBSTR: {
4723 uintptr_t s = tupregs[0].dttk_value;
4724 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4725 char *d = (char *)mstate->dtms_scratch_ptr;
4726 int64_t index = (int64_t)tupregs[1].dttk_value;
4727 int64_t remaining = (int64_t)tupregs[2].dttk_value;
4728 size_t len = dtrace_strlen((char *)s, size);
4729 int64_t i = 0;
4730
4731 if (!dtrace_canload(s, len + 1, mstate, vstate)) {
4732 regs[rd] = 0;
4733 break;
4734 }
4735
4736 if (!DTRACE_INSCRATCH(mstate, size)) {
4737 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4738 regs[rd] = 0;
4739 break;
4740 }
4741
4742 if (nargs <= 2)
4743 remaining = (int64_t)size;
4744
4745 if (index < 0) {
4746 index += len;
4747
4748 if (index < 0 && index + remaining > 0) {
4749 remaining += index;
4750 index = 0;
4751 }
4752 }
4753
4754 if (index >= len || index < 0) {
4755 remaining = 0;
4756 } else if (remaining < 0) {
4757 remaining += len - index;
4758 } else if (index + remaining > size) {
4759 remaining = size - index;
4760 }
4761
4762 for (i = 0; i < remaining; i++) {
4763 if ((d[i] = dtrace_load8(s + index + i)) == '\0')
4764 break;
4765 }
4766
4767 d[i] = '\0';
4768
4769 mstate->dtms_scratch_ptr += size;
4770 regs[rd] = (uintptr_t)d;
4771 break;
4772 }
4773
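	/*
	 * Worked examples of the index/length clamping above:
	 *
	 *	substr("dtrace", 1, 2)   =>  "tr"
	 *	substr("dtrace", -3)     =>  "ace"   (negative index counts
	 *	                             back from the end)
	 *	substr("dtrace", 0, -2)  =>  "dtra"  (negative length drops
	 *	                             that many trailing bytes)
	 */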
4774 case DIF_SUBR_JSON: {
4775 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4776 uintptr_t json = tupregs[0].dttk_value;
4777 size_t jsonlen = dtrace_strlen((char *)json, size);
4778 uintptr_t elem = tupregs[1].dttk_value;
4779 size_t elemlen = dtrace_strlen((char *)elem, size);
4780
4781 char *dest = (char *)mstate->dtms_scratch_ptr;
4782 char *elemlist = (char *)mstate->dtms_scratch_ptr + jsonlen + 1;
4783 char *ee = elemlist;
4784 int nelems = 1;
4785 uintptr_t cur;
4786
4787 if (!dtrace_canload(json, jsonlen + 1, mstate, vstate) ||
4788 !dtrace_canload(elem, elemlen + 1, mstate, vstate)) {
4789 regs[rd] = 0;
4790 break;
4791 }
4792
4793 if (!DTRACE_INSCRATCH(mstate, jsonlen + 1 + elemlen + 1)) {
4794 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4795 regs[rd] = 0;
4796 break;
4797 }
4798
4799 /*
4800 * Read the element selector and split it up into a packed list
4801 * of strings.
4802 */
4803 for (cur = elem; cur < elem + elemlen; cur++) {
4804 char cc = dtrace_load8(cur);
4805
4806 if (cur == elem && cc == '[') {
4807 /*
4808 * If the first element selector key is
4809 * actually an array index then ignore the
4810 * bracket.
4811 */
4812 continue;
4813 }
4814
4815 if (cc == ']')
4816 continue;
4817
4818 if (cc == '.' || cc == '[') {
4819 nelems++;
4820 cc = '\0';
4821 }
4822
4823 *ee++ = cc;
4824 }
4825 *ee++ = '\0';
4826
4827 if ((regs[rd] = (uintptr_t)dtrace_json(size, json, elemlist,
4828 nelems, dest)) != 0)
4829 mstate->dtms_scratch_ptr += jsonlen + 1;
4830 break;
4831 }
4832
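	/*
	 * json() parses its element selector into the packed elemlist
	 * above ('.' and '[' both split, ']' is dropped) and then hands
	 * it to dtrace_json().  Illustrative usage:
	 *
	 *	json("{\"pri\": 10, \"ids\": [1, 3]}", "ids[1]")  =>  "3"
	 *	json("not json", "x")                             =>  NULL
	 */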
4833 case DIF_SUBR_TOUPPER:
4834 case DIF_SUBR_TOLOWER: {
4835 uintptr_t s = tupregs[0].dttk_value;
4836 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4837 char *dest = (char *)mstate->dtms_scratch_ptr, c;
4838 size_t len = dtrace_strlen((char *)s, size);
4839 char lower, upper, convert;
4840 int64_t i;
4841
4842 if (subr == DIF_SUBR_TOUPPER) {
4843 lower = 'a';
4844 upper = 'z';
4845 convert = 'A';
4846 } else {
4847 lower = 'A';
4848 upper = 'Z';
4849 convert = 'a';
4850 }
4851
4852 if (!dtrace_canload(s, len + 1, mstate, vstate)) {
4853 regs[rd] = 0;
4854 break;
4855 }
4856
4857 if (!DTRACE_INSCRATCH(mstate, size)) {
4858 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4859 regs[rd] = 0;
4860 break;
4861 }
4862
4863 for (i = 0; i < size - 1; i++) {
4864 if ((c = dtrace_load8(s + i)) == '\0')
4865 break;
4866
4867 if (c >= lower && c <= upper)
4868 c = convert + (c - lower);
4869
4870 dest[i] = c;
4871 }
4872
4873 ASSERT(i < size);
4874 dest[i] = '\0';
4875 regs[rd] = (uintptr_t)dest;
4876 mstate->dtms_scratch_ptr += size;
4877 break;
4878 }
4879
4880#if defined(sun)
4881 case DIF_SUBR_GETMAJOR:
4882#ifdef _LP64
4883 regs[rd] = (tupregs[0].dttk_value >> NBITSMINOR64) & MAXMAJ64;
4884#else
4885 regs[rd] = (tupregs[0].dttk_value >> NBITSMINOR) & MAXMAJ;
4886#endif
4887 break;
4888
4889 case DIF_SUBR_GETMINOR:
4890#ifdef _LP64
4891 regs[rd] = tupregs[0].dttk_value & MAXMIN64;
4892#else
4893 regs[rd] = tupregs[0].dttk_value & MAXMIN;
4894#endif
4895 break;
4896
4897 case DIF_SUBR_DDI_PATHNAME: {
4898 /*
4899 * This one is a galactic mess. We are going to roughly
4900 * emulate ddi_pathname(), but it's made more complicated
4901 * by the fact that we (a) want to include the minor name and
4902 * (b) must proceed iteratively instead of recursively.
4903 */
4904 uintptr_t dest = mstate->dtms_scratch_ptr;
4905 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4906 char *start = (char *)dest, *end = start + size - 1;
4907 uintptr_t daddr = tupregs[0].dttk_value;
4908 int64_t minor = (int64_t)tupregs[1].dttk_value;
4909 char *s;
4910 int i, len, depth = 0;
4911
4912 /*
4913 * Due to all the pointer jumping we do and context we must
4914 * rely upon, we just mandate that the user must have kernel
4915 * read privileges to use this routine.
4916 */
4917 if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) == 0) {
4918 *flags |= CPU_DTRACE_KPRIV;
4919 *illval = daddr;
4920 regs[rd] = 0;
4921 }
4922
4923 if (!DTRACE_INSCRATCH(mstate, size)) {
4924 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4925 regs[rd] = 0;
4926 break;
4927 }
4928
4929 *end = '\0';
4930
4931 /*
4932 * We want to have a name for the minor. In order to do this,
4933 * we need to walk the minor list from the devinfo. We want
4934 * to be sure that we don't infinitely walk a circular list,
4935 * so we check for circularity by sending a scout pointer
4936 * ahead two elements for every element that we iterate over;
4937 * if the list is circular, these will ultimately point to the
4938 * same element. You may recognize this little trick as the
4939 * answer to a stupid interview question -- one that always
4940 * seems to be asked by those who had to have it laboriously
4941 * explained to them, and who can't even concisely describe
4942 * the conditions under which one would be forced to resort to
4943 * this technique. Needless to say, those conditions are
4944 * found here -- and probably only here. Is this the only use
4945 * of this infamous trick in shipping, production code? If it
4946 * isn't, it probably should be...
4947 */
4948 if (minor != -1) {
4949 uintptr_t maddr = dtrace_loadptr(daddr +
4950 offsetof(struct dev_info, devi_minor));
4951
4952 uintptr_t next = offsetof(struct ddi_minor_data, next);
4953 uintptr_t name = offsetof(struct ddi_minor_data,
4954 d_minor) + offsetof(struct ddi_minor, name);
4955 uintptr_t dev = offsetof(struct ddi_minor_data,
4956 d_minor) + offsetof(struct ddi_minor, dev);
4957 uintptr_t scout;
4958
4959 if (maddr != NULL)
4960 scout = dtrace_loadptr(maddr + next);
4961
4962 while (maddr != NULL && !(*flags & CPU_DTRACE_FAULT)) {
4963 uint64_t m;
4964#ifdef _LP64
4965 m = dtrace_load64(maddr + dev) & MAXMIN64;
4966#else
4967 m = dtrace_load32(maddr + dev) & MAXMIN;
4968#endif
4969 if (m != minor) {
4970 maddr = dtrace_loadptr(maddr + next);
4971
4972 if (scout == NULL)
4973 continue;
4974
4975 scout = dtrace_loadptr(scout + next);
4976
4977 if (scout == NULL)
4978 continue;
4979
4980 scout = dtrace_loadptr(scout + next);
4981
4982 if (scout == NULL)
4983 continue;
4984
4985 if (scout == maddr) {
4986 *flags |= CPU_DTRACE_ILLOP;
4987 break;
4988 }
4989
4990 continue;
4991 }
4992
4993 /*
4994 * We have the minor data. Now we need to
4995 * copy the minor's name into the end of the
4996 * pathname.
4997 */
4998 s = (char *)dtrace_loadptr(maddr + name);
4999 len = dtrace_strlen(s, size);
5000
5001 if (*flags & CPU_DTRACE_FAULT)
5002 break;
5003
5004 if (len != 0) {
5005 if ((end -= (len + 1)) < start)
5006 break;
5007
5008 *end = ':';
5009 }
5010
5011 for (i = 1; i <= len; i++)
5012 end[i] = dtrace_load8((uintptr_t)s++);
5013 break;
5014 }
5015 }
5016
5017 while (daddr != NULL && !(*flags & CPU_DTRACE_FAULT)) {
5018 ddi_node_state_t devi_state;
5019
5020 devi_state = dtrace_load32(daddr +
5021 offsetof(struct dev_info, devi_node_state));
5022
5023 if (*flags & CPU_DTRACE_FAULT)
5024 break;
5025
5026 if (devi_state >= DS_INITIALIZED) {
5027 s = (char *)dtrace_loadptr(daddr +
5028 offsetof(struct dev_info, devi_addr));
5029 len = dtrace_strlen(s, size);
5030
5031 if (*flags & CPU_DTRACE_FAULT)
5032 break;
5033
5034 if (len != 0) {
5035 if ((end -= (len + 1)) < start)
5036 break;
5037
5038 *end = '@';
5039 }
5040
5041 for (i = 1; i <= len; i++)
5042 end[i] = dtrace_load8((uintptr_t)s++);
5043 }
5044
5045 /*
5046 * Now for the node name...
5047 */
5048 s = (char *)dtrace_loadptr(daddr +
5049 offsetof(struct dev_info, devi_node_name));
5050
5051 daddr = dtrace_loadptr(daddr +
5052 offsetof(struct dev_info, devi_parent));
5053
5054 /*
5055 * If our parent is NULL (that is, if we're the root
5056 * node), we're going to use the special path
5057 * "devices".
5058 */
5059 if (daddr == 0)
5060 s = "devices";
5061
5062 len = dtrace_strlen(s, size);
5063 if (*flags & CPU_DTRACE_FAULT)
5064 break;
5065
5066 if ((end -= (len + 1)) < start)
5067 break;
5068
5069 for (i = 1; i <= len; i++)
5070 end[i] = dtrace_load8((uintptr_t)s++);
5071 *end = '/';
5072
5073 if (depth++ > dtrace_devdepth_max) {
5074 *flags |= CPU_DTRACE_ILLOP;
5075 break;
5076 }
5077 }
5078
5079 if (end < start)
5080 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5081
5082 if (daddr == 0) {
5083 regs[rd] = (uintptr_t)end;
5084 mstate->dtms_scratch_ptr += size;
5085 }
5086
5087 break;
5088 }
5089#endif
5090
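	/*
	 * Minimal sketch of the scout-pointer cycle check used in the
	 * DDI_PATHNAME minor-list walk above (illustrative; assumes a
	 * singly-linked struct node { struct node *next; }):
	 *
	 *	slow = head;
	 *	scout = (head != NULL) ? head->next : NULL;
	 *	while (slow != NULL) {
	 *		slow = slow->next;		-- one step
	 *		if (scout != NULL)
	 *			scout = scout->next;	-- two steps
	 *		if (scout != NULL)
	 *			scout = scout->next;
	 *		if (scout != NULL && scout == slow)
	 *			return (ECYCLE);	-- list is circular
	 *	}
	 */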
5091 case DIF_SUBR_STRJOIN: {
5092 char *d = (char *)mstate->dtms_scratch_ptr;
5093 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5094 uintptr_t s1 = tupregs[0].dttk_value;
5095 uintptr_t s2 = tupregs[1].dttk_value;
5096 int i = 0;
5097
5098 if (!dtrace_strcanload(s1, size, mstate, vstate) ||
5099 !dtrace_strcanload(s2, size, mstate, vstate)) {
5100 regs[rd] = 0;
5101 break;
5102 }
5103
5104 if (!DTRACE_INSCRATCH(mstate, size)) {
5105 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5106 regs[rd] = 0;
5107 break;
5108 }
5109
5110 for (;;) {
5111 if (i >= size) {
5112 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5113 regs[rd] = 0;
5114 break;
5115 }
5116
5117 if ((d[i++] = dtrace_load8(s1++)) == '\0') {
5118 i--;
5119 break;
5120 }
5121 }
5122
5123 for (;;) {
5124 if (i >= size) {
5125 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5126 regs[rd] = 0;
5127 break;
5128 }
5129
5130 if ((d[i++] = dtrace_load8(s2++)) == '\0')
5131 break;
5132 }
5133
5134 if (i < size) {
5135 mstate->dtms_scratch_ptr += i;
5136 regs[rd] = (uintptr_t)d;
5137 }
5138
5139 break;
5140 }
5141
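	/*
	 * strjoin() copies s1 and then s2 into a single scratch buffer;
	 * the joined result must still fit within the strsize option or
	 * the whole call fails with NOSCRATCH.  Illustrative:
	 *
	 *	strjoin("foo", "bar")  =>  "foobar"
	 */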
5142 case DIF_SUBR_STRTOLL: {
5143 uintptr_t s = tupregs[0].dttk_value;
5144 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5145 int base = 10;
5146
5147 if (nargs > 1) {
5148 if ((base = tupregs[1].dttk_value) <= 1 ||
5149 base > ('z' - 'a' + 1) + ('9' - '0' + 1)) {
5150 *flags |= CPU_DTRACE_ILLOP;
5151 break;
5152 }
5153 }
5154
5155 if (!dtrace_strcanload(s, size, mstate, vstate)) {
5156 regs[rd] = INT64_MIN;
5157 break;
5158 }
5159
5160 regs[rd] = dtrace_strtoll((char *)s, base, size);
5161 break;
5162 }
5163
5164 case DIF_SUBR_LLTOSTR: {
5165 int64_t i = (int64_t)tupregs[0].dttk_value;
5166 uint64_t val, digit;
5167 uint64_t size = 65; /* enough room for 2^64 in binary */
5168 char *end = (char *)mstate->dtms_scratch_ptr + size - 1;
5169 int base = 10;
5170
5171 if (nargs > 1) {
5172 if ((base = tupregs[1].dttk_value) <= 1 ||
5173 base > ('z' - 'a' + 1) + ('9' - '0' + 1)) {
5174 *flags |= CPU_DTRACE_ILLOP;
5175 break;
5176 }
5177 }
5178
5179 val = (base == 10 && i < 0) ? i * -1 : i;
5180
5181 if (!DTRACE_INSCRATCH(mstate, size)) {
5182 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5183 regs[rd] = 0;
5184 break;
5185 }
5186
5187 for (*end-- = '\0'; val; val /= base) {
5188 if ((digit = val % base) <= '9' - '0') {
5189 *end-- = '0' + digit;
5190 } else {
5191 *end-- = 'a' + (digit - ('9' - '0') - 1);
5192 }
5193 }
5194
5195 if (i == 0 && base == 16)
5196 *end-- = '0';
5197
5198 if (base == 16)
5199 *end-- = 'x';
5200
5201 if (i == 0 || base == 8 || base == 16)
5202 *end-- = '0';
5203
5204 if (i < 0 && base == 10)
5205 *end-- = '-';
5206
5207 regs[rd] = (uintptr_t)end + 1;
5208 mstate->dtms_scratch_ptr += size;
5209 break;
5210 }
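	/*
	 * The digit loop above fills scratch right-to-left: e.g.,
	 * lltostr(-255) stores "552" and then the '-' sign, returning
	 * "-255", while lltostr(255, 16) stores "ff" followed by the
	 * "0x" prefix, returning "0xff".
	 */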
5211
5212 case DIF_SUBR_HTONS:
5213 case DIF_SUBR_NTOHS:
5214#if BYTE_ORDER == BIG_ENDIAN
5215 regs[rd] = (uint16_t)tupregs[0].dttk_value;
5216#else
5217 regs[rd] = DT_BSWAP_16((uint16_t)tupregs[0].dttk_value);
5218#endif
5219 break;
5220
5221
5222 case DIF_SUBR_HTONL:
5223 case DIF_SUBR_NTOHL:
5224#if BYTE_ORDER == BIG_ENDIAN
5225 regs[rd] = (uint32_t)tupregs[0].dttk_value;
5226#else
5227 regs[rd] = DT_BSWAP_32((uint32_t)tupregs[0].dttk_value);
5228#endif
5229 break;
5230
5231
5232 case DIF_SUBR_HTONLL:
5233 case DIF_SUBR_NTOHLL:
5234#if BYTE_ORDER == BIG_ENDIAN
5235 regs[rd] = (uint64_t)tupregs[0].dttk_value;
5236#else
5237 regs[rd] = DT_BSWAP_64((uint64_t)tupregs[0].dttk_value);
5238#endif
5239 break;
5240
5241
5242 case DIF_SUBR_DIRNAME:
5243 case DIF_SUBR_BASENAME: {
5244 char *dest = (char *)mstate->dtms_scratch_ptr;
5245 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5246 uintptr_t src = tupregs[0].dttk_value;
5247 int i, j, len = dtrace_strlen((char *)src, size);
5248 int lastbase = -1, firstbase = -1, lastdir = -1;
5249 int start, end;
5250
5251 if (!dtrace_canload(src, len + 1, mstate, vstate)) {
5252 regs[rd] = 0;
5253 break;
5254 }
5255
5256 if (!DTRACE_INSCRATCH(mstate, size)) {
5257 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5258 regs[rd] = 0;
5259 break;
5260 }
5261
5262 /*
5263		 * The basename and dirname of a zero-length string are
5264		 * defined to be ".".
5265 */
5266 if (len == 0) {
5267 len = 1;
5268 src = (uintptr_t)".";
5269 }
5270
5271 /*
5272 * Start from the back of the string, moving back toward the
5273 * front until we see a character that isn't a slash. That
5274 * character is the last character in the basename.
5275 */
5276 for (i = len - 1; i >= 0; i--) {
5277 if (dtrace_load8(src + i) != '/')
5278 break;
5279 }
5280
5281 if (i >= 0)
5282 lastbase = i;
5283
5284 /*
5285 * Starting from the last character in the basename, move
5286 * towards the front until we find a slash. The character
5287 * that we processed immediately before that is the first
5288 * character in the basename.
5289 */
5290 for (; i >= 0; i--) {
5291 if (dtrace_load8(src + i) == '/')
5292 break;
5293 }
5294
5295 if (i >= 0)
5296 firstbase = i + 1;
5297
5298 /*
5299 * Now keep going until we find a non-slash character. That
5300 * character is the last character in the dirname.
5301 */
5302 for (; i >= 0; i--) {
5303 if (dtrace_load8(src + i) != '/')
5304 break;
5305 }
5306
5307 if (i >= 0)
5308 lastdir = i;
5309
5310 ASSERT(!(lastbase == -1 && firstbase != -1));
5311 ASSERT(!(firstbase == -1 && lastdir != -1));
5312
5313 if (lastbase == -1) {
5314 /*
5315 * We didn't find a non-slash character. We know that
5316 * the length is non-zero, so the whole string must be
5317 * slashes. In either the dirname or the basename
5318 * case, we return '/'.
5319 */
5320 ASSERT(firstbase == -1);
5321 firstbase = lastbase = lastdir = 0;
5322 }
5323
5324 if (firstbase == -1) {
5325 /*
5326 * The entire string consists only of a basename
5327 * component. If we're looking for dirname, we need
5328 * to change our string to be just "."; if we're
5329 * looking for a basename, we'll just set the first
5330 * character of the basename to be 0.
5331 */
5332 if (subr == DIF_SUBR_DIRNAME) {
5333 ASSERT(lastdir == -1);
5334 src = (uintptr_t)".";
5335 lastdir = 0;
5336 } else {
5337 firstbase = 0;
5338 }
5339 }
5340
5341 if (subr == DIF_SUBR_DIRNAME) {
5342 if (lastdir == -1) {
5343 /*
5344 * We know that we have a slash in the name --
5345 * or lastdir would be set to 0, above. And
5346 * because lastdir is -1, we know that this
5347 * slash must be the first character. (That
5348 * is, the full string must be of the form
5349 * "/basename".) In this case, the last
5350 * character of the directory name is 0.
5351 */
5352 lastdir = 0;
5353 }
5354
5355 start = 0;
5356 end = lastdir;
5357 } else {
5358 ASSERT(subr == DIF_SUBR_BASENAME);
5359 ASSERT(firstbase != -1 && lastbase != -1);
5360 start = firstbase;
5361 end = lastbase;
5362 }
5363
5364 for (i = start, j = 0; i <= end && j < size - 1; i++, j++)
5365 dest[j] = dtrace_load8(src + i);
5366
5367 dest[j] = '\0';
5368 regs[rd] = (uintptr_t)dest;
5369 mstate->dtms_scratch_ptr += size;
5370 break;
5371 }
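	/*
	 * For example, given "/foo/bar//", the scans above leave
	 * lastbase = 7, firstbase = 5 and lastdir = 3, so basename()
	 * copies out "bar" and dirname() copies out "/foo"; trailing
	 * slashes never survive in either result.
	 */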
5372
5373 case DIF_SUBR_GETF: {
5374 uintptr_t fd = tupregs[0].dttk_value;
5375 struct filedesc *fdp;
5376 file_t *fp;
5377
5378 if (!dtrace_priv_proc(state)) {
5379 regs[rd] = 0;
5380 break;
5381 }
5382 fdp = curproc->p_fd;
5383 FILEDESC_SLOCK(fdp);
5384 fp = fget_locked(fdp, fd);
5385 mstate->dtms_getf = fp;
5386 regs[rd] = (uintptr_t)fp;
5387 FILEDESC_SUNLOCK(fdp);
5388 break;
5389 }
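	/*
	 * This is the FreeBSD implementation of getf(): e.g., getf(1)
	 * resolves descriptor 1 in the current process to its file_t
	 * (or NULL), and the pointer is remembered in dtms_getf so
	 * the framework knows a getf() occurred in this ECB.
	 */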
5390
5391 case DIF_SUBR_CLEANPATH: {
5392 char *dest = (char *)mstate->dtms_scratch_ptr, c;
5393 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5394 uintptr_t src = tupregs[0].dttk_value;
5395 int i = 0, j = 0;
5396#if defined(sun)
5397 zone_t *z;
5398#endif
5399
5400 if (!dtrace_strcanload(src, size, mstate, vstate)) {
5401 regs[rd] = 0;
5402 break;
5403 }
5404
5405 if (!DTRACE_INSCRATCH(mstate, size)) {
5406 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5407 regs[rd] = 0;
5408 break;
5409 }
5410
5411 /*
5412 * Move forward, loading each character.
5413 */
5414 do {
5415 c = dtrace_load8(src + i++);
5416next:
5417 if (j + 5 >= size) /* 5 = strlen("/..c\0") */
5418 break;
5419
5420 if (c != '/') {
5421 dest[j++] = c;
5422 continue;
5423 }
5424
5425 c = dtrace_load8(src + i++);
5426
5427 if (c == '/') {
5428 /*
5429 * We have two slashes -- we can just advance
5430 * to the next character.
5431 */
5432 goto next;
5433 }
5434
5435 if (c != '.') {
5436 /*
5437 * This is not "." and it's not ".." -- we can
5438 * just store the "/" and this character and
5439 * drive on.
5440 */
5441 dest[j++] = '/';
5442 dest[j++] = c;
5443 continue;
5444 }
5445
5446 c = dtrace_load8(src + i++);
5447
5448 if (c == '/') {
5449 /*
5450 * This is a "/./" component. We're not going
5451 * to store anything in the destination buffer;
5452 * we're just going to go to the next component.
5453 */
5454 goto next;
5455 }
5456
5457 if (c != '.') {
5458 /*
5459 * This is not ".." -- we can just store the
5460 * "/." and this character and continue
5461 * processing.
5462 */
5463 dest[j++] = '/';
5464 dest[j++] = '.';
5465 dest[j++] = c;
5466 continue;
5467 }
5468
5469 c = dtrace_load8(src + i++);
5470
5471 if (c != '/' && c != '\0') {
5472 /*
5473 * This is not ".." -- it's "..[mumble]".
5474 * We'll store the "/.." and this character
5475 * and continue processing.
5476 */
5477 dest[j++] = '/';
5478 dest[j++] = '.';
5479 dest[j++] = '.';
5480 dest[j++] = c;
5481 continue;
5482 }
5483
5484 /*
5485 * This is "/../" or "/..\0". We need to back up
5486 * our destination pointer until we find a "/".
5487 */
5488 i--;
5489 while (j != 0 && dest[--j] != '/')
5490 continue;
5491
5492 if (c == '\0')
5493 dest[++j] = '/';
5494 } while (c != '\0');
5495
5496 dest[j] = '\0';
5497
5498#if defined(sun)
5499 if (mstate->dtms_getf != NULL &&
5500 !(mstate->dtms_access & DTRACE_ACCESS_KERNEL) &&
5501 (z = state->dts_cred.dcr_cred->cr_zone) != kcred->cr_zone) {
5502 /*
5503 * If we've done a getf() as a part of this ECB and we
5504 * don't have kernel access (and we're not in the global
5505 * zone), check if the path we cleaned up begins with
5506 * the zone's root path, and trim it off if so. Note
5507 * that this is an output cleanliness issue, not a
5508 * security issue: knowing one's zone root path does
5509 * not enable privilege escalation.
5510 */
5511 if (strstr(dest, z->zone_rootpath) == dest)
5512 dest += strlen(z->zone_rootpath) - 1;
5513 }
5514#endif
5515
5516 regs[rd] = (uintptr_t)dest;
5517 mstate->dtms_scratch_ptr += size;
5518 break;
5519 }
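	/*
	 * For example, cleanpath("/foo/../bar//./baz") drops the "//"
	 * and "/./" components and backs the destination pointer up
	 * over "/../", returning "/bar/baz".
	 */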
5520
5521 case DIF_SUBR_INET_NTOA:
5522 case DIF_SUBR_INET_NTOA6:
5523 case DIF_SUBR_INET_NTOP: {
5524 size_t size;
5525 int af, argi, i;
5526 char *base, *end;
5527
5528 if (subr == DIF_SUBR_INET_NTOP) {
5529 af = (int)tupregs[0].dttk_value;
5530 argi = 1;
5531 } else {
5532 af = subr == DIF_SUBR_INET_NTOA ? AF_INET: AF_INET6;
5533 argi = 0;
5534 }
5535
5536 if (af == AF_INET) {
5537 ipaddr_t ip4;
5538 uint8_t *ptr8, val;
5539
5540 /*
5541 * Safely load the IPv4 address.
5542 */
5543 ip4 = dtrace_load32(tupregs[argi].dttk_value);
5544
5545 /*
5546			 * Check that an IPv4 string will fit in scratch.
5547 */
5548 size = INET_ADDRSTRLEN;
5549 if (!DTRACE_INSCRATCH(mstate, size)) {
5550 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5551 regs[rd] = 0;
5552 break;
5553 }
5554 base = (char *)mstate->dtms_scratch_ptr;
5555 end = (char *)mstate->dtms_scratch_ptr + size - 1;
5556
5557 /*
5558 * Stringify as a dotted decimal quad.
5559 */
5560 *end-- = '\0';
5561 ptr8 = (uint8_t *)&ip4;
5562 for (i = 3; i >= 0; i--) {
5563 val = ptr8[i];
5564
5565 if (val == 0) {
5566 *end-- = '0';
5567 } else {
5568 for (; val; val /= 10) {
5569 *end-- = '0' + (val % 10);
5570 }
5571 }
5572
5573 if (i > 0)
5574 *end-- = '.';
5575 }
5576 ASSERT(end + 1 >= base);
5577
5578 } else if (af == AF_INET6) {
5579 struct in6_addr ip6;
5580 int firstzero, tryzero, numzero, v6end;
5581 uint16_t val;
5582 const char digits[] = "0123456789abcdef";
5583
5584 /*
5585			 * Stringify using RFC 1884 convention 2: 16-bit
5586			 * hexadecimal values with zero-run compression and
5587			 * lowercase hexadecimal digits,
5588			 * e.g. fe80::214:4fff:fe0b:76c8.
5589			 * The IPv4-embedded form is returned for inet_ntop;
5590			 * just the IPv4 string is returned for inet_ntoa6.
5591 */
5592
5593 /*
5594 * Safely load the IPv6 address.
5595 */
5596 dtrace_bcopy(
5597 (void *)(uintptr_t)tupregs[argi].dttk_value,
5598 (void *)(uintptr_t)&ip6, sizeof (struct in6_addr));
5599
5600 /*
5601			 * Check that an IPv6 string will fit in scratch.
5602 */
5603 size = INET6_ADDRSTRLEN;
5604 if (!DTRACE_INSCRATCH(mstate, size)) {
5605 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5606 regs[rd] = 0;
5607 break;
5608 }
5609 base = (char *)mstate->dtms_scratch_ptr;
5610 end = (char *)mstate->dtms_scratch_ptr + size - 1;
5611 *end-- = '\0';
5612
5613 /*
5614 * Find the longest run of 16 bit zero values
5615 * for the single allowed zero compression - "::".
5616 */
5617 firstzero = -1;
5618 tryzero = -1;
5619 numzero = 1;
5620 for (i = 0; i < sizeof (struct in6_addr); i++) {
5621#if defined(sun)
5622 if (ip6._S6_un._S6_u8[i] == 0 &&
5623#else
5624 if (ip6.__u6_addr.__u6_addr8[i] == 0 &&
5625#endif
5626 tryzero == -1 && i % 2 == 0) {
5627 tryzero = i;
5628 continue;
5629 }
5630
5631 if (tryzero != -1 &&
5632#if defined(sun)
5633 (ip6._S6_un._S6_u8[i] != 0 ||
5634#else
5635 (ip6.__u6_addr.__u6_addr8[i] != 0 ||
5636#endif
5637 i == sizeof (struct in6_addr) - 1)) {
5638
5639 if (i - tryzero <= numzero) {
5640 tryzero = -1;
5641 continue;
5642 }
5643
5644 firstzero = tryzero;
5645 numzero = i - i % 2 - tryzero;
5646 tryzero = -1;
5647
5648#if defined(sun)
5649 if (ip6._S6_un._S6_u8[i] == 0 &&
5650#else
5651 if (ip6.__u6_addr.__u6_addr8[i] == 0 &&
5652#endif
5653 i == sizeof (struct in6_addr) - 1)
5654 numzero += 2;
5655 }
5656 }
5657 ASSERT(firstzero + numzero <= sizeof (struct in6_addr));
5658
5659 /*
5660 * Check for an IPv4 embedded address.
5661 */
5662 v6end = sizeof (struct in6_addr) - 2;
5663 if (IN6_IS_ADDR_V4MAPPED(&ip6) ||
5664 IN6_IS_ADDR_V4COMPAT(&ip6)) {
5665 for (i = sizeof (struct in6_addr) - 1;
5666 i >= DTRACE_V4MAPPED_OFFSET; i--) {
5667 ASSERT(end >= base);
5668
5669#if defined(sun)
5670 val = ip6._S6_un._S6_u8[i];
5671#else
5672 val = ip6.__u6_addr.__u6_addr8[i];
5673#endif
5674
5675 if (val == 0) {
5676 *end-- = '0';
5677 } else {
5678 for (; val; val /= 10) {
5679 *end-- = '0' + val % 10;
5680 }
5681 }
5682
5683 if (i > DTRACE_V4MAPPED_OFFSET)
5684 *end-- = '.';
5685 }
5686
5687 if (subr == DIF_SUBR_INET_NTOA6)
5688 goto inetout;
5689
5690 /*
5691 * Set v6end to skip the IPv4 address that
5692 * we have already stringified.
5693 */
5694 v6end = 10;
5695 }
5696
5697 /*
5698 * Build the IPv6 string by working through the
5699 * address in reverse.
5700 */
5701 for (i = v6end; i >= 0; i -= 2) {
5702 ASSERT(end >= base);
5703
5704 if (i == firstzero + numzero - 2) {
5705 *end-- = ':';
5706 *end-- = ':';
5707 i -= numzero - 2;
5708 continue;
5709 }
5710
5711 if (i < 14 && i != firstzero - 2)
5712 *end-- = ':';
5713
5714#if defined(sun)
5715 val = (ip6._S6_un._S6_u8[i] << 8) +
5716 ip6._S6_un._S6_u8[i + 1];
5717#else
5718 val = (ip6.__u6_addr.__u6_addr8[i] << 8) +
5719 ip6.__u6_addr.__u6_addr8[i + 1];
5720#endif
5721
5722 if (val == 0) {
5723 *end-- = '0';
5724 } else {
5725 for (; val; val /= 16) {
5726 *end-- = digits[val % 16];
5727 }
5728 }
5729 }
5730 ASSERT(end + 1 >= base);
5731
5732 } else {
5733 /*
5734			 * The user didn't use AF_INET or AF_INET6.
5735 */
5736 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5737 regs[rd] = 0;
5738 break;
5739 }
5740
5741inetout: regs[rd] = (uintptr_t)end + 1;
5742 mstate->dtms_scratch_ptr += size;
5743 break;
5744 }
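	/*
	 * For example, inet_ntop(AF_INET6, ...) applied to ::1 finds
	 * the 14-byte zero run at offset 0, emits the one allowed "::"
	 * for it, renders the final group as "1", and returns "::1".
	 */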
5745
5746 case DIF_SUBR_MEMREF: {
5747 uintptr_t size = 2 * sizeof(uintptr_t);
5748 uintptr_t *memref = (uintptr_t *) P2ROUNDUP(mstate->dtms_scratch_ptr, sizeof(uintptr_t));
5749 size_t scratch_size = ((uintptr_t) memref - mstate->dtms_scratch_ptr) + size;
5750
5751 /* address and length */
5752 memref[0] = tupregs[0].dttk_value;
5753 memref[1] = tupregs[1].dttk_value;
5754
5755 regs[rd] = (uintptr_t) memref;
5756 mstate->dtms_scratch_ptr += scratch_size;
5757 break;
5758 }
5759
5760 case DIF_SUBR_TYPEREF: {
5761 uintptr_t size = 4 * sizeof(uintptr_t);
5762 uintptr_t *typeref = (uintptr_t *) P2ROUNDUP(mstate->dtms_scratch_ptr, sizeof(uintptr_t));
5763 size_t scratch_size = ((uintptr_t) typeref - mstate->dtms_scratch_ptr) + size;
5764
5765 /* address, num_elements, type_str, type_len */
5766 typeref[0] = tupregs[0].dttk_value;
5767 typeref[1] = tupregs[1].dttk_value;
5768 typeref[2] = tupregs[2].dttk_value;
5769 typeref[3] = tupregs[3].dttk_value;
5770
5771 regs[rd] = (uintptr_t) typeref;
5772 mstate->dtms_scratch_ptr += scratch_size;
5773 break;
5774 }
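	/*
	 * memref() and typeref() only round dtms_scratch_ptr up to
	 * pointer alignment (P2ROUNDUP: e.g., 0x1004 becomes 0x1008
	 * with 8-byte pointers) and publish the raw tuple; the
	 * consumer is expected to interpret it.
	 */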
5775 }
5776}
5777
5778/*
5779 * Emulate the execution of DTrace IR instructions specified by the given
5780 * DIF object. This function is deliberately void of assertions as all of
5781 * the necessary checks are handled by a call to dtrace_difo_validate().
5782 */
5783static uint64_t
5784dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate,
5785 dtrace_vstate_t *vstate, dtrace_state_t *state)
5786{
5787 const dif_instr_t *text = difo->dtdo_buf;
5788 const uint_t textlen = difo->dtdo_len;
5789 const char *strtab = difo->dtdo_strtab;
5790 const uint64_t *inttab = difo->dtdo_inttab;
5791
5792 uint64_t rval = 0;
5793 dtrace_statvar_t *svar;
5794 dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
5795 dtrace_difv_t *v;
5796 volatile uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags;
5797 volatile uintptr_t *illval = &cpu_core[curcpu].cpuc_dtrace_illval;
5798
5799 dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
5800 uint64_t regs[DIF_DIR_NREGS];
5801 uint64_t *tmp;
5802
5803 uint8_t cc_n = 0, cc_z = 0, cc_v = 0, cc_c = 0;
5804 int64_t cc_r;
5805 uint_t pc = 0, id, opc = 0;
5806 uint8_t ttop = 0;
5807 dif_instr_t instr;
5808 uint_t r1, r2, rd;
5809
5810 /*
5811 * We stash the current DIF object into the machine state: we need it
5812 * for subsequent access checking.
5813 */
5814 mstate->dtms_difo = difo;
5815
5816 regs[DIF_REG_R0] = 0; /* %r0 is fixed at zero */
5817
5818 while (pc < textlen && !(*flags & CPU_DTRACE_FAULT)) {
5819 opc = pc;
5820
5821 instr = text[pc++];
5822 r1 = DIF_INSTR_R1(instr);
5823 r2 = DIF_INSTR_R2(instr);
5824 rd = DIF_INSTR_RD(instr);
5825
5826 switch (DIF_INSTR_OP(instr)) {
5827 case DIF_OP_OR:
5828 regs[rd] = regs[r1] | regs[r2];
5829 break;
5830 case DIF_OP_XOR:
5831 regs[rd] = regs[r1] ^ regs[r2];
5832 break;
5833 case DIF_OP_AND:
5834 regs[rd] = regs[r1] & regs[r2];
5835 break;
5836 case DIF_OP_SLL:
5837 regs[rd] = regs[r1] << regs[r2];
5838 break;
5839 case DIF_OP_SRL:
5840 regs[rd] = regs[r1] >> regs[r2];
5841 break;
5842 case DIF_OP_SUB:
5843 regs[rd] = regs[r1] - regs[r2];
5844 break;
5845 case DIF_OP_ADD:
5846 regs[rd] = regs[r1] + regs[r2];
5847 break;
5848 case DIF_OP_MUL:
5849 regs[rd] = regs[r1] * regs[r2];
5850 break;
5851 case DIF_OP_SDIV:
5852 if (regs[r2] == 0) {
5853 regs[rd] = 0;
5854 *flags |= CPU_DTRACE_DIVZERO;
5855 } else {
5856 regs[rd] = (int64_t)regs[r1] /
5857 (int64_t)regs[r2];
5858 }
5859 break;
5860
5861 case DIF_OP_UDIV:
5862 if (regs[r2] == 0) {
5863 regs[rd] = 0;
5864 *flags |= CPU_DTRACE_DIVZERO;
5865 } else {
5866 regs[rd] = regs[r1] / regs[r2];
5867 }
5868 break;
5869
5870 case DIF_OP_SREM:
5871 if (regs[r2] == 0) {
5872 regs[rd] = 0;
5873 *flags |= CPU_DTRACE_DIVZERO;
5874 } else {
5875 regs[rd] = (int64_t)regs[r1] %
5876 (int64_t)regs[r2];
5877 }
5878 break;
5879
5880 case DIF_OP_UREM:
5881 if (regs[r2] == 0) {
5882 regs[rd] = 0;
5883 *flags |= CPU_DTRACE_DIVZERO;
5884 } else {
5885 regs[rd] = regs[r1] % regs[r2];
5886 }
5887 break;
5888
5889 case DIF_OP_NOT:
5890 regs[rd] = ~regs[r1];
5891 break;
5892 case DIF_OP_MOV:
5893 regs[rd] = regs[r1];
5894 break;
5895 case DIF_OP_CMP:
5896 cc_r = regs[r1] - regs[r2];
5897 cc_n = cc_r < 0;
5898 cc_z = cc_r == 0;
5899 cc_v = 0;
5900 cc_c = regs[r1] < regs[r2];
5901 break;
5902 case DIF_OP_TST:
5903 cc_n = cc_v = cc_c = 0;
5904 cc_z = regs[r1] == 0;
5905 break;
5906 case DIF_OP_BA:
5907 pc = DIF_INSTR_LABEL(instr);
5908 break;
5909 case DIF_OP_BE:
5910 if (cc_z)
5911 pc = DIF_INSTR_LABEL(instr);
5912 break;
5913 case DIF_OP_BNE:
5914 if (cc_z == 0)
5915 pc = DIF_INSTR_LABEL(instr);
5916 break;
5917 case DIF_OP_BG:
5918 if ((cc_z | (cc_n ^ cc_v)) == 0)
5919 pc = DIF_INSTR_LABEL(instr);
5920 break;
5921 case DIF_OP_BGU:
5922 if ((cc_c | cc_z) == 0)
5923 pc = DIF_INSTR_LABEL(instr);
5924 break;
5925 case DIF_OP_BGE:
5926 if ((cc_n ^ cc_v) == 0)
5927 pc = DIF_INSTR_LABEL(instr);
5928 break;
5929 case DIF_OP_BGEU:
5930 if (cc_c == 0)
5931 pc = DIF_INSTR_LABEL(instr);
5932 break;
5933 case DIF_OP_BL:
5934 if (cc_n ^ cc_v)
5935 pc = DIF_INSTR_LABEL(instr);
5936 break;
5937 case DIF_OP_BLU:
5938 if (cc_c)
5939 pc = DIF_INSTR_LABEL(instr);
5940 break;
5941 case DIF_OP_BLE:
5942 if (cc_z | (cc_n ^ cc_v))
5943 pc = DIF_INSTR_LABEL(instr);
5944 break;
5945 case DIF_OP_BLEU:
5946 if (cc_c | cc_z)
5947 pc = DIF_INSTR_LABEL(instr);
5948 break;
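		/*
		 * The branches above consume the condition codes set by
		 * DIF_OP_CMP/DIF_OP_TST: e.g., after a cmp of regs[r1] = 2
		 * with regs[r2] = 5, cc_n = cc_c = 1 and cc_z = cc_v = 0,
		 * so both the signed "bl" and the unsigned "blu" are taken.
		 */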
5949 case DIF_OP_RLDSB:
5950 if (!dtrace_canload(regs[r1], 1, mstate, vstate))
5951 break;
5952 /*FALLTHROUGH*/
5953 case DIF_OP_LDSB:
5954 regs[rd] = (int8_t)dtrace_load8(regs[r1]);
5955 break;
5956 case DIF_OP_RLDSH:
5957 if (!dtrace_canload(regs[r1], 2, mstate, vstate))
5958 break;
5959 /*FALLTHROUGH*/
5960 case DIF_OP_LDSH:
5961 regs[rd] = (int16_t)dtrace_load16(regs[r1]);
5962 break;
5963 case DIF_OP_RLDSW:
5964 if (!dtrace_canload(regs[r1], 4, mstate, vstate))
5965 break;
5966 /*FALLTHROUGH*/
5967 case DIF_OP_LDSW:
5968 regs[rd] = (int32_t)dtrace_load32(regs[r1]);
5969 break;
5970 case DIF_OP_RLDUB:
5971 if (!dtrace_canload(regs[r1], 1, mstate, vstate))
5972 break;
5973 /*FALLTHROUGH*/
5974 case DIF_OP_LDUB:
5975 regs[rd] = dtrace_load8(regs[r1]);
5976 break;
5977 case DIF_OP_RLDUH:
5978 if (!dtrace_canload(regs[r1], 2, mstate, vstate))
5979 break;
5980 /*FALLTHROUGH*/
5981 case DIF_OP_LDUH:
5982 regs[rd] = dtrace_load16(regs[r1]);
5983 break;
5984 case DIF_OP_RLDUW:
5985 if (!dtrace_canload(regs[r1], 4, mstate, vstate))
5986 break;
5987 /*FALLTHROUGH*/
5988 case DIF_OP_LDUW:
5989 regs[rd] = dtrace_load32(regs[r1]);
5990 break;
5991 case DIF_OP_RLDX:
5992 if (!dtrace_canload(regs[r1], 8, mstate, vstate))
5993 break;
5994 /*FALLTHROUGH*/
5995 case DIF_OP_LDX:
5996 regs[rd] = dtrace_load64(regs[r1]);
5997 break;
5998 case DIF_OP_ULDSB:
5999 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6000 regs[rd] = (int8_t)
6001 dtrace_fuword8((void *)(uintptr_t)regs[r1]);
6002 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6003 break;
6004 case DIF_OP_ULDSH:
6005 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6006 regs[rd] = (int16_t)
6007 dtrace_fuword16((void *)(uintptr_t)regs[r1]);
6008 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6009 break;
6010 case DIF_OP_ULDSW:
6011 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6012 regs[rd] = (int32_t)
6013 dtrace_fuword32((void *)(uintptr_t)regs[r1]);
6014 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6015 break;
6016 case DIF_OP_ULDUB:
6017 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6018 regs[rd] =
6019 dtrace_fuword8((void *)(uintptr_t)regs[r1]);
6020 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6021 break;
6022 case DIF_OP_ULDUH:
6023 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6024 regs[rd] =
6025 dtrace_fuword16((void *)(uintptr_t)regs[r1]);
6026 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6027 break;
6028 case DIF_OP_ULDUW:
6029 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6030 regs[rd] =
6031 dtrace_fuword32((void *)(uintptr_t)regs[r1]);
6032 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6033 break;
6034 case DIF_OP_ULDX:
6035 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6036 regs[rd] =
6037 dtrace_fuword64((void *)(uintptr_t)regs[r1]);
6038 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
5209 break;
5210 case DIF_OP_RET:
5211 rval = regs[rd];
5212 pc = textlen;
5213 break;
5214 case DIF_OP_NOP:
5215 break;
5216 case DIF_OP_SETX:
5217 regs[rd] = inttab[DIF_INSTR_INTEGER(instr)];
5218 break;
5219 case DIF_OP_SETS:
5220 regs[rd] = (uint64_t)(uintptr_t)
5221 (strtab + DIF_INSTR_STRING(instr));
5222 break;
5223 case DIF_OP_SCMP: {
5224 size_t sz = state->dts_options[DTRACEOPT_STRSIZE];
5225 uintptr_t s1 = regs[r1];
5226 uintptr_t s2 = regs[r2];
5227
5228 if (s1 != 0 &&
5229 !dtrace_strcanload(s1, sz, mstate, vstate))
5230 break;
5231 if (s2 != 0 &&
5232 !dtrace_strcanload(s2, sz, mstate, vstate))
5233 break;
5234
5235 cc_r = dtrace_strncmp((char *)s1, (char *)s2, sz);
5236
5237 cc_n = cc_r < 0;
5238 cc_z = cc_r == 0;
5239 cc_v = cc_c = 0;
5240 break;
5241 }
5242 case DIF_OP_LDGA:
5243 regs[rd] = dtrace_dif_variable(mstate, state,
5244 r1, regs[r2]);
5245 break;
5246 case DIF_OP_LDGS:
5247 id = DIF_INSTR_VAR(instr);
5248
5249 if (id >= DIF_VAR_OTHER_UBASE) {
5250 uintptr_t a;
5251
5252 id -= DIF_VAR_OTHER_UBASE;
5253 svar = vstate->dtvs_globals[id];
5254 ASSERT(svar != NULL);
5255 v = &svar->dtsv_var;
5256
5257 if (!(v->dtdv_type.dtdt_flags & DIF_TF_BYREF)) {
5258 regs[rd] = svar->dtsv_data;
5259 break;
5260 }
5261
5262 a = (uintptr_t)svar->dtsv_data;
5263
5264 if (*(uint8_t *)a == UINT8_MAX) {
5265 /*
5266 * If the 0th byte is set to UINT8_MAX
5267 * then this is to be treated as a
5268 * reference to a NULL variable.
5269 */
5270 regs[rd] = 0;
5271 } else {
5272 regs[rd] = a + sizeof (uint64_t);
5273 }
5274
5275 break;
5276 }
5277
5278 regs[rd] = dtrace_dif_variable(mstate, state, id, 0);
5279 break;
5280
5281 case DIF_OP_STGS:
5282 id = DIF_INSTR_VAR(instr);
5283
5284 ASSERT(id >= DIF_VAR_OTHER_UBASE);
5285 id -= DIF_VAR_OTHER_UBASE;
5286
5287 svar = vstate->dtvs_globals[id];
5288 ASSERT(svar != NULL);
5289 v = &svar->dtsv_var;
5290
5291 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5292 uintptr_t a = (uintptr_t)svar->dtsv_data;
5293
5294 ASSERT(a != 0);
5295 ASSERT(svar->dtsv_size != 0);
5296
5297 if (regs[rd] == 0) {
5298 *(uint8_t *)a = UINT8_MAX;
5299 break;
5300 } else {
5301 *(uint8_t *)a = 0;
5302 a += sizeof (uint64_t);
5303 }
5304 if (!dtrace_vcanload(
5305 (void *)(uintptr_t)regs[rd], &v->dtdv_type,
5306 mstate, vstate))
5307 break;
5308
5309 dtrace_vcopy((void *)(uintptr_t)regs[rd],
5310 (void *)a, &v->dtdv_type);
5311 break;
5312 }
5313
5314 svar->dtsv_data = regs[rd];
5315 break;
5316
5317 case DIF_OP_LDTA:
5318 /*
5319 * There are no DTrace built-in thread-local arrays at
5320 * present. This opcode is saved for future work.
5321 */
5322 *flags |= CPU_DTRACE_ILLOP;
5323 regs[rd] = 0;
5324 break;
5325
5326 case DIF_OP_LDLS:
5327 id = DIF_INSTR_VAR(instr);
5328
5329 if (id < DIF_VAR_OTHER_UBASE) {
5330 /*
5331 * For now, this has no meaning.
5332 */
5333 regs[rd] = 0;
5334 break;
5335 }
5336
5337 id -= DIF_VAR_OTHER_UBASE;
5338
5339 ASSERT(id < vstate->dtvs_nlocals);
5340 ASSERT(vstate->dtvs_locals != NULL);
5341
5342 svar = vstate->dtvs_locals[id];
5343 ASSERT(svar != NULL);
5344 v = &svar->dtsv_var;
5345
5346 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5347 uintptr_t a = (uintptr_t)svar->dtsv_data;
5348 size_t sz = v->dtdv_type.dtdt_size;
5349
5350 sz += sizeof (uint64_t);
5351 ASSERT(svar->dtsv_size == NCPU * sz);
5352 a += curcpu * sz;
5353
5354 if (*(uint8_t *)a == UINT8_MAX) {
5355 /*
5356 * If the 0th byte is set to UINT8_MAX
5357 * then this is to be treated as a
5358 * reference to a NULL variable.
5359 */
5360 regs[rd] = 0;
5361 } else {
5362 regs[rd] = a + sizeof (uint64_t);
5363 }
5364
5365 break;
5366 }
5367
5368 ASSERT(svar->dtsv_size == NCPU * sizeof (uint64_t));
5369 tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
5370 regs[rd] = tmp[curcpu];
5371 break;
5372
5373 case DIF_OP_STLS:
5374 id = DIF_INSTR_VAR(instr);
5375
5376 ASSERT(id >= DIF_VAR_OTHER_UBASE);
5377 id -= DIF_VAR_OTHER_UBASE;
5378 ASSERT(id < vstate->dtvs_nlocals);
5379
5380 ASSERT(vstate->dtvs_locals != NULL);
5381 svar = vstate->dtvs_locals[id];
5382 ASSERT(svar != NULL);
5383 v = &svar->dtsv_var;
5384
5385 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5386 uintptr_t a = (uintptr_t)svar->dtsv_data;
5387 size_t sz = v->dtdv_type.dtdt_size;
5388
5389 sz += sizeof (uint64_t);
5390 ASSERT(svar->dtsv_size == NCPU * sz);
5391 a += curcpu * sz;
5392
5393 if (regs[rd] == 0) {
5394 *(uint8_t *)a = UINT8_MAX;
5395 break;
5396 } else {
5397 *(uint8_t *)a = 0;
5398 a += sizeof (uint64_t);
5399 }
5400
5401 if (!dtrace_vcanload(
5402 (void *)(uintptr_t)regs[rd], &v->dtdv_type,
5403 mstate, vstate))
5404 break;
5405
5406 dtrace_vcopy((void *)(uintptr_t)regs[rd],
5407 (void *)a, &v->dtdv_type);
5408 break;
5409 }
5410
5411 ASSERT(svar->dtsv_size == NCPU * sizeof (uint64_t));
5412 tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
5413 tmp[curcpu] = regs[rd];
5414 break;
5415
5416 case DIF_OP_LDTS: {
5417 dtrace_dynvar_t *dvar;
5418 dtrace_key_t *key;
5419
5420 id = DIF_INSTR_VAR(instr);
5421 ASSERT(id >= DIF_VAR_OTHER_UBASE);
5422 id -= DIF_VAR_OTHER_UBASE;
5423 v = &vstate->dtvs_tlocals[id];
5424
5425 key = &tupregs[DIF_DTR_NREGS];
5426 key[0].dttk_value = (uint64_t)id;
5427 key[0].dttk_size = 0;
5428 DTRACE_TLS_THRKEY(key[1].dttk_value);
5429 key[1].dttk_size = 0;
5430
5431 dvar = dtrace_dynvar(dstate, 2, key,
5432 sizeof (uint64_t), DTRACE_DYNVAR_NOALLOC,
5433 mstate, vstate);
5434
5435 if (dvar == NULL) {
5436 regs[rd] = 0;
5437 break;
5438 }
5439
5440 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5441 regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
5442 } else {
5443 regs[rd] = *((uint64_t *)dvar->dtdv_data);
5444 }
5445
5446 break;
5447 }
5448
5449 case DIF_OP_STTS: {
5450 dtrace_dynvar_t *dvar;
5451 dtrace_key_t *key;
5452
5453 id = DIF_INSTR_VAR(instr);
5454 ASSERT(id >= DIF_VAR_OTHER_UBASE);
5455 id -= DIF_VAR_OTHER_UBASE;
5456
5457 key = &tupregs[DIF_DTR_NREGS];
5458 key[0].dttk_value = (uint64_t)id;
5459 key[0].dttk_size = 0;
5460 DTRACE_TLS_THRKEY(key[1].dttk_value);
5461 key[1].dttk_size = 0;
5462 v = &vstate->dtvs_tlocals[id];
5463
5464 dvar = dtrace_dynvar(dstate, 2, key,
5465 v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
5466 v->dtdv_type.dtdt_size : sizeof (uint64_t),
5467 regs[rd] ? DTRACE_DYNVAR_ALLOC :
5468 DTRACE_DYNVAR_DEALLOC, mstate, vstate);
5469
5470 /*
5471 * Given that we're storing to thread-local data,
5472 * we need to flush our predicate cache.
5473 */
5474 curthread->t_predcache = 0;
5475
5476 if (dvar == NULL)
5477 break;
5478
5479 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5480 if (!dtrace_vcanload(
5481 (void *)(uintptr_t)regs[rd],
5482 &v->dtdv_type, mstate, vstate))
5483 break;
5484
5485 dtrace_vcopy((void *)(uintptr_t)regs[rd],
5486 dvar->dtdv_data, &v->dtdv_type);
5487 } else {
5488 *((uint64_t *)dvar->dtdv_data) = regs[rd];
5489 }
5490
5491 break;
5492 }
5493
5494 case DIF_OP_SRA:
5495 regs[rd] = (int64_t)regs[r1] >> regs[r2];
5496 break;
5497
5498 case DIF_OP_CALL:
5499 dtrace_dif_subr(DIF_INSTR_SUBR(instr), rd,
5500 regs, tupregs, ttop, mstate, state);
5501 break;
5502
5503 case DIF_OP_PUSHTR:
5504 if (ttop == DIF_DTR_NREGS) {
5505 *flags |= CPU_DTRACE_TUPOFLOW;
5506 break;
5507 }
5508
5509 if (r1 == DIF_TYPE_STRING) {
5510 /*
5511 * If this is a string type and the size is 0,
5512 * we'll use the system-wide default string
5513 * size. Note that we are _not_ looking at
5514 * the value of the DTRACEOPT_STRSIZE option;
5515 * had this been set, we would expect to have
5516 * a non-zero size value in the "pushtr".
5517 */
5518 tupregs[ttop].dttk_size =
5519 dtrace_strlen((char *)(uintptr_t)regs[rd],
5520 regs[r2] ? regs[r2] :
5521 dtrace_strsize_default) + 1;
5522 } else {
5523 tupregs[ttop].dttk_size = regs[r2];
5524 }
5525
5526 tupregs[ttop++].dttk_value = regs[rd];
5527 break;
5528
5529 case DIF_OP_PUSHTV:
5530 if (ttop == DIF_DTR_NREGS) {
5531 *flags |= CPU_DTRACE_TUPOFLOW;
5532 break;
5533 }
5534
5535 tupregs[ttop].dttk_value = regs[rd];
5536 tupregs[ttop++].dttk_size = 0;
5537 break;
5538
5539 case DIF_OP_POPTS:
5540 if (ttop != 0)
5541 ttop--;
5542 break;
5543
5544 case DIF_OP_FLUSHTS:
5545 ttop = 0;
5546 break;
5547
5548 case DIF_OP_LDGAA:
5549 case DIF_OP_LDTAA: {
5550 dtrace_dynvar_t *dvar;
5551 dtrace_key_t *key = tupregs;
5552 uint_t nkeys = ttop;
5553
5554 id = DIF_INSTR_VAR(instr);
5555 ASSERT(id >= DIF_VAR_OTHER_UBASE);
5556 id -= DIF_VAR_OTHER_UBASE;
5557
5558 key[nkeys].dttk_value = (uint64_t)id;
5559 key[nkeys++].dttk_size = 0;
5560
5561 if (DIF_INSTR_OP(instr) == DIF_OP_LDTAA) {
5562 DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
5563 key[nkeys++].dttk_size = 0;
5564 v = &vstate->dtvs_tlocals[id];
5565 } else {
5566 v = &vstate->dtvs_globals[id]->dtsv_var;
5567 }
5568
5569 dvar = dtrace_dynvar(dstate, nkeys, key,
5570 v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
5571 v->dtdv_type.dtdt_size : sizeof (uint64_t),
5572 DTRACE_DYNVAR_NOALLOC, mstate, vstate);
5573
5574 if (dvar == NULL) {
5575 regs[rd] = 0;
5576 break;
5577 }
5578
5579 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5580 regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
5581 } else {
5582 regs[rd] = *((uint64_t *)dvar->dtdv_data);
5583 }
5584
5585 break;
5586 }
5587
5588 case DIF_OP_STGAA:
5589 case DIF_OP_STTAA: {
5590 dtrace_dynvar_t *dvar;
5591 dtrace_key_t *key = tupregs;
5592 uint_t nkeys = ttop;
5593
5594 id = DIF_INSTR_VAR(instr);
5595 ASSERT(id >= DIF_VAR_OTHER_UBASE);
5596 id -= DIF_VAR_OTHER_UBASE;
5597
5598 key[nkeys].dttk_value = (uint64_t)id;
5599 key[nkeys++].dttk_size = 0;
5600
5601 if (DIF_INSTR_OP(instr) == DIF_OP_STTAA) {
5602 DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
5603 key[nkeys++].dttk_size = 0;
5604 v = &vstate->dtvs_tlocals[id];
5605 } else {
5606 v = &vstate->dtvs_globals[id]->dtsv_var;
5607 }
5608
5609 dvar = dtrace_dynvar(dstate, nkeys, key,
5610 v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
5611 v->dtdv_type.dtdt_size : sizeof (uint64_t),
5612 regs[rd] ? DTRACE_DYNVAR_ALLOC :
5613 DTRACE_DYNVAR_DEALLOC, mstate, vstate);
5614
5615 if (dvar == NULL)
5616 break;
5617
5618 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5619 if (!dtrace_vcanload(
5620 (void *)(uintptr_t)regs[rd], &v->dtdv_type,
5621 mstate, vstate))
5622 break;
5623
5624 dtrace_vcopy((void *)(uintptr_t)regs[rd],
5625 dvar->dtdv_data, &v->dtdv_type);
5626 } else {
5627 *((uint64_t *)dvar->dtdv_data) = regs[rd];
5628 }
5629
5630 break;
5631 }
5632
5633 case DIF_OP_ALLOCS: {
5634 uintptr_t ptr = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
5635 size_t size = ptr - mstate->dtms_scratch_ptr + regs[r1];
5636
5637 /*
5638			 * Rounding up the user allocation size could cause a
5639			 * large, bogus allocation (like -1ULL) to overflow
5640			 * to 0.
5641 */
5642 if (size < regs[r1] ||
5643 !DTRACE_INSCRATCH(mstate, size)) {
5644 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5645 regs[rd] = 0;
5646 break;
5647 }
5648
5649 dtrace_bzero((void *) mstate->dtms_scratch_ptr, size);
5650 mstate->dtms_scratch_ptr += size;
5651 regs[rd] = ptr;
5652 break;
5653 }
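		/*
		 * E.g., a bogus alloca(-1) makes regs[r1] = 0xffff...f;
		 * either the computed size wraps below regs[r1] or it
		 * fails the scratch-bounds test, so the request is
		 * rejected with NOSCRATCH instead of being granted.
		 */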
5654
5655 case DIF_OP_COPYS:
5656 if (!dtrace_canstore(regs[rd], regs[r2],
5657 mstate, vstate)) {
5658 *flags |= CPU_DTRACE_BADADDR;
5659 *illval = regs[rd];
5660 break;
5661 }
5662
5663 if (!dtrace_canload(regs[r1], regs[r2], mstate, vstate))
5664 break;
5665
5666 dtrace_bcopy((void *)(uintptr_t)regs[r1],
5667 (void *)(uintptr_t)regs[rd], (size_t)regs[r2]);
5668 break;
5669
5670 case DIF_OP_STB:
5671 if (!dtrace_canstore(regs[rd], 1, mstate, vstate)) {
5672 *flags |= CPU_DTRACE_BADADDR;
5673 *illval = regs[rd];
5674 break;
5675 }
5676 *((uint8_t *)(uintptr_t)regs[rd]) = (uint8_t)regs[r1];
5677 break;
5678
5679 case DIF_OP_STH:
5680 if (!dtrace_canstore(regs[rd], 2, mstate, vstate)) {
5681 *flags |= CPU_DTRACE_BADADDR;
5682 *illval = regs[rd];
5683 break;
5684 }
5685 if (regs[rd] & 1) {
5686 *flags |= CPU_DTRACE_BADALIGN;
5687 *illval = regs[rd];
5688 break;
5689 }
5690 *((uint16_t *)(uintptr_t)regs[rd]) = (uint16_t)regs[r1];
5691 break;
5692
5693 case DIF_OP_STW:
5694 if (!dtrace_canstore(regs[rd], 4, mstate, vstate)) {
5695 *flags |= CPU_DTRACE_BADADDR;
5696 *illval = regs[rd];
5697 break;
5698 }
5699 if (regs[rd] & 3) {
5700 *flags |= CPU_DTRACE_BADALIGN;
5701 *illval = regs[rd];
5702 break;
5703 }
5704 *((uint32_t *)(uintptr_t)regs[rd]) = (uint32_t)regs[r1];
5705 break;
5706
5707 case DIF_OP_STX:
5708 if (!dtrace_canstore(regs[rd], 8, mstate, vstate)) {
5709 *flags |= CPU_DTRACE_BADADDR;
5710 *illval = regs[rd];
5711 break;
5712 }
5713 if (regs[rd] & 7) {
5714 *flags |= CPU_DTRACE_BADALIGN;
5715 *illval = regs[rd];
5716 break;
5717 }
5718 *((uint64_t *)(uintptr_t)regs[rd]) = regs[r1];
5719 break;
5720 }
5721 }
5722
5723 if (!(*flags & CPU_DTRACE_FAULT))
5724 return (rval);
5725
5726 mstate->dtms_fltoffs = opc * sizeof (dif_instr_t);
5727 mstate->dtms_present |= DTRACE_MSTATE_FLTOFFS;
5728
5729 return (0);
5730}
5731
5732static void
5733dtrace_action_breakpoint(dtrace_ecb_t *ecb)
5734{
5735 dtrace_probe_t *probe = ecb->dte_probe;
5736 dtrace_provider_t *prov = probe->dtpr_provider;
5737 char c[DTRACE_FULLNAMELEN + 80], *str;
5738 char *msg = "dtrace: breakpoint action at probe ";
5739 char *ecbmsg = " (ecb ";
5740 uintptr_t mask = (0xf << (sizeof (uintptr_t) * NBBY / 4));
5741 uintptr_t val = (uintptr_t)ecb;
5742 int shift = (sizeof (uintptr_t) * NBBY) - 4, i = 0;
5743
5744 if (dtrace_destructive_disallow)
5745 return;
5746
5747 /*
5748 * It's impossible to be taking action on the NULL probe.
5749 */
5750 ASSERT(probe != NULL);
5751
5752 /*
5753 * This is a poor man's (destitute man's?) sprintf(): we want to
5754 * print the provider name, module name, function name and name of
5755 * the probe, along with the hex address of the ECB with the breakpoint
5756 * action -- all of which we must place in the character buffer by
5757 * hand.
5758 */
5759 while (*msg != '\0')
5760 c[i++] = *msg++;
5761
5762 for (str = prov->dtpv_name; *str != '\0'; str++)
5763 c[i++] = *str;
5764 c[i++] = ':';
5765
5766 for (str = probe->dtpr_mod; *str != '\0'; str++)
5767 c[i++] = *str;
5768 c[i++] = ':';
5769
5770 for (str = probe->dtpr_func; *str != '\0'; str++)
5771 c[i++] = *str;
5772 c[i++] = ':';
5773
5774 for (str = probe->dtpr_name; *str != '\0'; str++)
5775 c[i++] = *str;
5776
5777 while (*ecbmsg != '\0')
5778 c[i++] = *ecbmsg++;
5779
5780 while (shift >= 0) {
5781 mask = (uintptr_t)0xf << shift;
5782
5783 if (val >= ((uintptr_t)1 << shift))
5784 c[i++] = "0123456789abcdef"[(val & mask) >> shift];
5785 shift -= 4;
5786 }
5787
5788 c[i++] = ')';
5789 c[i] = '\0';
5790
5791#if defined(sun)
5792 debug_enter(c);
5793#else
5794 kdb_enter(KDB_WHY_DTRACE, "breakpoint action");
5795#endif
5796}
5797
5798static void
5799dtrace_action_panic(dtrace_ecb_t *ecb)
5800{
5801 dtrace_probe_t *probe = ecb->dte_probe;
5802
5803 /*
5804 * It's impossible to be taking action on the NULL probe.
5805 */
5806 ASSERT(probe != NULL);
5807
5808 if (dtrace_destructive_disallow)
5809 return;
5810
5811 if (dtrace_panicked != NULL)
5812 return;
5813
5814 if (dtrace_casptr(&dtrace_panicked, NULL, curthread) != NULL)
5815 return;
5816
5817 /*
5818 * We won the right to panic. (We want to be sure that only one
5819 * thread calls panic() from dtrace_probe(), and that panic() is
5820 * called exactly once.)
5821 */
5822 dtrace_panic("dtrace: panic action at probe %s:%s:%s:%s (ecb %p)",
5823 probe->dtpr_provider->dtpv_name, probe->dtpr_mod,
5824 probe->dtpr_func, probe->dtpr_name, (void *)ecb);
5825}
5826
5827static void
5828dtrace_action_raise(uint64_t sig)
5829{
5830 if (dtrace_destructive_disallow)
5831 return;
5832
5833 if (sig >= NSIG) {
5834 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5835 return;
5836 }
5837
5838#if defined(sun)
5839 /*
5840 * raise() has a queue depth of 1 -- we ignore all subsequent
5841 * invocations of the raise() action.
5842 */
5843 if (curthread->t_dtrace_sig == 0)
5844 curthread->t_dtrace_sig = (uint8_t)sig;
5845
5846 curthread->t_sig_check = 1;
5847 aston(curthread);
5848#else
5849 struct proc *p = curproc;
5850 PROC_LOCK(p);
5851 kern_psignal(p, sig);
5852 PROC_UNLOCK(p);
5853#endif
5854}
5855
5856static void
5857dtrace_action_stop(void)
5858{
5859 if (dtrace_destructive_disallow)
5860 return;
5861
5862#if defined(sun)
5863 if (!curthread->t_dtrace_stop) {
5864 curthread->t_dtrace_stop = 1;
5865 curthread->t_sig_check = 1;
5866 aston(curthread);
5867 }
5868#else
5869 struct proc *p = curproc;
5870 PROC_LOCK(p);
5871 kern_psignal(p, SIGSTOP);
5872 PROC_UNLOCK(p);
5873#endif
5874}
5875
5876static void
5877dtrace_action_chill(dtrace_mstate_t *mstate, hrtime_t val)
5878{
5879 hrtime_t now;
5880 volatile uint16_t *flags;
5881#if defined(sun)
5882 cpu_t *cpu = CPU;
5883#else
5884 cpu_t *cpu = &solaris_cpu[curcpu];
5885#endif
5886
5887 if (dtrace_destructive_disallow)
5888 return;
5889
6039 break;
6040 case DIF_OP_RET:
6041 rval = regs[rd];
6042 pc = textlen;
6043 break;
6044 case DIF_OP_NOP:
6045 break;
6046 case DIF_OP_SETX:
6047 regs[rd] = inttab[DIF_INSTR_INTEGER(instr)];
6048 break;
6049 case DIF_OP_SETS:
6050 regs[rd] = (uint64_t)(uintptr_t)
6051 (strtab + DIF_INSTR_STRING(instr));
6052 break;
6053 case DIF_OP_SCMP: {
6054 size_t sz = state->dts_options[DTRACEOPT_STRSIZE];
6055 uintptr_t s1 = regs[r1];
6056 uintptr_t s2 = regs[r2];
6057
6058 if (s1 != 0 &&
6059 !dtrace_strcanload(s1, sz, mstate, vstate))
6060 break;
6061 if (s2 != 0 &&
6062 !dtrace_strcanload(s2, sz, mstate, vstate))
6063 break;
6064
6065 cc_r = dtrace_strncmp((char *)s1, (char *)s2, sz);
6066
6067 cc_n = cc_r < 0;
6068 cc_z = cc_r == 0;
6069 cc_v = cc_c = 0;
6070 break;
6071 }
6072 case DIF_OP_LDGA:
6073 regs[rd] = dtrace_dif_variable(mstate, state,
6074 r1, regs[r2]);
6075 break;
6076 case DIF_OP_LDGS:
6077 id = DIF_INSTR_VAR(instr);
6078
6079 if (id >= DIF_VAR_OTHER_UBASE) {
6080 uintptr_t a;
6081
6082 id -= DIF_VAR_OTHER_UBASE;
6083 svar = vstate->dtvs_globals[id];
6084 ASSERT(svar != NULL);
6085 v = &svar->dtsv_var;
6086
6087 if (!(v->dtdv_type.dtdt_flags & DIF_TF_BYREF)) {
6088 regs[rd] = svar->dtsv_data;
6089 break;
6090 }
6091
6092 a = (uintptr_t)svar->dtsv_data;
6093
6094 if (*(uint8_t *)a == UINT8_MAX) {
6095 /*
6096 * If the 0th byte is set to UINT8_MAX
6097 * then this is to be treated as a
6098 * reference to a NULL variable.
6099 */
6100 regs[rd] = 0;
6101 } else {
6102 regs[rd] = a + sizeof (uint64_t);
6103 }
6104
6105 break;
6106 }
6107
6108 regs[rd] = dtrace_dif_variable(mstate, state, id, 0);
6109 break;
6110
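/*
 * An illustrative userland sketch (not part of this file; names are
 * invented) of the tagged by-ref storage that DIF_OP_LDGS decodes
 * above: each by-ref global carries one uint64_t of header, and a 0th
 * byte of UINT8_MAX marks the variable as NULL.
 */
#if 0
#include <stdint.h>

static uintptr_t
byref_decode(uintptr_t a)
{
	/* A tag byte of UINT8_MAX encodes a reference to NULL. */
	if (*(uint8_t *)a == UINT8_MAX)
		return (0);

	/* Otherwise the value bytes begin after the uint64_t header. */
	return (a + sizeof (uint64_t));
}
#endif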
6111 case DIF_OP_STGS:
6112 id = DIF_INSTR_VAR(instr);
6113
6114 ASSERT(id >= DIF_VAR_OTHER_UBASE);
6115 id -= DIF_VAR_OTHER_UBASE;
6116
6117 svar = vstate->dtvs_globals[id];
6118 ASSERT(svar != NULL);
6119 v = &svar->dtsv_var;
6120
6121 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6122 uintptr_t a = (uintptr_t)svar->dtsv_data;
6123
6124 ASSERT(a != 0);
6125 ASSERT(svar->dtsv_size != 0);
6126
6127 if (regs[rd] == 0) {
6128 *(uint8_t *)a = UINT8_MAX;
6129 break;
6130 } else {
6131 *(uint8_t *)a = 0;
6132 a += sizeof (uint64_t);
6133 }
6134 if (!dtrace_vcanload(
6135 (void *)(uintptr_t)regs[rd], &v->dtdv_type,
6136 mstate, vstate))
6137 break;
6138
6139 dtrace_vcopy((void *)(uintptr_t)regs[rd],
6140 (void *)a, &v->dtdv_type);
6141 break;
6142 }
6143
6144 svar->dtsv_data = regs[rd];
6145 break;
6146
6147 case DIF_OP_LDTA:
6148 /*
6149 * There are no DTrace built-in thread-local arrays at
6150 * present. This opcode is saved for future work.
6151 */
6152 *flags |= CPU_DTRACE_ILLOP;
6153 regs[rd] = 0;
6154 break;
6155
6156 case DIF_OP_LDLS:
6157 id = DIF_INSTR_VAR(instr);
6158
6159 if (id < DIF_VAR_OTHER_UBASE) {
6160 /*
6161 * For now, this has no meaning.
6162 */
6163 regs[rd] = 0;
6164 break;
6165 }
6166
6167 id -= DIF_VAR_OTHER_UBASE;
6168
6169 ASSERT(id < vstate->dtvs_nlocals);
6170 ASSERT(vstate->dtvs_locals != NULL);
6171
6172 svar = vstate->dtvs_locals[id];
6173 ASSERT(svar != NULL);
6174 v = &svar->dtsv_var;
6175
6176 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6177 uintptr_t a = (uintptr_t)svar->dtsv_data;
6178 size_t sz = v->dtdv_type.dtdt_size;
6179
6180 sz += sizeof (uint64_t);
6181 ASSERT(svar->dtsv_size == NCPU * sz);
6182 a += curcpu * sz;
6183
6184 if (*(uint8_t *)a == UINT8_MAX) {
6185 /*
6186 * If the 0th byte is set to UINT8_MAX
6187 * then this is to be treated as a
6188 * reference to a NULL variable.
6189 */
6190 regs[rd] = 0;
6191 } else {
6192 regs[rd] = a + sizeof (uint64_t);
6193 }
6194
6195 break;
6196 }
6197
6198 ASSERT(svar->dtsv_size == NCPU * sizeof (uint64_t));
6199 tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
6200 regs[rd] = tmp[curcpu];
6201 break;
6202
6203 case DIF_OP_STLS:
6204 id = DIF_INSTR_VAR(instr);
6205
6206 ASSERT(id >= DIF_VAR_OTHER_UBASE);
6207 id -= DIF_VAR_OTHER_UBASE;
6208 ASSERT(id < vstate->dtvs_nlocals);
6209
6210 ASSERT(vstate->dtvs_locals != NULL);
6211 svar = vstate->dtvs_locals[id];
6212 ASSERT(svar != NULL);
6213 v = &svar->dtsv_var;
6214
6215 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6216 uintptr_t a = (uintptr_t)svar->dtsv_data;
6217 size_t sz = v->dtdv_type.dtdt_size;
6218
6219 sz += sizeof (uint64_t);
6220 ASSERT(svar->dtsv_size == NCPU * sz);
6221 a += curcpu * sz;
6222
6223 if (regs[rd] == 0) {
6224 *(uint8_t *)a = UINT8_MAX;
6225 break;
6226 } else {
6227 *(uint8_t *)a = 0;
6228 a += sizeof (uint64_t);
6229 }
6230
6231 if (!dtrace_vcanload(
6232 (void *)(uintptr_t)regs[rd], &v->dtdv_type,
6233 mstate, vstate))
6234 break;
6235
6236 dtrace_vcopy((void *)(uintptr_t)regs[rd],
6237 (void *)a, &v->dtdv_type);
6238 break;
6239 }
6240
6241 ASSERT(svar->dtsv_size == NCPU * sizeof (uint64_t));
6242 tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
6243 tmp[curcpu] = regs[rd];
6244 break;
6245
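/*
 * An illustrative sketch (invented names, not part of this file) of
 * the per-CPU slot arithmetic shared by DIF_OP_LDLS and DIF_OP_STLS
 * above: a local variable is laid out as NCPU consecutive slots, each
 * a uint64_t NULL-tag header plus the value, so a CPU reaches its own
 * copy with a fixed stride.
 */
#if 0
#include <stddef.h>
#include <stdint.h>

static uintptr_t
local_slot(uintptr_t base, size_t type_size, int cpu)
{
	size_t stride = type_size + sizeof (uint64_t);

	return (base + (size_t)cpu * stride);
}
#endif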
6246 case DIF_OP_LDTS: {
6247 dtrace_dynvar_t *dvar;
6248 dtrace_key_t *key;
6249
6250 id = DIF_INSTR_VAR(instr);
6251 ASSERT(id >= DIF_VAR_OTHER_UBASE);
6252 id -= DIF_VAR_OTHER_UBASE;
6253 v = &vstate->dtvs_tlocals[id];
6254
6255 key = &tupregs[DIF_DTR_NREGS];
6256 key[0].dttk_value = (uint64_t)id;
6257 key[0].dttk_size = 0;
6258 DTRACE_TLS_THRKEY(key[1].dttk_value);
6259 key[1].dttk_size = 0;
6260
6261 dvar = dtrace_dynvar(dstate, 2, key,
6262 sizeof (uint64_t), DTRACE_DYNVAR_NOALLOC,
6263 mstate, vstate);
6264
6265 if (dvar == NULL) {
6266 regs[rd] = 0;
6267 break;
6268 }
6269
6270 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6271 regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
6272 } else {
6273 regs[rd] = *((uint64_t *)dvar->dtdv_data);
6274 }
6275
6276 break;
6277 }
6278
6279 case DIF_OP_STTS: {
6280 dtrace_dynvar_t *dvar;
6281 dtrace_key_t *key;
6282
6283 id = DIF_INSTR_VAR(instr);
6284 ASSERT(id >= DIF_VAR_OTHER_UBASE);
6285 id -= DIF_VAR_OTHER_UBASE;
6286
6287 key = &tupregs[DIF_DTR_NREGS];
6288 key[0].dttk_value = (uint64_t)id;
6289 key[0].dttk_size = 0;
6290 DTRACE_TLS_THRKEY(key[1].dttk_value);
6291 key[1].dttk_size = 0;
6292 v = &vstate->dtvs_tlocals[id];
6293
6294 dvar = dtrace_dynvar(dstate, 2, key,
6295 v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
6296 v->dtdv_type.dtdt_size : sizeof (uint64_t),
6297 regs[rd] ? DTRACE_DYNVAR_ALLOC :
6298 DTRACE_DYNVAR_DEALLOC, mstate, vstate);
6299
6300 /*
6301 * Given that we're storing to thread-local data,
6302 * we need to flush our predicate cache.
6303 */
6304 curthread->t_predcache = 0;
6305
6306 if (dvar == NULL)
6307 break;
6308
6309 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6310 if (!dtrace_vcanload(
6311 (void *)(uintptr_t)regs[rd],
6312 &v->dtdv_type, mstate, vstate))
6313 break;
6314
6315 dtrace_vcopy((void *)(uintptr_t)regs[rd],
6316 dvar->dtdv_data, &v->dtdv_type);
6317 } else {
6318 *((uint64_t *)dvar->dtdv_data) = regs[rd];
6319 }
6320
6321 break;
6322 }
6323
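/*
 * A sketch of the two-element key that DIF_OP_LDTS and DIF_OP_STTS
 * build above to address thread-local storage: key 0 names the
 * variable and key 1 identifies the thread (via DTRACE_TLS_THRKEY).
 * The struct below merely mirrors dtrace_key_t for illustration.
 */
#if 0
#include <stdint.h>

typedef struct example_key {
	uint64_t	dttk_value;
	uint64_t	dttk_size;	/* 0 => by-value key */
} example_key_t;

static void
tls_key_init(example_key_t key[2], uint64_t id, uint64_t thrkey)
{
	key[0].dttk_value = id;		/* which variable */
	key[0].dttk_size = 0;
	key[1].dttk_value = thrkey;	/* which thread */
	key[1].dttk_size = 0;
}
#endif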
6324 case DIF_OP_SRA:
6325 regs[rd] = (int64_t)regs[r1] >> regs[r2];
6326 break;
6327
6328 case DIF_OP_CALL:
6329 dtrace_dif_subr(DIF_INSTR_SUBR(instr), rd,
6330 regs, tupregs, ttop, mstate, state);
6331 break;
6332
6333 case DIF_OP_PUSHTR:
6334 if (ttop == DIF_DTR_NREGS) {
6335 *flags |= CPU_DTRACE_TUPOFLOW;
6336 break;
6337 }
6338
6339 if (r1 == DIF_TYPE_STRING) {
6340 /*
6341 * If this is a string type and the size is 0,
6342 * we'll use the system-wide default string
6343 * size. Note that we are _not_ looking at
6344 * the value of the DTRACEOPT_STRSIZE option;
6345 * had this been set, we would expect to have
6346 * a non-zero size value in the "pushtr".
6347 */
6348 tupregs[ttop].dttk_size =
6349 dtrace_strlen((char *)(uintptr_t)regs[rd],
6350 regs[r2] ? regs[r2] :
6351 dtrace_strsize_default) + 1;
6352 } else {
6353 tupregs[ttop].dttk_size = regs[r2];
6354 }
6355
6356 tupregs[ttop++].dttk_value = regs[rd];
6357 break;
6358
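/*
 * A sketch (illustrative only) of the sizing rule DIF_OP_PUSHTR
 * applies above to string tuple keys: the key occupies the
 * NUL-terminated length, bounded by the limit in r2 (or a default
 * when r2 is zero), plus the terminator.
 */
#if 0
#include <stddef.h>
#include <string.h>

static size_t
pushtr_strsize(const char *s, size_t lim, size_t def)
{
	if (lim == 0)
		lim = def;	/* no explicit limit: fall back */

	return (strnlen(s, lim) + 1);
}
#endif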
6359 case DIF_OP_PUSHTV:
6360 if (ttop == DIF_DTR_NREGS) {
6361 *flags |= CPU_DTRACE_TUPOFLOW;
6362 break;
6363 }
6364
6365 tupregs[ttop].dttk_value = regs[rd];
6366 tupregs[ttop++].dttk_size = 0;
6367 break;
6368
6369 case DIF_OP_POPTS:
6370 if (ttop != 0)
6371 ttop--;
6372 break;
6373
6374 case DIF_OP_FLUSHTS:
6375 ttop = 0;
6376 break;
6377
6378 case DIF_OP_LDGAA:
6379 case DIF_OP_LDTAA: {
6380 dtrace_dynvar_t *dvar;
6381 dtrace_key_t *key = tupregs;
6382 uint_t nkeys = ttop;
6383
6384 id = DIF_INSTR_VAR(instr);
6385 ASSERT(id >= DIF_VAR_OTHER_UBASE);
6386 id -= DIF_VAR_OTHER_UBASE;
6387
6388 key[nkeys].dttk_value = (uint64_t)id;
6389 key[nkeys++].dttk_size = 0;
6390
6391 if (DIF_INSTR_OP(instr) == DIF_OP_LDTAA) {
6392 DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
6393 key[nkeys++].dttk_size = 0;
6394 v = &vstate->dtvs_tlocals[id];
6395 } else {
6396 v = &vstate->dtvs_globals[id]->dtsv_var;
6397 }
6398
6399 dvar = dtrace_dynvar(dstate, nkeys, key,
6400 v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
6401 v->dtdv_type.dtdt_size : sizeof (uint64_t),
6402 DTRACE_DYNVAR_NOALLOC, mstate, vstate);
6403
6404 if (dvar == NULL) {
6405 regs[rd] = 0;
6406 break;
6407 }
6408
6409 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6410 regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
6411 } else {
6412 regs[rd] = *((uint64_t *)dvar->dtdv_data);
6413 }
6414
6415 break;
6416 }
6417
6418 case DIF_OP_STGAA:
6419 case DIF_OP_STTAA: {
6420 dtrace_dynvar_t *dvar;
6421 dtrace_key_t *key = tupregs;
6422 uint_t nkeys = ttop;
6423
6424 id = DIF_INSTR_VAR(instr);
6425 ASSERT(id >= DIF_VAR_OTHER_UBASE);
6426 id -= DIF_VAR_OTHER_UBASE;
6427
6428 key[nkeys].dttk_value = (uint64_t)id;
6429 key[nkeys++].dttk_size = 0;
6430
6431 if (DIF_INSTR_OP(instr) == DIF_OP_STTAA) {
6432 DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
6433 key[nkeys++].dttk_size = 0;
6434 v = &vstate->dtvs_tlocals[id];
6435 } else {
6436 v = &vstate->dtvs_globals[id]->dtsv_var;
6437 }
6438
6439 dvar = dtrace_dynvar(dstate, nkeys, key,
6440 v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
6441 v->dtdv_type.dtdt_size : sizeof (uint64_t),
6442 regs[rd] ? DTRACE_DYNVAR_ALLOC :
6443 DTRACE_DYNVAR_DEALLOC, mstate, vstate);
6444
6445 if (dvar == NULL)
6446 break;
6447
6448 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6449 if (!dtrace_vcanload(
6450 (void *)(uintptr_t)regs[rd], &v->dtdv_type,
6451 mstate, vstate))
6452 break;
6453
6454 dtrace_vcopy((void *)(uintptr_t)regs[rd],
6455 dvar->dtdv_data, &v->dtdv_type);
6456 } else {
6457 *((uint64_t *)dvar->dtdv_data) = regs[rd];
6458 }
6459
6460 break;
6461 }
6462
6463 case DIF_OP_ALLOCS: {
6464 uintptr_t ptr = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
6465 size_t size = ptr - mstate->dtms_scratch_ptr + regs[r1];
6466
6467 /*
 6468 * Rounding up the user allocation size could cause a
 6469 * large, bogus allocation (like -1ULL) to overflow
 6470 * to 0.
6471 */
6472 if (size < regs[r1] ||
6473 !DTRACE_INSCRATCH(mstate, size)) {
6474 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
6475 regs[rd] = 0;
6476 break;
6477 }
6478
6479 dtrace_bzero((void *) mstate->dtms_scratch_ptr, size);
6480 mstate->dtms_scratch_ptr += size;
6481 regs[rd] = ptr;
6482 break;
6483 }
6484
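/*
 * A sketch of the overflow-aware scratch allocation in DIF_OP_ALLOCS
 * above (invented names): the pointer is rounded up to an 8-byte
 * boundary, and because rounding a huge request such as (uint64_t)-1
 * can wrap to a small size, "size < request" detects the wrap before
 * the bounds check runs.
 */
#if 0
#include <stddef.h>
#include <stdint.h>

static uintptr_t
scratch_alloc(uintptr_t *cur, uintptr_t end, uint64_t request)
{
	uintptr_t ptr = (*cur + 7) & ~(uintptr_t)7;	/* P2ROUNDUP(_, 8) */
	size_t size = ptr - *cur + request;

	/* Wrapped, or simply larger than the remaining scratch: fail. */
	if (size < request || size > end - *cur)
		return (0);

	*cur += size;
	return (ptr);
}
#endif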
6485 case DIF_OP_COPYS:
6486 if (!dtrace_canstore(regs[rd], regs[r2],
6487 mstate, vstate)) {
6488 *flags |= CPU_DTRACE_BADADDR;
6489 *illval = regs[rd];
6490 break;
6491 }
6492
6493 if (!dtrace_canload(regs[r1], regs[r2], mstate, vstate))
6494 break;
6495
6496 dtrace_bcopy((void *)(uintptr_t)regs[r1],
6497 (void *)(uintptr_t)regs[rd], (size_t)regs[r2]);
6498 break;
6499
6500 case DIF_OP_STB:
6501 if (!dtrace_canstore(regs[rd], 1, mstate, vstate)) {
6502 *flags |= CPU_DTRACE_BADADDR;
6503 *illval = regs[rd];
6504 break;
6505 }
6506 *((uint8_t *)(uintptr_t)regs[rd]) = (uint8_t)regs[r1];
6507 break;
6508
6509 case DIF_OP_STH:
6510 if (!dtrace_canstore(regs[rd], 2, mstate, vstate)) {
6511 *flags |= CPU_DTRACE_BADADDR;
6512 *illval = regs[rd];
6513 break;
6514 }
6515 if (regs[rd] & 1) {
6516 *flags |= CPU_DTRACE_BADALIGN;
6517 *illval = regs[rd];
6518 break;
6519 }
6520 *((uint16_t *)(uintptr_t)regs[rd]) = (uint16_t)regs[r1];
6521 break;
6522
6523 case DIF_OP_STW:
6524 if (!dtrace_canstore(regs[rd], 4, mstate, vstate)) {
6525 *flags |= CPU_DTRACE_BADADDR;
6526 *illval = regs[rd];
6527 break;
6528 }
6529 if (regs[rd] & 3) {
6530 *flags |= CPU_DTRACE_BADALIGN;
6531 *illval = regs[rd];
6532 break;
6533 }
6534 *((uint32_t *)(uintptr_t)regs[rd]) = (uint32_t)regs[r1];
6535 break;
6536
6537 case DIF_OP_STX:
6538 if (!dtrace_canstore(regs[rd], 8, mstate, vstate)) {
6539 *flags |= CPU_DTRACE_BADADDR;
6540 *illval = regs[rd];
6541 break;
6542 }
6543 if (regs[rd] & 7) {
6544 *flags |= CPU_DTRACE_BADALIGN;
6545 *illval = regs[rd];
6546 break;
6547 }
6548 *((uint64_t *)(uintptr_t)regs[rd]) = regs[r1];
6549 break;
6550 }
6551 }
6552
6553 if (!(*flags & CPU_DTRACE_FAULT))
6554 return (rval);
6555
6556 mstate->dtms_fltoffs = opc * sizeof (dif_instr_t);
6557 mstate->dtms_present |= DTRACE_MSTATE_FLTOFFS;
6558
6559 return (0);
6560}
6561
6562static void
6563dtrace_action_breakpoint(dtrace_ecb_t *ecb)
6564{
6565 dtrace_probe_t *probe = ecb->dte_probe;
6566 dtrace_provider_t *prov = probe->dtpr_provider;
6567 char c[DTRACE_FULLNAMELEN + 80], *str;
6568 char *msg = "dtrace: breakpoint action at probe ";
6569 char *ecbmsg = " (ecb ";
6570 uintptr_t mask = (0xf << (sizeof (uintptr_t) * NBBY / 4));
6571 uintptr_t val = (uintptr_t)ecb;
6572 int shift = (sizeof (uintptr_t) * NBBY) - 4, i = 0;
6573
6574 if (dtrace_destructive_disallow)
6575 return;
6576
6577 /*
6578 * It's impossible to be taking action on the NULL probe.
6579 */
6580 ASSERT(probe != NULL);
6581
6582 /*
6583 * This is a poor man's (destitute man's?) sprintf(): we want to
6584 * print the provider name, module name, function name and name of
6585 * the probe, along with the hex address of the ECB with the breakpoint
6586 * action -- all of which we must place in the character buffer by
6587 * hand.
6588 */
6589 while (*msg != '\0')
6590 c[i++] = *msg++;
6591
6592 for (str = prov->dtpv_name; *str != '\0'; str++)
6593 c[i++] = *str;
6594 c[i++] = ':';
6595
6596 for (str = probe->dtpr_mod; *str != '\0'; str++)
6597 c[i++] = *str;
6598 c[i++] = ':';
6599
6600 for (str = probe->dtpr_func; *str != '\0'; str++)
6601 c[i++] = *str;
6602 c[i++] = ':';
6603
6604 for (str = probe->dtpr_name; *str != '\0'; str++)
6605 c[i++] = *str;
6606
6607 while (*ecbmsg != '\0')
6608 c[i++] = *ecbmsg++;
6609
6610 while (shift >= 0) {
6611 mask = (uintptr_t)0xf << shift;
6612
6613 if (val >= ((uintptr_t)1 << shift))
6614 c[i++] = "0123456789abcdef"[(val & mask) >> shift];
6615 shift -= 4;
6616 }
6617
6618 c[i++] = ')';
6619 c[i] = '\0';
6620
6621#if defined(sun)
6622 debug_enter(c);
6623#else
6624 kdb_enter(KDB_WHY_DTRACE, "breakpoint action");
6625#endif
6626}
6627
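/*
 * A standalone sketch of the leading-zero-suppressing hex formatter
 * hand-rolled in dtrace_action_breakpoint() above; it emits one
 * nibble per iteration, most significant first. (Invented name; NBBY
 * is simply 8 bits per byte.)
 */
#if 0
#include <stdint.h>

static int
fmt_hex(char *c, uintptr_t val)
{
	int shift = (int)(sizeof (uintptr_t) * 8) - 4, i = 0;

	for (; shift >= 0; shift -= 4) {
		uintptr_t mask = (uintptr_t)0xf << shift;

		/* Skip leading zero nibbles, as the loop above does. */
		if (val >= ((uintptr_t)1 << shift))
			c[i++] = "0123456789abcdef"[(val & mask) >> shift];
	}
	c[i] = '\0';
	return (i);
}
#endif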
6628static void
6629dtrace_action_panic(dtrace_ecb_t *ecb)
6630{
6631 dtrace_probe_t *probe = ecb->dte_probe;
6632
6633 /*
6634 * It's impossible to be taking action on the NULL probe.
6635 */
6636 ASSERT(probe != NULL);
6637
6638 if (dtrace_destructive_disallow)
6639 return;
6640
6641 if (dtrace_panicked != NULL)
6642 return;
6643
6644 if (dtrace_casptr(&dtrace_panicked, NULL, curthread) != NULL)
6645 return;
6646
6647 /*
6648 * We won the right to panic. (We want to be sure that only one
6649 * thread calls panic() from dtrace_probe(), and that panic() is
6650 * called exactly once.)
6651 */
6652 dtrace_panic("dtrace: panic action at probe %s:%s:%s:%s (ecb %p)",
6653 probe->dtpr_provider->dtpv_name, probe->dtpr_mod,
6654 probe->dtpr_func, probe->dtpr_name, (void *)ecb);
6655}
6656
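/*
 * A sketch of the "exactly once" idiom dtrace_action_panic() uses
 * above, expressed with C11 atomics (illustrative only): a
 * compare-and-swap from NULL elects a single winner, so concurrent
 * probe firings cannot call panic() twice.
 */
#if 0
#include <stdatomic.h>
#include <stddef.h>

static _Atomic(void *) panicked;

static int
panic_once(void *me)
{
	void *expected = NULL;

	/* Only the thread that swings NULL -> me may panic. */
	return (atomic_compare_exchange_strong(&panicked, &expected, me));
}
#endif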
6657static void
6658dtrace_action_raise(uint64_t sig)
6659{
6660 if (dtrace_destructive_disallow)
6661 return;
6662
6663 if (sig >= NSIG) {
6664 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
6665 return;
6666 }
6667
6668#if defined(sun)
6669 /*
6670 * raise() has a queue depth of 1 -- we ignore all subsequent
6671 * invocations of the raise() action.
6672 */
6673 if (curthread->t_dtrace_sig == 0)
6674 curthread->t_dtrace_sig = (uint8_t)sig;
6675
6676 curthread->t_sig_check = 1;
6677 aston(curthread);
6678#else
6679 struct proc *p = curproc;
6680 PROC_LOCK(p);
6681 kern_psignal(p, sig);
6682 PROC_UNLOCK(p);
6683#endif
6684}
6685
6686static void
6687dtrace_action_stop(void)
6688{
6689 if (dtrace_destructive_disallow)
6690 return;
6691
6692#if defined(sun)
6693 if (!curthread->t_dtrace_stop) {
6694 curthread->t_dtrace_stop = 1;
6695 curthread->t_sig_check = 1;
6696 aston(curthread);
6697 }
6698#else
6699 struct proc *p = curproc;
6700 PROC_LOCK(p);
6701 kern_psignal(p, SIGSTOP);
6702 PROC_UNLOCK(p);
6703#endif
6704}
6705
6706static void
6707dtrace_action_chill(dtrace_mstate_t *mstate, hrtime_t val)
6708{
6709 hrtime_t now;
6710 volatile uint16_t *flags;
6711#if defined(sun)
6712 cpu_t *cpu = CPU;
6713#else
6714 cpu_t *cpu = &solaris_cpu[curcpu];
6715#endif
6716
6717 if (dtrace_destructive_disallow)
6718 return;
6719
5890 flags = (volatile uint16_t *)&cpu_core[cpu->cpu_id].cpuc_dtrace_flags;
6720 flags = (volatile uint16_t *)&cpu_core[curcpu].cpuc_dtrace_flags;
6721
6722 now = dtrace_gethrtime();
6723
6724 if (now - cpu->cpu_dtrace_chillmark > dtrace_chill_interval) {
6725 /*
6726 * We need to advance the mark to the current time.
6727 */
6728 cpu->cpu_dtrace_chillmark = now;
6729 cpu->cpu_dtrace_chilled = 0;
6730 }
6731
6732 /*
6733 * Now check to see if the requested chill time would take us over
6734 * the maximum amount of time allowed in the chill interval. (Or
6735 * worse, if the calculation itself induces overflow.)
6736 */
6737 if (cpu->cpu_dtrace_chilled + val > dtrace_chill_max ||
6738 cpu->cpu_dtrace_chilled + val < cpu->cpu_dtrace_chilled) {
6739 *flags |= CPU_DTRACE_ILLOP;
6740 return;
6741 }
6742
6743 while (dtrace_gethrtime() - now < val)
6744 continue;
6745
6746 /*
6747 * Normally, we assure that the value of the variable "timestamp" does
6748 * not change within an ECB. The presence of chill() represents an
6749 * exception to this rule, however.
6750 */
6751 mstate->dtms_present &= ~DTRACE_MSTATE_TIMESTAMP;
6752 cpu->cpu_dtrace_chilled += val;
6753}
6754
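/*
 * A sketch of the chill() budget arithmetic above (invented names):
 * within one interval the accumulated chill time may not exceed the
 * maximum, and the addition itself is checked for wrap-around before
 * the busy-wait is permitted, mirroring the test in
 * dtrace_action_chill().
 */
#if 0
#include <stdint.h>

static int
chill_ok(int64_t chilled, int64_t val, int64_t max)
{
	/* Reject both budget exhaustion and overflow of the sum. */
	if (chilled + val > max || chilled + val < chilled)
		return (0);
	return (1);
}
#endif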
6755static void
6756dtrace_action_ustack(dtrace_mstate_t *mstate, dtrace_state_t *state,
6757 uint64_t *buf, uint64_t arg)
6758{
6759 int nframes = DTRACE_USTACK_NFRAMES(arg);
6760 int strsize = DTRACE_USTACK_STRSIZE(arg);
6761 uint64_t *pcs = &buf[1], *fps;
6762 char *str = (char *)&pcs[nframes];
6763 int size, offs = 0, i, j;
6764 uintptr_t old = mstate->dtms_scratch_ptr, saved;
6765 uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags;
6766 char *sym;
6767
6768 /*
6769 * Should be taking a faster path if string space has not been
6770 * allocated.
6771 */
6772 ASSERT(strsize != 0);
6773
6774 /*
6775 * We will first allocate some temporary space for the frame pointers.
6776 */
6777 fps = (uint64_t *)P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
6778 size = (uintptr_t)fps - mstate->dtms_scratch_ptr +
6779 (nframes * sizeof (uint64_t));
6780
6781 if (!DTRACE_INSCRATCH(mstate, size)) {
6782 /*
6783 * Not enough room for our frame pointers -- need to indicate
6784 * that we ran out of scratch space.
6785 */
6786 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
6787 return;
6788 }
6789
6790 mstate->dtms_scratch_ptr += size;
6791 saved = mstate->dtms_scratch_ptr;
6792
6793 /*
6794 * Now get a stack with both program counters and frame pointers.
6795 */
6796 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6797 dtrace_getufpstack(buf, fps, nframes + 1);
6798 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6799
6800 /*
6801 * If that faulted, we're cooked.
6802 */
6803 if (*flags & CPU_DTRACE_FAULT)
6804 goto out;
6805
6806 /*
6807 * Now we want to walk up the stack, calling the USTACK helper. For
6808 * each iteration, we restore the scratch pointer.
6809 */
6810 for (i = 0; i < nframes; i++) {
6811 mstate->dtms_scratch_ptr = saved;
6812
6813 if (offs >= strsize)
6814 break;
6815
6816 sym = (char *)(uintptr_t)dtrace_helper(
6817 DTRACE_HELPER_ACTION_USTACK,
6818 mstate, state, pcs[i], fps[i]);
6819
6820 /*
6821 * If we faulted while running the helper, we're going to
6822 * clear the fault and null out the corresponding string.
6823 */
6824 if (*flags & CPU_DTRACE_FAULT) {
6825 *flags &= ~CPU_DTRACE_FAULT;
6826 str[offs++] = '\0';
6827 continue;
6828 }
6829
6830 if (sym == NULL) {
6831 str[offs++] = '\0';
6832 continue;
6833 }
6834
6835 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6836
6837 /*
6838 * Now copy in the string that the helper returned to us.
6839 */
6840 for (j = 0; offs + j < strsize; j++) {
6841 if ((str[offs + j] = sym[j]) == '\0')
6842 break;
6843 }
6844
6845 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6846
6847 offs += j + 1;
6848 }
6849
6850 if (offs >= strsize) {
6851 /*
6852 * If we didn't have room for all of the strings, we don't
6853 * abort processing -- this needn't be a fatal error -- but we
6854 * still want to increment a counter (dts_stkstroverflows) to
6855 * allow this condition to be warned about. (If this is from
6856 * a jstack() action, it is easily tuned via jstackstrsize.)
6857 */
6858 dtrace_error(&state->dts_stkstroverflows);
6859 }
6860
6861 while (offs < strsize)
6862 str[offs++] = '\0';
6863
6864out:
6865 mstate->dtms_scratch_ptr = old;
6866}
6867
6868static void
6869dtrace_store_by_ref(dtrace_difo_t *dp, caddr_t tomax, size_t size,
6870 size_t *valoffsp, uint64_t *valp, uint64_t end, int intuple, int dtkind)
6871{
6872 volatile uint16_t *flags;
6873 uint64_t val = *valp;
6874 size_t valoffs = *valoffsp;
6875
6876 flags = (volatile uint16_t *)&cpu_core[curcpu].cpuc_dtrace_flags;
6877 ASSERT(dtkind == DIF_TF_BYREF || dtkind == DIF_TF_BYUREF);
6878
6879 /*
6880 * If this is a string, we're going to only load until we find the zero
6881 * byte -- after which we'll store zero bytes.
6882 */
6883 if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) {
6884 char c = '\0' + 1;
6885 size_t s;
6886
6887 for (s = 0; s < size; s++) {
6888 if (c != '\0' && dtkind == DIF_TF_BYREF) {
6889 c = dtrace_load8(val++);
6890 } else if (c != '\0' && dtkind == DIF_TF_BYUREF) {
6891 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6892 c = dtrace_fuword8((void *)(uintptr_t)val++);
6893 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6894 if (*flags & CPU_DTRACE_FAULT)
6895 break;
6896 }
6897
6898 DTRACE_STORE(uint8_t, tomax, valoffs++, c);
6899
6900 if (c == '\0' && intuple)
6901 break;
6902 }
6903 } else {
6904 uint8_t c;
6905 while (valoffs < end) {
6906 if (dtkind == DIF_TF_BYREF) {
6907 c = dtrace_load8(val++);
6908 } else if (dtkind == DIF_TF_BYUREF) {
6909 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6910 c = dtrace_fuword8((void *)(uintptr_t)val++);
6911 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6912 if (*flags & CPU_DTRACE_FAULT)
6913 break;
6914 }
6915
6916 DTRACE_STORE(uint8_t, tomax,
6917 valoffs++, c);
6918 }
6919 }
6920
6921 *valp = val;
6922 *valoffsp = valoffs;
6923}
6924
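/*
 * A sketch of the string-store behavior dtrace_store_by_ref()
 * implements just above (invented name): bytes are copied until the
 * NUL terminator is seen, after which the remainder of the record is
 * zero-filled; for tuple members the copy may stop at the NUL instead.
 */
#if 0
#include <stddef.h>

static void
store_string(char *dst, const char *src, size_t size)
{
	char c = '\0' + 1;	/* any non-NUL value to prime the loop */
	size_t s;

	for (s = 0; s < size; s++) {
		if (c != '\0')
			c = src[s];	/* still copying real bytes */

		dst[s] = c;		/* becomes zero-fill after the NUL */
	}
}
#endif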
6925/*
6926 * If you're looking for the epicenter of DTrace, you just found it. This
6927 * is the function called by the provider to fire a probe -- from which all
6928 * subsequent probe-context DTrace activity emanates.
6929 */
6930void
6931dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1,
6932 uintptr_t arg2, uintptr_t arg3, uintptr_t arg4)
6933{
6934 processorid_t cpuid;
6935 dtrace_icookie_t cookie;
6936 dtrace_probe_t *probe;
6937 dtrace_mstate_t mstate;
6938 dtrace_ecb_t *ecb;
6939 dtrace_action_t *act;
6940 intptr_t offs;
6941 size_t size;
6942 int vtime, onintr;
6943 volatile uint16_t *flags;
6944 hrtime_t now;
6945
6946 if (panicstr != NULL)
6947 return;
6948
6949#if defined(sun)
6950 /*
6951 * Kick out immediately if this CPU is still being born (in which case
6952 * curthread will be set to -1) or the current thread can't allow
6953 * probes in its current context.
6954 */
6955 if (((uintptr_t)curthread & 1) || (curthread->t_flag & T_DONTDTRACE))
6956 return;
6957#endif
6958
6959 cookie = dtrace_interrupt_disable();
6960 probe = dtrace_probes[id - 1];
6961 cpuid = curcpu;
6962 onintr = CPU_ON_INTR(CPU);
6963
6964 if (!onintr && probe->dtpr_predcache != DTRACE_CACHEIDNONE &&
6965 probe->dtpr_predcache == curthread->t_predcache) {
6966 /*
6967 * We have hit in the predicate cache; we know that
6968 * this predicate would evaluate to be false.
6969 */
6970 dtrace_interrupt_enable(cookie);
6971 return;
6972 }
6973
6974#if defined(sun)
6975 if (panic_quiesce) {
6976#else
6977 if (panicstr != NULL) {
6978#endif
6979 /*
6980 * We don't trace anything if we're panicking.
6981 */
6982 dtrace_interrupt_enable(cookie);
6983 return;
6984 }
6985
6986 now = dtrace_gethrtime();
6987 vtime = dtrace_vtime_references != 0;
6988
6989 if (vtime && curthread->t_dtrace_start)
6990 curthread->t_dtrace_vtime += now - curthread->t_dtrace_start;
6991
6992 mstate.dtms_difo = NULL;
6993 mstate.dtms_probe = probe;
6994 mstate.dtms_strtok = 0;
6995 mstate.dtms_arg[0] = arg0;
6996 mstate.dtms_arg[1] = arg1;
6997 mstate.dtms_arg[2] = arg2;
6998 mstate.dtms_arg[3] = arg3;
6999 mstate.dtms_arg[4] = arg4;
7000
7001 flags = (volatile uint16_t *)&cpu_core[cpuid].cpuc_dtrace_flags;
7002
7003 for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
7004 dtrace_predicate_t *pred = ecb->dte_predicate;
7005 dtrace_state_t *state = ecb->dte_state;
7006 dtrace_buffer_t *buf = &state->dts_buffer[cpuid];
7007 dtrace_buffer_t *aggbuf = &state->dts_aggbuffer[cpuid];
7008 dtrace_vstate_t *vstate = &state->dts_vstate;
7009 dtrace_provider_t *prov = probe->dtpr_provider;
7010 uint64_t tracememsize = 0;
7011 int committed = 0;
7012 caddr_t tomax;
7013
7014 /*
7015 * A little subtlety with the following (seemingly innocuous)
7016 * declaration of the automatic 'val': by looking at the
7017 * code, you might think that it could be declared in the
7018 * action processing loop, below. (That is, it's only used in
7019 * the action processing loop.) However, it must be declared
7020 * out of that scope because in the case of DIF expression
7021 * arguments to aggregating actions, one iteration of the
7022 * action loop will use the last iteration's value.
7023 */
7024 uint64_t val = 0;
7025
7026 mstate.dtms_present = DTRACE_MSTATE_ARGS | DTRACE_MSTATE_PROBE;
7027 mstate.dtms_getf = NULL;
7028
7029 *flags &= ~CPU_DTRACE_ERROR;
7030
7031 if (prov == dtrace_provider) {
7032 /*
7033 * If dtrace itself is the provider of this probe,
7034 * we're only going to continue processing the ECB if
7035 * arg0 (the dtrace_state_t) is equal to the ECB's
7036 * creating state. (This prevents disjoint consumers
7037 * from seeing one another's metaprobes.)
7038 */
7039 if (arg0 != (uint64_t)(uintptr_t)state)
7040 continue;
7041 }
7042
7043 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE) {
7044 /*
7045 * We're not currently active. If our provider isn't
7046 * the dtrace pseudo provider, we're not interested.
7047 */
7048 if (prov != dtrace_provider)
7049 continue;
7050
7051 /*
7052 * Now we must further check if we are in the BEGIN
7053 * probe. If we are, we will only continue processing
7054 * if we're still in WARMUP -- if one BEGIN enabling
7055 * has invoked the exit() action, we don't want to
7056 * evaluate subsequent BEGIN enablings.
7057 */
7058 if (probe->dtpr_id == dtrace_probeid_begin &&
7059 state->dts_activity != DTRACE_ACTIVITY_WARMUP) {
7060 ASSERT(state->dts_activity ==
7061 DTRACE_ACTIVITY_DRAINING);
7062 continue;
7063 }
7064 }
7065
7066 if (ecb->dte_cond) {
7067 /*
7068 * If the dte_cond bits indicate that this
7069 * consumer is only allowed to see user-mode firings
7070 * of this probe, call the provider's dtps_usermode()
7071 * entry point to check that the probe was fired
7072 * while in a user context. Skip this ECB if that's
7073 * not the case.
7074 */
7075 if ((ecb->dte_cond & DTRACE_COND_USERMODE) &&
7076 prov->dtpv_pops.dtps_usermode(prov->dtpv_arg,
7077 probe->dtpr_id, probe->dtpr_arg) == 0)
7078 continue;
7079
7080#if defined(sun)
7081 /*
7082 * This is more subtle than it looks. We have to be
7083 * absolutely certain that CRED() isn't going to
7084 * change out from under us so it's only legit to
7085 * examine that structure if we're in constrained
 7086 * situations. Currently, the only time we'll do this
 7087 * check is if a non-super-user has enabled the
7088 * profile or syscall providers -- providers that
7089 * allow visibility of all processes. For the
7090 * profile case, the check above will ensure that
7091 * we're examining a user context.
7092 */
7093 if (ecb->dte_cond & DTRACE_COND_OWNER) {
7094 cred_t *cr;
7095 cred_t *s_cr =
7096 ecb->dte_state->dts_cred.dcr_cred;
7097 proc_t *proc;
7098
7099 ASSERT(s_cr != NULL);
7100
7101 if ((cr = CRED()) == NULL ||
7102 s_cr->cr_uid != cr->cr_uid ||
7103 s_cr->cr_uid != cr->cr_ruid ||
7104 s_cr->cr_uid != cr->cr_suid ||
7105 s_cr->cr_gid != cr->cr_gid ||
7106 s_cr->cr_gid != cr->cr_rgid ||
7107 s_cr->cr_gid != cr->cr_sgid ||
7108 (proc = ttoproc(curthread)) == NULL ||
7109 (proc->p_flag & SNOCD))
7110 continue;
7111 }
7112
7113 if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) {
7114 cred_t *cr;
7115 cred_t *s_cr =
7116 ecb->dte_state->dts_cred.dcr_cred;
7117
7118 ASSERT(s_cr != NULL);
7119
7120 if ((cr = CRED()) == NULL ||
7121 s_cr->cr_zone->zone_id !=
7122 cr->cr_zone->zone_id)
7123 continue;
7124 }
7125#endif
7126 }
7127
7128 if (now - state->dts_alive > dtrace_deadman_timeout) {
7129 /*
7130 * We seem to be dead. Unless we (a) have kernel
 7131 * destructive permissions, (b) have explicitly enabled
 7132 * destructive actions, and (c) destructive actions have
7133 * not been disabled, we're going to transition into
7134 * the KILLED state, from which no further processing
7135 * on this state will be performed.
7136 */
7137 if (!dtrace_priv_kernel_destructive(state) ||
7138 !state->dts_cred.dcr_destructive ||
7139 dtrace_destructive_disallow) {
7140 void *activity = &state->dts_activity;
7141 dtrace_activity_t current;
7142
7143 do {
7144 current = state->dts_activity;
7145 } while (dtrace_cas32(activity, current,
7146 DTRACE_ACTIVITY_KILLED) != current);
7147
7148 continue;
7149 }
7150 }
7151
7152 if ((offs = dtrace_buffer_reserve(buf, ecb->dte_needed,
7153 ecb->dte_alignment, state, &mstate)) < 0)
7154 continue;
7155
7156 tomax = buf->dtb_tomax;
7157 ASSERT(tomax != NULL);
7158
7159 if (ecb->dte_size != 0) {
7160 dtrace_rechdr_t dtrh;
7161 if (!(mstate.dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
7162 mstate.dtms_timestamp = dtrace_gethrtime();
7163 mstate.dtms_present |= DTRACE_MSTATE_TIMESTAMP;
7164 }
7165 ASSERT3U(ecb->dte_size, >=, sizeof (dtrace_rechdr_t));
7166 dtrh.dtrh_epid = ecb->dte_epid;
7167 DTRACE_RECORD_STORE_TIMESTAMP(&dtrh,
7168 mstate.dtms_timestamp);
7169 *((dtrace_rechdr_t *)(tomax + offs)) = dtrh;
7170 }
7171
7172 mstate.dtms_epid = ecb->dte_epid;
7173 mstate.dtms_present |= DTRACE_MSTATE_EPID;
7174
7175 if (state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)
7176 mstate.dtms_access = DTRACE_ACCESS_KERNEL;
7177 else
7178 mstate.dtms_access = 0;
7179
7180 if (pred != NULL) {
7181 dtrace_difo_t *dp = pred->dtp_difo;
7182 int rval;
7183
7184 rval = dtrace_dif_emulate(dp, &mstate, vstate, state);
7185
7186 if (!(*flags & CPU_DTRACE_ERROR) && !rval) {
7187 dtrace_cacheid_t cid = probe->dtpr_predcache;
7188
7189 if (cid != DTRACE_CACHEIDNONE && !onintr) {
7190 /*
7191 * Update the predicate cache...
7192 */
7193 ASSERT(cid == pred->dtp_cacheid);
7194 curthread->t_predcache = cid;
7195 }
7196
7197 continue;
7198 }
7199 }
7200
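/*
 * A sketch of the predicate-cache contract relied on above: when a
 * predicate is known to evaluate false for a given thread, its cache
 * id is stashed on the thread, and the check at the top of this
 * function can skip the ECB without re-running the DIF.
 */
#if 0
static int
predcache_hit(dtrace_cacheid_t probe_cid, dtrace_cacheid_t thread_cid)
{
	return (probe_cid != DTRACE_CACHEIDNONE && probe_cid == thread_cid);
}
#endif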
7201 for (act = ecb->dte_action; !(*flags & CPU_DTRACE_ERROR) &&
7202 act != NULL; act = act->dta_next) {
7203 size_t valoffs;
7204 dtrace_difo_t *dp;
7205 dtrace_recdesc_t *rec = &act->dta_rec;
7206
7207 size = rec->dtrd_size;
7208 valoffs = offs + rec->dtrd_offset;
7209
7210 if (DTRACEACT_ISAGG(act->dta_kind)) {
7211 uint64_t v = 0xbad;
7212 dtrace_aggregation_t *agg;
7213
7214 agg = (dtrace_aggregation_t *)act;
7215
7216 if ((dp = act->dta_difo) != NULL)
7217 v = dtrace_dif_emulate(dp,
7218 &mstate, vstate, state);
7219
7220 if (*flags & CPU_DTRACE_ERROR)
7221 continue;
7222
7223 /*
7224 * Note that we always pass the expression
7225 * value from the previous iteration of the
7226 * action loop. This value will only be used
7227 * if there is an expression argument to the
7228 * aggregating action, denoted by the
7229 * dtag_hasarg field.
7230 */
7231 dtrace_aggregate(agg, buf,
7232 offs, aggbuf, v, val);
7233 continue;
7234 }
7235
7236 switch (act->dta_kind) {
7237 case DTRACEACT_STOP:
7238 if (dtrace_priv_proc_destructive(state))
7239 dtrace_action_stop();
7240 continue;
7241
7242 case DTRACEACT_BREAKPOINT:
7243 if (dtrace_priv_kernel_destructive(state))
7244 dtrace_action_breakpoint(ecb);
7245 continue;
7246
7247 case DTRACEACT_PANIC:
7248 if (dtrace_priv_kernel_destructive(state))
7249 dtrace_action_panic(ecb);
7250 continue;
7251
7252 case DTRACEACT_STACK:
7253 if (!dtrace_priv_kernel(state))
7254 continue;
7255
7256 dtrace_getpcstack((pc_t *)(tomax + valoffs),
7257 size / sizeof (pc_t), probe->dtpr_aframes,
7258 DTRACE_ANCHORED(probe) ? NULL :
7259 (uint32_t *)arg0);
7260 continue;
7261
7262 case DTRACEACT_JSTACK:
7263 case DTRACEACT_USTACK:
7264 if (!dtrace_priv_proc(state))
7265 continue;
7266
7267 /*
7268 * See comment in DIF_VAR_PID.
7269 */
7270 if (DTRACE_ANCHORED(mstate.dtms_probe) &&
7271 CPU_ON_INTR(CPU)) {
7272 int depth = DTRACE_USTACK_NFRAMES(
7273 rec->dtrd_arg) + 1;
7274
7275 dtrace_bzero((void *)(tomax + valoffs),
7276 DTRACE_USTACK_STRSIZE(rec->dtrd_arg)
7277 + depth * sizeof (uint64_t));
7278
7279 continue;
7280 }
7281
7282 if (DTRACE_USTACK_STRSIZE(rec->dtrd_arg) != 0 &&
7283 curproc->p_dtrace_helpers != NULL) {
7284 /*
7285 * This is the slow path -- we have
7286 * allocated string space, and we're
7287 * getting the stack of a process that
7288 * has helpers. Call into a separate
7289 * routine to perform this processing.
7290 */
7291 dtrace_action_ustack(&mstate, state,
7292 (uint64_t *)(tomax + valoffs),
7293 rec->dtrd_arg);
7294 continue;
7295 }
7296
7297 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
7298 dtrace_getupcstack((uint64_t *)
7299 (tomax + valoffs),
7300 DTRACE_USTACK_NFRAMES(rec->dtrd_arg) + 1);
7301 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
7302 continue;
7303
7304 default:
7305 break;
7306 }
7307
7308 dp = act->dta_difo;
7309 ASSERT(dp != NULL);
7310
7311 val = dtrace_dif_emulate(dp, &mstate, vstate, state);
7312
7313 if (*flags & CPU_DTRACE_ERROR)
7314 continue;
7315
7316 switch (act->dta_kind) {
7317 case DTRACEACT_SPECULATE: {
7318 dtrace_rechdr_t *dtrh;
7319
7320 ASSERT(buf == &state->dts_buffer[cpuid]);
7321 buf = dtrace_speculation_buffer(state,
7322 cpuid, val);
7323
7324 if (buf == NULL) {
7325 *flags |= CPU_DTRACE_DROP;
7326 continue;
7327 }
7328
7329 offs = dtrace_buffer_reserve(buf,
7330 ecb->dte_needed, ecb->dte_alignment,
7331 state, NULL);
7332
7333 if (offs < 0) {
7334 *flags |= CPU_DTRACE_DROP;
7335 continue;
7336 }
7337
7338 tomax = buf->dtb_tomax;
7339 ASSERT(tomax != NULL);
7340
7341 if (ecb->dte_size == 0)
7342 continue;
7343
7344 ASSERT3U(ecb->dte_size, >=,
7345 sizeof (dtrace_rechdr_t));
7346 dtrh = ((void *)(tomax + offs));
7347 dtrh->dtrh_epid = ecb->dte_epid;
7348 /*
7349 * When the speculation is committed, all of
7350 * the records in the speculative buffer will
7351 * have their timestamps set to the commit
7352 * time. Until then, it is set to a sentinel
7353 * value, for debuggability.
7354 */
7355 DTRACE_RECORD_STORE_TIMESTAMP(dtrh, UINT64_MAX);
7356 continue;
7357 }
7358
7359 case DTRACEACT_PRINTM: {
7360 /* The DIF returns a 'memref'. */
7361 uintptr_t *memref = (uintptr_t *)(uintptr_t) val;
7362
7363 /* Get the size from the memref. */
7364 size = memref[1];
7365
7366 /*
7367 * Check if the size exceeds the allocated
7368 * buffer size.
7369 */
7370 if (size + sizeof(uintptr_t) > dp->dtdo_rtype.dtdt_size) {
7371 /* Flag a drop! */
7372 *flags |= CPU_DTRACE_DROP;
7373 continue;
7374 }
7375
7376 /* Store the size in the buffer first. */
7377 DTRACE_STORE(uintptr_t, tomax,
7378 valoffs, size);
7379
7380 /*
7381 * Offset the buffer address to the start
7382 * of the data.
7383 */
7384 valoffs += sizeof(uintptr_t);
7385
7386 /*
7387 * Reset to the memory address rather than
7388 * the memref array, then let the BYREF
7389 * code below do the work to store the
7390 * memory data in the buffer.
7391 */
7392 val = memref[0];
7393 break;
7394 }
7395
7396 case DTRACEACT_PRINTT: {
7397 /* The DIF returns a 'typeref'. */
7398 uintptr_t *typeref = (uintptr_t *)(uintptr_t) val;
7399 char c = '\0' + 1;
7400 size_t s;
7401
7402 /*
7403 * Get the type string length and round it
7404 * up so that the data that follows is
7405 * aligned for easy access.
7406 */
7407 size_t typs = strlen((char *) typeref[2]) + 1;
7408 typs = roundup(typs, sizeof(uintptr_t));
7409
7410 /*
7411 * Get the size from the typeref using the
7412 * number of elements and the type size.
7413 */
7414 size = typeref[1] * typeref[3];
7415
7416 /*
7417 * Check if the size exceeds the allocated
7418 * buffer size.
7419 */
7420 if (size + typs + 2 * sizeof(uintptr_t) > dp->dtdo_rtype.dtdt_size) {
7421 /* Flag a drop! */
7422 *flags |= CPU_DTRACE_DROP;
7423 continue;
7424 }
7425
7426 /* Store the size in the buffer first. */
7427 DTRACE_STORE(uintptr_t, tomax,
7428 valoffs, size);
7429 valoffs += sizeof(uintptr_t);
7430
7431 /* Store the type size in the buffer. */
7432 DTRACE_STORE(uintptr_t, tomax,
7433 valoffs, typeref[3]);
7434 valoffs += sizeof(uintptr_t);
7435
7436 val = typeref[2];
7437
7438 for (s = 0; s < typs; s++) {
7439 if (c != '\0')
7440 c = dtrace_load8(val++);
7441
7442 DTRACE_STORE(uint8_t, tomax,
7443 valoffs++, c);
7444 }
7445
7446 /*
7447 * Reset to the memory address rather than
7448 * the typeref array, then let the BYREF
7449 * code below do the work to store the
7450 * memory data in the buffer.
7451 */
7452 val = typeref[0];
7453 break;
7454 }
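/*
 * Summary of the printt record layout built above (an editorial aid,
 * derived from the stores in this case):
 *
 *	[ total data size (uintptr_t) ]
 *	[ element type size (uintptr_t) ]
 *	[ type string, NUL-padded to 'typs' bytes ]
 *	[ data bytes, stored by the BYREF path below ]
 */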
7455
7456 case DTRACEACT_CHILL:
7457 if (dtrace_priv_kernel_destructive(state))
7458 dtrace_action_chill(&mstate, val);
7459 continue;
7460
7461 case DTRACEACT_RAISE:
7462 if (dtrace_priv_proc_destructive(state))
7463 dtrace_action_raise(val);
7464 continue;
7465
7466 case DTRACEACT_COMMIT:
7467 ASSERT(!committed);
7468
7469 /*
7470 * We need to commit our buffer state.
7471 */
7472 if (ecb->dte_size)
7473 buf->dtb_offset = offs + ecb->dte_size;
7474 buf = &state->dts_buffer[cpuid];
7475 dtrace_speculation_commit(state, cpuid, val);
7476 committed = 1;
7477 continue;
7478
7479 case DTRACEACT_DISCARD:
7480 dtrace_speculation_discard(state, cpuid, val);
7481 continue;
7482
7483 case DTRACEACT_DIFEXPR:
7484 case DTRACEACT_LIBACT:
7485 case DTRACEACT_PRINTF:
7486 case DTRACEACT_PRINTA:
7487 case DTRACEACT_SYSTEM:
7488 case DTRACEACT_FREOPEN:
7489 case DTRACEACT_TRACEMEM:
7490 break;
7491
7492 case DTRACEACT_TRACEMEM_DYNSIZE:
7493 tracememsize = val;
7494 break;
7495
7496 case DTRACEACT_SYM:
7497 case DTRACEACT_MOD:
7498 if (!dtrace_priv_kernel(state))
7499 continue;
7500 break;
7501
7502 case DTRACEACT_USYM:
7503 case DTRACEACT_UMOD:
7504 case DTRACEACT_UADDR: {
7505#if defined(sun)
7506 struct pid *pid = curthread->t_procp->p_pidp;
7507#endif
7508
7509 if (!dtrace_priv_proc(state))
7510 continue;
7511
7512 DTRACE_STORE(uint64_t, tomax,
7513#if defined(sun)
7514 valoffs, (uint64_t)pid->pid_id);
7515#else
7516 valoffs, (uint64_t) curproc->p_pid);
7517#endif
7518 DTRACE_STORE(uint64_t, tomax,
7519 valoffs + sizeof (uint64_t), val);
7520
7521 continue;
7522 }
7523
7524 case DTRACEACT_EXIT: {
7525 /*
7526 * For the exit action, we are going to attempt
7527 * to atomically set our activity to be
7528 * draining. If this fails (either because
7529 * another CPU has beat us to the exit action,
7530 * or because our current activity is something
7531 * other than ACTIVE or WARMUP), we will
7532 * continue. This assures that the exit action
7533 * can be successfully recorded at most once
7534 * when we're in the ACTIVE state. If we're
7535 * encountering the exit() action while in
7536 * COOLDOWN, however, we want to honor the new
7537 * status code. (We know that we're the only
7538 * thread in COOLDOWN, so there is no race.)
7539 */
7540 void *activity = &state->dts_activity;
7541 dtrace_activity_t current = state->dts_activity;
7542
7543 if (current == DTRACE_ACTIVITY_COOLDOWN)
7544 break;
7545
7546 if (current != DTRACE_ACTIVITY_WARMUP)
7547 current = DTRACE_ACTIVITY_ACTIVE;
7548
7549 if (dtrace_cas32(activity, current,
7550 DTRACE_ACTIVITY_DRAINING) != current) {
7551 *flags |= CPU_DTRACE_DROP;
7552 continue;
7553 }
7554
7555 break;
7556 }
7557
7558 default:
7559 ASSERT(0);
7560 }
7561
7562 if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF ||
7563 dp->dtdo_rtype.dtdt_flags & DIF_TF_BYUREF) {
7564 uintptr_t end = valoffs + size;
7565
7566 if (tracememsize != 0 &&
7567 valoffs + tracememsize < end) {
7568 end = valoffs + tracememsize;
7569 tracememsize = 0;
7570 }
7571
7572 if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF &&
7573 !dtrace_vcanload((void *)(uintptr_t)val,
7574 &dp->dtdo_rtype, &mstate, vstate))
7575 continue;
7576
7577 dtrace_store_by_ref(dp, tomax, size, &valoffs,
7578 &val, end, act->dta_intuple,
7579 dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF ?
7580 DIF_TF_BYREF : DIF_TF_BYUREF);
6716 continue;
6717 }
6718
6719 switch (size) {
6720 case 0:
6721 break;
6722
6723 case sizeof (uint8_t):
6724 DTRACE_STORE(uint8_t, tomax, valoffs, val);
6725 break;
6726 case sizeof (uint16_t):
6727 DTRACE_STORE(uint16_t, tomax, valoffs, val);
6728 break;
6729 case sizeof (uint32_t):
6730 DTRACE_STORE(uint32_t, tomax, valoffs, val);
6731 break;
6732 case sizeof (uint64_t):
6733 DTRACE_STORE(uint64_t, tomax, valoffs, val);
6734 break;
6735 default:
6736 /*
6737 * Any other size should have been returned by
6738 * reference, not by value.
6739 */
6740 ASSERT(0);
6741 break;
6742 }
6743 }
6744
6745 if (*flags & CPU_DTRACE_DROP)
6746 continue;
6747
6748 if (*flags & CPU_DTRACE_FAULT) {
6749 int ndx;
6750 dtrace_action_t *err;
6751
6752 buf->dtb_errors++;
6753
6754 if (probe->dtpr_id == dtrace_probeid_error) {
6755 /*
6756 * There's nothing we can do -- we had an
6757 * error on the error probe. We bump an
6758 * error counter to at least indicate that
6759 * this condition happened.
6760 */
6761 dtrace_error(&state->dts_dblerrors);
6762 continue;
6763 }
6764
6765 if (vtime) {
6766 /*
6767 * Before recursing on dtrace_probe(), we
6768 * need to explicitly clear out our start
6769 * time to prevent it from being accumulated
6770 * into t_dtrace_vtime.
6771 */
6772 curthread->t_dtrace_start = 0;
6773 }
6774
6775 /*
6776 * Iterate over the actions to figure out which action
6777 * we were processing when we experienced the error.
6778 * Note that act points _past_ the faulting action; if
6779 * act is ecb->dte_action, the fault was in the
6780 * predicate; if it's ecb->dte_action->dta_next, it's
6781 * in action #1, and so on.
6782 */
6783 for (err = ecb->dte_action, ndx = 0;
6784 err != act; err = err->dta_next, ndx++)
6785 continue;
6786
6787 dtrace_probe_error(state, ecb->dte_epid, ndx,
6788 (mstate.dtms_present & DTRACE_MSTATE_FLTOFFS) ?
6789 mstate.dtms_fltoffs : -1, DTRACE_FLAGS2FLT(*flags),
6790 cpu_core[cpuid].cpuc_dtrace_illval);
6791
6792 continue;
6793 }
6794
6795 if (!committed)
6796 buf->dtb_offset = offs + ecb->dte_size;
6797 }
6798
6799 if (vtime)
6800 curthread->t_dtrace_start = dtrace_gethrtime();
6801
6802 dtrace_interrupt_enable(cookie);
6803}
6804
6805/*
6806 * DTrace Probe Hashing Functions
6807 *
6808 * The functions in this section (and indeed, the functions in remaining
6809 * sections) are not _called_ from probe context. (Any exceptions to this are
6810 * marked with a "Note:".) Rather, they are called from elsewhere in the
6811 * DTrace framework to look up probes in, add probes to, and remove probes from
6812 * the DTrace probe hashes. (Each probe is hashed by each element of the
6813 * probe tuple -- allowing for fast lookups, regardless of what was
6814 * specified.)
6815 */
6816static uint_t
6817dtrace_hash_str(const char *p)
6818{
6819 unsigned int g;
6820 uint_t hval = 0;
6821
6822 while (*p) {
6823 hval = (hval << 4) + *p++;
6824 if ((g = (hval & 0xf0000000)) != 0)
6825 hval ^= g >> 24;
6826 hval &= ~g;
6827 }
6828 return (hval);
6829}
6830
6831static dtrace_hash_t *
6832dtrace_hash_create(uintptr_t stroffs, uintptr_t nextoffs, uintptr_t prevoffs)
6833{
6834 dtrace_hash_t *hash = kmem_zalloc(sizeof (dtrace_hash_t), KM_SLEEP);
6835
6836 hash->dth_stroffs = stroffs;
6837 hash->dth_nextoffs = nextoffs;
6838 hash->dth_prevoffs = prevoffs;
6839
6840 hash->dth_size = 1;
6841 hash->dth_mask = hash->dth_size - 1;
6842
6843 hash->dth_tab = kmem_zalloc(hash->dth_size *
6844 sizeof (dtrace_hashbucket_t *), KM_SLEEP);
6845
6846 return (hash);
6847}
6848
6849static void
6850dtrace_hash_destroy(dtrace_hash_t *hash)
6851{
6852#ifdef DEBUG
6853 int i;
6854
6855 for (i = 0; i < hash->dth_size; i++)
6856 ASSERT(hash->dth_tab[i] == NULL);
6857#endif
6858
6859 kmem_free(hash->dth_tab,
6860 hash->dth_size * sizeof (dtrace_hashbucket_t *));
6861 kmem_free(hash, sizeof (dtrace_hash_t));
6862}
6863
6864static void
6865dtrace_hash_resize(dtrace_hash_t *hash)
6866{
6867 int size = hash->dth_size, i, ndx;
6868 int new_size = hash->dth_size << 1;
6869 int new_mask = new_size - 1;
6870 dtrace_hashbucket_t **new_tab, *bucket, *next;
6871
6872 ASSERT((new_size & new_mask) == 0);
6873
6874 new_tab = kmem_zalloc(new_size * sizeof (void *), KM_SLEEP);
6875
6876 for (i = 0; i < size; i++) {
6877 for (bucket = hash->dth_tab[i]; bucket != NULL; bucket = next) {
6878 dtrace_probe_t *probe = bucket->dthb_chain;
6879
6880 ASSERT(probe != NULL);
6881 ndx = DTRACE_HASHSTR(hash, probe) & new_mask;
6882
6883 next = bucket->dthb_next;
6884 bucket->dthb_next = new_tab[ndx];
6885 new_tab[ndx] = bucket;
6886 }
6887 }
6888
6889 kmem_free(hash->dth_tab, hash->dth_size * sizeof (void *));
6890 hash->dth_tab = new_tab;
6891 hash->dth_size = new_size;
6892 hash->dth_mask = new_mask;
6893}
6894
6895static void
6896dtrace_hash_add(dtrace_hash_t *hash, dtrace_probe_t *new)
6897{
6898 int hashval = DTRACE_HASHSTR(hash, new);
6899 int ndx = hashval & hash->dth_mask;
6900 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
6901 dtrace_probe_t **nextp, **prevp;
6902
6903 for (; bucket != NULL; bucket = bucket->dthb_next) {
6904 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, new))
6905 goto add;
6906 }
6907
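	/*
	 * dth_nbuckets counts distinct keys; grow the table once there
	 * are more than twice as many keys as hash slots.
	 */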
6908 if ((hash->dth_nbuckets >> 1) > hash->dth_size) {
6909 dtrace_hash_resize(hash);
6910 dtrace_hash_add(hash, new);
6911 return;
6912 }
6913
6914 bucket = kmem_zalloc(sizeof (dtrace_hashbucket_t), KM_SLEEP);
6915 bucket->dthb_next = hash->dth_tab[ndx];
6916 hash->dth_tab[ndx] = bucket;
6917 hash->dth_nbuckets++;
6918
6919add:
6920 nextp = DTRACE_HASHNEXT(hash, new);
6921 ASSERT(*nextp == NULL && *(DTRACE_HASHPREV(hash, new)) == NULL);
6922 *nextp = bucket->dthb_chain;
6923
6924 if (bucket->dthb_chain != NULL) {
6925 prevp = DTRACE_HASHPREV(hash, bucket->dthb_chain);
6926 ASSERT(*prevp == NULL);
6927 *prevp = new;
6928 }
6929
6930 bucket->dthb_chain = new;
6931 bucket->dthb_len++;
6932}
6933
6934static dtrace_probe_t *
6935dtrace_hash_lookup(dtrace_hash_t *hash, dtrace_probe_t *template)
6936{
6937 int hashval = DTRACE_HASHSTR(hash, template);
6938 int ndx = hashval & hash->dth_mask;
6939 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
6940
6941 for (; bucket != NULL; bucket = bucket->dthb_next) {
6942 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
6943 return (bucket->dthb_chain);
6944 }
6945
6946 return (NULL);
6947}
6948
6949static int
6950dtrace_hash_collisions(dtrace_hash_t *hash, dtrace_probe_t *template)
6951{
6952 int hashval = DTRACE_HASHSTR(hash, template);
6953 int ndx = hashval & hash->dth_mask;
6954 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
6955
6956 for (; bucket != NULL; bucket = bucket->dthb_next) {
6957 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
6958 return (bucket->dthb_len);
6959 }
6960
6961 return (0);
6962}
6963
6964static void
6965dtrace_hash_remove(dtrace_hash_t *hash, dtrace_probe_t *probe)
6966{
6967 int ndx = DTRACE_HASHSTR(hash, probe) & hash->dth_mask;
6968 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
6969
6970 dtrace_probe_t **prevp = DTRACE_HASHPREV(hash, probe);
6971 dtrace_probe_t **nextp = DTRACE_HASHNEXT(hash, probe);
6972
6973 /*
6974 * Find the bucket that we're removing this probe from.
6975 */
6976 for (; bucket != NULL; bucket = bucket->dthb_next) {
6977 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, probe))
6978 break;
6979 }
6980
6981 ASSERT(bucket != NULL);
6982
6983 if (*prevp == NULL) {
6984 if (*nextp == NULL) {
6985 /*
6986 * The removed probe was the only probe on this
6987 * bucket; we need to remove the bucket.
6988 */
6989 dtrace_hashbucket_t *b = hash->dth_tab[ndx];
6990
6991 ASSERT(bucket->dthb_chain == probe);
6992 ASSERT(b != NULL);
6993
6994 if (b == bucket) {
6995 hash->dth_tab[ndx] = bucket->dthb_next;
6996 } else {
6997 while (b->dthb_next != bucket)
6998 b = b->dthb_next;
6999 b->dthb_next = bucket->dthb_next;
7000 }
7001
7002 ASSERT(hash->dth_nbuckets > 0);
7003 hash->dth_nbuckets--;
7004 kmem_free(bucket, sizeof (dtrace_hashbucket_t));
7005 return;
7006 }
7007
7008 bucket->dthb_chain = *nextp;
7009 } else {
7010 *(DTRACE_HASHNEXT(hash, *prevp)) = *nextp;
7011 }
7012
7013 if (*nextp != NULL)
7014 *(DTRACE_HASHPREV(hash, *nextp)) = *prevp;
7015}
7016
7017/*
7018 * DTrace Utility Functions
7019 *
7020 * These are random utility functions that are _not_ called from probe context.
7021 */
7022static int
7023dtrace_badattr(const dtrace_attribute_t *a)
7024{
7025 return (a->dtat_name > DTRACE_STABILITY_MAX ||
7026 a->dtat_data > DTRACE_STABILITY_MAX ||
7027 a->dtat_class > DTRACE_CLASS_MAX);
7028}
7029
7030/*
7031 * Return a copy of a string. If the specified string is NULL,
7032 * this function returns a zero-length string.
7033 */
7034static char *
7035dtrace_strdup(const char *str)
7036{
7037 char *new = kmem_zalloc((str != NULL ? strlen(str) : 0) + 1, KM_SLEEP);
7038
7039 if (str != NULL)
7040 (void) strcpy(new, str);
7041
7042 return (new);
7043}
7044
7045#define DTRACE_ISALPHA(c) \
7046 (((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z'))
7047
7048static int
7049dtrace_badname(const char *s)
7050{
7051 char c;
7052
7053 if (s == NULL || (c = *s++) == '\0')
7054 return (0);
7055
7056 if (!DTRACE_ISALPHA(c) && c != '-' && c != '_' && c != '.')
7057 return (1);
7058
7059 while ((c = *s++) != '\0') {
7060 if (!DTRACE_ISALPHA(c) && (c < '0' || c > '9') &&
7061 c != '-' && c != '_' && c != '.' && c != '`')
7062 return (1);
7063 }
7064
7065 return (0);
7066}
7067
7068static void
7069dtrace_cred2priv(cred_t *cr, uint32_t *privp, uid_t *uidp, zoneid_t *zoneidp)
7070{
7071 uint32_t priv;
7072
7073#if defined(sun)
7074 if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
7075 /*
7076 * For DTRACE_PRIV_ALL, the uid and zoneid don't matter.
7077 */
7078 priv = DTRACE_PRIV_ALL;
7079 } else {
7080 *uidp = crgetuid(cr);
7081 *zoneidp = crgetzoneid(cr);
7082
7083 priv = 0;
7084 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE))
7085 priv |= DTRACE_PRIV_KERNEL | DTRACE_PRIV_USER;
7086 else if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE))
7087 priv |= DTRACE_PRIV_USER;
7088 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE))
7089 priv |= DTRACE_PRIV_PROC;
7090 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
7091 priv |= DTRACE_PRIV_OWNER;
7092 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
7093 priv |= DTRACE_PRIV_ZONEOWNER;
7094 }
7095#else
7096 priv = DTRACE_PRIV_ALL;
7097#endif
7098
7099 *privp = priv;
7100}
7101
7102#ifdef DTRACE_ERRDEBUG
7103static void
7104dtrace_errdebug(const char *str)
7105{
7106 int hval = dtrace_hash_str(str) % DTRACE_ERRHASHSZ;
7107 int occupied = 0;
7108
7109 mutex_enter(&dtrace_errlock);
7110 dtrace_errlast = str;
7111 dtrace_errthread = curthread;
7112
7113 while (occupied++ < DTRACE_ERRHASHSZ) {
7114 if (dtrace_errhash[hval].dter_msg == str) {
7115 dtrace_errhash[hval].dter_count++;
7116 goto out;
7117 }
7118
7119 if (dtrace_errhash[hval].dter_msg != NULL) {
7120 hval = (hval + 1) % DTRACE_ERRHASHSZ;
7121 continue;
7122 }
7123
7124 dtrace_errhash[hval].dter_msg = str;
7125 dtrace_errhash[hval].dter_count = 1;
7126 goto out;
7127 }
7128
7129 panic("dtrace: undersized error hash");
7130out:
7131 mutex_exit(&dtrace_errlock);
7132}
7133#endif
7134
7135/*
7136 * DTrace Matching Functions
7137 *
7138 * These functions are used to match groups of probes, given some elements of
7139 * a probe tuple, or some globbed expressions for elements of a probe tuple.
7140 */
7141static int
7142dtrace_match_priv(const dtrace_probe_t *prp, uint32_t priv, uid_t uid,
7143 zoneid_t zoneid)
7144{
7145 if (priv != DTRACE_PRIV_ALL) {
7146 uint32_t ppriv = prp->dtpr_provider->dtpv_priv.dtpp_flags;
7147 uint32_t match = priv & ppriv;
7148
7149 /*
7150 * No PRIV_DTRACE_* privileges...
7151 */
7152 if ((priv & (DTRACE_PRIV_PROC | DTRACE_PRIV_USER |
7153 DTRACE_PRIV_KERNEL)) == 0)
7154 return (0);
7155
7156 /*
7157 * No matching bits, but there were bits to match...
7158 */
7159 if (match == 0 && ppriv != 0)
7160 return (0);
7161
7162 /*
7163 * Need to have permissions to the process, but don't...
7164 */
7165 if (((ppriv & ~match) & DTRACE_PRIV_OWNER) != 0 &&
7166 uid != prp->dtpr_provider->dtpv_priv.dtpp_uid) {
7167 return (0);
7168 }
7169
7170 /*
7171 * Need to be in the same zone unless we possess the
7172 * privilege to examine all zones.
7173 */
7174 if (((ppriv & ~match) & DTRACE_PRIV_ZONEOWNER) != 0 &&
7175 zoneid != prp->dtpr_provider->dtpv_priv.dtpp_zoneid) {
7176 return (0);
7177 }
7178 }
7179
7180 return (1);
7181}
7182
7183/*
7184 * dtrace_match_probe compares a dtrace_probe_t to a pre-compiled key, which
7185 * consists of input pattern strings and an ops-vector to evaluate them.
7186 * This function returns >0 for match, 0 for no match, and <0 for error.
7187 */
7188static int
7189dtrace_match_probe(const dtrace_probe_t *prp, const dtrace_probekey_t *pkp,
7190 uint32_t priv, uid_t uid, zoneid_t zoneid)
7191{
7192 dtrace_provider_t *pvp = prp->dtpr_provider;
7193 int rv;
7194
7195 if (pvp->dtpv_defunct)
7196 return (0);
7197
7198 if ((rv = pkp->dtpk_pmatch(pvp->dtpv_name, pkp->dtpk_prov, 0)) <= 0)
7199 return (rv);
7200
7201 if ((rv = pkp->dtpk_mmatch(prp->dtpr_mod, pkp->dtpk_mod, 0)) <= 0)
7202 return (rv);
7203
7204 if ((rv = pkp->dtpk_fmatch(prp->dtpr_func, pkp->dtpk_func, 0)) <= 0)
7205 return (rv);
7206
7207 if ((rv = pkp->dtpk_nmatch(prp->dtpr_name, pkp->dtpk_name, 0)) <= 0)
7208 return (rv);
7209
7210 if (dtrace_match_priv(prp, priv, uid, zoneid) == 0)
7211 return (0);
7212
7213 return (rv);
7214}
7215
7216/*
7217 * dtrace_match_glob() is a safe kernel implementation of the gmatch(3GEN)
7218 * interface for matching a glob pattern 'p' to an input string 's'. Unlike
7219 * libc's version, the kernel version only applies to 8-bit ASCII strings.
7220 * In addition, all of the recursion cases except for '*' matching have been
7221 * unwound. For '*', we still implement recursive evaluation, but a depth
7222 * counter is maintained and matching is aborted if we recurse too deep.
7223 * The function returns 0 if no match, >0 if match, and <0 if recursion error.
7224 */
7225static int
7226dtrace_match_glob(const char *s, const char *p, int depth)
7227{
7228 const char *olds;
7229 char s1, c;
7230 int gs;
7231
7232 if (depth > DTRACE_PROBEKEY_MAXDEPTH)
7233 return (-1);
7234
7235 if (s == NULL)
7236 s = ""; /* treat NULL as empty string */
7237
7238top:
7239 olds = s;
7240 s1 = *s++;
7241
7242 if (p == NULL)
7243 return (0);
7244
7245 if ((c = *p++) == '\0')
7246 return (s1 == '\0');
7247
7248 switch (c) {
7249 case '[': {
7250 int ok = 0, notflag = 0;
7251 char lc = '\0';
7252
7253 if (s1 == '\0')
7254 return (0);
7255
7256 if (*p == '!') {
7257 notflag = 1;
7258 p++;
7259 }
7260
7261 if ((c = *p++) == '\0')
7262 return (0);
7263
7264 do {
7265 if (c == '-' && lc != '\0' && *p != ']') {
7266 if ((c = *p++) == '\0')
7267 return (0);
7268 if (c == '\\' && (c = *p++) == '\0')
7269 return (0);
7270
7271 if (notflag) {
7272 if (s1 < lc || s1 > c)
7273 ok++;
7274 else
7275 return (0);
7276 } else if (lc <= s1 && s1 <= c)
7277 ok++;
7278
7279 } else if (c == '\\' && (c = *p++) == '\0')
7280 return (0);
7281
7282 lc = c; /* save left-hand 'c' for next iteration */
7283
7284 if (notflag) {
7285 if (s1 != c)
7286 ok++;
7287 else
7288 return (0);
7289 } else if (s1 == c)
7290 ok++;
7291
7292 if ((c = *p++) == '\0')
7293 return (0);
7294
7295 } while (c != ']');
7296
7297 if (ok)
7298 goto top;
7299
7300 return (0);
7301 }
7302
7303 case '\\':
7304 if ((c = *p++) == '\0')
7305 return (0);
7306 /*FALLTHRU*/
7307
7308 default:
7309 if (c != s1)
7310 return (0);
7311 /*FALLTHRU*/
7312
7313 case '?':
7314 if (s1 != '\0')
7315 goto top;
7316 return (0);
7317
7318 case '*':
7319 while (*p == '*')
7320 p++; /* consecutive *'s are identical to a single one */
7321
7322 if (*p == '\0')
7323 return (1);
7324
7325 for (s = olds; *s != '\0'; s++) {
7326 if ((gs = dtrace_match_glob(s, p, depth + 1)) != 0)
7327 return (gs);
7328 }
7329
7330 return (0);
7331 }
7332}
7333
7334/*ARGSUSED*/
7335static int
7336dtrace_match_string(const char *s, const char *p, int depth)
7337{
7338 return (s != NULL && strcmp(s, p) == 0);
7339}
7340
7341/*ARGSUSED*/
7342static int
7343dtrace_match_nul(const char *s, const char *p, int depth)
7344{
7345 return (1); /* always match the empty pattern */
7346}
7347
7348/*ARGSUSED*/
7349static int
7350dtrace_match_nonzero(const char *s, const char *p, int depth)
7351{
7352 return (s != NULL && s[0] != '\0');
7353}
7354
7355static int
7356dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid,
7357 zoneid_t zoneid, int (*matched)(dtrace_probe_t *, void *), void *arg)
7358{
7359 dtrace_probe_t template, *probe;
7360 dtrace_hash_t *hash = NULL;
7361 int len, best = INT_MAX, nmatched = 0;
7362 dtrace_id_t i;
7363
7364 ASSERT(MUTEX_HELD(&dtrace_lock));
7365
7366 /*
7367 * If the probe ID is specified in the key, just lookup by ID and
7368 * invoke the match callback once if a matching probe is found.
7369 */
7370 if (pkp->dtpk_id != DTRACE_IDNONE) {
7371 if ((probe = dtrace_probe_lookup_id(pkp->dtpk_id)) != NULL &&
7372 dtrace_match_probe(probe, pkp, priv, uid, zoneid) > 0) {
7373 (void) (*matched)(probe, arg);
7374 nmatched++;
7375 }
7376 return (nmatched);
7377 }
7378
7379 template.dtpr_mod = (char *)pkp->dtpk_mod;
7380 template.dtpr_func = (char *)pkp->dtpk_func;
7381 template.dtpr_name = (char *)pkp->dtpk_name;
7382
7383 /*
7384 * We want to find the most distinct of the module name, function
7385 * name, and name. So for each one that is not a glob pattern or
7386 * empty string, we perform a lookup in the corresponding hash and
7387 * use the hash table with the fewest collisions to do our search.
7388 */
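	/*
	 * For example (editorial illustration): for the description
	 * fbt:kernel:malloc:entry, all three of "kernel", "malloc" and
	 * "entry" are non-glob strings; if dtrace_byfunc has the fewest
	 * collisions for "malloc", only that hash chain is walked below.
	 */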
7389 if (pkp->dtpk_mmatch == &dtrace_match_string &&
7390 (len = dtrace_hash_collisions(dtrace_bymod, &template)) < best) {
7391 best = len;
7392 hash = dtrace_bymod;
7393 }
7394
7395 if (pkp->dtpk_fmatch == &dtrace_match_string &&
7396 (len = dtrace_hash_collisions(dtrace_byfunc, &template)) < best) {
7397 best = len;
7398 hash = dtrace_byfunc;
7399 }
7400
7401 if (pkp->dtpk_nmatch == &dtrace_match_string &&
7402 (len = dtrace_hash_collisions(dtrace_byname, &template)) < best) {
7403 best = len;
7404 hash = dtrace_byname;
7405 }
7406
7407 /*
7408 * If we did not select a hash table, iterate over every probe and
7409 * invoke our callback for each one that matches our input probe key.
7410 */
7411 if (hash == NULL) {
7412 for (i = 0; i < dtrace_nprobes; i++) {
7413 if ((probe = dtrace_probes[i]) == NULL ||
7414 dtrace_match_probe(probe, pkp, priv, uid,
7415 zoneid) <= 0)
7416 continue;
7417
7418 nmatched++;
7419
7420 if ((*matched)(probe, arg) != DTRACE_MATCH_NEXT)
7421 break;
7422 }
7423
7424 return (nmatched);
7425 }
7426
7427 /*
7428 * If we selected a hash table, iterate over each probe of the same key
7429 * name and invoke the callback for every probe that matches the other
7430 * attributes of our input probe key.
7431 */
7432 for (probe = dtrace_hash_lookup(hash, &template); probe != NULL;
7433 probe = *(DTRACE_HASHNEXT(hash, probe))) {
7434
7435 if (dtrace_match_probe(probe, pkp, priv, uid, zoneid) <= 0)
7436 continue;
7437
7438 nmatched++;
7439
7440 if ((*matched)(probe, arg) != DTRACE_MATCH_NEXT)
7441 break;
7442 }
7443
7444 return (nmatched);
7445}
7446
7447/*
7448 * Return the function pointer dtrace_probecmp() should use to compare the
7449 * specified pattern with a string. For NULL or empty patterns, we select
7450 * dtrace_match_nul(). For glob pattern strings, we use dtrace_match_glob().
7451 * For non-empty non-glob strings, we use dtrace_match_string().
7452 */
7453static dtrace_probekey_f *
7454dtrace_probekey_func(const char *p)
7455{
7456 char c;
7457
7458 if (p == NULL || *p == '\0')
7459 return (&dtrace_match_nul);
7460
7461 while ((c = *p++) != '\0') {
7462 if (c == '[' || c == '?' || c == '*' || c == '\\')
7463 return (&dtrace_match_glob);
7464 }
7465
7466 return (&dtrace_match_string);
7467}
7468
7469/*
7470 * Build a probe comparison key for use with dtrace_match_probe() from the
7471 * given probe description. By convention, a null key only matches anchored
7472 * probes: if each field is the empty string, reset dtpk_fmatch to
7473 * dtrace_match_nonzero().
7474 */
7475static void
7476dtrace_probekey(dtrace_probedesc_t *pdp, dtrace_probekey_t *pkp)
7477{
7478 pkp->dtpk_prov = pdp->dtpd_provider;
7479 pkp->dtpk_pmatch = dtrace_probekey_func(pdp->dtpd_provider);
7480
7481 pkp->dtpk_mod = pdp->dtpd_mod;
7482 pkp->dtpk_mmatch = dtrace_probekey_func(pdp->dtpd_mod);
7483
7484 pkp->dtpk_func = pdp->dtpd_func;
7485 pkp->dtpk_fmatch = dtrace_probekey_func(pdp->dtpd_func);
7486
7487 pkp->dtpk_name = pdp->dtpd_name;
7488 pkp->dtpk_nmatch = dtrace_probekey_func(pdp->dtpd_name);
7489
7490 pkp->dtpk_id = pdp->dtpd_id;
7491
7492 if (pkp->dtpk_id == DTRACE_IDNONE &&
7493 pkp->dtpk_pmatch == &dtrace_match_nul &&
7494 pkp->dtpk_mmatch == &dtrace_match_nul &&
7495 pkp->dtpk_fmatch == &dtrace_match_nul &&
7496 pkp->dtpk_nmatch == &dtrace_match_nul)
7497 pkp->dtpk_fmatch = &dtrace_match_nonzero;
7498}
7499
7500/*
7501 * DTrace Provider-to-Framework API Functions
7502 *
7503 * These functions implement much of the Provider-to-Framework API, as
7504 * described in <sys/dtrace.h>. The parts of the API not in this section are
7505 * the functions in the API for probe management (found below), and
7506 * dtrace_probe() itself (found above).
7507 */
7508
7509/*
7510 * Register the calling provider with the DTrace framework. This should
7511 * generally be called by DTrace providers in their attach(9E) entry point.
7512 */
7513int
7514dtrace_register(const char *name, const dtrace_pattr_t *pap, uint32_t priv,
7515 cred_t *cr, const dtrace_pops_t *pops, void *arg, dtrace_provider_id_t *idp)
7516{
7517 dtrace_provider_t *provider;
7518
7519 if (name == NULL || pap == NULL || pops == NULL || idp == NULL) {
7520 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7521 "arguments", name ? name : "<NULL>");
7522 return (EINVAL);
7523 }
7524
7525 if (name[0] == '\0' || dtrace_badname(name)) {
7526 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7527 "provider name", name);
7528 return (EINVAL);
7529 }
7530
7531 if ((pops->dtps_provide == NULL && pops->dtps_provide_module == NULL) ||
7532 pops->dtps_enable == NULL || pops->dtps_disable == NULL ||
7533 pops->dtps_destroy == NULL ||
7534 ((pops->dtps_resume == NULL) != (pops->dtps_suspend == NULL))) {
7535 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7536 "provider ops", name);
7537 return (EINVAL);
7538 }
7539
7540 if (dtrace_badattr(&pap->dtpa_provider) ||
7541 dtrace_badattr(&pap->dtpa_mod) ||
7542 dtrace_badattr(&pap->dtpa_func) ||
7543 dtrace_badattr(&pap->dtpa_name) ||
7544 dtrace_badattr(&pap->dtpa_args)) {
7545 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7546 "provider attributes", name);
7547 return (EINVAL);
7548 }
7549
7550 if (priv & ~DTRACE_PRIV_ALL) {
7551 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7552 "privilege attributes", name);
7553 return (EINVAL);
7554 }
7555
7556 if ((priv & DTRACE_PRIV_KERNEL) &&
7557 (priv & (DTRACE_PRIV_USER | DTRACE_PRIV_OWNER)) &&
7558 pops->dtps_usermode == NULL) {
7559 cmn_err(CE_WARN, "failed to register provider '%s': need "
7560 "dtps_usermode() op for given privilege attributes", name);
7561 return (EINVAL);
7562 }
7563
7564 provider = kmem_zalloc(sizeof (dtrace_provider_t), KM_SLEEP);
7565 provider->dtpv_name = kmem_alloc(strlen(name) + 1, KM_SLEEP);
7566 (void) strcpy(provider->dtpv_name, name);
7567
7568 provider->dtpv_attr = *pap;
7569 provider->dtpv_priv.dtpp_flags = priv;
7570 if (cr != NULL) {
7571 provider->dtpv_priv.dtpp_uid = crgetuid(cr);
7572 provider->dtpv_priv.dtpp_zoneid = crgetzoneid(cr);
7573 }
7574 provider->dtpv_pops = *pops;
7575
7576 if (pops->dtps_provide == NULL) {
7577 ASSERT(pops->dtps_provide_module != NULL);
7578 provider->dtpv_pops.dtps_provide =
7579 (void (*)(void *, dtrace_probedesc_t *))dtrace_nullop;
7580 }
7581
7582 if (pops->dtps_provide_module == NULL) {
7583 ASSERT(pops->dtps_provide != NULL);
7584 provider->dtpv_pops.dtps_provide_module =
7585 (void (*)(void *, modctl_t *))dtrace_nullop;
7586 }
7587
7588 if (pops->dtps_suspend == NULL) {
7589 ASSERT(pops->dtps_resume == NULL);
7590 provider->dtpv_pops.dtps_suspend =
7591 (void (*)(void *, dtrace_id_t, void *))dtrace_nullop;
7592 provider->dtpv_pops.dtps_resume =
7593 (void (*)(void *, dtrace_id_t, void *))dtrace_nullop;
7594 }
7595
7596 provider->dtpv_arg = arg;
7597 *idp = (dtrace_provider_id_t)provider;
7598
7599 if (pops == &dtrace_provider_ops) {
7600 ASSERT(MUTEX_HELD(&dtrace_provider_lock));
7601 ASSERT(MUTEX_HELD(&dtrace_lock));
7602 ASSERT(dtrace_anon.dta_enabling == NULL);
7603
7604 /*
7605 * We make sure that the DTrace provider is at the head of
7606 * the provider chain.
7607 */
7608 provider->dtpv_next = dtrace_provider;
7609 dtrace_provider = provider;
7610 return (0);
7611 }
7612
7613 mutex_enter(&dtrace_provider_lock);
7614 mutex_enter(&dtrace_lock);
7615
7616 /*
7617 * If there is at least one provider registered, we'll add this
7618 * provider after the first provider.
7619 */
7620 if (dtrace_provider != NULL) {
7621 provider->dtpv_next = dtrace_provider->dtpv_next;
7622 dtrace_provider->dtpv_next = provider;
7623 } else {
7624 dtrace_provider = provider;
7625 }
7626
7627 if (dtrace_retained != NULL) {
7628 dtrace_enabling_provide(provider);
7629
7630 /*
7631 * Now we need to call dtrace_enabling_matchall() -- which
7632 * will acquire cpu_lock and dtrace_lock. We therefore need
7633 * to drop all of our locks before calling into it...
7634 */
7635 mutex_exit(&dtrace_lock);
7636 mutex_exit(&dtrace_provider_lock);
7637 dtrace_enabling_matchall();
7638
7639 return (0);
7640 }
7641
7642 mutex_exit(&dtrace_lock);
7643 mutex_exit(&dtrace_provider_lock);
7644
7645 return (0);
7646}
7647
7648/*
7649 * Unregister the specified provider from the DTrace framework. This should
7650 * generally be called by DTrace providers in their detach(9E) entry point.
7651 */
7652int
7653dtrace_unregister(dtrace_provider_id_t id)
7654{
7655 dtrace_provider_t *old = (dtrace_provider_t *)id;
7656 dtrace_provider_t *prev = NULL;
7657 int i, self = 0, noreap = 0;
7658 dtrace_probe_t *probe, *first = NULL;
7659
7660 if (old->dtpv_pops.dtps_enable ==
7661 (void (*)(void *, dtrace_id_t, void *))dtrace_nullop) {
7662 /*
7663 * If DTrace itself is the provider, we're called with locks
7664 * already held.
7665 */
7666 ASSERT(old == dtrace_provider);
7667#if defined(sun)
7668 ASSERT(dtrace_devi != NULL);
7669#endif
7670 ASSERT(MUTEX_HELD(&dtrace_provider_lock));
7671 ASSERT(MUTEX_HELD(&dtrace_lock));
7672 self = 1;
7673
7674 if (dtrace_provider->dtpv_next != NULL) {
7675 /*
7676 * There's another provider here; return failure.
7677 */
7678 return (EBUSY);
7679 }
7680 } else {
7681 mutex_enter(&dtrace_provider_lock);
7682#if defined(sun)
7683 mutex_enter(&mod_lock);
7684#endif
7685 mutex_enter(&dtrace_lock);
7686 }
7687
7688 /*
7689 * If anyone has /dev/dtrace open, or if there are anonymous enabled
7690 * probes, we refuse to let providers slither away, unless this
7691 * provider has already been explicitly invalidated.
7692 */
7693 if (!old->dtpv_defunct &&
7694 (dtrace_opens || (dtrace_anon.dta_state != NULL &&
7695 dtrace_anon.dta_state->dts_necbs > 0))) {
7696 if (!self) {
7697 mutex_exit(&dtrace_lock);
7698#if defined(sun)
7699 mutex_exit(&mod_lock);
7700#endif
7701 mutex_exit(&dtrace_provider_lock);
7702 }
7703 return (EBUSY);
7704 }
7705
7706 /*
7707 * Attempt to destroy the probes associated with this provider.
7708 */
7709 for (i = 0; i < dtrace_nprobes; i++) {
7710 if ((probe = dtrace_probes[i]) == NULL)
7711 continue;
7712
7713 if (probe->dtpr_provider != old)
7714 continue;
7715
7716 if (probe->dtpr_ecb == NULL)
7717 continue;
7718
7719 /*
7720 * If we are trying to unregister a defunct provider, and the
7721 * provider was made defunct within the interval dictated by
7722 * dtrace_unregister_defunct_reap, we'll (asynchronously)
7723 * attempt to reap our enablings. To denote that the provider
7724 * should reattempt to unregister itself at some point in the
7725 * future, we will return a differentiable error code (EAGAIN
7726 * instead of EBUSY) in this case.
7727 */
7728 if (dtrace_gethrtime() - old->dtpv_defunct >
7729 dtrace_unregister_defunct_reap)
7730 noreap = 1;
7731
7732 if (!self) {
7733 mutex_exit(&dtrace_lock);
7734#if defined(sun)
7735 mutex_exit(&mod_lock);
7736#endif
7737 mutex_exit(&dtrace_provider_lock);
7738 }
7739
7740 if (noreap)
7741 return (EBUSY);
7742
7743 (void) taskq_dispatch(dtrace_taskq,
7744 (task_func_t *)dtrace_enabling_reap, NULL, TQ_SLEEP);
7745
7746 return (EAGAIN);
7747 }
7748
7749 /*
7750 * All of the probes for this provider are disabled; we can safely
7751 * remove all of them from their hash chains and from the probe array.
7752 */
7753 for (i = 0; i < dtrace_nprobes; i++) {
7754 if ((probe = dtrace_probes[i]) == NULL)
7755 continue;
7756
7757 if (probe->dtpr_provider != old)
7758 continue;
7759
7760 dtrace_probes[i] = NULL;
7761
7762 dtrace_hash_remove(dtrace_bymod, probe);
7763 dtrace_hash_remove(dtrace_byfunc, probe);
7764 dtrace_hash_remove(dtrace_byname, probe);
7765
7766 if (first == NULL) {
7767 first = probe;
7768 probe->dtpr_nextmod = NULL;
7769 } else {
7770 probe->dtpr_nextmod = first;
7771 first = probe;
7772 }
7773 }
7774
7775 /*
7776 * The provider's probes have been removed from the hash chains and
7777 * from the probe array. Now issue a dtrace_sync() to be sure that
7778 * everyone has cleared out from any probe array processing.
7779 */
7780 dtrace_sync();
7781
7782 for (probe = first; probe != NULL; probe = first) {
7783 first = probe->dtpr_nextmod;
7784
7785 old->dtpv_pops.dtps_destroy(old->dtpv_arg, probe->dtpr_id,
7786 probe->dtpr_arg);
7787 kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
7788 kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
7789 kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
7790#if defined(sun)
7791 vmem_free(dtrace_arena, (void *)(uintptr_t)(probe->dtpr_id), 1);
7792#else
7793 free_unr(dtrace_arena, probe->dtpr_id);
7794#endif
7795 kmem_free(probe, sizeof (dtrace_probe_t));
7796 }
7797
7798 if ((prev = dtrace_provider) == old) {
7799#if defined(sun)
7800 ASSERT(self || dtrace_devi == NULL);
7801 ASSERT(old->dtpv_next == NULL || dtrace_devi == NULL);
7802#endif
7803 dtrace_provider = old->dtpv_next;
7804 } else {
7805 while (prev != NULL && prev->dtpv_next != old)
7806 prev = prev->dtpv_next;
7807
7808 if (prev == NULL) {
7809 panic("attempt to unregister non-existent "
7810 "dtrace provider %p\n", (void *)id);
7811 }
7812
7813 prev->dtpv_next = old->dtpv_next;
7814 }
7815
7816 if (!self) {
7817 mutex_exit(&dtrace_lock);
7818#if defined(sun)
7819 mutex_exit(&mod_lock);
7820#endif
7821 mutex_exit(&dtrace_provider_lock);
7822 }
7823
7824 kmem_free(old->dtpv_name, strlen(old->dtpv_name) + 1);
7825 kmem_free(old, sizeof (dtrace_provider_t));
7826
7827 return (0);
7828}
7829
7830/*
7831 * Invalidate the specified provider. All subsequent probe lookups for the
7832 * specified provider will fail, but its probes will not be removed.
7833 */
7834void
7835dtrace_invalidate(dtrace_provider_id_t id)
7836{
7837 dtrace_provider_t *pvp = (dtrace_provider_t *)id;
7838
7839 ASSERT(pvp->dtpv_pops.dtps_enable !=
7840 (void (*)(void *, dtrace_id_t, void *))dtrace_nullop);
7841
7842 mutex_enter(&dtrace_provider_lock);
7843 mutex_enter(&dtrace_lock);
7844
7845 pvp->dtpv_defunct = dtrace_gethrtime();
7846
7847 mutex_exit(&dtrace_lock);
7848 mutex_exit(&dtrace_provider_lock);
7849}
7850
7851/*
7852 * Indicate whether or not DTrace has attached.
7853 */
7854int
7855dtrace_attached(void)
7856{
7857 /*
7858 * dtrace_provider will be non-NULL iff the DTrace driver has
7859 * attached. (It's non-NULL because DTrace is always itself a
7860 * provider.)
7861 */
7862 return (dtrace_provider != NULL);
7863}
7864
7865/*
7866 * Remove all the unenabled probes for the given provider. This function is
7867 * not unlike dtrace_unregister(), except that it doesn't remove the provider
7868 * -- just as many of its associated probes as it can.
7869 */
7870int
7871dtrace_condense(dtrace_provider_id_t id)
7872{
7873 dtrace_provider_t *prov = (dtrace_provider_t *)id;
7874 int i;
7875 dtrace_probe_t *probe;
7876
7877 /*
7878 * Make sure this isn't the dtrace provider itself.
7879 */
7880 ASSERT(prov->dtpv_pops.dtps_enable !=
7881 (void (*)(void *, dtrace_id_t, void *))dtrace_nullop);
7882
7883 mutex_enter(&dtrace_provider_lock);
7884 mutex_enter(&dtrace_lock);
7885
7886 /*
7887 * Attempt to destroy the probes associated with this provider.
7888 */
7889 for (i = 0; i < dtrace_nprobes; i++) {
7890 if ((probe = dtrace_probes[i]) == NULL)
7891 continue;
7892
7893 if (probe->dtpr_provider != prov)
7894 continue;
7895
7896 if (probe->dtpr_ecb != NULL)
7897 continue;
7898
7899 dtrace_probes[i] = NULL;
7900
7901 dtrace_hash_remove(dtrace_bymod, probe);
7902 dtrace_hash_remove(dtrace_byfunc, probe);
7903 dtrace_hash_remove(dtrace_byname, probe);
7904
7905 prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, i + 1,
7906 probe->dtpr_arg);
7907 kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
7908 kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
7909 kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
7910 kmem_free(probe, sizeof (dtrace_probe_t));
7911#if defined(sun)
7912 vmem_free(dtrace_arena, (void *)((uintptr_t)i + 1), 1);
7913#else
7914 free_unr(dtrace_arena, i + 1);
7915#endif
7916 }
7917
7918 mutex_exit(&dtrace_lock);
7919 mutex_exit(&dtrace_provider_lock);
7920
7921 return (0);
7922}
7923
7924/*
7925 * DTrace Probe Management Functions
7926 *
7927 * The functions in this section perform the DTrace probe management,
7928 * including functions to create probes, look-up probes, and call into the
7929 * providers to request that probes be provided. Some of these functions are
7930 * in the Provider-to-Framework API; these functions can be identified by the
7931 * fact that they are not declared "static".
7932 */
7933
7934/*
7935 * Create a probe with the specified module name, function name, and name.
7936 */
7937dtrace_id_t
7938dtrace_probe_create(dtrace_provider_id_t prov, const char *mod,
7939 const char *func, const char *name, int aframes, void *arg)
7940{
7941 dtrace_probe_t *probe, **probes;
7942 dtrace_provider_t *provider = (dtrace_provider_t *)prov;
7943 dtrace_id_t id;
7944
7945 if (provider == dtrace_provider) {
7946 ASSERT(MUTEX_HELD(&dtrace_lock));
7947 } else {
7948 mutex_enter(&dtrace_lock);
7949 }
7950
7951#if defined(sun)
7952 id = (dtrace_id_t)(uintptr_t)vmem_alloc(dtrace_arena, 1,
7953 VM_BESTFIT | VM_SLEEP);
7954#else
7955 id = alloc_unr(dtrace_arena);
7956#endif
7957 probe = kmem_zalloc(sizeof (dtrace_probe_t), KM_SLEEP);
7958
7959 probe->dtpr_id = id;
7960 probe->dtpr_gen = dtrace_probegen++;
7961 probe->dtpr_mod = dtrace_strdup(mod);
7962 probe->dtpr_func = dtrace_strdup(func);
7963 probe->dtpr_name = dtrace_strdup(name);
7964 probe->dtpr_arg = arg;
7965 probe->dtpr_aframes = aframes;
7966 probe->dtpr_provider = provider;
7967
7968 dtrace_hash_add(dtrace_bymod, probe);
7969 dtrace_hash_add(dtrace_byfunc, probe);
7970 dtrace_hash_add(dtrace_byname, probe);
7971
7972 if (id - 1 >= dtrace_nprobes) {
7973 size_t osize = dtrace_nprobes * sizeof (dtrace_probe_t *);
7974 size_t nsize = osize << 1;
7975
7976 if (nsize == 0) {
7977 ASSERT(osize == 0);
7978 ASSERT(dtrace_probes == NULL);
7979 nsize = sizeof (dtrace_probe_t *);
7980 }
7981
7982 probes = kmem_zalloc(nsize, KM_SLEEP);
7983
7984 if (dtrace_probes == NULL) {
7985 ASSERT(osize == 0);
7986 dtrace_probes = probes;
7987 dtrace_nprobes = 1;
7988 } else {
7989 dtrace_probe_t **oprobes = dtrace_probes;
7990
7991 bcopy(oprobes, probes, osize);
7992 dtrace_membar_producer();
7993 dtrace_probes = probes;
7994
7995 dtrace_sync();
7996
7997 /*
7998 * All CPUs are now seeing the new probes array; we can
7999 * safely free the old array.
8000 */
8001 kmem_free(oprobes, osize);
8002 dtrace_nprobes <<= 1;
8003 }
8004
8005 ASSERT(id - 1 < dtrace_nprobes);
8006 }
8007
8008 ASSERT(dtrace_probes[id - 1] == NULL);
8009 dtrace_probes[id - 1] = probe;
8010
8011 if (provider != dtrace_provider)
8012 mutex_exit(&dtrace_lock);
8013
8014 return (id);
8015}
8016
8017static dtrace_probe_t *
8018dtrace_probe_lookup_id(dtrace_id_t id)
8019{
8020 ASSERT(MUTEX_HELD(&dtrace_lock));
8021
8022 if (id == 0 || id > dtrace_nprobes)
8023 return (NULL);
8024
8025 return (dtrace_probes[id - 1]);
8026}
8027
8028static int
8029dtrace_probe_lookup_match(dtrace_probe_t *probe, void *arg)
8030{
8031 *((dtrace_id_t *)arg) = probe->dtpr_id;
8032
8033 return (DTRACE_MATCH_DONE);
8034}
8035
8036/*
8037 * Look up a probe based on provider and one or more of module name, function
8038 * name and probe name.
8039 */
8040dtrace_id_t
8041dtrace_probe_lookup(dtrace_provider_id_t prid, char *mod,
8042 char *func, char *name)
8043{
8044 dtrace_probekey_t pkey;
8045 dtrace_id_t id;
8046 int match;
8047
8048 pkey.dtpk_prov = ((dtrace_provider_t *)prid)->dtpv_name;
8049 pkey.dtpk_pmatch = &dtrace_match_string;
8050 pkey.dtpk_mod = mod;
8051 pkey.dtpk_mmatch = mod ? &dtrace_match_string : &dtrace_match_nul;
8052 pkey.dtpk_func = func;
8053 pkey.dtpk_fmatch = func ? &dtrace_match_string : &dtrace_match_nul;
8054 pkey.dtpk_name = name;
8055 pkey.dtpk_nmatch = name ? &dtrace_match_string : &dtrace_match_nul;
8056 pkey.dtpk_id = DTRACE_IDNONE;
8057
8058 mutex_enter(&dtrace_lock);
8059 match = dtrace_match(&pkey, DTRACE_PRIV_ALL, 0, 0,
8060 dtrace_probe_lookup_match, &id);
8061 mutex_exit(&dtrace_lock);
8062
8063 ASSERT(match == 1 || match == 0);
8064 return (match ? id : 0);
8065}
8066
8067/*
8068 * Returns the probe argument associated with the specified probe.
8069 */
8070void *
8071dtrace_probe_arg(dtrace_provider_id_t id, dtrace_id_t pid)
8072{
8073 dtrace_probe_t *probe;
8074 void *rval = NULL;
8075
8076 mutex_enter(&dtrace_lock);
8077
8078 if ((probe = dtrace_probe_lookup_id(pid)) != NULL &&
8079 probe->dtpr_provider == (dtrace_provider_t *)id)
8080 rval = probe->dtpr_arg;
8081
8082 mutex_exit(&dtrace_lock);
8083
8084 return (rval);
8085}
8086
8087/*
8088 * Copy a probe into a probe description.
8089 */
8090static void
8091dtrace_probe_description(const dtrace_probe_t *prp, dtrace_probedesc_t *pdp)
8092{
8093 bzero(pdp, sizeof (dtrace_probedesc_t));
8094 pdp->dtpd_id = prp->dtpr_id;
8095
8096 (void) strncpy(pdp->dtpd_provider,
8097 prp->dtpr_provider->dtpv_name, DTRACE_PROVNAMELEN - 1);
8098
8099 (void) strncpy(pdp->dtpd_mod, prp->dtpr_mod, DTRACE_MODNAMELEN - 1);
8100 (void) strncpy(pdp->dtpd_func, prp->dtpr_func, DTRACE_FUNCNAMELEN - 1);
8101 (void) strncpy(pdp->dtpd_name, prp->dtpr_name, DTRACE_NAMELEN - 1);
8102}
8103
8104/*
8105 * Called to indicate that a probe -- or probes -- should be provided by a
8106 * specified provider. If the specified description is NULL, the provider will
8107 * be told to provide all of its probes. (This is done whenever a new
8108 * consumer comes along, or whenever a retained enabling is to be matched.) If
8109 * the specified description is non-NULL, the provider is given the
8110 * opportunity to dynamically provide the specified probe, allowing providers
8111 * to support the creation of probes on-the-fly. (So-called _autocreated_
8112 * probes.) If the provider is NULL, the operations will be applied to all
8113 * providers; if the provider is non-NULL the operations will only be applied
8114 * to the specified provider. The dtrace_provider_lock must be held, and the
8115 * dtrace_lock must _not_ be held -- the provider's dtps_provide() operation
8116 * will need to grab the dtrace_lock when it reenters the framework through
8117 * dtrace_probe_lookup(), dtrace_probe_create(), etc.
8118 */
8119static void
8120dtrace_probe_provide(dtrace_probedesc_t *desc, dtrace_provider_t *prv)
8121{
8122#if defined(sun)
8123 modctl_t *ctl;
8124#endif
8125 int all = 0;
8126
8127 ASSERT(MUTEX_HELD(&dtrace_provider_lock));
8128
8129 if (prv == NULL) {
8130 all = 1;
8131 prv = dtrace_provider;
8132 }
8133
8134 do {
8135 /*
8136 * First, call the blanket provide operation.
8137 */
8138 prv->dtpv_pops.dtps_provide(prv->dtpv_arg, desc);
8139
8140#if defined(sun)
8141 /*
8142 * Now call the per-module provide operation. We will grab
8143 * mod_lock to prevent the list from being modified. Note
8144 * that this also prevents the mod_busy bits from changing.
8145 * (mod_busy can only be changed with mod_lock held.)
8146 */
8147 mutex_enter(&mod_lock);
8148
8149 ctl = &modules;
8150 do {
8151 if (ctl->mod_busy || ctl->mod_mp == NULL)
8152 continue;
8153
8154 prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
8155
8156 } while ((ctl = ctl->mod_next) != &modules);
8157
8158 mutex_exit(&mod_lock);
8159#endif
8160 } while (all && (prv = prv->dtpv_next) != NULL);
8161}
8162
8163#if defined(sun)
8164/*
8165 * Iterate over each probe, and call the Framework-to-Provider API function
8166 * denoted by offs.
8167 */
8168static void
8169dtrace_probe_foreach(uintptr_t offs)
8170{
8171 dtrace_provider_t *prov;
8172 void (*func)(void *, dtrace_id_t, void *);
8173 dtrace_probe_t *probe;
8174 dtrace_icookie_t cookie;
8175 int i;
8176
8177 /*
8178 * We disable interrupts to walk through the probe array. This is
8179 * safe -- the dtrace_sync() in dtrace_unregister() assures that we
8180 * won't see stale data.
8181 */
8182 cookie = dtrace_interrupt_disable();
8183
8184 for (i = 0; i < dtrace_nprobes; i++) {
8185 if ((probe = dtrace_probes[i]) == NULL)
8186 continue;
8187
8188 if (probe->dtpr_ecb == NULL) {
8189 /*
8190 * This probe isn't enabled -- don't call the function.
8191 */
8192 continue;
8193 }
8194
8195 prov = probe->dtpr_provider;
8196 func = *((void(**)(void *, dtrace_id_t, void *))
8197 ((uintptr_t)&prov->dtpv_pops + offs));
8198
8199 func(prov->dtpv_arg, i + 1, probe->dtpr_arg);
8200 }
8201
8202 dtrace_interrupt_enable(cookie);
8203}
8204#endif
8205
8206static int
8207dtrace_probe_enable(dtrace_probedesc_t *desc, dtrace_enabling_t *enab)
8208{
8209 dtrace_probekey_t pkey;
8210 uint32_t priv;
8211 uid_t uid;
8212 zoneid_t zoneid;
8213
8214 ASSERT(MUTEX_HELD(&dtrace_lock));
8215 dtrace_ecb_create_cache = NULL;
8216
8217 if (desc == NULL) {
8218 /*
8219 * If we're passed a NULL description, we're being asked to
8220 * create an ECB with a NULL probe.
8221 */
8222 (void) dtrace_ecb_create_enable(NULL, enab);
8223 return (0);
8224 }
8225
8226 dtrace_probekey(desc, &pkey);
8227 dtrace_cred2priv(enab->dten_vstate->dtvs_state->dts_cred.dcr_cred,
8228 &priv, &uid, &zoneid);
8229
8230 return (dtrace_match(&pkey, priv, uid, zoneid, dtrace_ecb_create_enable,
8231 enab));
8232}
8233
8234/*
8235 * DTrace Helper Provider Functions
8236 */
8237static void
8238dtrace_dofattr2attr(dtrace_attribute_t *attr, const dof_attr_t dofattr)
8239{
8240 attr->dtat_name = DOF_ATTR_NAME(dofattr);
8241 attr->dtat_data = DOF_ATTR_DATA(dofattr);
8242 attr->dtat_class = DOF_ATTR_CLASS(dofattr);
8243}
8244
8245static void
8246dtrace_dofprov2hprov(dtrace_helper_provdesc_t *hprov,
8247 const dof_provider_t *dofprov, char *strtab)
8248{
8249 hprov->dthpv_provname = strtab + dofprov->dofpv_name;
8250 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_provider,
8251 dofprov->dofpv_provattr);
8252 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_mod,
8253 dofprov->dofpv_modattr);
8254 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_func,
8255 dofprov->dofpv_funcattr);
8256 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_name,
8257 dofprov->dofpv_nameattr);
8258 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_args,
8259 dofprov->dofpv_argsattr);
8260}
8261
8262static void
8263dtrace_helper_provide_one(dof_helper_t *dhp, dof_sec_t *sec, pid_t pid)
8264{
8265 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
8266 dof_hdr_t *dof = (dof_hdr_t *)daddr;
8267 dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
8268 dof_provider_t *provider;
8269 dof_probe_t *probe;
8270 uint32_t *off, *enoff;
8271 uint8_t *arg;
8272 char *strtab;
8273 uint_t i, nprobes;
8274 dtrace_helper_provdesc_t dhpv;
8275 dtrace_helper_probedesc_t dhpb;
8276 dtrace_meta_t *meta = dtrace_meta_pid;
8277 dtrace_mops_t *mops = &meta->dtm_mops;
8278 void *parg;
8279
8280 provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
8281 str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
8282 provider->dofpv_strtab * dof->dofh_secsize);
8283 prb_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
8284 provider->dofpv_probes * dof->dofh_secsize);
8285 arg_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
8286 provider->dofpv_prargs * dof->dofh_secsize);
8287 off_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
8288 provider->dofpv_proffs * dof->dofh_secsize);
8289
8290 strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
8291 off = (uint32_t *)(uintptr_t)(daddr + off_sec->dofs_offset);
8292 arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
8293 enoff = NULL;
8294
8295 /*
8296 * See dtrace_helper_provider_validate().
8297 */
8298 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
8299 provider->dofpv_prenoffs != DOF_SECT_NONE) {
8300 enoff_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
8301 provider->dofpv_prenoffs * dof->dofh_secsize);
8302 enoff = (uint32_t *)(uintptr_t)(daddr + enoff_sec->dofs_offset);
8303 }
8304
8305 nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
8306
8307 /*
8308 * Create the provider.
8309 */
8310 dtrace_dofprov2hprov(&dhpv, provider, strtab);
8311
8312 if ((parg = mops->dtms_provide_pid(meta->dtm_arg, &dhpv, pid)) == NULL)
8313 return;
8314
8315 meta->dtm_count++;
8316
8317 /*
8318 * Create the probes.
8319 */
8320 for (i = 0; i < nprobes; i++) {
8321 probe = (dof_probe_t *)(uintptr_t)(daddr +
8322 prb_sec->dofs_offset + i * prb_sec->dofs_entsize);
8323
8324 dhpb.dthpb_mod = dhp->dofhp_mod;
8325 dhpb.dthpb_func = strtab + probe->dofpr_func;
8326 dhpb.dthpb_name = strtab + probe->dofpr_name;
8327 dhpb.dthpb_base = probe->dofpr_addr;
8328 dhpb.dthpb_offs = off + probe->dofpr_offidx;
8329 dhpb.dthpb_noffs = probe->dofpr_noffs;
8330 if (enoff != NULL) {
8331 dhpb.dthpb_enoffs = enoff + probe->dofpr_enoffidx;
8332 dhpb.dthpb_nenoffs = probe->dofpr_nenoffs;
8333 } else {
8334 dhpb.dthpb_enoffs = NULL;
8335 dhpb.dthpb_nenoffs = 0;
8336 }
8337 dhpb.dthpb_args = arg + probe->dofpr_argidx;
8338 dhpb.dthpb_nargc = probe->dofpr_nargc;
8339 dhpb.dthpb_xargc = probe->dofpr_xargc;
8340 dhpb.dthpb_ntypes = strtab + probe->dofpr_nargv;
8341 dhpb.dthpb_xtypes = strtab + probe->dofpr_xargv;
8342
8343 mops->dtms_create_probe(meta->dtm_arg, parg, &dhpb);
8344 }
8345}
8346
8347static void
8348dtrace_helper_provide(dof_helper_t *dhp, pid_t pid)
8349{
8350 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
8351 dof_hdr_t *dof = (dof_hdr_t *)daddr;
8352 int i;
8353
8354 ASSERT(MUTEX_HELD(&dtrace_meta_lock));
8355
8356 for (i = 0; i < dof->dofh_secnum; i++) {
8357 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
8358 dof->dofh_secoff + i * dof->dofh_secsize);
8359
8360 if (sec->dofs_type != DOF_SECT_PROVIDER)
8361 continue;
8362
8363 dtrace_helper_provide_one(dhp, sec, pid);
8364 }
8365
8366 /*
8367 * We may have just created probes, so we must now rematch against
8368 * any retained enablings. Note that this call will acquire both
8369 * cpu_lock and dtrace_lock; the fact that we are holding
8370 * dtrace_meta_lock now is what defines the ordering with respect to
8371 * these three locks.
8372 */
8373 dtrace_enabling_matchall();
8374}
8375
8376static void
8377dtrace_helper_provider_remove_one(dof_helper_t *dhp, dof_sec_t *sec, pid_t pid)
8378{
8379 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
8380 dof_hdr_t *dof = (dof_hdr_t *)daddr;
8381 dof_sec_t *str_sec;
8382 dof_provider_t *provider;
8383 char *strtab;
8384 dtrace_helper_provdesc_t dhpv;
8385 dtrace_meta_t *meta = dtrace_meta_pid;
8386 dtrace_mops_t *mops = &meta->dtm_mops;
8387
8388 provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
8389 str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
8390 provider->dofpv_strtab * dof->dofh_secsize);
8391
8392 strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
8393
8394 /*
8395 * Create the provider.
8396 */
8397 dtrace_dofprov2hprov(&dhpv, provider, strtab);
8398
8399 mops->dtms_remove_pid(meta->dtm_arg, &dhpv, pid);
8400
8401 meta->dtm_count--;
8402}
8403
8404static void
8405dtrace_helper_provider_remove(dof_helper_t *dhp, pid_t pid)
8406{
8407 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
8408 dof_hdr_t *dof = (dof_hdr_t *)daddr;
8409 int i;
8410
8411 ASSERT(MUTEX_HELD(&dtrace_meta_lock));
8412
8413 for (i = 0; i < dof->dofh_secnum; i++) {
8414 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
8415 dof->dofh_secoff + i * dof->dofh_secsize);
8416
8417 if (sec->dofs_type != DOF_SECT_PROVIDER)
8418 continue;
8419
8420 dtrace_helper_provider_remove_one(dhp, sec, pid);
8421 }
8422}
8423
8424/*
8425 * DTrace Meta Provider-to-Framework API Functions
8426 *
8427 * These functions implement the Meta Provider-to-Framework API, as described
8428 * in <sys/dtrace.h>.
8429 */
8430int
8431dtrace_meta_register(const char *name, const dtrace_mops_t *mops, void *arg,
8432 dtrace_meta_provider_id_t *idp)
8433{
8434 dtrace_meta_t *meta;
8435 dtrace_helpers_t *help, *next;
8436 int i;
8437
8438 *idp = DTRACE_METAPROVNONE;
8439
8440 /*
8441 * We strictly don't need the name, but we hold onto it for
8442 * debuggability. All hail error queues!
8443 */
8444 if (name == NULL) {
8445 cmn_err(CE_WARN, "failed to register meta-provider: "
8446 "invalid name");
8447 return (EINVAL);
8448 }
8449
8450 if (mops == NULL ||
8451 mops->dtms_create_probe == NULL ||
8452 mops->dtms_provide_pid == NULL ||
8453 mops->dtms_remove_pid == NULL) {
8454 cmn_err(CE_WARN, "failed to register meta-provider %s: "
8455 "invalid ops", name);
8456 return (EINVAL);
8457 }
8458
8459 meta = kmem_zalloc(sizeof (dtrace_meta_t), KM_SLEEP);
8460 meta->dtm_mops = *mops;
8461 meta->dtm_name = kmem_alloc(strlen(name) + 1, KM_SLEEP);
8462 (void) strcpy(meta->dtm_name, name);
8463 meta->dtm_arg = arg;
8464
8465 mutex_enter(&dtrace_meta_lock);
8466 mutex_enter(&dtrace_lock);
8467
8468 if (dtrace_meta_pid != NULL) {
8469 mutex_exit(&dtrace_lock);
8470 mutex_exit(&dtrace_meta_lock);
8471 cmn_err(CE_WARN, "failed to register meta-provider %s: "
8472 "user-land meta-provider exists", name);
8473 kmem_free(meta->dtm_name, strlen(meta->dtm_name) + 1);
8474 kmem_free(meta, sizeof (dtrace_meta_t));
8475 return (EINVAL);
8476 }
8477
8478 dtrace_meta_pid = meta;
8479 *idp = (dtrace_meta_provider_id_t)meta;
8480
8481 /*
8482 * If there are providers and probes ready to go, pass them
8483 * off to the new meta provider now.
8484 */
8485
8486 help = dtrace_deferred_pid;
8487 dtrace_deferred_pid = NULL;
8488
8489 mutex_exit(&dtrace_lock);
8490
8491 while (help != NULL) {
8492 for (i = 0; i < help->dthps_nprovs; i++) {
8493 dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
8494 help->dthps_pid);
8495 }
8496
8497 next = help->dthps_next;
8498 help->dthps_next = NULL;
8499 help->dthps_prev = NULL;
8500 help->dthps_deferred = 0;
8501 help = next;
8502 }
8503
8504 mutex_exit(&dtrace_meta_lock);
8505
8506 return (0);
8507}
8508
8509int
8510dtrace_meta_unregister(dtrace_meta_provider_id_t id)
8511{
8512 dtrace_meta_t **pp, *old = (dtrace_meta_t *)id;
8513
8514 mutex_enter(&dtrace_meta_lock);
8515 mutex_enter(&dtrace_lock);
8516
8517 if (old == dtrace_meta_pid) {
8518 pp = &dtrace_meta_pid;
8519 } else {
8520 panic("attempt to unregister non-existent "
8521 "dtrace meta-provider %p\n", (void *)old);
8522 }
8523
8524 if (old->dtm_count != 0) {
8525 mutex_exit(&dtrace_lock);
8526 mutex_exit(&dtrace_meta_lock);
8527 return (EBUSY);
8528 }
8529
8530 *pp = NULL;
8531
8532 mutex_exit(&dtrace_lock);
8533 mutex_exit(&dtrace_meta_lock);
8534
8535 kmem_free(old->dtm_name, strlen(old->dtm_name) + 1);
8536 kmem_free(old, sizeof (dtrace_meta_t));
8537
8538 return (0);
8539}
8540
8541
8542/*
8543 * DTrace DIF Object Functions
8544 */
8545static int
8546dtrace_difo_err(uint_t pc, const char *format, ...)
8547{
8548 if (dtrace_err_verbose) {
8549 va_list alist;
8550
8551 (void) uprintf("dtrace DIF object error: [%u]: ", pc);
8552 va_start(alist, format);
8553 (void) vuprintf(format, alist);
8554 va_end(alist);
8555 }
8556
8557#ifdef DTRACE_ERRDEBUG
8558 dtrace_errdebug(format);
8559#endif
8560 return (1);
8561}
8562
8563/*
8564 * Validate a DTrace DIF object by checking the IR instructions. The following
8565 * rules are currently enforced by dtrace_difo_validate():
8566 *
8567 * 1. Each instruction must have a valid opcode
8568 * 2. Each register, string, variable, or subroutine reference must be valid
8569 * 3. No instruction can modify register %r0 (must be zero)
8570 * 4. All instruction reserved bits must be set to zero
8571 * 5. The last instruction must be a "ret" instruction
8572 * 6. All branch targets must reference a valid instruction _after_ the branch
8573 */
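/*
 * For example, a two-instruction object that passes these checks is
 * "setx %r1, inttab[0]; ret %r1" -- the setx references a valid slot in the
 * integer table, writes a register other than %r0, and the object ends in a
 * ret.  (This sketch uses the DIF_INSTR_* constructors from <sys/dtrace.h>;
 * the one-entry integer table is an assumption of the example.)
 *
 *	dif_instr_t buf[] = {
 *		DIF_INSTR_SETX(0, 1),	// %r1 = integer table entry 0
 *		DIF_INSTR_RET(1)	// return %r1; last instruction is ret
 *	};
 */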
8574static int
8575dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs,
8576 cred_t *cr)
8577{
8578 int err = 0, i;
8579 int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
8580 int kcheckload;
8581 uint_t pc;
8582
8583 kcheckload = cr == NULL ||
8584 (vstate->dtvs_state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) == 0;
8585
8586 dp->dtdo_destructive = 0;
8587
8588 for (pc = 0; pc < dp->dtdo_len && err == 0; pc++) {
8589 dif_instr_t instr = dp->dtdo_buf[pc];
8590
8591 uint_t r1 = DIF_INSTR_R1(instr);
8592 uint_t r2 = DIF_INSTR_R2(instr);
8593 uint_t rd = DIF_INSTR_RD(instr);
8594 uint_t rs = DIF_INSTR_RS(instr);
8595 uint_t label = DIF_INSTR_LABEL(instr);
8596 uint_t v = DIF_INSTR_VAR(instr);
8597 uint_t subr = DIF_INSTR_SUBR(instr);
8598 uint_t type = DIF_INSTR_TYPE(instr);
8599 uint_t op = DIF_INSTR_OP(instr);
8600
8601 switch (op) {
8602 case DIF_OP_OR:
8603 case DIF_OP_XOR:
8604 case DIF_OP_AND:
8605 case DIF_OP_SLL:
8606 case DIF_OP_SRL:
8607 case DIF_OP_SRA:
8608 case DIF_OP_SUB:
8609 case DIF_OP_ADD:
8610 case DIF_OP_MUL:
8611 case DIF_OP_SDIV:
8612 case DIF_OP_UDIV:
8613 case DIF_OP_SREM:
8614 case DIF_OP_UREM:
8615 case DIF_OP_COPYS:
8616 if (r1 >= nregs)
8617 err += efunc(pc, "invalid register %u\n", r1);
8618 if (r2 >= nregs)
8619 err += efunc(pc, "invalid register %u\n", r2);
8620 if (rd >= nregs)
8621 err += efunc(pc, "invalid register %u\n", rd);
8622 if (rd == 0)
8623 err += efunc(pc, "cannot write to %r0\n");
8624 break;
8625 case DIF_OP_NOT:
8626 case DIF_OP_MOV:
8627 case DIF_OP_ALLOCS:
8628 if (r1 >= nregs)
8629 err += efunc(pc, "invalid register %u\n", r1);
8630 if (r2 != 0)
8631 err += efunc(pc, "non-zero reserved bits\n");
8632 if (rd >= nregs)
8633 err += efunc(pc, "invalid register %u\n", rd);
8634 if (rd == 0)
8635 err += efunc(pc, "cannot write to %r0\n");
8636 break;
8637 case DIF_OP_LDSB:
8638 case DIF_OP_LDSH:
8639 case DIF_OP_LDSW:
8640 case DIF_OP_LDUB:
8641 case DIF_OP_LDUH:
8642 case DIF_OP_LDUW:
8643 case DIF_OP_LDX:
8644 if (r1 >= nregs)
8645 err += efunc(pc, "invalid register %u\n", r1);
8646 if (r2 != 0)
8647 err += efunc(pc, "non-zero reserved bits\n");
8648 if (rd >= nregs)
8649 err += efunc(pc, "invalid register %u\n", rd);
8650 if (rd == 0)
8651 err += efunc(pc, "cannot write to %r0\n");
8652 if (kcheckload)
8653 dp->dtdo_buf[pc] = DIF_INSTR_LOAD(op +
8654 DIF_OP_RLDSB - DIF_OP_LDSB, r1, rd);
8655 break;
8656 case DIF_OP_RLDSB:
8657 case DIF_OP_RLDSH:
8658 case DIF_OP_RLDSW:
8659 case DIF_OP_RLDUB:
8660 case DIF_OP_RLDUH:
8661 case DIF_OP_RLDUW:
8662 case DIF_OP_RLDX:
8663 if (r1 >= nregs)
8664 err += efunc(pc, "invalid register %u\n", r1);
8665 if (r2 != 0)
8666 err += efunc(pc, "non-zero reserved bits\n");
8667 if (rd >= nregs)
8668 err += efunc(pc, "invalid register %u\n", rd);
8669 if (rd == 0)
8670 err += efunc(pc, "cannot write to %r0\n");
8671 break;
8672 case DIF_OP_ULDSB:
8673 case DIF_OP_ULDSH:
8674 case DIF_OP_ULDSW:
8675 case DIF_OP_ULDUB:
8676 case DIF_OP_ULDUH:
8677 case DIF_OP_ULDUW:
8678 case DIF_OP_ULDX:
8679 if (r1 >= nregs)
8680 err += efunc(pc, "invalid register %u\n", r1);
8681 if (r2 != 0)
8682 err += efunc(pc, "non-zero reserved bits\n");
8683 if (rd >= nregs)
8684 err += efunc(pc, "invalid register %u\n", rd);
8685 if (rd == 0)
8686 err += efunc(pc, "cannot write to %r0\n");
8687 break;
8688 case DIF_OP_STB:
8689 case DIF_OP_STH:
8690 case DIF_OP_STW:
8691 case DIF_OP_STX:
8692 if (r1 >= nregs)
8693 err += efunc(pc, "invalid register %u\n", r1);
8694 if (r2 != 0)
8695 err += efunc(pc, "non-zero reserved bits\n");
8696 if (rd >= nregs)
8697 err += efunc(pc, "invalid register %u\n", rd);
8698 if (rd == 0)
8699 err += efunc(pc, "cannot write to 0 address\n");
8700 break;
8701 case DIF_OP_CMP:
8702 case DIF_OP_SCMP:
8703 if (r1 >= nregs)
8704 err += efunc(pc, "invalid register %u\n", r1);
8705 if (r2 >= nregs)
8706 err += efunc(pc, "invalid register %u\n", r2);
8707 if (rd != 0)
8708 err += efunc(pc, "non-zero reserved bits\n");
8709 break;
8710 case DIF_OP_TST:
8711 if (r1 >= nregs)
8712 err += efunc(pc, "invalid register %u\n", r1);
8713 if (r2 != 0 || rd != 0)
8714 err += efunc(pc, "non-zero reserved bits\n");
8715 break;
8716 case DIF_OP_BA:
8717 case DIF_OP_BE:
8718 case DIF_OP_BNE:
8719 case DIF_OP_BG:
8720 case DIF_OP_BGU:
8721 case DIF_OP_BGE:
8722 case DIF_OP_BGEU:
8723 case DIF_OP_BL:
8724 case DIF_OP_BLU:
8725 case DIF_OP_BLE:
8726 case DIF_OP_BLEU:
8727 if (label >= dp->dtdo_len) {
8728 err += efunc(pc, "invalid branch target %u\n",
8729 label);
8730 }
8731 if (label <= pc) {
8732 err += efunc(pc, "backward branch to %u\n",
8733 label);
8734 }
8735 break;
8736 case DIF_OP_RET:
8737 if (r1 != 0 || r2 != 0)
8738 err += efunc(pc, "non-zero reserved bits\n");
8739 if (rd >= nregs)
8740 err += efunc(pc, "invalid register %u\n", rd);
8741 break;
8742 case DIF_OP_NOP:
8743 case DIF_OP_POPTS:
8744 case DIF_OP_FLUSHTS:
8745 if (r1 != 0 || r2 != 0 || rd != 0)
8746 err += efunc(pc, "non-zero reserved bits\n");
8747 break;
8748 case DIF_OP_SETX:
8749 if (DIF_INSTR_INTEGER(instr) >= dp->dtdo_intlen) {
8750 err += efunc(pc, "invalid integer ref %u\n",
8751 DIF_INSTR_INTEGER(instr));
8752 }
8753 if (rd >= nregs)
8754 err += efunc(pc, "invalid register %u\n", rd);
8755 if (rd == 0)
8756 err += efunc(pc, "cannot write to %r0\n");
8757 break;
8758 case DIF_OP_SETS:
8759 if (DIF_INSTR_STRING(instr) >= dp->dtdo_strlen) {
8760 err += efunc(pc, "invalid string ref %u\n",
8761 DIF_INSTR_STRING(instr));
8762 }
8763 if (rd >= nregs)
8764 err += efunc(pc, "invalid register %u\n", rd);
8765 if (rd == 0)
8766 err += efunc(pc, "cannot write to %r0\n");
8767 break;
8768 case DIF_OP_LDGA:
8769 case DIF_OP_LDTA:
8770 if (r1 > DIF_VAR_ARRAY_MAX)
8771 err += efunc(pc, "invalid array %u\n", r1);
8772 if (r2 >= nregs)
8773 err += efunc(pc, "invalid register %u\n", r2);
8774 if (rd >= nregs)
8775 err += efunc(pc, "invalid register %u\n", rd);
8776 if (rd == 0)
8777 err += efunc(pc, "cannot write to %r0\n");
8778 break;
8779 case DIF_OP_LDGS:
8780 case DIF_OP_LDTS:
8781 case DIF_OP_LDLS:
8782 case DIF_OP_LDGAA:
8783 case DIF_OP_LDTAA:
8784 if (v < DIF_VAR_OTHER_MIN || v > DIF_VAR_OTHER_MAX)
8785 err += efunc(pc, "invalid variable %u\n", v);
8786 if (rd >= nregs)
8787 err += efunc(pc, "invalid register %u\n", rd);
8788 if (rd == 0)
8789 err += efunc(pc, "cannot write to %r0\n");
8790 break;
8791 case DIF_OP_STGS:
8792 case DIF_OP_STTS:
8793 case DIF_OP_STLS:
8794 case DIF_OP_STGAA:
8795 case DIF_OP_STTAA:
8796 if (v < DIF_VAR_OTHER_UBASE || v > DIF_VAR_OTHER_MAX)
8797 err += efunc(pc, "invalid variable %u\n", v);
8798 if (rs >= nregs)
8799 err += efunc(pc, "invalid register %u\n", rs);
8800 break;
8801 case DIF_OP_CALL:
8802 if (subr > DIF_SUBR_MAX)
8803 err += efunc(pc, "invalid subr %u\n", subr);
8804 if (rd >= nregs)
8805 err += efunc(pc, "invalid register %u\n", rd);
8806 if (rd == 0)
8807 err += efunc(pc, "cannot write to %r0\n");
8808
8809 if (subr == DIF_SUBR_COPYOUT ||
8810 subr == DIF_SUBR_COPYOUTSTR) {
8811 dp->dtdo_destructive = 1;
8812 }
7581 continue;
7582 }
7583
7584 switch (size) {
7585 case 0:
7586 break;
7587
7588 case sizeof (uint8_t):
7589 DTRACE_STORE(uint8_t, tomax, valoffs, val);
7590 break;
7591 case sizeof (uint16_t):
7592 DTRACE_STORE(uint16_t, tomax, valoffs, val);
7593 break;
7594 case sizeof (uint32_t):
7595 DTRACE_STORE(uint32_t, tomax, valoffs, val);
7596 break;
7597 case sizeof (uint64_t):
7598 DTRACE_STORE(uint64_t, tomax, valoffs, val);
7599 break;
7600 default:
7601 /*
7602 * Any other size should have been returned by
7603 * reference, not by value.
7604 */
7605 ASSERT(0);
7606 break;
7607 }
7608 }
7609
7610 if (*flags & CPU_DTRACE_DROP)
7611 continue;
7612
7613 if (*flags & CPU_DTRACE_FAULT) {
7614 int ndx;
7615 dtrace_action_t *err;
7616
7617 buf->dtb_errors++;
7618
7619 if (probe->dtpr_id == dtrace_probeid_error) {
7620 /*
7621 * There's nothing we can do -- we had an
7622 * error on the error probe. We bump an
7623 * error counter to at least indicate that
7624 * this condition happened.
7625 */
7626 dtrace_error(&state->dts_dblerrors);
7627 continue;
7628 }
7629
7630 if (vtime) {
7631 /*
7632 * Before recursing on dtrace_probe(), we
7633 * need to explicitly clear out our start
7634 * time to prevent it from being accumulated
7635 * into t_dtrace_vtime.
7636 */
7637 curthread->t_dtrace_start = 0;
7638 }
7639
7640 /*
7641 * Iterate over the actions to figure out which action
7642 * we were processing when we experienced the error.
7643 * Note that act points _past_ the faulting action; if
7644 * act is ecb->dte_action, the fault was in the
7645 * predicate, if it's ecb->dte_action->dta_next it's
7646 * in action #1, and so on.
7647 */
7648 for (err = ecb->dte_action, ndx = 0;
7649 err != act; err = err->dta_next, ndx++)
7650 continue;
7651
7652 dtrace_probe_error(state, ecb->dte_epid, ndx,
7653 (mstate.dtms_present & DTRACE_MSTATE_FLTOFFS) ?
7654 mstate.dtms_fltoffs : -1, DTRACE_FLAGS2FLT(*flags),
7655 cpu_core[cpuid].cpuc_dtrace_illval);
7656
7657 continue;
7658 }
7659
7660 if (!committed)
7661 buf->dtb_offset = offs + ecb->dte_size;
7662 }
7663
7664 if (vtime)
7665 curthread->t_dtrace_start = dtrace_gethrtime();
7666
7667 dtrace_interrupt_enable(cookie);
7668}
7669
7670/*
7671 * DTrace Probe Hashing Functions
7672 *
7673 * The functions in this section (and indeed, the functions in remaining
7674 * sections) are not _called_ from probe context. (Any exceptions to this are
7675 * marked with a "Note:".) Rather, they are called from elsewhere in the
7676 * DTrace framework to look-up probes in, add probes to and remove probes from
7677 * the DTrace probe hashes. (Each probe is hashed by each element of the
7678 * probe tuple -- allowing for fast lookups, regardless of what was
7679 * specified.)
7680 */
7681static uint_t
7682dtrace_hash_str(const char *p)
7683{
7684 unsigned int g;
7685 uint_t hval = 0;
7686
7687 while (*p) {
7688 hval = (hval << 4) + *p++;
7689 if ((g = (hval & 0xf0000000)) != 0)
7690 hval ^= g >> 24;
7691 hval &= ~g;
7692 }
7693 return (hval);
7694}
7695
7696static dtrace_hash_t *
7697dtrace_hash_create(uintptr_t stroffs, uintptr_t nextoffs, uintptr_t prevoffs)
7698{
7699 dtrace_hash_t *hash = kmem_zalloc(sizeof (dtrace_hash_t), KM_SLEEP);
7700
7701 hash->dth_stroffs = stroffs;
7702 hash->dth_nextoffs = nextoffs;
7703 hash->dth_prevoffs = prevoffs;
7704
7705 hash->dth_size = 1;
7706 hash->dth_mask = hash->dth_size - 1;
7707
7708 hash->dth_tab = kmem_zalloc(hash->dth_size *
7709 sizeof (dtrace_hashbucket_t *), KM_SLEEP);
7710
7711 return (hash);
7712}
7713
7714static void
7715dtrace_hash_destroy(dtrace_hash_t *hash)
7716{
7717#ifdef DEBUG
7718 int i;
7719
7720 for (i = 0; i < hash->dth_size; i++)
7721 ASSERT(hash->dth_tab[i] == NULL);
7722#endif
7723
7724 kmem_free(hash->dth_tab,
7725 hash->dth_size * sizeof (dtrace_hashbucket_t *));
7726 kmem_free(hash, sizeof (dtrace_hash_t));
7727}
7728
7729static void
7730dtrace_hash_resize(dtrace_hash_t *hash)
7731{
7732 int size = hash->dth_size, i, ndx;
7733 int new_size = hash->dth_size << 1;
7734 int new_mask = new_size - 1;
7735 dtrace_hashbucket_t **new_tab, *bucket, *next;
7736
7737 ASSERT((new_size & new_mask) == 0);
7738
7739 new_tab = kmem_zalloc(new_size * sizeof (void *), KM_SLEEP);
7740
7741 for (i = 0; i < size; i++) {
7742 for (bucket = hash->dth_tab[i]; bucket != NULL; bucket = next) {
7743 dtrace_probe_t *probe = bucket->dthb_chain;
7744
7745 ASSERT(probe != NULL);
7746 ndx = DTRACE_HASHSTR(hash, probe) & new_mask;
7747
7748 next = bucket->dthb_next;
7749 bucket->dthb_next = new_tab[ndx];
7750 new_tab[ndx] = bucket;
7751 }
7752 }
7753
7754 kmem_free(hash->dth_tab, hash->dth_size * sizeof (void *));
7755 hash->dth_tab = new_tab;
7756 hash->dth_size = new_size;
7757 hash->dth_mask = new_mask;
7758}
7759
7760static void
7761dtrace_hash_add(dtrace_hash_t *hash, dtrace_probe_t *new)
7762{
7763 int hashval = DTRACE_HASHSTR(hash, new);
7764 int ndx = hashval & hash->dth_mask;
7765 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7766 dtrace_probe_t **nextp, **prevp;
7767
7768 for (; bucket != NULL; bucket = bucket->dthb_next) {
7769 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, new))
7770 goto add;
7771 }
7772
7773 if ((hash->dth_nbuckets >> 1) > hash->dth_size) {
7774 dtrace_hash_resize(hash);
7775 dtrace_hash_add(hash, new);
7776 return;
7777 }
7778
7779 bucket = kmem_zalloc(sizeof (dtrace_hashbucket_t), KM_SLEEP);
7780 bucket->dthb_next = hash->dth_tab[ndx];
7781 hash->dth_tab[ndx] = bucket;
7782 hash->dth_nbuckets++;
7783
7784add:
7785 nextp = DTRACE_HASHNEXT(hash, new);
7786 ASSERT(*nextp == NULL && *(DTRACE_HASHPREV(hash, new)) == NULL);
7787 *nextp = bucket->dthb_chain;
7788
7789 if (bucket->dthb_chain != NULL) {
7790 prevp = DTRACE_HASHPREV(hash, bucket->dthb_chain);
7791 ASSERT(*prevp == NULL);
7792 *prevp = new;
7793 }
7794
7795 bucket->dthb_chain = new;
7796 bucket->dthb_len++;
7797}
7798
7799static dtrace_probe_t *
7800dtrace_hash_lookup(dtrace_hash_t *hash, dtrace_probe_t *template)
7801{
7802 int hashval = DTRACE_HASHSTR(hash, template);
7803 int ndx = hashval & hash->dth_mask;
7804 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7805
7806 for (; bucket != NULL; bucket = bucket->dthb_next) {
7807 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
7808 return (bucket->dthb_chain);
7809 }
7810
7811 return (NULL);
7812}
7813
7814static int
7815dtrace_hash_collisions(dtrace_hash_t *hash, dtrace_probe_t *template)
7816{
7817 int hashval = DTRACE_HASHSTR(hash, template);
7818 int ndx = hashval & hash->dth_mask;
7819 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7820
7821 for (; bucket != NULL; bucket = bucket->dthb_next) {
7822 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
7823 return (bucket->dthb_len);
7824 }
7825
7826 return (0);
7827}
7828
7829static void
7830dtrace_hash_remove(dtrace_hash_t *hash, dtrace_probe_t *probe)
7831{
7832 int ndx = DTRACE_HASHSTR(hash, probe) & hash->dth_mask;
7833 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7834
7835 dtrace_probe_t **prevp = DTRACE_HASHPREV(hash, probe);
7836 dtrace_probe_t **nextp = DTRACE_HASHNEXT(hash, probe);
7837
7838 /*
7839 * Find the bucket that we're removing this probe from.
7840 */
7841 for (; bucket != NULL; bucket = bucket->dthb_next) {
7842 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, probe))
7843 break;
7844 }
7845
7846 ASSERT(bucket != NULL);
7847
7848 if (*prevp == NULL) {
7849 if (*nextp == NULL) {
7850 /*
7851 * The removed probe was the only probe on this
7852 * bucket; we need to remove the bucket.
7853 */
7854 dtrace_hashbucket_t *b = hash->dth_tab[ndx];
7855
7856 ASSERT(bucket->dthb_chain == probe);
7857 ASSERT(b != NULL);
7858
7859 if (b == bucket) {
7860 hash->dth_tab[ndx] = bucket->dthb_next;
7861 } else {
7862 while (b->dthb_next != bucket)
7863 b = b->dthb_next;
7864 b->dthb_next = bucket->dthb_next;
7865 }
7866
7867 ASSERT(hash->dth_nbuckets > 0);
7868 hash->dth_nbuckets--;
7869 kmem_free(bucket, sizeof (dtrace_hashbucket_t));
7870 return;
7871 }
7872
7873 bucket->dthb_chain = *nextp;
7874 } else {
7875 *(DTRACE_HASHNEXT(hash, *prevp)) = *nextp;
7876 }
7877
7878 if (*nextp != NULL)
7879 *(DTRACE_HASHPREV(hash, *nextp)) = *prevp;
7880}
7881
7882/*
7883 * DTrace Utility Functions
7884 *
7885 * These are random utility functions that are _not_ called from probe context.
7886 */
7887static int
7888dtrace_badattr(const dtrace_attribute_t *a)
7889{
7890 return (a->dtat_name > DTRACE_STABILITY_MAX ||
7891 a->dtat_data > DTRACE_STABILITY_MAX ||
7892 a->dtat_class > DTRACE_CLASS_MAX);
7893}
7894
7895/*
7896 * Return a newly-allocated copy of a string. If the specified string is NULL,
7897 * this function returns a zero-length string.
7898 */
7899static char *
7900dtrace_strdup(const char *str)
7901{
7902 char *new = kmem_zalloc((str != NULL ? strlen(str) : 0) + 1, KM_SLEEP);
7903
7904 if (str != NULL)
7905 (void) strcpy(new, str);
7906
7907 return (new);
7908}
7909
7910#define DTRACE_ISALPHA(c) \
7911 (((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z'))
7912
7913static int
7914dtrace_badname(const char *s)
7915{
7916 char c;
7917
7918 if (s == NULL || (c = *s++) == '\0')
7919 return (0);
7920
7921 if (!DTRACE_ISALPHA(c) && c != '-' && c != '_' && c != '.')
7922 return (1);
7923
7924 while ((c = *s++) != '\0') {
7925 if (!DTRACE_ISALPHA(c) && (c < '0' || c > '9') &&
7926 c != '-' && c != '_' && c != '.' && c != '`')
7927 return (1);
7928 }
7929
7930 return (0);
7931}
7932
7933static void
7934dtrace_cred2priv(cred_t *cr, uint32_t *privp, uid_t *uidp, zoneid_t *zoneidp)
7935{
7936 uint32_t priv;
7937
7938#if defined(sun)
7939 if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
7940 /*
7941 * For DTRACE_PRIV_ALL, the uid and zoneid don't matter.
7942 */
7943 priv = DTRACE_PRIV_ALL;
7944 } else {
7945 *uidp = crgetuid(cr);
7946 *zoneidp = crgetzoneid(cr);
7947
7948 priv = 0;
7949 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE))
7950 priv |= DTRACE_PRIV_KERNEL | DTRACE_PRIV_USER;
7951 else if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE))
7952 priv |= DTRACE_PRIV_USER;
7953 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE))
7954 priv |= DTRACE_PRIV_PROC;
7955 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
7956 priv |= DTRACE_PRIV_OWNER;
7957 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
7958 priv |= DTRACE_PRIV_ZONEOWNER;
7959 }
7960#else
7961 priv = DTRACE_PRIV_ALL;
7962#endif
7963
7964 *privp = priv;
7965}
7966
7967#ifdef DTRACE_ERRDEBUG
7968static void
7969dtrace_errdebug(const char *str)
7970{
7971 int hval = dtrace_hash_str(str) % DTRACE_ERRHASHSZ;
7972 int occupied = 0;
7973
7974 mutex_enter(&dtrace_errlock);
7975 dtrace_errlast = str;
7976 dtrace_errthread = curthread;
7977
7978 while (occupied++ < DTRACE_ERRHASHSZ) {
7979 if (dtrace_errhash[hval].dter_msg == str) {
7980 dtrace_errhash[hval].dter_count++;
7981 goto out;
7982 }
7983
7984 if (dtrace_errhash[hval].dter_msg != NULL) {
7985 hval = (hval + 1) % DTRACE_ERRHASHSZ;
7986 continue;
7987 }
7988
7989 dtrace_errhash[hval].dter_msg = str;
7990 dtrace_errhash[hval].dter_count = 1;
7991 goto out;
7992 }
7993
7994 panic("dtrace: undersized error hash");
7995out:
7996 mutex_exit(&dtrace_errlock);
7997}
7998#endif
7999
8000/*
8001 * DTrace Matching Functions
8002 *
8003 * These functions are used to match groups of probes, given some elements of
8004 * a probe tuple, or some globbed expressions for elements of a probe tuple.
8005 */
8006static int
8007dtrace_match_priv(const dtrace_probe_t *prp, uint32_t priv, uid_t uid,
8008 zoneid_t zoneid)
8009{
8010 if (priv != DTRACE_PRIV_ALL) {
8011 uint32_t ppriv = prp->dtpr_provider->dtpv_priv.dtpp_flags;
8012 uint32_t match = priv & ppriv;
8013
8014 /*
8015 * No PRIV_DTRACE_* privileges...
8016 */
8017 if ((priv & (DTRACE_PRIV_PROC | DTRACE_PRIV_USER |
8018 DTRACE_PRIV_KERNEL)) == 0)
8019 return (0);
8020
8021 /*
8022 * No matching bits, but there were bits to match...
8023 */
8024 if (match == 0 && ppriv != 0)
8025 return (0);
8026
8027 /*
8028 * Need to have permissions to the process, but don't...
8029 */
8030 if (((ppriv & ~match) & DTRACE_PRIV_OWNER) != 0 &&
8031 uid != prp->dtpr_provider->dtpv_priv.dtpp_uid) {
8032 return (0);
8033 }
8034
8035 /*
8036 * Need to be in the same zone unless we possess the
8037 * privilege to examine all zones.
8038 */
8039 if (((ppriv & ~match) & DTRACE_PRIV_ZONEOWNER) != 0 &&
8040 zoneid != prp->dtpr_provider->dtpv_priv.dtpp_zoneid) {
8041 return (0);
8042 }
8043 }
8044
8045 return (1);
8046}
8047
8048/*
8049 * dtrace_match_probe compares a dtrace_probe_t to a pre-compiled key, which
8050 * consists of input pattern strings and an ops-vector to evaluate them.
8051 * This function returns >0 for match, 0 for no match, and <0 for error.
8052 */
8053static int
8054dtrace_match_probe(const dtrace_probe_t *prp, const dtrace_probekey_t *pkp,
8055 uint32_t priv, uid_t uid, zoneid_t zoneid)
8056{
8057 dtrace_provider_t *pvp = prp->dtpr_provider;
8058 int rv;
8059
8060 if (pvp->dtpv_defunct)
8061 return (0);
8062
8063 if ((rv = pkp->dtpk_pmatch(pvp->dtpv_name, pkp->dtpk_prov, 0)) <= 0)
8064 return (rv);
8065
8066 if ((rv = pkp->dtpk_mmatch(prp->dtpr_mod, pkp->dtpk_mod, 0)) <= 0)
8067 return (rv);
8068
8069 if ((rv = pkp->dtpk_fmatch(prp->dtpr_func, pkp->dtpk_func, 0)) <= 0)
8070 return (rv);
8071
8072 if ((rv = pkp->dtpk_nmatch(prp->dtpr_name, pkp->dtpk_name, 0)) <= 0)
8073 return (rv);
8074
8075 if (dtrace_match_priv(prp, priv, uid, zoneid) == 0)
8076 return (0);
8077
8078 return (rv);
8079}
8080
8081/*
8082 * dtrace_match_glob() is a safe kernel implementation of the gmatch(3GEN)
8083 * interface for matching a glob pattern 'p' to an input string 's'. Unlike
8084 * libc's version, the kernel version only applies to 8-bit ASCII strings.
8085 * In addition, all of the recursion cases except for '*' matching have been
8086 * unwound. For '*', we still implement recursive evaluation, but a depth
8087 * counter is maintained and matching is aborted if we recurse too deep.
8088 * The function returns 0 if no match, >0 if match, and <0 if recursion error.
8089 */
8090static int
8091dtrace_match_glob(const char *s, const char *p, int depth)
8092{
8093 const char *olds;
8094 char s1, c;
8095 int gs;
8096
8097 if (depth > DTRACE_PROBEKEY_MAXDEPTH)
8098 return (-1);
8099
8100 if (s == NULL)
8101 s = ""; /* treat NULL as empty string */
8102
8103top:
8104 olds = s;
8105 s1 = *s++;
8106
8107 if (p == NULL)
8108 return (0);
8109
8110 if ((c = *p++) == '\0')
8111 return (s1 == '\0');
8112
8113 switch (c) {
8114 case '[': {
8115 int ok = 0, notflag = 0;
8116 char lc = '\0';
8117
8118 if (s1 == '\0')
8119 return (0);
8120
8121 if (*p == '!') {
8122 notflag = 1;
8123 p++;
8124 }
8125
8126 if ((c = *p++) == '\0')
8127 return (0);
8128
8129 do {
8130 if (c == '-' && lc != '\0' && *p != ']') {
8131 if ((c = *p++) == '\0')
8132 return (0);
8133 if (c == '\\' && (c = *p++) == '\0')
8134 return (0);
8135
8136 if (notflag) {
8137 if (s1 < lc || s1 > c)
8138 ok++;
8139 else
8140 return (0);
8141 } else if (lc <= s1 && s1 <= c)
8142 ok++;
8143
8144 } else if (c == '\\' && (c = *p++) == '\0')
8145 return (0);
8146
8147 lc = c; /* save left-hand 'c' for next iteration */
8148
8149 if (notflag) {
8150 if (s1 != c)
8151 ok++;
8152 else
8153 return (0);
8154 } else if (s1 == c)
8155 ok++;
8156
8157 if ((c = *p++) == '\0')
8158 return (0);
8159
8160 } while (c != ']');
8161
8162 if (ok)
8163 goto top;
8164
8165 return (0);
8166 }
8167
8168 case '\\':
8169 if ((c = *p++) == '\0')
8170 return (0);
8171 /*FALLTHRU*/
8172
8173 default:
8174 if (c != s1)
8175 return (0);
8176 /*FALLTHRU*/
8177
8178 case '?':
8179 if (s1 != '\0')
8180 goto top;
8181 return (0);
8182
8183 case '*':
8184 while (*p == '*')
8185 p++; /* consecutive *'s are identical to a single one */
8186
8187 if (*p == '\0')
8188 return (1);
8189
8190 for (s = olds; *s != '\0'; s++) {
8191 if ((gs = dtrace_match_glob(s, p, depth + 1)) != 0)
8192 return (gs);
8193 }
8194
8195 return (0);
8196 }
8197}
8198
8199/*ARGSUSED*/
8200static int
8201dtrace_match_string(const char *s, const char *p, int depth)
8202{
8203 return (s != NULL && strcmp(s, p) == 0);
8204}
8205
8206/*ARGSUSED*/
8207static int
8208dtrace_match_nul(const char *s, const char *p, int depth)
8209{
8210 return (1); /* always match the empty pattern */
8211}
8212
8213/*ARGSUSED*/
8214static int
8215dtrace_match_nonzero(const char *s, const char *p, int depth)
8216{
8217 return (s != NULL && s[0] != '\0');
8218}
8219
8220static int
8221dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid,
8222 zoneid_t zoneid, int (*matched)(dtrace_probe_t *, void *), void *arg)
8223{
8224 dtrace_probe_t template, *probe;
8225 dtrace_hash_t *hash = NULL;
8226 int len, best = INT_MAX, nmatched = 0;
8227 dtrace_id_t i;
8228
8229 ASSERT(MUTEX_HELD(&dtrace_lock));
8230
8231 /*
8232 * If the probe ID is specified in the key, just lookup by ID and
8233 * invoke the match callback once if a matching probe is found.
8234 */
8235 if (pkp->dtpk_id != DTRACE_IDNONE) {
8236 if ((probe = dtrace_probe_lookup_id(pkp->dtpk_id)) != NULL &&
8237 dtrace_match_probe(probe, pkp, priv, uid, zoneid) > 0) {
8238 (void) (*matched)(probe, arg);
8239 nmatched++;
8240 }
8241 return (nmatched);
8242 }
8243
8244 template.dtpr_mod = (char *)pkp->dtpk_mod;
8245 template.dtpr_func = (char *)pkp->dtpk_func;
8246 template.dtpr_name = (char *)pkp->dtpk_name;
8247
8248 /*
8249 * We want to find the most distinct of the module name, function
8250 * name, and name. So for each one that is not a glob pattern or
8251 * empty string, we perform a lookup in the corresponding hash and
8252 * use the hash table with the fewest collisions to do our search.
8253 */
8254 if (pkp->dtpk_mmatch == &dtrace_match_string &&
8255 (len = dtrace_hash_collisions(dtrace_bymod, &template)) < best) {
8256 best = len;
8257 hash = dtrace_bymod;
8258 }
8259
8260 if (pkp->dtpk_fmatch == &dtrace_match_string &&
8261 (len = dtrace_hash_collisions(dtrace_byfunc, &template)) < best) {
8262 best = len;
8263 hash = dtrace_byfunc;
8264 }
8265
8266 if (pkp->dtpk_nmatch == &dtrace_match_string &&
8267 (len = dtrace_hash_collisions(dtrace_byname, &template)) < best) {
8268 best = len;
8269 hash = dtrace_byname;
8270 }
8271
8272 /*
8273 * If we did not select a hash table, iterate over every probe and
8274 * invoke our callback for each one that matches our input probe key.
8275 */
8276 if (hash == NULL) {
8277 for (i = 0; i < dtrace_nprobes; i++) {
8278 if ((probe = dtrace_probes[i]) == NULL ||
8279 dtrace_match_probe(probe, pkp, priv, uid,
8280 zoneid) <= 0)
8281 continue;
8282
8283 nmatched++;
8284
8285 if ((*matched)(probe, arg) != DTRACE_MATCH_NEXT)
8286 break;
8287 }
8288
8289 return (nmatched);
8290 }
8291
8292 /*
8293 * If we selected a hash table, iterate over each probe of the same key
8294 * name and invoke the callback for every probe that matches the other
8295 * attributes of our input probe key.
8296 */
8297 for (probe = dtrace_hash_lookup(hash, &template); probe != NULL;
8298 probe = *(DTRACE_HASHNEXT(hash, probe))) {
8299
8300 if (dtrace_match_probe(probe, pkp, priv, uid, zoneid) <= 0)
8301 continue;
8302
8303 nmatched++;
8304
8305 if ((*matched)(probe, arg) != DTRACE_MATCH_NEXT)
8306 break;
8307 }
8308
8309 return (nmatched);
8310}
8311
8312/*
8313 * Return the function pointer dtrace_match_probe() should use to compare the
8314 * specified pattern with a string. For NULL or empty patterns, we select
8315 * dtrace_match_nul(). For glob pattern strings, we use dtrace_match_glob().
8316 * For non-empty non-glob strings, we use dtrace_match_string().
8317 */
8318static dtrace_probekey_f *
8319dtrace_probekey_func(const char *p)
8320{
8321 char c;
8322
8323 if (p == NULL || *p == '\0')
8324 return (&dtrace_match_nul);
8325
8326 while ((c = *p++) != '\0') {
8327 if (c == '[' || c == '?' || c == '*' || c == '\\')
8328 return (&dtrace_match_glob);
8329 }
8330
8331 return (&dtrace_match_string);
8332}
8333
8334/*
8335 * Build a probe comparison key for use with dtrace_match_probe() from the
8336 * given probe description. By convention, a null key only matches anchored
8337 * probes: if each field is the empty string, reset dtpk_fmatch to
8338 * dtrace_match_nonzero().
8339 */
8340static void
8341dtrace_probekey(dtrace_probedesc_t *pdp, dtrace_probekey_t *pkp)
8342{
8343 pkp->dtpk_prov = pdp->dtpd_provider;
8344 pkp->dtpk_pmatch = dtrace_probekey_func(pdp->dtpd_provider);
8345
8346 pkp->dtpk_mod = pdp->dtpd_mod;
8347 pkp->dtpk_mmatch = dtrace_probekey_func(pdp->dtpd_mod);
8348
8349 pkp->dtpk_func = pdp->dtpd_func;
8350 pkp->dtpk_fmatch = dtrace_probekey_func(pdp->dtpd_func);
8351
8352 pkp->dtpk_name = pdp->dtpd_name;
8353 pkp->dtpk_nmatch = dtrace_probekey_func(pdp->dtpd_name);
8354
8355 pkp->dtpk_id = pdp->dtpd_id;
8356
8357 if (pkp->dtpk_id == DTRACE_IDNONE &&
8358 pkp->dtpk_pmatch == &dtrace_match_nul &&
8359 pkp->dtpk_mmatch == &dtrace_match_nul &&
8360 pkp->dtpk_fmatch == &dtrace_match_nul &&
8361 pkp->dtpk_nmatch == &dtrace_match_nul)
8362 pkp->dtpk_fmatch = &dtrace_match_nonzero;
8363}
8364
8365/*
8366 * DTrace Provider-to-Framework API Functions
8367 *
8368 * These functions implement much of the Provider-to-Framework API, as
8369 * described in <sys/dtrace.h>. The parts of the API not in this section are
8370 * the functions in the API for probe management (found below), and
8371 * dtrace_probe() itself (found above).
8372 */
8373
8374/*
8375 * Register the calling provider with the DTrace framework. This should
8376 * generally be called by DTrace providers in their attach(9E) entry point.
8377 */
8378int
8379dtrace_register(const char *name, const dtrace_pattr_t *pap, uint32_t priv,
8380 cred_t *cr, const dtrace_pops_t *pops, void *arg, dtrace_provider_id_t *idp)
8381{
8382 dtrace_provider_t *provider;
8383
8384 if (name == NULL || pap == NULL || pops == NULL || idp == NULL) {
8385 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8386 "arguments", name ? name : "<NULL>");
8387 return (EINVAL);
8388 }
8389
8390 if (name[0] == '\0' || dtrace_badname(name)) {
8391 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8392 "provider name", name);
8393 return (EINVAL);
8394 }
8395
8396 if ((pops->dtps_provide == NULL && pops->dtps_provide_module == NULL) ||
8397 pops->dtps_enable == NULL || pops->dtps_disable == NULL ||
8398 pops->dtps_destroy == NULL ||
8399 ((pops->dtps_resume == NULL) != (pops->dtps_suspend == NULL))) {
8400 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8401 "provider ops", name);
8402 return (EINVAL);
8403 }
8404
8405 if (dtrace_badattr(&pap->dtpa_provider) ||
8406 dtrace_badattr(&pap->dtpa_mod) ||
8407 dtrace_badattr(&pap->dtpa_func) ||
8408 dtrace_badattr(&pap->dtpa_name) ||
8409 dtrace_badattr(&pap->dtpa_args)) {
8410 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8411 "provider attributes", name);
8412 return (EINVAL);
8413 }
8414
8415 if (priv & ~DTRACE_PRIV_ALL) {
8416 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8417 "privilege attributes", name);
8418 return (EINVAL);
8419 }
8420
8421 if ((priv & DTRACE_PRIV_KERNEL) &&
8422 (priv & (DTRACE_PRIV_USER | DTRACE_PRIV_OWNER)) &&
8423 pops->dtps_usermode == NULL) {
8424 cmn_err(CE_WARN, "failed to register provider '%s': need "
8425 "dtps_usermode() op for given privilege attributes", name);
8426 return (EINVAL);
8427 }
8428
8429 provider = kmem_zalloc(sizeof (dtrace_provider_t), KM_SLEEP);
8430 provider->dtpv_name = kmem_alloc(strlen(name) + 1, KM_SLEEP);
8431 (void) strcpy(provider->dtpv_name, name);
8432
8433 provider->dtpv_attr = *pap;
8434 provider->dtpv_priv.dtpp_flags = priv;
8435 if (cr != NULL) {
8436 provider->dtpv_priv.dtpp_uid = crgetuid(cr);
8437 provider->dtpv_priv.dtpp_zoneid = crgetzoneid(cr);
8438 }
8439 provider->dtpv_pops = *pops;
8440
8441 if (pops->dtps_provide == NULL) {
8442 ASSERT(pops->dtps_provide_module != NULL);
8443 provider->dtpv_pops.dtps_provide =
8444 (void (*)(void *, dtrace_probedesc_t *))dtrace_nullop;
8445 }
8446
8447 if (pops->dtps_provide_module == NULL) {
8448 ASSERT(pops->dtps_provide != NULL);
8449 provider->dtpv_pops.dtps_provide_module =
8450 (void (*)(void *, modctl_t *))dtrace_nullop;
8451 }
8452
8453 if (pops->dtps_suspend == NULL) {
8454 ASSERT(pops->dtps_resume == NULL);
8455 provider->dtpv_pops.dtps_suspend =
8456 (void (*)(void *, dtrace_id_t, void *))dtrace_nullop;
8457 provider->dtpv_pops.dtps_resume =
8458 (void (*)(void *, dtrace_id_t, void *))dtrace_nullop;
8459 }
8460
8461 provider->dtpv_arg = arg;
8462 *idp = (dtrace_provider_id_t)provider;
8463
8464 if (pops == &dtrace_provider_ops) {
8465 ASSERT(MUTEX_HELD(&dtrace_provider_lock));
8466 ASSERT(MUTEX_HELD(&dtrace_lock));
8467 ASSERT(dtrace_anon.dta_enabling == NULL);
8468
8469 /*
8470 * We make sure that the DTrace provider is at the head of
8471 * the provider chain.
8472 */
8473 provider->dtpv_next = dtrace_provider;
8474 dtrace_provider = provider;
8475 return (0);
8476 }
8477
8478 mutex_enter(&dtrace_provider_lock);
8479 mutex_enter(&dtrace_lock);
8480
8481 /*
8482 * If there is at least one provider registered, we'll add this
8483 * provider after the first provider.
8484 */
8485 if (dtrace_provider != NULL) {
8486 provider->dtpv_next = dtrace_provider->dtpv_next;
8487 dtrace_provider->dtpv_next = provider;
8488 } else {
8489 dtrace_provider = provider;
8490 }
8491
8492 if (dtrace_retained != NULL) {
8493 dtrace_enabling_provide(provider);
8494
8495 /*
8496 * Now we need to call dtrace_enabling_matchall() -- which
8497 * will acquire cpu_lock and dtrace_lock. We therefore need
8498 * to drop all of our locks before calling into it...
8499 */
8500 mutex_exit(&dtrace_lock);
8501 mutex_exit(&dtrace_provider_lock);
8502 dtrace_enabling_matchall();
8503
8504 return (0);
8505 }
8506
8507 mutex_exit(&dtrace_lock);
8508 mutex_exit(&dtrace_provider_lock);
8509
8510 return (0);
8511}
8512
8513/*
8514 * Unregister the specified provider from the DTrace framework. This should
8515 * generally be called by DTrace providers in their detach(9E) entry point.
8516 */
8517int
8518dtrace_unregister(dtrace_provider_id_t id)
8519{
8520 dtrace_provider_t *old = (dtrace_provider_t *)id;
8521 dtrace_provider_t *prev = NULL;
8522 int i, self = 0, noreap = 0;
8523 dtrace_probe_t *probe, *first = NULL;
8524
8525 if (old->dtpv_pops.dtps_enable ==
8526 (void (*)(void *, dtrace_id_t, void *))dtrace_nullop) {
8527 /*
8528 * If DTrace itself is the provider, we're called with locks
8529 * already held.
8530 */
8531 ASSERT(old == dtrace_provider);
8532#if defined(sun)
8533 ASSERT(dtrace_devi != NULL);
8534#endif
8535 ASSERT(MUTEX_HELD(&dtrace_provider_lock));
8536 ASSERT(MUTEX_HELD(&dtrace_lock));
8537 self = 1;
8538
8539 if (dtrace_provider->dtpv_next != NULL) {
8540 /*
8541 * There's another provider here; return failure.
8542 */
8543 return (EBUSY);
8544 }
8545 } else {
8546 mutex_enter(&dtrace_provider_lock);
8547#if defined(sun)
8548 mutex_enter(&mod_lock);
8549#endif
8550 mutex_enter(&dtrace_lock);
8551 }
8552
8553 /*
8554 * If anyone has /dev/dtrace open, or if there are anonymous enabled
8555 * probes, we refuse to let providers slither away, unless this
8556 * provider has already been explicitly invalidated.
8557 */
8558 if (!old->dtpv_defunct &&
8559 (dtrace_opens || (dtrace_anon.dta_state != NULL &&
8560 dtrace_anon.dta_state->dts_necbs > 0))) {
8561 if (!self) {
8562 mutex_exit(&dtrace_lock);
8563#if defined(sun)
8564 mutex_exit(&mod_lock);
8565#endif
8566 mutex_exit(&dtrace_provider_lock);
8567 }
8568 return (EBUSY);
8569 }
8570
8571 /*
8572 * Attempt to destroy the probes associated with this provider.
8573 */
8574 for (i = 0; i < dtrace_nprobes; i++) {
8575 if ((probe = dtrace_probes[i]) == NULL)
8576 continue;
8577
8578 if (probe->dtpr_provider != old)
8579 continue;
8580
8581 if (probe->dtpr_ecb == NULL)
8582 continue;
8583
8584 /*
8585 * If we are trying to unregister a defunct provider, and the
8586 * provider was made defunct within the interval dictated by
8587 * dtrace_unregister_defunct_reap, we'll (asynchronously)
8588 * attempt to reap our enablings. To denote that the provider
8589 * should reattempt to unregister itself at some point in the
8590 * future, we will return a differentiable error code (EAGAIN
8591 * instead of EBUSY) in this case.
8592 */
8593 if (dtrace_gethrtime() - old->dtpv_defunct >
8594 dtrace_unregister_defunct_reap)
8595 noreap = 1;
8596
8597 if (!self) {
8598 mutex_exit(&dtrace_lock);
8599#if defined(sun)
8600 mutex_exit(&mod_lock);
8601#endif
8602 mutex_exit(&dtrace_provider_lock);
8603 }
8604
8605 if (noreap)
8606 return (EBUSY);
8607
8608 (void) taskq_dispatch(dtrace_taskq,
8609 (task_func_t *)dtrace_enabling_reap, NULL, TQ_SLEEP);
8610
8611 return (EAGAIN);
8612 }
8613
8614 /*
8615 * All of the probes for this provider are disabled; we can safely
8616 * remove all of them from their hash chains and from the probe array.
8617 */
8618 for (i = 0; i < dtrace_nprobes; i++) {
8619 if ((probe = dtrace_probes[i]) == NULL)
8620 continue;
8621
8622 if (probe->dtpr_provider != old)
8623 continue;
8624
8625 dtrace_probes[i] = NULL;
8626
8627 dtrace_hash_remove(dtrace_bymod, probe);
8628 dtrace_hash_remove(dtrace_byfunc, probe);
8629 dtrace_hash_remove(dtrace_byname, probe);
8630
8631 if (first == NULL) {
8632 first = probe;
8633 probe->dtpr_nextmod = NULL;
8634 } else {
8635 probe->dtpr_nextmod = first;
8636 first = probe;
8637 }
8638 }
8639
8640 /*
8641 * The provider's probes have been removed from the hash chains and
8642 * from the probe array. Now issue a dtrace_sync() to be sure that
8643 * everyone has cleared out from any probe array processing.
8644 */
8645 dtrace_sync();
8646
8647 for (probe = first; probe != NULL; probe = first) {
8648 first = probe->dtpr_nextmod;
8649
8650 old->dtpv_pops.dtps_destroy(old->dtpv_arg, probe->dtpr_id,
8651 probe->dtpr_arg);
8652 kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
8653 kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
8654 kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
8655#if defined(sun)
8656 vmem_free(dtrace_arena, (void *)(uintptr_t)(probe->dtpr_id), 1);
8657#else
8658 free_unr(dtrace_arena, probe->dtpr_id);
8659#endif
8660 kmem_free(probe, sizeof (dtrace_probe_t));
8661 }
8662
8663 if ((prev = dtrace_provider) == old) {
8664#if defined(sun)
8665 ASSERT(self || dtrace_devi == NULL);
8666 ASSERT(old->dtpv_next == NULL || dtrace_devi == NULL);
8667#endif
8668 dtrace_provider = old->dtpv_next;
8669 } else {
8670 while (prev != NULL && prev->dtpv_next != old)
8671 prev = prev->dtpv_next;
8672
8673 if (prev == NULL) {
8674 panic("attempt to unregister non-existent "
8675 "dtrace provider %p\n", (void *)id);
8676 }
8677
8678 prev->dtpv_next = old->dtpv_next;
8679 }
8680
8681 if (!self) {
8682 mutex_exit(&dtrace_lock);
8683#if defined(sun)
8684 mutex_exit(&mod_lock);
8685#endif
8686 mutex_exit(&dtrace_provider_lock);
8687 }
8688
8689 kmem_free(old->dtpv_name, strlen(old->dtpv_name) + 1);
8690 kmem_free(old, sizeof (dtrace_provider_t));
8691
8692 return (0);
8693}
8694
8695/*
8696 * Invalidate the specified provider. All subsequent probe lookups for the
8697 * specified provider will fail, but its probes will not be removed.
8698 */
8699void
8700dtrace_invalidate(dtrace_provider_id_t id)
8701{
8702 dtrace_provider_t *pvp = (dtrace_provider_t *)id;
8703
8704 ASSERT(pvp->dtpv_pops.dtps_enable !=
8705 (void (*)(void *, dtrace_id_t, void *))dtrace_nullop);
8706
8707 mutex_enter(&dtrace_provider_lock);
8708 mutex_enter(&dtrace_lock);
8709
8710 pvp->dtpv_defunct = dtrace_gethrtime();
8711
8712 mutex_exit(&dtrace_lock);
8713 mutex_exit(&dtrace_provider_lock);
8714}
8715
8716/*
8717 * Indicate whether or not DTrace has attached.
8718 */
8719int
8720dtrace_attached(void)
8721{
8722 /*
8723 * dtrace_provider will be non-NULL iff the DTrace driver has
8724 * attached. (It's non-NULL because DTrace is always itself a
8725 * provider.)
8726 */
8727 return (dtrace_provider != NULL);
8728}
8729
8730/*
8731 * Remove all the unenabled probes for the given provider. This function is
8732 * not unlike dtrace_unregister(), except that it doesn't remove the provider
8733 * -- just as many of its associated probes as it can.
8734 */
8735int
8736dtrace_condense(dtrace_provider_id_t id)
8737{
8738 dtrace_provider_t *prov = (dtrace_provider_t *)id;
8739 int i;
8740 dtrace_probe_t *probe;
8741
8742 /*
8743 * Make sure this isn't the dtrace provider itself.
8744 */
8745 ASSERT(prov->dtpv_pops.dtps_enable !=
8746 (void (*)(void *, dtrace_id_t, void *))dtrace_nullop);
8747
8748 mutex_enter(&dtrace_provider_lock);
8749 mutex_enter(&dtrace_lock);
8750
8751 /*
8752 * Attempt to destroy the probes associated with this provider.
8753 */
8754 for (i = 0; i < dtrace_nprobes; i++) {
8755 if ((probe = dtrace_probes[i]) == NULL)
8756 continue;
8757
8758 if (probe->dtpr_provider != prov)
8759 continue;
8760
8761 if (probe->dtpr_ecb != NULL)
8762 continue;
8763
8764 dtrace_probes[i] = NULL;
8765
8766 dtrace_hash_remove(dtrace_bymod, probe);
8767 dtrace_hash_remove(dtrace_byfunc, probe);
8768 dtrace_hash_remove(dtrace_byname, probe);
8769
8770 prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, i + 1,
8771 probe->dtpr_arg);
8772 kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
8773 kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
8774 kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
8775 kmem_free(probe, sizeof (dtrace_probe_t));
8776#if defined(sun)
8777 vmem_free(dtrace_arena, (void *)((uintptr_t)i + 1), 1);
8778#else
8779 free_unr(dtrace_arena, i + 1);
8780#endif
8781 }
8782
8783 mutex_exit(&dtrace_lock);
8784 mutex_exit(&dtrace_provider_lock);
8785
8786 return (0);
8787}
8788
8789/*
8790 * DTrace Probe Management Functions
8791 *
8792 * The functions in this section perform the DTrace probe management,
8793 * including functions to create probes, look-up probes, and call into the
8794 * providers to request that probes be provided. Some of these functions are
8795 * in the Provider-to-Framework API; these functions can be identified by the
8796 * fact that they are not declared "static".
8797 */
8798
8799/*
8800 * Create a probe with the specified module name, function name, and name.
8801 */
8802dtrace_id_t
8803dtrace_probe_create(dtrace_provider_id_t prov, const char *mod,
8804 const char *func, const char *name, int aframes, void *arg)
8805{
8806 dtrace_probe_t *probe, **probes;
8807 dtrace_provider_t *provider = (dtrace_provider_t *)prov;
8808 dtrace_id_t id;
8809
8810 if (provider == dtrace_provider) {
8811 ASSERT(MUTEX_HELD(&dtrace_lock));
8812 } else {
8813 mutex_enter(&dtrace_lock);
8814 }
8815
8816#if defined(sun)
8817 id = (dtrace_id_t)(uintptr_t)vmem_alloc(dtrace_arena, 1,
8818 VM_BESTFIT | VM_SLEEP);
8819#else
8820 id = alloc_unr(dtrace_arena);
8821#endif
8822 probe = kmem_zalloc(sizeof (dtrace_probe_t), KM_SLEEP);
8823
8824 probe->dtpr_id = id;
8825 probe->dtpr_gen = dtrace_probegen++;
8826 probe->dtpr_mod = dtrace_strdup(mod);
8827 probe->dtpr_func = dtrace_strdup(func);
8828 probe->dtpr_name = dtrace_strdup(name);
8829 probe->dtpr_arg = arg;
8830 probe->dtpr_aframes = aframes;
8831 probe->dtpr_provider = provider;
8832
8833 dtrace_hash_add(dtrace_bymod, probe);
8834 dtrace_hash_add(dtrace_byfunc, probe);
8835 dtrace_hash_add(dtrace_byname, probe);
8836
8837 if (id - 1 >= dtrace_nprobes) {
8838 size_t osize = dtrace_nprobes * sizeof (dtrace_probe_t *);
8839 size_t nsize = osize << 1;
8840
8841 if (nsize == 0) {
8842 ASSERT(osize == 0);
8843 ASSERT(dtrace_probes == NULL);
8844 nsize = sizeof (dtrace_probe_t *);
8845 }
8846
8847 probes = kmem_zalloc(nsize, KM_SLEEP);
8848
8849 if (dtrace_probes == NULL) {
8850 ASSERT(osize == 0);
8851 dtrace_probes = probes;
8852 dtrace_nprobes = 1;
8853 } else {
8854 dtrace_probe_t **oprobes = dtrace_probes;
8855
8856 bcopy(oprobes, probes, osize);
8857 dtrace_membar_producer();
8858 dtrace_probes = probes;
8859
8860 dtrace_sync();
8861
8862 /*
8863 * All CPUs are now seeing the new probes array; we can
8864 * safely free the old array.
8865 */
8866 kmem_free(oprobes, osize);
8867 dtrace_nprobes <<= 1;
8868 }
8869
8870 ASSERT(id - 1 < dtrace_nprobes);
8871 }
8872
8873 ASSERT(dtrace_probes[id - 1] == NULL);
8874 dtrace_probes[id - 1] = probe;
8875
8876 if (provider != dtrace_provider)
8877 mutex_exit(&dtrace_lock);
8878
8879 return (id);
8880}
8881
8882static dtrace_probe_t *
8883dtrace_probe_lookup_id(dtrace_id_t id)
8884{
8885 ASSERT(MUTEX_HELD(&dtrace_lock));
8886
8887 if (id == 0 || id > dtrace_nprobes)
8888 return (NULL);
8889
8890 return (dtrace_probes[id - 1]);
8891}
8892
8893static int
8894dtrace_probe_lookup_match(dtrace_probe_t *probe, void *arg)
8895{
8896 *((dtrace_id_t *)arg) = probe->dtpr_id;
8897
8898 return (DTRACE_MATCH_DONE);
8899}
8900
8901/*
8902 * Look up a probe based on provider and one or more of module name, function
8903 * name and probe name.
8904 */
8905dtrace_id_t
8906dtrace_probe_lookup(dtrace_provider_id_t prid, char *mod,
8907 char *func, char *name)
8908{
8909 dtrace_probekey_t pkey;
8910 dtrace_id_t id;
8911 int match;
8912
8913 pkey.dtpk_prov = ((dtrace_provider_t *)prid)->dtpv_name;
8914 pkey.dtpk_pmatch = &dtrace_match_string;
8915 pkey.dtpk_mod = mod;
8916 pkey.dtpk_mmatch = mod ? &dtrace_match_string : &dtrace_match_nul;
8917 pkey.dtpk_func = func;
8918 pkey.dtpk_fmatch = func ? &dtrace_match_string : &dtrace_match_nul;
8919 pkey.dtpk_name = name;
8920 pkey.dtpk_nmatch = name ? &dtrace_match_string : &dtrace_match_nul;
8921 pkey.dtpk_id = DTRACE_IDNONE;
8922
8923 mutex_enter(&dtrace_lock);
8924 match = dtrace_match(&pkey, DTRACE_PRIV_ALL, 0, 0,
8925 dtrace_probe_lookup_match, &id);
8926 mutex_exit(&dtrace_lock);
8927
8928 ASSERT(match == 1 || match == 0);
8929 return (match ? id : 0);
8930}
8931
8932/*
8933 * Returns the probe argument associated with the specified probe.
8934 */
8935void *
8936dtrace_probe_arg(dtrace_provider_id_t id, dtrace_id_t pid)
8937{
8938 dtrace_probe_t *probe;
8939 void *rval = NULL;
8940
8941 mutex_enter(&dtrace_lock);
8942
8943 if ((probe = dtrace_probe_lookup_id(pid)) != NULL &&
8944 probe->dtpr_provider == (dtrace_provider_t *)id)
8945 rval = probe->dtpr_arg;
8946
8947 mutex_exit(&dtrace_lock);
8948
8949 return (rval);
8950}
8951
8952/*
8953 * Copy a probe into a probe description.
8954 */
8955static void
8956dtrace_probe_description(const dtrace_probe_t *prp, dtrace_probedesc_t *pdp)
8957{
8958 bzero(pdp, sizeof (dtrace_probedesc_t));
8959 pdp->dtpd_id = prp->dtpr_id;
8960
8961 (void) strncpy(pdp->dtpd_provider,
8962 prp->dtpr_provider->dtpv_name, DTRACE_PROVNAMELEN - 1);
8963
8964 (void) strncpy(pdp->dtpd_mod, prp->dtpr_mod, DTRACE_MODNAMELEN - 1);
8965 (void) strncpy(pdp->dtpd_func, prp->dtpr_func, DTRACE_FUNCNAMELEN - 1);
8966 (void) strncpy(pdp->dtpd_name, prp->dtpr_name, DTRACE_NAMELEN - 1);
8967}
8968
8969/*
8970 * Called to indicate that a probe -- or probes -- should be provided by a
8971 * specified provider. If the specified description is NULL, the provider will
8972 * be told to provide all of its probes. (This is done whenever a new
8973 * consumer comes along, or whenever a retained enabling is to be matched.) If
8974 * the specified description is non-NULL, the provider is given the
8975 * opportunity to dynamically provide the specified probe, allowing providers
8976 * to support the creation of probes on-the-fly. (So-called _autocreated_
8977 * probes.) If the provider is NULL, the operations will be applied to all
8978 * providers; if the provider is non-NULL the operations will only be applied
8979 * to the specified provider. The dtrace_provider_lock must be held, and the
8980 * dtrace_lock must _not_ be held -- the provider's dtps_provide() operation
8981 * will need to grab the dtrace_lock when it reenters the framework through
8982 * dtrace_probe_lookup(), dtrace_probe_create(), etc.
8983 */
8984static void
8985dtrace_probe_provide(dtrace_probedesc_t *desc, dtrace_provider_t *prv)
8986{
8987#if defined(sun)
8988 modctl_t *ctl;
8989#endif
8990 int all = 0;
8991
8992 ASSERT(MUTEX_HELD(&dtrace_provider_lock));
8993
8994 if (prv == NULL) {
8995 all = 1;
8996 prv = dtrace_provider;
8997 }
8998
8999 do {
9000 /*
9001 * First, call the blanket provide operation.
9002 */
9003 prv->dtpv_pops.dtps_provide(prv->dtpv_arg, desc);
9004
9005#if defined(sun)
9006 /*
9007 * Now call the per-module provide operation. We will grab
9008 * mod_lock to prevent the list from being modified. Note
9009 * that this also prevents the mod_busy bits from changing.
9010 * (mod_busy can only be changed with mod_lock held.)
9011 */
9012 mutex_enter(&mod_lock);
9013
9014 ctl = &modules;
9015 do {
9016 if (ctl->mod_busy || ctl->mod_mp == NULL)
9017 continue;
9018
9019 prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
9020
9021 } while ((ctl = ctl->mod_next) != &modules);
9022
9023 mutex_exit(&mod_lock);
9024#endif
9025 } while (all && (prv = prv->dtpv_next) != NULL);
9026}
9027
9028#if defined(sun)
9029/*
9030 * Iterate over each probe, and call the Framework-to-Provider API function
9031 * denoted by offs.
9032 */
9033static void
9034dtrace_probe_foreach(uintptr_t offs)
9035{
9036 dtrace_provider_t *prov;
9037 void (*func)(void *, dtrace_id_t, void *);
9038 dtrace_probe_t *probe;
9039 dtrace_icookie_t cookie;
9040 int i;
9041
9042 /*
9043 * We disable interrupts to walk through the probe array. This is
9044 * safe -- the dtrace_sync() in dtrace_unregister() assures that we
9045 * won't see stale data.
9046 */
9047 cookie = dtrace_interrupt_disable();
9048
9049 for (i = 0; i < dtrace_nprobes; i++) {
9050 if ((probe = dtrace_probes[i]) == NULL)
9051 continue;
9052
9053 if (probe->dtpr_ecb == NULL) {
9054 /*
9055 * This probe isn't enabled -- don't call the function.
9056 */
9057 continue;
9058 }
9059
9060 prov = probe->dtpr_provider;
9061 func = *((void(**)(void *, dtrace_id_t, void *))
9062 ((uintptr_t)&prov->dtpv_pops + offs));
9063
9064 func(prov->dtpv_arg, i + 1, probe->dtpr_arg);
9065 }
9066
9067 dtrace_interrupt_enable(cookie);
9068}
9069#endif
9070
9071static int
9072dtrace_probe_enable(dtrace_probedesc_t *desc, dtrace_enabling_t *enab)
9073{
9074 dtrace_probekey_t pkey;
9075 uint32_t priv;
9076 uid_t uid;
9077 zoneid_t zoneid;
9078
9079 ASSERT(MUTEX_HELD(&dtrace_lock));
9080 dtrace_ecb_create_cache = NULL;
9081
9082 if (desc == NULL) {
9083 /*
9084 * If we're passed a NULL description, we're being asked to
9085 * create an ECB with a NULL probe.
9086 */
9087 (void) dtrace_ecb_create_enable(NULL, enab);
9088 return (0);
9089 }
9090
9091 dtrace_probekey(desc, &pkey);
9092 dtrace_cred2priv(enab->dten_vstate->dtvs_state->dts_cred.dcr_cred,
9093 &priv, &uid, &zoneid);
9094
9095 return (dtrace_match(&pkey, priv, uid, zoneid, dtrace_ecb_create_enable,
9096 enab));
9097}
9098
9099/*
9100 * DTrace Helper Provider Functions
9101 */
9102static void
9103dtrace_dofattr2attr(dtrace_attribute_t *attr, const dof_attr_t dofattr)
9104{
9105 attr->dtat_name = DOF_ATTR_NAME(dofattr);
9106 attr->dtat_data = DOF_ATTR_DATA(dofattr);
9107 attr->dtat_class = DOF_ATTR_CLASS(dofattr);
9108}
9109
9110static void
9111dtrace_dofprov2hprov(dtrace_helper_provdesc_t *hprov,
9112 const dof_provider_t *dofprov, char *strtab)
9113{
9114 hprov->dthpv_provname = strtab + dofprov->dofpv_name;
9115 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_provider,
9116 dofprov->dofpv_provattr);
9117 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_mod,
9118 dofprov->dofpv_modattr);
9119 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_func,
9120 dofprov->dofpv_funcattr);
9121 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_name,
9122 dofprov->dofpv_nameattr);
9123 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_args,
9124 dofprov->dofpv_argsattr);
9125}
9126
9127static void
9128dtrace_helper_provide_one(dof_helper_t *dhp, dof_sec_t *sec, pid_t pid)
9129{
9130 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
9131 dof_hdr_t *dof = (dof_hdr_t *)daddr;
9132 dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
9133 dof_provider_t *provider;
9134 dof_probe_t *probe;
9135 uint32_t *off, *enoff;
9136 uint8_t *arg;
9137 char *strtab;
9138 uint_t i, nprobes;
9139 dtrace_helper_provdesc_t dhpv;
9140 dtrace_helper_probedesc_t dhpb;
9141 dtrace_meta_t *meta = dtrace_meta_pid;
9142 dtrace_mops_t *mops = &meta->dtm_mops;
9143 void *parg;
9144
9145 provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
9146 str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9147 provider->dofpv_strtab * dof->dofh_secsize);
9148 prb_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9149 provider->dofpv_probes * dof->dofh_secsize);
9150 arg_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9151 provider->dofpv_prargs * dof->dofh_secsize);
9152 off_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9153 provider->dofpv_proffs * dof->dofh_secsize);
9154
9155 strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
9156 off = (uint32_t *)(uintptr_t)(daddr + off_sec->dofs_offset);
9157 arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
9158 enoff = NULL;
9159
9160 /*
9161 * See dtrace_helper_provider_validate().
9162 */
9163 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
9164 provider->dofpv_prenoffs != DOF_SECT_NONE) {
9165 enoff_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9166 provider->dofpv_prenoffs * dof->dofh_secsize);
9167 enoff = (uint32_t *)(uintptr_t)(daddr + enoff_sec->dofs_offset);
9168 }
9169
9170 nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
9171
9172 /*
9173 * Create the provider.
9174 */
9175 dtrace_dofprov2hprov(&dhpv, provider, strtab);
9176
9177 if ((parg = mops->dtms_provide_pid(meta->dtm_arg, &dhpv, pid)) == NULL)
9178 return;
9179
9180 meta->dtm_count++;
9181
9182 /*
9183 * Create the probes.
9184 */
9185 for (i = 0; i < nprobes; i++) {
9186 probe = (dof_probe_t *)(uintptr_t)(daddr +
9187 prb_sec->dofs_offset + i * prb_sec->dofs_entsize);
9188
9189 dhpb.dthpb_mod = dhp->dofhp_mod;
9190 dhpb.dthpb_func = strtab + probe->dofpr_func;
9191 dhpb.dthpb_name = strtab + probe->dofpr_name;
9192 dhpb.dthpb_base = probe->dofpr_addr;
9193 dhpb.dthpb_offs = off + probe->dofpr_offidx;
9194 dhpb.dthpb_noffs = probe->dofpr_noffs;
9195 if (enoff != NULL) {
9196 dhpb.dthpb_enoffs = enoff + probe->dofpr_enoffidx;
9197 dhpb.dthpb_nenoffs = probe->dofpr_nenoffs;
9198 } else {
9199 dhpb.dthpb_enoffs = NULL;
9200 dhpb.dthpb_nenoffs = 0;
9201 }
9202 dhpb.dthpb_args = arg + probe->dofpr_argidx;
9203 dhpb.dthpb_nargc = probe->dofpr_nargc;
9204 dhpb.dthpb_xargc = probe->dofpr_xargc;
9205 dhpb.dthpb_ntypes = strtab + probe->dofpr_nargv;
9206 dhpb.dthpb_xtypes = strtab + probe->dofpr_xargv;
9207
9208 mops->dtms_create_probe(meta->dtm_arg, parg, &dhpb);
9209 }
9210}
9211
9212static void
9213dtrace_helper_provide(dof_helper_t *dhp, pid_t pid)
9214{
9215 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
9216 dof_hdr_t *dof = (dof_hdr_t *)daddr;
9217 int i;
9218
9219 ASSERT(MUTEX_HELD(&dtrace_meta_lock));
9220
9221 for (i = 0; i < dof->dofh_secnum; i++) {
9222 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
9223 dof->dofh_secoff + i * dof->dofh_secsize);
9224
9225 if (sec->dofs_type != DOF_SECT_PROVIDER)
9226 continue;
9227
9228 dtrace_helper_provide_one(dhp, sec, pid);
9229 }
9230
9231 /*
9232 * We may have just created probes, so we must now rematch against
9233 * any retained enablings. Note that this call will acquire both
9234 * cpu_lock and dtrace_lock; the fact that we are holding
9235 * dtrace_meta_lock now is what defines the ordering with respect to
9236 * these three locks.
9237 */
9238 dtrace_enabling_matchall();
9239}
9240
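/*
 * The inverse of dtrace_helper_provide_one(): rebuild the provider
 * description from the DOF and pass it to the meta-provider's
 * dtms_remove_pid() entry point.
 */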
9241static void
9242dtrace_helper_provider_remove_one(dof_helper_t *dhp, dof_sec_t *sec, pid_t pid)
9243{
9244 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
9245 dof_hdr_t *dof = (dof_hdr_t *)daddr;
9246 dof_sec_t *str_sec;
9247 dof_provider_t *provider;
9248 char *strtab;
9249 dtrace_helper_provdesc_t dhpv;
9250 dtrace_meta_t *meta = dtrace_meta_pid;
9251 dtrace_mops_t *mops = &meta->dtm_mops;
9252
9253 provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
9254 str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9255 provider->dofpv_strtab * dof->dofh_secsize);
9256
9257 strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
9258
9259 /*
9260 * Rebuild the provider description to hand to the remove entry point.
9261 */
9262 dtrace_dofprov2hprov(&dhpv, provider, strtab);
9263
9264 mops->dtms_remove_pid(meta->dtm_arg, &dhpv, pid);
9265
9266 meta->dtm_count--;
9267}
9268
9269static void
9270dtrace_helper_provider_remove(dof_helper_t *dhp, pid_t pid)
9271{
9272 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
9273 dof_hdr_t *dof = (dof_hdr_t *)daddr;
9274 int i;
9275
9276 ASSERT(MUTEX_HELD(&dtrace_meta_lock));
9277
9278 for (i = 0; i < dof->dofh_secnum; i++) {
9279 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
9280 dof->dofh_secoff + i * dof->dofh_secsize);
9281
9282 if (sec->dofs_type != DOF_SECT_PROVIDER)
9283 continue;
9284
9285 dtrace_helper_provider_remove_one(dhp, sec, pid);
9286 }
9287}
9288
9289/*
9290 * DTrace Meta Provider-to-Framework API Functions
9291 *
9292 * These functions implement the Meta Provider-to-Framework API, as described
9293 * in <sys/dtrace.h>.
9294 */
9295int
9296dtrace_meta_register(const char *name, const dtrace_mops_t *mops, void *arg,
9297 dtrace_meta_provider_id_t *idp)
9298{
9299 dtrace_meta_t *meta;
9300 dtrace_helpers_t *help, *next;
9301 int i;
9302
9303 *idp = DTRACE_METAPROVNONE;
9304
9305 /*
9306 * We strictly don't need the name, but we hold onto it for
9307 * debuggability. All hail error queues!
9308 */
9309 if (name == NULL) {
9310 cmn_err(CE_WARN, "failed to register meta-provider: "
9311 "invalid name");
9312 return (EINVAL);
9313 }
9314
9315 if (mops == NULL ||
9316 mops->dtms_create_probe == NULL ||
9317 mops->dtms_provide_pid == NULL ||
9318 mops->dtms_remove_pid == NULL) {
9319 cmn_err(CE_WARN, "failed to register meta-provider %s: "
9320 "invalid ops", name);
9321 return (EINVAL);
9322 }
9323
9324 meta = kmem_zalloc(sizeof (dtrace_meta_t), KM_SLEEP);
9325 meta->dtm_mops = *mops;
9326 meta->dtm_name = kmem_alloc(strlen(name) + 1, KM_SLEEP);
9327 (void) strcpy(meta->dtm_name, name);
9328 meta->dtm_arg = arg;
9329
9330 mutex_enter(&dtrace_meta_lock);
9331 mutex_enter(&dtrace_lock);
9332
9333 if (dtrace_meta_pid != NULL) {
9334 mutex_exit(&dtrace_lock);
9335 mutex_exit(&dtrace_meta_lock);
9336 cmn_err(CE_WARN, "failed to register meta-provider %s: "
9337 "user-land meta-provider exists", name);
9338 kmem_free(meta->dtm_name, strlen(meta->dtm_name) + 1);
9339 kmem_free(meta, sizeof (dtrace_meta_t));
9340 return (EINVAL);
9341 }
9342
9343 dtrace_meta_pid = meta;
9344 *idp = (dtrace_meta_provider_id_t)meta;
9345
9346 /*
9347 * If there are providers and probes ready to go, pass them
9348 * off to the new meta provider now.
9349 */
9350
9351 help = dtrace_deferred_pid;
9352 dtrace_deferred_pid = NULL;
9353
9354 mutex_exit(&dtrace_lock);
9355
9356 while (help != NULL) {
9357 for (i = 0; i < help->dthps_nprovs; i++) {
9358 dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
9359 help->dthps_pid);
9360 }
9361
9362 next = help->dthps_next;
9363 help->dthps_next = NULL;
9364 help->dthps_prev = NULL;
9365 help->dthps_deferred = 0;
9366 help = next;
9367 }
9368
9369 mutex_exit(&dtrace_meta_lock);
9370
9371 return (0);
9372}
9373
9374int
9375dtrace_meta_unregister(dtrace_meta_provider_id_t id)
9376{
9377 dtrace_meta_t **pp, *old = (dtrace_meta_t *)id;
9378
9379 mutex_enter(&dtrace_meta_lock);
9380 mutex_enter(&dtrace_lock);
9381
9382 if (old == dtrace_meta_pid) {
9383 pp = &dtrace_meta_pid;
9384 } else {
9385 panic("attempt to unregister non-existent "
9386 "dtrace meta-provider %p\n", (void *)old);
9387 }
9388
9389 if (old->dtm_count != 0) {
9390 mutex_exit(&dtrace_lock);
9391 mutex_exit(&dtrace_meta_lock);
9392 return (EBUSY);
9393 }
9394
9395 *pp = NULL;
9396
9397 mutex_exit(&dtrace_lock);
9398 mutex_exit(&dtrace_meta_lock);
9399
9400 kmem_free(old->dtm_name, strlen(old->dtm_name) + 1);
9401 kmem_free(old, sizeof (dtrace_meta_t));
9402
9403 return (0);
9404}
9405
9406
9407/*
9408 * DTrace DIF Object Functions
9409 */
9410static int
9411dtrace_difo_err(uint_t pc, const char *format, ...)
9412{
9413 if (dtrace_err_verbose) {
9414 va_list alist;
9415
9416 (void) uprintf("dtrace DIF object error: [%u]: ", pc);
9417 va_start(alist, format);
9418 (void) vuprintf(format, alist);
9419 va_end(alist);
9420 }
9421
9422#ifdef DTRACE_ERRDEBUG
9423 dtrace_errdebug(format);
9424#endif
9425 return (1);
9426}
9427
9428/*
9429 * Validate a DTrace DIF object by checking the IR instructions. The following
9430 * rules are currently enforced by dtrace_difo_validate():
9431 *
9432 * 1. Each instruction must have a valid opcode
9433 * 2. Each register, string, variable, or subroutine reference must be valid
9434 * 3. No instruction can modify register %r0 (must be zero)
9435 * 4. All instruction reserved bits must be set to zero
9436 * 5. The last instruction must be a "ret" instruction
9437 * 6. All branch targets must reference a valid instruction _after_ the branch
9438 */
9439static int
9440dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs,
9441 cred_t *cr)
9442{
9443 int err = 0, i;
9444 int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
9445 int kcheckload;
9446 uint_t pc;
9447
9448 kcheckload = cr == NULL ||
9449 (vstate->dtvs_state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) == 0;
9450
9451 dp->dtdo_destructive = 0;
9452
9453 for (pc = 0; pc < dp->dtdo_len && err == 0; pc++) {
9454 dif_instr_t instr = dp->dtdo_buf[pc];
9455
9456 uint_t r1 = DIF_INSTR_R1(instr);
9457 uint_t r2 = DIF_INSTR_R2(instr);
9458 uint_t rd = DIF_INSTR_RD(instr);
9459 uint_t rs = DIF_INSTR_RS(instr);
9460 uint_t label = DIF_INSTR_LABEL(instr);
9461 uint_t v = DIF_INSTR_VAR(instr);
9462 uint_t subr = DIF_INSTR_SUBR(instr);
9463 uint_t type = DIF_INSTR_TYPE(instr);
9464 uint_t op = DIF_INSTR_OP(instr);
9465
9466 switch (op) {
9467 case DIF_OP_OR:
9468 case DIF_OP_XOR:
9469 case DIF_OP_AND:
9470 case DIF_OP_SLL:
9471 case DIF_OP_SRL:
9472 case DIF_OP_SRA:
9473 case DIF_OP_SUB:
9474 case DIF_OP_ADD:
9475 case DIF_OP_MUL:
9476 case DIF_OP_SDIV:
9477 case DIF_OP_UDIV:
9478 case DIF_OP_SREM:
9479 case DIF_OP_UREM:
9480 case DIF_OP_COPYS:
9481 if (r1 >= nregs)
9482 err += efunc(pc, "invalid register %u\n", r1);
9483 if (r2 >= nregs)
9484 err += efunc(pc, "invalid register %u\n", r2);
9485 if (rd >= nregs)
9486 err += efunc(pc, "invalid register %u\n", rd);
9487 if (rd == 0)
9488 err += efunc(pc, "cannot write to %r0\n");
9489 break;
9490 case DIF_OP_NOT:
9491 case DIF_OP_MOV:
9492 case DIF_OP_ALLOCS:
9493 if (r1 >= nregs)
9494 err += efunc(pc, "invalid register %u\n", r1);
9495 if (r2 != 0)
9496 err += efunc(pc, "non-zero reserved bits\n");
9497 if (rd >= nregs)
9498 err += efunc(pc, "invalid register %u\n", rd);
9499 if (rd == 0)
9500 err += efunc(pc, "cannot write to %r0\n");
9501 break;
9502 case DIF_OP_LDSB:
9503 case DIF_OP_LDSH:
9504 case DIF_OP_LDSW:
9505 case DIF_OP_LDUB:
9506 case DIF_OP_LDUH:
9507 case DIF_OP_LDUW:
9508 case DIF_OP_LDX:
9509 if (r1 >= nregs)
9510 err += efunc(pc, "invalid register %u\n", r1);
9511 if (r2 != 0)
9512 err += efunc(pc, "non-zero reserved bits\n");
9513 if (rd >= nregs)
9514 err += efunc(pc, "invalid register %u\n", rd);
9515 if (rd == 0)
9516 err += efunc(pc, "cannot write to %r0\n");
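/*
 * If the consumer is not allowed to look at kernel memory
 * unchecked, rewrite the plain load into its access-checked
 * RLD* counterpart so that the load is validated at run time.
 */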
9517 if (kcheckload)
9518 dp->dtdo_buf[pc] = DIF_INSTR_LOAD(op +
9519 DIF_OP_RLDSB - DIF_OP_LDSB, r1, rd);
9520 break;
9521 case DIF_OP_RLDSB:
9522 case DIF_OP_RLDSH:
9523 case DIF_OP_RLDSW:
9524 case DIF_OP_RLDUB:
9525 case DIF_OP_RLDUH:
9526 case DIF_OP_RLDUW:
9527 case DIF_OP_RLDX:
9528 if (r1 >= nregs)
9529 err += efunc(pc, "invalid register %u\n", r1);
9530 if (r2 != 0)
9531 err += efunc(pc, "non-zero reserved bits\n");
9532 if (rd >= nregs)
9533 err += efunc(pc, "invalid register %u\n", rd);
9534 if (rd == 0)
9535 err += efunc(pc, "cannot write to %r0\n");
9536 break;
9537 case DIF_OP_ULDSB:
9538 case DIF_OP_ULDSH:
9539 case DIF_OP_ULDSW:
9540 case DIF_OP_ULDUB:
9541 case DIF_OP_ULDUH:
9542 case DIF_OP_ULDUW:
9543 case DIF_OP_ULDX:
9544 if (r1 >= nregs)
9545 err += efunc(pc, "invalid register %u\n", r1);
9546 if (r2 != 0)
9547 err += efunc(pc, "non-zero reserved bits\n");
9548 if (rd >= nregs)
9549 err += efunc(pc, "invalid register %u\n", rd);
9550 if (rd == 0)
9551 err += efunc(pc, "cannot write to %r0\n");
9552 break;
9553 case DIF_OP_STB:
9554 case DIF_OP_STH:
9555 case DIF_OP_STW:
9556 case DIF_OP_STX:
9557 if (r1 >= nregs)
9558 err += efunc(pc, "invalid register %u\n", r1);
9559 if (r2 != 0)
9560 err += efunc(pc, "non-zero reserved bits\n");
9561 if (rd >= nregs)
9562 err += efunc(pc, "invalid register %u\n", rd);
9563 if (rd == 0)
9564 err += efunc(pc, "cannot write to 0 address\n");
9565 break;
9566 case DIF_OP_CMP:
9567 case DIF_OP_SCMP:
9568 if (r1 >= nregs)
9569 err += efunc(pc, "invalid register %u\n", r1);
9570 if (r2 >= nregs)
9571 err += efunc(pc, "invalid register %u\n", r2);
9572 if (rd != 0)
9573 err += efunc(pc, "non-zero reserved bits\n");
9574 break;
9575 case DIF_OP_TST:
9576 if (r1 >= nregs)
9577 err += efunc(pc, "invalid register %u\n", r1);
9578 if (r2 != 0 || rd != 0)
9579 err += efunc(pc, "non-zero reserved bits\n");
9580 break;
9581 case DIF_OP_BA:
9582 case DIF_OP_BE:
9583 case DIF_OP_BNE:
9584 case DIF_OP_BG:
9585 case DIF_OP_BGU:
9586 case DIF_OP_BGE:
9587 case DIF_OP_BGEU:
9588 case DIF_OP_BL:
9589 case DIF_OP_BLU:
9590 case DIF_OP_BLE:
9591 case DIF_OP_BLEU:
9592 if (label >= dp->dtdo_len) {
9593 err += efunc(pc, "invalid branch target %u\n",
9594 label);
9595 }
9596 if (label <= pc) {
9597 err += efunc(pc, "backward branch to %u\n",
9598 label);
9599 }
9600 break;
9601 case DIF_OP_RET:
9602 if (r1 != 0 || r2 != 0)
9603 err += efunc(pc, "non-zero reserved bits\n");
9604 if (rd >= nregs)
9605 err += efunc(pc, "invalid register %u\n", rd);
9606 break;
9607 case DIF_OP_NOP:
9608 case DIF_OP_POPTS:
9609 case DIF_OP_FLUSHTS:
9610 if (r1 != 0 || r2 != 0 || rd != 0)
9611 err += efunc(pc, "non-zero reserved bits\n");
9612 break;
9613 case DIF_OP_SETX:
9614 if (DIF_INSTR_INTEGER(instr) >= dp->dtdo_intlen) {
9615 err += efunc(pc, "invalid integer ref %u\n",
9616 DIF_INSTR_INTEGER(instr));
9617 }
9618 if (rd >= nregs)
9619 err += efunc(pc, "invalid register %u\n", rd);
9620 if (rd == 0)
9621 err += efunc(pc, "cannot write to %r0\n");
9622 break;
9623 case DIF_OP_SETS:
9624 if (DIF_INSTR_STRING(instr) >= dp->dtdo_strlen) {
9625 err += efunc(pc, "invalid string ref %u\n",
9626 DIF_INSTR_STRING(instr));
9627 }
9628 if (rd >= nregs)
9629 err += efunc(pc, "invalid register %u\n", rd);
9630 if (rd == 0)
9631 err += efunc(pc, "cannot write to %r0\n");
9632 break;
9633 case DIF_OP_LDGA:
9634 case DIF_OP_LDTA:
9635 if (r1 > DIF_VAR_ARRAY_MAX)
9636 err += efunc(pc, "invalid array %u\n", r1);
9637 if (r2 >= nregs)
9638 err += efunc(pc, "invalid register %u\n", r2);
9639 if (rd >= nregs)
9640 err += efunc(pc, "invalid register %u\n", rd);
9641 if (rd == 0)
9642 err += efunc(pc, "cannot write to %r0\n");
9643 break;
9644 case DIF_OP_LDGS:
9645 case DIF_OP_LDTS:
9646 case DIF_OP_LDLS:
9647 case DIF_OP_LDGAA:
9648 case DIF_OP_LDTAA:
9649 if (v < DIF_VAR_OTHER_MIN || v > DIF_VAR_OTHER_MAX)
9650 err += efunc(pc, "invalid variable %u\n", v);
9651 if (rd >= nregs)
9652 err += efunc(pc, "invalid register %u\n", rd);
9653 if (rd == 0)
9654 err += efunc(pc, "cannot write to %r0\n");
9655 break;
9656 case DIF_OP_STGS:
9657 case DIF_OP_STTS:
9658 case DIF_OP_STLS:
9659 case DIF_OP_STGAA:
9660 case DIF_OP_STTAA:
9661 if (v < DIF_VAR_OTHER_UBASE || v > DIF_VAR_OTHER_MAX)
9662 err += efunc(pc, "invalid variable %u\n", v);
9663 if (rs >= nregs)
9664 err += efunc(pc, "invalid register %u\n", rs);
9665 break;
9666 case DIF_OP_CALL:
9667 if (subr > DIF_SUBR_MAX)
9668 err += efunc(pc, "invalid subr %u\n", subr);
9669 if (rd >= nregs)
9670 err += efunc(pc, "invalid register %u\n", rd);
9671 if (rd == 0)
9672 err += efunc(pc, "cannot write to %r0\n");
9673
9674 if (subr == DIF_SUBR_COPYOUT ||
9675 subr == DIF_SUBR_COPYOUTSTR) {
9676 dp->dtdo_destructive = 1;
9677 }
9678
9679 if (subr == DIF_SUBR_GETF) {
9680 /*
9681 * If we have a getf() we need to record that
9682 * in our state. Note that our state can be
9683 * NULL if this is a helper -- but in that
9684 * case, the call to getf() is itself illegal,
9685 * and will be caught (slightly later) when
9686 * the helper is validated.
9687 */
9688 if (vstate->dtvs_state != NULL)
9689 vstate->dtvs_state->dts_getf++;
9690 }
9691
9692 break;
9693 case DIF_OP_PUSHTR:
9694 if (type != DIF_TYPE_STRING && type != DIF_TYPE_CTF)
9695 err += efunc(pc, "invalid ref type %u\n", type);
9696 if (r2 >= nregs)
9697 err += efunc(pc, "invalid register %u\n", r2);
9698 if (rs >= nregs)
9699 err += efunc(pc, "invalid register %u\n", rs);
9700 break;
9701 case DIF_OP_PUSHTV:
9702 if (type != DIF_TYPE_CTF)
9703 err += efunc(pc, "invalid val type %u\n", type);
9704 if (r2 >= nregs)
9705 err += efunc(pc, "invalid register %u\n", r2);
9706 if (rs >= nregs)
9707 err += efunc(pc, "invalid register %u\n", rs);
9708 break;
9709 default:
9710 err += efunc(pc, "invalid opcode %u\n",
9711 DIF_INSTR_OP(instr));
9712 }
9713 }
9714
9715 if (dp->dtdo_len != 0 &&
9716 DIF_INSTR_OP(dp->dtdo_buf[dp->dtdo_len - 1]) != DIF_OP_RET) {
9717 err += efunc(dp->dtdo_len - 1,
9718 "expected 'ret' as last DIF instruction\n");
9719 }
9720
8842 if (!(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF)) {
9721 if (!(dp->dtdo_rtype.dtdt_flags & (DIF_TF_BYREF | DIF_TF_BYUREF))) {
9722 /*
9723 * If we're not returning by reference, the size must be either
9724 * 0 or the size of one of the base types.
9725 */
9726 switch (dp->dtdo_rtype.dtdt_size) {
9727 case 0:
9728 case sizeof (uint8_t):
9729 case sizeof (uint16_t):
9730 case sizeof (uint32_t):
9731 case sizeof (uint64_t):
9732 break;
9733
9734 default:
9735 err += efunc(dp->dtdo_len - 1, "bad return size\n");
9736 }
9737 }
9738
9739 for (i = 0; i < dp->dtdo_varlen && err == 0; i++) {
9740 dtrace_difv_t *v = &dp->dtdo_vartab[i], *existing = NULL;
9741 dtrace_diftype_t *vt, *et;
9742 uint_t id, ndx;
9743
9744 if (v->dtdv_scope != DIFV_SCOPE_GLOBAL &&
9745 v->dtdv_scope != DIFV_SCOPE_THREAD &&
9746 v->dtdv_scope != DIFV_SCOPE_LOCAL) {
9747 err += efunc(i, "unrecognized variable scope %d\n",
9748 v->dtdv_scope);
9749 break;
9750 }
9751
9752 if (v->dtdv_kind != DIFV_KIND_ARRAY &&
9753 v->dtdv_kind != DIFV_KIND_SCALAR) {
9754 err += efunc(i, "unrecognized variable type %d\n",
9755 v->dtdv_kind);
9756 break;
9757 }
9758
9759 if ((id = v->dtdv_id) > DIF_VARIABLE_MAX) {
9760 err += efunc(i, "%d exceeds variable id limit\n", id);
9761 break;
9762 }
9763
9764 if (id < DIF_VAR_OTHER_UBASE)
9765 continue;
9766
9767 /*
9768 * For user-defined variables, we need to check that this
9769 * definition is identical to any previous definition that we
9770 * encountered.
9771 */
9772 ndx = id - DIF_VAR_OTHER_UBASE;
9773
9774 switch (v->dtdv_scope) {
9775 case DIFV_SCOPE_GLOBAL:
9776 if (ndx < vstate->dtvs_nglobals) {
9777 dtrace_statvar_t *svar;
9778
9779 if ((svar = vstate->dtvs_globals[ndx]) != NULL)
9780 existing = &svar->dtsv_var;
9781 }
9782
9783 break;
9784
9785 case DIFV_SCOPE_THREAD:
9786 if (ndx < vstate->dtvs_ntlocals)
9787 existing = &vstate->dtvs_tlocals[ndx];
9788 break;
9789
9790 case DIFV_SCOPE_LOCAL:
9791 if (ndx < vstate->dtvs_nlocals) {
9792 dtrace_statvar_t *svar;
9793
9794 if ((svar = vstate->dtvs_locals[ndx]) != NULL)
9795 existing = &svar->dtsv_var;
9796 }
9797
9798 break;
9799 }
9800
9801 vt = &v->dtdv_type;
9802
9803 if (vt->dtdt_flags & DIF_TF_BYREF) {
9804 if (vt->dtdt_size == 0) {
9805 err += efunc(i, "zero-sized variable\n");
9806 break;
9807 }
9808
9809 if (v->dtdv_scope == DIFV_SCOPE_GLOBAL &&
9810 vt->dtdt_size > dtrace_global_maxsize) {
9811 err += efunc(i, "oversized by-ref global\n");
9812 break;
9813 }
9814 }
9815
9816 if (existing == NULL || existing->dtdv_id == 0)
9817 continue;
9818
9819 ASSERT(existing->dtdv_id == v->dtdv_id);
9820 ASSERT(existing->dtdv_scope == v->dtdv_scope);
9821
9822 if (existing->dtdv_kind != v->dtdv_kind)
9823 err += efunc(i, "%d changed variable kind\n", id);
9824
9825 et = &existing->dtdv_type;
9826
9827 if (vt->dtdt_flags != et->dtdt_flags) {
9828 err += efunc(i, "%d changed variable type flags\n", id);
9829 break;
9830 }
9831
9832 if (vt->dtdt_size != 0 && vt->dtdt_size != et->dtdt_size) {
9833 err += efunc(i, "%d changed variable type size\n", id);
9834 break;
9835 }
9836 }
9837
9838 return (err);
9839}
9840
9841/*
9842 * Validate a DTrace DIF object that is to be used as a helper. Helpers
9843 * are much more constrained than normal DIFOs. Specifically, they may
9844 * not:
9845 *
9846 * 1. Make calls to subroutines other than copyin(), copyinstr() or
9847 * miscellaneous string routines
9848 * 2. Access DTrace variables other than the args[] array, and the
9849 * curthread, pid, ppid, tid, execargs, execname, zonename, uid and gid variables.
9850 * 3. Have thread-local variables.
9851 * 4. Have dynamic variables.
9852 */
9853static int
9854dtrace_difo_validate_helper(dtrace_difo_t *dp)
9855{
9856 int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
9857 int err = 0;
9858 uint_t pc;
9859
9860 for (pc = 0; pc < dp->dtdo_len; pc++) {
9861 dif_instr_t instr = dp->dtdo_buf[pc];
9862
9863 uint_t v = DIF_INSTR_VAR(instr);
9864 uint_t subr = DIF_INSTR_SUBR(instr);
9865 uint_t op = DIF_INSTR_OP(instr);
9866
9867 switch (op) {
9868 case DIF_OP_OR:
9869 case DIF_OP_XOR:
9870 case DIF_OP_AND:
9871 case DIF_OP_SLL:
9872 case DIF_OP_SRL:
9873 case DIF_OP_SRA:
9874 case DIF_OP_SUB:
9875 case DIF_OP_ADD:
9876 case DIF_OP_MUL:
9877 case DIF_OP_SDIV:
9878 case DIF_OP_UDIV:
9879 case DIF_OP_SREM:
9880 case DIF_OP_UREM:
9881 case DIF_OP_COPYS:
9882 case DIF_OP_NOT:
9883 case DIF_OP_MOV:
9884 case DIF_OP_RLDSB:
9885 case DIF_OP_RLDSH:
9886 case DIF_OP_RLDSW:
9887 case DIF_OP_RLDUB:
9888 case DIF_OP_RLDUH:
9889 case DIF_OP_RLDUW:
9890 case DIF_OP_RLDX:
9891 case DIF_OP_ULDSB:
9892 case DIF_OP_ULDSH:
9893 case DIF_OP_ULDSW:
9894 case DIF_OP_ULDUB:
9895 case DIF_OP_ULDUH:
9896 case DIF_OP_ULDUW:
9897 case DIF_OP_ULDX:
9898 case DIF_OP_STB:
9899 case DIF_OP_STH:
9900 case DIF_OP_STW:
9901 case DIF_OP_STX:
9902 case DIF_OP_ALLOCS:
9903 case DIF_OP_CMP:
9904 case DIF_OP_SCMP:
9905 case DIF_OP_TST:
9906 case DIF_OP_BA:
9907 case DIF_OP_BE:
9908 case DIF_OP_BNE:
9909 case DIF_OP_BG:
9910 case DIF_OP_BGU:
9911 case DIF_OP_BGE:
9912 case DIF_OP_BGEU:
9913 case DIF_OP_BL:
9914 case DIF_OP_BLU:
9915 case DIF_OP_BLE:
9916 case DIF_OP_BLEU:
9917 case DIF_OP_RET:
9918 case DIF_OP_NOP:
9919 case DIF_OP_POPTS:
9920 case DIF_OP_FLUSHTS:
9921 case DIF_OP_SETX:
9922 case DIF_OP_SETS:
9923 case DIF_OP_LDGA:
9924 case DIF_OP_LDLS:
9925 case DIF_OP_STGS:
9926 case DIF_OP_STLS:
9927 case DIF_OP_PUSHTR:
9928 case DIF_OP_PUSHTV:
9929 break;
9930
9931 case DIF_OP_LDGS:
9932 if (v >= DIF_VAR_OTHER_UBASE)
9933 break;
9934
9935 if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9)
9936 break;
9937
9938 if (v == DIF_VAR_CURTHREAD || v == DIF_VAR_PID ||
9939 v == DIF_VAR_PPID || v == DIF_VAR_TID ||
9940 v == DIF_VAR_EXECARGS ||
9941 v == DIF_VAR_EXECNAME || v == DIF_VAR_ZONENAME ||
9942 v == DIF_VAR_UID || v == DIF_VAR_GID)
9943 break;
9944
9945 err += efunc(pc, "illegal variable %u\n", v);
9946 break;
9947
9948 case DIF_OP_LDTA:
9949 case DIF_OP_LDTS:
9950 case DIF_OP_LDGAA:
9951 case DIF_OP_LDTAA:
9952 err += efunc(pc, "illegal dynamic variable load\n");
9953 break;
9954
9955 case DIF_OP_STTS:
9956 case DIF_OP_STGAA:
9957 case DIF_OP_STTAA:
9958 err += efunc(pc, "illegal dynamic variable store\n");
9959 break;
9960
9961 case DIF_OP_CALL:
9962 if (subr == DIF_SUBR_ALLOCA ||
9963 subr == DIF_SUBR_BCOPY ||
9964 subr == DIF_SUBR_COPYIN ||
9965 subr == DIF_SUBR_COPYINTO ||
9966 subr == DIF_SUBR_COPYINSTR ||
9967 subr == DIF_SUBR_INDEX ||
9968 subr == DIF_SUBR_INET_NTOA ||
9969 subr == DIF_SUBR_INET_NTOA6 ||
9970 subr == DIF_SUBR_INET_NTOP ||
9971 subr == DIF_SUBR_JSON ||
9972 subr == DIF_SUBR_LLTOSTR ||
9973 subr == DIF_SUBR_STRTOLL ||
9093 subr == DIF_SUBR_RINDEX ||
9094 subr == DIF_SUBR_STRCHR ||
9095 subr == DIF_SUBR_STRJOIN ||
9096 subr == DIF_SUBR_STRRCHR ||
9097 subr == DIF_SUBR_STRSTR ||
9098 subr == DIF_SUBR_HTONS ||
9099 subr == DIF_SUBR_HTONL ||
9100 subr == DIF_SUBR_HTONLL ||
9101 subr == DIF_SUBR_NTOHS ||
9102 subr == DIF_SUBR_NTOHL ||
9103 subr == DIF_SUBR_NTOHLL ||
9104 subr == DIF_SUBR_MEMREF ||
9105 subr == DIF_SUBR_TYPEREF)
9106 break;
9107
9108 err += efunc(pc, "invalid subr %u\n", subr);
9109 break;
9110
9111 default:
9112 err += efunc(pc, "invalid opcode %u\n",
9113 DIF_INSTR_OP(instr));
9114 }
9115 }
9116
9117 return (err);
9118}
9119
9120/*
9121 * Returns 1 if the expression in the DIF object can be cached on a per-thread
9122 * basis; 0 if not.
9123 */
9124static int
9125dtrace_difo_cacheable(dtrace_difo_t *dp)
9126{
9127 int i;
9128
9129 if (dp == NULL)
9130 return (0);
9131
9132 for (i = 0; i < dp->dtdo_varlen; i++) {
9133 dtrace_difv_t *v = &dp->dtdo_vartab[i];
9134
9135 if (v->dtdv_scope != DIFV_SCOPE_GLOBAL)
9136 continue;
9137
9138 switch (v->dtdv_id) {
9139 case DIF_VAR_CURTHREAD:
9140 case DIF_VAR_PID:
9141 case DIF_VAR_TID:
9142 case DIF_VAR_EXECARGS:
9143 case DIF_VAR_EXECNAME:
9144 case DIF_VAR_ZONENAME:
9145 break;
9146
9147 default:
9148 return (0);
9149 }
9150 }
9151
9152 /*
9153 * This DIF object may be cacheable. Now we need to look for any
9154 * array loading instructions, any memory loading instructions, or
9155 * any stores to thread-local variables.
9156 */
9157 for (i = 0; i < dp->dtdo_len; i++) {
9158 uint_t op = DIF_INSTR_OP(dp->dtdo_buf[i]);
9159
9160 if ((op >= DIF_OP_LDSB && op <= DIF_OP_LDX) ||
9161 (op >= DIF_OP_ULDSB && op <= DIF_OP_ULDX) ||
9162 (op >= DIF_OP_RLDSB && op <= DIF_OP_RLDX) ||
9163 op == DIF_OP_LDGA || op == DIF_OP_STTS)
9164 return (0);
9165 }
9166
9167 return (1);
9168}
9169
9170static void
9171dtrace_difo_hold(dtrace_difo_t *dp)
9172{
9173 int i;
9174
9175 ASSERT(MUTEX_HELD(&dtrace_lock));
9176
9177 dp->dtdo_refcnt++;
9178 ASSERT(dp->dtdo_refcnt != 0);
9179
9180 /*
9181 * We need to check this DIF object for references to the variable
9182 * DIF_VAR_VTIMESTAMP.
9183 */
9184 for (i = 0; i < dp->dtdo_varlen; i++) {
9185 dtrace_difv_t *v = &dp->dtdo_vartab[i];
9186
9187 if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
9188 continue;
9189
9190 if (dtrace_vtime_references++ == 0)
9191 dtrace_vtime_enable();
9192 }
9193}
9194
9195/*
9196 * This routine calculates the dynamic variable chunksize for a given DIF
9197 * object. The calculation is not fool-proof, and can probably be tricked by
9198 * malicious DIF -- but it works for all compiler-generated DIF. Because this
9199 * calculation is likely imperfect, dtrace_dynvar() is able to gracefully fail
9200 * if a dynamic variable size exceeds the chunksize.
9201 */
9202static void
9203dtrace_difo_chunksize(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
9204{
9205 uint64_t sval = 0;
9206 dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
9207 const dif_instr_t *text = dp->dtdo_buf;
9208 uint_t pc, srd = 0;
9209 uint_t ttop = 0;
9210 size_t size, ksize;
9211 uint_t id, i;
9212
9213 for (pc = 0; pc < dp->dtdo_len; pc++) {
9214 dif_instr_t instr = text[pc];
9215 uint_t op = DIF_INSTR_OP(instr);
9216 uint_t rd = DIF_INSTR_RD(instr);
9217 uint_t r1 = DIF_INSTR_R1(instr);
9218 uint_t nkeys = 0;
9219 uchar_t scope = 0;
9220
9221 dtrace_key_t *key = tupregs;
9222
9223 switch (op) {
9224 case DIF_OP_SETX:
9225 sval = dp->dtdo_inttab[DIF_INSTR_INTEGER(instr)];
9226 srd = rd;
9227 continue;
9228
9229 case DIF_OP_STTS:
9230 key = &tupregs[DIF_DTR_NREGS];
9231 key[0].dttk_size = 0;
9232 key[1].dttk_size = 0;
9233 nkeys = 2;
9234 scope = DIFV_SCOPE_THREAD;
9235 break;
9236
9237 case DIF_OP_STGAA:
9238 case DIF_OP_STTAA:
9239 nkeys = ttop;
9240
9241 if (DIF_INSTR_OP(instr) == DIF_OP_STTAA)
9242 key[nkeys++].dttk_size = 0;
9243
9244 key[nkeys++].dttk_size = 0;
9245
9246 if (op == DIF_OP_STTAA) {
9247 scope = DIFV_SCOPE_THREAD;
9248 } else {
9249 scope = DIFV_SCOPE_GLOBAL;
9250 }
9251
9252 break;
9253
9254 case DIF_OP_PUSHTR:
9255 if (ttop == DIF_DTR_NREGS)
9256 return;
9257
9258 if ((srd == 0 || sval == 0) && r1 == DIF_TYPE_STRING) {
9259 /*
9260 * If the register for the size of the "pushtr"
9261 * is %r0 (or the value is 0) and the type is
9262 * a string, we'll use the system-wide default
9263 * string size.
9264 */
9265 tupregs[ttop++].dttk_size =
9266 dtrace_strsize_default;
9267 } else {
9268 if (srd == 0)
9269 return;
9270
9271 tupregs[ttop++].dttk_size = sval;
9272 }
9273
9274 break;
9275
9276 case DIF_OP_PUSHTV:
9277 if (ttop == DIF_DTR_NREGS)
9278 return;
9279
9280 tupregs[ttop++].dttk_size = 0;
9281 break;
9282
9283 case DIF_OP_FLUSHTS:
9284 ttop = 0;
9285 break;
9286
9287 case DIF_OP_POPTS:
9288 if (ttop != 0)
9289 ttop--;
9290 break;
9291 }
9292
9293 sval = 0;
9294 srd = 0;
9295
9296 if (nkeys == 0)
9297 continue;
9298
9299 /*
9300 * We have a dynamic variable allocation; calculate its size.
9301 */
9302 for (ksize = 0, i = 0; i < nkeys; i++)
9303 ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
9304
9305 size = sizeof (dtrace_dynvar_t);
9306 size += sizeof (dtrace_key_t) * (nkeys - 1);
9307 size += ksize;
9308
9309 /*
9310 * Now we need to determine the size of the stored data.
9311 */
9312 id = DIF_INSTR_VAR(instr);
9313
9314 for (i = 0; i < dp->dtdo_varlen; i++) {
9315 dtrace_difv_t *v = &dp->dtdo_vartab[i];
9316
9317 if (v->dtdv_id == id && v->dtdv_scope == scope) {
9318 size += v->dtdv_type.dtdt_size;
9319 break;
9320 }
9321 }
9322
9323 if (i == dp->dtdo_varlen)
9324 return;
9325
9326 /*
9327 * We have the size. If this is larger than the chunk size
9328 * for our dynamic variable state, reset the chunk size.
9329 */
9330 size = P2ROUNDUP(size, sizeof (uint64_t));
9331
9332 if (size > vstate->dtvs_dynvars.dtds_chunksize)
9333 vstate->dtvs_dynvars.dtds_chunksize = size;
9334 }
9335}
9336
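/*
 * Initialize a DIFO against the given variable state: grow the
 * per-scope variable arrays to cover every statically-allocated
 * variable that the DIFO references, bump their reference counts,
 * update the dynamic variable chunksize, and take a hold on the DIFO.
 */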
9337static void
9338dtrace_difo_init(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
9339{
9340 int i, oldsvars, osz, nsz, otlocals, ntlocals;
9341 uint_t id;
9342
9343 ASSERT(MUTEX_HELD(&dtrace_lock));
9344 ASSERT(dp->dtdo_buf != NULL && dp->dtdo_len != 0);
9345
9346 for (i = 0; i < dp->dtdo_varlen; i++) {
9347 dtrace_difv_t *v = &dp->dtdo_vartab[i];
9348 dtrace_statvar_t *svar, ***svarp = NULL;
9349 size_t dsize = 0;
9350 uint8_t scope = v->dtdv_scope;
9351 int *np = NULL;
9352
9353 if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
9354 continue;
9355
9356 id -= DIF_VAR_OTHER_UBASE;
9357
9358 switch (scope) {
9359 case DIFV_SCOPE_THREAD:
9360 while (id >= (otlocals = vstate->dtvs_ntlocals)) {
9361 dtrace_difv_t *tlocals;
9362
9363 if ((ntlocals = (otlocals << 1)) == 0)
9364 ntlocals = 1;
9365
9366 osz = otlocals * sizeof (dtrace_difv_t);
9367 nsz = ntlocals * sizeof (dtrace_difv_t);
9368
9369 tlocals = kmem_zalloc(nsz, KM_SLEEP);
9370
9371 if (osz != 0) {
9372 bcopy(vstate->dtvs_tlocals,
9373 tlocals, osz);
9374 kmem_free(vstate->dtvs_tlocals, osz);
9375 }
9376
9377 vstate->dtvs_tlocals = tlocals;
9378 vstate->dtvs_ntlocals = ntlocals;
9379 }
9380
9381 vstate->dtvs_tlocals[id] = *v;
9382 continue;
9383
9384 case DIFV_SCOPE_LOCAL:
9385 np = &vstate->dtvs_nlocals;
9386 svarp = &vstate->dtvs_locals;
9387
9388 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
9389 dsize = NCPU * (v->dtdv_type.dtdt_size +
9390 sizeof (uint64_t));
9391 else
9392 dsize = NCPU * sizeof (uint64_t);
9393
9394 break;
9395
9396 case DIFV_SCOPE_GLOBAL:
9397 np = &vstate->dtvs_nglobals;
9398 svarp = &vstate->dtvs_globals;
9399
9400 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
9401 dsize = v->dtdv_type.dtdt_size +
9402 sizeof (uint64_t);
9403
9404 break;
9405
9406 default:
9407 ASSERT(0);
9408 }
9409
9410 while (id >= (oldsvars = *np)) {
9411 dtrace_statvar_t **statics;
9412 int newsvars, oldsize, newsize;
9413
9414 if ((newsvars = (oldsvars << 1)) == 0)
9415 newsvars = 1;
9416
9417 oldsize = oldsvars * sizeof (dtrace_statvar_t *);
9418 newsize = newsvars * sizeof (dtrace_statvar_t *);
9419
9420 statics = kmem_zalloc(newsize, KM_SLEEP);
9421
9422 if (oldsize != 0) {
9423 bcopy(*svarp, statics, oldsize);
9424 kmem_free(*svarp, oldsize);
9425 }
9426
9427 *svarp = statics;
9428 *np = newsvars;
9429 }
9430
9431 if ((svar = (*svarp)[id]) == NULL) {
9432 svar = kmem_zalloc(sizeof (dtrace_statvar_t), KM_SLEEP);
9433 svar->dtsv_var = *v;
9434
9435 if ((svar->dtsv_size = dsize) != 0) {
9436 svar->dtsv_data = (uint64_t)(uintptr_t)
9437 kmem_zalloc(dsize, KM_SLEEP);
9438 }
9439
9440 (*svarp)[id] = svar;
9441 }
9442
9443 svar->dtsv_refcnt++;
9444 }
9445
9446 dtrace_difo_chunksize(dp, vstate);
9447 dtrace_difo_hold(dp);
9448}
9449
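/*
 * Make a deep copy of the given DIFO -- instruction buffer, integer,
 * string and variable tables -- and initialize the copy against the
 * given variable state.
 */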
9450static dtrace_difo_t *
9451dtrace_difo_duplicate(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
9452{
9453 dtrace_difo_t *new;
9454 size_t sz;
9455
9456 ASSERT(dp->dtdo_buf != NULL);
9457 ASSERT(dp->dtdo_refcnt != 0);
9458
9459 new = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
9460
9461 ASSERT(dp->dtdo_buf != NULL);
9462 sz = dp->dtdo_len * sizeof (dif_instr_t);
9463 new->dtdo_buf = kmem_alloc(sz, KM_SLEEP);
9464 bcopy(dp->dtdo_buf, new->dtdo_buf, sz);
9465 new->dtdo_len = dp->dtdo_len;
9466
9467 if (dp->dtdo_strtab != NULL) {
9468 ASSERT(dp->dtdo_strlen != 0);
9469 new->dtdo_strtab = kmem_alloc(dp->dtdo_strlen, KM_SLEEP);
9470 bcopy(dp->dtdo_strtab, new->dtdo_strtab, dp->dtdo_strlen);
9471 new->dtdo_strlen = dp->dtdo_strlen;
9472 }
9473
9474 if (dp->dtdo_inttab != NULL) {
9475 ASSERT(dp->dtdo_intlen != 0);
9476 sz = dp->dtdo_intlen * sizeof (uint64_t);
9477 new->dtdo_inttab = kmem_alloc(sz, KM_SLEEP);
9478 bcopy(dp->dtdo_inttab, new->dtdo_inttab, sz);
9479 new->dtdo_intlen = dp->dtdo_intlen;
9480 }
9481
9482 if (dp->dtdo_vartab != NULL) {
9483 ASSERT(dp->dtdo_varlen != 0);
9484 sz = dp->dtdo_varlen * sizeof (dtrace_difv_t);
9485 new->dtdo_vartab = kmem_alloc(sz, KM_SLEEP);
9486 bcopy(dp->dtdo_vartab, new->dtdo_vartab, sz);
9487 new->dtdo_varlen = dp->dtdo_varlen;
9488 }
9489
9490 dtrace_difo_init(new, vstate);
9491 return (new);
9492}
9493
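/*
 * Destroy a DIFO whose reference count has dropped to zero, releasing
 * the holds on any statically-allocated variables and freeing the
 * constituent tables.
 */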
9494static void
9495dtrace_difo_destroy(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
9496{
9497 int i;
9498
9499 ASSERT(dp->dtdo_refcnt == 0);
9500
9501 for (i = 0; i < dp->dtdo_varlen; i++) {
9502 dtrace_difv_t *v = &dp->dtdo_vartab[i];
9503 dtrace_statvar_t *svar, **svarp = NULL;
9504 uint_t id;
9505 uint8_t scope = v->dtdv_scope;
9506 int *np = NULL;
9507
9508 switch (scope) {
9509 case DIFV_SCOPE_THREAD:
9510 continue;
9511
9512 case DIFV_SCOPE_LOCAL:
9513 np = &vstate->dtvs_nlocals;
9514 svarp = vstate->dtvs_locals;
9515 break;
9516
9517 case DIFV_SCOPE_GLOBAL:
9518 np = &vstate->dtvs_nglobals;
9519 svarp = vstate->dtvs_globals;
9520 break;
9521
9522 default:
9523 ASSERT(0);
9524 }
9525
9526 if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
9527 continue;
9528
9529 id -= DIF_VAR_OTHER_UBASE;
9530 ASSERT(id < *np);
9531
9532 svar = svarp[id];
9533 ASSERT(svar != NULL);
9534 ASSERT(svar->dtsv_refcnt > 0);
9535
9536 if (--svar->dtsv_refcnt > 0)
9537 continue;
9538
9539 if (svar->dtsv_size != 0) {
9540 ASSERT(svar->dtsv_data != 0);
9541 kmem_free((void *)(uintptr_t)svar->dtsv_data,
9542 svar->dtsv_size);
9543 }
9544
9545 kmem_free(svar, sizeof (dtrace_statvar_t));
9546 svarp[id] = NULL;
9547 }
9548
9549 if (dp->dtdo_buf != NULL)
9550 kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
9551 if (dp->dtdo_inttab != NULL)
9552 kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
9553 if (dp->dtdo_strtab != NULL)
9554 kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
9555 if (dp->dtdo_vartab != NULL)
9556 kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
9557
9558 kmem_free(dp, sizeof (dtrace_difo_t));
9559}
9560
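/*
 * Release a hold on the given DIFO, dropping any vtimestamp references
 * that it carries and destroying the DIFO if this was the last hold.
 */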
9561static void
9562dtrace_difo_release(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
9563{
9564 int i;
9565
9566 ASSERT(MUTEX_HELD(&dtrace_lock));
9567 ASSERT(dp->dtdo_refcnt != 0);
9568
9569 for (i = 0; i < dp->dtdo_varlen; i++) {
9570 dtrace_difv_t *v = &dp->dtdo_vartab[i];
9571
9572 if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
9573 continue;
9574
9575 ASSERT(dtrace_vtime_references > 0);
9576 if (--dtrace_vtime_references == 0)
9577 dtrace_vtime_disable();
9578 }
9579
9580 if (--dp->dtdo_refcnt == 0)
9581 dtrace_difo_destroy(dp, vstate);
9582}
9583
9584/*
9585 * DTrace Format Functions
9586 */
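/*
 * Stash a copy of the given format string and return its (1-based)
 * index in the consumer state's format array; 0 indicates failure.
 */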
9587static uint16_t
9588dtrace_format_add(dtrace_state_t *state, char *str)
9589{
9590 char *fmt, **new;
9591 uint16_t ndx, len = strlen(str) + 1;
9592
9593 fmt = kmem_zalloc(len, KM_SLEEP);
9594 bcopy(str, fmt, len);
9595
9596 for (ndx = 0; ndx < state->dts_nformats; ndx++) {
9597 if (state->dts_formats[ndx] == NULL) {
9598 state->dts_formats[ndx] = fmt;
9599 return (ndx + 1);
9600 }
9601 }
9602
9603 if (state->dts_nformats == USHRT_MAX) {
9604 /*
9605 * This is only likely if a denial-of-service attack is being
9606 * attempted. As such, it's okay to fail silently here.
9607 */
9608 kmem_free(fmt, len);
9609 return (0);
9610 }
9611
9612 /*
9613 * For simplicity, we always resize the formats array to be exactly the
9614 * number of formats.
9615 */
9616 ndx = state->dts_nformats++;
9617 new = kmem_alloc((ndx + 1) * sizeof (char *), KM_SLEEP);
9618
9619 if (state->dts_formats != NULL) {
9620 ASSERT(ndx != 0);
9621 bcopy(state->dts_formats, new, ndx * sizeof (char *));
9622 kmem_free(state->dts_formats, ndx * sizeof (char *));
9623 }
9624
9625 state->dts_formats = new;
9626 state->dts_formats[ndx] = fmt;
9627
9628 return (ndx + 1);
9629}
9630
9631static void
9632dtrace_format_remove(dtrace_state_t *state, uint16_t format)
9633{
9634 char *fmt;
9635
9636 ASSERT(state->dts_formats != NULL);
9637 ASSERT(format <= state->dts_nformats);
9638 ASSERT(state->dts_formats[format - 1] != NULL);
9639
9640 fmt = state->dts_formats[format - 1];
9641 kmem_free(fmt, strlen(fmt) + 1);
9642 state->dts_formats[format - 1] = NULL;
9643}
9644
9645static void
9646dtrace_format_destroy(dtrace_state_t *state)
9647{
9648 int i;
9649
9650 if (state->dts_nformats == 0) {
9651 ASSERT(state->dts_formats == NULL);
9652 return;
9653 }
9654
9655 ASSERT(state->dts_formats != NULL);
9656
9657 for (i = 0; i < state->dts_nformats; i++) {
9658 char *fmt = state->dts_formats[i];
9659
9660 if (fmt == NULL)
9661 continue;
9662
9663 kmem_free(fmt, strlen(fmt) + 1);
9664 }
9665
9666 kmem_free(state->dts_formats, state->dts_nformats * sizeof (char *));
9667 state->dts_nformats = 0;
9668 state->dts_formats = NULL;
9669}
9670
9671/*
9672 * DTrace Predicate Functions
9673 */
9674static dtrace_predicate_t *
9675dtrace_predicate_create(dtrace_difo_t *dp)
9676{
9677 dtrace_predicate_t *pred;
9678
9679 ASSERT(MUTEX_HELD(&dtrace_lock));
9680 ASSERT(dp->dtdo_refcnt != 0);
9681
9682 pred = kmem_zalloc(sizeof (dtrace_predicate_t), KM_SLEEP);
9683 pred->dtp_difo = dp;
9684 pred->dtp_refcnt = 1;
9685
9686 if (!dtrace_difo_cacheable(dp))
9687 return (pred);
9688
9689 if (dtrace_predcache_id == DTRACE_CACHEIDNONE) {
9690 /*
9691 * This is only theoretically possible -- we have had 2^32
9692 * cacheable predicates on this machine. We cannot allow any
9693 * more predicates to become cacheable: as unlikely as it is,
9694 * there may be a thread caching a (now stale) predicate cache
9695 * ID. (N.B.: the temptation is being successfully resisted to
9696 * have this cmn_err() "Holy shit -- we executed this code!")
9697 */
9698 return (pred);
9699 }
9700
9701 pred->dtp_cacheid = dtrace_predcache_id++;
9702
9703 return (pred);
9704}
9705
9706static void
9707dtrace_predicate_hold(dtrace_predicate_t *pred)
9708{
9709 ASSERT(MUTEX_HELD(&dtrace_lock));
9710 ASSERT(pred->dtp_difo != NULL && pred->dtp_difo->dtdo_refcnt != 0);
9711 ASSERT(pred->dtp_refcnt > 0);
9712
9713 pred->dtp_refcnt++;
9714}
9715
9716static void
9717dtrace_predicate_release(dtrace_predicate_t *pred, dtrace_vstate_t *vstate)
9718{
9719 dtrace_difo_t *dp = pred->dtp_difo;
9720
9721 ASSERT(MUTEX_HELD(&dtrace_lock));
9722 ASSERT(dp != NULL && dp->dtdo_refcnt != 0);
9723 ASSERT(pred->dtp_refcnt > 0);
9724
9725 if (--pred->dtp_refcnt == 0) {
9726 dtrace_difo_release(pred->dtp_difo, vstate);
9727 kmem_free(pred, sizeof (dtrace_predicate_t));
9728 }
9729}
9730
9731/*
9732 * DTrace Action Description Functions
9733 */
9734static dtrace_actdesc_t *
9735dtrace_actdesc_create(dtrace_actkind_t kind, uint32_t ntuple,
9736 uint64_t uarg, uint64_t arg)
9737{
9738 dtrace_actdesc_t *act;
9739
9740#if defined(sun)
9741 ASSERT(!DTRACEACT_ISPRINTFLIKE(kind) || (arg != NULL &&
9742 arg >= KERNELBASE) || (arg == NULL && kind == DTRACEACT_PRINTA));
9743#endif
9744
9745 act = kmem_zalloc(sizeof (dtrace_actdesc_t), KM_SLEEP);
9746 act->dtad_kind = kind;
9747 act->dtad_ntuple = ntuple;
9748 act->dtad_uarg = uarg;
9749 act->dtad_arg = arg;
9750 act->dtad_refcnt = 1;
9751
9752 return (act);
9753}
9754
9755static void
9756dtrace_actdesc_hold(dtrace_actdesc_t *act)
9757{
9758 ASSERT(act->dtad_refcnt >= 1);
9759 act->dtad_refcnt++;
9760}
9761
9762static void
9763dtrace_actdesc_release(dtrace_actdesc_t *act, dtrace_vstate_t *vstate)
9764{
9765 dtrace_actkind_t kind = act->dtad_kind;
9766 dtrace_difo_t *dp;
9767
9768 ASSERT(act->dtad_refcnt >= 1);
9769
9770 if (--act->dtad_refcnt != 0)
9771 return;
9772
9773 if ((dp = act->dtad_difo) != NULL)
9774 dtrace_difo_release(dp, vstate);
9775
9776 if (DTRACEACT_ISPRINTFLIKE(kind)) {
9777 char *str = (char *)(uintptr_t)act->dtad_arg;
9778
9779#if defined(sun)
9780 ASSERT((str != NULL && (uintptr_t)str >= KERNELBASE) ||
9781 (str == NULL && act->dtad_kind == DTRACEACT_PRINTA));
9782#endif
9783
9784 if (str != NULL)
9785 kmem_free(str, strlen(str) + 1);
9786 }
9787
9788 kmem_free(act, sizeof (dtrace_actdesc_t));
9789}
9790
9791/*
9792 * DTrace ECB Functions
9793 */
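/*
 * Allocate a new ECB for the given probe (which may be NULL), assign
 * it the next enabled-probe ID (EPID), and grow the state's dts_ecbs
 * array as needed.
 */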
9794static dtrace_ecb_t *
9795dtrace_ecb_add(dtrace_state_t *state, dtrace_probe_t *probe)
9796{
9797 dtrace_ecb_t *ecb;
9798 dtrace_epid_t epid;
9799
9800 ASSERT(MUTEX_HELD(&dtrace_lock));
9801
9802 ecb = kmem_zalloc(sizeof (dtrace_ecb_t), KM_SLEEP);
9803 ecb->dte_predicate = NULL;
9804 ecb->dte_probe = probe;
9805
9806 /*
9807 * The default size is the size of the default action: recording
9808 * the header.
9809 */
9810 ecb->dte_size = ecb->dte_needed = sizeof (dtrace_rechdr_t);
9811 ecb->dte_alignment = sizeof (dtrace_epid_t);
9812
9813 epid = state->dts_epid++;
9814
9815 if (epid - 1 >= state->dts_necbs) {
9816 dtrace_ecb_t **oecbs = state->dts_ecbs, **ecbs;
9817 int necbs = state->dts_necbs << 1;
9818
9819 ASSERT(epid == state->dts_necbs + 1);
9820
9821 if (necbs == 0) {
9822 ASSERT(oecbs == NULL);
9823 necbs = 1;
9824 }
9825
9826 ecbs = kmem_zalloc(necbs * sizeof (*ecbs), KM_SLEEP);
9827
9828 if (oecbs != NULL)
9829 bcopy(oecbs, ecbs, state->dts_necbs * sizeof (*ecbs));
9830
9831 dtrace_membar_producer();
9832 state->dts_ecbs = ecbs;
9833
9834 if (oecbs != NULL) {
9835 /*
9836 * If this state is active, we must dtrace_sync()
9837 * before we can free the old dts_ecbs array: we're
9838 * coming in hot, and there may be active ring
9839 * buffer processing (which indexes into the dts_ecbs
9840 * array) on another CPU.
9841 */
9842 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
9843 dtrace_sync();
9844
9845 kmem_free(oecbs, state->dts_necbs * sizeof (*ecbs));
9846 }
9847
9848 dtrace_membar_producer();
9849 state->dts_necbs = necbs;
9850 }
9851
9852 ecb->dte_state = state;
9853
9854 ASSERT(state->dts_ecbs[epid - 1] == NULL);
9855 dtrace_membar_producer();
9856 state->dts_ecbs[(ecb->dte_epid = epid) - 1] = ecb;
9857
9858 return (ecb);
9859}
9860
9861static void
9862dtrace_ecb_enable(dtrace_ecb_t *ecb)
9863{
9864 dtrace_probe_t *probe = ecb->dte_probe;
9865
9866 ASSERT(MUTEX_HELD(&cpu_lock));
9867 ASSERT(MUTEX_HELD(&dtrace_lock));
9868 ASSERT(ecb->dte_next == NULL);
9869
9870 if (probe == NULL) {
9871 /*
9872 * This is the NULL probe -- there's nothing to do.
9873 */
9874 return;
9875 }
9876
9877 if (probe->dtpr_ecb == NULL) {
9878 dtrace_provider_t *prov = probe->dtpr_provider;
9879
9880 /*
9881 * We're the first ECB on this probe.
9882 */
9883 probe->dtpr_ecb = probe->dtpr_ecb_last = ecb;
9884
9885 if (ecb->dte_predicate != NULL)
9886 probe->dtpr_predcache = ecb->dte_predicate->dtp_cacheid;
9887
9888 prov->dtpv_pops.dtps_enable(prov->dtpv_arg,
9889 probe->dtpr_id, probe->dtpr_arg);
9890 } else {
9891 /*
9892 * This probe is already active. Swing the last pointer to
9893 * point to the new ECB, and issue a dtrace_sync() to assure
9894 * that all CPUs have seen the change.
9895 */
9896 ASSERT(probe->dtpr_ecb_last != NULL);
9897 probe->dtpr_ecb_last->dte_next = ecb;
9898 probe->dtpr_ecb_last = ecb;
9899 probe->dtpr_predcache = 0;
9900
9901 dtrace_sync();
9902 }
9903}
9904
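/*
 * Recompute the ECB's record layout from scratch: walk the action
 * list assigning each record its offset and alignment, grouping
 * aggregation tuple members ahead of their aggregation and tracking
 * the total buffer space that the ECB needs.
 */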
9905static void
9906dtrace_ecb_resize(dtrace_ecb_t *ecb)
9907{
9908 dtrace_action_t *act;
9909 uint32_t curneeded = UINT32_MAX;
9910 uint32_t aggbase = UINT32_MAX;
9911
9912 /*
9913 * If we record anything, we always record the dtrace_rechdr_t. (And
9914 * we always record it first.)
9915 */
9916 ecb->dte_size = sizeof (dtrace_rechdr_t);
9917 ecb->dte_alignment = sizeof (dtrace_epid_t);
9918
9919 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
9920 dtrace_recdesc_t *rec = &act->dta_rec;
9921 ASSERT(rec->dtrd_size > 0 || rec->dtrd_alignment == 1);
9922
9923 ecb->dte_alignment = MAX(ecb->dte_alignment,
9924 rec->dtrd_alignment);
9925
9926 if (DTRACEACT_ISAGG(act->dta_kind)) {
9927 dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
9928
9929 ASSERT(rec->dtrd_size != 0);
9930 ASSERT(agg->dtag_first != NULL);
9931 ASSERT(act->dta_prev->dta_intuple);
9932 ASSERT(aggbase != UINT32_MAX);
9933 ASSERT(curneeded != UINT32_MAX);
9934
9935 agg->dtag_base = aggbase;
9936
9937 curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment);
9938 rec->dtrd_offset = curneeded;
9939 curneeded += rec->dtrd_size;
9940 ecb->dte_needed = MAX(ecb->dte_needed, curneeded);
9941
9942 aggbase = UINT32_MAX;
9943 curneeded = UINT32_MAX;
9944 } else if (act->dta_intuple) {
9945 if (curneeded == UINT32_MAX) {
9946 /*
9947 * This is the first record in a tuple. Align
9948 * curneeded to be at offset 4 in an 8-byte
9949 * aligned block.
9950 */
9951 ASSERT(act->dta_prev == NULL ||
9952 !act->dta_prev->dta_intuple);
9953 ASSERT3U(aggbase, ==, UINT32_MAX);
9954 curneeded = P2PHASEUP(ecb->dte_size,
9955 sizeof (uint64_t), sizeof (dtrace_aggid_t));
9956
9957 aggbase = curneeded - sizeof (dtrace_aggid_t);
9958 ASSERT(IS_P2ALIGNED(aggbase,
9959 sizeof (uint64_t)));
9960 }
9961 curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment);
9962 rec->dtrd_offset = curneeded;
9963 curneeded += rec->dtrd_size;
9964 } else {
9965 /* tuples must be followed by an aggregation */
9966 ASSERT(act->dta_prev == NULL ||
9967 !act->dta_prev->dta_intuple);
9968
9969 ecb->dte_size = P2ROUNDUP(ecb->dte_size,
9970 rec->dtrd_alignment);
9971 rec->dtrd_offset = ecb->dte_size;
9972 ecb->dte_size += rec->dtrd_size;
9973 ecb->dte_needed = MAX(ecb->dte_needed, ecb->dte_size);
9974 }
9975 }
9976
9977 if ((act = ecb->dte_action) != NULL &&
9978 !(act->dta_kind == DTRACEACT_SPECULATE && act->dta_next == NULL) &&
9979 ecb->dte_size == sizeof (dtrace_rechdr_t)) {
9980 /*
9981 * If the size is still sizeof (dtrace_rechdr_t), then all
9982 * actions store no data; set the size to 0.
9983 */
9984 ecb->dte_size = 0;
9985 }
9986
9987 ecb->dte_size = P2ROUNDUP(ecb->dte_size, sizeof (dtrace_epid_t));
9988 ecb->dte_needed = P2ROUNDUP(ecb->dte_needed, (sizeof (dtrace_epid_t)));
9989 ecb->dte_state->dts_needed = MAX(ecb->dte_state->dts_needed,
9990 ecb->dte_needed);
9991}
9992
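/*
 * Create an aggregation for the given action description: size the
 * aggregation buffer according to the aggregating function, locate the
 * actions that make up the n-tuple key, and allocate an aggregation ID.
 * Returns NULL if the description is malformed.
 */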
9993static dtrace_action_t *
9994dtrace_ecb_aggregation_create(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
9995{
9996 dtrace_aggregation_t *agg;
9997 size_t size = sizeof (uint64_t);
9998 int ntuple = desc->dtad_ntuple;
9999 dtrace_action_t *act;
10000 dtrace_recdesc_t *frec;
10001 dtrace_aggid_t aggid;
10002 dtrace_state_t *state = ecb->dte_state;
10003
10004 agg = kmem_zalloc(sizeof (dtrace_aggregation_t), KM_SLEEP);
10005 agg->dtag_ecb = ecb;
10006
10007 ASSERT(DTRACEACT_ISAGG(desc->dtad_kind));
10008
10009 switch (desc->dtad_kind) {
10010 case DTRACEAGG_MIN:
10011 agg->dtag_initial = INT64_MAX;
10012 agg->dtag_aggregate = dtrace_aggregate_min;
10013 break;
10014
10015 case DTRACEAGG_MAX:
10016 agg->dtag_initial = INT64_MIN;
10017 agg->dtag_aggregate = dtrace_aggregate_max;
10018 break;
10019
10020 case DTRACEAGG_COUNT:
10021 agg->dtag_aggregate = dtrace_aggregate_count;
10022 break;
10023
10024 case DTRACEAGG_QUANTIZE:
10025 agg->dtag_aggregate = dtrace_aggregate_quantize;
10026 size = (((sizeof (uint64_t) * NBBY) - 1) * 2 + 1) *
10027 sizeof (uint64_t);
10028 break;
10029
10030 case DTRACEAGG_LQUANTIZE: {
10031 uint16_t step = DTRACE_LQUANTIZE_STEP(desc->dtad_arg);
10032 uint16_t levels = DTRACE_LQUANTIZE_LEVELS(desc->dtad_arg);
10033
10034 agg->dtag_initial = desc->dtad_arg;
10035 agg->dtag_aggregate = dtrace_aggregate_lquantize;
10036
10037 if (step == 0 || levels == 0)
10038 goto err;
10039
10040 size = levels * sizeof (uint64_t) + 3 * sizeof (uint64_t);
10041 break;
10042 }
10043
10044 case DTRACEAGG_LLQUANTIZE: {
10045 uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(desc->dtad_arg);
10046 uint16_t low = DTRACE_LLQUANTIZE_LOW(desc->dtad_arg);
10047 uint16_t high = DTRACE_LLQUANTIZE_HIGH(desc->dtad_arg);
10048 uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(desc->dtad_arg);
10049 int64_t v;
10050
10051 agg->dtag_initial = desc->dtad_arg;
10052 agg->dtag_aggregate = dtrace_aggregate_llquantize;
10053
10054 if (factor < 2 || low >= high || nsteps < factor)
10055 goto err;
10056
10057 /*
10058 * Now check that the number of steps evenly divides a power
10059 * of the factor. (This assures both integer bucket size and
10060 * linearity within each magnitude.)
10061 */
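/*
 * For example (illustrative values): factor = 10 and nsteps = 20
 * pass, since v walks 10, 100 and 100 % 20 == 0 && 20 % 10 == 0;
 * nsteps = 30 would fail, as 100 % 30 != 0.
 */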
10062 for (v = factor; v < nsteps; v *= factor)
10063 continue;
10064
10065 if ((v % nsteps) || (nsteps % factor))
10066 goto err;
10067
10068 size = (dtrace_aggregate_llquantize_bucket(factor,
10069 low, high, nsteps, INT64_MAX) + 2) * sizeof (uint64_t);
10070 break;
10071 }
10072
10073 case DTRACEAGG_AVG:
10074 agg->dtag_aggregate = dtrace_aggregate_avg;
10075 size = sizeof (uint64_t) * 2;
10076 break;
10077
10078 case DTRACEAGG_STDDEV:
10079 agg->dtag_aggregate = dtrace_aggregate_stddev;
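		/*
		 * Four words: the count, the sum, and a 128-bit sum of
		 * squares (two words).
		 */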
10080 size = sizeof (uint64_t) * 4;
10081 break;
10082
10083 case DTRACEAGG_SUM:
10084 agg->dtag_aggregate = dtrace_aggregate_sum;
10085 break;
10086
10087 default:
10088 goto err;
10089 }
10090
10091 agg->dtag_action.dta_rec.dtrd_size = size;
10092
10093 if (ntuple == 0)
10094 goto err;
10095
10096 /*
10097 * We must make sure that we have enough actions for the n-tuple.
10098 */
10099 for (act = ecb->dte_action_last; act != NULL; act = act->dta_prev) {
10100 if (DTRACEACT_ISAGG(act->dta_kind))
10101 break;
10102
10103 if (--ntuple == 0) {
10104 /*
10105 * This is the action with which our n-tuple begins.
10106 */
10107 agg->dtag_first = act;
10108 goto success;
10109 }
10110 }
10111
10112 /*
10113 * This n-tuple is short by ntuple elements. Return failure.
10114 */
10115 ASSERT(ntuple != 0);
10116err:
10117 kmem_free(agg, sizeof (dtrace_aggregation_t));
10118 return (NULL);
10119
10120success:
10121 /*
10122 * If the last action in the tuple has a size of zero, it's actually
10123 * an expression argument for the aggregating action.
10124 */
10125 ASSERT(ecb->dte_action_last != NULL);
10126 act = ecb->dte_action_last;
10127
10128 if (act->dta_kind == DTRACEACT_DIFEXPR) {
10129 ASSERT(act->dta_difo != NULL);
10130
10131 if (act->dta_difo->dtdo_rtype.dtdt_size == 0)
10132 agg->dtag_hasarg = 1;
10133 }
10134
10135 /*
10136 * We need to allocate an id for this aggregation.
10137 */
10138#if defined(sun)
10139 aggid = (dtrace_aggid_t)(uintptr_t)vmem_alloc(state->dts_aggid_arena, 1,
10140 VM_BESTFIT | VM_SLEEP);
10141#else
10142 aggid = alloc_unr(state->dts_aggid_arena);
10143#endif
10144
10145 if (aggid - 1 >= state->dts_naggregations) {
10146 dtrace_aggregation_t **oaggs = state->dts_aggregations;
10147 dtrace_aggregation_t **aggs;
10148 int naggs = state->dts_naggregations << 1;
10149 int onaggs = state->dts_naggregations;
10150
10151 ASSERT(aggid == state->dts_naggregations + 1);
10152
10153 if (naggs == 0) {
10154 ASSERT(oaggs == NULL);
10155 naggs = 1;
10156 }
10157
10158 aggs = kmem_zalloc(naggs * sizeof (*aggs), KM_SLEEP);
10159
10160 if (oaggs != NULL) {
10161 bcopy(oaggs, aggs, onaggs * sizeof (*aggs));
10162 kmem_free(oaggs, onaggs * sizeof (*aggs));
10163 }
10164
10165 state->dts_aggregations = aggs;
10166 state->dts_naggregations = naggs;
10167 }
10168
10169 ASSERT(state->dts_aggregations[aggid - 1] == NULL);
10170 state->dts_aggregations[(agg->dtag_id = aggid) - 1] = agg;
10171
10172 frec = &agg->dtag_first->dta_rec;
10173 if (frec->dtrd_alignment < sizeof (dtrace_aggid_t))
10174 frec->dtrd_alignment = sizeof (dtrace_aggid_t);
10175
10176 for (act = agg->dtag_first; act != NULL; act = act->dta_next) {
10177 ASSERT(!act->dta_intuple);
10178 act->dta_intuple = 1;
10179 }
10180
10181 return (&agg->dtag_action);
10182}
10183
10184static void
10185dtrace_ecb_aggregation_destroy(dtrace_ecb_t *ecb, dtrace_action_t *act)
10186{
10187 dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
10188 dtrace_state_t *state = ecb->dte_state;
10189 dtrace_aggid_t aggid = agg->dtag_id;
10190
10191 ASSERT(DTRACEACT_ISAGG(act->dta_kind));
10192#if defined(sun)
10193 vmem_free(state->dts_aggid_arena, (void *)(uintptr_t)aggid, 1);
10194#else
10195 free_unr(state->dts_aggid_arena, aggid);
10196#endif
10197
10198 ASSERT(state->dts_aggregations[aggid - 1] == agg);
10199 state->dts_aggregations[aggid - 1] = NULL;
10200
10201 kmem_free(agg, sizeof (dtrace_aggregation_t));
10202}
10203
10204static int
10205dtrace_ecb_action_add(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
10206{
10207 dtrace_action_t *action, *last;
10208 dtrace_difo_t *dp = desc->dtad_difo;
10209 uint32_t size = 0, align = sizeof (uint8_t), mask;
10210 uint16_t format = 0;
10211 dtrace_recdesc_t *rec;
10212 dtrace_state_t *state = ecb->dte_state;
10213 dtrace_optval_t *opt = state->dts_options, nframes = 0, strsize;
10214 uint64_t arg = desc->dtad_arg;
10215
10216 ASSERT(MUTEX_HELD(&dtrace_lock));
10217 ASSERT(ecb->dte_action == NULL || ecb->dte_action->dta_refcnt == 1);
10218
10219 if (DTRACEACT_ISAGG(desc->dtad_kind)) {
10220 /*
10221 * If this is an aggregating action, there must be neither
10222 * a speculate nor a commit on the action chain.
10223 */
10224 dtrace_action_t *act;
10225
10226 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
10227 if (act->dta_kind == DTRACEACT_COMMIT)
10228 return (EINVAL);
10229
10230 if (act->dta_kind == DTRACEACT_SPECULATE)
10231 return (EINVAL);
10232 }
10233
10234 action = dtrace_ecb_aggregation_create(ecb, desc);
10235
10236 if (action == NULL)
10237 return (EINVAL);
10238 } else {
10239 if (DTRACEACT_ISDESTRUCTIVE(desc->dtad_kind) ||
10240 (desc->dtad_kind == DTRACEACT_DIFEXPR &&
10241 dp != NULL && dp->dtdo_destructive)) {
10242 state->dts_destructive = 1;
10243 }
10244
10245 switch (desc->dtad_kind) {
10246 case DTRACEACT_PRINTF:
10247 case DTRACEACT_PRINTA:
10248 case DTRACEACT_SYSTEM:
10249 case DTRACEACT_FREOPEN:
10250 case DTRACEACT_DIFEXPR:
10251 /*
10252 * We know that our arg is a string -- turn it into a
10253 * format.
10254 */
10255 if (arg == 0) {
10256 ASSERT(desc->dtad_kind == DTRACEACT_PRINTA ||
10257 desc->dtad_kind == DTRACEACT_DIFEXPR);
10258 format = 0;
10259 } else {
10260 ASSERT(arg != 0);
10261#if defined(sun)
10262 ASSERT(arg > KERNELBASE);
10263#endif
10264 format = dtrace_format_add(state,
10265 (char *)(uintptr_t)arg);
10266 }
10267
10268 /*FALLTHROUGH*/
10269 case DTRACEACT_LIBACT:
10270 case DTRACEACT_TRACEMEM:
10271 case DTRACEACT_TRACEMEM_DYNSIZE:
10272 if (dp == NULL)
10273 return (EINVAL);
10274
10275 if ((size = dp->dtdo_rtype.dtdt_size) != 0)
10276 break;
10277
10278 if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) {
10279 if (!(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
10280 return (EINVAL);
10281
10282 size = opt[DTRACEOPT_STRSIZE];
10283 }
10284
10285 break;
10286
10287 case DTRACEACT_STACK:
10288 if ((nframes = arg) == 0) {
10289 nframes = opt[DTRACEOPT_STACKFRAMES];
10290 ASSERT(nframes > 0);
10291 arg = nframes;
10292 }
10293
10294 size = nframes * sizeof (pc_t);
10295 break;
10296
10297 case DTRACEACT_JSTACK:
10298 if ((strsize = DTRACE_USTACK_STRSIZE(arg)) == 0)
10299 strsize = opt[DTRACEOPT_JSTACKSTRSIZE];
10300
10301 if ((nframes = DTRACE_USTACK_NFRAMES(arg)) == 0)
10302 nframes = opt[DTRACEOPT_JSTACKFRAMES];
10303
10304 arg = DTRACE_USTACK_ARG(nframes, strsize);
10305
10306 /*FALLTHROUGH*/
10307 case DTRACEACT_USTACK:
10308 if (desc->dtad_kind != DTRACEACT_JSTACK &&
10309 (nframes = DTRACE_USTACK_NFRAMES(arg)) == 0) {
10310 strsize = DTRACE_USTACK_STRSIZE(arg);
10311 nframes = opt[DTRACEOPT_USTACKFRAMES];
10312 ASSERT(nframes > 0);
10313 arg = DTRACE_USTACK_ARG(nframes, strsize);
10314 }
10315
10316 /*
10317 * Save a slot for the pid.
10318 */
10319 size = (nframes + 1) * sizeof (uint64_t);
10320 size += DTRACE_USTACK_STRSIZE(arg);
10321 size = P2ROUNDUP(size, (uint32_t)(sizeof (uintptr_t)));
10322
10323 break;
10324
10325 case DTRACEACT_SYM:
10326 case DTRACEACT_MOD:
10327 if (dp == NULL || ((size = dp->dtdo_rtype.dtdt_size) !=
10328 sizeof (uint64_t)) ||
10329 (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
10330 return (EINVAL);
10331 break;
10332
10333 case DTRACEACT_USYM:
10334 case DTRACEACT_UMOD:
10335 case DTRACEACT_UADDR:
10336 if (dp == NULL ||
10337 (dp->dtdo_rtype.dtdt_size != sizeof (uint64_t)) ||
10338 (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
10339 return (EINVAL);
10340
10341 /*
10342 * We have a slot for the pid, plus a slot for the
10343 * argument. To keep things simple (aligned with
10344 * bitness-neutral sizing), we store each as a 64-bit
10345 * quantity.
10346 */
10347 size = 2 * sizeof (uint64_t);
10348 break;
10349
10350 case DTRACEACT_STOP:
10351 case DTRACEACT_BREAKPOINT:
10352 case DTRACEACT_PANIC:
10353 break;
10354
10355 case DTRACEACT_CHILL:
10356 case DTRACEACT_DISCARD:
10357 case DTRACEACT_RAISE:
10358 if (dp == NULL)
10359 return (EINVAL);
10360 break;
10361
10362 case DTRACEACT_EXIT:
10363 if (dp == NULL ||
10364 (size = dp->dtdo_rtype.dtdt_size) != sizeof (int) ||
10365 (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
10366 return (EINVAL);
10367 break;
10368
10369 case DTRACEACT_SPECULATE:
10370 if (ecb->dte_size > sizeof (dtrace_rechdr_t))
10371 return (EINVAL);
10372
10373 if (dp == NULL)
10374 return (EINVAL);
10375
10376 state->dts_speculates = 1;
10377 break;
10378
10379		case DTRACEACT_PRINTM:
10380		case DTRACEACT_PRINTT:
10381			if (dp == NULL)
10382				return (EINVAL);
10383
10384			size = dp->dtdo_rtype.dtdt_size;
10385			break;
10386
10387 case DTRACEACT_COMMIT: {
10388 dtrace_action_t *act = ecb->dte_action;
10389
10390 for (; act != NULL; act = act->dta_next) {
10391 if (act->dta_kind == DTRACEACT_COMMIT)
10392 return (EINVAL);
10393 }
10394
10395 if (dp == NULL)
10396 return (EINVAL);
10397 break;
10398 }
10399
10400 default:
10401 return (EINVAL);
10402 }
10403
10404 if (size != 0 || desc->dtad_kind == DTRACEACT_SPECULATE) {
10405 /*
10406 * If this is a data-storing action or a speculate,
10407 * we must be sure that there isn't a commit on the
10408 * action chain.
10409 */
10410 dtrace_action_t *act = ecb->dte_action;
10411
10412 for (; act != NULL; act = act->dta_next) {
10413 if (act->dta_kind == DTRACEACT_COMMIT)
10414 return (EINVAL);
10415 }
10416 }
10417
10418 action = kmem_zalloc(sizeof (dtrace_action_t), KM_SLEEP);
10419 action->dta_rec.dtrd_size = size;
10420 }
10421
10422 action->dta_refcnt = 1;
10423 rec = &action->dta_rec;
10424 size = rec->dtrd_size;
10425
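	/*
	 * Derive the record's alignment from the low-order bits of its
	 * size: a multiple of 8 gets 8-byte alignment, a multiple of 4
	 * gets 4-byte alignment, and so on down to byte alignment.
	 */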
10426 for (mask = sizeof (uint64_t) - 1; size != 0 && mask > 0; mask >>= 1) {
10427 if (!(size & mask)) {
10428 align = mask + 1;
10429 break;
10430 }
10431 }
10432
10433 action->dta_kind = desc->dtad_kind;
10434
10435 if ((action->dta_difo = dp) != NULL)
10436 dtrace_difo_hold(dp);
10437
10438 rec->dtrd_action = action->dta_kind;
10439 rec->dtrd_arg = arg;
10440 rec->dtrd_uarg = desc->dtad_uarg;
10441 rec->dtrd_alignment = (uint16_t)align;
10442 rec->dtrd_format = format;
10443
10444 if ((last = ecb->dte_action_last) != NULL) {
10445 ASSERT(ecb->dte_action != NULL);
10446 action->dta_prev = last;
10447 last->dta_next = action;
10448 } else {
10449 ASSERT(ecb->dte_action == NULL);
10450 ecb->dte_action = action;
10451 }
10452
10453 ecb->dte_action_last = action;
10454
10455 return (0);
10456}
10457
10458static void
10459dtrace_ecb_action_remove(dtrace_ecb_t *ecb)
10460{
10461 dtrace_action_t *act = ecb->dte_action, *next;
10462 dtrace_vstate_t *vstate = &ecb->dte_state->dts_vstate;
10463 dtrace_difo_t *dp;
10464 uint16_t format;
10465
10466 if (act != NULL && act->dta_refcnt > 1) {
10467 ASSERT(act->dta_next == NULL || act->dta_next->dta_refcnt == 1);
10468 act->dta_refcnt--;
10469 } else {
10470 for (; act != NULL; act = next) {
10471 next = act->dta_next;
10472 ASSERT(next != NULL || act == ecb->dte_action_last);
10473 ASSERT(act->dta_refcnt == 1);
10474
10475 if ((format = act->dta_rec.dtrd_format) != 0)
10476 dtrace_format_remove(ecb->dte_state, format);
10477
10478 if ((dp = act->dta_difo) != NULL)
10479 dtrace_difo_release(dp, vstate);
10480
10481 if (DTRACEACT_ISAGG(act->dta_kind)) {
10482 dtrace_ecb_aggregation_destroy(ecb, act);
10483 } else {
10484 kmem_free(act, sizeof (dtrace_action_t));
10485 }
10486 }
10487 }
10488
10489 ecb->dte_action = NULL;
10490 ecb->dte_action_last = NULL;
10491 ecb->dte_size = 0;
10492}
10493
10494static void
10495dtrace_ecb_disable(dtrace_ecb_t *ecb)
10496{
10497 /*
10498 * We disable the ECB by removing it from its probe.
10499 */
10500 dtrace_ecb_t *pecb, *prev = NULL;
10501 dtrace_probe_t *probe = ecb->dte_probe;
10502
10503 ASSERT(MUTEX_HELD(&dtrace_lock));
10504
10505 if (probe == NULL) {
10506 /*
10507 * This is the NULL probe; there is nothing to disable.
10508 */
10509 return;
10510 }
10511
10512 for (pecb = probe->dtpr_ecb; pecb != NULL; pecb = pecb->dte_next) {
10513 if (pecb == ecb)
10514 break;
10515 prev = pecb;
10516 }
10517
10518 ASSERT(pecb != NULL);
10519
10520 if (prev == NULL) {
10521 probe->dtpr_ecb = ecb->dte_next;
10522 } else {
10523 prev->dte_next = ecb->dte_next;
10524 }
10525
10526 if (ecb == probe->dtpr_ecb_last) {
10527 ASSERT(ecb->dte_next == NULL);
10528 probe->dtpr_ecb_last = prev;
10529 }
10530
10531 /*
10532 * The ECB has been disconnected from the probe; now sync to assure
10533 * that all CPUs have seen the change before returning.
10534 */
10535 dtrace_sync();
10536
10537 if (probe->dtpr_ecb == NULL) {
10538 /*
10539 * That was the last ECB on the probe; clear the predicate
10540 * cache ID for the probe, disable it and sync one more time
10541 * to assure that we'll never hit it again.
10542 */
10543 dtrace_provider_t *prov = probe->dtpr_provider;
10544
10545 ASSERT(ecb->dte_next == NULL);
10546 ASSERT(probe->dtpr_ecb_last == NULL);
10547 probe->dtpr_predcache = DTRACE_CACHEIDNONE;
10548 prov->dtpv_pops.dtps_disable(prov->dtpv_arg,
10549 probe->dtpr_id, probe->dtpr_arg);
10550 dtrace_sync();
10551 } else {
10552 /*
10553 * There is at least one ECB remaining on the probe. If there
10554 * is _exactly_ one, set the probe's predicate cache ID to be
10555 * the predicate cache ID of the remaining ECB.
10556 */
10557 ASSERT(probe->dtpr_ecb_last != NULL);
10558 ASSERT(probe->dtpr_predcache == DTRACE_CACHEIDNONE);
10559
10560 if (probe->dtpr_ecb == probe->dtpr_ecb_last) {
10561 dtrace_predicate_t *p = probe->dtpr_ecb->dte_predicate;
10562
10563 ASSERT(probe->dtpr_ecb->dte_next == NULL);
10564
10565 if (p != NULL)
10566 probe->dtpr_predcache = p->dtp_cacheid;
10567 }
10568
10569 ecb->dte_next = NULL;
10570 }
10571}
10572
10573static void
10574dtrace_ecb_destroy(dtrace_ecb_t *ecb)
10575{
10576 dtrace_state_t *state = ecb->dte_state;
10577 dtrace_vstate_t *vstate = &state->dts_vstate;
10578 dtrace_predicate_t *pred;
10579 dtrace_epid_t epid = ecb->dte_epid;
10580
10581 ASSERT(MUTEX_HELD(&dtrace_lock));
10582 ASSERT(ecb->dte_next == NULL);
10583 ASSERT(ecb->dte_probe == NULL || ecb->dte_probe->dtpr_ecb != ecb);
10584
10585 if ((pred = ecb->dte_predicate) != NULL)
10586 dtrace_predicate_release(pred, vstate);
10587
10588 dtrace_ecb_action_remove(ecb);
10589
10590 ASSERT(state->dts_ecbs[epid - 1] == ecb);
10591 state->dts_ecbs[epid - 1] = NULL;
10592
10593 kmem_free(ecb, sizeof (dtrace_ecb_t));
10594}
10595
10596static dtrace_ecb_t *
10597dtrace_ecb_create(dtrace_state_t *state, dtrace_probe_t *probe,
10598 dtrace_enabling_t *enab)
10599{
10600 dtrace_ecb_t *ecb;
10601 dtrace_predicate_t *pred;
10602 dtrace_actdesc_t *act;
10603 dtrace_provider_t *prov;
10604 dtrace_ecbdesc_t *desc = enab->dten_current;
10605
10606 ASSERT(MUTEX_HELD(&dtrace_lock));
10607 ASSERT(state != NULL);
10608
10609 ecb = dtrace_ecb_add(state, probe);
10610 ecb->dte_uarg = desc->dted_uarg;
10611
10612 if ((pred = desc->dted_pred.dtpdd_predicate) != NULL) {
10613 dtrace_predicate_hold(pred);
10614 ecb->dte_predicate = pred;
10615 }
10616
10617 if (probe != NULL) {
10618 /*
10619 * If the provider shows more leg than the consumer is old
10620 * enough to see, we need to enable the appropriate implicit
10621 * predicate bits to prevent the ecb from activating at
10622 * revealing times.
10623 *
10624 * Providers specifying DTRACE_PRIV_USER at register time
10625 * are stating that they need the /proc-style privilege
10626 * model to be enforced, and this is what DTRACE_COND_OWNER
10627 * and DTRACE_COND_ZONEOWNER will then do at probe time.
10628 */
10629 prov = probe->dtpr_provider;
10630 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLPROC) &&
10631 (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
10632 ecb->dte_cond |= DTRACE_COND_OWNER;
10633
10634 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLZONE) &&
10635 (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
10636 ecb->dte_cond |= DTRACE_COND_ZONEOWNER;
10637
10638 /*
10639 * If the provider shows us kernel innards and the user
10640 * is lacking sufficient privilege, enable the
10641 * DTRACE_COND_USERMODE implicit predicate.
10642 */
10643 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) &&
10644 (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_KERNEL))
10645 ecb->dte_cond |= DTRACE_COND_USERMODE;
10646 }
10647
10648 if (dtrace_ecb_create_cache != NULL) {
10649 /*
10650 * If we have a cached ecb, we'll use its action list instead
10651 * of creating our own (saving both time and space).
10652 */
10653 dtrace_ecb_t *cached = dtrace_ecb_create_cache;
10654 dtrace_action_t *act = cached->dte_action;
10655
10656 if (act != NULL) {
10657 ASSERT(act->dta_refcnt > 0);
10658 act->dta_refcnt++;
10659 ecb->dte_action = act;
10660 ecb->dte_action_last = cached->dte_action_last;
10661 ecb->dte_needed = cached->dte_needed;
10662 ecb->dte_size = cached->dte_size;
10663 ecb->dte_alignment = cached->dte_alignment;
10664 }
10665
10666 return (ecb);
10667 }
10668
10669 for (act = desc->dted_action; act != NULL; act = act->dtad_next) {
10670 if ((enab->dten_error = dtrace_ecb_action_add(ecb, act)) != 0) {
10671 dtrace_ecb_destroy(ecb);
10672 return (NULL);
10673 }
10674 }
10675
10676 dtrace_ecb_resize(ecb);
10677
10678 return (dtrace_ecb_create_cache = ecb);
10679}
10680
10681static int
10682dtrace_ecb_create_enable(dtrace_probe_t *probe, void *arg)
10683{
10684 dtrace_ecb_t *ecb;
10685 dtrace_enabling_t *enab = arg;
10686 dtrace_state_t *state = enab->dten_vstate->dtvs_state;
10687
10688 ASSERT(state != NULL);
10689
10690 if (probe != NULL && probe->dtpr_gen < enab->dten_probegen) {
10691 /*
10692 * This probe was created in a generation for which this
10693 * enabling has previously created ECBs; we don't want to
10694 * enable it again, so just kick out.
10695 */
10696 return (DTRACE_MATCH_NEXT);
10697 }
10698
10699 if ((ecb = dtrace_ecb_create(state, probe, enab)) == NULL)
10700 return (DTRACE_MATCH_DONE);
10701
10702 dtrace_ecb_enable(ecb);
10703 return (DTRACE_MATCH_NEXT);
10704}
10705
10706static dtrace_ecb_t *
10707dtrace_epid2ecb(dtrace_state_t *state, dtrace_epid_t id)
10708{
10709 dtrace_ecb_t *ecb;
10710
10711 ASSERT(MUTEX_HELD(&dtrace_lock));
10712
10713 if (id == 0 || id > state->dts_necbs)
10714 return (NULL);
10715
10716 ASSERT(state->dts_necbs > 0 && state->dts_ecbs != NULL);
10717 ASSERT((ecb = state->dts_ecbs[id - 1]) == NULL || ecb->dte_epid == id);
10718
10719 return (state->dts_ecbs[id - 1]);
10720}
10721
10722static dtrace_aggregation_t *
10723dtrace_aggid2agg(dtrace_state_t *state, dtrace_aggid_t id)
10724{
10725 dtrace_aggregation_t *agg;
10726
10727 ASSERT(MUTEX_HELD(&dtrace_lock));
10728
10729 if (id == 0 || id > state->dts_naggregations)
10730 return (NULL);
10731
10732 ASSERT(state->dts_naggregations > 0 && state->dts_aggregations != NULL);
10733 ASSERT((agg = state->dts_aggregations[id - 1]) == NULL ||
10734 agg->dtag_id == id);
10735
10736 return (state->dts_aggregations[id - 1]);
10737}
10738
10739/*
10740 * DTrace Buffer Functions
10741 *
10742 * The following functions manipulate DTrace buffers. Most of these functions
10743 * are called in the context of establishing or processing consumer state;
10744 * exceptions are explicitly noted.
10745 */
10746
10747/*
10748 * Note: called from cross call context. This function switches the two
10749 * buffers on a given CPU. The atomicity of this operation is assured by
10750 * disabling interrupts while the actual switch takes place; the disabling of
10751 * interrupts serializes the execution with any execution of dtrace_probe() on
10752 * the same CPU.
10753 */
10754static void
10755dtrace_buffer_switch(dtrace_buffer_t *buf)
10756{
10757 caddr_t tomax = buf->dtb_tomax;
10758 caddr_t xamot = buf->dtb_xamot;
10759 dtrace_icookie_t cookie;
10760 hrtime_t now;
10761
10762 ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
10763 ASSERT(!(buf->dtb_flags & DTRACEBUF_RING));
10764
10765 cookie = dtrace_interrupt_disable();
10766 now = dtrace_gethrtime();
10767 buf->dtb_tomax = xamot;
10768 buf->dtb_xamot = tomax;
10769 buf->dtb_xamot_drops = buf->dtb_drops;
10770 buf->dtb_xamot_offset = buf->dtb_offset;
10771 buf->dtb_xamot_errors = buf->dtb_errors;
10772 buf->dtb_xamot_flags = buf->dtb_flags;
10773 buf->dtb_offset = 0;
10774 buf->dtb_drops = 0;
10775 buf->dtb_errors = 0;
10776 buf->dtb_flags &= ~(DTRACEBUF_ERROR | DTRACEBUF_DROPPED);
10777 buf->dtb_interval = now - buf->dtb_switched;
10778 buf->dtb_switched = now;
10779 dtrace_interrupt_enable(cookie);
10780}
10781
10782/*
10783 * Note: called from cross call context. This function activates a buffer
10784 * on a CPU. As with dtrace_buffer_switch(), the atomicity of the operation
10785 * is guaranteed by the disabling of interrupts.
10786 */
10787static void
10788dtrace_buffer_activate(dtrace_state_t *state)
10789{
10790 dtrace_buffer_t *buf;
10791 dtrace_icookie_t cookie = dtrace_interrupt_disable();
10792
10793 buf = &state->dts_buffer[curcpu];
10794
10795 if (buf->dtb_tomax != NULL) {
10796 /*
10797 * We might like to assert that the buffer is marked inactive,
10798 * but this isn't necessarily true: the buffer for the CPU
10799 * that processes the BEGIN probe has its buffer activated
10800		 * manually. In this case, we take the (harmless) action of
10801		 * re-clearing the INACTIVE bit.
10802 */
10803 buf->dtb_flags &= ~DTRACEBUF_INACTIVE;
10804 }
10805
10806 dtrace_interrupt_enable(cookie);
10807}
10808
10809static int
10810dtrace_buffer_alloc(dtrace_buffer_t *bufs, size_t size, int flags,
10811 processorid_t cpu, int *factor)
10812{
10813#if defined(sun)
10814 cpu_t *cp;
10815#endif
10816 dtrace_buffer_t *buf;
10817 int allocated = 0, desired = 0;
10818
10819#if defined(sun)
10820 ASSERT(MUTEX_HELD(&cpu_lock));
10821 ASSERT(MUTEX_HELD(&dtrace_lock));
10822
10823 *factor = 1;
10824
10825 if (size > dtrace_nonroot_maxsize &&
10826 !PRIV_POLICY_CHOICE(CRED(), PRIV_ALL, B_FALSE))
10827 return (EFBIG);
10828
10829 cp = cpu_list;
10830
10831 do {
10832 if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
10833 continue;
10834
10835 buf = &bufs[cp->cpu_id];
10836
10837 /*
10838	 * If there is already a buffer allocated for this CPU, it can
10839	 * only be a DR event; its size must then match our specified size.
10840 */
10841 if (buf->dtb_tomax != NULL) {
10842 ASSERT(buf->dtb_size == size);
10843 continue;
10844 }
10845
10846 ASSERT(buf->dtb_xamot == NULL);
10847
10848 if ((buf->dtb_tomax = kmem_zalloc(size,
10849 KM_NOSLEEP | KM_NORMALPRI)) == NULL)
10850 goto err;
10851
10852 buf->dtb_size = size;
10853 buf->dtb_flags = flags;
10854 buf->dtb_offset = 0;
10855 buf->dtb_drops = 0;
10856
10857 if (flags & DTRACEBUF_NOSWITCH)
10858 continue;
10859
10860 if ((buf->dtb_xamot = kmem_zalloc(size,
10861 KM_NOSLEEP | KM_NORMALPRI)) == NULL)
10862 goto err;
10863 } while ((cp = cp->cpu_next) != cpu_list);
10864
10865 return (0);
10866
10867err:
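	/*
	 * Error allocating memory, so free the buffers that were
	 * allocated before the failed allocation.
	 */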
10868 cp = cpu_list;
10869
10870 do {
10871 if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
10872 continue;
10873
10874 buf = &bufs[cp->cpu_id];
10875 desired += 2;
10876
10877 if (buf->dtb_xamot != NULL) {
10878 ASSERT(buf->dtb_tomax != NULL);
10879 ASSERT(buf->dtb_size == size);
10880 kmem_free(buf->dtb_xamot, size);
10881 allocated++;
10882 }
10883
10884 if (buf->dtb_tomax != NULL) {
10885 ASSERT(buf->dtb_size == size);
10886 kmem_free(buf->dtb_tomax, size);
10887 allocated++;
10888 }
10889
10890 buf->dtb_tomax = NULL;
10891 buf->dtb_xamot = NULL;
10892 buf->dtb_size = 0;
10893 } while ((cp = cp->cpu_next) != cpu_list);
10894#else
10895 int i;
10896
10897 *factor = 1;
10898#if defined(__amd64__) || defined(__mips__) || defined(__powerpc__)
10899 /*
10900 * FreeBSD isn't good at limiting the amount of memory we
10901 * ask to malloc, so let's place a limit here before trying
10902 * to do something that might well end in tears at bedtime.
10903 */
10904 if (size > physmem * PAGE_SIZE / (128 * (mp_maxid + 1)))
10905 return (ENOMEM);
10906#endif
10907
10908 ASSERT(MUTEX_HELD(&dtrace_lock));
10909 CPU_FOREACH(i) {
10910 if (cpu != DTRACE_CPUALL && cpu != i)
10911 continue;
10912
10913 buf = &bufs[i];
10914
10915 /*
10916 * If there is already a buffer allocated for this CPU, it
10917 * is only possible that this is a DR event. In this case,
10918 * the buffer size must match our specified size.
10919 */
10920 if (buf->dtb_tomax != NULL) {
10921 ASSERT(buf->dtb_size == size);
10922 continue;
10923 }
10924
10925 ASSERT(buf->dtb_xamot == NULL);
10926
10927 if ((buf->dtb_tomax = kmem_zalloc(size,
10928 KM_NOSLEEP | KM_NORMALPRI)) == NULL)
10929 goto err;
10930
10931 buf->dtb_size = size;
10932 buf->dtb_flags = flags;
10933 buf->dtb_offset = 0;
10934 buf->dtb_drops = 0;
10935
10936 if (flags & DTRACEBUF_NOSWITCH)
10937 continue;
10938
10939 if ((buf->dtb_xamot = kmem_zalloc(size,
10940 KM_NOSLEEP | KM_NORMALPRI)) == NULL)
10941 goto err;
10942 }
10943
10944 return (0);
10945
10946err:
10947 /*
10948 * Error allocating memory, so free the buffers that were
10949 * allocated before the failed allocation.
10950 */
10951 CPU_FOREACH(i) {
10952 if (cpu != DTRACE_CPUALL && cpu != i)
10953 continue;
10954
10955 buf = &bufs[i];
10956 desired += 2;
10957
10958 if (buf->dtb_xamot != NULL) {
10959 ASSERT(buf->dtb_tomax != NULL);
10960 ASSERT(buf->dtb_size == size);
10961 kmem_free(buf->dtb_xamot, size);
10962 allocated++;
10963 }
10964
10965 if (buf->dtb_tomax != NULL) {
10966 ASSERT(buf->dtb_size == size);
10967 kmem_free(buf->dtb_tomax, size);
10968 allocated++;
10969 }
10970
10971 buf->dtb_tomax = NULL;
10972 buf->dtb_xamot = NULL;
10973 buf->dtb_size = 0;
10974
10975 }
10976#endif
10977 *factor = desired / (allocated > 0 ? allocated : 1);
10978
10979 return (ENOMEM);
10980}
10981
10982/*
10983 * Note: called from probe context. This function just increments the drop
10984 * count on a buffer. It has been made a function to allow for the
10985 * possibility of understanding the source of mysterious drop counts. (A
10986 * problem for which one may be particularly disappointed that DTrace cannot
10987 * be used to understand DTrace.)
10988 */
10989static void
10990dtrace_buffer_drop(dtrace_buffer_t *buf)
10991{
10992 buf->dtb_drops++;
10993}
10994
10995/*
10996 * Note: called from probe context. This function is called to reserve space
10997 * in a buffer. If mstate is non-NULL, sets the scratch base and size in the
10998 * mstate. Returns the new offset in the buffer, or a negative value if an
10999 * error has occurred.
11000 */
11001static intptr_t
11002dtrace_buffer_reserve(dtrace_buffer_t *buf, size_t needed, size_t align,
11003 dtrace_state_t *state, dtrace_mstate_t *mstate)
11004{
11005 intptr_t offs = buf->dtb_offset, soffs;
11006 intptr_t woffs;
11007 caddr_t tomax;
11008 size_t total;
11009
11010 if (buf->dtb_flags & DTRACEBUF_INACTIVE)
11011 return (-1);
11012
11013 if ((tomax = buf->dtb_tomax) == NULL) {
11014 dtrace_buffer_drop(buf);
11015 return (-1);
11016 }
11017
11018 if (!(buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL))) {
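		/*
		 * Pad to the requested alignment with DTRACE_EPIDNONE
		 * words, which consumers know to skip.
		 */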
11019 while (offs & (align - 1)) {
11020 /*
11021 * Assert that our alignment is off by a number which
11022 * is itself sizeof (uint32_t) aligned.
11023 */
11024 ASSERT(!((align - (offs & (align - 1))) &
11025 (sizeof (uint32_t) - 1)));
11026 DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
11027 offs += sizeof (uint32_t);
11028 }
11029
11030 if ((soffs = offs + needed) > buf->dtb_size) {
11031 dtrace_buffer_drop(buf);
11032 return (-1);
11033 }
11034
11035 if (mstate == NULL)
11036 return (offs);
11037
11038 mstate->dtms_scratch_base = (uintptr_t)tomax + soffs;
11039 mstate->dtms_scratch_size = buf->dtb_size - soffs;
11040 mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
11041
11042 return (offs);
11043 }
11044
11045 if (buf->dtb_flags & DTRACEBUF_FILL) {
11046 if (state->dts_activity != DTRACE_ACTIVITY_COOLDOWN &&
11047 (buf->dtb_flags & DTRACEBUF_FULL))
11048 return (-1);
11049 goto out;
11050 }
11051
11052 total = needed + (offs & (align - 1));
11053
11054 /*
11055 * For a ring buffer, life is quite a bit more complicated. Before
11056 * we can store any padding, we need to adjust our wrapping offset.
11057 * (If we've never before wrapped or we're not about to, no adjustment
11058 * is required.)
11059 */
11060 if ((buf->dtb_flags & DTRACEBUF_WRAPPED) ||
11061 offs + total > buf->dtb_size) {
11062 woffs = buf->dtb_xamot_offset;
11063
11064 if (offs + total > buf->dtb_size) {
11065 /*
11066 * We can't fit in the end of the buffer. First, a
11067 * sanity check that we can fit in the buffer at all.
11068 */
11069 if (total > buf->dtb_size) {
11070 dtrace_buffer_drop(buf);
11071 return (-1);
11072 }
11073
11074 /*
11075 * We're going to be storing at the top of the buffer,
11076 * so now we need to deal with the wrapped offset. We
11077 * only reset our wrapped offset to 0 if it is
11078 * currently greater than the current offset. If it
11079 * is less than the current offset, it is because a
11080 * previous allocation induced a wrap -- but the
11081 * allocation didn't subsequently take the space due
11082 * to an error or false predicate evaluation. In this
11083 * case, we'll just leave the wrapped offset alone: if
11084 * the wrapped offset hasn't been advanced far enough
11085 * for this allocation, it will be adjusted in the
11086 * lower loop.
11087 */
11088 if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
11089 if (woffs >= offs)
11090 woffs = 0;
11091 } else {
11092 woffs = 0;
11093 }
11094
11095 /*
11096 * Now we know that we're going to be storing to the
11097 * top of the buffer and that there is room for us
11098 * there. We need to clear the buffer from the current
11099 * offset to the end (there may be old gunk there).
11100 */
11101 while (offs < buf->dtb_size)
11102 tomax[offs++] = 0;
11103
11104 /*
11105 * We need to set our offset to zero. And because we
11106 * are wrapping, we need to set the bit indicating as
11107 * much. We can also adjust our needed space back
11108 * down to the space required by the ECB -- we know
11109 * that the top of the buffer is aligned.
11110 */
11111 offs = 0;
11112 total = needed;
11113 buf->dtb_flags |= DTRACEBUF_WRAPPED;
11114 } else {
11115 /*
11116 * There is room for us in the buffer, so we simply
11117 * need to check the wrapped offset.
11118 */
11119 if (woffs < offs) {
11120 /*
11121 * The wrapped offset is less than the offset.
11122 * This can happen if we allocated buffer space
11123 * that induced a wrap, but then we didn't
11124 * subsequently take the space due to an error
11125 * or false predicate evaluation. This is
11126 * okay; we know that _this_ allocation isn't
11127 * going to induce a wrap. We still can't
11128 * reset the wrapped offset to be zero,
11129 * however: the space may have been trashed in
11130 * the previous failed probe attempt. But at
11131 * least the wrapped offset doesn't need to
11132 * be adjusted at all...
11133 */
11134 goto out;
11135 }
11136 }
11137
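		/*
		 * Advance the wrapped offset one record at a time until the
		 * new reservation no longer overlaps the oldest records.
		 */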
11138 while (offs + total > woffs) {
11139 dtrace_epid_t epid = *(uint32_t *)(tomax + woffs);
11140 size_t size;
11141
11142 if (epid == DTRACE_EPIDNONE) {
11143 size = sizeof (uint32_t);
11144 } else {
11145 ASSERT3U(epid, <=, state->dts_necbs);
11146 ASSERT(state->dts_ecbs[epid - 1] != NULL);
11147
11148 size = state->dts_ecbs[epid - 1]->dte_size;
11149 }
11150
11151 ASSERT(woffs + size <= buf->dtb_size);
11152 ASSERT(size != 0);
11153
11154 if (woffs + size == buf->dtb_size) {
11155 /*
11156 * We've reached the end of the buffer; we want
11157 * to set the wrapped offset to 0 and break
11158 * out. However, if the offs is 0, then we're
11159 * in a strange edge-condition: the amount of
11160 * space that we want to reserve plus the size
11161 * of the record that we're overwriting is
11162 * greater than the size of the buffer. This
11163 * is problematic because if we reserve the
11164 * space but subsequently don't consume it (due
11165 * to a failed predicate or error) the wrapped
11166 * offset will be 0 -- yet the EPID at offset 0
11167 * will not be committed. This situation is
11168 * relatively easy to deal with: if we're in
11169 * this case, the buffer is indistinguishable
11170 * from one that hasn't wrapped; we need only
11171 * finish the job by clearing the wrapped bit,
11172 * explicitly setting the offset to be 0, and
11173 * zero'ing out the old data in the buffer.
11174 */
11175 if (offs == 0) {
11176 buf->dtb_flags &= ~DTRACEBUF_WRAPPED;
11177 buf->dtb_offset = 0;
11178 woffs = total;
11179
11180 while (woffs < buf->dtb_size)
11181 tomax[woffs++] = 0;
11182 }
11183
11184 woffs = 0;
11185 break;
11186 }
11187
11188 woffs += size;
11189 }
11190
11191 /*
11192 * We have a wrapped offset. It may be that the wrapped offset
11193 * has become zero -- that's okay.
11194 */
11195 buf->dtb_xamot_offset = woffs;
11196 }
11197
11198out:
11199 /*
11200 * Now we can plow the buffer with any necessary padding.
11201 */
11202 while (offs & (align - 1)) {
11203 /*
11204 * Assert that our alignment is off by a number which
11205 * is itself sizeof (uint32_t) aligned.
11206 */
11207 ASSERT(!((align - (offs & (align - 1))) &
11208 (sizeof (uint32_t) - 1)));
11209 DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
11210 offs += sizeof (uint32_t);
11211 }
11212
11213 if (buf->dtb_flags & DTRACEBUF_FILL) {
11214 if (offs + needed > buf->dtb_size - state->dts_reserve) {
11215 buf->dtb_flags |= DTRACEBUF_FULL;
11216 return (-1);
11217 }
11218 }
11219
11220 if (mstate == NULL)
11221 return (offs);
11222
11223 /*
11224 * For ring buffers and fill buffers, the scratch space is always
11225 * the inactive buffer.
11226 */
11227 mstate->dtms_scratch_base = (uintptr_t)buf->dtb_xamot;
11228 mstate->dtms_scratch_size = buf->dtb_size;
11229 mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
11230
11231 return (offs);
11232}
11233
11234static void
11235dtrace_buffer_polish(dtrace_buffer_t *buf)
11236{
11237 ASSERT(buf->dtb_flags & DTRACEBUF_RING);
11238 ASSERT(MUTEX_HELD(&dtrace_lock));
11239
11240 if (!(buf->dtb_flags & DTRACEBUF_WRAPPED))
11241 return;
11242
11243 /*
11244 * We need to polish the ring buffer. There are three cases:
11245 *
11246 * - The first (and presumably most common) is that there is no gap
11247 * between the buffer offset and the wrapped offset. In this case,
11248 * there is nothing in the buffer that isn't valid data; we can
11249 * mark the buffer as polished and return.
11250 *
11251 * - The second (less common than the first but still more common
11252 * than the third) is that there is a gap between the buffer offset
11253 * and the wrapped offset, and the wrapped offset is larger than the
11254 * buffer offset. This can happen because of an alignment issue, or
11255 * can happen because of a call to dtrace_buffer_reserve() that
11256 * didn't subsequently consume the buffer space. In this case,
11257 * we need to zero the data from the buffer offset to the wrapped
11258 * offset.
11259 *
11260 * - The third (and least common) is that there is a gap between the
11261 * buffer offset and the wrapped offset, but the wrapped offset is
11262 * _less_ than the buffer offset. This can only happen because a
11263 * call to dtrace_buffer_reserve() induced a wrap, but the space
11264 * was not subsequently consumed. In this case, we need to zero the
11265 * space from the offset to the end of the buffer _and_ from the
11266 * top of the buffer to the wrapped offset.
11267 */
11268 if (buf->dtb_offset < buf->dtb_xamot_offset) {
11269 bzero(buf->dtb_tomax + buf->dtb_offset,
11270 buf->dtb_xamot_offset - buf->dtb_offset);
11271 }
11272
11273 if (buf->dtb_offset > buf->dtb_xamot_offset) {
11274 bzero(buf->dtb_tomax + buf->dtb_offset,
11275 buf->dtb_size - buf->dtb_offset);
11276 bzero(buf->dtb_tomax, buf->dtb_xamot_offset);
11277 }
11278}
11279
11280/*
11281 * This routine determines if data generated at the specified time has likely
11282 * been entirely consumed at user-level. This routine is called to determine
11283 * if an ECB on a defunct probe (but for an active enabling) can be safely
11284 * disabled and destroyed.
11285 */
11286static int
11287dtrace_buffer_consumed(dtrace_buffer_t *bufs, hrtime_t when)
11288{
11289 int i;
11290
11291 for (i = 0; i < NCPU; i++) {
11292 dtrace_buffer_t *buf = &bufs[i];
11293
11294 if (buf->dtb_size == 0)
11295 continue;
11296
11297 if (buf->dtb_flags & DTRACEBUF_RING)
11298 return (0);
11299
11300 if (!buf->dtb_switched && buf->dtb_offset != 0)
11301 return (0);
11302
11303 if (buf->dtb_switched - buf->dtb_interval < when)
11304 return (0);
11305 }
11306
11307 return (1);
11308}
11309
11310static void
11311dtrace_buffer_free(dtrace_buffer_t *bufs)
11312{
11313 int i;
11314
11315 for (i = 0; i < NCPU; i++) {
11316 dtrace_buffer_t *buf = &bufs[i];
11317
11318 if (buf->dtb_tomax == NULL) {
11319 ASSERT(buf->dtb_xamot == NULL);
11320 ASSERT(buf->dtb_size == 0);
11321 continue;
11322 }
11323
11324 if (buf->dtb_xamot != NULL) {
11325 ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
11326 kmem_free(buf->dtb_xamot, buf->dtb_size);
11327 }
11328
11329 kmem_free(buf->dtb_tomax, buf->dtb_size);
11330 buf->dtb_size = 0;
11331 buf->dtb_tomax = NULL;
11332 buf->dtb_xamot = NULL;
11333 }
11334}
11335
11336/*
11337 * DTrace Enabling Functions
11338 */
11339static dtrace_enabling_t *
11340dtrace_enabling_create(dtrace_vstate_t *vstate)
11341{
11342 dtrace_enabling_t *enab;
11343
11344 enab = kmem_zalloc(sizeof (dtrace_enabling_t), KM_SLEEP);
11345 enab->dten_vstate = vstate;
11346
11347 return (enab);
11348}
11349
11350static void
11351dtrace_enabling_add(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb)
11352{
11353 dtrace_ecbdesc_t **ndesc;
11354 size_t osize, nsize;
11355
11356 /*
11357 * We can't add to enablings after we've enabled them, or after we've
11358 * retained them.
11359 */
11360 ASSERT(enab->dten_probegen == 0);
11361 ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
11362
11363 if (enab->dten_ndesc < enab->dten_maxdesc) {
11364 enab->dten_desc[enab->dten_ndesc++] = ecb;
11365 return;
11366 }
11367
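	/*
	 * Grow the descriptor array geometrically: double its capacity, or
	 * start with a single slot if nothing has been allocated yet.
	 */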
11368	osize = enab->dten_maxdesc * sizeof (dtrace_ecbdesc_t *);
11369
11370 if (enab->dten_maxdesc == 0) {
11371 enab->dten_maxdesc = 1;
11372 } else {
11373 enab->dten_maxdesc <<= 1;
11374 }
11375
11376 ASSERT(enab->dten_ndesc < enab->dten_maxdesc);
11377
11378	nsize = enab->dten_maxdesc * sizeof (dtrace_ecbdesc_t *);
11379 ndesc = kmem_zalloc(nsize, KM_SLEEP);
11380 bcopy(enab->dten_desc, ndesc, osize);
11381 if (enab->dten_desc != NULL)
11382 kmem_free(enab->dten_desc, osize);
11383
11384 enab->dten_desc = ndesc;
11385 enab->dten_desc[enab->dten_ndesc++] = ecb;
11386}
11387
11388static void
11389dtrace_enabling_addlike(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb,
11390 dtrace_probedesc_t *pd)
11391{
11392 dtrace_ecbdesc_t *new;
11393 dtrace_predicate_t *pred;
11394 dtrace_actdesc_t *act;
11395
11396 /*
11397 * We're going to create a new ECB description that matches the
11398 * specified ECB in every way, but has the specified probe description.
11399 */
11400 new = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
11401
11402 if ((pred = ecb->dted_pred.dtpdd_predicate) != NULL)
11403 dtrace_predicate_hold(pred);
11404
11405 for (act = ecb->dted_action; act != NULL; act = act->dtad_next)
11406 dtrace_actdesc_hold(act);
11407
11408 new->dted_action = ecb->dted_action;
11409 new->dted_pred = ecb->dted_pred;
11410 new->dted_probe = *pd;
11411 new->dted_uarg = ecb->dted_uarg;
11412
11413 dtrace_enabling_add(enab, new);
11414}
11415
11416static void
11417dtrace_enabling_dump(dtrace_enabling_t *enab)
11418{
11419 int i;
11420
11421 for (i = 0; i < enab->dten_ndesc; i++) {
11422 dtrace_probedesc_t *desc = &enab->dten_desc[i]->dted_probe;
11423
11424 cmn_err(CE_NOTE, "enabling probe %d (%s:%s:%s:%s)", i,
11425 desc->dtpd_provider, desc->dtpd_mod,
11426 desc->dtpd_func, desc->dtpd_name);
11427 }
11428}
11429
11430static void
11431dtrace_enabling_destroy(dtrace_enabling_t *enab)
11432{
11433 int i;
11434 dtrace_ecbdesc_t *ep;
11435 dtrace_vstate_t *vstate = enab->dten_vstate;
11436
11437 ASSERT(MUTEX_HELD(&dtrace_lock));
11438
11439 for (i = 0; i < enab->dten_ndesc; i++) {
11440 dtrace_actdesc_t *act, *next;
11441 dtrace_predicate_t *pred;
11442
11443 ep = enab->dten_desc[i];
11444
11445 if ((pred = ep->dted_pred.dtpdd_predicate) != NULL)
11446 dtrace_predicate_release(pred, vstate);
11447
11448 for (act = ep->dted_action; act != NULL; act = next) {
11449 next = act->dtad_next;
11450 dtrace_actdesc_release(act, vstate);
11451 }
11452
11453 kmem_free(ep, sizeof (dtrace_ecbdesc_t));
11454 }
11455
11456 if (enab->dten_desc != NULL)
11457 kmem_free(enab->dten_desc,
11458		    enab->dten_maxdesc * sizeof (dtrace_ecbdesc_t *));
11459
11460 /*
11461 * If this was a retained enabling, decrement the dts_nretained count
11462 * and take it off of the dtrace_retained list.
11463 */
11464 if (enab->dten_prev != NULL || enab->dten_next != NULL ||
11465 dtrace_retained == enab) {
11466 ASSERT(enab->dten_vstate->dtvs_state != NULL);
11467 ASSERT(enab->dten_vstate->dtvs_state->dts_nretained > 0);
11468 enab->dten_vstate->dtvs_state->dts_nretained--;
11469 dtrace_retained_gen++;
11470 }
11471
11472 if (enab->dten_prev == NULL) {
11473 if (dtrace_retained == enab) {
11474 dtrace_retained = enab->dten_next;
11475
11476 if (dtrace_retained != NULL)
11477 dtrace_retained->dten_prev = NULL;
11478 }
11479 } else {
11480 ASSERT(enab != dtrace_retained);
11481 ASSERT(dtrace_retained != NULL);
11482 enab->dten_prev->dten_next = enab->dten_next;
11483 }
11484
11485 if (enab->dten_next != NULL) {
11486 ASSERT(dtrace_retained != NULL);
11487 enab->dten_next->dten_prev = enab->dten_prev;
11488 }
11489
11490 kmem_free(enab, sizeof (dtrace_enabling_t));
11491}
11492
11493static int
11494dtrace_enabling_retain(dtrace_enabling_t *enab)
11495{
11496 dtrace_state_t *state;
11497
11498 ASSERT(MUTEX_HELD(&dtrace_lock));
11499 ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
11500 ASSERT(enab->dten_vstate != NULL);
11501
11502 state = enab->dten_vstate->dtvs_state;
11503 ASSERT(state != NULL);
11504
11505 /*
11506 * We only allow each state to retain dtrace_retain_max enablings.
11507 */
11508 if (state->dts_nretained >= dtrace_retain_max)
11509 return (ENOSPC);
11510
11511 state->dts_nretained++;
11512 dtrace_retained_gen++;
11513
11514 if (dtrace_retained == NULL) {
11515 dtrace_retained = enab;
11516 return (0);
11517 }
11518
11519 enab->dten_next = dtrace_retained;
11520 dtrace_retained->dten_prev = enab;
11521 dtrace_retained = enab;
11522
11523 return (0);
11524}
11525
11526static int
11527dtrace_enabling_replicate(dtrace_state_t *state, dtrace_probedesc_t *match,
11528 dtrace_probedesc_t *create)
11529{
11530 dtrace_enabling_t *new, *enab;
11531 int found = 0, err = ENOENT;
11532
11533 ASSERT(MUTEX_HELD(&dtrace_lock));
11534 ASSERT(strlen(match->dtpd_provider) < DTRACE_PROVNAMELEN);
11535 ASSERT(strlen(match->dtpd_mod) < DTRACE_MODNAMELEN);
11536 ASSERT(strlen(match->dtpd_func) < DTRACE_FUNCNAMELEN);
11537 ASSERT(strlen(match->dtpd_name) < DTRACE_NAMELEN);
11538
11539 new = dtrace_enabling_create(&state->dts_vstate);
11540
11541 /*
11542 * Iterate over all retained enablings, looking for enablings that
11543 * match the specified state.
11544 */
11545 for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
11546 int i;
11547
11548 /*
11549 * dtvs_state can only be NULL for helper enablings -- and
11550 * helper enablings can't be retained.
11551 */
11552 ASSERT(enab->dten_vstate->dtvs_state != NULL);
11553
11554 if (enab->dten_vstate->dtvs_state != state)
11555 continue;
11556
11557 /*
11558 * Now iterate over each probe description; we're looking for
11559 * an exact match to the specified probe description.
11560 */
11561 for (i = 0; i < enab->dten_ndesc; i++) {
11562 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
11563 dtrace_probedesc_t *pd = &ep->dted_probe;
11564
11565 if (strcmp(pd->dtpd_provider, match->dtpd_provider))
11566 continue;
11567
11568 if (strcmp(pd->dtpd_mod, match->dtpd_mod))
11569 continue;
11570
11571 if (strcmp(pd->dtpd_func, match->dtpd_func))
11572 continue;
11573
11574 if (strcmp(pd->dtpd_name, match->dtpd_name))
11575 continue;
11576
11577 /*
11578 * We have a winning probe! Add it to our growing
11579 * enabling.
11580 */
11581 found = 1;
11582 dtrace_enabling_addlike(new, ep, create);
11583 }
11584 }
11585
11586 if (!found || (err = dtrace_enabling_retain(new)) != 0) {
11587 dtrace_enabling_destroy(new);
11588 return (err);
11589 }
11590
11591 return (0);
11592}
11593
11594static void
11595dtrace_enabling_retract(dtrace_state_t *state)
11596{
11597 dtrace_enabling_t *enab, *next;
11598
11599 ASSERT(MUTEX_HELD(&dtrace_lock));
11600
11601 /*
11602	 * Iterate over all retained enablings, destroying those retained
11603	 * for the specified state.
11604 */
11605 for (enab = dtrace_retained; enab != NULL; enab = next) {
11606 next = enab->dten_next;
11607
11608 /*
11609 * dtvs_state can only be NULL for helper enablings -- and
11610 * helper enablings can't be retained.
11611 */
11612 ASSERT(enab->dten_vstate->dtvs_state != NULL);
11613
11614 if (enab->dten_vstate->dtvs_state == state) {
11615 ASSERT(state->dts_nretained > 0);
11616 dtrace_enabling_destroy(enab);
11617 }
11618 }
11619
11620 ASSERT(state->dts_nretained == 0);
11621}
11622
11623static int
11624dtrace_enabling_match(dtrace_enabling_t *enab, int *nmatched)
11625{
11626 int i = 0;
11627 int matched = 0;
11628
11629 ASSERT(MUTEX_HELD(&cpu_lock));
11630 ASSERT(MUTEX_HELD(&dtrace_lock));
11631
11632 for (i = 0; i < enab->dten_ndesc; i++) {
11633 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
11634
11635 enab->dten_current = ep;
11636 enab->dten_error = 0;
11637
11638 matched += dtrace_probe_enable(&ep->dted_probe, enab);
11639
11640 if (enab->dten_error != 0) {
11641 /*
11642 * If we get an error half-way through enabling the
11643 * probes, we kick out -- perhaps with some number of
11644 * them enabled. Leaving enabled probes enabled may
11645 * be slightly confusing for user-level, but we expect
11646 * that no one will attempt to actually drive on in
11647 * the face of such errors. If this is an anonymous
11648 * enabling (indicated with a NULL nmatched pointer),
11649 * we cmn_err() a message. We aren't expecting to
11650			 * get such an error -- insofar as it can exist at all,
11651			 * it would be the result of corrupted DOF in the driver
11652 * properties.
11653 */
11654 if (nmatched == NULL) {
11655 cmn_err(CE_WARN, "dtrace_enabling_match() "
11656 "error on %p: %d", (void *)ep,
11657 enab->dten_error);
11658 }
11659
11660 return (enab->dten_error);
11661 }
11662 }
11663
11664 enab->dten_probegen = dtrace_probegen;
11665 if (nmatched != NULL)
11666 *nmatched = matched;
11667
11668 return (0);
11669}
11670
11671static void
11672dtrace_enabling_matchall(void)
11673{
11674 dtrace_enabling_t *enab;
11675
11676 mutex_enter(&cpu_lock);
11677 mutex_enter(&dtrace_lock);
11678
11679 /*
11680 * Iterate over all retained enablings to see if any probes match
11681 * against them. We only perform this operation on enablings for which
11682 * we have sufficient permissions by virtue of being in the global zone
11683 * or in the same zone as the DTrace client. Because we can be called
11684 * after dtrace_detach() has been called, we cannot assert that there
11685 * are retained enablings. We can safely load from dtrace_retained,
11686 * however: the taskq_destroy() at the end of dtrace_detach() will
11687 * block pending our completion.
11688 */
11689 for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
11690#if defined(sun)
11691 cred_t *cr = enab->dten_vstate->dtvs_state->dts_cred.dcr_cred;
11692
11693 if (INGLOBALZONE(curproc) ||
11694 cr != NULL && getzoneid() == crgetzoneid(cr))
11695#endif
11696 (void) dtrace_enabling_match(enab, NULL);
11697 }
11698
11699 mutex_exit(&dtrace_lock);
11700 mutex_exit(&cpu_lock);
11701}
11702
11703/*
11704 * If an enabling is to be enabled without having matched probes (that is, if
11705 * dtrace_state_go() is to be called on the underlying dtrace_state_t), the
11706 * enabling must be _primed_ by creating an ECB for every ECB description.
11707 * This must be done to assure that we know the number of speculations, the
11708 * number of aggregations, the minimum buffer size needed, etc. before we
11709 * transition out of DTRACE_ACTIVITY_INACTIVE. To do this without actually
11710 * enabling any probes, we create ECBs for every ECB description, but with a
11711 * NULL probe -- which is exactly what this function does.
11712 */
11713static void
11714dtrace_enabling_prime(dtrace_state_t *state)
11715{
11716 dtrace_enabling_t *enab;
11717 int i;
11718
11719 for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
11720 ASSERT(enab->dten_vstate->dtvs_state != NULL);
11721
11722 if (enab->dten_vstate->dtvs_state != state)
11723 continue;
11724
11725 /*
11726 * We don't want to prime an enabling more than once, lest
11727 * we allow a malicious user to induce resource exhaustion.
11728 * (The ECBs that result from priming an enabling aren't
11729 * leaked -- but they also aren't deallocated until the
11730 * consumer state is destroyed.)
11731 */
11732 if (enab->dten_primed)
11733 continue;
11734
11735 for (i = 0; i < enab->dten_ndesc; i++) {
11736 enab->dten_current = enab->dten_desc[i];
11737 (void) dtrace_probe_enable(NULL, enab);
11738 }
11739
11740 enab->dten_primed = 1;
11741 }
11742}
11743
11744/*
11745 * Called to indicate that probes should be provided due to retained
11746 * enablings. This is implemented in terms of dtrace_probe_provide(), but it
11747 * must first take a lap through the enablings, calling the dtps_provide()
11748 * entry point explicitly, to allow for autocreated probes.
11749 */
11750static void
11751dtrace_enabling_provide(dtrace_provider_t *prv)
11752{
11753 int i, all = 0;
11754 dtrace_probedesc_t desc;
11755 dtrace_genid_t gen;
11756
11757 ASSERT(MUTEX_HELD(&dtrace_lock));
11758 ASSERT(MUTEX_HELD(&dtrace_provider_lock));
11759
11760 if (prv == NULL) {
11761 all = 1;
11762 prv = dtrace_provider;
11763 }
11764
11765 do {
11766 dtrace_enabling_t *enab;
11767 void *parg = prv->dtpv_arg;
11768
11769retry:
11770 gen = dtrace_retained_gen;
11771 for (enab = dtrace_retained; enab != NULL;
11772 enab = enab->dten_next) {
11773 for (i = 0; i < enab->dten_ndesc; i++) {
11774 desc = enab->dten_desc[i]->dted_probe;
11775 mutex_exit(&dtrace_lock);
11776 prv->dtpv_pops.dtps_provide(parg, &desc);
11777 mutex_enter(&dtrace_lock);
11778 /*
11779 * Process the retained enablings again if
11780 * they have changed while we weren't holding
11781 * dtrace_lock.
11782 */
11783 if (gen != dtrace_retained_gen)
11784 goto retry;
11785 }
11786 }
11787 } while (all && (prv = prv->dtpv_next) != NULL);
11788
11789 mutex_exit(&dtrace_lock);
11790 dtrace_probe_provide(NULL, all ? NULL : prv);
11791 mutex_enter(&dtrace_lock);
11792}
11793
11794/*
11795 * Called to reap ECBs that are attached to probes from defunct providers.
11796 */
11797static void
11798dtrace_enabling_reap(void)
11799{
11800 dtrace_provider_t *prov;
11801 dtrace_probe_t *probe;
11802 dtrace_ecb_t *ecb;
11803 hrtime_t when;
11804 int i;
11805
11806 mutex_enter(&cpu_lock);
11807 mutex_enter(&dtrace_lock);
11808
11809 for (i = 0; i < dtrace_nprobes; i++) {
11810 if ((probe = dtrace_probes[i]) == NULL)
11811 continue;
11812
11813 if (probe->dtpr_ecb == NULL)
11814 continue;
11815
11816 prov = probe->dtpr_provider;
11817
11818 if ((when = prov->dtpv_defunct) == 0)
11819 continue;
11820
11821 /*
11822 * We have ECBs on a defunct provider: we want to reap these
11823 * ECBs to allow the provider to unregister. The destruction
11824 * of these ECBs must be done carefully: if we destroy the ECB
11825 * and the consumer later wishes to consume an EPID that
11826 * corresponds to the destroyed ECB (and if the EPID metadata
11827 * has not been previously consumed), the consumer will abort
11828 * processing on the unknown EPID. To reduce (but not, sadly,
11829 * eliminate) the possibility of this, we will only destroy an
11830 * ECB for a defunct provider if, for the state that
11831 * corresponds to the ECB:
11832 *
11833 * (a) There is no speculative tracing (which can effectively
11834 * cache an EPID for an arbitrary amount of time).
11835 *
11836 * (b) The principal buffers have been switched twice since the
11837 * provider became defunct.
11838 *
11839 * (c) The aggregation buffers are of zero size or have been
11840 * switched twice since the provider became defunct.
11841 *
11842 * We use dts_speculates to determine (a) and call a function
11843 * (dtrace_buffer_consumed()) to determine (b) and (c). Note
11844 * that as soon as we've been unable to destroy one of the ECBs
11845 * associated with the probe, we quit trying -- reaping is only
11846 * fruitful in as much as we can destroy all ECBs associated
11847 * with the defunct provider's probes.
11848 */
11849 while ((ecb = probe->dtpr_ecb) != NULL) {
11850 dtrace_state_t *state = ecb->dte_state;
11851 dtrace_buffer_t *buf = state->dts_buffer;
11852 dtrace_buffer_t *aggbuf = state->dts_aggbuffer;
11853
11854 if (state->dts_speculates)
11855 break;
11856
11857 if (!dtrace_buffer_consumed(buf, when))
11858 break;
11859
11860 if (!dtrace_buffer_consumed(aggbuf, when))
11861 break;
11862
11863 dtrace_ecb_disable(ecb);
11864 ASSERT(probe->dtpr_ecb != ecb);
11865 dtrace_ecb_destroy(ecb);
11866 }
11867 }
11868
11869 mutex_exit(&dtrace_lock);
11870 mutex_exit(&cpu_lock);
11871}
11872
11873/*
11874 * DTrace DOF Functions
11875 */
11876/*ARGSUSED*/
11877static void
11878dtrace_dof_error(dof_hdr_t *dof, const char *str)
11879{
11880 if (dtrace_err_verbose)
11881 cmn_err(CE_WARN, "failed to process DOF: %s", str);
11882
11883#ifdef DTRACE_ERRDEBUG
11884 dtrace_errdebug(str);
11885#endif
11886}
11887
11888/*
11889 * Create DOF out of a currently enabled state. Right now, we only create
11890 * DOF containing the run-time options -- but this could be expanded to create
11891 * complete DOF representing the enabled state.
11892 */
11893static dof_hdr_t *
11894dtrace_dof_create(dtrace_state_t *state)
11895{
11896 dof_hdr_t *dof;
11897 dof_sec_t *sec;
11898 dof_optdesc_t *opt;
11899 int i, len = sizeof (dof_hdr_t) +
11900 roundup(sizeof (dof_sec_t), sizeof (uint64_t)) +
11901 sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
11902
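	/*
	 * The DOF consists of the header, the section header (padded to
	 * 8-byte alignment) and one option descriptor per option.
	 */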
11903 ASSERT(MUTEX_HELD(&dtrace_lock));
11904
11905 dof = kmem_zalloc(len, KM_SLEEP);
11906 dof->dofh_ident[DOF_ID_MAG0] = DOF_MAG_MAG0;
11907 dof->dofh_ident[DOF_ID_MAG1] = DOF_MAG_MAG1;
11908 dof->dofh_ident[DOF_ID_MAG2] = DOF_MAG_MAG2;
11909 dof->dofh_ident[DOF_ID_MAG3] = DOF_MAG_MAG3;
11910
11911 dof->dofh_ident[DOF_ID_MODEL] = DOF_MODEL_NATIVE;
11912 dof->dofh_ident[DOF_ID_ENCODING] = DOF_ENCODE_NATIVE;
11913 dof->dofh_ident[DOF_ID_VERSION] = DOF_VERSION;
11914 dof->dofh_ident[DOF_ID_DIFVERS] = DIF_VERSION;
11915 dof->dofh_ident[DOF_ID_DIFIREG] = DIF_DIR_NREGS;
11916 dof->dofh_ident[DOF_ID_DIFTREG] = DIF_DTR_NREGS;
11917
11918 dof->dofh_flags = 0;
11919 dof->dofh_hdrsize = sizeof (dof_hdr_t);
11920 dof->dofh_secsize = sizeof (dof_sec_t);
11921 dof->dofh_secnum = 1; /* only DOF_SECT_OPTDESC */
11922 dof->dofh_secoff = sizeof (dof_hdr_t);
11923 dof->dofh_loadsz = len;
11924 dof->dofh_filesz = len;
11925 dof->dofh_pad = 0;
11926
11927 /*
11928 * Fill in the option section header...
11929 */
11930 sec = (dof_sec_t *)((uintptr_t)dof + sizeof (dof_hdr_t));
11931 sec->dofs_type = DOF_SECT_OPTDESC;
11932 sec->dofs_align = sizeof (uint64_t);
11933 sec->dofs_flags = DOF_SECF_LOAD;
11934 sec->dofs_entsize = sizeof (dof_optdesc_t);
11935
11936 opt = (dof_optdesc_t *)((uintptr_t)sec +
11937 roundup(sizeof (dof_sec_t), sizeof (uint64_t)));
11938
11939 sec->dofs_offset = (uintptr_t)opt - (uintptr_t)dof;
11940 sec->dofs_size = sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
11941
11942 for (i = 0; i < DTRACEOPT_MAX; i++) {
11943 opt[i].dofo_option = i;
11944 opt[i].dofo_strtab = DOF_SECIDX_NONE;
11945 opt[i].dofo_value = state->dts_options[i];
11946 }
11947
11948 return (dof);
11949}
11950
11951static dof_hdr_t *
11952dtrace_dof_copyin(uintptr_t uarg, int *errp)
11953{
11954 dof_hdr_t hdr, *dof;
11955
11956 ASSERT(!MUTEX_HELD(&dtrace_lock));
11957
11958 /*
11959 * First, we're going to copyin() the sizeof (dof_hdr_t).
11960 */
11961 if (copyin((void *)uarg, &hdr, sizeof (hdr)) != 0) {
11962 dtrace_dof_error(NULL, "failed to copyin DOF header");
11963 *errp = EFAULT;
11964 return (NULL);
11965 }
11966
11967 /*
11968 * Now we'll allocate the entire DOF and copy it in -- provided
11969 * that the length isn't outrageous.
11970 */
11971 if (hdr.dofh_loadsz >= dtrace_dof_maxsize) {
11972 dtrace_dof_error(&hdr, "load size exceeds maximum");
11973 *errp = E2BIG;
11974 return (NULL);
11975 }
11976
11977 if (hdr.dofh_loadsz < sizeof (hdr)) {
11978 dtrace_dof_error(&hdr, "invalid load size");
11979 *errp = EINVAL;
11980 return (NULL);
11981 }
11982
11983 dof = kmem_alloc(hdr.dofh_loadsz, KM_SLEEP);
11984
11985 if (copyin((void *)uarg, dof, hdr.dofh_loadsz) != 0 ||
11986 dof->dofh_loadsz != hdr.dofh_loadsz) {
11987 kmem_free(dof, hdr.dofh_loadsz);
11988 *errp = EFAULT;
11989 return (NULL);
11990 }
11991
11992 return (dof);
11993}
11994
11995#if !defined(sun)
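/*
 * Convert a single hex digit to its numeric value.  This is used below to
 * decode DOF that is passed in as a hex-encoded string on FreeBSD.
 */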
11996static __inline uchar_t
11997 dtrace_dof_char(char c)
{
11998 switch (c) {
11999 case '0':
12000 case '1':
12001 case '2':
12002 case '3':
12003 case '4':
12004 case '5':
12005 case '6':
12006 case '7':
12007 case '8':
12008 case '9':
12009 return (c - '0');
12010 case 'A':
12011 case 'B':
12012 case 'C':
12013 case 'D':
12014 case 'E':
12015 case 'F':
12016 return (c - 'A' + 10);
12017 case 'a':
12018 case 'b':
12019 case 'c':
12020 case 'd':
12021 case 'e':
12022 case 'f':
12023 return (c - 'a' + 10);
12024 }
12025 /* Should not reach here. */
12026 return (0);
12027}
12028#endif
12029
12030static dof_hdr_t *
12031dtrace_dof_property(const char *name)
12032{
12033 uchar_t *buf;
12034 uint64_t loadsz;
12035 unsigned int len, i;
12036 dof_hdr_t *dof;
12037
12038#if defined(sun)
12039 /*
12040	 * Unfortunately, arrays of values in .conf files are always (and
12041 * only) interpreted to be integer arrays. We must read our DOF
12042 * as an integer array, and then squeeze it into a byte array.
12043 */
12044 if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dtrace_devi, 0,
12045 (char *)name, (int **)&buf, &len) != DDI_PROP_SUCCESS)
12046 return (NULL);
12047
12048 for (i = 0; i < len; i++)
12049 buf[i] = (uchar_t)(((int *)buf)[i]);
12050
12051 if (len < sizeof (dof_hdr_t)) {
12052 ddi_prop_free(buf);
12053 dtrace_dof_error(NULL, "truncated header");
12054 return (NULL);
12055 }
12056
12057 if (len < (loadsz = ((dof_hdr_t *)buf)->dofh_loadsz)) {
12058 ddi_prop_free(buf);
12059 dtrace_dof_error(NULL, "truncated DOF");
12060 return (NULL);
12061 }
12062
12063 if (loadsz >= dtrace_dof_maxsize) {
12064 ddi_prop_free(buf);
12065 dtrace_dof_error(NULL, "oversized DOF");
12066 return (NULL);
12067 }
12068
12069 dof = kmem_alloc(loadsz, KM_SLEEP);
12070 bcopy(buf, dof, loadsz);
12071 ddi_prop_free(buf);
12072#else
12073 char *p;
12074 char *p_env;
12075
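	/*
	 * On FreeBSD, the DOF is passed in as a hex-encoded string in a
	 * kernel environment variable; each pair of characters encodes one
	 * byte of DOF.
	 */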
12076 if ((p_env = getenv(name)) == NULL)
12077 return (NULL);
12078
12079 len = strlen(p_env) / 2;
12080
12081 buf = kmem_alloc(len, KM_SLEEP);
12082
12083	dof = (dof_hdr_t *)buf;
12084
12085 p = p_env;
12086
12087 for (i = 0; i < len; i++) {
12088 buf[i] = (dtrace_dof_char(p[0]) << 4) |
12089 dtrace_dof_char(p[1]);
12090 p += 2;
12091 }
12092
12093 freeenv(p_env);
12094
12095 if (len < sizeof (dof_hdr_t)) {
12096 kmem_free(buf, 0);
12097 dtrace_dof_error(NULL, "truncated header");
12098 return (NULL);
12099 }
12100
12101 if (len < (loadsz = dof->dofh_loadsz)) {
12102 kmem_free(buf, 0);
12103 dtrace_dof_error(NULL, "truncated DOF");
12104 return (NULL);
12105 }
12106
12107 if (loadsz >= dtrace_dof_maxsize) {
12108 kmem_free(buf, 0);
12109 dtrace_dof_error(NULL, "oversized DOF");
12110 return (NULL);
12111 }
12112#endif
12113
12114 return (dof);
12115}
12116
12117static void
12118dtrace_dof_destroy(dof_hdr_t *dof)
12119{
12120 kmem_free(dof, dof->dofh_loadsz);
12121}
12122
12123/*
12124 * Return the dof_sec_t pointer corresponding to a given section index. If the
12125 * index is not valid, dtrace_dof_error() is called and NULL is returned. If
12126 * a type other than DOF_SECT_NONE is specified, the header is checked against
12127 * this type and NULL is returned if the types do not match.
12128 */
12129static dof_sec_t *
12130dtrace_dof_sect(dof_hdr_t *dof, uint32_t type, dof_secidx_t i)
12131{
12132 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)
12133 ((uintptr_t)dof + dof->dofh_secoff + i * dof->dofh_secsize);
12134
12135 if (i >= dof->dofh_secnum) {
12136 dtrace_dof_error(dof, "referenced section index is invalid");
12137 return (NULL);
12138 }
12139
12140 if (!(sec->dofs_flags & DOF_SECF_LOAD)) {
12141 dtrace_dof_error(dof, "referenced section is not loadable");
12142 return (NULL);
12143 }
12144
12145 if (type != DOF_SECT_NONE && type != sec->dofs_type) {
12146 dtrace_dof_error(dof, "referenced section is the wrong type");
12147 return (NULL);
12148 }
12149
12150 return (sec);
12151}
12152
12153static dtrace_probedesc_t *
12154dtrace_dof_probedesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_probedesc_t *desc)
12155{
12156 dof_probedesc_t *probe;
12157 dof_sec_t *strtab;
12158 uintptr_t daddr = (uintptr_t)dof;
12159 uintptr_t str;
12160 size_t size;
12161
12162 if (sec->dofs_type != DOF_SECT_PROBEDESC) {
12163 dtrace_dof_error(dof, "invalid probe section");
12164 return (NULL);
12165 }
12166
12167 if (sec->dofs_align != sizeof (dof_secidx_t)) {
12168 dtrace_dof_error(dof, "bad alignment in probe description");
12169 return (NULL);
12170 }
12171
12172 if (sec->dofs_offset + sizeof (dof_probedesc_t) > dof->dofh_loadsz) {
12173 dtrace_dof_error(dof, "truncated probe description");
12174 return (NULL);
12175 }
12176
12177 probe = (dof_probedesc_t *)(uintptr_t)(daddr + sec->dofs_offset);
12178 strtab = dtrace_dof_sect(dof, DOF_SECT_STRTAB, probe->dofp_strtab);
12179
12180 if (strtab == NULL)
12181 return (NULL);
12182
12183 str = daddr + strtab->dofs_offset;
12184 size = strtab->dofs_size;
12185
12186 if (probe->dofp_provider >= strtab->dofs_size) {
12187 dtrace_dof_error(dof, "corrupt probe provider");
12188 return (NULL);
12189 }
12190
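	/*
	 * Copy each of the four probe description names out of the string
	 * table, bounding each copy by both the destination buffer and the
	 * space remaining in the table.
	 */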
12191 (void) strncpy(desc->dtpd_provider,
12192 (char *)(str + probe->dofp_provider),
12193 MIN(DTRACE_PROVNAMELEN - 1, size - probe->dofp_provider));
12194
12195 if (probe->dofp_mod >= strtab->dofs_size) {
12196 dtrace_dof_error(dof, "corrupt probe module");
12197 return (NULL);
12198 }
12199
12200 (void) strncpy(desc->dtpd_mod, (char *)(str + probe->dofp_mod),
12201 MIN(DTRACE_MODNAMELEN - 1, size - probe->dofp_mod));
12202
12203 if (probe->dofp_func >= strtab->dofs_size) {
12204 dtrace_dof_error(dof, "corrupt probe function");
12205 return (NULL);
12206 }
12207
12208 (void) strncpy(desc->dtpd_func, (char *)(str + probe->dofp_func),
12209 MIN(DTRACE_FUNCNAMELEN - 1, size - probe->dofp_func));
12210
12211 if (probe->dofp_name >= strtab->dofs_size) {
12212 dtrace_dof_error(dof, "corrupt probe name");
12213 return (NULL);
12214 }
12215
12216 (void) strncpy(desc->dtpd_name, (char *)(str + probe->dofp_name),
12217 MIN(DTRACE_NAMELEN - 1, size - probe->dofp_name));
12218
12219 return (desc);
12220}
12221
12222static dtrace_difo_t *
12223dtrace_dof_difo(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
12224 cred_t *cr)
12225{
12226 dtrace_difo_t *dp;
12227 size_t ttl = 0;
12228 dof_difohdr_t *dofd;
12229 uintptr_t daddr = (uintptr_t)dof;
12230 size_t max = dtrace_difo_maxsize;
12231 int i, l, n;
12232
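	/*
	 * This table drives the parsing of the DIFO sub-sections: each entry
	 * maps a DOF section type to the corresponding buffer and length
	 * members of the dtrace_difo_t, along with the expected entry size
	 * and alignment for that section and an error message to emit if
	 * the section appears more than once.
	 */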
12233 static const struct {
12234 int section;
12235 int bufoffs;
12236 int lenoffs;
12237 int entsize;
12238 int align;
12239 const char *msg;
12240 } difo[] = {
12241 { DOF_SECT_DIF, offsetof(dtrace_difo_t, dtdo_buf),
12242 offsetof(dtrace_difo_t, dtdo_len), sizeof (dif_instr_t),
12243 sizeof (dif_instr_t), "multiple DIF sections" },
12244
12245 { DOF_SECT_INTTAB, offsetof(dtrace_difo_t, dtdo_inttab),
12246 offsetof(dtrace_difo_t, dtdo_intlen), sizeof (uint64_t),
12247 sizeof (uint64_t), "multiple integer tables" },
12248
12249 { DOF_SECT_STRTAB, offsetof(dtrace_difo_t, dtdo_strtab),
12250 offsetof(dtrace_difo_t, dtdo_strlen), 0,
12251 sizeof (char), "multiple string tables" },
12252
12253 { DOF_SECT_VARTAB, offsetof(dtrace_difo_t, dtdo_vartab),
12254 offsetof(dtrace_difo_t, dtdo_varlen), sizeof (dtrace_difv_t),
12255 sizeof (uint_t), "multiple variable tables" },
12256
12257 { DOF_SECT_NONE, 0, 0, 0, 0, NULL }
12258 };
12259
12260 if (sec->dofs_type != DOF_SECT_DIFOHDR) {
12261 dtrace_dof_error(dof, "invalid DIFO header section");
12262 return (NULL);
12263 }
12264
12265 if (sec->dofs_align != sizeof (dof_secidx_t)) {
12266 dtrace_dof_error(dof, "bad alignment in DIFO header");
12267 return (NULL);
12268 }
12269
12270 if (sec->dofs_size < sizeof (dof_difohdr_t) ||
12271 sec->dofs_size % sizeof (dof_secidx_t)) {
12272 dtrace_dof_error(dof, "bad size in DIFO header");
12273 return (NULL);
12274 }
12275
12276 dofd = (dof_difohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
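	/*
	 * dof_difohdr_t declares dofd_links as a one-element array, so we
	 * add one to account for the link contained in the header itself.
	 */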
12277 n = (sec->dofs_size - sizeof (*dofd)) / sizeof (dof_secidx_t) + 1;
12278
12279 dp = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
12280 dp->dtdo_rtype = dofd->dofd_rtype;
12281
12282 for (l = 0; l < n; l++) {
12283 dof_sec_t *subsec;
12284 void **bufp;
12285 uint32_t *lenp;
12286
12287 if ((subsec = dtrace_dof_sect(dof, DOF_SECT_NONE,
12288 dofd->dofd_links[l])) == NULL)
12289 goto err; /* invalid section link */
12290
12291 if (ttl + subsec->dofs_size > max) {
12292 dtrace_dof_error(dof, "exceeds maximum size");
12293 goto err;
12294 }
12295
12296 ttl += subsec->dofs_size;
12297
12298 for (i = 0; difo[i].section != DOF_SECT_NONE; i++) {
12299 if (subsec->dofs_type != difo[i].section)
12300 continue;
12301
12302 if (!(subsec->dofs_flags & DOF_SECF_LOAD)) {
12303 dtrace_dof_error(dof, "section not loaded");
12304 goto err;
12305 }
12306
12307 if (subsec->dofs_align != difo[i].align) {
12308 dtrace_dof_error(dof, "bad alignment");
12309 goto err;
12310 }
12311
12312 bufp = (void **)((uintptr_t)dp + difo[i].bufoffs);
12313 lenp = (uint32_t *)((uintptr_t)dp + difo[i].lenoffs);
12314
12315 if (*bufp != NULL) {
12316 dtrace_dof_error(dof, difo[i].msg);
12317 goto err;
12318 }
12319
12320 if (difo[i].entsize != subsec->dofs_entsize) {
12321 dtrace_dof_error(dof, "entry size mismatch");
12322 goto err;
12323 }
12324
12325 if (subsec->dofs_entsize != 0 &&
12326 (subsec->dofs_size % subsec->dofs_entsize) != 0) {
12327 dtrace_dof_error(dof, "corrupt entry size");
12328 goto err;
12329 }
12330
12331 *lenp = subsec->dofs_size;
12332 *bufp = kmem_alloc(subsec->dofs_size, KM_SLEEP);
12333 bcopy((char *)(uintptr_t)(daddr + subsec->dofs_offset),
12334 *bufp, subsec->dofs_size);
12335
12336 if (subsec->dofs_entsize != 0)
12337 *lenp /= subsec->dofs_entsize;
12338
12339 break;
12340 }
12341
12342 /*
12343 * If we encounter a loadable DIFO sub-section that is not
12344 * known to us, assume this is a broken program and fail.
12345 */
12346 if (difo[i].section == DOF_SECT_NONE &&
12347 (subsec->dofs_flags & DOF_SECF_LOAD)) {
12348 dtrace_dof_error(dof, "unrecognized DIFO subsection");
12349 goto err;
12350 }
12351 }
12352
12353 if (dp->dtdo_buf == NULL) {
12354 /*
12355 * We can't have a DIF object without DIF text.
12356 */
12357 dtrace_dof_error(dof, "missing DIF text");
12358 goto err;
12359 }
12360
12361 /*
12362 * Before we validate the DIF object, run through the variable table
12363	 * looking for the strings -- if any of their sizes are zero, we'll
12364	 * set them to be the system-wide default string size.  Note that
12365 * this should _not_ happen if the "strsize" option has been set --
12366 * in this case, the compiler should have set the size to reflect the
12367 * setting of the option.
12368 */
12369 for (i = 0; i < dp->dtdo_varlen; i++) {
12370 dtrace_difv_t *v = &dp->dtdo_vartab[i];
12371 dtrace_diftype_t *t = &v->dtdv_type;
12372
12373 if (v->dtdv_id < DIF_VAR_OTHER_UBASE)
12374 continue;
12375
12376 if (t->dtdt_kind == DIF_TYPE_STRING && t->dtdt_size == 0)
12377 t->dtdt_size = dtrace_strsize_default;
12378 }
12379
12380 if (dtrace_difo_validate(dp, vstate, DIF_DIR_NREGS, cr) != 0)
12381 goto err;
12382
12383 dtrace_difo_init(dp, vstate);
12384 return (dp);
12385
12386err:
12387 kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
12388 kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
12389 kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
12390 kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
12391
12392 kmem_free(dp, sizeof (dtrace_difo_t));
12393 return (NULL);
12394}
12395
12396static dtrace_predicate_t *
12397dtrace_dof_predicate(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
12398 cred_t *cr)
12399{
12400 dtrace_difo_t *dp;
12401
12402 if ((dp = dtrace_dof_difo(dof, sec, vstate, cr)) == NULL)
12403 return (NULL);
12404
12405 return (dtrace_predicate_create(dp));
12406}
12407
12408static dtrace_actdesc_t *
12409dtrace_dof_actdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
12410 cred_t *cr)
12411{
12412 dtrace_actdesc_t *act, *first = NULL, *last = NULL, *next;
12413 dof_actdesc_t *desc;
12414 dof_sec_t *difosec;
12415 size_t offs;
12416 uintptr_t daddr = (uintptr_t)dof;
12417 uint64_t arg;
12418 dtrace_actkind_t kind;
12419
12420 if (sec->dofs_type != DOF_SECT_ACTDESC) {
12421 dtrace_dof_error(dof, "invalid action section");
12422 return (NULL);
12423 }
12424
12425 if (sec->dofs_offset + sizeof (dof_actdesc_t) > dof->dofh_loadsz) {
12426 dtrace_dof_error(dof, "truncated action description");
12427 return (NULL);
12428 }
12429
12430 if (sec->dofs_align != sizeof (uint64_t)) {
12431 dtrace_dof_error(dof, "bad alignment in action description");
12432 return (NULL);
12433 }
12434
12435 if (sec->dofs_size < sec->dofs_entsize) {
12436 dtrace_dof_error(dof, "section entry size exceeds total size");
12437 return (NULL);
12438 }
12439
12440 if (sec->dofs_entsize != sizeof (dof_actdesc_t)) {
12441 dtrace_dof_error(dof, "bad entry size in action description");
12442 return (NULL);
12443 }
12444
12445 if (sec->dofs_size / sec->dofs_entsize > dtrace_actions_max) {
12446 dtrace_dof_error(dof, "actions exceed dtrace_actions_max");
12447 return (NULL);
12448 }
12449
12450 for (offs = 0; offs < sec->dofs_size; offs += sec->dofs_entsize) {
12451 desc = (dof_actdesc_t *)(daddr +
12452 (uintptr_t)sec->dofs_offset + offs);
12453 kind = (dtrace_actkind_t)desc->dofa_kind;
12454
12455 if ((DTRACEACT_ISPRINTFLIKE(kind) &&
12456 (kind != DTRACEACT_PRINTA ||
12457 desc->dofa_strtab != DOF_SECIDX_NONE)) ||
12458 (kind == DTRACEACT_DIFEXPR &&
12459 desc->dofa_strtab != DOF_SECIDX_NONE)) {
12460 dof_sec_t *strtab;
12461 char *str, *fmt;
12462 uint64_t i;
12463
12464 /*
12465 * The argument to these actions is an index into the
12466 * DOF string table. For printf()-like actions, this
12467 * is the format string. For print(), this is the
12468 * CTF type of the expression result.
12469 */
12470 if ((strtab = dtrace_dof_sect(dof,
12471 DOF_SECT_STRTAB, desc->dofa_strtab)) == NULL)
12472 goto err;
12473
12474 str = (char *)((uintptr_t)dof +
12475 (uintptr_t)strtab->dofs_offset);
12476
12477 for (i = desc->dofa_arg; i < strtab->dofs_size; i++) {
12478 if (str[i] == '\0')
12479 break;
12480 }
12481
12482 if (i >= strtab->dofs_size) {
12483 dtrace_dof_error(dof, "bogus format string");
12484 goto err;
12485 }
12486
12487 if (i == desc->dofa_arg) {
12488 dtrace_dof_error(dof, "empty format string");
12489 goto err;
12490 }
12491
12492 i -= desc->dofa_arg;
12493 fmt = kmem_alloc(i + 1, KM_SLEEP);
12494 bcopy(&str[desc->dofa_arg], fmt, i + 1);
12495 arg = (uint64_t)(uintptr_t)fmt;
12496 } else {
12497 if (kind == DTRACEACT_PRINTA) {
12498 ASSERT(desc->dofa_strtab == DOF_SECIDX_NONE);
12499 arg = 0;
12500 } else {
12501 arg = desc->dofa_arg;
12502 }
12503 }
12504
12505 act = dtrace_actdesc_create(kind, desc->dofa_ntuple,
12506 desc->dofa_uarg, arg);
12507
12508 if (last != NULL) {
12509 last->dtad_next = act;
12510 } else {
12511 first = act;
12512 }
12513
12514 last = act;
12515
12516 if (desc->dofa_difo == DOF_SECIDX_NONE)
12517 continue;
12518
12519 if ((difosec = dtrace_dof_sect(dof,
12520 DOF_SECT_DIFOHDR, desc->dofa_difo)) == NULL)
12521 goto err;
12522
12523 act->dtad_difo = dtrace_dof_difo(dof, difosec, vstate, cr);
12524
12525 if (act->dtad_difo == NULL)
12526 goto err;
12527 }
12528
12529 ASSERT(first != NULL);
12530 return (first);
12531
12532err:
12533 for (act = first; act != NULL; act = next) {
12534 next = act->dtad_next;
12535 dtrace_actdesc_release(act, vstate);
12536 }
12537
12538 return (NULL);
12539}
12540
12541static dtrace_ecbdesc_t *
12542dtrace_dof_ecbdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
12543 cred_t *cr)
12544{
12545 dtrace_ecbdesc_t *ep;
12546 dof_ecbdesc_t *ecb;
12547 dtrace_probedesc_t *desc;
12548 dtrace_predicate_t *pred = NULL;
12549
12550 if (sec->dofs_size < sizeof (dof_ecbdesc_t)) {
12551 dtrace_dof_error(dof, "truncated ECB description");
12552 return (NULL);
12553 }
12554
12555 if (sec->dofs_align != sizeof (uint64_t)) {
12556 dtrace_dof_error(dof, "bad alignment in ECB description");
12557 return (NULL);
12558 }
12559
12560 ecb = (dof_ecbdesc_t *)((uintptr_t)dof + (uintptr_t)sec->dofs_offset);
12561 sec = dtrace_dof_sect(dof, DOF_SECT_PROBEDESC, ecb->dofe_probes);
12562
12563 if (sec == NULL)
12564 return (NULL);
12565
12566 ep = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
12567 ep->dted_uarg = ecb->dofe_uarg;
12568 desc = &ep->dted_probe;
12569
12570 if (dtrace_dof_probedesc(dof, sec, desc) == NULL)
12571 goto err;
12572
12573 if (ecb->dofe_pred != DOF_SECIDX_NONE) {
12574 if ((sec = dtrace_dof_sect(dof,
12575 DOF_SECT_DIFOHDR, ecb->dofe_pred)) == NULL)
12576 goto err;
12577
12578 if ((pred = dtrace_dof_predicate(dof, sec, vstate, cr)) == NULL)
12579 goto err;
12580
12581 ep->dted_pred.dtpdd_predicate = pred;
12582 }
12583
12584 if (ecb->dofe_actions != DOF_SECIDX_NONE) {
12585 if ((sec = dtrace_dof_sect(dof,
12586 DOF_SECT_ACTDESC, ecb->dofe_actions)) == NULL)
12587 goto err;
12588
12589 ep->dted_action = dtrace_dof_actdesc(dof, sec, vstate, cr);
12590
12591 if (ep->dted_action == NULL)
12592 goto err;
12593 }
12594
12595 return (ep);
12596
12597err:
12598 if (pred != NULL)
12599 dtrace_predicate_release(pred, vstate);
12600 kmem_free(ep, sizeof (dtrace_ecbdesc_t));
12601 return (NULL);
12602}
12603
12604/*
12605 * Apply the relocations from the specified 'sec' (a DOF_SECT_URELHDR) to the
12606 * specified DOF. At present, this amounts to simply adding 'ubase' to the
12607 * site of any user SETX relocations to account for load object base address.
12608 * In the future, if we need other relocations, this function can be extended.
12609 */
12610static int
12611dtrace_dof_relocate(dof_hdr_t *dof, dof_sec_t *sec, uint64_t ubase)
12612{
12613 uintptr_t daddr = (uintptr_t)dof;
12614 dof_relohdr_t *dofr =
12615 (dof_relohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
12616 dof_sec_t *ss, *rs, *ts;
12617 dof_relodesc_t *r;
12618 uint_t i, n;
12619
12620 if (sec->dofs_size < sizeof (dof_relohdr_t) ||
12621 sec->dofs_align != sizeof (dof_secidx_t)) {
12622 dtrace_dof_error(dof, "invalid relocation header");
12623 return (-1);
12624 }
12625
12626 ss = dtrace_dof_sect(dof, DOF_SECT_STRTAB, dofr->dofr_strtab);
12627 rs = dtrace_dof_sect(dof, DOF_SECT_RELTAB, dofr->dofr_relsec);
12628 ts = dtrace_dof_sect(dof, DOF_SECT_NONE, dofr->dofr_tgtsec);
12629
12630 if (ss == NULL || rs == NULL || ts == NULL)
12631 return (-1); /* dtrace_dof_error() has been called already */
12632
12633 if (rs->dofs_entsize < sizeof (dof_relodesc_t) ||
12634 rs->dofs_align != sizeof (uint64_t)) {
12635 dtrace_dof_error(dof, "invalid relocation section");
12636 return (-1);
12637 }
12638
12639 r = (dof_relodesc_t *)(uintptr_t)(daddr + rs->dofs_offset);
12640 n = rs->dofs_size / rs->dofs_entsize;
12641
12642 for (i = 0; i < n; i++) {
12643 uintptr_t taddr = daddr + ts->dofs_offset + r->dofr_offset;
12644
12645 switch (r->dofr_type) {
12646 case DOF_RELO_NONE:
12647 break;
12648 case DOF_RELO_SETX:
12649 if (r->dofr_offset >= ts->dofs_size || r->dofr_offset +
12650 sizeof (uint64_t) > ts->dofs_size) {
12651 dtrace_dof_error(dof, "bad relocation offset");
12652 return (-1);
12653 }
12654
12655 if (!IS_P2ALIGNED(taddr, sizeof (uint64_t))) {
12656 dtrace_dof_error(dof, "misaligned setx relo");
12657 return (-1);
12658 }
12659
12660 *(uint64_t *)taddr += ubase;
12661 break;
12662 default:
12663 dtrace_dof_error(dof, "invalid relocation type");
12664 return (-1);
12665 }
12666
12667 r = (dof_relodesc_t *)((uintptr_t)r + rs->dofs_entsize);
12668 }
12669
12670 return (0);
12671}
12672
12673/*
12674 * The dof_hdr_t passed to dtrace_dof_slurp() should be a partially validated
12675 * header: it should be at the front of a memory region that is at least
12676 * sizeof (dof_hdr_t) in size -- and then at least dof_hdr.dofh_loadsz in
12677 * size. It need not be validated in any other way.
12678 */
12679static int
12680dtrace_dof_slurp(dof_hdr_t *dof, dtrace_vstate_t *vstate, cred_t *cr,
12681 dtrace_enabling_t **enabp, uint64_t ubase, int noprobes)
12682{
12683 uint64_t len = dof->dofh_loadsz, seclen;
12684 uintptr_t daddr = (uintptr_t)dof;
12685 dtrace_ecbdesc_t *ep;
12686 dtrace_enabling_t *enab;
12687 uint_t i;
12688
12689 ASSERT(MUTEX_HELD(&dtrace_lock));
12690 ASSERT(dof->dofh_loadsz >= sizeof (dof_hdr_t));
12691
12692 /*
12693 * Check the DOF header identification bytes. In addition to checking
12694 * valid settings, we also verify that unused bits/bytes are zeroed so
12695 * we can use them later without fear of regressing existing binaries.
12696 */
12697 if (bcmp(&dof->dofh_ident[DOF_ID_MAG0],
12698 DOF_MAG_STRING, DOF_MAG_STRLEN) != 0) {
12699 dtrace_dof_error(dof, "DOF magic string mismatch");
12700 return (-1);
12701 }
12702
12703 if (dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_ILP32 &&
12704 dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_LP64) {
12705 dtrace_dof_error(dof, "DOF has invalid data model");
12706 return (-1);
12707 }
12708
12709 if (dof->dofh_ident[DOF_ID_ENCODING] != DOF_ENCODE_NATIVE) {
12710 dtrace_dof_error(dof, "DOF encoding mismatch");
12711 return (-1);
12712 }
12713
12714 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
12715 dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_2) {
12716 dtrace_dof_error(dof, "DOF version mismatch");
12717 return (-1);
12718 }
12719
12720 if (dof->dofh_ident[DOF_ID_DIFVERS] != DIF_VERSION_2) {
12721 dtrace_dof_error(dof, "DOF uses unsupported instruction set");
12722 return (-1);
12723 }
12724
12725 if (dof->dofh_ident[DOF_ID_DIFIREG] > DIF_DIR_NREGS) {
12726 dtrace_dof_error(dof, "DOF uses too many integer registers");
12727 return (-1);
12728 }
12729
12730 if (dof->dofh_ident[DOF_ID_DIFTREG] > DIF_DTR_NREGS) {
12731 dtrace_dof_error(dof, "DOF uses too many tuple registers");
12732 return (-1);
12733 }
12734
12735 for (i = DOF_ID_PAD; i < DOF_ID_SIZE; i++) {
12736 if (dof->dofh_ident[i] != 0) {
12737 dtrace_dof_error(dof, "DOF has invalid ident byte set");
12738 return (-1);
12739 }
12740 }
12741
12742 if (dof->dofh_flags & ~DOF_FL_VALID) {
12743 dtrace_dof_error(dof, "DOF has invalid flag bits set");
12744 return (-1);
12745 }
12746
12747 if (dof->dofh_secsize == 0) {
12748 dtrace_dof_error(dof, "zero section header size");
12749 return (-1);
12750 }
12751
12752 /*
12753 * Check that the section headers don't exceed the amount of DOF
12754 * data. Note that we cast the section size and number of sections
12755 * to uint64_t's to prevent possible overflow in the multiplication.
12756 */
12757 seclen = (uint64_t)dof->dofh_secnum * (uint64_t)dof->dofh_secsize;
12758
12759 if (dof->dofh_secoff > len || seclen > len ||
12760 dof->dofh_secoff + seclen > len) {
12761 dtrace_dof_error(dof, "truncated section headers");
12762 return (-1);
12763 }
12764
12765 if (!IS_P2ALIGNED(dof->dofh_secoff, sizeof (uint64_t))) {
12766 dtrace_dof_error(dof, "misaligned section headers");
12767 return (-1);
12768 }
12769
12770 if (!IS_P2ALIGNED(dof->dofh_secsize, sizeof (uint64_t))) {
12771 dtrace_dof_error(dof, "misaligned section size");
12772 return (-1);
12773 }
12774
12775 /*
12776 * Take an initial pass through the section headers to be sure that
12777 * the headers don't have stray offsets. If the 'noprobes' flag is
12778 * set, do not permit sections relating to providers, probes, or args.
12779 */
12780 for (i = 0; i < dof->dofh_secnum; i++) {
12781 dof_sec_t *sec = (dof_sec_t *)(daddr +
12782 (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
12783
12784 if (noprobes) {
12785 switch (sec->dofs_type) {
12786 case DOF_SECT_PROVIDER:
12787 case DOF_SECT_PROBES:
12788 case DOF_SECT_PRARGS:
12789 case DOF_SECT_PROFFS:
12790 dtrace_dof_error(dof, "illegal sections "
12791 "for enabling");
12792 return (-1);
12793 }
12794 }
12795
12796 if (DOF_SEC_ISLOADABLE(sec->dofs_type) &&
12797 !(sec->dofs_flags & DOF_SECF_LOAD)) {
12798 dtrace_dof_error(dof, "loadable section with load "
12799 "flag unset");
12800 return (-1);
12801 }
12802
12803 if (!(sec->dofs_flags & DOF_SECF_LOAD))
12804 continue; /* just ignore non-loadable sections */
12805
12806 if (sec->dofs_align & (sec->dofs_align - 1)) {
12807 dtrace_dof_error(dof, "bad section alignment");
12808 return (-1);
12809 }
12810
12811 if (sec->dofs_offset & (sec->dofs_align - 1)) {
12812 dtrace_dof_error(dof, "misaligned section");
12813 return (-1);
12814 }
12815
12816 if (sec->dofs_offset > len || sec->dofs_size > len ||
12817 sec->dofs_offset + sec->dofs_size > len) {
12818 dtrace_dof_error(dof, "corrupt section header");
12819 return (-1);
12820 }
12821
12822 if (sec->dofs_type == DOF_SECT_STRTAB && *((char *)daddr +
12823 sec->dofs_offset + sec->dofs_size - 1) != '\0') {
12824 dtrace_dof_error(dof, "non-terminating string table");
12825 return (-1);
12826 }
12827 }
12828
12829 /*
12830 * Take a second pass through the sections and locate and perform any
12831 * relocations that are present. We do this after the first pass to
12832 * be sure that all sections have had their headers validated.
12833 */
12834 for (i = 0; i < dof->dofh_secnum; i++) {
12835 dof_sec_t *sec = (dof_sec_t *)(daddr +
12836 (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
12837
12838 if (!(sec->dofs_flags & DOF_SECF_LOAD))
12839 continue; /* skip sections that are not loadable */
12840
12841 switch (sec->dofs_type) {
12842 case DOF_SECT_URELHDR:
12843 if (dtrace_dof_relocate(dof, sec, ubase) != 0)
12844 return (-1);
12845 break;
12846 }
12847 }
12848
12849 if ((enab = *enabp) == NULL)
12850 enab = *enabp = dtrace_enabling_create(vstate);
12851
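	/*
	 * Finally, take a pass through the ECB description sections,
	 * constructing an ECB description from each and adding it to the
	 * enabling.
	 */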
12852 for (i = 0; i < dof->dofh_secnum; i++) {
12853 dof_sec_t *sec = (dof_sec_t *)(daddr +
12854 (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
12855
12856 if (sec->dofs_type != DOF_SECT_ECBDESC)
12857 continue;
12858
12859 if ((ep = dtrace_dof_ecbdesc(dof, sec, vstate, cr)) == NULL) {
12860 dtrace_enabling_destroy(enab);
12861 *enabp = NULL;
12862 return (-1);
12863 }
12864
12865 dtrace_enabling_add(enab, ep);
12866 }
12867
12868 return (0);
12869}
12870
12871/*
12872 * Process DOF for any options. This routine assumes that the DOF has been
12873 * at least processed by dtrace_dof_slurp().
12874 */
12875static int
12876dtrace_dof_options(dof_hdr_t *dof, dtrace_state_t *state)
12877{
12878 int i, rval;
12879 uint32_t entsize;
12880 size_t offs;
12881 dof_optdesc_t *desc;
12882
12883 for (i = 0; i < dof->dofh_secnum; i++) {
12884 dof_sec_t *sec = (dof_sec_t *)((uintptr_t)dof +
12885 (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
12886
12887 if (sec->dofs_type != DOF_SECT_OPTDESC)
12888 continue;
12889
12890 if (sec->dofs_align != sizeof (uint64_t)) {
12891 dtrace_dof_error(dof, "bad alignment in "
12892 "option description");
12893 return (EINVAL);
12894 }
12895
12896 if ((entsize = sec->dofs_entsize) == 0) {
12897 dtrace_dof_error(dof, "zeroed option entry size");
12898 return (EINVAL);
12899 }
12900
12901 if (entsize < sizeof (dof_optdesc_t)) {
12902 dtrace_dof_error(dof, "bad option entry size");
12903 return (EINVAL);
12904 }
12905
12906 for (offs = 0; offs < sec->dofs_size; offs += entsize) {
12907 desc = (dof_optdesc_t *)((uintptr_t)dof +
12908 (uintptr_t)sec->dofs_offset + offs);
12909
12910 if (desc->dofo_strtab != DOF_SECIDX_NONE) {
12911 dtrace_dof_error(dof, "non-zero option string");
12912 return (EINVAL);
12913 }
12914
12915 if (desc->dofo_value == DTRACEOPT_UNSET) {
12916 dtrace_dof_error(dof, "unset option");
12917 return (EINVAL);
12918 }
12919
12920 if ((rval = dtrace_state_option(state,
12921 desc->dofo_option, desc->dofo_value)) != 0) {
12922 dtrace_dof_error(dof, "rejected option");
12923 return (rval);
12924 }
12925 }
12926 }
12927
12928 return (0);
12929}
12930
12931/*
12932 * DTrace Consumer State Functions
12933 */
12934static int
12935dtrace_dstate_init(dtrace_dstate_t *dstate, size_t size)
12936{
12937 size_t hashsize, maxper, min, chunksize = dstate->dtds_chunksize;
12938 void *base;
12939 uintptr_t limit;
12940 dtrace_dynvar_t *dvar, *next, *start;
12941 int i;
12942
12943 ASSERT(MUTEX_HELD(&dtrace_lock));
12944 ASSERT(dstate->dtds_base == NULL && dstate->dtds_percpu == NULL);
12945
12946 bzero(dstate, sizeof (dtrace_dstate_t));
12947
12948 if ((dstate->dtds_chunksize = chunksize) == 0)
12949 dstate->dtds_chunksize = DTRACE_DYNVAR_CHUNKSIZE;
12950
12951 if (size < (min = dstate->dtds_chunksize + sizeof (dtrace_dynhash_t)))
12952 size = min;
12953
12954 if ((base = kmem_zalloc(size, KM_NOSLEEP | KM_NORMALPRI)) == NULL)
12955 return (ENOMEM);
12956
12957 dstate->dtds_size = size;
12958 dstate->dtds_base = base;
12959 dstate->dtds_percpu = kmem_cache_alloc(dtrace_state_cache, KM_SLEEP);
12960 bzero(dstate->dtds_percpu, NCPU * sizeof (dtrace_dstate_percpu_t));
12961
12962 hashsize = size / (dstate->dtds_chunksize + sizeof (dtrace_dynhash_t));
12963
12964 if (hashsize != 1 && (hashsize & 1))
12965 hashsize--;
12966
12967 dstate->dtds_hashsize = hashsize;
12968 dstate->dtds_hash = dstate->dtds_base;
12969
12970 /*
12971 * Set all of our hash buckets to point to the single sink, and (if
12972 * it hasn't already been set), set the sink's hash value to be the
12973 * sink sentinel value. The sink is needed for dynamic variable
12974 * lookups to know that they have iterated over an entire, valid hash
12975 * chain.
12976 */
12977 for (i = 0; i < hashsize; i++)
12978 dstate->dtds_hash[i].dtdh_chain = &dtrace_dynhash_sink;
12979
12980 if (dtrace_dynhash_sink.dtdv_hashval != DTRACE_DYNHASH_SINK)
12981 dtrace_dynhash_sink.dtdv_hashval = DTRACE_DYNHASH_SINK;
12982
12983 /*
12984 * Determine number of active CPUs. Divide free list evenly among
12985 * active CPUs.
12986 */
12987 start = (dtrace_dynvar_t *)
12988 ((uintptr_t)base + hashsize * sizeof (dtrace_dynhash_t));
12989 limit = (uintptr_t)base + size;
12990
12991 maxper = (limit - (uintptr_t)start) / NCPU;
12992 maxper = (maxper / dstate->dtds_chunksize) * dstate->dtds_chunksize;
12993
12994#if !defined(sun)
12995 CPU_FOREACH(i) {
12996#else
12997 for (i = 0; i < NCPU; i++) {
12998#endif
12999 dstate->dtds_percpu[i].dtdsc_free = dvar = start;
13000
13001 /*
13002 * If we don't even have enough chunks to make it once through
13003 * NCPUs, we're just going to allocate everything to the first
13004 * CPU. And if we're on the last CPU, we're going to allocate
13005 * whatever is left over. In either case, we set the limit to
13006 * be the limit of the dynamic variable space.
13007 */
13008 if (maxper == 0 || i == NCPU - 1) {
13009 limit = (uintptr_t)base + size;
13010 start = NULL;
13011 } else {
13012 limit = (uintptr_t)start + maxper;
13013 start = (dtrace_dynvar_t *)limit;
13014 }
13015
13016 ASSERT(limit <= (uintptr_t)base + size);
13017
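		/*
		 * Chain this CPU's chunks together into a free list, stopping
		 * before a chunk would extend past the computed limit.
		 */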
13018 for (;;) {
13019 next = (dtrace_dynvar_t *)((uintptr_t)dvar +
13020 dstate->dtds_chunksize);
13021
13022 if ((uintptr_t)next + dstate->dtds_chunksize >= limit)
13023 break;
13024
13025 dvar->dtdv_next = next;
13026 dvar = next;
13027 }
13028
13029 if (maxper == 0)
13030 break;
13031 }
13032
13033 return (0);
13034}
13035
13036static void
13037dtrace_dstate_fini(dtrace_dstate_t *dstate)
13038{
13039 ASSERT(MUTEX_HELD(&cpu_lock));
13040
13041 if (dstate->dtds_base == NULL)
13042 return;
13043
13044 kmem_free(dstate->dtds_base, dstate->dtds_size);
13045 kmem_cache_free(dtrace_state_cache, dstate->dtds_percpu);
13046}
13047
13048static void
13049dtrace_vstate_fini(dtrace_vstate_t *vstate)
13050{
13051 /*
13052 * Logical XOR, where are you?
13053 */
13054 ASSERT((vstate->dtvs_nglobals == 0) ^ (vstate->dtvs_globals != NULL));
13055
13056 if (vstate->dtvs_nglobals > 0) {
13057 kmem_free(vstate->dtvs_globals, vstate->dtvs_nglobals *
13058 sizeof (dtrace_statvar_t *));
13059 }
13060
13061 if (vstate->dtvs_ntlocals > 0) {
13062 kmem_free(vstate->dtvs_tlocals, vstate->dtvs_ntlocals *
13063 sizeof (dtrace_difv_t));
13064 }
13065
13066 ASSERT((vstate->dtvs_nlocals == 0) ^ (vstate->dtvs_locals != NULL));
13067
13068 if (vstate->dtvs_nlocals > 0) {
13069 kmem_free(vstate->dtvs_locals, vstate->dtvs_nlocals *
13070 sizeof (dtrace_statvar_t *));
13071 }
13072}
13073
13074#if defined(sun)
13075static void
13076dtrace_state_clean(dtrace_state_t *state)
13077{
13078 if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE)
13079 return;
13080
13081 dtrace_dynvar_clean(&state->dts_vstate.dtvs_dynvars);
13082 dtrace_speculation_clean(state);
13083}
13084
13085static void
13086dtrace_state_deadman(dtrace_state_t *state)
13087{
13088 hrtime_t now;
13089
13090 dtrace_sync();
13091
13092 now = dtrace_gethrtime();
13093
13094 if (state != dtrace_anon.dta_state &&
13095 now - state->dts_laststatus >= dtrace_deadman_user)
13096 return;
13097
13098 /*
13099 * We must be sure that dts_alive never appears to be less than the
13100 * value upon entry to dtrace_state_deadman(), and because we lack a
13101 * dtrace_cas64(), we cannot store to it atomically. We thus instead
13102 * store INT64_MAX to it, followed by a memory barrier, followed by
13103 * the new value. This assures that dts_alive never appears to be
13104 * less than its true value, regardless of the order in which the
13105 * stores to the underlying storage are issued.
13106 */
13107 state->dts_alive = INT64_MAX;
13108 dtrace_membar_producer();
13109 state->dts_alive = now;
13110}
13111#else
13112static void
13113dtrace_state_clean(void *arg)
13114{
13115 dtrace_state_t *state = arg;
13116 dtrace_optval_t *opt = state->dts_options;
13117
13118 if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE)
13119 return;
13120
13121 dtrace_dynvar_clean(&state->dts_vstate.dtvs_dynvars);
13122 dtrace_speculation_clean(state);
13123
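	/*
	 * In this FreeBSD port the cleaner is a self-rearming callout rather
	 * than a Solaris cyclic; rearm it at the configured cleaning rate.
	 */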
13124 callout_reset(&state->dts_cleaner, hz * opt[DTRACEOPT_CLEANRATE] / NANOSEC,
13125 dtrace_state_clean, state);
13126}
13127
13128static void
13129dtrace_state_deadman(void *arg)
13130{
13131 dtrace_state_t *state = arg;
13132 hrtime_t now;
13133
13134 dtrace_sync();
13135
13136 dtrace_debug_output();
13137
13138 now = dtrace_gethrtime();
13139
13140 if (state != dtrace_anon.dta_state &&
13141 now - state->dts_laststatus >= dtrace_deadman_user)
13142 return;
13143
13144 /*
13145 * We must be sure that dts_alive never appears to be less than the
13146 * value upon entry to dtrace_state_deadman(), and because we lack a
13147 * dtrace_cas64(), we cannot store to it atomically. We thus instead
13148 * store INT64_MAX to it, followed by a memory barrier, followed by
13149 * the new value. This assures that dts_alive never appears to be
13150 * less than its true value, regardless of the order in which the
13151 * stores to the underlying storage are issued.
13152 */
13153 state->dts_alive = INT64_MAX;
13154 dtrace_membar_producer();
13155 state->dts_alive = now;
13156
13157 callout_reset(&state->dts_deadman, hz * dtrace_deadman_interval / NANOSEC,
13158 dtrace_state_deadman, state);
13159}
13160#endif
13161
13162static dtrace_state_t *
13163#if defined(sun)
13164dtrace_state_create(dev_t *devp, cred_t *cr)
13165#else
13166dtrace_state_create(struct cdev *dev)
13167#endif
13168{
13169#if defined(sun)
13170 minor_t minor;
13171 major_t major;
13172#else
13173 cred_t *cr = NULL;
13174 int m = 0;
13175#endif
13176 char c[30];
13177 dtrace_state_t *state;
13178 dtrace_optval_t *opt;
13179 int bufsize = NCPU * sizeof (dtrace_buffer_t), i;
13180
13181 ASSERT(MUTEX_HELD(&dtrace_lock));
13182 ASSERT(MUTEX_HELD(&cpu_lock));
13183
13184#if defined(sun)
13185 minor = (minor_t)(uintptr_t)vmem_alloc(dtrace_minor, 1,
13186 VM_BESTFIT | VM_SLEEP);
13187
13188 if (ddi_soft_state_zalloc(dtrace_softstate, minor) != DDI_SUCCESS) {
13189 vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
13190 return (NULL);
13191 }
13192
13193 state = ddi_get_soft_state(dtrace_softstate, minor);
13194#else
13195 if (dev != NULL) {
13196 cr = dev->si_cred;
13197 m = dev2unit(dev);
13198 }
13199
13200 /* Allocate memory for the state. */
13201 state = kmem_zalloc(sizeof(dtrace_state_t), KM_SLEEP);
13202#endif
13203
13204 state->dts_epid = DTRACE_EPIDNONE + 1;
13205
13206 (void) snprintf(c, sizeof (c), "dtrace_aggid_%d", m);
13207#if defined(sun)
13208 state->dts_aggid_arena = vmem_create(c, (void *)1, UINT32_MAX, 1,
13209 NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
13210
13211 if (devp != NULL) {
13212 major = getemajor(*devp);
13213 } else {
13214 major = ddi_driver_major(dtrace_devi);
13215 }
13216
13217 state->dts_dev = makedevice(major, minor);
13218
13219 if (devp != NULL)
13220 *devp = state->dts_dev;
13221#else
13222 state->dts_aggid_arena = new_unrhdr(1, INT_MAX, &dtrace_unr_mtx);
13223 state->dts_dev = dev;
13224#endif
13225
13226 /*
13227 * We allocate NCPU buffers. On the one hand, this can be quite
13228 * a bit of memory per instance (nearly 36K on a Starcat). On the
13229 * other hand, it saves an additional memory reference in the probe
13230 * path.
13231 */
13232 state->dts_buffer = kmem_zalloc(bufsize, KM_SLEEP);
13233 state->dts_aggbuffer = kmem_zalloc(bufsize, KM_SLEEP);
13234
13235#if defined(sun)
13236 state->dts_cleaner = CYCLIC_NONE;
13237 state->dts_deadman = CYCLIC_NONE;
13238#else
13239 callout_init(&state->dts_cleaner, CALLOUT_MPSAFE);
13240 callout_init(&state->dts_deadman, CALLOUT_MPSAFE);
13241#endif
13242 state->dts_vstate.dtvs_state = state;
13243
13244 for (i = 0; i < DTRACEOPT_MAX; i++)
13245 state->dts_options[i] = DTRACEOPT_UNSET;
13246
13247 /*
13248 * Set the default options.
13249 */
13250 opt = state->dts_options;
13251 opt[DTRACEOPT_BUFPOLICY] = DTRACEOPT_BUFPOLICY_SWITCH;
13252 opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_AUTO;
13253 opt[DTRACEOPT_NSPEC] = dtrace_nspec_default;
13254 opt[DTRACEOPT_SPECSIZE] = dtrace_specsize_default;
13255 opt[DTRACEOPT_CPU] = (dtrace_optval_t)DTRACE_CPUALL;
13256 opt[DTRACEOPT_STRSIZE] = dtrace_strsize_default;
13257 opt[DTRACEOPT_STACKFRAMES] = dtrace_stackframes_default;
13258 opt[DTRACEOPT_USTACKFRAMES] = dtrace_ustackframes_default;
13259 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_default;
13260 opt[DTRACEOPT_AGGRATE] = dtrace_aggrate_default;
13261 opt[DTRACEOPT_SWITCHRATE] = dtrace_switchrate_default;
13262 opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_default;
13263 opt[DTRACEOPT_JSTACKFRAMES] = dtrace_jstackframes_default;
13264 opt[DTRACEOPT_JSTACKSTRSIZE] = dtrace_jstackstrsize_default;
13265
13266 state->dts_activity = DTRACE_ACTIVITY_INACTIVE;
13267
13268 /*
13269 * Depending on the user credentials, we set flag bits which alter probe
13270 * visibility or the amount of destructiveness allowed. In the case of
13271 * actual anonymous tracing, or the possession of all privileges, all of
13272 * the normal checks are bypassed.
13273 */
13274 if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
13275 state->dts_cred.dcr_visible = DTRACE_CRV_ALL;
13276 state->dts_cred.dcr_action = DTRACE_CRA_ALL;
13277 } else {
13278 /*
13279 * Set up the credentials for this instantiation. We take a
13280 * hold on the credential to prevent it from disappearing on
13281 * us; this in turn prevents the zone_t referenced by this
13282 * credential from disappearing. This means that we can
13283 * examine the credential and the zone from probe context.
13284 */
13285 crhold(cr);
13286 state->dts_cred.dcr_cred = cr;
13287
13288 /*
13289 * CRA_PROC means "we have *some* privilege for dtrace" and
13290 * unlocks the use of variables like pid, zonename, etc.
13291 */
13292 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE) ||
13293 PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
13294 state->dts_cred.dcr_action |= DTRACE_CRA_PROC;
13295 }
13296
13297 /*
13298 * dtrace_user allows use of syscall and profile providers.
13299 * If the user also has proc_owner and/or proc_zone, we
13300 * extend the scope to include additional visibility and
13301 * destructive power.
13302 */
13303 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE)) {
13304 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE)) {
13305 state->dts_cred.dcr_visible |=
13306 DTRACE_CRV_ALLPROC;
13307
13308 state->dts_cred.dcr_action |=
13309 DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
13310 }
13311
13312 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE)) {
13313 state->dts_cred.dcr_visible |=
13314 DTRACE_CRV_ALLZONE;
13315
13316 state->dts_cred.dcr_action |=
13317 DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
13318 }
13319
13320 /*
13321 * If we have all privs in whatever zone this is,
13322 * we can do destructive things to processes which
13323 * have altered credentials.
13324 */
13325#if defined(sun)
13326 if (priv_isequalset(priv_getset(cr, PRIV_EFFECTIVE),
13327 cr->cr_zone->zone_privset)) {
13328 state->dts_cred.dcr_action |=
13329 DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
13330 }
13331#endif
13332 }
13333
13334 /*
13335 * Holding the dtrace_kernel privilege also implies that
13336 * the user has the dtrace_user privilege from a visibility
13337 * perspective. But without further privileges, some
13338 * destructive actions are not available.
13339 */
13340 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE)) {
13341 /*
13342 * Make all probes in all zones visible. However,
13343 * this doesn't mean that all actions become available
13344 * to all zones.
13345 */
13346 state->dts_cred.dcr_visible |= DTRACE_CRV_KERNEL |
13347 DTRACE_CRV_ALLPROC | DTRACE_CRV_ALLZONE;
13348
13349 state->dts_cred.dcr_action |= DTRACE_CRA_KERNEL |
13350 DTRACE_CRA_PROC;
13351 /*
13352 * Holding proc_owner means that destructive actions
13353 * for *this* zone are allowed.
13354 */
13355 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
13356 state->dts_cred.dcr_action |=
13357 DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
13358
13359 /*
13360 * Holding proc_zone means that destructive actions
13361	 * for this user/group ID in all zones are allowed.
13362 */
13363 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
13364 state->dts_cred.dcr_action |=
13365 DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
13366
13367#if defined(sun)
13368 /*
13369 * If we have all privs in whatever zone this is,
13370 * we can do destructive things to processes which
13371 * have altered credentials.
13372 */
13373 if (priv_isequalset(priv_getset(cr, PRIV_EFFECTIVE),
13374 cr->cr_zone->zone_privset)) {
13375 state->dts_cred.dcr_action |=
13376 DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
13377 }
13378#endif
13379 }
13380
13381 /*
13382 * Holding the dtrace_proc privilege gives control over fasttrap
13383 * and pid providers. We need to grant wider destructive
13384 * privileges in the event that the user has proc_owner and/or
13385 * proc_zone.
13386 */
13387 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
13388 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
13389 state->dts_cred.dcr_action |=
13390 DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
13391
13392 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
13393 state->dts_cred.dcr_action |=
13394 DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
13395 }
13396 }
13397
13398 return (state);
13399}
13400
13401static int
13402dtrace_state_buffer(dtrace_state_t *state, dtrace_buffer_t *buf, int which)
13403{
13404 dtrace_optval_t *opt = state->dts_options, size;
13405	processorid_t cpu = 0;
13406 int flags = 0, rval, factor, divisor = 1;
13407
13408 ASSERT(MUTEX_HELD(&dtrace_lock));
13409 ASSERT(MUTEX_HELD(&cpu_lock));
13410 ASSERT(which < DTRACEOPT_MAX);
13411 ASSERT(state->dts_activity == DTRACE_ACTIVITY_INACTIVE ||
13412 (state == dtrace_anon.dta_state &&
13413 state->dts_activity == DTRACE_ACTIVITY_ACTIVE));
13414
13415 if (opt[which] == DTRACEOPT_UNSET || opt[which] == 0)
13416 return (0);
13417
13418 if (opt[DTRACEOPT_CPU] != DTRACEOPT_UNSET)
13419 cpu = opt[DTRACEOPT_CPU];
13420
13421 if (which == DTRACEOPT_SPECSIZE)
13422 flags |= DTRACEBUF_NOSWITCH;
13423
13424 if (which == DTRACEOPT_BUFSIZE) {
13425 if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_RING)
13426 flags |= DTRACEBUF_RING;
13427
13428 if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_FILL)
13429 flags |= DTRACEBUF_FILL;
13430
13431 if (state != dtrace_anon.dta_state ||
13432 state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
13433 flags |= DTRACEBUF_INACTIVE;
13434 }
13435
13436 for (size = opt[which]; size >= sizeof (uint64_t); size /= divisor) {
13437 /*
13438 * The size must be 8-byte aligned. If the size is not 8-byte
13439 * aligned, drop it down by the difference.
13440 */
13441 if (size & (sizeof (uint64_t) - 1))
13442 size -= size & (sizeof (uint64_t) - 1);
13443
13444 if (size < state->dts_reserve) {
13445 /*
13446	 * Buffers must always be large enough to accommodate
13447 * their prereserved space. We return E2BIG instead
13448 * of ENOMEM in this case to allow for user-level
13449 * software to differentiate the cases.
13450 */
13451 return (E2BIG);
13452 }
13453
13454 rval = dtrace_buffer_alloc(buf, size, flags, cpu, &factor);
13455
13456 if (rval != ENOMEM) {
13457 opt[which] = size;
13458 return (rval);
13459 }
13460
13461 if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
13462 return (rval);
13463
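		/*
		 * The allocation failed, but we may retry with a smaller
		 * buffer: round the allocator's suggested reduction factor
		 * up to a power of two and shrink the request accordingly.
		 */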
13464 for (divisor = 2; divisor < factor; divisor <<= 1)
13465 continue;
13466 }
13467
13468 return (ENOMEM);
13469}
13470
13471static int
13472dtrace_state_buffers(dtrace_state_t *state)
13473{
13474 dtrace_speculation_t *spec = state->dts_speculations;
13475 int rval, i;
13476
13477 if ((rval = dtrace_state_buffer(state, state->dts_buffer,
13478 DTRACEOPT_BUFSIZE)) != 0)
13479 return (rval);
13480
13481 if ((rval = dtrace_state_buffer(state, state->dts_aggbuffer,
13482 DTRACEOPT_AGGSIZE)) != 0)
13483 return (rval);
13484
13485 for (i = 0; i < state->dts_nspeculations; i++) {
13486 if ((rval = dtrace_state_buffer(state,
13487 spec[i].dtsp_buffer, DTRACEOPT_SPECSIZE)) != 0)
13488 return (rval);
13489 }
13490
13491 return (0);
13492}
13493
13494static void
13495dtrace_state_prereserve(dtrace_state_t *state)
13496{
13497 dtrace_ecb_t *ecb;
13498 dtrace_probe_t *probe;
13499
13500 state->dts_reserve = 0;
13501
13502 if (state->dts_options[DTRACEOPT_BUFPOLICY] != DTRACEOPT_BUFPOLICY_FILL)
13503 return;
13504
13505 /*
13506 * If our buffer policy is a "fill" buffer policy, we need to set the
13507 * prereserved space to be the space required by the END probes.
13508 */
13509 probe = dtrace_probes[dtrace_probeid_end - 1];
13510 ASSERT(probe != NULL);
13511
13512 for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
13513 if (ecb->dte_state != state)
13514 continue;
13515
13516 state->dts_reserve += ecb->dte_needed + ecb->dte_alignment;
13517 }
13518}
13519
13520static int
13521dtrace_state_go(dtrace_state_t *state, processorid_t *cpu)
13522{
13523 dtrace_optval_t *opt = state->dts_options, sz, nspec;
13524 dtrace_speculation_t *spec;
13525 dtrace_buffer_t *buf;
13526#if defined(sun)
13527 cyc_handler_t hdlr;
13528 cyc_time_t when;
13529#endif
13530 int rval = 0, i, bufsize = NCPU * sizeof (dtrace_buffer_t);
13531 dtrace_icookie_t cookie;
13532
13533 mutex_enter(&cpu_lock);
13534 mutex_enter(&dtrace_lock);
13535
13536 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
13537 rval = EBUSY;
13538 goto out;
13539 }
13540
13541 /*
13542 * Before we can perform any checks, we must prime all of the
13543 * retained enablings that correspond to this state.
13544 */
13545 dtrace_enabling_prime(state);
13546
13547 if (state->dts_destructive && !state->dts_cred.dcr_destructive) {
13548 rval = EACCES;
13549 goto out;
13550 }
13551
13552 dtrace_state_prereserve(state);
13553
13554 /*
13555	 * Now we want to try to allocate our speculations.
13556 * We do not automatically resize the number of speculations; if
13557 * this fails, we will fail the operation.
13558 */
13559 nspec = opt[DTRACEOPT_NSPEC];
13560 ASSERT(nspec != DTRACEOPT_UNSET);
13561
13562 if (nspec > INT_MAX) {
13563 rval = ENOMEM;
13564 goto out;
13565 }
13566
13567 spec = kmem_zalloc(nspec * sizeof (dtrace_speculation_t),
13568 KM_NOSLEEP | KM_NORMALPRI);
13569
13570 if (spec == NULL) {
13571 rval = ENOMEM;
13572 goto out;
13573 }
13574
13575 state->dts_speculations = spec;
13576 state->dts_nspeculations = (int)nspec;
13577
13578 for (i = 0; i < nspec; i++) {
13579 if ((buf = kmem_zalloc(bufsize,
13580 KM_NOSLEEP | KM_NORMALPRI)) == NULL) {
13581 rval = ENOMEM;
13582 goto err;
13583 }
13584
13585 spec[i].dtsp_buffer = buf;
13586 }
13587
13588 if (opt[DTRACEOPT_GRABANON] != DTRACEOPT_UNSET) {
13589 if (dtrace_anon.dta_state == NULL) {
13590 rval = ENOENT;
13591 goto out;
13592 }
13593
13594 if (state->dts_necbs != 0) {
13595 rval = EALREADY;
13596 goto out;
13597 }
13598
13599 state->dts_anon = dtrace_anon_grab();
13600 ASSERT(state->dts_anon != NULL);
13601 state = state->dts_anon;
13602
13603 /*
13604 * We want "grabanon" to be set in the grabbed state, so we'll
13605 * copy that option value from the grabbing state into the
13606 * grabbed state.
13607 */
13608 state->dts_options[DTRACEOPT_GRABANON] =
13609 opt[DTRACEOPT_GRABANON];
13610
13611 *cpu = dtrace_anon.dta_beganon;
13612
13613 /*
13614 * If the anonymous state is active (as it almost certainly
13615 * is if the anonymous enabling ultimately matched anything),
13616 * we don't allow any further option processing -- but we
13617 * don't return failure.
13618 */
13619 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
13620 goto out;
13621 }
13622
13623 if (opt[DTRACEOPT_AGGSIZE] != DTRACEOPT_UNSET &&
13624 opt[DTRACEOPT_AGGSIZE] != 0) {
13625 if (state->dts_aggregations == NULL) {
13626 /*
13627 * We're not going to create an aggregation buffer
13628 * because we don't have any ECBs that contain
13629 * aggregations -- set this option to 0.
13630 */
13631 opt[DTRACEOPT_AGGSIZE] = 0;
13632 } else {
13633 /*
13634 * If we have an aggregation buffer, we must also have
13635 * a buffer to use as scratch.
13636 */
13637 if (opt[DTRACEOPT_BUFSIZE] == DTRACEOPT_UNSET ||
13638 opt[DTRACEOPT_BUFSIZE] < state->dts_needed) {
13639 opt[DTRACEOPT_BUFSIZE] = state->dts_needed;
13640 }
13641 }
13642 }
13643
13644 if (opt[DTRACEOPT_SPECSIZE] != DTRACEOPT_UNSET &&
13645 opt[DTRACEOPT_SPECSIZE] != 0) {
13646 if (!state->dts_speculates) {
13647 /*
13648 * We're not going to create speculation buffers
13649 * because we don't have any ECBs that actually
13650 * speculate -- set the speculation size to 0.
13651 */
13652 opt[DTRACEOPT_SPECSIZE] = 0;
13653 }
13654 }
13655
13656 /*
13657 * The bare minimum size for any buffer that we're actually going to
13658 * do anything to is sizeof (uint64_t).
13659 */
13660 sz = sizeof (uint64_t);
13661
13662 if ((state->dts_needed != 0 && opt[DTRACEOPT_BUFSIZE] < sz) ||
13663 (state->dts_speculates && opt[DTRACEOPT_SPECSIZE] < sz) ||
13664 (state->dts_aggregations != NULL && opt[DTRACEOPT_AGGSIZE] < sz)) {
13665 /*
13666 * A buffer size has been explicitly set to 0 (or to a size
13667 * that will be adjusted to 0) and we need the space -- we
13668 * need to return failure. We return ENOSPC to differentiate
13669 * it from failing to allocate a buffer due to failure to meet
13670 * the reserve (for which we return E2BIG).
13671 */
13672 rval = ENOSPC;
13673 goto out;
13674 }
13675
13676 if ((rval = dtrace_state_buffers(state)) != 0)
13677 goto err;
13678
13679 if ((sz = opt[DTRACEOPT_DYNVARSIZE]) == DTRACEOPT_UNSET)
13680 sz = dtrace_dstate_defsize;
13681
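	/*
	 * Try to allocate the dynamic variable space at the requested size;
	 * unless buffer resizing is manual, halve the request after each
	 * failure until an allocation succeeds or the size reaches zero.
	 */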
13682 do {
13683 rval = dtrace_dstate_init(&state->dts_vstate.dtvs_dynvars, sz);
13684
13685 if (rval == 0)
13686 break;
13687
13688 if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
13689 goto err;
13690 } while (sz >>= 1);
13691
13692 opt[DTRACEOPT_DYNVARSIZE] = sz;
13693
13694 if (rval != 0)
13695 goto err;
13696
13697 if (opt[DTRACEOPT_STATUSRATE] > dtrace_statusrate_max)
13698 opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_max;
13699
13700 if (opt[DTRACEOPT_CLEANRATE] == 0)
13701 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
13702
13703 if (opt[DTRACEOPT_CLEANRATE] < dtrace_cleanrate_min)
13704 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_min;
13705
13706 if (opt[DTRACEOPT_CLEANRATE] > dtrace_cleanrate_max)
13707 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
13708
13709 state->dts_alive = state->dts_laststatus = dtrace_gethrtime();
13710#if defined(sun)
13711 hdlr.cyh_func = (cyc_func_t)dtrace_state_clean;
13712 hdlr.cyh_arg = state;
13713 hdlr.cyh_level = CY_LOW_LEVEL;
13714
13715 when.cyt_when = 0;
13716 when.cyt_interval = opt[DTRACEOPT_CLEANRATE];
13717
13718 state->dts_cleaner = cyclic_add(&hdlr, &when);
13719
13720 hdlr.cyh_func = (cyc_func_t)dtrace_state_deadman;
13721 hdlr.cyh_arg = state;
13722 hdlr.cyh_level = CY_LOW_LEVEL;
13723
13724 when.cyt_when = 0;
13725 when.cyt_interval = dtrace_deadman_interval;
13726
13727 state->dts_deadman = cyclic_add(&hdlr, &when);
13728#else
13729 callout_reset(&state->dts_cleaner, hz * opt[DTRACEOPT_CLEANRATE] / NANOSEC,
13730 dtrace_state_clean, state);
13731 callout_reset(&state->dts_deadman, hz * dtrace_deadman_interval / NANOSEC,
13732 dtrace_state_deadman, state);
13733#endif
13734
13735 state->dts_activity = DTRACE_ACTIVITY_WARMUP;
13736
9974 subr == DIF_SUBR_RINDEX ||
9975 subr == DIF_SUBR_STRCHR ||
9976 subr == DIF_SUBR_STRJOIN ||
9977 subr == DIF_SUBR_STRRCHR ||
9978 subr == DIF_SUBR_STRSTR ||
9979 subr == DIF_SUBR_HTONS ||
9980 subr == DIF_SUBR_HTONL ||
9981 subr == DIF_SUBR_HTONLL ||
9982 subr == DIF_SUBR_NTOHS ||
9983 subr == DIF_SUBR_NTOHL ||
9984 subr == DIF_SUBR_NTOHLL ||
9985 subr == DIF_SUBR_MEMREF ||
9986 subr == DIF_SUBR_TYPEREF)
9987 break;
9988
9989 err += efunc(pc, "invalid subr %u\n", subr);
9990 break;
9991
9992 default:
9993 err += efunc(pc, "invalid opcode %u\n",
9994 DIF_INSTR_OP(instr));
9995 }
9996 }
9997
9998 return (err);
9999}
10000
10001/*
10002 * Returns 1 if the expression in the DIF object can be cached on a per-thread
10003 * basis; 0 if not.
10004 */
10005static int
10006dtrace_difo_cacheable(dtrace_difo_t *dp)
10007{
10008 int i;
10009
10010 if (dp == NULL)
10011 return (0);
10012
10013 for (i = 0; i < dp->dtdo_varlen; i++) {
10014 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10015
10016 if (v->dtdv_scope != DIFV_SCOPE_GLOBAL)
10017 continue;
10018
10019 switch (v->dtdv_id) {
10020 case DIF_VAR_CURTHREAD:
10021 case DIF_VAR_PID:
10022 case DIF_VAR_TID:
10023 case DIF_VAR_EXECARGS:
10024 case DIF_VAR_EXECNAME:
10025 case DIF_VAR_ZONENAME:
10026 break;
10027
10028 default:
10029 return (0);
10030 }
10031 }
10032
10033 /*
10034 * This DIF object may be cacheable. Now we need to look for any
10035 * array loading instructions, any memory loading instructions, or
10036 * any stores to thread-local variables.
10037 */
10038 for (i = 0; i < dp->dtdo_len; i++) {
10039 uint_t op = DIF_INSTR_OP(dp->dtdo_buf[i]);
10040
10041 if ((op >= DIF_OP_LDSB && op <= DIF_OP_LDX) ||
10042 (op >= DIF_OP_ULDSB && op <= DIF_OP_ULDX) ||
10043 (op >= DIF_OP_RLDSB && op <= DIF_OP_RLDX) ||
10044 op == DIF_OP_LDGA || op == DIF_OP_STTS)
10045 return (0);
10046 }
10047
10048 return (1);
10049}
10050
10051static void
10052dtrace_difo_hold(dtrace_difo_t *dp)
10053{
10054 int i;
10055
10056 ASSERT(MUTEX_HELD(&dtrace_lock));
10057
10058 dp->dtdo_refcnt++;
10059 ASSERT(dp->dtdo_refcnt != 0);
10060
10061 /*
10062 * We need to check this DIF object for references to the variable
10063 * DIF_VAR_VTIMESTAMP.
10064 */
10065 for (i = 0; i < dp->dtdo_varlen; i++) {
10066 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10067
10068 if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
10069 continue;
10070
10071 if (dtrace_vtime_references++ == 0)
10072 dtrace_vtime_enable();
10073 }
10074}
10075
10076/*
10077 * This routine calculates the dynamic variable chunksize for a given DIF
10078 * object. The calculation is not fool-proof, and can probably be tricked by
10079 * malicious DIF -- but it works for all compiler-generated DIF. Because this
10080 * calculation is likely imperfect, dtrace_dynvar() is able to gracefully fail
10081 * if a dynamic variable size exceeds the chunksize.
10082 */
10083static void
10084dtrace_difo_chunksize(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10085{
10086 uint64_t sval = 0;
10087 dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
10088 const dif_instr_t *text = dp->dtdo_buf;
10089 uint_t pc, srd = 0;
10090 uint_t ttop = 0;
10091 size_t size, ksize;
10092 uint_t id, i;
10093
10094 for (pc = 0; pc < dp->dtdo_len; pc++) {
10095 dif_instr_t instr = text[pc];
10096 uint_t op = DIF_INSTR_OP(instr);
10097 uint_t rd = DIF_INSTR_RD(instr);
10098 uint_t r1 = DIF_INSTR_R1(instr);
10099 uint_t nkeys = 0;
10100 uchar_t scope = 0;
10101
10102 dtrace_key_t *key = tupregs;
10103
10104 switch (op) {
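		/*
		 * Remember the most recent setx constant and its destination
		 * register; a subsequent pushtr uses that value as the size
		 * of the tuple element being pushed.
		 */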
10105 case DIF_OP_SETX:
10106 sval = dp->dtdo_inttab[DIF_INSTR_INTEGER(instr)];
10107 srd = rd;
10108 continue;
10109
10110 case DIF_OP_STTS:
10111 key = &tupregs[DIF_DTR_NREGS];
10112 key[0].dttk_size = 0;
10113 key[1].dttk_size = 0;
10114 nkeys = 2;
10115 scope = DIFV_SCOPE_THREAD;
10116 break;
10117
10118 case DIF_OP_STGAA:
10119 case DIF_OP_STTAA:
10120 nkeys = ttop;
10121
10122 if (DIF_INSTR_OP(instr) == DIF_OP_STTAA)
10123 key[nkeys++].dttk_size = 0;
10124
10125 key[nkeys++].dttk_size = 0;
10126
10127 if (op == DIF_OP_STTAA) {
10128 scope = DIFV_SCOPE_THREAD;
10129 } else {
10130 scope = DIFV_SCOPE_GLOBAL;
10131 }
10132
10133 break;
10134
10135 case DIF_OP_PUSHTR:
10136 if (ttop == DIF_DTR_NREGS)
10137 return;
10138
10139 if ((srd == 0 || sval == 0) && r1 == DIF_TYPE_STRING) {
10140 /*
10141 * If the register for the size of the "pushtr"
10142 * is %r0 (or the value is 0) and the type is
10143 * a string, we'll use the system-wide default
10144 * string size.
10145 */
10146 tupregs[ttop++].dttk_size =
10147 dtrace_strsize_default;
10148 } else {
10149 if (srd == 0)
10150 return;
10151
10152 tupregs[ttop++].dttk_size = sval;
10153 }
10154
10155 break;
10156
10157 case DIF_OP_PUSHTV:
10158 if (ttop == DIF_DTR_NREGS)
10159 return;
10160
10161 tupregs[ttop++].dttk_size = 0;
10162 break;
10163
10164 case DIF_OP_FLUSHTS:
10165 ttop = 0;
10166 break;
10167
10168 case DIF_OP_POPTS:
10169 if (ttop != 0)
10170 ttop--;
10171 break;
10172 }
10173
10174 sval = 0;
10175 srd = 0;
10176
10177 if (nkeys == 0)
10178 continue;
10179
10180 /*
10181 * We have a dynamic variable allocation; calculate its size.
10182 */
10183 for (ksize = 0, i = 0; i < nkeys; i++)
10184 ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
10185
10186 size = sizeof (dtrace_dynvar_t);
10187 size += sizeof (dtrace_key_t) * (nkeys - 1);
10188 size += ksize;
10189
10190 /*
10191 * Now we need to determine the size of the stored data.
10192 */
10193 id = DIF_INSTR_VAR(instr);
10194
10195 for (i = 0; i < dp->dtdo_varlen; i++) {
10196 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10197
10198 if (v->dtdv_id == id && v->dtdv_scope == scope) {
10199 size += v->dtdv_type.dtdt_size;
10200 break;
10201 }
10202 }
10203
10204 if (i == dp->dtdo_varlen)
10205 return;
10206
10207 /*
10208 * We have the size. If this is larger than the chunk size
10209 * for our dynamic variable state, reset the chunk size.
10210 */
10211 size = P2ROUNDUP(size, sizeof (uint64_t));
10212
10213 if (size > vstate->dtvs_dynvars.dtds_chunksize)
10214 vstate->dtvs_dynvars.dtds_chunksize = size;
10215 }
10216}
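
/*
 * A worked example of the sizing above: for a thread-local store such as
 * self->a[pid, "tag"] = timestamp, DIF_OP_STTAA sees two pushed keys (pid
 * by value, size 0; the string by reference, dtrace_strsize_default
 * bytes, 256 unless retuned) plus the implicit thread and id keys, so
 * nkeys = 4 and ksize = P2ROUNDUP(256, 8) = 256.  The allocation is then
 *
 *	sizeof (dtrace_dynvar_t) + 3 * sizeof (dtrace_key_t) + 256 + 8
 *
 * rounded up to an 8-byte multiple; dtds_chunksize only ever grows, so
 * it ends up covering the largest variable any DIFO in the state can
 * create.
 */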
10217
10218static void
10219dtrace_difo_init(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10220{
10221 int i, oldsvars, osz, nsz, otlocals, ntlocals;
10222 uint_t id;
10223
10224 ASSERT(MUTEX_HELD(&dtrace_lock));
10225 ASSERT(dp->dtdo_buf != NULL && dp->dtdo_len != 0);
10226
10227 for (i = 0; i < dp->dtdo_varlen; i++) {
10228 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10229 dtrace_statvar_t *svar, ***svarp = NULL;
10230 size_t dsize = 0;
10231 uint8_t scope = v->dtdv_scope;
10232 int *np = NULL;
10233
10234 if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
10235 continue;
10236
10237 id -= DIF_VAR_OTHER_UBASE;
10238
10239 switch (scope) {
10240 case DIFV_SCOPE_THREAD:
10241 while (id >= (otlocals = vstate->dtvs_ntlocals)) {
10242 dtrace_difv_t *tlocals;
10243
10244 if ((ntlocals = (otlocals << 1)) == 0)
10245 ntlocals = 1;
10246
10247 osz = otlocals * sizeof (dtrace_difv_t);
10248 nsz = ntlocals * sizeof (dtrace_difv_t);
10249
10250 tlocals = kmem_zalloc(nsz, KM_SLEEP);
10251
10252 if (osz != 0) {
10253 bcopy(vstate->dtvs_tlocals,
10254 tlocals, osz);
10255 kmem_free(vstate->dtvs_tlocals, osz);
10256 }
10257
10258 vstate->dtvs_tlocals = tlocals;
10259 vstate->dtvs_ntlocals = ntlocals;
10260 }
10261
10262 vstate->dtvs_tlocals[id] = *v;
10263 continue;
10264
10265 case DIFV_SCOPE_LOCAL:
10266 np = &vstate->dtvs_nlocals;
10267 svarp = &vstate->dtvs_locals;
10268
10269 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
10270 dsize = NCPU * (v->dtdv_type.dtdt_size +
10271 sizeof (uint64_t));
10272 else
10273 dsize = NCPU * sizeof (uint64_t);
10274
10275 break;
10276
10277 case DIFV_SCOPE_GLOBAL:
10278 np = &vstate->dtvs_nglobals;
10279 svarp = &vstate->dtvs_globals;
10280
10281 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
10282 dsize = v->dtdv_type.dtdt_size +
10283 sizeof (uint64_t);
10284
10285 break;
10286
10287 default:
10288 ASSERT(0);
10289 }
10290
10291 while (id >= (oldsvars = *np)) {
10292 dtrace_statvar_t **statics;
10293 int newsvars, oldsize, newsize;
10294
10295 if ((newsvars = (oldsvars << 1)) == 0)
10296 newsvars = 1;
10297
10298 oldsize = oldsvars * sizeof (dtrace_statvar_t *);
10299 newsize = newsvars * sizeof (dtrace_statvar_t *);
10300
10301 statics = kmem_zalloc(newsize, KM_SLEEP);
10302
10303 if (oldsize != 0) {
10304 bcopy(*svarp, statics, oldsize);
10305 kmem_free(*svarp, oldsize);
10306 }
10307
10308 *svarp = statics;
10309 *np = newsvars;
10310 }
10311
10312 if ((svar = (*svarp)[id]) == NULL) {
10313 svar = kmem_zalloc(sizeof (dtrace_statvar_t), KM_SLEEP);
10314 svar->dtsv_var = *v;
10315
10316 if ((svar->dtsv_size = dsize) != 0) {
10317 svar->dtsv_data = (uint64_t)(uintptr_t)
10318 kmem_zalloc(dsize, KM_SLEEP);
10319 }
10320
10321 (*svarp)[id] = svar;
10322 }
10323
10324 svar->dtsv_refcnt++;
10325 }
10326
10327 dtrace_difo_chunksize(dp, vstate);
10328 dtrace_difo_hold(dp);
10329}
10330
10331static dtrace_difo_t *
10332dtrace_difo_duplicate(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10333{
10334 dtrace_difo_t *new;
10335 size_t sz;
10336
10337 ASSERT(dp->dtdo_buf != NULL);
10338 ASSERT(dp->dtdo_refcnt != 0);
10339
10340 new = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
10341
10342 ASSERT(dp->dtdo_buf != NULL);
10343 sz = dp->dtdo_len * sizeof (dif_instr_t);
10344 new->dtdo_buf = kmem_alloc(sz, KM_SLEEP);
10345 bcopy(dp->dtdo_buf, new->dtdo_buf, sz);
10346 new->dtdo_len = dp->dtdo_len;
10347
10348 if (dp->dtdo_strtab != NULL) {
10349 ASSERT(dp->dtdo_strlen != 0);
10350 new->dtdo_strtab = kmem_alloc(dp->dtdo_strlen, KM_SLEEP);
10351 bcopy(dp->dtdo_strtab, new->dtdo_strtab, dp->dtdo_strlen);
10352 new->dtdo_strlen = dp->dtdo_strlen;
10353 }
10354
10355 if (dp->dtdo_inttab != NULL) {
10356 ASSERT(dp->dtdo_intlen != 0);
10357 sz = dp->dtdo_intlen * sizeof (uint64_t);
10358 new->dtdo_inttab = kmem_alloc(sz, KM_SLEEP);
10359 bcopy(dp->dtdo_inttab, new->dtdo_inttab, sz);
10360 new->dtdo_intlen = dp->dtdo_intlen;
10361 }
10362
10363 if (dp->dtdo_vartab != NULL) {
10364 ASSERT(dp->dtdo_varlen != 0);
10365 sz = dp->dtdo_varlen * sizeof (dtrace_difv_t);
10366 new->dtdo_vartab = kmem_alloc(sz, KM_SLEEP);
10367 bcopy(dp->dtdo_vartab, new->dtdo_vartab, sz);
10368 new->dtdo_varlen = dp->dtdo_varlen;
10369 }
10370
10371 dtrace_difo_init(new, vstate);
10372 return (new);
10373}
10374
10375static void
10376dtrace_difo_destroy(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10377{
10378 int i;
10379
10380 ASSERT(dp->dtdo_refcnt == 0);
10381
10382 for (i = 0; i < dp->dtdo_varlen; i++) {
10383 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10384 dtrace_statvar_t *svar, **svarp = NULL;
10385 uint_t id;
10386 uint8_t scope = v->dtdv_scope;
10387 int *np = NULL;
10388
10389 switch (scope) {
10390 case DIFV_SCOPE_THREAD:
10391 continue;
10392
10393 case DIFV_SCOPE_LOCAL:
10394 np = &vstate->dtvs_nlocals;
10395 svarp = vstate->dtvs_locals;
10396 break;
10397
10398 case DIFV_SCOPE_GLOBAL:
10399 np = &vstate->dtvs_nglobals;
10400 svarp = vstate->dtvs_globals;
10401 break;
10402
10403 default:
10404 ASSERT(0);
10405 }
10406
10407 if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
10408 continue;
10409
10410 id -= DIF_VAR_OTHER_UBASE;
10411 ASSERT(id < *np);
10412
10413 svar = svarp[id];
10414 ASSERT(svar != NULL);
10415 ASSERT(svar->dtsv_refcnt > 0);
10416
10417 if (--svar->dtsv_refcnt > 0)
10418 continue;
10419
10420 if (svar->dtsv_size != 0) {
10421 ASSERT(svar->dtsv_data != 0);
10422 kmem_free((void *)(uintptr_t)svar->dtsv_data,
10423 svar->dtsv_size);
10424 }
10425
10426 kmem_free(svar, sizeof (dtrace_statvar_t));
10427 svarp[id] = NULL;
10428 }
10429
10430 if (dp->dtdo_buf != NULL)
10431 kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
10432 if (dp->dtdo_inttab != NULL)
10433 kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
10434 if (dp->dtdo_strtab != NULL)
10435 kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
10436 if (dp->dtdo_vartab != NULL)
10437 kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
10438
10439 kmem_free(dp, sizeof (dtrace_difo_t));
10440}
10441
10442static void
10443dtrace_difo_release(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10444{
10445 int i;
10446
10447 ASSERT(MUTEX_HELD(&dtrace_lock));
10448 ASSERT(dp->dtdo_refcnt != 0);
10449
10450 for (i = 0; i < dp->dtdo_varlen; i++) {
10451 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10452
10453 if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
10454 continue;
10455
10456 ASSERT(dtrace_vtime_references > 0);
10457 if (--dtrace_vtime_references == 0)
10458 dtrace_vtime_disable();
10459 }
10460
10461 if (--dp->dtdo_refcnt == 0)
10462 dtrace_difo_destroy(dp, vstate);
10463}
10464
10465/*
10466 * DTrace Format Functions
10467 */
10468static uint16_t
10469dtrace_format_add(dtrace_state_t *state, char *str)
10470{
10471 char *fmt, **new;
10472 uint16_t ndx, len = strlen(str) + 1;
10473
10474 fmt = kmem_zalloc(len, KM_SLEEP);
10475 bcopy(str, fmt, len);
10476
10477 for (ndx = 0; ndx < state->dts_nformats; ndx++) {
10478 if (state->dts_formats[ndx] == NULL) {
10479 state->dts_formats[ndx] = fmt;
10480 return (ndx + 1);
10481 }
10482 }
10483
10484 if (state->dts_nformats == USHRT_MAX) {
10485 /*
10486 * This is only likely if a denial-of-service attack is being
10487 * attempted. As such, it's okay to fail silently here.
10488 */
10489 kmem_free(fmt, len);
10490 return (0);
10491 }
10492
10493 /*
10494 * For simplicity, we always resize the formats array to be exactly the
10495 * number of formats.
10496 */
10497 ndx = state->dts_nformats++;
10498 new = kmem_alloc((ndx + 1) * sizeof (char *), KM_SLEEP);
10499
10500 if (state->dts_formats != NULL) {
10501 ASSERT(ndx != 0);
10502 bcopy(state->dts_formats, new, ndx * sizeof (char *));
10503 kmem_free(state->dts_formats, ndx * sizeof (char *));
10504 }
10505
10506 state->dts_formats = new;
10507 state->dts_formats[ndx] = fmt;
10508
10509 return (ndx + 1);
10510}
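
/*
 * Format indices handed out above are 1-based so that zero can mean
 * "no format".  A sketch of the expected usage, assuming dtrace_lock is
 * held and state is a valid consumer state:
 */
#if 0
	uint16_t fmt = dtrace_format_add(state, "%d bytes\n");

	if (fmt != 0) {
		/* ... record fmt in a dtrace_recdesc_t ... */
		dtrace_format_remove(state, fmt);	/* frees slot fmt - 1 */
	}
#endif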
10511
10512static void
10513dtrace_format_remove(dtrace_state_t *state, uint16_t format)
10514{
10515 char *fmt;
10516
10517 ASSERT(state->dts_formats != NULL);
10518 ASSERT(format <= state->dts_nformats);
10519 ASSERT(state->dts_formats[format - 1] != NULL);
10520
10521 fmt = state->dts_formats[format - 1];
10522 kmem_free(fmt, strlen(fmt) + 1);
10523 state->dts_formats[format - 1] = NULL;
10524}
10525
10526static void
10527dtrace_format_destroy(dtrace_state_t *state)
10528{
10529 int i;
10530
10531 if (state->dts_nformats == 0) {
10532 ASSERT(state->dts_formats == NULL);
10533 return;
10534 }
10535
10536 ASSERT(state->dts_formats != NULL);
10537
10538 for (i = 0; i < state->dts_nformats; i++) {
10539 char *fmt = state->dts_formats[i];
10540
10541 if (fmt == NULL)
10542 continue;
10543
10544 kmem_free(fmt, strlen(fmt) + 1);
10545 }
10546
10547 kmem_free(state->dts_formats, state->dts_nformats * sizeof (char *));
10548 state->dts_nformats = 0;
10549 state->dts_formats = NULL;
10550}
10551
10552/*
10553 * DTrace Predicate Functions
10554 */
10555static dtrace_predicate_t *
10556dtrace_predicate_create(dtrace_difo_t *dp)
10557{
10558 dtrace_predicate_t *pred;
10559
10560 ASSERT(MUTEX_HELD(&dtrace_lock));
10561 ASSERT(dp->dtdo_refcnt != 0);
10562
10563 pred = kmem_zalloc(sizeof (dtrace_predicate_t), KM_SLEEP);
10564 pred->dtp_difo = dp;
10565 pred->dtp_refcnt = 1;
10566
10567 if (!dtrace_difo_cacheable(dp))
10568 return (pred);
10569
10570 if (dtrace_predcache_id == DTRACE_CACHEIDNONE) {
10571 /*
10572 * This is only theoretically possible -- we have had 2^32
10573 * cacheable predicates on this machine. We cannot allow any
10574 * more predicates to become cacheable: as unlikely as it is,
10575 * there may be a thread caching a (now stale) predicate cache
10576 * ID. (N.B.: the temptation is being successfully resisted to
10577 * have this cmn_err() "Holy shit -- we executed this code!")
10578 */
10579 return (pred);
10580 }
10581
10582 pred->dtp_cacheid = dtrace_predcache_id++;
10583
10584 return (pred);
10585}
10586
10587static void
10588dtrace_predicate_hold(dtrace_predicate_t *pred)
10589{
10590 ASSERT(MUTEX_HELD(&dtrace_lock));
10591 ASSERT(pred->dtp_difo != NULL && pred->dtp_difo->dtdo_refcnt != 0);
10592 ASSERT(pred->dtp_refcnt > 0);
10593
10594 pred->dtp_refcnt++;
10595}
10596
10597static void
10598dtrace_predicate_release(dtrace_predicate_t *pred, dtrace_vstate_t *vstate)
10599{
10600 dtrace_difo_t *dp = pred->dtp_difo;
10601
10602 ASSERT(MUTEX_HELD(&dtrace_lock));
10603 ASSERT(dp != NULL && dp->dtdo_refcnt != 0);
10604 ASSERT(pred->dtp_refcnt > 0);
10605
10606 if (--pred->dtp_refcnt == 0) {
10607 dtrace_difo_release(pred->dtp_difo, vstate);
10608 kmem_free(pred, sizeof (dtrace_predicate_t));
10609 }
10610}
10611
10612/*
10613 * DTrace Action Description Functions
10614 */
10615static dtrace_actdesc_t *
10616dtrace_actdesc_create(dtrace_actkind_t kind, uint32_t ntuple,
10617 uint64_t uarg, uint64_t arg)
10618{
10619 dtrace_actdesc_t *act;
10620
10621#if defined(sun)
10622 ASSERT(!DTRACEACT_ISPRINTFLIKE(kind) || (arg != NULL &&
10623 arg >= KERNELBASE) || (arg == NULL && kind == DTRACEACT_PRINTA));
10624#endif
10625
10626 act = kmem_zalloc(sizeof (dtrace_actdesc_t), KM_SLEEP);
10627 act->dtad_kind = kind;
10628 act->dtad_ntuple = ntuple;
10629 act->dtad_uarg = uarg;
10630 act->dtad_arg = arg;
10631 act->dtad_refcnt = 1;
10632
10633 return (act);
10634}
10635
10636static void
10637dtrace_actdesc_hold(dtrace_actdesc_t *act)
10638{
10639 ASSERT(act->dtad_refcnt >= 1);
10640 act->dtad_refcnt++;
10641}
10642
10643static void
10644dtrace_actdesc_release(dtrace_actdesc_t *act, dtrace_vstate_t *vstate)
10645{
10646 dtrace_actkind_t kind = act->dtad_kind;
10647 dtrace_difo_t *dp;
10648
10649 ASSERT(act->dtad_refcnt >= 1);
10650
10651 if (--act->dtad_refcnt != 0)
10652 return;
10653
10654 if ((dp = act->dtad_difo) != NULL)
10655 dtrace_difo_release(dp, vstate);
10656
10657 if (DTRACEACT_ISPRINTFLIKE(kind)) {
10658 char *str = (char *)(uintptr_t)act->dtad_arg;
10659
10660#if defined(sun)
10661 ASSERT((str != NULL && (uintptr_t)str >= KERNELBASE) ||
10662 (str == NULL && act->dtad_kind == DTRACEACT_PRINTA));
10663#endif
10664
10665 if (str != NULL)
10666 kmem_free(str, strlen(str) + 1);
10667 }
10668
10669 kmem_free(act, sizeof (dtrace_actdesc_t));
10670}
10671
10672/*
10673 * DTrace ECB Functions
10674 */
10675static dtrace_ecb_t *
10676dtrace_ecb_add(dtrace_state_t *state, dtrace_probe_t *probe)
10677{
10678 dtrace_ecb_t *ecb;
10679 dtrace_epid_t epid;
10680
10681 ASSERT(MUTEX_HELD(&dtrace_lock));
10682
10683 ecb = kmem_zalloc(sizeof (dtrace_ecb_t), KM_SLEEP);
10684 ecb->dte_predicate = NULL;
10685 ecb->dte_probe = probe;
10686
10687 /*
10688 * The default size is the size of the default action: recording
10689 * the header.
10690 */
10691 ecb->dte_size = ecb->dte_needed = sizeof (dtrace_rechdr_t);
10692 ecb->dte_alignment = sizeof (dtrace_epid_t);
10693
10694 epid = state->dts_epid++;
10695
10696 if (epid - 1 >= state->dts_necbs) {
10697 dtrace_ecb_t **oecbs = state->dts_ecbs, **ecbs;
10698 int necbs = state->dts_necbs << 1;
10699
10700 ASSERT(epid == state->dts_necbs + 1);
10701
10702 if (necbs == 0) {
10703 ASSERT(oecbs == NULL);
10704 necbs = 1;
10705 }
10706
10707 ecbs = kmem_zalloc(necbs * sizeof (*ecbs), KM_SLEEP);
10708
10709 if (oecbs != NULL)
10710 bcopy(oecbs, ecbs, state->dts_necbs * sizeof (*ecbs));
10711
10712 dtrace_membar_producer();
10713 state->dts_ecbs = ecbs;
10714
10715 if (oecbs != NULL) {
10716 /*
10717 * If this state is active, we must dtrace_sync()
10718 * before we can free the old dts_ecbs array: we're
10719 * coming in hot, and there may be active ring
10720 * buffer processing (which indexes into the dts_ecbs
10721 * array) on another CPU.
10722 */
10723 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
10724 dtrace_sync();
10725
10726 kmem_free(oecbs, state->dts_necbs * sizeof (*ecbs));
10727 }
10728
10729 dtrace_membar_producer();
10730 state->dts_necbs = necbs;
10731 }
10732
10733 ecb->dte_state = state;
10734
10735 ASSERT(state->dts_ecbs[epid - 1] == NULL);
10736 dtrace_membar_producer();
10737 state->dts_ecbs[(ecb->dte_epid = epid) - 1] = ecb;
10738
10739 return (ecb);
10740}
10741
10742static void
10743dtrace_ecb_enable(dtrace_ecb_t *ecb)
10744{
10745 dtrace_probe_t *probe = ecb->dte_probe;
10746
10747 ASSERT(MUTEX_HELD(&cpu_lock));
10748 ASSERT(MUTEX_HELD(&dtrace_lock));
10749 ASSERT(ecb->dte_next == NULL);
10750
10751 if (probe == NULL) {
10752 /*
10753 * This is the NULL probe -- there's nothing to do.
10754 */
10755 return;
10756 }
10757
10758 if (probe->dtpr_ecb == NULL) {
10759 dtrace_provider_t *prov = probe->dtpr_provider;
10760
10761 /*
10762 * We're the first ECB on this probe.
10763 */
10764 probe->dtpr_ecb = probe->dtpr_ecb_last = ecb;
10765
10766 if (ecb->dte_predicate != NULL)
10767 probe->dtpr_predcache = ecb->dte_predicate->dtp_cacheid;
10768
10769 prov->dtpv_pops.dtps_enable(prov->dtpv_arg,
10770 probe->dtpr_id, probe->dtpr_arg);
10771 } else {
10772 /*
10773 * This probe is already active. Swing the last pointer to
10774 * point to the new ECB, and issue a dtrace_sync() to assure
10775 * that all CPUs have seen the change.
10776 */
10777 ASSERT(probe->dtpr_ecb_last != NULL);
10778 probe->dtpr_ecb_last->dte_next = ecb;
10779 probe->dtpr_ecb_last = ecb;
10780 probe->dtpr_predcache = 0;
10781
10782 dtrace_sync();
10783 }
10784}
10785
10786static void
10787dtrace_ecb_resize(dtrace_ecb_t *ecb)
10788{
10789 dtrace_action_t *act;
10790 uint32_t curneeded = UINT32_MAX;
10791 uint32_t aggbase = UINT32_MAX;
10792
10793 /*
10794 * If we record anything, we always record the dtrace_rechdr_t. (And
10795 * we always record it first.)
10796 */
10797 ecb->dte_size = sizeof (dtrace_rechdr_t);
10798 ecb->dte_alignment = sizeof (dtrace_epid_t);
10799
10800 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
10801 dtrace_recdesc_t *rec = &act->dta_rec;
10802 ASSERT(rec->dtrd_size > 0 || rec->dtrd_alignment == 1);
10803
10804 ecb->dte_alignment = MAX(ecb->dte_alignment,
10805 rec->dtrd_alignment);
10806
10807 if (DTRACEACT_ISAGG(act->dta_kind)) {
10808 dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
10809
10810 ASSERT(rec->dtrd_size != 0);
10811 ASSERT(agg->dtag_first != NULL);
10812 ASSERT(act->dta_prev->dta_intuple);
10813 ASSERT(aggbase != UINT32_MAX);
10814 ASSERT(curneeded != UINT32_MAX);
10815
10816 agg->dtag_base = aggbase;
10817
10818 curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment);
10819 rec->dtrd_offset = curneeded;
10820 curneeded += rec->dtrd_size;
10821 ecb->dte_needed = MAX(ecb->dte_needed, curneeded);
10822
10823 aggbase = UINT32_MAX;
10824 curneeded = UINT32_MAX;
10825 } else if (act->dta_intuple) {
10826 if (curneeded == UINT32_MAX) {
10827 /*
10828 * This is the first record in a tuple. Align
10829 * curneeded to be at offset 4 in an 8-byte
10830 * aligned block.
10831 */
10832 ASSERT(act->dta_prev == NULL ||
10833 !act->dta_prev->dta_intuple);
10834 ASSERT3U(aggbase, ==, UINT32_MAX);
10835 curneeded = P2PHASEUP(ecb->dte_size,
10836 sizeof (uint64_t), sizeof (dtrace_aggid_t));
10837
10838 aggbase = curneeded - sizeof (dtrace_aggid_t);
10839 ASSERT(IS_P2ALIGNED(aggbase,
10840 sizeof (uint64_t)));
10841 }
10842 curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment);
10843 rec->dtrd_offset = curneeded;
10844 curneeded += rec->dtrd_size;
10845 } else {
			/*
			 * A tuple must be followed by an aggregation, so a
			 * plain (non-tuple, non-aggregating) action can
			 * never immediately follow a tuple record.
			 */
10847 ASSERT(act->dta_prev == NULL ||
10848 !act->dta_prev->dta_intuple);
10849
10850 ecb->dte_size = P2ROUNDUP(ecb->dte_size,
10851 rec->dtrd_alignment);
10852 rec->dtrd_offset = ecb->dte_size;
10853 ecb->dte_size += rec->dtrd_size;
10854 ecb->dte_needed = MAX(ecb->dte_needed, ecb->dte_size);
10855 }
10856 }
10857
10858 if ((act = ecb->dte_action) != NULL &&
10859 !(act->dta_kind == DTRACEACT_SPECULATE && act->dta_next == NULL) &&
10860 ecb->dte_size == sizeof (dtrace_rechdr_t)) {
10861 /*
10862 * If the size is still sizeof (dtrace_rechdr_t), then all
10863 * actions store no data; set the size to 0.
10864 */
10865 ecb->dte_size = 0;
10866 }
10867
10868 ecb->dte_size = P2ROUNDUP(ecb->dte_size, sizeof (dtrace_epid_t));
10869 ecb->dte_needed = P2ROUNDUP(ecb->dte_needed, (sizeof (dtrace_epid_t)));
10870 ecb->dte_state->dts_needed = MAX(ecb->dte_state->dts_needed,
10871 ecb->dte_needed);
10872}
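
/*
 * A worked example of the layout above, assuming the 12-byte
 * dtrace_rechdr_t of this vintage: a clause recording one 8-byte,
 * 8-aligned value (e.g. trace(timestamp)) starts with dte_size = 12,
 * rounds the record offset up to 16, and stores the value at [16, 24).
 * The final P2ROUNDUP() to sizeof (dtrace_epid_t) leaves dte_size = 24
 * with dte_alignment = 8.
 */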
10873
10874static dtrace_action_t *
10875dtrace_ecb_aggregation_create(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
10876{
10877 dtrace_aggregation_t *agg;
10878 size_t size = sizeof (uint64_t);
10879 int ntuple = desc->dtad_ntuple;
10880 dtrace_action_t *act;
10881 dtrace_recdesc_t *frec;
10882 dtrace_aggid_t aggid;
10883 dtrace_state_t *state = ecb->dte_state;
10884
10885 agg = kmem_zalloc(sizeof (dtrace_aggregation_t), KM_SLEEP);
10886 agg->dtag_ecb = ecb;
10887
10888 ASSERT(DTRACEACT_ISAGG(desc->dtad_kind));
10889
10890 switch (desc->dtad_kind) {
10891 case DTRACEAGG_MIN:
10892 agg->dtag_initial = INT64_MAX;
10893 agg->dtag_aggregate = dtrace_aggregate_min;
10894 break;
10895
10896 case DTRACEAGG_MAX:
10897 agg->dtag_initial = INT64_MIN;
10898 agg->dtag_aggregate = dtrace_aggregate_max;
10899 break;
10900
10901 case DTRACEAGG_COUNT:
10902 agg->dtag_aggregate = dtrace_aggregate_count;
10903 break;
10904
10905 case DTRACEAGG_QUANTIZE:
10906 agg->dtag_aggregate = dtrace_aggregate_quantize;
10907 size = (((sizeof (uint64_t) * NBBY) - 1) * 2 + 1) *
10908 sizeof (uint64_t);
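		/*
		 * Concretely: 64 bit positions yield (64 - 1) * 2 + 1 = 127
		 * power-of-two buckets (negative, zero and positive), or
		 * 127 * 8 = 1016 bytes of state per aggregation key.
		 */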
10909 break;
10910
10911 case DTRACEAGG_LQUANTIZE: {
10912 uint16_t step = DTRACE_LQUANTIZE_STEP(desc->dtad_arg);
10913 uint16_t levels = DTRACE_LQUANTIZE_LEVELS(desc->dtad_arg);
10914
10915 agg->dtag_initial = desc->dtad_arg;
10916 agg->dtag_aggregate = dtrace_aggregate_lquantize;
10917
10918 if (step == 0 || levels == 0)
10919 goto err;
10920
10921 size = levels * sizeof (uint64_t) + 3 * sizeof (uint64_t);
10922 break;
10923 }
10924
10925 case DTRACEAGG_LLQUANTIZE: {
10926 uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(desc->dtad_arg);
10927 uint16_t low = DTRACE_LLQUANTIZE_LOW(desc->dtad_arg);
10928 uint16_t high = DTRACE_LLQUANTIZE_HIGH(desc->dtad_arg);
10929 uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(desc->dtad_arg);
10930 int64_t v;
10931
10932 agg->dtag_initial = desc->dtad_arg;
10933 agg->dtag_aggregate = dtrace_aggregate_llquantize;
10934
10935 if (factor < 2 || low >= high || nsteps < factor)
10936 goto err;
10937
10938 /*
10939 * Now check that the number of steps evenly divides a power
10940 * of the factor. (This assures both integer bucket size and
10941 * linearity within each magnitude.)
10942 */
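		/*
		 * For example, factor = 10 with nsteps = 20 passes: v walks
		 * 10, 100; 100 % 20 == 0 and 20 % 10 == 0.  By contrast,
		 * nsteps = 15 fails: no power of 10 is divisible by 15.
		 */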
10943 for (v = factor; v < nsteps; v *= factor)
10944 continue;
10945
10946 if ((v % nsteps) || (nsteps % factor))
10947 goto err;
10948
10949 size = (dtrace_aggregate_llquantize_bucket(factor,
10950 low, high, nsteps, INT64_MAX) + 2) * sizeof (uint64_t);
10951 break;
10952 }
10953
10954 case DTRACEAGG_AVG:
10955 agg->dtag_aggregate = dtrace_aggregate_avg;
10956 size = sizeof (uint64_t) * 2;
10957 break;
10958
10959 case DTRACEAGG_STDDEV:
10960 agg->dtag_aggregate = dtrace_aggregate_stddev;
10961 size = sizeof (uint64_t) * 4;
10962 break;
10963
10964 case DTRACEAGG_SUM:
10965 agg->dtag_aggregate = dtrace_aggregate_sum;
10966 break;
10967
10968 default:
10969 goto err;
10970 }
10971
10972 agg->dtag_action.dta_rec.dtrd_size = size;
10973
10974 if (ntuple == 0)
10975 goto err;
10976
10977 /*
10978 * We must make sure that we have enough actions for the n-tuple.
10979 */
10980 for (act = ecb->dte_action_last; act != NULL; act = act->dta_prev) {
10981 if (DTRACEACT_ISAGG(act->dta_kind))
10982 break;
10983
10984 if (--ntuple == 0) {
10985 /*
10986 * This is the action with which our n-tuple begins.
10987 */
10988 agg->dtag_first = act;
10989 goto success;
10990 }
10991 }
10992
10993 /*
10994 * This n-tuple is short by ntuple elements. Return failure.
10995 */
10996 ASSERT(ntuple != 0);
10997err:
10998 kmem_free(agg, sizeof (dtrace_aggregation_t));
10999 return (NULL);
11000
11001success:
11002 /*
11003 * If the last action in the tuple has a size of zero, it's actually
11004 * an expression argument for the aggregating action.
11005 */
11006 ASSERT(ecb->dte_action_last != NULL);
11007 act = ecb->dte_action_last;
11008
11009 if (act->dta_kind == DTRACEACT_DIFEXPR) {
11010 ASSERT(act->dta_difo != NULL);
11011
11012 if (act->dta_difo->dtdo_rtype.dtdt_size == 0)
11013 agg->dtag_hasarg = 1;
11014 }
11015
11016 /*
11017 * We need to allocate an id for this aggregation.
11018 */
11019#if defined(sun)
11020 aggid = (dtrace_aggid_t)(uintptr_t)vmem_alloc(state->dts_aggid_arena, 1,
11021 VM_BESTFIT | VM_SLEEP);
11022#else
11023 aggid = alloc_unr(state->dts_aggid_arena);
11024#endif
11025
11026 if (aggid - 1 >= state->dts_naggregations) {
11027 dtrace_aggregation_t **oaggs = state->dts_aggregations;
11028 dtrace_aggregation_t **aggs;
11029 int naggs = state->dts_naggregations << 1;
11030 int onaggs = state->dts_naggregations;
11031
11032 ASSERT(aggid == state->dts_naggregations + 1);
11033
11034 if (naggs == 0) {
11035 ASSERT(oaggs == NULL);
11036 naggs = 1;
11037 }
11038
11039 aggs = kmem_zalloc(naggs * sizeof (*aggs), KM_SLEEP);
11040
11041 if (oaggs != NULL) {
11042 bcopy(oaggs, aggs, onaggs * sizeof (*aggs));
11043 kmem_free(oaggs, onaggs * sizeof (*aggs));
11044 }
11045
11046 state->dts_aggregations = aggs;
11047 state->dts_naggregations = naggs;
11048 }
11049
11050 ASSERT(state->dts_aggregations[aggid - 1] == NULL);
11051 state->dts_aggregations[(agg->dtag_id = aggid) - 1] = agg;
11052
11053 frec = &agg->dtag_first->dta_rec;
11054 if (frec->dtrd_alignment < sizeof (dtrace_aggid_t))
11055 frec->dtrd_alignment = sizeof (dtrace_aggid_t);
11056
11057 for (act = agg->dtag_first; act != NULL; act = act->dta_next) {
11058 ASSERT(!act->dta_intuple);
11059 act->dta_intuple = 1;
11060 }
11061
11062 return (&agg->dtag_action);
11063}
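
/*
 * By way of example, @a[pid, execname] = count() arrives here with two
 * DTRACEACT_DIFEXPR actions (the tuple keys) ahead of the aggregating
 * action and ntuple == 2: the backward walk above lands dtag_first on
 * the "pid" record, and each key record is then marked dta_intuple so
 * that dtrace_ecb_resize() lays the tuple out contiguously behind the
 * aggregation ID.
 */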
11064
11065static void
11066dtrace_ecb_aggregation_destroy(dtrace_ecb_t *ecb, dtrace_action_t *act)
11067{
11068 dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
11069 dtrace_state_t *state = ecb->dte_state;
11070 dtrace_aggid_t aggid = agg->dtag_id;
11071
11072 ASSERT(DTRACEACT_ISAGG(act->dta_kind));
11073#if defined(sun)
11074 vmem_free(state->dts_aggid_arena, (void *)(uintptr_t)aggid, 1);
11075#else
11076 free_unr(state->dts_aggid_arena, aggid);
11077#endif
11078
11079 ASSERT(state->dts_aggregations[aggid - 1] == agg);
11080 state->dts_aggregations[aggid - 1] = NULL;
11081
11082 kmem_free(agg, sizeof (dtrace_aggregation_t));
11083}
11084
11085static int
11086dtrace_ecb_action_add(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
11087{
11088 dtrace_action_t *action, *last;
11089 dtrace_difo_t *dp = desc->dtad_difo;
11090 uint32_t size = 0, align = sizeof (uint8_t), mask;
11091 uint16_t format = 0;
11092 dtrace_recdesc_t *rec;
11093 dtrace_state_t *state = ecb->dte_state;
11094 dtrace_optval_t *opt = state->dts_options, nframes = 0, strsize;
11095 uint64_t arg = desc->dtad_arg;
11096
11097 ASSERT(MUTEX_HELD(&dtrace_lock));
11098 ASSERT(ecb->dte_action == NULL || ecb->dte_action->dta_refcnt == 1);
11099
11100 if (DTRACEACT_ISAGG(desc->dtad_kind)) {
11101 /*
11102 * If this is an aggregating action, there must be neither
11103 * a speculate nor a commit on the action chain.
11104 */
11105 dtrace_action_t *act;
11106
11107 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
11108 if (act->dta_kind == DTRACEACT_COMMIT)
11109 return (EINVAL);
11110
11111 if (act->dta_kind == DTRACEACT_SPECULATE)
11112 return (EINVAL);
11113 }
11114
11115 action = dtrace_ecb_aggregation_create(ecb, desc);
11116
11117 if (action == NULL)
11118 return (EINVAL);
11119 } else {
11120 if (DTRACEACT_ISDESTRUCTIVE(desc->dtad_kind) ||
11121 (desc->dtad_kind == DTRACEACT_DIFEXPR &&
11122 dp != NULL && dp->dtdo_destructive)) {
11123 state->dts_destructive = 1;
11124 }
11125
11126 switch (desc->dtad_kind) {
11127 case DTRACEACT_PRINTF:
11128 case DTRACEACT_PRINTA:
11129 case DTRACEACT_SYSTEM:
11130 case DTRACEACT_FREOPEN:
11131 case DTRACEACT_DIFEXPR:
11132 /*
11133 * We know that our arg is a string -- turn it into a
11134 * format.
11135 */
11136 if (arg == 0) {
11137 ASSERT(desc->dtad_kind == DTRACEACT_PRINTA ||
11138 desc->dtad_kind == DTRACEACT_DIFEXPR);
11139 format = 0;
11140 } else {
11141 ASSERT(arg != 0);
11142#if defined(sun)
11143 ASSERT(arg > KERNELBASE);
11144#endif
11145 format = dtrace_format_add(state,
11146 (char *)(uintptr_t)arg);
11147 }
11148
11149 /*FALLTHROUGH*/
11150 case DTRACEACT_LIBACT:
11151 case DTRACEACT_TRACEMEM:
11152 case DTRACEACT_TRACEMEM_DYNSIZE:
11153 if (dp == NULL)
11154 return (EINVAL);
11155
11156 if ((size = dp->dtdo_rtype.dtdt_size) != 0)
11157 break;
11158
11159 if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) {
11160 if (!(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
11161 return (EINVAL);
11162
11163 size = opt[DTRACEOPT_STRSIZE];
11164 }
11165
11166 break;
11167
11168 case DTRACEACT_STACK:
11169 if ((nframes = arg) == 0) {
11170 nframes = opt[DTRACEOPT_STACKFRAMES];
11171 ASSERT(nframes > 0);
11172 arg = nframes;
11173 }
11174
11175 size = nframes * sizeof (pc_t);
11176 break;
11177
11178 case DTRACEACT_JSTACK:
11179 if ((strsize = DTRACE_USTACK_STRSIZE(arg)) == 0)
11180 strsize = opt[DTRACEOPT_JSTACKSTRSIZE];
11181
11182 if ((nframes = DTRACE_USTACK_NFRAMES(arg)) == 0)
11183 nframes = opt[DTRACEOPT_JSTACKFRAMES];
11184
11185 arg = DTRACE_USTACK_ARG(nframes, strsize);
11186
11187 /*FALLTHROUGH*/
11188 case DTRACEACT_USTACK:
11189 if (desc->dtad_kind != DTRACEACT_JSTACK &&
11190 (nframes = DTRACE_USTACK_NFRAMES(arg)) == 0) {
11191 strsize = DTRACE_USTACK_STRSIZE(arg);
11192 nframes = opt[DTRACEOPT_USTACKFRAMES];
11193 ASSERT(nframes > 0);
11194 arg = DTRACE_USTACK_ARG(nframes, strsize);
11195 }
11196
11197 /*
11198 * Save a slot for the pid.
11199 */
11200 size = (nframes + 1) * sizeof (uint64_t);
11201 size += DTRACE_USTACK_STRSIZE(arg);
11202 size = P2ROUNDUP(size, (uint32_t)(sizeof (uintptr_t)));
11203
11204 break;
11205
11206 case DTRACEACT_SYM:
11207 case DTRACEACT_MOD:
11208 if (dp == NULL || ((size = dp->dtdo_rtype.dtdt_size) !=
11209 sizeof (uint64_t)) ||
11210 (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
11211 return (EINVAL);
11212 break;
11213
11214 case DTRACEACT_USYM:
11215 case DTRACEACT_UMOD:
11216 case DTRACEACT_UADDR:
11217 if (dp == NULL ||
11218 (dp->dtdo_rtype.dtdt_size != sizeof (uint64_t)) ||
11219 (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
11220 return (EINVAL);
11221
11222 /*
11223 * We have a slot for the pid, plus a slot for the
11224 * argument. To keep things simple (aligned with
11225 * bitness-neutral sizing), we store each as a 64-bit
11226 * quantity.
11227 */
11228 size = 2 * sizeof (uint64_t);
11229 break;
11230
11231 case DTRACEACT_STOP:
11232 case DTRACEACT_BREAKPOINT:
11233 case DTRACEACT_PANIC:
11234 break;
11235
11236 case DTRACEACT_CHILL:
11237 case DTRACEACT_DISCARD:
11238 case DTRACEACT_RAISE:
11239 if (dp == NULL)
11240 return (EINVAL);
11241 break;
11242
11243 case DTRACEACT_EXIT:
11244 if (dp == NULL ||
11245 (size = dp->dtdo_rtype.dtdt_size) != sizeof (int) ||
11246 (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
11247 return (EINVAL);
11248 break;
11249
11250 case DTRACEACT_SPECULATE:
11251 if (ecb->dte_size > sizeof (dtrace_rechdr_t))
11252 return (EINVAL);
11253
11254 if (dp == NULL)
11255 return (EINVAL);
11256
11257 state->dts_speculates = 1;
11258 break;
11259
11260 case DTRACEACT_PRINTM:
11261 size = dp->dtdo_rtype.dtdt_size;
11262 break;
11263
11264 case DTRACEACT_PRINTT:
11265 size = dp->dtdo_rtype.dtdt_size;
11266 break;
11267
11268 case DTRACEACT_COMMIT: {
11269 dtrace_action_t *act = ecb->dte_action;
11270
11271 for (; act != NULL; act = act->dta_next) {
11272 if (act->dta_kind == DTRACEACT_COMMIT)
11273 return (EINVAL);
11274 }
11275
11276 if (dp == NULL)
11277 return (EINVAL);
11278 break;
11279 }
11280
11281 default:
11282 return (EINVAL);
11283 }
11284
11285 if (size != 0 || desc->dtad_kind == DTRACEACT_SPECULATE) {
11286 /*
11287 * If this is a data-storing action or a speculate,
11288 * we must be sure that there isn't a commit on the
11289 * action chain.
11290 */
11291 dtrace_action_t *act = ecb->dte_action;
11292
11293 for (; act != NULL; act = act->dta_next) {
11294 if (act->dta_kind == DTRACEACT_COMMIT)
11295 return (EINVAL);
11296 }
11297 }
11298
11299 action = kmem_zalloc(sizeof (dtrace_action_t), KM_SLEEP);
11300 action->dta_rec.dtrd_size = size;
11301 }
11302
11303 action->dta_refcnt = 1;
11304 rec = &action->dta_rec;
11305 size = rec->dtrd_size;
11306
11307 for (mask = sizeof (uint64_t) - 1; size != 0 && mask > 0; mask >>= 1) {
11308 if (!(size & mask)) {
11309 align = mask + 1;
11310 break;
11311 }
11312 }
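	/*
	 * To illustrate the loop above: alignment is inferred from the low
	 * bits of the record size, so size = 16 yields align = 8, size = 12
	 * yields align = 4, and size = 6 yields align = 2; a zero size skips
	 * the loop entirely, leaving align = 1.
	 */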
11313
11314 action->dta_kind = desc->dtad_kind;
11315
11316 if ((action->dta_difo = dp) != NULL)
11317 dtrace_difo_hold(dp);
11318
11319 rec->dtrd_action = action->dta_kind;
11320 rec->dtrd_arg = arg;
11321 rec->dtrd_uarg = desc->dtad_uarg;
11322 rec->dtrd_alignment = (uint16_t)align;
11323 rec->dtrd_format = format;
11324
11325 if ((last = ecb->dte_action_last) != NULL) {
11326 ASSERT(ecb->dte_action != NULL);
11327 action->dta_prev = last;
11328 last->dta_next = action;
11329 } else {
11330 ASSERT(ecb->dte_action == NULL);
11331 ecb->dte_action = action;
11332 }
11333
11334 ecb->dte_action_last = action;
11335
11336 return (0);
11337}
11338
11339static void
11340dtrace_ecb_action_remove(dtrace_ecb_t *ecb)
11341{
11342 dtrace_action_t *act = ecb->dte_action, *next;
11343 dtrace_vstate_t *vstate = &ecb->dte_state->dts_vstate;
11344 dtrace_difo_t *dp;
11345 uint16_t format;
11346
11347 if (act != NULL && act->dta_refcnt > 1) {
11348 ASSERT(act->dta_next == NULL || act->dta_next->dta_refcnt == 1);
11349 act->dta_refcnt--;
11350 } else {
11351 for (; act != NULL; act = next) {
11352 next = act->dta_next;
11353 ASSERT(next != NULL || act == ecb->dte_action_last);
11354 ASSERT(act->dta_refcnt == 1);
11355
11356 if ((format = act->dta_rec.dtrd_format) != 0)
11357 dtrace_format_remove(ecb->dte_state, format);
11358
11359 if ((dp = act->dta_difo) != NULL)
11360 dtrace_difo_release(dp, vstate);
11361
11362 if (DTRACEACT_ISAGG(act->dta_kind)) {
11363 dtrace_ecb_aggregation_destroy(ecb, act);
11364 } else {
11365 kmem_free(act, sizeof (dtrace_action_t));
11366 }
11367 }
11368 }
11369
11370 ecb->dte_action = NULL;
11371 ecb->dte_action_last = NULL;
11372 ecb->dte_size = 0;
11373}
11374
11375static void
11376dtrace_ecb_disable(dtrace_ecb_t *ecb)
11377{
11378 /*
11379 * We disable the ECB by removing it from its probe.
11380 */
11381 dtrace_ecb_t *pecb, *prev = NULL;
11382 dtrace_probe_t *probe = ecb->dte_probe;
11383
11384 ASSERT(MUTEX_HELD(&dtrace_lock));
11385
11386 if (probe == NULL) {
11387 /*
11388 * This is the NULL probe; there is nothing to disable.
11389 */
11390 return;
11391 }
11392
11393 for (pecb = probe->dtpr_ecb; pecb != NULL; pecb = pecb->dte_next) {
11394 if (pecb == ecb)
11395 break;
11396 prev = pecb;
11397 }
11398
11399 ASSERT(pecb != NULL);
11400
11401 if (prev == NULL) {
11402 probe->dtpr_ecb = ecb->dte_next;
11403 } else {
11404 prev->dte_next = ecb->dte_next;
11405 }
11406
11407 if (ecb == probe->dtpr_ecb_last) {
11408 ASSERT(ecb->dte_next == NULL);
11409 probe->dtpr_ecb_last = prev;
11410 }
11411
11412 /*
11413 * The ECB has been disconnected from the probe; now sync to assure
11414 * that all CPUs have seen the change before returning.
11415 */
11416 dtrace_sync();
11417
11418 if (probe->dtpr_ecb == NULL) {
11419 /*
11420 * That was the last ECB on the probe; clear the predicate
11421 * cache ID for the probe, disable it and sync one more time
11422 * to assure that we'll never hit it again.
11423 */
11424 dtrace_provider_t *prov = probe->dtpr_provider;
11425
11426 ASSERT(ecb->dte_next == NULL);
11427 ASSERT(probe->dtpr_ecb_last == NULL);
11428 probe->dtpr_predcache = DTRACE_CACHEIDNONE;
11429 prov->dtpv_pops.dtps_disable(prov->dtpv_arg,
11430 probe->dtpr_id, probe->dtpr_arg);
11431 dtrace_sync();
11432 } else {
11433 /*
11434 * There is at least one ECB remaining on the probe. If there
11435 * is _exactly_ one, set the probe's predicate cache ID to be
11436 * the predicate cache ID of the remaining ECB.
11437 */
11438 ASSERT(probe->dtpr_ecb_last != NULL);
11439 ASSERT(probe->dtpr_predcache == DTRACE_CACHEIDNONE);
11440
11441 if (probe->dtpr_ecb == probe->dtpr_ecb_last) {
11442 dtrace_predicate_t *p = probe->dtpr_ecb->dte_predicate;
11443
11444 ASSERT(probe->dtpr_ecb->dte_next == NULL);
11445
11446 if (p != NULL)
11447 probe->dtpr_predcache = p->dtp_cacheid;
11448 }
11449
11450 ecb->dte_next = NULL;
11451 }
11452}
11453
11454static void
11455dtrace_ecb_destroy(dtrace_ecb_t *ecb)
11456{
11457 dtrace_state_t *state = ecb->dte_state;
11458 dtrace_vstate_t *vstate = &state->dts_vstate;
11459 dtrace_predicate_t *pred;
11460 dtrace_epid_t epid = ecb->dte_epid;
11461
11462 ASSERT(MUTEX_HELD(&dtrace_lock));
11463 ASSERT(ecb->dte_next == NULL);
11464 ASSERT(ecb->dte_probe == NULL || ecb->dte_probe->dtpr_ecb != ecb);
11465
11466 if ((pred = ecb->dte_predicate) != NULL)
11467 dtrace_predicate_release(pred, vstate);
11468
11469 dtrace_ecb_action_remove(ecb);
11470
11471 ASSERT(state->dts_ecbs[epid - 1] == ecb);
11472 state->dts_ecbs[epid - 1] = NULL;
11473
11474 kmem_free(ecb, sizeof (dtrace_ecb_t));
11475}
11476
11477static dtrace_ecb_t *
11478dtrace_ecb_create(dtrace_state_t *state, dtrace_probe_t *probe,
11479 dtrace_enabling_t *enab)
11480{
11481 dtrace_ecb_t *ecb;
11482 dtrace_predicate_t *pred;
11483 dtrace_actdesc_t *act;
11484 dtrace_provider_t *prov;
11485 dtrace_ecbdesc_t *desc = enab->dten_current;
11486
11487 ASSERT(MUTEX_HELD(&dtrace_lock));
11488 ASSERT(state != NULL);
11489
11490 ecb = dtrace_ecb_add(state, probe);
11491 ecb->dte_uarg = desc->dted_uarg;
11492
11493 if ((pred = desc->dted_pred.dtpdd_predicate) != NULL) {
11494 dtrace_predicate_hold(pred);
11495 ecb->dte_predicate = pred;
11496 }
11497
11498 if (probe != NULL) {
11499 /*
11500 * If the provider shows more leg than the consumer is old
11501 * enough to see, we need to enable the appropriate implicit
11502 * predicate bits to prevent the ecb from activating at
11503 * revealing times.
11504 *
11505 * Providers specifying DTRACE_PRIV_USER at register time
11506 * are stating that they need the /proc-style privilege
11507 * model to be enforced, and this is what DTRACE_COND_OWNER
11508 * and DTRACE_COND_ZONEOWNER will then do at probe time.
11509 */
11510 prov = probe->dtpr_provider;
11511 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLPROC) &&
11512 (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
11513 ecb->dte_cond |= DTRACE_COND_OWNER;
11514
11515 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLZONE) &&
11516 (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
11517 ecb->dte_cond |= DTRACE_COND_ZONEOWNER;
11518
11519 /*
11520 * If the provider shows us kernel innards and the user
11521 * is lacking sufficient privilege, enable the
11522 * DTRACE_COND_USERMODE implicit predicate.
11523 */
11524 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) &&
11525 (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_KERNEL))
11526 ecb->dte_cond |= DTRACE_COND_USERMODE;
11527 }
11528
11529 if (dtrace_ecb_create_cache != NULL) {
11530 /*
11531 * If we have a cached ecb, we'll use its action list instead
11532 * of creating our own (saving both time and space).
11533 */
11534 dtrace_ecb_t *cached = dtrace_ecb_create_cache;
11535 dtrace_action_t *act = cached->dte_action;
11536
11537 if (act != NULL) {
11538 ASSERT(act->dta_refcnt > 0);
11539 act->dta_refcnt++;
11540 ecb->dte_action = act;
11541 ecb->dte_action_last = cached->dte_action_last;
11542 ecb->dte_needed = cached->dte_needed;
11543 ecb->dte_size = cached->dte_size;
11544 ecb->dte_alignment = cached->dte_alignment;
11545 }
11546
11547 return (ecb);
11548 }
11549
11550 for (act = desc->dted_action; act != NULL; act = act->dtad_next) {
11551 if ((enab->dten_error = dtrace_ecb_action_add(ecb, act)) != 0) {
11552 dtrace_ecb_destroy(ecb);
11553 return (NULL);
11554 }
11555 }
11556
11557 dtrace_ecb_resize(ecb);
11558
11559 return (dtrace_ecb_create_cache = ecb);
11560}
11561
11562static int
11563dtrace_ecb_create_enable(dtrace_probe_t *probe, void *arg)
11564{
11565 dtrace_ecb_t *ecb;
11566 dtrace_enabling_t *enab = arg;
11567 dtrace_state_t *state = enab->dten_vstate->dtvs_state;
11568
11569 ASSERT(state != NULL);
11570
11571 if (probe != NULL && probe->dtpr_gen < enab->dten_probegen) {
11572 /*
11573 * This probe was created in a generation for which this
11574 * enabling has previously created ECBs; we don't want to
11575 * enable it again, so just kick out.
11576 */
11577 return (DTRACE_MATCH_NEXT);
11578 }
11579
11580 if ((ecb = dtrace_ecb_create(state, probe, enab)) == NULL)
11581 return (DTRACE_MATCH_DONE);
11582
11583 dtrace_ecb_enable(ecb);
11584 return (DTRACE_MATCH_NEXT);
11585}
11586
11587static dtrace_ecb_t *
11588dtrace_epid2ecb(dtrace_state_t *state, dtrace_epid_t id)
11589{
11590 dtrace_ecb_t *ecb;
11591
11592 ASSERT(MUTEX_HELD(&dtrace_lock));
11593
11594 if (id == 0 || id > state->dts_necbs)
11595 return (NULL);
11596
11597 ASSERT(state->dts_necbs > 0 && state->dts_ecbs != NULL);
11598 ASSERT((ecb = state->dts_ecbs[id - 1]) == NULL || ecb->dte_epid == id);
11599
11600 return (state->dts_ecbs[id - 1]);
11601}
11602
11603static dtrace_aggregation_t *
11604dtrace_aggid2agg(dtrace_state_t *state, dtrace_aggid_t id)
11605{
11606 dtrace_aggregation_t *agg;
11607
11608 ASSERT(MUTEX_HELD(&dtrace_lock));
11609
11610 if (id == 0 || id > state->dts_naggregations)
11611 return (NULL);
11612
11613 ASSERT(state->dts_naggregations > 0 && state->dts_aggregations != NULL);
11614 ASSERT((agg = state->dts_aggregations[id - 1]) == NULL ||
11615 agg->dtag_id == id);
11616
11617 return (state->dts_aggregations[id - 1]);
11618}
11619
11620/*
11621 * DTrace Buffer Functions
11622 *
11623 * The following functions manipulate DTrace buffers. Most of these functions
11624 * are called in the context of establishing or processing consumer state;
11625 * exceptions are explicitly noted.
11626 */
11627
11628/*
11629 * Note: called from cross call context. This function switches the two
11630 * buffers on a given CPU. The atomicity of this operation is assured by
11631 * disabling interrupts while the actual switch takes place; the disabling of
11632 * interrupts serializes the execution with any execution of dtrace_probe() on
11633 * the same CPU.
11634 */
11635static void
11636dtrace_buffer_switch(dtrace_buffer_t *buf)
11637{
11638 caddr_t tomax = buf->dtb_tomax;
11639 caddr_t xamot = buf->dtb_xamot;
11640 dtrace_icookie_t cookie;
11641 hrtime_t now;
11642
11643 ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
11644 ASSERT(!(buf->dtb_flags & DTRACEBUF_RING));
11645
11646 cookie = dtrace_interrupt_disable();
11647 now = dtrace_gethrtime();
11648 buf->dtb_tomax = xamot;
11649 buf->dtb_xamot = tomax;
11650 buf->dtb_xamot_drops = buf->dtb_drops;
11651 buf->dtb_xamot_offset = buf->dtb_offset;
11652 buf->dtb_xamot_errors = buf->dtb_errors;
11653 buf->dtb_xamot_flags = buf->dtb_flags;
11654 buf->dtb_offset = 0;
11655 buf->dtb_drops = 0;
11656 buf->dtb_errors = 0;
11657 buf->dtb_flags &= ~(DTRACEBUF_ERROR | DTRACEBUF_DROPPED);
11658 buf->dtb_interval = now - buf->dtb_switched;
11659 buf->dtb_switched = now;
11660 dtrace_interrupt_enable(cookie);
11661}
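
/*
 * A sketch of how the switch is driven from non-probe context: the
 * buffer-snapshot path cross calls into the target CPU so that the
 * switch itself runs with that CPU's dtrace_probe() processing excluded
 * (an assumed but representative fragment):
 */
#if 0
	buf = &state->dts_buffer[cpu];
	dtrace_xcall(cpu, (dtrace_xcall_t)dtrace_buffer_switch, buf);
#endif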
11662
11663/*
11664 * Note: called from cross call context. This function activates a buffer
11665 * on a CPU. As with dtrace_buffer_switch(), the atomicity of the operation
11666 * is guaranteed by the disabling of interrupts.
11667 */
11668static void
11669dtrace_buffer_activate(dtrace_state_t *state)
11670{
11671 dtrace_buffer_t *buf;
11672 dtrace_icookie_t cookie = dtrace_interrupt_disable();
11673
11674 buf = &state->dts_buffer[curcpu];
11675
11676 if (buf->dtb_tomax != NULL) {
11677 /*
11678 * We might like to assert that the buffer is marked inactive,
11679 * but this isn't necessarily true: the buffer for the CPU
11680 * that processes the BEGIN probe has its buffer activated
		 * manually.  In this case, we take the (harmless) action of
		 * re-clearing the INACTIVE bit.
11683 */
11684 buf->dtb_flags &= ~DTRACEBUF_INACTIVE;
11685 }
11686
11687 dtrace_interrupt_enable(cookie);
11688}
11689
11690static int
11691dtrace_buffer_alloc(dtrace_buffer_t *bufs, size_t size, int flags,
11692 processorid_t cpu, int *factor)
11693{
11694#if defined(sun)
11695 cpu_t *cp;
11696#endif
11697 dtrace_buffer_t *buf;
11698 int allocated = 0, desired = 0;
11699
11700#if defined(sun)
11701 ASSERT(MUTEX_HELD(&cpu_lock));
11702 ASSERT(MUTEX_HELD(&dtrace_lock));
11703
11704 *factor = 1;
11705
11706 if (size > dtrace_nonroot_maxsize &&
11707 !PRIV_POLICY_CHOICE(CRED(), PRIV_ALL, B_FALSE))
11708 return (EFBIG);
11709
11710 cp = cpu_list;
11711
11712 do {
11713 if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
11714 continue;
11715
11716 buf = &bufs[cp->cpu_id];
11717
		/*
		 * If there is already a buffer allocated for this CPU, it
		 * is only possible that this is a DR event.  In this case,
		 * the buffer size must match our specified size.
		 */
11722 if (buf->dtb_tomax != NULL) {
11723 ASSERT(buf->dtb_size == size);
11724 continue;
11725 }
11726
11727 ASSERT(buf->dtb_xamot == NULL);
11728
11729 if ((buf->dtb_tomax = kmem_zalloc(size,
11730 KM_NOSLEEP | KM_NORMALPRI)) == NULL)
11731 goto err;
11732
11733 buf->dtb_size = size;
11734 buf->dtb_flags = flags;
11735 buf->dtb_offset = 0;
11736 buf->dtb_drops = 0;
11737
11738 if (flags & DTRACEBUF_NOSWITCH)
11739 continue;
11740
11741 if ((buf->dtb_xamot = kmem_zalloc(size,
11742 KM_NOSLEEP | KM_NORMALPRI)) == NULL)
11743 goto err;
11744 } while ((cp = cp->cpu_next) != cpu_list);
11745
11746 return (0);
11747
11748err:
11749 cp = cpu_list;
11750
11751 do {
11752 if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
11753 continue;
11754
11755 buf = &bufs[cp->cpu_id];
11756 desired += 2;
11757
11758 if (buf->dtb_xamot != NULL) {
11759 ASSERT(buf->dtb_tomax != NULL);
11760 ASSERT(buf->dtb_size == size);
11761 kmem_free(buf->dtb_xamot, size);
11762 allocated++;
11763 }
11764
11765 if (buf->dtb_tomax != NULL) {
11766 ASSERT(buf->dtb_size == size);
11767 kmem_free(buf->dtb_tomax, size);
11768 allocated++;
11769 }
11770
11771 buf->dtb_tomax = NULL;
11772 buf->dtb_xamot = NULL;
11773 buf->dtb_size = 0;
11774 } while ((cp = cp->cpu_next) != cpu_list);
11775#else
11776 int i;
11777
11778 *factor = 1;
11779#if defined(__amd64__) || defined(__mips__) || defined(__powerpc__)
11780 /*
11781 * FreeBSD isn't good at limiting the amount of memory we
11782 * ask to malloc, so let's place a limit here before trying
11783 * to do something that might well end in tears at bedtime.
11784 */
11785 if (size > physmem * PAGE_SIZE / (128 * (mp_maxid + 1)))
11786 return (ENOMEM);
11787#endif
11788
11789 ASSERT(MUTEX_HELD(&dtrace_lock));
11790 CPU_FOREACH(i) {
11791 if (cpu != DTRACE_CPUALL && cpu != i)
11792 continue;
11793
11794 buf = &bufs[i];
11795
11796 /*
11797 * If there is already a buffer allocated for this CPU, it
11798 * is only possible that this is a DR event. In this case,
11799 * the buffer size must match our specified size.
11800 */
11801 if (buf->dtb_tomax != NULL) {
11802 ASSERT(buf->dtb_size == size);
11803 continue;
11804 }
11805
11806 ASSERT(buf->dtb_xamot == NULL);
11807
11808 if ((buf->dtb_tomax = kmem_zalloc(size,
11809 KM_NOSLEEP | KM_NORMALPRI)) == NULL)
11810 goto err;
11811
11812 buf->dtb_size = size;
11813 buf->dtb_flags = flags;
11814 buf->dtb_offset = 0;
11815 buf->dtb_drops = 0;
11816
11817 if (flags & DTRACEBUF_NOSWITCH)
11818 continue;
11819
11820 if ((buf->dtb_xamot = kmem_zalloc(size,
11821 KM_NOSLEEP | KM_NORMALPRI)) == NULL)
11822 goto err;
11823 }
11824
11825 return (0);
11826
11827err:
11828 /*
11829 * Error allocating memory, so free the buffers that were
11830 * allocated before the failed allocation.
11831 */
11832 CPU_FOREACH(i) {
11833 if (cpu != DTRACE_CPUALL && cpu != i)
11834 continue;
11835
11836 buf = &bufs[i];
11837 desired += 2;
11838
11839 if (buf->dtb_xamot != NULL) {
11840 ASSERT(buf->dtb_tomax != NULL);
11841 ASSERT(buf->dtb_size == size);
11842 kmem_free(buf->dtb_xamot, size);
11843 allocated++;
11844 }
11845
11846 if (buf->dtb_tomax != NULL) {
11847 ASSERT(buf->dtb_size == size);
11848 kmem_free(buf->dtb_tomax, size);
11849 allocated++;
11850 }
11851
11852 buf->dtb_tomax = NULL;
11853 buf->dtb_xamot = NULL;
11854 buf->dtb_size = 0;
11855
11856 }
11857#endif
11858 *factor = desired / (allocated > 0 ? allocated : 1);
11859
11860 return (ENOMEM);
11861}
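
/*
 * On failure, *factor reports how far short the allocation fell:
 * desired counts two buffers per covered CPU and allocated counts those
 * actually obtained before the failure, so a retrying caller can divide
 * its requested size by the returned factor rather than blindly halving
 * it.
 */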
11862
11863/*
11864 * Note: called from probe context. This function just increments the drop
11865 * count on a buffer. It has been made a function to allow for the
11866 * possibility of understanding the source of mysterious drop counts. (A
11867 * problem for which one may be particularly disappointed that DTrace cannot
11868 * be used to understand DTrace.)
11869 */
11870static void
11871dtrace_buffer_drop(dtrace_buffer_t *buf)
11872{
11873 buf->dtb_drops++;
11874}
11875
11876/*
11877 * Note: called from probe context. This function is called to reserve space
11878 * in a buffer. If mstate is non-NULL, sets the scratch base and size in the
11879 * mstate. Returns the new offset in the buffer, or a negative value if an
11880 * error has occurred.
11881 */
11882static intptr_t
11883dtrace_buffer_reserve(dtrace_buffer_t *buf, size_t needed, size_t align,
11884 dtrace_state_t *state, dtrace_mstate_t *mstate)
11885{
11886 intptr_t offs = buf->dtb_offset, soffs;
11887 intptr_t woffs;
11888 caddr_t tomax;
11889 size_t total;
11890
11891 if (buf->dtb_flags & DTRACEBUF_INACTIVE)
11892 return (-1);
11893
11894 if ((tomax = buf->dtb_tomax) == NULL) {
11895 dtrace_buffer_drop(buf);
11896 return (-1);
11897 }
11898
11899 if (!(buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL))) {
11900 while (offs & (align - 1)) {
11901 /*
11902 * Assert that our alignment is off by a number which
11903 * is itself sizeof (uint32_t) aligned.
11904 */
11905 ASSERT(!((align - (offs & (align - 1))) &
11906 (sizeof (uint32_t) - 1)));
11907 DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
11908 offs += sizeof (uint32_t);
11909 }
11910
11911 if ((soffs = offs + needed) > buf->dtb_size) {
11912 dtrace_buffer_drop(buf);
11913 return (-1);
11914 }
11915
11916 if (mstate == NULL)
11917 return (offs);
11918
11919 mstate->dtms_scratch_base = (uintptr_t)tomax + soffs;
11920 mstate->dtms_scratch_size = buf->dtb_size - soffs;
11921 mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
11922
11923 return (offs);
11924 }
11925
11926 if (buf->dtb_flags & DTRACEBUF_FILL) {
11927 if (state->dts_activity != DTRACE_ACTIVITY_COOLDOWN &&
11928 (buf->dtb_flags & DTRACEBUF_FULL))
11929 return (-1);
11930 goto out;
11931 }
11932
11933 total = needed + (offs & (align - 1));
11934
11935 /*
11936 * For a ring buffer, life is quite a bit more complicated. Before
11937 * we can store any padding, we need to adjust our wrapping offset.
11938 * (If we've never before wrapped or we're not about to, no adjustment
11939 * is required.)
11940 */
11941 if ((buf->dtb_flags & DTRACEBUF_WRAPPED) ||
11942 offs + total > buf->dtb_size) {
11943 woffs = buf->dtb_xamot_offset;
11944
11945 if (offs + total > buf->dtb_size) {
11946 /*
11947 * We can't fit in the end of the buffer. First, a
11948 * sanity check that we can fit in the buffer at all.
11949 */
11950 if (total > buf->dtb_size) {
11951 dtrace_buffer_drop(buf);
11952 return (-1);
11953 }
11954
11955 /*
11956 * We're going to be storing at the top of the buffer,
11957 * so now we need to deal with the wrapped offset. We
11958 * only reset our wrapped offset to 0 if it is
11959 * currently greater than the current offset. If it
11960 * is less than the current offset, it is because a
11961 * previous allocation induced a wrap -- but the
11962 * allocation didn't subsequently take the space due
11963 * to an error or false predicate evaluation. In this
11964 * case, we'll just leave the wrapped offset alone: if
11965 * the wrapped offset hasn't been advanced far enough
11966 * for this allocation, it will be adjusted in the
11967 * lower loop.
11968 */
11969 if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
11970 if (woffs >= offs)
11971 woffs = 0;
11972 } else {
11973 woffs = 0;
11974 }
11975
11976 /*
11977 * Now we know that we're going to be storing to the
11978 * top of the buffer and that there is room for us
11979 * there. We need to clear the buffer from the current
11980 * offset to the end (there may be old gunk there).
11981 */
11982 while (offs < buf->dtb_size)
11983 tomax[offs++] = 0;
11984
11985 /*
11986 * We need to set our offset to zero. And because we
11987 * are wrapping, we need to set the bit indicating as
11988 * much. We can also adjust our needed space back
11989 * down to the space required by the ECB -- we know
11990 * that the top of the buffer is aligned.
11991 */
11992 offs = 0;
11993 total = needed;
11994 buf->dtb_flags |= DTRACEBUF_WRAPPED;
11995 } else {
11996 /*
11997 * There is room for us in the buffer, so we simply
11998 * need to check the wrapped offset.
11999 */
12000 if (woffs < offs) {
12001 /*
12002 * The wrapped offset is less than the offset.
12003 * This can happen if we allocated buffer space
12004 * that induced a wrap, but then we didn't
12005 * subsequently take the space due to an error
12006 * or false predicate evaluation. This is
12007 * okay; we know that _this_ allocation isn't
12008 * going to induce a wrap. We still can't
12009 * reset the wrapped offset to be zero,
12010 * however: the space may have been trashed in
12011 * the previous failed probe attempt. But at
12012 * least the wrapped offset doesn't need to
12013 * be adjusted at all...
12014 */
12015 goto out;
12016 }
12017 }
12018
12019 while (offs + total > woffs) {
12020 dtrace_epid_t epid = *(uint32_t *)(tomax + woffs);
12021 size_t size;
12022
12023 if (epid == DTRACE_EPIDNONE) {
12024 size = sizeof (uint32_t);
12025 } else {
12026 ASSERT3U(epid, <=, state->dts_necbs);
12027 ASSERT(state->dts_ecbs[epid - 1] != NULL);
12028
12029 size = state->dts_ecbs[epid - 1]->dte_size;
12030 }
12031
12032 ASSERT(woffs + size <= buf->dtb_size);
12033 ASSERT(size != 0);
12034
12035 if (woffs + size == buf->dtb_size) {
12036 /*
12037 * We've reached the end of the buffer; we want
12038 * to set the wrapped offset to 0 and break
12039 * out. However, if the offs is 0, then we're
12040 * in a strange edge-condition: the amount of
12041 * space that we want to reserve plus the size
12042 * of the record that we're overwriting is
12043 * greater than the size of the buffer. This
12044 * is problematic because if we reserve the
12045 * space but subsequently don't consume it (due
12046 * to a failed predicate or error) the wrapped
12047 * offset will be 0 -- yet the EPID at offset 0
12048 * will not be committed. This situation is
12049 * relatively easy to deal with: if we're in
12050 * this case, the buffer is indistinguishable
12051 * from one that hasn't wrapped; we need only
12052 * finish the job by clearing the wrapped bit,
12053 * explicitly setting the offset to be 0, and
12054 * zero'ing out the old data in the buffer.
12055 */
12056 if (offs == 0) {
12057 buf->dtb_flags &= ~DTRACEBUF_WRAPPED;
12058 buf->dtb_offset = 0;
12059 woffs = total;
12060
12061 while (woffs < buf->dtb_size)
12062 tomax[woffs++] = 0;
12063 }
12064
12065 woffs = 0;
12066 break;
12067 }
12068
12069 woffs += size;
12070 }
12071
12072 /*
12073 * We have a wrapped offset. It may be that the wrapped offset
12074 * has become zero -- that's okay.
12075 */
12076 buf->dtb_xamot_offset = woffs;
12077 }
12078
12079out:
12080 /*
12081 * Now we can plow the buffer with any necessary padding.
12082 */
12083 while (offs & (align - 1)) {
12084 /*
12085 * Assert that our alignment is off by a number which
12086 * is itself sizeof (uint32_t) aligned.
12087 */
12088 ASSERT(!((align - (offs & (align - 1))) &
12089 (sizeof (uint32_t) - 1)));
12090 DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
12091 offs += sizeof (uint32_t);
12092 }
12093
12094 if (buf->dtb_flags & DTRACEBUF_FILL) {
12095 if (offs + needed > buf->dtb_size - state->dts_reserve) {
12096 buf->dtb_flags |= DTRACEBUF_FULL;
12097 return (-1);
12098 }
12099 }
12100
12101 if (mstate == NULL)
12102 return (offs);
12103
12104 /*
12105 * For ring buffers and fill buffers, the scratch space is always
12106 * the inactive buffer.
12107 */
12108 mstate->dtms_scratch_base = (uintptr_t)buf->dtb_xamot;
12109 mstate->dtms_scratch_size = buf->dtb_size;
12110 mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
12111
12112 return (offs);
12113}
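
/*
 * A short example of the padding scheme above: with offs = 20 and an
 * 8-byte alignment requirement, a single DTRACE_EPIDNONE filler word is
 * stored at offset 20 and the record begins at offset 24.  Consumers
 * skip fillers with the same sizeof (uint32_t) stride, which is why the
 * misalignment must itself be uint32_t-aligned (as asserted above).
 */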
12114
12115static void
12116dtrace_buffer_polish(dtrace_buffer_t *buf)
12117{
12118 ASSERT(buf->dtb_flags & DTRACEBUF_RING);
12119 ASSERT(MUTEX_HELD(&dtrace_lock));
12120
12121 if (!(buf->dtb_flags & DTRACEBUF_WRAPPED))
12122 return;
12123
12124 /*
12125 * We need to polish the ring buffer. There are three cases:
12126 *
12127 * - The first (and presumably most common) is that there is no gap
12128 * between the buffer offset and the wrapped offset. In this case,
12129 * there is nothing in the buffer that isn't valid data; we can
12130 * mark the buffer as polished and return.
12131 *
12132 * - The second (less common than the first but still more common
12133 * than the third) is that there is a gap between the buffer offset
12134 * and the wrapped offset, and the wrapped offset is larger than the
12135 * buffer offset. This can happen because of an alignment issue, or
12136 * can happen because of a call to dtrace_buffer_reserve() that
12137 * didn't subsequently consume the buffer space. In this case,
12138 * we need to zero the data from the buffer offset to the wrapped
12139 * offset.
12140 *
12141 * - The third (and least common) is that there is a gap between the
12142 * buffer offset and the wrapped offset, but the wrapped offset is
12143 * _less_ than the buffer offset. This can only happen because a
12144 * call to dtrace_buffer_reserve() induced a wrap, but the space
12145 * was not subsequently consumed. In this case, we need to zero the
12146 * space from the offset to the end of the buffer _and_ from the
12147 * top of the buffer to the wrapped offset.
12148 */
12149 if (buf->dtb_offset < buf->dtb_xamot_offset) {
12150 bzero(buf->dtb_tomax + buf->dtb_offset,
12151 buf->dtb_xamot_offset - buf->dtb_offset);
12152 }
12153
12154 if (buf->dtb_offset > buf->dtb_xamot_offset) {
12155 bzero(buf->dtb_tomax + buf->dtb_offset,
12156 buf->dtb_size - buf->dtb_offset);
12157 bzero(buf->dtb_tomax, buf->dtb_xamot_offset);
12158 }
12159}
12160
12161/*
12162 * This routine determines if data generated at the specified time has likely
12163 * been entirely consumed at user-level. This routine is called to determine
12164 * if an ECB on a defunct probe (but for an active enabling) can be safely
12165 * disabled and destroyed.
12166 */
12167static int
12168dtrace_buffer_consumed(dtrace_buffer_t *bufs, hrtime_t when)
12169{
12170 int i;
12171
12172 for (i = 0; i < NCPU; i++) {
12173 dtrace_buffer_t *buf = &bufs[i];
12174
12175 if (buf->dtb_size == 0)
12176 continue;
12177
12178 if (buf->dtb_flags & DTRACEBUF_RING)
12179 return (0);
12180
12181 if (!buf->dtb_switched && buf->dtb_offset != 0)
12182 return (0);
12183
12184 if (buf->dtb_switched - buf->dtb_interval < when)
12185 return (0);
12186 }
12187
12188 return (1);
12189}
12190
12191static void
12192dtrace_buffer_free(dtrace_buffer_t *bufs)
12193{
12194 int i;
12195
12196 for (i = 0; i < NCPU; i++) {
12197 dtrace_buffer_t *buf = &bufs[i];
12198
12199 if (buf->dtb_tomax == NULL) {
12200 ASSERT(buf->dtb_xamot == NULL);
12201 ASSERT(buf->dtb_size == 0);
12202 continue;
12203 }
12204
12205 if (buf->dtb_xamot != NULL) {
12206 ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
12207 kmem_free(buf->dtb_xamot, buf->dtb_size);
12208 }
12209
12210 kmem_free(buf->dtb_tomax, buf->dtb_size);
12211 buf->dtb_size = 0;
12212 buf->dtb_tomax = NULL;
12213 buf->dtb_xamot = NULL;
12214 }
12215}
12216
12217/*
12218 * DTrace Enabling Functions
12219 */
12220static dtrace_enabling_t *
12221dtrace_enabling_create(dtrace_vstate_t *vstate)
12222{
12223 dtrace_enabling_t *enab;
12224
12225 enab = kmem_zalloc(sizeof (dtrace_enabling_t), KM_SLEEP);
12226 enab->dten_vstate = vstate;
12227
12228 return (enab);
12229}
12230
12231static void
12232dtrace_enabling_add(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb)
12233{
12234 dtrace_ecbdesc_t **ndesc;
12235 size_t osize, nsize;
12236
12237 /*
12238 * We can't add to enablings after we've enabled them, or after we've
12239 * retained them.
12240 */
12241 ASSERT(enab->dten_probegen == 0);
12242 ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
12243
12244 if (enab->dten_ndesc < enab->dten_maxdesc) {
12245 enab->dten_desc[enab->dten_ndesc++] = ecb;
12246 return;
12247 }
12248
12249	osize = enab->dten_maxdesc * sizeof (dtrace_ecbdesc_t *);
12250
12251 if (enab->dten_maxdesc == 0) {
12252 enab->dten_maxdesc = 1;
12253 } else {
12254 enab->dten_maxdesc <<= 1;
12255 }
12256
12257 ASSERT(enab->dten_ndesc < enab->dten_maxdesc);
12258
12259	nsize = enab->dten_maxdesc * sizeof (dtrace_ecbdesc_t *);
12260 ndesc = kmem_zalloc(nsize, KM_SLEEP);
12261 bcopy(enab->dten_desc, ndesc, osize);
12262 if (enab->dten_desc != NULL)
12263 kmem_free(enab->dten_desc, osize);
12264
12265 enab->dten_desc = ndesc;
12266 enab->dten_desc[enab->dten_ndesc++] = ecb;
12267}
12268
12269static void
12270dtrace_enabling_addlike(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb,
12271 dtrace_probedesc_t *pd)
12272{
12273 dtrace_ecbdesc_t *new;
12274 dtrace_predicate_t *pred;
12275 dtrace_actdesc_t *act;
12276
12277 /*
12278 * We're going to create a new ECB description that matches the
12279 * specified ECB in every way, but has the specified probe description.
12280 */
12281 new = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
12282
12283 if ((pred = ecb->dted_pred.dtpdd_predicate) != NULL)
12284 dtrace_predicate_hold(pred);
12285
12286 for (act = ecb->dted_action; act != NULL; act = act->dtad_next)
12287 dtrace_actdesc_hold(act);
12288
12289 new->dted_action = ecb->dted_action;
12290 new->dted_pred = ecb->dted_pred;
12291 new->dted_probe = *pd;
12292 new->dted_uarg = ecb->dted_uarg;
12293
12294 dtrace_enabling_add(enab, new);
12295}
12296
12297static void
12298dtrace_enabling_dump(dtrace_enabling_t *enab)
12299{
12300 int i;
12301
12302 for (i = 0; i < enab->dten_ndesc; i++) {
12303 dtrace_probedesc_t *desc = &enab->dten_desc[i]->dted_probe;
12304
12305 cmn_err(CE_NOTE, "enabling probe %d (%s:%s:%s:%s)", i,
12306 desc->dtpd_provider, desc->dtpd_mod,
12307 desc->dtpd_func, desc->dtpd_name);
12308 }
12309}
12310
12311static void
12312dtrace_enabling_destroy(dtrace_enabling_t *enab)
12313{
12314 int i;
12315 dtrace_ecbdesc_t *ep;
12316 dtrace_vstate_t *vstate = enab->dten_vstate;
12317
12318 ASSERT(MUTEX_HELD(&dtrace_lock));
12319
12320 for (i = 0; i < enab->dten_ndesc; i++) {
12321 dtrace_actdesc_t *act, *next;
12322 dtrace_predicate_t *pred;
12323
12324 ep = enab->dten_desc[i];
12325
12326 if ((pred = ep->dted_pred.dtpdd_predicate) != NULL)
12327 dtrace_predicate_release(pred, vstate);
12328
12329 for (act = ep->dted_action; act != NULL; act = next) {
12330 next = act->dtad_next;
12331 dtrace_actdesc_release(act, vstate);
12332 }
12333
12334 kmem_free(ep, sizeof (dtrace_ecbdesc_t));
12335 }
12336
12337 if (enab->dten_desc != NULL)
12338 kmem_free(enab->dten_desc,
12339	    enab->dten_maxdesc * sizeof (dtrace_ecbdesc_t *));
12340
12341 /*
12342 * If this was a retained enabling, decrement the dts_nretained count
12343 * and take it off of the dtrace_retained list.
12344 */
12345 if (enab->dten_prev != NULL || enab->dten_next != NULL ||
12346 dtrace_retained == enab) {
12347 ASSERT(enab->dten_vstate->dtvs_state != NULL);
12348 ASSERT(enab->dten_vstate->dtvs_state->dts_nretained > 0);
12349 enab->dten_vstate->dtvs_state->dts_nretained--;
12350 dtrace_retained_gen++;
12351 }
12352
12353 if (enab->dten_prev == NULL) {
12354 if (dtrace_retained == enab) {
12355 dtrace_retained = enab->dten_next;
12356
12357 if (dtrace_retained != NULL)
12358 dtrace_retained->dten_prev = NULL;
12359 }
12360 } else {
12361 ASSERT(enab != dtrace_retained);
12362 ASSERT(dtrace_retained != NULL);
12363 enab->dten_prev->dten_next = enab->dten_next;
12364 }
12365
12366 if (enab->dten_next != NULL) {
12367 ASSERT(dtrace_retained != NULL);
12368 enab->dten_next->dten_prev = enab->dten_prev;
12369 }
12370
12371 kmem_free(enab, sizeof (dtrace_enabling_t));
12372}
12373
12374static int
12375dtrace_enabling_retain(dtrace_enabling_t *enab)
12376{
12377 dtrace_state_t *state;
12378
12379 ASSERT(MUTEX_HELD(&dtrace_lock));
12380 ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
12381 ASSERT(enab->dten_vstate != NULL);
12382
12383 state = enab->dten_vstate->dtvs_state;
12384 ASSERT(state != NULL);
12385
12386 /*
12387 * We only allow each state to retain dtrace_retain_max enablings.
12388 */
12389 if (state->dts_nretained >= dtrace_retain_max)
12390 return (ENOSPC);
12391
12392 state->dts_nretained++;
12393 dtrace_retained_gen++;
12394
12395 if (dtrace_retained == NULL) {
12396 dtrace_retained = enab;
12397 return (0);
12398 }
12399
12400 enab->dten_next = dtrace_retained;
12401 dtrace_retained->dten_prev = enab;
12402 dtrace_retained = enab;
12403
12404 return (0);
12405}
12406
12407static int
12408dtrace_enabling_replicate(dtrace_state_t *state, dtrace_probedesc_t *match,
12409 dtrace_probedesc_t *create)
12410{
12411 dtrace_enabling_t *new, *enab;
12412 int found = 0, err = ENOENT;
12413
12414 ASSERT(MUTEX_HELD(&dtrace_lock));
12415 ASSERT(strlen(match->dtpd_provider) < DTRACE_PROVNAMELEN);
12416 ASSERT(strlen(match->dtpd_mod) < DTRACE_MODNAMELEN);
12417 ASSERT(strlen(match->dtpd_func) < DTRACE_FUNCNAMELEN);
12418 ASSERT(strlen(match->dtpd_name) < DTRACE_NAMELEN);
12419
12420 new = dtrace_enabling_create(&state->dts_vstate);
12421
12422 /*
12423 * Iterate over all retained enablings, looking for enablings that
12424 * match the specified state.
12425 */
12426 for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
12427 int i;
12428
12429 /*
12430 * dtvs_state can only be NULL for helper enablings -- and
12431 * helper enablings can't be retained.
12432 */
12433 ASSERT(enab->dten_vstate->dtvs_state != NULL);
12434
12435 if (enab->dten_vstate->dtvs_state != state)
12436 continue;
12437
12438 /*
12439 * Now iterate over each probe description; we're looking for
12440 * an exact match to the specified probe description.
12441 */
12442 for (i = 0; i < enab->dten_ndesc; i++) {
12443 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
12444 dtrace_probedesc_t *pd = &ep->dted_probe;
12445
12446 if (strcmp(pd->dtpd_provider, match->dtpd_provider))
12447 continue;
12448
12449 if (strcmp(pd->dtpd_mod, match->dtpd_mod))
12450 continue;
12451
12452 if (strcmp(pd->dtpd_func, match->dtpd_func))
12453 continue;
12454
12455 if (strcmp(pd->dtpd_name, match->dtpd_name))
12456 continue;
12457
12458 /*
12459 * We have a winning probe! Add it to our growing
12460 * enabling.
12461 */
12462 found = 1;
12463 dtrace_enabling_addlike(new, ep, create);
12464 }
12465 }
12466
12467 if (!found || (err = dtrace_enabling_retain(new)) != 0) {
12468 dtrace_enabling_destroy(new);
12469 return (err);
12470 }
12471
12472 return (0);
12473}
12474
12475static void
12476dtrace_enabling_retract(dtrace_state_t *state)
12477{
12478 dtrace_enabling_t *enab, *next;
12479
12480 ASSERT(MUTEX_HELD(&dtrace_lock));
12481
12482 /*
12483	 * Iterate over all retained enablings, destroying the enablings
12484	 * retained for the specified state.
12485 */
12486 for (enab = dtrace_retained; enab != NULL; enab = next) {
12487 next = enab->dten_next;
12488
12489 /*
12490 * dtvs_state can only be NULL for helper enablings -- and
12491 * helper enablings can't be retained.
12492 */
12493 ASSERT(enab->dten_vstate->dtvs_state != NULL);
12494
12495 if (enab->dten_vstate->dtvs_state == state) {
12496 ASSERT(state->dts_nretained > 0);
12497 dtrace_enabling_destroy(enab);
12498 }
12499 }
12500
12501 ASSERT(state->dts_nretained == 0);
12502}
12503
12504static int
12505dtrace_enabling_match(dtrace_enabling_t *enab, int *nmatched)
12506{
12507 int i = 0;
12508 int matched = 0;
12509
12510 ASSERT(MUTEX_HELD(&cpu_lock));
12511 ASSERT(MUTEX_HELD(&dtrace_lock));
12512
12513 for (i = 0; i < enab->dten_ndesc; i++) {
12514 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
12515
12516 enab->dten_current = ep;
12517 enab->dten_error = 0;
12518
12519 matched += dtrace_probe_enable(&ep->dted_probe, enab);
12520
12521 if (enab->dten_error != 0) {
12522 /*
12523 * If we get an error half-way through enabling the
12524 * probes, we kick out -- perhaps with some number of
12525 * them enabled. Leaving enabled probes enabled may
12526 * be slightly confusing for user-level, but we expect
12527 * that no one will attempt to actually drive on in
12528 * the face of such errors. If this is an anonymous
12529 * enabling (indicated with a NULL nmatched pointer),
12530 * we cmn_err() a message. We aren't expecting to
12531			 * get such an error -- insofar as it can exist at
12532			 * all, it would be a result of corrupted DOF in the
12533			 * driver properties.
12534 */
12535 if (nmatched == NULL) {
12536 cmn_err(CE_WARN, "dtrace_enabling_match() "
12537 "error on %p: %d", (void *)ep,
12538 enab->dten_error);
12539 }
12540
12541 return (enab->dten_error);
12542 }
12543 }
12544
12545 enab->dten_probegen = dtrace_probegen;
12546 if (nmatched != NULL)
12547 *nmatched = matched;
12548
12549 return (0);
12550}
12551
12552static void
12553dtrace_enabling_matchall(void)
12554{
12555 dtrace_enabling_t *enab;
12556
12557 mutex_enter(&cpu_lock);
12558 mutex_enter(&dtrace_lock);
12559
12560 /*
12561 * Iterate over all retained enablings to see if any probes match
12562 * against them. We only perform this operation on enablings for which
12563 * we have sufficient permissions by virtue of being in the global zone
12564 * or in the same zone as the DTrace client. Because we can be called
12565 * after dtrace_detach() has been called, we cannot assert that there
12566 * are retained enablings. We can safely load from dtrace_retained,
12567 * however: the taskq_destroy() at the end of dtrace_detach() will
12568 * block pending our completion.
12569 */
12570 for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
12571#if defined(sun)
12572 cred_t *cr = enab->dten_vstate->dtvs_state->dts_cred.dcr_cred;
12573
12574 if (INGLOBALZONE(curproc) ||
12575		    (cr != NULL && getzoneid() == crgetzoneid(cr)))
12576#endif
12577 (void) dtrace_enabling_match(enab, NULL);
12578 }
12579
12580 mutex_exit(&dtrace_lock);
12581 mutex_exit(&cpu_lock);
12582}
12583
12584/*
12585 * If an enabling is to be enabled without having matched probes (that is, if
12586 * dtrace_state_go() is to be called on the underlying dtrace_state_t), the
12587 * enabling must be _primed_ by creating an ECB for every ECB description.
12588 * This must be done to assure that we know the number of speculations, the
12589 * number of aggregations, the minimum buffer size needed, etc. before we
12590 * transition out of DTRACE_ACTIVITY_INACTIVE. To do this without actually
12591 * enabling any probes, we create ECBs for every ECB description, but with a
12592 * NULL probe -- which is exactly what this function does.
12593 */
12594static void
12595dtrace_enabling_prime(dtrace_state_t *state)
12596{
12597 dtrace_enabling_t *enab;
12598 int i;
12599
12600 for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
12601 ASSERT(enab->dten_vstate->dtvs_state != NULL);
12602
12603 if (enab->dten_vstate->dtvs_state != state)
12604 continue;
12605
12606 /*
12607 * We don't want to prime an enabling more than once, lest
12608 * we allow a malicious user to induce resource exhaustion.
12609 * (The ECBs that result from priming an enabling aren't
12610 * leaked -- but they also aren't deallocated until the
12611 * consumer state is destroyed.)
12612 */
12613 if (enab->dten_primed)
12614 continue;
12615
12616 for (i = 0; i < enab->dten_ndesc; i++) {
12617 enab->dten_current = enab->dten_desc[i];
12618 (void) dtrace_probe_enable(NULL, enab);
12619 }
12620
12621 enab->dten_primed = 1;
12622 }
12623}
12624
12625/*
12626 * Called to indicate that probes should be provided due to retained
12627 * enablings. This is implemented in terms of dtrace_probe_provide(), but it
12628 * must take an initial lap through the retained enablings, calling the
12629 * dtps_provide() entry point explicitly, to allow for autocreated probes.
12630 */
12631static void
12632dtrace_enabling_provide(dtrace_provider_t *prv)
12633{
12634 int i, all = 0;
12635 dtrace_probedesc_t desc;
12636 dtrace_genid_t gen;
12637
12638 ASSERT(MUTEX_HELD(&dtrace_lock));
12639 ASSERT(MUTEX_HELD(&dtrace_provider_lock));
12640
12641 if (prv == NULL) {
12642 all = 1;
12643 prv = dtrace_provider;
12644 }
12645
12646 do {
12647 dtrace_enabling_t *enab;
12648 void *parg = prv->dtpv_arg;
12649
12650retry:
12651 gen = dtrace_retained_gen;
12652 for (enab = dtrace_retained; enab != NULL;
12653 enab = enab->dten_next) {
12654 for (i = 0; i < enab->dten_ndesc; i++) {
12655 desc = enab->dten_desc[i]->dted_probe;
12656 mutex_exit(&dtrace_lock);
12657 prv->dtpv_pops.dtps_provide(parg, &desc);
12658 mutex_enter(&dtrace_lock);
12659 /*
12660 * Process the retained enablings again if
12661 * they have changed while we weren't holding
12662 * dtrace_lock.
12663 */
12664 if (gen != dtrace_retained_gen)
12665 goto retry;
12666 }
12667 }
12668 } while (all && (prv = prv->dtpv_next) != NULL);
12669
12670 mutex_exit(&dtrace_lock);
12671 dtrace_probe_provide(NULL, all ? NULL : prv);
12672 mutex_enter(&dtrace_lock);
12673}
12674
12675/*
12676 * Called to reap ECBs that are attached to probes from defunct providers.
12677 */
12678static void
12679dtrace_enabling_reap(void)
12680{
12681 dtrace_provider_t *prov;
12682 dtrace_probe_t *probe;
12683 dtrace_ecb_t *ecb;
12684 hrtime_t when;
12685 int i;
12686
12687 mutex_enter(&cpu_lock);
12688 mutex_enter(&dtrace_lock);
12689
12690 for (i = 0; i < dtrace_nprobes; i++) {
12691 if ((probe = dtrace_probes[i]) == NULL)
12692 continue;
12693
12694 if (probe->dtpr_ecb == NULL)
12695 continue;
12696
12697 prov = probe->dtpr_provider;
12698
12699 if ((when = prov->dtpv_defunct) == 0)
12700 continue;
12701
12702 /*
12703 * We have ECBs on a defunct provider: we want to reap these
12704 * ECBs to allow the provider to unregister. The destruction
12705 * of these ECBs must be done carefully: if we destroy the ECB
12706 * and the consumer later wishes to consume an EPID that
12707 * corresponds to the destroyed ECB (and if the EPID metadata
12708 * has not been previously consumed), the consumer will abort
12709 * processing on the unknown EPID. To reduce (but not, sadly,
12710 * eliminate) the possibility of this, we will only destroy an
12711 * ECB for a defunct provider if, for the state that
12712 * corresponds to the ECB:
12713 *
12714 * (a) There is no speculative tracing (which can effectively
12715 * cache an EPID for an arbitrary amount of time).
12716 *
12717 * (b) The principal buffers have been switched twice since the
12718 * provider became defunct.
12719 *
12720 * (c) The aggregation buffers are of zero size or have been
12721 * switched twice since the provider became defunct.
12722 *
12723 * We use dts_speculates to determine (a) and call a function
12724 * (dtrace_buffer_consumed()) to determine (b) and (c). Note
12725 * that as soon as we've been unable to destroy one of the ECBs
12726 * associated with the probe, we quit trying -- reaping is only
12727 * fruitful in as much as we can destroy all ECBs associated
12728 * with the defunct provider's probes.
12729 */
12730 while ((ecb = probe->dtpr_ecb) != NULL) {
12731 dtrace_state_t *state = ecb->dte_state;
12732 dtrace_buffer_t *buf = state->dts_buffer;
12733 dtrace_buffer_t *aggbuf = state->dts_aggbuffer;
12734
12735 if (state->dts_speculates)
12736 break;
12737
12738 if (!dtrace_buffer_consumed(buf, when))
12739 break;
12740
12741 if (!dtrace_buffer_consumed(aggbuf, when))
12742 break;
12743
12744 dtrace_ecb_disable(ecb);
12745 ASSERT(probe->dtpr_ecb != ecb);
12746 dtrace_ecb_destroy(ecb);
12747 }
12748 }
12749
12750 mutex_exit(&dtrace_lock);
12751 mutex_exit(&cpu_lock);
12752}
12753
12754/*
12755 * DTrace DOF Functions
12756 */
12757/*ARGSUSED*/
12758static void
12759dtrace_dof_error(dof_hdr_t *dof, const char *str)
12760{
12761 if (dtrace_err_verbose)
12762 cmn_err(CE_WARN, "failed to process DOF: %s", str);
12763
12764#ifdef DTRACE_ERRDEBUG
12765 dtrace_errdebug(str);
12766#endif
12767}
12768
12769/*
12770 * Create DOF out of a currently enabled state. Right now, we only create
12771 * DOF containing the run-time options -- but this could be expanded to create
12772 * complete DOF representing the enabled state.
12773 */
12774static dof_hdr_t *
12775dtrace_dof_create(dtrace_state_t *state)
12776{
12777 dof_hdr_t *dof;
12778 dof_sec_t *sec;
12779 dof_optdesc_t *opt;
12780 int i, len = sizeof (dof_hdr_t) +
12781 roundup(sizeof (dof_sec_t), sizeof (uint64_t)) +
12782 sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
12783
12784 ASSERT(MUTEX_HELD(&dtrace_lock));
12785
12786 dof = kmem_zalloc(len, KM_SLEEP);
12787 dof->dofh_ident[DOF_ID_MAG0] = DOF_MAG_MAG0;
12788 dof->dofh_ident[DOF_ID_MAG1] = DOF_MAG_MAG1;
12789 dof->dofh_ident[DOF_ID_MAG2] = DOF_MAG_MAG2;
12790 dof->dofh_ident[DOF_ID_MAG3] = DOF_MAG_MAG3;
12791
12792 dof->dofh_ident[DOF_ID_MODEL] = DOF_MODEL_NATIVE;
12793 dof->dofh_ident[DOF_ID_ENCODING] = DOF_ENCODE_NATIVE;
12794 dof->dofh_ident[DOF_ID_VERSION] = DOF_VERSION;
12795 dof->dofh_ident[DOF_ID_DIFVERS] = DIF_VERSION;
12796 dof->dofh_ident[DOF_ID_DIFIREG] = DIF_DIR_NREGS;
12797 dof->dofh_ident[DOF_ID_DIFTREG] = DIF_DTR_NREGS;
12798
12799 dof->dofh_flags = 0;
12800 dof->dofh_hdrsize = sizeof (dof_hdr_t);
12801 dof->dofh_secsize = sizeof (dof_sec_t);
12802 dof->dofh_secnum = 1; /* only DOF_SECT_OPTDESC */
12803 dof->dofh_secoff = sizeof (dof_hdr_t);
12804 dof->dofh_loadsz = len;
12805 dof->dofh_filesz = len;
12806 dof->dofh_pad = 0;
12807
12808 /*
12809 * Fill in the option section header...
12810 */
12811 sec = (dof_sec_t *)((uintptr_t)dof + sizeof (dof_hdr_t));
12812 sec->dofs_type = DOF_SECT_OPTDESC;
12813 sec->dofs_align = sizeof (uint64_t);
12814 sec->dofs_flags = DOF_SECF_LOAD;
12815 sec->dofs_entsize = sizeof (dof_optdesc_t);
12816
12817 opt = (dof_optdesc_t *)((uintptr_t)sec +
12818 roundup(sizeof (dof_sec_t), sizeof (uint64_t)));
12819
12820 sec->dofs_offset = (uintptr_t)opt - (uintptr_t)dof;
12821 sec->dofs_size = sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
12822
12823 for (i = 0; i < DTRACEOPT_MAX; i++) {
12824 opt[i].dofo_option = i;
12825 opt[i].dofo_strtab = DOF_SECIDX_NONE;
12826 opt[i].dofo_value = state->dts_options[i];
12827 }
12828
12829 return (dof);
12830}
12831
12832static dof_hdr_t *
12833dtrace_dof_copyin(uintptr_t uarg, int *errp)
12834{
12835 dof_hdr_t hdr, *dof;
12836
12837 ASSERT(!MUTEX_HELD(&dtrace_lock));
12838
12839 /*
12840 * First, we're going to copyin() the sizeof (dof_hdr_t).
12841 */
12842 if (copyin((void *)uarg, &hdr, sizeof (hdr)) != 0) {
12843 dtrace_dof_error(NULL, "failed to copyin DOF header");
12844 *errp = EFAULT;
12845 return (NULL);
12846 }
12847
12848 /*
12849 * Now we'll allocate the entire DOF and copy it in -- provided
12850 * that the length isn't outrageous.
12851 */
12852 if (hdr.dofh_loadsz >= dtrace_dof_maxsize) {
12853 dtrace_dof_error(&hdr, "load size exceeds maximum");
12854 *errp = E2BIG;
12855 return (NULL);
12856 }
12857
12858 if (hdr.dofh_loadsz < sizeof (hdr)) {
12859 dtrace_dof_error(&hdr, "invalid load size");
12860 *errp = EINVAL;
12861 return (NULL);
12862 }
12863
12864 dof = kmem_alloc(hdr.dofh_loadsz, KM_SLEEP);
12865
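	/*
	 * Note that we re-check dofh_loadsz after the full copyin(): the
	 * header lives in user memory and may have changed between the
	 * two copies.  If it no longer matches the value that we
	 * validated and allocated for, we fail rather than trust it.
	 */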
12866 if (copyin((void *)uarg, dof, hdr.dofh_loadsz) != 0 ||
12867 dof->dofh_loadsz != hdr.dofh_loadsz) {
12868 kmem_free(dof, hdr.dofh_loadsz);
12869 *errp = EFAULT;
12870 return (NULL);
12871 }
12872
12873 return (dof);
12874}
12875
12876#if !defined(sun)
12877static __inline uchar_t
12878dtrace_dof_char(char c) {
12879 switch (c) {
12880 case '0':
12881 case '1':
12882 case '2':
12883 case '3':
12884 case '4':
12885 case '5':
12886 case '6':
12887 case '7':
12888 case '8':
12889 case '9':
12890 return (c - '0');
12891 case 'A':
12892 case 'B':
12893 case 'C':
12894 case 'D':
12895 case 'E':
12896 case 'F':
12897 return (c - 'A' + 10);
12898 case 'a':
12899 case 'b':
12900 case 'c':
12901 case 'd':
12902 case 'e':
12903 case 'f':
12904 return (c - 'a' + 10);
12905 }
12906 /* Should not reach here. */
12907 return (0);
12908}
12909#endif
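/*
 * For example, two hexadecimal characters decode to a single byte:
 *
 *	(dtrace_dof_char('7') << 4) | dtrace_dof_char('f') == 0x7f
 *
 * dtrace_dof_property() below relies on exactly this to turn a hex
 * string from the kernel environment back into DOF bytes.
 */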
12910
12911static dof_hdr_t *
12912dtrace_dof_property(const char *name)
12913{
12914 uchar_t *buf;
12915 uint64_t loadsz;
12916 unsigned int len, i;
12917 dof_hdr_t *dof;
12918
12919#if defined(sun)
12920 /*
12921	 * Unfortunately, arrays of values in .conf files are always (and
12922 * only) interpreted to be integer arrays. We must read our DOF
12923 * as an integer array, and then squeeze it into a byte array.
12924 */
12925 if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dtrace_devi, 0,
12926 (char *)name, (int **)&buf, &len) != DDI_PROP_SUCCESS)
12927 return (NULL);
12928
12929 for (i = 0; i < len; i++)
12930 buf[i] = (uchar_t)(((int *)buf)[i]);
12931
12932 if (len < sizeof (dof_hdr_t)) {
12933 ddi_prop_free(buf);
12934 dtrace_dof_error(NULL, "truncated header");
12935 return (NULL);
12936 }
12937
12938 if (len < (loadsz = ((dof_hdr_t *)buf)->dofh_loadsz)) {
12939 ddi_prop_free(buf);
12940 dtrace_dof_error(NULL, "truncated DOF");
12941 return (NULL);
12942 }
12943
12944 if (loadsz >= dtrace_dof_maxsize) {
12945 ddi_prop_free(buf);
12946 dtrace_dof_error(NULL, "oversized DOF");
12947 return (NULL);
12948 }
12949
12950 dof = kmem_alloc(loadsz, KM_SLEEP);
12951 bcopy(buf, dof, loadsz);
12952 ddi_prop_free(buf);
12953#else
12954 char *p;
12955 char *p_env;
12956
12957 if ((p_env = getenv(name)) == NULL)
12958 return (NULL);
12959
12960 len = strlen(p_env) / 2;
12961
12962 buf = kmem_alloc(len, KM_SLEEP);
12963
12964	dof = (dof_hdr_t *)buf;
12965
12966 p = p_env;
12967
12968 for (i = 0; i < len; i++) {
12969 buf[i] = (dtrace_dof_char(p[0]) << 4) |
12970 dtrace_dof_char(p[1]);
12971 p += 2;
12972 }
12973
12974 freeenv(p_env);
12975
12976 if (len < sizeof (dof_hdr_t)) {
12977 kmem_free(buf, 0);
12978 dtrace_dof_error(NULL, "truncated header");
12979 return (NULL);
12980 }
12981
12982 if (len < (loadsz = dof->dofh_loadsz)) {
12983 kmem_free(buf, 0);
12984 dtrace_dof_error(NULL, "truncated DOF");
12985 return (NULL);
12986 }
12987
12988 if (loadsz >= dtrace_dof_maxsize) {
12989 kmem_free(buf, 0);
12990 dtrace_dof_error(NULL, "oversized DOF");
12991 return (NULL);
12992 }
12993#endif
12994
12995 return (dof);
12996}
12997
12998static void
12999dtrace_dof_destroy(dof_hdr_t *dof)
13000{
13001 kmem_free(dof, dof->dofh_loadsz);
13002}
13003
13004/*
13005 * Return the dof_sec_t pointer corresponding to a given section index. If the
13006 * index is not valid, dtrace_dof_error() is called and NULL is returned. If
13007 * a type other than DOF_SECT_NONE is specified, the header is checked against
13008 * this type and NULL is returned if the types do not match.
13009 */
13010static dof_sec_t *
13011dtrace_dof_sect(dof_hdr_t *dof, uint32_t type, dof_secidx_t i)
13012{
13013 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)
13014 ((uintptr_t)dof + dof->dofh_secoff + i * dof->dofh_secsize);
13015
13016 if (i >= dof->dofh_secnum) {
13017 dtrace_dof_error(dof, "referenced section index is invalid");
13018 return (NULL);
13019 }
13020
13021 if (!(sec->dofs_flags & DOF_SECF_LOAD)) {
13022 dtrace_dof_error(dof, "referenced section is not loadable");
13023 return (NULL);
13024 }
13025
13026 if (type != DOF_SECT_NONE && type != sec->dofs_type) {
13027 dtrace_dof_error(dof, "referenced section is the wrong type");
13028 return (NULL);
13029 }
13030
13031 return (sec);
13032}
13033
13034static dtrace_probedesc_t *
13035dtrace_dof_probedesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_probedesc_t *desc)
13036{
13037 dof_probedesc_t *probe;
13038 dof_sec_t *strtab;
13039 uintptr_t daddr = (uintptr_t)dof;
13040 uintptr_t str;
13041 size_t size;
13042
13043 if (sec->dofs_type != DOF_SECT_PROBEDESC) {
13044 dtrace_dof_error(dof, "invalid probe section");
13045 return (NULL);
13046 }
13047
13048 if (sec->dofs_align != sizeof (dof_secidx_t)) {
13049 dtrace_dof_error(dof, "bad alignment in probe description");
13050 return (NULL);
13051 }
13052
13053 if (sec->dofs_offset + sizeof (dof_probedesc_t) > dof->dofh_loadsz) {
13054 dtrace_dof_error(dof, "truncated probe description");
13055 return (NULL);
13056 }
13057
13058 probe = (dof_probedesc_t *)(uintptr_t)(daddr + sec->dofs_offset);
13059 strtab = dtrace_dof_sect(dof, DOF_SECT_STRTAB, probe->dofp_strtab);
13060
13061 if (strtab == NULL)
13062 return (NULL);
13063
13064 str = daddr + strtab->dofs_offset;
13065 size = strtab->dofs_size;
13066
13067 if (probe->dofp_provider >= strtab->dofs_size) {
13068 dtrace_dof_error(dof, "corrupt probe provider");
13069 return (NULL);
13070 }
13071
13072 (void) strncpy(desc->dtpd_provider,
13073 (char *)(str + probe->dofp_provider),
13074 MIN(DTRACE_PROVNAMELEN - 1, size - probe->dofp_provider));
13075
13076 if (probe->dofp_mod >= strtab->dofs_size) {
13077 dtrace_dof_error(dof, "corrupt probe module");
13078 return (NULL);
13079 }
13080
13081 (void) strncpy(desc->dtpd_mod, (char *)(str + probe->dofp_mod),
13082 MIN(DTRACE_MODNAMELEN - 1, size - probe->dofp_mod));
13083
13084 if (probe->dofp_func >= strtab->dofs_size) {
13085 dtrace_dof_error(dof, "corrupt probe function");
13086 return (NULL);
13087 }
13088
13089 (void) strncpy(desc->dtpd_func, (char *)(str + probe->dofp_func),
13090 MIN(DTRACE_FUNCNAMELEN - 1, size - probe->dofp_func));
13091
13092 if (probe->dofp_name >= strtab->dofs_size) {
13093 dtrace_dof_error(dof, "corrupt probe name");
13094 return (NULL);
13095 }
13096
13097 (void) strncpy(desc->dtpd_name, (char *)(str + probe->dofp_name),
13098 MIN(DTRACE_NAMELEN - 1, size - probe->dofp_name));
13099
13100 return (desc);
13101}
13102
13103static dtrace_difo_t *
13104dtrace_dof_difo(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
13105 cred_t *cr)
13106{
13107 dtrace_difo_t *dp;
13108 size_t ttl = 0;
13109 dof_difohdr_t *dofd;
13110 uintptr_t daddr = (uintptr_t)dof;
13111 size_t max = dtrace_difo_maxsize;
13112 int i, l, n;
13113
13114 static const struct {
13115 int section;
13116 int bufoffs;
13117 int lenoffs;
13118 int entsize;
13119 int align;
13120 const char *msg;
13121 } difo[] = {
13122 { DOF_SECT_DIF, offsetof(dtrace_difo_t, dtdo_buf),
13123 offsetof(dtrace_difo_t, dtdo_len), sizeof (dif_instr_t),
13124 sizeof (dif_instr_t), "multiple DIF sections" },
13125
13126 { DOF_SECT_INTTAB, offsetof(dtrace_difo_t, dtdo_inttab),
13127 offsetof(dtrace_difo_t, dtdo_intlen), sizeof (uint64_t),
13128 sizeof (uint64_t), "multiple integer tables" },
13129
13130 { DOF_SECT_STRTAB, offsetof(dtrace_difo_t, dtdo_strtab),
13131 offsetof(dtrace_difo_t, dtdo_strlen), 0,
13132 sizeof (char), "multiple string tables" },
13133
13134 { DOF_SECT_VARTAB, offsetof(dtrace_difo_t, dtdo_vartab),
13135 offsetof(dtrace_difo_t, dtdo_varlen), sizeof (dtrace_difv_t),
13136 sizeof (uint_t), "multiple variable tables" },
13137
13138 { DOF_SECT_NONE, 0, 0, 0, 0, NULL }
13139 };
13140
13141 if (sec->dofs_type != DOF_SECT_DIFOHDR) {
13142 dtrace_dof_error(dof, "invalid DIFO header section");
13143 return (NULL);
13144 }
13145
13146 if (sec->dofs_align != sizeof (dof_secidx_t)) {
13147 dtrace_dof_error(dof, "bad alignment in DIFO header");
13148 return (NULL);
13149 }
13150
13151 if (sec->dofs_size < sizeof (dof_difohdr_t) ||
13152 sec->dofs_size % sizeof (dof_secidx_t)) {
13153 dtrace_dof_error(dof, "bad size in DIFO header");
13154 return (NULL);
13155 }
13156
13157 dofd = (dof_difohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
13158 n = (sec->dofs_size - sizeof (*dofd)) / sizeof (dof_secidx_t) + 1;
13159
13160 dp = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
13161 dp->dtdo_rtype = dofd->dofd_rtype;
13162
13163 for (l = 0; l < n; l++) {
13164 dof_sec_t *subsec;
13165 void **bufp;
13166 uint32_t *lenp;
13167
13168 if ((subsec = dtrace_dof_sect(dof, DOF_SECT_NONE,
13169 dofd->dofd_links[l])) == NULL)
13170 goto err; /* invalid section link */
13171
13172 if (ttl + subsec->dofs_size > max) {
13173 dtrace_dof_error(dof, "exceeds maximum size");
13174 goto err;
13175 }
13176
13177 ttl += subsec->dofs_size;
13178
13179 for (i = 0; difo[i].section != DOF_SECT_NONE; i++) {
13180 if (subsec->dofs_type != difo[i].section)
13181 continue;
13182
13183 if (!(subsec->dofs_flags & DOF_SECF_LOAD)) {
13184 dtrace_dof_error(dof, "section not loaded");
13185 goto err;
13186 }
13187
13188 if (subsec->dofs_align != difo[i].align) {
13189 dtrace_dof_error(dof, "bad alignment");
13190 goto err;
13191 }
13192
13193 bufp = (void **)((uintptr_t)dp + difo[i].bufoffs);
13194 lenp = (uint32_t *)((uintptr_t)dp + difo[i].lenoffs);
13195
13196 if (*bufp != NULL) {
13197 dtrace_dof_error(dof, difo[i].msg);
13198 goto err;
13199 }
13200
13201 if (difo[i].entsize != subsec->dofs_entsize) {
13202 dtrace_dof_error(dof, "entry size mismatch");
13203 goto err;
13204 }
13205
13206 if (subsec->dofs_entsize != 0 &&
13207 (subsec->dofs_size % subsec->dofs_entsize) != 0) {
13208 dtrace_dof_error(dof, "corrupt entry size");
13209 goto err;
13210 }
13211
13212 *lenp = subsec->dofs_size;
13213 *bufp = kmem_alloc(subsec->dofs_size, KM_SLEEP);
13214 bcopy((char *)(uintptr_t)(daddr + subsec->dofs_offset),
13215 *bufp, subsec->dofs_size);
13216
13217 if (subsec->dofs_entsize != 0)
13218 *lenp /= subsec->dofs_entsize;
13219
13220 break;
13221 }
13222
13223 /*
13224 * If we encounter a loadable DIFO sub-section that is not
13225 * known to us, assume this is a broken program and fail.
13226 */
13227 if (difo[i].section == DOF_SECT_NONE &&
13228 (subsec->dofs_flags & DOF_SECF_LOAD)) {
13229 dtrace_dof_error(dof, "unrecognized DIFO subsection");
13230 goto err;
13231 }
13232 }
13233
13234 if (dp->dtdo_buf == NULL) {
13235 /*
13236 * We can't have a DIF object without DIF text.
13237 */
13238 dtrace_dof_error(dof, "missing DIF text");
13239 goto err;
13240 }
13241
13242 /*
13243 * Before we validate the DIF object, run through the variable table
13244	 * looking for the strings -- if any of them have a zero size, we'll set
13245 * their size to be the system-wide default string size. Note that
13246 * this should _not_ happen if the "strsize" option has been set --
13247 * in this case, the compiler should have set the size to reflect the
13248 * setting of the option.
13249 */
13250 for (i = 0; i < dp->dtdo_varlen; i++) {
13251 dtrace_difv_t *v = &dp->dtdo_vartab[i];
13252 dtrace_diftype_t *t = &v->dtdv_type;
13253
13254 if (v->dtdv_id < DIF_VAR_OTHER_UBASE)
13255 continue;
13256
13257 if (t->dtdt_kind == DIF_TYPE_STRING && t->dtdt_size == 0)
13258 t->dtdt_size = dtrace_strsize_default;
13259 }
13260
13261 if (dtrace_difo_validate(dp, vstate, DIF_DIR_NREGS, cr) != 0)
13262 goto err;
13263
13264 dtrace_difo_init(dp, vstate);
13265 return (dp);
13266
13267err:
13268 kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
13269 kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
13270 kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
13271 kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
13272
13273 kmem_free(dp, sizeof (dtrace_difo_t));
13274 return (NULL);
13275}
13276
13277static dtrace_predicate_t *
13278dtrace_dof_predicate(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
13279 cred_t *cr)
13280{
13281 dtrace_difo_t *dp;
13282
13283 if ((dp = dtrace_dof_difo(dof, sec, vstate, cr)) == NULL)
13284 return (NULL);
13285
13286 return (dtrace_predicate_create(dp));
13287}
13288
13289static dtrace_actdesc_t *
13290dtrace_dof_actdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
13291 cred_t *cr)
13292{
13293 dtrace_actdesc_t *act, *first = NULL, *last = NULL, *next;
13294 dof_actdesc_t *desc;
13295 dof_sec_t *difosec;
13296 size_t offs;
13297 uintptr_t daddr = (uintptr_t)dof;
13298 uint64_t arg;
13299 dtrace_actkind_t kind;
13300
13301 if (sec->dofs_type != DOF_SECT_ACTDESC) {
13302 dtrace_dof_error(dof, "invalid action section");
13303 return (NULL);
13304 }
13305
13306 if (sec->dofs_offset + sizeof (dof_actdesc_t) > dof->dofh_loadsz) {
13307 dtrace_dof_error(dof, "truncated action description");
13308 return (NULL);
13309 }
13310
13311 if (sec->dofs_align != sizeof (uint64_t)) {
13312 dtrace_dof_error(dof, "bad alignment in action description");
13313 return (NULL);
13314 }
13315
13316 if (sec->dofs_size < sec->dofs_entsize) {
13317 dtrace_dof_error(dof, "section entry size exceeds total size");
13318 return (NULL);
13319 }
13320
13321 if (sec->dofs_entsize != sizeof (dof_actdesc_t)) {
13322 dtrace_dof_error(dof, "bad entry size in action description");
13323 return (NULL);
13324 }
13325
13326 if (sec->dofs_size / sec->dofs_entsize > dtrace_actions_max) {
13327 dtrace_dof_error(dof, "actions exceed dtrace_actions_max");
13328 return (NULL);
13329 }
13330
13331 for (offs = 0; offs < sec->dofs_size; offs += sec->dofs_entsize) {
13332 desc = (dof_actdesc_t *)(daddr +
13333 (uintptr_t)sec->dofs_offset + offs);
13334 kind = (dtrace_actkind_t)desc->dofa_kind;
13335
13336 if ((DTRACEACT_ISPRINTFLIKE(kind) &&
13337 (kind != DTRACEACT_PRINTA ||
13338 desc->dofa_strtab != DOF_SECIDX_NONE)) ||
13339 (kind == DTRACEACT_DIFEXPR &&
13340 desc->dofa_strtab != DOF_SECIDX_NONE)) {
13341 dof_sec_t *strtab;
13342 char *str, *fmt;
13343 uint64_t i;
13344
13345 /*
13346 * The argument to these actions is an index into the
13347 * DOF string table. For printf()-like actions, this
13348 * is the format string. For print(), this is the
13349 * CTF type of the expression result.
13350 */
13351 if ((strtab = dtrace_dof_sect(dof,
13352 DOF_SECT_STRTAB, desc->dofa_strtab)) == NULL)
13353 goto err;
13354
13355 str = (char *)((uintptr_t)dof +
13356 (uintptr_t)strtab->dofs_offset);
13357
13358 for (i = desc->dofa_arg; i < strtab->dofs_size; i++) {
13359 if (str[i] == '\0')
13360 break;
13361 }
13362
13363 if (i >= strtab->dofs_size) {
13364 dtrace_dof_error(dof, "bogus format string");
13365 goto err;
13366 }
13367
13368 if (i == desc->dofa_arg) {
13369 dtrace_dof_error(dof, "empty format string");
13370 goto err;
13371 }
13372
13373 i -= desc->dofa_arg;
13374 fmt = kmem_alloc(i + 1, KM_SLEEP);
13375 bcopy(&str[desc->dofa_arg], fmt, i + 1);
13376 arg = (uint64_t)(uintptr_t)fmt;
13377 } else {
13378 if (kind == DTRACEACT_PRINTA) {
13379 ASSERT(desc->dofa_strtab == DOF_SECIDX_NONE);
13380 arg = 0;
13381 } else {
13382 arg = desc->dofa_arg;
13383 }
13384 }
13385
13386 act = dtrace_actdesc_create(kind, desc->dofa_ntuple,
13387 desc->dofa_uarg, arg);
13388
13389 if (last != NULL) {
13390 last->dtad_next = act;
13391 } else {
13392 first = act;
13393 }
13394
13395 last = act;
13396
13397 if (desc->dofa_difo == DOF_SECIDX_NONE)
13398 continue;
13399
13400 if ((difosec = dtrace_dof_sect(dof,
13401 DOF_SECT_DIFOHDR, desc->dofa_difo)) == NULL)
13402 goto err;
13403
13404 act->dtad_difo = dtrace_dof_difo(dof, difosec, vstate, cr);
13405
13406 if (act->dtad_difo == NULL)
13407 goto err;
13408 }
13409
13410 ASSERT(first != NULL);
13411 return (first);
13412
13413err:
13414 for (act = first; act != NULL; act = next) {
13415 next = act->dtad_next;
13416 dtrace_actdesc_release(act, vstate);
13417 }
13418
13419 return (NULL);
13420}
13421
13422static dtrace_ecbdesc_t *
13423dtrace_dof_ecbdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
13424 cred_t *cr)
13425{
13426 dtrace_ecbdesc_t *ep;
13427 dof_ecbdesc_t *ecb;
13428 dtrace_probedesc_t *desc;
13429 dtrace_predicate_t *pred = NULL;
13430
13431 if (sec->dofs_size < sizeof (dof_ecbdesc_t)) {
13432 dtrace_dof_error(dof, "truncated ECB description");
13433 return (NULL);
13434 }
13435
13436 if (sec->dofs_align != sizeof (uint64_t)) {
13437 dtrace_dof_error(dof, "bad alignment in ECB description");
13438 return (NULL);
13439 }
13440
13441 ecb = (dof_ecbdesc_t *)((uintptr_t)dof + (uintptr_t)sec->dofs_offset);
13442 sec = dtrace_dof_sect(dof, DOF_SECT_PROBEDESC, ecb->dofe_probes);
13443
13444 if (sec == NULL)
13445 return (NULL);
13446
13447 ep = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
13448 ep->dted_uarg = ecb->dofe_uarg;
13449 desc = &ep->dted_probe;
13450
13451 if (dtrace_dof_probedesc(dof, sec, desc) == NULL)
13452 goto err;
13453
13454 if (ecb->dofe_pred != DOF_SECIDX_NONE) {
13455 if ((sec = dtrace_dof_sect(dof,
13456 DOF_SECT_DIFOHDR, ecb->dofe_pred)) == NULL)
13457 goto err;
13458
13459 if ((pred = dtrace_dof_predicate(dof, sec, vstate, cr)) == NULL)
13460 goto err;
13461
13462 ep->dted_pred.dtpdd_predicate = pred;
13463 }
13464
13465 if (ecb->dofe_actions != DOF_SECIDX_NONE) {
13466 if ((sec = dtrace_dof_sect(dof,
13467 DOF_SECT_ACTDESC, ecb->dofe_actions)) == NULL)
13468 goto err;
13469
13470 ep->dted_action = dtrace_dof_actdesc(dof, sec, vstate, cr);
13471
13472 if (ep->dted_action == NULL)
13473 goto err;
13474 }
13475
13476 return (ep);
13477
13478err:
13479 if (pred != NULL)
13480 dtrace_predicate_release(pred, vstate);
13481 kmem_free(ep, sizeof (dtrace_ecbdesc_t));
13482 return (NULL);
13483}
13484
13485/*
13486 * Apply the relocations from the specified 'sec' (a DOF_SECT_URELHDR) to the
13487 * specified DOF. At present, this amounts to simply adding 'ubase' to the
13488 * site of any user SETX relocations to account for load object base address.
13489 * In the future, if we need other relocations, this function can be extended.
13490 */
13491static int
13492dtrace_dof_relocate(dof_hdr_t *dof, dof_sec_t *sec, uint64_t ubase)
13493{
13494 uintptr_t daddr = (uintptr_t)dof;
13495 dof_relohdr_t *dofr =
13496 (dof_relohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
13497 dof_sec_t *ss, *rs, *ts;
13498 dof_relodesc_t *r;
13499 uint_t i, n;
13500
13501 if (sec->dofs_size < sizeof (dof_relohdr_t) ||
13502 sec->dofs_align != sizeof (dof_secidx_t)) {
13503 dtrace_dof_error(dof, "invalid relocation header");
13504 return (-1);
13505 }
13506
13507 ss = dtrace_dof_sect(dof, DOF_SECT_STRTAB, dofr->dofr_strtab);
13508 rs = dtrace_dof_sect(dof, DOF_SECT_RELTAB, dofr->dofr_relsec);
13509 ts = dtrace_dof_sect(dof, DOF_SECT_NONE, dofr->dofr_tgtsec);
13510
13511 if (ss == NULL || rs == NULL || ts == NULL)
13512 return (-1); /* dtrace_dof_error() has been called already */
13513
13514 if (rs->dofs_entsize < sizeof (dof_relodesc_t) ||
13515 rs->dofs_align != sizeof (uint64_t)) {
13516 dtrace_dof_error(dof, "invalid relocation section");
13517 return (-1);
13518 }
13519
13520 r = (dof_relodesc_t *)(uintptr_t)(daddr + rs->dofs_offset);
13521 n = rs->dofs_size / rs->dofs_entsize;
13522
13523 for (i = 0; i < n; i++) {
13524 uintptr_t taddr = daddr + ts->dofs_offset + r->dofr_offset;
13525
13526 switch (r->dofr_type) {
13527 case DOF_RELO_NONE:
13528 break;
13529 case DOF_RELO_SETX:
13530 if (r->dofr_offset >= ts->dofs_size || r->dofr_offset +
13531 sizeof (uint64_t) > ts->dofs_size) {
13532 dtrace_dof_error(dof, "bad relocation offset");
13533 return (-1);
13534 }
13535
13536 if (!IS_P2ALIGNED(taddr, sizeof (uint64_t))) {
13537 dtrace_dof_error(dof, "misaligned setx relo");
13538 return (-1);
13539 }
13540
13541 *(uint64_t *)taddr += ubase;
13542 break;
13543 default:
13544 dtrace_dof_error(dof, "invalid relocation type");
13545 return (-1);
13546 }
13547
13548 r = (dof_relodesc_t *)((uintptr_t)r + rs->dofs_entsize);
13549 }
13550
13551 return (0);
13552}
13553
13554/*
13555 * The dof_hdr_t passed to dtrace_dof_slurp() should be a partially validated
13556 * header: it should be at the front of a memory region that is at least
13557 * sizeof (dof_hdr_t) in size -- and then at least dof_hdr.dofh_loadsz in
13558 * size. It need not be validated in any other way.
13559 */
13560static int
13561dtrace_dof_slurp(dof_hdr_t *dof, dtrace_vstate_t *vstate, cred_t *cr,
13562 dtrace_enabling_t **enabp, uint64_t ubase, int noprobes)
13563{
13564 uint64_t len = dof->dofh_loadsz, seclen;
13565 uintptr_t daddr = (uintptr_t)dof;
13566 dtrace_ecbdesc_t *ep;
13567 dtrace_enabling_t *enab;
13568 uint_t i;
13569
13570 ASSERT(MUTEX_HELD(&dtrace_lock));
13571 ASSERT(dof->dofh_loadsz >= sizeof (dof_hdr_t));
13572
13573 /*
13574 * Check the DOF header identification bytes. In addition to checking
13575 * valid settings, we also verify that unused bits/bytes are zeroed so
13576 * we can use them later without fear of regressing existing binaries.
13577 */
13578 if (bcmp(&dof->dofh_ident[DOF_ID_MAG0],
13579 DOF_MAG_STRING, DOF_MAG_STRLEN) != 0) {
13580 dtrace_dof_error(dof, "DOF magic string mismatch");
13581 return (-1);
13582 }
13583
13584 if (dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_ILP32 &&
13585 dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_LP64) {
13586 dtrace_dof_error(dof, "DOF has invalid data model");
13587 return (-1);
13588 }
13589
13590 if (dof->dofh_ident[DOF_ID_ENCODING] != DOF_ENCODE_NATIVE) {
13591 dtrace_dof_error(dof, "DOF encoding mismatch");
13592 return (-1);
13593 }
13594
13595 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
13596 dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_2) {
13597 dtrace_dof_error(dof, "DOF version mismatch");
13598 return (-1);
13599 }
13600
13601 if (dof->dofh_ident[DOF_ID_DIFVERS] != DIF_VERSION_2) {
13602 dtrace_dof_error(dof, "DOF uses unsupported instruction set");
13603 return (-1);
13604 }
13605
13606 if (dof->dofh_ident[DOF_ID_DIFIREG] > DIF_DIR_NREGS) {
13607 dtrace_dof_error(dof, "DOF uses too many integer registers");
13608 return (-1);
13609 }
13610
13611 if (dof->dofh_ident[DOF_ID_DIFTREG] > DIF_DTR_NREGS) {
13612 dtrace_dof_error(dof, "DOF uses too many tuple registers");
13613 return (-1);
13614 }
13615
13616 for (i = DOF_ID_PAD; i < DOF_ID_SIZE; i++) {
13617 if (dof->dofh_ident[i] != 0) {
13618 dtrace_dof_error(dof, "DOF has invalid ident byte set");
13619 return (-1);
13620 }
13621 }
13622
13623 if (dof->dofh_flags & ~DOF_FL_VALID) {
13624 dtrace_dof_error(dof, "DOF has invalid flag bits set");
13625 return (-1);
13626 }
13627
13628 if (dof->dofh_secsize == 0) {
13629 dtrace_dof_error(dof, "zero section header size");
13630 return (-1);
13631 }
13632
13633 /*
13634 * Check that the section headers don't exceed the amount of DOF
13635 * data. Note that we cast the section size and number of sections
13636 * to uint64_t's to prevent possible overflow in the multiplication.
13637 */
13638 seclen = (uint64_t)dof->dofh_secnum * (uint64_t)dof->dofh_secsize;
13639
13640 if (dof->dofh_secoff > len || seclen > len ||
13641 dof->dofh_secoff + seclen > len) {
13642 dtrace_dof_error(dof, "truncated section headers");
13643 return (-1);
13644 }
13645
13646 if (!IS_P2ALIGNED(dof->dofh_secoff, sizeof (uint64_t))) {
13647 dtrace_dof_error(dof, "misaligned section headers");
13648 return (-1);
13649 }
13650
13651 if (!IS_P2ALIGNED(dof->dofh_secsize, sizeof (uint64_t))) {
13652 dtrace_dof_error(dof, "misaligned section size");
13653 return (-1);
13654 }
13655
13656 /*
13657 * Take an initial pass through the section headers to be sure that
13658 * the headers don't have stray offsets. If the 'noprobes' flag is
13659 * set, do not permit sections relating to providers, probes, or args.
13660 */
13661 for (i = 0; i < dof->dofh_secnum; i++) {
13662 dof_sec_t *sec = (dof_sec_t *)(daddr +
13663 (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
13664
13665 if (noprobes) {
13666 switch (sec->dofs_type) {
13667 case DOF_SECT_PROVIDER:
13668 case DOF_SECT_PROBES:
13669 case DOF_SECT_PRARGS:
13670 case DOF_SECT_PROFFS:
13671 dtrace_dof_error(dof, "illegal sections "
13672 "for enabling");
13673 return (-1);
13674 }
13675 }
13676
13677 if (DOF_SEC_ISLOADABLE(sec->dofs_type) &&
13678 !(sec->dofs_flags & DOF_SECF_LOAD)) {
13679 dtrace_dof_error(dof, "loadable section with load "
13680 "flag unset");
13681 return (-1);
13682 }
13683
13684 if (!(sec->dofs_flags & DOF_SECF_LOAD))
13685 continue; /* just ignore non-loadable sections */
13686
13687 if (sec->dofs_align & (sec->dofs_align - 1)) {
13688 dtrace_dof_error(dof, "bad section alignment");
13689 return (-1);
13690 }
13691
13692 if (sec->dofs_offset & (sec->dofs_align - 1)) {
13693 dtrace_dof_error(dof, "misaligned section");
13694 return (-1);
13695 }
13696
13697 if (sec->dofs_offset > len || sec->dofs_size > len ||
13698 sec->dofs_offset + sec->dofs_size > len) {
13699 dtrace_dof_error(dof, "corrupt section header");
13700 return (-1);
13701 }
13702
13703 if (sec->dofs_type == DOF_SECT_STRTAB && *((char *)daddr +
13704 sec->dofs_offset + sec->dofs_size - 1) != '\0') {
13705 dtrace_dof_error(dof, "non-terminating string table");
13706 return (-1);
13707 }
13708 }
13709
13710 /*
13711 * Take a second pass through the sections and locate and perform any
13712 * relocations that are present. We do this after the first pass to
13713 * be sure that all sections have had their headers validated.
13714 */
13715 for (i = 0; i < dof->dofh_secnum; i++) {
13716 dof_sec_t *sec = (dof_sec_t *)(daddr +
13717 (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
13718
13719 if (!(sec->dofs_flags & DOF_SECF_LOAD))
13720 continue; /* skip sections that are not loadable */
13721
13722 switch (sec->dofs_type) {
13723 case DOF_SECT_URELHDR:
13724 if (dtrace_dof_relocate(dof, sec, ubase) != 0)
13725 return (-1);
13726 break;
13727 }
13728 }
13729
13730 if ((enab = *enabp) == NULL)
13731 enab = *enabp = dtrace_enabling_create(vstate);
13732
13733 for (i = 0; i < dof->dofh_secnum; i++) {
13734 dof_sec_t *sec = (dof_sec_t *)(daddr +
13735 (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
13736
13737 if (sec->dofs_type != DOF_SECT_ECBDESC)
13738 continue;
13739
13740 if ((ep = dtrace_dof_ecbdesc(dof, sec, vstate, cr)) == NULL) {
13741 dtrace_enabling_destroy(enab);
13742 *enabp = NULL;
13743 return (-1);
13744 }
13745
13746 dtrace_enabling_add(enab, ep);
13747 }
13748
13749 return (0);
13750}
13751
13752/*
13753 * Process DOF for any options. This routine assumes that the DOF has been
13754 * at least processed by dtrace_dof_slurp().
13755 */
13756static int
13757dtrace_dof_options(dof_hdr_t *dof, dtrace_state_t *state)
13758{
13759 int i, rval;
13760 uint32_t entsize;
13761 size_t offs;
13762 dof_optdesc_t *desc;
13763
13764 for (i = 0; i < dof->dofh_secnum; i++) {
13765 dof_sec_t *sec = (dof_sec_t *)((uintptr_t)dof +
13766 (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
13767
13768 if (sec->dofs_type != DOF_SECT_OPTDESC)
13769 continue;
13770
13771 if (sec->dofs_align != sizeof (uint64_t)) {
13772 dtrace_dof_error(dof, "bad alignment in "
13773 "option description");
13774 return (EINVAL);
13775 }
13776
13777 if ((entsize = sec->dofs_entsize) == 0) {
13778 dtrace_dof_error(dof, "zeroed option entry size");
13779 return (EINVAL);
13780 }
13781
13782 if (entsize < sizeof (dof_optdesc_t)) {
13783 dtrace_dof_error(dof, "bad option entry size");
13784 return (EINVAL);
13785 }
13786
13787 for (offs = 0; offs < sec->dofs_size; offs += entsize) {
13788 desc = (dof_optdesc_t *)((uintptr_t)dof +
13789 (uintptr_t)sec->dofs_offset + offs);
13790
13791 if (desc->dofo_strtab != DOF_SECIDX_NONE) {
13792 dtrace_dof_error(dof, "non-zero option string");
13793 return (EINVAL);
13794 }
13795
13796 if (desc->dofo_value == DTRACEOPT_UNSET) {
13797 dtrace_dof_error(dof, "unset option");
13798 return (EINVAL);
13799 }
13800
13801 if ((rval = dtrace_state_option(state,
13802 desc->dofo_option, desc->dofo_value)) != 0) {
13803 dtrace_dof_error(dof, "rejected option");
13804 return (rval);
13805 }
13806 }
13807 }
13808
13809 return (0);
13810}
13811
13812/*
13813 * DTrace Consumer State Functions
13814 */
13815static int
13816dtrace_dstate_init(dtrace_dstate_t *dstate, size_t size)
13817{
13818 size_t hashsize, maxper, min, chunksize = dstate->dtds_chunksize;
13819 void *base;
13820 uintptr_t limit;
13821 dtrace_dynvar_t *dvar, *next, *start;
13822 int i;
13823
13824 ASSERT(MUTEX_HELD(&dtrace_lock));
13825 ASSERT(dstate->dtds_base == NULL && dstate->dtds_percpu == NULL);
13826
13827 bzero(dstate, sizeof (dtrace_dstate_t));
13828
13829 if ((dstate->dtds_chunksize = chunksize) == 0)
13830 dstate->dtds_chunksize = DTRACE_DYNVAR_CHUNKSIZE;
13831
13832 if (size < (min = dstate->dtds_chunksize + sizeof (dtrace_dynhash_t)))
13833 size = min;
13834
13835 if ((base = kmem_zalloc(size, KM_NOSLEEP | KM_NORMALPRI)) == NULL)
13836 return (ENOMEM);
13837
13838 dstate->dtds_size = size;
13839 dstate->dtds_base = base;
13840 dstate->dtds_percpu = kmem_cache_alloc(dtrace_state_cache, KM_SLEEP);
13841 bzero(dstate->dtds_percpu, NCPU * sizeof (dtrace_dstate_percpu_t));
13842
13843 hashsize = size / (dstate->dtds_chunksize + sizeof (dtrace_dynhash_t));
13844
13845 if (hashsize != 1 && (hashsize & 1))
13846 hashsize--;
13847
13848 dstate->dtds_hashsize = hashsize;
13849 dstate->dtds_hash = dstate->dtds_base;
13850
13851 /*
13852 * Set all of our hash buckets to point to the single sink, and (if
13853 * it hasn't already been set), set the sink's hash value to be the
13854 * sink sentinel value. The sink is needed for dynamic variable
13855 * lookups to know that they have iterated over an entire, valid hash
13856 * chain.
13857 */
13858 for (i = 0; i < hashsize; i++)
13859 dstate->dtds_hash[i].dtdh_chain = &dtrace_dynhash_sink;
13860
13861 if (dtrace_dynhash_sink.dtdv_hashval != DTRACE_DYNHASH_SINK)
13862 dtrace_dynhash_sink.dtdv_hashval = DTRACE_DYNHASH_SINK;
13863
13864 /*
13865 * Determine number of active CPUs. Divide free list evenly among
13866 * active CPUs.
13867 */
13868 start = (dtrace_dynvar_t *)
13869 ((uintptr_t)base + hashsize * sizeof (dtrace_dynhash_t));
13870 limit = (uintptr_t)base + size;
13871
13872 maxper = (limit - (uintptr_t)start) / NCPU;
13873 maxper = (maxper / dstate->dtds_chunksize) * dstate->dtds_chunksize;
13874
13875#if !defined(sun)
13876 CPU_FOREACH(i) {
13877#else
13878 for (i = 0; i < NCPU; i++) {
13879#endif
13880 dstate->dtds_percpu[i].dtdsc_free = dvar = start;
13881
13882 /*
13883 * If we don't even have enough chunks to make it once through
13884 * NCPUs, we're just going to allocate everything to the first
13885 * CPU. And if we're on the last CPU, we're going to allocate
13886 * whatever is left over. In either case, we set the limit to
13887 * be the limit of the dynamic variable space.
13888 */
13889 if (maxper == 0 || i == NCPU - 1) {
13890 limit = (uintptr_t)base + size;
13891 start = NULL;
13892 } else {
13893 limit = (uintptr_t)start + maxper;
13894 start = (dtrace_dynvar_t *)limit;
13895 }
13896
13897 ASSERT(limit <= (uintptr_t)base + size);
13898
13899 for (;;) {
13900 next = (dtrace_dynvar_t *)((uintptr_t)dvar +
13901 dstate->dtds_chunksize);
13902
13903 if ((uintptr_t)next + dstate->dtds_chunksize >= limit)
13904 break;
13905
13906 dvar->dtdv_next = next;
13907 dvar = next;
13908 }
13909
13910 if (maxper == 0)
13911 break;
13912 }
13913
13914 return (0);
13915}
13916
13917static void
13918dtrace_dstate_fini(dtrace_dstate_t *dstate)
13919{
13920 ASSERT(MUTEX_HELD(&cpu_lock));
13921
13922 if (dstate->dtds_base == NULL)
13923 return;
13924
13925 kmem_free(dstate->dtds_base, dstate->dtds_size);
13926 kmem_cache_free(dtrace_state_cache, dstate->dtds_percpu);
13927}
13928
13929static void
13930dtrace_vstate_fini(dtrace_vstate_t *vstate)
13931{
13932 /*
13933 * Logical XOR, where are you?
13934 */
13935 ASSERT((vstate->dtvs_nglobals == 0) ^ (vstate->dtvs_globals != NULL));
13936
13937 if (vstate->dtvs_nglobals > 0) {
13938 kmem_free(vstate->dtvs_globals, vstate->dtvs_nglobals *
13939 sizeof (dtrace_statvar_t *));
13940 }
13941
13942 if (vstate->dtvs_ntlocals > 0) {
13943 kmem_free(vstate->dtvs_tlocals, vstate->dtvs_ntlocals *
13944 sizeof (dtrace_difv_t));
13945 }
13946
13947 ASSERT((vstate->dtvs_nlocals == 0) ^ (vstate->dtvs_locals != NULL));
13948
13949 if (vstate->dtvs_nlocals > 0) {
13950 kmem_free(vstate->dtvs_locals, vstate->dtvs_nlocals *
13951 sizeof (dtrace_statvar_t *));
13952 }
13953}
13954
13955#if defined(sun)
13956static void
13957dtrace_state_clean(dtrace_state_t *state)
13958{
13959 if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE)
13960 return;
13961
13962 dtrace_dynvar_clean(&state->dts_vstate.dtvs_dynvars);
13963 dtrace_speculation_clean(state);
13964}
13965
13966static void
13967dtrace_state_deadman(dtrace_state_t *state)
13968{
13969 hrtime_t now;
13970
13971 dtrace_sync();
13972
13973 now = dtrace_gethrtime();
13974
13975 if (state != dtrace_anon.dta_state &&
13976 now - state->dts_laststatus >= dtrace_deadman_user)
13977 return;
13978
13979 /*
13980 * We must be sure that dts_alive never appears to be less than the
13981 * value upon entry to dtrace_state_deadman(), and because we lack a
13982 * dtrace_cas64(), we cannot store to it atomically. We thus instead
13983 * store INT64_MAX to it, followed by a memory barrier, followed by
13984 * the new value. This assures that dts_alive never appears to be
13985 * less than its true value, regardless of the order in which the
13986 * stores to the underlying storage are issued.
13987 */
13988 state->dts_alive = INT64_MAX;
13989 dtrace_membar_producer();
13990 state->dts_alive = now;
13991}
13992#else
13993static void
13994dtrace_state_clean(void *arg)
13995{
13996 dtrace_state_t *state = arg;
13997 dtrace_optval_t *opt = state->dts_options;
13998
13999 if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE)
14000 return;
14001
14002 dtrace_dynvar_clean(&state->dts_vstate.dtvs_dynvars);
14003 dtrace_speculation_clean(state);
14004
14005 callout_reset(&state->dts_cleaner, hz * opt[DTRACEOPT_CLEANRATE] / NANOSEC,
14006 dtrace_state_clean, state);
14007}
14008
14009static void
14010dtrace_state_deadman(void *arg)
14011{
14012 dtrace_state_t *state = arg;
14013 hrtime_t now;
14014
14015 dtrace_sync();
14016
14017 dtrace_debug_output();
14018
14019 now = dtrace_gethrtime();
14020
14021 if (state != dtrace_anon.dta_state &&
14022 now - state->dts_laststatus >= dtrace_deadman_user)
14023 return;
14024
14025 /*
14026 * We must be sure that dts_alive never appears to be less than the
14027 * value upon entry to dtrace_state_deadman(), and because we lack a
14028 * dtrace_cas64(), we cannot store to it atomically. We thus instead
14029 * store INT64_MAX to it, followed by a memory barrier, followed by
14030 * the new value. This assures that dts_alive never appears to be
14031 * less than its true value, regardless of the order in which the
14032 * stores to the underlying storage are issued.
14033 */
14034 state->dts_alive = INT64_MAX;
14035 dtrace_membar_producer();
14036 state->dts_alive = now;
14037
14038 callout_reset(&state->dts_deadman, hz * dtrace_deadman_interval / NANOSEC,
14039 dtrace_state_deadman, state);
14040}
14041#endif
14042
14043static dtrace_state_t *
14044#if defined(sun)
14045dtrace_state_create(dev_t *devp, cred_t *cr)
14046#else
14047dtrace_state_create(struct cdev *dev)
14048#endif
14049{
14050#if defined(sun)
14051 minor_t minor;
14052 major_t major;
14053#else
14054 cred_t *cr = NULL;
14055 int m = 0;
14056#endif
14057 char c[30];
14058 dtrace_state_t *state;
14059 dtrace_optval_t *opt;
14060 int bufsize = NCPU * sizeof (dtrace_buffer_t), i;
14061
14062 ASSERT(MUTEX_HELD(&dtrace_lock));
14063 ASSERT(MUTEX_HELD(&cpu_lock));
14064
14065#if defined(sun)
14066 minor = (minor_t)(uintptr_t)vmem_alloc(dtrace_minor, 1,
14067 VM_BESTFIT | VM_SLEEP);
14068
14069 if (ddi_soft_state_zalloc(dtrace_softstate, minor) != DDI_SUCCESS) {
14070 vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
14071 return (NULL);
14072 }
14073
14074 state = ddi_get_soft_state(dtrace_softstate, minor);
14075#else
14076 if (dev != NULL) {
14077 cr = dev->si_cred;
14078 m = dev2unit(dev);
14079 }
14080
14081 /* Allocate memory for the state. */
14082 state = kmem_zalloc(sizeof(dtrace_state_t), KM_SLEEP);
14083#endif
14084
14085 state->dts_epid = DTRACE_EPIDNONE + 1;
14086
14087 (void) snprintf(c, sizeof (c), "dtrace_aggid_%d", m);
14088#if defined(sun)
14089 state->dts_aggid_arena = vmem_create(c, (void *)1, UINT32_MAX, 1,
14090 NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
14091
14092 if (devp != NULL) {
14093 major = getemajor(*devp);
14094 } else {
14095 major = ddi_driver_major(dtrace_devi);
14096 }
14097
14098 state->dts_dev = makedevice(major, minor);
14099
14100 if (devp != NULL)
14101 *devp = state->dts_dev;
14102#else
14103 state->dts_aggid_arena = new_unrhdr(1, INT_MAX, &dtrace_unr_mtx);
14104 state->dts_dev = dev;
14105#endif
14106
14107 /*
14108 * We allocate NCPU buffers. On the one hand, this can be quite
14109 * a bit of memory per instance (nearly 36K on a Starcat). On the
14110 * other hand, it saves an additional memory reference in the probe
14111 * path.
14112 */
14113 state->dts_buffer = kmem_zalloc(bufsize, KM_SLEEP);
14114 state->dts_aggbuffer = kmem_zalloc(bufsize, KM_SLEEP);
14115
14116#if defined(sun)
14117 state->dts_cleaner = CYCLIC_NONE;
14118 state->dts_deadman = CYCLIC_NONE;
14119#else
14120 callout_init(&state->dts_cleaner, CALLOUT_MPSAFE);
14121 callout_init(&state->dts_deadman, CALLOUT_MPSAFE);
14122#endif
14123 state->dts_vstate.dtvs_state = state;
14124
14125 for (i = 0; i < DTRACEOPT_MAX; i++)
14126 state->dts_options[i] = DTRACEOPT_UNSET;
14127
14128 /*
14129 * Set the default options.
14130 */
14131 opt = state->dts_options;
14132 opt[DTRACEOPT_BUFPOLICY] = DTRACEOPT_BUFPOLICY_SWITCH;
14133 opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_AUTO;
14134 opt[DTRACEOPT_NSPEC] = dtrace_nspec_default;
14135 opt[DTRACEOPT_SPECSIZE] = dtrace_specsize_default;
14136 opt[DTRACEOPT_CPU] = (dtrace_optval_t)DTRACE_CPUALL;
14137 opt[DTRACEOPT_STRSIZE] = dtrace_strsize_default;
14138 opt[DTRACEOPT_STACKFRAMES] = dtrace_stackframes_default;
14139 opt[DTRACEOPT_USTACKFRAMES] = dtrace_ustackframes_default;
14140 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_default;
14141 opt[DTRACEOPT_AGGRATE] = dtrace_aggrate_default;
14142 opt[DTRACEOPT_SWITCHRATE] = dtrace_switchrate_default;
14143 opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_default;
14144 opt[DTRACEOPT_JSTACKFRAMES] = dtrace_jstackframes_default;
14145 opt[DTRACEOPT_JSTACKSTRSIZE] = dtrace_jstackstrsize_default;
14146
14147 state->dts_activity = DTRACE_ACTIVITY_INACTIVE;
14148
14149 /*
14150 * Depending on the user credentials, we set flag bits which alter probe
14151 * visibility or the amount of destructiveness allowed. In the case of
14152 * actual anonymous tracing, or the possession of all privileges, all of
14153 * the normal checks are bypassed.
14154 */
14155 if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
14156 state->dts_cred.dcr_visible = DTRACE_CRV_ALL;
14157 state->dts_cred.dcr_action = DTRACE_CRA_ALL;
14158 } else {
14159 /*
14160 * Set up the credentials for this instantiation. We take a
14161 * hold on the credential to prevent it from disappearing on
14162 * us; this in turn prevents the zone_t referenced by this
14163 * credential from disappearing. This means that we can
14164 * examine the credential and the zone from probe context.
14165 */
14166 crhold(cr);
14167 state->dts_cred.dcr_cred = cr;
14168
14169 /*
14170 * CRA_PROC means "we have *some* privilege for dtrace" and
14171 * unlocks the use of variables like pid, zonename, etc.
14172 */
14173 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE) ||
14174 PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
14175 state->dts_cred.dcr_action |= DTRACE_CRA_PROC;
14176 }
14177
14178 /*
14179 * dtrace_user allows use of syscall and profile providers.
14180 * If the user also has proc_owner and/or proc_zone, we
14181 * extend the scope to include additional visibility and
14182 * destructive power.
14183 */
14184 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE)) {
14185 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE)) {
14186 state->dts_cred.dcr_visible |=
14187 DTRACE_CRV_ALLPROC;
14188
14189 state->dts_cred.dcr_action |=
14190 DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
14191 }
14192
14193 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE)) {
14194 state->dts_cred.dcr_visible |=
14195 DTRACE_CRV_ALLZONE;
14196
14197 state->dts_cred.dcr_action |=
14198 DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
14199 }
14200
14201 /*
14202 * If we have all privs in whatever zone this is,
14203 * we can do destructive things to processes which
14204 * have altered credentials.
14205 */
14206#if defined(sun)
14207 if (priv_isequalset(priv_getset(cr, PRIV_EFFECTIVE),
14208 cr->cr_zone->zone_privset)) {
14209 state->dts_cred.dcr_action |=
14210 DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
14211 }
14212#endif
14213 }
14214
14215 /*
14216 * Holding the dtrace_kernel privilege also implies that
14217 * the user has the dtrace_user privilege from a visibility
14218 * perspective. But without further privileges, some
14219 * destructive actions are not available.
14220 */
14221 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE)) {
14222 /*
14223 * Make all probes in all zones visible. However,
14224 * this doesn't mean that all actions become available
14225 * to all zones.
14226 */
14227 state->dts_cred.dcr_visible |= DTRACE_CRV_KERNEL |
14228 DTRACE_CRV_ALLPROC | DTRACE_CRV_ALLZONE;
14229
14230 state->dts_cred.dcr_action |= DTRACE_CRA_KERNEL |
14231 DTRACE_CRA_PROC;
14232 /*
14233 * Holding proc_owner means that destructive actions
14234 * for *this* zone are allowed.
14235 */
14236 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
14237 state->dts_cred.dcr_action |=
14238 DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
14239
14240 /*
14241 * Holding proc_zone means that destructive actions
14243			 * for this user/group ID in all zones are allowed.
14243 */
14244 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
14245 state->dts_cred.dcr_action |=
14246 DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
14247
14248#if defined(sun)
14249 /*
14250 * If we have all privs in whatever zone this is,
14251 * we can do destructive things to processes which
14252 * have altered credentials.
14253 */
14254 if (priv_isequalset(priv_getset(cr, PRIV_EFFECTIVE),
14255 cr->cr_zone->zone_privset)) {
14256 state->dts_cred.dcr_action |=
14257 DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
14258 }
14259#endif
14260 }
14261
14262 /*
14263 * Holding the dtrace_proc privilege gives control over fasttrap
14264 * and pid providers. We need to grant wider destructive
14265 * privileges in the event that the user has proc_owner and/or
14266 * proc_zone.
14267 */
14268 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
14269 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
14270 state->dts_cred.dcr_action |=
14271 DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
14272
14273 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
14274 state->dts_cred.dcr_action |=
14275 DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
14276 }
14277 }
14278
14279 return (state);
14280}
14281
14282static int
14283dtrace_state_buffer(dtrace_state_t *state, dtrace_buffer_t *buf, int which)
14284{
14285 dtrace_optval_t *opt = state->dts_options, size;
14286		processorid_t cpu = 0;
14287 int flags = 0, rval, factor, divisor = 1;
14288
14289 ASSERT(MUTEX_HELD(&dtrace_lock));
14290 ASSERT(MUTEX_HELD(&cpu_lock));
14291 ASSERT(which < DTRACEOPT_MAX);
14292 ASSERT(state->dts_activity == DTRACE_ACTIVITY_INACTIVE ||
14293 (state == dtrace_anon.dta_state &&
14294 state->dts_activity == DTRACE_ACTIVITY_ACTIVE));
14295
14296 if (opt[which] == DTRACEOPT_UNSET || opt[which] == 0)
14297 return (0);
14298
14299 if (opt[DTRACEOPT_CPU] != DTRACEOPT_UNSET)
14300 cpu = opt[DTRACEOPT_CPU];
14301
14302 if (which == DTRACEOPT_SPECSIZE)
14303 flags |= DTRACEBUF_NOSWITCH;
14304
14305 if (which == DTRACEOPT_BUFSIZE) {
14306 if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_RING)
14307 flags |= DTRACEBUF_RING;
14308
14309 if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_FILL)
14310 flags |= DTRACEBUF_FILL;
14311
14312 if (state != dtrace_anon.dta_state ||
14313 state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
14314 flags |= DTRACEBUF_INACTIVE;
14315 }
14316
14317 for (size = opt[which]; size >= sizeof (uint64_t); size /= divisor) {
14318 /*
14319 * The size must be 8-byte aligned. If the size is not 8-byte
14320 * aligned, drop it down by the difference.
14321 */
14322 if (size & (sizeof (uint64_t) - 1))
14323 size -= size & (sizeof (uint64_t) - 1);
14324
14325 if (size < state->dts_reserve) {
14326 /*
14327			 * Buffers must always be large enough to accommodate
14328 * their prereserved space. We return E2BIG instead
14329 * of ENOMEM in this case to allow for user-level
14330 * software to differentiate the cases.
14331 */
14332 return (E2BIG);
14333 }
14334
14335 rval = dtrace_buffer_alloc(buf, size, flags, cpu, &factor);
14336
14337 if (rval != ENOMEM) {
14338 opt[which] = size;
14339 return (rval);
14340 }
14341
14342 if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
14343 return (rval);
14344
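		/*
		 * The allocation failed and we may resize: retry with
		 * the size reduced by the smallest power of two that is
		 * no less than the factor reported by
		 * dtrace_buffer_alloc().
		 */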
14345 for (divisor = 2; divisor < factor; divisor <<= 1)
14346 continue;
14347 }
14348
14349 return (ENOMEM);
14350}
14351
14352static int
14353dtrace_state_buffers(dtrace_state_t *state)
14354{
14355 dtrace_speculation_t *spec = state->dts_speculations;
14356 int rval, i;
14357
14358 if ((rval = dtrace_state_buffer(state, state->dts_buffer,
14359 DTRACEOPT_BUFSIZE)) != 0)
14360 return (rval);
14361
14362 if ((rval = dtrace_state_buffer(state, state->dts_aggbuffer,
14363 DTRACEOPT_AGGSIZE)) != 0)
14364 return (rval);
14365
14366 for (i = 0; i < state->dts_nspeculations; i++) {
14367 if ((rval = dtrace_state_buffer(state,
14368 spec[i].dtsp_buffer, DTRACEOPT_SPECSIZE)) != 0)
14369 return (rval);
14370 }
14371
14372 return (0);
14373}
14374
14375static void
14376dtrace_state_prereserve(dtrace_state_t *state)
14377{
14378 dtrace_ecb_t *ecb;
14379 dtrace_probe_t *probe;
14380
14381 state->dts_reserve = 0;
14382
14383 if (state->dts_options[DTRACEOPT_BUFPOLICY] != DTRACEOPT_BUFPOLICY_FILL)
14384 return;
14385
14386 /*
14387 * If our buffer policy is a "fill" buffer policy, we need to set the
14388 * prereserved space to be the space required by the END probes.
14389 */
14390 probe = dtrace_probes[dtrace_probeid_end - 1];
14391 ASSERT(probe != NULL);
14392
14393 for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
14394 if (ecb->dte_state != state)
14395 continue;
14396
14397 state->dts_reserve += ecb->dte_needed + ecb->dte_alignment;
14398 }
14399}
14400
14401static int
14402dtrace_state_go(dtrace_state_t *state, processorid_t *cpu)
14403{
14404 dtrace_optval_t *opt = state->dts_options, sz, nspec;
14405 dtrace_speculation_t *spec;
14406 dtrace_buffer_t *buf;
14407#if defined(sun)
14408 cyc_handler_t hdlr;
14409 cyc_time_t when;
14410#endif
14411 int rval = 0, i, bufsize = NCPU * sizeof (dtrace_buffer_t);
14412 dtrace_icookie_t cookie;
14413
14414 mutex_enter(&cpu_lock);
14415 mutex_enter(&dtrace_lock);
14416
14417 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
14418 rval = EBUSY;
14419 goto out;
14420 }
14421
14422 /*
14423 * Before we can perform any checks, we must prime all of the
14424 * retained enablings that correspond to this state.
14425 */
14426 dtrace_enabling_prime(state);
14427
14428 if (state->dts_destructive && !state->dts_cred.dcr_destructive) {
14429 rval = EACCES;
14430 goto out;
14431 }
14432
14433 dtrace_state_prereserve(state);
14434
14435 /*
14436	 * What we want to do now is try to allocate our speculations.
14437 * We do not automatically resize the number of speculations; if
14438 * this fails, we will fail the operation.
14439 */
14440 nspec = opt[DTRACEOPT_NSPEC];
14441 ASSERT(nspec != DTRACEOPT_UNSET);
14442
14443 if (nspec > INT_MAX) {
14444 rval = ENOMEM;
14445 goto out;
14446 }
14447
14448 spec = kmem_zalloc(nspec * sizeof (dtrace_speculation_t),
14449 KM_NOSLEEP | KM_NORMALPRI);
14450
14451 if (spec == NULL) {
14452 rval = ENOMEM;
14453 goto out;
14454 }
14455
14456 state->dts_speculations = spec;
14457 state->dts_nspeculations = (int)nspec;
14458
14459 for (i = 0; i < nspec; i++) {
14460 if ((buf = kmem_zalloc(bufsize,
14461 KM_NOSLEEP | KM_NORMALPRI)) == NULL) {
14462 rval = ENOMEM;
14463 goto err;
14464 }
14465
14466 spec[i].dtsp_buffer = buf;
14467 }
14468
14469 if (opt[DTRACEOPT_GRABANON] != DTRACEOPT_UNSET) {
14470 if (dtrace_anon.dta_state == NULL) {
14471 rval = ENOENT;
14472 goto out;
14473 }
14474
14475 if (state->dts_necbs != 0) {
14476 rval = EALREADY;
14477 goto out;
14478 }
14479
14480 state->dts_anon = dtrace_anon_grab();
14481 ASSERT(state->dts_anon != NULL);
14482 state = state->dts_anon;
14483
14484 /*
14485 * We want "grabanon" to be set in the grabbed state, so we'll
14486 * copy that option value from the grabbing state into the
14487 * grabbed state.
14488 */
14489 state->dts_options[DTRACEOPT_GRABANON] =
14490 opt[DTRACEOPT_GRABANON];
14491
14492 *cpu = dtrace_anon.dta_beganon;
14493
14494 /*
14495 * If the anonymous state is active (as it almost certainly
14496 * is if the anonymous enabling ultimately matched anything),
14497 * we don't allow any further option processing -- but we
14498 * don't return failure.
14499 */
14500 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
14501 goto out;
14502 }
14503
14504 if (opt[DTRACEOPT_AGGSIZE] != DTRACEOPT_UNSET &&
14505 opt[DTRACEOPT_AGGSIZE] != 0) {
14506 if (state->dts_aggregations == NULL) {
14507 /*
14508 * We're not going to create an aggregation buffer
14509 * because we don't have any ECBs that contain
14510 * aggregations -- set this option to 0.
14511 */
14512 opt[DTRACEOPT_AGGSIZE] = 0;
14513 } else {
14514 /*
14515 * If we have an aggregation buffer, we must also have
14516 * a buffer to use as scratch.
14517 */
14518 if (opt[DTRACEOPT_BUFSIZE] == DTRACEOPT_UNSET ||
14519 opt[DTRACEOPT_BUFSIZE] < state->dts_needed) {
14520 opt[DTRACEOPT_BUFSIZE] = state->dts_needed;
14521 }
14522 }
14523 }
14524
14525 if (opt[DTRACEOPT_SPECSIZE] != DTRACEOPT_UNSET &&
14526 opt[DTRACEOPT_SPECSIZE] != 0) {
14527 if (!state->dts_speculates) {
14528 /*
14529 * We're not going to create speculation buffers
14530 * because we don't have any ECBs that actually
14531 * speculate -- set the speculation size to 0.
14532 */
14533 opt[DTRACEOPT_SPECSIZE] = 0;
14534 }
14535 }
14536
14537 /*
14538 * The bare minimum size for any buffer that we're actually going to
14539 * do anything to is sizeof (uint64_t).
14540 */
14541 sz = sizeof (uint64_t);
14542
14543 if ((state->dts_needed != 0 && opt[DTRACEOPT_BUFSIZE] < sz) ||
14544 (state->dts_speculates && opt[DTRACEOPT_SPECSIZE] < sz) ||
14545 (state->dts_aggregations != NULL && opt[DTRACEOPT_AGGSIZE] < sz)) {
14546 /*
14547 * A buffer size has been explicitly set to 0 (or to a size
14548 * that will be adjusted to 0) and we need the space -- we
14549 * need to return failure. We return ENOSPC to differentiate
14550 * it from failing to allocate a buffer due to failure to meet
14551 * the reserve (for which we return E2BIG).
14552 */
14553 rval = ENOSPC;
14554 goto out;
14555 }
14556
14557 if ((rval = dtrace_state_buffers(state)) != 0)
14558 goto err;
14559
14560 if ((sz = opt[DTRACEOPT_DYNVARSIZE]) == DTRACEOPT_UNSET)
14561 sz = dtrace_dstate_defsize;
14562
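	/*
	 * Try to initialize the dynamic variable state at the requested
	 * size, halving the size after each failure until we either
	 * succeed or exhaust the possible sizes.  (Under the "manual"
	 * buffer resizing policy we give up after the first failure.)
	 */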
14563 do {
14564 rval = dtrace_dstate_init(&state->dts_vstate.dtvs_dynvars, sz);
14565
14566 if (rval == 0)
14567 break;
14568
14569 if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
14570 goto err;
14571 } while (sz >>= 1);
14572
14573 opt[DTRACEOPT_DYNVARSIZE] = sz;
14574
14575 if (rval != 0)
14576 goto err;
14577
14578 if (opt[DTRACEOPT_STATUSRATE] > dtrace_statusrate_max)
14579 opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_max;
14580
14581 if (opt[DTRACEOPT_CLEANRATE] == 0)
14582 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
14583
14584 if (opt[DTRACEOPT_CLEANRATE] < dtrace_cleanrate_min)
14585 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_min;
14586
14587 if (opt[DTRACEOPT_CLEANRATE] > dtrace_cleanrate_max)
14588 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
14589
14590 state->dts_alive = state->dts_laststatus = dtrace_gethrtime();
14591#if defined(sun)
14592 hdlr.cyh_func = (cyc_func_t)dtrace_state_clean;
14593 hdlr.cyh_arg = state;
14594 hdlr.cyh_level = CY_LOW_LEVEL;
14595
14596 when.cyt_when = 0;
14597 when.cyt_interval = opt[DTRACEOPT_CLEANRATE];
14598
14599 state->dts_cleaner = cyclic_add(&hdlr, &when);
14600
14601 hdlr.cyh_func = (cyc_func_t)dtrace_state_deadman;
14602 hdlr.cyh_arg = state;
14603 hdlr.cyh_level = CY_LOW_LEVEL;
14604
14605 when.cyt_when = 0;
14606 when.cyt_interval = dtrace_deadman_interval;
14607
14608 state->dts_deadman = cyclic_add(&hdlr, &when);
14609#else
14610 callout_reset(&state->dts_cleaner, hz * opt[DTRACEOPT_CLEANRATE] / NANOSEC,
14611 dtrace_state_clean, state);
14612 callout_reset(&state->dts_deadman, hz * dtrace_deadman_interval / NANOSEC,
14613 dtrace_state_deadman, state);
14614#endif
14615
14616 state->dts_activity = DTRACE_ACTIVITY_WARMUP;
14617
14618#if defined(sun)
14619 if (state->dts_getf != 0 &&
14620 !(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)) {
14621 /*
14622 * We don't have kernel privs but we have at least one call
14623 * to getf(); we need to bump our zone's count, and (if
14624 * this is the first enabling to have an unprivileged call
14625 * to getf()) we need to hook into closef().
14626 */
14627 state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf++;
14628
14629 if (dtrace_getf++ == 0) {
14630 ASSERT(dtrace_closef == NULL);
14631 dtrace_closef = dtrace_getf_barrier;
14632 }
14633 }
14634#endif
14635
14636 /*
14637 * Now it's time to actually fire the BEGIN probe. We need to disable
14638 * interrupts here both to record the CPU on which we fired the BEGIN
14639 * probe (the data from this CPU will be processed first at user
14640 * level) and to manually activate the buffer for this CPU.
14641 */
14642 cookie = dtrace_interrupt_disable();
14643 *cpu = curcpu;
14644 ASSERT(state->dts_buffer[*cpu].dtb_flags & DTRACEBUF_INACTIVE);
14645 state->dts_buffer[*cpu].dtb_flags &= ~DTRACEBUF_INACTIVE;
14646
14647 dtrace_probe(dtrace_probeid_begin,
14648 (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
14649 dtrace_interrupt_enable(cookie);
14650 /*
14651 * We may have had an exit action from a BEGIN probe; only change our
14652 * state to ACTIVE if we're still in WARMUP.
14653 */
14654 ASSERT(state->dts_activity == DTRACE_ACTIVITY_WARMUP ||
14655 state->dts_activity == DTRACE_ACTIVITY_DRAINING);
14656
14657 if (state->dts_activity == DTRACE_ACTIVITY_WARMUP)
14658 state->dts_activity = DTRACE_ACTIVITY_ACTIVE;
14659
14660 /*
14661	 * Regardless of whether we're now in ACTIVE or DRAINING, we
14662 * want each CPU to transition its principal buffer out of the
14663 * INACTIVE state. Doing this assures that no CPU will suddenly begin
14664 * processing an ECB halfway down a probe's ECB chain; all CPUs will
14665 * atomically transition from processing none of a state's ECBs to
14666 * processing all of them.
14667 */
14668 dtrace_xcall(DTRACE_CPUALL,
14669 (dtrace_xcall_t)dtrace_buffer_activate, state);
14670 goto out;
14671
14672err:
14673 dtrace_buffer_free(state->dts_buffer);
14674 dtrace_buffer_free(state->dts_aggbuffer);
14675
14676 if ((nspec = state->dts_nspeculations) == 0) {
14677 ASSERT(state->dts_speculations == NULL);
14678 goto out;
14679 }
14680
14681 spec = state->dts_speculations;
14682 ASSERT(spec != NULL);
14683
14684 for (i = 0; i < state->dts_nspeculations; i++) {
14685 if ((buf = spec[i].dtsp_buffer) == NULL)
14686 break;
14687
14688 dtrace_buffer_free(buf);
14689 kmem_free(buf, bufsize);
14690 }
14691
14692 kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
14693 state->dts_nspeculations = 0;
14694 state->dts_speculations = NULL;
14695
14696out:
14697 mutex_exit(&dtrace_lock);
14698 mutex_exit(&cpu_lock);
14699
14700 return (rval);
14701}
14702
14703static int
14704dtrace_state_stop(dtrace_state_t *state, processorid_t *cpu)
14705{
14706 dtrace_icookie_t cookie;
14707
14708 ASSERT(MUTEX_HELD(&dtrace_lock));
14709
14710 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE &&
14711 state->dts_activity != DTRACE_ACTIVITY_DRAINING)
14712 return (EINVAL);
14713
14714 /*
14715 * We'll set the activity to DTRACE_ACTIVITY_DRAINING, and issue a sync
14716 * to be sure that every CPU has seen it. See below for the details
14717 * on why this is done.
14718 */
14719 state->dts_activity = DTRACE_ACTIVITY_DRAINING;
14720 dtrace_sync();
14721
14722 /*
14723 * By this point, it is impossible for any CPU to be still processing
14724 * with DTRACE_ACTIVITY_ACTIVE. We can thus set our activity to
14725 * DTRACE_ACTIVITY_COOLDOWN and know that we're not racing with any
14726 * other CPU in dtrace_buffer_reserve(). This allows dtrace_probe()
14727 * and callees to know that the activity is DTRACE_ACTIVITY_COOLDOWN
14728 * iff we're in the END probe.
14729 */
14730 state->dts_activity = DTRACE_ACTIVITY_COOLDOWN;
14731 dtrace_sync();
14732 ASSERT(state->dts_activity == DTRACE_ACTIVITY_COOLDOWN);
14733
14734 /*
14735 * Finally, we can release the reserve and call the END probe. We
14736 * disable interrupts across calling the END probe to allow us to
14737 * return the CPU on which we actually called the END probe. This
14738 * allows user-land to be sure that this CPU's principal buffer is
14739 * processed last.
14740 */
14741 state->dts_reserve = 0;
14742
14743 cookie = dtrace_interrupt_disable();
14744 *cpu = curcpu;
14745 dtrace_probe(dtrace_probeid_end,
14746 (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
14747 dtrace_interrupt_enable(cookie);
14748
14749 state->dts_activity = DTRACE_ACTIVITY_STOPPED;
14750 dtrace_sync();
14751
14752#if defined(sun)
14753 if (state->dts_getf != 0 &&
14754 !(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)) {
14755 /*
14756 * We don't have kernel privs but we have at least one call
14757 * to getf(); we need to lower our zone's count, and (if
14758 * this is the last enabling to have an unprivileged call
14759 * to getf()) we need to clear the closef() hook.
14760 */
14761 ASSERT(state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf > 0);
14762 ASSERT(dtrace_closef == dtrace_getf_barrier);
14763 ASSERT(dtrace_getf > 0);
14764
14765 state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf--;
14766
14767 if (--dtrace_getf == 0)
14768 dtrace_closef = NULL;
14769 }
14770#endif
14771
13853 return (0);
13854}
13855
13856static int
13857dtrace_state_option(dtrace_state_t *state, dtrace_optid_t option,
13858 dtrace_optval_t val)
13859{
13860 ASSERT(MUTEX_HELD(&dtrace_lock));
13861
13862 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
13863 return (EBUSY);
13864
13865 if (option >= DTRACEOPT_MAX)
13866 return (EINVAL);
13867
13868 if (option != DTRACEOPT_CPU && val < 0)
13869 return (EINVAL);
13870
13871 switch (option) {
13872 case DTRACEOPT_DESTRUCTIVE:
13873 if (dtrace_destructive_disallow)
13874 return (EACCES);
13875
13876 state->dts_cred.dcr_destructive = 1;
13877 break;
13878
13879 case DTRACEOPT_BUFSIZE:
13880 case DTRACEOPT_DYNVARSIZE:
13881 case DTRACEOPT_AGGSIZE:
13882 case DTRACEOPT_SPECSIZE:
13883 case DTRACEOPT_STRSIZE:
13884 if (val < 0)
13885 return (EINVAL);
13886
13887 if (val >= LONG_MAX) {
13888 /*
13889 * If this is an otherwise negative value, set it to
13890 * the highest multiple of 128m less than LONG_MAX.
13891 * Technically, we're adjusting the size without
13892 * regard to the buffer resizing policy, but in fact,
13893 * this has no effect -- if we set the buffer size to
13894 * ~LONG_MAX and the buffer policy is ultimately set to
13895 * be "manual", the buffer allocation is guaranteed to
13896 * fail, if only because the allocation requires two
13897			 * buffers.  (We set the size to the highest
13898 * multiple of 128m because it ensures that the size
13899 * will remain a multiple of a megabyte when
13900 * repeatedly halved -- all the way down to 15m.)
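			 * For instance, with a 32-bit long this works out
			 * to LONG_MAX - (1 << 27) + 1 = 15 * (1 << 27),
			 * i.e. 1920m, which halves through 960m, 480m, ...
			 * all the way down to 15m.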
13901 */
13902 val = LONG_MAX - (1 << 27) + 1;
13903 }
13904 }
13905
13906 state->dts_options[option] = val;
13907
13908 return (0);
13909}
13910
13911static void
13912dtrace_state_destroy(dtrace_state_t *state)
13913{
13914 dtrace_ecb_t *ecb;
13915 dtrace_vstate_t *vstate = &state->dts_vstate;
13916#if defined(sun)
13917 minor_t minor = getminor(state->dts_dev);
13918#endif
13919 int i, bufsize = NCPU * sizeof (dtrace_buffer_t);
13920 dtrace_speculation_t *spec = state->dts_speculations;
13921 int nspec = state->dts_nspeculations;
13922 uint32_t match;
13923
13924 ASSERT(MUTEX_HELD(&dtrace_lock));
13925 ASSERT(MUTEX_HELD(&cpu_lock));
13926
13927 /*
13928 * First, retract any retained enablings for this state.
13929 */
13930 dtrace_enabling_retract(state);
13931 ASSERT(state->dts_nretained == 0);
13932
13933 if (state->dts_activity == DTRACE_ACTIVITY_ACTIVE ||
13934 state->dts_activity == DTRACE_ACTIVITY_DRAINING) {
13935 /*
13936 * We have managed to come into dtrace_state_destroy() on a
13937 * hot enabling -- almost certainly because of a disorderly
13938 * shutdown of a consumer. (That is, a consumer that is
13939 * exiting without having called dtrace_stop().) In this case,
13940 * we're going to set our activity to be KILLED, and then
13941 * issue a sync to be sure that everyone is out of probe
13942 * context before we start blowing away ECBs.
13943 */
13944 state->dts_activity = DTRACE_ACTIVITY_KILLED;
13945 dtrace_sync();
13946 }
13947
13948 /*
13949 * Release the credential hold we took in dtrace_state_create().
13950 */
13951 if (state->dts_cred.dcr_cred != NULL)
13952 crfree(state->dts_cred.dcr_cred);
13953
13954 /*
13955 * Now we can safely disable and destroy any enabled probes. Because
13956 * any DTRACE_PRIV_KERNEL probes may actually be slowing our progress
13957 * (especially if they're all enabled), we take two passes through the
13958 * ECBs: in the first, we disable just DTRACE_PRIV_KERNEL probes, and
13959 * in the second we disable whatever is left over.
13960 */
13961 for (match = DTRACE_PRIV_KERNEL; ; match = 0) {
13962 for (i = 0; i < state->dts_necbs; i++) {
13963 if ((ecb = state->dts_ecbs[i]) == NULL)
13964 continue;
13965
13966 if (match && ecb->dte_probe != NULL) {
13967 dtrace_probe_t *probe = ecb->dte_probe;
13968 dtrace_provider_t *prov = probe->dtpr_provider;
13969
13970 if (!(prov->dtpv_priv.dtpp_flags & match))
13971 continue;
13972 }
13973
13974 dtrace_ecb_disable(ecb);
13975 dtrace_ecb_destroy(ecb);
13976 }
13977
13978 if (!match)
13979 break;
13980 }
13981
13982 /*
13983 * Before we free the buffers, perform one more sync to assure that
13984 * every CPU is out of probe context.
13985 */
13986 dtrace_sync();
13987
13988 dtrace_buffer_free(state->dts_buffer);
13989 dtrace_buffer_free(state->dts_aggbuffer);
13990
13991 for (i = 0; i < nspec; i++)
13992 dtrace_buffer_free(spec[i].dtsp_buffer);
13993
13994#if defined(sun)
13995 if (state->dts_cleaner != CYCLIC_NONE)
13996 cyclic_remove(state->dts_cleaner);
13997
13998 if (state->dts_deadman != CYCLIC_NONE)
13999 cyclic_remove(state->dts_deadman);
14000#else
14001 callout_stop(&state->dts_cleaner);
14002 callout_drain(&state->dts_cleaner);
14003 callout_stop(&state->dts_deadman);
14004 callout_drain(&state->dts_deadman);
14005#endif
14006
14007 dtrace_dstate_fini(&vstate->dtvs_dynvars);
14008 dtrace_vstate_fini(vstate);
14009 if (state->dts_ecbs != NULL)
14010 kmem_free(state->dts_ecbs, state->dts_necbs * sizeof (dtrace_ecb_t *));
14011
14012 if (state->dts_aggregations != NULL) {
14013#ifdef DEBUG
14014 for (i = 0; i < state->dts_naggregations; i++)
14015 ASSERT(state->dts_aggregations[i] == NULL);
14016#endif
14017 ASSERT(state->dts_naggregations > 0);
14018 kmem_free(state->dts_aggregations,
14019 state->dts_naggregations * sizeof (dtrace_aggregation_t *));
14020 }
14021
14022 kmem_free(state->dts_buffer, bufsize);
14023 kmem_free(state->dts_aggbuffer, bufsize);
14024
14025 for (i = 0; i < nspec; i++)
14026 kmem_free(spec[i].dtsp_buffer, bufsize);
14027
14028 if (spec != NULL)
14029 kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
14030
14031 dtrace_format_destroy(state);
14032
14033 if (state->dts_aggid_arena != NULL) {
14034#if defined(sun)
14035 vmem_destroy(state->dts_aggid_arena);
14036#else
14037 delete_unrhdr(state->dts_aggid_arena);
14038#endif
14039 state->dts_aggid_arena = NULL;
14040 }
14041#if defined(sun)
14042 ddi_soft_state_free(dtrace_softstate, minor);
14043 vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
14044#endif
14045}
14046
14047/*
14048 * DTrace Anonymous Enabling Functions
14049 */
14050static dtrace_state_t *
14051dtrace_anon_grab(void)
14052{
14053 dtrace_state_t *state;
14054
14055 ASSERT(MUTEX_HELD(&dtrace_lock));
14056
14057 if ((state = dtrace_anon.dta_state) == NULL) {
14058 ASSERT(dtrace_anon.dta_enabling == NULL);
14059 return (NULL);
14060 }
14061
14062 ASSERT(dtrace_anon.dta_enabling != NULL);
14063 ASSERT(dtrace_retained != NULL);
14064
14065 dtrace_enabling_destroy(dtrace_anon.dta_enabling);
14066 dtrace_anon.dta_enabling = NULL;
14067 dtrace_anon.dta_state = NULL;
14068
14069 return (state);
14070}
14071
14072static void
14073dtrace_anon_property(void)
14074{
14075 int i, rv;
14076 dtrace_state_t *state;
14077 dof_hdr_t *dof;
14078 char c[32]; /* enough for "dof-data-" + digits */
14079
14080 ASSERT(MUTEX_HELD(&dtrace_lock));
14081 ASSERT(MUTEX_HELD(&cpu_lock));
14082
14083 for (i = 0; ; i++) {
14084 (void) snprintf(c, sizeof (c), "dof-data-%d", i);
14085
14086 dtrace_err_verbose = 1;
14087
14088 if ((dof = dtrace_dof_property(c)) == NULL) {
14089 dtrace_err_verbose = 0;
14090 break;
14091 }
14092
14093#if defined(sun)
14094 /*
14095 * We want to create anonymous state, so we need to transition
14096 * the kernel debugger to indicate that DTrace is active. If
14097 * this fails (e.g. because the debugger has modified text in
14098 * some way), we won't continue with the processing.
14099 */
14100 if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
14101 cmn_err(CE_NOTE, "kernel debugger active; anonymous "
14102 "enabling ignored.");
14103 dtrace_dof_destroy(dof);
14104 break;
14105 }
14106#endif
14107
14108 /*
14109 * If we haven't allocated an anonymous state, we'll do so now.
14110 */
14111 if ((state = dtrace_anon.dta_state) == NULL) {
14112#if defined(sun)
14113 state = dtrace_state_create(NULL, NULL);
14114#else
14115 state = dtrace_state_create(NULL);
14116#endif
14117 dtrace_anon.dta_state = state;
14118
14119 if (state == NULL) {
14120 /*
14121 * This basically shouldn't happen: the only
14122 * failure mode from dtrace_state_create() is a
14123 * failure of ddi_soft_state_zalloc() that
14124 * itself should never happen. Still, the
14125 * interface allows for a failure mode, and
14126 * we want to fail as gracefully as possible:
14127 * we'll emit an error message and cease
14128 * processing anonymous state in this case.
14129 */
14130 cmn_err(CE_WARN, "failed to create "
14131 "anonymous state");
14132 dtrace_dof_destroy(dof);
14133 break;
14134 }
14135 }
14136
14137 rv = dtrace_dof_slurp(dof, &state->dts_vstate, CRED(),
14138 &dtrace_anon.dta_enabling, 0, B_TRUE);
14139
14140 if (rv == 0)
14141 rv = dtrace_dof_options(dof, state);
14142
14143 dtrace_err_verbose = 0;
14144 dtrace_dof_destroy(dof);
14145
14146 if (rv != 0) {
14147 /*
14148 * This is malformed DOF; chuck any anonymous state
14149 * that we created.
14150 */
14151 ASSERT(dtrace_anon.dta_enabling == NULL);
14152 dtrace_state_destroy(state);
14153 dtrace_anon.dta_state = NULL;
14154 break;
14155 }
14156
14157 ASSERT(dtrace_anon.dta_enabling != NULL);
14158 }
14159
14160 if (dtrace_anon.dta_enabling != NULL) {
14161 int rval;
14162
14163 /*
14164 * dtrace_enabling_retain() can only fail because we are
14165 * trying to retain more enablings than are allowed -- but
14166 * we only have one anonymous enabling, and we are guaranteed
14167 * to be allowed at least one retained enabling; we assert
14168 * that dtrace_enabling_retain() returns success.
14169 */
14170 rval = dtrace_enabling_retain(dtrace_anon.dta_enabling);
14171 ASSERT(rval == 0);
14172
14173 dtrace_enabling_dump(dtrace_anon.dta_enabling);
14174 }
14175}
14176
14177/*
14178 * DTrace Helper Functions
14179 */
14180static void
14181dtrace_helper_trace(dtrace_helper_action_t *helper,
14182 dtrace_mstate_t *mstate, dtrace_vstate_t *vstate, int where)
14183{
14184 uint32_t size, next, nnext, i;
14185 dtrace_helptrace_t *ent;
14186 uint16_t flags = cpu_core[curcpu].cpuc_dtrace_flags;
14187
14188 if (!dtrace_helptrace_enabled)
14189 return;
14190
14191 ASSERT(vstate->dtvs_nlocals <= dtrace_helptrace_nlocals);
14192
14193 /*
14194 * What would a tracing framework be without its own tracing
14195 * framework? (Well, a hell of a lot simpler, for starters...)
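	 *
	 * The record size computed below is the fixed size of the
	 * dtrace_helptrace_t plus one uint64_t per local variable,
	 * minus one uint64_t because the structure's trailing array
	 * of locals already includes its first element.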
14196 */
14197 size = sizeof (dtrace_helptrace_t) + dtrace_helptrace_nlocals *
14198 sizeof (uint64_t) - sizeof (uint64_t);
14199
14200 /*
14201 * Iterate until we can allocate a slot in the trace buffer.
14202 */
14203 do {
14204 next = dtrace_helptrace_next;
14205
14206 if (next + size < dtrace_helptrace_bufsize) {
14207 nnext = next + size;
14208 } else {
14209 nnext = size;
14210 }
14211 } while (dtrace_cas32(&dtrace_helptrace_next, next, nnext) != next);
14212
14213 /*
14214 * We have our slot; fill it in.
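	 * (If nnext is exactly "size", our allocation wrapped around,
	 * and the record actually begins at offset 0 in the buffer.)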
14215 */
14216 if (nnext == size)
14217 next = 0;
14218
14219 ent = (dtrace_helptrace_t *)&dtrace_helptrace_buffer[next];
14220 ent->dtht_helper = helper;
14221 ent->dtht_where = where;
14222 ent->dtht_nlocals = vstate->dtvs_nlocals;
14223
14224 ent->dtht_fltoffs = (mstate->dtms_present & DTRACE_MSTATE_FLTOFFS) ?
14225 mstate->dtms_fltoffs : -1;
14226 ent->dtht_fault = DTRACE_FLAGS2FLT(flags);
14227 ent->dtht_illval = cpu_core[curcpu].cpuc_dtrace_illval;
14228
14229 for (i = 0; i < vstate->dtvs_nlocals; i++) {
14230 dtrace_statvar_t *svar;
14231
14232 if ((svar = vstate->dtvs_locals[i]) == NULL)
14233 continue;
14234
14235 ASSERT(svar->dtsv_size >= NCPU * sizeof (uint64_t));
14236 ent->dtht_locals[i] =
14237 ((uint64_t *)(uintptr_t)svar->dtsv_data)[curcpu];
14238 }
14239}
14240
14241static uint64_t
14242dtrace_helper(int which, dtrace_mstate_t *mstate,
14243 dtrace_state_t *state, uint64_t arg0, uint64_t arg1)
14244{
14245 uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags;
14246 uint64_t sarg0 = mstate->dtms_arg[0];
14247 uint64_t sarg1 = mstate->dtms_arg[1];
14248 uint64_t rval = 0;
14249 dtrace_helpers_t *helpers = curproc->p_dtrace_helpers;
14250 dtrace_helper_action_t *helper;
14251 dtrace_vstate_t *vstate;
14252 dtrace_difo_t *pred;
14253 int i, trace = dtrace_helptrace_enabled;
14254
14255 ASSERT(which >= 0 && which < DTRACE_NHELPER_ACTIONS);
14256
14257 if (helpers == NULL)
14258 return (0);
14259
14260 if ((helper = helpers->dthps_actions[which]) == NULL)
14261 return (0);
14262
14263 vstate = &helpers->dthps_vstate;
14264 mstate->dtms_arg[0] = arg0;
14265 mstate->dtms_arg[1] = arg1;
14266
14267 /*
14268 * Now iterate over each helper. If its predicate evaluates to 'true',
14269 * we'll call the corresponding actions. Note that the below calls
14270 * to dtrace_dif_emulate() may set faults in machine state. This is
14271 * okay: our caller (the outer dtrace_dif_emulate()) will simply plow
14272 * the stored DIF offset with its own (which is the desired behavior).
14273 * Also, note the calls to dtrace_dif_emulate() may allocate scratch
14274 * from machine state; this is okay, too.
14275 */
14276 for (; helper != NULL; helper = helper->dtha_next) {
14277 if ((pred = helper->dtha_predicate) != NULL) {
14278 if (trace)
14279 dtrace_helper_trace(helper, mstate, vstate, 0);
14280
14281 if (!dtrace_dif_emulate(pred, mstate, vstate, state))
14282 goto next;
14283
14284 if (*flags & CPU_DTRACE_FAULT)
14285 goto err;
14286 }
14287
14288 for (i = 0; i < helper->dtha_nactions; i++) {
14289 if (trace)
14290 dtrace_helper_trace(helper,
14291 mstate, vstate, i + 1);
14292
14293 rval = dtrace_dif_emulate(helper->dtha_actions[i],
14294 mstate, vstate, state);
14295
14296 if (*flags & CPU_DTRACE_FAULT)
14297 goto err;
14298 }
14299
14300next:
14301 if (trace)
14302 dtrace_helper_trace(helper, mstate, vstate,
14303 DTRACE_HELPTRACE_NEXT);
14304 }
14305
14306 if (trace)
14307 dtrace_helper_trace(helper, mstate, vstate,
14308 DTRACE_HELPTRACE_DONE);
14309
14310 /*
14311 * Restore the arg0 that we saved upon entry.
14312 */
14313 mstate->dtms_arg[0] = sarg0;
14314 mstate->dtms_arg[1] = sarg1;
14315
14316 return (rval);
14317
14318err:
14319 if (trace)
14320 dtrace_helper_trace(helper, mstate, vstate,
14321 DTRACE_HELPTRACE_ERR);
14322
14323 /*
14324 * Restore the arg0 that we saved upon entry.
14325 */
14326 mstate->dtms_arg[0] = sarg0;
14327 mstate->dtms_arg[1] = sarg1;
14328
14329 return (0);
14330}
14331
14332static void
14333dtrace_helper_action_destroy(dtrace_helper_action_t *helper,
14334 dtrace_vstate_t *vstate)
14335{
14336 int i;
14337
14338 if (helper->dtha_predicate != NULL)
14339 dtrace_difo_release(helper->dtha_predicate, vstate);
14340
14341 for (i = 0; i < helper->dtha_nactions; i++) {
14342 ASSERT(helper->dtha_actions[i] != NULL);
14343 dtrace_difo_release(helper->dtha_actions[i], vstate);
14344 }
14345
14346 kmem_free(helper->dtha_actions,
14347 helper->dtha_nactions * sizeof (dtrace_difo_t *));
14348 kmem_free(helper, sizeof (dtrace_helper_action_t));
14349}
14350
14351static int
14352dtrace_helper_destroygen(int gen)
14353{
14354 proc_t *p = curproc;
14355 dtrace_helpers_t *help = p->p_dtrace_helpers;
14356 dtrace_vstate_t *vstate;
14357 int i;
14358
14359 ASSERT(MUTEX_HELD(&dtrace_lock));
14360
14361 if (help == NULL || gen > help->dthps_generation)
14362 return (EINVAL);
14363
14364 vstate = &help->dthps_vstate;
14365
14366 for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
14367 dtrace_helper_action_t *last = NULL, *h, *next;
14368
14369 for (h = help->dthps_actions[i]; h != NULL; h = next) {
14370 next = h->dtha_next;
14371
14372 if (h->dtha_generation == gen) {
14373 if (last != NULL) {
14374 last->dtha_next = next;
14375 } else {
14376 help->dthps_actions[i] = next;
14377 }
14378
14379 dtrace_helper_action_destroy(h, vstate);
14380 } else {
14381 last = h;
14382 }
14383 }
14384 }
14385
14386 /*
14387	 * Iterate until we've cleared out all helper providers with the
14388 * given generation number.
14389 */
14390 for (;;) {
14391 dtrace_helper_provider_t *prov;
14392
14393 /*
14394 * Look for a helper provider with the right generation. We
14395 * have to start back at the beginning of the list each time
14396 * because we drop dtrace_lock. It's unlikely that we'll make
14397 * more than two passes.
14398 */
14399 for (i = 0; i < help->dthps_nprovs; i++) {
14400 prov = help->dthps_provs[i];
14401
14402 if (prov->dthp_generation == gen)
14403 break;
14404 }
14405
14406 /*
14407 * If there were no matches, we're done.
14408 */
14409 if (i == help->dthps_nprovs)
14410 break;
14411
14412 /*
14413 * Move the last helper provider into this slot.
14414 */
14415 help->dthps_nprovs--;
14416 help->dthps_provs[i] = help->dthps_provs[help->dthps_nprovs];
14417 help->dthps_provs[help->dthps_nprovs] = NULL;
14418
14419 mutex_exit(&dtrace_lock);
14420
14421 /*
14422 * If we have a meta provider, remove this helper provider.
14423 */
14424 mutex_enter(&dtrace_meta_lock);
14425 if (dtrace_meta_pid != NULL) {
14426 ASSERT(dtrace_deferred_pid == NULL);
14427 dtrace_helper_provider_remove(&prov->dthp_prov,
14428 p->p_pid);
14429 }
14430 mutex_exit(&dtrace_meta_lock);
14431
14432 dtrace_helper_provider_destroy(prov);
14433
14434 mutex_enter(&dtrace_lock);
14435 }
14436
14437 return (0);
14438}
14439
14440static int
14441dtrace_helper_validate(dtrace_helper_action_t *helper)
14442{
14443 int err = 0, i;
14444 dtrace_difo_t *dp;
14445
14446 if ((dp = helper->dtha_predicate) != NULL)
14447 err += dtrace_difo_validate_helper(dp);
14448
14449 for (i = 0; i < helper->dtha_nactions; i++)
14450 err += dtrace_difo_validate_helper(helper->dtha_actions[i]);
14451
14452 return (err == 0);
14453}
14454
14455static int
14456dtrace_helper_action_add(int which, dtrace_ecbdesc_t *ep)
14457{
14458 dtrace_helpers_t *help;
14459 dtrace_helper_action_t *helper, *last;
14460 dtrace_actdesc_t *act;
14461 dtrace_vstate_t *vstate;
14462 dtrace_predicate_t *pred;
14463 int count = 0, nactions = 0, i;
14464
14465 if (which < 0 || which >= DTRACE_NHELPER_ACTIONS)
14466 return (EINVAL);
14467
14468 help = curproc->p_dtrace_helpers;
14469 last = help->dthps_actions[which];
14470 vstate = &help->dthps_vstate;
14471
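	/*
	 * Walk to the tail of the list of helpers for this action
	 * type, counting entries as we go so that we can enforce the
	 * per-type limit below; "last" is left pointing at the tail
	 * so that the new helper can be appended to it.
	 */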
14472 for (count = 0; last != NULL; last = last->dtha_next) {
14473 count++;
14474 if (last->dtha_next == NULL)
14475 break;
14476 }
14477
14478 /*
14479 * If we already have dtrace_helper_actions_max helper actions for this
14480 * helper action type, we'll refuse to add a new one.
14481 */
14482 if (count >= dtrace_helper_actions_max)
14483 return (ENOSPC);
14484
14485 helper = kmem_zalloc(sizeof (dtrace_helper_action_t), KM_SLEEP);
14486 helper->dtha_generation = help->dthps_generation;
14487
14488 if ((pred = ep->dted_pred.dtpdd_predicate) != NULL) {
14489 ASSERT(pred->dtp_difo != NULL);
14490 dtrace_difo_hold(pred->dtp_difo);
14491 helper->dtha_predicate = pred->dtp_difo;
14492 }
14493
14494 for (act = ep->dted_action; act != NULL; act = act->dtad_next) {
14495 if (act->dtad_kind != DTRACEACT_DIFEXPR)
14496 goto err;
14497
14498 if (act->dtad_difo == NULL)
14499 goto err;
14500
14501 nactions++;
14502 }
14503
14504 helper->dtha_actions = kmem_zalloc(sizeof (dtrace_difo_t *) *
14505 (helper->dtha_nactions = nactions), KM_SLEEP);
14506
14507 for (act = ep->dted_action, i = 0; act != NULL; act = act->dtad_next) {
14508 dtrace_difo_hold(act->dtad_difo);
14509 helper->dtha_actions[i++] = act->dtad_difo;
14510 }
14511
14512 if (!dtrace_helper_validate(helper))
14513 goto err;
14514
14515 if (last == NULL) {
14516 help->dthps_actions[which] = helper;
14517 } else {
14518 last->dtha_next = helper;
14519 }
14520
14521 if (vstate->dtvs_nlocals > dtrace_helptrace_nlocals) {
14522 dtrace_helptrace_nlocals = vstate->dtvs_nlocals;
14523 dtrace_helptrace_next = 0;
14524 }
14525
14526 return (0);
14527err:
14528 dtrace_helper_action_destroy(helper, vstate);
14529 return (EINVAL);
14530}
14531
14532static void
14533dtrace_helper_provider_register(proc_t *p, dtrace_helpers_t *help,
14534 dof_helper_t *dofhp)
14535{
14536 ASSERT(MUTEX_NOT_HELD(&dtrace_lock));
14537
14538 mutex_enter(&dtrace_meta_lock);
14539 mutex_enter(&dtrace_lock);
14540
14541 if (!dtrace_attached() || dtrace_meta_pid == NULL) {
14542 /*
14543 * If the dtrace module is loaded but not attached, or if
14544		 * there isn't a meta provider registered to deal with
14545 * these provider descriptions, we need to postpone creating
14546 * the actual providers until later.
14547 */
14548
14549 if (help->dthps_next == NULL && help->dthps_prev == NULL &&
14550 dtrace_deferred_pid != help) {
14551 help->dthps_deferred = 1;
14552 help->dthps_pid = p->p_pid;
14553 help->dthps_next = dtrace_deferred_pid;
14554 help->dthps_prev = NULL;
14555 if (dtrace_deferred_pid != NULL)
14556 dtrace_deferred_pid->dthps_prev = help;
14557 dtrace_deferred_pid = help;
14558 }
14559
14560 mutex_exit(&dtrace_lock);
14561
14562 } else if (dofhp != NULL) {
14563 /*
14564 * If the dtrace module is loaded and we have a particular
14565 * helper provider description, pass that off to the
14566 * meta provider.
14567 */
14568
14569 mutex_exit(&dtrace_lock);
14570
14571 dtrace_helper_provide(dofhp, p->p_pid);
14572
14573 } else {
14574 /*
14575 * Otherwise, just pass all the helper provider descriptions
14576 * off to the meta provider.
14577 */
14578
14579 int i;
14580 mutex_exit(&dtrace_lock);
14581
14582 for (i = 0; i < help->dthps_nprovs; i++) {
14583 dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
14584 p->p_pid);
14585 }
14586 }
14587
14588 mutex_exit(&dtrace_meta_lock);
14589}
14590
14591static int
14592dtrace_helper_provider_add(dof_helper_t *dofhp, int gen)
14593{
14594 dtrace_helpers_t *help;
14595 dtrace_helper_provider_t *hprov, **tmp_provs;
14596 uint_t tmp_maxprovs, i;
14597
14598 ASSERT(MUTEX_HELD(&dtrace_lock));
14599
14600 help = curproc->p_dtrace_helpers;
14601 ASSERT(help != NULL);
14602
14603 /*
14604 * If we already have dtrace_helper_providers_max helper providers,
14605	 * we refuse to add a new one.
14606 */
14607 if (help->dthps_nprovs >= dtrace_helper_providers_max)
14608 return (ENOSPC);
14609
14610 /*
14611 * Check to make sure this isn't a duplicate.
14612 */
14613 for (i = 0; i < help->dthps_nprovs; i++) {
14614 if (dofhp->dofhp_dof ==
14615 help->dthps_provs[i]->dthp_prov.dofhp_dof)
14616 return (EALREADY);
14617 }
14618
14619 hprov = kmem_zalloc(sizeof (dtrace_helper_provider_t), KM_SLEEP);
14620 hprov->dthp_prov = *dofhp;
14621 hprov->dthp_ref = 1;
14622 hprov->dthp_generation = gen;
14623
14624 /*
14625 * Allocate a bigger table for helper providers if it's already full.
14626 */
14627 if (help->dthps_maxprovs == help->dthps_nprovs) {
14628 tmp_maxprovs = help->dthps_maxprovs;
14629 tmp_provs = help->dthps_provs;
14630
14631 if (help->dthps_maxprovs == 0)
14632 help->dthps_maxprovs = 2;
14633 else
14634 help->dthps_maxprovs *= 2;
14635 if (help->dthps_maxprovs > dtrace_helper_providers_max)
14636 help->dthps_maxprovs = dtrace_helper_providers_max;
14637
14638 ASSERT(tmp_maxprovs < help->dthps_maxprovs);
14639
14640 help->dthps_provs = kmem_zalloc(help->dthps_maxprovs *
14641 sizeof (dtrace_helper_provider_t *), KM_SLEEP);
14642
14643 if (tmp_provs != NULL) {
14644 bcopy(tmp_provs, help->dthps_provs, tmp_maxprovs *
14645 sizeof (dtrace_helper_provider_t *));
14646 kmem_free(tmp_provs, tmp_maxprovs *
14647 sizeof (dtrace_helper_provider_t *));
14648 }
14649 }
14650
14651 help->dthps_provs[help->dthps_nprovs] = hprov;
14652 help->dthps_nprovs++;
14653
14654 return (0);
14655}
14656
14657static void
14658dtrace_helper_provider_destroy(dtrace_helper_provider_t *hprov)
14659{
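	/*
	 * Drop a reference on the helper provider under dtrace_lock.
	 * When the last reference is released, free the underlying
	 * DOF (whose address is stashed in dofhp_dof) along with the
	 * provider structure itself.
	 */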
14660 mutex_enter(&dtrace_lock);
14661
14662 if (--hprov->dthp_ref == 0) {
14663 dof_hdr_t *dof;
14664 mutex_exit(&dtrace_lock);
14665 dof = (dof_hdr_t *)(uintptr_t)hprov->dthp_prov.dofhp_dof;
14666 dtrace_dof_destroy(dof);
14667 kmem_free(hprov, sizeof (dtrace_helper_provider_t));
14668 } else {
14669 mutex_exit(&dtrace_lock);
14670 }
14671}
14672
14673static int
14674dtrace_helper_provider_validate(dof_hdr_t *dof, dof_sec_t *sec)
14675{
14676 uintptr_t daddr = (uintptr_t)dof;
14677 dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
14678 dof_provider_t *provider;
14679 dof_probe_t *probe;
14680 uint8_t *arg;
14681 char *strtab, *typestr;
14682 dof_stridx_t typeidx;
14683 size_t typesz;
14684 uint_t nprobes, j, k;
14685
14686 ASSERT(sec->dofs_type == DOF_SECT_PROVIDER);
14687
14688 if (sec->dofs_offset & (sizeof (uint_t) - 1)) {
14689 dtrace_dof_error(dof, "misaligned section offset");
14690 return (-1);
14691 }
14692
14693 /*
14694 * The section needs to be large enough to contain the DOF provider
14695 * structure appropriate for the given version.
14696 */
14697 if (sec->dofs_size <
14698 ((dof->dofh_ident[DOF_ID_VERSION] == DOF_VERSION_1) ?
14699 offsetof(dof_provider_t, dofpv_prenoffs) :
14700 sizeof (dof_provider_t))) {
14701 dtrace_dof_error(dof, "provider section too small");
14702 return (-1);
14703 }
14704
14705 provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
14706 str_sec = dtrace_dof_sect(dof, DOF_SECT_STRTAB, provider->dofpv_strtab);
14707 prb_sec = dtrace_dof_sect(dof, DOF_SECT_PROBES, provider->dofpv_probes);
14708 arg_sec = dtrace_dof_sect(dof, DOF_SECT_PRARGS, provider->dofpv_prargs);
14709 off_sec = dtrace_dof_sect(dof, DOF_SECT_PROFFS, provider->dofpv_proffs);
14710
14711 if (str_sec == NULL || prb_sec == NULL ||
14712 arg_sec == NULL || off_sec == NULL)
14713 return (-1);
14714
14715 enoff_sec = NULL;
14716
14717 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
14718 provider->dofpv_prenoffs != DOF_SECT_NONE &&
14719 (enoff_sec = dtrace_dof_sect(dof, DOF_SECT_PRENOFFS,
14720 provider->dofpv_prenoffs)) == NULL)
14721 return (-1);
14722
14723 strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
14724
14725 if (provider->dofpv_name >= str_sec->dofs_size ||
14726 strlen(strtab + provider->dofpv_name) >= DTRACE_PROVNAMELEN) {
14727 dtrace_dof_error(dof, "invalid provider name");
14728 return (-1);
14729 }
14730
14731 if (prb_sec->dofs_entsize == 0 ||
14732 prb_sec->dofs_entsize > prb_sec->dofs_size) {
14733 dtrace_dof_error(dof, "invalid entry size");
14734 return (-1);
14735 }
14736
14737 if (prb_sec->dofs_entsize & (sizeof (uintptr_t) - 1)) {
14738 dtrace_dof_error(dof, "misaligned entry size");
14739 return (-1);
14740 }
14741
14742 if (off_sec->dofs_entsize != sizeof (uint32_t)) {
14743 dtrace_dof_error(dof, "invalid entry size");
14744 return (-1);
14745 }
14746
14747 if (off_sec->dofs_offset & (sizeof (uint32_t) - 1)) {
14748 dtrace_dof_error(dof, "misaligned section offset");
14749 return (-1);
14750 }
14751
14752 if (arg_sec->dofs_entsize != sizeof (uint8_t)) {
14753 dtrace_dof_error(dof, "invalid entry size");
14754 return (-1);
14755 }
14756
14757 arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
14758
14759 nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
14760
14761 /*
14762 * Take a pass through the probes to check for errors.
14763 */
14764 for (j = 0; j < nprobes; j++) {
14765 probe = (dof_probe_t *)(uintptr_t)(daddr +
14766 prb_sec->dofs_offset + j * prb_sec->dofs_entsize);
14767
14768 if (probe->dofpr_func >= str_sec->dofs_size) {
14769 dtrace_dof_error(dof, "invalid function name");
14770 return (-1);
14771 }
14772
14773 if (strlen(strtab + probe->dofpr_func) >= DTRACE_FUNCNAMELEN) {
14774 dtrace_dof_error(dof, "function name too long");
14775 return (-1);
14776 }
14777
14778 if (probe->dofpr_name >= str_sec->dofs_size ||
14779 strlen(strtab + probe->dofpr_name) >= DTRACE_NAMELEN) {
14780 dtrace_dof_error(dof, "invalid probe name");
14781 return (-1);
14782 }
14783
14784 /*
14785 * The offset count must not wrap the index, and the offsets
14786 * must also not overflow the section's data.
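		 * (As these are unsigned quantities, "x + y < x" is
		 * true precisely when the addition wrapped around.)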
14787 */
14788 if (probe->dofpr_offidx + probe->dofpr_noffs <
14789 probe->dofpr_offidx ||
14790 (probe->dofpr_offidx + probe->dofpr_noffs) *
14791 off_sec->dofs_entsize > off_sec->dofs_size) {
14792 dtrace_dof_error(dof, "invalid probe offset");
14793 return (-1);
14794 }
14795
14796 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1) {
14797 /*
14798 * If there's no is-enabled offset section, make sure
14799 * there aren't any is-enabled offsets. Otherwise
14800 * perform the same checks as for probe offsets
14801 * (immediately above).
14802 */
14803 if (enoff_sec == NULL) {
14804 if (probe->dofpr_enoffidx != 0 ||
14805 probe->dofpr_nenoffs != 0) {
14806 dtrace_dof_error(dof, "is-enabled "
14807 "offsets with null section");
14808 return (-1);
14809 }
14810 } else if (probe->dofpr_enoffidx +
14811 probe->dofpr_nenoffs < probe->dofpr_enoffidx ||
14812 (probe->dofpr_enoffidx + probe->dofpr_nenoffs) *
14813 enoff_sec->dofs_entsize > enoff_sec->dofs_size) {
14814 dtrace_dof_error(dof, "invalid is-enabled "
14815 "offset");
14816 return (-1);
14817 }
14818
14819 if (probe->dofpr_noffs + probe->dofpr_nenoffs == 0) {
14820 dtrace_dof_error(dof, "zero probe and "
14821 "is-enabled offsets");
14822 return (-1);
14823 }
14824 } else if (probe->dofpr_noffs == 0) {
14825 dtrace_dof_error(dof, "zero probe offsets");
14826 return (-1);
14827 }
14828
14829 if (probe->dofpr_argidx + probe->dofpr_xargc <
14830 probe->dofpr_argidx ||
14831 (probe->dofpr_argidx + probe->dofpr_xargc) *
14832 arg_sec->dofs_entsize > arg_sec->dofs_size) {
14833 dtrace_dof_error(dof, "invalid args");
14834 return (-1);
14835 }
14836
14837 typeidx = probe->dofpr_nargv;
14838 typestr = strtab + probe->dofpr_nargv;
14839 for (k = 0; k < probe->dofpr_nargc; k++) {
14840 if (typeidx >= str_sec->dofs_size) {
14841 dtrace_dof_error(dof, "bad "
14842 "native argument type");
14843 return (-1);
14844 }
14845
14846 typesz = strlen(typestr) + 1;
14847 if (typesz > DTRACE_ARGTYPELEN) {
14848 dtrace_dof_error(dof, "native "
14849 "argument type too long");
14850 return (-1);
14851 }
14852 typeidx += typesz;
14853 typestr += typesz;
14854 }
14855
14856 typeidx = probe->dofpr_xargv;
14857 typestr = strtab + probe->dofpr_xargv;
14858 for (k = 0; k < probe->dofpr_xargc; k++) {
14859 if (arg[probe->dofpr_argidx + k] > probe->dofpr_nargc) {
14860 dtrace_dof_error(dof, "bad "
14861 "native argument index");
14862 return (-1);
14863 }
14864
14865 if (typeidx >= str_sec->dofs_size) {
14866 dtrace_dof_error(dof, "bad "
14867 "translated argument type");
14868 return (-1);
14869 }
14870
14871 typesz = strlen(typestr) + 1;
14872 if (typesz > DTRACE_ARGTYPELEN) {
14873 dtrace_dof_error(dof, "translated argument "
14874 "type too long");
14875 return (-1);
14876 }
14877
14878 typeidx += typesz;
14879 typestr += typesz;
14880 }
14881 }
14882
14883 return (0);
14884}
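/*
 * Editor's note: a minimal sketch of the overflow-safe bounds check used
 * throughout dtrace_helper_provider_validate() above.  An unsigned sum
 * that wraps satisfies "idx + cnt < idx", so that test is made before the
 * scaled comparison against the section size.  Names here are
 * hypothetical, and the multiplication is assumed not to overflow for the
 * entry sizes in play, as in the original.
 */
#include <stddef.h>
#include <stdint.h>

static int
range_fits(uint32_t idx, uint32_t cnt, size_t entsize, size_t secsize)
{
	if (idx + cnt < idx)			/* unsigned sum wrapped */
		return (0);
	return ((size_t)(idx + cnt) * entsize <= secsize);
}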
14885
14886static int
14887dtrace_helper_slurp(dof_hdr_t *dof, dof_helper_t *dhp)
14888{
14889 dtrace_helpers_t *help;
14890 dtrace_vstate_t *vstate;
14891 dtrace_enabling_t *enab = NULL;
14892 int i, gen, rv, nhelpers = 0, nprovs = 0, destroy = 1;
14893 uintptr_t daddr = (uintptr_t)dof;
14894
14895 ASSERT(MUTEX_HELD(&dtrace_lock));
14896
14897 if ((help = curproc->p_dtrace_helpers) == NULL)
14898 help = dtrace_helpers_create(curproc);
14899
14900 vstate = &help->dthps_vstate;
14901
14902 if ((rv = dtrace_dof_slurp(dof, vstate, NULL, &enab,
14903 dhp != NULL ? dhp->dofhp_addr : 0, B_FALSE)) != 0) {
14904 dtrace_dof_destroy(dof);
14905 return (rv);
14906 }
14907
14908 /*
14909 * Look for helper providers and validate their descriptions.
14910 */
14911 if (dhp != NULL) {
14912 for (i = 0; i < dof->dofh_secnum; i++) {
14913 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
14914 dof->dofh_secoff + i * dof->dofh_secsize);
14915
14916 if (sec->dofs_type != DOF_SECT_PROVIDER)
14917 continue;
14918
14919 if (dtrace_helper_provider_validate(dof, sec) != 0) {
14920 dtrace_enabling_destroy(enab);
14921 dtrace_dof_destroy(dof);
14922 return (-1);
14923 }
14924
14925 nprovs++;
14926 }
14927 }
14928
14929 /*
14930 * Now we need to walk through the ECB descriptions in the enabling.
14931 */
14932 for (i = 0; i < enab->dten_ndesc; i++) {
14933 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
14934 dtrace_probedesc_t *desc = &ep->dted_probe;
14935
14936 if (strcmp(desc->dtpd_provider, "dtrace") != 0)
14937 continue;
14938
14939 if (strcmp(desc->dtpd_mod, "helper") != 0)
14940 continue;
14941
14942 if (strcmp(desc->dtpd_func, "ustack") != 0)
14943 continue;
14944
14945 if ((rv = dtrace_helper_action_add(DTRACE_HELPER_ACTION_USTACK,
14946 ep)) != 0) {
14947 /*
14948 * Adding this helper action failed -- we are now going
14949 * to rip out the entire generation and return failure.
14950 */
14951 (void) dtrace_helper_destroygen(help->dthps_generation);
14952 dtrace_enabling_destroy(enab);
14953 dtrace_dof_destroy(dof);
14954 return (-1);
14955 }
14956
14957 nhelpers++;
14958 }
14959
14960 if (nhelpers < enab->dten_ndesc)
14961 dtrace_dof_error(dof, "unmatched helpers");
14962
14963 gen = help->dthps_generation++;
14964 dtrace_enabling_destroy(enab);
14965
14966 if (dhp != NULL && nprovs > 0) {
14967 dhp->dofhp_dof = (uint64_t)(uintptr_t)dof;
14968 if (dtrace_helper_provider_add(dhp, gen) == 0) {
14969 mutex_exit(&dtrace_lock);
14970 dtrace_helper_provider_register(curproc, help, dhp);
14971 mutex_enter(&dtrace_lock);
14972
14973 destroy = 0;
14974 }
14975 }
14976
14977 if (destroy)
14978 dtrace_dof_destroy(dof);
14979
14980 return (gen);
14981}
14982
14983static dtrace_helpers_t *
14984dtrace_helpers_create(proc_t *p)
14985{
14986 dtrace_helpers_t *help;
14987
14988 ASSERT(MUTEX_HELD(&dtrace_lock));
14989 ASSERT(p->p_dtrace_helpers == NULL);
14990
14991 help = kmem_zalloc(sizeof (dtrace_helpers_t), KM_SLEEP);
14992 help->dthps_actions = kmem_zalloc(sizeof (dtrace_helper_action_t *) *
14993 DTRACE_NHELPER_ACTIONS, KM_SLEEP);
14994
14995 p->p_dtrace_helpers = help;
14996 dtrace_helpers++;
14997
14998 return (help);
14999}
15000
15001#if defined(sun)
15002static
15003#endif
15004void
15005dtrace_helpers_destroy(proc_t *p)
15006{
15007 dtrace_helpers_t *help;
15008 dtrace_vstate_t *vstate;
15009#if defined(sun)
15010 proc_t *p = curproc;
15011#endif
15012 int i;
15013
15014 mutex_enter(&dtrace_lock);
15015
15016 ASSERT(p->p_dtrace_helpers != NULL);
15017 ASSERT(dtrace_helpers > 0);
15018
15019 help = p->p_dtrace_helpers;
15020 vstate = &help->dthps_vstate;
15021
15022 /*
15023 * We're now going to lose the help from this process.
15024 */
15025 p->p_dtrace_helpers = NULL;
15026 dtrace_sync();
15027
15028 /*
15029	 * Destroy the helper actions.
15030 */
15031 for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
15032 dtrace_helper_action_t *h, *next;
15033
15034 for (h = help->dthps_actions[i]; h != NULL; h = next) {
15035 next = h->dtha_next;
15036 dtrace_helper_action_destroy(h, vstate);
15037 h = next;
15038 }
15039 }
15040
15041 mutex_exit(&dtrace_lock);
15042
15043 /*
15044 * Destroy the helper providers.
15045 */
15046 if (help->dthps_maxprovs > 0) {
15047 mutex_enter(&dtrace_meta_lock);
15048 if (dtrace_meta_pid != NULL) {
15049 ASSERT(dtrace_deferred_pid == NULL);
15050
15051 for (i = 0; i < help->dthps_nprovs; i++) {
15052 dtrace_helper_provider_remove(
15053 &help->dthps_provs[i]->dthp_prov, p->p_pid);
15054 }
15055 } else {
15056 mutex_enter(&dtrace_lock);
15057 ASSERT(help->dthps_deferred == 0 ||
15058 help->dthps_next != NULL ||
15059 help->dthps_prev != NULL ||
15060 help == dtrace_deferred_pid);
15061
15062 /*
15063 * Remove the helper from the deferred list.
15064 */
15065 if (help->dthps_next != NULL)
15066 help->dthps_next->dthps_prev = help->dthps_prev;
15067 if (help->dthps_prev != NULL)
15068 help->dthps_prev->dthps_next = help->dthps_next;
15069 if (dtrace_deferred_pid == help) {
15070 dtrace_deferred_pid = help->dthps_next;
15071 ASSERT(help->dthps_prev == NULL);
15072 }
15073
15074 mutex_exit(&dtrace_lock);
15075 }
15076
15077 mutex_exit(&dtrace_meta_lock);
15078
15079 for (i = 0; i < help->dthps_nprovs; i++) {
15080 dtrace_helper_provider_destroy(help->dthps_provs[i]);
15081 }
15082
15083 kmem_free(help->dthps_provs, help->dthps_maxprovs *
15084 sizeof (dtrace_helper_provider_t *));
15085 }
15086
15087 mutex_enter(&dtrace_lock);
15088
15089 dtrace_vstate_fini(&help->dthps_vstate);
15090 kmem_free(help->dthps_actions,
15091 sizeof (dtrace_helper_action_t *) * DTRACE_NHELPER_ACTIONS);
15092 kmem_free(help, sizeof (dtrace_helpers_t));
15093
15094 --dtrace_helpers;
15095 mutex_exit(&dtrace_lock);
15096}
15097
15098#if defined(sun)
15099static
15100#endif
15101void
15102dtrace_helpers_duplicate(proc_t *from, proc_t *to)
15103{
15104 dtrace_helpers_t *help, *newhelp;
15105 dtrace_helper_action_t *helper, *new, *last;
15106 dtrace_difo_t *dp;
15107 dtrace_vstate_t *vstate;
15108 int i, j, sz, hasprovs = 0;
15109
15110 mutex_enter(&dtrace_lock);
15111 ASSERT(from->p_dtrace_helpers != NULL);
15112 ASSERT(dtrace_helpers > 0);
15113
15114 help = from->p_dtrace_helpers;
15115 newhelp = dtrace_helpers_create(to);
15116 ASSERT(to->p_dtrace_helpers != NULL);
15117
15118 newhelp->dthps_generation = help->dthps_generation;
15119 vstate = &newhelp->dthps_vstate;
15120
15121 /*
15122 * Duplicate the helper actions.
15123 */
15124 for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
15125 if ((helper = help->dthps_actions[i]) == NULL)
15126 continue;
15127
15128 for (last = NULL; helper != NULL; helper = helper->dtha_next) {
15129 new = kmem_zalloc(sizeof (dtrace_helper_action_t),
15130 KM_SLEEP);
15131 new->dtha_generation = helper->dtha_generation;
15132
15133 if ((dp = helper->dtha_predicate) != NULL) {
15134 dp = dtrace_difo_duplicate(dp, vstate);
15135 new->dtha_predicate = dp;
15136 }
15137
15138 new->dtha_nactions = helper->dtha_nactions;
15139 sz = sizeof (dtrace_difo_t *) * new->dtha_nactions;
15140 new->dtha_actions = kmem_alloc(sz, KM_SLEEP);
15141
15142 for (j = 0; j < new->dtha_nactions; j++) {
15143 dtrace_difo_t *dp = helper->dtha_actions[j];
15144
15145 ASSERT(dp != NULL);
15146 dp = dtrace_difo_duplicate(dp, vstate);
15147 new->dtha_actions[j] = dp;
15148 }
15149
15150 if (last != NULL) {
15151 last->dtha_next = new;
15152 } else {
15153 newhelp->dthps_actions[i] = new;
15154 }
15155
15156 last = new;
15157 }
15158 }
15159
15160 /*
15161 * Duplicate the helper providers and register them with the
15162 * DTrace framework.
15163 */
15164 if (help->dthps_nprovs > 0) {
15165 newhelp->dthps_nprovs = help->dthps_nprovs;
15166 newhelp->dthps_maxprovs = help->dthps_nprovs;
15167 newhelp->dthps_provs = kmem_alloc(newhelp->dthps_nprovs *
15168 sizeof (dtrace_helper_provider_t *), KM_SLEEP);
15169 for (i = 0; i < newhelp->dthps_nprovs; i++) {
15170 newhelp->dthps_provs[i] = help->dthps_provs[i];
15171 newhelp->dthps_provs[i]->dthp_ref++;
15172 }
15173
15174 hasprovs = 1;
15175 }
15176
15177 mutex_exit(&dtrace_lock);
15178
15179 if (hasprovs)
15180 dtrace_helper_provider_register(to, newhelp, NULL);
15181}
15182
15183/*
15184 * DTrace Hook Functions
15185 */
15186static void
15187dtrace_module_loaded(modctl_t *ctl)
15188{
15189 dtrace_provider_t *prv;
15190
15191 mutex_enter(&dtrace_provider_lock);
15192#if defined(sun)
15193 mutex_enter(&mod_lock);
15194#endif
15195
15196#if defined(sun)
15197 ASSERT(ctl->mod_busy);
15198#endif
15199
15200 /*
15201	 * We're going to call each provider's per-module provide operation
15202 * specifying only this module.
15203 */
15204 for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next)
15205 prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
15206
15207#if defined(sun)
15208 mutex_exit(&mod_lock);
15209#endif
15210 mutex_exit(&dtrace_provider_lock);
15211
15212 /*
15213 * If we have any retained enablings, we need to match against them.
15214 * Enabling probes requires that cpu_lock be held, and we cannot hold
15215 * cpu_lock here -- it is legal for cpu_lock to be held when loading a
15216 * module. (In particular, this happens when loading scheduling
15217 * classes.) So if we have any retained enablings, we need to dispatch
15218 * our task queue to do the match for us.
15219 */
15220 mutex_enter(&dtrace_lock);
15221
15222 if (dtrace_retained == NULL) {
15223 mutex_exit(&dtrace_lock);
15224 return;
15225 }
15226
15227 (void) taskq_dispatch(dtrace_taskq,
15228 (task_func_t *)dtrace_enabling_matchall, NULL, TQ_SLEEP);
15229
15230 mutex_exit(&dtrace_lock);
15231
15232 /*
15233 * And now, for a little heuristic sleaze: in general, we want to
15234 * match modules as soon as they load. However, we cannot guarantee
15235 * this, because it would lead us to the lock ordering violation
15236 * outlined above. The common case, of course, is that cpu_lock is
15237 * _not_ held -- so we delay here for a clock tick, hoping that that's
15238 * long enough for the task queue to do its work. If it's not, it's
15239 * not a serious problem -- it just means that the module that we
15240 * just loaded may not be immediately instrumentable.
15241 */
15242 delay(1);
15243}
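/*
 * Editor's note: a hedged sketch of the deferral pattern described above.
 * Because the matching work needs cpu_lock, and cpu_lock may already be
 * held on the module-load path, the work is queued to task context rather
 * than done inline.  The queue and dispatch call are the Solaris-
 * compatibility taskq API already used in this file; the worker name is
 * illustrative.
 */
static void
example_match_task(void *ignored)
{
	/*
	 * Runs later in taskq context, where taking cpu_lock (as
	 * dtrace_enabling_matchall() does) is safe.
	 */
}

static void
example_defer_match(void)
{
	(void) taskq_dispatch(dtrace_taskq, example_match_task,
	    NULL, TQ_SLEEP);
}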
15244
15245static void
15246#if defined(sun)
15247dtrace_module_unloaded(modctl_t *ctl)
15248#else
15249dtrace_module_unloaded(modctl_t *ctl, int *error)
15250#endif
15251{
15252 dtrace_probe_t template, *probe, *first, *next;
15253 dtrace_provider_t *prov;
15254#if !defined(sun)
15255 char modname[DTRACE_MODNAMELEN];
15256 size_t len;
15257#endif
15258
15259#if defined(sun)
15260 template.dtpr_mod = ctl->mod_modname;
15261#else
15262 /* Handle the fact that ctl->filename may end in ".ko". */
15263 strlcpy(modname, ctl->filename, sizeof(modname));
15264 len = strlen(ctl->filename);
15265 if (len > 3 && strcmp(modname + len - 3, ".ko") == 0)
15266 modname[len - 3] = '\0';
15267 template.dtpr_mod = modname;
15268#endif
15269
15270 mutex_enter(&dtrace_provider_lock);
15271#if defined(sun)
15272 mutex_enter(&mod_lock);
15273#endif
15274 mutex_enter(&dtrace_lock);
15275
15276#if !defined(sun)
15277 if (ctl->nenabled > 0) {
15278 /* Don't allow unloads if a probe is enabled. */
15279 mutex_exit(&dtrace_provider_lock);
15280 mutex_exit(&dtrace_lock);
15281 *error = -1;
15282 printf(
15283 "kldunload: attempt to unload module that has DTrace probes enabled\n");
15284 return;
15285 }
15286#endif
15287
15288 if (dtrace_bymod == NULL) {
15289 /*
15290 * The DTrace module is loaded (obviously) but not attached;
15291 * we don't have any work to do.
15292 */
15293 mutex_exit(&dtrace_provider_lock);
15294#if defined(sun)
15295 mutex_exit(&mod_lock);
15296#endif
15297 mutex_exit(&dtrace_lock);
15298 return;
15299 }
15300
15301 for (probe = first = dtrace_hash_lookup(dtrace_bymod, &template);
15302 probe != NULL; probe = probe->dtpr_nextmod) {
15303 if (probe->dtpr_ecb != NULL) {
15304 mutex_exit(&dtrace_provider_lock);
15305#if defined(sun)
15306 mutex_exit(&mod_lock);
15307#endif
15308 mutex_exit(&dtrace_lock);
15309
15310 /*
15311 * This shouldn't _actually_ be possible -- we're
15312 * unloading a module that has an enabled probe in it.
15313 * (It's normally up to the provider to make sure that
15314 * this can't happen.) However, because dtps_enable()
15315 * doesn't have a failure mode, there can be an
15316 * enable/unload race. Upshot: we don't want to
15317 * assert, but we're not going to disable the
15318 * probe, either.
15319 */
15320 if (dtrace_err_verbose) {
15321#if defined(sun)
15322 cmn_err(CE_WARN, "unloaded module '%s' had "
15323 "enabled probes", ctl->mod_modname);
15324#else
15325 cmn_err(CE_WARN, "unloaded module '%s' had "
15326 "enabled probes", modname);
15327#endif
15328 }
15329
15330 return;
15331 }
15332 }
15333
15334 probe = first;
15335
15336 for (first = NULL; probe != NULL; probe = next) {
15337 ASSERT(dtrace_probes[probe->dtpr_id - 1] == probe);
15338
15339 dtrace_probes[probe->dtpr_id - 1] = NULL;
15340
15341 next = probe->dtpr_nextmod;
15342 dtrace_hash_remove(dtrace_bymod, probe);
15343 dtrace_hash_remove(dtrace_byfunc, probe);
15344 dtrace_hash_remove(dtrace_byname, probe);
15345
15346 if (first == NULL) {
15347 first = probe;
15348 probe->dtpr_nextmod = NULL;
15349 } else {
15350 probe->dtpr_nextmod = first;
15351 first = probe;
15352 }
15353 }
15354
15355 /*
15356 * We've removed all of the module's probes from the hash chains and
15357 * from the probe array. Now issue a dtrace_sync() to be sure that
15358 * everyone has cleared out from any probe array processing.
15359 */
15360 dtrace_sync();
15361
15362 for (probe = first; probe != NULL; probe = first) {
15363 first = probe->dtpr_nextmod;
15364 prov = probe->dtpr_provider;
15365 prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, probe->dtpr_id,
15366 probe->dtpr_arg);
15367 kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
15368 kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
15369 kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
15370#if defined(sun)
15371 vmem_free(dtrace_arena, (void *)(uintptr_t)probe->dtpr_id, 1);
15372#else
15373 free_unr(dtrace_arena, probe->dtpr_id);
15374#endif
15375 kmem_free(probe, sizeof (dtrace_probe_t));
15376 }
15377
15378 mutex_exit(&dtrace_lock);
15379#if defined(sun)
15380 mutex_exit(&mod_lock);
15381#endif
15382 mutex_exit(&dtrace_provider_lock);
15383}
15384
15385#if !defined(sun)
15386static void
15387dtrace_kld_load(void *arg __unused, linker_file_t lf)
15388{
15389
15390 dtrace_module_loaded(lf);
15391}
15392
15393static void
15394dtrace_kld_unload_try(void *arg __unused, linker_file_t lf, int *error)
15395{
15396
15397 if (*error != 0)
15398 /* We already have an error, so don't do anything. */
15399 return;
15400 dtrace_module_unloaded(lf, error);
15401}
15402#endif
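/*
 * Editor's note: these two wrappers are not invoked directly by the
 * linker.  Elsewhere in the FreeBSD port (see the driver load/unload
 * paths) they are attached to the kernel linker's events via
 * eventhandler(9); a sketch of that registration, with illustrative tag
 * names, is:
 *
 *	tag_load = EVENTHANDLER_REGISTER(kld_load, dtrace_kld_load,
 *	    NULL, EVENTHANDLER_PRI_ANY);
 *	tag_unload_try = EVENTHANDLER_REGISTER(kld_unload_try,
 *	    dtrace_kld_unload_try, NULL, EVENTHANDLER_PRI_ANY);
 */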
15403
15404#if defined(sun)
15405static void
15406dtrace_suspend(void)
15407{
15408 dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_suspend));
15409}
15410
15411static void
15412dtrace_resume(void)
15413{
15414 dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_resume));
15415}
15416#endif
15417
15418static int
15419dtrace_cpu_setup(cpu_setup_t what, processorid_t cpu)
15420{
15421 ASSERT(MUTEX_HELD(&cpu_lock));
15422 mutex_enter(&dtrace_lock);
15423
15424 switch (what) {
15425 case CPU_CONFIG: {
15426 dtrace_state_t *state;
15427 dtrace_optval_t *opt, rs, c;
15428
15429 /*
15430 * For now, we only allocate a new buffer for anonymous state.
15431 */
15432 if ((state = dtrace_anon.dta_state) == NULL)
15433 break;
15434
15435 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
15436 break;
15437
15438 opt = state->dts_options;
15439 c = opt[DTRACEOPT_CPU];
15440
15441 if (c != DTRACE_CPUALL && c != DTRACEOPT_UNSET && c != cpu)
15442 break;
15443
15444 /*
15445 * Regardless of what the actual policy is, we're going to
15446 * temporarily set our resize policy to be manual. We're
15447 * also going to temporarily set our CPU option to denote
15448 * the newly configured CPU.
15449 */
15450 rs = opt[DTRACEOPT_BUFRESIZE];
15451 opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_MANUAL;
15452 opt[DTRACEOPT_CPU] = (dtrace_optval_t)cpu;
15453
15454 (void) dtrace_state_buffers(state);
15455
15456 opt[DTRACEOPT_BUFRESIZE] = rs;
15457 opt[DTRACEOPT_CPU] = c;
15458
15459 break;
15460 }
15461
15462 case CPU_UNCONFIG:
15463 /*
15464 * We don't free the buffer in the CPU_UNCONFIG case. (The
15465 * buffer will be freed when the consumer exits.)
15466 */
15467 break;
15468
15469 default:
15470 break;
15471 }
15472
15473 mutex_exit(&dtrace_lock);
15474 return (0);
15475}
15476
15477#if defined(sun)
15478static void
15479dtrace_cpu_setup_initial(processorid_t cpu)
15480{
15481 (void) dtrace_cpu_setup(CPU_CONFIG, cpu);
15482}
15483#endif
15484
15485static void
15486dtrace_toxrange_add(uintptr_t base, uintptr_t limit)
15487{
15488 if (dtrace_toxranges >= dtrace_toxranges_max) {
15489 int osize, nsize;
15490 dtrace_toxrange_t *range;
15491
15492 osize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
15493
15494 if (osize == 0) {
15495 ASSERT(dtrace_toxrange == NULL);
15496 ASSERT(dtrace_toxranges_max == 0);
15497 dtrace_toxranges_max = 1;
15498 } else {
15499 dtrace_toxranges_max <<= 1;
15500 }
15501
15502 nsize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
15503 range = kmem_zalloc(nsize, KM_SLEEP);
15504
15505 if (dtrace_toxrange != NULL) {
15506 ASSERT(osize != 0);
15507 bcopy(dtrace_toxrange, range, osize);
15508 kmem_free(dtrace_toxrange, osize);
15509 }
15510
15511 dtrace_toxrange = range;
15512 }
15513
15514 ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_base == 0);
15515 ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_limit == 0);
15516
15517 dtrace_toxrange[dtrace_toxranges].dtt_base = base;
15518 dtrace_toxrange[dtrace_toxranges].dtt_limit = limit;
15519 dtrace_toxranges++;
15520}
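/*
 * Editor's note: dtrace_toxrange_add() above grows its table
 * geometrically -- double the capacity, allocate a fresh zeroed table,
 * copy, free the old one -- so n insertions cost O(n) amortized copying.
 * A self-contained user-space sketch of the same strategy (all names
 * hypothetical):
 */
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

struct range { uintptr_t base, limit; };

static struct range *rtab;
static size_t rtab_len, rtab_max;

static int
range_table_add(uintptr_t base, uintptr_t limit)
{
	if (rtab_len >= rtab_max) {
		size_t nmax = (rtab_max == 0) ? 1 : rtab_max * 2;
		struct range *ntab = calloc(nmax, sizeof (*ntab));

		if (ntab == NULL)
			return (-1);
		if (rtab != NULL) {
			memcpy(ntab, rtab, rtab_max * sizeof (*rtab));
			free(rtab);
		}
		rtab = ntab;
		rtab_max = nmax;
	}
	rtab[rtab_len].base = base;
	rtab[rtab_len].limit = limit;
	rtab_len++;
	return (0);
}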
15521
14772 return (0);
14773}
14774
14775static int
14776dtrace_state_option(dtrace_state_t *state, dtrace_optid_t option,
14777 dtrace_optval_t val)
14778{
14779 ASSERT(MUTEX_HELD(&dtrace_lock));
14780
14781 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
14782 return (EBUSY);
14783
14784 if (option >= DTRACEOPT_MAX)
14785 return (EINVAL);
14786
14787 if (option != DTRACEOPT_CPU && val < 0)
14788 return (EINVAL);
14789
14790 switch (option) {
14791 case DTRACEOPT_DESTRUCTIVE:
14792 if (dtrace_destructive_disallow)
14793 return (EACCES);
14794
14795 state->dts_cred.dcr_destructive = 1;
14796 break;
14797
14798 case DTRACEOPT_BUFSIZE:
14799 case DTRACEOPT_DYNVARSIZE:
14800 case DTRACEOPT_AGGSIZE:
14801 case DTRACEOPT_SPECSIZE:
14802 case DTRACEOPT_STRSIZE:
14803 if (val < 0)
14804 return (EINVAL);
14805
14806 if (val >= LONG_MAX) {
14807 /*
14808 * If this is an otherwise negative value, set it to
14809 * the highest multiple of 128m less than LONG_MAX.
14810 * Technically, we're adjusting the size without
14811 * regard to the buffer resizing policy, but in fact,
14812 * this has no effect -- if we set the buffer size to
14813 * ~LONG_MAX and the buffer policy is ultimately set to
14814 * be "manual", the buffer allocation is guaranteed to
14815 * fail, if only because the allocation requires two
14816	 * buffers. (We set the size to the highest
14817 * multiple of 128m because it ensures that the size
14818 * will remain a multiple of a megabyte when
14819 * repeatedly halved -- all the way down to 15m.)
14820 */
14821 val = LONG_MAX - (1 << 27) + 1;
14822 }
14823 }
14824
14825 state->dts_options[option] = val;
14826
14827 return (0);
14828}
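/*
 * Editor's note: a quick illustration of the clamp arithmetic in the
 * buffer-size comment above.  LONG_MAX - (1 << 27) + 1 is the highest
 * multiple of 128MB below LONG_MAX (with 32-bit longs it is 15 * 128MB),
 * so seven successive halvings -- from 128MB granularity down to 1MB
 * granularity -- keep the value a whole number of megabytes.  An
 * illustrative user-space check only:
 */
#include <assert.h>
#include <limits.h>

static void
check_bufsize_clamp(void)
{
	long val = LONG_MAX - (1L << 27) + 1;
	int i;

	assert(val % (1L << 27) == 0);		/* a multiple of 128MB */
	for (i = 0; i < 7; i++) {
		val /= 2;
		assert(val % (1L << 20) == 0);	/* still a megabyte multiple */
	}
}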
14829
14830static void
14831dtrace_state_destroy(dtrace_state_t *state)
14832{
14833 dtrace_ecb_t *ecb;
14834 dtrace_vstate_t *vstate = &state->dts_vstate;
14835#if defined(sun)
14836 minor_t minor = getminor(state->dts_dev);
14837#endif
14838 int i, bufsize = NCPU * sizeof (dtrace_buffer_t);
14839 dtrace_speculation_t *spec = state->dts_speculations;
14840 int nspec = state->dts_nspeculations;
14841 uint32_t match;
14842
14843 ASSERT(MUTEX_HELD(&dtrace_lock));
14844 ASSERT(MUTEX_HELD(&cpu_lock));
14845
14846 /*
14847 * First, retract any retained enablings for this state.
14848 */
14849 dtrace_enabling_retract(state);
14850 ASSERT(state->dts_nretained == 0);
14851
14852 if (state->dts_activity == DTRACE_ACTIVITY_ACTIVE ||
14853 state->dts_activity == DTRACE_ACTIVITY_DRAINING) {
14854 /*
14855 * We have managed to come into dtrace_state_destroy() on a
14856 * hot enabling -- almost certainly because of a disorderly
14857 * shutdown of a consumer. (That is, a consumer that is
14858 * exiting without having called dtrace_stop().) In this case,
14859 * we're going to set our activity to be KILLED, and then
14860 * issue a sync to be sure that everyone is out of probe
14861 * context before we start blowing away ECBs.
14862 */
14863 state->dts_activity = DTRACE_ACTIVITY_KILLED;
14864 dtrace_sync();
14865 }
14866
14867 /*
14868 * Release the credential hold we took in dtrace_state_create().
14869 */
14870 if (state->dts_cred.dcr_cred != NULL)
14871 crfree(state->dts_cred.dcr_cred);
14872
14873 /*
14874 * Now we can safely disable and destroy any enabled probes. Because
14875 * any DTRACE_PRIV_KERNEL probes may actually be slowing our progress
14876 * (especially if they're all enabled), we take two passes through the
14877 * ECBs: in the first, we disable just DTRACE_PRIV_KERNEL probes, and
14878 * in the second we disable whatever is left over.
14879 */
14880 for (match = DTRACE_PRIV_KERNEL; ; match = 0) {
14881 for (i = 0; i < state->dts_necbs; i++) {
14882 if ((ecb = state->dts_ecbs[i]) == NULL)
14883 continue;
14884
14885 if (match && ecb->dte_probe != NULL) {
14886 dtrace_probe_t *probe = ecb->dte_probe;
14887 dtrace_provider_t *prov = probe->dtpr_provider;
14888
14889 if (!(prov->dtpv_priv.dtpp_flags & match))
14890 continue;
14891 }
14892
14893 dtrace_ecb_disable(ecb);
14894 dtrace_ecb_destroy(ecb);
14895 }
14896
14897 if (!match)
14898 break;
14899 }
14900
14901 /*
14902 * Before we free the buffers, perform one more sync to assure that
14903 * every CPU is out of probe context.
14904 */
14905 dtrace_sync();
14906
14907 dtrace_buffer_free(state->dts_buffer);
14908 dtrace_buffer_free(state->dts_aggbuffer);
14909
14910 for (i = 0; i < nspec; i++)
14911 dtrace_buffer_free(spec[i].dtsp_buffer);
14912
14913#if defined(sun)
14914 if (state->dts_cleaner != CYCLIC_NONE)
14915 cyclic_remove(state->dts_cleaner);
14916
14917 if (state->dts_deadman != CYCLIC_NONE)
14918 cyclic_remove(state->dts_deadman);
14919#else
14920 callout_stop(&state->dts_cleaner);
14921 callout_drain(&state->dts_cleaner);
14922 callout_stop(&state->dts_deadman);
14923 callout_drain(&state->dts_deadman);
14924#endif
14925
14926 dtrace_dstate_fini(&vstate->dtvs_dynvars);
14927 dtrace_vstate_fini(vstate);
14928 if (state->dts_ecbs != NULL)
14929 kmem_free(state->dts_ecbs, state->dts_necbs * sizeof (dtrace_ecb_t *));
14930
14931 if (state->dts_aggregations != NULL) {
14932#ifdef DEBUG
14933 for (i = 0; i < state->dts_naggregations; i++)
14934 ASSERT(state->dts_aggregations[i] == NULL);
14935#endif
14936 ASSERT(state->dts_naggregations > 0);
14937 kmem_free(state->dts_aggregations,
14938 state->dts_naggregations * sizeof (dtrace_aggregation_t *));
14939 }
14940
14941 kmem_free(state->dts_buffer, bufsize);
14942 kmem_free(state->dts_aggbuffer, bufsize);
14943
14944 for (i = 0; i < nspec; i++)
14945 kmem_free(spec[i].dtsp_buffer, bufsize);
14946
14947 if (spec != NULL)
14948 kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
14949
14950 dtrace_format_destroy(state);
14951
14952 if (state->dts_aggid_arena != NULL) {
14953#if defined(sun)
14954 vmem_destroy(state->dts_aggid_arena);
14955#else
14956 delete_unrhdr(state->dts_aggid_arena);
14957#endif
14958 state->dts_aggid_arena = NULL;
14959 }
14960#if defined(sun)
14961 ddi_soft_state_free(dtrace_softstate, minor);
14962 vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
14963#endif
14964}
14965
14966/*
14967 * DTrace Anonymous Enabling Functions
14968 */
14969static dtrace_state_t *
14970dtrace_anon_grab(void)
14971{
14972 dtrace_state_t *state;
14973
14974 ASSERT(MUTEX_HELD(&dtrace_lock));
14975
14976 if ((state = dtrace_anon.dta_state) == NULL) {
14977 ASSERT(dtrace_anon.dta_enabling == NULL);
14978 return (NULL);
14979 }
14980
14981 ASSERT(dtrace_anon.dta_enabling != NULL);
14982 ASSERT(dtrace_retained != NULL);
14983
14984 dtrace_enabling_destroy(dtrace_anon.dta_enabling);
14985 dtrace_anon.dta_enabling = NULL;
14986 dtrace_anon.dta_state = NULL;
14987
14988 return (state);
14989}
14990
14991static void
14992dtrace_anon_property(void)
14993{
14994 int i, rv;
14995 dtrace_state_t *state;
14996 dof_hdr_t *dof;
14997 char c[32]; /* enough for "dof-data-" + digits */
14998
14999 ASSERT(MUTEX_HELD(&dtrace_lock));
15000 ASSERT(MUTEX_HELD(&cpu_lock));
15001
15002 for (i = 0; ; i++) {
15003 (void) snprintf(c, sizeof (c), "dof-data-%d", i);
15004
15005 dtrace_err_verbose = 1;
15006
15007 if ((dof = dtrace_dof_property(c)) == NULL) {
15008 dtrace_err_verbose = 0;
15009 break;
15010 }
15011
15012#if defined(sun)
15013 /*
15014 * We want to create anonymous state, so we need to transition
15015 * the kernel debugger to indicate that DTrace is active. If
15016 * this fails (e.g. because the debugger has modified text in
15017 * some way), we won't continue with the processing.
15018 */
15019 if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
15020 cmn_err(CE_NOTE, "kernel debugger active; anonymous "
15021 "enabling ignored.");
15022 dtrace_dof_destroy(dof);
15023 break;
15024 }
15025#endif
15026
15027 /*
15028 * If we haven't allocated an anonymous state, we'll do so now.
15029 */
15030 if ((state = dtrace_anon.dta_state) == NULL) {
15031#if defined(sun)
15032 state = dtrace_state_create(NULL, NULL);
15033#else
15034 state = dtrace_state_create(NULL);
15035#endif
15036 dtrace_anon.dta_state = state;
15037
15038 if (state == NULL) {
15039 /*
15040 * This basically shouldn't happen: the only
15041 * failure mode from dtrace_state_create() is a
15042 * failure of ddi_soft_state_zalloc() that
15043 * itself should never happen. Still, the
15044 * interface allows for a failure mode, and
15045 * we want to fail as gracefully as possible:
15046 * we'll emit an error message and cease
15047 * processing anonymous state in this case.
15048 */
15049 cmn_err(CE_WARN, "failed to create "
15050 "anonymous state");
15051 dtrace_dof_destroy(dof);
15052 break;
15053 }
15054 }
15055
15056 rv = dtrace_dof_slurp(dof, &state->dts_vstate, CRED(),
15057 &dtrace_anon.dta_enabling, 0, B_TRUE);
15058
15059 if (rv == 0)
15060 rv = dtrace_dof_options(dof, state);
15061
15062 dtrace_err_verbose = 0;
15063 dtrace_dof_destroy(dof);
15064
15065 if (rv != 0) {
15066 /*
15067 * This is malformed DOF; chuck any anonymous state
15068 * that we created.
15069 */
15070 ASSERT(dtrace_anon.dta_enabling == NULL);
15071 dtrace_state_destroy(state);
15072 dtrace_anon.dta_state = NULL;
15073 break;
15074 }
15075
15076 ASSERT(dtrace_anon.dta_enabling != NULL);
15077 }
15078
15079 if (dtrace_anon.dta_enabling != NULL) {
15080 int rval;
15081
15082 /*
15083 * dtrace_enabling_retain() can only fail because we are
15084 * trying to retain more enablings than are allowed -- but
15085 * we only have one anonymous enabling, and we are guaranteed
15086 * to be allowed at least one retained enabling; we assert
15087 * that dtrace_enabling_retain() returns success.
15088 */
15089 rval = dtrace_enabling_retain(dtrace_anon.dta_enabling);
15090 ASSERT(rval == 0);
15091
15092 dtrace_enabling_dump(dtrace_anon.dta_enabling);
15093 }
15094}
15095
15096/*
15097 * DTrace Helper Functions
15098 */
15099static void
15100dtrace_helper_trace(dtrace_helper_action_t *helper,
15101 dtrace_mstate_t *mstate, dtrace_vstate_t *vstate, int where)
15102{
15103 uint32_t size, next, nnext, i;
15104 dtrace_helptrace_t *ent;
15105 uint16_t flags = cpu_core[curcpu].cpuc_dtrace_flags;
15106
15107 if (!dtrace_helptrace_enabled)
15108 return;
15109
15110 ASSERT(vstate->dtvs_nlocals <= dtrace_helptrace_nlocals);
15111
15112 /*
15113 * What would a tracing framework be without its own tracing
15114 * framework? (Well, a hell of a lot simpler, for starters...)
15115 */
15116 size = sizeof (dtrace_helptrace_t) + dtrace_helptrace_nlocals *
15117 sizeof (uint64_t) - sizeof (uint64_t);
15118
15119 /*
15120 * Iterate until we can allocate a slot in the trace buffer.
15121 */
15122 do {
15123 next = dtrace_helptrace_next;
15124
15125 if (next + size < dtrace_helptrace_bufsize) {
15126 nnext = next + size;
15127 } else {
15128 nnext = size;
15129 }
15130 } while (dtrace_cas32(&dtrace_helptrace_next, next, nnext) != next);
15131
15132 /*
15133 * We have our slot; fill it in.
15134 */
15135 if (nnext == size)
15136 next = 0;
15137
15138 ent = (dtrace_helptrace_t *)&dtrace_helptrace_buffer[next];
15139 ent->dtht_helper = helper;
15140 ent->dtht_where = where;
15141 ent->dtht_nlocals = vstate->dtvs_nlocals;
15142
15143 ent->dtht_fltoffs = (mstate->dtms_present & DTRACE_MSTATE_FLTOFFS) ?
15144 mstate->dtms_fltoffs : -1;
15145 ent->dtht_fault = DTRACE_FLAGS2FLT(flags);
15146 ent->dtht_illval = cpu_core[curcpu].cpuc_dtrace_illval;
15147
15148 for (i = 0; i < vstate->dtvs_nlocals; i++) {
15149 dtrace_statvar_t *svar;
15150
15151 if ((svar = vstate->dtvs_locals[i]) == NULL)
15152 continue;
15153
15154 ASSERT(svar->dtsv_size >= NCPU * sizeof (uint64_t));
15155 ent->dtht_locals[i] =
15156 ((uint64_t *)(uintptr_t)svar->dtsv_data)[curcpu];
15157 }
15158}
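/*
 * Editor's note: the slot allocation above is a lock-free ring-buffer
 * cursor claim: snapshot the cursor, compute the successor (wrapping back
 * to the buffer start when the record would not fit), and publish it with
 * a compare-and-swap, retrying on contention.  A user-space sketch using
 * C11 atomics in place of dtrace_cas32() (names hypothetical):
 */
#include <stdatomic.h>
#include <stdint.h>

#define	EX_BUFSIZE	4096

static _Atomic uint32_t ex_cursor;

/* Claim 'size' bytes; returns the byte offset of the new record. */
static uint32_t
ex_claim_slot(uint32_t size)
{
	uint32_t next, nnext;

	do {
		next = atomic_load(&ex_cursor);
		nnext = (next + size < EX_BUFSIZE) ? next + size : size;
	} while (!atomic_compare_exchange_weak(&ex_cursor, &next, nnext));

	/* As above: nnext == size means the record wrapped to offset 0. */
	return (nnext == size ? 0 : next);
}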
15159
15160static uint64_t
15161dtrace_helper(int which, dtrace_mstate_t *mstate,
15162 dtrace_state_t *state, uint64_t arg0, uint64_t arg1)
15163{
15164 uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags;
15165 uint64_t sarg0 = mstate->dtms_arg[0];
15166 uint64_t sarg1 = mstate->dtms_arg[1];
15167 uint64_t rval = 0;
15168 dtrace_helpers_t *helpers = curproc->p_dtrace_helpers;
15169 dtrace_helper_action_t *helper;
15170 dtrace_vstate_t *vstate;
15171 dtrace_difo_t *pred;
15172 int i, trace = dtrace_helptrace_enabled;
15173
15174 ASSERT(which >= 0 && which < DTRACE_NHELPER_ACTIONS);
15175
15176 if (helpers == NULL)
15177 return (0);
15178
15179 if ((helper = helpers->dthps_actions[which]) == NULL)
15180 return (0);
15181
15182 vstate = &helpers->dthps_vstate;
15183 mstate->dtms_arg[0] = arg0;
15184 mstate->dtms_arg[1] = arg1;
15185
15186 /*
15187 * Now iterate over each helper. If its predicate evaluates to 'true',
15188 * we'll call the corresponding actions. Note that the below calls
15189 * to dtrace_dif_emulate() may set faults in machine state. This is
15190 * okay: our caller (the outer dtrace_dif_emulate()) will simply plow
15191 * the stored DIF offset with its own (which is the desired behavior).
15192 * Also, note the calls to dtrace_dif_emulate() may allocate scratch
15193 * from machine state; this is okay, too.
15194 */
15195 for (; helper != NULL; helper = helper->dtha_next) {
15196 if ((pred = helper->dtha_predicate) != NULL) {
15197 if (trace)
15198 dtrace_helper_trace(helper, mstate, vstate, 0);
15199
15200 if (!dtrace_dif_emulate(pred, mstate, vstate, state))
15201 goto next;
15202
15203 if (*flags & CPU_DTRACE_FAULT)
15204 goto err;
15205 }
15206
15207 for (i = 0; i < helper->dtha_nactions; i++) {
15208 if (trace)
15209 dtrace_helper_trace(helper,
15210 mstate, vstate, i + 1);
15211
15212 rval = dtrace_dif_emulate(helper->dtha_actions[i],
15213 mstate, vstate, state);
15214
15215 if (*flags & CPU_DTRACE_FAULT)
15216 goto err;
15217 }
15218
15219next:
15220 if (trace)
15221 dtrace_helper_trace(helper, mstate, vstate,
15222 DTRACE_HELPTRACE_NEXT);
15223 }
15224
15225 if (trace)
15226 dtrace_helper_trace(helper, mstate, vstate,
15227 DTRACE_HELPTRACE_DONE);
15228
15229 /*
15230 * Restore the arg0 that we saved upon entry.
15231 */
15232 mstate->dtms_arg[0] = sarg0;
15233 mstate->dtms_arg[1] = sarg1;
15234
15235 return (rval);
15236
15237err:
15238 if (trace)
15239 dtrace_helper_trace(helper, mstate, vstate,
15240 DTRACE_HELPTRACE_ERR);
15241
15242 /*
15243 * Restore the arg0 that we saved upon entry.
15244 */
15245 mstate->dtms_arg[0] = sarg0;
15246 mstate->dtms_arg[1] = sarg1;
15247
15248 return (0);
15249}
15250
15251static void
15252dtrace_helper_action_destroy(dtrace_helper_action_t *helper,
15253 dtrace_vstate_t *vstate)
15254{
15255 int i;
15256
15257 if (helper->dtha_predicate != NULL)
15258 dtrace_difo_release(helper->dtha_predicate, vstate);
15259
15260 for (i = 0; i < helper->dtha_nactions; i++) {
15261 ASSERT(helper->dtha_actions[i] != NULL);
15262 dtrace_difo_release(helper->dtha_actions[i], vstate);
15263 }
15264
15265 kmem_free(helper->dtha_actions,
15266 helper->dtha_nactions * sizeof (dtrace_difo_t *));
15267 kmem_free(helper, sizeof (dtrace_helper_action_t));
15268}
15269
15270static int
15271dtrace_helper_destroygen(int gen)
15272{
15273 proc_t *p = curproc;
15274 dtrace_helpers_t *help = p->p_dtrace_helpers;
15275 dtrace_vstate_t *vstate;
15276 int i;
15277
15278 ASSERT(MUTEX_HELD(&dtrace_lock));
15279
15280 if (help == NULL || gen > help->dthps_generation)
15281 return (EINVAL);
15282
15283 vstate = &help->dthps_vstate;
15284
15285 for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
15286 dtrace_helper_action_t *last = NULL, *h, *next;
15287
15288 for (h = help->dthps_actions[i]; h != NULL; h = next) {
15289 next = h->dtha_next;
15290
15291 if (h->dtha_generation == gen) {
15292 if (last != NULL) {
15293 last->dtha_next = next;
15294 } else {
15295 help->dthps_actions[i] = next;
15296 }
15297
15298 dtrace_helper_action_destroy(h, vstate);
15299 } else {
15300 last = h;
15301 }
15302 }
15303 }
15304
15305 /*
15306	 * Iterate until we've cleared out all helper providers with the
15307 * given generation number.
15308 */
15309 for (;;) {
15310 dtrace_helper_provider_t *prov;
15311
15312 /*
15313 * Look for a helper provider with the right generation. We
15314 * have to start back at the beginning of the list each time
15315 * because we drop dtrace_lock. It's unlikely that we'll make
15316 * more than two passes.
15317 */
15318 for (i = 0; i < help->dthps_nprovs; i++) {
15319 prov = help->dthps_provs[i];
15320
15321 if (prov->dthp_generation == gen)
15322 break;
15323 }
15324
15325 /*
15326 * If there were no matches, we're done.
15327 */
15328 if (i == help->dthps_nprovs)
15329 break;
15330
15331 /*
15332 * Move the last helper provider into this slot.
15333 */
15334 help->dthps_nprovs--;
15335 help->dthps_provs[i] = help->dthps_provs[help->dthps_nprovs];
15336 help->dthps_provs[help->dthps_nprovs] = NULL;
15337
15338 mutex_exit(&dtrace_lock);
15339
15340 /*
15341 * If we have a meta provider, remove this helper provider.
15342 */
15343 mutex_enter(&dtrace_meta_lock);
15344 if (dtrace_meta_pid != NULL) {
15345 ASSERT(dtrace_deferred_pid == NULL);
15346 dtrace_helper_provider_remove(&prov->dthp_prov,
15347 p->p_pid);
15348 }
15349 mutex_exit(&dtrace_meta_lock);
15350
15351 dtrace_helper_provider_destroy(prov);
15352
15353 mutex_enter(&dtrace_lock);
15354 }
15355
15356 return (0);
15357}
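/*
 * Editor's note: the provider removal loop above deletes from an
 * unordered array in O(1) by moving the final element into the vacated
 * slot and shrinking the count, rather than shifting the tail down.  The
 * generic shape (hypothetical names):
 */
#include <stddef.h>

static void
unordered_remove(void **array, size_t *countp, size_t i)
{
	(*countp)--;
	array[i] = array[*countp];	/* last element fills the hole */
	array[*countp] = NULL;
}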
15358
15359static int
15360dtrace_helper_validate(dtrace_helper_action_t *helper)
15361{
15362 int err = 0, i;
15363 dtrace_difo_t *dp;
15364
15365 if ((dp = helper->dtha_predicate) != NULL)
15366 err += dtrace_difo_validate_helper(dp);
15367
15368 for (i = 0; i < helper->dtha_nactions; i++)
15369 err += dtrace_difo_validate_helper(helper->dtha_actions[i]);
15370
15371 return (err == 0);
15372}
15373
15374static int
15375dtrace_helper_action_add(int which, dtrace_ecbdesc_t *ep)
15376{
15377 dtrace_helpers_t *help;
15378 dtrace_helper_action_t *helper, *last;
15379 dtrace_actdesc_t *act;
15380 dtrace_vstate_t *vstate;
15381 dtrace_predicate_t *pred;
15382 int count = 0, nactions = 0, i;
15383
15384 if (which < 0 || which >= DTRACE_NHELPER_ACTIONS)
15385 return (EINVAL);
15386
15387 help = curproc->p_dtrace_helpers;
15388 last = help->dthps_actions[which];
15389 vstate = &help->dthps_vstate;
15390
15391 for (count = 0; last != NULL; last = last->dtha_next) {
15392 count++;
15393 if (last->dtha_next == NULL)
15394 break;
15395 }
15396
15397 /*
15398 * If we already have dtrace_helper_actions_max helper actions for this
15399 * helper action type, we'll refuse to add a new one.
15400 */
15401 if (count >= dtrace_helper_actions_max)
15402 return (ENOSPC);
15403
15404 helper = kmem_zalloc(sizeof (dtrace_helper_action_t), KM_SLEEP);
15405 helper->dtha_generation = help->dthps_generation;
15406
15407 if ((pred = ep->dted_pred.dtpdd_predicate) != NULL) {
15408 ASSERT(pred->dtp_difo != NULL);
15409 dtrace_difo_hold(pred->dtp_difo);
15410 helper->dtha_predicate = pred->dtp_difo;
15411 }
15412
15413 for (act = ep->dted_action; act != NULL; act = act->dtad_next) {
15414 if (act->dtad_kind != DTRACEACT_DIFEXPR)
15415 goto err;
15416
15417 if (act->dtad_difo == NULL)
15418 goto err;
15419
15420 nactions++;
15421 }
15422
15423 helper->dtha_actions = kmem_zalloc(sizeof (dtrace_difo_t *) *
15424 (helper->dtha_nactions = nactions), KM_SLEEP);
15425
15426 for (act = ep->dted_action, i = 0; act != NULL; act = act->dtad_next) {
15427 dtrace_difo_hold(act->dtad_difo);
15428 helper->dtha_actions[i++] = act->dtad_difo;
15429 }
15430
15431 if (!dtrace_helper_validate(helper))
15432 goto err;
15433
15434 if (last == NULL) {
15435 help->dthps_actions[which] = helper;
15436 } else {
15437 last->dtha_next = helper;
15438 }
15439
15440 if (vstate->dtvs_nlocals > dtrace_helptrace_nlocals) {
15441 dtrace_helptrace_nlocals = vstate->dtvs_nlocals;
15442 dtrace_helptrace_next = 0;
15443 }
15444
15445 return (0);
15446err:
15447 dtrace_helper_action_destroy(helper, vstate);
15448 return (EINVAL);
15449}
15450
15451static void
15452dtrace_helper_provider_register(proc_t *p, dtrace_helpers_t *help,
15453 dof_helper_t *dofhp)
15454{
15455 ASSERT(MUTEX_NOT_HELD(&dtrace_lock));
15456
15457 mutex_enter(&dtrace_meta_lock);
15458 mutex_enter(&dtrace_lock);
15459
15460 if (!dtrace_attached() || dtrace_meta_pid == NULL) {
15461 /*
15462 * If the dtrace module is loaded but not attached, or if
15463	 * there isn't a meta provider registered to deal with
15464 * these provider descriptions, we need to postpone creating
15465 * the actual providers until later.
15466 */
15467
15468 if (help->dthps_next == NULL && help->dthps_prev == NULL &&
15469 dtrace_deferred_pid != help) {
15470 help->dthps_deferred = 1;
15471 help->dthps_pid = p->p_pid;
15472 help->dthps_next = dtrace_deferred_pid;
15473 help->dthps_prev = NULL;
15474 if (dtrace_deferred_pid != NULL)
15475 dtrace_deferred_pid->dthps_prev = help;
15476 dtrace_deferred_pid = help;
15477 }
15478
15479 mutex_exit(&dtrace_lock);
15480
15481 } else if (dofhp != NULL) {
15482 /*
15483 * If the dtrace module is loaded and we have a particular
15484 * helper provider description, pass that off to the
15485 * meta provider.
15486 */
15487
15488 mutex_exit(&dtrace_lock);
15489
15490 dtrace_helper_provide(dofhp, p->p_pid);
15491
15492 } else {
15493 /*
15494 * Otherwise, just pass all the helper provider descriptions
15495 * off to the meta provider.
15496 */
15497
15498 int i;
15499 mutex_exit(&dtrace_lock);
15500
15501 for (i = 0; i < help->dthps_nprovs; i++) {
15502 dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
15503 p->p_pid);
15504 }
15505 }
15506
15507 mutex_exit(&dtrace_meta_lock);
15508}
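/*
 * Editor's note: the deferred-registration branch above is a plain head
 * insertion into the doubly linked list rooted at dtrace_deferred_pid.
 * The generic shape, with hypothetical types and names:
 */
struct ex_node {
	struct ex_node *next, *prev;
};

static struct ex_node *ex_deferred_head;

static void
ex_deferred_push(struct ex_node *n)
{
	n->next = ex_deferred_head;
	n->prev = NULL;
	if (ex_deferred_head != NULL)
		ex_deferred_head->prev = n;
	ex_deferred_head = n;
}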
15509
15510static int
15511dtrace_helper_provider_add(dof_helper_t *dofhp, int gen)
15512{
15513 dtrace_helpers_t *help;
15514 dtrace_helper_provider_t *hprov, **tmp_provs;
15515 uint_t tmp_maxprovs, i;
15516
15517 ASSERT(MUTEX_HELD(&dtrace_lock));
15518
15519 help = curproc->p_dtrace_helpers;
15520 ASSERT(help != NULL);
15521
15522 /*
15523 * If we already have dtrace_helper_providers_max helper providers,
15524	 * we'll refuse to add a new one.
15525 */
15526 if (help->dthps_nprovs >= dtrace_helper_providers_max)
15527 return (ENOSPC);
15528
15529 /*
15530 * Check to make sure this isn't a duplicate.
15531 */
15532 for (i = 0; i < help->dthps_nprovs; i++) {
15533 if (dofhp->dofhp_dof ==
15534 help->dthps_provs[i]->dthp_prov.dofhp_dof)
15535 return (EALREADY);
15536 }
15537
15538 hprov = kmem_zalloc(sizeof (dtrace_helper_provider_t), KM_SLEEP);
15539 hprov->dthp_prov = *dofhp;
15540 hprov->dthp_ref = 1;
15541 hprov->dthp_generation = gen;
15542
15543 /*
15544 * Allocate a bigger table for helper providers if it's already full.
15545 */
15546 if (help->dthps_maxprovs == help->dthps_nprovs) {
15547 tmp_maxprovs = help->dthps_maxprovs;
15548 tmp_provs = help->dthps_provs;
15549
15550 if (help->dthps_maxprovs == 0)
15551 help->dthps_maxprovs = 2;
15552 else
15553 help->dthps_maxprovs *= 2;
15554 if (help->dthps_maxprovs > dtrace_helper_providers_max)
15555 help->dthps_maxprovs = dtrace_helper_providers_max;
15556
15557 ASSERT(tmp_maxprovs < help->dthps_maxprovs);
15558
15559 help->dthps_provs = kmem_zalloc(help->dthps_maxprovs *
15560 sizeof (dtrace_helper_provider_t *), KM_SLEEP);
15561
15562 if (tmp_provs != NULL) {
15563 bcopy(tmp_provs, help->dthps_provs, tmp_maxprovs *
15564 sizeof (dtrace_helper_provider_t *));
15565 kmem_free(tmp_provs, tmp_maxprovs *
15566 sizeof (dtrace_helper_provider_t *));
15567 }
15568 }
15569
15570 help->dthps_provs[help->dthps_nprovs] = hprov;
15571 help->dthps_nprovs++;
15572
15573 return (0);
15574}
15575
15576static void
15577dtrace_helper_provider_destroy(dtrace_helper_provider_t *hprov)
15578{
15579 mutex_enter(&dtrace_lock);
15580
15581 if (--hprov->dthp_ref == 0) {
15582 dof_hdr_t *dof;
15583 mutex_exit(&dtrace_lock);
15584 dof = (dof_hdr_t *)(uintptr_t)hprov->dthp_prov.dofhp_dof;
15585 dtrace_dof_destroy(dof);
15586 kmem_free(hprov, sizeof (dtrace_helper_provider_t));
15587 } else {
15588 mutex_exit(&dtrace_lock);
15589 }
15590}
15591
15592static int
15593dtrace_helper_provider_validate(dof_hdr_t *dof, dof_sec_t *sec)
15594{
15595 uintptr_t daddr = (uintptr_t)dof;
15596 dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
15597 dof_provider_t *provider;
15598 dof_probe_t *probe;
15599 uint8_t *arg;
15600 char *strtab, *typestr;
15601 dof_stridx_t typeidx;
15602 size_t typesz;
15603 uint_t nprobes, j, k;
15604
15605 ASSERT(sec->dofs_type == DOF_SECT_PROVIDER);
15606
15607 if (sec->dofs_offset & (sizeof (uint_t) - 1)) {
15608 dtrace_dof_error(dof, "misaligned section offset");
15609 return (-1);
15610 }
15611
15612 /*
15613 * The section needs to be large enough to contain the DOF provider
15614 * structure appropriate for the given version.
15615 */
15616 if (sec->dofs_size <
15617 ((dof->dofh_ident[DOF_ID_VERSION] == DOF_VERSION_1) ?
15618 offsetof(dof_provider_t, dofpv_prenoffs) :
15619 sizeof (dof_provider_t))) {
15620 dtrace_dof_error(dof, "provider section too small");
15621 return (-1);
15622 }
15623
15624 provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
15625 str_sec = dtrace_dof_sect(dof, DOF_SECT_STRTAB, provider->dofpv_strtab);
15626 prb_sec = dtrace_dof_sect(dof, DOF_SECT_PROBES, provider->dofpv_probes);
15627 arg_sec = dtrace_dof_sect(dof, DOF_SECT_PRARGS, provider->dofpv_prargs);
15628 off_sec = dtrace_dof_sect(dof, DOF_SECT_PROFFS, provider->dofpv_proffs);
15629
15630 if (str_sec == NULL || prb_sec == NULL ||
15631 arg_sec == NULL || off_sec == NULL)
15632 return (-1);
15633
15634 enoff_sec = NULL;
15635
15636 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
15637 provider->dofpv_prenoffs != DOF_SECT_NONE &&
15638 (enoff_sec = dtrace_dof_sect(dof, DOF_SECT_PRENOFFS,
15639 provider->dofpv_prenoffs)) == NULL)
15640 return (-1);
15641
15642 strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
15643
15644 if (provider->dofpv_name >= str_sec->dofs_size ||
15645 strlen(strtab + provider->dofpv_name) >= DTRACE_PROVNAMELEN) {
15646 dtrace_dof_error(dof, "invalid provider name");
15647 return (-1);
15648 }
15649
15650 if (prb_sec->dofs_entsize == 0 ||
15651 prb_sec->dofs_entsize > prb_sec->dofs_size) {
15652 dtrace_dof_error(dof, "invalid entry size");
15653 return (-1);
15654 }
15655
15656 if (prb_sec->dofs_entsize & (sizeof (uintptr_t) - 1)) {
15657 dtrace_dof_error(dof, "misaligned entry size");
15658 return (-1);
15659 }
15660
15661 if (off_sec->dofs_entsize != sizeof (uint32_t)) {
15662 dtrace_dof_error(dof, "invalid entry size");
15663 return (-1);
15664 }
15665
15666 if (off_sec->dofs_offset & (sizeof (uint32_t) - 1)) {
15667 dtrace_dof_error(dof, "misaligned section offset");
15668 return (-1);
15669 }
15670
15671 if (arg_sec->dofs_entsize != sizeof (uint8_t)) {
15672 dtrace_dof_error(dof, "invalid entry size");
15673 return (-1);
15674 }
15675
15676 arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
15677
15678 nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
15679
15680 /*
15681 * Take a pass through the probes to check for errors.
15682 */
15683 for (j = 0; j < nprobes; j++) {
15684 probe = (dof_probe_t *)(uintptr_t)(daddr +
15685 prb_sec->dofs_offset + j * prb_sec->dofs_entsize);
15686
15687 if (probe->dofpr_func >= str_sec->dofs_size) {
15688 dtrace_dof_error(dof, "invalid function name");
15689 return (-1);
15690 }
15691
15692 if (strlen(strtab + probe->dofpr_func) >= DTRACE_FUNCNAMELEN) {
15693 dtrace_dof_error(dof, "function name too long");
15694 return (-1);
15695 }
15696
15697 if (probe->dofpr_name >= str_sec->dofs_size ||
15698 strlen(strtab + probe->dofpr_name) >= DTRACE_NAMELEN) {
15699 dtrace_dof_error(dof, "invalid probe name");
15700 return (-1);
15701 }
15702
15703 /*
15704 * The offset count must not wrap the index, and the offsets
15705 * must also not overflow the section's data.
15706 */
15707 if (probe->dofpr_offidx + probe->dofpr_noffs <
15708 probe->dofpr_offidx ||
15709 (probe->dofpr_offidx + probe->dofpr_noffs) *
15710 off_sec->dofs_entsize > off_sec->dofs_size) {
15711 dtrace_dof_error(dof, "invalid probe offset");
15712 return (-1);
15713 }
15714
15715 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1) {
15716 /*
15717 * If there's no is-enabled offset section, make sure
15718 * there aren't any is-enabled offsets. Otherwise
15719 * perform the same checks as for probe offsets
15720 * (immediately above).
15721 */
15722 if (enoff_sec == NULL) {
15723 if (probe->dofpr_enoffidx != 0 ||
15724 probe->dofpr_nenoffs != 0) {
15725 dtrace_dof_error(dof, "is-enabled "
15726 "offsets with null section");
15727 return (-1);
15728 }
15729 } else if (probe->dofpr_enoffidx +
15730 probe->dofpr_nenoffs < probe->dofpr_enoffidx ||
15731 (probe->dofpr_enoffidx + probe->dofpr_nenoffs) *
15732 enoff_sec->dofs_entsize > enoff_sec->dofs_size) {
15733 dtrace_dof_error(dof, "invalid is-enabled "
15734 "offset");
15735 return (-1);
15736 }
15737
15738 if (probe->dofpr_noffs + probe->dofpr_nenoffs == 0) {
15739 dtrace_dof_error(dof, "zero probe and "
15740 "is-enabled offsets");
15741 return (-1);
15742 }
15743 } else if (probe->dofpr_noffs == 0) {
15744 dtrace_dof_error(dof, "zero probe offsets");
15745 return (-1);
15746 }
15747
15748 if (probe->dofpr_argidx + probe->dofpr_xargc <
15749 probe->dofpr_argidx ||
15750 (probe->dofpr_argidx + probe->dofpr_xargc) *
15751 arg_sec->dofs_entsize > arg_sec->dofs_size) {
15752 dtrace_dof_error(dof, "invalid args");
15753 return (-1);
15754 }
15755
15756 typeidx = probe->dofpr_nargv;
15757 typestr = strtab + probe->dofpr_nargv;
15758 for (k = 0; k < probe->dofpr_nargc; k++) {
15759 if (typeidx >= str_sec->dofs_size) {
15760 dtrace_dof_error(dof, "bad "
15761 "native argument type");
15762 return (-1);
15763 }
15764
15765 typesz = strlen(typestr) + 1;
15766 if (typesz > DTRACE_ARGTYPELEN) {
15767 dtrace_dof_error(dof, "native "
15768 "argument type too long");
15769 return (-1);
15770 }
15771 typeidx += typesz;
15772 typestr += typesz;
15773 }
15774
15775 typeidx = probe->dofpr_xargv;
15776 typestr = strtab + probe->dofpr_xargv;
15777 for (k = 0; k < probe->dofpr_xargc; k++) {
15778 if (arg[probe->dofpr_argidx + k] > probe->dofpr_nargc) {
15779 dtrace_dof_error(dof, "bad "
15780 "native argument index");
15781 return (-1);
15782 }
15783
15784 if (typeidx >= str_sec->dofs_size) {
15785 dtrace_dof_error(dof, "bad "
15786 "translated argument type");
15787 return (-1);
15788 }
15789
15790 typesz = strlen(typestr) + 1;
15791 if (typesz > DTRACE_ARGTYPELEN) {
15792 dtrace_dof_error(dof, "translated argument "
15793 "type too long");
15794 return (-1);
15795 }
15796
15797 typeidx += typesz;
15798 typestr += typesz;
15799 }
15800 }
15801
15802 return (0);
15803}
15804
15805static int
15806dtrace_helper_slurp(dof_hdr_t *dof, dof_helper_t *dhp)
15807{
15808 dtrace_helpers_t *help;
15809 dtrace_vstate_t *vstate;
15810 dtrace_enabling_t *enab = NULL;
15811 int i, gen, rv, nhelpers = 0, nprovs = 0, destroy = 1;
15812 uintptr_t daddr = (uintptr_t)dof;
15813
15814 ASSERT(MUTEX_HELD(&dtrace_lock));
15815
15816 if ((help = curproc->p_dtrace_helpers) == NULL)
15817 help = dtrace_helpers_create(curproc);
15818
15819 vstate = &help->dthps_vstate;
15820
15821 if ((rv = dtrace_dof_slurp(dof, vstate, NULL, &enab,
15822 dhp != NULL ? dhp->dofhp_addr : 0, B_FALSE)) != 0) {
15823 dtrace_dof_destroy(dof);
15824 return (rv);
15825 }
15826
15827 /*
15828 * Look for helper providers and validate their descriptions.
15829 */
15830 if (dhp != NULL) {
15831 for (i = 0; i < dof->dofh_secnum; i++) {
15832 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
15833 dof->dofh_secoff + i * dof->dofh_secsize);
15834
15835 if (sec->dofs_type != DOF_SECT_PROVIDER)
15836 continue;
15837
15838 if (dtrace_helper_provider_validate(dof, sec) != 0) {
15839 dtrace_enabling_destroy(enab);
15840 dtrace_dof_destroy(dof);
15841 return (-1);
15842 }
15843
15844 nprovs++;
15845 }
15846 }
15847
15848 /*
15849 * Now we need to walk through the ECB descriptions in the enabling.
15850 */
15851 for (i = 0; i < enab->dten_ndesc; i++) {
15852 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
15853 dtrace_probedesc_t *desc = &ep->dted_probe;
15854
15855 if (strcmp(desc->dtpd_provider, "dtrace") != 0)
15856 continue;
15857
15858 if (strcmp(desc->dtpd_mod, "helper") != 0)
15859 continue;
15860
15861 if (strcmp(desc->dtpd_func, "ustack") != 0)
15862 continue;
15863
15864 if ((rv = dtrace_helper_action_add(DTRACE_HELPER_ACTION_USTACK,
15865 ep)) != 0) {
15866 /*
15867 * Adding this helper action failed -- we are now going
15868 * to rip out the entire generation and return failure.
15869 */
15870 (void) dtrace_helper_destroygen(help->dthps_generation);
15871 dtrace_enabling_destroy(enab);
15872 dtrace_dof_destroy(dof);
15873 return (-1);
15874 }
15875
15876 nhelpers++;
15877 }
15878
15879 if (nhelpers < enab->dten_ndesc)
15880 dtrace_dof_error(dof, "unmatched helpers");
15881
15882 gen = help->dthps_generation++;
15883 dtrace_enabling_destroy(enab);
15884
15885 if (dhp != NULL && nprovs > 0) {
15886 dhp->dofhp_dof = (uint64_t)(uintptr_t)dof;
15887 if (dtrace_helper_provider_add(dhp, gen) == 0) {
15888 mutex_exit(&dtrace_lock);
15889 dtrace_helper_provider_register(curproc, help, dhp);
15890 mutex_enter(&dtrace_lock);
15891
15892 destroy = 0;
15893 }
15894 }
15895
15896 if (destroy)
15897 dtrace_dof_destroy(dof);
15898
15899 return (gen);
15900}
15901
15902static dtrace_helpers_t *
15903dtrace_helpers_create(proc_t *p)
15904{
15905 dtrace_helpers_t *help;
15906
15907 ASSERT(MUTEX_HELD(&dtrace_lock));
15908 ASSERT(p->p_dtrace_helpers == NULL);
15909
15910 help = kmem_zalloc(sizeof (dtrace_helpers_t), KM_SLEEP);
15911 help->dthps_actions = kmem_zalloc(sizeof (dtrace_helper_action_t *) *
15912 DTRACE_NHELPER_ACTIONS, KM_SLEEP);
15913
15914 p->p_dtrace_helpers = help;
15915 dtrace_helpers++;
15916
15917 return (help);
15918}
15919
15920#if defined(sun)
15921static
15922#endif
15923void
15924dtrace_helpers_destroy(proc_t *p)
15925{
15926 dtrace_helpers_t *help;
15927 dtrace_vstate_t *vstate;
15928#if defined(sun)
15929 proc_t *p = curproc;
15930#endif
15931 int i;
15932
15933 mutex_enter(&dtrace_lock);
15934
15935 ASSERT(p->p_dtrace_helpers != NULL);
15936 ASSERT(dtrace_helpers > 0);
15937
15938 help = p->p_dtrace_helpers;
15939 vstate = &help->dthps_vstate;
15940
15941 /*
15942 * We're now going to lose the help from this process.
15943 */
15944 p->p_dtrace_helpers = NULL;
15945 dtrace_sync();
15946
15947 /*
15948 * Destory the helper actions.
15949 */
15950 for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
15951 dtrace_helper_action_t *h, *next;
15952
15953 for (h = help->dthps_actions[i]; h != NULL; h = next) {
15954 next = h->dtha_next;
15955 dtrace_helper_action_destroy(h, vstate);
15956 h = next;
15957 }
15958 }
15959
15960 mutex_exit(&dtrace_lock);
15961
15962 /*
15963 * Destroy the helper providers.
15964 */
15965 if (help->dthps_maxprovs > 0) {
15966 mutex_enter(&dtrace_meta_lock);
15967 if (dtrace_meta_pid != NULL) {
15968 ASSERT(dtrace_deferred_pid == NULL);
15969
15970 for (i = 0; i < help->dthps_nprovs; i++) {
15971 dtrace_helper_provider_remove(
15972 &help->dthps_provs[i]->dthp_prov, p->p_pid);
15973 }
15974 } else {
15975 mutex_enter(&dtrace_lock);
15976 ASSERT(help->dthps_deferred == 0 ||
15977 help->dthps_next != NULL ||
15978 help->dthps_prev != NULL ||
15979 help == dtrace_deferred_pid);
15980
15981 /*
15982 * Remove the helper from the deferred list.
15983 */
15984 if (help->dthps_next != NULL)
15985 help->dthps_next->dthps_prev = help->dthps_prev;
15986 if (help->dthps_prev != NULL)
15987 help->dthps_prev->dthps_next = help->dthps_next;
15988 if (dtrace_deferred_pid == help) {
15989 dtrace_deferred_pid = help->dthps_next;
15990 ASSERT(help->dthps_prev == NULL);
15991 }
15992
15993 mutex_exit(&dtrace_lock);
15994 }
15995
15996 mutex_exit(&dtrace_meta_lock);
15997
15998 for (i = 0; i < help->dthps_nprovs; i++) {
15999 dtrace_helper_provider_destroy(help->dthps_provs[i]);
16000 }
16001
16002 kmem_free(help->dthps_provs, help->dthps_maxprovs *
16003 sizeof (dtrace_helper_provider_t *));
16004 }
16005
16006 mutex_enter(&dtrace_lock);
16007
16008 dtrace_vstate_fini(&help->dthps_vstate);
16009 kmem_free(help->dthps_actions,
16010 sizeof (dtrace_helper_action_t *) * DTRACE_NHELPER_ACTIONS);
16011 kmem_free(help, sizeof (dtrace_helpers_t));
16012
16013 --dtrace_helpers;
16014 mutex_exit(&dtrace_lock);
16015}
16016
16017#if defined(sun)
16018static
16019#endif
16020void
16021dtrace_helpers_duplicate(proc_t *from, proc_t *to)
16022{
16023 dtrace_helpers_t *help, *newhelp;
16024 dtrace_helper_action_t *helper, *new, *last;
16025 dtrace_difo_t *dp;
16026 dtrace_vstate_t *vstate;
16027 int i, j, sz, hasprovs = 0;
16028
16029 mutex_enter(&dtrace_lock);
16030 ASSERT(from->p_dtrace_helpers != NULL);
16031 ASSERT(dtrace_helpers > 0);
16032
16033 help = from->p_dtrace_helpers;
16034 newhelp = dtrace_helpers_create(to);
16035 ASSERT(to->p_dtrace_helpers != NULL);
16036
16037 newhelp->dthps_generation = help->dthps_generation;
16038 vstate = &newhelp->dthps_vstate;
16039
16040 /*
16041 * Duplicate the helper actions.
16042 */
16043 for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
16044 if ((helper = help->dthps_actions[i]) == NULL)
16045 continue;
16046
16047 for (last = NULL; helper != NULL; helper = helper->dtha_next) {
16048 new = kmem_zalloc(sizeof (dtrace_helper_action_t),
16049 KM_SLEEP);
16050 new->dtha_generation = helper->dtha_generation;
16051
16052 if ((dp = helper->dtha_predicate) != NULL) {
16053 dp = dtrace_difo_duplicate(dp, vstate);
16054 new->dtha_predicate = dp;
16055 }
16056
16057 new->dtha_nactions = helper->dtha_nactions;
16058 sz = sizeof (dtrace_difo_t *) * new->dtha_nactions;
16059 new->dtha_actions = kmem_alloc(sz, KM_SLEEP);
16060
16061 for (j = 0; j < new->dtha_nactions; j++) {
16062 dtrace_difo_t *dp = helper->dtha_actions[j];
16063
16064 ASSERT(dp != NULL);
16065 dp = dtrace_difo_duplicate(dp, vstate);
16066 new->dtha_actions[j] = dp;
16067 }
16068
16069 if (last != NULL) {
16070 last->dtha_next = new;
16071 } else {
16072 newhelp->dthps_actions[i] = new;
16073 }
16074
16075 last = new;
16076 }
16077 }
16078
16079 /*
	 * Duplicate the helper providers (the provider structures themselves
	 * are shared by reference, via dthp_ref) and register them with the
	 * DTrace framework.
16082 */
16083 if (help->dthps_nprovs > 0) {
16084 newhelp->dthps_nprovs = help->dthps_nprovs;
16085 newhelp->dthps_maxprovs = help->dthps_nprovs;
16086 newhelp->dthps_provs = kmem_alloc(newhelp->dthps_nprovs *
16087 sizeof (dtrace_helper_provider_t *), KM_SLEEP);
16088 for (i = 0; i < newhelp->dthps_nprovs; i++) {
16089 newhelp->dthps_provs[i] = help->dthps_provs[i];
16090 newhelp->dthps_provs[i]->dthp_ref++;
16091 }
16092
16093 hasprovs = 1;
16094 }
16095
16096 mutex_exit(&dtrace_lock);
16097
16098 if (hasprovs)
16099 dtrace_helper_provider_register(to, newhelp, NULL);
16100}
16101
16102/*
16103 * DTrace Hook Functions
16104 */
16105static void
16106dtrace_module_loaded(modctl_t *ctl)
16107{
16108 dtrace_provider_t *prv;
16109
16110 mutex_enter(&dtrace_provider_lock);
16111#if defined(sun)
16112 mutex_enter(&mod_lock);
16113#endif
16114
16115#if defined(sun)
16116 ASSERT(ctl->mod_busy);
16117#endif
16118
16119 /*
	 * We're going to call each provider's per-module provide operation,
	 * specifying only this module.
16122 */
16123 for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next)
16124 prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
16125
16126#if defined(sun)
16127 mutex_exit(&mod_lock);
16128#endif
16129 mutex_exit(&dtrace_provider_lock);
16130
16131 /*
16132 * If we have any retained enablings, we need to match against them.
16133 * Enabling probes requires that cpu_lock be held, and we cannot hold
16134 * cpu_lock here -- it is legal for cpu_lock to be held when loading a
16135 * module. (In particular, this happens when loading scheduling
16136 * classes.) So if we have any retained enablings, we need to dispatch
16137 * our task queue to do the match for us.
16138 */
16139 mutex_enter(&dtrace_lock);
16140
16141 if (dtrace_retained == NULL) {
16142 mutex_exit(&dtrace_lock);
16143 return;
16144 }
16145
16146 (void) taskq_dispatch(dtrace_taskq,
16147 (task_func_t *)dtrace_enabling_matchall, NULL, TQ_SLEEP);
16148
16149 mutex_exit(&dtrace_lock);
16150
16151 /*
16152 * And now, for a little heuristic sleaze: in general, we want to
16153 * match modules as soon as they load. However, we cannot guarantee
16154 * this, because it would lead us to the lock ordering violation
16155 * outlined above. The common case, of course, is that cpu_lock is
16156 * _not_ held -- so we delay here for a clock tick, hoping that that's
16157 * long enough for the task queue to do its work. If it's not, it's
16158 * not a serious problem -- it just means that the module that we
16159 * just loaded may not be immediately instrumentable.
16160 */
16161 delay(1);
16162}
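
/*
 * An illustrative sketch (hypothetical names; not part of the framework)
 * of the deferral idiom used above: work that needs a lock we might
 * already hold is packaged into a task, and the task queue runs it later
 * from a context in which that lock can be taken safely.
 *
 *	static void
 *	deferred_match(void *arg __unused)
 *	{
 *		... runs with no locks held; free to take cpu_lock ...
 *	}
 *
 *	(void) taskq_dispatch(tq, deferred_match, NULL, TQ_SLEEP);
 */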
16163
16164static void
16165#if defined(sun)
16166dtrace_module_unloaded(modctl_t *ctl)
16167#else
16168dtrace_module_unloaded(modctl_t *ctl, int *error)
16169#endif
16170{
16171 dtrace_probe_t template, *probe, *first, *next;
16172 dtrace_provider_t *prov;
16173#if !defined(sun)
16174 char modname[DTRACE_MODNAMELEN];
16175 size_t len;
16176#endif
16177
16178#if defined(sun)
16179 template.dtpr_mod = ctl->mod_modname;
16180#else
16181 /* Handle the fact that ctl->filename may end in ".ko". */
16182 strlcpy(modname, ctl->filename, sizeof(modname));
16183 len = strlen(ctl->filename);
16184 if (len > 3 && strcmp(modname + len - 3, ".ko") == 0)
16185 modname[len - 3] = '\0';
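	/* For example, a linker file named "foo.ko" is matched as module "foo". */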
16186 template.dtpr_mod = modname;
16187#endif
16188
16189 mutex_enter(&dtrace_provider_lock);
16190#if defined(sun)
16191 mutex_enter(&mod_lock);
16192#endif
16193 mutex_enter(&dtrace_lock);
16194
16195#if !defined(sun)
16196 if (ctl->nenabled > 0) {
16197 /* Don't allow unloads if a probe is enabled. */
16198 mutex_exit(&dtrace_provider_lock);
16199 mutex_exit(&dtrace_lock);
16200 *error = -1;
16201 printf(
16202 "kldunload: attempt to unload module that has DTrace probes enabled\n");
16203 return;
16204 }
16205#endif
16206
16207 if (dtrace_bymod == NULL) {
16208 /*
16209 * The DTrace module is loaded (obviously) but not attached;
16210 * we don't have any work to do.
16211 */
16212 mutex_exit(&dtrace_provider_lock);
16213#if defined(sun)
16214 mutex_exit(&mod_lock);
16215#endif
16216 mutex_exit(&dtrace_lock);
16217 return;
16218 }
16219
16220 for (probe = first = dtrace_hash_lookup(dtrace_bymod, &template);
16221 probe != NULL; probe = probe->dtpr_nextmod) {
16222 if (probe->dtpr_ecb != NULL) {
16223 mutex_exit(&dtrace_provider_lock);
16224#if defined(sun)
16225 mutex_exit(&mod_lock);
16226#endif
16227 mutex_exit(&dtrace_lock);
16228
16229 /*
16230 * This shouldn't _actually_ be possible -- we're
16231 * unloading a module that has an enabled probe in it.
16232 * (It's normally up to the provider to make sure that
16233 * this can't happen.) However, because dtps_enable()
16234 * doesn't have a failure mode, there can be an
16235 * enable/unload race. Upshot: we don't want to
16236 * assert, but we're not going to disable the
16237 * probe, either.
16238 */
16239 if (dtrace_err_verbose) {
16240#if defined(sun)
16241 cmn_err(CE_WARN, "unloaded module '%s' had "
16242 "enabled probes", ctl->mod_modname);
16243#else
16244 cmn_err(CE_WARN, "unloaded module '%s' had "
16245 "enabled probes", modname);
16246#endif
16247 }
16248
16249 return;
16250 }
16251 }
16252
16253 probe = first;
16254
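	/*
	 * Unhook each of the module's probes, threading them onto "first"
	 * (in reverse order) so that they can be destroyed once the
	 * dtrace_sync() below has assured that nothing in probe context
	 * still refers to them.
	 */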
16255 for (first = NULL; probe != NULL; probe = next) {
16256 ASSERT(dtrace_probes[probe->dtpr_id - 1] == probe);
16257
16258 dtrace_probes[probe->dtpr_id - 1] = NULL;
16259
16260 next = probe->dtpr_nextmod;
16261 dtrace_hash_remove(dtrace_bymod, probe);
16262 dtrace_hash_remove(dtrace_byfunc, probe);
16263 dtrace_hash_remove(dtrace_byname, probe);
16264
16265 if (first == NULL) {
16266 first = probe;
16267 probe->dtpr_nextmod = NULL;
16268 } else {
16269 probe->dtpr_nextmod = first;
16270 first = probe;
16271 }
16272 }
16273
16274 /*
16275 * We've removed all of the module's probes from the hash chains and
16276 * from the probe array. Now issue a dtrace_sync() to be sure that
16277 * everyone has cleared out from any probe array processing.
16278 */
16279 dtrace_sync();
16280
16281 for (probe = first; probe != NULL; probe = first) {
16282 first = probe->dtpr_nextmod;
16283 prov = probe->dtpr_provider;
16284 prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, probe->dtpr_id,
16285 probe->dtpr_arg);
16286 kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
16287 kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
16288 kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
16289#if defined(sun)
16290 vmem_free(dtrace_arena, (void *)(uintptr_t)probe->dtpr_id, 1);
16291#else
16292 free_unr(dtrace_arena, probe->dtpr_id);
16293#endif
16294 kmem_free(probe, sizeof (dtrace_probe_t));
16295 }
16296
16297 mutex_exit(&dtrace_lock);
16298#if defined(sun)
16299 mutex_exit(&mod_lock);
16300#endif
16301 mutex_exit(&dtrace_provider_lock);
16302}
16303
16304#if !defined(sun)
16305static void
16306dtrace_kld_load(void *arg __unused, linker_file_t lf)
16307{
16308
16309 dtrace_module_loaded(lf);
16310}
16311
16312static void
16313dtrace_kld_unload_try(void *arg __unused, linker_file_t lf, int *error)
16314{
16315
16316 if (*error != 0)
16317 /* We already have an error, so don't do anything. */
16318 return;
16319 dtrace_module_unloaded(lf, error);
16320}
16321#endif
16322
16323#if defined(sun)
16324static void
16325dtrace_suspend(void)
16326{
16327 dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_suspend));
16328}
16329
16330static void
16331dtrace_resume(void)
16332{
16333 dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_resume));
16334}
16335#endif
16336
16337static int
16338dtrace_cpu_setup(cpu_setup_t what, processorid_t cpu)
16339{
16340 ASSERT(MUTEX_HELD(&cpu_lock));
16341 mutex_enter(&dtrace_lock);
16342
16343 switch (what) {
16344 case CPU_CONFIG: {
16345 dtrace_state_t *state;
16346 dtrace_optval_t *opt, rs, c;
16347
16348 /*
16349 * For now, we only allocate a new buffer for anonymous state.
16350 */
16351 if ((state = dtrace_anon.dta_state) == NULL)
16352 break;
16353
16354 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
16355 break;
16356
16357 opt = state->dts_options;
16358 c = opt[DTRACEOPT_CPU];
16359
16360 if (c != DTRACE_CPUALL && c != DTRACEOPT_UNSET && c != cpu)
16361 break;
16362
16363 /*
16364 * Regardless of what the actual policy is, we're going to
16365 * temporarily set our resize policy to be manual. We're
16366 * also going to temporarily set our CPU option to denote
16367 * the newly configured CPU.
16368 */
16369 rs = opt[DTRACEOPT_BUFRESIZE];
16370 opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_MANUAL;
16371 opt[DTRACEOPT_CPU] = (dtrace_optval_t)cpu;
16372
16373 (void) dtrace_state_buffers(state);
16374
16375 opt[DTRACEOPT_BUFRESIZE] = rs;
16376 opt[DTRACEOPT_CPU] = c;
16377
16378 break;
16379 }
16380
16381 case CPU_UNCONFIG:
16382 /*
16383 * We don't free the buffer in the CPU_UNCONFIG case. (The
16384 * buffer will be freed when the consumer exits.)
16385 */
16386 break;
16387
16388 default:
16389 break;
16390 }
16391
16392 mutex_exit(&dtrace_lock);
16393 return (0);
16394}
16395
16396#if defined(sun)
16397static void
16398dtrace_cpu_setup_initial(processorid_t cpu)
16399{
16400 (void) dtrace_cpu_setup(CPU_CONFIG, cpu);
16401}
16402#endif
16403
16404static void
16405dtrace_toxrange_add(uintptr_t base, uintptr_t limit)
16406{
16407 if (dtrace_toxranges >= dtrace_toxranges_max) {
16408 int osize, nsize;
16409 dtrace_toxrange_t *range;
16410
16411 osize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
16412
16413 if (osize == 0) {
16414 ASSERT(dtrace_toxrange == NULL);
16415 ASSERT(dtrace_toxranges_max == 0);
16416 dtrace_toxranges_max = 1;
16417 } else {
16418 dtrace_toxranges_max <<= 1;
16419 }
16420
16421 nsize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
16422 range = kmem_zalloc(nsize, KM_SLEEP);
16423
16424 if (dtrace_toxrange != NULL) {
16425 ASSERT(osize != 0);
16426 bcopy(dtrace_toxrange, range, osize);
16427 kmem_free(dtrace_toxrange, osize);
16428 }
16429
16430 dtrace_toxrange = range;
16431 }
16432
16433 ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_base == 0);
16434 ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_limit == 0);
16435
16436 dtrace_toxrange[dtrace_toxranges].dtt_base = base;
16437 dtrace_toxrange[dtrace_toxranges].dtt_limit = limit;
16438 dtrace_toxranges++;
16439}
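
/*
 * Because the table doubles, insertion stays amortized-constant: to reach
 * a capacity of N ranges, the copies above total 1 + 2 + ... + N/2, that
 * is, N - 1 entry copies overall.
 */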
16440
16441static void
dtrace_getf_barrier(void)
16443{
16444#if defined(sun)
16445 /*
16446 * When we have unprivileged (that is, non-DTRACE_CRV_KERNEL) enablings
16447 * that contain calls to getf(), this routine will be called on every
16448 * closef() before either the underlying vnode is released or the
16449 * file_t itself is freed. By the time we are here, it is essential
16450 * that the file_t can no longer be accessed from a call to getf()
16451 * in probe context -- that assures that a dtrace_sync() can be used
16452 * to clear out any enablings referring to the old structures.
16453 */
16454 if (curthread->t_procp->p_zone->zone_dtrace_getf != 0 ||
16455 kcred->cr_zone->zone_dtrace_getf != 0)
16456 dtrace_sync();
16457#endif
16458}
16459
15522/*
15523 * DTrace Driver Cookbook Functions
15524 */
15525#if defined(sun)
15526/*ARGSUSED*/
15527static int
15528dtrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
15529{
15530 dtrace_provider_id_t id;
15531 dtrace_state_t *state = NULL;
15532 dtrace_enabling_t *enab;
15533
15534 mutex_enter(&cpu_lock);
15535 mutex_enter(&dtrace_provider_lock);
15536 mutex_enter(&dtrace_lock);
15537
15538 if (ddi_soft_state_init(&dtrace_softstate,
15539 sizeof (dtrace_state_t), 0) != 0) {
15540 cmn_err(CE_NOTE, "/dev/dtrace failed to initialize soft state");
15541 mutex_exit(&cpu_lock);
15542 mutex_exit(&dtrace_provider_lock);
15543 mutex_exit(&dtrace_lock);
15544 return (DDI_FAILURE);
15545 }
15546
15547 if (ddi_create_minor_node(devi, DTRACEMNR_DTRACE, S_IFCHR,
15548 DTRACEMNRN_DTRACE, DDI_PSEUDO, NULL) == DDI_FAILURE ||
15549 ddi_create_minor_node(devi, DTRACEMNR_HELPER, S_IFCHR,
15550 DTRACEMNRN_HELPER, DDI_PSEUDO, NULL) == DDI_FAILURE) {
15551 cmn_err(CE_NOTE, "/dev/dtrace couldn't create minor nodes");
15552 ddi_remove_minor_node(devi, NULL);
15553 ddi_soft_state_fini(&dtrace_softstate);
15554 mutex_exit(&cpu_lock);
15555 mutex_exit(&dtrace_provider_lock);
15556 mutex_exit(&dtrace_lock);
15557 return (DDI_FAILURE);
15558 }
15559
15560 ddi_report_dev(devi);
15561 dtrace_devi = devi;
15562
15563 dtrace_modload = dtrace_module_loaded;
15564 dtrace_modunload = dtrace_module_unloaded;
15565 dtrace_cpu_init = dtrace_cpu_setup_initial;
15566 dtrace_helpers_cleanup = dtrace_helpers_destroy;
15567 dtrace_helpers_fork = dtrace_helpers_duplicate;
15568 dtrace_cpustart_init = dtrace_suspend;
15569 dtrace_cpustart_fini = dtrace_resume;
15570 dtrace_debugger_init = dtrace_suspend;
15571 dtrace_debugger_fini = dtrace_resume;
15572
15573 register_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
15574
15575 ASSERT(MUTEX_HELD(&cpu_lock));
15576
15577 dtrace_arena = vmem_create("dtrace", (void *)1, UINT32_MAX, 1,
15578 NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
15579 dtrace_minor = vmem_create("dtrace_minor", (void *)DTRACEMNRN_CLONE,
15580 UINT32_MAX - DTRACEMNRN_CLONE, 1, NULL, NULL, NULL, 0,
15581 VM_SLEEP | VMC_IDENTIFIER);
15582 dtrace_taskq = taskq_create("dtrace_taskq", 1, maxclsyspri,
15583 1, INT_MAX, 0);
15584
15585 dtrace_state_cache = kmem_cache_create("dtrace_state_cache",
15586 sizeof (dtrace_dstate_percpu_t) * NCPU, DTRACE_STATE_ALIGN,
15587 NULL, NULL, NULL, NULL, NULL, 0);
15588
15589 ASSERT(MUTEX_HELD(&cpu_lock));
15590 dtrace_bymod = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_mod),
15591 offsetof(dtrace_probe_t, dtpr_nextmod),
15592 offsetof(dtrace_probe_t, dtpr_prevmod));
15593
15594 dtrace_byfunc = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_func),
15595 offsetof(dtrace_probe_t, dtpr_nextfunc),
15596 offsetof(dtrace_probe_t, dtpr_prevfunc));
15597
15598 dtrace_byname = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_name),
15599 offsetof(dtrace_probe_t, dtpr_nextname),
15600 offsetof(dtrace_probe_t, dtpr_prevname));
15601
15602 if (dtrace_retain_max < 1) {
15603 cmn_err(CE_WARN, "illegal value (%lu) for dtrace_retain_max; "
15604 "setting to 1", dtrace_retain_max);
15605 dtrace_retain_max = 1;
15606 }
15607
15608 /*
15609 * Now discover our toxic ranges.
15610 */
15611 dtrace_toxic_ranges(dtrace_toxrange_add);
15612
15613 /*
15614 * Before we register ourselves as a provider to our own framework,
15615 * we would like to assert that dtrace_provider is NULL -- but that's
15616 * not true if we were loaded as a dependency of a DTrace provider.
15617 * Once we've registered, we can assert that dtrace_provider is our
15618 * pseudo provider.
15619 */
15620 (void) dtrace_register("dtrace", &dtrace_provider_attr,
15621 DTRACE_PRIV_NONE, 0, &dtrace_provider_ops, NULL, &id);
15622
15623 ASSERT(dtrace_provider != NULL);
15624 ASSERT((dtrace_provider_id_t)dtrace_provider == id);
15625
15626 dtrace_probeid_begin = dtrace_probe_create((dtrace_provider_id_t)
15627 dtrace_provider, NULL, NULL, "BEGIN", 0, NULL);
15628 dtrace_probeid_end = dtrace_probe_create((dtrace_provider_id_t)
15629 dtrace_provider, NULL, NULL, "END", 0, NULL);
15630 dtrace_probeid_error = dtrace_probe_create((dtrace_provider_id_t)
15631 dtrace_provider, NULL, NULL, "ERROR", 1, NULL);
15632
15633 dtrace_anon_property();
15634 mutex_exit(&cpu_lock);
15635
15636 /*
15637 * If DTrace helper tracing is enabled, we need to allocate the
15638 * trace buffer and initialize the values.
15639 */
15640 if (dtrace_helptrace_enabled) {
15641 ASSERT(dtrace_helptrace_buffer == NULL);
15642 dtrace_helptrace_buffer =
15643 kmem_zalloc(dtrace_helptrace_bufsize, KM_SLEEP);
15644 dtrace_helptrace_next = 0;
15645 }
15646
15647 /*
15648 * If there are already providers, we must ask them to provide their
15649 * probes, and then match any anonymous enabling against them. Note
15650 * that there should be no other retained enablings at this time:
15651 * the only retained enablings at this time should be the anonymous
15652 * enabling.
15653 */
15654 if (dtrace_anon.dta_enabling != NULL) {
15655 ASSERT(dtrace_retained == dtrace_anon.dta_enabling);
15656
15657 dtrace_enabling_provide(NULL);
15658 state = dtrace_anon.dta_state;
15659
15660 /*
15661 * We couldn't hold cpu_lock across the above call to
15662 * dtrace_enabling_provide(), but we must hold it to actually
15663 * enable the probes. We have to drop all of our locks, pick
15664 * up cpu_lock, and regain our locks before matching the
15665 * retained anonymous enabling.
15666 */
15667 mutex_exit(&dtrace_lock);
15668 mutex_exit(&dtrace_provider_lock);
15669
15670 mutex_enter(&cpu_lock);
15671 mutex_enter(&dtrace_provider_lock);
15672 mutex_enter(&dtrace_lock);
15673
15674 if ((enab = dtrace_anon.dta_enabling) != NULL)
15675 (void) dtrace_enabling_match(enab, NULL);
15676
15677 mutex_exit(&cpu_lock);
15678 }
15679
15680 mutex_exit(&dtrace_lock);
15681 mutex_exit(&dtrace_provider_lock);
15682
15683 if (state != NULL) {
15684 /*
15685 * If we created any anonymous state, set it going now.
15686 */
15687 (void) dtrace_state_go(state, &dtrace_anon.dta_beganon);
15688 }
15689
15690 return (DDI_SUCCESS);
15691}
15692#endif
15693
15694#if !defined(sun)
15695#if __FreeBSD_version >= 800039
15696static void dtrace_dtr(void *);
15697#endif
15698#endif
15699
15700/*ARGSUSED*/
15701static int
15702#if defined(sun)
15703dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
15704#else
15705dtrace_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
15706#endif
15707{
15708 dtrace_state_t *state;
15709 uint32_t priv;
15710 uid_t uid;
15711 zoneid_t zoneid;
15712
15713#if defined(sun)
15714 if (getminor(*devp) == DTRACEMNRN_HELPER)
15715 return (0);
15716
15717 /*
15718 * If this wasn't an open with the "helper" minor, then it must be
15719 * the "dtrace" minor.
15720 */
	if (getminor(*devp) != DTRACEMNRN_DTRACE)
		return (ENXIO);
15723#else
15724 cred_t *cred_p = NULL;
15725
15726#if __FreeBSD_version < 800039
15727 /*
	 * The first minor device is the one that is cloned, so there is
	 * nothing more to do here.
	 */
	if (dev2unit(dev) == 0)
		return (0);
15733
15734 /*
15735 * Devices are cloned, so if the DTrace state has already
15736 * been allocated, that means this device belongs to a
15737 * different client. Each client should open '/dev/dtrace'
15738 * to get a cloned device.
15739 */
15740 if (dev->si_drv1 != NULL)
15741 return (EBUSY);
15742#endif
15743
15744 cred_p = dev->si_cred;
15745#endif
15746
15747 /*
15748 * If no DTRACE_PRIV_* bits are set in the credential, then the
15749 * caller lacks sufficient permission to do anything with DTrace.
15750 */
15751 dtrace_cred2priv(cred_p, &priv, &uid, &zoneid);
15752 if (priv == DTRACE_PRIV_NONE) {
15753#if !defined(sun)
15754#if __FreeBSD_version < 800039
15755 /* Destroy the cloned device. */
15756 destroy_dev(dev);
15757#endif
15758#endif
15759
15760 return (EACCES);
15761 }
15762
15763 /*
15764 * Ask all providers to provide all their probes.
15765 */
15766 mutex_enter(&dtrace_provider_lock);
15767 dtrace_probe_provide(NULL, NULL);
15768 mutex_exit(&dtrace_provider_lock);
15769
15770 mutex_enter(&cpu_lock);
15771 mutex_enter(&dtrace_lock);
15772 dtrace_opens++;
15773 dtrace_membar_producer();
15774
15775#if defined(sun)
15776 /*
15777 * If the kernel debugger is active (that is, if the kernel debugger
15778 * modified text in some way), we won't allow the open.
15779 */
15780 if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
15781 dtrace_opens--;
15782 mutex_exit(&cpu_lock);
15783 mutex_exit(&dtrace_lock);
15784 return (EBUSY);
15785 }
15786
15787 state = dtrace_state_create(devp, cred_p);
15788#else
15789 state = dtrace_state_create(dev);
15790#if __FreeBSD_version < 800039
15791 dev->si_drv1 = state;
15792#else
15793 devfs_set_cdevpriv(state, dtrace_dtr);
15794#endif
15795#endif
15796
15797 mutex_exit(&cpu_lock);
15798
15799 if (state == NULL) {
15800#if defined(sun)
15801 if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL)
15802 (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
15803#else
15804 --dtrace_opens;
15805#endif
15806 mutex_exit(&dtrace_lock);
15807#if !defined(sun)
15808#if __FreeBSD_version < 800039
15809 /* Destroy the cloned device. */
15810 destroy_dev(dev);
15811#endif
15812#endif
15813 return (EAGAIN);
15814 }
15815
15816 mutex_exit(&dtrace_lock);
15817
15818 return (0);
15819}
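
/*
 * An illustrative sketch (not part of the driver): consumers normally go
 * through libdtrace, but at this level a session is just an open of the
 * DTrace control node followed by ioctl()s against the descriptor.  The
 * device path below is the conventional FreeBSD one and is an assumption
 * of this sketch:
 *
 *	#include <err.h>
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int fd = open("/dev/dtrace/dtrace", O_RDONLY);
 *
 *	if (fd == -1)
 *		err(1, "open");
 *
 * An EACCES failure corresponds to the DTRACE_PRIV_NONE check above; on
 * __FreeBSD_version >= 800039 kernels the eventual close(fd) arrives
 * here as dtrace_dtr().
 */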
15820
15821/*ARGSUSED*/
15822#if defined(sun)
15823static int
15824dtrace_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
15825#elif __FreeBSD_version < 800039
15826static int
15827dtrace_close(struct cdev *dev, int flags, int fmt __unused, struct thread *td)
15828#else
15829static void
15830dtrace_dtr(void *data)
15831#endif
15832{
15833#if defined(sun)
15834 minor_t minor = getminor(dev);
15835 dtrace_state_t *state;
15836
15837 if (minor == DTRACEMNRN_HELPER)
15838 return (0);
15839
15840 state = ddi_get_soft_state(dtrace_softstate, minor);
15841#else
15842#if __FreeBSD_version < 800039
15843 dtrace_state_t *state = dev->si_drv1;
15844
	/* The uncloned base device has no state to tear down. */
15846 if (dev2unit(dev) == 0)
15847 return (0);
15848#else
15849 dtrace_state_t *state = data;
15850#endif
15851
15852#endif
15853
15854 mutex_enter(&cpu_lock);
15855 mutex_enter(&dtrace_lock);
15856
15857 if (state != NULL) {
15858 if (state->dts_anon) {
15859 /*
15860 * There is anonymous state. Destroy that first.
15861 */
15862 ASSERT(dtrace_anon.dta_state == NULL);
15863 dtrace_state_destroy(state->dts_anon);
15864 }
15865
15866 dtrace_state_destroy(state);
15867
15868#if !defined(sun)
15869 kmem_free(state, 0);
15870#if __FreeBSD_version < 800039
15871 dev->si_drv1 = NULL;
15872#endif
15873#endif
15874 }
15875
15876 ASSERT(dtrace_opens > 0);
15877#if defined(sun)
15878 /*
15879 * Only relinquish control of the kernel debugger interface when there
15880 * are no consumers and no anonymous enablings.
15881 */
15882 if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL)
15883 (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
15884#else
15885 --dtrace_opens;
15886#endif
15887
15888 mutex_exit(&dtrace_lock);
15889 mutex_exit(&cpu_lock);
15890
15891#if __FreeBSD_version < 800039
15892 /* Schedule this cloned device to be destroyed. */
15893 destroy_dev_sched(dev);
15894#endif
15895
15896#if defined(sun) || __FreeBSD_version < 800039
15897 return (0);
15898#endif
15899}
15900
15901#if defined(sun)
15902/*ARGSUSED*/
15903static int
15904dtrace_ioctl_helper(int cmd, intptr_t arg, int *rv)
15905{
15906 int rval;
15907 dof_helper_t help, *dhp = NULL;
15908
15909 switch (cmd) {
15910 case DTRACEHIOC_ADDDOF:
15911 if (copyin((void *)arg, &help, sizeof (help)) != 0) {
15912 dtrace_dof_error(NULL, "failed to copyin DOF helper");
15913 return (EFAULT);
15914 }
15915
15916 dhp = &help;
15917 arg = (intptr_t)help.dofhp_dof;
15918 /*FALLTHROUGH*/
15919
15920 case DTRACEHIOC_ADD: {
15921 dof_hdr_t *dof = dtrace_dof_copyin(arg, &rval);
15922
15923 if (dof == NULL)
15924 return (rval);
15925
15926 mutex_enter(&dtrace_lock);
15927
15928 /*
15929 * dtrace_helper_slurp() takes responsibility for the dof --
15930 * it may free it now or it may save it and free it later.
15931 */
15932 if ((rval = dtrace_helper_slurp(dof, dhp)) != -1) {
15933 *rv = rval;
15934 rval = 0;
15935 } else {
15936 rval = EINVAL;
15937 }
15938
15939 mutex_exit(&dtrace_lock);
15940 return (rval);
15941 }
15942
15943 case DTRACEHIOC_REMOVE: {
15944 mutex_enter(&dtrace_lock);
15945 rval = dtrace_helper_destroygen(arg);
15946 mutex_exit(&dtrace_lock);
15947
15948 return (rval);
15949 }
15950
15951 default:
15952 break;
15953 }
15954
15955 return (ENOTTY);
15956}
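
/*
 * An illustrative sketch: helper DOF normally arrives via drti.o, which
 * runs at process startup and, in essence, performs an ioctl on the
 * helper minor (headers, error handling, and the construction of the DOF
 * are elided; "fd", "dof" and the module name are assumptions of this
 * sketch):
 *
 *	dof_helper_t dh;
 *	int gen;
 *
 *	dh.dofhp_addr = (uintptr_t)dof;
 *	dh.dofhp_dof = (uintptr_t)dof;
 *	(void) strlcpy(dh.dofhp_mod, "mymod", sizeof (dh.dofhp_mod));
 *	gen = ioctl(fd, DTRACEHIOC_ADDDOF, &dh);
 *
 * The returned generation is what a later DTRACEHIOC_REMOVE takes to
 * unload those helpers again.
 */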
15957
15958/*ARGSUSED*/
15959static int
15960dtrace_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv)
15961{
15962 minor_t minor = getminor(dev);
15963 dtrace_state_t *state;
15964 int rval;
15965
15966 if (minor == DTRACEMNRN_HELPER)
15967 return (dtrace_ioctl_helper(cmd, arg, rv));
15968
15969 state = ddi_get_soft_state(dtrace_softstate, minor);
15970
15971 if (state->dts_anon) {
15972 ASSERT(dtrace_anon.dta_state == NULL);
15973 state = state->dts_anon;
15974 }
15975
15976 switch (cmd) {
15977 case DTRACEIOC_PROVIDER: {
15978 dtrace_providerdesc_t pvd;
15979 dtrace_provider_t *pvp;
15980
15981 if (copyin((void *)arg, &pvd, sizeof (pvd)) != 0)
15982 return (EFAULT);
15983
15984 pvd.dtvd_name[DTRACE_PROVNAMELEN - 1] = '\0';
15985 mutex_enter(&dtrace_provider_lock);
15986
15987 for (pvp = dtrace_provider; pvp != NULL; pvp = pvp->dtpv_next) {
15988 if (strcmp(pvp->dtpv_name, pvd.dtvd_name) == 0)
15989 break;
15990 }
15991
15992 mutex_exit(&dtrace_provider_lock);
15993
15994 if (pvp == NULL)
15995 return (ESRCH);
15996
15997 bcopy(&pvp->dtpv_priv, &pvd.dtvd_priv, sizeof (dtrace_ppriv_t));
15998 bcopy(&pvp->dtpv_attr, &pvd.dtvd_attr, sizeof (dtrace_pattr_t));
15999
16000 if (copyout(&pvd, (void *)arg, sizeof (pvd)) != 0)
16001 return (EFAULT);
16002
16003 return (0);
16004 }
16005
16006 case DTRACEIOC_EPROBE: {
16007 dtrace_eprobedesc_t epdesc;
16008 dtrace_ecb_t *ecb;
16009 dtrace_action_t *act;
16010 void *buf;
16011 size_t size;
16012 uintptr_t dest;
16013 int nrecs;
16014
16015 if (copyin((void *)arg, &epdesc, sizeof (epdesc)) != 0)
16016 return (EFAULT);
16017
16018 mutex_enter(&dtrace_lock);
16019
16020 if ((ecb = dtrace_epid2ecb(state, epdesc.dtepd_epid)) == NULL) {
16021 mutex_exit(&dtrace_lock);
16022 return (EINVAL);
16023 }
16024
16025 if (ecb->dte_probe == NULL) {
16026 mutex_exit(&dtrace_lock);
16027 return (EINVAL);
16028 }
16029
16030 epdesc.dtepd_probeid = ecb->dte_probe->dtpr_id;
16031 epdesc.dtepd_uarg = ecb->dte_uarg;
16032 epdesc.dtepd_size = ecb->dte_size;
16033
16034 nrecs = epdesc.dtepd_nrecs;
16035 epdesc.dtepd_nrecs = 0;
16036 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
16037 if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
16038 continue;
16039
16040 epdesc.dtepd_nrecs++;
16041 }
16042
16043 /*
16044 * Now that we have the size, we need to allocate a temporary
16045 * buffer in which to store the complete description. We need
		 * the temporary buffer to be able to drop dtrace_lock
16047 * across the copyout(), below.
16048 */
16049 size = sizeof (dtrace_eprobedesc_t) +
16050 (epdesc.dtepd_nrecs * sizeof (dtrace_recdesc_t));
16051
16052 buf = kmem_alloc(size, KM_SLEEP);
16053 dest = (uintptr_t)buf;
16054
16055 bcopy(&epdesc, (void *)dest, sizeof (epdesc));
16056 dest += offsetof(dtrace_eprobedesc_t, dtepd_rec[0]);
16057
16058 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
16059 if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
16060 continue;
16061
16062 if (nrecs-- == 0)
16063 break;
16064
16065 bcopy(&act->dta_rec, (void *)dest,
16066 sizeof (dtrace_recdesc_t));
16067 dest += sizeof (dtrace_recdesc_t);
16068 }
16069
16070 mutex_exit(&dtrace_lock);
16071
16072 if (copyout(buf, (void *)arg, dest - (uintptr_t)buf) != 0) {
16073 kmem_free(buf, size);
16074 return (EFAULT);
16075 }
16076
16077 kmem_free(buf, size);
16078 return (0);
16079 }
16080
16081 case DTRACEIOC_AGGDESC: {
16082 dtrace_aggdesc_t aggdesc;
16083 dtrace_action_t *act;
16084 dtrace_aggregation_t *agg;
16085 int nrecs;
16086 uint32_t offs;
16087 dtrace_recdesc_t *lrec;
16088 void *buf;
16089 size_t size;
16090 uintptr_t dest;
16091
16092 if (copyin((void *)arg, &aggdesc, sizeof (aggdesc)) != 0)
16093 return (EFAULT);
16094
16095 mutex_enter(&dtrace_lock);
16096
16097 if ((agg = dtrace_aggid2agg(state, aggdesc.dtagd_id)) == NULL) {
16098 mutex_exit(&dtrace_lock);
16099 return (EINVAL);
16100 }
16101
16102 aggdesc.dtagd_epid = agg->dtag_ecb->dte_epid;
16103
16104 nrecs = aggdesc.dtagd_nrecs;
16105 aggdesc.dtagd_nrecs = 0;
16106
16107 offs = agg->dtag_base;
16108 lrec = &agg->dtag_action.dta_rec;
16109 aggdesc.dtagd_size = lrec->dtrd_offset + lrec->dtrd_size - offs;
16110
16111 for (act = agg->dtag_first; ; act = act->dta_next) {
16112 ASSERT(act->dta_intuple ||
16113 DTRACEACT_ISAGG(act->dta_kind));
16114
16115 /*
16116 * If this action has a record size of zero, it
16117 * denotes an argument to the aggregating action.
16118 * Because the presence of this record doesn't (or
16119 * shouldn't) affect the way the data is interpreted,
16120 * we don't copy it out to save user-level the
16121 * confusion of dealing with a zero-length record.
16122 */
16123 if (act->dta_rec.dtrd_size == 0) {
16124 ASSERT(agg->dtag_hasarg);
16125 continue;
16126 }
16127
16128 aggdesc.dtagd_nrecs++;
16129
16130 if (act == &agg->dtag_action)
16131 break;
16132 }
16133
16134 /*
16135 * Now that we have the size, we need to allocate a temporary
16136 * buffer in which to store the complete description. We need
		 * the temporary buffer to be able to drop dtrace_lock
16138 * across the copyout(), below.
16139 */
16140 size = sizeof (dtrace_aggdesc_t) +
16141 (aggdesc.dtagd_nrecs * sizeof (dtrace_recdesc_t));
16142
16143 buf = kmem_alloc(size, KM_SLEEP);
16144 dest = (uintptr_t)buf;
16145
16146 bcopy(&aggdesc, (void *)dest, sizeof (aggdesc));
16147 dest += offsetof(dtrace_aggdesc_t, dtagd_rec[0]);
16148
16149 for (act = agg->dtag_first; ; act = act->dta_next) {
16150 dtrace_recdesc_t rec = act->dta_rec;
16151
16152 /*
16153 * See the comment in the above loop for why we pass
16154 * over zero-length records.
16155 */
16156 if (rec.dtrd_size == 0) {
16157 ASSERT(agg->dtag_hasarg);
16158 continue;
16159 }
16160
16161 if (nrecs-- == 0)
16162 break;
16163
16164 rec.dtrd_offset -= offs;
16165 bcopy(&rec, (void *)dest, sizeof (rec));
16166 dest += sizeof (dtrace_recdesc_t);
16167
16168 if (act == &agg->dtag_action)
16169 break;
16170 }
16171
16172 mutex_exit(&dtrace_lock);
16173
16174 if (copyout(buf, (void *)arg, dest - (uintptr_t)buf) != 0) {
16175 kmem_free(buf, size);
16176 return (EFAULT);
16177 }
16178
16179 kmem_free(buf, size);
16180 return (0);
16181 }
16182
16183 case DTRACEIOC_ENABLE: {
16184 dof_hdr_t *dof;
16185 dtrace_enabling_t *enab = NULL;
16186 dtrace_vstate_t *vstate;
16187 int err = 0;
16188
16189 *rv = 0;
16190
16191 /*
16192 * If a NULL argument has been passed, we take this as our
16193 * cue to reevaluate our enablings.
16194 */
16195 if (arg == NULL) {
16196 dtrace_enabling_matchall();
16197
16198 return (0);
16199 }
16200
16201 if ((dof = dtrace_dof_copyin(arg, &rval)) == NULL)
16202 return (rval);
16203
16204 mutex_enter(&cpu_lock);
16205 mutex_enter(&dtrace_lock);
16206 vstate = &state->dts_vstate;
16207
16208 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
16209 mutex_exit(&dtrace_lock);
16210 mutex_exit(&cpu_lock);
16211 dtrace_dof_destroy(dof);
16212 return (EBUSY);
16213 }
16214
16215 if (dtrace_dof_slurp(dof, vstate, cr, &enab, 0, B_TRUE) != 0) {
16216 mutex_exit(&dtrace_lock);
16217 mutex_exit(&cpu_lock);
16218 dtrace_dof_destroy(dof);
16219 return (EINVAL);
16220 }
16221
16222 if ((rval = dtrace_dof_options(dof, state)) != 0) {
16223 dtrace_enabling_destroy(enab);
16224 mutex_exit(&dtrace_lock);
16225 mutex_exit(&cpu_lock);
16226 dtrace_dof_destroy(dof);
16227 return (rval);
16228 }
16229
16230 if ((err = dtrace_enabling_match(enab, rv)) == 0) {
16231 err = dtrace_enabling_retain(enab);
16232 } else {
16233 dtrace_enabling_destroy(enab);
16234 }
16235
16236 mutex_exit(&cpu_lock);
16237 mutex_exit(&dtrace_lock);
16238 dtrace_dof_destroy(dof);
16239
16240 return (err);
16241 }
16242
16243 case DTRACEIOC_REPLICATE: {
16244 dtrace_repldesc_t desc;
16245 dtrace_probedesc_t *match = &desc.dtrpd_match;
16246 dtrace_probedesc_t *create = &desc.dtrpd_create;
16247 int err;
16248
16249 if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
16250 return (EFAULT);
16251
16252 match->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
16253 match->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
16254 match->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
16255 match->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
16256
16257 create->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
16258 create->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
16259 create->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
16260 create->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
16261
16262 mutex_enter(&dtrace_lock);
16263 err = dtrace_enabling_replicate(state, match, create);
16264 mutex_exit(&dtrace_lock);
16265
16266 return (err);
16267 }
16268
16269 case DTRACEIOC_PROBEMATCH:
16270 case DTRACEIOC_PROBES: {
16271 dtrace_probe_t *probe = NULL;
16272 dtrace_probedesc_t desc;
16273 dtrace_probekey_t pkey;
16274 dtrace_id_t i;
16275 int m = 0;
16276 uint32_t priv;
16277 uid_t uid;
16278 zoneid_t zoneid;
16279
16280 if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
16281 return (EFAULT);
16282
16283 desc.dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
16284 desc.dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
16285 desc.dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
16286 desc.dtpd_name[DTRACE_NAMELEN - 1] = '\0';
16287
16288 /*
16289 * Before we attempt to match this probe, we want to give
16290 * all providers the opportunity to provide it.
16291 */
16292 if (desc.dtpd_id == DTRACE_IDNONE) {
16293 mutex_enter(&dtrace_provider_lock);
16294 dtrace_probe_provide(&desc, NULL);
16295 mutex_exit(&dtrace_provider_lock);
16296 desc.dtpd_id++;
16297 }
16298
16299 if (cmd == DTRACEIOC_PROBEMATCH) {
16300 dtrace_probekey(&desc, &pkey);
16301 pkey.dtpk_id = DTRACE_IDNONE;
16302 }
16303
16304 dtrace_cred2priv(cr, &priv, &uid, &zoneid);
16305
16306 mutex_enter(&dtrace_lock);
16307
16308 if (cmd == DTRACEIOC_PROBEMATCH) {
16309 for (i = desc.dtpd_id; i <= dtrace_nprobes; i++) {
16310 if ((probe = dtrace_probes[i - 1]) != NULL &&
16311 (m = dtrace_match_probe(probe, &pkey,
16312 priv, uid, zoneid)) != 0)
16313 break;
16314 }
16315
16316 if (m < 0) {
16317 mutex_exit(&dtrace_lock);
16318 return (EINVAL);
16319 }
16320
16321 } else {
16322 for (i = desc.dtpd_id; i <= dtrace_nprobes; i++) {
16323 if ((probe = dtrace_probes[i - 1]) != NULL &&
16324 dtrace_match_priv(probe, priv, uid, zoneid))
16325 break;
16326 }
16327 }
16328
16329 if (probe == NULL) {
16330 mutex_exit(&dtrace_lock);
16331 return (ESRCH);
16332 }
16333
16334 dtrace_probe_description(probe, &desc);
16335 mutex_exit(&dtrace_lock);
16336
16337 if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
16338 return (EFAULT);
16339
16340 return (0);
16341 }
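
	/*
	 * An illustrative sketch: userland enumerates probes by reissuing
	 * DTRACEIOC_PROBES with dtpd_id advanced past the last probe
	 * returned, until the lookup fails with ESRCH (headers and error
	 * handling elided; "fd" is an assumption of this sketch):
	 *
	 *	dtrace_probedesc_t pd;
	 *
	 *	bzero(&pd, sizeof (pd));
	 *	while (ioctl(fd, DTRACEIOC_PROBES, &pd) == 0) {
	 *		printf("%u %s:%s:%s:%s\n", pd.dtpd_id,
	 *		    pd.dtpd_provider, pd.dtpd_mod, pd.dtpd_func,
	 *		    pd.dtpd_name);
	 *		pd.dtpd_id++;
	 *	}
	 */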
16342
16343 case DTRACEIOC_PROBEARG: {
16344 dtrace_argdesc_t desc;
16345 dtrace_probe_t *probe;
16346 dtrace_provider_t *prov;
16347
16348 if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
16349 return (EFAULT);
16350
16351 if (desc.dtargd_id == DTRACE_IDNONE)
16352 return (EINVAL);
16353
16354 if (desc.dtargd_ndx == DTRACE_ARGNONE)
16355 return (EINVAL);
16356
16357 mutex_enter(&dtrace_provider_lock);
16358 mutex_enter(&mod_lock);
16359 mutex_enter(&dtrace_lock);
16360
16361 if (desc.dtargd_id > dtrace_nprobes) {
16362 mutex_exit(&dtrace_lock);
16363 mutex_exit(&mod_lock);
16364 mutex_exit(&dtrace_provider_lock);
16365 return (EINVAL);
16366 }
16367
16368 if ((probe = dtrace_probes[desc.dtargd_id - 1]) == NULL) {
16369 mutex_exit(&dtrace_lock);
16370 mutex_exit(&mod_lock);
16371 mutex_exit(&dtrace_provider_lock);
16372 return (EINVAL);
16373 }
16374
16375 mutex_exit(&dtrace_lock);
16376
16377 prov = probe->dtpr_provider;
16378
16379 if (prov->dtpv_pops.dtps_getargdesc == NULL) {
16380 /*
16381 * There isn't any typed information for this probe.
16382 * Set the argument number to DTRACE_ARGNONE.
16383 */
16384 desc.dtargd_ndx = DTRACE_ARGNONE;
16385 } else {
16386 desc.dtargd_native[0] = '\0';
16387 desc.dtargd_xlate[0] = '\0';
16388 desc.dtargd_mapping = desc.dtargd_ndx;
16389
16390 prov->dtpv_pops.dtps_getargdesc(prov->dtpv_arg,
16391 probe->dtpr_id, probe->dtpr_arg, &desc);
16392 }
16393
16394 mutex_exit(&mod_lock);
16395 mutex_exit(&dtrace_provider_lock);
16396
16397 if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
16398 return (EFAULT);
16399
16400 return (0);
16401 }
16402
16403 case DTRACEIOC_GO: {
16404 processorid_t cpuid;
16405 rval = dtrace_state_go(state, &cpuid);
16406
16407 if (rval != 0)
16408 return (rval);
16409
16410 if (copyout(&cpuid, (void *)arg, sizeof (cpuid)) != 0)
16411 return (EFAULT);
16412
16413 return (0);
16414 }
16415
16416 case DTRACEIOC_STOP: {
16417 processorid_t cpuid;
16418
16419 mutex_enter(&dtrace_lock);
16420 rval = dtrace_state_stop(state, &cpuid);
16421 mutex_exit(&dtrace_lock);
16422
16423 if (rval != 0)
16424 return (rval);
16425
16426 if (copyout(&cpuid, (void *)arg, sizeof (cpuid)) != 0)
16427 return (EFAULT);
16428
16429 return (0);
16430 }
16431
16432 case DTRACEIOC_DOFGET: {
16433 dof_hdr_t hdr, *dof;
16434 uint64_t len;
16435
16436 if (copyin((void *)arg, &hdr, sizeof (hdr)) != 0)
16437 return (EFAULT);
16438
16439 mutex_enter(&dtrace_lock);
16440 dof = dtrace_dof_create(state);
16441 mutex_exit(&dtrace_lock);
16442
16443 len = MIN(hdr.dofh_loadsz, dof->dofh_loadsz);
16444 rval = copyout(dof, (void *)arg, len);
16445 dtrace_dof_destroy(dof);
16446
16447 return (rval == 0 ? 0 : EFAULT);
16448 }
16449
16450 case DTRACEIOC_AGGSNAP:
16451 case DTRACEIOC_BUFSNAP: {
16452 dtrace_bufdesc_t desc;
16453 caddr_t cached;
16454 dtrace_buffer_t *buf;
16455
16456 if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
16457 return (EFAULT);
16458
16459 if (desc.dtbd_cpu < 0 || desc.dtbd_cpu >= NCPU)
16460 return (EINVAL);
16461
16462 mutex_enter(&dtrace_lock);
16463
16464 if (cmd == DTRACEIOC_BUFSNAP) {
16465 buf = &state->dts_buffer[desc.dtbd_cpu];
16466 } else {
16467 buf = &state->dts_aggbuffer[desc.dtbd_cpu];
16468 }
16469
16470 if (buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL)) {
16471 size_t sz = buf->dtb_offset;
16472
16473 if (state->dts_activity != DTRACE_ACTIVITY_STOPPED) {
16474 mutex_exit(&dtrace_lock);
16475 return (EBUSY);
16476 }
16477
16478 /*
16479 * If this buffer has already been consumed, we're
16480 * going to indicate that there's nothing left here
16481 * to consume.
16482 */
16483 if (buf->dtb_flags & DTRACEBUF_CONSUMED) {
16484 mutex_exit(&dtrace_lock);
16485
16486 desc.dtbd_size = 0;
16487 desc.dtbd_drops = 0;
16488 desc.dtbd_errors = 0;
16489 desc.dtbd_oldest = 0;
16490 sz = sizeof (desc);
16491
16492 if (copyout(&desc, (void *)arg, sz) != 0)
16493 return (EFAULT);
16494
16495 return (0);
16496 }
16497
16498 /*
16499 * If this is a ring buffer that has wrapped, we want
16500 * to copy the whole thing out.
16501 */
16502 if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
16503 dtrace_buffer_polish(buf);
16504 sz = buf->dtb_size;
16505 }
16506
16507 if (copyout(buf->dtb_tomax, desc.dtbd_data, sz) != 0) {
16508 mutex_exit(&dtrace_lock);
16509 return (EFAULT);
16510 }
16511
16512 desc.dtbd_size = sz;
16513 desc.dtbd_drops = buf->dtb_drops;
16514 desc.dtbd_errors = buf->dtb_errors;
16515 desc.dtbd_oldest = buf->dtb_xamot_offset;
16516 desc.dtbd_timestamp = dtrace_gethrtime();
16517
16518 mutex_exit(&dtrace_lock);
16519
16520 if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
16521 return (EFAULT);
16522
16523 buf->dtb_flags |= DTRACEBUF_CONSUMED;
16524
16525 return (0);
16526 }
16527
16528 if (buf->dtb_tomax == NULL) {
16529 ASSERT(buf->dtb_xamot == NULL);
16530 mutex_exit(&dtrace_lock);
16531 return (ENOENT);
16532 }
16533
16534 cached = buf->dtb_tomax;
16535 ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
16536
16537 dtrace_xcall(desc.dtbd_cpu,
16538 (dtrace_xcall_t)dtrace_buffer_switch, buf);
16539
16540 state->dts_errors += buf->dtb_xamot_errors;
16541
16542 /*
16543 * If the buffers did not actually switch, then the cross call
16544 * did not take place -- presumably because the given CPU is
16545 * not in the ready set. If this is the case, we'll return
16546 * ENOENT.
16547 */
16548 if (buf->dtb_tomax == cached) {
16549 ASSERT(buf->dtb_xamot != cached);
16550 mutex_exit(&dtrace_lock);
16551 return (ENOENT);
16552 }
16553
16554 ASSERT(cached == buf->dtb_xamot);
16555
16556 /*
16557 * We have our snapshot; now copy it out.
16558 */
16559 if (copyout(buf->dtb_xamot, desc.dtbd_data,
16560 buf->dtb_xamot_offset) != 0) {
16561 mutex_exit(&dtrace_lock);
16562 return (EFAULT);
16563 }
16564
16565 desc.dtbd_size = buf->dtb_xamot_offset;
16566 desc.dtbd_drops = buf->dtb_xamot_drops;
16567 desc.dtbd_errors = buf->dtb_xamot_errors;
16568 desc.dtbd_oldest = 0;
16569 desc.dtbd_timestamp = buf->dtb_switched;
16570
16571 mutex_exit(&dtrace_lock);
16572
16573 /*
16574 * Finally, copy out the buffer description.
16575 */
16576 if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
16577 return (EFAULT);
16578
16579 return (0);
16580 }
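
	/*
	 * An illustrative sketch: for switching buffers, one BUFSNAP per
	 * CPU captures that CPU's data.  The caller must supply a buffer
	 * at least as large as the configured per-CPU buffer size, since
	 * this path copies out dtb_xamot_offset bytes unconditionally
	 * ("fd", "cpu", "buf", "bufsize" and "process" are assumptions of
	 * this sketch):
	 *
	 *	dtrace_bufdesc_t db;
	 *
	 *	bzero(&db, sizeof (db));
	 *	db.dtbd_cpu = cpu;
	 *	db.dtbd_data = buf;
	 *	db.dtbd_size = bufsize;
	 *
	 *	if (ioctl(fd, DTRACEIOC_BUFSNAP, &db) == 0)
	 *		process(buf, db.dtbd_size);
	 */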
16581
16582 case DTRACEIOC_CONF: {
16583 dtrace_conf_t conf;
16584
16585 bzero(&conf, sizeof (conf));
16586 conf.dtc_difversion = DIF_VERSION;
16587 conf.dtc_difintregs = DIF_DIR_NREGS;
16588 conf.dtc_diftupregs = DIF_DTR_NREGS;
16589 conf.dtc_ctfmodel = CTF_MODEL_NATIVE;
16590
16591 if (copyout(&conf, (void *)arg, sizeof (conf)) != 0)
16592 return (EFAULT);
16593
16594 return (0);
16595 }
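
	/*
	 * An illustrative sketch: the CONF ioctl is a pure copyout, so a
	 * consumer can query it with nothing more than an open descriptor
	 * ("fd" is an assumption of this sketch):
	 *
	 *	dtrace_conf_t conf;
	 *
	 *	if (ioctl(fd, DTRACEIOC_CONF, &conf) == 0)
	 *		printf("DIF version %u, %u integer registers\n",
	 *		    conf.dtc_difversion, conf.dtc_difintregs);
	 */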
16596
16597 case DTRACEIOC_STATUS: {
16598 dtrace_status_t stat;
16599 dtrace_dstate_t *dstate;
16600 int i, j;
16601 uint64_t nerrs;
16602
16603 /*
16604 * See the comment in dtrace_state_deadman() for the reason
16605 * for setting dts_laststatus to INT64_MAX before setting
16606 * it to the correct value.
16607 */
16608 state->dts_laststatus = INT64_MAX;
16609 dtrace_membar_producer();
16610 state->dts_laststatus = dtrace_gethrtime();
16611
16612 bzero(&stat, sizeof (stat));
16613
16614 mutex_enter(&dtrace_lock);
16615
16616 if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE) {
16617 mutex_exit(&dtrace_lock);
16618 return (ENOENT);
16619 }
16620
16621 if (state->dts_activity == DTRACE_ACTIVITY_DRAINING)
16622 stat.dtst_exiting = 1;
16623
16624 nerrs = state->dts_errors;
16625 dstate = &state->dts_vstate.dtvs_dynvars;
16626
16627 for (i = 0; i < NCPU; i++) {
16628 dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[i];
16629
16630 stat.dtst_dyndrops += dcpu->dtdsc_drops;
16631 stat.dtst_dyndrops_dirty += dcpu->dtdsc_dirty_drops;
16632 stat.dtst_dyndrops_rinsing += dcpu->dtdsc_rinsing_drops;
16633
16634 if (state->dts_buffer[i].dtb_flags & DTRACEBUF_FULL)
16635 stat.dtst_filled++;
16636
16637 nerrs += state->dts_buffer[i].dtb_errors;
16638
16639 for (j = 0; j < state->dts_nspeculations; j++) {
16640 dtrace_speculation_t *spec;
16641 dtrace_buffer_t *buf;
16642
16643 spec = &state->dts_speculations[j];
16644 buf = &spec->dtsp_buffer[i];
16645 stat.dtst_specdrops += buf->dtb_xamot_drops;
16646 }
16647 }
16648
16649 stat.dtst_specdrops_busy = state->dts_speculations_busy;
16650 stat.dtst_specdrops_unavail = state->dts_speculations_unavail;
16651 stat.dtst_stkstroverflows = state->dts_stkstroverflows;
16652 stat.dtst_dblerrors = state->dts_dblerrors;
16653 stat.dtst_killed =
16654 (state->dts_activity == DTRACE_ACTIVITY_KILLED);
16655 stat.dtst_errors = nerrs;
16656
16657 mutex_exit(&dtrace_lock);
16658
16659 if (copyout(&stat, (void *)arg, sizeof (stat)) != 0)
16660 return (EFAULT);
16661
16662 return (0);
16663 }
16664
16665 case DTRACEIOC_FORMAT: {
16666 dtrace_fmtdesc_t fmt;
16667 char *str;
16668 int len;
16669
16670 if (copyin((void *)arg, &fmt, sizeof (fmt)) != 0)
16671 return (EFAULT);
16672
16673 mutex_enter(&dtrace_lock);
16674
16675 if (fmt.dtfd_format == 0 ||
16676 fmt.dtfd_format > state->dts_nformats) {
16677 mutex_exit(&dtrace_lock);
16678 return (EINVAL);
16679 }
16680
16681 /*
16682 * Format strings are allocated contiguously and they are
16683 * never freed; if a format index is less than the number
16684 * of formats, we can assert that the format map is non-NULL
16685 * and that the format for the specified index is non-NULL.
16686 */
16687 ASSERT(state->dts_formats != NULL);
16688 str = state->dts_formats[fmt.dtfd_format - 1];
16689 ASSERT(str != NULL);
16690
16691 len = strlen(str) + 1;
16692
16693 if (len > fmt.dtfd_length) {
16694 fmt.dtfd_length = len;
16695
16696 if (copyout(&fmt, (void *)arg, sizeof (fmt)) != 0) {
16697 mutex_exit(&dtrace_lock);
16698 return (EINVAL);
16699 }
16700 } else {
16701 if (copyout(str, fmt.dtfd_string, len) != 0) {
16702 mutex_exit(&dtrace_lock);
16703 return (EINVAL);
16704 }
16705 }
16706
16707 mutex_exit(&dtrace_lock);
16708 return (0);
16709 }
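
	/*
	 * An illustrative sketch of the two-step sizing protocol above: a
	 * call whose buffer is too short still succeeds, returning the
	 * required length in dtfd_length, so the consumer allocates and
	 * retries (headers and error checks elided; "fd" and "id" are
	 * assumptions of this sketch):
	 *
	 *	dtrace_fmtdesc_t fmt;
	 *
	 *	bzero(&fmt, sizeof (fmt));
	 *	fmt.dtfd_format = id;
	 *
	 *	(void) ioctl(fd, DTRACEIOC_FORMAT, &fmt);
	 *	fmt.dtfd_string = malloc(fmt.dtfd_length);
	 *	(void) ioctl(fd, DTRACEIOC_FORMAT, &fmt);
	 */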
16710
16711 default:
16712 break;
16713 }
16714
16715 return (ENOTTY);
16716}
16717
16718/*ARGSUSED*/
16719static int
16720dtrace_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
16721{
16722 dtrace_state_t *state;
16723
16724 switch (cmd) {
16725 case DDI_DETACH:
16726 break;
16727
16728 case DDI_SUSPEND:
16729 return (DDI_SUCCESS);
16730
16731 default:
16732 return (DDI_FAILURE);
16733 }
16734
16735 mutex_enter(&cpu_lock);
16736 mutex_enter(&dtrace_provider_lock);
16737 mutex_enter(&dtrace_lock);
16738
16739 ASSERT(dtrace_opens == 0);
16740
16741 if (dtrace_helpers > 0) {
16742 mutex_exit(&dtrace_provider_lock);
16743 mutex_exit(&dtrace_lock);
16744 mutex_exit(&cpu_lock);
16745 return (DDI_FAILURE);
16746 }
16747
16748 if (dtrace_unregister((dtrace_provider_id_t)dtrace_provider) != 0) {
16749 mutex_exit(&dtrace_provider_lock);
16750 mutex_exit(&dtrace_lock);
16751 mutex_exit(&cpu_lock);
16752 return (DDI_FAILURE);
16753 }
16754
16755 dtrace_provider = NULL;
16756
16757 if ((state = dtrace_anon_grab()) != NULL) {
16758 /*
		 * If there were ECBs on this state, the provider should
		 * not have been allowed to detach; assert that there are
		 * none.
16762 */
16763 ASSERT(state->dts_necbs == 0);
16764 dtrace_state_destroy(state);
16765
16766 /*
16767 * If we're being detached with anonymous state, we need to
16768 * indicate to the kernel debugger that DTrace is now inactive.
16769 */
16770 (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
16771 }
16772
16773 bzero(&dtrace_anon, sizeof (dtrace_anon_t));
16774 unregister_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
16775 dtrace_cpu_init = NULL;
16776 dtrace_helpers_cleanup = NULL;
16777 dtrace_helpers_fork = NULL;
16778 dtrace_cpustart_init = NULL;
16779 dtrace_cpustart_fini = NULL;
16780 dtrace_debugger_init = NULL;
16781 dtrace_debugger_fini = NULL;
16782 dtrace_modload = NULL;
16783 dtrace_modunload = NULL;
16784
16460/*
16461 * DTrace Driver Cookbook Functions
16462 */
16463#if defined(sun)
16464/*ARGSUSED*/
16465static int
16466dtrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
16467{
16468 dtrace_provider_id_t id;
16469 dtrace_state_t *state = NULL;
16470 dtrace_enabling_t *enab;
16471
16472 mutex_enter(&cpu_lock);
16473 mutex_enter(&dtrace_provider_lock);
16474 mutex_enter(&dtrace_lock);
16475
16476 if (ddi_soft_state_init(&dtrace_softstate,
16477 sizeof (dtrace_state_t), 0) != 0) {
16478 cmn_err(CE_NOTE, "/dev/dtrace failed to initialize soft state");
16479 mutex_exit(&cpu_lock);
16480 mutex_exit(&dtrace_provider_lock);
16481 mutex_exit(&dtrace_lock);
16482 return (DDI_FAILURE);
16483 }
16484
16485 if (ddi_create_minor_node(devi, DTRACEMNR_DTRACE, S_IFCHR,
16486 DTRACEMNRN_DTRACE, DDI_PSEUDO, NULL) == DDI_FAILURE ||
16487 ddi_create_minor_node(devi, DTRACEMNR_HELPER, S_IFCHR,
16488 DTRACEMNRN_HELPER, DDI_PSEUDO, NULL) == DDI_FAILURE) {
16489 cmn_err(CE_NOTE, "/dev/dtrace couldn't create minor nodes");
16490 ddi_remove_minor_node(devi, NULL);
16491 ddi_soft_state_fini(&dtrace_softstate);
16492 mutex_exit(&cpu_lock);
16493 mutex_exit(&dtrace_provider_lock);
16494 mutex_exit(&dtrace_lock);
16495 return (DDI_FAILURE);
16496 }
16497
16498 ddi_report_dev(devi);
16499 dtrace_devi = devi;
16500
16501 dtrace_modload = dtrace_module_loaded;
16502 dtrace_modunload = dtrace_module_unloaded;
16503 dtrace_cpu_init = dtrace_cpu_setup_initial;
16504 dtrace_helpers_cleanup = dtrace_helpers_destroy;
16505 dtrace_helpers_fork = dtrace_helpers_duplicate;
16506 dtrace_cpustart_init = dtrace_suspend;
16507 dtrace_cpustart_fini = dtrace_resume;
16508 dtrace_debugger_init = dtrace_suspend;
16509 dtrace_debugger_fini = dtrace_resume;
16510
16511 register_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
16512
16513 ASSERT(MUTEX_HELD(&cpu_lock));
16514
16515 dtrace_arena = vmem_create("dtrace", (void *)1, UINT32_MAX, 1,
16516 NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
16517 dtrace_minor = vmem_create("dtrace_minor", (void *)DTRACEMNRN_CLONE,
16518 UINT32_MAX - DTRACEMNRN_CLONE, 1, NULL, NULL, NULL, 0,
16519 VM_SLEEP | VMC_IDENTIFIER);
16520 dtrace_taskq = taskq_create("dtrace_taskq", 1, maxclsyspri,
16521 1, INT_MAX, 0);
16522
16523 dtrace_state_cache = kmem_cache_create("dtrace_state_cache",
16524 sizeof (dtrace_dstate_percpu_t) * NCPU, DTRACE_STATE_ALIGN,
16525 NULL, NULL, NULL, NULL, NULL, 0);
16526
16527 ASSERT(MUTEX_HELD(&cpu_lock));
16528 dtrace_bymod = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_mod),
16529 offsetof(dtrace_probe_t, dtpr_nextmod),
16530 offsetof(dtrace_probe_t, dtpr_prevmod));
16531
16532 dtrace_byfunc = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_func),
16533 offsetof(dtrace_probe_t, dtpr_nextfunc),
16534 offsetof(dtrace_probe_t, dtpr_prevfunc));
16535
16536 dtrace_byname = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_name),
16537 offsetof(dtrace_probe_t, dtpr_nextname),
16538 offsetof(dtrace_probe_t, dtpr_prevname));
16539
16540 if (dtrace_retain_max < 1) {
16541 cmn_err(CE_WARN, "illegal value (%lu) for dtrace_retain_max; "
16542 "setting to 1", dtrace_retain_max);
16543 dtrace_retain_max = 1;
16544 }
16545
16546 /*
16547 * Now discover our toxic ranges.
16548 */
16549 dtrace_toxic_ranges(dtrace_toxrange_add);
16550
16551 /*
16552 * Before we register ourselves as a provider to our own framework,
16553 * we would like to assert that dtrace_provider is NULL -- but that's
16554 * not true if we were loaded as a dependency of a DTrace provider.
16555 * Once we've registered, we can assert that dtrace_provider is our
16556 * pseudo provider.
16557 */
16558 (void) dtrace_register("dtrace", &dtrace_provider_attr,
16559 DTRACE_PRIV_NONE, 0, &dtrace_provider_ops, NULL, &id);
16560
16561 ASSERT(dtrace_provider != NULL);
16562 ASSERT((dtrace_provider_id_t)dtrace_provider == id);
16563
16564 dtrace_probeid_begin = dtrace_probe_create((dtrace_provider_id_t)
16565 dtrace_provider, NULL, NULL, "BEGIN", 0, NULL);
16566 dtrace_probeid_end = dtrace_probe_create((dtrace_provider_id_t)
16567 dtrace_provider, NULL, NULL, "END", 0, NULL);
16568 dtrace_probeid_error = dtrace_probe_create((dtrace_provider_id_t)
16569 dtrace_provider, NULL, NULL, "ERROR", 1, NULL);
16570
16571 dtrace_anon_property();
16572 mutex_exit(&cpu_lock);
16573
16574 /*
16575 * If DTrace helper tracing is enabled, we need to allocate the
16576 * trace buffer and initialize the values.
16577 */
16578 if (dtrace_helptrace_enabled) {
16579 ASSERT(dtrace_helptrace_buffer == NULL);
16580 dtrace_helptrace_buffer =
16581 kmem_zalloc(dtrace_helptrace_bufsize, KM_SLEEP);
16582 dtrace_helptrace_next = 0;
16583 }
16584
16585 /*
16586 * If there are already providers, we must ask them to provide their
16587 * probes, and then match any anonymous enabling against them. Note
16588 * that there should be no other retained enablings at this time:
16589 * the only retained enablings at this time should be the anonymous
16590 * enabling.
16591 */
16592 if (dtrace_anon.dta_enabling != NULL) {
16593 ASSERT(dtrace_retained == dtrace_anon.dta_enabling);
16594
16595 dtrace_enabling_provide(NULL);
16596 state = dtrace_anon.dta_state;
16597
16598 /*
16599 * We couldn't hold cpu_lock across the above call to
16600 * dtrace_enabling_provide(), but we must hold it to actually
16601 * enable the probes. We have to drop all of our locks, pick
16602 * up cpu_lock, and regain our locks before matching the
16603 * retained anonymous enabling.
16604 */
16605 mutex_exit(&dtrace_lock);
16606 mutex_exit(&dtrace_provider_lock);
16607
16608 mutex_enter(&cpu_lock);
16609 mutex_enter(&dtrace_provider_lock);
16610 mutex_enter(&dtrace_lock);
16611
16612 if ((enab = dtrace_anon.dta_enabling) != NULL)
16613 (void) dtrace_enabling_match(enab, NULL);
16614
16615 mutex_exit(&cpu_lock);
16616 }
16617
16618 mutex_exit(&dtrace_lock);
16619 mutex_exit(&dtrace_provider_lock);
16620
16621 if (state != NULL) {
16622 /*
16623 * If we created any anonymous state, set it going now.
16624 */
16625 (void) dtrace_state_go(state, &dtrace_anon.dta_beganon);
16626 }
16627
16628 return (DDI_SUCCESS);
16629}
16630#endif
16631
16632#if !defined(sun)
16633#if __FreeBSD_version >= 800039
16634static void dtrace_dtr(void *);
16635#endif
16636#endif
16637
16638/*ARGSUSED*/
16639static int
16640#if defined(sun)
16641dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
16642#else
16643dtrace_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
16644#endif
16645{
16646 dtrace_state_t *state;
16647 uint32_t priv;
16648 uid_t uid;
16649 zoneid_t zoneid;
16650
16651#if defined(sun)
16652 if (getminor(*devp) == DTRACEMNRN_HELPER)
16653 return (0);
16654
16655 /*
16656 * If this wasn't an open with the "helper" minor, then it must be
16657 * the "dtrace" minor.
16658 */
16659 if (getminor(*devp) == DTRACEMNRN_DTRACE)
16660 return (ENXIO);
16661#else
16662 cred_t *cred_p = NULL;
16663
16664#if __FreeBSD_version < 800039
16665 /*
16666 * The first minor device is the one that is cloned so there is
16667 * nothing more to do here.
16668 */
16669 if (dev2unit(dev) == 0)
16670 return 0;
16671
16672 /*
16673 * Devices are cloned, so if the DTrace state has already
16674 * been allocated, that means this device belongs to a
16675 * different client. Each client should open '/dev/dtrace'
16676 * to get a cloned device.
16677 */
16678 if (dev->si_drv1 != NULL)
16679 return (EBUSY);
16680#endif
16681
16682 cred_p = dev->si_cred;
16683#endif
16684
16685 /*
16686 * If no DTRACE_PRIV_* bits are set in the credential, then the
16687 * caller lacks sufficient permission to do anything with DTrace.
16688 */
16689 dtrace_cred2priv(cred_p, &priv, &uid, &zoneid);
16690 if (priv == DTRACE_PRIV_NONE) {
16691#if !defined(sun)
16692#if __FreeBSD_version < 800039
16693 /* Destroy the cloned device. */
16694 destroy_dev(dev);
16695#endif
16696#endif
16697
16698 return (EACCES);
16699 }
16700
16701 /*
16702 * Ask all providers to provide all their probes.
16703 */
16704 mutex_enter(&dtrace_provider_lock);
16705 dtrace_probe_provide(NULL, NULL);
16706 mutex_exit(&dtrace_provider_lock);
16707
16708 mutex_enter(&cpu_lock);
16709 mutex_enter(&dtrace_lock);
16710 dtrace_opens++;
16711 dtrace_membar_producer();
16712
16713#if defined(sun)
16714 /*
16715 * If the kernel debugger is active (that is, if the kernel debugger
16716 * modified text in some way), we won't allow the open.
16717 */
16718 if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
16719 dtrace_opens--;
16720 mutex_exit(&cpu_lock);
16721 mutex_exit(&dtrace_lock);
16722 return (EBUSY);
16723 }
16724
16725 state = dtrace_state_create(devp, cred_p);
16726#else
16727 state = dtrace_state_create(dev);
16728#if __FreeBSD_version < 800039
16729 dev->si_drv1 = state;
16730#else
16731 devfs_set_cdevpriv(state, dtrace_dtr);
16732#endif
16733#endif
16734
16735 mutex_exit(&cpu_lock);
16736
16737 if (state == NULL) {
16738#if defined(sun)
16739 if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL)
16740 (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
16741#else
16742 --dtrace_opens;
16743#endif
16744 mutex_exit(&dtrace_lock);
16745#if !defined(sun)
16746#if __FreeBSD_version < 800039
16747 /* Destroy the cloned device. */
16748 destroy_dev(dev);
16749#endif
16750#endif
16751 return (EAGAIN);
16752 }
16753
16754 mutex_exit(&dtrace_lock);
16755
16756 return (0);
16757}

/*ARGSUSED*/
#if defined(sun)
static int
dtrace_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
#elif __FreeBSD_version < 800039
static int
dtrace_close(struct cdev *dev, int flags, int fmt __unused, struct thread *td)
#else
static void
dtrace_dtr(void *data)
#endif
{
#if defined(sun)
	minor_t minor = getminor(dev);
	dtrace_state_t *state;

	if (minor == DTRACEMNRN_HELPER)
		return (0);

	state = ddi_get_soft_state(dtrace_softstate, minor);
#else
#if __FreeBSD_version < 800039
	dtrace_state_t *state = dev->si_drv1;

	/* Check if this is not a cloned device. */
	if (dev2unit(dev) == 0)
		return (0);
#else
	dtrace_state_t *state = data;
#endif

#endif

	mutex_enter(&cpu_lock);
	mutex_enter(&dtrace_lock);

	if (state != NULL) {
		if (state->dts_anon) {
			/*
			 * There is anonymous state. Destroy that first.
			 */
			ASSERT(dtrace_anon.dta_state == NULL);
			dtrace_state_destroy(state->dts_anon);
		}

		dtrace_state_destroy(state);

#if !defined(sun)
		kmem_free(state, 0);
#if __FreeBSD_version < 800039
		dev->si_drv1 = NULL;
#endif
#endif
	}

	ASSERT(dtrace_opens > 0);
#if defined(sun)
	/*
	 * Only relinquish control of the kernel debugger interface when there
	 * are no consumers and no anonymous enablings.
	 */
	if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL)
		(void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
#else
	--dtrace_opens;
#endif

	mutex_exit(&dtrace_lock);
	mutex_exit(&cpu_lock);

#if __FreeBSD_version < 800039
	/* Schedule this cloned device to be destroyed. */
	destroy_dev_sched(dev);
#endif

#if defined(sun) || __FreeBSD_version < 800039
	return (0);
#endif
}
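
/*
 * Note: on FreeBSD >= 800039 there is no d_close handler for the dtrace
 * device.  dtrace_dtr() is registered as the cdevpriv destructor in
 * dtrace_open() above and is invoked by devfs when the last reference to
 * the per-open state is dropped, which is why it returns void and receives
 * the dtrace_state_t directly rather than a device argument.
 */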

#if defined(sun)
/*ARGSUSED*/
static int
dtrace_ioctl_helper(int cmd, intptr_t arg, int *rv)
{
	int rval;
	dof_helper_t help, *dhp = NULL;

	switch (cmd) {
	case DTRACEHIOC_ADDDOF:
		if (copyin((void *)arg, &help, sizeof (help)) != 0) {
			dtrace_dof_error(NULL, "failed to copyin DOF helper");
			return (EFAULT);
		}

		dhp = &help;
		arg = (intptr_t)help.dofhp_dof;
		/*FALLTHROUGH*/

	case DTRACEHIOC_ADD: {
		dof_hdr_t *dof = dtrace_dof_copyin(arg, &rval);

		if (dof == NULL)
			return (rval);

		mutex_enter(&dtrace_lock);

		/*
		 * dtrace_helper_slurp() takes responsibility for the dof --
		 * it may free it now or it may save it and free it later.
		 */
		if ((rval = dtrace_helper_slurp(dof, dhp)) != -1) {
			*rv = rval;
			rval = 0;
		} else {
			rval = EINVAL;
		}

		mutex_exit(&dtrace_lock);
		return (rval);
	}

	case DTRACEHIOC_REMOVE: {
		mutex_enter(&dtrace_lock);
		rval = dtrace_helper_destroygen(arg);
		mutex_exit(&dtrace_lock);

		return (rval);
	}

	default:
		break;
	}

	return (ENOTTY);
}
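
/*
 * Illustrative sketch (not part of this file): how a process with static
 * USDT probes might hand its DOF to the helper minor, mirroring the
 * DTRACEHIOC_ADDDOF handling above.  Only dofhp_dof appears in the handler
 * itself; the device path, the remaining dof_helper_t fields, and dof_image
 * (a DOF section generated at build time) are assumptions made for the
 * example.  On success, the ioctl returns a helper generation.
 *
 *	dof_helper_t dh;
 *	int fd, gen;
 *
 *	if ((fd = open("/dev/dtrace/helper", O_RDWR)) == -1)
 *		return (-1);
 *
 *	(void) strlcpy(dh.dofhp_mod, "a.out", sizeof (dh.dofhp_mod));
 *	dh.dofhp_addr = (uint64_t)(uintptr_t)dof_image;
 *	dh.dofhp_dof = (uint64_t)(uintptr_t)dof_image;
 *
 *	gen = ioctl(fd, DTRACEHIOC_ADDDOF, &dh);
 *	(void) close(fd);
 *	return (gen);
 */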

/*ARGSUSED*/
static int
dtrace_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv)
{
	minor_t minor = getminor(dev);
	dtrace_state_t *state;
	int rval;

	if (minor == DTRACEMNRN_HELPER)
		return (dtrace_ioctl_helper(cmd, arg, rv));

	state = ddi_get_soft_state(dtrace_softstate, minor);

	if (state->dts_anon) {
		ASSERT(dtrace_anon.dta_state == NULL);
		state = state->dts_anon;
	}

	switch (cmd) {
	case DTRACEIOC_PROVIDER: {
		dtrace_providerdesc_t pvd;
		dtrace_provider_t *pvp;

		if (copyin((void *)arg, &pvd, sizeof (pvd)) != 0)
			return (EFAULT);

		pvd.dtvd_name[DTRACE_PROVNAMELEN - 1] = '\0';
		mutex_enter(&dtrace_provider_lock);

		for (pvp = dtrace_provider; pvp != NULL; pvp = pvp->dtpv_next) {
			if (strcmp(pvp->dtpv_name, pvd.dtvd_name) == 0)
				break;
		}

		mutex_exit(&dtrace_provider_lock);

		if (pvp == NULL)
			return (ESRCH);

		bcopy(&pvp->dtpv_priv, &pvd.dtvd_priv, sizeof (dtrace_ppriv_t));
		bcopy(&pvp->dtpv_attr, &pvd.dtvd_attr, sizeof (dtrace_pattr_t));

		if (copyout(&pvd, (void *)arg, sizeof (pvd)) != 0)
			return (EFAULT);

		return (0);
	}

	case DTRACEIOC_EPROBE: {
		dtrace_eprobedesc_t epdesc;
		dtrace_ecb_t *ecb;
		dtrace_action_t *act;
		void *buf;
		size_t size;
		uintptr_t dest;
		int nrecs;

		if (copyin((void *)arg, &epdesc, sizeof (epdesc)) != 0)
			return (EFAULT);

		mutex_enter(&dtrace_lock);

		if ((ecb = dtrace_epid2ecb(state, epdesc.dtepd_epid)) == NULL) {
			mutex_exit(&dtrace_lock);
			return (EINVAL);
		}

		if (ecb->dte_probe == NULL) {
			mutex_exit(&dtrace_lock);
			return (EINVAL);
		}

		epdesc.dtepd_probeid = ecb->dte_probe->dtpr_id;
		epdesc.dtepd_uarg = ecb->dte_uarg;
		epdesc.dtepd_size = ecb->dte_size;

		nrecs = epdesc.dtepd_nrecs;
		epdesc.dtepd_nrecs = 0;
		for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
			if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
				continue;

			epdesc.dtepd_nrecs++;
		}

		/*
		 * Now that we have the size, we need to allocate a temporary
		 * buffer in which to store the complete description. We need
		 * the temporary buffer to be able to drop dtrace_lock()
		 * across the copyout(), below.
		 */
		size = sizeof (dtrace_eprobedesc_t) +
		    (epdesc.dtepd_nrecs * sizeof (dtrace_recdesc_t));

		buf = kmem_alloc(size, KM_SLEEP);
		dest = (uintptr_t)buf;

		bcopy(&epdesc, (void *)dest, sizeof (epdesc));
		dest += offsetof(dtrace_eprobedesc_t, dtepd_rec[0]);

		for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
			if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
				continue;

			if (nrecs-- == 0)
				break;

			bcopy(&act->dta_rec, (void *)dest,
			    sizeof (dtrace_recdesc_t));
			dest += sizeof (dtrace_recdesc_t);
		}

		mutex_exit(&dtrace_lock);

		if (copyout(buf, (void *)arg, dest - (uintptr_t)buf) != 0) {
			kmem_free(buf, size);
			return (EFAULT);
		}

		kmem_free(buf, size);
		return (0);
	}

	case DTRACEIOC_AGGDESC: {
		dtrace_aggdesc_t aggdesc;
		dtrace_action_t *act;
		dtrace_aggregation_t *agg;
		int nrecs;
		uint32_t offs;
		dtrace_recdesc_t *lrec;
		void *buf;
		size_t size;
		uintptr_t dest;

		if (copyin((void *)arg, &aggdesc, sizeof (aggdesc)) != 0)
			return (EFAULT);

		mutex_enter(&dtrace_lock);

		if ((agg = dtrace_aggid2agg(state, aggdesc.dtagd_id)) == NULL) {
			mutex_exit(&dtrace_lock);
			return (EINVAL);
		}

		aggdesc.dtagd_epid = agg->dtag_ecb->dte_epid;

		nrecs = aggdesc.dtagd_nrecs;
		aggdesc.dtagd_nrecs = 0;

		offs = agg->dtag_base;
		lrec = &agg->dtag_action.dta_rec;
		aggdesc.dtagd_size = lrec->dtrd_offset + lrec->dtrd_size - offs;

		for (act = agg->dtag_first; ; act = act->dta_next) {
			ASSERT(act->dta_intuple ||
			    DTRACEACT_ISAGG(act->dta_kind));

			/*
			 * If this action has a record size of zero, it
			 * denotes an argument to the aggregating action.
			 * Because the presence of this record doesn't (or
			 * shouldn't) affect the way the data is interpreted,
			 * we don't copy it out to save user-level the
			 * confusion of dealing with a zero-length record.
			 */
			if (act->dta_rec.dtrd_size == 0) {
				ASSERT(agg->dtag_hasarg);
				continue;
			}

			aggdesc.dtagd_nrecs++;

			if (act == &agg->dtag_action)
				break;
		}

		/*
		 * Now that we have the size, we need to allocate a temporary
		 * buffer in which to store the complete description. We need
		 * the temporary buffer to be able to drop dtrace_lock()
		 * across the copyout(), below.
		 */
		size = sizeof (dtrace_aggdesc_t) +
		    (aggdesc.dtagd_nrecs * sizeof (dtrace_recdesc_t));

		buf = kmem_alloc(size, KM_SLEEP);
		dest = (uintptr_t)buf;

		bcopy(&aggdesc, (void *)dest, sizeof (aggdesc));
		dest += offsetof(dtrace_aggdesc_t, dtagd_rec[0]);

		for (act = agg->dtag_first; ; act = act->dta_next) {
			dtrace_recdesc_t rec = act->dta_rec;

			/*
			 * See the comment in the above loop for why we pass
			 * over zero-length records.
			 */
			if (rec.dtrd_size == 0) {
				ASSERT(agg->dtag_hasarg);
				continue;
			}

			if (nrecs-- == 0)
				break;

			rec.dtrd_offset -= offs;
			bcopy(&rec, (void *)dest, sizeof (rec));
			dest += sizeof (dtrace_recdesc_t);

			if (act == &agg->dtag_action)
				break;
		}

		mutex_exit(&dtrace_lock);

		if (copyout(buf, (void *)arg, dest - (uintptr_t)buf) != 0) {
			kmem_free(buf, size);
			return (EFAULT);
		}

		kmem_free(buf, size);
		return (0);
	}

	case DTRACEIOC_ENABLE: {
		dof_hdr_t *dof;
		dtrace_enabling_t *enab = NULL;
		dtrace_vstate_t *vstate;
		int err = 0;

		*rv = 0;

		/*
		 * If a NULL argument has been passed, we take this as our
		 * cue to reevaluate our enablings.
		 */
		if (arg == NULL) {
			dtrace_enabling_matchall();

			return (0);
		}

		if ((dof = dtrace_dof_copyin(arg, &rval)) == NULL)
			return (rval);

		mutex_enter(&cpu_lock);
		mutex_enter(&dtrace_lock);
		vstate = &state->dts_vstate;

		if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
			mutex_exit(&dtrace_lock);
			mutex_exit(&cpu_lock);
			dtrace_dof_destroy(dof);
			return (EBUSY);
		}

		if (dtrace_dof_slurp(dof, vstate, cr, &enab, 0, B_TRUE) != 0) {
			mutex_exit(&dtrace_lock);
			mutex_exit(&cpu_lock);
			dtrace_dof_destroy(dof);
			return (EINVAL);
		}

		if ((rval = dtrace_dof_options(dof, state)) != 0) {
			dtrace_enabling_destroy(enab);
			mutex_exit(&dtrace_lock);
			mutex_exit(&cpu_lock);
			dtrace_dof_destroy(dof);
			return (rval);
		}

		if ((err = dtrace_enabling_match(enab, rv)) == 0) {
			err = dtrace_enabling_retain(enab);
		} else {
			dtrace_enabling_destroy(enab);
		}

		mutex_exit(&cpu_lock);
		mutex_exit(&dtrace_lock);
		dtrace_dof_destroy(dof);

		return (err);
	}

	case DTRACEIOC_REPLICATE: {
		dtrace_repldesc_t desc;
		dtrace_probedesc_t *match = &desc.dtrpd_match;
		dtrace_probedesc_t *create = &desc.dtrpd_create;
		int err;

		if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
			return (EFAULT);

		match->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
		match->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
		match->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
		match->dtpd_name[DTRACE_NAMELEN - 1] = '\0';

		create->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
		create->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
		create->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
		create->dtpd_name[DTRACE_NAMELEN - 1] = '\0';

		mutex_enter(&dtrace_lock);
		err = dtrace_enabling_replicate(state, match, create);
		mutex_exit(&dtrace_lock);

		return (err);
	}

	case DTRACEIOC_PROBEMATCH:
	case DTRACEIOC_PROBES: {
		dtrace_probe_t *probe = NULL;
		dtrace_probedesc_t desc;
		dtrace_probekey_t pkey;
		dtrace_id_t i;
		int m = 0;
		uint32_t priv;
		uid_t uid;
		zoneid_t zoneid;

		if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
			return (EFAULT);

		desc.dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
		desc.dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
		desc.dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
		desc.dtpd_name[DTRACE_NAMELEN - 1] = '\0';

		/*
		 * Before we attempt to match this probe, we want to give
		 * all providers the opportunity to provide it.
		 */
		if (desc.dtpd_id == DTRACE_IDNONE) {
			mutex_enter(&dtrace_provider_lock);
			dtrace_probe_provide(&desc, NULL);
			mutex_exit(&dtrace_provider_lock);
			desc.dtpd_id++;
		}

		if (cmd == DTRACEIOC_PROBEMATCH) {
			dtrace_probekey(&desc, &pkey);
			pkey.dtpk_id = DTRACE_IDNONE;
		}

		dtrace_cred2priv(cr, &priv, &uid, &zoneid);

		mutex_enter(&dtrace_lock);

		if (cmd == DTRACEIOC_PROBEMATCH) {
			for (i = desc.dtpd_id; i <= dtrace_nprobes; i++) {
				if ((probe = dtrace_probes[i - 1]) != NULL &&
				    (m = dtrace_match_probe(probe, &pkey,
				    priv, uid, zoneid)) != 0)
					break;
			}

			if (m < 0) {
				mutex_exit(&dtrace_lock);
				return (EINVAL);
			}

		} else {
			for (i = desc.dtpd_id; i <= dtrace_nprobes; i++) {
				if ((probe = dtrace_probes[i - 1]) != NULL &&
				    dtrace_match_priv(probe, priv, uid, zoneid))
					break;
			}
		}

		if (probe == NULL) {
			mutex_exit(&dtrace_lock);
			return (ESRCH);
		}

		dtrace_probe_description(probe, &desc);
		mutex_exit(&dtrace_lock);

		if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
			return (EFAULT);

		return (0);
	}
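
	/*
	 * Illustrative sketch (not part of this file): user level walks the
	 * probe table by repeatedly invoking DTRACEIOC_PROBES, feeding each
	 * returned dtpd_id back in (plus one) until ESRCH indicates that no
	 * further probe matches.  The fd is assumed to be an open descriptor
	 * on the dtrace device.
	 *
	 *	dtrace_probedesc_t desc;
	 *
	 *	bzero(&desc, sizeof (desc));
	 *	desc.dtpd_id = DTRACE_IDNONE;
	 *
	 *	while (ioctl(fd, DTRACEIOC_PROBES, &desc) == 0) {
	 *		printf("%5u %s:%s:%s:%s\n", desc.dtpd_id,
	 *		    desc.dtpd_provider, desc.dtpd_mod,
	 *		    desc.dtpd_func, desc.dtpd_name);
	 *		desc.dtpd_id++;
	 *	}
	 */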

	case DTRACEIOC_PROBEARG: {
		dtrace_argdesc_t desc;
		dtrace_probe_t *probe;
		dtrace_provider_t *prov;

		if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
			return (EFAULT);

		if (desc.dtargd_id == DTRACE_IDNONE)
			return (EINVAL);

		if (desc.dtargd_ndx == DTRACE_ARGNONE)
			return (EINVAL);

		mutex_enter(&dtrace_provider_lock);
		mutex_enter(&mod_lock);
		mutex_enter(&dtrace_lock);

		if (desc.dtargd_id > dtrace_nprobes) {
			mutex_exit(&dtrace_lock);
			mutex_exit(&mod_lock);
			mutex_exit(&dtrace_provider_lock);
			return (EINVAL);
		}

		if ((probe = dtrace_probes[desc.dtargd_id - 1]) == NULL) {
			mutex_exit(&dtrace_lock);
			mutex_exit(&mod_lock);
			mutex_exit(&dtrace_provider_lock);
			return (EINVAL);
		}

		mutex_exit(&dtrace_lock);

		prov = probe->dtpr_provider;

		if (prov->dtpv_pops.dtps_getargdesc == NULL) {
			/*
			 * There isn't any typed information for this probe.
			 * Set the argument number to DTRACE_ARGNONE.
			 */
			desc.dtargd_ndx = DTRACE_ARGNONE;
		} else {
			desc.dtargd_native[0] = '\0';
			desc.dtargd_xlate[0] = '\0';
			desc.dtargd_mapping = desc.dtargd_ndx;

			prov->dtpv_pops.dtps_getargdesc(prov->dtpv_arg,
			    probe->dtpr_id, probe->dtpr_arg, &desc);
		}

		mutex_exit(&mod_lock);
		mutex_exit(&dtrace_provider_lock);

		if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
			return (EFAULT);

		return (0);
	}

	case DTRACEIOC_GO: {
		processorid_t cpuid;
		rval = dtrace_state_go(state, &cpuid);

		if (rval != 0)
			return (rval);

		if (copyout(&cpuid, (void *)arg, sizeof (cpuid)) != 0)
			return (EFAULT);

		return (0);
	}

	case DTRACEIOC_STOP: {
		processorid_t cpuid;

		mutex_enter(&dtrace_lock);
		rval = dtrace_state_stop(state, &cpuid);
		mutex_exit(&dtrace_lock);

		if (rval != 0)
			return (rval);

		if (copyout(&cpuid, (void *)arg, sizeof (cpuid)) != 0)
			return (EFAULT);

		return (0);
	}

	case DTRACEIOC_DOFGET: {
		dof_hdr_t hdr, *dof;
		uint64_t len;

		if (copyin((void *)arg, &hdr, sizeof (hdr)) != 0)
			return (EFAULT);

		mutex_enter(&dtrace_lock);
		dof = dtrace_dof_create(state);
		mutex_exit(&dtrace_lock);

		len = MIN(hdr.dofh_loadsz, dof->dofh_loadsz);
		rval = copyout(dof, (void *)arg, len);
		dtrace_dof_destroy(dof);

		return (rval == 0 ? 0 : EFAULT);
	}

	case DTRACEIOC_AGGSNAP:
	case DTRACEIOC_BUFSNAP: {
		dtrace_bufdesc_t desc;
		caddr_t cached;
		dtrace_buffer_t *buf;

		if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
			return (EFAULT);

		if (desc.dtbd_cpu < 0 || desc.dtbd_cpu >= NCPU)
			return (EINVAL);

		mutex_enter(&dtrace_lock);

		if (cmd == DTRACEIOC_BUFSNAP) {
			buf = &state->dts_buffer[desc.dtbd_cpu];
		} else {
			buf = &state->dts_aggbuffer[desc.dtbd_cpu];
		}

		if (buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL)) {
			size_t sz = buf->dtb_offset;

			if (state->dts_activity != DTRACE_ACTIVITY_STOPPED) {
				mutex_exit(&dtrace_lock);
				return (EBUSY);
			}

			/*
			 * If this buffer has already been consumed, we're
			 * going to indicate that there's nothing left here
			 * to consume.
			 */
			if (buf->dtb_flags & DTRACEBUF_CONSUMED) {
				mutex_exit(&dtrace_lock);

				desc.dtbd_size = 0;
				desc.dtbd_drops = 0;
				desc.dtbd_errors = 0;
				desc.dtbd_oldest = 0;
				sz = sizeof (desc);

				if (copyout(&desc, (void *)arg, sz) != 0)
					return (EFAULT);

				return (0);
			}

			/*
			 * If this is a ring buffer that has wrapped, we want
			 * to copy the whole thing out.
			 */
			if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
				dtrace_buffer_polish(buf);
				sz = buf->dtb_size;
			}

			if (copyout(buf->dtb_tomax, desc.dtbd_data, sz) != 0) {
				mutex_exit(&dtrace_lock);
				return (EFAULT);
			}

			desc.dtbd_size = sz;
			desc.dtbd_drops = buf->dtb_drops;
			desc.dtbd_errors = buf->dtb_errors;
			desc.dtbd_oldest = buf->dtb_xamot_offset;
			desc.dtbd_timestamp = dtrace_gethrtime();

			mutex_exit(&dtrace_lock);

			if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
				return (EFAULT);

			buf->dtb_flags |= DTRACEBUF_CONSUMED;

			return (0);
		}

		if (buf->dtb_tomax == NULL) {
			ASSERT(buf->dtb_xamot == NULL);
			mutex_exit(&dtrace_lock);
			return (ENOENT);
		}

		cached = buf->dtb_tomax;
		ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));

		dtrace_xcall(desc.dtbd_cpu,
		    (dtrace_xcall_t)dtrace_buffer_switch, buf);

		state->dts_errors += buf->dtb_xamot_errors;

		/*
		 * If the buffers did not actually switch, then the cross call
		 * did not take place -- presumably because the given CPU is
		 * not in the ready set. If this is the case, we'll return
		 * ENOENT.
		 */
		if (buf->dtb_tomax == cached) {
			ASSERT(buf->dtb_xamot != cached);
			mutex_exit(&dtrace_lock);
			return (ENOENT);
		}

		ASSERT(cached == buf->dtb_xamot);

		/*
		 * We have our snapshot; now copy it out.
		 */
		if (copyout(buf->dtb_xamot, desc.dtbd_data,
		    buf->dtb_xamot_offset) != 0) {
			mutex_exit(&dtrace_lock);
			return (EFAULT);
		}

		desc.dtbd_size = buf->dtb_xamot_offset;
		desc.dtbd_drops = buf->dtb_xamot_drops;
		desc.dtbd_errors = buf->dtb_xamot_errors;
		desc.dtbd_oldest = 0;
		desc.dtbd_timestamp = buf->dtb_switched;

		mutex_exit(&dtrace_lock);

		/*
		 * Finally, copy out the buffer description.
		 */
		if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
			return (EFAULT);

		return (0);
	}

	case DTRACEIOC_CONF: {
		dtrace_conf_t conf;

		bzero(&conf, sizeof (conf));
		conf.dtc_difversion = DIF_VERSION;
		conf.dtc_difintregs = DIF_DIR_NREGS;
		conf.dtc_diftupregs = DIF_DTR_NREGS;
		conf.dtc_ctfmodel = CTF_MODEL_NATIVE;

		if (copyout(&conf, (void *)arg, sizeof (conf)) != 0)
			return (EFAULT);

		return (0);
	}

	case DTRACEIOC_STATUS: {
		dtrace_status_t stat;
		dtrace_dstate_t *dstate;
		int i, j;
		uint64_t nerrs;

		/*
		 * See the comment in dtrace_state_deadman() for the reason
		 * for setting dts_laststatus to INT64_MAX before setting
		 * it to the correct value.
		 */
		state->dts_laststatus = INT64_MAX;
		dtrace_membar_producer();
		state->dts_laststatus = dtrace_gethrtime();

		bzero(&stat, sizeof (stat));

		mutex_enter(&dtrace_lock);

		if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE) {
			mutex_exit(&dtrace_lock);
			return (ENOENT);
		}

		if (state->dts_activity == DTRACE_ACTIVITY_DRAINING)
			stat.dtst_exiting = 1;

		nerrs = state->dts_errors;
		dstate = &state->dts_vstate.dtvs_dynvars;

		for (i = 0; i < NCPU; i++) {
			dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[i];

			stat.dtst_dyndrops += dcpu->dtdsc_drops;
			stat.dtst_dyndrops_dirty += dcpu->dtdsc_dirty_drops;
			stat.dtst_dyndrops_rinsing += dcpu->dtdsc_rinsing_drops;

			if (state->dts_buffer[i].dtb_flags & DTRACEBUF_FULL)
				stat.dtst_filled++;

			nerrs += state->dts_buffer[i].dtb_errors;

			for (j = 0; j < state->dts_nspeculations; j++) {
				dtrace_speculation_t *spec;
				dtrace_buffer_t *buf;

				spec = &state->dts_speculations[j];
				buf = &spec->dtsp_buffer[i];
				stat.dtst_specdrops += buf->dtb_xamot_drops;
			}
		}

		stat.dtst_specdrops_busy = state->dts_speculations_busy;
		stat.dtst_specdrops_unavail = state->dts_speculations_unavail;
		stat.dtst_stkstroverflows = state->dts_stkstroverflows;
		stat.dtst_dblerrors = state->dts_dblerrors;
		stat.dtst_killed =
		    (state->dts_activity == DTRACE_ACTIVITY_KILLED);
		stat.dtst_errors = nerrs;

		mutex_exit(&dtrace_lock);

		if (copyout(&stat, (void *)arg, sizeof (stat)) != 0)
			return (EFAULT);

		return (0);
	}

	case DTRACEIOC_FORMAT: {
		dtrace_fmtdesc_t fmt;
		char *str;
		int len;

		if (copyin((void *)arg, &fmt, sizeof (fmt)) != 0)
			return (EFAULT);

		mutex_enter(&dtrace_lock);

		if (fmt.dtfd_format == 0 ||
		    fmt.dtfd_format > state->dts_nformats) {
			mutex_exit(&dtrace_lock);
			return (EINVAL);
		}

		/*
		 * Format strings are allocated contiguously and they are
		 * never freed; if a format index is less than the number
		 * of formats, we can assert that the format map is non-NULL
		 * and that the format for the specified index is non-NULL.
		 */
		ASSERT(state->dts_formats != NULL);
		str = state->dts_formats[fmt.dtfd_format - 1];
		ASSERT(str != NULL);

		len = strlen(str) + 1;

		if (len > fmt.dtfd_length) {
			fmt.dtfd_length = len;

			if (copyout(&fmt, (void *)arg, sizeof (fmt)) != 0) {
				mutex_exit(&dtrace_lock);
				return (EINVAL);
			}
		} else {
			if (copyout(str, fmt.dtfd_string, len) != 0) {
				mutex_exit(&dtrace_lock);
				return (EINVAL);
			}
		}

		mutex_exit(&dtrace_lock);
		return (0);
	}

	default:
		break;
	}

	return (ENOTTY);
}
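
/*
 * Illustrative sketch (not part of this file): the minimal consumer loop
 * implied by the handlers above -- enable with DTRACEIOC_GO, snapshot each
 * CPU's principal buffer with DTRACEIOC_BUFSNAP, and quiesce with
 * DTRACEIOC_STOP.  The descriptor fields (dtbd_cpu, dtbd_data, dtbd_size)
 * follow the copyin/copyout calls above; fd, ncpus, bufsize, and the
 * process_records() helper are assumptions supplied by the caller.
 *
 *	processorid_t cpu;
 *	dtrace_bufdesc_t desc;
 *	char *data = malloc(bufsize);
 *	int i;
 *
 *	if (ioctl(fd, DTRACEIOC_GO, &cpu) == -1)
 *		err(1, "DTRACEIOC_GO");
 *
 *	for (i = 0; i < ncpus; i++) {
 *		bzero(&desc, sizeof (desc));
 *		desc.dtbd_cpu = i;
 *		desc.dtbd_data = data;
 *		if (ioctl(fd, DTRACEIOC_BUFSNAP, &desc) == 0)
 *			process_records(data, desc.dtbd_size);
 *	}
 *
 *	(void) ioctl(fd, DTRACEIOC_STOP, &cpu);
 */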

/*ARGSUSED*/
static int
dtrace_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	dtrace_state_t *state;

	switch (cmd) {
	case DDI_DETACH:
		break;

	case DDI_SUSPEND:
		return (DDI_SUCCESS);

	default:
		return (DDI_FAILURE);
	}

	mutex_enter(&cpu_lock);
	mutex_enter(&dtrace_provider_lock);
	mutex_enter(&dtrace_lock);

	ASSERT(dtrace_opens == 0);

	if (dtrace_helpers > 0) {
		mutex_exit(&dtrace_provider_lock);
		mutex_exit(&dtrace_lock);
		mutex_exit(&cpu_lock);
		return (DDI_FAILURE);
	}

	if (dtrace_unregister((dtrace_provider_id_t)dtrace_provider) != 0) {
		mutex_exit(&dtrace_provider_lock);
		mutex_exit(&dtrace_lock);
		mutex_exit(&cpu_lock);
		return (DDI_FAILURE);
	}

	dtrace_provider = NULL;

	if ((state = dtrace_anon_grab()) != NULL) {
		/*
		 * If there were ECBs on this state, the provider should
		 * have not been allowed to detach; assert that there is
		 * none.
		 */
		ASSERT(state->dts_necbs == 0);
		dtrace_state_destroy(state);

		/*
		 * If we're being detached with anonymous state, we need to
		 * indicate to the kernel debugger that DTrace is now inactive.
		 */
		(void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
	}

	bzero(&dtrace_anon, sizeof (dtrace_anon_t));
	unregister_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
	dtrace_cpu_init = NULL;
	dtrace_helpers_cleanup = NULL;
	dtrace_helpers_fork = NULL;
	dtrace_cpustart_init = NULL;
	dtrace_cpustart_fini = NULL;
	dtrace_debugger_init = NULL;
	dtrace_debugger_fini = NULL;
	dtrace_modload = NULL;
	dtrace_modunload = NULL;

	ASSERT(dtrace_getf == 0);
	ASSERT(dtrace_closef == NULL);

	mutex_exit(&cpu_lock);

	if (dtrace_helptrace_enabled) {
		kmem_free(dtrace_helptrace_buffer, dtrace_helptrace_bufsize);
		dtrace_helptrace_buffer = NULL;
	}

	kmem_free(dtrace_probes, dtrace_nprobes * sizeof (dtrace_probe_t *));
	dtrace_probes = NULL;
	dtrace_nprobes = 0;

	dtrace_hash_destroy(dtrace_bymod);
	dtrace_hash_destroy(dtrace_byfunc);
	dtrace_hash_destroy(dtrace_byname);
	dtrace_bymod = NULL;
	dtrace_byfunc = NULL;
	dtrace_byname = NULL;

	kmem_cache_destroy(dtrace_state_cache);
	vmem_destroy(dtrace_minor);
	vmem_destroy(dtrace_arena);

	if (dtrace_toxrange != NULL) {
		kmem_free(dtrace_toxrange,
		    dtrace_toxranges_max * sizeof (dtrace_toxrange_t));
		dtrace_toxrange = NULL;
		dtrace_toxranges = 0;
		dtrace_toxranges_max = 0;
	}

	ddi_remove_minor_node(dtrace_devi, NULL);
	dtrace_devi = NULL;

	ddi_soft_state_fini(&dtrace_softstate);

	ASSERT(dtrace_vtime_references == 0);
	ASSERT(dtrace_opens == 0);
	ASSERT(dtrace_retained == NULL);

	mutex_exit(&dtrace_lock);
	mutex_exit(&dtrace_provider_lock);

	/*
	 * We don't destroy the task queue until after we have dropped our
	 * locks (taskq_destroy() may block on running tasks). To prevent
	 * attempting to do work after we have effectively detached but before
	 * the task queue has been destroyed, all tasks dispatched via the
	 * task queue must check that DTrace is still attached before
	 * performing any operation.
	 */
	taskq_destroy(dtrace_taskq);
	dtrace_taskq = NULL;

	return (DDI_SUCCESS);
}
#endif

#if defined(sun)
/*ARGSUSED*/
static int
dtrace_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
	int error;

	switch (infocmd) {
	case DDI_INFO_DEVT2DEVINFO:
		*result = (void *)dtrace_devi;
		error = DDI_SUCCESS;
		break;
	case DDI_INFO_DEVT2INSTANCE:
		*result = (void *)0;
		error = DDI_SUCCESS;
		break;
	default:
		error = DDI_FAILURE;
	}
	return (error);
}
#endif

#if defined(sun)
static struct cb_ops dtrace_cb_ops = {
	dtrace_open,		/* open */
	dtrace_close,		/* close */
	nulldev,		/* strategy */
	nulldev,		/* print */
	nodev,			/* dump */
	nodev,			/* read */
	nodev,			/* write */
	dtrace_ioctl,		/* ioctl */
	nodev,			/* devmap */
	nodev,			/* mmap */
	nodev,			/* segmap */
	nochpoll,		/* poll */
	ddi_prop_op,		/* cb_prop_op */
	0,			/* streamtab */
	D_NEW | D_MP		/* Driver compatibility flag */
};

static struct dev_ops dtrace_ops = {
	DEVO_REV,		/* devo_rev */
	0,			/* refcnt */
	dtrace_info,		/* get_dev_info */
	nulldev,		/* identify */
	nulldev,		/* probe */
	dtrace_attach,		/* attach */
	dtrace_detach,		/* detach */
	nodev,			/* reset */
	&dtrace_cb_ops,		/* driver operations */
	NULL,			/* bus operations */
	nodev			/* dev power */
};

static struct modldrv modldrv = {
	&mod_driverops,		/* module type (this is a pseudo driver) */
	"Dynamic Tracing",	/* name of module */
	&dtrace_ops,		/* driver ops */
};

static struct modlinkage modlinkage = {
	MODREV_1,
	(void *)&modldrv,
	NULL
};

int
_init(void)
{
	return (mod_install(&modlinkage));
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

int
_fini(void)
{
	return (mod_remove(&modlinkage));
}
#else

static d_ioctl_t	dtrace_ioctl;
static d_ioctl_t	dtrace_ioctl_helper;
static void		dtrace_load(void *);
static int		dtrace_unload(void);
#if __FreeBSD_version < 800039
static void		dtrace_clone(void *, struct ucred *, char *, int, struct cdev **);
static struct clonedevs	*dtrace_clones;	/* Ptr to the array of cloned devices. */
static eventhandler_tag	eh_tag;		/* Event handler tag. */
#else
static struct cdev	*dtrace_dev;
static struct cdev	*helper_dev;
#endif

void dtrace_invop_init(void);
void dtrace_invop_uninit(void);

static struct cdevsw dtrace_cdevsw = {
	.d_version	= D_VERSION,
#if __FreeBSD_version < 800039
	.d_flags	= D_TRACKCLOSE | D_NEEDMINOR,
	.d_close	= dtrace_close,
#endif
	.d_ioctl	= dtrace_ioctl,
	.d_open		= dtrace_open,
	.d_name		= "dtrace",
};

static struct cdevsw helper_cdevsw = {
	.d_version	= D_VERSION,
	.d_ioctl	= dtrace_ioctl_helper,
	.d_name		= "helper",
};

#include <dtrace_anon.c>
#if __FreeBSD_version < 800039
#include <dtrace_clone.c>
#endif
#include <dtrace_ioctl.c>
#include <dtrace_load.c>
#include <dtrace_modevent.c>
#include <dtrace_sysctl.c>
#include <dtrace_unload.c>
#include <dtrace_vtime.c>
#include <dtrace_hacks.c>
#include <dtrace_isa.c>

SYSINIT(dtrace_load, SI_SUB_DTRACE, SI_ORDER_FIRST, dtrace_load, NULL);
SYSUNINIT(dtrace_unload, SI_SUB_DTRACE, SI_ORDER_FIRST, dtrace_unload, NULL);
SYSINIT(dtrace_anon_init, SI_SUB_DTRACE_ANON, SI_ORDER_FIRST, dtrace_anon_init, NULL);

DEV_MODULE(dtrace, dtrace_modevent, NULL);
MODULE_VERSION(dtrace, 1);
MODULE_DEPEND(dtrace, cyclic, 1, 1, 1);
MODULE_DEPEND(dtrace, opensolaris, 1, 1, 1);
#endif