/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 *
 * $FreeBSD: head/sys/cddl/dev/dtrace/i386/dtrace_subr.c 238552 2012-07-17 14:36:40Z gnn $
 *
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2011, Joyent, Inc. All rights reserved.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/cpuset.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/kmem.h>
#include <sys/smp.h>
#include <sys/dtrace_impl.h>
#include <sys/dtrace_bsd.h>
#include <machine/clock.h>
#include <machine/frame.h>
#include <vm/pmap.h>

extern uintptr_t	kernelbase;
extern uintptr_t	dtrace_in_probe_addr;
extern int		dtrace_in_probe;

extern void dtrace_getnanotime(struct timespec *tsp);

int dtrace_invop(uintptr_t, uintptr_t *, uintptr_t);

typedef struct dtrace_invop_hdlr {
	int (*dtih_func)(uintptr_t, uintptr_t *, uintptr_t);
	struct dtrace_invop_hdlr *dtih_next;
} dtrace_invop_hdlr_t;

dtrace_invop_hdlr_t *dtrace_invop_hdlr;

int
dtrace_invop(uintptr_t addr, uintptr_t *stack, uintptr_t eax)
{
	dtrace_invop_hdlr_t *hdlr;
	int rval;

	for (hdlr = dtrace_invop_hdlr; hdlr != NULL; hdlr = hdlr->dtih_next)
		if ((rval = hdlr->dtih_func(addr, stack, eax)) != 0)
			return (rval);

	return (0);
}

void
dtrace_invop_add(int (*func)(uintptr_t, uintptr_t *, uintptr_t))
{
	dtrace_invop_hdlr_t *hdlr;

	hdlr = kmem_alloc(sizeof (dtrace_invop_hdlr_t), KM_SLEEP);
	hdlr->dtih_func = func;
	hdlr->dtih_next = dtrace_invop_hdlr;
	dtrace_invop_hdlr = hdlr;
}

void
dtrace_invop_remove(int (*func)(uintptr_t, uintptr_t *, uintptr_t))
{
	dtrace_invop_hdlr_t *hdlr = dtrace_invop_hdlr, *prev = NULL;

	for (;;) {
		if (hdlr == NULL)
			panic("attempt to remove non-existent invop handler");

		if (hdlr->dtih_func == func)
			break;

		prev = hdlr;
		hdlr = hdlr->dtih_next;
	}

	if (prev == NULL) {
		ASSERT(dtrace_invop_hdlr == hdlr);
		dtrace_invop_hdlr = hdlr->dtih_next;
	} else {
		ASSERT(dtrace_invop_hdlr != hdlr);
		prev->dtih_next = hdlr->dtih_next;
	}

	kmem_free(hdlr, 0);
}
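
/*
 * Illustrative sketch (not part of the original file): a provider that
 * patches instructions, such as fbt, is expected to hook into the chain
 * above roughly as follows.  The handler name and pseudo-checks here are
 * hypothetical placeholders.
 *
 *	static int
 *	example_invop(uintptr_t addr, uintptr_t *stack, uintptr_t eax)
 *	{
 *		if (addr is one of our patched probe sites)
 *			return (the matching DTRACE_INVOP_* code);
 *		return (0);	// not ours; let the next handler look
 *	}
 *
 *	dtrace_invop_add(example_invop);	// at provider load
 *	...
 *	dtrace_invop_remove(example_invop);	// at provider unload
 */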

void
dtrace_toxic_ranges(void (*func)(uintptr_t base, uintptr_t limit))
{
	(*func)(0, kernelbase);
}

void
dtrace_xcall(processorid_t cpu, dtrace_xcall_t func, void *arg)
{
	cpuset_t cpus;

	if (cpu == DTRACE_CPUALL)
		cpus = all_cpus;
	else
		CPU_SETOF(cpu, &cpus);

	smp_rendezvous_cpus(cpus, smp_no_rendevous_barrier, func,
	    smp_no_rendevous_barrier, arg);
}

static void
dtrace_sync_func(void)
{
}

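/*
 * dtrace_sync() rendezvouses every CPU on the empty function above.
 * Since probe context runs with interrupts disabled, a CPU can only
 * take the rendezvous IPI once it is out of probe context, so by the
 * time the cross-call returns, any probe that was executing when
 * dtrace_sync() was called has completed.
 */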
void
dtrace_sync(void)
{
	dtrace_xcall(DTRACE_CPUALL, (dtrace_xcall_t)dtrace_sync_func, NULL);
}

#ifdef notyet
int (*dtrace_fasttrap_probe_ptr)(struct regs *);
int (*dtrace_pid_probe_ptr)(struct regs *);
int (*dtrace_return_probe_ptr)(struct regs *);

void
dtrace_user_probe(struct regs *rp, caddr_t addr, processorid_t cpuid)
{
	krwlock_t *rwp;
	proc_t *p = curproc;
	extern void trap(struct regs *, caddr_t, processorid_t);

	if (USERMODE(rp->r_cs) || (rp->r_ps & PS_VM)) {
		if (curthread->t_cred != p->p_cred) {
			cred_t *oldcred = curthread->t_cred;
			/*
			 * DTrace accesses t_cred in probe context.  t_cred
			 * must always be either NULL, or point to a valid,
			 * allocated cred structure.
			 */
			curthread->t_cred = crgetcred();
			crfree(oldcred);
		}
	}

	if (rp->r_trapno == T_DTRACE_RET) {
		uint8_t step = curthread->t_dtrace_step;
		uint8_t ret = curthread->t_dtrace_ret;
		uintptr_t npc = curthread->t_dtrace_npc;

		if (curthread->t_dtrace_ast) {
			aston(curthread);
			curthread->t_sig_check = 1;
		}

		/*
		 * Clear all user tracing flags.
		 */
		curthread->t_dtrace_ft = 0;

		/*
		 * If we weren't expecting to take a return probe trap, kill
		 * the process as though it had just executed an unassigned
		 * trap instruction.
		 */
		if (step == 0) {
			tsignal(curthread, SIGILL);
			return;
		}

		/*
		 * If we hit this trap unrelated to a return probe, we're
		 * just here to reset the AST flag since we deferred a signal
		 * until after we logically single-stepped the instruction we
		 * copied out.
		 */
		if (ret == 0) {
			rp->r_pc = npc;
			return;
		}

		/*
		 * We need to wait until after we've called the
		 * dtrace_return_probe_ptr function pointer to set %pc.
		 */
		rwp = &CPU->cpu_ft_lock;
		rw_enter(rwp, RW_READER);
		if (dtrace_return_probe_ptr != NULL)
			(void) (*dtrace_return_probe_ptr)(rp);
		rw_exit(rwp);
		rp->r_pc = npc;

	} else if (rp->r_trapno == T_DTRACE_PROBE) {
		rwp = &CPU->cpu_ft_lock;
		rw_enter(rwp, RW_READER);
		if (dtrace_fasttrap_probe_ptr != NULL)
			(void) (*dtrace_fasttrap_probe_ptr)(rp);
		rw_exit(rwp);

	} else if (rp->r_trapno == T_BPTFLT) {
		uint8_t instr;
		rwp = &CPU->cpu_ft_lock;

		/*
		 * The DTrace fasttrap provider uses the breakpoint trap
		 * (int 3). We let DTrace take the first crack at handling
		 * this trap; if it's not a probe that DTrace knows about,
		 * we call into the trap() routine to handle it like a
		 * breakpoint placed by a conventional debugger.
		 */
		rw_enter(rwp, RW_READER);
		if (dtrace_pid_probe_ptr != NULL &&
		    (*dtrace_pid_probe_ptr)(rp) == 0) {
			rw_exit(rwp);
			return;
		}
		rw_exit(rwp);

		/*
		 * If the instruction that caused the breakpoint trap doesn't
		 * look like an int 3 anymore, it may be that this tracepoint
		 * was removed just after the user thread executed it. In
		 * that case, return to user land to retry the instruction.
		 */
		if (fuword8((void *)(rp->r_pc - 1), &instr) == 0 &&
		    instr != FASTTRAP_INSTR) {
			rp->r_pc--;
			return;
		}

		trap(rp, addr, cpuid);

	} else {
		trap(rp, addr, cpuid);
	}
}

void
dtrace_safe_synchronous_signal(void)
{
	kthread_t *t = curthread;
	struct regs *rp = lwptoregs(ttolwp(t));
	size_t isz = t->t_dtrace_npc - t->t_dtrace_pc;

	ASSERT(t->t_dtrace_on);

	/*
	 * If we're not in the range of scratch addresses, we're not actually
	 * tracing user instructions so turn off the flags. If the instruction
	 * we copied out caused a synchronous trap, reset the pc back to its
	 * original value and turn off the flags.
	 */
	if (rp->r_pc < t->t_dtrace_scrpc ||
	    rp->r_pc > t->t_dtrace_astpc + isz) {
		t->t_dtrace_ft = 0;
	} else if (rp->r_pc == t->t_dtrace_scrpc ||
	    rp->r_pc == t->t_dtrace_astpc) {
		rp->r_pc = t->t_dtrace_pc;
		t->t_dtrace_ft = 0;
	}
}

int
dtrace_safe_defer_signal(void)
{
	kthread_t *t = curthread;
	struct regs *rp = lwptoregs(ttolwp(t));
	size_t isz = t->t_dtrace_npc - t->t_dtrace_pc;

	ASSERT(t->t_dtrace_on);

	/*
	 * If we're not in the range of scratch addresses, we're not actually
	 * tracing user instructions so turn off the flags.
	 */
	if (rp->r_pc < t->t_dtrace_scrpc ||
	    rp->r_pc > t->t_dtrace_astpc + isz) {
		t->t_dtrace_ft = 0;
		return (0);
	}

	/*
	 * If we have executed the original instruction, but we have performed
	 * neither the jmp back to t->t_dtrace_npc nor the clean up of any
	 * registers used to emulate %rip-relative instructions in 64-bit mode,
	 * we'll save ourselves some effort by doing that here and taking the
	 * signal right away.  We detect this condition by seeing if the program
	 * counter is in the range [scrpc + isz, astpc).
	 */
	if (rp->r_pc >= t->t_dtrace_scrpc + isz &&
	    rp->r_pc < t->t_dtrace_astpc) {
#ifdef __amd64
		/*
		 * If there is a scratch register and we're on the
		 * instruction immediately after the modified instruction,
		 * restore the value of that scratch register.
		 */
		if (t->t_dtrace_reg != 0 &&
		    rp->r_pc == t->t_dtrace_scrpc + isz) {
			switch (t->t_dtrace_reg) {
			case REG_RAX:
				rp->r_rax = t->t_dtrace_regv;
				break;
			case REG_RCX:
				rp->r_rcx = t->t_dtrace_regv;
				break;
			case REG_R8:
				rp->r_r8 = t->t_dtrace_regv;
				break;
			case REG_R9:
				rp->r_r9 = t->t_dtrace_regv;
				break;
			}
		}
#endif
		rp->r_pc = t->t_dtrace_npc;
		t->t_dtrace_ft = 0;
		return (0);
	}

	/*
	 * Otherwise, make sure we'll return to the kernel after executing
	 * the copied out instruction and defer the signal.
	 */
	if (!t->t_dtrace_step) {
		ASSERT(rp->r_pc < t->t_dtrace_astpc);
		rp->r_pc += t->t_dtrace_astpc - t->t_dtrace_scrpc;
		t->t_dtrace_step = 1;
	}

	t->t_dtrace_ast = 1;

	return (1);
}
#endif

static int64_t	tgt_cpu_tsc;
static int64_t	hst_cpu_tsc;
static int64_t	tsc_skew[MAXCPU];
static uint64_t	nsec_scale;

/* See below for the explanation of this macro. */
#define SCALE_SHIFT	28

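/*
 * Rendezvous handler used by dtrace_gethrtime_init() below: the
 * reference CPU and one target CPU run it at (nearly) the same time,
 * each recording its own TSC.  The target CPU (the one whose id was
 * passed in arg) stores into tgt_cpu_tsc and the reference CPU into
 * hst_cpu_tsc; their difference approximates the target's TSC skew
 * relative to the reference.
 */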
static void
dtrace_gethrtime_init_cpu(void *arg)
{
	uintptr_t cpu = (uintptr_t) arg;

	if (cpu == curcpu)
		tgt_cpu_tsc = rdtsc();
	else
		hst_cpu_tsc = rdtsc();
}

static void
dtrace_gethrtime_init(void *arg)
{
	cpuset_t map;
	struct pcpu *pc;
	uint64_t tsc_f;
	int i;

	/*
	 * Get the TSC frequency known at this moment.
	 * This should be constant if the TSC is invariant.
	 * Otherwise the tick->time conversion will be inaccurate, but
	 * it will preserve the monotonic property of the TSC.
	 */
	tsc_f = atomic_load_acq_64(&tsc_freq);

	/*
	 * The following line checks that the nsec_scale value calculated
	 * below doesn't overflow a 32-bit unsigned integer, so that it can
	 * be multiplied by another 32-bit integer without overflowing
	 * 64 bits.  Thus the minimum supported TSC frequency is 62.5MHz.
	 */
	KASSERT(tsc_f > (NANOSEC >> (32 - SCALE_SHIFT)),
	    ("TSC frequency is too low"));

	/*
	 * We scale up the NANOSEC/tsc_f ratio to preserve as much precision
	 * as possible.
	 * The 2^28 factor was chosen quite arbitrarily from practical
	 * considerations:
	 * - it supports TSC frequencies as low as 62.5MHz (see above);
	 * - it provides quite good precision (e < 0.01%) up to THz
	 *   (terahertz) values.
	 */
	nsec_scale = ((uint64_t)NANOSEC << SCALE_SHIFT) / tsc_f;
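
	/*
	 * Worked example with illustrative numbers: for tsc_f = 2GHz,
	 * nsec_scale = (10^9 << 28) / (2 * 10^9) = 2^27, so a TSC delta
	 * converts to nanoseconds as (delta * 2^27) >> 28 = delta / 2,
	 * as expected for a counter ticking twice per nanosecond.
	 */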

	/* The current CPU is the reference one. */
	sched_pin();
	tsc_skew[curcpu] = 0;
	CPU_FOREACH(i) {
		if (i == curcpu)
			continue;

		pc = pcpu_find(i);
		CPU_SETOF(PCPU_GET(cpuid), &map);
		CPU_SET(pc->pc_cpuid, &map);

		smp_rendezvous_cpus(map, NULL,
		    dtrace_gethrtime_init_cpu,
		    smp_no_rendevous_barrier, (void *)(uintptr_t) i);

		tsc_skew[i] = tgt_cpu_tsc - hst_cpu_tsc;
	}
	sched_unpin();
}

SYSINIT(dtrace_gethrtime_init, SI_SUB_SMP, SI_ORDER_ANY,
    dtrace_gethrtime_init, NULL);

/*
 * DTrace needs a high resolution time function which can
 * be called from probe context and is guaranteed not to be
 * instrumented with probes itself.
 *
 * Returns nanoseconds since boot.
 */
uint64_t
dtrace_gethrtime(void)
{
	uint64_t tsc;
	uint32_t lo;
	uint32_t hi;

	/*
	 * We split the TSC value into its lower and higher 32-bit halves and
	 * separately scale them with nsec_scale, then we scale them down by
	 * 2^28 (see the nsec_scale calculations above), taking into account
	 * the 32-bit shift of the higher half, and finally add them.
	 */
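	/*
	 * That is, with tsc = hi * 2^32 + lo, the result is
	 * (tsc * nsec_scale) >> SCALE_SHIFT, computed as
	 * ((lo * nsec_scale) >> 28) + ((hi * nsec_scale * 2^32) >> 28),
	 * where the second term simplifies to
	 * (hi * nsec_scale) << (32 - 28).
	 */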
	tsc = rdtsc() - tsc_skew[curcpu];
	lo = tsc;
	hi = tsc >> 32;
	return (((lo * nsec_scale) >> SCALE_SHIFT) +
	    ((hi * nsec_scale) << (32 - SCALE_SHIFT)));
}

uint64_t
dtrace_gethrestime(void)
{
	struct timespec current_time;

	dtrace_getnanotime(&current_time);

	return (current_time.tv_sec * 1000000000ULL + current_time.tv_nsec);
}

/* Function to handle DTrace traps during probes. See i386/i386/trap.c. */
int
dtrace_trap(struct trapframe *frame, u_int type)
{
	/*
	 * A trap can occur while DTrace executes a probe. Before
	 * executing the probe, DTrace blocks re-scheduling and sets
	 * a flag in its per-cpu flags to indicate that it doesn't
	 * want to fault. On returning from the probe, the no-fault
	 * flag is cleared and finally re-scheduling is enabled.
	 *
	 * Check if DTrace has enabled 'no-fault' mode:
	 */
	if ((cpu_core[curcpu].cpuc_dtrace_flags & CPU_DTRACE_NOFAULT) != 0) {
		/*
		 * There are only a couple of trap types that are expected.
		 * All the rest will be handled in the usual way.
		 */
		switch (type) {
		/* General protection fault. */
		case T_PROTFLT:
			/* Flag an illegal operation. */
			cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;

			/*
			 * Offset the instruction pointer to the instruction
			 * following the one causing the fault.
			 */
			frame->tf_eip += dtrace_instr_size((u_char *) frame->tf_eip);
			return (1);
		/* Page fault. */
		case T_PAGEFLT:
			/* Flag a bad address. */
			cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_BADADDR;
			cpu_core[curcpu].cpuc_dtrace_illval = rcr2();

			/*
			 * Offset the instruction pointer to the instruction
			 * following the one causing the fault.
			 */
			frame->tf_eip += dtrace_instr_size((u_char *) frame->tf_eip);
			return (1);
		default:
			/* Handle all other traps in the usual way. */
			break;
		}
	}

	/* Handle the trap in the usual way. */
	return (0);
}
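
/*
 * Illustrative sketch (not verbatim from the kernel): trap() in
 * i386/i386/trap.c reaches dtrace_trap() through the dtrace_trap_func
 * hook declared in <sys/dtrace_bsd.h>, conceptually:
 *
 *	if (dtrace_trap_func != NULL && (*dtrace_trap_func)(frame, type))
 *		return;		// trap fully handled by DTrace
 *
 * See the actual call site in i386/i386/trap.c for details.
 */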
526