dtrace_subr.c revision 297770
1179237Sjb/*
2179237Sjb * CDDL HEADER START
3179237Sjb *
4179237Sjb * The contents of this file are subject to the terms of the
5179237Sjb * Common Development and Distribution License, Version 1.0 only
6179237Sjb * (the "License").  You may not use this file except in compliance
7179237Sjb * with the License.
8179237Sjb *
9179237Sjb * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10179237Sjb * or http://www.opensolaris.org/os/licensing.
11179237Sjb * See the License for the specific language governing permissions
12179237Sjb * and limitations under the License.
13179237Sjb *
14179237Sjb * When distributing Covered Code, include this CDDL HEADER in each
15179237Sjb * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16179237Sjb * If applicable, add the following below this CDDL HEADER, with the
17179237Sjb * fields enclosed by brackets "[]" replaced with your own identifying
18179237Sjb * information: Portions Copyright [yyyy] [name of copyright owner]
19179237Sjb *
20179237Sjb * CDDL HEADER END
21179237Sjb *
22179237Sjb * $FreeBSD: head/sys/cddl/dev/dtrace/amd64/dtrace_subr.c 297770 2016-04-10 01:23:39Z markj $
23179237Sjb *
24179237Sjb */
25179237Sjb/*
26179237Sjb * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
27179237Sjb * Use is subject to license terms.
28179237Sjb */
29179237Sjb
30236567Sgnn/*
31236567Sgnn * Copyright (c) 2011, Joyent, Inc. All rights reserved.
32236567Sgnn */
33236567Sgnn
34179237Sjb#include <sys/param.h>
35179237Sjb#include <sys/systm.h>
36179237Sjb#include <sys/types.h>
37179237Sjb#include <sys/kernel.h>
38179237Sjb#include <sys/malloc.h>
39179237Sjb#include <sys/kmem.h>
40179237Sjb#include <sys/smp.h>
41179237Sjb#include <sys/dtrace_impl.h>
42179237Sjb#include <sys/dtrace_bsd.h>
43179237Sjb#include <machine/clock.h>
44179237Sjb#include <machine/frame.h>
45179237Sjb#include <vm/pmap.h>
46179237Sjb
47238537Sgnnextern void dtrace_getnanotime(struct timespec *tsp);
48238537Sgnn
/* Dispatcher defined below; offers a trap to each registered handler. */
int dtrace_invop(uintptr_t addr, uintptr_t *stack, uintptr_t eax);

/*
 * One node of the singly-linked list of invop handlers.  dtrace_invop()
 * walks the list from dtrace_invop_hdlr and calls dtih_func on each node.
 */
struct dtrace_invop_hdlr {
	/* Handler callback; a non-zero return stops the list walk. */
	int (*dtih_func)(uintptr_t, uintptr_t *, uintptr_t);
	/* Next node in the list, or NULL at the tail. */
	struct dtrace_invop_hdlr *dtih_next;
};
typedef struct dtrace_invop_hdlr dtrace_invop_hdlr_t;

/* Head of the handler list; NULL while no handler is registered. */
dtrace_invop_hdlr_t *dtrace_invop_hdlr;
57179237Sjb
58179237Sjbint
59179237Sjbdtrace_invop(uintptr_t addr, uintptr_t *stack, uintptr_t eax)
60179237Sjb{
61179237Sjb	dtrace_invop_hdlr_t *hdlr;
62179237Sjb	int rval;
63179237Sjb
64179237Sjb	for (hdlr = dtrace_invop_hdlr; hdlr != NULL; hdlr = hdlr->dtih_next)
65179237Sjb		if ((rval = hdlr->dtih_func(addr, stack, eax)) != 0)
66179237Sjb			return (rval);
67179237Sjb
68179237Sjb	return (0);
69179237Sjb}
70179237Sjb
71179237Sjbvoid
72179237Sjbdtrace_invop_add(int (*func)(uintptr_t, uintptr_t *, uintptr_t))
73179237Sjb{
74179237Sjb	dtrace_invop_hdlr_t *hdlr;
75179237Sjb
76179237Sjb	hdlr = kmem_alloc(sizeof (dtrace_invop_hdlr_t), KM_SLEEP);
77179237Sjb	hdlr->dtih_func = func;
78179237Sjb	hdlr->dtih_next = dtrace_invop_hdlr;
79179237Sjb	dtrace_invop_hdlr = hdlr;
80179237Sjb}
81179237Sjb
82179237Sjbvoid
83179237Sjbdtrace_invop_remove(int (*func)(uintptr_t, uintptr_t *, uintptr_t))
84179237Sjb{
85179237Sjb	dtrace_invop_hdlr_t *hdlr = dtrace_invop_hdlr, *prev = NULL;
86179237Sjb
87179237Sjb	for (;;) {
88179237Sjb		if (hdlr == NULL)
89179237Sjb			panic("attempt to remove non-existent invop handler");
90179237Sjb
91179237Sjb		if (hdlr->dtih_func == func)
92179237Sjb			break;
93179237Sjb
94179237Sjb		prev = hdlr;
95179237Sjb		hdlr = hdlr->dtih_next;
96179237Sjb	}
97179237Sjb
98179237Sjb	if (prev == NULL) {
99179237Sjb		ASSERT(dtrace_invop_hdlr == hdlr);
100179237Sjb		dtrace_invop_hdlr = hdlr->dtih_next;
101179237Sjb	} else {
102179237Sjb		ASSERT(dtrace_invop_hdlr != hdlr);
103179237Sjb		prev->dtih_next = hdlr->dtih_next;
104179237Sjb	}
105179237Sjb
106179237Sjb	kmem_free(hdlr, 0);
107179237Sjb}
108179237Sjb
109179237Sjb/*ARGSUSED*/
110179237Sjbvoid
111179237Sjbdtrace_toxic_ranges(void (*func)(uintptr_t base, uintptr_t limit))
112179237Sjb{
113179237Sjb	(*func)(0, (uintptr_t) addr_PTmap);
114179237Sjb}
115179237Sjb
116179237Sjbvoid
117179237Sjbdtrace_xcall(processorid_t cpu, dtrace_xcall_t func, void *arg)
118179237Sjb{
119222813Sattilio	cpuset_t cpus;
120179237Sjb
121179237Sjb	if (cpu == DTRACE_CPUALL)
122179237Sjb		cpus = all_cpus;
123179237Sjb	else
124222813Sattilio		CPU_SETOF(cpu, &cpus);
125179237Sjb
126216251Savg	smp_rendezvous_cpus(cpus, smp_no_rendevous_barrier, func,
127216251Savg	    smp_no_rendevous_barrier, arg);
128179237Sjb}
129179237Sjb
/* Empty cross-call body used by dtrace_sync(). */
static void
dtrace_sync_func(void)
{
	/* Intentionally empty. */
}
134179237Sjb
135179237Sjbvoid
136179237Sjbdtrace_sync(void)
137179237Sjb{
138179237Sjb        dtrace_xcall(DTRACE_CPUALL, (dtrace_xcall_t)dtrace_sync_func, NULL);
139179237Sjb}
140179237Sjb
141179237Sjb#ifdef notyet
142179237Sjbvoid
143179237Sjbdtrace_safe_synchronous_signal(void)
144179237Sjb{
145179237Sjb	kthread_t *t = curthread;
146179237Sjb	struct regs *rp = lwptoregs(ttolwp(t));
147179237Sjb	size_t isz = t->t_dtrace_npc - t->t_dtrace_pc;
148179237Sjb
149179237Sjb	ASSERT(t->t_dtrace_on);
150179237Sjb
151179237Sjb	/*
152179237Sjb	 * If we're not in the range of scratch addresses, we're not actually
153179237Sjb	 * tracing user instructions so turn off the flags. If the instruction
154179237Sjb	 * we copied out caused a synchonous trap, reset the pc back to its
155179237Sjb	 * original value and turn off the flags.
156179237Sjb	 */
157179237Sjb	if (rp->r_pc < t->t_dtrace_scrpc ||
158179237Sjb	    rp->r_pc > t->t_dtrace_astpc + isz) {
159179237Sjb		t->t_dtrace_ft = 0;
160179237Sjb	} else if (rp->r_pc == t->t_dtrace_scrpc ||
161179237Sjb	    rp->r_pc == t->t_dtrace_astpc) {
162179237Sjb		rp->r_pc = t->t_dtrace_pc;
163179237Sjb		t->t_dtrace_ft = 0;
164179237Sjb	}
165179237Sjb}
166179237Sjb
167179237Sjbint
168179237Sjbdtrace_safe_defer_signal(void)
169179237Sjb{
170179237Sjb	kthread_t *t = curthread;
171179237Sjb	struct regs *rp = lwptoregs(ttolwp(t));
172179237Sjb	size_t isz = t->t_dtrace_npc - t->t_dtrace_pc;
173179237Sjb
174179237Sjb	ASSERT(t->t_dtrace_on);
175179237Sjb
176179237Sjb	/*
177179237Sjb	 * If we're not in the range of scratch addresses, we're not actually
178179237Sjb	 * tracing user instructions so turn off the flags.
179179237Sjb	 */
180179237Sjb	if (rp->r_pc < t->t_dtrace_scrpc ||
181179237Sjb	    rp->r_pc > t->t_dtrace_astpc + isz) {
182179237Sjb		t->t_dtrace_ft = 0;
183179237Sjb		return (0);
184179237Sjb	}
185179237Sjb
186179237Sjb	/*
187236567Sgnn	 * If we have executed the original instruction, but we have performed
188236567Sgnn	 * neither the jmp back to t->t_dtrace_npc nor the clean up of any
189236567Sgnn	 * registers used to emulate %rip-relative instructions in 64-bit mode,
190236567Sgnn	 * we'll save ourselves some effort by doing that here and taking the
191236567Sgnn	 * signal right away.  We detect this condition by seeing if the program
192236567Sgnn	 * counter is the range [scrpc + isz, astpc).
193179237Sjb	 */
194236567Sgnn	if (rp->r_pc >= t->t_dtrace_scrpc + isz &&
195236567Sgnn	    rp->r_pc < t->t_dtrace_astpc) {
196179237Sjb#ifdef __amd64
197179237Sjb		/*
198179237Sjb		 * If there is a scratch register and we're on the
199179237Sjb		 * instruction immediately after the modified instruction,
200179237Sjb		 * restore the value of that scratch register.
201179237Sjb		 */
202179237Sjb		if (t->t_dtrace_reg != 0 &&
203179237Sjb		    rp->r_pc == t->t_dtrace_scrpc + isz) {
204179237Sjb			switch (t->t_dtrace_reg) {
205179237Sjb			case REG_RAX:
206179237Sjb				rp->r_rax = t->t_dtrace_regv;
207179237Sjb				break;
208179237Sjb			case REG_RCX:
209179237Sjb				rp->r_rcx = t->t_dtrace_regv;
210179237Sjb				break;
211179237Sjb			case REG_R8:
212179237Sjb				rp->r_r8 = t->t_dtrace_regv;
213179237Sjb				break;
214179237Sjb			case REG_R9:
215179237Sjb				rp->r_r9 = t->t_dtrace_regv;
216179237Sjb				break;
217179237Sjb			}
218179237Sjb		}
219179237Sjb#endif
220179237Sjb		rp->r_pc = t->t_dtrace_npc;
221179237Sjb		t->t_dtrace_ft = 0;
222179237Sjb		return (0);
223179237Sjb	}
224179237Sjb
225179237Sjb	/*
226179237Sjb	 * Otherwise, make sure we'll return to the kernel after executing
227179237Sjb	 * the copied out instruction and defer the signal.
228179237Sjb	 */
229179237Sjb	if (!t->t_dtrace_step) {
230179237Sjb		ASSERT(rp->r_pc < t->t_dtrace_astpc);
231179237Sjb		rp->r_pc += t->t_dtrace_astpc - t->t_dtrace_scrpc;
232179237Sjb		t->t_dtrace_step = 1;
233179237Sjb	}
234179237Sjb
235179237Sjb	t->t_dtrace_ast = 1;
236179237Sjb
237179237Sjb	return (1);
238179237Sjb}
239179237Sjb#endif
240179237Sjb
241179237Sjbstatic int64_t	tgt_cpu_tsc;
242179237Sjbstatic int64_t	hst_cpu_tsc;
243179237Sjbstatic int64_t	tsc_skew[MAXCPU];
244195710Savgstatic uint64_t	nsec_scale;
245179237Sjb
246195710Savg/* See below for the explanation of this macro. */
247195710Savg#define SCALE_SHIFT	28
248195710Savg
249297770Smarkj/*
250297770Smarkj * Get the frequency and scale factor as early as possible so that they can be
251297770Smarkj * used for boot-time tracing.
252297770Smarkj */
253179237Sjbstatic void
254297770Smarkjdtrace_gethrtime_init_early(void *arg)
255179237Sjb{
256195710Savg	uint64_t tsc_f;
257179237Sjb
258195710Savg	/*
259195710Savg	 * Get TSC frequency known at this moment.
260195710Savg	 * This should be constant if TSC is invariant.
261195710Savg	 * Otherwise tick->time conversion will be inaccurate, but
262195710Savg	 * will preserve monotonic property of TSC.
263195710Savg	 */
264220433Sjkim	tsc_f = atomic_load_acq_64(&tsc_freq);
265195710Savg
266195710Savg	/*
267195710Savg	 * The following line checks that nsec_scale calculated below
268195710Savg	 * doesn't overflow 32-bit unsigned integer, so that it can multiply
269195710Savg	 * another 32-bit integer without overflowing 64-bit.
270195710Savg	 * Thus minimum supported TSC frequency is 62.5MHz.
271195710Savg	 */
272297770Smarkj	KASSERT(tsc_f > (NANOSEC >> (32 - SCALE_SHIFT)),
273297770Smarkj	    ("TSC frequency is too low"));
274195710Savg
275195710Savg	/*
276195710Savg	 * We scale up NANOSEC/tsc_f ratio to preserve as much precision
277195710Savg	 * as possible.
278195710Savg	 * 2^28 factor was chosen quite arbitrarily from practical
279195710Savg	 * considerations:
280195710Savg	 * - it supports TSC frequencies as low as 62.5MHz (see above);
281195710Savg	 * - it provides quite good precision (e < 0.01%) up to THz
282195710Savg	 *   (terahertz) values;
283195710Savg	 */
284195710Savg	nsec_scale = ((uint64_t)NANOSEC << SCALE_SHIFT) / tsc_f;
285297770Smarkj}
286297770SmarkjSYSINIT(dtrace_gethrtime_init_early, SI_SUB_CPU, SI_ORDER_ANY,
287297770Smarkj    dtrace_gethrtime_init_early, NULL);
288195710Savg
289297770Smarkjstatic void
290297770Smarkjdtrace_gethrtime_init_cpu(void *arg)
291297770Smarkj{
292297770Smarkj	uintptr_t cpu = (uintptr_t) arg;
293297770Smarkj
294297770Smarkj	if (cpu == curcpu)
295297770Smarkj		tgt_cpu_tsc = rdtsc();
296297770Smarkj	else
297297770Smarkj		hst_cpu_tsc = rdtsc();
298297770Smarkj}
299297770Smarkj
300297770Smarkjstatic void
301297770Smarkjdtrace_gethrtime_init(void *arg)
302297770Smarkj{
303297770Smarkj	struct pcpu *pc;
304297770Smarkj	cpuset_t map;
305297770Smarkj	int i;
306297770Smarkj
307179237Sjb	/* The current CPU is the reference one. */
308216250Savg	sched_pin();
309179237Sjb	tsc_skew[curcpu] = 0;
310209059Sjhb	CPU_FOREACH(i) {
311179237Sjb		if (i == curcpu)
312179237Sjb			continue;
313179237Sjb
314216250Savg		pc = pcpu_find(i);
315223758Sattilio		CPU_SETOF(PCPU_GET(cpuid), &map);
316223758Sattilio		CPU_SET(pc->pc_cpuid, &map);
317179237Sjb
318221740Savg		smp_rendezvous_cpus(map, NULL,
319179237Sjb		    dtrace_gethrtime_init_cpu,
320179237Sjb		    smp_no_rendevous_barrier, (void *)(uintptr_t) i);
321179237Sjb
322179237Sjb		tsc_skew[i] = tgt_cpu_tsc - hst_cpu_tsc;
323179237Sjb	}
324216250Savg	sched_unpin();
325179237Sjb}
326297770SmarkjSYSINIT(dtrace_gethrtime_init, SI_SUB_SMP, SI_ORDER_ANY, dtrace_gethrtime_init,
327297770Smarkj    NULL);
328179237Sjb
329179237Sjb/*
330179237Sjb * DTrace needs a high resolution time function which can
331179237Sjb * be called from a probe context and guaranteed not to have
332179237Sjb * instrumented with probes itself.
333179237Sjb *
334179237Sjb * Returns nanoseconds since boot.
335179237Sjb */
336179237Sjbuint64_t
337179237Sjbdtrace_gethrtime()
338179237Sjb{
339195710Savg	uint64_t tsc;
340195710Savg	uint32_t lo;
341195710Savg	uint32_t hi;
342195710Savg
343195710Savg	/*
344195710Savg	 * We split TSC value into lower and higher 32-bit halves and separately
345195710Savg	 * scale them with nsec_scale, then we scale them down by 2^28
346195710Savg	 * (see nsec_scale calculations) taking into account 32-bit shift of
347195710Savg	 * the higher half and finally add.
348195710Savg	 */
349236566Szml	tsc = rdtsc() - tsc_skew[curcpu];
350195710Savg	lo = tsc;
351195710Savg	hi = tsc >> 32;
352195710Savg	return (((lo * nsec_scale) >> SCALE_SHIFT) +
353195710Savg	    ((hi * nsec_scale) << (32 - SCALE_SHIFT)));
354179237Sjb}
355179237Sjb
/*
 * Return the time reported by dtrace_getnanotime() as a single nanosecond
 * count (tv_sec * 10^9 + tv_nsec).
 */
uint64_t
dtrace_gethrestime(void)
{
	struct timespec now;
	uint64_t nsec;

	dtrace_getnanotime(&now);

	nsec = now.tv_sec * 1000000000ULL;
	nsec += now.tv_nsec;

	return (nsec);
}
365179237Sjb
366268869Smarkj/* Function to handle DTrace traps during probes. See amd64/amd64/trap.c. */
367179237Sjbint
368276142Smarkjdtrace_trap(struct trapframe *frame, u_int type)
369179237Sjb{
370179237Sjb	/*
371179237Sjb	 * A trap can occur while DTrace executes a probe. Before
372179237Sjb	 * executing the probe, DTrace blocks re-scheduling and sets
373268600Smarkj	 * a flag in its per-cpu flags to indicate that it doesn't
374218909Sbrucec	 * want to fault. On returning from the probe, the no-fault
375179237Sjb	 * flag is cleared and finally re-scheduling is enabled.
376179237Sjb	 *
377179237Sjb	 * Check if DTrace has enabled 'no-fault' mode:
378179237Sjb	 */
379179237Sjb	if ((cpu_core[curcpu].cpuc_dtrace_flags & CPU_DTRACE_NOFAULT) != 0) {
380179237Sjb		/*
381179237Sjb		 * There are only a couple of trap types that are expected.
382179237Sjb		 * All the rest will be handled in the usual way.
383179237Sjb		 */
384276142Smarkj		switch (type) {
385179237Sjb		/* General protection fault. */
386179237Sjb		case T_PROTFLT:
387179237Sjb			/* Flag an illegal operation. */
388179237Sjb			cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
389179237Sjb
390179237Sjb			/*
391179237Sjb			 * Offset the instruction pointer to the instruction
392179237Sjb			 * following the one causing the fault.
393179237Sjb			 */
394179237Sjb			frame->tf_rip += dtrace_instr_size((u_char *) frame->tf_rip);
395179237Sjb			return (1);
396179237Sjb		/* Page fault. */
397179237Sjb		case T_PAGEFLT:
398179237Sjb			/* Flag a bad address. */
399179237Sjb			cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_BADADDR;
400179237Sjb			cpu_core[curcpu].cpuc_dtrace_illval = frame->tf_addr;
401179237Sjb
402179237Sjb			/*
403179237Sjb			 * Offset the instruction pointer to the instruction
404179237Sjb			 * following the one causing the fault.
405179237Sjb			 */
406179237Sjb			frame->tf_rip += dtrace_instr_size((u_char *) frame->tf_rip);
407179237Sjb			return (1);
408179237Sjb		default:
409179237Sjb			/* Handle all other traps in the usual way. */
410179237Sjb			break;
411179237Sjb		}
412179237Sjb	}
413179237Sjb
414179237Sjb	/* Handle the trap in the usual way. */
415179237Sjb	return (0);
416179237Sjb}
417