dtrace_subr.c revision 268600
1179237Sjb/* 2179237Sjb * CDDL HEADER START 3179237Sjb * 4179237Sjb * The contents of this file are subject to the terms of the 5179237Sjb * Common Development and Distribution License, Version 1.0 only 6179237Sjb * (the "License"). You may not use this file except in compliance 7179237Sjb * with the License. 8179237Sjb * 9179237Sjb * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10179237Sjb * or http://www.opensolaris.org/os/licensing. 11179237Sjb * See the License for the specific language governing permissions 12179237Sjb * and limitations under the License. 13179237Sjb * 14179237Sjb * When distributing Covered Code, include this CDDL HEADER in each 15179237Sjb * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16179237Sjb * If applicable, add the following below this CDDL HEADER, with the 17179237Sjb * fields enclosed by brackets "[]" replaced with your own identifying 18179237Sjb * information: Portions Copyright [yyyy] [name of copyright owner] 19179237Sjb * 20179237Sjb * CDDL HEADER END 21179237Sjb * 22179237Sjb * $FreeBSD: head/sys/cddl/dev/dtrace/amd64/dtrace_subr.c 268600 2014-07-14 04:38:17Z markj $ 23179237Sjb * 24179237Sjb */ 25179237Sjb/* 26179237Sjb * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 27179237Sjb * Use is subject to license terms. 28179237Sjb */ 29179237Sjb 30236567Sgnn/* 31236567Sgnn * Copyright (c) 2011, Joyent, Inc. All rights reserved. 32236567Sgnn */ 33236567Sgnn 34179237Sjb#include <sys/param.h> 35179237Sjb#include <sys/systm.h> 36179237Sjb#include <sys/types.h> 37179237Sjb#include <sys/kernel.h> 38179237Sjb#include <sys/malloc.h> 39179237Sjb#include <sys/kmem.h> 40179237Sjb#include <sys/smp.h> 41179237Sjb#include <sys/dtrace_impl.h> 42179237Sjb#include <sys/dtrace_bsd.h> 43179237Sjb#include <machine/clock.h> 44179237Sjb#include <machine/frame.h> 45179237Sjb#include <vm/pmap.h> 46179237Sjb 47179237Sjbextern uintptr_t dtrace_in_probe_addr; 48179237Sjbextern int dtrace_in_probe; 49179237Sjb 50238537Sgnnextern void dtrace_getnanotime(struct timespec *tsp); 51238537Sgnn 52179237Sjbint dtrace_invop(uintptr_t, uintptr_t *, uintptr_t); 53179237Sjb 54179237Sjbtypedef struct dtrace_invop_hdlr { 55179237Sjb int (*dtih_func)(uintptr_t, uintptr_t *, uintptr_t); 56179237Sjb struct dtrace_invop_hdlr *dtih_next; 57179237Sjb} dtrace_invop_hdlr_t; 58179237Sjb 59179237Sjbdtrace_invop_hdlr_t *dtrace_invop_hdlr; 60179237Sjb 61179237Sjbint 62179237Sjbdtrace_invop(uintptr_t addr, uintptr_t *stack, uintptr_t eax) 63179237Sjb{ 64179237Sjb dtrace_invop_hdlr_t *hdlr; 65179237Sjb int rval; 66179237Sjb 67179237Sjb for (hdlr = dtrace_invop_hdlr; hdlr != NULL; hdlr = hdlr->dtih_next) 68179237Sjb if ((rval = hdlr->dtih_func(addr, stack, eax)) != 0) 69179237Sjb return (rval); 70179237Sjb 71179237Sjb return (0); 72179237Sjb} 73179237Sjb 74179237Sjbvoid 75179237Sjbdtrace_invop_add(int (*func)(uintptr_t, uintptr_t *, uintptr_t)) 76179237Sjb{ 77179237Sjb dtrace_invop_hdlr_t *hdlr; 78179237Sjb 79179237Sjb hdlr = kmem_alloc(sizeof (dtrace_invop_hdlr_t), KM_SLEEP); 80179237Sjb hdlr->dtih_func = func; 81179237Sjb hdlr->dtih_next = dtrace_invop_hdlr; 82179237Sjb dtrace_invop_hdlr = hdlr; 83179237Sjb} 84179237Sjb 85179237Sjbvoid 86179237Sjbdtrace_invop_remove(int (*func)(uintptr_t, uintptr_t *, uintptr_t)) 87179237Sjb{ 88179237Sjb dtrace_invop_hdlr_t *hdlr = dtrace_invop_hdlr, *prev = NULL; 89179237Sjb 90179237Sjb for (;;) { 91179237Sjb if (hdlr == NULL) 92179237Sjb panic("attempt to remove non-existent invop handler"); 93179237Sjb 94179237Sjb if (hdlr->dtih_func == func) 95179237Sjb break; 96179237Sjb 97179237Sjb prev = hdlr; 98179237Sjb hdlr = hdlr->dtih_next; 99179237Sjb } 100179237Sjb 101179237Sjb if (prev == NULL) { 102179237Sjb ASSERT(dtrace_invop_hdlr == hdlr); 103179237Sjb dtrace_invop_hdlr = hdlr->dtih_next; 104179237Sjb } else { 105179237Sjb ASSERT(dtrace_invop_hdlr != hdlr); 106179237Sjb prev->dtih_next = hdlr->dtih_next; 107179237Sjb } 108179237Sjb 109179237Sjb kmem_free(hdlr, 0); 110179237Sjb} 111179237Sjb 112179237Sjb/*ARGSUSED*/ 113179237Sjbvoid 114179237Sjbdtrace_toxic_ranges(void (*func)(uintptr_t base, uintptr_t limit)) 115179237Sjb{ 116179237Sjb (*func)(0, (uintptr_t) addr_PTmap); 117179237Sjb} 118179237Sjb 119179237Sjbvoid 120179237Sjbdtrace_xcall(processorid_t cpu, dtrace_xcall_t func, void *arg) 121179237Sjb{ 122222813Sattilio cpuset_t cpus; 123179237Sjb 124179237Sjb if (cpu == DTRACE_CPUALL) 125179237Sjb cpus = all_cpus; 126179237Sjb else 127222813Sattilio CPU_SETOF(cpu, &cpus); 128179237Sjb 129216251Savg smp_rendezvous_cpus(cpus, smp_no_rendevous_barrier, func, 130216251Savg smp_no_rendevous_barrier, arg); 131179237Sjb} 132179237Sjb 133179237Sjbstatic void 134179237Sjbdtrace_sync_func(void) 135179237Sjb{ 136179237Sjb} 137179237Sjb 138179237Sjbvoid 139179237Sjbdtrace_sync(void) 140179237Sjb{ 141179237Sjb dtrace_xcall(DTRACE_CPUALL, (dtrace_xcall_t)dtrace_sync_func, NULL); 142179237Sjb} 143179237Sjb 144179237Sjb#ifdef notyet 145179237Sjbint (*dtrace_pid_probe_ptr)(struct regs *); 146179237Sjbint (*dtrace_return_probe_ptr)(struct regs *); 147179237Sjb 148179237Sjbvoid 149179237Sjbdtrace_user_probe(struct regs *rp, caddr_t addr, processorid_t cpuid) 150179237Sjb{ 151179237Sjb krwlock_t *rwp; 152179237Sjb proc_t *p = curproc; 153179237Sjb extern void trap(struct regs *, caddr_t, processorid_t); 154179237Sjb 155179237Sjb if (USERMODE(rp->r_cs) || (rp->r_ps & PS_VM)) { 156179237Sjb if (curthread->t_cred != p->p_cred) { 157179237Sjb cred_t *oldcred = curthread->t_cred; 158179237Sjb /* 159179237Sjb * DTrace accesses t_cred in probe context. t_cred 160179237Sjb * must always be either NULL, or point to a valid, 161179237Sjb * allocated cred structure. 162179237Sjb */ 163179237Sjb curthread->t_cred = crgetcred(); 164179237Sjb crfree(oldcred); 165179237Sjb } 166179237Sjb } 167179237Sjb 168179237Sjb if (rp->r_trapno == T_DTRACE_RET) { 169179237Sjb uint8_t step = curthread->t_dtrace_step; 170179237Sjb uint8_t ret = curthread->t_dtrace_ret; 171179237Sjb uintptr_t npc = curthread->t_dtrace_npc; 172179237Sjb 173179237Sjb if (curthread->t_dtrace_ast) { 174179237Sjb aston(curthread); 175179237Sjb curthread->t_sig_check = 1; 176179237Sjb } 177179237Sjb 178179237Sjb /* 179179237Sjb * Clear all user tracing flags. 180179237Sjb */ 181179237Sjb curthread->t_dtrace_ft = 0; 182179237Sjb 183179237Sjb /* 184179237Sjb * If we weren't expecting to take a return probe trap, kill 185179237Sjb * the process as though it had just executed an unassigned 186179237Sjb * trap instruction. 187179237Sjb */ 188179237Sjb if (step == 0) { 189179237Sjb tsignal(curthread, SIGILL); 190179237Sjb return; 191179237Sjb } 192179237Sjb 193179237Sjb /* 194179237Sjb * If we hit this trap unrelated to a return probe, we're 195179237Sjb * just here to reset the AST flag since we deferred a signal 196179237Sjb * until after we logically single-stepped the instruction we 197179237Sjb * copied out. 198179237Sjb */ 199179237Sjb if (ret == 0) { 200179237Sjb rp->r_pc = npc; 201179237Sjb return; 202179237Sjb } 203179237Sjb 204179237Sjb /* 205179237Sjb * We need to wait until after we've called the 206179237Sjb * dtrace_return_probe_ptr function pointer to set %pc. 207179237Sjb */ 208179237Sjb rwp = &CPU->cpu_ft_lock; 209179237Sjb rw_enter(rwp, RW_READER); 210179237Sjb if (dtrace_return_probe_ptr != NULL) 211179237Sjb (void) (*dtrace_return_probe_ptr)(rp); 212179237Sjb rw_exit(rwp); 213179237Sjb rp->r_pc = npc; 214179237Sjb 215179237Sjb } else if (rp->r_trapno == T_BPTFLT) { 216179237Sjb uint8_t instr; 217179237Sjb rwp = &CPU->cpu_ft_lock; 218179237Sjb 219179237Sjb /* 220179237Sjb * The DTrace fasttrap provider uses the breakpoint trap 221179237Sjb * (int 3). We let DTrace take the first crack at handling 222179237Sjb * this trap; if it's not a probe that DTrace knowns about, 223179237Sjb * we call into the trap() routine to handle it like a 224179237Sjb * breakpoint placed by a conventional debugger. 225179237Sjb */ 226179237Sjb rw_enter(rwp, RW_READER); 227179237Sjb if (dtrace_pid_probe_ptr != NULL && 228179237Sjb (*dtrace_pid_probe_ptr)(rp) == 0) { 229179237Sjb rw_exit(rwp); 230179237Sjb return; 231179237Sjb } 232179237Sjb rw_exit(rwp); 233179237Sjb 234179237Sjb /* 235179237Sjb * If the instruction that caused the breakpoint trap doesn't 236179237Sjb * look like an int 3 anymore, it may be that this tracepoint 237179237Sjb * was removed just after the user thread executed it. In 238179237Sjb * that case, return to user land to retry the instuction. 239179237Sjb */ 240179237Sjb if (fuword8((void *)(rp->r_pc - 1), &instr) == 0 && 241179237Sjb instr != FASTTRAP_INSTR) { 242179237Sjb rp->r_pc--; 243179237Sjb return; 244179237Sjb } 245179237Sjb 246179237Sjb trap(rp, addr, cpuid); 247179237Sjb 248179237Sjb } else { 249179237Sjb trap(rp, addr, cpuid); 250179237Sjb } 251179237Sjb} 252179237Sjb 253179237Sjbvoid 254179237Sjbdtrace_safe_synchronous_signal(void) 255179237Sjb{ 256179237Sjb kthread_t *t = curthread; 257179237Sjb struct regs *rp = lwptoregs(ttolwp(t)); 258179237Sjb size_t isz = t->t_dtrace_npc - t->t_dtrace_pc; 259179237Sjb 260179237Sjb ASSERT(t->t_dtrace_on); 261179237Sjb 262179237Sjb /* 263179237Sjb * If we're not in the range of scratch addresses, we're not actually 264179237Sjb * tracing user instructions so turn off the flags. If the instruction 265179237Sjb * we copied out caused a synchonous trap, reset the pc back to its 266179237Sjb * original value and turn off the flags. 267179237Sjb */ 268179237Sjb if (rp->r_pc < t->t_dtrace_scrpc || 269179237Sjb rp->r_pc > t->t_dtrace_astpc + isz) { 270179237Sjb t->t_dtrace_ft = 0; 271179237Sjb } else if (rp->r_pc == t->t_dtrace_scrpc || 272179237Sjb rp->r_pc == t->t_dtrace_astpc) { 273179237Sjb rp->r_pc = t->t_dtrace_pc; 274179237Sjb t->t_dtrace_ft = 0; 275179237Sjb } 276179237Sjb} 277179237Sjb 278179237Sjbint 279179237Sjbdtrace_safe_defer_signal(void) 280179237Sjb{ 281179237Sjb kthread_t *t = curthread; 282179237Sjb struct regs *rp = lwptoregs(ttolwp(t)); 283179237Sjb size_t isz = t->t_dtrace_npc - t->t_dtrace_pc; 284179237Sjb 285179237Sjb ASSERT(t->t_dtrace_on); 286179237Sjb 287179237Sjb /* 288179237Sjb * If we're not in the range of scratch addresses, we're not actually 289179237Sjb * tracing user instructions so turn off the flags. 290179237Sjb */ 291179237Sjb if (rp->r_pc < t->t_dtrace_scrpc || 292179237Sjb rp->r_pc > t->t_dtrace_astpc + isz) { 293179237Sjb t->t_dtrace_ft = 0; 294179237Sjb return (0); 295179237Sjb } 296179237Sjb 297179237Sjb /* 298236567Sgnn * If we have executed the original instruction, but we have performed 299236567Sgnn * neither the jmp back to t->t_dtrace_npc nor the clean up of any 300236567Sgnn * registers used to emulate %rip-relative instructions in 64-bit mode, 301236567Sgnn * we'll save ourselves some effort by doing that here and taking the 302236567Sgnn * signal right away. We detect this condition by seeing if the program 303236567Sgnn * counter is the range [scrpc + isz, astpc). 304179237Sjb */ 305236567Sgnn if (rp->r_pc >= t->t_dtrace_scrpc + isz && 306236567Sgnn rp->r_pc < t->t_dtrace_astpc) { 307179237Sjb#ifdef __amd64 308179237Sjb /* 309179237Sjb * If there is a scratch register and we're on the 310179237Sjb * instruction immediately after the modified instruction, 311179237Sjb * restore the value of that scratch register. 312179237Sjb */ 313179237Sjb if (t->t_dtrace_reg != 0 && 314179237Sjb rp->r_pc == t->t_dtrace_scrpc + isz) { 315179237Sjb switch (t->t_dtrace_reg) { 316179237Sjb case REG_RAX: 317179237Sjb rp->r_rax = t->t_dtrace_regv; 318179237Sjb break; 319179237Sjb case REG_RCX: 320179237Sjb rp->r_rcx = t->t_dtrace_regv; 321179237Sjb break; 322179237Sjb case REG_R8: 323179237Sjb rp->r_r8 = t->t_dtrace_regv; 324179237Sjb break; 325179237Sjb case REG_R9: 326179237Sjb rp->r_r9 = t->t_dtrace_regv; 327179237Sjb break; 328179237Sjb } 329179237Sjb } 330179237Sjb#endif 331179237Sjb rp->r_pc = t->t_dtrace_npc; 332179237Sjb t->t_dtrace_ft = 0; 333179237Sjb return (0); 334179237Sjb } 335179237Sjb 336179237Sjb /* 337179237Sjb * Otherwise, make sure we'll return to the kernel after executing 338179237Sjb * the copied out instruction and defer the signal. 339179237Sjb */ 340179237Sjb if (!t->t_dtrace_step) { 341179237Sjb ASSERT(rp->r_pc < t->t_dtrace_astpc); 342179237Sjb rp->r_pc += t->t_dtrace_astpc - t->t_dtrace_scrpc; 343179237Sjb t->t_dtrace_step = 1; 344179237Sjb } 345179237Sjb 346179237Sjb t->t_dtrace_ast = 1; 347179237Sjb 348179237Sjb return (1); 349179237Sjb} 350179237Sjb#endif 351179237Sjb 352179237Sjbstatic int64_t tgt_cpu_tsc; 353179237Sjbstatic int64_t hst_cpu_tsc; 354179237Sjbstatic int64_t tsc_skew[MAXCPU]; 355195710Savgstatic uint64_t nsec_scale; 356179237Sjb 357195710Savg/* See below for the explanation of this macro. */ 358195710Savg#define SCALE_SHIFT 28 359195710Savg 360179237Sjbstatic void 361179237Sjbdtrace_gethrtime_init_cpu(void *arg) 362179237Sjb{ 363179237Sjb uintptr_t cpu = (uintptr_t) arg; 364179237Sjb 365179237Sjb if (cpu == curcpu) 366179237Sjb tgt_cpu_tsc = rdtsc(); 367179237Sjb else 368179237Sjb hst_cpu_tsc = rdtsc(); 369179237Sjb} 370179237Sjb 371179237Sjbstatic void 372179237Sjbdtrace_gethrtime_init(void *arg) 373179237Sjb{ 374216250Savg struct pcpu *pc; 375195710Savg uint64_t tsc_f; 376222813Sattilio cpuset_t map; 377179237Sjb int i; 378179237Sjb 379195710Savg /* 380195710Savg * Get TSC frequency known at this moment. 381195710Savg * This should be constant if TSC is invariant. 382195710Savg * Otherwise tick->time conversion will be inaccurate, but 383195710Savg * will preserve monotonic property of TSC. 384195710Savg */ 385220433Sjkim tsc_f = atomic_load_acq_64(&tsc_freq); 386195710Savg 387195710Savg /* 388195710Savg * The following line checks that nsec_scale calculated below 389195710Savg * doesn't overflow 32-bit unsigned integer, so that it can multiply 390195710Savg * another 32-bit integer without overflowing 64-bit. 391195710Savg * Thus minimum supported TSC frequency is 62.5MHz. 392195710Savg */ 393195710Savg KASSERT(tsc_f > (NANOSEC >> (32 - SCALE_SHIFT)), ("TSC frequency is too low")); 394195710Savg 395195710Savg /* 396195710Savg * We scale up NANOSEC/tsc_f ratio to preserve as much precision 397195710Savg * as possible. 398195710Savg * 2^28 factor was chosen quite arbitrarily from practical 399195710Savg * considerations: 400195710Savg * - it supports TSC frequencies as low as 62.5MHz (see above); 401195710Savg * - it provides quite good precision (e < 0.01%) up to THz 402195710Savg * (terahertz) values; 403195710Savg */ 404195710Savg nsec_scale = ((uint64_t)NANOSEC << SCALE_SHIFT) / tsc_f; 405195710Savg 406179237Sjb /* The current CPU is the reference one. */ 407216250Savg sched_pin(); 408179237Sjb tsc_skew[curcpu] = 0; 409209059Sjhb CPU_FOREACH(i) { 410179237Sjb if (i == curcpu) 411179237Sjb continue; 412179237Sjb 413216250Savg pc = pcpu_find(i); 414223758Sattilio CPU_SETOF(PCPU_GET(cpuid), &map); 415223758Sattilio CPU_SET(pc->pc_cpuid, &map); 416179237Sjb 417221740Savg smp_rendezvous_cpus(map, NULL, 418179237Sjb dtrace_gethrtime_init_cpu, 419179237Sjb smp_no_rendevous_barrier, (void *)(uintptr_t) i); 420179237Sjb 421179237Sjb tsc_skew[i] = tgt_cpu_tsc - hst_cpu_tsc; 422179237Sjb } 423216250Savg sched_unpin(); 424179237Sjb} 425179237Sjb 426179237SjbSYSINIT(dtrace_gethrtime_init, SI_SUB_SMP, SI_ORDER_ANY, dtrace_gethrtime_init, NULL); 427179237Sjb 428179237Sjb/* 429179237Sjb * DTrace needs a high resolution time function which can 430179237Sjb * be called from a probe context and guaranteed not to have 431179237Sjb * instrumented with probes itself. 432179237Sjb * 433179237Sjb * Returns nanoseconds since boot. 434179237Sjb */ 435179237Sjbuint64_t 436179237Sjbdtrace_gethrtime() 437179237Sjb{ 438195710Savg uint64_t tsc; 439195710Savg uint32_t lo; 440195710Savg uint32_t hi; 441195710Savg 442195710Savg /* 443195710Savg * We split TSC value into lower and higher 32-bit halves and separately 444195710Savg * scale them with nsec_scale, then we scale them down by 2^28 445195710Savg * (see nsec_scale calculations) taking into account 32-bit shift of 446195710Savg * the higher half and finally add. 447195710Savg */ 448236566Szml tsc = rdtsc() - tsc_skew[curcpu]; 449195710Savg lo = tsc; 450195710Savg hi = tsc >> 32; 451195710Savg return (((lo * nsec_scale) >> SCALE_SHIFT) + 452195710Savg ((hi * nsec_scale) << (32 - SCALE_SHIFT))); 453179237Sjb} 454179237Sjb 455179237Sjbuint64_t 456179237Sjbdtrace_gethrestime(void) 457179237Sjb{ 458238537Sgnn struct timespec current_time; 459238537Sgnn 460238537Sgnn dtrace_getnanotime(¤t_time); 461238537Sgnn 462238552Sgnn return (current_time.tv_sec * 1000000000ULL + current_time.tv_nsec); 463179237Sjb} 464179237Sjb 465268600Smarkj/* 466268600Smarkj * Function to handle DTrace traps during probes. See amd64/amd64/exception.S. 467268600Smarkj */ 468179237Sjbint 469268600Smarkjdtrace_trap(struct trapframe *frame) 470179237Sjb{ 471179237Sjb /* 472179237Sjb * A trap can occur while DTrace executes a probe. Before 473179237Sjb * executing the probe, DTrace blocks re-scheduling and sets 474268600Smarkj * a flag in its per-cpu flags to indicate that it doesn't 475218909Sbrucec * want to fault. On returning from the probe, the no-fault 476179237Sjb * flag is cleared and finally re-scheduling is enabled. 477179237Sjb * 478179237Sjb * Check if DTrace has enabled 'no-fault' mode: 479179237Sjb */ 480179237Sjb if ((cpu_core[curcpu].cpuc_dtrace_flags & CPU_DTRACE_NOFAULT) != 0) { 481179237Sjb /* 482179237Sjb * There are only a couple of trap types that are expected. 483179237Sjb * All the rest will be handled in the usual way. 484179237Sjb */ 485268600Smarkj switch (frame->tf_trapno) { 486179237Sjb /* General protection fault. */ 487179237Sjb case T_PROTFLT: 488179237Sjb /* Flag an illegal operation. */ 489179237Sjb cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP; 490179237Sjb 491179237Sjb /* 492179237Sjb * Offset the instruction pointer to the instruction 493179237Sjb * following the one causing the fault. 494179237Sjb */ 495179237Sjb frame->tf_rip += dtrace_instr_size((u_char *) frame->tf_rip); 496179237Sjb return (1); 497179237Sjb /* Page fault. */ 498179237Sjb case T_PAGEFLT: 499179237Sjb /* Flag a bad address. */ 500179237Sjb cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_BADADDR; 501179237Sjb cpu_core[curcpu].cpuc_dtrace_illval = frame->tf_addr; 502179237Sjb 503179237Sjb /* 504179237Sjb * Offset the instruction pointer to the instruction 505179237Sjb * following the one causing the fault. 506179237Sjb */ 507179237Sjb frame->tf_rip += dtrace_instr_size((u_char *) frame->tf_rip); 508179237Sjb return (1); 509179237Sjb default: 510179237Sjb /* Handle all other traps in the usual way. */ 511179237Sjb break; 512179237Sjb } 513179237Sjb } 514179237Sjb 515179237Sjb /* Handle the trap in the usual way. */ 516179237Sjb return (0); 517179237Sjb} 518