/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 *
 * $FreeBSD: stable/11/sys/cddl/dev/dtrace/amd64/dtrace_subr.c 345868 2019-04-04 02:07:24Z markj $
 *
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2011, Joyent, Inc. All rights reserved.
32236567Sgnn */ 33236567Sgnn 34179237Sjb#include <sys/param.h> 35179237Sjb#include <sys/systm.h> 36179237Sjb#include <sys/types.h> 37179237Sjb#include <sys/kernel.h> 38179237Sjb#include <sys/malloc.h> 39179237Sjb#include <sys/kmem.h> 40179237Sjb#include <sys/smp.h> 41179237Sjb#include <sys/dtrace_impl.h> 42179237Sjb#include <sys/dtrace_bsd.h> 43179237Sjb#include <machine/clock.h> 44315011Smarkj#include <machine/cpufunc.h> 45179237Sjb#include <machine/frame.h> 46315011Smarkj#include <machine/psl.h> 47179237Sjb#include <vm/pmap.h> 48179237Sjb 49238537Sgnnextern void dtrace_getnanotime(struct timespec *tsp); 50238537Sgnn 51298171Smarkjint dtrace_invop(uintptr_t, struct trapframe *, uintptr_t); 52179237Sjb 53179237Sjbtypedef struct dtrace_invop_hdlr { 54298171Smarkj int (*dtih_func)(uintptr_t, struct trapframe *, uintptr_t); 55179237Sjb struct dtrace_invop_hdlr *dtih_next; 56179237Sjb} dtrace_invop_hdlr_t; 57179237Sjb 58179237Sjbdtrace_invop_hdlr_t *dtrace_invop_hdlr; 59179237Sjb 60179237Sjbint 61298171Smarkjdtrace_invop(uintptr_t addr, struct trapframe *frame, uintptr_t eax) 62179237Sjb{ 63179237Sjb dtrace_invop_hdlr_t *hdlr; 64179237Sjb int rval; 65179237Sjb 66179237Sjb for (hdlr = dtrace_invop_hdlr; hdlr != NULL; hdlr = hdlr->dtih_next) 67298171Smarkj if ((rval = hdlr->dtih_func(addr, frame, eax)) != 0) 68179237Sjb return (rval); 69179237Sjb 70179237Sjb return (0); 71179237Sjb} 72179237Sjb 73179237Sjbvoid 74298171Smarkjdtrace_invop_add(int (*func)(uintptr_t, struct trapframe *, uintptr_t)) 75179237Sjb{ 76179237Sjb dtrace_invop_hdlr_t *hdlr; 77179237Sjb 78179237Sjb hdlr = kmem_alloc(sizeof (dtrace_invop_hdlr_t), KM_SLEEP); 79179237Sjb hdlr->dtih_func = func; 80179237Sjb hdlr->dtih_next = dtrace_invop_hdlr; 81179237Sjb dtrace_invop_hdlr = hdlr; 82179237Sjb} 83179237Sjb 84179237Sjbvoid 85298171Smarkjdtrace_invop_remove(int (*func)(uintptr_t, struct trapframe *, uintptr_t)) 86179237Sjb{ 87179237Sjb dtrace_invop_hdlr_t *hdlr = dtrace_invop_hdlr, *prev = NULL; 88179237Sjb 
89179237Sjb for (;;) { 90179237Sjb if (hdlr == NULL) 91179237Sjb panic("attempt to remove non-existent invop handler"); 92179237Sjb 93179237Sjb if (hdlr->dtih_func == func) 94179237Sjb break; 95179237Sjb 96179237Sjb prev = hdlr; 97179237Sjb hdlr = hdlr->dtih_next; 98179237Sjb } 99179237Sjb 100179237Sjb if (prev == NULL) { 101179237Sjb ASSERT(dtrace_invop_hdlr == hdlr); 102179237Sjb dtrace_invop_hdlr = hdlr->dtih_next; 103179237Sjb } else { 104179237Sjb ASSERT(dtrace_invop_hdlr != hdlr); 105179237Sjb prev->dtih_next = hdlr->dtih_next; 106179237Sjb } 107179237Sjb 108179237Sjb kmem_free(hdlr, 0); 109179237Sjb} 110179237Sjb 111179237Sjb/*ARGSUSED*/ 112179237Sjbvoid 113179237Sjbdtrace_toxic_ranges(void (*func)(uintptr_t base, uintptr_t limit)) 114179237Sjb{ 115179237Sjb (*func)(0, (uintptr_t) addr_PTmap); 116179237Sjb} 117179237Sjb 118179237Sjbvoid 119179237Sjbdtrace_xcall(processorid_t cpu, dtrace_xcall_t func, void *arg) 120179237Sjb{ 121222813Sattilio cpuset_t cpus; 122179237Sjb 123179237Sjb if (cpu == DTRACE_CPUALL) 124179237Sjb cpus = all_cpus; 125179237Sjb else 126222813Sattilio CPU_SETOF(cpu, &cpus); 127179237Sjb 128328386Spkelsey smp_rendezvous_cpus(cpus, smp_no_rendezvous_barrier, func, 129328386Spkelsey smp_no_rendezvous_barrier, arg); 130179237Sjb} 131179237Sjb 132179237Sjbstatic void 133179237Sjbdtrace_sync_func(void) 134179237Sjb{ 135179237Sjb} 136179237Sjb 137179237Sjbvoid 138179237Sjbdtrace_sync(void) 139179237Sjb{ 140179237Sjb dtrace_xcall(DTRACE_CPUALL, (dtrace_xcall_t)dtrace_sync_func, NULL); 141179237Sjb} 142179237Sjb 143179237Sjb#ifdef notyet 144179237Sjbvoid 145179237Sjbdtrace_safe_synchronous_signal(void) 146179237Sjb{ 147179237Sjb kthread_t *t = curthread; 148179237Sjb struct regs *rp = lwptoregs(ttolwp(t)); 149179237Sjb size_t isz = t->t_dtrace_npc - t->t_dtrace_pc; 150179237Sjb 151179237Sjb ASSERT(t->t_dtrace_on); 152179237Sjb 153179237Sjb /* 154179237Sjb * If we're not in the range of scratch addresses, we're not actually 155179237Sjb * tracing 
user instructions so turn off the flags. If the instruction 156179237Sjb * we copied out caused a synchonous trap, reset the pc back to its 157179237Sjb * original value and turn off the flags. 158179237Sjb */ 159179237Sjb if (rp->r_pc < t->t_dtrace_scrpc || 160179237Sjb rp->r_pc > t->t_dtrace_astpc + isz) { 161179237Sjb t->t_dtrace_ft = 0; 162179237Sjb } else if (rp->r_pc == t->t_dtrace_scrpc || 163179237Sjb rp->r_pc == t->t_dtrace_astpc) { 164179237Sjb rp->r_pc = t->t_dtrace_pc; 165179237Sjb t->t_dtrace_ft = 0; 166179237Sjb } 167179237Sjb} 168179237Sjb 169179237Sjbint 170179237Sjbdtrace_safe_defer_signal(void) 171179237Sjb{ 172179237Sjb kthread_t *t = curthread; 173179237Sjb struct regs *rp = lwptoregs(ttolwp(t)); 174179237Sjb size_t isz = t->t_dtrace_npc - t->t_dtrace_pc; 175179237Sjb 176179237Sjb ASSERT(t->t_dtrace_on); 177179237Sjb 178179237Sjb /* 179179237Sjb * If we're not in the range of scratch addresses, we're not actually 180179237Sjb * tracing user instructions so turn off the flags. 181179237Sjb */ 182179237Sjb if (rp->r_pc < t->t_dtrace_scrpc || 183179237Sjb rp->r_pc > t->t_dtrace_astpc + isz) { 184179237Sjb t->t_dtrace_ft = 0; 185179237Sjb return (0); 186179237Sjb } 187179237Sjb 188179237Sjb /* 189236567Sgnn * If we have executed the original instruction, but we have performed 190236567Sgnn * neither the jmp back to t->t_dtrace_npc nor the clean up of any 191236567Sgnn * registers used to emulate %rip-relative instructions in 64-bit mode, 192236567Sgnn * we'll save ourselves some effort by doing that here and taking the 193236567Sgnn * signal right away. We detect this condition by seeing if the program 194236567Sgnn * counter is the range [scrpc + isz, astpc). 
195179237Sjb */ 196236567Sgnn if (rp->r_pc >= t->t_dtrace_scrpc + isz && 197236567Sgnn rp->r_pc < t->t_dtrace_astpc) { 198179237Sjb#ifdef __amd64 199179237Sjb /* 200179237Sjb * If there is a scratch register and we're on the 201179237Sjb * instruction immediately after the modified instruction, 202179237Sjb * restore the value of that scratch register. 203179237Sjb */ 204179237Sjb if (t->t_dtrace_reg != 0 && 205179237Sjb rp->r_pc == t->t_dtrace_scrpc + isz) { 206179237Sjb switch (t->t_dtrace_reg) { 207179237Sjb case REG_RAX: 208179237Sjb rp->r_rax = t->t_dtrace_regv; 209179237Sjb break; 210179237Sjb case REG_RCX: 211179237Sjb rp->r_rcx = t->t_dtrace_regv; 212179237Sjb break; 213179237Sjb case REG_R8: 214179237Sjb rp->r_r8 = t->t_dtrace_regv; 215179237Sjb break; 216179237Sjb case REG_R9: 217179237Sjb rp->r_r9 = t->t_dtrace_regv; 218179237Sjb break; 219179237Sjb } 220179237Sjb } 221179237Sjb#endif 222179237Sjb rp->r_pc = t->t_dtrace_npc; 223179237Sjb t->t_dtrace_ft = 0; 224179237Sjb return (0); 225179237Sjb } 226179237Sjb 227179237Sjb /* 228179237Sjb * Otherwise, make sure we'll return to the kernel after executing 229179237Sjb * the copied out instruction and defer the signal. 230179237Sjb */ 231179237Sjb if (!t->t_dtrace_step) { 232179237Sjb ASSERT(rp->r_pc < t->t_dtrace_astpc); 233179237Sjb rp->r_pc += t->t_dtrace_astpc - t->t_dtrace_scrpc; 234179237Sjb t->t_dtrace_step = 1; 235179237Sjb } 236179237Sjb 237179237Sjb t->t_dtrace_ast = 1; 238179237Sjb 239179237Sjb return (1); 240179237Sjb} 241179237Sjb#endif 242179237Sjb 243179237Sjbstatic int64_t tgt_cpu_tsc; 244179237Sjbstatic int64_t hst_cpu_tsc; 245179237Sjbstatic int64_t tsc_skew[MAXCPU]; 246195710Savgstatic uint64_t nsec_scale; 247179237Sjb 248195710Savg/* See below for the explanation of this macro. 
*/ 249195710Savg#define SCALE_SHIFT 28 250195710Savg 251299746Sjhbstatic void 252299746Sjhbdtrace_gethrtime_init_cpu(void *arg) 253299746Sjhb{ 254299746Sjhb uintptr_t cpu = (uintptr_t) arg; 255299746Sjhb 256299746Sjhb if (cpu == curcpu) 257299746Sjhb tgt_cpu_tsc = rdtsc(); 258299746Sjhb else 259299746Sjhb hst_cpu_tsc = rdtsc(); 260299746Sjhb} 261299746Sjhb 262299746Sjhb#ifdef EARLY_AP_STARTUP 263299746Sjhbstatic void 264299746Sjhbdtrace_gethrtime_init(void *arg) 265299746Sjhb{ 266299746Sjhb struct pcpu *pc; 267299746Sjhb uint64_t tsc_f; 268299746Sjhb cpuset_t map; 269299746Sjhb int i; 270299746Sjhb#else 271297770Smarkj/* 272297770Smarkj * Get the frequency and scale factor as early as possible so that they can be 273297770Smarkj * used for boot-time tracing. 274297770Smarkj */ 275179237Sjbstatic void 276297770Smarkjdtrace_gethrtime_init_early(void *arg) 277179237Sjb{ 278195710Savg uint64_t tsc_f; 279299746Sjhb#endif 280179237Sjb 281195710Savg /* 282195710Savg * Get TSC frequency known at this moment. 283195710Savg * This should be constant if TSC is invariant. 284195710Savg * Otherwise tick->time conversion will be inaccurate, but 285195710Savg * will preserve monotonic property of TSC. 286195710Savg */ 287220433Sjkim tsc_f = atomic_load_acq_64(&tsc_freq); 288195710Savg 289195710Savg /* 290195710Savg * The following line checks that nsec_scale calculated below 291195710Savg * doesn't overflow 32-bit unsigned integer, so that it can multiply 292195710Savg * another 32-bit integer without overflowing 64-bit. 293195710Savg * Thus minimum supported TSC frequency is 62.5MHz. 294195710Savg */ 295297770Smarkj KASSERT(tsc_f > (NANOSEC >> (32 - SCALE_SHIFT)), 296297770Smarkj ("TSC frequency is too low")); 297195710Savg 298195710Savg /* 299195710Savg * We scale up NANOSEC/tsc_f ratio to preserve as much precision 300195710Savg * as possible. 
301195710Savg * 2^28 factor was chosen quite arbitrarily from practical 302195710Savg * considerations: 303195710Savg * - it supports TSC frequencies as low as 62.5MHz (see above); 304195710Savg * - it provides quite good precision (e < 0.01%) up to THz 305195710Savg * (terahertz) values; 306195710Savg */ 307195710Savg nsec_scale = ((uint64_t)NANOSEC << SCALE_SHIFT) / tsc_f; 308299746Sjhb#ifndef EARLY_AP_STARTUP 309297770Smarkj} 310297770SmarkjSYSINIT(dtrace_gethrtime_init_early, SI_SUB_CPU, SI_ORDER_ANY, 311297770Smarkj dtrace_gethrtime_init_early, NULL); 312195710Savg 313297770Smarkjstatic void 314297770Smarkjdtrace_gethrtime_init(void *arg) 315297770Smarkj{ 316297770Smarkj struct pcpu *pc; 317297770Smarkj cpuset_t map; 318297770Smarkj int i; 319299746Sjhb#endif 320297770Smarkj 321345868Smarkj if (vm_guest != VM_GUEST_NO) 322345868Smarkj return; 323345868Smarkj 324179237Sjb /* The current CPU is the reference one. */ 325216250Savg sched_pin(); 326179237Sjb tsc_skew[curcpu] = 0; 327209059Sjhb CPU_FOREACH(i) { 328179237Sjb if (i == curcpu) 329179237Sjb continue; 330179237Sjb 331216250Savg pc = pcpu_find(i); 332223758Sattilio CPU_SETOF(PCPU_GET(cpuid), &map); 333223758Sattilio CPU_SET(pc->pc_cpuid, &map); 334179237Sjb 335221740Savg smp_rendezvous_cpus(map, NULL, 336179237Sjb dtrace_gethrtime_init_cpu, 337328386Spkelsey smp_no_rendezvous_barrier, (void *)(uintptr_t) i); 338179237Sjb 339179237Sjb tsc_skew[i] = tgt_cpu_tsc - hst_cpu_tsc; 340179237Sjb } 341216250Savg sched_unpin(); 342179237Sjb} 343299746Sjhb#ifdef EARLY_AP_STARTUP 344299746SjhbSYSINIT(dtrace_gethrtime_init, SI_SUB_DTRACE, SI_ORDER_ANY, 345299746Sjhb dtrace_gethrtime_init, NULL); 346299746Sjhb#else 347297770SmarkjSYSINIT(dtrace_gethrtime_init, SI_SUB_SMP, SI_ORDER_ANY, dtrace_gethrtime_init, 348297770Smarkj NULL); 349299746Sjhb#endif 350179237Sjb 351179237Sjb/* 352179237Sjb * DTrace needs a high resolution time function which can 353179237Sjb * be called from a probe context and guaranteed not to have 
354179237Sjb * instrumented with probes itself. 355179237Sjb * 356179237Sjb * Returns nanoseconds since boot. 357179237Sjb */ 358179237Sjbuint64_t 359327492Smarkjdtrace_gethrtime(void) 360179237Sjb{ 361195710Savg uint64_t tsc; 362327492Smarkj uint32_t lo, hi; 363327492Smarkj register_t rflags; 364195710Savg 365195710Savg /* 366195710Savg * We split TSC value into lower and higher 32-bit halves and separately 367195710Savg * scale them with nsec_scale, then we scale them down by 2^28 368195710Savg * (see nsec_scale calculations) taking into account 32-bit shift of 369195710Savg * the higher half and finally add. 370195710Savg */ 371327492Smarkj rflags = intr_disable(); 372236566Szml tsc = rdtsc() - tsc_skew[curcpu]; 373327492Smarkj intr_restore(rflags); 374327492Smarkj 375195710Savg lo = tsc; 376195710Savg hi = tsc >> 32; 377195710Savg return (((lo * nsec_scale) >> SCALE_SHIFT) + 378195710Savg ((hi * nsec_scale) << (32 - SCALE_SHIFT))); 379179237Sjb} 380179237Sjb 381179237Sjbuint64_t 382179237Sjbdtrace_gethrestime(void) 383179237Sjb{ 384238537Sgnn struct timespec current_time; 385238537Sgnn 386238537Sgnn dtrace_getnanotime(¤t_time); 387238537Sgnn 388238552Sgnn return (current_time.tv_sec * 1000000000ULL + current_time.tv_nsec); 389179237Sjb} 390179237Sjb 391268869Smarkj/* Function to handle DTrace traps during probes. See amd64/amd64/trap.c. */ 392179237Sjbint 393276142Smarkjdtrace_trap(struct trapframe *frame, u_int type) 394179237Sjb{ 395315011Smarkj uint16_t nofault; 396315011Smarkj 397179237Sjb /* 398179237Sjb * A trap can occur while DTrace executes a probe. Before 399179237Sjb * executing the probe, DTrace blocks re-scheduling and sets 400268600Smarkj * a flag in its per-cpu flags to indicate that it doesn't 401218909Sbrucec * want to fault. On returning from the probe, the no-fault 402179237Sjb * flag is cleared and finally re-scheduling is enabled. 
403179237Sjb * 404179237Sjb * Check if DTrace has enabled 'no-fault' mode: 405179237Sjb */ 406315011Smarkj sched_pin(); 407315011Smarkj nofault = cpu_core[curcpu].cpuc_dtrace_flags & CPU_DTRACE_NOFAULT; 408315011Smarkj sched_unpin(); 409315011Smarkj if (nofault) { 410315011Smarkj KASSERT((read_rflags() & PSL_I) == 0, ("interrupts enabled")); 411315011Smarkj 412179237Sjb /* 413179237Sjb * There are only a couple of trap types that are expected. 414179237Sjb * All the rest will be handled in the usual way. 415179237Sjb */ 416276142Smarkj switch (type) { 417179237Sjb /* General protection fault. */ 418179237Sjb case T_PROTFLT: 419179237Sjb /* Flag an illegal operation. */ 420179237Sjb cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP; 421179237Sjb 422179237Sjb /* 423179237Sjb * Offset the instruction pointer to the instruction 424179237Sjb * following the one causing the fault. 425179237Sjb */ 426179237Sjb frame->tf_rip += dtrace_instr_size((u_char *) frame->tf_rip); 427179237Sjb return (1); 428179237Sjb /* Page fault. */ 429179237Sjb case T_PAGEFLT: 430179237Sjb /* Flag a bad address. */ 431179237Sjb cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_BADADDR; 432179237Sjb cpu_core[curcpu].cpuc_dtrace_illval = frame->tf_addr; 433179237Sjb 434179237Sjb /* 435179237Sjb * Offset the instruction pointer to the instruction 436179237Sjb * following the one causing the fault. 437179237Sjb */ 438179237Sjb frame->tf_rip += dtrace_instr_size((u_char *) frame->tf_rip); 439179237Sjb return (1); 440179237Sjb default: 441179237Sjb /* Handle all other traps in the usual way. */ 442179237Sjb break; 443179237Sjb } 444179237Sjb } 445179237Sjb 446179237Sjb /* Handle the trap in the usual way. */ 447179237Sjb return (0); 448179237Sjb} 449