// Copyright 2016 The Fuchsia Authors
//
// Use of this source code is governed by an MIT-style
// license that can be found in the LICENSE file or at
// https://opensource.org/licenses/MIT

// TODO(ZX-992): Need to be able to r/w MSRs.
// The thought is to use resources (as in ResourceDispatcher), at which point
// this will all get rewritten. Until such time, the goal here is KISS.

// This file contains the lower part of Intel Processor Trace support, which
// must live in the kernel (so that we can read/write MSRs).
// The userspace driver is in system/udev/intel-pt/intel-pt.c.
//
// We currently only support Table of Physical Addresses (ToPA) mode:
// it supports discontiguous buffers and stop-on-full behavior in addition
// to wrap-around.
//
// IPT tracing has two "modes":
// - per-cpu tracing
// - thread-specific tracing
// Tracing can only be done in one mode at a time, because saving/restoring
// thread PT state via the xsaves/xrstors instructions is controlled by a
// global flag in the XSS MSR.
// Moreover, once a trace has been done with IPT_TRACE_THREADS, one cannot go
// back to IPT_TRACE_CPUS: supporting this requires flushing trace state from
// all threads, which is a bit of work. For now it's easy enough to just
// require the user to reboot. ZX-892

#include <arch/arch_ops.h>
#include <arch/mmu.h>
#include <arch/x86.h>
#include <arch/x86/feature.h>
#include <arch/x86/mmu.h>
#include <arch/x86/proc_trace.h>
#include <err.h>
#include <fbl/auto_lock.h>
#include <fbl/macros.h>
#include <fbl/mutex.h>
#include <fbl/unique_ptr.h>
#include <kernel/mp.h>
#include <kernel/thread.h>
#include <lib/ktrace.h>
#include <lib/zircon-internal/device/cpu-trace/intel-pt.h>
#include <lib/zircon-internal/ktrace.h>
#include <lib/zircon-internal/mtrace.h>
#include <pow2.h>
#include <string.h>
#include <trace.h>
#include <vm/vm.h>
#include <vm/vm_aspace.h>
#include <zircon/thread_annotations.h>
#include <zircon/types.h>

using fbl::AutoLock;

#define LOCAL_TRACE 0

// Control MSRs
#define IA32_RTIT_OUTPUT_BASE 0x560
#define IA32_RTIT_OUTPUT_MASK_PTRS 0x561
#define IA32_RTIT_CTL 0x570
#define IA32_RTIT_STATUS 0x571
#define IA32_RTIT_CR3_MATCH 0x572
#define IA32_RTIT_ADDR0_A 0x580
#define IA32_RTIT_ADDR0_B 0x581
#define IA32_RTIT_ADDR1_A 0x582
#define IA32_RTIT_ADDR1_B 0x583
#define IA32_RTIT_ADDR2_A 0x584
#define IA32_RTIT_ADDR2_B 0x585
#define IA32_RTIT_ADDR3_A 0x586
#define IA32_RTIT_ADDR3_B 0x587

// We need bits [15:8] to get the "maximum non-turbo ratio".
// See libipt:intel-pt.h:pt_config, and Intel Vol. 3 chapter 35.5.
#define IA32_PLATFORM_INFO 0xce
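
// For reference, a sketch of the ToPA table entry layout whose table's
// physical address ends up in IA32_RTIT_OUTPUT_BASE (see the Processor Trace
// chapter of Intel Vol 3). The tables themselves are built by the userspace
// driver, not here, so these constant names are illustrative only and are
// not used elsewhere in this file.
constexpr uint64_t kTopaEntryEnd = 1ull << 0;   // entry points at the next ToPA table
constexpr uint64_t kTopaEntryInt = 1ull << 2;   // raise a PMI when this region fills
constexpr uint64_t kTopaEntryStop = 1ull << 4;  // stop tracing when this region fills
// Bits [9:6] encode the output region size as 2^(size + 12) bytes, and the
// upper bits hold the physical base address of the output region; the STOP
// bit is what provides the stop-on-full behavior mentioned above.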
// Our own copy of what h/w supports, mostly for sanity checking.
static bool supports_pt = false;
static bool supports_cr3_filtering = false;
static bool supports_psb = false;
static bool supports_ip_filtering = false;
static bool supports_mtc = false;
static bool supports_ptwrite = false;
static bool supports_power_events = false;
static bool supports_output_topa = false;
static bool supports_output_topa_multi = false;
static bool supports_output_single = false;
static bool supports_output_transport = false;

struct ipt_trace_state_t {
    uint64_t ctl;
    uint64_t status;
    uint64_t output_base;
    uint64_t output_mask_ptrs;
    uint64_t cr3_match;
    struct {
        uint64_t a, b;
    } addr_ranges[IPT_MAX_NUM_ADDR_RANGES];
};

static fbl::Mutex ipt_lock;

static ipt_trace_state_t* ipt_trace_state TA_GUARDED(ipt_lock);

static bool active TA_GUARDED(ipt_lock) = false;

static ipt_trace_mode_t trace_mode TA_GUARDED(ipt_lock) = IPT_TRACE_CPUS;

void x86_processor_trace_init(void) {
    if (!x86_feature_test(X86_FEATURE_PT)) {
        return;
    }

    struct cpuid_leaf leaf;
    if (!x86_get_cpuid_subleaf(X86_CPUID_PT, 0, &leaf)) {
        return;
    }

    supports_pt = true;

    // Keep our own copy of these flags, mostly for potential sanity checks.
    supports_cr3_filtering = !!(leaf.b & (1 << 0));
    supports_psb = !!(leaf.b & (1 << 1));
    supports_ip_filtering = !!(leaf.b & (1 << 2));
    supports_mtc = !!(leaf.b & (1 << 3));
    supports_ptwrite = !!(leaf.b & (1 << 4));
    supports_power_events = !!(leaf.b & (1 << 5));

    supports_output_topa = !!(leaf.c & (1 << 0));
    supports_output_topa_multi = !!(leaf.c & (1 << 1));
    supports_output_single = !!(leaf.c & (1 << 2));
    supports_output_transport = !!(leaf.c & (1 << 3));
}

// Intel Processor Trace support needs to be able to map the cr3 values that
// appear in the trace to the pids that ld.so uses when dumping memory maps.
void arch_trace_process_create(uint64_t pid, paddr_t pt_phys) {
    // The cr3 value that appears in Intel PT h/w tracing.
    uint64_t cr3 = pt_phys;
    ktrace(TAG_IPT_PROCESS_CREATE, (uint32_t)pid, (uint32_t)(pid >> 32),
           (uint32_t)cr3, (uint32_t)(cr3 >> 32));
}
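
// A reader-side sketch (illustrative only, not kernel code): a trace decoder
// collects the TAG_IPT_PROCESS_CREATE records emitted above into a
// cr3 -> pid map, then uses the cr3 payload of PIP packets in the PT stream
// to attribute trace data to a process:
//
//   std::map<uint64_t, uint64_t> cr3_to_pid;
//   // For each TAG_IPT_PROCESS_CREATE ktrace record:
//   cr3_to_pid[record.cr3] = record.pid;
//   // When the decoder sees a PIP packet:
//   uint64_t pid = cr3_to_pid[pip_cr3];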
// Worker for x86_ipt_alloc_trace, executed on all cpus.
// This is invoked via mp_sync_exec, which thread safety analysis cannot follow.
static void x86_ipt_set_mode_task(void* raw_context) TA_NO_THREAD_SAFETY_ANALYSIS {
    DEBUG_ASSERT(arch_ints_disabled());
    DEBUG_ASSERT(!active);

    // When changing modes make sure all PT MSRs are in the init state.
    // We don't want a value to appear in the xsave buffer and have xrstors
    // #GP because XCOMP_BV has the PT bit set that's not set in XSS.
    // We still need to do this, even with ZX-892, when transitioning
    // from IPT_TRACE_CPUS to IPT_TRACE_THREADS.
    write_msr(IA32_RTIT_CTL, 0);
    write_msr(IA32_RTIT_STATUS, 0);
    write_msr(IA32_RTIT_OUTPUT_BASE, 0);
    write_msr(IA32_RTIT_OUTPUT_MASK_PTRS, 0);
    if (supports_cr3_filtering)
        write_msr(IA32_RTIT_CR3_MATCH, 0);
    // TODO(dje): addr range MSRs

    ipt_trace_mode_t new_mode =
        static_cast<ipt_trace_mode_t>(reinterpret_cast<uintptr_t>(raw_context));

    // PT state saving, if supported, was enabled during boot, so there's no
    // need to recalculate the xsave space needed.
    x86_set_extended_register_pt_state(new_mode == IPT_TRACE_THREADS);
}

zx_status_t x86_ipt_alloc_trace(ipt_trace_mode_t mode) {
    AutoLock al(&ipt_lock);

    DEBUG_ASSERT(mode == IPT_TRACE_CPUS || mode == IPT_TRACE_THREADS);

    if (!supports_pt)
        return ZX_ERR_NOT_SUPPORTED;
    if (active)
        return ZX_ERR_BAD_STATE;
    if (ipt_trace_state)
        return ZX_ERR_BAD_STATE;

    // ZX-892: We don't support changing the mode from IPT_TRACE_THREADS to
    // IPT_TRACE_CPUS: we can't turn off XSS.PT until we're sure all threads
    // have no PT state, and that's too tricky to do right now. Instead,
    // require the developer to reboot.
    if (trace_mode == IPT_TRACE_THREADS && mode == IPT_TRACE_CPUS)
        return ZX_ERR_NOT_SUPPORTED;

    if (mode == IPT_TRACE_CPUS) {
        uint32_t num_cpus = arch_max_num_cpus();
        ipt_trace_state = reinterpret_cast<ipt_trace_state_t*>(
            calloc(num_cpus, sizeof(*ipt_trace_state)));
        if (!ipt_trace_state)
            return ZX_ERR_NO_MEMORY;
    } else {
        // TODO(dje): support for IPT_TRACE_THREADS
        return ZX_ERR_NOT_SUPPORTED;
    }

    mp_sync_exec(MP_IPI_TARGET_ALL, 0, x86_ipt_set_mode_task,
                 reinterpret_cast<void*>(static_cast<uintptr_t>(mode)));

    trace_mode = mode;
    return ZX_OK;
}

// Free resources obtained by x86_ipt_alloc_trace().
// This doesn't care whether resources have already been freed, to save
// callers from having to care during any cleanup.
zx_status_t x86_ipt_free_trace() {
    AutoLock al(&ipt_lock);

    if (!supports_pt)
        return ZX_ERR_NOT_SUPPORTED;
    if (trace_mode == IPT_TRACE_THREADS)
        return ZX_ERR_BAD_STATE;
    if (active)
        return ZX_ERR_BAD_STATE;

    free(ipt_trace_state);
    ipt_trace_state = nullptr;
    return ZX_OK;
}

// This is invoked via mp_sync_exec, which thread safety analysis cannot follow.
static void x86_ipt_start_cpu_task(void* raw_context) TA_NO_THREAD_SAFETY_ANALYSIS {
    DEBUG_ASSERT(arch_ints_disabled());
    DEBUG_ASSERT(active && raw_context);

    ipt_trace_state_t* context = reinterpret_cast<ipt_trace_state_t*>(raw_context);
    uint32_t cpu = arch_curr_cpu_num();
    ipt_trace_state_t* state = &context[cpu];

    DEBUG_ASSERT(!(read_msr(IA32_RTIT_CTL) & IPT_CTL_TRACE_EN_MASK));

    // Load the ToPA configuration.
    write_msr(IA32_RTIT_OUTPUT_BASE, state->output_base);
    write_msr(IA32_RTIT_OUTPUT_MASK_PTRS, state->output_mask_ptrs);

    // Load all other MSRs prior to enabling tracing.
    write_msr(IA32_RTIT_STATUS, state->status);
    if (supports_cr3_filtering)
        write_msr(IA32_RTIT_CR3_MATCH, state->cr3_match);

    // Enable the trace.
    write_msr(IA32_RTIT_CTL, state->ctl);
}
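
// For reference, the IA32_RTIT_CTL bits most relevant here (see the Processor
// Trace chapter of Intel Vol 3); the ctl value is staged by the userspace
// driver and simply loaded above, so this list is informational only:
//
//   bit 0  TraceEn    master trace enable (IPT_CTL_TRACE_EN_MASK above)
//   bit 2  OS         trace CPL0 execution
//   bit 3  User       trace CPL>0 execution
//   bit 7  CR3Filter  enable IA32_RTIT_CR3_MATCH filtering
//   bit 8  ToPA       output to a Table of Physical Addresses
//   bit 13 BranchEn   emit change-of-flow packets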
// Begin the trace.
zx_status_t x86_ipt_start() {
    AutoLock al(&ipt_lock);

    if (!supports_pt)
        return ZX_ERR_NOT_SUPPORTED;
    if (trace_mode == IPT_TRACE_THREADS)
        return ZX_ERR_BAD_STATE;
    if (active)
        return ZX_ERR_BAD_STATE;
    if (!ipt_trace_state)
        return ZX_ERR_BAD_STATE;

    uint64_t kernel_cr3 = x86_kernel_cr3();
    TRACEF("Starting processor trace, kernel cr3: 0x%" PRIx64 "\n",
           kernel_cr3);

    if (LOCAL_TRACE) {
        uint32_t num_cpus = arch_max_num_cpus();
        for (uint32_t cpu = 0; cpu < num_cpus; ++cpu) {
            TRACEF("Cpu %u: ctl 0x%" PRIx64 ", status 0x%" PRIx64 ", base 0x%" PRIx64 ", mask 0x%" PRIx64 "\n",
                   cpu, ipt_trace_state[cpu].ctl, ipt_trace_state[cpu].status,
                   ipt_trace_state[cpu].output_base,
                   ipt_trace_state[cpu].output_mask_ptrs);
        }
    }

    active = true;

    // Sideband info needed by the trace reader.
    uint64_t platform_msr = read_msr(IA32_PLATFORM_INFO);
    unsigned nom_freq = (platform_msr >> 8) & 0xff;
    ktrace(TAG_IPT_START, (uint32_t)nom_freq, 0,
           (uint32_t)kernel_cr3, (uint32_t)(kernel_cr3 >> 32));
    const struct x86_model_info* model_info = x86_get_model();
    ktrace(TAG_IPT_CPU_INFO, model_info->processor_type,
           model_info->display_family, model_info->display_model,
           model_info->stepping);

    mp_sync_exec(MP_IPI_TARGET_ALL, 0, x86_ipt_start_cpu_task, ipt_trace_state);
    return ZX_OK;
}

// This is invoked via mp_sync_exec, which thread safety analysis cannot follow.
static void x86_ipt_stop_cpu_task(void* raw_context) TA_NO_THREAD_SAFETY_ANALYSIS {
    DEBUG_ASSERT(arch_ints_disabled());
    DEBUG_ASSERT(raw_context);

    ipt_trace_state_t* context = reinterpret_cast<ipt_trace_state_t*>(raw_context);
    uint32_t cpu = arch_curr_cpu_num();
    ipt_trace_state_t* state = &context[cpu];

    // Disable the trace.
    write_msr(IA32_RTIT_CTL, 0);

    // Retrieve MSR values to provide to userspace later.
    state->ctl = 0;
    state->status = read_msr(IA32_RTIT_STATUS);
    state->output_base = read_msr(IA32_RTIT_OUTPUT_BASE);
    state->output_mask_ptrs = read_msr(IA32_RTIT_OUTPUT_MASK_PTRS);

    // Zero all MSRs so that we are in the XSAVE initial configuration.
    // This allows h/w to do some optimizations regarding the state.
    write_msr(IA32_RTIT_STATUS, 0);
    write_msr(IA32_RTIT_OUTPUT_BASE, 0);
    write_msr(IA32_RTIT_OUTPUT_MASK_PTRS, 0);
    if (supports_cr3_filtering)
        write_msr(IA32_RTIT_CR3_MATCH, 0);

    // TODO(dje): Make it explicit that packets have been completely written.
    // See Intel Vol 3 chapter 36.2.4.

    // TODO(teisenbe): Clear ADDR* MSRs depending on leaf 1.
}
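
// For reference, a sketch of the IA32_RTIT_OUTPUT_MASK_PTRS layout in ToPA
// mode (see the Processor Trace chapter of Intel Vol 3); the value saved
// above is how userspace learns where each cpu's trace stopped:
//
//   bits [6:0]    LowerMask, reads as 0x7f
//   bits [31:7]   MaskOrTableOffset: identifies the current ToPA table entry
//                 (offset within the table)
//   bits [63:32]  Output Offset: write offset within the current output region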
// This can be called while not active, so the caller doesn't have to care
// during any cleanup.
zx_status_t x86_ipt_stop() {
    AutoLock al(&ipt_lock);

    if (!supports_pt)
        return ZX_ERR_NOT_SUPPORTED;
    if (trace_mode == IPT_TRACE_THREADS)
        return ZX_ERR_BAD_STATE;
    if (!ipt_trace_state)
        return ZX_ERR_BAD_STATE;

    TRACEF("Stopping processor trace\n");

    mp_sync_exec(MP_IPI_TARGET_ALL, 0, x86_ipt_stop_cpu_task, ipt_trace_state);
    ktrace(TAG_IPT_STOP, 0, 0, 0, 0);
    active = false;

    if (LOCAL_TRACE) {
        uint32_t num_cpus = arch_max_num_cpus();
        for (uint32_t cpu = 0; cpu < num_cpus; ++cpu) {
            TRACEF("Cpu %u: ctl 0x%" PRIx64 ", status 0x%" PRIx64 ", base 0x%" PRIx64 ", mask 0x%" PRIx64 "\n",
                   cpu, ipt_trace_state[cpu].ctl, ipt_trace_state[cpu].status,
                   ipt_trace_state[cpu].output_base,
                   ipt_trace_state[cpu].output_mask_ptrs);
        }
    }

    return ZX_OK;
}

zx_status_t x86_ipt_stage_trace_data(zx_itrace_buffer_descriptor_t descriptor,
                                     const zx_x86_pt_regs_t* regs) {
    AutoLock al(&ipt_lock);

    if (!supports_pt)
        return ZX_ERR_NOT_SUPPORTED;
    if (trace_mode == IPT_TRACE_THREADS)
        return ZX_ERR_BAD_STATE;
    if (active)
        return ZX_ERR_BAD_STATE;
    if (!ipt_trace_state)
        return ZX_ERR_BAD_STATE;
    uint32_t num_cpus = arch_max_num_cpus();
    if (descriptor >= num_cpus)
        return ZX_ERR_INVALID_ARGS;

    ipt_trace_state[descriptor].ctl = regs->ctl;
    ipt_trace_state[descriptor].status = regs->status;
    ipt_trace_state[descriptor].output_base = regs->output_base;
    ipt_trace_state[descriptor].output_mask_ptrs = regs->output_mask_ptrs;
    ipt_trace_state[descriptor].cr3_match = regs->cr3_match;
    static_assert(sizeof(ipt_trace_state[descriptor].addr_ranges) ==
                      sizeof(regs->addr_ranges),
                  "addr_ranges size mismatch");
    memcpy(ipt_trace_state[descriptor].addr_ranges, regs->addr_ranges,
           sizeof(regs->addr_ranges));

    return ZX_OK;
}

zx_status_t x86_ipt_get_trace_data(zx_itrace_buffer_descriptor_t descriptor,
                                   zx_x86_pt_regs_t* regs) {
    AutoLock al(&ipt_lock);

    if (!supports_pt)
        return ZX_ERR_NOT_SUPPORTED;
    if (trace_mode == IPT_TRACE_THREADS)
        return ZX_ERR_BAD_STATE;
    if (active)
        return ZX_ERR_BAD_STATE;
    if (!ipt_trace_state)
        return ZX_ERR_BAD_STATE;
    uint32_t num_cpus = arch_max_num_cpus();
    if (descriptor >= num_cpus)
        return ZX_ERR_INVALID_ARGS;

    regs->ctl = ipt_trace_state[descriptor].ctl;
    regs->status = ipt_trace_state[descriptor].status;
    regs->output_base = ipt_trace_state[descriptor].output_base;
    regs->output_mask_ptrs = ipt_trace_state[descriptor].output_mask_ptrs;
    regs->cr3_match = ipt_trace_state[descriptor].cr3_match;
    static_assert(sizeof(regs->addr_ranges) ==
                      sizeof(ipt_trace_state[descriptor].addr_ranges),
                  "addr_ranges size mismatch");
    memcpy(regs->addr_ranges, ipt_trace_state[descriptor].addr_ranges,
           sizeof(regs->addr_ranges));

    return ZX_OK;
}
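
// For orientation, a sketch of the expected per-cpu call sequence, as driven
// by the userspace driver through the mtrace plumbing (illustrative only;
// error handling elided):
//
//   x86_ipt_alloc_trace(IPT_TRACE_CPUS);
//   for (uint32_t cpu = 0; cpu < arch_max_num_cpus(); ++cpu)
//       x86_ipt_stage_trace_data(cpu, &regs[cpu]);  // per-cpu ToPA config
//   x86_ipt_start();
//   // ... workload runs, trace buffers fill ...
//   x86_ipt_stop();
//   for (uint32_t cpu = 0; cpu < arch_max_num_cpus(); ++cpu)
//       x86_ipt_get_trace_data(cpu, &regs[cpu]);    // where each cpu stopped
//   x86_ipt_free_trace();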