// Copyright 2016 The Fuchsia Authors
//
// Use of this source code is governed by an MIT-style
// license that can be found in the LICENSE file or at
// https://opensource.org/licenses/MIT

// TODO(ZX-992): Need to be able to r/w MSRs.
// The thought is to use resources (as in ResourceDispatcher), at which point
// this will all get rewritten. Until such time, the goal here is KISS.
// This file contains the lower part of Intel Processor Trace support that must
// be done in the kernel (so that we can read/write MSRs).
// The userspace driver is in system/udev/intel-pt/intel-pt.c.
//
// We currently only support Table of Physical Addresses (ToPA) mode: it
// allows discontiguous buffers and supports stop-on-full behavior in
// addition to wrap-around.
//
// IPT tracing has two "modes":
// - per-cpu tracing
// - thread-specific tracing
// Tracing can only be done in one mode at a time, because saving/restoring
// thread PT state via the xsaves/xrstors instructions is controlled by a
// global flag in the XSS MSR.
// Moreover, once a trace has been done with IPT_TRACE_THREADS one cannot go
// back to IPT_TRACE_CPUS: supporting this requires flushing trace state from
// all threads, which is a bit of work. For now it's easy enough to just
// require the user to reboot. ZX-892
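//
// A sketch of the expected call sequence, as driven by the userspace driver
// (the driver-side flow here is illustrative, not dictated by this file):
//   x86_ipt_alloc_trace(IPT_TRACE_CPUS);
//   x86_ipt_stage_trace_data(cpu, &regs);  // for each cpu
//   x86_ipt_start();
//   ... trace ...
//   x86_ipt_stop();
//   x86_ipt_get_trace_data(cpu, &regs);    // for each cpu
//   x86_ipt_free_trace();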

#include <arch/arch_ops.h>
#include <arch/mmu.h>
#include <arch/x86.h>
#include <arch/x86/feature.h>
#include <arch/x86/mmu.h>
#include <arch/x86/proc_trace.h>
#include <err.h>
#include <fbl/auto_lock.h>
#include <fbl/macros.h>
#include <fbl/mutex.h>
#include <fbl/unique_ptr.h>
#include <kernel/mp.h>
#include <kernel/thread.h>
#include <lib/ktrace.h>
#include <lib/zircon-internal/device/cpu-trace/intel-pt.h>
#include <lib/zircon-internal/ktrace.h>
#include <lib/zircon-internal/mtrace.h>
#include <pow2.h>
#include <string.h>
#include <trace.h>
#include <vm/vm.h>
#include <vm/vm_aspace.h>
#include <zircon/thread_annotations.h>
#include <zircon/types.h>

using fbl::AutoLock;

#define LOCAL_TRACE 0

// Control MSRs
#define IA32_RTIT_OUTPUT_BASE 0x560
#define IA32_RTIT_OUTPUT_MASK_PTRS 0x561
#define IA32_RTIT_CTL 0x570
#define IA32_RTIT_STATUS 0x571
#define IA32_RTIT_CR3_MATCH 0x572
#define IA32_RTIT_ADDR0_A 0x580
#define IA32_RTIT_ADDR0_B 0x581
#define IA32_RTIT_ADDR1_A 0x582
#define IA32_RTIT_ADDR1_B 0x583
#define IA32_RTIT_ADDR2_A 0x584
#define IA32_RTIT_ADDR2_B 0x585
#define IA32_RTIT_ADDR3_A 0x586
#define IA32_RTIT_ADDR3_B 0x587

// We need bits[15:8] to get the "maximum non-turbo ratio".
// See libipt:intel-pt.h:pt_config, and Intel Vol. 3 chapter 35.5.
#define IA32_PLATFORM_INFO 0xce

// Our own copy of what h/w supports, mostly for sanity checking.
static bool supports_pt = false;
static bool supports_cr3_filtering = false;
static bool supports_psb = false;
static bool supports_ip_filtering = false;
static bool supports_mtc = false;
static bool supports_ptwrite = false;
static bool supports_power_events = false;
static bool supports_output_topa = false;
static bool supports_output_topa_multi = false;
static bool supports_output_single = false;
static bool supports_output_transport = false;

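// Saved per-cpu PT MSR state. With IPT_TRACE_CPUS this is allocated as an
// array with one entry per cpu.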
struct ipt_trace_state_t {
    uint64_t ctl;
    uint64_t status;
    uint64_t output_base;
    uint64_t output_mask_ptrs;
    uint64_t cr3_match;
    struct {
        uint64_t a, b;
    } addr_ranges[IPT_MAX_NUM_ADDR_RANGES];
};

static fbl::Mutex ipt_lock;

static ipt_trace_state_t* ipt_trace_state TA_GUARDED(ipt_lock);

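// True while a trace is in progress, i.e. between x86_ipt_start() and
// x86_ipt_stop().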
static bool active TA_GUARDED(ipt_lock) = false;

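// The current tracing mode. Once a trace has run in IPT_TRACE_THREADS mode
// we cannot return to IPT_TRACE_CPUS; see the ZX-892 comment above.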
static ipt_trace_mode_t trace_mode TA_GUARDED(ipt_lock) = IPT_TRACE_CPUS;

void x86_processor_trace_init(void) {
    if (!x86_feature_test(X86_FEATURE_PT)) {
        return;
    }

    struct cpuid_leaf leaf;
    if (!x86_get_cpuid_subleaf(X86_CPUID_PT, 0, &leaf)) {
        return;
    }

    supports_pt = true;

    // Keep our own copy of these flags, mostly for potential sanity checks.
    supports_cr3_filtering = !!(leaf.b & (1 << 0));
    supports_psb = !!(leaf.b & (1 << 1));
    supports_ip_filtering = !!(leaf.b & (1 << 2));
    supports_mtc = !!(leaf.b & (1 << 3));
    supports_ptwrite = !!(leaf.b & (1 << 4));
    supports_power_events = !!(leaf.b & (1 << 5));

    supports_output_topa = !!(leaf.c & (1 << 0));
    supports_output_topa_multi = !!(leaf.c & (1 << 1));
    supports_output_single = !!(leaf.c & (1 << 2));
    supports_output_transport = !!(leaf.c & (1 << 3));
}

// Intel Processor Trace support needs to be able to map cr3 values that
// appear in the trace to pids that ld.so uses to dump memory maps.
void arch_trace_process_create(uint64_t pid, paddr_t pt_phys) {
    // The cr3 value that appears in Intel PT h/w tracing.
    uint64_t cr3 = pt_phys;
    ktrace(TAG_IPT_PROCESS_CREATE, (uint32_t)pid, (uint32_t)(pid >> 32),
           (uint32_t)cr3, (uint32_t)(cr3 >> 32));
}

// Worker for x86_ipt_alloc_trace to be executed on all cpus.
// This is invoked via mp_sync_exec which thread safety analysis cannot follow.
static void x86_ipt_set_mode_task(void* raw_context) TA_NO_THREAD_SAFETY_ANALYSIS {
    DEBUG_ASSERT(arch_ints_disabled());
    DEBUG_ASSERT(!active);

    // When changing modes make sure all PT MSRs are in the init state.
    // We don't want a value to appear in the xsave buffer and have xrstors
    // #gp because XCOMP_BV has the PT bit set that's not set in XSS.
    // We still need to do this, even with ZX-892, when transitioning
    // from IPT_TRACE_CPUS to IPT_TRACE_THREADS.
    write_msr(IA32_RTIT_CTL, 0);
    write_msr(IA32_RTIT_STATUS, 0);
    write_msr(IA32_RTIT_OUTPUT_BASE, 0);
    write_msr(IA32_RTIT_OUTPUT_MASK_PTRS, 0);
    if (supports_cr3_filtering)
        write_msr(IA32_RTIT_CR3_MATCH, 0);
    // TODO(dje): addr range msrs

    ipt_trace_mode_t new_mode =
        static_cast<ipt_trace_mode_t>(reinterpret_cast<uintptr_t>(raw_context));

    // PT state saving, if supported, was enabled during boot so there's no
    // need to recalculate the xsave space needed.
    x86_set_extended_register_pt_state(new_mode == IPT_TRACE_THREADS);
}

zx_status_t x86_ipt_alloc_trace(ipt_trace_mode_t mode) {
    AutoLock al(&ipt_lock);

    DEBUG_ASSERT(mode == IPT_TRACE_CPUS || mode == IPT_TRACE_THREADS);

    if (!supports_pt)
        return ZX_ERR_NOT_SUPPORTED;
    if (active)
        return ZX_ERR_BAD_STATE;
    if (ipt_trace_state)
        return ZX_ERR_BAD_STATE;

    // ZX-892: We don't support changing the mode from IPT_TRACE_THREADS to
    // IPT_TRACE_CPUS: We can't turn off XSS.PT until we're sure all threads
    // have no PT state, and that's too tricky to do right now. Instead,
    // require the developer to reboot.
    if (trace_mode == IPT_TRACE_THREADS && mode == IPT_TRACE_CPUS)
        return ZX_ERR_NOT_SUPPORTED;

    if (mode == IPT_TRACE_CPUS) {
        uint32_t num_cpus = arch_max_num_cpus();
        ipt_trace_state =
            reinterpret_cast<ipt_trace_state_t*>(calloc(num_cpus,
                                                        sizeof(*ipt_trace_state)));
        if (!ipt_trace_state)
            return ZX_ERR_NO_MEMORY;
    } else {
        // TODO(dje): support for IPT_TRACE_THREADS
        return ZX_ERR_NOT_SUPPORTED;
    }

    mp_sync_exec(MP_IPI_TARGET_ALL, 0, x86_ipt_set_mode_task,
                 reinterpret_cast<void*>(static_cast<uintptr_t>(mode)));

    trace_mode = mode;
    return ZX_OK;
}

// Free resources obtained by x86_ipt_alloc_trace().
// This doesn't care if resources have already been freed to save callers
// from having to care during any cleanup.

zx_status_t x86_ipt_free_trace() {
    AutoLock al(&ipt_lock);

    if (!supports_pt)
        return ZX_ERR_NOT_SUPPORTED;
    if (trace_mode == IPT_TRACE_THREADS)
        return ZX_ERR_BAD_STATE;
    if (active)
        return ZX_ERR_BAD_STATE;

    free(ipt_trace_state);
    ipt_trace_state = nullptr;
    return ZX_OK;
}

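// Worker for x86_ipt_start to be executed on all cpus.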
// This is invoked via mp_sync_exec which thread safety analysis cannot follow.
static void x86_ipt_start_cpu_task(void* raw_context) TA_NO_THREAD_SAFETY_ANALYSIS {
    DEBUG_ASSERT(arch_ints_disabled());
    DEBUG_ASSERT(active && raw_context);

    ipt_trace_state_t* context = reinterpret_cast<ipt_trace_state_t*>(raw_context);
    uint32_t cpu = arch_curr_cpu_num();
    ipt_trace_state_t* state = &context[cpu];

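    // Tracing must be disabled before loading the other RTIT MSRs: per the
    // SDM, writes to them #GP while IA32_RTIT_CTL.TraceEn is set.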
    DEBUG_ASSERT(!(read_msr(IA32_RTIT_CTL) & IPT_CTL_TRACE_EN_MASK));

    // Load the ToPA configuration: the physical address of the first ToPA
    // table and the current table/output offsets.
    write_msr(IA32_RTIT_OUTPUT_BASE, state->output_base);
    write_msr(IA32_RTIT_OUTPUT_MASK_PTRS, state->output_mask_ptrs);

    // Load all other msrs prior to enabling tracing.
    write_msr(IA32_RTIT_STATUS, state->status);
    if (supports_cr3_filtering)
        write_msr(IA32_RTIT_CR3_MATCH, state->cr3_match);

    // Enable the trace.
    write_msr(IA32_RTIT_CTL, state->ctl);
}

// Begin the trace.

zx_status_t x86_ipt_start() {
    AutoLock al(&ipt_lock);

    if (!supports_pt)
        return ZX_ERR_NOT_SUPPORTED;
    if (trace_mode == IPT_TRACE_THREADS)
        return ZX_ERR_BAD_STATE;
    if (active)
        return ZX_ERR_BAD_STATE;
    if (!ipt_trace_state)
        return ZX_ERR_BAD_STATE;

    uint64_t kernel_cr3 = x86_kernel_cr3();
    TRACEF("Starting processor trace, kernel cr3: 0x%" PRIxPTR "\n",
           kernel_cr3);

    if (LOCAL_TRACE) {
        uint32_t num_cpus = arch_max_num_cpus();
        for (uint32_t cpu = 0; cpu < num_cpus; ++cpu) {
            TRACEF("Cpu %u: ctl 0x%" PRIx64 ", status 0x%" PRIx64 ", base 0x%" PRIx64 ", mask 0x%" PRIx64 "\n",
                   cpu, ipt_trace_state[cpu].ctl, ipt_trace_state[cpu].status,
                   ipt_trace_state[cpu].output_base,
                   ipt_trace_state[cpu].output_mask_ptrs);
        }
    }

    active = true;

    // Sideband info needed by the trace reader.
    uint64_t platform_msr = read_msr(IA32_PLATFORM_INFO);
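    // Bits [15:8] of IA32_PLATFORM_INFO hold the maximum non-turbo ratio.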
    unsigned nom_freq = (platform_msr >> 8) & 0xff;
    ktrace(TAG_IPT_START, (uint32_t)nom_freq, 0,
           (uint32_t)kernel_cr3, (uint32_t)(kernel_cr3 >> 32));
    const struct x86_model_info* model_info = x86_get_model();
    ktrace(TAG_IPT_CPU_INFO, model_info->processor_type,
           model_info->display_family, model_info->display_model,
           model_info->stepping);

    mp_sync_exec(MP_IPI_TARGET_ALL, 0, x86_ipt_start_cpu_task, ipt_trace_state);
    return ZX_OK;
}

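// Worker for x86_ipt_stop to be executed on all cpus.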
// This is invoked via mp_sync_exec which thread safety analysis cannot follow.
static void x86_ipt_stop_cpu_task(void* raw_context) TA_NO_THREAD_SAFETY_ANALYSIS {
    DEBUG_ASSERT(arch_ints_disabled());
    DEBUG_ASSERT(raw_context);

    ipt_trace_state_t* context = reinterpret_cast<ipt_trace_state_t*>(raw_context);
    uint32_t cpu = arch_curr_cpu_num();
    ipt_trace_state_t* state = &context[cpu];

    // Disable the trace
    write_msr(IA32_RTIT_CTL, 0);

    // Save the msr values to provide to userspace later.
    // CTL was just cleared above, so record it as zero.
    state->ctl = 0;
    state->status = read_msr(IA32_RTIT_STATUS);
    state->output_base = read_msr(IA32_RTIT_OUTPUT_BASE);
    state->output_mask_ptrs = read_msr(IA32_RTIT_OUTPUT_MASK_PTRS);

    // Zero all MSRs so that we are in the XSAVE initial configuration.
    // This allows h/w to do some optimizations regarding the state.
    write_msr(IA32_RTIT_STATUS, 0);
    write_msr(IA32_RTIT_OUTPUT_BASE, 0);
    write_msr(IA32_RTIT_OUTPUT_MASK_PTRS, 0);
    if (supports_cr3_filtering)
        write_msr(IA32_RTIT_CR3_MATCH, 0);

    // TODO(dje): Make it explicit that packets have been completely written.
    // See Intel Vol 3 chapter 36.2.4.

    // TODO(teisenbe): Clear ADDR* MSRs depending on leaf 1
}

// This can be called while not active, so the caller doesn't have to care
// during any cleanup.

zx_status_t x86_ipt_stop() {
    AutoLock al(&ipt_lock);

    if (!supports_pt)
        return ZX_ERR_NOT_SUPPORTED;
    if (trace_mode == IPT_TRACE_THREADS)
        return ZX_ERR_BAD_STATE;
    if (!ipt_trace_state)
        return ZX_ERR_BAD_STATE;

    TRACEF("Stopping processor trace\n");

    mp_sync_exec(MP_IPI_TARGET_ALL, 0, x86_ipt_stop_cpu_task, ipt_trace_state);
    ktrace(TAG_IPT_STOP, 0, 0, 0, 0);
    active = false;

    if (LOCAL_TRACE) {
        uint32_t num_cpus = arch_max_num_cpus();
        for (uint32_t cpu = 0; cpu < num_cpus; ++cpu) {
            TRACEF("Cpu %u: ctl 0x%" PRIx64 ", status 0x%" PRIx64 ", base 0x%" PRIx64 ", mask 0x%" PRIx64 "\n",
                   cpu, ipt_trace_state[cpu].ctl, ipt_trace_state[cpu].status,
                   ipt_trace_state[cpu].output_base,
                   ipt_trace_state[cpu].output_mask_ptrs);
        }
    }

    return ZX_OK;
}

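// Stage the MSR values to load for |descriptor| (the cpu number when tracing
// in IPT_TRACE_CPUS mode) when tracing is next started.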
zx_status_t x86_ipt_stage_trace_data(zx_itrace_buffer_descriptor_t descriptor,
                                     const zx_x86_pt_regs_t* regs) {
    AutoLock al(&ipt_lock);

    if (!supports_pt)
        return ZX_ERR_NOT_SUPPORTED;
    if (trace_mode == IPT_TRACE_THREADS)
        return ZX_ERR_BAD_STATE;
    if (active)
        return ZX_ERR_BAD_STATE;
    if (!ipt_trace_state)
        return ZX_ERR_BAD_STATE;
    uint32_t num_cpus = arch_max_num_cpus();
    if (descriptor >= num_cpus)
        return ZX_ERR_INVALID_ARGS;

    ipt_trace_state[descriptor].ctl = regs->ctl;
    ipt_trace_state[descriptor].status = regs->status;
    ipt_trace_state[descriptor].output_base = regs->output_base;
    ipt_trace_state[descriptor].output_mask_ptrs = regs->output_mask_ptrs;
    ipt_trace_state[descriptor].cr3_match = regs->cr3_match;
    static_assert(sizeof(ipt_trace_state[descriptor].addr_ranges) ==
                      sizeof(regs->addr_ranges),
                  "addr_ranges size mismatch");
    memcpy(ipt_trace_state[descriptor].addr_ranges, regs->addr_ranges,
           sizeof(regs->addr_ranges));

    return ZX_OK;
}

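// Retrieve the MSR values that were saved for |descriptor| when tracing was
// last stopped.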
zx_status_t x86_ipt_get_trace_data(zx_itrace_buffer_descriptor_t descriptor,
                                   zx_x86_pt_regs_t* regs) {
    AutoLock al(&ipt_lock);

    if (!supports_pt)
        return ZX_ERR_NOT_SUPPORTED;
    if (trace_mode == IPT_TRACE_THREADS)
        return ZX_ERR_BAD_STATE;
    if (active)
        return ZX_ERR_BAD_STATE;
    if (!ipt_trace_state)
        return ZX_ERR_BAD_STATE;
    uint32_t num_cpus = arch_max_num_cpus();
    if (descriptor >= num_cpus)
        return ZX_ERR_INVALID_ARGS;

    regs->ctl = ipt_trace_state[descriptor].ctl;
    regs->status = ipt_trace_state[descriptor].status;
    regs->output_base = ipt_trace_state[descriptor].output_base;
    regs->output_mask_ptrs = ipt_trace_state[descriptor].output_mask_ptrs;
    regs->cr3_match = ipt_trace_state[descriptor].cr3_match;
    static_assert(sizeof(regs->addr_ranges) ==
                      sizeof(ipt_trace_state[descriptor].addr_ranges),
                  "addr_ranges size mismatch");
    memcpy(regs->addr_ranges, ipt_trace_state[descriptor].addr_ranges,
           sizeof(regs->addr_ranges));

    return ZX_OK;
}