// Copyright 2016 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// See the README.md in this directory for documentation.

#include <ddk/binding.h>
#include <ddk/debug.h>
#include <ddk/device.h>
#include <ddk/driver.h>
#include <ddk/io-buffer.h>

#include <lib/zircon-internal/device/cpu-trace/intel-pt.h>
#include <lib/zircon-internal/mtrace.h>
#include <zircon/compiler.h>
#include <zircon/syscalls.h>
#include <zircon/syscalls/resource.h>
#include <zircon/types.h>

#include <assert.h>
#include <cpuid.h>
#include <inttypes.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "cpu-trace-private.h"

typedef enum {
    IPT_TRACE_CPUS,
    IPT_TRACE_THREADS
} ipt_trace_mode_t;

typedef struct ipt_per_trace_state {
    // The cpu or thread this buffer is assigned to.
    // Which member to use is determined by the trace mode.
    union {
        uint32_t cpuno;
        zx_handle_t thread;
    } owner;

    // number of chunks, each 2^|chunk_order| pages in size
    uint32_t num_chunks;
    // log2 size of each chunk, in pages
    uint32_t chunk_order;
    // if true then the buffer is circular, otherwise tracing stops when the
    // buffer fills
    bool is_circular;
    // true if allocated
    bool allocated;
    // number of ToPA tables needed
    uint32_t num_tables;

    // msrs
    uint64_t ctl;
    uint64_t status;
    uint64_t output_base;
    uint64_t output_mask_ptrs;
    uint64_t cr3_match;
    struct {
        uint64_t a, b;
    } addr_ranges[IPT_MAX_NUM_ADDR_RANGES];

    // trace buffers and ToPA tables
    // ToPA: Table of Physical Addresses
    // A "trace buffer" is a set of N chunks.
    io_buffer_t* chunks;
    io_buffer_t* topas;
} ipt_per_trace_state_t;
70
71typedef struct insntrace_device {
72    ipt_trace_mode_t mode;
73
74    // # of entries in |per_trace_state|.
75    // When tracing by cpu, this is the max number of cpus.
76    // When tracing by thread, this is the max number of threads.
77    // TODO(dje): Add support for dynamically growing the vector.
78    uint32_t num_traces;
79
80    // one entry for each trace
81    ipt_per_trace_state_t* per_trace_state;
82
83    // Once tracing has started various things are not allowed until it stops.
84    bool active;
85
86    // Borrowed handle from cpu_trace_device.  Must not close
87    zx_handle_t bti;
88} insntrace_device_t;

static uint32_t ipt_config_family;
static uint32_t ipt_config_model;
static uint32_t ipt_config_stepping;

static uint32_t ipt_config_addr_cfg_max = 0;
static uint32_t ipt_config_mtc_freq_mask = 0;
static uint32_t ipt_config_cyc_thresh_mask = 0;
static uint32_t ipt_config_psb_freq_mask = 0;
static uint32_t ipt_config_num_addr_ranges = 0;
static uint32_t ipt_config_bus_freq = 0;

static bool ipt_config_supported = false;

static bool ipt_config_cr3_filtering = false;
static bool ipt_config_psb = false;
static bool ipt_config_ip_filtering = false;
static bool ipt_config_mtc = false;
static bool ipt_config_ptwrite = false;
static bool ipt_config_power_events = false;
static bool ipt_config_output_topa = false;
static bool ipt_config_output_topa_multi = false;
static bool ipt_config_output_single = false;
static bool ipt_config_output_transport = false;
static bool ipt_config_lip = false;

// Maximum space, in bytes, for trace buffers (per cpu).
// This isn't necessarily
// MAX_NUM_CHUNKS * (1 << (MAX_CHUNK_ORDER + PAGE_SIZE_SHIFT)):
// chunks have to be naturally aligned contiguous pages, but we can have
// a lot of them. Supporting large chunks and/or lots of them is for
// experimentation.
#define MAX_PER_TRACE_SPACE (256 * 1024 * 1024)

// maximum number of chunks
#define MAX_NUM_CHUNKS 4096

// log2 maximum size of each chunk, in pages (2^8 pages = 1MB)
#define MAX_CHUNK_ORDER 8

#if PAGE_SIZE == 4096
#define PAGE_SIZE_SHIFT 12
#else
#error "unsupported page size"
#endif

#define BIT(x, b) ((x) & (1u << (b)))


// The userspace side of the driver
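//
// Sketch of a typical client sequence, using the ioctls handled below
// (cpu-mode tracing; thread mode is not yet supported):
//   1. IOCTL_INSNTRACE_ALLOC_TRACE  - allocate the trace
//   2. IOCTL_INSNTRACE_ALLOC_BUFFER - allocate a buffer for each cpu
//   3. IOCTL_INSNTRACE_START ... IOCTL_INSNTRACE_STOP
//   4. IOCTL_INSNTRACE_GET_BUFFER_INFO, IOCTL_INSNTRACE_GET_CHUNK_HANDLE
//      to map and read out the captured data
//   5. IOCTL_INSNTRACE_FREE_BUFFER, then IOCTL_INSNTRACE_FREE_TRACE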

void insntrace_init_once(void)
{
    unsigned a, b, c, d, max_leaf;

    max_leaf = __get_cpuid_max(0, NULL);
    if (max_leaf < 0x14) {
        zxlogf(INFO, "IntelPT: No PT support\n");
        return;
    }

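    // Decode family/model/stepping from CPUID leaf 1, applying the
    // extended-family/extended-model adjustments per the usual CPUID
    // convention.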
    __cpuid(1, a, b, c, d);
    ipt_config_stepping = a & 0xf;
    ipt_config_model = (a >> 4) & 0xf;
    ipt_config_family = (a >> 8) & 0xf;
    if (ipt_config_family == 0xf)
        ipt_config_family += (a >> 20) & 0xff;
    if (ipt_config_family == 6 || ipt_config_family == 0xf)
        ipt_config_model += ((a >> 16) & 0xf) << 4;

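    // CPUID.(EAX=07H,ECX=0):EBX[25] indicates Processor Trace support.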
    __cpuid_count(0x07, 0, a, b, c, d);
    if (!BIT(b, 25)) {
        zxlogf(INFO, "IntelPT: No PT support\n");
        return;
    }

    ipt_config_supported = true;

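    // CPUID leaf 0x14 enumerates the PT capabilities: subleaf 0 reports the
    // maximum subleaf in EAX and feature flags in EBX/ECX; subleaf 1 reports
    // the supported MTC/cycle-threshold/PSB frequency masks and the number
    // of address ranges.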
    __cpuid_count(0x14, 0, a, b, c, d);
    if (BIT(b, 2))
        ipt_config_addr_cfg_max = 2;
    if (BIT(b, 1) && a >= 1) {
        unsigned a1, b1, c1, d1;
        __cpuid_count(0x14, 1, a1, b1, c1, d1);
        ipt_config_mtc_freq_mask = (a1 >> 16) & 0xffff;
        ipt_config_cyc_thresh_mask = b1 & 0xffff;
        ipt_config_psb_freq_mask = (b1 >> 16) & 0xffff;
        ipt_config_num_addr_ranges = a1 & 0x7;
    }

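    // CPUID leaf 0x15 reports the TSC/core-crystal-clock ratio as EBX/EAX
    // (numerator/denominator); the expression below evaluates to EBX/EAX,
    // truncated to an integer.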
    if (max_leaf >= 0x15) {
        unsigned a1 = 0, b1 = 0, c1 = 0, d1 = 0;
        __cpuid(0x15, a1, b1, c1, d1);
        if (a1 && b1)
            ipt_config_bus_freq = 1. / ((float)a1 / (float)b1);
    }

    ipt_config_cr3_filtering = !!BIT(b, 0);
    ipt_config_psb = !!BIT(b, 1);
    ipt_config_ip_filtering = !!BIT(b, 2);
    ipt_config_mtc = !!BIT(b, 3);
    ipt_config_ptwrite = !!BIT(b, 4);
    ipt_config_power_events = !!BIT(b, 5);

    ipt_config_output_topa = !!BIT(c, 0);
    ipt_config_output_topa_multi = !!BIT(c, 1);
    ipt_config_output_single = !!BIT(c, 2);
    ipt_config_output_transport = !!BIT(c, 3);
    ipt_config_lip = !!BIT(c, 31);

    zxlogf(INFO, "Intel Processor Trace configuration for this chipset:\n");
    // No need to print everything, but these are useful.
    zxlogf(INFO, "mtc_freq_mask:   0x%x\n", ipt_config_mtc_freq_mask);
    zxlogf(INFO, "cyc_thresh_mask: 0x%x\n", ipt_config_cyc_thresh_mask);
    zxlogf(INFO, "psb_freq_mask:   0x%x\n", ipt_config_psb_freq_mask);
    zxlogf(INFO, "num addr ranges: %u\n", ipt_config_num_addr_ranges);
}

// Create the ToPA(s) for the configured chunks of |per_trace|.
// A circular collection of buffers is set up, even if we're going to apply
// the stop bit to the last entry.
static void make_topa(insntrace_device_t* ipt_dev, ipt_per_trace_state_t* per_trace) {
    const size_t run_len_log2 = per_trace->chunk_order;
    assert(run_len_log2 + PAGE_SIZE_SHIFT <= IPT_TOPA_MAX_SHIFT);
    assert(run_len_log2 + PAGE_SIZE_SHIFT >= IPT_TOPA_MIN_SHIFT);

    uint32_t curr_table = 0;
    uint32_t curr_idx = 0;
    uint64_t* last_entry = NULL;

    // Note: An early version of this patch auto-computed the desired grouping
    // of pages with sufficient alignment. If you find yourself needing this
    // functionality again, see change 9470.

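    // Illustrative example, assuming 4KB pages: 5 chunks with chunk_order 1
    // produce 5 data entries of 8KB each plus one END entry, all of which
    // fit in a single table.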
    for (uint32_t i = 0; i < per_trace->num_chunks; ++i) {
        io_buffer_t* buffer = &per_trace->chunks[i];
        io_buffer_t* topa = &per_trace->topas[curr_table];
        zx_paddr_t pa = io_buffer_phys(buffer);

        uint64_t val = IPT_TOPA_ENTRY_PHYS_ADDR(pa) |
            IPT_TOPA_ENTRY_SIZE(run_len_log2 + PAGE_SIZE_SHIFT);
        uint64_t* table = io_buffer_virt(topa);
        table[curr_idx] = val;
        last_entry = &table[curr_idx];

        // Make sure we leave one at the end of the table for the END marker.
        if (unlikely(curr_idx >= IPT_TOPA_MAX_TABLE_ENTRIES - 2)) {
            curr_idx = 0;
            curr_table++;
        } else {
            curr_idx++;
        }
    }

    assert(curr_table + 1 == per_trace->num_tables ||
           // If the last table is full curr_table will be the next one.
           (curr_table == per_trace->num_tables && curr_idx == 0));

    // Populate END entries for completed tables.
    // Assume the table is circular; we'll set the stop bit on the last
    // entry later.
    for (uint32_t i = 0; i < curr_table; ++i) {
        io_buffer_t* this_table = &per_trace->topas[i];
        io_buffer_t* next_table;
        if (i == per_trace->num_tables - 1) {
            next_table = &per_trace->topas[0];
        } else {
            next_table = &per_trace->topas[i + 1];
        }

        zx_paddr_t next_table_pa = io_buffer_phys(next_table);
        uint64_t val = IPT_TOPA_ENTRY_PHYS_ADDR(next_table_pa) | IPT_TOPA_ENTRY_END;
        uint64_t* table = io_buffer_virt(this_table);
        table[IPT_TOPA_MAX_TABLE_ENTRIES - 1] = val;
    }

    // Populate the END entry for a possibly non-full last table.
    if (curr_table < per_trace->num_tables) {
        io_buffer_t* this_table = &per_trace->topas[curr_table];
        io_buffer_t* first_table = &per_trace->topas[0];
        zx_paddr_t first_table_pa = io_buffer_phys(first_table);
        uint64_t val = IPT_TOPA_ENTRY_PHYS_ADDR(first_table_pa) | IPT_TOPA_ENTRY_END;
        uint64_t* table = io_buffer_virt(this_table);
        table[curr_idx] = val;
    }

    // Add the STOP flag to the last non-END entry in the tables.
    assert(last_entry);
    if (!per_trace->is_circular)
        *last_entry |= IPT_TOPA_ENTRY_STOP;
}

// Compute the number of ToPA entries needed for the configured number of
// chunks.
// The output count includes the END entries across all needed tables.
static uint32_t compute_topa_entry_count(insntrace_device_t* ipt_dev,
                                         ipt_per_trace_state_t* per_trace) {
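    // Each table holds at most IPT_TOPA_MAX_TABLE_ENTRIES - 1 data entries,
    // with the last slot reserved for an END entry, so the number of END
    // entries is ceil(num_entries / (IPT_TOPA_MAX_TABLE_ENTRIES - 1)).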
    uint32_t num_entries = per_trace->num_chunks;
    uint32_t num_end_entries = (num_entries + IPT_TOPA_MAX_TABLE_ENTRIES - 2) /
        (IPT_TOPA_MAX_TABLE_ENTRIES - 1);
    uint32_t result = num_entries + num_end_entries;

    zxlogf(DEBUG1, "IPT: compute_topa_entry_count: num_entries: %u\n", num_entries);
    zxlogf(DEBUG1, "IPT: compute_topa_entry_count: num_end_entries: %u\n", num_end_entries);
    zxlogf(DEBUG1, "IPT: compute_topa_entry_count: total entries: %u\n", result);

    return result;
}

// Walk the tables to discover how much data has been captured for |per_trace|.
// Note: If this is a circular buffer this is just where tracing stopped.
static size_t compute_capture_size(insntrace_device_t* ipt_dev,
                                   const ipt_per_trace_state_t* per_trace) {
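    // Decode IA32_RTIT_OUTPUT_MASK_PTRS (Intel SDM Vol 3): in ToPA mode,
    // bits 31:7 hold the current table entry index (the field stores the
    // entry offset divided by 8, and entries are 8 bytes each), and
    // bits 63:32 hold the byte offset into the current entry's region.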
    uint64_t curr_table_paddr = per_trace->output_base;
    uint32_t curr_table_entry_idx = (uint32_t)per_trace->output_mask_ptrs >> 7;
    uint32_t curr_entry_offset = (uint32_t)(per_trace->output_mask_ptrs >> 32);

    zxlogf(DEBUG1, "IPT: compute_capture_size: trace %tu\n", per_trace - ipt_dev->per_trace_state);
    zxlogf(DEBUG1, "IPT: curr_table_paddr 0x%" PRIx64 ", curr_table_entry_idx %u, curr_entry_offset %u\n",
           curr_table_paddr, curr_table_entry_idx, curr_entry_offset);

    size_t total_size = 0;
    for (uint32_t table = 0; table < per_trace->num_tables; ++table) {
        // Get the physical address so that we can compare it with the value
        // in output_base.
        zx_paddr_t table_paddr = io_buffer_phys(&per_trace->topas[table]);

        for (uint32_t entry = 0; entry < IPT_TOPA_MAX_TABLE_ENTRIES - 1; ++entry) {
            if (table_paddr == curr_table_paddr && entry >= curr_table_entry_idx) {
                total_size += curr_entry_offset;
                return total_size;
            }
            uint64_t* table_ptr = io_buffer_virt(&per_trace->topas[table]);
            uint64_t topa_entry = table_ptr[entry];
            total_size += 1UL << IPT_TOPA_ENTRY_EXTRACT_SIZE(topa_entry);
        }
    }

    // Should be unreachable.
    // TODO(dje): Later flag state as broken.
    zxlogf(ERROR, "IPT: unexpectedly exited capture loop\n");
    return 0;
}

static zx_status_t x86_pt_alloc_buffer1(insntrace_device_t* ipt_dev,
                                        ipt_per_trace_state_t* per_trace,
                                        uint32_t num, uint32_t order,
                                        bool is_circular) {
    zx_status_t status;
    size_t chunk_pages = 1 << order;

    memset(per_trace, 0, sizeof(*per_trace));

    per_trace->chunks = calloc(num, sizeof(io_buffer_t));
    if (per_trace->chunks == NULL)
        return ZX_ERR_NO_MEMORY;

    for (uint32_t i = 0; i < num; ++i) {
        // ToPA entries of size N must be aligned to N, too.
        uint32_t alignment_log2 = PAGE_SIZE_SHIFT + order;
        status = io_buffer_init_aligned(&per_trace->chunks[i], ipt_dev->bti,
                                        chunk_pages * PAGE_SIZE, alignment_log2,
                                        IO_BUFFER_RW | IO_BUFFER_CONTIG);
        if (status != ZX_OK)
            return status;
        // Keep track of allocated buffers as we go in case we later fail:
        // we want to be able to free those that got allocated.
        ++per_trace->num_chunks;
        // Catch bugs in io_buffer_init_aligned. If it doesn't give us a
        // properly aligned buffer we'll get an "operational error" later.
        // See Intel Vol3 36.2.6.2.
        zx_paddr_t pa = io_buffer_phys(&per_trace->chunks[i]);
        zx_paddr_t align_mask = (1ull << alignment_log2) - 1;
        if (pa & align_mask) {
            zxlogf(ERROR, "%s: WARNING: chunk has bad alignment: alignment %u, got 0x%" PRIx64 "\n",
                   __func__, alignment_log2, pa);
            return ZX_ERR_INTERNAL;
        }
    }
    assert(per_trace->num_chunks == num);

    per_trace->chunk_order = order;
    per_trace->is_circular = is_circular;

    // TODO(dje): No need to allocate the max on the last table.
    uint32_t entry_count = compute_topa_entry_count(ipt_dev, per_trace);
    uint32_t table_count = (entry_count + IPT_TOPA_MAX_TABLE_ENTRIES - 1) /
            IPT_TOPA_MAX_TABLE_ENTRIES;

    if (entry_count < 2) {
        zxlogf(INFO, "IPT: invalid entry count: %u\n", entry_count);
        return ZX_ERR_INVALID_ARGS;
    }

    // Some early Processor Trace implementations only supported having a
    // table with a single real entry and an END.
    if (!ipt_config_output_topa_multi && entry_count > 2)
        return ZX_ERR_NOT_SUPPORTED;

    // Allocate Table(s) of Physical Addresses (ToPA) for each cpu.

    per_trace->topas = calloc(table_count, sizeof(io_buffer_t));
    if (per_trace->topas == NULL)
        return ZX_ERR_NO_MEMORY;

    for (uint32_t i = 0; i < table_count; ++i) {
        status = io_buffer_init(&per_trace->topas[i], ipt_dev->bti,
                                sizeof(uint64_t) * IPT_TOPA_MAX_TABLE_ENTRIES,
                                IO_BUFFER_RW | IO_BUFFER_CONTIG);
        if (status != ZX_OK)
            return status;
        // Keep track of allocated tables as we go in case we later fail:
        // we want to be able to free those that got allocated.
        ++per_trace->num_tables;
    }
    assert(per_trace->num_tables == table_count);

    make_topa(ipt_dev, per_trace);

    return ZX_OK;
}

static void x86_pt_free_buffer1(insntrace_device_t* ipt_dev, ipt_per_trace_state_t* per_trace) {
    if (per_trace->chunks) {
        for (uint32_t i = 0; i < per_trace->num_chunks; ++i) {
            io_buffer_release(&per_trace->chunks[i]);
        }
    }
    free(per_trace->chunks);
    per_trace->chunks = NULL;

    if (per_trace->topas) {
        for (uint32_t i = 0; i < per_trace->num_tables; ++i) {
            io_buffer_release(&per_trace->topas[i]);
        }
    }
    free(per_trace->topas);
    per_trace->topas = NULL;

    per_trace->allocated = false;
}

static zx_status_t x86_pt_alloc_buffer(insntrace_device_t* ipt_dev,
                                       const ioctl_insntrace_buffer_config_t* config,
                                       zx_itrace_buffer_descriptor_t* out_descriptor) {
    zxlogf(DEBUG1, "%s: num_chunks %u, chunk_order %u\n",
           __func__, config->num_chunks, config->chunk_order);

    if (config->num_chunks == 0 || config->num_chunks > MAX_NUM_CHUNKS)
        return ZX_ERR_INVALID_ARGS;
    if (config->chunk_order > MAX_CHUNK_ORDER)
        return ZX_ERR_INVALID_ARGS;
    size_t chunk_pages = 1 << config->chunk_order;
    size_t nr_pages = config->num_chunks * chunk_pages;
    size_t total_per_trace = nr_pages * PAGE_SIZE;
    if (total_per_trace > MAX_PER_TRACE_SPACE)
        return ZX_ERR_INVALID_ARGS;

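    // Build the set of IA32_RTIT_CTL bits the client is allowed to set,
    // based on the features this chip advertises; a request with any other
    // bit set is rejected below.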
    uint64_t settable_ctl_mask = (
        IPT_CTL_OS_ALLOWED_MASK |
        IPT_CTL_USER_ALLOWED_MASK |
        IPT_CTL_TSC_EN_MASK |
        IPT_CTL_DIS_RETC_MASK |
        IPT_CTL_BRANCH_EN_MASK
        );
    if (ipt_config_ptwrite)
        settable_ctl_mask |= IPT_CTL_PTW_EN_MASK | IPT_CTL_FUP_ON_PTW_MASK;
    if (ipt_config_cr3_filtering)
        settable_ctl_mask |= IPT_CTL_CR3_FILTER_MASK;
    if (ipt_config_mtc)
        settable_ctl_mask |= IPT_CTL_MTC_EN_MASK | IPT_CTL_MTC_FREQ_MASK;
    if (ipt_config_power_events)
        settable_ctl_mask |= IPT_CTL_POWER_EVENT_EN_MASK;
    if (ipt_config_ip_filtering) {
        if (ipt_config_num_addr_ranges >= 1)
            settable_ctl_mask |= IPT_CTL_ADDR0_MASK;
        if (ipt_config_num_addr_ranges >= 2)
            settable_ctl_mask |= IPT_CTL_ADDR1_MASK;
        if (ipt_config_num_addr_ranges >= 3)
            settable_ctl_mask |= IPT_CTL_ADDR2_MASK;
        if (ipt_config_num_addr_ranges >= 4)
            settable_ctl_mask |= IPT_CTL_ADDR3_MASK;
    }
    if (ipt_config_psb)
        settable_ctl_mask |= (IPT_CTL_CYC_EN_MASK |
                              IPT_CTL_PSB_FREQ_MASK |
                              IPT_CTL_CYC_THRESH_MASK);
    if ((config->ctl & ~settable_ctl_mask) != 0) {
        zxlogf(ERROR, "bad ctl, requested 0x%" PRIx64 ", valid 0x%" PRIx64 "\n",
               config->ctl, settable_ctl_mask);
        return ZX_ERR_INVALID_ARGS;
    }

    uint32_t mtc_freq = (uint32_t) ((config->ctl & IPT_CTL_MTC_FREQ_MASK) >> IPT_CTL_MTC_FREQ_SHIFT);
    if (mtc_freq != 0 && ((1 << mtc_freq) & ipt_config_mtc_freq_mask) == 0) {
        zxlogf(ERROR, "bad mtc_freq value, requested 0x%x, valid mask 0x%x\n",
               mtc_freq, ipt_config_mtc_freq_mask);
        return ZX_ERR_INVALID_ARGS;
    }
    uint32_t cyc_thresh = (uint32_t) ((config->ctl & IPT_CTL_CYC_THRESH_MASK) >> IPT_CTL_CYC_THRESH_SHIFT);
    if (cyc_thresh != 0 && ((1 << cyc_thresh) & ipt_config_cyc_thresh_mask) == 0) {
        zxlogf(ERROR, "bad cyc_thresh value, requested 0x%x, valid mask 0x%x\n",
               cyc_thresh, ipt_config_cyc_thresh_mask);
        return ZX_ERR_INVALID_ARGS;
    }
    uint32_t psb_freq = (uint32_t) ((config->ctl & IPT_CTL_PSB_FREQ_MASK) >> IPT_CTL_PSB_FREQ_SHIFT);
    if (psb_freq != 0 && ((1 << psb_freq) & ipt_config_psb_freq_mask) == 0) {
        zxlogf(ERROR, "bad psb_freq value, requested 0x%x, valid mask 0x%x\n",
               psb_freq, ipt_config_psb_freq_mask);
        return ZX_ERR_INVALID_ARGS;
    }

    // Find an unallocated buffer entry.
    zx_itrace_buffer_descriptor_t descriptor;
    for (descriptor = 0; descriptor < ipt_dev->num_traces; ++descriptor) {
        if (!ipt_dev->per_trace_state[descriptor].allocated)
            break;
    }
    if (descriptor == ipt_dev->num_traces)
        return ZX_ERR_NO_RESOURCES;

    ipt_per_trace_state_t* per_trace = &ipt_dev->per_trace_state[descriptor];
    memset(per_trace, 0, sizeof(*per_trace));
    zx_status_t status = x86_pt_alloc_buffer1(ipt_dev, per_trace,
                                              config->num_chunks, config->chunk_order,
                                              config->is_circular);
    if (status != ZX_OK) {
        x86_pt_free_buffer1(ipt_dev, per_trace);
        return status;
    }

    per_trace->ctl = config->ctl;
    per_trace->status = 0;
    per_trace->output_base = io_buffer_phys(&per_trace->topas[0]);
    per_trace->output_mask_ptrs = 0;
    per_trace->cr3_match = config->cr3_match;
    static_assert(sizeof(per_trace->addr_ranges) == sizeof(config->addr_ranges),
                  "addr range size mismatch");
    memcpy(per_trace->addr_ranges, config->addr_ranges, sizeof(config->addr_ranges));
    per_trace->allocated = true;
    *out_descriptor = descriptor;
    return ZX_OK;
}

static zx_status_t x86_pt_assign_thread_buffer(insntrace_device_t* ipt_dev,
                                               zx_itrace_buffer_descriptor_t descriptor,
                                               zx_handle_t thread) {
    zx_handle_close(thread);
    // TODO(dje): Thread support is still work-in-progress.
    return ZX_ERR_NOT_SUPPORTED;
}

static zx_status_t x86_pt_release_thread_buffer(insntrace_device_t* ipt_dev,
                                                zx_itrace_buffer_descriptor_t descriptor,
                                                zx_handle_t thread) {
    zx_handle_close(thread);
    // TODO(dje): Thread support is still work-in-progress.
    return ZX_ERR_NOT_SUPPORTED;
}

static zx_status_t x86_pt_free_buffer(insntrace_device_t* ipt_dev,
                                      zx_itrace_buffer_descriptor_t descriptor) {
    if (ipt_dev->active)
        return ZX_ERR_BAD_STATE;
    if (descriptor >= ipt_dev->num_traces)
        return ZX_ERR_INVALID_ARGS;
    assert(ipt_dev->per_trace_state);
    ipt_per_trace_state_t* per_trace = &ipt_dev->per_trace_state[descriptor];
    if (!per_trace->allocated)
        return ZX_ERR_INVALID_ARGS;
    x86_pt_free_buffer1(ipt_dev, per_trace);
    return ZX_OK;
}


// ioctl handlers

static zx_status_t ipt_alloc_trace(cpu_trace_device_t* dev,
                                   const void* cmd, size_t cmdlen) {
    if (!ipt_config_supported)
        return ZX_ERR_NOT_SUPPORTED;
    // For now we only support ToPA.
    if (!ipt_config_output_topa)
        return ZX_ERR_NOT_SUPPORTED;

    ioctl_insntrace_trace_config_t config;
    if (cmdlen != sizeof(config))
        return ZX_ERR_INVALID_ARGS;
    memcpy(&config, cmd, sizeof(config));

    // TODO(dje): Until thread tracing is supported.
    if (config.mode == IPT_MODE_THREADS)
        return ZX_ERR_NOT_SUPPORTED;

    uint32_t internal_mode;
    switch (config.mode) {
    case IPT_MODE_CPUS:
        internal_mode = IPT_TRACE_CPUS;
        break;
    case IPT_MODE_THREADS:
        internal_mode = IPT_TRACE_THREADS;
        break;
    default:
        return ZX_ERR_INVALID_ARGS;
    }

    if (dev->insntrace)
        return ZX_ERR_BAD_STATE;

    insntrace_device_t* ipt_dev = calloc(1, sizeof(*dev->insntrace));
    if (!ipt_dev)
        return ZX_ERR_NO_MEMORY;

    ipt_dev->num_traces = zx_system_get_num_cpus();
    ipt_dev->bti = dev->bti;

    ipt_dev->per_trace_state = calloc(ipt_dev->num_traces, sizeof(ipt_dev->per_trace_state[0]));
    if (!ipt_dev->per_trace_state) {
        free(ipt_dev);
        return ZX_ERR_NO_MEMORY;
    }

    zx_handle_t resource = get_root_resource();
    zx_status_t status =
        zx_mtrace_control(resource, MTRACE_KIND_INSNTRACE, MTRACE_INSNTRACE_ALLOC_TRACE, 0,
                          &internal_mode, sizeof(internal_mode));
    if (status != ZX_OK) {
        free(ipt_dev->per_trace_state);
        free(ipt_dev);
        return status;
    }

    ipt_dev->mode = internal_mode;
    dev->insntrace = ipt_dev;
    return ZX_OK;
}

static zx_status_t ipt_free_trace(cpu_trace_device_t* dev) {
    insntrace_device_t* ipt_dev = dev->insntrace;
    if (ipt_dev->active)
        return ZX_ERR_BAD_STATE;

    for (uint32_t i = 0; i < ipt_dev->num_traces; ++i) {
        ipt_per_trace_state_t* per_trace = &ipt_dev->per_trace_state[i];
        if (per_trace->allocated)
            x86_pt_free_buffer1(ipt_dev, per_trace);
    }

    zx_handle_t resource = get_root_resource();
    zx_status_t status =
        zx_mtrace_control(resource, MTRACE_KIND_INSNTRACE, MTRACE_INSNTRACE_FREE_TRACE, 0, NULL, 0);
    // TODO(dje): This really shouldn't fail. What to do?
    // For now flag things as busted and prevent further use: returning with
    // |dev->insntrace| still set causes subsequent alloc requests to fail.
    if (status != ZX_OK)
        return ZX_OK;

    free(ipt_dev->per_trace_state);
    free(ipt_dev);
    dev->insntrace = NULL;
    return ZX_OK;
}

static zx_status_t ipt_get_trace_config(insntrace_device_t* ipt_dev,
                                        void* reply, size_t replymax,
                                        size_t* out_actual) {
    ioctl_insntrace_trace_config_t config;
    if (replymax < sizeof(config))
        return ZX_ERR_BUFFER_TOO_SMALL;

    // Zero the struct so unset fields and padding don't leak to the client.
    memset(&config, 0, sizeof(config));
    switch (ipt_dev->mode) {
    case IPT_TRACE_CPUS:
        config.mode = IPT_MODE_CPUS;
        break;
    case IPT_TRACE_THREADS:
        config.mode = IPT_MODE_THREADS;
        break;
    default:
        __UNREACHABLE;
    }
    memcpy(reply, &config, sizeof(config));
    *out_actual = sizeof(config);
    return ZX_OK;
}

static zx_status_t ipt_alloc_buffer(insntrace_device_t* ipt_dev,
                                    const void* cmd, size_t cmdlen,
                                    void* reply, size_t replymax,
                                    size_t* out_actual) {
    ioctl_insntrace_buffer_config_t config;
    if (cmdlen != sizeof(config))
        return ZX_ERR_INVALID_ARGS;
    memcpy(&config, cmd, sizeof(config));
    zx_itrace_buffer_descriptor_t descriptor;
    if (replymax < sizeof(descriptor))
        return ZX_ERR_BUFFER_TOO_SMALL;

    zx_status_t status = x86_pt_alloc_buffer(ipt_dev, &config, &descriptor);
    if (status != ZX_OK)
        return status;
    memcpy(reply, &descriptor, sizeof(descriptor));
    *out_actual = sizeof(descriptor);
    return ZX_OK;
}

static zx_status_t ipt_assign_thread_buffer(insntrace_device_t* ipt_dev,
                                            const void* cmd, size_t cmdlen) {
    ioctl_insntrace_assign_thread_buffer_t assign;
    if (cmdlen != sizeof(assign))
        return ZX_ERR_INVALID_ARGS;

    memcpy(&assign, cmd, sizeof(assign));
    return x86_pt_assign_thread_buffer(ipt_dev, assign.descriptor, assign.thread);
}

static zx_status_t ipt_release_thread_buffer(insntrace_device_t* ipt_dev,
                                             const void* cmd, size_t cmdlen) {
    ioctl_insntrace_assign_thread_buffer_t assign;
    if (cmdlen != sizeof(assign))
        return ZX_ERR_INVALID_ARGS;

    memcpy(&assign, cmd, sizeof(assign));
    return x86_pt_release_thread_buffer(ipt_dev, assign.descriptor, assign.thread);
}

static zx_status_t ipt_get_buffer_config(insntrace_device_t* ipt_dev,
                                         const void* cmd, size_t cmdlen,
                                         void* reply, size_t replymax,
                                         size_t* out_actual) {
    zx_itrace_buffer_descriptor_t descriptor;
    ioctl_insntrace_buffer_config_t config;

    if (cmdlen != sizeof(descriptor))
        return ZX_ERR_INVALID_ARGS;
    if (replymax < sizeof(config))
        return ZX_ERR_BUFFER_TOO_SMALL;

    memcpy(&descriptor, cmd, sizeof(descriptor));
    if (descriptor >= ipt_dev->num_traces)
        return ZX_ERR_INVALID_ARGS;
    const ipt_per_trace_state_t* per_trace = &ipt_dev->per_trace_state[descriptor];
    if (!per_trace->allocated)
        return ZX_ERR_INVALID_ARGS;

    config.num_chunks = per_trace->num_chunks;
    config.chunk_order = per_trace->chunk_order;
    config.is_circular = per_trace->is_circular;
    config.ctl = per_trace->ctl;
    config.cr3_match = per_trace->cr3_match;
    static_assert(sizeof(config.addr_ranges) == sizeof(per_trace->addr_ranges),
                  "addr range size mismatch");
    memcpy(config.addr_ranges, per_trace->addr_ranges, sizeof(per_trace->addr_ranges));
    memcpy(reply, &config, sizeof(config));
    *out_actual = sizeof(config);
    return ZX_OK;
}

static zx_status_t ipt_get_buffer_info(insntrace_device_t* ipt_dev,
                                       const void* cmd, size_t cmdlen,
                                       void* reply, size_t replymax,
                                       size_t* out_actual) {
    zx_itrace_buffer_descriptor_t descriptor;
    ioctl_insntrace_buffer_info_t data;

    if (cmdlen != sizeof(descriptor))
        return ZX_ERR_INVALID_ARGS;
    if (replymax < sizeof(data))
        return ZX_ERR_BUFFER_TOO_SMALL;

    if (ipt_dev->active)
        return ZX_ERR_BAD_STATE;

    memcpy(&descriptor, cmd, sizeof(descriptor));
    if (descriptor >= ipt_dev->num_traces)
        return ZX_ERR_INVALID_ARGS;
    const ipt_per_trace_state_t* per_trace = &ipt_dev->per_trace_state[descriptor];
    if (!per_trace->allocated)
        return ZX_ERR_INVALID_ARGS;

    // Note: If this is a circular buffer this is just where tracing stopped.
    data.capture_end = compute_capture_size(ipt_dev, per_trace);
    memcpy(reply, &data, sizeof(data));
    *out_actual = sizeof(data);
    return ZX_OK;
}

static zx_status_t ipt_get_chunk_handle(insntrace_device_t* ipt_dev,
                                        const void* cmd, size_t cmdlen,
                                        void* reply, size_t replymax,
                                        size_t* out_actual) {
    ioctl_insntrace_chunk_handle_req_t req;
    zx_handle_t h;

    if (cmdlen != sizeof(req))
        return ZX_ERR_INVALID_ARGS;
    if (replymax < sizeof(h))
        return ZX_ERR_BUFFER_TOO_SMALL;

    memcpy(&req, cmd, sizeof(req));
    if (req.descriptor >= ipt_dev->num_traces)
        return ZX_ERR_INVALID_ARGS;
    const ipt_per_trace_state_t* per_trace = &ipt_dev->per_trace_state[req.descriptor];
    if (!per_trace->allocated)
        return ZX_ERR_INVALID_ARGS;
    if (req.chunk_num >= per_trace->num_chunks)
        return ZX_ERR_INVALID_ARGS;

    zx_status_t status = zx_handle_duplicate(per_trace->chunks[req.chunk_num].vmo_handle,
                                             ZX_RIGHT_SAME_RIGHTS, &h);
    if (status != ZX_OK)
        return status;
    memcpy(reply, &h, sizeof(h));
    *out_actual = sizeof(h);
    return ZX_OK;
}

static zx_status_t ipt_free_buffer(insntrace_device_t* ipt_dev,
                                   const void* cmd, size_t cmdlen) {
    zx_itrace_buffer_descriptor_t descriptor;
    if (cmdlen != sizeof(descriptor))
        return ZX_ERR_INVALID_ARGS;
    memcpy(&descriptor, cmd, sizeof(descriptor));

    return x86_pt_free_buffer(ipt_dev, descriptor);
}

// Begin tracing.
static zx_status_t ipt_start(insntrace_device_t* ipt_dev) {
    if (ipt_dev->active)
        return ZX_ERR_BAD_STATE;
    if (ipt_dev->mode != IPT_TRACE_CPUS)
        return ZX_ERR_BAD_STATE;
    assert(ipt_dev->per_trace_state);

    zx_handle_t resource = get_root_resource();
    zx_status_t status;

    // First verify a buffer has been allocated for each cpu.
    for (uint32_t cpu = 0; cpu < ipt_dev->num_traces; ++cpu) {
        const ipt_per_trace_state_t* per_trace = &ipt_dev->per_trace_state[cpu];
        if (!per_trace->allocated)
            return ZX_ERR_BAD_STATE;
    }

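    // Stage the register state for each cpu; the kernel loads these values
    // into the RTIT MSRs when tracing is started.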
    for (uint32_t cpu = 0; cpu < ipt_dev->num_traces; ++cpu) {
        const ipt_per_trace_state_t* per_trace = &ipt_dev->per_trace_state[cpu];

        zx_x86_pt_regs_t regs;
        regs.ctl = per_trace->ctl;
        regs.ctl |= IPT_CTL_TOPA_MASK | IPT_CTL_TRACE_EN_MASK;
        regs.status = per_trace->status;
        regs.output_base = per_trace->output_base;
        regs.output_mask_ptrs = per_trace->output_mask_ptrs;
        regs.cr3_match = per_trace->cr3_match;
        static_assert(sizeof(regs.addr_ranges) == sizeof(per_trace->addr_ranges),
                      "addr range size mismatch");
        memcpy(regs.addr_ranges, per_trace->addr_ranges, sizeof(per_trace->addr_ranges));

        status = zx_mtrace_control(resource, MTRACE_KIND_INSNTRACE,
                                   MTRACE_INSNTRACE_STAGE_TRACE_DATA,
                                   cpu, &regs, sizeof(regs));
        if (status != ZX_OK)
            return status;
    }

    status = zx_mtrace_control(resource, MTRACE_KIND_INSNTRACE,
                               MTRACE_INSNTRACE_START,
                               0, NULL, 0);
    if (status != ZX_OK)
        return status;
    ipt_dev->active = true;
    return ZX_OK;
}

// Stop tracing.
static zx_status_t ipt_stop(insntrace_device_t* ipt_dev) {
    if (!ipt_dev->active)
        return ZX_ERR_BAD_STATE;
    assert(ipt_dev->per_trace_state);

    zx_handle_t resource = get_root_resource();

    zx_status_t status = zx_mtrace_control(resource, MTRACE_KIND_INSNTRACE,
                                           MTRACE_INSNTRACE_STOP,
                                           0, NULL, 0);
    if (status != ZX_OK)
        return status;
    ipt_dev->active = false;

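    // Retrieve the final register state for each cpu so that the amount of
    // captured data can be computed and any operational errors reported.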
    for (uint32_t cpu = 0; cpu < ipt_dev->num_traces; ++cpu) {
        ipt_per_trace_state_t* per_trace = &ipt_dev->per_trace_state[cpu];

        zx_x86_pt_regs_t regs;
        status = zx_mtrace_control(resource, MTRACE_KIND_INSNTRACE,
                                   MTRACE_INSNTRACE_GET_TRACE_DATA,
                                   cpu, &regs, sizeof(regs));
        if (status != ZX_OK)
            return status;
        per_trace->ctl = regs.ctl;
        per_trace->status = regs.status;
        per_trace->output_base = regs.output_base;
        per_trace->output_mask_ptrs = regs.output_mask_ptrs;
        per_trace->cr3_match = regs.cr3_match;
        static_assert(sizeof(per_trace->addr_ranges) == sizeof(regs.addr_ranges),
                      "addr range size mismatch");
        memcpy(per_trace->addr_ranges, regs.addr_ranges, sizeof(regs.addr_ranges));

        // If there was an operational error, report it.
        if (per_trace->status & IPT_STATUS_ERROR_MASK) {
            zxlogf(ERROR, "%s: WARNING: operational error detected on cpu %u\n",
                   __func__, cpu);
        }
    }

    return ZX_OK;
}

zx_status_t insntrace_ioctl(cpu_trace_device_t* dev, uint32_t op,
                            const void* cmd, size_t cmdlen,
                            void* reply, size_t replymax,
                            size_t* out_actual) {
    assert(IOCTL_FAMILY(op) == IOCTL_FAMILY_INSNTRACE);

    insntrace_device_t* ipt_dev = dev->insntrace;
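    // Every operation except ALLOC_TRACE requires the trace to have been
    // allocated first.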
    if (op != IOCTL_INSNTRACE_ALLOC_TRACE) {
        if (!ipt_dev)
            return ZX_ERR_BAD_STATE;
    }

    switch (op) {
    case IOCTL_INSNTRACE_ALLOC_TRACE:
        if (replymax != 0)
            return ZX_ERR_INVALID_ARGS;
        return ipt_alloc_trace(dev, cmd, cmdlen);

    case IOCTL_INSNTRACE_FREE_TRACE:
        if (cmdlen != 0 || replymax != 0)
            return ZX_ERR_INVALID_ARGS;
        return ipt_free_trace(dev);

    case IOCTL_INSNTRACE_GET_TRACE_CONFIG:
        if (cmdlen != 0)
            return ZX_ERR_INVALID_ARGS;
        return ipt_get_trace_config(ipt_dev, reply, replymax, out_actual);

    case IOCTL_INSNTRACE_ALLOC_BUFFER:
        return ipt_alloc_buffer(ipt_dev, cmd, cmdlen, reply, replymax, out_actual);

    case IOCTL_INSNTRACE_ASSIGN_THREAD_BUFFER:
        if (replymax != 0)
            return ZX_ERR_INVALID_ARGS;
        return ipt_assign_thread_buffer(ipt_dev, cmd, cmdlen);

    case IOCTL_INSNTRACE_RELEASE_THREAD_BUFFER:
        if (replymax != 0)
            return ZX_ERR_INVALID_ARGS;
        return ipt_release_thread_buffer(ipt_dev, cmd, cmdlen);

    case IOCTL_INSNTRACE_GET_BUFFER_CONFIG:
        return ipt_get_buffer_config(ipt_dev, cmd, cmdlen, reply, replymax, out_actual);

    case IOCTL_INSNTRACE_GET_BUFFER_INFO:
        return ipt_get_buffer_info(ipt_dev, cmd, cmdlen, reply, replymax, out_actual);

    case IOCTL_INSNTRACE_GET_CHUNK_HANDLE:
        return ipt_get_chunk_handle(ipt_dev, cmd, cmdlen, reply, replymax, out_actual);

    case IOCTL_INSNTRACE_FREE_BUFFER:
        if (replymax != 0)
            return ZX_ERR_INVALID_ARGS;
        return ipt_free_buffer(ipt_dev, cmd, cmdlen);

    case IOCTL_INSNTRACE_START:
        if (cmdlen != 0 || replymax != 0)
            return ZX_ERR_INVALID_ARGS;
        return ipt_start(ipt_dev);

    case IOCTL_INSNTRACE_STOP:
        if (cmdlen != 0 || replymax != 0)
            return ZX_ERR_INVALID_ARGS;
        return ipt_stop(ipt_dev);

    default:
        return ZX_ERR_INVALID_ARGS;
    }
}

void insntrace_release(cpu_trace_device_t* dev) {
    // TODO(dje): None of these should fail. What to do?
    // For now flag things as busted and prevent further use.
    if (dev->insntrace) {
        ipt_stop(dev->insntrace);
        ipt_free_trace(dev);
    }
}