// Copyright 2017 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// See the README.md in this directory for documentation.

#include <ddk/binding.h>
#include <ddk/debug.h>
#include <ddk/device.h>
#include <ddk/driver.h>
#include <ddk/io-buffer.h>

#include <lib/zircon-internal/device/cpu-trace/intel-pm.h>
#include <lib/zircon-internal/mtrace.h>
#include <zircon/syscalls.h>
#include <zircon/syscalls/resource.h>
#include <zircon/types.h>

#include <assert.h>
#include <cpuid.h>
#include <inttypes.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "cpu-trace-private.h"

// TODO(dje): Having trouble getting this working, so just punt for now.
#define TRY_FREEZE_ON_PMI 0

// Individual bits in the fixed counter enable field.
// See Intel Volume 3, Figure 18-2 "Layout of IA32_FIXED_CTR_CTRL MSR".
#define FIXED_CTR_ENABLE_OS 1
#define FIXED_CTR_ENABLE_USR 2

// There are only a few fixed events, so handle them directly.
typedef enum {
#define DEF_FIXED_EVENT(symbol, id, regnum, flags, name, description) \
    symbol ## _ID = CPUPERF_MAKE_EVENT_ID(CPUPERF_UNIT_FIXED, id),
#include <lib/zircon-internal/device/cpu-trace/intel-pm-events.inc>
} fixed_event_id_t;

// Verify each fixed counter regnum < IPM_MAX_FIXED_COUNTERS.
#define DEF_FIXED_EVENT(symbol, id, regnum, flags, name, description) \
    && (regnum) < IPM_MAX_FIXED_COUNTERS
static_assert(1
#include <lib/zircon-internal/device/cpu-trace/intel-pm-events.inc>
    , "");

typedef enum {
#define DEF_MISC_SKL_EVENT(symbol, id, offset, size, flags, name, description) \
    symbol ## _ID = CPUPERF_MAKE_EVENT_ID(CPUPERF_UNIT_MISC, id),
#include <lib/zircon-internal/device/cpu-trace/skylake-misc-events.inc>
} misc_event_id_t;

// Misc event ids needn't be consecutive.
// Build a lookup table we can use to track duplicates.
typedef enum {
#define DEF_MISC_SKL_EVENT(symbol, id, offset, size, flags, name, description) \
    symbol ## _NUMBER,
#include <lib/zircon-internal/device/cpu-trace/skylake-misc-events.inc>
    NUM_MISC_EVENTS
} misc_event_number_t;

// This table is sorted at startup.
static cpuperf_event_id_t misc_event_table_contents[NUM_MISC_EVENTS] = {
#define DEF_MISC_SKL_EVENT(symbol, id, offset, size, flags, name, description) \
    CPUPERF_MAKE_EVENT_ID(CPUPERF_UNIT_MISC, id),
#include <lib/zircon-internal/device/cpu-trace/skylake-misc-events.inc>
};

// Const accessor to give the illusion of the table being const.
static const cpuperf_event_id_t* misc_event_table = &misc_event_table_contents[0];

static void ipm_init_misc_event_table(void);

typedef enum {
#define DEF_ARCH_EVENT(symbol, id, ebx_bit, event, umask, flags, name, description) \
    symbol,
#include <lib/zircon-internal/device/cpu-trace/intel-pm-events.inc>
} arch_event_t;

typedef enum {
#define DEF_SKL_EVENT(symbol, id, event, umask, flags, name, description) \
    symbol,
#include <lib/zircon-internal/device/cpu-trace/skylake-pm-events.inc>
} model_event_t;

typedef struct {
    uint32_t event;
    uint32_t umask;
    uint32_t flags;
} event_details_t;

static const event_details_t kArchEvents[] = {
#define DEF_ARCH_EVENT(symbol, id, ebx_bit, event, umask, flags, name, description) \
    { event, umask, flags },
#include <lib/zircon-internal/device/cpu-trace/intel-pm-events.inc>
};

static const event_details_t kModelEvents[] = {
#define DEF_SKL_EVENT(symbol, id, event, umask, flags, name, description) \
    { event, umask, flags },
#include <lib/zircon-internal/device/cpu-trace/skylake-pm-events.inc>
};

static const uint16_t kArchEventMap[] = {
#define DEF_ARCH_EVENT(symbol, id, ebx_bit, event, umask, flags, name, description) \
    [id] = symbol,
#include <lib/zircon-internal/device/cpu-trace/intel-pm-events.inc>
};
static_assert(countof(kArchEventMap) <= CPUPERF_MAX_EVENT + 1, "");

static const uint16_t kModelEventMap[] = {
#define DEF_SKL_EVENT(symbol, id, event, umask, flags, name, description) \
    [id] = symbol,
#include <lib/zircon-internal/device/cpu-trace/skylake-pm-events.inc>
};
static_assert(countof(kModelEventMap) <= CPUPERF_MAX_EVENT + 1, "");

// All configuration data is staged here before writing any MSRs, etc.
// Then when ready the "START" ioctl will write all the necessary MSRs,
// and do whatever kernel operations are required for collecting data.

typedef struct ipm_per_trace_state {
    // true if |config| has been set.
    bool configured;

    // The trace configuration as given to us via the ioctl.
    cpuperf_config_t ioctl_config;

    // The internalized form of |ioctl_config| that we pass to the kernel.
    zx_x86_ipm_config_t config;

    // # of entries in |buffers|.
    // TODO(dje): This is generally the number of cpus, but it could be
    // something else later.
    uint32_t num_buffers;

    // Each buffer is the same size (at least for now, KISS).
    // There is one buffer per cpu.
    // This is a uint32 instead of a uint64 as there's no point in supporting
    // buffers that large.
    uint32_t buffer_size;

    io_buffer_t* buffers;
} ipm_per_trace_state_t;

typedef struct cpuperf_device {
    // Once tracing has started, various things are not allowed until it stops.
    bool active;

    // one entry for each trace
    // TODO(dje): At the moment we only support one trace at a time.
    // "trace" == "data collection run"
    ipm_per_trace_state_t* per_trace_state;
} cpuperf_device_t;

static bool ipm_supported = false;
// This is only valid if |ipm_supported| is true.
static zx_x86_ipm_properties_t ipm_properties;

// maximum space, in bytes, for trace buffers (per cpu)
#define MAX_PER_TRACE_SPACE (256 * 1024 * 1024)

void cpuperf_init_once(void)
{
    ipm_init_misc_event_table();

    zx_x86_ipm_properties_t props;
    zx_handle_t resource = get_root_resource();
    zx_status_t status =
        zx_mtrace_control(resource, MTRACE_KIND_CPUPERF, MTRACE_CPUPERF_GET_PROPERTIES,
                          0, &props, sizeof(props));
    if (status != ZX_OK) {
        if (status == ZX_ERR_NOT_SUPPORTED)
            zxlogf(INFO, "%s: No PM support\n", __func__);
        else
            zxlogf(INFO, "%s: Error %d fetching ipm properties\n",
                   __func__, status);
        return;
    }

    // Skylake supports version 4. KISS and begin with that.
    // Note: This should agree with the kernel driver's check.
    if (props.pm_version < 4) {
        zxlogf(INFO, "%s: PM version 4 or above is required\n", __func__);
        return;
    }

    ipm_supported = true;
    ipm_properties = props;

    zxlogf(TRACE, "Intel Performance Monitor configuration for this chipset:\n");
    zxlogf(TRACE, "IPM: version: %u\n", ipm_properties.pm_version);
    zxlogf(TRACE, "IPM: num_programmable_events: %u\n",
           ipm_properties.num_programmable_events);
    zxlogf(TRACE, "IPM: num_fixed_events: %u\n",
           ipm_properties.num_fixed_events);
    zxlogf(TRACE, "IPM: num_misc_events: %u\n",
           ipm_properties.num_misc_events);
    zxlogf(TRACE, "IPM: programmable_counter_width: %u\n",
           ipm_properties.programmable_counter_width);
    zxlogf(TRACE, "IPM: fixed_counter_width: %u\n",
           ipm_properties.fixed_counter_width);
    zxlogf(TRACE, "IPM: perf_capabilities: 0x%lx\n",
           ipm_properties.perf_capabilities);
}


// Helper routines for the ioctls.

static void ipm_free_buffers_for_trace(ipm_per_trace_state_t* per_trace, uint32_t num_allocated) {
    // Note: This may be called with partially allocated buffers.
    assert(per_trace->buffers);
    assert(num_allocated <= per_trace->num_buffers);
    for (uint32_t i = 0; i < num_allocated; ++i)
        io_buffer_release(&per_trace->buffers[i]);
    free(per_trace->buffers);
    per_trace->buffers = NULL;
}

// Map a fixed counter event id to its h/w register number.
// Returns IPM_MAX_FIXED_COUNTERS if |id| is unknown.
static unsigned ipm_fixed_counter_number(cpuperf_event_id_t id) {
    enum {
#define DEF_FIXED_EVENT(symbol, id, regnum, flags, name, description) \
        symbol ## _NUMBER = regnum,
#include <lib/zircon-internal/device/cpu-trace/intel-pm-events.inc>
    };
    switch (id) {
    case FIXED_INSTRUCTIONS_RETIRED_ID:
        return FIXED_INSTRUCTIONS_RETIRED_NUMBER;
    case FIXED_UNHALTED_CORE_CYCLES_ID:
        return FIXED_UNHALTED_CORE_CYCLES_NUMBER;
    case FIXED_UNHALTED_REFERENCE_CYCLES_ID:
        return FIXED_UNHALTED_REFERENCE_CYCLES_NUMBER;
    default:
        return IPM_MAX_FIXED_COUNTERS;
    }
}

static int ipm_compare_cpuperf_event_id(const void* ap, const void* bp) {
    const cpuperf_event_id_t* a = ap;
    const cpuperf_event_id_t* b = bp;
    if (*a < *b)
        return -1;
    if (*a > *b)
        return 1;
    return 0;
}

static void ipm_init_misc_event_table(void) {
    qsort(misc_event_table_contents,
          countof(misc_event_table_contents),
          sizeof(misc_event_table_contents[0]),
          ipm_compare_cpuperf_event_id);
}

// Map a misc event id to its ordinal (unique number in range
// 0 ... NUM_MISC_EVENTS - 1).
// Returns -1 if |id| is unknown.
static int ipm_lookup_misc_event(cpuperf_event_id_t id) {
    cpuperf_event_id_t* p = bsearch(&id, misc_event_table,
                                    countof(misc_event_table_contents),
                                    sizeof(id),
                                    ipm_compare_cpuperf_event_id);
    if (!p)
        return -1;
    ptrdiff_t result = p - misc_event_table;
    assert(result < NUM_MISC_EVENTS);
    return (int) result;
}


// The userspace side of the driver.

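// Handler for IOCTL_CPUPERF_GET_PROPERTIES.
// Reports the h/w's performance monitoring capabilities to userspace.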
static zx_status_t ipm_get_properties(cpu_trace_device_t* dev,
                                      void* reply, size_t replymax,
                                      size_t* out_actual) {
    zxlogf(TRACE, "%s called\n", __func__);

    if (!ipm_supported)
        return ZX_ERR_NOT_SUPPORTED;

    cpuperf_properties_t props;
    if (replymax < sizeof(props))
        return ZX_ERR_BUFFER_TOO_SMALL;

    memset(&props, 0, sizeof(props));
    props.api_version = CPUPERF_API_VERSION;
    props.pm_version = ipm_properties.pm_version;
    // To the arch-independent API, the misc events on Intel are currently
    // all "fixed" in the sense that they don't occupy a limited number of
    // programmable slots. Ultimately there could still be limitations (e.g.,
    // some combination of events can't be supported) but that's ok. This
    // data is for informational/debug purposes.
    // TODO(dje): Something more elaborate can wait for publishing them via
    // some namespace.
    props.num_fixed_events = (ipm_properties.num_fixed_events +
                              ipm_properties.num_misc_events);
    props.num_programmable_events = ipm_properties.num_programmable_events;
    props.fixed_counter_width = ipm_properties.fixed_counter_width;
    props.programmable_counter_width = ipm_properties.programmable_counter_width;

    memcpy(reply, &props, sizeof(props));
    *out_actual = sizeof(props);
    return ZX_OK;
}

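// Handler for IOCTL_CPUPERF_ALLOC_TRACE.
// Allocates the per-trace state and one I/O buffer per cpu.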
static zx_status_t ipm_alloc_trace(cpu_trace_device_t* dev,
                                   const void* cmd, size_t cmdlen) {
    zxlogf(TRACE, "%s called\n", __func__);

    if (!ipm_supported)
        return ZX_ERR_NOT_SUPPORTED;
    if (dev->cpuperf)
        return ZX_ERR_BAD_STATE;

    // Note: The remaining API calls don't have to check |ipm_supported|
    // because this call will never succeed otherwise, and they all require
    // it to have been done first.

    ioctl_cpuperf_alloc_t alloc;
    if (cmdlen != sizeof(alloc))
        return ZX_ERR_INVALID_ARGS;
    memcpy(&alloc, cmd, sizeof(alloc));
    if (alloc.buffer_size > MAX_PER_TRACE_SPACE)
        return ZX_ERR_INVALID_ARGS;
    uint32_t num_cpus = zx_system_get_num_cpus();
    if (alloc.num_buffers != num_cpus) // TODO(dje): for now
        return ZX_ERR_INVALID_ARGS;

    cpuperf_device_t* ipm = calloc(1, sizeof(*dev->cpuperf));
    if (!ipm)
        return ZX_ERR_NO_MEMORY;

    ipm_per_trace_state_t* per_trace = calloc(1, sizeof(ipm->per_trace_state[0]));
    if (!per_trace) {
        free(ipm);
        return ZX_ERR_NO_MEMORY;
    }

    per_trace->buffers = calloc(num_cpus, sizeof(per_trace->buffers[0]));
    if (!per_trace->buffers) {
        free(per_trace);
        free(ipm);
        return ZX_ERR_NO_MEMORY;
    }

    uint32_t i = 0;
    for ( ; i < num_cpus; ++i) {
        zx_status_t status =
            io_buffer_init(&per_trace->buffers[i], dev->bti, alloc.buffer_size, IO_BUFFER_RW);
        if (status != ZX_OK)
            break;
    }
    if (i != num_cpus) {
        ipm_free_buffers_for_trace(per_trace, i);
        free(per_trace);
        free(ipm);
        return ZX_ERR_NO_MEMORY;
    }

    per_trace->num_buffers = alloc.num_buffers;
    per_trace->buffer_size = alloc.buffer_size;
    ipm->per_trace_state = per_trace;
    dev->cpuperf = ipm;
    return ZX_OK;
}

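// Handler for IOCTL_CPUPERF_FREE_TRACE.
// Releases all trace buffers and the per-trace state.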
static zx_status_t ipm_free_trace(cpu_trace_device_t* dev) {
    zxlogf(TRACE, "%s called\n", __func__);

    cpuperf_device_t* ipm = dev->cpuperf;
    if (!ipm)
        return ZX_ERR_BAD_STATE;
    if (ipm->active)
        return ZX_ERR_BAD_STATE;

    ipm_per_trace_state_t* per_trace = ipm->per_trace_state;
    ipm_free_buffers_for_trace(per_trace, per_trace->num_buffers);
    free(per_trace);
    free(ipm);
    dev->cpuperf = NULL;
    return ZX_OK;
}

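// Handler for IOCTL_CPUPERF_GET_ALLOC.
// Reports the number and size of the currently allocated buffers.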
static zx_status_t ipm_get_alloc(cpu_trace_device_t* dev,
                                 void* reply, size_t replymax,
                                 size_t* out_actual) {
    zxlogf(TRACE, "%s called\n", __func__);

    const cpuperf_device_t* ipm = dev->cpuperf;
    if (!ipm)
        return ZX_ERR_BAD_STATE;

    ioctl_cpuperf_alloc_t alloc;
    if (replymax < sizeof(alloc))
        return ZX_ERR_BUFFER_TOO_SMALL;

    alloc.num_buffers = ipm->per_trace_state->num_buffers;
    alloc.buffer_size = ipm->per_trace_state->buffer_size;
    memcpy(reply, &alloc, sizeof(alloc));
    *out_actual = sizeof(alloc);
    return ZX_OK;
}

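// Handler for IOCTL_CPUPERF_GET_BUFFER_HANDLE.
// Returns a duplicate of the VMO handle for the requested buffer.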
static zx_status_t ipm_get_buffer_handle(cpu_trace_device_t* dev,
                                         const void* cmd, size_t cmdlen,
                                         void* reply, size_t replymax,
                                         size_t* out_actual) {
    zxlogf(TRACE, "%s called\n", __func__);

    cpuperf_device_t* ipm = dev->cpuperf;
    if (!ipm)
        return ZX_ERR_BAD_STATE;

    ioctl_cpuperf_buffer_handle_req_t req;
    zx_handle_t h;

    if (cmdlen != sizeof(req))
        return ZX_ERR_INVALID_ARGS;
    if (replymax < sizeof(h))
        return ZX_ERR_BUFFER_TOO_SMALL;
    const ipm_per_trace_state_t* per_trace = ipm->per_trace_state;
    memcpy(&req, cmd, sizeof(req));
    if (req.descriptor >= per_trace->num_buffers)
        return ZX_ERR_INVALID_ARGS;

    zx_status_t status = zx_handle_duplicate(per_trace->buffers[req.descriptor].vmo_handle, ZX_RIGHT_SAME_RIGHTS, &h);
    if (status != ZX_OK)
        return status;
    memcpy(reply, &h, sizeof(h));
    *out_actual = sizeof(h);
    return ZX_OK;
}

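// Scratch state used while converting a cpuperf_config_t into the
// zx_x86_ipm_config_t form that is passed to the kernel.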
typedef struct {
    // Maximum number of each event we can handle.
    unsigned max_num_fixed;
    unsigned max_num_programmable;
    unsigned max_num_misc;

    // The number of events in use.
    unsigned num_fixed;
    unsigned num_programmable;
    unsigned num_misc;

    // The maximum value the counter can have before overflowing.
    uint64_t max_fixed_value;
    uint64_t max_programmable_value;

    // For catching duplicates of the fixed counters.
    bool have_fixed[IPM_MAX_FIXED_COUNTERS];
    // For catching duplicates of the misc events, 1 bit per event.
    uint64_t have_misc[(NUM_MISC_EVENTS + 63) / 64];

    // True if any event has CPUPERF_CONFIG_FLAG_TIMEBASE0 set.
    bool have_timebase0_user;
} staging_state_t;

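// Stage the fixed counter at |input_index| in |icfg|: validate the event,
// compute its initial value from the requested rate, and fill in the fixed
// ctrl and global ctrl register images in |ocfg|.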
static zx_status_t ipm_stage_fixed_config(const cpuperf_config_t* icfg,
                                          staging_state_t* ss,
                                          unsigned input_index,
                                          zx_x86_ipm_config_t* ocfg) {
    const unsigned ii = input_index;
    const cpuperf_event_id_t id = icfg->events[ii];
    bool uses_timebase0 = !!(icfg->flags[ii] & CPUPERF_CONFIG_FLAG_TIMEBASE0);
    unsigned counter = ipm_fixed_counter_number(id);

    if (counter == IPM_MAX_FIXED_COUNTERS ||
            counter >= countof(ocfg->fixed_ids) ||
            counter >= ss->max_num_fixed) {
        zxlogf(ERROR, "%s: Invalid fixed event [%u]\n", __func__, ii);
        return ZX_ERR_INVALID_ARGS;
    }
    if (ss->have_fixed[counter]) {
        zxlogf(ERROR, "%s: Fixed event [%u] already provided\n",
               __func__, counter);
        return ZX_ERR_INVALID_ARGS;
    }
    ss->have_fixed[counter] = true;
    ocfg->fixed_ids[ss->num_fixed] = id;
    if ((uses_timebase0 && input_index != 0) || icfg->rate[ii] == 0) {
        ocfg->fixed_initial_value[ss->num_fixed] = 0;
    } else {
        if (icfg->rate[ii] > ss->max_fixed_value) {
            zxlogf(ERROR, "%s: Rate too large, event [%u]\n", __func__, ii);
            return ZX_ERR_INVALID_ARGS;
        }
        ocfg->fixed_initial_value[ss->num_fixed] =
            ss->max_fixed_value - icfg->rate[ii] + 1;
    }
    // KISS: For now don't generate PMIs for counters that use
    // another as the timebase.
    if (!uses_timebase0 || ii == 0)
        ocfg->fixed_ctrl |= IA32_FIXED_CTR_CTRL_PMI_MASK(counter);
    unsigned enable = 0;
    if (icfg->flags[ii] & CPUPERF_CONFIG_FLAG_OS)
        enable |= FIXED_CTR_ENABLE_OS;
    if (icfg->flags[ii] & CPUPERF_CONFIG_FLAG_USER)
        enable |= FIXED_CTR_ENABLE_USR;
    ocfg->fixed_ctrl |= enable << IA32_FIXED_CTR_CTRL_EN_SHIFT(counter);
    ocfg->global_ctrl |= IA32_PERF_GLOBAL_CTRL_FIXED_EN_MASK(counter);
    if (icfg->flags[ii] & CPUPERF_CONFIG_FLAG_TIMEBASE0)
        ocfg->fixed_flags[ss->num_fixed] |= IPM_CONFIG_FLAG_TIMEBASE;
    if (icfg->flags[ii] & CPUPERF_CONFIG_FLAG_PC)
        ocfg->fixed_flags[ss->num_fixed] |= IPM_CONFIG_FLAG_PC;

    ++ss->num_fixed;
    return ZX_OK;
}

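// Stage the programmable counter at |input_index| in |icfg|: look up the
// event's h/w event select and umask, build the IA32_PERFEVTSEL value, and
// set the corresponding enable bit in the global ctrl register image.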
static zx_status_t ipm_stage_programmable_config(const cpuperf_config_t* icfg,
                                                 staging_state_t* ss,
                                                 unsigned input_index,
                                                 zx_x86_ipm_config_t* ocfg) {
    const unsigned ii = input_index;
    cpuperf_event_id_t id = icfg->events[ii];
    unsigned unit = CPUPERF_EVENT_ID_UNIT(id);
    unsigned event = CPUPERF_EVENT_ID_EVENT(id);
    bool uses_timebase0 = !!(icfg->flags[ii] & CPUPERF_CONFIG_FLAG_TIMEBASE0);

    // TODO(dje): Verify no duplicates.
    if (ss->num_programmable == ss->max_num_programmable) {
        zxlogf(ERROR, "%s: Too many programmable counters provided\n",
               __func__);
        return ZX_ERR_INVALID_ARGS;
    }
    ocfg->programmable_ids[ss->num_programmable] = id;
    if ((uses_timebase0 && input_index != 0) || icfg->rate[ii] == 0) {
        ocfg->programmable_initial_value[ss->num_programmable] = 0;
    } else {
        if (icfg->rate[ii] > ss->max_programmable_value) {
            zxlogf(ERROR, "%s: Rate too large, event [%u]\n", __func__, ii);
            return ZX_ERR_INVALID_ARGS;
        }
        ocfg->programmable_initial_value[ss->num_programmable] =
            ss->max_programmable_value - icfg->rate[ii] + 1;
    }
    const event_details_t* details = NULL;
    switch (unit) {
    case CPUPERF_UNIT_ARCH:
        if (event >= countof(kArchEventMap)) {
            zxlogf(ERROR, "%s: Invalid event id, event [%u]\n", __func__, ii);
            return ZX_ERR_INVALID_ARGS;
        }
        details = &kArchEvents[kArchEventMap[event]];
        break;
    case CPUPERF_UNIT_MODEL:
        if (event >= countof(kModelEventMap)) {
            zxlogf(ERROR, "%s: Invalid event id, event [%u]\n", __func__, ii);
            return ZX_ERR_INVALID_ARGS;
        }
        details = &kModelEvents[kModelEventMap[event]];
        break;
    default:
        zxlogf(ERROR, "%s: Invalid event id, event [%u]\n", __func__, ii);
        return ZX_ERR_INVALID_ARGS;
    }
    if (details->event == 0 && details->umask == 0) {
        zxlogf(ERROR, "%s: Invalid event id, event [%u]\n", __func__, ii);
        return ZX_ERR_INVALID_ARGS;
    }
    uint64_t evtsel = 0;
    evtsel |= details->event << IA32_PERFEVTSEL_EVENT_SELECT_SHIFT;
    evtsel |= details->umask << IA32_PERFEVTSEL_UMASK_SHIFT;
    if (icfg->flags[ii] & CPUPERF_CONFIG_FLAG_OS)
        evtsel |= IA32_PERFEVTSEL_OS_MASK;
    if (icfg->flags[ii] & CPUPERF_CONFIG_FLAG_USER)
        evtsel |= IA32_PERFEVTSEL_USR_MASK;
    if (details->flags & IPM_REG_FLAG_EDG)
        evtsel |= IA32_PERFEVTSEL_E_MASK;
    if (details->flags & IPM_REG_FLAG_ANYT)
        evtsel |= IA32_PERFEVTSEL_ANY_MASK;
    if (details->flags & IPM_REG_FLAG_INV)
        evtsel |= IA32_PERFEVTSEL_INV_MASK;
    evtsel |= (details->flags & IPM_REG_FLAG_CMSK_MASK) << IA32_PERFEVTSEL_CMASK_SHIFT;
    // KISS: For now don't generate PMIs for counters that use
    // another as the timebase. We still generate interrupts in
    // "counting mode" in case the counter overflows.
    if (!uses_timebase0 || ii == 0)
        evtsel |= IA32_PERFEVTSEL_INT_MASK;
    evtsel |= IA32_PERFEVTSEL_EN_MASK;
    ocfg->programmable_events[ss->num_programmable] = evtsel;
    ocfg->global_ctrl |= IA32_PERF_GLOBAL_CTRL_PMC_EN_MASK(ss->num_programmable);
    if (icfg->flags[ii] & CPUPERF_CONFIG_FLAG_TIMEBASE0)
        ocfg->programmable_flags[ss->num_programmable] |= IPM_CONFIG_FLAG_TIMEBASE;
    if (icfg->flags[ii] & CPUPERF_CONFIG_FLAG_PC)
        ocfg->programmable_flags[ss->num_programmable] |= IPM_CONFIG_FLAG_PC;

    ++ss->num_programmable;
    return ZX_OK;
}

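// Stage the misc event at |input_index| in |icfg|: validate the event and
// record it. A non-zero rate is rejected unless the event uses event 0 as
// its timebase.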
static zx_status_t ipm_stage_misc_config(const cpuperf_config_t* icfg,
                                         staging_state_t* ss,
                                         unsigned input_index,
                                         zx_x86_ipm_config_t* ocfg) {
    const unsigned ii = input_index;
    cpuperf_event_id_t id = icfg->events[ii];
    int event = ipm_lookup_misc_event(id);

    if (event < 0) {
        zxlogf(ERROR, "%s: Invalid misc event [%u]\n", __func__, ii);
        return ZX_ERR_INVALID_ARGS;
    }
    if (ss->num_misc == ss->max_num_misc) {
        zxlogf(ERROR, "%s: Too many misc counters provided\n",
               __func__);
        return ZX_ERR_INVALID_ARGS;
    }
    if (ss->have_misc[event / 64] & (1ul << (event % 64))) {
        zxlogf(ERROR, "%s: Misc event [%u] already provided\n",
               __func__, ii);
        return ZX_ERR_INVALID_ARGS;
    }
    ss->have_misc[event / 64] |= 1ul << (event % 64);
    ocfg->misc_ids[ss->num_misc] = id;
    if (icfg->flags[ii] & CPUPERF_CONFIG_FLAG_TIMEBASE0) {
        ocfg->misc_flags[ss->num_misc] |= IPM_CONFIG_FLAG_TIMEBASE;
    } else {
        if (icfg->rate[ii] != 0) {
            zxlogf(ERROR, "%s: Misc event [%u] requires a timebase\n",
                   __func__, ii);
            return ZX_ERR_INVALID_ARGS;
        }
    }

    ++ss->num_misc;
    return ZX_OK;
}

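// Handler for IOCTL_CPUPERF_STAGE_CONFIG.
// Validates the config provided by the ioctl and converts it to the internal
// form that is later passed to the kernel by the START ioctl.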
static zx_status_t ipm_stage_config(cpu_trace_device_t* dev,
                                    const void* cmd, size_t cmdlen) {
    zxlogf(TRACE, "%s called\n", __func__);

    cpuperf_device_t* ipm = dev->cpuperf;
    if (!ipm)
        return ZX_ERR_BAD_STATE;
    if (ipm->active)
        return ZX_ERR_BAD_STATE;

    // If we subsequently get an error, make sure any previous configuration
    // can't be used.
    ipm_per_trace_state_t* per_trace = ipm->per_trace_state;
    per_trace->configured = false;

    cpuperf_config_t ioctl_config;
    cpuperf_config_t* icfg = &ioctl_config;
    if (cmdlen != sizeof(*icfg))
        return ZX_ERR_INVALID_ARGS;
    memcpy(icfg, cmd, sizeof(*icfg));

    zx_x86_ipm_config_t* ocfg = &per_trace->config;
    memset(ocfg, 0, sizeof(*ocfg));

    // Validate the config and convert it to our internal form.
    // TODO(dje): Multiplexing support.

    staging_state_t staging_state;
    staging_state_t* ss = &staging_state;
    ss->max_num_fixed = ipm_properties.num_fixed_events;
    ss->max_num_programmable = ipm_properties.num_programmable_events;
    ss->max_num_misc = ipm_properties.num_misc_events;
    ss->num_fixed = 0;
    ss->num_programmable = 0;
    ss->num_misc = 0;
    ss->max_fixed_value =
        (ipm_properties.fixed_counter_width < 64
         ? (1ul << ipm_properties.fixed_counter_width) - 1
         : ~0ul);
    ss->max_programmable_value =
        (ipm_properties.programmable_counter_width < 64
         ? (1ul << ipm_properties.programmable_counter_width) - 1
         : ~0ul);
    for (unsigned i = 0; i < countof(ss->have_fixed); ++i)
        ss->have_fixed[i] = false;
    for (unsigned i = 0; i < countof(ss->have_misc); ++i)
        ss->have_misc[i] = 0;
    ss->have_timebase0_user = false;

    zx_status_t status;
    unsigned ii;  // ii: input index
    for (ii = 0; ii < countof(icfg->events); ++ii) {
        cpuperf_event_id_t id = icfg->events[ii];
        zxlogf(TRACE, "%s: processing [%u] = %u\n", __func__, ii, id);
        if (id == 0)
            break;
        unsigned unit = CPUPERF_EVENT_ID_UNIT(id);

        if (icfg->flags[ii] & ~CPUPERF_CONFIG_FLAG_MASK) {
            zxlogf(ERROR, "%s: reserved flag bits set [%u]\n", __func__, ii);
            return ZX_ERR_INVALID_ARGS;
        }

        switch (unit) {
        case CPUPERF_UNIT_FIXED:
            status = ipm_stage_fixed_config(icfg, ss, ii, ocfg);
            if (status != ZX_OK)
                return status;
            break;
        case CPUPERF_UNIT_ARCH:
        case CPUPERF_UNIT_MODEL:
            status = ipm_stage_programmable_config(icfg, ss, ii, ocfg);
            if (status != ZX_OK)
                return status;
            break;
        case CPUPERF_UNIT_MISC:
            status = ipm_stage_misc_config(icfg, ss, ii, ocfg);
            if (status != ZX_OK)
                return status;
            break;
        default:
            zxlogf(ERROR, "%s: Invalid event [%u] (bad unit)\n",
                   __func__, ii);
            return ZX_ERR_INVALID_ARGS;
        }

        if (icfg->flags[ii] & CPUPERF_CONFIG_FLAG_TIMEBASE0)
            ss->have_timebase0_user = true;
    }
    if (ii == 0) {
        zxlogf(ERROR, "%s: No events provided\n", __func__);
        return ZX_ERR_INVALID_ARGS;
    }

    // Ensure there are no holes.
    for (; ii < countof(icfg->events); ++ii) {
        if (icfg->events[ii] != 0) {
            zxlogf(ERROR, "%s: Hole at event [%u]\n", __func__, ii);
            return ZX_ERR_INVALID_ARGS;
        }
    }

    if (ss->have_timebase0_user) {
        ocfg->timebase_id = icfg->events[0];
    }

#if TRY_FREEZE_ON_PMI
    ocfg->debug_ctrl |= IA32_DEBUGCTL_FREEZE_PERFMON_ON_PMI_MASK;
#endif

    // Require something to be enabled in order to start tracing.
    // This is mostly a sanity check.
    if (per_trace->config.global_ctrl == 0) {
        zxlogf(ERROR, "%s: Requested config doesn't collect any data\n",
               __func__);
        return ZX_ERR_INVALID_ARGS;
    }

    per_trace->ioctl_config = *icfg;
    per_trace->configured = true;
    return ZX_OK;
}

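// Handler for IOCTL_CPUPERF_GET_CONFIG.
// Returns the most recently staged configuration.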
static zx_status_t ipm_get_config(cpu_trace_device_t* dev,
                                  void* reply, size_t replymax,
                                  size_t* out_actual) {
    zxlogf(TRACE, "%s called\n", __func__);

    const cpuperf_device_t* ipm = dev->cpuperf;
    if (!ipm)
        return ZX_ERR_BAD_STATE;

    const ipm_per_trace_state_t* per_trace = ipm->per_trace_state;
    if (!per_trace->configured)
        return ZX_ERR_BAD_STATE;

    const cpuperf_config_t* config = &per_trace->ioctl_config;
    if (replymax < sizeof(*config))
        return ZX_ERR_BUFFER_TOO_SMALL;

    memcpy(reply, config, sizeof(*config));
    *out_actual = sizeof(*config);
    return ZX_OK;
}

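// Handler for IOCTL_CPUPERF_START.
// Passes the staged configuration and the trace buffers to the kernel via
// mtrace and then starts data collection on all cpus.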
static zx_status_t ipm_start(cpu_trace_device_t* dev) {
    zxlogf(TRACE, "%s called\n", __func__);

    cpuperf_device_t* ipm = dev->cpuperf;
    if (!ipm)
        return ZX_ERR_BAD_STATE;
    if (ipm->active)
        return ZX_ERR_BAD_STATE;

    ipm_per_trace_state_t* per_trace = ipm->per_trace_state;
    if (!per_trace->configured)
        return ZX_ERR_BAD_STATE;

    // Step 1: Get the configuration data into the kernel for use by START.

    zxlogf(TRACE, "%s: global ctrl 0x%" PRIx64 ", fixed ctrl 0x%" PRIx64 "\n",
           __func__, per_trace->config.global_ctrl,
           per_trace->config.fixed_ctrl);

    // |per_trace->configured| should not have been set if there's nothing
    // to trace.
    assert(per_trace->config.global_ctrl != 0);

    zx_handle_t resource = get_root_resource();

    zx_status_t status =
        zx_mtrace_control(resource, MTRACE_KIND_CPUPERF,
                          MTRACE_CPUPERF_INIT, 0, NULL, 0);
    if (status != ZX_OK)
        return status;

    uint32_t num_cpus = zx_system_get_num_cpus();
    for (uint32_t cpu = 0; cpu < num_cpus; ++cpu) {
        zx_x86_ipm_buffer_t buffer;
        io_buffer_t* io_buffer = &per_trace->buffers[cpu];
        buffer.vmo = io_buffer->vmo_handle;
        status = zx_mtrace_control(resource, MTRACE_KIND_CPUPERF,
                                   MTRACE_CPUPERF_ASSIGN_BUFFER, cpu,
                                   &buffer, sizeof(buffer));
        if (status != ZX_OK)
            goto fail;
    }

    status = zx_mtrace_control(resource, MTRACE_KIND_CPUPERF,
                               MTRACE_CPUPERF_STAGE_CONFIG, 0,
                               &per_trace->config, sizeof(per_trace->config));
    if (status != ZX_OK)
        goto fail;

    // Step 2: Start data collection.

    status = zx_mtrace_control(resource, MTRACE_KIND_CPUPERF, MTRACE_CPUPERF_START,
                               0, NULL, 0);
    if (status != ZX_OK)
        goto fail;

    ipm->active = true;
    return ZX_OK;

  fail:
    {
        zx_status_t status2 =
            zx_mtrace_control(resource, MTRACE_KIND_CPUPERF,
                              MTRACE_CPUPERF_FINI, 0, NULL, 0);
        if (status2 != ZX_OK)
            zxlogf(TRACE, "%s: MTRACE_CPUPERF_FINI failed: %d\n", __func__, status2);
        assert(status2 == ZX_OK);
        return status;
    }
}

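// Handler for IOCTL_CPUPERF_STOP.
// Stops data collection and releases the kernel-side resources.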
static zx_status_t ipm_stop(cpu_trace_device_t* dev) {
    zxlogf(TRACE, "%s called\n", __func__);

    cpuperf_device_t* ipm = dev->cpuperf;
    if (!ipm)
        return ZX_ERR_BAD_STATE;

    zx_handle_t resource = get_root_resource();
    zx_status_t status =
        zx_mtrace_control(resource, MTRACE_KIND_CPUPERF,
                          MTRACE_CPUPERF_STOP, 0, NULL, 0);
    if (status == ZX_OK) {
        ipm->active = false;
        status = zx_mtrace_control(resource, MTRACE_KIND_CPUPERF,
                                   MTRACE_CPUPERF_FINI, 0, NULL, 0);
    }
    return status;
}

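// Toplevel ioctl handler for the cpuperf family; checks argument sizes where
// required and dispatches to the handlers above.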
zx_status_t cpuperf_ioctl(cpu_trace_device_t* dev, uint32_t op,
                          const void* cmd, size_t cmdlen,
                          void* reply, size_t replymax,
                          size_t* out_actual) {
    assert(IOCTL_FAMILY(op) == IOCTL_FAMILY_CPUPERF);

    switch (op) {
    case IOCTL_CPUPERF_GET_PROPERTIES:
        if (cmdlen != 0)
            return ZX_ERR_INVALID_ARGS;
        return ipm_get_properties(dev, reply, replymax, out_actual);

    case IOCTL_CPUPERF_ALLOC_TRACE:
        if (replymax != 0)
            return ZX_ERR_INVALID_ARGS;
        return ipm_alloc_trace(dev, cmd, cmdlen);

    case IOCTL_CPUPERF_FREE_TRACE:
        if (cmdlen != 0 || replymax != 0)
            return ZX_ERR_INVALID_ARGS;
        return ipm_free_trace(dev);

    case IOCTL_CPUPERF_GET_ALLOC:
        if (cmdlen != 0)
            return ZX_ERR_INVALID_ARGS;
        return ipm_get_alloc(dev, reply, replymax, out_actual);

    case IOCTL_CPUPERF_GET_BUFFER_HANDLE:
        return ipm_get_buffer_handle(dev, cmd, cmdlen, reply, replymax, out_actual);

    case IOCTL_CPUPERF_STAGE_CONFIG:
        if (replymax != 0)
            return ZX_ERR_INVALID_ARGS;
        return ipm_stage_config(dev, cmd, cmdlen);

    case IOCTL_CPUPERF_GET_CONFIG:
        return ipm_get_config(dev, reply, replymax, out_actual);

    case IOCTL_CPUPERF_START:
        if (cmdlen != 0 || replymax != 0)
            return ZX_ERR_INVALID_ARGS;
        return ipm_start(dev);

    case IOCTL_CPUPERF_STOP:
        if (cmdlen != 0 || replymax != 0)
            return ZX_ERR_INVALID_ARGS;
        return ipm_stop(dev);

    default:
        return ZX_ERR_INVALID_ARGS;
    }
}

void cpuperf_release(cpu_trace_device_t* dev) {
    // TODO(dje): None of these should fail. What to do?
    // Suggest flagging things as busted and preventing further use.
    ipm_stop(dev);
    ipm_free_trace(dev);
}