1/*
2 * Copyright 2017, Data61
3 * Commonwealth Scientific and Industrial Research Organisation (CSIRO)
4 * ABN 41 687 119 230.
5 *
6 * This software may be distributed and modified according to the terms of
7 * the BSD 2-Clause license. Note that NO WARRANTY is provided.
8 * See "LICENSE_BSD2.txt" for details.
9 *
10 * @TAG(DATA61_BSD)
11 */
12#pragma once
13
14#include <assert.h>
15#include <stdio.h>
16#include <stdint.h>
17#include <stdbool.h>
18#include <inttypes.h>
19#include <sel4bench/types.h>
20
/*
 * Read the timestamp counter into `var` (a 64-bit lvalue).
 *
 * RDTSC is not a serializing instruction, so the read is bracketed by
 * CPUID (with EAX=0, ECX=0): the first CPUID forces all preceding
 * instructions to retire before the counter is sampled, and the second
 * prevents later instructions from being hoisted above the sample.
 * RDTSC delivers the counter in EDX:EAX; those halves are copied out to
 * `high`/`low` before the second CPUID clobbers the registers, then
 * recombined into a single 64-bit value.
 */
#define SEL4BENCH_READ_CCNT(var) do { \
    uint32_t low, high; \
    asm volatile( \
        "movl $0, %%eax \n" \
        "movl $0, %%ecx \n" \
        "cpuid \n" \
        "rdtsc \n" \
        "movl %%edx, %0 \n" \
        "movl %%eax, %1 \n" \
        "movl $0, %%eax \n" \
        "movl $0, %%ecx \n" \
        "cpuid \n" \
        : \
         "=r"(high), \
         "=r"(low) \
        : \
        : "eax", "ebx", "ecx", "edx" \
    ); \
    (var) = (((uint64_t)high) << 32ull) | ((uint64_t)low); \
} while(0)
41
/* Intel docs are somewhat unclear as to exactly how to serialize PMCs.
 * We use LFENCE for the moment, because it is much faster than CPUID. If
 * event counts turn out to be unreliable, switch to CPUID-based
 * serialization by uncommenting the define below — but note that the
 * CPUID variant currently breaks the GCC register allocator.
 */
48//#define SEL4BENCH_STRICT_PMC_SERIALIZATION
49
50#include <sel4bench/arch/private.h>
51
/* Cycle counts are 64-bit; CCNT_FORMAT is the printf specifier for ccnt_t. */
#define CCNT_FORMAT "%"PRIu64
typedef uint64_t ccnt_t;
54
55/* The framework as it stands supports the following Intel processors:
56 * - All P6-family processors (up to and including the Pentium M)
57 * - All processors supporting IA-32 architectural performance
58 *   monitoring (that is, processors starting from the Intel Core Solo,
59 *   codenamed Yonah)
60 */
61
/* Declare seL4_DebugRun here so this header compiles cleanly even when
 * seL4_DebugRun is not enabled in the kernel and we never call it. If the
 * functions below are actually called without seL4_DebugRun enabled, the
 * result is a link failure rather than a silent problem, so this should
 * be OK.
 */
67void seL4_DebugRun(void (* userfn) (void *), void* userarg);
68
69static inline uint64_t sel4bench_x86_rdmsr(uint32_t reg) {
70#ifdef CONFIG_KERNEL_X86_DANGEROUS_MSR
71    return seL4_X86DangerousRDMSR(reg);
72#else
73    uint32_t msr_data[3];
74    msr_data[0] = reg;
75    msr_data[1] = 0;
76    msr_data[2] = 0;
77
78    seL4_DebugRun(&sel4bench_private_rdmsr, msr_data);
79    return (uint64_t)msr_data[1] + ((uint64_t)msr_data[2] << 32);
80#endif
81}
82
83static inline void sel4bench_x86_wrmsr(uint32_t reg, uint64_t val) {
84#ifdef CONFIG_KERNEL_X86_DANGEROUS_MSR
85    seL4_X86DangerousWRMSR(reg, val);
86#else
87    uint32_t msr_data[3];
88    msr_data[0] = reg;
89    msr_data[1] = val & 0xffffffff;
90    msr_data[2] = val >> 32;
91
92    seL4_DebugRun(&sel4bench_private_wrmsr, msr_data);
93#endif
94}
95
/*
 * Initialize the benchmarking framework on this core.
 *
 * Verifies (via CPUID) that we are on an Intel chip that supports either
 * architectural performance monitoring (Core Solo and later) or is a
 * P6-family processor, then enables user-mode RDPMC access. On an
 * unsupported chip this asserts in debug builds and returns without
 * enabling anything in release builds.
 */
static FASTFN void sel4bench_init()
{
    uint32_t cpuid_eax;
    uint32_t cpuid_ebx;
    uint32_t cpuid_ecx;
    uint32_t cpuid_edx;
    sel4bench_private_cpuid(IA32_CPUID_LEAF_BASIC, 0, &cpuid_eax, &cpuid_ebx, &cpuid_ecx, &cpuid_edx);

    //check we're running on an Intel chip (EBX/ECX/EDX of leaf 0 hold the vendor signature)
    assert(cpuid_ebx == IA32_CPUID_BASIC_MAGIC_EBX && cpuid_ecx == IA32_CPUID_BASIC_MAGIC_ECX && cpuid_edx == IA32_CPUID_BASIC_MAGIC_EDX);

    //check that either we support architectural performance monitoring, or we're running on a P6-class chip
    if (cpuid_eax < IA32_CPUID_LEAF_PMC) { //basic CPUID invocation tells us whether the processor supports arch PMCs
        //if not, ensure we're on a P6-family processor
        ia32_cpuid_model_info_t cpuid_model_info;
        sel4bench_private_cpuid(IA32_CPUID_LEAF_MODEL, 0, &(cpuid_model_info.raw), &cpuid_ebx, &cpuid_ecx, &cpuid_edx);
        assert(FAMILY(cpuid_model_info) == IA32_CPUID_FAMILY_P6);
        //the if repeats the assert's condition deliberately: when NDEBUG
        //compiles the assert away, this still bails out gracefully
        if (!(FAMILY(cpuid_model_info) == IA32_CPUID_FAMILY_P6)) {
            return;
        }
    }

    //enable user-mode RDPMC
#ifndef CONFIG_EXPORT_PMC_USER
    seL4_DebugRun(&sel4bench_private_enable_user_pmc, NULL);
#endif
}
123
124static FASTFN ccnt_t sel4bench_get_cycle_count()
125{
126    sel4bench_private_serialize_pmc(); /* Serialise all preceding instructions */
127    uint64_t time = sel4bench_private_rdtsc();
128    sel4bench_private_serialize_pmc(); /* Serialise all following instructions */
129
130    return time;
131}
132
133static FASTFN seL4_Word sel4bench_get_num_counters()
134{
135    uint32_t dummy;
136
137    //make sure the processor supports the PMC CPUID leaf
138    uint32_t max_basic_leaf = 0;
139    sel4bench_private_cpuid(IA32_CPUID_LEAF_BASIC, 0, &max_basic_leaf, &dummy, &dummy, &dummy);
140    if (max_basic_leaf >= IA32_CPUID_LEAF_PMC) { //Core Solo or later supports PMC discovery via CPUID...
141        //query the processor's PMC data
142        ia32_cpuid_leaf_pmc_eax_t pmc_eax;
143
144        sel4bench_private_cpuid(IA32_CPUID_LEAF_PMC, 0, &pmc_eax.raw, &dummy, &dummy, &dummy);
145        return pmc_eax.gp_pmc_count_per_core;
146    } else { //P6 (including Pentium M) doesn't...
147        ia32_cpuid_model_info_t model_info;
148
149        sel4bench_private_cpuid(IA32_CPUID_LEAF_MODEL, 0, &model_info.raw, &dummy, &dummy, &dummy);
150        assert(FAMILY(model_info) == IA32_CPUID_FAMILY_P6); //we only support P6 processors (P3, PM, ...)
151
152        return 2; //2 PMCs on P6
153    }
154}
155
156static FASTFN ccnt_t sel4bench_get_counter(counter_t counter)
157{
158    sel4bench_private_serialize_pmc();    /* Serialise all preceding instructions */
159    uint64_t counter_val = sel4bench_private_rdpmc(counter);
160    sel4bench_private_serialize_pmc();    /* Serialise all following instructions */
161
162    return counter_val;
163}
164
165static CACHESENSFN ccnt_t sel4bench_get_counters(counter_bitfield_t mask, ccnt_t* values)
166{
167    unsigned char counter = 0;
168
169    sel4bench_private_serialize_pmc();    /* Serialise all preceding instructions */
170    for (; mask != 0; mask >>= 1, counter++)
171        if (mask & 1) {
172            values[counter] = sel4bench_private_rdpmc(counter);
173        }
174
175    uint64_t time = sel4bench_private_rdtsc();
176    sel4bench_private_serialize_pmc();    /* Serialise all following instructions */
177
178    return time;
179}
180
/*
 * Program performance counter `counter` to count occurrences of `event`.
 *
 * Rewrites the counter's PERFEVTSEL MSR with the event encoding looked
 * up for `event`, counting in both user (USR) and kernel (OS) mode. The
 * MSR's reserved flag is read first and written back unchanged, as the
 * Intel documentation requires.
 */
static FASTFN void sel4bench_set_count_event(counter_t counter, event_id_t event)
{
    //one implementation, because P6 and architectural PMCs work identically

    assert(counter < sel4bench_get_num_counters());

    ia32_pmc_perfevtsel_t evtsel_msr;
    evtsel_msr.raw = sel4bench_x86_rdmsr(IA32_MSR_PMC_PERFEVTSEL_BASE + counter);

    //preserve the reserved flag, like the docs tell us
    uint32_t res_flag = evtsel_msr.res;

    //rewrite the MSR to what we want; lookup_event supplies the full
    //event encoding, then we layer the mode and reserved bits on top
    evtsel_msr.raw   = sel4bench_private_lookup_event(event);
    evtsel_msr.USR   = 1;
    evtsel_msr.OS    = 1;
    evtsel_msr.res   = res_flag;
    sel4bench_x86_wrmsr(IA32_MSR_PMC_PERFEVTSEL_BASE + counter, evtsel_msr.raw);
}
200
/*
 * Zero and start the performance counters selected by `mask`.
 * A mask of all-ones means "every counter the processor has".
 *
 * Each selected counter has the EN bit set in its PERFEVTSEL MSR and its
 * count register cleared. On P6 only PMC0's EN bit exists and it gates
 * both counters, so a P6 caller must request both (mask == 0x3).
 */
static FASTFN void sel4bench_start_counters(counter_bitfield_t mask)
{
    /* On P6, only the first counter has an enable flag, which controls both counters
     * simultaneously.
     * Arch PMCs are all done independently.
     */
    uint32_t dummy;

    seL4_Word num_counters = sel4bench_get_num_counters();
    /* NOTE(review): the ~0UL comparison assumes counter_bitfield_t is at
     * least as wide as unsigned long — confirm for 64-bit builds. */
    if (mask == ~(0UL)) {
        mask = ((BIT(num_counters)) - 1);
    } else {
        //reject bits for counters that don't exist
        assert((~((BIT(num_counters)) - 1) & mask) == 0);
    }

    uint32_t max_basic_leaf = 0;
    sel4bench_private_cpuid(IA32_CPUID_LEAF_BASIC, 0, &max_basic_leaf, &dummy, &dummy, &dummy);

    if (!(max_basic_leaf >= IA32_CPUID_LEAF_PMC)) {
        //we're P6, because otherwise the init() assertion would have tripped
        //P6 requires both counters; collapse to bit 0, whose EN gates both.
        //The if mirrors the assert so release builds bail out gracefully.
        assert(mask == 0x3);
        if (mask == 0x3) {
            mask = 1;
        } else {
            return;
        }
    }

    counter_t counter;
    //NOT your average for loop: mask shrinks as counters are handled, and
    //the loop ends when no requested counters remain
    for (counter = 0; mask; counter++) {
        if (!(mask & (BIT(counter)))) {
            continue;
        }

        mask &= ~(BIT(counter));

        //read appropriate MSR
        ia32_pmc_perfevtsel_t temp;
        temp.raw = sel4bench_x86_rdmsr(IA32_MSR_PMC_PERFEVTSEL_BASE + counter);

        //twiddle enable bit
        temp.EN = 1;

        //write back appropriate MSR
        sel4bench_x86_wrmsr(IA32_MSR_PMC_PERFEVTSEL_BASE + counter, temp.raw);

        //zero the counter
        sel4bench_x86_wrmsr(IA32_MSR_PMC_PERFEVTCNT_BASE + counter, 0);
    }

}
253
254static FASTFN void sel4bench_stop_counters(counter_bitfield_t mask)
255{
256    /* On P6, only the first counter has an enable flag, which controls both counters
257     * simultaneously.
258     * Arch PMCs are all done independently.
259     */
260    uint32_t dummy;
261
262    seL4_Word num_counters = sel4bench_get_num_counters();
263    if (mask == ~(0UL)) {
264        mask = ((BIT(num_counters)) - 1);
265    } else {
266        assert((~((BIT(num_counters)) - 1) & mask) == 0);
267    }
268
269    uint32_t max_basic_leaf = 0;
270    sel4bench_private_cpuid(IA32_CPUID_LEAF_BASIC, 0, &max_basic_leaf, &dummy, &dummy, &dummy);
271
272    if (!(max_basic_leaf >= IA32_CPUID_LEAF_PMC)) {
273        //we're P6, because otherwise the init() assertion would have tripped
274        assert(mask == 0x3);
275        mask = 1;
276    }
277
278    counter_t counter;
279    //NOT your average for loop!
280    for (counter = 0; mask; counter++) {
281        if (!(mask & (BIT(counter)))) {
282            continue;
283        }
284
285        mask &= ~(BIT(counter));
286
287        //read appropriate MSR
288        ia32_pmc_perfevtsel_t temp;
289        temp.raw = sel4bench_x86_rdmsr(IA32_MSR_PMC_PERFEVTSEL_BASE + counter);
290
291        //twiddle enable bit
292        temp.EN = 0;
293
294        //write back appropriate MSR
295        sel4bench_x86_wrmsr(IA32_MSR_PMC_PERFEVTSEL_BASE + counter, temp.raw);
296    }
297}
298
299static FASTFN void sel4bench_destroy()
300{
301    //stop all performance counters
302    sel4bench_stop_counters(-1);
303
304    //disable user-mode RDPMC
305#ifndef CONFIG_EXPORT_PMC_USER
306    seL4_DebugRun(&sel4bench_private_disable_user_pmc, NULL);
307#endif
308}
309
310static FASTFN void sel4bench_reset_counters(void)
311{
312    for (int i = 0; i < sel4bench_get_num_counters(); i++) {
313        sel4bench_x86_wrmsr(IA32_MSR_PMC_PERFEVTCNT_BASE + i, 0);
314    }
315}
316