/*
 * Copyright 2017, Data61
 * Commonwealth Scientific and Industrial Research Organisation (CSIRO)
 * ABN 41 687 119 230.
 *
 * This software may be distributed and modified according to the terms of
 * the BSD 2-Clause license. Note that NO WARRANTY is provided.
 * See "LICENSE_BSD2.txt" for details.
 *
 * @TAG(DATA61_BSD)
 */
#pragma once

#include <autoconf.h>
#include <assert.h>
#include <stdint.h>
#include <utils/util.h>

//function attributes
//ultra-short, time-sensitive functions: always inlined into the caller
#define FASTFN inline __attribute__((always_inline))

//short, cache-sensitive functions (note: short means one cache line):
//never inlined, and aligned to 64 bytes
#define CACHESENSFN __attribute__((noinline, aligned(64)))

//functions that will be passed to seL4_DebugRun() -- fast, but obviously not inlined;
//flatten asks the compiler to inline their callees into them where it can
#define KERNELFN __attribute__((noinline, flatten))

28#define IN_TX_BIT BIT(0)
29#define IN_TXCP_BIT BIT(1)
30
31#include "events.h"
32
//CPUID leaf node numbers (passed as the leaf argument of sel4bench_private_cpuid)
//NOTE(review): 0x80000000 does not fit in a 32-bit int, so before C23 the last
//enumerator relies on the compiler accepting wider enumerator values as an extension
enum {
    IA32_CPUID_LEAF_BASIC    = 0,          //CPUID.0: vendor identification
    IA32_CPUID_LEAF_MODEL    = 1,          //CPUID.1: family and model information
    IA32_CPUID_LEAF_PMC      = 0xA,        //CPUID.PMC: performance monitoring
    IA32_CPUID_LEAF_EXTENDED = 0x80000000, //first extended leaf
};

//CPUID.0 "GenuineIntel"
//Each constant packs four ASCII characters, least-significant byte first;
//the string reads in register order EBX, EDX, ECX.
#define IA32_CPUID_BASIC_MAGIC_EBX 0x756E6547 //"Genu"
#define IA32_CPUID_BASIC_MAGIC_ECX 0x6C65746E //"ntel"
#define IA32_CPUID_BASIC_MAGIC_EDX 0x49656E69 //"ineI"

//CPUID.1 Family and Model ID macros.
//x is an ia32_cpuid_model_info_t (any object with family, ex_family, model and
//ex_model members works). x is evaluated more than once, so it must not have side effects.
//FAMILY: the extended family is added only when the base family is 0xF.
//MODEL:  the extended model supplies the high nibble only when the base family is 0xF or 0x6.
#define FAMILY(x) (  (x).family == 0xF                       ? ( (x).ex_family      + (x).family) : (x).family )
#define MODEL(x)  ( ((x).family == 0xF || (x).family == 0x6) ? (((x).ex_model << 4) + (x).model ) : (x).model  )
#define IA32_CPUID_FAMILY_P6 0x6
50typedef union {
51    struct {
52        seL4_Word stepping  : 4;
53        seL4_Word model     : 4;
54        seL4_Word family    : 4;
55        seL4_Word type      : 2;
56        seL4_Word reserved1 : 2;
57        seL4_Word ex_model  : 4;
58        seL4_Word ex_family : 8;
59        seL4_Word reserved2 : 4;
60    };
61    uint32_t raw;
62} ia32_cpuid_model_info_t;
63
//CPUID.PMC Performance-monitoring macros and types
//Byte-wise view of the EAX value returned by the IA32_CPUID_LEAF_PMC leaf;
//members are listed from the least-significant byte of raw upwards (little-endian).
typedef union {
    struct {
        uint8_t pmc_version_id;        //performance-monitoring version
        uint8_t gp_pmc_count_per_core; //number of general-purpose counters
        uint8_t gp_pmc_bit_width;      //width of each general-purpose counter, in bits
        uint8_t ebx_bit_vector_length; //length of the bit vector returned in EBX
    };
    uint32_t  raw;
} ia32_cpuid_leaf_pmc_eax_t;

//Control-register constants
//bit index (not a mask) of the PCE flag in CR4; used as BIT(IA32_CR4_PCE)
#define IA32_CR4_PCE               8

//PMC MSRs
//base MSR numbers of the event-select and event-counter registers
#define IA32_MSR_PMC_PERFEVTSEL_BASE 0x186
#define IA32_MSR_PMC_PERFEVTCNT_BASE 0x0C1
//Layout of a PMC event-select register value (see IA32_MSR_PMC_PERFEVTSEL_BASE).
//raw is the low 32 bits of the MSR; members are listed from the least-significant
//bits upwards (little-endian, bit-fields allocated LSB-first).
typedef union {
    struct {
        uint16_t event;          //event number, in the same encoding as the generic event tables
        union {
            struct {
                uint8_t USR : 1; //count while in user mode
                uint8_t OS  : 1; //count while in kernel mode
                uint8_t E   : 1; //edge detect
                uint8_t PC  : 1; //pin control
                uint8_t INT : 1; //interrupt on overflow
                uint8_t res : 1; //reserved
                uint8_t EN  : 1; //enable the counter
                uint8_t INV : 1; //invert the counter-mask comparison
            };
            uint8_t flags;       //all eight flag bits at once
        };
        uint8_t cmask;           //counter mask
    };
    uint32_t raw;
} ia32_pmc_perfevtsel_t;

102//Convenient execution of CPUID instruction. The first version isn't volatile, so is for querying the processor; the second version just serialises.
103//This looks slow, but gcc inlining is smart enough to optimise away all the memory references, and takes unused information into account.
104static FASTFN void sel4bench_private_cpuid(uint32_t leaf, uint32_t subleaf, uint32_t * eax, uint32_t * ebx, uint32_t * ecx, uint32_t * edx)
105{
106    asm (
107        "cpuid"
108        : "=a"(*eax)    /* output eax */
109        , "=b"(*ebx)    /* output ebx */
110        , "=c"(*ecx)    /* output ecx */
111        , "=d"(*edx)    /* output edx */
112        : "a" (leaf)    /* input query leaf */
113        , "c" (subleaf) /* input query subleaf */
114    );
115}
116static FASTFN void sel4bench_private_cpuid_serial()
117{
118    //set leaf and subleaf to 0 for predictability
119    uint32_t eax = 0;
120    uint32_t ecx = 0;
121    asm volatile (
122        "cpuid"
123        : "+a"(eax)       /* eax = 0 and gets clobbered */
124        , "+c"(ecx)       /* ecx = 0 and gets clobbered */
125        :                 /* no other inputs to this version */
126        : "%ebx"          /* clobber ebx */
127        , "%edx"          /* clobber edx */
128        , "cc"            /* clobber condition code */
129    );
130}
131static FASTFN void sel4bench_private_lfence()
132{
133    asm volatile("lfence");
134}
135
136static FASTFN uint64_t sel4bench_private_rdtsc()
137{
138    uint32_t lo, hi;
139    asm volatile (
140        "rdtsc"
141        : "=a"(lo), "=d"(hi)
142    );
143    return (((uint64_t)hi << 32ull) | (uint64_t)lo);
144}
145
146static FASTFN uint64_t sel4bench_private_rdpmc(uint32_t counter)
147{
148    uint32_t hi, lo;
149    asm volatile (
150        "rdpmc"
151        : "=a"(lo), "=d"(hi)
152        : "c"(counter)
153    );
154    return (((uint64_t)hi << 32ull) | (uint64_t)lo);
155}
156
//Serialization instruction for before and after reading PMCs
//See comment in arch/sel4bench.h for details.
//With SEL4BENCH_STRICT_PMC_SERIALIZATION defined this is the CPUID variant,
//otherwise it is the LFENCE variant.
#ifdef SEL4BENCH_STRICT_PMC_SERIALIZATION
#define sel4bench_private_serialize_pmc sel4bench_private_cpuid_serial
#else //SEL4BENCH_STRICT_PMC_SERIALIZATION
#define sel4bench_private_serialize_pmc sel4bench_private_lfence
#endif //SEL4BENCH_STRICT_PMC_SERIALIZATION

165/* Hide these definitions if using kernel exported PMC to prevent warnings */
166#ifndef CONFIG_EXPORT_PMC_USER
167//enable user-level pmc access
168static KERNELFN void sel4bench_private_enable_user_pmc(void* arg)
169{
170#ifdef CONFIG_ARCH_X86_64
171
172    uint64_t dummy;
173    asm volatile (
174        "movq   %%cr4, %0;"
175        "orq    %[pce], %0;"
176        "movq   %0, %%cr4;"
177        : "=r" (dummy)
178        : [pce] "i" BIT(IA32_CR4_PCE)
179        : "cc"
180    );
181#else
182
183    uint32_t dummy;
184    asm volatile (
185        "movl %%cr4, %0;"               /* read CR4 */
186        "orl %[pce], %0;"               /* enable PCE flag */
187        "movl %0, %%cr4;"               /* write CR4 */
188        : "=r" (dummy)                  /* fake output to ask GCC for a register */
189        : [pce] "i" BIT(IA32_CR4_PCE) /* input PCE flag */
190        : "cc"                          /* clobber condition code */
191    );
192#endif
193}
194
195//disable user-level pmc access
196static KERNELFN void sel4bench_private_disable_user_pmc(void* arg)
197{
198#ifdef CONFIG_ARCH_X86_64
199    uint64_t dummy;
200    asm volatile (
201        "movq   %%cr4, %0;"
202        "andq   %[pce], %0;"
203        "movq   %0, %%cr4;"
204        : "=r" (dummy)
205        : [pce] "i" (~BIT(IA32_CR4_PCE))
206        : "cc"
207    );
208
209#else
210    uint32_t dummy;
211    asm volatile (
212        "movl %%cr4, %0;"                  /* read CR4 */
213        "andl %[pce], %0;"                 /* enable PCE flag */
214        "movl %0, %%cr4;"                  /* write CR4 */
215        : "=r" (dummy)                     /* fake output to ask GCC for a register */
216        : [pce] "i" (~BIT(IA32_CR4_PCE)) /* input PCE flag */
217        : "cc"                             /* clobber condition code */
218    );
219#endif
220}
221#endif
222
223#ifndef CONFIG_KERNEL_X86_DANGEROUS_MSR
224//read an MSR
225static KERNELFN void sel4bench_private_rdmsr(void* arg)
226{
227    uint32_t* msr = (uint32_t*)arg;
228
229    asm volatile (
230        "rdmsr"
231        : "=a" (msr[1]) /* output low */
232        , "=d" (msr[2]) /* output high */
233        :  "c" (msr[0]) /* input MSR index */
234    );
235}
236
237//write an MSR
238static KERNELFN void sel4bench_private_wrmsr(void* arg)
239{
240    uint32_t* msr = (uint32_t*)arg;
241
242    asm volatile (
243        "wrmsr"
244        :               /* no output */
245        : "a" (msr[1])  /* input low */
246        , "d" (msr[2])  /* input high */
247        , "c" (msr[0])  /* input MSR index */
248    );
249}
250#endif
251
252//generic event tables for lookup fn below
253//they use direct event numbers, rather than the constants in events.h, because it's smaller
254static seL4_Word SEL4BENCH_IA32_WESTMERE_EVENTS[5] = {
255    0x0280, //CACHE_L1I_MISS
256    0x0151, //CACHE_L1D_MISS, must use counter 0 or 1
257    0x20C8, //TLB_L1I_MISS
258    0x80CB, //TLB_L1D_MISS
259    0x5FCB  //SEL4BENCH_IA32_WESTMERE_EVENT_CACHE_|{L1D_HIT,L2_HIT,L3P_HIT,L3_HIT,L3_MISS,LFB_HIT}_R
260};
261static seL4_Word SEL4BENCH_IA32_NEHALEM_EVENTS[5] = {
262    0x0280, //CACHE_L1I_MISS
263    0x0151, //CACHE_L1D_MISS, must use counter 0 or 1
264    0x20C8, //TLB_L1I_MISS
265    0x80CB, //TLB_L1D_MISS
266    0x5FCB  //SEL4BENCH_IA32_NEHALEM_EVENT_CACHE_|{L1D_HIT,L2_HIT,L3P_HIT,L3_HIT,L3_MISS,LFB_HIT}_R
267};
268static seL4_Word SEL4BENCH_IA32_CORE2_EVENTS[5] = {
269    0x0081, //CACHE_L1I_MISS
270    0x01CB, //CACHE_L1D_MISS, must use counter 0
271    0x00C9, //TLB_L1I_MISS
272    0x10CB, //TLB_L1D_MISS, must use counter 0
273    0x03C0  //SEL4BENCH_IA32_CORE2_EVENT_RETIRE_MEMORY_|{READ,WRITE}
274};
275static seL4_Word SEL4BENCH_IA32_CORE_EVENTS[5] = {
276    0x0081, //CACHE_L1I_MISS
277    0x0000, //CACHE_L1D_MISS, not available on CORE
278    0x0085, //TLB_L1I_MISS
279    0x0049, //TLB_L1D_MISS
280    0x0143  //MEMORY_ACCESS
281};
282static seL4_Word SEL4BENCH_IA32_P6_EVENTS[5] = {
283    0x0081, //CACHE_L1I_MISS
284    0x0045, //CACHE_L1D_MISS
285    0x0085, //TLB_L1I_MISS
286    0x0000, //TLB_L1D_MISS, not available on P6
287    0x0043  //MEMORY_ACCESS
288};
289static seL4_Word SEL4BENCH_IA32_HASWELL_EVENTS[5] = {
290    0x0280, //CACHE_L1I_MISS
291    0x0151, //CACHE_L1D_MISS, must use counter 0 or 1
292    0x0085, //TLB_L1I_MISS
293    0x0049, //TLB_L1D_MISS
294    0x412E  //LLC_MISS
295};
296static seL4_Word SEL4BENCH_IA32_BROADWELL_EVENTS[5] = {
297    0x0280, //ICACHE.MISSES
298    0x0151, //L1D.REPLACEMENT
299    0x2185, //ITLB_MISSES.MISS_CAUSES_A_WALK | ITLB_MISSES.STLB_HIT_4K
300    0x0000, //No combined load/store dTLB miss counter available
301    0x412E  //LONGEST_LAT_CACHE.MISS
302};
303static seL4_Word SEL4BENCH_IA32_SKYLAKE_EVENTS[5] = {
304    0x0000, //No combined tag/data iCache miss counter available
305    0x0151, //L1D.REPLACEMENT
306    0x2185, //ITLB_MISSES.MISS_CAUSES_A_WALK | ITLB_MISSES.STLB_HIT
307    0x0000, //No combined load/store dTLB miss counter available
308    0x412E  //LONGEST_LAT_CACHE.MISS
309};
310
311static FASTFN seL4_Word sel4bench_private_lookup_event(event_id_t event)
312{
313    if ((SEL4BENCH_EVENT_GENERIC_MASK & event) == SEL4BENCH_EVENT_GENERIC_MASK) {
314        uint32_t dummy = 0;
315        ia32_cpuid_model_info_t model_info = { .raw = 0 };
316        sel4bench_private_cpuid(IA32_CPUID_LEAF_MODEL, 0, &model_info.raw, &dummy, &dummy, &dummy);
317
318        //we should be a P6
319        assert(FAMILY(model_info) == IA32_CPUID_FAMILY_P6);
320
321        uint8_t model = MODEL(model_info);
322        event = event & ~SEL4BENCH_EVENT_GENERIC_MASK;
323
324        //Using the model summary on http://www.sandpile.org/x86/cpuid.htm#level_0000_0001h
325        //Let's hope it's accurate...
326        //We are also pretending Atoms don't exist
327
328        //P3 or PM
329        if (model <= 0x0D || model == 0x15) {
330            return SEL4BENCH_IA32_P6_EVENTS[event];
331        }
332
333        switch (model) {
334            //CORE
335        case 0x0E:
336            return SEL4BENCH_IA32_CORE_EVENTS[event];
337
338            //CORE2
339        case 0x0F:
340        case 0x16:
341        case 0x17:
342        case 0x1D:
343            return SEL4BENCH_IA32_CORE2_EVENTS[event];
344
345            //NEHALEM
346        case 0x1A:
347        case 0x1E:
348        case 0x1F:
349        case 0x2E:
350            return SEL4BENCH_IA32_NEHALEM_EVENTS[event];
351
352            //WESTMERE
353        case 0x25:
354        case 0x2C:
355        case 0x2F:
356            return SEL4BENCH_IA32_WESTMERE_EVENTS[event];
357
358            //SANDY BRIDGE
359        case 0x2A:
360        case 0x2D:
361            return 0x0000; //TODO
362
363            //IVY BRIDGE
364        case 0x3A:
365        case 0x3E:
366            return 0x0000; //TODO
367
368            //HASWELL
369        case 0x3C:
370        case 0x3F:
371        case 0x45:
372        case 0x46:
373            return SEL4BENCH_IA32_HASWELL_EVENTS[event];
374
375            //BROADWELL
376        case 0x3D:
377        case 0x47:
378        case 0x4F:
379        case 0x56:
380            return SEL4BENCH_IA32_BROADWELL_EVENTS[event];
381
382            //SKYLAKE
383        case 0x4E:
384        case 0x5E:
385            return SEL4BENCH_IA32_SKYLAKE_EVENTS[event];
386
387            //Unknown
388        default:
389            return 0x0000;
390        }
391    } else {
392        return event;
393    }
394}
395