/*
 * Copyright 2017, Data61
 * Commonwealth Scientific and Industrial Research Organisation (CSIRO)
 * ABN 41 687 119 230.
 *
 * This software may be distributed and modified according to the terms of
 * the BSD 2-Clause license. Note that NO WARRANTY is provided.
 * See "LICENSE_BSD2.txt" for details.
 *
 * @TAG(DATA61_BSD)
 */
#pragma once

#include <assert.h>
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>
#include <inttypes.h>
#include <sel4bench/types.h>

/* Read the 64-bit timestamp counter (TSC) into `var`.
 *
 * CPUID (with EAX = ECX = 0) is executed both before and after RDTSC as a
 * serialising instruction, so the read is not reordered with surrounding
 * code by the CPU. RDTSC returns the high 32 bits in EDX and the low 32
 * bits in EAX; these are captured into `high`/`low` and recombined.
 * EAX/EBX/ECX/EDX are declared as clobbers, so the compiler allocates the
 * outputs to other registers.
 */
#define SEL4BENCH_READ_CCNT(var) do { \
    uint32_t low, high; \
    asm volatile( \
        "movl $0, %%eax \n" \
        "movl $0, %%ecx \n" \
        "cpuid \n" \
        "rdtsc \n" \
        "movl %%edx, %0 \n" \
        "movl %%eax, %1 \n" \
        "movl $0, %%eax \n" \
        "movl $0, %%ecx \n" \
        "cpuid \n" \
        : \
        "=r"(high), \
        "=r"(low) \
        : \
        : "eax", "ebx", "ecx", "edx" \
    ); \
    (var) = (((uint64_t)high) << 32ull) | ((uint64_t)low); \
} while(0)

/* Intel docs are somewhat unclear as to exactly how to serialize PMCs.
 * Using LFENCE for the moment, because it's much faster. If event counts
 * turn out to be unreliable, switch to CPUID by uncommenting this line.
 *
 * This currently breaks the GCC register allocator.
 */
//#define SEL4BENCH_STRICT_PMC_SERIALIZATION

#include <sel4bench/arch/private.h>

/* printf format specifier matching ccnt_t. */
#define CCNT_FORMAT "%"PRIu64
/* Cycle/event counter values are 64-bit on this architecture. */
typedef uint64_t ccnt_t;

/* The framework as it stands supports the following Intel processors:
 * - All P6-family processors (up to and including the Pentium M)
 * - All processors supporting IA-32 architectural performance
 *   monitoring (that is, processors starting from the Intel Core Solo,
 *   codenamed Yonah)
 */

/* Silence warnings about including the following functions when seL4_DebugRun
 * is not enabled when we are not calling them. If we actually call these
 * functions without seL4_DebugRun enabled, we'll get a link failure, so this
 * should be OK.
 */
void seL4_DebugRun(void (* userfn) (void *), void* userarg);

/* Read a 64-bit model-specific register.
 *
 * With CONFIG_KERNEL_X86_DANGEROUS_MSR the kernel's direct RDMSR syscall is
 * used. Otherwise the read is done via seL4_DebugRun with a private helper;
 * the shared msr_data buffer layout is:
 *   [0] = MSR index (in), [1] = low 32 bits (out), [2] = high 32 bits (out).
 */
static inline uint64_t sel4bench_x86_rdmsr(uint32_t reg) {
#ifdef CONFIG_KERNEL_X86_DANGEROUS_MSR
    return seL4_X86DangerousRDMSR(reg);
#else
    uint32_t msr_data[3];
    msr_data[0] = reg;
    msr_data[1] = 0;
    msr_data[2] = 0;

    seL4_DebugRun(&sel4bench_private_rdmsr, msr_data);
    return (uint64_t)msr_data[1] + ((uint64_t)msr_data[2] << 32);
#endif
}

/* Write a 64-bit model-specific register.
 *
 * Mirror of sel4bench_x86_rdmsr(): either the direct WRMSR syscall, or
 * seL4_DebugRun with msr_data layout
 *   [0] = MSR index, [1] = low 32 bits, [2] = high 32 bits.
 */
static inline void sel4bench_x86_wrmsr(uint32_t reg, uint64_t val) {
#ifdef CONFIG_KERNEL_X86_DANGEROUS_MSR
    seL4_X86DangerousWRMSR(reg, val);
#else
    uint32_t msr_data[3];
    msr_data[0] = reg;
    msr_data[1] = val & 0xffffffff;
    msr_data[2] = val >> 32;

    seL4_DebugRun(&sel4bench_private_wrmsr, msr_data);
#endif
}

/* Initialise the benchmarking framework.
 *
 * Asserts that we are on an Intel CPU and that it supports either
 * architectural performance monitoring (CPUID leaf 0xA) or is a P6-family
 * part, then (unless the kernel already exports PMC access to user mode)
 * enables user-mode RDPMC via a seL4_DebugRun helper.
 */
static FASTFN void sel4bench_init()
{
    uint32_t cpuid_eax;
    uint32_t cpuid_ebx;
    uint32_t cpuid_ecx;
    uint32_t cpuid_edx;
    sel4bench_private_cpuid(IA32_CPUID_LEAF_BASIC, 0, &cpuid_eax, &cpuid_ebx, &cpuid_ecx, &cpuid_edx);

    //check we're running on an Intel chip ("GenuineIntel" vendor string in EBX/ECX/EDX)
    assert(cpuid_ebx == IA32_CPUID_BASIC_MAGIC_EBX && cpuid_ecx == IA32_CPUID_BASIC_MAGIC_ECX && cpuid_edx == IA32_CPUID_BASIC_MAGIC_EDX);

    //check that either we support architectural performance monitoring, or we're running on a P6-class chip
    if (cpuid_eax < IA32_CPUID_LEAF_PMC) { //basic CPUID invocation tells us whether the processor supports arch PMCs
        //if not, ensure we're on a P6-family processor
        ia32_cpuid_model_info_t cpuid_model_info;
        sel4bench_private_cpuid(IA32_CPUID_LEAF_MODEL, 0, &(cpuid_model_info.raw), &cpuid_ebx, &cpuid_ecx, &cpuid_edx);
        assert(FAMILY(cpuid_model_info) == IA32_CPUID_FAMILY_P6);
        //in release builds the assert compiles out, so bail explicitly on unsupported CPUs
        if (!(FAMILY(cpuid_model_info) == IA32_CPUID_FAMILY_P6)) {
            return;
        }
    }

    //enable user-mode RDPMC
#ifndef CONFIG_EXPORT_PMC_USER
    seL4_DebugRun(&sel4bench_private_enable_user_pmc, NULL);
#endif
}

/* Return the current cycle count (TSC), serialised on both sides so the
 * measurement is not reordered with the caller's code.
 */
static FASTFN ccnt_t sel4bench_get_cycle_count()
{
    sel4bench_private_serialize_pmc(); /* Serialise all preceding instructions */
    uint64_t time = sel4bench_private_rdtsc();
    sel4bench_private_serialize_pmc(); /* Serialise all following instructions */

    return time;
}

/* Return the number of general-purpose performance counters per core.
 *
 * On CPUs exposing CPUID leaf 0xA this is read from the leaf's EAX field;
 * on older P6-family parts (which lack the leaf) it is the fixed value 2.
 */
static FASTFN seL4_Word sel4bench_get_num_counters()
{
    uint32_t dummy;

    //make sure the processor supports the PMC CPUID leaf
    uint32_t max_basic_leaf = 0;
    sel4bench_private_cpuid(IA32_CPUID_LEAF_BASIC, 0, &max_basic_leaf, &dummy, &dummy, &dummy);
    if (max_basic_leaf >= IA32_CPUID_LEAF_PMC) { //Core Solo or later supports PMC discovery via CPUID...
        //query the processor's PMC data
        ia32_cpuid_leaf_pmc_eax_t pmc_eax;

        sel4bench_private_cpuid(IA32_CPUID_LEAF_PMC, 0, &pmc_eax.raw, &dummy, &dummy, &dummy);
        return pmc_eax.gp_pmc_count_per_core;
    } else { //P6 (including Pentium M) doesn't...
        ia32_cpuid_model_info_t model_info;

        sel4bench_private_cpuid(IA32_CPUID_LEAF_MODEL, 0, &model_info.raw, &dummy, &dummy, &dummy);
        assert(FAMILY(model_info) == IA32_CPUID_FAMILY_P6); //we only support P6 processors (P3, PM, ...)

        return 2; //2 PMCs on P6
    }
}

/* Read a single performance counter via RDPMC, serialised on both sides.
 * `counter` is not range-checked here; callers are expected to stay below
 * sel4bench_get_num_counters().
 */
static FASTFN ccnt_t sel4bench_get_counter(counter_t counter)
{
    sel4bench_private_serialize_pmc(); /* Serialise all preceding instructions */
    uint64_t counter_val = sel4bench_private_rdpmc(counter);
    sel4bench_private_serialize_pmc(); /* Serialise all following instructions */

    return counter_val;
}

/* Read every counter whose bit is set in `mask` into values[counter_index],
 * then return the current cycle count. Only one serialising fence is issued
 * before the reads and one after, so the counter reads themselves are
 * back-to-back.
 */
static CACHESENSFN ccnt_t sel4bench_get_counters(counter_bitfield_t mask, ccnt_t* values)
{
    unsigned char counter = 0;

    sel4bench_private_serialize_pmc(); /* Serialise all preceding instructions */
    for (; mask != 0; mask >>= 1, counter++)
        if (mask & 1) {
            values[counter] = sel4bench_private_rdpmc(counter);
        }

    uint64_t time = sel4bench_private_rdtsc();
    sel4bench_private_serialize_pmc(); /* Serialise all following instructions */

    return time;
}

/* Program `counter` to count `event`.
 *
 * Rewrites the counter's IA32_PERFEVTSEL MSR with the encoded event,
 * counting in both user (USR) and kernel (OS) mode, while preserving the
 * reserved bit as the Intel documentation requires. Does not change the
 * counter's enable state.
 */
static FASTFN void sel4bench_set_count_event(counter_t counter, event_id_t event)
{
    //one implementation, because P6 and architectural PMCs work identically

    assert(counter < sel4bench_get_num_counters());

    ia32_pmc_perfevtsel_t evtsel_msr;
    evtsel_msr.raw = sel4bench_x86_rdmsr(IA32_MSR_PMC_PERFEVTSEL_BASE + counter);

    //preserve the reserved flag, like the docs tell us
    uint32_t res_flag = evtsel_msr.res;

    //rewrite the MSR to what we want
    evtsel_msr.raw = sel4bench_private_lookup_event(event);
    evtsel_msr.USR = 1;
    evtsel_msr.OS = 1;
    evtsel_msr.res = res_flag;
    sel4bench_x86_wrmsr(IA32_MSR_PMC_PERFEVTSEL_BASE + counter, evtsel_msr.raw);
}

/* Zero and enable the counters selected by `mask`.
 *
 * A mask of all-ones means "all available counters". Each selected counter
 * has its PERFEVTSEL enable bit set and its count register cleared.
 */
static FASTFN void sel4bench_start_counters(counter_bitfield_t mask)
{
    /* On P6, only the first counter has an enable flag, which controls both counters
     * simultaneously.
     * Arch PMCs are all done independently.
     */
    uint32_t dummy;

    seL4_Word num_counters = sel4bench_get_num_counters();
    if (mask == ~(0UL)) {
        mask = ((BIT(num_counters)) - 1);
    } else {
        assert((~((BIT(num_counters)) - 1) & mask) == 0);
    }

    uint32_t max_basic_leaf = 0;
    sel4bench_private_cpuid(IA32_CPUID_LEAF_BASIC, 0, &max_basic_leaf, &dummy, &dummy, &dummy);

    if (!(max_basic_leaf >= IA32_CPUID_LEAF_PMC)) {
        //we're P6, because otherwise the init() assertion would have tripped
        //P6 counters can only be started as a pair, so collapse the mask to
        //counter 0, whose EN bit controls both
        assert(mask == 0x3);
        if (mask == 0x3) {
            mask = 1;
        } else {
            return;
        }
        /* NOTE(review): after the mask collapses to 1, the loop below zeroes
         * only counter 0's count register; counter 1 is enabled but not
         * reset on P6 — confirm this is intended. */
    }

    counter_t counter;
    //NOT your average for loop! mask bits are consumed as they are handled
    for (counter = 0; mask; counter++) {
        if (!(mask & (BIT(counter)))) {
            continue;
        }

        mask &= ~(BIT(counter));

        //read appropriate MSR
        ia32_pmc_perfevtsel_t temp;
        temp.raw = sel4bench_x86_rdmsr(IA32_MSR_PMC_PERFEVTSEL_BASE + counter);

        //twiddle enable bit
        temp.EN = 1;

        //write back appropriate MSR
        sel4bench_x86_wrmsr(IA32_MSR_PMC_PERFEVTSEL_BASE + counter, temp.raw);

        //zero the counter
        sel4bench_x86_wrmsr(IA32_MSR_PMC_PERFEVTCNT_BASE + counter, 0);
    }

}

/* Disable the counters selected by `mask` (all-ones = every counter).
 * Counter values are left intact; only the PERFEVTSEL enable bit is cleared.
 */
static FASTFN void sel4bench_stop_counters(counter_bitfield_t mask)
{
    /* On P6, only the first counter has an enable flag, which controls both counters
     * simultaneously.
     * Arch PMCs are all done independently.
     */
    uint32_t dummy;

    seL4_Word num_counters = sel4bench_get_num_counters();
    if (mask == ~(0UL)) {
        mask = ((BIT(num_counters)) - 1);
    } else {
        assert((~((BIT(num_counters)) - 1) & mask) == 0);
    }

    uint32_t max_basic_leaf = 0;
    sel4bench_private_cpuid(IA32_CPUID_LEAF_BASIC, 0, &max_basic_leaf, &dummy, &dummy, &dummy);

    if (!(max_basic_leaf >= IA32_CPUID_LEAF_PMC)) {
        //we're P6, because otherwise the init() assertion would have tripped
        //counter 0's EN bit controls both P6 counters, so only touch it
        assert(mask == 0x3);
        mask = 1;
    }

    counter_t counter;
    //NOT your average for loop! mask bits are consumed as they are handled
    for (counter = 0; mask; counter++) {
        if (!(mask & (BIT(counter)))) {
            continue;
        }

        mask &= ~(BIT(counter));

        //read appropriate MSR
        ia32_pmc_perfevtsel_t temp;
        temp.raw = sel4bench_x86_rdmsr(IA32_MSR_PMC_PERFEVTSEL_BASE + counter);

        //twiddle enable bit
        temp.EN = 0;

        //write back appropriate MSR
        sel4bench_x86_wrmsr(IA32_MSR_PMC_PERFEVTSEL_BASE + counter, temp.raw);
    }
}

/* Tear down the framework: stop every counter and (unless the kernel exports
 * PMC access to user mode) revoke user-mode RDPMC again.
 */
static FASTFN void sel4bench_destroy()
{
    //stop all performance counters
    sel4bench_stop_counters(-1);

    //disable user-mode RDPMC
#ifndef CONFIG_EXPORT_PMC_USER
    seL4_DebugRun(&sel4bench_private_disable_user_pmc, NULL);
#endif
}

/* Zero the count register of every available performance counter. */
static FASTFN void sel4bench_reset_counters(void)
{
    for (int i = 0; i < sel4bench_get_num_counters(); i++) {
        sel4bench_x86_wrmsr(IA32_MSR_PMC_PERFEVTCNT_BASE + i, 0);
    }
}