1#include <linux/module.h> 2#include "edac_mce_amd.h" 3 4static bool report_gart_errors; 5static void (*nb_bus_decoder)(int node_id, struct err_regs *regs); 6 7void amd_report_gart_errors(bool v) 8{ 9 report_gart_errors = v; 10} 11EXPORT_SYMBOL_GPL(amd_report_gart_errors); 12 13void amd_register_ecc_decoder(void (*f)(int, struct err_regs *)) 14{ 15 nb_bus_decoder = f; 16} 17EXPORT_SYMBOL_GPL(amd_register_ecc_decoder); 18 19void amd_unregister_ecc_decoder(void (*f)(int, struct err_regs *)) 20{ 21 if (nb_bus_decoder) { 22 WARN_ON(nb_bus_decoder != f); 23 24 nb_bus_decoder = NULL; 25 } 26} 27EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder); 28 29/* 30 * string representation for the different MCA reported error types, see F3x48 31 * or MSR0000_0411. 32 */ 33const char *tt_msgs[] = { /* transaction type */ 34 "instruction", 35 "data", 36 "generic", 37 "reserved" 38}; 39EXPORT_SYMBOL_GPL(tt_msgs); 40 41const char *ll_msgs[] = { /* cache level */ 42 "L0", 43 "L1", 44 "L2", 45 "L3/generic" 46}; 47EXPORT_SYMBOL_GPL(ll_msgs); 48 49const char *rrrr_msgs[] = { 50 "generic", 51 "generic read", 52 "generic write", 53 "data read", 54 "data write", 55 "inst fetch", 56 "prefetch", 57 "evict", 58 "snoop", 59 "reserved RRRR= 9", 60 "reserved RRRR= 10", 61 "reserved RRRR= 11", 62 "reserved RRRR= 12", 63 "reserved RRRR= 13", 64 "reserved RRRR= 14", 65 "reserved RRRR= 15" 66}; 67EXPORT_SYMBOL_GPL(rrrr_msgs); 68 69const char *pp_msgs[] = { /* participating processor */ 70 "local node originated (SRC)", 71 "local node responded to request (RES)", 72 "local node observed as 3rd party (OBS)", 73 "generic" 74}; 75EXPORT_SYMBOL_GPL(pp_msgs); 76 77const char *to_msgs[] = { 78 "no timeout", 79 "timed out" 80}; 81EXPORT_SYMBOL_GPL(to_msgs); 82 83const char *ii_msgs[] = { /* memory or i/o */ 84 "mem access", 85 "reserved", 86 "i/o access", 87 "generic" 88}; 89EXPORT_SYMBOL_GPL(ii_msgs); 90 91/* 92 * Map the 4 or 5 (family-specific) bits of Extended Error code to the 93 * string table. 94 */ 95const char *ext_msgs[] = { 96 "K8 ECC error", /* 0_0000b */ 97 "CRC error on link", /* 0_0001b */ 98 "Sync error packets on link", /* 0_0010b */ 99 "Master Abort during link operation", /* 0_0011b */ 100 "Target Abort during link operation", /* 0_0100b */ 101 "Invalid GART PTE entry during table walk", /* 0_0101b */ 102 "Unsupported atomic RMW command received", /* 0_0110b */ 103 "WDT error: NB transaction timeout", /* 0_0111b */ 104 "ECC/ChipKill ECC error", /* 0_1000b */ 105 "SVM DEV Error", /* 0_1001b */ 106 "Link Data error", /* 0_1010b */ 107 "Link/L3/Probe Filter Protocol error", /* 0_1011b */ 108 "NB Internal Arrays Parity error", /* 0_1100b */ 109 "DRAM Address/Control Parity error", /* 0_1101b */ 110 "Link Transmission error", /* 0_1110b */ 111 "GART/DEV Table Walk Data error" /* 0_1111b */ 112 "Res 0x100 error", /* 1_0000b */ 113 "Res 0x101 error", /* 1_0001b */ 114 "Res 0x102 error", /* 1_0010b */ 115 "Res 0x103 error", /* 1_0011b */ 116 "Res 0x104 error", /* 1_0100b */ 117 "Res 0x105 error", /* 1_0101b */ 118 "Res 0x106 error", /* 1_0110b */ 119 "Res 0x107 error", /* 1_0111b */ 120 "Res 0x108 error", /* 1_1000b */ 121 "Res 0x109 error", /* 1_1001b */ 122 "Res 0x10A error", /* 1_1010b */ 123 "Res 0x10B error", /* 1_1011b */ 124 "ECC error in L3 Cache Data", /* 1_1100b */ 125 "L3 Cache Tag error", /* 1_1101b */ 126 "L3 Cache LRU Parity error", /* 1_1110b */ 127 "Probe Filter error" /* 1_1111b */ 128}; 129EXPORT_SYMBOL_GPL(ext_msgs); 130 131static void amd_decode_dc_mce(u64 mc0_status) 132{ 133 u32 ec = mc0_status & 0xffff; 134 u32 xec = (mc0_status >> 16) & 0xf; 135 136 pr_emerg("Data Cache Error"); 137 138 if (xec == 1 && TLB_ERROR(ec)) 139 pr_cont(": %s TLB multimatch.\n", LL_MSG(ec)); 140 else if (xec == 0) { 141 if (mc0_status & (1ULL << 40)) 142 pr_cont(" during Data Scrub.\n"); 143 else if (TLB_ERROR(ec)) 144 pr_cont(": %s TLB parity error.\n", LL_MSG(ec)); 145 else if (MEM_ERROR(ec)) { 146 u8 ll = ec & 0x3; 147 u8 tt = (ec >> 2) & 0x3; 148 u8 rrrr = (ec >> 4) & 0xf; 149 150 /* see F10h BKDG (31116), Table 92. */ 151 if (ll == 0x1) { 152 if (tt != 0x1) 153 goto wrong_dc_mce; 154 155 pr_cont(": Data/Tag %s error.\n", RRRR_MSG(ec)); 156 157 } else if (ll == 0x2 && rrrr == 0x3) 158 pr_cont(" during L1 linefill from L2.\n"); 159 else 160 goto wrong_dc_mce; 161 } else if (BUS_ERROR(ec) && boot_cpu_data.x86 == 0xf) 162 pr_cont(" during system linefill.\n"); 163 else 164 goto wrong_dc_mce; 165 } else 166 goto wrong_dc_mce; 167 168 return; 169 170wrong_dc_mce: 171 pr_warning("Corrupted DC MCE info?\n"); 172} 173 174static void amd_decode_ic_mce(u64 mc1_status) 175{ 176 u32 ec = mc1_status & 0xffff; 177 u32 xec = (mc1_status >> 16) & 0xf; 178 179 pr_emerg("Instruction Cache Error"); 180 181 if (xec == 1 && TLB_ERROR(ec)) 182 pr_cont(": %s TLB multimatch.\n", LL_MSG(ec)); 183 else if (xec == 0) { 184 if (TLB_ERROR(ec)) 185 pr_cont(": %s TLB Parity error.\n", LL_MSG(ec)); 186 else if (BUS_ERROR(ec)) { 187 if (boot_cpu_data.x86 == 0xf && 188 (mc1_status & (1ULL << 58))) 189 pr_cont(" during system linefill.\n"); 190 else 191 pr_cont(" during attempted NB data read.\n"); 192 } else if (MEM_ERROR(ec)) { 193 u8 ll = ec & 0x3; 194 u8 rrrr = (ec >> 4) & 0xf; 195 196 if (ll == 0x2) 197 pr_cont(" during a linefill from L2.\n"); 198 else if (ll == 0x1) { 199 200 switch (rrrr) { 201 case 0x5: 202 pr_cont(": Parity error during " 203 "data load.\n"); 204 break; 205 206 case 0x7: 207 pr_cont(": Copyback Parity/Victim" 208 " error.\n"); 209 break; 210 211 case 0x8: 212 pr_cont(": Tag Snoop error.\n"); 213 break; 214 215 default: 216 goto wrong_ic_mce; 217 break; 218 } 219 } 220 } else 221 goto wrong_ic_mce; 222 } else 223 goto wrong_ic_mce; 224 225 return; 226 227wrong_ic_mce: 228 pr_warning("Corrupted IC MCE info?\n"); 229} 230 231static void amd_decode_bu_mce(u64 mc2_status) 232{ 233 u32 ec = mc2_status & 0xffff; 234 u32 xec = (mc2_status >> 16) & 0xf; 235 236 pr_emerg("Bus Unit Error"); 237 238 if (xec == 0x1) 239 pr_cont(" in the write data buffers.\n"); 240 else if (xec == 0x3) 241 pr_cont(" in the victim data buffers.\n"); 242 else if (xec == 0x2 && MEM_ERROR(ec)) 243 pr_cont(": %s error in the L2 cache tags.\n", RRRR_MSG(ec)); 244 else if (xec == 0x0) { 245 if (TLB_ERROR(ec)) 246 pr_cont(": %s error in a Page Descriptor Cache or " 247 "Guest TLB.\n", TT_MSG(ec)); 248 else if (BUS_ERROR(ec)) 249 pr_cont(": %s/ECC error in data read from NB: %s.\n", 250 RRRR_MSG(ec), PP_MSG(ec)); 251 else if (MEM_ERROR(ec)) { 252 u8 rrrr = (ec >> 4) & 0xf; 253 254 if (rrrr >= 0x7) 255 pr_cont(": %s error during data copyback.\n", 256 RRRR_MSG(ec)); 257 else if (rrrr <= 0x1) 258 pr_cont(": %s parity/ECC error during data " 259 "access from L2.\n", RRRR_MSG(ec)); 260 else 261 goto wrong_bu_mce; 262 } else 263 goto wrong_bu_mce; 264 } else 265 goto wrong_bu_mce; 266 267 return; 268 269wrong_bu_mce: 270 pr_warning("Corrupted BU MCE info?\n"); 271} 272 273static void amd_decode_ls_mce(u64 mc3_status) 274{ 275 u32 ec = mc3_status & 0xffff; 276 u32 xec = (mc3_status >> 16) & 0xf; 277 278 pr_emerg("Load Store Error"); 279 280 if (xec == 0x0) { 281 u8 rrrr = (ec >> 4) & 0xf; 282 283 if (!BUS_ERROR(ec) || (rrrr != 0x3 && rrrr != 0x4)) 284 goto wrong_ls_mce; 285 286 pr_cont(" during %s.\n", RRRR_MSG(ec)); 287 } 288 return; 289 290wrong_ls_mce: 291 pr_warning("Corrupted LS MCE info?\n"); 292} 293 294void amd_decode_nb_mce(int node_id, struct err_regs *regs, int handle_errors) 295{ 296 u32 ec = ERROR_CODE(regs->nbsl); 297 298 if (!handle_errors) 299 return; 300 301 /* 302 * GART TLB error reporting is disabled by default. Bail out early. 303 */ 304 if (TLB_ERROR(ec) && !report_gart_errors) 305 return; 306 307 pr_emerg("Northbridge Error, node %d", node_id); 308 309 /* 310 * F10h, revD can disable ErrCpu[3:0] so check that first and also the 311 * value encoding has changed so interpret those differently 312 */ 313 if ((boot_cpu_data.x86 == 0x10) && 314 (boot_cpu_data.x86_model > 7)) { 315 if (regs->nbsh & K8_NBSH_ERR_CPU_VAL) 316 pr_cont(", core: %u\n", (u8)(regs->nbsh & 0xf)); 317 } else { 318 u8 assoc_cpus = regs->nbsh & 0xf; 319 320 if (assoc_cpus > 0) 321 pr_cont(", core: %d", fls(assoc_cpus) - 1); 322 323 pr_cont("\n"); 324 } 325 326 pr_emerg("%s.\n", EXT_ERR_MSG(regs->nbsl)); 327 328 if (BUS_ERROR(ec) && nb_bus_decoder) 329 nb_bus_decoder(node_id, regs); 330} 331EXPORT_SYMBOL_GPL(amd_decode_nb_mce); 332 333static void amd_decode_fr_mce(u64 mc5_status) 334{ 335 /* we have only one error signature so match all fields at once. */ 336 if ((mc5_status & 0xffff) == 0x0f0f) 337 pr_emerg(" FR Error: CPU Watchdog timer expire.\n"); 338 else 339 pr_warning("Corrupted FR MCE info?\n"); 340} 341 342static inline void amd_decode_err_code(unsigned int ec) 343{ 344 if (TLB_ERROR(ec)) { 345 pr_emerg("Transaction: %s, Cache Level %s\n", 346 TT_MSG(ec), LL_MSG(ec)); 347 } else if (MEM_ERROR(ec)) { 348 pr_emerg("Transaction: %s, Type: %s, Cache Level: %s", 349 RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec)); 350 } else if (BUS_ERROR(ec)) { 351 pr_emerg("Transaction type: %s(%s), %s, Cache Level: %s, " 352 "Participating Processor: %s\n", 353 RRRR_MSG(ec), II_MSG(ec), TO_MSG(ec), LL_MSG(ec), 354 PP_MSG(ec)); 355 } else 356 pr_warning("Huh? Unknown MCE error 0x%x\n", ec); 357} 358 359static int amd_decode_mce(struct notifier_block *nb, unsigned long val, 360 void *data) 361{ 362 struct mce *m = (struct mce *)data; 363 struct err_regs regs; 364 int node, ecc; 365 366 pr_emerg("MC%d_STATUS: ", m->bank); 367 368 pr_cont("%sorrected error, other errors lost: %s, " 369 "CPU context corrupt: %s", 370 ((m->status & MCI_STATUS_UC) ? "Unc" : "C"), 371 ((m->status & MCI_STATUS_OVER) ? "yes" : "no"), 372 ((m->status & MCI_STATUS_PCC) ? "yes" : "no")); 373 374 /* do the two bits[14:13] together */ 375 ecc = (m->status >> 45) & 0x3; 376 if (ecc) 377 pr_cont(", %sECC Error", ((ecc == 2) ? "C" : "U")); 378 379 pr_cont("\n"); 380 381 switch (m->bank) { 382 case 0: 383 amd_decode_dc_mce(m->status); 384 break; 385 386 case 1: 387 amd_decode_ic_mce(m->status); 388 break; 389 390 case 2: 391 amd_decode_bu_mce(m->status); 392 break; 393 394 case 3: 395 amd_decode_ls_mce(m->status); 396 break; 397 398 case 4: 399 regs.nbsl = (u32) m->status; 400 regs.nbsh = (u32)(m->status >> 32); 401 regs.nbeal = (u32) m->addr; 402 regs.nbeah = (u32)(m->addr >> 32); 403 node = amd_get_nb_id(m->extcpu); 404 405 amd_decode_nb_mce(node, ®s, 1); 406 break; 407 408 case 5: 409 amd_decode_fr_mce(m->status); 410 break; 411 412 default: 413 break; 414 } 415 416 amd_decode_err_code(m->status & 0xffff); 417 418 return NOTIFY_STOP; 419} 420 421static struct notifier_block amd_mce_dec_nb = { 422 .notifier_call = amd_decode_mce, 423}; 424 425static int __init mce_amd_init(void) 426{ 427 /* 428 * We can decode MCEs for K8, F10h and F11h CPUs: 429 */ 430 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) 431 return 0; 432 433 if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11) 434 return 0; 435 436 atomic_notifier_chain_register(&x86_mce_decoder_chain, &amd_mce_dec_nb); 437 438 return 0; 439} 440early_initcall(mce_amd_init); 441 442#ifdef MODULE 443static void __exit mce_amd_exit(void) 444{ 445 atomic_notifier_chain_unregister(&x86_mce_decoder_chain, &amd_mce_dec_nb); 446} 447 448MODULE_DESCRIPTION("AMD MCE decoder"); 449MODULE_ALIAS("edac-mce-amd"); 450MODULE_LICENSE("GPL"); 451module_exit(mce_amd_exit); 452#endif 453