/* $NetBSD: amdgpu_umc_v6_1.c,v 1.2 2021/12/18 23:44:58 riastradh Exp $ */

/*
 * Copyright 2019 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: amdgpu_umc_v6_1.c,v 1.2 2021/12/18 23:44:58 riastradh Exp $");

#include "umc_v6_1.h"
#include "amdgpu_ras.h"
#include "amdgpu.h"

#include "rsmu/rsmu_0_0_2_offset.h"
#include "rsmu/rsmu_0_0_2_sh_mask.h"
#include "umc/umc_6_1_1_offset.h"
#include "umc/umc_6_1_1_sh_mask.h"
#include "umc/umc_6_1_2_offset.h"

#define UMC_6_INST_DIST			0x40000

/*
 * (addr / 256) * 8192, the higher 26 bits in ErrorAddr
 * are the index of the 8KB block
 */
#define ADDR_OF_8KB_BLOCK(addr)			(((addr) & ~0xffULL) << 5)
/* channel index is the index of the 256B block */
#define ADDR_OF_256B_BLOCK(channel_index)	((channel_index) << 8)
/* offset in the 256B block */
#define OFFSET_IN_256B_BLOCK(addr)		((addr) & 0xffULL)

#define LOOP_UMC_INST(umc_inst) for ((umc_inst) = 0; (umc_inst) < adev->umc.umc_inst_num; (umc_inst)++)
#define LOOP_UMC_CH_INST(ch_inst) for ((ch_inst) = 0; (ch_inst) < adev->umc.channel_inst_num; (ch_inst)++)
#define LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) LOOP_UMC_INST((umc_inst)) LOOP_UMC_CH_INST((ch_inst))

const uint32_t
	umc_v6_1_channel_idx_tbl[UMC_V6_1_UMC_INSTANCE_NUM][UMC_V6_1_CHANNEL_INSTANCE_NUM] = {
		{2, 18, 11, 27},	{4, 20, 13, 29},
		{1, 17, 8, 24},		{7, 23, 14, 30},
		{10, 26, 3, 19},	{12, 28, 5, 21},
		{9, 25, 0, 16},		{15, 31, 6, 22}
};

static void umc_v6_1_enable_umc_index_mode(struct amdgpu_device *adev)
{
	WREG32_FIELD15(RSMU, 0, RSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU,
			RSMU_UMC_INDEX_MODE_EN, 1);
}

static void umc_v6_1_disable_umc_index_mode(struct amdgpu_device *adev)
{
	WREG32_FIELD15(RSMU, 0, RSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU,
			RSMU_UMC_INDEX_MODE_EN, 0);
}

static uint32_t umc_v6_1_get_umc_index_mode_state(struct amdgpu_device *adev)
{
	uint32_t rsmu_umc_index;

	rsmu_umc_index = RREG32_SOC15(RSMU, 0,
			mmRSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU);

	return REG_GET_FIELD(rsmu_umc_index,
			RSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU,
			RSMU_UMC_INDEX_MODE_EN);
}

static inline uint32_t get_umc_6_reg_offset(struct amdgpu_device *adev,
					    uint32_t umc_inst,
					    uint32_t ch_inst)
{
	return adev->umc.channel_offs * ch_inst + UMC_6_INST_DIST * umc_inst;
}
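
/*
 * Illustrative note (not part of the original source): get_umc_6_reg_offset()
 * returns a dword offset into the UMC register aperture.  Channels within one
 * UMC instance are adev->umc.channel_offs dwords apart, and whole UMC
 * instances are UMC_6_INST_DIST (0x40000) dwords apart.  Assuming a
 * per-channel stride of 0x800 (an example value; the real stride is chosen by
 * the GMC setup code), umc_inst = 2 and ch_inst = 1 give:
 *
 *	0x800 * 1 + 0x40000 * 2 = 0x80800
 *
 * Callers add this offset to a register's SOC15 dword offset and multiply the
 * sum by 4, since RREG32_PCIE/WREG32_PCIE take byte addresses.
 */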

static void
umc_v6_1_query_correctable_error_count(struct amdgpu_device *adev,
				       uint32_t umc_reg_offset,
				       unsigned long *error_count)
{
	uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;
	uint32_t ecc_err_cnt, ecc_err_cnt_addr;
	uint64_t mc_umc_status;
	uint32_t mc_umc_status_addr;

	if (adev->asic_type == CHIP_ARCTURUS) {
		/* UMC 6_1_2 registers */
		ecc_err_cnt_sel_addr =
			SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCntSel_ARCT);
		ecc_err_cnt_addr =
			SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCnt_ARCT);
		mc_umc_status_addr =
			SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0_ARCT);
	} else {
		/* UMC 6_1_1 registers */
		ecc_err_cnt_sel_addr =
			SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCntSel);
		ecc_err_cnt_addr =
			SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCnt);
		mc_umc_status_addr =
			SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0);
	}

	/* select the lower chip and check the error count */
	ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4);
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel,
					EccErrCntCsSel, 0);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);
	ecc_err_cnt = RREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4);
	*error_count +=
		(REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_EccErrCnt, EccErrCnt) -
		 UMC_V6_1_CE_CNT_INIT);
	/* clear the lower chip error count */
	WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4, UMC_V6_1_CE_CNT_INIT);

	/* select the higher chip and check the error counter */
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel,
					EccErrCntCsSel, 1);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);
	ecc_err_cnt = RREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4);
	*error_count +=
		(REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_EccErrCnt, EccErrCnt) -
		 UMC_V6_1_CE_CNT_INIT);
	/* clear the higher chip error count */
	WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4, UMC_V6_1_CE_CNT_INIT);

	/*
	 * check for SRAM correctable error;
	 * MCUMC_STATUS is a 64-bit register
	 */
	mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);
	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, ErrorCodeExt) == 6 &&
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)
		*error_count += 1;
}
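
/*
 * Illustrative note (not part of the original source): each channel keeps one
 * EccErrCnt counter per chip select, so the query above harvests the counter
 * twice, once with EccErrCntCsSel = 0 and once with EccErrCntCsSel = 1.
 * Because umc_v6_1_err_cnt_init_per_channel() below presets every counter to
 * UMC_V6_1_CE_CNT_INIT (defined in umc_v6_1.h), the number of new correctable
 * errors seen on a chip select is
 *
 *	REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_EccErrCnt, EccErrCnt)
 *	    - UMC_V6_1_CE_CNT_INIT
 *
 * and each counter is rewound to UMC_V6_1_CE_CNT_INIT after the harvest so
 * the next query starts from the same baseline.
 */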

static void umc_v6_1_query_uncorrectable_error_count(struct amdgpu_device *adev,
						      uint32_t umc_reg_offset,
						      unsigned long *error_count)
{
	uint64_t mc_umc_status;
	uint32_t mc_umc_status_addr;

	if (adev->asic_type == CHIP_ARCTURUS) {
		/* UMC 6_1_2 registers */
		mc_umc_status_addr =
			SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0_ARCT);
	} else {
		/* UMC 6_1_1 registers */
		mc_umc_status_addr =
			SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0);
	}

	/* check the MCUMC_STATUS */
	mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);
	if ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
	    (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 ||
	     REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
	     REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 ||
	     REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 ||
	     REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1))
		*error_count += 1;
}

static void umc_v6_1_query_ras_error_count(struct amdgpu_device *adev,
					   void *ras_error_status)
{
	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;

	uint32_t umc_inst = 0;
	uint32_t ch_inst = 0;
	uint32_t umc_reg_offset = 0;

	uint32_t rsmu_umc_index_state = umc_v6_1_get_umc_index_mode_state(adev);

	if (rsmu_umc_index_state)
		umc_v6_1_disable_umc_index_mode(adev);

	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
		umc_reg_offset = get_umc_6_reg_offset(adev,
						      umc_inst,
						      ch_inst);

		umc_v6_1_query_correctable_error_count(adev,
						       umc_reg_offset,
						       &(err_data->ce_count));
		umc_v6_1_query_uncorrectable_error_count(adev,
							 umc_reg_offset,
							 &(err_data->ue_count));
	}

	if (rsmu_umc_index_state)
		umc_v6_1_enable_umc_index_mode(adev);
}

static void umc_v6_1_query_error_address(struct amdgpu_device *adev,
					 struct ras_err_data *err_data,
					 uint32_t umc_reg_offset,
					 uint32_t ch_inst,
					 uint32_t umc_inst)
{
	uint32_t lsb, mc_umc_status_addr;
	uint64_t mc_umc_status, err_addr, retired_page, mc_umc_addrt0;
	struct eeprom_table_record *err_rec;
	uint32_t channel_index = adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst];

	if (adev->asic_type == CHIP_ARCTURUS) {
		/* UMC 6_1_2 registers */
		mc_umc_status_addr =
			SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0_ARCT);
		mc_umc_addrt0 =
			SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_ADDRT0_ARCT);
	} else {
		/* UMC 6_1_1 registers */
		mc_umc_status_addr =
			SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0);
		mc_umc_addrt0 =
			SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_ADDRT0);
	}

	/* skip error address processing if the record buffer allocation failed (-ENOMEM) */
	if (!err_data->err_addr) {
		/* clear umc status */
		WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL);
		return;
	}

	err_rec = &err_data->err_addr[err_data->err_addr_cnt];
	mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);

	/* calculate the error address if a ue/ce error is detected */
	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
	    (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
	     REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)) {

		err_addr = RREG64_PCIE((mc_umc_addrt0 + umc_reg_offset) * 4);
		/* the lowest lsb bits should be ignored */
		lsb = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, LSB);
		err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
		err_addr &= ~((0x1ULL << lsb) - 1);

		/* translate the umc channel address to a soc pa, 3 parts are included */
		retired_page = ADDR_OF_8KB_BLOCK(err_addr) |
				ADDR_OF_256B_BLOCK(channel_index) |
				OFFSET_IN_256B_BLOCK(err_addr);

		/* we only save ue error information currently, ce is skipped */
		if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC)
				== 1) {
			err_rec->address = err_addr;
			/* page frame address is saved */
			err_rec->retired_page = retired_page >> AMDGPU_GPU_PAGE_SHIFT;
			err_rec->ts = (uint64_t)ktime_get_real_seconds();
			err_rec->err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
			err_rec->cu = 0;
			err_rec->mem_channel = channel_index;
			err_rec->mcumc_id = umc_inst;

			err_data->err_addr_cnt++;
		}
	}

	/* clear umc status */
	WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL);
}
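
/*
 * Illustrative note (not part of the original source): a worked example of
 * the address translation above.  Suppose the LSB-masked ErrorAddr is
 * 0x12345 and channel_index is 5 (both values invented for the example).
 * Then:
 *
 *	ADDR_OF_8KB_BLOCK(0x12345)    = (0x12300 << 5) = 0x246000
 *	ADDR_OF_256B_BLOCK(5)         = (5 << 8)       = 0x500
 *	OFFSET_IN_256B_BLOCK(0x12345)                  = 0x45
 *
 * so retired_page = 0x246000 | 0x500 | 0x45 = 0x246545, and the recorded
 * page frame number is 0x246545 >> AMDGPU_GPU_PAGE_SHIFT = 0x246 for 4KB
 * GPU pages.
 */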

static void umc_v6_1_query_ras_error_address(struct amdgpu_device *adev,
					     void *ras_error_status)
{
	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;

	uint32_t umc_inst = 0;
	uint32_t ch_inst = 0;
	uint32_t umc_reg_offset = 0;

	uint32_t rsmu_umc_index_state = umc_v6_1_get_umc_index_mode_state(adev);

	if (rsmu_umc_index_state)
		umc_v6_1_disable_umc_index_mode(adev);

	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
		umc_reg_offset = get_umc_6_reg_offset(adev,
						      umc_inst,
						      ch_inst);

		umc_v6_1_query_error_address(adev,
					     err_data,
					     umc_reg_offset,
					     ch_inst,
					     umc_inst);
	}

	if (rsmu_umc_index_state)
		umc_v6_1_enable_umc_index_mode(adev);
}

static void umc_v6_1_err_cnt_init_per_channel(struct amdgpu_device *adev,
					      uint32_t umc_reg_offset)
{
	uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;
	uint32_t ecc_err_cnt_addr;

	if (adev->asic_type == CHIP_ARCTURUS) {
		/* UMC 6_1_2 registers */
		ecc_err_cnt_sel_addr =
			SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCntSel_ARCT);
		ecc_err_cnt_addr =
			SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCnt_ARCT);
	} else {
		/* UMC 6_1_1 registers */
		ecc_err_cnt_sel_addr =
			SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCntSel);
		ecc_err_cnt_addr =
			SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCnt);
	}

	/* select the lower chip */
	ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4);
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel,
					EccErrCntCsSel, 0);
	/* set ce error interrupt type to APIC based interrupt */
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel,
					EccErrInt, 0x1);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);
	/* set error count to initial value */
	WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4, UMC_V6_1_CE_CNT_INIT);

	/* select the higher chip and set its error count to the initial value as well */
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel,
					EccErrCntCsSel, 1);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);
	WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4, UMC_V6_1_CE_CNT_INIT);
}

static void umc_v6_1_err_cnt_init(struct amdgpu_device *adev)
{
	uint32_t umc_inst = 0;
	uint32_t ch_inst = 0;
	uint32_t umc_reg_offset = 0;

	uint32_t rsmu_umc_index_state = umc_v6_1_get_umc_index_mode_state(adev);

	if (rsmu_umc_index_state)
		umc_v6_1_disable_umc_index_mode(adev);

	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
		umc_reg_offset = get_umc_6_reg_offset(adev,
						      umc_inst,
						      ch_inst);

		umc_v6_1_err_cnt_init_per_channel(adev, umc_reg_offset);
	}

	if (rsmu_umc_index_state)
		umc_v6_1_enable_umc_index_mode(adev);
}

const struct amdgpu_umc_funcs umc_v6_1_funcs = {
	.err_cnt_init = umc_v6_1_err_cnt_init,
	.ras_late_init = amdgpu_umc_ras_late_init,
	.query_ras_error_count = umc_v6_1_query_ras_error_count,
	.query_ras_error_address = umc_v6_1_query_ras_error_address,
};
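
/*
 * Usage note (not part of the original source): these hooks are not called
 * directly.  The GMC v9 init path is expected to point adev->umc.funcs at
 * this table, roughly:
 *
 *	adev->umc.funcs = &umc_v6_1_funcs;
 *
 * after which the amdgpu RAS framework invokes err_cnt_init once while
 * bringing RAS up, and calls query_ras_error_count/query_ras_error_address
 * when it harvests UMC errors.
 */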